二分k均值原理及python實現請見:
(1)(2) python實現
c++實現:基於python的思路寫的(求指點)
#include #include #include #include #include using namespace std;
// NOTE(review): the template arguments in this listing appear to have been
// stripped during extraction (e.g. `vectortuple`, `vector&`), so these
// declarations do not compile as shown — restore the element types from the
// original source before building.

// Alias for one sample/point; the element type is not visible here.
typedef vectortuple;
// Number of coordinates per sample; main() sizes a centroid as dims+1 and
// walks coordinate indices 1..dims.
const int dims = 2;
// Not referenced in the visible lines — presumably the number of sub-clusters
// produced by one split; TODO confirm.
const int k = 2;
// Target number of centroids: main() keeps bisecting while
// centroids.size() < bik.
const int bik = 4;
// Runs the 2-way clustering step on `tuples`; the visible part of the
// definition writes two centroids into `subcentroids` and appends one label
// and one distance per sample to the two output containers.
// NOTE(review): `subcentroids` is declared here as a by-value `tuple`, yet
// main() passes an array `tuple subcentroids[2]` — likely lost `[]`; confirm.
void dokmeans(vector& tuples,tuple subcentroids,vector& subclusterassment1,tuple& subclusterassment2);
// Distributes samples into `clusters`; the visible part of the definition
// appends tuples[i] to clusters[label].
void assigntuples(vectorclusters,vectortuples,tuple means);
// Distance between two points (body not visible in this listing).
float getdist(const tuple& t1, const tuple& t2);
// Returns a cost value computed from the clusters and their means (only the
// final `return val;` of the body is visible).
float getval(vectorclusters, tuple means);
// Computes a new mean from one cluster (body not visible in this listing).
tuple updatemeans(const vector& cluster_i);
// Prints the clusters (no definition is visible in this listing).
void print(vectorclustes);
// NOTE(review): this listing looks like it lost lines during extraction —
// most braces, several loop bodies and all template arguments are missing —
// so the comments below describe only what the visible statements show.

// Program entry: drives the bisecting loop until `bik` centroids exist and
// prints the intermediate cluster assignments.
int main()
// read the data
vectortuples;
int pos = 0;
// NOTE(review): `file` is not declared in the visible lines, and looping on
// !eof() usually processes one extra failed read — confirm in the full source.
while(!file.eof())
vectorcentroids; // generated centroids
vectorclusterassment1; // cluster index assigned to each sample
tuple clusterassment2; // squared error of each sample to its cluster centroid
// initialize the first centroid (taken as the mean of the samples)
tuple centroid(dims+1,0);
for(int i = 0; i < tuples.size();i++)
for(int j = 1; j <= dims; j++)
// divide coordinate j by the number of samples
centroid[j] /= tuples.size();
centroids.push_back(centroid);
// initialize clusterassment1 and clusterassment2: set every sample's cluster
// index to 0 and compute its distance to the centroid
for(int i = 0; i < tuples.size(); i++)
// bisecting
int test = 0;
while(centroids.size() < bik) // loop until bik centroids have been obtained
// 2-means clustering: split this cluster into two
tuple subcentroids[2];
vectorsubclusterassment1;
tuple subclusterassment2;
// NOTE(review): `subdataset` is not declared in the visible lines.
dokmeans(subdataset,subcentroids,subclusterassment1,subclusterassment2);
// after the 2-way split, compute this cluster's SSE and the total clustering cost
float ssei = 0;
float sseother = 0;
// NOTE(review): the next statement is missing its terminating ';'.
float ssesum = 0
// SSE of this cluster after the split
for (int x = 0; x < subclusterassment2.size(); x++)
ssei += subclusterassment2[x];
// SSE of the other clusters
for (int y = 0; y < tuples.size(); y++)
ssesum = ssei + sseother;
// find the cluster with the minimal total SSE
if( ssesum < minsse)
minindex = i;
minsse = ssesum;
}} // update the centroids
int yuan_size = centroids.size();
centroids[minindex].swap(minsubcentroid[0]); // the centroid of sub-cluster 0 (from splitting cluster minindex) replaces that cluster's original centroid
centroids.push_back(minsubcentroid[1]); // the centroid of sub-cluster 1 (from splitting cluster minindex) is appended at the end
// dump the sub-cluster labels before they are remapped
cout << "yuan minsubclusterassment1: " << endl;
for(int jj = 0; jj < minsubclusterassment1.size(); jj++)
cout << minsubclusterassment1[jj] << " ";
cout << endl;
// update the clusterassment (the samples' cluster indices)
// update the indices of the two new clusters
for(int s = 0; s < minsubclusterassment1.size(); s++)
cout << "minindex: " << minindex << endl;
// dump the sub-cluster labels after remapping
cout << "updated minsubclusterassment1: " << endl;
for(int jj = 0; jj < minsubclusterassment1.size(); jj++)
cout << minsubclusterassment1[jj] << " ";
cout << endl;
// update the sample cluster indices: every sample whose index is minindex
// gets the index of one of the two new sub-clusters
int count = 0;
for(int a = 0; a < clusterassment1.size(); a++)
}cout << "clusterassment1: " << endl;
for(int jj = 0; jj < clusterassment1.size(); jj++)
cout << clusterassment1[jj] << " ";
cout << endl;
} cout << "bikmenas is done!" << endl;
// NOTE(review): "pause" is a Windows shell command; this call does nothing
// useful on other platforms.
system("pause");
return 0;
// End of main(); the definition of dokmeans starts on the same line.
// dokmeans: 2-way clustering of `tuples`; the visible statements store the two
// resulting means in `subcentroids` and append one label and one distance per
// sample to the output containers.
}void dokmeans(vector& tuples,tuple subcentroids,vector& subclusterassment1,tuple& subclusterassment2)
// NOTE(review): `clusters` and `means` are not declared in the visible lines.
assigntuples(clusters,tuples,means);
double newval = getval(clusters,means);
double oldval = -1;
int t = 0;
// Loop while the cost changed by more than 1 since the previous round (loop
// body not visible).
// NOTE(review): make sure the floating-point overload of abs (<cmath>) is in
// scope; the C `abs(int)` would truncate the difference.
while ((abs(newval - oldval)) > 1)
// hand the two final means back to the caller
subcentroids[0] = means[0];
subcentroids[1] = means[1];
// record, for each sample, its label and distance (their computation is not
// visible here)
for(int i = 0; i < tuples.size();i++)
subclusterassment1.push_back(label);
subclusterassment2.push_back(dist); }
// End of dokmeans; assigntuples starts on the same line.
// assigntuples: the only visible statement appends tuples[i] to
// clusters[label].
} void assigntuples(vectorclusters, vectortuples,tuple means)
} clusters[label].push_back(tuples[i]);
} }
// getdist: distance between t1 and t2 (body not visible in this listing).
float getdist(const tuple& t1, const tuple& t2)
// getval: returns `val`; how it is accumulated is not visible in this listing.
float getval(vectorclusters, tuple means)
return val;
}
// updatemeans: derives a mean from one cluster (body not visible in this
// listing).
tuple updatemeans(const vector& cluster_i)
實現結果:
k-Means(二分 k-均值演算法)
在上一節中我們已經講了 k-均值演算法,當時我們選取的質心是隨機選取的,沒有什麼依據,所以聚類的結果很可能出現誤差。為了降低這種誤差的出現,我們今天來研究一種優化的 k-均值演算法——二分 k-均值演算法。看到名稱我們就能明白,在每次劃分的時候都是將資料劃分成兩份,直到達到我們要求的聚類數。怎麼來分?選取哪一堆...
k means k均值聚類 及二分k均值聚類
from numpy import def load data file name data fr open file name for line in fr.readlines cur line line.strip split t flt line map float,cur line retu...
二分 Ybt 最大均值
給定正整數序列 $a$,求一個平均數最大的、長度不小於 $l$ 的(連續的)子段。為了方便計算,所有數都乘上 1000,然後二分均值。$n$ 的範圍到 $10^5$,考慮二分判斷結果如何在 $O(n)$ 或 $O(n \log n)$ 的複雜度內解決 我們...