# kmean.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
#coding=utf-8
from numpy import *
# Load a tab-delimited data file.
def loaddataset(filename):
    """Parse *filename* (tab-separated floats) into a list of row lists.

    Each line becomes a list of floats; the result is suitable for
    wrapping in ``mat(...)``.
    """
    datamat = []
    # 'with' guarantees the file handle is closed (the original leaked it).
    with open(filename) as fr:
        for line in fr:
            curline = line.strip().split('\t')
            # list(...) is required on Python 3, where map() is lazy.
            fltline = list(map(float, curline))
            # The scraped original dropped this append, so it returned [].
            datamat.append(fltline)
    return datamat
# Euclidean distance between two vectors.
def disteclud(veca, vecb):
    """Return the Euclidean (L2) distance between row vectors *veca* and *vecb*."""
    return sqrt(sum(power(veca - vecb, 2)))
# Build a set of k random centroids within the bounds of the data.
def randcent(dataset, k):
    """Return a (k, n) matrix of centroids drawn uniformly within each
    column's [min, max] range of *dataset*.
    """
    n = shape(dataset)[1]
    centroids = mat(zeros((k, n)))
    for j in range(n):
        minj = min(dataset[:, j])
        # Column range; float() collapses the 1x1 matrix result to a scalar.
        rangej = float(max(dataset[:, j] - minj))
        # random.rand(k, 1) draws k uniform samples in [0, 1).
        centroids[:, j] = minj + rangej * random.rand(k, 1)
    return centroids
# k-means clustering.
def kmeans(dataset, k, distmeas=disteclud, createcent=randcent):
    """Cluster *dataset* into *k* clusters with Lloyd's algorithm.

    Parameters:
        dataset: (m, n) matrix of data points, one row per point.
        k: number of clusters.
        distmeas: distance function between two row vectors.
        createcent: function (dataset, k) -> initial (k, n) centroids.

    Returns (centroids, clusterassment), where clusterassment is an
    (m, 2) matrix holding [cluster index, squared distance to centroid]
    per point.
    """
    m = shape(dataset)[0]  # total number of data points
    # Column 0: assigned cluster index; column 1: squared error.
    clusterassment = mat(zeros((m, 2)))
    centroids = createcent(dataset, k)
    clusterchanged = True  # original had lowercase 'true' -> NameError
    # Repeat assign-points / recompute-centroids until assignments settle.
    while clusterchanged:
        clusterchanged = False
        for i in range(m):
            mindist = inf
            minindex = -1
            # Find the centroid nearest to point i.
            for j in range(k):
                distji = distmeas(centroids[j, :], dataset[i, :])
                if distji < mindist:
                    mindist = distji
                    minindex = j
            if clusterassment[i, 0] != minindex:
                clusterchanged = True
            clusterassment[i, :] = minindex, mindist ** 2
        # Recompute each centroid as the mean of its assigned points.
        for cent in range(k):
            # matrix attribute is '.A' (ndarray view); '.a' does not exist.
            ptsinclust = dataset[nonzero(clusterassment[:, 0].A == cent)[0]]
            if len(ptsinclust) > 0:  # guard: mean of an empty set is NaN
                centroids[cent, :] = mean(ptsinclust, axis=0)
    return centroids, clusterassment
# Bisecting k-means clustering.
def bikeans(dataset, k, distmeas=disteclud):
    """Bisecting k-means: start from one cluster, repeatedly 2-split the
    cluster whose split yields the lowest total SSE, until *k* clusters.

    Parameters:
        dataset: (m, n) matrix of data points.
        k: desired number of clusters.
        distmeas: distance function between two row vectors.

    Returns (centroids, clusterassment) in the same format as kmeans().
    """
    m = shape(dataset)[0]
    # Column 0: cluster index; column 1: squared error per point.
    clusterassment = mat(zeros((m, 2)))
    centroid0 = mean(dataset, axis=0).tolist()[0]  # single initial cluster
    cenlist = [centroid0]  # running list of all centroids
    for j in range(m):
        clusterassment[j, 1] = distmeas(mat(centroid0), dataset[j, :]) ** 2
    while len(cenlist) < k:
        lowestsse = inf
        for i in range(len(cenlist)):
            # Treat cluster i's points as a small dataset and 2-split it.
            ptsincurrcluster = dataset[nonzero(clusterassment[:, 0].A == i)[0], :]
            centroidmat, splitclustass = kmeans(ptsincurrcluster, 2, distmeas)
            ssesplit = sum(splitclustass[:, 1])  # SSE of the split cluster
            # SSE of all points NOT in cluster i.
            ssenotsplit = sum(clusterassment[nonzero(clusterassment[:, 0].A != i)[0], 1])
            if (ssesplit + ssenotsplit) < lowestsse:
                bestcenttosplit = i
                bestnewcents = centroidmat
                bestclusass = splitclustass.copy()
                lowestsse = ssesplit + ssenotsplit
        # Remap the local 0/1 labels from the 2-means split to global ids.
        # '==1' MUST be relabeled first: if bestcenttosplit == 1, doing
        # '==0' first would merge both halves into label 1.
        bestclusass[nonzero(bestclusass[:, 0].A == 1)[0], 0] = len(cenlist)
        bestclusass[nonzero(bestclusass[:, 0].A == 0)[0], 0] = bestcenttosplit
        print('the bestcenttosplit is: ', bestcenttosplit)
        print('the len of bestclustass is: ', len(bestclusass))
        # Replace the split centroid and append the new one; the scraped
        # original lost the append, so len(cenlist) never grew (infinite loop).
        cenlist[bestcenttosplit] = bestnewcents[0, :].tolist()[0]
        cenlist.append(bestnewcents[1, :].tolist()[0])
        clusterassment[nonzero(clusterassment[:, 0].A == bestcenttosplit)[0], :] = bestclusass
    return mat(cenlist), clusterassment
Test (interactive session):
>>> import kmean
>>> datamat = mat(loaddataset('testset2.txt'))
>>> centlist, myassments = bikeans(datamat, 3)
the bestcenttosplit is: 0
the len of bestclustass is: 60
the bestcenttosplit is: 0
the len of bestclustass is: 40
>>> centlist
matrix([[-2.94737575, 3.3263781 ],
[-0.45965615, -2.7782156 ],
[ 2.93386365, 3.12782785]])
>>> myassments
matrix([[ 2.00000000e+00, 1.45461050e-01],
[ 0.00000000e+00, 6.80213825e-01],
[ 1.00000000e+00, 1.02184582e+00],
[ 2.00000000e+00, 1.34548760e+00],
[ 0.00000000e+00, 1.35376464e+00],
[ 1.00000000e+00, 3.87167519e+00],
[ 2.00000000e+00, 8.37259951e-01],
[ 0.00000000e+00, 2.20116272e-01],
......
機器學習 利用K 均值聚類演算法對未標註資料分組
聚類是一種無監督的學習,它將相似的物件歸到同乙個簇中。有點像全自動分類。聚類方法幾乎可以應用於所有物件,簇內的物件越相似,聚類的效果越好。聚類分析試圖將相似物件歸入同一簇,將不相似物件歸到不同簇。相似這一概念取決於所選的相似度計算方法。優點 易於實現。缺點 可能收斂到區域性最小值,在大規模資料集上收...
利用K means聚類演算法對未標註資料分組
def loaddataset filename datamat assume last column is target value fr open filename for line in fr.readlines curline line.strip split t fltline map f...
k均值聚類演算法
輸入 簇的數目k和包含n個物件的資料庫。輸出 k個簇,使平方誤差準則最小。演算法步驟 1.為每個聚類確定乙個初始聚類中心,這樣就有k 個初始聚類中心。2.將樣本集中的樣本按照最小距離原則分配到最鄰近聚類 3.使用每個聚類中的樣本均值作為新的聚類中心。4.重複步驟2.3直到聚類中心不再變化。5.結束,...