from numpy import *
def loaddataset(filename):
    """Load a tab-delimited numeric data file.

    filename: path to a text file, one sample per line, fields separated
        by tab characters.
    Returns a list of rows, each row a list of floats.
    """
    datamat = []
    # 'with' guarantees the file is closed (the original leaked the handle)
    with open(filename) as fr:
        for line in fr:
            curline = line.strip().split('\t')
            # map() is lazy in Python 3 -- materialize to a concrete list;
            # the original also never appended the row, so it returned []
            fltline = list(map(float, curline))
            datamat.append(fltline)
    return datamat
def disteclude(veca, vecb):
    """Euclidean distance between two 1xN numpy row vectors/matrices.

    Bug fix: the original called `(veca-vecb).a`, which raises
    AttributeError -- numpy matrices expose `.A` (uppercase), and no
    conversion is needed for element-wise power/sum anyway.
    """
    return sqrt(sum(power(veca - vecb, 2)))
def randcent(dataset, k):
    """Generate k random initial centroids inside the data's bounding box.

    dataset: (m, n) numpy matrix of samples.
    k: number of centroids to create.
    Returns a (k, n) matrix; each column j is drawn uniformly from
    [min(column j), max(column j)], so every centroid lies within the
    observed range of the data.
    (The Python-2 debug `print` statement was removed -- it was a
    SyntaxError under Python 3.)
    """
    n = shape(dataset)[1]
    centroids = mat(zeros((k, n)))
    for j in range(n):
        minj = min(dataset[:, j])
        rangej = float(max(dataset[:, j]) - minj)
        # one uniform draw per centroid, scaled into the column's range
        centroids[:, j] = minj + rangej * random.rand(k, 1)
    return centroids
def kmeans(dataset, k, distmeas=disteclude, createcent=randcent):
    """Standard k-means clustering.

    dataset: (m, n) numpy matrix of samples.
    k: number of clusters.
    distmeas: distance function over two row vectors.
    createcent: function producing the initial (k, n) centroid matrix.
    Returns (centroids, clusterassment), where clusterassment[i] holds
    [assigned cluster index, squared distance to that centroid].

    Bug fixes vs. the original:
    - `true`/`false` were NameErrors (Python booleans are True/False);
    - the convergence flag was overwritten for EVERY point, so only the
      last point decided whether to keep iterating -- it must be reset
      once per pass and set whenever ANY point changes cluster;
    - `.a` -> `.A` on the matrix-to-array conversion;
    - empty clusters are skipped (mean of an empty selection is nan);
    - Python-2 debug print removed.
    """
    m = shape(dataset)[0]
    clusterassment = mat(zeros((m, 2)))
    centroids = createcent(dataset, k)
    clusterchanged = True
    while clusterchanged:
        clusterchanged = False
        for i in range(m):
            mindist = inf
            minindex = -1
            for j in range(k):
                distji = distmeas(centroids[j, :], dataset[i, :])
                if distji < mindist:
                    mindist = distji
                    minindex = j
            if clusterassment[i, 0] != minindex:
                clusterchanged = True
            clusterassment[i, :] = minindex, mindist ** 2
        for cent in range(k):
            # rows currently assigned to this centroid
            ptsinclust = dataset[nonzero(clusterassment[:, 0].A == cent)[0]]
            if len(ptsinclust) > 0:  # guard: mean of empty cluster is nan
                centroids[cent, :] = mean(ptsinclust, axis=0)
    return centroids, clusterassment
# dataset = loaddataset('testset.txt')
# kmeans(mat(dataset), 3, disteclude, randcent)
def bikmeans(dataset, k, distmeas=disteclude):
    """Bisecting k-means: repeatedly 2-way-split the cluster whose split
    yields the lowest total SSE, until k clusters exist.

    dataset: (m, n) numpy matrix of samples.
    k: target number of clusters.
    distmeas: distance function over two row vectors.
    Returns (centlist, clusterassment): centlist is a list of k centroid
    coordinate lists; clusterassment[i] is [cluster index, squared error].

    Bug fixes vs. the original:
    - the second new centroid was NEVER appended to centlist, so
      len(centlist) never grew and `while len(centlist) < k` looped
      forever; the book's `centlist.append(...)` is restored;
    - the replaced centroid is stored as a plain list via `.tolist()[0]`
      for consistency with centroid0;
    - `.a` -> `.A`; Python-2 prints removed.
    """
    m = shape(dataset)[0]
    clusterassment = mat(zeros((m, 2)))
    # start with one cluster whose centroid is the overall mean
    centroid0 = mean(dataset, axis=0).tolist()[0]
    centlist = [centroid0]
    for j in range(m):
        clusterassment[j, 1] = distmeas(mat(centroid0), dataset[j, :]) ** 2
    while len(centlist) < k:
        lowestsse = inf
        for i in range(len(centlist)):
            # candidate: 2-way split of cluster i
            ptsincurrcluster = dataset[nonzero(clusterassment[:, 0].A == i)[0], :]
            centroidmat, splitclustass = kmeans(ptsincurrcluster, 2, distmeas)
            ssesplit = sum(splitclustass[:, 1])
            ssenotsplit = sum(clusterassment[nonzero(clusterassment[:, 0].A != i)[0], 1])
            if (ssesplit + ssenotsplit) < lowestsse:
                bestcenttosplit = i
                bestnewcents = centroidmat
                bestclustass = splitclustass.copy()
                lowestsse = ssesplit + ssenotsplit
        # relabel the two halves: half 1 becomes a brand-new cluster index,
        # half 0 keeps the index of the cluster that was split
        bestclustass[nonzero(bestclustass[:, 0].A == 1)[0], 0] = len(centlist)
        bestclustass[nonzero(bestclustass[:, 0].A == 0)[0], 0] = bestcenttosplit
        # replace the split centroid and APPEND the new one (missing in
        # the original, which made the while-loop non-terminating)
        centlist[bestcenttosplit] = bestnewcents[0, :].tolist()[0]
        centlist.append(bestnewcents[1, :].tolist()[0])
        # write the relabeled assignments back for the points that were split
        clusterassment[nonzero(clusterassment[:, 0].A == bestcenttosplit)[0], :] = bestclustass
    return centlist, clusterassment
# datmat = mat(loaddataset('testset2.txt'))
# centlist, mynewassments = bikmeans(datmat, 3)
# print centlist
機器學習實戰Kmeans
from numpy import *; import matplotlib.pyplot as plt; import pandas as pd — load dataset: url, names = [sepal length, sepal width, petal length, petal width, class] d...
機器學習實戰之K Means聚類
俗話說的好 物以類聚,人以群分 今天我們要講的聚類演算法很大程度上可以印證此話。聚類是一種非監督學習,什麼是非監督學習?與之前學習的分類和回歸不同 監督學習 監督學習是有有label標籤的,而非監督學習沒有。我們再回到聚類上,聚類是把相似的物件歸到同一簇中,有點像全自動分類。聚類的應用場景有很多,例...
機器學習機器學習實戰 kmeans
簡介 聚類演算法是一種無監督學習,它將相似的物件歸類到同一簇中。聚類的方法可以應用所有的物件,簇內的物件越相似,聚類效果也就越好。聚類和分類的最大不同之處在於,分類的目標是已知的,聚類是完全無監督學習,類別沒有像分類那樣被預先定義出來,所以叫做無監督學習。kmeans演算法是實際中最常用的聚類演算法...