from numpy import *
import operator
def createdataset():
    """Build a tiny hard-coded dataset for smoke-testing the kNN classifier.

    Returns:
        group: (4, 2) ndarray of 2-D feature vectors.
        labels: list of 4 class labels matching the rows of ``group``.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['a', 'a', 'b', 'b']
    return group, labels
def classify0(inx, dataset, labels, k):
    """k-nearest-neighbours classifier (majority vote among the k closest rows).

    Args:
        inx: 1-D feature vector to classify.
        dataset: (m, n) ndarray of training feature vectors.
        labels: length-m sequence of training labels, row-aligned with dataset.
        k: number of nearest neighbours that vote.

    Returns:
        The label with the most votes among the k nearest training vectors.
    """
    datasetsize = dataset.shape[0]
    # Euclidean distance from inx to every training row, then rank by distance.
    diffmat = tile(inx, (datasetsize, 1)) - dataset
    distances = ((diffmat ** 2).sum(axis=1)) ** 0.5
    sorteddistindicies = distances.argsort()
    classcount = {}  # fix: original line was "classcount =" — missing dict literal
    for i in range(k):
        voteilabel = labels[sorteddistindicies[i]]
        classcount[voteilabel] = classcount.get(voteilabel, 0) + 1
    # fix: "reverse=true" — Python's boolean constant is True (lowercase true is a NameError)
    sortedclasscount = sorted(classcount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedclasscount[0][0]
def file2matrix(filename):
    """Parse a tab-separated text file into a NumPy feature matrix plus labels.

    Each line is expected to hold three numeric feature columns followed by
    an integer class label in the last column.

    Args:
        filename: path to the tab-separated data file.

    Returns:
        returnmat: (numlines, 3) ndarray of features.
        classlabelvector: list of int class labels, one per line.
    """
    # with-statement closes the file even if parsing raises.
    with open(filename) as fr:
        arrayolines = fr.readlines()
    numberoflines = len(arrayolines)
    returnmat = zeros((numberoflines, 3))
    classlabelvector = []  # fix: original line was "classlabelvector =" — missing list literal
    index = 0
    for line in arrayolines:
        line = line.strip()
        listfromline = line.split('\t')
        returnmat[index, :] = listfromline[0:3]
        # fix: the original never collected the label column, so callers
        # (datingclasstest / classifyperson) would index an empty list.
        classlabelvector.append(int(listfromline[-1]))
        index += 1
    return returnmat, classlabelvector
def autonorm(dataset):
    """Min-max normalise each feature column of ``dataset`` into [0, 1].

    Args:
        dataset: (m, n) ndarray of raw feature values.

    Returns:
        normdataset: (m, n) ndarray with each column rescaled to [0, 1].
        ranges: per-column (max - min), needed to normalise new samples.
        minvals: per-column minimum, needed to normalise new samples.
    """
    minvals = dataset.min(0)   # column-wise minimum
    maxvals = dataset.max(0)   # column-wise maximum
    ranges = maxvals - minvals
    m = dataset.shape[0]
    # (x - min) / (max - min), tiled so the per-column stats match each row.
    normdataset = dataset - tile(minvals, (m, 1))
    normdataset = normdataset / tile(ranges, (m, 1))
    return normdataset, ranges, minvals
def datingclasstest():
'''分類器針對約會**的測試**'''
horatio = 0.10
datingdatamat, datinglabels = file2matrix('datingtestset2.txt')
normmat, ranges, minvals = autonorm(datingdatamat)
m = normmat.shape[0]
numtestvecs = int(m*horatio)
errorcount = 0.0
for i in range(numtestvecs):
classifierresult = classify0(normmat[i,:],normmat[numtestvecs:m,:],datinglabels[numtestvecs:m],3)
print("the classifier came back with: %d, the real answer is: %d" %(classifierresult, datinglabels[i]))
if(classifierresult != datinglabels[i]):
errorcount += 1.0
print("the total error rate is : %f"%(errorcount/float(numtestvecs)))
def classifyperson():
    """Interactively classify a dating candidate from three typed-in features.

    Prompts for the three features, normalises them with the training data's
    ranges, and prints the predicted attractiveness category.
    Reads 'datingtestset2.txt' from the current working directory.
    """
    resultlist = ['not at all', 'in small doses', 'in large doses']
    percenttats = float(input("percentage of time spent playing video games?"))
    # fix: this prompt previously duplicated the ice-cream question;
    # the variable holds frequent-flier miles.
    ffmiles = float(input("frequent flier miles earned per year?"))
    icecream = float(input("liters of ice cream consumed per year?"))
    datingdatamat, datinglabels = file2matrix('datingtestset2.txt')
    normmat, ranges, minvals = autonorm(datingdatamat)
    inarr = array([ffmiles, percenttats, icecream])
    # Normalise the new sample with the same min/range as the training data.
    classifierresult = classify0((inarr - minvals) / ranges, normmat,
                                 datinglabels, 3)
    # Labels are 1-based ints; shift to index resultlist.
    print("you will probably like this person: ",
          resultlist[classifierresult - 1])
# 示列:手寫識別系統
from os import listdir
def img2vector(filename):
    """Read a 32x32 text image of '0'/'1' characters into a (1, 1024) vector.

    Args:
        filename: path to a file whose first 32 lines each contain
            at least 32 digit characters.

    Returns:
        (1, 1024) ndarray holding the digits row-by-row.
    """
    returnvect = zeros((1, 1024))
    # with-statement closes the file even if a line is malformed.
    with open(filename) as fr:
        for i in range(32):
            linestr = fr.readline()
            for j in range(32):
                returnvect[0, 32 * i + j] = int(linestr[j])
    return returnvect
def handwritingclasstest():
    """Evaluate the kNN classifier on the handwritten-digit image files.

    Loads every file under 'trainingdigits/' as a training vector (the digit
    label is parsed from the filename, e.g. '9_45.txt' -> 9), then classifies
    every file under 'testdigits/' and prints the error count and rate.
    """
    hwlabels = []  # fix: original line was "hwlabels =" — missing list literal
    trainingfilelist = listdir('trainingdigits')
    m = len(trainingfilelist)
    trainingmat = zeros((m, 1024))
    for i in range(m):
        filenamestr = trainingfilelist[i]
        # Filename pattern: "<digit>_<index>.txt"; the digit is the label.
        classnumstr = int(filenamestr.split('.')[0].split('_')[0])
        # fix: the original never stored the training label, so classify0
        # would index an empty label list.
        hwlabels.append(classnumstr)
        trainingmat[i, :] = img2vector('trainingdigits/%s' % filenamestr)
    testfilelist = listdir('testdigits')
    errorcount = 0.0
    mtest = len(testfilelist)
    for i in range(mtest):
        filenamestr = testfilelist[i]
        classnumstr = int(filenamestr.split('.')[0].split('_')[0])
        vectorundertest = img2vector('testdigits/%s' % filenamestr)
        classifierresult = classify0(vectorundertest, trainingmat, hwlabels, 3)
        print("the classifier came back with: %d, the real number is: %d"
              % (classifierresult, classnumstr))
        if classifierresult != classnumstr:
            errorcount += 1.0
    print("\nthe total number of errors is: %d" % errorcount)
    print("\nthe total error rate is: %f" % (errorcount / float(mtest)))
Python最近鄰演算法(KNN)
近朱者赤,近墨者黑。學習筆記 knn:一個樣本在特徵空間中,總有 k 個與之最相似(即特徵空間中最鄰近)的樣本。其中,大多數屬於某一類別,則該樣本也屬於這個類別。計算步驟:1. 算距離:算出測試樣本到訓練集中每個樣本的距離,例如歐氏距離;2. 找鄰居:找出距離最近的 k 個訓練物件(k 值的選取:交叉驗證);3...
最近鄰規則演算法(KNN)
最近鄰演算法 knn 是一個基於例項學習的分類演算法。如果一個例項在特徵空間中的 k 個最相似(即特徵空間中最近鄰)的例項中的大多數屬於某一個類別,則該例項也屬於這個類別。所選擇的鄰居都是已經正確分類的例項。演算法步驟:1. 把所有分類好的(有標籤 label 的)資料(例項)作為訓練集;2. 選擇好引數 k;3...
KNN 最近鄰演算法初探
本文質量不咋地,目的是記錄一下自己的學習,還有一個原因是我發現別人的部落格寫得太好了!knn 演算法屬於監督學習的演算法,基本原理是對整個資料整體進行打標籤之後,對一個新的元素根據其在向量空間中的位置來對其分類。k 近鄰演算法是在訓練資料集中找到與該例項最鄰近的 k 個例項,這 k 個例項的多數屬於某個類,我們就...