# Machine Learning Algorithms 2: k-nearest neighbors (kNN) in practice

from numpy import *

from os import listdir

import operator

import time


def warrper(func=None):
    """Timing decorator: print how long each call to the wrapped function takes.

    Usable both as ``@warrper`` and as ``@warrper()``.

    NOTE(review): the surviving text only contained the timing statements and
    ``return warrper``; the call to the wrapped function was missing, so the
    decorator structure here is a reconstruction -- confirm against the
    author's original.

    Args:
        func: The function to time. When omitted, a decorator is returned so
            that a zero-argument call keeps working.

    Returns:
        The timed wrapper around ``func``, or a decorator when ``func`` is
        omitted. The wrapper returns whatever ``func`` returns.
    """
    import functools

    def decorate(target):
        @functools.wraps(target)
        def timed(*args, **kwargs):
            starttime = time.time()
            try:
                return target(*args, **kwargs)
            finally:
                # Report the elapsed time even when the wrapped call raises.
                runtime = time.time() - starttime
                print("the running time:", runtime)

        return timed

    if func is None:
        return decorate
    return decorate(func)

# Compute the distances, sort them, take the k smallest, and return the most common class among them

def classify0(inx, dataset, labels, k):
    """Classify ``inx`` by majority vote among its ``k`` nearest samples.

    Distances are Euclidean. When several classes receive the same number of
    votes, the class whose first vote came from the nearest sample wins
    (``sorted`` is stable, so insertion order is kept among ties).

    Args:
        inx: The query vector; anything that broadcasts against one row of
            ``dataset`` (a flat sequence or a 1 x d array).
        dataset: 2-D array-like with one sample per row.
        labels: Sequence of class labels, ``labels[i]`` belonging to row ``i``.
        k: Number of neighbours that vote. Values larger than the number of
            samples use every sample.

    Returns:
        The label with the most votes.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``dataset`` has no rows.
    """
    dataset = asarray(dataset, dtype=float)
    if k < 1:
        raise ValueError("k must be at least 1")
    if dataset.shape[0] == 0:
        raise ValueError("dataset must contain at least one sample")

    # Broadcasting subtracts the query from every row; the sign of the
    # difference is irrelevant because it is squared next.
    diffmat = dataset - asarray(inx, dtype=float)
    sqdiffmat = diffmat ** 2
    sqdistance = sqdiffmat.sum(axis=1)
    distance = sqdistance ** 0.5
    sorteddistances = distance.argsort()

    classcount = {}
    # Slicing (rather than range(k)) keeps k > number of samples from
    # running off the end of the index array.
    for index in sorteddistances[:k]:
        votelable = labels[index]
        classcount[votelable] = classcount.get(votelable, 0) + 1

    sortedclasscount = sorted(
        classcount.items(), key=operator.itemgetter(1), reverse=True
    )
    return sortedclasscount[0][0]

# Convert an image file into a single vector

def img2vetor(filename):
    """Read a 32 x 32 text image and flatten it into a 1 x 1024 vector.

    The first 32 characters of each of the first 32 lines are converted with
    ``int`` and stored row by row, so character ``j`` of line ``i`` lands at
    column ``32 * i + j``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A NumPy array of shape ``(1, 1024)``.

    Raises:
        IndexError: If a line has fewer than 32 characters (including when
            the file has fewer than 32 lines).
        ValueError: If a character is not a valid integer digit.
    """
    returnvect = zeros((1, 1024))
    # The context manager closes the file even if parsing fails part-way.
    with open(filename) as fr:
        for i in range(32):
            linestr = fr.readline()
            for j in range(32):
                returnvect[0, 32 * i + j] = int(linestr[j])
    return returnvect

# 呼叫訓練資料和測試資料

def handwritingclasstest():
    """Run the hand-written kNN classifier over the digit test set.

    Every file in ``./trainingdigits`` is loaded with ``img2vetor`` to build
    the training matrix, then every file in ``./testdigits`` is classified
    with ``classify0`` (k = 3). Each prediction is printed, followed by the
    total number of errors and the error rate.

    The class label is taken from the file name: the integer before the first
    underscore (presumably names look like ``<digit>_<index>.txt`` -- verify
    against the data set).

    NOTE(review): the label list initialiser and the ``append`` call were
    missing from the surviving text and have been restored.
    """
    hwlables = []
    trainingfilelist = listdir('./trainingdigits')
    m = len(trainingfilelist)
    trainingmat = zeros((m, 1024))
    for i, filenamestr in enumerate(trainingfilelist):
        filestr = filenamestr.split('.')[0]
        classnumstr = int(filestr.split('_')[0])
        hwlables.append(classnumstr)
        trainingmat[i, :] = img2vetor('./trainingdigits/%s' % filenamestr)

    testfilelist = listdir('./testdigits')
    errorcount = 0
    mtest = len(testfilelist)
    for testfilename in testfilelist:
        testclassnum = int(testfilename.split('_')[0])
        vectorundertest = img2vetor('./testdigits/%s' % testfilename)
        classresult = classify0(vectorundertest, trainingmat, hwlables, 3)
        print('the classifier come back with:%d,the real answer is %d' % (classresult, testclassnum))
        if classresult != testclassnum:
            errorcount += 1

    print("\nthe totle error is %d" % errorcount)
    # Guard against an empty test directory instead of dividing by zero.
    errorrate = errorcount / float(mtest) if mtest else 0.0
    print("\nthe totle error rate is %f" % errorrate)



the totle error is 10

the totle error rate is 0.010571

the running time: 38.14647126197815

process finished with exit code 0

import time
from os import listdir

from numpy import *
from sklearn.neighbors import KNeighborsClassifier

def warrper(func=None):
    """Timing decorator: print how long each call to the wrapped function takes.

    Usable both as ``@warrper`` and as ``@warrper()``.

    NOTE(review): the surviving text only contained the timing statements and
    ``return warrper``; the call to the wrapped function and the report line
    were missing, so the decorator structure here is a reconstruction --
    confirm against the author's original.

    Args:
        func: The function to time. When omitted, a decorator is returned so
            that a zero-argument call keeps working.

    Returns:
        The timed wrapper around ``func``, or a decorator when ``func`` is
        omitted. The wrapper returns whatever ``func`` returns.
    """
    import functools

    def decorate(target):
        @functools.wraps(target)
        def timed(*args, **kwargs):
            starttime = time.time()
            try:
                return target(*args, **kwargs)
            finally:
                # Report the elapsed time even when the wrapped call raises.
                runtime = time.time() - starttime
                print("the running time:", runtime)

        return timed

    if func is None:
        return decorate
    return decorate(func)

# 這一步是必須的,要把影象轉化成一維向量

def img2vector(filename):
    """Read a 32 x 32 text image and flatten it into a 1 x 1024 vector.

    The first 32 characters of each of the first 32 lines are converted with
    ``int`` and stored row by row, so character ``j`` of line ``i`` lands at
    column ``32 * i + j``.

    NOTE(review): the assignment inside the inner loop was missing from the
    surviving text (the vector would have stayed all zeros); it has been
    restored.

    Args:
        filename: Path of the text file to read.

    Returns:
        A NumPy array of shape ``(1, 1024)``.

    Raises:
        IndexError: If a line has fewer than 32 characters (including when
            the file has fewer than 32 lines).
        ValueError: If a character is not a valid integer digit.
    """
    returnvector = zeros((1, 1024))
    # The context manager closes the file even if parsing fails part-way.
    with open(filename) as fr:
        for i in range(32):
            linestr = fr.readline()
            for j in range(32):
                returnvector[0, 32 * i + j] = int(linestr[j])
    return returnvector

# Load the training image data and convert it to vectors

def training2vetor():
    """Load every training image into a matrix and collect its label.

    Each file in ``./trainingdigits`` becomes one 1024-element row, loaded
    with ``img2vector``. The label is the integer before the first underscore
    of the file name.

    NOTE(review): the label list initialiser and the ``append`` call were
    missing from the surviving text and have been restored.

    Returns:
        A tuple ``(trainmat, hwlabels)``: an ``(m, 1024)`` NumPy array and a
        list of ``m`` integer labels, in ``listdir`` order.
    """
    trainingfilelist = listdir('./trainingdigits')
    m = len(trainingfilelist)
    trainmat = zeros((m, 1024))
    hwlabels = []
    for i, trainingname in enumerate(trainingfilelist):
        trainmat[i, :] = img2vector('./trainingdigits/%s' % trainingname)
        trainingnum = int(trainingname.split('_')[0])
        hwlabels.append(trainingnum)
    return trainmat, hwlabels

# 對測試資料進行測試

def testclass():
    """Run scikit-learn's kNN classifier over the digit test set.

    A ``KNeighborsClassifier`` (3 neighbours, kd-tree, all CPU cores) is
    fitted on the data returned by ``training2vetor``. Every file in
    ``./testdigits`` is then loaded with ``img2vector`` and classified; the
    expected label is the integer before the first underscore of the file
    name. The total number of errors and the error rate are printed.

    NOTE(review): the ``fit`` call, the ``predict`` call and the error-count
    increment were missing from the surviving text and are reconstructed
    here; the unused ``testlabels`` assignment was dropped.
    """
    clf = KNeighborsClassifier(n_neighbors=3, algorithm='kd_tree', n_jobs=-1)
    trainmat, hwlabels = training2vetor()
    clf.fit(trainmat, hwlabels)

    testclasslist = listdir('./testdigits')
    mtest = len(testclasslist)
    errorcount = 0
    for testname in testclasslist:
        testnum = int(testname.split('_')[0])
        testvector = img2vector('./testdigits/%s' % testname)
        # predict returns one label per input row; a single row is passed.
        testresult = clf.predict(testvector)[0]
        if testresult != testnum:
            errorcount += 1

    print("\nthe totle error is %d" % errorcount)
    # Guard against an empty test directory instead of dividing by zero.
    errorrate = errorcount / float(mtest) if mtest else 0.0
    print("\nthe totle error rate is %f" % errorrate)


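# After running both versions, the library-based one turned out to be noticeably slower than the hand-written code, so a decorator was added to compare the running times of the two programs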


the totle error is 12

the totle error rate is 0.012685


process finished with exit code 0

