'''
knn.py
'''
import operator

import numpy as np
from numpy import *
def createdataset():
    '''Return a tiny hand-made training set for trying out the classifier.

    Returns:
        A ``(group, labels)`` tuple. ``group`` is a 4x2 NumPy array holding
        one sample point per row and ``labels`` is the list of class labels,
        ``labels[i]`` belonging to ``group[i]``.
    '''
    group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['a', 'a', 'b', 'b']
    return group, labels
def classify0(intx, dataset, labels, k):
    '''Classify ``intx`` by majority vote among its k nearest neighbours.

    Args:
        intx: Input vector; its length must match the number of columns of
            ``dataset``.
        dataset: 2-D array-like of training samples, one sample per row.
        labels: Sequence of class labels, ``labels[i]`` belonging to row
            ``i`` of ``dataset``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that collected the most votes. On Python 3.7+ a tie goes
        to the label that was met first while walking the neighbours from
        nearest to farthest.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            rows in ``dataset``.
    '''
    samples = np.asarray(dataset, dtype=float)
    datasetsize = samples.shape[0]
    if not 1 <= k <= datasetsize:
        raise ValueError(
            "k must be between 1 and %d, got %r" % (datasetsize, k))

    # Euclidean distance from intx to every sample. Broadcasting subtracts
    # intx from each row, so no explicit tile() of the input is needed.
    diffmat = samples - np.asarray(intx, dtype=float)
    sqdistances = (diffmat ** 2).sum(axis=1)  # axis=1: sum within each row
    distances = sqdistances ** 0.5

    # argsort() yields the row indices ordered from smallest distance up.
    nearest = distances.argsort()[:k]

    # Count how often each label occurs among the k nearest samples.
    classcount = {}
    for idx in nearest:
        voteilabel = labels[idx]
        classcount[voteilabel] = classcount.get(voteilabel, 0) + 1

    # Pick the (label, votes) pair with the highest vote count.
    return max(classcount.items(), key=operator.itemgetter(1))[0]
def file2matrix(filename):
    '''Load a tab-separated data file into a feature matrix and label list.

    Every non-blank line is expected to hold at least three numeric feature
    columns followed by an integer class label in the last column.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(returnmat, classlabelvector)`` tuple. ``returnmat`` is an
        ``(n, 3)`` float array with the first three columns of each line and
        ``classlabelvector`` is the list of the ``n`` integer labels.

    Raises:
        ValueError: If a line has fewer than three feature columns or holds
            a value that cannot be parsed as a number.
    '''
    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        # strip() removes the trailing newline; blank lines are skipped so
        # they cannot leave all-zero rows in the matrix.
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnmat = np.zeros((len(rows), 3))  # three feature columns
    classlabelvector = []
    for index, fields in enumerate(rows):
        returnmat[index, :] = [float(value) for value in fields[:3]]
        # The callers format labels with %d and use them as list indices,
        # so the last column is stored as an int.
        classlabelvector.append(int(fields[-1]))
    return returnmat, classlabelvector
def autonorm(dataset):
    '''Rescale every column of ``dataset`` to the range [0, 1].

    Applies min-max normalisation column by column:
    ``newvalue = (oldvalue - minvalue) / (maxvalue - minvalue)``.

    Args:
        dataset: 2-D array-like, one sample per row.

    Returns:
        A ``(normdataset, ranges, minvals)`` tuple: the normalised matrix,
        the per-column ``max - min`` and the per-column minimum. The last
        two let a caller normalise new samples the same way.
    '''
    dataset = np.asarray(dataset, dtype=float)
    minvals = dataset.min(0)  # axis 0: minimum of every column
    maxvals = dataset.max(0)
    ranges = maxvals - minvals

    # A constant column has range 0; divide by 1 instead so it normalises
    # to 0 rather than NaN. The returned ``ranges`` keeps the true values.
    divisors = np.where(ranges == 0, 1.0, ranges)

    # Broadcasting applies the per-column vectors to every row.
    normdataset = (dataset - minvals) / divisors
    return normdataset, ranges, minvals
def datingdatatest():
    '''Measure the classifier's error rate on ``datingtestset2.txt``.

    The first 10% of the normalised samples are held out as test vectors and
    classified against the remaining 90% with k = 3. Each prediction is
    printed next to the real label, followed by the overall error rate.

    Raises:
        ValueError: If the file holds too few samples to form a test set.
    '''
    horatio = 0.10  # fraction of the samples held out for testing
    datingdatamat, datinglabels = file2matrix('datingtestset2.txt')
    normmat, ranges, minvals = autonorm(datingdatamat)
    m = normmat.shape[0]
    numtestvecs = int(m * horatio)
    if numtestvecs == 0:
        # Without this guard the error-rate division below would fail.
        raise ValueError("not enough samples to hold out a test set")

    errorcount = 0.0
    for i in range(numtestvecs):
        # Rows [0, numtestvecs) are the test vectors; the rest is the
        # training set the classifier votes over.
        classifierresult = classify0(normmat[i, :],
                                     normmat[numtestvecs:m, :],
                                     datinglabels[numtestvecs:m],
                                     3)
        print("the classifier came back with: %d, the real answer is:%d"
              % (classifierresult, datinglabels[i]))
        if classifierresult != datinglabels[i]:
            errorcount += 1.0
    print("the total error rate is: %f" % (errorcount / float(numtestvecs)))
def classifyperson():
    '''Ask for one person's three features and print the predicted class.

    Reads the three values from standard input, normalises them with the
    ranges and minimums of ``datingtestset2.txt``, classifies the result
    with k = 3 and prints the matching entry of ``resultlist``.
    '''
    # raw_input only exists on Python 2; Python 3 renamed it to input.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    resultlist = ['not at all', 'in small doses', 'in large doses']
    percenttats = float(read_line(
        "percentage of time spent playing video games?"))
    ffmiles = float(read_line(
        "frequent flier miles earned per year?"))
    icecream = float(read_line(
        "liters of ice cream consumed per year?"))

    datingdatamat, datinglabels = file2matrix('datingtestset2.txt')
    normmat, ranges, minvals = autonorm(datingdatamat)
    inarr = np.array([percenttats, ffmiles, icecream])
    # The new sample must be scaled exactly like the training data.
    classifierresult = classify0((inarr - minvals) / ranges,
                                 normmat,
                                 datinglabels,
                                 3)
    # Labels are used 1-based here, hence the -1 when indexing resultlist.
    print("you will probably like this person: ",
          resultlist[classifierresult - 1])
《機器學習實戰》筆記(三) Ch3 決策樹
資訊增益、熵、劃分資料集、遞迴構建決策樹、測試演算法(使用決策樹執行分類)、使用演算法(決策樹的儲存)。例子:使用決策樹預測隱形眼鏡型別。目標:通過決策樹預測患者需要佩戴的隱形眼鏡型別。fr = open('lenses.txt'); lenses = [inst.strip().split('\t') for inst in fr.rea...
機器學習實戰
花了一段時間,總算把《機器學習實戰》粗讀了一遍,重點就在這個粗讀上。這本書的確不錯,機器學習的幾個經典演算法都涉及了,每個演算法都有1~2個實際例子進行說明,都有實實在在的程式碼,讓我想起了Linus的「Talk is cheap, show me the code」那句名言。但多年來養成的習慣,從來都是喜...
ch1機器學習基礎
分類:將例項資料劃分到合適的分類中。回歸:通過給定資料點擬合最優曲線,從而預測數值型資料。以上兩個任務都屬於監督學習,因為這類演算法必須知道預測什麼,即目標變數的分類資訊或目標數值。無監督學習:資料沒有類別資訊,也不會給出目標值。聚類:將資料集分成由類似的物件組成的多個類的過程。密度估計:將尋找描述資料統...