import operator
from math import log
def calcshannonent(dataset):
    """Compute the Shannon entropy of a dataset whose last column is the class label."""
    numentries = len(dataset)
    labelcounts = {}
    # Count how many samples carry each class label
    for featvec in dataset:
        currentlabel = featvec[-1]
        if currentlabel not in labelcounts:
            labelcounts[currentlabel] = 0
        labelcounts[currentlabel] += 1
    shannonent = 0.0
    # H = -sum over labels of p * log2(p)
    for key in labelcounts:
        prob = float(labelcounts[key]) / numentries
        shannonent -= prob * log(prob, 2)
    return shannonent
def createdataset():
    """Return the toy fish-identification dataset and its feature labels."""
    dataset = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataset, labels
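# Quick check of the two functions above (a minimal usage sketch): the toy set
# has 2 'yes' and 3 'no' labels, so its entropy should be
# -(2/5)*log2(2/5) - (3/5)*log2(3/5), roughly 0.971.
mydata, mylabels = createdataset()
print(calcshannonent(mydata))  # 0.9709505944546686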
def splitdataset(dataset, axis, value):
    """Return the samples whose feature at index `axis` equals `value`, with that feature removed."""
    # Python passes the list by reference; build a new list so the caller's
    # dataset is never modified.
    retdataset = []
    for featvec in dataset:
        if featvec[axis] == value:
            reducedfeatvec = featvec[:axis]           # features before the split axis
            reducedfeatvec.extend(featvec[axis + 1:])  # features after the split axis
            retdataset.append(reducedfeatvec)
    return retdataset
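# Example split (usage sketch, reusing mydata from above): keep only the
# samples whose first feature (axis 0) equals 1, stripping that feature.
print(splitdataset(mydata, 0, 1))  # [[1, 'yes'], [1, 'yes'], [0, 'no']]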
def choosebestfeaturetosplit(dataset):
    """Pick the feature index whose split yields the largest information gain."""
    numfeatures = len(dataset[0]) - 1  # the last column is the class label
    baseentropy = calcshannonent(dataset)
    bestinfogain = 0.0
    bestfeature = -1
    for i in range(numfeatures):
        featlist = [example[i] for example in dataset]
        uniquevals = set(featlist)
        newentropy = 0.0
        # Weighted sum of the entropies of the subsets produced by splitting on feature i
        for value in uniquevals:
            subdataset = splitdataset(dataset, i, value)
            prob = len(subdataset) / float(len(dataset))
            newentropy += prob * calcshannonent(subdataset)
        infogain = baseentropy - newentropy  # information gain of splitting on feature i
        if infogain > bestinfogain:
            bestinfogain = infogain
            bestfeature = i
    return bestfeature
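# On the toy data, splitting on feature 0 ('no surfacing') gives the larger
# information gain, so index 0 should come back (sketch using mydata above).
print(choosebestfeaturetosplit(mydata))  # 0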
def majoritycnt(classlist):
    """Return the class label that occurs most often in classlist (majority vote)."""
    classcount = {}
    for vote in classlist:
        if vote not in classcount:
            classcount[vote] = 0
        classcount[vote] += 1
    sortedclasscount = sorted(classcount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedclasscount[0][0]
def createtree(dataset, labels):
    """Recursively build the decision tree, stored as nested dictionaries."""
    classlist = [example[-1] for example in dataset]
    # Stop if every sample in this branch already has the same class label
    if classlist.count(classlist[0]) == len(classlist):
        return classlist[0]
    # Stop if all features are used up; fall back to a majority vote
    if len(dataset[0]) == 1:
        return majoritycnt(classlist)
    bestfeat = choosebestfeaturetosplit(dataset)
    bestfeatlabel = labels[bestfeat]
    mytree = {bestfeatlabel: {}}  # initialize mytree; the result is stored as a dict
    del(labels[bestfeat])
    featvalues = [example[bestfeat] for example in dataset]
    uniquevals = set(featvalues)
    for value in uniquevals:
        # Copy the remaining labels (the best one removed) before recursing,
        # so the recursive calls don't interfere with each other
        sublabels = labels[:]
        mytree[bestfeatlabel][value] = createtree(splitdataset(dataset, bestfeat, value), sublabels)
    return mytree
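# Building the tree on the toy data (usage sketch). createtree deletes entries
# from the labels list it receives, so pass a copy if you still need the list.
mytree = createtree(mydata, mylabels[:])
print(mytree)  # {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}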
def classify(inputtree, featlabels, testvec):
    """Walk the decision tree to classify one sample, mapping label strings to indices.

    inputtree:  the decision tree built by createtree()
    featlabels: the feature labels, e.g. ['no surfacing', 'flippers']
    testvec:    the feature vector to classify, e.g. [1, 0]
    """
    firstsides = list(inputtree.keys())
    firststr = firstsides[0]          # label of the feature tested at this node
    seconddict = inputtree[firststr]  # subtree keyed by that feature's values
    # featlabels.index() finds the position of firststr in featlabels, i.e. the
    # column of testvec that describes this feature: the order of the labels
    # must match the order of the feature values in testvec. String-valued
    # features work the same way, e.g.:
    #   labels = "色澤,根蒂,敲聲,紋理,臍部,觸感".split(',')
    #   print(classify(mytree, labels, "青綠,蜷縮,沉悶,稍糊,稍凹,軟粘".split(',')))
    featindex = featlabels.index(firststr)
    for key in seconddict.keys():
        if testvec[featindex] == key:  # follow the branch matching the sample's value
            if type(seconddict[key]).__name__ == 'dict':
                classlabel = classify(seconddict[key], featlabels, testvec)  # internal node: recurse
            else:
                classlabel = seconddict[key]  # leaf node: its value is the class label
    return classlabel
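# Classifying two samples with the tree built above (usage sketch). mylabels is
# still intact here because createtree received a copy.
print(classify(mytree, mylabels, [1, 0]))  # 'no'
print(classify(mytree, mylabels, [1, 1]))  # 'yes'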
# Ideally, each classification reuses an already-built decision tree instead of
# rebuilding it. Python's pickle module serializes objects: pickle.dump() saves
# an object to disk and pickle.load() reads it back. Any object can be
# serialized this way, including the dictionary that stores our tree.
def storetree(inputtree, filename):
    """Serialize the tree into the file `filename` with pickle."""
    import pickle
    fw = open(filename, 'wb')   # open filename in binary write mode
    pickle.dump(inputtree, fw)  # write the tree object into the file
    fw.close()

def gradtree(filename):
    """Load a previously stored tree back from `filename`."""
    import pickle
    fr = open(filename, 'rb')  # open the file the tree was stored in
    return pickle.load(fr)     # unpickle and return the tree object
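# Round-tripping the tree through disk (a sketch; 'classifiertree.pkl' is just
# an illustrative filename).
storetree(mytree, 'classifiertree.pkl')
print(gradtree('classifiertree.pkl'))  # same nested dict as mytree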