Machine Learning in Action: Decision Trees


import operator
from math import log

def calcshannonent(dataset):
    # Count how many samples fall into each class; the class label
    # is the last element of every feature vector.
    numentries = len(dataset)
    labelcounts = {}
    for featvec in dataset:
        currentlabel = featvec[-1]
        if currentlabel not in labelcounts:
            labelcounts[currentlabel] = 0
        labelcounts[currentlabel] += 1
    # Accumulate -p * log2(p) over all classes.
    shannonent = 0.0
    for key in labelcounts:
        prob = float(labelcounts[key]) / numentries
        shannonent -= prob * log(prob, 2)
    return shannonent
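For reference, the quantity this function computes is the Shannon entropy of the class distribution, where p_k is the fraction of samples belonging to class k:

    H(D) = -\sum_k p_k \log_2 p_k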

def createdataset():
    # Toy "is it a fish?" dataset from the book: feature 0 is
    # "can survive without surfacing", feature 1 is "has flippers".
    dataset = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataset, labels
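A quick sanity check (this interactive snippet is a sketch, not part of the original listing): with two 'yes' and three 'no' labels, the entropy is -(2/5)*log2(2/5) - (3/5)*log2(3/5), roughly 0.971.

    mydat, labels = createdataset()
    print(calcshannonent(mydat))   # ~0.9710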

def splitdataset(dataset, axis, value):
    # Build a new list instead of mutating dataset: Python passes the
    # list by reference, so modifying it here would affect the caller.
    redataset = []
    for featvec in dataset:
        if featvec[axis] == value:
            # Keep the matching row, but cut out the column we split on.
            reducefeatvec = featvec[:axis]
            reducefeatvec.extend(featvec[axis+1:])
            redataset.append(reducefeatvec)
    return redataset
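For example (a sketch using the toy dataset above): splitting on feature 0 with value 1 keeps the three rows whose first entry is 1 and drops that column.

    mydat, labels = createdataset()
    print(splitdataset(mydat, 0, 1))   # [[1, 'yes'], [1, 'yes'], [0, 'no']]
    print(splitdataset(mydat, 0, 0))   # [[1, 'no'], [1, 'no']]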

def choosebestfeaturetosplit(dataset):
    numfeatures = len(dataset[0]) - 1     # the last column is the class label
    baseentropy = calcshannonent(dataset)
    bestinfogain = 0.0
    bestfeature = -1
    for i in range(numfeatures):
        # Collect the distinct values this feature takes.
        featlist = [example[i] for example in dataset]
        uniquevals = set(featlist)
        # Weighted entropy of the subsets produced by splitting on feature i.
        newentropy = 0.0
        for value in uniquevals:
            subdataset = splitdataset(dataset, i, value)
            prob = len(subdataset) / float(len(dataset))
            newentropy += prob * calcshannonent(subdataset)
        # Information gain is the entropy reduction achieved by this split.
        infogain = baseentropy - newentropy
        if infogain > bestinfogain:
            bestinfogain = infogain
            bestfeature = i
    return bestfeature
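The selection criterion is the ID3 information gain: for a split on feature a with values v,

    Gain(D, a) = H(D) - \sum_v \frac{|D_v|}{|D|} H(D_v)

On the toy dataset this picks feature 0 (gain ~0.420 versus ~0.171 for feature 1); the snippet below is a sketch, not from the original post.

    mydat, labels = createdataset()
    print(choosebestfeaturetosplit(mydat))   # 0, i.e. 'no surfacing'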

def majoritycnt(classlist):
    # Return the most common class label, used when the features run out
    # but the leaf is still impure.
    classcount = {}
    for vote in classlist:
        if vote not in classcount:
            classcount[vote] = 0
        classcount[vote] += 1
    sortedclasscount = sorted(classcount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedclasscount[0][0]
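A one-line illustration (hypothetical input, not from the post):

    print(majoritycnt(['yes', 'no', 'no']))   # 'no', the most frequent label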

def createtree(dataset, labels):
    classlist = [example[-1] for example in dataset]
    # Stop if every sample in this branch has the same class...
    if classlist.count(classlist[0]) == len(classlist):
        return classlist[0]
    # ...or if only the class column is left: fall back to a majority vote.
    if len(dataset[0]) == 1:
        return majoritycnt(classlist)
    bestfeat = choosebestfeaturetosplit(dataset)
    bestfeatlabel = labels[bestfeat]
    mytree = {bestfeatlabel: {}}   # initialise mytree; the result is stored as a nested dict
    del(labels[bestfeat])
    featvalues = [example[bestfeat] for example in dataset]
    uniquevals = set(featvalues)
    for value in uniquevals:
        # Copy the label list (with the best label already removed) so the
        # recursive calls do not clobber it, then recurse on each subset.
        sublabels = labels[:]
        mytree[bestfeatlabel][value] = createtree(splitdataset(dataset, bestfeat, value), sublabels)
    return mytree
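On the toy dataset this recursion yields the nested dict below. Note that createtree deletes entries from labels as it works, so pass in a copy if the list is needed afterwards (the [:] copy here is my addition, not in the original post):

    mydat, labels = createdataset()
    mytree = createtree(mydat, labels[:])
    print(mytree)
    # {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}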

def classify(inputtree, featlabels, testvec):
    # Walks the tree, mapping label strings back to feature indices.
    # inputtree:  the decision-tree dict produced by createtree
    # featlabels: the list of feature labels, e.g. ['no surfacing', 'flippers']
    # testvec:    the feature vector to classify, e.g. [1, 0] -- follow the
    #             branch where the first feature is 1, then the leaf where
    #             the second feature is 0
    # A usage example with a different (watermelon) dataset:
    #   labels = "色澤,根蒂,敲聲,紋理,臍部,觸感".split(',')
    #   print(classify(mytree, labels, "青綠,蜷縮,沉悶,稍糊,稍凹,軟粘".split(',')))
    fistsides = list(inputtree.keys())
    firststr = fistsides[0]   # in Python 3, keys() is a view, hence the list() wrapper
    seconddict = inputtree[firststr]
    # .index() finds where firststr sits in featlabels, e.g. the index of
    # "no surfacing". That index is also the position in testvec holding the
    # value of this feature: label order and testvec order correspond.
    featindex = featlabels.index(firststr)
    for key in seconddict.keys():           # the feature values under this label
        if testvec[featindex] == key:       # the tree branches on feature values
            if type(seconddict[key]).__name__ == 'dict':
                # Still an internal node: recurse into the subtree.
                classlabel = classify(seconddict[key], featlabels, testvec)
            else:
                # A leaf: this is the predicted class.
                classlabel = seconddict[key]
    return classlabel
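Putting it together (a sketch; this exact session is not in the original post): [1, 0] walks the 'no surfacing' = 1 branch, then the 'flippers' = 0 leaf.

    mydat, labels = createdataset()
    mytree = createtree(mydat, labels[:])     # copy: createtree mutates its labels argument
    print(classify(mytree, labels, [1, 0]))   # 'no'
    print(classify(mytree, labels, [1, 1]))   # 'yes'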

# Ideally the already-built decision tree is reused on every classification
# run instead of being rebuilt each time. Python's pickle module serialises
# objects for exactly this: pickle.dump() stores an object on disk and
# pickle.load() reads it back when needed. Any object can be serialised,
# dict objects included.

def storetree(inputtree, filename):
    # filename names the file the tree is saved into
    import pickle
    fw = open(filename, 'wb')     # open filename in binary write mode
    pickle.dump(inputtree, fw)    # write the tree inputtree into the file
    fw.close()                    # close the file

def gradtree(filename):
    import pickle
    fr = open(filename, 'rb')     # open the file the tree was stored in
    return pickle.load(fr)        # read the object back; this step is called unpickling
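A round trip looks like this (the filename is an arbitrary example):

    storetree(mytree, 'classifierstorage.txt')
    print(gradtree('classifierstorage.txt'))   # the same nested dict as mytree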
