##### Compute the Shannon entropy
##### Split the dataset and choose the best feature
from math import log
def calcshannonent(dataset):
    """Return the Shannon entropy of *dataset*.

    Each record in *dataset* is a list whose last element is the class
    label.  The entropy is -sum(p * log2(p)) over the distribution of
    those labels.
    """
    num = len(dataset)
    # Tally how many records carry each class label (last column).
    labelcounts = {}
    for featvec in dataset:
        currentlabel = featvec[-1]
        # dict.get replaces the original "if key not in dict: init to 0" two-step.
        labelcounts[currentlabel] = labelcounts.get(currentlabel, 0) + 1
    shannonent = 0.0
    for key in labelcounts:
        prob = float(labelcounts[key] / num)
        shannonent -= prob * log(prob, 2)
    return shannonent
def creatdataset():
    """Return the toy fish-classification dataset and its feature labels.

    Each record is [no-surfacing?, flippers?, class-label]; the last
    column is the class ('yes' = is a fish).
    """
    dataset = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    # NOTE(review): the scraped source shows the censored string
    # 'no su***cing'; the canonical dataset label is 'no surfacing',
    # restored here.
    labels = ['no surfacing', 'flippers']
    return dataset, labels
# To keep the original data intact, build a brand-new list for the result
def splitdataset(dataset, axis, value):
    """Return the records of *dataset* whose feature at *axis* equals
    *value*, with that feature column removed.

    dataset -- list of records to split
    axis    -- index of the feature to split on
    value   -- feature value a record must have to be kept

    A new list is built (and each kept record is re-assembled from
    slices) so the caller's *dataset* is never mutated.
    """
    retdataset = []
    for featvec in dataset:
        if featvec[axis] == value:
            # Everything before axis, then everything after it.
            reducedfeatvec = featvec[:axis]
            reducedfeatvec.extend(featvec[axis + 1:])
            retdataset.append(reducedfeatvec)
    return retdataset
def choosebestfeaturetosplit(dataset):
    """Return the index of the feature with the highest information gain
    (ID3 criterion), or -1 if no split improves on the base entropy.

    Prints diagnostic output along the way (kept from the original).
    """
    numfeature = len(dataset[0]) - 1  # last column is the class label
    print("numberfea is", numfeature)
    baseentropy = calcshannonent(dataset)
    bestinfogain = 0
    bestfeature = -1  # indices start from 0; -1 means "no feature chosen"
    for i in range(numfeature):
        featlist = [example[i] for example in dataset]
        print("example[i] is", [example[i] for example in dataset])
        uniquevals = set(featlist)  # deduplicate the feature's values
        print("uniquevals is", uniquevals)
        # Weighted entropy of the partition induced by feature i.
        newentropy = 0
        for value in uniquevals:
            subdataset = splitdataset(dataset, i, value)
            print("subdataset is", subdataset)
            prob = float(len(subdataset) / len(dataset))
            newentropy += prob * calcshannonent(subdataset)
        infogain = baseentropy - newentropy
        print("infogain is", infogain)
        if infogain > bestinfogain:
            bestinfogain = infogain
            bestfeature = i
    return bestfeature
# Demo: build the toy dataset and report the best feature to split on
# (prints 0 — the 'no surfacing' feature has the higher information gain).
mydat, labels = creatdataset()
print(choosebestfeaturetosplit(mydat))
output:
numberfea is 2
example[i] is [1,
1,1,
0,0]
uniquevals is {0, 1}
subdataset is [[1
,'no'],
[1,'no']]
subdataset is [[1
,'yes'],
[1,'yes'],
[0,'no']]
infogain is 0.4199730940219749
example[i] is [1,
1,0,
1,1]
uniquevals is {0, 1}
subdataset is [[1
,'no']]
subdataset is [[1
,'yes'],
[1,'yes'],
[0,'no'],
[0,'no']]
infogain is 0.17095059445466854
0
機器學習實戰 決策樹
決策樹 2 python語言在函式中傳遞的是列表的引用,在函式內部對列表物件的修改,將會影響該列表物件的整個生存週期。為了消除這個不良影響,我們需要在函式的開始宣告乙個新列表物件。在本節中,指的是在劃分資料集函式中,傳遞的引數dataset列表的引用,為了不影響dataset我們重新宣告了乙個ret...
機器學習實戰決策樹
這幾天一直在學習機器學習實戰python 實現,在程式清單的3 6 獲取及誒單數程式,書上的程式是這樣的 def getnumleafs mytree numleafs 0.0 firststr list dict.keys mytree 0 seconddict mytree firststr p...
機器學習實戰 決策樹
class sklearn.tree.decisiontreeclassifier criterion gini splitter best max depth none,min samples split 2,min samples leaf 1,min weight fraction leaf ...