Import the required Python library
from math import log
Create the dataset
def creatdata():
    # age: 0-young, 1-middle-aged, 2-older
    # work: 0-no, 1-yes
    # house: 0-no, 1-yes
    # loan_credit: 0-normal, 1-good, 2-great
    # loan (class label): no / yes
    data = [[0, 0, 0, 0, 'no'],
            [0, 0, 0, 1, 'no'],
            [0, 1, 0, 1, 'yes'],
            [0, 1, 1, 0, 'yes'],
            [0, 0, 0, 0, 'no'],
            [1, 0, 0, 0, 'no'],
            [1, 0, 0, 1, 'no'],
            [1, 1, 1, 1, 'yes'],
            [1, 0, 1, 2, 'yes'],
            [1, 0, 1, 2, 'yes'],
            [2, 0, 1, 2, 'yes'],
            [2, 0, 1, 1, 'yes'],
            [2, 1, 0, 1, 'yes'],
            [2, 1, 0, 2, 'yes'],
            [2, 0, 0, 0, 'no']]
    labels = ['age', 'work', 'house', 'loan']
    return data, labels
Compute the information entropy (Shannon entropy)
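With p_k the proportion of samples belonging to class k, the entropy of a dataset D is H(D) = -Σ_k p_k · log2(p_k); the function below computes exactly this from the class-label counts.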
def calculate_shannonent(data):
    num = len(data)
    label_num = {}
    # count the occurrences of each class label
    for feature in data:
        currentlabel = feature[-1]  # the class label is the last column
        if currentlabel not in label_num.keys():
            label_num[currentlabel] = 0  # initialize the counter for a new class
        label_num[currentlabel] += 1  # count this class occurrence
    # accumulate the Shannon entropy
    shannonent = 0.0
    for key in label_num:
        prob = float(label_num[key]) / num
        shannonent -= prob * log(prob, 2)
    return shannonent
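As a quick sanity check (an addition, not part of the original script): the loan data has 9 'yes' and 6 'no' samples, so H(D) = -(9/15)·log2(9/15) - (6/15)·log2(6/15) ≈ 0.971, which the function reproduces:

data, labels = creatdata()
print(calculate_shannonent(data))  # ≈ 0.971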
Split the dataset on a given feature value
def splitdataset(dataset, axis, value):
    retdataset = []
    for featvec in dataset:
        if featvec[axis] == value:
            # keep the sample but remove the column at `axis`
            reducedfeatvec = featvec[:axis]
            reducedfeatvec.extend(featvec[axis+1:])
            retdataset.append(reducedfeatvec)
    return retdataset
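As a small usage sketch (not in the original script), splitting on feature 0 (age) with value 0 keeps the five 'young' rows and drops the age column:

data, labels = creatdata()
print(splitdataset(data, 0, 0))
# [[0, 0, 0, 'no'], [0, 0, 1, 'no'], [1, 0, 1, 'yes'], [1, 1, 0, 'yes'], [0, 0, 0, 'no']]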
Find the best feature by comparing information gain
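For a feature A that splits D into subsets D_v, one per distinct value v, the information gain is Gain(D, A) = H(D) - Σ_v (|D_v|/|D|) · H(D_v); the function below evaluates this for every feature and returns the index with the largest gain.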
def bestfeature(data):
    # number of features (the last column is the class label)
    num_feature = len(data[0]) - 1
    # entropy of the whole dataset
    baseentropy = calculate_shannonent(data)
    # initialize the best information gain and the best feature index
    bestinfogain = 0.0
    best_feature = -1
    for i in range(num_feature):
        # all values in column i
        featlist = [example[i] for example in data]
        # the unique values of the current feature
        uniquevals = set(featlist)
        # conditional entropy after splitting on the current feature
        newentropy = 0.0
        for value in uniquevals:
            sub_data = splitdataset(data, i, value)
            prob = len(sub_data) / float(len(data))
            newentropy += prob * calculate_shannonent(sub_data)
        infogain = baseentropy - newentropy
        print('the %d infogain is %.3f' % (i, infogain))
        if infogain > bestinfogain:
            bestinfogain = infogain
            best_feature = i
    return best_feature
The main function prints the index of the best feature
if __name__ == '__main__':
    data, features = creatdata()
    print('the best index is', bestfeature(data))
Output:
the 0 infogain is 0.083
the 1 infogain is 0.324
the 2 infogain is 0.420
the 3 infogain is 0.363
the best index is 2
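The winning index 2 is the house feature, and its 0.420 gain can be checked by hand: the 6 house owners are all 'yes' (entropy 0), while the 9 non-owners split 3 'yes' / 6 'no' (entropy ≈ 0.918), so Gain = 0.971 - (6/15)·0 - (9/15)·0.918 ≈ 0.420. A short verification sketch (not part of the original script):

data, labels = creatdata()
sub1 = splitdataset(data, 2, 1)  # 6 rows, all 'yes' -> entropy 0
sub0 = splitdataset(data, 2, 0)  # 9 rows, 3 'yes' / 6 'no' -> entropy ≈ 0.918
gain = (calculate_shannonent(data)
        - 6/15.0 * calculate_shannonent(sub1)
        - 9/15.0 * calculate_shannonent(sub0))
print('%.3f' % gain)  # 0.420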