# coding=utf-8
from math import log
import operator

# Build the sample dataset: each row is [no surfacing, flippers, class label]
def createdataset():
    dataset = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
    features = ['no surfacing', 'flippers']
    return dataset, features
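For reference, the five training samples laid out as a table (two binary features plus the class label):

no surfacing    flippers    label
1               1           yes
1               1           yes
1               0           no
0               1           no
0               1           no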
# Recursively build the decision tree, stored as nested dicts
def treegrowth(dataset, features):
    classlist = [example[-1] for example in dataset]
    if classlist.count(classlist[0]) == len(classlist):
        return classlist[0]  # all samples share one class: make a leaf
    if len(dataset[0]) == 1:  # no more features
        return classify(classlist)
    bestfeat = findbestsplit(dataset)  # bestfeat is the index of the best feature
    bestfeatlabel = features[bestfeat]
    mytree = {bestfeatlabel: {}}
    featvalues = [example[bestfeat] for example in dataset]
    uniquefeatvalues = set(featvalues)
    del features[bestfeat]
    for values in uniquefeatvalues:
        subdataset = splitdataset(dataset, bestfeat, values)
        # pass a copy so sibling branches see the same remaining feature list
        mytree[bestfeatlabel][values] = treegrowth(subdataset, features[:])
    return mytree
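Assuming the helpers defined further down (findbestsplit, splitdataset, classify, calcshannonent), running the builder on the sample data should produce the nested dict below; 'no surfacing' wins the first split because its information gain is larger (about 0.42 versus 0.17 for 'flippers'):

>>> dataset, features = createdataset()
>>> treegrowth(dataset, features)
{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}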
# When no features remain but the leftover samples are not all of the same
# class, fall back to the class that appears most often
def classify(classlist):
    classcount = {}
    for vote in classlist:
        if vote not in classcount:
            classcount[vote] = 0
        classcount[vote] += 1
    sortedclasscount = sorted(classcount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedclasscount[0][0]
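The same majority vote can be written more compactly with the standard library; a minimal equivalent sketch (classify_counter is a name introduced here for illustration, not part of the original script):

from collections import Counter

def classify_counter(classlist):
    # most_common(1) returns [(label, count)] for the most frequent label
    return Counter(classlist).most_common(1)[0][0]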
# Find the best attribute to split on (try every attribute, compute its information gain)
def findbestsplit(dataset):
    numfeatures = len(dataset[0]) - 1
    baseentropy = calcshannonent(dataset)
    bestinfogain = 0.0
    bestfeat = -1
    for i in range(numfeatures):
        featvalues = [example[i] for example in dataset]
        uniquefeatvalues = set(featvalues)
        newentropy = 0.0
        for val in uniquefeatvalues:
            subdataset = splitdataset(dataset, i, val)
            prob = len(subdataset) / float(len(dataset))
            newentropy += prob * calcshannonent(subdataset)
        if (baseentropy - newentropy) > bestinfogain:
            bestinfogain = baseentropy - newentropy
            bestfeat = i
    return bestfeat
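Working the numbers on the sample data: the base entropy is about 0.971. Splitting on feature 0 ('no surfacing') leaves a weighted entropy of (3/5)*0.918 ≈ 0.551, a gain of about 0.420; splitting on feature 1 ('flippers') leaves (4/5)*1.0 = 0.8, a gain of only about 0.171. The function should therefore return index 0:

>>> findbestsplit(createdataset()[0])
0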
# After the split attribute is chosen, split the dataset on it
def splitdataset(dataset, feat, values):
    retdataset = []
    for featvec in dataset:
        if featvec[feat] == values:
            reducedfeatvec = featvec[:feat]
            reducedfeatvec.extend(featvec[feat + 1:])
            retdataset.append(reducedfeatvec)  # keep the row, minus the used column
    return retdataset
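A quick check of the splitter: keeping the rows whose feature 0 equals 1 and dropping that column gives

>>> dataset, features = createdataset()
>>> splitdataset(dataset, 0, 1)
[[1, 'yes'], [1, 'yes'], [0, 'no']]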
# Compute the Shannon entropy of a dataset
def calcshannonent(dataset):
    numentries = len(dataset)
    labelcounts = {}
    for featvec in dataset:
        currentlabel = featvec[-1]
        if currentlabel not in labelcounts:
            labelcounts[currentlabel] = 0
        labelcounts[currentlabel] += 1
    shannonent = 0.0
    for key in labelcounts:
        prob = float(labelcounts[key]) / numentries
        if prob != 0:
            shannonent -= prob * log(prob, 2)
    return shannonent
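On the full sample set (2 'yes' against 3 'no') this works out to -(2/5)log2(2/5) - (3/5)log2(3/5) ≈ 0.971, which is easy to confirm:

>>> calcshannonent(createdataset()[0])
0.9709505944546686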
# Classify new data by walking down the constructed decision tree
def predict(tree, newobject):
    while isinstance(tree, dict):
        key = next(iter(tree))  # the feature name stored at this node
        tree = tree[key][newobject[key]]
    return tree
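predict simply descends the nested dict: at each internal node it reads the stored feature name, looks up the new sample's value for that feature, and follows the matching branch until it reaches a string leaf. For example (the sample values here are illustrative):

>>> tree = treegrowth(*createdataset())
>>> predict(tree, {'no surfacing': 1, 'flippers': 0})
'no'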
if __name__ == '__main__':
    dataset, features = createdataset()
    tree = treegrowth(dataset, features)
    print(tree)
    # The original test arguments were lost; the four inputs below are
    # illustrative, covering every combination of the two binary features.
    print(predict(tree, {'no surfacing': 0, 'flippers': 0}))  # 'no'
    print(predict(tree, {'no surfacing': 0, 'flippers': 1}))  # 'no'
    print(predict(tree, {'no surfacing': 1, 'flippers': 0}))  # 'no'
    print(predict(tree, {'no surfacing': 1, 'flippers': 1}))  # 'yes'