from math import log
import operator
# Compute the Shannon entropy of the dataset
def calcshannonent(dataset):
    numentries = len(dataset)
    labelcounts = {}
    # Count how often each class label (last column) occurs
    for featvec in dataset:
        currentlabel = featvec[-1]
        if currentlabel not in labelcounts:
            labelcounts[currentlabel] = 0
        labelcounts[currentlabel] += 1
    shannonent = 0.0
    for key in labelcounts:
        prob = float(labelcounts[key]) / numentries
        shannonent -= prob * log(prob, 2)
    return shannonent
# Create the sample dataset
def createdataset():
    dataset = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataset, labels
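# A minimal usage sketch (the _demo_* helpers are added here for illustration
# and are not part of the original post). With 2 'yes' and 3 'no' labels, the
# entropy is -(2/5)*log2(2/5) - (3/5)*log2(3/5), roughly 0.9710.
def _demo_entropy():
    mydat, mylabels = createdataset()
    return calcshannonent(mydat)  # expected: about 0.9710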
# Split the dataset on the given feature (axis) and value
def splitdataset(dataset, axis, value):
    retdataset = []
    for featvec in dataset:
        if featvec[axis] == value:
            # Drop the feature column used for the split
            reducedfeatvec = featvec[:axis]
            reducedfeatvec.extend(featvec[axis + 1:])
            retdataset.append(reducedfeatvec)
    return retdataset
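# Illustrative sketch (added, not in the original): splitting the sample set
# on feature 0 with value 1 keeps the rows whose first column is 1 and strips
# that column from each surviving row.
def _demo_split():
    mydat, _ = createdataset()
    return splitdataset(mydat, 0, 1)  # expected: [[1, 'yes'], [1, 'yes'], [0, 'no']]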
# Choose the best feature to split the dataset on (maximum information gain)
def choosebestfeaturetosplit(dataset):
    numfeatures = len(dataset[0]) - 1  # the last column is the class label
    baseentropy = calcshannonent(dataset)
    bestinfogain = 0.0
    bestfeature = -1
    for i in range(numfeatures):
        featlist = [example[i] for example in dataset]
        uniquevals = set(featlist)
        newentropy = 0.0
        # Weighted average entropy of the subsets produced by feature i
        for value in uniquevals:
            subdataset = splitdataset(dataset, i, value)
            prob = len(subdataset) / float(len(dataset))
            newentropy += prob * calcshannonent(subdataset)
        infogain = baseentropy - newentropy
        if infogain > bestinfogain:
            bestinfogain = infogain
            bestfeature = i
    return bestfeature
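# Worked example (added for clarity): on the sample set the base entropy is
# about 0.9710; splitting on feature 0 ('no surfacing') leaves a weighted
# entropy of about 0.5510 (gain 0.4200), while feature 1 ('flippers') leaves
# 0.8000 (gain 0.1710), so index 0 is chosen.
def _demo_best_feature():
    mydat, _ = createdataset()
    return choosebestfeaturetosplit(mydat)  # expected: 0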
# Majority vote to decide the class of a leaf node
def majoritycnt(classlist):
    classcount = {}
    for vote in classlist:
        if vote not in classcount:
            classcount[vote] = 0
        classcount[vote] += 1
    sortedclasscount = sorted(classcount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedclasscount[0][0]
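# Illustrative sketch (added): the most frequent label in a mixed leaf wins.
def _demo_majority():
    return majoritycnt(['yes', 'no', 'no'])  # expected: 'no'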
# Build the tree structure recursively
def createtree(dataset, labels):
    classlist = [example[-1] for example in dataset]
    # Stop if every example has the same class
    if classlist.count(classlist[0]) == len(classlist):
        return classlist[0]
    # Stop if no features remain; fall back to a majority vote
    if len(dataset[0]) == 1:
        return majoritycnt(classlist)
    bestfeat = choosebestfeaturetosplit(dataset)
    bestfeatlabel = labels[bestfeat]
    mytree = {bestfeatlabel: {}}
    del labels[bestfeat]
    featvalues = [example[bestfeat] for example in dataset]
    uniquevals = set(featvalues)
    for value in uniquevals:
        sublabels = labels[:]  # copy so the recursion does not clobber this level's list
        mytree[bestfeatlabel][value] = createtree(splitdataset(dataset, bestfeat, value), sublabels)
    return mytree
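# Illustrative sketch (added): building the full tree from the sample data.
# Note that createtree deletes entries from the labels list it receives, so a
# copy is passed to keep the original list intact for classify() later.
def _demo_build_tree():
    mydat, mylabels = createdataset()
    tree = createtree(mydat, mylabels[:])
    # expected: {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
    return tree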
# Classify a test vector by walking the tree
def classify(inputtree, featlabels, testvec):
    firststr = list(inputtree.keys())[0]
    seconddict = inputtree[firststr]
    featindex = featlabels.index(firststr)
    for key in seconddict.keys():
        if testvec[featindex] == key:
            if isinstance(seconddict[key], dict):
                classlabel = classify(seconddict[key], featlabels, testvec)
            else:
                classlabel = seconddict[key]
    return classlabel
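# Illustrative sketch (added): classifying test vectors against the sample
# tree. The untouched mylabels copy supplies the feature-name ordering that
# classify() needs to locate each feature in the test vector.
def _demo_classify():
    mydat, mylabels = createdataset()
    tree = createtree(mydat, mylabels[:])
    # expected: ('no', 'yes')
    return classify(tree, mylabels, [1, 0]), classify(tree, mylabels, [1, 1])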
# Serialize the tree structure
def storetree(inputtree, filename):
    import pickle
    with open(filename, 'wb') as fw:
        pickle.dump(inputtree, fw)
# Deserialize the tree structure
def grabtree(filename):
    import pickle
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
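# Illustrative sketch (added): round-tripping a tree through pickle. The
# filename is arbitrary and exists only for this demo.
def _demo_persist():
    mydat, mylabels = createdataset()
    tree = createtree(mydat, mylabels[:])
    storetree(tree, 'classifierstorage.pkl')
    return grabtree('classifierstorage.pkl') == tree  # expected: True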