from math import log
import operator
import csv
def readdataset(filename):
    '''Read the dataset.
    :param filename: name of the data file, in CSV format
    :return: the data rows and the feature labels, both as lists
    '''
    with open(filename) as f:
        reader = csv.reader(f)
        header_row = next(reader)
        labels = header_row[1:7]   # the six feature names (skip the ID column)
        dataset = []
        for line in reader:
            tempvect = line[1:8]   # the six feature values plus the class label in the last column
            dataset.append(tempvect)
    return dataset, labels
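For reference, a minimal sketch of how this loader could be exercised, assuming the watermelon 2.0 CSV layout with an ID column first and the class label last (the file name here is hypothetical):

    # assumes an 8-column CSV: 編號,色澤,根蒂,敲聲,紋理,臍部,觸感,好瓜
    dataset, labels = readdataset('xigua2.csv')   # hypothetical file name
    print(labels)      # ['色澤', '根蒂', '敲聲', '紋理', '臍部', '觸感']
    print(dataset[0])  # e.g. ['青綠', '蜷縮', '濁響', '清晰', '凹陷', '硬滑', '是']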
def infoent(dataset):
    '''Compute the information entropy (Shannon entropy) of a dataset.
    :param dataset: input dataset; each row ends with its class label
    :return: the information entropy
    '''
    numdata = len(dataset)
    labels = {}
    for featvec in dataset:
        label = featvec[-1]
        if label not in labels:
            labels[label] = 0
        labels[label] += 1
    infoent = 0.0
    for lab in labels:
        prop = float(labels[lab]) / numdata
        infoent -= prop * log(prop, 2)
    return infoent
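As a quick sanity check, here is a hand-worked call on made-up rows: with two '是' and one '否', Ent = -(2/3)·log2(2/3) - (1/3)·log2(1/3) ≈ 0.9183.

    toy = [['青綠', '是'], ['烏黑', '是'], ['淺白', '否']]  # made-up rows; last column is the class
    print(infoent(toy))  # ≈ 0.9183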
def splitdataset(dataset, axis, value):
    '''Split the dataset on one feature value.
    :param dataset: dataset
    :param axis: index of the splitting attribute
    :param value: value of the splitting attribute to keep
    :return: the matching rows, with the splitting attribute column removed
    '''
    restdataset = []
    for featvec in dataset:
        if featvec[axis] == value:
            restfeatvec = featvec[:axis]
            restfeatvec.extend(featvec[axis + 1:])
            restdataset.append(restfeatvec)
    return restdataset
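A short usage sketch with made-up rows: splitting on attribute 0 with value '青綠' keeps only the matching rows and deletes that column.

    toy = [['青綠', '蜷縮', '是'], ['烏黑', '蜷縮', '是'], ['青綠', '硬挺', '否']]
    print(splitdataset(toy, 0, '青綠'))  # [['蜷縮', '是'], ['硬挺', '否']]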
def bestfeaturesplit(dataset):
    '''Choose the best attribute to split on, by information gain.
    :param dataset: the dataset to be split
    :return: the index of the attribute with the largest information gain
    '''
    numfeature = len(dataset[0]) - 1      # the last column is the class label
    baseinfoent = infoent(dataset)
    bestinfogain = 0.0
    bestfeature = -1
    for i in range(numfeature):
        featlist = [example[i] for example in dataset]
        uniquevalue = set(featlist)
        newent = 0.0
        for value in uniquevalue:
            subdataset = splitdataset(dataset, i, value)
            prop = len(subdataset) / float(len(dataset))
            newent += prop * infoent(subdataset)
        infogain = baseinfoent - newent   # entropy before minus weighted entropy after
        if infogain > bestinfogain:
            bestinfogain = infogain
            bestfeature = i
    return bestfeature
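The quantity computed in the inner loop is the information gain from the ID3 literature, Gain(D, a) = Ent(D) − Σ_v (|D_v|/|D|)·Ent(D_v). On the same made-up rows, splitting on the second attribute separates the classes perfectly, so it wins:

    toy = [['青綠', '蜷縮', '是'], ['烏黑', '蜷縮', '是'], ['青綠', '硬挺', '否']]
    print(bestfeaturesplit(toy))  # 1: gain ≈ 0.9183 for column 1 vs ≈ 0.2516 for column 0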
def majorclass(classlist):
    '''Majority vote over the class labels on a leaf node.
    :param classlist: class labels of the samples on the leaf
    :return: the most frequent class label
    '''
    classcount = {}
    for vote in classlist:
        if vote not in classcount:
            classcount[vote] = 0
        classcount[vote] += 1
    sortedclasscount = sorted(classcount.items(), key=operator.itemgetter(1), reverse=True)  # list of (label, count) pairs
    return sortedclasscount[0][0]
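A one-line check on made-up labels:

    print(majorclass(['是', '否', '是', '是', '否']))  # '是'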
def createtree(dataset, labels, datasetfull, labelsfull):
    '''Build the decision tree recursively.
    :param dataset: list of data rows
    :param labels: list of attribute labels
    :param datasetfull: the full, unsplit dataset, passed through unchanged
    :param labelsfull: the full attribute label list, passed through unchanged
    :return: the decision tree as a nested dict
    '''
    classlist = [example[-1] for example in dataset]
    if classlist.count(classlist[0]) == len(classlist):
        return classlist[0]               # all samples share one class: leaf node
    if len(dataset[0]) == 1:
        return majorclass(classlist)      # no attributes left: majority vote
    bestfeat = bestfeaturesplit(dataset)
    bestfeatlabel = labels[bestfeat]
    mytree = {bestfeatlabel: {}}
    del labels[bestfeat]
    featvalues = [example[bestfeat] for example in dataset]
    uniqueval = set(featvalues)
    # Collect every value this attribute takes in the full dataset,
    # so values missing from the current subset are not dropped.
    bestfeatindex = labelsfull.index(bestfeatlabel)
    featvaluesfull = [example[bestfeatindex] for example in datasetfull]
    uniquevalfull = set(featvaluesfull)
    if uniqueval == uniquevalfull:
        for value in uniqueval:
            sublabels = labels[:]  # copy: the labels are still needed when the recursion unwinds
            mytree[bestfeatlabel][value] = createtree(splitdataset(dataset, bestfeat, value),
                                                      sublabels, datasetfull, labelsfull)
    else:
        for value in uniqueval:
            sublabels = labels[:]  # copy: the labels are still needed when the recursion unwinds
            mytree[bestfeatlabel][value] = createtree(splitdataset(dataset, bestfeat, value),
                                                      sublabels, datasetfull, labelsfull)
            uniquevalfull.remove(value)
        for value in uniquevalfull:
            # Values never seen in the current subset become leaves
            # labelled with the majority class of this node.
            mytree[bestfeatlabel][value] = majorclass(classlist)
    return mytree
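The returned tree is a plain nested dict, so a new sample can be classified by walking it. The helper below is not part of the original post; it is a minimal sketch assuming each internal node is a one-key dict mapping an attribute name to its branches:

    def classify(tree, featlabels, testvec):
        '''Walk the nested-dict tree to predict the class of one sample.
        Hypothetical helper, not in the original code.'''
        if not isinstance(tree, dict):
            return tree                   # reached a leaf: it is the class label
        featlabel = next(iter(tree))      # the attribute tested at this node
        featindex = featlabels.index(featlabel)
        branch = tree[featlabel][testvec[featindex]]
        return classify(branch, featlabels, testvec)

Because createtree fills in a branch for every value that appears in the full dataset, this lookup cannot miss a branch for any attribute value that occurs in the training data.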
if __name__ == '__main__':
    filename = 'c:\\users\\14399\\desktop\\西瓜2.0.csv'
    dataset, labels = readdataset(filename)
    datasetfull = dataset[:]
    labelsfull = labels[:]
    mytree = createtree(dataset, labels, datasetfull, labelsfull)
    print(mytree)
Generated decision tree: the program prints a nested dict (the braces were garbled in the original output); the surviving branches include '模糊': '否', '蜷縮': '是', '青綠': '是' and '淺白': '是'.