import operator
from collections import Counter
from math import log
def createdateset():
    """Build the classic loan-application toy dataset (ID3/C4.5 tutorial data).

    Each sample row is [age, has_job, has_own_house, credit_rating, label],
    where the last element is the class label ("是"/"否" = approved or not).

    :return: (dataset, featrues_names) — the list of 15 samples and the
        names of the four feature columns (the label column has no name).
    """
    # NOTE: the function/variable names keep the original (misspelled)
    # identifiers so existing callers keep working.
    dataset = [
        ["青年", "否", "否", "一般", "否"],
        ["青年", "否", "否", "好", "否"],
        ["青年", "是", "否", "好", "是"],
        ["青年", "是", "是", "一般", "是"],
        ["青年", "否", "否", "一般", "否"],
        ["中年", "否", "否", "一般", "否"],
        ["中年", "否", "否", "好", "否"],
        ["中年", "是", "是", "好", "是"],
        ["中年", "否", "是", "非常好", "是"],
        ["中年", "否", "是", "非常好", "是"],
        ["老年", "否", "是", "非常好", "是"],
        ["老年", "否", "是", "好", "是"],
        ["老年", "是", "否", "好", "是"],
        ["老年", "是", "否", "非常好", "是"],
        ["老年", "否", "否", "一般", "否"],
    ]
    featrues_names = ["年齡", "有工作", "有自己的房子", "信貸情況"]
    return dataset, featrues_names
def calcshannonent(dataset):
    """Return the Shannon entropy (base 2) of the class labels.

    :param dataset: list of samples; the last element of each row is the
        class label
    :return: entropy of the label distribution; 0.0 for an empty dataset
    """
    num = len(dataset)
    if num == 0:
        # Define H(empty) = 0 to avoid a ZeroDivisionError when a split
        # produces no samples.
        return 0.0
    # Count occurrences of each label (last column of every row).
    counts = Counter(row[-1] for row in dataset)
    return sum(-(c / num) * log(c / num, 2) for c in counts.values())
def datasplit(dataset, axis, value):
    """Select rows whose feature *axis* equals *value*, dropping that column.

    :param dataset: list of samples (class label in the last column)
    :param axis: index of the feature column to filter on
    :param value: feature value a row must have to be kept
    :return: new dataset with the matching rows, column *axis* removed
    """
    # Bug fix: the original computed data_merge but never appended it to
    # data_feature, so an empty list was always returned.
    return [row[:axis] + row[axis + 1:] for row in dataset if row[axis] == value]
def choosebestfeaturetosplit(dataset):
    """Pick the feature with the highest information gain ratio (C4.5).

    :param dataset: list of samples; the last column is the class label
    :return: (best_feature, infogainratemax) — index of the best feature
        (-1 if no feature has a positive gain ratio) and its gain ratio
    """
    base_shanonent = calcshannonent(dataset)
    features_nums = len(dataset[0]) - 1
    data_nums = len(dataset)
    infogainratemax = 0
    best_feature = -1
    for i in range(features_nums):
        feature_values = {row[i] for row in dataset}
        shanonent_ = 0   # conditional entropy H(D | feature i)
        h_feature = 0    # split information (intrinsic value) of feature i
        for value in feature_values:
            subset = datasplit(dataset, i, value)
            p = len(subset) / data_nums
            h_feature += -p * log(p, 2)
            shanonent_ += p * calcshannonent(subset)
        if h_feature == 0:
            # Feature takes a single value: split info is 0, so the gain
            # ratio is undefined (the original divided by zero here). Such
            # a feature cannot split the data — skip it.
            continue
        infogainrate = (base_shanonent - shanonent_) / h_feature
        if infogainrate > infogainratemax:
            infogainratemax = infogainrate
            best_feature = i
    return best_feature, infogainratemax
def majortitycnt(classlist):
    """Return the most frequent label in *classlist* (majority vote).

    :param classlist: non-empty list of class labels
    :return: the label with the highest count; ties go to the label seen first
    """
    # Bug fixes vs. original: the accumulator dict was never initialised,
    # dict.iteritems() is Python 2 only, and the literal was `true`.
    return Counter(classlist).most_common(1)[0][0]
def createtree(dataset, features_names):
    """Recursively build a decision tree using the gain-ratio criterion.

    :param dataset: list of samples; the last column is the class label
    :param features_names: names of the remaining feature columns; note
        this list is mutated (the chosen feature is deleted) — pass a copy
        if the caller needs it afterwards
    :return: a nested dict {feature_name: {feature_value: subtree_or_label}},
        or a bare label string for a leaf node
    """
    classlist = [row[-1] for row in dataset]
    # Leaf: every sample carries the same label.
    if len(set(classlist)) == 1:
        return classlist[0]
    # Leaf: only the label column remains — fall back to majority vote.
    if len(dataset[0]) == 1:
        return majortitycnt(classlist)
    best_feature, infogainratemax = choosebestfeaturetosplit(dataset)
    best_feature_name = features_names[best_feature]
    mytree = {best_feature_name: {}}
    del features_names[best_feature]
    featvalues = [row[best_feature] for row in dataset]
    for value in set(featvalues):
        # Copy the remaining names so sibling branches don't see each
        # other's deletions during recursion.
        sublabels = features_names[:]
        mytree[best_feature_name][value] = createtree(
            datasplit(dataset, best_feature, value), sublabels)
    return mytree
if __name__ == "__main__":
    # Build the toy loan dataset and print the induced decision tree
    # (a nested dict keyed by feature names and their values).
    dataset, features_names = createdateset()
    print(createtree(dataset, features_names))
)結果:}}
}
最大資訊熵增益 決策樹與資訊增益
今天我們開始介紹決策樹。它既可以用於分類,也可以用於回歸。這裡我們主要介紹更加常見的分類用法。概念決策樹,顧名思義,它的形狀類似於一棵樹,我們可以簡單把它畫出來 如上圖,最上面的乙個點我們叫它根節點 root node 最下面不再進行分類的點我們叫它葉節點 leaf node 決策樹的分類過程是這樣...
決策樹資訊增益
決策樹和整合演算法都是樹模型 決策樹 從根節點一步步走到葉子節點,所有的資料都會落到葉子節點,既可以做分類也可以做回歸。一顆樹有三種節點組成,根節點,中間幾點,葉子節點。根節點是第乙個選擇節點,也是最重要的乙個選擇特徵。葉子節點是存放最終的結果。決策樹的訓練和測試 訓練是建立一棵樹。測試是讓資料從根...
決策樹 資訊熵 資訊增益 基尼係數
決策樹系列目錄 文末有彩蛋 決策樹 決策樹演算法原理 id3,c4.5,cart 決策樹 決策樹引數介紹 分類和回歸 決策樹 決策樹sklearn調參 gridsearchcv 決策樹 python 實現決策樹 決策樹應用例項 鐵達尼號分類 決策樹應用例項 使用者流失 模型 決策樹應用例項 銀行借貸...