from sklearn import tree
import pydotplus
import pandas as pd

def cart_skl_test():
    df = pd.read_csv("../dataset/liquefaction_data_mle.csv")
    x = df[['csr', 'vs']]
    y = df['target']
    clf = tree.DecisionTreeClassifier()
    clf.fit(x, y)
    dot_data = tree.export_graphviz(clf, out_file=None)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_png("carttree.png")
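As a side note (not part of the original code), Graphviz and pydotplus can be awkward to install; scikit-learn's built-in tree.plot_tree (available from version 0.21) draws an equivalent picture with matplotlib only. The sketch below is an assumed alternative, with the figure size and output filename chosen arbitrarily.

import matplotlib.pyplot as plt

def cart_skl_plot(clf, feature_names):
    # Render the fitted classifier without Graphviz; assumes scikit-learn >= 0.21.
    plt.figure(figsize=(12, 8))
    tree.plot_tree(clf, feature_names=feature_names, filled=True)
    plt.savefig("carttree_plt.png")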
import pandas as pd
import math

def get_gini(dataset):
    num_instances = len(dataset)   # number of instances in the node
    label_counts = {}              # count how many instances carry each label
    for featvec in dataset:
        current_label = featvec[-1]
        if current_label not in label_counts.keys():
            label_counts[current_label] = 0
        label_counts[current_label] += 1
    sum_prob = 0.0
    for key in label_counts:
        prob = float(label_counts[key]) / num_instances
        sum_prob = sum_prob + math.pow(prob, 2)
    gini = 1 - sum_prob   # Gini impurity: 1 minus the sum of squared class proportions
    return gini
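A quick sanity check of get_gini on made-up data (these rows are purely illustrative and are not taken from the liquefaction dataset): with two classes appearing in equal proportion, the impurity is 1 - (0.5^2 + 0.5^2) = 0.5.

# Made-up toy rows: two numeric features plus a binary label in the last column.
toy = [[0.20, 150, 1],
       [0.30, 180, 0],
       [0.25, 160, 1],
       [0.40, 200, 0]]
print(get_gini(toy))   # 0.5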
def splitdataset(dataset, axis, value):
    leftdataset = []
    rightdataset = []
    for featvec in dataset:
        if featvec[axis] <= value:
            leftdataset.append(featvec)
        else:
            rightdataset.append(featvec)
    # print(leftdataset)
    # print(rightdataset)
    return leftdataset, rightdataset
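Using the same made-up toy rows, splitting on feature 0 at a threshold of 0.275 sends the two rows whose first value is at most 0.275 to the left branch:

left, right = splitdataset(toy, 0, 0.275)
print(left)    # [[0.2, 150, 1], [0.25, 160, 1]]
print(right)   # [[0.3, 180, 0], [0.4, 200, 0]]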
def choosebestfeaturetosplit(dataset):
    numfeatures = len(dataset[0]) - 1   # the label column is not a candidate feature
    bestinfogini = 1.0
    bestfeature = -1
    bestsplitvalue = -1
    basegini = get_gini(dataset)
    for i in range(numfeatures):
        # pull the values of the i-th feature into a one-dimensional list
        featlist = [example[i] for example in dataset]
        # drop duplicates and sort
        uniquevals = list(set(featlist))
        uniquevals.sort()
        featuresplit = -1    # best split point for the current feature
        featuregini = 1.0    # smallest Gini value for the current feature
        # pick the best split point for the current feature
        for j in range(len(uniquevals) - 1):
            value = (uniquevals[j] + uniquevals[j + 1]) / 2
            left_dataset, right_dataset = splitdataset(dataset, i, value)
            prob = len(left_dataset) / float(len(dataset))
            currentgini = prob * get_gini(left_dataset) + (1 - prob) * get_gini(right_dataset)
            if currentgini < featuregini:
                featuregini = currentgini
                featuresplit = value
        # keep the best feature and its split point
        if featuregini < bestinfogini:
            bestinfogini = featuregini
            bestfeature = i
            bestsplitvalue = featuresplit
    print("bestfeature: {}, bestsplitvalue: {}, gini: {}".format(bestfeature, bestsplitvalue, basegini))
    return bestfeature, bestsplitvalue, bestinfogini
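On the toy rows, candidate thresholds are the midpoints between consecutive sorted feature values; feature 0 separates the two classes perfectly at 0.275, so the weighted Gini of the best split drops to 0:

feat, split, gini = choosebestfeaturetosplit(toy)
print(feat, split, gini)   # 0 0.275 0.0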
def createtree(dataset, parafeaturename):
    # copy out the labels
    classlist = [example[-1] for example in dataset]
    # if every label in the node is identical --> leaf node
    if classlist.count(classlist[0]) == len(classlist):
        return classlist[0]
    bestfeat, bestsplit, gini = choosebestfeaturetosplit(dataset)
    bestfeaturename = parafeaturename[bestfeat]
    mytree = {bestfeaturename: {}}   # the tree is stored as a nested dict
    # build the subtrees recursively
    lefttree, righttree = splitdataset(dataset, bestfeat, bestsplit)
    mytree[bestfeaturename]["<=" + str(bestsplit)] = createtree(lefttree, parafeaturename)
    mytree[bestfeaturename][">" + str(bestsplit)] = createtree(righttree, parafeaturename)
    return mytree
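The post stops at building the tree; to use the nested dict for prediction, a small helper can walk the "<=value" / ">value" keys written by createtree. The classify function below is an assumed addition, not part of the original code:

def classify(mytree, featurenames, sample):
    # Assumed helper: descend the nested dict until a leaf label is reached.
    if not isinstance(mytree, dict):
        return mytree
    featname = list(mytree.keys())[0]
    featindex = list(featurenames).index(featname)
    for key, subtree in mytree[featname].items():
        threshold = float(key.lstrip("<=>"))   # keys look like "<=0.275" or ">0.275"
        if key.startswith("<=") and sample[featindex] <= threshold:
            return classify(subtree, featurenames, sample)
        if key.startswith(">") and sample[featindex] > threshold:
            return classify(subtree, featurenames, sample)

For example, classify(tree, featurename, dataset[0]) should reproduce the label of the first training row.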
if __name__ == "__main__":
    # cart_skl_test()
    df = pd.read_csv("../dataset/liquefaction_data_mle.csv")   # read the .csv data
    featurename = df.columns.values
    dataset = []
    for i in df.values:
        dataset.append(list(i))   # convert each row to a plain list (label in the last column)
    tree = createtree(dataset, featurename)
    print(tree)
Output of the self-written implementation: an excerpt of the printed nested dict, with split keys such as '>0.26', '>15.55', and '>16.35'.
Output of the library (scikit-learn) version: the tree image written to carttree.png.