Following the divide-and-conquer idea behind decision trees, this post encapsulates a decision tree classifier that uses the Gini criterion and supports tuning two hyperparameters: the maximum tree depth and the minimum number of samples per leaf.
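For reference, the Gini impurity of a node whose classes occur with proportions p_k is Gini = 1 - Σ_k p_k², so for example the labels [0, 0, 1] give 1 - ((2/3)² + (1/3)²) = 4/9 ≈ 0.444, and a pure node scores 0. This is exactly what the gini() helper at the bottom of the listing computes.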
import numpy as np
from collections import Counter
'''Encapsulate the decision tree method.
Author: evan
'''

class DecisionTreeClassifier:

    def __init__(self, max_depth=2, min_samples_leaf=1):
        self.tree_ = None
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf

    def fit(self, X, y):
        self.tree_ = self.creat_tree(X, y)
        return self
    def creat_tree(self, X, y, current_depth=1):
        # Stop growing when the depth limit is reached or the node is already pure.
        if current_depth > self.max_depth or gini(y) == 0:
            return None
        d, v, g = try_split(X, y, self.min_samples_leaf)
        if d == -1:
            return None
        node = Node(d, v, g)
        X_left, X_right, y_left, y_right = cut(X, y, v, d)
        node.children_left = self.creat_tree(X_left, y_left, current_depth + 1)
        if node.children_left is None:
            # No further split on this side: make a leaf with the majority label.
            label = Counter(y_left).most_common(1)[0][0]
            node.children_left = Node(l=label)
        node.children_right = self.creat_tree(X_right, y_right, current_depth + 1)
        if node.children_right is None:
            label = Counter(y_right).most_common(1)[0][0]
            node.children_right = Node(l=label)
        return node
    def predict(self, X):
        assert self.tree_ is not None, 'call the fit() method first'
        return np.array([self._predict(x, self.tree_) for x in X])

    def _predict(self, x, node):
        # Walk down the tree until a leaf (a node carrying a label) is reached.
        if node.label is not None:
            return node.label
        if x[node.dim] <= node.value:
            return self._predict(x, node.children_left)
        else:
            return self._predict(x, node.children_right)
def cut(X, y, v, d):
    '''Split the data in two on feature d at threshold v.'''
    ind_left = (X[:, d] <= v)
    ind_right = (X[:, d] > v)
    return X[ind_left], X[ind_right], y[ind_left], y[ind_right]
def try_split(X, y, min_samples_leaf):
    '''Scan every feature for the split point with the lowest total Gini.'''
    best_g = 1
    best_d = -1
    best_v = -1
    for d in range(X.shape[1]):
        sorted_index = np.argsort(X[:, d])
        for i in range(len(X) - 1):
            if X[sorted_index[i], d] == X[sorted_index[i + 1], d]:
                continue
            # Candidate threshold: the midpoint of two adjacent sorted values.
            v = (X[sorted_index[i], d] + X[sorted_index[i + 1], d]) / 2
            X_left, X_right, y_left, y_right = cut(X, y, v, d)
            gini_all = gini(y_left) + gini(y_right)
            # Keep the split only if both sides satisfy the leaf-size constraint.
            if gini_all < best_g and len(y_left) >= min_samples_leaf and len(y_right) >= min_samples_leaf:
                best_g = gini_all
                best_d = d
                best_v = v
    return best_d, best_v, best_g
# Define the node class.
class Node:
    def __init__(self, d=None, v=None, g=None, l=None):
        self.dim = d
        self.value = v
        self.gini = g
        self.label = l
        self.children_left = None
        self.children_right = None

    def __repr__(self):
        return "Node(d={},v={},g={},l={})".format(self.dim, self.value, self.gini, self.label)
# Compute the Gini impurity of a label array.
def gini(y):
    counter = Counter(y)
    result = 0
    for v in counter.values():
        result += (v / len(y)) ** 2
    return 1 - result
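A minimal usage sketch (my addition, not from the original post): the tiny 2-D dataset below is an illustrative assumption in which feature 0 alone separates the two classes, so a depth-2 tree recovers it exactly.

# Usage sketch (illustrative data, not part of the original post).
X = np.array([[1.0, 2.0], [1.5, 1.8], [1.2, 0.5],
              [5.0, 8.0], [6.0, 9.0], [5.5, 7.5]])
y = np.array([0, 0, 0, 1, 1, 1])

clf = DecisionTreeClassifier(max_depth=2, min_samples_leaf=1)
clf.fit(X, y)
print(clf.tree_)                                        # Node(d=0,v=3.25,g=0.0,l=None)
print(clf.predict(np.array([[1.1, 1.0], [5.2, 8.1]])))  # [0 1]

Since the class name and the two hyperparameters deliberately mirror sklearn, running sklearn.tree.DecisionTreeClassifier(criterion='gini', max_depth=2, min_samples_leaf=1) on the same data is a convenient sanity check on the predictions.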