1 讀取資料
2 特徵工程
2.1 去除無效特徵
2.2 缺失值處理
2.3 特徵組合
2.4 相似特徵保一
2.5 特徵型別轉換
1 隨機森林
1.1 原理
1.2 流程
1.3 優點
1.4 缺點
2 引數調優
2.1 引數詳解
結果:
# Find the best n_estimators for RandomForestClassifier
import time

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
# Pick the n_estimators candidate with the lowest mean cross-validated log loss.
# Reads `train_kobe` (features) and `train_label` (targets), which are defined
# outside this section; both are indexed positionally via `.iloc`.
# Outputs: `best_n` (winning candidate), `min_score` (its mean log loss) and
# `scores_n` (mean log loss per candidate, in `range_n` order).
print('finding best n_estimators for randomforestclassifier...')
n_folds = 10
min_score = 100000
best_n = 0
scores_n = []
range_n = np.logspace(0, 2, num=3).astype(int)  # candidates: 1, 10, 100
for n in range_n:
    print("the number of trees : {0}".format(n))
    t1 = time.time()

    rfc_score = 0.
    rfc = RandomForestClassifier(n_estimators=n)
    for train_k, test_k in KFold(n_splits=n_folds, shuffle=True).split(train_kobe):
        rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])
        # log_loss is defined on predicted probabilities; hard labels from
        # predict() would be clipped to ~0/1 and only yield a scaled error rate.
        proba = rfc.predict_proba(train_kobe.iloc[test_k])
        # Pass the fitted class list so a fold missing a class still scores.
        fold_loss = log_loss(train_label.iloc[test_k], proba, labels=rfc.classes_)
        rfc_score += fold_loss / n_folds  # running mean over the folds
    scores_n.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_n = n

    t2 = time.time()
    print('done processing {0} trees ({1:.3f}sec)'.format(n, t2 - t1))
print(best_n, min_score)
# Find the best max_depth for RandomForestClassifier.
# Each candidate depth is scored by mean cross-validated log loss, using the
# tree count `best_n` chosen by the preceding n_estimators search.
# Reads `train_kobe` (features) and `train_label` (targets), which are defined
# outside this section; both are indexed positionally via `.iloc`.
# Outputs: `best_m` (winning depth), `min_score` (its mean log loss) and
# `scores_m` (mean log loss per candidate, in `range_m` order).
print('finding best max_depth for randomforestclassifier...')
n_folds = 10
min_score = 100000
best_m = 0
scores_m = []
range_m = np.logspace(0, 2, num=3).astype(int)  # candidates: 1, 10, 100
for m in range_m:
    print("the max depth : {0}".format(m))
    t1 = time.time()

    rfc_score = 0.
    rfc = RandomForestClassifier(max_depth=m, n_estimators=best_n)
    for train_k, test_k in KFold(n_splits=n_folds, shuffle=True).split(train_kobe):
        rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])
        # log_loss is defined on predicted probabilities; hard labels from
        # predict() would be clipped to ~0/1 and only yield a scaled error rate.
        proba = rfc.predict_proba(train_kobe.iloc[test_k])
        # Pass the fitted class list so a fold missing a class still scores.
        fold_loss = log_loss(train_label.iloc[test_k], proba, labels=rfc.classes_)
        rfc_score += fold_loss / n_folds  # running mean over the folds
    scores_m.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_m = m

    t2 = time.time()
    # The original message said "trees" here, but the value reported is a depth.
    print('done processing max_depth {0} ({1:.3f}sec)'.format(m, t2 - t1))
print(best_m, min_score)
