使用網格搜尋法對7個模型進行調優(調參時採用五折交叉驗證的方式),並進行模型評估
import warnings

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
# Silence the FutureWarning raised from line 196 of a sklearn module.
warnings.filterwarnings(
    "ignore", category=FutureWarning, module="sklearn", lineno=196
)

# Load the data and report its (rows, columns) shape.
data_all = pd.read_csv(r'd:\datawhale\12.17\data_all.csv')
print('資料的行列', data_all.shape)

# Split into features / label, then into train (70%) and test (30%) sets.
# The 'status' column is the label; every other column is used as a feature.
x = data_all.drop(['status'], axis=1)
y = data_all['status']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=2018
)

# Standardize the features. The scaler is fitted on the training split only
# and then applied to both splits, so no test-set statistics leak into it.
sc = StandardScaler()
sc.fit(x_train)  # estimate the mean and standard deviation of each feature
x_train_std = sc.transform(x_train)
x_test_std = sc.transform(x_test)
# Grid search with 5-fold cross-validation, followed by test-set evaluation.
def gridsearch(model, parameters):
    """Tune ``model`` by grid search and print its evaluation metrics.

    Fits a ``GridSearchCV`` (accuracy scoring, 5 folds) over ``parameters``
    on the module-level ``x_train_std`` / ``y_train``, then prints the best
    cross-validation score, the best parameters, the test-set score and the
    test-set AUC, using the module-level ``x_test_std`` / ``y_test``.

    Args:
        model: The estimator to tune.
        parameters: The parameter grid handed to ``GridSearchCV``.

    Returns:
        The fitted ``GridSearchCV`` object, so callers can inspect it
        further. Existing callers that ignore the return value are
        unaffected.
    """
    grid = GridSearchCV(model, parameters, scoring='accuracy', cv=5)
    grid = grid.fit(x_train_std, y_train)

    # roc_auc_score needs a continuous score per sample: use the decision
    # function when the estimator offers one, otherwise the predicted
    # probability in column 1.
    # NOTE(review): column 1 is presumably the positive class (status == 1);
    # confirm the label is binary 0/1.
    if hasattr(model, 'decision_function'):
        y_predict_pro = grid.decision_function(x_test_std)
    else:
        y_predict_pro = grid.predict_proba(x_test_std)[:, 1]

    print('best score:', grid.best_score_)
    print(grid.best_params_)
    print('test score:', grid.score(x_test_std, y_test))
    print('auc:', metrics.roc_auc_score(y_test, y_predict_pro))
    return grid


# Logistic regression
# Tune and evaluate each of the seven models in turn.
# NOTE(review): the parameter grids were lost from the original text (every
# `parameters =` had no right-hand side). The grids below are small
# placeholders so that the script runs; replace them with the intended grids.
# NOTE(review): each estimator is fitted once before the search, as in the
# original. gridsearch() fits its own GridSearchCV on the same data, so this
# pre-fit looks redundant; it is kept only to preserve module-level state.
print('邏輯回歸:')
parameters = {'C': [0.01, 0.1, 1, 10]}
lr = LogisticRegression()
lr.fit(x_train_std, y_train)
gridsearch(lr, parameters)
print('')

# SVM
print('svm:')
parameters = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
svc = SVC()
svc.fit(x_train_std, y_train)
gridsearch(svc, parameters)
print('')

# Decision tree
print('決策樹:')
parameters = {'max_depth': [3, 5, 7, 9], 'min_samples_leaf': [1, 5, 10]}
clf = DecisionTreeClassifier()
clf.fit(x_train_std, y_train)
gridsearch(clf, parameters)
print('')

# Random forest
print('隨機森林:')
parameters = {'n_estimators': [50, 100, 200], 'max_depth': [5, 10, None]}
rfc = RandomForestClassifier(random_state=2018)
rfc.fit(x_train_std, y_train)
gridsearch(rfc, parameters)
print('')

# GBDT
print('gbdt:')
parameters = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5],
}
gbdt = GradientBoostingClassifier(random_state=2018)
gbdt.fit(x_train_std, y_train)
gridsearch(gbdt, parameters)
print('')

# XGBoost
print('xgboost:')
parameters = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5],
}
xgbs = XGBClassifier()
xgbs.fit(x_train_std, y_train)
gridsearch(xgbs, parameters)
print('')

# LightGBM (the header print was missing here; added to match the others)
print('lightgbm:')
parameters = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1],
    'num_leaves': [15, 31],
}
lgbm = LGBMClassifier(random_state=2018)
lgbm.fit(x_train_std, y_train)
gridsearch(lgbm, parameters)
print('')
4.結果
邏輯回歸
決策樹
隨機森林
其他幾個還在跑= =
一周演算法實踐day1 模型構建
這份資料集是金融資料(非原始資料,已經處理過了),我們要做的是預測貸款使用者是否會逾期。其中 status 是結果標籤:0 表示未逾期,1 表示逾期。data_all = pd.read_csv('data_all.csv') x_train, x_test, y_train, y_test = train_test_split(...)
一周又是一周
三年又三年,之後又三年,這就九年了,老大!無間道的話還迴盪在耳邊,而我入職以來,這就已經是第四周了。時間飛逝歲月如梭,一天一天的日子,感覺起來也挺快的。除了第一周剛上任的時候,感覺時間慢慢流,現在進入工作狀態以後,時間還是一閃而過的。白天的時間最是難熬,特別是下午的時候。工作時間久,而下午的時候也是...
演算法,第一周
題目:給定一個整數陣列 nums 和一個目標值 target,請你在該陣列中找出和為目標值的那兩個整數,並返回他們的陣列下標。你可以假設每種輸入只會對應一個答案。但是,你不能重複利用這個陣列中同樣的元素。示例:給定 nums = [2, 7, 11, 15],target = 9,因為 nums[0] + nums[1]...