Build models with logistic regression, SVM, decision tree, random forest, and XGBoost; any scoring method may be used, e.g. accuracy. (The code below covers decision tree, random forest, and XGBoost; a sketch for logistic regression and SVM appears at the end.)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

datapath = 'd:\\data\\dataanalyse\\'
datafile = 'dataprocessbytwotask.csv'
data = pd.read_csv(datapath + datafile, encoding='gbk')
train_data, test_data = train_test_split(data, test_size=0.3, random_state=2018)
print("train_data", train_data.shape)
print("test_data", test_data.shape)
y_train = train_data['status']
x_train = train_data.drop(['status'], axis=1)
y_test = test_data['status']
x_test = test_data.drop(['status'], axis=1)
clf = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0)
# Cross-validated accuracy on the training split
scores_train = cross_val_score(clf, x_train, y_train)
print('train set accuracy =', scores_train.mean())
# Cross-validated accuracy on the test split
scores_test = cross_val_score(clf, x_test, y_test)
print('test set accuracy =', scores_test.mean())
Output:
train set accuracy = 0.7012350276158706
test set accuracy = 0.6691776554344605
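Note that cross_val_score is called with its defaults above: for a classifier the default scorer is accuracy, and recent scikit-learn versions default to 5-fold cross-validation. A minimal sketch making both explicit (the parameter values are illustrative, not from the original run):

# Make the fold count and scoring metric explicit
scores = cross_val_score(clf, x_train, y_train, cv=5, scoring='accuracy')
print('mean accuracy =', scores.mean(), '+/-', scores.std())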
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

datapath = 'd:\\data\\dataanalyse\\'
datafile = 'dataprocessbytwotask.csv'
data = pd.read_csv(datapath + datafile, encoding='gbk')
train_data, test_data = train_test_split(data, test_size=0.3, random_state=2018)
print("train_data", train_data.shape)
print("test_data", test_data.shape)
y_train = train_data['status']
x_train = train_data.drop(['status'], axis=1)
y_test = test_data['status']
x_test = test_data.drop(['status'], axis=1)
clf = RandomForestClassifier(n_jobs=2)
# Fit on the training split (cross_val_score below refits internal clones,
# so this fit/predict pair does not affect the scores that follow)
clf.fit(x_train, y_train)
clf.predict(x_test)
# Cross-validated accuracy on the training split
scores_train = cross_val_score(clf, x_train, y_train)
print('train set accuracy =', scores_train.mean())
# Cross-validated accuracy on the test split
scores_test = cross_val_score(clf, x_test, y_test)
print('test set accuracy =', scores_test.mean())
Output:
train set accuracy = 0.7787475721945868
test set accuracy = 0.7520794391727867
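The clf.predict(x_test) call above discards its result. A minimal sketch of a plain holdout evaluation that actually uses the predictions (assuming the same split as above):

from sklearn.metrics import accuracy_score

clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)  # predictions on the holdout set
print('holdout accuracy =', accuracy_score(y_test, y_pred))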
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn import metrics

model = XGBClassifier(learning_rate=0.01,
                      n_estimators=10,      # number of trees: build XGBoost with 10 trees
                      max_depth=4,          # tree depth
                      min_child_weight=1,   # minimum child weight in a leaf node
                      gamma=0.,             # coefficient on the leaf-count penalty term
                      subsample=1,          # use all samples when building each tree
                      colsample_bytree=1,   # use all features when building each tree
                      scale_pos_weight=1,   # compensate for class imbalance
                      random_state=27,      # random seed
                      silent=0)             # note: removed in newer xgboost; use verbosity instead
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print("accuracy : %.4g" % metrics.accuracy_score(y_test, y_pred))
y_train_proba = model.predict_proba(x_train)[:, 1]
print("auc score (train): %f" % metrics.roc_auc_score(y_train, y_train_proba))
y_proba = model.predict_proba(x_test)[:, 1]
print("auc score (test): %f" % metrics.roc_auc_score(y_test, y_proba))
Results:
accuracy : 0.7551
auc score (train): 0.807798
auc score (test): 0.756977
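plot_importance is imported above but never called; a minimal usage sketch on the fitted model (it draws on the matplotlib figure imported above):

plot_importance(model)  # bar chart of per-feature importance from the trained booster
plt.show()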
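The task statement also asks for logistic regression and SVM, which the post does not implement. A minimal sketch under the same x_train/y_train split (hyperparameters are illustrative defaults, not tuned):

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Logistic regression; max_iter raised so the solver converges on unscaled data
lr = LogisticRegression(max_iter=1000)
print('LR cv accuracy =', cross_val_score(lr, x_train, y_train, cv=5).mean())

# RBF-kernel SVM with the library-default C=1.0
svm = SVC(C=1.0, kernel='rbf')
print('SVM cv accuracy =', cross_val_score(svm, x_train, y_train, cv=5).mean())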