XGBoost 中的變數重要性與變數選擇

2021-09-06 10:06:16 字數 4126 閱讀 3768







# Excerpt of the selection step: keep only the features whose importance is at
# least `thresh`, retrain a fresh classifier on that subset, and predict on the
# test set reduced to the same columns.
# SelectFromModel comes from sklearn.feature_selection and XGBClassifier from
# xgboost; `model` is the classifier already fitted on all features.

# select features using threshold
# prefit=True reuses the already-fitted model instead of fitting it again
selection = SelectFromModel(model, threshold=thresh, prefit=True)
select_x_train = selection.transform(x_train)

# train model
selection_model = XGBClassifier()
selection_model.fit(select_x_train, y_train)

# eval model
# the test set must go through the same selector so its columns match
select_x_test = selection.transform(x_test)
y_pred = selection_model.predict(select_x_test)


# use feature importance for feature selection
#
# Fits an XGBoost classifier on all features, reports its test accuracy, then
# uses each feature's importance in turn as a selection threshold: for every
# threshold a new classifier is trained on the surviving features and scored
# on the same test split.
from numpy import loadtxt
from numpy import sort
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")

# split data into x and y: columns 0-7 are the inputs, column 8 is the label
x = dataset[:, 0:8]
y = dataset[:, 8]

# split data into train and test sets (fixed seed so the split is repeatable)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=7)

# fit model on all training data
model = XGBClassifier()
model.fit(x_train, y_train)

# make predictions for test data and evaluate
y_pred = model.predict(x_test)
# round each prediction to an integer class label before scoring
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("accuracy: %.2f%%" % (accuracy * 100.0))

# fit model using each importance as a threshold
# sorted ascending, so each step keeps fewer features than the one before
thresholds = sort(model.feature_importances_)
for thresh in thresholds:
    # select features using threshold
    # prefit=True reuses the already-fitted model instead of fitting it again
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)

    # train model
    selection_model = XGBClassifier()
    selection_model.fit(select_x_train, y_train)

    # eval model
    # the test set must go through the same selector so its columns match
    select_x_test = selection.transform(x_test)
    y_pred = selection_model.predict(select_x_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    print("thresh=%.3f, n=%d, accuracy: %.2f%%" % (thresh, select_x_train.shape[1], accuracy*100.0))


accuracy: 77.95%

thresh=0.071, n=8, accuracy: 77.95%

thresh=0.073, n=7, accuracy: 76.38%

thresh=0.084, n=6, accuracy: 77.56%

thresh=0.090, n=5, accuracy: 76.38%

thresh=0.128, n=4, accuracy: 76.38%

thresh=0.160, n=3, accuracy: 74.80%

thresh=0.186, n=2, accuracy: 71.65%

thresh=0.208, n=1, accuracy: 63.78%


首先準備一些測試資料:create table tynametable (id int, typename nvarchar(10)); insert into tynametable values (1, '射手'); insert into tynametable values (10, '法師'); insert into ty...


線上某業務頻繁出現 IOPS 使用率 100%(每秒 4000 IOPS)的現象,每次持續接近 1 個小時。從慢請求的日誌發現是一個 getMore 請求耗時 1 個小時,導致 IOPS 高;深入調查之後,最終發現竟是一個索引選擇的問題。2017-11-01T15:04:17.498+0800 I COMMAND c...

