Feature Engineering: Feature Selection

Select n features out of the existing m (n < m), reducing feature dimensionality and computation while keeping model performance as good as possible. In real business settings the feature dimensionality fed into a model is often very high, on the order of tens of thousands; in some CTR-prediction problems it reaches hundreds of millions, and such high dimensionality drives up the computational complexity of the model. In practice, however, not every feature actually helps the model, so the unnecessary ones should be removed.

# Filter method: score each feature against the target with the chi-squared test
from sklearn.feature_selection import chi2
from sklearn.datasets import load_iris
import pandas as pd

X, y = load_iris(return_X_y=True)
X_df = pd.DataFrame(X, columns=list("abcd"))

# chi2 returns the chi-squared statistic and the p-value for each feature
chi2_stats, pval = chi2(X_df, y)

dict_feature = {}
for i, j in zip(X_df.columns.values, chi2_stats):
    dict_feature[i] = j

# Sort the dict by value, descending
ls = sorted(dict_feature.items(), key=lambda item: item[1], reverse=True)

# Number of features to select
k = 2
ls_new_feature = []
for i in range(k):
    ls_new_feature.append(ls[i][0])

X_new = X_df[ls_new_feature]
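The original import line also pulls in SelectKBest, which wraps this whole score-sort-slice routine in one step. A minimal sketch of the equivalent selection, reusing the X_df and y from above (the skb and X_new_skb names are illustrative):

# Sketch: same top-k chi-squared selection via SelectKBest (k=2 as above)
from sklearn.feature_selection import SelectKBest, chi2

skb = SelectKBest(score_func=chi2, k=2)
skb.fit(X_df, y)
X_new_skb = X_df[X_df.columns.values[skb.get_support()]]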
# Filter method: mutual information between each feature and the discrete target
from sklearn.datasets import load_iris
from sklearn.feature_selection import mutual_info_classif
import pandas as pd

X, y = load_iris(return_X_y=True)
X_df = pd.DataFrame(X, columns=list("abcd"))

# Columns to be treated as discrete features, passed to
# mutual_info_classif as a list of column indices
feature_cat = ["a", "d"]
discrete_features = []
feature = X_df.columns.values.tolist()
for name in feature_cat:
    if name in feature:
        discrete_features.append(feature.index(name))

mu = mutual_info_classif(X_df, y, discrete_features=discrete_features,
                         n_neighbors=3, copy=True, random_state=None)

dict_feature = {}
for i, j in zip(X_df.columns.values, mu):
    dict_feature[i] = j

# Sort the dict by value, descending
ls = sorted(dict_feature.items(), key=lambda item: item[1], reverse=True)

# Number of features to select
k = 2
ls_new_feature = []
for i in range(k):
    ls_new_feature.append(ls[i][0])

X_new = X_df[ls_new_feature]
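The mutual-information scorer can also be plugged into SelectKBest directly. A minimal sketch, assuming the X_df, y, and discrete_features built above; the mi_score name and random_state=0 are illustrative choices:

# Sketch: fix the extra arguments with functools.partial so SelectKBest
# can call the scorer as score_func(X, y)
from functools import partial
from sklearn.feature_selection import SelectKBest, mutual_info_classif

mi_score = partial(mutual_info_classif, discrete_features=discrete_features,
                   n_neighbors=3, random_state=0)
skb_mi = SelectKBest(score_func=mi_score, k=2)
skb_mi.fit(X_df, y)
X_new_mi = X_df[X_df.columns.values[skb_mi.get_support()]]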
# Embedded method: L1-regularized logistic regression as the selector;
# the L1 penalty drives the coefficients of weak features to zero
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
import pandas as pd

X, y = load_iris(return_X_y=True)
X_df = pd.DataFrame(X, columns=list("abcd"))

# solver="liblinear" is needed because the default solver does not support L1
sf = SelectFromModel(estimator=LogisticRegression(penalty="l1", C=0.1,
                                                  solver="liblinear"),
                     threshold=None,
                     prefit=False,
                     norm_order=1)
sf.fit(X_df, y)
X_new = X_df[X_df.columns.values[sf.get_support()]]
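To see which coefficients the L1 penalty actually zeroed out, the fitted inner model is exposed as sf.estimator_. A quick illustrative peek, not part of the original post:

# Sketch: per-feature importance as the L1 norm of the coefficient columns
import numpy as np

coef = sf.estimator_.coef_  # shape (n_classes, n_features) for liblinear OVR
print(dict(zip(X_df.columns.values, np.abs(coef).sum(axis=0))))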
# Filter method: variance threshold. Recommended for numeric features; for
# categorical features, consider the proportion of each category instead
from sklearn.feature_selection import VarianceThreshold
from sklearn.datasets import load_iris
import pandas as pd

X, y = load_iris(return_X_y=True)
X_df = pd.DataFrame(X, columns=list("abcd"))

ts = 0.5
vt = VarianceThreshold(threshold=ts)
vt.fit(X_df)

# Inspect the variance of each feature
dict_variance = {}
for i, j in zip(X_df.columns.values, vt.variances_):
    dict_variance[i] = j

# Collect the names of the retained features (VarianceThreshold keeps
# features whose variance is strictly greater than the threshold)
ls = list()
for i, j in dict_variance.items():
    if j > ts:
        ls.append(i)

X_new = pd.DataFrame(vt.fit_transform(X_df), columns=ls)
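For comparison, the retained column names can also be read straight off the fitted selector with get_support() rather than rebuilt by hand; a minimal sketch:

# Sketch: boolean support mask indexes the original column names directly
cols_kept = X_df.columns.values[vt.get_support()]
X_new2 = pd.DataFrame(vt.transform(X_df), columns=cols_kept)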