Skip to content

Latest commit

 

History

History
112 lines (78 loc) · 3.09 KB

2特征选择.md

File metadata and controls

112 lines (78 loc) · 3.09 KB

特征选择

参考

介绍各种特征选择

  1. scikit-learn中树算法(树的分支过程,本身就是特征的选择过程)

from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier

# Load the iris data set and split it into the feature matrix and the labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Fit an ensemble of 100 extremely randomized trees; the fitted model exposes
# one importance score per input feature.
model = ExtraTreesClassifier(n_estimators=100).fit(X, y)

# Feature names, e.g.
# ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
print(iris.feature_names)

# Importance scores in the same order as the names above, e.g.
# [0.10048684 0.05654855 0.40099705 0.44196756]
# No random_state is set, so the exact numbers differ from run to run.
print(model.feature_importances_)
  1. RFE(递归特征消除算法)搜索算法
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Recursive feature elimination (RFE): repeatedly fit the estimator and drop
# the weakest feature until only the requested number of features remains.
iris = datasets.load_iris()

X = iris.data
y = iris.target

model = LinearRegression()

# n_features_to_select must be passed by keyword: recent scikit-learn releases
# reject the old positional form RFE(model, 3) with a TypeError.
rfe = RFE(model, n_features_to_select=3)
rfe = rfe.fit(X, y)

# support_ is a boolean mask of the kept features; ranking_ is 1 for every
# kept feature and grows for features that were eliminated earlier.
print(rfe.support_) # [ True False  True  True]
print(rfe.ranking_) # [1 2 1 1]
  1. 利用LassoCV进行特征选择
from sklearn import datasets
from sklearn.linear_model import LassoCV

# Load the iris data set and split it into the feature matrix and the target.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Let 5-fold cross-validation choose the regularization strength from a small
# list of candidate alphas.
candidate_alphas = [0.001, 0.0001, 0.1, 1]
model = LassoCV(cv=5, random_state=0, alphas=candidate_alphas)
model.fit(X, y)

# The alpha picked by cross-validation.
print(model.alpha_)

# Fitted coefficients, e.g. [-0.11192739 -0.0392068   0.22963358  0.60707621]
# Lasso discards a feature by shrinking its coefficient to exactly 0; a
# negative coefficient still belongs to a feature that is kept.
print(model.coef_)
  1. 移除低方差特征
from sklearn.feature_selection import VarianceThreshold

# Six samples with three boolean features each.
X = [
    [0, 0, 1],
    [0, 1, 0],
    [1, 0, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
]

# A boolean feature that is 1 with probability p has variance p * (1 - p), so
# this threshold is the variance of a feature that takes the same value in 80%
# of the samples; features with a variance below it are removed.
sel = VarianceThreshold(threshold=0.8 * (1 - 0.8))

# The first column is 0 in five of the six samples, so it is the one removed.
sel.fit_transform(X)
  1. 单变量特征选择
SelectKBest 移除那些除了评分最高的 K 个特征之外的所有特征
SelectPercentile 移除除了用户指定的最高得分百分比之外的所有特征
对每个特征应用常见的单变量统计检验:假阳性率(false positive rate)SelectFpr、伪发现率(false discovery rate)SelectFdr,或者族系误差(family wise error)SelectFwe。
GenericUnivariateSelect 允许使用可配置方法来进行单变量特征选择。它允许超参数搜索评估器来选择最好的单变量特征。
  1. SelectFromModel
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

# Load the data inside this snippet so it runs on its own and X and y are
# guaranteed to have the same number of samples.
X, y = load_iris(return_X_y=True)

# Set the regularization parameter C=1.
# The L1 penalty is not supported by the default "lbfgs" solver, so a solver
# that handles it is named explicitly; "saga" supports L1 and multiclass
# targets. max_iter is raised as a precaution because the features are not
# scaled and saga may otherwise stop before converging.
logistic = LogisticRegression(
    C=1, penalty="l1", solver="saga", max_iter=5000, random_state=7
).fit(X, y)

# prefit=True tells SelectFromModel to reuse the already fitted estimator
# instead of fitting it again.
model = SelectFromModel(logistic, prefit=True)

# Keep only the columns of X that the selector retains.
X_new = model.transform(X)
X_new
  1. 相关系数法 相关系数
# Correlation matrix of the columns of df_train, drawn as a heatmap.
corrmat = df_train.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=0.8, square=True)

# Pick the columns with the largest correlation to 'SalePrice' and compute
# the correlation coefficients among just those columns.
n_top = 10  # number of variables for the second heatmap
top_cols = corrmat.nlargest(n_top, 'SalePrice')['SalePrice'].index
top_corr = np.corrcoef(df_train[top_cols].values.T)

# Annotated heatmap of the reduced matrix, labelled with the column names.
sns.set(font_scale=1.25)
tick_labels = top_cols.values
hm = sns.heatmap(
    top_corr,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=tick_labels,
    xticklabels=tick_labels,
)
plt.show()