  • Data Mining Feature Extraction Methods: A Collection

    1. Feature selection with a tree-based model

      # Feature selection with a tree-based model
      import numpy as np
      from sklearn import feature_selection
      from sklearn.ensemble import GradientBoostingClassifier

      def select_by_tree_model(X, target):
          matrix = np.array(X)
          target = np.array(target)
          temp = feature_selection.SelectFromModel(GradientBoostingClassifier()).fit(matrix, target)
          indx = temp.get_support().tolist()                      # boolean mask of selected features
          scores = temp.estimator_.feature_importances_.tolist()  # importance of each feature
          result = temp.transform(matrix).tolist()                # data reduced to the selected features
          return scores, indx, result

      # X: array-like
      # target: array-like
      # http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html
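
      A minimal usage sketch, assuming the snippet above has been run so that select_by_tree_model is defined, and using scikit-learn's built-in iris dataset:

      from sklearn.datasets import load_iris

      iris = load_iris()
      scores, indx, result = select_by_tree_model(iris.data, iris.target)
      print(scores)          # one importance value per original feature
      print(indx)            # e.g. [False, False, True, True] -- petal features dominate on iris
      print(len(result[0]))  # number of features that were kept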

    2. Feature selection via L1/L2 regularization penalties

      # Feature selection via L1/L2 regularization penalties
      import numpy as np
      from sklearn import feature_selection
      from sklearn.linear_model import LogisticRegression

      def select_by_l1_penalty(X, target):
          matrix = np.array(X)
          target = np.array(target)
          # the liblinear solver is required for the L1 penalty in recent scikit-learn versions
          model = LogisticRegression(penalty="l1", C=0.1, solver="liblinear")
          temp = feature_selection.SelectFromModel(model).fit(matrix, target)
          indx = temp.get_support().tolist()        # boolean mask of selected features
          scores = temp.estimator_.coef_.tolist()   # fitted coefficients used for selection
          result = temp.transform(matrix).tolist()  # data reduced to the selected features
          return scores, indx, result

      # X: array-like
      # target: array-like
      # http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html
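
      Since C is the inverse of the regularization strength, smaller values push more coefficients to exactly zero, so fewer features survive. A rough comparison sketch, reusing the imports from the snippet above and scikit-learn's breast-cancer dataset (the C values are arbitrary, for illustration only):

      from sklearn.datasets import load_breast_cancer

      data = load_breast_cancer()
      for c in (0.01, 0.1, 1.0):
          model = LogisticRegression(penalty="l1", C=c, solver="liblinear")
          selector = feature_selection.SelectFromModel(model).fit(data.data, data.target)
          # smaller C -> stronger penalty -> fewer non-zero coefficients -> fewer features kept
          print(c, int(selector.get_support().sum()))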

    3. Recursive feature elimination (RFE)

      # Recursive feature elimination (RFE)
      import numpy as np
      from sklearn import feature_selection
      from sklearn.linear_model import LogisticRegression

      def select_by_rfe(X, target, n_features):
          matrix = np.array(X)
          target = np.array(target)
          temp = feature_selection.RFE(estimator=LogisticRegression(),
                                       n_features_to_select=n_features).fit(matrix, target)
          scores = temp.ranking_.tolist()           # rank of each feature (1 = selected)
          indx = temp.support_.tolist()             # boolean mask of selected features
          result = temp.transform(matrix).tolist()  # data reduced to the selected features
          return scores, indx, result

      # X: array-like
      # target: array-like
      # n_features: int
      # http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html
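
      In ranking_, the selected features are ranked 1 and eliminated features get increasing ranks in the order they were dropped. A brief illustration, assuming the select_by_rfe function above and the iris dataset:

      from sklearn.datasets import load_iris

      iris = load_iris()
      scores, indx, result = select_by_rfe(iris.data, iris.target, n_features=2)
      print(scores)   # e.g. [3, 2, 1, 1] -- the two features ranked 1 were kept
      print(indx)     # boolean mask equivalent to (ranking == 1)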

    4. Mutual-information (MIC) based selection

      # Mutual-information (MIC) based selection
      from minepy import MINE
      import numpy as np
      from sklearn import feature_selection

      def mic(x, y):
          # maximal information coefficient between one feature column and the target
          m = MINE()
          m.compute_score(x, y)
          return (m.mic(), 0.5)  # dummy p-value; SelectKBest only ranks by the score

      def select_by_mic(X, target, k):
          matrix = np.array(X)
          target = np.array(target)
          # compute the MIC score column by column and hand the score array to SelectKBest
          score_func = lambda X, Y: np.array(list(map(lambda x: mic(x, Y), X.T))).T[0]
          temp = feature_selection.SelectKBest(score_func, k=k).fit(matrix, target)
          scores = temp.scores_.tolist()            # MIC score of each feature
          indx = temp.get_support().tolist()        # boolean mask of the k selected features
          result = temp.transform(matrix).tolist()  # data reduced to the selected features
          return scores, indx, result

      # X: array-like
      # target: array-like
      # k: int
      # http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
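
      minepy is a third-party dependency; if it is not available, scikit-learn's own mutual_info_classif can serve as a drop-in score function for SelectKBest. A sketch of that alternative (it estimates mutual information differently from MIC):

      from sklearn.datasets import load_iris
      from sklearn.feature_selection import SelectKBest, mutual_info_classif

      iris = load_iris()
      selector = SelectKBest(mutual_info_classif, k=2).fit(iris.data, iris.target)
      print(selector.scores_.tolist())        # estimated mutual information per feature
      print(selector.get_support().tolist())  # mask of the two highest-scoring features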

    5. Selecting features by correlation coefficient

      # Feature selection by Pearson correlation coefficient
      import numpy as np
      from scipy.stats import pearsonr
      from sklearn import feature_selection

      def select_by_pearson(X, target, k):
          matrix = np.array(X)
          target = np.array(target)
          # score each feature by the absolute Pearson correlation with the target
          score_func = lambda X, Y: np.array(list(map(lambda x: abs(pearsonr(x, Y)[0]), X.T)))
          temp = feature_selection.SelectKBest(score_func, k=k).fit(matrix, target)
          scores = temp.scores_.tolist()            # |correlation| of each feature
          indx = temp.get_support().tolist()        # boolean mask of the k selected features
          result = temp.transform(matrix).tolist()  # data reduced to the selected features
          return scores, indx, result

      # X: array-like
      # target: array-like
      # k: int
      # http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
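
      Pearson correlation only captures linear relationships and is most meaningful for continuous targets. A quick sketch, assuming the select_by_pearson function above and scikit-learn's diabetes regression dataset:

      from sklearn.datasets import load_diabetes

      data = load_diabetes()
      scores, indx, result = select_by_pearson(data.data, data.target, k=3)
      print(scores)   # |Pearson r| between each feature and the target
      print(indx)     # mask of the three most correlated features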

    6. Feature selection by the chi-squared test

      # Feature selection by the chi-squared test
      import numpy as np
      from sklearn import feature_selection
      from sklearn.feature_selection import chi2

      def select_by_chi2(X, target, k):
          matrix = np.array(X)
          target = np.array(target)
          # chi2 requires non-negative feature values (e.g. counts or frequencies)
          temp = feature_selection.SelectKBest(chi2, k=k).fit(matrix, target)
          scores = temp.scores_.tolist()            # chi-squared statistic of each feature
          indx = temp.get_support().tolist()        # boolean mask of the k selected features
          result = temp.transform(matrix).tolist()  # data reduced to the selected features
          return scores, indx, result

      # X: array-like
      # target: array-like
      # k: int
      # http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
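
      Because chi2 expects non-negative values, it fits naturally with count-style data. A small synthetic sketch, assuming the select_by_chi2 function above (the data and label construction are made up for illustration):

      import numpy as np

      rng = np.random.RandomState(0)
      counts = rng.poisson(lam=3, size=(100, 5))              # non-negative count features
      labels = (counts[:, 0] + counts[:, 1] > 6).astype(int)  # label built from the first two columns
      scores, indx, result = select_by_chi2(counts, labels, k=2)
      print(indx)   # columns 0 and 1 should score highest, since the label was derived from them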

    7. Selecting features by variance

      # Feature selection by variance threshold
      import numpy as np
      from sklearn import feature_selection

      def select_by_variance(X, t):
          matrix = np.array(X)
          temp = feature_selection.VarianceThreshold(threshold=t).fit(matrix)
          scores = [np.var(el) for el in matrix.T]  # variance of each feature column
          indx = temp.get_support().tolist()        # boolean mask of features with variance above t
          result = temp.transform(matrix).tolist()  # data reduced to the selected features
          return scores, indx, result

      # X: array-like
      # t: float
      # http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html
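
      A quick sketch of the effect, assuming the select_by_variance function above: a constant column has zero variance and is dropped by any positive threshold.

      data = [[1, 0, 3],
              [2, 0, 1],
              [3, 0, 2]]
      scores, indx, result = select_by_variance(data, t=0.1)
      print(scores)   # variance per column, e.g. [0.666..., 0.0, 0.666...]
      print(indx)     # [True, False, True] -- the constant middle column is removed
      print(result)   # [[1, 3], [2, 1], [3, 2]]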

    Reference: https://www.kesci.com/

  • Original post: https://www.cnblogs.com/jean925/p/9314864.html