zoukankan      html  css  js  c++  java
  • python 特征选择 绘图 + mine

    demo代码:

    # _*_coding:UTF-8_*_
    import numpy as np
    import sys 
    import pandas as pd
    from pandas import Series,DataFrame
    import numpy as np
    import sys 
    from sklearn import preprocessing
    from sklearn.ensemble import ExtraTreesClassifier
    import os
    from minepy import MINE
    
    def iterbrowse(path):
        """Lazily yield the path of every file located anywhere below *path*.

        The tree is traversed with ``os.walk``; sub-directories themselves are
        never yielded, only the files they contain, each joined onto the
        directory it was found in.
        """
        for entry in os.walk(path):
            folder, names = entry[0], entry[2]
            for full_path in (os.path.join(folder, name) for name in names):
                yield full_path
    
    
    def get_data(filename, expected_fields=78, skip_fields=3):
        """Parse a tab-separated feature file into a list of float rows.

        Every line must split on tabs into exactly ``expected_fields`` fields.
        The first ``skip_fields`` fields of each line are discarded and the
        remaining ones are converted with ``float``.

        Args:
            filename: path of the text file to read.
            expected_fields: required number of tab-separated fields per line
                (defaults to 78, the value this script was written for).
            skip_fields: number of leading fields to drop from each line
                (defaults to 3).

        Returns:
            A list with one entry per line, in file order; each entry is the
            list of floats parsed from that line.

        Raises:
            ValueError: if a line does not have ``expected_fields`` fields
                (the message names the file, the 1-based line number and the
                offending line), or if a kept field is not a valid float.
        """
        rows = []
        with open(filename) as f:
            # Iterate the file object directly so the whole file is never
            # held in memory as a list of lines.
            for line_no, line in enumerate(f, 1):
                fields = line.split("\t")
                if len(fields) != expected_fields:
                    raise ValueError(
                        "%s:%d: expected %d tab-separated fields, got %d: %r"
                        % (filename, line_no, expected_fields, len(fields), line))
                rows.append([float(value) for value in fields[skip_fields:]])
        return rows
    
    
    def _load_matrix(path):
        """Load a sample array from a ``.txt`` or ``.npy`` file.

        ``.txt`` files are read with ``np.genfromtxt`` and ``.npy`` files with
        ``np.load``.

        Raises:
            ValueError: if *path* ends with neither extension (previously this
                case surfaced later as a NameError on an unbound variable).
        """
        if path.endswith('.txt'):
            return np.genfromtxt(path)
        if path.endswith('.npy'):
            return np.load(path)
        raise ValueError("unsupported sample file type: %s" % path)


    if __name__ == '__main__':
        # Script entry point: rank features with ExtraTreesClassifier, plot
        # every feature column per label group, then print each feature's MIC
        # against the labels.
        import matplotlib.pyplot as plt

        neg_file = "cc_data/black_all.txt"
        pos_file = "cc_data/white_all.txt"
        X = []
        y = []
        # Labels: 0 = white (pos_file), 1 = black (neg_file).
        # A file that does not exist is skipped without error.
        for sample_file, label in ((pos_file, 0), (neg_file, 1)):
            if not os.path.isfile(sample_file):
                continue
            samples = _load_matrix(sample_file)
            X.extend(samples)
            y += [label] * len(samples)

        print("len of X:", len(X))
        print("X sample:", X[:3])
        print("len of y:", len(y))
        print("y sample:", y[:3])
        # Drop the first three columns of every training row.
        X = [x[3:] for x in X]
        print("filtered X sample:", X[:3])

        # Feature columns are named by number, starting at "6".
        cols = [str(i + 6) for i in range(len(X[0]))]
        clf = ExtraTreesClassifier()
        clf.fit(X, y)
        print(clf.feature_importances_)
        # Single-argument print calls behave the same on Python 2 and 3.
        print("Features sorted by their score:")
        print(sorted(zip(clf.feature_importances_, cols), reverse=True))

        # Extra samples kept out of the classifier fit: label 3 = black-todo.
        black_verify = []
        for f in iterbrowse("todo/top"):
            print(f)
            black_verify += get_data(f)
        # NOTE: an earlier run reported here
        # "ValueError: operands could not be broadcast together with shapes
        # (1,74) (75,) (1,74)" -- the row widths of these files must match X.
        print(black_verify)
        black_verify_labels = [3] * len(black_verify)

        # Label 2 = white-todo.
        white_verify = get_data("todo/white_verify.txt")
        print(white_verify)
        white_verify_labels = [2] * len(white_verify)

        # Loaded and printed only; not used in the plots or the MIC scores.
        unknown_verify = get_data("todo/pek_feature74.txt")
        print(unknown_verify)

        # extend data
        X = np.concatenate((X, black_verify))
        y += black_verify_labels
        X = np.concatenate((X, white_verify))
        y += white_verify_labels

        #################################### plot ####################################
        data_train = pd.DataFrame(X)
        data_train.columns = cols

        # add label column
        data_train["label"] = pd.Series(y)

        print(data_train.info())
        print(data_train.columns)

        # The per-label row selection does not depend on the column, so do it
        # once; order matches the legend entries below.
        groups = [data_train[data_train.label == label] for label in (0, 1, 2, 3)]

        for col in cols:
            # One figure per feature. The earlier code also created a second,
            # never-drawn-on figure (figsize=(20, 16), dpi=8) on every
            # iteration, which only produced an extra empty window.
            plt.figure()
            for group in groups:
                group[col].plot()
            plt.xlabel(u"sample data id")
            plt.ylabel(u"value")
            plt.title(col)
            plt.legend((u'white', u'black', u"white-todo", u"black-todo"), loc='best')
            plt.show()

        print("calculate MINE mic value:")
        for col in cols:
            mine = MINE(alpha=0.6, c=15,
                        est="mic_approx")  # http://minepy.readthedocs.io/en/latest/python.html#second-example
            mine.compute_score(data_train[col], y)
            print("%s MIC= %s" % (col, mine.mic()))

        # Finished normally: report success (the script used to exit with -1).
        sys.exit(0)
    

     代码中 “extend data” 一段追加的是待预测(待验证)的数据:标签 2、3 分别对应图例中的 white-todo 和 black-todo。

    关于 MIC(Maximal Information Coefficient,最大信息系数)的示例:

    from __future__ import division
    import numpy as np
    import matplotlib.pyplot as plt
    from minepy import MINE
    
    
    # Fixed-seed random generator shared by mvnormal, rotnormal and others.
    rs = np.random.RandomState(seed=0)
    
    def mysubplot(x, y, numRows, numCols, plotNum,
                  xlim=(-4, 4), ylim=(-4, 4)):
        """Draw one scatter panel titled with the Pearson r and MIC of (x, y).

        Args:
            x, y: the two samples to compare and plot against each other.
            numRows, numCols, plotNum: subplot grid shape and the position of
                this panel, passed straight to ``plt.subplot``.
            xlim, ylim: axis limits of the panel.

        Returns:
            The axes object the points were drawn on.
        """
        # Both statistics are rounded to one decimal for the title.
        r = np.around(np.corrcoef(x, y)[0, 1], 1)
        mine = MINE(alpha=0.6, c=15, est="mic_approx")
        mine.compute_score(x, y)
        mic = np.around(mine.mic(), 1)
        ax = plt.subplot(numRows, numCols, plotNum,
                         xlim=xlim, ylim=ylim)
        # Two-line title; the "\n" escape must stay inside the literal (a raw
        # line break here is a syntax error).
        ax.set_title('Pearson r=%.1f\nMIC=%.1f' % (r, mic), fontsize=10)
        # Hide frame, axes and ticks so only the point cloud is visible.
        ax.set_frame_on(False)
        ax.axes.get_xaxis().set_visible(False)
        ax.axes.get_yaxis().set_visible(False)
        ax.plot(x, y, ',')
        ax.set_xticks([])
        ax.set_yticks([])
        return ax
    
    def rotation(xy, t):
        """Right-multiply the points in *xy* by the 2x2 rotation matrix for *t*.

        The matrix used is ``[[cos t, -sin t], [sin t, cos t]]`` with *t* in
        radians; the product ``xy . matrix`` is returned.
        """
        cos_t = np.cos(t)
        sin_t = np.sin(t)
        matrix = [[cos_t, -sin_t],
                  [sin_t, cos_t]]
        return np.dot(xy, matrix)
    
    def mvnormal(n=1000):
        """Plot panels 1-7 of the 3x7 grid: bivariate normals of varying correlation.

        For each correlation in 1.0, 0.8, 0.4, 0.0, -0.4, -0.8, -1.0 draw *n*
        zero-mean, unit-variance points from the shared generator ``rs`` and
        hand both coordinates to ``mysubplot``.
        """
        for panel, rho in enumerate((1.0, 0.8, 0.4, 0.0, -0.4, -0.8, -1.0), 1):
            sample = rs.multivariate_normal([0, 0], [[1, rho], [rho, 1]], n)
            xs, ys = sample[:, 0], sample[:, 1]
            mysubplot(xs, ys, 3, 7, panel)
    
    def rotnormal(n=1000):
        """Plot panels 8-14 of the 3x7 grid: one sample shown at seven rotations.

        A single set of *n* points is drawn from ``rs`` with covariance
        ``[[1, 1], [1, 1]]``; the same points are then passed through
        ``rotation`` for seven angles between 0 and pi/2 and each result is
        plotted with ``mysubplot``.
        """
        base = rs.multivariate_normal([0, 0], [[1, 1], [1, 1]], n)
        angles = (
            0,
            np.pi/12,
            np.pi/6,
            np.pi/4,
            np.pi/2-np.pi/6,
            np.pi/2-np.pi/12,
            np.pi/2,
        )
        for panel, angle in enumerate(angles, 8):
            turned = rotation(base, angle)
            mysubplot(turned[:, 0], turned[:, 1], 3, 7, panel)
    
    def others(n=1000):
        """Plot panels 15-21 of the 3x7 grid: seven non-linear relationships.

        Each panel builds an (x, y) sample of about *n* points and passes it to
        ``mysubplot`` together with panel-specific axis limits. All random
        draws come from the shared seeded generator ``rs``.

        Note: the ``1/3`` and ``1/8`` expressions below rely on true division
        (Python 3, or ``from __future__ import division`` on Python 2).
        """
        # Panel 15: quartic curve plus uniform noise.
        x = rs.uniform(-1, 1, n)
        y = 4*(x**2-0.5)**2 + rs.uniform(-1, 1, n)/3
        mysubplot(x, y, 3, 7, 15, (-1, 1), (-1/3, 1+1/3))

        # Panel 16: uniform square passed through rotation() with -pi/8.
        y = rs.uniform(-1, 1, n)
        xy = np.concatenate((x.reshape(-1, 1), y.reshape(-1, 1)), axis=1)
        xy = rotation(xy, -np.pi/8)
        lim = np.sqrt(2+np.sqrt(2)) / np.sqrt(2)
        mysubplot(xy[:, 0], xy[:, 1], 3, 7, 16, (-lim, lim), (-lim, lim))

        # Panel 17: the same points rotated by a further -pi/8.
        xy = rotation(xy, -np.pi/8)
        lim = np.sqrt(2)
        mysubplot(xy[:, 0], xy[:, 1], 3, 7, 17, (-lim, lim), (-lim, lim))

        # Panel 18: parabola plus uniform noise.
        y = 2*x**2 + rs.uniform(-1, 1, n)
        mysubplot(x, y, 3, 7, 18, (-1, 1), (-1, 3))

        # Panel 19: noisy parabola with a random sign per point. The expression
        # is parenthesised so it can span two lines, and randint(0, 2) replaces
        # the deprecated random_integers(0, 1) (same inclusive range 0..1).
        y = ((x**2 + rs.uniform(0, 0.5, n)) *
             np.array([-1, 1])[rs.randint(0, 2, size=n)])
        mysubplot(x, y, 3, 7, 19, (-1.5, 1.5), (-1.5, 1.5))

        # Panel 20: noisy circle. y must be computed before x is overwritten.
        y = np.cos(x * np.pi) + rs.uniform(0, 1/8, n)
        x = np.sin(x * np.pi) + rs.uniform(0, 1/8, n)
        mysubplot(x, y, 3, 7, 20, (-1.5, 1.5), (-1.5, 1.5))

        # Panel 21: four unit-covariance clusters of n // 4 points each. They
        # are drawn from ``rs`` (not the unseeded global ``np.random``) so this
        # panel is reproducible like the others.
        quarter = n // 4
        unit_cov = [[1, 0], [0, 1]]
        centres = ([3, 3], [-3, 3], [-3, -3], [3, -3])
        clusters = [rs.multivariate_normal(centre, unit_cov, quarter)
                    for centre in centres]
        xy = np.concatenate(clusters, axis=0)
        mysubplot(xy[:, 0], xy[:, 1], 3, 7, 21, (-7, 7), (-7, 7))
    
    # Build the 3x7 comparison figure on a white background:
    # mvnormal fills panels 1-7, rotnormal panels 8-14, others panels 15-21.
    plt.figure(facecolor='white')
    mvnormal(n=800)
    rotnormal(n=200)
    others(n=800)
    # Tighten the layout, then display the finished figure.
    plt.tight_layout()
    plt.show()
    
    _images/relationships.png
  • 相关阅读:
    JavaScript打印99乘法表
    Python列表推导式玩法
    Python错误重试方法
    pandas + jupyter进行数据处理
    新手小白的爬虫神器-无代码高效爬取数据
    Adb连接模拟器出现版本错误
    Python发送多人邮件报错
    Django入门
    git clone 下载速度解决办法
    Python实现自动刷抖音
  • 原文地址:https://www.cnblogs.com/bonelee/p/9081328.html
Copyright © 2011-2022 走看看