zoukankan      html  css  js  c++  java
  • 愚人节作业

    1.X轴的分割,用了cut,groupby函数,这个cut可以做成任意多份,得到近似的函数分布
    2.作plot(x,y)与hist直方图,可以看到图像还是很接近的
    3.用KL散度刻画两个分布的差异,并以list形式输出
    4.对list中按从大到小排列,可以得到feature的差异性,也就是统计量角度的重要性排序
    优点:对于之后给定的一组特征,不管多大,都可以得到他们的重要性排序
    改进:刚装的mysqldb,要学习如何用python直接操作mysql
    # -*- coding: utf-8 -*-
    from __future__ import division
    import pandas as pd
    import math
    import matplotlib.pyplot as plt
    import numpy as np
    import MySQLdb
    # conn=MySQLdb.connect('localhost','root','','qwe')
    # cur = conn.cursor()
    #
    # aa=cur.execute("select * from unpaid")
    # print aa
    # info = cur.fetchmany(aa)[0]
    # # for ii in info:
    # # print ii
    # cur.close()
    # conn.commit()
    # conn.close()

    # Load the three input tables from CSV.
    # The Windows path separators had been lost from these literals
    # ('C:UsersjiejiaoDesktop...'); raw strings keep the backslashes verbatim.
    # NOTE(review): paths are machine-specific -- adjust to the local data folder.
    passed = pd.read_csv(r'C:\Users\jiejiao\Desktop\pass.csv')
    unpaid = pd.read_csv(r'C:\Users\jiejiao\Desktop\unpaid.csv')
    paid = pd.read_csv(r'C:\Users\jiejiao\Desktop\paid.csv')

    # Candidate feature columns, derived from the already-loaded `paid` table
    # instead of reading paid.csv a second time. dropna()/drop() return new
    # frames, so `paid` itself is left untouched.
    feature = paid.dropna(how='all', axis=1)  # discard columns that are entirely NaN
    # Remove the listed non-feature / excluded columns
    # (presumably id and label fields -- TODO confirm why f50 and f7 are excluded).
    feature = feature.drop(['uid', 'status', 'f50', 'f7'], axis=1)

    # Partition the candidate columns by how many distinct non-null values each
    # one takes in `paid`: more than five goes to columns2 (later binned),
    # five or fewer goes to columns3 (later treated as categories).
    columns1 = feature.columns
    distinct_counts = [paid[col].nunique() for col in columns1]
    columns2 = [col for col, n_distinct in zip(columns1, distinct_counts)
                if n_distinct > 5]
    columns3 = [col for col, n_distinct in zip(columns1, distinct_counts)
                if n_distinct <= 5]
    # Single-argument print calls produce the same text on Python 2 and 3.
    print('columns2:' + ' ' + str(columns2))
    print('columns3:' + ' ' + str(columns3))
    N_BINS = 1000           # number of equal-width bins spanning both samples
    SMOOTHING = 10 ** (-6)  # added to every bin so log(P / Q) is always defined


    def asymetricKL(P, Q):
        """Return the one-directional divergence sum(P * log(P / Q)).

        P and Q are array-likes with the same number of elements; any shape is
        flattened first. All entries must be strictly positive, which the
        callers guarantee by adding SMOOTHING.
        """
        P = np.asarray(P, dtype=float).ravel()
        Q = np.asarray(Q, dtype=float).ravel()
        return np.sum(P * np.log(P / Q))


    def _unit_scale(values):
        """Fill NaNs with the column mean, centre on the mean, scale to unit L2 norm."""
        filled = values.fillna(values.mean())
        centred = filled - filled.mean()
        return centred / np.sqrt(np.sum(centred ** 2))


    def _bin_probabilities(values, edges):
        """Bin `values` on `edges`; return (bin labels, smoothed bin frequencies).

        include_lowest=True keeps the smallest observation in the first bin;
        with pd.cut's default it falls outside (edges[0], edges[1]] and is
        silently dropped from the counts.
        """
        n_bins = len(edges) - 1
        groups = pd.cut(values, edges, labels=list(range(n_bins)),
                        include_lowest=True)
        codes = np.asarray(groups.cat.codes)  # -1 marks values outside all bins
        # bincount with minlength keeps empty bins, independent of how a
        # particular pandas version treats unobserved categories in groupby.
        counts = np.bincount(codes[codes >= 0], minlength=n_bins)
        return groups, counts / float(counts.sum()) + SMOOTHING


    feature1 = []
    feature2 = []
    for name in columns2:
        # NOTE(review): each sample is centred and scaled with its own statistics,
        # so location/scale differences between paid and unpaid are removed
        # before the distributions are compared -- confirm this is intended.
        unpaid[name] = _unit_scale(unpaid[name])
        paid[name] = _unit_scale(paid[name])

        # Shared bin edges over the combined range. linspace ends exactly on the
        # maximum, so the largest observation cannot slip past the last edge
        # through floating-point rounding of m + i * d.
        lo = min(unpaid[name].min(), paid[name].min())
        hi = max(unpaid[name].max(), paid[name].max())
        cutpoint = np.linspace(lo, hi, N_BINS + 1)

        paid['numgroup'], Np = _bin_probabilities(paid[name], cutpoint)
        unpaid['numgroup'], Nu = _bin_probabilities(unpaid[name], cutpoint)

        # Optional visual check (disabled): binned frequencies vs. histograms.
        # plt.subplot(221)
        # plt.title(name)
        # plt.plot(range(N_BINS), Nu, color='g')
        # plt.subplot(222)
        # plt.title('paid')
        # plt.plot(range(N_BINS), Np, color='b')
        # plt.subplot(223)
        # plt.title(name)
        # unpaid[name].hist(normed=True, color='k', alpha=0.5, bins=50)
        # plt.subplot(224)
        # plt.title('paid')
        # paid[name].hist(normed=True, color='b', alpha=0.5, bins=44)
        # plt.show()

        # Symmetrised score: the mean of the two one-directional divergences.
        tt = (asymetricKL(Nu, Np) + asymetricKL(Np, Nu)) / 2.0
        feature1.append(tt)
    print(feature1)
    # One score per binned feature, indexed by feature name.
    feature1_diff = pd.DataFrame(feature1, index=columns2)
    #
    SMOOTHING = 10 ** (-6)  # added to every category so log(P / Q) is always defined


    def asymetricKL(P, Q):
        """Return the one-directional divergence sum(P * log(P / Q)).

        P and Q are array-likes with the same number of elements; any shape is
        flattened first. All entries must be strictly positive, which the
        callers guarantee by adding SMOOTHING.
        """
        P = np.asarray(P, dtype=float).ravel()
        Q = np.asarray(Q, dtype=float).ravel()
        return np.sum(P * np.log(P / Q))


    def _category_probabilities(values, categories):
        """Return smoothed relative frequencies of `values` over `categories`.

        Categories absent from `values` get frequency 0 before smoothing, and
        values outside `categories` are ignored. If nothing matches at all the
        result is just the smoothing constant for every category, which avoids
        a 0/0 division.
        """
        counts = values.value_counts().reindex(categories).fillna(0)
        counts = np.asarray(counts, dtype=float)
        total = counts.sum()
        if total > 0:
            counts = counts / total
        return counts + SMOOTHING


    for name in columns3:
        # Category universe: every non-null value the column takes in `passed`.
        idx = list(passed[name].dropna().unique())

        # Count each sample by ITS OWN values. The previous code grouped
        # paid/unpaid rows by passed[name], i.e. by the value found at the same
        # row number of a different table. The counts are also normalised to
        # frequencies here, as is done for the binned features, so the score
        # does not scale with sample size.
        Np = _category_probabilities(paid[name], idx)
        Nu = _category_probabilities(unpaid[name], idx)

        # Optional visual check (disabled).
        # plt.subplot(223)
        # plt.title(name)
        # unpaid[name].hist(normed=True, color='k', alpha=0.5, bins=50)
        # plt.subplot(224)
        # plt.title('paid')
        # paid[name].hist(normed=True, color='b', alpha=0.5, bins=44)
        # plt.show()

        # Symmetrised score: the mean of the two one-directional divergences.
        tt = (asymetricKL(Nu, Np) + asymetricKL(Np, Nu)) / 2.0
        feature2.append(tt)
    # One score per categorical feature, indexed by feature name.
    feature2_diff = pd.DataFrame(feature2, index=columns3)
    print(feature2_diff)
     
  • 相关阅读:
    注解
    es
    集合collection-map-list-set
    spring boot Configuration Annotation Proessor not found in classpath
    mvn
    linux_elasticsearch_jdk_ssh
    Floyd算法学习
    同一个job,不同shell之间传递参数
    jenkins post build tasks插件中log text参数的使用说明
    一个强大的jenkins 批量修改job的插件Configuration Slicing
  • 原文地址:https://www.cnblogs.com/jojo123/p/6656111.html
Copyright © 2011-2022 走看看