zoukankan      html  css  js  c++  java
  • python相关性分析与p值检验

    ## 最近两天的成果

    '''
                ##########################################
                #                                        #
                #           不忘初心  砥砺前行.          #
                #                                418__yj #
                ##########################################
    '''
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    from scipy.stats import pearsonr
    import datetime
    import os
    
    #求原始图像各波段相关系数与P值
    def corr_p(data,spad):
        print('[INFO]处理原始光谱曲线')
        l1=[]
        l2=[]
        col=data.columns
        num=len(data.index)
        index=np.linspace(0,num-1,num)
        data.index=index
        spad.index=index
        for i in col:
            #pearsonr函数返回两个值,分别为相关系数以及P值(显著性水平)
            #l1:相关系数列表,l2:p值列表
            value=pearsonr(spad[spad.columns[0]],data[i])
            l1.append(value[0])
            l2.append(round(value[1],3))
        corr_se=pd.Series(l1,index=col)
        p_se=pd.Series(l2,index=col)
    
        #因为不可避免的存在0.01,0.05水平线不存在,因此依次在附近寻找了+-0.002范围的值
        index_001_list=[0.010,0.011,0.009,0.012,0.008]
        index_005_list=[0.050,0.051,0.049,0.052,0.048]
        index_001=[]
        index_001_01=[]
        index_005=[]
        index_005_01=[]
        for i in index_001_list:
            index_001.append(list(p_se[p_se==i].index.values))
            index_001_01.append(list(p_se[p_se==-i].index.values))
        for i in index_005_list:
            index_005.append(list(p_se[p_se==i].index.values))
            index_005_01.append(list(p_se[p_se==-i].index.values))
        #数据清洗
        index_001=[list(i) for i in index_001 if len(list(i))!=0]
        index_001_01=[list(i) for i in index_001_01 if len(list(i))!=0]
        index_005=[list(i) for i in index_005 if len(list(i))!=0]
        index_005_01=[list(i) for i in index_005_01 if len(list(i))!=0]
        print(index_001,index_005)
        #p=0.01,p=0.05所对应波段的相关系数值
        p_001=corr_se[index_001[0][0]]
        p_005=corr_se[index_005[0][0]]
        #对单个值的横向填充为PD.SERIES
        p_001_data=get_p_value(p_001,corr_se,name='p_001')
        p_005_data=get_p_value(p_005,corr_se,name='p_005')
        corr=pd.concat([corr_se,p_001_data,p_005_data],axis=1)
        idmax=corr_se.idxmax()
        idmin=corr_se.idxmin()
        print('p_001,p_005')
        print(p_001,p_005)
        print('*')
        print('[INFO]idmax:%s,idmin:%s'%(idmax,idmin))
        #绘图
        s='diff.png'
        xticks=np.arange(339,2539,200)
        draw(corr,s,xticks)
        #写入txt文档,0.01,0.05交点用于分析
        with open('./output/corr_original.txt','w') as f:
            f.writelines(u'最大相关系数所对应波段:'+str(idmax)+'
    ')
            f.writelines('相关系数最大值:'+str(corr_se[idmax])+'
    ')
            f.writelines('负相关最大所对应波段:'+str(idmin)+'
    ')
            f.writelines('负相关最大值:'+str(corr_se[idmin])+'
    ')
            f.writelines('0.05水平线与负相关系数曲线交点:'+str(index_005)+'
    ')
            f.writelines('0.05水平线与正相关系数曲线交点:'+str(index_005_01)+'
    ')
            f.writelines('0.01水平线与负相关系数曲线交点:'+str(index_001)+'
    ')
            f.writelines('0.01水平线与正相关系数曲线交点:'+str(index_001_01)+'
    ')
    
    def get_p_value(p_value,corr_se,name):
        #empty=np.zeros_like(corr_se)
        min_corr=corr_se.min()
        max_corr=corr_se.max()
        if min_corr*max_corr>0:
            se=pd.Series(p_value,index=corr_se.index)
            se.name=name
            return se
        else:
            #empty_1=empty.copy()
            #empty_1[:]=p_se
            se_1=pd.Series(p_value,index=corr_se.index)
            se_1.name=name
            #empty_2=empty.copy()
            #empty_2[:]=-p_se
            se_2=pd.Series(-p_value,index=corr_se.index)
            se_2.name=name+'_01'
            return pd.concat([se_1,se_2],axis=1)
    def draw(corr,s,xticks):
        '''
        if corr.columns[0]==338:
            xticks=np.arange(338,2538,200)
            xlim=(338,2538)
        elif corr.columns[0]==339:
            xticks=np.arange(339,2539,200)
            xlim=(339,2539)
        UnboundLocalError:
        '''
        print('corrr.columns:%s'%corr.columns)
    
        style={0:'k','p_001':'k','p_001_01':'k','p_005':'k--','p_005_01':'k--'}
        corr.plot(style=style,xticks=xticks,xlim=(xticks.min(),xticks.max()),figsize=(12,9))
        ax=plt.gca()
        ax.spines['top'].set_color('none')
        ax.spines['right'].set_color('none')
        plt.savefig('./output/'+s)
    
    def diff_corr_p(data,spad):
        print('[INFO]处理一阶导数曲线')
        l1=[]
        l2=[]
        #相比原始图像,一阶导数要先转为np.array做差分处理,再转为dataframe
        array=np.array(data)
        diff_array=np.diff(array,axis=1)
    
        num=len(data.index)
        index=np.linspace(0,num-1,num)
        spad.index=index
        data=pd.DataFrame(diff_array,columns=data.columns[:-1],index=index)
        col=data.columns
        #输出Excel
        #data.to_excel('./output/diff.xlsx')
        for i in col:
            #pearsonr函数返回两个值,分别为相关系数以及P值(显著性水平)
            #l1:相关系数列表,l2:p值列表
            value=pearsonr(spad[spad.columns[0]],data[i])
            l1.append(value[0])
            l2.append(round(value[1],3))
        corr_se=pd.Series(l1,index=col)
        p_se=pd.Series(l2,index=col)
    
        #因为不可避免的存在0.01,0.05水平线不存在,因此依次在附近寻找了+-0.002范围的值
        index_001_list=[0.010,0.011,0.009,0.012,0.008]
        index_005_list=[0.050,0.051,0.049,0.052,0.048]
        index_001=[]
        index_001_01=[]
        index_005=[]
        index_005_01=[]
        for i in index_001_list:
            index_001.append(list(p_se[p_se==i].index.values))
            index_001_01.append(list(p_se[p_se==-i].index.values))
        for i in index_005_list:
            index_005.append(list(p_se[p_se==i].index.values))
            index_005_01.append(list(p_se[p_se==-i].index.values))
        #数据清洗
        index_001=[list(i) for i in index_001 if len(list(i))!=0]
        index_001_01=[list(i) for i in index_001_01 if len(list(i))!=0]
        index_005=[list(i) for i in index_005 if len(list(i))!=0]
        index_005_01=[list(i) for i in index_005_01 if len(list(i))!=0]
        print(index_001,index_005)
        #p=0.01,p=0.05所对应波段的相关系数值
        p_001=corr_se[index_001[0][0]]
        p_005=corr_se[index_005[0][0]]
        #对单个值的横向填充为PD.SERIES
        p_001_data=get_p_value(p_001,corr_se,name='p_001')
        p_005_data=get_p_value(p_005,corr_se,name='p_005')
        corr=pd.concat([corr_se,p_001_data,p_005_data],axis=1)
        idmax=corr_se.idxmax()
        idmin=corr_se.idxmin()
        print('[INFO]idmax:%s,idmin:%s'%(idmax,idmin))
        #绘图
        s='diff.png'
        xticks=np.arange(339,2539,200)
        draw(corr,s,xticks)
        #写入txt文档,0.01,0.05交点用于分析
        with open('./output/corr_diff.txt','w') as f:
            f.writelines(u'最大相关系数所对应波段:'+str(idmax)+'
    ')
            f.writelines('相关系数最大值:'+str(corr_se[idmax])+'
    ')
            f.writelines('负相关最大所对应波段:'+str(idmin)+'
    ')
            f.writelines('负相关最大值:'+str(corr_se[idmin])+'
    ')
            f.writelines('0.05水平线与负相关系数曲线交点:'+str(index_005)+'
    ')
            f.writelines('0.05水平线与正相关系数曲线交点:'+str(index_005_01)+'
    ')
            f.writelines('0.01水平线与负相关系数曲线交点:'+str(index_001)+'
    ')
            f.writelines('0.01水平线与正相关系数曲线交点:'+str(index_001_01)+'
    ')
    def main():
        starttime = datetime.datetime.now()
        print(__doc__)
        print('''该脚本可能会运行几分钟,最终结果会保存在当前目录的output文件夹下,包括以下内容:
              1:经过重采样处理的SVC的.sig文件以EXCEL形式汇总[sig.xlsx]
              2: 所有小区一阶导数[diff.xlsx]
              3:0.05水平,0.01水平的原始图像相关性检验[original.png]
              4:0.05水平,0.01水平的一阶导数光谱相关性检验[diff.png]
              5:原始图像相关性最大波段的及相关系数[corr_original.txt]
              6:一阶导数相关性最大波段的及相关系数[corr_diff.txt]
             说明:本人才疏学浅,对遥感反演原理不甚了解,数据处理中有诸多纰漏,望慎重使用,以免给各位带来不必要的麻烦。
             ''')
        print('[INFO]加载数据集...')
        path_sig='../output/sig.xlsx'
        sig=pd.read_excel(path_sig,'Sheet3')
        path='../spad/spad.xlsx'
        spad=pd.read_excel(path)
    
        if not os.path.exists('./output'):
            os.mkdir('./output')
    
        corr_p(sig.copy(),spad.copy())
        diff_corr_p(sig.copy(),spad.copy())
        endtime = datetime.datetime.now()
        print('-'*80)
        print('程序运行时间:%s s'%((endtime - starttime).seconds))
    
    
    '''
    corr_se[1334]
    Out[28]: -0.16722162390032191
    
    EMPTY_SE_001=np.zeros_like(corr_se)
    EMPTY_SE_001[:]=-0.16722162390032191
    se_01=pd.Series(EMPTY_SE_001,index=index)
    corr=pd.concat([corr_se,se_01,se_05],axis=1)
    '''
    #diff_array=np.diff(sig_array,axis=1)
    '''
    
    diff_corr_se[diff_p_se[diff_p_se==0.01].index]
    Out[100]:
    991    -0.167330
    1071   -0.167740
    1100    0.166970
    1215    0.166174
    1232    0.167815
    1308   -0.166201
    1426   -0.166492
    1709   -0.166323
    1735   -0.167574
    1819   -0.166347
    2094   -0.167152
    2129   -0.167569
    2321    0.167649
    2383    0.167009
    2478    0.167431
    2505    0.166817
    dtype: float64
    '''
    '''
    diff_1.idxmax()
    Out[139]:
    0    759
    1    339
    2    339
    1    339
    2    339
    dtype: int64
    
    diff_1[0][759]
    Out[140]: 0.8388844431717504
    
    diff_1.idxmin()
    Out[141]:
    0    523
    1    339
    2    339
    1    339
    2    339
    dtype: int64
    
    diff_1[0][523]
    Out[142]: -0.7787709002181252
    '''
    main()
    

  • 相关阅读:
    Trojan.DL.Agent.nxd和RootKit.Agent.yj木马清除
    Java中的格式化数值(eg:保留两位小数)
    Int16, Int32, Int64的一点感悟
    在win2003上设置asp网站
    WPF学习笔记.
    对WF工作流异常(Event on interface type for instance id cannot be delivered)的一点总结.
    创建,安装,调试 Windows Service
    灵活而又可怕的params参数数组
    (转) 输入码、区位码、国标码与机内码
    SQL Server 2008 未来将不再包含全文检索功能, 再研究此功能已经没多大意思了.
  • 原文地址:https://www.cnblogs.com/yangjing000/p/9924041.html
Copyright © 2011-2022 走看看