zoukankan      html  css  js  c++  java
  • Pandas Dataframe的一些操作

    读取txt

    # ' 和 " 没区别,路径为  ,./ 表示为当前目录下路径
    df = pd.read_csv("./input/"+year+"ha.txt",skiprows =13,skipfooter=2,header=None,sep=" ")
    #skiprows=[0,2,5]跳过指定行 skiprows=[i for i in range(1,3)]跳过除首行外前n行
    #skipfooter=2 跳过末尾n行
    #skiprows= lambda x: logic(x) 跳过符合条件的行 其中logic为设定的筛选函数,可参照如下设置
    def logic(index):
        if index % 3 == 0:
           return True
        return False

     读取dbf

    import pandas as pd
    from simpledbf import Dbf5 #未下载需下载pip install simpledbf
    dbf = Dbf5(r"D:1DATAdem.dbf")
    df = dbf.to_dataframe()
    df['Id']=df.index
    
    #删除列
    del df[0]
    del df[5]
    del df['name'] df=df.iloc[0:221]#选择前221行 df.columns = ['location_code','Hub','Autho']#给列重命名
    df3.columns=['Eigenvec']
    df3=df3['Eigenvec'].str.split('  ', expand=True)#在某列内部以空格进行分割
    df3['new'] = df3.index#创建新列 和索引列相同
    loca = df3['new'].str.split(' ', expand=True)
    
    result = pd.merge(df1, df2, how='inner', on=['location_code'])#公共列为'location_code' 取交集
    result = pd.merge(result, df3, how='inner', on=['location_code'])
    
    result.to_excel('./output/worldtrade_'+year+'_all.xlsx',sheet_name='Sheet')#导出
    

    编码错误

    df_w = pd.read_table(r'C:Userslablist.txt', encoding='ISO-8859-1')#加r
    #encoding = 'utf-8'
    

    将多个dataframe存到一个excel中不同sheet中:

    from pandas import ExcelWriter
    fields=['degree','weighted degree','Eigenvec','Autho','Hub']
    years=['1992','2004','2009','2015','2018']
    for year in years:
        writer = ExcelWriter('./output/n_'+year+'.xlsx')
        #writer = pd.ExcelWriter('./output/n_1992.xlsx', engine='xlsxwriter')
        for field in fields:
            df = pd.read_excel('./output/'+year+'.xlsx')
            #field='degree'
            df_5=df.sort_values(by=field, ascending=False).head(5)#对某列按照字段降序排列
            df_n=df.loc[df['Label'].isin(['IND','CHN','RUS','BEL','ZAF'])]#选出符合条件的rows
            df_=pd.concat([df_5,df_n])  
            df_=df_.sort_values(by=field, ascending=False)
            df1=df_[['Label',field]]#提取出列
            df1['Index']=df1[field]/df1[field].max()*100
            df2 = df1.drop_duplicates(subset='Label')#除去重复的行 根据Label属性列
            df2.to_excel(writer,sheet_name=field)#sheet的名字和field一致
        writer.save()#注意保存放置的位置 需要都存好后再保存

    根据某一类进行排序,相等值序列相同,同时需要某些固定的值:

    from pandas import ExcelWriter
    fields=['degree','weighted degree','Eigenvec','Autho','Hub']
    years=['1992','2004','2009','2015','2018']
    for year in years:
        writer = ExcelWriter('./output5/sort/n_'+year+'.xlsx')
        #writer = pd.ExcelWriter('./output5/sort/n_1992.xlsx', engine='xlsxwriter')
        for field in fields:
            df = pd.read_excel('./output5/'+year+'.xlsx')
            dfs=df.sort_values(by=field, ascending=False).reset_index(drop=True)#更新index
            dfs['sort_index']=dfs.index+1
            dfs=dfs[['Label',field,'sort_index']]
            for ind in dfs.index:
                if ind > 0:
                    if dfs[field][ind]==dfs[field][ind-1]:
                        dfs['sort_index'][ind]=dfs['sort_index'][ind-1]
                        dfs['sort_index'][ind+1:]=dfs['sort_index'][ind+1:]-1
            df_5=df.sort_values(by=field, ascending=False).head(5)
            df_n=df.loc[df['Label'].isin(['IND','CHN','RUS','BEL','ZAF'])]
            df_=pd.concat([df_5,df_n])  
            df_=df_.sort_values(by=field, ascending=False)
            df1=df_[['Label',field]]
            df1['Index/%']=df1[field]/df1[field].max()*100
            df2 = df1.drop_duplicates(subset='Label')
            df2['sort_index'] = df2['Label'].map(dfs.set_index('Label')['sort_index'])#匹配dfs(多)中的'sort_index',匹配字段为Label https://stackoverflow.com/questions/46789098/create-new-column-in-dataframe-with-match-values-from-other-dataframe
            df2 = df2[[field, 'sort_index', 'Label','Index/%']]#按照想的给列排序导出
            df2['Index/%']=df2['Index/%'].round(decimals=2)#对这一列保留小数点后两位小数
            df2.to_excel(writer,sheet_name=field)
        writer.save()   
    

      

    将一文件夹下所有的xlsx文件转为csv以方便在ArcGIS中使用:

    import pandas as pd
    import glob
    
    #  folder C:Usersadmin
    for filepath in glob.iglob(r'C:Usersadmin*.xlsx'):
        data_xlsx = pd.read_excel(filepath, 'Sheet1', index_col=None)
        data_xlsx.to_csv(filepath[:-5]+'.csv', encoding='utf-8')
        #filepath[:-5]表示文件路径去除后五位,即'.xlsx'后的文件名
        print(filepath)

    其他遍历文件下所有文件的代码可移步参考:https://www.newbedev.com/python/howto/how-to-iterate-over-files-in-a-given-directory/

    按规则选择行

    df.loc[df['column_name'] == some_value]
    
    df.loc[df['column_name'].isin(some_values)]
    df.loc[~df['column_name'].isin(some_values)]
    
    
    df.loc[(df['column_name'] >= A) & (df['column_name'] <= B)]
    

      

     创建dataframe 并增加新的行

    import pandas as pd 
      
    # create an Empty DataFrame 
    # object With column names only 
    df = pd.DataFrame(columns = ['Name', 'Articles', 'Improved']) 
    print(df) 
      
    # append rows to an empty DataFrame 
    df = df.append({'Name' : 'Ankit', 'Articles' : 97, 'Improved' : 2200},  
                    ignore_index = True) 
    df = df.append({'Name' : 'Aishwary', 'Articles' : 30, 'Improved' : 50}, 
                    ignore_index = True) 
    df = df.append({'Name' : 'yash', 'Articles' : 17, 'Improved' : 220}, 
                   ignore_index = True) 
      
    df 
    

      

    统计某列数据各个值出现的频次:

    import pandas as pd
    import glob
    from pandas import ExcelWriter
    
    dft = pd.DataFrame(columns = ['USA', 'CHN', 'JPN','IND','ZAF'])
    fields=['maxtrade_partner_code','maxex_partner_code','maxim_partner_code']
    #  folder C:Usersadmin
    writer = ExcelWriter('G:/out.xlsx')
    
    for field in fields:
        print(field)
        for filepath in glob.iglob(r'D:1Moneyylh工作安排9-1*.xlsx'):#遍历文件夹所有xlsx文件
            print(filepath)
            df = pd.read_excel(filepath, 'Sheet 1', index_col=None)
            idx = pd.Index(df[field]) #索引
            co=pd.DataFrame(idx.value_counts()) #统计频次转为 dataframe
            dft = dft.append({'USA':co.at['USA',field],'CHN' :co.at['CHN',field],'JPN':co.at['JPN',field] ,'IND' :co.at['IND',field],'ZAF':co.at['ZAF',field]},ignore_index = True)        
        dft.to_excel(writer,sheet_name=field)
        dft.drop(dft.index, inplace=True)
    writer.save()
    

    将dataframe中的负值替换为其他

    df[df < 0] = np.nan 
    

      

     简单数据探索

    import seaborn as sns
    import matplotlib.pyplot as plt
    
    f, ax = plt.subplots(figsize=(20, 7))#设置图片尺寸
    
    sns.displot(df['RASTERVALU']);#对这一列数据频率直方图显示
    sns.lineplot(x=df['Id'],y=df['RASTERVALU']);#折线图
    

      

  • 相关阅读:
    MTK 关闭耳机调至最大音量时,提示损伤听力
    MTK LCM的添加
    chmod chown
    ubuntu14.04设置静态IP
    MTK NTP和NITZ更新时间的问题
    Rk3288 双屏异显单触摸
    MTK 修改默认时区
    MTK 屏幕旋转90度
    MTK WIFI底部加入返回按钮
    MTK 自定义按键添加广播
  • 原文地址:https://www.cnblogs.com/icydengyw/p/13582669.html
Copyright © 2011-2022 走看看