zoukankan      html  css  js  c++  java
  • Python pandas

    1.得到指定行的索引值。

    In [7]:  
    dframe = pd.DataFrame({"A":list("abcde"), "B":list("fghij")}, index=[10,11,12,13,14])
    #dframe
    """
    Out[7]:
        A   B
    10  a   f
    11  b   g
    12  c   h
    13  d   i
    14  e   j
    """
    # 得到指定行的索引值
    dframe.index[2]
    #output: 12
    #删除指定行
    dframe.drop(dframe.index[2])
    """
    Out[99]:
        A  B
    10  a  f
    11  b  g
    13  d  i
    14  e  j
    """

    2.一些作业记录

    Question 1:

    import pandas as pd
    import string
    import re
    energy = pd.read_excel('Energy Indicators.xls', usecols=[2,3,4,5], skiprows=16, skipfooter=38, na_values=['...'])
    energy1 = energy.drop([0])
    
    for col in energy1.columns:
        if col[:7] == 'Unnamed':
            energy1.rename(columns={col:'Country'}, inplace=True)
        if col[-6:] == 'capita':
            energy1.rename(columns={col:col[:-6] + 'Capita'}, inplace=True)
        if col[-10:] == 'Production':
            energy1.rename(columns={col:'% ' + col[:9]}, inplace=True)
    
    #nergy1.reset_index()gy1.set_index('Country')
    energy1 = energy1.set_index('Country')
    #nergy1
    #GDP = pd.read_csv('world_bank.csv', skiprows=4)
    #GDP 
    
    for row in energy1.index:
        if row[:17] == "Republic of Korea":
            energy1.rename(index = {row : "South Korea"}, inplace=True)
        if row[:24] == "United States of America":
            energy1.rename(index = {row : "United States"}, inplace=True)
        if row[:14] == "United Kingdom":
            energy1.rename(index = {row : "United Kingdom"}, inplace=True)
        if row[:16] ==  "China, Hong Kong":
            energy1.rename(index = {row : "Hong Kong"}, inplace=True)
    for row in energy1.index:
        energy1.rename(index = {row : re.sub(u"\(.*?\)","",row)}, inplace=True)
    for row in energy1.index:
        energy1.rename(index = {row : row.rstrip(string.digits)}, inplace=True)
    for row in energy1.index:
        energy1.rename(index = {row : row.rstrip()}, inplace=True)
    
    #====read_csv============
    GDP = pd.read_csv('world_bank.csv', skiprows=4)
    """
    "Korea, Rep.": "South Korea", 
    "Iran, Islamic Rep.": "Iran",
    "Hong Kong SAR, China": "Hong Kong"
    """
    for col in GDP.columns:
        if col == 'Country Name':
            GDP.rename(columns = {col : "Country"},inplace=True)
    GDP = GDP.set_index('Country')
    
    for row in GDP.index:
        if row[:11]== "Korea, Rep." :         
            GDP.rename(index = {row : "South Korea"}, inplace=True)
        if row[:18]=="Iran, Islamic Rep.":
            GDP.rename(index = {row : "Iran"}, inplace=True)
        if row[:9] == 'Hong kong':
            GDP.rename(index = {row : "Hong Kong"}, inplace=True)
    #===========read_excel======
    ScimEn = pd.read_excel('scimagojr-3.xlsx')
    ScimEn = ScimEn.set_index('Country')
    #===========merge==========
    df_merged = pd.merge(energy1, GDP, how='inner', left_index=True, right_index =True)
    df_merged = pd.merge(ScimEn, df_merged, how = 'inner', left_index=True, right_index=True)
    df_merged = df_merged.sort(['Rank'], ascending=True)
    
    df_merged = df_merged[df_merged['Rank'] <= 15]

     Question2:

    def answer_two():
        energy = pd.read_excel('Energy Indicators.xls', usecols=[2,3,4,5], skiprows=16, skipfooter=38, na_values=['...'])
        energy1 = energy.drop([0])
    
        for col in energy1.columns:
            if col[:7] == 'Unnamed':
                energy1.rename(columns={col:'Country'}, inplace=True)
            if col[-6:] == 'capita':
                energy1.rename(columns={col:col[:-6] + 'Capita'}, inplace=True)
            if col[-10:] == 'Production':
                energy1.rename(columns={col:'% ' + col[:9]}, inplace=True)
    
    #nergy1.reset_index()gy1.set_index('Country')
        energy1 = energy1.set_index('Country')
    #nergy1
    #GDP = pd.read_csv('world_bank.csv', skiprows=4)
    #GDP 
        for row in energy1.index:
            if row[:17] == "Republic of Korea":
                energy1.rename(index = {row : "South Korea"}, inplace=True)
            if row[:24] == "United States of America":
                energy1.rename(index = {row : "United States"}, inplace=True)
            if row[:14] == "United Kingdom":
                energy1.rename(index = {row : "United Kingdom"}, inplace=True)
            if row[:16] ==  "China, Hong Kong":
                energy1.rename(index = {row : "Hong Kong"}, inplace=True)
        for row in energy1.index:
            energy1.rename(index = {row : re.sub(u"\(.*?\)","",row)}, inplace=True)
        for row in energy1.index:
            energy1.rename(index = {row : row.rstrip(string.digits)}, inplace=True)
        for row in energy1.index:
            energy1.rename(index = {row : row.rstrip()}, inplace=True)
    
    #====read_csv============
        GDP = pd.read_csv('world_bank.csv', skiprows=4)
    
        for col in GDP.columns:
            if col == 'Country Name':
                GDP.rename(columns = {col : "Country"},inplace=True)
        GDP = GDP.set_index('Country')
    
        for row in GDP.index:
            if row[:11]== "Korea, Rep." :         
                GDP.rename(index = {row : "South Korea"}, inplace=True)
            if row[:18]=="Iran, Islamic Rep.":
                GDP.rename(index = {row : "Iran"}, inplace=True)
            if row[:9] == 'Hong kong':
                GDP.rename(index = {row : "Hong Kong"}, inplace=True)
    #===========read_excel======
        ScimEn = pd.read_excel('scimagojr-3.xlsx')
        ScimEn = ScimEn.set_index('Country')
    #===========merge==========
        df_merged = pd.merge(energy1, GDP, how='outer', left_index=True, right_index =True)
        df_merged = pd.merge(ScimEn, df_merged, how = 'outer', left_index=True, right_index=True)
        
        return len(df_merged.index.unique()) -15
    View Code

    Question 3:

    def answer_three():
        Top15 = answer_one()
        #print(Top15.columns)
        cols = ['Rank', 'Documents', 'Citable documents', 'Citations', 'Self-citations',
           'Citations per document', 'H index', 'Energy Supply',
           'Energy Supply per Capita', '% Renewable', 'Country Code',
           'Indicator Name', 'Indicator Code']
        Top15_new = Top15.drop(cols, axis =1)
        #print(Top15_new.columns)
        Top15_new['avgGDP'] = Top15_new.mean(axis=1)
        Top15_new = Top15_new.sort(['avgGDP'], ascending=False)
        #Top15_new = Top15_new['avgGDP']
        result = pd.Series(Top15_new['avgGDP'])
        return result

    Question 4:

    def answer_four():
        Top15 = answer_one()
        result = (Top15.loc[['United Kingdom']]['2015'] -Top15.loc[['United Kingdom']]['2006'])/(Top15.loc[['United Kingdom']]['2006'])
        return result

    Question 5:

    def answer_five():
        Top15 = answer_one()
        mean = Top15['Energy Supply per Capita'].mean()
        
        return mean
    print(answer_five())

    Question 6:

    def answer_six():
        """
        What country has the maximum % Renewable and what is the     percentage?
    
    This function should return a tuple with the name of the country and the percentage
        """
        Top15 = answer_one()
        Top15 = Top15.reset_index()
        Top = Top15.loc[:, ['Country', '% Renewable']]
        Top = Top.set_index('Country') 
        return (Top.idxmax(), Top.max())
    
    answer_six()
    View Code

    Question 7:

    def answer_seven():
        Top15 = answer_one()
        Top = Top15['Self-citations']
        Top2 = Top15['Citations']
        Top15['ratio'] = Top.div(Top2)
        Top15.reset_index()
        Top_new = Top15.loc[:,['Country', 'ratio']]
        Top_new.set_index('Country')
        return (Top_new.idxmax(), Top_new.max())
    answer_seven()

    Question 8:

    def answer_eight():
        Top15 = answer_one()
        Top15['populations'] = Top15['Energy Supply'].div(Top15['Energy Supply per Capita'])
        Top15 = Top15.sort(['populations'], ascending=False)
        Top15 = Top15.loc[:, ['populations']]
        print(Top15)
        return Top15.index[2]
    answer_eight()

    Question 9: 出现错误: AttributeError: 'float' object has no attribute 'sqrt' 

    #出现错误
    """
    AttributeError: 'float' object has no attribute 'sqrt'
    """
    
    def answer_nine():
        Top15 = answer_one()
        Top15['populations'] = Top15['Energy Supply'].div(Top15['Energy Supply per Capita'])
        Top15['Docs per Capita'] = Top15['Documents'].div(Top15['populations'])
        Top = Top15.loc[:, ['Docs per Capita', 'Energy Supply per Capita']]
        print(Top)
        corr = Top15['Docs per Capita'].corr(Top15['Energy Supply per Capita'])
        return corr
    print(answer_nine())

    修改后:

    def answer_nine():
        Top15 = answer_one()
        Top15['populations'] = Top15['Energy Supply'].div(Top15['Energy Supply per Capita'])
        Top15['Docs per Capita'] = Top15['Documents'].div(Top15['populations'])
        Top = Top15.loc[:, ['Docs per Capita', 'Energy Supply per Capita']]
        print(Top)
    #修改后的方法    
        correlation=Top15['Docs per Capita'].astype('float64').corr(Top15['Energy Supply per Capita'].astype('float64'))
        return correlation
    print(answer_nine())

    Question 10:

    def answer_ten():
        Top15 = answer_one()
        Top15 = Top15.sort(['% Renewable'], ascending = False)
        print(Top15)
        median = Top15['% Renewable'][7]
        Top15['HighRenew'] = Top15['% Renewable']
        Top15['HighRenew'] = Top15['HighRenew'].apply(lambda x : 1 if x >= median else 0)
        Top15 = Top15.sort(['Rank'], ascending=True) 
        print(Top15)
        return Top15['HighRenew']
    answer_ten()

     Question 11:

    import numpy as np
    def answer_eleven():
        Top15 = answer_one()
        ContinentDict  = {'China':'Asia', 
                      'United States':'North America', 
                      'Japan':'Asia', 
                      'United Kingdom':'Europe', 
                      'Russian Federation':'Europe', 
                      'Canada':'North America', 
                      'Germany':'Europe', 
                      'India':'Asia',
                      'France':'Europe', 
                      'South Korea':'Asia', 
                      'Italy':'Europe', 
                      'Spain':'Europe', 
                      'Iran':'Asia',
                      'Australia':'Australia', 
                      'Brazil':'South America'}
        Top15['Continent'] = Top15.index.to_series().map(ContinentDict)
        Top15 = Top15.reset_index()
        # Top15.set_index('Continent')
        Top15['populations'] = Top15['Energy Supply'].div(Top15['Energy Supply per Capita']).astype(float)
       #需要astype,不然用到mean出现numerical的错误
        df = Top15.loc[:, ['Continent', 'populations']]
        #df = df.set_index('Continent')
        #print(df.groupby(level=0)['populations'].agg({'size' : np.size, 'sum': np.sum, 'mean':np.mean, 'std':np.std}))
        df_size = (df.set_index('Continent').groupby(level=0)['populations'].agg({'size' : np.size}))
        df_sum = ((df.set_index('Continent').groupby(level=0)['populations'].agg({'sum' : np.sum})))
        df_mean = ((df.set_index('Continent').groupby(level=0)['populations'].agg({'mean' : np.mean})))
        df_std = ((df.set_index('Continent').groupby(level=0)['populations'].agg({'std' : np.std})))
        df.set_index('Continent')
        df_1 = pd.merge(df_size, df_sum, how='inner', left_index=True, right_index=True)
        df_2 = pd.merge(df_mean, df_std, how='inner', left_index=True, right_index=True)
        df = pd.merge(df_1, df_2, how='inner', left_index=True, right_index=True)
     
        return(df)
    answer_eleven()

    Question 12:

    有agg和cut()的函数。

    def answer_twelve():
        Top15 = answer_one()
        ContinentDict  = {'China':'Asia', 
                      'United States':'North America', 
                      'Japan':'Asia', 
                      'United Kingdom':'Europe', 
                      'Russian Federation':'Europe', 
                      'Canada':'North America', 
                      'Germany':'Europe', 
                      'India':'Asia',
                      'France':'Europe', 
                      'South Korea':'Asia', 
                      'Italy':'Europe', 
                      'Spain':'Europe', 
                      'Iran':'Asia',
                      'Australia':'Australia', 
                      'Brazil':'South America'}
        Top15['Continent'] = Top15.index.to_series().map(ContinentDict)
        Top15 = Top15.reset_index()
        Top15['bins'] = pd.cut(Top15['% Renewable'],5)
       
        df = Top15.loc[:, ['Country', 'Continent','bins' ]]
        print(df.groupby(['Continent','bins'])['Country'].agg({'size' : np.size}))
        result = df.groupby(['Continent','bins'])['Country'].agg({'size' : np.size})
        
        return result
    answer_twelve()

    Question 13:

    为数字添加千位符号:e.g. 317615384.61538464 -> 317,615,384.61538464

    #数字添加千位符号
    
    def answer_thirteen():
        Top15 = answer_one()
        Top15['populations'] = Top15['Energy Supply'].div(Top15['Energy Supply per Capita'])
        Top15['populations'] = Top15['populations'].map(lambda x : format(x, ','))
        Top15 = Top15.reset_index()
        result = Top15.loc[:,['Country', 'populations']]
        result = result.set_index('Country')
        return result
    answer_thirteen()
    The Safest Way to Get what you Want is to Try and Deserve What you Want.
  • 相关阅读:
    pip或easy_install安装库报错:SSL: CERTIFICATE_VERIFY_FAILED
    js 闭包
    php 安装xdebug进行调试(phpstorm)
    Linux下一个最简单的不依赖第三库的的C程序(2)
    Linux下一个最简单的不依赖第三库的的C程序(1)
    windbg .net 程序的死锁检测 常用方法(个人备份笔记)
    自定义经纬度索引(非RTree、Morton Code[z order curve]、Geohash的方式)
    通过经纬度坐标计算距离的方法(经纬度距离计算)
    根据2个经纬度点,计算这2个经纬度点之间的距离(通过经度纬度得到距离)
    The version of SOS does not match the version of CLR you are debugging; SOS.dll版本不匹配; Dump文件不同环境mscordacwks.dll版本问题
  • 原文地址:https://www.cnblogs.com/Shinered/p/9239476.html
Copyright © 2011-2022 走看看