zoukankan      html  css  js  c++  java
  • Python pandas

    1.得到指定行的索引值。

    In [7]:  
    dframe = pd.DataFrame({"A":list("abcde"), "B":list("fghij")}, index=[10,11,12,13,14])
    #dframe
    """
    Out[7]:
        A   B
    10  a   f
    11  b   g
    12  c   h
    13  d   i
    14  e   j
    """
    # 得到指定行的索引值
    dframe.index[2]
    #output: 12
    #删除指定行
    dframe.drop(dframe.index[2])
    """
    Out[99]:
        A  B
    10  a  f
    11  b  g
    13  d  i
    14  e  j
    """

    2.一些作业记录

    Question 1:

    import pandas as pd
    import string
    import re
    energy = pd.read_excel('Energy Indicators.xls', usecols=[2,3,4,5], skiprows=16, skipfooter=38, na_values=['...'])
    energy1 = energy.drop([0])
    
    for col in energy1.columns:
        if col[:7] == 'Unnamed':
            energy1.rename(columns={col:'Country'}, inplace=True)
        if col[-6:] == 'capita':
            energy1.rename(columns={col:col[:-6] + 'Capita'}, inplace=True)
        if col[-10:] == 'Production':
            energy1.rename(columns={col:'% ' + col[:9]}, inplace=True)
    
    #nergy1.reset_index()gy1.set_index('Country')
    energy1 = energy1.set_index('Country')
    #nergy1
    #GDP = pd.read_csv('world_bank.csv', skiprows=4)
    #GDP 
    
    for row in energy1.index:
        if row[:17] == "Republic of Korea":
            energy1.rename(index = {row : "South Korea"}, inplace=True)
        if row[:24] == "United States of America":
            energy1.rename(index = {row : "United States"}, inplace=True)
        if row[:14] == "United Kingdom":
            energy1.rename(index = {row : "United Kingdom"}, inplace=True)
        if row[:16] ==  "China, Hong Kong":
            energy1.rename(index = {row : "Hong Kong"}, inplace=True)
    for row in energy1.index:
        energy1.rename(index = {row : re.sub(u"\(.*?\)","",row)}, inplace=True)
    for row in energy1.index:
        energy1.rename(index = {row : row.rstrip(string.digits)}, inplace=True)
    for row in energy1.index:
        energy1.rename(index = {row : row.rstrip()}, inplace=True)
    
    #====read_csv============
    GDP = pd.read_csv('world_bank.csv', skiprows=4)
    """
    "Korea, Rep.": "South Korea", 
    "Iran, Islamic Rep.": "Iran",
    "Hong Kong SAR, China": "Hong Kong"
    """
    for col in GDP.columns:
        if col == 'Country Name':
            GDP.rename(columns = {col : "Country"},inplace=True)
    GDP = GDP.set_index('Country')
    
    for row in GDP.index:
        if row[:11]== "Korea, Rep." :         
            GDP.rename(index = {row : "South Korea"}, inplace=True)
        if row[:18]=="Iran, Islamic Rep.":
            GDP.rename(index = {row : "Iran"}, inplace=True)
        if row[:9] == 'Hong kong':
            GDP.rename(index = {row : "Hong Kong"}, inplace=True)
    #===========read_excel======
    ScimEn = pd.read_excel('scimagojr-3.xlsx')
    ScimEn = ScimEn.set_index('Country')
    #===========merge==========
    df_merged = pd.merge(energy1, GDP, how='inner', left_index=True, right_index =True)
    df_merged = pd.merge(ScimEn, df_merged, how = 'inner', left_index=True, right_index=True)
    df_merged = df_merged.sort(['Rank'], ascending=True)
    
    df_merged = df_merged[df_merged['Rank'] <= 15]

     Question2:

    def answer_two():
        energy = pd.read_excel('Energy Indicators.xls', usecols=[2,3,4,5], skiprows=16, skipfooter=38, na_values=['...'])
        energy1 = energy.drop([0])
    
        for col in energy1.columns:
            if col[:7] == 'Unnamed':
                energy1.rename(columns={col:'Country'}, inplace=True)
            if col[-6:] == 'capita':
                energy1.rename(columns={col:col[:-6] + 'Capita'}, inplace=True)
            if col[-10:] == 'Production':
                energy1.rename(columns={col:'% ' + col[:9]}, inplace=True)
    
    #nergy1.reset_index()gy1.set_index('Country')
        energy1 = energy1.set_index('Country')
    #nergy1
    #GDP = pd.read_csv('world_bank.csv', skiprows=4)
    #GDP 
        for row in energy1.index:
            if row[:17] == "Republic of Korea":
                energy1.rename(index = {row : "South Korea"}, inplace=True)
            if row[:24] == "United States of America":
                energy1.rename(index = {row : "United States"}, inplace=True)
            if row[:14] == "United Kingdom":
                energy1.rename(index = {row : "United Kingdom"}, inplace=True)
            if row[:16] ==  "China, Hong Kong":
                energy1.rename(index = {row : "Hong Kong"}, inplace=True)
        for row in energy1.index:
            energy1.rename(index = {row : re.sub(u"\(.*?\)","",row)}, inplace=True)
        for row in energy1.index:
            energy1.rename(index = {row : row.rstrip(string.digits)}, inplace=True)
        for row in energy1.index:
            energy1.rename(index = {row : row.rstrip()}, inplace=True)
    
    #====read_csv============
        GDP = pd.read_csv('world_bank.csv', skiprows=4)
    
        for col in GDP.columns:
            if col == 'Country Name':
                GDP.rename(columns = {col : "Country"},inplace=True)
        GDP = GDP.set_index('Country')
    
        for row in GDP.index:
            if row[:11]== "Korea, Rep." :         
                GDP.rename(index = {row : "South Korea"}, inplace=True)
            if row[:18]=="Iran, Islamic Rep.":
                GDP.rename(index = {row : "Iran"}, inplace=True)
            if row[:9] == 'Hong kong':
                GDP.rename(index = {row : "Hong Kong"}, inplace=True)
    #===========read_excel======
        ScimEn = pd.read_excel('scimagojr-3.xlsx')
        ScimEn = ScimEn.set_index('Country')
    #===========merge==========
        df_merged = pd.merge(energy1, GDP, how='outer', left_index=True, right_index =True)
        df_merged = pd.merge(ScimEn, df_merged, how = 'outer', left_index=True, right_index=True)
        
        return len(df_merged.index.unique()) -15
    View Code

    Question 3:

    def answer_three():
        Top15 = answer_one()
        #print(Top15.columns)
        cols = ['Rank', 'Documents', 'Citable documents', 'Citations', 'Self-citations',
           'Citations per document', 'H index', 'Energy Supply',
           'Energy Supply per Capita', '% Renewable', 'Country Code',
           'Indicator Name', 'Indicator Code']
        Top15_new = Top15.drop(cols, axis =1)
        #print(Top15_new.columns)
        Top15_new['avgGDP'] = Top15_new.mean(axis=1)
        Top15_new = Top15_new.sort(['avgGDP'], ascending=False)
        #Top15_new = Top15_new['avgGDP']
        result = pd.Series(Top15_new['avgGDP'])
        return result

    Question 4:

    def answer_four():
        Top15 = answer_one()
        result = (Top15.loc[['United Kingdom']]['2015'] -Top15.loc[['United Kingdom']]['2006'])/(Top15.loc[['United Kingdom']]['2006'])
        return result

    Question 5:

    def answer_five():
        Top15 = answer_one()
        mean = Top15['Energy Supply per Capita'].mean()
        
        return mean
    print(answer_five())

    Question 6:

    def answer_six():
        """
        What country has the maximum % Renewable and what is the     percentage?
    
    This function should return a tuple with the name of the country and the percentage
        """
        Top15 = answer_one()
        Top15 = Top15.reset_index()
        Top = Top15.loc[:, ['Country', '% Renewable']]
        Top = Top.set_index('Country') 
        return (Top.idxmax(), Top.max())
    
    answer_six()
    View Code

    Question 7:

    def answer_seven():
        Top15 = answer_one()
        Top = Top15['Self-citations']
        Top2 = Top15['Citations']
        Top15['ratio'] = Top.div(Top2)
        Top15.reset_index()
        Top_new = Top15.loc[:,['Country', 'ratio']]
        Top_new.set_index('Country')
        return (Top_new.idxmax(), Top_new.max())
    answer_seven()

    Question 8:

    def answer_eight():
        Top15 = answer_one()
        Top15['populations'] = Top15['Energy Supply'].div(Top15['Energy Supply per Capita'])
        Top15 = Top15.sort(['populations'], ascending=False)
        Top15 = Top15.loc[:, ['populations']]
        print(Top15)
        return Top15.index[2]
    answer_eight()

    Question 9: 出现错误: AttributeError: 'float' object has no attribute 'sqrt' 

    #出现错误
    """
    AttributeError: 'float' object has no attribute 'sqrt'
    """
    
    def answer_nine():
        Top15 = answer_one()
        Top15['populations'] = Top15['Energy Supply'].div(Top15['Energy Supply per Capita'])
        Top15['Docs per Capita'] = Top15['Documents'].div(Top15['populations'])
        Top = Top15.loc[:, ['Docs per Capita', 'Energy Supply per Capita']]
        print(Top)
        corr = Top15['Docs per Capita'].corr(Top15['Energy Supply per Capita'])
        return corr
    print(answer_nine())

    修改后:

    def answer_nine():
        Top15 = answer_one()
        Top15['populations'] = Top15['Energy Supply'].div(Top15['Energy Supply per Capita'])
        Top15['Docs per Capita'] = Top15['Documents'].div(Top15['populations'])
        Top = Top15.loc[:, ['Docs per Capita', 'Energy Supply per Capita']]
        print(Top)
    #修改后的方法    
        correlation=Top15['Docs per Capita'].astype('float64').corr(Top15['Energy Supply per Capita'].astype('float64'))
        return correlation
    print(answer_nine())

    Question 10:

    def answer_ten():
        Top15 = answer_one()
        Top15 = Top15.sort(['% Renewable'], ascending = False)
        print(Top15)
        median = Top15['% Renewable'][7]
        Top15['HighRenew'] = Top15['% Renewable']
        Top15['HighRenew'] = Top15['HighRenew'].apply(lambda x : 1 if x >= median else 0)
        Top15 = Top15.sort(['Rank'], ascending=True) 
        print(Top15)
        return Top15['HighRenew']
    answer_ten()

     Question 11:

    import numpy as np
    def answer_eleven():
        Top15 = answer_one()
        ContinentDict  = {'China':'Asia', 
                      'United States':'North America', 
                      'Japan':'Asia', 
                      'United Kingdom':'Europe', 
                      'Russian Federation':'Europe', 
                      'Canada':'North America', 
                      'Germany':'Europe', 
                      'India':'Asia',
                      'France':'Europe', 
                      'South Korea':'Asia', 
                      'Italy':'Europe', 
                      'Spain':'Europe', 
                      'Iran':'Asia',
                      'Australia':'Australia', 
                      'Brazil':'South America'}
        Top15['Continent'] = Top15.index.to_series().map(ContinentDict)
        Top15 = Top15.reset_index()
        # Top15.set_index('Continent')
        Top15['populations'] = Top15['Energy Supply'].div(Top15['Energy Supply per Capita']).astype(float)
       #需要astype,不然用到mean出现numerical的错误
        df = Top15.loc[:, ['Continent', 'populations']]
        #df = df.set_index('Continent')
        #print(df.groupby(level=0)['populations'].agg({'size' : np.size, 'sum': np.sum, 'mean':np.mean, 'std':np.std}))
        df_size = (df.set_index('Continent').groupby(level=0)['populations'].agg({'size' : np.size}))
        df_sum = ((df.set_index('Continent').groupby(level=0)['populations'].agg({'sum' : np.sum})))
        df_mean = ((df.set_index('Continent').groupby(level=0)['populations'].agg({'mean' : np.mean})))
        df_std = ((df.set_index('Continent').groupby(level=0)['populations'].agg({'std' : np.std})))
        df.set_index('Continent')
        df_1 = pd.merge(df_size, df_sum, how='inner', left_index=True, right_index=True)
        df_2 = pd.merge(df_mean, df_std, how='inner', left_index=True, right_index=True)
        df = pd.merge(df_1, df_2, how='inner', left_index=True, right_index=True)
     
        return(df)
    answer_eleven()

    Question 12:

    有agg和cut()的函数。

    def answer_twelve():
        Top15 = answer_one()
        ContinentDict  = {'China':'Asia', 
                      'United States':'North America', 
                      'Japan':'Asia', 
                      'United Kingdom':'Europe', 
                      'Russian Federation':'Europe', 
                      'Canada':'North America', 
                      'Germany':'Europe', 
                      'India':'Asia',
                      'France':'Europe', 
                      'South Korea':'Asia', 
                      'Italy':'Europe', 
                      'Spain':'Europe', 
                      'Iran':'Asia',
                      'Australia':'Australia', 
                      'Brazil':'South America'}
        Top15['Continent'] = Top15.index.to_series().map(ContinentDict)
        Top15 = Top15.reset_index()
        Top15['bins'] = pd.cut(Top15['% Renewable'],5)
       
        df = Top15.loc[:, ['Country', 'Continent','bins' ]]
        print(df.groupby(['Continent','bins'])['Country'].agg({'size' : np.size}))
        result = df.groupby(['Continent','bins'])['Country'].agg({'size' : np.size})
        
        return result
    answer_twelve()

    Question 13:

    为数字添加千位符号:e.g. 317615384.61538464 -> 317,615,384.61538464

    #数字添加千位符号
    
    def answer_thirteen():
        Top15 = answer_one()
        Top15['populations'] = Top15['Energy Supply'].div(Top15['Energy Supply per Capita'])
        Top15['populations'] = Top15['populations'].map(lambda x : format(x, ','))
        Top15 = Top15.reset_index()
        result = Top15.loc[:,['Country', 'populations']]
        result = result.set_index('Country')
        return result
    answer_thirteen()
    The Safest Way to Get what you Want is to Try and Deserve What you Want.
  • 相关阅读:
    Laravel 初始化
    ant design pro 左上角 logo 修改
    请求到服务端后是怎么处理的
    Websocket 知识点
    王道数据结构 (7) KMP 算法
    王道数据结构 (6) 简单的模式匹配算法
    王道数据结构 (4) 单链表 删除节点
    王道数据结构 (3) 单链表 插入节点
    王道数据结构 (2) 单链表 尾插法
    王道数据结构 (1) 单链表 头插法
  • 原文地址:https://www.cnblogs.com/Shinered/p/9239476.html
Copyright © 2011-2022 走看看