zoukankan      html  css  js  c++  java
  • Pandas_工资集处理

    
    import numpy as np
    import pandas as pd
    from pandas import Series,DataFrame
    
    # 1--读取数据文件
    file_obj=open('Baltimore_City_Employee_Salaries_FY2016.csv')
    salary_df=pd.read_csv(file_obj)
    file_obj.close()
    salary_df.head()
    
    Name JobTitle AgencyID Agency HireDate AnnualSalary GrossPay
    0 Aaron,Patricia G Facilities/Office Services II A03031 OED-Employment Dev (031) 10/24/1979 12:00:00 AM $56705.00 $54135.44
    1 Aaron,Petra L ASSISTANT STATE'S ATTORNEY A29045 States Attorneys Office (045) 09/25/2006 12:00:00 AM $75500.00 $72445.87
    2 Abbey,Emmanuel CONTRACT SERV SPEC II A40001 M-R Info Technology (001) 05/01/2013 12:00:00 AM $60060.00 $59602.58
    3 Abbott-Cole,Michelle Operations Officer III A90005 TRANS-Traffic (005) 11/28/2014 12:00:00 AM $70000.00 $59517.21
    4 Abdal-Rahim,Naim A EMT Firefighter Suppression A64120 Fire Department (120) 03/30/2011 12:00:00 AM $64365.00 $74770.82
    # 2--查看数据情况
    salary_df.shape  # (13818, 7)
    salary_df.describe()
    salary_df.info()  # 可以看到 Grosspay 的个数为 13546,不是13818,有缺失值
    salary_df.isnull().sum() #  可以看到 Grosspay缺失值的个数为 272
    
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 13818 entries, 0 to 13817
    Data columns (total 7 columns):
     #   Column        Non-Null Count  Dtype 
    ---  ------        --------------  ----- 
     0   Name          13818 non-null  object
     1   JobTitle      13818 non-null  object
     2   AgencyID      13818 non-null  object
     3   Agency        13818 non-null  object
     4   HireDate      13818 non-null  object
     5   AnnualSalary  13818 non-null  object
     6   GrossPay      13546 non-null  object
    dtypes: object(7)
    memory usage: 755.8+ KB
    
    
    Name              0
    JobTitle          0
    AgencyID          0
    Agency            0
    HireDate          0
    AnnualSalary      0
    GrossPay        272
    dtype: int64
    
    # 3--数据清理
    # 3-1)删除缺失值
    salary_df=salary_df.dropna(how='any',axis=0)  # 删除存在缺失值的整行数据
    salary_df.isnull().sum()
    
    Name            0
    JobTitle        0
    AgencyID        0
    Agency          0
    HireDate        0
    AnnualSalary    0
    GrossPay        0
    dtype: int64
    
    # 3-2)去除 AnnualSalary,GrossPay 两列中的 $号,并将这两列数据类型转为 float
    salary_df['AnnualSalary']=salary_df['AnnualSalary'].str.strip('$')
    salary_df['GrossPay']=salary_df['GrossPay'].str.strip('$')
    salary_df.head()
    
    Name JobTitle AgencyID Agency HireDate AnnualSalary GrossPay
    0 Aaron,Patricia G Facilities/Office Services II A03031 OED-Employment Dev (031) 10/24/1979 12:00:00 AM 56705.00 54135.44
    1 Aaron,Petra L ASSISTANT STATE'S ATTORNEY A29045 States Attorneys Office (045) 09/25/2006 12:00:00 AM 75500.00 72445.87
    2 Abbey,Emmanuel CONTRACT SERV SPEC II A40001 M-R Info Technology (001) 05/01/2013 12:00:00 AM 60060.00 59602.58
    3 Abbott-Cole,Michelle Operations Officer III A90005 TRANS-Traffic (005) 11/28/2014 12:00:00 AM 70000.00 59517.21
    4 Abdal-Rahim,Naim A EMT Firefighter Suppression A64120 Fire Department (120) 03/30/2011 12:00:00 AM 64365.00 74770.82
    salary_df['GrossPay'].dtype  # dtype('O') 不是 dtype('float')
    # salary_df['AnnualSalary']=salary_df['AnnualSalary'].astype(float)
    # salary_df['GrossPay']=salary_df['GrossPay'].astype(float)  # 直接用下面这句:
    salary_df[['AnnualSalary','GrossPay']]=salary_df[['AnnualSalary','GrossPay']].astype(float)
    salary_df['GrossPay'].dtype   # dtype('float64')
    
    dtype('O')
    
    # 3-3) 新建一列,用于存放入职月份:
    salary_df['month']=salary_df['HireDate'].str.split('/').str[0]  # 或者:
    # salary_df['month']=salary_df['HireDate'].str.split('/').str.get(0)  
    salary_df.head()
    salary_df[['HireDate','month']].head()  # 只查看这两列
    
    HireDate month
    0 10/24/1979 12:00:00 AM 10
    1 09/25/2006 12:00:00 AM 09
    2 05/01/2013 12:00:00 AM 05
    3 11/28/2014 12:00:00 AM 11
    4 03/30/2011 12:00:00 AM 03
    # 4--数据探索
    # 4-1)工资分布:
    salary_df['AnnualSalary'].hist(bins=20)  # 基本呈正态分布,高薪的人员较少,30000美元左右的人最多
    
    <matplotlib.axes._subplots.AxesSubplot at 0x21999d528c8>
    

    [外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-4Qb3Vh1N-1585835263436)(output_6_1.png)]

    # 4-2)入职月份统计:
    month=salary_df['month'].value_counts()
    month.plot(kind='barh')  # 6月份入职人数最多,HR最忙
    
    <matplotlib.axes._subplots.AxesSubplot at 0x219a114ef08>
    

    [外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-bycizMRr-1585835263437)(output_7_1.png)]

    # 5-聚合运算
    # 5-1)计算年薪平均值和职位个数:
    salary_df['AnnualSalary'].mean()  # 53507.98394359959  所有员工的平均年薪
    # 5-2) 计算各职位的年薪平均值 看哪个职位的平均工资高,职位数,及该职位的最高工资
    salary_df.groupby('JobTitle')['AnnualSalary'].mean()
    salary_df.groupby('JobTitle')['AnnualSalary'].agg(['mean','count','max'])
    
    mean count max
    JobTitle
    911 LEAD OPERATOR 49816.750000 4 50162.0
    911 OPERATOR 44829.461538 65 50829.0
    911 OPERATOR SUPERVISOR 57203.500000 4 57579.0
    ACCOUNT EXECUTIVE 57200.000000 4 57200.0
    ACCOUNTANT I 49065.866667 15 57579.0
    ... ... ... ...
    ZONING APPEALS ADVISOR BMZA 53636.000000 1 53636.0
    ZONING APPEALS OFFICER 67800.000000 1 67800.0
    ZONING ENFORCEMENT OFFICER 65800.000000 1 65800.0
    ZONING EXAMINER I 45628.500000 2 48811.0
    ZONING EXAMINER II 56150.000000 1 56150.0

    1034 rows × 3 columns

    # 5-3) 对上述各职位的年薪的平均值进行降序排列
    jobtitle_salary_df=salary_df.groupby('JobTitle')['AnnualSalary'].agg(['mean','count','max'])
    jobtitle_sort_salary_df=jobtitle_salary_df.sort_values(by='mean',ascending=False)[:30]  # 知识点:df.sort_values(by=列名,ascending=False)
    jobtitle_sort_salary_df['mean'].plot(kind='bar')
    
    <matplotlib.axes._subplots.AxesSubplot at 0x219a52aa788>
    

    [外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-tlawTPZu-1585835263439)(output_9_1.png)]

    # 5-4) 对上述各职位的人数进行降序排列
    jobtitle_salary_df=salary_df.groupby('JobTitle')['AnnualSalary'].agg(['mean','count','max'])
    jobtitle_sort_salary_df=jobtitle_salary_df.sort_values(by='count',ascending=False)[:30]  # 知识点:df.sort_values(by=列名,ascending=False)
    jobtitle_sort_salary_df['count'].plot(kind='bar')
    
    <matplotlib.axes._subplots.AxesSubplot at 0x219a6327e48>
    

    [外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-u6352dec-1585835263439)(output_10_1.png)]

    # 我们再看看其他的需求:
    # 从头开始:
    import numpy as np
    import pandas as pd
    from pandas import Series,DataFrame
    
    # 1--读取数据文件
    file_obj=open('Baltimore_City_Employee_Salaries_FY2016.csv')
    salary_df=pd.read_csv(file_obj)
    file_obj.close()
    
    # 2--删除含缺失值的行,
    salary_df=salary_df.dropna(how='any',axis=0)  # 删除存在缺失值的整行数据
    salary_df.isnull().sum()
    
    # 3--去除 AnnualSalary,GrossPay 两列中的 $号,并将这两列数据类型转为 float
    salary_df['AnnualSalary']=salary_df['AnnualSalary'].str.strip('$')
    salary_df['GrossPay']=salary_df['GrossPay'].str.strip('$')
    salary_df[['AnnualSalary','GrossPay']]=salary_df[['AnnualSalary','GrossPay']].astype(float)
    salary_df.head()
    
    # 4--查看 AnnualSalary的最大值及所在的行号:
    salary_df['AnnualSalary'].max()  # 238772.0
    salary_df['AnnualSalary'].argmax()  # 8701  # se.argmax()之前没有学到,这里补充一下
    
    8701
    
    
    
  • 相关阅读:
    Linux 安装 PostgreSQL
    【Linux】ZeroMQ 在 centos下的安装
    Celery
    Django学习之完成数据库主从复制、读写分离和一主多从情况下的使用办法
    python异步编程之asyncio(百万并发)
    Python正则表达式中的re.S,re.M,re.I的作用
    云开发 :云原生(Cloud Native)
    极简Unity调用Android方法
    UnityShader快速上手指南(四)
    UnityShader快速上手指南(三)
  • 原文地址:https://www.cnblogs.com/Collin-pxy/p/13038572.html
Copyright © 2011-2022 走看看