zoukankan      html  css  js  c++  java
  • 数据分析——pandas

    简介

    1 import pandas as pd
    2 
    3 # 在数据挖掘前一个数据分析、筛选、清理的多功能工具
    4 '''
    5 pandas 可以读入excel、csv等文件;可以创建Series序列,DataFrame表格,日期数组data_range
    6 '''

    数据类型

     1 # 将excel文件,csv文件读取并转换为pandas的DataFrame
     2 # df_score = pd.read_csv()
     3 df_score = pd.read_excel('./score.xlsx')
     4 # df_score.values   #数据
     5 # df_score.columns  #列名
     6 # print df_score.describe() #计算表的各项数据,count,mean,std,中位数等
     7 
     8 # 创建一个默认索引从0开始的Series
     9 s = pd.Series([1, 2, 3, 4, 5, 6])
    10 # 创建自定义索引的数组,索引由index指定,和前面数组依次对应
    11 s = pd.Series([1, 2, 3, 4, 5, 6], index=['a', 'b', 'c', 'd', 'e', 'f'], dtype=int)
    12 # 使用字典创建一个DataFrame,字典的Key会自动成为列名,一个Key默认对应一列数据
    13 df1 = pd.DataFrame({'math': [1, 2, 3, 4, 5], 'physic': [5, 6, 7, 8, 9]}, index=['a', 'b', 'c', 'd', 'e'])
    14 '''
    15 # df1.values  数据
    16 # df1.head(2)   前两行数据
    17 # df1.tail(2)   最后两行数据
    18 # df1.index     索引
    19 # df1.columns   列名
    20 '''
    21 # 生成从20180101开始的时间序列,peroids是增加量,默认增加单位是天D,H小时,s秒
    22 dates = pd.date_range('20180101', periods=10, freq='D')
    23 # 创建使用时间索引的Series
    24 # s = pd.Series(range(10),index=dates)
    25 # 取出指定间隔的行数据
    26 # s['2018-01-01':'2018-01-05']
    27 # print dates

    票房分析

     1 df_imdb = pd.read_csv('./IMDB.csv')
     2 
     3 # print df_imdb
     4 # print df_imdb.columns
     5 # df_imdb['Title'].head(5)  #选出Title列的前五行
     6 # df_imdb['Title'].tail(3)
     7 # df_imdb.Title.head(3)     #同[]的形式
     8 # df_imdb['Revenue (Millions)'].max()   #最大票房
     9 # df_imdb['Revenue (Millions)'].idxmax()    #最大票房的索引
    10 # df_imdb[50:51]
    11 # df_imdb[50:51]['Title']
    12 # df_imdb[50:51]['Revenue (Millions)']  #取出50行,不包括51行
    13 # 取出50-56行,收尾都包含,第一维度是行,第二维度是列
    14 # df_imdb.loc[50:56,['Director','Year']]
    15 # df_imdb[50:56].loc[:,'Director','Year']
    16 # 取出1-5行(不包含第5行),2-4(不包含第4列)列的数据,使用整数索引操作,与numpy用法类似
    17 # df_imdb.iloc[1:5,2:4]
    18 # 统计Director列中不同导演出现的次数
    19 # df_imdb['Director'].value_counts()
    20 # 将票房大于5亿美元的电影选出来
    21 # df_imdb[df_imdb['Revenue (Millions)']>500].Director
    22 # df_imdb[df_imdb['Revenue (Millions)']>700]['Title']
    23 # 将电影风格描述中含有Sci-Fi(科幻) 关键字的找出
    24 # df_imdb[df_imdb['Genre'].str.contains('Sci-Fi')]
    25 
    26 # 将缺失数据(NaN)填充为0,也可以自己根据项目需求指定其他数据
    27 # df_score.fillna(0)
    28 # 将缺失数据的行移除(默认操作,可以使用axis=1指定删除列df_score.dropna(axis=1))
    29 # 0删除行,1删除列
    30 # df_score.dropna()
    31 # 在DataFram中增加一列平均值avg,计算当前DataFram中每行的平均值作为avg的数据
    32 # 前后赋值数据的行数要对应,axis=1表示按行计算,axis=0(默认值),表示按列计算
    33 # df_score['avg'] = df_score.mean(axis=1)
    34 # 按照性别分组并求和指定成绩
    35 # df_score.iloc[:,4:7].groupby(u'性别').sum()
    36 # df_score.loc[:,[u'音乐',u'性别']].groupby(u'性别').sum()
    37 # 按照男女分组并绘图,bar柱状图,pie饼状图
    38 # df_score[u'性别'].value_counts().plot(kind='bar')
    39 # df_score[u'性别'].value_counts().plot(kind='pie')
    40 # & 数学大于80且化学大于60
    41 # df_score[(df_score[u'数学']>80) &(df_score[u'化学']>60) ]
    42 
    43 # 使用lambda,配合apply方法将日期中的指定年份或月份等提取出来
    44 # apply函数会将lambda一次作用到数据集的每个元素
    45 # datas = pd.Series(['20190901','20190902','20190903'])
    46 # datas.apply(lambda x:x[0:4])
    47 # datas.apply(lambda x:x[4:6])
    48 
    49 # 创建一个数据的副本
    50 # df_copy = df.copy()
    51 # df_copy['R_Sum'] = df['SibSp']+df['Parch']
    52 
    53 # 计算数学列的总和、平均值等,里面的字符串必须有同名函数
    54 # df[u'数学'].agg(['sum','mean','max','std'])
    55 
    56 # pandas(Series、DataFrame)类型转换为numpy(array)类型
    57 # df[u'数学'].values
    58 # df.loc[:,[u'数学',u'化学']].values
    59 
    60 # 按照指定列的值排序,可指定正序倒序,默认正序
    61 # df[u'数学'].sort_values()
    62 # 按照索引排序
    63 # df[u'数学'].sort_index()
    64 # df[u'数学'].sort_values(ascending=False)
    65 # 添加新列sum,值为每行总和,并倒序排列
    66 # df['sum'] = df.sum(axis=1)
    67 # df[u'sum'].sort_values(ascending=False)
    68 
    69 
    70 # 取出Embarked,Survived字段,按照两个字段顺序做层次分组,然后做计算总和
    71 # r = df.loc[:,['Embarked','Survived']].groupby(['Embarked','Survived']).size()
    72 # r.C
    73 # r.C[1]
    74 # r.Q
    75 # r.Q[0]
    76 # r.Q[1]
    77 # r1 = df.loc[:,['Embarked','Survived']].groupby(['Survived','Embarked']).size()
    78 # r2 = df.loc[:,['Embarked','Survived']].groupby('Embarked').size()
    79 # r3 = df.loc[:,['Embarked','Survived']].groupby('Survived').size()

    运行结果

    """
    上面的运行结果 r
    Embarked  Survived
    C         0            75
              1            93
    Q         0            47
              1            30
    S         0           427
              1           217
    dtype: int64
    
    r.C结果
    Survived
    0    75
    1    93
    dtype: int64
    
    r.C[1]结果
    93
    
    r1结果
    Survived  Embarked
    0         C            75
              Q            47
              S           427
    1         C            93
              Q            30
              S           217
    dtype: int64
    
    r2结果
    Embarked
    C    168
    Q     77
    S    644
    dtype: int64
    
    r3结果
    Survived
    0    549
    1    342
    dtype: int64
    """
    

      

    标注:

    '''
    1.axis转换行列
    2.DataFrame筛选一行或一列时会转化为Series类型,可以直接后面加[数字]直接进行选择,但Series不能使用DataFrame的方法(groupby等)
    3.筛选出来的数据的索引仍是原索引,不会重新排列新索引
    '''
    

      

    统计拍片数前10的某导演,指导电影的总票房

     1 def piaofang():
     2     director10 = df_imdb['Director'].value_counts().head(10)
     3     # print director10.index[0]
     4     revenues = 0
     5     for d in director10.index:
     6         print df_imdb[df_imdb['Director'] == d]['Revenue (Millions)'].sum()
     7 
     8 # piaofang()
     9 
    10 # df_imdb[df_imdb['Director']=='']['Revenue (Millions)'].sum()

    票房分析

    特征

    '''
    PassengerId:乘客的唯一标志
    Survived:1获救,0死亡
    Pclass:座舱等级 3最好,1最差
    Name,Sex,Age,
    SibSp:船上有没有兄弟姐妹
    Parch:父母等直系亲属是否在船上
    Ticket,
    Fare:票价或消费
    Cabin:座舱号
    Embarked:从哪个港口登船
    891
    '''
    

      

    导入类库

    1 import numpy as np
    2 import matplotlib.pyplot as pt
    3 import pandas as pd

    准备数据

    1 titanic = pd.read_csv('./Titanic.csv')
    2 
    3 titanic.fillna(int(titanic[u'Age'].mean()))

    测试代码

     1 # print titanic['Age']
     2 
     3 # print titanic[u'Age'].mean()
     4 # print titanic.loc[:,u'Survived'].value_counts()     #存活比例
     5 # print titanic.loc[:,u'Survived'].count()            #总人数
     6 
     7 # print titanic.loc[:, u'Sex'].value_counts()             #男女分类
     8 # print titanic[titanic[u'Sex'] == u'male']['Survived'].value_counts()    #男性生死分类
     9 
    10 # print titanic.columns
    11 # print titanic[titanic[u'Age'] <= 18][u'Survived'].value_counts()
    12 # print titanic[(titanic[u'Age'] > 18) & (titanic[u'Age'] < 60)][u'Survived'].value_counts()
    13 # print titanic[titanic[u'Age'] >= 60][u'Survived'].value_counts()
    14 
    15 # print titanic[u'Fare']
    16 # print titanic[u'Fare'].max()          #贫富差距
    17 # print titanic[u'Fare'].min()
    18 
    19 # print titanic[u'Pclass'].value_counts()
    20 # print titanic[u'Pclass'].value_counts()[1]
    21 # print titanic[u'Pclass'].value_counts()[3]              #座舱
    22 # print titanic[titanic[u'Pclass'] == 1]['Survived'].value_counts()
    23 # print titanic[titanic[u'Pclass'] == 3]['Survived'].value_counts()
    24 
    25 # print titanic[u'SibSp'].value_counts()
    26 # print titanic[u'Parch'].value_counts()
    27 
    28 # print titanic[u'Embarked'].value_counts()

    案例源码

      1 class Titanic(object):
      2     def __init__(self):
      3         self.data = titanic
      4 
      5     # 1.存活率是多少
      6     def rate_survive(self):
      7         survived = self.data.loc[:, 'Survived'].value_counts()[1]
      8         death = self.data.loc[:, 'Survived'].value_counts()[0]
      9         rate = float(survived) / (float(death) + float(survived))
     10         print '总人数:{},存活人数:{},死亡人数:{}'.format(survived + death, survived, death)
     11         return u'存活率:' + '%.2f' % rate
     12 
     13     # 2.哪个年龄段存活率最高
     14     def max_survive(self):
     15         age18_survived = self.data[self.data[u'Age'] <= 18][u'Survived'].value_counts()[1]
     16         age18_death = self.data[self.data[u'Age'] <= 18][u'Survived'].value_counts()[0]
     17         age18_rate = float(age18_survived) / (float(age18_survived) + float(age18_death))
     18 
     19         age1860_survived = self.data[(self.data[u'Age'] > 18) & (self.data[u'Age'] < 60)][u'Survived'].value_counts()[1]
     20         age1860_death = self.data[(self.data[u'Age'] > 18) & (self.data[u'Age'] < 60)][u'Survived'].value_counts()[0]
     21         age1860_rate = float(age1860_survived) / (float(age1860_survived) + float(age1860_death))
     22 
     23         age60_survived = self.data[self.data[u'Age'] >= 60][u'Survived'].value_counts()[1]
     24         age60_death = self.data[self.data[u'Age'] >= 60][u'Survived'].value_counts()[0]
     25         age60_rate = float(age60_survived) / (float(age60_survived) + float(age60_death))
     26 
     27         rate = [age18_rate, age60_rate, age1860_rate]
     28         age_data = ['18岁以下', '18-60岁', '60岁以上']
     29         max_rate = max(rate)
     30         age_range = age_data[rate.index(max(rate))]
     31         return '存活率最高的年龄段是{},存活率为{}'.format(age_range, max_rate)
     32 
     33     # 3.女性存活率是否高于男性
     34     def than_survive(self):
     35         male_survied = self.data[self.data[u'Sex'] == u'male'][u'Survived'].value_counts()[1]
     36         male_death = self.data[self.data[u'Sex'] == u'male'][u'Survived'].value_counts()[0]
     37         rate_male = float(male_survied) / (float(male_survied) + float(male_death))
     38         print '男性共有{}人,存活{}人,死亡{}人'.format(male_death + male_survied, male_survied, male_death)
     39         female_survied = self.data[self.data[u'Sex'] == u'female'][u'Survived'].value_counts()[1]
     40         female_death = self.data[self.data[u'Sex'] == u'female'][u'Survived'].value_counts()[0]
     41         rate_female = float(female_survied) / (float(female_survied) + float(female_death))
     42         print '女性共有{}人,存活{}人,死亡{}人'.format(female_death + female_survied, female_survied, female_death)
     43         if rate_male > rate_female:
     44             return u'男性存活率更高,存活率为:%.2f' % rate_male
     45         else:
     46             return u'女性存活率更高,存活率为:%.2f' % rate_female
     47 
     48     # 4.船上是否出现贫富差距
     49     def poor_wealth(self):
     50         max_wealth = self.data[u'Fare'].max()
     51         max_poor = self.data[u'Fare'].min()
     52         if max_wealth - max_poor > 500:
     53             return '船上乘客最多消费了{},最少消费了{},存在贫富差距'.format(max_wealth, max_poor)
     54         else:
     55             return '船上乘客最多消费了{},最少消费了{},不存在贫富差距'.format(max_wealth, max_poor)
     56 
     57     # 5.头等舱乘客的存活率是否高于经济舱
     58     def pclass_survive(self):
     59         pclass1_survived = self.data[self.data[u'Pclass'] == 1]['Survived'].value_counts()[1]
     60         pclass1_death = self.data[self.data[u'Pclass'] == 1]['Survived'].value_counts()[0]
     61         pclass1_rate = float(pclass1_survived) / (float(pclass1_survived) + float(pclass1_death))
     62 
     63         pclass3_survived = self.data[self.data[u'Pclass'] == 3]['Survived'].value_counts()[1]
     64         pclass3_death = self.data[self.data[u'Pclass'] == 3]['Survived'].value_counts()[0]
     65         pclass3_rate = float(pclass3_survived) / (float(pclass3_survived) + float(pclass3_death))
     66 
     67         if pclass3_rate > pclass1_rate:
     68             return '头等舱乘客存活率更高,存活率为{}'.format(pclass3_rate)
     69         else:
     70             return '经济舱乘客存活率更高,存活率为{}'.format(pclass1_rate)
     71 
     72     # 6.有亲属在船上乘客比率,有亲属是否会影响存活率
     73     def family_survive(self):
     74         has_family = self.data[(self.data[u'Parch'] != 0) | (self.data[u'SibSp'] != 0)][u'PassengerId'].count()
     75         no_family = self.data[(self.data[u'Parch'] == 0) & (self.data[u'SibSp'] == 0)][u'PassengerId'].count()
     76         rate_family = float(has_family) / (float(has_family) + float(no_family))
     77 
     78         has_family_survived = 
     79             self.data[(self.data[u'Parch'] != 0) | (self.data[u'SibSp'] != 0)][u'Survived'].value_counts()[1]
     80         has_family_death = 
     81             self.data[(self.data[u'Parch'] != 0) | (self.data[u'SibSp'] != 0)][u'Survived'].value_counts()[0]
     82         has_family_rate = float(has_family_survived) / (float(has_family_survived) + float(has_family_death))
     83 
     84         no_family_survived = 
     85             self.data[(self.data[u'Parch'] == 0) & (self.data[u'SibSp'] == 0)][u'Survived'].value_counts()[1]
     86         no_family_death = 
     87             self.data[(self.data[u'Parch'] == 0) & (self.data[u'SibSp'] == 0)][u'Survived'].value_counts()[0]
     88         no_family_rate = float(no_family_survived) / (float(no_family_survived) + float(no_family_death))
     89 
     90         print '船上乘客中有亲属也在船上的有{}人,无亲属在船上的有{}人,有亲属在船上的乘客的比率为{}'.format(has_family, no_family, rate_family)
     91         if has_family_rate > no_family_rate:
     92             return '有亲属在船上的乘客存活率更高,存活率为{}'.format(has_family_rate)
     93         else:
     94             return '无亲属在船上的乘客存活率更高,存活率为{}'.format(no_family_rate)
     95 
     96     # 7.从哪个港口登船是否影响获救
     97     def emarked_survive(self):
     98         Embarked_S_survived = self.data[self.data[u'Embarked'] == 'S'][u'Survived'].value_counts()[1]
     99         Embarked_S_death = self.data[self.data[u'Embarked'] == 'S'][u'Survived'].value_counts()[0]
    100         Embarked_S_rate = float(Embarked_S_survived) / (float(Embarked_S_survived) + float(Embarked_S_death))
    101 
    102         Embarked_C_survived = self.data[self.data[u'Embarked'] == 'C'][u'Survived'].value_counts()[1]
    103         Embarked_C_death = self.data[self.data[u'Embarked'] == 'C'][u'Survived'].value_counts()[0]
    104         Embarked_C_rate = float(Embarked_C_survived) / (float(Embarked_C_survived) + float(Embarked_C_death))
    105 
    106         Embarked_Q_survived = self.data[self.data[u'Embarked'] == 'Q'][u'Survived'].value_counts()[1]
    107         Embarked_Q_death = self.data[self.data[u'Embarked'] == 'Q'][u'Survived'].value_counts()[0]
    108         Embarked_Q_rate = float(Embarked_Q_survived) / (float(Embarked_Q_survived) + float(Embarked_Q_death))
    109 
    110         embarked = ['S港口', 'C港口', 'Q港口']
    111         rate = [Embarked_S_rate, Embarked_C_rate, Embarked_Q_rate]
    112         max_rate = max(rate)
    113         return '{}存活率最大,为{}'.format(embarked[rate.index(max_rate)], max_rate)
    114 
    115     # 8.不同年龄段女性的获救率
    116     def female_survive(self):
    117         female18_survived = 
    118             self.data[(self.data[u'Age'] <= 18) & (self.data[u'Sex'] == u'female')][u'Survived'].value_counts()[1]
    119         female18_death = 
    120             self.data[(self.data[u'Age'] <= 18) & (self.data[u'Sex'] == u'female')][u'Survived'].value_counts()[0]
    121         female18_rate = float(female18_survived) / (float(female18_survived) + float(female18_death))
    122 
    123         female1850_survived = 
    124             self.data[(self.data[u'Age'] > 18) & (self.data[u'Age'] < 50) & (self.data[u'Sex'] == u'female')][
    125                 u'Survived'].value_counts()[1]
    126         female1850_death = 
    127             self.data[(self.data[u'Age'] > 18) & (self.data[u'Age'] < 50) & (self.data[u'Sex'] == u'female')][
    128                 u'Survived'].value_counts()[0]
    129         female1850_rate = float(female1850_survived) / (float(female1850_survived) + float(female1850_death))
    130 
    131         female50_survived = 
    132             self.data[(self.data[u'Age'] >= 50) & (self.data[u'Sex'] == u'female')][u'Survived'].value_counts()[1]
    133         female50_death = 
    134             self.data[(self.data[u'Age'] >= 50) & (self.data[u'Sex'] == u'female')][u'Survived'].value_counts()[0]
    135         female50_rate = float(female50_survived) / (float(female50_survived) + float(female50_death))
    136         
    137         return '18岁以下女性存活率:{},18-50岁女性存活率:{},50岁以上女性存活率:{}'.format(female18_rate, female1850_rate, female50_rate)
    138 
    139 
    140 if __name__ == '__main__':
    141     tt = Titanic()
    142     # print tt.rate_survive()
    143     # print tt.than_survive()
    144     # print tt.max_survive()
    145     # print tt.poor_wealth()
    146     # print tt.pclass_survive()
    147     # print tt.family_survive()
    148     # print tt.emarked_survive()
    149     print tt.female_survive()

    DATA-->INFOMATION-->KNOWLEDGE-->WISDOM

    数据-->信息-->知识-->智慧

    爬虫-->数据库-->数据分析-->机器学习

    • 信息:通过某种方式组织和处理数据,分析数据间的关系,数据就有了意义
    • 知识:如果说数据是一个事实的集合,从中可以得出关于事实的结论。那么知识(Knowledge)就是信息的集合,它使信息变得有用。知识是对信息的应用,是一个对信息判断和确认的过程,这个过程结合了经验、上下文、诠释和反省。知识可以回答“如何?”的问题,可以帮助我们建模和仿真
    • 智慧:智慧可以简单的归纳为做正确判断和决定的能力,包括对知识的最佳使用。智慧可以回答“为什么”的问题。回到前面的例子,根据故障对客户的业务影响可以识别改进点
  • 相关阅读:
    Windows 解压缩XX.zip.001 XX.z01分卷文件的方法
    Android Library 发布开源库 JCenter & JitPack 攻略
    常见场景下Fragment和Activity的生命周期对比
    Android Studio 插件 ADBWifi 无线调试真机
    Flutter upgrade更新版本引发的无法启动调试APP的错误 target:kernel_snapshot failed”
    Glide异常:Failed to find GeneratedAppGlideModule 解决实践
    Android 讯飞语音听写SDK快速接入(附空指针解决和修改对话框文字方法)
    Android 自定义View—清爽小巧灵活的多节点进度条
    Android EXCEL 解析 xls 和 xlsx,方法其实很简单
    解决repo从codeaurora.org同步Android代码失败问题
  • 原文地址:https://www.cnblogs.com/siplips/p/9838789.html
Copyright © 2011-2022 走看看