zoukankan      html  css  js  c++  java
  • Python数据分析流程

    一.数据分析的步骤:

    1.查看数据并提出问题

    2.数据清洗

    3.代码编写,提取出结果数据,并分析是否有异常数据,修改代码

    4.根据数据选择合适的图表进行展示

    5.根据图表小组讨论交流获得最终的结果

    二.环境与原始数据准备

    安装Anaconda2,并将所有软件包更新到最新版本:conda upgrade --all

    下载first.zip文件,解压

    里面有3个csv文件,分别是enrollments.csv、daily-engagement.csv、project-submissions.csv,以及一个ipython的notebook

    启动cmd 切换到解压之后的文件 输入 jupyter notebook 启动ipython笔记本

    三.分析数据

    1.从csv加载数据

    import unicodecsv
    
    
    def readcsv(filename):
        """Read a CSV file and return its rows as a list.

        The file is opened in binary mode and handed to
        ``unicodecsv.DictReader``, so every row comes back as a dict.
        """
        with open(filename, 'rb') as csv_file:
            rows = [row for row in unicodecsv.DictReader(csv_file)]
        return rows
    
    ## Load daily-engagement.csv, project-submissions.csv and enrollments.csv
    ## into the variables below, then inspect the first row of each table.
    
    daily_engagement = readcsv('daily-engagement.csv')
    project_submissions = readcsv('project-submissions.csv')
    enrollments = readcsv('enrollments.csv')
    
    # Python 2 print statements: show the first row of every table.
    print daily_engagement[0]
    print project_submissions[0]
    print enrollments[0]
    

     2.修正数据类型

    from datetime import datetime as dt
    
    # Convert a date string into a Python datetime.
    # An empty string means "no date" and yields None.
    def parse_date(date):
        """Parse a '%Y-%m-%d' string into a datetime; return None for ''."""
        return None if date == '' else dt.strptime(date, '%Y-%m-%d')
    
        
    # Convert a value that is either an empty string or a numeric string
    # into an int, or None when it is empty.
    def parse_maybe_int(i):
        """Return int(i), or None when i is the empty string."""
        return None if i == '' else int(i)
    
    # Fix the data types in the enrollments table: cancel date, join date,
    # days until cancellation, the cancelled flag and the Udacity
    # test-account flag. Each dict is modified in place.
    for enrollment in enrollments:
        enrollment['cancel_date'] = parse_date(enrollment['cancel_date'])
        enrollment['join_date'] = parse_date(enrollment['join_date'])
        enrollment['days_to_cancel'] = parse_maybe_int(enrollment['days_to_cancel'])
        # Only the exact string 'True' becomes True; anything else is False.
        enrollment['is_canceled'] = enrollment['is_canceled'] == 'True'
        enrollment['is_udacity'] = enrollment['is_udacity'] == 'True'

    # Bare expression: presumably shows the first cleaned row in the notebook.
    enrollments[0]
    
    # Fix the data types in the engagement table: date, courses visited,
    # lessons completed, projects completed and total minutes spent.
    # Each dict is modified in place.
    for engagement_record in daily_engagement:
        engagement_record['utc_date'] = parse_date(engagement_record['utc_date'])
        # The counts go through float() first, then are truncated to int.
        engagement_record['num_courses_visited'] = int(float(engagement_record['num_courses_visited']))
        engagement_record['lessons_completed'] = int(float(engagement_record['lessons_completed']))
        engagement_record['projects_completed'] = int(float(engagement_record['projects_completed']))
        engagement_record['total_minutes_visited'] = float(engagement_record['total_minutes_visited'])

    # Bare expression: presumably shows the first cleaned row in the notebook.
    daily_engagement[0]
    
    # Fix the data types in the submissions table: creation date and
    # completion date. Each dict is modified in place.
    for submission in project_submissions:
        submission['creation_date'] = parse_date(submission['creation_date'])
        submission['completion_date'] = parse_date(submission['completion_date'])
    
    # Bare expression: presumably shows the first cleaned row in the notebook.
    project_submissions[0]
    

    3.修改数据中的格式问题

    ## Rename the "acct" column of daily_engagement to "account_key".
    for engagement_record in daily_engagement:
        # pop() removes the old key and hands back its value in one step.
        engagement_record['account_key'] = engagement_record.pop('acct')
    

     4.探索数据

    ## 计算每张表中的总行数,和独立学生(拥有独立的 account keys)的数量
    def unique_student_data(data):
        """Return the set of distinct 'account_key' values found in data."""
        return {row['account_key'] for row in data}
    # For each table: total number of rows, then number of distinct students.
    # The bare len(...) expressions are presumably shown as notebook output.
    len(enrollments)
    unique_enrolled_students = unique_student_data(enrollments)
    len(unique_enrolled_students)
    
    len(daily_engagement)
    # This set is reused later to find enrollments without engagement rows.
    unique_daily_engagement = unique_student_data(daily_engagement)
    len(unique_daily_engagement)
    
    len(project_submissions)
    unique_project_submissions = unique_student_data(project_submissions)
    len(unique_project_submissions)
    

     5.找出问题数据

    ## Count the problematic records: enrollments whose account has no
    ## engagement rows, ignoring rows where join date equals cancel date.
    num_problem_students = 0
    for enrollment in enrollments:
        if enrollment['account_key'] not in unique_daily_engagement and enrollment['join_date'] != enrollment['cancel_date']:
            num_problem_students +=1
            # Both prints sit inside the if, so every problem record is
            # followed by the running count so far.
            print enrollment
            print num_problem_students
    

     6.追踪剩余的问题(移除数据集的测试账号)

    # Collect the account keys of all Udacity test accounts into a set.
    udacity_test_account = {
        row['account_key'] for row in enrollments if row['is_udacity']
    }
    len(udacity_test_account)
    
    
    # Drop every record that belongs to a Udacity test account.
    def remove_udacity_account(data):
        """Return the rows of data whose 'account_key' is not a test account.

        Test accounts are looked up in the module-level set
        ``udacity_test_account``. The input list is left untouched.
        """
        return [row for row in data
                if row['account_key'] not in udacity_test_account]
    
    # Remove all Udacity test accounts from the three tables.
    non_udacity_enrollments = remove_udacity_account(enrollments)
    non_udacity_engagement = remove_udacity_account(daily_engagement)
    non_udacity_submissions = remove_udacity_account(project_submissions)
    
    # Build paid_students: account key -> join date, for every student who
    # either has not cancelled or cancelled after more than 7 days.
    paid_students = {}
    for enrollment in non_udacity_enrollments:
        # Not cancelled, or cancelled only after the 7-day window: remember
        # the student's key and join date.
        if not enrollment['is_canceled'] or enrollment['days_to_cancel'] > 7:
            account_key = enrollment['account_key']
            enrollment_date = enrollment['join_date']
            # A student may have several enrollments; keep the latest
            # join date seen for each account.
            if account_key not in paid_students or enrollment_date > paid_students[account_key]:
                paid_students[account_key] = enrollment_date
    # Number of paying students collected.
    len(paid_students)

     7.获取第一周就已经付费报名的学生

    # Day-granularity check that an engagement falls in the first week.
    def within_one_week(join_date, engagement_date):
        """Return True if engagement_date is within 7 days after join_date.

        The difference is measured from join_date to engagement_date, so
        engagements before the join date (negative delta) are rejected, as
        are engagements 7 or more whole days after it.
        """
        time_delta = engagement_date - join_date
        return 0 <= time_delta.days < 7
    # Keep only the records that belong to paying students.
    def remove_free_trial_cancels(data):
        """Return the rows of data whose 'account_key' is in paid_students.

        ``paid_students`` is the module-level dict of paying students. The
        input list is left untouched.
        """
        return [data_point for data_point in data
                if data_point['account_key'] in paid_students]

    paid_enrollment = remove_free_trial_cancels(non_udacity_enrollments)
    paid_engagement = remove_free_trial_cancels(non_udacity_engagement)
    paid_project_missions = remove_free_trial_cancels(non_udacity_submissions)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(len(paid_enrollment))
    print(len(paid_engagement))
    print(len(paid_project_missions))
    ## Build a list of engagement records restricted to paying students and
    ## to the first 7 days after each student joined, then show its length.
    paid_engagement_in_first_week = []
    for engagement_record in paid_engagement:
        join_date = paid_students[engagement_record['account_key']]
        engagement_record_date = engagement_record['utc_date']
        if within_one_week(join_date, engagement_record_date):
            paid_engagement_in_first_week.append(engagement_record)
    len(paid_engagement_in_first_week)
    from collections import defaultdict
    import numpy as np
    # Group records by a key field, e.g. engagement records by student:
    # the dict key is the field value (account key), the value is the list
    # of records sharing it.
    def group_data(data, key_name):
        """Return a defaultdict(list) mapping record[key_name] to its records."""
        buckets = defaultdict(list)
        for record in data:
            buckets[record[key_name]].append(record)
        return buckets
    
    # Sum one field per group, e.g. the total time each student spent in the
    # classroom: the key is the account key, the value is the numeric total.
    def sum_grouped_items(grouped_data, field_name):
        """Return {group key: sum of record[field_name] over that group}."""
        totals = {}
        for group_key in grouped_data:
            running = 0
            for record in grouped_data[group_key]:
                running += record[field_name]
            totals[group_key] = running
        return totals
    
    # Summarise a collection of numbers (e.g. time spent in the classroom).
    def describe_data(data):
        """Print the mean, standard deviation, minimum and maximum of data."""
        # Python 2 print statements; the comma prints label and value
        # separated by a space.
        print 'Mean:', np.mean(data)
        print 'Standard deviation:', np.std(data)
        print 'Minimum:', np.min(data)
        print 'Maximum:', np.max(data)
    

     8.获取学习时间最长的学生和时间

    # Group the first-week engagement of paying students by account so the
    # per-student totals below have something to work on.
    engagement_by_account = group_data(paid_engagement_in_first_week, 'account_key')
    total_minutes_by_account = sum_grouped_items(engagement_by_account, 'total_minutes_visited')

    # Find the student with the largest total of minutes.
    student_with_max_minutes = None
    max_minutes = 0
    for student, total_nums in total_minutes_by_account.items():
        if total_nums > max_minutes:
            max_minutes = total_nums
            student_with_max_minutes = student
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(max_minutes)

    # Show every first-week record of that student. Compare against
    # student_with_max_minutes, not the leftover loop variable.
    for engagement_record in paid_engagement_in_first_week:
        if engagement_record['account_key'] == student_with_max_minutes:
            print(engagement_record)
    

     9.找出第一周的访问数

    ## Mean, standard deviation, minimum and maximum of the number of days
    ## students visited the classroom during their first week.
    # Flag each record with 1 when at least one course was visited that day.
    for engagement_record in paid_engagement:
        if engagement_record['num_courses_visited'] > 0:
            engagement_record['has_visited'] = 1
        else:
            engagement_record['has_visited'] = 0

    # The first-week list holds the same dict objects as paid_engagement, so
    # the flag set above is visible in the grouped records.
    engagement_by_account = group_data(paid_engagement_in_first_week, 'account_key')
    days_visited_by_account = sum_grouped_items(engagement_by_account, 'has_visited')
    describe_data(days_visited_by_account.values())
    

     10.区分项目通过的学生

    ## Split the paying students' first-week engagement into two lists: one
    ## for students who passed the project, one for students who did not.

    subway_project_lesson_keys = ['746169184', '3176718735']
    # Account keys of the students who passed the project.
    pass_subway_project = set()
    for submission in paid_project_missions:
        project = submission['lesson_key']
        rating = submission['assigned_rating']
        # A rating of PASSED or DISTINCTION counts as passing.
        if project in subway_project_lesson_keys and (rating == 'PASSED' or rating == 'DISTINCTION'):
            pass_subway_project.add(submission['account_key'])

    passing_engagement = []      # records of students who passed
    non_passing_engagement = []  # records of students who did not pass
    for engagement_record in paid_engagement_in_first_week:
        if engagement_record['account_key'] in pass_subway_project:
            passing_engagement.append(engagement_record)
        else:
            non_passing_engagement.append(engagement_record)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(len(passing_engagement))
    print(len(non_passing_engagement))

     11.对比两组学生的数据

    ## Compute the metrics of interest and compare the students who passed
    ## the project with those who did not. The metrics are the ones used
    ## earlier: minutes in the classroom, lessons completed, days visited.
    passing_engagement_by_account = group_data(passing_engagement,'account_key')
    non_passing_engagement_by_account = group_data(non_passing_engagement,'account_key')
    
    # Total minutes per student, for each group.
    print 'non-passing students'
    non_passing_minute = sum_grouped_items(non_passing_engagement_by_account,'total_minutes_visited')
    describe_data(non_passing_minute.values())
    print 'passing students'
    passing_minute = sum_grouped_items(passing_engagement_by_account,'total_minutes_visited')
    describe_data(passing_minute.values())
    
    # Lessons completed per student, for each group.
    print 'non-passing lessons'
    non_passing_lessons = sum_grouped_items(non_passing_engagement_by_account,'lessons_completed')
    describe_data(non_passing_lessons.values())
    print 'passing lessons'
    passing_lessons = sum_grouped_items(passing_engagement_by_account,'lessons_completed')
    describe_data(passing_lessons.values())
    
    # Days visited per student, for each group; reads the 'has_visited'
    # field, which must already be present on every record.
    print 'non-passing visited'
    non_passing_visited = sum_grouped_items(non_passing_engagement_by_account,'has_visited')
    describe_data(non_passing_visited.values())
    print 'passing visited'
    passing_visited = sum_grouped_items(passing_engagement_by_account,'has_visited')
    describe_data(passing_visited.values())
    

     12.绘制直方图

    %pylab inline
    import matplotlib.pyplot as plt
    import numpy as np
    
    def describe_data(data):
        """Print mean, standard deviation, minimum and maximum of data, then
        draw a histogram of it with plt.hist.

        Redefines the earlier describe_data, adding the histogram call.
        """
        print 'Mean:', np.mean(data)
        print 'Standard deviation:', np.std(data)
        print 'Minimum:', np.min(data)
        print 'Maximum:', np.max(data)
        plt.hist(data)
        
    # Statistics and histogram of total minutes, passing group first.
    # NOTE(review): no new figure is created between the two calls, so both
    # histograms presumably share one axes when run in a single cell - confirm.
    describe_data(passing_minute.values())
    describe_data(non_passing_minute.values())
    

     13.改进图表并分析

    ## 至少改进一幅之前的可视化图表,尝试导入 seaborn 库使你的图表看起来更美观。
    ## 加入轴标签及表头,并修改一个或多个 hist() 内的变量。
    %pylab inline
    import seaborn as sns
    sns.set(color_codes=True)
    # Histogram of per-student total minutes for the non-passing group.
    plt.hist(non_passing_minute.values(),bins=8)
    plt.xlabel('Total minutes in the classroom')
    plt.title('Distribution of classroom minutes in the first week ' + 
              'for students who do not pass the subway project')

    # Start a new figure so the second histogram, label and title do not
    # land on top of the first one.
    plt.figure()
    # Histogram of per-student total minutes for the passing group.
    plt.hist(passing_minute.values(),bins=8)
    plt.xlabel('Total minutes in the classroom')
    plt.title('Distribution of classroom minutes in the first week ' + 
              'for students who pass the subway project')
    
  • 相关阅读:
    mysql中profile的使用
    6、MySQL索引种类
    MySql事务
    MySQL视图(view)
    MySql数据库命令基本操作
    2、MySQL常见数据库引擎及比较?
    1、列举常见的关系型数据库和非关系型都有那些?
    Python中的顺序表
    双端队列
    手持移动端特殊链接:打电话,发短信,发邮件
  • 原文地址:https://www.cnblogs.com/luhuajun/p/7640899.html
Copyright © 2011-2022 走看看