zoukankan      html  css  js  c++  java
  • Python数据分析流程

    一.数据分析的步骤:

    1.查看数据并提出问题

    2.数据清洗

    3.代码编写,提取出结果数据,并分析是否有异常数据,修改代码

    4.根据数据选择合适的图表进行展示

    5.根据图表小组讨论交流获得最终的结果

    二.环境与原始数据准备

    安装 Anaconda2,并将所有软件包更新到最新版本:conda upgrade --all

    下载first.zip文件,解压

    里面有3个csv文件,分别是 enrollments.csv、daily-engagement.csv、project-submissions.csv,以及一个 IPython notebook

    启动 cmd,切换到解压之后的文件夹,输入 jupyter notebook 启动 IPython 笔记本

    三.分析数据

    1.从csv加载数据

    import unicodecsv
    
    
    def readcsv(filename):
        """Read the CSV file *filename* and return its rows as a list.

        Every row is produced by ``unicodecsv.DictReader``, i.e. one
        dictionary per data line.
        """
        # Binary mode: the raw bytes are handed straight to unicodecsv.
        with open(filename, 'rb') as csv_file:
            rows = list(unicodecsv.DictReader(csv_file))
        return rows
    
    ## Load the engagement, submission and enrollment CSV files into the
    ## variables below, then print the first row of each table.
    
    daily_engagement = readcsv('daily-engagement.csv')
    project_submissions = readcsv('project-submissions.csv')
    enrollments = readcsv('enrollments.csv')
    
    print daily_engagement[0]
    print project_submissions[0]
    print enrollments[0]
    

     2.修正数据类型

    from datetime import datetime as dt
    
    def parse_date(date):
        """Convert a ``'YYYY-MM-DD'`` string into a Python ``datetime``.

        An empty string means "no date" and yields ``None``.
        """
        return None if date == '' else dt.strptime(date, '%Y-%m-%d')
    
        
    def parse_maybe_int(i):
        """Convert the string *i* to ``int``; an empty string becomes ``None``."""
        if i != '':
            return int(i)
        return None
    
    # Fix the column types of the enrollments table: the two dates become
    # datetimes, days_to_cancel becomes an int (or None), and the two flag
    # columns become booleans.
    for enrollment in enrollments:
        for date_field in ('cancel_date', 'join_date'):
            enrollment[date_field] = parse_date(enrollment[date_field])
        enrollment['days_to_cancel'] = parse_maybe_int(enrollment['days_to_cancel'])
        for flag_field in ('is_canceled', 'is_udacity'):
            enrollment[flag_field] = enrollment[flag_field] == 'True'

    enrollments[0]

    # Fix the column types of the engagement table: the date becomes a
    # datetime, the three counters become ints (they are parsed via float
    # first) and the minutes become a float.
    for engagement_record in daily_engagement:
        engagement_record['utc_date'] = parse_date(engagement_record['utc_date'])
        for count_field in ('num_courses_visited', 'lessons_completed', 'projects_completed'):
            engagement_record[count_field] = int(float(engagement_record[count_field]))
        engagement_record['total_minutes_visited'] = float(engagement_record['total_minutes_visited'])

    daily_engagement[0]

    # Fix the column types of the submissions table: both dates become
    # datetimes.
    for submission in project_submissions:
        for date_field in ('creation_date', 'completion_date'):
            submission[date_field] = parse_date(submission[date_field])

    project_submissions[0]
    

    3.修改数据中的格式问题

    ## Rename the "acct" column of daily_engagement to "account_key", the key
    ## name the later cells look up on every table.
    for engagement_record in daily_engagement:
        engagement_record['account_key'] = engagement_record.pop('acct')
    

     4.探索数据

    ## Count the rows of each table and the number of unique students
    ## (distinct account keys) it contains.
    def unique_student_data(data):
        """Return the set of distinct ``account_key`` values found in *data*."""
        return {row['account_key'] for row in data}
    # Enrollments: total rows, then the number of distinct account keys.
    len(enrollments)
    unique_enrolled_students = unique_student_data(enrollments)
    len(unique_enrolled_students)
    
    # Daily engagement: total rows, then the number of distinct account keys.
    len(daily_engagement)
    unique_daily_engagement = unique_student_data(daily_engagement)
    len(unique_daily_engagement)
    
    # Project submissions: total rows, then the number of distinct account keys.
    len(project_submissions)
    unique_project_submissions = unique_student_data(project_submissions)
    len(unique_project_submissions)
    

     5.找出问题数据

    ## Count the problem records: enrollments whose account key is missing
    ## from the engagement table even though the join date and the cancel
    ## date differ. Each such record is printed with the running count.
    num_problem_students = 0
    for enrollment in enrollments:
        if enrollment['account_key'] in unique_daily_engagement:
            continue
        if enrollment['join_date'] == enrollment['cancel_date']:
            continue
        num_problem_students += 1
        print(enrollment)
        print(num_problem_students)
    

     6.追踪剩余的问题(移除数据集的测试账号)

    # Collect the account keys of all enrollments flagged as Udacity test
    # accounts.
    udacity_test_account = {
        enrollment['account_key']
        for enrollment in enrollments
        if enrollment['is_udacity']
    }
    len(udacity_test_account)
    
    
    def remove_udacity_account(data):
        """Return the rows of *data* that do not belong to a Udacity test account.

        A row is dropped when its ``account_key`` is a member of the
        module-level ``udacity_test_account`` set; the input is not modified.
        """
        return [row for row in data
                if row['account_key'] not in udacity_test_account]
    
    # Strip the Udacity test accounts from all three tables.
    non_udacity_enrollments, non_udacity_engagement, non_udacity_submissions = [
        remove_udacity_account(table)
        for table in (enrollments, daily_engagement, project_submissions)
    ]
    
    # Build paid_students, a dict mapping account_key -> join date, holding
    # every student who either has not cancelled or cancelled more than 7 days
    # after joining.
    paid_students = {}
    for enrollment in non_udacity_enrollments:
        # Keep enrollments that are still active or outlasted the 7-day window.
        if not enrollment['is_canceled'] or enrollment['days_to_cancel'] > 7:
            account_key = enrollment['account_key']
            enrollment_date = enrollment['join_date']
            # An account can appear in several enrollment rows; remember only
            # the most recent join date.
            if (account_key not in paid_students
                    or enrollment_date > paid_students[account_key]):
                paid_students[account_key] = enrollment_date
    len(paid_students)

     7.获取第一周就已经付费报名的学生

    def within_one_week(join_date, engagement_date):
        """Return True when *engagement_date* is within a week of *join_date*.

        Both arguments are ``datetime`` objects. The difference is measured in
        whole days and must lie in the range 0..6: engagement that happened
        before the join date, or 7 or more days after it, is rejected.
        """
        # Use the function's own arguments, measured from the join date forward
        # (the original subtracted an unrelated global from join_date).
        time_delta = engagement_date - join_date
        return 0 <= time_delta.days < 7
    def remove_free_trial_cancels(data):
        """Return the rows of *data* whose account is a paid student.

        A row is kept when its ``account_key`` is a key of the module-level
        ``paid_students`` dict; the input is not modified.
        """
        return [data_point for data_point in data
                if data_point['account_key'] in paid_students]

    # Restrict all three tables to the paid students.
    paid_enrollment = remove_free_trial_cancels(non_udacity_enrollments)
    paid_engagement = remove_free_trial_cancels(non_udacity_engagement)
    paid_project_missions = remove_free_trial_cancels(non_udacity_submissions)

    print(len(paid_enrollment))
    print(len(paid_engagement))
    print(len(paid_project_missions))
    ## Build the list of engagement records that belong to paid students and
    ## fall within the first 7 days after the student joined, then show how
    ## many rows qualify.
    paid_engagement_in_first_week = []
    for engagement_record in paid_engagement:
        join_date = paid_students[engagement_record['account_key']]
        engagement_record_date = engagement_record['utc_date']
        if within_one_week(join_date, engagement_record_date):
            paid_engagement_in_first_week.append(engagement_record)
    len(paid_engagement_in_first_week)
    from collections import defaultdict
    import numpy as np
    def group_data(data, key_name):
        """Group the records of *data* by the value stored under *key_name*.

        Returns a ``defaultdict(list)`` whose keys are the distinct values of
        ``record[key_name]`` and whose values are the lists of records sharing
        that value, in their original order.
        """
        buckets = defaultdict(list)
        for record in data:
            buckets[record[key_name]].append(record)
        return buckets
    
    def sum_grouped_items(grouped_data, field_name):
        """Total the *field_name* value of every record in each group.

        *grouped_data* maps a key to a list of records. The result is a plain
        dict mapping the same keys to the sum of ``record[field_name]`` over
        that key's records (0 for an empty list).
        """
        return {
            key: sum(record[field_name] for record in records)
            for key, records in grouped_data.items()
        }
    
    def describe_data(data):
        """Print the mean, standard deviation, minimum and maximum of *data*."""
        summary = (
            ('Mean:', np.mean),
            ('Standard deviation:', np.std),
            ('Minimum:', np.min),
            ('Maximum:', np.max),
        )
        for label, statistic in summary:
            print('%s %s' % (label, statistic(data)))
    

     8.获取学习时间最长的学生和时间

    # Group the first-week engagement records by student. This grouping is
    # used below and by later cells but was never defined in the original.
    engagement_by_account = group_data(paid_engagement_in_first_week, 'account_key')
    total_minutes_by_account = sum_grouped_items(engagement_by_account, 'total_minutes_visited')

    # Find the student with the largest total number of minutes.
    student_with_max_minutes = None
    max_minutes = 0
    for student, total_nums in total_minutes_by_account.items():
        if total_nums > max_minutes:
            max_minutes = total_nums
            student_with_max_minutes = student
    print(max_minutes)

    # Print that student's first-week records. Compare against
    # student_with_max_minutes: the loop variable `student` merely holds
    # whichever account happened to be iterated last.
    for engagement_record in paid_engagement_in_first_week:
        if engagement_record['account_key'] == student_with_max_minutes:
            print(engagement_record)
    

     9.找出第一周的访问数

    ## Mean, standard deviation, minimum and maximum of the number of days
    ## each student visited the classroom.
    # Flag each record with 1 when at least one course was visited, else 0,
    # so that summing the flag per student counts the days with a visit.
    for engagement_record in paid_engagement:
        engagement_record['has_visited'] = (
            1 if engagement_record['num_courses_visited'] > 0 else 0
        )

    days_visited_by_account = sum_grouped_items(engagement_by_account, 'has_visited')
    describe_data(days_visited_by_account.values())
    

     10.区分项目通过的学生

    ## 创建两个付费学生第1周的互动数据列表(engagement)。第1个包含通过项目的学生,第2个包含没通过项目的学生。
    
    subway_project_lesson_keys = ['746169184', '3176718735']

    # Account keys of the students who passed the subway project.
    pass_subway_project = set()
    for submission in paid_project_missions:
        project = submission['lesson_key']
        rating = submission['assigned_rating']
        # A submission counts as a pass when it is for one of the subway
        # project lessons and was rated PASSED or DISTINCTION.
        if project in subway_project_lesson_keys and (rating == 'PASSED' or rating == 'DISTINCTION'):
            pass_subway_project.add(submission['account_key'])

    # Split the first-week engagement records by whether the student passed.
    passing_engagement = []      # records of students who passed
    non_passing_engagement = []  # records of students who did not pass
    for engagement_record in paid_engagement_in_first_week:
        if engagement_record['account_key'] in pass_subway_project:
            passing_engagement.append(engagement_record)
        else:
            non_passing_engagement.append(engagement_record)

    print(len(passing_engagement))
    print(len(non_passing_engagement))

     11.对比两组学生的数据

    ## Compute the metrics of interest and compare the students who passed the
    ## project with those who did not. The metrics used here are the ones from
    ## the earlier cells: minutes in the classroom, lessons completed and
    ## days visited.
    # Group each list of engagement records by student account.
    passing_engagement_by_account = group_data(passing_engagement,'account_key')
    non_passing_engagement_by_account = group_data(non_passing_engagement,'account_key')
    
    # Total minutes per student, for each group.
    print 'non-passing students'
    non_passing_minute = sum_grouped_items(non_passing_engagement_by_account,'total_minutes_visited')
    describe_data(non_passing_minute.values())
    print 'passing students'
    passing_minute = sum_grouped_items(passing_engagement_by_account,'total_minutes_visited')
    describe_data(passing_minute.values())
    
    # Lessons completed per student, for each group.
    print 'non-passing lessons'
    non_passing_lessons = sum_grouped_items(non_passing_engagement_by_account,'lessons_completed')
    describe_data(non_passing_lessons.values())
    print 'passing lessons'
    passing_lessons = sum_grouped_items(passing_engagement_by_account,'lessons_completed')
    describe_data(passing_lessons.values())
    
    # Sum of the has_visited flag per student, for each group.
    print 'non-passing visited'
    non_passing_visited = sum_grouped_items(non_passing_engagement_by_account,'has_visited')
    describe_data(non_passing_visited.values())
    print 'passing visited'
    passing_visited = sum_grouped_items(passing_engagement_by_account,'has_visited')
    describe_data(passing_visited.values())
    

     12.绘制直方图

    %pylab inline
    import matplotlib.pyplot as plt
    import numpy as np
    
    def describe_data(data):
        """Print the mean, standard deviation, minimum and maximum of *data*,
        then draw a histogram of it with ``plt.hist``.
        """
        summary = (
            ('Mean:', np.mean),
            ('Standard deviation:', np.std),
            ('Minimum:', np.min),
            ('Maximum:', np.max),
        )
        for label, statistic in summary:
            print('%s %s' % (label, statistic(data)))
        plt.hist(data)
        
    # Summarise and plot the total minutes of the passing group, then of the
    # non-passing group.
    # NOTE(review): both calls end in plt.hist() with no new figure in
    # between, so when run together the histograms presumably share one
    # axes -- confirm whether these were separate notebook cells.
    describe_data(passing_minute.values())
    describe_data(non_passing_minute.values())
    

     13.改进图表并分析

    ## 至少改进一幅之前的可视化图表,尝试导入 seaborn 库使你的图表看起来更美观。
    ## 加入轴标签及表头,并修改一个或多个 hist() 内的变量。
    %pylab inline
    import seaborn as sns
    sns.set(color_codes=True)

    # Each histogram gets its own figure; otherwise the second plot would be
    # drawn over the first and its label and title would replace the first
    # ones.
    plt.figure()
    plt.hist(non_passing_minute.values(), bins=8)
    # The plotted values are per-student totals of total_minutes_visited.
    plt.xlabel('Total minutes')
    plt.title('Distribution of minutes spent in the classroom in the first week ' +
              'for students who do not pass the subway project')

    plt.figure()
    plt.hist(passing_minute.values(), bins=8)
    plt.xlabel('Total minutes')
    # This plot shows the passing group, so the title must say so (the
    # original repeated the "do not pass" title).
    plt.title('Distribution of minutes spent in the classroom in the first week ' +
              'for students who pass the subway project')
    
  • 相关阅读:
    scala之伴生对象的继承
    scala之伴生对象说明
    “Failed to install the following Android SDK packages as some licences have not been accepted” 错误
    PATH 环境变量重复问题解决
    Ubuntu 18.04 配置java环境
    JDBC的基本使用2
    DCL的基本语法(授权)
    ZJNU 1374
    ZJNU 2184
    ZJNU 1334
  • 原文地址:https://www.cnblogs.com/luhuajun/p/7640899.html
Copyright © 2011-2022 走看看