zoukankan      html  css  js  c++  java
  • [读书笔记] Python数据分析 (二) 引言

     

    1. 数据分析的任务:数据读写,数据准备(清洗,修整,规范化,重塑,切片切块,变形),转换,建模计算,呈现(模型/数据)

    2. 数据集:

    bit.ly的1.usa.gov数据:URL缩短服务bit.ly和美国政府usa.gov合作从.gov或.mil用户那里收集的匿名数据

    # -*- coding:utf-8 -*-
    #导入json模块,将json字符串转换为python字典
    import json
    from collections import defaultdict
    from collections import Counter
    from pandas import DataFrame, Series
    import pandas as pd 
    import numpy as np 
    import matplotlib.pyplot as plt
    
    path = "E:/Programming/Python/PythonDataAnalysis/datasets/usagov_bitly/example.txt"
    # Parse the file one line at a time with json.loads. The with-block
    # closes the file handle as soon as the list has been built.
    with open(path) as data_file:
        records = [json.loads(line) for line in data_file]
    # Collect the 'tz' value of every record that has a 'tz' key; records
    # without that key are skipped.
    time_zones = [rec['tz'] for rec in records if 'tz' in rec]
    #--------------方法1:------------
    #时区计数
    def get_counts(sequence):
        """Count how many times each item occurs in ``sequence``.

        Returns a plain dict that maps every distinct item to its number
        of occurrences.
        """
        tally = {}
        for item in sequence:
            # Items not seen before start from 0.
            tally[item] = tally.get(item, 0) + 1
        return tally
    #取得前n个最常使用的时区
    def top_counts(count_dict, n=10):
        """Return the ``n`` largest ``(count, key)`` pairs of ``count_dict``.

        ``count_dict`` maps keys to counts. The pairs are returned in
        ascending order, so the most frequent entry comes last. If fewer
        than ``n`` entries exist, all of them are returned. A non-positive
        ``n`` yields an empty list.
        """
        # Guard needed because a slice of [-0:] would select the whole list.
        if n <= 0:
            return []
        value_key_pairs = sorted((count, tz) for tz, count in count_dict.items())
        return value_key_pairs[-n:]
    counts = get_counts(time_zones)
    print(counts)
    # Store the result under its own name: assigning it to ``top_counts``
    # would replace the function of that name, so it could not be called again.
    top_tz_counts = top_counts(counts)
    print(top_tz_counts)
    #--------------方法2:------------
    def get_counts2(sequence):
        """Count the occurrences of each item in ``sequence``.

        Returns a ``defaultdict(int)``, so looking up an item that never
        occurred gives 0.
        """
        tally = defaultdict(int)
        for item in sequence:
            tally[item] = tally[item] + 1
        return tally
    #--------------方法3:------------
    #引入collections的Counter对象
    def get_counts3(time_zones, n=10):
        """Return the ``n`` most frequent items of ``time_zones``.

        The result is a list of ``(item, count)`` tuples as produced by
        ``collections.Counter.most_common``.
        """
        return Counter(time_zones).most_common(n)
    
    top_counts3 = get_counts3(time_zones, n=10)
    print(top_counts3)
    # -------------- Method 4: ------------
    # Count the time zones with pandas.
    # Build a DataFrame from the list of records.
    frame = pd.DataFrame(records)
    # value_counts() tallies how often each value occurs in the 'tz' column.
    tz_counts = frame['tz'].value_counts()
    print(tz_counts.head(10))
    # fillna() substitutes "Missing" for the NA entries.
    clean_tz = frame['tz'].fillna("Missing")
    print(clean_tz)
    # Empty strings are relabelled as "Unknown".
    is_empty = clean_tz == ''
    clean_tz[is_empty] = "Unknown"
    tz_counts = clean_tz.value_counts()
    print(tz_counts.head(10))
    # Draw the first ten counts as a horizontal bar chart.
    tz_counts.head(10).plot(kind="barh", rot=0)
    plt.show()
    # Browser analysis: take the first whitespace-separated token of every
    # non-null value in column 'a'.
    results = Series([agent.split()[0] for agent in frame.a.dropna()])
    # Print the 8 most frequent tokens.
    print(results.value_counts().head(8))
    # Keep only the rows whose 'a' value is present.
    cframe = frame[frame.a.notnull()]
    # Label each remaining row by whether its 'a' string contains "Windows".
    operating_system = np.where(cframe['a'].str.contains("Windows"),
                                "Windows", "Not Windows")
    # Count the two labels; every entry is one or the other.
    windows = int((operating_system == "Windows").sum())
    nonWindows = len(operating_system) - windows
    print("windows:", windows, "nonWindows:", nonWindows)
    # Group the rows by time zone and by the Windows / Not Windows label.
    by_tz_os = cframe.groupby(['tz', operating_system])
    # One row per time zone, one column per label; absent combinations become 0.
    agg_counts = by_tz_os.size().unstack().fillna(0)
    print(agg_counts.head(10))
    # Pick the most common time zones: argsort the per-row totals so the
    # largest totals end up last.
    indexer = agg_counts.sum(axis=1).argsort()
    print(indexer)
    count_subset = agg_counts.take(indexer).tail(10)
    print(count_subset)
    # Stacked horizontal bar chart of Windows / Not Windows counts.
    count_subset.plot(kind="barh", stacked=True)
    # Author's note: without this call the figure shows up in IPython but
    # not when the file is run as a script.
    plt.show()
    # Normalise: divide every row by its own total.
    normed_subset = count_subset.div(count_subset.sum(axis=1), axis=0)
    normed_subset.plot(kind="barh", stacked=True)
    plt.show()

    MovieLens 1M数据集:20世纪90年代末到21世纪初,6000名用户对4000部电影给出的100万条评分数据,分为3个表:电影评分,电影元数据(类型,年代),用户的人口统计学数据(年龄,邮编,性别,职业)

    # -*- coding: utf-8 -*-
    import pandas as pd 
    import os
    # Read the data set into three tables: users, ratings and movies.
    path = 'E:/Programming/Python/PythonDataAnalysis/datasets/movielens/'
    # Column names for each file; the files have no header row.
    unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
    rnames = ['user_id', "movie_id", "rating", "timestamp"]
    mnames = ['movie_id', 'title', 'genres']
    upath = os.path.join(path, 'users.dat')
    # Fields are separated by "::"; the python parser engine is requested
    # explicitly for this multi-character separator.
    users = pd.read_table(upath, sep="::", header=None,
                          names=unames, engine='python')
    ratings = pd.read_table(os.path.join(path, 'ratings.dat'), sep="::",
                            header=None, names=rnames, engine='python')
    movies = pd.read_table(os.path.join(path, 'movies.dat'), sep="::",
                           header=None, names=mnames, engine='python')
    # Combine the three tables: ratings merged with users, then with movies.
    data = pd.merge(pd.merge(ratings, users), movies)
    print(data[:10])
    # First row of the merged table. ``.iloc`` replaces ``.ix``, which is
    # no longer available in current pandas releases.
    print(data.iloc[0])
    # Mean rating of every movie by gender: one row per title, one column
    # per gender value.
    mean_ratings = data.pivot_table('rating', index='title', columns="gender", aggfunc='mean')
    print(mean_ratings[:10])
    # Drop movies with fewer than 250 ratings.
    ratings_by_title = data.groupby('title').size()
    print(ratings_by_title[:10])
    active_titles = ratings_by_title[ratings_by_title >= 250]
    print(active_titles)
    # Select the rows whose title has at least 250 ratings. ``.loc`` replaces
    # ``.ix``, which is no longer available in current pandas releases.
    mean_ratings = mean_ratings.loc[active_titles.index]
    # ``sort_values`` replaces the removed ``sort_index(by=...)`` spelling.
    top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)
    print(top_female_ratings[:10])
    # Find the movies on which male and female ratings disagree most.
    mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']

    # Ascending sort by the M - F difference. ``sort_values`` replaces the
    # removed ``sort_index(by=...)`` spelling.
    sorted_by_diff = mean_ratings.sort_values(by='diff')
    # Smallest differences first: movies rated higher by women.
    print(sorted_by_diff[:15])
    # Reverse the order and take the first 15 rows: movies rated higher by men.
    print(sorted_by_diff[::-1][:15])
    # Most divisive movies, measured by the standard deviation of their ratings.
    rating_std_by_title = data.groupby('title')['rating'].std()
    # Keep only the active titles. Select by ``active_titles.index`` (the
    # titles); the values of ``active_titles`` are rating counts, not labels.
    rating_std_by_title = rating_std_by_title.loc[active_titles.index]
    # Sorting returns a new Series, so the result has to be assigned for
    # the print below to show the largest deviations.
    rating_std_by_title = rating_std_by_title.sort_values(ascending=False)
    print(rating_std_by_title[:15])
    

    1880-2010年间婴儿名字频率数据

    # -*- coding:utf-8 -*-
    import pandas as pd 
    import matplotlib.pyplot as plt
    import numpy as np
    path = 'E:/Programming/Python/PythonDataAnalysis/datasets/babynames/'
    names1880 = pd.read_csv(path+'yob1880.txt',names = ['name','sex','births'],engine='python')
    # Total births per sex for 1880. Printed explicitly: a bare expression
    # displays nothing when the file is run as a script.
    print(names1880.groupby('sex').births.sum())
    # Combine the per-year files into a single table.
    years = range(1880,2011)
    pieces = []
    columns = ['name','sex','birth']
    for year in years:
        subpath = 'yob%d.txt' % year
        frame = pd.read_csv(path+subpath,names = columns)
        # Record which file (year) each row came from.
        frame['year'] = year
        pieces.append(frame)
    # ignore_index=True discards the per-file row numbers.
    names = pd.concat(pieces,ignore_index = True)
    # Total births per year and sex. The string 'sum' selects pandas' own
    # summation rather than passing the Python builtin.
    total_births = names.pivot_table('birth',index = 'year',columns = 'sex',aggfunc = 'sum')
    print(total_births.tail())
    #插入prop列存放指定的婴儿数相对于总出生数的比例
    def add_prop(group):
        """Return a copy of ``group`` with an added ``prop`` column.

        ``prop`` is each row's ``birth`` value divided by the sum of
        ``birth`` over the whole group. The frame passed in is left
        untouched, so the function is safe to use with ``groupby.apply``.
        """
        # Convert to float before dividing.
        births = group.birth.astype(float)
        result = group.copy()
        result['prop'] = births / births.sum()
        return result
    
    # group_keys=False keeps the original row index on the result.
    # NOTE(review): newer pandas otherwise appears to add year/sex as index
    # levels as well, which breaks a later groupby on the same column
    # names - confirm against the installed version.
    names = names.groupby(['year','sex'], group_keys=False).apply(add_prop)
    #取出每个sex/year组合的前1000个名字
    def get_top1000(group):
        """Return the 1000 rows of ``group`` with the largest ``birth`` values.

        Rows are sorted by ``birth`` in descending order. Groups with
        fewer than 1000 rows are returned whole.
        """
        # Slice from 0 so the top row is included and up to 1000 rows are kept.
        return group.sort_values(by='birth', ascending=False)[:1000]
    grouped = names.groupby(['year','sex'])
    # Drop the group index left by apply so that 'year' and 'sex' exist only
    # as columns; the pivot tables below refer to them by column name.
    top1000 = grouped.apply(get_top1000).reset_index(drop=True)
    # The naming-trend analysis below works on this top-1000 data set.
    # Rows for boys.
    boys = top1000[top1000.sex == 'M']
    # Rows for girls.
    girls = top1000[top1000.sex == 'F']
    # Births per year for every name. The string 'sum' selects pandas' own
    # summation rather than passing the Python builtin.
    total_births = top1000.pivot_table('birth',index = 'year',columns = 'name',aggfunc = 'sum')
    subset = total_births[['John','Harry','Mary','Marilyn']]
    # One subplot per selected name.
    subset.plot(subplots = True,figsize = (12,10),grid=False,title = "Number of births per year")
    plt.show()
    # Name diversity: total 'prop' of the top-1000 rows per year and sex.
    table = top1000.pivot_table('prop',index = 'year',columns = 'sex',aggfunc = 'sum')
    table.plot(title = "sum of table1000.prop by year and sex",yticks = np.linspace(0,1.2,13),xticks = range(1880,2020,10))
    plt.show()
    # 名字最后一个字母的变化
    

      

  • 相关阅读:
    我的WCF之旅(1):创建一个简单的WCF程序
    与众不同 windows phone (15) Media(媒体)之后台播放音频
    与众不同 windows phone (14) Media(媒体)之音频播放器, 视频播放器, 与 Windows Phone 的音乐和视频中心集成
    与众不同 windows phone (10) Push Notification(推送通知)之推送 Tile 通知, 推送自定义信息
    与众不同 windows phone (17) Graphic and Animation(画图和动画)
    与众不同 windows phone (5) Chooser(选择器)
    与众不同 windows phone (26) Contacts and Calendar(联系人和日历)
    与众不同 windows phone (7) Local Database(本地数据库)
    与众不同 windows phone (19) Device(设备)之陀螺仪传感器, Motion API
    与众不同 windows phone (16) Media(媒体)之编辑图片, 保存图片到相册, 与图片的上下文菜单“应用程序...”和“共享...”关联, 与 Windows Phone 的图片中心集成
  • 原文地址:https://www.cnblogs.com/vincentcheng/p/7903179.html
Copyright © 2011-2022 走看看