zoukankan      html  css  js  c++  java
  • [读书笔记] Python数据分析 (二) 引言

     

    1. 数据分析的任务:数据读写,数据准备(清洗,修整,规范化,重塑,切片切块,变形),转换,建模计算,呈现(模型/数据)

    2. 数据集:

    bit.ly的1.usa.gov数据:URL缩短服务bit.ly和美国政府usa.gov合作从.gov或.mil用户那里收集的匿名数据

    # -*- coding:utf-8 -*-
    #导入json模块,将json字符串转换为python字典
    import json
    from collections import defaultdict
    from collections import Counter
    from pandas import DataFrame, Series
    import pandas as pd 
    import numpy as np 
    import matplotlib.pyplot as plt
    
    path = "E:/Programming/Python/PythonDataAnalysis/datasets/usagov_bitly/example.txt"
    # Each line of the file is parsed as one JSON document. The context
    # manager closes the file handle as soon as parsing is done, and the
    # explicit encoding avoids depending on the platform's default code page.
    with open(path, encoding="utf-8") as source:
        records = [json.loads(line) for line in source]
    # Collect the time zone of every record that actually has a 'tz' field.
    time_zones = [rec['tz'] for rec in records if 'tz' in rec]
    #--------------方法1:------------
    #时区计数
    def get_counts(sequence):
        """Count how many times each item occurs in ``sequence``.

        Returns a plain dict mapping every distinct item to its number of
        occurrences; keys appear in order of first occurrence.
        """
        tally = {}
        for item in sequence:
            tally[item] = tally.get(item, 0) + 1
        return tally
    #取得前n个最常使用的时区
    def top_counts(count_dict, n=10):
        """Return the ``n`` largest entries of ``count_dict`` as (count, key) pairs.

        The pairs are sorted in ascending order, so the most frequent key is
        last; ties on the count are ordered by key. A non-positive ``n``
        yields an empty list.
        """
        if n <= 0:
            # A bare [-n:] slice with n == 0 is [0:], i.e. the whole list,
            # so "no entries" has to be handled explicitly.
            return []
        value_key_pairs = sorted((count, key) for key, count in count_dict.items())
        return value_key_pairs[-n:]
    counts = get_counts(time_zones)
    print(counts)
    # Store the result under its own name: rebinding ``top_counts`` here would
    # replace the function with a list and make it uncallable afterwards.
    top_tz_counts = top_counts(counts)
    print(top_tz_counts)
    #--------------方法2:------------
    def get_counts2(sequence):
        """Count occurrences of each item in ``sequence`` using a defaultdict.

        Returns a ``defaultdict(int)``, so looking up an unseen key gives 0.
        """
        tally = defaultdict(int)
        for item in sequence:
            tally[item] = tally[item] + 1
        return tally
    #--------------方法3:------------
    #引入collections的Counter对象
    def get_counts3(time_zones, n=10):
        """Return the ``n`` most common items of ``time_zones``.

        The result is a list of (item, count) pairs as produced by
        ``Counter.most_common``, most frequent first.
        """
        return Counter(time_zones).most_common(n)
    
    top_counts3 = get_counts3(time_zones, n=10)
    print(top_counts3)
    #--------------Method 4:------------
    # Count the time zones with pandas.
    # Turn the list of records into a DataFrame.
    frame = DataFrame(records)
    # value_counts() on the 'tz' column tallies each distinct value.
    tz_counts = frame['tz'].value_counts()
    print(tz_counts.head(10))
    # fillna() substitutes a label for missing (NA) values.
    clean_tz = frame['tz'].fillna("Missing")
    print(clean_tz)
    # Empty strings are relabelled as Unknown.
    clean_tz.loc[clean_tz == ''] = "Unknown"
    tz_counts = clean_tz.value_counts()
    print(tz_counts.head(10))
    # Draw the first ten counts as a horizontal bar chart.
    tz_counts.head(10).plot(kind="barh", rot=0)
    plt.show()
    # User-agent (browser) analysis.
    # Keep the first whitespace-separated token of every non-null agent string.
    results = Series([agent.split()[0] for agent in frame['a'].dropna()])
    # Print the 8 most frequent tokens.
    print(results.value_counts().head(8))
    # Only rows that have an agent string can be classified.
    cframe = frame[frame['a'].notnull()]
    operating_system = np.where(cframe['a'].str.contains("Windows"),"Windows","Not Windows")
    # Count the two labels without an explicit Python loop.
    windows = int((operating_system == "Windows").sum())
    nonWindows = len(operating_system) - windows
    print("windows:",windows,"nonWindows:",nonWindows)
    # Group the rows by time zone and by the Windows / Not Windows label.
    by_tz_os = cframe.groupby(['tz', operating_system])
    # Group sizes reshaped into a table, with missing combinations set to 0.
    agg_counts = by_tz_os.size().unstack().fillna(0)
    print(agg_counts.head(10))
    # Pick the most common time zones: argsort of the per-row totals gives
    # the row positions in ascending order of total count.
    indexer = agg_counts.sum(axis=1).argsort()
    print(indexer)
    # The last ten rows after reordering are the ten largest totals.
    count_subset = agg_counts.take(indexer).tail(10)
    print(count_subset)
    # Stacked horizontal bar chart of Windows / Not Windows counts.
    count_subset.plot(kind="barh", stacked=True)
    # Without this call the figure shows up in IPython but not when the
    # file is run as a script.
    plt.show()
    # Normalise every row so that it sums to 1.
    normed_subset = count_subset.div(count_subset.sum(axis=1), axis=0)
    normed_subset.plot(kind="barh", stacked=True)
    plt.show()

    MovieLens 1M数据集:20世纪90年代末到21世纪初6000名用户提供的4000部电影评分100万条数据,分为3个表:电影评分,电影元数据(类型,年代),用户的人口统计学数据(年龄,邮编,性别,职业)

    # -*- coding: utf-8 -*-
    import pandas as pd 
    import os
    # Read the data into three tables: users, ratings and movies.
    path = 'E:/Programming/Python/PythonDataAnalysis/datasets/movielens/'
    # All three files use "::" as the separator and have no header row; a
    # multi-character separator requires the python parsing engine.
    unames = ['user_id','gender','age','occupation','zip']
    upath = os.path.join(path,'users.dat')
    users = pd.read_table(upath,sep = "::",header=None,names=unames,engine='python')
    rnames = ['user_id',"movie_id","rating","timestamp"]
    rpath = os.path.join(path,'ratings.dat')
    ratings = pd.read_table(rpath,sep = "::",header=None,names=rnames,engine='python')
    mnames = ['movie_id','title','genres']
    mpath = os.path.join(path,'movies.dat')
    movies = pd.read_table(mpath,sep ="::",header=None,names=mnames,engine='python')
    # Combine the tables; merge() joins on the column names they share.
    data = pd.merge(pd.merge(ratings,users),movies)
    print(data[:10])
    # .ix has been removed from pandas; .iloc[0] selects the first row by position.
    print(data.iloc[0])
    # Mean rating of every movie split by gender: titles become the row
    # labels (index) and the gender values become the column labels.
    mean_ratings = data.pivot_table('rating',index = 'title',columns = "gender",aggfunc='mean')
    print(mean_ratings[:10])
    # Drop movies that received fewer than 250 ratings.
    ratings_by_title = data.groupby('title').size()
    print(ratings_by_title[:10])
    active_titles = ratings_by_title[ratings_by_title >= 250]
    print(active_titles)
    # Keep only the rows whose title has >= 250 ratings. Label-based .loc
    # replaces .ix, which has been removed from pandas.
    mean_ratings = mean_ratings.loc[active_titles.index]
    # sort_values(by=...) replaces the removed sort_index(by=...).
    top_female_ratings = mean_ratings.sort_values(by='F',ascending=False)
    print(top_female_ratings[:10])
    # Find the movies on which male and female ratings disagree the most.
    mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']

    sorted_by_diff = mean_ratings.sort_values(by = 'diff')
    # Smallest (most negative) diff first: movies preferred by women.
    print(sorted_by_diff[:15])
    # Reversed order, first 15 rows: movies preferred by men.
    print(sorted_by_diff[::-1][:15])
    # Most divisive movies overall: standard deviation of the ratings per title.
    rating_std_by_title = data.groupby('title')['rating'].std()
    # Restrict to the active titles. The labels to select are the titles in
    # active_titles.index; active_titles itself holds the rating counts.
    rating_std_by_title = rating_std_by_title.loc[active_titles.index]
    # sort_values() returns a new Series (it replaces the removed
    # Series.order), so the sorted result itself must be the thing printed.
    print(rating_std_by_title.sort_values(ascending=False)[:15])
    

    1880-2010年间婴儿名字频率数据

    # -*- coding:utf-8 -*-
    import pandas as pd 
    import matplotlib.pyplot as plt
    import numpy as np
    path = 'E:/Programming/Python/PythonDataAnalysis/datasets/babynames/'
    names1880 = pd.read_csv(path+'yob1880.txt',names = ['name','sex','births'],engine='python')
    # Total births per sex in the 1880 file. Printed explicitly because a
    # bare expression produces no output when the file is run as a script.
    print(names1880.groupby('sex').births.sum())
    # Combine the per-year files into a single table.
    years = range(1880,2011)
    pieces = []
    columns = ['name','sex','birth']
    for year in years:
        subpath = 'yob%d.txt' % year
        frame = pd.read_csv(path+subpath,names = columns)
        # Add the year as a column, since the rows are only given the
        # name/sex/birth columns when read.
        frame['year'] = year
        pieces.append(frame)
    # ignore_index=True discards the per-file row numbers.
    names = pd.concat(pieces,ignore_index = True)
    # Aggregate births by year and sex with pivot_table(). The string 'sum'
    # is used instead of the builtin, which newer pandas warns about.
    total_births = names.pivot_table('birth',index = 'year',columns = 'sex',aggfunc = 'sum')
    print(total_births.tail())
    #插入prop列存放指定的婴儿数相对于总出生数的比例
    def add_prop(group):
        """Add a 'prop' column with each row's share of the group's births.

        The column is written onto ``group`` itself and that same object is
        returned.
        """
        as_float = group['birth'].astype(float)
        total = as_float.sum()
        group['prop'] = as_float / total
        return group
    
    # group_keys=False stops pandas (>= 2.0) from adding 'year' and 'sex' as
    # index levels of the result; with them in both the index and the columns,
    # a later groupby(['year','sex']) would be ambiguous.
    names = names.groupby(['year','sex'], group_keys=False).apply(add_prop)
    #取出每个sex/year组合的前1000个名字
    def get_top1000(group):
        """Return the (up to) 1000 rows of ``group`` with the largest 'birth' value.

        Rows come back sorted by 'birth' in descending order. The selection
        starts at the first row: a [1:1000] slice would drop the most popular
        entry and keep only 999 rows.
        """
        return group.sort_values(by='birth', ascending=False).head(1000)
    # group_keys=False keeps 'year' and 'sex' out of the result's index, so
    # they remain ordinary columns and pivot_table(index='year') below is not
    # ambiguous between an index level and a column.
    grouped = names.groupby(['year','sex'], group_keys=False)
    top1000 = grouped.apply(get_top1000)
    # The naming-trend analysis below works on this top-1000 data set.
    # Rows for boys.
    boys = top1000[top1000.sex == 'M']
    # Rows for girls.
    girls = top1000[top1000.sex == 'F']
    # Births per year for every name; 'sum' is passed as a string because
    # newer pandas warns about the builtin.
    total_births = top1000.pivot_table('birth',index = 'year',columns = 'name',aggfunc = 'sum')
    subset = total_births[['John','Harry','Mary','Marilyn']]
    subset.plot(subplots = True,figsize = (12,10),grid=False,title = "Number of births per year")
    plt.show()
    # Name diversity over time: the summed 'prop' of the top names per year and sex.
    table = top1000.pivot_table('prop',index = 'year',columns = 'sex',aggfunc = 'sum')
    table.plot(title = "sum of table1000.prop by year and sex",yticks = np.linspace(0,1.2,13),xticks = range(1880,2020,10))
    plt.show()
    # 名字最后一个字母的变化
    

      

  • 相关阅读:
    暑假集训Day1 整数划分
    暑假集训day1 水题 乘法最大
    暑假集训Day0
    【不知道什么专题】——历年几道难题的分析。
    开发语言之我见
    选择器IDEA Maven不见了
    js 里函数调用之call
    js 闭包 匿名函数 JavaScript的IIFE(即时执行方法)immediately-invoked function expression
    ideal-check项目
    浏览器内部工作原理
  • 原文地址:https://www.cnblogs.com/vincentcheng/p/7903179.html
Copyright © 2011-2022 走看看