1. 数据分析的任务:数据读写,数据准备(清洗,修整,规范化,重塑,切片切块,变形),转换,建模计算,呈现(模型/数据)
2. 数据集:
bit.ly的1.usa.gov数据:URL缩短服务bit.ly和美国政府usa.gov合作从.gov或.mil用户那里收集的匿名数据
# -*- coding:utf-8 -*-
# Time-zone and browser/OS analysis of the bit.ly 1.usa.gov sample data.
# Each line of the input file is one JSON record; the script counts time
# zones in four different ways and then plots Windows vs. non-Windows usage.

# json turns each JSON string into a Python dict.
import json
from collections import defaultdict
from collections import Counter

from pandas import DataFrame, Series
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

path = "E:/Programming/Python/PythonDataAnalysis/datasets/usagov_bitly/example.txt"

# Parse one record per line.  The context manager closes the file handle, and
# the explicit encoding avoids depending on the platform's locale default
# (JSON text is UTF-8 by specification).  Blank lines are skipped because
# json.loads("") raises.
with open(path, encoding="utf-8") as source:
    records = [json.loads(line) for line in source if line.strip()]

# Collect the time zone of every record that actually has a 'tz' field.
time_zones = [rec['tz'] for rec in records if 'tz' in rec]


# -------------- Method 1: plain dict --------------
def get_counts(sequence):
    """Return a dict mapping each item of *sequence* to its number of occurrences."""
    counts = {}
    for x in sequence:
        counts[x] = counts.get(x, 0) + 1
    return counts


def top_counts(count_dict, n=10):
    """Return the *n* largest ``(count, key)`` pairs of *count_dict*, ascending.

    Returns an empty list when *n* is not positive (a bare ``[-n:]`` slice
    would return every pair for ``n == 0``).
    """
    if n <= 0:
        return []
    value_key_pairs = sorted((count, tz) for tz, count in count_dict.items())
    return value_key_pairs[-n:]


counts = get_counts(time_zones)
print(counts)
# Store the result under its own name so the top_counts function stays callable.
top10_counts = top_counts(counts)
print(top10_counts)


# -------------- Method 2: defaultdict --------------
def get_counts2(sequence):
    """Same as get_counts, but lets defaultdict(int) supply the initial 0."""
    counts = defaultdict(int)
    for x in sequence:
        counts[x] += 1
    return counts


# -------------- Method 3: collections.Counter --------------
def get_counts3(time_zones, n=10):
    """Return the *n* most common items of *time_zones* as ``(item, count)`` pairs."""
    counts = Counter(time_zones)
    return counts.most_common(n)


top_counts3 = get_counts3(time_zones, 10)
print(top_counts3)

# -------------- Method 4: pandas --------------
# Convert the list of record dicts into a DataFrame.
frame = DataFrame(records)
# A column (Series) has a value_counts method that does the counting.
tz_counts = frame['tz'].value_counts()
print(tz_counts[:10])

# fillna replaces missing values (NA) with the label "Missing".
clean_tz = frame['tz'].fillna("Missing")
print(clean_tz)
# Empty strings are relabelled "Unknown".
clean_tz[clean_tz == ''] = "Unknown"
tz_counts = clean_tz.value_counts()
print(tz_counts[:10])

# Horizontal bar chart of the ten most frequent time zones.
tz_counts[:10].plot(kind="barh", rot=0)
plt.show()

# Browser analysis: the first whitespace-separated token of the agent string 'a'.
results = Series([x.split()[0] for x in frame.a.dropna()])
# Print the eight most frequent browsers.
print(results.value_counts()[:8])

# Keep only records that have an agent string, then label each as
# Windows / Not Windows depending on whether the agent mentions "Windows".
cframe = frame[frame.a.notnull()]
operating_system = np.where(cframe['a'].str.contains("Windows"), "Windows", "Not Windows")

# Every label is one of the two values above, so one vectorized count suffices.
windows = int((operating_system == "Windows").sum())
nonWindows = len(operating_system) - windows
print("windows:", windows, "nonWindows:", nonWindows)

# Group by time zone and by the Windows / Not Windows label.
by_tz_os = cframe.groupby(['tz', operating_system])
agg_counts = by_tz_os.size().unstack().fillna(0)
print(agg_counts[:10])

# Select the most common time zones: argsort of the row totals gives the
# positions in ascending order, so the last ten rows are the largest.
indexer = agg_counts.sum(axis=1).argsort()
print(indexer)
count_subset = agg_counts.take(indexer)[-10:]
print(count_subset)

# Stacked bar chart of Windows / Not Windows counts.
count_subset.plot(kind="barh", stacked=True)
# Without plt.show() the figure appears in IPython but not when run as a script.
plt.show()

# Normalize each row to sum to 1 so the bars show relative shares.
normed_subset = count_subset.div(count_subset.sum(axis=1), axis=0)
normed_subset.plot(kind="barh", stacked=True)
plt.show()
MovieLens 1M数据集:20世纪90年代末到21世纪初,约6000名用户对约4000部电影给出的100万条评分数据,分为3个表:电影评分,电影元数据(类型,年代),用户的人口统计学数据(年龄,邮编,性别,职业)
# -*- coding: utf-8 -*-
# MovieLens 1M walkthrough: load the three tables, merge them, and compare
# ratings by gender and by spread (standard deviation).
import os

import pandas as pd

# Read the data into three tables.  The .dat files have no header row and use
# "::" as separator; engine='python' is passed because a multi-character
# separator is not supported by the default C parser.
path = 'E:/Programming/Python/PythonDataAnalysis/datasets/movielens/'

unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
upath = os.path.join(path, 'users.dat')
users = pd.read_table(upath, sep="::", header=None, names=unames, engine='python')

rnames = ['user_id', "movie_id", "rating", "timestamp"]
rpath = os.path.join(path, 'ratings.dat')
ratings = pd.read_table(rpath, sep="::", header=None, names=rnames, engine='python')

mnames = ['movie_id', 'title', 'genres']
mpath = os.path.join(path, 'movies.dat')
movies = pd.read_table(mpath, sep="::", header=None, names=mnames, engine='python')

# Merge the three tables; pd.merge joins on the column names they share.
data = pd.merge(pd.merge(ratings, users), movies)
print(data[:10])
# First row by position (.ix has been removed from pandas).
print(data.iloc[0])

# Mean rating of every movie per gender: titles become the row labels,
# genders the column labels.
mean_ratings = data.pivot_table('rating', index='title', columns="gender", aggfunc='mean')
print(mean_ratings[:10])

# Filter out movies with fewer than 250 ratings.
ratings_by_title = data.groupby('title').size()
print(ratings_by_title[:10])
active_titles = ratings_by_title[ratings_by_title >= 250]
print(active_titles)

# Keep only the titles that have at least 250 ratings.
mean_ratings = mean_ratings.loc[active_titles.index]
top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)
print(top_female_ratings[:10])

# Movies with the largest disagreement between male and female mean ratings.
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
sorted_by_diff = mean_ratings.sort_values(by='diff')
# Most negative diff first: largest disagreement, preferred by women.
print(sorted_by_diff[:15])
# Reversed order, first 15 rows: movies preferred by men.
print(sorted_by_diff[::-1][:15])

# Most divisive movies regardless of gender: standard deviation of the ratings.
rating_std_by_title = data.groupby('title')['rating'].std()
# Filter by the active titles.  active_titles is a Series of counts indexed by
# title, so select with its index, not with its values.
rating_std_by_title = rating_std_by_title.loc[active_titles.index]
# sort_values returns a new Series; keep it so the printed rows are the top 15.
rating_std_by_title = rating_std_by_title.sort_values(ascending=False)
print(rating_std_by_title[:15])
1880-2010年间婴儿名字频率数据
# -*- coding:utf-8 -*-
# US baby names 1880-2010: combine the per-year files, compute each name's
# share of births, keep the top 1000 names per year/sex and plot trends.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

path = 'E:/Programming/Python/PythonDataAnalysis/datasets/babynames/'

names1880 = pd.read_csv(path + 'yob1880.txt', names=['name', 'sex', 'births'], engine='python')
# Total births per sex for 1880; printed so the result is visible when run as a script.
print(names1880.groupby('sex').births.sum())

# Combine the per-year files into a single table, tagging each row with its year.
years = range(1880, 2011)
pieces = []
columns = ['name', 'sex', 'birth']
for year in years:
    subpath = 'yob%d.txt' % year
    frame = pd.read_csv(path + subpath, names=columns)
    frame['year'] = year
    pieces.append(frame)
names = pd.concat(pieces, ignore_index=True)

# Aggregate total births by year and sex with pivot_table.
total_births = names.pivot_table('birth', index='year', columns='sex', aggfunc='sum')
print(total_births.tail())


def add_prop(group):
    """Return *group* with a 'prop' column: each row's share of the group's births."""
    births = group.birth.astype(float)
    return group.assign(prop=births / births.sum())


# Apply add_prop to every year/sex group.  Iterating over the groups and
# concatenating keeps 'year' and 'sex' as ordinary columns and leaves the
# original row index intact; groupby(...).apply can instead add the group keys
# to the index, which makes the later groupby on 'year' ambiguous.
names = pd.concat([add_prop(group) for _, group in names.groupby(['year', 'sex'])])


def get_top1000(group):
    """Return the 1000 rows of *group* with the highest 'birth' values."""
    # Slice from 0: starting at 1 would drop the most popular name.
    return group.sort_values(by='birth', ascending=False)[:1000]


# Top 1000 names for every year/sex combination, with a fresh flat index.
grouped = names.groupby(['year', 'sex'])
top1000 = pd.concat([get_top1000(group) for _, group in grouped], ignore_index=True)

# The naming-trend analysis below works on this top-1000 data set.
# Boys only.
boys = top1000[top1000.sex == 'M']
# Girls only.
girls = top1000[top1000.sex == 'F']

# Births per year for every name, then plot a few selected names.
total_births = top1000.pivot_table('birth', index='year', columns='name', aggfunc='sum')
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]
subset.plot(subplots=True, figsize=(12, 10), grid=False, title="Number of births per year")
plt.show()

# Name diversity over time: share of all births covered by the top 1000 names.
table = top1000.pivot_table('prop', index='year', columns='sex', aggfunc='sum')
table.plot(title="sum of table1000.prop by year and sex",
           yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))
plt.show()

# Changes in the last letter of names