# -*- coding: utf-8 -*-
"""
Created on Wed Feb 15 20:36:15 2017
@author: zzpp220
"""
import json,sys
from pandas import DataFrame,Series
import pandas as pd
import numpy as np
#=======================dataset1-web用户数据=======================================================
# path='/media/zzpp220/Data/Linux_Documents/DOWNLOAD/python-DataAnalysis/pydata-book-master/ch02/usagov_bitly_data2012-03-16-1331923249.txt'
# records=[json.loads(lines) for lines in open(path)]##json可将java脚本的格式化成字典对象
#
# frame=DataFrame(records)
# ####==============================================================================
# tz=frame['tz']##只取出该列的所有值,或者tz=frame.tz
# # ##frame['tz']返回的col_tz(series)有 一个value_counts的方法, return series 统计出tz中想同内容有多少个
# # col_tz=frame['tz'].value_counts() ##返回只有一个元素的元祖对象
# # subna_tz=tz.fillna('Missing')## repalce 'nan' with 'Missing'
# # subna_tz[subna_tz=='']='Unknown'##!!!查找空白并且替换空白的方法
# # col_tz_1=subna_tz.value_counts()
# # col_tz_1[:10].plot(kind='barh',rot=0)
# #==============================================================================
# a=frame.a.dropna()##去掉‘a’字段中的nan值
# tmp2=[x.split()[0] for x in a]##取'a'字段中每条记录的第一个空格前的内容
# result=Series(tmp2)
# #print result[:10]
# count_result=result.value_counts()
# ##将a字段的非nan的记录按照Windows与否的进行分解
# notnull_a=frame.a.notnull()
# frame_notnulla=frame[notnull_a]
# condition=frame_notnulla['a'].str.contains('Windows')
# operation_system=np.where(condition,'Windows','Not Windows')
# ##根据'tz'字段和新得到的操作系统对数据表进行分组:
# #==============================================================================
# # tz[tz=='']='Unknown'
# # gru_tz_os=frame_notnulla.groupby([tz,operation_system])
# # agg_counts=gru_tz_os.size().unstack().fillna(0)
# #==============================================================================
# gru_tz_os=frame_notnulla.groupby(['tz',operation_system])
# agg_counts=gru_tz_os.size().unstack().fillna(0)
# indexer=agg_counts.sum(1)##对每行的条目进行求和
# sort_index=indexer.argsort()##给每个条目按照总和的高低进行排序,给出索引号
#
# sub_sort_index=agg_counts.take(sort_index)[-10:]###按照上面索引的顺序截取分组表的后10行记录
# sub_sort_index.plot(kind='barh',stacked=True)##堆积
# ##归一化,改进优化比例显示
# normed=sub_sort_index.div(sub_sort_index.sum(1),axis=0)
# normed.plot(kind='barh',stacked=True)
#==============================================================================
#==================dataset2-电影评分============================================================
#
# dir_path='/media/zzpp220/Data/Linux_Documents/DOWNLOAD/python-DataAnalysis/pydata-book-master/ch02/movielens/'
#
# #==================将每个表分别读到一个DataFrame的对象中============================================================
# unames=['user_id','gender','age','occupation','zip_code']##表中的各个字段
# users=pd.read_table(dir_path+'users.dat',sep='::',header=None,names=unames)
#
# mnames=['movie_id','title','genres']
# movies=pd.read_table(dir_path+'movies.dat',sep='::',header=None,names=mnames)
#
# rnames=['user_id','movie_id','rating','timestamp']
# ratings=pd.read_table(dir_path+'ratings.dat',sep='::',header=None,names=rnames)
# #==============================================================================
#
# ##==================依次连接多个DataFrame的对象=======================
# whole=pd.merge(pd.merge(users,ratings),movies)##表的顺序就是总表中字段的排列顺序,相同的字段名会自动判断
# ##whole.ix[0] 这是每天记录的查看方法,不能直接whole[0]
# ##按性别计算每部电影的平均分,所以要选择的字段为'title','gender'其中名字作为行名,性别做为字段名
# avg_ratings=whole.pivot_table('rating',index='title',columns='gender',aggfunc='mean')
#
#
# ##先对title进行分组,然后利用size()得到一个包含各电影分组大小的Series对象--rating_by_title
# ##count_movie=whole.title.value_counts()--作用同下
# rating_by_title=whole.groupby('title').size()##直接whole.groupby('title')不会有任何输出
# needed_movie=rating_by_title.index[rating_by_title>=250]##筛选出评分数大于250的电影名
# avg_rating_needed=avg_ratings.ix[needed_movie]
#
# desc_feamale_rating=avg_rating_needed.sort_index(by='F',ascending=False)##按女性的喜爱程度降序排列
#
# ##计算男女分歧最大的电影
# avg_rating_needed['diff']=avg_rating_needed['M']-avg_rating_needed['F']
# sort_by_diff=avg_rating_needed.sort_index(by='diff')
#
# ##
# #========单纯计算分歧最大的电影,计算得分数据的方差或者标准差:对whole按电影名分类======
# ##按照电影名分类,对每个电影类内的rating列进行求平均差
# rating_std_title=whole.groupby('title')['rating'].std()
# ##从计算后的列表中选出评分记录大于250的电影名
# rating_std_title=rating_std_title.ix[needed_movie]
# ##按降序排列,第一即为方差最大的电影,即为分歧最大的电影
# rating_std_title.order(ascending=False)[:10]
# #==============================================================================
#
#
#================dataset3 - US baby names=======================================
# Load the per-year baby-name files (yobYYYY.txt, columns: name,gender,births)
# into one DataFrame, then plot total births per year broken out by gender.
path_name='/media/zzpp220/Data/Linux_Documents/DOWNLOAD/python-DataAnalysis/pydata-book-master/ch02/names/'
columns=['name','gender','births']
#names1880=pd.read_csv(path_name+'yob1880.txt',names=['name','gender','births'])  # 'births' = count of babies given that name in the year
#birth_sum=names1880.groupby('gender').births.sum()  # per-gender total births for a single year
################# Concatenate all yearly tables into one frame, adding a 'year' column
years=range(1880,2011)
piece=[]
for year in years:
    sep_path=path_name+'yob%d.txt' % year
    sep_frame=pd.read_csv(sep_path,names=columns)
    sep_frame['year']=year  # tag every row of this yearly table with its year
    piece.append(sep_frame)
names=pd.concat(piece,ignore_index=True)  # stack row-wise (default axis); fresh 0..N-1 index
# Sum births with year as the row index and gender as the column index.
# NOTE: pass aggfunc as the string 'sum' (recommended pandas form) rather than the builtin.
total_births=names.pivot_table('births',index='year',columns='gender',aggfunc='sum')
total_births.plot(title='Total births by gender and year')  # fixed typo: 'gneder' -> 'gender'
#groupby_total_births=names.groupby(['year','gender']).births.sum()  # equivalent aggregation via groupby
#groupby_total_births.plot(title='Total births by gender and year')