1.python读取数据
# Shell command (not Python): the notes start the session with `ipython -pylab`
# (newer IPython releases spell this flag `--pylab`).
import json
from collections import Counter

# 1. Peek at the first line of the data file.
path = 'python/data01.txt'
with open(path) as f:
    first_line = f.readline()
print(first_line)

# 2. Parse every line as its own JSON document and collect the results.
# json.loads() parses a string; json.load() expects a file object and would
# fail when handed a single line of text.
with open(path) as f:
    records = [json.loads(line) for line in f]

# 'tz' value of the first record (Python indexing starts at 0).
print(records[0]['tz'])

# Keep only the records that actually carry a 'tz' key.
time_zones = [rec['tz'] for rec in records if 'tz' in rec]

# Manual tally of occurrences per time zone, then flipped into
# (count, tz) pairs.
count_dict = {}
for tz in time_zones:
    count_dict[tz] = count_dict.get(tz, 0) + 1
value_key_pairs = [(count, tz) for tz, count in count_dict.items()]

# Same tally using the standard library.
counts = Counter(time_zones)
print(counts.most_common(10))  # the 10 most frequent time zones
pandas read_table:标准的读取函数
import pandas as pd

# Column names to assign, since the .dat file is read with header=None.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']

# sep is the field delimiter. A multi-character separator such as '::' is
# only supported by the Python parsing engine, so request it explicitly.
users = pd.read_table(
    'python/users.dat',
    sep='::',
    header=None,
    names=unames,
    engine='python',
)

# Second example: reading a GBK-encoded CSV file.
# NOTE: this rebinds `users`, replacing the table loaded above.
users = pd.read_csv('python/users.csv', encoding='gbk')
2.pandas Dataframe将数据表示为一个表格
from pandas import DataFrame, Series
import pandas as pd
import numpy as np

# Turn `records` into a table.
# NOTE(review): `records` is not defined in this snippet — presumably the
# list of parsed JSON records built in section 1; confirm.
frame = DataFrame(records)

# Occurrences of each value in the 'tz' column.
tz_counts = frame['tz'].value_counts()

# Replace missing values with a visible label ...
clean_tz = frame['tz'].fillna('Missing')
# ... and give empty strings their own label as well.
clean_tz[clean_tz == ''] = 'Unknown'

# Recount on the cleaned column and draw the first 10 entries as a
# horizontal bar chart.
tz_counts = clean_tz.value_counts()
tz_counts[:10].plot(kind='barh', rot=0)
Series.plot方法的函数:
3.pandas 合并表
# Join the three tables in two steps: first rating with users, then the
# result with movies. No join keys are passed, so pd.merge picks the
# columns the two inputs have in common.
rating_with_users = pd.merge(rating, users)
data = pd.merge(rating_with_users, movies)
求每部电影按性别划分的平均评分
# Mean of the 'rating' column, one row per title and one column per gender.
# The method is pivot_table, and the axis keywords are `index` / `columns`
# (the old `rows` / `cols` names are no longer accepted by pandas).
mean_ratings = data.pivot_table(
    'rating',
    index='title',
    columns='gender',
    aggfunc='mean',
)

# Number of rows (i.e. ratings) per title.
ratings_by_title = data.groupby('title').size()

# Titles with at least 200 ratings.
active_titles = ratings_by_title.index[ratings_by_title >= 200]

# Label-based row selection; .loc replaces the removed .ix indexer.
mean_ratings = mean_ratings.loc[active_titles]