zoukankan      html  css  js  c++  java
  • 111

    数据分析 (电影数据)

    import pandas as pd
    
    uname = ['user_id', 'gender', 'age', 'occupation', 'zip']
    fuser  = '//home//yunpiao//data/1M//users.dat'
    fmovie = '/home/yunpiao/data/1M/movies.dat'
    fratings = '/home/yunpiao/data/1M/ratings.dat'
    
    pusers = pd.read_table(fuser, sep='::', header=None, names=uname, engine='python')
    uname = ['user_id','movie_id', 'rating', 'timestamp']
    prating = pd.read_table(fratings, sep='::', header=None, names=uname, engine='python')
    uname = ['movie_id', 'title', 'genres']
    %timeit pmovie = pd.read_table(fmovie, sep='::', header=None, names=uname,engine='python')
    
    
    100 loops, best of 3: 11.5 ms per loop
    

    切片

    # Preview the first five user records.
    pusers.head(5)
    
    user_id gender age occupation zip
    0 1 F 1 10 48067
    1 2 M 56 16 70072
    2 3 M 25 15 55117
    3 4 M 45 7 02460
    4 5 M 25 20 55455
    # Preview the first five rating records.
    prating.head(5)
    
    user_id movie_id rating timestamp
    0 1 1193 5 978300760
    1 1 661 3 978302109
    2 1 914 3 978301968
    3 1 3408 4 978300275
    4 1 2355 5 978824291
    # Positional row slice: start at 1, stop before 10, step 4 (rows 1, 5, 9).
    pmovie.iloc[1:10:4]
    
    movie_id title genres
    1 2 Jumanji (1995) Adventure|Children's|Fantasy
    5 6 Heat (1995) Action|Crime|Thriller
    9 10 GoldenEye (1995) Action|Adventure|Thriller
    # Join ratings to user attributes, then to movie metadata. The keys are
    # the only columns each pair of tables shares; naming them explicitly
    # keeps the join stable if a table later gains another common column.
    data = pd.merge(pd.merge(prating, pusers, on='user_id'), pmovie, on='movie_id')
    # DataFrame.ix was removed in pandas 1.0. The merged frame carries a fresh
    # integer index, so the label lookup .loc[6] selects the same row.
    print(data.loc[6])
    
    user_id                                           19
    movie_id                                        1193
    rating                                             5
    timestamp                                  982730936
    gender                                             M
    age                                                1
    occupation                                        10
    zip                                            48073
    title         One Flew Over the Cuckoo's Nest (1975)
    genres                                         Drama
    Name: 6, dtype: object
    
    # Average rating per title, with one column per gender value.
    mean_ratings = pd.pivot_table(
        data, values='rating', index='title', columns='gender', aggfunc='mean'
    )
    mean_ratings.head(5)
    
    gender F M
    title
    $1,000,000 Duck (1971) 3.375000 2.761905
    'Night Mother (1986) 3.388889 3.352941
    'Til There Was You (1997) 2.675676 2.733333
    'burbs, The (1989) 2.793478 2.962085
    ...And Justice for All (1979) 3.828571 3.689024
    # Count how many rating rows each title has.
    rating_by_title = data.groupby(by='title').size()
    rating_by_title.head(4)
    
    title
    $1,000,000 Duck (1971)        37
    'Night Mother (1986)          70
    'Til There Was You (1997)     52
    'burbs, The (1989)           303
    dtype: int64
    
    # Keep only the titles that received at least 250 ratings.
    active_title = rating_by_title[rating_by_title >= 250].index
    print(active_title)
    
    Index([u''burbs, The (1989)', u'10 Things I Hate About You (1999)',
           u'101 Dalmatians (1961)', u'101 Dalmatians (1996)',
           u'12 Angry Men (1957)', u'13th Warrior, The (1999)',
           u'2 Days in the Valley (1996)', u'20,000 Leagues Under the Sea (1954)',
           u'2001: A Space Odyssey (1968)', u'2010 (1984)',
           ...
           u'X-Men (2000)', u'Year of Living Dangerously (1982)',
           u'Yellow Submarine (1968)', u'You've Got Mail (1998)',
           u'Young Frankenstein (1974)', u'Young Guns (1988)',
           u'Young Guns II (1990)', u'Young Sherlock Holmes (1985)',
           u'Zero Effect (1998)', u'eXistenZ (1999)'],
          dtype='object', name=u'title', length=1216)
    
    # Restrict the per-gender means to the frequently rated titles.
    # DataFrame.ix was removed in pandas 1.0; selecting rows by an Index of
    # title labels is exactly what .loc does.
    mean_ratings = mean_ratings.loc[active_title]
    mean_ratings[:3]
    
    gender F M
    title
    'burbs, The (1989) 2.793478 2.962085
    10 Things I Hate About You (1999) 3.646552 3.311966
    101 Dalmatians (1961) 3.791444 3.500000
    # Titles ordered by the 'M' column's average rating, highest first.
    # (Renamed from the misspelled `top_demale_ratings`; nothing else in the
    # notebook refers to it.)
    top_male_ratings = mean_ratings.sort_values(by='M', ascending=False)
    top_male_ratings['M'][:3]
    
    title
    Godfather, The (1972)                                                  4.583333
    Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)    4.576628
    Shawshank Redemption, The (1994)                                       4.560625
    Name: M, dtype: float64
    
    # Gap between the two gender columns: positive when 'M' exceeds 'F'.
    mean_ratings['diff'] = mean_ratings['M'].sub(mean_ratings['F'])
    mean_ratings.head(5)
    
    gender F M diff
    title
    'burbs, The (1989) 2.793478 2.962085 0.168607
    10 Things I Hate About You (1999) 3.646552 3.311966 -0.334586
    101 Dalmatians (1961) 3.791444 3.500000 -0.291444
    101 Dalmatians (1996) 3.240000 2.911215 -0.328785
    12 Angry Men (1957) 4.184397 4.328421 0.144024
    # Largest M-minus-F gaps first; show the top four.
    top_diff = mean_ratings.sort_values('diff', ascending=False)
    top_diff.head(4)
    
    gender F M diff
    title
    Good, The Bad and The Ugly, The (1966) 3.494949 4.221300 0.726351
    Kentucky Fried Movie, The (1977) 2.878788 3.555147 0.676359
    Dumb & Dumber (1994) 2.697987 3.336595 0.638608
    Longest Day, The (1962) 3.411765 4.031447 0.619682
    # Standard deviation of ratings per title, as a measure of disagreement.
    rating_std_by_title = data.groupby('title')['rating'].std()
    # Series.ix was removed in pandas 1.0; .loc performs the same label-based
    # selection of the frequently rated titles.
    rating_std_by_title = rating_std_by_title.loc[active_title]
    rating_std_by_title.sort_values(ascending=False)[:10]
    
    title
    Dumb & Dumber (1994)                     1.321333
    Blair Witch Project, The (1999)          1.316368
    Natural Born Killers (1994)              1.307198
    Tank Girl (1995)                         1.277695
    Rocky Horror Picture Show, The (1975)    1.260177
    Eyes Wide Shut (1999)                    1.259624
    Evita (1996)                             1.253631
    Billy Madison (1995)                     1.249970
    Fear and Loathing in Las Vegas (1998)    1.246408
    Bicentennial Man (1999)                  1.245533
    Name: rating, dtype: float64
    
  • 相关阅读:
    工作也是一样,认真对待,你是在为自己工作
    程序员学习能力提升三要素(转载)
    该读些啥书
    每个程序员都应读的书
    微博时光机定时发送微博
    WordPress快速建站
    Tweenlite的用法
    Away3D粒子系统中文快速上手指南
    操盘手 李彪 照片[转]
    URLClassLoader加载class到当前线程类加载器【zt】
  • 原文地址:https://www.cnblogs.com/yunpiao111/p/5840226.html
Copyright © 2011-2022 走看看