zoukankan      html  css  js  c++  java
  • 111

    数据分析 (电影数据)

    import pandas as pd
    
    uname = ['user_id', 'gender', 'age', 'occupation', 'zip']
    fuser  = '//home//yunpiao//data/1M//users.dat'
    fmovie = '/home/yunpiao/data/1M/movies.dat'
    fratings = '/home/yunpiao/data/1M/ratings.dat'
    
    pusers = pd.read_table(fuser, sep='::', header=None, names=uname, engine='python')
    uname = ['user_id','movie_id', 'rating', 'timestamp']
    prating = pd.read_table(fratings, sep='::', header=None, names=uname, engine='python')
    uname = ['movie_id', 'title', 'genres']
    %timeit pmovie = pd.read_table(fmovie, sep='::', header=None, names=uname,engine='python')
    
    
    100 loops, best of 3: 11.5 ms per loop
    

    切片

    # Show the first five user rows.
    pusers.head()
    
    user_id gender age occupation zip
    0 1 F 1 10 48067
    1 2 M 56 16 70072
    2 3 M 25 15 55117
    3 4 M 45 7 02460
    4 5 M 25 20 55455
    # Show the first five rating rows.
    prating.head()
    
    user_id movie_id rating timestamp
    0 1 1193 5 978300760
    1 1 661 3 978302109
    2 1 914 3 978301968
    3 1 3408 4 978300275
    4 1 2355 5 978824291
    # Positional slice: rows 1, 5 and 9 (start 1, stop 10, step 4).
    pmovie.iloc[1:10:4]
    
    movie_id title genres
    1 2 Jumanji (1995) Adventure|Children's|Fantasy
    5 6 Heat (1995) Action|Crime|Thriller
    9 10 GoldenEye (1995) Action|Adventure|Thriller
    # Join on the shared column names: ratings with users (user_id), then the
    # result with movies (movie_id).
    ratings_with_users = pd.merge(prating, pusers)
    data = pd.merge(ratings_with_users, pmovie)
    # DataFrame.ix was removed from pandas; .loc selects the row labelled 6.
    print(data.loc[6])
    
    user_id                                           19
    movie_id                                        1193
    rating                                             5
    timestamp                                  982730936
    gender                                             M
    age                                                1
    occupation                                        10
    zip                                            48073
    title         One Flew Over the Cuckoo's Nest (1975)
    genres                                         Drama
    Name: 6, dtype: object
    
    # Mean rating per title, with one column per gender value.
    mean_ratings = data.pivot_table(
        values='rating',
        index='title',
        columns='gender',
        aggfunc='mean',
    )
    mean_ratings.head()
    
    gender F M
    title
    $1,000,000 Duck (1971) 3.375000 2.761905
    'Night Mother (1986) 3.388889 3.352941
    'Til There Was You (1997) 2.675676 2.733333
    'burbs, The (1989) 2.793478 2.962085
    ...And Justice for All (1979) 3.828571 3.689024
    # Count how many rating rows each title has.
    rating_by_title = data.groupby('title').size()
    rating_by_title.head(4)
    
    title
    $1,000,000 Duck (1971)        37
    'Night Mother (1986)          70
    'Til There Was You (1997)     52
    'burbs, The (1989)           303
    dtype: int64
    
    # Titles that received at least 250 ratings.
    active_title = rating_by_title[rating_by_title >= 250].index
    print(active_title)
    
    Index([u''burbs, The (1989)', u'10 Things I Hate About You (1999)',
           u'101 Dalmatians (1961)', u'101 Dalmatians (1996)',
           u'12 Angry Men (1957)', u'13th Warrior, The (1999)',
           u'2 Days in the Valley (1996)', u'20,000 Leagues Under the Sea (1954)',
           u'2001: A Space Odyssey (1968)', u'2010 (1984)',
           ...
           u'X-Men (2000)', u'Year of Living Dangerously (1982)',
           u'Yellow Submarine (1968)', u'You've Got Mail (1998)',
           u'Young Frankenstein (1974)', u'Young Guns (1988)',
           u'Young Guns II (1990)', u'Young Sherlock Holmes (1985)',
           u'Zero Effect (1998)', u'eXistenZ (1999)'],
          dtype='object', name=u'title', length=1216)
    
    # Keep only the titles listed in active_title. DataFrame.ix was removed
    # from pandas; .loc performs the same label-based row selection.
    mean_ratings = mean_ratings.loc[active_title]
    mean_ratings[:3]
    
    gender F M
    title
    'burbs, The (1989) 2.793478 2.962085
    10 Things I Hate About You (1999) 3.646552 3.311966
    101 Dalmatians (1961) 3.791444 3.500000
    # Titles ordered by mean rating in the 'M' column, highest first.
    # (Renamed from the typo'd `top_demale_ratings`; the sort key is 'M'.)
    top_male_ratings = mean_ratings.sort_values(by='M', ascending=False)
    top_male_ratings['M'][:3]
    
    title
    Godfather, The (1972)                                                  4.583333
    Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)    4.576628
    Shawshank Redemption, The (1994)                                       4.560625
    Name: M, dtype: float64
    
    # Per-title gap between the two gender columns: 'M' mean minus 'F' mean.
    mean_ratings['diff'] = mean_ratings['M'].sub(mean_ratings['F'])
    mean_ratings.head()
    
    gender F M diff
    title
    'burbs, The (1989) 2.793478 2.962085 0.168607
    10 Things I Hate About You (1999) 3.646552 3.311966 -0.334586
    101 Dalmatians (1961) 3.791444 3.500000 -0.291444
    101 Dalmatians (1996) 3.240000 2.911215 -0.328785
    12 Angry Men (1957) 4.184397 4.328421 0.144024
    # Largest positive 'diff' first, i.e. titles where the 'M' mean most
    # exceeds the 'F' mean.
    top_diff = mean_ratings.sort_values('diff', ascending=False)
    top_diff.head(4)
    
    gender F M diff
    title
    Good, The Bad and The Ugly, The (1966) 3.494949 4.221300 0.726351
    Kentucky Fried Movie, The (1977) 2.878788 3.555147 0.676359
    Dumb & Dumber (1994) 2.697987 3.336595 0.638608
    Longest Day, The (1962) 3.411765 4.031447 0.619682
    # Standard deviation of the rating column within each title.
    rating_std_by_title = data.groupby('title')['rating'].std()
    # Restrict to the titles in active_title. Series.ix was removed from
    # pandas; .loc performs the same label-based selection.
    rating_std_by_title = rating_std_by_title.loc[active_title]
    # Ten titles with the widest spread of ratings.
    rating_std_by_title.sort_values(ascending=False)[:10]
    
    title
    Dumb & Dumber (1994)                     1.321333
    Blair Witch Project, The (1999)          1.316368
    Natural Born Killers (1994)              1.307198
    Tank Girl (1995)                         1.277695
    Rocky Horror Picture Show, The (1975)    1.260177
    Eyes Wide Shut (1999)                    1.259624
    Evita (1996)                             1.253631
    Billy Madison (1995)                     1.249970
    Fear and Loathing in Las Vegas (1998)    1.246408
    Bicentennial Man (1999)                  1.245533
    Name: rating, dtype: float64
    
  • 相关阅读:
    EasyDSS前端界面在页面缩小时内置列表仍需手动刷新的优化
    【解决方案】家庭保姆犯罪案频出,EasyDSS视频监控平台如何确保家政安全?
    EasyDSS现场录视频流合成后出现视频内容部分丢失的问题排查及解决
    TSINGSEE青犀视频基于流媒体技术EasyDSS搭建酒店IPTV直播/点播平台
    EasyDSS视频直播列表页面横向滚动条和纵向滚动条不能同步的问题优化
    【解决方案】电力巡检进入智能化时代,无人机+EasyDSS开启智能巡检新模式
    EasyDSS新内核版本测试删除录像文件后存在残留问题调整优化
    IT常识
    Java面试题+算法案例
    数据库理论概述
  • 原文地址:https://www.cnblogs.com/yunpiao111/p/5840226.html
Copyright © 2011-2022 走看看