zoukankan      html  css  js  c++  java
  • 111

    数据分析 (电影数据)

    import pandas as pd
    
    uname = ['user_id', 'gender', 'age', 'occupation', 'zip']
    fuser  = '//home//yunpiao//data/1M//users.dat'
    fmovie = '/home/yunpiao/data/1M/movies.dat'
    fratings = '/home/yunpiao/data/1M/ratings.dat'
    
    pusers = pd.read_table(fuser, sep='::', header=None, names=uname, engine='python')
    uname = ['user_id','movie_id', 'rating', 'timestamp']
    prating = pd.read_table(fratings, sep='::', header=None, names=uname, engine='python')
    uname = ['movie_id', 'title', 'genres']
    %timeit pmovie = pd.read_table(fmovie, sep='::', header=None, names=uname,engine='python')
    
    
    100 loops, best of 3: 11.5 ms per loop
    

    切片

    # Show the first five user rows.
    pusers.head(5)
    
    user_id gender age occupation zip
    0 1 F 1 10 48067
    1 2 M 56 16 70072
    2 3 M 25 15 55117
    3 4 M 45 7 02460
    4 5 M 25 20 55455
    # Show the first five rating rows (positional slice).
    prating.iloc[:5]
    
    user_id movie_id rating timestamp
    0 1 1193 5 978300760
    1 1 661 3 978302109
    2 1 914 3 978301968
    3 1 3408 4 978300275
    4 1 2355 5 978824291
    # Rows at positions 1, 5 and 9: start at 1, stop before 10, step 4.
    pmovie.iloc[1:10:4]
    
    movie_id title genres
    1 2 Jumanji (1995) Adventure|Children's|Fantasy
    5 6 Heat (1995) Action|Crime|Thriller
    9 10 GoldenEye (1995) Action|Adventure|Thriller
    # Join ratings with users, then with movies. pd.merge joins on the
    # column names the two frames share when no key is given.
    data = pd.merge(pd.merge(prating, pusers), pmovie)
    # .ix was deprecated in pandas 0.20 and removed in 1.0; .loc is the
    # label-based equivalent for this integer row label.
    print(data.loc[6])
    
    user_id                                           19
    movie_id                                        1193
    rating                                             5
    timestamp                                  982730936
    gender                                             M
    age                                                1
    occupation                                        10
    zip                                            48073
    title         One Flew Over the Cuckoo's Nest (1975)
    genres                                         Drama
    Name: 6, dtype: object
    
    # Mean rating per title, with one column per gender value.
    mean_ratings = data.pivot_table(
        values='rating',
        index='title',
        columns='gender',
        aggfunc='mean',
    )
    mean_ratings.head(5)
    
    gender F M
    title
    $1,000,000 Duck (1971) 3.375000 2.761905
    'Night Mother (1986) 3.388889 3.352941
    'Til There Was You (1997) 2.675676 2.733333
    'burbs, The (1989) 2.793478 2.962085
    ...And Justice for All (1979) 3.828571 3.689024
    # Number of ratings each title received.
    rating_by_title = data.groupby(by='title').size()
    rating_by_title.iloc[:4]
    
    title
    $1,000,000 Duck (1971)        37
    'Night Mother (1986)          70
    'Til There Was You (1997)     52
    'burbs, The (1989)           303
    dtype: int64
    
    # Keep only the titles that have at least 250 ratings.
    active_title = rating_by_title.index[rating_by_title.ge(250)]
    print(active_title)
    
    Index([u''burbs, The (1989)', u'10 Things I Hate About You (1999)',
           u'101 Dalmatians (1961)', u'101 Dalmatians (1996)',
           u'12 Angry Men (1957)', u'13th Warrior, The (1999)',
           u'2 Days in the Valley (1996)', u'20,000 Leagues Under the Sea (1954)',
           u'2001: A Space Odyssey (1968)', u'2010 (1984)',
           ...
           u'X-Men (2000)', u'Year of Living Dangerously (1982)',
           u'Yellow Submarine (1968)', u'You've Got Mail (1998)',
           u'Young Frankenstein (1974)', u'Young Guns (1988)',
           u'Young Guns II (1990)', u'Young Sherlock Holmes (1985)',
           u'Zero Effect (1998)', u'eXistenZ (1999)'],
          dtype='object', name=u'title', length=1216)
    
    # Restrict the per-gender means to the frequently rated titles.
    # .ix was removed in pandas 1.0; .loc does the same label-based selection.
    mean_ratings = mean_ratings.loc[active_title]
    mean_ratings[:3]
    
    gender F M
    title
    'burbs, The (1989) 2.793478 2.962085
    10 Things I Hate About You (1999) 3.646552 3.311966
    101 Dalmatians (1961) 3.791444 3.500000
    # Order titles by the mean rating in column 'M', highest first.
    # NOTE(review): the name looks like a typo for "top_male_ratings"; kept as is.
    top_demale_ratings = mean_ratings.sort_values('M', ascending=False)
    top_demale_ratings['M'].iloc[:3]
    
    title
    Godfather, The (1972)                                                  4.583333
    Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)    4.576628
    Shawshank Redemption, The (1994)                                       4.560625
    Name: M, dtype: float64
    
    # Column 'M' minus column 'F' for each title; positive means 'M' is higher.
    mean_ratings['diff'] = mean_ratings['M'].sub(mean_ratings['F'])
    mean_ratings.head(5)
    
    gender F M diff
    title
    'burbs, The (1989) 2.793478 2.962085 0.168607
    10 Things I Hate About You (1999) 3.646552 3.311966 -0.334586
    101 Dalmatians (1961) 3.791444 3.500000 -0.291444
    101 Dalmatians (1996) 3.240000 2.911215 -0.328785
    12 Angry Men (1957) 4.184397 4.328421 0.144024
    # Titles with the largest 'M' minus 'F' gap come first.
    top_diff = mean_ratings.sort_values('diff', ascending=False)
    top_diff.iloc[:4]
    
    gender F M diff
    title
    Good, The Bad and The Ugly, The (1966) 3.494949 4.221300 0.726351
    Kentucky Fried Movie, The (1977) 2.878788 3.555147 0.676359
    Dumb & Dumber (1994) 2.697987 3.336595 0.638608
    Longest Day, The (1962) 3.411765 4.031447 0.619682
    # Standard deviation of ratings per title.
    rating_std_by_title = data.groupby('title')['rating'].std()
    # Keep only the frequently rated titles. .ix was removed in pandas 1.0;
    # .loc does the same label-based selection.
    rating_std_by_title = rating_std_by_title.loc[active_title]
    # The ten titles with the highest rating standard deviation.
    rating_std_by_title.sort_values(ascending=False)[:10]
    
    title
    Dumb & Dumber (1994)                     1.321333
    Blair Witch Project, The (1999)          1.316368
    Natural Born Killers (1994)              1.307198
    Tank Girl (1995)                         1.277695
    Rocky Horror Picture Show, The (1975)    1.260177
    Eyes Wide Shut (1999)                    1.259624
    Evita (1996)                             1.253631
    Billy Madison (1995)                     1.249970
    Fear and Loathing in Las Vegas (1998)    1.246408
    Bicentennial Man (1999)                  1.245533
    Name: rating, dtype: float64
    
  • 相关阅读:
    Floyd_Warshall算法
    Bellman_Ford算法
    深度优先搜索
    广度优先搜索
    贪心算法_活动选择
    动态规划_0-1背包问题
    算法导论_动态规划_最长回文子序列
    算法导论_动态规划_最长公共子序列
    动态规划解决分割问题
    2016 Google中国开发者大会游记
  • 原文地址:https://www.cnblogs.com/yunpiao111/p/5840226.html
Copyright © 2011-2022 走看看