zoukankan      html  css  js  c++  java
  • 根据用户注册信息推荐 代码部分

    代码1:

    # coding=gbk
    """Recommend books from user registration data (Book-Crossing dataset).

    Data files:
      * BX-Users.csv        -- user ID, location and age.
      * BX-Books.csv        -- ISBN, title, author, year of publication,
                               publisher and thumbnail.
      * BX-Book-Ratings.csv -- ratings given by users to books.

    Compares two definitions of p(f, i) by printing the top-ranked books
    (five per group) recommended to two user groups: age < 25 and age > 50.
    """
    import pandas as pd

    users = pd.read_csv('../data/BX-Users.csv', sep=';', dtype={'Age': float})
    # The escape character has to be spelled '\\': a lone backslash escapes the
    # closing quote and leaves the string literal unterminated.
    books = pd.read_csv('../data/BX-Books.csv', sep=';', escapechar='\\')
    # Only the first 80000 rating rows are loaded.
    rates = pd.read_csv('../data/BX-Book-Ratings.csv', sep=';', nrows=80000,
                        dtype={'Book-Rating': float})

    # Split the users into the two age groups, keeping only the user IDs
    # (first column).  `.iloc` selects by position; `.ix` no longer exists in
    # current pandas.  Users with a missing age match neither comparison.
    AgeL25 = set(users[users.Age < 25].iloc[:, 0])
    AgeG50 = set(users[users.Age > 50].iloc[:, 0])

    # Map book ID (first column) -> title (second column), selected by
    # position, then drop ratings for books that are not in the catalogue.
    books = dict(zip(books.iloc[:, 0], books.iloc[:, 1]))
    rates = rates[rates['ISBN'].isin(list(books))]

    RateL25 = rates[rates['User-ID'].isin(AgeL25)]
    RateG50 = rates[rates['User-ID'].isin(AgeG50)]
    
    # Method 1: recommend to each age group the books that the group itself
    # rated most often (under-25 favourites for under-25 users, and likewise
    # for the over-50 group).
    # Drawback: books that are popular with every age group show up in every
    # group's recommendations.

    # Number of ratings per book inside each group.  Grouping by the plain
    # column name (not a one-element list) keeps the keys scalar ISBNs on every
    # pandas version, so they can be used directly to index `books`.
    rankL25 = RateL25.groupby('ISBN').size().to_dict()
    rankG50 = RateG50.groupby('ISBN').size().to_dict()

    # Titles of the five most-rated books per group.  A `key=` function is
    # used because comparison functions / cmp() only exist in Python 2.
    recL25 = [books[isbn] for isbn, _ in
              sorted(rankL25.items(), key=lambda kv: kv[1], reverse=True)[:5]]
    recG50 = [books[isbn] for isbn, _ in
              sorted(rankG50.items(), key=lambda kv: kv[1], reverse=True)[:5]]

    print(recL25)
    print(recG50)

    # Output recorded by the original author.  Three books appear in both
    # lists because they are popular across all age groups:
    # ['Wild Animus', 'The Lovely Bones: A Novel', 'The Da Vinci Code', "Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))", "She's Come Undone (Oprah's Book Club)"]
    # ['Wild Animus', 'The Lovely Bones: A Novel', 'The Da Vinci Code', 'Life of Pi', 'Divine Secrets of the Ya-Ya Sisterhood: A Novel']
    
    # Method 2: recommend to each age group the books whose readers belong to
    # that group in the largest proportion.  This avoids recommending the same
    # all-ages bestsellers to every group.
    # A book with a single reader would get a share of 1, so `alpha` is added
    # to the denominator to damp books with very few ratings.
    alpha = 10

    # Total number of ratings per book over all users (scalar ISBN keys).
    bookrates = rates.groupby('ISBN').size().to_dict()

    # Turn the per-group counts into smoothed shares.  float() forces true
    # division: with two integers Python 2 floors the quotient, which made
    # every score 0.  New dicts are built instead of mutating while iterating.
    rankL25 = {book: float(count) / (bookrates[book] + alpha)
               for book, count in rankL25.items()}
    rankG50 = {book: float(count) / (bookrates[book] + alpha)
               for book, count in rankG50.items()}

    # Titles of the five books with the highest share per group.
    recL25 = [books[isbn] for isbn, _ in
              sorted(rankL25.items(), key=lambda kv: kv[1], reverse=True)[:5]]
    recG50 = [books[isbn] for isbn, _ in
              sorted(rankG50.items(), key=lambda kv: kv[1], reverse=True)[:5]]

    print(recL25)
    print(recG50)

    代码2:

    # coding=gbk
    import pandas as pd
    try:
        from sklearn import cross_validation
    except ImportError:
        # Newer scikit-learn releases dropped `cross_validation`;
        # `model_selection` provides the same `train_test_split` function.
        from sklearn import model_selection as cross_validation
    
    # Load the ratings and the users; ratings and ages are parsed as floats.
    rates = pd.read_csv('data/BX-Book-Ratings.csv', sep=';', dtype={'Book-Rating': float})
    users = pd.read_csv('data/BX-Users.csv', sep=';', dtype={'Age': float})

    # Keep only users whose age is known and lies strictly between 0 and 100,
    # then discard every rating made by a user that was filtered out.
    age_column = users['Age']
    plausible_age = age_column.notnull() & (age_column > 0) & (age_column < 100)
    users = users[plausible_age]
    rates = rates[rates['User-ID'].isin(users['User-ID'])]
    
    # Reduce a location to its country only.
    def dealLocation(x):
        """Return the country part of a row's ``Location`` field.

        The location is split on commas; when it has at least three parts, the
        last part (stripped of surrounding whitespace) is returned.

        Args:
            x: A row-like object (e.g. a DataFrame row or a dict) that can be
                indexed with ``'Location'``.

        Returns:
            The text after the last comma, or the sentinel string ``'False'``
            when the location is missing (not a string) or has fewer than
            three comma-separated parts.
        """
        try:
            parts = x['Location'].split(',')
        except AttributeError:
            # A missing value (None, or the float NaN pandas uses for empty
            # cells) has no .split(); treat it like any other unusable value.
            return 'False'
        if len(parts) < 3:
            return 'False'
        return parts[-1].strip()
    # Replace each location by its country and index the users by ID:
    # user ID -> (country, age).  The unpacking requires `users` to have
    # exactly three columns.
    users['Location'] = users.apply(dealLocation, axis=1)
    userdict = {a: (b, c) for a, b, c in users.itertuples(index=False)}


    def _age_bracket(age):
        """Return the index of the five-year bracket (5*i, 5*(i+1)] holding `age`."""
        return int((age - 1) / 5)


    # Group users by country, then by age: 20 five-year brackets, where
    # bracket i holds the ages in (5*i, 5*(i+1)].
    # Grouping by the plain column name (not a one-element list) keeps the
    # group keys scalar on every pandas version.
    userclass = dict()     # country -> bracket -> set of user IDs
    userclassRec = dict()  # country -> bracket -> {ISBN: score}; replaced below
    #                        by the list of the bracket's top-20 ISBNs
    for loc, group in users.groupby('Location'):
        userclass[loc] = dict()
        userclassRec[loc] = dict()
        for i in range(20):
            in_bracket = (group['Age'] > i * 5) & (group['Age'] <= (i + 1) * 5)
            userclass[loc][i] = set(group[in_bracket]['User-ID'])
            userclassRec[loc][i] = dict()

    # Split the ratings into a training set (80%) and a test set (20%).
    # Wrapping the results in DataFrames restores the column names when the
    # split returns plain arrays.
    train, test = cross_validation.train_test_split(rates, test_size=0.2)
    train = pd.DataFrame(train, columns=['User-ID', 'ISBN', 'Book-Rating'])
    test = pd.DataFrame(test, columns=['User-ID', 'ISBN', 'Book-Rating'])

    # Score books per user class: every distinct training user who rated a
    # book adds 1 / (number of raters of that book + 5) to that book's score in
    # the user's (country, bracket) class; the +5 damps rarely rated books.
    for book, group in train.groupby('ISBN'):
        busers = set(group['User-ID'])
        weight = 1.0 / (len(busers) + 5)
        for u in busers:
            loc, age = userdict[u]
            scores = userclassRec[loc][_age_bracket(age)]
            scores[book] = scores.get(book, 0) + weight

    # Replace each class's score table by its 20 highest-scoring ISBNs.
    # A `key=` function is used because cmp() only exists in Python 2.
    for loc, ages in userclassRec.items():
        for bracket, scores in list(ages.items()):
            ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
            ages[bracket] = [isbn for isbn, _ in ranked[:20]]

    # Evaluate on the test set: for every test user, count how many of the
    # books recommended to the user's class the user actually rated.
    total = 0      # number of recommendations made, summed over test users
    accurate = 0   # recommendations that appear among the user's test ratings
    for u, group in test.groupby('User-ID'):
        loc, age = userdict[u]
        recommended = userclassRec[loc][_age_bracket(age)]
        total += len(recommended)
        accurate += sum(1 for book in set(group['ISBN']) if book in recommended)

    # Share of recommendations that were hits; 0.0 when nothing was recommended
    # (avoids a ZeroDivisionError).
    print(accurate * 1.0 / total if total else 0.0)
  • 相关阅读:
    调试SQLSERVER (二)使用Windbg调试SQLSERVER的环境设置
    调试SQLSERVER (一)生成dump文件的方法
    SQLSERVER中如何快速比较两张表的不一样
    Leptonica在VS2010中的编译及简单使用举例
    UVALive 3135--Argus+自己定义优先队列的优先规则
    mysql---总体备份和增量备份
    OllyDbg 使用笔记 (十二)
    《TCP/IP具体解释卷2:实现》笔记--IP:网际协议
    blurImage做图片模糊处理报错free(): invalid next size
    docker网络配置方法总结
  • 原文地址:https://www.cnblogs.com/porco/p/4421511.html
Copyright © 2011-2022 走看看