代码:用户标签
通过标签将用户和物品联系起来
# coding=gbk
# Simple tag-based recommendation: users and items are linked through the
# tags that users attach to items.
import math

import pandas as pd


def _top_keys(counts, limit):
    """Return up to `limit` keys of `counts`, largest value first.

    Ties keep the iteration order of `counts` (the sort is stable).
    """
    ordered = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return [key for key, _ in ordered[:limit]]


class SimpleTagBased(object):
    """Recommend items to a user based on the tags the user has applied.

    All statistics are built once in ``__init__`` and stored per instance:

    user_tag:   {user: {tag: times the user applied the tag}}
    user_item:  {user: [items the user tagged]} -- filtered out of results
    tag_item:   {tag: {item: times the tag was applied to the item}}
    tag_count:  {tag: total times the tag was applied}
    item_count: {item: number of rows (user/item taggings) for the item}
    item_tag:   {item: {tag: times the tag was applied to the item}}
    """

    def __init__(self, data):
        """Build the tag statistics.

        Args:
            data: object exposing ``itertuples(index=False)`` that yields
                ``(user, item, tags)`` rows, e.g. a three-column pandas
                DataFrame. ``tags`` is a space-separated string; rows whose
                ``tags`` is a float (NaN, i.e. missing) are skipped.
        """
        # These used to be class attributes, so every instance mutated the
        # same dicts. Creating them here keeps instances independent.
        self.user_tag = {}
        self.user_item = {}
        self.tag_item = {}
        self.tag_count = {}
        self.item_count = {}
        self.item_tag = {}

        for user, item, tags in data.itertuples(index=False):
            if isinstance(tags, float):
                # Missing tag cell (NaN): nothing to learn from this row.
                continue

            self.user_item.setdefault(user, []).append(item)
            self.item_count[item] = self.item_count.get(item, 0) + 1
            tags_of_item = self.item_tag.setdefault(item, {})
            tags_of_user = self.user_tag.setdefault(user, {})

            for tag in tags.split(' '):
                tag = tag.lower()
                tags_of_user[tag] = tags_of_user.get(tag, 0) + 1
                items_of_tag = self.tag_item.setdefault(tag, {})
                items_of_tag[item] = items_of_tag.get(item, 0) + 1
                self.tag_count[tag] = self.tag_count.get(tag, 0) + 1
                tags_of_item[tag] = tags_of_item.get(tag, 0) + 1

    def recommend(self, user, n_items=5, n_tags=10):
        """Recommend items for `user` and describe them by their tags.

        Args:
            user: key of a user seen during construction.
            n_items: maximum number of items to recommend (default 5).
            n_tags: maximum number of tags used to describe the user and
                each recommended item (default 10).

        Returns:
            ``(userdesc, res)`` where ``userdesc`` is the list of the user's
            most used tags and ``res`` holds, for each recommended item (best
            first), the list of tags most often applied to that item.

        Raises:
            KeyError: if `user` has no recorded tags.
        """
        # Set membership instead of scanning the list for every candidate.
        viewed = set(self.user_item[user])
        utags = self.user_tag[user]

        rank = {}
        for tag, weight in utags.items():
            tag_penalty = math.log(1 + self.tag_count[tag])
            for item, wt in self.tag_item[tag].items():
                if item in viewed:
                    continue
                # Preference of `user` for `item`: both the tag's and the
                # item's counts are damped by a log factor.
                score = (weight * 1.0 / tag_penalty * wt
                         / math.log(1 + self.item_count[item]))
                rank[item] = rank.get(item, 0) + score

        # Each recommended item is described by its most frequent tags.
        res = [_top_keys(self.item_tag[item], n_tags)
               for item in _top_keys(rank, n_items)]
        # The user's interests are described by the user's most used tags.
        userdesc = _top_keys(utags, n_tags)
        return (userdesc, res)


if __name__ == '__main__':
    # NOTE(review): sep=' ' conflicts with the space-separated tag column;
    # the original separator may have been a tab -- confirm against the data.
    data = pd.read_csv('delicious.dat', sep=' ', header=None)
    stb = SimpleTagBased(data=data)
    userdesc, res = stb.recommend(104)
    print(userdesc)
    print(res)
结果:
--用户常用标签
['software', 'webdesign', 'tools', 'dev', 'howto', 'free', 'freeware', 'opensource', 'reference', 'linux']
--所推荐物品具有的标签
['css', 'webdesign', 'reference', 'design', 'web', 'development', 'html', 'tools', 'webdev', 'programming'],
['webdesign', 'templates', 'design', 'css', 'opensource', 'web', 'free', 'html', 'layout', 'template'],
['fonts', 'typography', 'webdesign', 'design', 'tools', 'css', 'web', 'font', 'type', 'reference'],
['opensource', 'software', 'freeware', 'linux', 'free', 'windows', 'tools', 'reference', 'download', 'alternative'],
['freeware', 'software', 'utilities', 'tools', 'free', 'reference', 'list', 'windows', 'download', 'opensource']
# coding=gbk
# Tag-to-tag recommendation: find the tags most similar to a given tag,
# using cosine similarity over per-item tag counts.
import math

import pandas as pd


def _build_item_tag(data):
    """Return {item: {tag: count}} built from ``(user, item, tags)`` rows.

    `data` must expose ``itertuples(index=False)``. ``tags`` is a
    space-separated string; rows whose ``tags`` is a float (NaN) are skipped.
    Tags are lower-cased before counting.
    """
    item_tag = {}
    for _user, item, tags in data.itertuples(index=False):
        if isinstance(tags, float):
            # Missing tag cell (NaN): skip the row.
            continue
        counts = item_tag.setdefault(item, {})
        for tag in tags.split(' '):
            tag = tag.lower()
            counts[tag] = counts.get(tag, 0) + 1
    return item_tag


def recommend(taga, n, item_tag, show_progress=True):
    """Return up to `n` tags most similar to `taga`.

    Similarity between `taga` and another tag is the cosine of their count
    vectors over items: ``sum(a_i * b_i) / (sqrt(sum a_i^2) * sqrt(sum b_i^2))``.

    Args:
        taga: the reference tag.
        n: maximum number of tags to return.
        item_tag: {item: {tag: count}} mapping.
        show_progress: when true (default, the original behaviour), print the
            fraction of items processed after each item.

    Returns:
        List of tags, most similar first. Empty when `taga` never co-occurs
        with another tag.
    """
    nb = {}    # tag -> sum of squared counts over all items
    nab = {}   # tag -> sum of (count of taga * count of tag) over shared items
    na = 0     # sum of squared counts of taga
    total = len(item_tag)

    for position, tags in enumerate(item_tag.values(), 1):
        if show_progress:
            print(position * 1.0 / total)

        av = tags.get(taga)  # None when the item does not carry taga
        for tag, v in tags.items():
            if tag == taga:
                continue
            nb[tag] = nb.get(tag, 0) + v * v
            if av is not None:
                nab[tag] = nab.get(tag, 0) + av * v
        if av is not None:
            na += av * av

    na = math.sqrt(na)
    # nab is empty whenever na == 0, so the division below is never by zero.
    rank = dict((tag, v / na / math.sqrt(nb[tag])) for tag, v in nab.items())
    ordered = sorted(rank.items(), key=lambda pair: pair[1], reverse=True)
    return [tag for tag, _ in ordered[:n]]


if __name__ == '__main__':
    # NOTE(review): sep=' ' conflicts with the space-separated tag column;
    # the original separator may have been a tab -- confirm against the data.
    data = pd.read_csv('delicious.dat', sep=' ', header=None)
    item_tag = _build_item_tag(data)
    tags = recommend('webdesign', 10, item_tag)
    print(tags)
结果:和webdesign相关的标签
['design', 'css', 'web', 'webdev', 'html', 'web_design', 'inspiration', 'xhtml', 'webdevelopment', 'resources']
![](https://images0.cnblogs.com/blog/610239/201504/240903377505493.jpg)
# coding=gbk
# Tag suggestion: when a user tags an item, suggest tags by blending the
# user's own frequent tags with the tags most often applied to the item.
import math

import pandas as pd


def _build_tag_counts(data):
    """Return ``(item_tag, user_tag)`` count tables from ``(user, item, tags)`` rows.

    `data` must expose ``itertuples(index=False)``. ``tags`` is a
    space-separated string; rows whose ``tags`` is a float (NaN) are skipped.
    Tags are lower-cased. ``item_tag`` maps item -> {tag: count} and
    ``user_tag`` maps user -> {tag: count}.
    """
    item_tag = {}
    user_tag = {}
    for user, item, tags in data.itertuples(index=False):
        if isinstance(tags, float):
            # Missing tag cell (NaN): skip the row.
            continue
        of_item = item_tag.setdefault(item, {})
        of_user = user_tag.setdefault(user, {})
        for tag in tags.split(' '):
            tag = tag.lower()
            of_item[tag] = of_item.get(tag, 0) + 1
            of_user[tag] = of_user.get(tag, 0) + 1
    return item_tag, user_tag


def _top_keys(counts, limit):
    """Return up to `limit` keys of `counts`, largest value first (stable)."""
    ordered = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return [key for key, _ in ordered[:limit]]


def recommend(user, item, user_tag, item_tag, alpha, top_n=10):
    """Suggest tags for `user` to apply to `item`.

    Each tag is scored as
    ``(1 - alpha) * user_count / max_user_count + alpha * item_count / max_item_count``.

    Args:
        user: key into `user_tag`.
        item: key into `item_tag`.
        user_tag: {user: {tag: count}}.
        item_tag: {item: {tag: count}}.
        alpha: weight of the item's tags; ``1 - alpha`` weights the user's.
        top_n: maximum length of each returned list (default 10).

    Returns:
        ``(udesc, idesc, res)``: the user's most used tags, the item's most
        applied tags, and the suggested tags, each best first.

    Raises:
        KeyError: if `user` or `item` is unknown.
    """
    utgs = user_tag[user]
    itgs = item_tag[item]
    udesc = _top_keys(utgs, top_n)
    idesc = _top_keys(itgs, top_n)

    rank = {}
    # An empty side contributes nothing (max() of an empty dict would raise).
    # The maxima are floats so that an integer alpha (0 or 1) does not cause
    # truncating integer division under Python 2.
    if utgs:
        maxu = float(max(utgs.values()))
        for tag, v in utgs.items():
            rank[tag] = rank.get(tag, 0) + (1 - alpha) * v / maxu
    if itgs:
        maxi = float(max(itgs.values()))
        for tag, v in itgs.items():
            rank[tag] = rank.get(tag, 0) + alpha * v / maxi

    res = _top_keys(rank, top_n)
    return udesc, idesc, res


if __name__ == '__main__':
    # NOTE(review): sep=' ' conflicts with the space-separated tag column;
    # the original separator may have been a tab -- confirm against the data.
    data = pd.read_csv('delicious.dat', sep=' ', header=None)
    item_tag, user_tag = _build_tag_counts(data)
    udesc, idesc, res = recommend(104, 33911, user_tag, item_tag, 0.8)
    print(udesc)
    print(idesc)
    print(res)
结果:
--用户常用标签
['software', 'webdesign', 'tools', 'dev', 'howto', 'free', 'freeware', 'opensource', 'reference', 'linux']
--物品最常被打的标签
['web', 'softwareagents', 'java', 'howto', 'moviles', 'documentation', 'semantica', 'hpi', 'api', 'agents']
--推荐标签
['howto', 'web', 'moviles', 'softwareagents', 'hpi', 'agents', 'api', 'jade', 'agentes', 'java']