zoukankan      html  css  js  c++  java
  • python 新闻推荐系统(基于新闻内容推荐)

    # -*- coding:utf-8 -*-
    __version__ = '1.0.0.0'
    """
    @brief : 基于新闻的内容推荐系统
    @details: 详细信息
    @author : zhphuang
    @date : 2019-08-07
    """
    import re

    import jieba
    import numpy as np
    from bs4 import BeautifulSoup
    from pandas import *
    from sklearn import feature_extraction
    from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
    from sklearn.metrics import pairwise_distances

    from mysql_util import MySqlUtils


    class ContentRecommend(object):
        """Content-based news recommender using TF-IDF and cosine distance.

        News bodies are read from MySQL, stripped of HTML with BeautifulSoup,
        segmented with jieba, vectorised with ``TfidfVectorizer`` and compared
        pairwise with the cosine metric. The resulting matrix is stored in
        ``self.distance_matrix``; row ``i`` corresponds to row ``i`` of
        ``self.df``. Only the first ``all_count`` rows of ``self.df`` are
        vectorised, so the matrix may have fewer rows than the frame.
        """

        # Matches a run of CJK unified ideographs (U+4E00..U+9FA5). Segments
        # without any such character (punctuation, digits, Latin text) are
        # dropped before vectorisation.
        _ZH_PATTERN = re.compile(r'[\u4e00-\u9fa5]+')

        # Connection settings used when the caller does not supply any.
        _DEFAULT_DATABASE_CONFIG = {
            "host": "127.0.0.1",
            "port": 3306,
            "user": "root",
            "passwd": "root",
            "db": "recommend",
            "charset": "utf8",
        }

        def __init__(self, database_config=None, recommend_count=10, all_count=1000):
            """Connect to the database and build the distance matrix.

            :param database_config: optional dict passed to ``MySqlUtils``;
                when ``None`` the built-in local defaults are used.
            :param recommend_count: number of similar articles to report.
            :param all_count: maximum number of articles to vectorise.
            """
            if database_config is None:
                database_config = self._DEFAULT_DATABASE_CONFIG
            # Copy so the class-level default is never handed out for mutation.
            self.con = MySqlUtils(dict(database_config))

            self.recommend_count = recommend_count
            self.all_count = all_count
            self.reload_data()

        @classmethod
        def _segment(cls, content):
            """Return the Chinese jieba segments of an HTML string, space-joined."""
            # A NULL content column arrives as None; treat it as an empty body.
            text = BeautifulSoup(content or "", "html.parser").text
            return ' '.join(
                seg for seg in jieba.cut(text) if cls._ZH_PATTERN.search(seg)
            )

        def _rank_similar(self, index):
            """Return positions of the nearest articles to ``index``.

            At most ``self.recommend_count`` positions are returned, ordered by
            ascending cosine distance, and ``index`` itself is never included.
            """
            row = np.asarray(self.distance_matrix[index])
            # Stable sort keeps ties in row order, so the result is deterministic.
            order = np.argsort(row, kind='stable')
            # Drop the article itself explicitly: a duplicate article also has
            # distance 0, so "skip the first entry" is not reliable.
            order = order[order != index]
            return order[:self.recommend_count]

        def reload_data(self):
            """Reload articles from the database and rebuild ``distance_matrix``."""
            sql = (
                "select a.news_id,a.content,b.source_url as url "
                "from news_content as a left join news as b on a.news_id=b.news_id"
            )
            self.df = DataFrame(
                list(self.con.query(sql)),
                columns=['news_id', 'content', 'url'],
            )

            contents = self.df['content'].iloc[:self.all_count]
            corpus = [self._segment(content) for content in contents]

            if not any(corpus):
                # Nothing to vectorise: TfidfVectorizer would raise on an empty
                # vocabulary, so expose an empty matrix instead.
                self.distance_matrix = np.empty((0, 0))
                return

            # TfidfVectorizer computes the TF-IDF weight of every term.
            # NOTE(review): with its default token_pattern, single-character
            # tokens are ignored -- confirm this is intended for Chinese text.
            vectorizer = TfidfVectorizer()
            tfidf = vectorizer.fit_transform(corpus)
            # Vocabulary size; the sparse matrix is deliberately not densified.
            print(len(vectorizer.vocabulary_))

            self.distance_matrix = pairwise_distances(
                tfidf,
                metric='cosine'
            )

        def analysis(self):
            """Print, for every vectorised article, its nearest articles."""
            for index in range(len(self.distance_matrix)):
                row = self.distance_matrix[index]
                print("=" * 10 + "与%s相似的文章有:" % self.df.iloc[index, 0] + "=" * 10)
                for other in self._rank_similar(index):
                    print(self.df.iloc[other, 0], "余弦距离:%s" % row[other])

        def get_similar_news_id(self, news_id):
            """Print the articles most similar to ``news_id``.

            Ids are compared by their string form so that an id typed at the
            prompt matches a numeric id column. Every row with a matching id is
            reported; a message is printed when there is none.
            """
            # Only rows that were vectorised have a row in the distance matrix.
            known_ids = self.df.iloc[:len(self.distance_matrix), 0]
            wanted = str(news_id)
            matches = [pos for pos, value in enumerate(known_ids) if str(value) == wanted]

            if not matches:
                print("未找到newsId为%s的文章" % news_id)
                return

            for index in matches:
                row = self.distance_matrix[index]
                print(
                    "=" * 10
                    + "与%s相似的文章有:" % self.df.iloc[index, 0]
                    + "(链接地址:%s)" % self.df.iloc[index, 2]
                    + "=" * 10
                )
                for other in self._rank_similar(index):
                    # The value is a cosine distance (smaller = more similar).
                    print(
                        self.df.iloc[other, 0],
                        "余弦距离:%s" % row[other],
                        "链接地址:%s" % self.df.iloc[other, 2],
                    )

        def get_recommend_news_by_uid(self, uid):
            """Placeholder for per-user recommendation; currently does nothing."""
            pass

        def input_news_id(self):
            """Prompt for news ids in a loop until EOF or Ctrl-C."""
            while True:
                print("输入newsId:")
                try:
                    news_id = input().strip()
                except (EOFError, KeyboardInterrupt):
                    # End of input or user interrupt: leave the prompt cleanly.
                    break
                if not news_id:
                    continue
                self.get_similar_news_id(news_id)


    if __name__ == '__main__':
        # Script entry point: build the recommender (which loads the data)
        # and hand control to the interactive news-id prompt.
        recommender = ContentRecommend()
        recommender.input_news_id()

    当值未一旬,而视茫茫,而发苍苍,而齿牙动摇
  • 相关阅读:
    ExtJs005继承
    ExtJs004define定义类
    ExtJS笔记
    解决vscode-pandoc插件生成pdf中文显示问题
    UDP学习笔记(.NET)
    WPF类库不能添加资源词典(xaml)的解决办法
    解决win10下获取操作系统版本为6.2.900(win8)的问题
    [转] Unit Test 访问Internal类型和方法
    VS2017使用小技巧(持续更新。。。)
    [转]Github遇到Permanently added the RSA host key for IP address '192.30.252.128' to the list of known host
  • 原文地址:https://www.cnblogs.com/niuniuc/p/11314661.html
Copyright © 2011-2022 走看看