zoukankan html css js c++ java

爬取豆瓣电影影评，生成wordcloud词云，并利用监督学习根据评论自动打星

本文的完整源码在git位置：https://github.com/OceanBBBBbb/douban-ml

爬取豆瓣影评

爬豆瓣的影评比较简单，豆瓣没有做限制，甚至不用登录就可以看到全部内容。我这里用 bs4 和 urllib 来获取页面信息：

# 获取页面
def get_html(url):
    """Download *url* and return the page parsed as a BeautifulSoup tree.

    The request is sent with a desktop-browser ``User-Agent`` header. The
    response body is decoded as UTF-8 (undecodable bytes are ignored) and
    parsed with the ``lxml`` parser.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``BeautifulSoup`` object built from the downloaded HTML.

    Any error raised while opening or reading the URL propagates to the
    caller; nothing is caught here.
    """
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
    }
    target_req = request.Request(url=url, headers=head)
    # Use a context manager so the HTTP response is closed even when
    # reading or decoding fails.
    with request.urlopen(target_req) as target_response:
        target_html = target_response.read().decode('utf-8', 'ignore')
    # Build the BeautifulSoup object from the decoded HTML.
    return BeautifulSoup(target_html, 'lxml')

获取页面后，解析页面上的评论内容，这里我拿了给出的评分、短评和展示的部分详细评价。

评分在这个页面里不是一个明显的数值，豆瓣用5种不同的class来表示用户的评星。我这里用了一个遍历这5种class来找评分的办法。

# 解析豆瓣评论并存到文件中
def resolu_soup(soup_page):
    """Extract every review on a parsed page and append it to two text files.

    For each ``<div class="main review-item">`` found in *soup_page*:

    * the star rating (1-5) is worked out from the class of the rating
      ``<span>``; it stays 0 when no rating span is present,
    * ``"<title> <score>"`` is appended to ``../doc/data_fkwxr.txt``
      (the labelled sample file),
    * ``"<title> <excerpt>"`` is appended to ``../doc/fkwxr.txt``
      (the plain text used for review analysis).

    Spaces are stripped from the title and the excerpt, so the single space
    written between the fields is the only one on each line.

    Args:
        soup_page: Parsed page object exposing ``find_all``.

    Returns:
        None. The results are written to the two files and the score of
        each review is printed.
    """
    reviews = soup_page.find_all('div', class_='main review-item')
    for each_review in reviews:
        # The rating is encoded in the span's class, e.g.
        # "allstar40 main-title-rating" for four stars, so probe the five
        # possible class values in turn.
        score = 0  # 0 means the reviewer left no rating
        for i in range(1, 6):
            class_name = 'allstar' + str(i * 10) + ' main-title-rating'
            if each_review.find_all('span', class_=class_name):
                score = i
        print("这个人的评分是" + str(score))

        # The <h2> holds the review title (the short review).
        short_review = each_review.find('h2').text.replace(' ', '')
        # Labelled sample for training; the score may be 0 (unrated).
        with open("../doc/data_fkwxr.txt", "a", encoding='utf-8') as f:
            f.write(short_review + " " + str(score) + '\n')

        # <div class="short-content"> holds the visible part of the full
        # review; drop the trailing "expand" link text.
        long_review = each_review.find('div', class_='short-content').text.replace(' ', '').replace('(展开)', '')
        # Plain text used later for analysing the movie's reviews.
        with open("../doc/fkwxr.txt", "a", encoding='utf-8') as f:
            f.write(short_review + " " + long_review + '\n')

打开文本的方式 with open("../doc/data_fkwxr.txt", "a", encoding='utf-8') 中的第二个参数是文件的打开模式，需要提前了解一下：常用的有 'r'（只读）、'w'（覆盖写入）和 'a'（追加写入）。

这里的 'a' 就是以追加的方式进行写入，文件原有的内容不会被清空。

使用word-cloud直观看影评

因为我们拉取的文本是用户自由输入的，在使用word_cloud分析前，可以先用jieba分词预处理一下：

def build_key_word(path):
    """Build word-frequency features from the UTF-8 text file at *path*.

    Every line is stripped and segmented with ``jieba.cut``. Tokens of
    length 1 are dropped, the remaining tokens are counted, and the most
    frequent 30% of the distinct tokens are kept.

    Args:
        path: Path of the text file to analyse.

    Returns:
        A dict mapping each kept word to its number of occurrences, ordered
        from most to least frequent.
    """
    # Local import so the function works without relying on a file-level
    # import of collections.
    from collections import Counter

    with open(path, encoding="utf-8") as fp:
        counts = Counter(
            word
            for line in fp
            for word in jieba.cut(line.strip())
            if len(word) > 1  # keep meaningless one-character tokens out of the statistics
        )
    # sorted() is stable, so words with equal counts keep first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    size = int(len(ranked) * 0.3)  # keep the top 30%
    return dict(ranked[:size])

然后使用wordcloud生成图就可以了，这里在wordcloud官网有入门的详细教程，只是要得到一个图的话，还是比较简单的。

这里可以看一下电影《新喜剧之王》的效果图：

使用朴素贝叶斯算法进行分类训练，预测评论标题将给的评星

这里我为了图方便，前面存储数据的时候，存的就是影评的标题加已知的星级，没有给评星的就记为0。在训练前，我把没有评星的样本筛了出来，根据评论内容按常理自己给出了一个评星，不然数据实在是太少了。

# 简单的监督学习例子
# 使用朴素贝叶斯算法进行分类训练。
from nltk.stem import WordNetLemmatizer
from nltk.classify import NaiveBayesClassifier
import pickle
import jieba

def proc_text(text):
    """Convert *text* into the feature dict used by the nltk classifier.

    The text is segmented with jieba in full mode (``cut_all=True``) and
    each token is passed through ``WordNetLemmatizer.lemmatize``.

    Args:
        text: The comment text to convert.

    Returns:
        A dict mapping every lemmatized token to ``True``, meaning "this
        word occurs in the text".
    """
    # Word segmentation.
    tokens = jieba.cut(text, cut_all=True)
    # Lemmatization.
    lemmatizer = WordNetLemmatizer()
    features = {}
    for token in tokens:
        # True marks the word as present, as the nltk classifiers expect.
        features[lemmatizer.lemmatize(token)] = True
    return features

def train_data_from_txt(model_name, txt):
    """Train a Naive Bayes classifier from a labelled text file and pickle it.

    Each non-blank line of *txt* must hold a comment and an integer label
    separated by a space, e.g. ``"some-title 4"``. The line is split at its
    last space, so the label is always the final field. Blank lines are
    skipped.

    Args:
        model_name: Path of the file the trained model is pickled to.
        txt: Path of the UTF-8 training file.

    Returns:
        None. The trained model is written to *model_name*.

    Raises:
        IndexError: If a non-blank line contains no space-separated label.
        ValueError: If the label is not an integer.
    """
    train_data = []
    with open(txt, encoding="utf-8") as fp:
        for line in fp:
            record = line.strip()
            if not record:
                continue  # tolerate blank or trailing empty lines
            txt_split = record.rsplit(' ', 1)
            train_data.append((proc_text(txt_split[0]), int(txt_split[1])))
    # Train once the input file has been closed.
    nb_model = NaiveBayesClassifier.train(train_data)
    # Persist the trained model; "with" closes the file even if dump fails.
    with open(model_name, 'wb') as f:
        pickle.dump(nb_model, f)

if __name__ == '__main__':
    # Uncomment to train a new model before running the prediction below:
    # train_data_from_txt('data_xxjzw_classifier.pickle','../doc/data_xxjzw.txt')

    # Try the saved model on a sample comment.
    text6 = '看预告片觉得这真是一部好极了的电影，期待正式上映'
    # NOTE: pickle.load can execute arbitrary code, so only load model files
    # you produced yourself. "with" closes the file even if loading fails.
    with open('data_xxjzw_classifier.pickle', 'rb') as f:
        classifier = pickle.load(f)  # loading the saved model is all that is needed
    print("预测本评论给出的评分："+str(classifier.classify(proc_text(text6))))

查看全文

相关阅读:
11 数值的整数次方
 10 二进制中1的个数
 6 重建二叉树
 5 从尾到头打印链表
 计算机网络面试题
 Http和Https的区别
 UVALive 7749 Convex Contour (计算几何)
Gym 101190H Hard Refactoring (模拟坑题)
UVa 11324 The Largest Clique (强连通分量+DP)
HDU 6006 Engineer Assignment (状压DP)

原文地址：https://www.cnblogs.com/MyOceansWeb/p/10343062.html