  • Python web scraping and jieba word segmentation: analyzing reviews of The Climbers (《攀登者》)

    Scraping and analyzing reviews of The Climbers (《攀登者》)

    0. Project structure

    (screenshot of the project layout)

    simkai.ttf is a font file; the fonts bundled with Windows can be found under:

    C:\Windows\Fonts
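
    A minimal sketch for copying that font into the project folder (simkai.ttf is the KaiTi face; the source path below is the standard Windows font directory and may differ on your machine):

    # -*- coding: utf-8 -*-
    """Copy the KaiTi font into the project folder (Windows only).
    The source path is assumed to be the standard Windows font directory."""
    import shutil

    shutil.copy(r'C:\Windows\Fonts\simkai.ttf', './simkai.ttf')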
    

    1. Scraping the Douban review data

    # -*- coding: utf-8 -*-
    """爬取豆瓣影评"""
    import requests
    from lxml import etree
    import time
    
    url = "https://movie.douban.com/subject/30413052/comments?start=%d&limit=20&sort=new_score&status=P"
    
    #Request headers
    headers = {'Host': 'movie.douban.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    #'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Cookie': 'bid=TXwfIvNFTRE; douban-fav-remind=1; __gads=ID=e042951d078c30b3:T=1570518321:S=ALNI_Mbp-ZmoryuBFEnTQy24mwdf0B89ig; __utma=30149280.1448315194.1570518324.1570518324.1572927825.2; __utmz=30149280.1570518324.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _pk_id.100001.4cf6=589509e524ead00f.1572927824.1.1572927824.1572927824.; _pk_ses.100001.4cf6=*; __utmb=30149280.0.10.1572927825; __utmc=30149280; __utma=223695111.1094105223.1572927825.1572927825.1572927825.1; __utmb=223695111.0.10.1572927825; __utmc=223695111; __utmz=223695111.1572927825.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); ap_v=0,6.0',
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'max-age=0'}
    
    if __name__ == '__main__':
        
        f = open("./climb.csv", mode="w", encoding='utf-8')
        f.write("author	comment	votes
    ")
        
        #start: 0, 20, 40, ..., 200
        for i in range(11):#range is half-open, so i runs 0..10
            #1. Build the page URL; only the first 11 pages of comments can be fetched
            if i == 10:#last page
                url_climb = url%(200)
            else:
                url_climb = url%(i*20)
                
            #2. Send the request, set the encoding and get the response text
            response = requests.get(url_climb, headers = headers)
            response.encoding = "utf-8"
            text = response.text
            
            #Save the raw page (optional)
            #with open("./climb.html", mode="w", encoding="utf-8") as f:
            #    f.write(text)
                
            #Parse the HTML with lxml etree
            html = etree.HTML(text)
            comments = html.xpath('//div[@id="comments"]/div[@class="comment-item"]')
            for comment in comments:
                #Reviewer name
                author = comment.xpath('./div[@class="avatar"]/a/@title')[0].strip()
                #Review text
                p = comment.xpath('.//span[@class="short"]/text()')[0].strip()
                
                #Upvote count for this review
                vote = comment.xpath('.//span[@class="votes"]/text()')[0].strip()
                
                #print(author, p, vote)
                f.write("%s	%s	%s
    " % (author,p,vote))
           
            #Print progress and sleep one second to avoid anti-scraping measures
            print("Data for page %d saved successfully" % (i+1))
            time.sleep(1)
                
        f.close()      
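
    Each review sits in a div with class comment-item under the div with id comments, which is what the XPath expressions above rely on. A minimal sketch for checking those selectors offline against the saved climb.html (the commented-out dump above), assuming that file exists:

    # -*- coding: utf-8 -*-
    """Offline sanity check of the XPath selectors against a saved page.
    Assumes ./climb.html was written by the commented-out dump above."""
    from lxml import etree

    with open("./climb.html", mode="r", encoding="utf-8") as f:
        html = etree.HTML(f.read())

    items = html.xpath('//div[@id="comments"]/div[@class="comment-item"]')
    print("comment items found:", len(items))  # expect 20 per page
    for item in items[:3]:
        author = item.xpath('./div[@class="avatar"]/a/@title')[0].strip()
        votes = item.xpath('.//span[@class="votes"]/text()')[0].strip()
        print(author, votes)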
    

    2. Sentiment analysis of the reviews

    # -*- coding: utf-8 -*-
    """
    pandas:python data analysis lib,返回值为DataFrame(行,列),行是样本,列为属性    
    """
    import pandas as pd
    from snownlp import SnowNLP
    
    # Show all columns when printing
    pd.set_option('display.max_columns', None)
    
    def convert(comment):
        """将传入的评论进行情感分析"""
        snow = SnowNLP(str(comment))
        sentiments = snow.sentiments#score from 0 (negative review) to 1 (positive review)
        return sentiments
    
    if __name__ =='__main__':
        
        data = pd.read_csv('./climb.csv', sep='\t')
        #print(data.head(), "\n", data.shape)
        
        #Run sentiment analysis on each comment; this adds a new column named '情感评分' (sentiment score) to the DataFrame
        data['情感评分'] = data.comment.apply(convert)
        data.sort_values(by='情感评分', ascending=False, inplace=True)
        
        #Save the scored data
        data.to_csv('./climb_snownlp.csv', sep='\t', index=False, encoding='utf-8')
        
        print(data[:5])
        print(data[-5:])
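
    SnowNLP's sentiments score lies in [0, 1], with values near 1 indicating a positive review. A quick way to summarize the scored file is to bucket reviews around a cut-off; the 0.5 threshold in this sketch is an assumption, not something SnowNLP prescribes:

    # -*- coding: utf-8 -*-
    """Summarize the sentiment scores saved by the step above.
    The 0.5 cut-off between negative and positive is an assumed threshold."""
    import pandas as pd

    data = pd.read_csv('./climb_snownlp.csv', sep='\t')
    positive = (data['情感评分'] >= 0.5).sum()
    negative = (data['情感评分'] < 0.5).sum()
    print("positive: %d, negative: %d, positive ratio: %.1f%%"
          % (positive, negative, 100.0 * positive / len(data)))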
    

    3. jieba segmentation of the review data, keyword bar chart and word cloud

    # -*- coding: utf-8 -*-
    
    import pandas as pd
    import jieba
    from jieba import analyse
    import matplotlib.pyplot as plt
    import numpy as np
    import wordcloud
    from PIL import Image
    
    if __name__ == '__main__':
        data = pd.read_csv('./climb.csv', sep='\t')
        
        #List comprehension: join all review texts into one string
        comments = ';'.join([str(c) for c in data['comment'].tolist()])
        #print(comments)
        
        #Segment the text with jieba; cut() returns a generator
        gen_ret = jieba.cut(comments)
        seg_words = '/'.join(gen_ret)
        #print(seg_words)
        
        #Extract keywords from the segmented text; topK is the number of keywords returned, withWeight also returns the TF-IDF weight
        tags_ret = analyse.extract_tags(seg_words, topK=500, withWeight=True)
        #print(tags_ret)
        #Convert the results to a DataFrame
        df_ret = pd.DataFrame(tags_ret, columns=['词语', '重要性'])
        df_ret.sort_values(by='重要性', ascending=False, inplace=True)#sort by importance, descending
        #print(df_ret)
        
        #Visualization: plot the top 20 of the 500 keywords
        plt.barh(y=np.arange(0,20), width=df_ret[:20]['重要性'][::-1])
        plt.xlabel('Importance')
        plt.yticks(np.arange(0,20), labels=df_ret[:20]['词语'][::-1], fontproperties='KaiTi')
        #Save the bar chart; savefig must be called before show(), dpi sets the output resolution
        plt.savefig('./条形图_20个keyword.jpg', dpi=200)
        plt.show()
        
        #Word cloud
        bg = np.array(Image.open('./bg.jpg'))#mask image for the word cloud
        words = dict(tags_ret)#convert the (keyword, weight) pairs to a dict
        cloud = wordcloud.WordCloud(width=1200, height=968,
                            font_path='./simkai.ttf',#font path (needed for Chinese text)
                            background_color='white', mask=bg,
                            max_words=500, max_font_size=150)
        #Render the word cloud from the keyword frequencies
        word_cloud = cloud.generate_from_frequencies(words)
        plt.figure(figsize=(12,12))
        plt.imshow(word_cloud)
        #Save the word cloud image
        plt.savefig('./攀登者词云.jpg', dpi=200)
        plt.show()  
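
    extract_tags segments the text internally and ranks words by TF-IDF, so common filler words can crowd both the bar chart and the cloud. jieba.analyse can filter them through an external stop-word list; a minimal sketch, assuming a hypothetical stopwords.txt with one word per line:

    # -*- coding: utf-8 -*-
    """Optional: apply a stop-word list before keyword extraction.
    stopwords.txt is an assumed file, one stop word per line."""
    import pandas as pd
    from jieba import analyse

    analyse.set_stop_words('./stopwords.txt')  # exclude these words from extract_tags results

    data = pd.read_csv('./climb.csv', sep='\t')
    comments = ';'.join(str(c) for c in data['comment'].tolist())
    tags_ret = analyse.extract_tags(comments, topK=500, withWeight=True)
    print(tags_ret[:10])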
    
  • Original post: https://www.cnblogs.com/zxfei/p/11799044.html