  • Python scraping and jieba word segmentation: analyzing reviews of The Climbers (《攀登者》)

    Scraping and analyzing Douban reviews of The Climbers (《攀登者》)

    0. Project structure

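    The layout below is reconstructed from the paths the three scripts reference; the script names are guesses, everything else appears verbatim in the code:

    project/
    ├── spider.py             # step 1: scrape the reviews (name assumed)
    ├── sentiment.py          # step 2: SnowNLP sentiment scoring (name assumed)
    ├── keywords_cloud.py     # step 3: jieba keywords + word cloud (name assumed)
    ├── climb.csv             # scraped reviews (author, comment, votes)
    ├── climb_snownlp.csv     # reviews with sentiment scores
    ├── simkai.ttf            # KaiTi font used by the word cloud
    └── bg.jpg                # mask image for the word cloud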

    simkai.ttf is a font file; on Windows, the system fonts live under

    C:\Windows\Fonts
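
    To copy it into the project folder, a one-liner (the source path assumes a default Windows install):

    import shutil
    shutil.copy(r"C:\Windows\Fonts\simkai.ttf", "./simkai.ttf")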
    

    1. Scrape the Douban review data

    # -*- coding: utf-8 -*-
    """爬取豆瓣影评"""
    import requests
    from lxml import etree
    import time
    
    url = "https://movie.douban.com/subject/30413052/comments?start=%d&limit=20&sort=new_score&status=P"
    
    # Request headers
    headers = {
        'Host': 'movie.douban.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        #'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Cookie': 'bid=TXwfIvNFTRE; douban-fav-remind=1; __gads=ID=e042951d078c30b3:T=1570518321:S=ALNI_Mbp-ZmoryuBFEnTQy24mwdf0B89ig; __utma=30149280.1448315194.1570518324.1570518324.1572927825.2; __utmz=30149280.1570518324.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _pk_id.100001.4cf6=589509e524ead00f.1572927824.1.1572927824.1572927824.; _pk_ses.100001.4cf6=*; __utmb=30149280.0.10.1572927825; __utmc=30149280; __utma=223695111.1094105223.1572927825.1572927825.1572927825.1; __utmb=223695111.0.10.1572927825; __utmc=223695111; __utmz=223695111.1572927825.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); ap_v=0,6.0',
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'max-age=0'}
    
    if __name__ == '__main__':
        
        f = open("./climb.csv", mode="w", encoding='utf-8')
        f.write("author\tcomment\tvotes\n")
        
        # start: 0, 20, 40, ..., 200 -- this endpoint only serves the first 11 pages
        for i in range(11):
            # 1. Build the URL for this page
            url_climb = url % (i * 20)
                
            # 2. Send the request, set the encoding, and get the page text
            response = requests.get(url_climb, headers=headers)
            response.encoding = "utf-8"
            text = response.text
            
            # Optionally cache the raw HTML:
            #with open("./climb.html", mode="w", encoding="utf-8") as f:
            #    f.write(text)
                
            # 3. Parse with etree
            html = etree.HTML(text)
            comments = html.xpath('//div[@id="comments"]/div[@class="comment-item"]')
            for comment in comments:
                # Reviewer name
                author = comment.xpath('./div[@class="avatar"]/a/@title')[0].strip()
                # Review text
                p = comment.xpath('.//span[@class="short"]/text()')[0].strip()
                
                # Upvote count for this review
                vote = comment.xpath('.//span[@class="votes"]/text()')[0].strip()
                
                #print(author, p, vote)
                f.write("%s\t%s\t%s\n" % (author, p, vote))
           
            # Progress message; sleep one second between pages to avoid anti-scraping blocks
            print("Saved page %d" % (i + 1))
            time.sleep(1)
                
        f.close()      
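
    The loop above assumes every request succeeds. If Douban throttles or a connection drops, a small retry helper keeps the run alive. This is a sketch; the retry count, timeout, and backoff values are arbitrary choices, not something from the original script:

    import time
    import requests

    def get_with_retry(url, headers, retries=3, backoff=2.0):
        """Fetch a URL, retrying on network errors and non-200 responses."""
        for attempt in range(retries):
            try:
                response = requests.get(url, headers=headers, timeout=10)
                if response.status_code == 200:
                    return response
                print("HTTP %d on attempt %d" % (response.status_code, attempt + 1))
            except requests.RequestException as e:
                print("Request failed on attempt %d: %s" % (attempt + 1, e))
            time.sleep(backoff * (attempt + 1))  # wait longer after each failure
        return None

    Swapping it in is a one-line change: response = get_with_retry(url_climb, headers).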
    

    2. Sentiment analysis of the reviews

    # -*- coding: utf-8 -*-
    """
    pandas:python data analysis lib,返回值为DataFrame(行,列),行是样本,列为属性    
    """
    import pandas as pd
    from snownlp import SnowNLP
    
    # Show all columns when printing
    pd.set_option('display.max_columns', None)
    
    def convert(comment):
        """Run sentiment analysis on a single review."""
        snow = SnowNLP(str(comment))
        sentiments = snow.sentiments  # score in [0, 1]: 0 = negative, 1 = positive
        return sentiments
    
    if __name__ == '__main__':
        
        data = pd.read_csv('./climb.csv', sep='\t')
        #print(data.head(), "\n", data.shape)
        
        # Score every review; this adds a new column named '情感评分' (sentiment score)
        data['情感评分'] = data.comment.apply(convert)
        data.sort_values(by='情感评分', ascending=False, inplace=True)
        
        # Save the scored data
        data.to_csv('./climb_snownlp.csv', sep='\t', index=False, encoding='utf-8')
        
        print(data[:5])
        print(data[-5:])
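
    With the scores saved, a quick tally shows how the audience splits. The 0.5 cutoff is the usual rule of thumb for SnowNLP scores, not a threshold fixed by the library:

    import pandas as pd

    data = pd.read_csv('./climb_snownlp.csv', sep='\t')
    positive = (data['情感评分'] >= 0.5).sum()  # scores at or above 0.5 treated as positive
    negative = (data['情感评分'] < 0.5).sum()
    print("positive: %d, negative: %d (%.1f%% positive)"
          % (positive, negative, 100.0 * positive / len(data)))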
    

    3. jieba word segmentation, keyword bar chart, and word cloud

    # -*- coding: utf-8 -*-
    
    import pandas as pd
    import jieba
    from jieba import analyse
    import matplotlib.pyplot as plt
    import numpy as np
    import wordcloud
    from PIL import Image
    
    if __name__ == '__main__':
        data = pd.read_csv('./climb.csv', sep='\t')
        
        # List comprehension: join every review into one string
        comments = ';'.join([str(c) for c in data['comment'].tolist()])
        #print(comments)
        
        # Segment the text with jieba; cut() returns a generator
        gen_ret = jieba.cut(comments)
        seg_words = '/'.join(gen_ret)
        #print(seg_words)
        
        # Extract keywords: topK = number of keywords to return, withWeight = include TF-IDF weights
        tags_ret = analyse.extract_tags(seg_words, topK=500, withWeight=True)
        #print(tags_ret)
        # Put the results into a DataFrame
        df_ret = pd.DataFrame(tags_ret, columns=['词语', '重要性'])
        df_ret.sort_values(by='重要性', ascending=False, inplace=True)  # sort by importance, descending
        #print(df_ret)
        
        # Visualize: of the 500 keywords, plot the top 20
        plt.barh(y=np.arange(0, 20), width=df_ret[:20]['重要性'][::-1])
        plt.xlabel('Importance')  # importance runs along the x axis of a horizontal bar chart
        plt.yticks(np.arange(0, 20), labels=df_ret[:20]['词语'][::-1], fontproperties='KaiTi')
        # Save the bar chart -- savefig must be called before show(); dpi sets the pixel density
        plt.savefig('./条形图_20个keyword.jpg', dpi=200)
        plt.show()
        
        # Word cloud
        bg = np.array(Image.open('./bg.jpg'))  # mask image that shapes the cloud
        words = dict(tags_ret)  # (keyword, weight) tuples -> dict of frequencies
        cloud = wordcloud.WordCloud(width=1200, height=968,
                            font_path='./simkai.ttf',  # font that can render Chinese
                            background_color='white', mask=bg,
                            max_words=500, max_font_size=150)
        # Render the cloud from the keyword weights
        word_cloud = cloud.generate_from_frequencies(words)
        plt.figure(figsize=(12, 12))
        plt.imshow(word_cloud)
        # Save the word cloud (again, before show())
        plt.savefig('./攀登者词云.jpg', dpi=200)
        plt.show()  
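
    Saving through plt.savefig re-renders the cloud at the figure's resolution. WordCloud can also write the image directly at its native 1200x968 size via to_file (the output filename here is just an example):

    # Save at the WordCloud's native resolution, bypassing matplotlib
    word_cloud.to_file('./攀登者词云_tofile.jpg')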
    
  • Original post: https://www.cnblogs.com/zxfei/p/11799044.html