  • Python final project

    Word Cloud: Scraping Movie Reviews with Python

    1. Scraping the Web Page Data

    Step 1: Fetch the raw HTML of the Douban "now playing" page.

    from urllib import request
    resp = request.urlopen('https://movie.douban.com/nowplaying/hangzhou/')
    html_data = resp.read().decode('utf-8')
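    Note: Douban sometimes rejects requests that lack a browser-like User-Agent. A minimal sketch of a more robust fetch (the header value is an assumption, not part of the original):

    from urllib import request

    # Douban may return HTTP 403 for urllib's default User-Agent;
    # a browser-like header (assumed value) makes the request more reliable.
    req = request.Request(
        'https://movie.douban.com/nowplaying/hangzhou/',
        headers={'User-Agent': 'Mozilla/5.0'},
    )
    html_data = request.urlopen(req).read().decode('utf-8')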

    Step 2: Parse the fetched HTML.

    from bs4 import BeautifulSoup as bs

    soup = bs(html_data, 'html.parser')
    nowplaying_movie = soup.find_all('div', id='nowplaying')
    nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item')

    In the page source, each list item's data-subject attribute holds the movie's id, and the alt attribute of its img tag holds the movie's title; these two attributes give us the id and name of every movie currently showing.

    nowplaying_list = []
    for i in nowplaying_movie_list:
        nowplaying_dict = {}
        nowplaying_dict['id'] = i['data-subject']
        for tag_img_item in i.find_all('img'):
            nowplaying_dict['name'] = tag_img_item['alt']
            nowplaying_list.append(nowplaying_dict)
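    Each entry of nowplaying_list is then a small dict; for example (the values are hypothetical, only the structure comes from the code above):

    print(nowplaying_list[0])
    # {'id': '…', 'name': '…'}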

    2. Processing the Data

    With a movie id in hand, we fetch its short comments page by page (see getCommentsById in the full source below) and concatenate the resulting list eachCommentList into a single string:

    comments = ''
    for k in range(len(eachCommentList)):
        comments = comments + (str(eachCommentList[k])).strip()
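    Equivalently, a single join is more idiomatic and avoids repeated string concatenation (a sketch with the same behavior):

    comments = ''.join(str(c).strip() for c in eachCommentList)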


    3. Generating the Word Cloud Image

    import matplotlib.pyplot as plt
    %matplotlib inline
    
    import matplotlib
    matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
    from wordcloud import WordCloud  # word cloud package
    
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white", max_font_size=80)
    # words_stat is the word-frequency table built in the full script below
    word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}

    # Current versions of wordcloud expect a {word: frequency} dict here,
    # so the dict is passed to fit_words directly (no tuple list needed).
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
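    To display the image cleanly, hide the axes before showing it (the full script below does the same):

    plt.axis("off")
    plt.show()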


    Appendix: full source code
    
    # -*- coding: utf-8 -*-
    
    import warnings
    warnings.filterwarnings("ignore")
    import jieba   # Chinese word segmentation
    import numpy   # numerical computing package
    import codecs  # codecs.open reads a file in a given encoding, decoding to unicode (not actually used below)
    import re
    import pandas as pd
    import matplotlib.pyplot as plt
    from PIL import Image
    from urllib import request
    from bs4 import BeautifulSoup as bs
    from wordcloud import WordCloud, ImageColorGenerator  # word cloud package
    import matplotlib
    matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
    
    
    
    # parse the now-playing page for movie ids and titles
    def getNowPlayingMovie_list():
        resp = request.urlopen('https://movie.douban.com/nowplaying/hangzhou/')
        html_data = resp.read().decode('utf-8')
        soup = bs(html_data, 'html.parser')
        nowplaying_movie = soup.find_all('div', id='nowplaying')
        nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item')
        nowplaying_list = []
        for item in nowplaying_movie_list:
            nowplaying_dict = {}
            nowplaying_dict['id'] = item['data-subject']
            for tag_img_item in item.find_all('img'):
                nowplaying_dict['name'] = tag_img_item['alt']
                nowplaying_list.append(nowplaying_dict)
        return nowplaying_list
    
    # fetch one page of short comments for a movie id
    def getCommentsById(movieId, pageNum):
        eachCommentList = []
        if pageNum > 0:
            start = (pageNum - 1) * 20
        else:
            return False
        requrl = 'https://movie.douban.com/subject/' + movieId + '/comments' + '?' + 'start=' + str(start) + '&limit=20'
        print(requrl)
        resp = request.urlopen(requrl)
        html_data = resp.read().decode('utf-8')
        soup = bs(html_data, 'html.parser')
        comment_div_list = soup.find_all('div', class_='comment')
        for item in comment_div_list:
            if item.find_all('p')[0].string is not None:
                eachCommentList.append(item.find_all('p')[0].string)
        return eachCommentList
    
    def main():
        # fetch the first 10 pages of comments for the first movie in the list
        commentList = []
        NowPlayingMovie_list = getNowPlayingMovie_list()
        for i in range(10):
            num = i + 1
            commentList_temp = getCommentsById(NowPlayingMovie_list[0]['id'], num)
            commentList.extend(commentList_temp)  # extend, not append: keep a flat list of comment strings
    
        # join the comments into a single string
        comments = ''
        for k in range(len(commentList)):
            comments = comments + (str(commentList[k])).strip()
    
        # keep only Chinese characters (this also strips punctuation, digits, and Latin text)
        pattern = re.compile(r'[\u4e00-\u9fa5]+')
        filterdata = re.findall(pattern, comments)
        cleaned_comments = ''.join(filterdata)
    
        # segment the text with jieba
        segment = jieba.lcut(cleaned_comments)
        words_df = pd.DataFrame({'segment': segment})
    
        # remove stopwords (stopwords.txt: one stopword per line)
        stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3, sep="\t",
                                names=['stopword'], encoding='utf-8')  # quoting=3: no quote handling
        words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
    
        # count word frequencies (the dict form of .agg() is removed in modern pandas)
        words_stat = words_df.groupby('segment').size().reset_index(name='计数')
        words_stat = words_stat.sort_values(by='计数', ascending=False)
        #  print(words_stat.head())
    
        bg_pic = numpy.array(Image.open("alice_mask.png"))
    
        # configure the word cloud renderer
        wordcloud = WordCloud(
            font_path="simhei.ttf",   # a font with CJK glyphs is required for Chinese text
            background_color="white",
            max_font_size=80,
            width=2000,
            height=1800,
            mask=bg_pic,              # shape the cloud with the mask image
            mode="RGBA"
        )
        word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
        # print(word_frequence)
        """
        Legacy note: older versions of wordcloud expected a list of
        (word, count) tuples rather than a dict:
            word_frequence_list = [(k, v) for k, v in word_frequence.items()]
        """
        wordcloud = wordcloud.fit_words(word_frequence)
    
        image_colors = ImageColorGenerator(bg_pic)  # derive word colors from the mask image

        plt.imshow(wordcloud.recolor(color_func=image_colors))  # show the recolored word cloud
        plt.axis("off")
        plt.show()
        wordcloud.to_file('show_Chinese.png')  # save the rendered cloud to disk
    
    if __name__ == '__main__':
        main()
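    The script assumes three local files sit next to it: stopwords.txt (the stopword list read by pd.read_csv), simhei.ttf (a font containing Chinese glyphs, without which the cloud renders as empty boxes), and alice_mask.png (the mask image that shapes and colors the cloud).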
    

      
