zoukankan      html  css  js  c++  java
  • 爬取猫眼电影影评

    主题:对即将上映的大侦探皮卡丘电影保持什么态度?

    主要内容

    蒂姆·古德曼(贾斯提斯·史密斯 饰) 为寻找下落不明的父亲来到莱姆市,意外与父亲的前宝可梦搭档大侦探皮卡丘(瑞恩·雷诺兹 配音)相遇,并惊讶地发现自己是唯一能听懂皮卡丘说话的人类,他们决定组队踏上揭开真相的刺激冒险之路。探案过程中他们邂逅了各式各样的宝可梦,并意外发现了一个足以毁灭整个宝可梦宇宙的惊天阴谋。

    爬取对象:猫眼电影影评

    爬取限制:pc端无法获取影评(移动端可以)

    爬取内容

    爬取评论部分的用户ID、用户名、评论、评分、时间五项。

    爬取的json数据切入口:http://m.maoyan.com/mmdb/comments/movie/346629.json?_v_=yes&offset=0&startTime=2019-05-09%2022%3A25%3A03

    爬取结果存入CSV以及数据库

    词频及词语显示

    评论者性别分析

    除去性别未知的评论者,在已知性别的评论者中男性的比例比较高,说明这部电影的男性爱好者比较多。

    评论者评分等级分析

    根据上面的饼图可得,满分的占了70%左右,4.5分以上占了7.4%左右,可知这部电影的评价十分高,应该是非常好看的,值得去观看。

    城市分布显示

    总结

     对于此次影评的分析,可以看出在即将上映的前夕,大部分影迷对于这部电影怀抱着回忆童年的心态,皮卡丘的名字被大多数人提及,证明绝大部分群体应该都观看过宠物小精灵,绝大部分人对这部电影充满了期待,从城市分布可以看出观影群体主要以一二线城市为主。

    全部代码

    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import re
    import sqlite3
    import pandas as pd
    import time
    import pandas
    import random
    import json



    #设置合理的user-agent,爬取数据函数
    def getData(url, timeout=10):
        """Fetch ``url`` with a randomly chosen User-Agent and a fixed cookie.

        Args:
            url: Address to request with HTTP GET.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests`` waits indefinitely, so a single stalled
                connection would hang the whole crawl.

        Returns:
            The ``requests`` response object, with its ``encoding`` forced to
            ``'utf-8'`` so that ``.text`` is decoded as UTF-8.

        Raises:
            requests.exceptions.RequestException: On connection errors or when
                the timeout expires.
        """
        # All header sets share the same session cookie; only the User-Agent varies.
        cookie = '_lxsdk_cuid=16a8d7b1613c8-0a2b4d109e58f-b781636-144000-16a8d7b1613c8; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; uuid_n_v=v1; iuuid=1BB9A320700C11E995DE7D45B75E59C6FC50A50D996543D0819E9EB2E6507E92; webp=true; ci=20%2C%E5%B9%BF%E5%B7%9E; selectci=; __mta=45946523.1557151818494.1557367174996.1557368154367.23; _lxsdk=1BB9A320700C11E995DE7D45B75E59C6FC50A50D996543D0819E9EB2E6507E92; __mta=45946523.1557151818494.1557368154367.1557368240554.24; from=canary; _lxsdk_s=16a9a2807fa-ea7-e79-c55%7C%7C199'
        user_agents = (
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
            # The original string had stray spaces around "/" and "(" which made
            # it a malformed User-Agent; this is the well-formed equivalent.
            'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Mobile Safari/537.36',
            'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
        )
        # Disabled proxy configuration kept from the original script:
        # proxies = [{'https': 'https://120.83.111.194:9999','http':'http://14.20.235.120:808'},{"http": "http://119.131.90.115:9797",
        # "https": "https://14.20.235.96:9797"}]
        headers = {
            # random.choice stays correct if User-Agents are added or removed,
            # unlike a hard-coded index range.
            'User-Agent': random.choice(user_agents),
            'Cookie': cookie,
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        response.encoding = 'utf-8'
        return response

    #数据处理函数
    def dataProcess(data):
        """Convert one comment-page response into a list of flat records.

        Args:
            data: Response-like object whose ``text`` attribute holds a JSON
                document; the comments are read from its ``'cmts'`` entry.

        Returns:
            A list with one dict per comment, holding the keys ``id``,
            ``nickName``, ``cityName``, ``content``, ``score``, ``startTime``
            and ``gendar`` (in that order). A page without a ``'cmts'`` entry,
            or with an empty one, yields an empty list.

        Raises:
            KeyError: If a comment lacks one of the mandatory fields
                (``id``, ``nickName``, ``content``, ``score``, ``startTime``).
        """
        # A page past the last comment may carry no 'cmts' entry; treat that as
        # "no comments" instead of failing with KeyError.
        comments = json.loads(data.text).get('cmts') or []
        allData = []
        for item in comments:
            # Flatten every line break so each comment stays on one line. The
            # original call replaced ' ' with ' ' (a no-op) and stopped after
            # ten replacements.
            content = item['content']
            for newline in ('\r\n', '\n', '\r'):
                content = content.replace(newline, ' ')
            allData.append({
                'id': item['id'],
                'nickName': item['nickName'],
                # cityName is optional in the payload.
                'cityName': item.get('cityName', ''),
                'content': content,
                'score': item['score'],
                'startTime': item['startTime'],
                # NOTE: the key is spelled 'gendar' (sic) because the rest of
                # the script reads it under that name. A missing gender is
                # recorded as 0 without modifying the source record.
                'gendar': item.get('gender', 0),
            })
        return allData


    # Crawl the comment API page by page. The offset advances by PAGE_SIZE on
    # every request and PAGE_COUNT requests are made in total.
    COMMENT_URL = ('http://m.maoyan.com/mmdb/comments/movie/346629.json'
                   '?_v_=yes&offset={}&startTime=2019-05-09%2022%3A25%3A03')
    PAGE_SIZE = 15
    PAGE_COUNT = 67

    allData = []
    for page in range(PAGE_COUNT):
        response = getData(COMMENT_URL.format(page * PAGE_SIZE))
        allData.extend(dataProcess(response))

    # Save the processed records as a CSV file. (The original also built a
    # pd.Series here and discarded it; that dead statement is removed.)
    newsdf = pd.DataFrame(allData)
    newsdf.to_csv('news.csv', encoding='utf-8')


    # #把csv文件保存到sqlite
    # newsdf = pd.read_csv('news.csv')
    # with sqlite3.connect('sqlitetest.sqlite') as db:
    # newsdf.to_sql('data',con = db)




    # 评论者性别分布可视化
    def sexProcess(gender):
        """Render a pie chart of commenter gender codes to ``sex_pie.html``.

        Args:
            gender: Sequence offering ``count``; occurrences of the codes
                0, 1 and 2 are tallied and labelled unknown / male / female.
        """
        from pyecharts import Pie

        labels = ["未知", "男", "女"]
        # The index of each label is the gender code it stands for (0, 1, 2).
        tallies = [gender.count(code) for code in range(len(labels))]

        chart = Pie("性别饼图", title_pos="center")
        chart.add("", labels, tallies, is_label_show=True)
        chart.render("sex_pie.html")

    # Gather the gender code of every scraped comment and chart the result.
    gendar = [record['gendar'] for record in allData]
    sexProcess(gendar)

    # 评论者评分等级环状饼图
    def scoreProcess(scores):
        """Render a ring-shaped pie chart of rating levels to ``score_pie.html``.

        Args:
            scores: Sequence offering ``count``; occurrences of each rating
                from 0 to 5 in steps of 0.5 are tallied.
        """
        from pyecharts import Pie

        # Rating levels in chart order, paired position-by-position with labels.
        levels = (0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5)
        labels = ["0", "0.5", "1", "1.5", "2", "2.5", "3", "3.5", "4", "4.5", "5"]
        tallies = [scores.count(level) for level in levels]

        chart_options = dict(
            is_label_show=True,
            label_text_color=None,
            radius=[40, 75],
            legend_orient="vertical",
            legend_pos="left",
            legend_top="100px",
            center=[50, 60],
        )
        chart = Pie("评分等级环状饼图", title_pos="center")
        chart.add("", labels, tallies, **chart_options)
        chart.render("score_pie.html")

    # Gather the rating of every scraped comment and chart the result.
    scores = [record['score'] for record in allData]
    scoreProcess(scores)

    # 观众分布图
    def cityProcess(citysTotal, title="《大侦探皮卡丘》观众分布"):
        """Render the audience distribution on a map of China to ``city_geo.html``.

        Args:
            citysTotal: Per-city totals, handed to ``Geo.cast`` to be split
                into the attribute and value sequences the chart needs.
            title: Chart title. The default names the film this script
                crawls; the original hard-coded the title of a different
                film (《何以为家》), apparently left over from an earlier script.
        """
        from pyecharts import Geo

        geo = Geo(title, title_color='#fff', title_pos='center',
                  width=1200, height=600, background_color='#404a95')
        attr, value = geo.cast(citysTotal)
        geo.add("", attr, value, is_visualmap=True, visual_range=[0, 100],
                visual_text_color='#fff', legend_pos='right',
                is_geo_effect_show=True, maptype='china', symbol_size=10)
        geo.render("city_geo.html")


    # 城市名称的处理
    # Tally comments per city, using the labels of the coordinates file as the
    # canonical city names.
    citysTotal = {}
    coordinatesJson = pd.read_json('city_coordinates.json', encoding='utf-8')
    # Iterating a DataFrame yields its column labels.
    # NOTE(review): presumably each label is a city name -- confirm against
    # city_coordinates.json.
    knownCities = [str(label) for label in coordinatesJson]
    for record in allData:
        cityName = str(record['cityName'])
        if not cityName:
            # '' is a substring of every string, so a comment without a city
            # would otherwise be counted for the first city in the file.
            continue
        for city in knownCities:
            # Substring match: the scraped name may be a shortened form of the
            # label. Only the first matching label is counted.
            if cityName in city:
                citysTotal[city] = citysTotal.get(city, 0) + 1
                break

    cityProcess(citysTotal)

     

     
  • 相关阅读:
    Java Object类
    Java StringBuffer类
    Java String 类
    vuex的使用总结
    vue 周期函数
    vue-cli keep-alive用法以及activated,deactivated
    vue-cli 跳转方式
    vue-cli watch/timer
    vue-cli 使用better-scroll
    vue-cli less使用
  • 原文地址:https://www.cnblogs.com/liliguang/p/10841181.html
Copyright © 2011-2022 走看看