zoukankan      html  css  js  c++  java
  • 爬取猫眼电影影评

    主题:对即将上映的大侦探皮卡丘电影保持什么态度?

    主要内容

    蒂姆·古德曼(贾斯提斯·史密斯 饰) 为寻找下落不明的父亲来到莱姆市,意外与父亲的前宝可梦搭档大侦探皮卡丘(瑞恩·雷诺兹 配音)相遇,并惊讶地发现自己是唯一能听懂皮卡丘说话的人类,他们决定组队踏上揭开真相的刺激冒险之路。探案过程中他们邂逅了各式各样的宝可梦,并意外发现了一个足以毁灭整个宝可梦宇宙的惊天阴谋。

    爬取对象:猫眼电影影评

    爬取限制:pc端无法获取影评(移动端可以)

    爬取内容

    爬取评论部分的用户ID、用户名、评论、评分、时间五项。

    爬取的json数据切入口:http://m.maoyan.com/mmdb/comments/movie/346629.json?_v_=yes&offset=0&startTime=2019-05-09%2022%3A25%3A03

    爬取结果存入CSV以及数据库

    词频及词语显示

    评论者性别分析

    除去性别未知的评论者,在已知性别的评论者中男性所占比例较高,说明这部电影的男性爱好者比较多。

    评论者评分等级分析

    根据上面的饼图可得,满分的占了70%左右,4.5分的占了7.4%左右,可知这部电影的评价十分高,应该是非常好看的,值得去观看。

    城市分布显示

    总结

    对于此次影评的分析,可以看出在即将上映的前夕,大部分影迷对于这部电影怀抱着回忆童年的心态。皮卡丘的名字被大多数人提及,证明绝大部分群体应该都观看过《宠物小精灵》,绝大部分人对这部电影充满了期待。从城市分布可以看出,观影群体主要以一二线城市为主。

    全部代码

    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import re
    import sqlite3
    import pandas as pd
    import time
    import pandas
    import random
    import json



    # Fetch a URL with a plausible, randomly chosen User-Agent.
    def getData(url, timeout=10):
        """Send a GET request to ``url`` and return the response.

        Each call picks one of three User-Agent strings at random and sends it
        together with a fixed Cookie header.

        Args:
            url: Address to request.
            timeout: Seconds to wait for the server before ``requests`` gives
                up and raises; without it a stalled connection would block
                the crawl indefinitely.

        Returns:
            The ``requests`` response object, with ``encoding`` set to UTF-8.
        """
        # The same cookie is sent with every User-Agent, so it is stored once.
        cookie = '_lxsdk_cuid=16a8d7b1613c8-0a2b4d109e58f-b781636-144000-16a8d7b1613c8; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; uuid_n_v=v1; iuuid=1BB9A320700C11E995DE7D45B75E59C6FC50A50D996543D0819E9EB2E6507E92; webp=true; ci=20%2C%E5%B9%BF%E5%B7%9E; selectci=; __mta=45946523.1557151818494.1557367174996.1557368154367.23; _lxsdk=1BB9A320700C11E995DE7D45B75E59C6FC50A50D996543D0819E9EB2E6507E92; __mta=45946523.1557151818494.1557368154367.1557368240554.24; from=canary; _lxsdk_s=16a9a2807fa-ea7-e79-c55%7C%7C199'
        userAgents = [
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
            # Android Chrome; written without stray spaces so the header is a
            # well-formed User-Agent string.
            'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Mobile Safari/537.36',
            'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
        ]
        # Optional proxy pool (disabled):
        # proxies = [{'https': 'https://120.83.111.194:9999','http':'http://14.20.235.120:808'},{"http": "http://119.131.90.115:9797",
        # "https": "https://14.20.235.96:9797"}]
        headers = {'User-Agent': random.choice(userAgents), 'Cookie': cookie}
        response = requests.get(url, headers=headers, timeout=timeout)
        response.encoding = 'utf-8'
        return response

    # Data-processing function.
    def dataProcess(data):
        """Extract the fields of interest from one page of comment JSON.

        Args:
            data: Object whose ``text`` attribute holds a JSON document; the
                comments are read from its ``cmts`` list.

        Returns:
            A list with one dict per comment, holding the keys ``id``,
            ``nickName``, ``cityName``, ``content``, ``score``, ``startTime``
            and ``gendar``. An empty list is returned when the document has
            no ``cmts`` entry.
        """
        # A page without a 'cmts' entry yields no records instead of raising.
        comments = json.loads(data.text).get('cmts') or []
        allData = []
        for i in comments:
            # Flatten line breaks so every comment stays on a single line.
            content = i['content'].replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')
            allData.append({
                'id': i['id'],
                'nickName': i['nickName'],
                # cityName may be absent; fall back to an empty string.
                'cityName': i.get('cityName', ''),
                'content': content,
                'score': i['score'],
                'startTime': i['startTime'],
                # The key is spelled 'gendar' because the rest of this file
                # reads it under that name. A missing gender is stored as 0,
                # and the parsed record itself is left untouched.
                'gendar': i.get('gender', 0),
            })
        return allData


    # Crawl 67 pages of comments and gather the processed records.
    allData = []
    for page in range(67):
        # The offset advances by 15 per request (presumably the API page size).
        response = getData('http://m.maoyan.com/mmdb/comments/movie/346629.json?_v_=yes&offset={}&startTime=2019-05-09%2022%3A25%3A03'.format(page * 15))
        allData.extend(dataProcess(response))

    # Save the processed data as a CSV file.
    newsdf = pd.DataFrame(allData)
    newsdf.to_csv('news.csv', encoding='utf-8')


    # #把csv文件保存到sqlite
    # newsdf = pd.read_csv('news.csv')
    # with sqlite3.connect('sqlitetest.sqlite') as db:
    # newsdf.to_sql('data',con = db)




    # Visualise the gender distribution of the reviewers.
    def sexProcess(gender):
        """Render a pie chart of reviewer genders to ``sex_pie.html``.

        Args:
            gender: List of gender codes. Occurrences of 0, 1 and 2 are
                counted and shown under the labels for unknown, male and
                female respectively.
        """
        from pyecharts import Pie

        labels = ["未知", "男", "女"]
        # Codes 0/1/2 line up with the labels above (unknown/male/female).
        counts = [gender.count(code) for code in (0, 1, 2)]
        chart = Pie("性别饼图", title_pos="center")
        chart.add("", labels, counts, is_label_show=True)
        chart.render("sex_pie.html")

    # Collect every record's gender code, then draw the gender pie chart.
    gendar = [record['gendar'] for record in allData]
    sexProcess(gendar)

    # Ring-shaped pie chart of the reviewers' score levels.
    def scoreProcess(scores):
        """Render a ring pie chart of score frequencies to ``score_pie.html``.

        Args:
            scores: List of numeric scores. Occurrences of each level from 0
                to 5 in steps of 0.5 are counted; other values are ignored.
        """
        from pyecharts import Pie

        levels = [0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]
        attr = ["0", "0.5", "1", "1.5", "2", "2.5", "3", "3.5", "4", "4.5", "5"]
        # One count per level, in the same order as the labels.
        tally = [scores.count(level) for level in levels]

        ring = Pie("评分等级环状饼图", title_pos="center")
        options = dict(
            is_label_show=True,
            label_text_color=None,
            radius=[40, 75],  # inner and outer radius give the ring shape
            legend_orient="vertical",
            legend_pos="left",
            legend_top="100px",
            center=[50, 60],
        )
        ring.add("", attr, tally, **options)
        ring.render("score_pie.html")

    # Collect every record's score, then draw the score ring chart.
    scores = [record['score'] for record in allData]
    scoreProcess(scores)

    # Audience distribution map.
    def cityProcess(citysTotal, title="《大侦探皮卡丘》观众分布"):
        """Render a map of audience counts per city to ``city_geo.html``.

        Args:
            citysTotal: Mapping of city name to number of reviewers; it is
                split into names and values with ``Geo.cast``.
            title: Chart title. The default names the film this script
                analyses.
        """
        from pyecharts import Geo

        geo = Geo(title, title_color='#fff', title_pos='center',
                  width=1200, height=600, background_color='#404a95')
        attr, value = geo.cast(citysTotal)
        geo.add("", attr, value, is_visualmap=True, visual_range=[0, 100],
                visual_text_color='#fff', legend_pos='right',
                is_geo_effect_show=True, maptype='china', symbol_size=10)
        geo.render("city_geo.html")


    # Match each comment's city to a known city name and count per city.
    citysTotal = {}
    coordinatesJson = pd.read_json('city_coordinates.json', encoding='utf-8')
    # Iterating a DataFrame yields its column labels (presumably the city
    # names in the coordinates file); convert them to strings once.
    knownCities = [str(name) for name in coordinatesJson]
    for i in allData:
        cityName = str(i['cityName'])
        # An empty name is a substring of every string, so it would be
        # counted against the first known city; skip such records.
        if not cityName:
            continue
        for known in knownCities:
            # Substring match: the comment's city may be a shortened form of
            # the name used in the coordinates file.
            if cityName in known:
                citysTotal[known] = citysTotal.get(known, 0) + 1
                break

    cityProcess(citysTotal)

     

     
  • 相关阅读:
    Android开发 ViewConfiguration View的配置信息类
    Android 开发 倒计时功能 转载
    Android 开发 关于7.0 FileUriExposedException异常 详解
    Android 开发 实现文本搜索功能
    Android 开发 Activity里获取View的宽度和高度 转载
    Android 开发 存储目录的详解
    Android 开发 Fresco框架点击小图显示全屏大图实现 ZoomableDraweeView
    Android 开发 将window变暗
    Android 开发 DisplayMetrics获取Android设备的屏幕高宽与其他信息
    Android 开发 DP、PX、SP转换详解
  • 原文地址:https://www.cnblogs.com/liliguang/p/10841181.html
Copyright © 2011-2022 走看看