zoukankan      html  css  js  c++  java
  • 爬取猫眼数据

    //源码

    #
    # 导包
    #
    import pyximport
    import requests
    from fake_useragent import UserAgent
    import json
    import os
    import pandas as pd
    import csv
    import datetime

    #
    #
    #
    # 代码
    # http://maoyan.com/films/42964
    #
    #

    # Scrape the comments of Maoyan movie 42964 from the mobile JSON API and
    # store them in a CSV file.
    #
    # Steps:
    #   1. Build browser-like request headers.
    #   2. Fetch one page of comments per offset and collect one row per comment.
    #   3. Write the rows to the CSV (with a header if the file is new or empty,
    #      appended without a header otherwise).

    # NOTE(review): nothing in this script imports a Cython (.pyx) module; the
    # call is kept from the original -- confirm whether it is still needed.
    pyximport.install()

    # Spoofed request headers with a random User-Agent.
    ua = UserAgent()
    headers = {
        "User-agent": ua.random,
        "Host": "m.maoyan.com",
        "Referer": "http://m.maoyan.com/movie/42964/comments?_v_=yes",
    }

    # Request parameters: the API is paged through `offset`, here 0..180 in
    # steps of 15.
    offsets = list(range(0, 181, 15))
    # NOTE(review): the path is reproduced exactly as found; it looks like its
    # backslashes may have been lost -- confirm the intended location.
    csv_path = r'D:B_Hakkelujahpythonmaoyan.csv'
    list_info = []

    for offset in offsets:
        comment_api = (
            'http://m.maoyan.com/mmdb/comments/movie/42964.json'
            '?_v_=yes&offset={0}&startTime={1}'
        ).format(offset, datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

        # A timeout keeps one stalled connection from hanging the whole run,
        # and raise_for_status() surfaces HTTP errors before JSON parsing.
        response_comment = requests.get(comment_api, headers=headers, timeout=10)
        response_comment.raise_for_status()
        json_comment = response_comment.json()

        # A missing or null 'cmts' entry is treated as "no comments on this
        # page" instead of raising.
        for data in json_comment.get('cmts') or []:
            list_info.append([
                data['nickName'],
                data.get('gender', 0),  # gender is optional; default to 0
                data['cityName'],
                data['userLevel'],
                data['score'],
                data['content'],
            ])

    print("正在存储数据:")
    # os.path.getsize raises if the file does not exist yet; treat a missing
    # file the same as an empty one.
    file_size = os.path.getsize(csv_path) if os.path.exists(csv_path) else 0
    prStr = "文件大小:{0}".format(file_size)
    print(prStr)

    # Column headers, in the same order as the fields of each collected row.
    name = ['评论者昵称', '性别', '所在城市', '猫眼等级', '评分', '评论内容']
    file_test = pd.DataFrame(columns=name, data=list_info)

    if file_size == 0:
        print("空文件添加数据")
        # utf_8_sig writes a BOM at the start of the new file.
        file_test.to_csv(csv_path, encoding='utf_8_sig', index=False)
    else:
        # The file already has a header (and possibly a BOM): append the rows
        # only, using plain utf-8 so no second BOM lands in the middle.
        file_test.to_csv(csv_path, mode='a', header=False,
                         encoding='utf-8', index=False)
    print("数据添加完毕")

    原文:

    https://mp.weixin.qq.com/s?__biz=MjM5MjAwODM4MA==&mid=2650706418&idx=1&sn=20e57b7b1c8caa4c0b06d6dbd2b94aaa&chksm=bea6e02189d16937c8c3d934264f24b599576b14b76361018b55cca76fb73a127d4f6681af98&mpshare=1&scene=1&srcid=101045ENCgxgoTId8LKXrIaE&pass_ticket=Cgz9TOK3J64evSI%2B9Ev7kLigZCJHUOKf8eJe9%2FagJaUdYdhyn53lL%2FeRC4NnDrUq#rd

    注:

    数据爬取记录

    1.分析接口(包括接口参数的变化)

    2.分析JSON数据(数据解析)

    3.数据存储(文件、数据库)

  • 相关阅读:
    hibernate的核心配置
    hibernate的映射配置
    数据库的维护
    索引
    数据库规范化设计
    数据控制DCL
    触发器
    SQL存储过程简介
    Transact-SQL简介
    sysdatabaes表与sysobjects表
  • 原文地址:https://www.cnblogs.com/newrohlzy/p/9973795.html
Copyright © 2011-2022 走看看