zoukankan      html  css  js  c++  java
  • python3爬取东方财富股东户数2013-2019年数据

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    
    
    import re
    import csv
    import time
    import random
    import requests
    import json
    from bokeh.models import pd
    from requests import session
    
    
    class spider_DongFangCaiFu(object):
        """Crawler for the Eastmoney shareholder-count (gdhs) list endpoint.

        Every record found in a response is appended to ``dfcf.txt`` as one
        JSON document per line.
        """

        # Locates the start of the value stored under the "data" key.
        _DATA_MARKER = re.compile(r'"data"\s*:\s*')

        def __init__(self):
            # Checkpoint state: where the crawl starts (or resumes) from.
            self.sync_log_dict = {
                # Index into start_date_list of the first date to crawl.
                "category_num": 0,
                # First page number requested for each date.
                "page_num": 1,
                # Total number of pages (not read by parse_detail).
                "total_page": 100,
            }
            # Report dates to query, newest first.
            self.start_date_list = [
                "2019-12-31",
                "2018-12-31",
                "2017-12-31",
                "2016-12-31",
                "2015-12-31",
                "2014-12-31",
                "2013-12-31",
            ]

        @staticmethod
        def _extract_records(text):
            """Return the list stored under ``"data"`` in a response body.

            The body is presumably a JavaScript assignment rather than plain
            JSON (the request passes ``js=var ...``), so only the value that
            follows the ``"data":`` marker is decoded. ``raw_decode`` stops at
            the end of that value, which means trailing text is ignored and no
            ``eval`` of downloaded content is needed.

            Args:
                text: Response body as a string.

            Returns:
                The decoded list, or an empty list when the marker is missing
                or the value is not a list.

            Raises:
                ValueError: If the marker is present but the value after it is
                    not valid JSON.
            """
            match = spider_DongFangCaiFu._DATA_MARKER.search(text)
            if match is None:
                return []
            records, _ = json.JSONDecoder().raw_decode(text, match.end())
            return records if isinstance(records, list) else []

        def parse_detail(self, date_stop=2, page_stop=60):
            """Download records page by page and append them to ``dfcf.txt``.

            Args:
                date_stop: Exclusive end index into ``start_date_list``. The
                    default of 2 keeps the original hard-coded limit; it is
                    clamped to the length of the list.
                page_stop: Exclusive upper bound for the page number. The
                    default of 60 keeps the original hard-coded limit.

            Raises:
                requests.HTTPError: If the server answers with an error status.
                ValueError: If a response carries an undecodable data payload.
            """
            url = "http://data.eastmoney.com/DataCenter_V3/gdhs/GetList.ashx?"
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
            }
            first_date = self.sync_log_dict["category_num"]
            first_page = self.sync_log_dict["page_num"]
            # Never index past the configured dates.
            last_date = min(date_stop, len(self.start_date_list))

            for date_index in range(first_date, last_date):
                # Random pause between dates to avoid hammering the server.
                time.sleep(random.random())
                start_date = self.start_date_list[date_index]
                for page in range(first_page, page_stop):
                    time.sleep(random.uniform(1, 3))
                    params = {
                        "reportdate": start_date,
                        "market": "",
                        "changerate": "",
                        "range": "",
                        "pagesize": "50",
                        "page": str(page),
                        "sortRule": "-1",
                        "sortType": "NoticeDate",
                        "js": "var%20DzSDuvmw",
                        "param": "",
                        "rt": "51196634",
                    }
                    response = requests.get(
                        url, headers=headers, params=params, verify=False, timeout=5
                    )
                    response.raise_for_status()
                    records = self._extract_records(response.text)
                    if not records:
                        # An empty page means this date has no more data.
                        break
                    # Open the file once per page instead of once per record.
                    # ensure_ascii=False keeps Chinese text readable instead of
                    # writing \uXXXX escapes.
                    with open('dfcf.txt', 'a', encoding="utf-8") as file:
                        file.writelines(
                            json.dumps(record, ensure_ascii=False) + '\n'
                            for record in records
                        )
    
    if __name__ == "__main__":
        # Script entry point: build the spider and start the crawl immediately.
        crawler = spider_DongFangCaiFu()
        crawler.parse_detail()
    
  • 相关阅读:
    Delux DLVB13摄像头在Windows Vista下的使用
    Windows在删除文件时怎么不确认了?
    Tornado启动仿真器时出现错误:error : simulator failed to initialize before timeout.
    VxWorks下使用双向链表的小例子
    MPI错误:提示XXX Credentials for yyy rejected connecting to XXX
    运行Google CTemplate首页的例子遇到_CrtIsValidHeapPointer异常
    拖延不是毛病,是你不够强大
    BNF范式含义和基本用法
    堆栈的区别
    永不抱怨
  • 原文地址:https://www.cnblogs.com/gqv2009/p/12324246.html
Copyright © 2011-2022 走看看