zoukankan      html  css  js  c++  java
  • python3爬取东方财富股东户数2013-2019年数据

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    
    
    import re
    import csv
    import time
    import random
    import requests
    import json
    from bokeh.models import pd
    from requests import session
    
    
    class spider_DongFangCaiFu(object):
        """Crawl the East Money shareholder-count list and dump it to a text file.

        For every report date in ``start_date_list`` the list endpoint is
        requested page by page; each returned record is appended to
        ``OUTPUT_PATH`` as one JSON object per line.

        Attributes:
            sync_log_dict: Resume ("breakpoint") state. ``category_num`` is the
                index into ``start_date_list`` to start from and ``page_num`` is
                the first page to fetch for that date. ``total_page`` is stored
                but not read by this class.
            start_date_list: Report dates to crawl, newest first.
        """

        # Endpoint queried for every (report date, page) pair.
        LIST_URL = "http://data.eastmoney.com/DataCenter_V3/gdhs/GetList.ashx?"
        # Exclusive upper bound for the page counter (pages 1..59 are requested).
        PAGE_STOP = 60
        # File that receives one JSON record per line.
        OUTPUT_PATH = 'dfcf.txt'
        # Locates the start of the record array inside the response text.
        _DATA_KEY = re.compile(r'"data"\s*:\s*')

        def __init__(self):
            # Resume state that has to be recorded for checkpointing.
            self.sync_log_dict = {
                # Index of the category (report date) to start from.
                "category_num": 0,
                # Page number to start from.
                "page_num": 1,
                # Total number of pages.
                "total_page": 100,
            }
            # Report dates.
            self.start_date_list = [
                "2019-12-31",
                "2018-12-31",
                "2017-12-31",
                "2016-12-31",
                "2015-12-31",
                "2014-12-31",
                "2013-12-31",
            ]

        @classmethod
        def _parse_records(cls, text):
            """Return the list of records found after the ``"data":`` key in *text*.

            The array is decoded with the JSON decoder instead of ``eval``: the
            text comes from a remote server and must never be executed, and JSON
            literals such as ``null`` are not valid Python expressions anyway.
            Anything following the array is ignored.

            Raises:
                ValueError: If *text* has no ``"data":`` key, the value after it
                    is not valid JSON (``json.JSONDecodeError`` is a
                    ``ValueError``), or the value is not an array.
            """
            match = cls._DATA_KEY.search(text)
            if match is None:
                raise ValueError('response does not contain a "data" array')
            # raw_decode parses exactly one JSON value starting at the given
            # offset, so trailing content after the array cannot break parsing.
            records, _end = json.JSONDecoder().raw_decode(text, match.end())
            if not isinstance(records, list):
                raise ValueError('"data" value is not an array')
            return records

        def _append_records(self, records):
            """Append *records* to ``OUTPUT_PATH``, one JSON document per line."""
            # Open the file once per batch rather than once per record.
            with open(self.OUTPUT_PATH, 'a+', encoding="utf-8") as file:
                for record in records:
                    # ensure_ascii=False keeps Chinese text readable in the
                    # output instead of writing \uXXXX escape sequences.
                    file.write(json.dumps(record, ensure_ascii=False) + '\n')

        # Detail parsing
        def parse_detail(self):
            """Fetch every page for every report date and store the records.

            Crawling starts at the date index ``sync_log_dict["category_num"]``
            and, for that first date only, at page ``sync_log_dict["page_num"]``;
            every later date starts again at page 1. Pagination for a date stops
            at the first page that returns no records or at ``PAGE_STOP``.

            Raises:
                ValueError: If a response cannot be parsed (see
                    ``_parse_records``).
                requests.RequestException: Propagated from ``requests.get``.
            """
            resume_date_index = self.sync_log_dict["category_num"]
            resume_page = self.sync_log_dict["page_num"]
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
            }

            # Walk all configured dates; a fixed upper bound of 2 would silently
            # skip every date after the second one.
            for date_index in range(resume_date_index, len(self.start_date_list)):
                # Random pause between dates.
                time.sleep(random.random())
                report_date = self.start_date_list[date_index]
                # Only the resumed date continues from the stored page number.
                first_page = resume_page if date_index == resume_date_index else 1

                for page in range(first_page, self.PAGE_STOP):
                    # Random 1-3 second pause between page requests.
                    time.sleep(random.uniform(1, 3))
                    params = {
                        "reportdate": report_date,
                        "market": "",
                        "changerate": "",
                        "range": "",
                        "pagesize": "50",
                        "page": str(page),
                        "sortRule": "-1",
                        "sortType": "NoticeDate",
                        # NOTE(review): this value is already percent-encoded and
                        # requests will encode it again - confirm the server
                        # does not care about the variable name.
                        "js": "var%20DzSDuvmw",
                        "param": "",
                        "rt": "51196634",
                    }
                    response = requests.get(
                        self.LIST_URL,
                        headers=headers,
                        params=params,
                        verify=False,
                        timeout=5,
                    )
                    records = self._parse_records(response.text)
                    if not records:
                        # An empty page means this date has no more data.
                        break
                    self._append_records(records)
    
    if __name__ == '__main__':
        # Script entry point: build the spider, then run the full crawl.
        crawler = spider_DongFangCaiFu()
        crawler.parse_detail()
    
  • 相关阅读:
    客户端发现响应内容类型为“text/html”,但应该是“text/xml”
    [转]AJAX Control Toolkit 介绍及构建开发环境
    kafka删除topic详解
    influxdb问题解决
    logback配置
    kafka环境搭建测试
    Hdu 1753 大明A+B <高精度小数相加>
    POJ 1966 <点连通度>
    POJ 2446 Chessboard 二分图的最大匹配 <建图>
    Hlg 1522 子序列的和 <单调队列>
  • 原文地址:https://www.cnblogs.com/gqv2009/p/12324246.html
Copyright © 2011-2022 走看看