zoukankan      html  css  js  c++  java
  • 财经数据(5)-开盘啦股票标签数据爬虫

    本次主要是抓取开盘啦股票概念数据

    采用多进程、requests完成数据的爬取

    采用Pandas库完成新旧数据比对,并将新增数据存储到MySQL

    具体代码如下:

    # -*- coding: utf-8 -*-
    import pandas as pd
    import tushare as ts
    import time
    import requests
    import json
    from sqlalchemy import create_engine
    from multiprocessing import Pool
    from requests.packages.urllib3.exceptions import InsecureRequestWarning
    
    
    # ====================Tushare股票code获取====================================================================================================================
    def getCode():
        """Return the stock symbols reported by the Tushare ``stock_basic`` API.

        Two queries are issued, one with ``list_status='L'`` (listed) and one
        with ``list_status='P'`` (suspended); their results are concatenated
        in that order.

        Returns:
            list: the values of the ``symbol`` column of the combined result.
        """
        print("-------------------------------------------")
        print("开始从Tushare接口获取股票列表数据")

        # Initialise the tushare pro client.
        pro = ts.pro_api('ac16b470869c5d82db5033ae9288f77b282d2b5519507d6d2c72fdd7')

        wanted_fields = 'ts_code,symbol,name,area,exchange,list_status,list_date'

        # 'L' = normally listed, 'P' = listing suspended.
        frames = [
            pro.stock_basic(list_status=status, fields=wanted_fields)
            for status in ('L', 'P')
        ]

        # Stack both result sets into a single frame with a fresh index.
        stock_list = pd.concat(frames, axis=0, ignore_index=True)

        return stock_list['symbol'].tolist()
    
    # ====================爬取PC端开盘啦板块数据====================================================================================================================
    def _fetch_pages(data_list):
        """POST each form in ``data_list`` to Kaipanla and return the response bodies that were received."""
        url = 'https://pchq.kaipanla.com/w1/api/index.php'
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0'}

        # NOTE(review): certificate verification is switched off below (verify=False),
        # which exposes the request to man-in-the-middle tampering; kept as in the
        # original. The matching urllib3 warning is silenced here.
        requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

        html_list = []
        # The context manager releases the pooled connections when the loop is done.
        with requests.Session() as session:
            for data in data_list:
                try:
                    # A timeout keeps one stalled connection from hanging the worker forever.
                    response = session.post(url=url, data=data, headers=headers, verify=False, timeout=30)
                except requests.RequestException as spider_error:
                    print("html抓取过程报错,错误信息为:%s" % spider_error)
                else:
                    html_list.append(response.text)
        return html_list


    def _parse_pages(html_list):
        """Decode the JSON bodies and return ``(tag_rows, concept_rows)`` as lists of dicts."""
        tag_rows = []
        concept_rows = []

        for html in html_list:
            try:
                # Decode once and reuse the result for every field.
                payload = json.loads(html)
                code = payload['trend']['code']
                day = payload['trend']['day']
                tag = payload["pankou"]["tag"]
            except (ValueError, KeyError, TypeError) as parser_error:
                # One malformed response must not abort the whole batch.
                print("html解析过程报错,错误信息为:%s" % parser_error)
                continue

            tag_rows.append({'symbol': code, 'tag': tag, 'in_date': day})

            cept_list = payload.get("stockplate")
            if not cept_list:
                print("%s概念数据请求为空,请知悉" % code)
                continue

            try:
                for cept in cept_list:
                    # The first element of each entry is stored as the concept value.
                    concept_rows.append({'symbol': code, 'concept': cept[0], 'in_date': day})
            except (TypeError, IndexError, KeyError) as parser_error:
                print("html解析过程报错,错误信息为:%s" % parser_error)
                print("%s概念数据请求为空,请知悉" % code)

        return tag_rows, concept_rows


    def _keep_unstored_rows(fresh, stored, key_cols):
        """Return the rows of ``fresh`` whose ``key_cols`` combination does not occur in ``stored``."""
        fresh = fresh.drop_duplicates(subset=key_cols)
        known_keys = set(stored[key_cols].itertuples(index=False, name=None))
        is_new = [
            key not in known_keys
            for key in fresh[key_cols].itertuples(index=False, name=None)
        ]
        # An explicit boolean Series keeps this a row filter even when ``fresh`` is empty.
        return fresh[pd.Series(is_new, index=fresh.index, dtype=bool)]


    def Kplspider(data_list):
        """Scrape Kaipanla tag/concept data for a batch of stocks and store the new rows in MySQL.

        For every form in ``data_list`` the PC endpoint is queried, the stock tag
        and the list of concepts are extracted, and only the rows whose
        (symbol, tag) / (symbol, concept) pair is not yet in the target table are
        appended to ``is_belong_zyj`` / ``belong_concept``.

        Args:
            data_list: iterable of POST form dicts, one per stock.

        Returns:
            None. Results are written to the database and echoed to stdout.
        """
        print("-------------------------------------------")
        html_list = _fetch_pages(data_list)

        print("-------------------------------------------")
        print("股票标签、所属概念数据开始解析")
        tag_rows, concept_rows = _parse_pages(html_list)

        # Building each frame once from a list of dicts replaces the removed
        # DataFrame.append; fixing the columns keeps empty results selectable.
        stock_tag = pd.DataFrame(tag_rows, columns=['symbol', 'tag', 'in_date'])
        stock_concept = pd.DataFrame(concept_rows, columns=['symbol', 'concept', 'in_date'])

        # Engine used by pandas for reading and writing.
        engine = create_engine('mysql://root:123456@127.0.0.1/quant?charset=utf8')

        # Store tag rows that are not in the table yet.
        old_tag = pd.read_sql('select * from is_belong_zyj', engine)
        stock_tag = _keep_unstored_rows(stock_tag, old_tag, ['symbol', 'tag'])
        stock_tag.to_sql('is_belong_zyj', engine, if_exists='append', index=False)

        print(stock_tag)
        print("本次存储开盘啦标签数据%s条" % stock_tag.shape[0])

        # Store concept rows that are not in the table yet.
        old_concept = pd.read_sql('select * from belong_concept', engine)
        stock_concept = _keep_unstored_rows(stock_concept, old_concept, ['symbol', 'concept'])
        stock_concept.to_sql('belong_concept', engine, if_exists='append', index=False)

        print(stock_concept)
        print("本次存储开盘啦概念数据%s条" % stock_concept.shape[0])


    # ==================== Script entry point ====================
    if __name__ == '__main__':
        print("开盘啦股票标签及概念爬虫程序开始执行")
        print("-------------------------------------")
        start = time.time()

        # Symbols to crawl.
        code_list = getCode()

        # Today's date in YYYYMMDD form, sent as the 'Day' form field.
        cur_date = time.strftime("%Y%m%d", time.localtime())

        # One POST form per stock.
        data_list = [
            {'c': 'PCArrangeData','a': 'GetHQPlate','StockID': code,'Day': cur_date,'SelType': '1, 2, 3, 8, 9, 5, 6, 7','UserID': 399083,'Token': '71aef0e806e61ad3169ddc9473e37886'}
            for code in code_list
        ]

        # Give every worker its own slice of the forms; passing the whole list as
        # a single task would leave all but one process idle.
        worker_count = 4
        chunks = [data_list[offset::worker_count] for offset in range(worker_count)]
        chunks = [chunk for chunk in chunks if chunk]

        try:
            # The with-block tears the pool down once map() has returned.
            with Pool(processes=worker_count) as pool:
                pool.map(Kplspider, chunks)
        except Exception as error:
            print("进程执行过程报错,错误信息为:%s" % error)

        end = time.time()
        print('开盘啦股票标签及概念爬虫程序共执行%0.2f秒.' % ((end - start)))
        print("开盘啦股票标签及概念爬虫程序执行完成")

      

    执行效果展示:

     MySQL存储数据展示:

  • 相关阅读:
    VS,VAX一些快捷键记录
    UnrealScript中的Cpptext{}段落
    UScript在VS下的阅读及调试
    Unreal中的网络同步机制
    UDK编辑器 49条小提示(转)
    VS正则表达式常用篇
    Legacy:Within
    Hadoop + HBase + Hive 完全分布式部署笔记
    Windows批处理调用逻辑备份
    CentOS 下修改/dev/shm 大小解决ORA00845
  • 原文地址:https://www.cnblogs.com/Iceredtea/p/12159175.html
Copyright © 2011-2022 走看看