  • Scraping Shanghai/Shenzhen A-share data

    First, scrape the stock codes from Eastmoney (东方财富网).

    Then download each stock's historical trading data from NetEase Finance (网易财经).

    import requests
    import random
    from bs4 import BeautifulSoup as bs
    import time
    import os
    import redis
    import re
    import json
    
    def get_stock_names():
        """
        Scrape stock names and codes from Eastmoney and store them in redis and a local txt file.
        """
        rds = redis.from_url('redis://:666666@192.168.3.98:6379', db=1, decode_responses=True)   # connect to redis db 1
    
        url = "http://quote.eastmoney.com/stocklist.html"
        headers = {
                'Referer': 'http://quote.eastmoney.com/center/gridlist.html',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
            }
    
        response = requests.get(url, headers=headers).content.decode('utf-8')   # decode the raw bytes (use 'gbk' instead if the page is GBK-encoded)
        soup = bs(response, 'lxml')
        all_ul = soup.find('div', id='table_wrapper-table').find_all('ul')   # the two ul tags that hold the listings
        with open('stock_names.txt', 'w+', encoding='utf-8') as f:  
            for ul in all_ul:
                all_a = ul.find_all('a')            # all a tags under this ul
                for a in all_a:
                    rds.rpush('stock_names', a.text)       # a.text is the link text; rpush appends it to the right end of the redis list
                    f.write(a.text + '\n')
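
    # A minimal sketch (not in the original post) of reading the stored names back,
    # assuming the same redis instance and the stock_names.txt file written above.
    def load_stock_names():
        rds = redis.from_url('redis://:666666@192.168.3.98:6379', db=1, decode_responses=True)
        names = rds.lrange('stock_names', 0, -1)   # read the whole redis list
        if not names:                              # fall back to the local txt copy
            with open('stock_names.txt', encoding='utf-8') as f:
                names = [line.strip() for line in f if line.strip()]
        return names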
    
    
    def get_data(stocklist, outfile=r'D:\PycharmProjects\web_scraping\stockdata'):
        headers = {
            'Referer': 'http://quotes.money.163.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        #filelist = [os.path.splitext(file)[0] for file in os.listdir(r'D:\PycharmProjects\web_scraping\stockdata')]
        for stock_code, stock_name in stocklist:
            #if stock_code in filelist: continue
            try:
                #stock_code = stock_name.split('(')[1].split(')')[0]
                # Some codes scraped from Eastmoney are funds, for which no data can be fetched, so funds are skipped.
                # Shanghai stocks start with 6 or 9; Shenzhen stocks start with 0, 2 or 3, but some funds also start
                # with 2 (codes starting 201/202/203/204 are funds).
                # In the download URL the code is prefixed with 0 for Shanghai and 1 for Shenzhen
                # (see the standalone normalize_code sketch after this function).
                if int(stock_code[0]) in [0, 2, 3, 6, 9]:
                    if int(stock_code[0]) in [6, 9]:
                        stock_code_new = '0' + stock_code
                    elif int(stock_code[0]) in [0, 2, 3]:
                        if not int(stock_code[:3]) in [201, 202, 203, 204]:
                            stock_code_new = '1' + stock_code
                        else: continue
                    else: continue
                else: continue
    
                stock_url = 'http://quotes.money.163.com/trade/lsjysj_{}.html'.format(stock_code)
                response = requests.get(stock_url, headers=headers).text
                soup = bs(response, 'lxml')
                start_time = soup.find('input', {'name': 'date_start_type'}).get('value').replace('-', '')  # earliest trading date
                end_time = soup.find('input', {'name': 'date_end_type'}).get('value').replace('-', '')  # latest trading date
                time.sleep(random.choice([1, 2]))  # pause 1-2 seconds between requests
                download_url = "http://quotes.money.163.com/service/chddata.html?code={}&start={}&end={}&fields=TCLOSE;HIGH;LOW;TOPEN;LCLOSE;CHG;PCHG;TURNOVER;VOTURNOVER;VATURNOVER;TCAP;MCAP".format(stock_code_new, start_time, end_time)
                data = requests.get(download_url, headers=headers)
                file_name = os.path.join(outfile, '{}.csv'.format(stock_code))
                with open(file_name, 'wb') as f:
                    for chunk in data.iter_content(chunk_size=10000):  # write the download in chunks
                        if chunk:
                            f.write(chunk)
                print("{}数据已下载".format(stock_code))
    
            except Exception as e:
                print("{}({})数据下载报错".format(stock_name, stock_code))
                print(e)
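
    # A standalone sketch of the exchange-prefix rule used in get_data (normalize_code is a
    # hypothetical helper, not part of the original script): Shanghai codes (starting 6/9) are
    # prefixed with '0', Shenzhen codes (starting 0/2/3) with '1', and 201/202/203/204 funds are skipped.
    def normalize_code(stock_code):
        if stock_code[0] in ('6', '9'):
            return '0' + stock_code          # Shanghai market
        if stock_code[0] in ('0', '2', '3') and stock_code[:3] not in ('201', '202', '203', '204'):
            return '1' + stock_code          # Shenzhen market
        return None                          # funds or unsupported codes

    # e.g. normalize_code('600000') -> '0600000', normalize_code('000001') -> '1000001'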
    
    
    
    import os  
    # Collect all .jpeg files under a directory (absolute paths)
    # Method one: os.walk
    def file_name(file_dir):   
        L=[]   
        for root, dirs, files in os.walk(file_dir):  
            for file in files:
                if os.path.splitext(file)[1] == '.jpeg':
                    L.append(os.path.join(root, file))
        return L
    
    # Method two: recursive os.listdir
    def listdir(path, list_name):  
        for file in os.listdir(path):  # os.listdir does not descend into subdirectories, so recurse manually
            file_path = os.path.join(path, file)
            if os.path.isdir(file_path):
                listdir(file_path, list_name)
            elif os.path.splitext(file_path)[1]=='.jpeg':
                list_name.append(file_path)
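
    # Usage sketch for the two helpers above (D:\photos is only a placeholder path):
    photo_dir = r'D:\photos'
    if os.path.isdir(photo_dir):
        jpeg_files = file_name(photo_dir)            # method one: os.walk
        jpeg_files_2 = []
        listdir(photo_dir, jpeg_files_2)             # method two: recursive os.listdir
        print(len(jpeg_files), len(jpeg_files_2))    # the two lists should have the same length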
    
    
    stocklist = []  # 3770 stocks, only codes starting with '0', '3' or '6'
    max_page = 189
    for i in range(max_page):
        url = ('http://1.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112405721872315676919_1566176986516'
               '&pn={}&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3'
               '&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2'
               '&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152'
               '&_=1566176986517').format(i + 1)
        response = requests.get(url).content.decode('utf-8')
        json_text = re.sub(r'jQuery112405721872315676919_1566176986516\(', '', response)[:-2]   # strip the JSONP callback wrapper and the trailing ');'
        json_text = json.loads(json_text)
        for fi in json_text['data']['diff']:
            stocklist.append([fi['f12'], fi['f14']])
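
    # Illustration (hypothetical sample string, not real API output) of the JSONP stripping above:
    # the API wraps its JSON payload in a jQuery callback, so the callback name and the trailing ');'
    # must be removed before json.loads can parse it.
    sample = 'jQuery112405721872315676919_1566176986516({"data": {"diff": []}});'
    stripped = re.sub(r'jQuery112405721872315676919_1566176986516\(', '', sample)[:-2]
    assert json.loads(stripped) == {'data': {'diff': []}}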
            
    
    # download the data
    get_data(stocklist, outfile=r'D:\PycharmProjects\web_scraping\stockdata')
    

      

