Python批量下载上交所上市公司报告

zoukankan html css js c++ java

Python批量下载上交所上市公司报告

上交所的上市公司报告搜索页面http://www.sse.com.cn/disclosure/listedinfo/announcement/

通过查看页面调用的接口可以发现:

获取上交所全部股票代码http://www.sse.com.cn/js/common/ssesuggestdata.js
获取上交所全部基金代码http://www.sse.com.cn/js/common/ssesuggestfunddata.js
获取上交所全部E债券代码http://www.sse.com.cn/js/common/ssesuggestEbonddata.js
获取上交所全部T债券代码http://www.sse.com.cn/js/common/ssesuggestTbonddata.js

随便在页面上查询一个股票的年报，调用的接口如下http://query.sse.com.cn/security/stock/queryCompanyBulletin.do?jsonCallBack=jsonpCallback47120&isPagination=true&productId=600000&keyWord=&securityType=0101%2C120100%2C020100%2C020200%2C120200&reportType2=DQBG&reportType=YEARLY&beginDate=2016-07-20&endDate=2019-07-20&pageHelp.pageSize=25&pageHelp.pageCount=50&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.cacheSize=1&pageHelp.endPage=5&_=1563587837130

通过多次使用不同的条件调用查询该接口可以发现该接口的一部分url参数如下：
isPagination这名字一看就应该是是否分页，为了方便爬取，我们将该参数设为false
productId股票代码
keyWord关键字
securityType证券类型

参数意义 securityType
全部 0101,120100,020100,020200,120200
主板 0101
科创板 120100,020100,020200,120200
reportType reportType2报告类型

参数意义 reportType reportType2
全部 ALL
定期公告 ALL DQBG
年报 YEARLY DQBG
第一季度季报 QUATER1 DQBG
半年报 QUATER2 DQBG
第三季度季报 QUATER3 DQBG
临时公告 ALL LSGG
上市公司章程 SHGSZC LSGG
发行上市公告 FXSSGG LSGG
公司治理 GSZL LSGG
股东大会会议 GDDH LSGG
IPO公司公告 IPOGG LSGG
其他 QT LSGG
beginDate开始日期
endDate结束日期，接口每次查询的日期间隔不超过3年
以page开头的参数应该都是和分页有关的，在设置isPagination=false后这些参数都不用填写

直接调用查询接口，返回接口error系统繁忙，需要加上请求头Referer: http://www.sse.com.cn/disclosure/listedinfo/announcement/，表示是在上交所网站页面上调用的接口。

以下是我用python写的批量下载上交所上市公司报告的代码，运行后会给每个股票建一个以股票代码命名的文件夹，存放相应的pdf文件。（温馨提示：下载全部报告需要巨大的磁盘空间）

import os
import time
import requests
from copy import deepcopy

URL_SSE = "http://www.sse.com.cn/disclosure/listedinfo/announcement/"
# 股票
URL_SSE_STOCK = "http://www.sse.com.cn/js/common/ssesuggestdata.js"
# 基金
URL_SSE_FUND = "http://www.sse.com.cn/js/common/ssesuggestfunddata.js"
# E债券
URL_SSE_EBOND = "http://www.sse.com.cn/js/common/ssesuggestEbonddata.js"
# T债券
URL_SSE_TBOND = "http://www.sse.com.cn/js/common/ssesuggestTbonddata.js"
# 查询
URL_QUERY_COMPANY = "http://query.sse.com.cn/security/stock/queryCompanyBulletin.do"

URL_PDF = "http://static.sse.com.cn"

# 报告类型
REPORT_TYPE = {
'全部': ('ALL', ''),
'定期公告': ('ALL', 'DQBG'),
'年报': ('YEARLY', 'DQBG'),
'第一季度季报': ('QUATER1', 'DQBG'),
'半年报': ('QUATER2', 'DQBG'),
'第三季度季报': ('QUATER3', 'DQBG'),
'临时公告': ('ALL', 'LSGG'),
'上市公司章程': ('SHGSZC', 'LSGG'),
'发行上市公告': ('FXSSGG', 'LSGG'),
'公司治理': ('GSZL', 'LSGG'),
'股东大会会议': ('GDDH', 'LSGG'),
'IPO公司公告': ('IPOGG', 'LSGG'),
'其他': ('QT', 'LSGG'),
}

# 证券类型
SECURITY_TYPE = {
'全部': '0101,120100,020100,020200,120200',
'主板': '0101',
'科创板': '120100,020100,020200,120200',
}

HEADER = {
'Referer': URL_SSE,
'User-Agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
}

URL_PARAM = {
# 是否分页
'isPagination': 'false',
'productId': '600000',
# 关键字
'keyWord': '',
'securityType': SECURITY_TYPE['全部'],
'reportType2': 'DQBG',
'reportType': 'YEARLY',
'beginDate': '2016-07-17',
'endDate': '2019-07-17',
}

def get_all_codes(url):
res = requests.get(url)
content = res.content.decode()
tmp = content.split('_t.push({val:"')
code, name, pinyin = [], [], []
for i in tmp[1:]:
item = i.split('"')
code.append(item[0])
name.append(item[2])
pinyin.append(item[4])
# print(code)
return code, name, pinyin

def get_pdf_url(code, begin_date, end_date, security_type='全部', report_type='年报'):
url_param = deepcopy(URL_PARAM)
url_param['productId'] = code
url_param['securityType'] = SECURITY_TYPE[security_type]
url_param['reportType2'] = REPORT_TYPE[report_type][1]
url_param['reportType'] = REPORT_TYPE[report_type][0]
url_param['beginDate'] = begin_date
url_param['endDate'] = end_date
result = requests.get(URL_QUERY_COMPANY, params=url_param, headers=HEADER).json()['result']
return [(URL_PDF + i['URL'], i['BULLETIN_TYPE'], i['BULLETIN_YEAR'], i['SSEDATE']) for i in result]

def save_pdf(code, pdf_title_urls, path='./'):
file_path = os.path.join(path, code)
if not os.path.isdir(file_path):
os.makedirs(file_path)
for url, r_type, year, date in pdf_title_urls:
date = ''.join(date.split('-'))
file_name = '_'.join([code, r_type, year, date]) + '.pdf'
file_full_name = os.path.join(file_path, file_name)
# print(file_full_name)
rs = requests.get(url, stream=True)
with open(file_full_name, "wb") as fp:
for chunk in rs.iter_content(chunk_size=10240):
if chunk:
fp.write(chunk)

def download_report(code):
month_day = time.strftime('-%m-%d', time.localtime())
year = int(time.strftime('%Y', time.localtime()))
while True:
year_3 = year - 3
begin_date = str(year_3) + month_day
end_date = str(year) + month_day
pdf_urls = get_pdf_url(code, begin_date, end_date)
# for i in title_urls:
# print(i)
if pdf_urls:
for i in range(1, 4):
try:
save_pdf(code, pdf_urls)
break
except Exception as e:
print(f'[{code}] 第{i}次尝试下载出错', e)
else:
print(f'[{code}] 下载失败')
else:
print(f'[{code}] 完毕')
break
year = year_3
if year < 1900:
break

def main():
stock_codes, _, _ = get_all_codes(URL_SSE_STOCK)
len_stock_codes = len(stock_codes)
for index, code in enumerate(stock_codes):
print(f'股票总数:{len_stock_codes}, 已完成:{index} ', end='')
download_report(code)
print('任务完成')

if __name__ == '__main__':
main()

查看全文

相关阅读:
jQuery之选择器
 JAVA之网页截屏
 AJAX之JSON
JSP之AJAX
JSP之邮箱检验
 【16】LRUChache
hashmap与currentHashMap
Day1 工厂模式
 D0 设计模式
 【15】【有点特殊的dp】剪绳子

原文地址：https://www.cnblogs.com/fengkun125/p/11226076.html