zoukankan      html  css  js  c++  java
  • 一个爬取股票信息的爬虫程序

      给大家分享一个早前爬取东方财富网股票信息的爬虫程序,回头来看做了好多改进,特别是数据处理部分使用了heapq模块,方便快捷一步到位...

     1 # _*_ coding:utf-8 _*_
     2 
     3 import requests,re,json,time,os
     4 import heapq
     5 from bs4 import BeautifulSoup
     6 
     7 class GPINFO(object):
     8     """docstring for GPINFO"""
     9     def __init__(self):
    10         self.Url = 'http://quote.eastmoney.com/stocklist.html'
    11         self.BaseData = []
    12         self.Date = time.strftime('%Y%m%d')
    13         self.Record = 'basedata'+self.Date
    14         if os.path.exists(self.Record):
    15             print ('record exist...')
    16             self.BaseData = self.get_base_data_from_record()
    17         else:
    18             print ('fuck-get data again...')
    19             self.get_data()
    20 
    21     def write_record(self,text):
    22         with open(self.Record,'ab') as f:
    23             f.write((text+'
    ').encode('utf-8'))
    24 
    25     def get_base_data_from_record(self):
    26         ll = []
    27         with open(self.Record,'rb') as f:
    28             json_l = f.readlines()
    29             for j in json_l:
    30                 ll.append(json.loads(j.decode('utf-8')))
    31         return ll
    32 
    33     def get_data(self):
    34         #请求数据
    35         orihtml = requests.get(self.Url).content
    36         #创建 beautifulsoup 对象
    37         soup = BeautifulSoup(orihtml,'lxml')
    38         #采集每一个股票的信息
    39         count = 0
    40         for a in soup.find('div',class_='quotebody').find_all('a',{'target':'_blank'}):
    41             record_d = {}
    42             #代号
    43             num = a.get_text().split('(')[1].strip(')')
    44             if not (num.startswith('00') or num.startswith('60')):continue #只需要6*/0*
    45             record_d['num']=num
    46             #名称
    47             name = a.get_text().split('(')[0]
    48             record_d['name']=name
    49             #详情页
    50             detail_url = a['href']
    51             record_d['detail_url']=detail_url
    52 
    53             cwzburl = detail_url
    54             #发送请求
    55             try:
    56                 cwzbhtml = requests.get(cwzburl,timeout=30).content
    57             except Exception as e:
    58                 print ('perhaps timeout:',e)
    59                 continue
    60             #创建soup对象
    61             cwzbsoup = BeautifulSoup(cwzbhtml,'lxml')
    62 
    63             #财务指标列表 [浦发银行,总市值    净资产    净利润    市盈率    市净率    毛利率    净利率    ROE] roe:净资产收益率
    64             try:
    65                 cwzb_list = cwzbsoup.find('div',class_='cwzb').tbody.tr.get_text().split()
    66             except Exception as e:
    67                 print ('error:',e)
    68                 continue
    69             #去除退市股票
    70             if '-' not in cwzb_list:
    71                 record_d['data']=cwzb_list
    72                 self.BaseData.append(record_d)
    73                 self.write_record(json.dumps(record_d))
    74                 count=count+1
    75                 print (len(self.BaseData))
    76 
    77 def main():
    78     test = GPINFO()
    79     result = test.BaseData
    80     #[浦发银行,总市值    净资产    净利润    市盈率    市净率    毛利率    净利率    ROE] roe:净资产收益率]
    81     top_10 = heapq.nlargest(10,result,key=lambda r:float(r['data'][7].strip('%')))
    82     for i in top_10:
    83         print(i['data'])
    84 
    85 if __name__ == '__main__':
    86     main()

      程序主函数部分是为了获取净利率前10名的股票信息,打印结果如下:

    ['绵石投资', '52.2亿', '14.0亿', '1.25亿', '30.90', '3.73', '42.25%', '2047.04%', '9.27%']
    ['国投安信', '556亿', '270亿', '21.1亿', '19.80', '2.12', '5.90%', '487.53%', '7.79%']
    ['川投能源', '379亿', '202亿', '28.0亿', '10.16', '1.91', '37.01%', '402.64%', '14.58%']
    ['ST明科', '47.6亿', '9.25亿', '5.11千万', '68.00', '5.14', '2.38%', '345.11%', '5.68%']
    ['华联控股', '93.6亿', '31.5亿', '4.76亿', '14.54', '3.74', '46.25%', '328.53%', '20.88%']
    ['上海九百', '68.2亿', '12.3亿', '1.61亿', '31.67', '5.56', '54.00%', '297.99%', '13.21%']
    ['凯瑞德', '46.7亿', '1.14亿', '3.27千万', '107.10', '40.94', '16.07%', '294.19%', '33.41%']
    ['鲁信创投', '172亿', '38.6亿', '3.32亿', '38.48', '4.64', '28.67%', '244.43%', '9.26%']
    ['博闻科技', '35.0亿', '6.56亿', '2.23千万', '117.65', '5.36', '-16.07%', '215.27%', '3.41%']
    ['万泽股份', '71.8亿', '13.7亿', '6.87千万', '78.38', '5.29', '22.57%', '203.15%', '5.13%']
  • 相关阅读:
    splay
    开车旅行(2012day1T3)
    LCT入门
    最小瓶颈路
    poj 3041 Asteroids
    sql waitfor 延时执行
    [Microsoft][ODBC SQL Server Driver][DBNETLIB]SQL Server 不存在或访问被拒绝
    SQL Server中行列转换
    sql中 with rollup 、with cube、grouping 统计函数用法
    sql 分组后 组内排名
  • 原文地址:https://www.cnblogs.com/diaosir/p/6289571.html
Copyright © 2011-2022 走看看