  • Scraping main-board P/E ratios, P/B ratios, and dividend yields

    Approach:

    1. Analyze how the China Securities Index Co. website stores its data.

    2. Get the list of trading days, then concatenate each one into a page link (a sample link is shown after this list).

    3. Visit the links one by one to fetch the data.

    4. Save the data as CSV files.
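
    As a quick illustration of step 2, each finished link interpolates one
    trading day into a fixed query template. A minimal sketch follows; the
    date '2017-11-01' is a made-up example, and the full script below builds
    four such lists, one per indicator:

    # One link for the dividend-yield table (type=zy4); the date is illustrative
    date_time = '2017-11-01'
    url = 'http://www.csindex.com.cn/zh-CN/downloads/industry-price-earnings-ratio?type=zy4&date=' + date_time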

    Techniques used:

    1. HTML parsing with BeautifulSoup: finding tags and reading their contents (see the short sketch after this list).

    2. Regular expressions: extracting the date from a URL.

    3. Data storage: writing rows to a CSV file.

    4. Handling unexpected cases (missing or incomplete pages).

    5. Timing how long the run takes.
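
    A minimal, self-contained sketch of points 1 and 2, run against a
    hard-coded HTML snippet instead of the live site (the snippet, the cell
    values, and the date in the URL are all made up for illustration):

    import re
    from bs4 import BeautifulSoup

    # Stand-ins for a real CSI page and its URL
    url = 'http://www.csindex.com.cn/zh-CN/downloads/industry-price-earnings-ratio?type=zy4&date=2017-11-01'
    html = '<table><tr><td>沪市主板</td><td>16.93</td></tr></table>'

    soup = BeautifulSoup(html, 'html.parser')
    cells = [td.string for td in soup.find_all('td')]  # find the tags, read their text
    day = re.findall(r'\d{4}-\d{2}-\d{2}', url)[0]     # pull the date out of the URL
    print(day, cells)  # 2017-11-01 ['沪市主板', '16.93']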

    Code:

    # hanbb
    # come on!!!
    import tushare as ts
    import datetime
    import requests
    from bs4 import BeautifulSoup
    import csv
    import re

    # Get the trading days ('000012' is only used to obtain a trading calendar)
    end = datetime.date.today()
    data = ts.get_hist_data('000012', start='2015-01-01', end=str(end))  # change the start date here if needed
    date = list(data.index)

    # Build the page links, one list per indicator
    urls = []
    urls_3 = []
    urls_2 = []
    urls_1 = []
    for date_time in date:
        url = 'http://www.csindex.com.cn/zh-CN/downloads/industry-price-earnings-ratio?type=zy4&date=' + date_time
        urls.append(url)
        url_3 = 'http://www.csindex.com.cn/zh-CN/downloads/industry-price-earnings-ratio?type=zy3&date=' + date_time
        urls_3.append(url_3)
        url_2 = 'http://www.csindex.com.cn/zh-CN/downloads/industry-price-earnings-ratio?type=zy2&date=' + date_time
        urls_2.append(url_2)
        url_1 = 'http://www.csindex.com.cn/zh-CN/downloads/industry-price-earnings-ratio?type=zy1&date=' + date_time
        urls_1.append(url_1)
    # print(urls)

    # Fetch a page
    def get_html_text(url):
        try:
            r = requests.get(url)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except Exception:
            return ""

    # Parse the page and extract the data
    def getinfo(url):
        html = get_html_text(url)
        soup = BeautifulSoup(html, "html.parser")
        td = soup.find_all("td")

        # Pull the date out of the URL to use as the row index
        infodate = re.findall(r'\d{4}-\d{2}-\d{2}', url)[0]
        # print(infodate)
        # print(td)
        # print(len(td))

        # A complete table has 48 cells; anything else gets a placeholder row
        if len(td) == 48:
            info = (infodate,) + tuple(cell.string for cell in td)
        else:
            info = (infodate,) + tuple(float(i % 8) for i in range(48))
        return info

    # Save the data
    def save(filename, info):
        # Append mode; without newline='' the csv module writes extra blank rows
        file = open(r'E:\download\{}.csv'.format(filename), 'a', newline='')
        writerfile = csv.writer(file)  # writer object
        writerfile.writerow(info)      # write one row
        file.close()                   # close the file

    def main():
        guxilv = "guxilv"                      # dividend yield
        shijinglv = 'shijinglv'                # price-to-book ratio
        gundongshiyinglv = "gundongshiyinglv"  # rolling P/E ratio
        jingtaishiyinglv = "jingtaishiyinglv"  # static P/E ratio
        print("Collecting data, please wait")
        start_time = datetime.datetime.now()
        count = 0

        # Collect dividend yields
        for url in urls:
            save(guxilv, getinfo(url))
            count += 1
        print("Dividend yield done: {} values written to {}.csv".format(count * 48, guxilv))
        end_time = datetime.datetime.now()
        print((end_time - start_time).seconds)

        # Collect price-to-book ratios
        count = 0
        for url in urls_3:
            save(shijinglv, getinfo(url))
            count += 1
        print("Price-to-book done: {} values written to {}.csv".format(count * 48, shijinglv))
        end_time = datetime.datetime.now()
        print((end_time - start_time).seconds)

        # Collect rolling P/E ratios
        count = 0
        for url in urls_2:
            save(gundongshiyinglv, getinfo(url))
            count += 1
        print("Rolling P/E done: {} values written to {}.csv".format(count * 48, gundongshiyinglv))
        end_time = datetime.datetime.now()
        print((end_time - start_time).seconds)

        # Collect static P/E ratios
        count = 0
        for url in urls_1:
            save(jingtaishiyinglv, getinfo(url))
            count += 1
        print("Static P/E done: {} values written to {}.csv".format(count * 48, jingtaishiyinglv))
        end_time = datetime.datetime.now()
        print((end_time - start_time).seconds)

    main()
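
    One caveat about the fetch step: get_html_text swallows every failure and
    returns an empty string, so a day whose page fails to load silently becomes
    a row of placeholder values in the CSV. If that matters, a slightly more
    defensive variant might look like the sketch below. It keeps the same
    contract on success; the timeout value and the User-Agent header are
    illustrative choices, not something the CSI site is known to require:

    import requests

    def get_html_text_safe(url):
        # Same contract as get_html_text: page text on success, "" on failure,
        # but a hung connection can no longer stall the whole run
        try:
            r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException:
            return ""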
