zoukankan html css js c++ java

爬取股票信息

#目标：获取上交所和深交所所有股票的名称和交易信息
#输出：保存到文件中
#技术路线：requests-bs4-re

#候选网站选取原则：
#                    股票信息静态存在于HTML页面中，非JS代码生成，没有Robots协议限制
#选取心态：
#        不要纠结于某个网站，多找信息源进行尝试

#程序的结构设计
#步骤1：从东方财富网获取股票列表
#步骤2：根据股票列表逐个到百度股票获取个股信息
#步骤3：将结果存储到文件中


import requests
from bs4 import BeautifulSoup
import traceback
import re

def getHtTMLText(url, timeout=30):
    """Download *url* and return the response body as text.

    The fetch is best-effort: any failure reported by ``requests``
    (connection error, timeout, non-2xx status, malformed URL) results in
    an empty string rather than an exception, so callers can simply test
    for ``''``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; without
            a timeout a stalled connection would block the crawl forever.

    Returns:
        The decoded page text, or ``''`` if the request failed.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        # Decode with the encoding detected from the body instead of the
        # one derived from the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ''

def getStockList(lst, stockURL):
    """Append every stock code linked from the listing page to *lst*.

    The page at *stockURL* is downloaded with ``getHtTMLText`` and each
    ``<a>`` tag's ``href`` is searched for a code of the form ``sh`` or
    ``sz`` followed by six digits (e.g. ``sh600000``). The first code found
    in each link is appended; links without a code are ignored.

    Args:
        lst: List that receives the codes; it is mutated in place.
        stockURL: URL of the page that lists all stocks.

    Returns:
        None. Results are delivered through *lst*.
    """
    html = getHtTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    # ``\d`` must be escaped: a bare ``d{6}`` would match the literal text
    # "dddddd" and no code would ever be found.
    code_pattern = re.compile(r's[hz]\d{6}')
    for anchor in soup.find_all('a'):
        href = anchor.attrs.get('href')
        if not isinstance(href, str):
            # Anchor without a usable href attribute.
            continue
        match = code_pattern.search(href)
        if match:
            lst.append(match.group(0))


def getStockInfo(lst, stockURL, fpath):
    """Fetch the detail page of every stock in *lst* and append it to a file.

    For each code the page ``stockURL + code + '.html'`` is downloaded with
    ``getHtTMLText``. The stock name and every ``<dt>``/``<dd>`` pair inside
    the ``div.stock-bets`` element are collected into a dict, echoed to
    stdout, and appended to *fpath* as one ``str(dict)`` line. A progress
    percentage is rewritten on the current console line after every stock,
    whether or not it could be processed.

    Args:
        lst: Stock codes to look up.
        stockURL: Base URL that the code and ``.html`` are appended to.
        fpath: Path of the UTF-8 output file; opened in append mode.

    Returns:
        None. Results are written to *fpath* and stdout.
    """
    total = len(lst)
    # enumerate() advances the counter for every stock, so pages that fail
    # to download still move the progress indicator towards 100%.
    for count, stock in enumerate(lst, start=1):
        url = stockURL + stock + '.html'
        html = getHtTMLText(url)
        if html != '':
            try:
                soup = BeautifulSoup(html, 'html.parser')
                stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
                nameTag = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
                name = nameTag.text.split()[0]
                infoDict = {'股票名称': name}
                print('\n' + url)
                print({'股票名称': name})
                # Pair each label with its value; zip() stops at the shorter
                # list if the page has unmatched <dt>/<dd> tags.
                for dt, dd in zip(stockInfo.find_all('dt'),
                                  stockInfo.find_all('dd')):
                    # get_text() also works for tags with nested markup,
                    # where ``.string`` would be None.
                    key = dt.get_text().strip()
                    val = dd.get_text().strip()
                    infoDict[key] = val
                    print('\t' + key + ':' + val)
                with open(fpath, 'a', encoding='utf-8') as f:
                    f.write(str(infoDict) + '\n')
            except (AttributeError, IndexError):
                # The page lacks the expected elements; skip this stock.
                pass
            except OSError:
                # Keep crawling, but do not hide a failing output file.
                traceback.print_exc()
        # '\r' returns to the start of the line so the percentage is
        # overwritten in place instead of scrolling.
        print('\r当前进度：{:.2f}%'.format(count * 100 / total), end='')
    
def main():
    """Run the crawl: collect the stock codes, then fetch and save each one.

    The codes are gathered by ``getStockList`` from the listing URL and
    handed to ``getStockInfo`` together with the detail-page base URL and
    the output file path.
    """
    codes = []
    getStockList(codes, 'http://quote.eastmoney.com/stocklist.html')
    getStockInfo(codes, 'http://gupiao.baidu.com/stock/', 'D://BaiduStockInfo.txt')

# Script entry point: start the crawl.
# NOTE(review): this call is unguarded, so it also runs when the file is
# imported; consider wrapping it in `if __name__ == '__main__':`.
main()

查看全文

相关阅读:
单一index.php实现PHP任意层级文件夹遍历(原创自Zjmainstay)
php读取文件内容至字符串中，同时去除换行、空行、行首行尾空格（原创自Zjmainstay）
php获取页面并切割页面div内容
 jQuery单击双击实现链接的提取、插入与删除
 PHP 利用AJAX获取网页并输出（原创自Zjmainstay）
php 数组首字符过滤功能
 点击图片添加文件在Chrome中使用的兼容问题
 php读取txt文件组成SQL并插入数据库（原创自Zjmainstay）
为博客园添加标签云动画
 jQuery动态增删改查表格信息，可左键/右键提示（原创自Zjmainstay）

原文地址：https://www.cnblogs.com/zhanghaijie/p/8418264.html