zoukankan      html  css  js  c++  java
  • python爬取股票信息

     1 import requests
     2 from bs4 import BeautifulSoup
     3 import traceback
     4 import re
     5 
     6 
     def getHTMLText(url, code=None, timeout=30):
         """Fetch ``url`` and return the response body as text.

         Args:
             url: Address of the page to download.
             code: Encoding used to decode the body. When ``None`` (the
                 default) the encoding is guessed from the content through
                 ``Response.apparent_encoding``.
             timeout: Seconds to wait for the server before giving up.

         Returns:
             The decoded page text, or ``""`` when the request fails or the
             server answers with an error status.
         """
         try:
             # Without a timeout a stalled server would block the crawl forever.
             r = requests.get(url, timeout=timeout)
             r.raise_for_status()
             r.encoding = code if code is not None else r.apparent_encoding
             return r.text
         except requests.RequestException:
             # Only network/HTTP failures are mapped to ""; Ctrl-C and
             # programming errors are allowed to propagate.
             return ""
    15 
    16 
    def getStockList(lst, stockURL):
        """Collect stock codes from the links on a listing page.

        Downloads ``stockURL`` with ``getHTMLText``, walks over every ``<a>``
        tag and appends the first code found in its ``href`` to ``lst``. A code
        is ``sh`` or ``sz`` followed by six digits. Anchors without an ``href``
        or without a code are skipped.

        Args:
            lst: List that receives the codes; it is modified in place.
            stockURL: URL of the page that lists the stocks.
        """
        html = getHTMLText(stockURL)
        soup = BeautifulSoup(html, 'html.parser')
        # The backslash matters: a bare "d{6}" would match six literal "d"
        # characters and therefore never match a real stock code.
        pattern = re.compile(r"s[hz]\d{6}")
        for anchor in soup.find_all('a'):
            href = anchor.get('href')
            if not href:
                continue
            match = pattern.search(href)
            if match:
                lst.append(match.group(0))
    27 
    28 
    def getStockInfo(lst, stockURL, fpath):
        """Download the detail page of every stock and append it to a file.

        For each code in ``lst`` the page ``stockURL + code + ".html"`` is
        fetched with ``getHTMLText``. The stock name and every ``<dt>``/``<dd>``
        pair inside the ``stock-bets`` block are gathered into a dict, and the
        ``str()`` form of that dict is appended to ``fpath`` as one line.

        Pages that cannot be downloaded, or that have no ``stock-bets`` block,
        are skipped. Any other error while handling a page is printed as a
        traceback and the loop moves on to the next stock.

        Args:
            lst: Iterable of stock codes.
            stockURL: URL prefix of the detail pages.
            fpath: Path of the UTF-8 text file the records are appended to.
        """
        for stock in lst:
            url = stockURL + stock + ".html"
            html = getHTMLText(url)
            if html == "":
                continue
            try:
                infoDict = {}
                soup = BeautifulSoup(html, 'html.parser')
                stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
                if stockInfo is None:
                    # No quote block on this page, so there is nothing to record.
                    continue

                name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
                infoDict.update({'股票名称': name.text.split()[0]})

                keyList = stockInfo.find_all('dt')
                valueList = stockInfo.find_all('dd')
                # zip pairs labels with values and stops at the shorter list,
                # so an unmatched trailing <dt> or <dd> cannot raise IndexError.
                for key, val in zip(keyList, valueList):
                    infoDict[key.text] = val.text

                # Appending per record keeps already-fetched data on disk even
                # if the crawl is interrupted later.
                with open(fpath, 'a', encoding='utf-8') as f:
                    f.write(str(infoDict) + '\n')
            except Exception:
                # Exception (not a bare except) so Ctrl-C still stops the crawl.
                traceback.print_exc()
                continue
    55 
    56 
    def main():
        """Crawl the stock list, then save every stock's details to a file.

        The codes are gathered from ``stock_list_url`` by ``getStockList`` and
        the per-stock pages under ``stock_info_url`` are written to
        ``output_file`` by ``getStockInfo``.
        """
        stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
        stock_info_url = 'http://gupiao.baidu.com/stock/'
        output_file = 'D:/BaiduStockInfo.txt'
        slist = []
        getStockList(slist, stock_list_url)
        getStockInfo(slist, stock_info_url, output_file)


    # Guarded so that importing this module does not start a network crawl.
    if __name__ == "__main__":
        main()

    这是慕课上的源代码，直接粘贴的，不知道为什么运行时一直报错。以下是错误。如果有人解决了，麻烦说一声，谢谢啦。

    下面是慕课中修改后的代码，也是直接粘贴的源代码，但是可以运行出来。

     1 import requests
     2 from bs4 import BeautifulSoup
     3 import traceback
     4 import re
     5  
     def getHTMLText(url, code="utf-8", timeout=30):
         """Fetch ``url`` and return the response body decoded with ``code``.

         Args:
             url: Address of the page to download.
             code: Encoding assigned to the response before its text is read.
             timeout: Seconds to wait for the server before giving up.

         Returns:
             The decoded page text, or ``""`` when the request fails or the
             server answers with an error status.
         """
         try:
             # Without a timeout a stalled server would block the crawl forever.
             r = requests.get(url, timeout=timeout)
             r.raise_for_status()
             r.encoding = code
             return r.text
         except requests.RequestException:
             # Only network/HTTP failures are mapped to ""; Ctrl-C and
             # programming errors are allowed to propagate.
             return ""
    14  
    def getStockList(lst, stockURL):
        """Collect stock codes from the links on a listing page.

        Downloads ``stockURL`` with ``getHTMLText`` (decoded as GB2312), walks
        over every ``<a>`` tag and appends the first code found in its ``href``
        to ``lst``. A code is ``sh`` or ``sz`` followed by six digits. Anchors
        without an ``href`` or without a code are skipped.

        Args:
            lst: List that receives the codes; it is modified in place.
            stockURL: URL of the page that lists the stocks.
        """
        html = getHTMLText(stockURL, "GB2312")
        soup = BeautifulSoup(html, 'html.parser')
        # The backslash matters: a bare "d{6}" would match six literal "d"
        # characters and therefore never match a real stock code.
        pattern = re.compile(r"s[hz]\d{6}")
        for anchor in soup.find_all('a'):
            href = anchor.get('href')
            if not href:
                continue
            match = pattern.search(href)
            if match:
                lst.append(match.group(0))
    25  
    def getStockInfo(lst, stockURL, fpath):
        """Download every stock's detail page, save it and show progress.

        For each code in ``lst`` the page ``stockURL + code + ".html"`` is
        fetched with ``getHTMLText``. The stock name and every ``<dt>``/``<dd>``
        pair inside the ``stock-bets`` block are gathered into a dict, and the
        ``str()`` form of that dict is appended to ``fpath`` as one line.

        After each stock, whether it was saved or skipped, a percentage is
        printed on the same console line using a leading carriage return.

        Args:
            lst: Sequence of stock codes; its length is the progress total.
            stockURL: URL prefix of the detail pages.
            fpath: Path of the UTF-8 text file the records are appended to.
        """
        total = len(lst)
        for count, stock in enumerate(lst, 1):
            url = stockURL + stock + ".html"
            html = getHTMLText(url)
            try:
                if html == "":
                    continue
                infoDict = {}
                soup = BeautifulSoup(html, 'html.parser')
                stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
                if stockInfo is None:
                    # No quote block on this page, so there is nothing to record.
                    continue

                name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
                infoDict.update({'股票名称': name.text.split()[0]})

                keyList = stockInfo.find_all('dt')
                valueList = stockInfo.find_all('dd')
                # zip pairs labels with values and stops at the shorter list,
                # so an unmatched trailing <dt> or <dd> cannot raise IndexError.
                for key, val in zip(keyList, valueList):
                    infoDict[key.text] = val.text

                with open(fpath, 'a', encoding='utf-8') as f:
                    f.write(str(infoDict) + '\n')
            except Exception:
                # Best effort: a page that cannot be processed is skipped
                # quietly so the single-line progress display stays intact.
                # Exception (not a bare except) lets Ctrl-C stop the crawl.
                continue
            finally:
                # Runs for saved, skipped and failed stocks alike, so the
                # percentage always reaches 100 on the last item.
                print("\r当前进度: {:.2f}%".format(count * 100 / total), end="")
    56  
    def main():
        """Crawl the stock list, then save every stock's details to a file.

        The codes are gathered from ``stock_list_url`` by ``getStockList`` and
        the per-stock pages under ``stock_info_url`` are written to
        ``output_file`` by ``getStockInfo``.
        """
        stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
        stock_info_url = 'http://gupiao.baidu.com/stock/'
        output_file = 'D:/BaiduStockInfo.txt'
        slist = []
        getStockList(slist, stock_list_url)
        getStockInfo(slist, stock_info_url, output_file)


    # Guarded so that importing this module does not start a network crawl.
    if __name__ == "__main__":
        main()

    至于为什么，本人也不是特别清楚：代码主体部分是一样的，具体原因不清楚。我把编码方式删去了，和原先一样，但是程序依然可以运行。我不知道为什么加了进度条，程序就可以运行了。

    这是别人的代码分析，个人觉得还是很好的，很仔细：https://segmentfault.com/a/1190000010520835

  • 相关阅读:
    PHP 计算页面执行时间
    PHP 实现多服务器共享 SESSION 数据
    你不了解PHP的10件事情
    智能的PHP缩图类
    php配置文件php.ini 中文版
    PclZip:强大的PHP压缩与解压缩zip类
    https 加载问题
    docker 安装sentry
    requests ssl 报错
    解决长时间计划任务rsync同步进程数过多
  • 原文地址:https://www.cnblogs.com/tianqianlan/p/9451070.html
Copyright © 2011-2022 走看看