```python
# -*- coding: utf-8 -*-
import os
import re

import requests
from bs4 import BeautifulSoup


# Fetch a page and return its text ('' on any failure).
def get_html_text(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        return ''


# Collect the stock codes from the listing page.
def get_stock_list(url):
    html = get_html_text(url)
    soup = BeautifulSoup(html, 'html.parser')
    # Every stock link is an <a> tag; the code is embedded in its href.
    a = soup.find_all('a')
    lst = []
    for i in a:
        try:
            href = i.attrs['href']
            # Match codes such as sh600000 / sz000001.
            lst.append(re.findall(r'[s][hz]\d{6}', href)[0])
        except Exception:
            continue
    return lst


# Fetch each stock's detail page and append its info to a file.
def get_and_write_stock_info(lst):
    desktop = os.path.join(os.path.expanduser("~"), 'Desktop')
    for i, stock in enumerate(lst):
        try:
            url = STOCK_URL + stock + '.html'
            html = get_html_text(url)
            if html == '':
                continue
            soup = BeautifulSoup(html, 'html.parser')
            stock_info = soup.find('div', attrs={'class': 'stock-bets'})
            info_dict = {'股票代碼': stock}
            # Stock name.
            name = stock_info.find_all(attrs={'class': 'bets-name'})[0]
            info_dict.update({'股票名稱': name.text.split()[0]})
            # The remaining fields come as <dt>key</dt><dd>value</dd> pairs.
            key_list = stock_info.find_all('dt')
            value_list = stock_info.find_all('dd')
            if len(key_list) == 0:
                continue
            for k, v in zip(key_list, value_list):
                info_dict[k.text] = v.text
            # Append this stock's record to the output file.
            with open(os.path.join(desktop, SAVE_FILE_PATH), 'a',
                      encoding='utf-8') as f:
                f.write(str(info_dict) + '\n')
            print('\r當前進度: {:.2f}%'.format(i * 100 / len(lst)), end='')
        except Exception:
            continue


if __name__ == '__main__':
    # Stock-code listing page on eastmoney.com.
    STOCK_LIST_URL = 'http://quote.eastmoney.com/stocklist.html'
    # Per-stock detail pages on Baidu Stocks.
    STOCK_URL = 'https://gupiao.baidu.com/stock/'
    # Output file (written to the desktop).
    SAVE_FILE_PATH = '股票信息.txt'

    stock_list = get_stock_list(STOCK_LIST_URL)
    get_and_write_stock_info(stock_list)
```
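The regex in `get_stock_list` relies on each listing href carrying the exchange prefix (`sh`/`sz`) plus a six-digit code. A quick check with a sample href (the exact URL shape is an assumption based on how the eastmoney listing links looked):

```python
import re

# Hypothetical href in the shape the listing page used.
href = 'http://quote.eastmoney.com/sh600000.html'
print(re.findall(r'[s][hz]\d{6}', href))  # -> ['sh600000']
```

Since each record is written with `str(info_dict)`, the output file can be parsed back with `ast.literal_eval`. A minimal read-back sketch, assuming the same desktop path and file name as the constants above:

```python
import ast
import os

desktop = os.path.join(os.path.expanduser('~'), 'Desktop')
# Each line in the output file is the repr of one info dict.
with open(os.path.join(desktop, '股票信息.txt'), encoding='utf-8') as f:
    records = [ast.literal_eval(line) for line in f if line.strip()]

# Print the code and name of the first few records.
for rec in records[:3]:
    print(rec['股票代碼'], rec.get('股票名稱', ''))
```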
Reference:
"Python小爬虫，爬取当前全部股票信息" (Python mini-crawler: scrape info for all current stocks), https://blog.csdn.net/weixin_44521703/article/details/95525861