# Goal: collect the name and trading information of every stock listed on
#       the Shanghai and Shenzhen stock exchanges.
# Output: the results are saved to a file.
# Technical route: requests - bs4 - re
# Criteria for choosing candidate websites:
#     the stock information is present statically in the HTML page, is not
#     generated by JavaScript, and is not restricted by a robots protocol.
# Mindset when choosing:
#     do not get stuck on a single website; try several information sources.
# Program structure:
#     Step 1: fetch the stock list from East Money (quote.eastmoney.com).
#     Step 2: for each stock in the list, fetch its details from Baidu Stocks.
#     Step 3: store the results in a file.
import requests
from bs4 import BeautifulSoup
import traceback
import re

# Stock code as it appears in the list page links: "sh" or "sz" followed by
# six digits. Compiled once because it is applied to every link on the page.
_STOCK_CODE_RE = re.compile(r'[s][hz]\d{6}')


def getHtTMLText(url, timeout=30):
    """Download *url* and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the whole crawl.

    Returns:
        The decoded page text, or an empty string if the request failed or
        the server answered with an error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        # Use the encoding guessed from the content rather than the headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ''


def getStockList(lst, stockURL):
    """Append every stock code linked from the page at *stockURL* to *lst*.

    Args:
        lst: List that is extended in place with codes such as "sh600000".
            One entry is appended per matching link, so duplicates are kept.
        stockURL: Address of the page that links to the individual stocks.
    """
    html = getHtTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if not isinstance(href, str):
            # Anchor without a usable href attribute: nothing to extract.
            continue
        match = _STOCK_CODE_RE.search(href)
        if match:
            lst.append(match.group())


def _parseStockPage(html):
    """Extract the stock name and its key/value fields from a stock page.

    Returns:
        A ``(name, fields)`` tuple where *fields* is a list of
        ``(key, value)`` string pairs in page order, or ``None`` when the
        page does not contain the expected "stock-bets" / "bets-name"
        elements.
    """
    soup = BeautifulSoup(html, 'html.parser')
    stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
    if stockInfo is None:
        return None
    nameTag = stockInfo.find(attrs={'class': 'bets-name'})
    if nameTag is None:
        return None
    nameParts = nameTag.text.split()
    if not nameParts:
        return None
    keyList = stockInfo.find_all('dt')
    valueList = stockInfo.find_all('dd')
    # get_text() also copes with tags that contain nested markup (where
    # .string would be None); zip() tolerates a differing number of
    # <dt> and <dd> tags instead of raising.
    fields = [
        (key.get_text(strip=True), val.get_text(strip=True))
        for key, val in zip(keyList, valueList)
    ]
    return nameParts[0], fields


def getStockInfo(lst, stockURL, fpath):
    """Fetch the details of every stock in *lst* and append them to *fpath*.

    For each stock code the page ``stockURL + code + '.html'`` is downloaded,
    parsed, echoed to stdout and appended to the output file as the ``str()``
    of a dict. Progress is printed after every stock, whether or not it
    succeeded.

    Args:
        lst: Stock codes to look up.
        stockURL: Base address of the per-stock pages.
        fpath: Path of the UTF-8 text file the records are appended to.
    """
    total = len(lst)
    for count, stock in enumerate(lst, start=1):
        url = stockURL + stock + '.html'
        html = getHtTMLText(url)
        try:
            parsed = _parseStockPage(html) if html else None
            if parsed is not None:
                name, fields = parsed
                infoDict = {'股票名称': name}
                print(' ' + url)
                print({'股票名称': name})
                for key, val in fields:
                    infoDict[key] = val
                    print(' ' + key + ':' + val)
                # The file is reopened per record so that everything
                # collected so far is on disk if the crawl is interrupted.
                # NOTE(review): records are separated by a single space;
                # presumably a newline was intended - confirm before changing.
                with open(fpath, 'a', encoding='utf-8') as f:
                    f.write(str(infoDict) + ' ')
        except Exception:
            # Best-effort crawl: a page that cannot be parsed or saved is
            # skipped so that one bad stock does not abort the whole run.
            # Unlike a bare "except:", this still lets Ctrl-C stop the crawl.
            pass
        # NOTE(review): the leading space with end='' makes successive
        # progress reports accumulate on one line; a carriage return may
        # have been intended - confirm before changing.
        print(' 当前进度:{:.2f}%'.format(count * 100 / total), end='')


def main():
    """Crawl the stock list, then save the details of every stock found."""
    stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
    stock_info_url = 'http://gupiao.baidu.com/stock/'
    output_file = 'D://BaiduStockInfo.txt'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)


if __name__ == '__main__':
    main()