爬取网站: 沪港通 https://www.hkexnews.hk/sdw/search/mutualmarket.aspx?t=sh
和深港通 https://www.hkexnews.hk/sdw/search/mutualmarket.aspx?t=sz
(两个 url 只有最后一个字母不一样)
# coding=utf-8
import datetime
import time

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# CSS classes of the result-table cells, in the order the raw columns are built.
_COLUMN_CLASSES = (
    "col-stock-code",
    "col-stock-name",
    "col-shareholding",
    "col-shareholding-percent",
)


def get_browser():
    """Start and return a headless Chrome WebDriver session."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    return webdriver.Chrome(options=chrome_options)


def _one_year_before(day):
    """Return the date one year before *day*.

    29 February has no counterpart in the previous year, so it maps to
    28 February instead of raising ``ValueError``.
    """
    try:
        return day.replace(year=day.year - 1)
    except ValueError:
        return day.replace(year=day.year - 1, day=28)


def _cell_text(row, css_class):
    """Return the text of the ``mobile-list-body`` div in the row's *css_class* cell."""
    return (
        row.find("td", {"class": css_class})
        .find("div", {"class": "mobile-list-body"})
        .get_text()
    )


def _parse_rows(page_source):
    """Extract ``[code, name, shareholding, shareholding_percent]`` text rows.

    Raises ``AttributeError`` when the ``mutualmarket-result`` table (or an
    expected cell) is missing from *page_source*.
    """
    soup = BeautifulSoup(page_source, "html.parser")
    body = soup.find("table", {"id": "mutualmarket-result"}).find("tbody")
    return [
        [_cell_text(tr, css_class) for css_class in _COLUMN_CLASSES]
        for tr in body.find_all("tr")
    ]


def _build_frame(rows, date):
    """Turn raw text rows into a cleaned DataFrame for one trading date.

    The result has the columns ``shareholding``, ``shareholding_percent``,
    ``Symbol`` and ``Tradedate``; *date* is a ``YYYY/MM/DD`` string and is
    stored as ``YYYY-MM-DD``.
    """
    df = pd.DataFrame(
        rows, columns=["code", "name", "shareholding", "shareholding_percent"]
    )
    # The symbol is taken from the six characters before the last character
    # of the name, with "#" placeholders replaced by "0".
    df["Symbol"] = df["name"].apply(lambda x: x[-7:-1].replace("#", "0"))
    # Drop the trailing character (presumably "%") before converting;
    # empty cells become NaN.
    df["shareholding_percent"] = (
        df["shareholding_percent"]
        .apply(lambda x: x[:-1] if x else np.nan)
        .astype("float64")
    )
    # Remove thousands separators so the value parses as a number.
    df["shareholding"] = (
        df["shareholding"].apply(lambda x: x.replace(",", "")).astype("float64")
    )
    df["Tradedate"] = date.replace("/", "-")
    del df["code"], df["name"]
    return df


def get_shareholding():
    """Scrape daily shareholding tables for the "sh" and "sz" markets.

    For every day from one year ago up to (but excluding) today, the date is
    written into the page's date field, the search button is clicked, and the
    result table is parsed into a DataFrame. A failure on a single date is
    printed and skipped so the remaining dates are still processed.
    """
    today = datetime.date.today()
    # Ending the range at yesterday excludes today without relying on the
    # ``closed=`` keyword, which pandas 2.0 removed.
    date_list = pd.date_range(
        start=_one_year_before(today),
        end=today - datetime.timedelta(days=1),
        freq="1D",
    ).strftime("%Y/%m/%d")

    for exchange in ["sh", "sz"]:
        url = "http://www.hkexnews.hk/sdw/search/mutualmarket.aspx?t=" + exchange
        browser = get_browser()
        try:
            browser.get(url)
            for date in date_list:
                try:
                    js = "document.getElementById('txtShareholdingDate').value='{}';".format(
                        date
                    )
                    browser.execute_script(js)
                    browser.find_element(By.ID, "txtShareholdingDate").click()
                    browser.find_element(By.ID, "btnSearch").click()
                    df = _build_frame(_parse_rows(browser.page_source), date)
                    # Deliberate stop so ``df`` can be inspected; replace it
                    # with your own persistence step (e.g. a database insert).
                    import pdb

                    pdb.set_trace()
                    time.sleep(2)  # pause between requests
                except Exception as er:
                    # Best effort: report the failure and move on to the next date.
                    print(er)
        finally:
            # Always shut the driver down, even if loading the page failed.
            browser.quit()


if __name__ == "__main__":
    get_shareholding()
注意:
1.这里用pdb打断了, 输出结果是pandas的dataframe类型, 一般都会导入到数据库, 这里就不做演示了, 公司都会有自己封装好的方法
2.这个网站提供从当前日期往前推一年内每一天的数据, 页面上可以手动选择日期, 再点击SEARCH按钮查询,
这里用无界面Chrome浏览器 + 执行js设置日期 + click确认 实现这一操作
windows系统下 需要手动下载chromedriver.exe 版本号与本地浏览器匹配, 浏览器地址栏输入chrome://version/ 第一行就是版本号
exe本地路径通过webdriver的executable_path参数指定, 如果放在和模块同一目录下可以省略(这里就省略了)
下载地址 http://chromedriver.storage.googleapis.com/index.html
3.这个网站的原始数据格式比较特殊(持股量带千分位逗号, 持股比例末尾带符号, 股票代码要从名称末尾截取), 代码里已经做了清洗, 转化成常见的数值格式, 输入df.head()可预览数据的前五行
返回数据示例