要爬取的凤凰财经网址:http://app.finance.ifeng.com/list/stock.php?t=hs
本作主要采用的技术是jsoup,相关介绍网页:https://www.jianshu.com/p/69b395bee43a
爬取程序:
package com.ufo.hy.agumaster.tool;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.ufo.hy.agumaster.entity.Stock;

/**
 * Crawls stock codes and names from the Fenghuang (ifeng) finance listing at
 * http://app.finance.ifeng.com/list/stock.php?t=hs, following the "next page"
 * link until no further page is offered.
 *
 * <p>The crawl runs in the constructor; the collected entries are available
 * afterwards through {@link #getStockList()}.
 *
 * <p>Main library: jsoup (Maven coordinates {@code org.jsoup:jsoup:1.7.3}).
 *
 * @author heyang
 */
public class FenghuangCrawler {
    /** First page of the stock listing. */
    private static final String SRC_URL = "http://app.finance.ifeng.com/list/stock.php?t=hs";

    /** Listing URL without a query string; pagination links carry only an href suffix that is appended to it. */
    private static final String LIST_URL = "http://app.finance.ifeng.com/list/stock.php";

    /** Charset used to decode the downloaded pages. */
    private static final String ENCODING = "utf-8";

    /** CSS class of the element that holds both the stock rows and the pagination links. */
    private static final String TABLE_CLASS = "tab01";

    /** Text of the pagination link that leads to the following page. */
    private static final String NEXT_PAGE_TEXT = "下一页";

    /** Upper bound for establishing the HTTP connection, so a dead host cannot block the crawl forever. */
    private static final int CONNECT_TIMEOUT_MS = 10000;

    /** Upper bound for a single blocking read on the response stream. */
    private static final int READ_TIMEOUT_MS = 30000;

    /** Stocks collected so far, in the order they were encountered. */
    private final List<Stock> stockList;

    /**
     * Crawls every page of the listing, prints each visited URL, then prints
     * all collected stocks followed by the total count.
     *
     * <p>The crawl stops when a page has no "next page" link, or when a page
     * does not contain the expected table element (for example because the
     * download failed and an empty document was parsed).
     */
    public FenghuangCrawler() {
        stockList = new ArrayList<Stock>();

        String url = SRC_URL;
        int idx = 0;
        while (url != null) {
            System.out.println(url);

            String html = getUrlHtml(url, ENCODING);
            // The second argument of Jsoup.parse(String, String) is the base URI
            // of the document, not a charset, so pass the page URL.
            Document doc = Jsoup.parse(html, url);

            // The last element carrying the table class is the core node.
            Element table = doc.getElementsByClass(TABLE_CLASS).last();
            if (table == null) {
                System.err.println("No element with class '" + TABLE_CLASS + "' found at " + url
                        + "; stopping the crawl.");
                break;
            }

            idx = collectStocks(table, idx);
            url = findNextPageUrl(table);
        }

        for (Stock s : stockList) {
            System.out.println(s);
        }
        System.out.println("共找到" + idx + "个股票.");
    }

    /**
     * Extracts one {@link Stock} per table row that has more than two cells
     * and appends it to {@link #stockList}.
     *
     * @param table  element whose {@code tr} descendants are scanned
     * @param nextId id to assign to the first stock found
     * @return the id to assign to the next stock after this page
     */
    private int collectStocks(Element table, int nextId) {
        Elements rows = table.getElementsByTag("tr");
        for (Element row : rows) {
            Elements cells = row.getElementsByTag("td");
            if (cells.size() <= 2) {
                // Rows with at most two cells are not treated as stock rows.
                continue;
            }
            Element codeElm = cells.get(0).getElementsByTag("a").last();
            Element nameElm = cells.get(1).getElementsByTag("a").last();
            if (codeElm == null || nameElm == null) {
                // A row without both links has no code/name to read; skip it
                // instead of failing the whole crawl.
                continue;
            }
            stockList.add(new Stock(nextId++, codeElm.text(), nameElm.text()));
        }
        return nextId;
    }

    /**
     * Determines the URL of the following page from the last link inside the
     * table element.
     *
     * @param table element that contains the pagination links
     * @return the absolute URL of the next page, or {@code null} when the last
     *         link is missing or is not the "next page" link
     */
    private String findNextPageUrl(Element table) {
        Element lastLink = table.getElementsByTag("a").last();
        if (lastLink != null && NEXT_PAGE_TEXT.equals(lastLink.text())) {
            return LIST_URL + lastLink.attr("href");
        }
        return null;
    }

    /**
     * Downloads the content behind {@code url} and returns it as a single
     * string in which each line read is followed by one space.
     *
     * <p>Failures are reported with a stack trace and do not propagate; in that
     * case whatever was read before the failure (possibly nothing) is returned.
     *
     * @param url      address to download
     * @param encoding charset name used to decode the response bytes
     * @return the downloaded text, never {@code null}
     */
    private String getUrlHtml(String url, String encoding) {
        StringBuilder sb = new StringBuilder();
        InputStream in = null;
        try {
            URLConnection conn = new URL(url).openConnection();
            conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
            conn.setReadTimeout(READ_TIMEOUT_MS);

            in = conn.getInputStream();
            BufferedReader br = new BufferedReader(new InputStreamReader(in, encoding));
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append(" ");
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close the raw stream so it is released even when building the
            // reader fails (e.g. unsupported encoding name).
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return sb.toString();
    }

    /**
     * Returns the stocks collected by the constructor.
     *
     * @return the internal list of collected stocks
     */
    public List<Stock> getStockList() {
        return stockList;
    }

    /**
     * Runs the crawler once and prints the result to standard output.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Set the HTTP proxy as needed (left empty here).
        System.setProperty("http.proxyHost", "");
        System.setProperty("http.proxyPort", "");

        new FenghuangCrawler();
    }
}
运行结果节选:
...
Stock id:3743 code:002752 name:昇兴股份 Stock id:3744 code:000796 name:凯撒旅业 Stock id:3745 code:603233 name:大参林 Stock id:3746 code:000048 name:京基智农 Stock id:3747 code:300463 name:迈克生物 Stock id:3748 code:300485 name:赛升药业 Stock id:3749 code:603387 name:基蛋生物 Stock id:3750 code:002469 name:三维工程 Stock id:3751 code:600052 name:浙江广厦 Stock id:3752 code:002187 name:广百股份 Stock id:3753 code:300069 name:金利华电 Stock id:3754 code:300317 name:珈伟新能 Stock id:3755 code:002637 name:赞宇科技 Stock id:3756 code:001914 name:招商积余 Stock id:3757 code:000564 name:供销大集 Stock id:3758 code:002363 name:隆基机械 Stock id:3759 code:603709 name:中源家居 Stock id:3760 code:000802 name:北京文化 Stock id:3761 code:002127 name:南极电商 Stock id:3762 code:600107 name:美尔雅 Stock id:3763 code:002678 name:珠江钢琴 Stock id:3764 code:002083 name:孚日股份 Stock id:3765 code:300325 name:德威新材 共找到3766个股票.
这是2020年5月1日的数据。
参考资料:
https://www.jianshu.com/p/3430f4d0b384 https://blog.csdn.net/qq_28940573/article/details/99295276
--2020-04-30--