一 Jsoup包
下载链接:http://download.csdn.net/detail/u014000832/7994245
二 爬取搜狐新闻网站标题等内容
package com.test1; import java.io.IOException; import org.jsoup.*; import org.jsoup.helper.*; import org.jsoup.nodes.*; import org.jsoup.parser.*; import org.jsoup.select.*; public class HtmlParser { static void print(Document doc){ //<h1 itemprop="headline">日本物流公司瞄准中国海淘族 跨境快递4天到货</h1> Elements h = doc.select("h1[itemprop]");//标题 System.out.println(h.text()); Elements time = doc.select("div.time");//时间 System.out.println(time.text()); /* Element source = doc.select("span[itemprop=name]").first();//来源 System.out.println(source.text()); */ Elements source = doc.select("div[class=source]"); System.out.println(source.text()); Elements body = doc.select("div[itemprop=articleBody]"); System.out.println(body.text()); } public static void main(String[] args) { // TODO Auto-generated method stub try { String url = "http://news.sohu.com/"; Document doc,TempDoc; doc = Jsoup.connect(url).get(); //System.out.println(doc); //findalllinks Elements links = doc.select("a[href]"); for (Element link : links){ String StrUrl = link.attr("abs:href"); if (StrUrl.startsWith("http://news.sohu.com/2016")) { System.out.println(StrUrl); TempDoc = Jsoup.connect(StrUrl).get(); print(TempDoc); } } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }
原文:http://blog.csdn.net/u012315428/article/details/51136490 (请复制链接后在浏览器中打开)