需求分析:抓取 http://tools.2345.com/rili.htm 页面中的万年历信息(阳历、阴历、天干地支、宜、忌等)。
1. 首先为抓取到的内容创建一个实体类,实现数据封装。
package com.wan.domain; public class Almanac { private String solar; /* 阳历 e.g.2016年 4月11日 星期一 */ private String lunar; /* 阴历 e.g. 猴年 三月初五*/ private String chineseAra; /* 天干地支纪年法 e.g.丙申年 壬辰月 癸亥日*/ private String should; /* 宜e.g. 求子 祈福 开光 祭祀 安床*/ private String avoid; /* 忌 e.g. 玉堂(黄道)危日,忌出行*/ public String getSolar() { return solar; } public void setSolar(String solar) { this.solar = solar; } public String getLunar() { return lunar; } public void setLunar(String lunar) { this.lunar = lunar; } public String getChineseAra() { return chineseAra; } public void setChineseAra(String chineseAra) { this.chineseAra = chineseAra; } public String getShould() { return should; } public void setShould(String should) { this.should = should; } public String getAvoid() { return avoid; } public void setAvoid(String avoid) { this.avoid = avoid; } public Almanac(String solar, String lunar, String chineseAra, String should,String avoid) { this.solar = solar; this.lunar = lunar; this.chineseAra = chineseAra; this.should = should; this.avoid = avoid; } }
2.编写逻辑,实现抓取(需要导入相应的jar包:commons-httpclient-3.0.1.jar、commons-logging.jar、httpcore-4.4.jar、jsoup-1.7.3.jar、org.apache.httpcomponents.httpclient_4.5.3.jar)
package com.wan.controller;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Locale;

import org.apache.http.HttpEntity;
import org.apache.http.ParseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.wan.domain.Almanac;

/**
 * Static utility that downloads the 2345.com calendar page and extracts the
 * current day's almanac entry from it.
 *
 * <p>Every failure (network error, non-success HTTP status, missing page
 * element, unexpected markup) is reported as an {@link IllegalStateException}
 * carrying a descriptive message and, where available, the underlying cause.
 */
public class AlmanacUtil {

    /** Page the almanac data is scraped from. */
    private static final String ALMANAC_URL = "http://tools.2345.com/rili.htm";

    /**
     * Charset used to decode the body only when the response does not declare
     * one. Without an explicit fallback HttpCore decodes as ISO-8859-1, which
     * cannot represent Chinese text.
     */
    private static final String FALLBACK_CHARSET = "UTF-8";

    /** Not instantiable: the class only exposes static helpers. */
    private AlmanacUtil() {
    }

    /**
     * Fetches the calendar page and parses today's almanac entry.
     *
     * @return the parsed almanac entry, never {@code null}
     * @throws IllegalStateException if the page cannot be fetched or does not
     *         have the expected structure
     */
    public static Almanac getAlmanac() {
        String html = pickData(ALMANAC_URL);
        return analyzeHTMLByString(html);
    }

    /**
     * Downloads the body of {@code url} as text.
     *
     * @param url address to fetch with an HTTP GET
     * @return the response body, never {@code null}
     * @throws IllegalStateException on I/O failure, a non-2xx status, or a
     *         response without a body
     */
    private static String pickData(String url) {
        CloseableHttpClient httpclient = HttpClients.createDefault();
        try {
            CloseableHttpResponse response = httpclient.execute(new HttpGet(url));
            try {
                int status = response.getStatusLine().getStatusCode();
                if (status < 200 || status >= 300) {
                    throw new IllegalStateException(
                            "Unexpected HTTP status " + status + " for " + url);
                }
                HttpEntity entity = response.getEntity();
                if (entity == null) {
                    throw new IllegalStateException("Empty response body for " + url);
                }
                // A charset declared in the Content-Type header still wins;
                // the second argument is only the default.
                return EntityUtils.toString(entity, FALLBACK_CHARSET);
            } finally {
                response.close();
            }
        } catch (ClientProtocolException e) {
            throw new IllegalStateException("HTTP protocol error while fetching " + url, e);
        } catch (ParseException e) {
            throw new IllegalStateException("Unparseable response header from " + url, e);
        } catch (IOException e) {
            throw new IllegalStateException("I/O error while fetching " + url, e);
        } finally {
            try {
                httpclient.close();
            } catch (IOException e) {
                // Failing to release the client must not mask the result or
                // the primary exception, so it is only reported.
                e.printStackTrace();
            }
        }
    }

    /**
     * Parses the downloaded page with jsoup and builds the almanac entry.
     *
     * @param html page markup
     * @return the populated entry
     * @throws IllegalStateException if a required element is missing or its
     *         content is shorter than the fixed offsets require
     */
    private static Almanac analyzeHTMLByString(String html) {
        Document document = Jsoup.parse(html);

        String solarDate = getSolarDate();
        String lunarDate = parseLunarDate(requireElement(document, "info_nong"));
        String chineseAra = requireElement(document, "info_chang").text();
        String should = getSuggestion(document, "yi");
        String avoid = getSuggestion(document, "ji");

        return new Almanac(solarDate, lunarDate, chineseAra, should, avoid);
    }

    /**
     * Builds the lunar date text from the {@code info_nong} element.
     *
     * <p>The result is characters 1-2 of the first child's inner HTML followed
     * by the element's own inner HTML from offset 11 onward. These offsets are
     * tied to the page markup; presumably the first part is the zodiac year
     * and the second the month/day following the child's markup (unverified
     * against the live page).
     *
     * @param lunarElement the {@code info_nong} element
     * @return the assembled lunar date text
     * @throws IllegalStateException if the element has no child or either
     *         HTML fragment is too short for the offsets
     */
    private static String parseLunarDate(Element lunarElement) {
        if (lunarElement.children().isEmpty()) {
            throw new IllegalStateException("Element #info_nong has no child elements");
        }
        String childHtml = lunarElement.child(0).html();
        String ownHtml = lunarElement.html();
        if (childHtml.length() < 3 || ownHtml.length() < 11) {
            throw new IllegalStateException(
                    "Unexpected content in element #info_nong: " + ownHtml);
        }
        return childHtml.substring(1, 3) + ownHtml.substring(11);
    }

    /**
     * Collects the text of every {@code <a>} inside the element with the given
     * id; used for both the "should" ({@code yi}) and "avoid" ({@code ji})
     * lists.
     *
     * @param doc parsed page
     * @param id  id of the container element
     * @return the link texts, each followed by a single space
     * @throws IllegalStateException if no element with that id exists
     */
    private static String getSuggestion(Document doc, String id) {
        Elements links = requireElement(doc, id).getElementsByTag("a");
        StringBuilder sb = new StringBuilder();
        for (Element link : links) {
            sb.append(link.text()).append(' ');
        }
        return sb.toString();
    }

    /**
     * Looks up an element by id and fails with a clear message when absent.
     *
     * @param doc parsed page
     * @param id  element id
     * @return the element, never {@code null}
     * @throws IllegalStateException if the element is not present
     */
    private static Element requireElement(Document doc, String id) {
        Element element = doc.getElementById(id);
        if (element == null) {
            throw new IllegalStateException(
                    "Element #" + id + " not found; the page layout may have changed");
        }
        return element;
    }

    /**
     * Formats the current system date as {@code yyyy年MM月dd日 EEEE}.
     *
     * <p>The solar date is taken from the local clock, not scraped from the
     * page. The weekday is rendered with {@link Locale#CHINA} so the text is
     * Chinese regardless of the JVM default locale.
     *
     * @return today's date, e.g. a value of the form "2016年04月11日 星期一"
     */
    private static String getSolarDate() {
        Date now = Calendar.getInstance().getTime();
        SimpleDateFormat formatter = new SimpleDateFormat("yyyy年MM月dd日 EEEE", Locale.CHINA);
        return formatter.format(now);
    }
}
注:公历时间并未从网页中抓取,而是直接取自系统当前时间。
3.编写测试
package com.wan.test; import com.wan.controller.AlmanacUtil; import com.wan.domain.Almanac; public class Test { public static void main(String[] args) { // TODO Auto-generated method stub Almanac almanac=AlmanacUtil.getAlmanac(); System.out.println("公历时间"+almanac.getSolar()); System.out.println("农历时间"+almanac.getLunar()); System.out.println("天干地支"+almanac.getChineseAra()); System.out.println("宜"+almanac.getShould()); System.out.println("忌"+almanac.getAvoid()); } }
最后在控制台输出: