zoukankan      html  css  js  c++  java
  • httpclient+jsoup实现网页信息抓取

    需求分析:抓取:http://tools.2345.com/rili.htm中的万年历(阳历、阴历等等)。

    1.首先为要抓取的内容创建一个实体类,用于封装各个字段。

    package com.wan.domain;
    
    public class Almanac {
    	private String solar;        /* 阳历 e.g.2016年 4月11日 星期一 */
    	private String lunar;        /* 阴历 e.g. 猴年 三月初五*/
    	private String chineseAra;    /* 天干地支纪年法 e.g.丙申年 壬辰月 癸亥日*/
    	private String should;         /* 宜e.g. 求子 祈福 开光 祭祀 安床*/
    	private String avoid;         /* 忌 e.g. 玉堂(黄道)危日,忌出行*/
    	public String getSolar() {
    		return solar;
    	}
    	public void setSolar(String solar) {
    		this.solar = solar;
    	}
    	public String getLunar() {
    		return lunar;
    	}
    	public void setLunar(String lunar) {
    		this.lunar = lunar;
    	}
    	public String getChineseAra() {
    		return chineseAra;
    	}
    	public void setChineseAra(String chineseAra) {
    		this.chineseAra = chineseAra;
    	}
    	public String getShould() {
    		return should;
    	}
    	public void setShould(String should) {
    		this.should = should;
    	}
    	public String getAvoid() {
    		return avoid;
    	}
    	public void setAvoid(String avoid) {
    		this.avoid = avoid;
    	}
    	 public Almanac(String solar, String lunar, String chineseAra, String should,String avoid) {
    		 this.solar = solar;
    		 this.lunar = lunar;
    		 this.chineseAra = chineseAra;
    		 this.should = should;
    		 this.avoid = avoid;
         }
    	
    }
    

    2.编写逻辑,实现抓取(需要导入相应的jar包:commons-httpclient-3.0.1.jar、commons-logging.jar、httpcore-4.4.jar、jsoup-1.7.3.jar、org.apache.httpcomponents.httpclient_4.5.3.jar)

    package com.wan.controller;
    
    import java.io.IOException;
    import java.text.SimpleDateFormat;
    import java.util.Calendar;
    import java.util.Date;
    
    import org.apache.http.HttpEntity;
    import org.apache.http.ParseException;
    import org.apache.http.client.ClientProtocolException;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.util.EntityUtils;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    import com.wan.domain.Almanac;
    
    /**
     * Static helper that downloads the 2345.com calendar page with Apache HttpClient
     * and extracts an {@link Almanac} entry from it with jsoup.
     */
    public class AlmanacUtil {
        /** Address of the calendar page that is scraped. */
        private static final String ALMANAC_URL = "http://tools.2345.com/rili.htm";

        /** Charset used to decode the body only when the response declares none. */
        private static final String FALLBACK_CHARSET = "UTF-8";

        /**
         * Utility class with static members only; not meant to be instantiated.
         */
        private AlmanacUtil() {
        }

        /**
         * Downloads the calendar page and parses the almanac information out of it.
         *
         * @return the parsed almanac entry, never {@code null}
         * @throws IllegalStateException if the page could not be downloaded or an
         *         expected element is missing from it
         */
        public static Almanac getAlmanac() {
            String html = pickData(ALMANAC_URL);
            if (html == null) {
                // Fail with a clear message instead of handing null to the HTML parser.
                throw new IllegalStateException("Could not download almanac page: " + ALMANAC_URL);
            }
            return analyzeHTMLByString(html);
        }

        /**
         * Fetches the body of {@code url} with an HTTP GET.
         *
         * @param url address to fetch
         * @return the response body as text, or {@code null} when the request failed
         *         or the response carried no entity
         */
        private static String pickData(String url) {
            CloseableHttpClient httpclient = HttpClients.createDefault();
            try {
                CloseableHttpResponse response = httpclient.execute(new HttpGet(url));
                try {
                    HttpEntity entity = response.getEntity();
                    if (entity != null) {
                        // The charset declared by the response still wins; the fallback
                        // only replaces the library default used when none is declared.
                        return EntityUtils.toString(entity, FALLBACK_CHARSET);
                    }
                } finally {
                    response.close();
                }
            } catch (ClientProtocolException e) {
                e.printStackTrace();
            } catch (ParseException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                // Close the client to release its connections.
                try {
                    httpclient.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            return null;
        }

        /**
         * Parses the page markup with jsoup and assembles the almanac entry.
         *
         * @param html markup of the calendar page, must not be {@code null}
         * @return the almanac entry built from the page
         * @throws IllegalStateException if an expected element id is absent
         */
        private static Almanac analyzeHTMLByString(String html) {
            Document document = Jsoup.parse(html);
            // Solar date
            String solarDate = getSolarDate(document, "bjtime");
            // Lunar date
            // NOTE(review): the fixed offsets (1..3 and 11) presumably match the markup
            // of #info_nong at the time of writing; confirm against the live page. A
            // layout change makes substring/child throw an index exception here.
            Element lunarElement = requireElement(document, "info_nong");
            String lunarDate = lunarElement.child(0).html().substring(1, 3)
                    + lunarElement.html().substring(11);
            // Stem-branch date
            String chineseAra = requireElement(document, "info_chang").text();
            // Recommended activities
            String should = getSuggestion(document, "yi");
            // Activities to avoid
            String avoid = getSuggestion(document, "ji");
            return new Almanac(solarDate, lunarDate, chineseAra, should, avoid);
        }

        /**
         * Looks up an element by id and fails with a descriptive message when absent.
         *
         * @param doc parsed page
         * @param id  element id to look up
         * @return the matching element, never {@code null}
         * @throws IllegalStateException if no element carries that id
         */
        private static Element requireElement(Document doc, String id) {
            Element element = doc.getElementById(id);
            if (element == null) {
                throw new IllegalStateException("Element #" + id + " not found in almanac page");
            }
            return element;
        }

        /**
         * Collects the text of every {@code <a>} tag under the element with the given
         * id; used for both the "should" and the "avoid" lists.
         *
         * @param doc parsed page
         * @param id  id of the container element
         * @return the link texts, each followed by a single space
         * @throws IllegalStateException if no element carries that id
         */
        private static String getSuggestion(Document doc, String id) {
            Elements links = requireElement(doc, id).getElementsByTag("a");
            StringBuilder sb = new StringBuilder();
            for (Element link : links) {
                sb.append(link.text()).append(' ');
            }
            return sb.toString();
        }

        /**
         * Formats the current system time as {@code yyyy年MM月dd日 EEEE}. The date is
         * not read from the page; both parameters are currently unused.
         *
         * @param doc parsed page (unused)
         * @param id  element id (unused)
         * @return the current date formatted as {@code yyyy年MM月dd日 EEEE}
         */
        private static String getSolarDate(Document doc, String id) {
            Date now = Calendar.getInstance().getTime();
            return new SimpleDateFormat("yyyy年MM月dd日 EEEE").format(now);
        }
    }
    

    注:公历时间并未从网页抓取,而是直接取自系统当前时间。

    3.编写测试

    package com.wan.test;
    
    import com.wan.controller.AlmanacUtil;
    import com.wan.domain.Almanac;
    
    /**
     * Console demo: fetches the almanac through {@link AlmanacUtil} and prints each
     * of its five fields on its own line, prefixed with a label.
     */
    public class Test {

        /**
         * Entry point; command-line arguments are ignored.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            Almanac today = AlmanacUtil.getAlmanac();
            show("公历时间", today.getSolar());
            show("农历时间", today.getLunar());
            show("天干地支", today.getChineseAra());
            show("宜", today.getShould());
            show("忌", today.getAvoid());
        }

        /** Writes the label immediately followed by the value to standard output. */
        private static void show(String label, String value) {
            System.out.println(label + value);
        }

    }
    

    最后在控制台输出:

     

  • 相关阅读:
    CSS的margin塌陷
    css white-space
    float的理解
    html标签元素分类
    Sublime text3使用技巧及快捷键
    JSON
    js原生Ajax的封装与使用
    XMLHttpRequest基础知识
    HTTP的一些基础知识
    创建兼容的XHR对象
  • 原文地址:https://www.cnblogs.com/wanyong-wy/p/7655293.html
Copyright © 2011-2022 走看看