  • HtmlparseUtil.java

    This class is not a general-purpose utility; you will need to adapt it to your own requirements. It only records some usages of the Htmlparse.jar package, nothing more!
    For details, see: http://gundumw100.iteye.com/blog/704311

     
    import java.util.*;  
    import org.htmlparser.Node;  
    import org.htmlparser.NodeFilter;  
    import org.htmlparser.Parser;  
    import org.htmlparser.filters.AndFilter;  
    import org.htmlparser.filters.HasAttributeFilter;  
    import org.htmlparser.filters.NodeClassFilter;  
    import org.htmlparser.filters.TagNameFilter;  
    import org.htmlparser.tags.BodyTag;  
    import org.htmlparser.tags.LinkTag;  
    import org.htmlparser.util.NodeList;  
    import org.htmlparser.util.ParserException;  
      
    /**
     * Web page parsing with httpclient and htmlparse.
     *
     * @author Administrator
     *
     */
    public class HtmlparseUtil {  
        WebHttpClient util=new WebHttpClient();  
        /**
         * Collects the hyperlinks on a page, storing each href and its text in a Map: map(href, text).
         * @param url
         * @param charset
         * @return
         */
        public Map<String, String> linkGet(String url, String charset) {  
            String content=util.getWebContentByGet(url,charset);  
            Map<String, String> linkMap = new HashMap<String, String>();  
            try {  
                // start parsing
                Parser parser = Parser.createParser(content, charset);  
                // filter for <a></a> tags
                NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);  
                NodeList list = parser.extractAllNodesThatMatch(linkFilter);  
                Node node = null;  
                for (int i = 0; i < list.size(); i++) {  
                    node = list.elementAt(i);  
                    // store each link in the page as map(href, text)
                    linkMap.put(((LinkTag) node).getLink(), this.processText(((LinkTag) node).getLinkText()));  
                }  
            } catch (ParserException e) {  
                e.printStackTrace();  
            }   
            return linkMap;  
        }  
      
        /**
         * Returns the content of the page's <body></body> tag.
         * @param url
         * @param charset
         * @return
         */
        public String bodyGet(String url, String charset) {  
            String content=util.getWebContentByGet(url,charset);  
            String body = "";  
            try {  
                Parser parser = Parser.createParser(content, charset);  
                // filter for the <body></body> tag
                NodeFilter bodyFilter = new NodeClassFilter(BodyTag.class);  
                NodeList list = parser.extractAllNodesThatMatch(bodyFilter);  
                Node node = null;  
                for (int i = 0; i < list.size(); i++) {  
                    node = list.elementAt(i);  
                    // store the page content in body
                    body = ((BodyTag) node).getBody();  
                }  
            } catch (ParserException e) {  
                e.printStackTrace();  
            }  
            return body;  
        }  
      
        /**
         * Filters for the <span> elements whose class is "term" (plus several related elements) and returns their text.
         * @param url
         * @param charset
         * @return
         */
        public Map<String,String> termGet(String url, String charset) {  
            String content=util.getWebContentByGet(url,charset);  
              
            Map<String, String> map = new HashMap<String, String>();  
            try {  
                // start parsing
                // filter for <span> elements with class "term"
                Parser parser = Parser.createParser(content, charset);  
                AndFilter filter =   
                    new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","term"));  
                  
                Node node = null;  
                NodeList nodeList = parser.parse(filter);  
                  
                for (int i = 0; i < nodeList.size(); i++) {  
                    node = nodeList.elementAt(i);  
                    map.put("term", node.toPlainTextString());  
                }  
                // filter for <span> elements with class "start-time"
                Parser parser2 = Parser.createParser(content, charset);  
                AndFilter filter2 =   
                    new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","start-time"));  
                NodeList nodeList2 = parser2.parse(filter2);  
                for (int i = 0; i < nodeList2.size(); i++) {  
                    node = nodeList2.elementAt(i);  
                    map.put("start-time", node.toPlainTextString());  
                }  
                // filter for <span> elements with id "J_SingleEndTimeLabel"
                Parser parser3 = Parser.createParser(content, charset);  
                AndFilter filter3 =   
                    new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("id","J_SingleEndTimeLabel"));  
                NodeList nodeList3 = parser3.parse(filter3);  
                for (int i = 0; i < nodeList3.size(); i++) {  
                    node = nodeList3.elementAt(i);  
                    map.put("end-time", node.toPlainTextString());  
                }  
                  
                // filter for <div> elements with class "box post"
                Parser parser4 = Parser.createParser(content, charset);  
                AndFilter filter4 =   
                    new AndFilter(new TagNameFilter("div"),new HasAttributeFilter("class","box post"));  
                NodeList nodeList4 = parser4.parse(filter4);  
                for (int i = 0; i < nodeList4.size(); i++) {  
                    node = nodeList4.elementAt(i);  
                    String temp=node.toPlainTextString().trim();  
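                    // NOTE: the fixed substring(10, 20) slice below depends on this particular page layout; adjust it if the page changes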
                    temp=temp.substring(10,20).trim();  
                    map.put("pre-term", temp);  
                }  
                  
                // filter for <span> elements with class "J_AwardNumber"
                Parser parser5 = Parser.createParser(content, charset);  
    //          AndFilter filter5 =   
    //                new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","J_AwardNumber"));  
                NodeList nodeList5 = parser5.parse(new HasAttributeFilter("class","J_AwardNumber"));  
                StringBuffer buffer=new StringBuffer();  
                for (int i = 0; i < nodeList5.size(); i++) {  
                    node = nodeList5.elementAt(i);  
                    buffer.append(","+node.toPlainTextString());  
                }  
                buffer.append("|");  
                  
                // filter for <span> elements with class "blue J_AwardNumber"
                Parser parser6 = Parser.createParser(content, charset);  
    //          AndFilter filter6 =   
    //                new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","blue J_AwardNumber"));  
                NodeList nodeList6 = parser6.parse(new HasAttributeFilter("class","blue J_AwardNumber"));  
                for (int i = 0; i < nodeList6.size(); i++) {  
                    node = nodeList6.elementAt(i);  
                    buffer.append(node.toPlainTextString()+",");  
                }  
                  
                map.put("numbers", buffer.toString());  
            } catch (ParserException e) {  
                e.printStackTrace();  
            }  
              
            return map;  
        }  
          
        private String processText(String content){     
            content=content.trim().replaceAll("&nbsp;", "");     
    //      content=content.replaceAll("<p>", "\n");
    //      content=content.replaceAll("</TD>", "");     
    //      content=content.replaceAll("</div>", "");     
    //      content=content.replaceAll("</a>", "");     
    //      content=content.replaceAll("<a href=.*>", "");     
            return content;     
        }     
          
        public static void main(String[] str) {  
              
            String url="http://caipiao.taobao.com/lottery/order/lottery_dlt.htm?type=1";  
            HtmlparseUtil util=new HtmlparseUtil();  
            Map<String,String> map=util.termGet(url, "gb2312");  
            System.out.println("term="+map.get("term"));//<span class="term">第<em>10074</em>期</span>  
            System.out.println("start-time="+map.get("start-time"));//  
            System.out.println("end-time="+map.get("end-time"));//  
            System.out.println("pre-term="+map.get("pre-term"));//  
            System.out.println("numbers="+map.get("numbers"));//  
              
            /* 
            Map<String, String> linkMap = util.linkGet(url, "gb2312"); 
            for (String s : linkMap.keySet()) { 
                System.out.println(s + " = " + linkMap.get(s)); 
            // if it is a link, also fetch the content of its <body>
    //          if (s.startsWith("http")) { 
    //              util.bodyGet(s, "gb2312"); 
    //          } 
            } 
            */  
              
        }  
          
    }  
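
    The class above depends on a WebHttpClient helper with a getWebContentByGet(url, charset) method, which the post does not include. Below is a minimal sketch of what such a helper might look like, assuming it only needs to issue an HTTP GET and return the response body decoded with the given charset; this HttpURLConnection-based version is an assumption, not the original author's implementation.

    import java.io.BufferedReader;
    import java.io.InputStreamReader;
    import java.net.HttpURLConnection;
    import java.net.URL;

    /**
     * Minimal sketch of the WebHttpClient dependency (assumed implementation).
     * It issues an HTTP GET and returns the response body decoded with the
     * supplied charset; swap in your own HttpClient wrapper as needed.
     */
    public class WebHttpClient {

        public String getWebContentByGet(String url, String charset) {
            StringBuilder sb = new StringBuilder();
            HttpURLConnection conn = null;
            try {
                conn = (HttpURLConnection) new URL(url).openConnection();
                conn.setRequestMethod("GET");
                conn.setConnectTimeout(5000);
                conn.setReadTimeout(5000);
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(conn.getInputStream(), charset));
                String line;
                while ((line = reader.readLine()) != null) {
                    sb.append(line).append('\n');
                }
                reader.close();
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                if (conn != null) {
                    conn.disconnect();
                }
            }
            return sb.toString();
        }
    }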