zoukankan      html  css  js  c++  java
  • 抓取HTML网页数据

    (转)HTMLParser 过滤器(filter)的使用

    该类并不是通用的工具类,需要按自己的需求改写;这里只记录了 htmlparser.jar(org.htmlparser)包的一些用法,仅此而已。
    详细看这里:http://gundumw100.javaeye.com/blog/704311

     

    import java.util.*;   
    import org.htmlparser.Node;   
    import org.htmlparser.NodeFilter;   
    import org.htmlparser.Parser;   
    import org.htmlparser.filters.AndFilter;   
    import org.htmlparser.filters.HasAttributeFilter;   
    import org.htmlparser.filters.NodeClassFilter;   
    import org.htmlparser.filters.TagNameFilter;   
    import org.htmlparser.tags.BodyTag;   
    import org.htmlparser.tags.LinkTag;   
    import org.htmlparser.util.NodeList;   
    import org.htmlparser.util.ParserException;   
      
      
    public class HtmlparseUtil {   
        WebHttpClient util=new WebHttpClient();   
          
        public Map<String, String> linkGet(String url, String charset) {   
            String content=util.getWebContentByGet(url,charset);   
            Map<String, String> linkMap = new HashMap<String, String>();   
            try {   
                //开始解析   
                Parser parser = Parser.createParser(content, charset);   
                // 过滤出<a></a>标签   
                NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);   
                NodeList list = parser.extractAllNodesThatMatch(linkFilter);   
                Node node = null;   
                for (int i = 0; i < list.size(); i++) {   
                    node = list.elementAt(i);   
                    // 获得网页中的链接map(href,text)   
                    linkMap.put(((LinkTag) node).getLink(), this.processText(((LinkTag) node).getLinkText()));   
                }   
            } catch (ParserException e) {   
                e.printStackTrace();   
            }    
            return linkMap;   
        }   
      
          
        public String bodyGet(String url, String charset) {   
            String content=util.getWebContentByGet(url,charset);   
            String body = "";   
            try {   
                Parser parser = Parser.createParser(content, charset);   
                // 过滤<body></body>标签   
                NodeFilter bodyFilter = new NodeClassFilter(BodyTag.class);   
                NodeList list = parser.extractAllNodesThatMatch(bodyFilter);   
                Node node = null;   
                for (int i = 0; i < list.size(); i++) {   
                    node = list.elementAt(i);   
                    // 获得网页内容 保存在content中   
                    body = ((BodyTag) node).getBody();   
                }   
            } catch (ParserException e) {   
                e.printStackTrace();   
            }   
            return body;   
        }   
      
          
        public Map<String,String> termGet(String url, String charset) {   
            String content=util.getWebContentByGet(url,charset);   
               
            Map<String, String> map = new HashMap<String, String>();   
            try {   
                //开始解析   
                // 过滤出class为term的<span>元素   
                Parser parser = Parser.createParser(content, charset);   
                AndFilter filter =    
                    new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","term"));   
                   
                Node node = null;   
                NodeList nodeList = parser.parse(filter);   
                   
                for (int i = 0; i < nodeList.size(); i++) {   
                    node = nodeList.elementAt(i);   
                    map.put("term", node.toPlainTextString());   
                }   
                // 过滤出class为start-time的<span>元素   
                Parser parser2 = Parser.createParser(content, charset);   
                AndFilter filter2 =    
                    new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","start-time"));   
                NodeList nodeList2 = parser2.parse(filter2);   
                for (int i = 0; i < nodeList2.size(); i++) {   
                    node = nodeList2.elementAt(i);   
                    map.put("start-time", node.toPlainTextString());   
                }   
                // 过滤出id为J_SingleEndTimeLabel的<span>元素   
                Parser parser3 = Parser.createParser(content, charset);   
                AndFilter filter3 =    
                    new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("id","J_SingleEndTimeLabel"));   
                NodeList nodeList3 = parser3.parse(filter3);   
                for (int i = 0; i < nodeList3.size(); i++) {   
                    node = nodeList3.elementAt(i);   
                    map.put("end-time", node.toPlainTextString());   
                }   
                   
                // 过滤出class为box post的<div>元素   
                Parser parser4 = Parser.createParser(content, charset);   
                AndFilter filter4 =    
                    new AndFilter(new TagNameFilter("div"),new HasAttributeFilter("class","box post"));   
                NodeList nodeList4 = parser4.parse(filter4);   
                for (int i = 0; i < nodeList4.size(); i++) {   
                    node = nodeList4.elementAt(i);   
                    String temp=node.toPlainTextString().trim();   
                    temp=temp.substring(10,20).trim();   
                    map.put("pre-term", temp);   
                }   
                   
                // 过滤出class为J_AwardNumber的<span>元素   
                Parser parser5 = Parser.createParser(content, charset);   
    //          AndFilter filter5 =    
    //                new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","J_AwardNumber"));   
                NodeList nodeList5 = parser5.parse(new HasAttributeFilter("class","J_AwardNumber"));   
                StringBuffer buffer=new StringBuffer();   
                for (int i = 0; i < nodeList5.size(); i++) {   
                    node = nodeList5.elementAt(i);   
                    buffer.append(","+node.toPlainTextString());   
                }   
                buffer.append("|");   
                   
                // 过滤出class为blue J_AwardNumber的<span>元素   
                Parser parser6 = Parser.createParser(content, charset);   
    //          AndFilter filter6 =    
    //                new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","blue J_AwardNumber"));   
                NodeList nodeList6 = parser6.parse(new HasAttributeFilter("class","blue J_AwardNumber"));   
                for (int i = 0; i < nodeList6.size(); i++) {   
                    node = nodeList6.elementAt(i);   
                    buffer.append(node.toPlainTextString()+",");   
                }   
                   
                map.put("numbers", buffer.toString());   
            } catch (ParserException e) {   
                // TODO Auto-generated catch block   
                e.printStackTrace();   
            }   
               
            return map;   
        }   
           
        private String processText(String content){      
            content=content.trim().replaceAll("&nbsp;", "");      
    //      content=content.replaceAll("<p>", "
    ");      
    //      content=content.replaceAll("</TD>", "");      
    //      content=content.replaceAll("</div>", "");      
    //      content=content.replaceAll("</a>", "");      
    //      content=content.replaceAll("<a href=.*>", "");      
            return content;      
        }      
           
        public static void main(String[] str) {   
               
            String url="http://caipiao.taobao.com/lottery/order/lottery_dlt.htm?type=1";   
            HtmlparseUtil util=new HtmlparseUtil();   
            Map<String,String> map=util.termGet(url, "gb2312");   
            System.out.println("term="+map.get("term"));//<span class="term">第<em>10074</em>期</span>   
            System.out.println("start-time="+map.get("start-time"));//   
            System.out.println("end-time="+map.get("end-time"));//   
            System.out.println("pre-term="+map.get("pre-term"));//   
            System.out.println("numbers="+map.get("numbers"));//   
               
              
               
        }   
           
    }
  • 相关阅读:
    python路径相关
    python之json
    python之正则表达式备忘
    MD5 SHA1 HMAC HMAC_SHA1区别
    微信根据openid给用户发送图文消息
    最近做的几个小程序
    5000万pv小程序,高并发及缓存优化,入坑
    小程序 后台发送模板消息
    mysql 组合索引
    php 拆分txt小说章节保存到数据库
  • 原文地址:https://www.cnblogs.com/shide/p/3394957.html
Copyright © 2011-2022 走看看