zoukankan      html  css  js  c++  java
  • 转载 HtmlParser 抓取大众点评评论的代码

    import java.io.IOException;  
    import org.apache.commons.httpclient.HttpClient;
    import org.apache.commons.httpclient.HttpException;
    import org.apache.commons.httpclient.methods.GetMethod;
    import org.apache.commons.httpclient.params.HttpMethodParams;
    import org.htmlparser.NodeFilter;
    import org.htmlparser.Parser;
    import org.htmlparser.filters.NodeClassFilter;
    import org.htmlparser.filters.OrFilter;
    import org.htmlparser.tags.Div;
    import org.htmlparser.util.NodeList;
    import org.htmlparser.util.ParserException;

    /**
     * Downloads a Dianping shop page and prints the user reviews found in it.
     *
     * <p>The page is fetched with Commons HttpClient and scanned with HTMLParser for
     * {@code <div>} elements whose {@code id} attribute starts with {@code review_}.
     */
    public class TT {

        /** Shop page fetched by {@link #main(String[])}. */
        private static final String SHOP_URL = "http://www.dianping.com/shop/1968937";

        /** Browser-like User-Agent header value sent with the request. */
        private static final String USER_AGENT =
                "Mozilla/5.0 (X11; U; Linux i686; zh-CN; rv:1.9.1.2) Gecko/20090803 Fedora/3.5.2-2.fc11 Firefox/3.5.2";

        /** Prefix of the {@code id} attribute that marks a review {@code <div>}. */
        private static final String REVIEW_ID_PREFIX = "review_";

        /**
         * Fetches the hard-coded shop page and prints its reviews to standard output.
         *
         * @param args command-line arguments (unused)
         * @throws HttpException if the HTTP exchange fails at the protocol level
         * @throws IOException if a transport error occurs while fetching the page
         * @throws ParserException if the parser cannot be created from the downloaded text
         */
        public static void main(String[] args) throws HttpException, IOException, ParserException {
            String resource = getContent(SHOP_URL);
            getReview(resource);
        }

        /**
         * Performs an HTTP GET on {@code url} and returns the response body as a string.
         *
         * @param url address of the page to download
         * @return the response body as returned by {@code GetMethod.getResponseBodyAsString()}
         * @throws HttpException if the HTTP exchange fails at the protocol level
         * @throws IOException if a transport error occurs
         */
        public static String getContent(String url) throws HttpException, IOException {
            HttpClient client = new HttpClient();
            // Send a browser-like User-Agent with the request.
            client.getParams().setParameter(HttpMethodParams.USER_AGENT, USER_AGENT);

            GetMethod get = new GetMethod(url);
            try {
                client.executeMethod(get);
                return get.getResponseBodyAsString();
            } finally {
                // Always hand the connection back, even when the request fails.
                get.releaseConnection();
            }
        }

        /**
         * Parses {@code resource} and prints every review block to standard output.
         *
         * <p>A review block is a {@code <div>} whose {@code id} starts with {@code review_}.
         * Each one is printed after a numbered separator line, with its inner HTML cleaned by
         * {@link #cleanReviewHtml(String)}. A {@link ParserException} raised while scanning the
         * document is printed to standard error and ends the scan.
         *
         * <p>NOTE(review): {@code new Parser(String)} is believed to treat text that does not
         * start with {@code <} as a URL or file name rather than as markup - confirm against the
         * HTMLParser version in use.
         *
         * @param resource HTML text of the shop page
         * @throws ParserException if the parser cannot be created from {@code resource}
         */
        public static void getReview(String resource) throws ParserException {
            Parser parser = new Parser(resource);
            NodeFilter divFilter = new NodeClassFilter(Div.class);
            try {
                NodeList divs = parser.parse(divFilter);
                int count = 0;
                // Strictly less than size(): index size() is past the last element.
                for (int i = 0; i < divs.size(); i++) {
                    if (!(divs.elementAt(i) instanceof Div)) {
                        continue;
                    }
                    Div div = (Div) divs.elementAt(i);
                    String id = div.getAttribute("id");
                    if (id == null || !id.startsWith(REVIEW_ID_PREFIX)) {
                        continue;
                    }
                    count++;
                    System.out.println("--------------------------------" + count);
                    System.out.println(cleanReviewHtml(div.getChildrenHTML()));
                }
            } catch (ParserException e) {
                e.printStackTrace();
            }
        }

        /**
         * Strips markup from the inner HTML of a review block.
         *
         * <p>Removes {@code <p>...</p>} and {@code <span ...>...</span>} runs, turns
         * {@code <br/>} into a line break, and replaces non-breaking spaces (the {@code &nbsp;}
         * entity and the U+00A0 character) with plain spaces.
         *
         * <p>NOTE(review): both regular expressions are greedy, so on a single line everything
         * between the first opening tag and the last closing tag is removed - confirm this is
         * the intended behaviour before switching to reluctant quantifiers.
         *
         * @param html inner HTML of a review {@code <div>}
         * @return the cleaned text
         */
        private static String cleanReviewHtml(String html) {
            return html.replaceAll("<p>.*</p>", "")
                    .replaceAll("<span.*</span>", "")
                    .replace("<br/>", "\n")
                    .replace("&nbsp;", " ")
                    .replace('\u00A0', ' ');
        }

    }
  • 相关阅读:
    十六进制转十进制
    十进制转十六进制
    历届试题 高僧斗法
    历届试题 错误票据
    历届试题 大臣的旅费
    历届试题 九宫重排/八数码问题
    Skip the Class
    历届试题 剪格子
    leetcode 337. House Robber III
    猿辅导 2019年 校招提前批笔试
  • 原文地址:https://www.cnblogs.com/xiao0/p/2179957.html
Copyright © 2011-2022 走看看