zoukankan      html  css  js  c++  java
  • htmlparser实现从网页上抓取数据(收集)

    http://luoye1989hzx.blog.163.com/blog/static/1699218892010828364066/


    本文引用自luoye1989《htmlparser实现从网页上抓取数据(收集)》

    package parser;



    import java.io.BufferedReader;

    import java.io.BufferedWriter;

    import java.io.FileWriter;

    import java.io.IOException;

    import java.io.InputStream;

    import java.io.InputStreamReader;

    import java.net.MalformedURLException;

    import java.net.URL;



    /**
     * Downloads one web page and saves its HTML to a local file.
     *
     * <p>The page is read line by line, echoed to standard output and then written to the
     * output file. With no command-line arguments the hard-coded URL and output path below
     * are used; both can be overridden from the command line.
     *
     * @author chenguoyong
     */
    public class ScrubSelectedWeb {

        /** Platform line separator appended after every line read from the page. */
        private final static String CRLF = System.getProperty("line.separator");

        /** Page fetched when no URL is given on the command line. */
        private static final String DEFAULT_URL = "http://10.249.187.199:8083/injs100/";

        /** File written when no output path is given on the command line. */
        private static final String DEFAULT_OUTPUT = "D:/outPut.txt";

        /**
         * Fetches the page, prints it and stores it in the output file.
         *
         * @param args optional; {@code args[0]} is the URL to fetch and {@code args[1]} the
         *             file to write. Missing values fall back to the defaults.
         */
        public static void main(String[] args) {
            String pageUrl = (args != null && args.length > 0) ? args[0] : DEFAULT_URL;
            String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT;

            // try-with-resources closes both streams even when reading or writing fails;
            // previously they were closed on the success path only and leaked on error.
            // NOTE(review): reader and writer still use the platform default charset, so
            // pages in another encoding may be garbled - confirm the target page encoding.
            try (BufferedReader in = new BufferedReader(
                            new InputStreamReader(new URL(pageUrl).openStream()));
                    BufferedWriter out = new BufferedWriter(new FileWriter(outputPath))) {
                StringBuilder page = new StringBuilder();
                String line;
                while ((line = in.readLine()) != null) {
                    page.append(line).append(CRLF);
                }
                System.out.println(page);
                out.write(page.toString());
            } catch (MalformedURLException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    基本能实现网页抓取,不过要手动输入URL,此外没有重构。只是一个简单的思路。


    1.htmlparser 使用

    htmlparser是一个纯java写的html解析库,htmlparser不依赖于其它的java库,htmlparser主要用于改造或提取html。htmlparser能超高速解析html,而且不会出错。毫不夸张地说,htmlparser就是目前最好的html解析和分析的工具。无论你是想抓取网页数据还是改造html的内容,用了htmlparser绝对会忍不住称赞。由于htmlparser结构设计精良,所以扩展htmlparser非常便利。



    Htmlparser中文论坛. http://bbs.hexiao.cn/thread.php?fid=6


    Constructor Summary


    Parser()

    Parser(URLConnection connection)
    Construct a parser using the provided URLConnection.

    Method:

    static Parser createParser(String html, String charset)
    Creates the parser on an input string.

    void visitAllNodesWith(NodeVisitor visitor)
    Apply the given visitor to the current page.



    HtmlPage(Parser parser)
    NodeList
    getBody()


    TableTag[]
    getTables()


    String
    getTitle()


    void
    setTitle(String title)


    void
    visitTag(Tag tag)
    Called for each Tag visited.





    Constructor Summary

    NodeList()



    NodeList(Node node)
    Create a one element node list.



    NodeList extractAllNodesThatMatch(NodeFilter filter)
    Filter the list with the given filter non-recursively.

    NodeList extractAllNodesThatMatch(NodeFilter filter, boolean recursive)
    Filter the list with the given filter.

    Node elementAt(int i)




    1. html代码里面所有的链接地址和链接名称



    package parser;



    import org.htmlparser.Parser;

    import org.htmlparser.Node;

    import org.htmlparser.NodeFilter;

    import org.htmlparser.Parser;

    import org.htmlparser.filters.TagNameFilter;

    import org.htmlparser.tags.LinkTag;

    import org.htmlparser.tags.TableTag;

    import org.htmlparser.util.NodeList;

    import org.htmlparser.util.ParserException;

    import org.htmlparser.visitors.HtmlPage;



    /**
     * Uses HtmlParser to list the address and the text of every link in an HTML snippet.
     *
     * @author chenguoyong
     */
    public class Testhtmlparser {

        /**
         * Parses a fixed HTML string and prints, for each {@code <a>} tag found in the body,
         * its {@code href} attribute followed by its inner text.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            String htmlcode = "<HTML><HEAD><TITLE>AAA</TITLE></HEAD><BODY>"
                    + "<a href='http://topic.csdn.net/u/20080522/14/0ff402ef-c382-499a-8213-ba6b2f550425.html'>连接1</a>"
                    + "<a href='http://topic.csdn.net'>连接2</a></BODY></HTML>";

            // Build the parser from the string and the given encoding.
            Parser parser = Parser.createParser(htmlcode, "GBK");
            // HtmlPage is a visitor that collects the page's title, body and tables.
            HtmlPage page = new HtmlPage(parser);
            try {
                parser.visitAllNodesWith(page);
            } catch (ParserException e) {
                // The failure used to be discarded (e1 = null); report it and stop instead
                // of continuing with a page that was never filled in.
                e.printStackTrace();
                return;
            }

            // All nodes of the body, narrowed (recursively) to the <a> tags.
            NodeFilter linkFilter = new TagNameFilter("A");
            NodeList links = page.getBody().extractAllNodesThatMatch(linkFilter, true);

            for (int i = 0; i < links.size(); i++) {
                LinkTag link = (LinkTag) links.elementAt(i);
                // Link address.
                System.out.println(link.getAttribute("href") + "\n");
                // Link text.
                System.out.println(link.getStringText());
            }
        }
    }



    结果如下:

    http://topic.csdn.net/u/20080522/14/0ff402ef-c382-499a-8213-ba6b2f550425.html

    连接1

    http://topic.csdn.net

    连接2


    2. 使用HtmlParser抓取网页内容

    package parser;



    import org.htmlparser.Parser;

    import org.htmlparser.beans.StringBean;

    import org.htmlparser.filters.NodeClassFilter;

    import org.htmlparser.parserapplications.StringExtractor;

    import org.htmlparser.tags.BodyTag;

    import org.htmlparser.util.NodeList;

    import org.htmlparser.util.ParserException;



    /**
     * Shows three ways of extracting the content of a web page with HtmlParser: through
     * {@link StringBean}, through the {@link StringExtractor} sample application (which,
     * per the original author's note, wraps the same StringBean mechanism), and by
     * parsing the page and reading its body tag directly.
     *
     * @author chenguoyong
     */
    public class GetContent {

        /**
         * Prints the text of the page at {@code url} using a {@link StringBean}.
         *
         * @param url address of the page to read
         */
        public void getContentUsingStringBean(String url) {
            StringBean sb = new StringBean();
            sb.setLinks(true); // whether the links of the page are included in the output
            // The next two options are usually set to true for tidy output; set them to
            // false to keep the page's original layout (e.g. indentation of code listings).
            sb.setCollapse(true); // collapse each run of whitespace into a single character
            sb.setReplaceNonBreakingSpaces(true); // turn non-breaking spaces into regular spaces
            // Use the caller's URL; previously the parameter was ignored in favour of a
            // hard-coded address.
            sb.setURL(url);
            System.out.println("The Content is :\n" + sb.getStrings());
        }

        /**
         * Prints the text of the page at {@code url} using {@link StringExtractor}.
         *
         * @param url  address of the page to read
         * @param link passed to {@code extractStrings}; controls whether links are included
         */
        public void getContentUsingStringExtractor(String url, boolean link) {
            StringExtractor se = new StringExtractor(url);
            try {
                String text = se.extractStrings(link);
                System.out.println("The content is :\n" + text);
            } catch (ParserException e) {
                e.printStackTrace();
            }
        }

        /**
         * Parses the page at {@code url} and prints the plain text of its first body tag.
         * Per the original author's note the output keeps the original formatting and
         * includes any JavaScript source.
         *
         * @param url address of the page to read
         */
        public void getContentUsingParser(String url) {
            try {
                Parser p = new Parser(url);
                NodeList bodies = p.parse(new NodeClassFilter(BodyTag.class));
                // A page without a <body> yields an empty list; stop instead of
                // dereferencing a missing element.
                if (bodies.size() == 0) {
                    System.err.println("No body tag found in " + url);
                    return;
                }
                BodyTag bt = (BodyTag) bodies.elementAt(0);
                System.out.println(bt.toPlainTextString());
            } catch (ParserException e) {
                e.printStackTrace();
            }
        }

        /**
         * Runs the StringBean variant against a sample blog page.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            String url = "http://www.blogjava.net/51AOP/archive/2006/07/19/59064.html";
            // Alternative: new GetContent().getContentUsingParser(url);
            new GetContent().getContentUsingStringBean(url);
        }
    }


    3.将整个html内容保存到指定文件



    package parser;



    import java.io.BufferedReader;

    import java.io.BufferedWriter;

    import java.io.FileWriter;

    import java.io.IOException;

    import java.io.InputStream;

    import java.io.InputStreamReader;

    import java.net.MalformedURLException;

    import java.net.URL;



    /**
     * Downloads one web page and saves its HTML to a local file.
     *
     * <p>The page is read line by line, echoed to standard output and then written to the
     * output file. With no command-line arguments the hard-coded URL and output path below
     * are used; both can be overridden from the command line.
     *
     * @author chenguoyong
     */
    public class ScrubSelectedWeb {

        /** Platform line separator appended after every line read from the page. */
        private final static String CRLF = System.getProperty("line.separator");

        /** Page fetched when no URL is given on the command line. */
        private static final String DEFAULT_URL = "http://www.google.cn/";

        /** File written when no output path is given on the command line. */
        private static final String DEFAULT_OUTPUT = "D:/outPut.txt";

        /**
         * Fetches the page, prints it and stores it in the output file.
         *
         * @param args optional; {@code args[0]} is the URL to fetch and {@code args[1]} the
         *             file to write. Missing values fall back to the defaults.
         */
        public static void main(String[] args) {
            String pageUrl = (args != null && args.length > 0) ? args[0] : DEFAULT_URL;
            String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT;

            // try-with-resources closes both streams even when reading or writing fails;
            // previously they were closed on the success path only and leaked on error.
            // NOTE(review): reader and writer still use the platform default charset, so
            // pages in another encoding may be garbled - confirm the target page encoding.
            try (BufferedReader in = new BufferedReader(
                            new InputStreamReader(new URL(pageUrl).openStream()));
                    BufferedWriter out = new BufferedWriter(new FileWriter(outputPath))) {
                StringBuilder page = new StringBuilder();
                String line;
                while ((line = in.readLine()) != null) {
                    page.append(line).append(CRLF);
                }
                System.out.println(page);
                out.write(page.toString());
            } catch (MalformedURLException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }


    4. 利用htmlparser提取网页纯文本的例子



    package parser;



    import org.htmlparser.Node;

    import org.htmlparser.NodeFilter;

    import org.htmlparser.Parser;

    import org.htmlparser.filters.TagNameFilter;

    import org.htmlparser.tags.TableTag;

    import org.htmlparser.util.NodeList;



    /**

    * 标题:利用htmlparser提取网页纯文本的例子

    */

    public class TestHTMLParser2 {

    /**

    * 读取目标html内容

    *

    */

    public static void testHtml() {

    try {

    String sCurrentLine;

    String sTotalString;

    sCurrentLine = "";

    sTotalString = "";

    java.io.InputStream l_urlStream;

    java.net.URL l_url = new java.net.URL(

    "http://10.249.187.199:8083/injs100/");

    java.net.HttpURLConnection l_connection = (java.net.HttpURLConnection) l_url

    .openConnection();

    l_connection.connect();

    l_urlStream = l_connection.getInputStream();

    java.io.BufferedReader l_reader = new java.io.BufferedReader(

    new java.io.InputStreamReader(l_urlStream));

    while ((sCurrentLine = l_reader.readLine()) != null) {

    sTotalString += sCurrentLine + "\r\n";

    }



    String testText = extractText(sTotalString);

    } catch (Exception e) {

    e.printStackTrace();

    }



    }

    /**

    * 抽取纯文本信息

    * @param inputHtml:html文本

    * @return

    * @throws Exception

    */

    public static String extractText(String inputHtml) throws Exception {

    StringBuffer text = new StringBuffer();

    Parser parser = Parser.createParser(new String(inputHtml.getBytes(),

    "GBK"), "GBK");

    // 遍历所有的节点

    NodeList nodes = parser.extractAllNodesThatMatch(new NodeFilter() {

    public boolean accept(Node node) {

    return true;

    }

    });



    System.out.println(nodes.size());

    for (int i = 0; i < nodes.size(); i++) {

    Node nodet = nodes.elementAt(i);

    //字符串的代表性节点:节点的描述

    text.append(new String(nodet.toPlainTextString().getBytes("GBK"))

    + "\r\n");

    }

    return text.toString();

    }

    /**

    * 读取文件的方式/utl 来分析内容. filePath也可以是一个Url.

    * @param resource :文件/Url

    * @throws Exception

    */

    public static void test5(String resource) throws Exception {

    Parser myParser = new Parser(resource);

    myParser.setEncoding("GBK");

    String filterStr = "table";

    NodeFilter filter = new TagNameFilter(filterStr);

    NodeList nodeList = myParser.extractAllNodesThatMatch(filter);

    /*for(int i=0;i<nodeList.size();i++)

    {

    TableTag tabletag = (TableTag) nodeList.elementAt(i);

    //标签名称

    System.out.println(tabletag.getTagName());

    System.out.println(tabletag.getText());

    }*/

    TableTag tabletag = (TableTag) nodeList.elementAt(1);









    }



    public static void main(String[] args) throws Exception {

    test5("http://10.249.187.199:8083/injs100/");

    //testHtml();

    }

    }


    5.html解析table



    package parser;



    import org.apache.log4j.Logger;

    import org.htmlparser.NodeFilter;

    import org.htmlparser.Parser;

    import org.htmlparser.filters.NodeClassFilter;

    import org.htmlparser.filters.OrFilter;

    import org.htmlparser.filters.TagNameFilter;

    import org.htmlparser.tags.TableColumn;

    import org.htmlparser.tags.TableRow;

    import org.htmlparser.tags.TableTag;

    import org.htmlparser.util.NodeList;

    import org.htmlparser.util.ParserException;



    import junit.framework.TestCase;



    /**
     * JUnit 3 test case demonstrating how to walk HTML tables (table, row, cell) with
     * HtmlParser, first on an in-memory snippet and then on a live page.
     */
    public class ParserTestCase extends TestCase {

        /** Logger for this class; the examples currently print to standard output instead. */
        private static final Logger logger = Logger.getLogger(ParserTestCase.class);

        /**
         * Index of the first table inspected by {@link #getDatabyUrl(String)}. Per the
         * original author's note the data tables of the target page sit around positions
         * 19-21, so earlier tables are skipped.
         */
        private static final int FIRST_DATA_TABLE_INDEX = 15;

        /**
         * @param name name of the test, passed to {@link TestCase}
         */
        public ParserTestCase(String name) {
            super(name);
        }

        /**
         * Parses two small tables, prints the id of every row and, for the row whose id is
         * {@code tro1}, the plain text of each of its cells.
         */
        public void testTable() {
            // The table ids used typographic quotes (an artifact of the blog rendering);
            // they are plain single quotes now so the attribute is well-formed HTML.
            Parser myParser = Parser.createParser(
                    "<body> "
                    + "<table id='table1' >"
                    + "<tr id='tro1'><td>1-11</td><td>1-12</td><td>1-13</td></tr>"
                    + "<tr id='tro2'><td>1-21</td><td>1-22</td><td>1-23</td></tr>"
                    + "<tr id='tro3'><td>1-31</td><td>1-32</td><td>1-33</td></tr></table>"
                    + "<table id='table2' >"
                    + "<tr id='tro4'><td>2-11</td><td>2-12</td><td>2-13</td></tr>"
                    + "<tr id='tro5'><td>2-21</td><td>2-22</td><td>2-23</td></tr>"
                    + "<tr id='tro6'><td>2-31</td><td>2-32</td><td>2-33</td></tr></table>"
                    + "</body>", "GBK");
            NodeFilter tableFilter = new NodeClassFilter(TableTag.class);
            OrFilter lastFilter = new OrFilter();
            lastFilter.setPredicates(new NodeFilter[] { tableFilter });
            try {
                NodeList nodeList = myParser.parse(lastFilter);
                // "<" rather than "<=": index size() is one past the last element.
                for (int i = 0; i < nodeList.size(); i++) {
                    if (!(nodeList.elementAt(i) instanceof TableTag)) {
                        continue;
                    }
                    TableTag tag = (TableTag) nodeList.elementAt(i);
                    TableRow[] rows = tag.getRows();
                    for (int j = 0; j < rows.length; j++) {
                        TableRow tr = rows[j];
                        String rowId = tr.getAttribute("id");
                        System.out.println(rowId);
                        // Constant-first comparison is safe for rows without an id.
                        if ("tro1".equalsIgnoreCase(rowId)) {
                            TableColumn[] td = tr.getColumns();
                            for (int k = 0; k < td.length; k++) {
                                System.out.println("<td>" + td[k].toPlainTextString());
                            }
                        }
                    }
                }
            } catch (ParserException e) {
                e.printStackTrace();
            }
        }

        /**
         * Fetches the page at {@code url}, parses it as gb2312 and prints the cells of
         * every row whose id is {@code tr02}, looking only at tables from
         * {@link #FIRST_DATA_TABLE_INDEX} onwards. A matching row with a single cell is
         * reported as "no record found".
         *
         * @param url the target URL
         * @throws Exception if the page cannot be opened
         */
        public static void getDatabyUrl(String url) throws Exception {
            Parser myParser = new Parser(url);
            myParser.setEncoding("gb2312");
            NodeFilter tableFilter = new NodeClassFilter(TableTag.class);
            OrFilter lastFilter = new OrFilter();
            lastFilter.setPredicates(new NodeFilter[] { tableFilter });
            try {
                NodeList nodeList = myParser.parse(lastFilter);
                // "<" rather than "<=": index size() is one past the last element.
                for (int i = FIRST_DATA_TABLE_INDEX; i < nodeList.size(); i++) {
                    if (!(nodeList.elementAt(i) instanceof TableTag)) {
                        continue;
                    }
                    TableTag tag = (TableTag) nodeList.elementAt(i);
                    TableRow[] rows = tag.getRows();
                    for (int j = 0; j < rows.length; j++) {
                        TableRow tr = rows[j];
                        if (!"tr02".equalsIgnoreCase(tr.getAttribute("id"))) {
                            continue;
                        }
                        TableColumn[] td = tr.getColumns();
                        if (td.length == 1) {
                            // A single cell carries the site's "no record" message.
                            System.out.println("对不起,没有你要查询的记录");
                        } else {
                            for (int k = 0; k < td.length; k++) {
                                System.out.println("<td>内容:"
                                        + td[k].toPlainTextString().trim());
                            }
                        }
                    }
                }
            } catch (ParserException e) {
                e.printStackTrace();
            }
        }

        /**
         * Runs {@link #getDatabyUrl(String)} against a sample query. Per the original
         * author's note the page contains 22 tables when data is found and 19 otherwise.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            try {
                // Alternative sample:
                // getDatabyUrl("http://gd.12530.com/user/querytonebytype.do?field=tonecode&condition=619505000000008942&type=1006&pkValue=619505000000008942");
                getDatabyUrl("http://gd.12530.com/user/querytonebytype.do?field=tonecode&condition=619272000000001712&type=1006&pkValue=619272000000001712");
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }


    6.html解析常用





    package com.jscud.test;

    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.InputStreamReader;

    import org.htmlparser.Node;
    import org.htmlparser.NodeFilter;
    import org.htmlparser.Parser;
    import org.htmlparser.filters.NodeClassFilter;
    import org.htmlparser.filters.OrFilter;
    import org.htmlparser.nodes.TextNode;
    import org.htmlparser.tags.LinkTag;
    import org.htmlparser.util.NodeList;
    import org.htmlparser.util.ParserException;
    import org.htmlparser.visitors.HtmlPage;
    import org.htmlparser.visitors.TextExtractingVisitor;

    import com.jscud.util.LogMan; //一个日志记录类

    /**
     * Demonstrates several ways of using HtmlParser on the same document: text nodes only,
     * text plus links through filters, a text-extracting visitor, and the page visitor.
     *
     * @author scud http://www.jscud.com
     */

    public class ParseHtmlTest
    {

        /**
         * Reads a local HTML file as GBK and runs every demo on it, then runs the page
         * demo once more against a remote site.
         *
         * @param args unused
         * @throws Exception if any demo fails
         */
        public static void main(String[] args) throws Exception
        {
            String aFile = "e:/jscud/temp/test.htm";

            String content = readTextFile(aFile, "GBK");

            test1(content);
            System.out.println("====================================");

            test2(content);
            System.out.println("====================================");

            test3(content);
            System.out.println("====================================");

            test4(content);
            System.out.println("====================================");

            test5(aFile);
            System.out.println("====================================");

            // Remote resource, comparatively slow. The literal used to be
            // "http://www.jscud.com (http://www.jscud.com/)", an auto-link artifact that
            // is not a valid URL or file name.
            test5("http://www.jscud.com/");
            System.out.println("====================================");

        }

        /**
         * Parses a file or URL as GBK through the page visitor and prints the page title.
         *
         * @param resource path of a file, or a URL
         * @throws Exception if the resource cannot be opened or parsed
         */
        public static void test5(String resource) throws Exception
        {
            Parser myParser = new Parser(resource);

            // Set the encoding before visiting.
            myParser.setEncoding("GBK");

            HtmlPage visitor = new HtmlPage(myParser);
            myParser.visitAllNodesWith(visitor);

            System.out.println(visitor.getTitle());
        }

        /**
         * Page-oriented processing of in-memory HTML; prints the page title. The original
         * author recommends this approach for well-formed HTML pages.
         *
         * @param content the HTML text
         * @throws Exception if parsing fails
         */
        public static void test4(String content) throws Exception
        {
            Parser myParser = Parser.createParser(content, "GBK");

            HtmlPage visitor = new HtmlPage(myParser);
            myParser.visitAllNodesWith(visitor);

            System.out.println(visitor.getTitle());
        }

        /**
         * Extracts the page text with a {@link TextExtractingVisitor} and prints it.
         *
         * <p>Per the original author's notes: entities such as &lt; and &gt; are
         * translated, but the output contains many spaces and links cannot be extracted.
         *
         * @param content the HTML text
         * @throws Exception if parsing fails
         */
        public static void test3(String content) throws Exception
        {
            Parser myParser = Parser.createParser(content, "GBK");

            TextExtractingVisitor visitor = new TextExtractingVisitor();
            myParser.visitAllNodesWith(visitor);

            System.out.println(visitor.getExtractedText());
        }

        /**
         * Prints the text of every text node and the target of every link, selected with
         * an OR of two class filters. Lines that are empty after trimming are skipped.
         *
         * @param content the HTML text
         * @throws ParserException if parsing fails
         */
        public static void test2(String content) throws ParserException
        {
            Parser myParser = Parser.createParser(content, "GBK");

            NodeFilter textFilter = new NodeClassFilter(TextNode.class);
            NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);

            // Meta tags are not handled for now.
            OrFilter lastFilter = new OrFilter();
            lastFilter.setPredicates(new NodeFilter[] { textFilter, linkFilter });

            Node[] nodes = myParser.parse(lastFilter).toNodeArray();

            for (int i = 0; i < nodes.length; i++)
            {
                Node anode = nodes[i];

                String line = "";
                if (anode instanceof TextNode)
                {
                    line = ((TextNode) anode).getText();
                }
                else if (anode instanceof LinkTag)
                {
                    // TODO: JSP tags such as <% ... %> inside links are not filtered out.
                    line = ((LinkTag) anode).getLink();
                }

                if (isTrimEmpty(line))
                {
                    continue;
                }

                System.out.println(line);
            }
        }

        /**
         * Prints the trimmed plain text of every non-empty text node.
         *
         * @param content the HTML text
         * @throws ParserException if parsing fails
         */
        public static void test1(String content) throws ParserException
        {
            Parser myParser = Parser.createParser(content, null);

            // May throw ParserException.
            Node[] nodes = myParser.extractAllNodesThatAre(TextNode.class);

            for (int i = 0; i < nodes.length; i++)
            {
                String line = ((TextNode) nodes[i]).toPlainTextString().trim();
                if (line.equals(""))
                {
                    continue;
                }
                System.out.println(line);
            }

        }

        /**
         * Reads a whole text file into a string, terminating every line with
         * {@code "\r\n"}. Failures are logged through {@code LogMan} and whatever was read
         * up to that point (possibly the empty string) is returned.
         *
         * @param sFileName name of the file
         * @param sEncode   name of the charset used to decode the file
         * @return the file content
         */
        public static String readTextFile(String sFileName, String sEncode)
        {
            StringBuilder sbStr = new StringBuilder();

            // Both resources are declared separately so the file stream is closed even
            // when the charset name is rejected or reading fails; the reader used to be
            // closed on the success path only.
            try (FileInputStream fileIn = new FileInputStream(new File(sFileName));
                    BufferedReader ins = new BufferedReader(
                            new InputStreamReader(fileIn, sEncode)))
            {
                String dataLine;
                while (null != (dataLine = ins.readLine()))
                {
                    sbStr.append(dataLine);
                    sbStr.append("\r\n");
                }
            }
            catch (Exception e)
            {
                LogMan.error("read Text File Error", e);
            }

            return sbStr.toString();
        }

        /**
         * Tells whether a string is null, empty, or empty once trimmed.
         *
         * @param astr the string to test
         * @return {@code true} if nothing remains after trimming
         */
        public static boolean isTrimEmpty(String astr)
        {
            return (null == astr) || isBlank(astr.trim());
        }

        /**
         * Tells whether a string is null or has length 0.
         *
         * @param astr the string to test
         * @return {@code true} for null and for the empty string
         */
        public static boolean isBlank(String astr)
        {
            return (null == astr) || (astr.length() == 0);
        }

    }

  • 相关阅读:
    Fedora 19+ 启动顺序调整
    朗科U903 低级格式化后,量产错误:read onlypage (控制器芯片群联2251-03)的解决方案
    python中快速删除实例对象中的所有属性
    python中如何用sys.excepthook来对全局异常进行捕获、显示及输出到error日志中
    通过ctypes获得python windows process的内存使用情况
    Python的XMLRPC机制:实现跨进程间、client/server端通信
    PIL Image 转成 wx.Image、wx.Bitmap
    wxpython线程安全的方法
    11
    列表去重的几种方法
  • 原文地址:https://www.cnblogs.com/alamps/p/2582288.html
Copyright © 2011-2022 走看看