zoukankan      html  css  js  c++  java
  • htmlparser使用例子(全) 转载

    import java.net.URL;

    import junit.framework.TestCase;

    import org.apache.log4j.Logger;
    import org.htmlparser.Node;
    import org.htmlparser.NodeFilter;
    import org.htmlparser.Parser;
    import org.htmlparser.Tag;
    import org.htmlparser.beans.LinkBean;
    import org.htmlparser.filters.NodeClassFilter;
    import org.htmlparser.filters.OrFilter;
    import org.htmlparser.filters.TagNameFilter;
    import org.htmlparser.tags.HeadTag;
    import org.htmlparser.tags.ImageTag;
    import org.htmlparser.tags.InputTag;
    import org.htmlparser.tags.LinkTag;
    import org.htmlparser.tags.OptionTag;
    import org.htmlparser.tags.SelectTag;
    import org.htmlparser.tags.TableColumn;
    import org.htmlparser.tags.TableRow;
    import org.htmlparser.tags.TableTag;
    import org.htmlparser.tags.TitleTag;
    import org.htmlparser.util.NodeIterator;
    import org.htmlparser.util.NodeList;
    import org.htmlparser.util.ParserException;
    import org.htmlparser.visitors.HtmlPage;
    import org.htmlparser.visitors.NodeVisitor;
    import org.htmlparser.visitors.ObjectFindingVisitor;

    31.public class T extends TestCase { 
    32. 
    33.  private static final Logger logger = Logger.getLogger(T.class); 
    34. 
    35.  public T(String name) { 
    36.    super(name); 
    37.  } 
    38. 
    39.  /*
    40.   * 测试ObjectFindVisitor的用法
    41.   */ 
    42.  public void testImageVisitor() { 
    43.    try
    44.      ImageTag imgLink; 
    45.      ObjectFindingVisitor visitor = new ObjectFindingVisitor(ImageTag.class); 
    46.      Parser parser = new Parser(); 
    47.      parser.setURL("http://www.google.com"); 
    48.      parser.setEncoding(parser.getEncoding()); 
    49.      parser.visitAllNodesWith(visitor); 
    50.      Node[] nodes = visitor.getTags(); 
    51.      for (int i = 0; i < nodes.length; i++) { 
    52.        imgLink = (ImageTag) nodes[i]; 
    53.        logger.fatal("testImageVisitor() ImageURL = " + imgLink.getImageURL()); 
    54.        logger.fatal("testImageVisitor() ImageLocation = " + imgLink.extractImageLocn()); 
    55.        logger.fatal("testImageVisitor() SRC = " + imgLink.getAttribute("SRC")); 
    56.      } 
    57.    } catch (Exception e) { 
    58.      e.printStackTrace(); 
    59.    } 
    60.  } 
    61. 
    62.  /*
    63.   * 测试TagNameFilter用法
    64.   */ 
    65.  public void testNodeFilter() { 
    66.    try
    67.      NodeFilter filter = new TagNameFilter("IMG"); 
    68.      Parser parser = new Parser(); 
    69.      parser.setURL("http://www.google.com"); 
    70.      parser.setEncoding(parser.getEncoding()); 
    71.      NodeList list = parser.extractAllNodesThatMatch(filter); 
    72.      for (int i = 0; i < list.size(); i++) { 
    73.        logger.fatal("testNodeFilter() " + list.elementAt(i).toHtml()); 
    74.      } 
    75.    } catch (Exception e) { 
    76.      e.printStackTrace(); 
    77.    } 
    78. 
    79.  } 
    80. 
    81.  /*
    82.   * 测试NodeClassFilter用法
    83.   */ 
    84.  public void testLinkTag() { 
    85.    try
    86. 
    87.      NodeFilter filter = new NodeClassFilter(LinkTag.class); 
    88.      Parser parser = new Parser(); 
    89.      parser.setURL("http://www.google.com"); 
    90.      parser.setEncoding(parser.getEncoding()); 
    91.      NodeList list = parser.extractAllNodesThatMatch(filter); 
    92.      for (int i = 0; i < list.size(); i++) { 
    93.        LinkTag node = (LinkTag) list.elementAt(i); 
    94.        logger.fatal("testLinkTag() Link is :" + node.extractLink()); 
    95.      } 
    96.    } catch (Exception e) { 
    97.      e.printStackTrace(); 
    98.    } 
    99. 
    100.  } 
    101. 
    102.  /*
    103.   * 测试<link href=" text=’text/css’ rel=’stylesheet’ />用法
    104.   */ 
    105.  public void testLinkCSS() { 
    106.    try
    107. 
    108.      Parser parser = new Parser(); 
    109.      parser.setInputHTML("<head><title>Link Test</title>" 
    110.          + "<link href=’/test01/css.css' text='text/css' rel='stylesheet' />" 
    111.          + "<link href='/test02/css.css' text='text/css' rel='stylesheet' />" + "</head>" 
    112.          + "<body>"); 
    113.      parser.setEncoding(parser.getEncoding()); 
    114. 
    115.      for (NodeIterator e = parser.elements(); e.hasMoreNodes();) { 
    116.        Node node = e.nextNode(); 
    117.        logger.fatal("testLinkCSS()" + node.getText() + node.getClass()); 
    118. 
    119.      } 
    120.    } catch (Exception e) { 
    121.      e.printStackTrace(); 
    122.    } 
    123.  } 
    124. 
    125.  /*
    126.   * 测试OrFilter的用法
    127.   */ 
    128.  public void testOrFilter() { 
    129.    NodeFilter inputFilter = new NodeClassFilter(InputTag.class); 
    130.    NodeFilter selectFilter = new NodeClassFilter(SelectTag.class); 
    131. 
    132.    NodeList nodeList = null; 
    133. 
    134.    try
    135.      Parser parser = new Parser(); 
    136.      parser 
    137.          .setInputHTML("<head><title>OrFilter Test</title>" 
    138.              + "<link href='/test01/css.css' text='text/css' rel='stylesheet' />" 
    139.              + "<link href='/test02/css.css' text='text/css' rel='stylesheet' />" 
    140.              + "</head>" 
    141.              + "<body>" 
    142.              + "<input type='text' value='text1′ name='text1′/>" 
    143.              + "<input type='text' value='text2′ name='text2′/>" 
    144.              + "<select><option id='1′>1</option><option id='2′>2</option><option id='3′></option></select>" 
    145.              + "<a href='http://www.yeeach.com'>yeeach.com</a>" + "</body>"); 
    146. 
    147.      parser.setEncoding(parser.getEncoding()); 
    148.      OrFilter lastFilter = new OrFilter(); 
    149.      lastFilter.setPredicates(new NodeFilter[] { selectFilter, inputFilter }); 
    150.      nodeList = parser.parse(lastFilter); 
    151.      for (int i = 0; i <= nodeList.size(); i++) { 
    152.        if (nodeList.elementAt(i) instanceof InputTag) { 
    153.          InputTag tag = (InputTag) nodeList.elementAt(i); 
    154.          logger.fatal("OrFilter tag name is :" + tag.getTagName() + " ,tag value is:" 
    155.              + tag.getAttribute("value")); 
    156.        } 
    157.        if (nodeList.elementAt(i) instanceof SelectTag) { 
    158.          SelectTag tag = (SelectTag) nodeList.elementAt(i); 
    159.          NodeList list = tag.getChildren(); 
    160. 
    161.          for (int j = 0; j < list.size(); j++) { 
    162.            OptionTag option = (OptionTag) list.elementAt(j); 
    163.            logger.fatal("OrFilter Option" + option.getOptionText()); 
    164.          } 
    165. 
    166.        } 
    167.      } 
    168. 
    169.    } catch (ParserException e) { 
    170.      e.printStackTrace(); 
    171.    } 
    172.  } 
    173. 
    174.  /*
    175.   * 测试对<table><tr><td></td></tr></table>的解析
    176.   */ 
    177.  public void testTable() { 
    178.    Parser myParser; 
    179.    NodeList nodeList = null; 
    180.    myParser = Parser.createParser("<body> " + "<table id='table1′ >" 
    181.        + "<tr><td>1-11</td><td>1-12</td><td>1-13</td>" 
    182.        + "<tr><td>1-21</td><td>1-22</td><td>1-23</td>" 
    183.        + "<tr><td>1-31</td><td>1-32</td><td>1-33</td></table>" + "<table id='table2′ >" 
    184.        + "<tr><td>2-11</td><td>2-12</td><td>2-13</td>" 
    185.        + "<tr><td>2-21</td><td>2-22</td><td>2-23</td>" 
    186.        + "<tr><td>2-31</td><td>2-32</td><td>2-33</td></table>" + "</body>", "GBK"); 
    187.    NodeFilter tableFilter = new NodeClassFilter(TableTag.class); 
    188.    OrFilter lastFilter = new OrFilter(); 
    189.    lastFilter.setPredicates(new NodeFilter[] { tableFilter }); 
    190.    try
    191.      nodeList = myParser.parse(lastFilter); 
    192.      for (int i = 0; i <= nodeList.size(); i++) { 
    193.        if (nodeList.elementAt(i) instanceof TableTag) { 
    194.          TableTag tag = (TableTag) nodeList.elementAt(i); 
    195.          TableRow[] rows = tag.getRows(); 
    196. 
    197.          for (int j = 0; j < rows.length; j++) { 
    198.            TableRow tr = (TableRow) rows[j]; 
    199.            TableColumn[] td = tr.getColumns(); 
    200.            for (int k = 0; k < td.length; k++) { 
    201.              logger.fatal("<td>" + td[k].toPlainTextString()); 
    202.            } 
    203. 
    204.          } 
    205. 
    206.        } 
    207.      } 
    208. 
    209.    } catch (ParserException e) { 
    210.      e.printStackTrace(); 
    211.    } 
    212.  } 
    213. 
    214.  /*
    215.   * 测试NodeVisitor的用法,遍历所有节点
    216.   */ 
    217.  public void testVisitorAll() { 
    218.    try
    219.      Parser parser = new Parser(); 
    220.      parser.setURL("http://www.google.com"); 
    221.      parser.setEncoding(parser.getEncoding()); 
    222.      NodeVisitor visitor = new NodeVisitor() { 
    223.        public void visitTag(Tag tag) { 
    224.          logger.fatal("testVisitorAll()  Tag name is :" + tag.getTagName() + " Class is :" 
    225.              + tag.getClass()); 
    226.        } 
    227. 
    228.      }; 
    229. 
    230.      parser.visitAllNodesWith(visitor); 
    231.    } catch (ParserException e) { 
    232.      e.printStackTrace(); 
    233.    } 
    234.  } 
    235. 
    236.  /*
    237.   * 测试对指定Tag的NodeVisitor的用法
    238.   */ 
    239.  public void testTagVisitor() { 
    240.    try
    241. 
    242.      Parser parser = new Parser("<head><title>dddd</title>" 
    243.          + "<link href='/test01/css.css' text='text/css' rel='stylesheet' />" 
    244.          + "<link href='/test02/css.css' text='text/css' rel='stylesheet' />" + "</head>" 
    245.          + "<body>" + "<a href='http://www.yeeach.com'>yeeach.com</a>" + "</body>"); 
    246.      NodeVisitor visitor = new NodeVisitor() { 
    247.        public void visitTag(Tag tag) { 
    248.          if (tag instanceof HeadTag) { 
    249.            logger.fatal("visitTag() HeadTag : Tag name is :" + tag.getTagName() 
    250.                + " Class is :" + tag.getClass() + " Text is :" + tag.getText()); 
    251.          } else if (tag instanceof TitleTag) { 
    252.            logger.fatal("visitTag() TitleTag : Tag name is :" + tag.getTagName() 
    253.                + " Class is :" + tag.getClass() + " Text is :" + tag.getText()); 
    254. 
    255.          } else if (tag instanceof LinkTag) { 
    256.            logger.fatal("visitTag() LinkTag : Tag name is :" + tag.getTagName() 
    257.                + " Class is :" + tag.getClass() + " Text is :" + tag.getText() 
    258.                + " getAttribute is :" + tag.getAttribute("href")); 
    259.          } else
    260.            logger.fatal("visitTag() : Tag name is :" + tag.getTagName() + " Class is :" 
    261.                + tag.getClass() + " Text is :" + tag.getText()); 
    262.          } 
    263. 
    264.        } 
    265. 
    266.      }; 
    267. 
    268.      parser.visitAllNodesWith(visitor); 
    269.    } catch (Exception e) { 
    270.      e.printStackTrace(); 
    271.    } 
    272.  } 
    273. 
    274.  /*
    275.   * 测试HtmlPage的用法
    276.   */ 
    277.  public void testHtmlPage() { 
    278.    String inputHTML = "<html>" + "<head>" 
    279.        + "<title>Welcome to the HTMLParser website</title>" + "</head>" + "<body>" 
    280.        + "Welcome to HTMLParser" + "<table id='table1′ >" 
    281.        + "<tr><td>1-11</td><td>1-12</td><td>1-13</td>" 
    282.        + "<tr><td>1-21</td><td>1-22</td><td>1-23</td>" 
    283.        + "<tr><td>1-31</td><td>1-32</td><td>1-33</td></table>" + "<table id='table2′ >" 
    284.        + "<tr><td>2-11</td><td>2-12</td><td>2-13</td>" 
    285.        + "<tr><td>2-21</td><td>2-22</td><td>2-23</td>" 
    286.        + "<tr><td>2-31</td><td>2-32</td><td>2-33</td></table>" + "</body>" + "</html>"
    287.    Parser parser = new Parser(); 
    288.    try
    289. 
    290.      parser.setInputHTML(inputHTML); 
    291.      parser.setEncoding(parser.getURL()); 
    292.      HtmlPage page = new HtmlPage(parser); 
    293.      parser.visitAllNodesWith(page); 
    294.      logger.fatal("testHtmlPage -title is :" + page.getTitle()); 
    295.      NodeList list = page.getBody(); 
    296. 
    297.      for (NodeIterator iterator = list.elements(); iterator.hasMoreNodes();) { 
    298.        Node node = iterator.nextNode(); 
    299.        logger.fatal("testHtmlPage -node  is :" + node.toHtml()); 
    300.      } 
    301. 
    302.    } catch (ParserException e) { 
    303.      // TODO Auto-generated catch block 
    304.      e.printStackTrace(); 
    305.    } 
    306.  } 
    307. 
    308.  /*
    309.   * 测试LinkBean的用法
    310.   */ 
    311.  public void testLinkBean() { 
    312.    Parser parser = new Parser(); 
    313. 
    314.    LinkBean linkBean = new LinkBean(); 
    315.    linkBean.setURL("http://www.google.com"); 
    316.    URL[] urls = linkBean.getLinks(); 
    317. 
    318.    for (int i = 0; i < urls.length; i++) { 
    319.      URL url = urls[i]; 
    320.      logger.fatal("testLinkBean() -url  is :" + url); 
    321.    } 
    322. 
    323.  } 
    324. 
    325.}
  • 相关阅读:
    字段与表的对应关系
    java初学代码,还不太熟练
    编程学习心得
    ps中经常遇到的问题
    R语言矩阵运算加速
    写代码过程中一些数字推理公式
    EXCEL中常用的函数
    css样式中常见的属性
    R语言的基本矩阵运算
    excel常用的函数
  • 原文地址:https://www.cnblogs.com/Alex80/p/4973897.html
Copyright © 2011-2022 走看看