这几天在学习用Java解析XML,突然想到DOM能不能用来解析HTML,结果试了半天也行不通。于是查了一些资料,发现很多人都在用Jsoup解析HTML文件,研究了一下之后写了一个简单的实例。感觉还有很多地方需要完善,先在这里分享出来,欢迎交流指教!
后续想通过Java把数据导入到Excel或者生成一个报表!
package gethtml;

import java.io.IOException;
import java.net.URLEncoder;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Fetches job listings from the Zhaopin (智联招聘) search page and prints them to standard output.
 *
 * <p>Each result page is downloaded with jsoup, the element with id
 * {@code newlist_list_content_table} is located, and the text of the elements carrying the CSS
 * classes {@code zwmc}, {@code gsmc}, {@code zwyx}, {@code gzdd} and {@code gxsj} is printed, one
 * listing per line.
 *
 * @author syskey
 */
public class JsoupHtml {

    /** Search endpoint; the city value is appended directly after {@code jl=}. */
    private static final String SEARCH_URL = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=";

    /** Number of result pages to walk through. */
    private static final int PAGE_COUNT = 10;

    /** Separator printed between the columns of one listing. */
    private static final String SEPARATOR = "*****";

    /** City used when the caller passes {@code null}. */
    private static final String DEFAULT_CITY = "西安";

    /** Keywords used when the caller passes {@code null}. */
    private static final String DEFAULT_KEYWORDS = "java";

    private final String city;
    private final String keywords;

    /**
     * Creates a scraper for one search.
     *
     * @param city     city to search in; {@code null} falls back to the default city
     * @param keywords search keywords; {@code null} falls back to the default keywords
     */
    public JsoupHtml(String city, String keywords) {
        this.city = (city != null) ? city : DEFAULT_CITY;
        this.keywords = (keywords != null) ? keywords : DEFAULT_KEYWORDS;
    }

    /**
     * Downloads the first {@value #PAGE_COUNT} result pages and prints every listing found.
     *
     * <p>An {@link IOException} raised while fetching a page stops the walk; its stack trace is
     * printed and the method returns normally.
     */
    public void getZhiLianWork() {
        try {
            // Encode the query values so characters such as '+', '#' or '&' in the keywords
            // cannot corrupt the query string.
            String cityParam = URLEncoder.encode(city, "UTF-8");
            String keywordParam = URLEncoder.encode(keywords, "UTF-8");

            for (int page = 1; page <= PAGE_COUNT; page++) {
                System.out.println("*********开始遍历第" + page + "页的求职信息*********");

                String pageUrl = SEARCH_URL + cityParam + "&kw=" + keywordParam
                        + "&p=" + page + "&isadv=0";
                Document doc = Jsoup.connect(pageUrl).get();

                // getElementById returns null when the page has no such element.
                Element content = doc.getElementById("newlist_list_content_table");
                if (content == null) {
                    System.err.println("第" + page + "页未找到职位列表(newlist_list_content_table),已跳过");
                } else {
                    printListings(content);
                }

                System.out.println("*********结束遍历第" + page + "页的求职信息*********");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Prints one line per listing found under {@code content}, followed by a blank line. */
    private static void printListings(Element content) {
        // NOTE(review): the class names look like pinyin initials for job title, company,
        // salary, location and publish date — confirm against the page markup.
        Elements titles = content.getElementsByClass("zwmc");
        Elements companies = content.getElementsByClass("gsmc");
        Elements salaries = content.getElementsByClass("zwyx");
        Elements locations = content.getElementsByClass("gzdd");
        Elements dates = content.getElementsByClass("gxsj");

        // The five lists are read in lockstep; stop at the shortest one so that a row with a
        // missing cell cannot cause an IndexOutOfBoundsException.
        int rows = Math.min(
                Math.min(titles.size(), companies.size()),
                Math.min(Math.min(salaries.size(), locations.size()), dates.size()));

        for (int row = 0; row < rows; row++) {
            // text() is called directly: Element.tagName(String) renames the element rather
            // than selecting a child, so the original tagName("a") call only mutated the tree.
            System.out.println(
                    titles.get(row).text() + SEPARATOR
                    + companies.get(row).text() + SEPARATOR
                    + salaries.get(row).text() + SEPARATOR
                    + locations.get(row).text() + SEPARATOR
                    + dates.get(row).text());
            System.out.println();
        }
    }

    public static void main(String[] args) {
        JsoupHtml jHtml = new JsoupHtml("上海", "java");
        jHtml.getZhiLianWork();
    }
}
更新源代码,支持生成html表格:
package jsouphtml;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.URLEncoder;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Fetches job listings from the Zhaopin (智联招聘) search page and renders them as an HTML table.
 *
 * <p>The template {@code input.html} is parsed, the element with id {@code workinfo} is emptied
 * and refilled with a header row plus one row per listing, and the resulting document is printed
 * to standard output and written to {@code output.html}.
 */
public class JsoupHtml {

    /** Search endpoint; the city value is appended directly after {@code jl=}. */
    private static final String SEARCH_URL = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=";

    /** Number of result pages to fetch. */
    private static final int PAGE_COUNT = 10;

    /** Column captions of the generated table, in output order. */
    private static final String[] HEADERS = {"序号", "职位名称", "公司名称", "职位月薪", "工作地点", "发布日期"};

    public static void main(String[] args) {
        String city = "西安";
        String keywords = "java";

        try {
            Document report = Jsoup.parse(new File("input.html"), "UTF-8", "");

            // getElementById returns null when the template lacks the expected element.
            Element table = report.getElementById("workinfo");
            if (table == null) {
                System.err.println("input.html 中未找到 id 为 workinfo 的元素");
                return;
            }
            table.text("");
            appendHeaderRow(table);

            // Encode the query values so characters such as '+', '#' or '&' in the keywords
            // cannot corrupt the query string.
            String cityParam = URLEncoder.encode(city, "UTF-8");
            String keywordParam = URLEncoder.encode(keywords, "UTF-8");

            // Pages are requested as p=1..PAGE_COUNT so the requested page matches the page
            // number shown in the first column of the table.
            for (int page = 1; page <= PAGE_COUNT; page++) {
                String pageUrl = SEARCH_URL + cityParam + "&kw=" + keywordParam + "&p=" + page;
                Document doc = Jsoup.connect(pageUrl).get();
                Element content = doc.getElementById("newlist_list_content_table");
                if (content == null) {
                    System.err.println("第" + page + "页未找到职位列表(newlist_list_content_table),已跳过");
                    continue;
                }
                appendListings(table, content, page);
            }

            String html = report.html();
            System.out.println(html);

            // The output file is opened only once the report is complete, so a failed fetch no
            // longer leaves a truncated output.html; try-with-resources guarantees the close.
            try (BufferedWriter writer = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream("output.html"), "utf-8"))) {
                writer.write(html);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Appends the caption row ({@code <tr>} of {@code <th>} cells) to {@code table}. */
    private static void appendHeaderRow(Element table) {
        Element header = table.appendElement("tr");
        for (String caption : HEADERS) {
            header.appendElement("th").text(caption);
        }
    }

    /**
     * Appends one table row per listing found under {@code content}.
     *
     * @param table   element receiving the rows
     * @param content results container of one fetched page
     * @param page    1-based page number, used for the "page-row" label in the first column
     */
    private static void appendListings(Element table, Element content, int page) {
        Elements titles = content.getElementsByClass("zwmc");
        Elements companies = content.getElementsByClass("gsmc");
        Elements salaries = content.getElementsByClass("zwyx");
        Elements locations = content.getElementsByClass("gzdd");
        Elements dates = content.getElementsByClass("gxsj");

        // The five lists are read in lockstep; stop at the shortest one so that a row with a
        // missing cell cannot cause an IndexOutOfBoundsException.
        int rows = Math.min(
                Math.min(titles.size(), companies.size()),
                Math.min(Math.min(salaries.size(), locations.size()), dates.size()));

        // Index 0 is skipped, as in the original loop — presumably it is the listing's own
        // header row; confirm against the page markup.
        for (int i = 1; i < rows; i++) {
            Element tr = table.appendElement("tr");
            tr.appendElement("td").text(page + "-" + i);
            // text() is called directly: Element.tagName(String) renames the element rather
            // than selecting a child, so the original tagName("a") call only mutated the tree.
            tr.appendElement("td").text(titles.get(i).text());
            tr.appendElement("td").text(companies.get(i).text());
            tr.appendElement("td").text(salaries.get(i).text());
            tr.appendElement("td").text(locations.get(i).text());
            tr.appendElement("td").text(dates.get(i).text());
        }
    }
}
input.html模板(程序读取该模板,填充表格后生成output.html):
<!doctype html>
<!-- Report template: the Java program fills the tbody with id "workinfo" with one row per job listing. -->
<html lang="zh-CN">
<head>
  <meta charset="UTF-8">
  <meta name="Generator" content="EditPlus®">
  <meta name="Author" content="">
  <meta name="Keywords" content="">
  <meta name="Description" content="">
  <title>智联工作信息</title>
  <style>
    body{margin:0;padding:0;}
    .header{height:100px;width:100%;background:#39c;color:#fff;text-align:center;line-height:100px;font-size:40px;font-family:"微软雅黑";}
    .body{width:100%;background:#fff;}
    .body table{width:90%;margin:0 auto;color:#2e2e2e;border:1px solid #cad9ea;border-collapse:collapse;}
    /* Both selectors are scoped to the report table so unrelated td elements are unaffected. */
    .body table th,.body table td{min-width:50px;max-width:300px;}
    .feeter{height:30px;width:100%;background:#39c;color:#fff;text-align:center;line-height:30px;font-size:14px;font-family:"微软雅黑";}
  </style>
</head>
<body>
  <div class="header">智联工作信息</div>
  <div class="body">
    <table class="work" border="1">
      <!-- Rows are generated at run time; the id must stay "workinfo". -->
      <tbody id="workinfo">
      </tbody>
    </table>
  </div>
  <div class="feeter">版权所有 翻版必究@2017 sysker</div>
</body>
</html>