zoukankan      html  css  js  c++  java
  • Java实例——基于jsoup的简单爬虫实现(从智联获取工作信息)

      这几天在学习用Java解析XML,突然想到DOM能不能用来解析HTML,结果试了半天行不通。于是去查了一些资料,发现很多人都在用Jsoup解析HTML文件,研究了一下之后写了一个简单的实例。感觉还有很多地方需要润色,在这里分享出来,欢迎交流指教!

      后续想通过Java把数据导入到Excel或者生成一个报表!

     package gethtml;

     import java.io.IOException;
     import java.net.URLEncoder;
     import org.jsoup.Jsoup;
     import org.jsoup.nodes.Document;
     import org.jsoup.nodes.Element;
     import org.jsoup.select.Elements;

     /**
      * Fetches job postings from the Zhaopin search-result pages and prints them to standard output.
      *
      * <p>The search is parameterised by a city and a keyword; the site URL itself is fixed.
      *
      * @author syskey
      */
     public class JsoupHtml {

         /** Search endpoint; the city is appended as the value of the {@code jl} query parameter. */
         private static final String SEARCH_URL = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=";

         /** Number of result pages requested per run (pages 1 through 10). */
         private static final int PAGE_COUNT = 10;

         /** Separator printed between the columns of one posting. */
         private static final String COLUMN_SEPARATOR = "*****";

         private final String city;
         private final String keywords;

         /**
          * Creates a scraper for one search.
          *
          * @param city     city to search jobs in
          * @param keywords keyword(s) to search jobs for
          */
         public JsoupHtml(String city, String keywords) {
             this.city = city;
             this.keywords = keywords;
         }

         /**
          * Downloads result pages 1..{@value #PAGE_COUNT} and prints one line per row of the result
          * table: title, company, salary, location and date, separated by {@code *****}.
          *
          * <p>A page on which the result table cannot be found is reported on standard error and
          * skipped. An {@link IOException} while downloading stops the run; its stack trace is printed.
          */
         public void getZhiLianWork() {
             try {
                 // Encode the user-supplied values so characters such as '&', '#', '+' or spaces
                 // (e.g. the keyword "c++") cannot break the query string.
                 String encodedCity = URLEncoder.encode(city, "UTF-8");
                 String encodedKeywords = URLEncoder.encode(keywords, "UTF-8");

                 for (int page = 1; page <= PAGE_COUNT; page++) {
                     System.out.println("*********开始遍历第" + page + "页的求职信息*********");
                     String pageUrl = SEARCH_URL + encodedCity + "&kw=" + encodedKeywords
                             + "&p=" + page + "&isadv=0";
                     Document doc = Jsoup.connect(pageUrl).get();

                     Element content = doc.getElementById("newlist_list_content_table");
                     if (content == null) {
                         // getElementById returns null when the id is absent (layout change,
                         // empty result, block page); skip instead of failing with an NPE.
                         System.err.println("第" + page + "页未找到职位列表(newlist_list_content_table),已跳过");
                     } else {
                         printPostings(content);
                     }
                     System.out.println("*********结束遍历第" + page + "页的求职信息*********");
                 }
             } catch (IOException e) {
                 // Network or HTTP failure: report it; the remaining pages are not fetched.
                 e.printStackTrace();
             }
         }

         /** Prints every row of the result table found under {@code content}, followed by a blank line. */
         private static void printPostings(Element content) {
             Elements titles = content.getElementsByClass("zwmc");
             Elements companies = content.getElementsByClass("gsmc");
             Elements salaries = content.getElementsByClass("zwyx");
             Elements locations = content.getElementsByClass("gzdd");
             Elements dates = content.getElementsByClass("gxsj");

             // The five lists are matched by index, so only rows present in all of them are printed.
             int rows = smallestSize(titles, companies, salaries, locations, dates);
             for (int row = 0; row < rows; row++) {
                 // text() already returns the combined text of the cell and its children;
                 // the former tagName("a") call only renamed the element and was dropped.
                 System.out.println(
                         titles.get(row).text() + COLUMN_SEPARATOR
                                 + companies.get(row).text() + COLUMN_SEPARATOR
                                 + salaries.get(row).text() + COLUMN_SEPARATOR
                                 + locations.get(row).text() + COLUMN_SEPARATOR
                                 + dates.get(row).text());
                 System.out.println();
             }
         }

         /** Returns the size of the shortest of the given element lists. */
         private static int smallestSize(Elements first, Elements... others) {
             int smallest = first.size();
             for (Elements column : others) {
                 smallest = Math.min(smallest, column.size());
             }
             return smallest;
         }

         public static void main(String[] args) {
             JsoupHtml jHtml = new JsoupHtml("上海", "java");
             jHtml.getZhiLianWork();
         }
     }

       更新源代码,支持生成html表格:

    package jsouphtml;
    
    import java.io.BufferedWriter;
    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.OutputStreamWriter;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    /**
     * Scrapes job postings from the Zhaopin search-result pages and writes them as an HTML table.
     *
     * <p>The program reads the template {@code input.html}, fills the element with id
     * {@code workinfo} with one row per posting, prints the resulting document to standard output
     * and saves it as {@code output.html}.
     */
    public class JsoupHtml {

        /** Search endpoint; the city is appended as the value of the {@code jl} query parameter. */
        private static final String SEARCH_URL = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=";

        /** Number of result pages requested per run (pages 1 through 10). */
        private static final int PAGE_COUNT = 10;

        /**
         * Runs the scrape for the hard-coded city and keyword.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            String city = "西安";
            String keywords = "java";
            try {
                Document report = Jsoup.parse(new File("input.html"), "UTF-8", "");
                Element table = report.getElementById("workinfo");
                if (table == null) {
                    // Without the target element there is nowhere to put the rows.
                    System.err.println("input.html 中未找到 id 为 workinfo 的元素");
                    return;
                }
                // Drop whatever the template held so repeated runs do not accumulate rows.
                table.text("");
                appendRow(table, "th", "序号", "职位名称", "公司名称", "职位月薪", "工作地点", "发布日期");

                // Encode the query values so characters such as '&', '#', '+' or spaces
                // cannot break the query string.
                String encodedCity = java.net.URLEncoder.encode(city, "UTF-8");
                String encodedKeywords = java.net.URLEncoder.encode(keywords, "UTF-8");

                // Pages are requested as p=1..10; the row label keeps the "<page>-<row>" form.
                for (int page = 1; page <= PAGE_COUNT; page++) {
                    Document doc = Jsoup.connect(
                            SEARCH_URL + encodedCity + "&kw=" + encodedKeywords + "&p=" + page).get();
                    Element content = doc.getElementById("newlist_list_content_table");
                    if (content == null) {
                        // getElementById returns null when the id is absent; skip this page.
                        System.err.println("第" + page + "页未找到职位列表(newlist_list_content_table),已跳过");
                        continue;
                    }
                    Elements titles = content.getElementsByClass("zwmc");
                    Elements companies = content.getElementsByClass("gsmc");
                    Elements salaries = content.getElementsByClass("zwyx");
                    Elements locations = content.getElementsByClass("gzdd");
                    Elements dates = content.getElementsByClass("gxsj");

                    // The lists are matched by index, so stop at the shortest one.
                    int rows = Math.min(
                            Math.min(titles.size(), companies.size()),
                            Math.min(salaries.size(), Math.min(locations.size(), dates.size())));

                    // Index 0 is skipped, as in the original listing
                    // (presumably the header row of the result table; confirm against the page).
                    for (int i = 1; i < rows; i++) {
                        appendRow(table, "td",
                                page + "-" + i,
                                titles.get(i).text(),
                                companies.get(i).text(),
                                salaries.get(i).text(),
                                locations.get(i).text(),
                                dates.get(i).text());
                    }
                }

                String html = report.html();
                System.out.println(html);
                // Open the output only once the document is complete, and always close it:
                // a failed scrape no longer truncates an existing output.html or leaks the writer.
                try (BufferedWriter writer = new BufferedWriter(
                        new OutputStreamWriter(new FileOutputStream("output.html"), "utf-8"))) {
                    writer.write(html);
                }
            } catch (IOException e) {
                // Template unreadable, network/HTTP failure, or output not writable.
                e.printStackTrace();
            }
        }

        /** Appends a {@code tr} to {@code table} holding one {@code cellTag} cell per value. */
        private static void appendRow(Element table, String cellTag, String... values) {
            Element row = table.appendElement("tr");
            for (String value : values) {
                row.appendElement(cellTag).text(value);
            }
        }
    }
    

      input.html模板(程序读取该模板,填入表格后生成output.html):

    <!doctype html>
    <html lang="zh-CN">
     <head>
      <meta charset="UTF-8">
      <meta name="Generator" content="EditPlus®">
      <meta name="Author" content="">
      <meta name="Keywords" content="">
      <meta name="Description" content="">
      <title>智联工作信息</title>
      <style>
      body{margin:0;padding:0;}
        .header{height:100px;width:100%;background:#39c;color:#fff;text-align:center;line-height:100px;font-size:40px;
            font-family:"微软雅黑";}
        .body{width:100%;background:#fff;}
        .body table{width:90%;margin:0 auto;color:#2e2e2e;border:1px solid #cad9ea; border-collapse: collapse; }
        .body table th,.body table td{min-width:50px;max-width:300px;}
        .feeter{height:30px;width:100%;background:#39c;color:#fff;text-align:center;line-height:30px;font-size:14px;
            font-family:"微软雅黑";}
      </style>
     </head>
     <body>
        <div class="header">智联工作信息</div>
        <div class="body">
            <table class="work" border="1">
                <!-- Filled at run time: the Java program looks this element up by id "workinfo",
                     clears it and appends one <tr> per job posting. Keep the id unchanged. -->
                <tbody id="workinfo">
                </tbody>
            </table>
        </div>
        <div class="feeter">版权所有 翻版必究@2017 sysker</div>
     </body>
    </html>
    

      

  • 相关阅读:
    某个牛人做WINDOWS系统文件详解
    常用ASP脚本程序集锦
    LINUX基础:文件安全与权限
    proftpd+mysql+quota
    apache2.0.49tomcat5.0.19jk2建立virtualHost
    URL Redirection(转) Anny
    顶级域名后缀列表(转) Anny
    \u4E00\u9FA5意义 Anny
    How to POST Form Data Using Ruby(转) Anny
    How to get rid of 'Enter password to unlock your login keyring' in Ubuntu(转) Anny
  • 原文地址:https://www.cnblogs.com/caoleiCoding/p/6476203.html
Copyright © 2011-2022 走看看