zoukankan      html  css  js  c++  java
  • Java实例——基于jsoup的简单爬虫实现(从智联获取工作信息)

      这几天在学习Java解析xml,突然想到Dom能不能解析html,结果试了半天行不通,然后就去查了一些资料,发现很多人都在用Jsoup解析html文件,然后研究了一下,写了一个简单的实例,感觉还有很多地方需要润色,在这里分享一下我的实例,欢迎交流指教!

      后续想通过Java把数据导入到Excel或者生成一个报表!

     1 package gethtml;
     2 
     3 
     4 
     5 import java.io.IOException;
     6 import org.jsoup.Jsoup;
     7 import org.jsoup.nodes.Document;
     8 import org.jsoup.nodes.Element;
     9 import org.jsoup.select.Elements;
    10 
    11 /**从智联招聘获取招聘信息
    12  * @author syskey
    13  * @url 智联招聘网站链接(建议不要更改)
    14  * @city 搜索工作的城市
    15  * @keywrods 搜索工作的相关关键字
    16  */
    17 
    18 public class JsoupHtml {
    19     
    20     private String url="http://sou.zhaopin.com/jobs/searchresult.ashx?jl=";  //智联招聘网站
    21     private  String city="西安"; //搜索工作的城市
    22     private  String keywords="java";  //搜索工作的关键字
    23     public JsoupHtml(String city,String keywords){        
    24         this.city=city;
    25         this.keywords =keywords;
    26         
    27     }
    28     
    29     public void getZhiLianWork(){
    30         try {
    31             for (int i=0;i<10;i++) {
    32                     System.out.println("*********开始遍历第"+(i+1)+"页的求职信息*********");
    33                     Document doc = Jsoup.connect(url+city+"&kw="+keywords+"&p="+(i+1)+"&isadv=0").get();                    
    34                     Element content = doc.getElementById("newlist_list_content_table");            
    35                     Elements zwmcEls = content.getElementsByClass("zwmc");
    36                     Elements gsmcEls = content.getElementsByClass("gsmc");            
    37                     Elements zwyxEls = content.getElementsByClass("zwyx");            
    38                     Elements gzddEls = content.getElementsByClass("gzdd");            
    39                     Elements gxsjEls = content.getElementsByClass("gxsj");
    40                     for(int j = 0;j<zwmcEls .size();j++){
    41                         
    42                         System.out.println(
    43                                 zwmcEls.get(j).tagName("a").text()+"*****"+gsmcEls.get(j).tagName("a").text()+
    44                                 "*****"+zwyxEls.get(j).tagName("a").text()+"*****"+gzddEls.get(j).tagName("a").text()+
    45                                 "*****"+gxsjEls.get(j).tagName("a").text());
    46                         System.out.println();
    47                 }
    48                     System.out.println("*********结束遍历第"+(i+1)+"页的求职信息*********");
    49             
    50             }
    51             
    52         } catch (IOException e) {
    53             // TODO Auto-generated catch block
    54             e.printStackTrace();
    55         }
    56     }
    57     public static void main(String[] args) {    
    58         
    59         JsoupHtml jHtml = new JsoupHtml("上海", "java");
    60         jHtml.getZhiLianWork();
    61         
    62     }
    63 
    64 }

       更新源代码,支持生成html表格:

    package jsouphtml;
    
    import java.io.BufferedWriter;
    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.OutputStreamWriter;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    /**
     * Builds an HTML report of Zhaopin (智联招聘) job postings.
     *
     * <p>The program reads {@code input.html} as a template, fills the element
     * whose id is {@code workinfo} with one table row per posting scraped from
     * the search-result pages, prints the resulting document and writes it to
     * {@code output.html}.
     */
    public class JsoupHtml {

        /** Search endpoint; the city is appended as the value of the {@code jl} parameter. */
        private static final String SEARCH_URL = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=";

        /** Number of result pages fetched per run. */
        private static final int PAGE_COUNT = 10;

        /** Header cells of the report table, in column order. */
        private static final String[] HEADERS = {
            "序号", "职位名称", "公司名称", "职位月薪", "工作地点", "发布日期"
        };

        /** CSS classes of the scraped columns, in the same order as the headers after the index. */
        private static final String[] COLUMN_CLASSES = {"zwmc", "gsmc", "zwyx", "gzdd", "gxsj"};

        /**
         * Scrapes the postings and writes the report.
         *
         * <p>I/O failures are reported on standard error. {@code output.html} is
         * only opened once the whole report has been built, so a failed run does
         * not leave a truncated file behind.
         */
        public static void main(String[] args) {
            String city = "西安";
            String keywords = "java";
            try {
                Document report = Jsoup.parse(new File("input.html"), "UTF-8", "");
                Element table = report.getElementById("workinfo");
                if (table == null) {
                    System.err.println("input.html has no element with id \"workinfo\"");
                    return;
                }
                table.text("");
                appendHeaderRow(table);

                // Encode the query values so that characters such as '+', '#', '&'
                // or spaces cannot corrupt the query string.
                String query = SEARCH_URL + java.net.URLEncoder.encode(city, "UTF-8")
                        + "&kw=" + java.net.URLEncoder.encode(keywords, "UTF-8");

                // Result pages are requested with p=1..PAGE_COUNT.
                for (int page = 1; page <= PAGE_COUNT; page++) {
                    Document doc = Jsoup.connect(query + "&p=" + page).get();
                    Element content = doc.getElementById("newlist_list_content_table");
                    if (content == null) {
                        // getElementById returns null when the id is absent from the page.
                        System.err.println("Page " + page + ": result table not found, skipping");
                        continue;
                    }
                    appendPostingRows(table, content, page);
                }

                String html = report.html();
                System.out.println(html);
                try (BufferedWriter writer = new BufferedWriter(
                        new OutputStreamWriter(new FileOutputStream("output.html"), "utf-8"))) {
                    writer.write(html);
                }
            } catch (IOException e) {
                System.err.println("Failed to build the job report: " + e);
            }
        }

        /** Appends the header row to the report table. */
        private static void appendHeaderRow(Element table) {
            Element header = table.appendElement("tr");
            for (String title : HEADERS) {
                header.appendElement("th").text(title);
            }
        }

        /**
         * Appends one report row per posting found in {@code content}.
         *
         * @param table   report table body that receives the rows
         * @param content result table of one search-result page
         * @param page    1-based page number, used in the row label ("page-index")
         */
        private static void appendPostingRows(Element table, Element content, int page) {
            Elements[] columns = new Elements[COLUMN_CLASSES.length];
            int rows = Integer.MAX_VALUE;
            for (int c = 0; c < COLUMN_CLASSES.length; c++) {
                columns[c] = content.getElementsByClass(COLUMN_CLASSES[c]);
                // The columns are read in parallel by index, so stop at the shortest one.
                rows = Math.min(rows, columns[c].size());
            }

            // Index 0 is skipped, as in the original loop; presumably it is the
            // header row of the scraped table (verify against the live page).
            for (int i = 1; i < rows; i++) {
                Element tr = table.appendElement("tr");
                tr.appendElement("td").text(page + "-" + i);
                for (Elements column : columns) {
                    // text() already includes the text of nested <a> elements.
                    tr.appendElement("td").text(column.get(i).text());
                }
            }
        }

    }
    

      input.html模板(程序读取该文件作为模板,填充表格后输出为output.html):

    <!doctype html>
    <html lang="en">
     <head>
      <meta charset="UTF-8">
      <meta name="Generator" content="EditPlus®">
      <meta name="Author" content="">
      <meta name="Keywords" content="">
      <meta name="Description" content="">
      <title>智联工作信息</title>
      <style>
      /* Page layout: full-width header, centred table, full-width footer. */
      body{margin:0;padding:0;}
    	.header{height:100px;width:100%;background:#39c;color:#fff;text-align:center;line-height:100px;font-size:40px;
    		font-family:"微软雅黑";}
    	.body{width:100%;background:#fff;}
    	.body table{width:90%;margin:0 auto;color:#2e2e2e;border:1px solid #cad9ea; border-collapse: collapse; }
    	.body table th,.body table td{min-width:50px;max-width:300px;}
    	.feeter{height:30px;width:100%;background:#39c;color:#fff;text-align:center;line-height:30px;font-size:14px;
    		font-family:"微软雅黑";}
      </style>
     </head>
     <body>
    	<div class="header">智联工作信息</div>
    	<div class="body">
    		<table class="work" border="1">
    			<!-- The Java program looks this element up by id="workinfo" and appends the table rows to it. -->
    			<tbody id="workinfo">
    			</tbody>
    		</table>
    	</div>
    	<div class="feeter">版权所有 翻版必究@2017 sysker</div>
     </body>
    </html>
    

      

  • 相关阅读:
    TensorRT 开始
    Vim 常用操作
    Android AsyncTask 的实现及 cancel 方式
    让 Andriod TextView 中的文本链接可点击的方法—— Linkify 及其他
    Android通过百度地图API用Service和Alarm在后台定时获取地理位置信息
    【转】Mac OS X 快捷键合集
    iframe导致的IE6下https页面安全提示
    [转载]IE的版本识别
    [转]iframe异步加载
    IE的html条件注释
  • 原文地址:https://www.cnblogs.com/caoleiCoding/p/6476203.html
Copyright © 2011-2022 走看看