zoukankan      html  css  js  c++  java
  • java爬虫入门--用jsoup爬取汽车之家的新闻

    概述

    使用jsoup来进行网页数据爬取。jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。

    详细

    jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。

    jsoup的主要功能如下:

    1. 从一个URL,文件或字符串中解析HTML;

    2. 使用DOM或CSS选择器来查找、取出数据;

    3. 可操作HTML元素、属性、文本。

    jsoup 基于 MIT 协议发布,可放心用于商业项目。

    第一步:项目预览

    blob.png

    第二步:代码实现

    主程序为 GrapNews 类,实现了从汽车之家等网站抓取新闻内容的功能。GrapNews 含有 main 方法,直接运行即可。

    package net.sinolbs.ycd.news;
    
    import java.net.URLEncoder;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
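    /**
     * Demo crawler built on jsoup: scrapes articles from Autohome, Jokeji and Sohu,
     * and videos from Miaopai, into {@link News} objects.
     * Created 2017-05-21 00:25:30.
     */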
    public class GrapNews {
    	
        /**
         * Matches a single character in the range U+4E00..U+9FA5 (CJK unified ideographs).
         * The backslash is doubled so the regex engine, not the Java compiler, interprets the
         * {@code \\uXXXX} escapes.
         */
        private static final Pattern CHINESE_CHAR = Pattern.compile("[\\u4e00-\\u9fa5]");

        /**
         * Tells whether a string contains at least one Chinese character.
         *
         * @param str the text to inspect, may be {@code null}
         * @return {@code true} if any character lies in U+4E00..U+9FA5; {@code false} otherwise,
         *         including for {@code null} or empty input
         */
        public static boolean isContainChinese(String str) {
            if (str == null) {
                return false;
            }
            return CHINESE_CHAR.matcher(str).find();
        }
        
        /**
         * 从笑话集抓取笑话
         * @param size
         * @param baseUrl
         * @param domainName
         * @param newsListClassOrId
         * @param classOrId
         * @param newsULIndex
         * @param newsContentClassOrId
         * @param titleTagOrClass
         * @param dateTag
         * @return
         */
        public static ArrayList<News> getNewsFromJokeji(int size,String baseUrl,String domainName,
        		String newsListClassOrId,int newsULIndex,
        		String newsContentClassOrId,String titleTagOrClass,String dateTag){
            ArrayList<News> newsList = new ArrayList<News>();
            Document doc;
            Element element =null;
            Element title =null;
            News news = null;
    		try {
    			doc = Jsoup.connect(baseUrl).timeout(10000).get();
    			element = (Element) doc.getElementsByClass(newsListClassOrId).first();
    	        Elements elements = element.getElementsByTag("li");
    	        if(elements!=null&&elements.size()>0){
    	            for(Element ele:elements){
    	                news = new News();
    	                title = ele.select("a").first();
    	                if(title==null){
    	                	continue;
    	                }
    	                news.setTitle(title.getElementsByTag(titleTagOrClass).text());
    	                if(news.getTitle()==null||news.getTitle().equals("")){
    	                	continue;
    	                }
    	                news.setHref(domainName+title.attr("href"));
    	                if(dateTag!=null){
    	                	String date=ele.select("i").text();
    	  	                news.setDate(date);
    	                }
    	                String newsUrl =news.getHref();
    					if (isContainChinese(news.getHref())) {
    						newsUrl = URLEncoder.encode(news.getHref(), "utf-8")
    								.toLowerCase().replace("%3a", ":").replace("%2f", "/");
    					}
    	                Document newsDoc = Jsoup.connect(newsUrl).timeout(10000).get();
    	                String text=newsDoc.getElementById(newsContentClassOrId).html();
    	                text=deleteImg(text);
    	                text=deleteA(text);
    	                 StringBuffer textBuffer=new StringBuffer(5);
    	                 textBuffer.append("<!DOCTYPE html><html><head><meta name="content-type" content="text/html; charset=UTF-8">");
    	                 textBuffer.append("</head><body>");
    	                 textBuffer.append(deleteSource(text));
    	                 textBuffer.append("</body></html>");
    	                 news.setContent(textBuffer.toString());
    	                 news.setContent(textBuffer.toString());
    	                 System.out.println("标题====="+news.getTitle());
    	                 System.out.println("href====="+news.getHref());
    	                 System.out.println("content====="+news.getContent());
    	                 newsList.add(news);
    	                if(newsList.size()==size){
    	                	break;
    	                }
    	            }
    	        }
    		} catch (Exception e) {
    			e.printStackTrace();
    		}
            return newsList;
        }
    
        /**
         * 从汽车之家抓新闻
         * @param size
         * @param baseUrl
         * @param domainName
         * @param newsListId
         * @param newsContentClass
         * @param titleTagOrClass
         * @param limitHref
         * @param dateTag
         * @param needDeleteAlt
         * @return
         */
        public static ArrayList<News> getNewsFromCarHome(int size,String baseUrl,String domainName,String newsListId,
        		String newsContentClass,String titleTag,String dateTag,String needDeleteAlt){
            ArrayList<News> newsList = new ArrayList<News>();
            Document doc;
            Elements elements =null;
            Element title =null;
            News news = null;
    		try {
    			doc = Jsoup.connect(baseUrl).timeout(10000).get();
    			elements = (Elements) doc.getElementById(newsListId).children();
    	        if(elements!=null&&elements.size()>0){
    	            for(Element ele:elements){
    	                news = new News();
    	                title = ele.select("a").first();
    	                if(title==null){
    	                	continue;
    	                }
    	                news.setTitle(title.getElementsByTag(titleTag).text());
    	                if(news.getTitle()==null||news.getTitle().equals("")){
    	                	continue;
    	                }
    	                news.setHref(domainName+title.attr("href"));
    	                if(dateTag!=null){
    	                	String date=ele.select("i").text();
    	  	                news.setDate(date);
    	                }
    	                String newsUrl =news.getHref();
    					if (isContainChinese(news.getHref())) {
    						newsUrl = URLEncoder.encode(news.getHref(), "utf-8")
    								.toLowerCase().replace("%3a", ":").replace("%2f", "/");
    					}
    	                Document newsDoc = Jsoup.connect(newsUrl).timeout(10000).get();
    	                String text=newsDoc.getElementsByClass(newsContentClass).html();
    	                if(text.indexOf("余下全文")>0||text.indexOf("未经许可")>0
    	                		||text.indexOf("禁止转载")>0||text.indexOf("公众号")>0||text.indexOf("公众账号")>0){
    	                	continue;
    	                }
    	                 text=replaceImgSrcFromDataSrc(text,true,needDeleteAlt);
    	                 int index=text.lastIndexOf("(");
    	                 if(index>0){
    	                	 text=text.substring(0,index);
    	                 }
    	                 StringBuffer textBuffer=new StringBuffer(5);
    	                 textBuffer.append("<!DOCTYPE html><html><head><meta name="content-type" content="text/html; charset=UTF-8">");
    	                 textBuffer.append("</head><body>");
    	                 textBuffer.append(deleteSource(text));
    	                 textBuffer.append("</body></html>");
    	                 news.setContent(textBuffer.toString());
    	                 news.setContent(textBuffer.toString());
    	                 System.out.println("标题====="+news.getTitle());
    	                 System.out.println("href====="+news.getHref());
    	                 System.out.println("content====="+news.getContent());
    	                 newsList.add(news);
    	                if(newsList.size()==size){
    	                	break;
    	                }
    	            }
    	        }
    		} catch (Exception e) {
    			e.printStackTrace();
    		}
            return newsList;
        }
        
        
        
    	/**
    	 * Downloads the page at {@code baseUrl} and extracts the video URL from its trimmed HTML
    	 * via {@code getUrlFromMiaoPaiHtml}.
    	 *
    	 * @param baseUrl URL of the page to download
    	 * @return whatever {@code getUrlFromMiaoPaiHtml} returns for the page's HTML
    	 * @throws Exception if the download or the extraction fails
    	 */
    	public static String getVideoFromMiaoPai(String baseUrl) throws Exception{
    		final String pageHtml = Jsoup.connect(baseUrl).timeout(10000).get().html();
    		return getUrlFromMiaoPaiHtml(pageHtml.trim());
    	}
        
    	/**
    	 * Extracts the video URL embedded in a page's HTML.
    	 *
    	 * <p>The candidate text starts 11 characters after the first {@code videoSrc} marker (the
    	 * 8-character marker plus 3 separator characters — presumably {@code ":"}; TODO confirm
    	 * against a live page) and extends to the next {@code poster} marker. The URL is that text
    	 * up to its first double quote.
    	 *
    	 * @param html page HTML, may be {@code null}
    	 * @return the extracted URL, or {@code null} when {@code html} is {@code null}, a marker is
    	 *         missing, or the value is empty
    	 */
    	public static String getUrlFromMiaoPaiHtml(String html){
    		if (html == null) {
    			return null;
    		}
    		int startIndex = html.indexOf("videoSrc");
    		if (startIndex < 0) {
    			return null;
    		}
    		// Search after the videoSrc marker so an earlier "poster" cannot invert the range.
    		int endIndex = html.indexOf("poster", startIndex);
    		if (endIndex < 0) {
    			return null;
    		}
    		String videoUrl = html.substring(startIndex + 11, endIndex + 5);
    		int quoteIndex = videoUrl.indexOf('"');
    		if (quoteIndex == 0) {
    			// The closing quote comes first: the value is empty.
    			return null;
    		}
    		if (quoteIndex > 0) {
    			return videoUrl.substring(0, quoteIndex);
    		}
    		return videoUrl;
    	}
    	
    	/**
    	 * Extracts the poster image URL embedded in a page's HTML.
    	 *
    	 * <p>The URL starts 9 characters after the first {@code poster} marker (the 6-character
    	 * marker plus 3 separator characters — presumably {@code ":"}; TODO confirm against a live
    	 * page) and ends with the first {@code jpg} that follows the marker.
    	 *
    	 * @param html page HTML, may be {@code null}
    	 * @return the extracted URL, or {@code null} when {@code html} is {@code null} or a marker
    	 *         is missing
    	 */
    	public static String getVideoPhotoFromMiaoPaiHtml(String html){
    		// NOTE(review): dumps the whole page to stdout; looks like leftover debug output.
    		System.out.println(html);
    		if (html == null) {
    			return null;
    		}
    		int startIndex = html.indexOf("poster");
    		if (startIndex < 0) {
    			return null;
    		}
    		int jpgIndex = html.indexOf("jpg", startIndex);
    		if (jpgIndex < 0) {
    			return null;
    		}
    		return html.substring(startIndex + 9, jpgIndex + 3);
    	}
    	
        /**
         * Demo entry point: runs the Autohome, Jokeji and Sohu crawlers one after another.
         *
         * @param args unused
         * @throws Exception declared for convenience
         */
        public static void main(String[] args) throws Exception{
            final String carHomeListUrl = "http://m.autohome.com.cn/channel";
            final String carHomeDomain = "http://m.autohome.com.cn";
            getNewsFromCarHome(2, carHomeListUrl, carHomeDomain, "list", "details", "h4", "time", "汽车之家");

            final String jokejiListUrl = "http://www.jokeji.cn/list.htm";
            final String jokejiDomain = "http://www.jokeji.cn";
            getNewsFromJokeji(3, jokejiListUrl, jokejiDomain, "list_title", 1, "text110", "a", "i");

            final String sohuListUrl = "http://m.sohu.com/c/1592/";
            getNewsFromSouHu(20, sohuListUrl, "a", null, null);
        }
    	 /**
    	  * Scrapes videos from a Miaopai list page.
    	  *
    	  * <p>Downloads {@code baseUrl} and visits every element with class {@code videoCont}. The
    	  * video id and cover URL are read from the {@code data-scid} and {@code data-img}
    	  * attributes of the entry's first {@code MIAOPAI_player} element. The detail page for that
    	  * id is then downloaded and the video URL extracted from it with
    	  * {@code getUrlFromMiaoPaiHtml}. Entries without a player element, without an id, or
    	  * without an extractable video URL are skipped.
    	  *
    	  * <p>Any exception ends the crawl: the stack trace is printed and the items collected so
    	  * far are returned.
    	  *
    	  * @param size stop once this many videos have been collected
    	  * @param baseUrl URL of the list page
    	  * @return the collected videos, possibly empty, never {@code null}
    	  */
    	public static ArrayList<News> getVideoFromMiaopai(int size,String baseUrl){
    		ArrayList<News> newsList = new ArrayList<News>();
    		try {
    			Document listDoc = Jsoup.connect(baseUrl).timeout(10000).get();
    			for (Element ele : listDoc.getElementsByClass("videoCont")) {
    				Element videoElement = ele.getElementsByClass("MIAOPAI_player").first();
    				if (videoElement == null) {
    					continue;
    				}
    				String videoId = videoElement.attr("data-scid");
    				String videoPhotoUrl = videoElement.attr("data-img");
    				Element about = ele.getElementsByClass("viedoAbout").first();
    				String videoTitle = about == null ? "" : about.getElementsByTag("p").text();
    				System.out.println("视频id"+videoId);
    				System.out.println("视频封面url"+videoPhotoUrl);
    				System.out.println("视频标题"+videoTitle);
    				// attr() yields an empty string, not null, for a missing attribute, so test
    				// for emptiness; without an id there is no detail page to fetch.
    				if (videoId.trim().length() == 0) {
    					continue;
    				}
    				String videoDetailUrl = "http://www.miaopai.com/show/"+videoId+".html";
    				// A per-entry document, so no entry can pick up another page's video URL.
    				Document detailDoc = Jsoup.connect(videoDetailUrl).timeout(10000).get();
    				System.out.println("视频详情url========"+videoDetailUrl);
    				News news = new News();
    				news.setTitle(videoTitle);
    				news.setHref("http://m.miaopai.com/show/"+videoId);
    				news.setPhotoUrl(videoPhotoUrl);
    				String videoUrl = getUrlFromMiaoPaiHtml(detailDoc.html());
    				if (videoUrl == null) {
    					continue;
    				}
    				news.setContent(createVideoHtml(videoUrl, videoPhotoUrl));
    				System.out.println("视频url====="+videoUrl);
    				System.out.println("视频html======"+news.getContent());
    				newsList.add(news);
    				// Honor the size limit the same way the news crawlers do.
    				if (newsList.size() == size) {
    					break;
    				}
    			}
    		} catch (Exception e) {
    			e.printStackTrace();
    		}
    		return newsList;
    	}
    
    	public static String createVideoHtml(String videoUrl,String videoPhotoUrl) {
    		Document doc;
    		StringBuffer textBuffer = new StringBuffer(5);
    		textBuffer.append("<!DOCTYPE html><html><head><meta name="content-type" content="text/html; charset=UTF-8">");
    		textBuffer.append("</head><body>");
    		textBuffer.append("<div align="center">");
    		textBuffer.append(" <video></video> </div>");
    		textBuffer.append("</body></html>");
    		doc = Jsoup.parse(textBuffer.toString());
    		doc.getElementsByTag("body").attr("style", "height:400px;");
    		doc.getElementsByTag("video").attr("style", "100%;max-height:400px;")
    			.attr("poster", videoPhotoUrl).attr("autoplay", "autoplay")
    				.attr("controls", "controls").attr("src", videoUrl);
    		return doc.toString();
    	}
    	
        /**
         * 从搜狐抓新闻
         * @param size
         * @param baseUrl
         * @param domainName
         * @param newsListId
         * @param newsContentClass
         * @param titleTagOrClass
         * @param limitHref
         * @param dateTag
         * @param needDeleteAlt
         * @return
         */
    	public static ArrayList<News> getNewsFromSouHu(int size,String baseUrl,
    			String titleTag,String dateTag,String needDeleteAlt){
            ArrayList<News> newsList = new ArrayList<News>();
            Document doc;
            Element element =null;
            Element title =null;
            News news = null;
    		try {
    			doc = Jsoup.connect(baseUrl).timeout(10000).get();
    			element =doc.getElementsByTag("section").get(2);
    			element = element.getElementsByClass("headlines").get(0);
    			 Elements elements=element.children(); 
    	        if(elements!=null&&elements.size()>0){
    	            for(Element ele:elements){
    	                news = new News();
    	                title = ele.select("a").first();
    	                if(title==null){
    	                	continue;
    	                }
    	                news.setTitle(title.getElementsByTag(titleTag).text());
    	                if(news.getTitle()==null||news.getTitle().equals("")
    	                		||news.getTitle().indexOf("广告")>0||news.getTitle().indexOf("视频")>0){
    	                	continue;
    	                }
    	                news.setHref("https://m.sohu.com"+title.attr("href"));
    	                if(dateTag!=null){
    	                    String dateStr=ele.select(dateTag).first().text();
    	  	                news.setDate(dateStr);
    	                }
    	                String newsUrl =news.getHref();
    					if (isContainChinese(news.getHref())) {
    						newsUrl = URLEncoder.encode(news.getHref(), "utf-8")
    								.toLowerCase().replace("%3a", ":").replace("%2f", "/");
    					}
    	                Document newsDoc = Jsoup.connect(newsUrl).timeout(10000).get();
    	                String text=newsDoc.getElementsByTag("article").html();
    	                if(text.indexOf("未经许可")>0||text.indexOf("禁止转载")>0
    	                		||text.indexOf("公众号")>0||text.indexOf("公众账号")>0){
    	                	continue;
    	                }
    	                int index=text.indexOf("<p class="para">");
    	                int lastIndex=text.indexOf("<div class="expend-wp"> ");
    	                if(lastIndex>0){
    	                	 text=text.substring(index,lastIndex);
    	                }else if(index>0){
    	                	text=text.substring(index,text.length());
    	                }
    	                text=replaceImgSrcFromDataSrc(text,true,null);
    	                if(text==null||text.length()==0){
    	                	continue;
    	                }
    	                 StringBuffer textBuffer=new StringBuffer(5);
    	                 textBuffer.append("<!DOCTYPE html><html><head>"
    	                 		+ "<meta name="content-type" content="text/html; charset=UTF-8">");
    	                 textBuffer.append("</head><body>");
    	                 textBuffer.append(deleteSource(text));
    	                 textBuffer.append("</body></html>");
    	                 news.setContent(textBuffer.toString());
    	                 news.setContent(textBuffer.toString());
    	                 System.out.println("标题====="+news.getTitle());
    	                 System.out.println("href====="+news.getHref());
    	                 System.out.println("content====="+news.getContent());
    	                 newsList.add(news);
    	                if(newsList.size()==size){
    	                	break;
    	                }
    	            }
    	        }
    		} catch (Exception e) {
    			e.printStackTrace();
    		}
            return newsList;
        }
        
        /**
         * Removes every {@code <img ...>} tag (an {@code <img} followed by a space, up to the next
         * {@code >}) from the text.
         *
         * @param text HTML text
         * @return the text without those tags
         */
        private static String deleteImg(String text) {
            final Matcher imgTags = Pattern.compile("<img [^>]*>").matcher(text);
            return imgTags.replaceAll("");
        }
        
        /**
         * Removes every anchor element, including its link text, from the text. The link text is
         * matched reluctantly and does not span line breaks.
         *
         * @param text HTML text
         * @return the text without anchors
         */
        private static String deleteA(String text) {
            final Matcher anchors = Pattern.compile("<a[^>]*>(.*?)</a>").matcher(text);
            return anchors.replaceAll("");
        }
        
        private static String deleteSource(String text) {
     		return text.replaceAll("\(.*?\)|\[.*?]", "");
     	}
    	/**
    	 * Strips the {@code href} attribute from every anchor that has one.
    	 *
    	 * @param content HTML to process
    	 * @return the HTML of the parsed document after the attributes were removed
    	 */
    	public static String removeHref(String content){
    		final Document parsed = Jsoup.parse(content);
    		// Elements.removeAttr applies the removal to each selected element.
    		parsed.select("a[href]").removeAttr("href");
    		return parsed.html();
    	}
        
        
    	/**
    	 * 将htmlBody中所有img标签中的src内容替换为原data-src的内容, <br/>
    	 * 如果不报含data-src,则src的内容不会被替换 <br/>
    	 * @param htmlBody html内容
    	 * @param needDeleteAlt 需要剔除的图片的alt信息
    	 * @param imgUrlNeedAddProtocolPrefix 图片的url是否需要添加http协议前缀
    	 * @return 返回替换后的内容
    	 */
    	public static String replaceImgSrcFromDataSrc(String htmlBody,
    			boolean imgUrlNeedAddProtocolPrefix,String needDeleteAlt) {
    		Document document = Jsoup.parseBodyFragment(htmlBody);
    		List<Element> nodes = document.select("img");
    		int nodeLenth = nodes.size();
    		if(nodeLenth==0){
    			return htmlBody;
    		}
    		for (int i = 0; i < nodeLenth; i++) {
    			Element e = nodes.get(i);
    			String dataSrc = e.attr("data-src");
    			if (isNotBlank(dataSrc)) {
    				e.attr("src", dataSrc);
    				e.removeAttr("data-src");
    			}
    			String originalSrc = e.attr("original");
    			if (isNotBlank(originalSrc)) {
    				e.attr("src", "http:"+originalSrc);
    				e.removeAttr("originalSrc");
    			}
    			String originalHiddenSrc = e.attr("original-hidden");
    			if (isNotBlank(originalHiddenSrc)) {
    				e.attr("src", "http:"+originalHiddenSrc);
    				e.removeAttr("original-hidden");
    			}
    		}
    		if (htmlBody.contains("<html>")) {
    			if(needDeleteAlt==null&&!imgUrlNeedAddProtocolPrefix){
    				return document.toString();
    			}else if(needDeleteAlt==null&&imgUrlNeedAddProtocolPrefix){
    				return document.toString().replace("src="//", "src="http://");
    			}else if(needDeleteAlt!=null&&imgUrlNeedAddProtocolPrefix){
    				return document.toString().replace("src="//", "src="http://")
    						.replace("alt="+needDeleteAlt, "");
    			}
    			return document.toString().replace("alt="+needDeleteAlt, "");
    		} else {
    			if(needDeleteAlt==null&&!imgUrlNeedAddProtocolPrefix){
    				return document.select("body>*").toString();
    			}else if(needDeleteAlt==null&&imgUrlNeedAddProtocolPrefix){
    				return document.select("body>*").toString().replace("src="//", "src="http://");
    			}else if(needDeleteAlt!=null&&imgUrlNeedAddProtocolPrefix){
    				return document.select("body>*").
    						toString().replace("src="//", "src="http://").replace("alt="+needDeleteAlt, "");
    			}
    			return document.select("body>*").toString().replace("alt="+needDeleteAlt, "");
    		}
    	
    	}
    	
    	
    	/**
    	 * Tells whether a string has any content besides surrounding whitespace.
    	 *
    	 * @param str the string to test, may be {@code null}
    	 * @return {@code false} for {@code null} or a string that is empty after {@code trim()},
    	 *         {@code true} otherwise
    	 */
    	private static boolean isNotBlank(String str){
    		return str != null && str.trim().length() != 0;
    	}
    }

    还有一个载体类,用于把爬取下来的网页内容封装到一个类里面。

    package net.sinolbs.ycd.news;
    
    /**
     * Plain data holder for one scraped item: numeric id, title, link, content, date and
     * photo URL. All properties are mutable through their setters.
     */
    public class News {

        private int id;
        private String title;
        private String href;
        private String content;
        private String date;
        private String photoUrl;

        /** Creates an item with every property left at its default value. */
        public News() {
        }

        /**
         * Creates an item with the main properties set; date and photo URL stay unset.
         *
         * @param title the item's title
         * @param href the item's link
         * @param content the item's content
         * @param id the item's numeric id
         */
        public News(String title, String href, String content, int id) {
            this.id = id;
            this.title = title;
            this.href = href;
            this.content = content;
        }

        /** @return the numeric id */
        public int getId() {
            return this.id;
        }

        /** @param id the numeric id */
        public void setId(int id) {
            this.id = id;
        }

        /** @return the title */
        public String getTitle() {
            return this.title;
        }

        /** @param title the title */
        public void setTitle(String title) {
            this.title = title;
        }

        /** @return the link */
        public String getHref() {
            return this.href;
        }

        /** @param href the link */
        public void setHref(String href) {
            this.href = href;
        }

        /** @return the content */
        public String getContent() {
            return this.content;
        }

        /** @param content the content */
        public void setContent(String content) {
            this.content = content;
        }

        /** @return the date text */
        public String getDate() {
            return this.date;
        }

        /** @param date the date text */
        public void setDate(String date) {
            this.date = date;
        }

        /** @return the photo URL */
        public String getPhotoUrl() {
            return this.photoUrl;
        }

        /** @param photoUrl the photo URL */
        public void setPhotoUrl(String photoUrl) {
            this.photoUrl = photoUrl;
        }
    }

    第三步:运行效果

    运行GrapNews类(有main方法)。

    blob.png

    注:本文著作权归作者,由demo大师发表,拒绝转载,转载需要作者授权

  • 相关阅读:
    JAVA_NIO 与Netty框架
    Socket编程
    P3368树状数组2(树状数组//改段求点)
    P3373 树状数组1(树状数组//改点求段)
    树状数组
    P1197 星球大战(并查集+链式向前星)
    P2024 食物链(种类并查集||带权并查集)
    P1111 修复公路(kruscal+并查集)
    P1387 最大正方形+P1736 创意吃鱼法(矩形上的dp+预处理)
    P2330 繁忙的城市(krusal最小生成树)
  • 原文地址:https://www.cnblogs.com/demodashi/p/8458089.html
Copyright © 2011-2022 走看看