zoukankan      html  css  js  c++  java
  • 爬虫获取数据

    1.pom.xml文件

    <!-- Maven build descriptor for the crawler sample (artifact test01:test01:1.0, packaged as a jar). -->
    <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
      xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
      <modelVersion>4.0.0</modelVersion>
      <groupId>test01</groupId>
      <artifactId>test01</artifactId>
    	<version>1.0</version>
    	<packaging>jar</packaging>
    	
     	<!-- Build properties; spring.version and quartz.version are referenced by the dependencies below. -->
     	<properties> 
     		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> 
    		<target.version>1.0</target.version>
    		<spring.version>4.2.3.RELEASE</spring.version>
     		<quartz.version>1.8.6</quartz.version> 
    	</properties> 
    	
    	<dependencies>
    		<!-- NOTE(review): junit is declared without a test scope - confirm that is intended. -->
    		<dependency>
    			<groupId>junit</groupId>
    			<artifactId>junit</artifactId>
    			<version>4.11</version>
    		</dependency>
    		<!-- Logging: log4j plus the slf4j-log4j12 binding -->
    		<dependency>
    			<groupId>log4j</groupId>
    			<artifactId>log4j</artifactId>
    			<version>1.2.17</version>
    		</dependency>
    		<dependency>
    			<groupId>org.slf4j</groupId>
    			<artifactId>slf4j-log4j12</artifactId>
    			<version>1.7.5</version>
    		</dependency>
    		<!-- WebCollector dependency -->
    		<dependency>
    			<groupId>cn.edu.hfut.dmic.webcollector</groupId>
    			<artifactId>WebCollector</artifactId>
    			<version>2.09</version>
    		</dependency>
    		<!-- selenium -->
    		<dependency>
    			<groupId>org.seleniumhq.selenium</groupId>
    			<artifactId>selenium-java</artifactId>
    			<version>2.44.0</version>
    		</dependency>
    		<!-- phantomjsdriver (third-party support for selenium webdriver) -->
    		<dependency>
    			<groupId>com.github.detro</groupId>
    			<artifactId>phantomjsdriver</artifactId>
    			<version>1.2.0</version>
    		</dependency>
    		<!-- Alibaba Druid and the MySQL JDBC connector -->
    		<dependency>
    			<groupId>com.alibaba</groupId>
    			<artifactId>druid</artifactId>
    			<version>1.0.31</version>
    		</dependency>
    		<dependency>
    			<groupId>mysql</groupId>
    			<artifactId>mysql-connector-java</artifactId>
    			<version>6.0.6</version>
    		</dependency>
    		<dependency>
    			<groupId>org.springframework</groupId>
    			<artifactId>spring-context</artifactId>
    			<version>${spring.version}</version>
    			<exclusions>
    				<!-- Exclude Commons Logging in favor of SLF4j -->
    				<exclusion>
    					<groupId>commons-logging</groupId>
    					<artifactId>commons-logging</artifactId>
    				</exclusion>
    			</exclusions>
    		</dependency>
    		<!-- jsonpath -->
    		<dependency>
    		    <groupId>net.minidev</groupId>
    		    <artifactId>json-smart</artifactId>
    		    <version>2.2.1</version>
    		</dependency>
    		<dependency>
    		    <groupId>com.jayway.jsonpath</groupId>
    		    <artifactId>json-path</artifactId>
    		    <version>2.2.0</version>
    		</dependency>
    		<dependency><!-- this artifact does not exist in 3.0.7 -->
    			<groupId>org.springframework</groupId>
    			<artifactId>spring-context-support</artifactId>
    			<version>${spring.version}</version>
    		</dependency>
    		<dependency>
    			<groupId>org.springframework</groupId>
    			<artifactId>spring-webmvc</artifactId>
    			<version>${spring.version}</version>
    		</dependency>
    		<dependency>
    			<groupId>org.springframework</groupId>
    			<artifactId>spring-orm</artifactId>
    			<version>${spring.version}</version>
    			<type>jar</type>
    			<scope>compile</scope>
    		</dependency>
    
    		<dependency>
    			<groupId>org.springframework</groupId>
    			<artifactId>spring-test</artifactId>
    			<version>${spring.version}</version>
    			<type>jar</type>
    			<scope>test</scope>
    		</dependency>
    		<dependency>
    			<groupId>org.quartz-scheduler</groupId>
    			<artifactId>quartz</artifactId>
    			<version>${quartz.version}</version>
    		</dependency>
    		<!-- Two JSON libraries are declared side by side: json-lib and fastjson -->
    		<dependency>
    			<groupId>net.sf.json-lib</groupId>
    			<artifactId>json-lib</artifactId>
    			<version>2.4</version>
    		</dependency>
    		<dependency>
    			<groupId>com.alibaba</groupId>
    			<artifactId>fastjson</artifactId>
    			<version>1.2.16.sec01</version>
    		</dependency>
    	</dependencies>
    	<build>
        <finalName>test01</finalName>
      </build>
    </project>
      
    

      2.测试文件

    package test01;
    
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    /**
     * Sample crawler that scrapes the NBA rolling-news list page with jsoup.
     */
    public class test {
    	/**
    	 * Configures the HTTP/HTTPS proxy through system properties, runs one crawl
    	 * and prints a marker line when it is done.
    	 */
    	public static void main(String[] args) {
    		System.setProperty("http.maxRedirects", "50");
    		System.getProperties().setProperty("proxySet", "true");
    		System.getProperties().setProperty("http.proxyHost", "10.19.110.55");
    		System.getProperties().setProperty("http.proxyPort", "8080");
    		System.getProperties().setProperty("https.proxyHost", "10.19.110.55");
    		System.getProperties().setProperty("https.proxyPort", "8080");
    		getCountry();
    		System.out.println(111);
    	}
    	
    	/**
    	 * Template crawl: downloads the news list page and turns every list entry into a map.
    	 * <p>
    	 * Each map carries the keys {@code fromStation} (source site), {@code channel},
    	 * {@code colImg}, {@code title}, {@code time}, {@code ReferenceSource} and {@code newsURL}.
    	 * The result is also printed to standard output.
    	 *
    	 * @return one map per list entry; empty when the request fails or the
    	 *         {@code boxlist} container is not present in the page
    	 */
    	public static List<Map<String, Object>> getCountry() {
    		List<Map<String, Object>> list = new ArrayList<Map<String,Object>>();
    		try {
    			Document doc = Jsoup
    					.connect("https://news.zhibo8.cc/nba/more.htm")
    					.timeout(3000)
    					.get();
    			
    			// getElementById returns null when the id is absent (e.g. layout change or error page).
    			Element box = doc.getElementById("boxlist");
    			if (box != null) {
    				for (Element item : box.select("div.dataList ul li")) {
    					Map<String, Object> map = new HashMap<String, Object>();
    					// source site
    					map.put("fromStation", "直播吧");
    					// crawled channel; stored under its own key so it no longer overwrites the source site
    					map.put("channel", "NBA新闻滚动");
    					// list image (none on this page)
    					map.put("colImg", "无");
    					// title and article URL come from the same anchor
    					Elements titleLink = item.select(".articleTitle a");
    					map.put("title", titleLink.html());
    					// publication time
    					map.put("time", item.select(".postTime").html());
    					// quoted source
    					map.put("ReferenceSource", item.select(".source").html());
    					// news URL
    					map.put("newsURL", titleLink.attr("href"));
    					list.add(map);
    				}
    			}
    		} catch (IOException e) {
    			e.printStackTrace();
    		}
    		System.out.println(list);
    		return list;
    	}
    }
    

      

    package test01;
    
    import java.io.IOException;
    import java.text.SimpleDateFormat;
    import java.util.ArrayList;
    import java.util.Calendar;
    import java.util.Date;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    import java.util.concurrent.Executors;
    import java.util.concurrent.ScheduledExecutorService;
    import java.util.concurrent.TimeUnit;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    import com.alibaba.fastjson.JSON;
    import com.jayway.jsonpath.Configuration;
    import com.jayway.jsonpath.JsonPath;
    import com.suning.web.service.NewerService;
    import com.suning.web.util.JDBCUtil;
    import com.suning.web.util.JsonpUntil;
    
    public class SportsTest {
    	public static JDBCUtil jdbcutil;
    	public static NewerService newerService = new NewerService();
    	/**
    	 * Entry point: routes HTTP and HTTPS traffic through the proxy via system
    	 * properties and then fetches a single article as a manual check.
    	 */
    	public static void main(String[] args) {
    		final String proxyHost = "10.19.110.55";
    		final String proxyPort = "8080";
    		System.setProperty("http.maxRedirects", "50");
    		System.setProperty("proxySet", "true");
    		System.setProperty("http.proxyHost", proxyHost);
    		System.setProperty("http.proxyPort", proxyPort);
    		System.setProperty("https.proxyHost", proxyHost);
    		System.setProperty("https.proxyPort", proxyPort);
    		/* Disabled: crawl every channel page once a day.
    		Runnable runnable1 = new Runnable() {
    			public void run() {
    				String[] keyword = {"day.html","interfb.html","innerfb.html","nba.html","cba.html","sports.html"};
    				for(String key : keyword){
    					getSportsList(key);
    				}
    			}
    		};
    		ScheduledExecutorService service = Executors.newSingleThreadScheduledExecutor();
    		// 2nd argument: delay before the first run; 3rd argument: interval between runs
    		service.scheduleAtFixedRate(runnable1, 0, 86400, TimeUnit.SECONDS);*/
    		//getSportsList("day.html");
    		// home page detail
    		//getMainContent("http://resource.ttplus.cn/publish/app/data/2017/07/20/67522/share1.html");
    		// news detail
    		getSportContent("http://www.ttplus.cn/publish/app/data/2017/07/20/67559/share1.html");
    		//getRealTime();
    	}
    	/**
    	 * Fetches the "24H" feed and converts every feed item into a map.
    	 * <p>
    	 * The response is parsed once; earlier revisions parsed it with two different
    	 * libraries and appended the items of both passes, so every item appeared twice.
    	 * Each returned map holds the list-level keys ({@code title}, {@code fromStation},
    	 * {@code channel}, {@code author}, {@code time}, {@code newsURL}, {@code imgUrl},
    	 * {@code commentsNumber}, {@code keyword}) plus {@code commentsList}, a one-element
    	 * list with the article body ({@code title}, {@code author}, {@code article_info},
    	 * {@code tags}, {@code detail}, {@code commentsList}).
    	 *
    	 * @return the converted items; empty when the request fails, the endpoint
    	 *         answers 404 or the payload does not report {@code type == "success"}
    	 */
    	@SuppressWarnings("unchecked")
    	private static List<Map<String,Object>> getRealTime() {
    		List<Map<String, Object>> list = new ArrayList<Map<String,Object>>();
    		SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    		String getUrl = "http://www.ttplus.cn/24h?lastid=";
    		String key = "";
    		try {
    			// JsonpUntil.encode returns null when the server answers 404.
    			Object response = JsonpUntil.encode(getUrl, key);
    			if (response == null) {
    				System.out.println(list);
    				return list;
    			}
    			String commentDe = response.toString();
    			System.out.println(commentDe);
    
    			Map<String,Object> parseData = (Map<String, Object>) JSON.parse(commentDe);
    			// status reported by the feed
    			String type = parseData == null ? null : String.valueOf(parseData.get("type"));
    			System.out.println(type);
    			Object content = parseData == null ? null : parseData.get("content");
    			if("success".equals(type) && content != null){
    				List<Map<String,Object>> pData = (List<Map<String, Object>>) JSON.parse(content.toString());
    				if (pData == null) {
    					pData = new ArrayList<Map<String,Object>>();
    				}
    				for(Map<String,Object> comm : pData){
    					String title = (String) comm.get("title");
    					String author = (String) comm.get("authorName");
    					String time = formatEpochMillis(formatter, comm.get("newstime"));
    
    					// list-level record
    					Map<String, Object> map2 = new HashMap<String, Object>();
    					map2.put("title", title);
    					// source site
    					map2.put("fromStation", "体坛+");
    					// crawled channel
    					map2.put("channel", "24H");
    					map2.put("author", author);
    					map2.put("time", time);
    					// the feed carries no article URL, list image or comment count
    					map2.put("newsURL", "");
    					map2.put("imgUrl", "");
    					map2.put("commentsNumber", "");
    					// keyword, used to store channels separately
    					map2.put("keyword", "");
    
    					// article body record
    					Map<String, Object> map = new HashMap<String, Object>();
    					map.put("title",title);
    					map.put("author",author);
    					map.put("article_info",time);
    					map.put("tags","");
    
    					// image URLs first, then the text, every part terminated by "@/"
    					StringBuilder detail = new StringBuilder();
    					Object rawImages = comm.get("img");
    					if (rawImages != null) {
    						List<Map<String,Object>> imgS = (List<Map<String, Object>>) JSON.parse(rawImages.toString());
    						if (imgS != null) {
    							for(Map<String,Object> img : imgS){
    								detail.append(img.get("imgurl")).append("@/");
    							}
    						}
    					}
    					detail.append(comm.get("content")).append("@/");
    					map.put("detail",detail.toString());
    
    					// the feed has no comments; keep the key with an empty list
    					map.put("commentsList", new ArrayList<Map<String,Object>>());
    
    					List<Map<String,Object>> commentsList = new ArrayList<Map<String,Object>>();
    					commentsList.add(map);
    					map2.put("commentsList", commentsList);
    
    					list.add(map2);
    				}
    			}
    		} catch (Exception e) {
    			e.printStackTrace();
    		}
    		System.out.println(list);
    		return list;
    	}
    
    	/**
    	 * Formats a JSON numeric value as a date, treating it as epoch milliseconds.
    	 * Accepts any {@link Number} because the JSON parser may hand back an
    	 * {@code Integer} or a {@code Long} depending on magnitude.
    	 *
    	 * @return the formatted date, or an empty string when the value is not numeric
    	 */
    	private static String formatEpochMillis(SimpleDateFormat formatter, Object value) {
    		if (!(value instanceof Number)) {
    			return "";
    		}
    		return formatter.format(new Date(((Number) value).longValue()));
    	}
    	/**
    	 * Crawls one channel page of the site and returns its carousel and news-list entries.
    	 * <p>
    	 * Every entry gets its own map. Carousel entries are kept only when they link to
    	 * {@code http://resource.ttplus.cn/publish/app/data/} and store the article under
    	 * {@code detail}; news-list entries that link to {@code video.html} are skipped and
    	 * the others store the article under {@code commentsList}.
    	 *
    	 * @param val page name appended to the site root, e.g. {@code "day.html"}; also
    	 *            stored as {@code keyword} on news-list entries
    	 * @return the collected entries; empty when the page cannot be loaded
    	 */
    	public static List<Map<String,Object>> getSportsList(String val){
    		List<Map<String, Object>> list = new ArrayList<Map<String,Object>>();
    		String url = "http://www.ttplus.cn/";
    		try {
    			Document doc = Jsoup.connect(url+val).timeout(3000).get();
    
    			// carousel slides
    			for (Element li : doc.select("#swiper-wrapper .swiper-slide")) {
    				String newsURL = li.select("a").attr("href");
    				if(!newsURL.contains("http://resource.ttplus.cn/publish/app/data/")){
    					continue;
    				}
    				// a fresh map per slide; a shared map would make all entries identical
    				Map<String, Object> map1 = new HashMap<String, Object>();
    				map1.put("title", li.select("a p").text());
    				// source site
    				map1.put("fromStation", "体坛+");
    				// crawled channel
    				map1.put("channel", "首页滚动");
    				// the carousel shows neither author, time nor comment count
    				map1.put("author", "");
    				map1.put("time_info", "");
    				// list image
    				map1.put("imgUrl", li.select("a img").attr("src"));
    				map1.put("commentsNumber", "");
    				map1.put("keyword", "main");
    				map1.put("newsURL", newsURL);
    				// article behind the slide
    				map1.put("detail", getSportContent(newsURL));
    				list.add(map1);
    			}
    
    			// news list
    			for(Element li : doc.select("#newsListBox #newsList li")){
    				String newsURL = li.select("a").attr("href");
    				if(newsURL.contains("video.html")){
    					continue;
    				}
    				Map<String, Object> map2 = new HashMap<String, Object>();
    				map2.put("title", li.select("a .newsBox-bd h3").text());
    				// source site
    				map2.put("fromStation", "体坛+");
    				// crawled channel
    				map2.put("channel", "首页滚动");
    
    				// the spans are read positionally: author, time, comment count
    				Elements deta = li.select("a .newsBox-bd p span");
    				map2.put("author", spanTextAt(deta, 0));
    				map2.put("time", spanTextAt(deta, 1));
    				map2.put("newsURL", newsURL);
    				// list image
    				map2.put("imgUrl", li.select("a .newsBox-hd img").attr("src"));
    				map2.put("commentsNumber", spanTextAt(deta, 2));
    				// keyword, used to store channels separately
    				map2.put("keyword", val);
    				// article and its comments
    				map2.put("commentsList", getSportContent(newsURL));
    				list.add(map2);
    			}
    		} catch (IOException e) {
    			e.printStackTrace();
    		}
    		return list;
    	}
    
    	/**
    	 * Returns the text of the element at {@code index}, or an empty string when the
    	 * selection has fewer elements, so a short entry does not abort the whole crawl.
    	 */
    	private static String spanTextAt(Elements elements, int index) {
    		return index < elements.size() ? elements.get(index).text() : "";
    	}
    	/**
    	 * Downloads one article page and extracts its header, body and comments.
    	 * <p>
    	 * The id of the {@code <h6>} inside {@code #author_id} ({@code pubtime},
    	 * {@code pubtime1}, {@code pubtime3} or {@code pubtime4}) selects which selectors
    	 * are used for {@code title}, {@code author}, {@code article_info} and {@code tags}.
    	 * The body is stored under {@code detail} as text and image parts, each terminated
    	 * by {@code "@/"}. Comments are fetched separately; a failure there leaves
    	 * {@code commentsList} empty and does not discard the article.
    	 *
    	 * @param newsURL absolute article URL; the path segment at index 9 of
    	 *                {@code newsURL.split("/")} is used as the article id for the comment request
    	 * @return a list with a single article map, or an empty list when the page
    	 *         cannot be loaded or parsed
    	 */
    	public static List<Map<String,Object>> getSportContent(String newsURL){
    		List<Map<String, Object>> list = new ArrayList<Map<String,Object>>();
    		// The current year is prefixed to the date text read from the page
    		// (presumably the page shows the date without a year - TODO confirm).
    		int year = Calendar.getInstance().get(Calendar.YEAR);
    		try {
    			Map<String, Object> map = new HashMap<String, Object>();
    			Document doc = Jsoup.connect(newsURL).timeout(3000).get();
    			String pubtime = doc.select("#author_id h6").attr("id");
    			StringBuilder detail = new StringBuilder();
    			String authorSelector = "#author_id #authorMass .m-detail-source-cnt .m-detail-source-cnt-inner span";
    			if("pubtime3".equals(pubtime)){
    				map.put("title", doc.select(".d-title .h1-title").text());
    				map.put("author", doc.select(authorSelector).text());
    				map.put("article_info", year + "-" + doc.select("#author_id #pubtime3 .pull-left").text());
    				map.put("tags", doc.select("#author_id #pubtime3 .original").text());
    			}else if("pubtime1".equals(pubtime)){
    				map.put("title", doc.select(".d-title .h1-title").text());
    				map.put("author", doc.select(authorSelector).text());
    				map.put("article_info", year + "-" + doc.select("#author_id #pubtime").text());
    				map.put("tags", "");
    			}else if("pubtime".equals(pubtime)){
    				// spans are read positionally: author, then time
    				Elements spans = doc.select("#author_id #pubtime span");
    				map.put("title", doc.select(".d-title .h1-title").text());
    				map.put("article_info", year + "-" + spans.get(1).text());
    				map.put("author", spans.get(0).text());
    				map.put("tags", "");
    			}else if("pubtime4".equals(pubtime)){
    				// spans are read positionally: author, time, tag
    				Elements spans = doc.select("#author_id #pubtime4 span");
    				map.put("title", doc.select(".d-title .h1-title").text());
    				map.put("article_info", year + "-" + spans.get(1).text());
    				map.put("author", spans.get(0).text());
    				String tags = spans.get(2).text();
    				String tag = doc.select(".m-detail .m-detail-hd-ft .m-detail-type span").text();
    				if(!tag.isEmpty()){
    					tags = tags + ";" + tag;
    				}
    				map.put("tags", tags);
    				// the title image goes first in the body
    				String titleImg = doc.select(".m-detail .m-detail-hd img").attr("src");
    				if(!titleImg.isEmpty()){
    					detail.append(titleImg).append("@/");
    				}
    			}
    
    			// Body: plain paragraphs contribute their text; paragraphs with an image
    			// contribute their <strong> text when there is one, otherwise the image URL.
    			// (The earlier "||" condition was always true, so the URL was never recorded.)
    			for(Element p : doc.select(".m-detail-bd p")){
    				String imgSrc = p.select("img").attr("src");
    				String strongText = p.select("strong").text();
    				if(imgSrc.isEmpty()){
    					detail.append(p.text());
    				}else if(!strongText.isEmpty()){
    					detail.append(strongText);
    				}else{
    					detail.append(imgSrc);
    				}
    				detail.append("@/");
    			}
    			map.put("detail", detail.toString());
    
    			attachComments(map, newsURL);
    			list.add(map);
    		} catch (Exception e) {
    			// covers network errors as well as pages whose header has fewer spans than expected
    			e.printStackTrace();
    		}
    		return list;
    	}
    
    	/**
    	 * Requests the comments of an article and stores them on the article map.
    	 * <p>
    	 * {@code commentsList} is always set (possibly empty); {@code commentNumber} is set
    	 * only when the service reports a positive {@code count}. Every failure (short URL,
    	 * 404, malformed JSONP, unexpected JSON) is logged and leaves the list empty.
    	 */
    	@SuppressWarnings("unchecked")
    	private static void attachComments(Map<String,Object> article, String newsURL) {
    		List<Map<String,Object>> commentList = new ArrayList<Map<String,Object>>();
    		article.put("commentsList", commentList);
    		try {
    			String[] segments = newsURL.split("/");
    			if (segments.length <= 9) {
    				return;
    			}
    			// id of the current article
    			String aid = segments[9];
    			String getUrl = "http://app.ttplus.cn:1102/v2/commpent/news/www/"+aid+"/0";
    			String key = "callback=callback_cmt&_="+System.currentTimeMillis();
    
    			// JsonpUntil.encode returns null when the server answers 404.
    			Object response = JsonpUntil.encode(getUrl, key);
    			if (response == null) {
    				return;
    			}
    			// Strip the JSONP wrapper "callback_cmt( ... )" to get the JSON payload.
    			String jsonp = response.toString();
    			int open = jsonp.indexOf('(');
    			int close = jsonp.lastIndexOf(')');
    			if (open < 0 || close <= open) {
    				return;
    			}
    			String commentDe = jsonp.substring(open + 1, close);
    			System.out.println(commentDe);
    
    			Map<String,Object> parseData = (Map<String, Object>) JSON.parse(commentDe);
    			Object rawCount = parseData == null ? null : parseData.get("count");
    			Object count = rawCount == null ? null : JSON.parse(rawCount.toString());
    			if (!(count instanceof Number) || ((Number) count).intValue() <= 0) {
    				return;
    			}
    
    			SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    			List<Map<String,Object>> parsed = new ArrayList<Map<String,Object>>();
    			Object rawComments = parseData.get("comment");
    			List<Map<String,Object>> pData = rawComments == null ? null
    					: (List<Map<String, Object>>) JSON.parse(rawComments.toString());
    			if (pData != null) {
    				for(Map<String,Object> comm : pData){
    					Map<String, Object> commentMap = new HashMap<String, Object>();
    					// comment author
    					commentMap.put("comment_user", (String) comm.get("username"));
    					// comment time; the parser may return Integer or Long, so go through Number
    					Object time = comm.get("time");
    					String comment_time = time instanceof Number
    							? formatter.format(new Date(((Number) time).longValue()))
    							: "";
    					commentMap.put("comment_time", comment_time);
    					// comment text
    					commentMap.put("comment_content", (String) comm.get("content"));
    					parsed.add(commentMap);
    				}
    			}
    			// publish only after the whole response was converted successfully
    			commentList.addAll(parsed);
    			article.put("commentNumber", commentList.size());
    		} catch (Exception e) {
    			e.printStackTrace();
    		}
    	}
    }
    

      

    package test01;
    
    import java.io.IOException;
    import java.text.SimpleDateFormat;
    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Locale;
    import java.util.Map;
    import java.util.Set;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    import com.alibaba.fastjson.JSON;
    import com.suning.web.util.JsonpUntil;
    import com.suning.web.util.StringUtil;
    
    public class OnFiresTest {
    	/**
    	 * Entry point: routes HTTP and HTTPS traffic through the proxy via system
    	 * properties, prints a marker and fetches a single article as a manual check.
    	 */
    	public static void main(String[] args) {
    		final String proxyHost = "10.19.110.55";
    		final String proxyPort = "8080";
    		System.setProperty("http.maxRedirects", "50");
    		System.setProperty("proxySet", "true");
    		System.setProperty("http.proxyHost", proxyHost);
    		System.setProperty("http.proxyPort", proxyPort);
    		System.setProperty("https.proxyHost", proxyHost);
    		System.setProperty("https.proxyPort", proxyPort);
    		System.out.println("onfire");
    		// crawl of the OnFire basketball app list (disabled)
    		//Set aids = new HashSet();
    		//getOnFireList(1,aids);
    		getContent("http://www.bbonfire.com/news/detail?p=pc&aid=56374");
    		/* Disabled: scheduled crawling, channel 1 every 30 minutes, channels 2 and 3 once a day.
    		Runnable runnable1 = new Runnable() {
    			Set aids = new HashSet();
    			public void run() {
    				getOnFireList(1,aids);
    				//System.out.println(aids);
    			}
    		};
    		Runnable runnable2 = new Runnable() {
    			Set aids = new HashSet();
    			public void run() {
    				getOnFireList(2,aids);
    				getOnFireList(3,aids);
    			}
    		};
    		ScheduledExecutorService service = Executors.newSingleThreadScheduledExecutor();
    		// 2nd argument: delay before the first run; 3rd argument: interval between runs
    		service.scheduleAtFixedRate(runnable1, 0, 1800, TimeUnit.SECONDS);
    		service.scheduleAtFixedRate(runnable2, 0, 86400, TimeUnit.SECONDS);*/
    	}
    	/**
    	 * Crawls one channel of the OnFire basketball news list.
    	 * <p>
    	 * Channel {@code 1} is "recommended" (meant to be crawled every 30 minutes),
    	 * {@code 2} is "column" and any other value is treated as "selected translations"
    	 * (both meant to be crawled every 24 hours).
    	 *
    	 * @param i    channel number sent as the {@code c} query parameter
    	 * @param aids ids of articles already seen; ids found here are skipped and newly
    	 *             seen ids are added to it
    	 * @return one map per new article whose detail page yielded content; also printed
    	 */
    	@SuppressWarnings({"rawtypes", "unchecked"})
    	public static List<Map<String,Object>> getOnFireList(int i,Set aids){
    		final String url = "http://www.bbonfire.com";
    		List<Map<String, Object>> list = new ArrayList<Map<String,Object>>();
    		try {
    			String listUrl = url+"/news/index?c="+i+"&p=pc";
    			Document doc = Jsoup.connect(listUrl).timeout(3000).get();
    			// channel label depends only on the requested channel number
    			String channel = (i == 1) ? "推荐" : (i == 2) ? "专栏" : "精译";
    
    			for (Element item : doc.select(".news-list .news-item")) {
    				Elements titleLink = item.select(".news-title a");
    				String newsURL = titleLink.attr("href");
    				// id of the article in the source site, taken from the digits of the link
    				String aid = StringUtil.getNumbers(item.select(".news-title a").attr("href"));
    				// add() reports false for an id that is already known: skip those
    				if (!aids.add(aid)) {
    					continue;
    				}
    				// detail page; an empty result means it is not a text/image article
    				List<Map<String,Object>> commentsList = getContent(url+newsURL);
    				if (commentsList.size() <= 0) {
    					continue;
    				}
    
    				Map<String, Object> map = new HashMap<String, Object>();
    				map.put("title", titleLink.text());
    				// source site
    				map.put("fromStation", "OnFire");
    				map.put("channel", channel);
    				map.put("author", "");
    				map.put("time_info", item.select(".news-info .time-info").text());
    				map.put("newsURL", url+newsURL);
    				map.put("aid", aid);
    				// title image
    				map.put("imgUrl", item.select(".news-thumb a img").attr("src"));
    				// comment count with the label and blanks removed
    				String rawCount = item.select(".news-rel .news-comment").text();
    				map.put("commentsNumber", rawCount.replace("评论", "").replace(" ", ""));
    				// keyword, used to store channels separately
    				map.put("keyword", "");
    				map.put("content", commentsList);
    				list.add(map);
    			}
    		} catch (IOException e) {
    			e.printStackTrace();
    		}
    		System.out.println(list);
    		return list;
    	}
    	/**
    	 * Downloads one article page and extracts its header, body, tags and comments.
    	 * <p>
    	 * Pages whose {@code <embed>} element has {@code wmode="transparent"} are not treated
    	 * as text/image articles and yield an empty result. The body is stored under
    	 * {@code detail} as text and image parts, each terminated by {@code "@/"}.
    	 * {@code commentsList} and {@code commentNumber} are set only when at least one
    	 * comment was retrieved; a failing comment request no longer discards the article.
    	 *
    	 * @param contentUrl absolute URL of the article page
    	 * @return a list with a single article map, or an empty list when the page is not a
    	 *         text/image article or cannot be loaded or parsed; the list is also printed
    	 */
    	private static List<Map<String, Object>> getContent(String contentUrl) {
    		List<Map<String, Object>> list = new ArrayList<Map<String,Object>>();
    		SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    		
    		try {
    			Document doc = Jsoup
    					.connect(contentUrl)
    					.timeout(3000)
    					.get();
    			// only text/image articles are processed
    			if(!"transparent".equals(doc.select("embed").attr("wmode"))){
    				Map<String, Object> map = new HashMap<String, Object>();
    				map.put("title", doc.select(".article h1").text());
    				// publication time, converted from the page format to the common format
    				SimpleDateFormat form1 = new SimpleDateFormat("yyyy年MM月dd日 HH:mm");
    				String article_info = formatter.format(form1.parse(doc.select(".article-info .time").text()));
    				map.put("article_info",article_info);
    				map.put("author", doc.select(".article-info .author").text());
    
    				// body: image URL for paragraphs with a lazy-loaded image, text otherwise
    				StringBuilder detail = new StringBuilder();
    				for(Element p : doc.select(".article-content p")){
    					String data_src = p.select("img").attr("data-src");
    					detail.append(data_src.isEmpty() ? p.text() : data_src).append("@/");
    				}
    				map.put("detail",detail.toString());
    
    				// tags joined with ';'
    				StringBuilder tags = new StringBuilder();
    				for(Element span : doc.select(".article-tag span")){
    					tags.append(span.text()).append(";");
    				}
    				map.put("tags",tags.toString());
    				
    				// comments, looked up by the article id embedded in the page
    				String aid = doc.select("#commentHTML").attr("data-articleid");
    				List<Map<String,Object>> commentList = loadComments(aid, formatter);
    				if(commentList.size() > 0){
    					map.put("commentsList", commentList);
    					map.put("commentNumber", commentList.size());
    				}
    				
    				list.add(map);
    			}
    
    		} catch (Exception e) {
    			// covers network errors and an unparsable publication time
    			e.printStackTrace();
    		}
    		System.out.println(list);
    		return list;		
    	}
    
    	/**
    	 * Requests the comments of an article from the list API and converts them.
    	 *
    	 * @param aid       article id sent as {@code articleid}
    	 * @param formatter output format for the comment time
    	 * @return the converted comments; empty when the API answers 404, returns no
    	 *         {@code data} field or the response cannot be converted (the error is logged)
    	 */
    	@SuppressWarnings("unchecked")
    	private static List<Map<String, Object>> loadComments(String aid, SimpleDateFormat formatter) {
    		try {
    			String getUrl = "http://www.bbonfire.com/api/list";
    			String key = "p=comment&isjs=1&articleid="+aid+"&len=15&hotlen=5";
    			// JsonpUntil.encode returns null when the server answers 404.
    			Object response = JsonpUntil.encode(getUrl, key);
    			if (response == null) {
    				return new ArrayList<Map<String,Object>>();
    			}
    			Map<String,Object> parseData = (Map<String, Object>) JSON.parse(response.toString());
    			Object rawData = parseData == null ? null : parseData.get("data");
    			List<Map<String,Object>> pData = rawData == null ? null
    					: (List<Map<String, Object>>) JSON.parse(rawData.toString());
    			if (pData == null) {
    				return new ArrayList<Map<String,Object>>();
    			}
    
    			// input format of the "ctime" field; created once for all comments
    			SimpleDateFormat form = new SimpleDateFormat("EEE MMM dd HH:mm:ss Z yyyy", Locale.US);
    			List<Map<String,Object>> commentList = new ArrayList<Map<String,Object>>();
    			for(Map<String,Object> comm : pData){
    				Map<String, Object> commentMap = new HashMap<String, Object>();
    				// comment author
    				Map<String,Object> comment_user = (Map<String, Object>) comm.get("userInfo");
    				commentMap.put("comment_user", comment_user.get("screen_name").toString());
    				// comment time
    				String comment_time = formatter.format(form.parse((String)comm.get("ctime")));
    				commentMap.put("comment_time", comment_time);
    				// comment text
    				commentMap.put("comment_content", (String) comm.get("content"));
    				commentList.add(commentMap);
    			}
    			return commentList;
    		} catch (Exception e) {
    			e.printStackTrace();
    			return new ArrayList<Map<String,Object>>();
    		}
    	}
    }
    

      3.ajax请求

    package com.suning.web.util;
    
    import java.io.StringWriter;
    
    import org.apache.commons.codec.Charsets;
    import org.apache.commons.io.output.WriterOutputStream;
    import org.apache.http.HttpEntity;
    import org.apache.http.HttpHost;
    import org.apache.http.HttpResponse;
    import org.apache.http.client.HttpClient;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.conn.params.ConnRouteParams;
    import org.apache.http.impl.client.DefaultHttpClient;
    import org.apache.http.message.BasicHeader;
    
    /**
     * Helper that performs a proxied HTTP GET and returns the response body as text.
     */
    public class JsonpUntil {
    
    	/**
    	 * Sends a GET request through the proxy and returns the body decoded as UTF-8.
    	 *
    	 * @param url request URL
    	 * @param key query string to append; ignored when {@code null} or empty. It is
    	 *            joined with {@code '?'}, or with {@code '&'} when {@code url} already
    	 *            contains a query string
    	 * @return the response body (empty when the response has no entity), or
    	 *         {@code null} when the server answers with status 404
    	 * @throws Exception when the request or reading the body fails
    	 */
    	public static StringWriter encode(String url,String key) throws  Exception{
    		HttpClient httpClient = new DefaultHttpClient();
    		try {
    			HttpHost proxy = new HttpHost("10.19.110.55", 8080);
    			httpClient.getParams().setParameter(ConnRouteParams.DEFAULT_PROXY,proxy);
    			if(null != key && !"".equals(key)){
    				url = url + (url.contains("?") ? "&" : "?") + key;
    			}
    			HttpGet httpGet = new HttpGet(url);
    			// NOTE(review): hard-coded session cookie with auth tokens is committed to source -
    			// move it to configuration and rotate the credentials.
    			httpGet.addHeader(new BasicHeader("Cookie", "_snma=1%7C149567342565754882%7C1495673425657%7C1495673446005%7C1495714227730%7C3%7C3; idsLoginUserIdLastTime=16030136; authId=si9343022161FCD46A3745D6F3A1BCB180; secureToken=5E769A7ADD32F1977AC2104266C010F3"));
    			HttpResponse loginResponse = httpClient.execute(httpGet);
    			// Compare the numeric status code; the full status line also contains the
    			// protocol version and a reason phrase that servers may vary.
    			if(loginResponse.getStatusLine().getStatusCode() == 404)
    			{
    				System.out.println(url);
    				System.out.println("此条信息异常!");
    				return null;
    			}
    			StringWriter sw = new StringWriter();
    			HttpEntity loginEntity = loginResponse.getEntity();
    			if(loginEntity != null)
    			{
    				try (WriterOutputStream out = new WriterOutputStream(sw, Charsets.UTF_8))
    				{
    					loginEntity.writeTo(out);
    				}
    			}
    			return sw;
    		} finally {
    			// Release the connection held by this one-shot client on every path.
    			httpClient.getConnectionManager().shutdown();
    		}
    	}
    }
    

      

  • 相关阅读:
    xml 总结(一)数据岛,命名空间
    activiti designer 安装到 myeclipse
    activiti5.15 学习笔记
    goole网址IP
    form 中Enctype=multipart/form-data 的作用
    上传文件form表单enctype="multipart/form-data"传值解决办法(代原代码)
    淘宝初始化样式
    js闭包使用之处
    CSS Sprites
    iframe用的场景
  • 原文地址:https://www.cnblogs.com/sho560/p/7267085.html
Copyright © 2011-2022 走看看