1.pom.xml文件
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<!--
  Maven build descriptor for the test01 module (packaged as a jar).
  Declares logging, WebCollector/selenium, JDBC (druid + MySQL), Spring, quartz
  and JSON libraries.
  NOTE(review): the accompanying Java classes import org.jsoup.* and org.apache.http.*,
  but neither jsoup nor httpclient is declared below; presumably they arrive
  transitively (e.g. via WebCollector / selenium) - confirm and declare them explicitly.
-->
<modelVersion>4.0.0</modelVersion>
<groupId>test01</groupId>
<artifactId>test01</artifactId>
<version>1.0</version>
<packaging>jar</packaging>
<!-- Shared version numbers referenced below through ${...} placeholders -->
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<target.version>1.0</target.version>
<spring.version>4.2.3.RELEASE</spring.version>
<quartz.version>1.8.6</quartz.version>
</properties>
<dependencies>
<!-- NOTE(review): no <scope> is given, so junit is resolved with Maven's default (compile) scope -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
</dependency>
<!-- Logging: log4j 1.x with the SLF4J binding for it -->
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.5</version>
</dependency>
<!-- WebCollector dependency -->
<dependency>
<groupId>cn.edu.hfut.dmic.webcollector</groupId>
<artifactId>WebCollector</artifactId>
<version>2.09</version>
</dependency>
<!-- selenium -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>2.44.0</version>
</dependency>
<!-- phantomjsdriver (third-party driver support for selenium webdriver) -->
<dependency>
<groupId>com.github.detro</groupId>
<artifactId>phantomjsdriver</artifactId>
<version>1.2.0</version>
</dependency>
<!-- JDBC connection pool and MySQL driver -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>druid</artifactId>
<version>1.0.31</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>6.0.6</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-context</artifactId>
<version>${spring.version}</version>
<exclusions>
<!-- Exclude Commons Logging in favor of SLF4j -->
<exclusion>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- jsonpath -->
<dependency>
<groupId>net.minidev</groupId>
<artifactId>json-smart</artifactId>
<version>2.2.1</version>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
<version>2.2.0</version>
</dependency>
<dependency><!-- this artifact does not exist in 3.0.7 -->
<groupId>org.springframework</groupId>
<artifactId>spring-context-support</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-webmvc</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-orm</artifactId>
<version>${spring.version}</version>
<type>jar</type>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-test</artifactId>
<version>${spring.version}</version>
<type>jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.quartz-scheduler</groupId>
<artifactId>quartz</artifactId>
<version>${quartz.version}</version>
</dependency>
<!-- JSON libraries: json-lib and fastjson -->
<dependency>
<groupId>net.sf.json-lib</groupId>
<artifactId>json-lib</artifactId>
<version>2.4</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.16.sec01</version>
</dependency>
</dependencies>
<build>
<!-- Name of the produced artifact (without extension) -->
<finalName>test01</finalName>
</build>
</project>
2.测试文件
package test01;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Sample scraper for the zhibo8 NBA rolling-news list page.
 */
public class test {

    /**
     * Routes HTTP/HTTPS traffic through the fixed proxy via JVM system properties,
     * runs one scrape and prints a marker when it is done.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.setProperty("http.maxRedirects", "50");
        System.getProperties().setProperty("proxySet", "true");
        System.getProperties().setProperty("http.proxyHost", "10.19.110.55");
        System.getProperties().setProperty("http.proxyPort", "8080");
        System.getProperties().setProperty("https.proxyHost", "10.19.110.55");
        System.getProperties().setProperty("https.proxyPort", "8080");
        getCountry();
        System.out.println(111);
    }

    /**
     * Downloads the news list page and converts every list item into a map.
     *
     * <p>Each map carries the keys {@code fromStation}, {@code channel}, {@code colImg},
     * {@code title}, {@code time}, {@code ReferenceSource} and {@code newsURL}.
     * Author, comment count, comment list, body text and detail images are not
     * read from the list page.
     *
     * @return one map per list item; empty when the page cannot be fetched or has no
     *         {@code #boxlist} element
     */
    public static List<Map<String, Object>> getCountry() {
        List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();
        try {
            Document doc = Jsoup
                    .connect("https://news.zhibo8.cc/nba/more.htm")
                    .timeout(3000)
                    .get();
            Element box = doc.getElementById("boxlist");
            // getElementById returns null when the id is absent; treat that as "no news".
            if (box != null) {
                Elements items = box.select("div.dataList ul li");
                for (Element item : items) {
                    Map<String, Object> map = new HashMap<String, Object>();
                    // Source web site
                    map.put("fromStation", "直播吧");
                    // Crawled channel (previously stored under "fromStation", which
                    // silently overwrote the site name above)
                    map.put("channel", "NBA新闻滚动");
                    // List thumbnail: none on this page
                    map.put("colImg", "无");
                    // Title
                    map.put("title", item.select(".articleTitle a").html());
                    // Publication time
                    map.put("time", item.select(".postTime").html());
                    // Reference source
                    map.put("ReferenceSource", item.select(".source").html());
                    // Article URL
                    map.put("newsURL", item.select(".articleTitle a").attr("href"));
                    list.add(map);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println(list);
        return list;
    }
}
package test01;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.alibaba.fastjson.JSON;
import com.jayway.jsonpath.Configuration;
import com.jayway.jsonpath.JsonPath;
import com.suning.web.service.NewerService;
import com.suning.web.util.JDBCUtil;
import com.suning.web.util.JsonpUntil;
public class SportsTest {
public static JDBCUtil jdbcutil;
public static NewerService newerService = new NewerService();
/**
 * Entry point: points the JVM at the fixed HTTP/HTTPS proxy and scrapes a single
 * article page.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final String proxyHost = "10.19.110.55";
    final String proxyPort = "8080";

    System.setProperty("http.maxRedirects", "50");
    System.setProperty("proxySet", "true");
    System.setProperty("http.proxyHost", proxyHost);
    System.setProperty("http.proxyPort", proxyPort);
    System.setProperty("https.proxyHost", proxyHost);
    System.setProperty("https.proxyPort", proxyPort);

    /* Disabled: crawl every channel page once a day.
    Runnable runnable1 = new Runnable() {
        public void run() {
            String[] keyword = {"day.html","interfb.html","innerfb.html","nba.html","cba.html","sports.html"};
            for(String key : keyword){
                getSportsList(key);
            }
        }
    };
    ScheduledExecutorService service = Executors
            .newSingleThreadScheduledExecutor();
    // 2nd argument: delay before the first run; 3rd argument: interval between runs
    service.scheduleAtFixedRate(runnable1, 0, 86400, TimeUnit.SECONDS);*/

    // Other disabled entry points:
    //getSportsList("day.html");
    // home page detail
    //getMainContent("http://resource.ttplus.cn/publish/app/data/2017/07/20/67522/share1.html");
    //getRealTime();

    // News detail
    getSportContent("http://www.ttplus.cn/publish/app/data/2017/07/20/67559/share1.html");
}
/**
 * Fetches the "24H" feed of ttplus.cn and converts every entry into a map.
 *
 * <p>The response is expected to be a JSON object with a {@code type} field and a
 * {@code content} array. Only entries whose {@code newstime} is greater than zero are
 * converted. Earlier revisions walked the feed twice (once through JsonPath, once
 * through fastjson) and appended both results, so every entry was returned twice;
 * the feed is now processed exactly once.
 *
 * @return one map per feed entry; empty when the request fails, the endpoint returns
 *         no body, or {@code type} is not {@code "success"}
 */
@SuppressWarnings("unchecked")
private static List<Map<String, Object>> getRealTime() {
    List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();
    SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    String getUrl = "http://www.ttplus.cn/24h?lastid=";
    String key = "";
    try {
        // JsonpUntil.encode returns null when the server answers 404.
        Object response = JsonpUntil.encode(getUrl, key);
        if (response != null) {
            String commentDe = response.toString();
            System.out.println(commentDe);
            // Status of the returned data
            String type = JsonPath.parse(commentDe).read("$.type");
            System.out.println(type);
            if ("success".equals(type)) {
                List<Map<String, Object>> entries = JsonPath
                        .using(Configuration.defaultConfiguration())
                        .parse(commentDe)
                        .read("$.content[?(@.newstime > 0)]", List.class);
                for (Map<String, Object> entry : entries) {
                    list.add(toRealTimeItem(entry, formatter));
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    System.out.println(list);
    return list;
}

/**
 * Converts one entry of the 24H feed into the list-item map.
 *
 * @param entry     decoded feed entry; {@code newstime} must be numeric
 * @param formatter formatter used for the {@code time} / {@code article_info} values
 * @return the list item, whose {@code commentsList} holds a single article map
 */
@SuppressWarnings("unchecked")
private static Map<String, Object> toRealTimeItem(Map<String, Object> entry, SimpleDateFormat formatter) {
    String title = (String) entry.get("title");
    String author = (String) entry.get("authorName");
    // JSON numbers may be decoded as Integer or Long depending on magnitude,
    // so go through Number instead of casting to Long directly.
    long newsTime = ((Number) entry.get("newstime")).longValue();
    String time = formatter.format(new Date(newsTime));

    Map<String, Object> item = new HashMap<String, Object>();
    item.put("title", title);
    // Source web site
    item.put("fromStation", "体坛+");
    // Crawled channel
    item.put("channel", "24H");
    item.put("author", author);
    item.put("time", time);
    // The feed provides no article URL, thumbnail or comment count
    item.put("newsURL", "");
    item.put("imgUrl", "");
    item.put("commentsNumber", "");
    // Keyword, used to store channels separately
    item.put("keyword", "");

    // Body: image URLs followed by the text, each terminated by "@/"
    StringBuilder detail = new StringBuilder();
    Object img = entry.get("img");
    if (img != null) {
        List<Map<String, Object>> images = (List<Map<String, Object>>) JSON.parse(img.toString());
        if (images != null) {
            for (Map<String, Object> image : images) {
                detail.append((String) image.get("imgurl")).append("@/");
            }
        }
    }
    detail.append((String) entry.get("content")).append("@/");

    Map<String, Object> article = new HashMap<String, Object>();
    article.put("title", title);
    article.put("author", author);
    article.put("article_info", time);
    article.put("tags", "");
    article.put("detail", detail.toString());
    // The feed carries no comments
    article.put("commentsList", new ArrayList<Map<String, Object>>());

    List<Map<String, Object>> commentsList = new ArrayList<Map<String, Object>>();
    commentsList.add(article);
    item.put("commentsList", commentsList);
    return item;
}
/**
 * Scrapes one channel page of ttplus.cn: the carousel slides first, then the news list.
 *
 * <p>Every returned entry is a fresh map. Earlier revisions reused a single map per
 * section, so all entries of a section ended up being the same object holding the
 * values of the last item.
 *
 * <p>Carousel slides are kept only when their link points below
 * {@code http://resource.ttplus.cn/publish/app/data/}; list items linking to
 * {@code video.html} are skipped.
 *
 * @param val page name appended to {@code http://www.ttplus.cn/}, e.g. {@code "day.html"};
 *            also stored as the {@code keyword} of news-list entries
 * @return the scraped entries; empty when the page cannot be fetched
 */
public static List<Map<String,Object>> getSportsList(String val){
    List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();
    String url = "http://www.ttplus.cn/";
    try {
        Document doc = Jsoup.connect(url + val).timeout(3000).get();

        // Carousel slides
        for (Element slide : doc.select("#swiper-wrapper .swiper-slide")) {
            String newsURL = slide.select("a").attr("href");
            if (!newsURL.contains("http://resource.ttplus.cn/publish/app/data/")) {
                continue;
            }
            Map<String, Object> item = new HashMap<String, Object>();
            item.put("title", slide.select("a p").text());
            // Source web site
            item.put("fromStation", "体坛+");
            // Crawled channel
            item.put("channel", "首页滚动");
            // The carousel shows neither author, time nor comment count
            item.put("author", "");
            item.put("time_info", "");
            // List thumbnail
            item.put("imgUrl", slide.select("a img").attr("src"));
            item.put("commentsNumber", "");
            item.put("keyword", "main");
            item.put("newsURL", newsURL);
            // Article detail behind the slide
            item.put("detail", getSportContent(newsURL));
            list.add(item);
        }

        // News list
        for (Element li : doc.select("#newsListBox #newsList li")) {
            String newsURL = li.select("a").attr("href");
            if (newsURL.contains("video.html")) {
                continue;
            }
            Elements meta = li.select("a .newsBox-bd p span");
            Map<String, Object> item = new HashMap<String, Object>();
            item.put("title", li.select("a .newsBox-bd h3").text());
            item.put("fromStation", "体坛+");
            item.put("channel", "首页滚动");
            // The spans hold author, time and comment count in that order
            item.put("author", sportsSpanText(meta, 0));
            item.put("time", sportsSpanText(meta, 1));
            item.put("newsURL", newsURL);
            item.put("imgUrl", li.select("a .newsBox-hd img").attr("src"));
            item.put("commentsNumber", sportsSpanText(meta, 2));
            // Keyword, used to store channels separately
            item.put("keyword", val);
            item.put("commentsList", getSportContent(newsURL));
            list.add(item);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return list;
}

/**
 * Returns the text of the span at {@code index}, or an empty string when the list item
 * has fewer spans than expected.
 */
private static String sportsSpanText(Elements spans, int index) {
    return index < spans.size() ? spans.get(index).text() : "";
}
/**
 * Downloads one ttplus.cn article page and extracts its metadata, body and comments.
 *
 * <p>The id of the {@code #author_id h6} element ({@code pubtime}, {@code pubtime1},
 * {@code pubtime3} or {@code pubtime4}) selects which page layout is parsed; for any
 * other id no title/author/time/tags are stored. Body paragraphs are joined into
 * {@code detail}, each part terminated by {@code "@/"}: a paragraph without an image
 * contributes its text, a paragraph with an image contributes its {@code <strong>}
 * caption when one exists and the image URL otherwise.
 *
 * @param newsURL article URL; its 10th {@code '/'}-separated segment is used as the
 *                article id for the comment request
 * @return a list holding the single article map, or an empty list when fetching or
 *         parsing fails (the exception is printed)
 */
public static List<Map<String,Object>> getSportContent(String newsURL){
    List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();
    SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    int year = Calendar.getInstance().get(Calendar.YEAR);
    try {
        Map<String, Object> map = new HashMap<String, Object>();
        Document doc = Jsoup.connect(newsURL).timeout(3000).get();
        String pubtime = doc.select("#author_id h6").attr("id");
        String authorBox = "#author_id #authorMass .m-detail-source-cnt .m-detail-source-cnt-inner span";
        StringBuilder detail = new StringBuilder();

        if ("pubtime3".equals(pubtime)) {
            map.put("title", doc.select(".d-title .h1-title").text());
            map.put("author", doc.select(authorBox).text());
            map.put("article_info", year + "-" + doc.select("#author_id #pubtime3 .pull-left").text());
            map.put("tags", doc.select("#author_id #pubtime3 .original").text());
        } else if ("pubtime1".equals(pubtime)) {
            map.put("title", doc.select(".d-title .h1-title").text());
            map.put("author", doc.select(authorBox).text());
            map.put("article_info", year + "-" + doc.select("#author_id #pubtime").text());
            map.put("tags", "");
        } else if ("pubtime".equals(pubtime)) {
            map.put("title", doc.select(".d-title .h1-title").text());
            // Spans hold author and time in that order
            Elements spans = doc.select("#author_id #pubtime span");
            map.put("article_info", year + "-" + spans.get(1).text());
            map.put("author", spans.get(0).text());
            map.put("tags", "");
        } else if ("pubtime4".equals(pubtime)) {
            map.put("title", doc.select(".d-title .h1-title").text());
            // Spans hold author, time and tag in that order
            Elements spans = doc.select("#author_id #pubtime4 span");
            map.put("article_info", year + "-" + spans.get(1).text());
            map.put("author", spans.get(0).text());
            String tags = spans.get(2).text();
            String typeTag = doc.select(".m-detail .m-detail-hd-ft .m-detail-type span").text();
            if (!typeTag.isEmpty()) {
                tags = tags + ";" + typeTag;
            }
            map.put("tags", tags);
            // Title image goes first into the body
            String titleImg = doc.select(".m-detail .m-detail-hd img").attr("src");
            if (!titleImg.isEmpty()) {
                detail.append(titleImg).append("@/");
            }
        }

        // Body paragraphs (text and images)
        for (Element p : doc.select(".m-detail-bd p")) {
            String imgSrc = p.select("img").attr("src");
            if (imgSrc.isEmpty()) {
                detail.append(p.text()).append("@/");
                continue;
            }
            // The previous condition joined its two checks with "||" and was therefore
            // always true, which made the image-URL branch unreachable.
            String caption = p.select("strong").text();
            detail.append(caption.isEmpty() ? imgSrc : caption).append("@/");
        }
        map.put("detail", detail.toString());

        // Comments; the article id is the 10th segment of the URL
        String aid = newsURL.split("/")[9];
        attachSportComments(map, aid, formatter);
        list.add(map);
    } catch (Exception e) {
        e.printStackTrace();
    }
    return list;
}

/**
 * Requests the comments of article {@code aid} and stores them in {@code map} under
 * {@code commentsList}; {@code commentNumber} is added only when the response reports
 * a positive {@code count}. A missing response (encode() returned null) is treated as
 * "no comments" instead of failing the whole article.
 *
 * @param map       article map to extend
 * @param aid       article id used in the comment URL
 * @param formatter formatter for the comment timestamps
 * @throws Exception when the request or the JSON decoding fails
 */
@SuppressWarnings("unchecked")
private static void attachSportComments(Map<String, Object> map, String aid, SimpleDateFormat formatter)
        throws Exception {
    String getUrl = "http://app.ttplus.cn:1102/v2/commpent/news/www/" + aid + "/0";
    String key = "callback=callback_cmt&_=" + System.currentTimeMillis();
    List<Map<String, Object>> commentList = new ArrayList<Map<String, Object>>();

    Object response = JsonpUntil.encode(getUrl, key);
    if (response != null) {
        // Strip the JSONP wrapper "callback_cmt( ... );" by locating the parentheses
        // rather than relying on fixed offsets.
        String commentDe = response.toString();
        int open = commentDe.indexOf('(');
        int close = commentDe.lastIndexOf(')');
        if (open >= 0 && close > open) {
            commentDe = commentDe.substring(open + 1, close);
        }
        System.out.println(commentDe);

        Map<String, Object> parseData = (Map<String, Object>) JSON.parse(commentDe);
        Object rawCount = parseData == null ? null : parseData.get("count");
        Object count = rawCount == null ? null : JSON.parse(rawCount.toString());
        if (count instanceof Number && ((Number) count).intValue() > 0) {
            Object rawComments = parseData.get("comment");
            List<Map<String, Object>> pData = rawComments == null
                    ? null
                    : (List<Map<String, Object>>) JSON.parse(rawComments.toString());
            if (pData != null) {
                for (Map<String, Object> comm : pData) {
                    Map<String, Object> commentMap = new HashMap<String, Object>();
                    // Commenter
                    commentMap.put("comment_user", (String) comm.get("username"));
                    // Comment time; JSON numbers may be Integer or Long
                    long time = ((Number) comm.get("time")).longValue();
                    commentMap.put("comment_time", formatter.format(new Date(time)));
                    // Comment text
                    commentMap.put("comment_content", (String) comm.get("content"));
                    commentList.add(commentMap);
                }
            }
            map.put("commentNumber", commentList.size());
        }
    }
    map.put("commentsList", commentList);
}
}
package test01;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.alibaba.fastjson.JSON;
import com.suning.web.util.JsonpUntil;
import com.suning.web.util.StringUtil;
public class OnFiresTest {
/**
 * Entry point: points the JVM at the fixed HTTP/HTTPS proxy, prints a banner and
 * scrapes a single OnFire article.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final String proxyHost = "10.19.110.55";
    final String proxyPort = "8080";

    System.setProperty("http.maxRedirects", "50");
    System.setProperty("proxySet", "true");
    System.setProperty("http.proxyHost", proxyHost);
    System.setProperty("http.proxyPort", proxyPort);
    System.setProperty("https.proxyHost", proxyHost);
    System.setProperty("https.proxyPort", proxyPort);
    System.out.println("onfire");

    // Disabled: one-off crawl of the OnFire basketball app list
    //Set aids = new HashSet();
    //getOnFireList(1,aids);

    getContent("http://www.bbonfire.com/news/detail?p=pc&aid=56374");

    /* Disabled: scheduled crawling.
    Runnable runnable1 = new Runnable() {
        Set aids = new HashSet();
        public void run() {
            getOnFireList(1,aids);
            //System.out.println(aids);
        }
    };
    Runnable runnable2 = new Runnable() {
        Set aids = new HashSet();
        public void run() {
            getOnFireList(2,aids);
            getOnFireList(3,aids);
        }
    };
    ScheduledExecutorService service = Executors
            .newSingleThreadScheduledExecutor();
    // 2nd argument: delay before the first run; 3rd argument: interval between runs
    service.scheduleAtFixedRate(runnable1, 0, 1800, TimeUnit.SECONDS);
    service.scheduleAtFixedRate(runnable2, 0, 86400, TimeUnit.SECONDS);*/
}
/**
 * Scrapes one channel of the OnFire basketball site.
 *
 * <p>Channel {@code 1} is labelled "推荐", {@code 2} "专栏" and every other value "精译".
 * An item whose id is already in {@code aids} is skipped; otherwise its id is added to
 * {@code aids}. An item is returned only when {@code getContent} yields a non-empty
 * result for it.
 *
 * @param i    channel number placed in the list URL
 * @param aids ids of articles already handled; extended by this call
 * @return the newly scraped entries; the list is also printed to stdout
 */
public static List<Map<String,Object>> getOnFireList(int i,Set aids){
    final String url = "http://www.bbonfire.com";
    final List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();

    // Channel label derived from the channel number
    final String channel;
    switch (i) {
        case 1:
            channel = "推荐";
            break;
        case 2:
            channel = "专栏";
            break;
        default:
            channel = "精译";
            break;
    }

    try {
        Document page = Jsoup.connect(url + "/news/index?c=" + i + "&p=pc").timeout(3000).get();
        Elements items = page.select(".news-list .news-item");
        for (Element item : items) {
            Elements titleLink = item.select(".news-title a");
            String href = titleLink.attr("href");

            // Id of the article in the source site's database; skip ids seen before
            String aid = StringUtil.getNumbers(href);
            if (aids.contains(aid)) {
                continue;
            }
            aids.add(aid);

            Map<String, Object> entry = new HashMap<String, Object>();
            entry.put("title", titleLink.text());
            // Source web site
            entry.put("fromStation", "OnFire");
            entry.put("channel", channel);
            // The list page shows no author
            entry.put("author", "");
            entry.put("time_info", item.select(".news-info .time-info").text());
            entry.put("newsURL", url + href);
            entry.put("aid", aid);
            // Title image
            entry.put("imgUrl", item.select(".news-thumb a img").attr("src"));
            // Comment count with the "评论" label and spaces removed
            String commentsNumber = item.select(".news-rel .news-comment").text();
            commentsNumber = commentsNumber.replace("评论", "");
            commentsNumber = commentsNumber.replace(" ", "");
            entry.put("commentsNumber", commentsNumber);
            // Keyword, used to store channels separately
            entry.put("keyword", "");

            // Article detail; an empty result means the item is dropped
            List<Map<String, Object>> content = getContent(url + href);
            if (!content.isEmpty()) {
                entry.put("content", content);
                list.add(entry);
            }
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
    System.out.println(list);
    return list;
}
/**
 * Downloads one OnFire article page and extracts title, time, author, body, tags and
 * comments.
 *
 * <p>Pages whose {@code <embed>} element has {@code wmode="transparent"} are ignored
 * and produce an empty result. Body paragraphs are joined into {@code detail}, each
 * part terminated by {@code "@/"}: the image {@code data-src} when the paragraph has
 * one, its text otherwise. {@code commentsList} and {@code commentNumber} are stored
 * only when the comment endpoint returns at least one comment; a missing response or a
 * missing {@code data} field is treated as "no comments" instead of discarding the
 * article.
 *
 * @param contentUrl URL of the article page
 * @return a list holding the single article map, or an empty list when the page is
 *         not a text/image article or fetching/parsing fails (the exception is printed)
 */
@SuppressWarnings("unchecked")
private static List<Map<String, Object>> getContent(String contentUrl) {
    List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();
    SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    try {
        Document doc = Jsoup
                .connect(contentUrl)
                .timeout(3000)
                .get();
        // Only text/image articles are handled
        if (!"transparent".equals(doc.select("embed").attr("wmode"))) {
            Map<String, Object> map = new HashMap<String, Object>();
            // Title
            map.put("title", doc.select(".article h1").text());
            // Time, converted from the page format to the output format
            SimpleDateFormat pageFormat = new SimpleDateFormat("yyyy年MM月dd日 HH:mm");
            String pageTime = doc.select(".article-info .time").text();
            map.put("article_info", formatter.format(pageFormat.parse(pageTime)));
            // Author
            map.put("author", doc.select(".article-info .author").text());

            // Body paragraphs (text and images)
            StringBuilder detail = new StringBuilder();
            for (Element p : doc.select(".article-content p")) {
                String dataSrc = p.select("img").attr("data-src");
                detail.append(dataSrc.isEmpty() ? p.text() : dataSrc).append("@/");
            }
            map.put("detail", detail.toString());

            // Tags, each followed by ";"
            StringBuilder tags = new StringBuilder();
            for (Element span : doc.select(".article-tag span")) {
                tags.append(span.text()).append(";");
            }
            map.put("tags", tags.toString());

            // Comments of the current article
            String aid = doc.select("#commentHTML").attr("data-articleid");
            String getUrl = "http://www.bbonfire.com/api/list";
            String key = "p=comment&isjs=1&articleid=" + aid + "&len=15&hotlen=5";
            List<Map<String, Object>> pData = null;
            // JsonpUntil.encode returns null when the server answers 404
            Object response = JsonpUntil.encode(getUrl, key);
            if (response != null) {
                Map<String, Object> parseData = (Map<String, Object>) JSON.parse(response.toString());
                Object data = parseData == null ? null : parseData.get("data");
                if (data != null) {
                    pData = (List<Map<String, Object>>) JSON.parse(data.toString());
                }
            }
            if (pData != null && !pData.isEmpty()) {
                // Format of the "ctime" field; built once for all comments
                SimpleDateFormat ctimeFormat = new SimpleDateFormat("EEE MMM dd HH:mm:ss Z yyyy", Locale.US);
                List<Map<String, Object>> commentList = new ArrayList<Map<String, Object>>();
                for (Map<String, Object> comm : pData) {
                    Map<String, Object> commentMap = new HashMap<String, Object>();
                    // Commenter
                    Map<String, Object> userInfo = (Map<String, Object>) comm.get("userInfo");
                    commentMap.put("comment_user", userInfo.get("screen_name").toString());
                    // Comment time
                    String commentTime = formatter.format(ctimeFormat.parse((String) comm.get("ctime")));
                    commentMap.put("comment_time", commentTime);
                    // Comment text
                    commentMap.put("comment_content", (String) comm.get("content"));
                    commentList.add(commentMap);
                }
                map.put("commentsList", commentList);
                map.put("commentNumber", commentList.size());
            }
            list.add(map);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    System.out.println(list);
    return list;
}
}
3.ajax请求
package com.suning.web.util;
import java.io.StringWriter;
import org.apache.commons.codec.Charsets;
import org.apache.commons.io.output.WriterOutputStream;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnRouteParams;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicHeader;
/**
 * Small HTTP helper that performs a GET request through the fixed proxy and returns
 * the response body as text.
 */
public class JsonpUntil {

    /**
     * Issues a GET request for {@code url} (optionally extended by the query string
     * {@code key}) and returns the UTF-8 decoded response body.
     *
     * <p>The query string is appended with {@code '?'}, or with {@code '&'} when
     * {@code url} already contains a {@code '?'}. The underlying client is always shut
     * down before returning so that its connection is released.
     *
     * @param url target URL
     * @param key query string without leading separator; {@code null} or empty to send
     *            {@code url} unchanged
     * @return the response body, or {@code null} when the server answers 404
     * @throws Exception when the request or reading the body fails
     */
    public static StringWriter encode(String url,String key) throws Exception{
        HttpClient httpClient = new DefaultHttpClient();
        try {
            HttpHost proxy = new HttpHost("10.19.110.55", 8080);
            httpClient.getParams().setParameter(ConnRouteParams.DEFAULT_PROXY, proxy);
            if (key != null && !"".equals(key)) {
                String separator = url.indexOf('?') >= 0 ? "&" : "?";
                url = url + separator + key;
            }
            HttpGet httpGet = new HttpGet(url);
            // NOTE(review): this header embeds what look like session credentials
            // (authId / secureToken). They should be supplied through configuration
            // rather than source code; left unchanged here to preserve behavior.
            httpGet.addHeader(new BasicHeader("Cookie", "_snma=1%7C149567342565754882%7C1495673425657%7C1495673446005%7C1495714227730%7C3%7C3; idsLoginUserIdLastTime=16030136; authId=si9343022161FCD46A3745D6F3A1BCB180; secureToken=5E769A7ADD32F1977AC2104266C010F3"));
            HttpResponse loginResponse = httpClient.execute(httpGet);
            // Compare the numeric status code instead of the full status-line text,
            // which also varies with protocol version and reason phrase.
            if (loginResponse.getStatusLine().getStatusCode() == 404) {
                System.out.println(url);
                System.out.println("此条信息异常!");
                return null;
            }
            StringWriter sw = new StringWriter();
            HttpEntity loginEntity = loginResponse.getEntity();
            // A response without a body yields an empty result.
            if (loginEntity != null) {
                try (WriterOutputStream out = new WriterOutputStream(sw, Charsets.UTF_8)) {
                    loginEntity.writeTo(out);
                }
            }
            return sw;
        } finally {
            // Release the connection held by this one-shot client.
            httpClient.getConnectionManager().shutdown();
        }
    }
}