1、使用gradle建立工程:
工程配置如下(先是 settings.gradle,然后是根工程的 build.gradle):
// settings.gradle: names the root project and registers its only subproject.
rootProject.name = 'my-spider-demo'
include(':spider-demo')
import java.security.MessageDigest

// Root build script: shared configuration for every project plus the
// packaging tasks (source zip, setup zip, MD5 checksum file).

/**
 * Pins every requested dependency belonging to {@code group} to {@code version}.
 */
void forceVersion(details, group, version) {
    if (details.requested.group == group) {
        details.useVersion version
    }
}

/**
 * Pins one specific module ({@code group}:{@code name}) to {@code version}.
 */
void forceVersion(details, group, name, version) {
    if (details.requested.group == group && details.requested.name == name) {
        details.useVersion version
    }
}

allprojects { p ->
    group = 'com.my.spider'
    version = '1.0.0'

    apply plugin: 'java'
    apply plugin: 'maven'
    apply plugin: 'maven-publish'

    [compileJava, compileTestJava]*.options*.encoding = 'UTF-8'

    // The manifest is filled in at execution time so that p.ext.Timestamp,
    // which is assigned at the end of this closure, is already available.
    jar.doFirst {
        manifest {
            def manifestFile = "${projectDir}/META-INF/MANIFEST.MF"
            if (new File(manifestFile).exists()) {
                from (manifestFile)
            }
            attributes 'Implementation-Title': p.name
            if (p.version.endsWith('-SNAPSHOT')) {
                attributes 'Implementation-Version': p.version + '-' + p.ext.Timestamp
            } else {
                attributes 'Implementation-Version': p.version
            }
            attributes 'Implementation-BuildDateTime': new Date()
        }
    }

    javadoc {
        options {
            encoding 'UTF-8'
            charSet 'UTF-8'
            author false
            version true
            // javadoc's -link option expects the directory that contains
            // package-list, not the index.html page inside it.
            links 'http://docs.oracle.com/javase/8/docs/api/'
            memberLevel = org.gradle.external.javadoc.JavadocMemberLevel.PRIVATE
        }
    }

    if (p.name.endsWith('-api')) {
        task sourcesJar(type: Jar, dependsOn: classes) {
            classifier = 'sources'
            from sourceSets.main.allSource
        }
        task javadocJar(type: Jar, dependsOn: javadoc) {
            classifier = 'javadoc'
            from javadoc.destinationDir
        }
    }

    publishing {
        repositories {
            maven {
                credentials {
                    username "${repositoryUploadUsername}"
                    password "${repositoryUploadPassword}"
                }
                if (version.endsWith('-SNAPSHOT')) {
                    url "${repositoryUploadSnapshotUrl}"
                } else {
                    url "${repositoryUploadReleaseUrl}"
                }
            }
        }
        publications {
            mavenJava(MavenPublication) {
                from components.java
                // Only *-api projects publish sources and javadoc artifacts.
                if (p.name.endsWith('-api')) {
                    artifact sourcesJar { classifier "sources" }
                    artifact javadocJar { classifier "javadoc" }
                }
            }
        }
    }

    if (System.env.uploadArchives) {
        build.dependsOn publish
    }

    buildscript {
        repositories {
            maven {
                name 'Maven Repository'
                url "${repositoryMavenUrl}"
                credentials {
                    username "${repositoryUsername}"
                    password "${repositoryPassword}"
                }
            }
        }
        dependencies {
            classpath 'org.springframework.boot:spring-boot-gradle-plugin:1.4.0.RELEASE'
        }
    }

    afterEvaluate { Project project ->
        if (project.pluginManager.hasPlugin('java')) {
            configurations.all {
                resolutionStrategy.eachDependency { DependencyResolveDetails details ->
                    forceVersion details, 'org.springframework.boot', '1.4.1.RELEASE'
                    forceVersion details, 'org.slf4j', '1.7.21'
                    forceVersion details, 'org.springframework', '4.3.3.RELEASE'
                }
                exclude module: 'slf4j-log4j12'
                exclude module: 'log4j'
            }
            dependencies {
                testCompile 'junit:junit:4.12'
            }
        }
    }

    repositories {
        maven {
            name 'Maven Repository'
            url "${repositoryMavenUrl}"
            credentials {
                username "${repositoryUsername}"
                password "${repositoryPassword}"
            }
        }
        ivy {
            name 'Ivy Repository'
            url "${repositoryIvyUrl}"
            credentials {
                username "${repositoryUsername}"
                password "${repositoryPassword}"
            }
            layout "pattern", {
                artifact '[organisation]/[module]/[revision]/[type]s/[artifact]-[revision].[ext]'
                ivy '[organisation]/[module]/[revision]/[type]s/[artifact].[ext]'
                m2compatible = true
            }
        }
    }

    // Timestamp: year, month, day, hour, minute.
    p.ext.Timestamp = new Date().format('yyyyMMddHHmm')
    // Build number from the BUILD_NUMBER environment variable; 'x' when it is unset or empty.
    p.ext.BuildNumber = System.env.BUILD_NUMBER ?: 'x'
}

task zipSources(type: Zip) {
    description '压缩源代码'
    project.ext.zipSourcesFile = project.name + '-' + project.version + '-' + project.ext.Timestamp + '.' + project.ext.BuildNumber + '-sources.zip'
    archiveName = project.ext.zipSourcesFile
    includeEmptyDirs = false
    from project.projectDir
    exclude '**/.*'
    exclude 'build/*'
    // Leave out per-project build output and runtime directories.
    allprojects.each { p ->
        exclude '**/' + p.name + '/bin/*'
        exclude '**/' + p.name + '/build/*'
        exclude '**/' + p.name + '/data/*'
        exclude '**/' + p.name + '/work/*'
        exclude '**/' + p.name + '/logs/*'
    }
}

/**
 * Builds the copy spec that lays out one application inside the setup zip.
 *
 * @param prj     project whose jar, example configs and start scripts are packaged
 * @param dstname target directory inside the archive; defaults to the project name
 */
CopySpec appCopySpec(Project prj, dstname = null) {
    if (!dstname) {
        dstname = prj.name
    }
    return copySpec {
        // Fat jar
        from (prj.buildDir.toString() + '/libs/' + prj.name + '-' + project.version + '.jar') {
            into dstname
        }
        // Configs
        from (prj.projectDir.toString() + '/config/examples') {
            into dstname + '/config'
        }
        // Windows start script
        from (prj.projectDir.toString() + '/' + prj.name + '.bat') {
            into dstname
        }
        // Unix conf script
        from (prj.projectDir.toString() + '/' + prj.name + '.conf') {
            into dstname
            rename prj.name, prj.name + '-' + project.version
        }
    }
}

task zipSetup(type: Zip, dependsOn: subprojects.build) {
    description '制作安装包'
    project.ext.zipSetupFile = project.name + '-' + project.version + '-' + project.ext.Timestamp + '.' + project.ext.BuildNumber + '-setup.zip'
    archiveName = project.ext.zipSetupFile
    with appCopySpec(project(':spider-demo'))
}

/**
 * Returns the MD5 digest of {@code file} as a 32-character lowercase hex string.
 */
def generateMD5(final file) {
    MessageDigest digest = MessageDigest.getInstance("MD5")
    file.eachByte(8192) { byte[] buffer, int read ->
        digest.update(buffer, 0, read)
    }
    // BigInteger.toString(16) drops leading zeros, so pad back to the full 32 hex digits.
    return new BigInteger(1, digest.digest()).toString(16).padLeft(32, '0')
}

task md5(dependsOn: [zipSetup, zipSources]) {
    doLast {
        String md5_setup = generateMD5(file("${projectDir}/build/distributions/" + project.ext.zipSetupFile))
        String md5_sources = generateMD5(file("${projectDir}/build/distributions/" + project.ext.zipSourcesFile))
        println project.ext.zipSetupFile + '=' + md5_setup
        println project.ext.zipSourcesFile + '=' + md5_sources

        def newFile = new File("${projectDir}/build/distributions/" + project.name + '-' + project.version + '-' + project.ext.Timestamp + '.' + project.ext.BuildNumber + '-md5.txt')
        // withPrintWriter flushes and closes the writer even when a write fails.
        newFile.withPrintWriter { printWriter ->
            printWriter.println project.ext.zipSetupFile + '=' + md5_setup
            printWriter.println project.ext.zipSourcesFile + '=' + md5_sources
        }
    }
}

build.dependsOn subprojects.build, zipSetup, zipSources, md5
子工程相关依赖:
// Build script of the spider-demo subproject: an executable Spring Boot
// application whose distribution also ships the example configuration.
apply(plugin: 'spring-boot')
apply(plugin: 'application')

distributions {
    main {
        contents {
            // Ship the example configuration under config/ in the distribution.
            from("${projectDir}/config/examples") {
                into("config")
            }
        }
    }
}

distTar.enabled = false

springBoot {
    executable = true
    mainClass = 'com.my.spider.Application'
}

dependencies {
    compile('org.springframework.boot:spring-boot-starter-web:1.4.0.RELEASE')
    compile('dom4j:dom4j:1.6.1')
    compile('commons-httpclient:commons-httpclient:3.1')
    compileOnly('com.h2database:h2:1.4.191')
    compile('javax.cache:cache-api:1.0.0')
    compile('org.jboss.resteasy:resteasy-jaxrs:3.0.14.Final')
    compile('org.jboss.resteasy:resteasy-client:3.0.14.Final')
    // Axis
    compile('axis:axis:1.4')
    compile('org.jsoup:jsoup:1.10.1')
    compile('com.alibaba:fastjson:1.2.21')
}
2、代码编写:
入口:
package com.my.spider;

import java.io.IOException;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableAsync;
import org.springframework.scheduling.annotation.EnableScheduling;

import com.my.spider.utils.CommonProperties;

/**
 * Application entry point.
 *
 * <p>Before Spring starts, the properties files from the configuration directory
 * are copied into the JVM system properties, and the {@code application.version}
 * and {@code app.home} system properties are set.
 */
@SpringBootApplication
@EnableScheduling
@EnableAsync
public class Application {

    public static void main(String[] args) throws IOException {
        // Directory named by -Dspring.config.location; may be null, in which
        // case CommonProperties falls back to its default locations.
        String requestedLocation = System.getProperty("spring.config.location");
        String configDir = CommonProperties.loadProperties2System(requestedLocation);

        String appVersion = CommonProperties.getVersion(Application.class);
        System.setProperty("application.version", appVersion);
        // The application home is the parent of the configuration directory.
        System.setProperty("app.home", configDir + "/..");

        SpringApplication.run(Application.class, args);
    }
}
package com.my.spider.utils; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.util.Properties; import org.springframework.util.StringUtils; public final class CommonProperties { public static final String PPT_KEY_APP_HOME = "app.home"; public static final String DEFAULT_APP_HOME = "./"; public static final String getAppHome() { return System.getProperty(DEFAULT_APP_HOME, DEFAULT_APP_HOME); } public static String loadProperties2System(String location) throws IOException { String configLocation = location; File cnf; if (!StringUtils.hasLength(configLocation)) { configLocation = "./config"; cnf = new File(configLocation); if (!cnf.exists() || !cnf.isDirectory()) { configLocation = "../config"; cnf = new File(configLocation); } } else { cnf = new File(configLocation); } for (File file : cnf.listFiles()) { if (file.isFile() && file.getName().endsWith(".properties")) { Properties ppt = new Properties(); try (FileInputStream fi = new FileInputStream(file)) { ppt.load(fi); System.getProperties().putAll(ppt); } } } return configLocation; } public static String getVersion(Class<?> clazz) { Package pkg = clazz.getPackage(); String ver = (pkg != null ? pkg.getImplementationVersion() : "undefined"); return (ver == null ? "undefined" : ver); } }
配置类:
package com.my.spider.config;

import org.springframework.context.annotation.ComponentScan;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.annotation.EnableScheduling;

/**
 * Configuration class that enables scheduling and component-scans the
 * {@code com.my.spider.rs} and {@code com.my.spider.schedule} packages.
 */
@Configuration
@EnableScheduling
@ComponentScan(basePackages = {
        "com.my.spider.rs",
        "com.my.spider.schedule"
})
public class AppAutoConfiguration {
}
META-INF下spring.factories文件:
# Registers AppAutoConfiguration as a Spring Boot auto-configuration class.
# The trailing backslash continues the value on the next line; without it the
# key would have an empty value and the class name would become a separate key.
org.springframework.boot.autoconfigure.EnableAutoConfiguration=\
com.my.spider.config.AppAutoConfiguration
3、功能代码:
定时任务抽象类,提供三种定时任务的调用方法:
package com.my.spider.schedule; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.DisposableBean; import org.springframework.beans.factory.InitializingBean; import org.springframework.scheduling.annotation.Scheduled; import org.springframework.stereotype.Component; import com.fasterxml.jackson.databind.ObjectMapper; @Component public abstract class ParentSchedule implements InitializingBean,DisposableBean{ public static Logger logger = LoggerFactory.getLogger(ParentSchedule.class); public final static ObjectMapper objectMapper = new ObjectMapper(); @Scheduled( initialDelayString = "${agent.task.initialDelay:1000}", // fixedDelayString = "${agent.task.fixedDelay:10000}") public void dowork(){ execute(); } //定时任务一 public abstract void execute(); @Scheduled(cron = "${agent.task.cron:0 0 10,14,16 * * ?}") public void timeTask(){ executeTimeTask(); } //定时任务三 public abstract void executeTimeTask(); //每天12点出发 @Scheduled(cron = "0 0 12 * * ?") public void otherTask(){ executeOtherTask(); } //定时任务三 public abstract void executeOtherTask(); }
package com.my.spider.utils; import java.util.HashMap; import java.util.Map; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * 页面抓取请求的公共类 * */ public class HttpHtmlUtils { public static Logger logger = LoggerFactory.getLogger(HttpHtmlUtils.class); public static Map<String, String> header = new HashMap<String, String>(); public static Map<String, String> header_a = new HashMap<String, String>(); static { //设置请求头 header.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"); header.put("Accept","text/javascript, text/html, application/xml, text/xml, */*"); header.put("Accept-Language","zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3"); header.put("Accept-Encoding","gzip, deflate"); header.put("X-Requested-With","XMLHttpRequest"); header.put("Content-Type","text/*, application/xml"); header.put("Connection","keep-alive"); header_a.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0"); header_a.put("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"); header_a.put("Accept-Language","zh-CN,zh;q=0.8"); header_a.put("Accept-Encoding","gzip, deflate, sdch"); header_a.put("Content-Type","application/octet-stream"); header_a.put("Connection","keep-alive"); header_a.put("Upgrade-Insecure-Requests", "1"); } }
新浪滚动新闻抓取实现下载和分析:
package com.my.spider.schedule; import java.io.IOException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Component; import org.springframework.util.StringUtils; import com.my.spider.utils.FileUtils; import com.my.spider.utils.HttpHtmlUtils; @Component public class SinaSchedule extends ParentSchedule { private static Logger logger = LoggerFactory.getLogger(SinaSchedule.class); public static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm"); public static SimpleDateFormat sdfYMD = new SimpleDateFormat("yyyy-MM-dd"); private static int downloadtimeout = 5000; public static Set<String> titleSet = new HashSet<String>(); @Value("${img.download.dir.prefix:D://testhtml}") public String dirpath; @Override public void afterPropertiesSet() throws Exception { // TODO Auto-generated method stub } // 抓取文章列表 public static List<String> getArticleList(String url) { List<String> urlList = new ArrayList<String>(); logger.debug("获取文章信息url:{},开始时间={}", url, sdf.format(new Date())); try { Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header_a); Document document; document = connect.timeout(downloadtimeout).get(); Elements newsList = document.getElementsByClass("d_list_txt"); if (newsList != null && newsList.size() > 0) { newsList = newsList.get(0).getElementsByTag("ul").get(0).getElementsByTag("li"); for (Element el : newsList) { String elUrl = el.getElementsByTag("a").get(0).absUrl("href"); String urlName = el.getElementsByTag("a").get(0).text(); String time = 
el.getElementsByClass("c_time").get(0).text(); logger.debug("获取新闻:{},访问地址:{},时间:{}",urlName,elUrl,time); //elUrl = el.getElementsByTag("a").get(0).attr("href"); urlList.add(elUrl); } } logger.debug("获取文章列表信息:结束时间={}", sdf.format(new Date())); return urlList; } catch (IOException e) { logger.error("访问文章列表失败:" + url + " 原因" + e.getMessage()); } return null; } // 抓取文章列表 public static Map<String, Object> getArticleInfo(String url) { logger.debug("获取文章信息url:{},开始时间={}", url, sdf.format(new Date())); try { Map<String, Object> map = new HashMap<String, Object>(); Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header); Document document; document = connect.timeout(downloadtimeout).get(); Element titleEl = document.getElementById("artibodyTitle"); String tilte = ""; if (titleEl != null) { tilte = titleEl.text(); } Elements keywords = document.getElementsByClass("article-keywords"); String tag = ""; StringBuffer sb = new StringBuffer(); if (keywords != null ) { for (Element t : keywords.get(0).getElementsByTag("a")) { sb.append(t.text()).append(","); } if (!StringUtils.isEmpty(sb.toString())) { tag = sb.deleteCharAt(sb.lastIndexOf(",")).toString(); } } Element contentEle = document.getElementById("artibody"); String content = ""; String contentText = ""; if (contentEle != null) { content = contentEle.html(); contentText = contentEle.text(); } String description = ""; Elements descEle = document.getElementsByAttributeValue("name","description"); if (descEle != null && descEle.size() > 0) { description = descEle.get(0).attr("content"); } List<String> imgUrls = new ArrayList<>(); Elements imgs = contentEle.getElementsByTag("img"); if (imgs != null && imgs.size() > 0) { for (Element img : imgs) { String imgUrl = img.attr("src"); if (!StringUtils.isEmpty(imgUrl)) { imgUrls.add(imgUrl); } } } map.put("imgs", imgUrls); map.put("description", description); map.put("content", content); map.put("contentText", contentText); map.put("tag", tag); map.put("title", tilte); 
logger.debug("获取文章信息:结束时间={}", sdf.format(new Date())); return map; } catch (IOException e) { logger.error("访问文章页失败:" + url + " 原因" + e.getMessage()); } return null; } @Override public void destroy() throws Exception { // TODO Auto-generated method stub } public static void main(String[] args) { List<String> url = new ArrayList<>(); url.addAll(getArticleList("http://roll.news.sina.com.cn/s/channel.php?ch=01#col=89&spec=&type=&ch=0" + "1&k=&offset_page=0&offset_num=0&num=60&asc=&page=1")); titleSet.addAll(url); logger.debug("此次共获取到{}个",titleSet.size()); for (String urlStr : titleSet) { try { /* String htmlFile = FileUtils.downloadYunFile(urlStr, "D://testhtml//sina//"+sdfYMD.format(new Date())); Document document = Jsoup.parse(new File(htmlFile), "utf8"); document.getElementsByTag("tilte"); */ //下载保存 FileUtils.downloadYunFile(urlStr, "D://testhtml//sina//"+sdfYMD.format(new Date())); getArticleInfo(urlStr); } catch (Throwable e) { } } } @Override public void execute() { } @Override public void executeTimeTask() { // TODO Auto-generated method stub } @Override public void executeOtherTask() { // TODO Auto-generated method stub } }
下载html文件代码:
package com.my.spider.utils; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.net.URI; import java.util.Arrays; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.http.HttpEntity; import org.springframework.http.HttpHeaders; import org.springframework.http.HttpMethod; import org.springframework.http.MediaType; import org.springframework.http.ResponseEntity; import org.springframework.http.client.ClientHttpRequestFactory; import org.springframework.http.client.HttpComponentsClientHttpRequestFactory; import org.springframework.util.StreamUtils; import org.springframework.web.client.RestTemplate; import org.springframework.web.util.UriComponentsBuilder; import com.fasterxml.jackson.databind.ObjectMapper; public class FileUtils { private static final Logger logger = LoggerFactory.getLogger(FileUtils.class); private static ObjectMapper _objectMapper = new ObjectMapper(); private static int downloadTimeout = 5000; public static void main(String[] args) throws Throwable { String filePath = "/temp/temp/test.mpg"; String dirPrex = "/temp&Z:\\"; String[] paths = dirPrex.split("&"); System.out.println(paths[1] + filePath.substring(paths[0].length() + 1).replace("/", "\")); } // 文件复制 public static void copy(String src, String dest) throws IOException { System.out.println("正在拷贝【" + src + "】到【" + dest + "】 "); File destFile = new File(dest); if (!destFile.exists()) { String dir = dest.substring(0, dest.lastIndexOf(File.separator)); File dirF = new File(dir); if (!dirF.exists() || !dirF.isDirectory()) { dirF.mkdirs(); } destFile.createNewFile(); } 
FileInputStream in = new FileInputStream(src); FileOutputStream out = new FileOutputStream(dest); byte[] buffer = new byte[40960]; while (in.read(buffer) != -1) { out.write(buffer); out.flush(); } in.close(); out.close(); } // 下载云文件 public static String downloadYunFile(String url, String dir) throws Throwable { String fileName = getFileName(url); String filePath = dir + File.separator + fileName; try (CloseableHttpClient httpclient = HttpClients.createDefault()) { HttpGet httpget = new HttpGet(url); httpget.setConfig(RequestConfig.custom() // .setConnectionRequestTimeout(downloadTimeout) // .setConnectTimeout(downloadTimeout) // .setSocketTimeout(downloadTimeout) // .build()); try (CloseableHttpResponse response = httpclient.execute(httpget)) { org.apache.http.HttpEntity entity = response.getEntity(); File desc = new File(filePath); File folder = desc.getParentFile(); folder.mkdirs(); try (InputStream is = entity.getContent(); // OutputStream os = new FileOutputStream(desc)) { StreamUtils.copy(is, os); } } catch (Throwable e) { throw new Throwable("文件下载失败......", e); } } return filePath; } public static String getFileName(String fileFullPath) { fileFullPath = fileFullPath.replace("/", "\"); return fileFullPath.substring(fileFullPath.lastIndexOf("\") + 1, fileFullPath.length()); } // 请求例子 public void getToken(String url, String data) throws Throwable { RestTemplate restTemplate = new RestTemplate(); ClientHttpRequestFactory clientFactory = new HttpComponentsClientHttpRequestFactory(); restTemplate.setRequestFactory(clientFactory); HttpHeaders requestHeaders = new HttpHeaders(); requestHeaders.setAccept(Arrays.asList(MediaType.APPLICATION_JSON_UTF8)); requestHeaders.setContentType(MediaType.APPLICATION_JSON_UTF8); logger.debug("获取token的URL:" + url); URI uri = UriComponentsBuilder.fromUriString(url).build().encode().toUri(); logger.debug("请求数据:{}", _objectMapper.writeValueAsString(data)); HttpEntity<String> requestEntity = new HttpEntity<String>(data, requestHeaders); 
ResponseEntity<String> response = restTemplate.exchange(uri, HttpMethod.POST, requestEntity, String.class); String resp = response.getBody(); logger.debug("请求返回值数据:{}", _objectMapper.writeValueAsString(resp)); } }
4、总结:
Jsoup 用于这种页面抓取非常好用!当然,也可能是因为这里只实现了一个最简单的页面抓取过程。
追加一个下载音频的代码:
package com.my.spider.service; import java.net.HttpURLConnection; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Map; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.stereotype.Service; import com.alibaba.fastjson.JSONObject; import com.my.spider.model.AudioInfo; import com.my.spider.utils.FileUtils; import com.my.spider.utils.HttpHtmlUtils; import com.my.spider.utils.HttpURLConnectionFactory; @Service public class XmlyAudioService { public static final Logger logger = LoggerFactory.getLogger(XmlyAudioService.class); static String url = "http://www.ximalaya.com/dq/comic/"; static String requetUrl = "http://www.ximalaya.com/tracks/"; public static void main(String[] args) { List<String> audioUrlList = new ArrayList<String>(); int count = getCount(url); if(count > 1) { audioUrlList.addAll(getAudioList(1,url)); for (int i = 2; i <= count; i++) { url = url +i+"/"; audioUrlList.addAll(getAudioList(i,url)); url = url.replace(i+"/", ""); } } List<String> audioList = new ArrayList<String>(); //解析 if(audioUrlList.size() > 0) { for (String url : audioUrlList) { audioList.addAll(listAudio(url)); } } System.out.println(audioUrlList.size() + "==" + audioList.size()); List<AudioInfo> audioInfos = new ArrayList<>(); //下载 for (String sound_id : audioList) { requetUrl = requetUrl + sound_id+".json"; System.out.println(requetUrl); audioInfos.add(downloadList(requetUrl)); requetUrl = requetUrl.replace(sound_id+".json", ""); } } //获取音频页详情 public static List<String> getAudioList(int num,String url){ List<String> list = new ArrayList<>(); try { Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header_a); Document document = connect.timeout(5000).get(); FileUtils.str2File(document.toString(), "G:\xmly\html\comic" + num + ".html"); Element el = 
document.getElementById("explore_album_detail_entry"); Elements els = el.getElementsByClass("albumface"); for (Element element : els) { list.add(element.absUrl("href")); } } catch (Throwable e) { logger.error("获取{}网页信息失败,{}",url,e.getMessage(),e); } return list; } public static List<String> listAudio(String url){ List<String> list = new ArrayList<>(); try { Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header_a); Document document = connect.timeout(5000).get(); FileUtils.str2File(document.toString(), "G:\xmly\html\comic_"+System.currentTimeMillis()+".html"); Elements els = document.getElementsByClass("personal_body"); if(els!=null && els.size() > 0) { String sound_ids = els.get(0).attr("sound_ids"); list.addAll(Arrays.asList(sound_ids.split(","))); } } catch (Throwable e) { logger.error("获取{}网页信息失败,{}",url,e.getMessage(),e); } return list; } // @SuppressWarnings("unchecked") public static AudioInfo downloadList(String url){ AudioInfo audioInfo = new AudioInfo(); try { HttpURLConnection conn = HttpURLConnectionFactory.getConn(url); conn.setRequestProperty("Content-Type", "*/*; charset=utf-8"); String audioJson = HttpURLConnectionFactory.sendGet(conn); Map<String,Object> map = (Map<String, Object>) JSONObject.parse(audioJson); audioInfo.setId(map.get("id").toString()); audioInfo.setName(map.get("title").toString()); audioInfo.setUrl(map.get("play_path").toString()); try { FileUtils.downloadRenameFile(audioInfo.getUrl(), "G:\xmly", audioInfo.getName()+".mp3"); } catch (Throwable e) { logger.error("{}下载失败,id={}",audioInfo.getName(),audioInfo.getId());; } } catch (Throwable e) { logger.error(e.getMessage(),e); } return audioInfo; } //获取总页数页数 public static int getCount(String url) { try { Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header_a); Document document = connect.timeout(5000).get(); Elements els = document.getElementsByClass("pagingBar_page"); if(els.size() < 2) { return 1; } Element pageCout = els.get(els.size()-2); return 
Integer.valueOf(pageCout.text()); } catch (Throwable e) { e.printStackTrace(); } return 0; } }