zoukankan      html  css  js  c++  java
  • 新浪新闻页面抓取(JAVA-Jsoup)

    1、使用gradle建立工程:

        工程格式如下:

    // Name of the root build.
    rootProject.name = 'my-spider-demo'

    // The only sub-project of this build.
    include ':spider-demo'
    settings.gradle
    /**
     * Pins the requested dependency to {@code version} when it belongs to {@code group}.
     *
     * @param details resolve details exposing {@code requested.group} and {@code useVersion}
     * @param group   group id to match against the requested dependency
     * @param version version handed to {@code details.useVersion} on a match
     */
    def void forceVersion(details, group, version) {
        // Dependencies of any other group are left untouched.
        if (details.requested.group != group) {
            return
        }
        details.useVersion(version)
    }
    
    /**
     * Pins the requested dependency to {@code version} when both its group and its
     * module name match.
     *
     * @param details resolve details exposing {@code requested.group}, {@code requested.name}
     *                and {@code useVersion}
     * @param group   group id to match
     * @param name    module name to match
     * @param version version handed to {@code details.useVersion} on a match
     */
    def void forceVersion(details, group, name, version) {
        def requested = details.requested
        def sameGroup = requested.group == group
        // Mirrors the short-circuit of '&&': the name is only inspected for a matching group.
        if (!sameGroup || requested.name != name) {
            return
        }
        details.useVersion(version)
    }
    
    // Shared configuration applied to the root project and every sub-project:
    // coordinates, plugins, jar manifest, javadoc, publishing, repositories,
    // forced dependency versions and the Timestamp/BuildNumber extra properties.
    allprojects { p ->
        group = 'com.my.spider'
        version = '1.0.0'
        
        apply plugin: 'java'
        apply plugin: 'maven'
        apply plugin: 'maven-publish'
        
        // Compile main and test sources as UTF-8.
        [compileJava, compileTestJava]*.options*.encoding = 'UTF-8'
    
        // Fill in the jar manifest right before the jar task runs.
        jar.doFirst {
            manifest {
                // Merge a hand-written manifest when the project ships one.
                def manifestFile = "${projectDir}/META-INF/MANIFEST.MF"
                if (new File(manifestFile).exists())
                    from (manifestFile)
                
                attributes 'Implementation-Title':p.name
                // Snapshot builds get the build timestamp appended to the version.
                if (p.version.endsWith('-SNAPSHOT')) {
                    attributes 'Implementation-Version': p.version + '-' + p.ext.Timestamp
                } else {
                    attributes 'Implementation-Version': p.version
                }
                attributes 'Implementation-BuildDateTime':new Date()
            }
        }
        
        javadoc {
            options {
                encoding 'UTF-8'
                charSet 'UTF-8'
                author false
                version true
                links 'http://docs.oracle.com/javase/8/docs/api/index.html'
                memberLevel = org.gradle.external.javadoc.JavadocMemberLevel.PRIVATE
            }
        }
    
        // Only projects whose name ends with '-api' get sources and javadoc jars.
        if (p.name.endsWith('-api')){
            task sourcesJar(type:Jar, dependsOn:classes) {
                classifier = 'sources'
                from sourceSets.main.allSource            
            }
    
            task javadocJar(type:Jar, dependsOn:javadoc) {
                classifier = 'javadoc'
                from javadoc.destinationDir
            }
        }
        
        publishing {
            repositories {
                  maven {
                       // Credentials and URLs are read from the repositoryUpload* properties.
                       credentials {
                         username "${repositoryUploadUsername}"
                        password "${repositoryUploadPassword}"
                       }
                       
                       // Snapshot versions go to the snapshot URL, everything else to the release URL.
                       if (version.endsWith('-SNAPSHOT')) {
                        url "${repositoryUploadSnapshotUrl}"
                      } else {
                        url "${repositoryUploadReleaseUrl}"
                      }
                  }
              }
              publications {
                mavenJava(MavenPublication) {
                    from components.java
    
                    // Only *-api projects need to publish sources and javadoc
                    if (p.name.endsWith('-api')){
                        artifact sourcesJar {
                            classifier "sources"
                        }
                        artifact javadocJar {
                            classifier "javadoc"
                        }
                    }
                }
            }
        }
    
        // When the uploadArchives environment variable is set, building also publishes.
        if (System.env.uploadArchives) {
            build.dependsOn publish
        }
    
        // Build-script classpath: Spring Boot Gradle plugin from the configured Maven repository.
        buildscript {
            repositories {
                maven {
                    name 'Maven Repository'
                    url "${repositoryMavenUrl}"
                    credentials {
                        username "${repositoryUsername}"
                        password "${repositoryPassword}"
                    }
                }
            }
            dependencies {classpath 'org.springframework.boot:spring-boot-gradle-plugin:1.4.0.RELEASE' }
        }
        
        // After evaluation, Java projects get forced versions, logging exclusions and JUnit.
        afterEvaluate {Project  project -> 
            if (project.pluginManager.hasPlugin('java')) {
                configurations.all {
                    resolutionStrategy.eachDependency {DependencyResolveDetails details -> 
                        forceVersion details, 'org.springframework.boot', '1.4.1.RELEASE'
                        forceVersion details, 'org.slf4j', '1.7.21'
                        forceVersion details, 'org.springframework', '4.3.3.RELEASE'
                    }
    
                    exclude module:'slf4j-log4j12'
                    exclude module:'log4j'
                }
    
                dependencies {testCompile 'junit:junit:4.12' }
            }
        }
    
        // Dependency repositories: one Maven and one Ivy repository sharing the same credentials.
        repositories {
            maven {
                name 'Maven Repository'
                url "${repositoryMavenUrl}"
                credentials {
                    username "${repositoryUsername}"
                    password "${repositoryPassword}"
                }
            }
    
            ivy {
                name 'Ivy Repository'
                url "${repositoryIvyUrl}"
                credentials {
                    username "${repositoryUsername}"
                    password "${repositoryPassword}"
                }
                layout "pattern",  {
                    artifact '[organisation]/[module]/[revision]/[type]s/[artifact]-[revision].[ext]'
                    ivy '[organisation]/[module]/[revision]/[type]s/[artifact].[ext]'
                    m2compatible = true
                }
            }
        }
        
        // Timestamp: year, month, day, hour, minute
        p.ext.Timestamp = new Date().format('yyyyMMddHHmm')
        // Build Number: taken from the BUILD_NUMBER environment variable, 'x' when unset or empty
        p.ext.BuildNumber = System.env.BUILD_NUMBER
        if (p.ext.BuildNumber == null || "" == p.ext.BuildNumber) {
            p.ext.BuildNumber = 'x'
        }    
    }
    
    // Zips the project directory, leaving out hidden files and the build/runtime
    // directories of every project.
    task zipSources(type: Zip) {
        description '压缩源代码'

        // The archive name is also stored on project.ext so other tasks can read it.
        def sourcesZipName = project.name + '-' + project.version + '-' + project.ext.Timestamp + '.' + project.ext.BuildNumber + '-sources.zip'
        project.ext.zipSourcesFile = sourcesZipName
        archiveName = sourcesZipName
        includeEmptyDirs = false

        from project.projectDir

        exclude '**/.*'
        exclude 'build/*'
        // Per-project output and runtime directories.
        allprojects.each { p ->
            ['bin', 'build', 'data', 'work', 'logs'].each { dirName ->
                exclude '**/' + p.name + '/' + dirName + '/*'
            }
        }
    }
    
    /**
     * Builds the copy spec that lays out one application inside the setup archive:
     * the jar from build/libs, the example configs, the Windows .bat script and the
     * Unix .conf file.
     *
     * @param prj     project whose artifacts are collected
     * @param dstname target directory inside the archive; falls back to the project name
     * @return the assembled copy spec
     */
    def CopySpec appCopySpec(Project prj, dstname = null) {
        def installDir = dstname ?: prj.name
        def projectRoot = prj.projectDir.toString()
        def versionedName = prj.name + '-' + project.version

        return copySpec {
            // Fat jar
            from(prj.buildDir.toString() + '/libs/' + versionedName + '.jar') {
                into installDir
            }

            // Configs
            from(projectRoot + '/config/examples') {
                into installDir + '/config'
            }

            // Windows start script
            from(projectRoot + '/' + prj.name + '.bat') {
                into installDir
            }

            // Unix conf script, renamed so that it carries the version
            from(projectRoot + '/' + prj.name + '.conf') {
                into installDir
                rename prj.name, versionedName
            }
        }
    }
    
    
    // Builds the setup archive from the spider-demo application once all sub-projects are built.
    task zipSetup(type: Zip, dependsOn: subprojects.build) {
        description '制作安装包'

        // The archive name is also stored on project.ext so other tasks can read it.
        def setupZipName = project.name + '-' + project.version + '-' + project.ext.Timestamp + '.' + project.ext.BuildNumber + '-setup.zip'
        project.ext.zipSetupFile = setupZipName
        archiveName = setupZipName

        with appCopySpec(project(':spider-demo'))
    }
    
    import java.security.MessageDigest
    
    /**
     * Computes the MD5 checksum of a file.
     *
     * @param file file to hash; read through {@code withInputStream}
     * @return the digest as a 32-character lowercase hexadecimal string
     */
    def generateMD5(final file) {
        MessageDigest digest = MessageDigest.getInstance("MD5")
        file.withInputStream { is ->
            byte[] buffer = new byte[8192]
            int read = 0
            while ((read = is.read(buffer)) > 0) {
                digest.update(buffer, 0, read)
            }
        }
        // BigInteger.toString(16) drops leading zeros, which would yield a checksum
        // shorter than 32 characters; %032x pads it back to the full width.
        return String.format('%032x', new BigInteger(1, digest.digest()))
    }
    
    // Prints the MD5 checksums of the setup and sources archives and writes them to a
    // "-md5.txt" file next to the archives.
    // Uses doLast instead of the '<<' shortcut, which newer Gradle versions no longer accept.
    task md5(dependsOn: [zipSetup, zipSources]) {
        doLast {
            def distDir = "${projectDir}/build/distributions/"
            String md5_setup = generateMD5(file(distDir + project.ext.zipSetupFile))
            String md5_sources = generateMD5(file(distDir + project.ext.zipSourcesFile))
            def lines = [
                project.ext.zipSetupFile + '=' + md5_setup,
                project.ext.zipSourcesFile + '=' + md5_sources
            ]
            lines.each { line -> println line }

            def checksumFile = new File(distDir
                            + project.name + '-' + project.version + '-' + project.ext.Timestamp + '.' + project.ext.BuildNumber + '-md5.txt')
            // withPrintWriter flushes and closes the writer even when a write fails.
            checksumFile.withPrintWriter { writer ->
                lines.each { line -> writer.println line }
            }
        }
    }
    
    // The root build also builds every sub-project, both archives and the checksum file.
    build.dependsOn(subprojects.build, zipSetup, zipSources, md5)
    build.gradle

    子工程相关依赖:

    // Build script of the sub-project: Spring Boot packaging plus the 'application' plugin.
    apply plugin: 'spring-boot'
    apply plugin: 'application'
    
    // Ship the example configuration files under "config" in the main distribution.
    distributions {
        main {
            contents {
                from ("${projectDir}/config/examples") {
                    into "config"
                }
            }
        }
    }
    
    // Tar distribution is switched off.
    distTar.enabled = false
    
    // Spring Boot packaging options: executable jar and the application entry point.
    springBoot {
        executable = true
        mainClass = 'com.my.spider.Application'
    }
    
    dependencies {
        compile 'org.springframework.boot:spring-boot-starter-web:1.4.0.RELEASE'
        compile 'dom4j:dom4j:1.6.1'
        compile 'commons-httpclient:commons-httpclient:3.1'
        compileOnly 'com.h2database:h2:1.4.191'
        compile 'javax.cache:cache-api:1.0.0'
        compile 'org.jboss.resteasy:resteasy-jaxrs:3.0.14.Final'
        compile 'org.jboss.resteasy:resteasy-client:3.0.14.Final'
        // Axis
        compile 'axis:axis:1.4'
        
        // HTML fetching and parsing
        compile 'org.jsoup:jsoup:1.10.1'
        
        // JSON handling
        compile 'com.alibaba:fastjson:1.2.21'
        
    }
    build.gradle

    2、代码编写:

        入口:

    package com.my.spider;
    
    import java.io.IOException;
    
    import org.springframework.boot.SpringApplication;
    import org.springframework.boot.autoconfigure.SpringBootApplication;
    import org.springframework.scheduling.annotation.EnableAsync;
    import org.springframework.scheduling.annotation.EnableScheduling;
    
    import com.my.spider.utils.CommonProperties;
    
    /**
     * Application entry point. Before starting Spring it loads the external
     * {@code .properties} files into the system properties and publishes the
     * {@code application.version} and {@code app.home} system properties.
     */
    @SpringBootApplication
    @EnableScheduling
    @EnableAsync
    public class Application {

        /**
         * @param args command-line arguments, forwarded to {@link SpringApplication#run}
         * @throws IOException if a configuration file cannot be read
         */
        public static void main(String[] args) throws IOException {
            String configDir = CommonProperties.loadProperties2System(System.getProperty("spring.config.location"));
            System.setProperty("application.version", CommonProperties.getVersion(Application.class));
            // The application home is the parent of the configuration directory.
            System.setProperty("app.home", configDir + "/..");
            SpringApplication.run(Application.class, args);
        }

    }
    package com.my.spider.utils;
    
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.IOException;
    import java.util.Properties;
    
    import org.springframework.util.StringUtils;
    
    /**
     * Helpers for locating the configuration directory, loading its
     * {@code .properties} files into the system properties and reading the
     * implementation version from a class's package.
     */
    public final class CommonProperties {

        /** System property that holds the application home directory. */
        public static final String PPT_KEY_APP_HOME = "app.home";

        /** Application home used when {@link #PPT_KEY_APP_HOME} is not set. */
        public static final String DEFAULT_APP_HOME = "./";

        /**
         * @return the value of the {@code app.home} system property, or
         *         {@link #DEFAULT_APP_HOME} when it is not set
         */
        public static final String getAppHome() {
            // The lookup key must be the property name, not the default value.
            return System.getProperty(PPT_KEY_APP_HOME, DEFAULT_APP_HOME);
        }

        /**
         * Loads every {@code *.properties} file found directly inside the configuration
         * directory into the system properties.
         *
         * @param location configuration directory; when null or empty, {@code ./config}
         *                 is tried first and {@code ../config} second
         * @return the configuration directory that was used
         * @throws IOException if the directory cannot be listed or a file cannot be read
         */
        public static String loadProperties2System(String location) throws IOException {
            String configLocation = location;
            File cnf;
            if (configLocation == null || configLocation.isEmpty()) {
                configLocation = "./config";
                cnf = new File(configLocation);
                if (!cnf.exists() || !cnf.isDirectory()) {
                    configLocation = "../config";
                    cnf = new File(configLocation);
                }
            } else {
                cnf = new File(configLocation);
            }

            // listFiles() returns null when the path is missing or is not a directory.
            File[] candidates = cnf.listFiles();
            if (candidates == null) {
                throw new IOException("Configuration directory cannot be read: " + cnf.getAbsolutePath());
            }

            for (File file : candidates) {
                if (file.isFile() && file.getName().endsWith(".properties")) {
                    Properties ppt = new Properties();
                    try (FileInputStream fi = new FileInputStream(file)) {
                        ppt.load(fi);
                        System.getProperties().putAll(ppt);
                    }
                }
            }
            return configLocation;
        }

        /**
         * @param clazz class whose package is inspected
         * @return the package's implementation version, or {@code "undefined"} when the
         *         package or the version is unavailable
         */
        public static String getVersion(Class<?> clazz) {
            Package pkg = clazz.getPackage();
            String ver = (pkg != null ? pkg.getImplementationVersion() : "undefined");
            return (ver == null ? "undefined" : ver);
        }
    }

    配置类:

    package com.my.spider.config;
    
    import org.springframework.context.annotation.ComponentScan;
    import org.springframework.context.annotation.Configuration;
    import org.springframework.scheduling.annotation.EnableScheduling;
    
    /**
     * Configuration class that enables scheduling and component-scans the
     * {@code com.my.spider.rs} and {@code com.my.spider.schedule} packages.
     */
    @Configuration
    @EnableScheduling
    @ComponentScan(basePackages = { "com.my.spider.rs", "com.my.spider.schedule" })
    public class AppAutoConfiguration {

    }

    META-INF下spring.factories文件:

    # The trailing backslash continues the value on the next line; without it the key
    # would be mapped to an empty value.
    org.springframework.boot.autoconfigure.EnableAutoConfiguration=\
        com.my.spider.config.AppAutoConfiguration

    3、功能代码:

    定时任务抽象类,提供三种定时任务的调用方法:

    package com.my.spider.schedule;
    
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;
    import org.springframework.beans.factory.DisposableBean;
    import org.springframework.beans.factory.InitializingBean;
    import org.springframework.scheduling.annotation.Scheduled;
    import org.springframework.stereotype.Component;
    
    import com.fasterxml.jackson.databind.ObjectMapper;
    
    /**
     * Base class for scheduled jobs. It declares three triggers and delegates each
     * of them to an abstract hook that subclasses implement.
     */
    @Component
    public abstract class ParentSchedule implements InitializingBean, DisposableBean {

        public static Logger logger = LoggerFactory.getLogger(ParentSchedule.class);

        public static final ObjectMapper objectMapper = new ObjectMapper();

        /** Scheduled task 1: body of the fixed-delay trigger {@link #dowork()}. */
        public abstract void execute();

        /** Scheduled task 2: body of the configurable cron trigger {@link #timeTask()}. */
        public abstract void executeTimeTask();

        /** Scheduled task 3: body of the daily trigger {@link #otherTask()}. */
        public abstract void executeOtherTask();

        /**
         * Fixed-delay trigger; initial delay and delay come from
         * {@code agent.task.initialDelay} (default 1000) and
         * {@code agent.task.fixedDelay} (default 10000).
         */
        @Scheduled(initialDelayString = "${agent.task.initialDelay:1000}", fixedDelayString = "${agent.task.fixedDelay:10000}")
        public void dowork() {
            execute();
        }

        /** Cron trigger configured by {@code agent.task.cron}; the default fires at 10:00, 14:00 and 16:00. */
        @Scheduled(cron = "${agent.task.cron:0 0 10,14,16 * * ?}")
        public void timeTask() {
            executeTimeTask();
        }

        /** Fires every day at 12:00. */
        @Scheduled(cron = "0 0 12 * * ?")
        public void otherTask() {
            executeOtherTask();
        }
    }
    package com.my.spider.utils;
    
    import java.util.HashMap;
    import java.util.Map;
    
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;
    
    /**
     * Shared request-header presets used when fetching pages.
     */
    public class HttpHtmlUtils {

        public static Logger logger = LoggerFactory.getLogger(HttpHtmlUtils.class);

        /** Preset with a Firefox user agent and the {@code X-Requested-With: XMLHttpRequest} marker. */
        public static Map<String, String> header = firefoxAjaxHeaders();

        /** Preset with a Chrome-style user agent and {@code Upgrade-Insecure-Requests}. */
        public static Map<String, String> header_a = chromeBrowserHeaders();

        /** Builds the entries of {@link #header}. */
        private static Map<String, String> firefoxAjaxHeaders() {
            Map<String, String> preset = new HashMap<String, String>();
            preset.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0");
            preset.put("Accept", "text/javascript, text/html, application/xml, text/xml, */*");
            preset.put("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");
            preset.put("Accept-Encoding", "gzip, deflate");
            preset.put("X-Requested-With", "XMLHttpRequest");
            preset.put("Content-Type", "text/*, application/xml");
            preset.put("Connection", "keep-alive");
            return preset;
        }

        /** Builds the entries of {@link #header_a}. */
        private static Map<String, String> chromeBrowserHeaders() {
            Map<String, String> preset = new HashMap<String, String>();
            preset.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0");
            preset.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
            preset.put("Accept-Language", "zh-CN,zh;q=0.8");
            preset.put("Accept-Encoding", "gzip, deflate, sdch");
            preset.put("Content-Type", "application/octet-stream");
            preset.put("Connection", "keep-alive");
            preset.put("Upgrade-Insecure-Requests", "1");
            return preset;
        }

    }

    新浪滚动新闻抓取实现下载和分析:

    package com.my.spider.schedule;
    
    import java.io.IOException;
    import java.text.SimpleDateFormat;
    import java.util.ArrayList;
    import java.util.Date;
    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Map;
    import java.util.Set;
    
    import org.jsoup.Connection;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;
    import org.springframework.beans.factory.annotation.Value;
    import org.springframework.stereotype.Component;
    import org.springframework.util.StringUtils;
    
    import com.my.spider.utils.FileUtils;
    import com.my.spider.utils.HttpHtmlUtils;
    
    /**
     * Spider for the Sina rolling-news list: collects article URLs from the list
     * page and extracts title, keywords, body, description and image URLs from an
     * article page. The scheduled hooks inherited from {@link ParentSchedule} are
     * still empty; {@link #main(String[])} runs one crawl by hand.
     */
    @Component
    public class SinaSchedule extends ParentSchedule {

        private static Logger logger = LoggerFactory.getLogger(SinaSchedule.class);

        public static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm");

        public static SimpleDateFormat sdfYMD = new SimpleDateFormat("yyyy-MM-dd");

        /** Timeout passed to every Jsoup request. */
        private static int downloadtimeout = 5000;

        /** Article URLs collected by {@link #main(String[])}. */
        public static Set<String> titleSet = new HashSet<String>();

        @Value("${img.download.dir.prefix:D://testhtml}")
        public String dirpath;

        @Override
        public void afterPropertiesSet() throws Exception {
            // Nothing to initialise.
        }

        /**
         * Fetches the article list page and collects the absolute URL of every listed article.
         *
         * @param url address of the list page
         * @return the article URLs in page order; empty (never null) when the page cannot
         *         be fetched or contains no list
         */
        public static List<String> getArticleList(String url) {

            List<String> urlList = new ArrayList<String>();
            logger.debug("获取文章信息url:{},开始时间={}", url, sdf.format(new Date()));

            try {
                Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header_a);
                Document document = connect.timeout(downloadtimeout).get();

                // first() yields null for an empty match instead of throwing like get(0).
                Element listBox = document.getElementsByClass("d_list_txt").first();
                Element listRoot = (listBox == null ? null : listBox.getElementsByTag("ul").first());
                if (listRoot != null) {
                    for (Element el : listRoot.getElementsByTag("li")) {
                        Element link = el.getElementsByTag("a").first();
                        if (link == null) {
                            // Entry without a link: nothing to collect.
                            continue;
                        }
                        Element timeEl = el.getElementsByClass("c_time").first();
                        String elUrl = link.absUrl("href");
                        String urlName = link.text();
                        String time = (timeEl == null ? "" : timeEl.text());
                        logger.debug("获取新闻:{},访问地址:{},时间:{}", urlName, elUrl, time);
                        urlList.add(elUrl);
                    }
                }
                logger.debug("获取文章列表信息:结束时间={}", sdf.format(new Date()));
            } catch (IOException e) {
                logger.error("访问文章列表失败:" + url + "  原因" + e.getMessage(), e);
            }
            // An empty list on failure keeps callers that addAll() the result from hitting a null.
            return urlList;
        }

        /**
         * Fetches one article page and extracts its details.
         *
         * @param url address of the article page
         * @return a map with the keys {@code title}, {@code tag} (comma-separated keywords),
         *         {@code content} (HTML), {@code contentText}, {@code description} and
         *         {@code imgs} (list of image {@code src} values); {@code null} when the
         *         page cannot be fetched
         */
        public static Map<String, Object> getArticleInfo(String url) {

            logger.debug("获取文章信息url:{},开始时间={}", url, sdf.format(new Date()));
            try {
                Map<String, Object> map = new HashMap<String, Object>();
                Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header);
                Document document = connect.timeout(downloadtimeout).get();

                Element titleEl = document.getElementById("artibodyTitle");
                String tilte = (titleEl == null ? "" : titleEl.text());

                // The keyword block may be missing; only the first one is read.
                String tag = "";
                Element keywordBox = document.getElementsByClass("article-keywords").first();
                if (keywordBox != null) {
                    StringBuilder sb = new StringBuilder();
                    for (Element t : keywordBox.getElementsByTag("a")) {
                        sb.append(t.text()).append(",");
                    }
                    if (sb.length() > 0) {
                        // Drop the trailing separator.
                        tag = sb.deleteCharAt(sb.length() - 1).toString();
                    }
                }

                Element contentEle = document.getElementById("artibody");
                String content = "";
                String contentText = "";
                List<String> imgUrls = new ArrayList<>();
                if (contentEle != null) {
                    content = contentEle.html();
                    contentText = contentEle.text();
                    // Images are only looked up when the article body exists.
                    for (Element img : contentEle.getElementsByTag("img")) {
                        String imgUrl = img.attr("src");
                        if (!StringUtils.isEmpty(imgUrl)) {
                            imgUrls.add(imgUrl);
                        }
                    }
                }

                String description = "";
                Element descEle = document.getElementsByAttributeValue("name", "description").first();
                if (descEle != null) {
                    description = descEle.attr("content");
                }

                map.put("imgs", imgUrls);
                map.put("description", description);
                map.put("content", content);
                map.put("contentText", contentText);
                map.put("tag", tag);
                map.put("title", tilte);
                logger.debug("获取文章信息:结束时间={}", sdf.format(new Date()));

                return map;
            } catch (IOException e) {
                logger.error("访问文章页失败:" + url + "  原因" + e.getMessage(), e);
            }
            return null;
        }

        @Override
        public void destroy() throws Exception {
            // Nothing to release.
        }

        /**
         * Manual run: collects the article URLs of the first list page, then downloads
         * and parses each article. A failing article is logged and skipped.
         */
        public static void main(String[] args) {
            List<String> url = new ArrayList<>();
            url.addAll(getArticleList("http://roll.news.sina.com.cn/s/channel.php?ch=01#col=89&spec=&type=&ch=0"
                        + "1&k=&offset_page=0&offset_num=0&num=60&asc=&page=1"));
            titleSet.addAll(url);
            logger.debug("此次共获取到{}个",titleSet.size());

            for (String urlStr : titleSet) {
                try {
                    // Download and save the raw page
                    FileUtils.downloadYunFile(urlStr, "D://testhtml//sina//"+sdfYMD.format(new Date()));

                    getArticleInfo(urlStr);
                } catch (Throwable e) {
                    // Keep going with the remaining articles, but do not hide the failure.
                    logger.error("处理文章失败:{}", urlStr, e);
                }
            }
        }

        @Override
        public void execute() {
            // Not implemented yet.
        }

        @Override
        public void executeTimeTask() {
            // Not implemented yet.
        }

        @Override
        public void executeOtherTask() {
            // Not implemented yet.
        }

    }
    View Code

    下载html文件代码:

    package com.my.spider.utils;
    
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.OutputStream;
    import java.net.URI;
    import java.util.Arrays;
    
    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;
    import org.springframework.http.HttpEntity;
    import org.springframework.http.HttpHeaders;
    import org.springframework.http.HttpMethod;
    import org.springframework.http.MediaType;
    import org.springframework.http.ResponseEntity;
    import org.springframework.http.client.ClientHttpRequestFactory;
    import org.springframework.http.client.HttpComponentsClientHttpRequestFactory;
    import org.springframework.util.StreamUtils;
    import org.springframework.web.client.RestTemplate;
    import org.springframework.web.util.UriComponentsBuilder;
    
    import com.fasterxml.jackson.databind.ObjectMapper;
    
    public class FileUtils {
    
        private static final Logger logger = LoggerFactory.getLogger(FileUtils.class);
    
        private static ObjectMapper _objectMapper = new ObjectMapper();
    
        private static int downloadTimeout = 5000;
    
        public static void main(String[] args) throws Throwable {
            String filePath = "/temp/temp/test.mpg";
            String dirPrex = "/temp&Z:\\";
            String[] paths = dirPrex.split("&");
            System.out.println(paths[1] + filePath.substring(paths[0].length() + 1).replace("/", "\"));
        }
    
        // 文件复制
        public static void copy(String src, String dest) throws IOException {
    
            System.out.println("正在拷贝【" + src + "】到【" + dest + "】
    ");
            File destFile = new File(dest);
            if (!destFile.exists()) {
                String dir = dest.substring(0, dest.lastIndexOf(File.separator));
                File dirF = new File(dir);
                if (!dirF.exists() || !dirF.isDirectory()) {
                    dirF.mkdirs();
                }
                destFile.createNewFile();
            }
            FileInputStream in = new FileInputStream(src);
            FileOutputStream out = new FileOutputStream(dest);
            byte[] buffer = new byte[40960];
            while (in.read(buffer) != -1) {
                out.write(buffer);
                out.flush();
            }
            in.close();
            out.close();
        }
    
        // 下载云文件
        public static String downloadYunFile(String url, String dir) throws Throwable {
    
            String fileName = getFileName(url);
    
            String filePath = dir + File.separator + fileName;
    
            try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
                HttpGet httpget = new HttpGet(url);
                httpget.setConfig(RequestConfig.custom() //
                        .setConnectionRequestTimeout(downloadTimeout) //
                        .setConnectTimeout(downloadTimeout) //
                        .setSocketTimeout(downloadTimeout) //
                        .build());
                try (CloseableHttpResponse response = httpclient.execute(httpget)) {
                    org.apache.http.HttpEntity entity = response.getEntity();
                    File desc = new File(filePath);
                    File folder = desc.getParentFile();
                    folder.mkdirs();
                    try (InputStream is = entity.getContent(); //
                            OutputStream os = new FileOutputStream(desc)) {
                        StreamUtils.copy(is, os);
                    }
                } catch (Throwable e) {
                    throw new Throwable("文件下载失败......", e);
                }
            }
            return filePath;
        }
    
        public static String getFileName(String fileFullPath) {
            fileFullPath = fileFullPath.replace("/", "\");
            return fileFullPath.substring(fileFullPath.lastIndexOf("\") + 1, fileFullPath.length());
        }
    
        // 请求例子
        public void getToken(String url, String data) throws Throwable {
    
            RestTemplate restTemplate = new RestTemplate();
            ClientHttpRequestFactory clientFactory = new HttpComponentsClientHttpRequestFactory();
            restTemplate.setRequestFactory(clientFactory);
    
            HttpHeaders requestHeaders = new HttpHeaders();
            requestHeaders.setAccept(Arrays.asList(MediaType.APPLICATION_JSON_UTF8));
            requestHeaders.setContentType(MediaType.APPLICATION_JSON_UTF8);
            logger.debug("获取token的URL:" + url);
    
            URI uri = UriComponentsBuilder.fromUriString(url).build().encode().toUri();
    
            logger.debug("请求数据:{}", _objectMapper.writeValueAsString(data));
    
            HttpEntity<String> requestEntity = new HttpEntity<String>(data, requestHeaders);
    
            ResponseEntity<String> response = restTemplate.exchange(uri, HttpMethod.POST, requestEntity, String.class);
            String resp = response.getBody();
            logger.debug("请求返回值数据:{}", _objectMapper.writeValueAsString(resp));
        }
    
    }

    4、总结:

               Jsoup对于这种页面抓取很好用!当然,也可能是因为这里只实现了一个最简单的页面抓取过程。

    追加一个下载音频的代码:

    package com.my.spider.service;
    
    
    import java.net.HttpURLConnection;
    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;
    import java.util.Map;
    
    import org.jsoup.Connection;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;
    import org.springframework.stereotype.Service;
    
    import com.alibaba.fastjson.JSONObject;
    import com.my.spider.model.AudioInfo;
    import com.my.spider.utils.FileUtils;
    import com.my.spider.utils.HttpHtmlUtils;
    import com.my.spider.utils.HttpURLConnectionFactory;
    
    @Service
    public class XmlyAudioService {
    
        public static final Logger logger = LoggerFactory.getLogger(XmlyAudioService.class);
        static String url = "http://www.ximalaya.com/dq/comic/";
        static String requetUrl = "http://www.ximalaya.com/tracks/";
    
        /**
         * Manual run: collects the album URLs of every listing page, reads the sound ids
         * of each album and requests the JSON description of every sound.
         */
        public static void main(String[] args) {
            List<String> audioUrlList = new ArrayList<String>();
            int count = getCount(url);
            // NOTE(review): nothing is fetched when count is 1 — confirm that getCount
            // never reports a single page.
            if (count > 1) {
                audioUrlList.addAll(getAudioList(1, url));
                for (int i = 2; i <= count; i++) {
                    // Page URLs are built locally; the shared base URL is never modified.
                    audioUrlList.addAll(getAudioList(i, url + i + "/"));
                }
            }

            // Parse: collect the sound ids of every album page
            List<String> audioList = new ArrayList<String>();
            for (String albumUrl : audioUrlList) {
                audioList.addAll(listAudio(albumUrl));
            }
            System.out.println(audioUrlList.size() + "==" + audioList.size());

            // Download: request the description of every sound
            List<AudioInfo> audioInfos = new ArrayList<>();
            for (String sound_id : audioList) {
                String trackUrl = requetUrl + sound_id + ".json";
                System.out.println(trackUrl);
                audioInfos.add(downloadList(trackUrl));
            }
        }
    
        //获取音频页详情
        public static List<String> getAudioList(int num,String url){
            List<String> list = new ArrayList<>();
            try {
                    Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header_a);
                    Document document = connect.timeout(5000).get();
                    FileUtils.str2File(document.toString(), "G:\xmly\html\comic" + num + ".html");
                    Element el = document.getElementById("explore_album_detail_entry");
                    Elements els =  el.getElementsByClass("albumface");
                    for (Element element : els) {
                        list.add(element.absUrl("href"));
                    }
            } catch (Throwable e) {
                logger.error("获取{}网页信息失败,{}",url,e.getMessage(),e);
            }
            return list;
        }
        
        public static List<String> listAudio(String url){
            List<String> list = new ArrayList<>();
            try {
                    Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header_a);
                    Document document = connect.timeout(5000).get();
                    FileUtils.str2File(document.toString(), "G:\xmly\html\comic_"+System.currentTimeMillis()+".html");
                    Elements els = document.getElementsByClass("personal_body");
                    if(els!=null && els.size() > 0) {
                        String sound_ids = els.get(0).attr("sound_ids");
                        list.addAll(Arrays.asList(sound_ids.split(",")));
                    }
            } catch (Throwable e) {
                logger.error("获取{}网页信息失败,{}",url,e.getMessage(),e);
            }
            return list;
        }
        
        //
        @SuppressWarnings("unchecked")
        public static AudioInfo downloadList(String url){
            AudioInfo audioInfo = new AudioInfo();
            try {
            
                HttpURLConnection conn = HttpURLConnectionFactory.getConn(url);
                conn.setRequestProperty("Content-Type", "*/*; charset=utf-8");
                String audioJson  = HttpURLConnectionFactory.sendGet(conn);
                Map<String,Object> map =  (Map<String, Object>) JSONObject.parse(audioJson);
                audioInfo.setId(map.get("id").toString());
                audioInfo.setName(map.get("title").toString());
                audioInfo.setUrl(map.get("play_path").toString());
                try {
                    FileUtils.downloadRenameFile(audioInfo.getUrl(), "G:\xmly", audioInfo.getName()+".mp3");
                } catch (Throwable e) {
                    logger.error("{}下载失败,id={}",audioInfo.getName(),audioInfo.getId());;
                }
                
            } catch (Throwable e) {
                logger.error(e.getMessage(),e);
            }
            return audioInfo;
        }
        
        //获取总页数页数
        public static int getCount(String url) {
            try {
                Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header_a);
                Document document = connect.timeout(5000).get();
                Elements els = document.getElementsByClass("pagingBar_page");
                if(els.size() < 2) {
                    return 1;
                }
                Element pageCout = els.get(els.size()-2);
                return Integer.valueOf(pageCout.text());
            } catch (Throwable e) {
                e.printStackTrace();
            }
            return 0;
        }
        
        
    }
    xmly.java
  • 相关阅读:
    sql中保留2位小数
    C# 操作字符串,在某些特定的字符后面或前面添加其它字符
    Windows Server 2008 R2中上传和下载文件
    winform中显示标题,点击打开链接
    正则表达式
    winform重绘
    js获取元素的页面坐标
    剑指offer-从上往下打印二叉树
    剑指offer-栈的压入、弹出序列
    剑指offer-包含min函数的栈
  • 原文地址:https://www.cnblogs.com/liangblog/p/7594047.html
Copyright © 2011-2022 走看看