zoukankan html css js c++ java

Java jsoup多线程爬虫(爬豆瓣图书封面)

Java爬虫，就先爬个好爬的豆瓣读书的封面。

利用线程池多线程爬，biubiubiu，速度超快。

下载到指定的文件夹中。

App.java:

package com.newer.spider;

import java.io.IOException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Entry point of the crawler: loads the Douban Books home page, locates the
 * book-cover images and downloads each of them on a worker thread.
 */
public class App {

    /**
     * Fetches the page, selects every {@code img} nested under an element with
     * class {@code cover}, and submits one {@link DownloadTask} per image to a
     * fixed pool of 9 threads.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {

        // Target address (URL) of the page to crawl.
        String url = "https://book.douban.com/";

        // A single fixed-size pool runs the downloads. It is created before the
        // try block so that the finally clause can always shut it down.
        ExecutorService pool = Executors.newFixedThreadPool(9);

        try {
            // Fetch and parse the HTML with jsoup (https://jsoup.org).
            Document doc = Jsoup.connect(url).get();

            // CSS selector: every <img> inside an element with class "cover".
            Elements els = doc.select(".cover img");
            System.out.println(els.size());

            for (Element e : els) {
                // <img src="" width="" height="" />
                // absUrl resolves relative and protocol-relative values against
                // the page address and yields "" when no usable URL exists.
                String src = e.absUrl("src");
                if (src.isEmpty()) {
                    continue;
                }
                System.out.println(src);

                // Download each image on a pool thread.
                pool.execute(new DownloadTask(src));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Already-submitted tasks still run to completion; without this the
            // idle worker threads would keep the JVM alive after a failure.
            pool.shutdown();
        }
    }
}

package com.newer.spider;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;

import javax.net.ssl.HttpsURLConnection;

/**
 * 负责下载图片的任务，可以由线程执行 （Runnable）
 * 
 * @author zmz
 *
 */
/**
 * Task that downloads a single image; it implements {@link Runnable} so it can
 * be executed by a thread or a thread pool.
 *
 * @author zmz
 */
public class DownloadTask implements Runnable {

    /** Directory the downloaded images are written to (backslashes escaped). */
    private static final String SAVE_DIR = "G:\\Newer_Project\\Spider\\img\\";

    /** URL of the image to download. */
    String imagePath;

    /**
     * Creates a task for one image.
     *
     * @param src
     *            URL of the image to download
     */
    public DownloadTask(String src) {
        imagePath = src;
    }

    /**
     * Opens an HTTP connection to {@link #imagePath}, copies the response body
     * into a file under {@code SAVE_DIR} named after the last path segment of
     * the URL, and prints the worker thread name when finished. Malformed URLs
     * and I/O failures are reported through their stack trace and otherwise
     * ignored, so one failed download does not affect the others.
     */
    @Override
    public void run() {

        HttpURLConnection conn = null;

        try {
            conn = (HttpURLConnection) new URL(imagePath).openConnection();

            // The file name is everything after the last '/' of the URL.
            int index = imagePath.lastIndexOf('/');
            String file = SAVE_DIR + imagePath.substring(index + 1);

            // try-with-resources closes BOTH streams, even when the copy fails.
            try (InputStream in = conn.getInputStream();
                    FileOutputStream out = new FileOutputStream(file)) {
                byte[] buf = new byte[1024 + 16];
                int size;
                while (-1 != (size = in.read(buf))) {
                    out.write(buf, 0, size);
                }
            }

            // Download finished.
            String name = Thread.currentThread().getName();
            System.out.println(name + "下载" + imagePath);

        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Runs whether or not an exception occurred.
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

}

后续可以考虑实现翻页爬取：目前的程序只爬取了当前这一页，而豆瓣读书还有很多页，可以在爬完这一页之后继续爬取下一页。

查看全文

相关阅读:
Python基础篇 -- 列表
 Python基础篇 -- 字符串
 Python基础篇 -- if while 语句
 Python基础篇 -- 运算符和编码
 Python 入门基础
 Docker知识收藏
 秒表
 Emac
Android开发
 shell 小工具

原文地址：https://www.cnblogs.com/zhangmingzhao/p/7580282.html