zoukankan      html  css  js  c++  java
  • JSOUP 暴力爬取实验

    jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。

    <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.13.1</version>
    </dependency>
    

    下面用这个工具来暴力获取一个视频网站上各种视频的基本信息。

    建立获取信息的实体类,以便后期可以存入数据库

    package cn.haidnor.movie;
    
    import lombok.Data;
    
    import java.util.List;
    
    /**
     * Plain data holder for the information scraped from one video page,
     * kept as a separate entity so it can later be stored in a database.
     * Accessors such as {@code setUrl}/{@code setName} are not written by hand;
     * they come from Lombok's {@code @Data} annotation.
     */
    @Data
    public class Movie {
        /** URL of the page the data was scraped from. */
        private String url;
        /** Title of the film. */
        private String name;
        /** Release year / era, kept as the raw text shown on the page. */
        private String years;
        /** Country of origin. */
        private String country;
        /** Running time, kept as the raw text shown on the page. */
        private String minute;
        /** Genres / categories. */
        private List<String> types;
        /** Directors. */
        private List<String> director;
        /** Leading performers. */
        private List<String> performers;
        /** Long description of the film. */
        private String details;
    }
    
    

    编写爬取工具

    package cn.haidnor.movie;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    import java.io.*;
    import java.net.URL;
    import java.net.URLConnection;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.concurrent.atomic.AtomicInteger;
    
    /**
     * Scraper for the video detail pages of https://www.qsptv.net/.
     * <p>
     * {@link #getMovie(String, int)} fetches one page, copies the fields it can
     * find into a {@link Movie} and downloads the cover picture. Page elements
     * that are missing are skipped (the corresponding {@code Movie} property is
     * left unset) instead of aborting the whole page.
     * <p>
     * The class keeps one piece of shared state, a request counter, which is
     * atomic so that instances may be used from several threads at once.
     *
     * @author haidnor
     */
    public class QsptvReptile {

        /** Timeout in milliseconds, used for the page request and the picture download. */
        private static final int TIMEOUT = 8000;

        /** Browser user agent sent with every request. */
        private static final String USER_AGENT =
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36";

        /** Directory the cover pictures are written to. */
        private static final String PICTURE_DIR = "D:/picture";

        /**
         * Number of pages requested so far; its value is appended to the
         * {@code x-forwarded-for} header so that the header differs per request.
         */
        private static final AtomicInteger REQUEST_COUNTER = new AtomicInteger();

        /**
         * Fetches a video page and extracts its data.
         *
         * @param url address of the video page
         * @param id  numeric id of the video; used as the file name of the cover picture
         * @return a {@code Movie} with the URL and every field that could be found on the
         *         page; only the URL is set when the page has no {@code zanpian-score} element
         * @throws Exception if the page cannot be fetched or the picture cannot be downloaded
         */
        public Movie getMovie(String url, int id) throws Exception {
            int sequence = REQUEST_COUNTER.incrementAndGet();
            Document doc = Jsoup.connect(url)
                    .userAgent(USER_AGENT)
                    .header("x-forwarded-for", "1423" + sequence)
                    .timeout(TIMEOUT).get();

            Movie movie = new Movie();
            movie.setUrl(url);

            // Root element that holds the film information.
            Element root = doc.getElementById("zanpian-score");
            if (root == null) {
                return movie;
            }

            // NOTE: the class strings below are copied verbatim from the page markup
            // (including the double space in one of them) and must not be normalised.

            // Title
            Element name = root.getElementsByTag("h1").first();
            if (name != null) {
                movie.setName(name.text());
            }

            // Year: second matching cell; its <span> label is removed before reading the text.
            Element years = elementAt(root.getElementsByClass("col-md-6 col-sm-6 col-xs-6 text hidden-xs"), 1);
            if (years != null) {
                years.getElementsByTag("span").remove();
                movie.setYears(years.text());
            }

            // Country: first link inside the first matching cell.
            Element countryCell = elementAt(root.getElementsByClass("col-md-6 col-sm-6 col-xs-4 text hidden-xs"), 0);
            if (countryCell != null) {
                Element country = countryCell.getElementsByTag("a").first();
                if (country != null) {
                    movie.setCountry(country.text());
                }
            }

            // Running time: the <span> label is removed before reading the text.
            Element minute = elementAt(root.getElementsByClass("col-md-6 col-sm-6 col-xs-12  text"), 0);
            if (minute != null) {
                minute.getElementsByTag("span").remove();
                movie.setMinute(minute.text());
            }

            // Genres (first "col-md-12 text" cell) and performers (second one).
            Elements wideCells = root.getElementsByClass("col-md-12 text");
            Element typeCell = elementAt(wideCells, 0);
            if (typeCell != null) {
                movie.setTypes(linkTexts(typeCell));
            }
            Element performerCell = elementAt(wideCells, 1);
            if (performerCell != null) {
                movie.setPerformers(linkTexts(performerCell));
            }

            // Directors: links inside the second matching cell.
            Element directorCell = elementAt(root.getElementsByClass("col-md-6 col-sm-6 col-xs-12 text hidden-xs"), 1);
            if (directorCell != null) {
                movie.setDirector(linkTexts(directorCell));
            }

            // Description: everything before the last "全视频TV" marker, or the
            // whole text when the marker is absent.
            Element detailsElement = doc.getElementsByClass("details-content").last();
            if (detailsElement != null) {
                String text = detailsElement.text();
                int cut = text.lastIndexOf("全视频TV");
                movie.setDetails(cut >= 0 ? text.substring(0, cut) : text);
            }

            // Cover picture: its address is taken from the url(...) part of the inline style.
            Element picture = doc.getElementsByClass("video-pic").first();
            if (picture != null) {
                String pictureUrl = extractCssUrl(picture.attr("style"));
                if (pictureUrl != null) {
                    downloadPicture(pictureUrl, id);
                }
            }

            return movie;
        }

        /** Returns the element at {@code index}, or {@code null} when the list is too short. */
        private static Element elementAt(Elements elements, int index) {
            return index < elements.size() ? elements.get(index) : null;
        }

        /** Collects the text of every {@code <a>} element found inside {@code cell}. */
        private static List<String> linkTexts(Element cell) {
            List<String> texts = new ArrayList<>();
            for (Element link : cell.getElementsByTag("a")) {
                texts.add(link.text());
            }
            return texts;
        }

        /**
         * Returns the text between the first {@code (} and the following {@code )} of
         * {@code style}, without surrounding whitespace or quotes, or {@code null}
         * when there is no such non-empty text.
         */
        private static String extractCssUrl(String style) {
            int open = style.indexOf('(');
            int close = open < 0 ? -1 : style.indexOf(')', open + 1);
            if (close < 0) {
                return null;
            }
            String value = style.substring(open + 1, close).trim();
            if (value.length() >= 2) {
                char first = value.charAt(0);
                char last = value.charAt(value.length() - 1);
                if ((first == '"' || first == '\'') && first == last) {
                    value = value.substring(1, value.length() - 1);
                }
            }
            return value.isEmpty() ? null : value;
        }

        /**
         * Downloads a picture to {@code D:/picture/<id>.jpg}, creating the directory if needed.
         *
         * @param pictureUrl address of the picture
         * @param id         id of the video; becomes the file name
         * @throws Exception if the directory cannot be created, the URL is malformed,
         *                   or the transfer fails
         */
        static void downloadPicture(String pictureUrl, int id) throws Exception {
            File directory = new File(PICTURE_DIR);
            // mkdirs() first, isDirectory() second: another thread may create the directory concurrently.
            if (!directory.mkdirs() && !directory.isDirectory()) {
                throw new IOException("Cannot create picture directory: " + directory);
            }
            File target = new File(directory, id + ".jpg");

            URLConnection connection = new URL(pictureUrl).openConnection();
            connection.setRequestProperty("User-agent", USER_AGENT);
            connection.setRequestProperty("x-forwarded-for", "143" + REQUEST_COUNTER.get());
            // Without timeouts a stalled server would block the calling thread forever.
            connection.setConnectTimeout(TIMEOUT);
            connection.setReadTimeout(TIMEOUT);

            // Stream straight to the file; both streams are closed even when the copy fails.
            try (InputStream in = connection.getInputStream();
                 OutputStream out = new FileOutputStream(target)) {
                byte[] buffer = new byte[8192];
                int length;
                while ((length = in.read(buffer)) != -1) {
                    out.write(buffer, 0, length);
                }
            }
        }

        public static void main(String[] args) throws Exception {
            new QsptvReptile().getMovie("https://www.qsptv.net/show-2169.html", 2169);
        }
    }
    
    

    获取的元素需要自己去查看 HTML 来选择

    开始爬取数据,这里开20个线程来获取数据

    package cn.haidnor.movie;
    
    /**
     * Crawl job that walks a range of numeric video ids and scrapes the page of
     * each one with {@link QsptvReptile}.
     * <p>
     * One instance is meant to be shared by several threads (see {@link #main});
     * ids are handed out through a synchronized method, so every id in the range
     * is processed exactly once and the id passed to the scraper always matches
     * the URL that was built from it.
     */
    public class Reptile implements Runnable {

        /** Value returned by {@link #nextId()} once the range is exhausted. */
        private static final int NO_MORE_IDS = -1;

        /** Number of worker threads started by {@link #main}. */
        private static final int THREAD_COUNT = 20;

        // Next id to fetch (the lowest resource id on the site is 1).
        private int index = 100;
        // Last id to fetch, inclusive (the highest resource id on the site is 83877).
        private int max = 9000;

        static String urlPrefix = "https://www.qsptv.net/show-";
        static String urlPostfix = ".html";

        /**
         * Claims the next id of the range.
         *
         * @return the claimed id, or {@link #NO_MORE_IDS} when all ids up to {@code max} were handed out
         */
        private synchronized int nextId() {
            if (index > max) {
                return NO_MORE_IDS;
            }
            return index++;
        }

        /**
         * Fetches pages until the id range is exhausted. A failure on one page is
         * reported on {@code System.err} and does not stop the loop.
         */
        @Override
        public void run() {
            int id;
            while ((id = nextId()) != NO_MORE_IDS) {
                // Build the URL from the claimed id, never from the shared field,
                // so the URL and the id given to the scraper cannot diverge.
                String url = urlPrefix + id + urlPostfix;
                try {
                    Movie movie = new QsptvReptile().getMovie(url, id);
                    System.out.println(movie);
                } catch (Exception e) {
                    System.err.println("GET FAILED: " + url + " (" + e + ")");
                }
            }
        }

        public static void main(String[] args) {
            Reptile reptile = new Reptile();
            for (int i = 0; i < THREAD_COUNT; i++) {
                new Thread(reptile).start();
            }
        }

    }
    

    可见获取数据的速度是非常快的。这个就是不做好网络安全的后果。服务器的压力会非常大。

  • 相关阅读:
    QComboBox设置item height(行高)
    QTabWidget隐藏边框,QWebView/QWebFrame隐藏滚动条
    qt 提示 undefined reference to `vtable for XXX ' 的另一种可能性
    linux double buffering
    http 头信息详解(转载,出处已忘)
    php 魔术方法
    新手使用linux (1)
    关于chm提示 已取消到该网页的导航的解决方法(转载,忘记出处)
    redis 中文文档
    php PDO (转载)
  • 原文地址:https://www.cnblogs.com/Haidnor/p/13639717.html
Copyright © 2011-2022 走看看