zoukankan      html  css  js  c++  java
  • selenuim爬虫实战 (下)

    SuperLOFTERDownloader7.java

    package test;
    
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.Timer;
    import java.util.TimerTask;
    import java.util.concurrent.TimeUnit;
    
    import javax.swing.JOptionPane;
    
    import org.openqa.selenium.By;
    import org.openqa.selenium.JavascriptExecutor;
    import org.openqa.selenium.Keys;
    import org.openqa.selenium.WebElement;
    import org.openqa.selenium.firefox.FirefoxBinary;
    import org.openqa.selenium.firefox.FirefoxDriver;
    import org.openqa.selenium.firefox.FirefoxOptions;
    import org.openqa.selenium.interactions.Actions;
    
    public class SuperLOFTERDownloader7 {
        // static String html;
        static int urlCount = 0;
        static String username = "fenchenyue";
        static String url = "http://" + username + ".lofter.com/view";
        static String picUrl;
        static boolean ifToBreak = false;
        static ArrayList<String> urlList = new ArrayList<String>();
        // 注意:目录路径必须以""结尾
        static String downloadToDir = "E:\爬虫\妹纸2\";
    
        public static void main(String[] args) {
            System.setProperty("webdriver.gecko.driver", "C:\Users\Jim\Desktop\GeckoDriver\geckodriver.exe");
    
            FirefoxOptions options = new FirefoxOptions();
    
            // 启动配置"不加载图片"
            options.addPreference("permissions.default.image", 2);
    
            // 启动参数"无界面"
            FirefoxBinary myBinary = new FirefoxBinary();
            myBinary.addCommandLineOptions("--headless");
            options.setBinary(myBinary);
    
            FirefoxDriver driver = new FirefoxDriver(options);
            driver.manage().timeouts().implicitlyWait(16, TimeUnit.SECONDS); // 隐式等待
            driver.get(url);
    
            // ((JavascriptExecutor)
            // driver).executeScript("document.querySelector('.m-txtsch').style.display="inline"");
    
            int givenNumber = Integer.parseInt(driver.findElementByCssSelector(
                    "body > div.g-bdfull.g-bdfull-show.ztag > div.g-bdc.ztag > div.m-fbar.f-cb > div.schbtn.f-cb > div:nth-child(1) > div > div.txt > a.ztag.currt > span")
                    .getAttribute("innerHTML"));
    
            Actions action = new Actions(driver);
    
            Timer timer = new Timer();
            timer.schedule(new TimerTask() {
    
                @Override
                public void run() {
                    // TODO Auto-generated method stub
                    action.sendKeys(/* driver.findElement(By.cssSelector("body")), */ Keys.END).perform();
                }
            }, 1, 700);
    
            /*
             * WebDriverWait wait = new WebDriverWait(driver, 256, 2048);
             * wait.until(ExpectedConditions .numberOfElementsToBe(By.
             * cssSelector("div.ztag > div.m-filecnt.m-filecnt-1 > ul > li"), number));
             */
    
            new Thread(new Runnable() {
    
                @Override
                public void run() {
                    // TODO Auto-generated method stub
                    int flag = JOptionPane.showConfirmDialog(null, "正在收集资源...
    如果觉得时间过长,点击'是'提前中断", "是否中断",
                            JOptionPane.YES_NO_OPTION);
                    if (flag == JOptionPane.YES_OPTION) {
                        ifToBreak = true;
                    }
                }
            }).start();
    
            // 周期查询文档是否加载完或者用户选择中断
            String js = "let count=0;document.querySelectorAll('.g-bdc > div:nth-child(3) > div.m-filecnt.m-filecnt-1 > ul').forEach(function(e,i){count+=e.children.length});return count;";
            while (true) {
    
                long countFromJs = (long) ((JavascriptExecutor) driver).executeScript(js);
                System.out.println("已收集到资源数 : " + countFromJs);
                if (Math.abs((int) countFromJs - givenNumber) < 5 || ifToBreak) {
                    break;
                }
                try {
                    Thread.sleep(1600);
                } catch (InterruptedException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
            }
            timer.cancel();
    
            for (WebElement element : driver.findElements(
                    By.cssSelector("div.ztag > div.m-filecnt.m-filecnt-1 > ul > li > a > div > div > img.realimg"))) {
                picUrl = element.getAttribute("src").split("\?")[0];
                System.out.println(picUrl);
                urlList.add(picUrl);
                urlCount++;
    
            }
    
            System.out.println("爬取到" + givenNumber + "篇文章中的" + urlCount + "张图的url");
    
            int flag = JOptionPane.showConfirmDialog(null, "核对url,是否开始下载?", "是否继续", JOptionPane.YES_NO_OPTION);
            if (flag == JOptionPane.NO_OPTION) {
                System.exit(0);
            } else if (flag == JOptionPane.YES_OPTION) {
                System.out.println("正在下载.....");
                try {
                    Runtime.getRuntime().exec("explorer " + downloadToDir);
                } catch (IOException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
                System.out.println("完成,成功下载了" + DownloadFromUrlList3.download(urlList, downloadToDir) + "张图");
                System.out.println("失败数:" + DownloadFromUrlList3.errCount);
            }
            // JOptionPane.showMessageDialog(null, "浏览器可以关闭了吗?");
            driver.quit();
            System.exit(0);
        }
    }

    DownloadFromUrlList3.java

    package test;
    
    import java.io.BufferedInputStream;
    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.OutputStream;
    import java.net.HttpURLConnection;
    import java.net.URL;
    import java.util.ArrayList;
    
    public class DownloadFromUrlList3 {
        static private int count = 0;
        static ThreadGroup myThreadGroup = new ThreadGroup("myGroup");
        static picThread myThread;
        public static int errCount = 0;
        static long timeOut = 8 * 1000;
    
        public static synchronized int updateCount() {
            count++;
            return count;
        };
    
        public static int download(ArrayList<String> urlList, String dirPath) {
            if (new File(dirPath).exists()) {
                if (!new File(dirPath).isDirectory()) {
                    System.out.println("ERROR!! THE PATH GIVEN ISN'T A DIRECTORY");
                    return 0;
                }
            } else {
                new File(dirPath).mkdir();
            }
    
            for (String urlstr : urlList) {
                String myUrlStr = urlstr;
                myThread = new picThread(myThreadGroup, new Runnable() {
    
                    @Override
                    public void run() {
                        // TODO Auto-generated method stub
                        try {
                            URL url = new URL(myUrlStr);
                            BufferedInputStream is;
                            try {
                                // 网络流量一定要用高效的buffered
                                is = new BufferedInputStream(url.openStream());
                            } catch (Exception e) {
                                // TODO Auto-generated catch block
                                System.out.println("发现一个url资源出错:
    " + myUrlStr);
                                errCount++;
                                return;
                            }
                            String extention = "." + HttpURLConnection.guessContentTypeFromStream(is).split("/")[1];
                            File file = new File(dirPath + updateCount() + extention);
                            ((picThread) Thread.currentThread()).file = file;
                            file.createNewFile();
                            OutputStream os = new FileOutputStream(file);
                            int len;
                            byte[] buffer = new byte[1024];
                            while ((len = is.read(buffer)) != -1) {
                                os.write(buffer, 0, len);
                            }
                            is.close();
                            os.close();
                        } catch (IOException e) {
                            // TODO Auto-generated catch block
                            e.printStackTrace();
                        }
    
                    }
                });
                myThread.urlStr = myUrlStr;
                myThread.start();
                // threadList.add(myThread);
                try {
                    myThread.join(timeOut);
                } catch (InterruptedException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
            }
    
            /*
             * for (Thread iThread : threadList) { try { // 等待所有线程执行完毕 iThread.join(1000 *
             * 15); } catch (InterruptedException e) { e.printStackTrace(); } }
             */
            try {
                Thread.sleep(1024);
            } catch (InterruptedException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            Thread[] activeThreads = new Thread[myThreadGroup.activeCount()];
            myThreadGroup.enumerate(activeThreads);
            for (Thread thread : activeThreads) {
                System.out.println("一张图片下载超时 :" + ((picThread) thread).file.getName() + "
    " + ((picThread) thread).urlStr);
                errCount++;
                // ((picThread) thread).file.delete();
            }
            return count - myThreadGroup.activeCount();
        }
    
        // 使用示例:
        public static void main(String[] args) {
            ArrayList<String> list = new ArrayList<String>();
            list.add(
                    "http://imglf1.nosdn.127.net/img/WnhZUEZYSVNPd1lsd1doNjVPSWVkREFwNTRrNTdUM2tQYkk2bHVabHpIYjJOZVFFeEdaUWRnPT0.jpg");
            list.add("http://imgsrc.baidu.com/imgad/pic/item/b58f8c5494eef01f2c2e59feebfe9925bc317dd6.jpg");
            list.add(
                    "http://imglf1.nosdn.127.net/img/WnhZUEZYSVNPd1p2Wk54NlRNMTZKLzJnci9HanJsbUNFUTlJdWdFaDhJQUlQQ3h5Y1kzOFlRPT0.jpg");
            list.add("http://b.hiphotos.baidu.com/image/pic/item/7a899e510fb30f247b237cc9c195d143ac4b03ba.jpg");
            System.out.println("完成,成功下载了:" + download(list, "C:\Users\Jim\Desktop\testPic\"));
            System.out.println("失败:" + DownloadFromUrlList3.errCount);
            System.exit(0);
        }
    
    }
    
    class picThread extends Thread {
        public String urlStr = "";
        public File file;
        /*
         * public void setUrl(String url) { urlStr = url; }
         * 
         * public String getUrl() { return urlStr; }
         */
    
        public picThread(ThreadGroup group, Runnable run) {
            super(group, run);
        }
    }
  • 相关阅读:
    job 定时任务的五种创建方式
    一步步实现 Redis 搜索引擎
    数据库第一二三范式
    MongoDB数组更新操作$addToSet和$each修饰符
    V8 执行 JavaScript 的过程
    servicebestpractice项目的更新
    公主连结过root检测-frida
    android使用AsyncHttpClient发送请求
    js检测dom元素的变化
    安卓手机关闭防火墙命令
  • 原文地址:https://www.cnblogs.com/jinhengyu/p/10257809.html
Copyright © 2011-2022 走看看