zoukankan      html  css  js  c++  java
  • 一个简单粗暴的爬虫

    1. 必应今日美图

    当使用bing搜索时,每天都会出现一幅美图。

    搜索找到bing今日美图  http://bing.plmeizi.com/ (这里收集了一年多的今日美图) 收集者: http://leil.plmeizi.com/ 

    目前共47页

    url格式按 http://bing.plmeizi.com/?page=*

     点进去就是我们要的图片和名称

    2. 开始编码

    使用简单的Jsoup进行爬虫,很简单,很好理解。

    HtmlUtil

     1 package util;
     2 
     3 import java.io.IOException;
     4 
     5 import org.jsoup.Jsoup;
     6 import org.jsoup.nodes.Document;
     7 
     8 public class HtmlUtil {
     9     // 根据url从网络获取网页文本
    10     public Document getHtmlTextByUrl(String url) {
    11         Document doc = null;
    12         try {
    13             // doc = Jsoup.connect(url).timeout(5000000).get();
    14             int i = (int) (Math.random() * 1000); // 做一个随机延时,防止网站屏蔽
    15             while (i != 0) {
    16                 i--;
    17             }
    18             doc = Jsoup.connect(url).data("query", "Java").userAgent("Mozilla").cookie("auth", "token").timeout(300000)
    19                     .post();
    20         } catch (IOException e) {
    21             e.printStackTrace();
    22             try {
    23                 doc = Jsoup.connect(url).timeout(5000000).get();
    24             } catch (IOException e1) {
    25                 // TODO Auto-generated catch block
    26                 e1.printStackTrace();
    27             }
    28         }
    29         return doc;
    30     }
    31 }

    GetPhoto

    这个编码主要需要先分析html属性,按照属性取到我们需要的元素,元素值。

    我是先拿到每个图的page页面url

    然后到详情页面拿到图的url,图的名字截取。

    然后将图保存到本地。

     1 package bing;
     2 
     3 import java.io.DataInputStream;
     4 import java.io.File;
     5 import java.io.FileOutputStream;
     6 import java.io.IOException;
     7 import java.net.URL;
     8 
     9 import org.jsoup.nodes.Document;
    10 import org.jsoup.nodes.Element;
    11 import org.jsoup.select.Elements;
    12 
    13 import util.HtmlUtil;
    14 
    15 /**
    16  *
    17  * @author loveincode
    18  * @data Sep 29, 2017 1:15:00 PM
    19  */
    20 public class GetPhoto {
    21 
    22     public static void go(int startpage, int endpage) throws IOException {
    23 
    24         HtmlUtil htmlutil = new HtmlUtil();
    25         // 获取图片的绝对路径
    26         String url = "http://bing.plmeizi.com/?page=";
    27         for (int i = startpage; i <= endpage; i++) {
    28             String gourl = url + i + "";
    29             Document dochtml = htmlutil.getHtmlTextByUrl(gourl);
    30             Elements elements_a = dochtml.getElementsByClass("item");
    31             for (int x = 0; x < elements_a.size(); x++) {
    32                 String pyotopage = elements_a.get(x).attr("href");
    33                 Document dochtml_photo = htmlutil.getHtmlTextByUrl(pyotopage);
    34                 Element elements_picurl = dochtml_photo.getElementById("picurl");
    35                 String picurl = elements_picurl.attr("href");
    36                 Element elements_searchlink = dochtml_photo.getElementById("searchlink");
    37                 String name = elements_searchlink.getElementsByTag("span").get(0).html();
    38                 name = name.split("\(")[0];
    39 
    40                 if (picurl.contains("jpg")) {
    41                     // 下载图片
    42                     URL url_pic = new URL(picurl);
    43                     DataInputStream dataInputStream = new DataInputStream(url_pic.openStream());
    44                     String imageName = name + ".jpg";
    45                     FileOutputStream fileOutputStream = new FileOutputStream(new File("bing_pic/" + imageName));
    46                     byte[] buffer = new byte[1024];
    47                     int length;
    48                     while ((length = dataInputStream.read(buffer)) > 0) {
    49                         fileOutputStream.write(buffer, 0, length);
    50                     }
    51                     dataInputStream.close();
    52                     fileOutputStream.close();
    53                 }
    54             }
    55         }
    56 
    57     }
    58 
    59     public static void main(String[] args) throws IOException {
    60         System.out.println("test");
    61         go(1, 1);
    62     }
    63 
    64 }

    Mythread

     1 package bing;
     2 
     3 import java.io.IOException;
     4 
     5 public class Mythread extends Thread {
     6 
     7     private int startpage;
     8 
     9     private int endpage;
    10 
    11     public Mythread(int startpage, int endpage) {
    12         this.startpage = startpage;
    13         this.endpage = endpage;
    14     }
    15 
    16     @SuppressWarnings("static-access")
    17     @Override
    18     public void run() {
    19         GetPhoto getPhoto = new GetPhoto();
    20         try {
    21             getPhoto.go(startpage, endpage);
    22         } catch (IOException e) {
    23             // TODO Auto-generated catch block
    24             e.printStackTrace();
    25         }
    26     }
    27 }RUN

    RUN

    采用多线程,开启多个线程同时爬取图片

     1 package bing;
     2 
     3 import java.io.IOException;
     4 
     5 /**
     6  *
     7  * @author loveincode
     8  * @data Sep 29, 2017 1:55:57 PM
     9  */
    10 public class RUN {
    11 
    12     public static void main(String[] args) throws IOException {
    13 
    14         long startTime = System.currentTimeMillis(); // 获取开始时间
    15 
    16         Mythread a1 = new Mythread(1, 5);
    17         Mythread a2 = new Mythread(6, 10);
    18         Mythread a3 = new Mythread(11, 15);
    19         Mythread a4 = new Mythread(16, 20);
    20         Mythread a5 = new Mythread(21, 25);
    21         Mythread a6 = new Mythread(26, 30);
    22         Mythread a7 = new Mythread(31, 35);
    23         Mythread a8 = new Mythread(36, 40);
    24         Mythread a9 = new Mythread(41, 45);
    25         Mythread a10 = new Mythread(46, 47);
    26 
    27         a1.start();
    28         a2.start();
    29         a3.start();
    30         a4.start();
    31         a5.start();
    32         a6.start();
    33         a7.start();
    34         a8.start();
    35         a9.start();
    36         a10.start();
    37 
    38         while (true) {
    39             if (a1.isAlive() == false && a2.isAlive() == false && a3.isAlive() == false && a4.isAlive() == false
    40                     && a5.isAlive() == false && a6.isAlive() == false && a7.isAlive() == false && a8.isAlive() == false
    41                     && a9.isAlive() == false && a10.isAlive() == false) {
    42                 long endTime = System.currentTimeMillis(); // 获取结束时间
    43                 System.out.println("程序运行时间: " + (endTime - startTime) / 1000.0 + "s");
    44                 break;
    45             }
    46         }
    47     }
    48 
    49 }

    执行 RUN 

    耗时76.962s 完成图片下载到本地。

    成功

    效果:

    很高清吧

  • 相关阅读:
    redis缓存穿透
    rocketmq配置文件两主两从
    jvm参数模板
    (转)volatile如何保证可见性
    Spring事务传播性与隔离级别
    Redis windows 远程连接配置修改
    Redis安装与配置( Windows10 或Windows server)
    C#中的虚函数及继承关系
    C#高级功能(三)Action、Func,Tuple
    WAMP配置httpd.conf允许外部访问
  • 原文地址:https://www.cnblogs.com/loveincode/p/jsoup_bingpic.html
Copyright © 2011-2022 走看看