zoukankan      html  css  js  c++  java
  • 使用Jsoup爬取网站图片

      1 package com.test.pic.crawler;
      2 
      3 import java.io.File;
      4 import java.io.FileOutputStream;
      5 import java.io.IOException;
      6 import java.io.InputStream;
      7 import java.io.OutputStream;
      8 import java.net.URL;
      9 import java.net.URLConnection;
     10 import java.util.Arrays;
     11 import java.util.HashSet;
     12 import java.util.List;
     13 import java.util.Set;
     14 import java.util.concurrent.BlockingQueue;
     15 import java.util.concurrent.LinkedBlockingDeque;
     16 import java.util.concurrent.ScheduledExecutorService;
     17 import java.util.concurrent.ScheduledThreadPoolExecutor;
     18 import org.apache.commons.lang3.concurrent.BasicThreadFactory;
     19 import org.jsoup.Jsoup;
     20 import org.jsoup.nodes.*;
     21 import org.jsoup.select.Elements;
     22 
     23 
     24 
     25 
     26 /**   
     27  * @Title: PicCrawler.java 
     28  *
     29  * @Package com.test.pic.crawler 
     30  *
     31  * @Description: 爬取指定网站的指定Tag下的图片或者全部Tag图片 
     32  *
     33  * @author CoderZZ   
     34  *
     35  * @date 2018年1月12日 下午11:22:41 
     36  *
     37  * @version V1.0   
     38  *
     39  */
     40 public class PicCrawler implements Runnable{
     41     private static String pathString = "G:/test/pic/";//存储目录
     42     //存储真正的爬取页面
     43     static BlockingQueue<String> urlBlockingQueue = new LinkedBlockingDeque<String>(1000);
     44     static int threadNum = 10;
     45 //    public PicCrawler(String url){
     46 //        this.url = url;
     47 //    }
     48 
     49     /** 
     50      * @Title: main 
     51      *
     52      * @Description: TODO(这里用一句话描述这个方法的作用) 
     53      *
     54      * @param @param args    设定文件 
     55      *
     56      * @return void    返回类型 
     57      *
     58      * @throws 
     59      *
     60      */
     61     public static void main(String[] args) {
     62         String homeurlString = "https://www.xxxx.com";//爬取页面的基本地址
     63         String tagPageUrl = "https://www.xxxx.com/tag/";//tag分页地址
     64         //Tag标签的完整路径
     65         Set<String> tagFullHrefSet = new HashSet<String>(16);
     66         //想要爬取哪些tag,如果为空,则全部爬取;否则只配置对应的tag
     67         String[] crawlerTagArray = {"风景"};
     68         List<String> crawlerTagList = Arrays.asList(crawlerTagArray);
     69         try {
     70             //1.获取想要的tag完整的url
     71             Document tagListDocument = Jsoup.connect(tagPageUrl).get();
     72             Elements tagsListDivElements = tagListDocument.getElementsByClass("tags_list");
     73             for(Element element:tagsListDivElements){
     74                 Elements aElements = element.getElementsByTag("a");
     75                 for(Element a:aElements){
     76                     if(crawlerTagList.size() == 0 || crawlerTagList.contains(a.text())){
     77                         String tagUrlString = homeurlString+a.attr("href");
     78                         //https://www.xxxx.com/tag/fengjing.html
     79                         tagUrlString = tagUrlString.substring(0, tagUrlString.lastIndexOf("."))+"/1.html";
     80                         tagFullHrefSet.add(tagUrlString);
     81                     }
     82                 }
     83             }
     84             //2.获取图片链接页面地址,分页爬取
     85             for(String tagUrl:tagFullHrefSet){
     86                 String tempTagUrlString = tagUrl;
     87                 int currentPageNum = 1;
     88                 while(true){
     89                     try{
     90                         Document imagePageDocument = Jsoup.connect(tempTagUrlString).get();
     91                         Elements imageListElements = imagePageDocument.getElementsByClass("Pli-litpic");
     92                         if(imageListElements.size() == 0){
     93                             break;
     94                         }
     95                         for(Element image:imageListElements){
     96                             urlBlockingQueue.offer(homeurlString+image.attr("href"));
     97                         }
     98                         //https://www.xxxx.com/tag/fengjing/1.html
     99                         tempTagUrlString = tempTagUrlString.substring(0, tempTagUrlString.lastIndexOf("/")+1)+(++currentPageNum)+".html";
    100                     }catch(Exception e){
    101                         break;
    102                     }
    103                 }
    104             }
    105             ScheduledExecutorService excutor = new ScheduledThreadPoolExecutor(threadNum,new BasicThreadFactory.Builder().namingPattern("my-crawler-thread-%d").daemon(false).build());
    106             for(int i=0;i<threadNum;i++){
    107 //                excutor.schedule(new PicCrawler(urlArray[i]), 1, TimeUnit.SECONDS);
    108 //                excutor.execute(new PicCrawler(urlArray[i]));
    109                 excutor.submit(new PicCrawler());
    110             }
    111         } catch (IOException e) {
    112             // TODO Auto-generated catch block
    113             e.printStackTrace();
    114         }
    115     }
    116     @Override
    117     public void run() {
    118         while (true) {
    119             try {
    120                 long begin = System.currentTimeMillis();
    121                 String url = urlBlockingQueue.poll();
    122                 if(null != url){
    123                     Document doc = Jsoup.connect(url).get();
    124                     Elements titleElements =doc.select("#photos > h1");
    125                     if(null != titleElements && null != titleElements.get(0)){
    126                         Set<String> imgSrcSet = new HashSet<String>(16);
    127                         Element titleElement = titleElements.get(0);
    128                         String foldNameString = titleElement.text();
    129                         String[] nameArray = foldNameString.split("\(");
    130                         foldNameString = nameArray[0];
    131                         nameArray = nameArray[1].split("/");
    132                         int totalPaggs = Integer.parseInt(nameArray[1].replace(")", ""));
    133                         for(int i=1;i<=totalPaggs;i++){
    134                             String urlTemp = url.replace(".html", "_"+i+".html");
    135                             Document docTemp = Jsoup.connect(urlTemp).get();
    136                             Element element = docTemp.getElementById("big-pic");
    137                             Elements imgElements = element.getElementsByTag("img");
    138                             for(Element imgElement:imgElements){
    139                                 imgSrcSet.add(imgElement.attr("src"));
    140                             }
    141                         }
    142                         if(imgSrcSet.size()>0){
    143                             for(String imgSrc:imgSrcSet){
    144                                 // 构造URL    
    145                                 URL imgurl = new URL(imgSrc);    
    146                                 // 打开连接    
    147                                 URLConnection con = imgurl.openConnection();    
    148                                 //设置请求超时为10s    
    149                                 con.setConnectTimeout(10*1000);    
    150                                 // 输入流    
    151                                 InputStream is = con.getInputStream();    
    152                                 // 500k的数据缓冲    
    153                                 byte[] bs = new byte[1024*500];    
    154                                 // 读取到的数据长度    
    155                                 int len;    
    156                                 // 输出的文件流    
    157                                 File sf=new File(pathString+"\"+foldNameString);    
    158                                 if(!sf.exists()){    
    159                                     sf.mkdirs();    
    160                                 }
    161                                 String filename = imgSrc.split("/")[imgSrc.split("/").length-1];
    162                                 OutputStream os = new FileOutputStream(sf.getPath()+"\"+filename);    
    163                                 // 开始读取    
    164                                 while ((len = is.read(bs)) != -1) {    
    165                                     os.write(bs, 0, len);    
    166                                 }    
    167                                 // 完毕,关闭所有链接    
    168                                 os.close();    
    169                                 is.close();  
    170                                 System.out.println(imgSrc+"下载完成!!!");
    171                             }
    172                         }
    173                         long end = System.currentTimeMillis();
    174                         System.out.println("================================================================");
    175                         System.out.println(Thread.currentThread().getName()+"******************已全部下载完成,用时:"+((end-begin)/1000)+"S");
    176                     }
    177                 }else{
    178                     System.out.println("========================BlockingQueue已空,已全部抓取完成!=======================");
    179                 }
    180             } catch (Exception e) {
    181                 System.out.println("========================抓取异常=======================");
    182             }
    183         }
    184     }
    185 }
  • 相关阅读:
    dotnet 使用 MessagePack 序列化对象
    dotnet 使用 MessagePack 序列化对象
    PHP die() 函数
    PHP defined() 函数
    PHP define() 函数
    PHP constant() 函数
    PHP connection_status() 函数
    查看物理CPU个数、核数、逻辑CPU个数
    CF997C Sky Full of Stars
    dotnet 使用 lz4net 压缩 Stream 或文件
  • 原文地址:https://www.cnblogs.com/Java-Script/p/11089630.html
Copyright © 2011-2022 走看看