zoukankan      html  css  js  c++  java
  • 使用Jsoup 爬取网易首页所有的图片

    package com.enation.newtest;
    
    import java.io.File;
    import java.io.FileNotFoundException;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.net.HttpURLConnection;
    import java.net.URL;
    import java.net.URLConnection;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    import org.apache.commons.lang3.StringEscapeUtils;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    
    // 爬取网易首页所有图片
    public class Jsoup163 {
        
        public static void main(String[] args) throws Exception{
            String downloadPath = "D:\360Downloads\test";
            List<String> list = nameList("网易--首页");
            getPictures(list,1,downloadPath); //1代表下载一页,一页一般有30张图片
        }
        
        public static void getPictures(List<String> keywordList, int max,String downloadPath) throws Exception{ // key为关键词,max作为爬取的页数
            String gsm=Integer.toHexString(max)+"";
            String finalURL = "";
            String tempPath = "";
            for(String keyword : keywordList){
                tempPath = downloadPath;
                if(!tempPath.endsWith("\")){
                           tempPath = downloadPath+"\";
                }
                tempPath = tempPath+keyword+"\";
                File f = new File(tempPath);
                if(!f.exists()){
                    f.mkdirs();
                }
                int picCount = 1;
                for(int page=1;page<=max;page++) { 
                    sop("正在下载第"+page+"页面");
                    Document document = null;
                    try {
                        String url ="http://www.163.com/";
                        sop(url);
                        document = Jsoup.connect(url).data("query", "Java")//请求参数  
                                 .userAgent("Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)")//设置urer-agent  get();
                                 .timeout(5000)
                                 .get();
                        String xmlSource = document.toString();
                        xmlSource = StringEscapeUtils.unescapeHtml3(xmlSource);
                        //sop(xmlSource);
                        String reg = "<img.*src=(.*?)[^>]*?>";
                        String reg2 = "src\s*=\s*"?(.*?)("|>|\s+)";
                        String reg2datasrc = "data-src\s*=\s*"?(.*?)("|>|\s+)";
                        
                        Pattern pattern = Pattern.compile(reg);
                        Pattern pattern2 = Pattern.compile(reg2);
                        Pattern pattern2datasrc = Pattern.compile(reg2datasrc);
                        
                        Matcher m = pattern.matcher(xmlSource);
                        while (m.find()){
                            finalURL = m.group();
                            System.out.println(finalURL);
                            Matcher m2 = null;
                            if(finalURL.indexOf("data-src")>0){
                                m2 = pattern2datasrc.matcher(finalURL);
                            }else {
                                m2 = pattern2.matcher(finalURL);
                            }
                            if(m2.find()){
                                finalURL = m2.group(1);
                                System.out.println(finalURL);
                                if(finalURL.startsWith("http")){
                                    sop(keyword+picCount+++":"+finalURL);
                                    download(finalURL,tempPath);
                                    sop("             下载成功");
                                }
                            }
                        }
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
           }
           sop("下载完毕");
           delMultyFile(downloadPath);
           sop("已经删除所有空图");
        }
        public static void delMultyFile(String path){
            File file = new File(path);
            if(!file.exists())
                throw new RuntimeException("File ""+path+"" NotFound when excute the method of delMultyFile()....");
            File[] fileList = file.listFiles();
            File tempFile=null;
            for(File f : fileList){
                if(f.isDirectory()){
                    delMultyFile(f.getAbsolutePath());
                }else{
                    if(f.length()==0)
                        sop(f.delete()+"---"+f.getName());
                }
            }
        }
        public static List<String> nameList(String nameList){
            List<String> arr = new ArrayList<String>();
            String[] list;
            if(nameList.contains(","))
                list= nameList.split(",");
            else if(nameList.contains("、"))
                list= nameList.split("、");
            else if(nameList.contains(" "))
                list= nameList.split(" ");
            else{
                arr.add(nameList);
                return arr;
            }
            for(String s : list){
                arr.add(s);
            }
            return arr;
        }
        public static void sop(Object obj){
            System.out.println(obj);
        }
        //根据图片网络地址下载图片
          public static void download(String url,String path){
              //path = path.substring(0,path.length()-2);
              File file= null;
              File dirFile=null;
              FileOutputStream fos=null;
              HttpURLConnection httpCon = null;
              URLConnection  con = null;
              URL urlObj=null;
              InputStream in =null;
              byte[] size = new byte[1024];
              int num=0;
              try {
                  String downloadName= url.substring(url.lastIndexOf("/")+1);
                  dirFile = new File(path);
                  if(!dirFile.exists() && path.length()>0){
                      if(dirFile.mkdir()){
                          sop("creat document file ""+path.substring(0,path.length()-1)+"" success...
    ");
                      }
                  }else{
                      file = new File(path+downloadName);
                      fos = new FileOutputStream(file);
                      if(url.startsWith("http")){
                          urlObj = new URL(url);
                          con = urlObj.openConnection();
                          httpCon =(HttpURLConnection) con;
                          int  responseCode = httpCon.getResponseCode();
                          if(responseCode == 200){
                              in = httpCon.getInputStream();
                              while((num=in.read(size)) != -1){
                                  for(int i=0;i<num;i++)
                                      fos.write(size[i]);
                              }
                          }else {
                            System.out.println("状态码:"+responseCode+" 地址:"+url);
                        }
                      }
                  }
              }catch (FileNotFoundException notFoundE) {
                  sop("找不到该网络图片....");
              }catch(NullPointerException nullPointerE){
                  sop("找不到该网络图片....");
              }catch(IOException ioE){
                  sop("产生IO异常.....");
              }catch (Exception e) {
                  e.printStackTrace();
              }finally{
                  try {
                      if(fos!=null){
                          fos.close();
                      }
                  } catch (Exception e) {
                      e.printStackTrace();
                  }
              }
          }
    }

    其中,关键点在于两个正则表达式:一个用于匹配图片的 img 标签,另一个用于从标签中提取图片的链接地址:

    String reg = "<img.*src=(.*?)[^>]*?>";
    String reg2 = "src\\s*=\\s*\"?(.*?)(\"|>|\\s+)";

     运行结果:

  • 相关阅读:
    【crontab】误删crontab及其恢复
    New Concept English there (7)
    New Concept English there (6)
    New Concept English there (5)
    New Concept English there (4)
    New Concept English there (3)
    New Concept English there (2)Typing speed exercise
    New Concept English there (1)Typing speed exercise
    New Concept English Two 34 game over
    New Concept English Two 33 94
  • 原文地址:https://www.cnblogs.com/jiafuwei/p/6089082.html
Copyright © 2011-2022 走看看