zoukankan      html  css  js  c++  java
  • 使用Jsoup 爬取网易首页所有的图片

    package com.enation.newtest;
    
    import java.io.File;
    import java.io.FileNotFoundException;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.net.HttpURLConnection;
    import java.net.URL;
    import java.net.URLConnection;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    import org.apache.commons.lang3.StringEscapeUtils;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    
    // 爬取网易首页所有图片
    public class Jsoup163 {
        
        public static void main(String[] args) throws Exception{
            String downloadPath = "D:\360Downloads\test";
            List<String> list = nameList("网易--首页");
            getPictures(list,1,downloadPath); //1代表下载一页,一页一般有30张图片
        }
        
        public static void getPictures(List<String> keywordList, int max,String downloadPath) throws Exception{ // key为关键词,max作为爬取的页数
            String gsm=Integer.toHexString(max)+"";
            String finalURL = "";
            String tempPath = "";
            for(String keyword : keywordList){
                tempPath = downloadPath;
                if(!tempPath.endsWith("\")){
                           tempPath = downloadPath+"\";
                }
                tempPath = tempPath+keyword+"\";
                File f = new File(tempPath);
                if(!f.exists()){
                    f.mkdirs();
                }
                int picCount = 1;
                for(int page=1;page<=max;page++) { 
                    sop("正在下载第"+page+"页面");
                    Document document = null;
                    try {
                        String url ="http://www.163.com/";
                        sop(url);
                        document = Jsoup.connect(url).data("query", "Java")//请求参数  
                                 .userAgent("Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)")//设置urer-agent  get();
                                 .timeout(5000)
                                 .get();
                        String xmlSource = document.toString();
                        xmlSource = StringEscapeUtils.unescapeHtml3(xmlSource);
                        //sop(xmlSource);
                        String reg = "<img.*src=(.*?)[^>]*?>";
                        String reg2 = "src\s*=\s*"?(.*?)("|>|\s+)";
                        String reg2datasrc = "data-src\s*=\s*"?(.*?)("|>|\s+)";
                        
                        Pattern pattern = Pattern.compile(reg);
                        Pattern pattern2 = Pattern.compile(reg2);
                        Pattern pattern2datasrc = Pattern.compile(reg2datasrc);
                        
                        Matcher m = pattern.matcher(xmlSource);
                        while (m.find()){
                            finalURL = m.group();
                            System.out.println(finalURL);
                            Matcher m2 = null;
                            if(finalURL.indexOf("data-src")>0){
                                m2 = pattern2datasrc.matcher(finalURL);
                            }else {
                                m2 = pattern2.matcher(finalURL);
                            }
                            if(m2.find()){
                                finalURL = m2.group(1);
                                System.out.println(finalURL);
                                if(finalURL.startsWith("http")){
                                    sop(keyword+picCount+++":"+finalURL);
                                    download(finalURL,tempPath);
                                    sop("             下载成功");
                                }
                            }
                        }
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
           }
           sop("下载完毕");
           delMultyFile(downloadPath);
           sop("已经删除所有空图");
        }
        public static void delMultyFile(String path){
            File file = new File(path);
            if(!file.exists())
                throw new RuntimeException("File ""+path+"" NotFound when excute the method of delMultyFile()....");
            File[] fileList = file.listFiles();
            File tempFile=null;
            for(File f : fileList){
                if(f.isDirectory()){
                    delMultyFile(f.getAbsolutePath());
                }else{
                    if(f.length()==0)
                        sop(f.delete()+"---"+f.getName());
                }
            }
        }
        public static List<String> nameList(String nameList){
            List<String> arr = new ArrayList<String>();
            String[] list;
            if(nameList.contains(","))
                list= nameList.split(",");
            else if(nameList.contains("、"))
                list= nameList.split("、");
            else if(nameList.contains(" "))
                list= nameList.split(" ");
            else{
                arr.add(nameList);
                return arr;
            }
            for(String s : list){
                arr.add(s);
            }
            return arr;
        }
        public static void sop(Object obj){
            System.out.println(obj);
        }
        //根据图片网络地址下载图片
          public static void download(String url,String path){
              //path = path.substring(0,path.length()-2);
              File file= null;
              File dirFile=null;
              FileOutputStream fos=null;
              HttpURLConnection httpCon = null;
              URLConnection  con = null;
              URL urlObj=null;
              InputStream in =null;
              byte[] size = new byte[1024];
              int num=0;
              try {
                  String downloadName= url.substring(url.lastIndexOf("/")+1);
                  dirFile = new File(path);
                  if(!dirFile.exists() && path.length()>0){
                      if(dirFile.mkdir()){
                          sop("creat document file ""+path.substring(0,path.length()-1)+"" success...
    ");
                      }
                  }else{
                      file = new File(path+downloadName);
                      fos = new FileOutputStream(file);
                      if(url.startsWith("http")){
                          urlObj = new URL(url);
                          con = urlObj.openConnection();
                          httpCon =(HttpURLConnection) con;
                          int  responseCode = httpCon.getResponseCode();
                          if(responseCode == 200){
                              in = httpCon.getInputStream();
                              while((num=in.read(size)) != -1){
                                  for(int i=0;i<num;i++)
                                      fos.write(size[i]);
                              }
                          }else {
                            System.out.println("状态码:"+responseCode+" 地址:"+url);
                        }
                      }
                  }
              }catch (FileNotFoundException notFoundE) {
                  sop("找不到该网络图片....");
              }catch(NullPointerException nullPointerE){
                  sop("找不到该网络图片....");
              }catch(IOException ioE){
                  sop("产生IO异常.....");
              }catch (Exception e) {
                  e.printStackTrace();
              }finally{
                  try {
                      if(fos!=null){
                          fos.close();
                      }
                  } catch (Exception e) {
                      e.printStackTrace();
                  }
              }
          }
    }

    其中,关键点在于两个正则表达式:一个用于匹配图片的 img 标签,另一个用于从标签中提取图片的链接地址。

    String reg = "<img.*src=(.*?)[^>]*?>";
    String reg2 = "src\\s*=\\s*\"?(.*?)(\"|>|\\s+)";

     运行结果:

  • 相关阅读:
    SpringMVC 2
    MySQL--事务,隔离性和隔离级别
    String.intern()
    初识消息队列--ActiveMq
    Java后台上传图片到七牛云
    Thread.interrupt(),Thread.isInterrupted(),Thread.interrupted()碎碎念
    Java基础--对象
    Java后台调用gcc编译C语言代码
    ToolProvider.getSystemJavaCompiler()方法空指针的排坑
    [LeetCode]29 两数相除和一个小坑点
  • 原文地址:https://www.cnblogs.com/jiafuwei/p/6089082.html
Copyright © 2011-2022 走看看