  • Java crawler: crawling URLs, videos, and images

    1. Crawling URLs

    import java.io.*;
    import java.net.*;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    /**
     * A simple crawler in Java
     */
    public class Robot {
        public static void main(String[] args) {
            URL url = null;
            URLConnection urlconn = null;
            BufferedReader br = null;
            PrintWriter pw = null;
    //        String regex = "http://[\\w+\\.?/?]+\\.[A-Za-z]+";
            String regex = "https://[\\w+\\.?/?]+\\.[A-Za-z]+";// URL matching pattern (backslashes must be doubled in a Java string literal)
            Pattern p = Pattern.compile(regex);
            try {
                url = new URL("https://www.cnblogs.com/peachh/p/9740229.html");//爬取的网址、这里爬取的是一个生物网站
                urlconn = url.openConnection();
                pw = new PrintWriter(new FileWriter("C:/SiteURL.txt"), true);//将爬取到的链接放到D盘的SiteURL文件中
                br = new BufferedReader(new InputStreamReader(
                        urlconn.getInputStream()));
                String buf = null;
                while ((buf = br.readLine()) != null) {
                    Matcher buf_m = p.matcher(buf);
                    while (buf_m.find()) {
                        pw.println(buf_m.group());
                    }
                }
                System.out.println("爬取成功^_^");
            } catch (MalformedURLException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                if (br != null) {
                    try {
                        br.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                if (pw != null) {
                    pw.close();
                }
            }
        }
    }
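    As a quick sanity check on the matching rule, a minimal sketch like the following shows what the pattern extracts; the class name and the sample HTML line are made up for illustration, and the backslashes are doubled as required in Java string literals:

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class RegexCheck {
        public static void main(String[] args) {
            // same pattern as in Robot above
            Pattern p = Pattern.compile("https://[\\w+\\.?/?]+\\.[A-Za-z]+");
            String sample = "<a href=\"https://www.cnblogs.com/peachh/p/9740229.html\">post</a>";
            Matcher m = p.matcher(sample);
            while (m.find()) {
                System.out.println(m.group());// prints https://www.cnblogs.com/peachh/p/9740229.html
            }
        }
    }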

    2. Crawling videos

    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.net.URL;
    import java.util.HashMap;
    import java.util.Map;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
     
    /**
     * Purpose: crawl short videos from budejie.com
     * @author cxd
     *
     */
    public class WebSpiderDemo1 {
     
        public static void main(String[] args) throws Exception {
     
            String source = "http://www.budejie.com/video/";
            String destDir = "C:/rob/";
            new File(destDir).mkdirs();// make sure the output directory exists before downloading
     
            Map<String, String> urlMap = getUrlInSource(source);
     
            for (Map.Entry<String, String> entry : urlMap.entrySet()) {
                String title = entry.getKey();// video title
                String url = entry.getValue();// video URL
                File destFile = new File(destDir + title + ".mp4");
                download(url, destFile);
            }
        }
     
        /**
         * Download the video at the given URL and save it locally.
         * 
         * @param url      the video URL
         * @param destFile the file to save the video to
         * @throws IOException
         */
        public static void download(String url, File destFile) throws IOException {
            URL videoUrl = new URL(url);
     
            InputStream is = videoUrl.openStream();
            FileOutputStream fos = new FileOutputStream(destFile);
     
            int len = 0;
            byte[] buffer = new byte[1024];
            while ((-1) != (len = is.read(buffer))) {
                fos.write(buffer, 0, len);
            }
            fos.flush();
     
            if (null != fos) {
                fos.close();
            }
     
            if (null != is) {
                is.close();
            }
        }
     
        /**
         * Collect the video URLs and titles from the listing page into a HashMap.
         * 
         * @param source
         * @return
         * @throws IOException
         */
        public static Map<String, String> getUrlInSource(String source) throws IOException {
     
            Map<String, String> hashMap = new HashMap<>();
     
            for (int index = 1; index <= 1; index++) { // up to 50 pages exist; only the first page is crawled here for testing
                String pageUrl = source + index;
                URL url = new URL(pageUrl);
                InputStream is = url.openStream();
     
    //            if the site has anti-crawling checks, disguise the request as a browser like this:
    //            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
    //            conn.setRequestMethod("GET");
    //            conn.setRequestProperty("user-agent",
    //                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36");
    //            BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"));
     
                BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
     
                String info = null;
                String title = null;
            // Do not use info == null as the loop condition: the page contains many empty/null lines,
            // so a fixed number of lines is read and null values are skipped to avoid a NullPointerException.
                for (int i = 0; i < 10000; i++) {
                    info = br.readLine();
     
                    if (null != info) {
                        String urlRegex = "data-mp4=\"(.*?\\.mp4)\"";// capture group 1 is the .mp4 URL
     
                        if (info.contains("data-title")) {
                            title = info;
                        }
     
                        Pattern pattern = Pattern.compile(urlRegex);
                        Matcher matcher = pattern.matcher(info);
                        if (matcher.find()) {
                            for (int j = 0; j <= matcher.groupCount(); j++) {
                                String tmp = matcher.group(j);
                                if (!tmp.startsWith("data-mp4=")) {// skip group(0), the full "data-mp4=..." match; keep the bare URL
                                    String videoTitle = getTitle(title.trim());
                                    hashMap.put(videoTitle, tmp);
                                }
                            }
                        }
                    }
                }
            }
            return hashMap;
        }
     
        /**
         * Clean up the title string extracted from the data-title line.
         * 
         * @param info
         * @return
         */
        private static String getTitle(String info) {
     
            int len = info.length();
            String title = info.substring(12, len - 1);// strip the leading data-title=" and the trailing quote
            return title;
        }
    
    }
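    The download method above closes its streams by hand, and the null checks after use come too late to help. On Java 7 or newer, a try-with-resources version of the same copy loop (a drop-in replacement sketch for download, using the imports already present in the class) closes both streams even when the read or write throws:

    public static void download(String url, File destFile) throws IOException {
        // try-with-resources closes both streams automatically, even if the copy fails midway
        try (InputStream is = new URL(url).openStream();
             FileOutputStream fos = new FileOutputStream(destFile)) {
            byte[] buffer = new byte[1024];
            int len;
            while ((len = is.read(buffer)) != -1) {
                fos.write(buffer, 0, len);
            }
        }
    }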

    3. Crawling images

    import com.obcy.util.DownLoad;
    import com.obcy.util.GetHTML;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.select.Elements;
    import org.junit.Test;

    import java.io.File;
    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.Map;

    public class BiAn {

        // Collect all the listing pages, from page 2 to page 946.
        public ArrayList<String> getTopUrl(){
            //String topurl = "http://www.netbian.com/hd3840x2160/index_2.htm"

            // a list holding every listing-page URL
            ArrayList<String> list = new ArrayList<String>();
            for (int i = 2; i <= 946; i++) {
                list.add("http://www.netbian.com/hd3840x2160/index_"+i+".htm");
            }

            return list;
        }

        // Collect every picture-view URL on one listing page.
        // The parameter is the listing-page URL.
        public HashMap<String,String> getGpjView(String topUrl){

            String url = topUrl;
            String html = GetHTML.getHTML(url);
            // parse the page source into a Document object
            Document document = Jsoup.parse(html);
            // the single element with class "list" contains all the picture <a> tags on the page
            Elements list = document.getElementsByClass("list");
            Elements a = null;
            try {
                a = list.get(0).select("ul>li>a");
            } catch (Exception e) {
                System.out.println("no <a> tags found");
            }

            // map of picture title -> picture-view URL
            HashMap<String,String> map = new HashMap<String, String>();
            if (a == null) {
                return map;
            }
            // read the href of each <a> tag and build the full picture-view URL
            for (int i = 0; i < a.size(); i++) {
                String href = "http://www.netbian.com"+a.get(i).attr("href");
                String name = a.get(i).attr("title");
                //System.out.println(href);  e.g. http://www.netbian.com/desk/22138.htm
                map.put(name,href);
            }

            // all picture-view URLs of this page collected
            return map;
        }

        // Visit every picture-view page, extract the image download URL, and download it.
        // The parameter holds all picture-view URLs of one listing page.
        public void getDownload(HashMap<String,String> map){

            for (Map.Entry<String, String> entry : map.entrySet()) {

                String html = GetHTML.getHTML(entry.getValue());
                Document document = Jsoup.parse(html);

                // the <img> tag inside the "endpage" element holds the actual image; there is only one
                Elements endpage = null;
                try {
                    endpage = document.getElementsByClass("endpage").get(0).select("div>p>a>img");
                } catch (Exception e) {
                    System.out.println("page element not found, moving on to the next one");
                    continue;
                }

                //System.out.println(endpage.get(0).attr("src"));

                // the image download URL
                String target = endpage.get(0).attr("src");

                String path = "F:/BiAn/"+entry.getKey()+".jpg";
                // start the download
                DownLoad.downLoad(target,path);
            }
        }

        @Test
        public void test(){
            // make sure the download folder F:/BiAn exists
            File file = new File("F:/BiAn");
            if (!file.exists()){
                file.mkdirs();
                System.out.println("created download folder F:/BiAn");
            }else {
                System.out.println("folder already exists, ready to download");
            }

            // single-threaded: get all listing pages, then for each page
            // 1. collect the view URLs, 2. extract the download URL, 3. download
            ArrayList<String> topUrl = getTopUrl();
            for (String url : topUrl) {
                HashMap<String, String> gpjView = getGpjView(url);
                getDownload(gpjView);
            }
        }
    }
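    The helper classes com.obcy.util.GetHTML and com.obcy.util.DownLoad are not shown in the post, so their real implementations are unknown. A minimal sketch of hypothetical stand-ins, assuming getHTML(String) returns the page source and downLoad(String, String) saves a URL to a local path, would look like this:

    import java.io.BufferedReader;
    import java.io.FileOutputStream;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.net.URL;

    // Hypothetical stand-in for com.obcy.util.GetHTML (the original is not shown in the post)
    class GetHTML {
        public static String getHTML(String url) {
            StringBuilder sb = new StringBuilder();
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(new URL(url).openStream(), "UTF-8"))) {
                String line;
                while ((line = br.readLine()) != null) {
                    sb.append(line).append('\n');
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
            return sb.toString();// the page source, or an empty string on failure
        }
    }

    // Hypothetical stand-in for com.obcy.util.DownLoad (the original is not shown in the post)
    class DownLoad {
        public static void downLoad(String fileUrl, String path) {
            try (InputStream is = new URL(fileUrl).openStream();
                 FileOutputStream fos = new FileOutputStream(path)) {
                byte[] buffer = new byte[1024];
                int len;
                while ((len = is.read(buffer)) != -1) {
                    fos.write(buffer, 0, len);
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }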

    4. How to analyze a web page to decide what to scrape?

    https://www.cnblogs.com/518894-lu/p/9021548.html
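    The linked article covers the analysis in detail; the general idea is to open the page in the browser's developer tools, find the element that holds the data, and turn its tag/class path into a selector. A minimal sketch with Jsoup, reusing the netbian listing page from section 3 (the selector is an assumption based on that code, not verified here):

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;

    public class PageAnalysis {
        public static void main(String[] args) throws Exception {
            // fetch and parse the listing page used in section 3
            Document doc = Jsoup.connect("http://www.netbian.com/hd3840x2160/index_2.htm")
                    .userAgent("Mozilla/5.0")
                    .get();
            // print the tag, title and href of each candidate link so the right selector can be chosen
            for (Element a : doc.select(".list ul > li > a")) {
                System.out.println(a.tagName() + "  title=" + a.attr("title") + "  href=" + a.attr("href"));
            }
        }
    }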

  • Original post: https://www.cnblogs.com/ciscolee/p/12655604.html