zoukankan      html  css  js  c++  java
  • Java 中 利用正则表达式 获取 网页图片



    import java.io.ByteArrayOutputStream;
    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.OutputStream;
    import java.net.URL;
    import java.net.URLConnection;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    /**
     * Scrapes a web page for {@code <img>} tags and downloads every absolute
     * http/https URL found inside those tags.
     *
     * <p>Usage: {@code java pimg [pageUrl [saveDirectory]]}. Both arguments are
     * optional; when omitted the built-in defaults below are used.
     *
     * <p>Relative and protocol-relative image URLs are not resolved and are
     * therefore skipped.
     *
     * @author swinglife
     */
    public class pimg {

        // Page that is scanned when no URL is given on the command line.
        private static final String URL = "http://www.csdn.net";
        // Charset used to decode the downloaded page.
        private static final String ECODING = "UTF-8";
        // Directory images are written to when none is given on the command line.
        private static final String SAVE_DIR = "C:/Users/tutu/Desktop/img/";
        // Connect and read timeout, so a stalled server cannot hang the program forever.
        private static final int TIMEOUT_MS = 10000;
        // Matches exactly one <img ...> tag containing "src=". [^>] keeps the match
        // inside a single tag, so URLs of neighbouring elements are never swallowed.
        private static final Pattern IMGURL_REG =
                Pattern.compile("<img\\b[^>]*?src=[^>]*>", Pattern.CASE_INSENSITIVE);
        // Matches an absolute http/https URL up to (not including) the closing
        // quote, '>' or whitespace.
        private static final Pattern IMGSRC_REG = Pattern.compile("https?://[^\"'>\\s]+");
        // Characters that are rejected in file names by common file systems.
        private static final Pattern ILLEGAL_NAME_CHARS = Pattern.compile("[\\\\/:*?\"<>|]");

        /**
         * Fetches the page, extracts the image URLs and downloads them.
         *
         * @param args optional: {@code args[0]} page URL, {@code args[1]} target directory
         * @throws Exception if the page itself cannot be fetched
         */
        public static void main(String[] args) throws Exception {
            String pageUrl = args.length > 0 ? args[0] : URL;
            File saveDir = new File(args.length > 1 ? args[1] : SAVE_DIR);

            pimg cm = new pimg();
            // Fetch the HTML text of the page.
            String html = cm.getHTML(pageUrl);
            // Collect the <img> tags.
            List<String> imgUrl = cm.getImageUrl(html);
            // Collect the image URLs inside those tags.
            List<String> imgSrc = cm.getImageSrc(imgUrl);
            // Download the images.
            cm.Download(imgSrc, saveDir);
        }

        /**
         * Downloads the content at {@code url} and decodes it as text.
         *
         * <p>All bytes are collected before decoding so that multi-byte characters
         * split across two reads are decoded correctly.
         *
         * @param url address of the page to fetch
         * @return the page content decoded with {@link #ECODING}
         * @throws Exception if the URL is malformed or the page cannot be read
         */
        private String getHTML(String url) throws Exception {
            ByteArrayOutputStream page = new ByteArrayOutputStream();
            try (InputStream in = open(url)) {
                copy(in, page);
            }
            return page.toString(ECODING);
        }

        /**
         * Extracts every {@code <img>} tag that has a {@code src} attribute.
         *
         * @param HTML HTML text to search
         * @return the matched tags in document order; empty if there are none
         */
        private List<String> getImageUrl(String HTML) {
            Matcher matcher = IMGURL_REG.matcher(HTML);
            List<String> listImgUrl = new ArrayList<>();
            while (matcher.find()) {
                listImgUrl.add(matcher.group());
            }
            return listImgUrl;
        }

        /**
         * Extracts the absolute http/https URLs contained in the given tags.
         *
         * @param listImageUrl {@code <img>} tags as returned by {@link #getImageUrl(String)}
         * @return the URLs in encounter order, without quotes or trailing whitespace
         */
        private List<String> getImageSrc(List<String> listImageUrl) {
            List<String> listImgSrc = new ArrayList<>();
            for (String image : listImageUrl) {
                Matcher matcher = IMGSRC_REG.matcher(image);
                while (matcher.find()) {
                    listImgSrc.add(matcher.group());
                }
            }
            return listImgSrc;
        }

        /**
         * Downloads the images into the default directory {@link #SAVE_DIR}.
         *
         * @param listImgSrc absolute URLs of the images to download
         */
        private void Download(List<String> listImgSrc) {
            Download(listImgSrc, new File(SAVE_DIR));
        }

        /**
         * Downloads the images into {@code saveDir}, creating the directory if needed.
         *
         * <p>A failure on one image is reported and the remaining images are still
         * attempted.
         *
         * @param listImgSrc absolute URLs of the images to download
         * @param saveDir    directory the files are written to
         */
        private void Download(List<String> listImgSrc, File saveDir) {
            if (listImgSrc.isEmpty()) {
                return;
            }
            if (!saveDir.isDirectory() && !saveDir.mkdirs()) {
                System.out.println("下载失败: " + saveDir);
                return;
            }
            for (String url : listImgSrc) {
                String imageName = fileNameOf(url);
                System.out.println("开始下载:" + url);
                try {
                    saveImage(url, new File(saveDir, imageName));
                    System.out.println(imageName + "下载完成");
                } catch (IOException e) {
                    System.out.println("下载失败: " + url + " (" + e + ")");
                }
            }
        }

        /** Copies the content at {@code url} into {@code target}, closing both streams. */
        private static void saveImage(String url, File target) throws IOException {
            // The input is opened first so a failed request does not leave an empty file behind.
            try (InputStream in = open(url);
                    FileOutputStream fo = new FileOutputStream(target)) {
                copy(in, fo);
            }
        }

        /** Opens {@code url} for reading with connect and read timeouts applied. */
        private static InputStream open(String url) throws IOException {
            URLConnection connection = new URL(url).openConnection();
            connection.setConnectTimeout(TIMEOUT_MS);
            connection.setReadTimeout(TIMEOUT_MS);
            return connection.getInputStream();
        }

        /** Copies all remaining bytes from {@code in} to {@code out}; closes neither stream. */
        private static void copy(InputStream in, OutputStream out) throws IOException {
            byte[] buf = new byte[1024];
            int length;
            while ((length = in.read(buf, 0, buf.length)) != -1) {
                out.write(buf, 0, length);
            }
        }

        /**
         * Derives a file name from the text after the last {@code '/'} of the URL.
         * Characters that are illegal in file names are replaced by {@code '_'};
         * if nothing usable remains, a name based on the URL's hash code is used.
         */
        private static String fileNameOf(String url) {
            String lastSegment = url.substring(url.lastIndexOf('/') + 1);
            String name = ILLEGAL_NAME_CHARS.matcher(lastSegment).replaceAll("_");
            if (name.isEmpty() || name.equals(".") || name.equals("..")) {
                name = "image_" + Integer.toHexString(url.hashCode());
            }
            return name;
        }

    }
  • 相关阅读:
    ES基本介绍
    Mybatis 读写分离简单实现
    分享一个Flink checkpoint失败的问题和解决办法
    一次“内存泄露”引发的血案
    记一次堆外内存泄漏排查过程
    MySQL主从复制读写分离,看这篇就够了!
    JVM运行时内存数据区域
    .NET MVC 页面传值方式
    jQuery 对表格内容进行搜索筛选
    泛型
  • 原文地址:https://www.cnblogs.com/tutu21ybz/p/6737406.html
Copyright © 2011-2022 走看看