zoukankan      html  css  js  c++  java
  • JAVA-替换html中图片的路径-从html代码中提取图片路径并下载

    1,这段代码的功能:

    (1)可以将指定的html代码中img标签的src路径提取出来

    (2)根据提取出来的src路径,将对应的图片下载到本地

    package com.googosoft.until;
    
    import java.io.BufferedInputStream;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.net.HttpURLConnection;
    import java.net.URL;
    import java.util.ArrayList;
    import java.util.Date;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    import org.junit.Test;
    
    public class HtmlUtil {
    
        public static String delHTMLTag(String htmlStr) {
            String regEx_script = "<script[^>]*?>[\s\S]*?<\/script>"; // 定义script的正则表达式
            String regEx_style = "<style[^>]*?>[\s\S]*?<\/style>"; // 定义style的正则表达式
            String regEx_html = "<[^>]+>"; // 定义HTML标签的正则表达式
    
            Pattern p_script = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);
            Matcher m_script = p_script.matcher(htmlStr);
            htmlStr = m_script.replaceAll(""); // 过滤script标签
    
            Pattern p_style = Pattern.compile(regEx_style, Pattern.CASE_INSENSITIVE);
            Matcher m_style = p_style.matcher(htmlStr);
            htmlStr = m_style.replaceAll(""); // 过滤style标签
    
            Pattern p_html = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE);
            Matcher m_html = p_html.matcher(htmlStr);
            htmlStr = m_html.replaceAll(""); // 过滤html标签
            return htmlStr.trim(); // 返回文本字符串
        }
    
        /**
         * 根据图片的网络路径将图片下载到本地,并返回本地路径
         * @param urlHttp 图片的网络路径
         * @param path 新生成的图片的目录
         * @return
         */
        private static String getPicture2(String urlHttp, String path) {
            FileOutputStream out = null;
            BufferedInputStream in = null;
            HttpURLConnection connection = null;
            String newPath = "";
    
            byte[] buf = new byte[1024];
            int len = 0;
            try {
                URL url = new URL(urlHttp);
                connection = (HttpURLConnection) url.openConnection();
                connection.connect();
                in = new BufferedInputStream(connection.getInputStream());
                newPath = path + "/" + new Date().getTime() + ".jpg";
                out = new FileOutputStream(newPath);
                while ((len = in.read(buf)) != -1) {
                    out.write(buf, 0, len);
                }
                out.flush();
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                try {
                    in.close();
                    out.close();
                    connection.disconnect();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
    
            return newPath;
        }
    
        /**
         * 提取HTML字符串中的img列表
         * @param htmlStr 要处理的html字符串
         * @return
         */
        private static List<String> getImgStrList(String htmlStr) {
            List<String> list = new ArrayList<>();
            String img = "";
            Pattern p_image;
            Matcher m_image;
            String regEx_img = "<img.*src\s*=\s*(.*?)[^>]*?>";
            p_image = Pattern.compile(regEx_img, Pattern.CASE_INSENSITIVE);
            m_image = p_image.matcher(htmlStr);
            while (m_image.find()) {
                img = m_image.group();
                Matcher m = Pattern.compile("src\s*=\s*"?(.*?)("|>|\s+)").matcher(img);
                while (m.find()) {
                    list.add(handleSrc(m.group(1)));
                }
            }
            return list;
        }
    
        /**
         * 去除src路径中的前后单引号
         * @param src 图片的src路径
         * @return
         */
        private static String handleSrc(String src) {
            if (src != null) {
                if (src.startsWith("'")) {
                    return src.substring(1, src.length());
                }
                if (src.endsWith("'")) {
                    return src.substring(0, src.length());
                }
    
            }
            return src;
        }
    
        @Test
        public void testTransSrc() throws Exception {
            String str = "<h1 style='font-weight: 400; padding-left: 0px; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); font-size: 24px; line-height: 36px; color: rgb(0, 0, 0); font-family: 微软雅黑; text-align: center;'>标题</h1><div class='detail-body photos' style='margin-top: 20px; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); min-height: 306px; line-height: 26px; font-size: 16px; color: rgb(51, 51, 51); overflow-wrap: break-word; font-family: 微软雅黑;'><pre id='content' style='padding-right: 15px; padding-left: 15px; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); white-space: pre-wrap; overflow-wrap: break-word; border-left-color: rgb(0, 150, 136); background-color: rgb(248, 248, 248); overflow: auto;'><div class='entry' style='margin-top: 30px; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); border: 0px; outline: 0px; font-family: Arial, &quot;Microsoft YaHei&quot;, 微软雅黑, STHeiti, &quot;WenQuanYi Micro Hei&quot;, SimSun, sans-serif; min-height: 450px; color: rgb(61, 70, 77);'><p style='margin-top: 5px; margin-bottom: 20px; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); border: 0px; outline: 0px; font-weight: inherit; font-style: inherit; font-family: inherit; line-height: 1.8; word-break: break-all; font-size: 16px; letter-spacing: 0px;'><img class='' src='https://mmbiz.qpic.cn/mmbiz_jpg/PiaIQldYWZNPy0KlJ6MxyUJic2lAv3JQfGLGbtyLtqR250PIBYM5GgGxHkrPdAv8yaAe1vCEvxIBeiaB0ibr8bWVBg/640?wx_fmt=jpeg&amp;tp=webp&amp;wxfrom=5&amp;wx_lazy=1&amp;wx_co=1' crossorigin='anonymous' data-croporisrc='http://mmbiz.qpic.cn/mmbiz_jpg/PiaIQldYWZNPy0KlJ6MxyUJic2lAv3JQfGd2DN46qD8MwKvRgjEZnwu3n47tHQxRnqw3snRDsvccFL6cjTOjDGXw/0?wx_fmt=jpeg' data-cropx1='0' data-cropx2='434' data-cropy1='0' data-cropy2='393' data-ratio='0.9032258064516129' data-src='https://mmbiz.qpic.cn/mmbiz_jpg/PiaIQldYWZNPy0KlJ6MxyUJic2lAv3JQfGLGbtyLtqR250PIBYM5GgGxHkrPdAv8yaAe1vCEvxIBeiaB0ibr8bWVBg/640?wx_fmt=jpeg' data-type='jpeg' data-w='434' data-fail='0' style='display: block; max- 750px; cursor: pointer; margin-top: 
5px; outline: 0px; font-weight: inherit; font-style: inherit; font-family: inherit; height: auto;'><img class='size-full wp-image-28403 aligncenter' src='http://www.yunweipai.com/wp-content/uploads/2019/04/20190425172338.jpg' alt='' width='434' height='392' style='display: block; max- 750px; cursor: pointer; margin: 5px auto 10px; outline: 0px; font-weight: inherit; font-style: inherit; font-family: inherit; height: auto;'></p></div></pre></div>";
            List<String> imgList = getImgStrList(str);
            for (String img : imgList) {
                System.out.println(getPicture2(img, "D://uploadFiles"));
            }
        }
        
    
    }
  • 相关阅读:
    中国式关系
    太太万岁
    matlab记录运行时间命令
    matlab读xls数据
    matlab,xls转换为mat文件
    matlab里plot设置线形和颜色
    matlab里plot画多幅图像、设置总标题、legend无边框
    matlab显示图像的横纵坐标
    去掉matlab图片空白边缘
    matlab显示原图和灰度直方图
  • 原文地址:https://www.cnblogs.com/excellencesy/p/11925914.html
Copyright © 2011-2022 走看看