zoukankan      html  css  js  c++  java
  • 去掉HTML代码 保留文字和图片

    取得HTML中的img

            /// <summary>
            /// Extracts the <c>src</c> URL of every <c>&lt;img&gt;</c> tag found in an HTML fragment.
            /// </summary>
            /// <param name="sHtmlText">HTML markup to scan. May be null or empty.</param>
            /// <returns>
            /// The image URLs in document order, one entry per matched tag.
            /// An empty array when the input is null/empty or contains no image tags.
            /// </returns>
            public static string[] GetHtmlImageUrlList(string sHtmlText)
            {
                if (string.IsNullOrEmpty(sHtmlText))
                {
                    return new string[0];
                }

                // Matches an <img ...> tag and captures the src attribute value, which may be
                // wrapped in double quotes, single quotes, or left unquoted.
                // The whitespace classes must be written as \s: a bare "s" would treat the
                // letter 's' as a separator and cut URLs such as "https://..." short.
                const string imgPattern =
                    @"<img\b[^<>]*?\bsrc\s*=\s*[""']?\s*(?<imgUrl>[^\s""'<>]*)[^<>]*?/?\s*>";

                // The static Regex API caches the parsed pattern between calls.
                MatchCollection matches = Regex.Matches(sHtmlText, imgPattern, RegexOptions.IgnoreCase);

                string[] sUrlList = new string[matches.Count];
                for (int i = 0; i < matches.Count; i++)
                {
                    sUrlList[i] = matches[i].Groups["imgUrl"].Value;
                }

                return sUrlList;
            }

    取得HTML中的文字

     /// <summary>
            /// Strips markup from an HTML fragment and returns the remaining plain text.
            /// </summary>
            /// <remarks>
            /// Script elements are removed together with their content, all other tags are
            /// dropped, line breaks followed by whitespace are collapsed, and a small set of
            /// character entities is decoded. Any other numeric character reference
            /// (<c>&amp;#NNN;</c>) is removed from the output.
            /// </remarks>
            /// <param name="htmlString">HTML markup to convert. May be null or empty.</param>
            /// <returns>The text content, or an empty string when the input is null or empty.</returns>
            public static string NoHTML(string htmlString)
            {
                if (string.IsNullOrEmpty(htmlString))
                {
                    return string.Empty;
                }

                // Remove scripts. Singleline lets '.' span line breaks so that multi-line
                // script bodies are removed instead of leaking into the text.
                htmlString = Regex.Replace(htmlString, @"<script[^>]*?>.*?</script>", "",
                    RegexOptions.IgnoreCase | RegexOptions.Singleline);

                // Remove every remaining tag.
                htmlString = Regex.Replace(htmlString, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);

                // Drop a line break together with the whitespace that follows it.
                htmlString = Regex.Replace(htmlString, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);

                // Clean up comment delimiters that survived the tag removal above.
                htmlString = Regex.Replace(htmlString, @"-->", "", RegexOptions.IgnoreCase);
                htmlString = Regex.Replace(htmlString, @"<!--.*", "", RegexOptions.IgnoreCase);

                // Decode entities in a single pass so that already-decoded text is never
                // decoded a second time (e.g. "&amp;lt;" becomes "&lt;", not "<").
                htmlString = Regex.Replace(
                    htmlString,
                    @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);",
                    new MatchEvaluator(DecodeHtmlEntity),
                    RegexOptions.IgnoreCase);

                return htmlString;
            }

            // Maps one matched entity to its replacement text; numeric references that are
            // not listed here are removed (replaced by the empty string).
            private static string DecodeHtmlEntity(Match entity)
            {
                switch (entity.Groups[1].Value.ToLowerInvariant())
                {
                    case "quot":
                    case "#34":
                        return "\"";
                    case "amp":
                    case "#38":
                        return "&";
                    case "lt":
                    case "#60":
                        return "<";
                    case "gt":
                    case "#62":
                        return ">";
                    case "nbsp":
                    case "#160":
                        return "   ";
                    case "iexcl":
                    case "#161":
                        return "\u00A1";
                    case "cent":
                    case "#162":
                        return "\u00A2";
                    case "pound":
                    case "#163":
                        return "\u00A3";
                    case "copy":
                    case "#169":
                        return "\u00A9";
                    default:
                        return string.Empty;
                }
            }
  • 相关阅读:
    tomcat与resin的比较
    Linux Resin 安装配置
    [BZOJ3456]城市规划
    ZJOI 2017 仙人掌
    「LibreOJ NOI Round #1」动态几何问题
    [SDOI2015]约数个数和
    codeforce 940F
    codeforce 940F
    codeforce 940E
    [NOI2009]植物大战僵尸
  • 原文地址:https://www.cnblogs.com/YorkQi/p/13924529.html
Copyright © 2011-2022 走看看