zoukankan      html  css  js  c++  java
  • 提取HTML代码中文字的C#函数

    /// <summary>
    /// Strips HTML markup from a string and returns the remaining text.
    /// </summary>
    /// <remarks>
    /// Processing order: script elements are removed, then tags, then line breaks
    /// followed by whitespace; a fixed set of character entities is decoded; any
    /// other numeric entity (<c>&amp;#NNN;</c>) is dropped; comment remnants are
    /// removed; finally every remaining <c>&lt;</c>, <c>&gt;</c> and CRLF is deleted.
    /// Because the final pass runs after entity decoding, angle brackets produced
    /// by <c>&amp;lt;</c> / <c>&amp;gt;</c> are removed as well, so decoded text
    /// cannot reintroduce markup.
    /// </remarks>
    /// <param name="strHtml">Source text that may contain HTML markup.</param>
    /// <returns>
    /// The text with markup removed, or an empty string when
    /// <paramref name="strHtml"/> is null or empty.
    /// </returns>
    public static string StripHTML(string strHtml)
    {
        // Nothing to strip; also avoids the ArgumentNullException Regex would throw.
        if (string.IsNullOrEmpty(strHtml))
        {
            return string.Empty;
        }

        // patterns[i] is replaced by replacements[i]; the order is significant.
        string[] patterns =
        {
            // [\s\S] instead of '.' so script bodies spanning several lines are removed too.
            @"<script[^>]*?>[\s\S]*?</script>",
            // Opening/closing tags, optionally namespaced, with optional attributes.
            @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""])(\\[""tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
            // A line break followed by further whitespace.
            @"([\r\n])[\s]+",
            @"&(quot|#34);",
            @"&(amp|#38);",
            @"&(lt|#60);",
            @"&(gt|#62);",
            @"&(nbsp|#160);",
            @"&(iexcl|#161);",
            @"&(cent|#162);",
            @"&(pound|#163);",
            @"&(copy|#169);",
            // Any numeric entity not decoded above is discarded.
            @"&#(\d+);",
            // Turn a comment terminator into a line break so the next pattern,
            // which ends in \n, can consume the comment up to that point.
            @"-->",
            @"<!--.*\n"
        };

        string[] replacements =
        {
            "",
            "",
            "",
            "\"",
            "&",
            "<",
            ">",
            " ",
            "\u00A1", // inverted exclamation mark, chr(161)
            "\u00A2", // cent sign, chr(162)
            "\u00A3", // pound sign, chr(163)
            "\u00A9", // copyright sign, chr(169)
            "",
            "\r\n",
            ""
        };

        string strOutput = strHtml;
        for (int i = 0; i < patterns.Length; i++)
        {
            // The static overload goes through the runtime's regex cache, so the
            // patterns are not re-parsed on every call.
            strOutput = Regex.Replace(strOutput, patterns[i], replacements[i], RegexOptions.IgnoreCase);
        }

        // string.Replace returns a new string; the result must be assigned back,
        // otherwise this clean-up pass has no effect.
        strOutput = strOutput.Replace("<", "");
        strOutput = strOutput.Replace(">", "");
        strOutput = strOutput.Replace("\r\n", "");

        return strOutput;
    }
    
  • 相关阅读:
    从服务器上下载下来的代码,部署到本地时,Url自动带www前缀
    个人说明
    名词解释
    Bandizip-解压缩软件
    uTools-工具插件集
    Geek-软件卸载工具
    Microsoft商店软件推荐
    Docker入门第九章
    Docker入门第八章
    IDM-下载工具
  • 原文地址:https://www.cnblogs.com/wubin264/p/1771305.html
Copyright © 2011-2022 走看看