zoukankan      html  css  js  c++  java
  • HtmlEntities

    #region GetOnlyTextFromHtmlCode + RemoveHtmlChars + RemoveTagFromHtmlCode
            /// <summary>
            /// Reduces an HTML fragment to plain text: raw line breaks in the markup are
            /// flattened to spaces, closing paragraph tags and &lt;br&gt; tags become line
            /// breaks, comments, style/script elements and all remaining tags are removed,
            /// the entities known to <c>unescapeHtmlEntities</c> are decoded, and runs of
            /// spaces are collapsed to a single space.
            /// </summary>
            /// <remarks>
            /// Source:
            /// http://www.codeproject.com/script/Content/ViewAssociatedFile.aspx?rzp=%2FKB%2Fedit%2FZetaHtmlEditControl%2F%2FZetaHtmlEditControl-Source.zip&amp;zep=Control%2FHtmlEditControl.cs&amp;obid=43954&amp;obtid=2&amp;ovid=13
            /// </remarks>
            /// <param name="htmlCode">The HTML markup to convert.</param>
            /// <returns>
            /// The text content of <paramref name="htmlCode"/>; a null or empty input is
            /// returned unchanged.
            /// </returns>
            private static string getOnlyTextFromHtmlCode(string htmlCode)
            {
                if (string.IsNullOrEmpty(htmlCode))
                {
                    return htmlCode;
                }

                // Line breaks in the markup itself carry no meaning: flatten them to spaces
                // first, so that only </p> and <br> produce line breaks in the result.
                htmlCode = htmlCode.Replace("\r\n", @" ");
                htmlCode = htmlCode.Replace("\r", @" ");
                htmlCode = htmlCode.Replace("\n", @" ");

                // </p>: end of a paragraph becomes a blank line.
                htmlCode = htmlCode.Replace(@"</p>", Environment.NewLine + Environment.NewLine);
                htmlCode = htmlCode.Replace(@"</P>", Environment.NewLine + Environment.NewLine);

                // HTML comments.
                htmlCode = Regex.Replace(
                    htmlCode,
                    @"<!--.*?-->",
                    string.Empty,
                    RegexOptions.Singleline | RegexOptions.IgnoreCase);

                // <br> (with or without attributes / self-closing slash) becomes a line break.
                htmlCode = Regex.Replace(
                    htmlCode,
                    @"<br[^>]*>",
                    Environment.NewLine,
                    RegexOptions.Singleline | RegexOptions.IgnoreCase);

                // Elements whose content must go too, not just their tags. This has to run
                // before the generic tag stripping below, otherwise CSS/JS source would
                // remain in the text.
                htmlCode = removeTagFromHtmlCode(@"style", htmlCode);
                htmlCode = removeTagFromHtmlCode(@"script", htmlCode);

                // Every remaining tag.
                htmlCode = Regex.Replace(
                    htmlCode,
                    "<(.|\n)+?>",
                    string.Empty,
                    RegexOptions.Singleline | RegexOptions.IgnoreCase);

                // Entities (umlauts and a few symbols).
                htmlCode = unescapeHtmlEntities(htmlCode);

                // Collapse runs of spaces; line breaks inserted above are left untouched.
                htmlCode = Regex.Replace(
                    htmlCode,
                    @" +",
                    @" ",
                    RegexOptions.Singleline | RegexOptions.IgnoreCase);

                return htmlCode;
            }
            /// <summary>
            /// Replaces a fixed set of named HTML character references (non-breaking space,
            /// German umlauts and sharp s, and a handful of symbols) with the characters they
            /// stand for. References not in the list are left as they are.
            /// See http://dev.w3.org/html5/html-author/charref for the entity names.
            /// </summary>
            /// <param name="htmlCode">Text that may contain named character references.</param>
            /// <returns>
            /// <paramref name="htmlCode"/> with the known references replaced; a null or empty
            /// input is returned unchanged.
            /// </returns>
            private static string unescapeHtmlEntities(string htmlCode)
            {
                if (string.IsNullOrEmpty(htmlCode))
                {
                    return htmlCode;
                }

                // A non-breaking space is turned into an ordinary space.
                htmlCode = htmlCode.Replace(@"&nbsp;", @" ");

                // Umlauts. Entity names are case-sensitive: the case of the first letter
                // selects the upper- or lower-case character, and string.Replace compares
                // ordinally, so the two variants never interfere with each other.
                htmlCode = htmlCode.Replace(@"&Auml;", @"Ä");
                htmlCode = htmlCode.Replace(@"&auml;", @"ä");
                htmlCode = htmlCode.Replace(@"&ouml;", @"ö");
                htmlCode = htmlCode.Replace(@"&Ouml;", @"Ö");
                htmlCode = htmlCode.Replace(@"&uuml;", @"ü");
                htmlCode = htmlCode.Replace(@"&Uuml;", @"Ü");
                htmlCode = htmlCode.Replace(@"&szlig;", @"ß");

                // Symbols.
                htmlCode = htmlCode.Replace(@"&pound;", @"£");
                htmlCode = htmlCode.Replace(@"&sect;", @"§");
                htmlCode = htmlCode.Replace(@"&copy;", @"©");
                htmlCode = htmlCode.Replace(@"&reg;", @"®");
                htmlCode = htmlCode.Replace(@"&micro;", @"µ");
                htmlCode = htmlCode.Replace(@"&para;", @"¶");
                htmlCode = htmlCode.Replace(@"&Oslash;", @"Ø");
                htmlCode = htmlCode.Replace(@"&oslash;", @"ø");
                htmlCode = htmlCode.Replace(@"&divide;", @"÷");
                htmlCode = htmlCode.Replace(@"&times;", @"×");

                return htmlCode;
            }
    
            /// <summary>
            /// Removes every element with the given tag name from the markup, including the
            /// opening tag, the closing tag and everything between them. Matching is
            /// case-insensitive and spans line breaks.
            /// </summary>
            /// <param name="tag">The element name without angle brackets, e.g. <c>script</c>.</param>
            /// <param name="htmlCode">The HTML markup to process.</param>
            /// <returns><paramref name="htmlCode"/> without the matched elements.</returns>
            private static string removeTagFromHtmlCode(
                string tag,
                string htmlCode)
            {
                // Escape the name so it is always matched literally inside the pattern.
                string escapedTag = Regex.Escape(tag);

                // \b stops a short name from matching the start of a longer one (tag "s"
                // must not start matching at "<span"); \s* accepts closing tags written
                // with trailing whitespace such as "</script >". The lazy .*? ends the
                // match at the first closing tag.
                string pattern = string.Format(@"<{0}\b.*?</{0}\s*>", escapedTag);

                return Regex.Replace(
                    htmlCode,
                    pattern,
                    string.Empty,
                    RegexOptions.Singleline | RegexOptions.IgnoreCase);
            }
            #endregion
    

      

  • 相关阅读:
    PHP之简单实现MVC框架
    socket泄露的问题
    gdb 调试多线程
    MMAP和DIRECT IO区别
    三年回首:C基础
    定时器管理:nginx的红黑树和libevent的堆
    strsep和strtok_r替代strtok
    缓存穿透和缓存失效
    mmap为什么比read/write快(兼论buffercache和pagecache)
    B+Tree和MySQL索引分析
  • 原文地址:https://www.cnblogs.com/geovindu/p/4310328.html
Copyright © 2011-2022 走看看