zoukankan      html  css  js  c++  java
  • html转换为纯文本,支持撇号

     /// <summary>
            /// Converts an HTML fragment to plain text: drops the head, script and style
            /// sections, turns structural tags into spaces or line breaks, strips every
            /// remaining tag and decodes a small set of entities (including the
            /// apostrophe entity <c>&amp;#39;</c>).
            /// </summary>
            /// <param name="source">HTML markup to convert; may be null or empty.</param>
            /// <returns>
            /// The plain-text rendering of <paramref name="source"/>, using "\n" as the
            /// line break; an empty string when <paramref name="source"/> is null or empty.
            /// </returns>
            private static string HtmlToPlainText(string source)
            {
                if (string.IsNullOrEmpty(source))
                {
                    return string.Empty;
                }

                // Flatten line breaks and tabs so that the only line breaks in the output
                // are the ones derived from tags below.
                string result = source.Replace("\r", " ").Replace("\n", " ").Replace("\t", " ");

                // Remove the head, script and style sections. The quantifier is lazy so
                // that text between two separate sections is kept, and the opening tag
                // may carry attributes.
                result = Regex.Replace(result, @"<( )*head\b([^>])*>.*?</( )*head( )*>", string.Empty, RegexOptions.IgnoreCase);
                result = Regex.Replace(result, @"<( )*script\b([^>])*>.*?</( )*script( )*>", string.Empty, RegexOptions.IgnoreCase);
                result = Regex.Replace(result, @"<( )*style\b([^>])*>.*?</( )*style( )*>", string.Empty, RegexOptions.IgnoreCase);

                // Separate table cells with a space.
                result = Regex.Replace(result, @"<( )*td([^>])*>", " ", RegexOptions.IgnoreCase);

                // A single line break for <br> (including the self-closing forms) and <li>.
                result = Regex.Replace(result, @"<( )*br( )*/?( )*>", "\n", RegexOptions.IgnoreCase);
                result = Regex.Replace(result, @"<( )*li\b([^>])*>", "\n", RegexOptions.IgnoreCase);

                // A paragraph break (two line breaks) for <tr> and <p>.
                result = Regex.Replace(result, @"<( )*tr([^>])*>", "\n\n", RegexOptions.IgnoreCase);
                result = Regex.Replace(result, @"<( )*p([^>])*>", "\n\n", RegexOptions.IgnoreCase);

                // Remove anything else that is enclosed in < >.
                result = Regex.Replace(result, @"<[^>]*>", string.Empty, RegexOptions.IgnoreCase);

                // Decode or drop entities in ONE pass, so that already-decoded text is never
                // decoded a second time ("&amp;lt;" must become "&lt;", not "<"). The name
                // is limited to entity characters so plain text such as "& 3;" is untouched.
                result = Regex.Replace(result, @"&(#?[a-z0-9]{2,6});", new MatchEvaluator(DecodeEntity), RegexOptions.IgnoreCase);

                // Collapse runs of spaces, blank-but-spaced lines and doubled line breaks.
                result = Regex.Replace(result, @" ( )+", " ");
                result = Regex.Replace(result, "(\n)( )+(\n)", "\n\n");
                result = Regex.Replace(result, "(\n\n)+", "\n");

                return result;
            }

            /// <summary>
            /// Maps one matched entity to its plain-text replacement: the five supported
            /// entities are decoded, every other entity is removed.
            /// </summary>
            /// <param name="entity">A match whose first group is the entity name without "&amp;" and ";".</param>
            /// <returns>The replacement text for the entity.</returns>
            private static string DecodeEntity(Match entity)
            {
                string name = entity.Groups[1].Value;

                if (string.Equals(name, "amp", StringComparison.OrdinalIgnoreCase))
                {
                    return "&";
                }

                if (string.Equals(name, "nbsp", StringComparison.OrdinalIgnoreCase))
                {
                    return " ";
                }

                if (string.Equals(name, "lt", StringComparison.OrdinalIgnoreCase))
                {
                    return "<";
                }

                if (string.Equals(name, "gt", StringComparison.OrdinalIgnoreCase))
                {
                    return ">";
                }

                if (string.Equals(name, "#39", StringComparison.OrdinalIgnoreCase))
                {
                    return "'";
                }

                return string.Empty;
            }
  • 相关阅读:
    对象池使用时要注意几点
    Flash3D学习计划(一)——3D渲染的一般管线流程
    714. Best Time to Buy and Sell Stock with Transaction Fee
    712. Minimum ASCII Delete Sum for Two Strings
    647. Palindromic Substrings(马拉车算法)
    413. Arithmetic Slices
    877. Stone Game
    338. Counting Bits
    303. Range Sum Query
    198. House Robber
  • 原文地址:https://www.cnblogs.com/haorui/p/4228490.html
Copyright © 2011-2022 走看看