zoukankan      html  css  js  c++  java
  • html转换为纯文本,支持撇号

     /// <summary>
            /// Converts an HTML fragment or document to plain text.
            /// The head, script and style sections are dropped, selected tags are
            /// turned into whitespace or line breaks, every remaining tag is removed,
            /// and a small set of character entities (including the apostrophe
            /// entity <c>&amp;#39;</c>) is decoded.
            /// </summary>
            /// <param name="source">The HTML markup to convert. May be null or empty.</param>
            /// <returns>
            /// The plain-text rendering of <paramref name="source"/>, using "\n" as the
            /// line-break character; an empty string when the input is null or empty.
            /// </returns>
            private static string HtmlToPlainText(string source)
            {
                if (string.IsNullOrEmpty(source))
                {
                    return string.Empty;
                }

                const RegexOptions IgnoreCase = RegexOptions.IgnoreCase;

                // Flatten the markup onto one line first. Line breaks and tabs in the
                // markup carry no meaning, and this lets '.' in the patterns below span
                // what used to be several lines.
                string result = source.Replace("\r", " ").Replace("\n", " ").Replace("\t", " ");

                // Remove the header, scripts and styles together with their content.
                // The quantifier is lazy so that text located between two separate
                // <script> (or <style>) blocks is preserved.
                result = Regex.Replace(result, @"<\s*head\b[^>]*>.*?<\s*/\s*head\s*>", string.Empty, IgnoreCase);
                result = Regex.Replace(result, @"<\s*script\b[^>]*>.*?<\s*/\s*script\s*>", string.Empty, IgnoreCase);
                result = Regex.Replace(result, @"<\s*style\b[^>]*>.*?<\s*/\s*style\s*>", string.Empty, IgnoreCase);

                // Separate table cells with a space.
                result = Regex.Replace(result, @"<\s*td\b[^>]*>", " ", IgnoreCase);

                // Insert a line break in place of <br> and <li> tags. [^>]* also accepts
                // the self-closing forms (<br/>, <br />) and tags carrying attributes.
                result = Regex.Replace(result, @"<\s*br\b[^>]*>", "\n", IgnoreCase);
                result = Regex.Replace(result, @"<\s*li\b[^>]*>", "\n", IgnoreCase);

                // Insert a paragraph break in place of <tr> and <p> tags. The \b keeps
                // tags that merely start with the same letters (<pre>, <track>) out.
                result = Regex.Replace(result, @"<\s*tr\b[^>]*>", "\n\n", IgnoreCase);
                result = Regex.Replace(result, @"<\s*p\b[^>]*>", "\n\n", IgnoreCase);

                // Remove anything that is still enclosed inside < >.
                result = Regex.Replace(result, @"<[^>]*>", string.Empty);

                // Decode the supported entities and drop every other short entity.
                // Doing this in a single pass prevents double decoding: "&amp;lt;"
                // yields the literal text "&lt;", not "<". Only well-formed entity
                // names are matched, so ordinary text such as "& Jerry;" is untouched.
                result = Regex.Replace(
                    result,
                    @"&(#?[a-z0-9]{2,6});",
                    match =>
                    {
                        switch (match.Groups[1].Value.ToLowerInvariant())
                        {
                            case "amp":
                                return "&";
                            case "nbsp":
                                return " ";
                            case "lt":
                                return "<";
                            case "gt":
                                return ">";
                            case "#39":
                                return "'";
                            default:
                                return string.Empty;
                        }
                    },
                    IgnoreCase);

                // Collapse runs of spaces, blank out lines that contain only spaces,
                // then reduce each pair of consecutive line breaks to a single one.
                result = Regex.Replace(result, " {2,}", " ");
                result = Regex.Replace(result, "\n +\n", "\n\n");
                result = Regex.Replace(result, "(\n\n)+", "\n");

                return result;
            }
  • 相关阅读:
    人脸识别活体检测测试案例
    网络相关配置
    DOS基础整理
    [转载]EXTJS学习
    [转载]JS定时器例子讲解
    [转载]JS定时器例子讲解
    如何设置网页自动刷新(JSP,JS,HTML)
    如何设置网页自动刷新(JSP,JS,HTML)
    18岁以下严禁进入
    18岁以下严禁进入
  • 原文地址:https://www.cnblogs.com/haorui/p/4228490.html
Copyright © 2011-2022 走看看