zoukankan      html  css  js  c++  java
  • Html2Text

    from:http://www.cnblogs.com/rookieport/articles/272825.html 

    private static string StripHTML(string source)
            {
                try
                {
                    string result;

                    // Remove HTML Development formatting
                    result = source.Replace("\r", " ");        // Replace line breaks with space because browsers inserts space
                    result = result.Replace("\n", " ");        // Replace line breaks with space because browsers inserts space
                    result = result.Replace("\t", string.Empty);    // Remove step-formatting
                    result = System.Text.RegularExpressions.Regex.Replace(result, @"( )+", " ");    // Remove repeating speces becuase browsers ignore them

                    // Remove the header (prepare first by clearing attributes)
                    result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*head([^>])*>", "<head>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*head( )*>)", "</head>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    result = System.Text.RegularExpressions.Regex.Replace(result, "(<head>).*(</head>)", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);

                    // remove all scripts (prepare first by clearing attributes)
                    result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*script([^>])*>", "<script>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*script( )*>)", "</script>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    //result = System.Text.RegularExpressions.Regex.Replace(result, @"(<script>)([^(<script>\.</script>)])*(</script>)",string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    result = System.Text.RegularExpressions.Regex.Replace(result, @"(<script>).*(</script>)", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);

                    // remove all styles (prepare first by clearing attributes)
                    result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*style([^>])*>", "<style>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*style( )*>)", "</style>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    result = System.Text.RegularExpressions.Regex.Replace(result, "(<style>).*(</style>)", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);

                    // insert tabs in spaces of <td> tags
                    result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*td([^>])*>", "\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase);

                    // insert line breaks in places of <BR> and <LI> tags
                    result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*br( )*>", "\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*li( )*>", "\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);

                    // insert line paragraphs (double line breaks) in place if <P>, <DIV> and <TR> tags
                    result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*div([^>])*>", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*tr([^>])*>", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*p([^>])*>", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);

                    // Remove remaining tags like <a>, links, images, comments etc - anything thats enclosed inside < >
                    result = System.Text.RegularExpressions.Regex.Replace(result, @"<[^>]*>", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);

                    // replace special characters:
                    result = System.Text.RegularExpressions.Regex.Replace(result, @"&nbsp;", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase);

                    result = System.Text.RegularExpressions.Regex.Replace(result, @"&bull;", " * ", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    result = System.Text.RegularExpressions.Regex.Replace(result, @"&lsaquo;", "<", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    result = System.Text.RegularExpressions.Regex.Replace(result, @"&rsaquo;", ">", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    result = System.Text.RegularExpressions.Regex.Replace(result, @"&trade;", "(tm)", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    result = System.Text.RegularExpressions.Regex.Replace(result, @"&frasl;", "/", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    result = System.Text.RegularExpressions.Regex.Replace(result, @"<", "<", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    result = System.Text.RegularExpressions.Regex.Replace(result, @">", ">", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    result = System.Text.RegularExpressions.Regex.Replace(result, @"&copy;", "(c)", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    result = System.Text.RegularExpressions.Regex.Replace(result, @"&reg;", "(r)", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    result = System.Text.RegularExpressions.Regex.Replace(result, @"&(.{2,6});", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);

                    // for testng
                    //System.Text.RegularExpressions.Regex.Replace(result, this.txtRegex.Text,string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);

                    // make line breaking consistent
                    result = result.Replace("\n", "\r");

                    // Remove extra line breaks and tabs: replace over 2 breaks with 2 and over 4 tabs with 4.
                    // Prepare first to remove any whitespaces inbetween the escaped characters and remove redundant tabs inbetween linebreaks
                    result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)( )+(\r)", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    result = System.Text.RegularExpressions.Regex.Replace(result, "(\t)( )+(\t)", "\t\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    result = System.Text.RegularExpressions.Regex.Replace(result, "(\t)( )+(\r)", "\t\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)( )+(\t)", "\r\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)(\t)+(\r)", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);    // Remove redundant tabs
                    result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)(\t)+", "\r\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase);        // Remove multible tabs followind a linebreak with just one tab
                    //string breaks = "\r\r\r";        // Initial replacement target string for linebreaks
                    //string tabs = "\t\t\t\t\t";        // Initial replacement target string for tabs
                    //for (int index = 0; index < result.Length; index++)
                    //{
                    //    result = result.Replace(breaks, "\r\r");
                    //    result = result.Replace(tabs, "\t\t\t\t");
                    //    breaks = breaks + "\r";
                    //    tabs = tabs + "\t";
                    //}
                    result = System.Text.RegularExpressions.Regex.Replace(result, "\r{2,}", "\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    result = System.Text.RegularExpressions.Regex.Replace(result, "\t{2,}", "\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    return result;
                }
                catch
                {
                }
                return source;
            }

  • 相关阅读:
    Apache Common-IO 使用
    Apache Compress-使用
    使用JavaConfig方式-Spring 基础学习
    设计模式-策略模式
    logback mybatis 打印sql语句
    oracle 迭代查询
    使用 Dom4j 将 XML 转换为 MAP
    Java连接Access数据库的那些坑
    如何从Maven中央存储库下载?
    sublime构建各个编译环境
  • 原文地址:https://www.cnblogs.com/94cool/p/2218578.html
Copyright © 2011-2022 走看看