zoukankan      html  css  js  c++  java
  • html代码转换成为纯文本

    先解码。接着转换为纯文本,用这段代码:
    public static string HtmlToText(string source)
            {
                string result;

                //remove line breaks,tabs
                result = source.Replace("\r", " ");
                result = result.Replace("\n", " ");
                result = result.Replace("\t", " ");

                //remove the header
                result = Regex.Replace(result, "(<head>).*(</head>)", string.Empty, RegexOptions.IgnoreCase);

                result = Regex.Replace(result, @"<( )*script([^>])*>", "<script>", RegexOptions.IgnoreCase);
                result = Regex.Replace(result, @"(<script>).*(</script>)", string.Empty, RegexOptions.IgnoreCase);

                //remove all styles
                result = Regex.Replace(result, @"<( )*style([^>])*>", "<style>", RegexOptions.IgnoreCase); //clearing attributes
                result = Regex.Replace(result, "(<style>).*(</style>)", string.Empty, RegexOptions.IgnoreCase);

                //insert tabs in spaces of <td> tags
                result = Regex.Replace(result, @"<( )*td([^>])*>", " ", RegexOptions.IgnoreCase);

                //insert line breaks in places of <br> and <li> tags
                result = Regex.Replace(result, @"<( )*br( )*>", "\r", RegexOptions.IgnoreCase);
                result = Regex.Replace(result, @"<( )*li( )*>", "\r", RegexOptions.IgnoreCase);

                //insert line paragraphs in places of <tr> and <p> tags
                result = Regex.Replace(result, @"<( )*tr([^>])*>", "\r\r", RegexOptions.IgnoreCase);
                result = Regex.Replace(result, @"<( )*p([^>])*>", "\r\r", RegexOptions.IgnoreCase);

                //remove anything thats enclosed inside < >
                result = Regex.Replace(result, @"<[^>]*>", string.Empty, RegexOptions.IgnoreCase);

                //replace special characters:
                result = Regex.Replace(result, @"&amp;", "&", RegexOptions.IgnoreCase);
                result = Regex.Replace(result, @"&nbsp;", " ", RegexOptions.IgnoreCase);
                result = Regex.Replace(result, @"&lt;", "<", RegexOptions.IgnoreCase);
                result = Regex.Replace(result, @"&gt;", ">", RegexOptions.IgnoreCase);
                result = Regex.Replace(result, @"&(.{2,6});", string.Empty, RegexOptions.IgnoreCase);

                //remove extra line breaks and tabs
                result = Regex.Replace(result, @" ( )+", " ");
                result = Regex.Replace(result, "(\r)( )+(\r)", "\r\r");
                result = Regex.Replace(result, @"(\r\r)+", "\r\n");

                return result;
            }

  • 相关阅读:
    HDU 1176 免费馅饼(DP)
    微通道的兵锋 阿里将血腥
    head first c&lt;11&gt;在根据网络编程
    ZOJ 3795 Grouping(Tarjan收缩点+DAG)
    【设计和算法分析】3、二进制搜索
    杨:联想的创新将成为打造最酷的助推力的重要器件
    android 如何将电话簿SDN数字和其他普通的数字混合在一起?
    redhat6.3已安装was6.1你可以不弹出安装程序
    Android属于查询执行情况的电话号码
    table在 点击线条颜色
  • 原文地址:https://www.cnblogs.com/walleyekneel/p/2182635.html
Copyright © 2011-2022 走看看