zoukankan      html  css  js  c++  java
  • C# HTML帮助类 包括补全标签 截取HTML字符串包含标签

    /// <summary>
    /// HTML string utilities: truncate HTML by visible text length while keeping
    /// the markup balanced, strip markup down to plain text, and encode/decode
    /// HTML entities.
    /// </summary>
    public static class HtmlHelper
    {
        /// <summary>
        /// Stand-in character (the section sign, U+00A7) used while truncating so
        /// that every entity occupies exactly one character of the text budget.
        /// </summary>
        private const char EntityPlaceholder = '\u00A7';

        // The regexes are built once per process instead of once per call:
        // constructing RegexOptions.Compiled instances is expensive.

        /// <summary>Matches a single HTML tag (anything between '&lt;' and '&gt;').</summary>
        private static readonly Regex TagRegex =
            new Regex("<[^>]+>", RegexOptions.Compiled);

        /// <summary>Matches a named, decimal or hexadecimal character reference.</summary>
        private static readonly Regex EntityRegex =
            new Regex("&(?:[a-z][a-z0-9]{0,30}|#[0-9]{1,7}|#x[0-9a-f]{1,6});",
                      RegexOptions.Compiled | RegexOptions.IgnoreCase);

        /// <summary>Matches the start of a void element, i.e. a tag that never has a closing tag.</summary>
        private static readonly Regex VoidTagRegex =
            new Regex(@"^<(?:area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)(?=[\s/>])",
                      RegexOptions.Compiled | RegexOptions.IgnoreCase);

        /// <summary>
        /// Captures the element name of an opening tag. Digits are part of the name
        /// (h1..h6); comments, doctypes and end tags do not match at all.
        /// </summary>
        private static readonly Regex TagNameRegex =
            new Regex(@"^<([a-z][^\s/>]*)", RegexOptions.Compiled | RegexOptions.IgnoreCase);

        /// <summary>Matches the character references that <see cref="NoHTML"/> converts.</summary>
        private static readonly Regex PlainTextEntityRegex =
            new Regex(@"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);",
                      RegexOptions.Compiled | RegexOptions.IgnoreCase);

        /// <summary>
        /// Truncates an HTML string by the length of its text content. Tags do not
        /// count towards the limit, each character reference counts as a single
        /// character, and every element still open at the cut is closed again.
        /// </summary>
        /// <param name="html">The HTML string to truncate.</param>
        /// <param name="len">Maximum number of text characters to keep (characters, not bytes).</param>
        /// <param name="endString">Text appended at the cut position, e.g. "...". May be null.</param>
        /// <returns>
        /// <paramref name="html"/> unchanged when it is null, empty or no longer than
        /// <paramref name="len"/>; otherwise the truncated HTML. Literal section signs
        /// in the input come back as <c>&amp;sect;</c>.
        /// </returns>
        /// <exception cref="System.ArgumentOutOfRangeException">
        /// <paramref name="len"/> is negative and <paramref name="html"/> is not empty.
        /// </exception>
        public static string HTMLSubstring(string html, int len, string endString)
        {
            if (string.IsNullOrEmpty(html) || html.Length <= len) return html;
            if (len < 0)
            {
                throw new System.ArgumentOutOfRangeException("len", len, "Length must not be negative.");
            }

            // Turn literal section signs into their entity first, so that afterwards
            // every placeholder in the working string maps to exactly one collected entity.
            string source = html.Replace("\u00A7", "&sect;");
            MatchCollection entities = EntityRegex.Matches(source);
            string collapsed = EntityRegex.Replace(source, "\u00A7");

            // Regex.Split yields the text runs between tags; with no capture groups in
            // the pattern there is always exactly one more text run than there are tags.
            string[] segments = TagRegex.Split(collapsed);
            MatchCollection tags = TagRegex.Matches(collapsed);

            System.Text.StringBuilder output = new System.Text.StringBuilder(html.Length);
            System.Collections.Generic.Stack<string> openTags = new System.Collections.Generic.Stack<string>();
            int entityIndex = 0;
            int textCount = 0;

            for (int i = 0; i < segments.Length; i++)
            {
                string text = segments[i];
                int remaining = len - textCount;

                if (text.Length >= remaining)
                {
                    // Budget exhausted inside (or exactly at the end of) this text run.
                    AppendRestoringEntities(output, text.Substring(0, remaining), entities, ref entityIndex);
                    // Appended verbatim: entity restoration must never touch the suffix.
                    output.Append(endString);
                    break;
                }

                AppendRestoringEntities(output, text, entities, ref entityIndex);
                textCount += text.Length;

                if (i < tags.Count)
                {
                    string tag = tags[i].Value;
                    // Tags may carry entities inside attribute values, so restore there too.
                    AppendRestoringEntities(output, tag, entities, ref entityIndex);
                    TrackTag(tag, openTags);
                }
            }

            // Close whatever is still open, innermost element first.
            foreach (string name in openTags)
            {
                output.Append("</").Append(name).Append('>');
            }

            return output.ToString();
        }

        /// <summary>
        /// Appends <paramref name="fragment"/> to <paramref name="output"/>, swapping each
        /// placeholder character back for the next entity in document order.
        /// </summary>
        private static void AppendRestoringEntities(System.Text.StringBuilder output, string fragment,
                                                    MatchCollection entities, ref int entityIndex)
        {
            foreach (char c in fragment)
            {
                if (c == EntityPlaceholder && entityIndex < entities.Count)
                {
                    output.Append(entities[entityIndex].Value);
                    entityIndex++;
                }
                else
                {
                    output.Append(c);
                }
            }
        }

        /// <summary>
        /// Updates the stack of currently open elements for one tag: void elements are
        /// ignored, an end tag pops the innermost open element (if any), and an opening
        /// tag pushes its element name.
        /// </summary>
        private static void TrackTag(string tag, System.Collections.Generic.Stack<string> openTags)
        {
            if (VoidTagRegex.IsMatch(tag)) return;

            if (tag.StartsWith("</", System.StringComparison.Ordinal))
            {
                // An end tag without a matching opener is simply ignored.
                if (openTags.Count > 0) openTags.Pop();
                return;
            }

            // Comments, doctypes and processing instructions have no element name
            // and therefore never need a closing tag.
            Match name = TagNameRegex.Match(tag);
            if (name.Success) openTags.Push(name.Groups[1].Value);
        }

        /// <summary>
        /// Strips HTML markup from a string: removes script, noscript and style blocks
        /// with their content, removes all remaining tags and comment markers, collapses
        /// a line break plus following whitespace into one space, and converts a small
        /// set of character references to plain characters.
        /// </summary>
        /// <param name="Htmlstring">The HTML to convert.</param>
        /// <returns>
        /// The plain text, or <paramref name="Htmlstring"/> itself when it is null or empty.
        /// </returns>
        public static string NoHTML(this string Htmlstring)
        {
            if (string.IsNullOrEmpty(Htmlstring)) return Htmlstring;

            // Blocks whose content is never visible text.
            Htmlstring = Regex.Replace(Htmlstring, @"<script[\s\S]*?</script>", "", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"<noscript[\s\S]*?</noscript>", "", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"<style[\s\S]*?</style>", "", RegexOptions.IgnoreCase);
            // Single-line tags are dropped; tags spanning several lines become a space.
            Htmlstring = Regex.Replace(Htmlstring, @"<.*?>", "");
            Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", " ");
            // A line break plus any following whitespace collapses into one space.
            Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", " ");
            // Left-over comment delimiters.
            Htmlstring = Regex.Replace(Htmlstring, @"-->", " ");
            Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", " ");
            // All references are decoded in one pass so that "&amp;lt;" yields the
            // text "&lt;" instead of being decoded twice into "<".
            return PlainTextEntityRegex.Replace(Htmlstring, new MatchEvaluator(DecodePlainTextEntity));
        }

        /// <summary>
        /// Maps one character reference matched by <see cref="PlainTextEntityRegex"/> to
        /// its replacement text.
        /// </summary>
        private static string DecodePlainTextEntity(Match entity)
        {
            switch (entity.Groups[1].Value.ToLowerInvariant())
            {
                case "quot":
                case "#34":
                    return "\"";
                case "amp":
                case "#38":
                    return "&";
                case "lt":
                case "#60":
                    return "<";
                case "gt":
                case "#62":
                    return ">";
                case "nbsp":
                case "#160":
                    // NOTE(review): non-breaking spaces are removed, not turned into a
                    // space, exactly as in the original code - confirm this is intended.
                    return "";
                case "iexcl":
                case "#161":
                    return "\u00A1";
                case "cent":
                case "#162":
                    return "\u00A2";
                case "pound":
                case "#163":
                    return "\u00A3";
                case "copy":
                case "#169":
                    return "\u00A9";
                default:
                    // Any other decimal reference becomes a single space.
                    return " ";
            }
        }

        /// <summary>
        /// Parses the HTML, takes the text content of the document and optionally
        /// truncates it.
        /// </summary>
        /// <param name="html">Source HTML.</param>
        /// <param name="length">
        /// Maximum number of characters to return; 0 (the default) or a negative
        /// value disables truncation.
        /// </param>
        /// <returns>
        /// The document's <c>InnerText</c>, cut to <paramref name="length"/> characters
        /// when required; <paramref name="html"/> itself when it is null or empty.
        /// </returns>
        public static string RemoveHtmlTag(this string html, int length = 0)
        {
            if (string.IsNullOrEmpty(html)) return html;

            var doc = new HtmlDocument();
            doc.LoadHtml(html);
            var strText = doc.DocumentNode.InnerText;
            if (length > 0 && strText.Length > length)
            {
                return strText.Substring(0, length);
            }

            return strText;
        }

        /// <summary>
        /// Round-trips an HTML fragment through the HTML parser and returns the
        /// re-serialized markup.
        /// </summary>
        /// <param name="html">Source HTML.</param>
        /// <returns>
        /// The document's <c>InnerHtml</c> after parsing - presumably with unclosed
        /// tags completed by the parser (verify against the HtmlDocument
        /// implementation in use); <paramref name="html"/> itself when it is null or empty.
        /// </returns>
        public static string GetHtmlTag(this string html)
        {
            if (string.IsNullOrEmpty(html)) return html;

            var doc = new HtmlDocument();
            doc.LoadHtml(html);
            return doc.DocumentNode.InnerHtml;
        }

        /// <summary>
        /// Converts an HTML-encoded string into its decoded form.
        /// </summary>
        /// <param name="value">The string to decode.</param>
        /// <returns>The result of <c>System.Net.WebUtility.HtmlDecode</c>.</returns>
        public static string HtmlDecode(this string value)
        {
            return System.Net.WebUtility.HtmlDecode(value);
        }

        /// <summary>
        /// Converts a string into its HTML-encoded form.
        /// </summary>
        /// <param name="value">The string to encode.</param>
        /// <returns>The result of <c>System.Net.WebUtility.HtmlEncode</c>.</returns>
        public static string HtmlEncode(this string value)
        {
            return System.Net.WebUtility.HtmlEncode(value);
        }
    }
  • 相关阅读:
    shingling算法——提取特征,m个hash函数做指纹计算,针对特征hash后变成m维向量,最后利用union-find算法计算相似性
    普林斯顿算法(1.3)并查集(union-find算法)——本质就是一个数 下面的子树代表了连在一起的点
    Cuckoo hash算法分析——其根本思想和bloom filter一致 增加hash函数来解决碰撞 节省了空间但代价是查找次数增加
    Merkle 树——空间换时间,分而治之的hash表,通过根节点是由它的两个子节点内容的哈希值组成来校验数据完整性,定位篡改的数据位置
    图解Skip List——本质是空间换时间的数据结构,在lucene的倒排列表,bigtable,hbase,cassandra的memtable,redis中sorted set中均用到
    LSM Tree 学习笔记——本质是将随机的写放在内存里形成有序的小memtable,然后定期合并成大的table flush到磁盘
    LSM Tree 学习笔记——MemTable通常用 SkipList 来实现
    Raft 为什么是更易理解的分布式一致性算法——(1)Leader在时,由Leader向Follower同步日志 (2)Leader挂掉了,选一个新Leader,Leader选举算法。
    一个php user class
    CI 模板解析器类
  • 原文地址:https://www.cnblogs.com/netlock/p/13924181.html
Copyright © 2011-2022 走看看