zoukankan      html  css  js  c++  java
  • 正则表达式过滤HTML、JS、CSS

    功能用途

    主要用于在提取 HTML 页面内容时,过滤掉其中的 HTML 标签、JS 脚本和 CSS 样式,只保留纯文本。

    示例代码

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.IO;
    using System.Net;
    using System.Net.NetworkInformation;
    using System.Net.Sockets;
    using System.Threading;
    using System.Text.RegularExpressions;
    namespace HtmlRegex
    {
        /// <summary>
        /// Helpers for downloading an HTML page and reducing it to plain text by
        /// stripping scripts, styles, comments, tags and character entities.
        /// </summary>
        public class BaseRegex
        {
            // All patterns must let '.' span line breaks and must ignore tag-name case.
            private const RegexOptions Options = RegexOptions.Singleline | RegexOptions.IgnoreCase;

            // Patterns are built once and shared; Regex instances are immutable and thread-safe.
            private static readonly Regex ScriptBlock = new Regex(@"<script.*?>.*?</script>", Options);
            private static readonly Regex StyleBlock = new Regex(@"<style.*?>.*?</style>", Options);
            private static readonly Regex Comment = new Regex(@"<!--.*?-->", Options);
            private static readonly Regex Tag = new Regex(@"<.*?>", Options);
            // Named (&nbsp;), decimal (&#160;) and hex (&#xA0;) character references.
            private static readonly Regex Entity = new Regex(@"&#?[a-z0-9]{2,};", Options);
            private static readonly Regex BlankLines = new Regex(@"[\r\n]{2,}", Options);

            /// <summary>
            /// Maps the integer encoding selector used by this class to an <see cref="Encoding"/>.
            /// </summary>
            /// <param name="encoding">1 selects UTF-8; any other value selects <see cref="Encoding.Default"/>.</param>
            /// <returns>The selected encoding.</returns>
            private static Encoding ResolveEncoding(int encoding)
            {
                return encoding == 1 ? Encoding.UTF8 : Encoding.Default;
            }

            /// <summary>
            /// Appends one line of text to a file, creating the file if it does not exist.
            /// </summary>
            /// <param name="path">Path of the file to append to.</param>
            /// <param name="encoding">1 for UTF-8; any other value for <see cref="Encoding.Default"/>.</param>
            /// <param name="content">Text to write, followed by a line terminator.</param>
            public void DeBug(string path, int encoding, string content)
            {
                // 'using' guarantees the file handle is released even if WriteLine throws;
                // Dispose also flushes the writer.
                using (StreamWriter writer = new StreamWriter(path, true, ResolveEncoding(encoding)))
                {
                    writer.WriteLine(content);
                }
            }

            /// <summary>
            /// Downloads the resource at <paramref name="url"/> and decodes it as text.
            /// </summary>
            /// <param name="url">Address of the page to download.</param>
            /// <param name="encoding">1 for UTF-8; any other value for <see cref="Encoding.Default"/>.</param>
            /// <returns>The decoded response body.</returns>
            public string getPageContent(string url, int encoding)
            {
                // A short-lived client per call: WebClient is IDisposable and does not
                // support concurrent operations on a single instance.
                using (WebClient client = new WebClient())
                {
                    byte[] payload = client.DownloadData(url);
                    return ResolveEncoding(encoding).GetString(payload);
                }
            }

            /// <summary>
            /// Strips script blocks, style blocks, comments, tags, character entities,
            /// runs of line breaks and all space characters from an HTML string.
            /// </summary>
            /// <param name="html">HTML markup; may be null or empty.</param>
            /// <returns>The remaining text, or an empty string when the input is null or empty.</returns>
            public string checkHtml(string html)
            {
                if (string.IsNullOrEmpty(html))
                {
                    return string.Empty;
                }

                // Script and style bodies go first so their contents never reach the later passes.
                html = ScriptBlock.Replace(html, "");
                html = StyleBlock.Replace(html, "");

                // Comments must be removed before generic tags: the tag pattern would
                // otherwise cut a comment at its first '>' and leave the tail behind.
                html = Comment.Replace(html, "");
                html = Tag.Replace(html, "");
                html = Entity.Replace(html, "");
                html = BlankLines.Replace(html, "");

                // NOTE(review): this removes every space, including those between words.
                // Kept because callers may rely on it; confirm whether only "&nbsp;" was intended.
                html = html.Replace(" ", "");
                return html;
            }
        }
    }
  • 相关阅读:
    SQLServer2005安装提示服务无法启动解决方法
    如何处理SQL Server2005配置管理器打不开的问题!
    如何卸载oracle 10g数据库
    Gesture实现手势滑动效果
    为android虚拟机配置正确的DNS服务器地址
    a链接事件点击函数
    web 音频文件自动播放(兼容所有浏览器)
    关于Jquery中的$.each获取各种返回类型数据的使用方法
    jquery如何在异步方式中给全局变量赋值
    jquery的blur之后,focus获取不到焦点的解决办法
  • 原文地址:https://www.cnblogs.com/shya/p/2439443.html
Copyright © 2011-2022 走看看