zoukankan      html  css  js  c++  java
  • 正则表达式过滤HTML、JS、CSS

    功能用途

    主要用于在提取 HTML 页面内容时，过滤掉其中的标签、脚本和样式。

    示例代码

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.IO;
    using System.Net;
    using System.Net.NetworkInformation;
    using System.Net.Sockets;
    using System.Threading;
    using System.Text.RegularExpressions;
    namespace HtmlRegex
    {
        /// <summary>
        /// Helpers for downloading an HTML page and reducing it to its text content
        /// by stripping scripts, styles, comments, tags, entities and whitespace
        /// with regular expressions.
        /// </summary>
        /// <remarks>
        /// Regex-based stripping is a heuristic, not an HTML parser: malformed markup
        /// or markup-like text inside scripts can still confuse it.
        /// </remarks>
        public class BaseRegex
        {
            // Singleline lets '.' cross line breaks, so multi-line script/style/comment
            // blocks are matched as a whole.
            private const RegexOptions PatternOptions = RegexOptions.Singleline | RegexOptions.IgnoreCase;

            // The patterns are built once and reused. Only the ".*?" forms are kept:
            // the former ".+?" variants skipped over an empty <script></script> pair
            // and consumed everything up to the NEXT closing tag, deleting visible text.
            private static readonly Regex ScriptBlock = new Regex(@"<script.*?>.*?</script>", PatternOptions);
            private static readonly Regex StyleBlock = new Regex(@"<style.*?>.*?</style>", PatternOptions);

            // ".*?" (not ".+?") so that an empty comment "<!---->" is matched on its own.
            private static readonly Regex CommentBlock = new Regex(@"<!--.*?-->", PatternOptions);

            // Any remaining tag, including self-closing ones.
            private static readonly Regex AnyTag = new Regex(@"<.*?>", PatternOptions);

            // Named, decimal and hexadecimal character references (&nbsp; &#160; &#xA0;).
            private static readonly Regex CharacterEntity = new Regex(@"&(?:#x[0-9a-f]+|#\d+|[a-z][a-z0-9]+);", PatternOptions);

            // Runs of two or more line-break/space characters. Written with explicit
            // escapes so the pattern does not depend on this file's own line endings.
            private static readonly Regex WhitespaceRun = new Regex(@"[\r\n ]{2,}", PatternOptions);

            private readonly WebClient web = new WebClient();

            /// <summary>
            /// Appends <paramref name="content"/> as one line to the file at <paramref name="path"/>.
            /// </summary>
            /// <param name="path">File to append to.</param>
            /// <param name="encoding">1 selects UTF-8; any other value selects <see cref="Encoding.Default"/>.</param>
            /// <param name="content">Text to write, followed by a line terminator.</param>
            public void DeBug(string path,int encoding,string content)
            {
                // 'using' guarantees the file handle is released even if the write throws.
                using (StreamWriter writer = new StreamWriter(path, true, ResolveEncoding(encoding)))
                {
                    writer.WriteLine(content);
                }
            }

            /// <summary>
            /// Downloads the resource at <paramref name="url"/> and decodes it to a string.
            /// </summary>
            /// <param name="url">Address of the page to download.</param>
            /// <param name="encoding">1 selects UTF-8; any other value selects <see cref="Encoding.Default"/>.</param>
            /// <returns>The decoded response body.</returns>
            public string getPageContent(string url, int encoding)
            {
                byte[] payload = web.DownloadData(url);
                return ResolveEncoding(encoding).GetString(payload);
            }

            /// <summary>
            /// Removes JavaScript, CSS, comments, tags, character entities, runs of
            /// line breaks/spaces and finally every remaining space from <paramref name="html"/>.
            /// </summary>
            /// <param name="html">Markup to clean; may be null or empty.</param>
            /// <returns>The remaining text, or an empty string when the input is null or empty.</returns>
            public string checkHtml(string html)
            {
                if (string.IsNullOrEmpty(html))
                {
                    return string.Empty;
                }

                // Strip JS and CSS blocks first so their bodies never reach the output.
                html = ScriptBlock.Replace(html, "");
                html = StyleBlock.Replace(html, "");

                // Comments must go before the generic tag pattern: otherwise a comment
                // containing '>' is cut in half by it and its tail leaks into the text.
                html = CommentBlock.Replace(html, "");
                html = AnyTag.Replace(html, "");
                html = CharacterEntity.Replace(html, "");
                html = WhitespaceRun.Replace(html, "");
                return html.Replace(" ", "");
            }

            // Maps the numeric encoding selector shared by the public methods to an Encoding.
            private static Encoding ResolveEncoding(int encoding)
            {
                return encoding == 1 ? Encoding.UTF8 : Encoding.Default;
            }
        }
    }
  • 相关阅读:
    LeetCode 120:三角形最小路径和
    守护进程
    G711时间戳增量和数据包大小的关系
    H264防止竞争机制
    硬编码帧率错误导致的浏览器不能播放的问题
    GCC inline
    单例模式的双检锁的隐患和优化
    Java中异常捕获子类异常捕获在父类异常前面,即小范围先被捕获
    线程运行流程图
    将二维数组转为稀疏数组
  • 原文地址:https://www.cnblogs.com/shya/p/2439443.html
Copyright © 2011-2022 走看看