zoukankan      html  css  js  c++  java
  • 抓取html 写正则

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Net;
    using System.IO;
    using System.IO.Compression;
    using System.Text.RegularExpressions;
    
    namespace WikiPageCreater.Common
    {
        public class PageHelper
        {
            /// <summary>
            /// 根据 url 获取网页编码
            /// </summary>
            /// <param name="url"></param>
            /// <returns></returns>
            public static string GetEncoding(string url)
            {
                HttpWebRequest request = null;
                HttpWebResponse response = null;
                StreamReader reader = null;
                try
                {
                    request = (HttpWebRequest)WebRequest.Create(url);
                    request.Timeout = 20000;
                    request.AllowAutoRedirect = false;
    
                    response = (HttpWebResponse)request.GetResponse();
                    if (response.StatusCode == HttpStatusCode.OK && response.ContentLength < 1024 * 1024)
                    {
                        if (response.ContentEncoding != null && response.ContentEncoding.Equals("gzip", StringComparison.InvariantCultureIgnoreCase))
                            reader = new StreamReader(new GZipStream(response.GetResponseStream(), CompressionMode.Decompress));
                        else
                            reader = new StreamReader(response.GetResponseStream(), Encoding.ASCII);
    
                        string html = reader.ReadToEnd();
    
                        Regex reg_charset = new Regex(@"charset\b\s*=\s*(?<charset>[^""]*)");
                        if (reg_charset.IsMatch(html))
                        {
                            return reg_charset.Match(html).Groups["charset"].Value;
                        }
                        else if (response.CharacterSet != string.Empty)
                        {
                            return response.CharacterSet;
                        }
                        else
                            return Encoding.Default.BodyName;
                    }
                }
                catch
                {
                }
                finally
                {
    
                    if (response != null)
                    {
                        response.Close();
                        response = null;
                    }
                    if (reader != null)
                        reader.Close();
    
                    if (request != null)
                        request = null;
    
                }
    
                return Encoding.Default.BodyName;
            }
    
            /// <summary>
            /// 根据 url 和 encoding 获取当前url页面的 html 源代码        
           /// </summary>
            /// <param name="url"></param>
            /// <param name="encoding"></param>
            /// <returns></returns>
            public static string GetHtml(string url, Encoding encoding)
            {
                HttpWebRequest request = null;
                HttpWebResponse response = null;
                StreamReader reader = null;
                try
                {
                    request = (HttpWebRequest)WebRequest.Create(url);
                    request.Timeout = 20000;
                    request.AllowAutoRedirect = false;
    
                    response = (HttpWebResponse)request.GetResponse();
                    if (response.StatusCode == HttpStatusCode.OK && response.ContentLength < 1024 * 1024)
                    {
                        if (response.ContentEncoding != null && response.ContentEncoding.Equals("gzip", StringComparison.InvariantCultureIgnoreCase))
                            reader = new StreamReader(new GZipStream(response.GetResponseStream(), CompressionMode.Decompress), encoding);
                        else
                            reader = new StreamReader(response.GetResponseStream(), encoding);
                        string html = reader.ReadToEnd();
    
                        return html;
                    }
                }
                catch
                {
                }
                finally
                {
    
                    if (response != null)
                    {
                        response.Close();
                        response = null;
                    }
                    if (reader != null)
                        reader.Close();
    
                    if (request != null)
                        request = null;
    
                }
    
                return string.Empty;
            }
        }
    }

    抓取后

    Regex regex = new Regex("<a.*?href=\"(?<href>.*?)\".*?title=\"(?<title>.*?)\".*?>", RegexOptions.Compiled);

    可以取到href和title

    Regex reg = new Regex(@"(?is)<img[^>]*?scr=(['""\s]?)([^'""\s]+)\1[^>]*?>");
    抓取图片src
    Regex regexObj = new Regex("<li class=\"vImg\"><img.*?src=\"(?<src>.+?)\".*?></li>.*?<li class=\"vStatus\">.*?<a.*?href=\"(?<href>.+?)\".*?>(?<title>.*?)</a></li>", RegexOptions.Singleline);
    
    
    可以获取图片src,title,href,其它信息可以类推,一次截取出来  
     
  • 相关阅读:
    springmvc到底怎么工作的
    (netty专题)初步认识BIO、NIO、AIO
    dubbo基本使用理解
    warning: ignoring option PermSize=512m; support was removed in 8.0解决
    面试都看那些
    MySQL——通过EXPLAIN分析SQL的执行计划
    springboot中的json、gson、fastjson如何使用与日期格式转换
    如何生成一个不重复的四位数
    深入理解SpringCloud之Gateway 接上篇Webflux快速入门
    Layui 手册2
  • 原文地址:https://www.cnblogs.com/wangchuang/p/2507392.html
Copyright © 2011-2022 走看看