zoukankan      html  css  js  c++  java
  • 正则表达式获取博客园随笔1

    正则表达式获取博客园随笔(一)

    晚上起先和朋友们跑步去了,然后回来之后洗了个澡,打开VS新建项目发现都会弹出一个问题

    然后就去找万能的度娘了,http://bbs.csdn.net/topics/390514964?page=1#post-395015041

    25楼真相,卸载掉那2个补丁就可以了,不过在卸载第一个补丁的时候你需要停止他指出的那个服务。

      我当初刚开始接触正则是去年公司主管让我去学,然后发了个网址给我:http://www.cnblogs.com/ie421/archive/2008/07/23/1249896.html

    看完后受益颇大,下面就开始正题。

      之所以要获取博客园的内容是因为博客园造就了我,而大家也都是在博客园里相识,所以我们就以博客园为例子。

      下面上传的这个是当初主管给我的一个类,大家可以参考参考,我今天的内容用到了里面的GetString()这个方法。在运行之前要引用System.Web

     View Code

    using System;
    using System.Collections.Generic;
    using System.IO;
    using System.IO.Compression;
    using System.Linq;
    using System.Net;
    using System.Text;
    using System.Web;

    namespace CnblogsSearch
    {
    public class HttpClient
    {
    #region fields
    // Backing field for KeepContext; when true GetResponse() copies cookies and referer into 'context'.
    private bool keepContext;
    // Backing field for DefaultLanguage. The only use in the visible code is a commented-out Accept-Language header.
    private string defaultLanguage = "zh-CN";
    // Fallback used by GetString() when no usable encoding name is detected.
    private Encoding defaultEncoding = Encoding.UTF8;
    // Value sent as the Accept request header.
    private string accept = "*/*";
    // Value sent as the User-Agent request header.
    private string userAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";
    // Requested HTTP method; CreateRequest() switches it to POST when form data or files are present.
    private HttpVerb verb = HttpVerb.GET;
    // Cookies and referer carried between requests.
    private HttpClientContext context;
    // Files to upload; a non-empty list forces a multipart POST.
    private readonly List<HttpUploadingFile> files = new List<HttpUploadingFile>();
    // Form fields to post.
    private readonly Dictionary<string, string> postingData = new Dictionary<string, string>();
    // Address of the resource to request.
    private string url;
    // Headers of the most recent response; set by GetResponse(), cleared by Reset().
    private WebHeaderCollection responseHeaders;
    // Range start passed to HttpWebRequest.AddRange; 0 means no range.
    private int startPoint;
    // Range end passed to HttpWebRequest.AddRange; 0 means "to the end" when startPoint is set.
    private int endPoint;
    // When true a POST body is encoded as multipart/form-data even if no files are attached.
    public bool boundaryed;
    // Backing field for EncodingType (an encoding name understood by Encoding.GetEncoding).
    private string encodingType = "utf-8";
    // Backing field for TimeOut; assigned to HttpWebRequest.Timeout.
    private int timeOut = 10000;

    #endregion

    #region events
    /// <summary>
    /// Progress notification. GetBytes() raises it after every chunk read from the response stream.
    /// </summary>
    public event EventHandler<StatusUpdateEventArgs> StatusUpdate;

    /// <summary>
    /// Raises <see cref="StatusUpdate"/> if anyone is subscribed.
    /// </summary>
    /// <param name="e">Progress data handed to the subscribers.</param>
    private void OnStatusUpdate(StatusUpdateEventArgs e)
    {
        // Work on a snapshot so a concurrent unsubscribe cannot null the delegate between check and call.
        EventHandler<StatusUpdateEventArgs> handler = StatusUpdate;
        if (handler == null)
        {
            return;
        }

        handler(this, e);
    }
    #endregion

    #region properties

    /// <summary>
    /// Name of the text encoding (as accepted by Encoding.GetEncoding) used to URL-encode form fields
    /// and, when encodeMemory is true, to write the POST body.
    /// GetEncodingFromHeaders()/GetEncodingFromBody() overwrite it with the charset they detect.
    /// </summary>
    public string EncodingType
    {
    get
    {
    return encodingType;
    }
    set
    {
    encodingType = value;
    }
    }

    /// <summary>
    /// Whether to ask for gzip-compressed transfer (adds "Accept-Encoding: gzip" to the request).
    /// Reset() sets it back to false.
    /// </summary>
    public bool IsGzip { get; set; }

    /// <summary>
    /// Whether the POST body writer uses the encoding named by EncodingType
    /// instead of the StreamWriter default.
    /// </summary>
    public bool encodeMemory { get; set; }
    /// <summary>
    /// Whether cookies and referer are kept automatically between requests.
    /// </summary>
    public bool KeepContext
    {
    get { return keepContext; }
    set { keepContext = value; }
    }
    // Cookie container attached to every request; created in the constructor.
    public CookieContainer cookie;
    /// <summary>
    /// Preferred response language.
    /// NOTE(review): the visible code never sends this value (the Accept-Language line is commented out).
    /// </summary>
    public string DefaultLanguage
    {
    get { return defaultLanguage; }
    set { defaultLanguage = value; }
    }

    /// <summary>
    /// Encoding GetString() falls back to when no usable encoding name is obtained
    /// from the HTTP headers or the response body.
    /// </summary>
    public Encoding DefaultEncoding
    {
    get { return defaultEncoding; }
    set { defaultEncoding = value; }
    }

    /// <summary>
    /// Value assigned to HttpWebRequest.Timeout for each request (defaults to 10000).
    /// </summary>
    public int TimeOut
    {
    get
    {
    return timeOut;
    }
    set
    {
    timeOut = value;
    }
    }
    /// <summary>
    /// HTTP method to use (GET, POST or HEAD).
    /// </summary>
    public HttpVerb Verb
    {
    get { return verb; }
    set { verb = value; }
    }

    /// <summary>
    /// Files to upload. If not empty the request is automatically sent as POST.
    /// </summary>
    public List<HttpUploadingFile> Files
    {
    get { return files; }
    }

    /// <summary>
    /// Additional key/value pairs appended to a URL-encoded POST body after PostingData.
    /// Being a list, it can carry the same key more than once. May be null; Reset() clears it.
    /// </summary>
    public List<RepeatPostData> repeatPostData
    {
    get;
    set;
    }

    /// <summary>
    /// Form fields to send.
    /// </summary>
    public Dictionary<string, string> PostingData
    {

    get { return postingData; }
    }

    /// <summary>
    /// Gets or sets the address of the requested resource.
    /// </summary>
    public string Url
    {
    get { return url; }
    set { url = value; }
    }

    /// <summary>
    /// HTTP headers of the most recent response (set by GetResponse(), null after Reset()).
    /// </summary>
    public WebHeaderCollection ResponseHeaders
    {
    get { return responseHeaders; }
    }

    /// <summary>
    /// Gets or sets the value of the Accept request header.
    /// </summary>
    public string Accept
    {
    get { return accept; }
    set { accept = value; }
    }

    /// <summary>
    /// Gets or sets the value of the User-Agent request header.
    /// </summary>
    public string UserAgent
    {
    get { return userAgent; }
    set { userAgent = value; }
    }

    /// <summary>
    /// Gets or sets the cookies and referer carried between requests.
    /// </summary>
    public HttpClientContext Context
    {
    get { return context; }
    set { context = value; }
    }

    /// <summary>
    /// Gets or sets the start offset of the requested range
    /// (for resumable or multi-threaded downloads). 0 means no range header.
    /// </summary>
    public int StartPoint
    {
    get { return startPoint; }
    set { startPoint = value; }
    }

    /// <summary>
    /// Gets or sets the end offset of the requested range
    /// (for resumable or multi-threaded downloads).
    /// 0 means "everything from StartPoint to the end of the resource".
    /// </summary>
    public int EndPoint
    {
    get { return endPoint; }
    set { endPoint = value; }
    }

    #endregion

    #region constructors
    /// <summary>
    /// Creates a client with no URL, a fresh context, and context keeping switched off.
    /// </summary>
    public HttpClient()
        : this(null, null, false)
    {
    }

    /// <summary>
    /// Creates a client for the given address with a fresh context.
    /// </summary>
    /// <param name="url">Address of the resource to request.</param>
    public HttpClient(string url)
        : this(url, null, false)
    {
    }

    /// <summary>
    /// Creates a client for the given address that starts from an existing context.
    /// </summary>
    /// <param name="url">Address of the resource to request.</param>
    /// <param name="context">Cookies and referer to start with; null creates an empty context.</param>
    public HttpClient(string url, HttpClientContext context)
        : this(url, context, false)
    {
    }

    /// <summary>
    /// Creates a client for the given address.
    /// </summary>
    /// <param name="url">Address of the resource to request.</param>
    /// <param name="context">Cookies and referer to start with; null creates an empty context.</param>
    /// <param name="keepContext">Whether cookies and referer are kept automatically between requests.</param>
    public HttpClient(string url, HttpClientContext context, bool keepContext)
    {
        this.url = url;
        this.keepContext = keepContext;
        this.context = context ?? new HttpClientContext();
        cookie = new CookieContainer();
    }
    #endregion

    #region AttachFile
    /// <summary>
    /// Queues a file on disk for upload with the next request.
    /// </summary>
    /// <param name="fileName">Path of the file to upload.</param>
    /// <param name="fieldName">Form field name (the fieldName in &lt;input type=file name=fieldName&gt;).</param>
    public void AttachFile(string fileName, string fieldName)
    {
        files.Add(new HttpUploadingFile(fileName, fieldName));
    }

    /// <summary>
    /// Queues in-memory content for upload with the next request.
    /// </summary>
    /// <param name="data">Content of the file to upload.</param>
    /// <param name="fileName">File name reported to the server.</param>
    /// <param name="fieldName">Form field name (the fieldName in &lt;input type=file name=fieldName&gt;).</param>
    public void AttachFile(byte[] data, string fileName, string fieldName)
    {
        files.Add(new HttpUploadingFile(data, fileName, fieldName));
    }
    #endregion

    /// <summary>
    /// Clears PostingData, Files, repeatPostData, StartPoint, EndPoint and ResponseHeaders,
    /// switches IsGzip off and sets Verb back to GET.
    /// After sending a request that used any of these, call this method (or reset the properties by hand)
    /// so the next request is not affected.
    /// </summary>
    public void Reset()
    {
        // Request payload.
        postingData.Clear();
        files.Clear();
        if (repeatPostData != null)
        {
            repeatPostData.Clear();
        }

        // Request options.
        verb = HttpVerb.GET;
        IsGzip = false;
        startPoint = 0;
        endPoint = 0;

        // State left over from the previous response.
        responseHeaders = null;
    }
    /// <summary>
    /// Local IP address to bind outgoing connections to. When null no binding delegate is installed.
    /// </summary>
    public string ip;

    /// <summary>
    /// ServicePoint callback that selects the local endpoint: the address parsed from <see cref="ip"/>
    /// with port 0. The three callback arguments are not used.
    /// </summary>
    private IPEndPoint BindIPEndPointCallback(ServicePoint servicePoint, IPEndPoint remoteEndPoint, int retryCount)
    {
        IPAddress localAddress = IPAddress.Parse(ip);
        IPEndPoint localEndPoint = new IPEndPoint(localAddress, 0);
        return localEndPoint;
    }

    /// <summary>
    /// Raw value for a "Cookie" request header. When non-empty it is added verbatim
    /// to every non-HEAD request built by CreateRequest().
    /// </summary>
    public string cookieStr = "";

    /// <summary>
    /// Builds the HttpWebRequest for the current state of this client.
    /// For a POST the body is written to the request stream before the request is returned.
    /// Side effect: Verb becomes POST when PostingData or Files is not empty.
    /// </summary>
    /// <returns>The configured request. A HEAD request carries no body, Cookie header or range.</returns>
    private HttpWebRequest CreateRequest()
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);

        req.CookieContainer = cookie;
        req.Accept = accept;
        req.UserAgent = userAgent;
        req.KeepAlive = true;
        req.AllowAutoRedirect = true;
        req.Timeout = TimeOut;

        if (IsGzip)
        {
            req.Headers.Add("Accept-Encoding", "gzip");
        }
        if (ip != null)
        {
            req.ServicePoint.BindIPEndPointDelegate = new BindIPEndPoint(BindIPEndPointCallback);
        }
        if (context.Cookies != null)
        {
            req.CookieContainer.Add(context.Cookies);
        }
        if (!string.IsNullOrEmpty(context.Referer))
        {
            req.Referer = context.Referer;
        }

        if (verb == HttpVerb.HEAD)
        {
            req.Method = "HEAD";
            return req;
        }

        if (postingData.Count > 0 || files.Count > 0)
        {
            verb = HttpVerb.POST;
        }
        if (!string.IsNullOrEmpty(cookieStr))
        {
            req.Headers.Add("Cookie", cookieStr);
        }

        // Set the range before the request stream is opened so the header is in place
        // no matter when the framework decides to send the headers.
        if (startPoint != 0)
        {
            if (endPoint != 0)
            {
                req.AddRange(startPoint, endPoint);
            }
            else
            {
                req.AddRange(startPoint);
            }
        }

        if (verb == HttpVerb.POST)
        {
            req.Method = "POST";

            using (MemoryStream body = new MemoryStream())
            {
                // NOTE(review): Encoding.GetEncoding("utf-8") emits a byte-order mark, so with
                // encodeMemory == true the body probably starts with a BOM - confirm this is wanted.
                StreamWriter writer = encodeMemory
                    ? new StreamWriter(body, Encoding.GetEncoding(EncodingType))
                    : new StreamWriter(body);

                if (files.Count > 0 || boundaryed)
                {
                    WriteMultipartBody(req, writer, body);
                }
                else
                {
                    WriteUrlEncodedBody(req, writer);
                }

                writer.Flush();

                using (Stream stream = req.GetRequestStream())
                {
                    body.WriteTo(stream);
                }
            }
        }

        return req;
    }

    /// <summary>
    /// Writes PostingData and Files as a multipart/form-data body and sets the matching Content-Type.
    /// </summary>
    private void WriteMultipartBody(HttpWebRequest req, StreamWriter writer, MemoryStream body)
    {
        // Multipart bodies separate lines with CRLF.
        const string newLine = "\r\n";
        string boundary = Guid.NewGuid().ToString().Replace("-", "");
        req.ContentType = "multipart/form-data; boundary=" + boundary;

        foreach (KeyValuePair<string, string> field in postingData)
        {
            writer.Write("--" + boundary + newLine);
            writer.Write("Content-Disposition: form-data; name=\"{0}\"{1}{1}", field.Key, newLine);
            writer.Write(field.Value + newLine);
        }

        foreach (HttpUploadingFile file in files)
        {
            writer.Write("--" + boundary + newLine);
            writer.Write(
                "Content-Disposition: form-data; name=\"{0}\"; filename=\"{1}\"{2}",
                file.FieldName,
                file.FileName,
                newLine);
            // The part type is fixed: HttpUploadingFile carries no content type of its own.
            writer.Write("Content-Type: image/jpeg" + newLine + newLine);

            // Push the buffered text out before the raw bytes so the part stays in order.
            writer.Flush();
            body.Write(file.Data, 0, file.Data.Length);
            writer.Write(newLine);
        }

        // Exactly one closing delimiter, after the last part (also when there are no files).
        writer.Write("--" + boundary + "--" + newLine);
    }

    /// <summary>
    /// Writes PostingData followed by repeatPostData as an application/x-www-form-urlencoded body
    /// and sets the matching Content-Type.
    /// </summary>
    private void WriteUrlEncodedBody(HttpWebRequest req, StreamWriter writer)
    {
        req.ContentType = "application/x-www-form-urlencoded";

        bool hasRepeated = repeatPostData != null && repeatPostData.Count > 0;
        StringBuilder sb = new StringBuilder();

        // Resolve the encoding only when there is something to encode, so an empty POST
        // does not fail on an unusable EncodingType.
        if (postingData.Count > 0 || hasRepeated)
        {
            Encoding formEncoding = Encoding.GetEncoding(EncodingType);

            foreach (KeyValuePair<string, string> field in postingData)
            {
                sb.AppendFormat(
                    "{0}={1}&",
                    HttpUtility.UrlEncode(field.Key, formEncoding),
                    HttpUtility.UrlEncode(field.Value, formEncoding));
            }

            if (hasRepeated)
            {
                foreach (RepeatPostData item in repeatPostData)
                {
                    sb.AppendFormat(
                        "{0}={1}&",
                        HttpUtility.UrlEncode(item.key, formEncoding),
                        HttpUtility.UrlEncode(item.value, formEncoding));
                }
            }
        }

        // Drop the trailing '&'.
        if (sb.Length > 0)
        {
            sb.Length--;
        }
        writer.Write(sb.ToString());
    }

    /// <summary>
    /// Sends a new request and returns the response.
    /// Records the response headers and, when KeepContext is on, stores the response cookies and
    /// the requested URL (as referer) in the context for following requests.
    /// This method never raises the StatusUpdate event.
    /// </summary>
    /// <returns>The HttpWebResponse of the request.</returns>
    public HttpWebResponse GetResponse()
    {
        HttpWebRequest request = CreateRequest();
        HttpWebResponse response = (HttpWebResponse)request.GetResponse();

        responseHeaders = response.Headers;

        if (!keepContext)
        {
            return response;
        }

        context.Cookies = response.Cookies;
        context.Referer = url;
        cookie.Add(context.Cookies);
        return response;
    }

    /// <summary>
    /// Sends a new request and returns the stream of the response body.
    /// This method never raises the StatusUpdate event.
    /// </summary>
    /// <returns>Stream containing the response body.</returns>
    public Stream GetStream()
    {
        HttpWebResponse response = GetResponse();
        return response.GetResponseStream();
    }
    /// <summary>
    /// Absolute URI the last GetBytes() response actually came from (taken from the response's ResponseUri).
    /// </summary>
    public string responseURL;

    /// <summary>
    /// Sends a new request and returns the response body as a byte array.
    /// Raises StatusUpdate after every chunk read; the reported total is the Content-Length header
    /// cast to int (-1 when the server does not send one).
    /// The response and its stream are always released, also when reading fails.
    /// </summary>
    /// <returns>Byte array containing the response body.</returns>
    public byte[] GetBytes()
    {
        using (HttpWebResponse res = GetResponse())
        {
            int length = (int)res.ContentLength;
            responseURL = res.ResponseUri.AbsoluteUri;

            using (Stream rs = res.GetResponseStream())
            using (MemoryStream memoryStream = new MemoryStream())
            {
                // Chunk size is kept at 256 bytes: it determines how often StatusUpdate fires.
                byte[] buffer = new byte[0x100];
                int read;
                while ((read = rs.Read(buffer, 0, buffer.Length)) > 0)
                {
                    memoryStream.Write(buffer, 0, read);
                    OnStatusUpdate(new StatusUpdateEventArgs((int)memoryStream.Length, length));
                }

                return memoryStream.ToArray();
            }
        }
    }

    /// <summary>
    /// Sends a new request and decodes the response body. The encoding name comes from the
    /// Content-Type header or, failing that, from GetEncodingFromBody(); DefaultEncoding is used when
    /// no name is obtained or the name is not a known encoding.
    /// A body whose Content-Encoding header mentions gzip (in any letter case) is decompressed first.
    /// Calling this method raises the StatusUpdate event.
    /// </summary>
    /// <returns>The decoded string.</returns>
    public string GetString()
    {
        byte[] data = GetBytes();

        // The header indexer matches names case-insensitively, unlike a search through AllKeys.
        string contentEncoding = responseHeaders["Content-Encoding"];
        if (contentEncoding != null &&
            contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            data = GZipDecompress(data);
        }

        string encodingName = GetEncodingFromHeaders();
        if (encodingName == null)
        {
            encodingName = GetEncodingFromBody(data);
        }

        Encoding encoding = defaultEncoding;
        if (encodingName != null)
        {
            try
            {
                encoding = Encoding.GetEncoding(encodingName);
            }
            catch (ArgumentException)
            {
                // Unknown encoding name: keep the default encoding.
            }
        }

        return encoding.GetString(data);
    }

    /// <summary>
    /// Sends a new request and decodes the response body with the given encoding.
    /// Unlike the parameterless overload, no gzip decompression or charset detection takes place.
    /// Calling this method raises the StatusUpdate event.
    /// </summary>
    /// <param name="encoding">Encoding to decode the body with.</param>
    /// <returns>The decoded string.</returns>
    public string GetString(Encoding encoding)
    {
        byte[] body = GetBytes();
        string text = encoding.GetString(body);
        return text;
    }

    /// <summary>
    /// Decompresses a gzip-compressed buffer.
    /// </summary>
    /// <param name="data">The gzip-compressed bytes.</param>
    /// <returns>The decompressed bytes.</returns>
    private byte[] GZipDecompress(byte[] data)
    {
        using (MemoryStream inflated = new MemoryStream())
        {
            using (GZipStream unzip = new GZipStream(new MemoryStream(data), CompressionMode.Decompress))
            {
                byte[] chunk = new byte[40960];

                // Read returns 0 once the compressed stream is exhausted.
                for (int count = unzip.Read(chunk, 0, chunk.Length);
                     count != 0;
                     count = unzip.Read(chunk, 0, chunk.Length))
                {
                    inflated.Write(chunk, 0, count);
                }
            }

            return inflated.ToArray();
        }
    }

    /// <summary>
    /// Extracts the charset parameter from the Content-Type header of the last response.
    /// Only the charset value itself is returned: following parameters (after ';'),
    /// surrounding whitespace and quotes are removed, and "charset=" is matched in any letter case.
    /// Side effect: a detected charset is also stored in EncodingType.
    /// </summary>
    /// <returns>The charset name, or null when there is no response, no Content-Type header
    /// or no non-empty charset parameter.</returns>
    private string GetEncodingFromHeaders()
    {
        const string marker = "charset=";

        if (responseHeaders == null)
        {
            return null;
        }

        string contentType = responseHeaders["Content-Type"];
        if (contentType == null)
        {
            return null;
        }

        int start = contentType.IndexOf(marker, StringComparison.OrdinalIgnoreCase);
        if (start == -1)
        {
            return null;
        }

        string charset = contentType.Substring(start + marker.Length);

        // Cut off any parameter that follows, e.g. "utf-8; boundary=...".
        int end = charset.IndexOf(';');
        if (end != -1)
        {
            charset = charset.Substring(0, end);
        }

        charset = charset.Trim().Trim('"', '\'').Trim();
        if (charset.Length == 0)
        {
            return null;
        }

        EncodingType = charset;
        return charset;
    }

    /// <summary>
    /// Looks for the first "charset=" (any letter case) in the response body and reads the
    /// encoding name that follows it. Handles both content="text/html; charset=gb2312" and
    /// charset="utf-8" forms.
    /// Side effect: a detected name is stored in EncodingType.
    /// </summary>
    /// <param name="data">The (already decompressed) response body.</param>
    /// <returns>The current EncodingType: the detected name, or the previous value when the body
    /// declares no charset. It is therefore not null unless EncodingType was set to null.</returns>
    private string GetEncodingFromBody(byte[] data)
    {
        const string marker = "charset=";

        // An ASCII view is enough: the marker and encoding names only use ASCII characters.
        string dataAsAscii = Encoding.ASCII.GetString(data);

        int markerIndex = dataAsAscii.IndexOf(marker, StringComparison.OrdinalIgnoreCase);
        if (markerIndex == -1)
        {
            return EncodingType;
        }

        int start = markerIndex + marker.Length;

        // Skip an opening quote, as in <meta charset="utf-8">.
        if (start < dataAsAscii.Length && (dataAsAscii[start] == '"' || dataAsAscii[start] == '\''))
        {
            start++;
        }

        // The name runs until the first character that cannot be part of an encoding name
        // (quote, '>', ';', whitespace, ...).
        int end = start;
        while (end < dataAsAscii.Length && IsEncodingNameChar(dataAsAscii[end]))
        {
            end++;
        }

        if (end > start)
        {
            EncodingType = dataAsAscii.Substring(start, end - start);
        }

        return EncodingType;
    }

    /// <summary>
    /// Tells whether a character may appear inside an encoding name such as "utf-8" or "iso-8859-1".
    /// </summary>
    private static bool IsEncodingNameChar(char c)
    {
        return (c >= 'a' && c <= 'z')
            || (c >= 'A' && c <= 'Z')
            || (c >= '0' && c <= '9')
            || c == '-' || c == '_' || c == '.' || c == ':';
    }

    /// <summary>
    /// Sends a new HEAD request and returns the length of the resource.
    /// Calls Reset() first, so PostingData, Files, StartPoint, EndPoint, IsGzip and ResponseHeaders
    /// are cleared (not merely ignored) and Verb is GET again afterwards - also when the request fails.
    /// </summary>
    /// <returns>The Content-Length of the response cast to int (-1 when the server sends none).</returns>
    public int HeadContentLength()
    {
        Reset();
        HttpVerb lastVerb = verb;
        verb = HttpVerb.HEAD;
        try
        {
            using (HttpWebResponse res = GetResponse())
            {
                return (int)res.ContentLength;
            }
        }
        finally
        {
            // Restore the verb even if the request throws, so the client is not stuck on HEAD.
            verb = lastVerb;
        }
    }

    /// <summary>
    /// Sends a new request and saves the response body to a file, using the
    /// FileExistsAction.Overwrite option when the file already exists.
    /// Calling this method raises the StatusUpdate event.
    /// </summary>
    /// <param name="fileName">Path of the file to save to.</param>
    public void SaveAsFile(string fileName)
    {
        const FileExistsAction whenFileExists = FileExistsAction.Overwrite;
        SaveAsFile(fileName, whenFileExists);
    }

    /// <summary>
    /// Sends a new request and saves the response body to a file.
    /// The body is downloaded before the file is looked at, so the request is made
    /// (and StatusUpdate is raised) even when nothing ends up being written.
    /// </summary>
    /// <param name="fileName">Path of the file to save to.</param>
    /// <param name="existsAction">What to do when the file already exists:
    /// Overwrite replaces its content, Append adds to its end, any other value leaves it untouched.</param>
    /// <returns>Whether data was written to the target file.</returns>
    public bool SaveAsFile(string fileName, FileExistsAction existsAction)
    {
        byte[] data = GetBytes();

        switch (existsAction)
        {
            case FileExistsAction.Overwrite:
                // WriteAllBytes creates the file or truncates an existing one, so no bytes of a
                // longer previous file are left behind.
                File.WriteAllBytes(fileName, data);
                return true;

            case FileExistsAction.Append:
                using (FileStream target = new FileStream(fileName, FileMode.Append, FileAccess.Write))
                {
                    target.Write(data, 0, data.Length);
                }
                return true;

            default:
                if (File.Exists(fileName))
                {
                    return false;
                }

                File.WriteAllBytes(fileName, data);
                return true;
        }
    }
    }

    /// <summary>
    /// State carried from one request to the next: cookies and referer.
    /// </summary>
    public class HttpClientContext
    {
        /// <summary>
        /// Cookies to add to the next request; null when there are none.
        /// </summary>
        public CookieCollection Cookies { get; set; }

        /// <summary>
        /// Value for the Referer header of the next request; null or empty sends none.
        /// </summary>
        public string Referer { get; set; }
    }

    /// <summary>
    /// One key/value pair of form data. Kept in a list, so the same key can be posted several times.
    /// </summary>
    public class RepeatPostData
    {
        /// <summary>Form field name.</summary>
        public string key
        {
            get;
            set;
        }

        /// <summary>Form field value.</summary>
        public string value
        {
            get;
            set;
        }
    }

    /// <summary>
    /// HTTP methods supported by HttpClient.
    /// </summary>
    public enum HttpVerb
    {
        GET = 0,
        POST = 1,
        HEAD = 2
    }

    /// <summary>
    /// What SaveAsFile does when the target file already exists.
    /// </summary>
    public enum FileExistsAction
    {
        /// <summary>Replace the existing file.</summary>
        Overwrite = 0,

        /// <summary>Add the data to the end of the existing file.</summary>
        Append = 1,

        /// <summary>Leave the existing file untouched.</summary>
        Cancel = 2
    }

    /// <summary>
    /// A file queued for upload: its content plus the file name and form field name sent with it.
    /// </summary>
    public class HttpUploadingFile
    {
        /// <summary>File name (or path) reported in the filename parameter of the upload part.</summary>
        public string FileName { get; set; }

        /// <summary>Name of the form field the file is posted under.</summary>
        public string FieldName { get; set; }

        /// <summary>Content of the file.</summary>
        public byte[] Data { get; set; }

        /// <summary>
        /// Loads the content of a file on disk.
        /// </summary>
        /// <param name="fileName">Path of the file to read; also used as FileName.</param>
        /// <param name="fieldName">Name of the form field.</param>
        public HttpUploadingFile(string fileName, string fieldName)
        {
            FileName = fileName;
            FieldName = fieldName;

            // ReadAllBytes opens the file for reading only (so read-only files work) and keeps
            // reading until the whole file is in memory; a single Stream.Read may return less.
            Data = File.ReadAllBytes(fileName);
        }

        /// <summary>
        /// Wraps content that is already in memory.
        /// </summary>
        /// <param name="data">Content of the file.</param>
        /// <param name="fileName">File name reported to the server.</param>
        /// <param name="fieldName">Name of the form field.</param>
        public HttpUploadingFile(byte[] data, string fileName, string fieldName)
        {
            Data = data;
            FileName = fileName;
            FieldName = fieldName;
        }
    }

    /// <summary>
    /// Download progress reported through the StatusUpdate event.
    /// </summary>
    public class StatusUpdateEventArgs : EventArgs
    {
        /// <summary>
        /// Creates the progress data.
        /// </summary>
        /// <param name="got">Number of bytes downloaded so far.</param>
        /// <param name="total">Total number of bytes of the resource.</param>
        public StatusUpdateEventArgs(int got, int total)
        {
            BytesGot = got;
            BytesTotal = total;
        }

        /// <summary>
        /// Number of bytes downloaded so far.
        /// </summary>
        public int BytesGot { get; private set; }

        /// <summary>
        /// Total number of bytes of the resource.
        /// </summary>
        public int BytesTotal { get; private set; }
    }
    }

      然后我们先根据这个方法获取博客园首页的源代码

     View Code

    /// <summary>
    /// Downloads the source of a page.
    /// "http://" is put in front of an address that has no http:// or https:// scheme.
    /// A failed download is retried, but only a limited number of times; the error of the
    /// last attempt is then passed on to the caller instead of retrying forever.
    /// </summary>
    /// <param name="url">Address of the page, with or without scheme.</param>
    /// <returns>The decoded page source.</returns>
    /// <exception cref="System.ArgumentException">url is null or empty.</exception>
    /// <exception cref="System.Net.WebException">The request still fails on the last attempt.</exception>
    /// <exception cref="System.IO.IOException">Reading the response still fails on the last attempt.</exception>
    public string GetHtml(string url)
    {
        const int maxAttempts = 3;

        if (string.IsNullOrEmpty(url))
        {
            throw new ArgumentException("url must not be null or empty.", "url");
        }

        // Look at the start of the address only: an https address, or one that merely
        // contains "http://" somewhere in its query string, must not be treated as http.
        string target = url;
        if (!target.StartsWith("http://", StringComparison.OrdinalIgnoreCase) &&
            !target.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
        {
            target = "http://" + target;
        }

        for (int attempt = 1; ; attempt++)
        {
            try
            {
                HttpClient hc = new HttpClient();
                hc.Url = target;
                return hc.GetString();
            }
            catch (System.Net.WebException)
            {
                // Network-level failure: try again unless the attempts are used up.
                if (attempt >= maxAttempts)
                {
                    throw;
                }
            }
            catch (System.IO.IOException)
            {
                if (attempt >= maxAttempts)
                {
                    throw;
                }
            }
        }
    }

      然后再观察每条随笔的规律,我们发现每条的开头是<div class="post_item_body">,结尾是<div class="clear"></div>,那我们就可以根据这个规律来写出正则:Regex regexContent = new Regex("<div class=\"post_item_body\">(?<content>.*?)<div class=\"clear\"></div>", RegexOptions.Singleline);
         然后可以使用这个正则来获取我们需要匹配的内容了

    复制代码
    // Download the cnblogs home page.
    string Html = GetHtml("http://www.cnblogs.com/");
    // One post starts at <div class="post_item_body"> and ends at <div class="clear"></div>.
    // Singleline lets '.' match line breaks, and the lazy '.*?' stops at the first end marker.
    Regex regexContent = new Regex("<div class=\"post_item_body\">(?<content>.*?)<div class=\"clear\"></div>", RegexOptions.Singleline);
    // Content of the first post on the page (empty string when nothing matches).
    string blog = regexContent.Match(Html).Groups["content"].Value;
    复制代码

    在这里我用到的正则匹配工具是Expresso,有需要的朋友可以留言。当然,如果我有什么地方写的不好的,欢迎各位指出。晚上就先到这里了,该洗洗睡了。

         

    预测未来的最好方法,就是创造未来。
  • 相关阅读:
    第三讲:增长全局观
    搭建安卓环境
    ~~
    天气阴
    天气晴
    Spark性能优化指南——高级篇
    Ceph Jewel 10.2.3 环境部署
    《你只是看起来很努力》--读书笔记
    博客园基础环境配置
    Spark 1.3.0 单机安装
  • 原文地址:https://www.cnblogs.com/Leo_wl/p/3229355.html
Copyright © 2011-2022 走看看