zoukankan      html  css  js  c++  java
  • HtmlAgilityPack教程

    解析html教程(重点) http://www.cnblogs.com/kissdodog/archive/2013/02/28/2936950.html

    完整的教程 http://www.cnblogs.com/kissdodog/category/453229.html

    1 解析html

    路径
    //div 属于平行路径:匹配文档中任意位置(任意层级)的所有div节点
    /html/body/div/ul 属于xml类型的路径:从根节点开始逐级匹配的绝对路径
    //table/tr 平行路径+xml类型路径,混合使用:先匹配任意位置的table,再取它的直接子节点tr
    //*[@id='div1'] 可以根据id选择,也可以根据其它的属性
    *代表匹配所有类型的标签,也可以换成其它的标签,如div等
    如果要选择多个使用:var nodes = doc.DocumentNode.SelectNodes("//*[@class='a']");
    按节点的ChildNodes选择
    divInfo.ChildNodes[0].ChildNodes[0].Attributes["src"].Value


    1 选择网页中的所有的div
    doc.DocumentNode.SelectNodes("//div")

    2 按完整路径选择单个节点
    doc.DocumentNode.SelectSingleNode("/html/body/div/ul")

    3 根据属性id选择节点
    HtmlNode node8 = doc.DocumentNode.SelectSingleNode("//*[@id='div1']");
    Response.Write(node8.Id);
    Response.Write(node8.InnerText);


    属性
    Name
    InnerHtml
    InnerText
    OuterHtml
    ParentNode
    XPath

    2 Get/Post请求网页

      1 using System;
      2 using System.Collections.Generic;
      3 using System.Linq;
      4 using System.Web;
      5 using System.Net;
      6 using System.Configuration;
      7 using System.IO;
      8 using System.Text;
      9 
     10 namespace MyLibrary.Common
     11 {
     /// <summary>
     /// Helper built on <see cref="HttpWebRequest"/> that downloads pages with HTTP GET
     /// and submits form data with HTTP POST, optionally carrying a cookie container.
     /// </summary>
     public class BaseParser
     {
         // Name of the encoding used by the overloads that take no Encoding argument.
         private readonly string _encode = "utf-8";

         #region 1.0 Download the HTML of a URL (default encoding) + string GetHtml(string strUrl)
         /// <summary>
         /// Downloads the HTML of the given URL using the default encoding and a fresh cookie container.
         /// </summary>
         /// <param name="strUrl">Target page URL.</param>
         /// <returns>The response body decoded as text.</returns>
         public string GetHtml(string strUrl)
         {
             return GetHtml(strUrl, new CookieContainer(), Encoding.GetEncoding(_encode));
         }
         #endregion

         #region 1.1 Download the HTML of a URL (default encoding, never throws) + string GetHtml2(string strUrl)
         /// <summary>
         /// Downloads the HTML of the given URL using the default encoding; failures are
         /// reported through the return value instead of an exception.
         /// </summary>
         /// <param name="strUrl">Target page URL.</param>
         /// <returns>The response body, or the literal string "error" if anything fails.</returns>
         public string GetHtml2(string strUrl)
         {
             try
             {
                 // Building the request happens inside the try as well, so a malformed URL
                 // also produces "error" rather than escaping as an exception.
                 return GetHtml(strUrl);
             }
             catch (Exception)
             {
                 // Deliberately broad: the contract of this method is "error" on any failure.
                 return "error";
             }
         }
         #endregion

         #region 2.0 Download the HTML of a URL + string GetHtml(string strUrl, Encoding encode)
         /// <summary>
         /// Downloads the HTML of the given URL using a fresh cookie container.
         /// </summary>
         /// <param name="strUrl">Target page URL.</param>
         /// <param name="encode">Encoding used to decode the response body.</param>
         /// <returns>The response body decoded as text.</returns>
         /// <exception cref="ArgumentNullException"><paramref name="encode"/> is null.</exception>
         public string GetHtml(string strUrl, Encoding encode)
         {
             return GetHtml(strUrl, new CookieContainer(), encode);
         }
         #endregion

         #region 3.0 Download the HTML of a login-protected URL with cookies (default encoding) + string GetHtml(string strUrl, CookieContainer cc)
         /// <summary>
         /// Downloads the HTML of the given URL, sending the supplied cookies, using the default encoding.
         /// </summary>
         /// <param name="strUrl">Target URL.</param>
         /// <param name="cc">Cookie container attached to the request.</param>
         /// <returns>The response body decoded as text.</returns>
         public string GetHtml(string strUrl, CookieContainer cc)
         {
             return GetHtml(strUrl, cc, Encoding.GetEncoding(_encode));
         }
         #endregion

         #region 4.0 Download the HTML of a login-protected URL with cookies + string GetHtml(string strUrl, CookieContainer cc, Encoding encode)
         /// <summary>
         /// Downloads the HTML of the given URL, sending the supplied cookies.
         /// All other GetHtml overloads delegate to this one.
         /// </summary>
         /// <param name="strUrl">Target URL.</param>
         /// <param name="cc">Cookie container attached to the request.</param>
         /// <param name="encode">Encoding used to decode the response body.</param>
         /// <returns>The response body decoded as text.</returns>
         /// <exception cref="ArgumentNullException"><paramref name="encode"/> is null.</exception>
         public string GetHtml(string strUrl, CookieContainer cc, Encoding encode)
         {
             // Fail before touching the network instead of after the response has arrived.
             if (encode == null)
             {
                 throw new ArgumentNullException("encode");
             }

             HttpWebRequest request = (HttpWebRequest)WebRequest.Create(new Uri(strUrl));
             request.AllowAutoRedirect = true;
             request.CookieContainer = cc;

             // Disposing the response releases the underlying connection even when reading fails.
             using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
             {
                 return ReadResponseText(response, encode);
             }
         }
         #endregion

         #region 5.0 Send a POST request with cookies (default encoding) + string PostWebRequest(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container)
         /// <summary>
         /// Sends a form POST with the supplied cookies, using the default encoding.
         /// </summary>
         /// <param name="strUrl">Target URL.</param>
         /// <param name="dicParams">Form fields; may be null for an empty body. Keys and values are sent as-is, without percent-encoding.</param>
         /// <param name="container">Cookie container to use; when null, a new one is created and handed back after the response arrives.</param>
         /// <returns>
         /// The response body if the cookie container holds at least one cookie for the
         /// request URI after the response; otherwise the literal string "error".
         /// </returns>
         public string PostWebRequest(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container)
         {
             string html;
             bool hasCookies = Post(strUrl, dicParams, ref container, Encoding.GetEncoding(_encode), out html);
             return hasCookies ? html : "error";
         }
         #endregion

         #region 5.1 Send a POST request with cookies (default encoding, HTML returned even on failure) + string PostWebRequest2(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container)
         /// <summary>
         /// Sends a form POST with the supplied cookies, using the default encoding, and
         /// returns the response body even when the request is judged to have failed.
         /// </summary>
         /// <param name="strUrl">Target URL.</param>
         /// <param name="dicParams">Form fields; may be null for an empty body. Keys and values are sent as-is, without percent-encoding.</param>
         /// <param name="container">Cookie container to use; when null, a new one is created and handed back after the response arrives.</param>
         /// <returns>
         /// The response body if the cookie container holds at least one cookie for the
         /// request URI after the response; otherwise "error|" followed by the response body.
         /// </returns>
         public string PostWebRequest2(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container)
         {
             string html;
             bool hasCookies = Post(strUrl, dicParams, ref container, Encoding.GetEncoding(_encode), out html);
             return hasCookies ? html : "error|" + html;
         }
         #endregion

         #region 6.0 Send a POST request with cookies + string PostWebRequest(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container, Encoding encode)
         /// <summary>
         /// Sends a form POST with the supplied cookies.
         /// </summary>
         /// <param name="strUrl">Target URL.</param>
         /// <param name="dicParams">Form fields; may be null for an empty body. Keys and values are sent as-is, without percent-encoding.</param>
         /// <param name="container">Cookie container to use; when null, a new one is created and handed back after the response arrives.</param>
         /// <param name="encode">Encoding used both for the request body bytes and for decoding the response.</param>
         /// <returns>
         /// The response body if the cookie container holds at least one cookie for the
         /// request URI after the response; otherwise the literal string "error".
         /// </returns>
         /// <exception cref="ArgumentNullException"><paramref name="encode"/> is null.</exception>
         public string PostWebRequest(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container, Encoding encode)
         {
             string html;
             bool hasCookies = Post(strUrl, dicParams, ref container, encode, out html);
             return hasCookies ? html : "error";
         }
         #endregion

         #region Private helpers
         /// <summary>
         /// Performs the POST shared by all PostWebRequest variants.
         /// </summary>
         /// <param name="strUrl">Target URL.</param>
         /// <param name="dicParams">Form fields, or null.</param>
         /// <param name="container">Cookie container; replaced by the one actually used once a response has been received.</param>
         /// <param name="encode">Encoding for the request body and the response text.</param>
         /// <param name="html">Receives the decoded response body.</param>
         /// <returns>True when the container holds at least one cookie for the request URI after the response.</returns>
         private static bool Post(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container, Encoding encode, out string html)
         {
             if (encode == null)
             {
                 throw new ArgumentNullException("encode");
             }

             byte[] payload = encode.GetBytes(BuildFormBody(dicParams));

             HttpWebRequest request = (HttpWebRequest)WebRequest.Create(new Uri(strUrl));
             request.AllowAutoRedirect = true;
             request.KeepAlive = true;
             request.Method = "POST";
             request.ContentType = "application/x-www-form-urlencoded";
             request.ContentLength = payload.Length;

             CookieContainer jar = container ?? new CookieContainer();
             request.CookieContainer = jar;

             using (Stream requestStream = request.GetRequestStream())
             {
                 requestStream.Write(payload, 0, payload.Length);
             }

             using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
             {
                 // The caller's reference is only updated once a response exists, so a
                 // failed request leaves a null container untouched.
                 container = jar;

                 // Success is judged by cookies stored for the original request URI.
                 int cookieCount = jar.GetCookies(request.RequestUri).Count;

                 html = ReadResponseText(response, encode);
                 return cookieCount > 0;
             }
         }

         /// <summary>
         /// Joins the form fields as "key=value" pairs separated by '&amp;'.
         /// Keys and values are written verbatim; nothing is percent-encoded here.
         /// </summary>
         /// <param name="dicParams">Form fields, or null.</param>
         /// <returns>The form body, or an empty string when there are no fields.</returns>
         private static string BuildFormBody(IDictionary<string, string> dicParams)
         {
             if (dicParams == null)
             {
                 return string.Empty;
             }

             StringBuilder body = new StringBuilder();
             foreach (KeyValuePair<string, string> field in dicParams)
             {
                 if (body.Length > 0)
                 {
                     body.Append('&');
                 }
                 body.Append(field.Key).Append('=').Append(field.Value);
             }
             return body.ToString();
         }

         /// <summary>
         /// Reads the whole response body as text and disposes the stream and reader.
         /// </summary>
         /// <param name="response">Response to read; the caller remains responsible for disposing it.</param>
         /// <param name="encode">Encoding used to decode the body.</param>
         /// <returns>The decoded response body.</returns>
         private static string ReadResponseText(HttpWebResponse response, Encoding encode)
         {
             using (Stream body = response.GetResponseStream())
             using (StreamReader reader = new StreamReader(body, encode))
             {
                 return reader.ReadToEnd();
             }
         }
         #endregion
     }
    333 }
  • 相关阅读:
    eslint 屏蔽html的检查
    css 自定义悬浮窗写法
    echarts象形图图例显示问题
    win10上rocketMQ的部署过程
    死锁以及如何避免死锁
    volatile的一个例子-通俗易懂
    Java四种引用类型
    记录下:安全|API接口安全性设计(防篡改和重复调用)
    了解聚集索引,非聚集索引,联合索引,索引覆盖
    mybatis 查询树形结构
  • 原文地址:https://www.cnblogs.com/james641/p/4903463.html
Copyright © 2011-2022 走看看