解析html教程(重点) http://www.cnblogs.com/kissdodog/archive/2013/02/28/2936950.html
完整的教程 http://www.cnblogs.com/kissdodog/category/453229.html
1 解析html
路径
//div 匹配文档中任意层级的所有div节点(不限定父节点)
/html/body/div/ul 从根节点开始逐级匹配的绝对路径
//table/tr 两种写法混合使用:任意层级的table节点下的直接子节点tr
//*[@id='div1'] 可以根据id选择,也可以根据其它的属性
*代表匹配所有类型的标签,也可以换成其它的标签,如div等
如果要选择多个节点,使用:var nodes = doc.DocumentNode.SelectNodes("//*[@class='a']");(没有匹配的节点时返回null,使用前需要判空)
按节点的ChildNodes选择
divInfo.ChildNodes[0].ChildNodes[0].Attributes["src"].Value
1 选择网页中的所有的div
doc.DocumentNode.SelectNodes("//div")
2 按绝对路径选择单个节点:doc.DocumentNode.SelectSingleNode("/html/body/div/ul")
3 根据属性id选择节点
HtmlNode node8 = doc.DocumentNode.SelectSingleNode("//*[@id='div1']");
Response.Write(node8.Id);
Response.Write(node8.InnerText);
属性
Name
InnerHtml
InnerText
OuterHtml
ParentNode
XPath
2 Get/Post请求网页
1 using System;
2 using System.Collections.Generic;
3 using System.Linq;
4 using System.Web;
5 using System.Net;
6 using System.Configuration;
7 using System.IO;
8 using System.Text;
9
namespace MyLibrary.Common
{
    /// <summary>
    /// Downloads HTML with HTTP GET and submits form POST requests through
    /// <see cref="HttpWebRequest"/>, optionally carrying cookies in a
    /// <see cref="CookieContainer"/>.
    /// </summary>
    public class BaseParser
    {
        // Name of the encoding used whenever a caller does not supply one.
        private string _encode = "utf-8";

        #region 1.0 Download the HTML of a URL (default encoding) + string GetHtml(string strUrl)
        /// <summary>
        /// Downloads the HTML of the given URL using the default encoding and a fresh, empty cookie container.
        /// </summary>
        /// <param name="strUrl">URL of the target page.</param>
        /// <returns>The response body decoded as text.</returns>
        /// <exception cref="UriFormatException"><paramref name="strUrl"/> is not a valid URI.</exception>
        /// <exception cref="WebException">The request failed.</exception>
        public string GetHtml(string strUrl)
        {
            HttpWebRequest request = CreateGetRequest(strUrl, new CookieContainer());
            return ReadResponseText(request, DefaultEncoding());
        }
        #endregion

        #region 1.1 Download the HTML of a URL (default encoding, errors swallowed) + string GetHtml2(string strUrl)
        /// <summary>
        /// Downloads the HTML of the given URL using the default encoding; failures while
        /// fetching or reading the response are reported as the literal string "error".
        /// </summary>
        /// <param name="strUrl">URL of the target page.</param>
        /// <returns>The response body, or "error" if the request or the read failed.</returns>
        /// <exception cref="UriFormatException"><paramref name="strUrl"/> is not a valid URI
        /// (the request is created outside the guarded section, so this is not converted to "error").</exception>
        public string GetHtml2(string strUrl)
        {
            HttpWebRequest request = CreateGetRequest(strUrl, new CookieContainer());
            try
            {
                return ReadResponseText(request, DefaultEncoding());
            }
            catch (Exception)
            {
                // Deliberately broad: the documented contract is to return "error" on any failure.
                return "error";
            }
        }
        #endregion

        #region 2.0 Download the HTML of a URL + string GetHtml(string strUrl, Encoding encode)
        /// <summary>
        /// Downloads the HTML of the given URL, decoding the body with the supplied encoding.
        /// </summary>
        /// <param name="strUrl">URL of the target page.</param>
        /// <param name="encode">Encoding used to decode the response; the default encoding is used when null.</param>
        /// <returns>The response body decoded as text.</returns>
        /// <exception cref="UriFormatException"><paramref name="strUrl"/> is not a valid URI.</exception>
        /// <exception cref="WebException">The request failed.</exception>
        public string GetHtml(string strUrl, Encoding encode)
        {
            HttpWebRequest request = CreateGetRequest(strUrl, new CookieContainer());
            return ReadResponseText(request, encode ?? DefaultEncoding());
        }
        #endregion

        #region 3.0 Download the HTML of a login-protected URL with cookies (default encoding) + string GetHtml(string strUrl, CookieContainer cc)
        /// <summary>
        /// Downloads the HTML of the given URL, sending the cookies held in <paramref name="cc"/>,
        /// and decodes the body with the default encoding.
        /// </summary>
        /// <param name="strUrl">URL of the target page.</param>
        /// <param name="cc">Cookie container attached to the request.</param>
        /// <returns>The response body decoded as text.</returns>
        /// <exception cref="UriFormatException"><paramref name="strUrl"/> is not a valid URI.</exception>
        /// <exception cref="WebException">The request failed.</exception>
        public string GetHtml(string strUrl, CookieContainer cc)
        {
            HttpWebRequest request = CreateGetRequest(strUrl, cc);
            return ReadResponseText(request, DefaultEncoding());
        }
        #endregion

        #region 4.0 Download the HTML of a login-protected URL with cookies + string GetHtml(string strUrl, CookieContainer cc, Encoding encode)
        /// <summary>
        /// Downloads the HTML of the given URL, sending the cookies held in <paramref name="cc"/>,
        /// and decodes the body with the supplied encoding.
        /// </summary>
        /// <param name="strUrl">URL of the target page.</param>
        /// <param name="cc">Cookie container attached to the request.</param>
        /// <param name="encode">Encoding used to decode the response; the default encoding is used when null.</param>
        /// <returns>The response body decoded as text.</returns>
        /// <exception cref="UriFormatException"><paramref name="strUrl"/> is not a valid URI.</exception>
        /// <exception cref="WebException">The request failed.</exception>
        public string GetHtml(string strUrl, CookieContainer cc, Encoding encode)
        {
            HttpWebRequest request = CreateGetRequest(strUrl, cc);
            return ReadResponseText(request, encode ?? DefaultEncoding());
        }
        #endregion

        #region 5.0 Send a form POST with cookies (default encoding) + string PostWebRequest(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container)
        /// <summary>
        /// Sends a form POST using the default encoding for both the request body and the response.
        /// </summary>
        /// <param name="strUrl">Target URL.</param>
        /// <param name="dicParams">Form fields; may be null for an empty body. Keys and values are sent verbatim.</param>
        /// <param name="container">Cookie container to use. When null, a new container is created and
        /// handed back through this parameter once a response has been received.</param>
        /// <returns>The response body when the cookie container holds at least one cookie for the
        /// request URI after the response arrives; otherwise "error".</returns>
        /// <exception cref="UriFormatException"><paramref name="strUrl"/> is not a valid URI.</exception>
        /// <exception cref="WebException">The request failed.</exception>
        public string PostWebRequest(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container)
        {
            return PostWebRequest(strUrl, dicParams, ref container, DefaultEncoding());
        }
        #endregion

        #region 5.1 Send a form POST with cookies (default encoding, HTML returned even on failure) + string PostWebRequest2(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container)
        /// <summary>
        /// Sends a form POST using the default encoding and always includes the response body in the result.
        /// </summary>
        /// <param name="strUrl">Target URL.</param>
        /// <param name="dicParams">Form fields; may be null for an empty body. Keys and values are sent verbatim.</param>
        /// <param name="container">Cookie container to use. When null, a new container is created and
        /// handed back through this parameter once a response has been received.</param>
        /// <returns>The response body when the cookie container holds at least one cookie for the
        /// request URI after the response arrives; otherwise "error|" followed by the response body.</returns>
        /// <exception cref="UriFormatException"><paramref name="strUrl"/> is not a valid URI.</exception>
        /// <exception cref="WebException">The request failed.</exception>
        public string PostWebRequest2(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container)
        {
            bool hasCookies;
            string html = SendPost(strUrl, dicParams, ref container, DefaultEncoding(), out hasCookies);
            return hasCookies ? html : "error|" + html;
        }
        #endregion

        #region 6.0 Send a form POST with cookies + string PostWebRequest(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container, Encoding encode)
        /// <summary>
        /// Sends a form POST, using the supplied encoding for both the request body and the response.
        /// </summary>
        /// <param name="strUrl">Target URL.</param>
        /// <param name="dicParams">Form fields; may be null for an empty body. Keys and values are sent verbatim.</param>
        /// <param name="container">Cookie container to use. When null, a new container is created and
        /// handed back through this parameter once a response has been received.</param>
        /// <param name="encode">Encoding for the request body and the response; the default encoding is used when null.</param>
        /// <returns>The response body when the cookie container holds at least one cookie for the
        /// request URI after the response arrives; otherwise "error".</returns>
        /// <exception cref="UriFormatException"><paramref name="strUrl"/> is not a valid URI.</exception>
        /// <exception cref="WebException">The request failed.</exception>
        public string PostWebRequest(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container, Encoding encode)
        {
            bool hasCookies;
            string html = SendPost(strUrl, dicParams, ref container, encode ?? DefaultEncoding(), out hasCookies);
            return hasCookies ? html : "error";
        }
        #endregion

        #region Private helpers
        /// <summary>Resolves the default encoding from its configured name.</summary>
        private Encoding DefaultEncoding()
        {
            return Encoding.GetEncoding(_encode);
        }

        /// <summary>Creates a redirect-following request for the URL with the given cookie container attached.</summary>
        private static HttpWebRequest CreateGetRequest(string strUrl, CookieContainer cc)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(new Uri(strUrl));
            request.AllowAutoRedirect = true;
            request.CookieContainer = cc;
            return request;
        }

        /// <summary>Executes the request and returns the whole response body decoded with the given encoding.</summary>
        private static string ReadResponseText(HttpWebRequest request, Encoding encode)
        {
            // The using blocks release the response, stream and reader even when reading throws.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream body = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, encode))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>Joins the form fields as key=value pairs separated by '&amp;'; null yields an empty string.</summary>
        private static string BuildPostData(IDictionary<string, string> dicParams)
        {
            if (dicParams == null)
            {
                return string.Empty;
            }

            StringBuilder postData = new StringBuilder();
            foreach (KeyValuePair<string, string> pair in dicParams)
            {
                if (postData.Length > 0)
                {
                    postData.Append('&');
                }

                // NOTE: keys and values are written as-is, so callers must URL-encode them beforehand.
                postData.Append(pair.Key).Append('=').Append(pair.Value);
            }

            return postData.ToString();
        }

        /// <summary>
        /// Sends the form POST and returns the response body. <paramref name="hasCookies"/> reports
        /// whether the cookie container holds any cookie for the request URI after the response arrived.
        /// </summary>
        private static string SendPost(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container, Encoding encode, out bool hasCookies)
        {
            byte[] payload = encode.GetBytes(BuildPostData(dicParams));

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(new Uri(strUrl));
            request.AllowAutoRedirect = true;
            request.KeepAlive = true;
            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";
            request.ContentLength = payload.Length;
            request.CookieContainer = container ?? new CookieContainer();

            using (Stream requestBody = request.GetRequestStream())
            {
                requestBody.Write(payload, 0, payload.Length);
            }

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                hasCookies = request.CookieContainer.GetCookies(request.RequestUri).Count > 0;

                // Hand the newly created container back only after a response was received.
                if (container == null)
                {
                    container = request.CookieContainer;
                }

                using (StreamReader reader = new StreamReader(response.GetResponseStream(), encode))
                {
                    return reader.ReadToEnd();
                }
            }
        }
        #endregion
    }
}