/// <summary>
/// Downloads the cnblogs home page and writes the document's full HTML
/// markup to the console.
/// </summary>
static void InnerText()
{
    HtmlWeb htmlWeb = new HtmlWeb();
    HtmlDocument doc = htmlWeb.Load("http://www.cnblogs.com/", "GET");
    HtmlNode rootNode = doc.DocumentNode;
    Console.WriteLine(rootNode.InnerHtml);
    // To filter out the HTML tags, print rootNode.InnerText instead of InnerHtml.
}

/// <summary>
/// Selector demo: downloads the cnblogs home page, selects every h3 under
/// div.post_item_body, and prints each post's title, link and the text of
/// the node two siblings after the h3, followed by a separator line.
/// </summary>
static void GetBlogs()
{
    string url = "http://www.cnblogs.com/";
    HtmlWeb htmlWeb = new HtmlWeb();
    HtmlDocument doc = htmlWeb.Load(url, "GET");
    // A single element can also be looked up by id via doc.GetElementbyId("aa").
    HtmlNode rootNode = doc.DocumentNode;

    HtmlNodeCollection h3Nodes = rootNode.SelectNodes("//div[@class='post_item_body']/h3");
    if (h3Nodes == null)
    {
        // SelectNodes yields null rather than an empty collection when the
        // XPath matches nothing (e.g. the page layout changed).
        return;
    }

    foreach (HtmlNode h3Node in h3Nodes)
    {
        // The anchor directly under the h3 carries the title and the link.
        HtmlNode aNode = h3Node.SelectSingleNode("a");
        if (aNode == null)
        {
            // No anchor means there is no title/link to report for this entry.
            continue;
        }

        // Two siblings after the h3; presumably the first sibling is the
        // whitespace text node between the tags - verify against the live markup.
        HtmlNode firstSibling = h3Node.NextSibling;
        HtmlNode pNode = firstSibling == null ? null : firstSibling.NextSibling;

        string blogLink = aNode.GetAttributeValue("href", ""); // "" when href is absent
        string title = aNode.InnerText;
        string content = pNode == null ? "" : pNode.InnerText;

        Console.WriteLine(title);
        Console.WriteLine(blogLink);
        Console.WriteLine(content);
        Console.WriteLine("------------------------------------------------------");
    }
}

/// <summary>
/// XPath demo: loads the local file test.html and prints the inner text of
/// the nodes addressed by three absolute XPath expressions. An expression
/// that matches nothing is reported on standard error instead of crashing.
/// </summary>
static void XPathTest()
{
    string path = @"test.html";
    HtmlDocument doc = new HtmlDocument();
    // Load first, then read DocumentNode, so rootNode is taken from the
    // parsed document rather than from the still-empty one.
    doc.Load(path);
    HtmlNode rootNode = doc.DocumentNode;

    string[] xpaths =
    {
        "/html/body/div[1]/h1[1]",       // the h1 heading
        "/html/body/div[2]/ul[1]/li[1]", // ul > li: name entry
        "/html/body/div[2]/ul[1]/li[2]"  // ul > li: age entry
    };

    foreach (string xpath in xpaths)
    {
        HtmlNode node = rootNode.SelectSingleNode(xpath);
        if (node == null)
        {
            // SelectSingleNode returns null when the expression matches nothing.
            Console.Error.WriteLine("No node matches XPath: " + xpath);
            continue;
        }
        Console.WriteLine(node.InnerText);
    }
}
test.html代码如下:
<!-- Fixture for XPathTest: the first div holds the h1 heading, the second div holds the ul whose li items are the name and age entries. -->
<html>
<head>
</head>
<body>
    <div>
        <h1>欢迎访问这个网页!</h1>
    </div>

    <div>
        <ul class="user_match clear">
            <li>姓名:张三</li>
            <li>年龄:18</li>
        </ul>
    </div>
</body>
</html>