想做一个爬虫程序。以前一直使用基于 CSS 选择器的 HTML 解析插件，最近做的项目想改用 Html Agility Pack 来做解析。
Html Agility Pack 使用 XPath 和 LINQ 来做 HTML 解析，以下是我使用 XPath 过程中的记录。
解析网页:http://txzhanshang.zhankoo.com/tt
列表下一页://*[contains(@class, 'pagination-right')]/a[text()='下一页']
文章地址://*[@class='zsinfo']/li/a[@href]
// GET: Test
/// <summary>
/// Crawls every configured list page and hands the collected article links to the view.
/// </summary>
/// <returns>The view, with the crawler configurations as its model and the links in <c>ViewBag.UrlList</c>.</returns>
public ActionResult Index()
{
    var crawlerConfigList = _crawlerConfigService.GetCrawlerConfigs();
    foreach (var crawlerConfig in crawlerConfigList)
    {
        GetList(crawlerConfig);
    }

    ViewBag.UrlList = urlList;
    return View(crawlerConfigList);
}

/// <summary>
/// Walks a paginated list starting at <c>crawlerConfig.CrawlerUrl</c> and appends the
/// <c>href</c> of every node matched by <c>crawlerConfig.ASelector</c> to <c>urlList</c>.
/// </summary>
/// <param name="crawlerConfig">
/// Supplies the start URL, the XPath for article links, the XPath for the "next page"
/// link, and the domain prefix that is prepended to the next-page href.
/// </param>
/// <remarks>
/// Pages are followed in a loop rather than by recursion, and the configuration object is
/// not modified while crawling. A page URL that has already been visited ends the walk, so
/// a "next page" link pointing back to an earlier page cannot loop forever.
/// </remarks>
private void GetList(CrawlerConfig crawlerConfig)
{
    var web = new HtmlWeb();
    var visitedPages = new System.Collections.Generic.HashSet<string>();
    var pageUrl = crawlerConfig.CrawlerUrl;

    // HashSet.Add returns false for a URL seen before, which stops pagination cycles.
    while (!string.IsNullOrEmpty(pageUrl) && visitedPages.Add(pageUrl))
    {
        var htmlDoc = web.Load(pageUrl);

        // SelectNodes returns null (not an empty collection) when nothing matches.
        var linkNodes = htmlDoc.DocumentNode.SelectNodes(crawlerConfig.ASelector);
        if (linkNodes != null)
        {
            foreach (var linkNode in linkNodes)
            {
                // The selector is configurable, so a matched node may lack an href.
                var href = linkNode.GetAttributeValue("href", string.Empty);
                if (!string.IsNullOrEmpty(href))
                {
                    urlList.Add(href);
                }
            }
        }

        // Next page
        var nextpageNode = htmlDoc.DocumentNode.SelectSingleNode(crawlerConfig.ListNextPageSelector);
        var nextpage = nextpageNode == null
            ? string.Empty
            : nextpageNode.GetAttributeValue("href", string.Empty);

        // An absent or empty next-page href means this was the last page.
        pageUrl = string.IsNullOrEmpty(nextpage)
            ? null
            : crawlerConfig.CrawlerDomain + nextpage;
    }
}
移除某个节点
// Loads an article page, strips the "newsContenttip" nodes out of the content wrapper,
// and reads the remaining text and markup.
var url = "http://txzhanshang.zhankoo.com/detail/12709.html";
var web = new HtmlWeb();
var htmlDoc = web.Load(url);
var node = htmlDoc.DocumentNode.SelectSingleNode("//*[@class='inner-wrap']");

var ss = string.Empty;
var sss = string.Empty;

// SelectSingleNode returns null when the wrapper is not present on the page.
if (node != null)
{
    // Remove the unwanted nodes. The leading "." keeps the search inside this node;
    // a bare "//" would search the whole document even when called on a child node.
    var unwantedNodes = node.SelectNodes(".//*[@class='newsContenttip']");

    // SelectNodes returns null (not an empty collection) when nothing matches.
    if (unwantedNodes != null)
    {
        foreach (var rm in unwantedNodes)
        {
            rm.Remove();
        }
    }

    ss = node.InnerText;
    sss = node.InnerHtml;
}