最近经常听说或者接触到关于网络爬虫的问题,但一直只是在看别人写的代码,而没有真正地动手实践过。
昨天做了一下尝试,其中采用了网络上流行的扩展类库 Html Agility Pack(http://html-agility-pack.net/?z=codeplex)。
遇到的问题是:部分网站禁止爬虫,或者设有规则验证,无法通过模拟 HTTP 请求获取 HTML。
本测试案例通过模拟 HTTP 请求获取 HTML,再通过 Html Agility Pack 分析节点,获取对应节点的值。本案例采用的是赶集网的数据。
代码如下:
/// <summary>
/// Parses a listing page, builds one <c>Room</c> per
/// <c>div</c> whose class attribute is exactly <c>f-list-item ershoufang-list</c>,
/// and inserts the collected rooms into the <c>room</c> table.
/// </summary>
/// <param name="html">Raw HTML of the listing page. Null, empty or whitespace-only input is ignored.</param>
/// <remarks>
/// Fields whose node is missing fall back to "0" (or "1" for <c>Type</c>), as the
/// original ternaries intended. Values are sent as SQL parameters, never spliced
/// into the statement text.
/// </remarks>
private static void ClearnHtml(string html)
{
    if (string.IsNullOrWhiteSpace(html))
    {
        return;
    }

    var htmlDoc = new HtmlAgilityPack.HtmlDocument();
    htmlDoc.LoadHtml(html);

    // SelectNodes yields null (not an empty collection) when nothing matches,
    // so it must be checked before enumerating.
    HtmlAgilityPack.HtmlNodeCollection listingNodes =
        htmlDoc.DocumentNode.SelectNodes("*//div[@class='f-list-item ershoufang-list']");
    if (listingNodes == null)
    {
        return;
    }

    var rooms = new List<Room>(listingNodes.Count);
    foreach (HtmlAgilityPack.HtmlNode roomitem in listingNodes)
    {
        if (roomitem == null)
        {
            continue;
        }

        var room = new Room();
        // The original stored the title in a local variable and never copied it
        // onto the room, so the title column was always empty.
        room.title = ExtractRoomField(roomitem, "*//a[@class='js-title value title-font']", "0");
        room.Type = ExtractRoomField(roomitem, "*//span[@class='first js-huxing']", "1");
        room.buju = ExtractRoomField(roomitem, "*//dd[@class='dd-item size']/span[3]", "0");
        room.mianji = ExtractRoomField(roomitem, "*//dd[@class='dd-item size']/span[5]", "0");
        room.Direction = ExtractRoomField(roomitem, "*//dd[@class='dd-item size']/span[7]", "0");
        room.Floor = ExtractRoomField(roomitem, "*//dd[@class='dd-item size']/span[9]", "0");
        room.zhuangxiu = ExtractRoomField(roomitem, "*//span[@class='last']", "0");
        room.area = ExtractRoomField(roomitem, "*//span[@class='area']", "0");
        room.feature = ExtractRoomField(roomitem, "*//dd[@class='dd-item feature']", "0", trimEdges: true);
        room.Price = ExtractRoomField(roomitem, "*//div[@class='price']/span[1]", "0");
        rooms.Add(room);
    }

    // Executing an empty command would fail; there is nothing to insert.
    if (rooms.Count == 0)
    {
        return;
    }

    const string insertSql =
        "insert into room(title,Type,buju,mianji,Direction,Floor,zhuangxiu,area,feature,Price) " +
        "values (@title,@Type,@buju,@mianji,@Direction,@Floor,@zhuangxiu,@area,@feature,@Price);";

    // Project onto anonymous objects so every value is exposed as a property for
    // parameter binding, whether Room declares fields or properties.
    var insertParameters = rooms
        .Select(r => new
        {
            r.title,
            r.Type,
            r.buju,
            r.mianji,
            r.Direction,
            r.Floor,
            r.zhuangxiu,
            r.area,
            r.feature,
            r.Price
        })
        .ToList();

    // NOTE(review): the connection string (including credentials) is hard-coded;
    // it should come from configuration rather than source.
    using (var connection = new MySqlConnection("Server=127.0.0.1;Database=personal;Uid=ken;Pwd=123456;"))
    {
        // Scraped text is untrusted: bind it as parameters. The statement is run
        // once per element of the parameter sequence.
        connection.Execute(insertSql, insertParameters);
    }
}

/// <summary>
/// Reads the inner text of the first node under <paramref name="scope"/> that
/// matches <paramref name="xpath"/> and removes every ordinary space from it.
/// </summary>
/// <param name="scope">Node the XPath is evaluated against.</param>
/// <param name="xpath">XPath expression locating the field's node.</param>
/// <param name="fallback">Value returned when no node matches.</param>
/// <param name="trimEdges">When true, leading/trailing whitespace is trimmed before spaces are removed.</param>
/// <returns>The cleaned text, or <paramref name="fallback"/> if the node is absent.</returns>
private static string ExtractRoomField(
    HtmlAgilityPack.HtmlNode scope,
    string xpath,
    string fallback,
    bool trimEdges = false)
{
    // SelectSingleNode returns null for "no match", which makes the fallback
    // reachable and evaluates the XPath only once.
    HtmlAgilityPack.HtmlNode node = scope.SelectSingleNode(xpath);
    if (node == null)
    {
        return fallback;
    }

    string text = node.InnerText;
    if (trimEdges)
    {
        text = text.Trim();
    }

    // NOTE(review): the original chained several identical Replace(" ", "") calls;
    // they may have targeted different characters before the paste was flattened — confirm.
    return text.Replace(" ", "");
}