zoukankan      html  css  js  c++  java
  • 使用 XPath 从网页中获取数据

     /// <summary>
            /// Scrapes product information from the official website and stores it in the local database.
            /// </summary>
            /// <remarks>
            /// Only products whose code is already known locally (ProductMessage.GetProductList returns
            /// at least one row) are processed. For each such product the detail page is downloaded and
            /// its description, departure point, highlights, equipment and images are persisted through
            /// the ProductMessage data-access methods. Persistence is best-effort: a failed write is
            /// traced and the scrape continues. Listing tiles or detail pages that lack the expected
            /// markup are skipped instead of aborting the whole run.
            /// </remarks>
            /// <returns>
            /// The products that were found on the site and matched a local product; never null.
            /// </returns>
            public List<ProductMessage> GetlistProductMessage()
            {
                const string siteRoot = "http://www.grandcanyononepoint.com";

                List<ProductMessage> products = new List<ProductMessage>();

                string html = GetProductsDescriptionsImage(siteRoot + "/products");
                if (string.IsNullOrEmpty(html))
                {
                    return products;
                }

                HtmlDocument document = new HtmlDocument();
                document.LoadHtml(html);

                // "//*[@class='list-product']" is an XPath expression that matches every element
                // whose class attribute is exactly "list-product".
                // SelectNodes returns null (not an empty collection) when nothing matches.
                HtmlNodeCollection rootNodeList = document.DocumentNode.SelectNodes("//*[@class='list-product']");
                if (rootNodeList == null)
                {
                    return products;
                }

                foreach (HtmlNode node in rootNodeList)
                {
                    // Relative queries on the tile itself replace re-parsing node.InnerHtml
                    // into a throw-away document.
                    HtmlNode codeNode = node.SelectSingleNode(".//*[@style='float:right;']");
                    if (codeNode == null)
                    {
                        continue;
                    }

                    ProductMessage db_product = new ProductMessage();
                    db_product.Code = Formsub(codeNode.InnerText);
                    string code = db_product.Code;

                    // Skip products that do not already exist locally.
                    List<ProductMessage> knownProducts = ProductMessage.GetProductList(code, "");
                    if (knownProducts == null || knownProducts.Count == 0)
                    {
                        continue;
                    }

                    HtmlNode nameNode = node.SelectSingleNode(".//*[@style='float:left;']");
                    HtmlNode linkNode = node.SelectSingleNode("a");
                    HtmlAttribute hrefAttribute = linkNode == null ? null : linkNode.Attributes["href"];
                    if (nameNode == null || hrefAttribute == null)
                    {
                        System.Diagnostics.Trace.TraceWarning("Product tile for code '" + code + "' has no name or link; skipped.");
                        continue;
                    }

                    db_product.Name = Formsub(nameNode.InnerText);
                    // The product ID is derived from the href attribute of the tile's <a> child.
                    db_product.ID = GetProductID(hrefAttribute.Value);

                    string detailHtml = GetProductsDescriptionsImage(siteRoot + "/products/view/" + db_product.ID);
                    if (string.IsNullOrEmpty(detailHtml))
                    {
                        System.Diagnostics.Trace.TraceWarning("Detail page for product '" + code + "' is empty; skipped.");
                        continue;
                    }

                    HtmlDocument detailDocument = new HtmlDocument();
                    detailDocument.LoadHtml(detailHtml);
                    HtmlNode detailRoot = detailDocument.DocumentNode;

                    HtmlNode descriptionNode = detailRoot.SelectSingleNode("//*[@class='product-desc']");
                    if (descriptionNode == null)
                    {
                        System.Diagnostics.Trace.TraceWarning("Detail page for product '" + code + "' has no description; skipped.");
                        continue;
                    }

                    // Apostrophes are stripped exactly as before.
                    // NOTE(review): presumably a workaround for string-built SQL in the data layer;
                    // confirm, and parameterize the queries there.
                    db_product.Descmation = Formsub(descriptionNode.InnerHtml).Replace("'", "");

                    HtmlNode departingNode = detailRoot.SelectSingleNode("//*[@class='details-tile']");
                    if (departingNode != null)
                    {
                        db_product.DepartingFrom = Formsub(departingNode.InnerHtml.Replace("Departing From", ""));
                    }

                    HtmlNode highlightsNode = detailRoot.SelectSingleNode("//*[@class='details-tile details-list']");
                    if (highlightsNode != null)
                    {
                        db_product.ProductHighlights = Formsub(highlightsNode.InnerHtml.Replace("Product Highlights", "")).Replace("'", "");
                    }

                    try
                    {
                        ProductMessage.UpdateWEBProductMessage(db_product.Descmation, db_product.DepartingFrom, db_product.ProductHighlights, db_product.Name, db_product.Code);
                    }
                    catch (System.Exception ex)
                    {
                        // Best-effort persistence: keep scraping, but leave a trace instead of hiding the failure.
                        System.Diagnostics.Trace.TraceError("Updating product '" + code + "' failed: " + ex);
                    }

                    StoreScrapedEquipment(detailRoot, db_product);
                    StoreScrapedImages(detailRoot, db_product, siteRoot);

                    products.Add(db_product);
                }

                return products;
            }

            /// <summary>
            /// Reads the equipment entries (direct div children of the "product-equip" element) from a
            /// product detail page, stores each one, and assigns the collected list to product.Equipment.
            /// Leaves product.Equipment untouched when the page has no "product-equip" element.
            /// </summary>
            private void StoreScrapedEquipment(HtmlNode detailRoot, ProductMessage product)
            {
                HtmlNode equipmentRoot = detailRoot.SelectSingleNode("//*[@class='product-equip']");
                if (equipmentRoot == null)
                {
                    return;
                }

                List<EquipmentModel> equipmentList = new List<EquipmentModel>();

                // SelectNodes returns null when the container has no div children.
                HtmlNodeCollection equipmentNodes = equipmentRoot.SelectNodes("div");
                if (equipmentNodes != null)
                {
                    foreach (HtmlNode equipment in equipmentNodes)
                    {
                        HtmlAttribute titleAttribute = equipment.Attributes["title"];
                        if (titleAttribute == null)
                        {
                            continue;
                        }

                        EquipmentModel equipmentModel = new EquipmentModel();
                        equipmentModel.Name = titleAttribute.Value;
                        // The icon path is built from the equipment title.
                        equipmentModel.ImageUrl = "/Papillon/EquipmentImage/" + titleAttribute.Value + ".png";

                        try
                        {
                            ProductMessage.InsertProductEquipment(product.ID, equipmentModel.Name, equipmentModel.ImageUrl);
                        }
                        catch (System.Exception ex)
                        {
                            System.Diagnostics.Trace.TraceError("Inserting equipment '" + equipmentModel.Name + "' failed: " + ex);
                        }

                        equipmentList.Add(equipmentModel);
                    }
                }

                product.Equipment = equipmentList;
            }

            /// <summary>
            /// Stores the absolute URL of every image found inside an element titled
            /// "See full size image" on a product detail page.
            /// </summary>
            private void StoreScrapedImages(HtmlNode detailRoot, ProductMessage product, string siteRoot)
            {
                HtmlNodeCollection imageLinks = detailRoot.SelectNodes("//*[@title='See full size image']");
                if (imageLinks == null)
                {
                    return;
                }

                foreach (HtmlNode imageLink in imageLinks)
                {
                    HtmlNode imgNode = imageLink.SelectSingleNode("img");
                    HtmlAttribute srcAttribute = imgNode == null ? null : imgNode.Attributes["src"];
                    if (srcAttribute == null)
                    {
                        continue;
                    }

                    // Original image address: the src value is appended to the site root.
                    string imageUrl = siteRoot + srcAttribute.Value;

                    try
                    {
                        ProductMessage.InsertProductImage(product.ID, imageUrl);
                    }
                    catch (System.Exception ex)
                    {
                        System.Diagnostics.Trace.TraceError("Inserting image '" + imageUrl + "' failed: " + ex);
                    }
                }
            }
    View Code

    XPath 把 HTML 当作类似 XML 的文档来查询,主要通过节点的不同标识(标签名、属性等)定位并获取不同的内容,从而从网页中提取想要的数据;它只负责从已下载的页面中提取数据,这一点与负责抓取页面的网页爬虫不同。

  • 相关阅读:
    Spring Security Oauth2 认证(获取token/刷新token)流程
    Centos7安装配置Apache(httpd)+php+mysql+phpMyAdmin
    servlet 请求(Request)
    关于对连接数据库时出现1130-host “**” is not allowed to connect to this MySql/mariadb server 的错误解决方法
    在Spring中配置Hibernate和单独配置Hibernate的区别
    新手学Struts2的几个小注意事项
    Oracle 11g 发行版2 新安装后关于登录的一些基本操作
    CentOS7下mariaDB和phpmyadmin的一些安装和配置问题
    Ubuntu的一些文件系统的操作(转自我自己的其他博客)
    jquery上传图片获取像素
  • 原文地址:https://www.cnblogs.com/ly77461/p/5719018.html
Copyright © 2011-2022 走看看