zoukankan      html  css  js  c++  java
  • 关于使用HtmlAgilityPack

    请直接看代码:

            /// <summary>
            
    /// 根据输入的地址获取其文档节点对象
            
    /// </summary>
            
    /// <param name="url">地址</param>
            
    /// <returns></returns>
            public static HtmlAgilityPack.HtmlNode GetHtmlNodeFromLink(string url)
            {
                try{
                    Uri uri = new Uri(url);

                    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);
                    WebResponse response = request.GetResponse();

                    Stream stream = response.GetResponseStream();
                    StreamReader read = new StreamReader(stream, Encoding.GetEncoding("gb2312"));
                    string str = read.ReadToEnd();

                    HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
                    html.LoadHtml(str);
                    return html.DocumentNode;
                }
                catch{return null;}
            }

            /// <summary>
            
    /// 根据输入的URL地址输出指定XPATH下的节点集合
            
    /// </summary>
            
    /// <param name="url">地址</param>
            
    /// <param name="xPath">过滤地址</param>
            
    /// <param name="imgs">过滤地址</param>
            
    /// <param name="links">过滤地址</param>
            
    /// <param name="title">标题</param>
            
    /// <returns></returns>
            public static bool GetGalleryInfo(HtmlAgilityPack.HtmlNode htmlNode,string xPath,ref string[] imgs, ref string[] links,ref string[] title)
            {
                try
                {
                    HtmlNodeCollection hnc = htmlNode.SelectNodes(xPath);//"//div[@class='slideBannerA homeSlideAD1']"
                    if (hnc.Count < 1)
                        return false;
                    links = new string[hnc.Count];
                    title = new string[hnc.Count];
                    imgs = new string[hnc.Count];
                    int i = 0;
                    string cateDataRegex = @"background-image:url\((?<image>.+)\)";
                    Regex re = new Regex(cateDataRegex, RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);
                    foreach (HtmlNode node in hnc)
                    {
                        HtmlAttributeCollection hac = node.Attributes;
                        links[i] = hac["href"].Value;
                        imgs[i] = hac["style"] == null ? hac["src2"].Value : re.Match(hac["style"].Value).Groups["image"].Value;
                        title[i++] = string.IsNullOrEmpty(hac["title"].Value) ? hac["alt"].Value : hac["title"].Value;
                    }
                    return true;
                }
                catch { return false; }
            }
            
            //调用 
            
                string[] strLink;
                string[] strLinAlt;
                string[] strImg;
                string urls = "http://www.newegg.com.cn";
                HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink(urls);
                GetGalleryInfo(nodes, "//div[@class='slideBannerA homeSlideAD1']/div[1]/div[1]/a"out strImg, out strLink,out strLinAlt);

    淘宝今日活动:

    /// <summary>
            /// Reads the link and image address of each entry in Taobao's
            /// "today's promotions" list.
            /// </summary>
            /// <param name="htmlNode">Node the XPath query is evaluated against.</param>
            /// <param name="xPath">XPath expression selecting the list items to read.</param>
            /// <param name="imgs">Receives the <c>src</c> attribute of the first child of each item's second child node.</param>
            /// <param name="links">Receives the <c>href</c> attribute of each item's second child node.</param>
            /// <returns><c>true</c> when every matched item was read; <c>false</c> when nothing matched
            /// (both arrays are then empty) or an item lacks the expected child nodes or attributes
            /// (the arrays are then only filled up to that item).</returns>
            /// <example>
            /// <code>
            /// string[] strLink;
            /// string[] strImg;
            /// HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://www.taobao.com");
            /// GetTaobaoGalleryInfo(nodes, "//div[@class='sub-promotion-content']/div[@class='ks-switchable-content zoom']/ul/li", out strImg, out strLink);
            /// </code>
            /// </example>
            public static bool GetTaobaoGalleryInfo(HtmlAgilityPack.HtmlNode htmlNode, string xPath, out string[] imgs, out string[] links)
            {
                // The page download may have failed (null node) and the query may match nothing
                // (null collection); report both as "nothing found" rather than throwing.
                HtmlNodeCollection hnc = htmlNode == null ? null : htmlNode.SelectNodes(xPath);
                if (hnc == null || hnc.Count < 1)
                {
                    imgs = new string[0];
                    links = new string[0];
                    return false;
                }

                links = new string[hnc.Count];
                imgs = new string[hnc.Count];

                int i = 0;
                foreach (HtmlNode node in hnc)
                {
                    HtmlNode anchor = node.ChildNodes.Count > 1 ? node.ChildNodes[1] : null;
                    HtmlNode image = (anchor != null && anchor.ChildNodes.Count > 0) ? anchor.ChildNodes[0] : null;

                    HtmlAttribute href = anchor == null ? null : anchor.Attributes["href"];
                    HtmlAttribute src = image == null ? null : image.Attributes["src"];
                    if (href == null || src == null)
                    {
                        // The item does not have the expected structure.
                        return false;
                    }

                    links[i] = href.Value;
                    imgs[i] = src.Value;
                    i++;
                }
                return true;
            }
     //今日炸弹
                HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://www.newegg.com.cn/");

                HtmlAgilityPack.HtmlNode node = nodes.SelectSingleNode("//div[@class='colSub']/div[@class='picBanner shellShocker ']/a");//"//div[@class='slideBannerA homeSlideAD1']"
               
                string strImg = node.Attributes["href"].Value;
                string strSrc= node.ChildNodes[0].Attributes["src"].Value;
                //淘宝类别活动
                HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://www.taobao.com");
                HtmlAgilityPack.HtmlNodeCollection node = nodes.SelectNodes("//span[@class='category-pop']/a");//"//div[@class='slideBannerA homeSlideAD1']"

                string[] strLink = new string[node.Count];
                string[] strText = new string[node.Count];

                try
                {
                    int i = 0;
                    foreach (HtmlNode htmlNode in node)
                    {
                        strLink[i] = htmlNode.Attributes["href"].Value;
                        strText[i++] = htmlNode.InnerText;
                    }
                }
                catch { }
    // Taobao fashion channel (fushi.taobao.com): new product recommendations
                HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://fushi.taobao.com");

                // A failed download (null root) or a query without matches (null collection)
                // results in empty arrays instead of a NullReferenceException.
                HtmlAgilityPack.HtmlNodeCollection node = nodes == null ? null : nodes.SelectNodes("//div[@class='new-product-image-list']/ul[@class='image-list']/li");
                int matchCount = node == null ? 0 : node.Count;

                string[] strLink = new string[matchCount];
                string[] strImg = new string[matchCount];
                string[] strAlt = new string[matchCount];

                if (node != null)
                {
                    int i = 0;
                    foreach (HtmlNode htmlNode in node)
                    {
                        // Expected shape: the item's first child carries the href; that child's
                        // first child carries the image src and its second child the caption markup.
                        HtmlNode anchor = htmlNode.ChildNodes.Count > 0 ? htmlNode.ChildNodes[0] : null;
                        if (anchor != null && anchor.ChildNodes.Count > 1)
                        {
                            HtmlAttribute href = anchor.Attributes["href"];
                            HtmlAttribute src = anchor.ChildNodes[0].Attributes["src"];

                            strLink[i] = href == null ? null : href.Value;
                            strAlt[i] = anchor.ChildNodes[1].InnerHtml;
                            strImg[i] = src == null ? null : src.Value;
                        }
                        // Items without the expected shape keep null entries.
                        i++;
                    }
                }
    // Knitwear recommendations (rihan.vancl.com)
                HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://rihan.vancl.com/","UTF-8");

                // A failed download (null root) or a query without matches (null collection)
                // results in empty arrays instead of a NullReferenceException.
                HtmlAgilityPack.HtmlNodeCollection node = nodes == null ? null : nodes.SelectNodes("//div[@class='prod_area']/ul/li");
                int matchCount = node == null ? 0 : node.Count;

                string[] strLink = new string[matchCount];
                string[] strImg = new string[matchCount];
                string[] strAlt = new string[matchCount];
                string[] strPrice = new string[matchCount];
                string[] strCurrentPrice = new string[matchCount];

                // After the loop, i is the number of items that were read successfully;
                // slots from i onwards stay null.
                int i = 0;
                if (node != null)
                {
                    foreach (HtmlNode htmlNode in node)
                    {
                        try
                        {
                            HtmlAttribute href = htmlNode.ChildNodes[0].Attributes["href"];
                            HtmlAttribute src = htmlNode.ChildNodes[0].ChildNodes[1].Attributes["src"];
                            if (href == null || src == null)
                            {
                                continue; // item lacks a required attribute: skip it
                            }

                            string alt = htmlNode.ChildNodes[4].ChildNodes[1].InnerHtml.Trim();
                            // NOTE(review): the search text of this Replace was lost in the published
                            // listing (an empty search string always throws); "¥" is assumed — confirm
                            // against the live markup.
                            string price = htmlNode.ChildNodes[6].ChildNodes[1].ChildNodes[1].InnerHtml.Trim().Replace("¥", "");
                            string currentPrice = htmlNode.ChildNodes[6].ChildNodes[2].InnerHtml.Trim().Replace("售价¥", "");

                            // Commit only after every lookup succeeded, so a skipped item
                            // never leaves half-written data in slot i.
                            strLink[i] = href.Value;
                            strAlt[i] = alt;
                            strImg[i] = src.Value;
                            strPrice[i] = price;
                            strCurrentPrice[i] = currentPrice;
                            i++;
                        }
                        catch (ArgumentOutOfRangeException)
                        {
                            // The item has fewer child nodes than the expected layout: skip it.
                        }
                    }
                }
            private void button8_Click(object sender, EventArgs e)
            {
                //http://www.masamaso.com  商品列表
                HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://www.masamaso.com/""UTF-8");
                HtmlAgilityPack.HtmlNodeCollection node = nodes.SelectNodes("//ul/li/div[@class='goods_case']");//"//div[@class='slideBannerA homeSlideAD1']"

                string[] strLink = new string[node.Count];
                string[] strImg = new string[node.Count];
                string[] strAlt = new string[node.Count];
                string[] strPrice = new string[node.Count];
                string[] strCurrentPrice = new string[node.Count];

                int i = 0;
                foreach (HtmlNode htmlNode in node)
                {
                    try
                    {
                        strLink[i] = "http://www.masamaso.com/" + htmlNode.ChildNodes[1].ChildNodes[0].Attributes["href"].Value;
                        strAlt[i] = htmlNode.ChildNodes[1].ChildNodes[0].Attributes["title"].Value;
                        strImg[i] = htmlNode.ChildNodes[1].ChildNodes[0].ChildNodes[0].Attributes["src"].Value;
                        //strPrice[i] = htmlNode.ChildNodes[6].ChildNodes[1].ChildNodes[1].InnerHtml.Trim().Replace("¥", "");
                        strCurrentPrice[i++] = htmlNode.ChildNodes[3].ChildNodes[1].ChildNodes[1].ChildNodes[0].InnerHtml.Trim().Replace("&yen;""");
                    }
                    catch 
                    { }
                }
            }

            private void button9_Click(object sender, EventArgs e)
            {
                //http://www.masamaso.com/  弹出广告
                HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://www.masamaso.com/""UTF-8");
                HtmlAgilityPack.HtmlNodeCollection node = nodes.SelectNodes("//div[@class='foot_img tabContainer']/div[@class='tabBox']/div[@class='hd_tp']");//"//div[@class='slideBannerA homeSlideAD1']"

                string[] strLink = new string[node.Count];
                string[] strImg = new string[node.Count];
                string[] strAlt = new string[node.Count];
                string[] strPrice = new string[node.Count];
                string[] strCurrentPrice = new string[node.Count];

                int i = 0;
                foreach (HtmlNode htmlNode in node)
                {
                    try
                    {
                        strLink[i] = htmlNode.ChildNodes[0].Attributes["href"].Value;
                        //strAlt[i] = htmlNode.ChildNodes[1].ChildNodes[0].Attributes["title"].Value;
                        strImg[i++] = htmlNode.ChildNodes[0].ChildNodes[0].Attributes["src"].Value;
                        //strPrice[i] = htmlNode.ChildNodes[6].ChildNodes[1].ChildNodes[1].InnerHtml.Trim().Replace("¥", "");
                        
    //strCurrentPrice[i++] = htmlNode.ChildNodes[3].ChildNodes[1].ChildNodes[1].ChildNodes[0].InnerHtml.Trim().Replace("&yen;", "");
                    }
                    catch 
                    { }
                }

                Func();
            }

            private void Func()
            {
                //http://www.vivian.cn/  弹出广告
                HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://www.vivian.cn/""UTF-8");
                HtmlAgilityPack.HtmlNodeCollection node = nodes.SelectNodes("//div[@class='foot_img tabContainer']/div[@class='tabBox']/div[@class='hd_tp']");//"//div[@class='slideBannerA homeSlideAD1']"

                string[] strLink = new string[node.Count];
                string[] strImg = new string[node.Count];
                string[] strAlt = new string[node.Count];
                string[] strPrice = new string[node.Count];
                string[] strCurrentPrice = new string[node.Count];

                int i = 0;
                foreach (HtmlNode htmlNode in node)
                {
                    try
                    {
                        strLink[i] = htmlNode.ChildNodes[0].Attributes["href"].Value;
                        //strAlt[i] = htmlNode.ChildNodes[1].ChildNodes[0].Attributes["title"].Value;
                        strImg[i++] = htmlNode.ChildNodes[0].ChildNodes[0].Attributes["src"].Value;
                        //strPrice[i] = htmlNode.ChildNodes[6].ChildNodes[1].ChildNodes[1].InnerHtml.Trim().Replace("¥", "");
                        
    //strCurrentPrice[i++] = htmlNode.ChildNodes[3].ChildNodes[1].ChildNodes[1].ChildNodes[0].InnerHtml.Trim().Replace("&yen;", "");
                    }
                    catch 
                    { }
                }

            }

            private void button10_Click(object sender, EventArgs e)
            {
                //http://www.vivian.cn/"   产品列表
                 HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink("http://www.vivian.cn/""UTF-8");
                 HtmlAgilityPack.HtmlNodeCollection node = nodes.SelectNodes("//div[@class='goods_list']/ul/li");//"//div[@class='slideBannerA homeSlideAD1']"

                string[] strLink = new string[node.Count];
                string[] strImg = new string[node.Count];
                string[] strAlt = new string[node.Count];
                string[] strPrice = new string[node.Count];
                string[] strCurrentPrice = new string[node.Count];

                int i = 0;
                foreach (HtmlNode htmlNode in node)
                {
                    try
                    {
                        strLink[i] = "http://www.vivian.cn/" + htmlNode.ChildNodes[1].ChildNodes[1].ChildNodes[0].Attributes["href"].Value;
                        strAlt[i] = htmlNode.ChildNodes[1].ChildNodes[1].ChildNodes[0].Attributes["title"].Value;
                        strImg[i] = htmlNode.ChildNodes[1].ChildNodes[1].ChildNodes[0].ChildNodes[0].Attributes["src"].Value;
                        //strPrice[i] = htmlNode.ChildNodes[6].ChildNodes[1].ChildNodes[1].InnerHtml.Trim().Replace("¥", "");
                        strCurrentPrice[i++] = htmlNode.ChildNodes[1].ChildNodes[3].ChildNodes[3].ChildNodes[0].ChildNodes[1].InnerHtml.Trim().Replace("&yen;""");
                    }
                    catch 
                    { }
                }
            }
  • 相关阅读:
    [每周心学]先生王阳明何许人也?
    CET-6 分频周计划生词筛选(番外篇:百词斩)
    CET-6 分频周计划生词筛选(Week 3)
    架构之美阅读笔记三
    架构之美阅读笔记二
    架构之美阅读笔记一
    软件需求分析课堂讨论01
    问题账户需求分析
    2016年秋季个人阅读计划
    个人总结
  • 原文地址:https://www.cnblogs.com/bober/p/2226794.html
Copyright © 2011-2022 走看看