zoukankan      html  css  js  c++  java
  • 爬取当当网的图书信息之封装一个工具类

    把这个类名取为Tool

    封装一个下载网页的方法GetHtml

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns its body as text.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>
    /// The downloaded text, or an empty string when <paramref name="url"/> is blank
    /// or the download fails for any reason.
    /// </returns>
    public static string GetHtml(string url)
            {
                // A blank address can never succeed, so do not create a client for it.
                if (string.IsNullOrWhiteSpace(url))
                {
                    return "";
                }

                try
                {
                    // WebClient is IDisposable; release it deterministically instead of
                    // leaking one instance per downloaded page.
                    // NOTE(review): the text is decoded with WebClient's default encoding;
                    // confirm that it matches the charset of the pages being crawled.
                    using (WebClient wb = new WebClient())
                    {
                        return wb.DownloadString(url);
                    }
                }
                catch (Exception ex)
                {
                    // Best-effort contract: callers treat "" as "page unavailable".
                    // The failure is still traced so it is not completely silent.
                    System.Diagnostics.Debug.WriteLine("GetHtml failed for " + url + ": " + ex.Message);
                    return "";
                }
            }

    传入的是这个网页的URL,这个方法能帮我们把网页下载下来
    封装一个匹配图书类URL的方法
    /// <summary>
    /// Collects the distinct book-category page URLs that appear in <paramref name="html"/>.
    /// </summary>
    /// <param name="html">Markup to scan; null or empty input yields an empty list.</param>
    /// <returns>
    /// An <see cref="ArrayList"/> of URL strings in first-seen order, without duplicates.
    /// </returns>
    public static ArrayList GetList(string html)
            {
                ArrayList list = new ArrayList();
                if (string.IsNullOrEmpty(html))
                {
                    return list;
                }

                // The dots are escaped so that only literal '.' separators match;
                // an unescaped '.' would accept any character in those positions.
                MatchCollection matches = Regex.Matches(html, @"http://category\.dangdang\.com/cp[0-9]{2}\.[0-9]{2}\.[0-9][1-9]\.00\.00\.00\.html");

                // The set gives O(1) duplicate detection; the list keeps first-seen order.
                HashSet<string> seen = new HashSet<string>();
                foreach (Match match in matches)
                {
                    if (seen.Add(match.Value))
                    {
                        list.Add(match.Value);
                    }
                }
                return list;
            }
    这里使用了正则http://category.dangdang.com/cp[0-9]{2}.[0-9]{2}.[0-9][1-9].00.00.00.html来匹配URL地址(注意:正则中未转义的 . 可以匹配任意字符,若只想匹配字面上的点号,应写成 \.)
    封装一个获取图书类名的方法
     /// <summary>
     /// Reads the book-category name from the page's "keywords" meta tag.
     /// </summary>
     /// <remarks>
     /// The tag must have the form meta name="keywords" content="..." followed by " />",
     /// with 1 to 30 characters of content. The returned value is the part of the content
     /// after the first '/': a content of "计算机/网络,家庭与办公室用书" yields "网络,家庭与办公室用书".
     /// </remarks>
     /// <param name="html">Markup of a category page; null or empty yields "".</param>
     /// <returns>
     /// The text after the first '/' in the keywords content, or "" when the tag is
     /// missing or its content has no text after a '/'.
     /// </returns>
     public static string  GetBookClassName(string html)
            {
                // Closing part of the tag that follows the content: quote, space, slash, '>'.
                const string tagTail = "\" />";

                if (string.IsNullOrEmpty(html))
                {
                    return "";
                }

                Match match = Regex.Match(html, "<meta name=\"keywords\" content=\".{1,30}\" />");
                if (!match.Success)
                {
                    return "";
                }

                string tag = match.Value;
                int slash = tag.IndexOf('/');
                int close = tag.LastIndexOf('>');

                // When the content holds no '/', the first slash found is the one in the
                // closing " />", the distance is too short, and the result stays empty.
                if (close - slash > tagTail.Length)
                {
                    return tag.Substring(slash + 1, close - slash - tagTail.Length);
                }
                return "";
            }

    查看网页的源代码

     <meta name="keywords" content="计算机/网络,家庭与办公室用书" />

    图书类名就在这里,接着我们使用正则把它抓取出来

    接下来我们要抓取每个图书类别共有多少页

     /// <summary>
     /// Reads the total page count of a category listing from its pager markup.
     /// </summary>
     /// <remarks>
     /// Looks for the pager fragment li class="page_input" / span containing "共N页 到第",
     /// where N is 1 to 4 digits.
     /// </remarks>
     /// <param name="html">Markup of a category listing page; null or empty yields 1.</param>
     /// <returns>The number N from the pager, or 1 when no pager fragment is found.</returns>
     public static int GetPages(string html)
            {
                if (string.IsNullOrEmpty(html))
                {
                    return 1;
                }

                // The capture group isolates the digits between "共" and "页", so no
                // manual index arithmetic is needed.
                Match pager = Regex.Match(html, "<li class=\"page_input\"><span>共([0-9]{1,4})页 到第</span>");
                if (!pager.Success)
                {
                    return 1;
                }

                // The group is guaranteed to be 1-4 ASCII digits, so Parse cannot fail.
                return int.Parse(pager.Groups[1].Value);
            }
    

    处理好BookClass接下来处理Book了

    获取图书详细页面的URL

     /// <summary>
     /// Collects the distinct product detail-page URLs that appear in <paramref name="html"/>.
     /// </summary>
     /// <remarks>
     /// A product URL has the form http://product.dangdang.com/NNNNNNNN.html with an
     /// eight-digit id. Every match, duplicates included, is echoed to the console.
     /// </remarks>
     /// <param name="html">Markup of a listing page; null or empty yields an empty list.</param>
     /// <returns>
     /// An <see cref="ArrayList"/> of URL strings in first-seen order, without duplicates.
     /// </returns>
     public static ArrayList GetProduct(string html)
            {
                ArrayList list = new ArrayList();
                if (string.IsNullOrEmpty(html))
                {
                    return list;
                }

                // The dots are escaped so that only literal '.' characters match.
                MatchCollection matches = Regex.Matches(html, @"http://product\.dangdang\.com/[0-9]{8}\.html");

                // The set gives O(1) duplicate detection; the list keeps first-seen order.
                HashSet<string> seen = new HashSet<string>();
                foreach (Match match in matches)
                {
                    // Kept from the original: progress output for every URL encountered.
                    Console.WriteLine(match.Value);
                    if (seen.Add(match.Value))
                    {
                        list.Add(match.Value);
                    }
                }

                return list;
            }

    封装一个方法,待爬虫获取到图书详细页之后,用它来抓取图书信息

    以如何抓取价格信息为例

     <div class="price_pc" id="pc-price">
                <div class="price_d">
                    <p class="t" id="dd-price-text">当当价</p>
                    <p id="dd-price">
                        <span class="yen">&yen;</span>66.40                </p>
                </div>
                            <div class="price_zhe" id="dd-zhe"></div>
                <div class="price_m price_m_t" id="original-price-text">定价</div>
                <div class="price_m" id='original-price'>
                    <span class="yen">&yen;</span>99.00            </div>
                            <div class="price_vip" style="display:none" id="dd-vip">
                    <span></span>
                </div>
                        </div>
                        </div>

    66.40是我们需要匹配出来的数据,数据特征并不是很明显,直接匹配会出现杂乱的数据,我们先抓取稍大范围的,缩小搜索范围再来寻找

     // Coarse match: the yen-marked span plus the amount after it (up to four
     // characters, a literal decimal point, then two digits).
     MatchCollection matches = Regex.Matches(html, " <span class=\"yen\">&yen;</span>.{1,4}\\.[0-9]{2}");

    缩小爬虫抓取范围后,再借助 IndexOf 定位到价格

    if (matches.Count != 0)
                {
                    // Work on the first coarse match only.
                    string matched = matches[0].ToString();
                    int tagPos = matched.IndexOf("</span>", 0);

                    // Everything after the closing </span> is the amount itself.
                    if (tagPos > 0)
                    {
                        price = matched.Substring(tagPos + "</span>".Length);
                    }
                }

    嘿嘿 价格信息就这样抓取到了,其他的不详细介绍

     /// <summary>
     /// Extracts the book details from the markup of a product detail page.
     /// </summary>
     /// <param name="html">Markup of a product detail page; null is treated as empty.</param>
     /// <returns>
     /// A dictionary with exactly six entries:
     /// 1 = book name ("" if not found), 2 = price ("0" if not found),
     /// 3 = author, 4 = publisher, 5 = cover image URL, 6 = description
     /// (each "" if not found).
     /// </returns>
     public static Dictionary<int, string> analysis(string html)
            {
                if (html == null)
                {
                    html = "";
                }

                Dictionary<int, string> dictionary = new Dictionary<int, string>();
                dictionary.Add(1, ExtractBookName(html));
                dictionary.Add(2, ExtractPrice(html));
                // e.g. ... target="_blank" dd_name="作者">Marty</a>
                dictionary.Add(3, ExtractBetween(html, "target=\"_blank\" dd_name=\"作者\">", "</a>"));
                // e.g. ... target="_blank" dd_name="出版社">...</a>
                dictionary.Add(4, ExtractBetween(html, "target=\"_blank\" dd_name=\"出版社\">", "</a>"));
                dictionary.Add(5, ExtractImageUrl(html));
                // NOTE(review): assumes the description meta tag closes with "> (no " />");
                // confirm against the real page markup.
                dictionary.Add(6, ExtractBetween(html, "<meta name=\"description\" content=\"", "\">"));
                return dictionary;
            }

     // Returns the text between the first startMarker and the next endMarker, or "" if either is missing.
     private static string ExtractBetween(string html, string startMarker, string endMarker)
            {
                // Ordinal search: markup markers must match character for character,
                // independent of the current culture.
                int start = html.IndexOf(startMarker, StringComparison.Ordinal);
                if (start < 0)
                {
                    return "";
                }

                int contentStart = start + startMarker.Length;
                int end = html.IndexOf(endMarker, contentStart, StringComparison.Ordinal);
                if (end < 0)
                {
                    return "";
                }

                return html.Substring(contentStart, end - contentStart);
            }

     // Returns the amount after the first yen-marked span (e.g. "66.40"), or "0" if none is found.
     private static string ExtractPrice(string html)
            {
                // The group captures what follows </span>: up to four characters, a
                // literal decimal point and two digits.
                Match match = Regex.Match(html, " <span class=\"yen\">&yen;</span>(.{1,4}\\.[0-9]{2})");
                return match.Success ? match.Groups[1].Value : "0";
            }

     // Returns the book name taken from the page title, or "" if it cannot be located.
     private static string ExtractBookName(string html)
            {
                Match titleMatch = Regex.Match(html, "<title>.*</title>");
                if (!titleMatch.Success)
                {
                    return "";
                }

                string title = titleMatch.Value;
                int open = title.IndexOf('《');
                if (open < 0)
                {
                    return "";
                }

                // Normal case: the name is enclosed in 《 》. If the closing mark is
                // missing, fall back to cutting at the site's fixed title suffix.
                int close = title.IndexOf('》', open);
                if (close <= open)
                {
                    close = title.IndexOf("【简介_书评_在线阅读】 - 当当图书", open, StringComparison.Ordinal);
                }

                return close > open ? title.Substring(open + 1, close - open - 1) : "";
            }

     // Returns the first cover-image URL on the page, or "" if none is found.
     private static string ExtractImageUrl(string html)
            {
                // e.g. http://img3x6.ddimg.cn/88/36/23845426-1_u_5.jpg
                // The dots are escaped so that only literal '.' characters match.
                Match match = Regex.Match(html, @"http://img3x[0-9]\.ddimg\.cn/[0-9]{2}/[0-9]{2}/[0-9]{8}-[0-9]_u_[0-9]\.jpg");
                return match.Success ? match.Value : "";
            }

    Tool类完成

  • 相关阅读:
    HDU 1009 FatMouse' Trade
    HDU 2602 (简单的01背包) Bone Collector
    LA 3902 Network
    HDU 4513 吉哥系列故事——完美队形II
    LA 4794 Sharing Chocolate
    POJ (Manacher) Palindrome
    HDU 3294 (Manacher) Girls' research
    HDU 3068 (Manacher) 最长回文
    Tyvj 1085 派对
    Tyvj 1030 乳草的入侵
  • 原文地址:https://www.cnblogs.com/zuin/p/6106196.html
Copyright © 2011-2022 走看看