zoukankan html css js c++ java

爬取当当网的图书信息之封装一个工具类

把这个类名取为Tool

封装一个下载网页的方法GetHtml

/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its content as a string.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>The downloaded text, or an empty string if the download fails for any reason.</returns>
public static string GetHtml(string url)
        {
            try
            {
                // WebClient is IDisposable; release it as soon as the download finishes.
                using (WebClient client = new WebClient())
                {
                    return client.DownloadString(url);
                }
            }
            catch
            {
                // Deliberate best-effort: callers treat "" as "page not available",
                // so every failure is mapped to an empty result.
                return "";
            }
        }

传入的是这个网页的URL，这个方法能帮我们把网页下载下来
封装一个匹配图书类URL的方法

/// <summary>
/// Collects the distinct book-category URLs found in <paramref name="html"/>.
/// </summary>
/// <param name="html">HTML text to scan.</param>
/// <returns>
/// An <see cref="ArrayList"/> of URL strings, in order of first appearance, with duplicates removed.
/// </returns>
public static ArrayList GetList(string html)
        {
            ArrayList list = new ArrayList();

            // Tracks URLs already added so de-duplication is O(1) per match
            // instead of a linear ArrayList.Contains scan.
            HashSet<string> seen = new HashSet<string>();

            // The dots are escaped so they match a literal '.' only; unescaped, '.' matches any character.
            MatchCollection matches = Regex.Matches(html, @"http://category\.dangdang\.com/cp[0-9]{2}\.[0-9]{2}\.[0-9][1-9]\.00\.00\.00\.html");
            foreach (Match match in matches)
            {
                string url = match.Value;
                if (seen.Add(url)) // de-duplicate
                {
                    list.Add(url);
                }
            }

            return list;
        }

这里使用了正则http://category.dangdang.com/cp[0-9]{2}.[0-9]{2}.[0-9][1-9].00.00.00.html来匹配URL地址
封装一个获取图书类名的方法

 /// <summary>
 /// Extracts a book-category name from the page's keywords meta tag.
 /// </summary>
 /// <param name="html">HTML text of a category page.</param>
 /// <returns>
 /// The part of the keywords content that follows its first '/', or an empty string when the
 /// tag is not found or nothing follows a '/' inside the content.
 /// </returns>
 public static string GetBookClassName(string html)
        {
            // Example tag: <meta name="keywords" content="计算机/网络,家庭与办公室用书" />
            // NOTE(review): only the text after the first '/' is returned (here "网络,家庭与办公室用书");
            // confirm that dropping the leading segment is intended.
            const string tagTail = "\" />";

            Match match = Regex.Match(html, @"<meta name=""keywords"" content="".{1,30}"" />");
            if (!match.Success)
            {
                return "";
            }

            string tag = match.Value;
            int slash = tag.IndexOf('/');      // first '/' in the tag (falls on the closing "/>" when the content has none)
            int close = tag.LastIndexOf('>');  // the pattern ends with '>', so this is the last character of the match

            // Require at least one character between the '/' and the closing quote.
            if (close - slash > tagTail.Length)
            {
                return tag.Substring(slash + 1, close - slash - tagTail.Length);
            }

            return "";
        }

查看网页的源代码

 <meta name="keywords" content="计算机/网络,家庭与办公室用书" />

图书类名就在这里，接着我们使用正则把它抓取到

接下来我们要抓取每个图书类别共有多少页

 /// <summary>
 /// Reads the total page count of a category from its pager markup.
 /// </summary>
 /// <param name="html">HTML text of a category page.</param>
 /// <returns>The number captured from the pager ("共N页"), or 1 when the pager is not found.</returns>
 public static int GetPages(string html)
        {
            // The capture group isolates the 1-4 ASCII digits between "共" and "页".
            Match match = Regex.Match(html, @"<li class=""page_input""><span>共([0-9]{1,4})页 到第</span>");
            if (!match.Success)
            {
                return 1;
            }

            return int.Parse(match.Groups[1].Value, System.Globalization.CultureInfo.InvariantCulture);
        }

处理好BookClass接下来处理Book了

获取图书详细页面的URL

 /// <summary>
 /// Collects the distinct product detail-page URLs found in <paramref name="html"/>.
 /// Every matched URL is also written to the console, duplicates included.
 /// </summary>
 /// <param name="html">HTML text to scan.</param>
 /// <returns>
 /// An <see cref="ArrayList"/> of URL strings, in order of first appearance, with duplicates removed.
 /// </returns>
 public static ArrayList GetProduct(string html)
        {
            // Example: http://product.dangdang.com/22862060.html
            ArrayList list = new ArrayList();

            // Tracks URLs already added so de-duplication does not rescan the list for every match.
            HashSet<string> seen = new HashSet<string>();

            // The dots are escaped so they match a literal '.' only.
            MatchCollection matches = Regex.Matches(html, @"http://product\.dangdang\.com/[0-9]{8}\.html");
            foreach (Match match in matches)
            {
                string url = match.Value;
                Console.WriteLine(url);
                if (seen.Add(url))
                {
                    list.Add(url);
                }
            }

            return list;
        }

封装一个方法，待爬虫获取图书详细页来抓取图书信息

以如何抓取价格信息为例

 <div class="price_pc" id="pc-price">
            <div class="price_d">
                <p class="t" id="dd-price-text">当当价</p>
                <p id="dd-price">
                    <span class="yen">&yen;</span>66.40                </p>
            </div>
                        <div class="price_zhe" id="dd-zhe"></div>
            <div class="price_m price_m_t" id="original-price-text">定价</div>
            <div class="price_m" id='original-price'>
                <span class="yen">&yen;</span>99.00            </div>
                        <div class="price_vip" style="display:none" id="dd-vip">
                <span></span>
            </div>
                    </div>
                    </div>

66.40是我们需要匹配出来的数据，数据特征并不是很明显，直接匹配会出现杂乱的数据，我们先抓取稍大范围的，缩小搜索范围再来寻找

 // Coarse match: the yen-sign span plus the few characters that follow it (which contain the price).
 // A verbatim string is used so the quotes inside the pattern are written as "".
 MatchCollection matches = Regex.Matches(html, @" <span class=""yen"">&yen;</span>.{1,4}.[0-9]{2}");

缩小爬虫抓取范围后，借助IndexOf来定位并截取价格

if (matches.Count > 0)
            {
                // Keep only what follows the closing </span> of the first match: that text is the price.
                const string closingTag = "</span>";
                string matchedText = matches[0].ToString();
                int tagIndex = matchedText.IndexOf(closingTag, 0);
                if (tagIndex > 0)
                {
                    int priceStart = tagIndex + closingTag.Length;
                    price = matchedText.Substring(priceStart, matchedText.Length - priceStart);
                }
            }

嘿嘿价格信息就这样抓取到了，其他的不详细介绍

 /// <summary>
 /// Extracts the book fields from the HTML of a product detail page.
 /// </summary>
 /// <param name="html">HTML text of a product detail page.</param>
 /// <returns>
 /// A dictionary keyed 1..6: 1 = book name, 2 = price, 3 = author, 4 = publisher,
 /// 5 = cover image URL, 6 = description. A field that cannot be found keeps its default
 /// (empty string, or "0" for the price).
 /// </returns>
 public static Dictionary<int, string> analysis(string html)
        {
            string bookName = "";
            string price = "0";
            string imgurl = "";

            // Price: the text that follows the first yen-sign span. The group captures what the
            // coarse pattern matched after </span>.
            Match priceMatch = Regex.Match(html, @" <span class=""yen"">&yen;</span>(.{1,4}.[0-9]{2})");
            if (priceMatch.Success)
            {
                price = priceMatch.Groups[1].Value;
            }

            // Book name: the text after 《 in the <title>, up to 》; when there is no 》,
            // up to the fixed title suffix instead.
            Match titleMatch = Regex.Match(html, "<title>.*</title>");
            if (titleMatch.Success)
            {
                string title = titleMatch.Value;
                int open = title.IndexOf("《", StringComparison.Ordinal);
                if (open >= 0)
                {
                    int close = title.IndexOf("》", open, StringComparison.Ordinal);
                    if (close < 0)
                    {
                        close = title.IndexOf("【简介_书评_在线阅读】 - 当当图书", open, StringComparison.Ordinal);
                    }

                    if (close > open)
                    {
                        bookName = title.Substring(open + 1, close - open - 1);
                    }
                }
            }

            // Author link, e.g. ... target="_blank" dd_name="作者">Marty</a>
            string author = TextBetween(html, "target=\"_blank\" dd_name=\"作者\">", "</a>");

            // Publisher link, e.g. ... target="_blank" dd_name="出版社">...</a>
            string publisher = TextBetween(html, "target=\"_blank\" dd_name=\"出版社\">", "</a>");

            // Cover image, e.g. <img src="http://img3x6.ddimg.cn/88/36/23845426-1_u_5.jpg" alt="" height="800" width="800">
            // The dots are escaped so they match a literal '.' only.
            Match imageMatch = Regex.Match(html, @"http://img3x[0-9]\.ddimg\.cn/[0-9]{2}/[0-9]{2}/[0-9]{8}-[0-9]_u_[0-9]\.jpg");
            if (imageMatch.Success)
            {
                imgurl = imageMatch.Value;
            }

            // Description: content of the description meta tag. The end marker is searched after the
            // opening quote, so an empty content="" yields "" instead of a negative Substring length.
            string content = TextBetween(html, "<meta name=\"description\" content=\"", "\">");

            Dictionary<int, string> dictionary = new Dictionary<int, string>();
            dictionary.Add(1, bookName);
            dictionary.Add(2, price);
            dictionary.Add(3, author);
            dictionary.Add(4, publisher);
            dictionary.Add(5, imgurl);
            dictionary.Add(6, content);
            return dictionary;
        }

        // Returns the text between the first startMarker and the next endMarker after it,
        // or "" when either marker is missing.
        private static string TextBetween(string html, string startMarker, string endMarker)
        {
            int start = html.IndexOf(startMarker, StringComparison.Ordinal);
            if (start < 0)
            {
                return "";
            }

            int contentStart = start + startMarker.Length;
            int end = html.IndexOf(endMarker, contentStart, StringComparison.Ordinal);
            if (end < 0)
            {
                return "";
            }

            return html.Substring(contentStart, end - contentStart);
        }

Tool类完成

查看全文

相关阅读:
Navicat 远程连接ubuntu出现的问题
 替换 ubuntu 自带的python版本
 xpath疑惑
 xpath中返回值问题
 AttributeError: 'unicode' object has no attribute 'xpath'
linux下mysql忘记密码解决方案
 IntelliJ idea常用快捷键
 最近的说明（本篇不谈具体技术，看技术的可以忽略）
常用的排序算法介绍和在JAVA的实现（二）
mysql数据库查询过程探究和优化建议

原文地址：https://www.cnblogs.com/zuin/p/6106196.html