zoukankan      html  css  js  c++  java
  • 爬取当当网的图书信息之封装一个工具类

    把这个类名取为Tool

    封装一个下载网页的方法GetHtml

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns its content as text.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>
    /// The downloaded text, or an empty string when <paramref name="url"/> is null/empty
    /// or the download fails for any reason.
    /// </returns>
    public static string GetHtml(string url)
            {
                // Nothing to download: report it the same way as a failed download.
                if (string.IsNullOrEmpty(url))
                {
                    return "";
                }

                try
                {
                    // WebClient is IDisposable; release it as soon as the download finishes.
                    using (WebClient client = new WebClient())
                    {
                        return client.DownloadString(url);
                    }
                }
                catch
                {
                    // Best-effort: any failure is reported to the caller as an empty string.
                    return "";
                }
            }

    传入的是这个网页的URL,这个方法能帮我们把网页下载下来
    封装一个匹配图书类URL的方法
    /// <summary>
    /// Collects the distinct book-category URLs found in <paramref name="html"/>,
    /// in order of first appearance.
    /// </summary>
    /// <param name="html">Page source to scan.</param>
    /// <returns>
    /// An <see cref="ArrayList"/> of URL strings; empty when <paramref name="html"/>
    /// is null/empty or contains no category URL.
    /// </returns>
    public static ArrayList GetList(string html)
            {
                ArrayList list = new ArrayList();
                if (string.IsNullOrEmpty(html))
                {
                    return list;
                }

                // Dots are escaped so they match a literal '.' only; unescaped they match any
                // character and let look-alike URLs through.
                // NOTE(review): the third group is [0-9][1-9], which also rejects codes ending
                // in 0 such as "10" - confirm this is intended.
                const string pattern =
                    @"http://category\.dangdang\.com/cp[0-9]{2}\.[0-9]{2}\.[0-9][1-9]\.00\.00\.00\.html";

                // The set gives constant-time duplicate detection; the list keeps the order.
                HashSet<string> seen = new HashSet<string>();
                foreach (Match match in Regex.Matches(html, pattern))
                {
                    if (seen.Add(match.Value))
                    {
                        list.Add(match.Value);
                    }
                }
                return list;
            }
    这里使用了正则http://category.dangdang.com/cp[0-9]{2}.[0-9]{2}.[0-9][1-9].00.00.00.html来匹配URL地址
    封装一个获取图书类名的方法
     /// <summary>
     /// Reads the book-category name from the page's keywords meta tag.
     /// </summary>
     /// <param name="html">Source of a category page.</param>
     /// <returns>
     /// The text between the first '/' of the matched tag and the tag's closing
     /// quote, or an empty string when the tag is missing or that text is empty.
     /// </returns>
     public static string GetBookClassName(string html)
            {
                // Sample tag:  <meta name="keywords" content="计算机/网络,家庭与办公室用书" />
                // Result for that sample: 网络,家庭与办公室用书
                //
                // NOTE(review): because the slice starts after the FIRST '/', the part of the
                // content before that slash is dropped, and a content value without any '/'
                // yields "" (the first slash is then the one in the closing " />").
                // Confirm this is the intended name before relying on it.
                if (string.IsNullOrEmpty(html))
                {
                    return "";
                }

                Match tag = Regex.Match(html, "<meta name=\"keywords\" content=\".{1,30}\" />");
                if (!tag.Success)
                {
                    return "";
                }

                const string closing = "\" />";
                string text = tag.Value;
                int slash = text.IndexOf('/');
                int end = text.LastIndexOf('>');

                // Only slice when something is left once the closing quote, space, '/' and '>' are removed.
                if (end - slash > closing.Length)
                {
                    return text.Substring(slash + 1, end - slash - closing.Length);
                }
                return "";
            }

    查看网页的源代码

     <meta name="keywords" content="计算机/网络,家庭与办公室用书" />

    图书类名就在这里,接着我们使用正则把它抓取出来。

    接下来我们要抓取每个图书类别共有多少页

     /// <summary>
     /// Reads the total page count from the pager element of a category page.
     /// </summary>
     /// <param name="html">Source of a category page.</param>
     /// <returns>
     /// The number captured from the pager text (one to four digits), or 1 when
     /// <paramref name="html"/> is null/empty or no pager is found.
     /// </returns>
     public static int GetPages(string html)
            {
                const int defaultPages = 1;
                if (string.IsNullOrEmpty(html))
                {
                    return defaultPages;
                }

                // The capture group isolates the digits, so no manual index arithmetic is needed.
                Match pager = Regex.Match(html, "<li class=\"page_input\"><span>共([0-9]{1,4})页 到第</span>");
                if (!pager.Success)
                {
                    return defaultPages;
                }

                // [0-9]{1,4} guarantees one to four ASCII digits, so Parse cannot fail or overflow.
                return int.Parse(pager.Groups[1].Value);
            }
    

    处理好BookClass接下来处理Book了

    获取图书详细页面的URL

     /// <summary>
     /// Collects the distinct product-detail URLs found in <paramref name="html"/>,
     /// in order of first appearance.
     /// </summary>
     /// <param name="html">Source of a category listing page.</param>
     /// <returns>
     /// An <see cref="ArrayList"/> of URL strings; empty when <paramref name="html"/>
     /// is null/empty or contains no product URL.
     /// </returns>
     public static ArrayList GetProduct(string html)
            {
                // Sample URL: http://product.dangdang.com/22862060.html
                ArrayList list = new ArrayList();
                if (string.IsNullOrEmpty(html))
                {
                    return list;
                }

                // Dots are escaped so they match a literal '.' only.
                const string pattern = @"http://product\.dangdang\.com/[0-9]{8}\.html";

                HashSet<string> seen = new HashSet<string>();
                foreach (Match match in Regex.Matches(html, pattern))
                {
                    string url = match.Value;

                    // Every match, duplicates included, is echoed to the console as before.
                    Console.WriteLine(url);

                    if (seen.Add(url))
                    {
                        list.Add(url);
                    }
                }

                return list;
            }

    封装一个方法,待爬虫获取图书详细页来抓取图书信息

    以如何抓取价格信息为例

     <div class="price_pc" id="pc-price">
                <div class="price_d">
                    <p class="t" id="dd-price-text">当当价</p>
                    <p id="dd-price">
                        <span class="yen">&yen;</span>66.40                </p>
                </div>
                            <div class="price_zhe" id="dd-zhe"></div>
                <div class="price_m price_m_t" id="original-price-text">定价</div>
                <div class="price_m" id='original-price'>
                    <span class="yen">&yen;</span>99.00            </div>
                            <div class="price_vip" style="display:none" id="dd-vip">
                    <span></span>
                </div>
                        </div>
                        </div>

    66.40是我们需要匹配出来的数据,数据特征并不是很明显,直接匹配会出现杂乱的数据,我们先抓取稍大范围的,缩小搜索范围再来寻找

     // Coarse match: the yen <span>, then 1-4 characters, one more character and two digits.
     // The double quotes inside the pattern must be escaped for the literal to compile.
     MatchCollection matches = Regex.Matches(html, " <span class=\"yen\">&yen;</span>.{1,4}.[0-9]{2}");

    缩小爬虫抓取范围后,借助IndexOf定位</span>的位置,再截取其后的价格

    // Take the first coarse match and keep everything that follows the closing </span>.
    if (matches.Count > 0)
                {
                    string closingTag = "</span>";
                    string matched = matches[0].ToString();
                    int tagAt = matched.IndexOf(closingTag, 0);

                    if (tagAt > 0)
                    {
                        int valueAt = tagAt + closingTag.Length;
                        price = matched.Substring(valueAt, matched.Length - valueAt);
                    }
                }

    嘿嘿 价格信息就这样抓取到了,其他的不详细介绍

     /// <summary>
     /// Extracts the book fields from the HTML of a product detail page.
     /// </summary>
     /// <param name="html">Source of a product detail page; null is treated as empty.</param>
     /// <returns>
     /// A dictionary with exactly six entries: 1 = book name, 2 = price, 3 = author,
     /// 4 = publisher, 5 = cover image URL, 6 = description. A field that cannot be
     /// found is "" except the price, which defaults to "0".
     /// </returns>
     public static Dictionary<int, string> analysis(string html)
            {
                if (html == null)
                {
                    html = "";
                }

                Dictionary<int, string> dictionary = new Dictionary<int, string>();
                dictionary.Add(1, ExtractBookName(html));
                dictionary.Add(2, ExtractPrice(html));
                dictionary.Add(3, ExtractBetween(html, "target=\"_blank\" dd_name=\"作者\">", "</a>"));
                dictionary.Add(4, ExtractBetween(html, "target=\"_blank\" dd_name=\"出版社\">", "</a>"));
                dictionary.Add(5, ExtractImageUrl(html));
                dictionary.Add(6, ExtractBetween(html, "<meta name=\"description\" content=\"", "\">"));
                return dictionary;
            }

     // Returns the text between the first startMarker and the next endMarker after it, or "" if either is missing.
     private static string ExtractBetween(string html, string startMarker, string endMarker)
            {
                // Ordinal comparison: the markers are markup, not linguistic text.
                int start = html.IndexOf(startMarker, StringComparison.Ordinal);
                if (start < 0)
                {
                    return "";
                }

                // Search for the end marker only AFTER the start marker. Searching from the marker's
                // first character would, for an empty value such as content="">, find the end marker
                // overlapping the start marker and produce a negative Substring length.
                int valueStart = start + startMarker.Length;
                int end = html.IndexOf(endMarker, valueStart, StringComparison.Ordinal);
                if (end < 0)
                {
                    return "";
                }

                return html.Substring(valueStart, end - valueStart);
            }

     // Returns the book name taken from the <title> element, or "" when it cannot be found.
     private static string ExtractBookName(string html)
            {
                Match title = Regex.Match(html, "<title>.*</title>");
                if (!title.Success)
                {
                    return "";
                }

                string text = title.Value;
                int open = text.IndexOf('《');
                if (open < 0)
                {
                    return "";
                }

                // Prefer the closing bracket; fall back to the fixed title suffix when it is absent.
                int close = text.IndexOf('》', open);
                if (close < 0)
                {
                    close = text.IndexOf("【简介_书评_在线阅读】 - 当当图书", open, StringComparison.Ordinal);
                }

                return close > open ? text.Substring(open + 1, close - open - 1) : "";
            }

     // Returns the first price that follows a yen <span>, or "0" when none is found.
     private static string ExtractPrice(string html)
            {
                // Coarse pattern: the span, then 1-4 characters, one more character and two digits.
                Match coarse = Regex.Match(html, " <span class=\"yen\">&yen;</span>.{1,4}.[0-9]{2}");
                if (!coarse.Success)
                {
                    return "0";
                }

                const string closingTag = "</span>";
                string text = coarse.Value;
                int tagAt = text.IndexOf(closingTag, StringComparison.Ordinal);
                if (tagAt < 0)
                {
                    return "0";
                }

                // The leading wildcard may have absorbed whitespace between the span and the number.
                return text.Substring(tagAt + closingTag.Length).Trim();
            }

     // Returns the first cover-image URL in the page, or "" when none is found.
     private static string ExtractImageUrl(string html)
            {
                // Sample: http://img3x6.ddimg.cn/88/36/23845426-1_u_5.jpg
                // Dots are escaped so they match a literal '.' only.
                Match image = Regex.Match(
                    html,
                    @"http://img3x[0-9]\.ddimg\.cn/[0-9]{2}/[0-9]{2}/[0-9]{8}-[0-9]_u_[0-9]\.jpg");
                return image.Success ? image.Value : "";
            }

    Tool类完成

  • 相关阅读:
    递归
    最简单的基于FFMPEG的音频编码器(PCM编码为AAC)
    最简单的基于FFMPEG的封装格式转换器(无编解码)
    最简单的基于FFMPEG的图像编码器(YUV编码为JPEG)
    视频主观质量评价工具:MSU Perceptual Video Quality tool
    ffmbc——为广播电视以及专业用途量身定制的FFmpeg
    方便使用FFMPEG的经验
    OpenCV提取显示一张图片(或者视频)的R,G,B颜色分量
    avcodec_decode_video2()解码视频后丢帧的问题解决
    HEVC,VP9,x264性能对比
  • 原文地址:https://www.cnblogs.com/zuin/p/6106196.html
Copyright © 2011-2022 走看看