zoukankan      html  css  js  c++  java
  • Selenium&EmguCV实现爬虫图片识别

    概述

    爬虫需要抓取网站上的价格。与一般的网页抓取不同,这里要抓取的内容是通过 AJAX 加载的,并且价格不是文本,而是通过 CSS 背景图片显示的。

    image

    每一个数字对应一个样式,如'p_h57_5'

    /* Example of one price-digit class: the digit is rendered as a slice of a
       background image rather than as text, so it cannot be read from the DOM. */
    .p_h57_5 {
    /* The horizontal offset (-590px) picks which part of the shared GIF shows through. */
    background: url('http://pic.c-ctrip.com/priceblur/h57/3713de5c594648529f39d031243966dd.gif') no-repeat -590px;
    padding: 0 6px;
    font-size: 18px;
    }
    

    数字对应的样式和对应的backgroundimg都是动态改变的,需要获取到每一个房型的房价。虽然后来有了其它渠道获取房价,这里记录一下用Selenium&Emgu抓取的方式。
    流程:

    1.Selenium访问网址
    2.全屏截图
    3.Selenium选择器获取房型等信息
    4.Selenium选择器获取价格DOM元素,计算出价格元素的相对位置,截取价格图片,使用Emgu识别价格并且输出
    

    实现

    /// <summary>
    /// Opens the hotel page in Chrome, takes a full-page screenshot, and prints one line
    /// per room row containing the room type, the room type name and the price obtained
    /// by running OCR over the price element's region of the screenshot.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        // Each switch must be its own argument: a single space-separated string is
        // handed to Chrome as one argument and is not split into two switches.
        ChromeOptions options = new ChromeOptions();
        options.AddArguments("--start-maximized", "--disable-popup-blocking");

        var driver = new ChromeDriver(options);
        try
        {
            driver.Navigate().GoToUrl("http://hotels.ctrip.com/hotel/992765.html");

            // The room table arrives via AJAX; its presence is used as the "page is ready"
            // signal. One second was too tight for a network round trip, so allow ten.
            new WebDriverWait(driver, TimeSpan.FromSeconds(10)).Until(
                ExpectedConditions.ElementExists(By.ClassName("htl_room_table")));

            ReadOnlyCollection<IWebElement> elementsList = driver.FindElements(By.CssSelector("tr[expand]"));

            // Hide every <dfn> element (the currency sign next to the price) so that it
            // does not end up inside the cropped price image.
            driver.ExecuteScript(@"
                    var arr =  document.getElementsByTagName('dfn');
                    for(var i=0;i<arr.length;i++){
                        arr[i].style.display = 'none';     
                    }
                ");

            // Full-page screenshot; every price image below is cropped out of it.
            using (Bitmap pageShot = GetEntereScreenshot(driver))
            {
                if (pageShot == null)
                {
                    // GetEntereScreenshot returns null when it fails before allocating the bitmap.
                    Console.Error.WriteLine("Full-page screenshot failed; prices cannot be recognised.");
                    return;
                }

                // Debug copy of the stitched screenshot.
                pageShot.Save(@"Z:\111.jpg");

                Console.WriteLine("{0,-20}{1,-20}{2,-20}", "房型", "类型", "房价");
                foreach (IWebElement row in elementsList)
                {
                    var roomType = "";
                    try
                    {
                        roomType = row.FindElement(By.CssSelector(".room_unfold")).Text;
                    }
                    catch (NoSuchElementException)
                    {
                        // A row without a .room_unfold element was tolerated before; keep
                        // that, but only for the "not found" case instead of every exception.
                    }

                    // regRoomType is declared outside this method (not shown here).
                    var roomTypeText = regRoomType.Match(roomType);
                    var roomTypeName = row.FindElement(By.CssSelector("span.room_type_name")).Text;

                    // Crop the price element out of the screenshot and recognise the digits.
                    using (var image = row.FindElement(By.CssSelector("span.base_price")).SnapshotV2(pageShot))
                    {
                        var price = ORC_((Bitmap)image);
                        Console.WriteLine("{0,-20}{1,-20}{2,-20}", roomTypeText.Value, roomTypeName, price);
                    }
                }
            }

            // Keep the console window open until a key is pressed.
            Console.Read();
        }
        finally
        {
            // Always shut down the browser and the chromedriver process.
            driver.Quit();
        }
    }
    

    图片识别方法

    
    /// <summary>
    /// Restricts the shared OCR engine to digits, since only prices are recognised.
    /// </summary>
    static Program()
    {
        _ocr.SetVariable("tessedit_char_whitelist", "0123456789");
    }

    /// <summary>
    /// Shared Tesseract engine used by the OCR helper. The first argument is the
    /// tessdata directory of the local Emgu CV installation; the backslashes of this
    /// verbatim path had been lost (and "\t" turned into a TAB), so the path is restored.
    /// </summary>
    private static Tesseract _ocr = new Tesseract(@"C:\Emgu\emgucv-windows-universal-cuda 2.9.0.1922\bin\tessdata", "eng", Tesseract.OcrEngineMode.OEM_TESSERACT_CUBE_COMBINED);
            /// <summary>
            /// Runs the shared Tesseract engine over the given image and returns the text it read.
            /// </summary>
            /// <param name="img">Image to recognise; may be null.</param>
            /// <returns>
            /// The recognised text, or an empty string when <paramref name="img"/> is null or
            /// recognition throws (an empty string denotes a failed OCR call).
            /// </returns>
            public static string ORC_(Bitmap img)
            {
                if (img == null)
                {
                    return "";
                }

                try
                {
                    // Both Emgu images are disposed here; previously the colour image was
                    // never disposed. The per-character boxes that used to be drawn on the
                    // colour image were dropped: that image was discarded right afterwards.
                    using (Image<Bgr, byte> image = new Image<Bgr, byte>(img))
                    using (Image<Gray, byte> gray = image.Convert<Gray, byte>())
                    {
                        _ocr.Recognize(gray);
                        return _ocr.GetText();
                    }
                }
                catch (Exception ex)
                {
                    // Stay best-effort (callers expect "" on failure) but leave a trace.
                    Console.Error.WriteLine("OCR failed: " + ex.Message);
                    return "";
                }
            }
    

    Selenium 内置的截图方法只能截取浏览器当前可见区域的内容,因此这里采用了一种全屏截图的方式:用内置截图配合滚动条控制,逐屏截图后再把图片拼接起来。

    
     /// <summary>
     /// Captures the whole page by scrolling through it one viewport-sized tile at a time,
     /// taking a screenshot per tile and stitching the tiles into a single bitmap.
     /// </summary>
     /// <param name="_driver">
     /// Driver of the page to capture; it is cast to <c>IJavaScriptExecutor</c> and
     /// <c>ITakesScreenshot</c>, so it must implement both.
     /// </param>
     /// <returns>
     /// The stitched bitmap (the caller owns and should dispose it). On failure the error is
     /// written to the error stream and the result is null if the bitmap had not been
     /// allocated yet, or the partially filled bitmap otherwise.
     /// </returns>
     public static Bitmap GetEntereScreenshot(IWebDriver _driver)
     {
         Bitmap stitchedImage = null;
         try
         {
             IJavaScriptExecutor js = (IJavaScriptExecutor)_driver;
             ITakesScreenshot camera = (ITakesScreenshot)_driver;

             // Convert.ToInt32 accepts whatever numeric type the script result is boxed as,
             // unlike a hard (long) unboxing cast.
             int totalWidth = Convert.ToInt32(js.ExecuteScript("return document.body.offsetWidth"));
             int totalHeight = Convert.ToInt32(js.ExecuteScript("return  document.body.parentNode.scrollHeight"));

             // Size of the viewport
             int viewportWidth = Convert.ToInt32(js.ExecuteScript("return document.body.clientWidth"));
             int viewportHeight = Convert.ToInt32(js.ExecuteScript("return window.innerHeight"));

             // A zero-sized viewport would make the tiling loops spin forever.
             if (viewportWidth <= 0 || viewportHeight <= 0)
             {
                 throw new InvalidOperationException("The browser reported an empty viewport.");
             }

             List<Rectangle> tiles = BuildScreenshotTiles(totalWidth, totalHeight, viewportWidth, viewportHeight);

             stitchedImage = new Bitmap(totalWidth, totalHeight);

             // The tile offsets below are relative to the top-left corner of the page.
             js.ExecuteScript("window.scrollTo(0, 0)");

             using (Graphics g = Graphics.FromImage(stitchedImage))
             {
                 bool isFirstTile = true;
                 Rectangle previous = Rectangle.Empty;
                 foreach (Rectangle tile in tiles)
                 {
                     if (!isFirstTile)
                     {
                         // Scroll by the distance between this tile and the previous one,
                         // then give the page a moment to repaint.
                         int xDiff = tile.Right - previous.Right;
                         int yDiff = tile.Bottom - previous.Bottom;
                         js.ExecuteScript("window.scrollBy(arguments[0], arguments[1])", xDiff, yDiff);
                         System.Threading.Thread.Sleep(200);
                     }

                     var screenshot = camera.GetScreenshot();

                     // GDI+ needs the stream to stay open for as long as the Image created
                     // from it is in use, so both are kept alive until the tile is drawn.
                     using (MemoryStream memStream = new MemoryStream(screenshot.AsByteArray))
                     using (Image screenshotImage = Image.FromStream(memStream))
                     {
                         // Edge tiles are smaller than the viewport; take the bottom/right
                         // part of the screenshot for them (presumably because the page
                         // cannot scroll past its end - TODO confirm on the target site).
                         Rectangle sourceRectangle = new Rectangle(
                             viewportWidth - tile.Width,
                             viewportHeight - tile.Height,
                             tile.Width,
                             tile.Height);

                         g.DrawImage(screenshotImage, tile, sourceRectangle, GraphicsUnit.Pixel);
                     }

                     previous = tile;
                     isFirstTile = false;
                 }
             }
         }
         catch (Exception ex)
         {
             // Best-effort as before (no rethrow), but no longer silent.
             Console.Error.WriteLine("Full-page screenshot failed: " + ex);
         }
         return stitchedImage;
     }

     /// <summary>
     /// Splits an area of totalWidth x totalHeight pixels into viewport-sized rectangles,
     /// row by row from the top-left; tiles on the right and bottom edges are clipped to fit.
     /// </summary>
     private static List<Rectangle> BuildScreenshotTiles(int totalWidth, int totalHeight, int viewportWidth, int viewportHeight)
     {
         List<Rectangle> tiles = new List<Rectangle>();
         for (int top = 0; top < totalHeight; top += viewportHeight)
         {
             int tileHeight = Math.Min(viewportHeight, totalHeight - top);
             for (int left = 0; left < totalWidth; left += viewportWidth)
             {
                 int tileWidth = Math.Min(viewportWidth, totalWidth - left);
                 tiles.Add(new Rectangle(left, top, tileWidth, tileHeight));
             }
         }
         return tiles;
     }
    

    最后,根据传入的元素和全屏截图,按元素在页面中的位置和大小裁剪出价格元素对应的图片

    
     /// <summary>
     /// Crops the area occupied by <paramref name="element"/> (its Location and Size) out of
     /// a full-page screenshot.
     /// </summary>
     /// <param name="element">Element whose on-page rectangle is cut out.</param>
     /// <param name="bitmap">Full-page screenshot to crop from.</param>
     /// <returns>A new image containing the part of the element that lies inside the screenshot.</returns>
     /// <exception cref="ArgumentNullException">An argument is null.</exception>
     /// <exception cref="ArgumentException">The element has no area inside the screenshot.</exception>
     public static Image SnapshotV2(this IWebElement element, Bitmap bitmap)
     {
         if (element == null)
         {
             throw new ArgumentNullException("element");
         }
         if (bitmap == null)
         {
             throw new ArgumentNullException("bitmap");
         }

         // Clip the element's rectangle against the bitmap. Clamping only the size (as
         // before) ignores the element's offset, and Bitmap.Clone throws
         // OutOfMemoryException when the rectangle reaches outside the source bitmap.
         Rectangle crop = Rectangle.Intersect(
             new Rectangle(element.Location, element.Size),
             new Rectangle(0, 0, bitmap.Width, bitmap.Height));

         if (crop.Width <= 0 || crop.Height <= 0)
         {
             throw new ArgumentException("The element has no visible area inside the screenshot.", "element");
         }

         return bitmap.Clone(crop, bitmap.PixelFormat);
     }
            
    

    运行效果如下
    image

  • 相关阅读:
    深入理解memcached
    如何查看你的 memcached 的状态
    转: Linux 技巧:让进程在后台可靠运行的几种方法
    centos 如何用 rsyslog 搭建本地日志服务(续1: omprog模块与php deamon的配合使用)
    转: 解决MSYS2下的中文乱码问题
    解决windows下vim方向键变成 ABCD 的问题
    centos 如何用 rsyslog 搭建本地日志服务
    转:理解 Linux 的硬链接与软链接
    php include include_once require require_once 的区别与联系
    让块级元素水平垂直居中
  • 原文地址:https://www.cnblogs.com/miku/p/4298588.html
Copyright © 2011-2022 走看看