zoukankan      html  css  js  c++  java
  • 用C#编写爬虫在marinetraffic下载船只图片

    近期在做船只识别方面的事情,需要大量的正样本来训练AdaBoost分类器。

    于是到marinetraffic这个站点上下载船只图片,写个爬虫来自动下载显然非常方便。

    站点特点

    在介绍爬虫之前首先了解一下marinetraffic这个站点的一些特点:
    1. 会定期检测爬虫行为。如果认为有爬虫在大量下载图片,会把该连接加入黑名单,之后几天都没办法下载。
    2. 船只图片资源差异大。有的船只有1000多张图,有的船只一张图也没有,而我们需要的是很多船只的很多张图,所以需要对要下载的船只按优先级排序。
    3. 用来训练分类器的正样本要求检测对象的分辨率一样。而从marinetraffic站点下载图片时可以设置所下载图片的宽度,站点根据长宽比生成相应的高度。所以不同图片的高度不一样,需要自己后期处理。

    船只图片

    解决方式

    1. 针对爬虫检测:每次请求之间设置一个随机等待时间,10s左右(下面的代码中为10~19秒),可以绕过站点的爬虫行为检测。
    2. 对船只按照图片数量排序,先下载图片数量多的,而且每条船只不用下载太多,以保证图片的差异性。比如下面的代码对每条船只抓取图片列表的第一页(per_page:100,即最多100张)。
    3. 在下载的时候使用统一的宽度。

      后期处理时再从图片中抠出分辨率一样的船只。

    爬虫源代码

    using System;
    using System.Collections.Generic;
    using System.Globalization;
    using System.IO;
    using System.Linq;
    using System.Net;
    using System.Runtime.Serialization.Formatters.Binary;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Threading;
    using System.Threading.Tasks;
    
    namespace 船仅仅图像爬虫
    {
        /// <summary>
        /// Console crawler for marinetraffic.com. It first collects ship ids from the ship
        /// listing pages (sorted by photo count, descending) and then downloads the photos
        /// listed on the first photo page of every collected ship.
        /// </summary>
        class Program
        {
            /// <summary>User-Agent value sent with every request.</summary>
            private const string UserAgent = "blah";

            // NOTE(review): both paths below look like they lost their '\' separators somewhere
            // (e.g. when the code was published); they are kept byte-for-byte. Confirm the
            // intended directories before running.

            /// <summary>File that receives the serialized list of ship ids.</summary>
            private const string ShipIdListFile = @"C:UsersdragonfiveDesktop爬虫获得船仅仅图片第三批_100page_shipid.txt";

            /// <summary>Prefix of every saved image path; "{ship_id}_{index}.jpg" is appended.</summary>
            private const string ImagePathPrefix = @"C:UsersdragonfiveDesktop爬虫获得船仅仅图片第三批";

            /// <summary>Single shared generator for the randomized delays between requests.</summary>
            private static readonly Random DelayRandom = new Random();

            /// <summary>
            /// Downloads <paramref name="url"/> with the crawler's credentials and User-Agent.
            /// </summary>
            /// <param name="client">Client used to perform the request.</param>
            /// <param name="url">Address to download.</param>
            /// <returns>The raw response body.</returns>
            private static byte[] Fetch(WebClient client, string url)
            {
                // The original code re-applied both settings before every single request, so this
                // helper does the same. NOTE(review): presumably needed because WebClient does not
                // keep the User-Agent header across requests on .NET Framework - confirm.
                client.Credentials = CredentialCache.DefaultCredentials;
                client.Headers["User-Agent"] = UserAgent;
                return client.DownloadData(url);
            }

            /// <summary>
            /// Prints <paramref name="message"/> and sleeps for a random 10 to 19 seconds so that
            /// consecutive requests are spaced out.
            /// </summary>
            /// <param name="message">Progress text written (without newline) before sleeping.</param>
            private static void PauseBetweenRequests(string message)
            {
                Console.Write(message);
                int seconds = DelayRandom.Next() % 10 + 10;
                Thread.Sleep(seconds * 1000);
                Console.WriteLine();
            }

            /// <summary>
            /// Returns, in document order, every non-empty substring of <paramref name="text"/> that
            /// directly follows an occurrence of <paramref name="startMarker"/> and runs up to
            /// (not including) the next <paramref name="endMarker"/>.
            /// </summary>
            /// <param name="text">Text to scan.</param>
            /// <param name="startMarker">Literal that precedes each value.</param>
            /// <param name="endMarker">Literal that terminates each value.</param>
            /// <returns>The extracted values; empty when there is no complete match.</returns>
            private static List<string> ExtractValues(string text, string startMarker, string endMarker)
            {
                List<string> values = new List<string>();
                int markerIndex = text.IndexOf(startMarker, StringComparison.Ordinal);
                while (markerIndex != -1)
                {
                    int valueStart = markerIndex + startMarker.Length;
                    int valueEnd = text.IndexOf(endMarker, valueStart, StringComparison.Ordinal);
                    if (valueEnd == -1)
                    {
                        // No terminator after this marker means none after any later marker either.
                        // Stopping here avoids calling Substring with a negative length.
                        break;
                    }

                    if (valueEnd > valueStart)
                    {
                        values.Add(text.Substring(valueStart, valueEnd - valueStart));
                    }

                    markerIndex = text.IndexOf(startMarker, markerIndex + 1, StringComparison.Ordinal);
                }

                return values;
            }

            /// <summary>
            /// Walks the ship listing pages, appends every ship id not yet present to
            /// <paramref name="shipid_list"/>, and finally serializes the list to
            /// <see cref="ShipIdListFile"/>.
            /// </summary>
            /// <param name="shipid_list">List that receives the ids; existing entries are kept.</param>
            /// <exception cref="ArgumentNullException"><paramref name="shipid_list"/> is null.</exception>
            static void download_all_shipid(List<string> shipid_list)
            {
                if (shipid_list == null)
                {
                    throw new ArgumentNullException("shipid_list");
                }

                // Set used only for fast duplicate detection; the list keeps the discovery order.
                HashSet<string> knownIds = new HashSet<string>(shipid_list);

                using (WebClient client = new WebClient())
                {
                    // Listing URL pattern:
                    // http://www.marinetraffic.com/ais/index/ships/all/page:2/sort:COUNT_PHOTOS/direction:desc
                    // NOTE(review): this visits pages 1..99 although the output file name says
                    // "100page" - confirm whether page 100 was meant to be included.
                    for (int pageNum = 1; pageNum < 100; pageNum++)
                    {
                        Console.WriteLine("開始分析第" + pageNum + "张网页");
                        try
                        {
                            byte[] pageData = Fetch(client, @"http://www.marinetraffic.com/en/ais/index/ships/all/page:" + pageNum + "/sort:COUNT_PHOTOS/direction:desc/per_page:50");

                            // The page is decoded as UTF-8.
                            string pageHtml = Encoding.UTF8.GetString(pageData);

                            // Ship ids appear in links of the form ".../shipid:281519/".
                            foreach (string shipid in ExtractValues(pageHtml, "shipid:", "/"))
                            {
                                if (knownIds.Add(shipid))
                                {
                                    Console.WriteLine("新增id:" + shipid);
                                    shipid_list.Add(shipid);
                                }
                            }

                            Console.WriteLine("完毕第" + pageNum + "页分析");
                        }
                        catch (WebException webEx)
                        {
                            // A failed page is reported and skipped; the crawl continues.
                            Console.WriteLine(webEx.Message.ToString());
                        }

                        // Random wait before the next page to stay below the site's crawler detection.
                        PauseBetweenRequests("绕开站点爬虫行为检測中......");
                    }
                }

                Console.WriteLine("分析结束");

                // Persist the list. BinaryFormatter is kept so the file stays readable by the
                // commented-out loader in Main; it is obsolete in current .NET and should be
                // replaced (for example by a plain text file) if this tool is ever ported.
                try
                {
                    // FileMode.Create truncates an existing file, so an older, longer file cannot
                    // leave stale bytes behind the newly written data.
                    using (FileStream fsWriter = new FileStream(ShipIdListFile, FileMode.Create, FileAccess.Write))
                    {
                        BinaryFormatter bf = new BinaryFormatter();
                        bf.Serialize(fsWriter, shipid_list);
                    }
                }
                catch (IOException ioEx)
                {
                    // The ids are still available to the caller in memory, so a failed save is
                    // reported instead of aborting the whole run.
                    Console.WriteLine(ioEx.Message);
                }
            }

            /// <summary>
            /// Downloads every photo listed on the first photo page (up to 100 per page) of the
            /// given ship and stores each one as "{ship_id}_{index}.jpg" behind
            /// <see cref="ImagePathPrefix"/>.
            /// </summary>
            /// <param name="ship_id">Ship id as it appears in marinetraffic URLs.</param>
            static void download_jpg(string ship_id)
            {
                try
                {
                    Console.WriteLine("開始下载shipid为:" + ship_id + "的图片");
                    using (WebClient client = new WebClient())
                    {
                        // Photo page URL pattern:
                        // http://www.marinetraffic.com/en/photos/of/ships/shipid:371668/per_page:1000/page:1
                        byte[] pageData = Fetch(client, @"http://www.marinetraffic.com/en/photos/of/ships/shipid:" + ship_id + @"/per_page:100/page:1");

                        // The page is decoded as UTF-8.
                        string pageHtml = Encoding.UTF8.GetString(pageData);
                        Console.WriteLine("元网页已下载");

                        int imageIndex = 0;       // position of the photo on the page; used in the file name
                        int downloadedCount = 0;  // photos that were actually saved

                        // Photo URLs are read from the data-original='...' attributes.
                        foreach (string url in ExtractValues(pageHtml, "data-original='", "'"))
                        {
                            // Random wait before each photo to stay below the site's crawler detection.
                            PauseBetweenRequests("绕过站点爬虫行为检測中......");
                            try
                            {
                                Console.WriteLine(url);
                                byte[] jpgdata = Fetch(client, url);

                                // FileMode.Create truncates an existing file so a shorter image never
                                // keeps trailing bytes of an older, longer one.
                                using (FileStream fs = new FileStream(ImagePathPrefix + ship_id + "_" + imageIndex + ".jpg", FileMode.Create, FileAccess.Write))
                                {
                                    fs.Write(jpgdata, 0, jpgdata.Length);
                                }

                                // Success is reported and counted only after the file was written.
                                downloadedCount++;
                                Console.WriteLine("成功下载第" + imageIndex + "张图片");
                            }
                            catch (WebException webEx)
                            {
                                Console.WriteLine("被捕获了吗?");
                                Console.WriteLine(webEx.Message.ToString());
                            }
                            catch (IOException ioEx)
                            {
                                // A photo that cannot be written is skipped; the remaining ones are still tried.
                                Console.WriteLine(ioEx.Message);
                            }

                            imageIndex++;
                        }

                        Console.WriteLine("*****************************************");
                        Console.WriteLine("下载" + downloadedCount + "张ship_id为" + ship_id + "的图片");
                        Console.WriteLine("*****************************************");
                    }
                }
                catch (WebException webEx)
                {
                    // The photo index page itself could not be fetched; nothing to download for this ship.
                    Console.WriteLine(webEx.Message.ToString());
                }
            }

            /// <summary>
            /// Entry point: collects the ship ids, downloads the photos of every collected ship,
            /// then waits for Enter so the console window stays open.
            /// </summary>
            /// <param name="args">Unused.</param>
            static void Main(string[] args)
            {
                List<string> shipid_list = new List<string>();
                //shipid_list.Add("371681");// use this to get some pictures quickly
                download_all_shipid(shipid_list);

                // Alternative start: reload a previously saved id list instead of crawling again,
                // and drop ids that were already processed.
                //using (FileStream fsReader = new FileStream(ShipIdListFile, FileMode.Open, FileAccess.Read))
                //{
                //    // deserialize the saved list
                //    BinaryFormatter bf = new BinaryFormatter();
                //    shipid_list = (List<string>)bf.Deserialize(fsReader);
                //    Console.WriteLine("成功加载" + shipid_list.Count + "个shipid");
                //}
                ////371652 371668  371681 1252401
                //shipid_list.Remove("371652");
                //shipid_list.Remove("371668");
                //shipid_list.Remove("371681");
                //shipid_list.Remove("1252401");
                ////132264
                //shipid_list.Remove("371077");
                //shipid_list.Remove("132264");
                //shipid_list.Remove("224871");
                //shipid_list.Remove("279923");
                //shipid_list.Remove("369163");
                //shipid_list.Remove("266342");
                //shipid_list.Remove("371216");
                //shipid_list.Remove("368174");
                //shipid_list.Remove("369163");

                foreach (var ship_id in shipid_list)
                {
                    download_jpg(ship_id);
                }

                Console.ReadLine(); // keep the console window open until Enter is pressed
            }
        }
    }
    
    
  • 相关阅读:
    第十六天
    第十五天
    STM8L段式液晶驱动器
    STM8L的LCD接口详解及驱动程序
    作为合格的工程师,这些电路图一辈子都得记住!
    双向晶闸管触发电路工作原理图
    3~15伏10A大电流可调稳压电源
    用TL431制作简单充电器电路
    5V USB充电器电路图
    555
  • 原文地址:https://www.cnblogs.com/yutingliuyl/p/6941828.html
Copyright © 2011-2022 走看看