zoukankan      html  css  js  c++  java
  • C#多线程图片爬虫

    写了个简单的多线程图片爬虫,整理一下。数据已经爬下来了,但图片URL需要自行拼接:先从Lawyers表中取出RawData字段,RawData中的list字段是JSON格式的数据,只需要其中的pic和XZQH两个字段来拼接图片地址。拼接URL的规则如下:

    http://www.xxxxx.cn/imagetype/{model.XZQH.Substring(0,2)}00/lsfw/lsuser/{model.pic.Substring(0,model.pic.LastIndexOf('.'))}/{model.pic.Substring(model.pic.LastIndexOf('.') + 1)}

    得到图片URL之后就好说了,接下来就是常规的download操作。线程调度的核心思想是始终保持四个任务并发工作:当某个任务下载完成或失败后,就将其移除,并启动一个新任务重复同样的工作。代码如下:

        /// <summary>
        /// Downloads lawyer images in pages of <see cref="PageSize"/> rows using up to
        /// <see cref="WorkerCount"/> concurrent tasks. Each task fetches one page of
        /// <see cref="Lawyers"/> rows, builds the image URL from the row's raw JSON and
        /// saves the file to disk; finished tasks are replaced until every page is claimed.
        /// </summary>
        public class Main : HandleProgramBase, IHandleProgram
        {
            public readonly IUnitOfWork _iUnitOfWork;

            public Main(IUnitOfWork iUnitOfWork)
            {
                _iUnitOfWork = iUnitOfWork;
            }

            // Number of download tasks kept running at the same time.
            private const int WorkerCount = 4;
            // Number of rows each task claims per query.
            private const int PageSize = 50;
            // Maximum download attempts per image before the failure is logged.
            private const int MaxAttempts = 10;

            private readonly List<Task> threadManager = new List<Task>();
            // Guards the paging cursor `start`.
            private static readonly object locker = new object();
            // Guards the console progress line and `downloadNumber`.
            private static readonly object counter = new object();
            private static readonly ConcurrentQueue<int> counterQueue = new ConcurrentQueue<int>();
            private static readonly ReaderWriterLockSlim logWriteLock = new ReaderWriterLockSlim();
            private const int total = 150136;
            private static int start = 1;
            private static int downloadNumber = 0;

            /// <summary>
            /// Runs the crawl: keeps the worker pool full until every page has been
            /// claimed, waits for the remaining workers and prints the elapsed time.
            /// </summary>
            /// <param name="args">Command-line arguments; not used by this method.</param>
            public override void Entrance(string[] args)
            {
                var watcher = Stopwatch.StartNew();

                while (HasPendingPages())
                {
                    if (threadManager.Any())
                    {
                        Task.WaitAny(threadManager.ToArray());
                        ReplaceFinishedTasks();
                    }
                    else
                    {
                        for (var i = 0; i < WorkerCount; i++)
                        {
                            threadManager.Add(Task.Factory.StartNew(DownloadImg));
                        }
                    }
                }

                Task.WaitAll(threadManager.ToArray());
                watcher.Stop();

                Console.WriteLine();
                Console.WriteLine("Download Completed.Total time: " + watcher.ElapsedMilliseconds + " ms.");
            }

            /// <summary>
            /// Reads the paging cursor under the same lock the workers use to advance it,
            /// so the scheduling loop never acts on a stale value.
            /// </summary>
            private static bool HasPendingPages()
            {
                lock (locker)
                {
                    return start < total;
                }
            }

            /// <summary>
            /// Removes every completed task from the pool, logs the error of faulted
            /// ones, disposes them and starts one replacement worker per removed task.
            /// </summary>
            private void ReplaceFinishedTasks()
            {
                foreach (var finished in threadManager.Where(t => t.IsCompleted).ToList())
                {
                    if (finished.IsFaulted)
                    {
                        // Reading Exception marks the fault as observed instead of dropping it silently.
                        WriteLog($"Worker task failed: {finished.Exception.GetBaseException().Message}");
                    }

                    threadManager.Remove(finished);
                    finished.Dispose();
                    threadManager.Add(Task.Factory.StartNew(DownloadImg));
                }
            }

            /// <summary>
            /// Worker body: claims the next page of rows and downloads the image of each row.
            /// Returns immediately when there is nothing left to claim or the page is empty.
            /// </summary>
            private void DownloadImg()
            {
                var lawyerList = FetchNextPage();
                if (!lawyerList.Any()) return;

                using (var web = new WebClient())
                {
                    foreach (var lawyer in lawyerList)
                    {
                        DownloadLawyerImage(web, lawyer);
                    }
                }
            }

            /// <summary>
            /// Claims the next [start, end] window and loads its rows. The cursor is only
            /// advanced after the query succeeded, so a failed query leaves the page unclaimed.
            /// </summary>
            /// <returns>The rows of the claimed page, or an empty list when all pages are claimed.</returns>
            private List<Lawyers> FetchNextPage()
            {
                lock (locker)
                {
                    if (start >= total) return new List<Lawyers>();

                    var end = Math.Min(start + PageSize, total);
                    var page = _iUnitOfWork.Implement<Lawyers>(string.Format(Resource.GetPagedLawyer, start, end)).ToList();
                    start = end;
                    return page;
                }
            }

            /// <summary>
            /// Builds the image URL and target path for one row and downloads the file,
            /// retrying up to <see cref="MaxAttempts"/> times. Failures are logged and never
            /// propagate, so one bad row cannot abort the rest of the page.
            /// </summary>
            /// <param name="web">Client reused for every download of the current worker.</param>
            /// <param name="lawyer">Row whose image should be downloaded.</param>
            private static void DownloadLawyerImage(WebClient web, Lawyers lawyer)
            {
                string imgUrl;
                string fileName;
                try
                {
                    var model = JsonConvert.DeserializeObject<RawData>(lawyer.RawData).list;
                    var dot = model.pic.LastIndexOf('.');
                    imgUrl =
                        $"http://www.xxxxx.cn/imagetype/{model.XZQH.Substring(0, 2)}00/lsfw/lsuser/{model.pic.Substring(0, dot)}/{model.pic.Substring(dot + 1)}";

                    // NOTE(review): this literal contains no path separators, which makes it a
                    // drive-relative path on D: — separators may have been lost when the snippet
                    // was published; confirm the intended directory.
                    var savePath = $@"D:ImageTestMulti{lawyer.LawFrimKey}";

                    // Handles names such as 2017-04-19/B748FA5EF1517886AF76A11CDACE5378.png:
                    // the directory is derived from the final file path so the two always agree,
                    // even when the name contains several '/' segments.
                    fileName = savePath + "\\" + lawyer.ImageName.Replace("/", "\\");
                    Directory.CreateDirectory(Path.GetDirectoryName(fileName));
                }
                catch (Exception e)
                {
                    // Malformed JSON, missing fields or an invalid path: skip this row only.
                    LogFailure(lawyer, e);
                    return;
                }

                for (var attempt = 1; attempt <= MaxAttempts; attempt++)
                {
                    try
                    {
                        if (!File.Exists(fileName)) web.DownloadFile(imgUrl, fileName);

                        counterQueue.Enqueue(1);

                        // Show the running download count on a single console line.
                        lock (counter)
                        {
                            Console.WriteLine(Resource.Space);
                            Console.SetCursorPosition(0, Console.CursorTop - 1);
                            Console.Write(Resource.DownloadNumber, ++downloadNumber, counterQueue.Count);
                        }
                        return;
                    }
                    catch (Exception e)
                    {
                        // Only a WebException carries an HTTP response; any other exception type
                        // must not be cast, otherwise the handler itself would throw.
                        var response = (e as WebException)?.Response as HttpWebResponse;
                        var notFound = response?.StatusCode == HttpStatusCode.NotFound;
                        if (attempt == MaxAttempts || notFound)
                        {
                            LogFailure(lawyer, e);
                            return;
                        }

                        // Back off before the next attempt.
                        Thread.Sleep(1000);
                    }
                }
            }

            /// <summary>
            /// Writes the standard "download failed" entry for a row to the error log.
            /// </summary>
            private static void LogFailure(Lawyers lawyer, Exception e)
            {
                WriteLog($"{lawyer.ImageName}爬取失败! 错误:{e.Message}当前Id:{lawyer.Id}。");
            }

            /// <summary>
            /// Appends a timestamped line to ErrorLog.txt next to the running executable.
            /// The file is overwritten instead of appended once it exceeds 5 MB. Writers are
            /// serialized by a write lock, and I/O errors are reported on the console.
            /// </summary>
            /// <param name="errMsg">Message text to log.</param>
            public static void WriteLog(string errMsg)
            {
                DateTime dt = DateTime.Now;
                string filePathName;
                using (var process = System.Diagnostics.Process.GetCurrentProcess())
                {
                    var exePath = process.MainModule.FileName;
                    filePathName = Path.Combine(Path.GetDirectoryName(exePath) ?? string.Empty, "ErrorLog.txt");
                }

                try
                {
                    logWriteLock.EnterWriteLock();

                    var overLimit = File.Exists(filePathName) && new FileInfo(filePathName).Length > 5 * 1024 * 1024;

                    // The writer is disposed (and therefore flushed) while the lock is still held.
                    using (var sw = new StreamWriter(filePathName, !overLimit))
                    {
                        sw.WriteLine(dt.ToShortDateString() + "  " + dt.ToShortTimeString() + "  " + errMsg);
                    }
                }
                catch (Exception ex)
                {
                    Console.WriteLine(ex.Message);
                }
                finally
                {
                    if (logWriteLock.IsWriteLockHeld)
                        logWriteLock.ExitWriteLock();
                }
            }

            public override string Helper { get; protected set; }
        }
    
    

    几个实体类:

        /// <summary>
        /// One row loaded by the paged lawyer query; carries the raw JSON payload and the
        /// values used to decide where the downloaded image is saved.
        /// </summary>
        public class Lawyers
        {
            // Row identifier; written to the error log when a download fails.
            public int Id { get; set; }
            // Appended to the save-directory path, so each key gets its own folder.
            // NOTE(review): spelling "Frim" presumably mirrors the column name — confirm before renaming.
            public string LawFrimKey { get; set; }
            // JSON text that is deserialized into RawData to obtain the image URL parts.
            public string RawData { get; set; }
            // Relative file name of the image; may contain '/' (e.g. "2017-04-19/xxx.png").
            public string ImageName { get; set; }
        }
    
        /// <summary>
        /// Deserialization target for the JSON stored in <c>Lawyers.RawData</c>; only the
        /// <c>list</c> member is mapped.
        /// </summary>
        public class RawData
        {
            // Lower-case name matches the JSON key "list".
            public Lawyer list { get; set; }
        }
    
        /// <summary>
        /// The two fields of the JSON <c>list</c> object that are needed to build the image URL.
        /// </summary>
        public class Lawyer
        {
            // Image file name; split at its last '.' into the name part and the extension part of the URL.
            public string pic { get; set; }
            // Code whose first two characters, followed by "00", form a URL path segment.
            // NOTE(review): presumably an administrative-division code — confirm.
            public string XZQH { get; set; }
        }
    
    
  • 相关阅读:
    Spring spEL
    Spring 使用外部部署文件
    Spring 自动装配
    spring 属性配置细节
    hdu 1054 Strategic Game
    fzu 2037 Maximum Value Problem
    将博客搬至CSDN
    HDU 4714 Tree2Cycle
    HDU 1009 The Shortest Path in Nya Graph
    POJ 1942 Paths on a Grid 组合数的优化
  • 原文地址:https://www.cnblogs.com/ligykq/p/10315480.html
Copyright © 2011-2022 走看看