zoukankan      html  css  js  c++  java
  • C#多线程图片爬虫

    写了个简单的多线程图片爬虫,整理一下。数据已经爬下来了,但图片URL需要自行拼接:首先从Lawyers表中取出RawData字段,RawData中有一个json格式的list字段,只需要其中的pic和XZQH两个字段来拼接图片地址,拼接URL规则如下:

    http://www.xxxxx.cn/imagetype/{model.XZQH.Substring(0,2)}00/lsfw/lsuser/{model.pic.Substring(0,model.pic.LastIndexOf('.'))}/{model.pic.Substring(model.pic.LastIndexOf('.') + 1)}

    得到图片URL之后就好说了,接下来就是常规的下载操作。线程调度的核心思想是四个线程同时工作,每个线程每次领取一批记录;当某个线程处理完成或出错结束后,就将其移除,并启动新线程重复同样的工作。代码如下:

        /// <summary>
        /// Multi-threaded image crawler. Worker tasks repeatedly claim a page of
        /// <see cref="Lawyers"/> rows, build each image URL from the row's raw JSON and
        /// download the image to disk, retrying failed downloads a bounded number of times.
        /// </summary>
        public class Main : HandleProgramBase, IHandleProgram
        {
            public readonly IUnitOfWork _iUnitOfWork;

            public Main(IUnitOfWork iUnitOfWork)
            {
                _iUnitOfWork = iUnitOfWork;
            }

            // Number of worker tasks kept running at the same time.
            private const int WorkerCount = 4;
            // Number of rows each worker claims per page.
            private const int PageSize = 50;
            // Maximum download attempts per image before giving up.
            private const int MaxAttempts = 10;
            // Pause between two download attempts of the same image.
            private const int RetryDelayMs = 1000;
            // Once the log file exceeds this size it is overwritten instead of appended to.
            private const long MaxLogBytes = 5 * 1024 * 1024;

            private readonly List<Task> threadManager = new List<Task>();
            // Guards the paging cursor `start`.
            private static readonly object locker = new object();
            // Guards the console progress output and `downloadNumber`.
            private static readonly object counter = new object();
            private static readonly ConcurrentQueue<int> counterQueue = new ConcurrentQueue<int>();
            private static readonly ReaderWriterLockSlim logWriteLock = new ReaderWriterLockSlim();
            private const int total = 150136;
            private static int start = 1;
            private static int downloadNumber = 0;

            /// <summary>
            /// Runs the crawl: keeps <c>WorkerCount</c> worker tasks alive until every page has
            /// been claimed, waits for the remaining workers and prints the elapsed time.
            /// </summary>
            /// <param name="args">Command-line arguments; not used by this method.</param>
            public override void Entrance(string[] args)
            {
                var watcher = Stopwatch.StartNew();

                while (HasUnclaimedRows())
                {
                    if (threadManager.Count == 0)
                    {
                        for (var i = 0; i < WorkerCount; i++)
                        {
                            threadManager.Add(Task.Factory.StartNew(DownloadImg));
                        }
                        continue;
                    }

                    Task.WaitAny(threadManager.ToArray());
                    foreach (var finished in threadManager.Where(a => a.IsCompleted).ToList())
                    {
                        threadManager.Remove(finished);
                        if (finished.IsFaulted)
                        {
                            // Reading Exception observes the fault, so a crashed worker
                            // leaves a trace instead of disappearing silently.
                            WriteLog("Worker task failed: " + finished.Exception.GetBaseException().Message);
                        }
                        finished.Dispose();
                        threadManager.Add(Task.Factory.StartNew(DownloadImg));
                    }
                }

                Task.WaitAll(threadManager.ToArray());
                watcher.Stop();

                Console.WriteLine();
                Console.WriteLine("Download Completed.Total time: " + watcher.ElapsedMilliseconds + " ms.");
            }

            /// <summary>
            /// Returns true while the paging cursor has not reached <c>total</c>.
            /// Takes the same lock the workers use when they advance the cursor.
            /// </summary>
            private static bool HasUnclaimedRows()
            {
                lock (locker)
                {
                    return start < total;
                }
            }

            /// <summary>
            /// Worker body: claims the next page of rows and downloads the image for each row.
            /// A failure on one row is logged and does not abort the rest of the page.
            /// </summary>
            private void DownloadImg()
            {
                var lawyerList = FetchNextBatch();
                if (lawyerList.Count == 0)
                {
                    return;
                }

                using (var web = new WebClient())
                {
                    foreach (var lawyer in lawyerList)
                    {
                        try
                        {
                            if (TryDownload(web, lawyer))
                            {
                                ReportProgress();
                            }
                        }
                        catch (Exception e)
                        {
                            // Malformed JSON, a missing pic/XZQH value or an invalid path lands here.
                            LogFailure(lawyer, e);
                        }
                    }
                }
            }

            /// <summary>
            /// Claims the next page of rows under the cursor lock and advances the cursor.
            /// Returns an empty list once the cursor has reached <c>total</c>.
            /// </summary>
            private List<Lawyers> FetchNextBatch()
            {
                lock (locker)
                {
                    if (start >= total)
                    {
                        return new List<Lawyers>();
                    }

                    var end = Math.Min(start + PageSize, total);
                    // NOTE(review): whether GetPagedLawyer treats start/end as inclusive is not
                    // visible here; if both ends are inclusive, adjacent pages overlap by one row.
                    var batch = _iUnitOfWork.Implement<Lawyers>(string.Format(Resource.GetPagedLawyer, start, end)).ToList();
                    start = end;
                    return batch;
                }
            }

            /// <summary>
            /// Downloads the image of one row, retrying up to <c>MaxAttempts</c> times.
            /// </summary>
            /// <param name="web">Client used for the download.</param>
            /// <param name="lawyer">Row whose image should be fetched.</param>
            /// <returns>
            /// True when the file already exists or was downloaded; false when the download
            /// was abandoned (the failure has then already been logged).
            /// </returns>
            private static bool TryDownload(WebClient web, Lawyers lawyer)
            {
                var model = JsonConvert.DeserializeObject<RawData>(lawyer.RawData).list;
                var imgUrl =
                    $"http://www.xxxxx.cn/imagetype/{model.XZQH.Substring(0, 2)}00/lsfw/lsuser/{model.pic.Substring(0, model.pic.LastIndexOf('.'))}/{model.pic.Substring(model.pic.LastIndexOf('.') + 1)}";
                // NOTE(review): this literal looks like it lost its backslashes when the code was
                // published (probably a folder under D:\); confirm the intended root directory.
                var savePath = $@"D:ImageTestMulti{lawyer.LawFrimKey}";
                // Image names such as 2017-04-19/B748FA5EF1517886AF76A11CDACE5378.png carry a sub-folder.
                var fileName = savePath + "\\" + lawyer.ImageName.Replace("/", "\\");
                // Deriving the folder from the final file path keeps it in step with the file
                // for any nesting depth.
                var folder = Path.GetDirectoryName(fileName);

                for (var attempt = 1; attempt <= MaxAttempts; attempt++)
                {
                    try
                    {
                        if (!string.IsNullOrEmpty(folder))
                        {
                            Directory.CreateDirectory(folder);
                        }
                        if (!File.Exists(fileName))
                        {
                            web.DownloadFile(imgUrl, fileName);
                        }
                        return true;
                    }
                    catch (Exception e)
                    {
                        // A 404 will not succeed on retry, so give up immediately.
                        if (attempt == MaxAttempts || IsNotFound(e))
                        {
                            LogFailure(lawyer, e);
                            return false;
                        }
                        Thread.Sleep(RetryDelayMs);
                    }
                }

                return false;
            }

            /// <summary>
            /// Returns true when <paramref name="e"/> is a <see cref="WebException"/> whose
            /// response is an HTTP 404. Any other exception type yields false instead of
            /// throwing an invalid-cast error.
            /// </summary>
            private static bool IsNotFound(Exception e)
            {
                var webError = e as WebException;
                if (webError == null)
                {
                    return false;
                }

                var response = webError.Response as HttpWebResponse;
                return response != null && response.StatusCode == HttpStatusCode.NotFound;
            }

            /// <summary>
            /// Records one finished image and rewrites the progress line on the console.
            /// </summary>
            private static void ReportProgress()
            {
                counterQueue.Enqueue(1);

                // Show the download count on the console.
                lock (counter)
                {
                    // presumably Resource.Space blanks the current line before it is rewritten — TODO confirm
                    Console.WriteLine(Resource.Space);
                    Console.SetCursorPosition(0, Console.CursorTop - 1);
                    Console.Write(Resource.DownloadNumber, ++downloadNumber, counterQueue.Count);
                }
            }

            /// <summary>
            /// Writes the standard failure line for a row to the error log.
            /// </summary>
            private static void LogFailure(Lawyers lawyer, Exception e)
            {
                WriteLog($"{lawyer.ImageName}爬取失败! 错误:{e.Message}当前Id:{lawyer.Id}。");
            }

            /// <summary>
            /// Appends a timestamped line to ErrorLog.txt next to the running executable.
            /// When the file has grown beyond <c>MaxLogBytes</c> it is overwritten instead.
            /// Failures while writing are printed to the console and not rethrown.
            /// </summary>
            /// <param name="errMsg">Message to record.</param>
            public static void WriteLog(string errMsg)
            {
                DateTime dt = DateTime.Now;

                logWriteLock.EnterWriteLock();
                try
                {
                    string exePath = System.Diagnostics.Process.GetCurrentProcess().MainModule.FileName;
                    string filePathName = Path.Combine(Path.GetDirectoryName(exePath) ?? string.Empty, "ErrorLog.txt");

                    bool append = !File.Exists(filePathName) || new FileInfo(filePathName).Length <= MaxLogBytes;
                    // The writer is closed before the lock is released, so the next writer
                    // never finds the file still open.
                    using (var sw = new StreamWriter(filePathName, append))
                    {
                        sw.WriteLine(dt.ToShortDateString() + "  " + dt.ToShortTimeString() + "  " + errMsg);
                    }
                }
                catch (Exception ex)
                {
                    Console.WriteLine(ex.Message);
                }
                finally
                {
                    logWriteLock.ExitWriteLock();
                }
            }

            public override string Helper { get; protected set; }
        }
    
    

    几个实体类:

        /// <summary>
        /// One row read from the Lawyers table; the crawler uses it to locate and store an image.
        /// </summary>
        public class Lawyers
        {
            /// <summary>Row identifier; written to the error log when a download fails.</summary>
            public int Id { get; set; }
            /// <summary>Key appended to the save path to form the per-row target directory.</summary>
            public string LawFrimKey { get; set; }
            /// <summary>JSON text that the crawler deserializes into <see cref="RawData"/>.</summary>
            public string RawData { get; set; }
            /// <summary>
            /// File name of the image, optionally prefixed by a folder separated with '/'
            /// (e.g. 2017-04-19/B748FA5EF1517886AF76A11CDACE5378.png).
            /// </summary>
            public string ImageName { get; set; }
        }
    
        /// <summary>
        /// Deserialization target for the JSON stored in <see cref="Lawyers.RawData"/>.
        /// </summary>
        public class RawData
        {
            /// <summary>The JSON "list" member, holding the fields needed to build the image URL.</summary>
            public Lawyer list { get; set; }
        }
    
        /// <summary>
        /// The two values from the raw JSON that are combined into the image URL.
        /// </summary>
        public class Lawyer
        {
            /// <summary>
            /// Picture name; the part before the last '.' and the part after it become
            /// separate URL segments.
            /// </summary>
            public string pic { get; set; }
            /// <summary>
            /// Code whose first two characters, followed by "00", form a URL segment.
            /// NOTE(review): presumably an administrative-division code — confirm.
            /// </summary>
            public string XZQH { get; set; }
        }
    
    
  • 相关阅读:
    天梯赛练习 L3-011 直捣黄龙 (30分) dijkstra + dfs
    PAT甲级练习 1087 All Roads Lead to Rome (30分) 字符串hash + dijkstra
    天梯赛练习 L3-010 是否完全二叉搜索树 (30分) 数组建树模拟
    天梯赛练习 L3-008 喊山 (30分) bfs搜索
    天梯赛练习 L3-007 天梯地图 (30分) Dijkstra
    1018 Public Bike Management (30分) PAT甲级真题 dijkstra + dfs
    PAT天梯赛练习 L3-004 肿瘤诊断 (30分) 三维BFS
    课堂实验(计算1!+2!+...+100!)
    39页作业第7题
    39页作业(还款年限—月还款额表)
  • 原文地址:https://www.cnblogs.com/ligykq/p/10315480.html
Copyright © 2011-2022 走看看