【提出问题】
autohome是个汽车门户,有时论坛里面会有一些比较好看的帖子,比如“一家四口环中国行”,主贴100多页,跟帖4000多页,看起来很爽。
但是,其论坛的JS脚本写得并不好,在一帖图片非常多的情况下,经常有图片显示不了,很是郁闷。
于是有思路想下载帖子出来离线浏览。有人可能会说,现在有很多现成的离线浏览软件呀,不错,但是下载不了这里的图片,因为其图片URL做了个小小的手脚。
【分析问题】
1、URL规律分析
第一帖是 http://club.autohome.com.cn/bbs/thread-o-200042-19582947-1.html
第二帖子 http://club.autohome.com.cn/bbs/thread-o-200042-19582947-2.html
可以发现,第N帖(第N页)是 http://club.autohome.com.cn/bbs/thread-o-200042-19582947-N.html
2、图片分析
查看源文件,其图片的HTML为
<img id="img-0-8" name="lazypic" onload="tz.picLoaded(this)" onerror="tz.picNotFind(this)" style="700px;height:464px" src="http://x.autoimg.cn/club/lazyload.png" src9="http://club1.autoimg.cn/album/userphotos/2013/2/25/500_9bed_79b1f6c8_79b1f6c8.jpg" />
默认的src是一张占位(等待)图片,真实的图片地址放在src9属性里;页面通过onload事件把src替换成src9来显示图片,超时或者出错时则触发onerror事件。
那么我们抓取src9就可以下载图片了
3、图片抓取尝试
比如上面的图片URL为 http://club1.autoimg.cn/album/userphotos/2013/2/25/500_9bed_79b1f6c8_79b1f6c8.jpg ,如果直接下载图片的话,服务器会拒绝,因为你在盗链。
所以最好按HTTP 1.1的方式发起HTTP请求,同时要设置 request.Referer 属性(即请求头里的Referer)。浏览器实际发送的请求可用Fiddler监控到:
GET http://club1.autoimg.cn/album/userphotos/2013/4/3/500_43be_cde0b51b_cde0b51b.jpg HTTP/1.1
Accept: */*
Referer: http://club.autohome.com.cn/bbs/thread-o-200042-19582947-1.html
Accept-Language: zh-CN
User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; KB974487)
Accept-Encoding: gzip, deflate
Connection: Keep-Alive
DNT: 1
Host: club1.autoimg.cn
为了方便图片显示,建议按照原来的路径保存图片文件,比如图片 http://club1.autoimg.cn/album/userphotos/2013/4/3/500_43be_cde0b51b_cde0b51b.jpg,则按照文件夹 club1.autoimg.cn/album/userphotos/2013/4/3/500_43be_cde0b51b_cde0b51b.jpg 来保存。
4、分页链接
为了便于浏览,下载后的分页链接要能正常跳转
<div class="pages fs">
<a href="forum-o-200042-1.html">返回列表</a></div>
<div class="pages" id="x-pages1" maxindex="4927"><span class="cur">1</span><a target="_self" href="thread-o-200042-19582947-2.html">2</a><a target="_self" href="thread-o-200042-19582947-3.html">3</a><a target="_self" href="thread-o-200042-19582947-4.html">4</a><a target="_self" href="thread-o-200042-19582947-5.html">5</a><span>...</span><a target="_self" href="thread-o-200042-19582947-4927.html">4927</a><span class="gopage"><input type="text" value="1" title="输入页码,按回车快速跳转" onkeydown="if(event.keyCode==13){tz.goPage(this)}" /><span class="fs" title="共 4927 页"> / 4927 页</span></span><a target="_self" class="afpage" href="thread-o-200042-19582947-2.html" title="支持键盘 ← → 键翻页">下一页</a></div>
<div class="jfwen">
到第<span><input type="text" value="" class="topinp txtcenter" id="txtGoFloor1" maxlength="7"
title="输入楼层数,按回车快速定位" onkeydown="if(event.keyCode==13){tz.goFloor(null,'txtGoFloor1')}" /></span>楼</div>
发现原来的链接就是html文件的文件名(相对路径),所以只要按原来的文件名保存就可以了。
5、无用代码过滤
将onload和onerror事件去掉,将script的始末标签替换为隐藏的DIV,将地址开头的http://去掉、变成本地相对路径(相当于./),方便本地浏览且不占资源
【解决问题】
1、http
/// <summary>
/// User-Agent header sent when the caller does not supply one.
/// </summary>
private static readonly string DefaultUserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";

/// <summary>
/// Sends an HTTP GET request and returns the server's response.
/// </summary>
/// <param name="url">URL to request. Must resolve to an HTTP(S) request.</param>
/// <param name="timeout">Request timeout in milliseconds; when null the request's default timeout is kept.</param>
/// <param name="userAgent">User-Agent header; when null or empty, <c>DefaultUserAgent</c> is used.</param>
/// <param name="referer">Referer header; when null the header is not set.</param>
/// <param name="cookies">Cookies to send with the request; when null no cookie container is attached.</param>
/// <returns>The response to the request. The caller is responsible for closing it.</returns>
/// <exception cref="ArgumentNullException"><paramref name="url"/> is null or empty.</exception>
/// <exception cref="ArgumentException"><paramref name="url"/> does not produce an HTTP request (e.g. file:// or ftp://).</exception>
public static HttpWebResponse CreateGetHttpResponse(string url, int? timeout, string userAgent, string referer, CookieCollection cookies)
{
    if (string.IsNullOrEmpty(url))
    {
        throw new ArgumentNullException("url");
    }

    // WebRequest.Create returns a scheme-specific request type; anything that is not
    // HTTP(S) would make the "as" cast yield null and fail later with a
    // NullReferenceException, so reject it here with a meaningful error instead.
    HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
    if (request == null)
    {
        throw new ArgumentException("Only http:// and https:// URLs are supported: " + url, "url");
    }

    request.Method = "GET";
    request.UserAgent = string.IsNullOrEmpty(userAgent) ? DefaultUserAgent : userAgent;

    if (timeout.HasValue)
    {
        request.Timeout = timeout.Value;
    }

    if (referer != null)
    {
        request.Referer = referer;
    }

    if (cookies != null)
    {
        request.CookieContainer = new CookieContainer();
        request.CookieContainer.Add(cookies);
    }

    // A direct cast surfaces an unexpected response type immediately rather than
    // handing a silent null back to the caller.
    return (HttpWebResponse) request.GetResponse();
}
2、主要动作按钮
/// <summary>
/// Handles the Start button: downloads every thread page in the selected page range,
/// rewrites each page's HTML for offline viewing, saves it under the chosen directory
/// and queues every lazily-loaded image (the src9 attribute) on _myQue for download.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnStart_Click(object sender, EventArgs e)
{
    btnStart.Enabled = false;
    btnStop.Enabled = true;
    timer1.Enabled = true;

    var dt1 = DateTime.Now;
    var dir = tbSaveDir.Text;
    // The page number is the trailing "-1." of the first page's URL; turn it into a placeholder.
    var baseUrl = tbURL.Text.Replace("-1.", "-#.");
    var totalPage = (int) tbTotalPage.Value;
    var fromPage = (int) tbFromPage.Value;
    var isDefault = radioButton1.Checked;
    var pageEncoding = Encoding.GetEncoding("gb2312");

    // Progress bars
    progressBar1.Maximum = totalPage - fromPage + 1;
    progressBar1.Value = 1;
    progressBar2.Value = 1;

    for (int i = fromPage; i <= totalPage; i++)
    {
        // Advance the page progress bar
        progressBar1.Step = 1;
        progressBar1.PerformStep();

        var url = baseUrl.Replace("#", i.ToString());
        try
        {
            string html;
            // "using" guarantees the response and reader are released even when reading fails.
            using (var response = HttpWebResponseUtility.CreateGetHttpResponse(url, null, null, url, null))
            {
                if (response == null)
                {
                    continue;
                }

                using (var sr = new StreamReader(response.GetResponseStream(), pageEncoding, true))
                {
                    html = sr.ReadToEnd();
                }
            }

            // Collect the real image URLs before the HTML is rewritten below.
            var gc = Regex.Matches(html, "src9=\"http://[a-zA-Z0-9_./]+\"");

            // Rewrite the HTML for offline viewing: promote src9 to src, make URLs
            // relative, neutralise the image event handlers and hide scripts.
            html = html.Replace("src=\"http://x.autoimg.cn/club/lazyload.png\" src9=\"", "src=\"");
            html = html.Replace("http://", "");
            html = html.Replace("onload=", "x1=");
            html = html.Replace("onerror=", "x2=");
            html = html.Replace("<script", "<DIV style=\"display:none\" ");
            html = html.Replace("</script", "</DIV");

            // Make sure the target directory exists before writing into it.
            if (!string.IsNullOrEmpty(dir))
            {
                Directory.CreateDirectory(dir);
            }

            var htmlFile = dir + Path.DirectorySeparatorChar + Path.GetFileName(url.Replace("http://", ""));
            // Overwrite instead of append: appending would add a second copy of the
            // page to an existing file every time the same page is downloaded again.
            using (var sw = new StreamWriter(htmlFile, false, pageEncoding))
            {
                sw.Write(html);
            }
            tbLog.AppendText(htmlFile + " ok");

            var imgNum = 0;
            foreach (Match match in gc)
            {
                // Strip the attribute name and quotes, leaving only the URL.
                var imgurl = match.Value.Replace("\"", "").Replace("src9=", "");
                _myQue.Enqueue(new ParamEntity(dir, imgurl, url, isDefault));
                imgNum++;
            }

            tbLog.AppendText(", " + imgNum + " image(s)" + Environment.NewLine);
            _totalNum += imgNum;
        }
        catch (Exception exception)
        {
            // Report the failure for this page and carry on with the next one.
            MessageBox.Show(exception.Message);
        }
    }

    btnStart.Enabled = true;
    var dt2 = DateTime.Now;
    var timeUse = dt2 - dt1;
    MessageBox.Show(string.Format("页面下载已结束,耗时 {0} 分钟,请等待图片下载结束,结束后打开目录 {1} 查看下载内容。", timeUse.TotalMinutes.ToString("F2"), dir));
}
3、下载图片
/// <summary>
/// Thread entry point: downloads one image and stores it under the save directory,
/// mirroring the image URL's host and path as sub-folders.
/// Failures are not thrown; a message is queued on _logQue instead.
/// </summary>
/// <param name="obj">The <c>ParamEntity</c> describing the download; any other value is ignored.</param>
private void DownloadImage(object obj)
{
    var pe = obj as ParamEntity;
    if (pe == null)
    {
        // Nothing to do; dereferencing null here would throw on a worker thread
        // outside any handler.
        return;
    }

    var tmp = pe.ImgUrl.Replace("http://", "");
    var filename = pe.SaveDir + Path.DirectorySeparatorChar + tmp;

    try
    {
        // Several download threads run at once, so the counter must be updated atomically.
        Interlocked.Increment(ref _runNum);

        // Created inside the try block so that an I/O failure is logged like any other
        // download error. CreateDirectory does nothing when the folder already exists.
        var dir = pe.SaveDir + Path.DirectorySeparatorChar + Path.GetDirectoryName(tmp);
        Directory.CreateDirectory(dir);

        if (pe.IsType1)
        {
            // Plain download without a Referer header.
            using (var wc = new WebClient())
            {
                wc.DownloadFile(pe.ImgUrl, filename);
            }
        }
        else
        {
            // Download with the page URL as Referer.
            using (var imgres = HttpWebResponseUtility.CreateGetHttpResponse(pe.ImgUrl, null, null, pe.PageUrl, null))
            {
                if (imgres != null)
                {
                    using (var reader = imgres.GetResponseStream())
                    // FileMode.Create truncates an existing file; OpenOrCreate would leave
                    // stale trailing bytes when the new image is shorter than the old one.
                    using (var writer = new FileStream(filename, FileMode.Create, FileAccess.Write))
                    {
                        var buff = new byte[512];
                        int c; // number of bytes actually read
                        while ((c = reader.Read(buff, 0, buff.Length)) > 0)
                        {
                            writer.Write(buff, 0, c);
                        }
                    }
                }
            }
        }
    }
    catch (Exception e)
    {
        // _logQue is shared between the download threads and the timer handler that
        // drains it, so access is serialised on the queue itself.
        lock (_logQue)
        {
            _logQue.Enqueue(Path.GetFileName(tmp) + " fail. " + e.Message);
        }
    }
}
4、timer触发器
/// <summary>
/// Timer tick: starts up to five image-download threads from _myQue, advances the
/// image progress bar by the number of downloads started, and flushes pending
/// failure messages from _logQue into the log text box.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void timer1_Tick(object sender, EventArgs e)
{
    const int maxThreadsPerTick = 5;

    progressBar2.Maximum = _totalNum;

    var started = 0;
    while (started < maxThreadsPerTick && _myQue.Count > 0)
    {
        var pe = (ParamEntity) _myQue.Dequeue();
        var thread = new Thread(new ParameterizedThreadStart(DownloadImage));
        thread.Start(pe);
        started++;
    }

    // Advance only by what was actually dequeued; a fixed step per tick would keep
    // moving the bar while the queue is empty. The bar reflects downloads started,
    // not downloads completed.
    progressBar2.Increment(started);

    // Flush every pending message. DownloadImage enqueues from worker threads, so
    // the queue is only touched while holding its lock; the text box is updated
    // outside the lock to keep the critical section short.
    while (true)
    {
        string line;
        lock (_logQue)
        {
            if (_logQue.Count == 0)
            {
                break;
            }
            line = _logQue.Dequeue().ToString();
        }
        tbLog.AppendText(line + Environment.NewLine);
    }

    // No Application.DoEvents() here: pumping messages from inside a Tick handler
    // lets another tick re-enter this method before it has finished.
}
本来不想用timer的,想用一个队列,自己处理完了会再继续处理下一个,结果没写成。
【可能的技术要点】
1、http请求带referer
2、多线程,界面不阻塞(backgroundWorker,我还没改)
3、progressBar
4、Queue
【成品】
【心得】
不求最好,但求心安。新手的看看,大虾的指点。