"""Save pages from zone.wooyun.org to disk, replaying a captured login cookie.

The cookie was collected from a manual browser login and is attached to every
outgoing HTTP request so the site treats the script as that logged-in user.
"""
try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 exposes the same handler/opener API under urllib.request.
    import urllib.request as urllib2

import os
from contextlib import closing

# NOTE(security): this is a captured session cookie (session id plus encoded
# uid/pwd values) committed in source. Anyone who can read this file can reuse
# the session; consider loading it from the environment or a private file.
SIMPLE_COOKIE = 'PHPSESSID=fcf3c469c3c80f3e8480aca76c87faa;ocKey=c9821225458886fa8329cccc283e60e;wy_uid=b52fOIbsG%2BB6kyNmyU9esuL%2FRb8GTatlM4n5ghw7dP4;wy_pwd=f770PcQrsQ2YyEUO07ouNVJxMxXvONgOQHcoXQ%2Bm4xJC22oosvWGDL6RJU09fIwTNkO9JTZ9yQMWXiszw'


class SimpleCookieHandler(urllib2.BaseHandler):
    """urllib handler that adds the captured cookie to each HTTP request."""

    def http_request(self, req):
        """Attach SIMPLE_COOKIE to *req* and return the same request object.

        If the request already has a Cookie header, the captured cookie is
        placed in front of the existing value, separated by '; '. The header
        is added as "unredirected", so it is not carried over to requests
        created for redirects.
        """
        if req.has_header('Cookie'):
            cookie_value = SIMPLE_COOKIE + '; ' + req.get_header('Cookie')
        else:
            cookie_value = SIMPLE_COOKIE
        req.add_unredirected_header('Cookie', cookie_value)
        return req


# Install a global opener so that plain urllib2.urlopen() calls go through
# both the standard cookie processor and the handler above.
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(), SimpleCookieHandler())
urllib2.install_opener(opener)


def download_pages(start=4, stop=10, out_dir='c:\\',
                   base_url='http://zone.wooyun.org/content/'):
    """Download pages numbered start..stop-1 and save each as '<n>.html'.

    Args:
        start: first page number to fetch (inclusive).
        stop: page number to stop at (exclusive).
        out_dir: directory the '<n>.html' files are written into.
        base_url: URL prefix; the page number is appended to it.

    Returns:
        List of the file paths that were written, in download order.

    Raises:
        Whatever urlopen() or open() raise (e.g. URLError, IOError/OSError);
        errors are not swallowed, so a failed page stops the run.
    """
    saved_paths = []
    for n in range(start, stop):
        target = os.path.join(out_dir, '%d.html' % n)
        # closing() is used because Python 2 responses are not context managers.
        with closing(urllib2.urlopen(base_url + '%d' % n)) as resp:
            body = resp.read()
        # Binary mode: the response body is raw bytes and must be written
        # unmodified (no newline translation, no implicit encoding).
        with open(target, 'wb') as out:
            out.write(body)
        saved_paths.append(target)
    return saved_paths


if __name__ == '__main__':
    download_pages()
首先定义一个cookie处理类(参考 http://blog.csdn.net/uestcyao/article/details/7896184 ),把收集到的cookie放到simple_cookie内。整个功能使用urllib2模块完成。
说明:刚开始打算使用登录功能,自动提交cookie,这要用到cookielib模块。后来发现登录需要验证码,折腾半天,觉得验证码还是比较难缠的问题。于是暂时改变思路,使用自己登录时采集的cookie进行提交,这样就可以模拟登录者的身份进行采集了。以后会研究验证码方面的功能,解决验证码带来的困扰。