Sometimes when browsing drops I found it has, unfortunately, no complete list of all its articles. So I had the idea of crawling every article title and link into one page, which is both intuitive and makes it easy to find the articles you are interested in.
#coding=utf-8
import re
import urllib2

class dropsSpider:
    def __init__(self):
        # Category slugs on drops.wooyun.org ("%E8%BF%90%E7%BB%B4%E5%AE%89%E5%85%A8"
        # is the URL-encoded ops-security category).
        self.list = ["papers", "tips", "tools", "news",
                     "%E8%BF%90%E7%BB%B4%E5%AE%89%E5%85%A8", "web",
                     "pentesting", "wireless", "database", "binary"]
        # Pull the last page number out of WordPress's pagination span.
        self.re_getpage = re.compile(r"<span\sclass='pages'>.*?1.*?(\d+).*?</span>")
        # Capture (href, title) from each article's permalink anchor.
        self.re_gettitleandlinks = re.compile(
            r'<a href="(.*?)" rel="bookmark" title="Permanent Link to (.*?)">')
        self.url = "http://drops.wooyun.org/category/"
        self.filename = "text.html"

    def getPages(self, category):
        # Fetch a category's first page and return its page count (as a string).
        headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
        request = urllib2.Request(self.url + category, headers=headers)
        response = urllib2.urlopen(request, timeout=5)
        pages = self.re_getpage.findall(response.read())
        if pages:
            return pages[0]
        return "1"  # no pagination span means a single page

    def getTitleAndLinks(self, link):
        # Return a list of (href, title) tuples found on one listing page.
        headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
        request = urllib2.Request(link, headers=headers)
        response = urllib2.urlopen(request, timeout=5)
        return self.re_gettitleandlinks.findall(response.read())

    def startSpider(self):
        # Walk every page of every category and dump the links as HTML.
        f = open(self.filename, "w+")
        for category in self.list:
            pages = int(self.getPages(category))
            for page in range(1, pages + 1):
                # self.url already ends with "category/", so only the slug
                # and page number are appended here.
                link = self.url + category + "/page/" + str(page)
                for href, title in self.getTitleAndLinks(link):
                    f.write('<a href="' + href + '">' + title + '</a><br>')
        f.close()

if __name__ == '__main__':
    spider = dropsSpider()
    spider.startSpider()
The script is a bit bloated and could be optimized considerably; it could also be made multithreaded.
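As a rough illustration of the multithreaded idea, here is a minimal sketch built on the Python 2 threading and Queue modules. It reuses the dropsSpider class above; the helper name crawl_threaded, the worker count of 4, and the queue-based fan-out are all assumptions made for illustration, not part of the original script.

# A minimal sketch of the multithreaded variant, assuming the dropsSpider
# class above. crawl_threaded, num_workers, and the queue layout are
# illustrative choices, not part of the original script.
import threading
import Queue  # Python 2 module; renamed "queue" in Python 3

def crawl_threaded(spider, num_workers=4):
    tasks = Queue.Queue()
    results = []
    lock = threading.Lock()

    # Enqueue every (category, page) listing URL up front.
    for category in spider.list:
        pages = int(spider.getPages(category))
        for page in range(1, pages + 1):
            tasks.put(spider.url + category + "/page/" + str(page))

    def worker():
        while True:
            try:
                link = tasks.get_nowait()
            except Queue.Empty:
                return  # queue drained, worker exits
            entries = spider.getTitleAndLinks(link)
            with lock:  # serialize writes to the shared result list
                results.extend(entries)
            tasks.task_done()

    threads = [threading.Thread(target=worker) for _ in range(num_workers)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return results

Collecting the (href, title) pairs in memory and writing text.html once at the end keeps file I/O out of the worker threads, so only the result list needs locking.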