  • Crawling WooYun Drops articles with Python

    Browsing Drops can be frustrating because there is no single page listing all of its articles, hence this idea: crawl every article title and link into one page, so interesting articles are easy to scan for and easy to find.

    #coding=utf-8
    import re
    import urllib2
     
    class dropsSpider:
        def __init__(self):
            # Category slugs on drops.wooyun.org; the URL-encoded one is "运维安全" (ops security)
            self.list = ["papers","tips","tools","news","%E8%BF%90%E7%BB%B4%E5%AE%89%E5%85%A8","web","pentesting","wireless","database","binary"]
            # Captures the total page count from the pagination span ("Page 1 of N")
            self.re_getpage = re.compile(r"<span\sclass='pages'>.*?1.*? (\d+).*?</span>")
            # Captures (link, title) pairs from each article entry
            self.re_gettitleandlinks = re.compile(r'<a href="(.*?)" rel="bookmark" title="Permanent Link to (.*?)">')
            self.url = "http://drops.wooyun.org/category/"
            self.filename = "text.html"
         
        def getPages(self,category):
            self.category = category
            user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
            headers = { 'User-Agent' : user_agent }
            url = self.url + self.category
            #print url
            request = urllib2.Request(url,headers = headers)
            response = urllib2.urlopen(request,timeout=5)
            res = response.read()
            pages = re.findall(self.re_getpage, res)
            if pages:
                return pages[0]
            else:
                # No pagination span means the category has only one page
                return "1"
      
        def getTitleAndLinks(self,link):
            self.link = link
            user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
            headers = { 'User-Agent' : user_agent }
            request = urllib2.Request(self.link,headers = headers)
            response = urllib2.urlopen(request,timeout=5)
            res = response.read()
            titleandlinks = re.findall(self.re_gettitleandlinks, res)
            return titleandlinks
     
        def startSpider(self):
            f = open(self.filename,"w+")
            for i in self.list:
                # Number of listing pages in this category
                pages = self.getPages(i)
                for j in range(1,int(pages)+1):
                    # self.url already ends with "category/", so append only the slug
                    link = self.url + i + "/page/" + str(j)
                    items = self.getTitleAndLinks(link)
                    for s in items:
                        # Write each article as a clickable link, one per line
                        res = '<a href="'+s[0]+'">'+s[1]+'</a>'+'<br>'
                        f.write(res)
            f.close()
     
     
    if __name__ == '__main__':
        spider = dropsSpider()
        spider.startSpider()
    

    The script is a bit bloated and could be optimized considerably; it could also be made multithreaded.
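
    As a rough illustration of the multithreading idea, here is a minimal sketch in Python 3 (the original script targets Python 2's urllib2). fetch_page and crawl are hypothetical stand-ins for getTitleAndLinks and startSpider; since the work is I/O-bound, a simple thread pool is a good fit.

    import re
    import urllib.request
    from concurrent.futures import ThreadPoolExecutor

    TITLE_RE = re.compile(r'<a href="(.*?)" rel="bookmark" title="Permanent Link to (.*?)">')
    HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}

    def fetch_page(url):
        # Download one listing page and return its (link, title) pairs
        req = urllib.request.Request(url, headers=HEADERS)
        with urllib.request.urlopen(req, timeout=5) as resp:
            html = resp.read().decode('utf-8', errors='replace')
        return TITLE_RE.findall(html)

    def crawl(urls, workers=8):
        # Fetch all listing pages concurrently; pool.map preserves input order
        with ThreadPoolExecutor(max_workers=workers) as pool:
            for pairs in pool.map(fetch_page, urls):
                for link, title in pairs:
                    print('<a href="%s">%s</a><br>' % (link, title))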

  • Original post: https://www.cnblogs.com/depycode/p/5190102.html