zoukankan      html  css  js  c++  java
  • python爬乌云dorps文章

    有时候翻看 drops,无奈 drops 没有一个全部文章的列表,所以就有了这个想法:把所有文章的标题和链接都爬出来,这样又直观,又方便找到感兴趣的文章。

    #coding=utf-8
    import re
    import urllib2
     
    class dropsSpider:
        """Crawl every category of drops.wooyun.org and dump all article
        titles + permalinks into a single local HTML file (self.filename).

        Python 2 script (uses urllib2).
        """

        def __init__(self):
            # Category slugs to crawl; the %E8... entry is URL-encoded Chinese
            # (presumably the "运维安全" / ops-security category).
            self.list = ["papers","tips","tools","news","%E8%BF%90%E7%BB%B4%E5%AE%89%E5%85%A8","web","pentesting","wireless","database","binary"]
            # Pagination span; captures the last page number.
            # FIX: restored the backslashes (\s, \d) that were lost in the
            # original listing — without them the pattern never matched and
            # getPages always fell back to 1 page.
            self.re_getpage = re.compile(r"<span\sclass='pages'>.*?1.*? (\d+).*?</span>")
            # Article anchor; captures (permalink URL, article title).
            # FIX: the original used unescaped double quotes inside a
            # double-quoted raw string, which is a syntax error.
            self.re_gettitleandlinks = re.compile(r'<a href="(.*?)" rel="bookmark" title="Permanent Link to (.*?)">')
            # Base URL already ends with "category/"; callers must append
            # only the category slug (see startSpider).
            self.url = "http://drops.wooyun.org/category/"
            self.filename = "text.html"

        def getPages(self, category):
            """Fetch the first listing page of *category* and return its total
            page count as a string; returns "1" when no pager is found."""
            self.category = category
            user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
            headers = {'User-Agent': user_agent}
            url = self.url + self.category
            request = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(request, timeout=5)
            try:
                res = response.read()
            finally:
                response.close()
            pages = self.re_getpage.findall(res)
            if pages:
                return pages[0]
            return str(1)

        def getTitleAndLinks(self, link):
            """Fetch one listing page and return a list of
            (permalink, title) tuples extracted from it."""
            self.link = link
            user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
            headers = {'User-Agent': user_agent}
            request = urllib2.Request(self.link, headers=headers)
            response = urllib2.urlopen(request, timeout=5)
            try:
                res = response.read()
            finally:
                response.close()
            return self.re_gettitleandlinks.findall(res)

        def startSpider(self):
            """Walk every category/page and write each article as an
            <a href=...> line into self.filename."""
            # 'with' guarantees the file is closed even if a fetch raises
            # (the original leaked the handle and had a broken f.close()
            # indentation).
            with open(self.filename, "w+") as f:
                for category in self.list:
                    # Renamed from 'sum' — don't shadow the builtin.
                    page_count = self.getPages(category)
                    for page in range(1, int(page_count) + 1):
                        # FIX: self.url already ends in "category/"; the
                        # original appended "category/" a second time and
                        # requested .../category/category/... (404).
                        link = self.url + category + "/page/" + str(page)
                        entries = self.getTitleAndLinks(link)
                        for href, title in entries:
                            f.write('<a href="' + href + '">' + title + '</a>' + '<br>')
     
     
    # Script entry point: build the spider and run the full crawl.
    if __name__ == '__main__':
        spider = dropsSpider()
        spider.startSpider()
    

    脚本有点臃肿,可以大大地优化,还可以做成多线程。

  • 相关阅读:
    201671030116宋菲菲 实验三作业互评与改进报告
    通读《构建之法》提出问题
    201671010460-朱艺璇-实验四附加实验
    201671010460朱艺璇 词频统计软件项目报告
    201671010460朱艺璇 实验三作业互评与改进报告
    阅读《现代软件工程—构建之法》提出的问题
    手把手带你了解消息中间件(3)——RocketMQ
    字符编码的历史由来
    linux常用命令
    linux各目录及重要目录的详细介绍
  • 原文地址:https://www.cnblogs.com/depycode/p/5190102.html
Copyright © 2011-2022 走看看