zoukankan      html  css  js  c++  java
  • python爬乌云dorps文章

    有时候翻看 drops,无奈 drops 没有一个全部文章的列表,所以就有了这个想法:把所有文章的标题和链接都爬出来,这样又直观,又方便查找感兴趣的文章。

    #coding=utf-8
    import re
    import urllib2
     
    class dropsSpider:
        """Crawl every category on drops.wooyun.org and dump each article's
        title and permalink as HTML anchors into self.filename.

        Python 2 script (uses urllib2). Network errors are not handled and
        will propagate to the caller.
        """

        def __init__(self):
            # Category slugs to crawl; the percent-encoded entry is the
            # Chinese category "运维安全" (ops security).
            self.list = ["papers","tips","tools","news","%E8%BF%90%E7%BB%B4%E5%AE%89%E5%85%A8","web","pentesting","wireless","database","binary"]
            # Pagination widget: capture the last page number.
            # (Original regex was garbled by HTML scraping: missing space
            # after <span and missing backslash in \d+.)
            self.re_getpage = re.compile(r"<span class='pages'>.*?1.*? (\d+).*?</span>")
            # Article anchors: capture (permalink, title) pairs.
            # Single-quoted raw string so the inner double quotes are legal.
            self.re_gettitleandlinks = re.compile(r'<a href="(.*?)" rel="bookmark" title="Permanent Link to (.*?)">')
            # Base URL already ends with "category/" -- do not append it again.
            self.url = "http://drops.wooyun.org/category/"
            self.filename = "text.html"

        def getPages(self, category):
            """Fetch the first page of *category* and return the total page
            count as a string ('1' when no pagination widget is present)."""
            self.category = category
            user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
            headers = {'User-Agent': user_agent}
            url = self.url + self.category
            request = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(request, timeout=5)
            res = response.read()
            pages = self.re_getpage.findall(res)
            if pages:
                return pages[0]
            return "1"

        def getTitleAndLinks(self, link):
            """Fetch *link* and return a list of (url, title) tuples for
            every article anchor found on the page."""
            self.link = link
            user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
            headers = {'User-Agent': user_agent}
            request = urllib2.Request(self.link, headers=headers)
            response = urllib2.urlopen(request, timeout=5)
            res = response.read()
            return self.re_gettitleandlinks.findall(res)

        def startSpider(self):
            """Walk every category and every page of it, writing one
            <a href=...>title</a><br> element per article to self.filename."""
            f = open(self.filename, "w+")
            try:
                for category in self.list:
                    # 'page_count' instead of the original 'sum', which
                    # shadowed the builtin.
                    page_count = int(self.getPages(category))
                    for page in range(1, page_count + 1):
                        # BUG FIX: the original prepended "category/" a second
                        # time (self.url already ends with it), producing
                        # .../category/category/... URLs.
                        link = self.url + category + "/page/" + str(page)
                        for href, title in self.getTitleAndLinks(link):
                            f.write('<a href="' + href + '">' + title + '</a>' + '<br>')
            finally:
                # Original closed the file at a broken indentation level and
                # leaked it on error; always close.
                f.close()
     
     
    # Script entry point: build the spider and crawl everything.
    if __name__ == '__main__':
        spider = dropsSpider()
        spider.startSpider()
    

    脚本有点臃肿,可以大大地优化,还可以做成多线程。

  • 相关阅读:
    面向报文(UDP)和面向字节流(TCP)的区别
    c++ 字符串和数字拼接
    OpenGL中着色器,渲染管线,光栅化
    阅读计划
    课堂测试
    多态与异常处理
    《大道至简》第七八章读后感
    继承与接口课堂作业
    《大道至简》第六章读后感
    随机数组
  • 原文地址:https://www.cnblogs.com/depycode/p/5190102.html
Copyright © 2011-2022 走看看