  • Scraping WooYun drops articles with Python

    I browse drops from time to time, but unfortunately drops has no single list of all its articles, so I had the idea of scraping every article title and link. That way everything is in one place and it is easy to spot the articles you are interested in.

    #coding=utf-8
    import re
    import urllib2
     
    class dropsSpider:
        def __init__(self):
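            # category slugs to crawl; the percent-encoded entry decodes to "运维安全" (operations security)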
            self.list = ["papers","tips","tools","news","%E8%BF%90%E7%BB%B4%E5%AE%89%E5%85%A8","web","pentesting","wireless","database","binary"]
            self.re_getpage = re.compile(r"<span\s+class='pages'>.*?1.*? (\d+).*?</span>")
            self.re_gettitleandlinks = re.compile(r'<a href="(.*?)" rel="bookmark" title="Permanent Link to (.*?)">')
            #self.category = category
            self.url = "http://drops.wooyun.org/category/"
            self.filename = "text.html"
         
        def getPages(self,category):
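            # fetch the category's first page and pull the total page count out of the pagination <span>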
            self.category = category
            user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
            headers = { 'User-Agent' : user_agent }
            url = self.url + self.category
            #print url
            request = urllib2.Request(url,headers = headers)
            response = urllib2.urlopen(request,timeout=5)
            res = response.read()
            pages = re.findall(self.re_getpage, res)
            if pages:
                return pages[0]
            else :
                return str(1)
      
        def getTitleAndLinks(self,link):
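            # fetch one listing page and return a list of (href, title) tuples, one per article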
            self.link = link
            user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
            headers = { 'User-Agent' : user_agent }
            request = urllib2.Request(self.link,headers = headers)
            response = urllib2.urlopen(request,timeout=5)
            res = response.read()
            titleandlinks = re.findall(self.re_gettitleandlinks, res)
            return titleandlinks
     
        def startSpider(self):
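            # walk every category and every page, writing each article as an <a> link into text.html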
            f = open(self.filename,"w+")
            for i in self.list:
                pages = self.getPages(i)
                for j in range(1, int(pages) + 1):
                    # self.url already ends in "category/", so append only the slug and page number
                    link = self.url + i + "/page/" + str(j)
                    titleandlinks = self.getTitleAndLinks(link)
                    for s in titleandlinks:
                        res = '<a href="' + s[0] + '">' + s[1] + '</a>' + '<br>'
                        #res = s[0] + '===>' + s[1]
                        f.write(res)
            f.close()
     
     
    if __name__=='__main__':
        myname = dropsSpider()
        myname.startSpider()
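
    Running it produces text.html, with each article rendered as a clickable link separated by <br> tags, so the whole archive can be skimmed in a browser.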
    

    The script is a bit bloated and could be optimized a lot; it could also be made multithreaded, as sketched below.
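
    As a rough sketch of that multithreading idea, the version below spreads the page URLs over a few worker threads. It is only a sketch: it assumes the fixed dropsSpider class above and Python 2's threading and Queue modules, and the worker / start_threaded names are illustrative, not part of the original script.

    #coding=utf-8
    import threading
    import Queue

    def worker(spider, q, f, lock):
        # take page URLs off the queue until none are left
        while True:
            try:
                link = q.get_nowait()
            except Queue.Empty:
                return
            pairs = spider.getTitleAndLinks(link)
            # hold the lock while writing so output from different threads does not interleave
            with lock:
                for s in pairs:
                    f.write('<a href="' + s[0] + '">' + s[1] + '</a>' + '<br>')

    def start_threaded(spider, num_threads=5):
        # queue up every category/page URL first, then let the workers drain it
        q = Queue.Queue()
        for cat in spider.list:
            for j in range(1, int(spider.getPages(cat)) + 1):
                q.put(spider.url + cat + "/page/" + str(j))
        f = open(spider.filename, "w+")
        lock = threading.Lock()
        threads = [threading.Thread(target=worker, args=(spider, q, f, lock))
                   for _ in range(num_threads)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        f.close()

    Calling start_threaded(dropsSpider()) writes the same text.html, just with the listing pages fetched in parallel.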
