zoukankan      html  css  js  c++  java
  • imgur.py

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
     
    from __future__ import with_statement
    import sys
    import os
    import urllib2
    from urlparse import urlparse
    import random
    import re
    import gevent
    from gevent import monkey
    monkey.patch_all()
     
    def get(url):
      """Open *url* and return the response object from the opener.

      A fresh urllib2 opener is built for the call and is also installed as
      the module-wide default through urllib2.install_opener.

      If opening the URL raises urllib2.HTTPError or urllib2.URLError, a
      message is written to stderr and sys.exit(-1) is called.
      """
      setup = urllib2.build_opener()
      # The opener attribute is spelled `addheaders`; the previous
      # `add_headers` assignment was silently ignored.
      setup.addheaders = [('User-Agent', 'Mozilla/5.0 (compatible; imgur.py)')]
      urllib2.install_opener(setup)
      request = urllib2.Request(url)
      try:
        # The network call is what raises HTTPError/URLError, so it (not the
        # Request construction) has to be inside the try block.
        return setup.open(request)
      except (urllib2.HTTPError, urllib2.URLError):
        # sys.exc_info() avoids both the `except X, e` and `except X as e`
        # spellings, so this parses on old and new interpreters alike.
        err = sys.exc_info()[1]
        sys.stderr.write('Failed to fetch %s: %s\n' % (url, err))
        sys.exit(-1)
     
    def is_url(url):
      """Return True when *url* points at imgur.com or one of its subdomains.

      The host is taken from the parsed URL (lower-cased, without any port)
      and must be exactly 'imgur.com' or end in '.imgur.com'. A plain
      substring test would also accept look-alike hosts such as
      'notimgur.com' or 'imgur.com.evil.example'.

      Strings without a network location (for example a URL missing its
      scheme) yield False.
      """
      host = urlparse(url).hostname
      if not host:
        return False
      return host == 'imgur.com' or host.endswith('.imgur.com')
     
    def fetch(url):
      """Download the noscript album page referenced by *url*.

      The album key is the piece at index 2 of the URL path split on '/'
      (e.g. 'KEY' in '/a/KEY'). Returns a (page_body, key) tuple.
      """
      segments = urlparse(url).path.split('/')
      key = segments[2]
      page = get('https://imgur.com/a/%s/noscript' % key)
      return page.read(), key
     
    def get_or_create_folder(key, folder=None):
      """Return the target directory, creating it first if it is missing.

      The directory is *folder* when one is given, otherwise *key*.
      """
      target = key if folder is None else folder
      if os.path.exists(target):
        return target
      os.makedirs(target)
      return target
      
    def fetch_images(foldername, images):
      """Download one image and write it into *foldername*.

      *images* is indexed positionally: images[0] is the URL to download and
      images[1] is the file name used inside *foldername*. A 'Done:' line
      with the URL is printed once the file has been written.
      """
      # Sleep for 0 or 0.0001 seconds, chosen at random.
      gevent.sleep(random.randint(0, 1) * 0.0001)
      path = os.path.join(foldername, images[1])
      # Download before opening the output file so that a failed request does
      # not leave an empty file behind, and always release the response.
      response = get(images[0])
      try:
        payload = response.read()
      finally:
        response.close()
      with open(path, 'wb') as img:
        img.write(payload)
      # Parenthesised so it parses both as a print statement and a print call.
      print('Done:\t%s' % images[0])
     
    def save(url, folder=None):
      """Find the images of the album at *url* and prepare the output folder.

      The album page is retrieved with fetch(), image links are extracted
      with a regular expression, and the destination directory (*folder*, or
      the album key when *folder* is None) is created if needed.

      Returns (foldername, images), where each entry of images is a tuple
      (full_url, file_name, extension) as captured by the pattern's groups.
      """
      data, key = fetch(url)
      # The dot before the extension is escaped so it only matches a literal
      # '.', and both http and https links are accepted (the album page
      # itself is requested over https).
      image_re = re.compile(
          r'<img src="(https?://i\.imgur\.com/([a-zA-Z0-9]{5}\.(jpg|png|gif)))"')
      images = image_re.findall(data)
      foldername = get_or_create_folder(key, folder)
      return foldername, images
     
     
    if __name__ == '__main__':
      # Usage: imgur.py ALBUM_URL [FOLDER]
      # Exit with a usage message instead of an IndexError traceback when the
      # album URL is missing.
      if len(sys.argv) < 2:
        sys.exit('usage: %s ALBUM_URL [FOLDER]' % sys.argv[0])
      url = sys.argv[1]
      # The destination folder is optional; save() falls back to the album key.
      folder = sys.argv[2] if len(sys.argv) > 2 else None
      foldername, images = save(url, folder=folder)
      # Spawn one greenlet per image and wait for all of them to finish.
      threads = [gevent.spawn(fetch_images, foldername, image) for image in images]
      gevent.joinall(threads)
  • 相关阅读:
    题解 P2810 【Catch the theives】
    2020.11.27 考试题解
    2020.11.25 考试题解
    题解 SP16254 【RMID2】
    2020.11.24 考试题解
    2020.11.23 考试题解
    CSP-2020 T3 函数调用
    二维树状数组学习笔记
    题解 P4910 【帕秋莉的手环】
    Python实现向指定IP的目标机器拷贝文件
  • 原文地址:https://www.cnblogs.com/bergus/p/4592772.html
Copyright © 2011-2022 走看看