zoukankan      html  css  js  c++  java
  • imgur.py

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
     
    from __future__ import with_statement
    import sys
    import os
    import urllib2
    from urlparse import urlparse
    import random
    import re
    import gevent
    from gevent import monkey
    # Apply gevent's monkey patches at import time, before any of the
    # functions below run.
    monkey.patch_all()
     
    def get(url):
      """Open *url* and return the response object.

      A fresh opener is built, installed as urllib2's global default, and used
      to perform the request.  If the request fails with ``urllib2.HTTPError``
      or ``urllib2.URLError``, the error is written to stderr and the process
      exits with status -1.
      """
      setup = urllib2.build_opener()
      # OpenerDirector takes its default request headers from ``addheaders``.
      setup.addheaders = [('User-Agent', 'Mozilla/5.0 (compatible; imgur.py)')]
      urllib2.install_opener(setup)
      request = urllib2.Request(url)
      try:
        # The network call is what can raise, so it has to live inside the try.
        return setup.open(request)
      except (urllib2.HTTPError, urllib2.URLError):
        # sys.exc_info() avoids version-specific ``except ... as`` syntax.
        sys.stderr.write('Failed to fetch %s: %s\n' % (url, sys.exc_info()[1]))
        sys.exit(-1)
     
    def is_url(url):
      """Return True if *url* points at imgur.com or one of its subdomains.

      The host is taken from the parsed URL (port and credentials excluded),
      so look-alike hosts such as ``notimgur.com`` or
      ``imgur.com.example.org`` are rejected, as are URLs without a host.
      """
      host = urlparse(url).hostname
      if host is None:
        return False
      return host == 'imgur.com' or host.endswith('.imgur.com')
     
    def fetch(url):
      """Download the ``/noscript`` page for the key found in *url*.

      The key is the third '/'-separated piece of the URL path (index 2).
      Returns a ``(page_body, key)`` tuple.
      """
      segments = urlparse(url).path.split('/')
      key = segments[2]
      page = get('https://imgur.com/a/%s/noscript' % key)
      return page.read(), key
     
    def get_or_create_folder(key, folder=None):
      """Return the target folder path, creating it when nothing exists there.

      *folder* is used when it is given; otherwise *key* is the folder path.
      """
      target = key if folder is None else folder
      if os.path.exists(target):
        return target
      os.makedirs(target)
      return target
      
    def fetch_images(foldername, images):
      """Download one image and store it inside *foldername*.

      *images* is an indexable record: item 0 is the image URL and item 1 is
      the file name to save it under.  The download finishes before the target
      file is opened, so a failed request does not leave an empty file behind.
      """
      # Sleeps for either 0 or 0.0001 seconds before starting the download.
      gevent.sleep(random.randint(0, 1) * 0.0001)
      source_url, filename = images[0], images[1]
      response = get(source_url)
      try:
        payload = response.read()
      finally:
        response.close()
      with open(os.path.join(foldername, filename), 'wb') as img:
        img.write(payload)
      print('Done:\t%s' % source_url)
     
    def save(url, folder=None):
      """Fetch the page for *url* and list the images it references.

      Returns ``(foldername, images)``.  *foldername* is the target directory
      (created if needed) and *images* is a list of
      ``(image_url, file_name, extension)`` tuples, one per matching
      ``<img>`` tag.
      """
      data, key = fetch(url)
      # Dots are escaped so they only match a literal '.', both http and https
      # sources are accepted, and ids of 5 to 7 characters are recognised.
      image_re = re.compile(
        r'<img src="(https?://i\.imgur\.com/([a-zA-Z0-9]{5,7}\.(jpg|png|gif)))"')
      images = image_re.findall(data)
      foldername = get_or_create_folder(key, folder)
      return foldername, images
     
     
    if __name__ == '__main__':
      # Usage: imgur.py ALBUM_URL [FOLDER]
      if len(sys.argv) < 2:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit('usage: %s ALBUM_URL [FOLDER]' % sys.argv[0])
      url = sys.argv[1]
      # The destination folder is optional; save() falls back to the album key.
      folder = sys.argv[2] if len(sys.argv) > 2 else None
      foldername, images = save(url, folder=folder)
      # Spawn one greenlet per image, then wait for all of them to finish.
      threads = [gevent.spawn(fetch_images, foldername, image) for image in images]
      gevent.joinall(threads)
  • 相关阅读:
    多线程开发技术基础
    Asp.Net MVC 进阶篇:路由匹配 实现博客路径 和文章路径
    详解 ManualResetEvent
    Http状态码完整说明
    Javascript 封装问题
    网络爬虫的C++程序
    闭包
    算法框架与问题求解
    SQLSERVER用无中生有的思想来替代游标
    Citrix 服务器虚拟化之四 Xenserver资源池
  • 原文地址:https://www.cnblogs.com/bergus/p/4592772.html
Copyright © 2011-2022 走看看