  • Crawler download script

    The download script:

#!/usr/bin/python
# _*_ coding: utf-8 _*_
import urlparse
import urllib2
import random
import time
from datetime import datetime
import socket
import disk_cache

DEFAULT_AGENT = 'WSWP'   # default User-Agent string sent with each request
DEFAULT_DELAY = 5        # delay (seconds) between requests to the same domain, to limit the crawl rate
DEFAULT_RETRIES = 1      # number of retries when a download error occurs
DEFAULT_TIMEOUT = 60     # socket timeout (seconds)
CACHE = disk_cache.DiskCache()


class Downloader:
    def __init__(self, delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, proxies=None,
                 num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT, opener=None,
                 cache=CACHE):
        socket.setdefaulttimeout(timeout)
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.opener = opener
        self.cache = cache

    def __call__(self, url):
        result = None
        if self.cache:
            try:
                result = self.cache[url]
            except KeyError:
                # this URL is not yet cached
                pass
            else:
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    # the cached result was a server error, so ignore it and retry
                    result = None
        if result is None:
            # result was not loaded from cache, so download it
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {'User-agent': self.user_agent}
            result = self.download(url, headers, proxy=proxy, num_retries=self.num_retries)
            if self.cache:
                self.cache[url] = result

        return result['html']

    def download(self, url, headers, proxy, num_retries, data=None):
        print 'Downloading:', url
        request = urllib2.Request(url, data, headers or {})
        opener = self.opener or urllib2.build_opener()
        if proxy:
            proxy_params = {urlparse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib2.ProxyHandler(proxy_params))
        try:
            response = opener.open(request)
            html = response.read()
            code = response.code
        except Exception as e:
            print 'Download error:', str(e)
            html = ''
            code = getattr(e, 'code', None)  # HTTP status, if the error carries one
            if num_retries > 0 and code is not None and 500 <= code < 600:
                # retry server (5xx) errors
                return self.download(url, headers, proxy, num_retries - 1, data)
        return {'html': html, 'code': code}


class Throttle:
    """Delay repeated downloads to the same domain."""

    def __init__(self, delay):
        self.delay = delay
        self.domains = {}  # timestamp of the last access to each domain

    def wait(self, url):
        domain = urlparse.urlsplit(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()


p = Downloader()
x = p('http://www.meituan.com')
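
    A minimal usage sketch (assuming the listing above is saved as download.py next to disk_cache.py; the delay, proxy address, and URL below are placeholders, not from the original post). The Downloader instance is called like a function; a repeated request for the same URL is answered from the disk cache, while requests to the same domain are spaced out by Throttle:

from download import Downloader

# Hypothetical values: delay, proxy address, and target URL are illustrative only.
d = Downloader(delay=3, proxies=['http://127.0.0.1:8080'], num_retries=2)
html = d('http://example.webscraping.com')  # first call: downloaded, then cached
html = d('http://example.webscraping.com')  # second call: served from DiskCache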

    The disk_cache caching script:

      

import os
import re
import urlparse
import shutil
import zlib
from datetime import datetime, timedelta
try:
    import cPickle as pickle
except ImportError:
    import pickle


class DiskCache:
    """
    Dictionary interface that stores cached
    values in the file system rather than in memory.
    The file path is formed by sanitizing the URL key.

    >>> cache = DiskCache()
    >>> url = 'http://example.webscraping.com'
    >>> result = {'html': '...'}
    >>> cache[url] = result
    >>> cache[url]['html'] == result['html']
    True
    >>> cache = DiskCache(expires=timedelta())
    >>> cache[url] = result
    >>> cache[url]
    Traceback (most recent call last):
     ...
    KeyError: 'http://example.webscraping.com has expired'
    >>> cache.clear()
    """

    def __init__(self, cache_dir='cache', expires=timedelta(days=30), compress=True):
        """
        cache_dir: the root level folder for the cache
        expires: timedelta of amount of time before a cache entry is considered expired
        compress: whether to compress data in the cache
        """
        self.cache_dir = cache_dir
        self.expires = expires
        self.compress = compress

    def __getitem__(self, url):
        """Load data from disk for this URL
        """
        path = self.url_to_path(url)
        if os.path.exists(path):
            with open(path, 'rb') as fp:
                data = fp.read()
                if self.compress:
                    data = zlib.decompress(data)
                result, timestamp = pickle.loads(data)
                if self.has_expired(timestamp):
                    raise KeyError(url + ' has expired')
                return result
        else:
            # URL has not yet been cached
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        """Save data to disk for this URL
        """
        path = self.url_to_path(url)
        folder = os.path.dirname(path)
        if not os.path.exists(folder):
            os.makedirs(folder)

        data = pickle.dumps((result, datetime.utcnow()))
        if self.compress:
            data = zlib.compress(data)
        with open(path, 'wb') as fp:
            fp.write(data)

    def __delitem__(self, url):
        """Remove the value at this key and any empty parent sub-directories
        """
        path = self.url_to_path(url)
        try:
            os.remove(path)
            os.removedirs(os.path.dirname(path))
        except OSError:
            pass

    def url_to_path(self, url):
        """Create file system path for this URL
        """
        components = urlparse.urlsplit(url)
        # when path is empty, set it to /index.html
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters
        filename = re.sub('[^/0-9a-zA-Z-.,;_ ]', '_', filename)
        # restrict maximum number of characters per path segment
        filename = '/'.join(segment[:255] for segment in filename.split('/'))
        return os.path.join(self.cache_dir, filename)

    def has_expired(self, timestamp):
        """Return whether this timestamp has expired
        """
        return datetime.utcnow() > timestamp + self.expires

    def clear(self):
        """Remove all the cached values
        """
        if os.path.exists(self.cache_dir):
            shutil.rmtree(self.cache_dir)


if __name__ == '__main__':
    cache = DiskCache()
    print cache
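
    A short sketch of how the cache behaves (assuming the listing above is saved as disk_cache.py and runs on a POSIX filesystem; the URL below is a placeholder). Keys are mapped to file paths by url_to_path, values are pickled together with a UTC timestamp (and zlib-compressed by default), and reads raise KeyError once that timestamp is older than expires:

from datetime import timedelta
from disk_cache import DiskCache

cache = DiskCache(cache_dir='cache', expires=timedelta(days=1))
url = 'http://example.webscraping.com/view/1'
cache[url] = {'html': '<html>...</html>', 'code': 200}  # written to disk
print cache.url_to_path(url)  # -> cache/example.webscraping.com/view/1
print cache[url]['code']      # -> 200 (raises KeyError after one day)
del cache[url]                # removes the file and any empty parent folders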
  • Original post: https://www.cnblogs.com/yubenliu/p/6055021.html