zoukankan      html  css  js  c++  java
  • python高级之scrapy框架

    目录:

    • 爬虫性能原理
    • scrapy框架解析

    一、爬虫性能原理

    在编写爬虫时,性能的消耗主要在IO请求中,当单进程单线程模式下请求URL时必然会引起等待,从而使得请求整体变慢。

    1、同步执行

     1 import requests
     2 
     3 def fetch_async(url):
     4     response = requests.get(url)
     5     return response
     6 
     7 
     8 url_list = ['http://www.github.com', 'http://www.bing.com']
     9 
    10 for url in url_list:
    11     fetch_async(url)
    View Code

    2、多线程执行

     1 from concurrent.futures import ThreadPoolExecutor
     2 #导入线程池
     3 import requests
     4 
     5 
     6 def fetch_async(url):
     7     response = requests.get(url)
     8     return response
     9 
    10 
    11 url_list = ['http://www.github.com', 'http://www.bing.com']
    12 pool = ThreadPoolExecutor(5)
    13 for url in url_list:
    14     pool.submit(fetch_async, url)
    15 pool.shutdown(wait=True)
    View Code
     1 from concurrent.futures import ThreadPoolExecutor
     2 import requests
     3 
     4 def fetch_async(url):
     5     response = requests.get(url)
     6     return response
     7 
     8 
     9 def callback(future):
    10     print(future.result())
    11 
    12 
    13 url_list = ['http://www.github.com', 'http://www.bing.com']
    14 pool = ThreadPoolExecutor(5)
    15 for url in url_list:
    16     v = pool.submit(fetch_async, url)
    17     v.add_done_callback(callback)
    18 pool.shutdown(wait=True)
    多线程+回调函数

    3、多进程执行

     1 from concurrent.futures import ProcessPoolExecutor
     2 import requests
     3 
     4 def fetch_async(url):
     5     response = requests.get(url)
     6     return response
     7 
     8 
     9 url_list = ['http://www.github.com', 'http://www.bing.com']
    10 pool = ProcessPoolExecutor(5)
    11 for url in url_list:
    12     pool.submit(fetch_async, url)
    13 pool.shutdown(wait=True)
    View Code
     1 from concurrent.futures import ProcessPoolExecutor
     2 import requests
     3 
     4 
     5 def fetch_async(url):
     6     response = requests.get(url)
     7     return response
     8 
     9 
    10 def callback(future):
    11     print(future.result())
    12 
    13 
    14 url_list = ['http://www.github.com', 'http://www.bing.com']
    15 pool = ProcessPoolExecutor(5)
    16 for url in url_list:
    17     v = pool.submit(fetch_async, url)
    18     v.add_done_callback(callback)
    19 pool.shutdown(wait=True)
    多进程+回调函数

    通过上述代码均可以完成对请求性能的提高,但多线程和多进程的缺点是在IO阻塞时会造成线程和进程的浪费,所以异步IO会是首选:

    1、asyncio示例

     1 import asyncio
     2 
     3 
     4 @asyncio.coroutine
     5 def func1():
     6     print('before...func1......')
     7     yield from asyncio.sleep(5)
     8     print('end...func1......')
     9 
    10 
    11 tasks = [func1(), func1()]
    12 
    13 loop = asyncio.get_event_loop()
    14 loop.run_until_complete(asyncio.gather(*tasks))
    15 loop.close()
    View Code
    View Code

    2、asyncio+aiohttp示例

    View Code

    3、asyncio+ requests示例

     1 import asyncio
     2 import requests
     3 
     4 
     5 @asyncio.coroutine
     6 def fetch_async(func, *args):
     7     loop = asyncio.get_event_loop()
     8     future = loop.run_in_executor(None, func, *args)
     9     response = yield from future
    10     print(response.url, response.content)
    11 
    12 
    13 tasks = [
    14     fetch_async(requests.get, 'http://www.cnblogs.com/wupeiqi/'),
    15     fetch_async(requests.get, 'http://dig.chouti.com/pic/show?nid=4073644713430508&lid=10273091')
    16 ]
    17 
    18 loop = asyncio.get_event_loop()
    19 results = loop.run_until_complete(asyncio.gather(*tasks))
    20 loop.close()
    View Code

    4、gevent+requests示例

     1 import gevent
     2 
     3 import requests
     4 from gevent import monkey
     5 
     6 monkey.patch_all()
     7 
     8 
     9 def fetch_async(method, url, req_kwargs):
    10     print(method, url, req_kwargs)
    11     response = requests.request(method=method, url=url, **req_kwargs)
    12     print(response.url, response.content)
    13 
    14 # ##### 发送请求 #####
    15 gevent.joinall([
    16     gevent.spawn(fetch_async, method='get', url='https://www.python.org/', req_kwargs={}),
    17     gevent.spawn(fetch_async, method='get', url='https://www.yahoo.com/', req_kwargs={}),
    18     gevent.spawn(fetch_async, method='get', url='https://github.com/', req_kwargs={}),
    19 ])
    20 
    21 # ##### 发送请求(协程池控制最大协程数量) #####
    22 # from gevent.pool import Pool
    23 # pool = Pool(None)
    24 # gevent.joinall([
    25 #     pool.spawn(fetch_async, method='get', url='https://www.python.org/', req_kwargs={}),
    26 #     pool.spawn(fetch_async, method='get', url='https://www.yahoo.com/', req_kwargs={}),
    27 #     pool.spawn(fetch_async, method='get', url='https://www.github.com/', req_kwargs={}),
    28 # ])
    View Code

    5、grequests示例

     1 import grequests
     2 
     3 
     4 request_list = [
     5     grequests.get('http://httpbin.org/delay/1', timeout=0.001),
     6     grequests.get('http://fakedomain/'),
     7     grequests.get('http://httpbin.org/status/500')
     8 ]
     9 
    10 
    11 # ##### 执行并获取响应列表 #####
    12 # response_list = grequests.map(request_list)
    13 # print(response_list)
    14 
    15 
    16 # ##### 执行并获取响应列表(处理异常) #####
    17 # def exception_handler(request, exception):
    18 # print(request,exception)
    19 #     print("Request failed")
    20 
    21 # response_list = grequests.map(request_list, exception_handler=exception_handler)
    22 # print(response_list)
    View Code

    6、twisted示例

     1 from twisted.web.client import getPage, defer
     2 from twisted.internet import reactor
     3 
     4 
     5 def all_done(arg):
     6     reactor.stop()
     7 
     8 
     9 def callback(contents):
    10     print(contents)
    11 
    12 
    13 deferred_list = []
    14 
    15 url_list = ['http://www.bing.com', 'http://www.baidu.com', ]
    16 for url in url_list:
    17     deferred = getPage(bytes(url, encoding='utf8'))
    18     deferred.addCallback(callback)
    19     deferred_list.append(deferred)
    20 
    21 dlist = defer.DeferredList(deferred_list)
    22 dlist.addBoth(all_done)
    23 
    24 reactor.run()
    View Code

    7、tornado示例(注:下方示例代码实际使用的是twisted的getPage发送POST请求)

     1 from twisted.internet import reactor
     2 from twisted.web.client import getPage
     3 import urllib.parse
     4 
     5 
     6 def one_done(arg):
     7     print(arg)
     8     reactor.stop()
     9 
    10 post_data = urllib.parse.urlencode({'check_data': 'adf'})
    11 post_data = bytes(post_data, encoding='utf8')
    12 headers = {b'Content-Type': b'application/x-www-form-urlencoded'}
    13 response = getPage(bytes('http://dig.chouti.com/login', encoding='utf8'),
    14                    method=bytes('POST', encoding='utf8'),
    15                    postdata=post_data,
    16                    cookies={},
    17                    headers=headers)
    18 response.addBoth(one_done)
    19 
    20 reactor.run()
    View Code

    以上均是Python内置以及第三方模块提供的异步IO请求模块,使用简便,可大大提高效率;而异步IO请求的本质则是【非阻塞Socket】+【IO多路复用】:

      1 import select
      2 import socket
      3 import time
      4 
      5 
      6 class AsyncTimeoutException(TimeoutError):
      7     """
      8     请求超时异常类
      9     """
     10 
     11     def __init__(self, msg):
     12         self.msg = msg
     13         super(AsyncTimeoutException, self).__init__(msg)
     14 
     15 
     16 class HttpContext(object):
     17     """封装请求和响应的基本数据"""
     18 
     19     def __init__(self, sock, host, port, method, url, data, callback, timeout=5):
     20         """
     21         sock: 请求的客户端socket对象
     22         host: 请求的主机名
     23         port: 请求的端口
     24         port: 请求的端口
     25         method: 请求方式
     26         url: 请求的URL
     27         data: 请求时请求体中的数据
     28         callback: 请求完成后的回调函数
     29         timeout: 请求的超时时间
     30         """
     31         self.sock = sock
     32         self.callback = callback
     33         self.host = host
     34         self.port = port
     35         self.method = method
     36         self.url = url
     37         self.data = data
     38 
     39         self.timeout = timeout
     40 
     41         self.__start_time = time.time()
     42         self.__buffer = []
     43 
     44     def is_timeout(self):
     45         """当前请求是否已经超时"""
     46         current_time = time.time()
     47         if (self.__start_time + self.timeout) < current_time:
     48             return True
     49 
     50     def fileno(self):
     51         """请求socket对象的文件描述符,用于select监听"""
     52         return self.sock.fileno()
     53 
     54     def write(self, data):
     55         """在buffer中写入响应内容"""
     56         self.__buffer.append(data)
     57 
     58     def finish(self, exc=None):
     59         """在buffer中写入响应内容完成,执行请求的回调函数"""
     60         if not exc:
     61             response = b''.join(self.__buffer)
     62             self.callback(self, response, exc)
     63         else:
     64             self.callback(self, None, exc)
     65 
     66     def send_request_data(self):
     67         content = """%s %s HTTP/1.0\r\nHost: %s\r\n\r\n%s""" % (
     68             self.method.upper(), self.url, self.host, self.data,)
     69 
     70         return content.encode(encoding='utf8')
     71 
     72 
     73 class AsyncRequest(object):
     74     def __init__(self):
     75         self.fds = []
     76         self.connections = []
     77 
     78     def add_request(self, host, port, method, url, data, callback, timeout):
     79         """创建一个要请求"""
     80         client = socket.socket()
     81         client.setblocking(False)
     82         try:
     83             client.connect((host, port))
     84         except BlockingIOError as e:
     85             pass
     86             # print('已经向远程发送连接的请求')
     87         req = HttpContext(client, host, port, method, url, data, callback, timeout)
     88         self.connections.append(req)
     89         self.fds.append(req)
     90 
     91     def check_conn_timeout(self):
     92         """检查所有的请求,是否有已经连接超时,如果有则终止"""
     93         timeout_list = []
     94         for context in self.connections:
     95             if context.is_timeout():
     96                 timeout_list.append(context)
     97         for context in timeout_list:
     98             context.finish(AsyncTimeoutException('请求超时'))
     99             self.fds.remove(context)
    100             self.connections.remove(context)
    101 
    102     def running(self):
    103         """事件循环,用于检测请求的socket是否已经就绪,从而执行相关操作"""
    104         while True:
    105             r, w, e = select.select(self.fds, self.connections, self.fds, 0.05)
    106 
    107             if not self.fds:
    108                 return
    109 
    110             for context in r:
    111                 sock = context.sock
    112                 while True:
    113                     try:
    114                         data = sock.recv(8096)
    115                         if not data:
    116                             self.fds.remove(context)
    117                             context.finish()
    118                             break
    119                         else:
    120                             context.write(data)
    121                     except BlockingIOError as e:
    122                         break
    123                     except TimeoutError as e:
    124                         self.fds.remove(context)
    125                         self.connections.remove(context)
    126                         context.finish(e)
    127                         break
    128 
    129             for context in w:
    130                 # 已经连接成功远程服务器,开始向远程发送请求数据
    131                 if context in self.fds:
    132                     data = context.send_request_data()
    133                     context.sock.sendall(data)
    134                     self.connections.remove(context)
    135 
    136             self.check_conn_timeout()
    137 
    138 
    139 if __name__ == '__main__':
    140     def callback_func(context, response, ex):
    141         """
    142         :param context: HttpContext对象,内部封装了请求相关信息
    143         :param response: 请求响应内容
    144         :param ex: 是否出现异常(如果有异常则值为异常对象;否则值为None)
    145         :return:
    146         """
    147         print(context, response, ex)
    148 
    149     obj = AsyncRequest()
    150     url_list = [
    151         {'host': 'www.google.com', 'port': 80, 'method': 'GET', 'url': '/', 'data': '', 'timeout': 5,
    152          'callback': callback_func},
    153         {'host': 'www.baidu.com', 'port': 80, 'method': 'GET', 'url': '/', 'data': '', 'timeout': 5,
    154          'callback': callback_func},
    155         {'host': 'www.bing.com', 'port': 80, 'method': 'GET', 'url': '/', 'data': '', 'timeout': 5,
    156          'callback': callback_func},
    157     ]
    158     for item in url_list:
    159         print(item)
    160         obj.add_request(**item)
    161 
    162     obj.running()
    自写异步IO框架

    基本原理:
    IO多路复用:select,用于检测socket对象是否发生变化(是否连接成功,是否有数据到来)
    Socket:socket客户端

     1 import socket
     2             import select
     3 
     4             class Request(object):
     5                 def __init__(self,sock,func,url):
     6                     self.sock = sock
     7                     self.func = func
     8                     self.url = url
     9 
    10                 def fileno(self):
    11                     return self.sock.fileno()
    12 
    13             def async_request(url_list):
    14 
    15                 input_list = []
    16                 conn_list = []
    17 
    18                 for url in url_list:
    19                     client = socket.socket()
    20                     client.setblocking(False)
    21                     # 创建连接,不阻塞
    22                     try:
    23                         client.connect((url[0],80,)) # 100个向百度发送的请求
    24                     except BlockingIOError as e:
    25                         pass
    26 
    27                     obj = Request(client,url[1],url[0])
    28 
    29                     input_list.append(obj)
    30                     conn_list.append(obj)
    31 
    32                 while True:
    33                     # 监听socket是否已经发生变化 [request_obj,request_obj....request_obj]
    34                     # 如果有请求连接成功:wlist = [request_obj,request_obj]
    35                     # 如果有响应的数据:  rlist = [request_obj,request_obj....client100]
    36                     rlist,wlist,elist = select.select(input_list,conn_list,[],0.05)
    37                     for request_obj in wlist:
    38                         # print('连接成功')
    39                         # # # # 发送Http请求
    40                         # print('发送请求')
    41                         request_obj.sock.sendall("GET / HTTP/1.0\r\nhost:{0}\r\n\r\n".format(request_obj.url).encode('utf-8'))
    42                         conn_list.remove(request_obj)
    43 
    44                     for request_obj in rlist:
    45                         data = request_obj.sock.recv(8096)
    46                         request_obj.func(data)
    47                         request_obj.sock.close()
    48                         input_list.remove(request_obj)
    49 
    50                     if not input_list:
    51                         break
    View Code
     1 使用一个线程完成并发操作,如何并发?
     2         当第一个任务到来时,先发送连接请求,此时会发生IO等待,但是我不等待,我继续发送第二个任务的连接请求....
     3         
     4         IO多路复用监听socket变化
     5         先连接成功:
     6             发送请求信息: GET / http/1.0\r\nhost....
     7             遇到IO等待,不等待,继续检测是否有人连接成功:
     8             发送请求信息: GET / http/1.0\r\nhost....
     9             遇到IO等待,不等待,继续检测是否有人连接成功:
    10             发送请求信息: GET / http/1.0\r\nhost....
    11             
    12         有结果返回:
    13             读取返回内容,执行回调函数
    14             读取返回内容,执行回调函数
    15             读取返回内容,执行回调函数
    16             读取返回内容,执行回调函数
    17             读取返回内容,执行回调函数
    18             读取返回内容,执行回调函数
    19             读取返回内容,执行回调函数
    20             
    21         
    22         
    23         问题:什么是协程?
    24               单纯的执行一段代码后,跳到另外一段代码执行,再继续跳...
    25               
    26         异步IO:
    27              - 【基于协程】可以用 协程+非阻塞socket+select实现,gevent
    28              - 【基于事件循环】完全通用socket+select实现,Twisted
    29         
    30         1. 如何提高爬虫并发?
    31             利用异步IO模块,如:asyncio,twisted,gevent 
    32             本质:
    33                 - 【基于协程】可以用 协程+非阻塞socket+select实现,gevent
    34                 - 【基于事件循环】完全通用socket+select实现,Twisted,tornado
    35                 
    36         2. 异步非阻塞
    37               异步:回调   select 
    38             非阻塞:不等待 setblocking(False)
    39                 
    40         3. 什么是协程?
    41             pip3 install gevent 
    42         
    43             from greenlet import greenlet
    44 
    45             def test1():
    46                 print(12)
    47                 gr2.switch()
    48                 print(34)
    49                 gr2.switch()
    50              
    51              
    52             def test2():
    53                 print(56)
    54                 gr1.switch()
    55                 print(78)
    56              
    57             gr1 = greenlet(test1)
    58             gr2 = greenlet(test2)
    59             gr1.switch()
    View Code

    二、scrapy框架解析

    Scrapy是一个为了爬取网站数据,提取结构性数据而编写的应用框架。 其可以应用在数据挖掘,信息处理或存储历史数据等一系列的程序中。
    其最初是为了页面抓取 (更确切来说, 网络抓取 )所设计的, 也可以应用在获取API所返回的数据(例如 Amazon Associates Web Services ) 或者通用的网络爬虫。Scrapy用途广泛,可以用于数据挖掘、监测和自动化测试。

    Scrapy 使用了 Twisted异步网络库来处理网络通讯。整体架构大致如下

    Scrapy主要包括了以下组件:

      • 引擎(Scrapy)
        用来处理整个系统的数据流处理, 触发事务(框架核心)
      • 调度器(Scheduler)
        用来接受引擎发过来的请求, 压入队列中, 并在引擎再次请求的时候返回. 可以想像成一个URL(抓取网页的网址或者说是链接)的优先队列, 由它来决定下一个要抓取的网址是什么, 同时去除重复的网址
      • 下载器(Downloader)
        用于下载网页内容, 并将网页内容返回给蜘蛛(Scrapy下载器是建立在twisted这个高效的异步模型上的)
      • 爬虫(Spiders)
        爬虫是主要干活的, 用于从特定的网页中提取自己需要的信息, 即所谓的实体(Item)。用户也可以从中提取出链接,让Scrapy继续抓取下一个页面
      • 项目管道(Item Pipeline)
        负责处理爬虫从网页中抽取的实体,主要的功能是持久化实体、验证实体的有效性、清除不需要的信息。当页面被爬虫解析后,将被发送到项目管道,并经过几个特定的次序处理数据。
      • 下载器中间件(Downloader Middlewares)
        位于Scrapy引擎和下载器之间的框架,主要是处理Scrapy引擎与下载器之间的请求及响应。
      • 爬虫中间件(Spider Middlewares)
        介于Scrapy引擎和爬虫之间的框架,主要工作是处理蜘蛛的响应输入和请求输出。

    简而言之:

    5个模块功能

    • (1) 最重要的模块是Engine:它是数据流的指挥官,负责控制数据流(控制各个模块之间的通信);
    • (2) scheduler:负责将Engine提交的URL排成一个队列;
    • (3) spider:用户自己写的代码放在spider。主要负责HTTP response的解析,从回复的HTML中提取关键数据。
    • (4) downloader:负责跟URL对应的server通信,并获取返回的内容。
    • (5) item pipeline:负责处理spider提取出来的信息,一般用于做跟DB相关的操作。

    2个中间件

    中间件是处于两个模块之间的一种特殊hook,它的目的是提供一种简易的机制,通过插拔用户自己写的代码,来扩展新功能。

    典型的数据流

    • (1) Engine启动,从spider中读出要爬的第一个URL
    • (2) Engine将读到的第一个URL送给scheduler
    • (3) Engine向scheduler请求下一个要爬的URL
    • (4) scheduler从队列中读出一个URL,送给Engine,Engine将这个URL送到downloader
    • (5) downloader去GET这个URL,并将HTTP response生成一个Response对象。downloader将生成的Response返回给Engine
    • (6) Engine将这个Response对象发给spider
    • (7) spider处理这个Response对象,提取其中的信息,生成item。还会生成新的请求。并将item和请求送给Engine
    • (8) Engine将收到的请求送给scheduler,将收到的item送给item pipeline
    • (9) 重复步骤(2),直到没有URL需要继续处理

    1、安装:

     1 Linux
     2       pip3 install scrapy
     3  
     4  
     5 Windows
     6       a. pip3 install wheel
     7       b. 下载twisted http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
     8       c. 进入下载目录,执行 pip3 install Twisted‑17.1.0‑cp35‑cp35m‑win_amd64.whl
     9       d. pip3 install scrapy
    10       e. 下载并安装pywin32:https://sourceforge.net/projects/pywin32/files/
    View Code

    2、基本使用命令:

     1 1. scrapy startproject 项目名称
     2    - 在当前目录中创建一个项目文件(类似于Django)
     3  
     4 2. scrapy genspider [-t template] <name> <domain>
     5    - 创建爬虫应用
     6    如:
     7       scrapy genspider -t basic oldboy oldboy.com
     8       scrapy genspider -t xmlfeed autohome autohome.com.cn
     9    PS:
    10       查看所有模板:scrapy genspider -l
    11       查看模板内容:scrapy genspider -d 模板名称
    12  
    13 3. scrapy list
    14    - 展示爬虫应用列表
    15  
    16 4. scrapy crawl 爬虫应用名称
    17    - 运行单独爬虫应用

    3、项目结构以及爬虫应用简介

     1 project_name/
     2    scrapy.cfg
     3    project_name/
     4        __init__.py
     5        items.py
     6        pipelines.py
     7        settings.py
     8        spiders/
     9            __init__.py
    10            爬虫1.py
    11            爬虫2.py
    12            爬虫3.py
    • scrapy.cfg: 项目配置文件
    • project_name/: 项目python模块, 待会代码将从这里导入
    • project_name/items.py: 项目items文件
    • project_name/pipelines.py: 项目管道文件
    • project_name/settings.py: 项目配置文件
    • project_name/spiders: 放置spider的目录
    • project_name/middlewares: 放置中间件文件

    注意:一般创建爬虫文件时,以网站域名命名

     1 import scrapy
     2  
     3 class XiaoHuarSpider(scrapy.spiders.Spider):
     4     name = "xiaohuar"                            # 爬虫名称 *****
     5     allowed_domains = ["xiaohuar.com"]  # 允许的域名
     6     start_urls = [
     7         "http://www.xiaohuar.com/hua/",   # 起始URL
     8     ]
     9  
    10     def parse(self, response):
    11         # 访问起始URL并获取结果后的回调函数
    View Code

    Windows编码问题:

    import sys,os,io
    sys.stdout=io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')

    3、书写爬虫

     1 import scrapy
     2 from scrapy.selector import HtmlXPathSelector
     3 from scrapy.http.request import Request
     4  
     5  
     6 class DigSpider(scrapy.Spider):
     7     # 爬虫应用的名称,通过此名称启动爬虫命令
     8     name = "dig"
     9  
    10     # 允许的域名
    11     allowed_domains = ["chouti.com"]
    12  
    13     # 起始URL
    14     start_urls = [
    15         'http://dig.chouti.com/',
    16     ]
    17  
    18     has_request_set = {}
    19  
    20     def parse(self, response):
    21         print(response.url)
    22  
    23         hxs = HtmlXPathSelector(response)
    24         page_list = hxs.select('//div[@id="dig_lcpage"]//a[re:test(@href, "/all/hot/recent/\d+")]/@href').extract()
    25         for page in page_list:
    26             page_url = 'http://dig.chouti.com%s' % page
    27             key = self.md5(page_url)
    28             if key in self.has_request_set:
    29                 pass
    30             else:
    31                 self.has_request_set[key] = page_url
    32                 obj = Request(url=page_url, method='GET', callback=self.parse)
    33                 yield obj
    34  
    35     @staticmethod
    36     def md5(val):
    37         import hashlib
    38         ha = hashlib.md5()
    39         ha.update(bytes(val, encoding='utf-8'))
    40         key = ha.hexdigest()
    41         return key
    View Code

    执行:

    scrapy crawl dig --nolog

    对于上述代码重要之处在于:

    • Request是一个封装用户请求的类,在回调函数中yield该对象表示继续访问
    • HtmlXPathSelector用于结构化HTML代码并提供选择器功能

    4、选择器:

     1 #!/usr/bin/env python
     2 # -*- coding:utf-8 -*-
     3 from scrapy.selector import Selector, HtmlXPathSelector
     4 from scrapy.http import HtmlResponse
     5 html = """<!DOCTYPE html>
     6 <html>
     7     <head lang="en">
     8         <meta charset="UTF-8">
     9         <title></title>
    10     </head>
    11     <body>
    12         <ul>
    13             <li class="item-"><a id='i1' href="link.html">first item</a></li>
    14             <li class="item-0"><a id='i2' href="llink.html">first item</a></li>
    15             <li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
    16         </ul>
    17         <div><a href="llink2.html">second item</a></div>
    18     </body>
    19 </html>
    20 """
    21 response = HtmlResponse(url='http://example.com', body=html,encoding='utf-8')
    22 # hxs = HtmlXPathSelector(response)
    23 # print(hxs)
    24 # hxs = Selector(response=response).xpath('//a')
    25 # print(hxs)
    26 # hxs = Selector(response=response).xpath('//a[2]')
    27 # print(hxs)
    28 # hxs = Selector(response=response).xpath('//a[@id]')
    29 # print(hxs)
    30 # hxs = Selector(response=response).xpath('//a[@id="i1"]')
    31 # print(hxs)
    32 # hxs = Selector(response=response).xpath('//a[@href="link.html"][@id="i1"]')
    33 # print(hxs)
    34 # hxs = Selector(response=response).xpath('//a[contains(@href, "link")]')
    35 # print(hxs)
    36 # hxs = Selector(response=response).xpath('//a[starts-with(@href, "link")]')
    37 # print(hxs)
    38 # hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]')
    39 # print(hxs)
    40 # hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/text()').extract()
    41 # print(hxs)
    42 # hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/@href').extract()
    43 # print(hxs)
    44 # hxs = Selector(response=response).xpath('/html/body/ul/li/a/@href').extract()
    45 # print(hxs)
    46 # hxs = Selector(response=response).xpath('//body/ul/li/a/@href').extract_first()
    47 # print(hxs)
    48  
    49 # ul_list = Selector(response=response).xpath('//body/ul/li')
    50 # for item in ul_list:
    51 #     v = item.xpath('./a/span')
    52 #     # 或
    53 #     # v = item.xpath('a/span')
    54 #     # 或
    55 #     # v = item.xpath('*/a/span')
    56 #     print(v)
    View Code

    示例:

     1 # -*- coding: utf-8 -*-
     2 import scrapy
     3 from scrapy.selector import HtmlXPathSelector
     4 from scrapy.http.request import Request
     5 from scrapy.http.cookies import CookieJar
     6 from scrapy import FormRequest
     7 
     8 
     9 class ChouTiSpider(scrapy.Spider):
    10     # 爬虫应用的名称,通过此名称启动爬虫命令
    11     name = "chouti"
    12     # 允许的域名
    13     allowed_domains = ["chouti.com"]
    14 
    15     cookie_dict = {}
    16     has_request_set = {}
    17 
    18     def start_requests(self):
    19         url = 'http://dig.chouti.com/'
    20         # return [Request(url=url, callback=self.login)]
    21         yield Request(url=url, callback=self.login)
    22 
    23     def login(self, response):
    24         cookie_jar = CookieJar()
    25         cookie_jar.extract_cookies(response, response.request)
    26         for k, v in cookie_jar._cookies.items():
    27             for i, j in v.items():
    28                 for m, n in j.items():
    29                     self.cookie_dict[m] = n.value
    30 
    31         req = Request(
    32             url='http://dig.chouti.com/login',
    33             method='POST',
    34             headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
    35             body='phone=8615131255089&password=pppppppp&oneMonth=1',
    36             cookies=self.cookie_dict,
    37             callback=self.check_login
    38         )
    39         yield req
    40 
    41     def check_login(self, response):
    42         req = Request(
    43             url='http://dig.chouti.com/',
    44             method='GET',
    45             callback=self.show,
    46             cookies=self.cookie_dict,
    47             dont_filter=True
    48         )
    49         yield req
    50 
    51     def show(self, response):
    52         # print(response)
    53         hxs = HtmlXPathSelector(response)
    54         news_list = hxs.select('//div[@id="content-list"]/div[@class="item"]')
    55         for new in news_list:
    56             # temp = new.xpath('div/div[@class="part2"]/@share-linkid').extract()
    57             link_id = new.xpath('*/div[@class="part2"]/@share-linkid').extract_first()
    58             yield Request(
    59                 url='http://dig.chouti.com/link/vote?linksId=%s' %(link_id,),
    60                 method='POST',
    61                 cookies=self.cookie_dict,
    62                 callback=self.do_favor
    63             )
    64 
    65         page_list = hxs.select('//div[@id="dig_lcpage"]//a[re:test(@href, "/all/hot/recent/\d+")]/@href').extract()
    66         for page in page_list:
    67 
    68             page_url = 'http://dig.chouti.com%s' % page
    69             import hashlib
    70             hash = hashlib.md5()
    71             hash.update(bytes(page_url,encoding='utf-8'))
    72             key = hash.hexdigest()
    73             if key in self.has_request_set:
    74                 pass
    75             else:
    76                 self.has_request_set[key] = page_url
    77                 yield Request(
    78                     url=page_url,
    79                     method='GET',
    80                     callback=self.show
    81                 )
    82 
    83     def do_favor(self, response):
    84         print(response.text)
    登陆抽屉并点赞

    注意:settings.py中设置DEPTH_LIMIT = 1来指定“递归”的层数。

    5. 格式化处理

    上述实例只是简单的处理,所以在parse方法中直接处理。如果想要对获取到的数据做更多处理,则可以利用Scrapy的items将数据格式化,然后统一交由pipelines来处理。

     1 import scrapy
     2 from scrapy.selector import HtmlXPathSelector
     3 from scrapy.http.request import Request
     4 from scrapy.http.cookies import CookieJar
     5 from scrapy import FormRequest
     6 
     7 
     8 class XiaoHuarSpider(scrapy.Spider):
     9     # 爬虫应用的名称,通过此名称启动爬虫命令
    10     name = "xiaohuar"
    11     # 允许的域名
    12     allowed_domains = ["xiaohuar.com"]
    13 
    14     start_urls = [
    15         "http://www.xiaohuar.com/list-1-1.html",
    16     ]
    17     # custom_settings = {
    18     #     'ITEM_PIPELINES':{
    19     #         'spider1.pipelines.JsonPipeline': 100
    20     #     }
    21     # }
    22     has_request_set = {}
    23 
    24     def parse(self, response):
    25         # 分析页面
    26         # 找到页面中符合规则的内容(校花图片),保存
    27         # 找到所有的a标签,再访问其他a标签,一层一层的搞下去
    28 
    29         hxs = HtmlXPathSelector(response)
    30 
    31         items = hxs.select('//div[@class="item_list infinite_scroll"]/div')
    32         for item in items:
    33             src = item.select('.//div[@class="img"]/a/img/@src').extract_first()
    34             name = item.select('.//div[@class="img"]/span/text()').extract_first()
    35             school = item.select('.//div[@class="img"]/div[@class="btns"]/a/text()').extract_first()
    36             url = "http://www.xiaohuar.com%s" % src
    37             from ..items import XiaoHuarItem
    38             obj = XiaoHuarItem(name=name, school=school, url=url)
    39             yield obj
    40 
    41         urls = hxs.select('//a[re:test(@href, "http://www.xiaohuar.com/list-1-\d+.html")]/@href')
    42         for url in urls:
    43             key = self.md5(url)
    44             if key in self.has_request_set:
    45                 pass
    46             else:
    47                 self.has_request_set[key] = url
    48                 req = Request(url=url,method='GET',callback=self.parse)
    49                 yield req
    50 
    51     @staticmethod
    52     def md5(val):
    53         import hashlib
    54         ha = hashlib.md5()
    55         ha.update(bytes(val, encoding='utf-8'))
    56         key = ha.hexdigest()
    57         return key
    spiders/xiaohuar.py
    1 import scrapy
    2 
    3 
    4 class XiaoHuarItem(scrapy.Item):
    5     name = scrapy.Field()
    6     school = scrapy.Field()
    7     url = scrapy.Field()
    item
     1 import json
     2 import os
     3 import requests
     4 
     5 
     6 class JsonPipeline(object):
     7     def __init__(self):
     8         self.file = open('xiaohua.txt', 'w')
     9 
    10     def process_item(self, item, spider):
    11         v = json.dumps(dict(item), ensure_ascii=False)
    12         self.file.write(v)
    13         self.file.write('\n')
    14         self.file.flush()
    15         return item
    16 
    17 
    18 class FilePipeline(object):
    19     def __init__(self):
    20         if not os.path.exists('imgs'):
    21             os.makedirs('imgs')
    22 
    23     def process_item(self, item, spider):
    24         response = requests.get(item['url'], stream=True)
    25         file_name = '%s_%s.jpg' % (item['name'], item['school'])
    26         with open(os.path.join('imgs', file_name), mode='wb') as f:
    27             f.write(response.content)
    28         return item
    pipelines
    1 ITEM_PIPELINES = {
    2    'spider1.pipelines.JsonPipeline': 100,
    3    'spider1.pipelines.FilePipeline': 300,
    4 }
    5 # 每行后面的整型值,确定了他们运行的顺序,item按数字从低到高的顺序,通过pipeline,通常将这些数字定义在0-1000范围内。
    settings

    对于pipeline可以做更多,如下:

     1 from scrapy.exceptions import DropItem
     2 
     3 class CustomPipeline(object):
     4     def __init__(self,v):
     5         self.value = v
     6 
     7     def process_item(self, item, spider):
     8         # 操作并进行持久化
     9 
    10         # return表示会被后续的pipeline继续处理
    11         return item
    12 
    13         # 表示将item丢弃,不会被后续pipeline处理
    14         # raise DropItem()
    15 
    16 
    17     @classmethod
    18     def from_crawler(cls, crawler):
    19         """
    20         初始化时候,用于创建pipeline对象
    21         :param crawler: 
    22         :return: 
    23         """
    24         val = crawler.settings.getint('MMMM')
    25         return cls(val)
    26 
    27     def open_spider(self,spider):
    28         """
    29         爬虫开始执行时,调用
    30         :param spider: 
    31         :return: 
    32         """
    33         print('000000')
    34 
    35     def close_spider(self,spider):
    36         """
    37         爬虫关闭时,被调用
    38         :param spider: 
    39         :return: 
    40         """
    41         print('111111')
    View Code

    6、中间件

     1 class SpiderMiddleware(object):
     2 
     3     def process_spider_input(self,response, spider):
     4         """
     5         下载完成,执行,然后交给parse处理
     6         :param response: 
     7         :param spider: 
     8         :return: 
     9         """
    10         pass
    11 
    12     def process_spider_output(self,response, result, spider):
    13         """
    14         spider处理完成,返回时调用
    15         :param response:
    16         :param result:
    17         :param spider:
    18         :return: 必须返回包含 Request 或 Item 对象的可迭代对象(iterable)
    19         """
    20         return result
    21 
    22     def process_spider_exception(self,response, exception, spider):
    23         """
    24         异常调用
    25         :param response:
    26         :param exception:
    27         :param spider:
    28         :return: None,继续交给后续中间件处理异常;含 Request 或 Item 的可迭代对象(iterable),交给调度器或pipeline
    29         """
    30         return None
    31 
    32 
    33     def process_start_requests(self,start_requests, spider):
    34         """
    35         爬虫启动时调用
    36         :param start_requests:
    37         :param spider:
    38         :return: 包含 Request 对象的可迭代对象
    39         """
    40         return start_requests
    爬虫中间件
     1 class DownMiddleware1(object):
     2     def process_request(self, request, spider):
     3         """
     4         请求需要被下载时,经过所有下载器中间件的process_request调用
     5         :param request: 
     6         :param spider: 
     7         :return:  
     8             None,继续后续中间件去下载;
     9             Response对象,停止process_request的执行,开始执行process_response
    10             Request对象,停止中间件的执行,将Request重新交给调度器
    11             raise IgnoreRequest异常,停止process_request的执行,开始执行process_exception
    12         """
    13         pass
    14 
    15 
    16 
    17     def process_response(self, request, response, spider):
    18         """
    19         下载完成,响应返回时调用
    20         :param request:
    21         :param response:
    22         :param spider:
    23         :return: 
    24             Response 对象:转交给其他中间件process_response
    25             Request 对象:停止中间件,request会被重新调度下载
    26             raise IgnoreRequest 异常:调用Request.errback
    27         """
    28         print('response1')
    29         return response
    30 
    31     def process_exception(self, request, exception, spider):
    32         """
    33         当下载处理器(download handler)或 process_request() (下载中间件)抛出异常时调用
    34         :param request:
    35         :param exception:
    36         :param spider:
    37         :return: 
    38             None:继续交给后续中间件处理异常;
    39             Response对象:停止后续process_exception方法
    40             Request对象:停止中间件,request将会被重新调用下载
    41         """
    42         return None
    下载器中间件

    7、自定制命令

    • 在spiders同级创建任意目录,如:commands
    • 在其中创建 crawlall.py 文件 (此处文件名就是自定义的命令)
     1 from scrapy.commands import ScrapyCommand
     2     from scrapy.utils.project import get_project_settings
     3 
     4 
     5     class Command(ScrapyCommand):
     6 
     7         requires_project = True
     8 
     9         def syntax(self):
    10             return '[options]'
    11 
    12         def short_desc(self):
    13             return 'Runs all of the spiders'
    14 
    15         def run(self, args, opts):
    16             spider_list = self.crawler_process.spiders.list()
    17             for name in spider_list:
    18                 self.crawler_process.crawl(name, **opts.__dict__)
    19             self.crawler_process.start()
    crawlall.py
    • 在settings.py 中添加配置 COMMANDS_MODULE = '项目名称.目录名称'
    • 在项目目录执行命令:scrapy crawlall 

    8、自定义扩展

    自定义扩展时,利用信号在指定位置注册指定操作

     1 from scrapy import signals
     2 
     3 
     4 class MyExtension(object):
     5     def __init__(self, value):
     6         self.value = value
     7 
     8     @classmethod
     9     def from_crawler(cls, crawler):
    10         val = crawler.settings.getint('MMMM')
    11         ext = cls(val)
    12 
    13         crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
    14         crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
    15 
    16         return ext
    17 
    18     def spider_opened(self, spider):
    19         print('open')
    20 
    21     def spider_closed(self, spider):
    22         print('close')
    View Code

    9、避免重复访问

    scrapy默认使用 scrapy.dupefilters.RFPDupeFilter 进行去重,相关配置有:

    1 DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'
    2 DUPEFILTER_DEBUG = False
    3 JOBDIR = "保存访问记录的日志路径,如:/root/"  # 最终路径为 /root/requests.seen
    View Code
     1 class RepeatUrl:
     2     def __init__(self):
     3         self.visited_url = set()
     4 
     5     @classmethod
     6     def from_settings(cls, settings):
     7         """
     8         初始化时,调用
     9         :param settings: 
    10         :return: 
    11         """
    12         return cls()
    13 
    14     def request_seen(self, request):
    15         """
    16         检测当前请求是否已经被访问过
    17         :param request: 
    18         :return: True表示已经访问过;False表示未访问过
    19         """
    20         if request.url in self.visited_url:
    21             return True
    22         self.visited_url.add(request.url)
    23         return False
    24 
    25     def open(self):
    26         """
    27         开始爬取请求时,调用
    28         :return: 
    29         """
    30         print('open replication')
    31 
    32     def close(self, reason):
    33         """
    34         结束爬虫爬取时,调用
    35         :param reason: 
    36         :return: 
    37         """
    38         print('close replication')
    39 
    40     def log(self, request, spider):
    41         """
    42         记录日志
    43         :param request: 
    44         :param spider: 
    45         :return: 
    46         """
    47         print('repeat', request.url)
    自定义url去重

    10、settings详解

      1 # -*- coding: utf-8 -*-
      2 
      3 # Scrapy settings for step8_king project
      4 #
      5 # For simplicity, this file contains only settings considered important or
      6 # commonly used. You can find more settings consulting the documentation:
      7 #
      8 #     http://doc.scrapy.org/en/latest/topics/settings.html
      9 #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
     10 #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
     11 
     12 # 1. 爬虫名称
     13 BOT_NAME = 'step8_king'
     14 
     15 # 2. 爬虫应用路径
     16 SPIDER_MODULES = ['step8_king.spiders']
     17 NEWSPIDER_MODULE = 'step8_king.spiders'
     18 
     19 # Crawl responsibly by identifying yourself (and your website) on the user-agent
     20 # 3. 客户端 user-agent请求头
     21 # USER_AGENT = 'step8_king (+http://www.yourdomain.com)'
     22 
     23 # Obey robots.txt rules
     24 # 4. 禁止爬虫配置
     25 # ROBOTSTXT_OBEY = False
     26 
     27 # Configure maximum concurrent requests performed by Scrapy (default: 16)
     28 # 5. 并发请求数
     29 # CONCURRENT_REQUESTS = 4
     30 
     31 # Configure a delay for requests for the same website (default: 0)
     32 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
     33 # See also autothrottle settings and docs
     34 # 6. 延迟下载秒数
     35 # DOWNLOAD_DELAY = 2
     36 
     37 
     38 # The download delay setting will honor only one of:
     39 # 7. 单域名访问并发数,并且延迟下载秒数也应用在每个域名
     40 # CONCURRENT_REQUESTS_PER_DOMAIN = 2
     41 # 单IP访问并发数,如果有值则忽略:CONCURRENT_REQUESTS_PER_DOMAIN,并且延迟下载秒数也应用在每个IP
     42 # CONCURRENT_REQUESTS_PER_IP = 3
     43 
     44 # Disable cookies (enabled by default)
     45 # 8. 是否支持cookie,cookiejar进行操作cookie
     46 # COOKIES_ENABLED = True
     47 # COOKIES_DEBUG = True
     48 
     49 # Disable Telnet Console (enabled by default)
     50 # 9. Telnet用于查看当前爬虫的信息,操作爬虫等...
     51 #    使用telnet ip port ,然后通过命令操作
     52 # TELNETCONSOLE_ENABLED = True
     53 # TELNETCONSOLE_HOST = '127.0.0.1'
     54 # TELNETCONSOLE_PORT = [6023,]
     55 
     56 
     57 # 10. 默认请求头
     58 # Override the default request headers:
     59 # DEFAULT_REQUEST_HEADERS = {
     60 #     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
     61 #     'Accept-Language': 'en',
     62 # }
     63 
     64 
     65 # Configure item pipelines
     66 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
     67 # 11. 定义pipeline处理请求
     68 # ITEM_PIPELINES = {
     69 #    'step8_king.pipelines.JsonPipeline': 700,
     70 #    'step8_king.pipelines.FilePipeline': 500,
     71 # }
     72 
     73 
     74 
     75 # 12. 自定义扩展,基于信号进行调用
     76 # Enable or disable extensions
     77 # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
     78 # EXTENSIONS = {
     79 #     # 'step8_king.extensions.MyExtension': 500,
     80 # }
     81 
     82 
     83 # 13. 爬虫允许的最大深度,可以通过meta查看当前深度;0表示不限制深度
     84 # DEPTH_LIMIT = 3
     85 
     86 # 14. 爬取时,0表示深度优先Lifo(默认);1表示广度优先FiFo
     87 
     88 # 后进先出,深度优先
     89 # DEPTH_PRIORITY = 0
     90 # SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleLifoDiskQueue'
     91 # SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.LifoMemoryQueue'
     92 # 先进先出,广度优先
     93 
     94 # DEPTH_PRIORITY = 1
     95 # SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleFifoDiskQueue'
     96 # SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.FifoMemoryQueue'
     97 
     98 # 15. 调度器队列
     99 # SCHEDULER = 'scrapy.core.scheduler.Scheduler'
    100 # from scrapy.core.scheduler import Scheduler
    101 
    102 
    103 # 16. 访问URL去重
    104 # DUPEFILTER_CLASS = 'step8_king.duplication.RepeatUrl'
    105 
    106 
    107 # Enable and configure the AutoThrottle extension (disabled by default)
    108 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
    109 
    110 """
    111 17. 自动限速算法
    112     from scrapy.contrib.throttle import AutoThrottle
    113     自动限速设置
    114     1. 获取最小延迟 DOWNLOAD_DELAY
    115     2. 获取最大延迟 AUTOTHROTTLE_MAX_DELAY
    116     3. 设置初始下载延迟 AUTOTHROTTLE_START_DELAY
    117     4. 当请求下载完成后,获取其"连接"时间 latency,即:请求连接到接收到响应头之间的时间
    118     5. 用于计算的... AUTOTHROTTLE_TARGET_CONCURRENCY
    119     target_delay = latency / self.target_concurrency
    120     new_delay = (slot.delay + target_delay) / 2.0 # 表示上一次的延迟时间
    121     new_delay = max(target_delay, new_delay)
    122     new_delay = min(max(self.mindelay, new_delay), self.maxdelay)
    123     slot.delay = new_delay
    124 """
    125 
    126 # 开始自动限速
    127 # AUTOTHROTTLE_ENABLED = True
    128 # The initial download delay
    129 # 初始下载延迟
    130 # AUTOTHROTTLE_START_DELAY = 5
    131 # The maximum download delay to be set in case of high latencies
    132 # 最大下载延迟
    133 # AUTOTHROTTLE_MAX_DELAY = 10
    134 # The average number of requests Scrapy should be sending in parallel to each remote server
    135 # 对每个远程服务器并行发送请求的平均数量
    136 # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    137 
    138 # Enable showing throttling stats for every response received:
    139 # 是否显示每个响应的限速统计信息
    140 # AUTOTHROTTLE_DEBUG = True
    141 
    142 # Enable and configure HTTP caching (disabled by default)
    143 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    144 
    145 
    146 """
    147 18. 启用缓存
    148     目的:将已经发送的请求或响应缓存下来,以便以后使用
    149     
    150     from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware
    151     from scrapy.extensions.httpcache import DummyPolicy
    152     from scrapy.extensions.httpcache import FilesystemCacheStorage
    153 """
    154 # 是否启用缓存策略
    155 # HTTPCACHE_ENABLED = True
    156 
    157 # 缓存策略:所有请求均缓存,下次再请求时直接访问原来的缓存即可
    158 # HTTPCACHE_POLICY = "scrapy.extensions.httpcache.DummyPolicy"
    159 # 缓存策略:根据Http响应头:Cache-Control、Last-Modified 等进行缓存的策略
    160 # HTTPCACHE_POLICY = "scrapy.extensions.httpcache.RFC2616Policy"
    161 
    162 # 缓存超时时间
    163 # HTTPCACHE_EXPIRATION_SECS = 0
    164 
    165 # 缓存保存路径
    166 # HTTPCACHE_DIR = 'httpcache'
    167 
    168 # 缓存忽略的Http状态码
    169 # HTTPCACHE_IGNORE_HTTP_CODES = []
    170 
    171 # 缓存存储的插件
    172 # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
    173 
    174 
    175 """
    176 19. 代理,需要在环境变量中设置
    177     from scrapy.contrib.downloadermiddleware.httpproxy import HttpProxyMiddleware
    178     
    179     方式一:使用默认
    180         os.environ
    181         {
    182             http_proxy:http://root:woshiniba@192.168.11.11:9999/
    183             https_proxy:http://192.168.11.11:9999/
    184         }
    185     方式二:使用自定义下载中间件
    186     
    187     def to_bytes(text, encoding=None, errors='strict'):
    188         if isinstance(text, bytes):
    189             return text
    190         if not isinstance(text, six.string_types):
    191             raise TypeError('to_bytes must receive a unicode, str or bytes '
    192                             'object, got %s' % type(text).__name__)
    193         if encoding is None:
    194             encoding = 'utf-8'
    195         return text.encode(encoding, errors)
    196         
    197     class ProxyMiddleware(object):
    198         def process_request(self, request, spider):
    199             PROXIES = [
    200                 {'ip_port': '111.11.228.75:80', 'user_pass': ''},
    201                 {'ip_port': '120.198.243.22:80', 'user_pass': ''},
    202                 {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
    203                 {'ip_port': '101.71.27.120:80', 'user_pass': ''},
    204                 {'ip_port': '122.96.59.104:80', 'user_pass': ''},
    205                 {'ip_port': '122.224.249.122:8088', 'user_pass': ''},
    206             ]
    207             proxy = random.choice(PROXIES)
    208             if proxy['user_pass'] is not None:
    209                 request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
    210                 encoded_user_pass = base64.encodestring(to_bytes(proxy['user_pass']))
    211                 request.headers['Proxy-Authorization'] = to_bytes('Basic ' + encoded_user_pass)
    212                 print "**************ProxyMiddleware have pass************" + proxy['ip_port']
    213             else:
    214                 print "**************ProxyMiddleware no pass************" + proxy['ip_port']
    215                 request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
    216     
    217     DOWNLOADER_MIDDLEWARES = {
    218        'step8_king.middlewares.ProxyMiddleware': 500,
    219     }
    220     
    221 """
    222 
    223 """
    224 20. Https访问
    225     Https访问时有两种情况:
    226     1. 要爬取网站使用的可信任证书(默认支持)
    227         DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
    228         DOWNLOADER_CLIENTCONTEXTFACTORY = "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory"
    229         
    230     2. 要爬取网站使用的自定义证书
    231         DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
    232         DOWNLOADER_CLIENTCONTEXTFACTORY = "step8_king.https.MySSLFactory"
    233         
    234         # https.py
    235         from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
    236         from twisted.internet.ssl import (optionsForClientTLS, CertificateOptions, PrivateCertificate)
    237         
    238         class MySSLFactory(ScrapyClientContextFactory):
    239             def getCertificateOptions(self):
    240                 from OpenSSL import crypto
    241                 v1 = crypto.load_privatekey(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.key.unsecure', mode='r').read())
    242                 v2 = crypto.load_certificate(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.pem', mode='r').read())
    243                 return CertificateOptions(
    244                     privateKey=v1,  # pKey对象
    245                     certificate=v2,  # X509对象
    246                     verify=False,
    247                     method=getattr(self, 'method', getattr(self, '_ssl_method', None))
    248                 )
    249     其他:
    250         相关类
    251             scrapy.core.downloader.handlers.http.HttpDownloadHandler
    252             scrapy.core.downloader.webclient.ScrapyHTTPClientFactory
    253             scrapy.core.downloader.contextfactory.ScrapyClientContextFactory
    254         相关配置
    255             DOWNLOADER_HTTPCLIENTFACTORY
    256             DOWNLOADER_CLIENTCONTEXTFACTORY
    257 
    258 """
    259 
    260 
    261 
    262 """
    263 21. 爬虫中间件
    264     class SpiderMiddleware(object):
    265 
    266         def process_spider_input(self,response, spider):
    267             '''
    268             下载完成,执行,然后交给parse处理
    269             :param response: 
    270             :param spider: 
    271             :return: 
    272             '''
    273             pass
    274     
    275         def process_spider_output(self,response, result, spider):
    276             '''
    277             spider处理完成,返回时调用
    278             :param response:
    279             :param result:
    280             :param spider:
    281             :return: 必须返回包含 Request 或 Item 对象的可迭代对象(iterable)
    282             '''
    283             return result
    284     
    285         def process_spider_exception(self,response, exception, spider):
    286             '''
    287             异常调用
    288             :param response:
    289             :param exception:
    290             :param spider:
    291             :return: None,继续交给后续中间件处理异常;含 Request 或 Item 的可迭代对象(iterable),交给调度器或pipeline
    292             '''
    293             return None
    294     
    295     
    296         def process_start_requests(self,start_requests, spider):
    297             '''
    298             爬虫启动时调用
    299             :param start_requests:
    300             :param spider:
    301             :return: 包含 Request 对象的可迭代对象
    302             '''
    303             return start_requests
    304     
    305     内置爬虫中间件:
    306         'scrapy.contrib.spidermiddleware.httperror.HttpErrorMiddleware': 50,
    307         'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': 500,
    308         'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': 700,
    309         'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware': 800,
    310         'scrapy.contrib.spidermiddleware.depth.DepthMiddleware': 900,
    311 
    312 """
    313 # from scrapy.contrib.spidermiddleware.referer import RefererMiddleware
    314 # Enable or disable spider middlewares
    315 # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
    316 SPIDER_MIDDLEWARES = {
    317    # 'step8_king.middlewares.SpiderMiddleware': 543,
    318 }
    319 
    320 
    321 """
    322 22. 下载中间件
    323     class DownMiddleware1(object):
    324         def process_request(self, request, spider):
    325             '''
    326             请求需要被下载时,经过所有下载器中间件的process_request调用
    327             :param request:
    328             :param spider:
    329             :return:
    330                 None,继续后续中间件去下载;
    331                 Response对象,停止process_request的执行,开始执行process_response
    332                 Request对象,停止中间件的执行,将Request重新交给调度器
    333                 raise IgnoreRequest异常,停止process_request的执行,开始执行process_exception
    334             '''
    335             pass
    336     
    337     
    338     
    339         def process_response(self, request, response, spider):
    340             '''
    341             下载完成,响应返回时调用
    342             :param request:
    343             :param response:
    344             :param spider:
    345             :return:
    346                 Response 对象:转交给其他中间件process_response
    347                 Request 对象:停止中间件,request会被重新调度下载
    348                 raise IgnoreRequest 异常:调用Request.errback
    349             '''
    350             print('response1')
    351             return response
    352     
    353         def process_exception(self, request, exception, spider):
    354             '''
    355             当下载处理器(download handler)或 process_request() (下载中间件)抛出异常时调用
    356             :param request:
    357             :param exception:
    358             :param spider:
    359             :return:
    360                 None:继续交给后续中间件处理异常;
    361                 Response对象:停止后续process_exception方法
    362                 Request对象:停止中间件,request将会被重新调用下载
    363             '''
    364             return None
    365 
    366     
    367     默认下载中间件
    368     {
    369         'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': 100,
    370         'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware': 300,
    371         'scrapy.contrib.downloadermiddleware.downloadtimeout.DownloadTimeoutMiddleware': 350,
    372         'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400,
    373         'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500,
    374         'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550,
    375         'scrapy.contrib.downloadermiddleware.redirect.MetaRefreshMiddleware': 580,
    376         'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 590,
    377         'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': 600,
    378         'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware': 700,
    379         'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 750,
    380         'scrapy.contrib.downloadermiddleware.chunked.ChunkedTransferMiddleware': 830,
    381         'scrapy.contrib.downloadermiddleware.stats.DownloaderStats': 850,
    382         'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware': 900,
    383     }
    384 
    385 """
    386 # from scrapy.contrib.downloadermiddleware.httpauth import HttpAuthMiddleware
    387 # Enable or disable downloader middlewares
    388 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
    389 # DOWNLOADER_MIDDLEWARES = {
    390 #    'step8_king.middlewares.DownMiddleware1': 100,
    391 #    'step8_king.middlewares.DownMiddleware2': 500,
    392 # }
    View Code

    11、TinyScrapy

      1 #!/usr/bin/env python
      2 # -*- coding:utf-8 -*-
      3 import types
      4 from twisted.internet import defer
      5 from twisted.web.client import getPage
      6 from twisted.internet import reactor
      7 
      8 
      9 
     10 class Request(object):
     11     def __init__(self, url, callback):
     12         self.url = url
     13         self.callback = callback
     14         self.priority = 0
     15 
     16 
     17 class HttpResponse(object):
     18     def __init__(self, content, request):
     19         self.content = content
     20         self.request = request
     21 
     22 
     23 class ChouTiSpider(object):
     24 
     25     def start_requests(self):
     26         url_list = ['http://www.cnblogs.com/', 'http://www.bing.com']
     27         for url in url_list:
     28             yield Request(url=url, callback=self.parse)
     29 
     30     def parse(self, response):
     31         print(response.request.url)
     32         # yield Request(url="http://www.baidu.com", callback=self.parse)
     33 
     34 
     35 
     36 
     37 from queue import Queue
     38 Q = Queue()
     39 
     40 
     41 class CallLaterOnce(object):
     42     def __init__(self, func, *a, **kw):
     43         self._func = func
     44         self._a = a
     45         self._kw = kw
     46         self._call = None
     47 
     48     def schedule(self, delay=0):
     49         if self._call is None:
     50             self._call = reactor.callLater(delay, self)
     51 
     52     def cancel(self):
     53         if self._call:
     54             self._call.cancel()
     55 
     56     def __call__(self):
     57         self._call = None
     58         return self._func(*self._a, **self._kw)
     59 
     60 
     61 class Engine(object):
     62     def __init__(self):
     63         self.nextcall = None
     64         self.crawlling = []
     65         self.max = 5
     66         self._closewait = None
     67 
     68     def get_response(self,content, request):
     69         response = HttpResponse(content, request)
     70         gen = request.callback(response)
     71         if isinstance(gen, types.GeneratorType):
     72             for req in gen:
     73                 req.priority = request.priority + 1
     74                 Q.put(req)
     75 
     76 
     77     def rm_crawlling(self,response,d):
     78         self.crawlling.remove(d)
     79 
     80     def _next_request(self,spider):
     81         if Q.qsize() == 0 and len(self.crawlling) == 0:
     82             self._closewait.callback(None)
     83 
     84         if len(self.crawlling) >= 5:
     85             return
     86         while len(self.crawlling) < 5:
     87             try:
     88                 req = Q.get(block=False)
     89             except Exception as e:
     90                 req = None
     91             if not req:
     92                 return
     93             d = getPage(req.url.encode('utf-8'))
     94             self.crawlling.append(d)
     95             d.addCallback(self.get_response, req)
     96             d.addCallback(self.rm_crawlling,d)
     97             d.addCallback(lambda _: self.nextcall.schedule())
     98 
     99 
    100     @defer.inlineCallbacks
    101     def crawl(self):
    102         spider = ChouTiSpider()
    103         start_requests = iter(spider.start_requests())
    104         flag = True
    105         while flag:
    106             try:
    107                 req = next(start_requests)
    108                 Q.put(req)
    109             except StopIteration as e:
    110                 flag = False
    111 
    112         self.nextcall = CallLaterOnce(self._next_request,spider)
    113         self.nextcall.schedule()
    114 
    115         self._closewait = defer.Deferred()
    116         yield self._closewait
    117 
    118     @defer.inlineCallbacks
    119     def pp(self):
    120         yield self.crawl()
    121 
    122 _active = set()
    123 obj = Engine()
    124 d = obj.crawl()
    125 _active.add(d)
    126 
    127 li = defer.DeferredList(_active)
    128 li.addBoth(lambda _,*a,**kw: reactor.stop())
    129 
    130 reactor.run()
    View Code
  • 相关阅读:
    Java 连接Redis客户端 Jedis
    Redis的基本类型
    [Windows Server 2008] 搭建数据云备份
    [Windows Server 2008] PHP安装Memcached
    五大免费主机系统
    当前主要的常用的PHP环境部署套件比较
    [Windows Server 2008] 404错误设置方法
    [Windows Server 2008] IIS配置伪静态方法(Web.config模式的IIS rewrite)
    护卫神·云查杀系统V4.0-安全检测部分
    [Windows Server 2008] 阿里云.云主机忘记密码解决方法
  • 原文地址:https://www.cnblogs.com/wangshuyang/p/7717263.html
Copyright © 2011-2022 走看看