  • Crawlers

    Performance

    When writing a crawler, most of the time is spent waiting on IO: with a single process and a single thread, every URL request blocks until the response arrives, which slows down the whole job.

    import requests

    def fetch_async(url):
        response = requests.get(url)
        return response


    url_list = ['http://www.github.com', 'http://www.bing.com']

    for url in url_list:
        fetch_async(url)
    1. Synchronous execution
    from concurrent.futures import ThreadPoolExecutor
    import requests


    def fetch_async(url):
        response = requests.get(url)
        return response


    url_list = ['http://www.github.com', 'http://www.bing.com']
    pool = ThreadPoolExecutor(5)
    for url in url_list:
        pool.submit(fetch_async, url)
    pool.shutdown(wait=True)
    2. Thread pool execution
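    pool.submit returns a Future, so if the responses themselves are needed rather than fired and forgotten, they can be collected, for example with as_completed; a minimal sketch along the lines of the code above:

    from concurrent.futures import ThreadPoolExecutor, as_completed
    import requests


    def fetch_async(url):
        response = requests.get(url)
        return response


    url_list = ['http://www.github.com', 'http://www.bing.com']
    with ThreadPoolExecutor(5) as pool:
        # map each future back to its URL so results can be reported as they finish
        future_to_url = {pool.submit(fetch_async, url): url for url in url_list}
        for future in as_completed(future_to_url):
            print(future_to_url[future], future.result().status_code)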
    from concurrent.futures import ThreadPoolExecutor
    import requests

    def fetch_async(url):
        response = requests.get(url)
        return response


    def callback(future):
        print(future.result())


    url_list = ['http://www.github.com', 'http://www.bing.com']
    pool = ThreadPoolExecutor(5)
    for url in url_list:
        v = pool.submit(fetch_async, url)
        v.add_done_callback(callback)
    pool.shutdown(wait=True)
    2. Thread pool + callback
    from concurrent.futures import ProcessPoolExecutor
    import requests

    def fetch_async(url):
        response = requests.get(url)
        return response


    url_list = ['http://www.github.com', 'http://www.bing.com']
    pool = ProcessPoolExecutor(5)
    for url in url_list:
        pool.submit(fetch_async, url)
    pool.shutdown(wait=True)
    3. Process pool execution
    from concurrent.futures import ProcessPoolExecutor
    import requests


    def fetch_async(url):
        response = requests.get(url)
        return response


    def callback(future):
        print(future.result())


    url_list = ['http://www.github.com', 'http://www.bing.com']
    pool = ProcessPoolExecutor(5)
    for url in url_list:
        v = pool.submit(fetch_async, url)
        v.add_done_callback(callback)
    pool.shutdown(wait=True)
    3. Process pool + callback

    All of the above improve request throughput. The drawback of threads and processes is that they sit idle while blocked on IO, wasting resources, so asynchronous IO is usually the preferred approach:

    import asyncio


    @asyncio.coroutine
    def func1():
        print('before...func1......')
        yield from asyncio.sleep(5)
        print('end...func1......')


    tasks = [func1(), func1()]

    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.gather(*tasks))
    loop.close()
    1. asyncio example 1
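    On Python 3.5+ the same coroutine is usually written with async/await instead of @asyncio.coroutine and yield from (which are deprecated in newer Python versions); a minimal sketch of the example above in that style:

    import asyncio


    async def func1():
        print('before...func1......')
        await asyncio.sleep(5)
        print('end...func1......')


    tasks = [func1(), func1()]

    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.gather(*tasks))
    loop.close()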
    import asyncio


    @asyncio.coroutine
    def fetch_async(host, url='/'):
        print(host, url)
        reader, writer = yield from asyncio.open_connection(host, 80)

        # build a minimal HTTP/1.0 request by hand
        request_header_content = "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n" % (url, host,)
        request_header_content = bytes(request_header_content, encoding='utf-8')

        writer.write(request_header_content)
        yield from writer.drain()
        text = yield from reader.read()
        print(host, url, text)
        writer.close()

    tasks = [
        fetch_async('www.cnblogs.com', '/wupeiqi/'),
        fetch_async('dig.chouti.com', '/pic/show?nid=4073644713430508&lid=10273091')
    ]

    loop = asyncio.get_event_loop()
    results = loop.run_until_complete(asyncio.gather(*tasks))
    loop.close()
    1. asyncio example 2
    import aiohttp
    import asyncio


    @asyncio.coroutine
    def fetch_async(url):
        print(url)
        response = yield from aiohttp.request('GET', url)
        # data = yield from response.read()
        # print(url, data)
        print(url, response)
        response.close()


    tasks = [fetch_async('http://www.google.com/'), fetch_async('http://www.chouti.com/')]

    event_loop = asyncio.get_event_loop()
    results = event_loop.run_until_complete(asyncio.gather(*tasks))
    event_loop.close()
    2. asyncio + aiohttp
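    The aiohttp.request coroutine used above comes from old aiohttp releases; on aiohttp 3.x the same fetch would be written with a ClientSession and async/await, roughly like this (a sketch):

    import aiohttp
    import asyncio


    async def fetch_async(url):
        # one session can be reused for many requests; here each task opens its own for simplicity
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                data = await response.read()
                print(url, len(data))


    tasks = [fetch_async('http://www.google.com/'), fetch_async('http://www.chouti.com/')]

    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(asyncio.gather(*tasks))
    event_loop.close()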
    import asyncio
    import requests


    @asyncio.coroutine
    def fetch_async(func, *args):
        loop = asyncio.get_event_loop()
        future = loop.run_in_executor(None, func, *args)
        response = yield from future
        print(response.url, response.content)


    tasks = [
        fetch_async(requests.get, 'http://www.cnblogs.com/wupeiqi/'),
        fetch_async(requests.get, 'http://dig.chouti.com/pic/show?nid=4073644713430508&lid=10273091')
    ]

    loop = asyncio.get_event_loop()
    results = loop.run_until_complete(asyncio.gather(*tasks))
    loop.close()
    3. asyncio + requests
    import gevent
    from gevent import monkey

    # patch the standard library (sockets in particular) before requests is used
    monkey.patch_all()

    import requests


    def fetch_async(method, url, req_kwargs):
        print(method, url, req_kwargs)
        response = requests.request(method=method, url=url, **req_kwargs)
        print(response.url, response.content)

    # ##### send the requests #####
    gevent.joinall([
        gevent.spawn(fetch_async, method='get', url='https://www.python.org/', req_kwargs={}),
        gevent.spawn(fetch_async, method='get', url='https://www.yahoo.com/', req_kwargs={}),
        gevent.spawn(fetch_async, method='get', url='https://github.com/', req_kwargs={}),
    ])

    # ##### send the requests (a pool caps the number of concurrent greenlets) #####
    # from gevent.pool import Pool
    # pool = Pool(None)
    # gevent.joinall([
    #     pool.spawn(fetch_async, method='get', url='https://www.python.org/', req_kwargs={}),
    #     pool.spawn(fetch_async, method='get', url='https://www.yahoo.com/', req_kwargs={}),
    #     pool.spawn(fetch_async, method='get', url='https://www.github.com/', req_kwargs={}),
    # ])
    4. gevent + requests
    import grequests


    request_list = [
        grequests.get('http://httpbin.org/delay/1', timeout=0.001),
        grequests.get('http://fakedomain/'),
        grequests.get('http://httpbin.org/status/500')
    ]


    # ##### run the requests and collect the responses #####
    # response_list = grequests.map(request_list)
    # print(response_list)


    # ##### run the requests and collect the responses (with exception handling) #####
    # def exception_handler(request, exception):
    #     print(request, exception)
    #     print("Request failed")

    # response_list = grequests.map(request_list, exception_handler=exception_handler)
    # print(response_list)
    5. grequests
    from twisted.web.client import getPage
    from twisted.internet import reactor

    REV_COUNTER = 0
    REQ_COUNTER = 0

    def callback(contents):
        print(contents)

        global REV_COUNTER
        REV_COUNTER += 1
        # stop the reactor once every request has been answered
        if REV_COUNTER == REQ_COUNTER:
            reactor.stop()


    url_list = ['http://www.bing.com', 'http://www.baidu.com', ]
    REQ_COUNTER = len(url_list)
    for url in url_list:
        deferred = getPage(bytes(url, encoding='utf8'))
        deferred.addCallback(callback)
    reactor.run()
    6. Twisted example 1
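    Note that newer Twisted releases deprecate getPage; a roughly equivalent version built on twisted.web.client.Agent might look like this (a sketch, not from the original post):

    from twisted.internet import reactor, defer
    from twisted.web.client import Agent, readBody


    def handle_body(body, url):
        print(url, len(body))


    def handle_response(response, url):
        # readBody returns a Deferred that fires with the complete response body
        d = readBody(response)
        d.addCallback(handle_body, url)
        return d


    agent = Agent(reactor)
    url_list = ['http://www.bing.com', 'http://www.baidu.com']

    deferred_list = []
    for url in url_list:
        d = agent.request(b'GET', bytes(url, encoding='utf8'))
        d.addCallback(handle_response, url)
        deferred_list.append(d)

    # stop the reactor once every request has either succeeded or failed
    defer.DeferredList(deferred_list, consumeErrors=True).addBoth(lambda _: reactor.stop())
    reactor.run()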
    from twisted.web.client import getPage
    from twisted.internet import reactor


    class TwistedRequest(object):
        def __init__(self):
            self.__req_counter = 0
            self.__rev_counter = 0

        def __execute(self, content, url, callback):
            if callback:
                callback(url, content)
            self.__rev_counter += 1
            if self.__rev_counter == self.__req_counter:
                reactor.stop()

        def fetch_url(self, url_callback_list):

            self.__req_counter = len(url_callback_list)

            for item in url_callback_list:
                url = item['url']
                success_callback = item['success_callback']
                error_callback = item['error_callback']

                deferred = getPage(bytes(url, encoding='utf8'))
                deferred.addCallback(self.__execute, url, success_callback)
                deferred.addErrback(self.__execute, url, error_callback)

            reactor.run()


    def callback(url, content):
        print(url, content)


    def error(url, content):
        print(url, content)


    obj = TwistedRequest()
    obj.fetch_url([
        {'url': 'http://www.baidu.com', 'success_callback': callback, 'error_callback': error},
        {'url': 'http://www.google.com', 'success_callback': callback, 'error_callback': error},
    ])
    6. Twisted example 2
    from tornado.httpclient import AsyncHTTPClient
    from tornado.httpclient import HTTPRequest
    from tornado import ioloop


    def handle_response(response):
        if response.error:
            print("Error:", response.error)
        else:
            print(response.body)
            # same idea as the Twisted examples: count responses and stop the IOLoop when done
            # ioloop.IOLoop.current().stop()


    def func():
        url_list = [
            'http://www.google.com',
            'http://127.0.0.1:8000/test2/',
        ]
        for url in url_list:
            print(url)
            http_client = AsyncHTTPClient()
            http_client.fetch(HTTPRequest(url), handle_response)


    ioloop.IOLoop.current().add_callback(func)
    ioloop.IOLoop.current().start()
    7. tornado

    All of the above are asynchronous IO request modules, either built into Python or provided by third parties; they are easy to use and greatly improve efficiency. Under the hood, asynchronous IO boils down to non-blocking sockets plus IO multiplexing:

    import select
    import socket
    import time


    class AsyncTimeoutException(TimeoutError):
        """
        Raised when a request times out.
        """

        def __init__(self, msg):
            self.msg = msg
            super(AsyncTimeoutException, self).__init__(msg)


    class HttpContext(object):
        """Wraps the basic data of a request and its response."""

        def __init__(self, sock, host, port, method, url, data, callback, timeout=5):
            """
            sock: client socket object for the request
            host: host name to request
            port: port to request
            method: HTTP method
            url: URL to request
            data: request body data
            callback: callback to run when the request completes
            timeout: request timeout in seconds
            """
            self.sock = sock
            self.callback = callback
            self.host = host
            self.port = port
            self.method = method
            self.url = url
            self.data = data

            self.timeout = timeout

            self.__start_time = time.time()
            self.__buffer = []

        def is_timeout(self):
            """Has this request already timed out?"""
            current_time = time.time()
            if (self.__start_time + self.timeout) < current_time:
                return True

        def fileno(self):
            """File descriptor of the request socket, so select can monitor this object directly."""
            return self.sock.fileno()

        def write(self, data):
            """Append a chunk of response data to the buffer."""
            self.__buffer.append(data)

        def finish(self, exc=None):
            """The response is complete (or failed); run the request's callback."""
            if not exc:
                response = b''.join(self.__buffer)
                self.callback(self, response, exc)
            else:
                self.callback(self, None, exc)

        def send_request_data(self):
            # build a minimal HTTP/1.0 request by hand
            content = "%s %s HTTP/1.0\r\nHost: %s\r\n\r\n%s" % (
                self.method.upper(), self.url, self.host, self.data,)

            return content.encode(encoding='utf8')


    class AsyncRequest(object):
        def __init__(self):
            self.fds = []
            self.connections = []

        def add_request(self, host, port, method, url, data, callback, timeout):
            """Create a new request."""
            client = socket.socket()
            client.setblocking(False)
            try:
                client.connect((host, port))
            except BlockingIOError as e:
                # the non-blocking connect is in progress; this is expected
                pass
            req = HttpContext(client, host, port, method, url, data, callback, timeout)
            self.connections.append(req)
            self.fds.append(req)

        def check_conn_timeout(self):
            """Check all pending requests and abort any that have timed out."""
            timeout_list = []
            for context in self.connections:
                if context.is_timeout():
                    timeout_list.append(context)
            for context in timeout_list:
                context.finish(AsyncTimeoutException('request timed out'))
                self.fds.remove(context)
                self.connections.remove(context)

        def running(self):
            """Event loop: watch the request sockets and act as each one becomes ready."""
            while True:
                r, w, e = select.select(self.fds, self.connections, self.fds, 0.05)

                if not self.fds:
                    return

                for context in r:
                    sock = context.sock
                    while True:
                        try:
                            data = sock.recv(8096)
                            if not data:
                                self.fds.remove(context)
                                context.finish()
                                break
                            else:
                                context.write(data)
                        except BlockingIOError as e:
                            break
                        except TimeoutError as e:
                            self.fds.remove(context)
                            self.connections.remove(context)
                            context.finish(e)
                            break

                for context in w:
                    # the connection to the remote server is established; send the request data
                    if context in self.fds:
                        data = context.send_request_data()
                        context.sock.sendall(data)
                        self.connections.remove(context)

                self.check_conn_timeout()


    if __name__ == '__main__':
        def callback_func(context, response, ex):
            """
            :param context: HttpContext object wrapping the request details
            :param response: response content
            :param ex: exception object if one occurred, otherwise None
            :return:
            """
            print(context, response, ex)

        obj = AsyncRequest()
        url_list = [
            {'host': 'www.google.com', 'port': 80, 'method': 'GET', 'url': '/', 'data': '', 'timeout': 5,
             'callback': callback_func},
            {'host': 'www.baidu.com', 'port': 80, 'method': 'GET', 'url': '/', 'data': '', 'timeout': 5,
             'callback': callback_func},
            {'host': 'www.bing.com', 'port': 80, 'method': 'GET', 'url': '/', 'data': '', 'timeout': 5,
             'callback': callback_func},
        ]
        for item in url_list:
            print(item)
            obj.add_request(**item)

        obj.running()
    A hand-rolled asynchronous IO module

    Scrapy

    Scrapy is an application framework written to crawl web sites and extract structured data. It can be used in a wide range of programs, from data mining and information processing to archiving historical data.
    It was originally designed for page scraping (more precisely, web scraping), but it can also be used to fetch data returned by APIs (such as Amazon Associates Web Services) or as a general-purpose web crawler. Scrapy is widely used for data mining, monitoring and automated testing.

    Scrapy uses the Twisted asynchronous networking library to handle network communication; its overall architecture is outlined by the components below.

    Scrapy's main components are:

    • Engine (Scrapy)
      Handles the data flow of the whole system and triggers events (the core of the framework).
    • Scheduler
      Accepts requests from the engine, pushes them onto a queue, and hands them back when the engine asks again. Think of it as a priority queue of URLs (the addresses of pages to crawl): it decides which URL to crawl next and removes duplicate URLs.
    • Downloader
      Downloads page content and hands it back to the spiders (the downloader is built on Twisted's efficient asynchronous model).
    • Spiders
      Spiders do the main work: they extract the information they need, the so-called items, from specific pages. They can also extract links so that Scrapy keeps crawling the next pages.
    • Item Pipeline
      Processes the items the spiders extract from pages; its main jobs are persisting items, validating them, and dropping unwanted data. Once a page has been parsed by a spider, its items are sent to the pipeline and pass through several stages in a fixed order.
    • Downloader Middlewares
      Sit between the Scrapy engine and the downloader and process the requests and responses passing between them.
    • Spider Middlewares
      Sit between the Scrapy engine and the spiders and process the spiders' response input and request output.
    • Scheduler Middlewares
      Sit between the Scrapy engine and the scheduler and process the requests and responses passing between them.

    Scrapy's run flow is roughly:

    1. The engine takes a URL from the scheduler for the next crawl.
    2. The engine wraps the URL in a Request and passes it to the downloader.
    3. The downloader fetches the resource and wraps it in a Response.
    4. The spider parses the Response.
    5. If an item is parsed out, it is handed to the item pipeline for further processing.
    6. If a link (URL) is parsed out, the URL is handed to the scheduler to be crawled later.

    I. Installation

    Linux
          pip3 install scrapy
     
     
    Windows
          a. pip3 install wheel
          b. download Twisted from http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
          c. cd into the download directory and run: pip3 install Twisted-17.1.0-cp35-cp35m-win_amd64.whl
          d. pip3 install scrapy
          e. download and install pywin32: https://sourceforge.net/projects/pywin32/files/
          f. download and install OpenSSL (pyOpenSSL): https://pypi.python.org/pypi/pyOpenSSL#downloads
             install it the same way as Twisted
    

    II. Basic usage

    1. Basic commands

    1. scrapy startproject <project_name>
       - create a new project in the current directory (similar to Django)
     
    2. scrapy genspider [-t template] <name> <domain>
       - create a spider inside the project
       e.g.:
          scrapy genspider -t basic oldboy oldboy.com
          scrapy genspider -t xmlfeed autohome autohome.com.cn
       PS:
          list the available templates: scrapy genspider -l
          show a template's content:    scrapy genspider -d <template_name>
     
    3. scrapy list
       - list the spiders in the project
     
    4. scrapy crawl <spider_name>
       - run a single spider
    

    2. Project layout and spider basics

    project_name/
       scrapy.cfg
       project_name/
           __init__.py
           items.py
           pipelines.py
           settings.py
           spiders/
               __init__.py
                spider1.py
                spider2.py
                spider3.py
    

    File overview:

    • scrapy.cfg   the project's main configuration file (the real crawler settings live in settings.py)
    • items.py     data models for structured data, similar to Django's Model
    • pipelines.py data-processing behaviour, e.g. persisting the structured items
    • settings.py  configuration, e.g. recursion depth, concurrency, download delay
    • spiders/     the spiders directory: create spider files here and write the crawl rules in them

    Note: spider files are usually named after the site's domain. On startup Scrapy first fetches the site's robots.txt (an anti-crawling measure); this can be bypassed by setting ROBOTSTXT_OBEY to False in the settings file.
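    For example, in the project's settings file (a minimal sketch of the setting mentioned above):

    # settings.py
    # do not fetch or obey robots.txt before crawling
    ROBOTSTXT_OBEY = False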

    import scrapy
 
    class XiaoHuarSpider(scrapy.spiders.Spider):
        name = "xiaohuar"                       # spider name *****
        allowed_domains = ["xiaohuar.com"]      # allowed domains
        start_urls = [
            "http://www.xiaohuar.com/hua/",     # start URL
        ]
 
        def parse(self, response):
            # callback invoked with the result of fetching the start URL
            pass
    spider1.py

    3. A first real spider

    import scrapy
    from scrapy.selector import HtmlXPathSelector
    from scrapy.http.request import Request
     
     
    class DigSpider(scrapy.Spider):
        # spider name; used with the crawl command to start this spider
        name = "dig"
     
        # allowed domains
        allowed_domains = ["chouti.com"]
     
        # start URLs
        start_urls = [
            'http://dig.chouti.com/',
        ]
     
        has_request_set = {}
     
        def parse(self, response):
            print(response.url)
     
            hxs = HtmlXPathSelector(response)
            page_list = hxs.select('//div[@id="dig_lcpage"]//a[re:test(@href, "/all/hot/recent/\d+")]/@href').extract()
            for page in page_list:
                page_url = 'http://dig.chouti.com%s' % page
                key = self.md5(page_url)
                if key in self.has_request_set:
                    pass
                else:
                    self.has_request_set[key] = page_url
                    obj = Request(url=page_url, method='GET', callback=self.parse)
                    yield obj
     
        @staticmethod
        def md5(val):
            import hashlib
            ha = hashlib.md5()
            ha.update(bytes(val, encoding='utf-8'))
            key = ha.hexdigest()
            return key
    

    To run this spider, open a terminal in the project directory and execute:

    scrapy crawl dig --nolog

    The important points in the code above:

    • Request is a class that wraps a user request; yielding a Request object from a callback tells Scrapy to keep crawling that URL.
    • HtmlXPathSelector parses the HTML and provides selector functionality.
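    As an aside, the manual md5 bookkeeping above is mainly illustrative: Scrapy's scheduler already filters duplicate requests by default, and Request takes a dont_filter flag for URLs that should always be refetched. A sketch of the same idea leaning on the built-in filter (the spider name here is made up, not from the original post):

    import scrapy
    from scrapy.http.request import Request


    class DigDedupeSpider(scrapy.Spider):
        name = "dig_dedupe"
        allowed_domains = ["chouti.com"]
        start_urls = ['http://dig.chouti.com/']

        def parse(self, response):
            for href in response.xpath('//div[@id="dig_lcpage"]//a/@href').extract():
                # no manual md5 set: the scheduler drops duplicate requests by default
                yield Request(url='http://dig.chouti.com%s' % href, callback=self.parse)

            # dont_filter=True would bypass that filter for a request we always want refetched
            # yield Request(url='http://dig.chouti.com/', callback=self.parse, dont_filter=True)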

    4. Selectors

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    from scrapy.selector import Selector, HtmlXPathSelector
    from scrapy.http import HtmlResponse
    html = """<!DOCTYPE html>
    <html>
        <head lang="en">
            <meta charset="UTF-8">
            <title></title>
        </head>
        <body>
            <ul>
                <li class="item-"><a id='i1' href="link.html">first item</a></li>
                <li class="item-0"><a id='i2' href="llink.html">first item</a></li>
                <li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
            </ul>
            <div><a href="llink2.html">second item</a></div>
        </body>
    </html>
    """
    response = HtmlResponse(url='http://example.com', body=html,encoding='utf-8')
    # hxs = HtmlXPathSelector(response)
    # print(hxs)
    # hxs = Selector(response=response).xpath('//a')
    # print(hxs)
    # hxs = Selector(response=response).xpath('//a[2]')
    # print(hxs)
    # hxs = Selector(response=response).xpath('//a[@id]')
    # print(hxs)
    # hxs = Selector(response=response).xpath('//a[@id="i1"]')
    # print(hxs)
    # hxs = Selector(response=response).xpath('//a[@href="link.html"][@id="i1"]')
    # print(hxs)
    # hxs = Selector(response=response).xpath('//a[contains(@href, "link")]')
    # print(hxs)
    # hxs = Selector(response=response).xpath('//a[starts-with(@href, "link")]')
    # print(hxs)
    # hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]')
    # print(hxs)
    # hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/text()').extract()
    # print(hxs)
    # hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/@href').extract()
    # print(hxs)
    # hxs = Selector(response=response).xpath('/html/body/ul/li/a/@href').extract()
    # print(hxs)
    # hxs = Selector(response=response).xpath('//body/ul/li/a/@href').extract_first()
    # print(hxs)
     
    # ul_list = Selector(response=response).xpath('//body/ul/li')
    # for item in ul_list:
    #     v = item.xpath('./a/span')
    #     # or
    #     # v = item.xpath('a/span')
    #     # or
    #     # v = item.xpath('*/a/span')
    #     print(v)
    
    import scrapy
    from scrapy.selector import HtmlXPathSelector
    from scrapy.http.request import Request
    from scrapy.http.cookies import CookieJar
    from scrapy import FormRequest


    class ChouTiSpider(scrapy.Spider):
        # spider name; used with the crawl command to start this spider
        name = "chouti"
        # allowed domains
        allowed_domains = ["chouti.com"]

        cookie_dict = {}
        has_request_set = {}

        def start_requests(self):
            url = 'http://dig.chouti.com/'
            # return [Request(url=url, callback=self.login)]
            yield Request(url=url, callback=self.login)

        def login(self, response):
            # collect the cookies set by the first response
            cookie_jar = CookieJar()
            cookie_jar.extract_cookies(response, response.request)
            for k, v in cookie_jar._cookies.items():
                for i, j in v.items():
                    for m, n in j.items():
                        self.cookie_dict[m] = n.value

            req = Request(
                url='http://dig.chouti.com/login',
                method='POST',
                headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
                body='phone=8615131255089&password=pppppppp&oneMonth=1',
                cookies=self.cookie_dict,
                callback=self.check_login
            )
            yield req

        def check_login(self, response):
            req = Request(
                url='http://dig.chouti.com/',
                method='GET',
                callback=self.show,
                cookies=self.cookie_dict,
                dont_filter=True
            )
            yield req

        def show(self, response):
            # print(response)
            hxs = HtmlXPathSelector(response)
            news_list = hxs.select('//div[@id="content-list"]/div[@class="item"]')
            for new in news_list:
                # temp = new.xpath('div/div[@class="part2"]/@share-linkid').extract()
                link_id = new.xpath('*/div[@class="part2"]/@share-linkid').extract_first()
                yield Request(
                    url='http://dig.chouti.com/link/vote?linksId=%s' % (link_id,),
                    method='POST',
                    cookies=self.cookie_dict,
                    callback=self.do_favor
                )

            page_list = hxs.select('//div[@id="dig_lcpage"]//a[re:test(@href, "/all/hot/recent/\d+")]/@href').extract()
            for page in page_list:

                page_url = 'http://dig.chouti.com%s' % page
                import hashlib
                hash = hashlib.md5()
                hash.update(bytes(page_url, encoding='utf-8'))
                key = hash.hexdigest()
                if key in self.has_request_set:
                    pass
                else:
                    self.has_request_set[key] = page_url
                    yield Request(
                        url=page_url,
                        method='GET',
                        callback=self.show
                    )

        def do_favor(self, response):
            print(response.text)
    Example: log in to Chouti and upvote automatically

    Note: set DEPTH_LIMIT = 1 in settings.py to limit how deep the "recursion" goes.
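    A minimal sketch of that setting:

    # settings.py
    # follow links at most one level deep from the start URLs
    DEPTH_LIMIT = 1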

    5. Structured data: items and pipelines

    The examples above do all the work directly in the parse method because the processing is simple. When more data handling is needed, Scrapy's items can be used to give the data a uniform structure, which is then handed over to the pipelines for processing.

    import scrapy
    from scrapy.selector import HtmlXPathSelector
    from scrapy.http.request import Request
    from scrapy.http.cookies import CookieJar
    from scrapy import FormRequest


    class XiaoHuarSpider(scrapy.Spider):
        # spider name; used with the crawl command to start this spider
        name = "xiaohuar"
        # allowed domains
        allowed_domains = ["xiaohuar.com"]

        start_urls = [
            "http://www.xiaohuar.com/list-1-1.html",
        ]
        # custom_settings = {
        #     'ITEM_PIPELINES':{
        #         'spider1.pipelines.JsonPipeline': 100
        #     }
        # }
        has_request_set = {}

        def parse(self, response):
            # analyse the page:
            # find the content that matches the rules (the photos) and save it,
            # then find all the <a> tags and keep following them, level by level

            hxs = HtmlXPathSelector(response)

            items = hxs.select('//div[@class="item_list infinite_scroll"]/div')
            for item in items:
                src = item.select('.//div[@class="img"]/a/img/@src').extract_first()
                name = item.select('.//div[@class="img"]/span/text()').extract_first()
                school = item.select('.//div[@class="img"]/div[@class="btns"]/a/text()').extract_first()
                url = "http://www.xiaohuar.com%s" % src
                from ..items import XiaoHuarItem
                obj = XiaoHuarItem(name=name, school=school, url=url)
                yield obj

            urls = hxs.select('//a[re:test(@href, "http://www.xiaohuar.com/list-1-\d+.html")]/@href')
            for url in urls:
                key = self.md5(url)
                if key in self.has_request_set:
                    pass
                else:
                    self.has_request_set[key] = url
                    req = Request(url=url, method='GET', callback=self.parse)
                    yield req

        @staticmethod
        def md5(val):
            import hashlib
            ha = hashlib.md5()
            ha.update(bytes(val, encoding='utf-8'))
            key = ha.hexdigest()
            return key
    spiders/xiahuar.py
    import scrapy


    class XiaoHuarItem(scrapy.Item):
        name = scrapy.Field()
        school = scrapy.Field()
        url = scrapy.Field()
    items.py
    import json
    import os
    import requests


    class JsonPipeline(object):
        def __init__(self):
            self.file = open('xiaohua.txt', 'w')

        def process_item(self, item, spider):
            v = json.dumps(dict(item), ensure_ascii=False)
            self.file.write(v)
            self.file.write('\n')
            self.file.flush()
            return item


    class FilePipeline(object):
        def __init__(self):
            if not os.path.exists('imgs'):
                os.makedirs('imgs')

        def process_item(self, item, spider):
            response = requests.get(item['url'], stream=True)
            file_name = '%s_%s.jpg' % (item['name'], item['school'])
            with open(os.path.join('imgs', file_name), mode='wb') as f:
                f.write(response.content)
            return item
    pipelines.py
    ITEM_PIPELINES = {
       'spider1.pipelines.JsonPipeline': 100,
       'spider1.pipelines.FilePipeline': 300,
    }
    # The integer value after each pipeline determines the order they run in: items pass through the
    # pipelines from the lowest number to the highest. These numbers are conventionally kept in the 0-1000 range.
    settings.py
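    The JsonPipeline above opens its file in __init__ and never closes it; Scrapy pipelines also provide open_spider/close_spider hooks, which could be used like this (a sketch, not part of the original code):

    import json


    class JsonPipeline(object):
        def open_spider(self, spider):
            # called once when the spider starts
            self.file = open('xiaohua.txt', 'w')

        def close_spider(self, spider):
            # called once when the spider finishes, so the file is closed cleanly
            self.file.close()

        def process_item(self, item, spider):
            self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
            return item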

    6. Middleware

    class CustomSpiderMiddleware(object):
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the spider middleware does not modify the
        # passed objects.

        def process_spider_input(self, response, spider):
            # Called for each response that goes through the spider
            # middleware and into the spider.

            # Should return None or raise an exception.
            print('process_spider_input', len(response.text))
            return None

        def process_spider_output(self, response, result, spider):
            # Called with the results returned from the Spider, after
            # it has processed the response.
            print('process_spider_output', len(response.text))
            # Must return an iterable of Request, dict or Item objects.
            for i in result:
                yield i

        def process_spider_exception(self, response, exception, spider):
            # Called when a spider or process_spider_input() method
            # (from other spider middleware) raises an exception.

            # Should return either None or an iterable of Response, dict
            # or Item objects.
            print('process_spider_exception')
            pass

        def process_start_requests(self, start_requests, spider):
            # Called with the start requests of the spider, and works
            # similarly to the process_spider_output() method, except
            # that it doesn't have a response associated.

            # Must return only requests (not items).
            print('process_start_requests')
            for r in start_requests:
                yield r

        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)


    class CustomDownloaderMiddleware(object):
        def process_request(self, request, spider):
            return None

        def process_response(self, request, response, spider):
            return response

        def process_exception(self, request, exception, spider):
            return None
    middlewares.py
    # settings.py
     
    DOWNLOADER_MIDDLEWARES = {
        'spider1.middlewares.CustomDownloaderMiddleware': 543,
    }
    SPIDER_MIDDLEWARES = {
        'spider1.middlewares.CustomSpiderMiddleware': 543,
    }
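    As a concrete example of what a downloader middleware is good for, process_request could stamp a custom User-Agent header onto every outgoing request before it reaches the downloader (a sketch; the class name and header value are made up):

    class CustomUserAgentMiddleware(object):
        def process_request(self, request, spider):
            # set a header on every request before it is downloaded;
            # returning None lets processing continue as normal
            request.headers['User-Agent'] = 'Mozilla/5.0 (my-crawler)'
            return None

    It would then be registered in DOWNLOADER_MIDDLEWARES just like CustomDownloaderMiddleware above.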
    

    7. Custom commands

    • Create a directory (any name, e.g. commands) at the same level as spiders
    • Inside it, create a crawlall.py file (the file name becomes the command name)
    from scrapy.commands import ScrapyCommand
    from scrapy.utils.project import get_project_settings


    class Command(ScrapyCommand):

        requires_project = True

        def syntax(self):
            return '[options]'

        def short_desc(self):
            return 'Runs all of the spiders'

        def run(self, args, opts):
            spider_list = self.crawler_process.spiders.list()
            for name in spider_list:
                self.crawler_process.crawl(name, **opts.__dict__)
            self.crawler_process.start()
    crawlall.py
    • In settings.py add COMMANDS_MODULE = '<project_name>.<directory_name>' (see the sketch below)
    • From the project directory, run: scrapy crawlall
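    A minimal sketch, assuming the project is named spider1 (as in the pipeline settings earlier) and the directory created above is named commands:

    # settings.py
    COMMANDS_MODULE = 'spider1.commands'

    # then, from the project directory:
    #   scrapy crawlall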

    8. Supplementary concepts

    Thread: the smallest unit of scheduling. Threads suit IO-bound programs, but they are not ideal either: if every thread just sits waiting on IO requests, the threads are wasted, and coroutines are a better fit.

    Process: a process has a main thread by default and can hold many threads, which share the process's resources. Processes suit CPU-bound programs.

    Coroutine: multiple tasks completed within a single thread, also called a micro-thread.

    GIL: Python's global interpreter lock. It is effectively a per-process lock over all of that process's threads, guaranteeing that only one thread is scheduled on the CPU at any moment (the sketch below illustrates the consequence for CPU-bound work).
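    A quick way to see the effect of the GIL is to run the same CPU-bound function on a thread pool and on a process pool; a rough sketch (exact timings depend on the machine):

    import time
    from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor


    def count(n):
        while n > 0:
            n -= 1


    def timed(pool_cls):
        start = time.time()
        with pool_cls(4) as pool:
            # four CPU-bound tasks: threads are serialized by the GIL, processes are not
            for future in [pool.submit(count, 5000000) for _ in range(4)]:
                future.result()
        return time.time() - start


    if __name__ == '__main__':
        print('threads:   %.2fs' % timed(ThreadPoolExecutor))
        print('processes: %.2fs' % timed(ProcessPoolExecutor))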

    More documentation: http://scrapy-chs.readthedocs.io/zh_CN/latest/index.html

  • Original article: https://www.cnblogs.com/sxzwj/p/6411050.html