zoukankan      html  css  js  c++  java
  • 二十七、miniscrapy,scrapy源码初解

    基本使用

    from twisted.web.client import getPage, defer
    from twisted.internet import reactor
    
    
    # 基本使用
    
    def all_done(contents):
        """Stop the reactor once the aggregated Deferred has fired.

        ``contents`` is whatever the Deferred passes along; it is ignored.
        """
        reactor.stop()
    
    
    def callback(contents):
        """Print the result handed over by a single page fetch."""
        print(contents)
    
    
    # Start one download per URL and keep every pending Deferred.
    url_list = ['http://www.bing.com', 'http://www.baidu.com']
    deferred_list = []
    for url in url_list:
        deferred = getPage(url.encode('utf8'))
        deferred.addCallback(callback)
        deferred_list.append(deferred)

    # Aggregate the downloads; all_done() runs whether they succeed or fail.
    dlist = defer.DeferredList(deferred_list)
    dlist.addBoth(all_done)

    # Enter the event loop; it returns after all_done() stops the reactor.
    reactor.run()
    View Code

    基于装饰器1

    from twisted.web.client import getPage, defer
    from twisted.internet import reactor
    
    # 基于装饰器1
    def all_done(arg):
        """Stop the reactor; ``arg`` (the Deferred's result) is ignored."""
        reactor.stop()
    
    
    def onedone(response):
        """Print the response delivered by one finished download."""
        print(response)
    
    # Three ingredients: the decorator, a Deferred object, and yield.
    @defer.inlineCallbacks
    def task(url):
        """Download ``url``, print it via onedone(), and wait for completion."""
        page = getPage(url.encode('utf8'))
        page.addCallback(onedone)
        yield page
    
    
    url_list = ['http://www.bing.com', 'http://www.baidu.com']

    # Each task(url) call starts a download and returns the task's Deferred.
    deferred_list = []
    for url in url_list:
        deferred = task(url)
        deferred_list.append(deferred)

    # Stop the reactor once every task has finished, successfully or not.
    dlist = defer.DeferredList(deferred_list)
    dlist.addBoth(all_done)

    reactor.run()
    View Code

    基于装饰器2

    from twisted.web.client import getPage, defer
    from twisted.internet import reactor
    
    
    # 基于装饰器2
    
    def all_done(arg):
        """Stop the reactor; ``arg`` (the Deferred's result) is ignored."""
        reactor.stop()
    
    
    def onedone(response):
        """Print the response delivered by one finished download."""
        print(response)
    
    
    @defer.inlineCallbacks
    def task():
        """Download two pages one after the other, printing each via onedone()."""
        # The second request starts only after the first Deferred has fired,
        # because each yield suspends the generator until then.
        for address in ('http://www.baidu.com', 'http://www.bing.com'):
            page = getPage(address.encode('utf8'))
            page.addCallback(onedone)
            yield page
    
    
    # Start the task, then stop the reactor when it ends (success or failure).
    ret = task()
    ret.addBoth(all_done)

    # Enter the event loop; it returns after all_done() stops the reactor.
    reactor.run()
    View Code

    基于装饰器3

    from twisted.web.client import getPage, defer
    from twisted.internet import reactor
    
    
    # 基于装饰器3,永恒循环
    def all_done(arg):
        """Stop the reactor; ``arg`` (the Deferred's result) is ignored."""
        reactor.stop()
    
    
    def onedone(response):
        """Print the response delivered by one finished download."""
        print(response)
    
    
    @defer.inlineCallbacks
    def task():
        """Fetch one page, then wait on a Deferred that nothing ever fires."""
        page = getPage('http://www.bing.com'.encode('utf8'))
        page.addCallback(onedone)
        yield page

        # Nothing calls .callback() on this Deferred, so the yield below never
        # resumes and the task never completes. Firing it by hand, e.g.
        # never_fired.callback(None), is how the wait would be ended.
        never_fired = defer.Deferred()
        yield never_fired
    
    
    # all_done() would stop the reactor, but task() never finishes here.
    ret = task()
    ret.addBoth(all_done)

    # run() is the event loop itself.
    reactor.run()
    View Code

    基于装饰器4

    from twisted.web.client import defer, getPage
    from twisted.internet import reactor
    
    # 基于装饰器,执行完毕后停止事件循环
    
    # URLs whose download has not finished yet.
    running_list = []
    # Deferred that task() waits on; check_empty() fires it to end the task.
    stop_deferred = None
    
    
    def all_done(arg):
        """Stop the reactor; ``arg`` (the Deferred's result) is ignored."""
        reactor.stop()
    
    
    def onedone(response, url):
        """Print a downloaded page and drop ``url`` from ``running_list``."""
        print(response)
        running_list.remove(url)
    
    
    def check_empty(response):
        """Fire ``stop_deferred`` once no URL is left in ``running_list``.

        ``response`` is the previous callback's result and is not used.
        """
        if running_list:
            return
        stop_deferred.callback(None)
    
    
    @defer.inlineCallbacks
    def task(url):
        """Download ``url`` and finish only after ``stop_deferred`` has fired.

        The request's callback chain prints the page (onedone), removes the URL
        from ``running_list`` and then lets check_empty() fire ``stop_deferred``
        when nothing is left running.

        :param url: address to fetch, as ``str``.
        """
        global stop_deferred
        # Create the stop signal BEFORE the request callbacks can run:
        # check_empty() calls stop_deferred.callback(...), so the Deferred must
        # already exist when the download completes.
        stop_deferred = defer.Deferred()

        deferred = getPage(bytes(url, encoding='utf8'))
        deferred.addCallback(onedone, url)
        # Fixed typo: this was "addCallabck", which raised AttributeError.
        deferred.addCallback(check_empty)
        yield deferred

        # Resumes immediately if check_empty() already fired the signal.
        yield stop_deferred
    
    
    # Register the URL as running before its task starts, so onedone() can
    # remove it again when the download finishes.
    running_list.append('http://www.baidu.com')
    ret = task('http://www.baidu.com')
    ret.addBoth(all_done)

    # Enter the event loop; it returns after all_done() stops the reactor.
    reactor.run()
    View Code

    基于装饰器5

    from twisted.web.client import getPage, defer
    from twisted.internet import reactor
    
    
    class ExecutionEngine(object):
        """Minimal engine: download one URL and signal when nothing is running.

        Attributes:
            stop_deferred: Deferred fired by check_empty() once running_list is
                empty; close_spider() waits on it.
            running_list: URLs whose download has not finished yet.
        """

        def __init__(self):
            # Created up front: the download started by open_spider() completes
            # (and check_empty() runs) before close_spider() is ever called, so
            # the Deferred must already exist by then.
            self.stop_deferred = defer.Deferred()
            self.running_list = list()

        def one_done(self, response, url):
            """Print the downloaded page and remove ``url`` from running_list."""
            print(response)
            self.running_list.remove(url)

        def check_empty(self, url):
            """Fire stop_deferred once no download is left running.

            Used as a Deferred callback, so ``url`` actually receives the
            previous callback's result; it is not used.
            """
            # The .called guard avoids firing the same Deferred twice.
            if not self.running_list and not self.stop_deferred.called:
                self.stop_deferred.callback(None)

        @defer.inlineCallbacks
        def open_spider(self, url):
            """Download ``url``; finishes after one_done() and check_empty() ran."""
            deferred = getPage(bytes(url, encoding='utf8'))
            deferred.addCallback(self.one_done, url)
            deferred.addCallback(self.check_empty)
            yield deferred

        @defer.inlineCallbacks
        def close_spider(self, url):
            """Wait until stop_deferred has fired; ``url`` is not used."""
            # Resumes immediately if the signal was already fired.
            yield self.stop_deferred
    
    
    @defer.inlineCallbacks
    def task(url):
        """Run one engine through its open and close phases for ``url``."""
        engine = ExecutionEngine()
        engine.running_list.append(url)
        # Each phase returns a Deferred; wait for it before starting the next.
        for phase in (engine.open_spider, engine.close_spider):
            yield phase(url)
    
    
    def all_done(arg):
        """Stop the reactor; ``arg`` (the Deferred's result) is ignored."""
        reactor.stop()
    
    
    if __name__ == "__main__":
        # Start the crawl, then stop the reactor when it ends either way.
        ret = task("http://www.bing.com")
        ret.addBoth(all_done)
        # Enter the event loop; it returns after all_done() stops the reactor.
        reactor.run()
    View Code

    Miniscrapy,scrapy源码初解

    from twisted.web.client import getPage, defer
    from twisted.internet import reactor
    import queue
    
    
    class Request(object):
        """A URL to download plus an optional callback for its response.

        :param url: address to fetch, as ``str``.
        :param callback: callable invoked with the Response. When left as
            ``None`` the engine falls back to the spider's ``parse`` method.
        """

        # Fixed typo: the constructor was named "__init", so it never ran and
        # Request(url, ...) raised TypeError. ``callback`` is optional because
        # Spider.start_requests() builds requests from a URL alone.
        def __init__(self, url, callback=None):
            self.url = url
            self.callback = callback
    
    
    class Response(object):
        """Downloaded body paired with the request that produced it.

        :param body: raw payload; ``text`` expects it to support
            ``.decode('utf8')``.
        :param request: originating request; its ``url`` is copied onto the
            response.
        """

        def __init__(self, body, request):
            self.body = body
            self.request = request
            # Fixed typo: was "reqeust.url", which raised NameError.
            self.url = request.url

        @property
        def text(self):
            """Return the body decoded as UTF-8."""
            return self.body.decode('utf8')
    
    
    class Scheduler(object):
        """FIFO queue of pending requests.

        :param engine: the owning engine; stored but not used by the scheduler.
        """

        # Fixed typo: the constructor was named "__init", so ``self.q`` was
        # never created and Scheduler(engine) raised TypeError.
        def __init__(self, engine):
            self.q = queue.Queue()
            self.engine = engine

        def enqueue_request(self, request):
            """Append ``request`` to the queue."""
            self.q.put(request)

        def next_request(self):
            """Return the oldest queued request, or ``None`` when empty."""
            try:
                return self.q.get(block=False)
            except queue.Empty:
                # Only an empty queue is expected here; other errors should
                # surface instead of being silently mapped to None.
                return None

        def size(self):
            """Return the number of queued requests."""
            return self.q.qsize()
    
    
    class ExecutionEngine(object):
        """Pull requests from the spider, download them and run the callbacks.

        Attributes:
            _closewait: Deferred created by start(); fired when no request is
                queued or in flight.
            runing: flag read by check_empty() (spelling kept for compatibility).
            start_requests: iterator of initial requests, or None once drained.
            scheduler: queue of requests waiting to be downloaded.
            inprogress: requests currently being downloaded.
            spider: spider whose ``parse`` is the default callback.
        """

        # Fixed typo: the constructor was named "__init", so none of the
        # attributes below existed on a new engine.
        def __init__(self):
            self._closewait = None
            self.runing = True
            self.start_requests = None
            self.scheduler = Scheduler(self)
            self.inprogress = set()
            self.spider = None  # set by open_spider()

        def check_empty(self, response):
            """Fire the close signal when the engine is flagged as not running."""
            if not self.runing:
                self._closewait.callback(None)

        def _next_request(self):
            """Queue pending start requests, fill download slots, detect the end."""
            # Move every remaining start request into the scheduler.
            while self.start_requests:
                try:
                    request = next(self.start_requests)
                except StopIteration:
                    self.start_requests = None
                else:
                    self.scheduler.enqueue_request(request)

            # Keep at most 5 downloads in flight (maximum concurrency).
            while len(self.inprogress) < 5 and self.scheduler.size() > 0:
                request = self.scheduler.next_request()
                if not request:
                    break
                self.inprogress.add(request)
                d = getPage(bytes(request.url, encoding='utf8'))
                # addBoth: each step runs on success and on failure, so the
                # request always leaves ``inprogress`` and the loop continues.
                d.addBoth(self._handle_downloader_output, request)
                d.addBoth(lambda x, req: self.inprogress.remove(req), request)
                d.addBoth(lambda x: self._next_request())

            # Nothing queued and nothing downloading: the crawl is finished.
            if len(self.inprogress) == 0 and self.scheduler.size() == 0:
                self._closewait.callback(None)

        def _handle_downloader_output(self, body, request):
            """Run the request's callback and queue any requests it yields.

            The callback is ``request.callback`` or, when that is falsy, the
            spider's ``parse``. Only a generator return value is consumed.
            """
            import types
            response = Response(body, request)
            func = request.callback or self.spider.parse
            gen = func(response)
            if isinstance(gen, types.GeneratorType):
                for req in gen:
                    self.scheduler.enqueue_request(req)

        @defer.inlineCallbacks
        def start(self):
            """Create the close signal and wait until _next_request() fires it."""
            # Fixed typo: was "defer_Deferred()", which raised NameError.
            self._closewait = defer.Deferred()
            yield self._closewait

        @defer.inlineCallbacks
        def open_spider(self, spider, start_requests):
            """Remember the spider and schedule the first _next_request() call."""
            self.start_requests = start_requests
            self.spider = spider
            yield None
            # Deferred to the reactor so start() can create _closewait first.
            reactor.callLater(0, self._next_request)
    
    
    class Crawler(object):
        """Bind one spider class to its own engine and run the crawl."""

        def __init__(self, spider_cls):
            self.spider_cls = spider_cls
            # Both are created lazily by crawl().
            self.spider = None
            self.engine = None

        @defer.inlineCallbacks
        def crawl(self):
            """Build engine and spider, open the spider, wait for the engine."""
            self.engine = ExecutionEngine()
            self.spider = self.spider_cls()
            initial_requests = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, initial_requests)
            yield self.engine.start()
    
    
    class CrawlerProcess(object):
        """Run several crawlers inside a single reactor loop."""

        def __init__(self):
            self._active = set()    # Deferreds of crawls that were started
            self.crawlers = set()   # Crawler objects created so far

        def crawl(self, spider_cls, *args, **kwargs):
            """Start a crawl for ``spider_cls`` and return its Deferred.

            Extra positional and keyword arguments go to ``Crawler.crawl``.
            """
            crawler = Crawler(spider_cls)
            self.crawlers.add(crawler)

            finished = crawler.crawl(*args, **kwargs)
            self._active.add(finished)
            return finished

        def start(self):
            """Run the reactor until every started crawl has finished."""
            all_crawls = defer.DeferredList(self._active)
            all_crawls.addBoth(self._stop_reactor)
            reactor.run()

        def _stop_reactor(self, _=None):
            """Stop the reactor; the callback argument is ignored."""
            reactor.stop()
    
    
    class Spider(object):
        """Base spider: turns ``start_urls`` into the initial requests."""

        # Default so a subclass that defines no start URLs yields nothing
        # instead of raising AttributeError in start_requests().
        start_urls = ()

        def start_requests(self):
            """Yield one Request per entry in ``start_urls``."""
            for url in self.start_urls:
                yield Request(url)
    
    
    # Fixed base class: was lowercase "spider", which raised NameError.
    class BaiduSpider(Spider):
        """Spider that downloads the Baidu home page and prints its text."""

        name = 'baidu'
        start_urls = [
            'http://www.baidu.com'
        ]

        def parse(self, response):
            """Print the decoded body of ``response``."""
            print(response.text)
    
    
    # Fixed base class: was lowercase "spider", which raised NameError.
    class BingSpider(Spider):
        """Placeholder spider that has nothing to crawl yet."""

        # Explicitly empty so start_requests() yields no requests; without a
        # start_urls attribute the loop there would raise AttributeError.
        start_urls = []
    
    
    if __name__ == "__main__":
        # Register one crawl per spider class, then run them in one reactor.
        crawler_process = CrawlerProcess()
        spider_cls_list = [BaiduSpider, BingSpider]
        for spider_cls in spider_cls_list:
            crawler_process.crawl(spider_cls)
        # Blocks until every registered crawl has finished.
        crawler_process.start()
    View Code
  • 相关阅读:
    【已解决】Makefile执行过程中出错:make: *** No rule to make target ` ‘, needed by xxx. Stop(转载)
    eclipse导入工程报Invalid project description(转载)
    基于Linux的v4l2视频架构驱动编写(转载)
    在eclipse中如何在大量项目中查找指定文件(转载)
    Ubuntu下FileZilla的安装(转载)
    创建 /dev/video0 节点 (转载)
    python函数-迭代器&生成器
    前端第三篇---前端基础之JavaScript
    前端第二篇---前端基础之CSS
    块级元素和行内元素使用心得汇总
  • 原文地址:https://www.cnblogs.com/nuochengze/p/13391756.html
Copyright © 2011-2022 走看看