  • A simple first crawler

    Django:
        # Create a project
        django-admin startproject mysite
        
        cd mysite
        
        # Create apps
        python manage.py startapp app01
        python manage.py startapp app02
        
        # Start the dev server
        python manage.py runserver
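        
    For orientation, the commands above produce roughly this layout (minor files vary by Django version):
        mysite/
            manage.py
            mysite/          # settings.py, urls.py, ...
            app01/
            app02/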
    
    scrapy:
        # Create a project:  scrapy startproject <project name>
        scrapy startproject xdb
        cd xdb
        
        # Create a spider:   scrapy genspider <spider name> <start domain>
        scrapy genspider chouti chouti.com
        scrapy genspider cnblogs cnblogs.com
        
        # Run a spider
        scrapy crawl chouti
        scrapy crawl chouti --nolog
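        
    For reference, scrapy genspider chouti chouti.com writes a spider skeleton roughly like the following (the exact template varies by Scrapy version):
        import scrapy

        class ChoutiSpider(scrapy.Spider):
            name = 'chouti'
            allowed_domains = ['chouti.com']
            start_urls = ['http://chouti.com/']

            def parse(self, response):
                pass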
    """
    源码内容:
        1. 判断当前XdbPipeline类中是否有from_crawler
            有:obj = XdbPipeline.from_crawler(...)
            否:obj = XdbPipeline()
        2. obj.open_spider()
        3. obj.process_item()|obj.process_item()|obj.process_item()|
        4. obj.close_spider()
    """
    from scrapy.exceptions import DropItem

    class XdbPipeline(object):

        def __init__(self, path):
            self.f = None
            self.path = path

        @classmethod
        def from_crawler(cls, crawler):
            '''
            Called once at startup to build the pipeline object.
            :param crawler:
            :return:
            '''
            path = crawler.settings.get('HREF_FILE_PATH')
            return cls(path)

        def open_spider(self, spider):
            '''
            Called when the spider starts.
            :param spider:
            :return:
            '''
            self.f = open(self.path, 'a+')

        def process_item(self, item, spider):
            # print(item.get("text"))
            self.f.write(item.get('href') + '\n')
            return item        # hand the item on to the next pipeline's process_item
            # raise DropItem() # raise (not return) DropItem to skip all later pipelines

        def close_spider(self, spider):
            '''
            Called when the spider closes.
            :param spider:
            :return:
            '''
            self.f.close()
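    A minimal sketch of a pipeline that actually uses DropItem (the href-based dedup logic is an assumed example, not from the original post):
        from scrapy.exceptions import DropItem

        class DedupPipeline(object):
            def __init__(self):
                self.seen = set()

            def process_item(self, item, spider):
                href = item.get('href')
                if href in self.seen:
                    raise DropItem('duplicate: %s' % href)  # later pipelines never see this item
                self.seen.add(href)
                return item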
    Persistence: pipelines
    pipelines.py
        class XdbPipeline(object):
    
            def __init__(self, path):
                self.f = None
                self.path = path
    
            @classmethod
            def from_crawler(cls, crawler):
                path = crawler.settings.get('HREF_FILE_PATH')
                return cls(path)
    
            def open_spider(self, spider):
                self.f = open(self.path, 'a+')
    
            def process_item(self, item, spider):
                # print(item.get("text"))
                self.f.write(item.get('href') + '\n')
                return item
    
            def close_spider(self, spider):
                self.f.close()
                
    settings.py
        ITEM_PIPELINES = {
            'xdb.pipelines.XdbPipeline': 300,  # lower number = higher priority; range 0-1000
        }
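        
    from_crawler reads HREF_FILE_PATH from the crawler settings, so that key must also be defined in settings.py; the file name below is an assumed example:
        HREF_FILE_PATH = 'hrefs.txt'   # path the pipeline appends extracted hrefs to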
        
    items.py
        import scrapy
        class XdbItem(scrapy.Item):
            text = scrapy.Field()
            href = scrapy.Field()
            
    chouti.py
        import scrapy
        from xdb.items import XdbItem
    
        class ChoutiSpider(scrapy.Spider):
            name = 'chouti'
            allowed_domains = ['chouti.com']
            start_urls = ['http://chouti.com/']
    
            def parse(self, response):
                content_list = response.xpath('//div[@class="link-con"]//div[@class="link-detail"]')
                for item in content_list:
                    text = item.xpath('./a/text()').extract_first()
                    href = item.xpath('./a/@href').extract_first()
                    yield XdbItem(text=text, href=href)
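    
    With pipelines.py, settings.py, items.py and chouti.py in place, running scrapy crawl chouti --nolog should append one href per line to the file named by HREF_FILE_PATH.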