zoukankan      html  css  js  c++  java
  • 简单实现一个初步的爬虫

    Django:
        # 创建project
        django-admin startproject mysite
        
        cd mysite
        
        # 创建app
        python manage.py startapp app01
        python manage.py startapp app02
        
        # 启动项目
        python manage.py runserver
    
    scrapy:
        # 创建project     项目名称
        scrapy startproject xdb
        cd xdb
        
        # 创建爬虫     爬虫名称  爬虫地址
        scrapy genspider chouti chouti.com
        scrapy genspider cnblogs cnblogs.com
        
        # 启动爬虫
        scrapy crawl chouti
        scrapy crawl chouti --nolog
    """
    源码内容:
        1. 判断当前XdbPipeline类中是否有from_crawler
            有:obj = XdbPipeline.from_crawler(...)
            否:obj = XdbPipeline()
        2. obj.open_spider()
        3. obj.process_item() | obj.process_item() | obj.process_item() ...(每个 item 调用一次)
        4. obj.close_spider()
    """
    from scrapy.exceptions import DropItem
    class XdbPipeline(object):
        """Write each scraped item's href to a file, one per line.

        Scrapy pipeline lifecycle (matches the note above this class):
            1. from_crawler(...) builds the pipeline if defined, else __init__()
            2. open_spider() once when the spider starts
            3. process_item() once per scraped item
            4. close_spider() once when the spider finishes
        """

        def __init__(self, path):
            # File handle is opened lazily in open_spider(), not here.
            self.f = None
            self.path = path

        @classmethod
        def from_crawler(cls, crawler):
            """Create the pipeline object at crawler start-up.

            :param crawler: the running Crawler; exposes the project settings
            :return: a new XdbPipeline configured with HREF_FILE_PATH
            """
            path = crawler.settings.get('HREF_FILE_PATH')
            return cls(path)

        def open_spider(self, spider):
            """Called when the spider starts: open the output file (append mode).

            :param spider: the spider being opened
            """
            self.f = open(self.path, 'a+')

        def process_item(self, item, spider):
            """Append the item's href, one per line, then pass the item on.

            Returning the item hands it to the next pipeline's process_item;
            raising DropItem instead would stop later pipelines from seeing it.
            """
            # print(item.get("text"))
            self.f.write(item.get('href') + '\n')
            return item
            # raise DropItem()  # would prevent later pipelines from running

        def close_spider(self, spider):
            """Called when the spider closes: release the file handle.

            :param spider: the spider being closed
            """
            self.f.close()
    持久化:pipelines
    pipelines.py
        class XdbPipeline(object):
            """Persist each item's href to HREF_FILE_PATH, one per line."""

            def __init__(self, path):
                # Opened in open_spider(); kept None until the crawl starts.
                self.f = None
                self.path = path

            @classmethod
            def from_crawler(cls, crawler):
                # Output path comes from the project settings at crawler start-up.
                path = crawler.settings.get('HREF_FILE_PATH')
                return cls(path)

            def open_spider(self, spider):
                # Open once per crawl, append mode.
                self.f = open(self.path, 'a+')

            def process_item(self, item, spider):
                # print(item.get("text"))
                # One href per line; return the item so later pipelines run too.
                self.f.write(item.get('href') + '\n')
                return item

            def close_spider(self, spider):
                # Release the file handle when the crawl ends.
                self.f.close()
                
    settings.py
        ITEM_PIPELINES = {
            # Lower number = higher priority; valid range 0-1000.
            'xdb.pipelines.XdbPipeline': 300,
        }
        
    items.py
        import scrapy
        class XdbItem(scrapy.Item):
            """One scraped link: its visible text plus its href URL."""
            text = scrapy.Field()
            href = scrapy.Field()
            
    chouti.py
        import scrapy
        from xdb.items import XdbItem  # fixed: original line was missing 'from'

        class ChoutiSpider(scrapy.Spider):
            """Scrape link text/href pairs from chouti.com's front page."""
            name = 'chouti'
            allowed_domains = ['chouti.com']
            start_urls = ['http://chouti.com/']

            def parse(self, response):
                """Yield one XdbItem per <a> link found in the page's link list.

                :param response: the downloaded page response
                """
                content_list = response.xpath('//div[@class="link-con"]//div[@class="link-detail"]')
                for item in content_list:
                    text = item.xpath('./a/text()').extract_first()
                    href = item.xpath('./a/@href').extract_first()
                    yield XdbItem(text=text, href=href)
  • 相关阅读:
    tyvj 1031 热浪 最短路
    【bzoj2005】 [Noi2010]能量采集 数学结论(gcd)
    hdu 1394 Minimum Inversion Number 逆序数/树状数组
    HDU 1698 just a hook 线段树,区间定值,求和
    ZeptoLab Code Rush 2015 C. Om Nom and Candies 暴力
    ZeptoLab Code Rush 2015 B. Om Nom and Dark Park DFS
    ZeptoLab Code Rush 2015 A. King of Thieves 暴力
    hdoj 5199 Gunner map
    hdoj 5198 Strange Class 水题
    vijos 1659 河蟹王国 线段树区间加、区间查询最大值
  • 原文地址:https://www.cnblogs.com/xiongfanyong/p/13063205.html
Copyright © 2011-2022 走看看