zoukankan      html  css  js  c++  java
  • scrapy项目2

    # -*- coding: utf-8 -*-
    import scrapy
    import logging
    
    logger = logging.getLogger(__name__)  #__name__,log日志回显示当前py文件路径的log日志;[myspider03.spiders.itcast1],不加__name__,显示root
    
    class Itcast1Spider(scrapy.Spider):
        """Demo spider that emits ten identical items and logs each one."""

        name = 'itcast1'  # spider name used by `scrapy crawl`
        allowed_domains = ['itcast.cn']
        start_urls = ['http://itcast.cn/']

        def parse(self, response):
            # Note: `yield` must sit inside the loop — placing it after the
            # loop would emit only a single item instead of ten.
            for _ in range(10):
                item = {"come_from": 'jingdong'}
                # Using the module-level logger (created with __name__) tags
                # each record with this module's path instead of "root".
                logger.warning(item)
                yield item

    pipelines.py

    #同时爬取三个网站,怎么区分,加键判断,也可获取spider.name来判断
    # 定义一个多判断
    
    # #1.加键判断
    # class Myspider03Pipeline(object):
    #     def process_item(self, item, spider):
    #         if item["come_from"] == 'jingdong':
    #             pass
    #         elif item["come_from"] == 'chouti':
    #             pass
    #         else:
    #             pass
    #         return item
    #
    #
    # #也可以定义多个判断
    # class Myspider03Pipeline2(object):
    #     def process_item(self, item, spider):
    #         if item["come_from"] == 'jingdong':
    #             pass
    #         return item
    #
    # class Myspider03Pipeline3(object):
    #     def process_item(self, item, spider):
    #
    #         if item["come_from"] == 'chouti':
    #             pass
    #
    #         return item
    
    #2.spider.name,通过爬虫的名字来判断
    
    import logging
    
    logger = logging.getLogger(__name__)
    class Myspider03Pipeline4(object):
        """Pipeline that routes items based on which spider produced them."""

        def process_item(self, item, spider):
            # `item` is whatever the spider yielded earlier; `spider` is the
            # running spider instance, so spider.name / allowed_domains /
            # start_urls are all available here for dispatching.
            if spider.name != "itcast1":
                return item
            logger.warning('=========')
            print(item)
            return item

    settings.py

    LOG_LEVEL = "WARNING"
    LOG_FILE = "./log.log"  #加这个log日志会保存在当前路径的log.log文件中,不会在终端显示;不加的话会在终端显示

    注意点

    #使用pipeline,要开启settings中的ITEM_PIPELINES
    #多个pipeline的作用,不同的pipeline处理不同的item内容,一个爬虫项目包含多个爬虫,一个spider可能做不同的操作,比如存入不同的数据库
    # pipeline的权重越小,优先级越高,pipeline中的process_item方法名不能改为其他名称
    
    scrapy中log日志配置
    LOG_LEVEL = "WARNING"
    LOG_FILE = "./log.log"  #加这个log日志会显示在当前路径的log.log文件中,不会在终端显示,不加的话会在终端显示
    
    
    普通项目log配置
    import logging
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                        datefmt='%a, %d %b %Y %H:%M:%S',
                        filename='test.log',
                        filemode='w')
    
    实例化一个log对象 logger = logging.getLogger(__name__)
    
    在任何py文件中调用logger即可
  • 相关阅读:
    Single Number II
    Best Time to Buy and Sell Stock
    Linked List Cycle
    Single Number
    Max Points on a Line
    Strategy
    LRU Cache
    Word Break II
    Text Justification
    Median of Two Sorted Arrays
  • 原文地址:https://www.cnblogs.com/chvv/p/10332458.html
Copyright © 2011-2022 走看看