  • Passing parameters with requests (request meta)

    As an example, we crawl a movie site's list page and follow each movie's detail link, carrying the half-filled item from the list callback into the detail callback.

    spiders/parndemo.py

    import scrapy
    from parnpost import items
    
    
    class ParndemoSpider(scrapy.Spider):
        name = 'parndemo'
        # allowed_domains = ['https://www.55xia.com/movie']
        start_urls = ['https://www.55xia.com/movie/']
    
        def getinfo(self, response):
            # Extract the remaining fields from the detail page.
            actor = response.xpath("/html/body/div[1]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[1]/td[2]/a/text()").extract_first()
            language = response.xpath("/html/body/div[1]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[6]/td[2]/text()").extract_first()
            time = response.xpath("/html/body/div[1]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[8]/td[2]/text()").extract_first()
            # Retrieve the partially filled item that travelled with the request.
            item = response.meta["item"]
            item["actor"] = actor
            item["language"] = language
            item["time"] = time
            # Hand the completed item over to the pipeline.
            yield item
    
        def parse(self, response):
            # Fields to collect: name, kind, director, language, running time.
            div_list = response.xpath("/html/body/div[1]/div[1]/div[2]/div")  # /div selects every child div
            for div in div_list:
                item = items.ParnpostItem()
                name = div.xpath('.//div[@class="meta"]/h1/a/text()').extract_first()
                kind = div.xpath('.//div[@class="otherinfo"]/text()').extract_first()
                # Link to this movie's detail page; the href is protocol-relative.
                url = "https:" + div.xpath('.//div[@class="meta"]/h1/a/@href').extract_first()
                item["name"] = name
                item["kind"] = kind
                # Request the detail page; meta must be a dict, and it carries the item to the callback.
                yield scrapy.Request(url=url, callback=self.getinfo, meta={"item": item})
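
    The meta dict works on every Scrapy version, but Scrapy's own middlewares also use it internally. From Scrapy 1.7 onward, cb_kwargs is the cleaner channel for user data: each entry becomes a keyword argument of the callback. A minimal sketch of the same hand-off, assuming the same XPaths as above:

        # Sketch: the same item hand-off via cb_kwargs (requires Scrapy >= 1.7).
        def parse(self, response):
            for div in response.xpath("/html/body/div[1]/div[1]/div[2]/div"):
                item = items.ParnpostItem()
                item["name"] = div.xpath('.//div[@class="meta"]/h1/a/text()').extract_first()
                item["kind"] = div.xpath('.//div[@class="otherinfo"]/text()').extract_first()
                url = "https:" + div.xpath('.//div[@class="meta"]/h1/a/@href').extract_first()
                # Each cb_kwargs entry is passed to the callback as a keyword argument.
                yield scrapy.Request(url=url, callback=self.getinfo, cb_kwargs={"item": item})
    
        def getinfo(self, response, item):
            # item arrives as a normal parameter; no response.meta lookup needed.
            item["actor"] = response.xpath("/html/body/div[1]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[1]/td[2]/a/text()").extract_first()
            yield item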

    pipelines.py

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    
    
    class ParnpostPipeline(object):
        fp = None
    
        def open_spider(self, spider):
            # Runs once when the spider starts: open the output file.
            self.fp = open("record.txt", "w", encoding="utf-8")
    
        def process_item(self, item, spider):
            name = item["name"]
            kind = item["kind"]
            actor = item["actor"]
            language = item["language"]
            time = item["time"]
            content = ("Name: " + str(name) + "  Kind: " + str(kind)
                       + "  Director: " + str(actor) + "  Language: " + str(language)
                       + "  Time: " + str(time) + "\n\n")
            self.fp.write(content)
            return item
    
        def close_spider(self, spider):
            # Runs once when the spider closes: release the file handle.
            self.fp.close()
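
    If you would rather keep the output machine-readable, a JSON Lines variant of the same pipeline is straightforward. A minimal sketch; the filename record.jl and the class name JsonLinesPipeline are illustrative, and the class needs its own entry in ITEM_PIPELINES before it runs:

        import json
    
    
        class JsonLinesPipeline(object):
            # Hypothetical alternative pipeline: one JSON object per item, one per line.
            def open_spider(self, spider):
                self.fp = open("record.jl", "w", encoding="utf-8")
    
            def process_item(self, item, spider):
                # dict(item) works because scrapy.Item supports the mapping protocol.
                self.fp.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
                return item
    
            def close_spider(self, spider):
                self.fp.close()

    Run the spider as usual with scrapy crawl parndemo; every yielded item passes through process_item.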

    settings.py

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for parnpost project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://doc.scrapy.org/en/latest/topics/settings.html
    #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'parnpost'
    
    SPIDER_MODULES = ['parnpost.spiders']
    NEWSPIDER_MODULE = 'parnpost.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    #DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #}
    
    # Enable or disable spider middlewares
    # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'parnpost.middlewares.ParnpostSpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #DOWNLOADER_MIDDLEWARES = {
    #    'parnpost.middlewares.ParnpostDownloaderMiddleware': 543,
    #}
    
    # Enable or disable extensions
    # See https://doc.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
       'parnpost.pipelines.ParnpostPipeline': 300,
    }
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
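
    If you register the JSON Lines pipeline sketched above alongside the original one, both run in sequence: lower numbers run first. A sketch of the corresponding setting; the 300/400 priorities are illustrative:

        ITEM_PIPELINES = {
           'parnpost.pipelines.ParnpostPipeline': 300,   # writes record.txt first
           'parnpost.pipelines.JsonLinesPipeline': 400,  # then writes record.jl
        }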

    items.py

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class ParnpostItem(scrapy.Item):
        # One Field per piece of data collected across the two callbacks.
        name = scrapy.Field()
        kind = scrapy.Field()
        actor = scrapy.Field()
        language = scrapy.Field()
        time = scrapy.Field()
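
    One detail worth knowing when an item travels between callbacks: scrapy.Item only accepts the fields declared above, so a misspelled key fails loudly instead of silently losing data. A minimal sketch, assuming the parnpost package is on the import path:

        from parnpost.items import ParnpostItem
    
        item = ParnpostItem(name="some movie")  # declared fields can be set at construction
        try:
            item["languge"] = "zh"              # deliberately misspelled field name
        except KeyError as err:
            print(err)                          # ParnpostItem does not support field: languge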
  • Original post: https://www.cnblogs.com/cjj-zyj/p/10144251.html