  • Scrapy (3): Passing Data Between Requests

    Scrapy request meta passing

    1. Define the data structure in items.py

    '''
    file: items.py
    '''
    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class MovieprojectItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()

        # Fields scraped from the first-level (list) page
        post = scrapy.Field()      # movie poster
        name = scrapy.Field()      # movie title
        _type = scrapy.Field()     # movie genre

        # Fields scraped from the second-level (detail) page
        director = scrapy.Field()  # director
        design = scrapy.Field()    # screenwriter
        actor = scrapy.Field()     # lead actors
        info = scrapy.Field()      # synopsis
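
    Scrapy items behave like dicts, but only the keys declared as Field() above are accepted; assigning any other key raises a KeyError. A quick standalone sanity check (a sketch, not part of the project files):

    from movieproject.items import MovieprojectItem

    item = MovieprojectItem()
    item['name'] = 'Example movie'    # OK: 'name' is a declared field
    print(dict(item))                 # {'name': 'Example movie'}

    try:
        item['year'] = 2020           # 'year' was never declared as a Field
    except KeyError as err:
        print(err)                    # scrapy.Item rejects undeclared keys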
    
    
    

    2. The spider file

    # -*- coding: utf-8 -*-
    import scrapy
    from movieproject.items import MovieprojectItem


    class MovieSpider(scrapy.Spider):
        name = 'movie'
        allowed_domains = ['www.id97.com']
        start_urls = ['http://www.id97.com/movie/']
        url = 'http://www.id97.com/movie/?page={}'
        page = 1

        '''
        (1) Only the page-number links need to be extracted, and the first page alone is enough as an entry point.
        (2) Two rules would be needed: one rule to extract the detail pages, the other to extract the page-number links.
        '''
    
    
        def parse(self, response):
            # First find all the movie divs on the list page
            movie_div_list = response.xpath('//div[starts-with(@class,"col-xs-1-5")]')
            # Iterate over the divs and pull the details out of each one
            for odiv in movie_div_list:
                item = MovieprojectItem()
                # Movie poster
                item['post'] = odiv.xpath(".//img/@data-original").extract_first()

                # Movie title
                item['name'] = odiv.xpath("./div/div/h1/a/text()").extract_first()
                # Movie genre
                item['_type'] = odiv.xpath("./div/div/div/a/text()").extract()

                # Link to the detail page
                detail_href = odiv.xpath('./div/a/@href').extract_first()
                # Request the second-level detail page and pass the partially
                # filled item along via the Request's meta parameter;
                # parse_detail receives it and fills in the remaining fields.
                yield scrapy.Request(url=detail_href, callback=self.parse_detail, meta={'item': item})

            # Crawl the remaining list pages
            if self.page <= 5:
                self.page += 1
                url = self.url.format(self.page)
                print(url)
                yield scrapy.Request(url=url, callback=self.parse)
    
    
        def parse_detail(self, response):
            # First retrieve the item passed down from the first-level page
            item = response.meta['item']
            # Then extract the remaining movie fields from this page
            # Director
            item['director'] = response.xpath("//div[starts-with(@class,'col-xs-8')]/table/tbody/tr/td[2]/a/text()").extract()
            # Screenwriter
            item['design'] = response.xpath("//div[starts-with(@class,'col-xs-8')]/table/tbody/tr[2]/td[2]/a/text()").extract()
            # Lead actors
            item['actor'] = response.xpath("//div[starts-with(@class,'col-xs-8')]/table/tbody/tr[3]/td[2]/a/text()").extract()
            # Synopsis
            item['info'] = response.xpath("//div[@class='col-xs-12 movie-introduce']/p/text()").extract_first()

            # Hand the completed item over to the pipelines
            yield item
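
    Passing the item through meta is the classic approach. On Scrapy 1.7 and later, the same handoff can also be written with cb_kwargs, which delivers the item as a named argument of the callback instead of burying it in meta. A minimal sketch of the two callbacks above rewritten that way (same project names, abbreviated to the relevant lines):

        # Inside the MovieSpider class; requires Scrapy >= 1.7
        def parse(self, response):
            for odiv in response.xpath('//div[starts-with(@class,"col-xs-1-5")]'):
                item = MovieprojectItem()
                item['post'] = odiv.xpath(".//img/@data-original").extract_first()
                detail_href = odiv.xpath('./div/a/@href').extract_first()
                # Each cb_kwargs entry arrives as a keyword argument of the callback
                yield scrapy.Request(url=detail_href, callback=self.parse_detail,
                                     cb_kwargs={'item': item})

        def parse_detail(self, response, item):
            # 'item' comes straight from cb_kwargs; no response.meta lookup needed
            item['info'] = response.xpath("//div[@class='col-xs-12 movie-introduce']/p/text()").extract_first()
            yield item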
    
    
    

    3. The pipeline file

    # -*- coding: utf-8 -*-
    '''
    file: pipelines.py
    '''

    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

    import json
    from scrapy.utils.project import get_project_settings
    import pymysql

    class MovieprojectPipeline(object):
        def open_spider(self, spider):
            self.fp = open("movie.json", "w", encoding="utf8")

        def process_item(self, item, spider):
            obj = dict(item)
            string = json.dumps(obj, ensure_ascii=False)
            # Write each item as one JSON line
            self.fp.write(string + '\n')
            # print("Write succeeded")
            return item

        def close_spider(self, spider):
            self.fp.close()
    
            
    class MovieMysqlPipeline(object):
        def open_spider(self, spider):
            # Read the project settings
            settings = get_project_settings()
            # Connect to the database
            host = settings['DB_HOST']
            port = settings['DB_PORT']
            user = settings['DB_USER']
            pwd = settings['DB_PWD']
            name = settings['DB_NAME']
            charset = settings['DB_CHARSET']

            self.conn = pymysql.connect(host=host, port=port, user=user, password=pwd, db=name, charset=charset)

        def process_item(self, item, spider):
            # Build a parameterized statement so pymysql escapes the values;
            # the list-valued fields (extract() returns lists) are stored as strings
            sql = 'insert into movie(post, name, type, director, design, actor, info) values(%s, %s, %s, %s, %s, %s, %s)'
            params = (item['post'], item['name'], str(item['_type']), str(item['director']),
                      str(item['design']), str(item['actor']), item['info'])

            # Get a cursor
            cursor = self.conn.cursor()

            # Execute the statement
            try:
                cursor.execute(sql, params)
                self.conn.commit()
            except Exception:
                self.conn.rollback()
            return item

        def close_spider(self, spider):
            # Close the database connection
            self.conn.close()
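
    For either pipeline to run, the project's settings.py must define the custom DB_* keys read above and register both pipelines in ITEM_PIPELINES. A minimal sketch, with placeholder host and credentials:

    # settings.py (excerpt) -- the DB_* values are placeholders
    DB_HOST = '127.0.0.1'
    DB_PORT = 3306
    DB_USER = 'root'
    DB_PWD = '123456'
    DB_NAME = 'movie'
    DB_CHARSET = 'utf8'

    ITEM_PIPELINES = {
        # Lower numbers run earlier in the pipeline chain
        'movieproject.pipelines.MovieprojectPipeline': 300,
        'movieproject.pipelines.MovieMysqlPipeline': 301,
    }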
            
      
    