  • JDBOOK
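
A Scrapy spider for JD's book section: it parses the top-level categories on the category page, follows each sub-category, extracts the fields of every book on the listing pages, and fetches each book's price from the separate p.3.cn price API (the price is not in the page HTML).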

    # -*- coding: utf-8 -*-
    import scrapy

    from BOOK.items import BookItem
    import json

    from copy import deepcopy

    class BookSpider(scrapy.Spider):
        name = 'book'

        # allowed domains: JD book pages plus the p.3.cn price API
        allowed_domains = ['jd.com', 'p.3.cn']
        start_urls = ['https://book.jd.com/booksort.html']

        index = 0  # page counter, used below to cut pagination short

        # 1. Parse each top-level category name, keeping the <dt> (needed to reach the sub-categories)
        def parse(self, response):
            dt_list = response.xpath('//*[@id="booksort"]/div[2]/dl/dt')

            for dt in dt_list[:1]:  # [:1] limits the crawl to the first category while testing
                item = BookItem()
                item['big_name'] = dt.xpath('.//a/text()').extract_first()

                # 2. Extract each sub-category <dd>: its name and URL
                em_list = dt.xpath('./following-sibling::*[1]/em')

                for em in em_list[:1]:  # likewise limited to one sub-category for testing
                    item['small_name'] = em.xpath('./a/text()').extract_first()
                    item['small_link'] = 'https:' + em.xpath('./a/@href').extract_first()

                    # 3. Request each sub-category page
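                    # deepcopy is required: Scrapy handles these requests
                    # asynchronously while the loops keep mutating the same
                    # item, so each request must carry its own snapshot.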
                    yield scrapy.Request(
                        item['small_link'],
                        callback=self.parse_book_info,
                        meta={'key': deepcopy(item)}
                    )

        # Parse the data for every book on a listing page
        def parse_book_info(self, response):

            # Receive the item passed along from the sub-category request
            item = response.meta['key']

            # 3.1 The list of books on this page
            book_list = response.xpath('//*[@id="plist"]/ul/li')

            # 3.2 Walk the books on the page (about 100 per page) and extract their info
            for book in book_list[:1]:  # [:1] keeps one book per page while testing
                # Cover image
                item['book_img_src'] = book.xpath('.//div[@class="p-img"]/a/img/@src').extract_first()

                # Title
                item['book_name'] = book.xpath('.//div[@class="p-name"]/a/em/text()').extract_first()

                # Author
                item['book_auth'] = book.xpath('.//span[@class="p-bi-name"]/span/a/text()').extract_first()
                # Publisher
                item['book_store'] = book.xpath('.//span[@class="p-bi-store"]/a/text()').extract_first()
                # Publication date
                item['book_time'] = book.xpath('.//span[@class="p-bi-date"]/text()').extract_first()

                # The price is filled in client-side via a JSONP (cross-domain) callback, so query the price API directly
                price_link = 'https://p.3.cn/prices/mgets?ext=11000000&pin=&type=1&area=1_72_4137_0&skuIds={}'
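                # data-sku on the book's <div> is assumed to hold JD's product id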
                skuId = book.xpath('./div/@data-sku').extract_first()

                # Request this book's price
                yield scrapy.Request(
                    price_link.format(skuId),
                    callback=self.parse_price,
                    meta={'key': deepcopy(item)}
                )

            self.index += 1
            # 4. Pagination: decide when to stop
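            # (a[10] inside #J_bottomPage is assumed to be the "next page" anchor)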
            next_url = response.xpath('//*[@id="J_bottomPage"]/span[1]/a[10]/@href').extract_first()

            if next_url is not None:

                # While validating the data, stop after a few pages (index > 3, i.e. 4 pages)
                if self.index > 3:
                    return

                yield response.follow(
                    next_url,
                    callback=self.parse_book_info,
                    meta={'key': item}
                )

        # Parse the price response
        def parse_price(self, response):
            print(response.body.decode())  # debug: dump the raw price JSON
            # Receive the item passed along from the book request
            item = response.meta['key']
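            # The price API is assumed to return a JSON array such as
            # [{"op": "75.00", "p": "46.50", "m": "75.00"}]; 'op' is used as the price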
            item['book_price'] = json.loads(response.body.decode())[0]['op']
            # Hand the item to the engine, which routes it to the pipelines
            yield item
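
The spider imports BookItem from BOOK.items, which the post does not show. Below is a minimal sketch of what that items.py would look like, assuming it declares exactly the fields the spider assigns (the real file may differ):

    # -*- coding: utf-8 -*-
    # BOOK/items.py -- assumed definition, reconstructed from the fields used above
    import scrapy

    class BookItem(scrapy.Item):
        big_name = scrapy.Field()      # top-level category name
        small_name = scrapy.Field()    # sub-category name
        small_link = scrapy.Field()    # sub-category URL
        book_img_src = scrapy.Field()  # cover image URL
        book_name = scrapy.Field()     # title
        book_auth = scrapy.Field()     # author
        book_store = scrapy.Field()    # publisher
        book_time = scrapy.Field()     # publication date
        book_price = scrapy.Field()    # price fetched from p.3.cn

With the item defined, the spider runs with the standard Scrapy CLI from the project root, e.g. scrapy crawl book -o books.json to dump the collected items.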

  • Original post: https://www.cnblogs.com/hanjian200ok/p/9534447.html