  • Scraping JD.com books

    # -*- coding: utf-8 -*-
    import scrapy
    from copy import deepcopy
    import json
    import urllib.parse

    class BookjdSpider(scrapy.Spider):
        name = 'bookjd'
        allowed_domains = ['jd.com', 'p.3.cn']  # the price API lives on p.3.cn, so that domain must be allowed too
        start_urls = ['https://book.jd.com/booksort.html']
    
        def parse(self, response):
            # Approach 1: pair each dt (top-level category) with its dd (subcategory list)
            # dt_list = response.xpath("//div[@class='mc']/dl/dt")
            # dd_list = response.xpath("//div[@class='mc']/dl/dd")
            # for dt, dd in zip(dt_list, dd_list):
            #     item = {}
            #     item["dafenlei"] = dt.xpath("./a/text()").extract_first()
            #     em_list = dd.xpath("./em")
            #     for em in em_list:
            #         item["fenlei"] = em.xpath("./a/text()").extract_first()
            #         print(item)
            # Approach 2: from each dt, take the immediately following dd via following-sibling
            dt_list = response.xpath("//div[@class='mc']/dl/dt")
            for dt in dt_list:
                item = {}
                item["dafenlei"] = dt.xpath("./a/text()").extract_first()
                em_list = dt.xpath("./following-sibling::dd[1]/em")
                for em in em_list:
                    item["fenlei"] = em.xpath("./a/text()").extract_first()
                    # the href comes in two forms, e.g.:
                    #   //list.jd.com/1713-3258-3297.html
                    #   https://list.jd.com/list.html?cat=1713,3258,3297&tid=3297
                    item["href"] = em.xpath("./a/@href").extract_first()
                    # Alternatively, the list URL can be rebuilt from the category ids:
                    # num1, num2, num3 = item["href"].rsplit('/')[3].rsplit('.')[0].split('-')
                    # item["href"] = "https://list.jd.com/list.html?cat={},{},{}&tid={}".format(num1, num2, num3, num3)
                    # skip entries without a link (scrapy.Request rejects a None URL)
                    if item["href"] is not None:
                        item["href"] = "https:" + item["href"]
                        print(item)
                        yield scrapy.Request(
                            item["href"],
                            callback=self.book_list,
                            meta={"item": deepcopy(item)},
                        )

        def book_list(self, response):
            item = response.meta["item"]
            li_list = response.xpath("//div[@id='plist']/ul/li")
            for li in li_list:
                item["book_title"] = li.xpath(".//div[@class='p-name']/a/em/text()").extract_first().strip()
                item["book_img"] = li.xpath(".//div[@class='p-img']/a/img/@src").extract_first()
                if item["book_img"] is None:
                    item["book_img"] = li.xpath(".//div[@class='p-img']/a/img/@data-lazy-img").extract_first()
                item["book_img"] = "https:"+ item["book_img"] if item["book_img"] is not None else None
                item["book_href"] = li.xpath(".//div[@class='p-name']/a/@href").extract_first()
                item["book_publish"] = li.xpath(".//span[@class='p-bi-store']/a/text()").extract_first()
                item["book_sku"] = li.xpath("./div/@data-sku").extract_first()
                print(item)
                # Price API URL for a batch of books; skuIds takes comma-separated
                # J_<sku> values (%2C is the URL-encoded comma), e.g.:
                # https://p.3.cn/prices/mgets?callback=jQuery5548003&ext=11101000&pin=&type=1&area=1_72_4137_0&skuIds=J_11757834%2CJ_12090377%2C...
                # &pdbp=0&pdtk=&pdpin=&pduid=15498906230231700222016&source=list_pc_front&_=1549895670335
                # Most of those parameters are optional and can be dropped.
                yield scrapy.Request(
                    "https://p.3.cn/prices/mgets?skuIds=J_{}".format(item["book_sku"]),  # price for a single book
                    callback=self.parse_book_price,
                    meta={"item": deepcopy(item)},
                )
            # follow pagination within the current category
            next_url = response.xpath("//a[@class='pn-next']/@href").extract_first()
            if next_url is not None:
                next_url = urllib.parse.urljoin(response.url, next_url)
                yield scrapy.Request(
                    next_url,
                    callback=self.book_list,
                    meta={"item": item},
                )
    
        def parse_book_price(self, response):
            item = response.meta["item"]
            # the API returns a JSON array; "op" is the price field used here
            item["book_price"] = json.loads(response.body.decode())[0]["op"]
            yield item

    settings.py

    # Use scrapy-redis's duplicate filter
    DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
    # Use scrapy-redis's scheduler
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    # Keep the scrapy-redis queues in Redis instead of clearing them, so a
    # crawl can be paused and resumed later
    SCHEDULER_PERSIST = True
    # REDIS_URL = "redis://192.168.170.141:6379"
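    # REDIS_URL is an alternative to the REDIS_HOST/REDIS_PORT/REDIS_PARAMS
    # settings below; a password can be embedded as redis://:password@host:port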
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
    
    REDIS_HOST = '192.168.170.141'            # Redis host
    REDIS_PORT = 6379                         # Redis port
    REDIS_PARAMS = {'password': 'xxxxx'}      # Redis connection parameters
    REDIS_ENCODING = "utf-8"
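
    With the scheduler and dupefilter above, scraped items can also be mirrored
    into a Redis list by enabling the pipeline that ships with scrapy-redis. A
    minimal sketch of the extra setting (300 is just an arbitrary mid-range
    pipeline priority):

    # Optional: push every scraped item into Redis as well
    ITEM_PIPELINES = {
        'scrapy_redis.pipelines.RedisPipeline': 300,
    }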

    Project repo: https://github.com/CH-chen/jdbook

  • Original post: https://www.cnblogs.com/chvv/p/10365015.html