  • JD book crawler (京东图书爬虫)

    Result: (output screenshots omitted)

    >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

    version_0

    Analysis:

    The API returns JSON.
    Top-level categories come from https://lapi.jd.com/pool?body={%22pid%22:%220101947%22}&source=book; each poolId corresponds to one top-level title:
    ```
    title: "文学综合馆",
    operate_title: "",
    sort_num: 2,
    fid: 0,
    id: 7369,
    ext_info: {
        poolId: "0101945"
    }
    ```
    This is a fragment of the response: each title pairs with one poolId inside the same dict.
    With a top-level poolId in hand, we can build the next request, e.g. https://lapi.jd.com/pool?body={%22pid%22:%220101945%22}&source=book, which returns the mid-level titles, the small (leaf) titles, and each small title's URL, as the sketch below illustrates.
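    As a quick sanity check outside Scrapy, the two pool requests can be sketched with requests (an assumption — the spider itself goes through scrapy.Request); the field names (data, ext_info, poolId, children, link_value) are the ones seen in the responses above:
    ```
    import json
    from urllib.parse import quote

    import requests  # assumption: used only for this manual check

    POOL_API = "https://lapi.jd.com/pool?body={}&source=book"

    def pool_url(pid):
        # body is the URL-encoded JSON {"pid":"<poolId>"}; the %22 in the
        # original URLs is simply an encoded double quote
        return POOL_API.format(quote(json.dumps({"pid": pid}, separators=(",", ":"))))

    big = requests.get(pool_url("0101947")).json()
    for entry in big["data"][1:-4]:        # same slice the spider uses below
        pid = entry["ext_info"]["poolId"]
        print(entry["title"], pid)
        mid = requests.get(pool_url(pid)).json()
        for m in mid["data"][2:]:
            for s in m.get("children", []):
                print("  ", m["title"], "/", s["title"], "->", s["link_value"])
        break  # one top-level category is enough for a smoke test
    ```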
    Requesting a small title's URL yields the book list, e.g. https://list.jd.com/listNew.php?cat=1713%2C3260%2C3345&page=150. That page embeds the pagination state in a script block: SEARCH.adv_param={page:"150",page_count:"200",psort:"0",cid1: 0, ...}
    Here page_count is the total number of pages and page is the current page. Prices need no separate request; they are included in this same page. That completes the page analysis.
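    A minimal sketch of that page-count extraction (again assuming requests as a stand-in for Scrapy's response object; the regex is the same one the spider uses):
    ```
    import re

    import requests  # assumption: illustrative only; JD may expect browser-like headers

    url = "https://list.jd.com/listNew.php?cat=1713%2C3260%2C3345&page=1"
    html = requests.get(url).text
    # SEARCH.adv_param={page:"1",page_count:"200",...} sits in an inline script
    count_page = int(re.findall(r'page_count:"(.*?)"', html, re.S)[0])
    page = int(re.findall(r'page:"(.*?)"', html, re.S)[0])
    print("page", page, "of", count_page)
    ```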

    <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

    Source code

    # -*- coding: utf-8 -*-
    import scrapy
    import json
    from copy import deepcopy
    from pprint import pprint
    import re
    from urllib import parse
    
    
    class JdtsSpider(scrapy.Spider):
        name = 'jdts'
        # allowed_domains = ['book.jd.com','lapi.jd.com','coll.jd.com','list.jd.com','search.jd.com']
        allowed_domains = ["jd.com"]
        start_urls = ['https://lapi.jd.com/pool?body={%22pid%22:%220101947%22}&source=book']
    
        def parse(self, response):
            json_obj = json.loads(response.body.decode())
            data_list = json_obj["data"][1:-4]
            for data in data_list:
                # grab the top-level category title and its poolId;
                # a fresh dict per iteration so requests don't share state
                item = dict()
                item["b_title"] = data["title"]
                item["poolId"] = data["ext_info"]["poolId"]
                # use the poolId to request the mid- and small-level categories
                temporary_var = '"pid":"{}"'.format(item["poolId"])
                next_url = 'https://lapi.jd.com/pool?body={' + temporary_var + '}&source=book'
                yield scrapy.Request(
                    url=next_url,
                    callback=self.parse_poolid,
                    meta={"item": item}
                )
                break   # debug aid: only the first top-level category is crawled
    
        def parse_poolid(self, response):
            item = response.meta["item"]

            json_obj = json.loads(response.body.decode())
            data_list = json_obj["data"][2:]

            # walk the mid-level categories
            for data in data_list:
                item["m_title"] = data["title"]
                s_item_list = data["children"]
                # and each mid-level category's small (leaf) categories
                for s_item in s_item_list:
                    item["s_title"] = s_item["title"]
                    item["s_href"] = s_item["link_value"]
                    yield scrapy.Request(
                        url=item["s_href"],
                        callback=self.parse_s_href,
                        # deepcopy: item is mutated on every loop pass, so each
                        # request needs its own snapshot
                        meta={"item": deepcopy(item)}
                    )
    
        def parse_s_href(self, response):
            item = deepcopy(response.meta["item"])
            print("-" * 20)
            print("response url:", response.url)
            # total page count, embedded in the page's SEARCH.adv_param script
            count_page = int(re.findall(r'page_count:"(.*?)"', response.body.decode(), re.S)[0])
            print("count_page:", count_page)
            # book list: one chunk of html per book
            content_list = re.findall(r'class="p-img">(.*?)<div class="p-commit">', response.body.decode(), re.S)
            for content in content_list:
                # per-book info; a fresh dict each iteration so books don't share state
                book_info = dict()
                item["book_info"] = book_info
                # the second <em> in the chunk is the book title
                item["book_info"]["book_title"] = re.findall(r'<em>(.*?)</em>', content, re.S)[1]
                item["book_info"]["book_href"] = re.findall(r'href="(.*?)"', content, re.S)[0]
                item["book_info"]["book_href"] = parse.urljoin(response.url, item["book_info"]["book_href"])
                item["book_info"]["book_price"] = re.findall(r'<i>(.*?)</i>', content, re.S)[0]
                yield scrapy.Request(
                    url=item["book_info"]["book_href"],
                    callback=self.parse_detail,
                    meta={"item": deepcopy(item)}
                )
            # paginate through the remaining pages (2..count_page inclusive);
            # Scrapy's dupe filter drops the repeats generated when later pages
            # run through this loop again
            num = 2
            while num <= count_page:
                if "&page=" in response.url:
                    next_page_href = response.url.split("&page", 1)[0] + "&page={}".format(num)
                else:
                    next_page_href = response.url + "&page={}".format(num)
                yield scrapy.Request(
                    url=next_page_href,
                    callback=self.parse_s_href,
                    meta={"item": response.meta["item"]}
                )
                num += 1
    
        def parse_detail(self, response):
            item = response.meta["item"]
            item["book_info"]["book_author"] = response.xpath("//div[@class='p-author']/a/text()").extract_first()
            # the long description lives behind a separate request, one of two patterns:
            #   https://dx.3.cn/desc/<skuId>
            #   https://cd.jd.com/description/channel?skuId=<skuId>&mainSkuId=<mainSkuId>
            skuid = re.findall(r'com/(.*?)\.html', response.url, re.S)[0]
            mainskuid = re.findall(r"mainSkuId:'(.*?)'", response.body.decode(), re.S)
            if mainskuid:
                mainskuid = mainskuid[0]
                url = "https://cd.jd.com/description/channel?skuId={}&mainSkuId={}"
                next_url = url.format(skuid, mainskuid)
            else:
                url = "https://dx.3.cn/desc/{}"
                next_url = url.format(skuid)
            item["book_info"]["book_description"] = next_url
            print("*" * 20)
            print("description url:", next_url)
            print("*" * 20)
            pprint(item)
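    The usual way to run this is scrapy crawl jdts from inside a Scrapy project. If the project scaffolding is not at hand, a CrawlerProcess driver also works (a sketch — the module name jdts is an assumption):
    ```
    # assumption: the spider above lives in jdts.py next to this script
    from scrapy.crawler import CrawlerProcess

    from jdts import JdtsSpider

    process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
    process.crawl(JdtsSpider)
    process.start()  # blocks until the crawl finishes
    ```
    In a real pipeline the final pprint(item) would normally become yield item so the data can flow into Scrapy's item pipelines; printing is what produces the output the post's screenshots refer to.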
  • Original post: https://www.cnblogs.com/nuochengze/p/12981734.html