  • Python crawler practice 2: Suning book information

    For learning purposes only; commercial use is prohibited, at your own risk.

    import scrapy, requests, json, re
    from copy import deepcopy
    from bs4 import BeautifulSoup
    from urllib.parse import urlencode
    
    
    class BookSpider(scrapy.Spider):
        name = 'book'
        allowed_domains = ['suning.com']
        start_urls = ['http://book.suning.com/']
    
        def parse(self, response):
            """
            进入图书首页,分析图书类别和子类别
            该方法仅解析了一个子类别图书
            :param response:
            :return:
            """
            one_types = response.xpath('//div[@class="menu-list"]/div[@class="menu-item"]')
            two_types = response.xpath('//div[@class="menu-list"]/div[@class="menu-sub"]/div[@class="submenu-left"]')
            for one_index, menu_item in enumerate([one_types[0]]):
                one_type = menu_item.xpath('./dl/dt/h3/a/text()').extract_first()
                two_type_list = two_types[one_index].xpath('./p[@class="submenu-item"]/a/text()').extract()
                for two_index, two_type_item in enumerate([two_type_list[0]]):
                    two_type = two_type_item
                    three_types = two_types[one_index].xpath('./ul')[two_index].xpath('./li')
                    for three_type_a in [three_types[0]]:
                        three_type = three_type_a.xpath('./a/text()').extract_first()
                        url = three_type_a.xpath('./a/@href').extract_first()
                        item = {}
                        item["one_type"] = one_type
                        item["two_type"] = two_type
                        item["three_type"] = three_type
                        item["type_url"] = url
                        yield scrapy.Request(item["type_url"], callback=self.get_book_page_num, meta={"item": deepcopy(item)})
    
        def get_book_page_num(self, response):
            """进入图书列表页面获取每个图书详情地址
            由于列表页面默认只会加载30条数据,因此通过接口的方式获取图书详情地址
            该方法仅解析了第一页数据
            """
            item = response.meta.get("item", {})
            page_num = int(response.xpath('//div[@id="bottom_pager"]/a[@role="menuitem"]')[-1].xpath('./@pagenum').extract_first())
            item["page_num"] = page_num
            ci = item["type_url"].split("-")[1]
            for i in range(1):
                params = [
                    ('ci', str(ci)),
                    ('pg', '03'),
                    ('cp', str(i)),
                    ('il', '0'),
                    ('iy', '0'),
                    ('adNumber', '0'),
                    ('n', '1'),
                    ('ch', '4'),
                    ('prune', '0'),
                    ('sesab', 'ACBAABC'),
                    ('id', 'IDENTIFYING'),
                    ('cc', '089'),
                ]
                book_list_api = "https://list.suning.com/emall/showProductList.do?" + urlencode(params)
                # fetch the first 30 items
                yield scrapy.Request(book_list_api, callback=self.parse_book_list, meta={"item": deepcopy(item)})
                # fetch the next 30 items; the extra paging params change the URL,
                # so the request is not dropped by Scrapy's duplicate filter
                params.append(('paging', '1'))
                params.append(('sub', '0'))
                book_list_api = "https://list.suning.com/emall/showProductList.do?" + urlencode(params)
                yield scrapy.Request(book_list_api, callback=self.parse_book_list, meta={"item": deepcopy(item)})
    
        def parse_book_list(self, response):
            """
            接口返回的数据为存在缺失的html代码,xpath解析有误,因此使用BeautifulSoup解析获取详情页地址
            :param response:
            :return:
            """
            item = response.meta.get("item", {})
            soup = BeautifulSoup(response.text, "lxml")
            books = soup.find_all('a', attrs={'class': 'sellPoint'})
            for book in books:
                detail_url = "https:" + book.get('href')
                yield scrapy.Request(detail_url, callback=self.parse_book_detail, meta={"item": deepcopy(item)})
    
        def parse_book_detail(self, response):
            """
            解析详情页获取图书名称、价格、作者、出版社信息
            由于详情页有反爬措施,xpath无法解析因此使用BeautifulSoup
            :param response:
            :return:
            """
            price = self.get_price(response)
            item = response.meta.get("item", {})
            soup = BeautifulSoup(response.text, "html.parser")
            li_list = soup.find_all('li', attrs={'class': 'pb-item'})
            if len(li_list) > 0: item["author"] = self.replace(li_list[0].text)
            if len(li_list) > 1: item["press"] = self.replace(li_list[1].text)
            if len(li_list) > 2: item["time"] = self.replace(li_list[2].text)
            name = soup.find('h1', attrs={"id": "itemDisplayName"}).text.replace("\n", "").replace("\u3000", " ")
            image_url = response.xpath('//div[@class="imgzoom-main"]/a/img/@src').extract_first()
            item["name"] = name
            item["price"] = price
            item["image_url"] = "https:" + image_url
            yield item
    
        def get_price(self, response):
            """
            获取价格
            通过接口分析参数后发现仅passPartNumber、vendorCode控制价格信息因解析该参数即可
            由于详情页有反爬措施,xpath无法解析因此使用BeautifulSoup
            :param response:
            :return:
            """
            passPartNumber_str = re.findall(r'"passPartNumber":"[0-9]*?"', response.text)[0]
            passPartNumber = passPartNumber_str.split('"')[-2]
            vendorCode_str = re.findall(r'"vendorCode":"[0-9]*?"', response.text)[0]
            vendorCode = vendorCode_str.split('"')[-2]
            url = "https://pas.suning.com/nspcsale_0_{}_{}_{}_300_089_0890199_502282_1000347_8999_100138_Z001___R9011205_3.0____0001400PA____0___16.0_2__502320_502687_.html?callback=pcData&_=1637305043921".format(
                passPartNumber, passPartNumber, vendorCode
            )
            r = requests.get(url=url)
            json_data = r.text.replace("pcData(", "")[:-2]
            price = json.loads(json_data)["data"]["price"]["saleInfo"][0]["netPrice"]
            return price
    
        def replace(self, text):
            """Clean up special characters:
            1. remove \n
            2. remove \t
            3. remove plain spaces (" ")
            4. replace \u3000 and \xa0 with a normal space
            """
            return text.replace("\n", "").replace("\t", "").replace(" ", "").replace("\u3000", " ").replace("\xa0", " ")

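    The price endpoint answers with a JSONP payload of the form `pcData({...});`, which `get_price` unwraps by string slicing before calling `json.loads`. Below is a minimal sketch of a slightly more defensive unwrap; the sample payload is hypothetical and abbreviated, real responses carry many more fields.

    import json, re

    # Hypothetical, abbreviated payload for illustration only.
    sample = 'pcData({"data": {"price": {"saleInfo": [{"netPrice": "39.80"}]}}});'

    # Strip the pcData(...) wrapper with a regex instead of fixed-offset slicing,
    # so a missing trailing semicolon or stray whitespace does not break parsing.
    match = re.search(r'pcData\((.*)\)\s*;?\s*$', sample, re.S)
    payload = json.loads(match.group(1))
    print(payload["data"]["price"]["saleInfo"][0]["netPrice"])  # -> 39.80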

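    The spider yields its items in `parse_book_detail`, so it can be tried without a full Scrapy project by driving it with `CrawlerProcess`. A minimal sketch, assuming the class above lives in the same file (note the site's markup and APIs may have changed since this was written):

    from scrapy.crawler import CrawlerProcess

    if __name__ == "__main__":
        process = CrawlerProcess(settings={
            "LOG_LEVEL": "INFO",
            "DOWNLOAD_DELAY": 1,  # be polite: throttle requests
            "FEEDS": {"books.json": {"format": "json"}},  # export yielded items
        })
        process.crawl(BookSpider)
        process.start()  # blocks until the crawl finishes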
  • Original post: https://www.cnblogs.com/fuchenjie/p/15578045.html