  • Scraping custom furniture data from JD.com

    # -*- coding: utf-8 -*-
    
    import time

    import scrapy
    from jiaju.items import JiajuItem
    
    class DingzhiSpider(scrapy.Spider):
        name = 'dingzhi'
        #allowed_domains = ['https://search.jd.com/']
        baseURL= 'https://search.jd.com/search?keyword=%E5%AE%9A%E5%88%B6%E5%AE%B6%E5%85%B7&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E5%AE%9A%E5%88%B6%E5%AE%B6%E5%85%B7&stock=1&page='
        offset = 1
        start_urls = [baseURL + str(offset)]
    
        def parse(self, response):
            # If not using the item pipeline, uncomment jjs to collect all items in a list instead
            # jjs = []
            quanbu = response.xpath('//*[@id="J_goodsList"]/ul/li')
            a = 0  # debug counter for processed list items
            print(len(quanbu))
            for fen in quanbu:
                a += 1
                print(a)
                jj = JiajuItem()
                try:
                    # Standard list-item layout: price, title, product link
                    if len(fen.xpath('./div/div[3]/strong/i/text()').extract()[0]):
                        jj['jg'] = fen.xpath('./div/div[3]/strong/i/text()').extract()[0]
                    else:
                        jj['jg'] = ''
                    jj['name'] = fen.xpath('./div/div[4]/a/em/text()').extract()[0]
                    jj['dz'] = fen.xpath('./div/div[1]/a/@href').extract()[0]
                except IndexError:
                    # Some list items use a different layout; fall back to these XPaths
                    print(1111)  # these three prints were only for debugging
                    time.sleep(5)
                    print(fen.xpath('.').extract())
                    if len(fen.xpath('./div/div[3]/strong/@data-price').extract()[0]):
                        jj['jg'] = fen.xpath('./div/div[3]/strong/@data-price').extract()[0]
                    else:
                        jj['jg'] = ''
                    jj['name'] = fen.xpath('./div/div[7]/span/a/text()').extract()[0]
                    jj['dz'] = fen.xpath('./div/div[7]/span/a/@href').extract()[0]
    
    
    
                # jj['jg'] = jg[0]
                # jj['name'] = name[0]
                # jj['dz'] = dz[0]
                # jjs.append(jj)
    
                # Yield each extracted jj item to the item pipeline for processing;
                # control returns here afterwards and the loop continues (comment out the jjs list if you use this)
                yield jj
    
            if self.offset < 199:
                # JD search pages use odd page numbers (1, 3, 5, ...), so step by 2
                self.offset += 2
                url = self.baseURL + str(self.offset)
                yield scrapy.Request(url, callback=self.parse)

    ....
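    The spider imports JiajuItem from jiaju.items, which the excerpt above does not show. A minimal sketch of what that items file presumably looks like, assuming only the three fields the spider assigns (jg for the price, name for the product title, dz for the product URL):

    # jiaju/items.py -- sketch, assuming only the fields the spider assigns
    import scrapy

    class JiajuItem(scrapy.Item):
        jg = scrapy.Field()    # price
        name = scrapy.Field()  # product title
        dz = scrapy.Field()    # product URL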

    Full code: https://github.com/mysteriousKiller/jingdong
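    The comment about yielding each item to the pipeline implies an item pipeline is enabled in settings.py. The pipeline itself is not shown above; here is a minimal sketch of one possibility (the class name JiajuPipeline and the output file dingzhi.csv are hypothetical, not taken from the repository) that writes each item to a CSV file:

    # jiaju/pipelines.py -- hypothetical sketch, not the author's actual pipeline
    import csv

    class JiajuPipeline(object):
        def open_spider(self, spider):
            # Open the output file once when the spider starts (filename is an assumption)
            self.file = open('dingzhi.csv', 'w', newline='', encoding='utf-8')
            self.writer = csv.writer(self.file)
            self.writer.writerow(['name', 'jg', 'dz'])

        def process_item(self, item, spider):
            # Write one row per yielded JiajuItem
            self.writer.writerow([item.get('name', ''), item.get('jg', ''), item.get('dz', '')])
            return item

        def close_spider(self, spider):
            self.file.close()

    For the sketch to run, it would also need to be registered in settings.py, e.g. ITEM_PIPELINES = {'jiaju.pipelines.JiajuPipeline': 300} (path hypothetical).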

  • Original post: https://www.cnblogs.com/mysterious-killer/p/10137097.html