zoukankan      html  css  js  c++  java
  • scrapy 爬取当当网产品分类

    #spider部分
    import scrapy
    from Autopjt.items import AutopjtItem
    from scrapy.http import Request  # not used by the spider any more; import kept so the name stays available


    class AutospdSpider(scrapy.Spider):
        """Spider for the Dangdang category listing ``cid4007379``.

        Listing pages 1 through 100 are seeded through ``start_urls`` and
        every downloaded page produces exactly one ``AutopjtItem`` whose
        fields are parallel lists (one entry per product found on the page).
        """

        name = "autospd"
        allowed_domains = ["dangdang.com"]
        # Seed each listing page exactly once. Previously ``parse`` re-queued
        # all 100 page URLs from every response and relied on the duplicate
        # filter to discard the repeats.
        start_urls = [
            'http://category.dangdang.com/pg' + str(page) + '-cid4007379.html'
            for page in range(1, 101)
        ]

        def parse(self, response):
            """Extract the product data of one listing page.

            Args:
                response: The downloaded listing page.

            Yields:
                A single ``AutopjtItem``; ``name``, ``price``, ``link`` and
                ``comnum`` each hold the list returned by ``extract()`` for
                the corresponding XPath.
            """
            item = AutopjtItem()
            item['name'] = response.xpath('//a[@name="itemlist-title"]/@title').extract()
            item['price'] = response.xpath('//span[@class="price_n"]/text()').extract()
            item['link'] = response.xpath('//a[@name="itemlist-title"]/@href').extract()
            item['comnum'] = response.xpath('//a[@name="itemlist-review"]/text()').extract()
            yield item

    pipeline部分

    import codecs
    import json
    
    class AutopjtPipeline(object):
        """Item pipeline that appends every scraped product to a JSON-lines file.

        Each incoming item carries parallel lists (``name``, ``price``,
        ``comnum``, ``link``); one JSON object per product is written, one
        object per line, encoded as UTF-8.
        """

        def __init__(self, path='D:/mydata.json'):
            """Open the output file for writing.

            Args:
                path: Destination file. Defaults to the location the pipeline
                    has always written to.
            """
            self.file = codecs.open(path, 'wb', encoding='utf-8')

        def process_item(self, item, spider):
            """Write one JSON line per product contained in ``item``.

            Args:
                item: Mapping with the list-valued keys ``name``, ``price``,
                    ``comnum`` and ``link``.
                spider: The spider that produced the item (unused).

            Returns:
                The unchanged ``item`` so later pipelines can process it.
            """
            # zip() stops at the shortest list, so a page on which one XPath
            # matched fewer nodes no longer raises IndexError halfway through.
            rows = zip(item['name'], item['price'], item['comnum'], item['link'])
            for name, price, comnum, link in rows:
                goods = {'name': name, 'price': price, 'comnum': comnum, 'link': link}
                # ensure_ascii=False keeps non-ASCII text readable in the file.
                self.file.write(json.dumps(goods, ensure_ascii=False) + '\n')
            return item

        def close_spider(self, spider):
            """Close the output file when the spider finishes."""
            self.file.close()

    item部分

    import scrapy
    
    
    class AutopjtItem(scrapy.Item):
        """Container for the product data scraped from one listing page.

        The accompanying spider assigns each field the list returned by an
        XPath ``extract()`` call, and the pipeline reads the fields by
        position, so entries at the same index are treated as one product.
        """
        # Product titles (``@title`` of the ``itemlist-title`` anchors).
        name = scrapy.Field()
        # Price texts (text of the ``price_n`` spans).
        price = scrapy.Field()
        # Product page URLs (``@href`` of the ``itemlist-title`` anchors).
        link = scrapy.Field()
        # Review-count texts (text of the ``itemlist-review`` anchors).
        comnum = scrapy.Field()
  • 相关阅读:
    bzoj 1977
    bzoj 3389
    bzoj 1064
    codeforces 424D
    codeforces 425C
    codeforces 425D
    codeforces 427E
    codeforces 425E
    codeforces 429D
    codeforces 429E
  • 原文地址:https://www.cnblogs.com/Erick-L/p/6835391.html
Copyright © 2011-2022 走看看