zoukankan      html  css  js  c++  java
  • 爬取100页京东商品评论

    #items.py
    import scrapy
    class InsistItem(scrapy.Item):
        """Scrapy item carrying the text of a single product comment."""

        # Comment text; the only field declared on this item.
        comment = scrapy.Field()
    
    #pipelines.py
    import json
    class InsistPipeline(object):
        """Item pipeline that appends every scraped item to ``tencent.json``.

        Each item is written as one JSON object followed by ``,`` and a
        newline, so the file is a comma-terminated list of JSON lines.
        """

        def __init__(self):
            # The handle stays open for the whole crawl and is released in
            # close_spider().
            # NOTE(review): the file is GBK-encoded; a comment containing
            # characters outside GBK (e.g. emoji) would raise
            # UnicodeEncodeError on write -- confirm GBK is really required.
            self.f = open('tencent.json', 'w', encoding='gbk')

        def process_item(self, item, spider):
            """Serialize *item* as JSON, append it to the output file, return it.

            Args:
                item: the scraped item; anything ``dict()`` accepts.
                spider: the spider that produced the item (unused here).

            Returns:
                The same *item*, unchanged.
            """
            # ensure_ascii=False keeps non-ASCII text readable instead of
            # emitting \uXXXX escapes.
            content = json.dumps(dict(item), ensure_ascii=False) + ",\n"
            self.f.write(content)
            return item

        def close_spider(self, spider):
            """Flush and close the output file once the crawl has finished."""
            self.f.close()
    
    #tengxun.py
    import scrapy
    from insist.items import InsistItem
    import json
    
    class TengxunSpider(scrapy.Spider):
        """Crawl the paginated comment API for one JD.com product.

        Pages are requested one after another: each parsed page schedules
        the next until ``max_pages`` pages have been requested. Every entry
        of a page's ``comments`` list yields one ``InsistItem`` holding the
        entry's ``content`` value.
        """

        name = 'tengxun'
        allowed_domains = ['sclub.jd.com']
        # Product page this comment feed belongs to:
        # https://item.jd.com/4432058.html
        baseURL = 'https://sclub.jd.com/comment/productPageComments.action?productId=4432058&score=0&sortType=5&pageSize=10&isShadowSku=0&rid=0&fold=1&page='
        # Index of the most recently requested page (the first page is 0).
        offset = 0
        # Total number of pages to fetch, i.e. pages 0 .. max_pages - 1.
        max_pages = 100
        start_urls = [baseURL + str(offset)]

        def parse(self, response):
            """Yield one item per comment on this page, then request the next page.

            Args:
                response: the response for one page of the comment API; its
                    body is decoded as GBK and parsed as JSON.

            Yields:
                ``InsistItem`` objects, followed by at most one
                ``scrapy.Request`` for the next page.
            """
            try:
                payload = json.loads(response.body.decode('gbk'))
            except ValueError:
                # Covers both UnicodeDecodeError and json.JSONDecodeError, so
                # an empty or non-JSON body skips this page instead of
                # aborting the whole pagination chain.
                self.logger.warning('Skipping unparsable page: %s', response.url)
                payload = {}
            if not isinstance(payload, dict):
                payload = {}

            for entry in payload.get('comments') or []:
                item = InsistItem()
                item['comment'] = entry['content']
                yield item

            # Stop once max_pages pages (0 .. max_pages - 1) were requested.
            if self.offset + 1 < self.max_pages:
                self.offset += 1
                yield scrapy.Request(self.baseURL + str(self.offset), callback=self.parse)
  • 相关阅读:
    结对编程收获
    《程序员修炼之道》读书笔记
    《梦断代码》读书笔记
    《编程珠玑》和《梦断代码》(部分) 读书笔记
    团队项目个人心得
    团队项目Alpha阶段心得感悟
    第9周读书笔记
    第8周读书笔记
    结对编程收获
    第七周读书笔记
  • 原文地址:https://www.cnblogs.com/persistence-ok/p/11576574.html
Copyright © 2011-2022 走看看