zoukankan      html  css  js  c++  java
  • Scrapy爬虫-pipeline.py

    一.文件保存

    1.分类保存

    def process_item(self, item, spider):
        """Append one chapter of *item* to a per-article text file.

        The file lives at ``novel1/<category>/<article_name>.txt`` relative to
        the current working directory; missing directories are created.

        Args:
            item: mapping providing the ``category``, ``article_name``,
                ``content_name`` and ``content`` keys.
            spider: the spider that produced the item (unused).

        Returns:
            The same *item*, unchanged.
        """
        category = "novel1/" + item['category']
        # makedirs also creates the "novel1" parent, which a bare mkdir
        # cannot do; tolerating an already-existing directory avoids the
        # check-then-create race.
        try:
            os.makedirs(category)
        except OSError:
            if not os.path.isdir(category):
                raise
        fname = category + "/" + item['article_name'] + '.txt'
        # Open per call and close right away so handles are not leaked
        # across the many items processed during a crawl.
        with codecs.open(fname, 'a', 'utf-8') as out:
            out.write(item['content_name'] + '\n')
            out.write(item['content'] + '\n')
        return item

    2.直接保存

     def __init__(self):
         """Open ``face.json`` as a UTF-8 output file and keep the handle.

         The handle is stored on ``self.filename``.
         """
         self.filename = codecs.open('face.json', mode='wb+', encoding='utf-8')
     3 
     def process_item(self, item, spider):
         """Write *item* to the output file as pretty-printed JSON.

         Args:
             item: the scraped item; it is converted with ``dict(item)``.
             spider: the spider that produced the item (unused).

         Returns:
             The same *item*, unchanged.
         """
         # ensure_ascii=False keeps non-ASCII text as-is instead of \uXXXX
         # escapes; each record is terminated by a comma and a newline.
         line = json.dumps(dict(item), ensure_ascii=False, sort_keys=True, indent=4) + ",\n"
         self.filename.write(line)
         return item
     8 
     def spider_closed(self, spider):
         """Close the output file held on ``self.filename``.

         Args:
             spider: the spider being closed (unused).
         """
         self.filename.close()

     # Scrapy calls ``close_spider`` on item pipelines when a spider finishes;
     # ``spider_closed`` would only run if it were connected to the signal of
     # that name. Exposing the hook name too lets the file actually get closed.
     close_spider = spider_closed

    3.图片下载(settings.py文件中设置保存路径    IMAGES_STORE=os.path.join(os.path.dirname(os.path.dirname(__file__)),'images')    )

     class Img699PicPipeline(object):
         """Pass-through pipeline that returns every item unchanged."""

         def process_item(self, item, spider):
             """Return *item* as received; *spider* is not used."""
             return item
     4 
     5 
     class Images699Pipeline(ImagesPipeline):
         """Image pipeline that stores each download under its item's category."""

         def get_media_requests(self, item, info):
             """Return the parent's download requests, each tagged with *item*.

             Called before the download requests are sent; this method is in
             fact what issues those requests.
             """
             requests = super(Images699Pipeline, self).get_media_requests(item, info)
             for req in requests:
                 # Attach the item so file_path() can read its category later.
                 req.item = item
             return requests

         def file_path(self, request, response=None, info=None):
             """Return the storage path for the image fetched by *request*.

             Called when an image is about to be stored, to obtain the path
             it should be saved to.
             """
             default_path = super(Images699Pipeline, self).file_path(request, response, info)
             category = request.item.get('category')
             store_root = settings.IMAGES_STORE
             category_dir = os.path.join(store_root, category)
             if not os.path.exists(category_dir):
                 os.makedirs(category_dir)
             # Drop the "full/" prefix from the parent's path so the file sits
             # directly inside the category directory.
             file_name = default_path.replace("full/", "")
             return os.path.join(category_dir, file_name)

    二、内容去重

     class DuplicatesPipeline(object):
         """Drop items containing a material id that was already seen."""

         def __init__(self):
             # Ids of every material belonging to an item that was let through.
             self.face_set = set()

         def process_item(self, item, spider):
             """Return *item* if none of its material ids has been seen before.

             Args:
                 item: mapping whose ``materials`` entry is an iterable of
                     mappings, each with an ``id`` key.
                 spider: the spider that produced the item (unused).

             Returns:
                 The same *item*, unchanged.

             Raises:
                 DropItem: if a material id was already recorded, or occurs
                     twice within this item.
             """
             # Collect ids locally first: a dropped item must not leave its
             # earlier ids in face_set, or later items sharing them would be
             # rejected as duplicates of something that never passed.
             pending = set()
             for material in item['materials']:
                 material_id = material['id']
                 if material_id in self.face_set or material_id in pending:
                     raise DropItem("Duplicate book found:%s" % item)
                 pending.add(material_id)
             self.face_set.update(pending)
             return item
    
    
  • 相关阅读:
    科研道路上培养的7种能力
    (OK) network diagnose tools
    linux-bridge-ping-high-latency-ebtables.txt
    houshoul
    Android x86 Virtual box with Internet and connection to adb?
    linux內核調試kmsg,dmesg
    Android Logging System
    Linux 日志级别(loglevel)详解
    Linux之绝处逢生------SysRq
    /proc/sysrq-trigger详解
  • 原文地址:https://www.cnblogs.com/ShadowXie/p/9699888.html
Copyright © 2011-2022 走看看