zoukankan      html  css  js  c++  java
  • scrapy下载图片到指定目录,创建缩略图,存储入库

    环境和工具:python2.7,scrapy

    实验网站:http://www.XXXX.com/tag/333.html  爬取所有兔女郎图片,下面的推荐需要过滤

    逻辑:分析网站信息,下载图片和入库需要开启ITEM_PIPELINES,开启缩略图配置,转移图片

     -----settings.py

    # Ignore robots.txt restrictions.
    ROBOTSTXT_OBEY = False
    # Politeness delay between requests, in seconds.
    DOWNLOAD_DELAY = 3
    # Disable cookies.
    COOKIES_ENABLED = False
    # Enable the image-moving pipeline and the MySQL persistence pipeline.
    ITEM_PIPELINES = {
                        'MyPicSpider.pipelines.MyImagesPipeline': 300,
                        'MyPicSpider.pipelines.MysqlPipeline': 400
                      }
    # Root directory for downloaded images.
    # BUG FIX: the original literal ended with a single backslash before the
    # closing quote ('...rabbit\'), which escapes the quote and makes the
    # whole line a SyntaxError. Backslashes must be doubled (or use a raw
    # string with no trailing backslash).
    IMAGES_STORE = 'G:\\www\\scrapy_rpo\\pic\\meinv\\rabbit\\'
    # Drop images smaller than 110x110 pixels.
    IMAGES_MIN_HEIGHT = 110
    IMAGES_MIN_WIDTH = 110
    # Generate one 270x270 thumbnail (stored under thumbs/big/) per image.
    IMAGES_THUMBS = {
        'big': (270, 270),
    }

    ------items.py

    import scrapy
    
    
    class PicspiderItem(scrapy.Item):
        """Item carrying one post's tag, its image URLs, and the resulting file paths."""
        # define the fields for your item here like:
        # name = scrapy.Field()
        tag = scrapy.Field()           # post title/tag; used as the per-post folder name
        image_urls = scrapy.Field()    # list of image URLs consumed by MyImagesPipeline
        images_data = scrapy.Field()   # NOTE(review): never populated in this file -- confirm before removing
        img_path = scrapy.Field()      # list of (image path, big-thumbnail path) tuples set by MyImagesPipeline
        img_big_path = scrapy.Field()  # NOTE(review): never populated in this file -- confirm before removing
        file_path = scrapy.Field()     # absolute target directory: IMAGES_STORE + tag

    ----pipelines.py

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    
    
    import scrapy,os,datetime
    from scrapy.pipelines.images import ImagesPipeline
    from scrapy.exceptions import DropItem
    import shutil,os,pymysql
    # 导入项目设置
    from scrapy.utils.project import get_project_settings
    #conn = pymysql.Connection(host="localhost", user="root", passwd="root", db='test', charset="UTF8")
    #cursor = conn.cursor()
    class MyImagesPipeline(ImagesPipeline):
        """Download the item's images, then move the originals and their 'big'
        thumbnails out of IMAGES_STORE into a per-tag folder, recording the
        final relative paths on the item for the MySQL pipeline."""

        # Image download root, read from the project settings (IMAGES_STORE).
        img_store = get_project_settings().get('IMAGES_STORE')

        def get_media_requests(self, item, info):
            """Schedule one download request per URL in item['image_urls']."""
            for image_url in item['image_urls']:
                yield scrapy.Request(image_url)

        def item_completed(self, results, item, info):
            """Move downloaded files into item['file_path'] and set item['img_path'].

            Raises DropItem when no image downloaded successfully.
            """
            image_paths = [x["path"] for ok, x in results if ok]
            if not image_paths:
                raise DropItem("Item contains no images")
            file_path = item['file_path']
            # BUG FIX: os.mkdir fails when intermediate directories are missing;
            # os.makedirs creates the whole chain.
            if not os.path.exists(file_path):
                os.makedirs(file_path)
            pic_list = []
            for rel_path in image_paths:
                # rel_path looks like 'full/80dd7db02e4da4e63f05d9d49c1092fc7fdcb43e.jpg'
                pic_name = rel_path.replace('full/', '')
                pic_big_name = pic_name.replace('.jpg', '') + '_b.jpg'
                # BUG FIX: the original built Windows paths with string literals
                # like 'full\' whose trailing backslash escaped the closing
                # quote (SyntaxError). os.path.join is correct on any OS.
                # Move the original image into the per-tag folder.
                shutil.move(os.path.join(self.img_store, 'full', pic_name),
                            os.path.join(file_path, pic_name))
                # Move the 'big' thumbnail alongside it, renamed with a _b suffix.
                shutil.move(os.path.join(self.img_store, 'thumbs', 'big', pic_name),
                            os.path.join(file_path, pic_big_name))
                # Relative web paths stored in the DB by MysqlPipeline.
                pic_list.append(('picture/meinv/rabbit/' + item['tag'] + "/" + pic_name,
                                 'picture/meinv/rabbit/' + item['tag'] + "/" + pic_big_name))
            item["img_path"] = pic_list
            return item
    
    ##入库
    class MysqlPipeline(object):
        def __init__(self):
            self.conn = pymysql.Connection(host="localhost", user="root", passwd="root", db='test1', charset="UTF8")
            # 创建指针
            self.cursor = self.conn.cursor()
        def process_item(self, item, spider):
            ###组装数据
            list = []
            datetime_now  =datetime.datetime.now()
            datetime_now = datetime.datetime.now()
            datetime_str = '{0}-{1}-{2} {3}:{4}:{5}'.format(datetime_now.year, datetime_now.month, datetime_now.day,datetime_now.hour, datetime_now.minute, datetime_now.second)
            ##增加type
            result = self.cursor.execute(u"select id from network_type where RESOURCETYPE ='p' and TYPENAME='{0}'".format(item['tag']))
            if result==0:
                self.cursor.execute("insert into network_type(PID,RESOURCETYPE,TYPENAME)values(%s,%s,%s) ",(2415,'p',item['tag']))
                typeid = self.cursor.lastrowid
                self.conn.commit()
            else:
                #tag_id = self.cursor.fetchall()
                #typeid = tag_id[0][0]
                return False
    
            types = ','+str(typeid)+','
            #print item['img_path']
            self.cursor.execute('select  id from network_picture order by cast(id as SIGNED INTEGER) desc limit 0,1')
            old_id = self.cursor.fetchone()
            if old_id:
                id_n = str(int(old_id[0]) + 1)
            else:
                id_n = str(1)
            for v in item['img_path']:
                path1 = v[0]
                path2 = v[1]
                self.cursor.execute(u'select  id from network_picture where FILEPATH="{0}" and fileScalPath="{1}"'.format(path1,path2))
                data = self.cursor.fetchone()
                if data:
                    print u'该数据已经存在'
                else:
                    a = (str(id_n),'',path1,'',types,0,datetime_str,path2)
                list.append(a)
                id_n = int(id_n) + 1
            print list
            self.cursor.executemany("insert into network_picture(ID,NAME,FILEPATH,FILESIZE,TYPES,STATUS,DATETIME,fileScalPath)values(%s,%s,%s,%s,%s,%s,%s,%s)", list)
            self.conn.commit()
            return item

    ----spider.py

    # -*- coding: utf-8 -*-
    import scrapy,os,urllib2
    from scrapy.linkextractors import LinkExtractor   ##引入linkextractors  用于筛选链接和跟进链接,还有很多功能,可以去百度下
    from scrapy.spiders import CrawlSpider, Rule     ##定义spider的模板,引入Rule规则
    from MyPicSpider.items import PicspiderItem      ##引入定义的items.py
    # 导入项目设置
    from scrapy.utils.project import get_project_settings
    from bs4 import BeautifulSoup
    import time,pymysql
    # Request headers for the per-page urllib2 fetches inside parse_item.
    # NOTE(review): the header name should be 'User-Agent'; 'User_agent' is
    # likely ignored by the server -- confirm the site still returns the images.
    headers = {'User_agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'}
    # Module-level MySQL connection, opened at import time; currently only used
    # by the commented-out duplicate check in parse_item.
    conn = pymysql.Connection(host="localhost", user="root", passwd="root", db='test1', charset="UTF8")
    # Create the cursor.
    cursor = conn.cursor()
    class PicSpider(CrawlSpider):    ##继承模板CrawlSpider 普通模板继承Spider
        name = 'pic'     ###定义spider名    运行---$ scrapy crawl blog
        allowed_domains = ['www.xxxx.com']    ##  定义查找范围
        start_urls = ['http://www.xxxx.com/tag/333.html']   ###初始url
        ####当有follow=True  则会跟进该页面
        ####原理就是  spider在初始页面查找,同时查找帖子详情页的url和下一个分页,同时跟进下一个分页页面,继续查找下一个分页页面和上面的详情页url,详情页面使用回调函数进行采集
        rules = (
            ###爬去索引页并跟踪其中链接
            ###查找start_urls  所有的分页页面
            Rule(LinkExtractor(allow=r'/tag/[0-9]*_[0-9]*.html'),follow=True),
            ###爬去items页面并将下载响应返回个头parse_item函数
            ####查询每个分页页面的详情页
            Rule(LinkExtractor(allow=r'http://www.xxxx.com/ent/[a-z]*/[0-9]*/[0-9]*.html'), callback='parse_item', follow=False,),
        )
        ####详情页面回调函数
        def parse_item(self,response):
            start_url = response.url
            item = PicspiderItem()
            tag_name = response.xpath('//h1[@class="articleV4Tit"]/text()').extract()[0]
            # cursor.execute(u'select id from network_type  where PID=258 AND TYPENAME="{0}" limit 0,1'.format(tag_name))
            # old_id = cursor.fetchone()
            # if old_id:
            #     exit()
            name = u'兔'
            if name in tag_name:
                pass
            else:
                print u'----这是其他的分类----'
                return False
            li_list =  response.xpath('//ul[@class="articleV4Page l"]/li').extract()
            srcs = []
            for v in range(1, (len(li_list) - 3)):
                if v == 1:
                    url_s = start_url
                else:
                    url_s = start_url.replace('.html', '') + '_' + str(v) + '.html'
                try:
                    request = urllib2.Request(url_s, headers=headers)
                    response = urllib2.urlopen(request, timeout=200).read()
                except urllib2.URLError, err:
                    print err, '错误的url' + url
                obj = BeautifulSoup(response, 'html.parser')
                try:
                    pic_url = obj.find('center').find('img')['src']
                except:
                    print u'----第一种获取方式失败----'
                    try:
                        pic_url = obj.find('div', {'id': 'picBody'}).find('img')['src']
                    except:
                        print u'----第二种方式获取失败----'
                        try:
                            pic_url = obj.find('p', attrs={"style": "text-align: center"}).find('img')['src']
                        except:
                            print u'----第三种获取方式失败----'
                srcs.append(pic_url)
            item['tag'] = tag_name
            item['file_path'] = '%s%s' %(get_project_settings().get('IMAGES_STORE'),tag_name)
            item['image_urls'] = srcs
            return item

    ------scrapy的去重方面我还不是特别了解,有知道的大佬可以告知本白,谢谢。

  • 相关阅读:
    命令行参数解析
    业务
    从0开始架构二
    从0开始架构读书笔记
    增加ldl
    工具论
    go的web框架的context回调的原理
    id生成器雪花算法和雪花算法的sony实现
    软件架构师应该知道的97件事(六)
    进程通信简介
  • 原文地址:https://www.cnblogs.com/shuangzikun/p/python_taotao_scrapy_pic_mysql.html
Copyright © 2011-2022 走看看