zoukankan      html  css  js  c++  java
  • 爬虫 Scrapy框架 爬取图虫图片并下载

    items.py,根据需求确定自己的数据要求

     1 # -*- coding: utf-8 -*-
     2 
     3 # Define here the models for your scraped items
     4 #
     5 # See documentation in:
     6 # https://doc.scrapy.org/en/latest/topics/items.html
     7 
     8 import scrapy
     9 
    10 
    class TodayScrapyItem(scrapy.Item):
        """Empty item: it declares no fields.

        Fields would be declared here in the form ``name = scrapy.Field()``.
        """
    15 
    16 
    class TuchongItem(scrapy.Item):
        """Data recorded for a single Tuchong image."""

        # Name of the image.
        title = scrapy.Field()
        # Number of people who viewed it.
        views = scrapy.Field()
        # Number of people who liked it.
        favorites = scrapy.Field()
        # Address (URL) of the image.
        img_url = scrapy.Field()

        # def get_insert_sql(self):
        #     # SQL statement used when storing the item
        #     sql = 'insert into tuchong(title,views,favorites,img_url)'
        #           ' VALUES (%s, %s, %s, %s)'
        #     # Data to store
        #     data = (self['title'], self['views'], self['favorites'], self['img_url'])
        #     return (sql, data)

    settings.py 设置请求头(headers)和ITEM_PIPELINES

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for today_scrapy project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://doc.scrapy.org/en/latest/topics/settings.html
    #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    
    # Name of the bot implemented by this Scrapy project.
    BOT_NAME = 'today_scrapy'
    
    # Modules where Scrapy looks for spiders, and the module new spiders are created in.
    SPIDER_MODULES = ['today_scrapy.spiders']
    NEWSPIDER_MODULE = 'today_scrapy.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    #USER_AGENT = 'today_scrapy (+http://www.yourdomain.com)'
    
    # NOTE: robots.txt rules are NOT obeyed by this project (the flag is False).
    ROBOTSTXT_OBEY = False
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    DEFAULT_REQUEST_HEADERS = {
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Language': 'en',
      # The key must be spelled exactly 'User-Agent'; a misspelled key is sent
      # as an unrelated header and the browser identity is never applied.
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    
    # Enable or disable spider middlewares
    # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'today_scrapy.middlewares.TodayScrapySpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #DOWNLOADER_MIDDLEWARES = {
    #    'today_scrapy.middlewares.TodayScrapyDownloaderMiddleware': 543,
    #}
    
    # Enable or disable extensions
    # See https://doc.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    # Only TuchongPipeline is enabled (order value 200); the TodayScrapyPipeline
    # entry below is left commented out.
    ITEM_PIPELINES = {
       # 'today_scrapy.pipelines.TodayScrapyPipeline': 300,
        'today_scrapy.pipelines.TuchongPipeline': 200,
    
    }
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

    pipelines.py 将图片下载到指定文件夹

     1 # -*- coding: utf-8 -*-
     2 
     3 # Define your item pipelines here
     4 #
     5 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
     6 # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
     7 import os
     8 import requests
     9 
    class TodayScrapyPipeline(object):
        """Pass-through pipeline that leaves every item untouched."""

        def process_item(self, item, spider):
            """Return ``item`` unchanged; ``spider`` is not used."""
            return item
    13 
    class TuchongPipeline(object):
        """Download each item's image into a directory named after its title.

        The image is saved as ``<title>/<last segment of img_url>`` relative to
        the current working directory.

        Changes from the earlier version: the request header is now spelled
        ``User-Agent`` (it was ``User-Agnet``), and the hard-coded personal
        session cookie is no longer sent.
        NOTE(review): assumes image downloads work without that cookie — confirm.
        """

        # Browser-like identity sent with every image download.
        _HEADERS = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        }
        # Characters that are path separators or are rejected in file names on
        # common file systems; each one is replaced by an underscore.
        _UNSAFE_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})
        # Seconds to wait for the image server, so one stalled download cannot
        # block the pipeline forever.
        _TIMEOUT = 30

        @classmethod
        def _safe_dirname(cls, title):
            """Turn a post title into a usable single-level directory name.

            Empty or missing titles, and titles made only of dots/spaces (such
            as ``..``), fall back to ``'untitled'``.
            """
            cleaned = (title or '').translate(cls._UNSAFE_CHARS).strip(' .')
            return cleaned or 'untitled'

        def process_item(self, item, spider):
            """Fetch ``item['img_url']`` and write it to disk; return the item.

            Raises:
                requests.RequestException: the download failed, timed out, or
                    the server answered with an HTTP error status.
                OSError: the directory or file could not be written.
            """
            img_url = item['img_url']
            target_dir = self._safe_dirname(item.get('title'))
            filename = img_url.split('/')[-1]

            # Download before touching the disk so a failed request never
            # leaves an empty file or an error page saved as an image.
            response = requests.get(img_url, headers=self._HEADERS,
                                    timeout=self._TIMEOUT)
            response.raise_for_status()

            # exist_ok avoids the check-then-create race of exists() + mkdir().
            os.makedirs(target_dir, exist_ok=True)
            with open(os.path.join(target_dir, filename), 'wb') as image_file:
                image_file.write(response.content)
            return item

    爬虫文件

    tuchong.py

    图片的url可以直接用接口返回的 user_id 和 img_id 拼接得到

     1 # -*- coding: utf-8 -*-
     2 import scrapy
     3 import json
     4 from today_scrapy.items import TuchongItem
     5 
     6 
     class TuchongSpider(scrapy.Spider):
         """Crawl Tuchong's tag listing API and yield one item per image.

         Each listing page is a JSON document whose ``postList`` entries carry
         ``title``, ``views``, ``favorites`` and an ``images`` list. Every image
         becomes a ``TuchongItem`` with the post's title, views and favorites
         plus an image URL built from the image's ``user_id`` and ``img_id``.
         """

         name = 'tuchong'
         allowed_domains = ['tuchong.com']
         start_urls = ['http://tuchong.com/']

         # Tag whose posts are listed; any other tag can be substituted here.
         tag = '自然'
         # Pages first_page .. end_page - 1 are requested (end is exclusive).
         first_page = 1
         end_page = 20
         # Number of posts requested per listing page.
         posts_per_page = 20

         def start_requests(self):
             """Yield one request per listing page, with itself as the referer."""
             # int() keeps the range valid if the values arrive as strings.
             for page in range(int(self.first_page), int(self.end_page)):
                 listing_url = 'https://tuchong.com/rest/tags/{}/posts?page={}&count={}'.format(
                     self.tag, page, self.posts_per_page)
                 yield scrapy.Request(
                     url=listing_url,
                     callback=self.parse,
                     headers={'referer': listing_url},
                 )

         def parse(self, response):
             """Turn one JSON listing page into ``TuchongItem`` objects."""
             payload = json.loads(response.text)
             # A response without posts yields nothing instead of raising.
             posts = payload.get('postList') or []
             for index, post in enumerate(posts):
                 title = post['title']
                 views = post['views']
                 favorites = post['favorites']
                 # Lazy %-formatting also copes with a missing (None) title.
                 self.logger.debug('%s', index)
                 self.logger.debug('图集名称:%s', title)
                 self.logger.debug('%s人浏览', views)
                 self.logger.debug('喜欢的人数:%s', favorites)
                 for image in post['images']:
                     img_url = 'https://photo.tuchong.com/{}/f/{}.jpg'.format(
                         image['user_id'], image['img_id'])
                     # views and favorites are stored on the item as well;
                     # previously they were read but never saved.
                     yield TuchongItem(
                         title=title,
                         views=views,
                         favorites=favorites,
                         img_url=img_url,
                     )
  • 相关阅读:
    点击劫持漏洞之理解 python打造一个挖掘点击劫持漏洞的脚本
    URL重定向漏洞,python打造URL重定向漏洞检测脚本
    动态链接库(DLL)
    vs不支持通过afxgetmainwnd()获取窗口句柄(转)
    HALCON学习-下载、安装
    HALCON学习-资料
    MFC,ADO方式实现数据库操作
    VS2010 EXCEL2010 表格操作的编程实现
    Git Compare with base,比较大文件时,长时间等待,无法加载
    VS2010编译VS2008工程时,LINK : fatal error LNK1123: failure during conversion to COFF: file invalid or corrupt
  • 原文地址:https://www.cnblogs.com/pantom0122/p/9540299.html
Copyright © 2011-2022 走看看