zoukankan      html  css  js  c++  java
  • scrapy 框架简单 爬取 4K高清 壁纸

    import scrapy


    class TpSpider(scrapy.Spider):
        """Crawl the ``4kmeinv`` listing on pic.netbian.com and save its images.

        The crawl runs in three stages, one callback per stage:

        1. ``parse`` reads a listing page and follows the link in each list item.
        2. ``imgs_parse`` reads a detail page and requests the image found in
           its ``photo-pic`` container, carrying the image title along.
        3. ``img_parse`` writes the downloaded bytes to ``./imgs/<name>.jpg``.
        """

        name = 'tp'
        # allowed_domains = ['baidu.com']
        # Listing pages to crawl: the index page plus index_2 .. index_10
        # (10 pages in total).  The comprehension's opening bracket sits on the
        # same line as ``+`` so the expression legally spans several lines.
        start_urls = ['http://pic.netbian.com/4kmeinv/index.html'] + [
            'http://pic.netbian.com/4kmeinv/index_%s.html' % page
            for page in range(2, 11)
        ]

        def parse(self, response):
            """Yield a request for the detail page linked from each list item.

            Args:
                response: Scrapy response for one of the listing pages.

            Yields:
                scrapy.Request: one per ``<li>`` that has a link, handled by
                ``imgs_parse``.
            """
            for li in response.xpath('//ul[@class="clearfix"]/li'):
                href = li.xpath('./a/@href').extract_first()
                if not href:
                    # No link in this item; concatenating None would raise.
                    continue
                # urljoin resolves the link against the page URL, so relative
                # and absolute hrefs are both handled.
                yield scrapy.Request(
                    url=response.urljoin(href),
                    callback=self.imgs_parse,
                )

        def imgs_parse(self, response):
            """Yield a download request for each image on a detail page.

            Args:
                response: Scrapy response for a detail page.

            Yields:
                scrapy.Request: one per image with a ``src``, handled by
                ``img_parse``; the image's ``alt`` text travels in
                ``meta['name']`` and becomes the file name.
            """
            for link in response.xpath('//div[@class="photo-pic"]/a'):
                name = link.xpath('./img/@alt').extract_first()
                src = link.xpath('./img/@src').extract_first()
                if not src:
                    # Nothing to download without an image URL.
                    continue
                yield scrapy.Request(
                    url=response.urljoin(src),
                    callback=self.img_parse,
                    meta={'name': name},
                )

        def img_parse(self, response):
            """Save the downloaded image body to ``./imgs/<name>.jpg``.

            Args:
                response: Scrapy response whose body is the image bytes and
                    whose ``meta['name']`` holds the title scraped upstream.
            """
            # Function-scope import keeps this block self-contained.
            import os

            name = response.meta['name']
            # The title comes from scraped HTML: replace path separators so it
            # cannot point outside (or into a missing sub-directory of) ./imgs.
            safe_name = str(name).replace('/', '_').replace('\\', '_')
            # The target directory is not guaranteed to exist on a first run.
            os.makedirs('./imgs', exist_ok=True)
            with open('./imgs/%s.jpg' % safe_name, 'wb') as f:
                f.write(response.body)
            print('正在下载图片:%s' % name)
  • 相关阅读:
    Linux文件及目录查找
    英语单词independent
    英语单词omitting
    英语单词deploy
    英语单词debug
    线程
    进程
    操作系统历史
    分布式爬虫
    爬虫基础
  • 原文地址:https://www.cnblogs.com/hyao/p/13303637.html
Copyright © 2011-2022 走看看