zoukankan      html  css  js  c++  java
  • python--批量下载豆瓣图片

    溜达豆瓣的时候,发现一些图片,懒得一张一张扒。之前写过 C# 和 Python 版本的图片下载,因此拿之前的 Python 代码改了改,折腾出一个豆瓣版本,方便各位使用。

    # -*- coding:utf8 -*-
    import urllib2, urllib, socket
    import re
    import requests
    from lxml import etree
    import os, time
    
    DEFAULT_DOWNLOAD_TIMEOUT = 30
    
    
    class AppURLopener(urllib.FancyURLopener):
        """Opener for the Python 2 ``urllib`` module that reports a browser-style version string."""
        # ``version`` is the identification string the opener sends as its User-Agent.
        version = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT)"
    
    
    def check_save_path(save_path):
        """Ensure that the directory ``save_path`` exists, creating it if needed.

        Args:
            save_path: Directory path; missing parent directories are created too.

        Raises:
            OSError: If the directory cannot be created, including the case
                where ``save_path`` already exists but is not a directory.
        """
        # Create first and inspect the failure afterwards: checking exists()
        # before makedirs() can still fail if the directory appears in between,
        # and would silently accept a regular file with the same name.
        try:
            os.makedirs(save_path)
        except OSError:
            if not os.path.isdir(save_path):
                raise
    
    
    def get_image_name(image_link):
        """Return the file name portion of an image URL.

        Any fragment (``#...``) or query string (``?...``) is dropped before the
        name is extracted, so the result does not carry URL parameters and is
        not confused by slashes that appear inside the query.

        Args:
            image_link: URL (or plain path) of the image.

        Returns:
            The text after the last path separator, or an empty string when
            the link ends with a separator.
        """
        # In a URL the fragment follows the query, so cut it off first.
        path_part = image_link.split("#", 1)[0].split("?", 1)[0]
        return os.path.basename(path_part)
    
    
    def save_image1(image_link, save_path):
        """Download one image into ``save_path`` using ``urllib.urlretrieve``.

        Alternative to ``save_image`` built on the Python 2 ``urllib`` module.

        Args:
            image_link: URL of the image to download.
            save_path: Existing directory that receives the file.

        Returns:
            True if the download finished without raising, False otherwise.
        """
        file_path = os.path.join(save_path, get_image_name(image_link))
        print("准备下载{0} 到{1}".format(image_link, file_path))
        try:
            # Install the opener with the custom version string before
            # downloading, so the request is not sent with urllib's default one.
            urllib._urlopener = AppURLopener()
            # No timeout is passed to urlretrieve, so the download time is
            # bounded through the process-wide socket default instead.
            socket.setdefaulttimeout(DEFAULT_DOWNLOAD_TIMEOUT)
            # Write to the full file path, not to the target directory itself.
            urllib.urlretrieve(url=image_link, filename=file_path)
            return True
        except Exception as ex:
            # Best-effort download: report the failure and let the caller decide.
            print(ex.args)
            # Format the exception itself; ``ex.message`` is deprecated and is
            # often empty for network errors.
            print("下载文件出错:{0}".format(ex))
            return False
    
    
    def save_image(image_link, save_path):
        """Download one image into ``save_path`` using ``urllib2``.

        The image is fetched completely before the target file is opened, so a
        failed download does not leave an empty file behind.

        Args:
            image_link: URL of the image to download.
            save_path: Existing directory that receives the file.

        Returns:
            True if the image was downloaded and written, False otherwise.
        """
        file_path = os.path.join(save_path, get_image_name(image_link))
        print("准备下载{0} 到{1}".format(image_link, file_path))
        try:
            response = urllib2.urlopen(url=image_link, timeout=DEFAULT_DOWNLOAD_TIMEOUT)
            try:
                image_data = response.read()
            finally:
                # Release the connection even when reading fails.
                response.close()
            # The context manager guarantees the file handle is closed.
            with open(file_path, "wb") as file_handler:
                file_handler.write(image_data)
            return True
        except Exception as ex:
            # Best-effort download: report the failure and let the caller retry.
            # Format the exception itself; ``ex.message`` is deprecated and is
            # often empty for network errors.
            print("下载文件出错:{0}".format(ex))
            return False
    
    
    def get_thumb_picture_link(thumb_page_link):
        """Collect the thumbnail image URLs found on one album page.

        Args:
            thumb_page_link: URL of the album page to fetch.

        Returns:
            A list with the ``src`` value of every thumbnail ``<img>`` matched
            on the page, or an empty list if the page could not be fetched or
            parsed.
        """
        try:
            response = urllib2.urlopen(url=thumb_page_link, timeout=DEFAULT_DOWNLOAD_TIMEOUT)
            try:
                html_content = response.read()
            finally:
                # Release the connection even when reading fails.
                response.close()
            html_tree = etree.HTML(html_content)
            thumb_links = html_tree.xpath('//div[@class="photo_wrap"]/a[@class="photolst_photo"]/img/@src')
            return list(thumb_links)
        except Exception as ex:
            # Best-effort: a page that fails to load contributes no pictures.
            # Print the exception itself; ``ex.message`` is deprecated and is
            # often empty for network errors.
            print(ex)
            return []
    
    
    def download_pictures(album_link, min_page_id, max_page_id, picture_count_per_page, save_path):
        """Download the pictures of an album, page by page.

        Pages are numbered from 1; page ``p`` is requested with
        ``?start=(p - 1) * picture_count_per_page`` appended to ``album_link``.

        Args:
            album_link: Album URL to which the ``?start=`` query is appended.
            min_page_id: First page to download (1-based). Values below 1 are
                treated as 1.
            max_page_id: Last page to download (inclusive).
            picture_count_per_page: Number of thumbnails on each album page.
            save_path: Directory that receives the files; created if missing.
        """
        check_save_path(save_path)
        # Honour the requested first page instead of always restarting at the
        # beginning of the album; clamping keeps the offset from going negative.
        first_page = max(min_page_id, 1)
        for page_id in range(first_page, max_page_id + 1):
            offset = (page_id - 1) * picture_count_per_page
            thumb_page_link = album_link + "?start={0}".format(offset)
            for thumb_picture_link in get_thumb_picture_link(thumb_page_link):
                full_picture_link = thumb_picture_link.replace("photo/lthumb", "photo/large")
                if not save_image(image_link=full_picture_link, save_path=save_path):
                    # NOTE(review): the fallback rewrites "photo/thumb" while the
                    # first attempt rewrites "photo/lthumb" - confirm which
                    # thumbnail path the album pages really use.
                    full_picture_link = thumb_picture_link.replace("photo/thumb", "photo/photo")
                    save_image(image_link=full_picture_link, save_path=save_path)
                # Pause between pictures to avoid hammering the server.
                time.sleep(1)
        print("下载完成")
    
    
    # Local folder the pictures are saved to. A raw string keeps the
    # backslashes literal instead of relying on unrecognised escape sequences.
    save_path = r"J:\douban\gugu"
    # Album URL; note that it is written with a trailing forward slash.
    album_link = "https://www.douban.com/photos/album/1625969357/"
    # Total number of pages in the album.
    max_page_id = 11
    # Number of pictures per album page (18 by default).
    picture_count_per_page = 18

    download_pictures(album_link=album_link,
                      min_page_id=1,
                      max_page_id=max_page_id,
                      picture_count_per_page=picture_count_per_page,
                      save_path=save_path)

    =============================================================

    相对 urllib2 来说,urllib 真的比较坑:如果不设置 User-Agent,下载速度会超慢无比;另外还需要调用 socket 模块来设置超时时间,比较折腾;最终可能还会踩到其他坑里去,比如我下着下着就被豆瓣给“屏蔽”啦。所以建议使用 urllib2。

    相关参考链接:

    http://www.jb51.net/article/57239.htm

    http://www.crifan.com/use_python_urllib-urlretrieve_download_picture_speed_too_slow_add_user_agent_for_urlretrieve/comment-page-1/

    =============================================================

    国庆最后一天,祝各位国庆快乐!

  • 相关阅读:
    jQuery事件
    jQuery的效果
    jQuery 选择器
    中级 jQuery 了解
    回调函数 callback()
    预加载
    表格对象的方法
    script中type属性讲解
    将数据渲染到页面的方式:模版
    将数据渲染到页面的几种方式
  • 原文地址:https://www.cnblogs.com/TeyGao/p/5935575.html
Copyright © 2011-2022 走看看