zoukankan      html  css  js  c++  java
  • python 单线程图片下载

    import urllib.request
    import urllib.parse
    import urllib.error
    import re
    import os
    import ssl
    
    # SECURITY NOTE: this swaps the default HTTPS context factory for the
    # unverified one, so HTTPS requests made with the default context skip
    # certificate verification for the whole process.
    ssl._create_default_https_context = ssl._create_unverified_context
    
    # Root directory under which all downloaded images are stored.
    path = "./images"
    
    # HTTP headers attached to every request built in this script.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
        "referer": "https://www.mzitu.com/xinggan/"
    }
    
    
    def handler_request(url, pageIndex):
        """Build the request object for one listing page.

        Args:
            url: Listing URL prefix; the page number is appended to it.
            pageIndex: Page number to append (converted with ``str``).

        Returns:
            A ``urllib.request.Request`` for ``url + str(pageIndex)`` that
            carries the module-level ``headers``.
        """
        return urllib.request.Request(url=url + str(pageIndex), headers=headers)
    
    
    def get_images_url(content, basePath):
        """Extract the galleries listed on a page and download each of them.

        Every ``<li>`` entry is expected to provide a link (``href``) and a
        thumbnail whose ``alt`` text is used as the gallery name. Each
        ``(name, link)`` pair is handed to ``image_category_response``.

        Args:
            content: Decoded HTML of a listing page.
            basePath: Directory under which the gallery folders are created.

        Returns:
            None.
        """
        # "." deliberately does not match newlines here: matching is done line
        # by line. Enabling re.S together with the greedy ".*" below would
        # collapse all entries into a single match.
        pattern_names = re.compile(r'<li>.*?<img .* alt=(.*?) .*? />.*?')
        pattern_hrefs = re.compile(r'<li><a href=(.*?) .*?>.*?')

        # The second positional argument of Pattern.findall is ``pos`` (a
        # start offset), not a flags value; passing re.S there silently
        # skipped the first 16 characters of the page, so it is not passed.
        alts = pattern_names.findall(content)
        hrefs = pattern_hrefs.findall(content)

        # zip() stops at the shorter list, so a page on which fewer names than
        # links were matched no longer raises IndexError.
        # The captured values include one leading and one trailing character
        # (presumably the attribute quotes), which [1:-1] strips from the name.
        image_map = {alt[1:-1]: href for alt, href in zip(alts, hrefs)}

        for item in image_map.items():
            image_category_response(item, basePath)
    
    
    def image_category_response(item, basePath):
        """Download the images of one gallery into its own folder.

        Pages ``<base url>/0``, ``<base url>/1``, ... are requested in turn;
        the main image of each page is passed to ``download_images``. The walk
        stops at the first page that cannot be fetched or processed.

        Args:
            item: ``(name, link)`` pair. ``name`` becomes the folder name;
                ``link`` still carries one leading and one trailing character
                (presumably quotes) that are stripped here.
            basePath: Directory in which the gallery folder is created.

        Returns:
            None. Failures are reported with ``print`` instead of raised, so a
            broken gallery does not abort the whole run.
        """
        alt = item[0]
        save_folder = os.path.join(basePath, alt)
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(save_folder, exist_ok=True)

        baseurl = item[1][1:-1]
        # Compiled once instead of on every page. "." does not match newlines,
        # so the image tag must sit on the same line as the wrapper div.
        img_pattern = re.compile(r'<div class="main-image"><p>.*?<img src=(.*?) .*? />.*?')
        page_count = 1000  # upper bound on the number of pages tried

        for page_index in range(page_count):
            page_url = baseurl + "/" + str(page_index)
            try:
                # Build the request object
                request = urllib.request.Request(url=page_url, headers=headers)
                # Send the request; the response is closed when the block ends
                with urllib.request.urlopen(request) as response:
                    content = response.read().decode()
                # Pattern.findall takes (string, pos, endpos); re.S must not
                # be passed here, it would be interpreted as a start offset.
                img_urls = img_pattern.findall(content)
                if not img_urls:
                    print("未找到图片: {0}".format(page_url))
                    break
                download_images(img_urls[0], save_folder)
            except urllib.error.URLError:
                # A request failed (page or image): treat it as the end of
                # the gallery and report the last page index that worked.
                print("最大页面数{0}".format(page_index - 1))
                break
            except Exception as e:
                # Best effort: report anything else and give up on this
                # gallery only.
                print(e)
                break
    
    
    def download_images(url, save_path):
        """Download one image and store it under ``save_path``.

        Args:
            url: Image URL still wrapped in one leading and one trailing
                character (presumably quotes), which are stripped first.
            save_path: Existing directory the file is written to. The file
                name is the last ``/``-separated component of the URL.

        Raises:
            urllib.error.URLError: If the image cannot be fetched.
            OSError: If the target file cannot be written.
        """
        url = url[1:-1]
        print(url)
        # Build the request object
        request = urllib.request.Request(url=url, headers=headers)
        filename = url.split('/')[-1]

        # Send the request first so that no empty file is left behind when the
        # download fails; both the response and the file are closed on exit.
        with urllib.request.urlopen(request) as response:
            with open(os.path.join(save_path, filename), 'wb') as fb:
                fb.write(response.read())
    
    
    def parse_pages(content):
        """Print ``content`` to standard output.

        Args:
            content: Value to print.

        Returns:
            None.
        """
        print(content)
    
    
    def main():
        """Ask for a page range and download every gallery on those pages.

        Reads the first and last listing page numbers from standard input,
        then for each page creates its folder, fetches the listing and hands
        the HTML to ``get_images_url``.

        Raises:
            ValueError: If one of the entered page numbers is not an integer.
            urllib.error.URLError: If a listing page cannot be fetched.
        """
        url = 'https://www.mzitu.com/xinggan/page/'
        start_page = int(input("请输入起始页码:"))
        end_page = int(input("请输入结束页码:"))
        # Create the root folder; exist_ok avoids the exists()/mkdir() race.
        os.makedirs(path, exist_ok=True)

        for pageIndex in range(start_page, end_page + 1):
            print("...........开始下载第{0}页".format(pageIndex))
            # Create the folder for this listing page
            save_path = create_folder(pageIndex)
            # Build the request
            request = handler_request(url, pageIndex)
            # Send the request and read the body; the response is closed as
            # soon as the content has been read.
            with urllib.request.urlopen(request) as response:
                content = response.read().decode()
            # Parse the content, extract the images and download them
            get_images_url(content, save_path)

            print("...........结束下载第{0}页".format(pageIndex))
    
    
    def create_folder(pageIndex):
        """Create the folder for one listing page and return its path.

        Args:
            pageIndex: Page number; used as the folder name below ``path``.

        Returns:
            The folder path with backslashes replaced by forward slashes and
            a trailing ``/`` appended.
        """
        save_path = os.path.join(path, str(pageIndex))
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(save_path, exist_ok=True)

        # The backslash must be escaped: a lone "\" escapes the closing quote
        # and makes the whole file fail to compile.
        return save_path.replace("\\", "/") + "/"
    
    
    # Run the downloader only when this file is executed as a script,
    # not when it is imported as a module.
    if __name__ == "__main__":
        main()
  • 相关阅读:
    java Activiti 工作流引擎 SSM 框架模块设计方案
    自定义表单 Flowable 工作流 Springboot vue.js 前后分离 跨域 有代码生成器
    数据库设计的十个最佳实践
    activiti 汉化 stencilset.json 文件内容
    JAVA oa 办公系统模块 设计方案
    java 考试系统 在线学习 视频直播 人脸识别 springboot框架 前后分离 PC和手机端
    集成 nacos注册中心配置使用
    “感恩节 ”怼记
    仓颉编程语言的一点期望
    关于System.out.println()与System.out.print("\n")的区别
  • 原文地址:https://www.cnblogs.com/KruceCoder/p/12076682.html
Copyright © 2011-2022 走看看