zoukankan html css js c++ java

python 爬虫下载图片

import os#导入操作系统模块
from urllib.request import urlretrieve#下载url对应的文件
from urllib.request import urlopen    #打开url，得到网页源代码
from bs4 import BeautifulSoup         #bs库，对源代码进行各种操作

# Local folder that downloaded files are written under.
downloadDirectory = "downloaded"
# Site root: relative links are resolved against it, and links whose
# absolute form does not contain it are skipped.
baseUrl = "http://pythonscraping.com"

# Clean up and normalize an arbitrary link into an absolute URL.
def getAbsoluteURL(baseUrl, source):
    """Return *source* as a normalized absolute URL, or None if it is off-site.

    Args:
        baseUrl: Root URL of the site being scraped, e.g.
            "http://pythonscraping.com".
        source: Raw link value taken from the page (a ``src`` attribute).

    Returns:
        The absolute URL with any leading "www." removed from the host, or
        None when the resulting URL does not contain ``baseUrl``.

    Note:
        The on-site check compares against ``baseUrl`` including its scheme,
        so an ``https://`` link is rejected when ``baseUrl`` is ``http://``.
    """
    if source.startswith("//"):
        # Protocol-relative link: borrow the scheme of the base URL so it is
        # judged as an absolute URL instead of being glued onto baseUrl.
        scheme = baseUrl.split("://", 1)[0] if "://" in baseUrl else "http"
        source = scheme + ":" + source

    for scheme in ("http://", "https://"):
        if source.startswith(scheme):
            # Already absolute: only drop a leading "www." from the host.
            rest = source[len(scheme):]
            if rest.startswith("www."):
                rest = rest[4:]
            url = scheme + rest
            break
    else:
        if source.startswith("www."):
            url = "http://" + source[4:]
        else:
            # Relative link: join with exactly one slash, whether or not
            # either side already carries one.
            url = baseUrl.rstrip("/") + "/" + source.lstrip("/")

    if baseUrl not in url:
        return None
    return url

# Map an absolute URL onto a local file path and create its folder.
def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
    """Return the local path for *absoluteUrl*, creating parent folders.

    Args:
        baseUrl: Root URL of the site; stripped from the front of
            ``absoluteUrl`` to obtain the site-relative part.
        absoluteUrl: Absolute URL of the file to be stored.
        downloadDirectory: Folder the site-relative part is appended to.

    Returns:
        ``downloadDirectory`` followed by the site-relative part of the URL.
        The directory portion of that path exists when the function returns.
    """
    path = absoluteUrl
    # Drop "www." only where it prefixes the host; a blanket replace would
    # also mangle file names that happen to contain "www.".
    for scheme in ("http://", "https://"):
        prefix = scheme + "www."
        if path.startswith(prefix):
            path = scheme + path[len(prefix):]
            break
    # Remove the site root only from the front of the URL.
    if path.startswith(baseUrl):
        path = path[len(baseUrl):]
    path = downloadDirectory + path
    directory = os.path.dirname(path)
    # dirname is "" when the path has no folder part; makedirs("") would raise.
    if directory:
        # exist_ok avoids the check-then-create race.
        os.makedirs(directory, exist_ok=True)
    return path

# Fetch the landing page and collect every tag that carries a src attribute.
# The response is closed as soon as the markup has been parsed.
with urlopen("http://www.pythonscraping.com") as html:
    # Name the parser explicitly so the parse does not depend on which
    # optional parser packages happen to be installed.
    bsObj = BeautifulSoup(html, "html.parser")
downloadList = bsObj.findAll(src=True)

# Normalize each src to an absolute URL and download the on-site ones.
for download in downloadList:
    fileUrl = getAbsoluteURL(baseUrl, download["src"])
    if fileUrl is not None:
        print(fileUrl)
        # Download inside the loop and inside the None check: every on-site
        # file is fetched, and a filtered-out link is never passed on.
        urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))

查看全文

相关阅读:
试说明一级文件索引结构、二级文件索引结构是如何构造的。
文件物理结构的比较
 文件的物理结构
 什么是索引文件，要随机存取某一记录时需经过几步操作？
对文件的存取有哪两种基本方式，各有什么特点？
文件的逻辑结构有哪两种形式？
文件组织的两种结构
 WebService或HTTP服务端接收请求转发消息到另一个服务端-实现思路
 Eclipse报Caused by: java.lang.OutOfMemoryError: PermGen space解决思路
 树莓派2操作记录（有记录才能沉淀...）

原文地址：https://www.cnblogs.com/yrm1160029237/p/6295990.html

python 爬虫 下载图片

python 爬虫下载图片