1. 媒体文件:存储媒体文件主要有两种方式:a. 只保存文件的 URL 链接;b. 将源文件下载到本地保存。
"""Download every resource referenced by a ``src`` attribute on a page.

The page at http://www.pythonscraping.com is fetched, every tag carrying a
``src`` attribute is collected, and each resource hosted under ``baseUrl``
is saved below ``downloadDirectory`` using the same relative path it has on
the site.
"""
import os
from urllib.parse import urljoin, urlsplit, urlunsplit
from urllib.request import urlopen
from urllib.request import urlretrieve

downloadDirectory = "downloaded"
baseUrl = "http://pythonscraping.com"


def getAbsoluteURL(baseUrl, source):
    """Resolve a ``src`` value to a normalized absolute URL under ``baseUrl``.

    Args:
        baseUrl: Site root without a ``www.`` prefix, e.g.
            ``"http://pythonscraping.com"``.
        source: Raw ``src`` attribute value. It may be absolute, start with
            ``www.``, be protocol-relative (``//host/...``), root-relative
            (``/img/a.jpg``) or relative (``img/a.jpg``).

    Returns:
        The absolute URL with any leading ``www.`` removed from the host, or
        ``None`` when the resource does not live under ``baseUrl`` (different
        host or scheme, non-HTTP scheme such as ``data:``, malformed URL, or
        a path that still contains ``..`` segments).
    """
    if source.startswith("www."):
        # A bare "www.host/path" would be resolved as a relative path by
        # urljoin; marking it protocol-relative makes it inherit the scheme
        # of baseUrl instead.
        source = "//" + source

    root = baseUrl.rstrip("/")
    try:
        parts = urlsplit(urljoin(root + "/", source))
    except ValueError:
        # Malformed URL (e.g. an unbalanced IPv6 bracket in the host).
        return None

    host = parts.netloc
    if host.startswith("www."):
        host = host[4:]

    # urljoin already collapses dot segments of relative references, so any
    # ".." left over comes from an absolute URL trying to climb above the
    # site root -- and later above the download directory.
    if ".." in parts.path.split("/"):
        return None

    url = urlunsplit((parts.scheme, host, parts.path, parts.query, parts.fragment))

    # Compare as a prefix on a path boundary rather than as a substring, so
    # that neither "http://other.example/?u=<baseUrl>" nor
    # "<baseUrl>.other.example" is mistaken for an internal resource.
    if url != root and not url.startswith(root + "/"):
        return None
    return url


def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
    """Map an absolute URL to a local file path and create its directory.

    Args:
        baseUrl: Site root that is stripped from the front of the URL.
        absoluteUrl: URL of the resource, normally produced by
            ``getAbsoluteURL``.
        downloadDirectory: Local directory the site-relative path is
            appended to.

    Returns:
        ``downloadDirectory`` followed by the part of ``absoluteUrl`` that
        comes after ``baseUrl``. The parent directory of that path exists
        when the function returns.
    """
    path = absoluteUrl
    # Drop "www." only where it is the host prefix; a "www." occurring later
    # in the path or file name is part of the resource name.
    for prefix in ("http://www.", "https://www.", "www."):
        if absoluteUrl.startswith(prefix):
            path = prefix[:-len("www.")] + absoluteUrl[len(prefix):]
            break

    # Likewise strip baseUrl only as a leading prefix.
    if path.startswith(baseUrl):
        path = path[len(baseUrl):]
    path = downloadDirectory + path

    directory = os.path.dirname(path)
    if directory:
        # exist_ok avoids the check-then-create race; the guard avoids
        # calling makedirs("") when the path has no directory component.
        os.makedirs(directory, exist_ok=True)
    return path


def main():
    """Fetch the page and download every internal ``src`` resource."""
    # Imported here rather than at module level so the URL helpers above can
    # be imported without BeautifulSoup being installed.
    from bs4 import BeautifulSoup

    # Close the HTTP response as soon as the document has been parsed, and
    # name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    with urlopen("http://www.pythonscraping.com") as html:
        bsObj = BeautifulSoup(html, "html.parser")
    downloadList = bsObj.findAll(src=True)

    for download in downloadList:
        fileUrl = getAbsoluteURL(baseUrl, download["src"])
        if fileUrl is not None:
            print(fileUrl)
            urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))


if __name__ == "__main__":
    main()