The URLs are straightforward: the dataset files are scattered across a single web page, and downloading them one by one by hand is slow, so Python can be used to automate the download.
Known issue: many datasets hosted overseas are strongly affected by network fluctuations, so it would be best to add retry-on-failure logic for failed requests; that is not implemented here yet (a sketch of one possible approach is given after the code below).
Reference link:
https://blog.csdn.net/sinat_36246371/article/details/62426444
The code is essentially from the author of that post (many thanks); I only changed it slightly and added exception handling.
''' downloading dataset on one html page '''
import requests
from bs4 import BeautifulSoup

archive_url = 'your_target_url'  # page that lists the dataset files

def get_target_links():
    # fetch the index page and collect links to the .atr/.dat/.hea files
    r = requests.get(archive_url)
    soup = BeautifulSoup(r.content, 'html5lib')
    links = soup.find_all('a')
    video_links = []
    # equivalent list comprehension:
    # video_links = [archive_url + link['href'] for link in links
    #                if (link['href'].endswith('atr') or link['href'].endswith('dat')
    #                    or link['href'].endswith('hea'))]
    for link in links:
        try:
            if (link['href'].endswith('atr') or link['href'].endswith('dat')
                    or link['href'].endswith('hea')):
                video_links.append(archive_url + link['href'])
        except KeyError:
            # some <a> tags have no href attribute; skip them
            print('keyerror, keep going!')
    for i in video_links:
        print(i, ' ')
    return video_links

def download_target_series(video_links):
    failed_list = []
    for link in video_links:
        file_name = link.split('/')[-1]
        file_name = 'your_local_folder' + file_name  # prepend the local target folder
        print("Downloading file:%s" % file_name)
        print(link)
        try:
            r = requests.get(link, stream=True)
        except Exception:
            failed_list.append(link.split('/')[-1])
            print('download failed, moving on to the next one')
            continue
        # download started: write the response to disk in 1 MB chunks
        with open(file_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)
        print("%s downloaded!" % file_name)
    print("All videos downloaded!")
    print(failed_list)  # record which files failed to download
    return

if __name__ == "__main__":
    target_links = get_target_links()
    download_target_series(target_links)
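As noted at the top, the retry-on-failure logic is not part of the code above. Below is a minimal sketch of one way it could be added, assuming the same requests-based streaming download; the helper name download_with_retry and the max_retries / retry_delay values are illustrative choices, not from the original post.

import time
import requests

def download_with_retry(link, file_name, max_retries=3, retry_delay=5):
    # Hypothetical helper (not in the original code): try to download `link`
    # to `file_name`, retrying a few times on network errors.
    # Returns True on success, False if all attempts fail.
    for attempt in range(1, max_retries + 1):
        try:
            r = requests.get(link, stream=True, timeout=30)
            r.raise_for_status()  # treat HTTP error codes as failures too
            with open(file_name, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
            return True
        except requests.exceptions.RequestException as e:
            print('attempt %d/%d failed for %s: %s' % (attempt, max_retries, link, e))
            time.sleep(retry_delay)  # wait a bit before retrying
    return False

With such a helper, the try/except block and the chunked write inside download_target_series could be replaced by a single call, for example: if not download_with_retry(link, file_name): failed_list.append(link.split('/')[-1]).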