该脚本引入了四个模块:
- `re`:用于正则表达式匹配,将Podcast标题转为音频文件名。这是由于Linux系统下的文件名应避免包含字符 `/><|:&`。所以,如果Podcast标题包含了这些字符,就需要将它们替换掉。
- `shutil`:用于将从网络上下载的音频数据流转存为本地文件。
- `requests`:用于向网站服务器发出GET请求,以获取HTML页面。
- `html.parser`:用于解析得到的HTML页面内容,从中匹配并提取出感兴趣的内容,包括Podcast子页面的链接,mp3文件的url等。使用时,需要由该模块中的 `HTMLParser` 类派生出子类,重写其中的成员函数 `handle_starttag` 与 `handle_data`,即可以实现对HTML标签及其属性与包含的内容进行提取与处理。
具体代码如下:
import re
import shutil
from html.parser import HTMLParser
from urllib.parse import urljoin

import requests


# Remove characters unsuitable to be used in a file name.
def StringToFileName(s):
    """Turn a podcast title into a string that can be used as a file name.

    The substitutions are applied in this order:

    * ``#`` becomes ``No.``
    * ``?`` is removed
    * each of ``& + $ * ! @ % ^ | < > /`` becomes ``_``
    * a colon together with any whitespace that follows it becomes ``-``

    Args:
        s: The title text.

    Returns:
        The sanitized title.
    """
    s = s.replace('#', 'No.')
    s = re.sub(r'[?]', '', s)
    s = re.sub(r'[&+$*!@%^|<>/]', '_', s)
    # The whitespace class needs its backslash: ':s*' would match a colon
    # followed by literal "s" characters and leave the blank after the colon
    # in place.
    s = re.sub(r':\s*', '-', s)
    return s


# Class for parse podcast main page and localize the links to subpages.
class PlusPodcastPageParser(HTMLParser):
    """Collect subpage links and titles from a podcast listing page.

    The parser walks through three states, each recorded by a flag:
    a ``<div class="views-field views-field-title">`` start tag, then a
    ``<span class="field-content">`` start tag, then an ``<a>`` start tag
    with an ``href``. The text that follows the ``<a>`` tag is taken as the
    title, after which all flags are reset for the next entry.

    Attributes:
        root_url: Base URL that the collected hrefs are resolved against.
        subpage_url_list: Absolute URLs of the subpages found so far.
        subpage_title_list: File-name-safe titles, in the same order as
            ``subpage_url_list``.
    """

    def __init__(self):
        super().__init__()
        self.root_url = 'https://plus.maths.org'
        self.subpage_url_list = []
        self.subpage_title_list = []
        self.is_subpage_div_found = False
        self.is_subpage_span_found = False
        self.is_subpage_link_found = False

    def handle_starttag(self, tag, attrs):
        """Advance the div -> span -> a state machine on a start tag.

        Args:
            tag: Lower-cased tag name supplied by ``HTMLParser``.
            attrs: List of ``(name, value)`` pairs for the tag.
        """
        if self.is_subpage_link_found:
            # A link is pending; its title text is consumed by handle_data.
            return

        if tag == 'div' and not self.is_subpage_div_found:
            if ('class', 'views-field views-field-title') in attrs:
                self.is_subpage_div_found = True
        elif tag == 'span' and self.is_subpage_div_found:
            if ('class', 'field-content') in attrs:
                self.is_subpage_span_found = True
        elif tag == 'a' and self.is_subpage_span_found:
            # An href without a value is reported as None; it cannot be
            # turned into a URL, so it is ignored.
            href = next(
                (value for name, value in attrs
                 if name == 'href' and value is not None),
                None,
            )
            if href is not None:
                self.is_subpage_link_found = True
                # urljoin keeps absolute hrefs intact and resolves relative
                # ones against the site root.
                self.subpage_url_list.append(urljoin(self.root_url, href))

    def handle_data(self, data):
        """Record the text after a found link as the podcast title.

        Args:
            data: Text content supplied by ``HTMLParser``.
        """
        if not self.is_subpage_link_found:
            return

        title = data.strip()
        if not title:
            # Whitespace between the <a> tag and the real title text is not
            # a title; keep waiting for the next piece of text.
            return

        self.subpage_title_list.append(StringToFileName(title))

        # Reset pattern searching flags and prepare for the next search.
        self.is_subpage_div_found = False
        self.is_subpage_span_found = False
        self.is_subpage_link_found = False


# Class for parse podcast subpage which contains the mp3 file.
class PlusPodcastSubpageParser(HTMLParser):
    """Extract the mp3 file URL from a podcast subpage.

    The parser looks for a ``<div class="field-item even">`` start tag, then
    a ``<span class="file">`` start tag, then an ``<a>`` start tag with an
    ``href``. Only the first such link is kept; later tags are ignored.

    Attributes:
        root_url: Base URL that a relative href is resolved against.
        podcast_url: URL of the mp3 file, or ``''`` when none was found.
    """

    def __init__(self):
        super().__init__()
        self.root_url = 'https://plus.maths.org'
        self.podcast_url = ''
        self.is_subpage_div_found = False
        self.is_subpage_span_found = False
        self.is_podcast_link_found = False

    def handle_starttag(self, tag, attrs):
        """Advance the div -> span -> a state machine on a start tag.

        Args:
            tag: Lower-cased tag name supplied by ``HTMLParser``.
            attrs: List of ``(name, value)`` pairs for the tag.
        """
        if self.is_podcast_link_found:
            return

        if tag == 'div' and not self.is_subpage_div_found:
            if ('class', 'field-item even') in attrs:
                self.is_subpage_div_found = True
        elif tag == 'span' and self.is_subpage_div_found:
            if ('class', 'file') in attrs:
                self.is_subpage_span_found = True
        elif tag == 'a' and self.is_subpage_span_found:
            # An href without a value is reported as None and is skipped so
            # that podcast_url always stays a string.
            href = next(
                (value for name, value in attrs
                 if name == 'href' and value is not None),
                None,
            )
            if href is not None:
                # Function-scope import: urljoin is needed only here.
                from urllib.parse import urljoin

                self.is_podcast_link_found = True
                # Absolute hrefs pass through unchanged; relative ones are
                # resolved against the site root.
                self.podcast_url = urljoin(self.root_url, href)


number_of_podcast_pages = 16

# Seconds to wait for the server before a request is abandoned.
REQUEST_TIMEOUT_SECONDS = 30


def _fetch_html(url):
    """Return the text of the page at *url*, or None if it cannot be fetched."""
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as error:
        print('Request failed: ' + str(error))
        return None
    with response:
        if response.status_code != 200:
            return None
        return response.text


def _download_file(url, file_name):
    """Stream the content at *url* into *file_name*; return True on success."""
    try:
        response = requests.get(url, stream=True,
                                timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as error:
        print('Request failed: ' + str(error))
        return False
    with response:
        # Without this check an error page would be saved as an mp3 file.
        if response.status_code != 200:
            return False
        with open(file_name, 'wb') as mp3_file:
            shutil.copyfileobj(response.raw, mp3_file)
    return True


def main():
    """Download the mp3 file of every podcast listed on the index pages.

    Files are written to the current directory as ``<counter>-<title>.mp3``,
    where the counter starts at 1 and advances after each saved file.
    """
    mp3_file_counter = 1

    for page_idx in range(number_of_podcast_pages):
        page_url = 'https://plus.maths.org/content/podcast'
        if page_idx > 0:
            page_url += '?page=' + str(page_idx)

        current_page = _fetch_html(page_url)
        if current_page is None:
            print('Cannot get the podcast page: ' + page_url)
            continue

        page_parser = PlusPodcastPageParser()
        page_parser.feed(current_page)

        subpage_urls = page_parser.subpage_url_list
        subpage_titles = page_parser.subpage_title_list
        if len(subpage_urls) != len(subpage_titles):
            print('The numbers of subpage urls and titles should be the same!')
            continue

        # Iterate over each found subpage url together with its title.
        for subpage_url, subpage_title in zip(subpage_urls, subpage_titles):
            current_subpage = _fetch_html(subpage_url)
            if current_subpage is None:
                print('Cannot get the podcast subpage: ' + subpage_url)
                continue

            subpage_parser = PlusPodcastSubpageParser()
            subpage_parser.feed(current_subpage)
            podcast_url = subpage_parser.podcast_url
            if not podcast_url:
                print('No mp3 link found on the podcast subpage: '
                      + subpage_url)
                continue

            print('*** Downloading ' + podcast_url + ' ...')
            file_name = str(mp3_file_counter) + '-' + subpage_title + '.mp3'
            if _download_file(podcast_url, file_name):
                mp3_file_counter += 1
            else:
                print('Cannot download the podcast file: ' + podcast_url)


if __name__ == '__main__':
    main()
运行该程序,稍等片刻,就可以得到所有的Podcast资源了。