Python 使用selenium抓取网页文本和下载音频
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Download Duolingo Spanish podcast audio and scrape episode transcripts.

The script walks the listing pages under
https://podcast.duolingo.com/spanish, remembers the visited pages and the
episode links in a ``shelve`` file named ``data``, writes one transcript
text file per episode into ``transcript/`` and downloads the audio through
a Chrome browser into ``audio/``.

Requires the third-party ``requests`` and ``selenium`` packages and a
Chrome WebDriver that ``webdriver.Chrome()`` can start; otherwise the
script fails.
"""

import os
import re
import shelve
import time

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By


def mainProc():
    """Run the whole crawl: collect listing pages, then process episodes."""
    db = openDb()
    try:
        get_pages(db)
        get_episodes(db)
    finally:
        # Close (and thereby flush) the shelf even when a request or the
        # browser raises, so pages recorded so far are not lost.
        db.close()


def openDb():
    """Open the ``data`` shelf and make sure its bookkeeping lists exist.

    Returns:
        A shelf opened with ``writeback=True`` that holds the list-valued
        keys ``"pages"`` and ``"episodes"``.
    """
    db = shelve.open("data", writeback=True)
    # The on-disk file name depends on the dbm backend (it is not always
    # "data.dat"), so seed missing keys instead of probing for a file.
    db.setdefault("pages", [])
    db.setdefault("episodes", [])
    return db


def get_pages(db):
    """Visit the listing pages and store every episode link found on them.

    The first listing page is the main URL itself; the following ones are
    the main URL with 2, 3, ... appended. Pages already recorded in
    ``db["pages"]`` are skipped, and the scan stops at the first page that
    does not answer with status 200.

    Args:
        db: Mapping with the list-valued keys ``"pages"`` and
            ``"episodes"``; both lists are extended in place.
    """
    main = 'https://podcast.duolingo.com/spanish'
    # Listing URL without its trailing "spanish" (7 characters).
    base = main[:-7]
    for i in range(1, 100):
        page = main if i == 1 else main + str(i)
        if page in db["pages"]:
            continue
        r = requests.get(page)
        print(f'{page} with status code {r.status_code}.')
        if r.status_code != 200:
            break
        db["pages"].append(page)
        # Collect the episode links of this page and make them absolute.
        for href in re.findall(r'entry-title">\s*<a href="(.*)" rel', r.text):
            # The first two characters of the href are dropped, presumably
            # a leading "./" -- confirm against the live markup.
            episode = base + href[2:]
            if episode not in db["episodes"]:
                db["episodes"].append(episode)


def get_episodes(db):
    """Fetch every stored episode page and save its transcript and audio.

    Args:
        db: Mapping whose ``"episodes"`` entry lists the episode URLs.
    """
    for episode in db["episodes"]:
        r = requests.get(episode)
        print(f'{episode} with status code {r.status_code}.')
        if r.status_code != 200:
            continue
        # Write the page text to a file and download the audio.
        get_transcript(episode)
        get_audios(r, episode)


def get_transcript(episode):
    """Write the transcript of one episode page to ``transcript/<name>.txt``.

    ``<name>`` is the last path segment of the episode URL. Nothing is
    fetched when the file already exists, and no file is written when the
    page does not answer with status 200.

    Args:
        episode: URL of the episode page.
    """
    # Strip a trailing slash so the last path segment is never empty.
    filename = 'transcript/' + episode.rstrip('/').split('/')[-1] + '.txt'
    if os.path.exists(filename):
        print(filename, 'existed!')
        return
    req = requests.get(episode)
    print('{episode} with status code {status}.'.format(episode=episode, status=req.status_code))
    if req.status_code != 200:
        # An empty transcript file would make later runs skip this episode.
        return
    os.makedirs('transcript', exist_ok=True)
    with open(filename, 'w+', encoding="utf-8") as fp:
        # Each match is a (text inside <strong>, text up to </p>) pair.
        for lines in re.findall('strong>(.*)</strong>(.*)</p>', req.text):
            for line in lines:
                fp.write(line)
                fp.write(' ')
    print(filename, 'added!')


def _wait_for_downloads(directory, poll_seconds=5):
    """Block until ``directory`` holds no file ending in ``.crdownload``."""
    # Wait once before the first check so the browser has time to create
    # its in-progress file; an empty listing would otherwise be mistaken
    # for a finished download.
    time.sleep(poll_seconds)
    while any(name.endswith(".crdownload") for name in os.listdir(directory)):
        time.sleep(poll_seconds)


def get_audios(r, episode):
    """Download the audio of one episode by clicking the player's button.

    The player URL is taken from the ``src`` of the iframe in the episode
    page; Chrome is configured to save downloads into the local ``audio``
    directory, and the call returns once no ``.crdownload`` file is left
    there.

    Args:
        r: Response of the episode page; its ``text`` is searched for the
            player iframe.
        episode: URL of the episode page, used in the skip message.
    """
    sources = re.findall('<iframe .* src="(.*)" height', r.text)
    if not sources:
        print(f'{episode} has no audio iframe, skipped.')
        return
    audio = "https:" + sources[0]

    # Create the target directory first and hand Chrome its absolute path,
    # so the directory being polled is the one Chrome writes to.
    os.makedirs("audio", exist_ok=True)
    download_dir = os.path.abspath("audio")

    # Custom download configuration.
    chromeOptions = webdriver.ChromeOptions()
    chromeOptions.add_argument("--ignore-certificate-errors")
    prefs = {"download.default_directory": download_dir}
    chromeOptions.add_experimental_option("prefs", prefs)

    # Download the file.
    print(audio)
    browser = webdriver.Chrome(options=chromeOptions)
    try:
        browser.get(audio)
        browser.find_element(By.ID, 'download-player').click()
        _wait_for_downloads(download_dir)
    finally:
        # quit() also ends the driver session, even after a failure.
        browser.quit()


if __name__ == "__main__":
    mainProc()