zoukankan      html  css  js  c++  java
  • Python 使用selenium抓取网页文本和下载音频

    Python 使用selenium抓取网页文本和下载音频

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    
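    """Download the audio and scrape the transcripts of https://podcast.duolingo.com/spanish.

    The libraries imported below must be installed and a driver for
    webdriver.Chrome() must be set up, otherwise the script fails.
    """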
    
    import os
    import re
    import shelve
    import time
    from urllib.parse import urljoin

    import requests
    from selenium import webdriver
    from selenium.webdriver.common.by import By
                       
    def mainProc():
        """Run the whole scrape: open the shelf, collect pages, then process episodes.

        The shelf is closed even when one of the steps raises, so whatever was
        collected before the failure is still written to disk.
        """
        db = openDb()
        try:
            get_pages(db)
            get_episodes(db)
        finally:
            # The shelf is opened with writeback=True, so lists mutated in place
            # are only persisted when the shelf is synced or closed.
            db.close()
    
    def openDb():
        """Open the ``data`` shelf in the current directory, creating it if needed.

        Returns:
            A ``shelve`` shelf opened with ``writeback=True`` that is guaranteed
            to contain the keys ``"pages"`` and ``"episodes"`` (each a list).
            The caller is responsible for closing it.
        """
        db = shelve.open("data", writeback=True)
        # Initialise missing keys on the shelf itself instead of testing for a
        # "data.dat" file: the on-disk file name depends on the dbm backend, so
        # that test could wipe previously stored lists on every run.
        db.setdefault("pages", [])
        db.setdefault("episodes", [])
        return db
    
    def get_pages(db):
        """Discover listing pages and the episode links on them, storing both in ``db``.

        Args:
            db: mapping with list values under ``"pages"`` and ``"episodes"``.
                Newly fetched page URLs and the episode URLs found on them are
                appended in place.

        Pages already present in ``db["pages"]`` are not fetched again. Probing
        stops at the first page that does not answer with HTTP 200.
        """
        # Landing page of the podcast; it doubles as page 1 of the listing.
        main = 'https://podcast.duolingo.com/spanish'

        # Page 1 is `main` itself; later pages append the page number to it,
        # e.g. 'https://podcast.duolingo.com/spanish2', and so on.
        for i in range(1, 100):
            page = main if i == 1 else main + str(i)
            if page in db["pages"]:
                continue

            r = requests.get(page)
            print(f'{page} with status code {r.status_code}.')
            if r.status_code != 200:
                break
            db["pages"].append(page)

            # Collect the episode links of this page. `\s*` skips the whitespace
            # between the title tag and the anchor; the non-greedy group keeps a
            # match from running on to a later `" rel` on the same line.
            for href in re.findall(r'entry-title">\s*<a href="(.*?)" rel', r.text):
                # Resolve the (relative) href against the listing URL.
                episode = urljoin(main, href)
                if episode not in db["episodes"]:
                    db["episodes"].append(episode)
                            
    def get_episodes(db):
        """Fetch every episode URL stored in ``db["episodes"]`` and process it.

        For each episode page that answers with HTTP 200, the transcript is
        saved via ``get_transcript`` and the audio is downloaded via
        ``get_audios``. Pages with any other status are skipped.
        """
        for episode in db["episodes"]:
            r = requests.get(episode)
            print(f'{episode} with status code {r.status_code}.')
            if r.status_code == 200:
                # Save the page text to a file, then download the audio.
                get_transcript(episode)
                get_audios(r, episode)
    
    def get_transcript(episode):
        """Save the transcript of one episode page to ``transcript/<slug>.txt``.

        Args:
            episode: URL of the episode page; its last path segment is used as
                the file name.

        Nothing is fetched when the file already exists. When the page does not
        answer with HTTP 200 no file is written, so a later run can retry.
        """
        filename = 'transcript/' + episode.split('/')[-1] + '.txt'
        if os.path.exists(filename):
            print(filename, 'existed!')
            return

        req = requests.get(episode)
        print(f'{episode} with status code {req.status_code}.')
        if req.status_code != 200:
            # Writing an empty file here would make later runs report the
            # transcript as already existing and never fetch it.
            return

        os.makedirs('transcript', exist_ok=True)
        with open(filename, 'w', encoding="utf-8") as fp:
            # Each match is a (text inside <strong>, text up to </p>) pair;
            # write both parts, then a blank line between entries.
            for lines in re.findall('strong>(.*)</strong>(.*)</p>', req.text):
                for line in lines:
                    fp.write(line)
                fp.write('\n\n')
        print(filename, 'added!')
    
    def get_audios(r, episode, timeout=600):
        """Download an episode's audio by clicking the player's download button in Chrome.

        Args:
            r: response of the episode page; only ``r.text`` is read.
            episode: URL of the episode page, used in the skip message.
            timeout: maximum number of seconds to wait for the download.

        The file is saved into the ``audio`` directory under the current working
        directory. If the page contains no player iframe the episode is skipped.
        """
        players = re.findall('<iframe .* src="(.*)" height', r.text)
        if not players:
            print(f'{episode} has no audio player iframe, skipped.')
            return
        # The iframe src has no scheme, so "https:" is prepended.
        audio = "https:" + players[0]

        # Use one absolute directory both as Chrome's download target and as the
        # directory polled below, so the two can never disagree.
        download_dir = os.path.abspath("audio")
        os.makedirs(download_dir, exist_ok=True)

        # Custom download configuration.
        chromeOptions = webdriver.ChromeOptions()
        chromeOptions.add_argument("--ignore-certificate-errors")
        prefs = {"download.default_directory": download_dir}
        chromeOptions.add_experimental_option("prefs", prefs)

        print(audio)
        browser = webdriver.Chrome(options=chromeOptions)
        try:
            browser.get(audio)
            before = set(os.listdir(download_dir))
            browser.find_element(By.ID, 'download-player').click()

            # Wait until the download has started and no partial
            # ".crdownload" file is left in the directory.
            started = False
            deadline = time.monotonic() + timeout
            while time.monotonic() < deadline:
                names = os.listdir(download_dir)
                in_progress = any(name.endswith(".crdownload") for name in names)
                started = started or in_progress or bool(set(names) - before)
                if started and not in_progress:
                    break
                time.sleep(1)
            else:
                print(f'{audio} did not finish downloading within {timeout} seconds.')
        finally:
            # quit() also ends the driver process; one browser is started per episode.
            browser.quit()
    
    # Run the scraper only when this file is executed as a script, not on import.
    if __name__ == "__main__":
        mainProc()
        
  • 相关阅读:
    java面试常见的类
    Day6
    DAY5
    Day4
    Day3
    Day2
    Day1
    echarts3关系图:力引导布局, 固定某些节点
    Hbuilder中配置cmd
    webpack 学习
  • 原文地址:https://www.cnblogs.com/noonjuan/p/12218402.html
Copyright © 2011-2022 走看看