zoukankan      html  css  js  c++  java
  • python爬虫-喜马拉雅_晚安妈妈睡前故事

    这里先说下思路:

    1、首先要获取当前书的音频信息

            '''获取当前书的音频信息'''
            all_list = []
            for url in self.book_url:
                r = requests.get(url, headers=self.headers)
                ret = r.content.decode()
                # ret通过requests请求得到的网页源代码,是一个json数据类型
                pyhton_dict = json.loads(ret)  # 通过json.loads(ret)把ret这个json类型的字符串变成python的dict
                # print(pyhton_dict)
                pythonData = pyhton_dict['data']['tracksAudioPlay']
                # print(pythonData)
                for book in pythonData:
                    # 取出每个音频的播放地址和名字
                    list = {}
                    list['src'] = book['src']
                    list['name'] = book['trackName']
                    print(list)
                    all_list.append(list)
            return all_list  # 所有音频的信息,只是一个list

    2、然后遍历保存

      for i in all_list:
          # Walk through every track and save it.
          print(i)
          i['name'] = re.sub('"', '', i['name'])  # strip double quotes from the track name before using it as a file name
          # The path must be a raw string: in a normal literal "\x" starts a hex
          # escape, so 'D:\xima' is rejected with a SyntaxError.
          with open(r'D:\xima\{}.m4a'.format(self.name + i['name']), 'ab') as f:  # 'wb' would overwrite existing data, 'ab' appends instead
              r = requests.get(i['src'], headers=self.headers)
              ret = r.content
              f.write(ret)
      print("下载完毕")

    3、最后直接上代码啦!

    import requests
    from lxml import etree
    import re
    import json
    
    class Xima(object):
        """Download every track of one Ximalaya album as ``.m4a`` files.

        The album id and the page range (pages 1-30, 30 tracks per page) are
        fixed in the listing URL template built by ``__init__``.
        """

        def __init__(self, name):
            """Prepare the request headers and the album listing URLs.

            Args:
                name: Prefix placed in front of each track name when the saved
                    files are named.
            """
            self.name = name
            self.headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
            }
            # "{}" is the placeholder for the page number.
            self.start_url = "https://www.ximalaya.com/revision/play/album?albumId=260744&pageNum={}&sort=-1&pageSize=30"
            # Page numbers are 1-based: pages 1 through 30.
            self.book_url = [self.start_url.format(page) for page in range(1, 31)]
            print(self.book_url)
            print(len(self.book_url))

        def get_book_msg(self):
            """Collect the play URL and the name of every track in the album.

            Returns:
                A list of dicts, one per track, each with the keys ``'src'``
                (the URL later downloaded by ``save``) and ``'name'`` (the
                track's ``trackName``).
            """
            all_list = []
            for url in self.book_url:
                r = requests.get(url, headers=self.headers)
                # The response body is JSON text; parse it into a Python dict.
                page_data = json.loads(r.content.decode())
                for book in page_data['data']['tracksAudioPlay']:
                    # Named "track" rather than "list" so the builtin is not shadowed.
                    track = {'src': book['src'], 'name': book['trackName']}
                    print(track)
                    all_list.append(track)
            return all_list

        def save(self, all_list):
            """Download each track and write it to ``D:/xima`` as an ``.m4a`` file.

            Args:
                all_list: Track dicts as returned by ``get_book_msg``; each must
                    have ``'src'`` and ``'name'``. The ``'name'`` value is updated
                    in place with double quotes removed.
            """
            print("开始下载")
            for i in all_list:
                print(i)
                # Strip double quotes from the track name before it becomes a file name.
                i['name'] = i['name'].replace('"', '')
                # Fetch first, so a failed request does not leave an empty file behind.
                audio = requests.get(i['src'], headers=self.headers).content
                # Raw string: in a normal literal "\x" starts a hex escape, so
                # 'D:\xima' is rejected with a SyntaxError.
                target = r'D:\xima\{}.m4a'.format(self.name + i['name'])
                # 'ab' appends: an existing file with the same name is extended,
                # not overwritten ('wb' would overwrite).
                with open(target, 'ab') as f:
                    f.write(audio)
            print("下载完毕")

        def run(self):
            """Fetch the track list, then download every track."""
            all_list = self.get_book_msg()
            self.save(all_list)
    
    if __name__ == "__main__":
        # Script entry point: list the album's tracks, then download them all.
        album_title = '晚安妈妈睡前故事'
        Xima(album_title).run()
  • 相关阅读:
    MySQL语法
    SQL必知必会
    大话设计模式
    软件工程
    myeclipse中git的使用
    提取文件中图片标签
    pandas 分析各地区男女的分布情况
    提取图片标签
    Requests+正则表达式爬取猫眼电影Top100
    PyCharm激活码
  • 原文地址:https://www.cnblogs.com/lixy-88428977/p/9366913.html
Copyright © 2011-2022 走看看