zoukankan      html  css  js  c++  java
  • python3爬虫-网易云排行榜,网易云歌手及作品

    import requests, re, json, os, time
    from fake_useragent import UserAgent
    from lxml import etree
    from urllib import parse
    
    
    class MyError(Exception):
        """Domain error carrying a numeric status code and a human-readable message.

        status -- integer error code (e.g. 10000 = empty data, 10001 = length mismatch)
        msg    -- description shown to the operator
        """

        def __init__(self, status, msg):
            # BUG FIX: call Exception.__init__ so str(e) / e.args behave normally
            # (the original left them empty).
            super().__init__(msg)
            self.status = status
            self.msg = msg
    
    
    class WyRinking():
        """Scrape every NetEase Cloud Music toplist and dump each one to a
        JSON-lines file under the global Rink_BASE_PATH directory.

        Public entry point: collectRanking().
        """

        def __init__(self):
            ua = UserAgent()
            self.stratUrl = "https://music.163.com/discover/toplist"
            self.headers = {
                "User-Agent": ua.random
            }
            self.timeout = 10
            self.allow_redirects = False
            self.nameList = []  # toplist display names, filled by __getRinkNameUrl
            self.urlList = []   # toplist relative hrefs, parallel to nameList

        def __getRinkNameUrl(self, response):
            '''Collect every toplist name and relative url from the overview page.'''
            html_selector = self.__etreeSelector(response)
            self.nameList = html_selector.xpath(
                "//div[contains(@class,'item') and contains(@class,'f-cb')]/p[@class='name']/a/text()") or []
            self.urlList = html_selector.xpath(
                "//div[contains(@class,'item') and contains(@class,'f-cb')]/p[@class='name']/a/@href") or []

        def __getPageHtml(self, url):
            '''GET *url*, retrying (unbounded) on timeout; returns a requests.Response.'''
            try:
                return requests.get(url, headers=self.headers, timeout=self.timeout,
                                    allow_redirects=self.allow_redirects)
            except requests.exceptions.Timeout as e:
                print("Timeout Error>>:", e)
                # BUG FIX: the original dropped the recursive call's return value,
                # so a timed-out request silently yielded None to the caller.
                return self.__getPageHtml(url=url)

        def __getRankHtml(self):
            '''Fetch each toplist page and hand it to __getRankInfo.

            Raises MyError(10000) when nothing was scraped, MyError(10001) when
            names and urls do not pair up.
            '''
            if not self.nameList and not self.urlList:
                raise MyError(10000, "{},{} 数据不能为空".format(self.nameList, self.urlList))
            if len(self.nameList) != len(self.urlList):
                raise MyError(10001, "nameList,urlList数据不能一一对应")
            for name, href in zip(self.nameList, self.urlList):
                url = parse.urljoin(self.stratUrl, url=href)
                response = self.__getPageHtml(url=url)
                # Piggyback the list name on the response so __getRankInfo can
                # use it as the output file name.
                response.customizeName = name
                self.__getRankInfo(response)

        def __getRankInfo(self, response):
            '''Parse the JSON song data embedded in the page and write it to a file.'''
            html_selector = self.__etreeSelector(response)
            # BUG FIX: indexing [0] on an empty xpath result raised IndexError
            # before the `or ""` fallback could apply; guard explicitly instead.
            preData = html_selector.xpath("//*[@id='song-list-pre-data']/text()")
            test = preData[0] if preData else ""
            updateTime = html_selector.xpath("//span[contains(@class,'sep') and contains(@class,'s-fc3')]/text()")[0]
            try:
                data = json.loads(test)
            except json.decoder.JSONDecodeError:
                # The embedded JSON is sometimes truncated by the site; patch the
                # tail and retry once.
                data = json.loads(test + '"}}]')
            fileName = response.customizeName + '--' + updateTime + ".json"
            if not Rink_BASE_PATH:
                raise MyError(10005, "需要在全局中配置该参数Rink_BASE_PATH,用于文件存放地址")
            if not os.path.exists(Rink_BASE_PATH):
                os.makedirs(Rink_BASE_PATH)
            path = os.path.join(Rink_BASE_PATH, fileName)
            self.__writeToFile(path, data)

        def __writeToFile(self, path, data):
            '''Write one JSON object per line: rank number, song id/name, artists,
            and comment thread id.'''
            # BUG FIX: the original logged '{}.json' although *path* already ends
            # in .json, producing a doubled extension in the message.
            print('正在写入文件{}'.format(path))
            with open(path, "w", encoding="utf-8") as f:
                for index, data_dic in enumerate(data, start=1):
                    dic = {
                        "rankNum": index,
                        "songId": data_dic.get("id"),
                        "songName": data_dic.get("name"),
                        "artistsInfo": data_dic.get("artists"),
                        "commentThreadId": data_dic.get("commentThreadId"),
                    }
                    # BUG FIX: the scraped copy had a literal line break inside the
                    # string; restore the "\n" record separator.
                    f.write(json.dumps(dic, ensure_ascii=False) + "\n")

        def __reSongId(self, songurl: str):
            '''Extract the numeric song id from a url like "/song?id=1336871144".

            Raises MyError(10002) when no id is present.
            '''
            # BUG FIX: the scraped copy lost the backslash — r"id=(d+)" matched
            # a literal letter "d"; restore the digit class.
            pattern = r"id=(\d+)"
            try:
                songId = re.findall(pattern, songurl)[0]
            except IndexError:
                raise MyError(10002, "歌曲id获取失败")
            return songId

        def collectRanking(self):
            '''Fetch the toplist overview page, then scrape every toplist.'''
            response = self.__getPageHtml(url=self.stratUrl)
            self.__getRinkNameUrl(response)
            self.__getRankHtml()

        def __etreeSelector(self, response):
            '''Parse response.text into an lxml element tree for xpath queries.'''
            return etree.HTML(response.text)
    
    
    class WySinger():
        """Scrape every NetEase Cloud Music artist (per category, per initial A-Z)
        and write each artist's bio plus song list to a text file under the global
        Singer_BASE_PATH directory.

        Public entry point: collectSinger().
        """

        # The A-Z "initial" parameters only need to be scraped once, from the
        # first category page requested.
        __isFirstStatus = True

        def __init__(self):
            ua = UserAgent()
            self.stratUrl = "https://music.163.com/discover/artist"
            self.headers = {
                "User-Agent": ua.random
            }
            self.timeout = 10
            self.allow_redirects = False
            self.sCategoryNameList = []  # category display names
            self.sCategoryIdList = []    # category ids (data-cat attribute)
            self.sCategoryUrlList = []   # absolute category urls
            self.initialIdList = []      # "initial" query values (A-Z, -1 = hot)
            self.markList = []           # display labels matching initialIdList

        def __getPageHtml(self, url):
            '''GET *url*, retrying (unbounded) on timeout; returns a requests.Response.'''
            try:
                return requests.get(url, headers=self.headers, timeout=self.timeout,
                                    allow_redirects=self.allow_redirects)
            except requests.exceptions.Timeout as e:
                print("Timeout Error>>:", e)
                # BUG FIX: the original dropped the recursive call's return value,
                # so a timed-out request silently yielded None to the caller.
                return self.__getPageHtml(url=url)

        def __getSingerCategory(self, response):
            '''Collect artist category names, ids and absolute urls from the nav bar.'''
            htmlSelector = self.__etreeSelector(response)
            sCategoryNameList = htmlSelector.xpath(
                "//*[@id='singer-cat-nav']/div[@class='blk']//li/a[@class='cat-flag']/text()")
            sCategoryIdList = htmlSelector.xpath(
                "//*[@id='singer-cat-nav']/div[@class='blk']//li/a[@class='cat-flag']/@data-cat")
            sCategoryUrlList = htmlSelector.xpath(
                "//*[@id='singer-cat-nav']/div[@class='blk']//li/a[@class='cat-flag']/@href")
            # Only accept the scrape when the three lists pair up one-to-one.
            if sCategoryUrlList and len(sCategoryNameList) == len(sCategoryIdList) == len(sCategoryUrlList):
                self.sCategoryNameList = sCategoryNameList or []
                self.sCategoryIdList = sCategoryIdList or []
                self.sCategoryUrlList = [parse.urljoin(self.stratUrl, url) for url in sCategoryUrlList or []]

        def __getSingerListPage(self):
            '''Yield the artist-list response for every (category, initial) pair.

            Raises MyError(10000/10001) when category data is missing or misaligned.
            '''
            if not self.sCategoryNameList and not self.sCategoryUrlList:
                raise MyError(10000, "{},{} 数据不能为空".format(self.sCategoryNameList, self.sCategoryUrlList))
            if len(self.sCategoryNameList) != len(self.sCategoryUrlList):
                raise MyError(10001, "nameList,urlList数据不能一一对应")
            for sCategoryUrl in self.sCategoryUrlList:
                response = self.__getPageHtml(sCategoryUrl)
                if self.__isFirstStatus:
                    self.__getInitialId(response)
                    self.__isFirstStatus = False

                for inintalId in self.initialIdList:
                    if inintalId == "-1":
                        # -1 means "hot" which duplicates artists listed under
                        # their initials, so skip it.
                        continue
                    url = sCategoryUrl + "&initial=" + inintalId
                    res = self.__getPageHtml(url)
                    yield res

        def __getSingerIdUrl(self, response):
            '''Yield one list of (artistUrl, artistName) tuples for the given page.'''
            htmlSelector = self.__etreeSelector(response)
            aSelector = htmlSelector.xpath(
                "//*[@id='m-artist-box']//a[@class='msk'] | //*[@id='m-artist-box']/li[@class='sml']/a[1]")
            singerUrlList = [parse.urljoin(self.stratUrl, selector.xpath("@href")[0]) for selector in aSelector]
            singerNameList = [selector.xpath("@title")[0].replace("的音乐", "") for selector in aSelector]
            if singerUrlList and len(singerUrlList) == len(singerNameList):
                yield list(zip(singerUrlList, singerNameList))
            else:
                yield []

        def __getInitialId(self, response):
            '''Scrape the A-Z "initial" ids and their display labels.'''
            htmlSelector = self.__etreeSelector(response)
            urlList = htmlSelector.xpath("//*[@id='initial-selector']/li/a/@href")
            initialIdList = [self.__reInitialId(url) for url in urlList]
            markList = htmlSelector.xpath("//*[@id='initial-selector']/li/a/text()")

            if len(initialIdList) == len(markList):
                self.initialIdList = initialIdList
                self.markList = markList

        def __reInitialId(self, url):
            '''Extract the "initial" query value from a url like
            "/discover/artist/cat?id=1001&initial=-1".'''
            pattern = r"initial=(.*)"
            initialId = re.findall(pattern, url, re.S)[0]
            return initialId

        def __getSingerDetails(self, response):
            '''Return (songList, bioText) for an artist page, or (None, None) when
            the page carries no song data (some artists have no works).'''
            htmlSelector = self.__etreeSelector(response)
            try:
                data_json = htmlSelector.xpath("//*[@id='song-list-pre-data']/text()")[0]
                data_list = json.loads(data_json, strict=False)
                singerDetails_json = htmlSelector.xpath("//script[@type='application/ld+json']/text()")[0]
                singerDetails_dict = json.loads(singerDetails_json, strict=False)
                singerDetails_content = singerDetails_dict.get("description")
                return data_list, singerDetails_content
            except Exception as e:
                # Artists without works make the [0] index raise; treat any parse
                # failure as "no data" and let the caller skip this artist.
                print(e)
                return None, None

        def __writeToFile(self, datalist, singerDetails_content, singerName):
            '''Write the artist bio and one block per song to <Singer_BASE_PATH>/<name>.txt.'''
            if not os.path.exists(Singer_BASE_PATH):
                os.makedirs(Singer_BASE_PATH)
            path = os.path.join(Singer_BASE_PATH, singerName)
            print("正在写入{}".format(singerName))
            # BUG FIX: the scraped copy had literal line breaks inside these
            # strings; restore the "\n" separators throughout.
            with open(path + ".txt", 'w', encoding="utf-8") as f:
                f.write("歌手简介:{}".format(singerDetails_content) + "\n")
                for data in datalist:
                    f.write("-" * 50 + "\n")
                    f.write("歌曲名:{}".format(data.get("name")) + "\n")
                    # Guard the nested lookups: "privilege"/"album" may be absent.
                    f.write("歌曲ID:{}".format((data.get("privilege") or {}).get("id")) + "\n")
                    f.write("歌曲专辑:{}".format((data.get("album") or {}).get("name")) + "\n")
                    f.write("歌曲别号:{}".format("" if not data.get("alias") else data.get("alias")) + "\n")

        def __etreeSelector(self, response):
            '''Parse response.text into an lxml element tree for xpath queries.'''
            return etree.HTML(response.text)

        def collectSinger(self):
            '''Walk every category page, then every artist on it, writing one
            file per artist.'''
            response = self.__getPageHtml(url=self.stratUrl)
            self.__getSingerCategory(response)
            for res in self.__getSingerListPage():
                time.sleep(1)  # be polite to the server
                # __getSingerIdUrl is a generator yielding one list of
                # (artistUrl, artistName) tuples per page.
                for pairs in self.__getSingerIdUrl(res):
                    for singerUrl, singerName in pairs:
                        singerResponse = self.__getPageHtml(singerUrl)
                        datalist, singerDetails_content = self.__getSingerDetails(singerResponse)
                        if not datalist and not singerDetails_content:
                            continue
                        self.__writeToFile(datalist, singerDetails_content, singerName)
    
    
    if __name__ == '__main__':
        # Output directories for the two scrapers. NOTE(review): the scraped copy
        # had lost the path separators (r"D:spidersDataRinking"); backslashes
        # restored here — confirm these match the intended layout.
        Rink_BASE_PATH = r"D:\spidersData\Rinking"
        Singer_BASE_PATH = r"D:\spidersData\SingerInfo"
        wangyiyun = WyRinking()
        wangyiyun.collectRanking()  # scrape all NetEase Cloud Music toplists
        wangyiyun = WySinger()
        wangyiyun.collectSinger()  # scrape every artist and their works
  • 相关阅读:
    SDN实验 7: OpenDaylight 实验——Python 中的 REST API 调用
    2020软工第四次作业:结对编程作业
    SDN实验 6: OpenDaylight 实验——OpenDaylight 及 Postman 实现流表下发
    SDN实验 5: OpenFlow 协议分析和 OpenDaylight 安装
    SDN实验 4: Open vSwitch 实验——Mininet 中使用 OVS 命令
    2020软工第二次作业
    SDN实验3:Mininet 实验——测量路径的损耗率
    软件工程实践个人总结
    软件工程实践番外篇——获小黄衫有感
    软件定义网络实验 7:OpenDaylight 实验——Python 中的 REST API 调用(含选做题)
  • 原文地址:https://www.cnblogs.com/zhuchunyu/p/10765932.html
Copyright © 2011-2022 走看看