zoukankan      html  css  js  c++  java
  • 网易云热评

    import requests
    from lxml import etree
    from urllib import parse
    import re
    import json
    import threading
    from queue import Queue
    import hashlib
    
    
    class Spider(object):
        """Crawl NetEase Cloud Music playlists and save popular hot comments.

        The crawl is a four-stage pipeline connected by queues:

        1. ``parse`` walks the playlist index pages and queues playlist links.
        2. ``get_sings_html`` downloads each playlist page.
        3. ``get_sing_list`` extracts the song links from each playlist page.
        4. ``get_sing_html`` fetches every song's title, singer and first hot
           comment, and appends comments with more than ``MIN_LIKES`` likes
           to ``music.txt``.

        A private end-of-work marker is passed down the queues so that every
        stage stops once the stage before it has finished.
        """

        # Marker passed through the queues meaning "no more work will arrive".
        _DONE = object()
        # Seconds to wait for a single HTTP request before giving up.
        REQUEST_TIMEOUT = 10
        # A hot comment is only saved when it has more likes than this.
        MIN_LIKES = 10000

        def __init__(self):
            """Set up crawl state, request settings and the pipeline queues."""
            # MD5 digests of the records already written, used to skip duplicates.
            self.big_set = set()
            # Number of the next index page to be announced (page 1 is start_url).
            self.conut = 2
            self.domain = "https://music.163.com"
            # NOTE: this cookie expires; consider using requests.session() instead.
            self.cookies = "_ntes_nnid=2a370ddafbaa8c4a4918f335705a78f9,1576592837529; _iuqxldmzr_=32; WM_TID=oK8Bh118%2BE9FFQEFRFYo%2Bk2URh7F9sz3; MUSIC_FU=50760ce67efbcac315aaaeb73c474fc759a861d304e62c6b71f00e0133fb4d09; ntes_kaola_ad=1; WM_NI=mGol9ziQhFk5o2By%2FKYkXylqOvs2ogKPPx9JsfEMB0v6mAr1V4MafqejoO30tUC4G6m6JbXoC%2FabSzq9pYscHfhOZPJ4IZBJBBvL%2Beez5KSNGy3wmNxkDExsenURdhgcanE%3D; WM_NIKE=9ca17ae2e6ffcda170e2e6eed2c23a8f88a285bc72abac8bb2c55e939a8eaab77bab8ae596b66ef4a8a9d6e82af0fea7c3b92aaf99a18dbc72ab8fa8a7ea3fadee8e9aae419bb5989aef7b8daabdd3ee6d8990a099bc42b1ba8baec13cb6eaa3a2f13b96ec8382e57ba7b6a1b0b23cb88b8dd5c63eafea8683db6581f1af83d272b3e7c0baf2698cbaa8b0d95c8aec87bbc142829ebaadaa6fa7a6fbacd24685a7fd98f046bcb58ca6fb79a7b29bb3e764a5bdaf8fc837e2a3; JSESSIONID-WYYY=CTBkJVhsHud8lUA96HIGCd7zTEct9vpBt2tA%2Fny%5C%2FQE5hhUrEaVDig6PQ7bOJ%5C8ubN9XqkNsiWPs8r1viXclodB7tvP%2BFPKWvM1Dg4%2F%2Fux%2F5JnekJB2jNS7p0BceQWsCaBNqRcuWUzy0p%2BhZHCft2hq0%2FhIIBTFHTn9EutKWN049vfzF%3A1577467835043"
            # The start URL must not contain "#".
            self.start_url = "https://music.163.com/discover/playlist/"
            self.headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
            }
            self.sings_url_queue = Queue()  # lists of playlist links, one list per index page
            self.sings_html_queue = Queue()  # HTML text of each playlist page
            self.every_sing_url_queue = Queue()  # lists of song links, one list per playlist

        def get_cookies_self(self):
            """Return ``self.cookies`` parsed into a ``{name: value}`` dict.

            Whitespace around each ``name=value`` pair is stripped, only the
            first ``=`` separates name from value, and fragments without a
            name or without ``=`` are ignored.
            """
            cookies = {}
            for pair in self.cookies.split(";"):
                name, sep, value = pair.strip().partition("=")
                if name and sep:
                    cookies[name] = value
            return cookies

        def get_html(self, url):
            """GET ``url`` with the spider's headers and cookies; return the decoded body."""
            response = requests.get(
                url,
                headers=self.headers,
                cookies=self.get_cookies_self(),
                timeout=self.REQUEST_TIMEOUT,
            )
            return response.content.decode()

        def parse(self, url):
            """Queue the playlist links of every index page, starting at ``url``.

            Each page's links are put on ``sings_url_queue`` as one list. The
            "下一页" (next page) link is followed until it is missing or is the
            ``javascript:void(0)`` placeholder.
            """
            while True:
                html_elm = etree.HTML(text=self.get_html(url))
                # ============ step 1 =============
                url_list = html_elm.xpath("//ul[@id='m-pl-container']/li/div/a[1]/@href")
                self.sings_url_queue.put(url_list)

                next_links = html_elm.xpath("//a[text()='下一页']/@href")
                if not next_links or next_links[0] == "javascript:void(0)":
                    return
                # Build the absolute URL of the next index page.
                url = parse.urljoin(self.domain, parse.unquote(next_links[0]))
                print("第{}页完整歌单的url".format(self.conut), url)
                self.conut += 1

        def _produce_playlists(self):
            """Run ``parse`` from ``start_url`` and then signal the end of work."""
            try:
                self.parse(self.start_url)
            finally:
                # Always release the downstream stage, even if parsing failed.
                self.sings_url_queue.put(self._DONE)

        def get_sings_html(self):
            """Download every queued playlist page and queue its HTML.

            Runs until the end-of-work marker arrives on ``sings_url_queue``,
            then forwards the marker to ``sings_html_queue``.
            """
            try:
                # ============ step 2 =============
                while True:
                    url_list = self.sings_url_queue.get()
                    if url_list is self._DONE:
                        break
                    for href in url_list:
                        url = parse.urljoin(self.domain, href)
                        print("拼接的歌单url", url)
                        try:
                            html = self.get_html(url)
                        except (requests.RequestException, UnicodeDecodeError) as exc:
                            # One failed playlist must not stop the whole crawl.
                            print("请求失败,已跳过:", url, exc)
                            continue
                        # ============ step 3 =============
                        self.sings_html_queue.put(html)
            finally:
                self.sings_html_queue.put(self._DONE)

        def get_sing_list(self):
            """Extract the song links from every queued playlist page.

            Runs until the end-of-work marker arrives on ``sings_html_queue``,
            then forwards the marker to ``every_sing_url_queue``.
            """
            try:
                # ============ step 4 =============
                while True:
                    html = self.sings_html_queue.get()
                    if html is self._DONE:
                        break
                    html_elm = etree.HTML(text=html)
                    sing_list = html_elm.xpath("//div[@id='song-list-pre-cache']/ul/li/a/@href")
                    # ============ step 5 =============
                    self.every_sing_url_queue.put(sing_list)
            finally:
                self.every_sing_url_queue.put(self._DONE)

        def get_sing_html(self):
            """Fetch each queued song and save it if its hot comment is popular.

            Runs until the end-of-work marker arrives on
            ``every_sing_url_queue``. A record that was already written is
            reported as a duplicate instead of being saved again.
            """
            # ============ step 6 =============
            while True:
                song_hrefs = self.every_sing_url_queue.get()
                if song_hrefs is self._DONE:
                    break
                for href in song_hrefs:
                    try:
                        record = self._fetch_song_record(href)
                    except (requests.RequestException, ValueError) as exc:
                        # ValueError also covers a body that is not valid JSON/UTF-8.
                        print("请求失败,已跳过:", href, exc)
                        continue
                    if record is None:
                        continue
                    if self._is_new_record(record):
                        self.save_data(record)
                    else:
                        print("数据重复")

        @staticmethod
        def _extract_song_id(href):
            """Return the first run of digits in ``href``, or ``None`` if there is none."""
            ids = re.findall(r"\d+", href)
            return ids[0] if ids else None

        def _fetch_song_record(self, href):
            """Return ``[likes, song name, singer, comment]`` for one song link.

            Returns ``None`` when the link has no numeric id, when the song
            has no hot comment, or when the first hot comment does not have
            more than ``MIN_LIKES`` likes.
            """
            song_id = self._extract_song_id(href)
            if song_id is None:
                return None

            # Song title and singer come from the song page.
            html_elm = etree.HTML(text=self.get_html("https://music.163.com/song?id=" + song_id))
            names = html_elm.xpath("//div[@class='cnt']//em[@class='f-ff2']/text()")
            singers = html_elm.xpath("//div[@class='cnt']//span/@title")
            sing_name = names[0] if names else None
            sing_singer = singers[0] if singers else None

            # Comment text and like count come from the comments endpoint.
            comment_url = "https://music.163.com/weapi/v1/resource/comments/R_SO_4_" + song_id + "?csrf_token="
            data = {
                "params": "/SUbxIEON9B0tm/OT7p89/1dZ2wkhK+jogYGnLdYY1BWxdfpFo7YgxUBVxoCuh6P92GpQDRBu4EF0frSd1JG2hmTex36G2Qw77CC/6s3fa3facaX8A3CUpyUUNoK8h3fN8hIZwwrQuHFVuwXxeKeoA==",
                "encSecKey": "deba713b7bc36c398c8d9c99fa4f11a33b4beba35131db3df66188cca4bae0c8a7c4e390aeaacc1fe0b9d44baedc9c6289026e5fe2d8082a9ffab2e3eec34e5f8b2c53845ad593fdd9572fd9618a510461b05f4c49a169b3095dea055d40e365acae25313e044f3a28b341e7697f0222da29d3104ec76c0370eaffb3577e4b1b",
            }
            response_sing = requests.post(
                comment_url,
                headers=self.headers,
                cookies=self.get_cookies_self(),
                data=data,
                timeout=self.REQUEST_TIMEOUT,
            )
            comment_dict = json.loads(response_sing.content.decode())
            try:
                top_comment = comment_dict["hotComments"][0]
                comment = top_comment["content"]
                liked_count = int(top_comment["likedCount"])
            except (KeyError, IndexError, TypeError, ValueError):
                # No usable hot comment for this song.
                return None
            if liked_count <= self.MIN_LIKES:
                return None
            return [liked_count, sing_name, sing_singer, comment]

        def _is_new_record(self, record):
            """Return True and remember ``record`` if it has not been seen before.

            Records are identified by the MD5 digest of song name + singer +
            comment; missing (``None``) parts count as empty strings.
            """
            identity = "".join(str(part or "") for part in record[1:4])
            digest = hashlib.md5(identity.encode()).hexdigest()
            if digest in self.big_set:
                return False
            self.big_set.add(digest)
            return True

        def save_data(self, list_data):
            """Append one record to ``music.txt`` as a single line.

            ``list_data`` is ``[likes, song name, singer, comment]``. The like
            count is written first, as a plain number, so the file is easy to
            sort.
            """
            data1, data2, data3 = self.execute_str(list_data)
            line = "".join([
                str(list_data[0]) + "  ",
                "歌名:" + data1 + "    ",
                "歌手:" + data2 + "    ",
                "评论:" + data3 + "\n",
            ])
            with open("music.txt", "a", encoding="utf8") as f:
                f.write(line)
            print("写入成功!")

        def execute_str(self, list_data):
            """Return the cleaned (song name, singer, comment) of a record.

            Spaces in the song name and singer become underscores. Spaces,
            line feeds and carriage returns are removed from the comment so
            the record fits on one line. ``None`` fields become empty strings.
            """
            data1 = (list_data[1] or "").replace(" ", "_")
            data2 = (list_data[2] or "").replace(" ", "_")

            data3 = list_data[3] or ""
            for unwanted in ("\r", "\n", " "):
                data3 = data3.replace(unwanted, "")

            return data1, data2, data3

        def run(self):
            """Run the four pipeline stages in parallel threads until all finish."""
            stages = (
                self._produce_playlists,
                self.get_sings_html,
                self.get_sing_list,
                self.get_sing_html,
            )
            thread_list = [threading.Thread(target=stage) for stage in stages]
            print(thread_list)
            for t in thread_list:
                t.start()
            for t in thread_list:
                t.join()
    
    
    if __name__ == "__main__":
        # Script entry point: announce the start, run the crawl, announce the end.
        print("start...")
        Spider().run()
        print("...done")
    

      

  • 相关阅读:
    IE6中布局常见问题
    -bash: grunt-cli: command not found
    字符长度
    Mac下safari、chrome打开开发者工具快捷键
    line-height:150%和line-height:1.5的区别
    JavaScript中的apply()、call()、bind()
    JavaScript中的 this
    JavaScript中的var与作用域
    onload与ready的区别
    浏览器的同源策略
  • 原文地址:https://www.cnblogs.com/yzg-14/p/12291256.html
Copyright © 2011-2022 走看看