zoukankan      html  css  js  c++  java
  • 爬虫-网易云评论

    爬虫-网易云评论

    # post请求体    
    params: HMtP7KwWWgctb71g3T8v7b5SzlO1qN5JDI6WC8AqPYoakAYrpw1hm99wsn0Hp6AfP1ZNlp494Z+4XGXKiYyEXYTSoHvYTVhYpgDxUuSBdgNcZE0IXkkoA5YUEnQf2ESWO3bmt09k2ogKLOoQNWxEnXRewB0Oy2lPEdo52CVVNkUTMMd/gVPq4Zhj4LUvyjDh
    encSecKey: 83e7a7f8bf53186b5c224d2732d86fb41a6366b8fb3c61b7dd4e630f6c5199e5c98732ab6fef399a8b4d08ece5a338e132c7cbc4a86a7f2d8c768431b408671acac04d05010406784afad5c36a904a784478bbc5a1fb29e46df26dc49fea70e6015d1a5409dec5a2f1bc0c997ffc3642177034138d7c2b9c872b35b81e95da7d
        
    # js文件中
    // Call site in the page's script: window.asrsea receives the JSON-serialised
    // request object (i8a) plus three values produced by bvc9T(...), and returns
    // an object carrying `encText` and `encSecKey`.
    var bLq2x = window.asrsea(JSON.stringify(i8a), bvc9T(["流泪", "强"]), bvc9T(TQ2x.md), bvc9T(["爱心", "女孩", "惊恐", "大笑"]));
                // The two fields become the `params` and `encSecKey` entries of the request data.
                e8e.data = k8c.cy9p({
                    params: bLq2x.encText,
                    encSecKey: bLq2x.encSecKey
                })
        
    # 进一步找
    function() {
        function a(a) {
            // Return a string of `a` characters, each drawn at random from the
            // 62-character alphanumeric alphabet below.
            var alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
            var out = "";
            var i, pick;
            for (i = 0; i < a; i += 1) {
                pick = Math.floor(Math.random() * alphabet.length);
                out += alphabet.charAt(pick);
            }
            return out
        }
        // AES加密
        function b(a, b) {
            // AES-encrypt text `a` under key `b` in CBC mode with a fixed IV and
            // return the string form of the CryptoJS result.
            var keyWords = CryptoJS.enc.Utf8.parse(b);
            var ivWords = CryptoJS.enc.Utf8.parse("0102030405060708");
            var textWords = CryptoJS.enc.Utf8.parse(a);
            var options = {
                iv: ivWords,
                mode: CryptoJS.mode.CBC
            };
            var encrypted = CryptoJS.AES.encrypt(textWords, keyWords, options);
            return encrypted.toString()
        }
        // RSA加密
        function c(a, b, c) {
            // Encrypt `a` with an RSAKeyPair built from `b` and `c`.
            var keyPair, encrypted;
            setMaxDigits(131);  // original note: number of hex digits of n
            keyPair = new RSAKeyPair(b, "", c);  // key pair built from (b, "", c)
            encrypted = encryptedString(keyPair, a);  // encrypted form of `a`
            return encrypted
        }
        // 得到加密后的结果
        function d(d, e, f, g) {
            // Build the encrypted request fields.
            //   d: text to encrypt; g: key for the first AES pass;
            //   e, f: values handed to c() together with the random key.
            // Returns an object with `encText` and `encSecKey`.
            var h = {}
              , i = a(16);  // fresh 16-character random key for the second AES pass
            return h.encText = b(d, g),  // first AES pass (msg, key)
            h.encText = b(h.encText, i),  // second AES pass, keyed by the random string
            h.encSecKey = c(i, e, f),  // RSA-encrypt the random key
            h
        }
        
        function e(a, b, d, e) {
            // Return an object whose `encText` is c() applied to the
            // concatenation a + e, with `b` and `d` as c()'s other arguments.
            var result = {};
            result.encText = c(a + e, b, d);
            return result
        }
        
    d = '{"csrf_token":""}', e = "010001", f = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7", g = "0CoJUm6Qyw8W8jud"
    

    python 代码

    '''爬取指定歌曲的评论信息
    
    点入歌曲详情页面,通过以下链接取出评论
    /weapi/v1/resource/comments/R_SO_4_254574?csrf_token= HTTP/1.1
    '''
    import base64
    import random
    from math import floor, ceil
    from multiprocessing import Pool
    import jieba
    from Crypto.Cipher import AES
    import codecs
    import requests
    from wordcloud import WordCloud
    class CommentSpider(object):
        """Download the comments of one NetEase Cloud Music song and draw a word cloud.

        The comment endpoint expects two form fields, ``params`` and
        ``encSecKey``. They are produced the same way the site's JavaScript
        does it: the request text is AES-CBC encrypted twice (first with a
        fixed key, then with a random 16-character key) and the random key is
        RSA-encrypted with a fixed exponent and modulus.
        """

        # Alphabet the random AES key is drawn from.
        _ALPHABET = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
        # Key of the first AES pass and the IV shared by both passes.
        _PRESET_KEY = '0CoJUm6Qyw8W8jud'
        _AES_IV = '0102030405060708'
        # RSA exponent and modulus, as hexadecimal strings.
        _RSA_EXPONENT = '010001'
        _RSA_MODULUS = ('00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a87'
                        '6aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9'
                        'd05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b'
                        '8e289dc6935b3ece0462db0a22b8e7')
        # Comments requested per page; must agree with the "limit" in the payload.
        _PAGE_SIZE = 20
        # Seconds to wait for the server before giving up on a request.
        _TIMEOUT = 10

        def __init__(self, song_name, song_id):
            """Remember the song and prepare the request headers.

            :param song_name: used as the base name of the output .txt/.png files
            :param song_id: NetEase song id, inserted into the request URL
            """
            self.song_name = song_name
            self.song_id = song_id
            self.headers = {'Host': 'music.163.com',
                            'Referer': 'http://music.163.com/',
                            'Upgrade-Insecure-Requests': '1',
                            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                                          'Chrome/66.0.3359.181 Safari/537.36'
                            }

        def generate_random_string(self, length):
            """Return ``length`` characters picked at random from the alphanumeric alphabet."""
            return ''.join(random.choice(self._ALPHABET) for _ in range(length))

        @staticmethod
        def _pkcs7_pad(data, block_size=16):
            """Pad ``data`` (bytes) to a multiple of ``block_size``, PKCS#7 style.

            A full extra block is appended when the input is already aligned.
            """
            pad_len = block_size - len(data) % block_size
            return data + bytes([pad_len]) * pad_len

        def aes_encrypt(self, msg, key):
            """AES-CBC encrypt ``msg`` with ``key`` and return the Base64 text.

            :param msg: plaintext string
            :param key: key string (16 characters for the keys used here)
            :return: Base64-encoded ciphertext as ``str``
            """
            # Pad the encoded bytes rather than the characters, so that
            # non-ASCII text still ends up block-aligned.
            data = self._pkcs7_pad(msg.encode('utf8'))
            cipher = AES.new(key.encode('utf8'), AES.MODE_CBC, self._AES_IV.encode('utf8'))
            return base64.b64encode(cipher.encrypt(data)).decode('utf8')

        def rsa_encrypt(self, random_string, key, f):
            """RSA-encrypt ``random_string`` and return 256 hexadecimal digits.

            :param random_string: text to encrypt; it is reversed first
            :param key: exponent as a hexadecimal string
            :param f: modulus as a hexadecimal string
            :return: lower-case hexadecimal result, left-padded with zeros to 256 characters
            """
            reversed_bytes = random_string[::-1].encode('utf-8')
            plain_int = int.from_bytes(reversed_bytes, 'big')
            # Three-argument pow() reduces modulo f at every step instead of
            # first building the full power.
            sec_key = pow(plain_int, int(key, 16), int(f, 16))
            return format(sec_key, 'x').zfill(256)

        def get_params(self, page):
            """Build the encrypted form fields for the given 1-based ``page``.

            :return: tuple ``(encText, encSecKey)``
            """
            offset = (page - 1) * self._PAGE_SIZE
            # The request text was found by setting a breakpoint in the site's d() function.
            msg = '{rid:"", offset:"%s", total:"%s", limit:"20", csrf_token:""}' % (offset, 'false')
            # Random 16-character key for the second AES pass.
            str_16 = self.generate_random_string(16)
            # First AES pass with the fixed key, second with the random key.
            first_aes = self.aes_encrypt(msg, self._PRESET_KEY)
            encText = self.aes_encrypt(first_aes, str_16)
            # The random key itself travels RSA-encrypted as encSecKey.
            encSecKey = self.rsa_encrypt(str_16, self._RSA_EXPONENT, self._RSA_MODULUS)
            return encText, encSecKey

        @staticmethod
        def _page_count(total, page_size=20):
            """Return how many pages of ``page_size`` items are needed for ``total`` items."""
            return max(0, ceil(total / page_size))

        def _request_page(self, page):
            """POST the encrypted request for ``page`` and return the response."""
            url = 'https://music.163.com/weapi/v1/resource/comments/R_SO_4_%s?csrf_token=' % self.song_id
            params, encSecKey = self.get_params(page)
            # Certificate verification stays enabled (the requests default) and a
            # timeout keeps a stalled connection from blocking a worker forever.
            return requests.post(url, data={'params': params, 'encSecKey': encSecKey},
                                 headers=self.headers, timeout=self._TIMEOUT)

        def get_comment(self, song_data):  # (song_name, id, page)
            """Fetch one page of comments and append them to ``<song_name>.txt``.

            :param song_data: tuple ``(song_name, song_id, page)``; the name
                selects the output file and the page selects what to fetch
            """
            page = song_data[2]
            try:
                res = self._request_page(page)
            except requests.RequestException:
                # A network failure only costs this page, not the whole pool.
                print('爬取第%s页失败'%song_data[2])
                return
            if res.status_code == 200:
                print('正在爬取第%s页的内容'%song_data[2])
                comments = res.json().get('comments') or []
                with open(song_data[0] + '.txt','a',encoding='utf-8') as f:
                    f.writelines(item['content'] + '\n' for item in comments)
            else:
                print('爬取第%s页失败'%song_data[2])

        def make_wordcloud(self, file_name):
            """Render ``<file_name>.txt`` as a word cloud saved to ``<file_name>.png``."""
            with open('%s.txt'%file_name,'r',encoding='utf-8') as f:
                txt = f.read()
            # Segment with jieba and join with spaces, giving WordCloud
            # whitespace-separated tokens to count.
            text = ' '.join(jieba.cut(txt))
            wc = WordCloud(
                font_path="simhei.ttf",  # must be a font installed locally, e.g. under C:\Windows\Fonts
                width=1200,
                height=800,
                max_words=100,
                max_font_size=200,
                min_font_size=10
            )
            wc.generate(text)
            wc.to_file(file_name + '.png')

        def run(self):
            """Find the number of pages, fetch them with a process pool, then draw the word cloud."""
            res = self._request_page(1)
            res.raise_for_status()
            page_count = self._page_count(res.json()['total'], self._PAGE_SIZE)
            song_data = [(self.song_name, self.song_id, page) for page in range(1, page_count + 1)]
            # The context manager shuts the worker processes down once map() returns.
            with Pool(processes=4) as pool:
                pool.map(self.get_comment, song_data)

            # Every page has been written to the file by now.
            self.make_wordcloud(self.song_name)
    
    
    if __name__ == '__main__':
        # Crawl the comments of one hard-coded song and build its word cloud.
        spider = CommentSpider('太多', '1339315554')
        spider.run()
    
    
    
  • 相关阅读:
    使用虚拟环境virtualenv/Virtualenvwrapper隔离多个python
    计算机硬件基本知识及Linux的常用命令
    网络电子时钟系统案例
    地铁时钟系统介绍
    北斗校时服务器装置介绍
    网络电子时钟系统成功案例
    高精度统一时钟基准特点
    IEEE1588 PTP对时系统原理及特点
    GPS轨迹发生模拟器介绍
    python urllib模块
  • 原文地址:https://www.cnblogs.com/Afrafre/p/11693784.html
Copyright © 2011-2022 走看看