zoukankan      html  css  js  c++  java
  • Python练习:优酷评论过滤(抓取当前视频全部评论,并过滤不包括所需关键词的留言)

     1 # coding:utf-8
     2 print('正在初始化...')
     3 import requests
     4 import re
     5 from lxml.html import fromstring
     6 import pyautogui
     7 import sys
     8 import os
     9 from multiprocessing.dummy import Pool
    10 from lxml import _elementpath
    11 
    12 
    13 class Youku_comment:
    14 
    15     """docstring for Youku_comment"""
    16 
    17     def __init__(self, raw_url):
    18 
    19         self.pid = re.findall('/id_(.*?).html', raw_url)[0]
    20         r1 = requests.get(raw_url)
    21         title = re.findall('<title>(.*?)</title>', r1.text)[0]
    22         title = re.sub('W', '', title).replace('在线播放优酷网视频高清在线观看', '')
    23         totalpn = self.get_totalpn(self.pid)
    24         print('视频ID:%s' % self.pid, '
    视频标题:%s' %
    25               title, '
    总页码数:%s
    正在抓取...' % totalpn)
    26 
    27         pp = Pool(30)
    28         pagenums = range(1, totalpn + 1)
    29         result = pp.map(self.get_comment, pagenums)
    30         pp.close()
    31         pp.join()
    32         result = [i for i in result if i]
    33         self.aa = sum(result, [])
    34         # print(jieguo)
    35         # with open('%s.csv' % title, 'w', encoding='gbk') as f:
    36         #     f.write(jieguo.encode('gbk', 'ignore').decode('gbk'))
    37 
    38     def get_totalpn(self, pid):
    39         r = requests.get(
    40             'http://comments.youku.com/comments/~ajax/vpcommentContent.html?__ap={"videoid":"%s","page":1}' % pid)
    41         totalpn = (int(r.json()['totalSize'].replace(',', '')) // 30) + 1
    42         return totalpn
    43 
    44     def get_comment(self, pagenum):
    45         for _ in range(5):
    46             try:
    47                 r = requests.get(
    48                     'http://comments.youku.com/comments/~ajax/vpcommentContent.html?__ap={"videoid":"%s","page":%s}' % (self.pid, pagenum), timeout=3)
    49                 sjson = r.json()
    50                 scode = sjson['con']
    51                 ss = re.findall('<p id=".*?">.*?</p>', scode, flags=re.S)
    52                 ss = [re.sub('<.*?>', '', i) for i in ss]
    53                 if ss:
    54                     return ss
    55             except:
    56                 pass
    57 
    58 
    59 def filt1(str1, kws):
    60     kws = kws.split(' ') if kws else 'OST 背景 音乐 旋律 歌曲 调子 music 耳熟 BGM 谁唱的 来自 出自 原声'.split(
    61         ' ')
    62     for i in kws:
    63         if i in str1:
    64             return str1
    65 
    66 
    67 def quchong(ll):
    68     ss = ''
    69     for i in ll:
    70         if i in ss:
    71             continue
    72         else:
    73             ss = ss + '
    ' + i
    74     return ss
    75 
    76 while 1:
    77     try:
    78         url = pyautogui.prompt('请输入网址:')
    79         if not url:
    80             break
    81         tt = Youku_comment(url)
    82         pinglun = tt.aa
    83         while 1:
    84             kws = pyautogui.prompt('请输入关键词,多个请用空格隔开(直接回车则代表找背景音乐):')
    85             kws = kws if kws else 0
    86             ss = [filt1(i, kws) for i in pinglun]
    87             ss = [i for i in ss if i]
    88             ss = quchong(ss)
    89             print('检索结果:
    ')
    90             print(ss)
    91             jixu = pyautogui.confirm(
    92                 text='是否要继续检索', title='请确认', buttons=['', ''])
    93             if jixu == '':
    94                 break
    95 
    96     except Exception as e:
    97         print(e)
    98         print('错误,请重试')
    99 os.system('pause')


    Windows已编译可执行文件: http://pan.baidu.com/s/1bn0jLmf

  • 相关阅读:
    mysql千万级数据量根据索引优化查询速度
    Centos 7 开放查看端口 防火墙关闭打开
    Spring-Quartz 配置执行时间参数
    Curl命令查看请求响应时间
    Centos 7 安装 Mongodb
    配置Docker加速器
    TCP端口状态说明ESTABLISHED、TIME_WAIT
    Maven依赖项Scope属性设置含义
    实战JAVA虚拟机 JVM故障诊断与性能优化(十)---字节码的执行
    代理模式(Proxy)_java实现
  • 原文地址:https://www.cnblogs.com/pyld/p/4732311.html
Copyright © 2011-2022 走看看