zoukankan      html  css  js  c++  java
  • python爬虫--爬取cctv连续剧

      1 #encoding=utf-8
      2 import requests
      3 from bs4 import BeautifulSoup
      4 import re
      5 import os
      6 from aria2rpc import rpc_addUri
      7 class Cntv():
      8 
      9     def openUrl(self,url):
     10         """
     11         This method is used to open a web site
     12         :param url:Web site to request
     13         :return:Requested object
     14         """
     15         header = {
     16             "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
     17         }
     18         response = requests.get(url, header)
     19         return response
     20         # pass
     21     def getEachEpisodeUrl(self):
     22         """
     23         Get the address of each episode of the TV play
     24         :return:urls lists
     25         """
     26         urls = []
     27         # response = requests.get(self.url)
     28         url = "http://tv.cctv.com/2014/07/07/VIDA1404730290373811.shtml"
     29         response = self.openUrl(url)
     30         html = response.content.decode('utf-8')
     31         soup = BeautifulSoup(html,'html.parser')
     32         title = soup.select(".text_mod h3")
     33         print(title[0].text)
     34         episodes = soup.select('.img a')
     35         # print(episodes)
     36         for each in range(1,len(episodes),3):
     37             print(episodes[each]['title'],"link:"+episodes[each]['href'])
     38             urls.append(episodes[each]['href'])
     39         print("Get Each Episode Url Come Over !!!")
     40         return urls
     41     def getEachDLUrl(self):
     42         urls = self.getEachEpisodeUrl()
     43         links = []
     44         for num,url in enumerate(urls):
     45             response = self.openUrl(url)
     46             html = response.text
     47             # soup = BeautifulSoup(html, 'html.parser')
     48             match = re.search(r'guid = "(w+?)";', html)
     49             pid = match.group(1)
     50             # print(pid)
     51             link = "http://vdn.apps.cntv.cn/api/getHttpVideoInfo.do?pid=%s&tz=%s&from=%s&url=%s&idl=%s&idlr=%s&modifyed=%s" %(pid,'-8','000news',url,'32','32','false')
     52             links.append(link)
     53             print("获取第%d集" %(num))
     54             # print(urls)
     55         return links
     56     def getDLList(self):
     57         """
     58         Get the download address for each episode of the TV play
     59         :return:ownload address list
     60         """
     61         links = self.getEachDLUrl()
     62         # links = ["http://vdn.apps.cntv.cn/api/getHttpVideoInfo.do?pid=59381a0e55404cf5b101f7d3bcad2da8&tz=-8&from=000news&url=http://tv.cctv.com/2014/07/15/VIDE1405435161521590.shtml&idl=32&idlr=32&modifyed=false"]
     63         dl_urls = []
     64         for link in links:
     65             dl_url = []
     66             response = self.openUrl(link)
     67             # html = response.content.decode('utf-8')
     68             dl_list = response.json()['video']['chapters4']
     69             for each in range(len(dl_list)):
     70                 downloadurl = dl_list[each]['url']
     71                 dl_url.append(downloadurl)
     72                 print(downloadurl)
     73             dl_urls.append(dl_url)
     74         return dl_urls
     75     def _add_aria2_task(self, url, name):
     76         """
     77         :param url:download url
     78         :param name:dowmload tv name
     79         :return:
     80         """
     81         try:
     82             result = rpc_addUri(url, {'out': name})
     83             return result
     84         except Exception as e:
     85             print(e)
     86             return None
     87 
     88 
     89 # response.json()['video']['lowChapters'][0]['url']
     90 # response.json()['video']['chapters4'][0]['url']
     91 """    
     92     def dlTv(self):
     93       
     94         dl_urls_list = self.getDLList()
     95         if os.path.exists("tv_list") == False:
     96             os.mkdir("tv_list")
     97         os.chdir("tv_list")
     98         for dl_urls in dl_urls_list:
     99             for dl_url in dl_urls:
    100                 print("download" + dl_url)
    101                 # response = self.openUrl(dl_url)
    102                 # with open("first.mp4",'ab') as tl:
    103                 #     tl.write(response.content)
    104             print("-"*20)
    105 """
    106 if __name__ == "__main__":
    107     cm = Cntv()
    108     # cm.getUrl()
    109     # cm.openUrl()
    110 
    111     lists = cm.getDLList()
    112     for num,list in enumerate(lists):
    113         for i,url in enumerate(list):
    114             cm._add_aria2_task(url, str(num+1)+'_'+str(i+1)+'.mp4')
  • 相关阅读:
    【转】ORACLE日期时间 等函数大全
    list_car()函数小记
    git代码提交流程
    windows连接ubuntu服务器方式
    win10专业版安装docker实战
    selenium来识别数字验证码
    web服务器、WSGI跟Flask(等框架)之间的关系
    pymysql的使用
    sql常用 语句总结
    sql语句insert into where 错误解析
  • 原文地址:https://www.cnblogs.com/royfans/p/7573135.html
Copyright © 2011-2022 走看看