zoukankan      html  css  js  c++  java
  • 音悦台mv视频下载

    需要获取的页面:

    参考了此处,做了修改,代码如下:

      1 #coding:utf-8
      2 import urllib2
      3 import urllib
      4 import re
      5 import sys
      6 import os
      7 import time
      8 
      9 
     class Yinyuetai(object):
         """Crawl a yinyuetai.com MV listing and download every video found.

         Constructing an instance starts the crawl immediately: listing pages
         are fetched one after another (``&page=N`` is appended to ``url``)
         until a page yields no video links. Each video is saved as
         ``<title>.flv`` in a ``flv`` directory next to the running script.

         NOTE: this is Python 2 code (``urllib2`` / ``urllib.urlretrieve``).
         """

         def __init__(self, url):
             """Store the listing URL and request settings, then start crawling.

             :param url: listing URL; must already contain a query string,
                 because ``&page=N`` is appended to it verbatim.
             """
             self.i = 1  # running counter of processed videos (for display)
             self.url = url
             self.headers = {
                 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
                 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
             }
             self.timeout = 30  # seconds, passed to urllib2.urlopen
             self.__init()

         def __init(self, page=1):
             """Download every video from listing page ``page`` onwards.

             Iterates instead of recursing so a long listing cannot exhaust
             the recursion limit, and keeps the page number and the page HTML
             in separate variables (the number used to be overwritten by the
             HTML, which made ``page += 1`` raise ``TypeError``).

             :param page: 1-based number of the first listing page to fetch.
             """
             while True:
                 print(u"开始下载:第 %d 页 ..." % page)
                 reurl = self.url + "&page=%d" % page
                 html = self.getPage(reurl)
                 mvPageList = self.__getMvPageList(html)
                 if not mvPageList:
                     # An empty page (or a failed fetch) ends the crawl.
                     print(u"\n~~~~~~~~~~~完成!~~~~~~~~~~~~~~")
                     return
                 for plist in mvPageList:
                     mvlist = self.getMvURL(plist)
                     title = mvlist[1]
                     if isinstance(title, bytes):
                         # Page source arrives as UTF-8 bytes under Python 2.
                         title = title.decode("utf-8", "replace")
                     if mvlist[0] is None:
                         # No media URL could be extracted for this video.
                         print(u"下载失败!\n")
                     else:
                         self.downLoad(mvlist[0], title)
                     self.i += 1
                 time.sleep(2)  # pause between listing pages
                 page += 1

         def getPage(self, url):
             """Return the body of ``url``, or an empty string on any failure.

             An empty string (rather than a list) keeps the result usable by
             ``re.findall`` in the callers.

             :param url: absolute URL to fetch with ``self.headers``.
             :returns: the raw response body, or ``""`` if the request failed.
             """
             try:
                 request = urllib2.Request(url, None, self.headers)
                 response = urllib2.urlopen(request, None, self.timeout)
                 try:
                     return response.read()
                 finally:
                     response.close()
             except Exception:
                 # Best effort: a failed request is treated as an empty page.
                 return ""

         def __getMvPageList(self, page):
             """Extract ``(video_id, title)`` pairs from a listing page.

             :param page: HTML source of a listing page.
             :returns: list of ``(video_id, title)`` string tuples, in page
                 order; empty if nothing matches.
             """
             reg = r'<h3><a\shref="http://v\.yinyuetai\.com/video/([0-9]+)".*?title="([^"]*)"'
             return re.findall(reg, page)

         def getMvURL(self, mvlist):
             """Look up the media URL of one video.

             :param mvlist: ``(video_id, title)`` pair as produced by the
                 listing parser.
             :returns: ``[media_url, title]``; ``media_url`` is ``None`` when
                 the info page contains no recognisable media URL.
             """
             url = "http://www.yinyuetai.com/insite/get-video-info?flex=true&videoId=%d" % int(mvlist[0])
             html = self.getPage(url)

             reg = r"http://\w*?\.yinyuetai\.com/uploads/videos/common/.*?(?=&br)"
             findList = re.findall(reg, html)

             if not findList:
                 return [None, mvlist[1]]
             # When at least three variants are listed the third one is used
             # (presumably the highest quality -- TODO confirm); otherwise
             # fall back to the first.
             if len(findList) >= 3:
                 return [findList[2], mvlist[1]]
             return [findList[0], mvlist[1]]

         def downLoad(self, url, name):
             """Download ``url`` into the ``flv`` directory as ``<name>.flv``.

             Characters that are not allowed in file names are replaced with
             ``_`` so a title containing e.g. ``/`` cannot break the path.
             Failures are reported on stdout and otherwise ignored so the
             crawl continues with the next video.

             :param url: media URL to download.
             :param name: video title, used as the file name stem.
             """
             name = name + '.flv'
             print(u"下载:[%s] [%d]" % (name, self.i))
             safeName = re.sub(r'[\\/:*?"<>|]', '_', name)
             local = os.path.join(self.makeDirs(), safeName)
             try:
                 urllib.urlretrieve(url, local, self.schedule)
             except Exception:
                 print(u"下载失败!\n")
             else:
                 print(u"下载完成:[%s]\n" % name)

         def makeDirs(self):
             """Return the ``flv`` directory under ``sys.path[0]``, creating it if needed."""
             path = sys.path[0]
             newPath = os.path.join(path, 'flv')
             if not os.path.isdir(newPath):
                 os.mkdir(newPath)
             return newPath

         def schedule(self, a, b, c):
             """Progress callback for ``urllib.urlretrieve``.

             :param a: number of data blocks downloaded so far.
             :param b: size of one data block.
             :param c: size of the remote file; when it is not positive
                 (size unknown) no percentage can be computed and nothing
                 is printed.
             """
             if c <= 0:
                 return
             per = min(100.0, 100.0 * a * b / c)
             # Carriage return keeps the progress on a single console line.
             sys.stdout.write(u" 进度:%.1f%%\r" % per)
             sys.stdout.flush()
    106 
    107 
    108 
    109 
    if __name__ == '__main__':
        # Script entry point: constructing the crawler starts the download
        # of the weekly-views MV listing right away.
        Yinyuetai('http://mv.yinyuetai.com/all?pageType=page&sort=weekViews&tab=allmv&parenttab=mv')
  • 相关阅读:
    洛谷P2146 [NOI2015]软件包管理器
    洛谷P3038 [USACO11DEC]牧草种植Grass Planting
    洛谷P2831 愤怒的小鸟
    洛谷P1084 疫情控制
    洛谷P3258 [JLOI]2014松鼠的新家
    洛谷P1084 运输计划
    洛谷P2051 [AHOI2009]中国象棋
    洛谷P1438 无聊的数列
    洛谷P1312 Mayan游戏
    luogu P1038 神经网络
  • 原文地址:https://www.cnblogs.com/nju2014/p/4471296.html
Copyright © 2011-2022 走看看