zoukankan      html  css  js  c++  java
  • python 电影下载链接爬虫

    V1.0

    功能:从几个比较知名的电影下载网站爬取下载链接,并自动打印出来。

    代码:

    # -*- coding: utf8 -*-
    from bs4 import BeautifulSoup
    import requests, lxml
    from urllib.parse import quote
    import re
    
    
    def get_name():
        """Prompt for movie titles in a loop and search every supported site.

        Each title is passed to get_url_from_ygdy and get_url_from_dytt in its
        GB2312 percent-encoded form, and to get_url_from_bttiantang as raw
        text. The loop ends when standard input is closed (EOF).
        """
        while True:
            try:
                # The line break in the prompt must be the escape sequence
                # "\n"; a literal newline inside the quotes is a syntax error.
                moviename = input('请输入要查找的电影名\n->')
            except EOFError:
                # No more input can arrive, so stop instead of crashing.
                break
            moviename_quote = quote(moviename.encode('gb2312'))
            get_url_from_ygdy(moviename_quote)
            get_url_from_bttiantang(moviename)
            get_url_from_dytt(moviename_quote)
    
    
    def get_url_from_ygdy(moviename):
        """Search s.dydytt.net for a movie and print the links of every hit.

        Args:
            moviename: Search keyword; it is appended to the search URL
                unchanged (get_name passes it already percent-encoded).

        A "not found" message is printed when the first result page contains
        no result cells. When the pager exposes a parsable last-page number,
        every page from 1 to that number is fetched and printed; otherwise
        only the results of the first page are printed.
        """
        baseurl = 'http://s.dydytt.net/plus/search.php?kwtype=0&keyword='
        url = baseurl + str(moviename)
        content = BeautifulSoup(requests.get(url).content.decode('gb2312', 'ignore'), 'lxml')
        pager_cells = content.find_all('td', width="30")
        movie_infos = content.find_all('td', width="55%")
        if not movie_infos:
            print('查无此电影,请检查后重试')
            return
        print('阳光电影搜索结果:')
        # The second pager cell is read as the "last page" link and its PageNo
        # value as the page count. A missing cell, anchor, href or number is
        # treated as "no pager" rather than raising.
        page_count = None
        if len(pager_cells) > 1:
            last_link = pager_cells[1].find('a')
            href = last_link.get('href') if last_link is not None else None
            found = re.search(r'PageNo=(\d+)', href or '')
            if found:
                page_count = int(found.group(1))
        if page_count is None:
            for movie_info in movie_infos:
                get_info(movie_info, moviename)
            return
        for page_no in range(1, page_count + 1):
            print('', page_no, '页:')
            page_url = url + '&PageNo=' + str(page_no)
            pagecontent = BeautifulSoup(requests.get(page_url).content.decode('gb2312', 'ignore'), 'lxml')
            for movie_info in pagecontent.find_all('td', width='55%'):
                get_info(movie_info, moviename)
    
    
    def get_info(movie_info, name):
        """Print the title and download links of one ygdy search result.

        Args:
            movie_info: Result cell; its first <a> supplies the detail-page
                path and its text is printed as the title.
            name: Search keyword, either plain text or GB2312 percent-encoded.

        Results whose title contains '游戏' are skipped unless the keyword
        itself contains '游戏'.
        """
        from urllib.parse import unquote
        # get_name passes the percent-encoded keyword, which can never contain
        # the literal characters '游戏'; decode it so the filter below works.
        # Plain (non-encoded) keywords pass through unquote unchanged.
        keyword = unquote(str(name), encoding='gb2312')
        anchor = movie_info.find('a')
        if anchor is None:
            # Without a link there is no detail page to fetch.
            return
        movie_url = anchor.get('href')
        moviename = movie_info.text
        if '游戏' not in keyword and '游戏' in moviename:
            return
        print('电影名:', moviename)
        url = 'http://www.ygdy8.com' + movie_url
        info = BeautifulSoup(requests.get(url).content.decode('gbk', 'ignore'), 'lxml')
        download = info.find_all('td', style="WORD-WRAP: break-word")
        print('下载链接:')
        # Cells without an <a> carry no link; skipping them avoids an
        # AttributeError on None.
        link_tags = (cell.find('a') for cell in download)
        links = [tag.string for tag in link_tags if tag is not None]
        if len(links) == 1:
            print(links[0])
        else:
            for number, link in enumerate(links, start=1):
                print('链接', number, ':', link)
        # Emit a blank separator; the escape must stay "\n" inside the quotes.
        print('\n')
    
    
    def get_url_from_bttiantang(moviename):
        """Search www.bttiantang.com for a movie and print each hit's links.

        Args:
            moviename: Search keyword, appended to the query URL as given.

        Prints a "not found" message when the first page has no result
        entries. When the page-count marker is found in the HTML, pages
        1..count are fetched one by one; otherwise only the entries of the
        first page are processed.
        """
        search_url = 'http://www.bttiantang.com/s.php?q=' + str(moviename)
        html = requests.get(search_url).content.decode('utf8', 'ignore')
        page_counts = re.compile('</b>条<b>(.*?)</b>').findall(html)
        entries = BeautifulSoup(html, 'lxml').find_all('p', class_="tt cl")
        if len(entries) == 0:
            print('查无此电影,请检查后重试')
            return
        print('BT天堂搜索结果:')
        if len(page_counts) == 0:
            # No page-count marker: only the first page's entries are used.
            for entry in entries:
                get_movieinfo(entry, moviename)
            return
        total_pages = int(page_counts[0])
        for page_no in range(1, total_pages + 1):
            print('', page_no, '页:')
            paged_url = search_url + '&PageNo=' + str(page_no)
            paged_html = requests.get(paged_url).content.decode('utf8', 'ignore')
            for entry in BeautifulSoup(paged_html, 'lxml').find_all('p', class_="tt cl"):
                get_movieinfo(entry, moviename)
    
    
    def get_movieinfo(movie_content, name):
        """Print the title and download links of one bttiantang result.

        Args:
            movie_content: Result entry; its first <a> gives the detail-page
                path and its text is printed as the title.
            name: Search keyword used for the '游戏' filter.

        Entries whose title contains '游戏' are skipped unless the keyword
        contains it as well.
        """
        detail_url = 'http://www.bttiantang.com/' + movie_content.find('a').get('href')
        title = movie_content.text
        if '游戏' not in name and '游戏' in title:
            return
        print('电影名:', title)
        detail_html = requests.get(detail_url).content.decode('utf8', 'ignore')
        link_blocks = BeautifulSoup(detail_html, 'lxml').find_all('div', class_='tinfo')
        print('下载链接:')
        # Each 'tinfo' block contributes one numbered link, counted from 1.
        for number, block in enumerate(link_blocks, start=1):
            print('链接' + str(number) + ':')
            print('http://www.bttiantang.com' + block.find('a').get('href'))
    
    
    def get_url_from_dytt(moviename):
        """Search www.dytt.com for a movie and print each hit's links.

        Args:
            moviename: Search keyword; it is appended to the search URL
                unchanged (get_name passes it already percent-encoded).

        The first 'p.s1' element of every page is always skipped
        (presumably a header row - confirm against the live markup), so a
        page with at most one such element is reported as "not found".
        When a page number is found after the '下一页' marker, pages
        1..number are fetched; otherwise only the first page is processed.
        """
        baseurl = 'http://www.dytt.com/search.asp?searchword=' + str(moviename)
        content = requests.get(baseurl).content.decode('gbk', 'ignore')
        result = re.findall('下一页.*?href.*?page=(.*?)&', content)
        items = BeautifulSoup(content, 'lxml').find_all('p', class_='s1')
        # "<= 1" also covers a page with no 's1' element at all, which would
        # otherwise print the results header followed by nothing.
        if len(items) <= 1:
            print('查无此电影,请检查后重试')
            return
        print('电影淘淘搜索结果:')
        if not result:
            for item in items[1:]:
                get_movieinfo_from_dytt(item, moviename)
            return
        for page_no in range(1, int(result[0]) + 1):
            print('', page_no, '页:')
            url = baseurl + '&page=' + str(page_no)
            page_content = BeautifulSoup(requests.get(url).content.decode('gbk', 'ignore'), 'lxml')
            # A separate loop variable keeps the page index from being
            # overwritten by the item loop.
            for item in page_content.find_all('p', class_='s1')[1:]:
                get_movieinfo_from_dytt(item, moviename)
    
    
    def get_movieinfo_from_dytt(item, name):
        """Print the title and download links of one dytt search result.

        Args:
            item: Result element; its first <a> supplies both the title text
                and the detail-page path.
            name: Search keyword, either plain text or GB2312 percent-encoded.

        Results whose title contains '游戏' are skipped unless the keyword
        contains it too. Link kinds are tried in order (ed2k, http torrent,
        ftp mkv) and only the first kind that yields matches is printed.
        """
        from urllib.parse import unquote
        anchor = item.find('a')
        moviename = anchor.text
        movieurl = 'http://www.dytt.com' + anchor.get('href')
        # get_name passes the percent-encoded keyword, which can never contain
        # the literal characters '游戏'; decode it so the filter below works.
        keyword = unquote(str(name), encoding='gb2312')
        if '游戏' not in keyword and '游戏' in moviename:
            return
        print('电影名:', moviename)
        pagecontent = requests.get(movieurl).content.decode('gbk', 'ignore')
        print('下载链接:')
        # (pattern, prefix, suffix): the captured middle part is re-wrapped in
        # prefix/suffix when printed. The '|' characters of an ed2k link must
        # be escaped; unescaped they act as regex alternation and match every
        # '/' in the page with an empty capture.
        link_formats = (
            (r'ed2k://\|file\|(.*?)\|/', 'ed2k://|file|', '|/'),
            (r'http:(.*?)torrent', 'http:', 'torrent'),
            (r'ftp:(.*?)mkv', 'ftp:', 'mkv'),
        )
        for pattern, prefix, suffix in link_formats:
            links = re.findall(pattern, pagecontent)
            if not links:
                continue
            for number, link in enumerate(links, start=1):
                print('链接' + str(number) + ':', prefix + link + suffix)
            break
    
    
    # Start the interactive search loop only when run as a script, not on import.
    if __name__ == "__main__":
        get_name()

     运行结果:

  • 相关阅读:
    eclipse常用快捷键
    .net操作Excel快速
    treeview使用sort以后取消排序
    行转列,参数是文本类型
    easyui分页控件汉化扩展
    在子页面中获取父frameset中元素
    easyui-datagrid自定义分页控件样式
    字符串和图片转换
    VS2012设置随笔
    DevExpress之GridControl
  • 原文地址:https://www.cnblogs.com/INnoVationv2/p/5846094.html
Copyright © 2011-2022 走看看