V1.0
功能:从比较知名的几个电影下载网站爬取下载链接,并自动打印出来。
代码:
# -*- coding: utf8 -*-
"""Interactive scraper that prints movie download links from three sites.

The user is prompted for a movie title; the title is then searched on the
ygdy (s.dydytt.net / www.ygdy8.com), bttiantang and dytt sites and every
download link found on the result pages is printed to stdout.
"""
import re
from urllib.parse import quote, unquote

import lxml  # noqa: F401 -- not referenced directly; BeautifulSoup is asked for the 'lxml' parser by name
import requests
from bs4 import BeautifulSoup

# Seconds to wait for a server before giving up; without a timeout
# requests.get() can block forever on an unresponsive host.
REQUEST_TIMEOUT = 10

# Result titles containing this word are skipped unless the user searched for it.
GAME_KEYWORD = '游戏'

# (pattern, prefix, suffix) triples tried in order on a dytt detail page.
# The capture group is the variable middle of the link; prefix and suffix are
# glued back on when printing.  The '|' characters of the ed2k scheme must be
# escaped, otherwise the regex becomes an alternation that matches every '/'.
_DYTT_LINK_FORMATS = (
    (re.compile(r'ed2k://\|file\|(.*?)\|/'), 'ed2k://|file|', '|/'),
    (re.compile('http:(.*?)torrent'), 'http:', 'torrent'),
    (re.compile('ftp:(.*?)mkv'), 'ftp:', 'mkv'),
)


def _fetch_text(url, encoding):
    """Download ``url`` and return its body decoded with ``encoding``.

    Undecodable bytes are dropped.  Raises ``requests.RequestException`` on
    network failure or timeout.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    return response.content.decode(encoding, 'ignore')


def _fetch_soup(url, encoding):
    """Download ``url`` and return it parsed by BeautifulSoup with lxml."""
    return BeautifulSoup(_fetch_text(url, encoding), 'lxml')


def _page_count(raw):
    """Convert a scraped page-count string to an int; 0 when unusable."""
    try:
        return max(int(raw), 0)
    except (TypeError, ValueError):
        return 0


def _is_unrequested_game(title, query):
    """Return True when ``title`` mentions games but the search did not.

    ``query`` may be the percent-encoded form of the search term (that is
    what get_name() hands to the ygdy and dytt searches), so it is decoded
    before the comparison; an unencoded query passes through unchanged.
    """
    wanted = unquote(str(query), encoding='gbk')
    return GAME_KEYWORD in title and GAME_KEYWORD not in wanted


def get_name():
    """Prompt for movie titles in a loop and search every site for each one.

    The loop ends when stdin is closed or the user presses Ctrl-C at the
    prompt.  A network failure on one site is reported and the remaining
    sites are still searched.
    """
    while True:
        try:
            moviename = input('请输入要查找的电影名 ->')
        except (EOFError, KeyboardInterrupt):
            return
        try:
            moviename_quote = quote(moviename.encode('gb2312'))
        except UnicodeEncodeError:
            # Title has characters outside GB2312: retry with the larger GBK
            # set and drop whatever still cannot be encoded.
            moviename_quote = quote(moviename.encode('gbk', 'ignore'))
        searches = (
            (get_url_from_ygdy, moviename_quote),
            (get_url_from_bttiantang, moviename),
            (get_url_from_dytt, moviename_quote),
        )
        for search, keyword in searches:
            try:
                search(keyword)
            except requests.RequestException as err:
                print('请求失败,已跳过该站点:', err)


def _ygdy_last_page(pager_cells):
    """Read the page count from the second pager cell's link; 0 if absent."""
    if len(pager_cells) < 2:
        return 0
    anchor = pager_cells[1].find('a')
    if anchor is None:
        return 0
    found = re.search(r'PageNo=(\d+)', anchor.get('href') or '')
    return _page_count(found.group(1)) if found else 0


def get_url_from_ygdy(moviename):
    """Search ygdy for ``moviename`` and print links for every result.

    ``moviename`` is appended verbatim to the search URL, so it is expected
    to be already percent-encoded.
    """
    url = 'http://s.dydytt.net/plus/search.php?kwtype=0&keyword=' + str(moviename)
    content = _fetch_soup(url, 'gb2312')
    pager_cells = content.find_all('td', width="30")
    movie_infos = content.find_all('td', width="55%")
    if not movie_infos:
        print('查无此电影,请检查后重试')
        return
    print('阳光电影搜索结果:')
    total_pages = _ygdy_last_page(pager_cells)
    if not total_pages:
        # No usable pager: the page already fetched holds all the results.
        for movie_info in movie_infos:
            get_info(movie_info, moviename)
        return
    for page_no in range(1, total_pages + 1):
        print('第', page_no, '页:')
        page = _fetch_soup(url + '&PageNo=' + str(page_no), 'gb2312')
        for movie_info in page.find_all('td', width='55%'):
            get_info(movie_info, moviename)


def get_info(movie_info, name):
    """Print the title and download links of one ygdy search result.

    ``movie_info`` is a result cell containing a link to the detail page;
    ``name`` is the search term, used only for the game filter.
    """
    anchor = movie_info.find('a')
    href = anchor.get('href') if anchor is not None else None
    if not href:
        return
    title = movie_info.text
    if _is_unrequested_game(title, name):
        return
    print('电影名:', title)
    detail = _fetch_soup('http://www.ygdy8.com' + href, 'gbk')
    cells = detail.find_all('td', style="WORD-WRAP: break-word")
    print('下载链接:')
    if len(cells) == 1:
        link = cells[0].find('a')
        if link is not None:
            print(link.string)
    else:
        for index, cell in enumerate(cells, start=1):
            link = cell.find('a')
            if link is not None:
                print('链接', index, ':', link.string)
    print(' ')


def get_url_from_bttiantang(moviename):
    """Search bttiantang for ``moviename`` and print links for every result."""
    baseurl = 'http://www.bttiantang.com/s.php?q=' + str(moviename)
    html = _fetch_text(baseurl, 'utf8')
    pagenum_info = re.findall('</b>条<b>(.*?)</b>', html)
    entries = BeautifulSoup(html, 'lxml').find_all('p', class_="tt cl")
    if not entries:
        print('查无此电影,请检查后重试')
        return
    print('BT天堂搜索结果:')
    total_pages = _page_count(pagenum_info[0]) if pagenum_info else 0
    if not total_pages:
        # No usable page count: the page already fetched holds all the results.
        for entry in entries:
            get_movieinfo(entry, moviename)
        return
    for page_no in range(1, total_pages + 1):
        print('第', page_no, '页:')
        page = _fetch_soup(baseurl + '&PageNo=' + str(page_no), 'utf8')
        for entry in page.find_all('p', class_="tt cl"):
            get_movieinfo(entry, moviename)


def get_movieinfo(movie_content, name):
    """Print the title and download links of one bttiantang search result.

    ``movie_content`` is a result element containing a link to the detail
    page; ``name`` is the search term, used only for the game filter.
    """
    anchor = movie_content.find('a')
    href = anchor.get('href') if anchor is not None else None
    if not href:
        return
    url = 'http://www.bttiantang.com/' + href
    title = movie_content.text
    if _is_unrequested_game(title, name):
        return
    print('电影名:', title)
    info = _fetch_soup(url, 'utf8')
    print('下载链接:')
    link_anchors = (block.find('a') for block in info.find_all('div', class_='tinfo'))
    hrefs = [a.get('href') for a in link_anchors if a is not None and a.get('href')]
    for index, link_href in enumerate(hrefs, start=1):
        print('链接' + str(index) + ':')
        print('http://www.bttiantang.com' + link_href)


def get_url_from_dytt(moviename):
    """Search dytt for ``moviename`` and print links for every result.

    ``moviename`` is appended verbatim to the search URL, so it is expected
    to be already percent-encoded.
    """
    baseurl = 'http://www.dytt.com/search.asp?searchword=' + str(moviename)
    html = _fetch_text(baseurl, 'gbk')
    result = re.findall('下一页.*?href.*?page=(.*?)&', html)
    items = BeautifulSoup(html, 'lxml').find_all('p', class_='s1')
    # The first 's1' paragraph is never treated as a result (presumably a
    # header row -- TODO confirm), so one or zero items means nothing found.
    if len(items) <= 1:
        print('查无此电影,请检查后重试')
        return
    print('电影淘淘搜索结果:')
    total_pages = _page_count(result[0]) if result else 0
    if not total_pages:
        # No usable pager: the page already fetched holds all the results.
        for item in items[1:]:
            get_movieinfo_from_dytt(item, moviename)
        return
    for page_no in range(1, total_pages + 1):
        print('第', page_no, '页:')
        page = _fetch_soup(baseurl + '&page=' + str(page_no), 'gbk')
        for item in page.find_all('p', class_='s1')[1:]:
            get_movieinfo_from_dytt(item, moviename)


def get_movieinfo_from_dytt(item, name):
    """Print the title and download links of one dytt search result.

    ed2k links are preferred; if the detail page has none, http ``torrent``
    links are printed, and failing that ftp ``mkv`` links.  ``name`` is the
    search term, used only for the game filter.
    """
    anchor = item.find('a')
    href = anchor.get('href') if anchor is not None else None
    if not href:
        return
    title = anchor.text
    movieurl = 'http://www.dytt.com' + href
    if _is_unrequested_game(title, name):
        return
    print('电影名:', title)
    pagecontent = _fetch_text(movieurl, 'gbk')
    print('下载链接:')
    for pattern, prefix, suffix in _DYTT_LINK_FORMATS:
        links = pattern.findall(pagecontent)
        if not links:
            continue
        for index, link in enumerate(links, start=1):
            print('链接' + str(index) + ':', prefix + link + suffix)
        break  # only the first link format that yields matches is printed


if __name__ == '__main__':
    get_name()
运行结果: