zoukankan html css js c++ java
第3课-电影天堂爬虫实战

#电影天堂电影爬虫

import requests
from lxml import etree
import time

import warnings

# Silence every warning category for the whole process.
# NOTE(review): presumably meant to hide the insecure-request warning caused
# by the verify=False fetches in get_page_info -- confirm.
warnings.filterwarnings('ignore')
# Scheme and host that every site-relative path in this script is appended to.
DOMAIN = "https://dytt8.net"

# HTTP headers sent with every request made by get_page_info.
HEADERS = {
    "Referer": "https://dytt8.net/html/gndy/dyzz/index.html",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",

}

# Fetch a page and return it as a parsed element tree.
def get_page_info(url, flag=True, timeout=10, max_retries=None):
    """Download *url* and parse the body with ``etree.HTML``.

    The function sleeps one second before the first request, then keeps
    requesting until it receives HTTP 200. After the n-th consecutive failure
    it waits n seconds before trying again.

    Args:
        url: Absolute URL to fetch.
        flag: When true, use ``response.text`` (encoding chosen by requests);
            when false, decode the raw bytes as GBK.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection counts as a failed attempt instead of blocking forever.
        max_retries: Maximum number of retries after the first attempt.
            ``None`` (the default) retries indefinitely.

    Returns:
        The element tree produced by ``etree.HTML``.

    Raises:
        RuntimeError: If ``max_retries`` is set and that many retries have
            also failed.
    """
    time.sleep(1)
    failures = 0
    while True:
        try:
            response = requests.get(
                url=url, headers=HEADERS, verify=False, timeout=timeout
            )
        except requests.RequestException:
            # Connection errors and timeouts are retried like a non-200 reply.
            response = None

        if response is not None and response.status_code == 200:
            if flag:
                text = response.text
            else:
                # A stray invalid byte should not abort the whole crawl, so
                # undecodable bytes are replaced instead of raising.
                text = response.content.decode("gbk", errors="replace")
            return etree.HTML(text)

        failures += 1
        if max_retries is not None and failures > max_retries:
            raise RuntimeError(
                "giving up on {} after {} failed attempts".format(url, failures)
            )
        # Linear back-off: wait one second longer after each failure.
        time.sleep(failures)

# Get the number of list pages.
def get_pages():
    """Return the page count read from the category index page.

    Fetches ``/html/gndy/dyzz/index.html`` under DOMAIN and converts the text
    of the last ``<option>`` inside the ``<select name='sldd'>`` element to an
    int (presumably the pager drop-down -- confirm against the live page).

    Raises:
        IndexError: If the page contains no such option.
        ValueError: If the option text is not an integer.
    """
    index_url = "{}{}".format(DOMAIN, "/html/gndy/dyzz/index.html")
    tree = get_page_info(index_url)
    option_texts = tree.xpath("//select[@name='sldd']/option[last()]/text()")
    last_page = option_texts[0]
    return int(last_page)

# Fetch one movie detail page and print its metadata.

# Single-line fields: (label prefix after full-width spaces have been turned
# into ASCII spaces, output template for the value that follows the label).
_MOVIE_FIELD_TEMPLATES = (
    ("◎译  名", "\n\n======================================译 名:{}==============================="),
    ("◎片  名", "片  名:{}"),
    ("◎年  代", "年 代:{}"),
    ("◎产  地", "产 地:{}"),
    ("◎类  别", "类 别:{}"),
    ("◎语  言", "语 言:{}"),
    ("◎字  幕", "字 幕:{}"),
    ("◎上映日期", "上映日期:{}"),
    # NOTE(review): this output label keeps the leading "◎" unlike the
    # others; left unchanged so the printed output stays the same.
    ("◎IMDb评分", "◎IMDb评分:{}"),
    ("◎豆瓣评分", "豆瓣评分:{}"),
    ("◎文件格式", "文件格式:{}"),
    ("◎视频尺寸", "视频尺寸:{}"),
    ("◎文件大小", "文件大小:{}"),
    ("◎片  长", "片  长:{}"),
    ("◎导  演", "导  演:{}"),
    ("◎编  剧", "编  剧:{}"),
    ("◎标  签", "标 签:{}"),
)

_ACTORS_PREFIX = "◎主  演"
_SYNOPSIS_PREFIX = "◎简  介"


def get_movie_info(detail_url):
    """Print the labelled fields and first download link of a movie page.

    The page is fetched with ``get_page_info(detail_url, False)`` (GBK
    decoding). Every text node under ``//div[@id='Zoom']//p`` that starts with
    a known "◎" label is printed with that label's output template.

    Two labels span more than one text node:

    * Cast ("◎主  演"): the value on the label line plus every following text
      node up to the next node starting with "◎" is printed as a list.
    * Synopsis ("◎简  介"): the text node after the label is printed; if the
      label is the last node, the text on the label line itself is used.

    Finally, the first ``href`` found by the download-table XPath is printed,
    if there is one.

    Args:
        detail_url: Absolute URL of the movie detail page.

    Returns:
        None. All output goes to stdout.
    """
    html = get_page_info(detail_url, False)
    infos = html.xpath("//div[@id='Zoom']//p/text()")

    for position, raw in enumerate(infos):
        # Labels on the page are padded with ideographic spaces (U+3000);
        # turn them into ASCII spaces so the prefixes below can match.
        info = str(raw).replace(u'\u3000', u' ').strip()
        if not info.startswith("◎"):
            continue

        if info.startswith(_ACTORS_PREFIX):
            actors = [info[len(_ACTORS_PREFIX):].strip()]
            # Continuation lines carry one name each until the next label.
            for following in infos[position + 1:]:
                following = following.strip()
                if following.startswith("◎"):
                    break
                actors.append(following)
            print("主演:{}".format(actors))
        elif info.startswith(_SYNOPSIS_PREFIX):
            if position + 1 < len(infos):
                synopsis = infos[position + 1].strip()
            else:
                # No following node: avoid an IndexError and fall back to
                # whatever text shares the line with the label.
                synopsis = info[len(_SYNOPSIS_PREFIX):].strip()
            print("简 介:{}".format(synopsis))
        else:
            for prefix, template in _MOVIE_FIELD_TEMPLATES:
                if info.startswith(prefix):
                    print(template.format(info[len(prefix):].strip()))
                    break

    download_url = html.xpath("//table//td[@bgcolor='#fdfddf']/a/@href")
    if download_url:
        print("迅雷下载地址:{}".format(download_url[0]))
def get_detail_url():
    """Walk every list page and print the details of each linked movie.

    For page numbers 1 through ``get_pages()`` the list URL is printed and
    fetched. Every ``href`` of an ``a.ulink`` inside a ``table.tbspan`` is
    prefixed with DOMAIN and handed to ``get_movie_info``.

    Returns:
        None.
    """
    list_url_template = "{}/html/gndy/dyzz/list_23_{}.html"
    link_query = "//table[@class='tbspan']//a[@class='ulink']/@href"

    last_page = get_pages()
    for page_no in range(1, last_page + 1):
        list_url = list_url_template.format(DOMAIN, page_no)
        print(list_url)
        listing = get_page_info(list_url)
        for href in listing.xpath(link_query):
            # hrefs are appended to DOMAIN as-is, so they are expected to be
            # site-relative paths.
            get_movie_info(DOMAIN + href)


# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':

    get_detail_url()
查看全文
相关阅读:
yum 安装包的用法
 php session文件修改路径
 apache性能测试工具ab使用详解
 shell exit 0 exit 1
网站架构(页面静态化,图片服务器分离,负载均衡)方案全解析
 【转载】新手如何快速打造高流量网站
 高并发处理方案
 HTML静态化技术
 在项目中学习.NET的json(二）之运费计算器
 在项目中学习.NET的json(一）
原文地址：https://www.cnblogs.com/win0211/p/11991185.html