zoukankan      html  css  js  c++  java
  • 第3课-电影天堂爬虫实战

    #电影天堂电影爬虫

    import requests
    from lxml import etree
    import time

    import warnings

    # Scheme and host that every URL in this script is built on.
    DOMAIN = "https://dytt8.net"

    # Headers sent with every HTTP request made by this script.
    HEADERS = {
        "Referer": "https://dytt8.net/html/gndy/dyzz/index.html",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    }

    # Silence every warning category for the whole process.
    # NOTE(review): presumably here to hide the TLS warnings triggered by
    # verify=False requests - confirm, and consider narrowing the filter.
    warnings.filterwarnings(action="ignore")

    # Fetch a page and return it as a parsed lxml element tree.
    def get_page_info(url, flag=True, timeout=10, max_retries=None):
        """Download *url* and parse the body with ``etree.HTML``.

        The function sleeps one second before the first request. Every failed
        attempt (non-200 status or a ``requests`` exception) is followed by a
        pause that grows by one second per failure, then the request is retried.

        Args:
            url: Address of the page to fetch.
            flag: When true, use ``response.text`` (the decoding chosen by
                ``requests``); when false, decode the raw bytes as GBK.
            timeout: Seconds handed to ``requests.get`` so a stalled
                connection cannot block forever.
            max_retries: Number of failed attempts tolerated before giving
                up. ``None`` (the default) retries without limit.

        Returns:
            The value returned by ``etree.HTML`` for the downloaded text.

        Raises:
            RuntimeError: Only when ``max_retries`` is set and exhausted.
        """
        # Unconditional pause before the first request.
        time.sleep(1)
        failures = 0
        while True:
            try:
                response = requests.get(
                    url=url, headers=HEADERS, verify=False, timeout=timeout
                )
            except requests.RequestException:
                # Network errors and timeouts are retried like a bad status.
                response = None

            if response is not None and response.status_code == 200:
                if flag:
                    text = response.text
                else:
                    # errors="replace" keeps one undecodable byte from
                    # aborting the whole page.
                    text = response.content.decode("gbk", errors="replace")
                return etree.HTML(text)

            failures += 1
            if max_retries is not None and failures > max_retries:
                raise RuntimeError(
                    "giving up on {} after {} failed attempts".format(url, failures)
                )
            # Linear back-off: 1s, 2s, 3s, ...
            time.sleep(failures)

    # Read the number of listing pages from the index page.
    def get_pages():
        """Return the listing page count as an ``int``.

        Loads the index page under ``DOMAIN`` and converts the text of the
        last ``<option>`` inside ``<select name='sldd'>`` to an integer.
        """
        index_tree = get_page_info(DOMAIN + "/html/gndy/dyzz/index.html")
        last_option_texts = index_tree.xpath(
            "//select[@name='sldd']/option[last()]/text()"
        )
        return int(last_option_texts[0])

    # Print the metadata of one movie detail page.
    # Each entry is (label prefix, print template) for a field whose value
    # sits on the same text node as its label.
    _INLINE_FIELDS = (
        ("◎译 名", " ======================================译 名:{}==============================="),
        ("◎片 名", "片 名:{}"),
        ("◎年 代", "年 代:{}"),
        ("◎产 地", "产 地:{}"),
        ("◎类 别", "类 别:{}"),
        ("◎语 言", "语 言:{}"),
        ("◎字 幕", "字 幕:{}"),
        ("◎上映日期", "上映日期:{}"),
        # NOTE(review): unlike every other template this one keeps the
        # leading "◎" in its output - kept as is; confirm it is intended.
        ("◎IMDb评分", "◎IMDb评分:{}"),
        ("◎豆瓣评分", "豆瓣评分:{}"),
        ("◎文件格式", "文件格式:{}"),
        ("◎视频尺寸", "视频尺寸:{}"),
        ("◎文件大小", "文件大小:{}"),
        ("◎片 长", "片 长:{}"),
        ("◎导 演", "导 演:{}"),
        ("◎编 剧", "编 剧:{}"),
        ("◎标 签", "标 签:{}"),
    )


    def get_movie_info(detail_url):
        """Fetch a movie detail page and print its labelled fields.

        The text nodes of every ``<p>`` under ``<div id='Zoom'>`` are scanned
        for lines that start with a known "◎" label. Inline fields print the
        text after the label. The cast field also collects the following
        nodes up to the next "◎" label. The synopsis field prints the node
        that follows its label. Finally the first matching download link, if
        any, is printed.

        Args:
            detail_url: Absolute URL of the movie detail page.

        Returns:
            None. All results are written with ``print``.
        """
        html = get_page_info(detail_url, False)

        # str.split() with no argument treats U+3000 (ideographic space) as
        # whitespace, so this collapses any padding to a single ASCII space
        # and lets the spaced prefixes match. Whitespace-only nodes become ""
        # but stay in the list, so positions still map onto the raw nodes.
        lines = [
            " ".join(str(node).split())
            for node in html.xpath("//div[@id='Zoom']//p/text()")
        ]

        # `following` is the index of the node right after the current one.
        for following, text in enumerate(lines, start=1):
            if not text:
                continue

            if text.startswith("◎主 演"):
                lead = text[len("◎主 演"):].strip()
                actors = [lead] if lead else []
                # Cast members continue on later nodes until the next label.
                for extra in lines[following:]:
                    if extra.startswith("◎"):
                        break
                    if extra:
                        actors.append(extra)
                print("主演:{}".format(actors))
            elif text.startswith("◎简 介"):
                # The synopsis is taken from the node after the label; guard
                # against the label being the last node.
                synopsis = lines[following] if following < len(lines) else ""
                print("简 介:{}".format(synopsis))
            else:
                for prefix, template in _INLINE_FIELDS:
                    if text.startswith(prefix):
                        print(template.format(text[len(prefix):].strip()))
                        break

        download_url = html.xpath("//table//td[@bgcolor='#fdfddf']/a/@href")
        if download_url:
            print("迅雷下载地址:{}".format(download_url[0]))
    def get_detail_url():
        """Visit every listing page and print the details of each linked movie.

        The page count comes from ``get_pages``. Each listing URL is printed
        before it is fetched, and every ``a.ulink`` href found inside a
        ``table.tbspan`` is prefixed with ``DOMAIN`` and passed to
        ``get_movie_info``.
        """
        last_page = get_pages()
        for page_no in range(1, last_page + 1):
            listing_url = "{}/html/gndy/dyzz/list_23_{}.html".format(DOMAIN, page_no)
            print(listing_url)
            listing_tree = get_page_info(listing_url)
            hrefs = listing_tree.xpath(
                "//table[@class='tbspan']//a[@class='ulink']/@href"
            )
            for href in hrefs:
                get_movie_info(DOMAIN + href)


    # Start the crawl only when this file is executed as a script.
    if __name__ == "__main__":
        get_detail_url()


  • 相关阅读:
    ***php 数组添加关联元素的方法小结(关联数组添加元素)
    阿里云PHP Redis代码示例
    linux内核编程笔记【原创】
    linux RTC 驱动模型分析【转】
    linux 实时时钟(RTC)驱动【转】
    RTC系统【转】
    IRQ和FIQ中断的区别【转】
    NAND Flash【转】
    NandFlash详述【转】
    展讯NAND Flash高级教程【转】
  • 原文地址:https://www.cnblogs.com/win0211/p/11991185.html
Copyright © 2011-2022 走看看