zoukankan      html  css  js  c++  java
  • 爬取电影网站

    code

    import shutil
    import sys,os
    import time
    from urllib.parse import urljoin

    import requests
    from bs4 import BeautifulSoup
    from selenium import webdriver
    from selenium.webdriver.common.action_chains import ActionChains
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    
    
    def asleep(driver):
        """Set the driver's implicit wait and then pause the script.

        Calls ``driver.implicitly_wait`` with 3.5 seconds, then blocks the
        calling thread for 2 seconds via ``time.sleep``. Returns ``None``.

        Args:
            driver: Object exposing ``implicitly_wait(seconds)``; the script
                passes a Selenium WebDriver instance.
        """
        implicit_wait_seconds = 3.5
        pause_seconds = 2

        driver.implicitly_wait(implicit_wait_seconds)
        time.sleep(pause_seconds)
    
    # Crawl the zimiyy.com movie listing page by page. For every movie found,
    # print its detail-page URL, title, poster URL, the detail page's "info"
    # HTML, and a mapping of play-source label -> text read from the player
    # page.
    driver = webdriver.Chrome()
    try:
        asleep(driver)

        # 719 listing pages (range end is exclusive).
        for k in range(1, 720):
            url = "http://zimiyy.com/mov/0/0/all/{}.html".format(k)

            driver.get(url)

            # Copy the listing markup into a plain string before navigating
            # away; the parsed soup stays usable after later driver.get() calls.
            t = driver.find_element(
                By.XPATH, "//div[@class='index-tj mb clearfix']/ul"
            ).get_attribute('innerHTML')

            soup1 = BeautifulSoup(t, 'html.parser')

            tmp = soup1.find_all('a')

            for i in tmp:
                href = i.get("href")
                if not href:
                    # An anchor without a target cannot lead to a detail page.
                    continue
                # urljoin resolves site-relative hrefs against the listing URL
                # and leaves already-absolute hrefs untouched.
                tmp_movie_url = urljoin(url, href)
                print(tmp_movie_url)
                movie_name = i.get("title")
                print(movie_name)
                # Not every anchor is guaranteed to wrap a poster image.
                img = i.find("img")
                pic_url = img.get("src") if img is not None else None
                print(pic_url)

                time.sleep(2)
                # Open the detail page.
                driver.get(tmp_movie_url)
                # Grab the description markup and the list of play pages.
                tmp_desc = driver.find_element(
                    By.CLASS_NAME, "info"
                ).get_attribute('innerHTML')
                detail_html = driver.find_element(
                    By.ID, "stab_1_71"
                ).get_attribute('innerHTML')
                soup2 = BeautifulSoup(detail_html, 'html.parser')
                tmp_play_page_list = soup2.find_all('li')

                print(tmp_desc)
                all_movie_url = {}
                for j in tmp_play_page_list:
                    link = j.find("a")
                    if link is None or not link.get("href"):
                        # List item without a usable link: nothing to visit.
                        continue
                    movie_url_type = link.string
                    # Resolve against the detail page so relative hrefs work.
                    play_page_url = urljoin(tmp_movie_url, link.get("href"))
                    # Open the play page.
                    driver.get(play_page_url)
                    # Read the player's info-panel text.
                    # NOTE(review): presumably this span holds the video URL
                    # shown by the page's player; confirm against the site.
                    try:
                        movie_url = driver.find_element(
                            By.XPATH,
                            "//span[@class='dplayer-info-panel-item-data']",
                        ).text
                    except Exception as e:
                        # Best effort: a play page without the panel is
                        # reported and recorded as None instead of aborting.
                        print(e)
                        movie_url = None

                    # Record the result under the play-source label.
                    all_movie_url[movie_url_type] = movie_url

                print(all_movie_url)
                print("*"*17)

            time.sleep(3)
    finally:
        # Always shut the browser down, even when the crawl aborts midway.
        driver.quit()

  • 相关阅读:
    华为lab-rs-v1-2.11_OSPF与ISIS互通
    jdk源码分析红黑树——插入篇
    jdk源码分析PriorityQueue
    jdk源码分析ArrayDeque
    jdk链表笔记
    jdk顺序表笔记
    SpringMVC类型转换器、属性编辑器
    SpringMVC基本使用
    spring整合hibernate
    spring aop注解配置
  • 原文地址:https://www.cnblogs.com/sea-stream/p/13851753.html
Copyright © 2011-2022 走看看