zoukankan      html  css  js  c++  java
  • 爬取电影网站

    code

    import os
    import shutil
    import sys
    import time
    from urllib.parse import urljoin

    import requests
    from bs4 import BeautifulSoup
    from selenium import webdriver
    from selenium.webdriver.common.action_chains import ActionChains
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    
    
    def asleep(driver):
        """Set a 3.5 s implicit wait on ``driver``, then block for 2 s.

        Args:
            driver: Object exposing ``implicitly_wait(seconds)``; the script
                passes the Selenium WebDriver it created.
        """
        implicit_wait_seconds = 3.5
        pause_seconds = 2

        driver.implicitly_wait(implicit_wait_seconds)
        time.sleep(pause_seconds)
    
    # Root of the site being crawled; relative hrefs are resolved against it.
    BASE_URL = "http://zimiyy.com"
    # The listing spans 719 pages (1..719 inclusive).
    LAST_PAGE = 719


    def _fetch_video_link(driver, play_page_url):
        """Open a play page and return the text of its player info span.

        Args:
            driver: Selenium WebDriver used for navigation.
            play_page_url: Absolute URL of the play page.

        Returns:
            The text of the first ``span.dplayer-info-panel-item-data``
            element, or ``None`` when the lookup fails (the error is printed).
        """
        driver.get(play_page_url)
        try:
            return driver.find_element(
                By.XPATH, "//span[@class='dplayer-info-panel-item-data']"
            ).text
        except Exception as e:
            # Best effort: a play page without the span must not stop the crawl.
            print(e)
            return None


    driver = webdriver.Chrome()
    try:
        asleep(driver)

        for k in range(1, LAST_PAGE + 1):
            url = "{}/mov/0/0/all/{}.html".format(BASE_URL, k)

            driver.get(url)

            # find_element(By...) works on Selenium 3 and 4; the
            # find_element_by_* helpers were removed in Selenium 4.3.
            t = driver.find_element(
                By.XPATH, "//div[@class='index-tj mb clearfix']/ul"
            ).get_attribute('innerHTML')

            soup1 = BeautifulSoup(t, 'html.parser')

            for i in soup1.find_all('a'):
                href = i.get("href")
                if not href:
                    # An anchor without href would yield a bogus ".comNone" URL.
                    continue
                tmp_movie_url = urljoin(BASE_URL, href)
                print(tmp_movie_url)
                movie_name = i.get("title")
                print(movie_name)
                # Not every anchor is guaranteed to wrap a poster image.
                img = i.find("img")
                pic_url = img.get("src") if img is not None else None
                print(pic_url)

                time.sleep(2)
                # Enter the detail page.
                driver.get(tmp_movie_url)
                # Grab the description markup.
                tmp_desc = driver.find_element(
                    By.CLASS_NAME, "info"
                ).get_attribute('innerHTML')
                # NOTE(review): the id "stab_1_71" is hard-coded; presumably
                # it is the play-list tab -- confirm it exists on every
                # detail page, otherwise this lookup raises.
                detail_html = driver.find_element(
                    By.ID, "stab_1_71"
                ).get_attribute('innerHTML')
                soup2 = BeautifulSoup(detail_html, 'html.parser')
                tmp_play_page_list = soup2.find_all('li')

                print(tmp_desc)
                all_movie_url = {}
                for j in tmp_play_page_list:
                    link = j.find("a")
                    if link is None or not link.get("href"):
                        # Skip list items that carry no play-page link.
                        continue
                    movie_url_type = link.string
                    # Resolve against the detail page so both relative and
                    # absolute hrefs are accepted by driver.get().
                    play_page_url = urljoin(tmp_movie_url, link.get("href"))
                    # Enter the play page and record the video link.
                    all_movie_url[movie_url_type] = _fetch_video_link(
                        driver, play_page_url
                    )

                print(all_movie_url)
                print("*"*17)

            time.sleep(3)
    finally:
        # Always close the browser, even when the crawl aborts midway.
        driver.quit()

  • 相关阅读:
    React生命周期
    React第三次入门
    滴滴新锐面经
    前端优化
    Z-index
    maven建ssh项目的pom文件
    拦截器与过滤器的区别
    Jquery的ajax获取action中的返回值
    清空数据库所有表的数据
    orcal操作锦集
  • 原文地址:https://www.cnblogs.com/sea-stream/p/13851753.html
Copyright © 2011-2022 走看看