zoukankan      html  css  js  c++  java
  • 爬虫 selenium

    点击这里有惊喜

    https://www.cnblogs.com/bobo-zhang/p/9685362.html

    安装selenium

    环境安装 pip install selenium
    编码流程:
    导包: from selenium import webdriver
    实例化某一款浏览器对象
    制定自动化操作代码

    运行后,会自动打开一个Chrome页面,并在其中执行相应的自动化操作

    from selenium import webdriver
    from time import sleep

    # Demo: open Baidu in Chrome, type a query, click the button and dump the page source.
    # NOTE(review): the driver path below looks like it lost its backslash
    # separators when the page was extracted — confirm the real path before running.
    bro = webdriver.Chrome(executable_path=r'C:UsersAdministratorDesktop爬虫+数据day_03_爬虫chromedriver.exe')
    try:
        bro.get(url='https://www.baidu.com/')
        sleep(2)

        # Locate the element with id "kw" and type the query into it.
        text_input = bro.find_element_by_id('kw')
        text_input.send_keys('人民币')
        sleep(2)

        # Click the element with id "su" (presumably the search button — verify on the page).
        bro.find_element_by_id('su').click()
        sleep(3)

        # Get the source of the page currently shown in the browser.
        print(bro.page_source)
    finally:
        # Always shut the browser down, even if one of the steps above raised.
        bro.quit()

    通过selenium 获取 数据

    # Get more movie detail data from Douban Movies: load the ranking page,
    # scroll to the bottom a few times, then save the resulting page source.
    url = 'https://movie.douban.com/typerank?type_name=%E6%83%8A%E6%82%9A&type=19&interval_id=100:90&action='
    bro = webdriver.Chrome(executable_path=r'C:UsersAdministratorDesktop爬虫+数据day_03_爬虫chromedriver.exe')
    try:
        bro.get(url)

        # Three rounds of "wait 3 s, then scroll to the bottom of the document"
        # (presumably each scroll makes the page load more entries — verify).
        for _ in range(3):
            sleep(3)
            bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        sleep(2)

        page_text = bro.page_source

        with open('./douban.html','w',encoding='utf-8') as fp:
            fp.write(page_text)

        sleep(1)
    finally:
        # Always shut the browser down, even if loading or saving failed.
        bro.quit()

    关于谷歌无头浏览器

    什么是无头浏览器?
    无头浏览器运行时不会弹出浏览器窗口,也就是没有可视化界面,所有操作都在后台完成

    # Headless Chrome: same Douban scrape as before, but Chrome is started
    # with the --headless and --disable-gpu arguments.
    from selenium.webdriver.chrome.options import Options
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')

    # Get more movie detail data from Douban Movies.
    url = 'https://movie.douban.com/typerank?type_name=%E6%83%8A%E6%82%9A&type=19&interval_id=100:90&action='
    bro = webdriver.Chrome(executable_path=r'C:UsersAdministratorDesktop爬虫+数据day_03_爬虫chromedriver.exe',chrome_options=chrome_options)
    try:
        bro.get(url)

        # Three rounds of "wait 3 s, then scroll to the bottom of the document"
        # (presumably each scroll makes the page load more entries — verify).
        for _ in range(3):
            sleep(3)
            bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        sleep(2)

        page_text = bro.page_source

        # Save the page source to disk, then echo it to stdout.
        with open('./douban.html','w',encoding='utf-8') as fp:
            fp.write(page_text)
        print(page_text)
        sleep(1)
    finally:
        # Always shut the browser down; a leaked headless Chrome has no window to close by hand.
        bro.quit()

    通过selenium自动化登录qq空间

    # Qzone (QQ space): log in through the login iframe and save the resulting page.
    import os

    # Read the credentials from the environment so that no account secret is
    # stored in the source. A missing variable raises KeyError here, before any
    # browser has been started.
    qq_account = os.environ['QQ_ACCOUNT']
    qq_password = os.environ['QQ_PASSWORD']

    bro = webdriver.Chrome(executable_path=r'C:UsersAdministratorDesktop爬虫+数据day_03_爬虫chromedriver.exe')
    try:
        url = 'https://qzone.qq.com/'
        bro.get(url=url)
        sleep(2)

        # Switch into the specific iframe ('login_frame'); the elements looked
        # up below are searched for inside that frame.
        bro.switch_to.frame('login_frame')
        bro.find_element_by_id('switcher_plogin').click()
        sleep(2)

        # Fill in the elements with ids "u" and "p", then click "login_button".
        bro.find_element_by_id('u').send_keys(qq_account)
        bro.find_element_by_id('p').send_keys(qq_password)

        bro.find_element_by_id('login_button').click()

        sleep(5)

        page_text = bro.page_source
        with open('qq.html','w',encoding='utf-8') as fp:
            fp.write(page_text)
    finally:
        # Always shut the browser down, even if the login flow failed part-way.
        bro.quit()

    线程池

    # Scrape video data from Pear Video (pearvideo.com) using a thread pool.
    import requests
    import re
    from lxml import etree
    from multiprocessing.dummy import Pool
    import random

    url = 'https://www.pearvideo.com/category_1'
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
    }


    # The two helpers are defined BEFORE the pool.map calls that use them:
    # in a flat script a name must be bound before the statement that reads it runs.
    def getVideoData(url):
        """Request `url` with the shared `headers` and return the response body as bytes."""
        return requests.get(url=url,headers=headers).content


    def saveVideo(data):
        """Write `data` to '<random integer in 0..5000>.mp4' in the current directory."""
        # NOTE(review): the name is random, so two videos can draw the same
        # number and overwrite each other.
        fileName = str(random.randint(0,5000))+'.mp4'
        with open(fileName,'wb') as fp:
            fp.write(data)


    # Fetch the category page and select the <li> entries of the video list.
    page_text = requests.get(url=url,headers=headers).text
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@id="listvideoList"]/ul/li')

    # Visit each detail page and pull the video URL out of its source with a regex.
    video_url_list = []
    for li in li_list:
        detail_url = 'https://www.pearvideo.com/'+li.xpath('./div/a/@href')[0]
        detail_page = requests.get(url=detail_url,headers=headers).text
        video_url = re.findall('srcUrl="(.*?)",vdoUrl',detail_page,re.S)[0]
        video_url_list.append(video_url)

    # Create a thread pool with 5 workers. Both map() calls block until every
    # item is processed, so leaving the `with` block only releases the workers.
    with Pool(5) as pool:
        video_data_list = pool.map(getVideoData,video_url_list)
        pool.map(saveVideo,video_data_list)
  • 相关阅读:
    Activex打包于发布完整版---ActiveX打包
    同步和异步的区别
    QoS的构建模块与机制
    GLSL语言内置的变量详解
    jquery中的DOM操作
    varchar和Nvarchar区别
    使用SqlServer中的float类型时发现的问题
    SQL2005,错误 0xc00470fe 数据流任务 产品级别对于 组件“源
    SQL SERVER SQLOS的任务调度
    隐式事务(转)
  • 原文地址:https://www.cnblogs.com/zhangqing979797/p/10453421.html
Copyright © 2011-2022 走看看