zoukankan      html  css  js  c++  java
  • selenium

    - selenium模块在爬虫中的使用
        - 概念:是一个基于浏览器自动化的模块。
        - 爬虫之间的关联:
            - 便捷地捕获动态加载的数据。(可见即可得)
            - 实现模拟登录
        - 环境安装:pip install selenium
        - 基本使用:
            - 准备好某一款浏览器的驱动程序:http://chromedriver.storage.googleapis.com/index.html
                - 版本的映射关系:https://blog.csdn.net/huilan_same/article/details/51896672
            - 实例化某一款浏览器对象
        - 动作链:
            - 一系列连续的动作
            - 在实现标签定位时,如果发现定位的标签是存在于iframe标签之中的,则在定位时必须执行一个
            固定的操作:bro.switch_to.frame('id')
        - 无头浏览器的操作:无可视化界面的浏览器
            - PhantomJS:已停止更新
            - 谷歌无头浏览器
        - 让selenium规避检测

    自动化京东搜索关键字

    from selenium import webdriver
    from time import sleep
    # Automate a keyword search on jd.com and print the resulting page HTML.
    browser = webdriver.Chrome(executable_path='chromedriver.exe')
    browser.get('https://www.jd.com/')
    sleep(1)

    # Locate the search box by its id and type the keyword into it.
    browser.find_element_by_id('key').send_keys('mac pro')

    # Locate the search button and click it to submit the query.
    browser.find_element_by_xpath('//*[@id="search"]/div/div[2]/button').click()
    sleep(2)

    # Execute JavaScript in the page: scroll to the bottom of the document.
    browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    sleep(2)

    print(browser.page_source)

    sleep(2)
    browser.quit()

    自动化抓取动态加载数据

    from selenium import webdriver
    from time import sleep
    from lxml import etree
    # Grab the rendered HTML of the first page plus the next three pages,
    # then print the two title attributes of every entry in the result list.
    bro = webdriver.Chrome(executable_path='chromedriver.exe')

    bro.get('http://125.35.6.84:81/xk/')
    sleep(1)
    html_pages = [bro.page_source]

    for _ in range(3):
        # Click the "next page" control and wait a second before reading the page.
        next_button = bro.find_element_by_id('pageIto_next')
        next_button.click()
        sleep(1)
        html_pages.append(bro.page_source)

    for html in html_pages:
        doc = etree.HTML(html)
        for item in doc.xpath('//ul[@id="gzlist"]/li'):
            title = item.xpath('./dl/@title')[0]
            num = item.xpath('./ol/@title')[0]
            print(f'{title}:{num}')

    sleep(2)
    bro.quit()

    动作链

    from selenium import webdriver
    from time import sleep
    from selenium.webdriver import ActionChains
    # Drag the "draggable" box on the runoob demo page in five small steps.
    bro = webdriver.Chrome(executable_path='chromedriver.exe')
    bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    # The target element lives inside an iframe, so switch into that frame
    # before trying to locate it.
    bro.switch_to.frame('iframeResult')
    div_tag = bro.find_element_by_id('draggable')

    # Drag = press and hold + move + release.  Every step builds its own
    # ActionChains and calls perform(), so only that step's action is sent.
    ActionChains(bro).click_and_hold(div_tag).perform()

    for _ in range(5):
        ActionChains(bro).move_by_offset(17,5).perform()
        sleep(0.5)

    # perform() is needed here as well: release() on its own only queues the
    # action, so the mouse button would never actually be let go.
    ActionChains(bro).release().perform()

    sleep(3)

    bro.quit()

    12306登录

    超级鹰:

    import requests
    from hashlib import md5
    
    class Chaojiying_Client(object):
        """Small client for the Chaojiying picture-recognition HTTP API."""

        def __init__(self, username, password, soft_id, timeout=30):
            """Store the account data that is sent with every request.

            username: account name, sent as ``user``.
            password: plain-text password; only its MD5 hex digest is kept
                and sent as ``pass2``.
            soft_id: software id, sent as ``softid``.
            timeout: seconds passed to ``requests.post`` so that a stalled
                connection cannot block the caller forever.
            """
            self.username = username
            self.password = md5(password.encode('utf8')).hexdigest()
            self.soft_id = soft_id
            self.timeout = timeout
            self.base_params = {
                'user': self.username,
                'pass2': self.password,
                'softid': self.soft_id,
            }
            self.headers = {
                'Connection': 'Keep-Alive',
                'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
            }

        def _post(self, url, params, files=None):
            """POST ``params`` plus the account parameters to ``url``; return the decoded JSON."""
            data = dict(params)
            # Account parameters are applied last, so they win on a key clash.
            data.update(self.base_params)
            r = requests.post(url, data=data, files=files, headers=self.headers, timeout=self.timeout)
            return r.json()

        def PostPic(self, im, codetype):
            """Upload a picture for recognition and return the JSON reply.

            im: picture bytes.
            codetype: question type, see http://www.chaojiying.com/price.html
            """
            files = {'userfile': ('ccc.jpg', im)}
            return self._post('http://upload.chaojiying.net/Upload/Processing.php', {'codetype': codetype}, files=files)

        def ReportError(self, im_id):
            """Report a wrongly recognised picture and return the JSON reply.

            im_id: picture ID of the question being reported.
            """
            return self._post('http://upload.chaojiying.net/Upload/ReportError.php', {'id': im_id})

    12306自动登录主体代码:

    from selenium import webdriver
    from time import sleep
    from PIL import Image
    from selenium.webdriver import ActionChains
    from Cjy import Chaojiying_Client
    from selenium.webdriver import ActionChains
    # Open the 12306 login page, screenshot it, and crop the code image out
    # of the screenshot into code.png.
    bro = webdriver.Chrome(executable_path='chromedriver.exe')
    bro.get('https://kyfw.12306.cn/otn/login/init')
    sleep(5)
    bro.save_screenshot('main.png')

    code_img_tag = bro.find_element_by_xpath('//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
    pos = code_img_tag.location
    dims = code_img_tag.size
    left, top = pos['x'], pos['y']
    # Crop region as (left, upper, right, lower), built from the element's
    # position and size.
    crop_box = (int(left), int(top), int(left + dims['width']), int(top + dims['height']))

    Image.open('./main.png').crop(crop_box).save('code.png')
    
    def get_text(imgPath,imgType):
        """Upload the image at ``imgPath`` to Chaojiying and return its ``pic_str``.

        imgPath: path of the image file to recognise.
        imgType: code type forwarded to ``Chaojiying_Client.PostPic``.
        """
        # NOTE(review): the account credentials are hard-coded here; consider
        # loading them from configuration instead.
        chaojiying = Chaojiying_Client('bobo328410948', 'bobo328410948', '899370')
        # Read inside a context manager so the file handle is always closed.
        with open(imgPath, 'rb') as img_file:
            im = img_file.read()
        return chaojiying.PostPic(im, imgType)['pic_str']
    
    # The recognition result lists click points as "x,y" pairs joined by "|",
    # e.g. "55,70|267,133" -> [[55, 70], [267, 133]].
    result = get_text('./code.png',9004)
    # split('|') yields a single element when there is no "|", so one loop
    # covers both the single-point and the multi-point case.
    all_list = []
    for pair in result.split('|'):
        coords = pair.split(',')
        all_list.append([int(coords[0]), int(coords[1])])
    print(all_list)
    # Click each recognised point; x and y are passed as offsets to
    # move_to_element_with_offset on the code image element.
    for x, y in all_list:
        ActionChains(bro).move_to_element_with_offset(code_img_tag,x,y).click().perform()
        sleep(1)

    # Fill in the account fields one after another, pausing after each.
    for field_id, text in (('username', '123456'), ('password', '67890000000')):
        bro.find_element_by_id(field_id).send_keys(text)
        sleep(1)
    bro.find_element_by_id('loginSub').click()

    sleep(5)
    bro.quit()

    selenium其他操作

    #使用谷歌无头浏览器
    from selenium import webdriver
    from time import sleep
    from selenium.webdriver.chrome.options import Options
    
    # Run Chrome without a visible window and print the fetched page source.
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')

    # Pass the options through the `options=` keyword; `chrome_options=` is
    # the deprecated spelling of the same argument.
    driver = webdriver.Chrome(r'chromedriver.exe',options=chrome_options)
    try:
        driver.get('https://www.cnblogs.com/')
        print(driver.page_source)
    finally:
        # A headless browser has no window to close by hand, so always shut
        # it down explicitly.
        driver.quit()
    
    #如何规避selenium被检测
    from selenium import webdriver
    from selenium.webdriver import ChromeOptions
    from time import sleep
    
    # Start Chrome with the 'enable-automation' switch excluded, then open
    # taobao.com.
    # NOTE(review): presumably this makes the browser harder to identify as
    # automated -- confirm against the target site.
    chrome_opts = ChromeOptions()
    chrome_opts.add_experimental_option('excludeSwitches', ['enable-automation'])

    driver = webdriver.Chrome(r'chromedriver.exe',options=chrome_opts)
    driver.get('https://www.taobao.com/')
  • 相关阅读:
    转:Java 6 JVM参数选项大全(中文版)
    转:Http Get Post put delete
    转:Google MapReduce中文版
    转:java.net.SocketException: Too many open files解决方法
    转:UML类图基础
    转:Maven常用命令
    转:ibatis配置简介
    转:导出 Oracle 数据库中所所有用户表的表结构
    C# 中使用iTextSharp组件修改PDF元数据(title,Keywords等)
    SQL Server跨服务器查询
  • 原文地址:https://www.cnblogs.com/sun-10387834/p/12846802.html
Copyright © 2011-2022 走看看