zoukankan      html  css  js  c++  java
  • python爬虫--selenium模块.上来自己动!

    selenium

    基本操作

    from selenium import webdriver
    from time import sleep
    # Instantiate a browser object driven by a local chromedriver binary.
    # NOTE(review): the path below contains no separators; the backslashes were
    # probably lost when the article was copied -- confirm the real location.
    bro = webdriver.Chrome(executable_path=r'C:pycahrm文件chromedriver.exe')
    url = 'https://www.jd.com/'

    try:
        # Send the request for the page.
        bro.get(url)

        # Locate the search input box by its id.
        search_input = bro.find_element_by_id('key')

        # Interact with the located tag: type the search keyword.
        search_input.send_keys('macPro')

        # Locate the search button and click it.
        btn = bro.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')
        btn.click()
        sleep(2)

        # Execute JavaScript in the page: scroll to the bottom of the document.
        jsCode = 'window.scrollTo(0,document.body.scrollHeight)'
        bro.execute_script(jsCode)

        sleep(3)
    finally:
        # Always shut the browser down, even if one of the steps above raised.
        bro.quit()
    
    selenium
    - 概念:基于浏览器自动化的一个模块。
    - 环境的安装:
        - pip install selenium
    - selenium和爬虫之间的关联:
        - 模拟登录
        - 便捷的捕获到动态加载的数据(重点)
            获取的页面源码数据:page_source
            - 特点:可见即可得
            - 缺点:效率低
    
    - selenium的具体使用
        - 准备浏览器的驱动程序:http://chromedriver.storage.googleapis.com/index.html
    - 动作链:ActionChains,一系列的行为动作
        - 使用流程:
            - 实例化一个动作链对象,需要将指定的浏览器和动作链对象进行绑定
            - 执行相关的连续的动作
            - perform()立即执行动作链制定好的动作
    

    滑动操作

    方式一:

    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver import ActionChains #动作链
    import time
    # Drag-and-drop demo, approach 1: a single drag_and_drop action.
    # NOTE(review): the driver path contains no separators; the backslashes were
    # probably lost when the article was copied -- confirm the real location.
    bro = webdriver.Chrome(executable_path=r'C:pycahrm文件chromedriver.exe')

    try:
        bro.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
        bro.implicitly_wait(3)

        # Switch into the iframe first; per the original note, the tags below
        # cannot be found without switching into it.
        bro.switch_to.frame('iframeResult')

        begin_tag = bro.find_element_by_id('draggable')  # element to drag (start position)
        end_tag = bro.find_element_by_id('droppable')  # element to drop onto (end position)

        actions = ActionChains(bro)  # action chain bound to this browser
        actions.drag_and_drop(begin_tag, end_tag)  # queue the action in the chain
        actions.perform()  # run the queued actions
        time.sleep(2)
    finally:
        # Always shut the browser down, even if one of the steps above raised.
        bro.quit()
    

    方式二:

    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver import ActionChains
    import time
    # Drag-and-drop demo, approach 2: press, move in fixed steps, release.
    # NOTE(review): the driver path contains no separators; the backslashes were
    # probably lost when the article was copied -- confirm the real location.
    bro = webdriver.Chrome(executable_path=r'C:pycahrm文件chromedriver.exe')

    try:
        bro.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
        bro.implicitly_wait(3)

        # Switch into the iframe that contains the draggable/droppable elements.
        bro.switch_to.frame('iframeResult')

        begin_tag = bro.find_element_by_id('draggable')
        end_tag = bro.find_element_by_id('droppable')

        # Click and hold the slider block at its starting position.
        ActionChains(bro).click_and_hold(begin_tag).perform()

        # Horizontal distance to cover.
        distance = end_tag.location['x'] - begin_tag.location['x']

        # Move right 50 pixels at a time until at least `distance` is covered;
        # the final step may overshoot the target by up to 49 pixels.
        moved = 0
        while moved < distance:
            ActionChains(bro).move_by_offset(xoffset=50, yoffset=0).perform()
            moved += 50

        # Sliding finished: release the block.
        ActionChains(bro).release().perform()

        time.sleep(2)
    finally:
        # Always shut the browser down, even if one of the steps above raised.
        bro.quit()
    
    

    基于selenium模拟登陆12306

    import requests
    from hashlib import md5
    
    class Chaojiying_Client(object):
        """Small HTTP client for the Chaojiying captcha-recognition endpoints."""

        def __init__(self, username, password, soft_id):
            """Store the account data and build the shared request fields.

            username: account name, sent as the 'user' form field
            password: plain-text password; only its MD5 hex digest is stored and sent
            soft_id: software id, sent as the 'softid' form field
            """
            self.username = username
            # Hash the UTF-8 encoded password; the digest goes into 'pass2'.
            self.password = md5(password.encode('utf8')).hexdigest()
            self.soft_id = soft_id
            # Form fields included in every request.
            self.base_params = dict(
                user=self.username,
                pass2=self.password,
                softid=self.soft_id,
            )
            self.headers = dict([
                ('Connection', 'Keep-Alive'),
                ('User-Agent', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)'),
            ])

        def PostPic(self, im, codetype):
            """Upload an image for recognition and return the decoded JSON reply.

            im: image bytes
            codetype: question type, see http://www.chaojiying.com/price.html
            """
            form_fields = dict({'codetype': codetype}, **self.base_params)
            upload = {'userfile': ('ccc.jpg', im)}
            response = requests.post(
                'http://upload.chaojiying.net/Upload/Processing.php',
                data=form_fields,
                files=upload,
                headers=self.headers,
            )
            return response.json()

        def ReportError(self, im_id):
            """Report a wrongly recognised image and return the decoded JSON reply.

            im_id: image ID of the question being reported
            """
            form_fields = dict({'id': im_id}, **self.base_params)
            response = requests.post(
                'http://upload.chaojiying.net/Upload/ReportError.php',
                data=form_fields,
                headers=self.headers,
            )
            return response.json()
    
    from selenium import webdriver
    from selenium.webdriver import ActionChains
    from time import sleep
    from PIL import Image #安装PIL或者是Pillow
    from CJY import Chaojiying_Client
    
    #封装一个识别验证码的函数
    def transformCode(imgPath,imgType):
        """Recognise the captcha image at imgPath via the Chaojiying service.

        imgPath: path of the image file to upload
        imgType: question type code passed to Chaojiying_Client.PostPic
        Returns the 'pic_str' field of the service's JSON reply.
        """
        chaojiying = Chaojiying_Client('username', 'password', '902590')
        # Read through a context manager so the file handle is always closed.
        with open(imgPath, 'rb') as img_file:
            im = img_file.read()
        return chaojiying.PostPic(im, imgType)['pic_str']
    
    
    # Simulated 12306 login: screenshot the page, crop the captcha, have it
    # recognised, click the returned points, then submit the credentials.
    # NOTE(review): the driver path contains no separators; the backslashes were
    # probably lost when the article was copied -- confirm the real location.
    bro = webdriver.Chrome(executable_path=r'C:pycahrm文件chromedriver.exe')

    try:
        bro.get('https://kyfw.12306.cn/otn/login/init')
        sleep(2)

        # Save a picture of the current browser page.
        bro.save_screenshot('./main.png')

        # Capture the captcha tag's position in the page so it can be cropped out.
        img_tag = bro.find_element_by_xpath('//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
        location = img_tag.location  # used below as the upper-left corner of the crop box
        size = img_tag.size  # width/height of the tag

        # Rectangle to crop: (left, upper, right, lower).
        # NOTE(review): assumes screenshot pixels map 1:1 to page coordinates
        # (no display scaling) -- confirm on the target machine.
        rangle = (int(location['x']),int(location['y']),int(location['x']+size['width']),int(location['y']+size['height']))

        # Crop the captcha region out of the screenshot and save it; the
        # context manager closes the screenshot file afterwards.
        with Image.open('./main.png') as screenshot:
            frame = screenshot.crop(rangle)
            frame.save('code.png')

        # Call the recognition platform to solve the captcha.
        result = transformCode('./code.png',9004)
        print(result) #x1,y1|x2,y2|x3,y3

        # 'x1,y1|x2,y2|x3,y3' ==> [[x1,y1],[x2,y2],[x3,y3]]
        # split('|') also yields a one-item list when there is no '|', so a
        # single point needs no separate branch.
        all_list = []
        for pair in result.split('|'):
            coords = pair.split(',')
            all_list.append([int(coords[0]), int(coords[1])])

        # Click every returned point, given as an offset relative to the captcha image.
        for x, y in all_list:
            ActionChains(bro).move_to_element_with_offset(img_tag,x,y).click().perform()
            sleep(1)

        bro.find_element_by_id('username').send_keys('xxxxxx')
        sleep(1)
        bro.find_element_by_id('password').send_keys('xxxx')
        sleep(1)

        bro.find_element_by_id('loginSub').click()

        sleep(10)
        print(bro.page_source)
    finally:
        # Always shut the browser down, even if one of the steps above raised.
        bro.quit()
    
    

    selenium规避风险

    规避检测
    from selenium import webdriver
    from selenium.webdriver import ChromeOptions
    # Build the Chrome options used for detection evasion: exclude the
    # 'enable-automation' switch.
    option = ChromeOptions()
    excluded_switches = ['enable-automation']
    option.add_experimental_option('excludeSwitches', excluded_switches)

    url = 'https://www.taobao.com/'

    bro = webdriver.Chrome(
        executable_path='./chromedriver.exe',
        options=option,
    )
    bro.get(url)

    # Author's note: when the request is issued by a crawler program, evaluating
    # the injected JS window.navigator.webdriver returns true.
    # On a normal visit to a page, the same JS returns undefined.
    

    无头浏览器

    #无头浏览器
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from time import sleep
    # Headless browser: no visible window is shown while the page is driven.
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')

    # Pass the options through the `options` keyword (the same keyword the
    # detection-evasion example in this article uses) rather than the
    # deprecated `chrome_options` keyword.
    bro = webdriver.Chrome(executable_path='./chromedriver.exe',options=chrome_options)
    try:
        url = 'https://www.taobao.com/'
        bro.get(url)
        sleep(2)
        bro.save_screenshot('123.png')

        print(bro.page_source)
    finally:
        # A headless browser has no window to close by hand, so always quit it
        # explicitly to avoid leaving the process running.
        bro.quit()
    
  • 相关阅读:
    MySQL Explain详解
    sql查询:存在A表而不在B表中的数据
    mybatis处理集合、数组参数使用in查询
    mysql日期范围查找(两个日期之间的记录)
    MYSQL查询数据表中某个字段包含某个数值
    springboot+jpa分页(Pageable+Page)
    MySQL单表能存储多少条数据?
    nosql几种热门数据库的优缺点及应用场景
    MySQL百万级数据分页查询及优化
    Redis cluster群集操作
  • 原文地址:https://www.cnblogs.com/tangjian219/p/11996707.html
Copyright © 2011-2022 走看看