zoukankan      html  css  js  c++  java
  • selenium模块在爬虫中的应用

    1. 相关概念

    1. selenium模块

      是一个基于浏览器自动化的模块

    2. 与爬虫之间的关联

      便捷的捕获到动态加载到的数据(可见即可得)

      实现模拟登陆

    3.环境安装

    pip3 install selenium

    简单演示

    from selenium import webdriver
    from time import sleep
    
    # Demo: drive Chrome through Baidu's settings dialog and then run a search.
    #
    # The argument is the path to the browser driver executable. The r'' raw-string
    # prefix keeps backslashes in a Windows path from being read as escapes.
    driver = webdriver.Chrome(r'chromedriver.exe')

    # Open the Baidu home page.
    driver.get("http://www.baidu.com")

    # Locate the "设置" (settings) link on the page and click it.
    driver.find_elements_by_link_text('设置')[0].click()
    sleep(2)

    # With the settings menu open, click "搜索设置" (search settings).
    driver.find_elements_by_link_text('搜索设置')[0].click()
    sleep(2)

    # Pick the third option of the element with id "nr" (per the original
    # author's note this is "50 results per page"). A single click is enough;
    # the XPath is relative ('.') so it is resolved against `m` itself rather
    # than against the whole document.
    m = driver.find_element_by_id('nr')
    sleep(2)
    m.find_element_by_xpath('.//option[3]').click()
    sleep(2)

    # Click the "save settings" button.
    driver.find_elements_by_class_name("prefpanelgo")[0].click()
    sleep(2)

    # Handle the alert that pops up: accept() confirms, dismiss() cancels.
    # `switch_to.alert` is the property form that replaces the deprecated
    # `switch_to_alert()` method.
    driver.switch_to.alert.accept()
    sleep(2)

    # Find Baidu's search box and type the query into it.
    driver.find_element_by_id('kw').send_keys('美女')
    sleep(2)

    # Click the search button.
    driver.find_element_by_id('su').click()
    sleep(2)

    # On the results page, open the first link whose text is "美女_百度图片".
    driver.find_elements_by_link_text('美女_百度图片')[0].click()
    sleep(3)

    # Close the browser and end the driver session.
    driver.quit()

    2.基本使用

      准备好某一款浏览器的驱动程序:http://chromedriver.storage.googleapis.com/index.html

      版本的映射关系:https://blog.csdn.net/huilan_same/article/details/51896672

    1. 访问京东网站,并搜索“苹果”

    from time import sleep
    from selenium import webdriver
    
    # Demo: open JD.com, search for "苹果", scroll to the bottom and dump the HTML.
    bro = webdriver.Chrome(executable_path="chromedriver.exe")

    # Navigate to the site. The host must be written with dots ("www.jd.com");
    # a colon after "www" would be parsed as a port separator and the request
    # would never reach JD.
    bro.get("https://www.jd.com/")
    sleep(2)

    # Locate the search input box.
    search_input = bro.find_element_by_id("key")

    # Type the keyword into the search box.
    search_input.send_keys("苹果")

    # Locate the search button.
    btn = bro.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')

    # Click the search button.
    btn.click()
    sleep(2)

    # Run JavaScript in the page to scroll to the bottom.
    bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    sleep(2)

    # Grab the page source as currently rendered by the browser and print it.
    page_text = bro.page_source
    print(page_text)

    # Close the browser.
    bro.quit()

    2.selenium爬取动态加载的数据

    from time import sleep
    from selenium import webdriver
    from lxml import etree
    
    # Demo: collect the rendered HTML of the first four result pages with
    # Selenium, then parse each snapshot with lxml.
    bro = webdriver.Chrome(executable_path="chromedriver.exe")

    bro.get("http://125.35.6.84:81/xk/")
    sleep(2)

    # Start with the page that is shown right after loading.
    page_text = bro.page_source
    page_text_list = [page_text]

    # Click the "next page" control three times, snapshotting after each click.
    for _ in range(3):
        next_button = bro.find_element_by_id("pageIto_next")
        next_button.click()
        sleep(2)
        page_text_list.append(bro.page_source)

    # Parse every snapshot and read the two title attributes of each list item.
    # NOTE(review): the extracted values are not stored or printed anywhere.
    for page_text in page_text_list:
        document = etree.HTML(page_text)
        for item in document.xpath('//ul[@id="gzlist"]/li'):
            title = item.xpath('./dl/@title')[0]
            num = item.xpath('./ol/@title')[0]

    sleep(2)
    bro.quit()

    3.动作链

      一系列连续的动作

      在实现标签定位时,如果发现定位的标签是存在于iframe标签中的,则在定位时必须执行一个固定的操作:bro.switch_to.frame('id')

    from selenium import webdriver
    from time import sleep
    from selenium.webdriver import ActionChains

    # Demo: drag an element with an action chain.
    bro = webdriver.Chrome(executable_path='chromedriver.exe')
    bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

    # The draggable element is inside an iframe, so the driver has to switch
    # into that frame before the element can be located.
    bro.switch_to.frame('iframeResult')
    div_tag = bro.find_element_by_id('draggable')

    # A drag is "press and hold" followed by a series of moves.
    action = ActionChains(bro)
    action.click_and_hold(div_tag)
    for i in range(5):
        # perform() makes the queued actions run immediately.
        action.move_by_offset(17, 5).perform()
        sleep(0.5)

    # release() only queues the mouse-up; perform() is needed to actually send it.
    action.release().perform()
    sleep(3)
    bro.quit()

    4.模拟12306登录

    超级鹰识别代码

    # Cjy.py
    
    import requests
    from hashlib import md5
    
    class Chaojiying_Client(object):
        """Minimal HTTP client for the Chaojiying captcha-recognition endpoints."""

        def __init__(self, username, password, soft_id):
            """Remember the account details and build the fields sent with every call.

            username: account name, sent as the 'user' form field.
            password: plain-text password; only the hex MD5 digest of its UTF-8
                      bytes is kept on the instance and sent as 'pass2'.
            soft_id:  software id, sent as the 'softid' form field.
            """
            digest = md5(password.encode('utf8')).hexdigest()

            self.username = username
            self.password = digest
            self.soft_id = soft_id

            # Form fields shared by every request.
            self.base_params = {
                'user': username,
                'pass2': digest,
                'softid': soft_id,
            }
            # HTTP headers shared by every request.
            self.headers = {
                'Connection': 'Keep-Alive',
                'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
            }

        def PostPic(self, im, codetype):
            """Upload an image for recognition and return the decoded JSON reply.

            im:       image content as bytes.
            codetype: captcha type code, see http://www.chaojiying.com/price.html
            """
            # 'codetype' goes first; the shared account fields are merged on top.
            form_fields = {'codetype': codetype, **self.base_params}
            upload = {'userfile': ('ccc.jpg', im)}
            response = requests.post(
                'http://upload.chaojiying.net/Upload/Processing.php',
                data=form_fields,
                files=upload,
                headers=self.headers,
            )
            return response.json()

        def ReportError(self, im_id):
            """Report a wrongly recognised image and return the decoded JSON reply.

            im_id: id of the image being reported.
            """
            form_fields = {'id': im_id, **self.base_params}
            response = requests.post(
                'http://upload.chaojiying.net/Upload/ReportError.php',
                data=form_fields,
                headers=self.headers,
            )
            return response.json()

    模拟登陆

    from selenium import webdriver
    from time import sleep
    from PIL import Image
    from selenium.webdriver import ActionChains
    from Cjy import Chaojiying_Client
    from selenium.webdriver import ActionChains
    
    # Open the 12306 login page and save a screenshot of the whole window.
    bro = webdriver.Chrome(executable_path='chromedriver.exe')
    bro.get('https://kyfw.12306.cn/otn/login/init')
    sleep(5)
    bro.save_screenshot('main.png')

    # Locate the captcha image and read its position and dimensions on the page.
    code_img_tag = bro.find_element_by_xpath('//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
    location = code_img_tag.location
    size = code_img_tag.size

    # Crop box as (left, upper, right, lower) in pixels.
    # NOTE(review): assumes screenshot pixels map 1:1 to page coordinates
    # (no display scaling) - confirm on the target machine.
    crop_box = (
        int(location['x']),
        int(location['y']),
        int(location['x'] + size['width']),
        int(location['y'] + size['height']),
    )

    # Cut the captcha out of the screenshot and save it on its own. The `with`
    # block closes the screenshot file once the cropped copy has been written.
    with Image.open('./main.png') as screenshot:
        screenshot.crop(crop_box).save('code.png')
    
    
    def get_text(imgPath, imgType):
        """Send an image file to Chaojiying and return the recognised text.

        imgPath: path of the image file to upload.
        imgType: captcha type code passed through to `PostPic`.

        Returns the 'pic_str' entry of the service's JSON reply.
        """
        # NOTE(review): account credentials are hard-coded here; consider
        # loading them from configuration instead.
        chaojiying = Chaojiying_Client('bobo328410948', 'bobo328410948', '899370')
        # Read the bytes inside a `with` block so the file handle is closed.
        with open(imgPath, 'rb') as image_file:
            im = image_file.read()
        return chaojiying.PostPic(im, imgType)['pic_str']
    
    
    # The service answers with click positions as 'x,y' pairs joined by '|',
    # e.g. '55,70|267,133' -> [[55, 70], [267, 133]].
    result = get_text('./code.png', 9004)

    # split('|') also covers a single-point answer (it yields a one-item list),
    # so one loop handles both cases.
    all_list = []
    for point in result.split('|'):
        x_text, y_text = point.split(',')[:2]
        all_list.append([int(x_text), int(y_text)])
    print(all_list)

    # Click every returned position on the captcha image. A fresh ActionChains
    # is built per click so each chain holds exactly one move-and-click.
    # NOTE(review): the offsets are treated as relative to the captcha image;
    # which point of the element they are measured from depends on the
    # installed Selenium version - confirm.
    for x, y in all_list:
        ActionChains(bro).move_to_element_with_offset(code_img_tag, x, y).click().perform()
        sleep(1)

    # Fill in the login form and submit it.
    bro.find_element_by_id('username').send_keys('123456')
    sleep(1)
    bro.find_element_by_id('password').send_keys('67890000000')
    sleep(1)
    bro.find_element_by_id('loginSub').click()

    sleep(5)
    bro.quit()

    爬取梨视频

    import requests
    from lxml import etree
    import re
    # Download every video linked from the pearvideo category page.
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
    }
    url = 'https://www.pearvideo.com/category_1'

    # Pattern that captures the video address embedded in a detail page.
    ex = 'srcUrl="(.*?)",vdoUrl'

    # Fetch the category page and collect its list items.
    page_text = requests.get(url, headers=headers).text
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//*[@id="listvideoListUl"]/li')

    for li in li_list:
        # Build the detail-page URL and the output file name from the list item.
        href = li.xpath('./div/a/@href')[0]
        caption = li.xpath('./div/a/div[2]/text()')[0]
        detail_url = 'https://www.pearvideo.com/' + href
        title = caption + '.mp4'

        # Pull the video address out of the detail page.
        detail_page_text = requests.get(detail_url, headers=headers).text
        matches = re.findall(ex, detail_page_text, re.S)
        video_url = matches[0]

        # Download the video bytes and write them to a file named after the title.
        video_data = requests.get(video_url, headers=headers).content
        with open(title, 'wb') as fp:
            fp.write(video_data)

    5. 移动端数据的爬取

    1. fiddler是一款抓包工具,代理服务器

      - 青花瓷

      - mitmproxy

      - 配置:让其可以抓取https协议的请求

      - tools -> options -> https -> 安装证书

    2. http:客户端和服务端进行数据交互的某种形式

      - https:安全的http协议

      - https的加密方式采用的是证书密钥加密。

    3.步骤

    1. 配置下fiddler的端口

    2. 将手机和fiddler所在的电脑处在同一个网段下(pc开启wifi,手机连接)

    3. 在手机中访问fiddler的ip+port:192.168.14.110:50816,在当前页面中点击对应的连接下载证书

    4. 在手机中安装且信任证书

    5. 设置手机网络的代理:开启代理==》fiddler对应pc端的ip地址和fiddler自己端口号

    待续

  • 相关阅读:
    北风设计模式课程---22、责任链模式
    bootstarp modal自己主动调整宽度的JS代码
    谷歌技术面试要点(Google面试)(14年5月20日交大专场)
    ASCII与Unicode编码消息写文件浅析
    程序编写中的细节问题
    Oracle使用并行建索引须要注意的问题
    PHP读取Excel里的文件
    Oracle db中 CONNECT role的含义
    集团信息化之路—电子採购软件与现有库存及財务软件数据对接的探讨
    NTP方式保证以时间戳同步可靠性
  • 原文地址:https://www.cnblogs.com/zangyue/p/12203214.html
Copyright © 2011-2022 走看看