zoukankan html css js c++ java

day98 爬虫 selenium

补充:python调用dll文件执行

跟华为合作，华为摄像头的硬件设备，windows，用软件

用python来调用，可以操控摄像头，给了他们一个文档（熟悉一门桌面开发语言第三方库，Tkinter，pyqt），

华为提供了sdk（别人帮咱写好的底包），只需要用python来调用，华为提供的不是用python写的sdk，

xxx.dll这种东西，动态链接库，c语言写的，（windows中每个软件都会有dll文件）

dll文件是用c语言写的（exe，dll：add，remove）

ret=对象.add( )

xpath选择器

# xpath：XML 路径查询语言，用于在 xml/html 文档中查找标签
# /  只在直接子节点中查找；//  在所有后代节点（子子孙孙）中查找
/body/p   body 的直接子节点 p
/body//p  body 下所有层级的 p（子子孙孙）

# xpath选择
# lxml解析库中的xpath讲解

from lxml import etree

# Sample HTML document used by the xpath examples below.
doc='''
<html>
 <head>
  <base href='http://example.com/' />
  <title>Example website</title>
 </head>
 <body>
  <div id='images'>
   <a href='image1.html' id='id_1'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
   <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
   <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
   <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
   <a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
   <a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a>
  </div>
 </body>
</html>
'''
# Option 1: build the element tree from an HTML string
html=etree.HTML(doc)
# Option 2: parse an HTML file from disk
# html=etree.parse('search.html',etree.HTMLParser())
# Basic usage (key point: each of these xpath() selections yields a list)
ret=html.xpath('//body/div/a')
# Selecting attributes vs. text
# Attribute values: append /@<attribute name>
ret=html.xpath('//body/div/a/@href')
# Text nodes: append /text()
ret=html.xpath('//body/div/a/text()')
# `ret` is reassigned by each query above, so only the text() result is printed
print(ret)


####------
#所有标签
# a=html.xpath('//*')
# 2 指定节点（结果为列表）
# a=html.xpath('//head')
# 3 子节点，子孙节点
# a=html.xpath('//div/a')
# a=html.xpath('//body/a') #无数据
# a=html.xpath('//body//a')
# 4 父节点

#  a[@href="image1.html"] 找a标签，a标签的href属性是image1.html
# find(name='a',href='image1.html')
# a=html.xpath('//body//a[@href="image1.html"]/..')
# a[1] body下的第一个a
# a=html.xpath('//body//a[1]/..')
# 也可以这样(了解)
# a=html.xpath('//body//a[1]/parent::*')
# 5 属性匹配
# a=html.xpath('//body//a[@href="image2.html"]/text()')
# a=html.xpath('//body//a[@href="image2.html"]')

# 6 文本获取
# a=html.xpath('//body//a[@href="image1.html"]/text()')

# 7 属性获取
# a=html.xpath('//body//a/@href')
# # 注意从1 开始取（不是从0）
# a=html.xpath('//body//a[1]/@href')
# 选最后一个
# a=html.xpath('//body//a[last()]/@href')
# 8 属性多值匹配
#  a 标签有多个class类，直接匹配就不可以了，需要用contains
# a=html.xpath('//body//a[@class="li"]')
# a=html.xpath('//body//a[contains(@class,"li")]')
# a=html.xpath('//body//a[contains(@class,"li")]/text()')
# 9 多属性匹配
# a=html.xpath('//body//a[contains(@class,"li") or @name="items"]')
# a=html.xpath('//body//a[contains(@class,"li") or @name="items"]/text()')
# a=html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')
# a=html.xpath('//body//a[contains(@class,"li")]/text()')
# 10 按序选择
# a=html.xpath('//a[2]/text()')
# a=html.xpath('//a[2]/@href')
# 取最后一个
# a=html.xpath('//a[last()]/@href')
# 位置小于3的
# a=html.xpath('//a[position()<3]/@href')
# 倒数第三个（last()-1 才是倒数第二个）
# a=html.xpath('//a[last()-2]/@href')
# 11 节点轴选择
# ancestor：祖先节点
# 使用了* 获取所有祖先节点
# a=html.xpath('//a/ancestor::*')
# # 获取祖先节点中的div
# a=html.xpath('//a/ancestor::div')
# attribute：属性值
# a=html.xpath('//a[1]/attribute::*')
# child：直接子节点
# a=html.xpath('//a[1]/child::*')
# descendant：所有子孙节点
# a=html.xpath('//a[6]/descendant::*')
# following:当前节点之后所有节点
# a=html.xpath('//a[1]/following::*')
# a=html.xpath('//a[1]/following::*[1]/@href')
# following-sibling:当前节点之后同级节点
# a=html.xpath('//a[1]/following-sibling::*')
# a=html.xpath('//a[1]/following-sibling::a')
# a=html.xpath('//a[1]/following-sibling::*[2]/text()')
# a=html.xpath('//a[1]/following-sibling::*[2]/@href')
# print(a)


# //*[@id="key"]
#//*[@id="settleup"]/div[1]
#/html/body/div[1]/div[4]/div/div[2]/div/div[3]/div[1]

##settleup > div.cw-icon

# css选择器和xpath选择器都可以直接copy

selenium模块使用

自动化测试工具：控制浏览器，像人一样操作，用在爬虫中，执行js
可见即可爬
使用（本质，并不是python在操作浏览器，而是python在操作浏览器驱动（xx.exe），浏览器驱动来驱动浏览器）
	0 以驱动谷歌浏览器为例子（建议你使用谷歌，最适合）找谷歌浏览器驱动（得翻墙，国内镜像：http://npm.taobao.org/mirrors/chromedriver）
	0 如果是Windows，解压之后是个exe，不同平台就是不同平台的可执行文件
	1 安装模块pip3 install selenium
    2 需要浏览器驱动（ie、火狐、谷歌浏览器），驱动得匹配（浏览器类型、浏览器版本都要跟驱动匹配）
    3 写代码
    	1实例化
        bro=webdriver.Chrome(executable_path='D:pycharm爬虫day03chromedriver.exe')
    	2 发送请求
        bro.get('https://www.baidu.com/')
        3 打印加载完成的（js执行完成）html
        bro.page_source
        4 找控件 （自己的一堆方法  css xpath）
        5 向控件中写入数据
        send_keys('')
        6 点击控件
        click
        7 清空控件
        clear
        8 显式等待和隐式等待（都做那个隐式等待）
        bro.implicitly_wait(10)
        9 获取cookie
        bro.get_cookies()
        10 补充：
        find_elements_xxx和find_element_xx 一个是找所有，一个是找到第一个
        其他操作：
        模拟浏览器后退、前进：bro.back()  bro.forward()
        -异常处理，在finally中关闭浏览器（不管是否发生异常都关闭）
        -补充：滑动浏览器（执行js）window.scrollTo(0,document.body.scrollHeight)
        browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        -动作链和截图和切换frame（了解）

selenium的简单使用

# selenium的使用
from selenium import webdriver
import time

# executable_path: location of the browser driver executable. According to the
# original notes it may be omitted, in which case the driver is looked up in the
# project root or the Python install's Scripts folder.
# NOTE(review): the path below looks like it lost its backslashes
# (e.g. D:\pycharm\...\chromedriver.exe) — confirm and adjust for your machine.
# Creating the driver opens a browser window.
bro=webdriver.Chrome(executable_path='D:pycharm爬虫day03chromedriver.exe')
# Equivalent to typing the Baidu address into the browser
bro.get('https://www.baidu.com/')

# Keep the window open for a few seconds so the page can be seen
time.sleep(5)
# Close the browser window
bro.close()

selenium的其他用法

#selenium的其他用法
from selenium import webdriver
import time
#键盘按键操作
from selenium.webdriver.common.keys import Keys
# 1 Fetch the page content
# NOTE(review): the driver path looks like it lost its backslashes — confirm.
bro=webdriver.Chrome(executable_path='D:pycharm爬虫day03chromedriver.exe')
try:
    bro.get('https://www.baidu.com/')

    # page_source is the HTML of the page currently loaded in the browser
    print(bro.page_source)
finally:
    # `bro` is rebound to a new browser further down; close this one first so
    # it is not left running.
    bro.close()

# 2 Run the browser headless (no visible window); Chrome is used here
from selenium.webdriver.chrome.options import Options
# Options collects the command-line switches passed to Chrome
chrome_options = Options()
chrome_options.add_argument('window-size=1920x3000') # browser resolution
chrome_options.add_argument('--disable-gpu') # per the original notes: recommended by Google's docs to work around a bug
chrome_options.add_argument('--hide-scrollbars') # hide scrollbars, for some special pages
chrome_options.add_argument('blink-settings=imagesEnabled=false') # do not load images, for speed
chrome_options.add_argument('--headless') # no visible UI; per the original notes, required on Linux systems without a display


# NOTE(review): the driver path looks like it lost its backslashes — confirm.
driver=webdriver.Chrome(executable_path='D:pycharm爬虫day03chromedriver.exe',chrome_options=chrome_options)
try:
    driver.get('https://www.baidu.com/')
    print(driver.page_source)
finally:
    # A headless browser has no window to close by hand, so always shut it
    # down explicitly once the page source has been printed.
    driver.close()

# 3 Locate elements on the page, then simulate typing and clicking
# NOTE(review): the driver path looks like it lost its backslashes — confirm.
bro=webdriver.Chrome(executable_path='D:pycharm爬虫day03chromedriver.exe')
try:
    bro.get('https://www.baidu.com/')

    # =============== Element lookup methods ===============
    # Built-in locators
    # 1. find_element_by_id                 : by id
    # 2. find_element_by_link_text          : by the exact text of a hyperlink
    # 3. find_element_by_partial_link_text  : by part of a hyperlink's text
    # 4. find_element_by_tag_name           : by tag name
    # 5. find_element_by_class_name         : by class name
    # 6. find_element_by_name               : by the name attribute (name='sss')
    # CSS selector
    # 7. find_element_by_css_selector
    # XPath selector
    # 8. find_element_by_xpath

    # 3.1 Find the search box
    input_search=bro.find_element_by_id('kw')
    # input_search=bro.find_element_by_xpath('//*[@id="kw"]')
    # 3.2 Type text into the box
    input_search.send_keys('美女')
    time.sleep(3)
    # 3.3 Clear the text, then type it again
    input_search.clear()
    time.sleep(2)
    input_search.send_keys('美女')
    # 3.4 Simulate pressing Enter to submit the search
    input_search.send_keys(Keys.ENTER)
    time.sleep(2)
    # 3.5 Browser history navigation: bro.back() then bro.forward()
    bro.back()
    time.sleep(2)
    bro.forward()
finally:
    # `bro` is rebound to a new browser in the next section; close this one
    # (even on error) so it is not left running.
    bro.close()

# 4 Log in to Baidu
# NOTE(review): the driver path looks like it lost its backslashes — confirm.
bro=webdriver.Chrome(executable_path='D:pycharm爬虫day03chromedriver.exe')
bro.get('https://www.baidu.com/')
# Waiting for elements to load: there are explicit and implicit waits.
# The implicit wait applies to every element lookup: if the element has not
# appeared yet, the lookup waits up to 10s. The original author considers
# explicit waits too verbose (they must be written per element) and uses
# implicit waits throughout.
bro.implicitly_wait(10)

# Find the login link (an <a> tag whose text is '登录')
a=bro.find_element_by_link_text('登录')

# Click it
a.click()

# Switch to the "log in with username" option
user_login=bro.find_element_by_id('TANGRAM__PSP_10__footerULoginBtn')
user_login.click()

# Find the username box, the password box and the submit button
# NOTE(review): credentials are hard-coded in the source — move them to
# configuration/environment before reusing this code.
username=bro.find_element_by_id('TANGRAM__PSP_10__userName')
username.send_keys('1158611093')
pwd=bro.find_element_by_id('TANGRAM__PSP_10__password')
pwd.send_keys('oldboy@666')
submit=bro.find_element_by_id('TANGRAM__PSP_10__submit')
submit.click()

# Give the login request time to finish
time.sleep(10)
# Once login has succeeded the session cookies can be read
print(bro.get_cookies())
bro.close()

选项卡管理

import time
from selenium import webdriver

# NOTE(review): the driver path looks like it lost its backslashes — confirm.
browser=webdriver.Chrome(executable_path='D:pycharm爬虫day03chromedriver.exe')
browser.get('https://www.baidu.com')
# Arbitrary JavaScript can be executed in the page
# browser.execute_script('alert("hello world")')


# Open a second tab via JavaScript
browser.execute_script('window.open()')

print(browser.window_handles) # handles of all open tabs
# Switch to the second tab. switch_to.window is the non-deprecated spelling of
# switch_to_window and matches the switch_to.frame style used elsewhere.
browser.switch_to.window(browser.window_handles[1])
browser.get('https://www.taobao.com')
time.sleep(3)
# Switch back to the first tab
browser.switch_to.window(browser.window_handles[0])
browser.get('https://www.sina.com.cn')


time.sleep(3)
# Two tabs are open at this point; close() would only close the focused one
# and leave the browser running, so quit() is used to end the whole session.
browser.quit()

爬取京东商品信息

#爬取京东商品信息
from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys#键盘按键操作
# Open the JD home page.
# NOTE(review): the driver path looks like it lost its backslashes — confirm.
bro=webdriver.Chrome(executable_path='D:pycharm爬虫day03chromedriver.exe')
bro.get('https://www.jd.com/')
# Implicit wait: element lookups wait up to 10s for the element to appear
bro.implicitly_wait(10)


def get_goods(bro):
    """Print image URL, price, name and comment text for every product, page after page.

    Starting from the results page currently loaded in ``bro``, every element
    with class ``gl-item`` is inspected and four lines are printed per product
    (image URL, price, product name, comment text). The link whose text
    contains '下一页' is then clicked and the next page is processed.

    Args:
        bro: a selenium WebDriver positioned on a search-results page (any
            object offering ``find_elements_by_class_name`` and
            ``find_element_by_partial_link_text`` works).

    Raises:
        Whatever ``bro`` raises when the next-page link cannot be found. This
        is the only way the loop ends, so callers are expected to catch it.
    """
    # Iterate instead of recursing: a recursive call per page adds a stack
    # frame for every page visited and has no base case.
    while True:
        for li in bro.find_elements_by_class_name('gl-item'):
            # Image URL: prefer src; fall back to the lazy-load attribute,
            # which is stored without the scheme.
            img=li.find_element_by_css_selector('.p-img img')
            img_url=img.get_attribute('src')
            if not img_url:
                lazy_url=img.get_attribute('data-lazy-img')
                # Guard against both attributes being absent instead of
                # failing on 'https:' + None.
                img_url='https:'+lazy_url if lazy_url else ''
            print(img_url)
            # The image could be downloaded locally with requests from here.
            # Price: text of the <i> inside .p-price
            print(li.find_element_by_css_selector('.p-price i').text)
            # Product name
            print(li.find_element_by_css_selector('.p-name em').text)
            # Comment text
            print(li.find_element_by_css_selector('.p-commit a').text)
        # Locate the next-page control, pause briefly, then move on
        next_page=bro.find_element_by_partial_link_text('下一页')
        time.sleep(1)
        next_page.click()
try:
    # Find the search box (element id 'key'), type the query and submit it
    input_search=bro.find_element_by_id('key')
    input_search.send_keys('macpro')
    # Press Enter
    input_search.send_keys(Keys.ENTER)
    # print(bro.page_source)
    get_goods(bro)
except Exception as e:
    # NOTE(review): get_goods has no normal return path, so it presumably
    # always ends here (e.g. when no next-page link is found) — confirm.
    print(e)
finally:
    # Close the browser window whether or not an error occurred
    bro.close()

动作链

# 动作链
from selenium import webdriver
from selenium.webdriver import ActionChains#动作练
import time

# NOTE(review): the driver path looks like it lost its backslashes — confirm.
bro=webdriver.Chrome(executable_path='D:pycharm爬虫day03chromedriver.exe')
bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
bro.implicitly_wait(10)

# Switch into the frame that contains the demo (rarely needed)
bro.switch_to.frame('iframeResult')
div=bro.find_element_by_xpath('//*[@id="draggable"]')

# Build an action chain; actions are only queued until perform() is called
action=ActionChains(bro)

# Press the mouse button on the div and keep it held
action.click_and_hold(div)
# Alternative: move the held div directly onto another element
# action.move_to_element(target_element)

# Drag by an (x, y) offset, five small steps
for i in range(5):
    action.move_by_offset(10,10)

# Alternative: move to a given offset inside another element
# action.move_to_element_with_offset()

# Execute the queued actions — this is when the div actually moves
action.perform()
time.sleep(1)
# Release the mouse button. Queuing release() without perform() never runs it,
# and a fresh chain is used so the press-and-drag steps are not replayed.
ActionChains(bro).release().perform()
time.sleep(5)
bro.close()

自动登录12306获取cookie，并发送请求

# 自动登录12306
# pip3 install pillow

from PIL import Image#抠图
from chaojiying import Chaojiying_Client
from selenium import webdriver
from selenium.webdriver import ActionChains
import time
import json
import requests

# Open the 12306 login page.
# NOTE(review): the driver path looks like it lost its backslashes — confirm.
bro=webdriver.Chrome(executable_path='D:pycharm爬虫day03chromedriver.exe')
bro.get('https://kyfw.12306.cn/otn/login/init')
bro.implicitly_wait(10)
# Approach used here: take a screenshot of the whole page, then crop the
# captcha image out of it with an imaging library.
# bro.minimize_window()  # minimise
bro.maximize_window() # maximise
time.sleep(1)
# save_screenshot writes the full screenshot to main.png

bro.save_screenshot('main.png')
# Locate the captcha <img> element
tag_code=bro.find_element_by_css_selector('#loginForm > div > ul:nth-child(2) > li.dl.captchaButton > div > div > div.touclick-img-par.touclick-bgimg > img')
print(tag_code)

# Inspect the element's size and position on the page
size=tag_code.size# size (width/height)
location=tag_code.location# position (x/y)
print(size)
print(location)
# NOTE(review): presumably needed when the display is scaled to 125%, so that
# element coordinates line up with screenshot pixels — confirm.
# location['x']=1.25*location['x']
# location['y']=1.25*location['y']
# size['width']=1.25*size['width']
# size['height']=1.25*size['height']
# Crop box as (left, top, right, bottom)
img_tu = (int(location['x']) ,int(location['y']) ,int(location['x' ] +size['width']) ,int(location['y' ] +size['height']))
print(img_tu)
# Cut the captcha out of the screenshot
# Open the full screenshot
img=Image.open('./main.png')
# Crop to the captcha region
fram=img.crop(img_tu)
# Save the cropped captcha image
fram.save('code.png')

# 调用超级鹰
def get_result():
    """Submit code.png to the Chaojiying captcha service and return its answer.

    Returns:
        The ``'pic_str'`` entry of the service response. Per the surrounding
        notes this is expected to look like ``"260,133"`` or
        ``"260,133|123,233"`` — NOTE(review): confirm against the service docs.
    """
    # NOTE(review): account name, password and software id are hard-coded —
    # move them to configuration before reusing this code.
    chaojiying = Chaojiying_Client('17356530633', '17356530633zqf', '903673')  # third argument: software id generated in the user centre
    # Read the cropped captcha image; the context manager closes the file
    with open('code.png', 'rb') as f:
        im = f.read()
    # 9004 is the captcha type code. Call the service once and reuse the
    # response: a second PostPic call would submit the image again and could
    # return a different answer from the one printed.
    response = chaojiying.PostPic(im, 9004)
    print(response)
    return response['pic_str']

# The result may hold several points, e.g. "260,133|123,233"; convert it to
# nested lists: [[260,133],[123,233]]
result=get_result()
# split('|') yields a single piece when no '|' is present, so the one-point
# and many-point cases need no separate branches.
pairs=(point.split(',') for point in result.split('|'))
all_list=[[int(pair[0]),int(pair[1])] for pair in pairs]
print(all_list)

# Click each recognised point on the captcha image
for x, y in all_list:
    # One short chain per point: move to the (x, y) offset relative to the
    # captcha element, click, and execute immediately
    ActionChains(bro).move_to_element_with_offset(tag_code,x,y).click().perform()
    time.sleep(1)
# Fill in the login form and submit it.
# NOTE(review): credentials are hard-coded — move them to configuration.
bro.find_element_by_id('username').send_keys('17356530633')
bro.find_element_by_id('password').send_keys('ZQF05020710')
bro.find_element_by_id('loginSub').click()

# Read the cookies from the browser session, then use the requests module to
# call an API with those cookies attached
c=bro.get_cookies()
print(c)

# Persist the cookies to xxx.json
with open('xxx.json','w') as f:
    json.dump(c,f)

# Load them back. (The mapping below could equally be built straight from `c`;
# the file round-trip shows how saved cookies can be reused.)
with open('xxx.json', 'r') as f:
    di = json.load(f)

# Keep only each cookie's name and value, the {name: value} form passed to requests
cookies = {cookie['name']: cookie['value'] for cookie in di}


print('---------------------------------')
print(cookies)
headers={
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
    # Referer must be a well-formed URL with no embedded spaces
    'Referer': 'https://kyfw.12306.cn/otn/view/index.html',
    'Origin': 'https://kyfw.12306.cn',
    'Host': 'kyfw.12306.cn',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
}
res=requests.post('https://kyfw.12306.cn/otn/index/initMy12306Api',headers=headers,cookies=cookies)
print(res.text)

cookie池讲解

# Cookie pool idea: log in with selenium, store the resulting cookies in redis,
# and serve them through a flask app that hands out one cookie per request

# NOTE(review): placeholder data, presumably account name -> cookie value — confirm
dic={'lqz':'xxx','egon':'eee','json':'ttt'}

fiddler抓包工具的简单使用

# 抓包工具的使用（fiddler，charles）
# 6 fiddler的使用：http://101.133.225.166:8088/
#下载地址：https://www.telerik.com/fiddler
#  https://telerik-fiddler.s3.amazonaws.com/fiddler/FiddlerSetup.exe
# 双击安装，一路下一步
# 桌面上绿色的东西
# 双击打开(中文汉化版)

# fiddler 也可以抓手机的包：手机 app 发出的基本都是 http 请求，只需在手机端把代理配置为装有 fiddler 的那台机器的地址
# 手机跟电脑要连到同一个路由器，手机端的代理（网关）填写你这台电脑的地址

查看全文

相关阅读:
配置SecondaryNameNode
hadoop 根据secondary namenode恢复namenode
Hadoop如何修改HDFS文件存储块大小
 hadoop1.2.1 datanode 由于权限无法启动 expected: rwxr-xr-x
CentOS 7 下，如何设置DNS服务器
 Eclipse+pydev环境搭建
 Python numpy
Leetcode#54 Spiral Matrix
Leetcode#53 Maximum Subarray
Leetcode#40 Combination Sum II

原文地址：https://www.cnblogs.com/zqfzqf/p/12392293.html