zoukankan html css js c++ java

Day 03

昨日回顾:
    一 爬取豆瓣电影TOP250
        1.爬取电影页
        2.解析提取电影信息
        3.保存数据

    二 Selenium请求库
        驱动浏览器往目标网站发送请求，获取响应数据。
        - 不需要分析复杂的通信流程
        - 执行js代码
        - 获取动态数据


    三 selenium使用
        driver = webdriver.Chrome()  打开驱动浏览器
        # 隐式等待
        driver.get('网站')  往某个网站发送请求
        # 显式等待
        driver.close()

    四 选择器
        element: 查找一个
        elements: 查找多个

        by_id
        by_class_name
        by_name
        by_link_text
        by_partial_link_text
        by_css_selector

今日内容:
    一 Selenium剩余部分
    二 BeautifulSoup4 解析库

    一 Selenium剩余部分
        1.元素交互操作:
            - 点击、清除
                click
                clear

            - ActionChains
                是一个动作链对象，需要把driver驱动传给它。
                动作链对象可以操作一系列设定好的动作行为。

            - iframe的切换
                driver.switch_to.frame('iframeResult')

            - 执行js代码
                execute_script()
    二 BeautifulSoup4 解析库（+ re模块 > selenium）
        BS4

        1.什么是BeautifulSoup？
            bs4是一个解析库，可以通过某种(解析器)来帮我们提取想要的数据。

        2.为什么要使用bs4？
            因为它可以通过简洁的语法快速提取用户想要的数据内容。

        3.解析器的分类
            - lxml
            - html.parser

        4.安装与使用
            - 遍历文档树
            - 搜索文档树



补充知识点:

 1     数据格式:
 2 
 3     json数据:
 4     {
 5     "name": "tank"
 6     }
 7 
 8     XML数据:
 9     <name>tank</name>
10 
11     HTML:
12     <html></html>

生成器: yield 值（把值放进生成器中）

 1     def f():
 2         # return 1
 3         yield 1
 4         yield 2
 5         yield 3
 6 
 7     g = f()
 8     print(g)
 9 
10     for line in g:
11         print(line)

01❤元素交互操作

 1 from selenium import webdriver  # 用来驱动浏览器的
 2 from selenium.webdriver.common.keys import Keys  # 键盘按键操作
 3 import time
 4 driver = webdriver.Chrome(r'E:Python驱动浏览器chromedriver.exe')
 5 
 6 try:
 7     driver.implicitly_wait(10)
 8 
 9     driver.get('https://www.jd.com/')
10 
11     input1 = driver.find_element_by_id('key')
12     input1.send_keys('剑网3')
13     search_button = driver.find_element_by_class_name('button')
14     search_button.click()
15 
16     time.sleep(1)
17 
18     # 清空
19     input2 = driver.find_element_by_class_name('text')
20     input2.clear()
21     input2.send_keys('剑网3花萝')
22     input2.send_keys(Keys.ENTER)
23 
24     time.sleep(10)
25 
26 finally:
27     driver.close()

View Code

02❤自动完成滑块验证码

 1 from selenium import webdriver
 2 from selenium.webdriver import ActionChains
 3 import time
 4 driver = webdriver.Chrome(r'E:Python驱动浏览器chromedriver.exe')
 5 try:
 6     driver.implicitly_wait(10)
 7     driver.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
 8     time.sleep(5)
 9 
10     driver.switch_to.frame('iframeResult')
11     time.sleep(1)
12 
13     # 起始方块id：draggable
14     source = driver.find_element_by_id('draggable')
15     # 目标方块：droppable
16     target = driver.find_element_by_id('droppable')
17 
18     # print(source.size)    #大小
19     # print(source.tag_name)#标签名
20     # print(source.text)    #文本
21     # print(source.location)#坐标：x、y轴
22 
23     # 找到滑块距离
24     distance = target.location['x'] - source.location['x']
25     # 摁住起始滑块
26     ActionChains(driver).click_and_hold(source).perform()
27     # 方式二：一点一点移动
28     s = 0
29     while s < distance:
30         # 获取动作链对象
31         # 每一次位移s距离
32         ActionChains(driver).move_by_offset(xoffset=2,yoffset=0).perform()
33         s +=2
34         time.sleep(0.1)
35     # 松开起始滑块
36     ActionChains(driver).release().perform()
37     time.sleep(10)
38 
39 finally:
40     driver.close()

View Code

03❤主页弹窗内容更改

from selenium import webdriver  # web driver
import time

# NOTE(review): the separators in this path look lost (presumably
# E:\...\chromedriver.exe) -- point it at your local chromedriver.
driver = webdriver.Chrome(r'E:Python驱动浏览器chromedriver.exe')
try:
    driver.implicitly_wait(10)

    driver.get('https://www.baidu.com/')
    # Run JavaScript in the page: pop up an alert dialog.
    driver.execute_script(
        '''
        alert("花间天下第一")
        '''
    )
    time.sleep(10)
finally:
    # quit() ends the whole WebDriver session; close() would only close the
    # current window.
    driver.quit()

View Code

04❤模拟浏览器的前进后退

 1 #模拟浏览器的前进后退
 2 import time
 3 from selenium import webdriver
 4 
 5 browser = webdriver.Chrome(r'E:Python驱动浏览器chromedriver.exe')
 6 browser.get('https://www.baidu.com')
 7 browser.get('https://www.taobao.com')
 8 browser.get('http://www.sina.com.cn/')
 9 
10 browser.back()
11 time.sleep(10)
12 browser.forward()
13 browser.close()

View Code

05❤自动爬取指定的京东商品信息

 1 '''
 2 初级版：普普通通
 3 '''
 4 from selenium import webdriver
 5 from selenium.webdriver.common.keys import Keys  # 键盘按键操作
 6 import time
 7 
 8 driver = webdriver.Chrome(r'E:Python驱动浏览器chromedriver.exe')
 9 try:
10     driver.implicitly_wait(10)
11     driver.get('https://www.jd.com/')
12 
13     input1 = driver.find_element_by_id('key')
14     input1.send_keys('剑网3花萝')
15     input1.send_keys(Keys.ENTER)
16 
17     time.sleep(5)
18     num = 1
19     good_list = driver.find_elements_by_class_name('gl-item')
20     for good in good_list:
21         # 商品名称
22         good_name = good.find_element_by_css_selector('.p-name em').text
23         # 商品链接
24         good_url = good.find_element_by_css_selector('.p-name a').get_attribute('href')
25         # 商品价格
26         good_price = good.find_element_by_class_name('p-price').text
27         # 商品评价
28         good_commit = good.find_element_by_class_name('p-commit').text
29 
30         good_content = f'''
31         第{num}个
32         商品名称：{good_name}
33         商品链接：{good_url}
34         商品价格：{good_price}
35         商品评价：{good_commit}
36         '''
37         print(good_content)
38         with open('jd.txt','a',encoding='utf-8') as f:
39             f.write(good_content)
40             num += 1
41 finally:
42     driver.close()

View Code

 1 '''
 2 中级版：增加下拉+下一页
 3 '''
 4 import time
 5 from selenium import webdriver
 6 from selenium.webdriver.common.keys import Keys
 7 
 8 driver = webdriver.Chrome(r'E:Python驱动浏览器chromedriver.exe')
 9 
10 num = 1
11 
12 try:
13     driver.implicitly_wait(10)
14     # 往京东发送请求
15     driver.get('https://www.jd.com/')
16 
17     # 往京东主页输入框输入墨菲定律，按回车键
18     input_tag = driver.find_element_by_id('key')
19     input_tag.send_keys('剑网3炮太')
20     input_tag.send_keys(Keys.ENTER)
21 
22     time.sleep(5)
23 
24     # 下拉滑动5000px
25     js_code = '''
26         window.scrollTo(0, 5000)
27     '''
28 
29     driver.execute_script(js_code)
30 
31     # 等待5秒，待商品数据加载
32     time.sleep(5)
33 
34     good_list = driver.find_elements_by_class_name('gl-item')
35     for good in good_list:
36         # 商品名称
37         good_name = good.find_element_by_css_selector('.p-name em').text
38         # 商品链接
39         good_url = good.find_element_by_css_selector('.p-name a').get_attribute('href')
40         # 商品价格
41         good_price = good.find_element_by_class_name('p-price').text
42         # 商品评价
43         good_commit = good.find_element_by_class_name('p-commit').text
44 
45         good_content = f'''
46         num: {num}
47         商品名称: {good_name}
48         商品链接: {good_url}
49         商品价格: {good_price}
50         商品评价: {good_commit}
51         '''
52         print(good_content)
53 
54         with open('jd.txt', 'a', encoding='utf-8') as f:
55             f.write(good_content)
56         num += 1
57 
58     # 找到下一页并点击
59     next_tag = driver.find_element_by_class_name('pn-next')
60     next_tag.click()
61 
62     time.sleep(10)
63 
64 finally:
65     driver.close()

View Code

 1 '''
 2 狂暴版:加载所有指定商品
 3 '''
 4 import time
 5 from selenium import webdriver
 6 from selenium.webdriver.common.keys import Keys
 7 
 8 
 9 def get_good(driver):
10     num = 1
11     try:
12         time.sleep(5)
13 
14         # 下拉滑动5000px
15         js_code = '''
16             window.scrollTo(0, 5000)
17         '''
18         driver.execute_script(js_code)
19 
20         # 等待5秒，待商品数据加载
21         time.sleep(5)
22         good_list = driver.find_elements_by_class_name('gl-item')
23         for good in good_list:
24             # 商品名称
25             good_name = good.find_element_by_css_selector('.p-name em').text
26             # 商品链接
27             good_url = good.find_element_by_css_selector('.p-name a').get_attribute('href')
28             # 商品价格
29             good_price = good.find_element_by_class_name('p-price').text
30             # 商品评价
31             good_commit = good.find_element_by_class_name('p-commit').text
32 
33             good_content = f'''
34             num: {num}
35             商品名称: {good_name}
36             商品链接: {good_url}
37             商品价格: {good_price}
38             商品评价: {good_commit}
39             

40             '''
41             print(good_content)
42             with open('jd.txt', 'a', encoding='utf-8') as f:
43                 f.write(good_content)
44             num += 1
45 
46         print('商品信息写入成功!')
47 
48         # 找到下一页并点击
49         next_tag = driver.find_element_by_class_name('pn-next')
50         next_tag.click()
51 
52         time.sleep(5)
53         # 递归调用函数本身
54         get_good(driver)
55 
56     finally:
57         driver.close()
58 
59 
if __name__ == '__main__':
    # NOTE(review): the separators in this path look lost (presumably
    # E:\...\chromedriver.exe) -- point it at your local chromedriver.
    driver = webdriver.Chrome(r'E:Python驱动浏览器chromedriver.exe')
    try:
        driver.implicitly_wait(10)
        # Open the JD home page.
        driver.get('https://www.jd.com/')
        # Type the keyword into the home page search box and press Enter.
        input_tag = driver.find_element_by_id('key')
        input_tag.send_keys('剑网3伞萝')
        input_tag.send_keys(Keys.ENTER)

        # Scrape the product information.
        get_good(driver)

    finally:
        # quit() ends the whole WebDriver session; close() would only close
        # the current window.
        driver.quit()

View Code

06❤bs4的安装与使用

注意：安装命令是 pip3 install ***

 1 '''
 2 安装解析器：
 3 pip install lxml
 4 安装解析库：
 5 pip install bs4
 6 
 7 注意: 如何初始文本内有换行，也会算在里面。（坑）
 8 '''
 9 
10 html_doc = """
11 <html><head><title>The Dormouse's story</title></head>
12 <body>
13 <p class="sister"><b>$37</b></p>
14 <p class="story" id="p">Once upon a time there were three little sisters; and their names were
15 <a href="http://example.com/elsie" class="sister" >Elsie</a>,
16 <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
17 <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
18 and they lived at the bottom of a well.</p>
19 
20 <p class="story">...</p>
21 """
22 
23 from bs4 import BeautifulSoup
24 # python自带的解析库
25 # soup = BeautifulSoup(html_doc,'html.parser')
26 
27 # 调用bs4得到一个soup对象
28 # 第一个参数是解析文本
29 # 第二个参数是解析器
30 soup = BeautifulSoup(html_doc, 'lxml')
31 
32 # 具备自动补全html标签功能
33 print(soup)
34 
35 # bs4类型
36 print(type(soup))
37 # 美化html便签
38 html = soup.prettify()
39 print(html)

View Code

07❤bs4解析库之遍历文档树

 1 from bs4 import BeautifulSoup
 2 
 3 # 注意: 如何初始文本内有换行，也会算在里面。（坑）
 4 html_doc = """
 5 <html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<a href="http://example.com/elsie" class="sister" >Elsie</a><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>and they lived at the bottom of a well.</p><p class="story">...</p>
 6 """
 7 
 8 
 9 # 第一个参数是解析文本
10 # 第二个参数是解析器
11 soup = BeautifulSoup(html_doc, 'lxml')
12 # 遍历文档树
13 # 1、直接选择标签  *****
14 # （返回的是一个对象）
15 print(soup.html)
16 print(type(soup.html))
17 print(soup.a)  # 获取第一个a标签
18 print(soup.p)  # 获取第一个p标签
19 
20 # 2、获取标签的名称
21 print(soup.a.name)  # 获取a标签的名字
22 
23 # 3、获取标签的属性 *****
24 print(soup.a.attrs)  # 获取a标签内所有的属性
25 print(soup.a.attrs['href']) # 获取a标签内的href属性
26 
27 # 4、获取标签的文本内容 *****
28 print(soup.p.text)  #￥37
29 
30 # 5、嵌套选择标签
31 print(soup.p.b)  # 获取第一个p标签内的b标签
32 print(soup.p.b.text)  # 打印b标签内的文本
33 
34 # 6、子节点、子孙节点
35 # 获取子节点
36 print(soup.p.children)  # 获取第一个p标签所有的子节点，返回的是一个迭代器
37 print(list(soup.p.children))  # list转成列表
38 
39 # 获取子孙节点
40 print(soup.body.descendants)  # 获取body标签内所有的子孙节点，返回的是一个生成器
41 print(list(soup.body.descendants))  # list转成列表
42 
43 # 获取第一个p标签中所有的内容，返回的是一个列表
44 print(soup.p.contents)
45 
46 # 7、父节点、祖先节点
47 # 获取父节点
48 print(soup.a.parent)  # 获取第一个a标签内的父节点
49 
50 # 获取祖先节点（爸爸，爸爸的爸爸，爸爸的爸爸的爸爸...以此类推）
51 print(list(soup.a.parents))  # 获取第一个a标签的祖先节点，返回的是一个生成器
52 
53 # 8、兄弟节点  （sibling: 兄弟姐妹）
54 print(soup.a)
55 # 获取下一个兄弟节点
56 print(soup.a.next_sibling)
57 
58 # 获取下一个的所有兄弟节点,返回的是一个生成器
59 print(soup.a.next_siblings)
60 print(list(soup.a.next_siblings))
61 
62 # 获取上一个兄弟节点
63 print(soup.a.previous_sibling)
64 # 获取上一个的所有兄弟节点，返回的是一个生成器
65 print(list(soup.a.previous_siblings))

View Code

08❤bs4解析库之搜索文档

'''
标签查找与属性查找:

    标签:
        - 字符串过滤器   字符串全局匹配
            name 属性匹配
            attrs 属性查找匹配
            text 文本匹配

        - 正则过滤器
            re模块匹配

        - 列表过滤器
            列表内的数据匹配

        - bool过滤器
            True匹配

        - 方法过滤器
            用于一些需要的属性以及不需要的属性查找。

    属性:
        - class_
        - id
'''

 1 import re
 2 # name
 3 # 根据re模块匹配带有a的节点
 4 a = soup.find(name=re.compile('a'))
 5 print(a)
 6 a_s = soup.find_all(name=re.compile('a'))
 7 print(a_s)
 8 
 9 # attrs
10 a = soup.find(attrs={"id": re.compile('link')})
11 print(a)
12 
13 '''3、列表过滤器'''
14 # 列表内的数据匹配
15 print(soup.find(name=['a', 'p', 'html', re.compile('a')]))
16 print(soup.find_all(name=['a', 'p', 'html', re.compile('a')]))
17 
18 
19 '''4、bool过滤器 '''
20 # True匹配
21 print(soup.find(name=True, attrs={"id": True}))
22 
# 5. Function filter
# Useful when some attributes are required and others must be absent.

def have_id_not_class(tag):
    """Return ``tag`` if it is a <p> with an ``id`` but no ``class``, else None.

    :param tag: object exposing ``name`` and ``has_attr(key)``.
    :return: the same ``tag`` on a match, otherwise ``None``.
    """
    if tag.name == 'p' and tag.has_attr("id") and not tag.has_attr("class"):
        return tag
    return None
30 
# The filter is passed as a function object: soup.find_all(name=<function>)
print(soup.find_all(name=have_id_not_class))


# Extra: keyword shortcuts for attribute lookups
# id
a = soup.find(id='link2')
print(a)

# class (written class_ here)
p = soup.find(class_='sister')
print(p)

View Code

7月3日作业【半成品】只能加载第一个，无法加载所有

完整版在Day 04

 1 ''''''
 2 '''
 3 今日作业:
 4     1.整理课堂知识点
 5     2.写博客
 6     3.爬取豌豆荚app数据
 7         spider_method:
 8             requests + bs4
 9                 or
10             selenium
11 
12         url:
13             https://www.wandoujia.com/category/6001
14 
15         data:
16             名称、详情页url、下载人数、app大小
17             app_name, detail_url, download_num, app_size
18 '''
19 import requests
20 from bs4 import BeautifulSoup
21 import time
22 response = requests.get('https://www.wandoujia.com/category/6001')
23 response.encoding = response.apparent_encoding
24 soup = BeautifulSoup(response.text, 'html.parser')
25 
26 app_list= soup.find(attrs={'class': 'app-desc'})
27 app_url_name_list = soup.find(name='a', attrs={'class': 'name'})
28 
29 # 从a标签中找到title
30 app_name = app_url_name_list['title']
31 #获取url
32 detail_url = app_url_name_list.attrs.get('href')
33 #获取下载人数
34 download_num = soup.find(attrs={'class': 'install-count'}).text
35 # 获得app大小
36 app_size = soup.find( attrs={'class': 'dot'}).next_sibling.next_sibling.text
37 app_content = f'''
38 ❀=================  游戏信息  ==================❀
39 游戏名称:{app_name}
40 详情页url:{detail_url}
41 下载人数:{download_num}
42 app大小:{app_size}
43 ❀=============  游戏信息加载完毕  ==============❀
44 '''
45 print(app_content)
46 with open('wdj.txt', 'a', encoding='utf-8') as f:
47     f.write(app_content)

查看全文

相关阅读:
vm中花屏的最直接解决办法
 【转】SQL SERVER中一些常见性能问题的总结
 好不容易把Head First Design Patterns下下来了，与大家分享一下
 没想到单位的bt下载的速度可以到这么快，满意了
 买了张水货的1Gsd卡
 商业缩略语汇总
 一些个人收集的书籍恢复下载
 sourceforge.net 是不是被封了？
2009年我大学毕业了，我工作了，总结成长中的我的2009年 Fred
Web Service的传输协议

原文地址：https://www.cnblogs.com/DLYQY/p/11128044.html