zoukankan html css js c++ java

DAY 88 爬虫02

1 爬虫介绍
    -发送http请求获取数据-----》解析，清洗----》入库
2 请求库-requests（urllib2，requests-html）
3 requests发送get请求，携带数据，携带头，携带cookie
4 发送post请求，携带数据（请求地址中的数据，请求体中数据：data，json）
5 响应对象的属性
    -状态码
    -响应头
    -cookie  # cookiejar对象
    -响应体内容二进制
    -响应体的字符串text
    -字节 字符
    -iter_content()

6 爬取梨视频

1 requests高级用法

# 1 ssl携带证书（这种网站极少见）
# import requests
# respone=requests.get('https://www.12306.cn',verify=False) #不验证证书,报警告,返回200
# print(respone.status_code)
#
#
# import requests
# respone=requests.get('https://www.12306.cn',
#                      cert=('/path/server.crt','/path/key'))
# print(respone.status_code)

# 2 超时设置
# import requests
# respone=requests.get('https://www.baidu.com',timeout=0.0001)


# 3 认证设置
# import requests
# from requests.auth import HTTPBasicAuth
# r=requests.get('xxx',auth=HTTPBasicAuth('user','password'))
# print(r.status_code)


# 4 异常处理
# import requests
# from requests.exceptions import * #可以查看requests.exceptions获取异常类型
#
# try:
#     r=requests.get('http://www.baidu.com',timeout=0.00001)
# except ReadTimeout:
#     print('===:')
# except ConnectionError: #网络不通
#     print('-----')
# except Timeout:
#     print('aaaaa')
#
# except RequestException:
#     print('Error')
# except Exception as e:
#     print('未知错误')


# 5 使用代理

# 代理：网上免费的（不稳定，自己玩）  收费的（稳定，公司都会买）
# 代理：高匿：隐藏访问者ip
# 透明：不隐藏访问者ip   http的请求头中：X-Forwarded-For---》django中从META中取

# 每次访问，随机使用代理
# 从网上找很多免费的代理，放到列表中，每次随机取一个

# 使用第三方开源的代理池：python+flask写的，自己搭建一个免费的代理池  https://github.com/jhao104/proxy_pool
# 使用搭建好的开源代理池：
# import requests
#
# ip = requests.get('http://118.24.52.95:5010/get/').json()['proxy']
# print(ip)
# proxies = {
#     'http': ip
# }
# respone = requests.get('http://101.133.225.166:8088/test_ip/', proxies=proxies)
#
# print(respone.text)

# 6 上传文件

# import requests
#
# respone=requests.post('http://101.133.225.166:8088/upload_file/',files={'myfile':open('1 requests高级用法.py','rb')})
# print(respone.text)

2 抽屉自动点赞

import requests

# Alternative payload, paired with the commented-out vote request further down:
# data = {
#     'linkId': '31009758',
#
# }

# Form fields posted to the comment-creation endpoint.
data = {
    'content': '其实一般',
    'linkId': '31008563',
    'parentId': '0',
    'pictureUrl': ''
}
# Request headers copied from a logged-in browser session.
# NOTE(review): the Cookie value is a captured session (it carries a `token=`
# entry); hard-coding it in source is only acceptable for a throwaway demo.
header = {
    'Referer': 'https://dig.chouti.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
    'Cookie': 'deviceId=web.eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJqaWQiOiIwZmVlMjk5OS1iMDgzLTRmYzctOTM4MC03YjIzZmVmY2U5YmYiLCJleHBpcmUiOiIxNjIzOTA0ODk5MzM5In0.7cadtBYznS6OgnLwEF8aH0AmtDOoYB1WKDgdU4eYYS0; __snaker__id=VbChmBUEZIVY3FPa; _9755xjdesxxd_=32; YD00000980905869%3AWM_TID=%2FazmF9%2FrClJFEVFBVRN70z7msH6De39Y; YD00000980905869%3AWM_NI=fmln0UTLoOM0bJxRYMet9SoHoQFrKUG7angbfEmftGxseQnkMmbwsdEPNwgtVpQ9K0fqli5fhP6nKsZ15bIt%2BQYBdpjdM8x19UJqjf6LSi%2FmhSgQW%2F3SYGNWEwJPPlYGRWM%3D; YD00000980905869%3AWM_NIKE=9ca17ae2e6ffcda170e2e6eeb5d567838e8fa6f94dbaef8eb7d54a938e8b85f83bf88a97a2e464a98689afaa2af0fea7c3b92aa6b3a48fb35f9894a1b0d03ca296b8b3dc47a7acf7b4ee44ad8f8a93ca5f85e9af8fe66aa69ba387f74dbcadabb2ed618fb3ae98f27087908298e68096b09fdaca3ca6afa48ab86eac90fa8fca799aeffb83cc80e98f97a3e77caabc83d9fb3bfb8b8692e96ef6949d8aae67ac8da9b2d625f18d97a8cd5d87a986b1d3689b999eb8d037e2a3; Hm_lvt_03b2668f8e8699e91d479d62bc7630f1=1621312902,1621392225; gdxidpyhxdE=weRAWhzVrJfrCGllI4mwY8LxZOiO4D79t%2Fkf8j8qcJUsTDrjyVh05GQiaf6uL8dwsXpkShI%2B2uGHa9Vj5b1QilxdgI%2BoDUr%5C0VN4kMrnVLUmzGb56lwmZRoAmUq%2FToGtCRjYKAaANejzA%5CQcWg4LwkrdXzwqNISMTfwQUaMw4puru4fM%3A1621393127138; token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJqaWQiOiJjZHVfNTMyMDcwNzg0NjAiLCJleHBpcmUiOiIxNjIzOTg0MjcwNDA0In0.4Q4uQAd4LkbVVcu37t0SjRFE4CSIidduRspeQ08-iYE; Hm_lpvt_03b2668f8e8699e91d479d62bc7630f1=1621392374'
}
# Vote variant (use together with the alternative payload above):
# res = requests.post('https://dig.chouti.com/link/vote', data=data, headers=header, timeout=10)

# requests applies no timeout unless one is given, so pass one explicitly to
# keep the script from hanging when the server never responds.
res = requests.post('https://dig.chouti.com/comments/create', data=data, headers=header, timeout=10)
print(res.text)

3 爬取汽车之家新闻

# 爬取汽车之家
# 使用bs4模块：beautifulsoup4：专门用来解析html的模块
import requests
from bs4 import BeautifulSoup

# Download the first page of the Autohome news list. requests applies no
# timeout by default, so one is given to avoid blocking forever.
res = requests.get('https://www.autohome.com.cn/news/1/#liststart', timeout=10)
# print(res.text)

# BeautifulSoup(markup, parser):
#   html.parser - slower, needs no extra installation
#   lxml        - faster and more tolerant of broken markup, needs the lxml package
soup = BeautifulSoup(res.text, 'html.parser')
# Every <ul> whose class is "article".
ul_list = soup.find_all(name='ul', class_='article')
li_list = []
for ul in ul_list:
    li_list.extend(ul.find_all(name='li'))

for li in li_list:
    h3 = li.find(name='h3')
    # Entries without an <h3> are not treated as articles and are skipped.
    if h3 is not None:
        title = h3.text  # text content of the heading

        # The remaining pieces are optional: fall back to '' instead of
        # crashing when a tag or attribute is missing from this <li>.
        link = li.find(name='a')
        href = link.get('href') if link is not None else None
        # NOTE(review): prefixing 'http:' presumably relies on the page using
        # protocol-relative links ('//host/path') - verify against live markup.
        url = 'http:' + href if href else ''

        summary = li.find(name='p')
        desc = summary.text if summary is not None else ''

        image = li.find(name='img')
        src = image.get('src') if image is not None else None
        img = 'http:' + src if src else ''

        print('''
        文章标题：%s
        文章地址：%s
        文章图片：%s
        文章摘要：%s
        ''' % (title, url, img, desc))
        # Possible next steps:
        # store in mysql
        # store in redis  articles=[{json string},{}]
        #                 article ={'1':{json string},}

4 bs4 介绍

1 Beautiful Soup 是一个可以从HTML或XML文件中提取数据的Python库
2 默认有个解析器  html.parser
3 额外安装  lxml
4 html中搜索数据的时候
    -css选择器    (通用)
    -xpath选择器  （通用）
    -模块提供的查找方法（find，find_all）

5 bs4 之遍历文档树




# 遍历文档树

from bs4 import BeautifulSoup
# Sample markup that the tree-traversal examples below operate on.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" id='id_p'>p的内容<b>The Dormouse's story<span>孙子</span></b><span>lqz</span></p>

<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Build the parse tree with the lxml parser (requires the lxml package).
soup = BeautifulSoup(markup=html_doc, features='lxml')

# 美化
# print(soup.prettify())

# 遍历文档树

from bs4.element import Tag

# soup.html.p.text
# #1、用法. 遍历
# bs4.element.Tag   每个标签对象，用起来跟soup对象一样用
# print(soup.html.head)
# print(soup.html.body.p)


#2、获取标签的名称
# bs4.element.Tag  有一个name属性
# print(soup.html.body.name)

#3、获取标签的属性(如果是class就放到列表中)
# print(soup.html.body.p)
#
# print(soup.html.body.p.attrs)
# print(soup.html.body.p.attrs.get('class'))
# print(soup.html.body.p.attrs['id'])
#
# print(soup.html.body.p['class'])  #如果是class就放到列表中
# print(soup.html.body.p['id'])  #id是一个
#4、获取标签的内容
# print(soup.html.body.p)
# print(soup.html.body.p.text)  #获取该标签子子孙孙的文本内容

# print(soup.html.body.p.string)  #这个标签必须没有子孙，才能拿出文本内容

# print(list(soup.html.body.p.strings))  #把子子孙孙的文本内容放到一个生成器中

#5、嵌套选择
# print(soup.p.b.string)


#了解即可
#6、子节点、子孙节点
# print(soup.p.contents) #p下所有子节点



# print(soup.p.children) #得到一个迭代器,包含p下所有子节点
# for i,child in enumerate(soup.p.children):
#     print(i,child)

# print(soup.p.descendants) #获取子孙节点,p下所有的标签都会选择出来
# for i,child in enumerate(soup.p.descendants):
#     print(i,child)
#7、父节点、祖先节点
# print(soup.b.parent) #获取b标签的父节点
# print(list(soup.b.parents)) #找到b标签所有的祖先节点，父亲的父亲，父亲的父亲的父亲...
# print(len(list(soup.b.parents))) #b标签祖先节点的个数

#8、兄弟节点


# print(soup.a.next_sibling) #紧邻的下一个兄弟（如果是空格就会拿出空格）
# print(soup.a.previous_sibling) #上一个兄弟
#
# print(list(soup.a.next_siblings)) #下面的兄弟们=>生成器对象
# print(soup.a.previous_siblings) #上面的兄弟们=>生成器对象

6 bs4 之搜索文档树

# 搜索文档树

from bs4 import BeautifulSoup

# Sample markup that the find()/find_all() examples below search through.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" id='id_p'>p的内容<b>The Dormouse's story<span>孙子</span></b><span>lqz</span></p>

<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1" xx='xx'><span>lqz</span>Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3" name='lqz'>Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Build the parse tree with the lxml parser (requires the lxml package).
soup = BeautifulSoup(markup=html_doc, features='lxml')


# 搜索文档树：（find  find_all）
# #1、五种过滤器: 字符串、正则表达式、列表、True、方法
# soup.find()  # 找到符合的第一个
# soup.find_all() # 找到符合的所有

# 字符串过滤器

# res=soup.find(name='body',)
# 找a标签，id为 link1
# res=soup.find(name='a',id='link1')
# res=soup.find_all(name='a',class_='sister')
# res=soup.find_all(name='a',href="http://example.com/elsie")
# res=soup.find_all(name='a',xx='xx')

# res=soup.find_all(name='a',attrs={'class':'sister'})   # 以属性找attrs
# res=soup.find_all(attrs={'id':'link1'})
# res=soup.find_all(attrs={'xx':'xx'})

#
# res=soup.find_all(name='a',attrs={'name':'lqz'})
# print(res)


# 正则表达式
# import re
# # res=soup.find_all(name=re.compile('^b'))
# # res=soup.find_all(class_=re.compile('^s'))
# res=soup.find_all(attrs={'name':'lqz'},id=re.compile('^l'))
# print(res)


# 列表
# res=soup.find_all(name=['b',])
# res=soup.find_all(id=['link1','link2'])
# print(res)

# 布尔
# res = soup.find_all(class_=True)  # 所有带class属性的标签

# res = soup.find_all(href=True)  # 所有带href属性的标签
# print(res)

# 方法（了解）
#获取有类名，但是没有id的标签
# def has_class_but_no_id(tag):
#     return tag.has_attr('class') and not tag.has_attr('id')
#
#
# res = soup.find_all(name=has_class_but_no_id)
# print(res)



# 遍历文档树和搜索文档树可以连用

# res=soup.find(name='a').span.text
# res=soup.html.body.find('a')
# print(res)

# limit  限制取几条
# soup.findChild()
# res=soup.find_all(name='a',limit=1)
# print(res)

# recursive 是否递归查找，如果是False是只找一层
# res=soup.body.find_all(name='p',recursive=False)
# res=soup.find_all(name='p',recursive=False)
# res=soup.find_all(name='p',recursive=True)
# print(res)

7 css选择器



#css选择器是通用的


from bs4 import BeautifulSoup

# Sample markup queried by the CSS-selector examples below.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" id='id_p'>p的内容<b>The Dormouse's story<span>孙子</span></b><span>lqz</span></p>

<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1" xx='xx'><span>lqz</span>Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3" name='lqz'>Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Build the parse tree with the lxml parser (requires the lxml package).
soup = BeautifulSoup(markup=html_doc, features='lxml')

# select() takes a CSS selector string:
#   tag      - match by tag name
#   .cls     - match by class
#   #id      - match by id
#   div>a    - <a> elements that are direct children of a <div>
#   div a    - <a> elements anywhere below a <div>
#
# Other selectors to try:
#   res = soup.select('.sister')
#   res = soup.select('#link2')
#   res = soup.select('p>b')
selector = 'p b'
res = soup.select(selector)
print(res)


# bs4 可以修改xml格式的文档：后期可能会有一些配置文件是xml格式

8 selenium

1 requests模块不能执行js代码
2 selenium可以操作浏览器，模拟人的行为
3 selenium本质是通过驱动浏览器，完全模拟浏览器的操作，比如跳转、输入、点击、下拉等，来拿到网页渲染之后的结果，可支持多种浏览器


# selenium驱动浏览器：谷歌（以谷歌为例），火狐，ie
# 下载一个驱动（谷歌驱动）
# 本地的谷歌浏览器版本要跟谷歌驱动对应
# 国内镜像：http://npm.taobao.org/mirrors/chromedriver/

# pip3 install selenium

from selenium import webdriver
import time
# 实例化得到对象，设置驱动的位置

# 跟你用手点开一个浏览器是一样的
# Start a Chrome instance through the chromedriver binary at the given path;
# this is equivalent to opening a browser window by hand.
driver = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    # Navigate to the Baidu home page.
    driver.get('https://www.baidu.com')

    # Fixed pause so the page has time to finish rendering.
    time.sleep(3)

    print(driver.page_source)  # HTML of the current page
finally:
    # Always end the session, even if navigation fails. quit() closes every
    # window and stops the driver process, whereas close() only closes the
    # current window.
    driver.quit()

github好项目：
https://github.com/gumengkai/db_monitor
https://github.com/openspug/spug

查看全文

相关阅读:
GDOI 2020 赛前两周模拟总结
 猫树模板
 LOJ#2023. 「AHOI / HNOI2017」抛硬币（OGF+ExLucas+Crt）
扩展Lucas定理及其优化
 LOJ#2018. 「AHOI / HNOI2017」单旋（平衡树模拟+set+线段树）
LOJ #2008. 「SCOI2015」小凸想跑步（半平面交）
[TJOI2018]游园会
 [Ynoi2018]未来日记
 「雅礼集训 2018 Day7」B
「雅礼集训 2018 Day7」A

原文地址：https://www.cnblogs.com/DEJAVU888/p/14894051.html