zoukankan html css js c++ java

爬虫之BeautifulSoup (三)

简单来说，Beautiful Soup是python的一个库，最主要的功能是从网页抓取数据。官方解释如下：

'''
Beautiful Soup提供一些简单的、python式的函数用来处理导航、搜索、修改分析树等功能。
它是一个工具箱，通过解析文档为用户提供需要抓取的数据，因为简单，所以不需要多少代码就可以写出一个完整的应用程序。
'''

安装

#安装 Beautiful Soup
pip install beautifulsoup4

#安装解析器
Beautiful Soup支持Python标准库中的HTML解析器,还支持一些第三方的解析器,其中一个是 lxml .根据操作系统不同,可以选择下列方法来安装lxml:

$ apt-get install Python-lxml

$ easy_install lxml

$ pip install lxml

另一个可供选择的解析器是纯Python实现的 html5lib , html5lib的解析方式与浏览器相同,可以选择下列方法来安装html5lib:

$ apt-get install Python-html5lib

$ easy_install html5lib

$ pip install html5lib

解析器

简单使用

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>

<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""


# 方式一:
# soup = BeautifulSoup(html_doc, "lxml")
# print(soup)

# 方式二:
# soup = BeautifulSoup(open('a.html'), "lxml")
# print(soup)


soup = BeautifulSoup(html_doc, "lxml")
# print(soup)

# 自动补全
soup.prettify()

print(soup.p)
# 获取p标签下的b标签
print(soup.p.b)
#  获取p标签下的b标签下的文本
print(soup.p.b.text)

# 找body内的所有标签
print(soup.body.contents)

# 获取p标签属性
print(soup.p.attrs)

# 获取p标签的孩子， 返回一个iter对象
print(list(soup.p.children))

# 获取p标签的子子孙孙
print(list(soup.p.descendants))

# 获取p标签的爸爸
print(soup.p.parent)

# 获取p标签的爸爸， 获取p标签的爸爸的爸爸， 获取p标签的爸爸的爸爸的爸爸
print(list(soup.p.parents))

# 获取a标签内的href属性
print(soup.a.attrs['href'])

搜索文档树

from bs4 import BeautifulSoup
html_doc ="""

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>

<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
# 搜索文档树
# 1.文本查找
# 通过文本查找p标签
soup = BeautifulSoup(html_doc, "lxml")
# print(soup)

# 自动补全
soup.prettify()
print(soup.find_all(name='p'))

# 通过文本查找文本为$37的p标签
print(soup.find_all(name='p', text='$37'))

# 通过文本查找id为link3的a标签
print(soup.find_all(name='a', attrs={"id": "link3"}))

import re
# 2.正则查找
# 通过正则查找所有p标签
print(soup.find_all(name=re.compile("^p")))

# 通过正则查找所有a标签
print(soup.find_all(name=re.compile("^a")))

# 通过正则查找所有id为link的p标签
print(soup.find_all(name="p", attrs={"id": re.compile("^link")}))

# 通过正则查找所有id为link的a标签
print(soup.find_all(name="a", attrs={"id": re.compile("^link")}))

# 通过正则查找所有class为story的p标签
print(soup.find_all(name="p", attrs={"class": re.compile("story")}))

# 3.列表
# 通过列表查找所有的a、p标签
print(soup.find_all(name=['p', 'a']))

# 通过列表查找所有的正则匹配有Elsie的文本
print(soup.find_all(text=[re.compile("Elsie")]))

# 通过列表查找所有的正则匹配有Elsie的文本的a标签
print(soup.find_all(name=['a'], text=[re.compile("Elsie")]))


# 4.True
# 获取所有标签
print(soup.find_all(name=True))

# 获取所有有id的a标签
print(soup.find_all(name="a", attrs={"id": True}))

# # 获取所有有class的a标签
print(soup.find_all(name="a", attrs={"class": True}))


# 5.方法
def have_id_not_class(a):
    # if tag.has_attr('id') and not tag.has_attr('class'):
    #     return tag
    if a.has_attr('class') and not a.has_attr('id'):
        return a

# 通过方法查找所有有class没id的标签
print(soup.find_all(have_id_not_class))

给新热抽屉网1-100页所有用户点赞

import requests

# 一: 访问主页获取cookies
index_res = requests.get('https://dig.chouti.com/', headers={
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
})
# print(index_res.cookies.get_dict())
index_cookies = index_res.cookies.get_dict()

'''
url:
    https://dig.chouti.com/login

method:
    POST

headers:
    user-agent:
    referer:
    cookie

form_data:
    phone: 8615622792660
    password: k46709394
    oneMonth: 1

'''

# 二 携带主页的cookies进行登陆
login_url = 'https://dig.chouti.com/login'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'referer': 'https://dig.chouti.com/',
}

form_data = {
    'phone': '8615622792660',
    'password': 'k46709394',
    'oneMonth': '1'
}

# 往登陆连接发送post请求进行登陆
login_res = requests.post(login_url, cookies=index_cookies, headers=headers, data=form_data)
print(login_res)

# 登陆成功返回文本
print(login_res.text)

# 对Id=25068561的用户进行点赞
# vote_res = requests.post('https://dig.chouti.com/link/vote?linksId=25068561', headers=headers, cookies=index_cookies)
# print(vote_res.text)

'''
url：
    https://dig.chouti.com/all/hot/recent/1
headers:
'''
from bs4 import BeautifulSoup

# 循环拼接1-100页的 特推 页面
for line in range(1, 100):
    hot_url = 'https://dig.chouti.com/all/hot/recent/%s' % line
    # 往特推页面发送get请求获取当前页所有用户
    hot_res = requests.get(hot_url, headers=headers, )
    # print(hot_res.text)

    soup = BeautifulSoup(hot_res.text, 'lxml')

    # 通过bs4获取到所有id为content-list用户特推的div标签
    div_1 = soup.find(name='div', attrs={"id": "content-list"})
    # print(div_1)

    # # 通过bs4获取到所有class为item的用户特推的div标签
    div_s = soup.find_all(name='div', attrs={"class": "item"})
    for div in div_s:
        # 获取所有用户的id
        # 通过bs4查找到class为part2的div，然后获取属性中的share-linkid进而获取id值
        id = div.find(name='div', attrs={"class": "part2"}).get("share-linkid")
        # print(id)
        # 对所有用户进行点赞
        vote_res = requests.post('https://dig.chouti.com/link/vote?linksId=%s' % id, headers=headers,
                                 cookies=index_cookies)
        print(vote_res.text)
        '''
        {"result":{"code":"9999", "message":"推荐成功", "data":{"jid":"cdu_53191784515","likedTime":"1552034126796000","lvCount":"6","nick":"人生苦短灬及时行乐","uvCount":"430","voteTime":"小于1分钟前"}}}

查看全文

相关阅读:
正则与普通方法对字符串过滤的比较
 java基础练习笔记
 node.js-express路由基础+获取前端数据+rmvc架构开发
 解决powershell因为在此系统上禁止运行脚本"报错
 树、森林、二叉树的转换
 git提交代码时如何不提交node_modules文件
 node.js-静态资源目录搭建
 node.js路由基础
 sql server查询练习
 MYQL存储过程与事件

原文地址：https://www.cnblogs.com/sima-3/p/11077711.html