zoukankan      html  css  js  c++  java
  • BS基础

    Beautifulsoup 库详解

    # -*- coding:utf8 -*-
    # 工程路径:3.3 beautifulsoup库.py
    # 工程日期:9/6/2019
    # 工程目标:beautifulsoup使用详解
    """
    bs支持lxml, HTML 解析, html5解析

    """
    #%%
    html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """

    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.prettify()) # 格式化html
    print(soup.title.string) # 输出 title中内容

    #%% 标签选择器
    html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.title)
    print(type(soup.title)) # 为bs4的元素tag类型
    print(soup.head)
    print(type(soup.head))
    print(soup.p) # 只返回第一个匹配的p标签

    #%% 获取标签的名称
    html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.title.name) # 获取标签的名称
    print(soup.p.name) # 获取p标签的名称

    #%% 获取标签的属性
    html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.p['name'])
    print(soup.p.attrs['name']) # 获取属性

    #%% 获取标签内的文本内容 .string
    html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.p.string) # 获取标签内的文本内容

    #%% 标签的嵌套选择
    html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.head.title.string)
    print(soup.body.p)
    print(soup.body.a['href'])
    print(soup.body.a['class'])
    print(soup.body.a['id'])

    #%% 子节点以及子孙节点的选择
    html = """
    <html>
    <head>
    <title>The Dormouse's story</title>
    </head>
    <body>
    <p class="story">
    Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">
    <span>Elsie</span>
    </a>
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
    and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
    and they lived at the bottom of a well.
    </p>
    <p class="story">...</p>
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.body.p.a['href'])
    #print(soup.p.contents)
    print(type(soup.p.contents))
    for i in soup.p.contents:
    print(i)

    #%% .children 获取子节点 迭代器类型,
    # 使用循环的方式才能取出内容
    html = """
    <html>
    <head>
    <title>The Dormouse's story</title>
    </head>
    <body>
    <p class="story">
    Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">
    <span>Elsie</span>
    </a>
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
    and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
    and they lived at the bottom of a well.
    </p>
    <p class="story">...</p>
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.p.children)
    for i, child in enumerate(soup.p.children):
    print(i, child)



  • 相关阅读:
    开放平台整理
    Ubuntu Linux之软件源 解释
    遇到的几个linux问题,顺便把网上查到的帖过来
    Linux之网络配置
    Linux维护之Grub应用与问题汇总
    Linux维护之我用过的命令汇总(没多少意义,给我自己看的)
    大数据hadoop目录
    大数据hadoop环境搭建伪分布式模式
    大数据数据分析numpy库数组创建方式、数组的数据类型
    大数据hadoop运行模式
  • 原文地址:https://www.cnblogs.com/binyang/p/10995677.html
Copyright © 2011-2022 走看看