zoukankan      html  css  js  c++  java
  • 爬虫 基础 BS详解

    Beautifulsoup 库详解

    # -*- coding:utf8 -*-

    # 工程路径:3.3 beautifulsoup库.py

    # 工程日期:9/6/2019

    # 工程目标:beautifulsoup使用详解

    """

    bs 支持多种解析器:lxml、Python 内置的 html.parser、html5lib

     

    """

    #%%

    html = """

    <html><head><title>The Dormouse's story</title></head>

    <body>

    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>

    <p class="story">Once upon a time there were three little sisters; and their names were

    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,

    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

    and they lived at the bottom of a well.</p>

    <p class="story">...</p>

    """

     

    from bs4 import BeautifulSoup

    # Parse the sample markup with the lxml backend.
    soup = BeautifulSoup(html, "lxml")

    # Pretty-printed form of the parsed document.
    pretty_markup = soup.prettify()
    print(pretty_markup)

    # Text held by the <title> tag.
    title_text = soup.title.string
    print(title_text)

     

    #%% 标签选择器

    html = """

    <html><head><title>The Dormouse's story</title></head>

    <body>

    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>

    <p class="story">Once upon a time there were three little sisters; and their names were

    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,

    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

    and they lived at the bottom of a well.</p>

    <p class="story">...</p>

    """

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "lxml")

    # Accessing a tag by name as an attribute of the soup gives a bs4 Tag
    # element; show each tag followed by its type.
    for tag in (soup.title, soup.head):
        print(tag)
        print(type(tag))

    # Only the first matching <p> is returned, even though there are several.
    print(soup.p)

     

    #%% 获取标签的名称

    html = """

    <html><head><title>The Dormouse's story</title></head>

    <body>

    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>

    <p class="story">Once upon a time there were three little sisters; and their names were

    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,

    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

    and they lived at the bottom of a well.</p>

    <p class="story">...</p>

    """

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "lxml")

    # .name is the tag's own name; print it for <title> and for the first <p>.
    for tag in (soup.title, soup.p):
        print(tag.name)

     

    #%% 获取标签的属性

    html = """

    <html><head><title>The Dormouse's story</title></head>

    <body>

    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>

    <p class="story">Once upon a time there were three little sisters; and their names were

    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,

    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

    and they lived at the bottom of a well.</p>

    <p class="story">...</p>

    """

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "lxml")

    first_p = soup.p
    # Two ways of reading the same attribute: index the tag directly, or go
    # through its .attrs mapping.
    print(first_p["name"])
    print(first_p.attrs["name"])

     

    #%% 获取标签内的文本内容 .string

    html = """

    <html><head><title>The Dormouse's story</title></head>

    <body>

    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>

    <p class="story">Once upon a time there were three little sisters; and their names were

    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,

    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

    and they lived at the bottom of a well.</p>

    <p class="story">...</p>

    """

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "lxml")

    # Text content of the first <p> tag.
    first_p_text = soup.p.string
    print(first_p_text)

     

    #%% 标签的嵌套选择

    html = """

    <html><head><title>The Dormouse's story</title></head>

    <body>

    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>

    <p class="story">Once upon a time there were three little sisters; and their names were

    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,

    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

    and they lived at the bottom of a well.</p>

    <p class="story">...</p>

    """

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "lxml")

    # Tag lookups can be chained to walk down the tree.
    print(soup.head.title.string)

    body = soup.body
    print(body.p)

    # Read several attributes of the first <a> inside <body>.
    first_link = body.a
    for attr_name in ("href", "class", "id"):
        print(first_link[attr_name])

     

    #%% 子节点以及子孙节点的选择

    html = """

    <html>

    <head>

    <title>The Dormouse's story</title>

    </head>

    <body>

    <p class="story">

    Once upon a time there were three little sisters; and their names were

    <a href="http://example.com/elsie" class="sister" id="link1">

    <span>Elsie</span>

    </a>

    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>

    and

    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>

    and they lived at the bottom of a well.

    </p>

    <p class="story">...</p>

    """

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')

    # Walk body -> first <p> -> first <a> and read its href attribute.
    print(soup.body.p.a['href'])

    # print(soup.p.contents)  # uncomment to dump every child in one go

    # Show the type of .contents, then print each child node of the first <p>
    # on its own line. The loop body must be indented to belong to the loop.
    print(type(soup.p.contents))
    for child in soup.p.contents:
        print(child)

     

    #%% .children 获取子节点 迭代器类型,

    # 使用循环的方式才能取出内容

    html = """

    <html>

    <head>

    <title>The Dormouse's story</title>

    </head>

    <body>

    <p class="story">

    Once upon a time there were three little sisters; and their names were

    <a href="http://example.com/elsie" class="sister" id="link1">

    <span>Elsie</span>

    </a>

    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>

    and

    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>

    and they lived at the bottom of a well.

    </p>

    <p class="story">...</p>

    """

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')

    # .children is an iterator, so printing it shows the iterator object
    # itself rather than the nodes.
    print(soup.p.children)

    # Loop over it to get at the child nodes, numbering them as we go.
    for index, child in enumerate(soup.p.children):
        print(index, child)

     

    #%% .descendants 获取所有的子孙节点

    html = """

    <html>

    <head>

    <title>The Dormouse's story</title>

    </head>

    <body>

    <p class="story">

    Once upon a time there were three little sisters; and their names were

    <a href="http://example.com/elsie" class="sister" id="link1">

    <span>Elsie</span>

    </a>

    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>

    and

    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>

    and they lived at the bottom of a well.

    </p>

    <p class="story">...</p>

    """

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')

    # Printing .descendants directly shows the iterable object, not the nodes.
    print(soup.p.descendants)

    # Enumerate every descendant (children, grandchildren, ...) of the
    # first <p> tag.
    for index, descendant in enumerate(soup.p.descendants):
        print(index, descendant)

     

    #%% .parent父节点 .parents祖先节点

    html = """

    <html>

    <head>

    <title>The Dormouse's story</title>

    </head>

    <body>

    <p class="story">

    Once upon a time there were three little sisters; and their names were

    <a href="http://example.com/elsie" class="sister" id="link1">

    <span>Elsie</span>

    </a>

    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>

    and

    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>

    and they lived at the bottom of a well.

    </p>

    <p class="story">...</p>

    """

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')

    # .parent is the single tag that directly encloses the first <a>.
    print(soup.a.parent)
    print(type(soup.a.parent))

    # .parents is an iterable over the ancestors; print its type, then
    # enumerate the ancestors one by one.
    print(type(soup.a.parents))
    for index, ancestor in enumerate(soup.a.parents):
        print(index, ancestor)

    # The same walk, materialised as a list of (index, ancestor) pairs.
    print(list(enumerate(soup.a.parents)))

     

    #%% 获取兄弟并列的节点

    # .next_siblings 当前节点之后的所有兄弟节点

    # .previous_siblings 当前节点之前的所有兄弟节点

    html = """

    <html>

    <head>

    <title>The Dormouse's story</title>

    </head>

    <body>

    <p class="story">

    Once upon a time there were three little sisters; and their names were

    <a href="http://example.com/elsie" class="sister" id="link1">

    <span>Elsie</span>

    </a>

    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>

    and

    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>

    and they lived at the bottom of a well.

    </p>

    <p class="story">...</p>

    """

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "lxml")

    first_link = soup.a
    # The sibling attributes are iterables: printing one directly shows the
    # iterator object, so wrap them in list() to see the actual nodes.
    print(first_link.next_siblings)
    for siblings in (first_link.next_siblings, first_link.previous_siblings):
        print(list(enumerate(siblings)))

     

    """

    以上是通过标签名直接选择的方式,速度快,但不够灵活,无法满足复杂的筛选需求

    """

    #%% 标准选择器 find_all 根据标签名, 属性,选择标签 列表返回

    # find_all (name, attrs, recursive, text, **kwargs)

     

    #%% 标签名选择 name

    html='''

    <div class="panel">

    <div class="panel-heading">

    <h4>Hello</h4>

    </div>

    <div class="panel-body">

    <ul class="list" id="list-1">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    <li class="element">Jay</li>

    </ul>

    <ul class="list list-small" id="list-2">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    </ul>

    </div>

    </div>

    '''

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "lxml")

    # Look up every <ul> once and inspect the result: the whole collection,
    # its type, and its first element.
    ul_tags = soup.find_all("ul")
    print(ul_tags)
    print(type(ul_tags))
    print(ul_tags[0])

     

    #%% 循环嵌套 find_all

    html='''

    <div class="panel">

    <div class="panel-heading">

    <h4>Hello</h4>

    </div>

    <div class="panel-body">

    <ul class="list" id="list-1">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    <li class="element">Jay</li>

    </ul>

    <ul class="list list-small" id="list-2">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    </ul>

    </div>

    </div>

    '''

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')

    # Nested search: for each <ul>, list the <li> items it contains.
    # find_all is called on the <ul> tag rather than on the whole soup, so
    # each iteration prints only that list's own items.
    for ul in soup.find_all('ul'):
        print(ul.find_all('li'))

     

    #%% attrs 属性查找对应的内容

    html='''

    <div class="panel">

    <div class="panel-heading">

    <h4>Hello</h4>

    </div>

    <div class="panel-body">

    <ul class="list" id="list-1" name="elements">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    <li class="element">Jay</li>

    </ul>

    <ul class="list list-small" id="list-2">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    </ul>

    </div>

    </div>

    '''

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "lxml")

    # Filter by attribute through the attrs dict. In this sample both filters
    # hit the same <ul>, so the two printed results are identical.
    for attr_filter in ({"id": "list-1"}, {"name": "elements"}):
        print(soup.find_all(attrs=attr_filter))

    # Shorter keyword form. `class` is a Python keyword, hence the trailing
    # underscore in `class_`.
    print(soup.find_all(id="list-1"))
    print(soup.find_all(class_="list"))

     

     

    #%% 使用文本的内容进行匹配 text

    html='''

    <div class="panel">

    <div class="panel-heading">

    <h4>Hello</h4>

    </div>

    <div class="panel-body">

    <ul class="list" id="list-1">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    <li class="element">Jay</li>

    </ul>

    <ul class="list list-small" id="list-2">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    </ul>

    </div>

    </div>

    '''

     

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')

    # Match on text content. `string=` is the current name of the argument
    # that older Beautiful Soup versions called `text=`. The result contains
    # the matching text nodes (two "Foo" strings in this sample), not the
    # <li> tags that hold them.
    print(soup.find_all(string='Foo'))

     

    #%% find 方法 只返回单个匹配的元素, 不返回所有的结果

    html='''

    <div class="panel">

    <div class="panel-heading">

    <h4>Hello</h4>

    </div>

    <div class="panel-body">

    <ul class="list" id="list-1">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    <li class="element">Jay</li>

    </ul>

    <ul class="list list-small" id="list-2">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    </ul>

    </div>

    </div>

    '''

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "lxml")

    # find() yields only the first match instead of the full result list.
    first_ul = soup.find("ul")
    print(first_ul)

     

    #%% 其他选择, 类似于标签的选择

    """

    ### find_parents() find_parent()

     

    find_parents()返回所有祖先节点,find_parent()返回直接父节点。

     

    ### find_next_siblings() find_next_sibling()

    find_next_siblings()返回后面所有兄弟节点,find_next_sibling()返回后面第一个兄弟节点。

     

     

    ### find_previous_siblings() find_previous_sibling()

     

    find_previous_siblings()返回前面所有兄弟节点,find_previous_sibling()返回前面第一个兄弟节点。

     

    ### find_all_next() find_next()

     

    find_all_next()返回节点后所有符合条件的节点, find_next()返回第一个符合条件的节点

     

    ### find_all_previous() 和 find_previous()

     

    find_all_previous()返回节点前所有符合条件的节点, find_previous()返回节点前第一个符合条件的节点

     

    """

     

     

    #%% CSS 选择器 通过select 直接传入CSS选择器 就可以完成标签或者元素的选择

    # . 代表 class

    # # 代表 id

    # 空格 代表嵌套

    html='''

    <div class="panel">

    <div class="panel-heading">

    <h4>Hello</h4>

    </div>

    <div class="panel-body">

    <ul class="list" id="list-1">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    <li class="element">Jay</li>

    </ul>

    <ul class="list list-small" id="list-2">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    </ul>

    </div>

    </div>

    '''

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "lxml")

    # A leading "." selects by class; a space means "nested inside".
    for selector in (".panel .panel-heading", ".panel .panel-body"):
        print(soup.select(selector))

    # Every <li> nested in a <ul>: show the result, its type, and two
    # list views of it (numbered and plain).
    items = soup.select("ul li")
    print(items)
    print(type(items))
    print(list(enumerate(items)))
    print(list(items))

    # A leading "#" selects by id; it can be combined with a nested class
    # selector.
    for selector in ("#list-2", "#list-1 .element"):
        print(soup.select(selector))

     

    #%% for嵌套选择

    html='''

    <div class="panel">

    <div class="panel-heading">

    <h4>Hello</h4>

    </div>

    <div class="panel-body">

    <ul class="list" id="list-1">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    <li class="element">Jay</li>

    </ul>

    <ul class="list list-small" id="list-2">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    </ul>

    </div>

    </div>

    '''

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')

    # Nested selection with a loop: for each <ul>, select the <li> items
    # inside that particular <ul>. select() is called on the <ul> tag, not on
    # the whole soup, so each iteration prints only that list's items.
    for ul in soup.select('ul'):
        print(ul.select('li'))

    # The descendant combinator (a space) gives the same nesting in a single
    # call, returning all the <li> items in one flat result.
    print(soup.select('ul li'))

     

    #%% select 获取标签的属性 [ ]

    html='''

    <div class="panel">

    <div class="panel-heading">

    <h4>Hello</h4>

    </div>

    <div class="panel-body">

    <ul class="list" id="list-1">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    <li class="element">Jay</li>

    </ul>

    <ul class="list list-small" id="list-2">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    </ul>

    </div>

    </div>

    '''

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')

    # Attributes of selected tags are read with [] either on the tag itself
    # or on its .attrs mapping. Here the first line prints the class
    # attribute and the second prints the id of each <ul>.
    for ul in soup.select('ul'):
        print(ul['class'])
        print(ul.attrs['id'])

     

    #%% get_text 获取标签中的文本内容

    html='''

    <div class="panel">

    <div class="panel-heading">

    <h4>Hello</h4>

    </div>

    <div class="panel-body">

    <ul class="list" ok hah id="list-1">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    <li class="element">Jay</li>

    </ul>

    <ul class="list list-small" id="list-2">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    </ul>

    </div>

    </div>

    '''

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')

    # Print the text of every <li>. get_text()'s first positional argument is
    # a separator placed between text fragments, not a selector, so it is
    # called without arguments here.
    for li in soup.select('li'):
        print(li.get_text())

     

     

    #%% 示例

    import requests

    from bs4 import BeautifulSoup

     

    # Fetch the page. The timeout keeps the script from hanging on a stalled
    # connection, and raise_for_status() surfaces HTTP errors instead of
    # silently parsing an error page.
    response = requests.get('https://book.douban.com', timeout=10)
    response.raise_for_status()
    html = response.text

    soup = BeautifulSoup(html, 'lxml')

    # For every element with class "title", print the target and the text of
    # each link nested inside that element. The inner search runs on `title`
    # rather than on the whole soup, so only the links under each title are
    # reported.
    for title in soup.select('.title '):
        for a in title.select('a'):
            print(a['href'])
            print(a.get_text())

  • 相关阅读:
    洛谷—— P2234 [HNOI2002]营业额统计
    BZOJ——3555: [Ctsc2014]企鹅QQ
    CodeVs——T 4919 线段树练习4
    python(35)- 异常处理
    August 29th 2016 Week 36th Monday
    August 28th 2016 Week 36th Sunday
    August 27th 2016 Week 35th Saturday
    August 26th 2016 Week 35th Friday
    August 25th 2016 Week 35th Thursday
    August 24th 2016 Week 35th Wednesday
  • 原文地址:https://www.cnblogs.com/binyang/p/10995671.html
Copyright © 2011-2022 走看看