zoukankan      html  css  js  c++  java
  • scrapy--BeautifulSoup

    BeautifulSoup官方文档:https://beautifulsoup.readthedocs.io/zh_CN/latest/#id8

    太繁琐的,精简了一些自己用的到的。

    1.index.html

    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title"><b>The Dormouse's story</b></p>
    
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    
    <p class="story">...</p>

    2..prettify()--标准的缩进格式输出

    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html_doc, 'html.parser')
    
    print(soup.prettify())
    # <html>
    #  <head>
    #   <title>
    #    The Dormouse's story
    #   </title>
    #  </head>
    #  <body>
    #   <p class="title">
    #    <b>
    #     The Dormouse's story
    #    </b>
    #   </p>
    #   <p class="story">
    #    Once upon a time there were three little sisters; and their names were
    #    <a class="sister" href="http://example.com/elsie" id="link1">
    #     Elsie
    #    </a>
    #    ,
    #    <a class="sister" href="http://example.com/lacie" id="link2">
    #     Lacie
    #    </a>
    #    and
    #    <a class="sister" href="http://example.com/tillie" id="link2">
    #     Tillie
    #    </a>
    #    ; and they lived at the bottom of a well.
    #   </p>
    #   <p class="story">
    #    ...
    #   </p>
    #  </body>
    # </html>

    3.选择标签,属性

    soup.title
    # <title>The Dormouse's story</title>
    
    soup.title.name
    # u'title'
    
    soup.title.string
    # u'The Dormouse's story'
    
    soup.title.parent.name
    # u'head'
    
    soup.p
    # <p class="title"><b>The Dormouse's story</b></p>
    
    soup.p['class']
    # u'title'
    
    soup.a
    # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
    
    soup.find_all('a')
    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
    
    soup.find(id="link3")
    # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
    for link in soup.find_all('a'):
        print(link.get('href'))
        # http://example.com/elsie
        # http://example.com/lacie
        # http://example.com/tillie
    print(soup.get_text())
    # The Dormouse's story
    #
    # The Dormouse's story
    #
    # Once upon a time there were three little sisters; and their names were
    # Elsie,
    # Lacie and
    # Tillie;
    # and they lived at the bottom of a well.
    #
    # ...
    
    #Tag
        soup = BeautifulSoup('<b class="boldest">Extremely bold</b>')
        tag = soup.b
        type(tag)
        # <class 'bs4.element.Tag'>
    #Name
        tag.name
        # u'b'
        tag.name = "blockquote"
        tag
        # <blockquote class="boldest">Extremely bold</blockquote>
    #Attributes
        tag['class']
        # u'boldest'
        tag.attrs
        # {u'class': u'boldest'}
        tag['class'] = 'verybold'
        tag['id'] = 1
        tag
        # <blockquote class="verybold" id="1">Extremely bold</blockquote>
    
        del tag['class']
        del tag['id']
        tag
        # <blockquote>Extremely bold</blockquote>
    
        tag['class']
        # KeyError: 'class'
        print(tag.get('class'))
        # None

     2.find_all

    1)name 参数#A.传字符串soup.find_all('b')
    #
    [<b>The Dormouse's story</b>] print soup.find_all('a') #[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>,
    <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
    <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
    print soup.find_all('a') #[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>,
    <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
    <
    a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
    #B.传正则表达式 import re for tag in soup.find_all(re.compile("^b")): print(tag.name) # body # b #C.传列表 soup.find_all(["a", "b"]) # [<b>The Dormouse's story</b>, # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] #D.传 True for tag in soup.find_all(True): print(tag.name) # html # head # title # body # p # b # p # a # a #E.传方法 def has_class_but_no_id(tag): return tag.has_attr('class') and not tag.has_attr('id') soup.find_all(has_class_but_no_id) # [<p class="title"><b>The Dormouse's story</b></p>, # <p class="story">Once upon a time there were...</p>, # <p class="story">...</p>] 2)keyword 参数 soup.find_all(id='link2') # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>] soup.find_all(href=re.compile("elsie")) # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>] soup.find_all(href=re.compile("elsie"), id='link1') # [<a class="sister" href="http://example.com/elsie" id="link1">three</a>] soup.find_all("a", class_="sister") # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] data_soup = BeautifulSoup('<div data-foo="value">foo!</div>') data_soup.find_all(data-foo="value") # SyntaxError: keyword can't be an expression data_soup.find_all(attrs={"data-foo": "value"}) # [<div data-foo="value">foo!</div>] 3)text 参数 soup.find_all(text="Elsie") # [u'Elsie'] soup.find_all(text=["Tillie", "Elsie", "Lacie"]) # [u'Elsie', u'Lacie', u'Tillie'] soup.find_all(text=re.compile("Dormouse")) [u"The Dormouse's story", u"The Dormouse's story"] 4)limit 参数 soup.find_all("a", limit=2) # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>] 5)recursive 参数 #调用tag的 find_all() 方法时,Beautiful Soup会检索当前tag的所有子孙节点,如果只想搜索tag的直接子节点,可以使用参数 recursive=False soup.html.find_all("title") # [<title>The Dormouse's story</title>] soup.html.find_all("title", recursive=False) # []

    3.CSS选择器

    (1)通过标签名查找
    print soup.select('title') 
    #[<title>The Dormouse's story</title>]
    
    print soup.select('a')
    #[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
    
    print soup.select('b')
    #[<b>The Dormouse's story</b>]
    2)通过类名查找
    print soup.select('.sister')
    #[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
    3)通过 id 名查找
    print soup.select('#link1')
    #[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>]
    4)组合查找
    print soup.select('p #link1')
    #[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>]
    print soup.select("head > title")
    #[<title>The Dormouse's story</title>]
    5)属性查找
    print soup.select('a[class="sister"]')
    #[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
    
    print soup.select('a[href="http://example.com/elsie"]')
    #[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>]
    
    print soup.select('p a[href="http://example.com/elsie"]')
    #[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>]
    
    soup = BeautifulSoup(html, 'lxml')
    print type(soup.select('title'))
    print soup.select('title')[0].get_text()
     
    for title in soup.select('title'):
        print title.get_text()
  • 相关阅读:
    Educational Codeforces Round 86 (Rated for Div. 2) D. Multiple Testcases
    Educational Codeforces Round 86 (Rated for Div. 2) C. Yet Another Counting Problem
    HDU
    HDU
    HDU
    HDU
    Good Bye 2019 C. Make Good (异或的使用)
    Educational Codeforces Round 78 (Rated for Div. 2) C. Berry Jam
    codeforces 909C. Python Indentation
    codeforces1054 C. Candies Distribution
  • 原文地址:https://www.cnblogs.com/eilinge/p/9641598.html
Copyright © 2011-2022 走看看