zoukankan      html  css  js  c++  java
  • scrapy--BeautifulSoup

    BeautifulSoup官方文档:https://beautifulsoup.readthedocs.io/zh_CN/latest/#id8

    太繁琐的,精简了一些自己用的到的。

    1.index.html

    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title"><b>The Dormouse's story</b></p>
    
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    
    <p class="story">...</p>

    2..prettify()--标准的缩进格式输出

    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html_doc, 'html.parser')
    
    print(soup.prettify())
    # <html>
    #  <head>
    #   <title>
    #    The Dormouse's story
    #   </title>
    #  </head>
    #  <body>
    #   <p class="title">
    #    <b>
    #     The Dormouse's story
    #    </b>
    #   </p>
    #   <p class="story">
    #    Once upon a time there were three little sisters; and their names were
    #    <a class="sister" href="http://example.com/elsie" id="link1">
    #     Elsie
    #    </a>
    #    ,
    #    <a class="sister" href="http://example.com/lacie" id="link2">
    #     Lacie
    #    </a>
    #    and
    #    <a class="sister" href="http://example.com/tillie" id="link2">
    #     Tillie
    #    </a>
    #    ; and they lived at the bottom of a well.
    #   </p>
    #   <p class="story">
    #    ...
    #   </p>
    #  </body>
    # </html>

    3.选择标签,属性

    soup.title
    # <title>The Dormouse's story</title>
    
    soup.title.name
    # u'title'
    
    soup.title.string
    # u'The Dormouse's story'
    
    soup.title.parent.name
    # u'head'
    
    soup.p
    # <p class="title"><b>The Dormouse's story</b></p>
    
    soup.p['class']
    # u'title'
    
    soup.a
    # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
    
    soup.find_all('a')
    # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
    #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
    #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
    
    soup.find(id="link3")
    # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
    for link in soup.find_all('a'):
        print(link.get('href'))
        # http://example.com/elsie
        # http://example.com/lacie
        # http://example.com/tillie
    print(soup.get_text())
    # The Dormouse's story
    #
    # The Dormouse's story
    #
    # Once upon a time there were three little sisters; and their names were
    # Elsie,
    # Lacie and
    # Tillie;
    # and they lived at the bottom of a well.
    #
    # ...
    
    #Tag
        soup = BeautifulSoup('<b class="boldest">Extremely bold</b>')
        tag = soup.b
        type(tag)
        # <class 'bs4.element.Tag'>
    #Name
        tag.name
        # u'b'
        tag.name = "blockquote"
        tag
        # <blockquote class="boldest">Extremely bold</blockquote>
    #Attributes
        tag['class']
        # u'boldest'
        tag.attrs
        # {u'class': u'boldest'}
        tag['class'] = 'verybold'
        tag['id'] = 1
        tag
        # <blockquote class="verybold" id="1">Extremely bold</blockquote>
    
        del tag['class']
        del tag['id']
        tag
        # <blockquote>Extremely bold</blockquote>
    
        tag['class']
        # KeyError: 'class'
        print(tag.get('class'))
        # None

     2.find_all

    1)name 参数#A.传字符串soup.find_all('b')
    #
    [<b>The Dormouse's story</b>] print soup.find_all('a') #[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>,
    <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
    <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
    print soup.find_all('a') #[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>,
    <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
    <
    a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
    #B.传正则表达式 import re for tag in soup.find_all(re.compile("^b")): print(tag.name) # body # b #C.传列表 soup.find_all(["a", "b"]) # [<b>The Dormouse's story</b>, # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] #D.传 True for tag in soup.find_all(True): print(tag.name) # html # head # title # body # p # b # p # a # a #E.传方法 def has_class_but_no_id(tag): return tag.has_attr('class') and not tag.has_attr('id') soup.find_all(has_class_but_no_id) # [<p class="title"><b>The Dormouse's story</b></p>, # <p class="story">Once upon a time there were...</p>, # <p class="story">...</p>] 2)keyword 参数 soup.find_all(id='link2') # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>] soup.find_all(href=re.compile("elsie")) # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>] soup.find_all(href=re.compile("elsie"), id='link1') # [<a class="sister" href="http://example.com/elsie" id="link1">three</a>] soup.find_all("a", class_="sister") # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] data_soup = BeautifulSoup('<div data-foo="value">foo!</div>') data_soup.find_all(data-foo="value") # SyntaxError: keyword can't be an expression data_soup.find_all(attrs={"data-foo": "value"}) # [<div data-foo="value">foo!</div>] 3)text 参数 soup.find_all(text="Elsie") # [u'Elsie'] soup.find_all(text=["Tillie", "Elsie", "Lacie"]) # [u'Elsie', u'Lacie', u'Tillie'] soup.find_all(text=re.compile("Dormouse")) [u"The Dormouse's story", u"The Dormouse's story"] 4)limit 参数 soup.find_all("a", limit=2) # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>] 5)recursive 参数 #调用tag的 find_all() 方法时,Beautiful Soup会检索当前tag的所有子孙节点,如果只想搜索tag的直接子节点,可以使用参数 recursive=False soup.html.find_all("title") # [<title>The Dormouse's story</title>] soup.html.find_all("title", recursive=False) # []

    3.CSS选择器

    (1)通过标签名查找
    print soup.select('title') 
    #[<title>The Dormouse's story</title>]
    
    print soup.select('a')
    #[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
    
    print soup.select('b')
    #[<b>The Dormouse's story</b>]
    2)通过类名查找
    print soup.select('.sister')
    #[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
    3)通过 id 名查找
    print soup.select('#link1')
    #[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>]
    4)组合查找
    print soup.select('p #link1')
    #[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>]
    print soup.select("head > title")
    #[<title>The Dormouse's story</title>]
    5)属性查找
    print soup.select('a[class="sister"]')
    #[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
    
    print soup.select('a[href="http://example.com/elsie"]')
    #[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>]
    
    print soup.select('p a[href="http://example.com/elsie"]')
    #[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>]
    
    soup = BeautifulSoup(html, 'lxml')
    print type(soup.select('title'))
    print soup.select('title')[0].get_text()
     
    for title in soup.select('title'):
        print title.get_text()
  • 相关阅读:
    myeclipse 配置svn
    windows下 将tomcat做成服务,并于oracle后启动
    局部内部类为什么只能访问final局部变量,对于成员变量却可以随便访问?
    使用cmd查看windows端口占用情况,并关闭应用
    生成javadoc文档
    JNI以及JNA使用
    自定义标签-java
    dwr框架应用
    Hadoop生态圈简介
    tomcat之日志记录
  • 原文地址:https://www.cnblogs.com/eilinge/p/9641598.html
Copyright © 2011-2022 走看看