zoukankan      html  css  js  c++  java
  • python3之BeautifulSoup

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    # BeautifulSoup walkthrough: parse a small (deliberately unclosed) HTML
    # document and show three ways of locating nodes -- tag attribute access,
    # find()/find_all(), and CSS selectors. Only the final prettify() call is
    # live; the other examples are kept disabled so each section can be enabled
    # on its own while experimenting.
    from bs4 import BeautifulSoup

    html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """

    # --- Tag selectors -------------------------------------------------------
    # Attribute-style access (soup.title, soup.p, ...) yields the first
    # matching tag in the document.

    soup = BeautifulSoup(html, "html.parser")
    # Disabled examples: the bare string literal below is never executed.
    # Remove the surrounding triple quotes to run this section.
    """
    print(soup.title)
    print(soup.title.name)
    print(soup.title.text)
    print(soup.head)
    print(soup.p)
    print(soup.a)
    # Reading attributes
    print("====获取属性====")
    print(soup.p["name"])
    print(soup.p.attrs["name"])
    # Reading text content
    print("====获取内容====")
    print(soup.p.string)
    print(soup.p.text)
    # Child nodes: the `contents` attribute returns the children as a list,
    # so individual children can be picked out by index.
    print("====获取子节点====")
    print(soup.p.contents)
    print(soup.p.contents[0])

    # Direct parent of the first <p>
    print(soup.p.parent)
    print(soup.p.parent.name)

    # Walk every ancestor of the title's text node
    content = soup.head.title.string
    for parent in content.parents:
        print(parent.name)
    """

    # --- Standard selectors: find_all() / find() ------------------------------
    # Disabled examples; remove the surrounding triple quotes to run them.
    # `string=` replaces the deprecated `text=` keyword of find_all().
    """
    print(soup.find_all("a")[0])
    print(soup.find_all("a"))
    print(soup.find_all(attrs={'name': 'dromouse'}))
    print(soup.find_all(class_="title"))
    print(soup.find_all(string="The Dormouse's story"))
    print(soup.find_all("a", limit=1))
    print(soup.find("a"))
    """

    # --- CSS selectors: select() ----------------------------------------------
    # print(soup.select(".story .sister"))   # elements with class "sister" inside elements with class "story" (class selectors need the leading dot)
    # print(soup.select("p a"))              # <a> tags nested inside <p> tags
    # print(soup.select("#link2"))           # the element whose id is "link2"
    # print(soup.select("p")[0])             # the first <p> tag
    # print(soup.select("p")[0]["class"])    # value of the class attribute on the first <p> tag
    # print(soup.select("p")[0].get_text())  # get_text is a method: it must be called, otherwise the bound method itself is printed
    # print(soup.select("p")[0].text)
    print(soup.prettify())
  • 相关阅读:
    浅谈 IBM 购并 Sun Microsystems
    用 CSS 替代 HTML 的 table tag 设计网页版面
    用 IIS 7、ARR 與 Velocity 建设高性能的大型网站
    实作 ASP.NET 多笔数据离线编辑
    快速搞懂 ASP.NET MVC
    C# Design Patterns (2) Strategy
    网站性能越来越差怎么办?
    dotNET 类型转型的三种做法
    ASP.NET 数据分页第二篇 范例下载
    程序员真情忏悔录
  • 原文地址:https://www.cnblogs.com/python-kp/p/15137787.html
Copyright © 2011-2022 走看看