  • Web scraping with the BeautifulSoup class

    Installation: pip install beautifulsoup4
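    A quick way to confirm the install worked (just a sanity check, not part of the original walkthrough):

    import bs4
    print(bs4.__version__)   # prints the installed version, e.g. 4.x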

    Beautiful Soup supports several parsers, each with its own trade-offs: the built-in html.parser needs no extra install, lxml is faster, and html5lib is the most lenient. Pick whichever suits your habits.
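    A minimal sketch of how the parser is chosen when constructing the soup object (lxml and html5lib must be installed separately via pip):

    from bs4 import BeautifulSoup

    html = "<p class='title'><b>hello</b></p>"    # any HTML string
    soup1 = BeautifulSoup(html, "html.parser")    # built-in, no extra install
    soup2 = BeautifulSoup(html, "lxml")           # fast; requires `pip install lxml`
    soup3 = BeautifulSoup(html, "html5lib")       # most lenient; requires `pip install html5lib`
    print(soup1.b.string)                         # -> hello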

    # Fetch the HTML source
    import requests
    r = requests.get('http://www.python123.io/ws/demo.html')
    demo = r.text
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(demo, 'html.parser')
    print(soup.prettify())  # print the parse tree with standard indentation; output shown below
    <html>
     <head>
      <title>
       This is a python demo page
      </title>
     </head>
     <body>
      <p class="title">
       <b>
        The demo python introduces several python courses.
       </b>
      </p>
      <p class="course">
       Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
       <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">
        Basic Python
       </a>
       and
       <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">
        Advanced Python
       </a>
       .
      </p>
     </body>
    </html>
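    prettify() can also be called on a single tag rather than the whole document, which is handy when you only want to inspect part of the tree:

    print(soup.a.prettify())   # pretty-print only the first <a> tag of the demo page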

    Basic usage of the tree-navigation methods

    # Source of the demo page
    html_d="""
    <html><head><title>This is a python demo page</title></head>
    <body>
    <p class="title"><b>The demo python introduces several python courses.</b></p>
    <p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
    <a href="http://www.icourse163.org/course/BIT-268001" class="py1" id="link1">Basic Python</a> and <a href="http://www.icourse163.org/course/BIT-1001870001" class="py2" id="link2">Advanced Python</a>.</p>
    </body></html>
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html_d, 'html.parser')
    # Get the <title> tag
    print(soup.title)
    # Get the text content
    print(soup.text)
    # Get the tag's name
    print(soup.title.name)
    # Get the tag's attributes
    print(soup.title.attrs)
    # Get the child nodes of the first <p> tag
    print(soup.p.contents)
    print(soup.p.children)
    # Get all <a> tags
    print(soup.find_all('a'))
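    Individual attribute values can also be read by indexing a tag or calling get(), which returns a default instead of raising KeyError; a short sketch using the same soup object:

    a = soup.a                      # first <a> tag in the document
    print(a['href'])                # -> http://www.icourse163.org/course/BIT-268001
    print(a.get('class'))           # -> ['py1']  (class is a multi-valued attribute)
    print(a.get('missing', 'n/a'))  # get() returns a default instead of raising KeyError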

    Commonly used traversal methods

    # Source of the demo page
    html_d="""
    <html><head><title>This is a python demo page</title></head>
    <body>
    <p class="title"><b>The demo python introduces several python courses.</b></p>
    <p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
    <a href="http://www.icourse163.org/course/BIT-268001" class="py1" id="link1">Basic Python</a> and <a href="http://www.icourse163.org/course/BIT-1001870001" class="py2" id="link2">Advanced Python</a>.</p>
    </body></html>
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html_d, "lxml")
    # .contents: a list of all direct children of the first <p> tag
    print(soup.p.contents)
    print(soup.contents[0].name)   # name of the document's first child (html)
    # .children: an iterator over the same direct children of <p>
    print(soup.p.children)
    for child in enumerate(soup.p.children):
        print(child)
    # .descendants: every tag and string nested anywhere under <p>
    print(soup.p.descendants)
    for i in enumerate(soup.p.descendants):
        print(i)
    # .string: returns the tag's text if it has exactly one child string; otherwise None
    print(soup.title.string)
    # .strings: iterate over every string in the document
    for string in soup.strings:
        print(repr(string))
    # .stripped_strings: the same, but with surrounding whitespace removed
    for line in soup.stripped_strings:
        print(line)
    # Parent node of the first <a> tag
    print(soup.a.parent)
    # All ancestors of the first <a> tag (a generator)
    print(soup.a.parents)
    # Sibling nodes
    print(soup.a.next_sibling)          # the next sibling (here the text " and ")
    print(soup.a.previous_sibling)      # the previous sibling
    print(list(soup.a.next_siblings))   # all following siblings (a generator)
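    Since .parents and .next_siblings are generators, printing them directly only shows the generator object; they are normally consumed in a loop. A small sketch listing the tag names of every ancestor of the first <a> tag:

    for parent in soup.a.parents:
        print(parent.name)   # prints: p, body, html, [document]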

    Usage of find_all(name, attrs, recursive, text, **kwargs)

    import re
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html_d, "lxml")   # html_d is the demo source defined above
    # name: a regex matches against tag names, here every tag whose name contains 'b' (body, b)
    for tag in soup.find_all(re.compile('b')):
        print(tag.name)
    # attrs: find <p> tags whose class contains "course"
    print(soup.find_all('p', 'course'))
    # keyword arguments: filter on an attribute such as id
    print(soup.find_all(id='link1'))
    # recursive: with recursive=False only direct children of the soup object are searched, so no <a> is found
    # print(soup.find_all('a', recursive=False))
    # string: match the text content of tags
    # print(soup.find_all(string=re.compile('python')))
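    Besides find_all(), find() returns only the first match (or None), and select()/select_one() accept CSS selectors; a short sketch on the same document:

    print(soup.find('a'))                 # first <a> tag, or None if no match
    print(soup.find('a', id='link2'))     # keyword filters work here too
    print(soup.select('p.course a'))      # CSS selector: all <a> inside <p class="course">
    print(soup.select_one('#link1'))      # first element matching the selector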

    A small worked example

    import requests
    from bs4 import BeautifulSoup
    import bs4

    # Fetch the page content for a given URL
    def getHtmlText(url):
        try:
            r = requests.get(url, timeout=30)
            r.encoding = r.apparent_encoding
            return r.text
        except Exception:
            return ""

    # Extract the ranking data from the page
    def fillunivList(ulist, html):
        soup = BeautifulSoup(html, "html.parser")
        for tr in soup.find('tbody').children:
            if isinstance(tr, bs4.element.Tag):
                tds = tr('td')   # tr('td') is shorthand for tr.find_all('td')
                ulist.append([tds[0].string, tds[1].string, tds[2].string, tds[3].string])

    # Print the results
    def printUnivList(ulist, num):
        # Alternative layout: use chr(12288) (a full-width space) as the fill character
        # so the Chinese column headers line up:
        # tplt = "{0:^10}\t{1:{4}^10}\t{2:^10}\t{3:^10}"
        # print(tplt.format('排名', '学校名称', '省份', '总分', chr(12288)))
        # for i in range(num):
        #     u = ulist[i]
        #     print(tplt.format(u[0], u[1], u[2], u[3], chr(12288)))
        # Column headers: rank, university name, region, total score
        print("{:^10}\t{:^6}\t{:^10}\t{:^10}".format('排名', '学校名称', '地区', '总分'))
        for i in range(num):
            u = ulist[i]
            print("{:^10}\t{:^6}\t{:^10}\t{:^10}".format(u[0], u[1], u[2], u[3]))
        return

    def main():
        unifo = []
        url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2019.html'
        html = getHtmlText(url)
        fillunivList(unifo, html)
        printUnivList(unifo, 20)   # print the top 20 universities

    main()
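    Because the ranking page may change or become unavailable, the parsing logic in fillunivList can be exercised offline against a small hand-written table; a minimal sketch (the HTML snippet below is made up for illustration):

    sample_html = """
    <table><tbody>
      <tr><td>1</td><td>清华大学</td><td>北京</td><td>95.3</td></tr>
      <tr><td>2</td><td>北京大学</td><td>北京</td><td>91.5</td></tr>
    </tbody></table>
    """
    test_list = []
    fillunivList(test_list, sample_html)
    printUnivList(test_list, 2)   # should print the two hand-written rows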
  • Original post: https://www.cnblogs.com/zqxFly/p/12496451.html