  • python3 crawler (find_all usage and more)

    # Contents of read1.html
    # <html><head><title>The Dormouse's story</title></head>
    # <body>
    # <p class="title"><b>The Dormouse's story</b></p>
    #
    # <p class="story">Once upon a time there were three little sisters; and their names were
    # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
    # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    # and they lived at the bottom of a well.</p>
    #
    # <p class="story">...</p></body></html>
    
    
    #!/usr/bin/env python3
    # -*- coding: UTF-8 -*-
    
    import os
    import re
    from bs4 import NavigableString
    from bs4 import BeautifulSoup
    
    curpath = os.path.dirname(os.path.realpath(__file__))
    htmlpath = os.path.join(curpath, 'read1.html')
    
    # requests.get() cannot fetch a bare filesystem path (it has no URL scheme),
    # so read the local file directly instead
    with open(htmlpath, 'r', encoding='utf-8') as f:
        content = f.read()
    
    soup = BeautifulSoup(content, features="html.parser")
    
    for s in soup.stripped_strings:  # avoid shadowing the built-in str
        print(repr(s))
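    
    # Not in the original post: get_text() is the one-shot alternative to
    # iterating stripped_strings; separator and strip are standard bs4 arguments.
    print(soup.get_text(separator=" ", strip=True))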
    
    links = soup.find_all(class_="sister")
    
    # find_all() returns a ResultSet, which has no .parents or .next_sibling;
    # navigation attributes live on individual tags
    first_link = links[0]
    for parent in first_link.parents:
        if parent is None:
            print(parent)
        else:
            print(parent.name)
    
    print(first_link.next_sibling)
    
    for link in links:
        print(link.next_element)
    print(link.next_sibling)  # 'link' is the last tag from the loop above
    
    print(link.previous_element)
    print(link.previous_sibling)
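    
    # For completeness (not shown in the post): .children and .descendants are
    # the downward counterparts of .parents; a minimal sketch on the same soup.
    for child in soup.body.children:  # direct children only
        if not isinstance(child, NavigableString):
            print(child.name)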
    
    # Custom filter functions: find_all() calls these with a tag (or with an
    # attribute value, when passed as a keyword argument) and keeps matches
    def has_class_no_id(tag):
        return tag.has_attr('class') and not tag.has_attr('id')
    
    def not_lacie(href):
        return href and not re.compile("lacie").search(href)
    
    def not_tillie(href):
        return href and not re.compile("tillie").search(href)
    
    def not_tillie1(id):
        return id and not re.compile("link2").search(id)
    
    # lxml is a third-party parser (pip install lxml); BeautifulSoup reads the
    # file eagerly, so the handle can be closed right away
    with open("soup.html", "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, features="lxml")
    
    # find_all usage
    tags = soup.find_all(re.compile('^b'))  # tag names starting with 'b' (body, b)
    tags = soup.find_all('b')               # a single tag name
    tags = soup.find_all(['a', 'b'])        # any of several tag names
    tags = soup.find_all(has_class_no_id)   # custom tag filter
    tags = soup.find_all(True)              # every tag in the document
    tags = soup.find_all(href=not_lacie)    # attribute-value filter
    for tag in tags:
        print(tag.name)
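    
    # Two more standard find_all parameters, not covered above: limit caps the
    # number of results, and string matches text nodes instead of tags.
    tags = soup.find_all("a", limit=2)                 # at most two <a> tags
    texts = soup.find_all(string=re.compile("Elsie"))  # matching NavigableStrings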
    
    def surrounded_by_strings(tag):
        # True when a tag sits directly between two text nodes (unused below)
        return (isinstance(tag.next_element, NavigableString)
                and isinstance(tag.previous_element, NavigableString))
    
    tags=soup.find_all(id=not_tillie1)
    for tag in tags:
        print(tag)
    
    tags=soup.find_all(attrs={"id":"link3"})
    for tag in tags:
        print(tag)
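    
    # Name and attribute filters combine freely; a small illustrative sketch:
    tags = soup.find_all("a", class_="sister", href=re.compile("example"))
    for tag in tags:
        print(tag["id"])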
    
    soup.find_all(recursive=False)  # search direct children of the document only
    
    # CSS selectors via select()
    tags = soup.select("body a")            # descendant combinator
    tags = soup.select("p > a")             # direct child
    tags = soup.select("p > #link1")        # direct child with a given id
    tags = soup.select("html head title")   # nested descendants
    tags = soup.select(".sister")           # by CSS class
    tags = soup.select("[class~=sister]")   # class attribute contains the word
    tags = soup.select("#link1 + .sister")  # adjacent sibling
    tags = soup.select("#link1")            # by id
    tags = soup.select("a#link1")           # tag name plus id
    tags = soup.select("a[href]")           # tags that have an href attribute
    tags = soup.select('a[href^="http://example"]')  # href starts with
    tags = soup.select('a[href$="tillie"]')          # href ends with
    tags = soup.select('a[href*=".com/el"]')         # href contains
    for tag in tags:
        print(tag)
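    
    # Not in the original post: select_one() returns the first match (or None)
    # instead of a list; a minimal sketch of the same id lookup.
    tag = soup.select_one("#link1")
    if tag is not None:
        print(tag["href"])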
    
    # a file handle is exhausted after one parse, so parse it exactly once
    with open("soup.html", "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, features="html.parser")
    print(soup.prettify())
    print(type(soup))
    print(type(soup.title))
    print(type(soup.title.string))
    print(type(soup.b.string))
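    
    # An aside on string types: HTML comments come back as bs4.Comment, a
    # NavigableString subclass. A tiny self-contained illustration:
    comment_soup = BeautifulSoup("<b><!--hidden--></b>", "html.parser")
    print(type(comment_soup.b.string))  # <class 'bs4.element.Comment'>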
    
    print(soup.head.name)
    print(soup.title.name)
    print(soup.a.name)
    print(soup.name)
    
    tag=soup.a
    print(tag["href"])
    print(tag.string)
    print(tag["class"])
    print(tag.attrs)
    
    print(soup.title.string)
    print(soup.title.name)
    print(soup.p.attrs)
    print(soup.a.attrs)
    print(soup.a["class"])
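    
    # tag["class"] raises KeyError when the attribute is missing; tag.get() is
    # the safe lookup. 'rel' here is just an attribute absent from the sample.
    print(soup.a.get("class"))        # same result as soup.a["class"]
    print(soup.a.get("rel", "none"))  # default returned when absent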
  • Original post: https://www.cnblogs.com/Anderson-An/p/10275018.html