zoukankan      html  css  js  c++  java
  • python3爬虫(find_all用法等)

    #read1.html文件
    # <html><head><title>The Dormouse's story</title></head>
    # <body>
    # <p class="title"><b>The Dormouse's story</b></p>
    #
    # <p class="story">Once upon a time there were three little sisters; and their names were
    # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
    # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    # and they lived at the bottom of a well.</p>
    #
    # <p class="story">...</p></body></html>
    
    
    #!/usr/bin/env python
    # # -*- coding:UTF-8 -*-
    
    import os
    import re
    import requests
    from bs4 import NavigableString
    from bs4 import BeautifulSoup
    
    # Locate read1.html next to this script.
    curpath = os.path.dirname(os.path.realpath(__file__))
    hmtlpath = os.path.join(curpath, 'read1.html')

    # BUG FIX: requests.get() cannot fetch a bare filesystem path -- it
    # raises requests.exceptions.MissingSchema because there is no URL
    # scheme.  Read the local file directly instead.
    with open(hmtlpath, encoding="utf-8") as fp:
        soup = BeautifulSoup(fp.read(), features="html.parser")

    # Print every text node with surrounding whitespace stripped.
    # (Loop variable renamed: the original shadowed the builtin `str`.)
    for text in soup.stripped_strings:
        print(repr(text))
    
    # Collect every tag with class="sister".  find_all returns a ResultSet
    # (a list of tags); the original called .parents / .next_sibling /
    # .next_element on the ResultSet itself, which raises AttributeError.
    # Tree navigation is per-tag, so navigate from individual tags.
    links = soup.find_all(class_="sister")

    if links:
        first = links[0]
        # Walk up the tree from the first link, printing each ancestor's name.
        for parent in first.parents:
            if parent is None:
                print(parent)
            else:
                print(parent.name)

        print(first.next_sibling)

        for link in links:
            print(link.next_element)
        last = links[-1]
        print(last.next_sibling)

        # BUG FIX: the original misspelled these as privous_element /
        # privous_sibling, which would raise AttributeError at runtime.
        print(last.previous_element)
        print(last.previous_sibling)
    
    def has_class_no_id(tag):
        """Tag filter for find_all: accept tags that carry a ``class``
        attribute but no ``id`` attribute."""
        has_class = tag.has_attr('class')
        has_id = tag.has_attr('id')
        return has_class and not has_id
    
    def not_lacie(href):
        """Attribute filter for find_all(href=...): accept hrefs that are
        non-empty and do not contain the substring 'lacie'."""
        if not href:
            return href
        return re.search("lacie", href) is None
    
    def not_tillie(href):
        """Attribute filter for find_all(href=...): accept hrefs that are
        non-empty and do not contain the substring 'tillie'."""
        if not href:
            return href
        return re.search("tillie", href) is None
    
    def not_tillie1(tag_id):
        """Attribute filter for find_all(id=...): accept id values that are
        non-empty and do not contain 'link2'.

        BUG FIX: parameter renamed from ``id``, which shadowed the builtin.
        find_all invokes attribute filters positionally, so callers are
        unaffected.
        """
        return tag_id and not re.compile("link2").search(tag_id)
    
    # BUG FIX: the file handle was opened and never closed (resource leak);
    # use a context manager so it is released deterministically.
    with open("soup.html", "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, features="lxml")

    # find_all accepts several matcher forms.  Each call below rebinds
    # ``tags``, so only the last result (href=not_lacie) is printed.
    tags = soup.find_all(re.compile('^b'))   # regex matched against tag names
    tags = soup.find_all('b')                # exact tag name
    tags = soup.find_all(['a', 'b'])         # any of several tag names
    tags = soup.find_all(has_class_no_id)    # predicate applied to each tag
    tags = soup.find_all(True)               # every tag in the document
    tags = soup.find_all(href=not_lacie)     # predicate applied to an attribute
    for tag in tags:
        print(tag.name)
    
    def surrounded_by_strings(tag):
        """True when the nodes immediately after and before ``tag`` in
        document order are both plain-text (NavigableString) nodes."""
        following = tag.next_element
        preceding = tag.previous_element
        if not isinstance(following, NavigableString):
            return False
        return isinstance(preceding, NavigableString)
    
    # find_all with a predicate applied to the id attribute.
    tags=soup.find_all(id=not_tillie1)
    for tag in tags:
        print(tag)
    
    # find_all with an explicit attribute dict (exact match on id).
    tags=soup.find_all(attrs={"id":"link3"})
    for tag in tags:
        print(tag)
    
    # recursive=False searches direct children only; result is discarded
    # here (demonstration call).
    soup.find_all(recursive=False)
    # CSS-selector equivalents.  Each call rebinds ``tags``; only the last
    # result (the attribute-substring selector) is printed by the loop below.
    tags=soup.select("body a")                  # descendant combinator
    tags=soup.select("p > a")                   # direct-child combinator
    tags=soup.select("p > #link1")              # child with a given id
    tags=soup.select("html head title")         # nested descendants
    tags=soup.select(".sister")                 # by class
    tags=soup.select("[class~=sister]")         # class attribute contains word
    tags=soup.select("#link1 + .sister")        # adjacent sibling
    tags=soup.select("#link1")                  # by id
    tags=soup.select("a#link1")                 # tag name + id
    tags=soup.select("a[href]")                 # tags having an attribute
    tags=soup.select('a[href^="http://example"]')  # attribute prefix match
    tags=soup.select('a[href$="tillie"]')       # attribute suffix match
    tags=soup.select('a[href*=".com/el"]')      # attribute substring match
    for tag in tags:
        print(tag)
    
    # BUG FIX: the original parsed the same open handle twice; the second
    # BeautifulSoup(file, ...) call read an already-exhausted stream and
    # produced an empty document, so soup.title / soup.b below would be
    # None and the attribute accesses would raise AttributeError.  Parse
    # once, inside a context manager so the handle is closed.
    with open("soup.html", "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, features="html.parser")
    print(soup.prettify())
    # Object types: BeautifulSoup, Tag, NavigableString.
    print(type(soup))
    print(type(soup.title))
    print(type(soup.title.string))
    print(type(soup.b.string))
    
    # Tag names; soup.name is the document's own name.
    print(soup.head.name)
    print(soup.title.name)
    print(soup.a.name)
    print(soup.name)
    
    # Attribute access on the first <a> tag: subscript, .string, .attrs.
    tag = soup.a
    print(tag["href"])
    print(tag.string)
    print(tag["class"])
    print(tag.attrs)
    
    print(soup.title.string)
    print(soup.title.name)
    print(soup.p.attrs)
    print(soup.a.attrs)
    print(soup.a["class"])
  • 相关阅读:
    LeetCode: Copy List with Random Pointer
    LeetCode: Clone Graph
    LeetCode: Candy
    Database: Normal form
    Algorithm: cartesian tree
    【阿里云产品公测】云引擎ACE初体验
    【阿里云产品公测】Opensearch使用体验和评测
    【阿里云产品公测】阿里云OpenSearch初次使用评测
    【阿里云产品公测】OpenSearch初探
    【阿里云产品公测】弹性伸缩服务ESS之试用初体验
  • 原文地址:https://www.cnblogs.com/Anderson-An/p/10275018.html
Copyright © 2011-2022 走看看