zoukankan      html  css  js  c++  java
  • beautifulsoup的一些使用

    自动补全代码:

    import requests
    from bs4 import BeautifulSoup

    # Fetch an article page and parse it with the lxml parser.
    # BeautifulSoup repairs incomplete HTML, so prettify() shows the
    # auto-completed document tree.
    response = requests.get('https://www.ithome.com/html/it/340684.htm', timeout=9)
    soup = BeautifulSoup(response.content, 'lxml')
    print(soup.prettify())      # pretty-printed (and auto-repaired) HTML
    print(soup.title.string)    # text content of the <title> tag

    查找标签

    #基本使用
    soup.title#<title>xxxxxxx</title>
    soup.title.string#xxxxxx

    获取名称

    #基本使用
    soup.title#<title>xxxxxxx</title>
    soup.title.name#title

    获取属性

    #基本使用
    soup.a#<a>xxxxxxx</a>
    soup.a['name']#a标签的name属性值

    获取内容

    soup.title.string#xxxxxx

    嵌套选择

    print(soup.head.title.string)

    子节点

    import requests
    from bs4 import BeautifulSoup

    # Child nodes: .contents returns the direct children of the first <div>
    # as a plain list (including whitespace text nodes).
    response = requests.get('https://www.ithome.com/html/it/340684.htm', timeout=9)
    soup = BeautifulSoup(response.content, 'lxml')
    print(soup.div.contents)

    或者

    import requests
    from bs4 import BeautifulSoup

    # Child nodes, variant two: .children is a generator over the direct
    # children of the first <div> (same nodes as .contents, but lazy).
    response = requests.get('https://www.ithome.com/html/it/340684.htm', timeout=9)
    soup = BeautifulSoup(response.content, 'lxml')
    a = soup.div.children
    print(a)                    # prints the generator object itself
    for i, j in enumerate(a):
        print(i, j)

    子孙节点

    import requests
    from bs4 import BeautifulSoup

    # Descendant nodes: .descendants is a generator that walks the whole
    # subtree of the first <div> (children, grandchildren, ...).
    response = requests.get('https://www.ithome.com/html/it/340684.htm', timeout=9)
    soup = BeautifulSoup(response.content, 'lxml')
    a = soup.div.descendants
    print(a)                    # prints the generator object itself
    for i, j in enumerate(a):
        print(i, j)

    获取父节点

    import requests
    from bs4 import BeautifulSoup

    # Parent node: .parent returns the single direct parent of the first <div>.
    response = requests.get('https://www.ithome.com/html/it/340684.htm', timeout=9)
    soup = BeautifulSoup(response.content, 'lxml')
    a = soup.div.parent
    print(a)

    获取祖先节点

    import requests
    from bs4 import BeautifulSoup

    # Ancestor nodes: .parents is a generator that climbs from the first
    # <div> up to the document root, yielding each enclosing tag.
    response = requests.get('https://www.ithome.com/html/it/340684.htm', timeout=9)
    soup = BeautifulSoup(response.content, 'lxml')
    a = soup.div.parents
    for i, j in enumerate(a):
        print(i, j)

    获取兄弟节点

    a=soup.div.next_siblings# siblings that come AFTER the tag (generator/iterator)

    前面的兄弟节点

    # BUG FIX: the heading asks for PRECEDING siblings, but the original line
    # reused next_siblings; previous_siblings yields the tags before <div>.
    a=soup.div.previous_siblings# siblings that come BEFORE the tag (generator/iterator)

    标准选择器
    find_all(name,attrs,recursive,**kwargs)

    name

    import requests
    from bs4 import BeautifulSoup

    # find_all by tag name. The original fetched and parsed the same page
    # twice with duplicated imports; one fetch serves both examples.
    response = requests.get('https://www.ithome.com/html/it/340684.htm', timeout=9)
    soup = BeautifulSoup(response.content, 'lxml')

    # 1) find_all('ul') returns every <ul> element in the document.
    print(soup.find_all('ul'))

    # 2) find_all nests: search each <ul> for its <li> children.
    for ul in soup.find_all('ul'):
        for li in ul.find_all('li'):
            print(li)

    attrs

    import requests
    import re
    from bs4 import BeautifulSoup

    # Fetch the article and save every image whose tag carries class="lazy",
    # e.g. <img class="lazy" src="....png">.
    response = requests.get('https://www.ithome.com/html/it/340684.htm', timeout=9)
    soup = BeautifulSoup(response.content, 'lxml')
    # attrs matches on arbitrary attributes; here: any tag with class "lazy".
    a = soup.find_all(attrs={'class': 'lazy'})
    for index, i in enumerate(a):
        # BUG FIX: the original pattern was r'[a-zA-z]+://[^s]*png'.
        # [a-zA-z] spans punctuation between 'Z' and 'a', and [^s] means
        # "any char except the letter s", not "non-whitespace".
        result = re.findall(r'[a-zA-Z]+://\S*png', str(i))
        if not result:          # skip tags whose markup holds no .png URL
            continue
        url = result[0]
        res = requests.get(url)
        with open('%d.png' % index, 'wb') as f:
            f.write(res.content)
    import requests
    import re
    from bs4 import BeautifulSoup

    # Keyword-argument shortcuts for find_all.
    response = requests.get('https://www.ithome.com/html/it/340684.htm', timeout=9)
    soup = BeautifulSoup(response.content, 'lxml')
    a = soup.find_all(class_='lazy')  # class_ avoids the reserved word "class"
    a = soup.find_all(id='lazy')      # id= matches the id attribute
                                      # (NOTE: overwrites the previous result)

    CSS选择器

    import os
    import requests
    import re
    from bs4 import BeautifulSoup

    # Same image download as above, but selecting elements with a CSS
    # class selector instead of find_all.
    response = requests.get('https://www.ithome.com/html/it/340684.htm', timeout=9)
    soup = BeautifulSoup(response.content, 'lxml')
    a = soup.select('.lazy')          # CSS selector: every element with class "lazy"

    # BUG FIX: the original open() crashed when ./test did not exist.
    os.makedirs('./test', exist_ok=True)
    for index, i in enumerate(a):
        # BUG FIX: [a-zA-Z] (not [a-zA-z]) and \S (not [^s]) — see the
        # attrs example above for the explanation of the original typos.
        result = re.findall(r'[a-zA-Z]+://\S*png', str(i))
        if not result:                # no .png URL in this tag's markup
            continue
        url = result[0]
        res = requests.get(url)
        with open('./test/%d.png' % (index + 1), 'wb') as f:
            f.write(res.content)

    获取css属性

    import requests
    import re
    from bs4 import BeautifulSoup

    headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36"
    }
    # Zhihu rejects the default requests User-Agent, so a browser UA is sent.
    response = requests.get('https://www.zhihu.com/question/20519068/answer/215288567', headers=headers, timeout=None)
    soup = BeautifulSoup(response.content, 'lxml')
    a = soup.select('.lazy')
    for index, i in enumerate(a):
        # Lazy-loaded images keep the real URL in the data-original attribute;
        # tag['attr'] reads any attribute from the element.
        url = i['data-original']
        # To actually download each image, uncomment:
        # res = requests.get(url)
        # with open('./test/%d.jpg' % (index + 1), 'wb') as f:
        #     f.write(res.content)

    获取内容

    li.get_text()

  • 相关阅读:
    将自己数据转化为cifar10支持的lmdb
    python实现cifar10数据集的可视化
    Python OS 文件/目录方法
    象棋AI算法(二)
    象棋AI算法(一)
    围棋人机大战中阿尔法狗原理解析,左右互搏,青出于蓝而胜于蓝?
    电脑开机停留在主板设置界面,进不了系统
    哪一种编程语言适合人工智能?
    BIOS设置图解教程-看完就没有不明白的了
    关于AndroidStudio的打包数字签名以及多渠道发布
  • 原文地址:https://www.cnblogs.com/wang666/p/8125837.html
Copyright © 2011-2022 走看看