zoukankan      html  css  js  c++  java
  • pyquery解析库

    语法和 jQuery 几乎一致

    安装

    conda install pyquery

    一、初始化

    标准用法

    import requests
    from pyquery import PyQuery as pq

    # Download the page first; the timeout keeps the script from hanging
    # forever on a stalled connection.
    r = requests.get(url='http://www.baidu.com', timeout=10)
    # Stop on 4xx/5xx responses instead of parsing an error page.
    r.raise_for_status()
    # When the response headers carry no charset, requests decodes r.text as
    # ISO-8859-1 and Chinese text comes out garbled; switch to the encoding
    # detected from the body before reading r.text.
    r.encoding = r.apparent_encoding

    # Hand the decoded HTML to PyQuery for parsing.
    html_doc = pq(r.text)
    print(html_doc)
    # All <a> elements inside the element with id "u1".
    print(html_doc('#u1 a'))

    1、字符串初始化(最常用)

    from pyquery import PyQuery as pq

    # Sample markup; the document is built directly from this string.
    markup = '''<div>
        <ul id = 'haha'>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
         </div>'''

    # Passing the markup string to pq() yields the parsed document object.
    parsed = pq(markup)
    # Print the document itself, then the Python type of the object.
    for shown in (parsed, type(parsed)):
        print(shown)

    2、url初始化

    from pyquery import PyQuery as pq

    # Initialise from a URL: the address is passed via the url= keyword.
    page = pq(url='http://www.baidu.com')
    print(page)

    # Select the <a> elements located under the element with id "u1".
    nav_links = page('#u1 a')
    print(nav_links)

    注意:一般先通过 requests 模块或 urllib 获取网页的 HTML,再交给解析模块去解析

    3、文件初始化

    from pyquery import PyQuery as pq

    # Initialise from a file: the path is passed via the filename= keyword
    # (a relative path such as this one is resolved by the running process).
    file_doc = pq(filename='test.html')
    print(file_doc)

    二、基本CSS选择器

    from pyquery import PyQuery as pq

    # Selector cheat sheet:
    #   id     -> '#name'
    #   class  -> '.name'
    #   tag    -> 'tagname'
    markup = '''<div>
        <ul id = 'con'>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
         </div>'''

    # Parse the markup and apply a descendant selector in one expression:
    # <li> elements under the element with id "con", itself under a <div>.
    items = pq(markup)('div #con li')
    print(items)

    三、查找节点

    1、子节点

    find() 最常用的方法

    from pyquery import PyQuery as pq

    markup = '''<div>
        <ul id = 'con'>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
         </div>'''

    # Start from the <div> selection ...
    container = pq(markup)('div')
    # ... and use find() to search beneath it for <li> elements that carry
    # the "active" class.
    active_items = container.find('li.active')
    print(active_items)

    children() 查找所有子节点,children('') 查找指定的子节点

    from pyquery import PyQuery as pq

    markup = '''<div>
        <ul id = 'con'>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
         </div>'''

    container = pq(markup)('div')

    # children() without arguments: every child node of the <div>.
    all_children = container.children()
    print(all_children)

    # children(selector): restrict the result with a CSS selector, here
    # nodes with class "item-0" under the element with id "con".
    item_0_nodes = container.children('#con .item-0')
    print(item_0_nodes)

    2、父节点

    parent() 父节点 parents() 祖节点 parents('') 含有某些选择器祖节点

    from pyquery import PyQuery as pq

    markup = '''<div>
        <ul id = 'con'>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
         </div>'''

    # Every <li> node under the element with id "con".
    items = pq(markup)('#con li')

    # parent(): the direct parent of the <li> nodes.
    direct_parent = items.parent()
    # print(direct_parent)   # uncomment to inspect

    # parents(): all ancestors, the direct parent included.
    ancestors = items.parents()
    # print(ancestors)       # uncomment to inspect

    # parents(selector): only the ancestors matching the selector,
    # here the one with id="con".
    con_ancestor = items.parents('#con')
    print(con_ancestor)

    3、兄弟节点

    siblings() 所有兄弟姊妹节点,siblings('') 含有指定css选择器的兄弟节点

    from pyquery import PyQuery as pq

    markup = '''<div>
        <ul id = 'con'>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
         </div>'''

    # The <li> node whose class attribute is "item-0 active".
    current = pq(markup)('#con li.item-0.active')

    # siblings() with no argument returns all sibling nodes (the node
    # itself excluded); uncomment to inspect:
    # print(current.siblings())

    # siblings(selector): only the siblings matching the CSS selector.
    matching_siblings = current.siblings('.item-1.active')
    print(matching_siblings)

    四、遍历

    1、单个节点

    from pyquery import PyQuery as pq

    markup = '''<div>
        <ul id = 'con'>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
         </div>'''

    # A selector that matches a single node can be printed directly,
    # no iteration needed.
    print(pq(markup)('#con li.item-0.active'))

    2、多个节点

    from pyquery import PyQuery as pq

    markup = '''<div>
        <ul id = 'con'>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
         </div>'''

    # A selection with several nodes is walked through items(), which
    # yields the matched nodes one at a time.
    for entry in pq(markup)('#con li').items():
        # end='' suppresses the extra newline print() would append.
        print(entry, end='')

    五、获取信息

    1、属性

    获取 设置

    from pyquery import PyQuery as pq

    markup = '''<div>
        <ul id = 'con'>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
         </div>'''

    # The <a> node inside the <li> with classes "item-0" and "active".
    link = pq(markup)('li.item-0.active a')

    # attr(name) reads an attribute.
    current_href = link.attr('href')
    print(current_href)

    # attr(name, value) writes it; read it back to show the change.
    link.attr('href', 'oj8k')
    updated_href = link.attr('href')
    print(updated_href)

    2、文本

    text() html()

    获取 设置

    from pyquery import PyQuery as pq

    markup = '''<div>
        <ul id = 'con'>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
         </div>'''

    node = pq(markup)('li.item-0.active')

    # Read: text() and html() called without arguments return the
    # node's current text / HTML content.
    original_text = node.text()
    print(original_text)
    original_html = node.html()
    print(original_html)

    # Write: passing a value replaces the content; read it back each
    # time to show the effect.
    node.text('Hello World')
    print(node.text())
    node.html('<a>打我</a>')
    print(node.html())

    注意:pyquery 的 html() 获取的是节点内部的 HTML,不包含节点本身(这一点与 jQuery 的 html() 相同);直接 print 节点得到的才是包含节点本身的 HTML

    六、操作DOM节点

    1、add_class()和remove_class()

    from pyquery import PyQuery as pq

    html_doc = '''<div>
        <ul id = 'con'>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
         </div>'''

    doc = pq(html_doc)
    # The <li> node that starts out with classes "item-0" and "active".
    li = doc('li.item-0.active')
    print(li)

    # remove_class(): drop the "active" class from the node.
    li.remove_class('active')
    print(li)

    # add_class(): put the "active" class back on the node.
    li.add_class('active')
    print(li)

    2、remove()

    作用:删除节点

    from pyquery import PyQuery as pq

    markup = '''<div>
        <ul id = 'con'>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
         </div>'''

    # Select the <li> node and show it before the removal.
    target = pq(markup)('li.item-0.active')
    print(target)

    # Select the <a> node within it and delete that node.
    target('a').remove()

    # Show the same <li> again, now that remove() has been called.
    print(target)

    七、伪类选择器

    from pyquery import PyQuery as pq

    html_doc = '''<div>
        <ul id = 'con'>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
         </div>'''

    doc = pq(html_doc)

    # :first-child -- the first <li>
    print(doc('li:first-child'))
    # :last-child -- the last <li>
    print(doc('li:last-child'))
    # :nth-child(2) -- the second <li> (nth-child counts from 1)
    print(doc('li:nth-child(2)'))
    # :nth-child(2n) -- every <li> at an even position
    print(doc('li:nth-child(2n)'))
    # :gt(2) -- <li> nodes whose index is greater than 2 (index counts from 0)
    print(doc('li:gt(2)'))
    # :contains(second) -- <li> nodes whose text contains "second"
    print(doc('li:contains(second)'))
  • 相关阅读:
    js-快速选择日期区间
    关于状态更新时间字段取值的问题
    MySql 前缀索引
    Java springMVC 多数据源的实现和使用
    哈哈哈,终于找到一个安稳的“家”了
    POJ 1724: Roads
    POJ 1221: UNIMODAL PALINDROMIC DECOMPOSITIONS
    createjs 用户画线 粗细bug Graphics setStrokeStyle() 粗细BUG
    GAudio是一个音频播放SDK
    新做的一个基于OPENGL的gui库
  • 原文地址:https://www.cnblogs.com/wt7018/p/11904944.html
Copyright © 2011-2022 走看看