zoukankan      html  css  js  c++  java
  • PyQuery的基本使用详解

    0.安装:pip3 install pyquery

    1.初始化

    1.字符串初始化

    # 字符串初始化
    html = """
    <div>
        <ul>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    """
    
    from pyquery import PyQuery as pq
    doc = pq(html)
    print(doc('li'))
    

    2.URL初始化

    from pyquery import PyQuery as pq
    doc = pq(url='http://www.baidu.com')
    print(doc('head'))
    

    3.文件初始化

    from pyquery import PyQuery as pq
    doc = pq(filename='demo.html')
    print(doc('li'))
    

    2.基本CSS选择器

    html = """
    <div id="container">
        <ul class='list'>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    """
    
    from pyquery import PyQuery as pq
    doc = pq(html)
    print(doc('#container .list li'))
    

    3.查找元素

    1.子元素

    html = """
    <div id="container">
        <ul class='list'>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    """
    
    from pyquery import PyQuery as pq
    doc = pq(html)
    items = doc('.list')
    print(items)
    
    # 查询li标签
    lis = items.find('li')
    print(lis)
    
    # 查询孩子
    lis = items.children()
    print(type(lis))
    print(lis)
    
    # 查询带有'.active'的孩子
    lis = items.children('.active')
    print(lis)
    

    2.父元素

    html = """
    <div class='wrap'>
        <div id="container">
            <ul class='list'>
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    """
    
    from pyquery import PyQuery as pq
    doc = pq(html)
    items = doc('.list')
    print(items)
    
    parents = items.parents()
    print(parents)
    
    parent = items.parents('.wrap')
    print(parent)
    

    3.兄弟元素

    html = """
    <div class='wrap'>
        <div id="container">
            <ul class='list'>
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    """
    
    from pyquery import PyQuery as pq
    doc = pq(html)
    
    items = doc('.list .item-0.active')
    print(items)
    print(items.siblings())
    print(items.siblings('.active'))
    

    4.遍历

    html = """
    <div class='wrap'>
        <div id="container">
            <ul class='list'>
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    """
    
    from pyquery import PyQuery as pq
    doc = pq(html)
    # 单个元素
    items = doc('.item-0.active')
    print(items)
    # 遍历元素
    lis = doc('li').items()
    print(lis)
    # <generator object PyQuery.items at 0x0000000003A84468>
    for item in lis:
        print(item)
    

    5.获取信息

    1.获取属性

    html = """
    <div class='wrap'>
        <div id="container">
            <ul class='list'>
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    """
    
    from pyquery import PyQuery as pq
    doc = pq(html)
    a = doc('.item-0.active a')
    print(a)
    print(a.attr('href'))
    print(a.attr.href)
    

    2.获取文本

    html = """
    <div class='wrap'>
        <div id="container">
            <ul class='list'>
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    """
    
    from pyquery import PyQuery as pq
    doc = pq(html)
    
    a = doc('.item-0.active a')
    print(a)
    print(a.text())
    

    3.获取HTML

    html = """
    <div class='wrap'>
        <div id="container">
            <ul class='list'>
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    """
    
    from pyquery import PyQuery as pq
    doc = pq(html)
    
    li = doc('.item-0.active')
    print(li)
    print(li.html())
    

    6.DOM操作

    1.addClass、removeClass

    html = """
    <div class='wrap'>
        <div id="container">
            <ul class='list'>
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    """
    
    from pyquery import PyQuery as pq
    doc = pq(html)
    
    li = doc('.item-0.active')
    print(li)
    
    li.removeClass('active')
    print(li)
    
    li.addClass('active')
    print(li)
    

    2. attr、css

    html = """
    <div class='wrap'>
        <div id="container">
            <ul class='list'>
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    """
    
    from pyquery import PyQuery as pq
    doc = pq(html)
    
    li = doc('.item-0.active')
    print(li)
    
    li.attr('name','apollo')
    print(li)
    
    li.css('font-size','14px')
    print(li)
    

    3. remove

    html = """
    <div class='wrap'>
        Hello World
        <p>This is paragraph</p>
    </div>
    """
    
    from pyquery import PyQuery as pq
    doc = pq(html)
    wrap = doc('.wrap')
    print('移除前:',wrap.text())
    wrap.find('p').remove()
    print('移除后:',wrap.text())
    

    4.其他DOM方法

    更多DOM操作方法请参考官方API文档:https://pyquery.readthedocs.io/en/latest/api.html
    

    7.伪类选择器


    html = """
    <div class='wrap'>
        <div id="container">
            <ul class='list'>
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    """
    
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('li:first-child')
    print(li)
    li = doc('li:last-child')
    print(li)
    li = doc('li:nth-child(2)')
    print(li)
    li = doc('li:gt(2)')
    print(li)
    li = doc('li:nth-child(2n)')
    print(li)
    li = doc('li:contains(second)')
    print(li)
    

    官网

    https://pyquery.readthedocs.io

  • 相关阅读:
    【Lintcode】112.Remove Duplicates from Sorted List
    【Lintcode】087.Remove Node in Binary Search Tree
    【Lintcode】011.Search Range in Binary Search Tree
    【Lintcode】095.Validate Binary Search Tree
    【Lintcode】069.Binary Tree Level Order Traversal
    【Lintcode】088.Lowest Common Ancestor
    【Lintcode】094.Binary Tree Maximum Path Sum
    【算法总结】二叉树
    库(静态库和动态库)
    从尾到头打印链表
  • 原文地址:https://www.cnblogs.com/apollo1616/p/10403950.html
Copyright © 2011-2022 走看看