zoukankan      html  css  js  c++  java
  • pyquery解析库的介绍和使用

    ### pyquery的介绍和使用
    
    ## Sample HTML shared by the examples that follow (they all parse `text`)
    text = '''
    <html><head><title>there is money</title></head>
    <body>
    <p class="title" name="dmr"><b>there is money</b>contents</p>
    <p class="money">good good study, day day up
    <a href="https://www.baidu.com/1" class="error" id="l1"><span><!-- 1 --></span></a>,
    <a href="https://www.baidu.com/2" class="error ed2" id="l2"><span>2</span></a> and 
    <a href="https://www.baidu.com/3" class="error" id="l3">3</a>;
    66666666666
    </p>
    <p class='body'>...</p>
    '''

    1. pyquery对象初始化,html字符串,url,file皆可

    ## Building a PyQuery object: from an HTML string, a URL, or a file
    from pyquery import PyQuery as pq
    import requests
    
    # Build from an HTML string.
    doc = pq(text)
    print(doc('a'))
    # Build from a URL: download the page with requests and parse the body.
    # requests applies no timeout by default, so one is passed explicitly to
    # keep a stalled connection from hanging the script; raise_for_status()
    # stops an HTTP error page from being parsed as the real document.
    response = requests.get('https://www.baidu.com', timeout=10)
    response.raise_for_status()
    doc = pq(response.text)
    print(doc('title'))
    # Build from a file. Author's note (not verified here): the file is read
    # as GBK, so unrecognised characters raise an error; opening the file
    # yourself with encoding='utf-8' is the suggested workaround.
    # doc = pq(filename='text')
    # print(doc('li'))

    2. 基本CSS选择器

    ## Basic CSS selectors
    from pyquery import PyQuery as pq
    
    doc = pq(text)
    print(type(doc))
    # Descendant selectors: every <a> under .money, then only the node with
    # id "l1" under .money.
    for selector in ('.money a', '.money #l1'):
        print(doc(selector))
    '''
    输出内容:
    <class 'pyquery.pyquery.PyQuery'>
    <a href="https://www.baidu.com/1" class="error" id="l1"><span><!-- 1 --></span></a>,
    <a href="https://www.baidu.com/2" class="error ed2" id="l2"><span>2</span></a> and 
    <a href="https://www.baidu.com/3" class="error" id="l3">3</a>;
    66666666666
    
    <a href="https://www.baidu.com/1" class="error" id="l1"><span><!-- 1 --></span></a>,
    '''
    输出内容

    3. 查找节点,返回一个PyQuery对象,当匹配到多个节点时,PyQuery对象值为多个节点的字符串整合

    ## Finding nodes: every lookup returns a PyQuery object; when several nodes
    ## match, printing it shows the markup of all of them joined together
    from pyquery import PyQuery as pq
    
    separator = '--------------------分隔符------------------'
    doc = pq(text)
    paragraphs = doc('p')
    print(paragraphs)
    print(type(paragraphs))
    # Children of the matched <p> nodes, filtered by selector.
    link = paragraphs.children('#l2')
    print(link)
    print(separator)
    # Direct parent of the matched nodes.
    parent = paragraphs.parent()
    print(parent)
    print(separator)
    # Ancestors, filtered by selector.
    ancestors = paragraphs.parents('html')
    print(ancestors)
    print(separator)
    ## Siblings of the #l2 node found inside the matched <p> nodes.
    neighbours = paragraphs('#l2').siblings()
    print(neighbours)
    '''
    输出内容:
    <p class="title" name="dmr"><b>there is money</b>contents</p>
    <p class="money">good good study, day day up
    <a href="https://www.baidu.com/1" class="error" id="l1"><span><!-- 1 --></span></a>,
    <a href="https://www.baidu.com/2" class="error ed2" id="l2"><span>2</span></a> and 
    <a href="https://www.baidu.com/3" class="error" id="l3">3</a>;
    66666666666
    </p>
    <p class="body">...</p>
    
    <class 'pyquery.pyquery.PyQuery'>
    <a href="https://www.baidu.com/2" class="error ed2" id="l2"><span>2</span></a> and 
    
    --------------------分隔符------------------
    <body>
    <p class="title" name="dmr"><b>there is money</b>contents</p>
    <p class="money">good good study, day day up
    <a href="https://www.baidu.com/1" class="error" id="l1"><span><!-- 1 --></span></a>,
    <a href="https://www.baidu.com/2" class="error ed2" id="l2"><span>2</span></a> and 
    <a href="https://www.baidu.com/3" class="error" id="l3">3</a>;
    66666666666
    </p>
    <p class="body">...</p>
    </body>
    --------------------分隔符------------------
    <html><head><title>there is money</title></head>
    <body>
    <p class="title" name="dmr"><b>there is money</b>contents</p>
    <p class="money">good good study, day day up
    <a href="https://www.baidu.com/1" class="error" id="l1"><span><!-- 1 --></span></a>,
    <a href="https://www.baidu.com/2" class="error ed2" id="l2"><span>2</span></a> and 
    <a href="https://www.baidu.com/3" class="error" id="l3">3</a>;
    66666666666
    </p>
    <p class="body">...</p>
    </body></html>
    --------------------分隔符------------------
    <a href="https://www.baidu.com/1" class="error" id="l1"><span><!-- 1 --></span></a>,
    <a href="https://www.baidu.com/3" class="error" id="l3">3</a>;
    66666666666
    '''
    输出内容

    4. 遍历,通过PyQuery对象的items方法可以把匹配到多个节点的PyQuery对象构造成一个生成器

    ## Iteration: .items() turns a multi-node PyQuery match into a generator
    ## that yields one PyQuery object per matched node
    from pyquery import PyQuery as pq
    
    # Parse the sample markup directly instead of re-wrapping a `doc` left
    # over from an earlier snippet, so this example also runs on its own.
    doc = pq(text)
    print(doc('a'))
    items = doc('a').items()
    print(type(items))
    for i, item in enumerate(items):
        print(i, item, type(item))
    '''
    输出内容:
    <a href="https://www.baidu.com/1" class="error" id="l1"><span><!-- 1 --></span></a>,
    <a href="https://www.baidu.com/2" class="error ed2" id="l2"><span>2</span></a> and 
    <a href="https://www.baidu.com/3" class="error" id="l3">3</a>;
    66666666666
    
    <class 'generator'>
    0 <a href="https://www.baidu.com/1" class="error" id="l1"><span><!-- 1 --></span></a>,
     <class 'pyquery.pyquery.PyQuery'>
    1 <a href="https://www.baidu.com/2" class="error ed2" id="l2"><span>2</span></a> and 
     <class 'pyquery.pyquery.PyQuery'>
    2 <a href="https://www.baidu.com/3" class="error" id="l3">3</a>;
    66666666666
     <class 'pyquery.pyquery.PyQuery'>
    '''
    输出内容

    5. 获取属性和文本

    ## Reading attributes and text
    from pyquery import PyQuery as pq
    
    # Parse the sample markup directly instead of re-wrapping a `doc` left
    # over from an earlier snippet, so this example also runs on its own.
    doc = pq(text)
    # Reading an attribute. When the selector matches several nodes, iterate
    # the match with .items() and read the attribute from each node.
    a = doc('.error.ed2')
    print(a, type(a))
    # Two equivalent spellings: method call and attribute access.
    print(a.attr('href'))
    print(a.attr.href)
    
    '''
    输出结果:
    <a href="https://www.baidu.com/2" class="error ed2" id="l2"><span>2</span></a> and 
     <class 'pyquery.pyquery.PyQuery'>
    https://www.baidu.com/2
    https://www.baidu.com/2
    '''
    
    ## Reading text
    from pyquery import PyQuery as pq
    
    doc = pq(text)
    anchors = doc('a.error')
    print(anchors)
    # The first <a> holds only an HTML comment, so it contributes no text.
    print(anchors.text())  # text content of the matched nodes only
    print(anchors.html())  # inner markup, tags included
    # Same two calls, node by node.
    for i, item in enumerate(doc('a.error').items()):
        kind = type(item)
        print(i, kind, item.text())
        print(i, kind, item.html())
    
    '''
    输出内容:
    <a href="https://www.baidu.com/1" class="error" id="l1"><span><!-- 1 --></span></a>,
    <a href="https://www.baidu.com/2" class="error ed2" id="l2"><span>2</span></a> and 
    <a href="https://www.baidu.com/3" class="error" id="l3">3</a>;
    66666666666
    
     2 3
    <span><!-- 1 --></span>
    0 <class 'pyquery.pyquery.PyQuery'> 
    0 <class 'pyquery.pyquery.PyQuery'> <span><!-- 1 --></span>
    1 <class 'pyquery.pyquery.PyQuery'> 2
    1 <class 'pyquery.pyquery.PyQuery'> <span>2</span>
    2 <class 'pyquery.pyquery.PyQuery'> 3
    2 <class 'pyquery.pyquery.PyQuery'> 3
    '''

    6. 节点操作

    ## Node manipulation
    from pyquery import PyQuery as pq
    
    doc = pq(text)
    p = doc('.title')
    # The statements below had been fused into their comment lines, so none
    # of them ran; they are restored one per line to match the ten lines of
    # sample output recorded after this snippet.
    # add_class / removeClass: add or remove a value of the class attribute.
    p.add_class('admin')
    print(p.attr.class_)
    p.removeClass('title')
    print(p.attr('class'))
    # attr, text and html: read, then overwrite, an attribute, the text and
    # the inner HTML.
    print(p.attr.name)
    print(p.text())
    print(p.html())
    p.attr('name', 'test')
    print(p.attr.name)
    p.text('change text')
    print(p.text())
    p.html('<span>change html</span>')
    print(p.html())
    # remove: drop a child node. The document is re-parsed first because the
    # calls above have already rewritten the .title paragraph.
    doc = pq(text)
    p = doc('.title')
    print(p.html())
    p.remove('b')
    print(p.html())
    '''
    输出内容:
    title admin
    admin
    dmr
    there is money
    <b>there is money</b>
    test
    change text
    <span>change html</span>
    <b>there is money</b>contents
     contents
    '''
    输出内容

    7. 伪类选择器

    # Pseudo-class selectors
    from pyquery import PyQuery as pq
    
    doc = pq(text)
    # Each selector is queried and printed in turn, in the original order.
    pseudo_selectors = (
        'a:first-child',     # the first <a> node
        'a:last-child',      # the last <a> node
        'a:nth-child(2)',    # the second <a> node
        'a:gt(0)',           # the nodes after index 0
        'a:nth-child(2n)',   # the nodes at even positions
        'a:contains("3")',   # the nodes whose text contains "3"
    )
    for selector in pseudo_selectors:
        a = doc(selector)
        print(a)
    
    '''
    输出内容:
    <a href="https://www.baidu.com/1" class="error" id="l1"><span><!-- 1 --></span></a>,
    
    <a href="https://www.baidu.com/3" class="error" id="l3">3</a>;
    66666666666
    
    <a href="https://www.baidu.com/2" class="error ed2" id="l2"><span>2</span></a> and 
    
    <a href="https://www.baidu.com/2" class="error ed2" id="l2"><span>2</span></a> and 
    <a href="https://www.baidu.com/3" class="error" id="l3">3</a>;
    66666666666
    
    <a href="https://www.baidu.com/2" class="error ed2" id="l2"><span>2</span></a> and 
    
    <a href="https://www.baidu.com/3" class="error" id="l3">3</a>;
    66666666666
    '''
  • 相关阅读:
    数据结构-栈与队列
    数据结构-选择排序
    数据结构-冒泡排序
    数据结构-插入排序
    mysql安装最后一步不响应解决
    ScvQ常用的网站(持续更新...)
    排序算法(二)
    排序算法(一)
    原码、反码、补码
    进制转换
  • 原文地址:https://www.cnblogs.com/Caiyundo/p/12507277.html
Copyright © 2011-2022 走看看