zoukankan      html  css  js  c++  java
  • 使用pyquery

    • 简单举例

       1 from pyquery import PyQuery as pq
       2 
       3 html = '''
       4 <div>
       5 <ul>
       6 <li class="item-O"><a href="linkl.html">first item</a></li>
       7 <li class="item-1"><a href="link2.html">second item</a></li>
       8 <li class="item-inactive"><a href="link3.html">third item</a></li>
       9 <li class="item-1"><a href="link4.html">fourth item</a></li>
      10 <li class="item-0"><a href="link5.html">fifth item</a>
      11 </ul>
      12 </div>
      13 '''
      14 
      15 doc = pq(html)
      16 print(doc)
      17 
      18 
      19 # 输出:
      20 <div>
      21 <ul>
      22 <li class="item-O"><a href="linkl.html">first item</a></li>
      23 <li class="item-1"><a href="link2.html">second item</a></li>
      24 <li class="item-inactive"><a href="link3.html">third item</a></li>
      25 <li class="item-1"><a href="link4.html">fourth item</a></li>
      26 <li class="item-0"><a href="link5.html">fifth item</a>
      27 </li></ul>
      28 </div>
      字符串
       1 from pyquery import PyQuery as pq
       2 import requests
       3 
       4 # doc1 与 doc2 功能相同
       5 doc1 = pq(url='https://www.cnblogs.com/liyihua/')
       6 print(doc1('title'))
       7 
       8 doc2 = pq(requests.get('https://www.cnblogs.com/liyihua/').text)
       9 print(doc2('title'))
      10 
      11 
      12 # 输出:
      13 <title>李亦华 - 博客园</title>&#13;
      14     
      15 <title>李亦华 - 博客园</title>&#13;
      16
      URL
       1 from pyquery import PyQuery as pq
       2 
       3 doc = pq(filename='test.html')
       4 print(doc('li'))
       5 
       6 
       7 # 输出:
       8 <li class="item-O"><a href="linkl.html">first item</a></li>
       9 <li class="item-1"><a href="link2.html">second item</a></li>
      10 <li class="item-inactive"><a href="link3.html">third item</a></li>
      11 <li class="item-1"><a href="link4.html">fourth item</a></li>
      12 <li class="item-0"><a href="link5.html">fifth item</a>
      13 </li>
      14 
      15 
      16 # 文件内容:
      17 <div>
      18 <ul>
      19 <li class="item-O"><a href="linkl.html">first item</a></li>
      20 <li class="item-1"><a href="link2.html">second item</a></li>
      21 <li class="item-inactive"><a href="link3.html">third item</a></li>
      22 <li class="item-1"><a href="link4.html">fourth item</a></li>
      23 <li class="item-0"><a href="link5.html">fifth item</a>
      24 </ul>
      25 </div>
      文件
    • 基本CSS选择器

       1 from pyquery import PyQuery as pq
       2 
       3 html = '''
       4 <div id="container">
       5     <ul class="list">
       6          <li class="item-0">first item</li>
       7          <li class="item-1"><a href="link2.html">second item</a></li>
       8          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
       9          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      10          <li class="item-0"><a href="link5.html">fifth item</a></li>
      11      </ul>
      12 </div>
      13 '''
      14 
      15 doc = pq(html)
      16 print(doc('#container .list li'))
      17 
      18 print(
      19     type(
      20         doc('#container .list li')
      21     )
      22 )
      23 
      24 
      25 # 输出:
      26 <li class="item-0">first item</li>
      27          <li class="item-1"><a href="link2.html">second item</a></li>
      28          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
      29          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      30          <li class="item-0"><a href="link5.html">fifth item</a></li>
      31      
      32 <class 'pyquery.pyquery.PyQuery'>
      View Code
    • 查找节点

      •  

         1 from pyquery import PyQuery
         2 
         3 html = '''
         4 <div id="container">
         5     <ul class="list">
         6          <li class="item-0">first item</li>
         7          <li class="item-1"><a href="link2.html">second item</a></li>
         8          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         9          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        10          <li class="item-0"><a href="link5.html">fifth item</a></li>
        11      </ul>
        12 </div>
        13 '''
        14 
        15 doc = PyQuery(html)
        16 items = doc('.list')
        17 
        18 print(
        19     type(items),
        20     items,
        21     sep='\n'
        22 )
        23 
        24 print(
        25     type(items.find('li')),
        26     items.find('li'),
        27     sep='\n'
        28 )
        29 
        30 
        31 # 输出:
        32 <class 'pyquery.pyquery.PyQuery'>
        33 <ul class="list">
        34          <li class="item-0">first item</li>
        35          <li class="item-1"><a href="link2.html">second item</a></li>
        36          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        37          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        38          <li class="item-0"><a href="link5.html">fifth item</a></li>
        39      </ul>
        40 
        41 <class 'pyquery.pyquery.PyQuery'>
        42 <li class="item-0">first item</li>
        43          <li class="item-1"><a href="link2.html">second item</a></li>
        44          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        45          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        46          <li class="item-0"><a href="link5.html">fifth item</a></li>
        47
        子孙节点----find()方法

        # find()方法查找的是所有子孙节点,如果只查找子节点,可以使用children()方法

         1 from pyquery import PyQuery
         2 
         3 html = '''
         4 <div id="container">
         5     <ul class="list">
         6          <li class="item-0">first item</li>
         7          <li class="item-1"><a href="link2.html">second item</a></li>
         8          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         9          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        10          <li class="item-0"><a href="link5.html">fifth item</a></li>
        11      </ul>
        12 </div>
        13 '''
        14 
        15 doc = PyQuery(html)
        16 items = doc('.list')
        17 
        18 print(items, '\n')
        19 
        20 print(
        21     type(items.parent()),
        22     items.parent(),
        23     sep='\n'
        24 )
        25 
        26 
        27 # 输出:
        28 <ul class="list">
        29          <li class="item-0">first item</li>
        30          <li class="item-1"><a href="link2.html">second item</a></li>
        31          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        32          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        33          <li class="item-0"><a href="link5.html">fifth item</a></li>
        34      </ul>
        35  
        36 
        37 <class 'pyquery.pyquery.PyQuery'>
        38 <div id="container">
        39     <ul class="list">
        40          <li class="item-0">first item</li>
        41          <li class="item-1"><a href="link2.html">second item</a></li>
        42          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        43          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        44          <li class="item-0"><a href="link5.html">fifth item</a></li>
        45      </ul>
        46 </div>
        父节点----parent()方法
        parents(selector=None)
        parent(selector=None)
         1 from pyquery import PyQuery
         2 
         3 html = '''
         4 <div id="container">
         5     <ul class="list">
         6          <li class="item-0">first item</li>
         7          <li class="item-1"><a href="link2.html">second item</a></li>
         8          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         9          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        10          <li class="item-0"><a href="link5.html">fifth item</a></li>
        11      </ul>
        12 </div>
        13 '''
        14 
        15 doc = PyQuery(html)
        16 
        17 # 选择class为list的节点内部class为item-0和active的节点
        18 items = doc('.list .item-0.active')
        19 
        20 print(
        21     type(items.siblings()),
        22     items.siblings(),
        23     sep='\n'
        24 )
        25 
        26 print("\n", items.siblings('.active'))
        27 
        28 
        29 # 输出:
        30 <class 'pyquery.pyquery.PyQuery'>
        31 <li class="item-1"><a href="link2.html">second item</a></li>
        32          <li class="item-0">first item</li>
        33          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        34          <li class="item-0"><a href="link5.html">fifth item</a></li>
        35      
        36 
        37  <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        38
        兄弟节点----siblings()方法
    
    
    •  遍历

       1 from pyquery import PyQuery
       2 
       3 html = '''
       4 <div id="container">
       5     <ul class="list">
       6          <li class="item-0">first item</li>
       7          <li class="item-1"><a href="link2.html">second item</a></li>
       8          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
       9          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      10          <li class="item-0"><a href="link5.html">fifth item</a></li>
      11      </ul>
      12 </div>
      13 '''
      14 
      15 doc = PyQuery(html)
      16 lis = doc('li').items()             # 调用items()方法,得到一个生成器
      17 
      18 for li in lis:
      19     print(
      20         li, 
      21         type(li)
      22     )
      23 
      24 
      25 # 输出:
      26 <li class="item-0">first item</li>
      27           <class 'pyquery.pyquery.PyQuery'>
      28 <li class="item-1"><a href="link2.html">second item</a></li>
      29           <class 'pyquery.pyquery.PyQuery'>
      30 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
      31           <class 'pyquery.pyquery.PyQuery'>
      32 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      33           <class 'pyquery.pyquery.PyQuery'>
      34 <li class="item-0"><a href="link5.html">fifth item</a></li>
      35       <class 'pyquery.pyquery.PyQuery'>
      遍历----items()
    • 获取信息

      • 获取属性

        attr()方法获取属性
         1 from pyquery import PyQuery
         2 
         3 html = '''
         4 <div id="container">
         5     <ul class="list">
         6          <li class="item-0">first item</li>
         7          <li class="item-1"><a href="link2.html">second item</a></li>
         8          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         9          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        10          <li class="item-0"><a href="link5.html">fifth item</a></li>
        11      </ul>
        12 </div>
        13 '''
        14 
        15 doc = PyQuery(html)
        16 a = doc('.item-0.active a')
        17 print(
        18     a,
        19     type(a),
        20     a.attr('href'),             # 也可以用a.attr.href,两者作用相同
        21     sep='\n'
        22 )
        23 
        24 
        25 # 输出:
        26 <a href="link3.html"><span class="bold">third item</span></a>
        27 <class 'pyquery.pyquery.PyQuery'>
        28 link3.html
        View Code
        # 当返回结果包含多个节点时,调用attr()方法,只会得到第一个节点的属性。如果想获取所有返回的节点的属性,就要用到遍历
      • 获取文本

         1 from pyquery import PyQuery
         2 
         3 html = '''
         4 <div id="container">
         5     <ul class="list">
         6          <li class="item-0">first item</li>
         7          <li class="item-1"><a href="link2.html">second item</a></li>
         8          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         9          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        10          <li class="item-0"><a href="link5.html">fifth item</a></li>
        11      </ul>
        12 </div>
        13 '''
        14 
        15 doc = PyQuery(html)
        16 li = doc('li')
        17 
        18 print(
        19     li.html(),                  # 获取第一个节点内部的HTML文本(匹配到多个节点时只返回第一个)
        20     li.text(),                  # 获取节点文本,返回结果是纯文字内容
        21     type(li.text()),
        22     sep='\n'
        23 )
        24 
        25 
        26 # 输出:
        27 first item
        28 first item second item third item fourth item fifth item
        29 <class 'str'>
        View Code
    • 节点操作

      • add_class() 和 remove_class() ---- 添加class、移除class

         1 from pyquery import PyQuery
         2 
         3 html = '''
         4 <div id="container">
         5     <ul class="list">
         6          <li class="item-0">first item</li>
         7          <li class="item-1"><a href="link2.html">second item</a></li>
         8          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         9          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        10          <li class="item-0"><a href="link5.html">fifth item</a></li>
        11      </ul>
        12 </div>
        13 '''
        14 
        15 doc = PyQuery(html)
        16 li = doc('.item-0.active')
        17 
        18 print(li)
        19 print(li.remove_class('active'))
        20 print(li.add_class('active'))
        21 
        22 
        23 # 输出:
        24 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        25          
        26 <li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>
        27          
        28 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        29          
        View Code
      • attr、text 和 html

        # attr(*args, **kwargs) ---- Attributes manipulation
        # text(value=no_default, **kwargs) ---- Get or set the text representation of sub nodes.
        # html(value=no_default, **kwargs) ---- Get or set the html representation of sub nodes.
         1 from pyquery import PyQuery
         2 
         3 html = '''
         4 <div id="container">
         5     <ul class="list">
         6          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         7      </ul>
         8 </div>
         9 '''
        10 
        11 doc = PyQuery(html)
        12 
        13 li = doc('.item-0.active')
        14 print(li)
        15 
        16 li.attr('name', 'link')         # 添加属性name,属性值为link
        17 print(li)
        18 
        19 li.text('change item')          # 将节点内部的内容改为'change item'
        20 print(li)
        21 
        22 li.html('<span>change item</span>')         # 将节点内部的内容改为'<span>change item</span>'
        23 print(li)
        24 
        25 
        26 # 输出:
        27 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        28      
        29 <li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>
        30      
        31 <li class="item-0 active" name="link">change item</li>
        32      
        33 <li class="item-0 active" name="link"><span>change item</span></li>
        View Code
      • remove()----删除节点

         1 from pyquery import PyQuery
         2 
         3 html = '''
         4 <div class="LeeHua">
         5 LiYihua
         6 <ul class="201802004731">liyihua</ul>
         7 </div>
         8 '''
         9 
        10 doc = PyQuery(html)
        11 Leehua = doc('.LeeHua')
        12 print("移除节点ul前的输出:\n"+Leehua.text())
        13 
        14 Leehua.find('ul').remove()
        15 print("移除节点ul后的输出:\n"+Leehua.text())
        16 
        17 
        18 # 输出:
        19 移除节点ul前的输出:
        20 LiYihua
        21 liyihua
        22 移除节点ul后的输出:
        23 LiYihua
        View Code
    • 伪类选择器

      • 示例:
         1 from pyquery import PyQuery
         2 
         3 html = '''
         4 <div class="wrap">
         5     <div id="container">
         6         <ul class="list">
         7             <li class="item-0">first item</li>
         8             <li class="item-1"><a href="link2.html">second item</a></li>
         9             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        10             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        11             <li class="item-0"><a href="link5.html">fifth item</a></li>
        12         </ul>
        13     </div>
        14 </div>
        15 '''
        16 
        17 doc = PyQuery(html)
        18 
        19 # 选择属于父元素的第一个子元素的每个 <li> 元素。
        20 li = doc('li:first-child')
        21 print(li)
        22 
        23 # 选择属于父元素的最后一个子元素的每个 <li> 元素。
        24 li = doc('li:last-child')
        25 print(li)
        26 
        27 # 选择属于其父元素的第二个子元素的每个 <li> 元素
        28 li = doc('li:nth-child(2)')
        29 print(li)
        30 
        31 # 选择索引大于 2 的每个 <li> 元素(索引从 0 开始,即第四个及之后的 <li>;本例中恰好是最后两个)
        32 li = doc('li:gt(2)')
        33 print(li)
        34 
        35 # 选择属于其父元素的第偶数个子元素(第 2、4、… 个)的每个 <li> 元素。
        36 li = doc('li:nth-child(2n)')
        37 print(li)
        38 
        39 # 选择包含'second'的每个元素
        40 li = doc('li:contains(second)')
        41 print(li)
        42 
        43 
        44 # 输出:
        45 <li class="item-0">first item</li>
        46             
        47 <li class="item-0"><a href="link5.html">fifth item</a></li>
        48         
        49 <li class="item-1"><a href="link2.html">second item</a></li>
        50             
        51 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        52             <li class="item-0"><a href="link5.html">fifth item</a></li>
        53         
        54 <li class="item-1"><a href="link2.html">second item</a></li>
        55             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        56             
        57 <li class="item-1"><a href="link2.html">second item</a></li>
        58             
        View Code

        CSS 选择器的用法:http://www.w3school.com.cn/cssref/css_selectors.asp

  • 相关阅读:
    python网络编程学习笔记(3):socket网络服务器
    Python编码爬坑指南
    ROT13 维基百科,自由的百科全书
    ZODB + Traversal Wiki Tutorial¶
    ZODB programming guide¶
    利用新浪api获取ip归属地 QtSharp 博客园
    用python做了个桌球瞄准器
    Python运维工具介绍1–fabric
    python httplib2 使用代理出错
    第四回 基类中的修饰符,应该根据你对架构的理解去定义它们,没有绝对的
  • 原文地址:https://www.cnblogs.com/liyihua/p/11165647.html
Copyright © 2011-2022 走看看