zoukankan      html  css  js  c++  java
  • Python爬虫之pyquery库的基本使用

      1 # 字符串初始化
      2 html = '''
      3 <div>
      4     <ul>
      5         <li class = "item-0">first item</li>
      6         <li class = "item-1"><a href = "link2.html">second item</a></li>
      7         <li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
      8         <li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
      9         <li class = "item-0"><a href = "link5.html">fifthth item</a></li>
     10     </ul>
     11 </div>
     12 '''
     13 
     14 from pyquery import PyQuery as pq
     15 doc = pq(html)
     16 print(doc('li'))
     17 
     18 # url初始化
     19 from pyquery import  PyQuery as pq
     20 doc = pq(url = "http://www.baidu.com")
     21 print(doc("head"))
     22 
     23 # 文件初始化
     24 from pyquery import PyQuery as pq
     25 doc = pq(filename = "demo.html")
     26 print(doc('li'))
     27 
     28 # 基本CSS选择器
     29 html = '''
     30 <div id = "container">
     31     <ul class = "list">
     32         <li class = "item-0">first item</li>
     33         <li class = "item-1"><a href = "link2.html">second item</a></li>
     34         <li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
     35         <li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
     36         <li class = "item-0"><a href = "link5.html">fifthth item</a></li>
     37     </ul>
     38 </div>
     39 '''
     40 from pyquery import PyQuery as pq
     41 doc = pq(html)
     42 # 注意:id 选择器前面需要加上 "#",class 选择器前面需要加上 "."
     43 print(doc('#container .list li'))
     44 
     45 # 查找元素
     46 # 子元素
     47 html = '''
     48 <div id = "container">
     49     <ul class = "list">
     50         <li class = "item-0">first item</li>
     51         <li class = "item-1"><a href = "link2.html">second item</a></li>
     52         <li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
     53         <li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
     54         <li class = "item-0"><a href = "link5.html">fifthth item</a></li>
     55     </ul>
     56 </div>
     57 '''
     58 from pyquery import PyQuery as pq
     59 doc = pq(html)
     60 items = doc('.list')
     61 print(type(items))
     62 print(items)
     63 lis = items.find('li')
     64 print(type(lis))
     65 print(lis)
     66 
     67 lis = items.children()
     68 print(type(lis))
     69 print(lis)
     70 
     71 lis = items.children('.active')
     72 print(lis)
     73 
     74 # 父元素
     75 html = '''
     76 <div id = "container">
     77     <ul class = "list">
     78         <li class = "item-0">first item</li>
     79         <li class = "item-1"><a href = "link2.html">second item</a></li>
     80         <li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
     81         <li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
     82         <li class = "item-0"><a href = "link5.html">fifthth item</a></li>
     83     </ul>
     84 </div>
     85 '''
     86 from pyquery import PyQuery as pq
     87 doc = pq(html)
     88 items = doc('.list')
     89 container = items.parent()
     90 print(type(container))
     91 print(container)
     92 
     93 html = '''
     94 <div class = "wrap">
     95     <div id = "container">
     96         <ul class = "list">
     97             <li class = "item-0">first item</li>
     98             <li class = "item-1"><a href = "link2.html">second item</a></li>
     99             <li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
    100             <li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
    101             <li class = "item-0"><a href = "link5.html">fifthth item</a></li>
    102         </ul>
    103     </div>
    104 </div>
    105 '''
    106 from pyquery import PyQuery as pq
    107 doc = pq(html)
    108 items = doc('.list')
    109 parents = items.parents()
    110 print(type(parents))
    111 print(parents)
    112 
    113 parents = items.parents('.wrap')
    114 print(parents)
      1 # 兄弟元素
      2 html = '''
      3 <div class = "wrap">
      4     <div id = "container">
      5         <ul class = "list">
      6             <li class = "item-0">first item</li>
      7             <li class = "item-1"><a href = "link2.html">second item</a></li>
      8             <li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
      9             <li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
     10             <li class = "item-0"><a href = "link5.html">fifthth item</a></li>
     11         </ul>
     12     </div>
     13 </div>
     14 '''
     15 from pyquery import PyQuery as pq
     16 doc = pq(html)
     17 # 注意:.item-0 和 .active 之间没有空格,表示选取同时具有这两个 class 的同一个元素(有空格则表示后代元素)
     18 li = doc('.list .item-0.active')
     19 print(li.siblings())
     20 
     21 print(li.siblings('.active'))
     22 
     23 # 遍历
     24 # 单个元素
     25 html = '''
     26 <div class = "wrap">
     27     <div id = "container">
     28         <ul class = "list">
     29             <li class = "item-0">first item</li>
     30             <li class = "item-1"><a href = "link2.html">second item</a></li>
     31             <li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
     32             <li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
     33             <li class = "item-0"><a href = "link5.html">fifthth item</a></li>
     34         </ul>
     35     </div>
     36 </div>
     37 '''
     38 from pyquery import PyQuery as pq
     39 doc = pq(html)
     40 li = doc('.item-0.active')
     41 print(li)
     42 
     43 html = '''
     44 <div class = "wrap">
     45     <div id = "container">
     46         <ul class = "list">
     47             <li class = "item-0">first item</li>
     48             <li class = "item-1"><a href = "link2.html">second item</a></li>
     49             <li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
     50             <li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
     51             <li class = "item-0"><a href = "link5.html">fifthth item</a></li>
     52         </ul>
     53     </div>
     54 </div>
     55 '''
     56 from pyquery import PyQuery as pq
     57 doc = pq(html)
     58 lis = doc('li').items()
     59 print(type(lis))
     60 for li in lis:
     61     print(li)
     62 
     63 # 获取信息
     64 # 获取属性
     65 html = '''
     66 <div class = "wrap">
     67     <div id = "container">
     68         <ul class = "list">
     69             <li class = "item-0">first item</li>
     70             <li class = "item-1"><a href = "link2.html">second item</a></li>
     71             <li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
     72             <li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
     73             <li class = "item-0"><a href = "link5.html">fifthth item</a></li>
     74         </ul>
     75     </div>
     76 </div>
     77 '''
     78 from pyquery import PyQuery as pq
     79 doc = pq(html)
     80 a = doc('.item-0.active a')
     81 print(a)
     82 # 获取属性的两种方法
     83 print(a.attr('href'))
     84 print(a.attr.href)
     85 
     86 # 获取文本
     87 print(a.text())
     88 
     89 # 获取html
     90 from pyquery import PyQuery as pq
     91 doc = pq(html)
     92 li = doc('.item-0.active')
     93 print(li)
     94 # 得到<li>标签里面的代码
     95 print(li.html())
     96 
     97 # DOM操作
     98 # addClass、removeClass
     99 from pyquery import PyQuery as pq
    100 doc = pq(html)
    101 li = doc('.item-0.active')
    102 print(li)
    103 li.remove_class('active')
    104 print(li)
    105 li.add_class('active')
    106 print(li)
    107 
    108 # attr CSS
    109 li.attr('name', 'link')
    110 print(li)
    111 li.css('font-size', '14px')
    112 print(li)
    113 
    114 # remove
    115 html = '''
    116 <div class = "wrap">
    117     Hello,World
    118     <p>This is a paragraph</p>
    119 </div>
    120 '''
    121 from pyquery import PyQuery as pq
    122 doc = pq(html)
    123 wrap = doc('.wrap')
    124 print(wrap.text())
    125 wrap.find('p').remove()
    126 print(wrap.text())
    127 
    128 # 伪类选择器
    129 html = '''
    130 <div class = "wrap">
    131     <div id = "container">
    132         <ul class = "list">
    133             <li class = "item-0">first item</li>
    134             <li class = "item-1"><a href = "link2.html">second item</a></li>
    135             <li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
    136             <li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
    137             <li class = "item-0"><a href = "link5.html">fifthth item</a></li>
    138         </ul>
    139     </div>
    140 </div>
    141 '''
    142 from pyquery import PyQuery as pq
    143 doc = pq(html)
    144 # 获取第一个元素
    145 li = doc('li:first-child')
    146 print(li)
    147 # 获取最后一个元素
    148 li = doc('li:last-child')
    149 print(li)
    150 # 获取第二个元素
    151 li = doc('li:nth-child(2)')
    152 print(li)
    153 # 获取下标为2的元素后面的所有元素(下标从0开始)
    154 li = doc('li:gt(2)')
    155 print(li)
    156 # 获取第偶数个子元素(nth-child 的位置从1开始计数,即第2、4个li)
    157 li = doc('li:nth-child(2n)')
    158 print(li)
    159 # 获取内容包含second 的元素
    160 li = doc('li:contains(second)')
    161 print(li)
  • 相关阅读:
    [Python] Marshmallow QuickStart
    [Python]Marshmallow 代码
    [python]Flask-migrate简单入门
    [数据库]Sqlite使用入门
    [Python] dict对象的keys()和values()返回的值,是否总是保证一一对应?
    【Weiss】【第03章】练习3.20:中缀表达式转后缀表达式
    【Weiss】【第03章】练习3.19:计算后缀表达式
    【Weiss】【第03章】练习3.18:检查平衡符号
    【Weiss】【第03章】练习3.17:懒惰删除
    【TIJ4】第六章全部习题【习题未完成】
  • 原文地址:https://www.cnblogs.com/duxie/p/10039234.html
Copyright © 2011-2022 走看看