zoukankan      html  css  js  c++  java
  • pyquery库

    一、pyquery库

    PyQuery是一个类似于jQuery的网页解析工具,使用lxml操作xml和html文档,它的语法和jQuery很像。和XPath、Beautiful Soup比起来,PyQuery更加灵活,提供增加节点的class信息、移除某个节点、提取文本信息等功能。pyquery和BeautifulSoup都是用来解析html的库,但是pyquery的CSS选择器更加强大。

    安装:

    pip install pyquery

    1、字符串初始化,读取本地html文件内容后,通过html字符串初始化pyquery对象(也可以用pq(url=...)通过网址初始化)

    1 from pyquery import PyQuery as pq
    2 
    3 with open("./html.html","r",encoding="utf-8") as f:
    4     html = f.read()
    5 
    6 # 通过读取到的html字符串初始化pyquery对象
    7 doc = pq(html)
    8 print(type(doc)) #<class 'pyquery.pyquery.PyQuery'>

    二、CSS选择器

    1、查找元素-子元素

     1 html = """
     2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
     3         <ul class="nav navbar-nav">
     4             <li><a href="/">首页</a></li>
     5             <li><a href="/v2.0/doc/app01.html">系统开发</a></li>
     6             <li><a href="/v2.0/doc/app02.html">课程建设</a></li>
     7             <li><a href="/v2.0/doc/app03.html">资源共享</a></li>
     8             <li><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
     9         </ul>
    10     </div>"""
    11 
    12 # 字符串初始化,得到pyquery对象
    13 doc = pq(html)
    14 items = doc(".nav")
    15 list = items.find("li") 
    16 child = items.children("li") #子元素

    2、查找元素-父元素

     1 html = """
     2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
     3         <ul class="nav navbar-nav">
     4             <li><a href="/">首页</a></li>
     5             <li><a href="/v2.0/doc/app01.html">系统开发</a></li>
     6             <li><a href="/v2.0/doc/app02.html">课程建设</a></li>
     7             <li><a href="/v2.0/doc/app03.html">资源共享</a></li>
     8             <li><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
     9         </ul>
    10     </div>"""
    11 
    12 # 查找父元素
    13 doc = pq(html)
    14 items = doc.find(".nav")
    15 parent = items.parent(".collapse")
    16 print(parent)

     3、查找元素-兄弟元素

     1 html = """
     2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
     3         <ul class="nav navbar-nav">
     4             <li class="item-0"><a href="/">首页</a></li>
     5             <li class="item-1"><a href="/v2.0/doc/app01.html">系统开发</a></li>
     6             <li class="item-0 active"><a href="/v2.0/doc/app02.html">课程建设</a></li>
     7             <li class="item-1"><a href="/v2.0/doc/app03.html">资源共享</a></li>
     8             <li class="item-2"><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
     9         </ul>
    10     </div>"""
    11 
    12 # 查找兄弟元素
    13 doc = pq(html)
    14 li  = doc(".navbar-nav .item-0.active") #选择器之间不加空格,表示并列,即既满足item-0又满足active
    15 print(li.siblings()) #获取除了课程建设标签之外的其他兄弟标签

     4、遍历-多个元素

     1 html = """
     2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
     3         <ul class="nav navbar-nav">
     4             <li class="item-0"><a href="/">首页</a></li>
     5             <li class="item-1"><a href="/v2.0/doc/app01.html">系统开发</a></li>
     6             <li class="item-0 active"><a href="/v2.0/doc/app02.html">课程建设</a></li>
     7             <li class="item-1"><a href="/v2.0/doc/app03.html">资源共享</a></li>
     8             <li class="item-2"><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
     9         </ul>
    10     </div>"""
    11 
    12 from pyquery import PyQuery as pq
    13 doc = pq(html)
    14 element = doc("li").items() #获取多个元素,items()返回一个生成器
    15 print(type(element)) #<class 'generator'>
    16 for ele in element:
    17     print(ele)

    三、获取信息

    1、获取标签属性--attr()

     1 html = """
     2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
     3         <ul class="nav navbar-nav">
     4             <li class="item-0"><a href="/">首页</a></li>
     5             <li class="item-1"><a href="/v2.0/doc/app01.html">系统开发</a></li>
     6             <li class="item-0 active"><a href="/v2.0/doc/app02.html" name="course">课程建设</a></li>
     7             <li class="item-1"><a href="/v2.0/doc/app03.html">资源共享</a></li>
     8             <li class="item-2"><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
     9         </ul>
    10     </div>"""
    11 
    12 from pyquery import PyQuery as pq
    13 doc = pq(html)
    14 element = doc(".item-0.active a")
    15 print(element.attr("name")) #result: course

    2、获取文本--text()

     1 html = """
     2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
     3         <ul class="nav navbar-nav">
     4             <li class="item-0"><a href="/">首页</a></li>
     5             <li class="item-1"><a href="/v2.0/doc/app01.html">系统开发</a></li>
     6             <li class="item-0 active"><a href="/v2.0/doc/app02.html" name="course">课程建设</a></li>
     7             <li class="item-1"><a href="/v2.0/doc/app03.html">资源共享</a></li>
     8             <li class="item-2"><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
     9         </ul>
    10     </div>"""
    11 
    12 from pyquery import PyQuery as pq
    13 doc = pq(html)
    14 element = doc(".item-0.active a")
    15 print(element.text()) #result:课程建设

    3、获取HTML--html()

     1 html = """
     2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
     3         <ul class="nav navbar-nav">
     4             <li class="item-0"><a href="/">首页</a></li>
     5             <li class="item-1"><a href="/v2.0/doc/app01.html">系统开发</a></li>
     6             <li class="item-0 active"><a href="/v2.0/doc/app02.html" name="course">课程建设</a></li>
     7             <li class="item-1"><a href="/v2.0/doc/app03.html">资源共享</a></li>
     8             <li class="item-2"><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
     9         </ul>
    10     </div>"""
    11 
    12 from pyquery import PyQuery as pq
    13 doc = pq(html)
    14 li = doc(".item-0.active")
    15 print(li.html()) #result:<a href="/v2.0/doc/app02.html" name="course">课程建设</a>

    四、DOM操作

    1、增加/删除class---addClass、removeClass

     1 html = """
     2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
     3         <ul class="nav navbar-nav">
     4             <li class="item-0"><a href="/">首页</a></li>
     5             <li class="item-1"><a href="/v2.0/doc/app01.html">系统开发</a></li>
     6             <li class="item-0 active"><a href="/v2.0/doc/app02.html" name="course">课程建设</a></li>
     7             <li class="item-1"><a href="/v2.0/doc/app03.html">资源共享</a></li>
     8             <li class="item-2"><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
     9         </ul>
    10     </div>"""
    11 
    12 from pyquery import PyQuery as pq
    13 doc = pq(html)
    14 li = doc(".item-0.active")
    15 print(li) #result:<li class="item-0 active"><a href="/v2.0/doc/app02.html" name="course">课程建设</a></li>
    16 
    17 remove_attr = li.removeClass("active")
    18 print(remove_attr) #result:<li class="item-0"><a href="/v2.0/doc/app02.html" name="course">课程建设</a></li>
    19 
    20 add_attr = li.addClass("active")
    21 print(add_attr) #result:<li class="item-0 active"><a href="/v2.0/doc/app02.html" name="course">课程建设</a></li>

     2、修改标签属性、修改样式---attr、css

     1 html = """
     2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
     3         <ul class="nav navbar-nav">
     4             <li class="item-0"><a href="/">首页</a></li>
     5             <li class="item-1"><a href="/v2.0/doc/app01.html">系统开发</a></li>
     6             <li class="item-0 active"><a href="/v2.0/doc/app02.html" name="course">课程建设</a></li>
     7             <li class="item-1"><a href="/v2.0/doc/app03.html">资源共享</a></li>
     8             <li class="item-2"><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
     9         </ul>
    10     </div>"""
    11 
    12 from pyquery import PyQuery as pq
    13 doc = pq(html)
    14 li = doc(".item-0.active a")
    15 
    16 update_attr = li.attr("name","cour") #若该标签不存在name属性则新增,若存在则修改该属性
    17 print(update_attr) #result:<a href="/v2.0/doc/app02.html" name="cour">课程建设</a>
    18 
    19 css_tag = li.css("font-size","14px")
    20 print(css_tag) #result:<a href="/v2.0/doc/app02.html" name="cour" style="font-size: 14px">课程建设</a>

     3、remove()

     1 html_test = """
     2     <div class="rem">
     3         www.baidu.com,百度一下
     4         <p>《大秦赋》</p>
     5     </div>
     6 """
     7 
     8 from pyquery import PyQuery as pq
     9 doc = pq(html_test)
    10 # 只需要获取p标签上面一句话
    11 element = doc(".rem")
    12 ele = element.find("p").remove() # 找到p标签并移除
    13 print(element.text()) #再获取文本
  • 相关阅读:
    Java实现 LeetCode 50 Pow(x,n)
    Java实现 LeetCode 50 Pow(x,n)
    Java实现 LeetCode 49 字母异位词分组
    Java实现 LeetCode 49 字母异位词分组
    Java实现 LeetCode 49 字母异位词分组
    Java实现 LeetCode 48 旋转图像
    Java实现 LeetCode 48 旋转图像
    Java实现 LeetCode 48 旋转图像
    Java实现 LeetCode 47 全排列 II(二)
    Java实现 LeetCode 47 全排列 II(二)
  • 原文地址:https://www.cnblogs.com/yzmPython/p/14103633.html
Copyright © 2011-2022 走看看