zoukankan      html  css  js  c++  java
  • pyquery库

    一、pyquery库

    PyQuery是一个类似于jQuery的网页解析工具,底层使用lxml操作xml和html文档,语法和jQuery很像。和XPath、Beautiful Soup比起来,PyQuery更加灵活,提供了增加节点的class信息、移除某个节点、提取文本信息等功能。pyquery和BeautifulSoup都是用来解析html的库,但是pyquery的CSS选择器更加强大。

    安装:

    pip install pyquery

    1、初始化:读取本地HTML文件,用文件内容(字符串)初始化pyquery对象

    1 from pyquery import PyQuery as pq
    2 
    3 with open("./html.html","r",encoding="utf-8") as f:
    4     html = f.read()
    5 
    6 # 通过读取到的HTML字符串初始化pyquery对象
    7 doc = pq(html)
    8 print(type(doc)) #<class 'pyquery.pyquery.PyQuery'>

    二、CSS选择器

    1、查找元素-子元素

     1 html = """
     2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
     3         <ul class="nav navbar-nav">
     4             <li><a href="/">首页</a></li>
     5             <li><a href="/v2.0/doc/app01.html">系统开发</a></li>
     6             <li><a href="/v2.0/doc/app02.html">课程建设</a></li>
     7             <li><a href="/v2.0/doc/app03.html">资源共享</a></li>
     8             <li><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
     9         </ul>
    10     </div>"""
    11 
    12 # 字符串初始化,得到pyquery对象
    13 doc = pq(html)
    14 items = doc(".nav")
    15 list = items.find("li") 
    16 child = items.children("li") #直接子元素(find()查找的是所有后代元素)

    2、查找元素-父元素

     1 html = """
     2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
     3         <ul class="nav navbar-nav">
     4             <li><a href="/">首页</a></li>
     5             <li><a href="/v2.0/doc/app01.html">系统开发</a></li>
     6             <li><a href="/v2.0/doc/app02.html">课程建设</a></li>
     7             <li><a href="/v2.0/doc/app03.html">资源共享</a></li>
     8             <li><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
     9         </ul>
    10     </div>"""
    11 
    12 # 查找父元素
    13 doc = pq(html)
    14 items = doc.find(".nav")
    15 parent = items.parent(".collapse")
    16 print(parent)

     3、查找元素-兄弟元素

     1 html = """
     2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
     3         <ul class="nav navbar-nav">
     4             <li class="item-0"><a href="/">首页</a></li>
     5             <li class="item-1"><a href="/v2.0/doc/app01.html">系统开发</a></li>
     6             <li class="item-0 active"><a href="/v2.0/doc/app02.html">课程建设</a></li>
     7             <li class="item-1"><a href="/v2.0/doc/app03.html">资源共享</a></li>
     8             <li class="item-2"><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
     9         </ul>
    10     </div>"""
    11 
    12 # 查找兄弟元素
    13 doc = pq(html)
    14 li  = doc(".navbar-nav .item-0.active") #两个class选择器之间不加空格表示并列,即同时具有item-0和active两个class
    15 print(li.siblings()) #获取除了课程建设标签之外的其他兄弟标签

     4、遍历-多个元素

     1 html = """
     2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
     3         <ul class="nav navbar-nav">
     4             <li class="item-0"><a href="/">首页</a></li>
     5             <li class="item-1"><a href="/v2.0/doc/app01.html">系统开发</a></li>
     6             <li class="item-0 active"><a href="/v2.0/doc/app02.html">课程建设</a></li>
     7             <li class="item-1"><a href="/v2.0/doc/app03.html">资源共享</a></li>
     8             <li class="item-2"><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
     9         </ul>
    10     </div>"""
    11 
    12 from pyquery import PyQuery as pq
    13 doc = pq(html)
    14 element = doc("li").items() #获取多个元素,items()返回一个生成器
    15 print(type(element)) #<class 'generator'>
    16 for ele in element:
    17     print(ele)

    三、获取信息

    1、获取标签属性--attr()

     1 html = """
     2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
     3         <ul class="nav navbar-nav">
     4             <li class="item-0"><a href="/">首页</a></li>
     5             <li class="item-1"><a href="/v2.0/doc/app01.html">系统开发</a></li>
     6             <li class="item-0 active"><a href="/v2.0/doc/app02.html" name="course">课程建设</a></li>
     7             <li class="item-1"><a href="/v2.0/doc/app03.html">资源共享</a></li>
     8             <li class="item-2"><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
     9         </ul>
    10     </div>"""
    11 
    12 from pyquery import PyQuery as pq
    13 doc = pq(html)
    14 element = doc(".item-0.active a")
    15 print(element.attr("name")) #result: course

    2、获取文本--text()

     1 html = """
     2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
     3         <ul class="nav navbar-nav">
     4             <li class="item-0"><a href="/">首页</a></li>
     5             <li class="item-1"><a href="/v2.0/doc/app01.html">系统开发</a></li>
     6             <li class="item-0 active"><a href="/v2.0/doc/app02.html" name="course">课程建设</a></li>
     7             <li class="item-1"><a href="/v2.0/doc/app03.html">资源共享</a></li>
     8             <li class="item-2"><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
     9         </ul>
    10     </div>"""
    11 
    12 from pyquery import PyQuery as pq
    13 doc = pq(html)
    14 element = doc(".item-0.active a")
    15 print(element.text()) #result:课程建设

    3、获取HTML

     1 html = """
     2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
     3         <ul class="nav navbar-nav">
     4             <li class="item-0"><a href="/">首页</a></li>
     5             <li class="item-1"><a href="/v2.0/doc/app01.html">系统开发</a></li>
     6             <li class="item-0 active"><a href="/v2.0/doc/app02.html" name="course">课程建设</a></li>
     7             <li class="item-1"><a href="/v2.0/doc/app03.html">资源共享</a></li>
     8             <li class="item-2"><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
     9         </ul>
    10     </div>"""
    11 
    12 from pyquery import PyQuery as pq
    13 doc = pq(html)
    14 li = doc(".item-0.active")
    15 print(li.html()) #result:<a href="/v2.0/doc/app02.html" name="course">课程建设</a>

    四、DOM操作

    1、增加/删除class---addClass、removeClass

     1 html = """
     2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
     3         <ul class="nav navbar-nav">
     4             <li class="item-0"><a href="/">首页</a></li>
     5             <li class="item-1"><a href="/v2.0/doc/app01.html">系统开发</a></li>
     6             <li class="item-0 active"><a href="/v2.0/doc/app02.html" name="course">课程建设</a></li>
     7             <li class="item-1"><a href="/v2.0/doc/app03.html">资源共享</a></li>
     8             <li class="item-2"><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
     9         </ul>
    10     </div>"""
    11 
    12 from pyquery import PyQuery as pq
    13 doc = pq(html)
    14 li = doc(".item-0.active")
    15 print(li) #result:<li class="item-0 active"><a href="/v2.0/doc/app02.html" name="course">课程建设</a></li>
    16 
    17 remove_attr = li.removeClass("active")
    18 print(remove_attr) #result:<li class="item-0"><a href="/v2.0/doc/app02.html" name="course">课程建设</a></li>
    19 
    20 add_attr = li.addClass("active")
    21 print(add_attr) #result:<li class="item-0 active"><a href="/v2.0/doc/app02.html" name="course">课程建设</a></li>

     2、修改标签属性、修改样式---attr、css

     1 html = """
     2     <div class="collapse navbar-collapse" id="navbar-collapse-2">
     3         <ul class="nav navbar-nav">
     4             <li class="item-0"><a href="/">首页</a></li>
     5             <li class="item-1"><a href="/v2.0/doc/app01.html">系统开发</a></li>
     6             <li class="item-0 active"><a href="/v2.0/doc/app02.html" name="course">课程建设</a></li>
     7             <li class="item-1"><a href="/v2.0/doc/app03.html">资源共享</a></li>
     8             <li class="item-2"><a href="http://www.beian.gov.cn" target="_blank">粤ICP备13085204号-1</a></li>
     9         </ul>
    10     </div>"""
    11 
    12 from pyquery import PyQuery as pq
    13 doc = pq(html)
    14 li = doc(".item-0.active a")
    15 
    16 update_attr = li.attr("name","cour") #若该标签不存在name属性则新增,若存在则修改该属性
    17 print(update_attr) #result:<a href="/v2.0/doc/app02.html" name="cour">课程建设</a>
    18 
    19 css_tag = li.css("font-size","14px")
    20 print(css_tag) #result:<a href="/v2.0/doc/app02.html" name="cour" style="font-size: 14px">课程建设</a>

     3、remove()

     1 html_test = """
     2     <div class="rem">
     3         www.baidu.com,百度一下
     4         <p>《大秦赋》</p>
     5     </div>
     6 """
     7 
     8 from pyquery import PyQuery as pq
     9 doc = pq(html_test)
    10 # 只需要获取p标签上面一句话
    11 element = doc(".rem")
    12 ele = element.find("p").remove() # 找到p标签并移除
    13 print(element.text()) #再获取文本
  • 相关阅读:
    FocusBI:MDX检索多维模型
    FocusBI:地产分析&雪花模型
    FocusBI:租房分析&星型模型
    FocusBI:《DW/BI项目管理》之SSIS执行情况
    FocusBI:租房分析可视化(PowerBI网址体验)
    Eclipse创建自定义HTML5,JSP模板
    小测试解析
    vue---组件通讯
    前期准备-Git篇
    npm install 关于 sass 屡次失败问题
  • 原文地址:https://www.cnblogs.com/yzmPython/p/14103633.html
Copyright © 2011-2022 走看看