zoukankan      html  css  js  c++  java
  • 爬虫之pyquery库

    官方文档:https://pyquery.readthedocs.io/en/latest/

    PyQuery是一个强大又灵活的网页解析库。如果你觉得正则写起来太麻烦、BeautifulSoup语法太难记,而你熟悉jQuery的语法,那么PyQuery就是你的绝佳选择。

    一、开始

    字符串初始化:

    from pyquery import PyQuery as pq
    d = pq("<html>哈哈哈</html>")  # 现在d就相当于jQuery的$
    print(d("html"))

    URL初始化:

    from pyquery import PyQuery as pq
    d = pq(url="https://www.baidu.com")
    print(d("head"))

    文件初始化:

    from pyquery import PyQuery as pq
    d = pq(filename='demo.html')  # filename指定文件路径
    print(d("head"))

    二、基本CSS选择器

    html = """
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
    """
    from pyquery import PyQuery as pq
    d = pq(html)
    print(d("#container .list li"))
    View Code

    三、查找元素

    子元素(find() 会匹配所有后代元素;若只需要直接子元素,可使用 children())

    d("css选择器").find("li")
    html = """
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
    """
    from pyquery import PyQuery as pq
    d = pq(html)
    items = d(".list")
    print(type(items))  # <class 'pyquery.pyquery.PyQuery'>
    li = items.find("li")
    print(type(li))    # <class 'pyquery.pyquery.PyQuery'>
    print(li)
    """
     <li class="item-0">first item</li>
     <li class="item-1"><a href="link2.html">second item</a></li>
     <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
     <li class="item-1 active"><a href="link4.html">fourth item</a></li>
     <li class="item-0"><a href="link5.html">fifth item</a></li>
    """
    View Code

    父元素

    d("css选择器").parent(<css选择器(可无)>)  # parent() 取直接父元素;下面两个示例使用的是 parents(),它返回所有祖先元素,并可用css选择器进行筛选
    html = """
    <div class="wrap">
        <div id="container">
            <ul class="list">
                 <li class="item-0">first item</li>
                 <li class="item-1"><a href="link2.html">second item</a></li>
                 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                 <li class="item-0"><a href="link5.html">fifth item</a></li>
             </ul>
        </div>
    </div>
    """
    from pyquery import PyQuery as pq
    d = pq(html)
    items = d(".list")
    parents = items.parents()
    print(parents)
    """
    <div class="wrap">
        <div id="container">
            <ul class="list">
                 <li class="item-0">first item</li>
                 <li class="item-1"><a href="link2.html">second item</a></li>
                 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                 <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    """
    d(".list").parents()
    html = """
    <div class="wrap">
        <div id="container">
            <ul class="list">
                 <li class="item-0">first item</li>
                 <li class="item-1"><a href="link2.html">second item</a></li>
                 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                 <li class="item-0"><a href="link5.html">fifth item</a></li>
             </ul>
        </div>
    </div>
    """
    from pyquery import PyQuery as pq
    d = pq(html)
    items = d(".list")
    parents = items.parents(".wrap")
    print(parents)
    """
    <div class="wrap">
        <div id="container">
            <ul class="list">
                 <li class="item-0">first item</li>
                 <li class="item-1"><a href="link2.html">second item</a></li>
                 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                 <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    """
    d(".list").parents(".wrap")

    兄弟元素

    d("css选择器").siblings(<css选择器(可无)>)
    html = """
    <div class="wrap">
        <div id="container">
            <ul class="list">
                 <li class="item-0">first item</li>
                 <li class="item-1"><a href="link2.html">second item</a></li>
                 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                 <li class="item-0"><a href="link5.html">fifth item</a></li>
             </ul>
        </div>
    </div>
    """
    from pyquery import PyQuery as pq
    d = pq(html)
    li = d(".list .item-0.active")
    print(li.siblings())
    """
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-0">first item</li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
    """
    View Code
    html = """
    <div class="wrap">
        <div id="container">
            <ul class="list">
                 <li class="item-0">first item</li>
                 <li class="item-1"><a href="link2.html">second item</a></li>
                 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                 <li class="item-0"><a href="link5.html">fifth item</a></li>
             </ul>
        </div>
    </div>
    """
    from pyquery import PyQuery as pq
    d = pq(html)
    li = d(".list .item-0.active")
    print(li.siblings(".active"))
    """
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    """
    View Code

    四、遍历

    html = """
    <div class="wrap">
        <div id="container">
            <ul class="list">
                 <li class="item-0">first item</li>
                 <li class="item-1"><a href="link2.html">second item</a></li>
                 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                 <li class="item-0"><a href="link5.html">fifth item</a></li>
             </ul>
        </div>
    </div>
    """
    from pyquery import PyQuery as pq
    d = pq(html)
    li = d("li").items()
    print(type(li))  # <class 'generator'>
    for i in li:
        print(i)
    """
    <li class="item-0">first item</li>     
    <li class="item-1"><a href="link2.html">second item</a></li>           
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>            
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>            
    <li class="item-0"><a href="link5.html">fifth item</a></li>
    """
    View Code

    五、获取信息

    获取属性

    html = """
    <div class="wrap">
        <div id="container">
            <ul class="list">
                 <li class="item-0">first item</li>
                 <li class="item-1"><a href="link2.html">second item</a></li>
                 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                 <li class="item-0"><a href="link5.html">fifth item</a></li>
             </ul>
        </div>
    </div>
    """
    from pyquery import PyQuery as pq
    d = pq(html)
    a = d(".item-0.active a")
    print(a.attr("href"))
    print(a.attr.href)
    View Code

    获取文本

    html = """
    <div class="wrap">
        <div id="container">
            <ul class="list">
                 <li class="item-0">first item</li>
                 <li class="item-1"><a href="link2.html">second item</a></li>
                 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                 <li class="item-0"><a href="link5.html">fifth item</a></li>
             </ul>
        </div>
    </div>
    """
    from pyquery import PyQuery as pq
    d = pq(html)
    a = d(".item-0.active a")
    print(a.text())
    """
    third item
    """
    View Code

    获取html

    html = """
    <div class="wrap">
        <div id="container">
            <ul class="list">
                 <li class="item-0">first item</li>
                 <li class="item-1"><a href="link2.html">second item</a></li>
                 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                 <li class="item-0"><a href="link5.html">fifth item</a></li>
             </ul>
        </div>
    </div>
    """
    from pyquery import PyQuery as pq
    d = pq(html)
    li = d(".item-0.active")
    print(li)
    print(li.html())
    """
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>        
    <a href="link3.html"><span class="bold">third item</span></a>
    """
    View Code

    六、DOM操作

    addClass()、removeClass()

    html = """
    <div class="wrap">
        <div id="container">
            <ul class="list">
                 <li class="item-0">first item</li>
                 <li class="item-1"><a href="link2.html">second item</a></li>
                 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                 <li class="item-0"><a href="link5.html">fifth item</a></li>
             </ul>
        </div>
    </div>
    """
    from pyquery import PyQuery as pq
    d = pq(html)
    li = d(".item-0.active")
    print(li)
    li.removeClass("active")
    print(li)
    li.addClass("active")
    print(li)
    """
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>            
    <li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>            
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    """
    View Code

    attr()、css()

    html = """
    <div class="wrap">
        <div id="container">
            <ul class="list">
                 <li class="item-0">first item</li>
                 <li class="item-1"><a href="link2.html">second item</a></li>
                 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                 <li class="item-0"><a href="link5.html">fifth item</a></li>
             </ul>
        </div>
    </div>
    """
    from pyquery import PyQuery as pq
    d = pq(html)
    li = d(".item-0.active")
    print(li)
    li.attr("name", "link")
    print(li)
    li.css("font-size", "14px")
    print(li)
    """
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>        
    <li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>       
    <li class="item-0 active" name="link" style="font-size: 14px"><a href="link3.html"><span class="bold">third item</span></a></li>
    """
    View Code

    remove()

    html = """
    <div class="wrap">
        Hello, World.
        <p>This is a paragraph.</p>
     </div>
    """
    
    from pyquery import PyQuery as pq
    d = pq(html)
    wrap = d(".wrap")
    print(wrap.text())
    """
    Hello, World.
    This is a paragraph.
    """
    wrap.find("p").remove()
    print(wrap.text())  # Hello, World.
    View Code

    其他DOM方法 

    https://pyquery.readthedocs.io/en/latest/api.html

    七、伪类选择器

    html = """
    <div class="wrap">
        <div id="container">
            <ul class="list">
                 <li class="item-0">first item</li>
                 <li class="item-1"><a href="link2.html">second item</a></li>
                 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                 <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    """
    from pyquery import PyQuery as pq
    d = pq(html)
    li = d("li:first-child")
    print(li)  # <li class="item-0">first item</li>
    li = d("li:last-child")
    print(li)  # <li class="item-0"><a href="link5.html">fifth item</a></li>
    li = d("li:nth-child(2)")
    print(li)  # <li class="item-1"><a href="link2.html">second item</a></li>
    li = d("li:gt(2)")  # 从0开始计数,索引大于2
    print(li)
    """
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
     <li class="item-0"><a href="link5.html">fifth item</a></li>
    """
    li = d("li:nth-child(2n)")  # 获取偶数顺序的元素(从1开始)
    print(li)
    """
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    """
    li = d("li:contains(second)")  # 根据文本匹配,匹配文本包含second的标签
    print(li)  # <li class="item-1"><a href="link2.html">second item</a></li>
    View Code

    更多选择器:http://www.w3school.com.cn/cssref/css_selectors.asp

  • 相关阅读:
    使用 libevent 和 libev 提高网络应用性能
    在PHP中PDO解决中文乱码问题的一些补充
    apache重写规则详解
    Apache的配置
    正则表达式30分钟入门教程
    LVS+keepalived搭建负载均衡
    php判断终端是手机还是电脑访问网站代码
    nginx 502 bad gateway
    算法复习-深度优先遍历和回溯法的关系
    分支限界法和回溯法对比
  • 原文地址:https://www.cnblogs.com/believepd/p/10657877.html
Copyright © 2011-2022 走看看