zoukankan      html  css  js  c++  java
  • Python3爬虫(七) 解析库的使用之pyquery

     Infi-chu:

    http://www.cnblogs.com/Infi-chu/

    pyquery是一个HTML解析库,支持CSS选择器,并提供与jQuery类似的API来查找和操作节点

    1.初始化
    字符串初始化

    from pyquery import PyQuery as pq
    doc = pq(html)	# 传入html文本
    print(doc('li'))
    

    URL初始化

    from pyquery import PyQuery as pq
    doc = pq(url='http://www.baidu.com')	# URL必须带协议头(http://或https://)
    print(doc('title'))
    # 另一种方法
    from pyquery import PyQuery as pq
    import requests
    doc = pq(requests.get('http://www.baidu.com').text)	# 传入响应的文本内容,而不是Response对象
    print(doc('title'))
    

    文件初始化

    from pyquery import PyQuery as pq
    doc = pq(filename='text.html')
    print(doc('li'))
    

    2.基本CSS选择器

    from pyquery import PyQuery as pq
    doc = pq(url='http://www.baidu.com')
    print(doc('#head .head_wrapper a'))
    print(type(doc('#head .head_wrapper a')))
    

    3.查找节点
    子节点

    from pyquery import PyQuery as pq
    doc = pq(url='http://www.baidu.com')
    items = doc('.head_wrapper')
    print(type(items))
    print(items)
    lis = items.find('a')	# find()是查找符合条件的所有子孙节点,只查找子节点的可以使用children()
    print(type(lis))
    print(lis)
    

    父节点
    使用parent()方法获取该节点的父节点
    使用parents()方法获取该节点的祖先节点

    兄弟节点
    使用siblings()方法获取兄弟节点

    4.遍历

    from pyquery import PyQuery as pq
    doc = pq(html)
    lis = doc('li').items()
    print(type(lis))
    for li in lis:
        print(li,type(li))
    

    5.获取信息
    获取属性
    使用attr()方法获取属性(值)

    from pyquery import PyQuery as pq
    doc = pq(url='http://www.baidu.com')
    items = doc('.head_wrapper')
    print(items.attr('href'))
    # 也可以写成
    print(items.attr.href)
    
    # 获取所有a的属性
    from pyquery import PyQuery as pq
    doc = pq(url='http://www.baidu.com')
    a = doc('a')
    for i in a.items():    # 需调用items()遍历,才能得到PyQuery对象;直接遍历得到的是lxml元素,没有attr属性
        print(i.attr.href)
    

    获取文本
    使用text()方法获取节点内部的纯文本内容(匹配到多个节点时,返回所有节点文本以空格拼接而成的字符串)

    from pyquery import PyQuery as pq
    doc = pq(url = 'http://www.baidu.com')
    a = doc('a')
    print(a.text())    # 无需遍历
    

    使用html()方法获取节点内部的HTML文本(匹配到多个节点时只返回第一个节点的内容,获取全部需要遍历)

    from pyquery import PyQuery as pq
    doc = pq(url = 'http://www.baidu.com')
    a = doc('a')
    for i in a.items():    # 需调用items()遍历,才能得到可调用html()的PyQuery对象
        print(i)
        print(i.html())
    

    6.节点操作
    addClass和removeClass

    from pyquery import PyQuery as pq
    html = '''
    <div class="wrap">
    <div id="container">
    <ul class="list">
    <li class="item-0">first item</li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    <li class="item-0 active"><a href="link5.html">fifth item</a></li>
    </ul>
    </div>
    </div>
    '''
    doc = pq(html)
    li = doc('.item-0.active')	# 同时拥有两个class的节点,选择器之间不能有空格
    print(li)
    li.removeClass('active')
    print(li)
    li.addClass('active')
    print(li)
    

    attr、text和html

    from pyquery import PyQuery as pq
    html = '''
    <div class="div">
    <p>ASD</p>
    <ul class="list">
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    </ul>
    </div>
    '''
    doc = pq(html)
    li = doc('.item-0.active')	# 同时拥有两个class的节点,选择器之间不能有空格
    print(li)
    li.attr('name','link')
    print(li)
    li.text('changed item')
    print(li)
    li.html('<span>changed item</span>')
    print(li)
    

    remove()

    from pyquery import PyQuery as pq
    doc = pq(html)
    res = doc('.div')
    res.find('ul').remove()	# 先移除ul节点
    print(res.text())	# 再输出剩余文本,结果中不再包含ul内的内容
    

    7.伪类选择器
    待完善

  • 相关阅读:
    π框架参数规则(正则表达式验证)
    SQL查询优化的一些建议
    phalApi框架打印SQL语句
    phpstorm注册码
    phalApi数据库操作
    内容的全局搜索
    xampp虚拟主机的配置
    navicat自动备份数据
    IOC和AOP扩展
    Spring AOP
  • 原文地址:https://www.cnblogs.com/Infi-chu/p/8986379.html
Copyright © 2011-2022 走看看