zoukankan      html  css  js  c++  java
  • Python开发之爬虫模块介绍(二)

    BeautifulSoup模块

    BeautifulSoup是一个灵活又方便的网页解析库,处理高效,支持多种解析器。利用它不用编写正则表达式即可方便地实现网页信息的提取。

    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <div class="author-name">我的Blog</div>
        <div class="info">这个人很懒,什么都没有留下。</div>
    </body>
    </html>
    '''
    from bs4 import BeautifulSoup
    # Parse the sample document above using the lxml parser.
    soup = BeautifulSoup(html,'lxml')
    print(soup.prettify()) # pretty-print the parsed document
    print(soup.title.string) # print the text of the <title> tag
    

    1、标签选择器

    • 选择元素
    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <div class="author-name">我的Blog</div>
        <div class="info">这个人很懒,什么都没有留下。</div>
    </body>
    </html>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html,'lxml')
    print(soup.title) # <title>Blog示例</title>
    print(type(soup.title)) #<class 'bs4.element.Tag'>
    
    print(soup.head) #<head><title>Blog示例</title></head>
    print(soup.a)
    '''
    <a class="logo" href="#">
    <img alt="" src="https://q1mi.github.io/Blog/asset/img/head_img.jpg"/>
    </a>
    '''
    
    • 获取名称
    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <div class="author-name">我的Blog</div>
        <div class="info">这个人很懒,什么都没有留下。</div>
    </body>
    </html>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html,'lxml')
    print(soup.title.name) # title
    
    • 获取属性
    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'></p>
    </body>
    </html>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html,'lxml')
    print(soup.p.attrs['name']) # dromouse
    print(soup.p['name']) #dromouse
    
    • 获取内容
    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
    </body>
    </html>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html,'lxml')
    print(soup.p.string) # 人生苦短
    
    • 嵌套
    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
    </body>
    </html>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html,'lxml')
    print(soup.head.title.string) # Blog示例
    
    • 子节点和子孙节点
    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
    </body>
    </html>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html,'lxml')
    print(soup.body.contents) # 是一个列表
    '''
    ['\n', <a class="logo" href="#">
    <img alt="" src="https://q1mi.github.io/Blog/asset/img/head_img.jpg"/>
    </a>, '\n', <p name="dromouse">人生苦短</p>, '\n']
    '''
    
    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
    </body>
    </html>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    # .children is an iterator, so printing it shows the iterator object,
    # not the child nodes themselves.
    print(soup.body.children)
    # Walk the direct children of <body>, numbering them from 0.
    for index, child in enumerate(soup.body.children):
        print(index, child)
    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
    </body>
    </html>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    # .descendants is a generator, so printing it shows the generator object.
    print(soup.body.descendants)
    # Walk every node nested anywhere under <body>, numbering them from 0.
    for index, node in enumerate(soup.body.descendants):
        print(index, node)
    • 父节点和祖先节点
    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
    </body>
    </html>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html,'lxml')
    print(soup.img.parent) 
    '''
    <a class="logo" href="#">
    <img alt="" src="https://q1mi.github.io/Blog/asset/img/head_img.jpg"/>
    </a>
    '''
    
    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
    </body>
    </html>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html,'lxml')
    print(list(soup.img.parents)) # 祖先节点
    
    • 兄弟节点
    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
    </body>
    </html>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html,'lxml')
    print(list(soup.a.next_siblings)) # sibling nodes that follow the first <a>
    print(list(soup.a.previous_siblings)) # sibling nodes that precede the first <a>
    

    2、标准选择器

    find_all(name, attrs, recursive, string, limit, **kwargs)

    可根据标签名、属性、内容查找文档(string参数在旧版本中名为text,limit用于限制返回结果的数量)

    • name
    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
        <p name='dromouse1'>人生苦短1</p>
    </body>
    </html>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    paragraphs = soup.find_all('p')
    print(paragraphs)     # list-like result holding every <p> tag
    print(paragraphs[0])  # the first matching <p>
    
    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
        <p name='dromouse1'>人生苦短1</p>
    </body>
    </html>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    anchors = soup.find_all('a')  # list-like result of every <a> tag
    print(anchors)
    print(type(anchors))
    # Each result is itself a Tag, so the search can be repeated inside it.
    for anchor in anchors:
        print(anchor.find_all('img'))
    
    • attrs
    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
        <p name='dromouse1'>人生苦短1</p>
    </body>
    </html>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html,'lxml')
    # Match tags by attribute value using an attrs dict.
    print(soup.find_all(attrs={"class":'logo'})) 
    # The HTML "name" attribute has to go through attrs, because find_all's
    # first parameter is itself called name (the tag name).
    print(soup.find_all(attrs={"name":'dromouse'})) 
    
    • text(新版本中该参数名为string,text是旧写法)
    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
        <p name='dromouse1'>人生苦短1</p>
    </body>
    </html>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    # Search by text content. The argument is named `string` since
    # Beautiful Soup 4.4.0; the older spelling `text` is deprecated.
    # A plain str value matches only strings that equal it exactly, and the
    # result contains the matching strings rather than their enclosing tags.
    print(soup.find_all(string='人生苦短'))
    

    find的用法和find_all一样,区别在于find只返回第一个匹配的元素,而find_all返回所有匹配的元素。

    3、CSS选择器

    通过select()直接传入CSS选择器即可完成选择

    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
        <p name='dromouse1'>人生苦短1</p>
    </body>
    </html>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html,'lxml')
    print(soup.select('.logo'))
    print(soup.select('body p'))
    

    获取属性

    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
        <p name='dromouse1'>人生苦短1</p>
    </body>
    </html>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html,'lxml')
    print(soup.select('body img')[0].attrs['src'])
    

    获取内容

    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
        <p name='dromouse1'>人生苦短1</p>
    </body>
    </html>
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html,'lxml')
    print(soup.select('body p')[0].get_text())
    

    总结:

    • 推荐使用lxml解析库,必要时使用html.parser
    • 标签选择器筛选功能弱,但是速度快
    • 建议使用find、find_all查询匹配单个结果或者多个结果
    • 如果对CSS选择器比较熟建议使用select()
    • 最后记住常用的获取属性和文本的方法。

    PyQuery解析库

    PyQuery是一个强大又灵活的网页解析库。如果你觉得正则写起来太麻烦,觉得BeautifulSoup语法太难记,又熟悉jQuery的语法,那么PyQuery就是你的绝佳选择。

    1、PyQuery初始化

    • 初始化字符串
    html='''
    <div>
        <ul>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    from pyquery import PyQuery as pq
    
    doc = pq(html) # pyquery对象
    print(doc('li')) # 元素选择
    
    • URL初始化
    from pyquery import PyQuery as pq
    
    doc = pq(url='http://www.baidu.com')
    print(doc('title'))
    
    • 文件初始化
    from pyquery import PyQuery as pq
    
    doc = pq(filename='pyquery.html')
    print(doc('li'))
    

    2、基本CSS选择器

    html='''
    <div id='container'>
        <ul class='list'>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    from pyquery import PyQuery as pq
    
    doc = pq(html)
    print(doc('#container .list li'))
    

    3、查找元素

    • 子元素
    html='''
    <div id='container'>
        <ul class='list'>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    from pyquery import PyQuery as pq
    
    doc = pq(html)
    items = doc('.list') # select the <ul class='list'> element
    # Uncomment the print calls below to inspect the intermediate results.
    # print(type(items))
    # print(items)
    li = items.find('li') # find() searches all descendants of the selection
    # print(li)
    lis = items.children() # children() returns direct children only
    # print(lis)
    print(items.children('.active')) # direct children filtered by a CSS selector
    
    • 父元素
    html='''
    <div id='container'>
        <ul class='list'>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    from pyquery import PyQuery as pq
    
    doc = pq(html)
    items = doc('.list')
    con = items.parent()
    print(con)
    
    • 兄弟节点
    html='''
    <div id='container'>
        <ul class='list'>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    from pyquery import PyQuery as pq
    
    doc = pq(html)
    items = doc('.list .item-0.active')
    print(items.siblings())
    

    4、遍历

    • 单个元素
    html='''
    <div id='container'>
        <ul class='list'>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    from pyquery import PyQuery as pq
    
    doc = pq(html)
    items = doc('.item-0.active')
    print(items)
    
    html='''
    <div id='container'>
        <ul class='list'>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    from pyquery import PyQuery as pq
    
    doc = pq(html)
    li= doc('li').items() # items() is the way to iterate over the matched elements
    print(li) #<generator object PyQuery.items at 0x0305F2D0> -- a generator
    print(next(li)) # takes the first element off the generator
    # The loop resumes after the element consumed by next() above,
    # so it starts from the second <li>.
    for i in li:
        print(i)
    

    5、获取信息

    • 获取属性
    html='''
    <div id='container'>
        <ul class='list'>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    from pyquery import PyQuery as pq
    
    doc = pq(html)
    # The <a> inside the <li> that carries both the item-0 and active classes.
    a= doc('.item-0.active a')
    print(a)
    print(a.attr('href')) # read an attribute by calling attr()
    print(a.attr.href) # the same attribute via attribute-style access
    
    • 获取文本
    html='''
    <div id='container'>
        <ul class='list'>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    from pyquery import PyQuery as pq
    
    doc = pq(html)
    a= doc('.item-0.active a')
    print(a)
    print(a.text())
    
    • 获取HTML
    html='''
    <div id='container'>
        <ul class='list'>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    from pyquery import PyQuery as pq
    
    doc = pq(html)
    a= doc('.item-0.active a')
    print(a)
    print(a.html())
    

    6、DOM操作

    • addClass、removeClass
    html='''
    <div id='container'>
        <ul class='list'>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    from pyquery import PyQuery as pq
    
    doc = pq(html)
    li = doc('.item-0.active')
    li.removeClass('active') # drop the "active" class from the selected <li>
    print(li)
    li.addClass('active') # put the "active" class back
    print(li)
    
    • attr 、css
    html='''
    <div id='container'>
        <ul class='list'>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    from pyquery import PyQuery as pq
    
    doc = pq(html)
    li = doc('.item-0.active')
    li.attr('name','link')
    print(li)
    li.css('color','red')
    print(li)
    
    • remove
    html='''
    <div class='wrap'>
        Hello,world
        <p>This is a paragraph</p>
    </div>
    '''
    from pyquery import PyQuery as pq
    
    doc = pq(html)
    wrap = doc('.wrap')
    print(wrap.text()) # text of the div, including the <p> text
    wrap.find('p').remove() # remove the <p> element from the document
    print(wrap.text()) # printed again after the <p> has been removed
    

      

  • 相关阅读:
    LeetCode 79. 单词搜索
    LeetCode 1143. 最长公共子序列
    LeetCode 55. 跳跃游戏
    LeetCode 48. 旋转图像
    LeetCode 93. 复原 IP 地址
    LeetCode 456. 132模式
    LeetCode 341. 扁平化嵌套列表迭代器
    LeetCode 73. 矩阵置零
    LeetCode 47. 全排列 II
    LeetCode 46. 全排列
  • 原文地址:https://www.cnblogs.com/crazyforever/p/5053749.html
Copyright © 2011-2022 走看看