zoukankan      html  css  js  c++  java
  • Python开发之爬虫模块介绍(二)

    BeautifulSoup模块

    BeautifulSoup 是一个又灵活又方便的网页解析库,而且处理高效,支持多种解析器。利用它不用编写正则表达式,即可方便地实现网页信息的提取。

    from bs4 import BeautifulSoup

    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <div class="author-name">我的Blog</div>
        <div class="info">这个人很懒,什么都没有留下。</div>
    </body>
    </html>
    '''
    # Parse the markup with the lxml parser.
    soup = BeautifulSoup(html, 'lxml')

    # Print the re-formatted (pretty-printed) markup.
    formatted = soup.prettify()
    print(formatted)

    # Print the text held by the <title> tag.
    title_text = soup.title.string
    print(title_text)
    

    1、标签选择器

    • 选择元素
    from bs4 import BeautifulSoup

    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <div class="author-name">我的Blog</div>
        <div class="info">这个人很懒,什么都没有留下。</div>
    </body>
    </html>
    '''
    soup = BeautifulSoup(html, 'lxml')

    # Select elements by using the tag name as an attribute of the soup.
    title_tag = soup.title
    print(title_tag)        # <title>Blog示例</title>
    print(type(title_tag))  # <class 'bs4.element.Tag'>

    head_tag = soup.head
    print(head_tag)         # <head><title>Blog示例</title></head>

    anchor_tag = soup.a
    print(anchor_tag)
    # Output:
    # <a class="logo" href="#">
    # <img alt="" src="https://q1mi.github.io/Blog/asset/img/head_img.jpg"/>
    # </a>
    
    • 获取名称
    from bs4 import BeautifulSoup

    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <div class="author-name">我的Blog</div>
        <div class="info">这个人很懒,什么都没有留下。</div>
    </body>
    </html>
    '''
    soup = BeautifulSoup(html, 'lxml')

    # `.name` gives the name of the selected tag.
    tag_name = soup.title.name
    print(tag_name)  # title
    
    • 获取属性
    from bs4 import BeautifulSoup

    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'></p>
    </body>
    </html>
    '''
    soup = BeautifulSoup(html, 'lxml')
    paragraph = soup.p

    # Two equivalent ways to read the `name` attribute of the <p> tag.
    via_attrs = paragraph.attrs['name']
    print(via_attrs)   # dromouse
    via_index = paragraph['name']
    print(via_index)   # dromouse
    
    • 获取内容
    from bs4 import BeautifulSoup

    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
    </body>
    </html>
    '''
    soup = BeautifulSoup(html, 'lxml')

    # `.string` gives the text content of the <p> tag.
    paragraph_text = soup.p.string
    print(paragraph_text)  # 人生苦短
    
    • 嵌套
    from bs4 import BeautifulSoup

    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
    </body>
    </html>
    '''
    soup = BeautifulSoup(html, 'lxml')

    # Tag selections can be chained: <head> -> <title> -> its text.
    head_tag = soup.head
    title_tag = head_tag.title
    print(title_tag.string)  # Blog示例
    
    • 子节点和子孙节点
    from bs4 import BeautifulSoup

    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
    </body>
    </html>
    '''
    soup = BeautifulSoup(html, 'lxml')

    # `.contents` is a list of the child nodes of <body>.
    body_children = soup.body.contents
    print(body_children)
    # Output:
    # ['\n', <a class="logo" href="#">
    # <img alt="" src="https://q1mi.github.io/Blog/asset/img/head_img.jpg"/>
    # </a>, '\n', <p name="dromouse">人生苦短</p>, '\n']
    
    from bs4 import BeautifulSoup

    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
    </body>
    </html>
    '''
    soup = BeautifulSoup(html, 'lxml')

    # `.children` is an iterator, so printing it shows only the iterator object.
    print(soup.body.children)

    # Walk the child nodes of <body> together with their position.
    for index, child in enumerate(soup.body.children):
        print(index, child)
    from bs4 import BeautifulSoup

    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
    </body>
    </html>
    '''
    soup = BeautifulSoup(html, 'lxml')

    # `.descendants` is lazy as well; printing it shows only the generator object.
    print(soup.body.descendants)

    # Walk every node nested under <body> together with its position.
    for index, node in enumerate(soup.body.descendants):
        print(index, node)
    • 父节点和祖先节点
    from bs4 import BeautifulSoup

    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
    </body>
    </html>
    '''
    soup = BeautifulSoup(html, 'lxml')

    # `.parent` is the tag that directly encloses <img>.
    image_parent = soup.img.parent
    print(image_parent)
    # Output:
    # <a class="logo" href="#">
    # <img alt="" src="https://q1mi.github.io/Blog/asset/img/head_img.jpg"/>
    # </a>
    
    from bs4 import BeautifulSoup

    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
    </body>
    </html>
    '''
    soup = BeautifulSoup(html, 'lxml')

    # `.parents` walks the ancestor nodes; collect them into a list to print.
    ancestors = list(soup.img.parents)
    print(ancestors)
    
    • 兄弟节点
    from bs4 import BeautifulSoup

    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
    </body>
    </html>
    '''
    soup = BeautifulSoup(html, 'lxml')
    anchor = soup.a

    # Sibling nodes that come after the <a> tag.
    following = list(anchor.next_siblings)
    print(following)

    # Sibling nodes that come before the <a> tag.
    preceding = list(anchor.previous_siblings)
    print(preceding)
    

    2、标准选择器

    find_all(name, attrs, recursive, string, limit, **kwargs)(string 参数在 Beautiful Soup 4.4.0 之前名为 text,旧名称仍可使用但已不推荐)

    可根据标签名,属性,内容查找文档

    • name
    from bs4 import BeautifulSoup

    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
        <p name='dromouse1'>人生苦短1</p>
    </body>
    </html>
    '''
    soup = BeautifulSoup(html, 'lxml')

    # find_all() with a tag name returns a list of every matching tag.
    paragraphs = soup.find_all('p')
    print(paragraphs)
    # Being a list, the result can be indexed.
    print(paragraphs[0])
    
    from bs4 import BeautifulSoup

    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
        <p name='dromouse1'>人生苦短1</p>
    </body>
    </html>
    '''
    soup = BeautifulSoup(html, 'lxml')

    # The result of find_all() behaves like a list.
    anchors = soup.find_all('a')
    print(anchors)
    print(type(anchors))

    # Each element is a tag, so find_all() can be called on it again.
    for anchor in anchors:
        print(anchor.find_all('img'))
    
    • attrs
    from bs4 import BeautifulSoup

    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
        <p name='dromouse1'>人生苦短1</p>
    </body>
    </html>
    '''
    soup = BeautifulSoup(html, 'lxml')

    # Look up tags by attribute value through the `attrs` dictionary.
    by_class = soup.find_all(attrs={"class": 'logo'})
    print(by_class)

    by_name = soup.find_all(attrs={"name": 'dromouse'})
    print(by_name)
    
    • text
    from bs4 import BeautifulSoup

    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
        <p name='dromouse1'>人生苦短1</p>
    </body>
    </html>
    '''
    soup = BeautifulSoup(html, 'lxml')

    # Search by text content instead of by tag. The keyword is `string`;
    # `text` is the older, deprecated spelling of the same argument
    # (renamed in Beautiful Soup 4.4.0).
    # The result contains the matching strings themselves, not their tags.
    matches = soup.find_all(string='人生苦短')
    print(matches)
    

    find 只返回第一个匹配的元素,find_all 返回所有匹配的元素;find 的用法和 find_all 一样。

    3、CSS选择器

    通过select()直接传入CSS选择器即可完成选择

    from bs4 import BeautifulSoup

    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
        <p name='dromouse1'>人生苦短1</p>
    </body>
    </html>
    '''
    soup = BeautifulSoup(html, 'lxml')

    # Class selector: elements whose class is "logo".
    logo_nodes = soup.select('.logo')
    print(logo_nodes)

    # Descendant selector: <p> elements inside <body>.
    body_paragraphs = soup.select('body p')
    print(body_paragraphs)
    

    获取属性

    from bs4 import BeautifulSoup

    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
        <p name='dromouse1'>人生苦短1</p>
    </body>
    </html>
    '''
    soup = BeautifulSoup(html, 'lxml')

    # Take the first <img> matched by the selector and read its `src` attribute.
    first_image = soup.select('body img')[0]
    print(first_image.attrs['src'])
    

    获取内容

    from bs4 import BeautifulSoup

    html = '''
    <html lang="zh-CN">
    <head><title>Blog示例</title></head>
    <body>
        <a href="#" class="logo">
            <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
        </a>    
        <p name='dromouse'>人生苦短</p>
        <p name='dromouse1'>人生苦短1</p>
    </body>
    </html>
    '''
    soup = BeautifulSoup(html, 'lxml')

    # Take the first <p> matched by the selector and read its text.
    first_paragraph = soup.select('body p')[0]
    print(first_paragraph.get_text())
    

    总结:

    • 推荐使用lxml解析库,必要时使用html.parser
    • 标签选择器筛选功能弱,但是速度快
    • 建议使用find、find_all查询匹配单个结果或者多个结果
    • 如果对CSS选择器比较熟建议使用select()
    • 最后记住常用的获取属性和文本的方法。

    PyQuery解析库

    PyQuery 是一个强大又灵活的网页解析库。如果你觉得正则写起来太麻烦,如果你觉得 BeautifulSoup 语法太难,如果你熟悉 jQuery 的语法,那么 PyQuery 就是你的绝佳选择,好多如果如果如果……

    1、PyQuery初始化

    • 初始化字符串
    from pyquery import PyQuery as pq

    html = '''
    <div>
        <ul>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    # Build a PyQuery object from the markup string.
    doc = pq(html)

    # Select elements by tag name.
    list_items = doc('li')
    print(list_items)
    
    • URL初始化
    from pyquery import PyQuery as pq

    # Build the PyQuery object from the page at the given URL.
    page_url = 'http://www.baidu.com'
    doc = pq(url=page_url)
    title = doc('title')
    print(title)
    
    • 文件初始化
    from pyquery import PyQuery as pq

    # Build the PyQuery object from a local HTML file.
    source_file = 'pyquery.html'
    doc = pq(filename=source_file)
    list_items = doc('li')
    print(list_items)
    

    2、基本CSS选择器

    from pyquery import PyQuery as pq

    html = '''
    <div id='container'>
        <ul class='list'>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    doc = pq(html)

    # Combined selector: id "container" -> class "list" -> <li> elements.
    selector = '#container .list li'
    print(doc(selector))
    

    3、查找元素

    • 子元素
    from pyquery import PyQuery as pq

    html = '''
    <div id='container'>
        <ul class='list'>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    doc = pq(html)
    items = doc('.list')

    # Other ways to reach the <li> elements under the list; print these
    # (or `type(items)` / `items`) to inspect them.
    found_lis = items.find('li')
    child_nodes = items.children()

    # children() also accepts a selector to filter the child elements.
    active_children = items.children('.active')
    print(active_children)
    
    • 父元素
    from pyquery import PyQuery as pq

    html = '''
    <div id='container'>
        <ul class='list'>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    doc = pq(html)

    # Select the list, then step up to its parent element.
    list_node = doc('.list')
    container = list_node.parent()
    print(container)
    
    • 兄弟节点
    from pyquery import PyQuery as pq

    html = '''
    <div id='container'>
        <ul class='list'>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    doc = pq(html)

    # Select the <li> that has both classes "item-0" and "active" ...
    active_item = doc('.list .item-0.active')
    # ... and print its sibling elements.
    print(active_item.siblings())
    

    4、遍历

    • 单个元素
    from pyquery import PyQuery as pq

    html = '''
    <div id='container'>
        <ul class='list'>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    doc = pq(html)

    # Only one <li> carries both classes, so this selects a single element.
    active_item = doc('.item-0.active')
    print(active_item)
    
    from pyquery import PyQuery as pq

    html = '''
    <div id='container'>
        <ul class='list'>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    doc = pq(html)

    # Multiple elements: items() is the way to iterate over a selection.
    li_iter = doc('li').items()
    # It is a generator, e.g. <generator object PyQuery.items at 0x0305F2D0>.
    print(li_iter)

    # next() consumes the first element ...
    first_item = next(li_iter)
    print(first_item)

    # ... so the loop continues with the remaining ones.
    for item in li_iter:
        print(item)
    

    5、获取信息

    • 获取属性
    from pyquery import PyQuery as pq

    html = '''
    <div id='container'>
        <ul class='list'>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    doc = pq(html)

    # The <a> inside the <li> that has classes "item-0" and "active".
    link = doc('.item-0.active a')
    print(link)

    # Two ways to read the `href` attribute: method call and attribute access.
    href_by_call = link.attr('href')
    print(href_by_call)
    href_by_attribute = link.attr.href
    print(href_by_attribute)
    
    • 获取文本
    from pyquery import PyQuery as pq

    html = '''
    <div id='container'>
        <ul class='list'>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    doc = pq(html)

    link = doc('.item-0.active a')
    print(link)

    # text() gives the text content of the selected element.
    link_text = link.text()
    print(link_text)
    
    • 获取HTML
    from pyquery import PyQuery as pq

    html = '''
    <div id='container'>
        <ul class='list'>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    doc = pq(html)

    link = doc('.item-0.active a')
    print(link)

    # html() gives the inner markup of the selected element.
    inner_markup = link.html()
    print(inner_markup)
    

    6、DOM操作

    • addClass、removeClass
    from pyquery import PyQuery as pq

    html = '''
    <div id='container'>
        <ul class='list'>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    doc = pq(html)
    target = doc('.item-0.active')

    # Remove the "active" class and show the element.
    target.removeClass('active')
    print(target)

    # Add the class back and show the element again.
    target.addClass('active')
    print(target)
    
    • attr 、css
    from pyquery import PyQuery as pq

    html = '''
    <div id='container'>
        <ul class='list'>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    doc = pq(html)
    target = doc('.item-0.active')

    # attr() with two arguments sets an attribute on the element.
    target.attr('name', 'link')
    print(target)

    # css() with two arguments sets a style property on the element.
    target.css('color', 'red')
    print(target)
    
    • remove
    from pyquery import PyQuery as pq

    html = '''
    <div class='wrap'>
        Hello,world
        <p>This is a paragraph</p>
    </div>
    '''
    doc = pq(html)
    wrap = doc('.wrap')

    # Text of the wrapper while the <p> is still present.
    text_before = wrap.text()
    print(text_before)

    # Remove the nested <p>, then read the text again.
    paragraph = wrap.find('p')
    paragraph.remove()
    text_after = wrap.text()
    print(text_after)
    

      

  • 相关阅读:
    cf round #421 div2 D. Mister B and PR Shifts
    cf round #421 div2 C. Mister B and Boring Game(trick)
    UVa 12716 GCD XOR
    cf 821E Okabe and El Psy Kongroo(矩阵快速幂)
    hdu 6109 数据分割(并查集+set)
    poj 2887 Big String(块状链表)
    hdu 6119 小小粉丝度度熊(区间双指针)
    hdu 6118 度度熊的交易计划(可行费用流)
    hdu 6015 Gameia(树上博弈)
    hdu 6096 String(AC自动机巧妙建图)
  • 原文地址:https://www.cnblogs.com/crazyforever/p/5053749.html
Copyright © 2011-2022 走看看