  • Python: an introduction to web crawlers

    One article to get you started with Python web scraping

    I. What is a web crawler:

        1. Plain-language definition: a crawler is a program that simulates a person requesting a website. It can request pages automatically, download the data, and then extract the valuable parts according to a set of rules (see the short sketch after this list).

        2. Formal definition: see the Baidu Baike entry on web crawlers.
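
    To make that workflow concrete, here is a minimal sketch (an addition to this article, not code from it) that requests a page and extracts data from it with the requests and BeautifulSoup libraries covered below; the URL is only a placeholder.

    # Minimal crawler sketch: request a page, then extract data by rule
    import requests
    from bs4 import BeautifulSoup

    response = requests.get("http://www.baidu.com")      # 1. request the page
    response.encoding = "utf-8"                          # make sure the text decodes correctly
    soup = BeautifulSoup(response.text, "lxml")          # 2. parse the HTML
    print(soup.title.string)                             # 3. extract data by rule
    for a in soup.find_all("a"):                         #    e.g. every link and its text
        print(a.get("href"), a.get_text(strip=True))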

    II. Python urllib:

    # demo01.py (basic urllib usage)

    # Import urllib (part of the standard library, nothing to install)
    import urllib.request
    # Request Baidu and receive the response
    response = urllib.request.urlopen("http://www.baidu.com/")
    # Print the page
    print(response.read().decode('utf-8'))

    # demo02.py (usage walkthrough)

    # urllib usage walkthrough
    # urlopen : urllib.request.urlopen(url, data=None, timeout=...)
    
    import socket
    import urllib.request
    import urllib.parse
    import urllib.error
    
    """
    A: plain GET request
    response = urllib.request.urlopen('http://www.baidu.com/')
    print(response.read().decode('utf-8'))
    
    B: POST request (passing data turns the request into a POST)
    data = urllib.parse.urlencode({'word': 'hello'}).encode('utf-8')
    response = urllib.request.urlopen("http://httpbin.org/post", data=data)
    print(response.read())
    
    C: timeout setting
    response = urllib.request.urlopen("http://httpbin.org/get", timeout=1)
    print(response.read())
    """
    
    try:
        response = urllib.request.urlopen("http://httpbin.org/get", timeout=0.1)
    except urllib.error.URLError as e:
        # A timeout surfaces as a socket.timeout wrapped in URLError
        if isinstance(e.reason, socket.timeout):
            print('TIME OUT')

    # demo03.py (the response object)

    # urllib: inspecting the response
    
    import urllib.request
    response = urllib.request.urlopen("http://www.baidu.com/")
    # Print the response type
    print(type(response))
    # Print the status code
    print(response.status)
    # Print the response headers
    print(response.getheaders())

    # demo04.py (the Request class in detail)

    # The Request class in detail
    
    import urllib.request
    from urllib import parse
    
    """
    A: wrap the URL in a Request object
    request = urllib.request.Request('http://www.baidu.com')
    response = urllib.request.urlopen(request)
    print(response.read().decode('utf-8'))
    
    B: pass headers and data when building the Request
    url = "http://httpbin.org/post"
    # Request headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
        "Host": "httpbin.org"
    }
    # Request data
    dict = {
        "name": "Germey"
    }
    data = bytes(parse.urlencode(dict), encoding='utf-8')
    request = urllib.request.Request(url=url, data=data, headers=headers, method='POST')
    response = urllib.request.urlopen(request)
    print(response.read().decode('utf-8'))
    """
    
    url = "http://httpbin.org/post"
    # Request data
    dict = {
        "name": "Germey"
    }
    data = bytes(parse.urlencode(dict), encoding='utf-8')
    request = urllib.request.Request(url=url, data=data, method='POST')
    # Headers can also be added after the Request has been created
    request.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36")
    response = urllib.request.urlopen(request)
    print(response.read().decode('utf-8'))

    # demo05.py (proxies)

    # ProxyHandler (proxies)
    import urllib.request
    
    # Replace the xxx placeholders with a real proxy address and port
    proxy_handler = urllib.request.ProxyHandler({
        "http": "http://xxx.xxx.xxx.xxx:xxxx",
        "https": "https://xxx.xxx.xxx.xxx:xxxx"
    })
    opener = urllib.request.build_opener(proxy_handler)
    response = opener.open('http://www.baidu.com')
    print(response.read().decode('utf-8'))

    # demo06.py (cookies)

    # Cookie handling
    
    import http.cookiejar
    import urllib.request
    
    """
    A: basic use of http.cookiejar
    cookie = http.cookiejar.CookieJar()
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    response = opener.open('http://www.baidu.com')
    print(response.read().decode('utf-8'))
    
    B: MozillaCookieJar saves the site's cookies to a local file
    filename = "utils/cookie.txt"
    cookie = http.cookiejar.MozillaCookieJar(filename)
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    response = opener.open('http://www.baidu.com')
    cookie.save(ignore_discard=True, ignore_expires=True)
    
    C: LWPCookieJar also saves the site's cookies to a local file (in LWP format)
    filename = "utils/cookie01.txt"
    cookie = http.cookiejar.LWPCookieJar(filename)
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    response = opener.open('http://www.baidu.com')
    cookie.save(ignore_discard=True, ignore_expires=True)
    
    D: load the cookies back from the file
    """
    cookie = http.cookiejar.LWPCookieJar()
    cookie.load('utils/cookie01.txt', ignore_discard=True, ignore_expires=True)
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    response = opener.open('http://www.baidu.com')
    print(response.read().decode('utf-8'))

    # demo07.py (exception handling)

    # Exception handling
    
    import urllib.request
    from urllib import error
    
    """
    A: basic use of urllib.error
    try:
        response = urllib.request.urlopen('http://www.baidu.com')
    except error.URLError as e:
        print(e.reason)
    
    B: with an else branch
    try:
        response = urllib.request.urlopen('http://www.baidu.com/')
        print(response.read().decode('utf-8'))
    except error.URLError as e:
        print(e.reason)
    else:
        print("*************")
    
    C: timeout
    try:
        response = urllib.request.urlopen('http://www.baidu.com', timeout=0.01)
    except error.URLError as e:
        print(e.reason)
    """
    
    # A link that does not exist
    try:
        response = urllib.request.urlopen("http://www.abcdhaha2.com/")
        html = response.read().decode('utf-8')
        print(html)
    except error.URLError as e:
        print(e.reason)

    # demo08.py (URL parsing)

    from urllib.parse import urlparse
    from urllib.parse import urlunparse
    from urllib.parse import urljoin
    from urllib.parse import urlencode
    
    # Syntax: urlparse(url, scheme='http|https', allow_fragments=True)
    
    # A
    result = urlparse('https://www.baidu.com/index.html;user?id=5#comment')
    print(type(result))
    print(result)
    
    # B: scheme is only applied when the URL itself has none
    result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme="https")
    print(result)
    
    # C
    result = urlparse('https://www.baidu.com/index.html;user?id=5#comment', allow_fragments=True)
    print(result)
    
    # D
    result = urlparse('https://www.baidu.com/index.html;user?id=5#comment', allow_fragments=False)
    print(result)
    
    # E
    result = urlparse('https://www.baidu.com/index.html#comment', allow_fragments=False)
    print(result)
    
    # F (urlunparse)
    data = ["http", "www.baidu.com", "index.html", "user", "a=6", "comment"]
    print(urlunparse(data))
    
    # G (urljoin)
    # Syntax: urljoin(base_url, relative_url)
    print(urljoin("https://www.cnblogs.com/xingxingnbsp/p/xxxxxxxxx.html", "12129466.html"))
    
    # H (urlencode)
    params = {
        'name': 'hello_urllib',
        'age': 18
    }
    base_url = 'http://www.baidu.com?'
    url = base_url + urlencode(params)
    print(url)

    III. Python requests:

    1. Install the requests library: pip install requests

    # demo01.py

    # Basic use of requests
    
    import requests
    
    response = requests.get("http://www.baidu.com")
    print(type(response))           # Print the response type
    print(response.status_code)     # Print the status code
    print(type(response.text))      # Print the type of the response body
    print(response.text)            # Print the response body
    print(response.cookies)         # Print the response cookies

    2. Request methods (a runnable sketch follows this list):

    requests.get(url)
    requests.post(url)
    requests.put(url)
    requests.patch(url)
    requests.delete(url)
    requests.head(url)
    requests.options(url)
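
    The following sketch (not part of the original article) exercises each of these methods against httpbin.org, which simply echoes the request back, so it is convenient for experiments.

    import requests
    
    # Each call returns a Response object; only the status code is printed here
    print(requests.get('http://httpbin.org/get').status_code)
    print(requests.post('http://httpbin.org/post', data={'name': 'hello'}).status_code)
    print(requests.put('http://httpbin.org/put', data={'name': 'hello'}).status_code)
    print(requests.patch('http://httpbin.org/patch', data={'name': 'hello'}).status_code)
    print(requests.delete('http://httpbin.org/delete').status_code)
    # HEAD and OPTIONS responses carry no body, so the status code is all there is to see
    print(requests.head('http://httpbin.org/get').status_code)
    print(requests.options('http://httpbin.org/get').status_code)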

    3. Basic GET requests:

    # demo02.py

    import requests
    
    """
    A: plain GET
    response = requests.get('http://www.baidu.com')
    print(response.text)
    
    B: query parameters written into the URL by hand
    response = requests.get('http://httpbin.org/get?name=hello&age=22')
    print(response.text)
    """
    
    # Equivalent to B, but letting requests build the query string
    data = {
        "name": "hello",
        "age": 22
    }
    response = requests.get('http://httpbin.org/get', params=data)
    print(response.text)

    4. Parsing JSON:

    # demo03.py

    # Parsing JSON
    
    import requests
    response = requests.get('https://api.jinse.com/v6/www/information/list?catelogue_key=news&limit=23&information_id=18762945&flag=down&version=9.9.9&_source=www')
    print(type(response))
    print(response.json())
    print(type(response.json()))

    5. Fetching binary data:

    # demo04.py

    import requests
    
    """
    A: text vs. content
    response = requests.get('https://images.pexels.com/photos/3393793/pexels-photo-3393793.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500')
    print(type(response.text))
    print(type(response.content))
    print(response.text)
    print(response.content)
    """
    response = requests.get('https://images.pexels.com/photos/3393793/pexels-photo-3393793.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500')
    # response.content holds the raw bytes, so it can be written straight to a file
    with open('images/image.png', 'wb') as f:
        f.write(response.content)

    6. Adding headers:

    # demo05.py

    import requests
    
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
    }
    response = requests.get("http://www.baidu.com", headers=headers)
    print(response.text)

    7. Basic POST requests:

    # demo06.py

    import requests
    
    """
    A: POST with form data
    data = {
        "name": "hello",
        "age": 22
    }
    response = requests.post("http://httpbin.org/post", data=data)
    print(response.text)
    """
    
    # Same request, with headers added
    data = {
        "name": "hello",
        "age": 22
    }
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
    }
    response = requests.post("http://httpbin.org/post", data=data, headers=headers)
    print(response.text)

    8. Response attributes:

    # demo07.py

    import requests
    
    response = requests.get('http://www.baidu.com')
    print(type(response.status_code), response.status_code)      # Status code type and value
    print(type(response.headers), response.headers)              # Response headers type and value
    print(type(response.cookies), response.cookies)              # Cookies type and value
    print(type(response.url), response.url)                      # URL type and value
    print(type(response.history), response.history)              # Redirect history

    9. Checking the status code:

    # demo08.py

    import requests
    
    """
    A: compare against requests.codes.ok
    response = requests.get('http://www.baidu.com')
    # Written as a Python conditional expression
    exit() if not response.status_code == requests.codes.ok else print('request successfully')
    
    B: compare against the literal 200
    response = requests.get('http://www.baidu.com')
    # Written as a Python conditional expression
    exit() if not response.status_code == 200 else print('request successfully')
    
    """
    response = requests.get('http://www.baidu.com')
    if not response.status_code == 200:
        exit()
    else:
        print('request successfully')
    
    # All three variants above do the same thing

    10. Advanced usage:

    # demo09.py

    import requests
    
    # A: File upload ----------------------------------------------------------------
    files = {
        "files": open('images/image.png', 'rb')
    }
    # httpbin.org/post echoes the uploaded file back in the response
    response = requests.post('http://httpbin.org/post', files=files)
    print(response.text)
    
    # B: Reading cookies -------------------------------------------------------------
    response = requests.get('http://www.baidu.com')
    print(response.cookies)
    for key, value in response.cookies.items():
        print(key + "=" + value)
    
    # C: Keeping a session -----------------------------------------------------------
    # Two independent requests.get() calls do not share cookies, so the cookie set by
    # the first request is gone in the second one:
    requests.get('http://httpbin.org/cookies/set/number/123456789')
    response = requests.get('http://httpbin.org/cookies')
    print(response.text)
    
    # A Session object keeps cookies across requests:
    s = requests.session()
    s.get('http://httpbin.org/cookies/set/number/123456789')
    response = s.get('http://httpbin.org/cookies')
    print(response.text)
    
    # D: Proxies ---------------------------------------------------------------------
    # Option 1: plain HTTP/HTTPS proxy
    proxies = {
        'http': 'http://ip:port',
        'https': 'https://ip:port'
    }
    response = requests.get('http://www.baidu.com', proxies=proxies)
    print(response.status_code)
    
    # Option 2: proxy that requires authentication
    proxies = {
        'http': 'http://user:password@ip:port/',
        'https': 'https://user:password@ip:port/'
    }
    response = requests.get('http://www.baidu.com', proxies=proxies)
    print(response.status_code)
    
    # Option 3: SOCKS5 proxy (needs the extra dependency: pip install requests[socks])
    proxies = {
        'http': 'socks5://ip:port',
        'https': 'socks5://ip:port'
    }
    response = requests.get('http://www.baidu.com', proxies=proxies)
    print(response.status_code)
    
    # E: Certificate verification ------------------------------------------------------
    response = requests.get('http://www.12306.cn')
    print(response.status_code)
    
    # Skip certificate verification
    response = requests.get('http://www.12306.cn', verify=False)
    print(response.status_code)
    
    # Use your own client certificate; change 'path/server.crt' and 'path/key' to your own paths
    response = requests.get('http://www.12306.cn', cert=('path/server.crt', 'path/key'))
    print(response.status_code)
    
    # F: Timeouts ------------------------------------------------------------------------
    from requests.exceptions import ReadTimeout
    try:
        response = requests.get('http://www.taobao.com', timeout=0.1)
        print(response.status_code)
    except ReadTimeout:
        print("Timeout")
    
    # G: Authentication --------------------------------------------------------------------
    from requests.auth import HTTPBasicAuth
    response = requests.get('http://www.taobao.com', auth=HTTPBasicAuth('user', '123'))
    print(response.status_code)
    
    # The shorter tuple form is equivalent
    response = requests.get('http://www.taobao.com', auth=('user', '123'))
    print(response.status_code)
    
    # H: Exception handling ------------------------------------------------------------------
    from requests.exceptions import ReadTimeout, ConnectionError, HTTPError, RequestException
    try:
        response = requests.get('http://www.taobao.com', timeout=0.1)
        print(response.status_code)
    except ReadTimeout:
        print("Timeout")
    except HTTPError:
        print("HTTPError")
    except ConnectionError:
        print("ConnectionError")
    except RequestException:
        print("Error")

    IV. The BeautifulSoup library in detail (an HTML parser):

    1. Install: pip install beautifulsoup4

    2. Basic usage of BeautifulSoup:

    # demo01.py

    # Basic use of BeautifulSoup
    from bs4 import BeautifulSoup
    html = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>BeautifulSoup 学习</title>
    </head>
    <body>
    <h1>BeautifulSoup</h1>
    <div>
        <h2>这是一个列表</h2>
        <ul>
            <li>选项1</li>
            <li>选项2</li>
            <li>选项3</li>
            <li>选项4</li>
            <li>选项5</li>
            <li>选项6</li>
            <li>选项7</li>
            <li>选项8</li>
            <li>选项9</li>
        </ul>
    </div>
    </body>
    </html>
    """
    soup = BeautifulSoup(html,'lxml')
    print(soup.prettify())
    print(soup.title.string)

    3. Tag selectors (each returns only the first match):

    # demo02.py

    # BeautifulSoup tag selectors (only the first match is returned)
    from bs4 import BeautifulSoup
    html = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>BeautifulSoup 学习</title>
    </head>
    <body>
    <h1>BeautifulSoup</h1>
    <div>
        <h2>这是一个列表</h2>
        <ul>
            <li>选项1</li>
            <li>选项2</li>
            <li>选项3</li>
            <li>选项4</li>
            <li>选项5</li>
            <li>选项6</li>
            <li>选项7</li>
            <li>选项8</li>
            <li>选项9</li>
        </ul>
    </div>
    </body>
    </html>
    """
    soup = BeautifulSoup(html,'lxml')
    print(soup.title)
    print(type(soup.title))
    print(soup.head)
    print(soup.li)

    4. Getting the tag name:

    # demo03.py

    # BeautifulSoup: getting the tag name
    from bs4 import BeautifulSoup
    html = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>BeautifulSoup 学习</title>
    </head>
    <body>
    </body>
    </html>
    """
    soup = BeautifulSoup(html,'lxml')
    print(soup.title.name)

    5. Getting tag attributes:

    # demo04.py

    # BeautifulSoup: getting tag attributes
    from bs4 import BeautifulSoup
    html = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>BeautifulSoup 学习</title>
    </head>
    <body>
    <p class="font-p"></p>
    <a href="http://www.baidu.com">百度一下 你就知道</a>
    </body>
    </html>
    """
    soup = BeautifulSoup(html,'lxml')
    print(soup.p.attrs)
    print(soup.p.attrs["class"])
    print(soup.a.attrs["href"])

    6. Getting the text content:

    # demo05.py

    # BeautifulSoup: getting the text content
    from bs4 import BeautifulSoup
    html = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>BeautifulSoup 学习</title>
    </head>
    <body>
    <p>div</p>
    <a href="http://www.baidu.com">百度一下 你就知道</a>
    </body>
    </html>
    """
    soup = BeautifulSoup(html,'lxml')
    print(soup.p.string)
    print(soup.a.string)

    7. Nested selection:

     # demo06.py

    # Nested selection
    from bs4 import BeautifulSoup
    html = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>BeautifulSoup 学习</title>
    </head>
    <body>
    <h1>BeautifulSoup</h1>
    <div>
        <h2>这是一个列表</h2>
        <ul>
            <li>选项1</li>
        </ul>
    </div>
    </body>
    </html>
    """
    soup = BeautifulSoup(html,'lxml')
    print(soup.ul.li.string)

    8. Child and descendant nodes:

     # demo07.py

    # Child and descendant nodes
    from bs4 import BeautifulSoup
    
    html = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>BeautifulSoup 学习</title>
    </head>
    <body>
    <h1>BeautifulSoup</h1>
    <div>
        <h2>这是一个列表</h2>
        <ul><li>选项1</li><li>选项2</li><li><a href="http://www.baidu.com">百度一下 你就知道</a></li></ul>
    </div>
    </body>
    </html>
    """
    soup = BeautifulSoup(html, 'lxml')
    print(soup.ul.contents)     # .contents returns all direct children as a list
    print(soup.ul.children)     # .children returns an iterator over the direct children
    print(soup.ul.descendants)  # .descendants returns a generator over all descendants
    for i, child in enumerate(soup.ul.descendants):
        print(i, child)

    9. Parent and ancestor nodes:

     # demo08.py

    # Parent and ancestor nodes
    
    from bs4 import BeautifulSoup
    
    html = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>Title</title>
    </head>
    <body>
    <div>
        <ol>
            <li><a href="http://www.baidu.com">百度一下 你就知道</a></li>
        </ol>
    </div>
    </body>
    </html>
    """
    soup = BeautifulSoup(html,'lxml')
    print(soup.a.parent)            # The direct parent node
    print(type(soup.a.parents))     # .parents is a generator over all ancestor nodes
    print(list(enumerate(soup.a.parents)))

    10. Sibling nodes:

    # demo09.py

    # Sibling nodes
    from bs4 import BeautifulSoup
    
    html = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>BeautifulSoup 学习</title>
    </head>
    <body>
    <h1>BeautifulSoup</h1>
    <div>
        <h1>我是一个大大的H1</h1>
        <h2>我是一个大大的H2</h2>
        <p>我是一个简单的p标签</p>
        <h3>我是一个大大的H3</h3>
        <h4>我是一个大大的H4</h4>
    </div>
    </body>
    </html>
    """
    html = html.replace('\n', '').replace(' ', '')      # Strip the newlines and spaces from the HTML
    soup = BeautifulSoup(html, 'lxml')
    print(list(enumerate(soup.p.next_siblings)))        # All siblings after the current node
    print(list(enumerate(soup.p.previous_siblings)))    # All siblings before the current node

    11. Standard selectors (*** important ***)

     # demo10.py

    from bs4 import BeautifulSoup
    
    # Standard selectors (important; worth revisiting)
    # Syntax: find_all(name, attrs, recursive, text, **kwargs)
    """
    find returns the first matching element; find_all returns all matching elements.
    Related helpers (a short sketch of these follows demo10 below):
        1. find_parent()          # Returns the direct parent node
        2. find_parents()         # Returns all ancestor nodes
        3. find_next_sibling()    # Returns the first sibling after the current node
        4. find_next_siblings()   # Returns all siblings after the current node
        5. find_all_next()        # Returns all matching nodes after the current node
        6. find_next()            # Returns the first matching node after the current node
        7. find_all_previous()    # Returns all matching nodes before the current node
        8. find_previous()        # Returns the first matching node before the current node
    """
    
    html = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>BeautifulSoup 学习</title>
    </head>
    <body>
    <h1>BeautifulSoup</h1>
    <div id="divid">
        <h2>这是一个列表</h2>
        <ul id="list-1">
            <li class="zhangsan">选项1</li>
            <li class="zhangsan">选项2</li>
            <li class="zhangsan">选项3</li>
        </ul>
        <ul id="list-2">
            <li class="lisi">选项1</li>
            <li class="lisi">选项2</li>
            <li class="lisi">选项3</li>
        </ul>
    </div>
    </body>
    </html>
    """
    
    
    # A: name --------------------------------------------------------------
    soup = BeautifulSoup(html, 'lxml')
    print(soup.find_all('ul'))          # All ul tags, returned as a list
    print(type(soup.find_all('ul')[0])) # Type of a single result
    for ul in soup.find_all('ul'):
        print(ul.find_all('li'))
    
    # B: attrs -------------------------------------------------------------
    # Option 1:
    soup = BeautifulSoup(html, 'lxml')
    print(soup.find_all(attrs={"id": "list-1"}))    # All elements with id "list-1"
    print(soup.find_all(attrs={"class": "lisi"}))   # All elements with class "lisi"
    # Option 2:
    print(soup.find_all(id="list-1"))       # All elements with id "list-1"
    print(soup.find_all(class_="lisi"))     # All elements with class "lisi"
    # Both options produce the same result
    
    # C: text --------------------------------------------------------------
    soup = BeautifulSoup(html, 'lxml')
    print(soup.find_all(text="选项1"))
    
    # D: CSS selectors (***) -----------------------------------------------
    # 1: select()
    soup = BeautifulSoup(html, 'lxml')
    print(soup.select('#list-2'))       # ID selector
    print(soup.select('.zhangsan'))     # class selector
    print(soup.select('ul li'))         # tag selector
    print(soup.select('#divid h2'))     # ID and tag combined
    
    # 2: nested select()
    soup = BeautifulSoup(html, 'lxml')
    for ul in soup.select('ul'):
        print(ul.select('li'))
    
    # 3: reading attributes
    soup = BeautifulSoup(html, 'lxml')
    for ul in soup.select('ul'):
        print(ul.get('id'))
        print(ul['id'])
    
    # 4: reading text content
    soup = BeautifulSoup(html, 'lxml')
    for li in soup.select('li'):
        print(li.get_text())
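
    As a small addition to the article, here is a quick sketch of the helper methods listed above; it assumes the same BeautifulSoup import and the same html string used in demo10.

    # Quick look at find() and the navigation helpers (reuses the html string from demo10)
    soup = BeautifulSoup(html, 'lxml')
    first_ul = soup.find('ul')                    # find() returns only the first match
    print(first_ul.find_parent('div'))            # the enclosing div (id="divid")
    first_li = first_ul.find('li')
    print(first_li.find_next_sibling('li'))       # the li right after the first one
    print(first_li.find_next_siblings('li'))      # every li after the first one
    print(first_li.find_previous('h2'))           # the nearest h2 before the first li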

    V. The pyquery library in detail

    1. Install: pip install pyquery

    2. Initialization:

    # demo01.py

    # Initialization
    from pyquery import PyQuery
    
    html = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>BeautifulSoup 学习</title>
    </head>
    <body>
    <h1>BeautifulSoup</h1>
    <div id="divid">
        <h2>这是一个列表</h2>
        <ul id="list-1">
            <li class="zhangsan">选项1</li>
            <li class="zhangsan">选项2</li>
            <li class="zhangsan">选项3</li>
        </ul>
        <ul id="list-2">
            <li class="lisi">选项1</li>
            <li class="lisi">选项2</li>
            <li class="lisi">选项3</li>
        </ul>
    </div>
    </body>
    </html>
    """
    
    # A: Initialize from a string ------------------------------------------------------------------------------------------
    doc = PyQuery(html)
    print(doc('li'))
    
    # B: Initialize from a URL ---------------------------------------------------------------------------------------------
    doc = PyQuery(url="http://www.baidu.com")
    print(doc('head'))
    
    # C: Initialize from a file (create index.html in the same directory with the same markup as above) ---------------------
    # On Windows this may raise: UnicodeDecodeError: 'gbk' codec can't decode byte 0x80 in position 187: illegal multibyte sequence
    # Removing the Chinese characters from the HTML file avoids it, but that workaround is not recommended (needs further study)
    # doc = PyQuery(filename='index.html')
    # print(doc('li'))
    
    # Reading the file with an explicit utf-8 encoding works instead
    with open("index.html", "r", encoding="utf-8") as f:
        doc = f.read()
    result = PyQuery(doc)
    print(result('li'))

    3. Basic CSS selectors:

    # demo02.py

    # Basic CSS selectors
    from pyquery import PyQuery
    
    html = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>BeautifulSoup 学习</title>
    </head>
    <body>
    <h1>BeautifulSoup</h1>
    <div id="divid">
        <h2>这是一个列表</h2>
        <ul id="list-1">
            <li class="zhangsan">选项1</li>
            <li class="zhangsan">选项2</li>
            <li class="zhangsan">选项3</li>
        </ul>
        <ul id="list-2">
            <li class="lisi">选项1</li>
            <li class="lisi">选项2</li>
            <li class="lisi">选项3</li>
        </ul>
    </div>
    </body>
    </html>
    """
    doc = PyQuery(html)
    print(doc('#divid #list-1 li'))

    4. Finding elements:

    A: Child elements

    # demo03.py

    # Child elements
    from pyquery import PyQuery
    
    html = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>BeautifulSoup 学习</title>
    </head>
    <body>
    <h1>BeautifulSoup</h1>
    <div id="divid">
        <h2>这是一个列表</h2>
        <ul id="list-1">
            <li class="zhangsan">选项1</li>
            <li class="zhangsan">选项2</li>
            <li class="zhangsan">选项3</li>
        </ul>
        <ul id="list-2">
            <li class="lisi">选项1</li>
            <li class="lisi">选项2</li>
            <li class="lisi">选项3</li>
        </ul>
    </div>
    </body>
    </html>
    """
    doc = PyQuery(html)
    items = doc('#list-1')
    print(type(items))
    print(items)
    li_list = items.find('li')
    print(type(li_list))
    print(li_list)

    B: Parent elements

    # demo04.py

    # Parent elements
    from pyquery import PyQuery
    
    html = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>BeautifulSoup 学习</title>
    </head>
    <body>
    <h1>BeautifulSoup</h1>
    <div id="divid">
        <h2>这是一个列表</h2>
        <ul id="list-1">
            <li class="zhangsan">选项1</li>
            <li class="zhangsan">选项2</li>
            <li class="zhangsan">选项3</li>
        </ul>
        <ul id="list-2">
            <li class="lisi">选项1</li>
            <li class="lisi">选项2</li>
            <li class="lisi">选项3</li>
        </ul>
    </div>
    </body>
    </html>
    """
    doc = PyQuery(html)
    items = doc('#list-1')
    container = items.parent()
    print(type(container))
    print(container)
    parents = items.parents()
    print(type(parents))
    print(parents)

    C: Sibling elements

    # demo05.py

    # Sibling elements
    from pyquery import PyQuery
    
    html = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>BeautifulSoup 学习</title>
    </head>
    <body>
    <h1>BeautifulSoup</h1>
    <div id="divid">
        <h2>这是一个列表</h2>
        <ul id="list-1">
            <li class="zhangsan">选项1</li>
            <li class="zhangsan">选项2</li>
            <li class="zhangsan">选项3</li>
        </ul>
        <ul id="list-2">
            <li class="lisi">选项1</li>
            <li class="lisi">选项2</li>
            <li class="lisi">选项3</li>
        </ul>
    </div>
    </body>
    </html>
    """
    doc = PyQuery(html)
    lis = doc('#list-1 .zhangsan')
    print(lis.siblings())
    print(lis.siblings('.zhangsan'))

    D: Iterating over results

    # demo06.py

    # Iterating over results
    from pyquery import PyQuery
    
    html = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>BeautifulSoup 学习</title>
    </head>
    <body>
    <h1>BeautifulSoup</h1>
    <div id="divid">
        <h2>这是一个列表</h2>
        <ul id="list-1">
            <li class="zhangsan">选项1</li>
            <li class="zhangsan">选项2</li>
            <li class="zhangsan">选项3</li>
        </ul>
        <ul id="list-2">
            <li class="lisi">选项1</li>
            <li class="lisi">选项2</li>
            <li class="lisi">选项3</li>
        </ul>
    </div>
    </body>
    </html>
    """
    doc = PyQuery(html)
    lis = doc('#list-2 .lisi')
    print(lis)
    li_list = doc('.lisi').items()
    print(type(li_list))
    for li in li_list:
        print(li)

    E: Getting information (tag attributes)

    # demo07.py

    # Getting information (tag attributes)
    from pyquery import PyQuery
    
    html = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>BeautifulSoup 学习</title>
    </head>
    <body>
    <h1>BeautifulSoup</h1>
    <div id="divid">
        <a href="http://www.baidu.com">百度一下 你就知道</a>
    </div>
    </body>
    </html>
    """
    doc = PyQuery(html)
    a = doc('#divid a')
    print(a)
    print(a.attr('href'))
    print(a.attr.href)

    F: Getting text

    # demo08.py

    # Getting text
    from pyquery import PyQuery
    
    html = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>BeautifulSoup 学习</title>
    </head>
    <body>
    <h1>BeautifulSoup</h1>
    <div id="divid">
        <a href="http://www.baidu.com">百度一下 你就知道</a>
    </div>
    </body>
    </html>
    """
    doc = PyQuery(html)
    a = doc('#divid a')
    print(a)
    print(a.text())

    G: Getting the inner HTML

    # demo09.py

    # Getting the inner HTML
    from pyquery import PyQuery
    
    html = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>BeautifulSoup 学习</title>
    </head>
    <body>
    <h1>BeautifulSoup</h1>
    <div id="divid">
        <a href="http://www.baidu.com">百度一下 你就知道</a>
    </div>
    </body>
    </html>
    """
    doc = PyQuery(html)
    div = doc('#divid')
    print(div)
    print(div.html())

    H: DOM manipulation

    # demo10.py

    # DOM manipulation
    from pyquery import PyQuery
    
    html = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>BeautifulSoup 学习</title>
    </head>
    <body>
    <h1>BeautifulSoup</h1>
    <div id="divid">
        <h2>这是一个列表</h2>
        <ul id="list-1">
            <li class="zhangsan">选项1</li>
            <li class="zhangsan">选项2</li>
            <li class="zhangsan">选项3</li>
        </ul>
        <ul id="list-2">
            <li class="lisi">选项1</li>
            <li class="lisi">选项1</li>
            <li class="lisi">选项1</li>
        </ul>
    </div>
    </body>
    </html>
    """
    
    # 1. addClass,removeClass ----------------------------------------------------------------------------------------------
    doc = PyQuery(html)
    li = doc('.lisi')
    print(li)
    li.remove_class('lisi')
    print(li)
    li.add_class('zhangsan')
    print(li)
    
    # 2. attr,css ----------------------------------------------------------------------------------------------------------
    doc = PyQuery(html)
    li = doc('.zhangsan')
    print(li)
    li.attr('name','link')
    print(li)
    li.css('font-size','40px')
    print(li)
    
    # 3. remove ------------------------------------------------------------------------------------------------------------
    doc = PyQuery(html)
    div = doc('#divid')
    print(div.text())
    div = doc.find('h2').remove()
    print(div.text())
    
    # 4. Pseudo-class selectors --------------------------------------------------------------------------------------------
    doc = PyQuery(html)
    li = doc('.zhangsan:first-child')       # The first item in the list
    print(li)
    li = doc('.zhangsan:last-child')        # The last item in the list
    print(li)
    li = doc('.zhangsan:nth-child(2)')      # The second item in the list
    print(li)
    li = doc('.zhangsan:gt(0)')             # All items with index greater than 0
    print(li)
    li = doc('.zhangsan:nth-child(1n)')     # Every item, from the first onwards
    print(li)
    li = doc('.zhangsan:contains(选项3)')    # Items whose text contains "选项3"
    print(li)

    VI. The selenium library in detail (an automated testing tool)

    In crawling, selenium is mainly used to deal with pages rendered by JavaScript.

    1. Install: pip install selenium

    2. Basic usage:

    # demo01.py

    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    
    """
    项目目标:实现百度搜索
    1. 创建浏览器对象 请求百度
    2. 元素定位输入框
    3. 输入搜索内容
    4. 点击回车
    """
    # 创建浏览器对象(我用的是谷歌浏览器)
    browser = webdriver.Chrome()
    try:
        # 请求百度
        browser.get("http://www.baidu.com")
        # 定位输入框
        input = browser.find_element_by_id('kw')
        # 输入搜索内容
        input.send_keys("selenium")
        # 点击回车
        input.send_keys(Keys.ENTER)
        # 打印当前的url地址
        print(browser.current_url)
        # 打印cookies
        print(browser.get_cookies())
        # 打印页面
        print(browser.page_source)
    except Exception as e:
        print(e,"=============================")
    finally:
        browser.close()
    
    """
    有可能会遇到的错误
    1. selenium.common.exceptions.WebDriverException: Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home
        这是由于程序找不到 chromedriver 驱动
    解决:
        下载 chromedriver (http://chromedriver.storage.googleapis.com/index.html)
        注意版本:版本对照表 (https://blog.csdn.net/BinGISer/article/details/88559532)
    
    2. selenium.common.exceptions.SessionNotCreatedException: Message: session not created: This version of ChromeDriver only supports Chrome version 78
        这是由于 ChromeDriver 和 Chrome 版本不对应
    解决:
        删除之前下载的 chromedriver
        重新下载 chromedriver (http://chromedriver.storage.googleapis.com/index.html)
        注意版本:版本对照表 (https://blog.csdn.net/BinGISer/article/details/88559532)
        
    大功告成
    """
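
    If you would rather not touch the PATH at all, a common alternative (a sketch, assuming Selenium 3; the driver path below is a placeholder you must adapt) is to hand the driver location to Chrome directly:

    from selenium import webdriver
    
    # Point Selenium at the chromedriver executable explicitly instead of relying on the PATH
    browser = webdriver.Chrome(executable_path='C:/tools/chromedriver.exe')
    browser.get('http://www.baidu.com')
    print(browser.title)
    browser.close()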

    3. Creating a browser object

    # demo02.py

    # selenium: creating browser objects
    from selenium import webdriver
    browser = webdriver.Chrome()    # Chrome
    browser = webdriver.Firefox()   # Firefox
    browser = webdriver.Edge()      # Edge
    browser = webdriver.PhantomJS() # Headless PhantomJS browser
    browser = webdriver.Safari()    # Safari

    4. Visiting a page

     # demo03.py

    import time
    from selenium import webdriver
    
    # Create the browser object
    browser = webdriver.Chrome()
    # Visit Taobao
    browser.get('https://www.taobao.com')
    # Maximize the browser window
    browser.maximize_window()
    # Wait 5 seconds
    time.sleep(5)
    # Print the page source
    print(browser.page_source)
    # Close the browser
    browser.close()

    5. Finding elements (a single element)

    # demo04.py

    # Finding elements (a single element)
    from selenium import webdriver
    
    # Create the browser object
    browser = webdriver.Chrome()
    # Visit Taobao
    browser.get('https://www.taobao.com')
    # Maximize the browser window
    browser.maximize_window()
    # Locate the Taobao search box (all three ways work)
    input_id = browser.find_element_by_id('q')
    input_selector = browser.find_element_by_css_selector('#q')
    input_xpath = browser.find_element_by_xpath('//*[@id="q"]')
    print(input_id)
    print(input_selector)
    print(input_xpath)
    # Close the browser
    browser.close()
    
    """
    Common methods for finding a single element:
        browser.find_element_by_xpath()
        browser.find_element_by_name()
        browser.find_element_by_link_text()
        browser.find_element_by_partial_link_text()
        browser.find_element_by_tag_name()
        browser.find_element_by_class_name()
        browser.find_element_by_css_selector()
    """

    6. Finding elements (multiple elements)

    # demo05.py

    # Finding elements (multiple elements)
    from selenium import webdriver
    
    # Create the browser object
    browser = webdriver.Chrome()
    # Visit Taobao
    browser.get('https://www.taobao.com')
    # Maximize the browser window
    browser.maximize_window()
    # Find all elements whose class includes "J_Cat" (the category entries, class="J_Cat a-all")
    li_list = browser.find_elements_by_css_selector('.J_Cat')
    print(li_list)
    # Close the browser
    browser.close()
    
    """
    Common methods for finding multiple elements:
        browser.find_elements_by_xpath()
        browser.find_elements_by_name()
        browser.find_elements_by_link_text()
        browser.find_elements_by_partial_link_text()
        browser.find_elements_by_tag_name()
        browser.find_elements_by_class_name()
        browser.find_elements_by_css_selector()
    """

    7. Interacting with elements

    # demo06.py

    import time
    from selenium import webdriver
    
    # Create the browser object
    browser = webdriver.Chrome()
    # Visit Taobao
    browser.get("https://www.taobao.com")
    # Maximize the window
    browser.maximize_window()
    # Locate the search box
    input = browser.find_element_by_id('q')
    # Type the search term "内存条"
    input.send_keys("内存条")
    time.sleep(3)
    # Clear the search box
    input.clear()
    time.sleep(5)
    # Type a new search term, "1T硬盘"
    input.send_keys("1T硬盘")
    # Locate the search button
    button = browser.find_element_by_class_name('btn-search')
    # Click the search button
    button.click()
    time.sleep(10)
    # Close the browser
    browser.close()

    8. Executing JavaScript

    # demo07.py

    # Executing JavaScript
    from selenium import webdriver
    browser = webdriver.Chrome()
    browser.get("https://www.taobao.com")
    # Scroll to the bottom of the page
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    # Show an alert box
    browser.execute_script('alert("To Bottom")')

    9. Getting element information (attributes)

    # demo08.py

    # Getting element information (attributes)
    
    from selenium import webdriver
    browser = webdriver.Chrome()
    url = "https://www.zhihu.com/"
    browser.get(url)
    logo = browser.find_element_by_css_selector('.SignFlowHomepage-logo')
    print(logo)
    print(logo.get_attribute('src'))
    browser.close()

    10. Getting element information (text)

    # demo09.py

    # Getting element information (text)
    
    from selenium import webdriver
    browser = webdriver.Chrome()
    url = "https://www.zhihu.com/explore"
    browser.get(url)
    input = browser.find_element_by_id('Popover1-toggle')
    input.send_keys('新冠病毒')
    # Note: .text is often empty for <input> elements; the typed value usually lives in the "value" attribute
    print(input.text)

    11. Getting element information (id, location, tag name, size)

    # demo10.py

    # Getting element information (id, location, tag name, size)
    
    from selenium import webdriver
    browser = webdriver.Chrome()
    url = "https://www.zhihu.com/explore"
    browser.get(url)
    input = browser.find_element_by_id('Popover1-toggle')
    print(input.id)
    print(input.location)
    print(input.tag_name)
    print(input.size)
    browser.close()

    12. Working with iframes

    # demo11.py

    # Switching into and out of an iframe
    
    from selenium import webdriver
    from selenium.common.exceptions import NoSuchElementException
    browser = webdriver.Chrome()
    url = "https://www.runoob.com/try/try.php?filename=tryjquery_hide"
    browser.get(url)
    browser.switch_to.frame('iframeResult')
    button = browser.find_element_by_css_selector('button')
    print(button)
    try:
        logo = browser.find_element_by_class_name('logo')
    except NoSuchElementException:
        print('NO LOGO')
    finally:
        browser.switch_to.parent_frame()
        logo = browser.find_element_by_class_name('logo')
        print(logo)
        print(logo.text)
        browser.close()

    13. Waits

    # demo12.py

    # Waits
    
    """
    An explicit wait waits for a condition; an implicit wait waits unconditionally.
    
    Implicit wait
        With an implicit wait, if WebDriver does not find an element in the DOM it keeps waiting,
        and only raises a "no such element" exception once the configured time runs out.
        In other words, when an element is not immediately available, the DOM is polled for a
        while before giving up; the default wait time is 0.
    
    Explicit wait
        You specify a condition and a maximum wait time. If the condition is not met within that
        time, an exception is raised. The code after the wait only runs once the condition holds,
        which makes this the more flexible option.
        The conditions live in the expected_conditions module under selenium.webdriver.support.
    """
    
    from selenium import webdriver
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By
    browser = webdriver.Chrome()
    browser.get('http://www.taobao.com')
    browser.maximize_window()
    # Implicit wait
    browser.implicitly_wait(10)
    # Explicit wait: block until the search box and the search button are present
    wait = WebDriverWait(browser, 10)
    input = wait.until(EC.presence_of_element_located((By.ID, 'q')))
    button = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.btn-search')))
    print(input)
    print(button)
    browser.close()

    14. Browser back and forward

    # demo13.py

    # Browser back and forward
    import time
    from selenium import webdriver
    
    browser = webdriver.Chrome()
    browser.get('https://www.baidu.com')
    time.sleep(1)
    browser.get('https://www.taobao.com')
    time.sleep(1)
    browser.get('https://www.cnblogs.com/xingxingnbsp/')
    time.sleep(1)
    browser.back()
    time.sleep(2)
    browser.forward()
    time.sleep(2)
    browser.close()

    15. Cookies

    # demo14.py

    # cookies
    from selenium import webdriver
    browser = webdriver.Chrome()
    browser.get('https://www.zhihu.com/explore')
    print(browser.get_cookies())
    browser.add_cookie({"name":"name","domain":"www.zhihu.com","value":"germey"})
    print(browser.get_cookies())
    browser.delete_all_cookies()
    print(browser.get_cookies())
    browser.close()

    16. Tab management

    # demo15.py

    # Tab management
    import time
    from selenium import webdriver
    
    browser = webdriver.Chrome()
    browser.get('https://www.baidu.com')
    time.sleep(2)
    # Open a new tab
    browser.execute_script('window.open()')
    print(browser.window_handles)
    # Switch to the new tab (switch_to_window is deprecated in newer Selenium versions)
    browser.switch_to.window(browser.window_handles[1])
    browser.get('https://www.taobao.com')
    time.sleep(2)
    browser.get('https://www.cnblogs.com/xingxingnbsp/')
    time.sleep(3)
    browser.close()

    17. Exception handling

     # demo16.py

    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException,NoSuchElementException
    
    browser = webdriver.Chrome()
    try:
        browser.get('https://www.baidu.com')
    except TimeoutException:
        print('Time Out')
    try:
        browser.find_element_by_id('hello')
    except NoSuchElementException:
        print('No Element')
    finally:
        browser.close()

    Source article: https://www.cnblogs.com/xingxingnbsp/p/12129466.html
