zoukankan      html  css  js  c++  java
  • elixir 高可用系列(五) Supervisor

    概述

    OTP 平台的容错性高,是因为它提供了机制来监控所有 processes 的状态,如果有进程出现异常, 不仅可以及时检测到错误,还可以对 processes 进行重启等操作。

    有了 supervisor,可以有效的提高系统的可用性,一个 supervisor 监督一个或多个应用, 同时, supervisor 也可以监督 supervisor,从而形成一个监督树,提高整个系统的可用性。

    注意,supervisor 最好只用于监督,不要有其他的业务逻辑处理,越是接近监督树根部的 supervisor 就要越简单, 因为 supervisor 简单就不容易出错,它是保证系统高可用的关键。

    监督者示例

    下面,使用 elixir 中提供的 Supervisor 模块,构造简单的监督示例来演示如何提高系统的可用性。

    监督策略

    监督策略有4种:

    1. :one_for_one 只重启出错的 process
    2. :one_for_all 当有 process 出错时,重启所有的 process
    3. :rest_for_one 重启出错的 process ,以及所有在它之后启动的 process(也就是重启对出错 process 有依赖的 所有 process)
    4. :simple_one_for_one 类似 :one_for_one ,但是 supervisor 只能包含一种子进程规格(child spec),所有子进程都由该规格通过 start_child 动态创建

    监督策略的转换非常简单,下面演示2种监督策略的示例:

    one for one

    defmodule PseudoServerA do
      @moduledoc """
      Minimal GenServer used to demonstrate supervision.

      It answers `:display` calls with its own PID and stops itself when it
      receives an `:err` cast.
      """
      use GenServer

      @doc """
      Starts the server with `state` as its initial state.

      `opts` is forwarded to `GenServer.start_link/3`, e.g. `[name: :server_a]`.
      """
      def start_link(state, opts \\ []) do
        GenServer.start_link(__MODULE__, state, opts)
      end

      # Explicit init/1 callback: the given state is kept unchanged.
      def init(state) do
        {:ok, state}
      end

      # Replies with a charlist containing this process's PID.
      # Only matches while the state is the empty list.
      def handle_call(:display, _from, []) do
        {:reply, ~c"ServerA PID: " ++ :erlang.pid_to_list(self()), []}
      end

      # Stops the server with the reason "stop ServerA" to simulate a failure.
      def handle_cast(:err, []) do
        {:stop, "stop ServerA", []}
      end
    end
    
    defmodule PseudoServerB do
      @moduledoc """
      Minimal GenServer used to demonstrate supervision.

      It answers `:display` calls with its own PID and stops itself when it
      receives an `:err` cast.
      """
      use GenServer

      @doc """
      Starts the server with `state` as its initial state.

      `opts` is forwarded to `GenServer.start_link/3`, e.g. `[name: :server_b]`.
      """
      def start_link(state, opts \\ []) do
        GenServer.start_link(__MODULE__, state, opts)
      end

      # Explicit init/1 callback: the given state is kept unchanged.
      def init(state) do
        {:ok, state}
      end

      # Replies with a charlist containing this process's PID.
      # Only matches while the state is the empty list.
      def handle_call(:display, _from, []) do
        {:reply, ~c"ServerB PID: " ++ :erlang.pid_to_list(self()), []}
      end

      # Stops the server with the reason "stop ServerB" to simulate a failure.
      def handle_cast(:err, []) do
        {:stop, "stop ServerB", []}
      end
    end
    
    defmodule PseudoServerC do
      @moduledoc """
      Minimal GenServer used to demonstrate supervision.

      It answers `:display` calls with its own PID and stops itself when it
      receives an `:err` cast.
      """
      use GenServer

      @doc """
      Starts the server with `state` as its initial state.

      `opts` is forwarded to `GenServer.start_link/3`, e.g. `[name: :server_c]`.
      """
      def start_link(state, opts \\ []) do
        GenServer.start_link(__MODULE__, state, opts)
      end

      # Explicit init/1 callback: the given state is kept unchanged.
      def init(state) do
        {:ok, state}
      end

      # Replies with a charlist containing this process's PID.
      # Only matches while the state is the empty list.
      def handle_call(:display, _from, []) do
        {:reply, ~c"ServerC PID: " ++ :erlang.pid_to_list(self()), []}
      end

      # Stops the server with the reason "stop ServerC" to simulate a failure.
      def handle_cast(:err, []) do
        {:stop, "stop ServerC", []}
      end
    end
    
    defmodule SupervisorTest do
      @moduledoc """
      Demo module that starts the three pseudo servers under one supervisor.
      """
      import Supervisor.Spec

      @doc """
      Starts a supervisor with PseudoServerA, PseudoServerB and PseudoServerC
      as workers, registered as `:server_a`, `:server_b` and `:server_c`.

      Returns the result of `Supervisor.start_link/2`.
      """
      def init do
        # Every worker is started with an empty-list state and a registered name.
        children =
          for {module, name} <- [
                {PseudoServerA, :server_a},
                {PseudoServerB, :server_b},
                {PseudoServerC, :server_c}
              ] do
            worker(module, [[], [name: name]])
          end

        # Start the supervisor with children
        Supervisor.start_link(children, strategy: :one_for_one)
      end
    end
    

    测试方式:

    $ iex -S mix
    
    # 启动 supervisor 及其监督的3个 process 
    iex(1)> SupervisorTest.init
    {:ok, #PID<0.145.0>}
    
    # 启动后, 3个 process 的 PID 如下
    iex(2)> GenServer.call(:server_a, :display)
    'ServerA PID: <0.146.0>'
    iex(3)> GenServer.call(:server_b, :display)
    'ServerB PID: <0.147.0>'
    iex(4)> GenServer.call(:server_c, :display)
    'ServerC PID: <0.148.0>'
    
    # 通过消息 :err 让 serverA 出错
    iex(5)> GenServer.cast(:server_a, :err)
    :ok
    iex(6)>
    14:47:53.119 [error] GenServer :server_a terminating
    ** (stop) "stop ServerA"
    Last message: {:"$gen_cast", :err}
    State: []
    
    nil
    
    # serverA 出错后,再次查看3个process的PID,发现 supervisor 只重启了 serverA,符合策略 :one_for_one
    iex(7)> GenServer.call(:server_a, :display)
    'ServerA PID: <0.155.0>'
    iex(8)> GenServer.call(:server_b, :display)
    'ServerB PID: <0.147.0>'
    iex(9)> GenServer.call(:server_c, :display)
    'ServerC PID: <0.148.0>'
    

    one_for_all

    我们换一种监督策略试试看,只需要将上面的代码

    # Start the supervisor with children
    Supervisor.start_link(children, strategy: :one_for_one)
    

    改成

    # Start the supervisor with children
    Supervisor.start_link(children, strategy: :one_for_all)
    

    测试步骤 和 one_for_one 一样:

    $ iex -S mix
    
    # 启动 supervisor 及其监督的3个 process 
    iex(1)> SupervisorTest.init
    {:ok, #PID<0.145.0>}
    
    # 启动后, 3个 process 的 PID 如下
    iex(2)> GenServer.call(:server_a, :display)
    'ServerA PID: <0.146.0>'
    iex(3)> GenServer.call(:server_b, :display)
    'ServerB PID: <0.147.0>'
    iex(4)> GenServer.call(:server_c, :display)
    'ServerC PID: <0.148.0>'
    
    # 通过消息 :err 让 serverA 出错
    iex(5)> GenServer.cast(:server_a, :err)
    :ok
    iex(6)>
    14:55:16.183 [error] GenServer :server_a terminating
     ** (stop) "stop ServerA"
     Last message: {:"$gen_cast", :err}
     State: []
    
     nil
    
    # serverA 出错后,再次查看3个process的PID,发现 supervisor 重启了所有 process,符合策略 :one_for_all
    iex(7)> GenServer.call(:server_a, :display)
    'ServerA PID: <0.153.0>'
    iex(8)> GenServer.call(:server_b, :display)
    'ServerB PID: <0.154.0>'
    iex(9)> GenServer.call(:server_c, :display)
    'ServerC PID: <0.156.0>'
    

    监督树

    监督者并不是一维的,监督者也可以监督其它监督者,从而形成树状的监督关系。

    修改上面的测试代码如下:(只修改了 Supervisor 的部分)

    defmodule PseudoServerA do
      @moduledoc """
      Minimal GenServer used to demonstrate supervision.

      It answers `:display` calls with its own PID and stops itself when it
      receives an `:err` cast.
      """
      use GenServer

      @doc """
      Starts the server with `state` as its initial state.

      `opts` is forwarded to `GenServer.start_link/3`, e.g. `[name: :server_a]`.
      """
      def start_link(state, opts \\ []) do
        GenServer.start_link(__MODULE__, state, opts)
      end

      # Explicit init/1 callback: the given state is kept unchanged.
      def init(state) do
        {:ok, state}
      end

      # Replies with a charlist containing this process's PID.
      # Only matches while the state is the empty list.
      def handle_call(:display, _from, []) do
        {:reply, ~c"ServerA PID: " ++ :erlang.pid_to_list(self()), []}
      end

      # Stops the server with the reason "stop ServerA" to simulate a failure.
      def handle_cast(:err, []) do
        {:stop, "stop ServerA", []}
      end
    end
    
    defmodule PseudoServerB do
      @moduledoc """
      Minimal GenServer used to demonstrate supervision.

      It answers `:display` calls with its own PID and stops itself when it
      receives an `:err` cast.
      """
      use GenServer

      @doc """
      Starts the server with `state` as its initial state.

      `opts` is forwarded to `GenServer.start_link/3`, e.g. `[name: :server_b]`.
      """
      def start_link(state, opts \\ []) do
        GenServer.start_link(__MODULE__, state, opts)
      end

      # Explicit init/1 callback: the given state is kept unchanged.
      def init(state) do
        {:ok, state}
      end

      # Replies with a charlist containing this process's PID.
      # Only matches while the state is the empty list.
      def handle_call(:display, _from, []) do
        {:reply, ~c"ServerB PID: " ++ :erlang.pid_to_list(self()), []}
      end

      # Stops the server with the reason "stop ServerB" to simulate a failure.
      def handle_cast(:err, []) do
        {:stop, "stop ServerB", []}
      end
    end
    
    defmodule PseudoServerC do
      @moduledoc """
      Minimal GenServer used to demonstrate supervision.

      It answers `:display` calls with its own PID and stops itself when it
      receives an `:err` cast.
      """
      use GenServer

      @doc """
      Starts the server with `state` as its initial state.

      `opts` is forwarded to `GenServer.start_link/3`, e.g. `[name: :server_c]`.
      """
      def start_link(state, opts \\ []) do
        GenServer.start_link(__MODULE__, state, opts)
      end

      # Explicit init/1 callback: the given state is kept unchanged.
      def init(state) do
        {:ok, state}
      end

      # Replies with a charlist containing this process's PID.
      # Only matches while the state is the empty list.
      def handle_call(:display, _from, []) do
        {:reply, ~c"ServerC PID: " ++ :erlang.pid_to_list(self()), []}
      end

      # Stops the server with the reason "stop ServerC" to simulate a failure.
      def handle_cast(:err, []) do
        {:stop, "stop ServerC", []}
      end
    end
    
    defmodule SupervisorBranch do
      @moduledoc """
      Branch supervisor that supervises PseudoServerA and PseudoServerB with
      the `:one_for_one` strategy.
      """
      import Supervisor.Spec

      @doc """
      Starts the branch supervisor.

      `opts` is a keyword list of extra supervisor options such as
      `[name: :supervisor_branch]`. It is forwarded to `Supervisor.start_link/2`
      instead of being discarded, so a `:name` given by the parent supervisor
      actually registers this process. The strategy is always `:one_for_one`.
      """
      def start_link(opts \\ []) do
        children = [
          worker(PseudoServerA, [[], [name: :server_a]]),
          worker(PseudoServerB, [[], [name: :server_b]])
        ]

        # Keyword.put/3 replaces any :strategy in opts, so the strategy stays fixed.
        Supervisor.start_link(children, Keyword.put(opts, :strategy, :one_for_one))
      end
    end
    
    defmodule SupervisorRoot do
      @moduledoc """
      Root of the demo supervision tree: supervises the SupervisorBranch
      supervisor and the PseudoServerC worker.
      """
      import Supervisor.Spec

      @doc """
      Starts the root supervisor with the `:one_for_all` strategy.

      Returns the result of `Supervisor.start_link/2`.
      """
      def init do
        branch = supervisor(SupervisorBranch, [[name: :supervisor_branch]])
        server_c = worker(PseudoServerC, [[], [name: :server_c]])
        children = [branch, server_c]

        # Start the supervisor with children
        Supervisor.start_link(children, strategy: :one_for_all)
      end
    end
    

    测试流程如下:

    # 启动 根 监督者 
    iex(1)> SupervisorRoot.init
    {:ok, #PID<0.149.0>}
    
    # 启动后,查看 3 个process 的PID
    iex(2)> GenServer.call(:server_a, :display)
    'ServerA PID: <0.151.0>'
    iex(3)> GenServer.call(:server_b, :display)
    'ServerB PID: <0.152.0>'
    iex(4)> GenServer.call(:server_c, :display)
    'ServerC PID: <0.153.0>'
    
    # 通过消息 :err 让 serverA 出错
    iex(5)> GenServer.cast(:server_a, :err)
    :ok
    iex(6)>
    15:31:15.846 [error] GenServer :server_a terminating
     ** (stop) "stop ServerA"
     Last message: {:"$gen_cast", :err}
     State: []
    
     nil
    
     # serverA 出错后,因为它的监督者 SupervisorBranch 的策略是 :one_for_one,所以只重启了 serverA
     iex(7)> GenServer.call(:server_a, :display)
     'ServerA PID: <0.158.0>'
     iex(8)> GenServer.call(:server_b, :display)
     'ServerB PID: <0.152.0>'
     iex(9)> GenServer.call(:server_c, :display)
     'ServerC PID: <0.153.0>'
    
     # 通过消息 :err 让 serverC 出错
     iex(10)> GenServer.cast(:server_c, :err)
     :ok
    
     15:31:35.264 [error] GenServer :server_c terminating
     ** (stop) "stop ServerC"
     Last message: {:"$gen_cast", :err}
     State: []
    
     # serverC 出错后,因为它的监督者 SupervisorRoot 的策略是 :one_for_all,所以所有的 process 都重启了
     iex(11)> GenServer.call(:server_a, :display)
     'ServerA PID: <0.166.0>'
     iex(12)> GenServer.call(:server_c, :display)
     'ServerC PID: <0.168.0>'
     iex(13)> GenServer.call(:server_b, :display)
     'ServerB PID: <0.167.0>'
    

    通过监督树,我们可以给不同的 process 分组,然后让每个组有不同的监督策略。

    总结

    有了监督机制,可以及时的把握所有 process 的状态,通过监督树,还可以加入不同恢复机制。 因此,用好 Supervisor 模块,可以极大提高系统的可用性。

    Supervisor 模块详细内容可以参见:http://elixir-lang.org/docs/stable/elixir/Supervisor.html

    来源:http://blog.iotalabs.io/

  • 相关阅读:
    POJ 1062 昂贵的聘礼(最短路)题解
    BZOj 墨墨的等式(转化为最短路)题解
    BZOJ 2763 飞行路线(分层图最短路)题解
    HDU 6342 Expression in Memories(模拟)多校题解
    codeforces 543B Destroying Roads
    codeforces 639B Bear and Forgotten Tree 3
    codeforces 645D Robot Rapping Results Report
    codeforces 702E Analysis of Pathes in Functional Graph
    codeforces 721C journey
    codeforces 711D Directed Roads
  • 原文地址:https://www.cnblogs.com/wang_yb/p/5564459.html
Copyright © 2011-2022 走看看