zoukankan      html  css  js  c++  java
  • 基于Pacemaker的PostgreSQL高可用集群

    集群特性

    秒级自动failover
    failover零数据丢失(防脑裂)
    支持在线主从切换
    支持读写分离
    支持读负载均衡
    支持动态增加和删除只读节点

    一、操作系统配置(pg01 && pg02 && pg03)

    1.禁用防火墙、selinux
    2.同步系统时间
    3.修改hosts文件

    # more /etc/hosts
    127.0.0.1   localhost 
    192.168.43.83    pg01
    192.168.43.123   pg02
    192.168.43.246   pg03
    ### vip ####
    192.168.43.10    vip-master
    192.168.43.20    vip-slave
    

    4.配置系统参数

    # vim /etc/security/limits.conf
    *  soft   stack    10240
    *  hard   stack    10240
    *  soft    nofile  131072  
    *  hard    nofile  131072  
    *  soft    nproc   131072  
    *  hard    nproc   131072  
    *  soft    core    unlimited  
    *  hard    core    unlimited 
    # vim /etc/sysctl.conf
    kernel.sem = 50100 64128000 50100 1280
    # sysctl -p
    

    二、安装高可用集群包(pg01 && pg02 && pg03)

    1.安装集群包
    # yum install -y pacemaker corosync pcs ipvsadm
    2.启用pcsd服务
    # systemctl start pcsd
    # systemctl enable pcsd
    # systemctl enable corosync
    # systemctl enable pacemaker
    
    [root@pg02 ~]# systemctl is-enabled corosync
    disabled
    [root@pg02 ~]# systemctl is-enabled pcsd
    enabled
    [root@pg02 ~]# systemctl is-enabled pacemaker
    disabled
    [root@pg02 ~]# 
    # 
    
    3.设置hacluster用户密码
    # echo hacluster|passwd hacluster --stdin
    4.集群认证
    在任何一个节点上执行:
    # pcs cluster auth -u hacluster -p hacluster pg01 pg02 pg03
    pg03: Authorized
    pg02: Authorized
    pg01: Authorized
    #
    5.同步配置
    在任何一个节点上执行:
    # pcs cluster setup --last_man_standing=1 --name pgcluster pg01 pg02 pg03
    Destroying cluster on nodes: pg01, pg02, pg03...
    pg01: Stopping Cluster (pacemaker)...
    pg03: Stopping Cluster (pacemaker)...
    pg02: Stopping Cluster (pacemaker)...
    pg02: Successfully destroyed cluster
    pg01: Successfully destroyed cluster
    pg03: Successfully destroyed cluster
    Sending 'pacemaker_remote authkey' to 'pg01', 'pg02', 'pg03'
    pg02: successful distribution of the file 'pacemaker_remote authkey'
    pg01: successful distribution of the file 'pacemaker_remote authkey'
    pg03: successful distribution of the file 'pacemaker_remote authkey'
    Sending cluster config files to the nodes...
    pg01: Succeeded
    pg02: Succeeded
    pg03: Succeeded
    Synchronizing pcsd certificates on nodes pg01, pg02, pg03...
    pg03: Success
    pg02: Success
    pg01: Success
    Restarting pcsd on the nodes in order to reload the certificates...
    pg03: Success
    pg02: Success
    pg01: Success
    # 
    6.启动集群
    在任何一个节点上执行:
    # pcs cluster start --all
    pg01: Starting Cluster (corosync)...
    pg02: Starting Cluster (corosync)...
    pg03: Starting Cluster (corosync)...
    pg01: Starting Cluster (pacemaker)...
    pg03: Starting Cluster (pacemaker)...
    pg02: Starting Cluster (pacemaker)...
    #
    

    三、install postgresql(pg01 && pg02 && pg03)

    1.安装PG
    数据库版本:pg 12以下版本(本文使用 11.10)
    # yum install gcc-c++ readline-devel zlib-devel
    # tar zxvf postgresql-11.10.tar.gz
    # cd postgresql-11.10
    # ./configure --prefix=/opt/pgsql/11.10
    # make -j 2
    # make install
    2.配置用户、数据目录(pg01 && pg02 && pg03)
    # useradd postgres
    # passwd postgres
    # vim /home/postgres/.bash_profile
    添加:
    export PGDATA=/pgdata
    export PATH=/opt/pgsql/11.10/bin:$PATH
    # mkdir /pgdata
    # chown postgres.postgres /pgdata/
    # chmod 700 /pgdata/
    3.主节点(pg01)
    [root@pg01 ~]# su - postgres
    [postgres@pg01 ~]$ initdb -D /pgdata/
    [postgres@pg01 ~]$ cd /pgdata/
    4.配置数据库参数
    postgresql.conf
    listen_addresses = '*'
    wal_level = logical
    wal_log_hints = on
    max_wal_size = 10GB
    min_wal_size = 80MB
    checkpoint_completion_target = 0.9 
    archive_mode = on
    archive_command = '/bin/true'
    wal_keep_segments = 1000
    synchronous_standby_names = ''
    hot_standby_feedback = on 
    logging_collector = on
    log_filename = 'postgresql-%a.log'
    log_truncate_on_rotation = on
    log_rotation_size = 0
    log_min_duration_statement = 0
    log_checkpoints = on
    log_connections = on
    log_disconnections = on
    log_line_prefix = '%t [%p]: db=%d,user=%u,app=%a,client=%h '
    log_lock_waits = on
    log_temp_files = 0
    log_autovacuum_min_duration = 0
    lc_messages = 'en_US.UTF-8'
    pg_hba.conf
    # TYPE  DATABASE        USER            ADDRESS                 METHOD
    # "local" is for Unix domain socket connections only
    local   all             all                                     trust
    # IPv4 local connections:
    host    all             all             127.0.0.1/32            md5
    host    all             all             0.0.0.0/0               md5
    # IPv6 local connections:
    #host    all             all             ::1/128                 trust
    # Allow replication connections from localhost, by a user with the
    # replication privilege.
    local   replication     all                                     trust
    host    replication     all             127.0.0.1/32            md5
    host    replication     repluser        192.168.43.0/24         md5
    5.启动master数据库
    [root@pg01 ~]# su - postgres
    Last login: Mon Dec 14 11:26:05 CST 2020 on pts/0
    [postgres@pg01 ~]$ 
    [postgres@pg01 ~]$ pg_ctl start -D /pgdata/
    6.创建复制用户
    postgres=# create user repluser with replication password 'repluser';
    CREATE ROLE
    postgres=# \du
                                       List of roles
     Role name |                         Attributes                         | Member of 
    -----------+------------------------------------------------------------+-----------
     postgres  | Superuser, Create role, Create DB, Replication, Bypass RLS | {}
     repluser  | Replication                                                | {}
    postgres=# 
    7.创建slave(pg02 && pg03)
    # su - postgres
    $ pg_basebackup -h pg01 -U repluser -p 5432 -D /pgdata --wal-method=stream --checkpoint=fast --progress --verbose
    8.停止master(pg01)
    [postgres@pg01 pgdata]$ pg_ctl stop -D /pgdata/
    waiting for server to shut down.... done
    server stopped
    [postgres@pg01 pgdata]$ pg_ctl status -D /pgdata/
    pg_ctl: no server running
    [postgres@pg01 pgdata]$ 
    
    

    注意事项:
    数据库无需配置自启动,由集群软件自动拉起

    四、配置集群

    # cat cluster_setup.sh
    pcs cluster cib pgsql_cfg
    pcs -f pgsql_cfg property set no-quorum-policy="ignore"           
    pcs -f pgsql_cfg property set stonith-enabled="false"                      
    pcs -f pgsql_cfg resource defaults resource-stickiness="INFINITY"      
    pcs -f pgsql_cfg resource defaults migration-threshold="3"                         
    #### vip-master ###               
    pcs -f pgsql_cfg resource create vip-master IPaddr2 ip="192.168.43.10" cidr_netmask="24" \
    op start  timeout="60s" interval="0s"  on-fail="restart" \
    op monitor timeout="60s" interval="10s" on-fail="restart" \
    op stop    timeout="60s" interval="0s"  on-fail="block"
    #### vip-slave ###                                         
    pcs -f pgsql_cfg resource create vip-slave IPaddr2 ip="192.168.43.20" cidr_netmask="24" \
    op start   timeout="60s" interval="0s"  on-fail="restart" \
    op monitor timeout="60s" interval="10s" on-fail="restart" \
    op stop    timeout="60s" interval="0s"  on-fail="block"
    #### pgsql resource ####                            
    pcs -f pgsql_cfg resource create pgsql pgsql \
    pgctl="/opt/pgsql/11.10/bin/pg_ctl" \
    psql="/opt/pgsql/11.10/bin/psql" \
    pgdata="/pgdata/" \
    config="/pgdata/postgresql.conf" \
    rep_mode="sync" node_list="pg01 pg02 pg03" master_ip="192.168.43.10" \
    repuser="repluser" \
    primary_conninfo_opt="password=repluser \
    keepalives_idle=60 keepalives_interval=5 keepalives_count=5" \
    restart_on_promote='true' \
    op start   timeout="60s" interval="0s"  on-fail="restart" \
    op monitor timeout="60s" interval="4s" on-fail="restart" \
    op monitor timeout="60s" interval="3s" on-fail="restart" role="Master" \
    op promote timeout="60s" interval="0s"  on-fail="restart" \
    op demote  timeout="60s" interval="0s"  on-fail="stop" \
    op stop    timeout="60s" interval="0s"  on-fail="block"
    #### setting master #####
    pcs -f pgsql_cfg resource master pgsql-cluster pgsql master-max=1 master-node-max=1 clone-max=3 clone-node-max=1 notify=true
    #### master group #####
    pcs -f pgsql_cfg resource group add master-group vip-master 
    #### slave group #####      
    pcs -f pgsql_cfg resource group add slave-group vip-slave 
    #### master group setting #####             
    pcs -f pgsql_cfg constraint colocation add master-group with master pgsql-cluster INFINITY  
    pcs -f pgsql_cfg constraint order promote pgsql-cluster then start master-group symmetrical=false score=INFINITY                                                                                                 
    pcs -f pgsql_cfg constraint order demote  pgsql-cluster then stop  master-group symmetrical=false score=0 
    #### slave-group  setting  #####                                                                                              
    pcs -f pgsql_cfg constraint colocation add slave-group with slave pgsql-cluster INFINITY        
    pcs -f pgsql_cfg constraint order promote pgsql-cluster then start slave-group symmetrical=false score=INFINITY                                                                                                       
    pcs -f pgsql_cfg constraint order demote  pgsql-cluster then stop  slave-group symmetrical=false score=0
    #### push config ####
    pcs cluster cib-push pgsql_cfg
    
    
    修改集群配置:
    
    # cibadmin --query > tmp.xml
    # vi tmp.xml
    # cibadmin --replace --xml-file tmp.xml
    
    

    五、集群状态检查

    [root@pg01 ~]# pcs status corosync
    
    Membership information
    ----------------------
        Nodeid      Votes Name
             1          1 pg01 (local)
             2          1 pg02
             3          1 pg03
    [root@pg01 ~]#
    
    # pcs property list
    Cluster Properties:
     cluster-infrastructure: corosync
     cluster-name: pgcluster
     dc-version: 1.1.23-1.el7-9acf116022
     have-watchdog: false
     no-quorum-policy: ignore
     stonith-enabled: false
    Node Attributes:
     pg01: pgsql-data-status=LATEST
     pg02: pgsql-data-status=STREAMING|SYNC
     pg03: pgsql-data-status=STREAMING|ASYNC
    [root@pg01 ~]# 
    
    # pcs resource show
     Master/Slave Set: pgsql-cluster [pgsql]
         Masters: [ pg02 ]
         Slaves: [ pg01 pg03 ]
     Resource Group: master-group
         vip-master	(ocf::heartbeat:IPaddr2):	Started pg02
     Resource Group: slave-group
         vip-slave	(ocf::heartbeat:IPaddr2):	Started pg03
    [root@pg02 ~]# 
    
    pcs resource cleanup [xx]
    //xx表示虚拟资源名称(省略时清理全部资源)。当集群有资源处于unmanaged的状态时,
    可以用这个命令清理掉失败的信息,然后重置资源状态
    
    [root@pg02 ~]# pcs resource cleanup
    Cleaned up all resources on all nodes
    [root@pg02 ~]# 
    
    
    # pcs status
    Cluster name: pgcluster
    Stack: corosync
    Current DC: pg02 (version 1.1.23-1.el7-9acf116022) - partition with quorum
    Last updated: Mon Dec 14 13:26:08 2020
    Last change: Mon Dec 14 13:19:49 2020 by root via crm_attribute on pg02
    
    3 nodes configured
    5 resource instances configured
    
    Online: [ pg01 pg02 pg03 ]
    
    Full list of resources:
    
     Master/Slave Set: pgsql-cluster [pgsql]
         Masters: [ pg02 ]
         Slaves: [ pg01 pg03 ]
     Resource Group: master-group
         vip-master	(ocf::heartbeat:IPaddr2):	Started pg02
     Resource Group: slave-group
         vip-slave	(ocf::heartbeat:IPaddr2):	Started pg03
    
    Daemon Status:
      corosync: active/enabled
      pacemaker: active/enabled
      pcsd: active/enabled
    [root@pg03 ~]# 
    
    [root@pg01 ~]# crm_mon -Arf -1
    Stack: corosync
    Current DC: pg01 (version 1.1.23-1.el7-9acf116022) - partition with quorum
    Last updated: Mon Dec 14 13:38:05 2020
    Last change: Mon Dec 14 13:33:33 2020 by root via crm_attribute on pg01
    
    3 nodes configured
    5 resource instances configured
    
    Online: [ pg01 pg02 pg03 ]
    
    Full list of resources:
    
     Master/Slave Set: pgsql-cluster [pgsql]
         Masters: [ pg01 ]
         Slaves: [ pg02 pg03 ]
     Resource Group: master-group
         vip-master	(ocf::heartbeat:IPaddr2):	Started pg01
     Resource Group: slave-group
         vip-slave	(ocf::heartbeat:IPaddr2):	Started pg03
    
    Node Attributes:
    * Node pg01:
        + master-pgsql                    	: 1000      
        + pgsql-data-status               	: LATEST    
        + pgsql-master-baseline           	: 0000000008000098
        + pgsql-status                    	: PRI       
    * Node pg02:
        + master-pgsql                    	: -INFINITY 
        + pgsql-data-status               	: STREAMING|ASYNC
        + pgsql-status                    	: HS:async  
    * Node pg03:
        + master-pgsql                    	: 100       
        + pgsql-data-status               	: STREAMING|SYNC
        + pgsql-status                    	: HS:sync   
    
    Migration Summary:
    * Node pg01:
    * Node pg02:
    * Node pg03:
    [root@pg01 ~]# 
    
    

    六、问题排查

    master宕机启动时,需要删除临时锁文件方可进行集群角色转换
    
    1.查看本机集群状态
    
    [root@pg01 ~]# crm_mon -Arf -1
    Stack: corosync
    Current DC: pg03 (version 1.1.23-1.el7-9acf116022) - partition with quorum
    Last updated: Mon Dec 14 13:39:58 2020
    Last change: Mon Dec 14 13:39:02 2020 by root via crm_attribute on pg03
    
    3 nodes configured
    5 resource instances configured
    
    Online: [ pg01 pg02 pg03 ]
    
    Full list of resources:
    
     Master/Slave Set: pgsql-cluster [pgsql]
         Masters: [ pg03 ]
         Slaves: [ pg02 ]
         Stopped: [ pg01 ]
     Resource Group: master-group
         vip-master	(ocf::heartbeat:IPaddr2):	Started pg03
     Resource Group: slave-group
         vip-slave	(ocf::heartbeat:IPaddr2):	Started pg02
    
    Node Attributes:
    * Node pg01:
        + master-pgsql                    	: -INFINITY 
        + pgsql-data-status               	: DISCONNECT
        + pgsql-status                    	: STOP      
    * Node pg02:
        + master-pgsql                    	: 100       
        + pgsql-data-status               	: STREAMING|SYNC
        + pgsql-status                    	: HS:sync   
    * Node pg03:
        + master-pgsql                    	: 1000      
        + pgsql-data-status               	: LATEST    
        + pgsql-master-baseline           	: 0000000009000098
        + pgsql-status                    	: PRI       
    
    Migration Summary:
    * Node pg01:
       pgsql: migration-threshold=3 fail-count=1000000 last-failure='Mon Dec 14 13:39:47 2020'
    * Node pg02:
    * Node pg03:
    
    Failed Resource Actions:
    * pgsql_start_0 on pg01 'unknown error' (1): call=15, status=complete, exitreason='My data may be inconsistent. You have to remove /var/lib/pgsql/tmp/PGSQL.lock file to force start.',
        last-rc-change='Mon Dec 14 13:39:45 2020', queued=0ms, exec=120ms
    
    2.删除PGSQL.lock文件
    [root@pg01 ~]# cd /var/lib/pgsql/tmp/
    [root@pg01 tmp]# ls
    PGSQL.lock  rep_mode.conf
    [root@pg01 tmp]# rm -rf PGSQL.lock 
    [root@pg01 tmp]# ll
    total 4
    -rw-r----- 1 postgres postgres 31 Dec 14 13:39 rep_mode.conf
    [root@pg01 tmp]# 
    [root@pg01 tmp]# 
    
    3.检查集群状态
    [root@pg01 tmp]# crm_mon -Arf -1
    Stack: corosync
    Current DC: pg03 (version 1.1.23-1.el7-9acf116022) - partition with quorum
    Last updated: Mon Dec 14 13:40:18 2020
    Last change: Mon Dec 14 13:39:02 2020 by root via crm_attribute on pg03
    
    3 nodes configured
    5 resource instances configured
    
    Online: [ pg01 pg02 pg03 ]
    
    Full list of resources:
    
     Master/Slave Set: pgsql-cluster [pgsql]
         Masters: [ pg03 ]
         Slaves: [ pg02 ]
         Stopped: [ pg01 ]
     Resource Group: master-group
         vip-master	(ocf::heartbeat:IPaddr2):	Started pg03
     Resource Group: slave-group
         vip-slave	(ocf::heartbeat:IPaddr2):	Started pg02
    
    Node Attributes:
    * Node pg01:
        + master-pgsql                    	: -INFINITY 
        + pgsql-data-status               	: DISCONNECT
        + pgsql-status                    	: STOP      
    * Node pg02:
        + master-pgsql                    	: 100       
        + pgsql-data-status               	: STREAMING|SYNC
        + pgsql-status                    	: HS:sync   
    * Node pg03:
        + master-pgsql                    	: 1000      
        + pgsql-data-status               	: LATEST    
        + pgsql-master-baseline           	: 0000000009000098
        + pgsql-status                    	: PRI       
    
    Migration Summary:
    * Node pg01:
       pgsql: migration-threshold=3 fail-count=1000000 last-failure='Mon Dec 14 13:39:47 2020'
    * Node pg02:
    * Node pg03:
    
    Failed Resource Actions:
    * pgsql_start_0 on pg01 'unknown error' (1): call=15, status=complete, exitreason='My data may be inconsistent. You have to remove /var/lib/pgsql/tmp/PGSQL.lock file to force start.',
        last-rc-change='Mon Dec 14 13:39:45 2020', queued=0ms, exec=120ms
    
    4.重置并刷新集群状态
    [root@pg01 tmp]# crm_resource --cleanup
    Cleaned up all resources on all nodes
    Waiting for 1 reply from the CRMd. OK
    
    5.检查集群状态
    
    [root@pg01 tmp]# crm_mon -Arf -1
    Stack: corosync
    Current DC: pg03 (version 1.1.23-1.el7-9acf116022) - partition with quorum
    Last updated: Mon Dec 14 13:40:40 2020
    Last change: Mon Dec 14 13:40:38 2020 by root via crm_attribute on pg03
    
    3 nodes configured
    5 resource instances configured
    
    Online: [ pg01 pg02 pg03 ]
    
    Full list of resources:
    
     Master/Slave Set: pgsql-cluster [pgsql]
         Masters: [ pg03 ]
         Slaves: [ pg01 pg02 ]
     Resource Group: master-group
         vip-master	(ocf::heartbeat:IPaddr2):	Started pg03
     Resource Group: slave-group
         vip-slave	(ocf::heartbeat:IPaddr2):	Started pg02
    
    Node Attributes:
    * Node pg01:
        + master-pgsql                    	: -INFINITY 
        + pgsql-data-status               	: STREAMING|ASYNC
        + pgsql-status                    	: HS:async  
    * Node pg02:
        + master-pgsql                    	: 100       
        + pgsql-data-status               	: STREAMING|SYNC
        + pgsql-status                    	: HS:sync   
    * Node pg03:
        + master-pgsql                    	: 1000      
        + pgsql-data-status               	: LATEST    
        + pgsql-master-baseline           	: 0000000009000098
        + pgsql-status                    	: PRI       
    
    Migration Summary:
    * Node pg01:
    * Node pg02:
    * Node pg03:
    [root@pg01 tmp]# 
    
    
    常用管理命令
    
    临时维护,只用到了以下几个命令
    pcs status //查看集群状态
    pcs resource show //查看资源
    pcs resource create ClusterIP IPaddr2 ip=192.168.0.120 cidr_netmask=32 //创建一个虚拟IP资源
    pcs resource cleanup [xx] //xx表示虚拟资源名称(省略时清理全部资源),当集群有资源处于unmanaged的状态时,可以用这个命令清理掉失败的信息,然后重置资源状态
    pcs resource list //查看资源列表
    pcs resource restart //重启资源
    pcs resource enable //启动资源
    pcs resource disable //关闭资源
    pcs resource delete //删除资源
    
  • 相关阅读:
    Day 13 匿名函数 :内置函数: 闭包
    Day 12 生成器:生成器表达式:列表表达式:yield:yield from:内置函数
    最近这些天
    正则表达式 I
    Python 软件规范开发
    模块
    常用模块介绍 time datetime os sys hashlib json pickle collections
    模拟博客园登录 基础版
    SQL 必知必会
    Python中的线程和进程
  • 原文地址:https://www.cnblogs.com/Alicebat/p/14148933.html
Copyright © 2011-2022 走看看