  • Repairing a crashed k8s HA master node

    The k8s cluster

    # k8s cluster
    192.168.71.201 k8s-master01 master01
    192.168.71.202 k8s-master02 master02 harbor
    192.168.71.203 k8s-master03 master03
    192.168.71.204 k8s-node01  node01
    192.168.71.205 k8s-node02  node02
    
    
    k8s-master01   Ready      master   170m   v1.16.0
    k8s-master02   Ready      master   167m   v1.16.0
    k8s-master03   Ready      master   164m   v1.16.0
    k8s-node01     Ready      <none>   162m   v1.16.0
    k8s-node02     Ready      <none>   162m   v1.16.0
    
    
    Cluster virtual IP: 192.168.71.200
    For the k8s HA cluster deployment itself, refer to the earlier document: https://www.cnblogs.com/lixinliang/p/12217033.html
    
    

    Reproducing the failure

    # Create the failure
    Shut down the first master node (master01).
    Because of the check_haproxy.sh script, the virtual IP fails over to the master02 node.
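    For context, the failover works because keepalived periodically runs a health-check script on each master. The exact check_haproxy.sh from the referenced deployment document is not reproduced here; the following is only a minimal sketch of what such a script typically looks like:
    
    #!/bin/bash
    # Hypothetical check_haproxy.sh: if haproxy is not running, stop keepalived
    # so the VIP (192.168.71.200) fails over to another master node.
    if ! pgrep -x haproxy > /dev/null; then
        systemctl stop keepalived
    fi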
    

    Recovering from the failure

    # Get the ID of the failed etcd member
    First, on the healthy master02 node, run the following command to get the ID of the failed member in the etcd cluster:
    ETCD=`docker ps|grep etcd|grep -v POD|awk '{print $1}'`
    docker exec \
      -it ${ETCD} \
      etcdctl \
      --endpoints https://127.0.0.1:2379 \
      --ca-file /etc/kubernetes/pki/etcd/ca.crt \
      --cert-file /etc/kubernetes/pki/etcd/peer.crt \
      --key-file /etc/kubernetes/pki/etcd/peer.key \
      cluster-health
    
    The output reports the error: member 19c5f5e4748dc98b is unreachable
    So the failed member ID is 19c5f5e4748dc98b.
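    As a small convenience (not part of the original procedure), the unreachable member ID can also be extracted directly from the cluster-health output, assuming it contains a "member <id> is unreachable" line:
    
    ETCD=`docker ps|grep etcd|grep -v POD|awk '{print $1}'`
    # grab the ID from the "member <id> is unreachable" line (stderr merged in)
    FAILED_ID=$(docker exec ${ETCD} etcdctl \
      --endpoints https://127.0.0.1:2379 \
      --ca-file /etc/kubernetes/pki/etcd/ca.crt \
      --cert-file /etc/kubernetes/pki/etcd/peer.crt \
      --key-file /etc/kubernetes/pki/etcd/peer.key \
      cluster-health 2>&1 | awk '/is unreachable/ {print $2}')
    echo ${FAILED_ID}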
    
    # Remove the failed etcd member
    Since the failed node has already been reset, the etcd instance behind this ID is effectively lost and can no longer be contacted, so simply run the following command to remove the failed member from the etcd cluster.
    
    ETCD=`docker ps|grep etcd|grep -v POD|awk '{print $1}'`
    
    docker exec \
      -it ${ETCD} \
      etcdctl \
      --endpoints https://127.0.0.1:2379 \
      --ca-file /etc/kubernetes/pki/etcd/ca.crt \
      --cert-file /etc/kubernetes/pki/etcd/server.crt \
      --key-file /etc/kubernetes/pki/etcd/server.key \
      member remove 19c5f5e4748dc98b
    
    
    # Check the etcd cluster status again
    Checking again, only the two remaining etcd nodes show as healthy:
    docker exec \
      -it ${ETCD} \
      etcdctl \
      --endpoints https://127.0.0.1:2379 \
      --ca-file /etc/kubernetes/pki/etcd/ca.crt \
      --cert-file /etc/kubernetes/pki/etcd/peer.crt \
      --key-file /etc/kubernetes/pki/etcd/peer.key \
      cluster-health
    
    # Add the new node
    
    + Basic configuration (see the sketch after this list)
    * Change the hostname back to master01's original hostname
    * Keep the /etc/hosts file in sync with the other nodes
    * Set up passwordless SSH login
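    A minimal sketch of these preparation steps, assuming the rebuilt node comes back with the original IP 192.168.71.201 and the commands are run from master02 (adjust to your environment):
    
    # set up passwordless SSH from master02 to the rebuilt node
    ssh-copy-id root@192.168.71.201
    
    # keep /etc/hosts in sync by copying master02's already-correct file
    scp /etc/hosts 192.168.71.201:/etc/hosts
    
    # restore the original hostname on the rebuilt node
    ssh 192.168.71.201 "hostnamectl set-hostname k8s-master01"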
    
    
    + Prepare the keepalived and haproxy configuration files
    Copy the relevant configuration from the original master01 directly, then start the services, as sketched below.
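    A sketch under the assumption that a copy of master01's original keepalived/haproxy configuration is kept on master02 under the hypothetical path /backup/master01; the important point is that the keepalived priority/state values are master01's originals, not master02's:
    
    # run on master02, from the (hypothetical) backup of master01's configs
    scp /backup/master01/keepalived.conf   192.168.71.201:/etc/keepalived/keepalived.conf
    scp /backup/master01/check_haproxy.sh  192.168.71.201:/etc/keepalived/check_haproxy.sh
    scp /backup/master01/haproxy.cfg       192.168.71.201:/etc/haproxy/haproxy.cfg
    
    # start the services on the rebuilt node
    ssh 192.168.71.201 "systemctl enable --now haproxy keepalived"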
    
    + Distribute certificates
    On the master02 node, distribute the certificates to master01 using the script below:
    
    
    #!/bin/bash
    for index in 201; do
      ip=192.168.71.${index}
      ssh $ip "mkdir -p /etc/kubernetes/pki/etcd; mkdir -p ~/.kube/"
      scp /etc/kubernetes/pki/ca.crt $ip:/etc/kubernetes/pki/ca.crt
      scp /etc/kubernetes/pki/ca.key $ip:/etc/kubernetes/pki/ca.key
      scp /etc/kubernetes/pki/sa.key $ip:/etc/kubernetes/pki/sa.key
      scp /etc/kubernetes/pki/sa.pub $ip:/etc/kubernetes/pki/sa.pub
      scp /etc/kubernetes/pki/front-proxy-ca.crt $ip:/etc/kubernetes/pki/front-proxy-ca.crt
      scp /etc/kubernetes/pki/front-proxy-ca.key $ip:/etc/kubernetes/pki/front-proxy-ca.key
      scp /etc/kubernetes/pki/etcd/ca.crt $ip:/etc/kubernetes/pki/etcd/ca.crt
      scp /etc/kubernetes/pki/etcd/ca.key $ip:/etc/kubernetes/pki/etcd/ca.key
      scp /etc/kubernetes/admin.conf $ip:/etc/kubernetes/admin.conf
      scp /etc/kubernetes/admin.conf $ip:~/.kube/config
    done
    
    
    + Prepare the kubeadm_master01.conf configuration file on master01
    Next, join the new (re-initialized) node back into the cluster to restore the three-node HA master. Note that rebuilding the master reuses the kubeadm configuration file from the initial HA master deployment; that file is reused directly here.
    
    Note: the file below must be modified in two places; the original configuration file cannot be used as-is.
    
    $ cat kubeadm_master01.conf
    apiVersion: kubeadm.k8s.io/v1beta1
    kind: InitConfiguration
    localAPIEndpoint:
      advertiseAddress: 192.168.71.201
      bindPort: 6443
    ---
    apiVersion: kubeadm.k8s.io/v1beta1
    kind: ClusterConfiguration
    kubernetesVersion: v1.16.0
    controlPlaneEndpoint: "192.168.71.200:8443"
    imageRepository: registry.aliyuncs.com/google_containers
    apiServer:
      certSANs:
      - "master01"
      - "master02"
      - "master03"
      - 192.168.71.201
      - 192.168.71.202
      - 192.168.71.203
      - 192.168.71.200
    networking:
      podSubnet: "10.244.0.0/16"
      serviceSubnet: "10.96.0.0/12"
    certificatesDir: /etc/kubernetes/pki
    clusterName: kubernetes
    
    etcd:
      local:
        extraArgs:
          listen-client-urls: "https://127.0.0.1:2379,https://192.168.71.201:2379"
          advertise-client-urls: "https://192.168.71.201:2379"
          listen-peer-urls: "https://192.168.71.201:2380"
          initial-advertise-peer-urls: "https://192.168.71.201:2380"
          initial-cluster: "k8s-master01=https://192.168.71.201:2380,k8s-master02=https://192.168.71.202:2380,k8s-master03=https://192.168.71.203:2380"  
    # Change 1: the line above must be modified so that the HOST=IP pairs of all etcd nodes, including the rebuilt node, are listed; otherwise etcd on the new node will fail to start
    
          initial-cluster-state: existing
    # Change 2: the line above must be modified. Because the cluster already exists, change the original value new to existing. If the damaged master node is not master01, this line does not need to be changed (those nodes' configs already joined an existing cluster).
    
        serverCertSANs:
          - master01
          - 192.168.71.201
        peerCertSANs:
          - master01
          - 192.168.71.201
    ---
    apiVersion: kubeproxy.config.k8s.io/v1alpha1
    kind: KubeProxyConfiguration
    mode: ipvs
    
    
    Run the following on master01:
    # Generate the certificates
    kubeadm init phase certs all --config kubeadm_master01.conf
    # Set up the local etcd
    kubeadm init phase etcd local --config kubeadm_master01.conf
    # Generate the kubelet kubeconfig
    kubeadm init phase kubeconfig kubelet --config kubeadm_master01.conf
    # Start kubelet
    kubeadm init phase kubelet-start --config kubeadm_master01.conf
    # Add master01's etcd to the etcd cluster
    
    
     kubectl exec -n kube-system  etcd-k8s-master02 -- etcdctl --ca-file /etc/kubernetes/pki/etcd/ca.crt --cert-file /etc/kubernetes/pki/etcd/peer.crt --key-file /etc/kubernetes/pki/etcd/peer.key --endpoints=https://192.168.71.202:2379 member add master1 https://192.168.71.201:2380
    
    
    # Start kube-apiserver, kube-controller-manager, and kube-scheduler
    kubeadm init phase kubeconfig all --config kubeadm_master01.conf
    kubeadm init phase control-plane all --config kubeadm_master01.conf
    
    
    # Mark the node as a master
    kubeadm init phase mark-control-plane --config kubeadm_master01.conf
    
    # Check the nodes
    kubectl get nodes
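    Optionally (not part of the original steps), also confirm that the static control-plane pods on the rebuilt node are up:
    
    kubectl get pods -n kube-system -o wide | grep k8s-master01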
    
    
    
    # Check the etcd cluster status again
    [root@k8s-master02 ~]#  docker exec   -it ${ETCD}   etcdctl   --endpoints https://127.0.0.1:2379   --ca-file /etc/kubernetes/pki/etcd/ca.crt   --cert-file /etc/kubernetes/pki/etcd/peer.crt   --key-file /etc/kubernetes/pki/etcd/peer.key   cluster-health
    member 858768c8e151d5d8 is healthy: got healthy result from https://192.168.71.202:2379
    member c79fe8ecd577a746 is healthy: got healthy result from https://192.168.71.203:2379
    member e2892a4ec808af4e is healthy: got healthy result from https://192.168.71.201:2379
    cluster is healthy
    
    
    The etcd cluster output is back to normal with three healthy members, which confirms that master01 has been repaired.
    