K8s cluster cleanup
kubeadm reset -f
root@ubuntu:~/cluster# ps -elf | grep kube
4 S root      8099  8071  4  80   0 - 161890 futex_ Jun18 ?        12:33:47 kube-apiserver --advertise-address=10.10.16.82 --allow-privileged=true --authorization-mode=Node,RBAC --client-ca-file=/etc/kubernetes/pki/ca.crt --enable-admission-plugins=NodeRestriction --enable-bootstrap-token-auth=true --etcd-cafile=/etc/kubernetes/pki/etcd/ca.crt --etcd-certfile=/etc/kubernetes/pki/apiserver-etcd-client.crt --etcd-keyfile=/etc/kubernetes/pki/apiserver-etcd-client.key --etcd-servers=https://127.0.0.1:2379 --insecure-port=0 --kubelet-client-certificate=/etc/kubernetes/pki/apiserver-kubelet-client.crt --kubelet-client-key=/etc/kubernetes/pki/apiserver-kubelet-client.key --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname --proxy-client-cert-file=/etc/kubernetes/pki/front-proxy-client.crt --proxy-client-key-file=/etc/kubernetes/pki/front-proxy-client.key --requestheader-allowed-names=front-proxy-client --requestheader-client-ca-file=/etc/kubernetes/pki/front-proxy-ca.crt --requestheader-extra-headers-prefix=X-Remote-Extra- --requestheader-group-headers=X-Remote-Group --requestheader-username-headers=X-Remote-User --secure-port=6443 --service-account-key-file=/etc/kubernetes/pki/sa.pub --service-cluster-ip-range=10.96.0.0/12 --tls-cert-file=/etc/kubernetes/pki/apiserver.crt --tls-private-key-file=/etc/kubernetes/pki/apiserver.key
0 S root     13537 36620  0  80   0 -   1096 pipe_w 16:36 pts/0    00:00:00 grep kube
4 S root     17548 17521  0  80   0 - 187659 futex_ Jun21 ?        00:07:47 /usr/bin/kube-controllers
4 S root     55252 55222  0  80   0 -  36362 ep_pol Jun25 ?        00:24:17 /usr/local/bin/kube-proxy --config=/var/lib/kube-proxy/config.conf --hostname-override=ubuntu
root@ubuntu:~/cluster# kill -9 8099 17548 55252
root@ubuntu:~/cluster#
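Even after kubeadm reset, stray kube processes (as above) and old CNI/kubelet state can linger. A minimal cleanup sketch for each node, assuming the usual default paths; adjust to your environment:

systemctl stop kubelet
rm -rf /etc/cni/net.d /var/lib/cni           # leftover CNI config and state
iptables -F && iptables -t nat -F && iptables -X
ipvsadm --clear 2>/dev/null || true          # only needed if kube-proxy ran in IPVS mode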
Initialize the first master node
HAProxy configuration
#---------------------------------------------------------------------
backend kube-apiserver
    mode tcp
    option tcplog
    option tcp-check
    balance roundrobin
    default-server inter 10s downinter 5s rise 2 fall 2 slowstart 60s maxconn 250 maxqueue 256 weight 100
    server ubuntu 10.10.16.82:6443 check
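Only the backend is shown here; it needs a matching frontend. The keepalived health check below probes port 9443, so a frontend consistent with that would look roughly like the sketch below (the bind address and port are assumptions, adjust to the actual haproxy.cfg):

frontend kube-apiserver-frontend
    mode tcp
    option tcplog
    bind *:9443
    default_backend kube-apiserver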
Keepalived VIP
root@ubuntu:/etc/haproxy# cat ../keepalived/keepalived.conf
global_defs {
   script_user root
   enable_script_security
}
vrrp_script chk_haproxy {
    script "/bin/bash -c 'if [[ $(netstat -nlp | grep 9443 | wc -l) ]]; then exit 0; else exit 1; fi'"   # haproxy health check
    interval 2      # run the check every 2 seconds
    #weight -10     # priority change on failure
}
vrrp_instance VI_1 {
    interface enahisic2i0    ### host NIC name
    state BACKUP
    virtual_router_id 61     # same id on all nodes: same virtual router group
    priority 80              # initial priority
    nopreempt                # do not preempt
    unicast_peer {
        10.10.16.47
        10.10.16.251
    }
    virtual_ipaddress {
        10.10.16.249         # vip
    }
    authentication {
        auth_type PASS
        auth_pass password
    }
    track_script {
        chk_haproxy
    }
    #notify "/container/service/keepalived/assets/"
}
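With keepalived running on each HAProxy node, the VIP should be held by exactly one node at a time. A quick verification sketch (interface name as in the config above):

ip addr show enahisic2i0 | grep 10.10.16.249      # VIP appears only on the current MASTER
systemctl status keepalived --no-pager | tail -n 5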
root@ubuntu:~# kubeadm config print init-defaults > kubeadm-init.yaml.yaml
W0630 16:17:15.326593    6239 configset.go:202] WARNING: kubeadm cannot validate component configs for API groups [kubelet.config.k8s.io kubeproxy.config.k8s.io]
root@ubuntu:~# kubeadm config images list
I0630 16:20:18.442685    7275 version.go:252] remote version is much newer: v1.21.2; falling back to: stable-1.18
W0630 16:20:19.242028    7275 configset.go:202] WARNING: kubeadm cannot validate component configs for API groups [kubelet.config.k8s.io kubeproxy.config.k8s.io]
k8s.gcr.io/kube-apiserver:v1.18.20
k8s.gcr.io/kube-controller-manager:v1.18.20
k8s.gcr.io/kube-scheduler:v1.18.20
k8s.gcr.io/kube-proxy:v1.18.20
k8s.gcr.io/pause:3.2
k8s.gcr.io/etcd:3.4.3-0
k8s.gcr.io/coredns:1.6.7
root@ubuntu:~#
root@ubuntu:~/cluster# kubeadm config images list | grep k8s.gcr.io/kube-apiserver:v1.18.20
I0630 16:47:23.769353   17086 version.go:252] remote version is much newer: v1.21.2; falling back to: stable-1.18
W0630 16:47:24.488215   17086 configset.go:202] WARNING: kubeadm cannot validate component configs for API groups [kubelet.config.k8s.io kubeproxy.config.k8s.io]
k8s.gcr.io/kube-apiserver:v1.18.20
root@ubuntu:~/cluster# kubeadm config images pull
I0630 16:48:28.388046   17364 version.go:252] remote version is much newer: v1.21.2; falling back to: stable-1.18
W0630 16:48:29.103399   17364 configset.go:202] WARNING: kubeadm cannot validate component configs for API groups [kubelet.config.k8s.io kubeproxy.config.k8s.io]
failed to pull image "k8s.gcr.io/kube-apiserver:v1.18.20": output: Error response from daemon: Get https://k8s.gcr.io/v2/: net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)
, error: exit status 1
To see the stack trace of this error execute with --v=5 or higher
root@ubuntu:~/cluster#
It turned out there was a mistake in the config:
kubernetesVersion: stable
Change it to:
kubernetesVersion: v1.18.0
root@ubuntu:~/cluster# cat clus.sh
proxy=10.10.16.249
etcd1=10.10.18.42
etcd2=10.10.18.43
etcd3=10.10.18.44
cat << EOF > kubeadm-config.yaml
apiVersion: kubeadm.k8s.io/v1beta2
kind: ClusterConfiguration
kubernetesVersion: v1.18.0
imageRepository: registry.cn-hangzhou.aliyuncs.com/google_containers
apiServer:
  certSANs:
  - "$proxy"
controlPlaneEndpoint: "$proxy:6443"
etcd:
  external:
    endpoints:
    - "https://$etcd1:2379"
    - "https://$etcd2:2379"
    - "https://$etcd3:2379"
    caFile: /opt/etcd/ssl/ca.pem
    certFile: /opt/etcd/ssl/server.pem
    keyFile: /opt/etcd/ssl/server-key.pem
networking:
  podSubnet: "10.244.0.0/16"
EOF
kubeadm init --config kubeadm-config.yaml
You can now join any number of control-plane nodes by copying certificate authorities
and service account keys on each node and then running the following as root:

  kubeadm join 10.10.16.249:6443 --token qiojkb.svprbcf7fd7k8m4s \
    --discovery-token-ca-cert-hash sha256:dd --control-plane

Then you can join any number of worker nodes by running the following on each as root:

kubeadm join 10.10.16.249:6443 --token qiojkb.svprbcf7fd7k8m4s \
    --discovery-token-ca-cert-hash sha256:dd
Run:
mkdir -p $HOME/.kube
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $(id -u):$(id -g) $HOME/.kube/config
root@ubuntu:~# kubectl get nodes
NAME     STATUS   ROLES    AGE     VERSION
ubuntu   Ready    master   6m59s   v1.18.1
root@ubuntu:~#
kubectl get pods -o wide -n kube-system NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES coredns-546565776c-4b95r 0/1 ContainerCreating 0 3m9s <none> ubuntu <none> <none> coredns-546565776c-xfwr5 0/1 ContainerCreating 0 3m21s <none> ubuntu <none> <none> kube-apiserver-ubuntu 1/1 Running 0 10m 10.10.16.82 ubuntu <none> <none> kube-controller-manager-ubuntu 1/1 Running 4 10m 10.10.16.82 ubuntu <none> <none> kube-flannel-ds-arm64-2js47 0/1 CrashLoopBackOff 5 5m1s 10.10.16.82 ubuntu <none> <none> kube-proxy-m8s4m 1/1 Running 0 5m 10.10.16.82 ubuntu <none> <none> kube-scheduler-ubuntu 1/1 Running 4 10m 10.10.16.82 ubuntu <none> <none>
There is no pause container yet, and coredns is stuck in ContainerCreating.
root@ubuntu:/etc/kubernetes/pki# kubectl delete pod kube-flannel-ds-arm64-2js47 -n kube-system pod "kube-flannel-ds-arm64-2js47" deleted root@ubuntu:/etc/kubernetes/pki# kubectl get pods -o wide -n kube-system NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES coredns-546565776c-4b95r 0/1 ContainerCreating 0 6m17s <none> ubuntu <none> <none> coredns-546565776c-xfwr5 0/1 ContainerCreating 0 6m29s <none> ubuntu <none> <none> kube-apiserver-ubuntu 1/1 Running 0 13m 10.10.16.82 ubuntu <none> <none> kube-controller-manager-ubuntu 1/1 Running 5 13m 10.10.16.82 ubuntu <none> <none> kube-proxy-m8s4m 1/1 Running 0 8m8s 10.10.16.82 ubuntu <none> <none> kube-scheduler-ubuntu 1/1 Running 5 13m 10.10.16.82 ubuntu <none> <none>
Join the other master nodes
Copy the certificates and keys
scp ca.* sa.* front-proxy-ca.* root@10.10.16.251:/etc/kubernetes/pki/
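The same material also goes to the third master. A small sketch that copies the shared CA, service-account and front-proxy files to both remaining control-plane nodes (assumes /etc/kubernetes/pki already exists on the targets; the external etcd certs under /opt/etcd/ssl have to be distributed separately):

for host in 10.10.16.47 10.10.16.251; do
  ssh root@$host "mkdir -p /etc/kubernetes/pki"
  scp /etc/kubernetes/pki/ca.* /etc/kubernetes/pki/sa.* /etc/kubernetes/pki/front-proxy-ca.* root@$host:/etc/kubernetes/pki/
done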
Run the join command
[root@centos7 ~]# kubeadm join 10.10.16.249:6443 --token qiojkb.svprbcf7fd7k8m4s --discovery-token-ca-cert-hash sha256:dd30de5ae3a2006842ae01c6bf4 --control-plane
[preflight] Running pre-flight checks
        [WARNING Hostname]: hostname "centos7" could not be reached
        [WARNING Hostname]: hostname "centos7": lookup centos7 on 8.8.8.8:53: no such host
[preflight] Reading configuration from the cluster...
Configure kubectl
To start administering your cluster from this node, you need to run the following as a regular user:

    mkdir -p $HOME/.kube
    sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
    sudo chown $(id -u):$(id -g) $HOME/.kube/config

Run 'kubectl get nodes' to see this node join the cluster.

[root@centos7 ~]# mkdir -p $HOME/.kube
[root@centos7 ~]# cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
[root@centos7 ~]# chown $(id -u):$(id -g) $HOME/.kube/config
[root@centos7 ~]#
[root@centos7 ~]# kubectl get pods -o wide -n kube-system NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES coredns-546565776c-4b95r 0/1 ContainerCreating 0 10m <none> ubuntu <none> <none> coredns-546565776c-xfwr5 0/1 ContainerCreating 0 10m <none> ubuntu <none> <none> kube-apiserver-centos7 1/1 Running 0 101s 10.10.16.251 centos7 <none> <none> kube-apiserver-cloud 1/1 Running 0 44s 10.10.16.47 cloud <none> <none> kube-apiserver-ubuntu 1/1 Running 0 17m 10.10.16.82 ubuntu <none> <none> kube-controller-manager-centos7 1/1 Running 0 98s 10.10.16.251 centos7 <none> <none> kube-controller-manager-cloud 1/1 Running 0 33s 10.10.16.47 cloud <none> <none> kube-controller-manager-ubuntu 1/1 Running 5 17m 10.10.16.82 ubuntu <none> <none> kube-flannel-ds-arm64-88gxp 0/1 CrashLoopBackOff 4 3m6s 10.10.16.251 centos7 <none> <none> kube-flannel-ds-arm64-w9t92 0/1 CrashLoopBackOff 4 2m 10.10.16.47 cloud <none> <none> kube-flannel-ds-arm64-xg9s8 0/1 CrashLoopBackOff 5 4m6s 10.10.16.82 ubuntu <none> <none> kube-proxy-4bg7x 1/1 Running 0 3m7s 10.10.16.251 centos7 <none> <none> kube-proxy-jtrkp 1/1 Running 0 2m1s 10.10.16.47 cloud <none> <none> kube-proxy-m8s4m 1/1 Running 0 12m 10.10.16.82 ubuntu <none> <none> kube-scheduler-centos7 1/1 Running 0 101s 10.10.16.251 centos7 <none> <none> kube-scheduler-cloud 1/1 Running 0 33s 10.10.16.47 cloud <none> <none> kube-scheduler-ubuntu 1/1 Running 5 17m 10.10.16.82 ubuntu <none> <none> [root@centos7 ~]#
Update the HAProxy configuration
backend kube-apiserver
    mode tcp
    option tcplog
    option tcp-check
    balance roundrobin
    default-server inter 10s downinter 5s rise 2 fall 2 slowstart 60s maxconn 250 maxqueue 256 weight 100
    server ubuntu 10.10.16.82:6443 check
    server cloud 10.10.16.47:6443 check
    server centos7 10.10.16.251:6443 check
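After reloading HAProxy, a quick sketch to confirm all three apiserver backends are reachable from the proxy node (plain bash, no extra tools assumed):

systemctl reload haproxy
for ip in 10.10.16.82 10.10.16.47 10.10.16.251; do
  timeout 2 bash -c "</dev/tcp/$ip/6443" && echo "$ip:6443 reachable"
done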
Packet capture with tcpdump
root@ubuntu:~/cluster# tcpdump -i enahisic2i0 tcp and port 6443 -ennvv tcpdump: listening on enahisic2i0, link-type EN10MB (Ethernet), capture size 262144 bytes 17:19:49.151776 b0:08:75:5f:b8:5b > 48:57:02:64:e7:ab, ethertype IPv4 (0x0800), length 66: (tos 0x0, ttl 64, id 18123, offset 0, flags [DF], proto TCP (6), length 52) 10.10.16.251.33358 > 10.10.16.249.6443: Flags [.], cksum 0xf5cb (correct), seq 3177185730, ack 3261322716, win 253, options [nop,nop,TS val 1926893007 ecr 1263307546], length 0 17:19:49.151801 48:57:02:64:e7:ab > b0:08:75:5f:b8:5b, ethertype IPv4 (0x0800), length 66: (tos 0x0, ttl 64, id 6868, offset 0, flags [DF], proto TCP (6), length 52) 10.10.16.249.6443 > 10.10.16.251.33358: Flags [.], cksum 0x362e (incorrect -> 0x7e9a), seq 1, ack 1, win 501, options [nop,nop,TS val 1263337810 ecr 1926893007], length 0 17:19:49.633218 b0:08:75:5f:b8:5b > 48:57:02:64:e7:ab, ethertype IPv4 (0x0800), length 103: (tos 0x0, ttl 64, id 35885, offset 0, flags [DF], proto TCP (6), length 89) 10.10.16.251.33024 > 10.10.16.249.6443: Flags [P.], cksum 0x98c2 (correct), seq 2495572463:2495572500, ack 2740706400, win 1933, options [nop,nop,TS val 1926923752 ecr 1263334669], length 37 17:19:49.633250 48:57:02:64:e7:ab > b0:08:75:5f:b8:5b, ethertype IPv4 (0x0800), length 66: (tos 0x0, ttl 64, id 2218, offset 0, flags [DF], proto TCP (6), length 52) 10.10.16.249.6443 > 10.10.16.251.33024: Flags [.], cksum 0x362e (incorrect -> 0xdec1), seq 1, ack 37, win 501, options [nop,nop,TS val 1263338291 ecr 1926923752], length 0 17:19:50.133211 9c:52:f8:67:c4:d3 > 48:57:02:64:e7:ab, ethertype IPv4 (0x0800), length 66: (tos 0x0, ttl 64, id 26626, offset 0, flags [DF], proto TCP (6), length 52) 10.10.16.47.40622 > 10.10.16.249.6443: Flags [.], cksum 0x2231 (correct), seq 669238092, ack 2486368405, win 501, options [nop,nop,TS val 1255232087 ecr 2251803177], length 0 17:19:50.133236 48:57:02:64:e7:ab > 9c:52:f8:67:c4:d3, ethertype IPv4 (0x0800), length 66: (tos 0x0, ttl 64, id 19924, offset 0, flags [DF], proto TCP (6), length 52) 10.10.16.249.6443 > 10.10.16.47.40622: Flags [.], cksum 0x3562 (incorrect -> 0x153e), seq 1, ack 1, win 501, options [nop,nop,TS val 2251833897 ecr 1255139146], length 0 17:19:50.149613 9c:52:f8:67:c4:d3 > 48:57:02:64:e7:ab, ethertype IPv4 (0x0800), length 104: (tos 0x0, ttl 64, id 37266, offset 0, flags [DF], proto TCP (6), length 90) 10.10.16.47.40486 > 10.10.16.249.6443: Flags [P.], cksum 0x3885 (correct), seq 2736523175:2736523213, ack 776784886, win 1145, options [nop,nop,TS val 1255232103 ecr 2251829749], length 38 17:19:50.150506 48:57:02:64:e7:ab > 9c:52:f8:67:c4:d3, ethertype IPv4 (0x0800), length 149: (tos 0x0, ttl 64, id 23752, offset 0, flags [DF], proto TCP (6), length 135) 10.10.16.249.6443 > 10.10.16.47.40486: Flags [P.], cksum 0x35b5 (incorrect -> 0x22d6), seq 1:84, ack 38, win 1632, options [nop,nop,TS val 2251833914 ecr 1255232103], length 83 17:19:50.150569 9c:52:f8:67:c4:d3 > 48:57:02:64:e7:ab, ethertype IPv4 (0x0800), length 66: (tos 0x0, ttl 64, id 37267, offset 0, flags [DF], proto TCP (6), length 52) 10.10.16.47.40486 > 10.10.16.249.6443: Flags [.], cksum 0x7a8b (correct), seq 38, ack 84, win 1145, options [nop,nop,TS val 1255232104 ecr 2251833914], length 0 17:19:50.150576 48:57:02:64:e7:ab > 9c:52:f8:67:c4:d3, ethertype IPv4 (0x0800), length 6169: (tos 0x0, ttl 64, id 23753, offset 0, flags [DF], proto TCP (6), length 6155) 10.10.16.249.6443 > 10.10.16.47.40486: Flags [P.], cksum 0x4d39 (incorrect -> 0xe699), seq 84:6187, ack 38, win 1632, options 
[nop,nop,TS val 2251833914 ecr 1255232103], length 6103 17:19:50.150657 9c:52:f8:67:c4:d3 > 48:57:02:64:e7:ab, ethertype IPv4 (0x0800), length 66: (tos 0x0, ttl 64, id 37268, offset 0, flags [DF], proto TCP (6), length 52) 10.10.16.47.40486 > 10.10.16.249.6443: Flags [.], cksum 0x74eb (correct), seq 38, ack 1532, win 1137, options [nop,nop,TS val 1255232104 ecr 2251833914], length 0 17:19:50.150705 9c:52:f8:67:c4:d3 > 48:57:02:64:e7:ab, ethertype IPv4 (0x0800), length 66: (tos 0x0, ttl 64, id 37269, offset 0, flags [DF], proto TCP (6), length 52) 10.10.16.47.40486 > 10.10.16.249.6443: Flags [.], cksum 0x62d4 (correct), seq 38, ack 6187, win 1113, options [nop,nop,TS val 1255232104 ecr 2251833914], length 0 17:19:50.150710 48:57:02:64:e7:ab > 9c:52:f8:67:c4:d3, ethertype IPv4 (0x0800), length 97: (tos 0x0, ttl 64, id 23758, offset 0, flags [DF], proto TCP (6), length 83) 10.10.16.249.6443 > 10.10.16.47.40486: Flags [P.], cksum 0x3581 (incorrect -> 0xeac6), seq 6187:6218, ack 38, win 1632, options [nop,nop,TS val 2251833914 ecr 1255232104], length 31 17:19:50.150755 9c:52:f8:67:c4:d3 > 48:57:02:64:e7:ab, ethertype IPv4 (0x0800), length 66: (tos 0x0, ttl 64, id 37270, offset 0, flags [DF], proto TCP (6), length 52) 10.10.16.47.40486 > 10.10.16.249.6443: Flags [.], cksum 0x6295 (correct), seq 38, ack 6218, win 1145, options [nop,nop,TS val 1255232104 ecr 2251833914], length 0 17:19:50.150997 9c:52:f8:67:c4:d3 > 48:57:02:64:e7:ab, ethertype IPv4 (0x0800), length 101: (tos 0x0, ttl 64, id 37271, offset 0, flags [DF], proto TCP (6), length 87) 10.10.16.47.40486 > 10.10.16.249.6443: Flags [P.], cksum 0xb851 (correct), seq 38:73, ack 6218, win 1145, options [nop,nop,TS val 1255232104 ecr 2251833914], length 35 17:19:50.154107 9c:52:f8:67:c4:d3 > 48:57:02:64:e7:ab, ethertype IPv4 (0x0800), length 103: (tos 0x0, ttl 64, id 65234, offset 0, flags [DF], proto TCP (6), length 89) 10.10.16.47.40732 > 10.10.16.249.6443: Flags [P.], cksum 0xb7ea (correct), seq 1810087930:1810087967, ack 1457312118, win 3442, options [nop,nop,TS val 1255232107 ecr 2251829777], length 37 17:19:50.192794 48:57:02:64:e7:ab > 9c:52:f8:67:c4:d3, ethertype IPv4 (0x0800), length 66: (tos 0x0, ttl 64, id 23759, offset 0, flags [DF], proto TCP (6), length 52) 10.10.16.249.6443 > 10.10.16.47.40486: Flags [.], cksum 0x3562 (incorrect -> 0x6060), seq 6218, ack 73, win 1632, options [nop,nop,TS val 2251833957 ecr 1255232104], length 0 17:19:50.196795 48:57:02:64:e7:ab > 9c:52:f8:67:c4:d3, ethertype IPv4 (0x0800), length 66: (tos 0x0, ttl 64, id 19605, offset 0, flags [DF], proto TCP (6), length 52) 10.10.16.249.6443 > 10.10.16.47.40732: Flags [.], cksum 0x3562 (incorrect -> 0xcd10), seq 1, ack 37, win 501, options [nop,nop,TS val 2251833961 ecr 1255232107], length 0 ^C 23 packets captured 23 packets received by filter 0 packets dropped by kernel
Comment out the apiserver backends of the other two machines
#---------------------------------------------------------------------
# round robin balancing between the various backends
#---------------------------------------------------------------------
backend kube-apiserver
    mode tcp
    option tcplog
    option tcp-check
    balance roundrobin
    default-server inter 10s downinter 5s rise 2 fall 2 slowstart 60s maxconn 250 maxqueue 256 weight 100
    #server ubuntu 10.10.16.82:6443 check
    server cloud 10.10.16.47:6443 check
    #server centos7 10.10.16.251:6443 check
"haproxy.cfg" [dos] 78L, 3163C written
root@ubuntu:/etc/haproxy# systemctl restart haproxy
root@ubuntu:/etc/haproxy# conntrack -L | grep 9443
tcp      6 42 TIME_WAIT src=10.10.16.81 dst=10.10.16.249 sport=38376 dport=9443 src=10.10.16.249 dst=10.10.16.81 sport=9443 dport=38376 [ASSURED] mark=0 use=1
tcp      6 79 TIME_WAIT src=10.10.16.81 dst=10.10.16.249 sport=38388 dport=9443 src=10.10.16.249 dst=10.10.16.81 sport=9443 dport=38388 [ASSURED] mark=0 use=1
conntrack v1.4.4 (conntrack-tools): 257 flow entries have been shown.
root@ubuntu:/etc/haproxy#
[root@bogon ~]# telnet 10.10.16.249 9443
Trying 10.10.16.249...
Connected to 10.10.16.249.
Escape character is '^]'.
^CConnection closed by foreign host.
[root@bogon ~]#
root@ubuntu:/etc/haproxy# kubectl get nodes -o wide
NAME      STATUS   ROLES    AGE   VERSION   INTERNAL-IP    EXTERNAL-IP   OS-IMAGE                   KERNEL-VERSION                CONTAINER-RUNTIME
centos7   Ready    master   19m   v1.18.1   10.10.16.251   <none>        CentOS Linux 7 (AltArch)   4.14.0-115.el7a.0.1.aarch64   docker://1.13.1
cloud     Ready    master   18m   v1.21.1   10.10.16.47    <none>        Ubuntu 18.04.3 LTS         5.5.19-050519-generic         docker://19.3.13
ubuntu    Ready    master   35m   v1.18.1   10.10.16.82    <none>        Ubuntu 18.04.3 LTS         5.0.0-23-generic              containerd://1.3.7
root@ubuntu:/etc/haproxy#
Worker node join
Join the worker while the other two apiserver backends are still commented out
[root@bogon ~]# kubeadm join 10.10.16.249:6443 --token qiojkb.svprbcf7fd7k8m4s \
> --discovery-token-ca-cert-hash sha256:dd30de5ae3a2006842ae01c6bf4294370c2b941964086d8a
W0630 17:31:53.688439   54158 join.go:346] [preflight] WARNING: JoinControlPane.controlPlane settings will be ignored when control-plane flag is not set.
[preflight] Running pre-flight checks
        [WARNING Service-Docker]: docker service is not enabled, please run 'systemctl enable docker.service'
        [WARNING IsDockerSystemdCheck]: detected "cgroupfs" as the Docker cgroup driver. The recommended driver is "systemd". Please follow the guide at https://kubernetes.io/docs/setup/cri/
        [WARNING SystemVerification]: this Docker version is not on the list of validated versions: 20.10.7. Latest validated version: 19.03
        [WARNING Service-Kubelet]: kubelet service is not enabled, please run 'systemctl enable kubelet.service'
[preflight] Reading configuration from the cluster...
[preflight] FYI: You can look at this config file with 'kubectl -n kube-system get cm kubeadm-config -oyaml'
[kubelet-start] Downloading configuration for the kubelet from the "kubelet-config-1.18" ConfigMap in the kube-system namespace
[kubelet-start] Writing kubelet configuration to file "/var/lib/kubelet/config.yaml"
[kubelet-start] Writing kubelet environment file with flags to file "/var/lib/kubelet/kubeadm-flags.env"
[kubelet-start] Starting the kubelet
[kubelet-start] Waiting for the kubelet to perform the TLS Bootstrap...

This node has joined the cluster:
* Certificate signing request was sent to apiserver and a response was received.
* The Kubelet was informed of the new secure connection details.

Run 'kubectl get nodes' on the control-plane to see this node join the cluster.
[root@centos7 ~]# kubectl get nodes -o wide
NAME      STATUS   ROLES    AGE     VERSION   INTERNAL-IP    EXTERNAL-IP   OS-IMAGE                   KERNEL-VERSION                CONTAINER-RUNTIME
bogon     Ready    <none>   6m42s   v1.18.1   10.10.16.81    <none>        CentOS Linux 7 (AltArch)   4.14.0-115.8.1.el7a.aarch64   docker://20.10.7
centos7   Ready    master   27m     v1.18.1   10.10.16.251   <none>        CentOS Linux 7 (AltArch)   4.14.0-115.el7a.0.1.aarch64   docker://1.13.1
cloud     Ready    master   26m     v1.21.1   10.10.16.47    <none>        Ubuntu 18.04.3 LTS         5.5.19-050519-generic         docker://19.3.13
ubuntu    Ready    master   43m     v1.18.1   10.10.16.82    <none>        Ubuntu 18.04.3 LTS         5.0.0-23-generic              containerd://1.3.7
Every master sees the same set of pods
root@ubuntu:/etc/haproxy# kubectl get pods -o wide -n kube-system NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES coredns-546565776c-4b95r 0/1 ContainerCreating 0 30m <none> ubuntu <none> <none> coredns-546565776c-xfwr5 0/1 ContainerCreating 0 30m <none> ubuntu <none> <none> kube-apiserver-centos7 1/1 Running 0 21m 10.10.16.251 centos7 <none> <none> kube-apiserver-cloud 1/1 Running 0 20m 10.10.16.47 cloud <none> <none> kube-apiserver-ubuntu 1/1 Running 0 37m 10.10.16.82 ubuntu <none> <none> kube-controller-manager-centos7 1/1 Running 0 21m 10.10.16.251 centos7 <none> <none> kube-controller-manager-cloud 1/1 Running 0 20m 10.10.16.47 cloud <none> <none> kube-controller-manager-ubuntu 1/1 Running 5 38m 10.10.16.82 ubuntu <none> <none> kube-flannel-ds-arm64-88gxp 0/1 CrashLoopBackOff 9 23m 10.10.16.251 centos7 <none> <none> kube-flannel-ds-arm64-czn2z 0/1 Init:0/1 0 2m12s 10.10.16.81 bogon <none> <none> kube-flannel-ds-arm64-w9t92 0/1 CrashLoopBackOff 9 22m 10.10.16.47 cloud <none> <none> kube-flannel-ds-arm64-xg9s8 0/1 CrashLoopBackOff 9 24m 10.10.16.82 ubuntu <none> <none> kube-proxy-4bg7x 1/1 Running 0 23m 10.10.16.251 centos7 <none> <none> kube-proxy-c9jvr 1/1 Running 0 2m9s 10.10.16.81 bogon <none> <none> kube-proxy-jtrkp 1/1 Running 0 22m 10.10.16.47 cloud <none> <none> kube-proxy-m8s4m 1/1 Running 0 32m 10.10.16.82 ubuntu <none> <none> kube-scheduler-centos7 1/1 Running 0 21m 10.10.16.251 centos7 <none> <none> kube-scheduler-cloud 1/1 Running 0 20m 10.10.16.47 cloud <none> <none> kube-scheduler-ubuntu 1/1 Running 5 37m 10.10.16.82 ubuntu <none> <none> root@ubuntu:/etc/haproxy#
[root@centos7 ~]# kubectl get pods -o wide -n kube-system NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES coredns-546565776c-4b95r 0/1 ContainerCreating 0 31m <none> ubuntu <none> <none> coredns-546565776c-xfwr5 0/1 ContainerCreating 0 32m <none> ubuntu <none> <none> kube-apiserver-centos7 1/1 Running 0 22m 10.10.16.251 centos7 <none> <none> kube-apiserver-cloud 1/1 Running 0 21m 10.10.16.47 cloud <none> <none> kube-apiserver-ubuntu 1/1 Running 0 38m 10.10.16.82 ubuntu <none> <none> kube-controller-manager-centos7 1/1 Running 0 22m 10.10.16.251 centos7 <none> <none> kube-controller-manager-cloud 1/1 Running 0 21m 10.10.16.47 cloud <none> <none> kube-controller-manager-ubuntu 1/1 Running 5 39m 10.10.16.82 ubuntu <none> <none> kube-flannel-ds-arm64-88gxp 0/1 CrashLoopBackOff 9 24m 10.10.16.251 centos7 <none> <none> kube-flannel-ds-arm64-czn2z 0/1 Error 2 3m16s 10.10.16.81 bogon <none> <none> kube-flannel-ds-arm64-w9t92 0/1 CrashLoopBackOff 9 23m 10.10.16.47 cloud <none> <none> kube-flannel-ds-arm64-xg9s8 0/1 CrashLoopBackOff 9 25m 10.10.16.82 ubuntu <none> <none> kube-proxy-4bg7x 1/1 Running 0 24m 10.10.16.251 centos7 <none> <none> kube-proxy-c9jvr 1/1 Running 0 3m13s 10.10.16.81 bogon <none> <none> kube-proxy-jtrkp 1/1 Running 0 23m 10.10.16.47 cloud <none> <none> kube-proxy-m8s4m 1/1 Running 0 33m 10.10.16.82 ubuntu <none> <none> kube-scheduler-centos7 1/1 Running 0 22m 10.10.16.251 centos7 <none> <none> kube-scheduler-cloud 1/1 Running 0 21m 10.10.16.47 cloud <none> <none> kube-scheduler-ubuntu 1/1 Running 5 38m 10.10.16.82 ubuntu <none> <none>
root@cloud:~# kubectl get pods -o wide -n kube-system NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES coredns-546565776c-4b95r 0/1 ContainerCreating 0 32m <none> ubuntu <none> <none> coredns-546565776c-xfwr5 0/1 ContainerCreating 0 32m <none> ubuntu <none> <none> kube-apiserver-centos7 1/1 Running 0 23m 10.10.16.251 centos7 <none> <none> kube-apiserver-cloud 1/1 Running 0 22m 10.10.16.47 cloud <none> <none> kube-apiserver-ubuntu 1/1 Running 0 39m 10.10.16.82 ubuntu <none> <none> kube-controller-manager-centos7 1/1 Running 0 23m 10.10.16.251 centos7 <none> <none> kube-controller-manager-cloud 1/1 Running 0 22m 10.10.16.47 cloud <none> <none> kube-controller-manager-ubuntu 1/1 Running 5 39m 10.10.16.82 ubuntu <none> <none> kube-flannel-ds-arm64-88gxp 0/1 CrashLoopBackOff 9 25m 10.10.16.251 centos7 <none> <none> kube-flannel-ds-arm64-czn2z 0/1 CrashLoopBackOff 3 4m 10.10.16.81 bogon <none> <none> kube-flannel-ds-arm64-w9t92 0/1 CrashLoopBackOff 9 23m 10.10.16.47 cloud <none> <none> kube-flannel-ds-arm64-xg9s8 0/1 CrashLoopBackOff 9 26m 10.10.16.82 ubuntu <none> <none> kube-proxy-4bg7x 1/1 Running 0 25m 10.10.16.251 centos7 <none> <none> kube-proxy-c9jvr 1/1 Running 0 3m57s 10.10.16.81 bogon <none> <none> kube-proxy-jtrkp 1/1 Running 0 23m 10.10.16.47 cloud <none> <none> kube-proxy-m8s4m 1/1 Running 0 34m 10.10.16.82 ubuntu <none> <none> kube-scheduler-centos7 1/1 Running 0 23m 10.10.16.251 centos7 <none> <none> kube-scheduler-cloud 1/1 Running 0 22m 10.10.16.47 cloud <none> <none> kube-scheduler-ubuntu 1/1 Running 5 39m 10.10.16.82 ubuntu <none> <none> root@cloud:~#
K8s leader election
The scheduler and controller-manager take part in Pod scheduling and in managing all kinds of concrete resources. If several controller-managers acted on the same Pod resources at the same time, the result would be a mess, so how does Kubernetes keep everything running correctly? Kubernetes exposes its functionality through services, and each service maps to concrete endpoints, so let's look at what the endpoints of the scheduler and controller-manager actually are:
[root@centos7 ~]# kubectl -n kube-system describe endpoints kube-scheduler
Name:         kube-scheduler
Namespace:    kube-system
Labels:       <none>
Annotations:  control-plane.alpha.kubernetes.io/leader: {"holderIdentity":"cloud_67d2e0f4-9461-4105-819a-6c4c5c0e211c","leaseDurationSeconds":15,"acquireTime":"2021-06-30T09:42:33Z","renewTime":...
Subsets:
Events:
  Type    Reason          Age   From               Message
  Normal  LeaderElection  46m   default-scheduler  ubuntu_6e41aa59-1362-41bc-9670-f540218f4001 became leader
  Normal  LeaderElection  45m   default-scheduler  ubuntu_b48bb455-1df3-484f-a603-8f07af14ca9e became leader
  Normal  LeaderElection  42m   default-scheduler  ubuntu_2a65880e-36b9-4361-ad23-a5f4626353d3 became leader
  Normal  LeaderElection  39m   default-scheduler  ubuntu_d0b9359f-9441-4330-b2c0-db77ba6abb84 became leader
  Normal  LeaderElection  36m   default-scheduler  ubuntu_ac5ddc82-14d2-43db-9501-7578382bee79 became leader
  Normal  LeaderElection  33m   default-scheduler  ubuntu_c6fb755b-8532-4ae4-8e9d-42663f054077 became leader
  Normal  LeaderElection  64s   default-scheduler  ubuntu_c6fb755b-8532-4ae4-8e9d-42663f054077 stopped leading
  Normal  LeaderElection  42s   default-scheduler  centos7_e6c965d7-7873-45c5-92e1-2a7248f429d5 became leader
  Normal  LeaderElection  4s    default-scheduler  cloud_67d2e0f4-9461-4105-819a-6c4c5c0e211c became leader
[root@centos7 ~]# kubectl get pod -n kube-system | grep ubuntu_6e41aa59-1362-41bc-9670-f540218f4001
[root@centos7 ~]# kubectl get pod -n kube-system | grep 6e41aa59-1362-41bc-9670-f540218f4001
[root@centos7 ~]#
Note the annotation key control-plane.alpha.kubernetes.io/leader: these two components use leader election to pick one instance out of the cluster to actually do the work. If we look at the manifests under /etc/kubernetes/manifests/, we can find the flag that enables this:
[root@centos7 ~]# ls /etc/kubernetes/manifests/
kube-apiserver.yaml  kube-controller-manager.yaml  kube-scheduler.yaml
[root@centos7 ~]#
[root@centos7 ~]# cat /etc/kubernetes/manifests/kube-scheduler.yaml | grep leader-elec
    - --leader-elect=true
[root@centos7 ~]#
Adding --leader-elect=true in the manifest is what turns on the leader-election logic, and kubeadm sets this flag automatically when it deploys the cluster, so no manual configuration is needed.
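The current leader can also be read straight from the annotation instead of going through describe; one way is a jsonpath query using the bracket syntax for keys that contain dots (a sketch, same annotation key as above):

kubectl -n kube-system get endpoints kube-scheduler \
  -o jsonpath="{.metadata.annotations['control-plane\.alpha\.kubernetes\.io/leader']}"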
flannel
Remove it:
root@ubuntu:~# kubectl delete -f kube-flannel.yml
podsecuritypolicy.policy "psp.flannel.unprivileged" deleted
clusterrole.rbac.authorization.k8s.io "flannel" deleted
clusterrolebinding.rbac.authorization.k8s.io "flannel" deleted
serviceaccount "flannel" deleted
configmap "kube-flannel-cfg" deleted
root@ubuntu:/etc/haproxy# kubectl get pods -o wide -n kube-system NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES coredns-546565776c-4b95r 0/1 ContainerCreating 0 54m <none> ubuntu <none> <none> coredns-546565776c-xfwr5 0/1 ContainerCreating 0 54m <none> ubuntu <none> <none> kube-apiserver-centos7 1/1 Running 0 45m 10.10.16.251 centos7 <none> <none> kube-apiserver-cloud 1/1 Running 0 44m 10.10.16.47 cloud <none> <none> kube-apiserver-ubuntu 1/1 Running 0 61m 10.10.16.82 ubuntu <none> <none> kube-controller-manager-centos7 1/1 Running 3 45m 10.10.16.251 centos7 <none> <none> kube-controller-manager-cloud 1/1 Running 3 44m 10.10.16.47 cloud <none> <none> kube-controller-manager-ubuntu 1/1 Running 8 62m 10.10.16.82 ubuntu <none> <none> kube-flannel-ds-arm64-88gxp 0/1 CrashLoopBackOff 13 47m 10.10.16.251 centos7 <none> <none> kube-flannel-ds-arm64-czn2z 0/1 CrashLoopBackOff 9 26m 10.10.16.81 bogon <none> <none> kube-flannel-ds-arm64-w9t92 0/1 CrashLoopBackOff 13 46m 10.10.16.47 cloud <none> <none> kube-flannel-ds-arm64-xg9s8 0/1 CrashLoopBackOff 14 48m 10.10.16.82 ubuntu <none> <none> kube-proxy-4bg7x 1/1 Running 0 47m 10.10.16.251 centos7 <none> <none> kube-proxy-c9jvr 1/1 Running 0 26m 10.10.16.81 bogon <none> <none> kube-proxy-jtrkp 1/1 Running 0 46m 10.10.16.47 cloud <none> <none> kube-proxy-m8s4m 1/1 Running 0 56m 10.10.16.82 ubuntu <none> <none> kube-scheduler-centos7 1/1 Running 3 45m 10.10.16.251 centos7 <none> <none> kube-scheduler-cloud 1/1 Running 3 44m 10.10.16.47 cloud <none> <none> kube-scheduler-ubuntu 1/1 Running 7 61m 10.10.16.82 ubuntu <none> <none> root@ubuntu:/etc/haproxy# kubectl get pods -o wide -n kube-system NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES coredns-546565776c-4b95r 0/1 ContainerCreating 0 55m <none> ubuntu <none> <none> coredns-546565776c-xfwr5 0/1 ContainerCreating 0 55m <none> ubuntu <none> <none> kube-apiserver-centos7 1/1 Running 0 46m 10.10.16.251 centos7 <none> <none> kube-apiserver-cloud 1/1 Running 0 45m 10.10.16.47 cloud <none> <none> kube-apiserver-ubuntu 1/1 Running 0 62m 10.10.16.82 ubuntu <none> <none> kube-controller-manager-centos7 1/1 Running 3 46m 10.10.16.251 centos7 <none> <none> kube-controller-manager-cloud 1/1 Running 3 45m 10.10.16.47 cloud <none> <none> kube-controller-manager-ubuntu 1/1 Running 9 63m 10.10.16.82 ubuntu <none> <none> kube-proxy-4bg7x 1/1 Running 0 48m 10.10.16.251 centos7 <none> <none> kube-proxy-c9jvr 1/1 Running 0 27m 10.10.16.81 bogon <none> <none> kube-proxy-jtrkp 1/1 Running 0 47m 10.10.16.47 cloud <none> <none> kube-proxy-m8s4m 1/1 Running 0 57m 10.10.16.82 ubuntu <none> <none> kube-scheduler-centos7 1/1 Running 3 46m 10.10.16.251 centos7 <none> <none> kube-scheduler-cloud 1/1 Running 3 45m 10.10.16.47 cloud <none> <none> kube-scheduler-ubuntu 1/1 Running 8 62m 10.10.16.82 ubuntu <none> <none> root@ubuntu:/etc/haproxy#
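Before applying Calico, it is also worth clearing the CNI config that flannel left behind on each node so kubelet does not keep picking it up. A sketch, assuming the default flannel file name:

rm -f /etc/cni/net.d/10-flannel.conflist    # run on every node
ls /etc/cni/net.d/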
calico
root@ubuntu:~# kubectl apply -f rbac-kdd.yaml
clusterrole.rbac.authorization.k8s.io/calico-node created
etcdserver: request timed out
root@ubuntu:~# kubectl get pods -n kube-system NAME READY STATUS RESTARTS AGE calico-kube-controllers-5978c5f6b5-ghldp 0/1 ContainerCreating 0 16m calico-node-489j2 0/1 CrashLoopBackOff 7 16m calico-node-6vvpc 0/1 CrashLoopBackOff 7 16m calico-node-9pk6p 0/1 CrashLoopBackOff 7 16m calico-node-bzshf 1/1 Running 4 16m coredns-546565776c-4b95r 0/1 Running 0 17h coredns-546565776c-xfwr5 0/1 Running 1 17h kube-apiserver-centos7 1/1 Running 0 17h kube-apiserver-cloud 1/1 Running 0 17h kube-apiserver-ubuntu 1/1 Running 0 18h kube-controller-manager-centos7 0/1 CrashLoopBackOff 105 17h kube-controller-manager-cloud 0/1 CrashLoopBackOff 102 17h kube-controller-manager-ubuntu 0/1 CrashLoopBackOff 110 18h kube-proxy-4bg7x 1/1 Running 0 17h kube-proxy-c9jvr 1/1 Running 0 17h kube-proxy-jtrkp 1/1 Running 0 17h kube-proxy-m8s4m 1/1 Running 0 17h kube-scheduler-centos7 0/1 Error 106 17h kube-scheduler-cloud 0/1 CrashLoopBackOff 103 17h kube-scheduler-ubuntu 0/1 CrashLoopBackOff 103 18h root@ubuntu:~#
root@ubuntu:~# kubectl logs kube-scheduler-centos7 -n kube-system I0701 06:03:02.545799 1 registry.go:150] Registering EvenPodsSpread predicate and priority function I0701 06:03:02.545878 1 registry.go:150] Registering EvenPodsSpread predicate and priority function I0701 06:03:03.476897 1 serving.go:313] Generated self-signed cert in-memory I0701 06:03:25.644478 1 registry.go:150] Registering EvenPodsSpread predicate and priority function I0701 06:03:25.645525 1 registry.go:150] Registering EvenPodsSpread predicate and priority function W0701 06:03:25.649067 1 authorization.go:47] Authorization is disabled W0701 06:03:25.649106 1 authentication.go:40] Authentication is disabled I0701 06:03:25.649138 1 deprecated_insecure_serving.go:51] Serving healthz insecurely on [::]:10251 I0701 06:03:25.652068 1 configmap_cafile_content.go:202] Starting client-ca::kube-system::extension-apiserver-authentication::requestheader-client-ca-file I0701 06:03:25.652126 1 shared_informer.go:223] Waiting for caches to sync for client-ca::kube-system::extension-apiserver-authentication::requestheader-client-ca-file I0701 06:03:25.652165 1 configmap_cafile_content.go:202] Starting client-ca::kube-system::extension-apiserver-authentication::client-ca-file I0701 06:03:25.652289 1 shared_informer.go:223] Waiting for caches to sync for client-ca::kube-system::extension-apiserver-authentication::client-ca-file I0701 06:03:25.653086 1 secure_serving.go:178] Serving securely on 127.0.0.1:10259 I0701 06:03:25.653150 1 tlsconfig.go:240] Starting DynamicServingCertificateController I0701 06:03:25.752411 1 shared_informer.go:230] Caches are synced for client-ca::kube-system::extension-apiserver-authentication::requestheader-client-ca-file I0701 06:03:25.752477 1 shared_informer.go:230] Caches are synced for client-ca::kube-system::extension-apiserver-authentication::client-ca-file I0701 06:03:25.753337 1 leaderelection.go:242] attempting to acquire leader lease kube-system/kube-scheduler... 
E0701 06:03:34.354405 1 leaderelection.go:320] error retrieving resource lock kube-system/kube-scheduler: etcdserver: request timed out E0701 06:03:44.354967 1 leaderelection.go:320] error retrieving resource lock kube-system/kube-scheduler: Get https://10.10.16.249:6443/api/v1/namespaces/kube-system/endpoints/kube-scheduler?timeout=10s: context deadline exceeded (Client.Timeout exceeded while awaiting headers) E0701 06:03:58.380840 1 leaderelection.go:320] error retrieving resource lock kube-system/kube-scheduler: Get https://10.10.16.249:6443/api/v1/namespaces/kube-system/endpoints/kube-scheduler?timeout=10s: net/http: request canceled (Client.Timeout exceeded while awaiting headers) E0701 06:04:12.688606 1 leaderelection.go:356] Failed to update lock: resource name may not be empty E0701 06:04:33.681427 1 leaderelection.go:320] error retrieving resource lock kube-system/kube-scheduler: Get https://10.10.16.249:6443/apis/coordination.k8s.io/v1/namespaces/kube-system/leases/kube-scheduler?timeout=10s: context deadline exceeded (Client.Timeout exceeded while awaiting headers) I0701 06:04:38.103083 1 leaderelection.go:252] successfully acquired lease kube-system/kube-scheduler E0701 06:05:23.387441 1 leaderelection.go:320] error retrieving resource lock kube-system/kube-scheduler: Get https://10.10.16.249:6443/api/v1/namespaces/kube-system/endpoints/kube-scheduler?timeout=10s: context deadline exceeded (Client.Timeout exceeded while awaiting headers) I0701 06:05:23.387558 1 leaderelection.go:277] failed to renew lease kube-system/kube-scheduler: timed out waiting for the condition F0701 06:05:23.387597 1 server.go:244] leaderelection lost root@ubuntu:~#
root@ubuntu:~/cfssl/etcd# kubectl get cs
NAME                 STATUS      MESSAGE                                                                                     ERROR
controller-manager   Unhealthy   Get http://127.0.0.1:10252/healthz: dial tcp 127.0.0.1:10252: connect: connection refused
scheduler            Unhealthy   Get http://127.0.0.1:10251/healthz: dial tcp 127.0.0.1:10251: connect: connection refused
etcd-1               Unhealthy   HTTP probe failed with statuscode: 503
etcd-0               Unhealthy   HTTP probe failed with statuscode: 503
etcd-2               Unhealthy   HTTP probe failed with statuscode: 503
root@ubuntu:~/cfssl/etcd# kubectl cluster-info
Kubernetes master is running at https://10.10.16.249:6443
KubeDNS is running at https://10.10.16.249:6443/api/v1/namespaces/kube-system/services/kube-dns:dns/proxy

To further debug and diagnose cluster problems, use 'kubectl cluster-info dump'.
root@ubuntu:~/cfssl/etcd#
root@ubuntu:~/cfssl/etcd# ETCDCTL_API=3 ./etcdctl --cacert ./etcd/ca.pem --cert ./etcd/server.pem --key ./etcd/server-key.pem --endpoints=https://10.10.18.42:2379,https://10.10.18.43:2379,https://10.10.18.44:2379 endpoint health
https://10.10.18.43:2379 is healthy: successfully committed proposal: took = 47.926029ms
https://10.10.18.44:2379 is healthy: successfully committed proposal: took = 47.95631ms
https://10.10.18.42:2379 is healthy: successfully committed proposal: took = 732.843088ms
root@ubuntu:~/cfssl/etcd#
root@ubuntu:~/cfssl/etcd# kubectl get endpoints kube-scheduler --namespace=kube-system -o yaml
apiVersion: v1
kind: Endpoints
metadata:
  annotations:
    control-plane.alpha.kubernetes.io/leader: '{"holderIdentity":"cloud_7d377bb4-928f-43df-af28-1d2060d0cb3d","leaseDurationSeconds":15,"acquireTime":"2021-07-01T03:35:41Z","renewTime":"2021-07-01T03:39:31Z","leaderTransitions":391}'
  creationTimestamp: "2021-06-30T04:03:33Z"
  managedFields:
  - apiVersion: v1
    fieldsType: FieldsV1
    fieldsV1:
      f:metadata:
        f:annotations:
          .: {}
          f:control-plane.alpha.kubernetes.io/leader: {}
    manager: kube-scheduler
    operation: Update
    time: "2021-07-01T03:39:31Z"
  name: kube-scheduler
  namespace: kube-system
  resourceVersion: "117994"
  selfLink: /api/v1/namespaces/kube-system/endpoints/kube-scheduler
  uid: b04f734c-b252-4a32-958c-8fd2ba19be7b
etcd: read-only range request took too long to execute
journalctl -xeu etcd
Jul 01 11:43:18 host-10-10-18-42 etcd[21504]: request "header:<ID:18202839804396473573 > txn:<compare:<target:MOD key:"/registry/services/endpoints/kube-system/kube-controller-manager" mod_revision:118388 > success:<request_put:<key:"/registry/servi Jul 01 11:43:21 host-10-10-18-42 etcd[21504]: timed out waiting for read index response (local node might have slow network) Jul 01 11:43:21 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/services/specs/default/kubernetes" " with result "error:etcdserver: request timed out" took too long (7.00031982s) to execute Jul 01 11:43:22 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/storageclasses" range_end:"/registry/storageclasset" count_only:true " with result "range_response_count:0 size:6" took too long (4.82138998s) to execute Jul 01 11:43:22 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/leases/kube-system/kube-scheduler" " with result "range_response_count:1 size:480" took too long (8.6141478s) to execute Jul 01 11:43:22 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/volumeattachments" range_end:"/registry/volumeattachmentt" count_only:true " with result "range_response_count:0 size:6" took too long (3.10040272s) to execute Jul 01 11:43:22 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/validatingwebhookconfigurations" range_end:"/registry/validatingwebhookconfigurationt" count_only:true " with result "range_response_count:0 size:6" took too long Jul 01 11:43:22 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/services/endpoints/kube-system/kube-scheduler" " with result "range_response_count:1 size:577" took too long (6.4749108s) to execute Jul 01 11:43:22 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/events" range_end:"/registry/eventt" count_only:true " with result "range_response_count:0 size:9" took too long (3.22013804s) to execute Jul 01 11:43:22 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/health" " with result "range_response_count:0 size:6" took too long (107.31128ms) to execute Jul 01 11:43:22 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/persistentvolumes" range_end:"/registry/persistentvolumet" count_only:true " with result "range_response_count:0 size:6" took too long (590.10484ms) to execute Jul 01 11:43:22 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/pods" range_end:"/registry/podt" count_only:true " with result "range_response_count:0 size:8" took too long (870.81538ms) to execute Jul 01 11:43:22 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/services/endpoints/kube-system/kube-controller-manager" " with result "range_response_count:1 size:605" took too long (1.01391092s) to execute Jul 01 11:43:23 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/events/kube-system/kube-apiserver-centos7.168d5129b54e2e25" " with result "range_response_count:1 size:793" took too long (679.77894ms) to execute Jul 01 11:43:23 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/ingressclasses" range_end:"/registry/ingressclasset" count_only:true " with result "range_response_count:0 size:6" took too long (384.0858ms) to execute Jul 01 11:43:23 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/validatingwebhookconfigurations" range_end:"/registry/validatingwebhookconfigurationt" count_only:true " with result "range_response_count:0 size:6" 
took too long Jul 01 11:43:23 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/clusterrolebindings" range_end:"/registry/clusterrolebindingt" count_only:true " with result "range_response_count:0 size:8" took too long (308.35244ms) to execut
Check which etcd node is the leader
ETCDCTL_API=3 ./etcdctl --cacert ./etcd/ca.pem --cert ./etcd/server.pem --key ./etcd/server-key.pem --endpoints=https://10.10.18.42:2379,https://10.10.18.43:2379,https://10.10.18.44:2379 endpoint status --write-out=table
+--------------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+
|         ENDPOINT         |        ID        | VERSION | DB SIZE | IS LEADER | IS LEARNER | RAFT TERM | RAFT INDEX | RAFT APPLIED INDEX | ERRORS |
+--------------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+
| https://10.10.18.42:2379 | b7ea7727f764ab2c |  3.3.11 |   14 MB |     false |      false |      1687 |     198120 |                  0 |        |
| https://10.10.18.43:2379 | 596ace533af60ec1 |  3.3.11 |   14 MB |     false |      false |      1687 |     198123 |                  0 |        |
| https://10.10.18.44:2379 | 59d7c74cdcbcfc9d |  3.3.11 |   14 MB |      true |      false |      1687 |     198125 |                  0 |        |
+--------------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+
Jul 01 14:57:15 host-10-10-18-44 etcd[30050]: server is likely overloaded
Jul 01 14:57:15 host-10-10-18-44 etcd[30050]: failed to send out heartbeat on time (exceeded the 100ms timeout for 248.28942ms)
Jul 01 14:57:15 host-10-10-18-44 etcd[30050]: server is likely overloaded
Jul 01 14:57:16 host-10-10-18-44 etcd[30050]: failed to send out heartbeat on time (exceeded the 100ms timeout for 307.05404ms)
Jul 01 14:57:16 host-10-10-18-44 etcd[30050]: server is likely overloaded
Jul 01 14:57:16 host-10-10-18-44 etcd[30050]: failed to send out heartbeat on time (exceeded the 100ms timeout for 307.15236ms)
Jul 01 14:57:16 host-10-10-18-44 etcd[30050]: server is likely overloaded
Jul 01 14:57:17 host-10-10-18-44 etcd[30050]: failed to send out heartbeat on time (exceeded the 100ms timeout for 429.77822ms)
Jul 01 14:57:17 host-10-10-18-44 etcd[30050]: server is likely overloaded
Jul 01 14:57:17 host-10-10-18-44 etcd[30050]: failed to send out heartbeat on time (exceeded the 100ms timeout for 429.86552ms)
Jul 01 14:57:17 host-10-10-18-44 etcd[30050]: server is likely overloaded
Jul 01 14:57:17 host-10-10-18-44 etcd[30050]: failed to revoke 2b2c7a6035ec5a2f ("lease not found")
Jul 01 14:57:17 host-10-10-18-44 etcd[30050]: failed to revoke 2b2c7a6035ec5a2f ("lease not found")
Jul 01 14:57:17 host-10-10-18-44 etcd[30050]: failed to revoke 2b2c7a6035ec5a2d ("lease not found")
Jul 01 14:57:17 host-10-10-18-44 etcd[30050]: failed to revoke 2b2c7a6035ec5a2f ("lease not found")
Jul 01 14:57:17 host-10-10-18-44 etcd[30050]: failed to revoke 2b2c7a6035ec5a2d ("lease not found")
Jul 01 14:57:17 host-10-10-18-44 etcd[30050]: failed to revoke 2b2c7a6035ec5a2d ("lease not found")
root@ubuntu:~/cfssl/etcd# ETCDCTL_API=3 ./etcdctl --cacert ./etcd/ca.pem --cert ./etcd/server.pem --key ./etcd/server-key.pem --endpoints=https://10.10.18.42:2379,https://10.10.18.43:2379,https://10.10.18.44:2379 alarm list
root@ubuntu:~/cfssl/etcd#
Fix
On the etcd nodes, edit /etc/etcd/etcd.conf and add the following:
6-second heartbeat interval:
ETCD_HEARTBEAT_INTERVAL=6000
ETCD_ELECTION_TIMEOUT=30000
The election-timeout should be at least 5x the heartbeat-interval.
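A sketch of applying the change on one etcd node, to be repeated on all three (the ETCD_* variable names match the etcd.conf style above; the systemd unit name is an assumption, adjust to whichever unit runs etcd on these hosts):

cat >> /etc/etcd/etcd.conf << 'EOF'
ETCD_HEARTBEAT_INTERVAL=6000
ETCD_ELECTION_TIMEOUT=30000
EOF
systemctl restart etcd      # or rio-etcd, depending on the host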
Increase the etcd storage quota from 2 GB to 8 GB by adding the following three parameters:
vi /etc/systemd/system/rio-etcd.service
## the auto-compaction-retention parameter is in hours
--auto-compaction-mode=revision --auto-compaction-retention=24 --quota-backend-bytes=8589934592
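A hedged sketch of where those flags end up; the unit name rio-etcd.service comes from the path above, and the existing ExecStart layout is an assumption:

# /etc/systemd/system/rio-etcd.service (excerpt, assumed layout)
# [Service]
# ExecStart=/usr/local/bin/etcd \
#     ...existing flags... \
#     --auto-compaction-mode=revision \
#     --auto-compaction-retention=24 \
#     --quota-backend-bytes=8589934592
systemctl daemon-reload
systemctl restart rio-etcd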
root@ubuntu:~# kubectl get pods -n kube-system NAME READY STATUS RESTARTS AGE calico-kube-controllers-5978c5f6b5-ghldp 0/1 ContainerCreating 0 73m calico-node-489j2 0/1 CrashLoopBackOff 23 73m calico-node-6vvpc 0/1 Running 23 73m calico-node-9pk6p 0/1 CrashLoopBackOff 23 73m calico-node-bzshf 1/1 Running 4 73m coredns-546565776c-4b95r 0/1 Running 0 18h coredns-546565776c-xfwr5 0/1 Running 1 18h kube-apiserver-centos7 1/1 Running 0 18h kube-apiserver-cloud 1/1 Running 0 18h kube-apiserver-ubuntu 1/1 Running 0 19h kube-controller-manager-centos7 1/1 Running 110 18h kube-controller-manager-cloud 1/1 Running 106 18h kube-controller-manager-ubuntu 1/1 Running 117 19h kube-proxy-4bg7x 1/1 Running 0 18h kube-proxy-c9jvr 1/1 Running 0 18h kube-proxy-jtrkp 1/1 Running 0 18h kube-proxy-m8s4m 1/1 Running 0 18h kube-scheduler-centos7 1/1 Running 112 18h kube-scheduler-cloud 1/1 Running 110 18h kube-scheduler-ubuntu 1/1 Running 108 19h root@ubuntu:~#
root@ubuntu:~/cfssl/etcd# kubectl get cs
NAME                 STATUS    MESSAGE             ERROR
controller-manager   Healthy   ok
scheduler            Healthy   ok
etcd-2               Healthy   {"health":"true"}
etcd-1               Healthy   {"health":"true"}
etcd-0               Healthy   {"health":"true"}
root@ubuntu:~/cfssl/etcd#
root@ubuntu:~# kubectl logs calico-node-9pk6p -n kube-system
2021-07-01 03:56:42.924 [INFO][9] startup/startup.go 390: Early log level set to info
2021-07-01 03:56:42.924 [INFO][9] startup/startup.go 406: Using NODENAME environment for node name bogon
2021-07-01 03:56:42.924 [INFO][9] startup/startup.go 418: Determined node name: bogon
2021-07-01 03:56:42.924 [INFO][9] startup/startup.go 103: Starting node bogon with version v3.19.1
2021-07-01 03:56:42.927 [INFO][9] startup/startup.go 450: Checking datastore connection
2021-07-01 03:56:42.935 [INFO][9] startup/startup.go 465: Hit error connecting to datastore - retry error=Get "https://10.96.0.1:443/api/v1/nodes/foo": x509: certificate has expired or is not yet valid: current time 2021-07-01T03:56:42Z is after 2020-07-18T15:04:13Z
2021-07-01 03:56:43.943 [INFO][9] startup/startup.go 465: Hit error connecting to datastore - retry error=Get "https://10.96.0.1:443/api/v1/nodes/foo": x509: certificate has expired or is not yet valid: current time 2021-07-01T03:56:43Z is after 2020-07-18T15:04:13Z
2021-07-01 03:56:44.951 [INFO][9] startup/startup.go 465: Hit error connecting to datastore - retry error=Get "https://10.96.0.1:443/api/v1/nodes/foo": x509: certificate has expired or is not yet valid: current time 2021-07-01T03:56:44Z is after 2020-07-18T15:04:13Z
2021-07-01 03:56:45.959 [INFO][9] startup/startup.go 465: Hit error connecting to datastore - retry error=Get "https://10.96.0.1:443/api/v1/nodes/foo": x509: certificate has expired or is not yet valid: current time 2021-07-01T03:56:45Z is after 2020-07-18T15:04:13Z
2021-07-01 03:56:46.968 [INFO][9] startup/startup.go 465: Hit error connecting to datastore - retry error=Get "https://10.96.0.1:443/api/v1/nodes/foo": x509: certificate has expired or is not yet valid: current time 2021-07-01T03:56:46Z is after 2020-07-18T15:04:13Z
2021-07-01 03:56:47.976 [INFO][9] startup/startup.go 465: Hit error connecting to datastore - retry error=Get "https://10.96.0.1:443/api/v1/nodes/foo": x509: certificate has expired or is not yet valid: current time 2021-07-01T03:56:47Z is after 2020-07-18T15:04:13Z
2021-07-01 03:56:48.984 [INFO][9] startup/startup.go 465: Hit error connecting to datastore - retry error=Get "https://10.96.0.1:443/api/v1/nodes/foo": x509: certificate has expired or is not yet valid: current time 2021-07-01T03:56:48Z is after 2020-0
Calico etcd_endpoints setting
It does not need to be configured; the manifest used here has no etcd_endpoints entry at all:
root@ubuntu:~# cat calico.yaml | grep etcd_endpoints
root@ubuntu:~#
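Since etcd_endpoints is absent, this calico.yaml is presumably the variant that uses the Kubernetes API as its datastore. A quick hedged check of the datastore type declared in the same manifest:

grep -i -A1 "datastore_type" calico.yaml    # expect "kubernetes" for the API-datastore variant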
root@ubuntu:~# kubectl apply -f rbac-kdd.yaml clusterrole.rbac.authorization.k8s.io/calico-node created clusterrolebinding.rbac.authorization.k8s.io/calico-node created root@ubuntu:~# kubectl apply -f calico.yaml configmap/calico-config created customresourcedefinition.apiextensions.k8s.io/bgpconfigurations.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/bgppeers.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/blockaffinities.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/clusterinformations.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/felixconfigurations.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/globalnetworkpolicies.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/globalnetworksets.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/hostendpoints.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/ipamblocks.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/ipamconfigs.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/ipamhandles.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/ippools.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/kubecontrollersconfigurations.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/networkpolicies.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/networksets.crd.projectcalico.org created clusterrole.rbac.authorization.k8s.io/calico-kube-controllers created clusterrolebinding.rbac.authorization.k8s.io/calico-kube-controllers created clusterrole.rbac.authorization.k8s.io/calico-node configured clusterrolebinding.rbac.authorization.k8s.io/calico-node configured daemonset.apps/calico-node created serviceaccount/calico-node created deployment.apps/calico-kube-controllers created serviceaccount/calico-kube-controllers created poddisruptionbudget.policy/calico-kube-controllers created root@ubuntu:~# kubectl get pods -o wide -n kube-system NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES calico-kube-controllers-5978c5f6b5-xk6cq 0/1 ContainerCreating 0 12s <none> centos7 <none> <none> calico-node-6tc54 0/1 Running 0 12s 10.10.16.81 bogon <none> <none> calico-node-blvrv 0/1 Running 0 12s 10.10.16.82 ubuntu <none> <none> calico-node-nwpfl 0/1 Running 0 12s 10.10.16.47 cloud <none> <none> calico-node-rswtj 0/1 Running 0 12s 10.10.16.251 centos7 <none> <none> coredns-546565776c-82jfw 1/1 Running 0 4m40s 10.244.243.193 ubuntu <none> <none> coredns-546565776c-px8bd 0/1 ContainerCreating 0 4m40s <none> ubuntu <none> <none> kube-apiserver-centos7 1/1 Running 0 99s 10.10.16.251 centos7 <none> <none> kube-apiserver-cloud 1/1 Running 0 100s 10.10.16.47 cloud <none> <none> kube-apiserver-ubuntu 1/1 Running 0 4m49s 10.10.16.82 ubuntu <none> <none> kube-controller-manager-centos7 1/1 Running 0 100s 10.10.16.251 centos7 <none> <none> kube-controller-manager-cloud 1/1 Running 0 100s 10.10.16.47 cloud <none> <none> kube-controller-manager-ubuntu 1/1 Running 146 4m49s 10.10.16.82 ubuntu <none> <none> kube-proxy-6jk97 1/1 Running 0 4m40s 10.10.16.82 ubuntu <none> <none> kube-proxy-dpwh6 1/1 Running 0 100s 10.10.16.251 centos7 <none> <none> kube-proxy-wvdkr 1/1 Running 0 3m42s 10.10.16.81 bogon <none> <none> kube-proxy-xbdlt 1/1 Running 0 101s 10.10.16.47 cloud <none> <none> 
kube-scheduler-centos7 1/1 Running 0 99s 10.10.16.251 centos7 <none> <none> kube-scheduler-cloud 1/1 Running 0 100s 10.10.16.47 cloud <none> <none> kube-scheduler-ubuntu 1/1 Running 130 4m49s 10.10.16.82 ubuntu <none> <none> root@ubuntu:~#
root@ubuntu:~# kubectl get pods -o wide -n kube-system NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES calico-kube-controllers-5978c5f6b5-xk6cq 1/1 Running 0 2m33s 10.244.129.129 centos7 <none> <none> calico-node-6tc54 1/1 Running 0 2m33s 10.10.16.81 bogon <none> <none> calico-node-blvrv 1/1 Running 0 2m33s 10.10.16.82 ubuntu <none> <none> calico-node-nwpfl 1/1 Running 0 2m33s 10.10.16.47 cloud <none> <none> calico-node-rswtj 1/1 Running 0 2m33s 10.10.16.251 centos7 <none> <none> coredns-546565776c-82jfw 1/1 Running 0 7m1s 10.244.243.193 ubuntu <none> <none> coredns-546565776c-px8bd 1/1 Running 0 7m1s 10.244.243.194 ubuntu <none> <none> kube-apiserver-centos7 1/1 Running 0 4m 10.10.16.251 centos7 <none> <none> kube-apiserver-cloud 1/1 Running 0 4m1s 10.10.16.47 cloud <none> <none> kube-apiserver-ubuntu 1/1 Running 0 7m10s 10.10.16.82 ubuntu <none> <none> kube-controller-manager-centos7 1/1 Running 0 4m1s 10.10.16.251 centos7 <none> <none> kube-controller-manager-cloud 1/1 Running 0 4m1s 10.10.16.47 cloud <none> <none> kube-controller-manager-ubuntu 1/1 Running 146 7m10s 10.10.16.82 ubuntu <none> <none> kube-proxy-6jk97 1/1 Running 0 7m1s 10.10.16.82 ubuntu <none> <none> kube-proxy-dpwh6 1/1 Running 0 4m1s 10.10.16.251 centos7 <none> <none> kube-proxy-wvdkr 1/1 Running 0 6m3s 10.10.16.81 bogon <none> <none> kube-proxy-xbdlt 1/1 Running 0 4m2s 10.10.16.47 cloud <none> <none> kube-scheduler-centos7 1/1 Running 0 4m 10.10.16.251 centos7 <none> <none> kube-scheduler-cloud 1/1 Running 0 4m1s 10.10.16.47 cloud <none> <none> kube-scheduler-ubuntu 1/1 Running 130 7m10s 10.10.16.82 ubuntu <none> <none> root@ubuntu:~#
root@ubuntu:~# netstat -pan | grep 2379 | wc -l
500
root@ubuntu:~#
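Most of these connections to port 2379 should belong to the local kube-apiserver's watch connections against the external etcd cluster; a quick sketch to break them down by owning process:

netstat -pan | grep 2379 | awk '{print $NF}' | sort | uniq -c | sort -rn | head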