K8s cluster cleanup
kubeadm reset -f
root@ubuntu:~/cluster# ps -elf | grep kube
4 S root      8099  8071  4  80   0 - 161890 futex_ Jun18 ?        12:33:47 kube-apiserver --advertise-address=10.10.16.82 --allow-privileged=true --authorization-mode=Node,RBAC --client-ca-file=/etc/kubernetes/pki/ca.crt --enable-admission-plugins=NodeRestriction --enable-bootstrap-token-auth=true --etcd-cafile=/etc/kubernetes/pki/etcd/ca.crt --etcd-certfile=/etc/kubernetes/pki/apiserver-etcd-client.crt --etcd-keyfile=/etc/kubernetes/pki/apiserver-etcd-client.key --etcd-servers=https://127.0.0.1:2379 --insecure-port=0 --kubelet-client-certificate=/etc/kubernetes/pki/apiserver-kubelet-client.crt --kubelet-client-key=/etc/kubernetes/pki/apiserver-kubelet-client.key --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname --proxy-client-cert-file=/etc/kubernetes/pki/front-proxy-client.crt --proxy-client-key-file=/etc/kubernetes/pki/front-proxy-client.key --requestheader-allowed-names=front-proxy-client --requestheader-client-ca-file=/etc/kubernetes/pki/front-proxy-ca.crt --requestheader-extra-headers-prefix=X-Remote-Extra- --requestheader-group-headers=X-Remote-Group --requestheader-username-headers=X-Remote-User --secure-port=6443 --service-account-key-file=/etc/kubernetes/pki/sa.pub --service-cluster-ip-range=10.96.0.0/12 --tls-cert-file=/etc/kubernetes/pki/apiserver.crt --tls-private-key-file=/etc/kubernetes/pki/apiserver.key
0 S root     13537 36620  0  80   0 -   1096 pipe_w 16:36 pts/0    00:00:00 grep kube
4 S root     17548 17521  0  80   0 - 187659 futex_ Jun21 ?        00:07:47 /usr/bin/kube-controllers
4 S root     55252 55222  0  80   0 -  36362 ep_pol Jun25 ?        00:24:17 /usr/local/bin/kube-proxy --config=/var/lib/kube-proxy/config.conf --hostname-override=ubuntu
root@ubuntu:~/cluster# kill -9 8099 17548 55252
root@ubuntu:~/cluster#
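Even after kubeadm reset, stray kube processes (as above) and old CNI/kubelet state can linger. A minimal cleanup sketch for each node, assuming the usual default paths; adjust to your environment:

systemctl stop kubelet
rm -rf /etc/cni/net.d /var/lib/cni           # leftover CNI config and state
iptables -F && iptables -t nat -F && iptables -X
ipvsadm --clear 2>/dev/null || true          # only needed if kube-proxy ran in IPVS mode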
Initialize the first master node
HAProxy configuration
#---------------------------------------------------------------------
backend kube-apiserver
    mode tcp
    option tcplog
    option tcp-check
    balance roundrobin
    default-server inter 10s downinter 5s rise 2 fall 2 slowstart 60s maxconn 250 maxqueue 256 weight 100
    server ubuntu 10.10.16.82:6443 check
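Only the backend is shown here; it needs a matching frontend. The keepalived health check below probes port 9443, so a frontend consistent with that would look roughly like the sketch below (the bind address and port are assumptions, adjust to the actual haproxy.cfg):

frontend kube-apiserver-frontend
    mode tcp
    option tcplog
    bind *:9443
    default_backend kube-apiserver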
Keepalived VIP
root@ubuntu:/etc/haproxy# cat ../keepalived/keepalived.conf
global_defs {
   script_user root
   enable_script_security
}
vrrp_script chk_haproxy {
    script "/bin/bash -c 'if [[ $(netstat -nlp | grep 9443 | wc -l) ]]; then exit 0; else exit 1; fi'"   # haproxy health check
    interval 2      # run the check every 2 seconds
    #weight -10     # priority change on failure
}
vrrp_instance VI_1 {
    interface enahisic2i0    ### host NIC name
    state BACKUP
    virtual_router_id 61     # same id on all nodes: same virtual router group
    priority 80              # initial priority
    nopreempt                # do not preempt
    unicast_peer {
        10.10.16.47
        10.10.16.251
    }
    virtual_ipaddress {
        10.10.16.249         # vip
    }
    authentication {
        auth_type PASS
        auth_pass password
    }
    track_script {
        chk_haproxy
    }
    #notify "/container/service/keepalived/assets/"
}
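With keepalived running on each HAProxy node, the VIP should be held by exactly one node at a time. A quick verification sketch (interface name as in the config above):

ip addr show enahisic2i0 | grep 10.10.16.249      # VIP appears only on the current MASTER
systemctl status keepalived --no-pager | tail -n 5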
root@ubuntu:~# kubeadm config print init-defaults > kubeadm-init.yaml.yaml
W0630 16:17:15.326593    6239 configset.go:202] WARNING: kubeadm cannot validate component configs for API groups [kubelet.config.k8s.io kubeproxy.config.k8s.io]
root@ubuntu:~# kubeadm config images list
I0630 16:20:18.442685    7275 version.go:252] remote version is much newer: v1.21.2; falling back to: stable-1.18
W0630 16:20:19.242028    7275 configset.go:202] WARNING: kubeadm cannot validate component configs for API groups [kubelet.config.k8s.io kubeproxy.config.k8s.io]
k8s.gcr.io/kube-apiserver:v1.18.20
k8s.gcr.io/kube-controller-manager:v1.18.20
k8s.gcr.io/kube-scheduler:v1.18.20
k8s.gcr.io/kube-proxy:v1.18.20
k8s.gcr.io/pause:3.2
k8s.gcr.io/etcd:3.4.3-0
k8s.gcr.io/coredns:1.6.7
root@ubuntu:~#
root@ubuntu:~/cluster# kubeadm config images list | grep k8s.gcr.io/kube-apiserver:v1.18.20
I0630 16:47:23.769353   17086 version.go:252] remote version is much newer: v1.21.2; falling back to: stable-1.18
W0630 16:47:24.488215   17086 configset.go:202] WARNING: kubeadm cannot validate component configs for API groups [kubelet.config.k8s.io kubeproxy.config.k8s.io]
k8s.gcr.io/kube-apiserver:v1.18.20
root@ubuntu:~/cluster# kubeadm config images pull
I0630 16:48:28.388046   17364 version.go:252] remote version is much newer: v1.21.2; falling back to: stable-1.18
W0630 16:48:29.103399   17364 configset.go:202] WARNING: kubeadm cannot validate component configs for API groups [kubelet.config.k8s.io kubeproxy.config.k8s.io]
failed to pull image "k8s.gcr.io/kube-apiserver:v1.18.20": output: Error response from daemon: Get https://k8s.gcr.io/v2/: net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)
, error: exit status 1
To see the stack trace of this error execute with --v=5 or higher
root@ubuntu:~/cluster#
It turned out there was a mistake in the config:
kubernetesVersion: stable
Change it to:
kubernetesVersion: v1.18.0
root@ubuntu:~/cluster# cat clus.sh
proxy=10.10.16.249
etcd1=10.10.18.42
etcd2=10.10.18.43
etcd3=10.10.18.44
cat << EOF > kubeadm-config.yaml
apiVersion: kubeadm.k8s.io/v1beta2
kind: ClusterConfiguration
kubernetesVersion: v1.18.0
imageRepository: registry.cn-hangzhou.aliyuncs.com/google_containers
apiServer:
  certSANs:
  - "$proxy"
controlPlaneEndpoint: "$proxy:6443"
etcd:
  external:
    endpoints:
    - "https://$etcd1:2379"
    - "https://$etcd2:2379"
    - "https://$etcd3:2379"
    caFile: /opt/etcd/ssl/ca.pem
    certFile: /opt/etcd/ssl/server.pem
    keyFile: /opt/etcd/ssl/server-key.pem
networking:
  podSubnet: "10.244.0.0/16"
EOF
kubeadm init --config kubeadm-config.yaml
You can now join any number of control-plane nodes by copying certificate authorities
and service account keys on each node and then running the following as root:

  kubeadm join 10.10.16.249:6443 --token qiojkb.svprbcf7fd7k8m4s \
    --discovery-token-ca-cert-hash sha256:dd --control-plane

Then you can join any number of worker nodes by running the following on each as root:

kubeadm join 10.10.16.249:6443 --token qiojkb.svprbcf7fd7k8m4s \
    --discovery-token-ca-cert-hash sha256:dd
Run:
mkdir -p $HOME/.kube
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $(id -u):$(id -g) $HOME/.kube/config
root@ubuntu:~# kubectl get nodes
NAME     STATUS   ROLES    AGE     VERSION
ubuntu   Ready    master   6m59s   v1.18.1
root@ubuntu:~#
kubectl get pods -o wide -n kube-system NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES coredns-546565776c-4b95r 0/1 ContainerCreating 0 3m9s <none> ubuntu <none> <none> coredns-546565776c-xfwr5 0/1 ContainerCreating 0 3m21s <none> ubuntu <none> <none> kube-apiserver-ubuntu 1/1 Running 0 10m 10.10.16.82 ubuntu <none> <none> kube-controller-manager-ubuntu 1/1 Running 4 10m 10.10.16.82 ubuntu <none> <none> kube-flannel-ds-arm64-2js47 0/1 CrashLoopBackOff 5 5m1s 10.10.16.82 ubuntu <none> <none> kube-proxy-m8s4m 1/1 Running 0 5m 10.10.16.82 ubuntu <none> <none> kube-scheduler-ubuntu 1/1 Running 4 10m 10.10.16.82 ubuntu <none> <none>
There is no pause container yet, and coredns is stuck in ContainerCreating.
root@ubuntu:/etc/kubernetes/pki# kubectl delete pod kube-flannel-ds-arm64-2js47 -n kube-system pod "kube-flannel-ds-arm64-2js47" deleted root@ubuntu:/etc/kubernetes/pki# kubectl get pods -o wide -n kube-system NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES coredns-546565776c-4b95r 0/1 ContainerCreating 0 6m17s <none> ubuntu <none> <none> coredns-546565776c-xfwr5 0/1 ContainerCreating 0 6m29s <none> ubuntu <none> <none> kube-apiserver-ubuntu 1/1 Running 0 13m 10.10.16.82 ubuntu <none> <none> kube-controller-manager-ubuntu 1/1 Running 5 13m 10.10.16.82 ubuntu <none> <none> kube-proxy-m8s4m 1/1 Running 0 8m8s 10.10.16.82 ubuntu <none> <none> kube-scheduler-ubuntu 1/1 Running 5 13m 10.10.16.82 ubuntu <none> <none>
Join the other master nodes
Copy the certificates and keys
scp ca.* sa.* front-proxy-ca.* root@10.10.16.251:/etc/kubernetes/pki/
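The same material also goes to the third master. A small sketch that copies the shared CA, service-account and front-proxy files to both remaining control-plane nodes (assumes /etc/kubernetes/pki already exists on the targets; the external etcd certs under /opt/etcd/ssl have to be distributed separately):

for host in 10.10.16.47 10.10.16.251; do
  ssh root@$host "mkdir -p /etc/kubernetes/pki"
  scp /etc/kubernetes/pki/ca.* /etc/kubernetes/pki/sa.* /etc/kubernetes/pki/front-proxy-ca.* root@$host:/etc/kubernetes/pki/
done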
Run the join command
[root@centos7 ~]# kubeadm join 10.10.16.249:6443 --token qiojkb.svprbcf7fd7k8m4s --discovery-token-ca-cert-hash sha256:dd30de5ae3a2006842ae01c6bf4 --control-plane
[preflight] Running pre-flight checks
        [WARNING Hostname]: hostname "centos7" could not be reached
        [WARNING Hostname]: hostname "centos7": lookup centos7 on 8.8.8.8:53: no such host
[preflight] Reading configuration from the cluster...
Configure kubectl
To start administering your cluster from this node, you need to run the following as a regular user:

    mkdir -p $HOME/.kube
    sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
    sudo chown $(id -u):$(id -g) $HOME/.kube/config

Run 'kubectl get nodes' to see this node join the cluster.

[root@centos7 ~]# mkdir -p $HOME/.kube
[root@centos7 ~]# cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
[root@centos7 ~]# chown $(id -u):$(id -g) $HOME/.kube/config
[root@centos7 ~]#
[root@centos7 ~]# kubectl get pods -o wide -n kube-system NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES coredns-546565776c-4b95r 0/1 ContainerCreating 0 10m <none> ubuntu <none> <none> coredns-546565776c-xfwr5 0/1 ContainerCreating 0 10m <none> ubuntu <none> <none> kube-apiserver-centos7 1/1 Running 0 101s 10.10.16.251 centos7 <none> <none> kube-apiserver-cloud 1/1 Running 0 44s 10.10.16.47 cloud <none> <none> kube-apiserver-ubuntu 1/1 Running 0 17m 10.10.16.82 ubuntu <none> <none> kube-controller-manager-centos7 1/1 Running 0 98s 10.10.16.251 centos7 <none> <none> kube-controller-manager-cloud 1/1 Running 0 33s 10.10.16.47 cloud <none> <none> kube-controller-manager-ubuntu 1/1 Running 5 17m 10.10.16.82 ubuntu <none> <none> kube-flannel-ds-arm64-88gxp 0/1 CrashLoopBackOff 4 3m6s 10.10.16.251 centos7 <none> <none> kube-flannel-ds-arm64-w9t92 0/1 CrashLoopBackOff 4 2m 10.10.16.47 cloud <none> <none> kube-flannel-ds-arm64-xg9s8 0/1 CrashLoopBackOff 5 4m6s 10.10.16.82 ubuntu <none> <none> kube-proxy-4bg7x 1/1 Running 0 3m7s 10.10.16.251 centos7 <none> <none> kube-proxy-jtrkp 1/1 Running 0 2m1s 10.10.16.47 cloud <none> <none> kube-proxy-m8s4m 1/1 Running 0 12m 10.10.16.82 ubuntu <none> <none> kube-scheduler-centos7 1/1 Running 0 101s 10.10.16.251 centos7 <none> <none> kube-scheduler-cloud 1/1 Running 0 33s 10.10.16.47 cloud <none> <none> kube-scheduler-ubuntu 1/1 Running 5 17m 10.10.16.82 ubuntu <none> <none> [root@centos7 ~]#
Update the HAProxy configuration
backend kube-apiserver
    mode tcp
    option tcplog
    option tcp-check
    balance roundrobin
    default-server inter 10s downinter 5s rise 2 fall 2 slowstart 60s maxconn 250 maxqueue 256 weight 100
    server ubuntu 10.10.16.82:6443 check
    server cloud 10.10.16.47:6443 check
    server centos7 10.10.16.251:6443 check
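After reloading HAProxy, a quick sketch to confirm all three apiserver backends are reachable from the proxy node (plain bash, no extra tools assumed):

systemctl reload haproxy
for ip in 10.10.16.82 10.10.16.47 10.10.16.251; do
  timeout 2 bash -c "</dev/tcp/$ip/6443" && echo "$ip:6443 reachable"
done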
Packet capture with tcpdump
root@ubuntu:~/cluster# tcpdump -i enahisic2i0 tcp and port 6443 -ennvv tcpdump: listening on enahisic2i0, link-type EN10MB (Ethernet), capture size 262144 bytes 17:19:49.151776 b0:08:75:5f:b8:5b > 48:57:02:64:e7:ab, ethertype IPv4 (0x0800), length 66: (tos 0x0, ttl 64, id 18123, offset 0, flags [DF], proto TCP (6), length 52) 10.10.16.251.33358 > 10.10.16.249.6443: Flags [.], cksum 0xf5cb (correct), seq 3177185730, ack 3261322716, win 253, options [nop,nop,TS val 1926893007 ecr 1263307546], length 0 17:19:49.151801 48:57:02:64:e7:ab > b0:08:75:5f:b8:5b, ethertype IPv4 (0x0800), length 66: (tos 0x0, ttl 64, id 6868, offset 0, flags [DF], proto TCP (6), length 52) 10.10.16.249.6443 > 10.10.16.251.33358: Flags [.], cksum 0x362e (incorrect -> 0x7e9a), seq 1, ack 1, win 501, options [nop,nop,TS val 1263337810 ecr 1926893007], length 0 17:19:49.633218 b0:08:75:5f:b8:5b > 48:57:02:64:e7:ab, ethertype IPv4 (0x0800), length 103: (tos 0x0, ttl 64, id 35885, offset 0, flags [DF], proto TCP (6), length 89) 10.10.16.251.33024 > 10.10.16.249.6443: Flags [P.], cksum 0x98c2 (correct), seq 2495572463:2495572500, ack 2740706400, win 1933, options [nop,nop,TS val 1926923752 ecr 1263334669], length 37 17:19:49.633250 48:57:02:64:e7:ab > b0:08:75:5f:b8:5b, ethertype IPv4 (0x0800), length 66: (tos 0x0, ttl 64, id 2218, offset 0, flags [DF], proto TCP (6), length 52) 10.10.16.249.6443 > 10.10.16.251.33024: Flags [.], cksum 0x362e (incorrect -> 0xdec1), seq 1, ack 37, win 501, options [nop,nop,TS val 1263338291 ecr 1926923752], length 0 17:19:50.133211 9c:52:f8:67:c4:d3 > 48:57:02:64:e7:ab, ethertype IPv4 (0x0800), length 66: (tos 0x0, ttl 64, id 26626, offset 0, flags [DF], proto TCP (6), length 52) 10.10.16.47.40622 > 10.10.16.249.6443: Flags [.], cksum 0x2231 (correct), seq 669238092, ack 2486368405, win 501, options [nop,nop,TS val 1255232087 ecr 2251803177], length 0 17:19:50.133236 48:57:02:64:e7:ab > 9c:52:f8:67:c4:d3, ethertype IPv4 (0x0800), length 66: (tos 0x0, ttl 64, id 19924, offset 0, flags [DF], proto TCP (6), length 52) 10.10.16.249.6443 > 10.10.16.47.40622: Flags [.], cksum 0x3562 (incorrect -> 0x153e), seq 1, ack 1, win 501, options [nop,nop,TS val 2251833897 ecr 1255139146], length 0 17:19:50.149613 9c:52:f8:67:c4:d3 > 48:57:02:64:e7:ab, ethertype IPv4 (0x0800), length 104: (tos 0x0, ttl 64, id 37266, offset 0, flags [DF], proto TCP (6), length 90) 10.10.16.47.40486 > 10.10.16.249.6443: Flags [P.], cksum 0x3885 (correct), seq 2736523175:2736523213, ack 776784886, win 1145, options [nop,nop,TS val 1255232103 ecr 2251829749], length 38 17:19:50.150506 48:57:02:64:e7:ab > 9c:52:f8:67:c4:d3, ethertype IPv4 (0x0800), length 149: (tos 0x0, ttl 64, id 23752, offset 0, flags [DF], proto TCP (6), length 135) 10.10.16.249.6443 > 10.10.16.47.40486: Flags [P.], cksum 0x35b5 (incorrect -> 0x22d6), seq 1:84, ack 38, win 1632, options [nop,nop,TS val 2251833914 ecr 1255232103], length 83 17:19:50.150569 9c:52:f8:67:c4:d3 > 48:57:02:64:e7:ab, ethertype IPv4 (0x0800), length 66: (tos 0x0, ttl 64, id 37267, offset 0, flags [DF], proto TCP (6), length 52) 10.10.16.47.40486 > 10.10.16.249.6443: Flags [.], cksum 0x7a8b (correct), seq 38, ack 84, win 1145, options [nop,nop,TS val 1255232104 ecr 2251833914], length 0 17:19:50.150576 48:57:02:64:e7:ab > 9c:52:f8:67:c4:d3, ethertype IPv4 (0x0800), length 6169: (tos 0x0, ttl 64, id 23753, offset 0, flags [DF], proto TCP (6), length 6155) 10.10.16.249.6443 > 10.10.16.47.40486: Flags [P.], cksum 0x4d39 (incorrect -> 0xe699), seq 84:6187, ack 38, win 1632, options 
[nop,nop,TS val 2251833914 ecr 1255232103], length 6103 17:19:50.150657 9c:52:f8:67:c4:d3 > 48:57:02:64:e7:ab, ethertype IPv4 (0x0800), length 66: (tos 0x0, ttl 64, id 37268, offset 0, flags [DF], proto TCP (6), length 52) 10.10.16.47.40486 > 10.10.16.249.6443: Flags [.], cksum 0x74eb (correct), seq 38, ack 1532, win 1137, options [nop,nop,TS val 1255232104 ecr 2251833914], length 0 17:19:50.150705 9c:52:f8:67:c4:d3 > 48:57:02:64:e7:ab, ethertype IPv4 (0x0800), length 66: (tos 0x0, ttl 64, id 37269, offset 0, flags [DF], proto TCP (6), length 52) 10.10.16.47.40486 > 10.10.16.249.6443: Flags [.], cksum 0x62d4 (correct), seq 38, ack 6187, win 1113, options [nop,nop,TS val 1255232104 ecr 2251833914], length 0 17:19:50.150710 48:57:02:64:e7:ab > 9c:52:f8:67:c4:d3, ethertype IPv4 (0x0800), length 97: (tos 0x0, ttl 64, id 23758, offset 0, flags [DF], proto TCP (6), length 83) 10.10.16.249.6443 > 10.10.16.47.40486: Flags [P.], cksum 0x3581 (incorrect -> 0xeac6), seq 6187:6218, ack 38, win 1632, options [nop,nop,TS val 2251833914 ecr 1255232104], length 31 17:19:50.150755 9c:52:f8:67:c4:d3 > 48:57:02:64:e7:ab, ethertype IPv4 (0x0800), length 66: (tos 0x0, ttl 64, id 37270, offset 0, flags [DF], proto TCP (6), length 52) 10.10.16.47.40486 > 10.10.16.249.6443: Flags [.], cksum 0x6295 (correct), seq 38, ack 6218, win 1145, options [nop,nop,TS val 1255232104 ecr 2251833914], length 0 17:19:50.150997 9c:52:f8:67:c4:d3 > 48:57:02:64:e7:ab, ethertype IPv4 (0x0800), length 101: (tos 0x0, ttl 64, id 37271, offset 0, flags [DF], proto TCP (6), length 87) 10.10.16.47.40486 > 10.10.16.249.6443: Flags [P.], cksum 0xb851 (correct), seq 38:73, ack 6218, win 1145, options [nop,nop,TS val 1255232104 ecr 2251833914], length 35 17:19:50.154107 9c:52:f8:67:c4:d3 > 48:57:02:64:e7:ab, ethertype IPv4 (0x0800), length 103: (tos 0x0, ttl 64, id 65234, offset 0, flags [DF], proto TCP (6), length 89) 10.10.16.47.40732 > 10.10.16.249.6443: Flags [P.], cksum 0xb7ea (correct), seq 1810087930:1810087967, ack 1457312118, win 3442, options [nop,nop,TS val 1255232107 ecr 2251829777], length 37 17:19:50.192794 48:57:02:64:e7:ab > 9c:52:f8:67:c4:d3, ethertype IPv4 (0x0800), length 66: (tos 0x0, ttl 64, id 23759, offset 0, flags [DF], proto TCP (6), length 52) 10.10.16.249.6443 > 10.10.16.47.40486: Flags [.], cksum 0x3562 (incorrect -> 0x6060), seq 6218, ack 73, win 1632, options [nop,nop,TS val 2251833957 ecr 1255232104], length 0 17:19:50.196795 48:57:02:64:e7:ab > 9c:52:f8:67:c4:d3, ethertype IPv4 (0x0800), length 66: (tos 0x0, ttl 64, id 19605, offset 0, flags [DF], proto TCP (6), length 52) 10.10.16.249.6443 > 10.10.16.47.40732: Flags [.], cksum 0x3562 (incorrect -> 0xcd10), seq 1, ack 37, win 501, options [nop,nop,TS val 2251833961 ecr 1255232107], length 0 ^C 23 packets captured 23 packets received by filter 0 packets dropped by kernel
Comment out the apiserver backends of the other two machines
#---------------------------------------------------------------------
# round robin balancing between the various backends
#---------------------------------------------------------------------
backend kube-apiserver
    mode tcp
    option tcplog
    option tcp-check
    balance roundrobin
    default-server inter 10s downinter 5s rise 2 fall 2 slowstart 60s maxconn 250 maxqueue 256 weight 100
    #server ubuntu 10.10.16.82:6443 check
    server cloud 10.10.16.47:6443 check
    #server centos7 10.10.16.251:6443 check
"haproxy.cfg" [dos] 78L, 3163C written
root@ubuntu:/etc/haproxy# systemctl restart haproxy
root@ubuntu:/etc/haproxy# conntrack -L | grep 9443
tcp      6 42 TIME_WAIT src=10.10.16.81 dst=10.10.16.249 sport=38376 dport=9443 src=10.10.16.249 dst=10.10.16.81 sport=9443 dport=38376 [ASSURED] mark=0 use=1
tcp      6 79 TIME_WAIT src=10.10.16.81 dst=10.10.16.249 sport=38388 dport=9443 src=10.10.16.249 dst=10.10.16.81 sport=9443 dport=38388 [ASSURED] mark=0 use=1
conntrack v1.4.4 (conntrack-tools): 257 flow entries have been shown.
root@ubuntu:/etc/haproxy#
[root@bogon ~]# telnet 10.10.16.249 9443
Trying 10.10.16.249...
Connected to 10.10.16.249.
Escape character is '^]'.
^CConnection closed by foreign host.
[root@bogon ~]#
root@ubuntu:/etc/haproxy# kubectl get nodes -o wide
NAME      STATUS   ROLES    AGE   VERSION   INTERNAL-IP    EXTERNAL-IP   OS-IMAGE                   KERNEL-VERSION                CONTAINER-RUNTIME
centos7   Ready    master   19m   v1.18.1   10.10.16.251   <none>        CentOS Linux 7 (AltArch)   4.14.0-115.el7a.0.1.aarch64   docker://1.13.1
cloud     Ready    master   18m   v1.21.1   10.10.16.47    <none>        Ubuntu 18.04.3 LTS         5.5.19-050519-generic         docker://19.3.13
ubuntu    Ready    master   35m   v1.18.1   10.10.16.82    <none>        Ubuntu 18.04.3 LTS         5.0.0-23-generic              containerd://1.3.7
root@ubuntu:/etc/haproxy#
Worker node join
Join the worker while the other two apiserver backends are still commented out
[root@bogon ~]# kubeadm join 10.10.16.249:6443 --token qiojkb.svprbcf7fd7k8m4s \
> --discovery-token-ca-cert-hash sha256:dd30de5ae3a2006842ae01c6bf4294370c2b941964086d8a
W0630 17:31:53.688439   54158 join.go:346] [preflight] WARNING: JoinControlPane.controlPlane settings will be ignored when control-plane flag is not set.
[preflight] Running pre-flight checks
        [WARNING Service-Docker]: docker service is not enabled, please run 'systemctl enable docker.service'
        [WARNING IsDockerSystemdCheck]: detected "cgroupfs" as the Docker cgroup driver. The recommended driver is "systemd". Please follow the guide at https://kubernetes.io/docs/setup/cri/
        [WARNING SystemVerification]: this Docker version is not on the list of validated versions: 20.10.7. Latest validated version: 19.03
        [WARNING Service-Kubelet]: kubelet service is not enabled, please run 'systemctl enable kubelet.service'
[preflight] Reading configuration from the cluster...
[preflight] FYI: You can look at this config file with 'kubectl -n kube-system get cm kubeadm-config -oyaml'
[kubelet-start] Downloading configuration for the kubelet from the "kubelet-config-1.18" ConfigMap in the kube-system namespace
[kubelet-start] Writing kubelet configuration to file "/var/lib/kubelet/config.yaml"
[kubelet-start] Writing kubelet environment file with flags to file "/var/lib/kubelet/kubeadm-flags.env"
[kubelet-start] Starting the kubelet
[kubelet-start] Waiting for the kubelet to perform the TLS Bootstrap...

This node has joined the cluster:
* Certificate signing request was sent to apiserver and a response was received.
* The Kubelet was informed of the new secure connection details.

Run 'kubectl get nodes' on the control-plane to see this node join the cluster.
[root@centos7 ~]# kubectl get nodes -o wide
NAME      STATUS   ROLES    AGE     VERSION   INTERNAL-IP    EXTERNAL-IP   OS-IMAGE                   KERNEL-VERSION                CONTAINER-RUNTIME
bogon     Ready    <none>   6m42s   v1.18.1   10.10.16.81    <none>        CentOS Linux 7 (AltArch)   4.14.0-115.8.1.el7a.aarch64   docker://20.10.7
centos7   Ready    master   27m     v1.18.1   10.10.16.251   <none>        CentOS Linux 7 (AltArch)   4.14.0-115.el7a.0.1.aarch64   docker://1.13.1
cloud     Ready    master   26m     v1.21.1   10.10.16.47    <none>        Ubuntu 18.04.3 LTS         5.5.19-050519-generic         docker://19.3.13
ubuntu    Ready    master   43m     v1.18.1   10.10.16.82    <none>        Ubuntu 18.04.3 LTS         5.0.0-23-generic              containerd://1.3.7
Every master sees the same set of pods
root@ubuntu:/etc/haproxy# kubectl get pods -o wide -n kube-system NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES coredns-546565776c-4b95r 0/1 ContainerCreating 0 30m <none> ubuntu <none> <none> coredns-546565776c-xfwr5 0/1 ContainerCreating 0 30m <none> ubuntu <none> <none> kube-apiserver-centos7 1/1 Running 0 21m 10.10.16.251 centos7 <none> <none> kube-apiserver-cloud 1/1 Running 0 20m 10.10.16.47 cloud <none> <none> kube-apiserver-ubuntu 1/1 Running 0 37m 10.10.16.82 ubuntu <none> <none> kube-controller-manager-centos7 1/1 Running 0 21m 10.10.16.251 centos7 <none> <none> kube-controller-manager-cloud 1/1 Running 0 20m 10.10.16.47 cloud <none> <none> kube-controller-manager-ubuntu 1/1 Running 5 38m 10.10.16.82 ubuntu <none> <none> kube-flannel-ds-arm64-88gxp 0/1 CrashLoopBackOff 9 23m 10.10.16.251 centos7 <none> <none> kube-flannel-ds-arm64-czn2z 0/1 Init:0/1 0 2m12s 10.10.16.81 bogon <none> <none> kube-flannel-ds-arm64-w9t92 0/1 CrashLoopBackOff 9 22m 10.10.16.47 cloud <none> <none> kube-flannel-ds-arm64-xg9s8 0/1 CrashLoopBackOff 9 24m 10.10.16.82 ubuntu <none> <none> kube-proxy-4bg7x 1/1 Running 0 23m 10.10.16.251 centos7 <none> <none> kube-proxy-c9jvr 1/1 Running 0 2m9s 10.10.16.81 bogon <none> <none> kube-proxy-jtrkp 1/1 Running 0 22m 10.10.16.47 cloud <none> <none> kube-proxy-m8s4m 1/1 Running 0 32m 10.10.16.82 ubuntu <none> <none> kube-scheduler-centos7 1/1 Running 0 21m 10.10.16.251 centos7 <none> <none> kube-scheduler-cloud 1/1 Running 0 20m 10.10.16.47 cloud <none> <none> kube-scheduler-ubuntu 1/1 Running 5 37m 10.10.16.82 ubuntu <none> <none> root@ubuntu:/etc/haproxy#
[root@centos7 ~]# kubectl get pods -o wide -n kube-system NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES coredns-546565776c-4b95r 0/1 ContainerCreating 0 31m <none> ubuntu <none> <none> coredns-546565776c-xfwr5 0/1 ContainerCreating 0 32m <none> ubuntu <none> <none> kube-apiserver-centos7 1/1 Running 0 22m 10.10.16.251 centos7 <none> <none> kube-apiserver-cloud 1/1 Running 0 21m 10.10.16.47 cloud <none> <none> kube-apiserver-ubuntu 1/1 Running 0 38m 10.10.16.82 ubuntu <none> <none> kube-controller-manager-centos7 1/1 Running 0 22m 10.10.16.251 centos7 <none> <none> kube-controller-manager-cloud 1/1 Running 0 21m 10.10.16.47 cloud <none> <none> kube-controller-manager-ubuntu 1/1 Running 5 39m 10.10.16.82 ubuntu <none> <none> kube-flannel-ds-arm64-88gxp 0/1 CrashLoopBackOff 9 24m 10.10.16.251 centos7 <none> <none> kube-flannel-ds-arm64-czn2z 0/1 Error 2 3m16s 10.10.16.81 bogon <none> <none> kube-flannel-ds-arm64-w9t92 0/1 CrashLoopBackOff 9 23m 10.10.16.47 cloud <none> <none> kube-flannel-ds-arm64-xg9s8 0/1 CrashLoopBackOff 9 25m 10.10.16.82 ubuntu <none> <none> kube-proxy-4bg7x 1/1 Running 0 24m 10.10.16.251 centos7 <none> <none> kube-proxy-c9jvr 1/1 Running 0 3m13s 10.10.16.81 bogon <none> <none> kube-proxy-jtrkp 1/1 Running 0 23m 10.10.16.47 cloud <none> <none> kube-proxy-m8s4m 1/1 Running 0 33m 10.10.16.82 ubuntu <none> <none> kube-scheduler-centos7 1/1 Running 0 22m 10.10.16.251 centos7 <none> <none> kube-scheduler-cloud 1/1 Running 0 21m 10.10.16.47 cloud <none> <none> kube-scheduler-ubuntu 1/1 Running 5 38m 10.10.16.82 ubuntu <none> <none>
root@cloud:~# kubectl get pods -o wide -n kube-system NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES coredns-546565776c-4b95r 0/1 ContainerCreating 0 32m <none> ubuntu <none> <none> coredns-546565776c-xfwr5 0/1 ContainerCreating 0 32m <none> ubuntu <none> <none> kube-apiserver-centos7 1/1 Running 0 23m 10.10.16.251 centos7 <none> <none> kube-apiserver-cloud 1/1 Running 0 22m 10.10.16.47 cloud <none> <none> kube-apiserver-ubuntu 1/1 Running 0 39m 10.10.16.82 ubuntu <none> <none> kube-controller-manager-centos7 1/1 Running 0 23m 10.10.16.251 centos7 <none> <none> kube-controller-manager-cloud 1/1 Running 0 22m 10.10.16.47 cloud <none> <none> kube-controller-manager-ubuntu 1/1 Running 5 39m 10.10.16.82 ubuntu <none> <none> kube-flannel-ds-arm64-88gxp 0/1 CrashLoopBackOff 9 25m 10.10.16.251 centos7 <none> <none> kube-flannel-ds-arm64-czn2z 0/1 CrashLoopBackOff 3 4m 10.10.16.81 bogon <none> <none> kube-flannel-ds-arm64-w9t92 0/1 CrashLoopBackOff 9 23m 10.10.16.47 cloud <none> <none> kube-flannel-ds-arm64-xg9s8 0/1 CrashLoopBackOff 9 26m 10.10.16.82 ubuntu <none> <none> kube-proxy-4bg7x 1/1 Running 0 25m 10.10.16.251 centos7 <none> <none> kube-proxy-c9jvr 1/1 Running 0 3m57s 10.10.16.81 bogon <none> <none> kube-proxy-jtrkp 1/1 Running 0 23m 10.10.16.47 cloud <none> <none> kube-proxy-m8s4m 1/1 Running 0 34m 10.10.16.82 ubuntu <none> <none> kube-scheduler-centos7 1/1 Running 0 23m 10.10.16.251 centos7 <none> <none> kube-scheduler-cloud 1/1 Running 0 22m 10.10.16.47 cloud <none> <none> kube-scheduler-ubuntu 1/1 Running 5 39m 10.10.16.82 ubuntu <none> <none> root@cloud:~#
K8s leader election
The scheduler and controller-manager take part in Pod scheduling and in managing all kinds of concrete resources. If several controller-managers acted on the same Pod resources at the same time, the result would be a mess, so how does Kubernetes keep everything running correctly? Kubernetes exposes its functionality through services, and each service maps to concrete endpoints, so let's look at what the endpoints of the scheduler and controller-manager actually are:
[root@centos7 ~]# kubectl -n kube-system describe endpoints kube-scheduler
Name:         kube-scheduler
Namespace:    kube-system
Labels:       <none>
Annotations:  control-plane.alpha.kubernetes.io/leader: {"holderIdentity":"cloud_67d2e0f4-9461-4105-819a-6c4c5c0e211c","leaseDurationSeconds":15,"acquireTime":"2021-06-30T09:42:33Z","renewTime":...
Subsets:
Events:
  Type    Reason          Age   From               Message
  Normal  LeaderElection  46m   default-scheduler  ubuntu_6e41aa59-1362-41bc-9670-f540218f4001 became leader
  Normal  LeaderElection  45m   default-scheduler  ubuntu_b48bb455-1df3-484f-a603-8f07af14ca9e became leader
  Normal  LeaderElection  42m   default-scheduler  ubuntu_2a65880e-36b9-4361-ad23-a5f4626353d3 became leader
  Normal  LeaderElection  39m   default-scheduler  ubuntu_d0b9359f-9441-4330-b2c0-db77ba6abb84 became leader
  Normal  LeaderElection  36m   default-scheduler  ubuntu_ac5ddc82-14d2-43db-9501-7578382bee79 became leader
  Normal  LeaderElection  33m   default-scheduler  ubuntu_c6fb755b-8532-4ae4-8e9d-42663f054077 became leader
  Normal  LeaderElection  64s   default-scheduler  ubuntu_c6fb755b-8532-4ae4-8e9d-42663f054077 stopped leading
  Normal  LeaderElection  42s   default-scheduler  centos7_e6c965d7-7873-45c5-92e1-2a7248f429d5 became leader
  Normal  LeaderElection  4s    default-scheduler  cloud_67d2e0f4-9461-4105-819a-6c4c5c0e211c became leader
[root@centos7 ~]# kubectl get pod -n kube-system | grep ubuntu_6e41aa59-1362-41bc-9670-f540218f4001
[root@centos7 ~]# kubectl get pod -n kube-system | grep 6e41aa59-1362-41bc-9670-f540218f4001
[root@centos7 ~]#
Note the annotation key control-plane.alpha.kubernetes.io/leader: these two components use leader election to pick one instance out of the cluster to actually do the work. If we look at the manifests under /etc/kubernetes/manifests/, we can find the flag that enables this:
[root@centos7 ~]# ls /etc/kubernetes/manifests/
kube-apiserver.yaml  kube-controller-manager.yaml  kube-scheduler.yaml
[root@centos7 ~]#
[root@centos7 ~]# cat /etc/kubernetes/manifests/kube-scheduler.yaml | grep leader-elec
    - --leader-elect=true
[root@centos7 ~]#
Adding --leader-elect=true in the manifest is what turns on the leader-election logic, and kubeadm sets this flag automatically when it deploys the cluster, so no manual configuration is needed.
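The current leader can also be read straight from the annotation instead of going through describe; one way is a jsonpath query using the bracket syntax for keys that contain dots (a sketch, same annotation key as above):

kubectl -n kube-system get endpoints kube-scheduler \
  -o jsonpath="{.metadata.annotations['control-plane\.alpha\.kubernetes\.io/leader']}"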
flannel
Remove it:
root@ubuntu:~# kubectl delete -f kube-flannel.yml
podsecuritypolicy.policy "psp.flannel.unprivileged" deleted
clusterrole.rbac.authorization.k8s.io "flannel" deleted
clusterrolebinding.rbac.authorization.k8s.io "flannel" deleted
serviceaccount "flannel" deleted
configmap "kube-flannel-cfg" deleted
root@ubuntu:/etc/haproxy# kubectl get pods -o wide -n kube-system NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES coredns-546565776c-4b95r 0/1 ContainerCreating 0 54m <none> ubuntu <none> <none> coredns-546565776c-xfwr5 0/1 ContainerCreating 0 54m <none> ubuntu <none> <none> kube-apiserver-centos7 1/1 Running 0 45m 10.10.16.251 centos7 <none> <none> kube-apiserver-cloud 1/1 Running 0 44m 10.10.16.47 cloud <none> <none> kube-apiserver-ubuntu 1/1 Running 0 61m 10.10.16.82 ubuntu <none> <none> kube-controller-manager-centos7 1/1 Running 3 45m 10.10.16.251 centos7 <none> <none> kube-controller-manager-cloud 1/1 Running 3 44m 10.10.16.47 cloud <none> <none> kube-controller-manager-ubuntu 1/1 Running 8 62m 10.10.16.82 ubuntu <none> <none> kube-flannel-ds-arm64-88gxp 0/1 CrashLoopBackOff 13 47m 10.10.16.251 centos7 <none> <none> kube-flannel-ds-arm64-czn2z 0/1 CrashLoopBackOff 9 26m 10.10.16.81 bogon <none> <none> kube-flannel-ds-arm64-w9t92 0/1 CrashLoopBackOff 13 46m 10.10.16.47 cloud <none> <none> kube-flannel-ds-arm64-xg9s8 0/1 CrashLoopBackOff 14 48m 10.10.16.82 ubuntu <none> <none> kube-proxy-4bg7x 1/1 Running 0 47m 10.10.16.251 centos7 <none> <none> kube-proxy-c9jvr 1/1 Running 0 26m 10.10.16.81 bogon <none> <none> kube-proxy-jtrkp 1/1 Running 0 46m 10.10.16.47 cloud <none> <none> kube-proxy-m8s4m 1/1 Running 0 56m 10.10.16.82 ubuntu <none> <none> kube-scheduler-centos7 1/1 Running 3 45m 10.10.16.251 centos7 <none> <none> kube-scheduler-cloud 1/1 Running 3 44m 10.10.16.47 cloud <none> <none> kube-scheduler-ubuntu 1/1 Running 7 61m 10.10.16.82 ubuntu <none> <none> root@ubuntu:/etc/haproxy# kubectl get pods -o wide -n kube-system NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES coredns-546565776c-4b95r 0/1 ContainerCreating 0 55m <none> ubuntu <none> <none> coredns-546565776c-xfwr5 0/1 ContainerCreating 0 55m <none> ubuntu <none> <none> kube-apiserver-centos7 1/1 Running 0 46m 10.10.16.251 centos7 <none> <none> kube-apiserver-cloud 1/1 Running 0 45m 10.10.16.47 cloud <none> <none> kube-apiserver-ubuntu 1/1 Running 0 62m 10.10.16.82 ubuntu <none> <none> kube-controller-manager-centos7 1/1 Running 3 46m 10.10.16.251 centos7 <none> <none> kube-controller-manager-cloud 1/1 Running 3 45m 10.10.16.47 cloud <none> <none> kube-controller-manager-ubuntu 1/1 Running 9 63m 10.10.16.82 ubuntu <none> <none> kube-proxy-4bg7x 1/1 Running 0 48m 10.10.16.251 centos7 <none> <none> kube-proxy-c9jvr 1/1 Running 0 27m 10.10.16.81 bogon <none> <none> kube-proxy-jtrkp 1/1 Running 0 47m 10.10.16.47 cloud <none> <none> kube-proxy-m8s4m 1/1 Running 0 57m 10.10.16.82 ubuntu <none> <none> kube-scheduler-centos7 1/1 Running 3 46m 10.10.16.251 centos7 <none> <none> kube-scheduler-cloud 1/1 Running 3 45m 10.10.16.47 cloud <none> <none> kube-scheduler-ubuntu 1/1 Running 8 62m 10.10.16.82 ubuntu <none> <none> root@ubuntu:/etc/haproxy#
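Before applying Calico, it is also worth clearing the CNI config that flannel left behind on each node so kubelet does not keep picking it up. A sketch, assuming the default flannel file name:

rm -f /etc/cni/net.d/10-flannel.conflist    # run on every node
ls /etc/cni/net.d/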
calico
root@ubuntu:~# kubectl apply -f rbac-kdd.yaml
clusterrole.rbac.authorization.k8s.io/calico-node created
etcdserver: request timed out
root@ubuntu:~# kubectl get pods -n kube-system NAME READY STATUS RESTARTS AGE calico-kube-controllers-5978c5f6b5-ghldp 0/1 ContainerCreating 0 16m calico-node-489j2 0/1 CrashLoopBackOff 7 16m calico-node-6vvpc 0/1 CrashLoopBackOff 7 16m calico-node-9pk6p 0/1 CrashLoopBackOff 7 16m calico-node-bzshf 1/1 Running 4 16m coredns-546565776c-4b95r 0/1 Running 0 17h coredns-546565776c-xfwr5 0/1 Running 1 17h kube-apiserver-centos7 1/1 Running 0 17h kube-apiserver-cloud 1/1 Running 0 17h kube-apiserver-ubuntu 1/1 Running 0 18h kube-controller-manager-centos7 0/1 CrashLoopBackOff 105 17h kube-controller-manager-cloud 0/1 CrashLoopBackOff 102 17h kube-controller-manager-ubuntu 0/1 CrashLoopBackOff 110 18h kube-proxy-4bg7x 1/1 Running 0 17h kube-proxy-c9jvr 1/1 Running 0 17h kube-proxy-jtrkp 1/1 Running 0 17h kube-proxy-m8s4m 1/1 Running 0 17h kube-scheduler-centos7 0/1 Error 106 17h kube-scheduler-cloud 0/1 CrashLoopBackOff 103 17h kube-scheduler-ubuntu 0/1 CrashLoopBackOff 103 18h root@ubuntu:~#
root@ubuntu:~# kubectl logs kube-scheduler-centos7 -n kube-system I0701 06:03:02.545799 1 registry.go:150] Registering EvenPodsSpread predicate and priority function I0701 06:03:02.545878 1 registry.go:150] Registering EvenPodsSpread predicate and priority function I0701 06:03:03.476897 1 serving.go:313] Generated self-signed cert in-memory I0701 06:03:25.644478 1 registry.go:150] Registering EvenPodsSpread predicate and priority function I0701 06:03:25.645525 1 registry.go:150] Registering EvenPodsSpread predicate and priority function W0701 06:03:25.649067 1 authorization.go:47] Authorization is disabled W0701 06:03:25.649106 1 authentication.go:40] Authentication is disabled I0701 06:03:25.649138 1 deprecated_insecure_serving.go:51] Serving healthz insecurely on [::]:10251 I0701 06:03:25.652068 1 configmap_cafile_content.go:202] Starting client-ca::kube-system::extension-apiserver-authentication::requestheader-client-ca-file I0701 06:03:25.652126 1 shared_informer.go:223] Waiting for caches to sync for client-ca::kube-system::extension-apiserver-authentication::requestheader-client-ca-file I0701 06:03:25.652165 1 configmap_cafile_content.go:202] Starting client-ca::kube-system::extension-apiserver-authentication::client-ca-file I0701 06:03:25.652289 1 shared_informer.go:223] Waiting for caches to sync for client-ca::kube-system::extension-apiserver-authentication::client-ca-file I0701 06:03:25.653086 1 secure_serving.go:178] Serving securely on 127.0.0.1:10259 I0701 06:03:25.653150 1 tlsconfig.go:240] Starting DynamicServingCertificateController I0701 06:03:25.752411 1 shared_informer.go:230] Caches are synced for client-ca::kube-system::extension-apiserver-authentication::requestheader-client-ca-file I0701 06:03:25.752477 1 shared_informer.go:230] Caches are synced for client-ca::kube-system::extension-apiserver-authentication::client-ca-file I0701 06:03:25.753337 1 leaderelection.go:242] attempting to acquire leader lease kube-system/kube-scheduler... 
E0701 06:03:34.354405 1 leaderelection.go:320] error retrieving resource lock kube-system/kube-scheduler: etcdserver: request timed out E0701 06:03:44.354967 1 leaderelection.go:320] error retrieving resource lock kube-system/kube-scheduler: Get https://10.10.16.249:6443/api/v1/namespaces/kube-system/endpoints/kube-scheduler?timeout=10s: context deadline exceeded (Client.Timeout exceeded while awaiting headers) E0701 06:03:58.380840 1 leaderelection.go:320] error retrieving resource lock kube-system/kube-scheduler: Get https://10.10.16.249:6443/api/v1/namespaces/kube-system/endpoints/kube-scheduler?timeout=10s: net/http: request canceled (Client.Timeout exceeded while awaiting headers) E0701 06:04:12.688606 1 leaderelection.go:356] Failed to update lock: resource name may not be empty E0701 06:04:33.681427 1 leaderelection.go:320] error retrieving resource lock kube-system/kube-scheduler: Get https://10.10.16.249:6443/apis/coordination.k8s.io/v1/namespaces/kube-system/leases/kube-scheduler?timeout=10s: context deadline exceeded (Client.Timeout exceeded while awaiting headers) I0701 06:04:38.103083 1 leaderelection.go:252] successfully acquired lease kube-system/kube-scheduler E0701 06:05:23.387441 1 leaderelection.go:320] error retrieving resource lock kube-system/kube-scheduler: Get https://10.10.16.249:6443/api/v1/namespaces/kube-system/endpoints/kube-scheduler?timeout=10s: context deadline exceeded (Client.Timeout exceeded while awaiting headers) I0701 06:05:23.387558 1 leaderelection.go:277] failed to renew lease kube-system/kube-scheduler: timed out waiting for the condition F0701 06:05:23.387597 1 server.go:244] leaderelection lost root@ubuntu:~#
root@ubuntu:~/cfssl/etcd# kubectl get cs
NAME                 STATUS      MESSAGE                                                                                     ERROR
controller-manager   Unhealthy   Get http://127.0.0.1:10252/healthz: dial tcp 127.0.0.1:10252: connect: connection refused
scheduler            Unhealthy   Get http://127.0.0.1:10251/healthz: dial tcp 127.0.0.1:10251: connect: connection refused
etcd-1               Unhealthy   HTTP probe failed with statuscode: 503
etcd-0               Unhealthy   HTTP probe failed with statuscode: 503
etcd-2               Unhealthy   HTTP probe failed with statuscode: 503
root@ubuntu:~/cfssl/etcd# kubectl cluster-info
Kubernetes master is running at https://10.10.16.249:6443
KubeDNS is running at https://10.10.16.249:6443/api/v1/namespaces/kube-system/services/kube-dns:dns/proxy

To further debug and diagnose cluster problems, use 'kubectl cluster-info dump'.
root@ubuntu:~/cfssl/etcd#
root@ubuntu:~/cfssl/etcd# ETCDCTL_API=3 ./etcdctl --cacert ./etcd/ca.pem --cert ./etcd/server.pem --key ./etcd/server-key.pem --endpoints=https://10.10.18.42:2379,https://10.10.18.43:2379,https://10.10.18.44:2379 endpoint health
https://10.10.18.43:2379 is healthy: successfully committed proposal: took = 47.926029ms
https://10.10.18.44:2379 is healthy: successfully committed proposal: took = 47.95631ms
https://10.10.18.42:2379 is healthy: successfully committed proposal: took = 732.843088ms
root@ubuntu:~/cfssl/etcd#
root@ubuntu:~/cfssl/etcd# kubectl get endpoints kube-scheduler --namespace=kube-system -o yaml
apiVersion: v1
kind: Endpoints
metadata:
  annotations:
    control-plane.alpha.kubernetes.io/leader: '{"holderIdentity":"cloud_7d377bb4-928f-43df-af28-1d2060d0cb3d","leaseDurationSeconds":15,"acquireTime":"2021-07-01T03:35:41Z","renewTime":"2021-07-01T03:39:31Z","leaderTransitions":391}'
  creationTimestamp: "2021-06-30T04:03:33Z"
  managedFields:
  - apiVersion: v1
    fieldsType: FieldsV1
    fieldsV1:
      f:metadata:
        f:annotations:
          .: {}
          f:control-plane.alpha.kubernetes.io/leader: {}
    manager: kube-scheduler
    operation: Update
    time: "2021-07-01T03:39:31Z"
  name: kube-scheduler
  namespace: kube-system
  resourceVersion: "117994"
  selfLink: /api/v1/namespaces/kube-system/endpoints/kube-scheduler
  uid: b04f734c-b252-4a32-958c-8fd2ba19be7b
etcd: read-only range request took too long to execute
journalctl -xeu etcd
Jul 01 11:43:18 host-10-10-18-42 etcd[21504]: request "header:<ID:18202839804396473573 > txn:<compare:<target:MOD key:"/registry/services/endpoints/kube-system/kube-controller-manager" mod_revision:118388 > success:<request_put:<key:"/registry/servi Jul 01 11:43:21 host-10-10-18-42 etcd[21504]: timed out waiting for read index response (local node might have slow network) Jul 01 11:43:21 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/services/specs/default/kubernetes" " with result "error:etcdserver: request timed out" took too long (7.00031982s) to execute Jul 01 11:43:22 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/storageclasses" range_end:"/registry/storageclasset" count_only:true " with result "range_response_count:0 size:6" took too long (4.82138998s) to execute Jul 01 11:43:22 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/leases/kube-system/kube-scheduler" " with result "range_response_count:1 size:480" took too long (8.6141478s) to execute Jul 01 11:43:22 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/volumeattachments" range_end:"/registry/volumeattachmentt" count_only:true " with result "range_response_count:0 size:6" took too long (3.10040272s) to execute Jul 01 11:43:22 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/validatingwebhookconfigurations" range_end:"/registry/validatingwebhookconfigurationt" count_only:true " with result "range_response_count:0 size:6" took too long Jul 01 11:43:22 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/services/endpoints/kube-system/kube-scheduler" " with result "range_response_count:1 size:577" took too long (6.4749108s) to execute Jul 01 11:43:22 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/events" range_end:"/registry/eventt" count_only:true " with result "range_response_count:0 size:9" took too long (3.22013804s) to execute Jul 01 11:43:22 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/health" " with result "range_response_count:0 size:6" took too long (107.31128ms) to execute Jul 01 11:43:22 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/persistentvolumes" range_end:"/registry/persistentvolumet" count_only:true " with result "range_response_count:0 size:6" took too long (590.10484ms) to execute Jul 01 11:43:22 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/pods" range_end:"/registry/podt" count_only:true " with result "range_response_count:0 size:8" took too long (870.81538ms) to execute Jul 01 11:43:22 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/services/endpoints/kube-system/kube-controller-manager" " with result "range_response_count:1 size:605" took too long (1.01391092s) to execute Jul 01 11:43:23 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/events/kube-system/kube-apiserver-centos7.168d5129b54e2e25" " with result "range_response_count:1 size:793" took too long (679.77894ms) to execute Jul 01 11:43:23 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/ingressclasses" range_end:"/registry/ingressclasset" count_only:true " with result "range_response_count:0 size:6" took too long (384.0858ms) to execute Jul 01 11:43:23 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/validatingwebhookconfigurations" range_end:"/registry/validatingwebhookconfigurationt" count_only:true " with result "range_response_count:0 size:6" 
took too long Jul 01 11:43:23 host-10-10-18-42 etcd[21504]: read-only range request "key:"/registry/clusterrolebindings" range_end:"/registry/clusterrolebindingt" count_only:true " with result "range_response_count:0 size:8" took too long (308.35244ms) to execut
Check which etcd node is the leader
ETCDCTL_API=3 ./etcdctl --cacert ./etcd/ca.pem --cert ./etcd/server.pem --key ./etcd/server-key.pem --endpoints=https://10.10.18.42:2379,https://10.10.18.43:2379,https://10.10.18.44:2379 endpoint status --write-out=table
+--------------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+
|         ENDPOINT         |        ID        | VERSION | DB SIZE | IS LEADER | IS LEARNER | RAFT TERM | RAFT INDEX | RAFT APPLIED INDEX | ERRORS |
+--------------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+
| https://10.10.18.42:2379 | b7ea7727f764ab2c |  3.3.11 |   14 MB |     false |      false |      1687 |     198120 |                  0 |        |
| https://10.10.18.43:2379 | 596ace533af60ec1 |  3.3.11 |   14 MB |     false |      false |      1687 |     198123 |                  0 |        |
| https://10.10.18.44:2379 | 59d7c74cdcbcfc9d |  3.3.11 |   14 MB |      true |      false |      1687 |     198125 |                  0 |        |
+--------------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+
Jul 01 14:57:15 host-10-10-18-44 etcd[30050]: server is likely overloaded
Jul 01 14:57:15 host-10-10-18-44 etcd[30050]: failed to send out heartbeat on time (exceeded the 100ms timeout for 248.28942ms)
Jul 01 14:57:15 host-10-10-18-44 etcd[30050]: server is likely overloaded
Jul 01 14:57:16 host-10-10-18-44 etcd[30050]: failed to send out heartbeat on time (exceeded the 100ms timeout for 307.05404ms)
Jul 01 14:57:16 host-10-10-18-44 etcd[30050]: server is likely overloaded
Jul 01 14:57:16 host-10-10-18-44 etcd[30050]: failed to send out heartbeat on time (exceeded the 100ms timeout for 307.15236ms)
Jul 01 14:57:16 host-10-10-18-44 etcd[30050]: server is likely overloaded
Jul 01 14:57:17 host-10-10-18-44 etcd[30050]: failed to send out heartbeat on time (exceeded the 100ms timeout for 429.77822ms)
Jul 01 14:57:17 host-10-10-18-44 etcd[30050]: server is likely overloaded
Jul 01 14:57:17 host-10-10-18-44 etcd[30050]: failed to send out heartbeat on time (exceeded the 100ms timeout for 429.86552ms)
Jul 01 14:57:17 host-10-10-18-44 etcd[30050]: server is likely overloaded
Jul 01 14:57:17 host-10-10-18-44 etcd[30050]: failed to revoke 2b2c7a6035ec5a2f ("lease not found")
Jul 01 14:57:17 host-10-10-18-44 etcd[30050]: failed to revoke 2b2c7a6035ec5a2f ("lease not found")
Jul 01 14:57:17 host-10-10-18-44 etcd[30050]: failed to revoke 2b2c7a6035ec5a2d ("lease not found")
Jul 01 14:57:17 host-10-10-18-44 etcd[30050]: failed to revoke 2b2c7a6035ec5a2f ("lease not found")
Jul 01 14:57:17 host-10-10-18-44 etcd[30050]: failed to revoke 2b2c7a6035ec5a2d ("lease not found")
Jul 01 14:57:17 host-10-10-18-44 etcd[30050]: failed to revoke 2b2c7a6035ec5a2d ("lease not found")
root@ubuntu:~/cfssl/etcd# ETCDCTL_API=3 ./etcdctl --cacert ./etcd/ca.pem --cert ./etcd/server.pem --key ./etcd/server-key.pem --endpoints=https://10.10.18.42:2379,https://10.10.18.43:2379,https://10.10.18.44:2379 alarm list
root@ubuntu:~/cfssl/etcd#
Fix
On the etcd nodes, edit /etc/etcd/etcd.conf and add the following:
6-second heartbeat interval:
ETCD_HEARTBEAT_INTERVAL=6000
ETCD_ELECTION_TIMEOUT=30000
The election-timeout should be at least 5x the heartbeat-interval.
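A sketch of applying the change on one etcd node, to be repeated on all three (the ETCD_* variable names match the etcd.conf style above; the systemd unit name is an assumption, adjust to whichever unit runs etcd on these hosts):

cat >> /etc/etcd/etcd.conf << 'EOF'
ETCD_HEARTBEAT_INTERVAL=6000
ETCD_ELECTION_TIMEOUT=30000
EOF
systemctl restart etcd      # or rio-etcd, depending on the host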
Increase the etcd storage quota from 2 GB to 8 GB by adding the following three parameters:
vi /etc/systemd/system/rio-etcd.service
## the auto-compaction-retention parameter is in hours
--auto-compaction-mode=revision --auto-compaction-retention=24 --quota-backend-bytes=8589934592
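A hedged sketch of where those flags end up; the unit name rio-etcd.service comes from the path above, and the existing ExecStart layout is an assumption:

# /etc/systemd/system/rio-etcd.service (excerpt, assumed layout)
# [Service]
# ExecStart=/usr/local/bin/etcd \
#     ...existing flags... \
#     --auto-compaction-mode=revision \
#     --auto-compaction-retention=24 \
#     --quota-backend-bytes=8589934592
systemctl daemon-reload
systemctl restart rio-etcd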
root@ubuntu:~# kubectl get pods -n kube-system NAME READY STATUS RESTARTS AGE calico-kube-controllers-5978c5f6b5-ghldp 0/1 ContainerCreating 0 73m calico-node-489j2 0/1 CrashLoopBackOff 23 73m calico-node-6vvpc 0/1 Running 23 73m calico-node-9pk6p 0/1 CrashLoopBackOff 23 73m calico-node-bzshf 1/1 Running 4 73m coredns-546565776c-4b95r 0/1 Running 0 18h coredns-546565776c-xfwr5 0/1 Running 1 18h kube-apiserver-centos7 1/1 Running 0 18h kube-apiserver-cloud 1/1 Running 0 18h kube-apiserver-ubuntu 1/1 Running 0 19h kube-controller-manager-centos7 1/1 Running 110 18h kube-controller-manager-cloud 1/1 Running 106 18h kube-controller-manager-ubuntu 1/1 Running 117 19h kube-proxy-4bg7x 1/1 Running 0 18h kube-proxy-c9jvr 1/1 Running 0 18h kube-proxy-jtrkp 1/1 Running 0 18h kube-proxy-m8s4m 1/1 Running 0 18h kube-scheduler-centos7 1/1 Running 112 18h kube-scheduler-cloud 1/1 Running 110 18h kube-scheduler-ubuntu 1/1 Running 108 19h root@ubuntu:~#
root@ubuntu:~/cfssl/etcd# kubectl get cs
NAME                 STATUS    MESSAGE             ERROR
controller-manager   Healthy   ok
scheduler            Healthy   ok
etcd-2               Healthy   {"health":"true"}
etcd-1               Healthy   {"health":"true"}
etcd-0               Healthy   {"health":"true"}
root@ubuntu:~/cfssl/etcd#
root@ubuntu:~# kubectl logs calico-node-9pk6p -n kube-system
2021-07-01 03:56:42.924 [INFO][9] startup/startup.go 390: Early log level set to info
2021-07-01 03:56:42.924 [INFO][9] startup/startup.go 406: Using NODENAME environment for node name bogon
2021-07-01 03:56:42.924 [INFO][9] startup/startup.go 418: Determined node name: bogon
2021-07-01 03:56:42.924 [INFO][9] startup/startup.go 103: Starting node bogon with version v3.19.1
2021-07-01 03:56:42.927 [INFO][9] startup/startup.go 450: Checking datastore connection
2021-07-01 03:56:42.935 [INFO][9] startup/startup.go 465: Hit error connecting to datastore - retry error=Get "https://10.96.0.1:443/api/v1/nodes/foo": x509: certificate has expired or is not yet valid: current time 2021-07-01T03:56:42Z is after 2020-07-18T15:04:13Z
2021-07-01 03:56:43.943 [INFO][9] startup/startup.go 465: Hit error connecting to datastore - retry error=Get "https://10.96.0.1:443/api/v1/nodes/foo": x509: certificate has expired or is not yet valid: current time 2021-07-01T03:56:43Z is after 2020-07-18T15:04:13Z
2021-07-01 03:56:44.951 [INFO][9] startup/startup.go 465: Hit error connecting to datastore - retry error=Get "https://10.96.0.1:443/api/v1/nodes/foo": x509: certificate has expired or is not yet valid: current time 2021-07-01T03:56:44Z is after 2020-07-18T15:04:13Z
2021-07-01 03:56:45.959 [INFO][9] startup/startup.go 465: Hit error connecting to datastore - retry error=Get "https://10.96.0.1:443/api/v1/nodes/foo": x509: certificate has expired or is not yet valid: current time 2021-07-01T03:56:45Z is after 2020-07-18T15:04:13Z
2021-07-01 03:56:46.968 [INFO][9] startup/startup.go 465: Hit error connecting to datastore - retry error=Get "https://10.96.0.1:443/api/v1/nodes/foo": x509: certificate has expired or is not yet valid: current time 2021-07-01T03:56:46Z is after 2020-07-18T15:04:13Z
2021-07-01 03:56:47.976 [INFO][9] startup/startup.go 465: Hit error connecting to datastore - retry error=Get "https://10.96.0.1:443/api/v1/nodes/foo": x509: certificate has expired or is not yet valid: current time 2021-07-01T03:56:47Z is after 2020-07-18T15:04:13Z
2021-07-01 03:56:48.984 [INFO][9] startup/startup.go 465: Hit error connecting to datastore - retry error=Get "https://10.96.0.1:443/api/v1/nodes/foo": x509: certificate has expired or is not yet valid: current time 2021-07-01T03:56:48Z is after 2020-0
Calico etcd_endpoints setting
It does not need to be configured; the manifest used here has no etcd_endpoints entry at all:
root@ubuntu:~# cat calico.yaml | grep etcd_endpoints
root@ubuntu:~#
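Since etcd_endpoints is absent, this calico.yaml is presumably the variant that uses the Kubernetes API as its datastore. A quick hedged check of the datastore type declared in the same manifest:

grep -i -A1 "datastore_type" calico.yaml    # expect "kubernetes" for the API-datastore variant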
root@ubuntu:~# kubectl apply -f rbac-kdd.yaml clusterrole.rbac.authorization.k8s.io/calico-node created clusterrolebinding.rbac.authorization.k8s.io/calico-node created root@ubuntu:~# kubectl apply -f calico.yaml configmap/calico-config created customresourcedefinition.apiextensions.k8s.io/bgpconfigurations.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/bgppeers.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/blockaffinities.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/clusterinformations.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/felixconfigurations.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/globalnetworkpolicies.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/globalnetworksets.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/hostendpoints.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/ipamblocks.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/ipamconfigs.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/ipamhandles.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/ippools.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/kubecontrollersconfigurations.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/networkpolicies.crd.projectcalico.org created customresourcedefinition.apiextensions.k8s.io/networksets.crd.projectcalico.org created clusterrole.rbac.authorization.k8s.io/calico-kube-controllers created clusterrolebinding.rbac.authorization.k8s.io/calico-kube-controllers created clusterrole.rbac.authorization.k8s.io/calico-node configured clusterrolebinding.rbac.authorization.k8s.io/calico-node configured daemonset.apps/calico-node created serviceaccount/calico-node created deployment.apps/calico-kube-controllers created serviceaccount/calico-kube-controllers created poddisruptionbudget.policy/calico-kube-controllers created root@ubuntu:~# kubectl get pods -o wide -n kube-system NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES calico-kube-controllers-5978c5f6b5-xk6cq 0/1 ContainerCreating 0 12s <none> centos7 <none> <none> calico-node-6tc54 0/1 Running 0 12s 10.10.16.81 bogon <none> <none> calico-node-blvrv 0/1 Running 0 12s 10.10.16.82 ubuntu <none> <none> calico-node-nwpfl 0/1 Running 0 12s 10.10.16.47 cloud <none> <none> calico-node-rswtj 0/1 Running 0 12s 10.10.16.251 centos7 <none> <none> coredns-546565776c-82jfw 1/1 Running 0 4m40s 10.244.243.193 ubuntu <none> <none> coredns-546565776c-px8bd 0/1 ContainerCreating 0 4m40s <none> ubuntu <none> <none> kube-apiserver-centos7 1/1 Running 0 99s 10.10.16.251 centos7 <none> <none> kube-apiserver-cloud 1/1 Running 0 100s 10.10.16.47 cloud <none> <none> kube-apiserver-ubuntu 1/1 Running 0 4m49s 10.10.16.82 ubuntu <none> <none> kube-controller-manager-centos7 1/1 Running 0 100s 10.10.16.251 centos7 <none> <none> kube-controller-manager-cloud 1/1 Running 0 100s 10.10.16.47 cloud <none> <none> kube-controller-manager-ubuntu 1/1 Running 146 4m49s 10.10.16.82 ubuntu <none> <none> kube-proxy-6jk97 1/1 Running 0 4m40s 10.10.16.82 ubuntu <none> <none> kube-proxy-dpwh6 1/1 Running 0 100s 10.10.16.251 centos7 <none> <none> kube-proxy-wvdkr 1/1 Running 0 3m42s 10.10.16.81 bogon <none> <none> kube-proxy-xbdlt 1/1 Running 0 101s 10.10.16.47 cloud <none> <none> 
kube-scheduler-centos7 1/1 Running 0 99s 10.10.16.251 centos7 <none> <none> kube-scheduler-cloud 1/1 Running 0 100s 10.10.16.47 cloud <none> <none> kube-scheduler-ubuntu 1/1 Running 130 4m49s 10.10.16.82 ubuntu <none> <none> root@ubuntu:~#
root@ubuntu:~# kubectl get pods -o wide -n kube-system NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES calico-kube-controllers-5978c5f6b5-xk6cq 1/1 Running 0 2m33s 10.244.129.129 centos7 <none> <none> calico-node-6tc54 1/1 Running 0 2m33s 10.10.16.81 bogon <none> <none> calico-node-blvrv 1/1 Running 0 2m33s 10.10.16.82 ubuntu <none> <none> calico-node-nwpfl 1/1 Running 0 2m33s 10.10.16.47 cloud <none> <none> calico-node-rswtj 1/1 Running 0 2m33s 10.10.16.251 centos7 <none> <none> coredns-546565776c-82jfw 1/1 Running 0 7m1s 10.244.243.193 ubuntu <none> <none> coredns-546565776c-px8bd 1/1 Running 0 7m1s 10.244.243.194 ubuntu <none> <none> kube-apiserver-centos7 1/1 Running 0 4m 10.10.16.251 centos7 <none> <none> kube-apiserver-cloud 1/1 Running 0 4m1s 10.10.16.47 cloud <none> <none> kube-apiserver-ubuntu 1/1 Running 0 7m10s 10.10.16.82 ubuntu <none> <none> kube-controller-manager-centos7 1/1 Running 0 4m1s 10.10.16.251 centos7 <none> <none> kube-controller-manager-cloud 1/1 Running 0 4m1s 10.10.16.47 cloud <none> <none> kube-controller-manager-ubuntu 1/1 Running 146 7m10s 10.10.16.82 ubuntu <none> <none> kube-proxy-6jk97 1/1 Running 0 7m1s 10.10.16.82 ubuntu <none> <none> kube-proxy-dpwh6 1/1 Running 0 4m1s 10.10.16.251 centos7 <none> <none> kube-proxy-wvdkr 1/1 Running 0 6m3s 10.10.16.81 bogon <none> <none> kube-proxy-xbdlt 1/1 Running 0 4m2s 10.10.16.47 cloud <none> <none> kube-scheduler-centos7 1/1 Running 0 4m 10.10.16.251 centos7 <none> <none> kube-scheduler-cloud 1/1 Running 0 4m1s 10.10.16.47 cloud <none> <none> kube-scheduler-ubuntu 1/1 Running 130 7m10s 10.10.16.82 ubuntu <none> <none> root@ubuntu:~#
root@ubuntu:~# netstat -pan | grep 2379 | wc -l
500
root@ubuntu:~#
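Most of these connections to port 2379 should belong to the local kube-apiserver's watch connections against the external etcd cluster; a quick sketch to break them down by owning process:

netstat -pan | grep 2379 | awk '{print $NF}' | sort | uniq -c | sort -rn | head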