一、软件功能说明
1.MetricServer:是kubernetes集群资源使用情况的聚合器,收集数据给kubernetes集群内使用,如kubectl,hpa,scheduler等。
2.PrometheusOperator:是在kubernetes集群中以声明式方式部署和管理Prometheus、Alertmanager等监控组件的控制器。
3.NodeExporter:用于各node的关键度量指标状态数据。
4.KubeStateMetrics:收集kubernetes集群内资源对象(如Deployment、Pod等)的状态数据并以指标形式暴露,供Prometheus采集,可据此制定告警规则。
5.Prometheus:采用pull方式收集apiserver,scheduler,controller-manager,kubelet组件数据,通过http协议传输。
6.Grafana:是可视化数据统计和监控平台。
7.Alertmanager:实现短信或邮件报警。
二、程序安装部署
下载各程序部署文件
# mkdir /opt/kubernetes/monitor && cd /opt/kubernetes/monitor # git clone https://github.com/kubernetes-incubator/metrics-server.git # git clone https://github.com/mgxian/k8s-monitor.git
1.部署MetricServer
1)修改metrics-server-deployment.yaml文件为如下内容
# echo '' > metrics-server/deploy/1.8+/metrics-server-deployment.yaml # vi metrics-server/deploy/1.8+/metrics-server-deployment.yaml apiVersion: v1 kind: ServiceAccount metadata: name: metrics-server namespace: kube-system --- apiVersion: extensions/v1beta1 kind: Deployment metadata: name: metrics-server namespace: kube-system labels: k8s-app: metrics-server spec: selector: matchLabels: k8s-app: metrics-server template: metadata: name: metrics-server labels: k8s-app: metrics-server spec: serviceAccountName: metrics-server containers: - name: metrics-server image: mirrorgooglecontainers/metrics-server-amd64:v0.2.1 imagePullPolicy: Always volumeMounts: - mountPath: /opt/kubernetes/ssl name: ca-ssl command: - /metrics-server - --source=kubernetes.summary_api:'' - --requestheader-client-ca-file=/opt/kubernetes/ssl/ca.pem volumes: - name: ca-ssl hostPath: path: /opt/kubernetes/ssl
2)部署MetricServer
# kubectl create -f metrics-server/deploy/1.8+/ clusterrolebinding.rbac.authorization.k8s.io "metrics-server:system:auth-delegator" created rolebinding.rbac.authorization.k8s.io "metrics-server-auth-reader" created apiservice.apiregistration.k8s.io "v1beta1.metrics.k8s.io" created serviceaccount "metrics-server" created deployment.extensions "metrics-server" created service "metrics-server" created clusterrole.rbac.authorization.k8s.io "system:metrics-server" created clusterrolebinding.rbac.authorization.k8s.io "system:metrics-server" created
3)状态查看
# kubectl get svc -o wide --all-namespaces NAMESPACE NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE SELECTOR default kubernetes ClusterIP 10.1.0.1 <none> 443/TCP 18d <none> kube-system coredns ClusterIP 10.1.0.2 <none> 53/UDP,53/TCP 10d k8s-app=coredns kube-system kubernetes-dashboard NodePort 10.1.56.6 <none> 443:31944/TCP 14d k8s-app=kubernetes-dashboard kube-system metrics-server ClusterIP 10.1.79.15 <none> 443/TCP 15m k8s-app=metrics-server # kubectl get pods -n kube-system NAME READY STATUS RESTARTS AGE calico-kube-controllers-98989846-6th9k 1/1 Running 21 18d calico-node-bdhj4 2/2 Running 42 18d calico-node-wv9nb 2/2 Running 38 18d coredns-77c989547b-9p9fs 1/1 Running 5 10d coredns-77c989547b-k6g2c 1/1 Running 8 10d kubernetes-dashboard-66c9d98865-kdhpg 1/1 Running 12 14d metrics-server-6d6df698b9-7zscb 1/1 Running 0 16m
2.创建namespace并部署PrometheusOperator
# kubectl apply -f k8s-monitor/monitoring-namespace.yaml namespace "monitoring" created # kubectl apply -f k8s-monitor/prometheus-operator.yaml serviceaccount "prometheus-operator" created clusterrole.rbac.authorization.k8s.io "prometheus-operator" created clusterrolebinding.rbac.authorization.k8s.io "prometheus-operator" created deployment.apps "prometheus-operator" created service "prometheus-operator" created
2)状态查看
# kubectl get pod -n monitoring NAME READY STATUS RESTARTS AGE prometheus-operator-7d9fd546c4-bmjc4 1/1 Running 0 1h # kubectl get svc -n monitoring NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE prometheus-operator ClusterIP None <none> 8080/TCP 1h # kubectl get crd NAME AGE alertmanagers.monitoring.coreos.com 3m prometheuses.monitoring.coreos.com 3m prometheusrules.monitoring.coreos.com 3m servicemonitors.monitoring.coreos.com 3m
3.部署kubernetes组件服务
# kubectl apply -f k8s-monitor/kube-k8s-service.yaml service "kube-scheduler-prometheus-discovery" created service "kube-controller-manager-prometheus-discovery" created service "coredns-prometheus-discovery" created
状态查看
# kubectl get svc -n kube-system NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE coredns ClusterIP 10.1.0.2 <none> 53/UDP,53/TCP 10d coredns-prometheus-discovery ClusterIP None <none> 9153/TCP 13s kube-controller-manager-prometheus-discovery ClusterIP None <none> 10252/TCP 13s kube-scheduler-prometheus-discovery ClusterIP None <none> 10251/TCP 13s kubelet ClusterIP None <none> 10250/TCP 1m kubernetes-dashboard NodePort 10.1.56.6 <none> 443:31944/TCP 14d metrics-server ClusterIP 10.1.48.143 <none> 443/TCP 13m
4.部署NodeExporter
# kubectl apply -f k8s-monitor/node_exporter.yaml serviceaccount "node-exporter" created clusterrole.rbac.authorization.k8s.io "node-exporter" created clusterrolebinding.rbac.authorization.k8s.io "node-exporter" created daemonset.apps "node-exporter" created service "node-exporter" created
状态查看
# kubectl get pods -n monitoring NAME READY STATUS RESTARTS AGE node-exporter-767lz 2/2 Running 0 4s node-exporter-8t8wh 2/2 Running 0 4s prometheus-operator-7d9fd546c4-bmjc4 1/1 Running 0 2h # kubectl get svc -n monitoring NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE node-exporter ClusterIP None <none> 9100/TCP 20s prometheus-operator ClusterIP None <none> 8080/TCP 2h
5.部署KubeStateMetrics
# kubectl apply -f k8s-monitor/kube-state-metrics.yaml serviceaccount "kube-state-metrics" created role.rbac.authorization.k8s.io "kube-state-metrics" created rolebinding.rbac.authorization.k8s.io "kube-state-metrics" created clusterrole.rbac.authorization.k8s.io "kube-state-metrics" created clusterrolebinding.rbac.authorization.k8s.io "kube-state-metrics" created deployment.apps "kube-state-metrics" created service "kube-state-metrics" created
状态查看
# kubectl get pods -n monitoring NAME READY STATUS RESTARTS AGE kube-state-metrics-8545d67875-lwwd9 4/4 Running 0 1m node-exporter-767lz 2/2 Running 0 37m node-exporter-8t8wh 2/2 Running 0 37m prometheus-operator-7d9fd546c4-bmjc4 1/1 Running 0 2h # kubectl get svc -n monitoring NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE kube-state-metrics ClusterIP None <none> 8443/TCP,9443/TCP 2m node-exporter ClusterIP None <none> 9100/TCP 38m prometheus-operator ClusterIP None <none> 8080/TCP 2h
6.部署Prometheus
# kubectl apply -f k8s-monitor/prometheus.yaml serviceaccount "prometheus-k8s" created clusterrole.rbac.authorization.k8s.io "prometheus-k8s" created clusterrolebinding.rbac.authorization.k8s.io "prometheus-k8s" created prometheus.monitoring.coreos.com "k8s" created service "prometheus-k8s" created prometheusrule.monitoring.coreos.com "prometheus-k8s-rules" created
状态查看
# kubectl get pods -n monitoring NAME READY STATUS RESTARTS AGE kube-state-metrics-8545d67875-lwwd9 4/4 Running 0 59m node-exporter-767lz 2/2 Running 0 1h node-exporter-8t8wh 2/2 Running 0 1h prometheus-k8s-0 3/3 Running 1 1m prometheus-k8s-1 3/3 Running 0 1m prometheus-operator-7d9fd546c4-bmjc4 1/1 Running 0 3h # kubectl get svc -n monitoring NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE kube-state-metrics ClusterIP None <none> 8443/TCP,9443/TCP 59m node-exporter ClusterIP None <none> 9100/TCP 1h prometheus-k8s NodePort 10.1.31.15 <none> 9090:30172/TCP 1m prometheus-operated ClusterIP None <none> 9090/TCP 54m prometheus-operator ClusterIP None <none> 8080/TCP 3h
1)配置数据收集
# kubectl apply -f k8s-monitor/kube-servicemonitor.yaml servicemonitor.monitoring.coreos.com "kube-apiserver" created servicemonitor.monitoring.coreos.com "kubelet" created servicemonitor.monitoring.coreos.com "kube-controller-manager" created servicemonitor.monitoring.coreos.com "kube-scheduler" created servicemonitor.monitoring.coreos.com "coredns" created servicemonitor.monitoring.coreos.com "kube-state-metrics" created servicemonitor.monitoring.coreos.com "node-exporter" created servicemonitor.monitoring.coreos.com "prometheus-operator" created servicemonitor.monitoring.coreos.com "prometheus" created
状态查看
# kubectl get servicemonitors -n monitoring NAME AGE coredns 8s kube-apiserver 8s kube-controller-manager 8s kube-scheduler 8s kube-state-metrics 8s kubelet 8s node-exporter 8s prometheus 7s prometheus-operator 8s
2)查看Prometheus中的数据
查看Prometheus页面访问端口
# echo $(kubectl get svc -n monitoring | grep prometheus-k8s | awk '{print $(NF-1)}' | cut -d ':' -f 2 | cut -d '/' -f 1) 30172
注:可使用任一node主机IP加此端口号以HTTP方式访问
3)Prometheus主页
7.部署Grafana
# kubectl apply -f k8s-monitor/grafana.yaml secret "grafana-datasources" created serviceaccount "grafana" created configmap "grafana-dashboards" created configmap "grafana-dashboard-k8s-cluster-rsrc-use" created configmap "grafana-dashboard-k8s-node-rsrc-use" created configmap "grafana-dashboard-k8s-resources-cluster" created configmap "grafana-dashboard-k8s-resources-namespace" created configmap "grafana-dashboard-k8s-resources-pod" created configmap "grafana-dashboard-nodes" created configmap "grafana-dashboard-pods" created configmap "grafana-dashboard-statefulset" created deployment.apps "grafana" created service "grafana" created
1)状态查看
# kubectl get pods -n monitoring NAME READY STATUS RESTARTS AGE grafana-5b68464b84-b9qtz 1/1 Running 0 4m # kubectl get svc -n monitoring NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE alertmanager-main NodePort 10.1.139.227 <none> 9093:31953/TCP 19h alertmanager-operated ClusterIP None <none> 9093/TCP,6783/TCP 19h grafana NodePort 10.1.199.80 <none> 3000:30809/TCP 20h kube-state-metrics ClusterIP None <none> 8443/TCP,9443/TCP 21h node-exporter ClusterIP None <none> 9100/TCP 22h prometheus-k8s NodePort 10.1.31.15 <none> 9090:30172/TCP 20h prometheus-operated ClusterIP None <none> 9090/TCP 21h prometheus-operator ClusterIP None <none> 8080/TCP 1d # kubectl get svc -n monitoring | grep grafana grafana NodePort 10.1.199.80 <none> 3000:30809/TCP 5m
2)查看Grafana页面访问端口
# echo $(kubectl get svc -n monitoring | grep grafana | awk '{print $(NF-1)}' | cut -d ':' -f 2 | cut -d '/' -f 1) 30809
注:可使用任一node主机IP加此端口号以HTTP方式访问
3)Grafana主页面,默认用户名/密码,admin/admin
4)集群状态页面
5)集群状态以命名空间视角页面
6)POD信息页面
8.部署Alertmanager
# kubectl apply -f k8s-monitor/alertmanager.yaml serviceaccount "alertmanager-main" created secret "alertmanager-main" created alertmanager.monitoring.coreos.com "main" created service "alertmanager-main" created servicemonitor.monitoring.coreos.com "alertmanager" created
1)状态查看
# kubectl get pods -n monitoring NAME READY STATUS RESTARTS AGE alertmanager-main-0 2/2 Running 0 17s alertmanager-main-1 2/2 Running 0 11s # kubectl get svc -n monitoring NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE alertmanager-main NodePort 10.1.139.227 <none> 9093:31953/TCP 1m alertmanager-operated ClusterIP None <none> 9093/TCP,6783/TCP 1m # kubectl get svc -n monitoring | grep alertmanager-main alertmanager-main NodePort 10.1.139.227 <none> 9093:31953/TCP 20h
2)Alertmanager页面查看访问端口
# echo $(kubectl get svc -n monitoring | grep alertmanager-main | awk '{print $(NF-1)}' | cut -d ':' -f 2 | cut -d '/' -f 1) 31953
注:可使用任一node主机IP加此端口号以HTTP方式访问
3)Alertmanager主页面