zoukankan      html  css  js  c++  java
  • k8s部署prometheus

    采集方案

    • 通过prometheus-node-exporter采集主机的性能指标数据,并通过暴露的 /metrics 接口用prometheus抓取
    •  
    • 通过kube-apiserver、kube-controller-manager、kube-scheduler、etcd、kubelet、kube-proxy自身暴露的 /metrics 获取节点上与k8s集群相关的一些指标数据
    •  
    • 通过cadvisor采集容器、Pod相关的性能指标数据,并通过暴露的 /metrics 接口用prometheus抓取
    •  
    • 通过blackbox-exporter采集应用的网络性能(http、tcp、icmp等)数据,并通过暴露的 /metrics 接口用prometheus抓取
    •  
    • 通过kube-state-metrics采集k8s资源对象的状态指标数据,并通过暴露的 /metrics 接口用prometheus抓取
       
    • 应用自己采集容器中进程主动暴露的指标数据(暴露指标的功能由应用自己实现,并添加约定的annotation,prometheus负责根据annotation实现抓取)

    抓取介绍

    Kubernetes可以约定好带哪些annotation前缀的服务是自主暴露监控指标的服务。应用添加约定的这些annotations,Prometheus可以根据annotation实现抓取。例如:

    • prometheus.io/scrape: 'true' 获知对应的endpoint是需要被scrape的
    • prometheus.io/app-metrics: 'true' 获知对应的endpoint中有应用进程暴露的metrics
    • prometheus.io/app-metrics-port: '8080' 获知进程暴露的metrics的端口
    • prometheus.io/app-metrics-path: '/metrics' 获知进程暴露的metrics的具体路径

    应用可以在service中指定约定的annotation,实现Prometheus对该应用的网络服务进行探测:

    http探测:
        prometheus.io/scrape: 'true'
        prometheus.io/http-probe: 'true'
        prometheus.io/http-probe-port: '8080'
        prometheus.io/http-probe-path: '/healthz'
     
    tcp探测:
        prometheus.io/scrape: 'true'
        prometheus.io/tcp-probe: 'true'
        prometheus.io/tcp-probe-port: '80'

    Prometheus根据这些annotations可以获知相应service是需要被探测的,探测的网络协议可以是http、tcp或其他,以及具体的探测端口。http探测需要知道探测的具体url。

    mkdir -p /kubernetes/promethues/{node-exporter,k8s,kube-state-metrics,blackbox-exporter,email,alertmanager,prometheus,grafana}
    
    #创建名称空间
    kubectl create namespace prometheus

    node-exporter

    [root@k8s-master prometheus]# cat node-exporter/node-exporter.yaml 
    apiVersion: v1
    kind: Service
    metadata:
      name: node-exporter
      namespace: prometheus
      labels:
        app: node-exporter
    spec:
      selector:
        app: node-exporter
      ports:
      - name: node-exporter
        port: 9100
        protocol: TCP
        targetPort: 9100
     
    ---
    apiVersion: apps/v1
    kind: DaemonSet
    metadata:
      name: node-exporter
      namespace: prometheus
      labels:
        app: node-exporter
    spec:
      selector:
        matchLabels:
          app: node-exporter
      template:
        metadata:
          name: node-exporter
          labels:
            app: node-exporter
        spec:
          containers:
          - name: node-exporter
            image: prom/node-exporter:latest
            imagePullPolicy: IfNotPresent
            ports:
            - containerPort: 9100
              hostPort: 9100
          tolerations:
          - key: node-role.kubernetes.io/master
            operator: Exists
            effect: NoSchedule

    k8s组件

    controller-manager:

    [root@k8s-master prometheus]# cat k8s/kube-controller-manager-prometheus-discovery.yaml 
    apiVersion: v1
    kind: Service
    metadata:
      name: kube-controller-manager-prometheus-discovery
      namespace: prometheus
      labels:
        component: kube-controller-manager
      annotations:
        prometheus.io/scrape: 'true'
    spec:
      selector:
        component: kube-controller-manager
      ports:
      - name: http-metrics
        port: 10252
        targetPort: 10252
        protocol: TCP
      clusterIP: None

    kube-scheduler:

    [root@k8s-master prometheus]# cat k8s/kube-scheduler-prometheus-discovery.yaml 
    apiVersion: v1
    kind: Service
    metadata:
      name: kube-scheduler-prometheus-discovery
      namespace: prometheus
      labels:
        component: kube-scheduler
      annotations:
        prometheus.io/scrape: 'true'
    spec:
      selector:
        component: kube-scheduler
      ports:
      - name: http-metrics
        port: 10251
        protocol: TCP
        targetPort: 10251
      clusterIP: None

    kube-proxy:

    [root@k8s-master prometheus]# cat k8s/kube-proxy-prometheus-discovery.yaml 
    apiVersion: v1
    kind: Service
    metadata:
      name: kube-proxy-prometheus-discovery
      namespace: prometheus
      labels:
        k8s-app: kube-proxy
      annotations:
        prometheus.io/scrape: 'true'
    spec:
      selector:
        k8s-app: kube-proxy
      ports:
      - name: http-metrics
        port: 10249
        protocol: TCP
        targetPort: 10249
      clusterIP: None

    kube-state-metrics

    rbac.yaml

    [root@k8s-master prometheus]# cat kube-state-metrics/rbac.yaml 
    apiVersion: v1
    kind: ServiceAccount
    metadata:
      name: kube-state-metrics
      namespace: prometheus
      labels:
        app: kube-state-metrics
     
    --- 
    apiVersion: rbac.authorization.k8s.io/v1
    kind: ClusterRole
    metadata:
      name: kube-state-metrics
      labels:
        app: kube-state-metrics
    rules:
    - apiGroups:
      - ""
      resources:
      - configmaps
      - secrets
      - nodes
      - pods
      - services
      - resourcequotas
      - replicationcontrollers
      - limitranges
      - persistentvolumeclaims
      - persistentvolumes
      - namespaces
      - endpoints
      verbs:
      - list
      - watch
    - apiGroups:
      - extensions
      resources:
      - daemonsets
      - deployments
      - replicasets
      - ingresses
      verbs:
      - list
      - watch
    - apiGroups:
      - apps
      resources:
      - statefulsets
      - daemonsets
      - deployments
      - replicasets
      verbs:
      - list
      - watch
    - apiGroups:
      - batch
      resources:
      - cronjobs
      - jobs
      verbs:
      - list
      - watch
    - apiGroups:
      - autoscaling
      resources:
      - horizontalpodautoscalers
      verbs:
      - list
      - watch
    - apiGroups:
      - authentication.k8s.io
      resources:
      - tokenreviews
      verbs:
      - create
    - apiGroups:
      - authorization.k8s.io
      resources:
      - subjectaccessreviews
      verbs:
      - create
    - apiGroups:
      - policy
      resources:
      - poddisruptionbudgets
      verbs:
      - list
      - watch
    - apiGroups:
      - certificates.k8s.io
      resources:
      - certificatesigningrequests
      verbs:
      - list
      - watch
    - apiGroups:
      - storage.k8s.io
      resources:
      - storageclasses
      - volumeattachments
      verbs:
      - list
      - watch
    - apiGroups:
      - admissionregistration.k8s.io
      resources:
      - mutatingwebhookconfigurations
      - validatingwebhookconfigurations
      verbs:
      - list
      - watch
    - apiGroups:
      - networking.k8s.io
      resources:
      - networkpolicies
      verbs:
      - list
      - watch
    - apiGroups:
      - coordination.k8s.io
      resources:
      - leases
      verbs:
      - list
      - watch
      
    ---
    apiVersion: rbac.authorization.k8s.io/v1
    kind: ClusterRoleBinding
    metadata:
      name: kube-state-metrics
      labels:
        app: kube-state-metrics
    roleRef:
      apiGroup: rbac.authorization.k8s.io
      kind: ClusterRole
      name: kube-state-metrics
    subjects:
    - kind: ServiceAccount
      name: kube-state-metrics
      namespace: prometheus

    kube-state-metrics.yaml

    [root@k8s-master prometheus]# cat kube-state-metrics/kube-state-metrics.yaml 
    apiVersion: v1
    kind: Service
    metadata:
      name: kube-state-metrics
      namespace: prometheus
      labels:
        app: kube-state-metrics
      annotations:
        prometheus.io/scrape: 'true'
        prometheus.io/http-probe: 'true'
        prometheus.io/http-probe-path: '/healthz'
        prometheus.io/http-probe-port: '8080'
    spec:
      selector:
        app: kube-state-metrics
      ports:
      - name: kube-state-metrics
        port: 8080
        protocol: TCP
        targetPort: 8080
       
    ---
    apiVersion: apps/v1
    kind: Deployment
    metadata:
      name: kube-state-metrics
      namespace: prometheus
      labels:
        app: kube-state-metrics
    spec:
      replicas: 1
      selector:
        matchLabels:
          app: kube-state-metrics
      template:
        metadata:
          labels:
            app: kube-state-metrics
        spec:
          serviceAccountName: kube-state-metrics
          containers:
          - name: kube-state-metrics
            image: quay.io/coreos/kube-state-metrics:v1.9.8     # kube-state-metrics:v1.9.7 适用于Kubernetes 1.16以上版本
            imagePullPolicy: IfNotPresent
            ports:
            - containerPort: 8080
          tolerations:
          - key: node-role.kubernetes.io/master
            operator: Exists
            effect: NoSchedule

    blackbox-exporter

    blackbox-exporter是一个黑盒探测工具,可以对服务的http、tcp、icmp等进行网络探测。

    config.yaml

    [root@k8s-master prometheus]# cat blackbox-exporter/config.yaml 
    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: blackbox-exporter
      namespace: prometheus
      labels:
        app: blackbox-exporter
    data:
      blackbox.yml: |-
        modules:
          http_2xx:
            prober: http
            timeout: 10s
            http:
              valid_http_versions: ["HTTP/1.1", "HTTP/2"]
              valid_status_codes: []
              method: GET
              preferred_ip_protocol: "ip4"
          http_post_2xx:
            prober: http
            timeout: 10s
            http:
              valid_http_versions: ["HTTP/1.1", "HTTP/2"]
              method: POST
              preferred_ip_protocol: "ip4"
          tcp_connect:
            prober: tcp
            timeout: 10s
          icmp:
            prober: icmp
            timeout: 10s
            icmp:
              preferred_ip_protocol: "ip4"

    blackbox-exporter.yaml

    [root@k8s-master prometheus]# cat blackbox-exporter/blackbox-exporter.yaml 
    apiVersion: v1
    kind: Service
    metadata:
      name: blackbox-exporter
      namespace: prometheus
      labels:
        app: blackbox-exporter
      annotations:
        prometheus.io/scrape: 'true'
    spec:
      selector:
        app: blackbox-exporter
      ports:
      - name: blackbox
        port: 9115
        protocol: TCP
        targetPort: 9115
        nodePort: 30115
      type: NodePort
      
    ---  
    apiVersion: apps/v1
    kind: Deployment
    metadata:
      name: blackbox-exporter
      namespace: prometheus
    spec:
      replicas: 1
      selector:
        matchLabels:
          app: blackbox-exporter
      template:
        metadata:
          labels:
            app: blackbox-exporter
        spec:
          containers:
          - name: blackbox-exporter
            image: prom/blackbox-exporter:latest
            imagePullPolicy: IfNotPresent
            ports:
            - containerPort: 9115
            readinessProbe:
              tcpSocket:
                port: 9115
              initialDelaySeconds: 10
              timeoutSeconds: 5
            volumeMounts:
            - name: config
              mountPath: /etc/blackbox_exporter
            args:
            - '--config.file=/etc/blackbox_exporter/blackbox.yml'
            - '--web.listen-address=:9115'
          volumes:
          - name: config
            configMap:
              name: blackbox-exporter
          nodeSelector:
            node-role.kubernetes.io/master: ""
          tolerations:
          - key: node-role.kubernetes.io/master
            operator: Exists
            effect: NoSchedule

    注意:
    blackbox-exporter的配置文件为 /etc/blackbox_exporter/blackbox.yml, 运行时可以动态重载配置文件,当重新加载配置文件失败时,不影响运行中的配置。

    重载方式:curl -XPOST http://ip:9115/-/reload

    alertmanager

    templates.yaml

    [root@k8s-master prometheus]# cat alertmanager/templates.yaml 
    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: alertmanager-templates
      namespace: prometheus
    data:
      mail.tmpl: |
        {{ define "default-monitor.html" }}
        {{- if gt (len .Alerts.Firing) 0 -}}{{ range .Alerts }}
        @警报
        <pre>
          类型: {{ .Labels.alertname }}
          实例: {{ .Labels.instance }}
          信息: {{ .Annotations.summary }}
          详情: {{ .Annotations.description }}
          时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
        </pre>
        {{ end }}{{ end -}}
        {{- if gt (len .Alerts.Resolved) 0 -}}{{ range .Alerts }}
        @恢复
        <pre>
          类型: {{ .Labels.alertname }}
          实例: {{ .Labels.instance }}
          信息: {{ .Annotations.summary }}
          时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
          恢复: {{ .EndsAt.Format "2006-01-02 15:04:05" }}
        </pre>
        {{ end }}{{ end -}}
        {{- end }}

    config.yaml

    [root@k8s-master prometheus]# cat alertmanager/config.yaml 
    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: alertmanager-config
      namespace: prometheus
    data:
      config.yml: |
        global:
          resolve_timeout: 5m
          smtp_smarthost: 'smtp.qq.com:465'                #邮箱smtp服务器代理,启用SSL发信, 端口一般是465
          smtp_from: 'xx@qq.com'                #发送邮箱名称
          smtp_auth_username: 'xx@qq.com'               #邮箱名称
          smtp_auth_password: 'xxxx'                #邮箱密码或授权码
          smtp_require_tls: false
        templates:
        - '/etc/templates/*.tmpl'
        route:
          receiver: default-receiver
          group_wait: 10s
          group_interval: 1m
          repeat_interval: 1h
          group_by: ['alertname']
        receivers:
        - name: 'default-receiver'
          email_configs:
          - to: 'xx@qq.com'
            send_resolved: true
            html: '{{ template "default-monitor.html" }}'
            headers: {Subject: "[WARN] 报警邮件 test"}

    alertmanager.yaml

    [root@k8s-master prometheus]# cat alertmanager/alertmanager.yaml 
    apiVersion: v1
    kind: Service
    metadata:
      name: alertmanager
      namespace: prometheus
      labels:
        name: alertmanager
      annotations:
        prometheus.io/scrape: 'true'
    spec:
      type: NodePort
      selector:
        app: alertmanager
      ports:
      - name: alertmanager
        port: 9093
        protocol: TCP
        targetPort: 9093
           
    ---
    apiVersion: apps/v1
    kind: Deployment
    metadata:
      name: alertmanager
      namespace: prometheus
    spec:
      replicas: 1
      selector:
        matchLabels:
          app: alertmanager
      template:
        metadata:
          name: alertmanager
          labels:
            app: alertmanager
        spec:
          containers:
          - name: alertmanager
            image: prom/alertmanager:latest
            imagePullPolicy: IfNotPresent
            ports:
            - containerPort: 9093
            args:
              - "--config.file=/etc/alertmanager/config.yml"
              - "--storage.path=/alertmanager"
            volumeMounts:
            - name: config
              mountPath: /etc/alertmanager
            - name: templates
              mountPath: /etc/templates
            - name: alertmanager
              mountPath: /alertmanager
          volumes:
          - name: config
            configMap:
              name: alertmanager-config
          - name: templates
            configMap:
              name: alertmanager-templates
          - name: alertmanager
            emptyDir: {}

    prometheus

    rbac.yaml

    [root@k8s-master prometheus]# cat prometheus/rbac.yaml 
    apiVersion: v1
    kind: ServiceAccount
    metadata:
      name: prometheus
      namespace: prometheus
     
    ---
    apiVersion: rbac.authorization.k8s.io/v1
    kind: ClusterRole
    metadata:
      name: prometheus
    rules:
    - apiGroups: [""]
      resources:
      - nodes
      - nodes/proxy
      - services
      - endpoints
      - pods
      verbs: ["get", "list", "watch"]
    - apiGroups: ["networking.k8s.io"]
      resources:
      - ingresses
      verbs: ["get", "list", "watch"]
    - apiGroups: [""]
      resources:
      - configmaps
      verbs: ["get"]
    - nonResourceURLs: ["/metrics"]
      verbs: ["get"]
     
    ---
    apiVersion: rbac.authorization.k8s.io/v1
    kind: ClusterRoleBinding
    metadata:
      name: prometheus
    roleRef:
      apiGroup: rbac.authorization.k8s.io
      kind: ClusterRole
      name: prometheus
    subjects:
    - kind: ServiceAccount
      name: prometheus
      namespace: prometheus

    config.yaml

    [root@k8s-master prometheus]# cat prometheus/config.yaml 
    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: prometheus-config
      namespace: prometheus
    data:
      prometheus.yml: |
        global:
          scrape_interval: 10s
          scrape_timeout: 10s
          evaluation_interval: 10s
        alerting:
          alertmanagers:
          - static_configs:
            - targets:
              - alertmanager:9093
        rule_files:
          - "/etc/prometheus-rules/*.rules"
        scrape_configs:
          - job_name: 'node-exporter'                #node节点性能指标数据
            tls_config:
              ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
            kubernetes_sd_configs:
            - role: endpoints
            relabel_configs:
            - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape, __meta_kubernetes_endpoint_port_name]
              regex: true;node-exporter
              action: keep
            - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
              action: replace
              target_label: __scheme__
              regex: (https?)
            - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
              action: replace
              target_label: __metrics_path__
              regex: (.+)
            - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
              action: replace
              target_label: __address__
              regex: (.+)(?::d+);(d+)
              replacement: $1:$2
            - action: labelmap
              regex: __meta_kubernetes_service_label_(.+)
            - source_labels: [__meta_kubernetes_namespace]
              action: replace
              target_label: kubernetes_namespace
            - source_labels: [__meta_kubernetes_service_name]
              action: replace
              target_label: kubernetes_name
              
          - job_name: 'kube-apiservers'
            scheme: https
            tls_config:
              ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
            kubernetes_sd_configs:
            - role: endpoints
            relabel_configs:
            - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
              regex: default;kubernetes;https
              action: keep
            
          - job_name: 'kube-controller-manager'
            tls_config:
              ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
            kubernetes_sd_configs:
            - role: endpoints
            relabel_configs:
            - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape, __meta_kubernetes_namespace, __meta_kubernetes_service_name]
              regex: true;kube-system;kube-controller-manager-prometheus-discovery
              action: keep
              
          - job_name: 'kube-scheduler'
            tls_config:
              ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
            kubernetes_sd_configs:
            - role: endpoints
            relabel_configs:
            - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape, __meta_kubernetes_namespace, __meta_kubernetes_service_name]
              regex: true;kube-system;kube-scheduler-prometheus-discovery
              action: keep
              
          - job_name: 'kubelet'
            scheme: https
            tls_config:
              ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
            kubernetes_sd_configs:
            - role: node
            relabel_configs:
            - action: labelmap
              regex: __meta_kubernetes_node_label_(.+)
            - target_label: __address__
              replacement: 10.150.90.242:6443
            - source_labels: [__meta_kubernetes_node_name]
              regex: (.+)
              target_label: __metrics_path__
              replacement: /api/v1/nodes/${1}/proxy/metrics
          - job_name: 'kube-proxy'
            tls_config:
              ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
            kubernetes_sd_configs:
            - role: endpoints
            relabel_configs:
            - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape, __meta_kubernetes_namespace, __meta_kubernetes_service_name]
              regex: true;kube-system;kube-proxy-prometheus-discovery
              action: keep
     
          - job_name: 'kubernetes-cadvisor'                #容器、Pod相关的性能指标数据
            scheme: https
            tls_config:
              ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
            kubernetes_sd_configs:
            - role: node
            relabel_configs:
            - action: labelmap
              regex: __meta_kubernetes_node_label_(.+)
            - target_label: __address__
              replacement: 10.150.90.242:6443
            - source_labels: [__meta_kubernetes_node_name]
              regex: (.+)
              target_label: __metrics_path__
              replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
            metric_relabel_configs:
            - source_labels: [id]
              action: replace
              regex: '^/machine.slice/machine-rkt\x2d([^\]+)\.+/([^/]+).service$'
              target_label: rkt_container_name
              replacement: '${2}-${1}'
            - source_labels: [id]
              action: replace
              regex: '^/system.slice/(.+).service$'
              target_label: systemd_service_name
              replacement: '${1}'
              
          - job_name: 'kube-state-metrics'              #资源对象(Deployment、Pod等)的状态
            tls_config:
              ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
            kubernetes_sd_configs:
            - role: endpoints
            relabel_configs:
            - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape, __meta_kubernetes_endpoint_port_name]
              regex: true;kube-state-metrics
              action: keep
            - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
              action: replace
              target_label: __scheme__
              regex: (https?)
            - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
              action: replace
              target_label: __metrics_path__
              regex: (.+)
            - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
              action: replace
              target_label: __address__
              regex: (.+)(?::d+);(d+)
              replacement: $1:$2
            - action: labelmap
              regex: __meta_kubernetes_service_label_(.+)
            - source_labels: [__meta_kubernetes_namespace]
              action: replace
              target_label: kubernetes_namespace
            - source_labels: [__meta_kubernetes_service_name]
              action: replace
              target_label: kubernetes_name
              
          - job_name: 'kubernetes-service-http-probe'               #通过http方式探测Service状态
            tls_config:
              ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
            kubernetes_sd_configs:
            - role: service
            metrics_path: /probe
            params:
              module: [http_2xx]
            relabel_configs:
            - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape, __meta_kubernetes_service_annotation_prometheus_io_http_probe]
              regex: true;true
              action: keep
            - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_namespace, __meta_kubernetes_service_annotation_prometheus_io_http_probe_port, __meta_kubernetes_service_annotation_prometheus_io_http_probe_path]
              action: replace
              target_label: __param_target
              regex: (.+);(.+);(.+);(.+)
              replacement: $1.$2:$3$4
            - target_label: __address__
              replacement: 10.150.90.242:30115
            - source_labels: [__param_target]
              target_label: instance
            - action: labelmap
              regex: __meta_kubernetes_service_annotation_prometheus_io_app_info_(.+)
     
          - job_name: 'kubernetes-service-tcp-probe'                #通过tcp方式探测Service状态
            tls_config:
              ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
            kubernetes_sd_configs:
            - role: service
            metrics_path: /probe
            params:
              module: [tcp_connect]
            relabel_configs:
            - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape, __meta_kubernetes_service_annotation_prometheus_io_tcp_probe]
              regex: true;true
              action: keep
            - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_namespace, __meta_kubernetes_service_annotation_prometheus_io_tcp_probe_port]
              action: replace
              target_label: __param_target
              regex: (.+);(.+);(.+)
              replacement: $1.$2:$3
            - target_label: __address__
              replacement: 10.150.90.242:30115
            - source_labels: [__param_target]
              target_label: instance
            - action: labelmap
              regex: __meta_kubernetes_service_annotation_prometheus_io_app_info_(.+)
              
          - job_name: 'kubernetes-ingresses'              #通过http方式探测ingresses状态
            kubernetes_sd_configs:
            - role: ingress
            metrics_path: /probe
            params:
              module: [http_2xx]
            relabel_configs:
            - source_labels: [__meta_kubernetes_ingress_scheme, __address__, __meta_kubernetes_ingress_path]
              regex: (.+);(.+);(.+)
              replacement: ${1}://${2}${3}
              target_label: __param_target
            - target_label: __address__
              replacement: 10.150.90.242:30115
            - source_labels: [__param_target]
              target_label: instance
            - action: labelmap
              regex: __meta_kubernetes_ingress_label_(.+)
            - source_labels: [__meta_kubernetes_namespace]
              target_label: kubernetes_namespace
            - source_labels: [__meta_kubernetes_ingress_name]
              target_label: kubernetes_name

    rules.yaml

    [root@k8s-master prometheus]# cat prometheus/rules.yaml 
    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: prometheus-rules
      namespace: prometheus
    data:
      node.rules: |
        groups:
        - name: node
          rules:
          - alert: NodeDown
            expr: up == 0
            for: 3m
            labels:
              severity: critical
            annotations:
              summary: "{{ $labels.instance }}: down"
              description: "{{ $labels.instance }} has been down for more than 3m"
              value: "{{ $value }}"
          - alert: NodeCPUHigh
            expr: (1 - avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100 > 75
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "{{$labels.instance}}: High CPU usage"
              description: "{{$labels.instance}}: CPU usage is above 75%"
              value: "{{ $value }}"
     
          - alert: NodeCPUIowaitHigh
            expr: avg by (instance) (irate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 50
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "{{$labels.instance}}: High CPU iowait usage"
              description: "{{$labels.instance}}: CPU iowait usage is above 50%"
              value: "{{ $value }}"
     
          - alert: NodeMemoryUsageHigh
            expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "{{$labels.instance}}: High memory usage"
              description: "{{$labels.instance}}: Memory usage is above 90%"
              value: "{{ $value }}"
     
          - alert: NodeDiskRootLow
            expr: (1 - node_filesystem_avail_bytes{fstype=~"ext.*|xfs",mountpoint ="/"} / node_filesystem_size_bytes{fstype=~"ext.*|xfs",mountpoint ="/"}) * 100 > 80
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: "{{$labels.instance}}: Low disk(the / partition) space"
              description: "{{$labels.instance}}: Disk(the / partition) usage is above 80%"
              value: "{{ $value }}"
            
          - alert: NodeDiskBootLow
            expr: (1 - node_filesystem_avail_bytes{fstype=~"ext.*|xfs",mountpoint ="/boot"} / node_filesystem_size_bytes{fstype=~"ext.*|xfs",mountpoint ="/boot"}) * 100 > 80
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: "{{$labels.instance}}: Low disk(the /boot partition) space"
              description: "{{$labels.instance}}: Disk(the /boot partition) usage is above 80%"
              value: "{{ $value }}"
     
          - alert: NodeLoad5High
            expr: (node_load5) > (count by (instance) (node_cpu_seconds_total{mode='system'}) * 2)
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "{{$labels.instance}}: Load(5m) High"
              description: "{{$labels.instance}}: Load(5m) is 2 times the number of CPU cores"
              value: "{{ $value }}"

    prometheus.yaml

    [root@k8s-master prometheus]# cat prometheus/prometheus.yaml 
    apiVersion: v1
    kind: Service
    metadata:
      name: prometheus
      namespace: prometheus
      labels:
        app: prometheus
      annotations:
        prometheus.io/scrape: 'true'
    spec:
      selector:
        app: prometheus
      type: NodePort
      ports:
      - name: prometheus
        port: 9090
        protocol: TCP
        targetPort: 9090
              
    ---
    apiVersion: apps/v1
    kind: Deployment
    metadata:
      name: prometheus
      namespace: prometheus
      labels:
        app: prometheus
    spec:
      replicas: 1
      selector:
        matchLabels:
          app: prometheus
      template:
        metadata:
          name: prometheus
          labels:
            app: prometheus
        spec:
          serviceAccountName: prometheus
          containers:
          - name: prometheus
            image: prom/prometheus:latest
            imagePullPolicy: IfNotPresent
            args:
              - '--storage.tsdb.path=/prometheus'
              - '--storage.tsdb.retention.time=30d'
              - '--config.file=/etc/prometheus/prometheus.yml'
            ports:
            - containerPort: 9090
            volumeMounts:
            - name: config
              mountPath: /etc/prometheus
            - name: rules
              mountPath: /etc/prometheus-rules
            - name: prometheus
              mountPath: /prometheus
          volumes:
          - name: config
            configMap:
              name: prometheus-config
          - name: rules
            configMap:
              name: prometheus-rules
          - name: prometheus
            emptyDir: {}

    grafana

    secret.yaml

            emptyDir: {}
    [root@k8s-master prometheus]# cat grafana/secret.yaml 
    apiVersion: v1
    kind: Secret
    metadata:
      name: grafana
      namespace: prometheus
    data:
      admin-password: YWRtaW4=              # base64 加解密
      admin-username: YWRtaW4=
    type: Opaque

    grafana.yaml

    [root@k8s-master prometheus]# cat grafana/grafana.yaml 
    apiVersion: v1
    kind: Service
    metadata:
      name: grafana
      namespace: prometheus
      labels:
        app: grafana
      annotations:
        prometheus.io/scrape: 'true'
        prometheus.io/path: '/metrics'
    spec:
      selector:
        app: grafana
      type: NodePort
      ports:
      - name: grafana
        port: 3000
        protocol: TCP
        targetPort: 3000
     
    ---
    apiVersion: apps/v1
    kind: Deployment
    metadata:
      name: grafana
      namespace: prometheus
      labels:
        app: grafana
    spec:
      replicas: 1
      selector:
        matchLabels:
          app: grafana
      template:
        metadata:
          labels:
            app: grafana
        spec:
          containers:
          - name: grafana
            image: grafana/grafana:latest
            imagePullPolicy: IfNotPresent
            ports:
            - containerPort: 3000
              name: grafana
            env:
              - name: GF_AUTH_BASIC_ENABLED
                value: "true"
              - name: GF_AUTH_ANONYMOUS_ENABLED
                value: "false"
              - name: GF_AUTH_ANONYMOUS_ORG_ROLE
                value: Admin
              - name: GF_DASHBOARDS_JSON_ENABLED
                value: "true"
              - name: GF_INSTALL_PLUGINS
                value: grafana-kubernetes-app               #安装grafana-kubernetes-app插件
              - name: GF_SECURITY_ADMIN_USER
                valueFrom:
                  secretKeyRef:
                    name: grafana
                    key: admin-username
              - name: GF_SECURITY_ADMIN_PASSWORD
                valueFrom:
                  secretKeyRef:
                    name: grafana
                    key: admin-password
            readinessProbe:
              httpGet:
                path: /login
                port: 3000
              initialDelaySeconds: 10
              timeoutSeconds: 5
            volumeMounts:
            - name: grafana-storage
              mountPath: /var/lib/grafana
          volumes:
          - name: grafana-storage
            emptyDir: {}

    注:仅是本人测试用 没有使用到ingress,以下为别人提供的一份监控指标,仅供参考

    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: prometheus-config-tmpl
      namespace: thanos
    data:
      prometheus.yaml.tmpl: |-
        global:
          scrape_interval: 30s
          evaluation_interval: 30s
          external_labels:
            cluster: prometheus-ha
            prometheus_replica: $(POD_NAME)
        alerting:
          alertmanagers:
            - static_configs:
              - targets: ["alertmanager"]
        remote_write:
          - url: http://thanos-receive.thanos.svc.cluster.local:19291/api/v1/receive
        rule_files:
        - /etc/prometheus/rules/*rules.yaml
        scrape_configs:
    
        - job_name: ack-node-export
          honor_timestamps: true
          scrape_interval: 30s
          scrape_timeout: 10s
          metrics_path: /metrics
          scheme: http
          file_sd_configs:
          - files:
            - /prometheus/config/ack-node.json
            refresh_interval: 5m
          relabel_configs:
          - source_labels: [__address__]
            separator: ;
            regex: (.*?):(d+)
            target_label: Bsc_host_ip
            replacement: $1
            action: replace
    
        - job_name: filebeat-node
          honor_timestamps: true
          scrape_interval: 30s
          scrape_timeout: 10s
          metrics_path: /metrics
          scheme: http
          file_sd_configs:
          - files:
            - /prometheus/config/filebeat.json
            refresh_interval: 5m
          relabel_configs:
          - source_labels: [__address__]
            separator: ;
            regex: (.*?):(d+)
            target_label: Bsc_host_ip
            replacement: $1
            action: replace        
    
        - job_name: 'kube-state-metrics'
          kubernetes_sd_configs:
          - role: endpoints
          relabel_configs:
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
            action: replace
            target_label: __scheme__
            regex: (https?)
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
            action: replace
            target_label: __address__
            regex: ([^:]+)(?::d+)?;(d+)
            replacement: $1:$2
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_service_name]
            regex: kube-state-metrics
            target_label: service_name
            replacement: $1
            action: keep
          - source_labels: [__meta_kubernetes_endpoint_address_target_kind,__meta_kubernetes_endpoint_node_name]
            target_label: Bsc_cloud_region_env_type
            replacement: $1$2
          - source_labels: [__meta_kubernetes_namespace]
            target_label: Bsc_project_name
          - source_labels: [__meta_kubernetes_pod_host_ip]
            target_label: Bsc_host_ip
          - source_labels: [__meta_kubernetes_endpoints_name]
            target_label: Bsc_job
          - regex:  
            replacement: kube-state-metrics
            target_label: Bsc_k8s_namespace
          - regex:  
            replacement: kube-state-metrics
            target_label: Bsc_k8s_pod_name
          - regex:  
            replacement: kube-state-metrics
            target_label: Bsc_k8s_pod_ip 
    
        - job_name: 'kubernetes-nodes-kubelet'
          metrics_path: /metrics/cadvisor
          kubernetes_sd_configs:
          - role: node
          scheme: https
          tls_config:
            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
          relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels: [__meta_kubernetes_node_name]
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
    
          metric_relabel_configs:
          - source_labels: [instance]
            separator: ;
            regex: (.+)
            target_label: node
            replacement: $1
            action: replace
    
          - source_labels: [pod_name]
            separator: ;
            regex: (.+)
            target_label: pod
            replacement: $1
            action: replace
          - source_labels: [container_name]
            separator: ;
            regex: (.+)
            target_label: container
            replacement: $1
            action: replace
    
        # 采集:Apiserver 生存指标
        # 创建的job name 名称为 kubernetes-apiservers
        - job_name: kubernetes-apiservers
          # 基于k8s的服务发现
          kubernetes_sd_configs:
          - role: endpoints
          # 使用通信标记标签
          relabel_configs:
          # 保留正则匹配标签
          - action: keep
            # 已经包含
            regex: default;kubernetes;https
            source_labels:
            - __meta_kubernetes_namespace
            - __meta_kubernetes_service_name
            - __meta_kubernetes_endpoint_port_name
          # 使用方法为https、默认http
          scheme: https
          tls_config:
            # promethus访问Apiserver使用认证
            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            # 跳过https认证
            insecure_skip_verify: true
          # promethus访问Apiserver使用认证
          bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    
    
        # 采集:kubernetes-services 服务指标
        - job_name: kubernetes-services
          kubernetes_sd_configs:
          - role: service
          # 黑盒探测,探测IP与端口是否可用
          metrics_path: /probe
          params:
            module:
            - http_2xx
          relabel_configs:
          - action: keep
            regex: true
            source_labels:
            - __meta_kubernetes_service_annotation_prometheus_io_probe
          - source_labels:
            - __address__
            target_label: __param_target
          # 使用 blackbox进行黑盒探测
          - replacement: blackbox
            target_label: __address__
          - source_labels:
            - __param_target
            target_label: instance
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels:
            - __meta_kubernetes_namespace
            target_label: kubernetes_namespace
          - source_labels:
            - __meta_kubernetes_service_name
            target_label: kubernetes_name
          scheme: https
          tls_config:
            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            insecure_skip_verify: true
          bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    
        # 采集: kubernetes-pods 信息
        - job_name: 'kubernetes-pods'
          kubernetes_sd_configs:
          - role: pod
          relabel_configs:
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: ([^:]+)(?::d+)?;(d+)
            # replacement: $1:$2
            target_label: __address__
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: kubernetes_pod_name
          scheme: https
          tls_config:
            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            insecure_skip_verify: true
          bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    ---
    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: prometheus-rules
      labels:
        name: prometheus-rules
      namespace: thanos
    data:
     cadvisor-rules.yaml: |
        groups:
          - name: cadvisor容器-监控告警
            rules:
    
            - alert: 容器cpu使用量
              expr: sum(rate(container_cpu_usage_seconds_total{name=~".+"}[1m])) by (name,pod,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_host_ip,Bsc_job,Bsc_k8s_namespace,Bsc_k8s_pod_name,Bsc_k8s_pod_ip) / (sum(container_spec_cpu_quota{name=~".+"}/100000) by (name,pod,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_host_ip,Bsc_job,Bsc_k8s_namespace,Bsc_k8s_pod_name,Bsc_k8s_pod_ip)) * 100  >= 80
              for: 2m
              labels:
                severity: info
              annotations:
                summary: '{{ $value }}'
                description: "容器cpu使用量达到百分之八十"
    
            - alert: 容器cpu使用量
              expr: sum(rate(container_cpu_usage_seconds_total{name=~".+"}[1m])) by (name,pod,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_host_ip,Bsc_job,Bsc_k8s_namespace,Bsc_k8s_pod_name,Bsc_k8s_pod_ip) / (sum(container_spec_cpu_quota{name=~".+"}/100000) by (name,pod,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_host_ip,Bsc_job,Bsc_k8s_namespace,Bsc_k8s_pod_name,Bsc_k8s_pod_ip)) * 100  >= 95
              for: 5m
              labels:
                severity: critical
              annotations:
                summary: '{{ $value }}' 
                description: "容器cpu使用量达到百分之九十五并持续五分钟"            
    
            - alert: 容器内存使用率
              expr: (container_memory_working_set_bytes/container_spec_memory_limit_bytes )*100 != +inf  >= 80
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}' 
                description: "容器内存使用率达到百分之八十"
    
            - alert: 容器内存使用率
              expr: (container_memory_working_set_bytes/container_spec_memory_limit_bytes )*100 != +inf  >= 95
              for: 5m
              labels:
                severity: critical
              annotations:
                summary: '{{ $value }}' 
                description: "容器内存使用率达到百分之九十五持续五分钟"            
    
            - alert: 容器磁盘使用率
              expr: (container_fs_usage_bytes/container_fs_limit_bytes) *100 >= 80
              for: 2m
              labels:
                severity: info
              annotations:
                summary: '{{ $value }}' 
                description: "容器磁盘使用率达到百分之八十"
    
            - alert: 容器磁盘使用率
              expr: (container_fs_usage_bytes/container_fs_limit_bytes) *100 >= 90
              for: 2m
              labels:
                severity: critical
              annotations:
                summary: '{{ $value }}' 
                description: "容器磁盘使用率达到九十"            
    
            - alert: 容器磁盘IO写入使用率
              expr: sum(rate(container_fs_writes_bytes_total[5m])) by (name,device,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_host_ip,Bsc_job,Bsc_k8s_namespace,Bsc_k8s_pod_name,Bsc_k8s_pod_ip,pod) / 1024 / 1024 > 10
              for: 5m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}' 
                description: "容器磁盘IO写入使用率大于10M"
    
            - alert: 容器磁盘IO读取使用率
              expr: sum(rate(container_fs_writes_bytes_total[5m])) by (name,device,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_host_ip,Bsc_job,Bsc_k8s_namespace,Bsc_k8s_pod_name,Bsc_k8s_pod_ip,pod) / 1024 / 1024 > 10
              for: 5m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}' 
                description: "容器磁盘IO读取使用率大于10M"
    
            - alert: 容器网络接收速率
              expr: sum(rate(container_network_receive_bytes_total[5m])) by (name,pod,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_host_ip,Bsc_job,Bsc_k8s_namespace,Bsc_k8s_pod_name,Bsc_k8s_pod_ip) / 1024 / 1024 > 50
              for: 5m
              labels:
                severity: info
              annotations:
                summary: '{{ $value }}' 
                description: "五分钟容器网络接收速率大于50M"
    
            - alert: 容器网络接收速率
              expr: sum(rate(container_network_receive_bytes_total[5m])) by (name,pod,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_host_ip,Bsc_job,Bsc_k8s_namespace,Bsc_k8s_pod_name,Bsc_k8s_pod_ip) / 1024 / 1024 > 80
              for: 5m
              labels:
                severity: critical
              annotations:
                summary: '{{ $value }}' 
                description: "五分钟容器网络接收速率大于80M"
    
            - alert: 容器网络传输速率
              expr: sum(rate(container_network_transmit_bytes_total[5m])) by (name,pod,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_host_ip,Bsc_job,Bsc_k8s_namespace,Bsc_k8s_pod_name,Bsc_k8s_pod_ip) / 1024 / 1024 > 50
              for: 5m
              labels:
                severity: info
              annotations:
                summary: '{{ $value }}' 
                description: "五分钟容器网络接收速率大于50M"
    
            - alert: 容器网络传输速率
              expr: sum(rate(container_network_transmit_bytes_total[5m])) by (name,pod,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_host_ip,Bsc_job,Bsc_k8s_namespace,Bsc_k8s_pod_name,Bsc_k8s_pod_ip) / 1024 / 1024 > 80
              for: 5m
              labels:
                severity: critical
              annotations:
                summary: '{{ $value }}' 
                description: "五分钟容器网络接收速率大于80M"
    
            - alert: 当容器超出其CPU限制时的时间
              expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 10
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}' 
                description: "当容器超出其CPU限制时的时间"
      state-rules.yaml: |
        groups:
          - name: state-监控告警
            rules:
            - alert: node节点断开连接
              expr: kube_node_status_condition{condition="Ready",status="true"} == 0
              for: 10m
              labels:
                severity: critical
              annotations:
                summary: '{{ $value }}' 
                description: "node节点{{ $labels.instance }}断开连接,已经不再集群内"
    
            - alert: k8s节点内存有压力
              expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
              for: 2m
              labels:
                severity: critical
              annotations:
                summary: '{{ $value }}' 
                description: "k8s节点{{ $labels.instance }}内存可能有压力"
    
            - alert: k8s节点磁盘有压力
              expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
              for: 2m
              labels:
                severity: critical
              annotations:
                summary: '{{ $value }}' 
                description: "请注意k8s节点{{ $labels.instance }}可能存在磁盘有压力 "
    
            - alert: k8s磁盘不足
              expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
              for: 2m
              labels:
                severity: critical
              annotations:
                summary: '{{ $value }}' 
                description: "K8s节点{{ $labels.instance }}磁盘空间不足"
    
            - alert: 节点POD使用数量百分比大于70
              expr: sum by (node,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_host_ip,Bsc_job,Bsc_k8s_namespace,Bsc_k8s_pod_name,Bsc_k8s_pod_ip) ((kube_pod_status_phase{phase="Running"} == 1) + on(pod, namespace) group_left(node) (0 * kube_pod_info)) / sum(kube_node_status_allocatable_pods) by (node,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_job,Bsc_host_ip,Bsc_k8s_namespace,Bsc_k8s_pod_name,Bsc_k8s_pod_ip) * 100 > 70
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}' 
                description: "K8s节点POD使用数量大于百分之七十"
    
            - alert: 在过去十分钟容器被kill的次数
              expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) > 5
              for: 0m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}' 
                description: "过去10分钟内容器 {{ $labels.container }} 在pod {{ $labels.namespace }}/{{ $labels.pod }} 被杀死了 '{{ $value }}'次"
    
            - alert: kube-state-metrics存在执行失败的Job
              expr: kube_job_status_failed > 0
              for: 0m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}' 
                description: "Job {{$labels.namespace}}/{{$labels.exported_job}} 未能完成"
    
            - alert: 集群中存在失败的PVC
              expr: kube_persistentvolumeclaim_status_phase{phase="Failed"} == 1
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}'
                description: " k8s PersistentVolumeClaim 已挂起 {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} "
    
            - alert: PV使用状态
              expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0
              for: 0m
              labels:
                severity: critical
              annotations:
                summary: '{{ $value }}' 
                description: "K8s节点{{ $labels.instance }}PV处于错误状态"
    
            - alert: StatefulSet是否down
              expr: (kube_statefulset_status_replicas_ready / kube_statefulset_status_replicas_current) != 1
              for: 1m
              labels:
                severity: critical
              annotations:
                summary: '{{ $value }}'
                description: "监控K8S集群StatefulSet是否down,不于1就告警"
    
            - alert: 容器的状态
              expr: min_over_time(sum by (namespace,pod,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_host_ip,Bsc_job,Bsc_k8s_namespace,Bsc_k8s_pod_name,Bsc_k8s_pod_ip,phase) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[15m:1m]) > 0
              for: 0m
              labels:
                severity: critical
              annotations:
                summary: '{{ $labels.phase }}'
                description: "Pod状态"
    
            - alert: 容器重启时间
              expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 3 > 0
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}' 
                description: "Pod 重启时间,重启时间超过3m告警"
    
            - alert: 容器重启时间
              expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 10 > 0
              for: 2m
              labels:
                severity: critical
              annotations:
                summary: '{{ $value }}' 
                description: "Pod 重启时间,重启时间超过10m告警"
    
            - alert: 监测 StatefulSet 副本是否达到预期
              expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas
              for: 10m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}' 
                description: "K8s{{ $labels.instance }}StatefulSet副本不匹配"
    
            - alert: K8s状态集生成失配
              expr: kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation
              for: 10m
              labels:
                severity: critical
              annotations:
                summary: '{{ $value }}' 
                description: "状态集已失败,但尚未被回滚"
    
    
            - alert: pod重启次数大于10
              expr: kube_pod_container_status_restarts_total > 10
              for: 2m
              labels:
                severity: critical
              annotations:
                summary: '{{ $value }}' 
                description: "pod重启次数大于10次"
    
            - alert: APIServer 请求错误
              expr: sum(rate(apiserver_request_total{job="kubernetes-apiservers",code=~"5.."}[5m])) by (resource,subresource,verb,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_host_ip,Bsc_job,Bsc_k8s_namespace,Bsc_k8s_pod_name,Bsc_k8s_pod_ip) / sum(rate(apiserver_request_total{job="kubernetes-apiservers"}[5m])) by (resource,subresource,verb,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_host_ip,Bsc_job,Bsc_k8s_namespace,Bsc_k8s_pod_name,Bsc_k8s_pod_ip) != NaN > 0.05
              for: 5m
              labels:
                severity: critical
              annotations:
                summary: '{{ $value }}' 
                description: "5分钟内 APIServer 请求错误为"
    
            - alert: kubelet 客户端证书过期监测
              expr: apiserver_client_certificate_expiration_seconds_count{job="kubernetes-apiservers"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_host_ip,Bsc_job,Bsc_k8s_namespace,Bsc_k8s_pod_name,Bsc_k8s_pod_ip) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kubernetes-apiservers"}[5m]))) < 2592000
              for: 5m
              labels:
                severity: critical
              annotations:
                summary: '{{ $value }}'
                description: "离kubelet 客户端证书过期还有30天"
    
      node-rules.yaml: |
        groups:
          - name: 主机-监控告警
            rules:
    
            - alert: 主机CPU使用率
              expr: ceil(100 - sum(increase(node_cpu_seconds_total{mode="idle"}[5m]))  by(instance,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_job) / sum(increase(node_cpu_seconds_total[5m]))  by(instance,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_job)*100)  >= 80
              for: 5m
              labels:
                severity: info
              annotations:
                summary: '{{ $value }}'
                description: "CPU使用率大于百分之八十"
    
            - alert: 主机CPU使用率
              expr: ceil(100 - sum(increase(node_cpu_seconds_total{mode="idle"}[5m]))  by(instance,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_job) / sum(increase(node_cpu_seconds_total[5m]))  by(instance,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_job)*100)  >= 95
              for: 5m
              labels:
                severity: critical
              annotations:
                summary: '{{ $value }}' 
                description: "CPU使用率大于百分之九十"            
    
            - alert: 主机上容器使用CPU的占比
              expr: avg by(instance,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_job) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 >= 10
              for: 0m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}' 
                description: "CPU窃取大于百分之十"
    
            - alert: 主机上容器使用CPU的占比
              expr: avg by(instance,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_job) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 >= 50
              for: 0m
              labels:
                severity: critical
              annotations:
                summary: '{{ $value }}' 
                description: "CPU窃取大于百分之五十"
    
            - alert: 主机内存使用率
              expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes ))* 100 >= 80
              for: 10m
              labels:
                severity: info
              annotations:
                summary: '{{ $value }}' 
                description: "节点内存大于百分之八十"
    
            - alert: 主机内存使用率
              expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes ))* 100 >= 95
              for: 10m
              labels:
                severity: info
              annotations:
                summary: '{{ $value }}' 
                description: "节点内存大于百分之九十万"            
    
            - alert: 主机磁盘剩余空间
              expr: node_filesystem_avail_bytes {fstype=~"ext4|xfs"} / 1024 / 1024 / 1024  < 10
              for: 10m
              labels:
                severity: critical
              annotations:
                summary: '{{ $value }}' 
                description: "主机磁盘剩余容量不足10G"
    
            - alert: 主机磁盘使用率
              expr: (1-(node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"})) * 100 >= 80
              for: 10m
              labels:
                severity: info
              annotations:
                summary: '{{ $value }}'
                description: "主机磁盘使用率不足百分之二十"
    
            - alert: 主机磁盘使用率
              expr: (1-(node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"})) * 100 >= 90
              for: 10m
              labels:
                severity: critical
              annotations:
                summary: '{{ $value }}' 
                description: "主机磁盘使用率不足百分之十"            
    
            - alert: 内存页面错误
              expr: rate(node_vmstat_pgmajfault[1m]) > 1000
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}'
                description: "节点承受着巨大的内存压力。主要页面错误率高"
    
            - alert: 网卡接收数据
              expr: sum by (instance,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_job) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
              for: 5m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}' 
                description: "主机网络接口可能接收的数据太多 (> 100 MB/s)"
    
            - alert: 网卡接收数据
              expr: sum by (instance,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_job) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 250
              for: 5m
              labels:
                severity: critical
              annotations:
                summary: '{{ $value }}' 
                description: "主机网络接口可能接收的数据太多 (> 250 MB/s)"            
    
            - alert: 网卡发送数据
              expr: sum by (instance,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_job) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
              for: 5m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}' 
                description: "主机网络接口可能发送太多数据 (> 100 MB/s)"
    
            - alert: 网卡发送数据
              expr: sum by (instance,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_job) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 250
              for: 5m
              labels:
                severity: critical
              annotations:
                summary: '{{ $value }}' 
                description: "主机网络接口可能发送太多数据 (> 250 MB/s)"            
    
            - alert: 主机磁盘异常读取
              expr: sum by (instance,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_job) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
              for: 5m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}'
                description: "磁盘可能读取了太多数据 (> 50 MB/s)"
    
            - alert: 主机异常磁盘写入
              expr: sum by (instance,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_job) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}' 
                description: "磁盘可能写入了太多数据 (> 50 MB/s)"
    
            - alert: 主机inodes
              expr: node_filesystem_files_free{mountpoint ="/"} / node_filesystem_files{mountpoint="/"} * 100 < 20 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/"} == 0
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}' 
                description: "磁盘的可用索引节点快用完了 (< 10 left)"
    
            - alert: 主机inode将在24小时内用完
              expr: node_filesystem_files_free{mountpoint ="/"} / node_filesystem_files{mountpoint="/"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}' 
                description: "文件系统预计将在未来24小时内以当前写入速率耗尽inode"
    
            - alert: 主机磁盘读取延迟
              expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}' 
                description: "磁盘延迟正在增长 (读取操作 > 100ms)"
    
            - alert: 主机磁盘写入延迟
              expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}' 
                description: "磁盘延迟正在增长 (写入操作 > 100ms)"
    
            - alert: 主机上下文切换
              expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 15000
              for: 0m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}' 
                description: "节点上的上下文切换正在增长 (> 1000 / s)"
    
            - alert: 主机交换分区
              expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 != NaN > 80
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}' 
                description: "主机交换分区大于百分之八十"
    
            - alert: 主机网络接收错误
              expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) !=NaN > 0.01
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}' 
                description: "主机接口 {{ $labels.device }} 在过去五分钟内收到错误遇到 "
    
            - alert: 主机网络传输错误
              expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) !=NaN > 0.01
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}' 
                description: "主机接口 {{ $labels.device }} 在过去五分钟内收到错误遇到 "
    
            - alert: 主机网络接口
              expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
              for: 1m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}' 
                description: "网络接口{{ $labels.interface }}已经超负荷了."
    
            - alert: 连接状态跟踪
              expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
              for: 5m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}'
                description: "主机连接数接近极限"
    
            - alert: 时钟偏移
              expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}' 
                description: "检测到时钟偏移。时钟不同步"
    
            - alert: 主机时间不同步
              expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: '{{ $value }}' 
                description: "时钟不同步"
    
            - alert: 服务器等待关闭的TCP连接数
              expr: node_sockstat_TCP_tw  >=5000
              for: 2m
              labels:
                severity: critical
              annotations:
                summary: '{{ $value }}'  
                description: "主机tcp连接数未关闭大于5000"                  

    参考:https://blog.csdn.net/vic_qxz/article/details/109347645

  • 相关阅读:
    http://www.bugku.com:Bugku——SQL注入1(http://103.238.227.13:10087/)
    [笔记]一道C语言面试题:大整数乘法
    [笔记] Access Control Lists (ACL) 学习笔记汇总
    [笔记]如何将传统的回调函数转换为C#5.0支持的await格式
    6.链接与导航
    9章 下拉菜单
    11章圆角框 本章很重要 经常用到
    原来链接与导航
    7竖直排列的导航菜单
    8.水平导航菜单
  • 原文地址:https://www.cnblogs.com/fat-girl-spring/p/15045671.html
Copyright © 2011-2022 走看看