zoukankan      html  css  js  c++  java
  • k8s全方位监控中-常用rules配置

    [root@VM_0_48_centos prometheus]# cat alertmanager-configmap.yaml
    # ConfigMap that ships the Alertmanager configuration file (alertmanager.yml).
    # All alerts are grouped by alert name and delivered by e-mail to a single
    # receiver named 'mail'.
    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: alertmanager-config
      namespace: kube-system
      labels:
        kubernetes.io/cluster-service: "true"
        addonmanager.kubernetes.io/mode: EnsureExists
    data:
      alertmanager.yml: |
        global:
          # An alert is treated as resolved if it has not been refreshed for 5m.
          resolve_timeout: 5m
          smtp_smarthost: 'smtp.163.com:25'
          smtp_from: 'xjq18125012766@163.com'
          smtp_auth_username: 'xjq18125012766@163.com'
          # NOTE(review): the SMTP credential is stored in plain text inside a
          # ConfigMap; consider supplying it from a Kubernetes Secret instead.
          smtp_auth_password: 'test123'
          # NOTE(review): with TLS disabled the credential above travels
          # unencrypted to the smarthost - confirm this is acceptable.
          smtp_require_tls: false

        route:
          group_by: ['alertname']
          # Wait 10s to collect alerts of a new group before the first mail.
          group_wait: 10s
          # Send updates about a group that changed at most every 30s.
          group_interval: 30s
          # Re-send a notification for a still-firing alert only every 4h.
          # The previous value (10s) was shorter than group_interval and caused
          # the same mail to be repeated on every group interval.
          repeat_interval: 4h
          receiver: 'mail'

        receivers:
          - name: 'mail'
            email_configs:
            - to: '2654071080@qq.com'
    [root@VM_0_48_centos prometheus]# cat  prometheus-rules.yaml
    # ConfigMap carrying the Prometheus alerting rule files; each key under
    # `data` holds the text of one rule file.
    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: prometheus-rules-config
      namespace: kube-system
      labels:
        kubernetes.io/cluster-service: "true"
        addonmanager.kubernetes.io/mode: EnsureExists
    data:
      # Rule file with the pod/container-level alerts (group "pod.rules").
      pods.yml: |
        groups:
        - name: pod.rules
          rules:
          # InstanceDown: fires when a series of the `up` metric has been 0 for
          # two minutes. The instance label is included in the summary.
          - alert: InstanceDown
            for: 2m
            expr: 'up == 0'
            labels: {severity: error}
            annotations:
              summary: '监控采集器{{ $labels.instance }}停止工作'
              value: '{{ $value }}'
    
          # PodSvcDown: fires when probe_success has been 0 for one minute.
          # NOTE(review): probe_success is presumably exported by a probing
          # exporter such as blackbox_exporter - confirm against the scrape config.
          - alert: PodSvcDown
            for: 1m
            expr: 'probe_success == 0'
            labels: {severity: error}
            annotations:
              summary: '容器代理服务{{ $labels.instance }}停止工作'
              value: '{{ $value }}'
    
          # MysqlCon: fires when MysqlCon_metric stays above 40 for one minute.
          # NOTE(review): MysqlCon_metric looks like a custom metric; its source
          # is not visible here - verify it is actually being scraped.
          - alert: MysqlCon
            for: 1m
            expr: 'MysqlCon_metric > 40'
            labels: {severity: warning}
            annotations:
              summary: 'mysql连接数过高'
              value: '{{ $value }}'
    
          # PodCpuUsage: per (pod_name, namespace) sum of the 1m CPU usage rate,
          # multiplied by 100, above 80 for five minutes.
          # NOTE(review): newer Kubernetes releases expose the cAdvisor label as
          # `pod` rather than `pod_name` - confirm which one this cluster emits.
          - alert: PodCpuUsage
            for: 5m
            expr: >-
              sum by (pod_name, namespace) (
              rate(container_cpu_usage_seconds_total{image!=""}[1m])
              ) * 100 > 80
            labels: {severity: warning}
            annotations:
              summary: '容器ns: {{ $labels.namespace }} | pod: {{ $labels.pod_name }} CPU使用率超过80%'
              value: '{{ $value }}'
    
          # PodMemoryUsage: RSS as a percentage of the memory limit, summed per
          # (pod_name, namespace), above 80 for five minutes. Results equal to
          # +inf are dropped by the `!= +inf` filter (presumably pods whose
          # summed limit is 0, i.e. no limit configured - confirm).
          # NOTE(review): newer Kubernetes releases expose the cAdvisor label as
          # `pod` rather than `pod_name` - confirm which one this cluster emits.
          - alert: PodMemoryUsage
            for: 5m
            expr: >-
              sum by (pod_name, namespace) (container_memory_rss{image!=""})
              / sum by (pod_name, namespace) (container_spec_memory_limit_bytes{image!=""})
              * 100 != +inf > 80
            labels: {severity: warning}
            annotations:
              summary: '容器ns: {{ $labels.namespace }} | pod: {{ $labels.pod_name }} 内存使用率超过80%'
              value: '{{ $value }}'
    
          # PodFailed: fires when, per (pod, namespace), the summed
          # kube_pod_status_phase{phase="Failed"} is above 0 for one minute.
          - alert: PodFailed
            for: 1m
            expr: sum by (pod, namespace) (kube_pod_status_phase{phase="Failed"}) > 0
            labels: {severity: error}
            annotations:
              summary: '容器ns: {{ $labels.namespace }} | pod: {{ $labels.pod }} pod status is Failed'
              value: '{{ $value }}'
    
          # PodPending: fires when, per (pod, namespace), the summed
          # kube_pod_status_phase{phase="Pending"} is above 0 for one minute.
          - alert: PodPending
            for: 1m
            expr: sum by (pod, namespace) (kube_pod_status_phase{phase="Pending"}) > 0
            labels: {severity: error}
            annotations:
              summary: '容器ns: {{ $labels.namespace }} | pod: {{ $labels.pod }} status is Pending'
              value: '{{ $value }}'
    
          # PodNetworkReceive: 5m receive rate in bytes/s divided by 1000 (so the
          # value is in K/s), summed per (pod_name, namespace), above 30000 for
          # five minutes. Only series whose name label matches ^k8s_.* are counted.
          # NOTE(review): newer Kubernetes releases expose the cAdvisor label as
          # `pod` rather than `pod_name` - confirm which one this cluster emits.
          - alert: PodNetworkReceive
            for: 5m
            expr: >-
              sum by (pod_name, namespace) (
              rate(container_network_receive_bytes_total{image!="",name=~"^k8s_.*"}[5m]) / 1000
              ) > 30000
            labels: {severity: warning}
            annotations:
              summary: '容器ns: {{ $labels.namespace }} | pod: {{ $labels.pod_name }} 接受到的网络入流量大于30MB/s'
              value: '{{ $value }}K/s'
    
          # PodNetworkTransmit: 5m transmit rate in bytes/s divided by 1000 (so
          # the value is in K/s), summed per (pod_name, namespace), above 30000
          # for five minutes. Only series whose name label matches ^k8s_.* count.
          # NOTE(review): newer Kubernetes releases expose the cAdvisor label as
          # `pod` rather than `pod_name` - confirm which one this cluster emits.
          - alert: PodNetworkTransmit
            for: 5m
            expr: >-
              sum by (pod_name, namespace) (
              rate(container_network_transmit_bytes_total{image!="",name=~"^k8s_.*"}[5m]) / 1000
              ) > 30000
            labels: {severity: warning}
            annotations:
              summary: '容器ns: {{ $labels.namespace }} | pod: {{ $labels.pod_name }} 传输的网络出流量大于30MB/s'
              value: '{{ $value }}K/s'
    
          # PodRestart: fires when the restart counter of any container in a
          # (pod, namespace) changed within the last minute; the condition only
          # needs to hold for 5s.
          - alert: PodRestart
            for: 5s
            expr: sum by (pod, namespace) (changes(kube_pod_container_status_restarts_total[1m])) > 0
            labels: {severity: warning}
            annotations:
              summary: '容器ns: {{ $labels.namespace }} | pod: {{ $labels.pod }} pod is restart'
              value: '{{ $value }}'
      # Rule file with the host (node) level alerts (group "node.rules").
      nodes.yml: |
        groups:
        - name: node.rules
          rules:
          # NodeFilesystemUsage: used-space percentage of the filesystem whose
          # device label is "rootfs", above 85 for one minute.
          # NOTE(review): only series with device="rootfs" are matched - confirm
          # that node_exporter on these hosts actually exposes that device.
          - alert: NodeFilesystemUsage
            expr: 100 - (node_filesystem_free_bytes{device="rootfs"} / node_filesystem_size_bytes{device="rootfs"} * 100) > 85
            for: 1m
            labels:
              severity: warning
            annotations:
              # The message states the same threshold the expression enforces
              # (85); it previously claimed 80%.
              summary: "主机 {{ $labels.instance }} : {{ $labels.mountpoint }} 分区使用率超过85%"
              value: "{{ $value }}"
    
          # NodeMemoryUsage: 100 minus the share of MemFree + Cached + Buffers in
          # MemTotal (as a percentage), above 80 for five minutes.
          - alert: NodeMemoryUsage
            for: 5m
            expr: >-
              100 - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes)
              / node_memory_MemTotal_bytes * 100 > 80
            labels: {severity: warning}
            annotations:
              summary: '主机 {{ $labels.instance }} 内存使用率超过80%'
              value: '{{ $value }}'
    
          # NodeCPUUsage: 100 minus the per-instance average idle CPU percentage
          # over 5m, above 80 for five minutes.
          # rate() is used instead of irate(): irate() looks only at the last two
          # samples in the window, so a momentary dip would reset the `for`
          # timer and the alert could fail to fire under sustained load.
          - alert: NodeCPUUsage
            expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "主机 {{ $labels.instance }} CPU使用率超过80%"
              value: "{{ $value }}"

    2、展示结果

  • 相关阅读:
    C++指针
    写的第一个 JavaScript
    VB.NET中实现"关机/休眠/重启/注销"的类
    VB&VB.NET速查表
    把一个数组的值赋给另一个数组(VB.NET)
    配置.htaccess的一些问题结果总结
    httpd.conf文件配置详解2
    httpd.conf文件配置详解3
    httpd.conf文件设置详解1
    Red Hat下如何架设FTP服务器
  • 原文地址:https://www.cnblogs.com/xiajq/p/11395316.html
Copyright © 2011-2022 走看看