zoukankan      html  css  js  c++  java
  • prometheus监控kubernetes容器

    prometheus.yaml

    # Prometheus self-monitoring
    groups:
        - name: 普罗米修斯-监控告警  # name of this alerting rule group
          rules:  # alerting rules belonging to this group
    # 1.1.1. Prometheus job missing
    # A Prometheus job has disappeared.
          - alert: PrometheusJobMissing  # alert name
            expr: absent(up{job="prometheus"})  # PromQL condition that raises the alert
            for: 0m  # how long the condition must hold before firing (0m = immediately)
            labels:
              severity: warning  # alert severity
            annotations:  # text attached to the notification
              summary: Prometheus job missing (instance {{ $labels.instance }})  # short notification title
              # Newlines are written as \n escapes: a quoted scalar broken across
              # under-indented lines is folded into spaces (or rejected) by YAML parsers.
              description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.1.2. Prometheus target missing
    # A Prometheus target has disappeared. An exporter might have crashed.
          - alert: PrometheusTargetMissing
            expr: up == 0
            for: 0m
            labels:
              severity: critical
            annotations:
              summary: Prometheus target missing (instance {{ $labels.instance }})
              description: "A Prometheus target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.1.3. Prometheus all targets missing
    # A Prometheus job does not have any living target anymore.
    # `sum` (not `count`) is required: count() counts series regardless of their
    # value, so it is never 0 for a job that still has targets, up or down.
          - alert: PrometheusAllTargetsMissing
            expr: sum by (job) (up) == 0
            for: 0m
            labels:
              severity: critical
            annotations:
              summary: Prometheus all targets missing (instance {{ $labels.instance }})
              description: "A Prometheus job does not have living target anymore.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.1.4. Prometheus configuration reload failure
    # Prometheus configuration reload error.
          - alert: PrometheusConfigurationReloadFailure
            expr: prometheus_config_last_reload_successful != 1
            for: 0m
            labels:
              severity: warning
            annotations:
              summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
              description: "Prometheus configuration reload error\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.1.5. Prometheus too many restarts
    # Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.
          - alert: PrometheusTooManyRestarts
            expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
            for: 0m
            labels:
              severity: warning
            annotations:
              summary: Prometheus too many restarts (instance {{ $labels.instance }})
              description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.1.6. Prometheus AlertManager configuration reload failure
    # AlertManager configuration reload error.
          - alert: PrometheusAlertmanagerConfigurationReloadFailure
            expr: alertmanager_config_last_reload_successful != 1
            for: 0m
            labels:
              severity: warning
            annotations:
              summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
              description: "AlertManager configuration reload error\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.1.7. Prometheus AlertManager config not synced
    # Configurations of AlertManager cluster instances are out of sync
    # (more than one distinct config hash is being reported).
          - alert: PrometheusAlertmanagerConfigNotSynced
            expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
            for: 0m
            labels:
              severity: warning
            annotations:
              summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
              description: "Configurations of AlertManager cluster instances are out of sync\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.1.8. Prometheus AlertManager E2E dead man switch
    # Prometheus DeadManSwitch is an always-firing alert (vector(1) is always present).
    # It is used as an end-to-end test of Prometheus through the Alertmanager.
          - alert: PrometheusAlertmanagerE2eDeadManSwitch
            expr: vector(1)
            for: 0m
            labels:
              severity: critical
            annotations:
              summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})
              description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.1.9. Prometheus not connected to alertmanager
    # Prometheus cannot connect to the Alertmanager (none discovered).
          - alert: PrometheusNotConnectedToAlertmanager
            expr: prometheus_notifications_alertmanagers_discovered < 1
            for: 0m
            labels:
              severity: critical
            annotations:
              summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
              description: "Prometheus cannot connect the alertmanager\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.1.10. Prometheus rule evaluation failures
    # Prometheus encountered rule evaluation failures in the last 3 minutes,
    # leading to potentially ignored alerts.
          - alert: PrometheusRuleEvaluationFailures
            expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
            for: 0m
            labels:
              severity: critical
            annotations:
              summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
              description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.1.11. Prometheus template text expansion failures
    # Prometheus encountered template text expansion failures in the last 3 minutes.
          - alert: PrometheusTemplateTextExpansionFailures
            expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
            for: 0m
            labels:
              severity: critical
            annotations:
              summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
              description: "Prometheus encountered {{ $value }} template text expansion failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.1.12. Prometheus rule evaluation slow
    # Rule group evaluation took longer than the group's scheduled interval.
    # This indicates slow storage backend access or an overly complex query.
          - alert: PrometheusRuleEvaluationSlow
            expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
              description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.1.13. Prometheus notifications backlog
    # The Prometheus notification queue has not been empty for 10 minutes.
          - alert: PrometheusNotificationsBacklog
            expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
            for: 0m
            labels:
              severity: warning
            annotations:
              summary: Prometheus notifications backlog (instance {{ $labels.instance }})
              description: "The Prometheus notification queue has not been empty for 10 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.1.14. Prometheus AlertManager notification failing
    # Alertmanager is failing to send notifications.
          - alert: PrometheusAlertmanagerNotificationFailing
            expr: rate(alertmanager_notifications_failed_total[1m]) > 0
            for: 0m
            labels:
              severity: critical
            annotations:
              summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
              description: "Alertmanager is failing sending notifications\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.1.15. Prometheus target empty
    # Prometheus has no target in service discovery.
          - alert: PrometheusTargetEmpty
            expr: prometheus_sd_discovered_targets == 0
            for: 0m
            labels:
              severity: critical
            annotations:
              summary: Prometheus target empty (instance {{ $labels.instance }})
              description: "Prometheus has no target in service discovery\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.1.16. Prometheus target scraping slow
    # Prometheus is scraping exporters slowly (0.9-quantile scrape interval above 60s).
          - alert: PrometheusTargetScrapingSlow
            expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: Prometheus target scraping slow (instance {{ $labels.instance }})
              description: "Prometheus is scraping exporters slowly\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.1.17. Prometheus large scrape
    # Prometheus has many scrapes that exceed the sample limit.
          - alert: PrometheusLargeScrape
            expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: Prometheus large scrape (instance {{ $labels.instance }})
              description: "Prometheus has many scrapes that exceed the sample limit\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.1.18. Prometheus target scrape duplicate
    # Prometheus has many samples rejected due to duplicate timestamps but different values.
          - alert: PrometheusTargetScrapeDuplicate
            expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
            for: 0m
            labels:
              severity: warning
            annotations:
              summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
              description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.1.19. Prometheus TSDB checkpoint creation failures
    # Prometheus encountered checkpoint creation failures in the last minute.
          - alert: PrometheusTsdbCheckpointCreationFailures
            expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
            for: 0m
            labels:
              severity: critical
            annotations:
              summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
              description: "Prometheus encountered {{ $value }} checkpoint creation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.1.20. Prometheus TSDB checkpoint deletion failures
    # Prometheus encountered checkpoint deletion failures in the last minute.
          - alert: PrometheusTsdbCheckpointDeletionFailures
            expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
            for: 0m
            labels:
              severity: critical
            annotations:
              summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
              description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.1.21. Prometheus TSDB compactions failed
    # Prometheus encountered TSDB compaction failures in the last minute.
          - alert: PrometheusTsdbCompactionsFailed
            expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
            for: 0m
            labels:
              severity: critical
            annotations:
              summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
              description: "Prometheus encountered {{ $value }} TSDB compactions failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.1.22. Prometheus TSDB head truncations failed
    # Prometheus encountered TSDB head truncation failures in the last minute.
          - alert: PrometheusTsdbHeadTruncationsFailed
            expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
            for: 0m
            labels:
              severity: critical
            annotations:
              summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
              description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.1.23. Prometheus TSDB reload failures
    # Prometheus encountered TSDB reload failures in the last minute.
          - alert: PrometheusTsdbReloadFailures
            expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
            for: 0m
            labels:
              severity: critical
            annotations:
              summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
              description: "Prometheus encountered {{ $value }} TSDB reload failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.1.24. Prometheus TSDB WAL corruptions
    # Prometheus encountered TSDB WAL corruptions in the last minute.
          - alert: PrometheusTsdbWalCorruptions
            expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
            for: 0m
            labels:
              severity: critical
            annotations:
              summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
              description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.1.25. Prometheus TSDB WAL truncations failed
    # Prometheus encountered TSDB WAL truncation failures in the last minute.
          - alert: PrometheusTsdbWalTruncationsFailed
            expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
            for: 0m
            labels:
              severity: critical
            annotations:
              summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
              description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    windows.yaml

     # 1.5. Windows Server : prometheus-community/windows_exporter (5 rules)
     groups:
        # Group renamed from "Docker容器-监控告警": every rule in this group queries
        # windows_exporter metrics, none of them concern Docker containers.
        - name: Windows主机-监控告警  # name of this alerting rule group
          rules:  # alerting rules belonging to this group
    # 1.5.1. Windows Server collector Error
    # A windows_exporter collector reported an unsuccessful run.
          - alert: WindowsServerCollectorError
            expr: windows_exporter_collector_success == 0
            for: 0m
            labels:
              severity: critical
            annotations:
              summary: Windows Server collector Error (instance {{ $labels.instance }})
              description: "Collector {{ $labels.collector }} was not successful\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.5.2. Windows Server service Status
    # Windows Service state is not OK.
          - alert: WindowsServerServiceStatus
            expr: windows_service_status{status="ok"} != 1
            for: 1m
            labels:
              severity: critical
            annotations:
              summary: Windows Server service Status (instance {{ $labels.instance }})
              description: "Windows Service state is not OK\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.5.3. Windows Server CPU Usage
    # CPU usage is more than 80% (100 minus the average idle percentage per instance).
          - alert: WindowsServerCpuUsage
            expr: 100 - (avg by (instance) (rate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 80
            for: 0m
            labels:
              severity: warning
            annotations:
              summary: Windows Server CPU Usage (instance {{ $labels.instance }})
              description: "CPU Usage is more than 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.5.4. Windows Server memory Usage
    # Memory usage is more than 90%.
          - alert: WindowsServerMemoryUsage
            expr: 100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 90
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: Windows Server memory Usage (instance {{ $labels.instance }})
              description: "Memory usage is more than 90%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.5.5. Windows Server disk Space Usage
    # Disk usage is more than 80%.
          - alert: WindowsServerDiskSpaceUsage
            expr: 100.0 - 100 * ((windows_logical_disk_free_bytes / 1024 / 1024 ) / (windows_logical_disk_size_bytes / 1024 / 1024)) > 80
            for: 2m
            labels:
              severity: critical
            annotations:
              summary: Windows Server disk Space Usage (instance {{ $labels.instance }})
              description: "Disk usage is more than 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    node-exporter.yaml

    # Host and hardware : node-exporter (31 rules)
    groups:
        - name: 主机节点-监控告警  # name of this alerting rule group
          rules:  # alerting rules belonging to this group
    # 1.2.1. Host out of memory
    # Node memory is filling up (< 10% available).
          - alert: 主机内存  # alert name
            expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10  # PromQL condition
            for: 10m  # the condition must hold for 10 minutes before the alert fires
            labels:
              severity: warning  # alert severity
            annotations:  # text attached to the notification
              summary: 主机内存不足 (instance {{ $labels.instance }})  # short notification title
              description: "节点内存已满(<10%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.2.2. Host memory under memory pressure
    # The node is under heavy memory pressure: high rate of major page faults.
          - alert: HostMemoryUnderMemoryPressure
            expr: rate(node_vmstat_pgmajfault[1m]) > 1000
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: Host memory under memory pressure (instance {{ $labels.instance }})
              description: "The node is under heavy memory pressure. High rate of major page faults\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.2.3. Host unusual network throughput in
    # Host network interfaces are probably receiving too much data (> 100 MB/s).
          - alert: 网卡接收数据
            expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: 主机网络吞吐量 (instance {{ $labels.instance }})
              description: "主机网络接口可能接收的数据太多 (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.2.4. Host unusual network throughput out
    # Host network interfaces are probably sending too much data (> 100 MB/s).
          - alert: 网卡发送数据
            expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: 主机网络吞吐量 (instance {{ $labels.instance }})
              description: "主机网络接口可能发送太多数据 (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.2.5. Host unusual disk read rate
    # Disk is probably reading too much data (> 50 MB/s).
          - alert: 主机磁盘异常读取
            expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: 主机磁盘读取率 (instance {{ $labels.instance }})
              description: "磁盘可能读取了太多数据 (> 50 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.2.6. Host unusual disk write rate
    # Disk is probably writing too much data (> 50 MB/s).
          - alert: 主机异常磁盘写入
            expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: Host unusual disk write rate (instance {{ $labels.instance }})
              description: "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.2.7. Host out of disk space
    # Disk is almost full (< 20% left) on a writable filesystem.
    # Please add ignored mountpoints in node_exporter parameters like
    # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
    # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
          - alert: 主机磁盘空间
            expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 20 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: 主机磁盘空间不足 (instance {{ $labels.instance }})
              # The text states 20% so that it agrees with the threshold used in expr.
              description: "磁盘快满了 (< 20% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.2.8. Host disk will fill in 24 hours
    # Less than 20% space is left and, at the write rate of the last hour, the
    # filesystem is predicted to run out of space within the next 24 hours.
    # Please add ignored mountpoints in node_exporter parameters like
    # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
    # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
          - alert: 主机磁盘将在24小时内填满
            expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 20 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: 主机磁盘将占用24小时 (instance {{ $labels.instance }})
              description: "文件系统预计将在未来24小时内以当前写入速率耗尽空间\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.2.9. Host out of inodes
    # The root filesystem is almost out of available inodes (< 20% left).
          - alert: 主机inodes
            expr: node_filesystem_files_free{mountpoint ="/"} / node_filesystem_files{mountpoint="/"} * 100 < 20 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/"} == 0
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: 主机已用inode(instance {{ $labels.instance }})
              # The text states 20% so that it agrees with the threshold used in expr.
              description: "磁盘的可用索引节点快用完了 (< 20% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.2.10. Host inodes will fill in 24 hours
    # Less than 10% of inodes are free on "/" and, at the rate of the last hour,
    # the filesystem is predicted to run out of inodes within the next 24 hours.
    # The read-only filter must select mountpoint="/" as well: it is joined ON
    # (instance, device, mountpoint), so any other mountpoint can never match.
          - alert: 主机inode将在24小时内用完
            expr: node_filesystem_files_free{mountpoint ="/"} / node_filesystem_files{mountpoint="/"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/"} == 0
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: 主机索引节点将在24小时内用完 (instance {{ $labels.instance }})
              description: "文件系统预计将在未来24小时内以当前写入速率耗尽inode\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.2.11. Host unusual disk read latency
    # Disk latency is growing (read operations > 100ms); only evaluated while reads occur.
          - alert: 主机磁盘读取延迟
            expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: 主机磁盘读取延迟 (instance {{ $labels.instance }})
              description: "磁盘延迟正在增长 (读取操作 > 100ms)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.2.12. Host unusual disk write latency
    # Disk latency is growing (write operations > 100ms); only evaluated while writes occur.
          - alert: 主机磁盘写入延迟
            expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: 主机磁盘写入延迟 (instance {{ $labels.instance }})
              description: "磁盘延迟正在增长 (写入操作 > 100ms)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.2.13. Host high CPU load
    # mode="idle" is the CPU time accumulated since boot spent idle (waiting, excluding I/O wait).
    # CPU load is > 80% (100 minus the average idle percentage per instance).
          - alert: 主机CPU高负载
            expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
            for: 0m
            labels:
              severity: warning
            annotations:
              summary: 主机高负载 (instance {{ $labels.instance }})
              description: "CPU负载为 > 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.2.14. Host CPU steal noisy neighbor
    # mode="steal": in a virtualized environment, CPU time the hypervisor scheduled
    # for other guests, i.e. time this OS could not use the CPU.
    # CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.
          - alert: HostCpuStealNoisyNeighbor
            expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
            for: 0m
            labels:
              severity: warning
            annotations:
              summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
              description: "CPU窃取>10%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.2.15. Host context switching
    # Context switching is growing on node (> 15000 / s per CPU).
    # The threshold is an arbitrary number; the right value depends on the
    # nature of the application.
    # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
          - alert: 主机上下文切换
            expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 15000
            for: 0m
            labels:
              severity: warning
            annotations:
              summary: Host context switching (instance {{ $labels.instance }})
              # The text states 15000 so that it agrees with the threshold used in expr.
              description: "Context switching is growing on node (> 15000 / s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.2.16. Host swap is filling up
    # Swap is filling up (> 80% used).
          - alert: 主机交换分区
            expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: 主机交换已满 (instance {{ $labels.instance }})
              description: "主机交换分区 (>80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.2.17. Host systemd service crashed
    # A systemd unit is in the "failed" state.
          - alert: systemd服务崩溃
            expr: node_systemd_unit_state{state="failed"} == 1
            for: 0m
            labels:
              severity: warning
            annotations:
              summary: 主机systemd服务崩溃 (instance {{ $labels.instance }})
              description: "systemd服务崩溃\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.2.22. Host kernel version deviations
    # Different kernel versions are running: more than one distinct x.y.z
    # prefix is extracted from the "release" label of node_uname_info.
          - alert: 主机内核
            expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
            for: 6h
            labels:
              severity: warning
            annotations:
              summary: Host kernel version deviations (instance {{ $labels.instance }})
              description: "Different kernel versions are running\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.2.23. Host OOM kill detected
    # An OOM kill was recorded in the last minute.
          - alert: 检测到OOM杀死
            expr: increase(node_vmstat_oom_kill[1m]) > 0
            for: 0m
            labels:
              severity: warning
            annotations:
              summary: 检测到主机OOM终止 (instance {{ $labels.instance }})
              description: "检测到OOM杀死\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.2.26. Host Network Receive Errors
    # The ratio of receive errors to received packets over the last two minutes
    # exceeds 1% on an interface.
          - alert: 主机网络接收错误
            expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: 主机网络接收错误 (instance {{ $labels.instance }})
              # Inner quotes are escaped so they do not terminate the YAML string;
              # the time span in the text matches the [2m] window used in expr.
              description: "主机 {{ $labels.instance }} 接口 {{ $labels.device }} 在过去两分钟内遇到 {{ printf \"%.0f\" $value }} 个接收错误.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.2.27. Host Network Transmit Errors
    # The ratio of transmit errors to transmitted packets over the last two
    # minutes exceeds 1% on an interface.
          - alert: 主机网络传输错误
            expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: 主机网络传输错误 (instance {{ $labels.instance }})
              # Inner quotes are escaped so they do not terminate the YAML string;
              # the text names transmit errors and the [2m] window used in expr.
              description: "主机 {{ $labels.instance }} 接口 {{ $labels.device }} 在过去两分钟内遇到 {{ printf \"%.0f\" $value }} 个发送错误.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.2.28. Host Network Interface Saturated
    # The network interface "{{ $labels.device }}" on "{{ $labels.instance }}" is getting overloaded.
          - alert: 主机网络接口
            # Combined receive + transmit byte rate (1m) above 80% of node_network_speed_bytes,
            # ignoring devices whose name starts with "tap".
            expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: 主机网络接口饱和 (instance {{ $labels.instance }})
              # Inner quotes are escaped so the scalar stays valid; the selectors above filter on
              # the "device" label, so that is the label referenced here.
              description: "网络接口 \"{{ $labels.device }}\" 在 \"{{ $labels.instance }}\" 已经超负荷了.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.2.29. Host conntrack limit
    # The number of conntrack entries is approaching the limit
          - alert: 连接数接近极限
            # Conntrack entries above 80% of the configured limit for 5m.
            expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: 主机连接数接近极限 (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "主机连接数接近极限\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.2.30. Host clock skew
    # Clock skew detected. Clock is out of sync.
          - alert: 时钟偏移
            # Offset beyond +/-0.05s while its 5m derivative is not moving it back towards zero.
            expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: 主机时间偏移 (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "检测到时钟偏移。时钟不同步.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.2.31. Host clock not synchronising
    # Clock not synchronising.
          - alert: 主机时间不同步
            # node_timex_sync_status hit 0 within the last minute and the max error is at least 16s.
            expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: 主机时间不同步 (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "时钟不同步。\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    

    docker.yaml

    # Docker containers : google/cAdvisor (6 rules)
    groups:
        - name: Docker容器-监控告警                                               # Name of this alerting-rule group
          rules:                                                                # Alerting rules that belong to the group
    # 1.3.1. Container killed
    # A container has disappeared
          - alert: ContainerKilled
            # Fires when container_last_seen is more than 60 seconds behind the current time.
            expr: time() - container_last_seen > 60
            for: 0m
            labels:
              severity: warning
            annotations:
              summary: Container killed (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "A container has disappeared\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.3.2. Container CPU usage
    # Container CPU usage is above 80%
     # cAdvisor can sometimes consume a lot of CPU, so this alert will fire constantly.
      # If you want to exclude it from this alert, exclude the serie having an empty name: container_cpu_usage_seconds_total{name!=""}
          - alert: 容器cpu使用量
            # Per-container sum of the 1m CPU rate (as a percentage) above 80 for 2m.
            # The grouping list previously named Bsc_host_ip twice; each label is listed once now.
            # NOTE(review): this uses container_cpu_system_seconds_total (kernel-mode time only);
            # confirm whether container_cpu_usage_seconds_total was intended.
            expr: sum(rate(container_cpu_system_seconds_total{name=~".+"}[1m])) by (name,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_host_ip,Bsc_k8s_namespace,Bsc_k8s_pod_name,Bsc_k8s_pod_ip) * 100 > 80
            for: 2m
            labels:
              severity: warning
            annotations:
              # "instance" is not in the by(...) list, so it is empty after aggregation;
              # "name" is preserved and identifies the container.
              summary: 容器cpu使用量 (container {{ $labels.name }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "容器cpu使用量达到80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
          - alert: 容器内存使用率
            # Working-set memory as a percentage of the container memory limit, above 80 for 2m.
            # Without a comparison the expression returned a sample for every container, so the
            # alert fired for all of them. The "> 0" filter drops series whose limit is 0, which
            # would otherwise divide by zero.
            expr: (container_memory_working_set_bytes / (container_spec_memory_limit_bytes > 0)) * 100 > 80
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: 容器内存使用率
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "容器内存使用率是\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    
    # 1.3.5. Container Volume IO usage
    # Container Volume IO usage is above 80%
          - alert: 容器磁盘使用量
            # Sum of container_fs_io_current per (instance, name), multiplied by 100, above 80 for 2m.
            expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: Container Volume IO usage (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "Container Volume IO usage is above 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.3.6. Container high throttle rate
    # Container is being throttled   
          - alert: ContainerHighThrottleRate
            # 3m rate of container_cpu_cfs_throttled_seconds_total above 1, sustained for 2m.
            # Keys are aligned with "alert" (they were one column short, which breaks the mapping).
            expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: Container high throttle rate (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "Container is being throttled\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    blackbox.yaml

    # 1.4. Blackbox : prometheus/blackbox_exporter (8 rules)
    groups:
        - name: Blackbox黑匣子-监控告警                                               # Name of this alerting-rule group
          rules:                                                                # Alerting rules that belong to the group
    # 1.4.1. Blackbox probe failed
    # Probe failed
          - alert: BlackboxProbeFailed
            # Fires immediately when a probe reports probe_success == 0.
            expr: probe_success == 0
            for: 0m
            labels:
              severity: critical
            annotations:
              summary: Blackbox probe failed (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "Probe failed\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.4.2. Blackbox slow probe
    # Blackbox probe took more than 1s to complete
          - alert: BlackboxSlowProbe
            # 1m average of probe_duration_seconds above 1 second, sustained for 1m.
            expr: avg_over_time(probe_duration_seconds[1m]) > 1
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: Blackbox slow probe (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "Blackbox probe took more than 1s to complete\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.4.3. Blackbox probe HTTP failure
    # HTTP status code is not 200-399
          - alert: BlackboxProbeHttpFailure
            # Fires when the probed HTTP status code is <= 199 or >= 400.
            # Keys are aligned with "alert" (they were one column short, which breaks the mapping).
            expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
            for: 0m
            labels:
              severity: critical
            annotations:
              summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "HTTP status code is not 200-399\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.4.4. Blackbox SSL certificate will expire soon
    # SSL certificate expires in 30 days
          - alert: BlackboxSslCertificateWillExpireSoon
            # Earliest certificate expiry is less than 30 days (86400 * 30 seconds) away.
            expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
            for: 0m
            labels:
              severity: warning
            annotations:
              summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "SSL certificate expires in 30 days\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.4.5. Blackbox SSL certificate will expire soon
    # SSL certificate expires in 3 days
          - alert: BlackboxSslCertificateWillExpireSoon
            # Earliest certificate expiry is less than 3 days (86400 * 3 seconds) away.
            # Shares its alert name with the 30-day rule in this group; the severity label differs.
            expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
            for: 0m
            labels:
              severity: critical
            annotations:
              summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "SSL certificate expires in 3 days\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.4.6. Blackbox SSL certificate expired
    # SSL certificate has expired already
          - alert: BlackboxSslCertificateExpired
            # Earliest certificate expiry is at or before the current time.
            expr: probe_ssl_earliest_cert_expiry - time() <= 0
            for: 0m
            labels:
              severity: critical
            annotations:
              summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "SSL certificate has expired already\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.4.7. Blackbox probe slow HTTP
    # HTTP request took more than 1s
          - alert: BlackboxProbeSlowHttp
            # 1m average of probe_http_duration_seconds above 1 second, sustained for 1m.
            expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "HTTP request took more than 1s\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 1.4.8. Blackbox probe slow ping
    # Blackbox ping took more than 1s
          - alert: BlackboxProbeSlowPing
            # 1m average of probe_icmp_duration_seconds above 1 second, sustained for 1m.
            expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: Blackbox probe slow ping (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "Blackbox ping took more than 1s\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    kube-state-metrics.yaml

    # 5.1. Kubernetes: kube-state-metrics (33条规则)
    groups:
        # The name was copied from the Docker rule file; this group holds the Kubernetes
        # (kube-state-metrics) rules, so it is named accordingly.
        - name: Kubernetes-监控告警                                               # Name of this alerting-rule group
          rules:                                                                # Alerting rules that belong to the group
    # 5.1.1. Kubernetes Node ready
    # Node {{ $labels.node }} has been unready for a long time
          - alert: 节点断开连接
            # The Ready/true condition series of a node has been 0 for 10m.
            expr: kube_node_status_condition{condition="Ready",status="true"} == 0
            for: 10m
            labels:
              severity: critical
            annotations:
              summary: 节点断开连接 (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "节点 {{ $labels.node }} 已经很长时间没有联系上了\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 5.1.2. Kubernetes memory pressure
    # {{ $labels.node }} has MemoryPressure condition
          - alert: k8s节点内存有压力
            # The MemoryPressure/true condition series of a node has been 1 for 2m.
            expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
            for: 2m
            labels:
              severity: critical
            annotations:
              summary: k8s节点内存有压力 (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "{{ $labels.node }} 是否存在内存有压力\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 5.1.3. Kubernetes disk pressure
    # {{ $labels.node }} has DiskPressure condition
          - alert: k8s节点磁盘有压力
            # The DiskPressure/true condition series of a node has been 1 for 2m.
            expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
            for: 2m
            labels:
              severity: critical
            annotations:
              summary: k8s节点存在磁盘有压力 (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "{{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 5.1.4. Kubernetes out of disk
    # {{ $labels.node }} has OutOfDisk condition
          - alert: k8s磁盘不足
            # The OutOfDisk/true condition series of a node has been 1 for 2m.
            expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
            for: 2m
            labels:
              severity: critical
            annotations:
              summary: K8s磁盘空间不足 (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "{{ $labels.node }} 磁盘空间不足\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 5.1.5. Kubernetes out of capacity
    # {{ $labels.node }} is out of capacity
          - alert: 容量不足
            # Running pods per node (node label joined in from kube_pod_info) as a percentage of
            # kube_node_status_allocatable_pods, above 90 for 2m.
            expr: sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(pod, namespace) group_left(node) (0 * kube_pod_info)) / sum(kube_node_status_allocatable_pods) by (node) * 100 > 90
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: Kubernetes 容量不足 (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "{{ $labels.node }} 容量不足\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 5.1.6. Kubernetes container oom killer
    # Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.
          - alert: 十分钟容器被kill的次数
            # Restart counter is at least 1 higher than 10m ago, and the last-terminated reason
            # series for OOMKilled stayed at 1 throughout the last 10m.
            expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
            for: 0m
            labels:
              severity: warning
            annotations:
              summary: 十分钟pod被kill的次数 (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "过去10分钟内容器 {{ $labels.container }} 在pod {{ $labels.namespace }}/{{ $labels.pod }} 被杀死了 {{ $value }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 5.1.7. Kubernetes Job failed
    # Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete
          - alert: job 未能完成
            # Fires immediately when a Job reports at least one failed pod.
            expr: kube_job_status_failed > 0
            for: 0m
            labels:
              severity: warning
            annotations:
              summary: Kubernetes Job 未完成 (instance {{ $labels.instance }})
              # Uses the job_name label, matching the other Job rule in this file.
              # NOTE(review): confirm the deployed kube-state-metrics exposes job_name rather than
              # a "job" label that gets renamed to exported_job on scrape.
              description: "Job {{$labels.namespace}}/{{$labels.job_name}} 未能完成\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 5.1.9. Kubernetes PersistentVolumeClaim pending
    # PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending
          - alert: k8s volumeclaim 已挂起
            # A PersistentVolumeClaim has been in phase Pending for 2m.
            expr: kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: k8s PersistentVolumeClaim 已挂起 (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} 已挂起\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 5.1.12. Kubernetes PersistentVolume error
    # Persistent volume is in bad state
          - alert: 永久卷处于错误状态
            # Fires immediately when a PersistentVolume is in phase Failed or Pending.
            expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0
            for: 0m
            labels:
              severity: critical
            annotations:
              summary: K8s 永久卷处于错误状态 (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "永久卷处于错误状态\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 5.1.13. Kubernetes StatefulSet down
    # A StatefulSet went down
          - alert: k8s 状态集
            # Ratio of ready to current StatefulSet replicas differs from 1 for 1m.
            # NOTE(review): when both metrics are 0 the ratio is NaN, and NaN != 1 also matches;
            # confirm whether StatefulSets scaled to zero should alert.
            expr: (kube_statefulset_status_replicas_ready / kube_statefulset_status_replicas_current) != 1
            for: 1m
            labels:
              severity: critical
            annotations:
              summary: Kubernetes 状态集 down (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "A StatefulSet went down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 5.1.17. Kubernetes Pod not healthy
    # Pod has been in a non-ready state for longer than 15 minutes.
          - alert: POd 亚健康状态
            # Subquery: at every 1m step of the last 15m the pod was in Pending, Unknown or Failed.
            expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[15m:1m]) > 0
            for: 0m
            labels:
              severity: critical
            annotations:
              summary: k8s Pod not healthy (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "Pod已处于非就绪状态超过15分钟。\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 5.1.18. Kubernetes pod crash looping
    # Pod {{ $labels.pod }} is crash looping
          - alert: K8s Pod CrashLooping
            # Container restart counter increased by more than 3 within 1m, sustained for 2m.
            expr: increase(kube_pod_container_status_restarts_total[1m]) > 3
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "Pod {{ $labels.pod }} 崩溃循环\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 5.1.21. Kubernetes StatefulSet replicas mismatch
    # A StatefulSet does not match the expected number of replicas.
          - alert: 状态集与副本的预期数量不匹配
            # Ready replicas differ from the StatefulSet's reported replica count for 10m.
            expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: Kubernetes 状态集副本不匹配 (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "状态集与副本的预期数量不匹配.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 5.1.23. Kubernetes StatefulSet generation mismatch
    # A StatefulSet has failed but has not been rolled back.
          - alert: K8s状态集生成失配
            # Observed generation differs from the metadata generation for 10m.
            expr: kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation
            for: 10m
            labels:
              severity: critical
            annotations:
              summary: Kubernetes 状态集生成失配 (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "状态集已失败,但尚未被回滚。\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 5.1.28. Kubernetes job slow completion
    # Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time
          - alert: K8s Job 缓慢完成
            # Required completions still exceed succeeded pods after 12h.
            expr: kube_job_spec_completions - kube_job_status_succeeded > 0
            for: 12h
            labels:
              severity: critical
            annotations:
              summary: Kubernetes job 完成缓慢 (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "K8s Job {{ $labels.namespace }}/{{ $labels.job_name }} 未及时完成.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 5.1.30. Kubernetes API client errors
    # Kubernetes API client is experiencing high error rate
          - alert: K8s API客户端错误
            # Share of 4xx/5xx responses in rest_client_requests_total per (instance, job),
            # as a percentage of all requests (1m rates), above 1 for 2m.
            expr: (sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1
            for: 2m
            labels:
              severity: critical
            annotations:
              summary: Kubernetes API客户端错误 (instance {{ $labels.instance }})
              # "\n" escapes keep the multi-line text inside one valid double-quoted scalar.
              description: "Kubernetes API客户端遇到高错误率\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 5.1.31. Kubernetes client certificate expires next week
    # A client certificate used to authenticate to the apiserver is expiring next week.
    # Disabled rule, kept fully commented out. The description's continuation lines used to be
    # left uncommented and leaked into the YAML; they are folded into the commented line below.
    #      - alert: KubernetesClientCertificateExpiresNextWeek
    #        expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60
    #        for: 0m
    #        labels:
    #          severity: warning
    #        annotations:
    #          summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
    #          description: "用于向apiserver进行身份验证的客户端证书将于下周过期。\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 5.1.32. Kubernetes client certificate expires soon
    # A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
    # Disabled rule, kept fully commented out. The description's continuation lines used to be
    # left uncommented and leaked into the YAML; they are folded into the commented line below.
    #      - alert: KubernetesClientCertificateExpiresSoon
    #        expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60
    #        for: 0m
    #        labels:
    #          severity: critical
    #        annotations:
    #          summary: Kubernetes client certificate expires soon (instance {{ $labels.instance }})
    #          description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # 5.1.33. Kubernetes API server latency
    # Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.
    # Disabled rule, kept fully commented out. The description's continuation lines used to be
    # left uncommented and leaked into the YAML; they are folded into the commented line below.
    #      - alert: KubernetesApiServerLatency
    #        expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) / 1e+06 > 1
    #        for: 2m
    #        labels:
    #          severity: warning
    #        annotations:
    #          summary: Kubernetes API server latency (instance {{ $labels.instance }})
    #          description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    注:未测试,谨慎使用

  • 相关阅读:
    tomcat 启动 报错Neither the JAVA_HOME nor the JRE_HOME environment variable is definedtemp
    tomcat linux 加入服务自动启动
    registered the JDBC driver [com.mysql.jdbc.Driver] but failed to unregister it when the web application was stopped. To prevent a memory leak, the JDBC Driver has been forcibly unregistered.
    tomcat 8 加 struts2的 java.lang.NoSuchFieldException: resourceEntries
    tomcat Can't create cache file!
    tomcat 部署时修改服务器时间
    tomcat java变量环境设置
    scrapy 动态IP、随机UA、验证码
    scrapy xpath、正则表达式、css选择器
    Saltstack windows可视化操作(十四)
  • 原文地址:https://www.cnblogs.com/fat-girl-spring/p/15045717.html
Copyright © 2011-2022 走看看