zoukankan      html  css  js  c++  java
  • cAdvisor容器监控规则

    其他说明参考host主机监控规则:https://www.cnblogs.com/sanduzxcvbnm/p/13589848.html

    在 Prometheus 主程序目录下的 rules 目录中新建 docker.yml 文件,添加如下内容,然后重启 Prometheus。

    # Prometheus alerting rules. The single group below holds the container
    # alerts as well as the PGBouncer, Sidekiq and Consul alerts.
    groups:
    - name: Docker containers monitoring
      rules:
      # Fires when a container's `container_last_seen` timestamp has been more
      # than 60 seconds behind the evaluation time for 5 minutes.
      # NOTE(review): if the exporter stops publishing the series shortly after
      # the container exits, the 5m `for` window may outlast it -- confirm the
      # alert can actually reach the firing state in your setup.
      - alert: ContainerKilled
        expr: time() - container_last_seen > 60
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container killed (instance {{ $labels.instance }})"
          # Literal block scalar: keeps one item per line in the notification.
          description: |-
            A container has disappeared
              VALUE = {{ $value }}
              LABELS: {{ $labels }}
      # Fires when the 3-minute per-second rate of
      # `container_cpu_usage_seconds_total`, summed per (instance, name) and
      # scaled by 100, stays above 80 for 5 minutes.
      - alert: ContainerCpuUsage
        expr: (sum(rate(container_cpu_usage_seconds_total[3m])) BY (instance, name) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container CPU usage (instance {{ $labels.instance }})"
          # Literal block scalar: keeps one item per line in the notification.
          description: |-
            Container CPU usage is above 80%
              VALUE = {{ $value }}
              LABELS: {{ $labels }}
      # Fires when memory usage, as a percentage of the configured memory
      # limit and grouped per (instance, name), stays above 80 for 5 minutes.
      # The `> 0` filter drops series whose limit is 0; dividing by a zero
      # limit yields +Inf, which would always satisfy `> 80`.
      - alert: ContainerMemoryUsage
        expr: (sum(container_memory_usage_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container Memory usage (instance {{ $labels.instance }})"
          # Literal block scalar: keeps one item per line in the notification.
          description: |-
            Container Memory usage is above 80%
              VALUE = {{ $value }}
              LABELS: {{ $labels }}
      # Fires when the used share of filesystem inodes per instance, expressed
      # as a percentage, stays above 80 for 5 minutes.
      # The subtraction is parenthesised before scaling: `1 - (ratio * 100)`
      # can never exceed 1, so the previous form could not reach the threshold.
      # NOTE(review): the expression measures inodes, not bytes, although the
      # alert text says "Volume usage" -- confirm which one is intended.
      - alert: ContainerVolumeUsage
        expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container Volume usage (instance {{ $labels.instance }})"
          # Literal block scalar: keeps one item per line in the notification.
          description: |-
            Container Volume usage is above 80%
              VALUE = {{ $value }}
              LABELS: {{ $labels }}
      # Fires when `container_fs_io_current`, summed per (instance, name) and
      # multiplied by 100, stays above 80 for 5 minutes.
      # NOTE(review): presumably this metric is a count of in-flight I/Os
      # rather than a ratio, so the "80%" wording may not match -- confirm.
      - alert: ContainerVolumeIoUsage
        expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container Volume IO usage (instance {{ $labels.instance }})"
          # Literal block scalar: keeps one item per line in the notification.
          description: |-
            Container Volume IO usage is above 80%
              VALUE = {{ $value }}
              LABELS: {{ $labels }}
      # Fires when the 3-minute per-second rate of
      # `container_cpu_cfs_throttled_seconds_total` stays above 1 for 5 minutes.
      - alert: ContainerHighThrottleRate
        expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container high throttle rate (instance {{ $labels.instance }})"
          # Literal block scalar: keeps one item per line in the notification.
          description: |-
            Container is being throttled
              VALUE = {{ $value }}
              LABELS: {{ $labels }}
      # Fires when `pgbouncer_pools_server_active_connections` stays above 200
      # for 5 minutes.
      # NOTE(review): the alert name is misspelled ("Connectinos"). It is kept
      # unchanged here because routing rules or silences may match on it;
      # rename it together with those consumers.
      - alert: PgbouncerActiveConnectinos
        expr: pgbouncer_pools_server_active_connections > 200
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PGBouncer active connections (instance {{ $labels.instance }})"
          # Literal block scalar: keeps one item per line in the notification.
          description: |-
            PGBouncer pools are filling up
              VALUE = {{ $value }}
              LABELS: {{ $labels }}
      # Fires when `pgbouncer_errors_count` grows by more than 10 within a
      # 5-minute window, for 5 minutes. Errors whose `errmsg` label equals
      # "server conn crashed?" are excluded by the selector.
      - alert: PgbouncerErrors
        expr: increase(pgbouncer_errors_count{errmsg!="server conn crashed?"}[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PGBouncer errors (instance {{ $labels.instance }})"
          # Literal block scalar: keeps one item per line in the notification.
          description: |-
            PGBouncer is logging errors. This may be due to a server restart or an admin typing commands at the pgbouncer console.
              VALUE = {{ $value }}
              LABELS: {{ $labels }}
      # Fires when errors with the `errmsg` label
      # "no more connections allowed (max_client_conn)" keep occurring
      # (1-minute rate above 0) for 5 minutes.
      - alert: PgbouncerMaxConnections
        expr: rate(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[1m]) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "PGBouncer max connections (instance {{ $labels.instance }})"
          # Literal block scalar: keeps one item per line in the notification.
          description: |-
            The number of PGBouncer client connections has reached max_client_conn.
              VALUE = {{ $value }}
              LABELS: {{ $labels }}
      # Fires when `sidekiq_queue_size` stays above 100 for 5 minutes.
      # The description reads the queue name from the series' `name` label.
      - alert: SidekiqQueueSize
        expr: sidekiq_queue_size > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Sidekiq queue size (instance {{ $labels.instance }})"
          # Literal block scalar: keeps one item per line in the notification.
          description: |-
            Sidekiq queue {{ $labels.name }} is growing
              VALUE = {{ $value }}
              LABELS: {{ $labels }}
      # Fires when the maximum of `sidekiq_queue_latency` across all series
      # stays above 120 for 5 minutes.
      # NOTE(review): `max()` without a `by` clause removes every label, so
      # `{{ $labels.instance }}` and `{{ $labels }}` render empty for this
      # alert. Add `by (...)` if per-instance alerts are wanted.
      - alert: SidekiqSchedulingLatencyTooHigh
        expr: max(sidekiq_queue_latency) > 120
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Sidekiq scheduling latency too high (instance {{ $labels.instance }})"
          # Literal block scalar: keeps one item per line in the notification.
          description: |-
            Sidekiq jobs are taking more than 2 minutes to be picked up. Users may be seeing delays in background processing.
              VALUE = {{ $value }}
              LABELS: {{ $labels }}
      # Fires when `consul_catalog_service_node_healthy` equals 0 for
      # 5 minutes. The description reports the series' `service_name` and
      # `service_id` labels.
      - alert: ConsulServiceHealthcheckFailed
        expr: consul_catalog_service_node_healthy == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Consul service healthcheck failed (instance {{ $labels.instance }})"
          # Literal block scalar: keeps one item per line in the notification.
          description: |-
            Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`
              VALUE = {{ $value }}
              LABELS: {{ $labels }}
      # Fires when `consul_raft_peers` stays below 3 for 5 minutes.
      # The threshold of 3 is hard-coded; adjust it if the cluster is sized
      # differently.
      - alert: ConsulMissingMasterNode
        expr: consul_raft_peers < 3
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Consul missing master node (instance {{ $labels.instance }})"
          # Literal block scalar: keeps one item per line in the notification.
          description: |-
            Numbers of consul raft peers should be 3, in order to preserve quorum.
              VALUE = {{ $value }}
              LABELS: {{ $labels }}
      # Fires when a `consul_health_node_status` series with
      # `status="critical"` equals 1 for 5 minutes.
      - alert: ConsulAgentUnhealthy
        expr: consul_health_node_status{status="critical"} == 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Consul agent unhealthy (instance {{ $labels.instance }})"
          # Literal block scalar: keeps one item per line in the notification.
          description: |-
            A Consul agent is down
              VALUE = {{ $value }}
              LABELS: {{ $labels }}
    
  • 相关阅读:
    思考题
    对敏捷开发的见解
    Code Review(自评)
    求数组最大子数组和
    [ASP.NET]在虚拟目录中禁止web.config继承IIS根目录的web.config的配置
    客户的真实需求
    利用using和try/finally語句來清理資源.
    《代码整洁之道》简单总结
    ASP.NET页面级别的事
    根据DIV移动生成图片
  • 原文地址:https://www.cnblogs.com/sanduzxcvbnm/p/13597305.html
Copyright © 2011-2022 走看看