此处记录 Prometheus 监控项(告警规则),exporter 为 node_exporter。
vim rules.yml
# Prometheus alerting rules for hosts scraped through node_exporter
# (the contents of rules.yml).
#
# Layout: two rule groups.
#   - node:    scrape health of the "node" job.
#   - cluster: per-host resource alerts (CPU, load, disk, network, inodes,
#              memory, iowait, zombie processes, logged-in users).
#
# NOTE(review): most rules use `for: 5s`. If the evaluation interval is
# longer than that (Prometheus defaults to 1m unless overridden), the hold
# time is effectively one evaluation cycle - confirm this is intended.
groups:
  - name: node
    rules:
      # Fires when the scrape target of job "node" has been down (up == 0)
      # for 15 seconds.
      - alert: server_status
        expr: up{job="node"} == 0
        for: 15s
        labels:
          severity: 'critical'
        annotations:
          summary: " node_exporter is down"

  - name: cluster
    rules:
      # Non-idle CPU percentage over the last minute, above 90%.
      # NOTE(review): the expression is not aggregated, so it is evaluated
      # for every series of node_cpu_seconds_total{mode="idle"} (presumably
      # one per core); the iowait rule below averages by instance instead.
      # NOTE(review): the summary interpolates a `name` label - confirm the
      # targets carry it, otherwise it renders empty.
      - alert: CPU
        expr: (1-rate(node_cpu_seconds_total{mode="idle"}[1m]))*100 > 90
        for: 5s
        labels:
          severity: 'warning'
        annotations:
          summary: " cpu利用率超过 90%,{{ .Labels.name }}当前值: {{ $value }}%"

      # Disabled combined load alert, kept for reference. The separate
      # LOAD1 / LOAD5 / LOAD15 rules below are the active ones.
      # - alert: LOAD1
      #   expr: node_load5 > Logical_CPU_core_total*0.3 or node_load1 > Logical_CPU_core_total*0.4 or node_load15 > Logical_CPU_core_total*0.2
      #   for: 5s
      #   labels:
      #     severity: 'critical'
      #   annotations:
      #     summary: " load过高 当前值为 {{ $value }}"

      # 1-minute load average above 3x Logical_CPU_core_total.
      # NOTE(review): Logical_CPU_core_total is not defined in this file;
      # presumably a recording rule or custom metric holding the logical
      # core count per host - verify it exists with matching labels.
      - alert: LOAD1
        expr: node_load1 > Logical_CPU_core_total*3
        for: 5s
        labels:
          severity: 'warning'
        annotations:
          summary: " load1>cpu*3 当前值为 {{ $value }}"

      # 5-minute load average above 2x Logical_CPU_core_total.
      - alert: LOAD5
        expr: node_load5 > Logical_CPU_core_total*2
        for: 5s
        labels:
          severity: 'warning'
        annotations:
          summary: " load5>cpu*2 当前值为 {{ $value }}"

      # 15-minute load average above 2x Logical_CPU_core_total.
      - alert: LOAD15
        expr: node_load15 > Logical_CPU_core_total*2
        for: 5s
        labels:
          severity: 'warning'
        annotations:
          summary: " load15>cpu*2 当前值为 {{ $value }}"

      # Used-space percentage of the xfs/ext4 filesystem mounted at "/",
      # above 80%.
      - alert: space_root
        expr: (1-node_filesystem_avail_bytes{fstype=~"xfs|ext4",mountpoint="/"}/node_filesystem_size_bytes{fstype=~"xfs|ext4",mountpoint="/"})*100 > 80
        for: 5s
        labels:
          severity: 'critical'
        annotations:
          summary: " /下空间使用率大于80% 当前值为{{ $value }}% "

      # Used-space percentage of the xfs/ext4 filesystem mounted at "/data",
      # above 80%.
      - alert: space_data
        expr: (1-node_filesystem_avail_bytes{fstype=~"xfs|ext4",mountpoint="/data"}/node_filesystem_size_bytes{fstype=~"xfs|ext4",mountpoint="/data"})*100 > 80
        for: 5s
        labels:
          severity: 'critical'
        annotations:
          summary: " /data空间使用率大于80% 当前值为{{ $value }}% "

      # Transmit rate on eth0 over the last minute, in MiB/s
      # (bytes / 1048576), above 10.
      - alert: upload_rate
        expr: rate(node_network_transmit_bytes_total{device="eth0"}[1m])/1048576 > 10
        for: 5s
        labels:
          severity: 'warning'
        annotations:
          summary: " 上传速率大于10M 当前值为{{ $value }}M"

      # Receive rate on eth0 over the last minute, in MiB/s
      # (bytes / 1048576), above 10.
      - alert: download_rate
        expr: rate(node_network_receive_bytes_total{device="eth0"}[1m])/1048576 > 10
        for: 5s
        labels:
          severity: 'warning'
        annotations:
          summary: " 下载速率大于10M 当前值为{{ $value }}M "

      # Used-inode percentage of the xfs/ext4 filesystem mounted at "/",
      # above 50%.
      - alert: inode_size
        expr: (1-node_filesystem_files_free{fstype=~"xfs|ext4",mountpoint="/"}/node_filesystem_files{fstype=~"xfs|ext4",mountpoint="/"})*100 > 50
        for: 5s
        labels:
          severity: 'critical'
        annotations:
          summary: " /下inode使用率大于50% 当前值为{{ $value }}% "

      # Memory in use, computed as 1 - MemAvailable / MemTotal, above 80%.
      - alert: Memory_usage
        expr: (1-(node_memory_MemAvailable_bytes)/node_memory_MemTotal_bytes)*100 > 80
        for: 5s
        labels:
          severity: 'warning'
        annotations:
          summary: "内存使用率大于80% 当前值为{{ $value }}% "

      # iowait share of CPU time over the last 5 minutes, averaged per
      # instance, above 50%.
      - alert: iowait
        expr: (avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100) > 50
        for: 5s
        labels:
          severity: 'critical'
        annotations:
          summary: "cpu iowait大于50% 当前值为{{ $value }}% "

      # Value of the procs_zombie metric above 20.
      # NOTE(review): procs_zombie is not defined in this file; presumably
      # a custom metric - verify its source.
      - alert: procs_zombie
        expr: procs_zombie > 20
        for: 5s
        labels:
          severity: 'critical'
        annotations:
          summary: " procs_zombie 大于20 当前值为{{ $value }} "

      # Value of the logined_users_total metric above 25.
      # NOTE(review): logined_users_total is not defined in this file;
      # presumably a custom metric - verify its source.
      - alert: logined_users
        expr: logined_users_total > 25
        for: 5s
        labels:
          severity: 'critical'
        annotations:
          summary: "logined_users 大于25 当前值为{{ $value }} "