Prometheus安装
cd /usr/src/
wget https://github.com/prometheus/prometheus/releases/download/v2.12.0/prometheus-2.12.0.linux-amd64.tar.gz
tar xf prometheus-2.12.0.linux-amd64.tar.gz -C /usr/local/
cd /usr/local/
ln -s prometheus-2.12.0.linux-amd64 prometheus
设置systemctl prometheus 启动
cat > /usr/lib/systemd/system/prometheus.service << EOF
[Unit]
Description=Prometheus: the monitoring system
Documentation=http://prometheus.io/docs/
[Service]
ExecStart=/usr/local/prometheus/prometheus --config.file=/usr/local/prometheus/prometheus.yml
Restart=always
StartLimitInterval=0
RestartSec=10
[Install]
WantedBy=multi-user.target
EOF
systemctl enable prometheus ##制作开机启动prometheus
systemctl start prometheus ##启动prometheus
systemctl status prometheus ##查看prometheus状态
node_exporter安装部署 -> prometheus依赖node_exporter来采集信息
cd /usr/src/
wget https://github.com/prometheus/node_exporter/releases/download/v0.18.1/node_exporter-0.18.1.linux-amd64.tar.gz
tar xf node_exporter-0.18.1.linux-amd64.tar.gz -C /usr/local/
cd /usr/local/
ln -s node_exporter-0.18.1.linux-amd64 node_exporter
制作systemctl方式启动node_exporter
cat > /usr/lib/systemd/system/node_exporter.service << EOF
[Unit]
Description=Prometheus node exporter
After=local-fs.target network-online.target network.target
Wants=local-fs.target network-online.target network.target
[Service]
Restart=on-failure
ExecStart="/usr/local/node_exporter/node_exporter"
[Install]
WantedBy=multi-user.target
EOF
制作node_exporter服务启动
systemctl enable node_exporter.service ##制作开机启动node_exporter
systemctl start node_exporter.service ##启动node_exporter
systemctl status node_exporter.service ##查看node_exporter状态
访问方式
http://localhost:9090
获取主机信息
curl http://localhost:9100/metrics
如需要设置报警面板显示和监控多台机器可参考142机器的prometheus.yml
# Prometheus main configuration (prometheus.yml).
# Nesting is significant in YAML: every key below must keep its indentation.

# Global defaults applied to every scrape job and rule group.
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 127.0.0.1:9093
            # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/*.yml"
  # - "first_rules.yml"
  # - "second_rules.yml"

# Scrape configurations: one job per group of endpoints.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  # Prometheus scraping itself.
  - job_name: 'prometheus'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ['192.168.1.140:9090']

  # Hosts scraped on port 9100.
  # NOTE(review): the job name is spelled 'promethues-node'; it is kept as-is
  # because renaming it changes the `job` label on existing series.
  - job_name: 'promethues-node'
    static_configs:
      - targets:
          - 192.168.1.140:9100
          - 192.168.1.137:9100
          - 192.168.1.57:9100
          - 192.168.1.141:9100
          - 192.168.1.60:9100
          - 192.168.1.201:9100
      # Second target group so this host gets an explicit instance label.
      - targets: ['192.168.1.59:9100']
        labels:
          instance: dataexa-insight-59

  # JMX exporter endpoint (port 3010).
  - job_name: 'jmx'
    static_configs:
      - targets:
          - 192.168.1.59:3010

  # Pushgateway endpoint; the instance label is overridden with a fixed name.
  - job_name: pushgateway
    static_configs:
      - targets: ['192.168.191.159:9091']
        labels:
          instance: pushgateway
jvm 监控
# 资料来源 https://www.jianshu.com/p/adada9c1f7dd
wget https://repo1.maven.org/maven2/io/prometheus/jmx/jmx_prometheus_javaagent/0.3.1/jmx_prometheus_javaagent-0.3.1.jar
# java -javaagent:/usr/local/prometheus/jmx_exporter/jmx_prometheus_javaagent-0.3.1.jar=3010:/usr/local/prometheus/jmx_exporter/jmx_exporter.yml -jar yourJar.jar
报警规则编写
需要在prometheus.yml的同级目录下创建rules目录 --> mkdir rules
cat rules/warning.yml
# Host alerting rules, loaded through the `rules/*.yml` glob in prometheus.yml.
# Every rule must hold for 1 minute (`for: 1m`) before it fires.
groups:
  - name: 主机状态-监控告警
    rules:
      # Target is not answering scrapes.
      - alert: 主机状态
        expr: up == 0
        for: 1m
        labels:
          status: 非常严重
        annotations:
          summary: "{{$labels.instance}}:服务器宕机"
          description: "{{$labels.instance}}:服务器延时超过1分钟"

      # CPU busy percentage per instance, derived from the idle counter.
      # The aggregation keeps only the `instance` label, so annotations use it.
      - alert: CPU使用情况
        expr: 100-(avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100) > 80
        for: 1m
        labels:
          status: 一般告警
        annotations:
          summary: "{{$labels.instance}} CPU使用率过高!"
          description: "{{$labels.instance}} CPU使用大于80%(目前使用:{{$value}}%)"

      # Used memory percentage = 100 - (free + buffers + cached) / total * 100.
      - alert: 内存使用
        expr: 100 - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) / node_memory_MemTotal_bytes * 100 > 80
        for: 1m
        labels:
          status: 严重告警
        annotations:
          summary: "{{$labels.instance}} 内存使用率过高!"
          description: "{{$labels.instance}} 内存使用大于80%(目前使用:{{$value}}%)"

      # Average disk busy percentage per instance; fires above 60%.
      - alert: IO性能
        expr: avg(irate(node_disk_io_time_seconds_total[1m])) by(instance) * 100 > 60
        for: 1m
        labels:
          status: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流入磁盘IO使用率过高!"
          description: "{{$labels.instance}} 流入磁盘IO大于60%(目前使用:{{$value}}%)"

      # Inbound traffic summed over physical interfaces (virtual ones excluded).
      # NOTE(review): `/ 100 > 102400` is about 10 MB/s, not the "100M" in the
      # text; the threshold is left unchanged - confirm the intended unit.
      - alert: 网络
        expr: ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])) by (instance)) / 100) > 102400
        for: 1m
        labels:
          status: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流入网络带宽过高!"
          description: "{{$labels.instance}} 流入网络带宽持续1分钟高于100M. RX带宽使用率{{$value}}"

      # Outbound traffic, same interface filter and threshold as above.
      - alert: 网络
        expr: ((sum(rate (node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])) by (instance)) / 100) > 102400
        for: 1m
        labels:
          status: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流出网络带宽过高!"
          description: "{{$labels.instance}} 流出网络带宽持续1分钟高于100M. TX带宽使用率{{$value}}"

      # Number of established TCP connections (a count, not a percentage).
      - alert: TCP会话
        expr: node_netstat_Tcp_CurrEstab > 1000
        for: 1m
        labels:
          status: 严重告警
        annotations:
          summary: "{{$labels.instance}} TCP_ESTABLISHED过高!"
          description: "{{$labels.instance}} TCP_ESTABLISHED大于1000(目前连接数:{{$value}})"

      # Used space percentage per ext4/xfs filesystem; `mountpoint` exists here.
      - alert: 磁盘容量
        expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes {fstype=~"ext4|xfs"}*100) > 80
        for: 1m
        labels:
          status: 严重告警
        annotations:
          summary: "{{$labels.instance}} {{$labels.mountpoint}} 磁盘分区使用率过高!"
          description: "{{$labels.instance}} {{$labels.mountpoint}} 磁盘分区使用大于80%(目前使用:{{$value}}%)"
安装grafana
cd /usr/local/src/
wget https://dl.grafana.com/oss/release/grafana-5.4.3-1.x86_64.rpm
yum localinstall grafana-5.4.3-1.x86_64.rpm
#启动服务
systemctl start grafana-server
#查看服务是否正常启动
systemctl status grafana-server
#自启动
systemctl enable grafana-server
访问
浏览器访问http://localhost:3000
grafana网页操作
https://www.cnblogs.com/zhaojiedi1992/p/zhaojiedi_liunx_64_prometheus_granafa.html
监控gpu
url:https://github.com/NVIDIA/gpu-monitoring-tools/tree/master/exporters/prometheus-dcgm
实际操作:
docker run -d --runtime=nvidia --name=nvidia-dcgm-exporter -v /run/prometheus:/run/prometheus nvidia/dcgm-exporter
docker run -d --net="host" --pid="host" --volumes-from nvidia-dcgm-exporter:ro quay.io/prometheus/node-exporter --collector.textfile.directory="/run/prometheus"
启动的三个服务
systemctl start prometheus
systemctl start node_exporter
systemctl start grafana-server
alertmanager报警插件安装
wget https://github.com/prometheus/alertmanager/releases/download/v0.19.0/alertmanager-0.19.0.linux-amd64.tar.gz
tar xf alertmanager-0.19.0.linux-amd64.tar.gz -C /usr/local
mv /usr/local/alertmanager-0.19.0.linux-amd64 /usr/local/alertmanager
/usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml #启动
# 配置报警文件
cat alertmanager.yml
# Alertmanager configuration (alertmanager.yml).
global:
  resolve_timeout: 5m

# Single route: alerts are grouped by alert name and sent to 'web.hook'.
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'

receivers:
  - name: 'web.hook'
    webhook_configs:
      # NOTE(review): presumably the prometheus-webhook-dingtalk listener;
      # confirm that it is bound to port 8060.
      - url: 'http://localhost:8060/dingtalk/webhook/send'
钉钉报警
下载dingtalk进行报警
资料来源 https://www.codetd.com/article/6798984
下载好之后 选择使用markdown格式的报警格式
cat > /usr/local/gocode/src/github.com/timonwong/prometheus-webhook-dingtalk/template/default.tmpl << 'EOF'
{{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }}
{{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }}
{{ define "__text_alert_list" }}{{ range . }}
**Labels**
{{ range .Labels.SortedPairs }}> - {{ .Name }}: {{ .Value | markdown | html }}
{{ end }}
**Annotations**
{{ range .Annotations.SortedPairs }}> - {{ .Name }}: {{ .Value | markdown | html }}
{{ end }}
**Source:** [{{ .Annotations.summary }}]({{ .GeneratorURL }})
{{ end }}{{ end }}
{{ define "ding.link.title" }}{{ template "__subject" . }}{{ end }}
{{ define "ding.link.content" }}#### [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] **[{{ index .GroupLabels "alertname" }}]({{ template "__alertmanagerURL" . }})**
{{ template "__text_alert_list" .Alerts.Firing }}
{{ end }}
EOF
#dingtalk插件 指定格式模板和钉钉接口来启动
nohup prometheus-webhook-dingtalk --template.file="/usr/local/gocode/src/github.com/timonwong/prometheus-webhook-dingtalk/template/default.tmpl" --ding.profile="webhook=https://oapi.dingtalk.com/robot/send?access_token=64a517b7a1d0ad2dc23exxxx00fe18b0b4e15491f179456f94b6ff5" >dingding.log 2>&1 &
钉钉报警群设置
只需要设置好公网ip即可
自定义监控项pushgateway
wget https://github.com/prometheus/pushgateway/releases/download/v0.10.0/pushgateway-0.10.0.linux-amd64.tar.gz
tar xf pushgateway-0.10.0.linux-amd64.tar.gz -C /usr/local
mv /usr/local/pushgateway-0.10.0.linux-amd64 /usr/local/pushgateway
/usr/local/pushgateway/pushgateway #启动
使用脚本来获取机器值
cat count_netstat_wait_connections.sh
#!/bin/bash
# Counts the netstat lines that contain "wait" (case-insensitive) and pushes
# the number to the Pushgateway on localhost:9091 as the metric
# count_netstat_wait_connections.

# Host name up to the first dot; used as the `instance` grouping label.
instance_name=$(hostname -f | cut -d'.' -f1)
# Metric name.
label="count_netstat_wait_connections"
# grep -c counts matching lines directly (prints 0 when nothing matches).
count_netstat_wait_connections=$(netstat -an | grep -ci wait)
echo "$label: $count_netstat_wait_connections"
# --fail makes curl exit non-zero when the Pushgateway rejects the push.
# The URL is quoted so an unusual host name cannot split it into several words.
echo "$label $count_netstat_wait_connections" \
  | curl --fail --data-binary @- "http://localhost:9091/metrics/job/pushgateway_test/instance/${instance_name}"
# pushgateway_test and the host name are sent as the job/instance path
# segments of the push URL.
# NOTE(review): the original note said pushgateway_test must equal a job_name
# in prometheus.yml, but the scrape job there is named `pushgateway` - confirm.
prometheus页面查看值
在prometheus页面查询这个变量 count_netstat_wait_connections 即可获取到值