zoukankan      html  css  js  c++  java
  • prometheus grafana dingtalk pushgateway alertmanager

    Prometheus安装

     cd /usr/src/
     wget https://github.com/prometheus/prometheus/releases/download/v2.12.0/prometheus-2.12.0.linux-amd64.tar.gz
    tar xf prometheus-2.12.0.linux-amd64.tar.gz  -C /usr/local/
    cd /usr/local/
    ln -s prometheus-2.12.0.linux-amd64  prometheus
    

    设置systemctl prometheus 启动

    cat > /usr/lib/systemd/system/prometheus.service     << EOF
    [Unit]
    Description=Prometheus: the monitoring system
    Documentation=http://prometheus.io/docs/
    
    [Service]
    ExecStart=/usr/local/prometheus/prometheus  --config.file=/usr/local/prometheus/prometheus.yml
    Restart=always
    StartLimitInterval=0
    RestartSec=10
    
    [Install]
    WantedBy=multi-user.target
    EOF
    
    systemctl enable prometheus       ##制作开机启动prometheus  
    systemctl start prometheus            ##启动prometheus
    systemctl status prometheus         ##查看prometheus状态
    

    node_exporter安装部署 -> prometheus依赖node_exporter来采集信息

    cd /usr/src/
    wget https://github.com/prometheus/node_exporter/releases/download/v0.18.1/node_exporter-0.18.1.linux-amd64.tar.gz
    tar xf node_exporter-0.18.1.linux-amd64.tar.gz  -C /usr/local/
    cd  /usr/local/
    ln -s node_exporter-0.18.1.linux-amd64  node_exporter
    

    制作systemctl方式启动node_exporter

    cat >  /usr/lib/systemd/system/node_exporter.service  <<  EOF
    [Unit]
    Description=Prometheus node exporter
    After=local-fs.target network-online.target network.target
    Wants=local-fs.target network-online.target network.target
    
    [Service]
    Restart=on-failure
    ExecStart="/usr/local/node_exporter/node_exporter"
    
    [Install]
    WantedBy=multi-user.target
    EOF
    

    制作node_exporter服务启动

    systemctl enable node_exporter.service     ##制作开机启动node_exporter
    systemctl start node_exporter.service          ##启动node_exporter
    systemctl status node_exporter.service       ##查看node_exporter状态
    

    访问方式

    http://localhost:9090
    

    获取主机信息(node_exporter默认监听9100端口)

    curl http://localhost:9100/metrics
    

    如需要设置报警面板显示和监控多台机器可参考142机器的prometheus.yml

    # my global config
    global:
      scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
      evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
      # scrape_timeout is set to the global default (10s).
    
    # Alertmanager configuration
    alerting:
      alertmanagers:
      - static_configs:
        - targets:
            - 127.0.0.1:9093
          # - alertmanager:9093
    
    # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
    rule_files:
      - "rules/*.yml"
      # - "first_rules.yml"
      # - "second_rules.yml"
    
    # A scrape configuration containing exactly one endpoint to scrape:
    # Here it's Prometheus itself.
    scrape_configs:
      # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
      - job_name: 'prometheus'
    
        # metrics_path defaults to '/metrics'
        # scheme defaults to 'http'.
        static_configs:
        - targets: ['192.168.1.140:9090']
      - job_name: 'promethues-node'
        static_configs:
        - targets:
            - 192.168.1.140:9100
            - 192.168.1.137:9100
            - 192.168.1.57:9100
            - 192.168.1.141:9100
            - 192.168.1.60:9100
            - 192.168.1.201:9100
        - targets: ['192.168.1.59:9100']
          labels:
           instance: dataexa-insight-59
    
      - job_name: 'jmx'
        static_configs:
        - targets:
            - 192.168.1.59:3010
      - job_name: pushgateway
        static_configs:
        - targets: ['192.168.191.159:9091']
          labels:
            instance: pushgateway
    

    jvm 监控

    # 资料来源 https://www.jianshu.com/p/adada9c1f7dd
    wget https://repo1.maven.org/maven2/io/prometheus/jmx/jmx_prometheus_javaagent/0.3.1/jmx_prometheus_javaagent-0.3.1.jar
    # java -javaagent:/usr/local/prometheus/jmx_exporter/jmx_prometheus_javaagent-0.3.1.jar=3010:/usr/local/prometheus/jmx_exporter/jmx_exporter.yml -jar yourJar.jar
    
    

    报警规则编写

    需要在prometheus.yml的同级目录下创建rules目录  --> mkdir rules
    cat rules/warning.yml
    groups:
        - name: 主机状态-监控告警
          rules:
          - alert: 主机状态
            expr: up == 0
            for: 1m
            labels:
              status: 非常严重
            annotations:
              summary: "{{$labels.instance}}:服务器宕机"
              description: "{{$labels.instance}}:服务器延时超过5分钟"
    
          - alert: CPU使用情况
            expr: 100-(avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100) > 80
            for: 1m
            labels:
              status: 一般告警
            annotations:
              summary: "{{$labels.mountpoint}} CPU使用率过高!"
              description: "{{$labels.mountpoint }} CPU使用大于80%(目前使用:{{$value}}%)"
    
          - alert: 内存使用
            expr: 100 -(node_memory_MemTotal_bytes -node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes ) / node_memory_MemTotal_bytes * 100> 80
            for: 1m
            labels:
              status: 严重告警
            annotations:
              summary: "{{$labels.mountpoint}} 内存使用率过高!"
              description: "{{$labels.mountpoint }} 内存使用大于80%(目前使用:{{$value}}%)"
          - alert: IO性能
            expr: 100-(avg(irate(node_disk_io_time_seconds_total[1m])) by(instance)* 100) < 60
            for: 1m
            labels:
              status: 严重告警
            annotations:
              summary: "{{$labels.mountpoint}} 流入磁盘IO使用率过高!"
              description: "{{$labels.mountpoint }} 流入磁盘IO大于60%(目前使用:{{$value}})"
    
          - alert: 网络
            expr: ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance)) / 100) > 102400
            for: 1m
            labels:
              status: 严重告警
            annotations:
              summary: "{{$labels.mountpoint}} 流入网络带宽过高!"
              description: "{{$labels.mountpoint }}流入网络带宽持续2分钟高于100M. RX带宽使用率{{$value}}"
    
          - alert: 网络
            expr: ((sum(rate (node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance)) / 100) > 102400
            for: 1m
            labels:
              status: 严重告警
            annotations:
              summary: "{{$labels.mountpoint}} 流出网络带宽过高!"
              description: "{{$labels.mountpoint }}流出网络带宽持续2分钟高于100M. RX带宽使用率{{$value}}"
    
          - alert: TCP会话
            expr: node_netstat_Tcp_CurrEstab > 1000
            for: 1m
            labels:
              status: 严重告警
            annotations:
              summary: "{{$labels.mountpoint}} TCP_ESTABLISHED过高!"
              description: "{{$labels.mountpoint }} TCP_ESTABLISHED大于1000%(目前使用:{{$value}}%)"
    
          - alert: 磁盘容量
            expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes {fstype=~"ext4|xfs"}*100) > 80
            for: 1m
            labels:
              status: 严重告警
            annotations:
              summary: "{{$labels.mountpoint}} 磁盘分区使用率过高!"
              description: "{{$labels.mountpoint }} 磁盘分区使用大于80%(目前使用:{{$value}}%)"
    

    安装grafana

    cd /usr/local/src/
    wget https://dl.grafana.com/oss/release/grafana-5.4.3-1.x86_64.rpm
    yum localinstall grafana-5.4.3-1.x86_64.rpm
    #启动服务
    systemctl start grafana-server
    #查看服务是否正常启动
    systemctl status grafana-server
    #自启动
    systemctl enable grafana-server
    

    访问

    浏览器访问http://localhost:3000
    

    grafana网页操作

    https://www.cnblogs.com/zhaojiedi1992/p/zhaojiedi_liunx_64_prometheus_granafa.html
    

    监控gpu

    url:https://github.com/NVIDIA/gpu-monitoring-tools/tree/master/exporters/prometheus-dcgm
    实际操作:
    docker run -d --runtime=nvidia --name=nvidia-dcgm-exporter -v /run/prometheus:/run/prometheus nvidia/dcgm-exporter
    docker run -d --net="host" --pid="host" --volumes-from nvidia-dcgm-exporter:ro quay.io/prometheus/node-exporter --collector.textfile.directory="/run/prometheus"
    

    启动的三个服务

    systemctl start prometheus                                      
    systemctl start node_exporter                                 
    systemctl start grafana-server              
    

    alertmanager报警插件安装

    wget https://github.com/prometheus/alertmanager/releases/download/v0.19.0/alertmanager-0.19.0.linux-amd64.tar.gz
    tar xf alertmanager-0.19.0.linux-amd64.tar.gz -C /usr/local
    mv /usr/local/alertmanager-0.19.0.linux-amd64  /usr/local/alertmanager
    /usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml     #启动
    
    # 配置报警文件
    cat alertmanager.yml
    global:
      resolve_timeout: 5m
    
    route:
      group_by: ['alertname']
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 1h
      receiver: 'web.hook'
    receivers:
    - name: 'web.hook'
      webhook_configs:
      - url: 'http://localhost:8060/dingtalk/webhook/send'
    

    钉钉报警

    下载dingtalk进行报警
    资料来源 https://www.codetd.com/article/6798984
    下载好之后 选择使用markdown格式的报警格式
    cat > /usr/local/gocode/src/github.com/timonwong/prometheus-webhook-dingtalk/template/default.tmpl    <<  EOF
    {{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }}
    {{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }}
    
    {{ define "__text_alert_list" }}{{ range . }}
    **Labels**
    {{ range .Labels.SortedPairs }}> - {{ .Name }}: {{ .Value | markdown | html }}
    {{ end }}
    **Annotations**
    {{ range .Annotations.SortedPairs }}> - {{ .Name }}: {{ .Value | markdown | html }}
    {{ end }}
    **Source:** [{{ .Annotations.summary }}]({{ .GeneratorURL }})
    
    {{ end }}{{ end }}
    
    {{ define "ding.link.title" }}{{ template "__subject" . }}{{ end }}
    {{ define "ding.link.content" }}#### [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] **[{{ index .GroupLabels "alertname" }}]({{ template "__alertmanagerURL" . }})**
    {{ template "__text_alert_list" .Alerts.Firing }}
    {{ end }}
    EOF
    
    #dingtalk插件 指定格式模板和钉钉接口来启动
    nohup prometheus-webhook-dingtalk --template.file="/usr/local/gocode/src/github.com/timonwong/prometheus-webhook-dingtalk/template/default.tmpl" --ding.profile="webhook=https://oapi.dingtalk.com/robot/send?access_token=64a517b7a1d0ad2dc23exxxx00fe18b0b4e15491f179456f94b6ff5"   >dingding.log 2>&1 &
    

    钉钉报警群设置

    在钉钉群自定义机器人的"安全设置"中选择"IP地址(段)",填入运行prometheus-webhook-dingtalk的服务器的公网ip即可
    

    自定义监控项pushgateway

    wget https://github.com/prometheus/pushgateway/releases/download/v0.10.0/pushgateway-0.10.0.linux-amd64.tar.gz
    tar xf pushgateway-0.10.0.linux-amd64.tar.gz  -C /usr/local
    mv /usr/local/pushgateway-0.10.0.linux-amd64  /usr/local/pushgateway
    /usr/local/pushgateway/pushgateway  #启动
    

    使用脚本来获取机器值

    cat count_netstat_wait_connections.sh
    #!/bin/bash
    instance_name=`hostname -f | cut -d'.' -f1`  #获取本机名,用于后面的标签
    label="count_netstat_wait_connections"  #定义key名
    count_netstat_wait_connections=`netstat -an | grep -i wait | wc -l`  #获取数据的命令
    echo "$label: $count_netstat_wait_connections"
    echo "$label  $count_netstat_wait_connections" | curl --data-binary @- http://localhost:9091/metrics/job/pushgateway_test/instance/$instance_name
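    #URL中的job(pushgateway_test)和instance是推送到pushgateway时附带的分组标签,instance使用的就是脚本里获取的机器名;prometheus再通过主配置文件里抓取pushgateway(9091端口)的job把这些数据拉取回去。若未设置honor_labels: true,推送的job/instance标签会显示为exported_job/exported_instance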
    

    prometheus页面查看值

    在prometheus页面查询这个变量 count_netstat_wait_connections  即可获取到值
    
  • 相关阅读:
    cin 文件结束符
    C++ 代码折叠
    QTP/UFT 11.51 发布,支持Windows8和IE10等新特性
    QTP11.5测试手机 UFT Mobile
    Ranorex 4.0.2发布,支持Firefox19
    Selenium的WebDriver API 提交 W3C 标准化
    UFT/QTP11.5新特性
    2012年总结与2013年展望
    《TestComplete自动化测试实践》培训课程
    QTP11.5(HP UFT 11.5)下载地址
  • 原文地址:https://www.cnblogs.com/sxgaofeng/p/12618175.html
Copyright © 2011-2022 走看看