zoukankan      html  css  js  c++  java
  • prometheus容器部署(prom、rule、alert、dingding)

    prometheus

    先创建好映射目录/data/prometheus,然后把配置文件放上去,包括prom、record、alert配置,然后再启动容器

    #cat prometheus.yml 
    # Defaults applied to every scrape job and to rule evaluation.
    global:
      scrape_interval: 60s
      evaluation_interval: 60s

    # Rule files loaded by the server: recording rules, then alerting rules.
    rule_files:
      - '/etc/prometheus/node-exporter-record-rules.yml'
      - '/etc/prometheus/node-exporter-alert-rules.yml'

    # Alertmanager endpoint that receives fired alerts.
    alerting:
      alertmanagers:
        - static_configs:
            - targets:
                - '192.168.18.19:9093'

    scrape_configs:
      # Prometheus scraping its own metrics endpoint.
      - job_name: prometheus
        static_configs:
          - targets:
              - 'localhost:9090'
            labels:
              instance: prometheus

      # node_exporter hosts; each target sets its instance label to the bare IP.
      - job_name: linux
        static_configs:
          - targets:
              - '10.0.23.211:9100'
            labels:
              instance: '10.0.23.211'
          - targets:
              - '10.0.23.210:9100'
            labels:
              instance: '10.0.23.210'
    

      

    # cat node-exporter-record-rules.yml
    groups:
      - name: linux
        rules:
        # Scrape health of the node_exporter targets only. The static
        # job="linux" label below overwrites the source job label, so an
        # unfiltered `up` would also record other jobs' targets as linux nodes.
        - record: node_exporter:up
          expr: up{job="linux"}
          labels:
            desc: "节点是否在线, 在线1,不在线0"
            unit: " "
            job: "linux"
        # Uptime in seconds: evaluation time minus the reported boot timestamp.
        - record: node_exporter:node_uptime
          expr: time() - node_boot_time_seconds{job="linux"}
          labels:
            desc: "节点的运行时间"
            unit: "s"
            job: "linux"
    ##############################################################################################
    #                              cpu                                                           #
        # CPU usage per instance as percentages, derived from the per-mode irate
        # of node_cpu_seconds_total averaged across CPUs.

        # Total usage is everything that is not idle.
        - record: node_exporter:cpu:total:percent
          expr: (1 - avg by (environment,instance) (irate(node_cpu_seconds_total{job="linux",mode="idle"}[5m])))  * 100
          labels:
            desc: "节点的cpu总消耗百分比"
            unit: "%"
            job: "linux"

        - record: node_exporter:cpu:idle:percent
          expr: (avg by (environment,instance) (irate(node_cpu_seconds_total{job="linux",mode="idle"}[5m])))  * 100
          labels:
            desc: "节点的cpu idle百分比"
            unit: "%"
            job: "linux"

        - record: node_exporter:cpu:iowait:percent
          expr: (avg by (environment,instance) (irate(node_cpu_seconds_total{job="linux",mode="iowait"}[5m])))  * 100
          labels:
            desc: "节点的cpu iowait百分比"
            unit: "%"
            job: "linux"

        - record: node_exporter:cpu:system:percent
          expr: (avg by (environment,instance) (irate(node_cpu_seconds_total{job="linux",mode="system"}[5m])))  * 100
          labels:
            desc: "节点的cpu system百分比"
            unit: "%"
            job: "linux"

        - record: node_exporter:cpu:user:percent
          expr: (avg by (environment,instance) (irate(node_cpu_seconds_total{job="linux",mode="user"}[5m])))  * 100
          labels:
            desc: "节点的cpu user百分比"
            unit: "%"
            job: "linux"

        # Four modes are combined here, so average across CPUs within each mode
        # first and then add the modes up. A single avg over every (cpu, mode)
        # series would report only a quarter of the combined share.
        - record: node_exporter:cpu:other:percent
          expr: sum by (environment,instance) (avg by (environment,instance,mode) (irate(node_cpu_seconds_total{job="linux",mode=~"softirq|nice|irq|steal"}[5m]))) * 100
          labels:
            desc: "节点的cpu 其他的百分比"
            unit: "%"
            job: "linux"
    ##############################################################################################
    
    
    ##############################################################################################
    #                                    memory                                                  #
        # Memory recordings for the linux job, built from the node_memory_*
        # gauges. Absolute values are labelled unit "byte", ratios "%".

        - record: node_exporter:memory:total
          expr: node_memory_MemTotal_bytes{job="linux"}
          labels:
            desc: "节点的内存总量"
            unit: "byte"
            job: "linux"

        - record: node_exporter:memory:free
          expr: node_memory_MemFree_bytes{job="linux"}
          labels:
            desc: "节点的剩余内存量"
            unit: "byte"
            job: "linux"

        # MemTotal minus MemFree.
        - record: node_exporter:memory:used
          expr: node_memory_MemTotal_bytes{job="linux"} - node_memory_MemFree_bytes{job="linux"}
          labels:
            desc: "节点的已使用内存量"
            unit: "byte"
            job: "linux"

        # MemTotal minus MemAvailable.
        - record: node_exporter:memory:actualused
          expr: node_memory_MemTotal_bytes{job="linux"} - node_memory_MemAvailable_bytes{job="linux"}
          labels:
            desc: "节点用户实际使用的内存量"
            unit: "byte"
            job: "linux"

        # Share of MemTotal that is not MemAvailable.
        - record: node_exporter:memory:used:percent
          expr: (1-(node_memory_MemAvailable_bytes{job="linux"} / (node_memory_MemTotal_bytes{job="linux"})))* 100
          labels:
            desc: "节点的内存使用百分比"
            unit: "%"
            job: "linux"

        # Share of MemTotal that is MemAvailable.
        - record: node_exporter:memory:free:percent
          expr: ((node_memory_MemAvailable_bytes{job="linux"} / (node_memory_MemTotal_bytes{job="linux"})))* 100
          labels:
            desc: "节点的内存剩余百分比"
            unit: "%"
            job: "linux"
    ##############################################################################################
    #                                   load                                                     #
        # Load averages, aggregated so that only the instance label is kept
        # before the static labels below are attached.

        - record: node_exporter:load:load1
          expr: sum by (instance) (node_load1{job="linux"})
          labels:
            desc: "系统1分钟负载"
            unit: " "
            job: "linux"

        - record: node_exporter:load:load5
          expr: sum by (instance) (node_load5{job="linux"})
          labels:
            desc: "系统5分钟负载"
            unit: " "
            job: "linux"

        - record: node_exporter:load:load15
          expr: sum by (instance) (node_load15{job="linux"})
          labels:
            desc: "系统15分钟负载"
            unit: " "
            job: "linux"
       
    ##############################################################################################
    #                                 disk                                                       #
        # Filesystem capacity (ext4/xfs only) and block-device I/O rates.

        - record: node_exporter:disk:usage:total
          expr: node_filesystem_size_bytes{job="linux" ,fstype=~"ext4|xfs"}
          labels:
            desc: "节点的磁盘总量"
            unit: "byte"
            job: "linux"

        - record: node_exporter:disk:usage:free
          expr: node_filesystem_avail_bytes{job="linux",fstype=~"ext4|xfs"}
          labels:
            desc: "节点的磁盘剩余空间"
            unit: "byte"
            job: "linux"

        # Size minus the avail gauge.
        - record: node_exporter:disk:usage:used
          expr: node_filesystem_size_bytes{job="linux",fstype=~"ext4|xfs"} - node_filesystem_avail_bytes{job="linux",fstype=~"ext4|xfs"}
          labels:
            desc: "节点的磁盘使用的空间"
            unit: "byte"
            job: "linux"

        - record: node_exporter:disk:used:percent
          expr: (1 - node_filesystem_avail_bytes{job="linux",fstype=~"ext4|xfs"} / node_filesystem_size_bytes{job="linux",fstype=~"ext4|xfs"}) * 100
          labels:
            desc: "节点的磁盘的使用百分比"
            unit: "%"
            job: "linux"

        # Completed read operations per second.
        - record: node_exporter:disk:read:count:rate
          expr: irate(node_disk_reads_completed_total{job="linux"}[1m])
          labels:
            desc: "节点的磁盘读取速率"
            unit: "次/秒"
            job: "linux"

        # Completed write operations per second.
        - record: node_exporter:disk:write:count:rate
          expr: irate(node_disk_writes_completed_total{job="linux"}[1m])
          labels:
            desc: "节点的磁盘写入速率"
            unit: "次/秒"
            job: "linux"

        # Read throughput in MB/s. The source counter must be the *read* bytes
        # counter; it was previously swapped with the write counter.
        - record: node_exporter:disk:read:mb:rate
          expr: (irate(node_disk_read_bytes_total{job="linux"}[1m]))/1024/1024
          labels:
            desc: "节点的设备读取MB速率"
            unit: "MB/s"
            job: "linux"

        # Write throughput in MB/s, taken from the *written* bytes counter.
        - record: node_exporter:disk:write:mb:rate
          expr: (irate(node_disk_written_bytes_total{job="linux"}[1m]))/1024/1024
          labels:
            desc: "节点的设备写入MB速率"
            unit: "MB/s"
            job: "linux"
    
    ##############################################################################################
    #                                filesystem                                                  #
        # Inode usage: 1 - files_free / files, as a percentage. The desc label
        # now says "used" to match the expression and the record name (it
        # previously described the remaining share).
        - record: node_exporter:filesystem:used:percent
          expr: (1 -node_filesystem_files_free{job="linux",fstype=~"ext4|xfs"} / node_filesystem_files{job="linux",fstype=~"ext4|xfs"}) * 100
          labels:
            desc: "节点的inode的使用百分比"
            unit: "%"
            job: "linux"
    #############################################################################################
    #                                filefd                                                     #
        # Allocated file descriptors as an absolute count; the unit label is a
        # count ("个"), not a percentage.
        - record: node_exporter:filefd_allocated:count
          expr: node_filefd_allocated{job="linux"}
          labels:
            desc: "节点的文件描述符打开个数"
            unit: "个"
            job: "linux"

        # Allocated file descriptors relative to the reported maximum.
        - record: node_exporter:filefd_allocated:percent
          expr: node_filefd_allocated{job="linux"}/node_filefd_maximum{job="linux"} * 100
          labels:
            desc: "节点的文件描述符打开百分比"
            unit: "%"
            job: "linux"
    
    #############################################################################################
    #                                network                                                    #
        # Per-interface traffic for eth0/eth1/ens33/ens37.

        # The source counters are in bytes; multiply by 8 so that the recorded
        # value matches the "bit" record name and the bit/s unit label.
        - record: node_exporter:network:netin:bit:rate
          expr: avg by (environment,instance,device) (irate(node_network_receive_bytes_total{device=~"eth0|eth1|ens33|ens37"}[1m])) * 8
          labels:
            desc: "节点网卡每秒接收的比特数"
            unit: "bit/s"
            job: "linux"

        - record: node_exporter:network:netout:bit:rate
          expr: avg by (environment,instance,device) (irate(node_network_transmit_bytes_total{device=~"eth0|eth1|ens33|ens37"}[1m])) * 8
          labels:
            desc: "节点网卡每秒发送的比特数"
            unit: "bit/s"
            job: "linux"

        # Packets per second.
        - record: node_exporter:network:netin:packet:rate
          expr: avg by (environment,instance,device) (irate(node_network_receive_packets_total{device=~"eth0|eth1|ens33|ens37"}[1m]))
          labels:
            desc: "节点网卡每秒接收的数据包个数"
            unit: "个/秒"
            job: "linux"

        - record: node_exporter:network:netout:packet:rate
          expr: avg by (environment,instance,device) (irate(node_network_transmit_packets_total{device=~"eth0|eth1|ens33|ens37"}[1m]))
          labels:
            desc: "节点网卡发送的数据包个数"
            unit: "个/秒"
            job: "linux"

        # Error packets per second.
        - record: node_exporter:network:netin:error:rate
          expr: avg by (environment,instance,device) (irate(node_network_receive_errs_total{device=~"eth0|eth1|ens33|ens37"}[1m]))
          labels:
            desc: "节点设备驱动器检测到的接收错误包的数量"
            unit: "个/秒"
            job: "linux"

        - record: node_exporter:network:netout:error:rate
          expr: avg by (environment,instance,device) (irate(node_network_transmit_errs_total{device=~"eth0|eth1|ens33|ens37"}[1m]))
          labels:
            desc: "节点设备驱动器检测到的发送错误包的数量"
            unit: "个/秒"
            job: "linux"
          
        # TCP connection counts taken from node_tcp_connection_states.

        # Connections in state "established".
        - record: node_exporter:network:tcp:established:count
          expr: node_tcp_connection_states{job="linux", state="established"}
          labels:
            desc: "节点当前established的个数"
            unit: "个"
            job: "linux"

        # Connections in state "time_wait".
        - record: node_exporter:network:tcp:timewait:count
          expr: node_tcp_connection_states{job="linux", state="time_wait"}
          labels:
            desc: "节点timewait的连接数"
            unit: "个"
            job: "linux"

        # All states summed per environment/instance.
        - record: node_exporter:network:tcp:total:count
          expr: sum by (environment,instance) (node_tcp_connection_states{job="linux"})
          labels:
            desc: "节点tcp连接总数"
            unit: "个"
            job: "linux"
       
    #############################################################################################
    #                                process                                                    #
        # Processes in state "Z" (zombie). The record name keeps its historical
        # "zoom" spelling so existing queries continue to resolve; only the
        # human-readable desc label is corrected.
        - record: node_exporter:process:zoom:total:count
          expr: node_processes_state{state="Z"}
          labels:
            desc: "节点当前状态为zombie(僵尸)的进程个数"
            unit: "个"
            job: "linux"
    #############################################################################################
    #                                other                                                    #
        # Absolute value of the reported clock offset.
        - record: node_exporter:time:offset
          expr: abs(node_timex_offset_seconds{job="linux"})
          labels:
            desc: "节点的时间偏差"
            unit: "s"
            job: "linux"

        # Number of distinct cpu label values per instance. This rule attaches
        # no static labels, unlike the other rules in the group.
        - record: node_exporter:cpu:count
          expr: count by (instance) ( count by (instance,cpu) (node_cpu_seconds_total{ mode='system'}) )
    #
    

      

    # cat node-exporter-alert-rules.yml 
    groups:
      - name: node-exporter-alert
        rules:
        # Fires once the recorded up value has been 0 for one minute.
        - alert: node-exporter-down
          expr: node_exporter:up == 0
          for: 1m
          labels:
            severity: info
          annotations:
            summary: "instance: {{ $labels.instance }} 宕机了"
            # The line break inside the message is written as an explicit \n
            # escape; splitting the quoted string across physical lines at a
            # shallower indent is not valid YAML.
            description: "instance: {{ $labels.instance }} \n- job: {{ $labels.job }} 关机了, 时间已经1分钟了。"
            value: "{{ $value }}"
            instance: "{{ $labels.instance }}"
    
    
    
        # Threshold alerts on the recorded node_exporter:* series. Each one
        # carries severity "info" and exposes the firing value and instance as
        # annotations.

        # Total CPU usage above 80% for 3 minutes.
        - alert: node-exporter-cpu-high
          expr: node_exporter:cpu:total:percent > 80
          for: 3m
          labels:
            severity: info
          annotations:
            summary: 'instance: {{ $labels.instance }} cpu 使用率高于 {{ $value }}'
            description: ''
            value: '{{ $value }}'
            instance: '{{ $labels.instance }}'

        # CPU iowait at or above 12% for 3 minutes.
        - alert: node-exporter-cpu-iowait-high
          expr: node_exporter:cpu:iowait:percent >= 12
          for: 3m
          labels:
            severity: info
          annotations:
            summary: 'instance: {{ $labels.instance }} cpu iowait 使用率高于 {{ $value }}'
            description: ''
            value: '{{ $value }}'
            instance: '{{ $labels.instance }}'

        # Memory usage above 80% for 3 minutes.
        - alert: node-exporter-memory-high
          expr: node_exporter:memory:used:percent > 80
          for: 3m
          labels:
            severity: info
          annotations:
            summary: 'instance: {{ $labels.instance }} memory 使用率高于 {{ $value }}'
            description: ''
            value: '{{ $value }}'
            instance: '{{ $labels.instance }}'

        # Disk usage above 80% for 10 minutes.
        - alert: node-exporter-disk-high
          expr: node_exporter:disk:used:percent > 80
          for: 10m
          labels:
            severity: info
          annotations:
            summary: 'instance: {{ $labels.instance }} disk 使用率高于 {{ $value }}'
            description: ''
            value: '{{ $value }}'
            instance: '{{ $labels.instance }}'

        # Inode usage above 80% for 10 minutes.
        - alert: node-exporter-inode-high
          expr: node_exporter:filesystem:used:percent > 80
          for: 10m
          labels:
            severity: info
          annotations:
            summary: 'instance: {{ $labels.instance }} inode 使用率高于 {{ $value }}'
            description: ''
            value: '{{ $value }}'
            instance: '{{ $labels.instance }}'

        # Allocated file descriptors above 80% of the maximum for 10 minutes.
        - alert: node-exporter-filefd-allocated-percent-high
          expr: node_exporter:filefd_allocated:percent > 80
          for: 10m
          labels:
            severity: info
          annotations:
            summary: 'instance: {{ $labels.instance }} 打开文件描述符 高于 {{ $value }}'
            description: ''
            value: '{{ $value }}'
            instance: '{{ $labels.instance }}'
    

      

    # Start Prometheus with the config directory and the TSDB data directory
    # mounted from the host.
    #   --web.enable-admin-api  enables the admin HTTP API, which includes
    #                           operations such as deleting time series.
    #   --web.enable-lifecycle  enables hot reload through an HTTP POST to
    #                           localhost:9090/-/reload.
    # Every line but the last ends with a backslash so the shell reads a single
    # command; comments cannot follow a continuation, so they live up here.
    docker run -d -p 9090:9090 \
      -v /data/prometheus:/etc/prometheus/ \
      -v /data/promdata:/prometheus \
      -v /etc/localtime:/etc/localtime:ro \
      -v /etc/timezone:/etc/timezone:ro \
      prom/prometheus \
      --config.file=/etc/prometheus/prometheus.yml \
      --storage.tsdb.path=/prometheus \
      --web.console.libraries=/usr/share/prometheus/console_libraries \
      --web.console.templates=/usr/share/prometheus/consoles \
      --web.enable-admin-api \
      --web.enable-lifecycle
    

      

    下载node_exporter后放入/usr/local/bin/

    # Use the absolute path so the command works from any working directory.
    chmod +x /usr/local/bin/node_exporter

    # Dedicated system group and user for the exporter; the user gets the
    # /sbin/nologin shell and no home directory (-M).
    groupadd -r prometheus
    useradd -r -g prometheus -s /sbin/nologin -M -c "prometheus Daemons" prometheus

    # Write the unit file. The quoted delimiter keeps the body verbatim (no
    # shell expansion).
    cat << 'EOF' > /usr/lib/systemd/system/node_exporter.service
    [Unit]
    Description=node_exporter
    After=network.target

    [Service]
    User=prometheus
    Group=prometheus
    ExecStart=/usr/local/bin/node_exporter

    [Install]
    WantedBy=multi-user.target
    EOF

    # Make systemd pick up the new unit file before it is started.
    systemctl daemon-reload
    systemctl start node_exporter
    systemctl status node_exporter
    systemctl enable node_exporter
    

      

    alertmanager

    # Start Alertmanager with the host clock/timezone files mounted read-only.
    # /etc/timezone is mounted exactly once (Docker rejects a duplicate mount
    # point), and the lines are joined with backslashes into a single command.
    docker run -d -p 9093:9093 \
      -v /etc/localtime:/etc/localtime:ro \
      -v /etc/timezone:/etc/timezone:ro \
      prom/alertmanager
    

    alertmanager没有映射配置文件,需要直接进入容器修改配置后重启容器

    报警接收者走的是钉钉的webhook

    cat /etc/alertmanager/alertmanager.yml
    global:
      resolve_timeout: 5m

    # Root route: alerts are grouped by alertname and sent to the webhook
    # receiver.
    route:
      receiver: webhook
      group_by:
        - alertname
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 4h
      routes:
        # Child route for alerts labelled team=node: same receiver, shorter
        # group_wait.
        - receiver: webhook
          group_wait: 10s
          match:
            team: node

    receivers:
      - name: webhook
        webhook_configs:
          - url: 'http://192.168.18.19:8060/dingtalk/webhook1/send'
            send_resolved: true
    

      

    prometheus-webhook-dingtalk

    启动时带上钉钉机器人提供的access_token

    # Start the DingTalk webhook bridge on port 8060. The profile name
    # "webhook1" is the one used in the Alertmanager webhook URL path
    # (/dingtalk/webhook1/send); access_token is the robot's token.
    docker run -d --restart always -p 8060:8060 \
      -v /etc/localtime:/etc/localtime:ro \
      -v /etc/timezone:/etc/timezone:ro \
      timonwong/prometheus-webhook-dingtalk \
      --ding.profile="webhook1=https://oapi.dingtalk.com/robot/send?access_token=xxx"
    

      

  • 相关阅读:
    linux(ubuntu) 系统修改/etc/fstab文件后无法进入系统的解决方法-摘录
    linux实现实时同步服务
    linux利用网易邮箱发送邮件
    企业数据库备份方案——mysqldump完全备份+binlog增量备份
    Nginx下隐藏index.php
    linux日志详解-摘录
    expect免交互用法
    删除超过多少天的日志文件或者备份文件
    Python_结合Re正则模块爬虫
    Jmeter性能分析
  • 原文地址:https://www.cnblogs.com/jabbok/p/12594195.html
Copyright © 2011-2022 走看看