zoukankan      html  css  js  c++  java
  • prometheus 使用 ipmi exporter 增加硬件级别监控

    prometheus 监控硬件

    安装ipmitool 并加载相应模块

    # Install the ipmitool and freeipmi packages.
    yum install -y ipmitool freeipmi
    # Load each IPMI kernel module in turn.
    for mod in ipmi_msghandler ipmi_devintf ipmi_poweroff ipmi_si ipmi_watchdog; do
        modprobe "$mod"
    done
    

    下载 ipmi_exporter 二进制发布包

    # Download the v1.0.0 linux-amd64 release archive.
    wget https://github.com/soundcloud/ipmi_exporter/releases/download/v1.0.0/ipmi_exporter-v1.0.0.linux-amd64.tar.gz
    # Unpack it under /opt/ and switch into the unpacked directory.
    tar --extract --file=ipmi_exporter-v1.0.0.linux-amd64.tar.gz --directory=/opt/
    cd /opt/ipmi_exporter-v1.0.0.linux-amd64/
    

    增加配置文件

    cat ipmi_remote.yml
    modules:
      10.193.x.x:                  # IP address of the remote management card
        user: "root"               # login user of the remote management card
        pass: "xxxxxxxxxxxxx"      # password of the remote management card
        # Available collectors are bmc, ipmi, chassis, and dcmi
        collectors:
          - bmc
          - ipmi
          - dcmi
          - chassis
        # Sensor IDs listed here are excluded; add any sensors you don't care about.
        exclude_sensor_ids:
          - 2
          - 29
          - 32
    
    

    启动ipmi_exporter

    # Start the exporter in the background, listening on port 19293.
    # The config path points into /opt/, where the tarball was unpacked by the
    # earlier tar step; the previous /usr/local/ path does not exist in this setup.
    ./ipmi_exporter --config.file=/opt/ipmi_exporter-v1.0.0.linux-amd64/ipmi_remote.yml --web.listen-address=:19293 &
    

    增加prometheus server job 配置

    # Rule files added for the ipmi exporter alerts
      - rules/Memory_hardware.yml
      - rules/power.yml
      - rules/fan.yml
      - rules/processor.yml
      - rules/harddisk.yml
    
    # Add the scrape job to the main configuration file
    # cat /usr/local/prometheus/prometheus.yml
      - job_name: ipmi_exporter
        file_sd_configs:
          - files:
              - ./conf.d/ipmi_exporter.json
            refresh_interval: 5s
    #cat  /usr/local/prometheus/conf.d/ipmi_exporter.json 
    [
      {
        "targets": ["10.65.x.x:19293"],
        "labels": {
          "hostname": "lgy-storage-glusterxxx"
        }
      }
    ]
    
    

    增加rules 配置文件

    # cd /usr/local/prometheus/rules
    # cat Memory_hardware.yml  (memory module monitoring)
    groups:
    - name: Memory_hardware
      rules:
      # Fires once a Memory sensor has been non-nominal for 3 minutes.
      - alert: Memory_hardware error
        # Per the ipmi_exporter metrics documentation the state value is
        # 0 = nominal, 1 = warning, 2 = critical. ">= 1" matches warning and
        # critical; the previous "== 1" never fired for a critical sensor.
        expr: ipmi_sensor_state{type="Memory"} >= 1
        for: 3m
        labels:
          user: caizh
        annotations:
          summary: "Instance {{ $labels.instance }} 内存硬件警告"
          description: "{{ $labels.instance }} of job {{$labels.job}} 内存硬件警告,当前状态[{{ $value }}]."
    
    
    
    # cat power.yml (server power supply module monitoring)
    groups:
    - name: power status
      rules:
      # Fires once a Power Supply "Status" sensor has been non-nominal for 3 minutes.
      - alert: power bad
        # Per the ipmi_exporter metrics documentation the state value is
        # 0 = nominal, 1 = warning, 2 = critical. ">= 1" matches warning and
        # critical; the previous "== 1" never fired for a critical sensor.
        expr: ipmi_sensor_state{name="Status",type="Power Supply"} >= 1
        for: 3m
        labels:
          user: caizh
        annotations:
          summary: "Instance {{ $labels.instance }} 电源坏了"
          description: "{{ $labels.instance }} of job {{$labels.job}} 电源坏了,当前状态[{{ $value }}]."
    
    
    #  cat fan.yml  (server fan monitoring)
    groups:
    - name: fan status
      rules:
      # Fires once a fan speed sensor has been non-nominal for 3 minutes.
      - alert: speed fan bad
        # Per the ipmi_exporter metrics documentation the state value is
        # 0 = nominal, 1 = warning, 2 = critical. ">= 1" matches warning and
        # critical; the previous "== 1" never fired for a critical sensor.
        expr: ipmi_fan_speed_state{} >= 1
        for: 3m
        labels:
          user: caizh
        annotations:
          summary: "Instance {{ $labels.instance }} 风扇坏了"
          description: "{{ $labels.instance }} of job {{$labels.job}} 风扇坏了,当前状态[{{ $value }}]."
    
    
    # cat processor.yml (server processor monitoring)
    groups:
    - name: Processor
      rules:
      # Fires once a Processor "Status" sensor has been non-nominal for 3 minutes.
      - alert: Processor hardware error
        # Per the ipmi_exporter metrics documentation the state value is
        # 0 = nominal, 1 = warning, 2 = critical. ">= 1" matches warning and
        # critical; the previous "== 1" never fired for a critical sensor.
        expr: ipmi_sensor_state{name="Status",type="Processor"} >= 1
        for: 3m
        labels:
          user: caizh
        annotations:
          summary: "Instance {{ $labels.instance }} 处理器硬件警告"
          # Added so this rule carries the same annotations as the other rule files.
          description: "{{ $labels.instance }} of job {{$labels.job}} 处理器硬件警告,当前状态[{{ $value }}]."
    
    
    #  cat harddisk.yml (hard disk monitoring, mainly of RAID groups; the system disk and data disks are in separate RAID groups, so two series are reported)
    groups:
    - name: harddisk
      rules:
      # Fires once a Drive Slot sensor has been non-nominal for 3 minutes.
      - alert: hard disk bad
        # Per the ipmi_exporter metrics documentation the state value is
        # 0 = nominal, 1 = warning, 2 = critical. ">= 1" matches warning and
        # critical; the previous "== 1" never fired for a critical sensor.
        expr: ipmi_sensor_state{type="Drive Slot"} >= 1
        for: 3m
        labels:
          user: caizh
        annotations:
          summary: "Instance {{ $labels.instance }} 硬盘坏了"
          description: "{{ $labels.instance }} of job {{$labels.job}} 硬盘坏了,当前状态[{{ $value }}]."
    
    
    
    
  • 相关阅读:
    星月教你做网站(HTML5+CSS+JS)----html背景知识 HTML5环境安装 创建HTML5文档1(结构)
    带你学C带你飞--16拾遗 17数组
    神经网络
    交互项回归与分组回归有什么差异?
    逻辑回归输出的值是真实的概率吗?
    逻辑回归
    机器学习中的核函数与核方法
    Regularization for Logistic Regression: L1, L2, Gauss or Laplace?
    What is Wrong with Linear Regression for Classification?What is Wrong with Linear Regression for Classification?
    感知器算法
  • 原文地址:https://www.cnblogs.com/lixinliang/p/15019679.html
Copyright © 2011-2022 走看看