zoukankan      html  css  js  c++  java
  • nagios系列(六)之nagios实现对服务器cpu温度的监控

    1、安装硬件传感器监控软件sensors
    yum install -y lm_sensors*


    2、运行sensors-detect进行传感器检测
    ##一路回车即可


    Do you want to overwrite /etc/sysconfig/lm_sensors? (YES/no): 
    Starting lm_sensors: loading module coretemp               [  OK  ]
    Unloading i2c-dev... OK


    3、运行sensors查看能否读取数据,输出格式类似下面这样(示例中的ERROR和+0.0°C说明没有读到实际温度,虚拟机上会出现这种情况;物理机上应显示真实温度)
    # sensors
    coretemp-isa-0000
    Adapter: ISA adapter
    ERROR: Can't get value of subfeature temp1_input: Can't read
    Physical id 0:  +0.0°C  (high = +100.0°C, crit = +100.0°C)  
    ERROR: Can't get value of subfeature temp2_input: Can't read
    Core 0:         +0.0°C  (high = +100.0°C, crit = +100.0°C)  
    ERROR: Can't get value of subfeature temp3_input: Can't read
    Core 1:         +0.0°C  (high = +100.0°C, crit = +100.0°C)  


    coretemp-isa-0002
    Adapter: ISA adapter
    ERROR: Can't get value of subfeature temp1_input: Can't read
    Physical id 1:  +0.0°C  (high = +100.0°C, crit = +100.0°C)  
    ERROR: Can't get value of subfeature temp2_input: Can't read
    Core 0:         +0.0°C  (high = +100.0°C, crit = +100.0°C)  
    ERROR: Can't get value of subfeature temp3_input: Can't read
    Core 1:         +0.0°C  (high = +100.0°C, crit = +100.0°C)  


    4、添加监控脚本vim /usr/local/nagios/libexec/check_cputemp


    #!/bin/sh
    #########check_cputemp###########
    #date : May 2013
    #Licence GPLv2
    #by Barlow
    #/usr/local/nagios/libexec/check_cputemp
    #you can use NRPE to define service in nagios
    #check_nrpe!check_cputemp
    # Exit statuses reported back to Nagios by this plugin.
    # STATE_UNKNOWN is defined for completeness; the error paths of this
    # script exit with a literal 3.
    STATE_OK=0 STATE_WARNING=1
    STATE_CRITICAL=2 STATE_UNKNOWN=3
    # Print the one-line usage hint that points the user at -h.
    # Uses the echo command string held in $Echo and the script name in $0.
    print_help_msg() {
      $Echo "Usage: ${0} -h to get help."
    }
    # Print the complete usage text on stdout: synopsis, a note on the -m
    # method, the rule relating the two thresholds, and an example call.
    # Uses the echo command string held in $Echo and the script name in $0.
    print_full_help_msg() {
      $Echo "Usage:"
      $Echo "$0 [ -v ] -m sensors -w cpuT -c cpuT"
      # Wording fixed: the original read "Sepicify the method to use ..."
      $Echo "Specify the method used to read the temperature data: sensors."
      $Echo "And the corresponding Critical value must be greater than the Warning value."
      $Echo "Example:"
      $Echo "${0} -m sensors -w 40 -c 50"
    }
    # Emit a generic "Error." line and follow it with the full usage text.
    # Uses the echo command string held in $Echo.
    print_err_msg() {
      $Echo "Error."; print_full_help_msg
    }
    # Append the arguments, joined into one line, to a per-process log file
    # (/var/log/check_sys_temperature.log.<PID>) when verbose mode is on.
    # Does nothing and returns 0 unless $Debug is exactly "true".
    to_debug() {
      [ "$Debug" = "true" ] || return 0
      $Echo "$*" >> /var/log/check_sys_temperature.log.$$ 2>&1
    }
    # ---- Command-line handling -------------------------------------------
    # Sets Echo, Debug (-v), method (-m), WARNING (-w), CRITICAL (-c) and
    # use_sensors. Every usage error prints a message and exits with status 3.
    #
    # LANG is dropped before any external tool runs (presumably so that tool
    # output is not localized -- TODO confirm).
    unset LANG
    # Plain echo on purpose: "-e" is not portable under #!/bin/sh (dash prints
    # it literally) and none of the messages contain backslash escapes.
    Echo="echo"
    if [ $# -lt 1 ]; then
      print_help_msg
      exit 3
    else
      # The leading ':' puts getopts in silent mode: a missing option argument
      # is reported as ':' and an unknown option as '?'; OPTARG then holds the
      # offending option letter.
      while getopts :vhm:w:c: OPTION; do
        case $OPTION in
          v)
            Debug=true
            ;;
          m)
            method=$OPTARG
            ;;
          w)
            WARNING=$OPTARG
            ;;
          c)
            CRITICAL=$OPTARG
            ;;
          h)
            print_full_help_msg
            exit 3
            ;;
          :)
            $Echo "Error: Option -$OPTARG requires an argument."
            print_help_msg
            exit 3
            ;;
          *)
            $Echo "Error: Illegal Option."
            print_help_msg
            exit 3
            ;;
        esac
      done
      if [ "$method" = "sensors" ]; then
        use_sensors="true"
        to_debug use_sensors
      else
        $Echo "Error. Must specify the method to use sensors."
        print_full_help_msg
        exit 3
      fi
      # Both thresholds are compared numerically later on. A missing or
      # non-numeric value would make that comparison fail and the check would
      # report OK, so reject it here instead.
      for threshold in "$WARNING" "$CRITICAL"; do
        case $threshold in
          '' | *[!0-9]*)
            $Echo "Error: -w and -c must both be given as non-negative integers."
            print_help_msg
            exit 3
            ;;
        esac
      done
      to_debug "All Values are  Warning: $WARNING and Critical: $CRITICAL ."
    fi
    #########lm_sensors##################
    # Reads the CPU temperature through lm_sensors and stores the integer
    # average of all per-core readings in TEMP. Exits with status 3 when the
    # sensors command is missing or prints no "Core N:" line.
    if [ "$use_sensors" = "true" ]; then
      if ! sensorsCheckOut=$(command -v sensors); then
        echo "sensors command not found in PATH."
        echo Maybe you need to check your sensors.
        exit 3
      fi
      to_debug Use "$sensorsCheckOut" to check system temperature
      # Run sensors once and average every "Core N:" line, however many cores
      # there are. Field 3 looks like "+45.0°C"; adding 0 keeps its leading
      # numeric part and int() drops the fraction. Package lines such as
      # "Physical id 0:" and "ERROR:" lines are skipped because they do not
      # start with "Core". Nothing is printed when no core line was found, so
      # the empty check below can report it.
      TEMP=$(sensors | awk '
        /^Core [0-9]+:/ {
          total += int($3 + 0)
          cores++
        }
        END {
          if (cores > 0) print int(total / cores)
        }
      ')
      if [ -z "$TEMP" ]; then
        $Echo "No Data been get here. Please confirm your ARGS and re-check it with Verbose mode, then to check the log."
        exit 3
      fi
      to_debug temperature data is "$TEMP"
    else
      $Echo "Error. Must specify the method to use sensors"
      print_full_help_msg
      exit 3
    fi
    ######### Compare the reading with the user-supplied thresholds ############
    # CRITICAL is tested before WARNING. A threshold of "0" disables its level.
    # "-gt" is used, so a reading equal to the threshold does not raise it.
    CPU_TEMP=$TEMP
    STATE="$STATE_OK"
    STATE_MESSAGE="OK"
    if [ "$CPU_TEMP" -gt "$CRITICAL" ] && [ "$CRITICAL" != "0" ]; then
      STATE="$STATE_CRITICAL"
      STATE_MESSAGE="CRITICAL"
    elif [ "$CPU_TEMP" -gt "$WARNING" ] && [ "$WARNING" != "0" ]; then
      STATE="$STATE_WARNING"
      STATE_MESSAGE="WARNING"
    fi
    to_debug "$STATE , Message is $STATE_MESSAGE"
    # The output must carry performance data, i.e. the part after the "|"
    # separator, and the data unit must not contain Chinese characters,
    # otherwise graphing tools such as PNP cannot plot it.
    # NOTE(review): the perfdata label below is still non-ASCII -- confirm
    # the graphing setup accepts it.
    echo "The TEMPERATURE $STATE_MESSAGE - The CPU's Temperature is $CPU_TEMP ℃ ! | 温度=${CPU_TEMP}Celsius;$WARNING;$CRITICAL"
    exit $STATE


    5、赋予脚本执行权限:
    chmod +x /usr/local/nagios/libexec/check_cputemp


    6、编辑/usr/local/nagios/etc/nrpe.cfg,添加如下一行(可用vim手动添加,或直接执行下面的命令追加):
    echo "command[check_cputemp]=/usr/local/nagios/libexec/check_cputemp -m sensors -w 38 -c 45" >>/usr/local/nagios/etc/nrpe.cfg


    重新启动客户端nrpe服务
    -w 表示警告值,-c表示关键(紧急)值,自行根据实际情况调整
    注意:以上六步均在被监控机上完成。


    在客户端测试是否ok,虚拟机测试不成功,需要在物理机上实现
    # /usr/local/nagios/libexec/check_cputemp -m sensors -w 38 -c 45
    The TEMPERATURE OK - The CPU's Temperature is 14 ℃ ! | 温度=14Celsius;38;45
    服务端执行测试:
    /usr/local/nagios/libexec/check_nrpe -H 192.168.8.93 -c check_cputemp

    7、在Nagios服务端配置服务:
    define service{
    use             generic-service
    host_name 需要被监控的hostname
    service_description CPU Temperature
    check_command check_nrpe!check_cputemp
    }

    保存后重启nagios服务


  • 相关阅读:
    sizzle编译函数
    人人都是 DBA(XII)查询信息收集脚本汇编
    人人都是 DBA(XI)I/O 信息收集脚本汇编
    人人都是 DBA(X)资源信息收集脚本汇编
    人人都是 DBA(IX)服务器信息收集脚本汇编
    人人都是 DBA(VIII)SQL Server 页存储结构
    人人都是 DBA(VII)B 树和 B+ 树
    人人都是 DBA(VI)SQL Server 事务日志
    人人都是 DBA(V)SQL Server 数据库文件
    人人都是 DBA(IV)SQL Server 内存管理
  • 原文地址:https://www.cnblogs.com/reblue520/p/6239760.html
Copyright © 2011-2022 走看看