zoukankan      html  css  js  c++  java
  • 如来神掌第二式第三招----Shell应用案例之主机监控

    ###############################################################################
    # Name : Mahavairocana                                                                                                                                           
    # Author : Mahavairocana                                                                                                                                         
    # QQ : 10353512                                                                                                                                                    
    # WeChat : shenlan-qianlan                                                                                                                                      
    # Blog : http://www.cnblogs.com/Mahavairocana/                                                                                                       
    # Description : You are welcome to reprint, or hyperlinks to indicate the                                                                        
    #                    source of the article, as well as author information.
    ###############################################################################

    1、菜鸟版

    1、查看主机网卡流量
    
    #!/bin/bash
    # network: print the current inbound/outbound rate of eth0 in an endless loop.
    # Assumes the legacy net-tools ifconfig layout in which line 8 reads
    # "RX bytes:<n> (...)  TX bytes:<n> (...)" -- TODO confirm on the target host.
    while :; do
      time=$(date '+%m-%d %k:%M')
      # Field 2 is "bytes:<rx>", field 6 is "bytes:<tx>"; cut drops the "bytes:" prefix.
      rx_before=$(ifconfig eth0 | sed -n '8p' | awk '{print $2}' | cut -c7-)
      tx_before=$(ifconfig eth0 | sed -n '8p' | awk '{print $6}' | cut -c7-)
      # Sampling window: the counters are read again two seconds later.
      sleep 2
      rx_after=$(ifconfig eth0 | sed -n '8p' | awk '{print $2}' | cut -c7-)
      tx_after=$(ifconfig eth0 | sed -n '8p' | awk '{print $6}' | cut -c7-)
      # Bytes over 2 s to kbit/s: bytes * 8 / 1024 / 2 == bytes / 256.
      rx_result=$(( (rx_after - rx_before) / 256 ))
      tx_result=$(( (tx_after - tx_before) / 256 ))
      echo "$time Now_In_Speed: ${rx_result}kbps Now_OUt_Speed: ${tx_result}kbps"
      sleep 2
    done
    
    2、系统状况监控
    
    #!/bin/sh
    # systemstat.sh: append one snapshot of CPU, memory, disk and connection
    # figures to per-metric text files under ./temp.
    ip=192.168.1.227  # NOTE(review): not referenced below; kept from the original
    # The redirections below need the output directory to exist.
    mkdir -p ./temp
    # -b (batch mode) makes top emit plain text when stdout is not a terminal.
    top -b -n 2 | grep "Cpu" >> ./temp/cpu.txt
    free -m | grep "Mem" >> ./temp/mem.txt
    df -k | grep "sda1" >> ./temp/drive_sda1.txt
    #df -k | grep sda2 >> ./temp/drive_sda2.txt
    df -k | grep "/mnt/storage_0" >> ./temp/mnt_storage_0.txt
    df -k | grep "/mnt/storage_pic" >> ./temp/mnt_storage_pic.txt
    time=$(date '+%m.%d %k:%M')
    # Count netstat lines mentioning 219.238.148.30:80 (-F matches the dots literally).
    connect=$(netstat -na | grep -c -F "219.238.148.30:80")
    echo "$time $connect" >> ./temp/connect_count.txt
    
    3、监控主机的磁盘空间,当使用空间超过90%就通过发mail来发警告
    
    #!/bin/bash
    # Monitor the root filesystem: send a warning mail once usage reaches 90 %.
    # df -P keeps each filesystem on a single line; the sed filter keeps the line
    # whose mount point is "/", and field 5 is the use percentage.
    SPACE=$(df -P | sed -n '/\/$/p' | gawk '{print $5}' | sed 's/%//')
    if [ "${SPACE:-0}" -ge 90 ]; then
      # NOTE(review): the original mail command was lost in the source (only the
      # recipient address survived); the subject and body here are reconstructed.
      echo "Disk space on root at ${SPACE}% used" | mail -s "Disk warning" fty89@163.com
    fi
    
    4、监控CPU和内存的使用情况
    
    #!/bin/bash
    # Capture one record of system statistics (date, time, logged-in users, load
    # average, free memory, CPU idle) and append it to a CSV file.
    OUTFILE=/home/xu/capstats.csv
    DATE=$(date +%m/%d/%Y)
    # %M is minutes and %S is seconds; the lowercase forms are month and epoch seconds.
    TIME=$(date +%k:%M:%S)
    TIMEOUT=$(uptime)
    VMOUT=$(vmstat 1 2)
    # Parse uptime by keyword rather than by field number, because the number of
    # fields before "N users" changes with the length of the uptime.
    USERS=$(echo "$TIMEOUT" | sed -n 's/.* \([0-9][0-9]*\) user.*/\1/p')
    # Second load figure (5-minute average): this is what field 9 selected in the
    # "up H:MM, N users" layout the original field numbers correspond to.
    LOAD=$(echo "$TIMEOUT" | sed -n 's/.*load average[s]*: *[0-9.]*, *\([0-9.]*\).*/\1/p')
    # $VMOUT must be quoted to keep vmstat's line breaks; the second line that
    # contains digits is the second (interval) sample.
    SAMPLE=$(echo "$VMOUT" | sed -n '/[0-9]/p' | sed -n '2p')
    FREE=$(echo "$SAMPLE" | gawk '{print $4}')
    IDLE=$(echo "$SAMPLE" | gawk '{print $15}')
    echo "$DATE,$TIME,$USERS,$LOAD,$FREE,$IDLE" >> "$OUTFILE"
    
    5、全方位监控主机
    
    #!/bin/bash
    # check_xu.sh: start background collectors (top, sar, vmstat, iostat) that
    # write their samples into a per-day, per-hour directory under /home/oslog.
    # Crontab entry given by the author: 0 * * * * /home/check_xu.sh
    DAT=$(date +%Y%m%d)
    HOUR=$(date +%H)
    DIR="/home/oslog/host_${DAT}/${HOUR}"
    # 60 samples taken 60 seconds apart.
    DELAY=60
    COUNT=60
    # Make sure the output directory exists (mkdir -p is a no-op when it does).
    /bin/mkdir -p "${DIR}"
    # general check
    export TERM=linux
    /usr/bin/top -b -d "${DELAY}" -n "${COUNT}" > "${DIR}/top_${DAT}.log" 2>&1 &
    # cpu check
    /usr/bin/sar -u "${DELAY}" "${COUNT}" > "${DIR}/cpu_${DAT}.log" 2>&1 &
    #/usr/bin/mpstat -P 0 ${DELAY} ${COUNT} > ${DIR}/cpu_0_${DAT}.log 2>&1 &
    #/usr/bin/mpstat -P 1 ${DELAY} ${COUNT} > ${DIR}/cpu_1_${DAT}.log 2>&1 &
    # memory check
    /usr/bin/vmstat "${DELAY}" "${COUNT}" > "${DIR}/vmstat_${DAT}.log" 2>&1 &
    # I/O check
    /usr/bin/iostat "${DELAY}" "${COUNT}" > "${DIR}/iostat_${DAT}.log" 2>&1 &
    # network check
    /usr/bin/sar -n DEV "${DELAY}" "${COUNT}" > "${DIR}/net_${DAT}.log" 2>&1 &
    #/usr/bin/sar -n EDEV ${DELAY} ${COUNT} > ${DIR}/net_edev_${DAT}.log 2>&1 &

    2、进阶版

    #!/bin/bash
    #################################################
    # Host health monitoring script
    # (checks: memory, CPU, disk, network interface)
    #
    # V1.0 Written by: MR.G Date:2012-03-20
    ##################################################
    # bash is required: the script uses [[ ]] and &>, which plain sh does not provide.

    export LANG=C

    # Administrator mailbox for alert mails
    Email=zhangxiaogang@8tgame.com

    # Timestamp prefixed to every log line (taken once, at start-up)
    time=$(date "+%Y-%m-%d %H:%M:%S")

    # Log file, named after the current date
    log=$(date +%Y-%m-%d).log

    # Path of the configuration file (whitespace-separated list of hosts)
    config=config.ini

    # Proceed only when the configuration file exists and is not empty.
    if [[ -f $config && -s $config ]];then

    # Word splitting of the file content is intentional: one host per word.
    for ip in $(cat "$config")
    do
    # -------------------------------------------------------------------------------------------------
    # Check that the host accepts SSH logins. On failure only the failure is
    # logged and all remaining checks for this host are skipped.
    # -------------------------------------------------------------------------------------------------
    # BatchMode and ConnectTimeout keep an unattended run from hanging on a
    # password prompt or an unreachable host.
    if ! ssh -o BatchMode=yes -o ConnectTimeout=10 "$ip" pwd &> /dev/null;then
    # Alert mail (disabled):
    #echo "主机:$ip的SSH无法登陆,请及时处理!" | mail -s "$ip SSH状态异常" $Email
    echo "$time $ip 的SSH状态检查完毕,状态:Failed.">>"$log"
    else
    echo "$time $ip 的SSH状态检查完毕,状态:Ok." >>"$log"
    
    # -------------------------------------------------------------------------------------------------
    # Synchronise the server's clock
    # ----------------------------------------------------------
    # NTP server to synchronise against
    server="ntp.fudan.edu.cn"

    # First set the system clock from the NTP server, then copy the system
    # clock to the hardware clock; each step is its own ssh invocation.
    for remote_cmd in "/usr/sbin/ntpdate -s $server" "/usr/sbin/hwclock --systohc"
    do
      ssh $ip $remote_cmd
    done
    # -------------------------------------------------------------------------------------------------
    # Check the server's network status by pinging a probe site from the server
    # itself; a failed ping is logged as Failed (hosts behind ICMP filtering excepted).
    # -------------------------------------------------------------------------------------------------
    # Site used as the ping target
    site=www.baidu.com

    # Use the configured probe site instead of repeating the host name.
    if ! ssh "$ip" ping -c3 "$site" >/dev/null; then
      # Alert mail (disabled).
      # NOTE(review): the subject below says "disk space warning"; adjust it
      # before enabling this line.
      #echo "主机:$ip无法ping通,请及时处理!" | mail -s "$ip 磁盘空间警告" $Email
      echo "$time $ip 的网络状态检查完毕,状态:Failed.">>"$log"
    else
      echo "$time $ip 的网络状态检查完毕,状态:Ok." >>"$log"
    fi
    
    # -------------------------------------------------------------------------------------------------
    # Check disk usage: every /dev-backed filesystem whose use percentage reaches
    # the threshold is logged as Failed.
    # -------------------------------------------------------------------------------------------------

    # Threshold (percent)
    space_warn="90"

    # df -P columns: filesystem, blocks, used, available, capacity, mount point.
    # read splits the line directly, so no per-line awk/sed processes are needed.
    ssh "$ip" df -P | grep "^/dev" | while read -r space_name space_blocks space_used space_avail space_per space_mount
    do
      # Strip the trailing "%" from the capacity column.
      space_per=${space_per%\%}
      if [ "$space_per" -ge "$space_warn" ];then
        # Alert mail (disabled):
        #echo "主机:$ip的$space_name分区仅剩下$space_used M,使用率为$space_per,已超过指定阀值,请及时处理!" | mail -s "$ip 磁盘空间警告" $Email
        echo "$time $ip 的${space_name}分区检查完毕,状态:Failed.">>"$log"
      else
        echo "$time $ip 的${space_name}分区检查完毕,状态:Ok." >>"$log"
      fi
    done
    
    # -------------------------------------------------------------------------------------------------
    # Check memory status: when swap usage reaches 80% the check is logged as Failed.
    # -------------------------------------------------------------------------------------------------

    # Threshold (percent)
    swap_warn=80

    # "free -m" prints "Swap: <total> <used> <free>"; fetch all three values with
    # a single ssh call and split them locally.
    swap_info=$(ssh "$ip" free -m | awk '/^Swap/ { print $2, $3, $4 }')
    swap_total=${swap_info%% *}
    swap_rest=${swap_info#* }
    swap_used=${swap_rest%% *}
    swap_free=${swap_rest#* }

    # Hosts with no swap in use are not rated; the total > 0 test also guards
    # the division below.
    if [ "${swap_used:-0}" -ne 0 ] && [ "${swap_total:-0}" -gt 0 ];then
      # Multiply before dividing: integer division of used/total would yield 0.
      swap_per=$(( swap_used * 100 / swap_total ))
      if [ "$swap_per" -ge "$swap_warn" ];then
        # Alert mail (disabled):
        #echo "主机:$ip的Swap交换分区仅剩下$swap_free M,使用率为$swap_per,已超过指定阀值,请及时处理!" | mail -s "$ip 内存使用警告" $Email
        echo "$time ${ip}的Swap分区检查完毕,状态:Failed." >> "$log"
      else
        echo "$time ${ip}的Swap分区检查完毕,状态:Ok." >> "$log"
      fi
    fi
    
    # -------------------------------------------------------------------------------------------------
    # Check CPU status: when usage reaches 80% the check is logged as Failed.
    # -------------------------------------------------------------------------------------------------

    # Threshold (percent)
    cpu_warn=80

    # Extract the idle percentage from top's Cpu summary line. The pattern accepts
    # both "98.0%id" and "98.0 id", so it does not depend on a fixed field number.
    cpu_free=$(ssh "$ip" top -b -n 1 | awk '/Cpu/ && match($0, /[0-9.]+[ %]*id/) { print substr($0, RSTART, RLENGTH) + 0; exit }')
    cpu_used=$(awk -v idle="$cpu_free" 'BEGIN { print 100 - idle }')

    # An unreadable idle value counts as a failed check rather than as Ok.
    if [ -z "$cpu_free" ] || awk -v used="$cpu_used" -v warn="$cpu_warn" 'BEGIN { exit !(used >= warn) }';then
      # Alert mail (disabled):
      #echo "主机:$ip的CPU使用率为$cpu_used%,已超过指定阀值,请及时处理!" | mail -s "$ip CPU使用警告" $Email
      echo "$time $ip 的CPU状态检查完毕,状态:Failed." >> "$log"
    else
      echo "$time $ip 的CPU状态检查完毕,状态:Ok." >> "$log"
    fi
    
    # -------------------------------------------------------------------------------------------------
    # Check the number of logged-in users: more than 3 (i.e. 4 or more) is
    # logged as Failed.
    # -------------------------------------------------------------------------------------------------

    # Threshold
    users_max=4

    # Take the number in front of "user(s)" instead of a fixed field, because the
    # field position in uptime's output varies with the length of the uptime.
    users_now=$(ssh "$ip" uptime | sed -n 's/.* \([0-9][0-9]*\) user.*/\1/p')

    # -ge is the numeric comparison; ">=" inside [ ] would be parsed as a redirection.
    if [ "${users_now:-0}" -ge "$users_max" ];then
      # Alert mail (disabled):
      #echo "$ip登陆的用户数已经达到了$user_nowg个,已超过指定的阀值,请及时处理!" | mail -s "$ip 用户数报警" $Email
      echo "$time ${ip}的用户数检查完毕,状态:Failed." >> "$log"
    else
      echo "$time $ip 的用户数检查完毕,状态:Ok." >> "$log"
    fi
    
    # -------------------------------------------------------------------------------------------------
    # Check the 15-minute load average: when it reaches 0.7 per core the check is
    # logged as Failed.
    # -------------------------------------------------------------------------------------------------

    # Threshold (per core)
    load_warn=0.7

    # Count "processor" entries: present once per logical CPU in /proc/cpuinfo.
    cpu_num=$(ssh "$ip" cat /proc/cpuinfo | grep -c '^processor')

    # The 15-minute load average is the last field of uptime's output.
    load_num=$(ssh "$ip" uptime | awk '{print $NF}')

    # Per-core load with two decimals; left empty when the CPU count is unusable.
    load_average=$(awk -v load="$load_num" -v cpus="$cpu_num" 'BEGIN { if (cpus > 0) printf "%.2f", load / cpus }')

    # An empty result (host data unavailable) counts as a failed check.
    if [ -z "$load_average" ] || awk -v avg="$load_average" -v warn="$load_warn" 'BEGIN { exit !(avg >= warn) }';then
      # Alert mail (disabled):
      #echo "$ip 15分钟单核的平均负载已经达到$load_average,已超过指定的阀值,请及时处理!" | mail -s "$ip 平均负载报警" $Email
      echo "$time $ip 的平均负载检查完毕,状态:Failed." >> "$log"
    else
      echo "$time $ip 的平均负载检查完毕,状态:Ok." >> "$log"
    fi
    
    # -------------------------------------------------------------------------------------------------
    # Check the current number of IP connections: 8000 or more is logged as Failed.
    # -------------------------------------------------------------------------------------------------

    # Threshold
    conns_warn=8000

    # Count the netstat lines that contain both "tcp" and "EST".
    ip_conns=$(ssh $ip netstat -an | awk '/tcp/ && /EST/ { n++ } END { print n + 0 }')

    if [ $ip_conns -lt $conns_warn ];then
      echo "$time $ip 的IP连接数检查完毕,状态:Ok." >> $log
    else
      # Alert mail (disabled):
      #echo "$ip 的IP连接数已经达到$ip_conns,已超过指定的阀值,请及时处理!" | mail -s "$ip IP连接数" $Email
      echo "$time $ip 的IP连接数检查完毕,状态:Failed." >> $log
    fi
    
    # -------------------------------------------------------------------------------------------------
    # Check the Apache service: the master process must be running (one restart
    # is attempted if it is not) and the site must answer with HTTP 200;
    # otherwise the check is logged as Failed.
    # -------------------------------------------------------------------------------------------------

    # Print "1" when $ip has a root-owned httpd process whose parent PID is 1.
    httpd_master_ppid() {
      ssh "$ip" ps -ef | awk '/httpd/ && $1 == "root" && $3 == 1 { print $3; exit }'
    }

    # Print the status code from the last "HTTP request sent ..." line of wget's
    # log for http://$ip/. -O /dev/null discards the page instead of saving it.
    http_status() {
      wget -O /dev/null -o /dev/stdout "http://$ip/" | awk '/HTTP/ { code = $6 } END { print code }'
    }

    httpd=$(httpd_master_ppid)

    if [ "$httpd" != "1" ];then
      echo "$time $ip Apache状态异常,尝试重启进程……" >> "$log"
      ssh "$ip" /etc/init.d/httpd restart > /dev/null 2>&1
      # Give the service time to come up before looking again.
      sleep 100
      httpd=$(httpd_master_ppid)
    fi

    # Ok only when the process is present AND the site answers 200. A process
    # still missing after the restart is a failure without an HTTP probe.
    if [ "$httpd" = "1" ] && [ "$(http_status)" = "200" ];then
      echo "$time $ip 的Apache状态检查完毕,状态:Ok." >> "$log"
    else
      # Alert mail (disabled):
      #echo "主机:$ip 的Apache服务已经没有响应,请及时处理!" | mail -s "$ip Apache服务警告" $Email
      echo "$time $ip 的Apache状态检查完毕,状态:Failed." >> "$log"
    fi
    
    # -------------------------------------------------------------------------------------------------
    # Check the MySQL service by looking for a listener on port 3306; one restart
    # is attempted, and a listener still missing afterwards is logged as Failed
    # (locked tables are not considered).
    # -------------------------------------------------------------------------------------------------

    # Succeed when $ip has a TCP socket in LISTEN state whose local address ends
    # in port 3306. Works for any number of matching lines, including none.
    mysql_listening() {
      ssh "$ip" netstat -na | awk '/^tcp/ && /LISTEN/ && $4 ~ /[:.]3306$/ { up = 1 } END { exit !up }'
    }

    if mysql_listening;then
      echo "$time $ip 的MySQL状态检查完毕,状态:Ok." >> "$log"
    else
      echo "$time $ip MySQL状态异常,尝试重启进程……" >> "$log"
      ssh "$ip" /etc/init.d/mysqld restart > /dev/null 2>&1
      if mysql_listening;then
        echo "$time $ip 的MySQL状态检查完毕,状态:Ok." >> "$log"
      else
        # Alert mail (disabled):
        #echo "主机:$ip 的MySQL服务已经没有响应,请及时处理!" | mail -s "$ip MySQL服务警告" $Email
        echo "$time $ip 的MySQL状态检查完毕,状态:Failed." >> "$log"
      fi
    fi
    
    # -------------------------------------------------------------------------------------------------
    # Check the throughput of the server's eth0: when either direction reaches
    # the threshold the check is logged as Failed.
    # -------------------------------------------------------------------------------------------------

    # Threshold, in KB/s
    speed_warn=10240

    # Sample the counters ON THE MONITORED HOST. Both samples are taken in one
    # remote session so SSH latency does not stretch the one-second window.
    # Each sample yields one "<rx_bytes> <tx_bytes>" line.
    # Assumes the legacy ifconfig layout "RX bytes:<n> ... TX bytes:<n>" -- TODO
    # confirm; other layouts produce no line and the check is logged as Failed.
    net_samples=$(ssh "$ip" 'ifconfig eth0; sleep 1; ifconfig eth0' |
      awk '/bytes/ { split($2, rx, ":"); split($6, tx, ":"); if (rx[2] ~ /^[0-9]+$/ && tx[2] ~ /^[0-9]+$/) print rx[2], tx[2] }')

    # Byte deltas over one second converted to KB/s, printed as "<send> <recv>".
    speeds=$(echo "$net_samples" |
      awk 'NR == 1 { rx = $1; tx = $2 } NR == 2 { printf "%d %d\n", ($2 - tx) / 1024, ($1 - rx) / 1024 }')
    send_speed=${speeds%% *}
    recv_speed=${speeds#* }

    if [ -z "$speeds" ] || [ "$send_speed" -ge "$speed_warn" ] || [ "$recv_speed" -ge "$speed_warn" ];then
      # Alert mail (disabled):
      # echo "$ip 的网卡流速为$send_speed Kb/s(上行)/$recv_speed Kb/s(下行),已超过指定的阀值,请及时处理!" | mail -s "$ip 平均负载报警" $Email
      echo "$time $ip 的网卡流速检查完毕,状态:Failed." >> "$log"
    else
      echo "$time $ip 的网卡流速检查完毕,状态:Ok." >> "$log"
    fi
    
    # End of the per-host checks (closes the SSH-reachable branch).
    fi
    # End of the loop over the hosts listed in the configuration file.
    done
    else
    # Configuration file missing or empty: report on stderr and exit non-zero
    # so that callers such as cron can detect the failure.
    echo "配置文件不存在或内容为空,请检查!" >&2
    exit 1

    fi

    3、日志监控

    需要准备环境 
    1: rsync 安装包 (yum 安装 编译安装均可)
    2:防火墙开放相应端口
    3:sendemail 客户端  上传到/usr/bin/后添加执行权限    (sendEmail附件放在附件)
    
    服务端执行命令
    # Create the rsync daemon's secrets file (user:password) under the same name
    # that rsyncd.conf refers to ("secrets file = /etc/rsync.pas"), then restrict
    # it to the owner.
    echo "work:work" > /etc/rsync.pas && chmod 600 /etc/rsync.pas
    
    1,编辑配置文件/etc/rsyncd.conf如下
    # Global parameters of the rsync daemon
    [global]
    uid = root
    gid = root
    use chroot = yes
    max connections = 50
    pid file = /var/run/rsyncd.pid
    lock file = /var/run/rsyncd.lock
    log file = /var/log/rsyncd.log
    transfer logging = yes
    log format = %t %a %m %f %b
    syslog facility = local3
    timeout = 300


    # One write-only module per client host; each module accepts uploads only
    # from the single address in "hosts allow".
    # "auth users" must name the user stored in the secrets file and used by the
    # clients (work).
    # NOTE(review): "/$path" looks like a placeholder for the real target
    # directory -- confirm before use.
    [1.1]
    read only = false
    write only = yes
    path = /$path
    comment = log
    auth users = work
    secrets file = /etc/rsync.pas
    hosts allow = 10.1.1.1

    [1.2]
    read only = false
    write only = yes
    path = /$path
    comment = log
    auth users = work
    secrets file = /etc/rsync.pas
    hosts allow = 10.1.1.2

    [1.21]
    read only = false
    write only = yes
    path = /$path
    comment = log
    auth users = work
    secrets file = /etc/rsync.pas
    hosts allow = 10.1.1.21

    [1.22]
    read only = false
    write only = yes
    path = /$path
    comment = log
    auth users = work
    secrets file = /etc/rsync.pas
    hosts allow = 10.1.1.22
    
    
    
    巡检脚本
    #!/bin/bash
    # Daily inspection: probe port 80 on every TLM/TP host, check the open-file
    # count of each Tomcat process, collect today's and yesterday's non-INFO log
    # lines into an error log, mail that log and remove it.
    Path=/var/log/
    Time=$(date "+%Y-%m-%d")
    Ytime=$(date -d yesterday "+%Y-%m-%d")
    Tlmip="10.1.8.1 10.1.8.2"
    Tpip="10.1.8.21 10.1.8.22"
    ID=tomcatserver.pid
    Tlmlog="tlm.log tlm-trace.log catalina-${Ytime}.out catalina-${Time}.out"
    Tplog="tp.log tp-trace.log catalina-${Ytime}.out catalina-${Time}.out"
    Errorlog="${Path}${Time}-error.log"
    Contacts="a@vmware.com,b@vmware.com"  # mail recipients, separated by commas

    ############################################ service check ########################################
    # The host lists are split on whitespace on purpose: one probe per address.
    for I in $Tlmip $Tpip; do
      if ! nc -v -w 10 -z "$I" 80; then
        echo "$I service abnormity" >> "$Errorlog"
      fi
    done

    #######################################
    # Inspect the synced logs of a group of hosts.
    # Globals:   Path, ID, Time, Ytime (read); Errorlog (appended to)
    # Arguments: $1 - whitespace-separated host list
    #            $2 - whitespace-separated log file names
    #######################################
    inspect_hosts() {
      local hosts=$1 logs=$2 P L id
      for P in $hosts; do
        # NOTE(review): the PID comes from the host's synced pid file but is looked
        # up in the local /proc -- confirm this is the intended machine.
        id=$(ls "/proc/$(cat "/$Path$P/$ID")/fd" | wc -l)
        if [ "$id" -gt 4000 ]; then
          echo "$P 连接数超过4000,请查看!!!" >> "$Errorlog"
        fi

        for L in $logs; do
          # A log that was not synced is skipped.
          [ -f "$Path$P/$L" ] || continue
          # Keep lines dated today or yesterday, minus INFO lines and stack-trace noise.
          grep -E "$Time|$Ytime" "$Path$P/$L" |
            grep -v -E 'INFO|vmext|at (com|sun|org|java)|Caused by|more' >> "$Errorlog"
        done
      done
    }

    ############################################# inspect TLM ##########################################
    inspect_hosts "$Tlmip" "$Tlmlog"
    ############################################## inspect TP ##########################################
    inspect_hosts "$Tpip" "$Tplog"

    ################################ mail the collected errors to the contacts #########################
    # Nothing is sent when no error was collected (the message file would be missing).
    if [ -s "$Errorlog" ]; then
      /usr/bin/sendEmail -t $Contacts -f 抄送账号 -s smtp地址以及端口:25 -xu 发件箱账号 -xp  发件箱密码 -o message-file=$Errorlog -u "巡检报错信息"
    fi

    rm -f -- "$Errorlog"
    
    
    # Append a crontab entry for root that runs /path/*.sh every day at 08:55.
    # NOTE(review): presumably a one-time setup step rather than part of the
    # inspection script, with /path/*.sh standing for the script's real path -- confirm.
    echo "55 8 * * * /path/*.sh" >> /var/spool/cron/root
    
    
    
    
    
    
    
    
    客户端分别执行命令
    # Create the client-side password file and restrict it to the owner.
    # rsync's --password-file expects the password alone; the user name is given
    # on the command line as work@host.
    echo "work" > /etc/rsync.pas && chmod 600 /etc/rsync.pas
    
    vim /path/rsync.sh 
    #!/bin/sh
    # Push the local Tomcat logs to the rsync daemon. The module is one of those
    # defined in rsyncd.conf on the server (1.1  1.2  1.21  1.22).
    rsync --port=服务端口 -aP --bwlimit=3000 /opt/vmware/instances/myserver/logs/ work@跳板机IP::模块 --password-file=/etc/rsync.pas

    # Make the script executable and schedule it every day at 08:50.
    chmod a+x /path/rsync.sh
    echo "50 8 * * * /path/rsync.sh" >> /var/spool/cron/root
  • 相关阅读:
    SQLSERVER查询数据库文件大小
    SQLSERVER 检查内容
    20万DBA都在关注的12个问题 [转载]
    oswbb工具分析主机性能
    Suspending MMON slave action kewrmapsa_ for 82800 seconds
    一次direct path read 故障处理
    Auto Optimizer Stats Collection in 12c
    ORACLE 动态注册和静态注册的区别(转)
    记一次cursor pin s wait on X的处理
    树莓派3b的raspberrypi系统安装pip
  • 原文地址:https://www.cnblogs.com/Mahavairocana/p/8261686.html
Copyright © 2011-2022 走看看