  • Quick notes on deploying a Prometheus monitoring platform

    ---The platform is provisionally a single node; if needed it can be extended to a highly available cluster

    1. Environment preparation

      cat /etc/redhat-release 
      	CentOS Linux release 7.6.1810 (Core)
      	
      # raise file descriptor / process limits
      cat >> /etc/security/limits.conf <<EOF
      root soft nofile 65535
      root hard nofile 65535
      * soft nproc 65535
      * hard nproc 65535
      * soft nofile 65535
      * hard nofile 65535
      EOF
      echo "ulimit -SH 65535" >> /etc/rc.local
      ulimit -SH 65535
      curl -o /etc/yum.repos.d/CentOS-Base.repo http://mirrors.aliyun.com/repo/Centos-7.repo
      
      curl -o /etc/yum.repos.d/epel.repo http://mirrors.aliyun.com/repo/epel-7.repo
      mkdir /soft 
      mkdir /application
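      A quick check that the new limits and repos are in effect (the ulimit values assume a fresh login shell):

      # file-descriptor limit should now report 65535
      ulimit -Sn; ulimit -Hn
      # both mirrors should list packages
      yum repolist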
      

      2. Installation steps

      1. Install the Prometheus server
      # enter the software directory
      cd /application
      
      # The precompiled 2.18.0-rc.0 release matches the symlink below; building from
      # source (git clone https://github.com/prometheus/prometheus.git && make build) also works.
      wget https://github.com/prometheus/prometheus/releases/download/v2.18.0-rc.0/prometheus-2.18.0-rc.0.linux-amd64.tar.gz
      
      tar xf prometheus-2.18.0-rc.0.linux-amd64.tar.gz
      
      ln -s prometheus-2.18.0-rc.0.linux-amd64 prometheus
      
      cd prometheus
      
      # quick foreground test run (Ctrl+C to stop); the systemd unit below is used for the real service
      ./prometheus --config.file=prometheus.yml
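      A quick sanity check once the test run is up (default listen address :9090 assumed):

      curl -s http://localhost:9090/-/ready
      curl -s http://localhost:9090/-/healthy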
      
      # configure the systemd unit file
      vi /etc/systemd/system/prometheus.service
      [Unit]
      Description=Prometheus Monitoring System
      Documentation=https://prometheus.io/docs/
      After=network.target
      
      [Service]
      ExecStart=/application/prometheus/prometheus \
        --config.file=/application/prometheus/prometheus.yml \
        --web.listen-address=:9090 \
        --web.enable-lifecycle \
        --web.enable-admin-api \
        --web.read-timeout=5m \
        --web.max-connections=512 \
        --storage.tsdb.retention.time=30d
      Restart=on-failure
      
      [Install]
      WantedBy=multi-user.target
      
      systemctl daemon-reload
      systemctl start prometheus
      systemctl enable prometheus
      
      netstat -ltnp|grep 9090
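      Because --web.enable-lifecycle is set, later config changes can be applied without a restart; a minimal example:

      # re-read prometheus.yml and the rule files without restarting the service
      curl -X POST http://localhost:9090/-/reload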
      
      2. Install mysqld_exporter
      Prerequisite: create the MySQL user the exporter will connect as
      CREATE USER 'exporter'@'localhost' IDENTIFIED BY 'XXXXXXXX' WITH MAX_USER_CONNECTIONS 3;
      GRANT PROCESS, REPLICATION CLIENT, SELECT ON *.* TO 'exporter'@'localhost';
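      Optionally confirm the account before wiring up the exporter:

      mysql -e "SHOW GRANTS FOR 'exporter'@'localhost';"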
      
      Download the release tarball
      wget https://github.com/prometheus/mysqld_exporter/releases/download/v0.12.1/mysqld_exporter-0.12.1.linux-amd64.tar.gz
      
      cd /application
      
      tar xf mysqld_exporter-0.12.1.linux-amd64.tar.gz
      
      ln -s mysqld_exporter-0.12.1.linux-amd64 mysqld_exporter
      
      Configure the MySQL credentials for the exporter
      
      vim .my.cnf
      [client]
      user=xxxx
      password=xxxx
      
      Start the exporter
      cd /application/mysqld_exporter
      nohup ./mysqld_exporter --collect.auto_increment.columns --config.my-cnf=.my.cnf &
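      A quick check that the exporter can reach MySQL (mysql_up should be 1 if the credentials in .my.cnf are correct):

      curl -s http://localhost:9104/metrics | grep '^mysql_up'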
      
      3. Install Grafana
      vim /etc/yum.repos.d/grafana.repo
      [grafana]
      name=grafana
      baseurl=https://packages.grafana.com/enterprise/rpm
      repo_gpgcheck=1
      enabled=1
      gpgcheck=1
      gpgkey=https://packages.grafana.com/gpg.key
      sslverify=1
      sslcacert=/etc/pki/tls/certs/ca-bundle.crt
      # install
      yum install grafana-enterprise -y
      
      systemctl start grafana-server
      systemctl enable grafana-server
      
      netstat -ltnp|grep 3000
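      The Prometheus data source can be added in the web UI, or scripted against the Grafana HTTP API; a sketch assuming the default admin:admin login and the Prometheus address used in this setup:

      curl -s -u admin:admin -H 'Content-Type: application/json' \
        -X POST http://localhost:3000/api/datasources \
        -d '{"name":"Prometheus","type":"prometheus","url":"http://10.0.0.15:9090","access":"proxy","isDefault":true}'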
      
      4. Install Alertmanager
      wget https://github.com/prometheus/alertmanager/releases/download/v0.20.0/alertmanager-0.20.0.linux-amd64.tar.gz
      
      cd /application
      
      tar xf alertmanager-0.20.0.linux-amd64.tar.gz
      
      ln -s alertmanager-0.20.0.linux-amd64 alertmanager   # started below
      
      cd /application/alertmanager
      nohup ./alertmanager --config.file=alertmanager.yml &
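      Quick checks that Alertmanager came up and its config parses (amtool ships in the same tarball):

      netstat -ltnp|grep 9093
      ./amtool check-config alertmanager.yml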
      
      

    3. Configuration files

    # Main Prometheus configuration
    [root@prometheus prometheus]# cat prometheus.yml
    # my global config
    global:
      scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
      evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
      # scrape_timeout is set to the global default (10s).
    
    # Alertmanager configuration
    alerting:
      alertmanagers:
      - static_configs:
        - targets:
           - 127.0.0.1:9093
    
    # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
    rule_files:
       - "first_rules.yml"
      # - "second_rules.yml"
    
    # A scrape configuration containing exactly one endpoint to scrape:
    # Here it's Prometheus itself.
    scrape_configs:
      # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
      - job_name: 'prometheus'
    
        # metrics_path defaults to '/metrics'
        # scheme defaults to 'http'.
    
        static_configs:
        - targets: ['10.0.0.15:9090']
      - job_name: 'zabbix-server'
    
        # metrics_path defaults to '/metrics'
        # scheme defaults to 'http'.
    
        static_configs:
        - targets: ['10.0.0.201:9100']
      - job_name: 'zabbix-server-mysql'
    
        # metrics_path defaults to '/metrics'
        # scheme defaults to 'http'.
    
        static_configs:
        - targets: ['10.0.0.201:9104']
      - job_name: 'test-mysql'
    
        # metrics_path defaults to '/metrics'
        # scheme defaults to 'http'.
    
        static_configs:
        - targets: ['10.0.0.15:9104']
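    Before loading or reloading, the main config can be validated with promtool from the Prometheus tarball:

    # checks prometheus.yml plus every rule file it references
    /application/prometheus/promtool check config /application/prometheus/prometheus.yml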
    
    # Rule file (e.g. first_rules.yml, as referenced in prometheus.yml above)
    groups:
    - name: MySQLStatsAlert
      rules:
      - alert: MySQL is down
        expr: mysql_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} MySQL is down"
          description: "MySQL database is down. This requires immediate action!"
      - alert: open files high
        expr: mysql_global_status_innodb_num_open_files > (mysql_global_variables_open_files_limit) * 0.75
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} open files high"
          description: "Open files is high. Please consider increasing open_files_limit."
      - alert: Read buffer size is bigger than max. allowed packet size
        expr: mysql_global_variables_read_buffer_size > mysql_global_variables_slave_max_allowed_packet 
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} Read buffer size is bigger than max. allowed packet size"
          description: "Read buffer size (read_buffer_size) is bigger than max. allowed packet size (max_allowed_packet).This can break your replication."
      - alert: Sort buffer possibly misconfigured
        expr: mysql_global_variables_innodb_sort_buffer_size <256*1024 or mysql_global_variables_read_buffer_size > 4*1024*1024 
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} Sort buffer possibly missconfigured"
          description: "Sort buffer size is either too big or too small. A good value for sort_buffer_size is between 256k and 4M."
      - alert: Thread stack size is too small
        expr: mysql_global_variables_thread_stack <196608
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} Thread stack size is too small"
          description: "Thread stack size is too small. This can cause problems when you use Stored Language constructs for example. A typical is 256k for thread_stack_size."
      - alert: Used more than 80% of max connections limited 
        expr: mysql_global_status_max_used_connections > mysql_global_variables_max_connections * 0.8
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} Used more than 80% of max connections limited"
          description: "Used more than 80% of max connections limited"
      - alert: InnoDB Force Recovery is enabled
        expr: mysql_global_variables_innodb_force_recovery != 0 
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} InnoDB Force Recovery is enabled"
          description: "InnoDB Force Recovery is enabled. This mode should be used for data recovery purposes only. It prohibits writing to the data."
      - alert: InnoDB Log File size is too small
        expr: mysql_global_variables_innodb_log_file_size < 16777216 
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} InnoDB Log File size is too small"
          description: "The InnoDB Log File size is possibly too small. Choosing a small InnoDB Log File size can have significant performance impacts."
      - alert: InnoDB Flush Log at Transaction Commit
        expr: mysql_global_variables_innodb_flush_log_at_trx_commit != 1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} InnoDB Flush Log at Transaction Commit"
          description: "InnoDB Flush Log at Transaction Commit is set to a values != 1. This can lead to a loss of commited transactions in case of a power failure."
      - alert: Table definition cache too small
        expr: mysql_global_status_open_table_definitions > mysql_global_variables_table_definition_cache
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} Table definition cache too small"
          description: "Your Table Definition Cache is possibly too small. If it is much too small this can have significant performance impacts!"
      - alert: Table open cache too small
        expr: mysql_global_status_open_tables >mysql_global_variables_table_open_cache * 99/100
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} Table open cache too small"
          description: "Your Table Open Cache is possibly too small (old name Table Cache). If it is much too small this can have significant performance impacts!"
      - alert: Thread stack size is possibly too small
        expr: mysql_global_variables_thread_stack < 262144
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} Thread stack size is possibly too small"
          description: "Thread stack size is possibly too small. This can cause problems when you use Stored Language constructs for example. A typical is 256k for thread_stack_size."
      - alert: InnoDB Buffer Pool Instances is too small
        expr: mysql_global_variables_innodb_buffer_pool_instances == 1
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} InnoDB Buffer Pool Instances is too small"
          description: "If you are using MySQL 5.5 and higher you should use several InnoDB Buffer Pool Instances for performance reasons. Some rules are: InnoDB Buffer Pool Instance should be at least 1 Gbyte in size. InnoDB Buffer Pool Instances you can set equal to the number of cores of your machine."
      - alert: InnoDB Plugin is enabled
        expr: mysql_global_variables_ignore_builtin_innodb == 1
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} InnoDB Plugin is enabled"
          description: "InnoDB Plugin is enabled"
      - alert: Binary Log is disabled
        expr: mysql_global_variables_log_bin != 1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} Binary Log is disabled"
          description: "Binary Log is disabled. This prohibits you to do Point in Time Recovery (PiTR)."
      - alert: Binlog Cache size too small
        expr: mysql_global_variables_binlog_cache_size < 1048576
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} Binlog Cache size too small"
          description: "Binlog Cache size is possibly to small. A value of 1 Mbyte or higher is OK."
      - alert: Binlog Statement Cache size too small
        expr: mysql_global_variables_binlog_stmt_cache_size <1048576 and mysql_global_variables_binlog_stmt_cache_size > 0
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} Binlog Statement Cache size too small"
          description: "Binlog Statement Cache size is possibly to small. A value of 1 Mbyte or higher is typically OK."
      - alert: Binlog Transaction Cache size too small
        expr: mysql_global_variables_binlog_cache_size  <1048576
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} Binlog Transaction Cache size too small"
          description: "Binlog Transaction Cache size is possibly to small. A value of 1 Mbyte or higher is typically OK."
      - alert: Sync Binlog is enabled
        expr: mysql_global_variables_sync_binlog == 1
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} Sync Binlog is enabled"
          description: "Sync Binlog is enabled. This leads to higher data security but on the cost of write performance."
      - alert: IO thread stopped
        expr: mysql_slave_status_slave_io_running != 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} IO thread stopped"
          description: "IO thread has stopped. This is usually because it cannot connect to the Master any more."
      - alert: SQL thread stopped 
        expr: mysql_slave_status_slave_sql_running == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} SQL thread stopped"
          description: "SQL thread has stopped. This is usually because it cannot apply a SQL statement received from the master."
      - alert: SQL thread stopped
        expr: mysql_slave_status_slave_sql_running != 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} Sync Binlog is enabled"
          description: "SQL thread has stopped. This is usually because it cannot apply a SQL statement received from the master."
      - alert: Slave lagging behind Master
        expr: rate(mysql_slave_status_seconds_behind_master[1m]) >30 
        for: 1m
        labels:
          severity: warning 
        annotations:
          summary: "Instance {{ $labels.instance }} Slave lagging behind Master"
          description: "Slave is lagging behind Master. Please check if Slave threads are running and if there are some performance issues!"
      - alert: Slave is NOT read only (Please ignore this warning indicator.)
        expr: mysql_global_variables_read_only != 0
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} Slave is NOT read only"
    - name: example
      rules:
    
      # Alert for any instance that is unreachable for >1 minute.
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
    - name: mysql
      rules:
    
      # Alert when the replication IO thread is stopped.
      - alert: Replication is down
        expr: mysql_slave_status_slave_io_running == 0
        for: 0m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} replication is down"
          description: "{{ $labels.instance }} of job {{ $labels.job }}: replication (slave IO thread) is down."
    
    # Alertmanager alerting configuration (alertmanager.yml)
    global:
      resolve_timeout: 1m
      smtp_smarthost: 'smtp.qq.com:465'
      smtp_from: 'xxxxxxx@qq.com'
      smtp_auth_username: 'xxxxxxx@qq.com'
      smtp_auth_password: 'xxxxxxx'
      smtp_require_tls: false
    
    route:
      group_by: ['alertname']
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 1h
      receiver: 'web.hook'
    receivers:
    - name: 'web.hook'
      email_configs:
      - to: 'xxxxxxx@126.com'
    inhibit_rules:
      - source_match:
          severity: 'critical'
        target_match:
          severity: 'warning'
        equal: ['alertname', 'dev', 'instance']
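    Once Alertmanager is running with this file, the email route can be tested end to end by pushing a throwaway alert at its API (v1 endpoint, still present in 0.20):

    curl -s -X POST http://localhost:9093/api/v1/alerts -H 'Content-Type: application/json' \
      -d '[{"labels":{"alertname":"TestAlert","severity":"warning"},"annotations":{"summary":"test alert from curl"}}]'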
    
    4. Configure the monitoring dashboards in Grafana

    5. Fallback plan: Zabbix

      1. Prepare the initial environment
      Increase the file descriptor limits (run on all three nodes)
      cat >> /etc/security/limits.conf <<EOF
      root soft nofile 65535
      root hard nofile 65535
      * soft nproc 65535
      * hard nproc 65535
      * soft nofile 65535
      * hard nofile 65535
      EOF
      echo "ulimit -SH 65535" >> /etc/rc.local
      ulimit -SH 65535
      2. Configure the yum repositories
      curl -o /etc/yum.repos.d/CentOS-Base.repo http://mirrors.aliyun.com/repo/Centos-7.repo
      curl -o /etc/yum.repos.d/epel.repo http://mirrors.aliyun.com/repo/epel-7.repo
      rpm -Uvh https://repo.zabbix.com/zabbix/4.0/rhel/7/x86_64/zabbix-release-4.0-2.el7.noarch.rpm
      # swap the Zabbix repo to a mirror here, otherwise the download gets blocked (this only started recently)
      Copy zabbix.repo to the other servers
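      One way to do the mirror swap, assuming the Aliyun Zabbix mirror is reachable (the mirror path is an assumption; adjust to whichever mirror you use):

      # point the Zabbix repo at a mirror and rebuild the cache
      sed -i 's#repo.zabbix.com#mirrors.aliyun.com/zabbix#g' /etc/yum.repos.d/zabbix.repo
      yum clean all && yum makecache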
      
      3. Install zabbix-server   # server node only
      yum install zabbix-server-mysql zabbix-web-mysql zabbix-agent httpd zabbix-get -y   # clients only need the agent
      yum install zabbix-agent -y
      
      4. Install the database (MariaDB)
      yum install mariadb-server -y
      systemctl start mariadb
      systemctl enable mariadb
      
      5. Import the schema
      mysql_secure_installation
      mysql -e "create database zabbix character set utf8 collate utf8_bin;"
      mysql -e "grant all privileges on zabbix.* to zabbix@localhost identified by '123456';"
      zcat /usr/share/doc/zabbix-server-mysql*/create.sql.gz | mysql  zabbix
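      A quick check that the schema import worked (the table count should be well over a hundred):

      mysql zabbix -e "show tables;" | wc -l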
      
      6. Edit the configuration files
      vi /etc/zabbix/zabbix_server.conf 
      DBHost=localhost 
      DBName=zabbix
      DBUser=zabbix
      DBPassword=123456
      
      
      vi /etc/httpd/conf.d/zabbix.conf
      php_value date.timezone Asia/Shanghai
      
      7. Start the services
      systemctl start zabbix-server
      systemctl enable zabbix-server
      systemctl start httpd
      systemctl enable httpd
      
      8. Open the Zabbix web UI
      Enter the database user and password in the setup wizard
      Default login: Admin / zabbix
      
      9. Import the templates
      ...
      
      10. Adjust the templates
      cp userparameter_percona_mysql.conf /etc/zabbix/zabbix_agentd.d/
      vim ss_get_mysql_stats.php          # set the MySQL user and password
      vim get_mysql_stats_wrapper.sh      # set the user and password (used for the replication check)
      
      11. Fix garbled characters in the graphs
      Copy simkai.ttf to /usr/share/fonts/dejavu/
      Repoint the web-font symlink
      ln -snf /usr/share/fonts/dejavu/simkai.ttf /etc/alternatives/zabbix-web-font
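      To confirm the font chain now resolves to the new font (paths as above):

      ls -l /etc/alternatives/zabbix-web-font
      fc-list | grep -i simkai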
      
      12. Install a test database on the agent side
      (omitted)
      
      
      
      1. chmod 755 on the MySQL data directory and grant access to the zabbix user; redirect errors with 2>/dev/null
      
      # privileges needed by the monitoring account:
      # SELECT, PROCESS, SUPER, REPLICATION SLAVE, REPLICATION CLIENT
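      A sketch of creating such a monitoring account (the user name and password below are placeholders, not taken from the original notes):

      mysql -e "GRANT SELECT, PROCESS, SUPER, REPLICATION SLAVE, REPLICATION CLIENT ON *.* TO 'zabbix_mon'@'localhost' IDENTIFIED BY 'CHANGEME';"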
      

      13. Zabbix agent-side deployment script

      #!/bin/bash
      # created by dhc
      DIR=`dirname $0`
      zabbix_conf=/etc/zabbix/zabbix_agentd.conf
      source /etc/init.d/functions
      yum_dir=/etc/yum.repos.d
      percona_name=percona-zabbix-templates-1.1.8-1.noarch.rpm
      scripts_dir=/var/lib/zabbix/percona/scripts
      template_file=/var/lib/zabbix/percona/templates/userparameter_percona_mysql.conf
      zabbix_keydir=/etc/zabbix/zabbix_agentd.d/
      yum_ip=10.0.0.50
      #config yum repo
      yum_repo(){
      #curl -o ${yum_dir}/CentOS-Base.repo http://mirrors.aliyun.com/repo/Centos-7.repo
      #curl -o ${yum_dir}/epel.repo http://mirrors.aliyun.com/repo/epel-7.repo
      cp $DIR/zabbix.repo ${yum_dir}/
      yum clean all
      yum makecache
      }
      #install zabbix... rpm packages
      yum_install(){
      ping -w1 -c1 $yum_ip &>/dev/null
      if [ $? -eq 0 ];then
          yum install zabbix-agent php php-mysql -y
          if [ ! $? -eq 0 ];then
              action "Install failed; please run 'yum install zabbix-agent php php-mysql -y' by hand   " /bin/false
              exit
          fi
      else
          action "Cannot reach $yum_ip ...   " /bin/false
          exit
      fi
      }
      # configure zabbix_agent to point at the zabbix server
      zabbix_conf(){
      if [ -e $zabbix_conf ];then
          rm -f $zabbix_conf 2>/dev/null
      	cp $DIR/zabbix_agentd.conf /etc/zabbix/
      	sed -i "/^Hostname/c Hostname=`hostname`" $zabbix_conf
      else
      	action "zabbix_agent 安装失败,请检查  " /bin/false
      	exit
      fi
      }
      
      zabbix_start(){
      systemctl start zabbix-agent
      }
      zabbix_restart(){
      systemctl restart zabbix-agent
      }
      #install percona monitoring templates 
      percona_moni(){
      if [ -e $DIR/$percona_name ];then
      	rpm -ivh $DIR/percona-zabbix-templates-1.1.8-1.noarch.rpm
      	if [ -e $template_file ];then
      #		cp $template_file $zabbix_keydir
      		cp $DIR/userparameter_percona_mysql.conf $zabbix_keydir
      		rm -f $scripts_dir/get_mysql_stats_wrapper.sh && cp $DIR/get_mysql_stats_wrapper.sh $scripts_dir
      		chmod +x $scripts_dir/get_mysql_stats_wrapper.sh
      	else
      		action "$DIR perconna rpm 安装失败  " /bin/false
      		exit
      	fi
      else
      	action "$DIR perconna rpm包不存在  " /bin/false
      	exit
      fi
      }
      # remind the operator to edit the scripts with the MySQL user, password and socket
      scripts_conf(){
      	echo -e "============================================================="
      	action  "install is ok " /bin/true
      	echo -e "============================================================="
      	echo -e "
       
       
       
       
        "
      	sleep 3
      	echo -e "*****请在配置文件中修改数据库信息 !!!*****
      	
       $scripts_dir/ss_get_mysql_stats.php (30行)
      	
       $scripts_dir/get_mysql_stats_wrapper.sh (19行)"
      }
      # create the stats cache file and hand it over to the zabbix user
      chown_file(){
      > /tmp/localhost-mysql_cacti_stats.txt
      chown zabbix.zabbix /tmp/localhost-mysql_cacti_stats.txt
      }
      last(){
      /usr/bin/php -q /var/lib/zabbix/percona/scripts/ss_get_mysql_stats.php --host localhost --items gg
      rm -rf /tmp/localhost-mysql_cacti_stats.txt
      }
      yum_repo
      yum_install
      zabbix_conf
      zabbix_start
      percona_moni
      scripts_conf
      chown_file
      zabbix_restart
      last
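      The script expects the files it copies (zabbix.repo, zabbix_agentd.conf, the percona rpm, userparameter_percona_mysql.conf, get_mysql_stats_wrapper.sh) to sit in the same directory as itself; a usage sketch with illustrative names:

      cd /soft/zabbix-agent-kit        # hypothetical directory holding the script and its payload files
      bash install_zabbix_agent.sh     # hypothetical name for the script above; run on each monitored MySQL host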
      
      
  • Original post: https://www.cnblogs.com/dinghc/p/13260401.html