zoukankan      html  css  js  c++  java
  • 基于Prometheus监控实例

    部署Prometheus

    基于Prometheus+Grafana监控服务对象,如服务器,MySQL/mongodb等数据库

    前期准备

    软件下载

    #  Prometheus Server
    https://prometheus.io/download/
    
    wget -c https://github.com/prometheus/prometheus/releases/download/v2.20.0/prometheus-2.20.0.linux-amd64.tar.gz &
    
    # 告警通知管理组件
    wget -c https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz &
    
    # exporter组件
    wget -c https://github.com/prometheus/consul_exporter/releases/download/v0.7.1/consul_exporter-0.7.1.linux-amd64.tar.gz &
    wget -c https://github.com/prometheus/mysqld_exporter/releases/download/v0.12.1/mysqld_exporter-0.12.1.linux-amd64.tar.gz &
    wget -c https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-amd64.tar.gz &
    
    

    Prometheus 安装

    传统二进制包安装和 Docker 安装方式

    二进制包安装

    mkdir -p /ups/app/monitor/
    # 解压
    tar -xf prometheus-*.linux-amd64.tar.gz -C /ups/app/monitor/
    # 重命名目录
    cd /ups/app/monitor/
    mv prometheus-*.linux-amd64 prometheus
    # 或者不重命名,改用软链接(与上一条 mv 二选一):
    # ln -s prometheus-2.20.0.linux-amd64 prometheus
    
    # 创建目录
    mkdir -p prometheus/{bin,logs,config/rules,data}
    cd prometheus/config && mkdir -p targets/{node,redis,postgresql,mysql}
    # 创建用户
    # groupadd -g 2000 prometheus
    useradd -r -M -c "Prometheus Server" -d /ups/app/monitor/ -s /sbin/nologin prometheus
    # 修改目录属主
    chown -R prometheus:prometheus /ups/app/monitor/prometheus
    # 重构目录结构
    cd /ups/app/monitor/prometheus
    mv prometheus promtool tsdb bin/
    mv prometheus.yml config/
    
    服务启动参数项
    [root@progs prometheus]# ./bin/prometheus --help
    usage: prometheus [<flags>]
    
    The Prometheus monitoring server
    
    Flags:
      -h, --help                     Show context-sensitive help (also try --help-long and --help-man).
          --version                  Show application version.
          --config.file="prometheus.yml"  
                                     Prometheus configuration file path.
          --web.listen-address="0.0.0.0:9090"  
                                     Address to listen on for UI, API, and telemetry.
          --web.read-timeout=5m      Maximum duration before timing out read of the request, and closing idle connections.
          --web.max-connections=512  Maximum number of simultaneous connections.
          --web.external-url=<URL>   The URL under which Prometheus is externally reachable (for example, if Prometheus is served via a reverse
                                     proxy). Used for generating relative and absolute links back to Prometheus itself. If the URL has a path
                                     portion, it will be used to prefix all HTTP endpoints served by Prometheus. If omitted, relevant URL
                                     components will be derived automatically.
          --web.route-prefix=<path>  Prefix for the internal routes of web endpoints. Defaults to path of --web.external-url.
          --web.user-assets=<path>   Path to static asset directory, available at /user.
          --web.enable-lifecycle     Enable shutdown and reload via HTTP request.
          --web.enable-admin-api     Enable API endpoints for admin control actions.
          --web.console.templates="consoles"  
                                     Path to the console template directory, available at /consoles.
          --web.console.libraries="console_libraries"  
                                     Path to the console library directory.
          --web.page-title="Prometheus Time Series Collection and Processing Server"  
                                     Document title of Prometheus instance.
          --web.cors.origin=".*"     Regex for CORS origin. It is fully anchored. Example: 'https?://(domain1|domain2).com'
          --storage.tsdb.path="data/"  
                                     Base path for metrics storage.
          --storage.tsdb.retention=STORAGE.TSDB.RETENTION  
                                     [DEPRECATED] How long to retain samples in storage. This flag has been deprecated, use
                                     "storage.tsdb.retention.time" instead.
          --storage.tsdb.retention.time=STORAGE.TSDB.RETENTION.TIME  
                                     How long to retain samples in storage. When this flag is set it overrides "storage.tsdb.retention". If neither
                                     this flag nor "storage.tsdb.retention" nor "storage.tsdb.retention.size" is set, the retention time defaults
                                     to 15d. Units Supported: y, w, d, h, m, s, ms.
          --storage.tsdb.retention.size=STORAGE.TSDB.RETENTION.SIZE  
                                     [EXPERIMENTAL] Maximum number of bytes that can be stored for blocks. Units supported: KB, MB, GB, TB, PB.
                                     This flag is experimental and can be changed in future releases.
          --storage.tsdb.no-lockfile  
                                     Do not create lockfile in data directory.
          --storage.tsdb.allow-overlapping-blocks  
                                     [EXPERIMENTAL] Allow overlapping blocks, which in turn enables vertical compaction and vertical query merge.
          --storage.tsdb.wal-compression  
                                     Compress the tsdb WAL.
          --storage.remote.flush-deadline=<duration>  
                                     How long to wait flushing sample on shutdown or config reload.
          --storage.remote.read-sample-limit=5e7  
                                     Maximum overall number of samples to return via the remote read interface, in a single query. 0 means no
                                     limit. This limit is ignored for streamed response types.
          --storage.remote.read-concurrent-limit=10  
                                     Maximum number of concurrent remote read calls. 0 means no limit.
          --storage.remote.read-max-bytes-in-frame=1048576  
                                     Maximum number of bytes in a single frame for streaming remote read response types before marshalling. Note
                                     that client might have limit on frame size as well. 1MB as recommended by protobuf by default.
          --rules.alert.for-outage-tolerance=1h  
                                     Max time to tolerate prometheus outage for restoring "for" state of alert.
          --rules.alert.for-grace-period=10m  
                                     Minimum duration between alert and restored "for" state. This is maintained only for alerts with configured
                                     "for" time greater than grace period.
          --rules.alert.resend-delay=1m  
                                     Minimum amount of time to wait before resending an alert to Alertmanager.
          --alertmanager.notification-queue-capacity=10000  
                                     The capacity of the queue for pending Alertmanager notifications.
          --alertmanager.timeout=10s  
                                     Timeout for sending alerts to Alertmanager.
          --query.lookback-delta=5m  The maximum lookback duration for retrieving metrics during expression evaluations and federation.
          --query.timeout=2m         Maximum time a query may take before being aborted.
          --query.max-concurrency=20  
                                     Maximum number of queries executed concurrently.
          --query.max-samples=50000000  
                                     Maximum number of samples a single query can load into memory. Note that queries will fail if they try to load
                                     more samples than this into memory, so this also limits the number of samples a query can return.
          --log.level=info           Only log messages with the given severity or above. One of: [debug, info, warn, error]
          --log.format=logfmt        Output format of log messages. One of: [logfmt, json]
    
    配置服务项
    # 配置服务启动项
    cat > /usr/lib/systemd/system/prometheus.service <<-EOF
    [Unit]
    Description=https://prometheus.io
    After=network.target
    #After=postgresql.service mariadb.service mysql.service
    Wants=network-online.target
    
    [Service]
    User=prometheus
    Group=prometheus
    
    Type=simple
    
    WorkingDirectory=/ups/app/monitor/prometheus/
    # RuntimeDirectory=prometheus
    # RuntimeDirectoryMode=0750
    ExecStart=/ups/app/monitor/prometheus/bin/prometheus \
        --config.file=/ups/app/monitor/prometheus/config/prometheus.yml \
        --storage.tsdb.retention.time=30d \
        --storage.tsdb.path="/ups/app/monitor/prometheus/data/" \
        --web.console.templates=/ups/app/monitor/prometheus/consoles \
        --web.console.libraries=/ups/app/monitor/prometheus/console_libraries \
        --web.enable-lifecycle --web.enable-admin-api \
        --web.listen-address=:9090
    Restart=on-failure
    # Sets open_files_limit
    LimitNOFILE=10000
    TimeoutStopSec=20
    
    StandardOutput=syslog
    StandardError=syslog
    SyslogIdentifier=prometheus
    
    [Install]
    WantedBy=multi-user.target
    EOF
    
    日志重定向输出到指定文件
    cat > /etc/rsyslog.d/prometheus.conf <<-'EOF'
    if $programname == 'prometheus' then /ups/app/monitor/prometheus/logs/prometheusd.log
    & stop
    EOF
    
    配置参数文件

    vi /ups/app/monitor/prometheus/config/prometheus.yml

    # my global config
    global:
      scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
      evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
      # scrape_timeout is set to the global default (10s).
    
    # Alertmanager configuration
    alerting:
      alertmanagers:
      - static_configs:
        - targets:
          - progs:9093  # 对应启动的alertmanager节点的9093端口
    
    # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
    rule_files:
      - "rules/alert_node.yml"
      - "rules/alert_mysql.yml"
      # - "first_rules.yml"
      # - "second_rules.yml"
    
    # A scrape configuration containing exactly one endpoint to scrape:
    # Here it's Prometheus itself.
    scrape_configs:
      # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
      - job_name: 'prometheus'
    
        # metrics_path defaults to '/metrics'
        # scheme defaults to 'http'.
    
        static_configs:
        - targets: ['localhost:9090']
    
      - job_name: 'node'
        static_configs:
        - targets: ['localhost:9100']
        relabel_configs:
        - action: replace
          source_labels: ['__address__']  ##源标签
          regex: (.*):(.*)                ##正则,会匹配到__address__值
          replacement: $1                 ##引用正则匹配到的内容
          target_label: HOSTNAME          ##赋予新的标签,名为HOSTNAME
    
      - job_name: 'MySQL'
        static_configs:
        - targets: ['localhost:9104']
        relabel_configs:
        - action: replace
          source_labels: ['__address__']  ##源标签
          regex: (.*):(.*)                ##正则,会匹配到__address__值
          replacement: $1                 ##引用正则匹配到的内容
          target_label: instance          ##赋予新的标签,名为 instance
    
    检查配置文件
    cd /ups/app/monitor/prometheus
    ./bin/promtool check config config/prometheus.yml
    
    启动服务
    # 启动服务
    ./bin/prometheus --config.file=config/prometheus.yml
    或
    # 加载服务
    systemctl daemon-reload
    
    systemctl enable prometheus.service
    systemctl start  prometheus.service
    systemctl stop   prometheus.service
    systemctl status prometheus.service
    
    重新加载Prometheus服务

    增加启动参数--web.enable-lifecycle可以不关闭服务方式加载配置

    curl -X POST http://localhost:9090/-/reload
    
    验证
    # 运行 --version 检查运行环境是否正常
    ./bin/prometheus --version
    
    lsof -i :9090
    
    # 打开Web界面,默认端口9090
    http://192.168.10.181:9090
    

    img

    Docker安装方式

    安装docker软件
    yum -y install docker
    
    执行命令安装Prometheus
    使用 Quay.io 或 Docker Hub 镜像仓库安装
    $ docker run --name prometheus -d -p 127.0.0.1:9090:9090 quay.io/prometheus/prometheus
    
    # 通过prometheus.yml文件启动
    docker run \
        -p 9090:9090 \
        -v /tmp/prometheus.yml:/etc/prometheus/prometheus.yml \
        prom/prometheus
    
    # 配置使用额外的卷
    docker run \
        -p 9090:9090 \
        -v /path/to/config:/etc/prometheus \
        prom/prometheus
    
    
    通过Dockerfile安装
    FROM prom/prometheus
    ADD prometheus.yml /etc/prometheus/
    
    # 
    docker build -t my-prometheus .
    docker run -p 9090:9090 my-prometheus
    
    Docker管理prometheus
    # 运行 docker ps 查看所有服务
    docker ps
    
    
    运行 docker start prometheus 启动服务
    
    运行 docker stats prometheus 查看 prometheus 状态
    
    运行 docker stop prometheus 停止服务
    

    配置

    Prometheus 启动的时候,可以通过运行参数 --config.file 指定配置文件,默认为 prometheus.yml

    在配置文件中我们可以指定 global, alerting, rule_files, scrape_configs, remote_write, remote_read 等属性。

    全局配置

    global 属于全局的默认配置,它主要包含 4 个属性:

    • scrape_interval: 拉取 targets 的默认时间间隔。
    • scrape_timeout: 拉取一个 target 的超时时间。
    • evaluation_interval: 执行 rules 的时间间隔。
    • external_labels: 额外的属性,会添加到拉取的数据并存到数据库中。

    告警配置

    可以使用运行参数 --alertmanager.xxx 来配置 Alertmanager,但这种方式不够灵活:不支持动态更新加载,也不支持动态定义告警属性。

    因此,通过alerting 配置主要用来解决这个问题。它能够更好的管理 Alertmanager, 主要包含 2 个参数:

    • alert_relabel_configs: 动态修改 alert 属性的规则配置。
    • alertmanagers: 用于动态发现 Alertmanager 的配置。

    image-20200804120850181

    规则配置

    rule_files 主要用于配置 rules 文件,它支持多个文件以及文件目录

    # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
    rule_files:
      # - "first_rules.yml"
      # - "second_rules.yml"
    
    

    数据拉取配置

    scrape_configs 主要用于配置拉取数据节点,每一个拉取配置主要包含以下参数:

    • job_name:任务名称
    • honor_labels: 用于解决拉取数据标签有冲突,当设置为 true, 以拉取数据为准,否则以服务配置为准
    • params:数据拉取访问时带的请求参数
    • scrape_interval: 拉取时间间隔
    • scrape_timeout: 拉取超时时间
    • metrics_path: 拉取节点的 metric 路径
    • scheme: 拉取数据访问协议
    • sample_limit: 每次拉取允许接收的样本(sample)数量上限(在 metric relabel 之后计数);如果超过限制,本次拉取整体视为失败,数据不入存储;默认值为0,表示没有限制
    • relabel_configs: 拉取数据重置标签配置
    • metric_relabel_configs:metric 重置标签配置

    远程可写存储

    remote_write 主要用于可写远程存储配置,主要包含以下参数:

    • url: 访问地址
    • remote_timeout: 请求超时时间
    • write_relabel_configs: 标签重置配置, 拉取到的数据,经过重置处理后,发送给远程存储

    注意: remote_write 属于试验阶段,慎用

    远程可读存储

    remote_read 主要用于可读远程存储配置,主要包含以下参数:

    • url: 访问地址
    • remote_timeout: 请求超时时间

    注意: remote_read 属于试验阶段,慎用

    服务发现

    在 Prometheus 的配置中,一个最重要的概念就是数据源 target,而数据源的配置主要分为静态配置和动态发现, 大致为以下几类:

    • static_configs: 静态服务发现
    • dns_sd_configs: DNS 服务发现
    • file_sd_configs: 文件服务发现
    • consul_sd_configs: Consul 服务发现
    • serverset_sd_configs: Serverset 服务发现
    • nerve_sd_configs: Nerve 服务发现
    • marathon_sd_configs: Marathon 服务发现
    • kubernetes_sd_configs: Kubernetes 服务发现
    • gce_sd_configs: GCE 服务发现
    • ec2_sd_configs: EC2 服务发现
    • openstack_sd_configs: OpenStack 服务发现
    • azure_sd_configs: Azure 服务发现
    • triton_sd_configs: Triton 服务发现

    配置样例

    global:
      scrape_interval:     15s # By default, scrape targets every 15 seconds.
      evaluation_interval: 15s # By default, evaluate rules every 15 seconds.
    
    rule_files:
      - "rules/node.rules"
    
    scrape_configs:
      - job_name: 'prometheus'
        scrape_interval: 5s
        static_configs:
          - targets: ['localhost:9090']
    
      - job_name: 'node'
        scrape_interval: 8s
        static_configs:
          - targets: ['127.0.0.1:9100', '127.0.0.12:9100']
    
      - job_name: 'mysqld'
        static_configs:
          - targets: ['127.0.0.1:9104']
          
      - job_name: 'memcached'
        static_configs:
          - targets: ['127.0.0.1:9150']
    

    部署Grafana

    web可视化软件

    软件下载地址

    # grafana程序包
    https://grafana.com/grafana/download 
    # grafana-dashboards包
    https://github.com/percona/grafana-dashboards/releases
    
    
    # Standalone Linux Binaries(64 Bit)SHA256: b6cbc04505edb712f206228261d0ea5ab7e9c03e9f77d0d36930886c861366ed
    wget https://dl.grafana.com/oss/release/grafana-7.1.1.linux-amd64.tar.gz
    tar -xf grafana-7.1.1.linux-amd64.tar.gz
    

    软件安装部署

    二进制包安装

    mkdir -p /ups/app/monitor/
    # 解压
    tar -xf grafana-*.linux-amd64.tar.gz -C /ups/app/monitor/
    
    # 重命名目录
    cd /ups/app/monitor/
    mv grafana-7.1.1 grafana
    mkdir -p /ups/app/monitor/grafana/logs
    
    # 创建用户
    # groupadd -g 2001 grafana
    useradd -r -d /ups/app/monitor/grafana -c "Grafana Server" -M -s /sbin/nologin grafana
    
    # 修改目录属主
    chown -R grafana.grafana /ups/app/monitor/grafana
    
    配置服务项
    # 配置服务启动项
    cat > /usr/lib/systemd/system/grafana.service <<-EOF
    [Unit]
    Description=Grafana instance
    Documentation=http://docs.grafana.org
    Wants=network-online.target
    After=network-online.target
    #After=postgresql-12.service mysql3308.service mysql.service
    
    [Service]
    # EnvironmentFile=/etc/sysconfig/grafana-server
    User=grafana
    Group=grafana
    Type=notify
    Restart=on-failure
    WorkingDirectory=/ups/app/monitor/grafana
    RuntimeDirectory=grafana
    RuntimeDirectoryMode=0750
    
    # ExecStart=/ups/app/monitor/grafana/bin/grafana-server                               
    #                             --config=${CONF_FILE}                                   
    #                             --pidfile=${PID_FILE_DIR}/grafana-server.pid            
    #                             --packaging=rpm                                         
    #                             cfg:default.paths.logs=${LOG_DIR}                       
    #                             cfg:default.paths.data=${DATA_DIR}                      
    #                             cfg:default.paths.plugins=${PLUGINS_DIR}                
    #                             cfg:default.paths.provisioning=${PROVISIONING_CFG_DIR}  
    
    ExecStart=/ups/app/monitor/grafana/bin/grafana-server
    LimitNOFILE=10000
    TimeoutStopSec=20
    
    #StandardOutput=syslog
    #StandardError=syslog
    #SyslogIdentifier=grafana
    
    [Install]
    WantedBy=multi-user.target
    EOF
    
    日志重定向输出到指定文件(需先取消上面服务文件中 StandardOutput/StandardError/SyslogIdentifier 三行的注释)
    cat > /etc/rsyslog.d/grafana.conf <<-'EOF'
    if $programname == 'grafana' then /ups/app/monitor/grafana/logs/grafana.log
    & stop
    EOF
    
    启动服务
    # 启动服务
    /ups/app/monitor/grafana/bin/grafana-server &
    或
    # 加载服务
    systemctl daemon-reload
    
    systemctl enable  grafana.service
    systemctl start   grafana.service
    systemctl stop    grafana.service
    systemctl restart grafana.service
    systemctl status  grafana.service
    

    Docker安装方式

    docker run -d --name=grafana -p 3000:3000 grafana/grafana
    

    验证

    # 打开Web界面,默认端口3000 (默认账号/密码:admin/admin)
    http://192.168.10.181:3000
    

    img

    配置文件

    路径

    • 默认路径:$WORKING_DIR/conf/defaults.ini
    • 自定义配置:$WORKING_DIR/conf/custom.ini
    • 使用--config参数覆盖自定义配置文件路径
      • ./grafana-server --config /custom/config.ini --homepath /custom/homepath cfg:default.paths.logs=/custom/path

    添加插件

    语法

    [root@progs bin]# ./grafana-cli --help
    NAME:
       Grafana CLI - A new cli application
    
    USAGE:
       grafana-cli [global options] command [command options] [arguments...]
    
    VERSION:
       7.1.1
    
    AUTHOR:
       Grafana Project <hello@grafana.com>
    
    COMMANDS:
       plugins  Manage plugins for grafana
       admin    Grafana admin commands
       help, h  Shows a list of commands or help for one command
    
    GLOBAL OPTIONS:
       --pluginsDir value       Path to the Grafana plugin directory (default: "/var/lib/grafana/plugins") [$GF_PLUGIN_DIR]
       --repo value             URL to the plugin repository (default: "https://grafana.com/api/plugins") [$GF_PLUGIN_REPO]
       --pluginUrl value        Full url to the plugin zip file instead of downloading the plugin from grafana.com/api [$GF_PLUGIN_URL]
       --insecure               Skip TLS verification (insecure) (default: false)
       --debug                  Enable debug logging (default: false)
       --configOverrides value  Configuration options to override defaults as a string. e.g. cfg:default.paths.log=/dev/null
       --homepath value         Path to Grafana install/home path, defaults to working directory
       --config value           Path to config file
       --help, -h               show help (default: false)
       --version, -v            print the version (default: false)
    
    
    # 查询可用的插件
    grafana-cli plugins list-remote
    
    id: abhisant-druid-datasource version: 0.0.5
    id: agenty-flowcharting-panel version: 0.9.0
    id: aidanmountford-html-panel version: 0.0.1
    id: akumuli-datasource version: 1.3.11
    id: alexanderzobnin-zabbix-app version: 3.12.4
    id: alexandra-trackmap-panel version: 1.2.5
    id: andig-darksky-datasource version: 1.0.1
    id: aquaqanalytics-kdbadaptor-datasource version: 1.0.1
    id: ayoungprogrammer-finance-datasource version: 1.0.0
    id: belugacdn-app version: 1.2.0
    id: bessler-pictureit-panel version: 1.0.0
    id: blackmirror1-singlestat-math-panel version: 1.1.7
    id: blackmirror1-statusbygroup-panel version: 1.1.1
    id: bosun-app version: 0.0.28
    id: briangann-datatable-panel version: 1.0.2
    id: briangann-gauge-panel version: 0.0.6
    id: btplc-alarm-box-panel version: 1.0.8
    id: btplc-peak-report-panel version: 0.2.4
    id: btplc-status-dot-panel version: 0.2.4
    id: btplc-trend-box-panel version: 0.1.9
    id: camptocamp-prometheus-alertmanager-datasource version: 0.0.8
    id: citilogics-geoloop-panel version: 1.1.1
    id: cloudflare-app version: 0.1.4
    id: cloudspout-button-panel version: 7.0.3
    id: cognitedata-datasource version: 2.0.0
    id: corpglory-progresslist-panel version: 1.0.5
    id: dalmatinerdb-datasource version: 1.0.5
    id: dalvany-image-panel version: 2.1.1
    id: ddurieux-glpi-app version: 1.3.0
    id: devicehive-devicehive-datasource version: 2.0.1
    id: devopsprodigy-kubegraf-app version: 1.4.2
    id: digiapulssi-breadcrumb-panel version: 1.1.6
    id: digiapulssi-organisations-panel version: 1.3.0
    id: digrich-bubblechart-panel version: 1.1.0
    id: doitintl-bigquery-datasource version: 1.0.8
    id: farski-blendstat-panel version: 1.0.2
    id: fastweb-openfalcon-datasource version: 1.0.0
    id: fatcloud-windrose-panel version: 0.7.0
    id: fetzerch-sunandmoon-datasource version: 0.1.6
    id: flant-statusmap-panel version: 0.2.0
    id: foursquare-clouderamanager-datasource version: 0.9.2
    id: fzakaria-simple-annotations-datasource version: 1.0.0
    id: gnocchixyz-gnocchi-datasource version: 1.7.0
    id: goshposh-metaqueries-datasource version: 0.0.3
    id: grafana-azure-data-explorer-datasource version: 2.1.0
    id: grafana-azure-monitor-datasource version: 0.3.0
    id: grafana-clock-panel version: 1.1.1
    id: grafana-googlesheets-datasource version: 1.0.0
    id: grafana-image-renderer version: 2.0.0
    id: grafana-influxdb-08-datasource version: 1.0.2
    id: grafana-influxdb-flux-datasource version: 7.0.0
    id: grafana-kairosdb-datasource version: 3.0.1
    id: grafana-kubernetes-app version: 1.0.1
    id: grafana-piechart-panel version: 1.5.0
    id: grafana-polystat-panel version: 1.2.0
    id: grafana-simple-json-datasource version: 1.4.0
    id: grafana-strava-datasource version: 1.1.1
    id: grafana-worldmap-panel version: 0.3.2
    id: gretamosa-topology-panel version: 1.0.0
    id: gridprotectionalliance-openhistorian-datasource version: 1.0.2
    id: gridprotectionalliance-osisoftpi-datasource version: 1.0.4
    id: hawkular-datasource version: 1.1.1
    id: ibm-apm-datasource version: 0.9.0
    id: instana-datasource version: 2.7.3
    id: jasonlashua-prtg-datasource version: 4.0.3
    id: jdbranham-diagram-panel version: 1.6.2
    id: jeanbaptistewatenberg-percent-panel version: 1.0.6
    id: kentik-app version: 1.3.4
    id: larona-epict-panel version: 1.2.2
    id: linksmart-hds-datasource version: 1.0.1
    id: linksmart-sensorthings-datasource version: 1.3.0
    id: logzio-datasource version: 5.0.0
    id: macropower-analytics-panel version: 1.0.0
    id: magnesium-wordcloud-panel version: 1.0.0
    id: marcuscalidus-svg-panel version: 0.3.3
    id: marcusolsson-hourly-heatmap-panel version: 0.4.1
    id: marcusolsson-treemap-panel version: 0.2.0
    id: michaeldmoore-annunciator-panel version: 1.0.5
    id: michaeldmoore-multistat-panel version: 1.4.1
    id: monasca-datasource version: 1.0.0
    id: monitoringartist-monitoringart-datasource version: 1.0.0
    id: moogsoft-aiops-app version: 8.0.0
    id: mtanda-google-calendar-datasource version: 1.0.4
    id: mtanda-heatmap-epoch-panel version: 0.1.7
    id: mtanda-histogram-panel version: 0.1.6
    id: mxswat-separator-panel version: 1.0.0
    id: natel-discrete-panel version: 0.1.0
    id: natel-influx-admin-panel version: 0.0.5
    id: natel-plotly-panel version: 0.0.6
    id: natel-usgs-datasource version: 0.0.2
    id: neocat-cal-heatmap-panel version: 0.0.3
    id: novalabs-annotations-panel version: 0.0.1
    id: ns1-app version: 0.0.7
    id: ntop-ntopng-datasource version: 1.0.0
    id: opennms-helm-app version: 5.0.1
    id: ovh-warp10-datasource version: 2.2.0
    id: paytm-kapacitor-datasource version: 0.1.2
    id: percona-percona-app version: 1.0.0
    id: petrslavotinek-carpetplot-panel version: 0.1.1
    id: pierosavi-imageit-panel version: 0.1.3
    id: pr0ps-trackmap-panel version: 2.1.0
    id: praj-ams-datasource version: 1.2.0
    id: pue-solr-datasource version: 1.0.2
    id: quasardb-datasource version: 3.8.2
    id: rackerlabs-blueflood-datasource version: 0.0.2
    id: radensolutions-netxms-datasource version: 1.2.2
    id: raintank-snap-app version: 0.0.5
    id: raintank-worldping-app version: 1.2.7
    id: redis-datasource version: 1.1.2
    id: ryantxu-ajax-panel version: 0.0.7-dev
    id: ryantxu-annolist-panel version: 0.0.1
    id: satellogic-3d-globe-panel version: 0.1.0
    id: savantly-heatmap-panel version: 0.2.0
    id: sbueringer-consul-datasource version: 0.1.5
    id: scadavis-synoptic-panel version: 1.0.4
    id: sidewinder-datasource version: 0.2.0
    id: simpod-json-datasource version: 0.2.0
    id: skydive-datasource version: 1.2.0
    id: smartmakers-trafficlight-panel version: 1.0.0
    id: sni-pnp-datasource version: 1.0.5
    id: sni-thruk-datasource version: 1.0.3
    id: snuids-radar-panel version: 1.4.4
    id: snuids-trafficlights-panel version: 1.4.5
    id: spotify-heroic-datasource version: 0.0.1
    id: stagemonitor-elasticsearch-app version: 0.83.2
    id: udoprog-heroic-datasource version: 0.1.0
    id: vertamedia-clickhouse-datasource version: 2.0.2
    id: vertica-grafana-datasource version: 0.1.0
    id: vonage-status-panel version: 1.0.9
    id: voxter-app version: 0.0.1
    id: xginn8-pagerduty-datasource version: 0.2.1
    id: yesoreyeram-boomtable-panel version: 1.3.0
    id: yesoreyeram-boomtheme-panel version: 0.1.0
    id: zuburqan-parity-report-panel version: 1.2.1
    
    

    安装插件

    安装到默认插件路径

    ./grafana-cli --pluginsDir /ups/app/monitor/grafana/data/plugins plugins install grafana-piechart-panel 
    ./grafana-cli --pluginsDir /ups/app/monitor/grafana/data/plugins plugins install grafana-polystat-panel
    ./grafana-cli --pluginsDir /ups/app/monitor/grafana/data/plugins plugins install digiapulssi-breadcrumb-panel 
    

    安装过程截图

    image-20200804165921770

    结果确认

    ./bin/grafana-cli plugins ls
    

    导入模板

    前端界面导入文件

    image-20200804124555116

    后台配置模板路径

    # 1. 解压
    unzip -qo grafana-dashboards-2.9.0.zip
    cd grafana-dashboards-2.9.0
    cp -r dashboards /ups/app/monitor/grafana/grafana-dashboards
    
    # 2. 创建 mysqld_export.yml 文件
    cat > /ups/app/monitor/grafana/conf/provisioning/dashboards/mysqld_export.yml <<-EOF
    apiVersion: 1
    
    providers:
      - name: 'mysqld_exporter'
        orgId: 1
        folder: ''
        type: file
        options:
          path: /ups/app/monitor/grafana/grafana-dashboards
    EOF
    
    # 3. 重启grafana服务
    

    配置Prometheus数据源

    img

    img

    Exporter软件

    在 Prometheus 中负责数据汇报的程序统一叫做 Exporter, 而不同的 Exporter 负责不同的业务。

    软件

    主机监控程序(node_exporter)

    软件部署

    二进制安装
    软件安装
    # 创建用户
    #groupadd -g 2000 prometheus
    useradd -r -M -c "Prometheus agent" -d /ups/app/monitor/ -s /sbin/nologin prometheus
    
    # 解压文件
    mkdir -p /ups/app/monitor/
    tar -xf node_exporter-*.linux-amd64.tar.gz -C /ups/app/monitor/ --no-same-owner
    
    # 重命名目录
    cd /ups/app/monitor/
    mv node_exporter-*.linux-amd64 node_exporter
    
    # 修改目录属主
    # chown -R prometheus.prometheus /ups/app/monitor/node_exporter
    
    配置服务项
    # 配置服务文件
    cat > /usr/lib/systemd/system/node_exporter.service <<-EOF
    [Unit]
    Description=node exporter
    Documentation=https://prometheus.io
    After=network.target
    
    [Service]
    #User=prometheus
    #Group=prometheus
    Restart=on-failure
    ExecStart=/ups/app/monitor/node_exporter/node_exporter --web.listen-address=:9100
    StandardOutput=syslog
    StandardError=syslog
    SyslogIdentifier=node_exporter
    
    [Install]
    WantedBy=multi-user.target
    EOF
    
    • 日志重定向输出到指定文件

      •   cat > /etc/rsyslog.d/node_exporter.conf <<-'EOF'
          if $programname == 'node_exporter' then /ups/app/monitor/node_exporter/node.log
          & stop
          EOF
        
    启动服务
    # 启动服务
    systemctl daemon-reload
    systemctl restart node_exporter.service
    systemctl status node_exporter.service
    
    或
    
    # 启动客户端
    cd /ups/app/monitor/node_exporter
    ./node_exporter &
    
    Docker安装
    docker run -d -p 9100:9100 \
      -v "/proc:/host/proc:ro" \
      -v "/sys:/host/sys:ro" \
      -v "/:/rootfs:ro" \
      --net="host" \
      quay.io/prometheus/node-exporter \
        --path.procfs=/host/proc \
        --path.sysfs=/host/sys \
        --collector.filesystem.ignored-mount-points="^/(sys|proc|dev|host|etc)($|/)"
    

    接入Prometheus监控

    exporter集中式配置
    • 修改prometheus参数文件

    利用 Prometheus 的 static_configs 来拉取 node_exporter 的数据。打开 prometheus.yml 文件, 在 scrape_configs 中添加如下配置

    # 配置prometheus.yml文件
    cat >> /ups/app/monitor/prometheus/config/prometheus.yml <<-'EOF'
    
      - job_name: 'node_exporter'
        scrape_interval: 1s
        file_sd_configs:
          - files:
            - targets/node/nodes-instances.json
            refresh_interval: 10s
        relabel_configs:
        - action: replace
          source_labels: ['__address__']
          regex: (.*):(.*)
          replacement: $1
          target_label: hostname
        - action: labeldrop
          regex: __meta_filepath
    EOF
    
    • 配置主机服务器列表json文件

    vi /ups/app/monitor/prometheus/config/targets/node/nodes-instances.json

    [
      {
        "targets": [ "192.168.10.181:9100","192.168.10.182:9100", "192.168.10.190:9100","192.168.10.191:9100","192.168.10.192:9100"]
      }
    ]
    
    exporter独立配置

    每个监控对象独立一个文件配置

    • 修改Prometheus参数配置文件
    # my global config
    global:
      scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
      evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
      # scrape_timeout is set to the global default (10s).
    
    # Alertmanager configuration
    alerting:
      alertmanagers:
      - static_configs:
        - targets:
          - progs:9093  # 对应启动的alertmanager节点的9093端口
    
    # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
    rule_files:
      - "rules/alert_node.yml"
      - "rules/alert_mysql.yml"
      # - "first_rules.yml"
      # - "second_rules.yml"
    
    # A scrape configuration containing exactly one endpoint to scrape:
    # Here it's Prometheus itself.
    scrape_configs:
      # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
      - job_name: 'prometheus'
    
        # metrics_path defaults to '/metrics'
        # scheme defaults to 'http'.
    
        static_configs:
        - targets: ['localhost:9090']
    
      - job_name: 'node_exporter'
        scrape_interval: 1s
        file_sd_configs:
          - files:
            - targets/node/*.yml
            refresh_interval: 10s
        relabel_configs:
        - action: replace
          source_labels: ['__address__']
          regex: (.*):(.*)
          replacement: $1
          target_label: hostname
        - action: labeldrop
          regex: __meta_filepath
    
    • 配置主机服务器实例文件
    vi /ups/app/monitor/prometheus/config/targets/node/nodes1-instances.yml
    [
      {
        "targets": ["192.168.10.181:9100"],
        "labels": { }
      }
    ]
    
    vi /ups/app/monitor/prometheus/config/targets/node/nodes2-instances.yml
    [
      {
        "targets": ["192.168.10.182:9100"],
        "labels": { }
      }
    ]
    
    重启prometheus加载配置
    # 检查并重新加载配置文件
    ./bin/promtool check config config/prometheus.yml
    # 重启服务
    systemctl restart prometheus
    
    

    访问

    浏览器中访问 http://IP:9100/metrics

    image-20200804123852395

    监控功能

    默认开启的功能
    名称 说明 系统
    arp /proc/net/arp 中收集 ARP 统计信息 Linux
    conntrack /proc/sys/net/netfilter/ 中收集 conntrack 统计信息 Linux
    cpu 收集 cpu 统计信息 Darwin, Dragonfly, FreeBSD, Linux
    diskstats /proc/diskstats 中收集磁盘 I/O 统计信息 Linux
    edac 错误检测与纠正统计信息 Linux
    entropy 可用内核熵信息 Linux
    exec execution 统计信息 Dragonfly, FreeBSD
    filefd /proc/sys/fs/file-nr 中收集文件描述符统计信息 Linux
    filesystem 文件系统统计信息,例如磁盘已使用空间 Darwin, Dragonfly, FreeBSD, Linux, OpenBSD
    hwmon /sys/class/hwmon/ 中收集监控器或传感器数据信息 Linux
    infiniband 从 InfiniBand 配置中收集网络统计信息 Linux
    loadavg 收集系统负载信息 Darwin, Dragonfly, FreeBSD, Linux, NetBSD, OpenBSD, Solaris
    mdadm /proc/mdstat 中获取设备统计信息 Linux
    meminfo 内存统计信息 Darwin, Dragonfly, FreeBSD, Linux
    netdev 网口流量统计信息,单位 bytes Darwin, Dragonfly, FreeBSD, Linux, OpenBSD
    netstat /proc/net/netstat 收集网络统计数据,等同于 netstat -s Linux
    sockstat /proc/net/sockstat 中收集 socket 统计信息 Linux
    stat /proc/stat 中收集各种统计信息,包含系统启动时间,forks, 中断等 Linux
    textfile 通过 --collector.textfile.directory 参数指定本地文本收集路径,收集文本信息 any
    time 系统当前时间 any
    uname 通过 uname 系统调用, 获取系统信息 any
    vmstat /proc/vmstat 中收集统计信息 Linux
    wifi 收集 wifi 设备相关统计数据 Linux
    xfs 收集 xfs 运行时统计信息 Linux (kernel 4.4+)
    zfs 收集 zfs 性能统计信息 Linux
    默认关闭功能
    名称 说明 系统
    bonding 收集系统配置以及激活的绑定网卡数量 Linux
    buddyinfo /proc/buddyinfo 中收集内存碎片统计信息 Linux
    devstat 收集设备统计信息 Dragonfly, FreeBSD
    drbd 收集远程镜像块设备(DRBD)统计信息 Linux
    interrupts 收集更具体的中断统计信息 Linux,OpenBSD
    ipvs /proc/net/ip_vs 中收集 IPVS 状态信息,从 /proc/net/ip_vs_stats 获取统计信息 Linux
    ksmd /sys/kernel/mm/ksm 中获取内核和系统统计信息 Linux
    logind logind 中收集会话统计信息 Linux
    meminfo_numa /proc/meminfo_numa 中收集内存统计信息 Linux
    mountstats /proc/self/mountstat 中收集文件系统统计信息,包括 NFS 客户端统计信息 Linux
    nfs /proc/net/rpc/nfs 中收集 NFS 统计信息,等同于 nfsstat -c Linux
    qdisc 收集排队规则(queuing discipline)统计信息 Linux
    runit 收集 runit 状态信息 any
    supervisord 收集 supervisord 状态信息 any
    systemd systemd 中收集设备系统状态信息 Linux
    tcpstat /proc/net/tcp 和 /proc/net/tcp6 中收集 TCP 连接状态信息 Linux

    监控MySQL

    在MySQL数据库服务器上安装mysqld_exporter

    安装exporter软件

    # 创建用户
    # groupadd -g 2000 prometheus
    useradd -u 2000 -M -c "Prometheus agent" -s /sbin/nologin prometheus
    
    # 解压文件
    mkdir -p /ups/app/monitor/
    tar -xf mysqld_exporter-0.12.1.linux-amd64.tar.gz -C /ups/app/monitor/
    
    # 重命名目录
    cd /ups/app/monitor/
    mv mysqld_exporter-0.12.1.linux-amd64 mysqld_exporter
    
    # 修改目录属主
    chown -R prometheus.prometheus /ups/app/monitor/mysqld_exporter
    
    创建MySQL监控用户

    在待监控MySQL上创建用户

    CREATE USER 'monitor'@'localhost' IDENTIFIED BY 'monitor';
    GRANT PROCESS, REPLICATION CLIENT, SELECT ON *.* TO 'monitor'@'localhost';
    CREATE USER 'monitor'@'192.168.10.%' IDENTIFIED BY 'monitor';
    GRANT PROCESS, REPLICATION CLIENT, SELECT ON *.* TO 'monitor'@'192.168.10.%';
    flush privileges;
    
    配置客户端账号密码文件
    cat > /ups/app/monitor/mysqld_exporter/.my.cnf <<EOF
    [client]
    user=monitor
    password=monitor
    port=3308
    socket=/ups/app/mysql/mysql3308/logs/mysql3308.sock
    host=progs
    EOF
    
    chmod 400 /ups/app/monitor/mysqld_exporter/.my.cnf
    chown prometheus:prometheus /ups/app/monitor/mysqld_exporter/.my.cnf
    
    配置服务
    # 配置服务文件
    cat > /usr/lib/systemd/system/mysql_exporter.service <<-EOF
    [Unit]
    Description=mysqld exporter
    Documentation=https://prometheus.io
    After=network.target
    After=postgresql-12.service mysql3308.service mysql.service
    
    [Service]
    Restart=on-failure
    # ExecStart=/ups/app/monitor/mysqld_exporter/mysqld_exporter --config.my-cnf=/ups/app/monitor/mysqld_exporter/.my.cnf
    
    ExecStart=/ups/app/monitor/mysqld_exporter/mysqld_exporter \
                --config.my-cnf=/ups/app/monitor/mysqld_exporter/.my.cnf \
                --collect.info_schema.innodb_tablespaces \
                --collect.info_schema.innodb_metrics \
                --collect.perf_schema.tableiowaits \
                --collect.perf_schema.indexiowaits \
                --collect.perf_schema.tablelocks \
                --collect.engine_innodb_status \
                --collect.perf_schema.file_events \
                --collect.binlog_size \
                --collect.info_schema.clientstats \
                --collect.perf_schema.eventswaits
    
    StandardOutput=syslog
    StandardError=syslog
    SyslogIdentifier=mysqld_exporter
    
    [Install]
    WantedBy=multi-user.target
    EOF
    
    • 日志重定向输出到指定文件

      •   # EOF 加引号,防止 shell 将 $programname 展开为空
          cat > /etc/rsyslog.d/mysqld_exporter.conf <<-'EOF'
          if $programname == 'mysqld_exporter' then /ups/app/monitor/mysqld_exporter/node.log
          & stop
          EOF
        
    启动服务
    # 启动服务
    systemctl daemon-reload
    systemctl restart mysql_exporter.service
    systemctl status mysql_exporter.service
    
    或
    
    # 启动客户端
    ./mysqld_exporter --config.my-cnf=/ups/app/monitor/mysqld_exporter/.my.cnf
    
    # 默认端口:9104
    lsof -i :9104
    netstat -tnlp|grep ':9104'
    
    验证

    http://192.168.10.181:9104/metrics

    img

    加入到Prometheus监控(Prometheus Server端)

    # 配置prometheus.yml文件
    cat >> /ups/app/monitor/prometheus/config/prometheus.yml <<-EOF
    
      - job_name: 'MySQL'
        static_configs:
        - targets: ['progs:9104','192.168.10.181:9104']
    
    EOF
    
    
    重启prometheus
    # 检查并重新加载配置文件
    ./bin/promtool check config config/prometheus.yml
    # 重启服务
    systemctl restart prometheus
    
    验证
    http://192.168.10.181:9090/targets
    
    

    img

    监控PostgreSQL

    软件部署

    下载地址
    wget -c https://github.com/wrouesnel/postgres_exporter/releases/download/v0.8.0/postgres_exporter_v0.8.0_linux-amd64.tar.gz
    
    安装
    二进制包安装
    • 解压
    tar -xf postgres_exporter_v0.8.0_linux-amd64.tar.gz -C /ups/app/monitor
    mv postgres_exporter* postgres_exporter
    
    • 配置服务项
    # 配置服务文件
    cat > /usr/lib/systemd/system/postgres_exporter.service <<-EOF
    [Unit]
    Description=PostgreSQL Exporter
    Documentation=https://github.com/wrouesnel/postgres_exporter
    After=network.target
    
    [Service]
    Type=simple
    
    User=postgres
    Group=postgres
    
    # DATA_SOURCE_NAME="postgresql://postgres:postgres@localhost:5432/postgres?sslmode=disable";
    Environment="DATA_SOURCE_PASS_FILE=/home/postgres/.pgpass"
    Environment="DATA_SOURCE_NAME=postgresql://postgres@localhost:5432/postgres?sslmode=disable"
    Environment="PG_EXPORTER_EXTEND_QUERY_PATH=/ups/app/monitor/postgres_exporter/queries.yaml"
    WorkingDirectory=/ups/app/monitor/postgres_exporter
    
    # 调试时可在 ExecStart 末尾追加 --log.level=debug(systemd 不支持行内注释,不能用 # 写在同一行)
    ExecStart=/ups/app/monitor/postgres_exporter/postgres_exporter --web.listen-address=:9187
    
    Restart=on-failure
    
    StandardOutput=syslog
    StandardError=syslog
    SyslogIdentifier=postgres_exporter
    
    [Install]
    WantedBy=multi-user.target
    EOF
    
    • 配置自定义查询语句文件

    需要开启pg_stat_statements插件

    vi /ups/app/monitor/postgres_exporter/queries.yaml

    pg_replication:
      query: "SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp())) as lag"
      master: true
      metrics:
        - lag:
            usage: "GAUGE"
            description: "Replication lag behind master in seconds"
    
    pg_postmaster:
      query: "SELECT pg_postmaster_start_time as start_time_seconds from pg_postmaster_start_time()"
      master: true
      metrics:
        - start_time_seconds:
            usage: "GAUGE"
            description: "Time at which postmaster started"
    
    pg_stat_user_tables:
      query: "SELECT current_database() datname, schemaname, relname, seq_scan, seq_tup_read, idx_scan, idx_tup_fetch, n_tup_ins, n_tup_upd, n_tup_del, n_tup_hot_upd, n_live_tup, n_dead_tup, n_mod_since_analyze, COALESCE(last_vacuum, '1970-01-01Z'), COALESCE(last_vacuum, '1970-01-01Z') as last_vacuum, COALESCE(last_autovacuum, '1970-01-01Z') as last_autovacuum, COALESCE(last_analyze, '1970-01-01Z') as last_analyze, COALESCE(last_autoanalyze, '1970-01-01Z') as last_autoanalyze, vacuum_count, autovacuum_count, analyze_count, autoanalyze_count FROM pg_stat_user_tables"
      metrics:
        - datname:
            usage: "LABEL"
            description: "Name of current database"
        - schemaname:
            usage: "LABEL"
            description: "Name of the schema that this table is in"
        - relname:
            usage: "LABEL"
            description: "Name of this table"
        - seq_scan:
            usage: "COUNTER"
            description: "Number of sequential scans initiated on this table"
        - seq_tup_read:
            usage: "COUNTER"
            description: "Number of live rows fetched by sequential scans"
        - idx_scan:
            usage: "COUNTER"
            description: "Number of index scans initiated on this table"
        - idx_tup_fetch:
            usage: "COUNTER"
            description: "Number of live rows fetched by index scans"
        - n_tup_ins:
            usage: "COUNTER"
            description: "Number of rows inserted"
        - n_tup_upd:
            usage: "COUNTER"
            description: "Number of rows updated"
        - n_tup_del:
            usage: "COUNTER"
            description: "Number of rows deleted"
        - n_tup_hot_upd:
            usage: "COUNTER"
            description: "Number of rows HOT updated (i.e., with no separate index update required)"
        - n_live_tup:
            usage: "GAUGE"
            description: "Estimated number of live rows"
        - n_dead_tup:
            usage: "GAUGE"
            description: "Estimated number of dead rows"
        - n_mod_since_analyze:
            usage: "GAUGE"
            description: "Estimated number of rows changed since last analyze"
        - last_vacuum:
            usage: "GAUGE"
            description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)"
        - last_autovacuum:
            usage: "GAUGE"
            description: "Last time at which this table was vacuumed by the autovacuum daemon"
        - last_analyze:
            usage: "GAUGE"
            description: "Last time at which this table was manually analyzed"
        - last_autoanalyze:
            usage: "GAUGE"
            description: "Last time at which this table was analyzed by the autovacuum daemon"
        - vacuum_count:
            usage: "COUNTER"
            description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)"
        - autovacuum_count:
            usage: "COUNTER"
            description: "Number of times this table has been vacuumed by the autovacuum daemon"
        - analyze_count:
            usage: "COUNTER"
            description: "Number of times this table has been manually analyzed"
        - autoanalyze_count:
            usage: "COUNTER"
            description: "Number of times this table has been analyzed by the autovacuum daemon"
    
    pg_statio_user_tables:
      query: "SELECT current_database() datname, schemaname, relname, heap_blks_read, heap_blks_hit, idx_blks_read, idx_blks_hit, toast_blks_read, toast_blks_hit, tidx_blks_read, tidx_blks_hit FROM pg_statio_user_tables"
      metrics:
        - datname:
            usage: "LABEL"
            description: "Name of current database"
        - schemaname:
            usage: "LABEL"
            description: "Name of the schema that this table is in"
        - relname:
            usage: "LABEL"
            description: "Name of this table"
        - heap_blks_read:
            usage: "COUNTER"
            description: "Number of disk blocks read from this table"
        - heap_blks_hit:
            usage: "COUNTER"
            description: "Number of buffer hits in this table"
        - idx_blks_read:
            usage: "COUNTER"
            description: "Number of disk blocks read from all indexes on this table"
        - idx_blks_hit:
            usage: "COUNTER"
            description: "Number of buffer hits in all indexes on this table"
        - toast_blks_read:
            usage: "COUNTER"
            description: "Number of disk blocks read from this table's TOAST table (if any)"
        - toast_blks_hit:
            usage: "COUNTER"
            description: "Number of buffer hits in this table's TOAST table (if any)"
        - tidx_blks_read:
            usage: "COUNTER"
            description: "Number of disk blocks read from this table's TOAST table indexes (if any)"
        - tidx_blks_hit:
            usage: "COUNTER"
            description: "Number of buffer hits in this table's TOAST table indexes (if any)"
            
    pg_database:
      query: "SELECT pg_database.datname, pg_database_size(pg_database.datname) as size FROM pg_database"
      master: true
      cache_seconds: 30
      metrics:
        - datname:
            usage: "LABEL"
            description: "Name of the database"
        - size_bytes:
            usage: "GAUGE"
            description: "Disk space used by the database"
    
    pg_stat_statements:
      query: "SELECT t2.rolname, t3.datname, queryid, calls, total_time / 1000 as total_time_seconds, min_time / 1000 as min_time_seconds, max_time / 1000 as max_time_seconds, mean_time / 1000 as mean_time_seconds, stddev_time / 1000 as stddev_time_seconds, rows, shared_blks_hit, shared_blks_read, shared_blks_dirtied, shared_blks_written, local_blks_hit, local_blks_read, local_blks_dirtied, local_blks_written, temp_blks_read, temp_blks_written, blk_read_time / 1000 as blk_read_time_seconds, blk_write_time / 1000 as blk_write_time_seconds FROM pg_stat_statements t1 join pg_roles t2 on (t1.userid=t2.oid) join pg_database t3 on (t1.dbid=t3.oid)"
      master: true
      metrics:
        - rolname:
            usage: "LABEL"
            description: "Name of user"
        - datname:
            usage: "LABEL"
            description: "Name of database"
        - queryid:
            usage: "LABEL"
            description: "Query ID"
        - calls:
            usage: "COUNTER"
            description: "Number of times executed"
        - total_time_seconds:
            usage: "COUNTER"
            description: "Total time spent in the statement, in milliseconds"
        - min_time_seconds:
            usage: "GAUGE"
            description: "Minimum time spent in the statement, in milliseconds"
        - max_time_seconds:
            usage: "GAUGE"
            description: "Maximum time spent in the statement, in milliseconds"
        - mean_time_seconds:
            usage: "GAUGE"
            description: "Mean time spent in the statement, in milliseconds"
        - stddev_time_seconds:
            usage: "GAUGE"
            description: "Population standard deviation of time spent in the statement, in milliseconds"
        - rows:
            usage: "COUNTER"
            description: "Total number of rows retrieved or affected by the statement"
        - shared_blks_hit:
            usage: "COUNTER"
            description: "Total number of shared block cache hits by the statement"
        - shared_blks_read:
            usage: "COUNTER"
            description: "Total number of shared blocks read by the statement"
        - shared_blks_dirtied:
            usage: "COUNTER"
            description: "Total number of shared blocks dirtied by the statement"
        - shared_blks_written:
            usage: "COUNTER"
            description: "Total number of shared blocks written by the statement"
        - local_blks_hit:
            usage: "COUNTER"
            description: "Total number of local block cache hits by the statement"
        - local_blks_read:
            usage: "COUNTER"
            description: "Total number of local blocks read by the statement"
        - local_blks_dirtied:
            usage: "COUNTER"
            description: "Total number of local blocks dirtied by the statement"
        - local_blks_written:
            usage: "COUNTER"
            description: "Total number of local blocks written by the statement"
        - temp_blks_read:
            usage: "COUNTER"
            description: "Total number of temp blocks read by the statement"
        - temp_blks_written:
            usage: "COUNTER"
            description: "Total number of temp blocks written by the statement"
        - blk_read_time_seconds:
            usage: "COUNTER"
            description: "Total time the statement spent reading blocks, in milliseconds (if track_io_timing is enabled, otherwise zero)"
        - blk_write_time_seconds:
            usage: "COUNTER"
            description: "Total time the statement spent writing blocks, in milliseconds (if track_io_timing is enabled, otherwise zero)"
    
    • 日志重定向输出到指定文件
    # EOF 加引号,防止 shell 将 $programname 展开为空
    cat > /etc/rsyslog.d/postgres_exporter.conf <<-'EOF'
    if $programname == 'postgres_exporter' then /ups/app/monitor/postgres_exporter/exporter.log
    & stop
    EOF
    
    • 启动服务
    # 启动服务
    systemctl daemon-reload
    systemctl restart postgres_exporter.service
    systemctl status postgres_exporter.service
    
    
    # 命令行启动客户端-- postgresql://postgres:password@localhost:5432/postgres
    export DATA_SOURCE_PASS_FILE="/home/postgres/.pgpass"
    export DATA_SOURCE_NAME="postgresql://postgres@localhost:5432/postgres?sslmode=disable"
    export PG_EXPORTER_EXTEND_QUERY_PATH="/ups/app/monitor/postgres_exporter/queries.yaml"
    
    /ups/app/monitor/postgres_exporter/postgres_exporter &
    
    Docker安装
    docker run --net=host -e DATA_SOURCE_NAME="postgresql://postgres:password@localhost:5432/postgres?sslmode=disable" wrouesnel/postgres_exporter
    

    接入Prometheus监控

    添加配置Prometheus文件

      - job_name: 'postgres_exporter'
        scrape_interval: 1s
        file_sd_configs:
          - files:
            - targets/postgresql/*.yml
            refresh_interval: 10s
        relabel_configs:
        - action: replace
          source_labels: ['__address__']
          regex: (.*):(.*)
          replacement: $1
          target_label: hostname
        - action: labeldrop
          regex: __meta_filepath
    

    添加监控对象

    vi targets/postgresql/pg1-instance.yml

    [
      {
        "targets": ["localhost:9187"],
        "labels": { "instance": 'postgres:5432' }
      }
    ]
    
    告警规则文件

    vi rules/alert_pg.yml

    ---
    groups:
      - name: PostgreSQL
        rules:
        - alert: PostgreSQLMaxConnectionsReached
          expr: sum(pg_stat_activity_count) by (instance) > sum(pg_settings_max_connections) by (instance)
          for: 1m
          labels:
            severity: email
          annotations:
            summary: "{{ $labels.instance }} has maxed out Postgres connections."
            description: "{{ $labels.instance }} is exceeding the currently configured maximum Postgres connection limit (current value: {{ $value }}s). Services may be degraded - please take immediate action (you probably need to increase max_connections in the Docker image and re-deploy."
    
        - alert: PostgreSQLHighConnections
          expr: sum(pg_stat_activity_count) by (instance) > sum(pg_settings_max_connections * 0.8) by (instance)
          for: 10m
          labels:
            severity: email
          annotations:
            summary: "{{ $labels.instance }} is over 80% of max Postgres connections."
            description: "{{ $labels.instance }} is exceeding 80% of the currently configured maximum Postgres connection limit (current value: {{ $value }}s). Please check utilization graphs and confirm if this is normal service growth, abuse or an otherwise temporary condition or if new resources need to be provisioned (or the limits increased, which is mostly likely)."
    
        - alert: PostgreSQLDown
          expr: pg_up != 1
          for: 1m
          labels:
            severity: email
          annotations:
            summary: "PostgreSQL is not processing queries: {{ $labels.instance }}"
            description: "{{ $labels.instance }} is rejecting query requests from the exporter, and thus probably not allowing DNS requests to work either. User services should not be effected provided at least 1 node is still alive."
    
        - alert: PostgreSQLSlowQueries
          expr: avg(rate(pg_stat_activity_max_tx_duration{datname!~"template.*"}[2m])) by (datname) > 2 * 60
          for: 2m
          labels:
            severity: email
          annotations:
            summary: "PostgreSQL high number of slow on {{ $labels.cluster }} for database {{ $labels.datname }} "
            description: "PostgreSQL high number of slow queries {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }} "
    
        - alert: PostgreSQLQPS
          expr: avg(irate(pg_stat_database_xact_commit{datname!~"template.*"}[5m]) + irate(pg_stat_database_xact_rollback{datname!~"template.*"}[5m])) by (datname) > 10000
          for: 5m
          labels:
            severity: email
          annotations:
            summary: "PostgreSQL high number of queries per second {{ $labels.cluster }} for database {{ $labels.datname }}"
            description: "PostgreSQL high number of queries per second on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}"
    
        - alert: PostgreSQLCacheHitRatio
          expr: avg(rate(pg_stat_database_blks_hit{datname!~"template.*"}[5m]) / (rate(pg_stat_database_blks_hit{datname!~"template.*"}[5m]) + rate(pg_stat_database_blks_read{datname!~"template.*"}[5m]))) by (datname) < 0.98
          for: 5m
          labels:
            severity: email
          annotations:
            summary: "PostgreSQL low cache hit rate on {{ $labels.cluster }} for database {{ $labels.datname }}"
            description: "PostgreSQL low on cache hit rate on {{ $labels.cluster }} for database {{ $labels.datname }} with a value of {{ $value }}"
    
    non-superuser收集指标所需权限
    DATA_SOURCE_NAME=postgresql://postgres_exporter:password@localhost:5432/postgres?sslmode=disable
    
    -- To use IF statements, hence to be able to check if the user exists before
    -- attempting creation, we need to switch to procedural SQL (PL/pgSQL)
    -- instead of standard SQL.
    -- More: https://www.postgresql.org/docs/9.3/plpgsql-overview.html
    -- To preserve compatibility with <9.0, DO blocks are not used; instead,
    -- a function is created and dropped.
    CREATE OR REPLACE FUNCTION __tmp_create_user() returns void as $$
    BEGIN
      IF NOT EXISTS (
              SELECT                       -- SELECT list can stay empty for this
              FROM   pg_catalog.pg_user
              WHERE  usename = 'postgres_exporter') THEN
        CREATE USER postgres_exporter;
      END IF;
    END;
    $$ language plpgsql;
    
    SELECT __tmp_create_user();
    DROP FUNCTION __tmp_create_user();
    
    ALTER USER postgres_exporter WITH PASSWORD 'password';
    ALTER USER postgres_exporter SET SEARCH_PATH TO postgres_exporter,pg_catalog;
    
    -- If deploying as non-superuser (for example in AWS RDS), uncomment the GRANT
    -- line below and replace <MASTER_USER> with your root user.
    -- GRANT postgres_exporter TO <MASTER_USER>;
    CREATE SCHEMA IF NOT EXISTS postgres_exporter;
    GRANT USAGE ON SCHEMA postgres_exporter TO postgres_exporter;
    GRANT CONNECT ON DATABASE postgres TO postgres_exporter;
    
    CREATE OR REPLACE FUNCTION get_pg_stat_activity() RETURNS SETOF pg_stat_activity AS
    $$ SELECT * FROM pg_catalog.pg_stat_activity; $$
    LANGUAGE sql
    VOLATILE
    SECURITY DEFINER;
    
    CREATE OR REPLACE VIEW postgres_exporter.pg_stat_activity
    AS
      SELECT * from get_pg_stat_activity();
    
    GRANT SELECT ON postgres_exporter.pg_stat_activity TO postgres_exporter;
    
    CREATE OR REPLACE FUNCTION get_pg_stat_replication() RETURNS SETOF pg_stat_replication AS
    $$ SELECT * FROM pg_catalog.pg_stat_replication; $$
    LANGUAGE sql
    VOLATILE
    SECURITY DEFINER;
    
    CREATE OR REPLACE VIEW postgres_exporter.pg_stat_replication
    AS
      SELECT * FROM get_pg_stat_replication();
    
    GRANT SELECT ON postgres_exporter.pg_stat_replication TO postgres_exporter;
    
    重新加载配置
    # 前提:Prometheus 启动时已启用 --web.enable-lifecycle 选项,才能通过 HTTP 接口重新加载配置
    curl -X POST http://localhost:9090/-/reload
    或
    systemctl reload prometheus
    

    监控redis

    软件部署

    下载地址
    wget -c https://github.com/oliver006/redis_exporter/releases/download/v1.9.0/redis_exporter-v1.9.0.linux-amd64.tar.gz
    
    安装
    二进制包安装
    • 解压
    tar -xf redis_exporter-v1.9.0.linux-amd64.tar.gz -C /ups/app/monitor/
    mv redis_exporter-* redis_exporter
    
    
    • 配置服务项
    # 配置服务文件
    cat > /usr/lib/systemd/system/redis_exporter.service <<-EOF
    [Unit]
    Description=Redis Exporter
    Documentation=https://github.com/oliver006/redis_exporter
    After=network.target
    
    [Service]
    #User=prometheus
    #Group=prometheus
    Restart=on-failure
    ExecStart=/ups/app/monitor/redis_exporter/redis_exporter -redis-only-metrics --web.listen-address=:9121
    StandardOutput=syslog
    StandardError=syslog
    SyslogIdentifier=redis_exporter
    
    [Install]
    WantedBy=multi-user.target
    EOF
    
    • 日志重定向输出到指定文件
    # EOF 加引号,防止 shell 将 $programname 展开为空
    cat > /etc/rsyslog.d/redis_exporter.conf <<-'EOF'
    if $programname == 'redis_exporter' then /ups/app/monitor/redis_exporter/exporter.log
    & stop
    EOF
    
    • 启动服务
    # 启动服务
    systemctl daemon-reload
    systemctl restart redis_exporter.service
    systemctl status redis_exporter.service
    
    
    # 命令行启动客户端
    cd /ups/app/monitor/redis_exporter
    ./redis_exporter &
    
    Docker安装
    docker run -d --name redis_exporter -p 9121:9121 oliver006/redis_exporter
    

    接入Prometheus监控

    配置prometheus.yml文件

    添加redis数据采集项

    • 集中式配置
    scrape_configs:
      - job_name: 'redis_exporter'
        file_sd_configs:
          - files:
            - targets/redis/redis-instances.json
        metrics_path: /scrape
        relabel_configs:
          - action: replace
            source_labels: ['__address__']
            regex: (.*):(.*):(.*)
            replacement: $2
            target_label: hostip
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
          - target_label: __address__
            replacement: 192.168.10.181:9121
    
      ## config for scraping the exporter itself
      - job_name: 'redis_exporter_single'
        static_configs:
          - targets:
            - 192.168.10.181:9121
    

    配置redis服务器json文件

    vi targets/redis/redis-instances.json

    [
      {
        "targets": [ "redis://192.168.10.181:6379", "redis://192.168.10.151:6379"],
        "labels": { }
      }
    ]
    

    ​ 带密码URI格式:redis://host:<<PASSWORD>>@<<HOSTNAME>>:<<PORT>>

    • 独立配置
    # my global config
    global:
      scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
      evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
      # scrape_timeout is set to the global default (10s).
    
    # Alertmanager configuration
    alerting:
      alertmanagers:
      - static_configs:
        - targets:
          - progs:9093  # 对应启动的alertmanager节点的9093端口
    
    # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
    rule_files:
      - "rules/alert_node.yml"
      - "rules/alert_mysql.yml"
      # - "first_rules.yml"
      # - "second_rules.yml"
    
    # A scrape configuration containing exactly one endpoint to scrape:
    # Here it's Prometheus itself.
    scrape_configs:
      # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
      - job_name: 'prometheus'
    
        # metrics_path defaults to '/metrics'
        # scheme defaults to 'http'.
    
        static_configs:
        - targets: ['localhost:9090']
    
      - job_name: 'node_exporter'
        scrape_interval: 1s
        file_sd_configs:
          - files:
            - targets/node/*.yml
            refresh_interval: 10s
        relabel_configs:
        - action: replace
          source_labels: ['__address__']
          regex: (.*):(.*)
          replacement: $1
          target_label: hostname
        - action: labeldrop
          regex: __meta_filepath
    
      - job_name: 'redis_exporter'
        scrape_interval: 1s
        file_sd_configs:
          - files:
            - targets/redis/*.yml
        metrics_path: /scrape
        relabel_configs:
          - action: replace
            source_labels: ['__address__']
            regex: (.*):(.*):(.*)
            replacement: $2
            target_label: hostip
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
          - target_label: __address__
            replacement: 192.168.10.181:9121
    

    配置redis服务器实例文件(每个实例一个 .yml 文件,内容沿用 JSON 写法)

    vi targets/redis/redis1_exporter.yml
    [
      {
        "targets": [ "redis://192.168.10.181:6379"],
        "labels": { }
      }
    ]
    
    vi targets/redis/redis2_exporter.yml
    [
      {
        "targets": [ "redis://192.168.10.151:6379"],
        "labels": { }
      }
    ]
    
    重启prometheus加载配置
    # 检查并重新加载配置文件
    ./bin/promtool check config config/prometheus.yml
    # 重启服务
    systemctl restart prometheus
    

    告警组件

    在 Prometheus 中告警分为两部分:

    • Prometheus 服务根据所设置的告警规则将告警信息发送给 Alertmanager。
    • Alertmanager 对收到的告警信息进行处理,包括去重,降噪,分组,策略路由告警通知。

    使用告警服务主要的步骤如下:

    • 下载配置 Alertmanager。
    • 在 prometheus.yml 的 alerting.alertmanagers 中配置 Alertmanager 地址(Prometheus 1.x 使用 -alertmanager.url 启动参数,2.x 已移除该参数),让 Prometheus 服务与 Alertmanager 进行通信。
    • 在 Prometheus 服务中设置告警规则。

    安装告警管理模块软件

    二进制安装

    mkdir -p /ups/app/monitor/
    # 解压
    tar -xf alertmanager-0.21.0.linux-amd64.tar.gz -C /ups/app/monitor/ --no-same-owner
    cd /ups/app/monitor/
    mv alertmanager-0.21.0.linux-amd64/ alertmanager
    
    # 创建用户
    # groupadd -g 2000 prometheus
    useradd -r -M -s /sbin/nologin -d /ups/app/monitor/alertmanager -c "Prometheus agent" prometheus
    
    # 创建目录
    cd /ups/app/monitor/
    mkdir -p alertmanager/{bin,logs,config,data}
    cd alertmanager
    mv alertmanager.yml config/
    mv alertmanager amtool bin/
    
    # 修改目录属主
    chown -R prometheus.prometheus /ups/app/monitor/alertmanager
    
    配置服务项
    # Install the systemd unit for Alertmanager.
    # The heredoc delimiter is quoted so the shell writes the body verbatim: the
    # trailing backslashes reach the unit file, where systemd joins the continued
    # lines into one ExecStart command. Without them only the binary path would be
    # taken as the command and the flags would not be passed.
    cat > /usr/lib/systemd/system/alertmanager.service <<-'EOF'
    [Unit]
    Description=alertmanager
    Documentation=https://prometheus.io/
    After=network.target
    
    [Service]
    Type=simple
    User=prometheus
    Group=prometheus
    ExecStart=/ups/app/monitor/alertmanager/bin/alertmanager \
            --config.file=/ups/app/monitor/alertmanager/config/alertmanager.yml \
            --web.listen-address=192.168.10.181:9093 \
            --cluster.listen-address=0.0.0.0:8001 \
            --storage.path=/ups/app/monitor/alertmanager/data \
            --log.level=info
    Restart=on-failure
    LimitNOFILE=65536
    
    [Install]
    WantedBy=multi-user.target
    EOF
    
    基本配置

    cat /ups/app/monitor/alertmanager/config/alertmanager.yml

    global:
      resolve_timeout: 5m
    
    route:
      group_by: ['alertname']
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 1h
      receiver: 'web.hook'
    receivers:
    - name: 'web.hook'
      webhook_configs:
      - url: 'http://127.0.0.1:5001/'
    inhibit_rules:
      - source_match:
          severity: 'critical'
        target_match:
          severity: 'warning'
        equal: ['alertname', 'dev', 'instance']
    
    启动服务
    # Reload systemd unit definitions so the new alertmanager.service is known
    systemctl daemon-reload
    # Enable the service at boot, start it now, then display its current state
    systemctl enable alertmanager.service 
    systemctl start alertmanager.service 
    systemctl status alertmanager
    

    案例

    通过企业微信接收告警

    准备工作
    • 注册企业微信账号
    • 创建第三方应用:点击“创建应用”按钮 -> 填写应用信息
    详细配置
    prometheus 配置

    vi /ups/app/monitor/prometheus/config/prometheus.yml

    # my global config
    global:
      scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
      evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
      # scrape_timeout is set to the global default (10s).
    
    # Alertmanager configuration
    # The target has to match the address Alertmanager is started with
    # (--web.listen-address=192.168.10.181:9093 in alertmanager.service). It is
    # bound to that address only, so localhost:9093 would not be reachable.
    alerting:
      alertmanagers:
      - static_configs:
        - targets:
          - 192.168.10.181:9093
    
    # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
    rule_files:
      - "rules.yml"
      # - "first_rules.yml"
      # - "second_rules.yml"
    
    # A scrape configuration containing exactly one endpoint to scrape:
    # Here it's Prometheus itself.
    scrape_configs:
      # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
      - job_name: 'prometheus'
    
        # metrics_path defaults to '/metrics'
        # scheme defaults to 'http'.
    
        static_configs:
        - targets: ['localhost:9090']
    
      # The server_status alert rule selects on job="node", so this job name
      # must stay in sync with the expression in rules.yml.
      - job_name: 'node'
        static_configs:
        - targets: ['localhost:9100']
    
    

    rules.yml 配置

    # Write the alert rule file next to prometheus.yml.
    # The heredoc delimiter is quoted so the shell leaves $labels alone; with an
    # unquoted delimiter "$labels" is expanded (normally to an empty string) and
    # the template written to the file becomes "{{ .instance }}".
    cat > /ups/app/monitor/prometheus/config/rules.yml <<-'EOF'
    groups:
    - name: node
      rules:
      - alert: server_status
        expr: up{job="node"} == 0
        for: 15s
        annotations:
          summary: "机器 {{ $labels.instance }} 挂了"
    EOF
    
    alertmanager 配置
    # Overwrite alertmanager.yml so the default route sends every alert to the
    # 'wechat' receiver defined below.
    # NOTE(review): corp_id, agent_id and api_secret are literal example values
    # embedded in the file; substitute the ones of your own WeChat Work
    # application and avoid publishing a real secret.
    cat > /ups/app/monitor/alertmanager/config/alertmanager.yml <<-EOF
    global:
      resolve_timeout: 5m
    
    route:
      group_by: ['alertname']
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 1h
      receiver: 'wechat'
    receivers:
    - name: 'wechat'
      wechat_configs:
      - corp_id: 'ww9e5158867cf67d24'
        to_party: '1'
        agent_id: '1000002'
        api_secret: 'eRDqnTEOtlk2DtPiaxOA2w5fFyNhpIPkdQU-6Ty94cI'
    EOF
    

    参数说明:

    • corp_id: 企业微信账号唯一 ID, 可以在我的企业中查看。
    • to_party: 需要发送的组。
    • agent_id: 第三方企业应用的 ID,可以在自己创建的第三方企业应用详情页面查看。
    • api_secret: 第三方企业应用的密钥,可以在自己创建的第三方企业应用详情页面查看。

    附录

    参考文档

  • 相关阅读:
    python第八课
    python第七课
    python第六课
    python第五课
    Python基础30类-内置函数实现迭代器协议
    Python基础29类-内置函数(__format__,__slots__,__doc__,__module__,__del__,__call__)
    Python基础28类-内置函数(__getattribute__,__getitem__,__setitem__.__delittem__)
    Python基础27类-包装、组合方式授权、判断对象类型的方法
    Python基础26类-内置函数__setattr__,__getattr__,__delattr__
    Python基础25类-反射
  • 原文地址:https://www.cnblogs.com/binliubiao/p/13442823.html
Copyright © 2011-2022 走看看