  • Deploying Prometheus and related components on Kubernetes

    1. Components of the Kubernetes Prometheus deployment (a combined apply sketch follows the component list)

    • prometheus-deployment.yaml #Deploys Prometheus

    • prometheus-configmap.yaml #Prometheus configuration file, mainly the Kubernetes service-discovery settings

    • prometheus-rules.yaml #Prometheus alerting rules

    • grafana.yaml #Grafana for visualization

    • node-exporter.yml #Collects node-level metrics; deployed as a DaemonSet and annotated so that Prometheus scrapes it

    • kube-state-metrics.yaml #Collects Kubernetes object metrics and is annotated so that Prometheus scrapes it

    • alertmanager-configmap.yaml #Alertmanager configuration file (mail sender and recipients)

    • alertmanager-deployment.yaml #Deploys the Alertmanager alerting component
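
    • Assuming all eight manifests target the ops namespace (the prometheus-* files shown below do), they can be applied in one pass; a combined sketch of the apply order (the walkthrough in section 2 only applies the prometheus-* files, and the remaining components are applied the same way):

      kubectl create namespace ops
      kubectl apply -f prometheus-configmap.yaml -f prometheus-rules.yaml -f prometheus-deployment.yaml
      kubectl apply -f node-exporter.yml -f kube-state-metrics.yaml
      kubectl apply -f alertmanager-configmap.yaml -f alertmanager-deployment.yaml
      kubectl apply -f grafana.yaml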

    2. Example deployment

    • Write the configuration files

      [root@k8s-master prometheus]# cat prometheus-configmap.yaml 
      apiVersion: v1
      kind: ConfigMap
      metadata:
        name: prometheus-config
        namespace: ops 
      data:
        prometheus.yml: |
          rule_files:
          - /etc/config/rules/*.rules
      
          scrape_configs:
          - job_name: prometheus
            static_configs:
            - targets:
              - localhost:9090
      
          - job_name: kubernetes-apiservers
            kubernetes_sd_configs:
            - role: endpoints
            relabel_configs:
            - action: keep
              regex: default;kubernetes;https
              source_labels:
              - __meta_kubernetes_namespace
              - __meta_kubernetes_service_name
              - __meta_kubernetes_endpoint_port_name
            scheme: https
            tls_config:
              ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
              insecure_skip_verify: true
            bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
       
          - job_name: kubernetes-nodes-kubelet
            kubernetes_sd_configs:
            - role: node  # Discover the nodes in the cluster
            relabel_configs:
            # Map each node label matched by (.+) to a new label name, keeping its value
            - action: labelmap
              regex: __meta_kubernetes_node_label_(.+)
            scheme: https
            tls_config:
              ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
              insecure_skip_verify: true
            bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      
          - job_name: kubernetes-nodes-cadvisor
            kubernetes_sd_configs:
            - role: node
            relabel_configs:
            # Map each node label matched by (.+) to a new label name, keeping its value
            - action: labelmap
              regex: __meta_kubernetes_node_label_(.+)
            # The actual metrics endpoint is https://NodeIP:10250/metrics/cadvisor, so override the default metrics URL path
            - target_label: __metrics_path__
              replacement: /metrics/cadvisor
            scheme: https
            tls_config:
              ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
              insecure_skip_verify: true
            bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      
          - job_name: kubernetes-service-endpoints
            kubernetes_sd_configs:
            - role: endpoints  # Discover Pods via the Endpoints behind each Service
            relabel_configs:
            # Skip Services that are not annotated with prometheus.io/scrape
            - action: keep
              regex: true
              source_labels:
              - __meta_kubernetes_service_annotation_prometheus_io_scrape
            # Rewrite the scrape scheme
            - action: replace
              regex: (https?)
              source_labels:
              - __meta_kubernetes_service_annotation_prometheus_io_scheme
              target_label: __scheme__
            # Rewrite the metrics URL path
            - action: replace
              regex: (.+)
              source_labels:
              - __meta_kubernetes_service_annotation_prometheus_io_path
              target_label: __metrics_path__
            # Rewrite the target address
            - action: replace
              regex: ([^:]+)(?::\d+)?;(\d+)
              replacement: $1:$2
              source_labels:
              - __address__
              - __meta_kubernetes_service_annotation_prometheus_io_port
              target_label: __address__
            # Map each Kubernetes Service label matched by (.+) to a new label name, keeping its value
            - action: labelmap
              regex: __meta_kubernetes_service_label_(.+)
            # Add a namespace label
            - action: replace
              source_labels:
              - __meta_kubernetes_namespace
              target_label: kubernetes_namespace
            # Add a Service name label
            - action: replace
              source_labels:
              - __meta_kubernetes_service_name
              target_label: kubernetes_name
      
          - job_name: kubernetes-pods
            kubernetes_sd_configs:
            - role: pod   # Discover every Pod as a target
            relabel_configs:
            # Skip Pods that are not annotated with prometheus.io/scrape
            - action: keep
              regex: true
              source_labels:
              - __meta_kubernetes_pod_annotation_prometheus_io_scrape
            # Rewrite the metrics URL path
            - action: replace
              regex: (.+)
              source_labels:
              - __meta_kubernetes_pod_annotation_prometheus_io_path
              target_label: __metrics_path__
            # Rewrite the target address
            - action: replace
              regex: ([^:]+)(?::\d+)?;(\d+)
              replacement: $1:$2
              source_labels:
              - __address__
              - __meta_kubernetes_pod_annotation_prometheus_io_port
              target_label: __address__
            # Map each Kubernetes Pod label matched by (.+) to a new label name, keeping its value
            - action: labelmap
              regex: __meta_kubernetes_pod_label_(.+)
            # Add a namespace label
            - action: replace
              source_labels:
              - __meta_kubernetes_namespace
              target_label: kubernetes_namespace
            # Add a Pod name label
            - action: replace
              source_labels:
              - __meta_kubernetes_pod_name
              target_label: kubernetes_pod_name
      
          alerting:
            alertmanagers:
            - static_configs:
                - targets: ["alertmanager:80"]
      
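      The kubernetes-service-endpoints job above only keeps targets whose Service carries the prometheus.io/scrape: "true" annotation; the scheme, path and port annotations then rewrite how the target is scraped, and the kubernetes-pods job honours the same scrape/path/port annotations on Pods. A minimal sketch of a hypothetical annotated Service (my-app is a placeholder, not part of this deployment):

      apiVersion: v1
      kind: Service
      metadata:
        name: my-app                        # hypothetical example
        namespace: default
        annotations:
          prometheus.io/scrape: "true"      # required by the keep rule
          prometheus.io/scheme: "http"      # optional: rewrites __scheme__
          prometheus.io/path: "/metrics"    # optional: rewrites __metrics_path__
          prometheus.io/port: "8080"        # optional: rewrites the scrape port
      spec:
        selector:
          app: my-app
        ports:
        - name: http
          port: 8080
          targetPort: 8080
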
      [root@k8s-master prometheus]# cat prometheus-deployment.yaml
      apiVersion: apps/v1
      kind: Deployment
      metadata:
        name: prometheus 
        namespace: ops
        labels:
          k8s-app: prometheus
      spec:
        replicas: 1
        selector:
          matchLabels:
            k8s-app: prometheus
        template:
          metadata:
            labels:
              k8s-app: prometheus
          spec:
            serviceAccountName: prometheus
            initContainers:
            - name: "init-chown-data"
              image: "busybox:latest"
              imagePullPolicy: "IfNotPresent"
              command: ["chown", "-R", "65534:65534", "/data"]
              volumeMounts:
              - name: prometheus-data
                mountPath: /data
                subPath: ""
            containers:
              - name: prometheus-server-configmap-reload
                image: "jimmidyson/configmap-reload:v0.1"
                imagePullPolicy: "IfNotPresent"
                args:
                  - --volume-dir=/etc/config
                  - --webhook-url=http://localhost:9090/-/reload
                volumeMounts:
                  - name: config-volume
                    mountPath: /etc/config
                    readOnly: true
                resources:
                  limits:
                    cpu: 10m
                    memory: 10Mi
                  requests:
                    cpu: 10m
                    memory: 10Mi
      
              - name: prometheus-server
                image: "prom/prometheus:v2.20.0"
                imagePullPolicy: "IfNotPresent"
                args:
                  - --config.file=/etc/config/prometheus.yml
                  - --storage.tsdb.path=/data
                  - --web.console.libraries=/etc/prometheus/console_libraries
                  - --web.console.templates=/etc/prometheus/consoles
                  - --web.enable-lifecycle
                ports:
                  - containerPort: 9090
                readinessProbe:
                  httpGet:
                    path: /-/ready
                    port: 9090
                  initialDelaySeconds: 30
                  timeoutSeconds: 30
                livenessProbe:
                  httpGet:
                    path: /-/healthy
                    port: 9090
                  initialDelaySeconds: 30
                  timeoutSeconds: 30
                resources:
                  limits:
                    cpu: 500m
                    memory: 1500Mi
                  requests:
                    cpu: 200m
                    memory: 1000Mi
                  
                volumeMounts:
                  - name: config-volume
                    mountPath: /etc/config
                  - name: prometheus-data
                    mountPath: /data
                    subPath: ""
                  - name: prometheus-rules
                    mountPath: /etc/config/rules
            volumes:
              - name: config-volume
                configMap:
                  name: prometheus-config
              - name: prometheus-rules
                configMap:
                  name: prometheus-rules
              - name: prometheus-data
                persistentVolumeClaim:
                  claimName: prometheus
      ---
      apiVersion: v1
      kind: PersistentVolumeClaim
      metadata:
        name: prometheus
        namespace: ops
      spec:
        storageClassName: "managed-nfs-storage"
        accessModes:
          - ReadWriteMany
        resources:
          requests:
            storage: 10Gi
      ---
      apiVersion: v1
      kind: Service
      metadata: 
        name: prometheus
        namespace: ops
      spec: 
        type: NodePort
        ports: 
          - name: http 
            port: 9090
            protocol: TCP
            targetPort: 9090
            nodePort: 30090
        selector: 
          k8s-app: prometheus
      ---
      apiVersion: v1
      kind: ServiceAccount
      metadata:
        name: prometheus
        namespace: ops
      ---
      apiVersion: rbac.authorization.k8s.io/v1
      kind: ClusterRole
      metadata:
        name: prometheus
      rules:
        - apiGroups:
            - ""
          resources:
            - nodes
            - nodes/metrics
            - services
            - endpoints
            - pods
          verbs:
            - get
            - list
            - watch
        - apiGroups:
            - ""
          resources:
            - configmaps
          verbs:
            - get
        - nonResourceURLs:
            - "/metrics"
          verbs:
            - get
      ---
      apiVersion: rbac.authorization.k8s.io/v1
      kind: ClusterRoleBinding
      metadata:
        name: prometheus
      roleRef:
        apiGroup: rbac.authorization.k8s.io
        kind: ClusterRole
        name: prometheus
      subjects:
      - kind: ServiceAccount
        name: prometheus
        namespace: ops
      
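      Note that the PersistentVolumeClaim above relies on dynamic provisioning from the "managed-nfs-storage" StorageClass, which must already exist in the cluster (typically provided by an NFS provisioner). A quick check before applying:

      kubectl get storageclass managed-nfs-storage
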
      [root@k8s-master prometheus]# cat prometheus-rules.yaml 
      apiVersion: v1
      kind: ConfigMap
      metadata:
        name: prometheus-rules
        namespace: ops
      data:
        general.rules: |
          groups:
          - name: general.rules
            rules:
            - alert: InstanceDown
              expr: up == 0
              for: 1m
              labels:
                severity: error 
              annotations:
                summary: "Instance {{ $labels.instance }} 停止工作"
                description: "{{ $labels.instance }} job {{ $labels.job }} 已经停止5分钟以上."
                
        node.rules: |
          groups:
          - name: node.rules
            rules:
            - alert: NodeFilesystemUsage
              expr: |
                100 - (node_filesystem_free_bytes{fstype=~"ext4|xfs"} /
                node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100) > 80
              for: 1m
              labels:
                severity: warning 
              annotations:
                summary: "Instance {{ $labels.instance }} : {{ $labels.mountpoint }} 分区使用率过高"
                description: "{{ $labels.instance }}: {{ $labels.mountpoint }} 分区使用大于80% (当前值: {{ $value }})"
      
            - alert: NodeMemoryUsage
              expr: |
                100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) /
                node_memory_MemTotal_bytes * 100 > 80
              for: 1m
              labels:
                severity: warning
              annotations:
                summary: "Instance {{ $labels.instance }} 内存使用率过高"
                description: "{{ $labels.instance }}内存使用大于80% (当前值: {{ $value }})"
      
            - alert: NodeCPUUsage    
              expr: |
                100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 60 
              for: 1m
              labels:
                severity: warning
              annotations:
                summary: "Instance {{ $labels.instance }} CPU使用率过高"       
                description: "{{ $labels.instance }}CPU使用大于60% (当前值: {{ $value }})"
      
            - alert: KubeNodeNotReady
              expr: |
                kube_node_status_condition{condition="Ready",status="true"} == 0
              for: 1m
              labels:
                severity: error
              annotations:
                message: '{{ $labels.node }} has not been Ready for more than 1 minute.'
      
        pod.rules: |
          groups:
          - name: pod.rules
            rules:
            - alert: PodCPUUsage
              expr: |
                 sum(rate(container_cpu_usage_seconds_total{image!=""}[1m]) * 100) by (pod, namespace) > 80
              for: 5m
              labels:
                severity: warning 
              annotations:
                summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} CPU使用大于80% (当前值: {{ $value }})"
      
            - alert: PodMemoryUsage
              expr: |
                 sum(container_memory_rss{image!=""}) by(pod, namespace) / 
                 sum(container_spec_memory_limit_bytes{image!=""}) by(pod, namespace) * 100 != +inf > 80
              for: 5m
              labels:
                severity: warning 
              annotations:
                summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} 内存使用大于80% (当前值: {{ $value }})"
      
            - alert: PodNetworkReceive
              expr: |
                 sum(rate(container_network_receive_bytes_total{image!="",name=~"^k8s_.*"}[5m]) /1000) by (pod,namespace)  > 30000
              for: 5m
              labels:
                severity: warning
              annotations:
                summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} 入口流量大于30MB/s (当前值: {{ $value }}K/s)"           
      
            - alert: PodNetworkTransmit
              expr: | 
                 sum(rate(container_network_transmit_bytes_total{image!="",name=~"^k8s_.*"}[5m]) /1000) by (pod,namespace) > 30000
              for: 5m
              labels:
                severity: warning 
              annotations:
                summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} 出口流量大于30MB/s (当前值: {{ $value }}/K/s)"
      
            - alert: PodRestart
              expr: |
                 sum(changes(kube_pod_container_status_restarts_total[1m])) by (pod,namespace) > 0
              for: 1m
              labels:
                severity: warning 
              annotations:
                summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod重启 (当前值: {{ $value }})"
      
            - alert: PodFailed
              expr: |
                 sum(kube_pod_status_phase{phase="Failed"}) by (pod,namespace) > 0
              for: 5s
              labels:
                severity: error 
              annotations:
                summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod状态Failed (当前值: {{ $value }})"
      
            - alert: PodPending
              expr: | 
                 sum(kube_pod_status_phase{phase="Pending"}) by (pod,namespace) > 0
              for: 1m
              labels:
                severity: error
              annotations:
                summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod状态Pending (当前值: {{ $value }})"
      
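      The embedded rule files can be linted before Prometheus loads them. A sketch, assuming promtool (shipped with the Prometheus release) is available on the workstation; the backslash escapes the dot inside the ConfigMap data key:

      kubectl create -f prometheus-rules.yaml --dry-run=client \
        -o jsonpath='{.data.general\.rules}' > /tmp/general.rules
      promtool check rules /tmp/general.rules
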
    • Create the namespace

      [root@k8s-master prometheus]# kubectl create namespace ops
      namespace/ops created
      
    • Deploy the Prometheus service

      [root@k8s-master prometheus]# kubectl apply -f prometheus-configmap.yaml 
      configmap/prometheus-config created
      [root@k8s-master prometheus]# kubectl apply -f prometheus-deployment.yaml 
      deployment.apps/prometheus created
      persistentvolumeclaim/prometheus created
      service/prometheus created
      serviceaccount/prometheus created
      clusterrole.rbac.authorization.k8s.io/prometheus created
      clusterrolebinding.rbac.authorization.k8s.io/prometheus created
      [root@k8s-master prometheus]# kubectl apply -f prometheus-rules.yaml 
      
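      The configmap-reload sidecar watches /etc/config and calls the /-/reload webhook whenever the prometheus-config ConfigMap changes; because the server also runs with --web.enable-lifecycle, a reload can be triggered by hand as well, which is handy after editing prometheus-rules (sketch; <node-ip> stands for any cluster node reachable on the NodePort):

      curl -X POST http://<node-ip>:30090/-/reload
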
    • Verify that the Prometheus pod is running

      [root@k8s-master prometheus]# kubectl get pods -n ops
      NAME                          READY   STATUS    RESTARTS   AGE
      prometheus-859dbbc5f7-rlsqp   2/2     Running   0          4h8m
      
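      The Service exposes the web UI on NodePort 30090, so the port can be confirmed before opening http://<node-ip>:30090 in a browser (<node-ip> again being any cluster node):

      kubectl get svc prometheus -n ops
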
    • Verify in the browser
      (screenshots omitted: the Prometheus web UI reachable at http://NodeIP:30090)

  • Original article: https://www.cnblogs.com/scajy/p/15543666.html