вторник, 6 октября 2020 г.

Кастомизация Prometheus и Alertmanager в OKD 4.5

Понадобилось слать алерты в Slack и кастомизировать текст уведомлений.

Для этого в Web UI переходим в Cluster Settings --> Alertmanager --> YAML 


Приводим YAML к виду, показанному ниже в разделе «2. Alertmanager».




1. Prometheus custom-rule

Копируем текст ниже в файл custom-rule.yaml и применяем его с помощью CLI: oc apply -f custom-rule.yaml

# Custom alerting rule: fires when any container in the "nurlan" namespace
# keeps restarting for 15 minutes in a row.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: nurlan-alert
  namespace: openshift-monitoring
spec:
  groups:
    - name: kubernetes-apps
      rules:
        - alert: CrashLoopBackOff
          annotations:
            description: '{{ $labels.pod }} has been down for more than 15 minutes.'
            # Folded scalar keeps the template on one logical line without
            # splitting a {{ ... }} action across YAML lines.
            message: >-
              Pod {{ $labels.namespace }}/{{ $labels.pod }}
              ({{ $labels.container }}) is restarting
              {{ printf "%.2f" $value }} times / 15 minutes.
          # increase() over the 15m window yields restarts per 15 minutes, which
          # is the unit the message annotation reports. The previous
          # rate(...) * 60 * 5 produced restarts per 5 minutes instead.
          # The "> 0" condition is unchanged, so the alert fires for the same series.
          expr: |
            increase(kube_pod_container_status_restarts_total{namespace=~"nurlan",job="kube-state-metrics"}[15m]) > 0
          for: 15m
          labels:
            severity: critical


2. Alertmanager
# Cluster-wide Alertmanager defaults.
global:
  # Slack incoming-webhook URL (placeholder value); receivers with
  # slack_configs use it unless they set their own api_url.
  slack_api_url: 'https://hooks.slack.com/services/asdasdasd'
  resolve_timeout: 5m
# Mute lower-severity alerts while a higher-severity alert with the same
# namespace and alertname is firing.
inhibit_rules:
  # critical mutes warning and info
  - source_match:
      severity: critical
    target_match_re:
      severity: 'warning|info'
    equal: [namespace, alertname]
  # warning mutes info
  - source_match:
      severity: warning
    target_match_re:
      severity: info
    equal: [namespace, alertname]
# Notification targets. "Default" and "Watchdog" have no integrations, so
# alerts routed to them are not delivered anywhere.
receivers:
  - name: Critical
    slack_configs:
      - channel: '#general'
        # The "Alert:" line falls back to the alertname label when the rule
        # defines no "title" annotation; otherwise that line rendered empty.
        title: |-
          Alert details:
          {{ range .Alerts -}}
            Alert: {{ if .Annotations.title }}{{ .Annotations.title }}{{ else }}{{ .Labels.alertname }}{{ end }}{{ if .Labels.severity }} - {{ .Labels.severity }}{{ end }}
          Description: 😱  {{ .Annotations.description }}
          Details: ❗️
            {{ range .Labels.SortedPairs }}{{ .Name }}: {{ .Value }}
            {{ end }}
          {{ end }}
  - name: Default
  - name: nurlan
    slack_configs:
      - channel: '#general'
        # Same layout as the Critical receiver, without the emoji markers.
        title: |-
          Alert details:
          {{ range .Alerts -}}
            Alert: {{ if .Annotations.title }}{{ .Annotations.title }}{{ else }}{{ .Labels.alertname }}{{ end }}{{ if .Labels.severity }} - {{ .Labels.severity }}{{ end }}
          Description: {{ .Annotations.description }}
          Details:
            {{ range .Labels.SortedPairs }}{{ .Name }}: {{ .Value }}
            {{ end }}
          {{ end }}
  - name: Watchdog
# Routing tree: the first matching child route wins; anything unmatched goes
# to the "Default" receiver.
route:
  group_by:
    - namespace
  group_interval: 3m
  group_wait: 30s
  receiver: Default
  # NOTE(review): repeat_interval is shorter than group_interval, so firing
  # alerts are re-sent very frequently - confirm this is intended outside of testing.
  repeat_interval: 1m
  routes:
    - match:
        alertname: Watchdog
      receiver: Watchdog
    # Crash-loop alerts for the "nurlan" namespace. The previous matcher
    # "severity: pending" could never match: "pending" is an alert state, not
    # a severity label value, so these alerts fell through to "Default".
    # The alertname regex also covers the custom CrashLoopBackOff rule.
    - receiver: Critical
      match:
        namespace: nurlan
        severity: critical
      match_re:
        alertname: 'KubePodCrashLooping|CrashLoopBackOff'
    # Every other warning/critical alert from the "nurlan" namespace.
    - receiver: nurlan
      match:
        namespace: nurlan
      match_re:
        severity: 'warning|critical'





Комментариев нет:

Отправить комментарий