forked from k-space/kube
Move Prometheus instance to monitoring namespace
monitoring/prometheus.yaml | 486 (new file)
@@ -0,0 +1,486 @@
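# Prometheus deployment for the monitoring namespace: cluster-wide scrape
# configuration, RBAC, self-monitoring alert rules, ingress and OIDC access control.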
---
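# Catch-all PodMonitor: scrapes any pod in any namespace that exposes a
# container port named "exporter" or "metrics".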
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: metrics
spec:
  namespaceSelector: {}
  selector: {}
  podMetricsEndpoints:
  - port: exporter
  - port: metrics
---
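# The Prometheus instance itself, pinned to the dedicated monitoring node(s)
# and wired to the Alertmanager running in the prometheus-operator namespace.
# The empty {} selectors pick up monitors, probes and rules from every namespace.
# retentionSize stays below the 10Gi volume request, presumably to leave
# headroom for the WAL and compaction.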
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: prometheus
spec:
  nodeSelector:
    dedicated: monitoring
  tolerations:
  - key: dedicated
    operator: Equal
    value: monitoring
    effect: NoSchedule
  alerting:
    alertmanagers:
    - namespace: prometheus-operator
      name: alertmanager-operated
      port: web
  externalUrl: "http://prom.k-space.ee/"
  replicas: 2
  shards: 1
  serviceAccountName: prometheus
  securityContext:
    fsGroup: 2000
    runAsGroup: 2000
    runAsNonRoot: true
    runAsUser: 1000
  serviceMonitorNamespaceSelector: {}
  serviceMonitorSelector: {}
  podMonitorNamespaceSelector: {}
  podMonitorSelector: {}
  probeNamespaceSelector: {}
  probeSelector: {}
  ruleNamespaceSelector: {}
  ruleSelector: {}
  retentionSize: 8GB
  storage:
    volumeClaimTemplate:
      spec:
        accessModes:
        - ReadWriteOnce
        resources:
          requests:
            storage: 10Gi
        storageClassName: local-path
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
---
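# Cluster-wide read access Prometheus needs for Kubernetes service discovery:
# nodes, endpoints, pods and ingresses, plus the non-resource /metrics URL.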
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
- apiGroups: [""]
  resources:
  - nodes
  - nodes/metrics
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
- apiGroups: [""]
  resources:
  - configmaps
  verbs: ["get"]
- apiGroups:
  - networking.k8s.io
  resources:
  - ingresses
  verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
  verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: prometheus-operator
---
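# Self-monitoring alert rules for Prometheus and Alertmanager; the job-missing
# expressions assume the monitoring/prometheus and monitoring/alertmanager targets.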
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus
spec:
  groups:
  - name: prometheus
    rules:
    - alert: PrometheusJobMissing
      annotations:
        description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus job missing (instance {{ $labels.instance }})
      expr: absent(up{job="monitoring/prometheus"})
      for: 0m
      labels:
        severity: warning
    - alert: PrometheusTargetMissing
      annotations:
        description: "A Prometheus target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus target missing (instance {{ $labels.instance }})
      expr: up == 0
      for: 5m
      labels:
        severity: critical
    - alert: PrometheusAllTargetsMissing
      annotations:
        description: "A Prometheus job does not have living target anymore.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus all targets missing (instance {{ $labels.instance }})
      expr: count by (job) (up) == 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusConfigurationReloadFailure
      annotations:
        description: "Prometheus configuration reload error\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
      expr: prometheus_config_last_reload_successful != 1
      for: 0m
      labels:
        severity: warning
    - alert: PrometheusTooManyRestarts
      annotations:
        description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus too many restarts (instance {{ $labels.instance }})
      expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
      for: 0m
      labels:
        severity: warning
    - alert: PrometheusAlertmanagerJobMissing
      annotations:
        description: "A Prometheus AlertManager job has disappeared\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
      expr: absent(up{job="monitoring/alertmanager"})
      for: 0m
      labels:
        severity: warning
    - alert: PrometheusAlertmanagerConfigurationReloadFailure
      annotations:
        description: "AlertManager configuration reload error\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
      expr: alertmanager_config_last_reload_successful != 1
      for: 0m
      labels:
        severity: warning
    - alert: PrometheusAlertmanagerConfigNotSynced
      annotations:
        description: "Configurations of AlertManager cluster instances are out of sync\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
      expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
      for: 0m
      labels:
        severity: warning
    - alert: PrometheusNotConnectedToAlertmanager
      annotations:
        description: "Prometheus cannot connect the alertmanager\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
      expr: prometheus_notifications_alertmanagers_discovered < 1
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusRuleEvaluationFailures
      annotations:
        description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
      expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusTemplateTextExpansionFailures
      annotations:
        description: "Prometheus encountered {{ $value }} template text expansion failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
      expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusRuleEvaluationSlow
      annotations:
        description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
      expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
      for: 5m
      labels:
        severity: warning
    - alert: PrometheusNotificationsBacklog
      annotations:
        description: "The Prometheus notification queue has not been empty for 10 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus notifications backlog (instance {{ $labels.instance }})
      expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
      for: 0m
      labels:
        severity: warning
    - alert: PrometheusAlertmanagerNotificationFailing
      annotations:
        description: "Alertmanager is failing sending notifications\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
      expr: rate(alertmanager_notifications_failed_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusTargetEmpty
      annotations:
        description: "Prometheus has no target in service discovery\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus target empty (instance {{ $labels.instance }})
      expr: prometheus_sd_discovered_targets == 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusLargeScrape
      annotations:
        description: "Prometheus has many scrapes that exceed the sample limit\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus large scrape (instance {{ $labels.instance }})
      expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
      for: 5m
      labels:
        severity: warning
    - alert: PrometheusTargetScrapeDuplicate
      annotations:
        description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
      expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
      for: 0m
      labels:
        severity: warning
    - alert: PrometheusTsdbCheckpointCreationFailures
      annotations:
        description: "Prometheus encountered {{ $value }} checkpoint creation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
      expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusTsdbCheckpointDeletionFailures
      annotations:
        description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
      expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusTsdbCompactionsFailed
      annotations:
        description: "Prometheus encountered {{ $value }} TSDB compactions failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
      expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusTsdbHeadTruncationsFailed
      annotations:
        description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
      expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusTsdbReloadFailures
      annotations:
        description: "Prometheus encountered {{ $value }} TSDB reload failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
      expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusTsdbWalCorruptions
      annotations:
        description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus TSDB WAL is corrupt, make sure there is enough disk space and wipe /data/wal
      expr: increase(prometheus_tsdb_wal_corruptions_total[2h]) > 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusTsdbWalTruncationsFailed
      annotations:
        description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
      expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
---
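# Expose Prometheus (prom.k-space.ee) and Alertmanager (am.k-space.ee) through
# Traefik over TLS; the router.middlewares annotations reference the OIDC
# middleware clients declared at the end of this file.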
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: prometheus
  annotations:
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
    traefik.ingress.kubernetes.io/router.tls: "true"
    external-dns.alpha.kubernetes.io/target: traefik.k-space.ee
    traefik.ingress.kubernetes.io/router.middlewares: monitoring-prometheus@kubernetescrd
spec:
  rules:
  - host: prom.k-space.ee
    http:
      paths:
      - pathType: Prefix
        path: "/"
        backend:
          service:
            name: prometheus-operated
            port:
              number: 9090
  tls:
  - hosts:
    - "*.k-space.ee"
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: alertmanager
  annotations:
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
    traefik.ingress.kubernetes.io/router.tls: "true"
    external-dns.alpha.kubernetes.io/target: traefik.k-space.ee
    traefik.ingress.kubernetes.io/router.middlewares: monitoring-alertmanager@kubernetescrd
spec:
  rules:
  - host: am.k-space.ee
    http:
      paths:
      - pathType: Prefix
        path: "/"
        backend:
          service:
            name: alertmanager-operated
            port:
              number: 9093
  tls:
  - hosts:
    - "*.k-space.ee"
---
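# Scrape the monitoring stack itself: the Prometheus, Alertmanager and
# prometheus-operator pods.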
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: prometheus
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: prometheus
  podMetricsEndpoints:
  - port: web
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: alertmanager
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: alertmanager
  podMetricsEndpoints:
  - port: web
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: operator
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: prometheus-operator
  podMetricsEndpoints:
  - port: http
---
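# Kubelet and cAdvisor metrics scraped over HTTPS with the in-cluster service
# account token; certificate verification is skipped, presumably because the
# kubelets serve self-signed certificates.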
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kubelet
spec:
  endpoints:
  - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
    honorLabels: true
    interval: 30s
    port: https-metrics
    scheme: https
    tlsConfig:
      insecureSkipVerify: true
  - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
    honorLabels: true
    interval: 30s
    path: /metrics/cadvisor
    port: https-metrics
    scheme: https
    tlsConfig:
      insecureSkipVerify: true
  namespaceSelector:
    matchNames:
    - kube-system
  selector:
    matchLabels:
      app.kubernetes.io/name: kubelet
---
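# OIDC gateway middleware clients (codemowers.io CRD) backing the Traefik
# middlewares referenced by the ingresses above: k-space:floor members can
# access Prometheus, Kubernetes admins can access Alertmanager.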
apiVersion: codemowers.io/v1alpha1
kind: OIDCGWMiddlewareClient
metadata:
  name: prometheus
spec:
  displayName: Prometheus
  uri: 'https://prom.k-space.ee'
  allowedGroups:
  - k-space:floor
  headerMapping:
    email: Remote-Email
    groups: Remote-Groups
    name: Remote-Name
    user: Remote-Username
---
apiVersion: codemowers.io/v1alpha1
kind: OIDCGWMiddlewareClient
metadata:
  name: alertmanager
spec:
  displayName: AlertManager
  uri: 'https://am.k-space.ee'
  allowedGroups:
  - k-space:kubernetes:admins
  headerMapping:
    email: Remote-Email
    groups: Remote-Groups
    name: Remote-Name
    user: Remote-Username