Migrate to Prometheus Operator

Commit 1045ed2f26 (parent ee4b1ddf57), 2022-09-11 16:24:35 +03:00
30 changed files with 32403 additions and 129 deletions


@@ -0,0 +1,19 @@
# Prometheus operator
```
curl -L https://github.com/prometheus-operator/prometheus-operator/releases/download/v0.59.0/bundle.yaml | sed -e 's/namespace: default/namespace: prometheus-operator/g' > bundle.yml
kubectl create namespace prometheus-operator
kubectl apply --server-side -n prometheus-operator -f bundle.yml
kubectl delete -n prometheus-operator configmap snmp-exporter
kubectl create -n prometheus-operator configmap snmp-exporter --from-file=snmp.yml
kubectl apply -n prometheus-operator -f application.yml -f node-exporter.yml -f blackbox-exporter.yml -f snmp-exporter.yml -f mikrotik-exporter.yml
```
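Once everything above has been applied, a quick sanity check could look like the following (a sketch, not part of the original instructions; the StatefulSet names follow the operator's `<kind>-<name>` convention for the `prometheus` and `alertmanager` objects defined in these manifests):
```
# Operator itself should roll out
kubectl -n prometheus-operator rollout status deployment/prometheus-operator
# CRDs installed by bundle.yml
kubectl get crd | grep monitoring.coreos.com
# StatefulSets the operator generates from the Prometheus and Alertmanager objects
kubectl -n prometheus-operator get statefulset prometheus-prometheus alertmanager-alertmanager
# Monitoring objects picked up for scraping
kubectl -n prometheus-operator get servicemonitors,podmonitors,probes,prometheusrules
```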
# Mikrotik exporter
```
kubectl create -n prometheus-operator secret generic mikrotik-exporter \
--from-literal=MIKROTIK_PASSWORD='f7W!H*Pu' \
--from-literal=PROMETHEUS_BEARER_TOKEN=$(cat /dev/urandom | base64 | head -c 30)
```
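To confirm the secret landed with the expected keys (a sketch using the names created above):
```
kubectl -n prometheus-operator get secret mikrotik-exporter -o jsonpath='{.data.PROMETHEUS_BEARER_TOKEN}' | base64 -d; echo
```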


@@ -0,0 +1,762 @@
---
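# Catch-all PodMonitor: the empty label selector matches every pod in this
# namespace (an empty namespaceSelector means "own namespace" for a PodMonitor)
# and scrapes any container port named "exporter" or "metrics".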
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: metrics
spec:
namespaceSelector: {}
selector: {}
podMetricsEndpoints:
- port: exporter
- port: metrics
---
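# Three Alertmanager replicas; the operator joins them into one cluster and
# exposes them via the alertmanager-operated Service used by the
# am.k-space.ee Ingress further down.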
apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
name: alertmanager
spec:
nodeSelector:
dedicated: monitoring
tolerations:
- key: dedicated
operator: Equal
value: monitoring
effect: NoSchedule
replicas: 3
serviceAccountName: alertmanager
externalUrl: http://am.k-space.ee/
routePrefix: "/"
securityContext:
fsGroup: 2000
runAsGroup: 2000
runAsNonRoot: true
runAsUser: 1000
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: alertmanager
---
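# With every *Selector and *NamespaceSelector below set to the empty selector
# {}, this Prometheus picks up ServiceMonitors, PodMonitors, Probes and
# PrometheusRules from all namespaces.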
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
name: prometheus
spec:
nodeSelector:
dedicated: monitoring
tolerations:
- key: dedicated
operator: Equal
value: monitoring
effect: NoSchedule
alerting:
alertmanagers:
- namespace: prometheus-operator
name: alertmanager
port: http
pathPrefix: "/"
apiVersion: v2
externalUrl: "http://prom.k-space.ee/"
replicas: 2
shards: 1
serviceAccountName: prometheus
securityContext:
fsGroup: 2000
runAsGroup: 2000
runAsNonRoot: true
runAsUser: 1000
serviceMonitorNamespaceSelector: {}
serviceMonitorSelector: {}
podMonitorNamespaceSelector: {}
podMonitorSelector: {}
probeNamespaceSelector: {}
probeSelector: {}
ruleNamespaceSelector: {}
ruleSelector: {}
retentionSize: 80GB
storage:
volumeClaimTemplate:
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 100Gi
storageClassName: local-path
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus
rules:
- apiGroups: [""]
resources:
- nodes
- nodes/metrics
- services
- endpoints
- pods
verbs: ["get", "list", "watch"]
- apiGroups: [""]
resources:
- configmaps
verbs: ["get"]
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: prometheus-operator
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: prometheus
spec:
groups:
- name: prometheus
rules:
- alert: PrometheusJobMissing
annotations:
description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n \
\ LABELS = {{ $labels }}"
summary: Prometheus job missing (instance {{ $labels.instance }})
expr: absent(up{job="prometheus-operator/prometheus"})
for: 0m
labels:
severity: warning
- alert: PrometheusTargetMissing
annotations:
description: "A Prometheus target has disappeared. An exporter might be crashed.\n\
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus target missing (instance {{ $labels.instance }})
expr: up == 0
for: 5m
labels:
severity: critical
- alert: PrometheusAllTargetsMissing
annotations:
description: "A Prometheus job does not have living target anymore.\n VALUE\
\ = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus all targets missing (instance {{ $labels.instance }})
expr: count by (job) (up) == 0
for: 0m
labels:
severity: critical
- alert: PrometheusConfigurationReloadFailure
annotations:
description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n\
\ LABELS = {{ $labels }}"
summary: Prometheus configuration reload failure (instance {{ $labels.instance
}})
expr: prometheus_config_last_reload_successful != 1
for: 0m
labels:
severity: warning
- alert: PrometheusTooManyRestarts
annotations:
description: "Prometheus has restarted more than twice in the last 15 minutes.\
\ It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels\
\ }}"
summary: Prometheus too many restarts (instance {{ $labels.instance }})
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m])
> 2
for: 0m
labels:
severity: warning
- alert: PrometheusAlertmanagerJobMissing
annotations:
description: "A Prometheus AlertManager job has disappeared\n VALUE = {{\
\ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus AlertManager job missing (instance {{ $labels.instance
}})
expr: absent(up{job="prometheus-operator/alertmanager"})
for: 0m
labels:
severity: warning
- alert: PrometheusAlertmanagerConfigurationReloadFailure
annotations:
description: "AlertManager configuration reload error\n VALUE = {{ $value\
\ }}\n LABELS = {{ $labels }}"
summary: Prometheus AlertManager configuration reload failure (instance {{
$labels.instance }})
expr: alertmanager_config_last_reload_successful != 1
for: 0m
labels:
severity: warning
- alert: PrometheusAlertmanagerConfigNotSynced
annotations:
description: "Configurations of AlertManager cluster instances are out of\
\ sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus AlertManager config not synced (instance {{ $labels.instance
}})
expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
for: 0m
labels:
severity: warning
- alert: PrometheusNotConnectedToAlertmanager
annotations:
description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value\
\ }}\n LABELS = {{ $labels }}"
summary: Prometheus not connected to alertmanager (instance {{ $labels.instance
}})
expr: prometheus_notifications_alertmanagers_discovered < 1
for: 0m
labels:
severity: critical
- alert: PrometheusRuleEvaluationFailures
annotations:
description: "Prometheus encountered {{ $value }} rule evaluation failures,\
\ leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS\
\ = {{ $labels }}"
summary: Prometheus rule evaluation failures (instance {{ $labels.instance
}})
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTemplateTextExpansionFailures
annotations:
description: "Prometheus encountered {{ $value }} template text expansion\
\ failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus template text expansion failures (instance {{ $labels.instance
}})
expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusRuleEvaluationSlow
annotations:
description: "Prometheus rule evaluation took more time than the scheduled\
\ interval. It indicates a slower storage backend access or too complex\
\ query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
for: 5m
labels:
severity: warning
- alert: PrometheusNotificationsBacklog
annotations:
description: "The Prometheus notification queue has not been empty for 10\
\ minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus notifications backlog (instance {{ $labels.instance }})
expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
for: 0m
labels:
severity: warning
- alert: PrometheusAlertmanagerNotificationFailing
annotations:
description: "Alertmanager is failing sending notifications\n VALUE = {{\
\ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus AlertManager notification failing (instance {{ $labels.instance
}})
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTargetEmpty
annotations:
description: "Prometheus has no target in service discovery\n VALUE = {{\
\ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus target empty (instance {{ $labels.instance }})
expr: prometheus_sd_discovered_targets == 0
for: 0m
labels:
severity: critical
- alert: PrometheusLargeScrape
annotations:
description: "Prometheus has many scrapes that exceed the sample limit\n \
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus large scrape (instance {{ $labels.instance }})
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) >
10
for: 5m
labels:
severity: warning
- alert: PrometheusTargetScrapeDuplicate
annotations:
description: "Prometheus has many samples rejected due to duplicate timestamps\
\ but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus target scrape duplicate (instance {{ $labels.instance
}})
expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m])
> 0
for: 0m
labels:
severity: warning
- alert: PrometheusTsdbCheckpointCreationFailures
annotations:
description: "Prometheus encountered {{ $value }} checkpoint creation failures\n\
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance
}})
expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTsdbCheckpointDeletionFailures
annotations:
description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n\
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance
}})
expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTsdbCompactionsFailed
annotations:
description: "Prometheus encountered {{ $value }} TSDB compactions failures\n\
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB compactions failed (instance {{ $labels.instance
}})
expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTsdbHeadTruncationsFailed
annotations:
description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n\
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance
}})
expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTsdbReloadFailures
annotations:
description: "Prometheus encountered {{ $value }} TSDB reload failures\n \
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTsdbWalCorruptions
annotations:
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n \
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB WAL is corrupt, make sure there is enough disk space
and wipe /data/wal
expr: increase(prometheus_tsdb_wal_corruptions_total[2h]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTsdbWalTruncationsFailed
annotations:
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n\
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance
}})
expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: prometheus
annotations:
cert-manager.io/cluster-issuer: default
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.tls: "true"
external-dns.alpha.kubernetes.io/target: traefik.k-space.ee
traefik.ingress.kubernetes.io/router.middlewares: traefik-sso@kubernetescrd
spec:
rules:
- host: prom.k-space.ee
http:
paths:
- pathType: Prefix
path: "/"
backend:
service:
name: prometheus-operated
port:
number: 9090
tls:
- hosts:
- prom.k-space.ee
secretName: prom-tls
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: alertmanager
annotations:
cert-manager.io/cluster-issuer: default
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.tls: "true"
external-dns.alpha.kubernetes.io/target: traefik.k-space.ee
traefik.ingress.kubernetes.io/router.middlewares: traefik-sso@kubernetescrd
spec:
rules:
- host: am.k-space.ee
http:
paths:
- pathType: Prefix
path: "/"
backend:
service:
name: alertmanager-operated
port:
number: 9093
tls:
- hosts:
- am.k-space.ee
secretName: alertmanager-tls
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: prometheus
spec:
selector:
matchLabels:
app.kubernetes.io/name: prometheus
podMetricsEndpoints:
- port: web
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: alertmanager
spec:
selector:
matchLabels:
app.kubernetes.io/name: alertmanager
podMetricsEndpoints:
- port: web
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: operator
spec:
selector:
matchLabels:
app.kubernetes.io/name: prometheus-operator
podMetricsEndpoints:
- port: http
---
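# Scrapes each kubelet's /metrics and /metrics/cadvisor endpoints over HTTPS,
# authenticating with the in-cluster service account token; certificate
# verification is skipped since kubelet serving certificates are typically
# self-signed.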
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: kubelet
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
interval: 30s
port: https-metrics
scheme: https
tlsConfig:
insecureSkipVerify: true
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
interval: 30s
path: /metrics/cadvisor
port: https-metrics
scheme: https
tlsConfig:
insecureSkipVerify: true
namespaceSelector:
matchNames:
- kube-system
selector:
matchLabels:
app.kubernetes.io/name: kubelet
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: kube-state-metrics
spec:
groups:
- name: kube-state-metrics
rules:
- alert: KubernetesNodeReady
expr: kube_node_status_condition{condition="Ready",status="true"} == 0
for: 10m
labels:
severity: critical
annotations:
summary: Kubernetes Node ready (instance {{ $labels.instance }})
description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesMemoryPressure
expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
for: 2m
labels:
severity: critical
annotations:
summary: Kubernetes memory pressure (instance {{ $labels.instance }})
description: "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDiskPressure
expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
for: 2m
labels:
severity: critical
annotations:
summary: Kubernetes disk pressure (instance {{ $labels.instance }})
description: "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesOutOfDisk
expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
for: 2m
labels:
severity: critical
annotations:
summary: Kubernetes out of disk (instance {{ $labels.instance }})
description: "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesOutOfCapacity
expr: sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90
for: 2m
labels:
severity: warning
annotations:
summary: Kubernetes out of capacity (instance {{ $labels.instance }})
description: "{{ $labels.node }} is out of capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesContainerOomKiller
expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
for: 0m
labels:
severity: warning
annotations:
summary: Kubernetes container oom killer (instance {{ $labels.instance }})
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesJobFailed
expr: kube_job_status_failed > 0
for: 0m
labels:
severity: warning
annotations:
summary: Kubernetes Job failed (instance {{ $labels.instance }})
description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesCronjobSuspended
expr: kube_cronjob_spec_suspend != 0
for: 0m
labels:
severity: warning
annotations:
summary: Kubernetes CronJob suspended (instance {{ $labels.instance }})
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPersistentvolumeclaimPending
expr: kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1
for: 2m
labels:
severity: warning
annotations:
summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }})
description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesVolumeOutOfDiskSpace
expr: kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10
for: 2m
labels:
severity: warning
annotations:
summary: Kubernetes Volume out of disk space (instance {{ $labels.instance }})
description: "Volume is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesVolumeFullInFourDays
expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0
for: 0m
labels:
severity: critical
annotations:
summary: Kubernetes Volume full in four days (instance {{ $labels.instance }})
description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPersistentvolumeError
expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0
for: 0m
labels:
severity: critical
annotations:
summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }})
description: "Persistent volume is in bad state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetDown
expr: (kube_statefulset_status_replicas_ready / kube_statefulset_status_replicas_current) != 1
for: 1m
labels:
severity: critical
annotations:
summary: Kubernetes StatefulSet down (instance {{ $labels.instance }})
description: "A StatefulSet went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesHpaScalingAbility
expr: kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1
for: 2m
labels:
severity: warning
annotations:
summary: Kubernetes HPA scaling ability (instance {{ $labels.instance }})
description: "Pod is unable to scale\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesHpaMetricAvailability
expr: kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1
for: 0m
labels:
severity: warning
annotations:
summary: Kubernetes HPA metric availability (instance {{ $labels.instance }})
description: "HPA is not able to collect metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesHpaScaleCapability
expr: kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas
for: 2m
labels:
severity: info
annotations:
summary: Kubernetes HPA scale capability (instance {{ $labels.instance }})
description: "The maximum number of desired Pods has been hit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPodNotHealthy
expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[15m:1m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: Kubernetes Pod not healthy (instance {{ $labels.instance }})
description: "Pod has been in a non-ready state for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPodCrashLooping
expr: increase(kube_pod_container_status_restarts_total[1m]) > 3
for: 2m
labels:
severity: warning
annotations:
summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
description: "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesReplicassetMismatch
expr: kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas
for: 10m
labels:
severity: warning
annotations:
summary: Kubernetes ReplicaSet mismatch (instance {{ $labels.instance }})
description: "ReplicaSet replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDeploymentReplicasMismatch
expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
for: 10m
labels:
severity: warning
annotations:
summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }})
description: "Deployment Replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetReplicasMismatch
expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas
for: 10m
labels:
severity: warning
annotations:
summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})
description: "A StatefulSet does not match the expected number of replicas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDeploymentGenerationMismatch
expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
for: 10m
labels:
severity: critical
annotations:
summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})
description: "A Deployment has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetGenerationMismatch
expr: kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation
for: 10m
labels:
severity: critical
annotations:
summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }})
description: "A StatefulSet has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetUpdateNotRolledOut
expr: max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)
for: 10m
labels:
severity: warning
annotations:
summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }})
description: "StatefulSet update has not been rolled out.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDaemonsetRolloutStuck
expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
for: 10m
labels:
severity: warning
annotations:
summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }})
description: "Some Pods of DaemonSet are not scheduled or not ready\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDaemonsetMisscheduled
expr: kube_daemonset_status_number_misscheduled > 0
for: 1m
labels:
severity: critical
annotations:
summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }})
description: "Some DaemonSet Pods are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesCronjobTooLong
expr: time() - kube_cronjob_next_schedule_time > 3600
for: 0m
labels:
severity: warning
annotations:
summary: Kubernetes CronJob too long (instance {{ $labels.instance }})
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesJobSlowCompletion
expr: kube_job_spec_completions - kube_job_status_succeeded > 0
for: 12h
labels:
severity: critical
annotations:
summary: Kubernetes job slow completion (instance {{ $labels.instance }})
description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesApiServerErrors
expr: sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3
for: 2m
labels:
severity: critical
annotations:
summary: Kubernetes API server errors (instance {{ $labels.instance }})
description: "Kubernetes API server is experiencing high error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesApiClientErrors
expr: (sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1
for: 2m
labels:
severity: critical
annotations:
summary: Kubernetes API client errors (instance {{ $labels.instance }})
description: "Kubernetes API client is experiencing high error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesClientCertificateExpiresNextWeek
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60
for: 0m
labels:
severity: warning
annotations:
summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
description: "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesClientCertificateExpiresSoon
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60
for: 0m
labels:
severity: critical
annotations:
summary: Kubernetes client certificate expires soon (instance {{ $labels.instance }})
description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesApiServerLatency
expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) / 1e+06 > 1
for: 2m
labels:
severity: warning
annotations:
summary: Kubernetes API server latency (instance {{ $labels.instance }})
description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"


@@ -0,0 +1,258 @@
---
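# Blackbox probes: each Probe points at the blackbox-exporter Service defined
# further down and references a module (http_2xx, tcp_connect,
# dns_check_traefik, ...) from its config.yml ConfigMap.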
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
name: websites
spec:
prober:
url: blackbox-exporter
path: /probe
module: http_2xx
targets:
staticConfig:
static:
- https://git.k-space.ee/
- https://grafana.k-space.ee/
- https://wiki.k-space.ee/
- https://pad.k-space.ee/
- https://members.k-space.ee/
- https://nextcloud.k-space.ee/
- http://minio.infra.k-space.ee:9001/login
---
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
name: k6.ee
spec:
prober:
url: blackbox-exporter
path: /probe
module: dns_check_traefik
targets:
staticConfig:
static:
- 193.40.103.2
- 62.65.250.2
---
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
name: samba-cluster
spec:
prober:
url: blackbox-exporter
path: /metrics
module: tcp_connect
targets:
staticConfig:
static:
- dc1.ad.k-space.ee:636
- dc2.ad.k-space.ee:636
- dc3.ad.k-space.ee:636
---
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
name: misc
spec:
prober:
url: blackbox-exporter
path: /metrics
module: tcp_connect
targets:
staticConfig:
static:
- mail.k-space.ee:465
- dev.k-space.ee:10648
- mariadb.infra.k-space.ee:3306
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: blackbox-exporter
spec:
# https://awesome-prometheus-alerts.grep.to/rules#blackbox
groups:
- name: blackbox
rules:
- alert: BlackboxProbeFailed
expr: probe_success == 0
for: 2m
labels:
severity: critical
annotations:
summary: Blackbox probe failed (instance {{ $labels.instance }})
description: Probe failed
- alert: BlackboxSlowProbe
expr: avg_over_time(probe_duration_seconds[1m]) > 1
for: 5m
labels:
severity: warning
annotations:
summary: Blackbox slow probe (instance {{ $labels.instance }})
description: Blackbox probe took more than 1s to complete
- alert: BlackboxSlowDNS
expr: avg_over_time(probe_dns_lookup_time_seconds[1m]) > 1
for: 5m
labels:
severity: warning
annotations:
summary: Blackbox slow DNS lookup (instance {{ $labels.instance }})
description: Blackbox DNS lookup took more than 1s to complete.
Using IPv6 DNS servers in conjunction with Docker seemed to result in an
odd 5s latency bump, so for now we're using 8.8.8.8.
- alert: BlackboxProbeHttpFailure
expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
for: 5m
labels:
severity: critical
annotations:
summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
description: HTTP status code is not 200-399
- alert: BlackboxSslCertificateWillExpireSoon
expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
for: 0m
labels:
severity: warning
annotations:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
description: SSL certificate expires in 30 days
- alert: BlackboxSslCertificateWillExpireSoon
expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
description: SSL certificate expires in 3 days
- alert: BlackboxSslCertificateExpired
expr: probe_ssl_earliest_cert_expiry - time() <= 0
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
description: SSL certificate has expired already
- alert: BlackboxProbeSlowHttp
expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
for: 1m
labels:
severity: warning
annotations:
summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
description: HTTP request took more than 1s
- alert: BlackboxProbeSlowPing
expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
for: 1m
labels:
severity: warning
annotations:
summary: Blackbox probe slow ping (instance {{ $labels.instance }})
description: Blackbox ping took more than 1s
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: blackbox-exporter
spec:
revisionHistoryLimit: 0
replicas: 2
selector:
matchLabels:
app: blackbox-exporter
template:
metadata:
labels:
app: blackbox-exporter
spec:
containers:
- name: blackbox-exporter
image: prom/blackbox-exporter:v0.20.0
volumeMounts:
- name: blackbox-exporter-config
mountPath: /etc/blackbox_exporter
volumes:
- name: blackbox-exporter-config
configMap:
name: blackbox-exporter-config
# TODO: Results in odd 6s connection lag if scheduled in VLAN20
nodeSelector:
dedicated: monitoring
tolerations:
- key: dedicated
operator: Equal
value: monitoring
effect: NoSchedule
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: app
operator: In
values:
- blackbox-exporter
topologyKey: "kubernetes.io/hostname"
---
kind: Service
apiVersion: v1
metadata:
name: blackbox-exporter
spec:
type: ClusterIP
ports:
- name: http
port: 80
protocol: TCP
targetPort: 9115
selector:
app: blackbox-exporter
---
apiVersion: v1
kind: ConfigMap
metadata:
name: blackbox-exporter-config
data:
config.yml: |-
modules:
http_2xx:
prober: http
http:
preferred_ip_protocol: "ip4"
ip_protocol_fallback: false
http_post_2xx:
prober: http
http:
method: POST
preferred_ip_protocol: "ip4"
ip_protocol_fallback: false
tcp_connect:
prober: tcp
tcp:
preferred_ip_protocol: "ip4"
ip_protocol_fallback: false
icmp:
prober: icmp
icmp:
preferred_ip_protocol: "ip4"
ip_protocol_fallback: false
dns_check_traefik:
prober: dns
dns:
query_name: "traefik.k-space.ee"
query_type: "A"
validate_answer_rrs:
fail_if_not_matches_regexp:
- "traefik\\.k-space\\.ee\\.\\t.*\\tIN\\tA\\t193\\.40\\.103\\.[1-9][0-9]*"
preferred_ip_protocol: "ip4"
ip_protocol_fallback: false
dns_check_k6:
prober: dns
dns:
query_name: "k6.ee"
query_type: "A"
validate_answer_rrs:
fail_if_not_matches_regexp:
- "k6\\.ee\\.\\t.*\\tIN\\tA\\t193\\.40\\.103\\.[1-9][0-9]*"
preferred_ip_protocol: "ip4"
ip_protocol_fallback: false

prometheus-operator/bundle.yml (new file, 28816 lines): diff suppressed because it is too large.


@@ -0,0 +1,104 @@
---
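# The scrape request to mikrotik-exporter carries the bearer token from the
# mikrotik-exporter secret created in the README.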
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
name: mikrotik
spec:
bearerTokenSecret:
name: mikrotik-exporter
key: PROMETHEUS_BEARER_TOKEN
prober:
path: /metrics
url: mikrotik-exporter
targets:
staticConfig:
static:
- router.mgmt.k-space.ee
- sw_chaos.mgmt.k-space.ee
- sw_poe.mgmt.k-space.ee
- sw_mgmt.mgmt.k-space.ee
- sw_core02.mgmt.k-space.ee
- sw_cyber.mgmt.k-space.ee
- sw_ha.mgmt.k-space.ee
- sw_asocial.mgmt.k-space.ee
- sw_kitchen.mgmt.k-space.ee
- sw_core01.mgmt.k-space.ee
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: mikrotik
spec:
groups:
- name: mikrotik
rules:
- alert: MikrotikUplinkRedundancyLost
expr: mikrotik_interface_running{port=~"sfp-sfpplus[12]", instance!~"sw_core.*", instance!~"sw_mgmt.*"} == 0
for: 0m
labels:
severity: error
annotations:
summary: Switch uplink high availability lost
description: One of the two 10Gb optical links is malfunctioning
- alert: MikrotikLinkRateDegraded
expr: mikrotik_interface_rate{port=~"sfp-sfpplus.*"} < 10000000000
for: 0m
labels:
severity: error
annotations:
summary: 10Gb link degraded
description: One of the 10Gb links is running at lower speed
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: mikrotik-exporter
spec:
revisionHistoryLimit: 0
replicas: 2
selector:
matchLabels:
app: mikrotik-exporter
template:
metadata:
labels:
app: mikrotik-exporter
annotations:
co.elastic.logs/multiline.pattern: '^ '
co.elastic.logs/multiline.negate: "false"
co.elastic.logs/multiline.match: after
spec:
containers:
- name: mikrotik-exporter
image: harbor.k-space.ee/k-space/mikrotik-exporter:latest
env:
- name: MIKROTIK_USER
value: netpoller
envFrom:
- secretRef:
name: mikrotik-exporter
nodeSelector:
dedicated: monitoring
tolerations:
- key: dedicated
operator: Equal
value: monitoring
effect: NoSchedule
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- topologyKey: "kubernetes.io/hostname"
---
kind: Service
apiVersion: v1
metadata:
name: mikrotik-exporter
spec:
type: ClusterIP
ports:
- name: http
port: 80
protocol: TCP
targetPort: 3001
selector:
app: mikrotik-exporter


@@ -0,0 +1,443 @@
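# Probes for standalone hosts running node_exporter: the relabeling below
# copies each static target into __address__, so Prometheus scrapes the listed
# host:port directly and the prober url is only a placeholder.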
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
name: nodes-proxmox
spec:
targets:
staticConfig:
static:
- nas.mgmt.k-space.ee:9100
- pve1.proxmox.infra.k-space.ee:9100
- pve8.proxmox.infra.k-space.ee:9100
- pve9.proxmox.infra.k-space.ee:9100
relabelingConfigs:
- sourceLabels: [__param_target]
targetLabel: instance
- sourceLabels: [__param_target]
targetLabel: __address__
prober:
url: localhost
path: /metrics
metricRelabelings:
- sourceLabels: [__address__]
targetLabel: target
---
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
name: nodes-misc
spec:
targets:
staticConfig:
static:
- sprucecone.infra.k-space.ee:9100
- cedarcone.infra.k-space.ee:9100
relabelingConfigs:
- sourceLabels: [__param_target]
targetLabel: instance
- sourceLabels: [__param_target]
targetLabel: __address__
prober:
url: localhost
path: /metrics
metricRelabelings:
- sourceLabels: [__address__]
targetLabel: target
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: node-exporter
spec:
groups:
- name: node-exporter
rules:
- alert: ZfsOfflinePool
expr: node_zfs_zpool_state{state!="online"} > 0
for: 1m
labels:
severity: critical
annotations:
summary: ZFS offline pool (instance {{ $labels.instance }})
description: "A ZFS zpool is in a unexpected state: {{ $labels.state }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostHighLoad
expr: sum(node_load1{}) by (instance) / count(node_cpu_seconds_total{mode="user"}) by (instance) > 2.5
for: 15m
labels:
severity: warning
annotations:
summary: Host under high load
description: Many processes are queued up for execution
- alert: HostOutOfMemory
expr: (node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_MemFree_bytes ) / node_memory_MemTotal_bytes * 100 < 20
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: Node memory is filling up (< 10% left)
- alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
for: 2m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: The node is under heavy memory pressure. High rate of major page faults
- alert: HostUnusualNetworkThroughputIn
expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) > 160e+06
for: 1h
labels:
severity: warning
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: Host network interfaces are probably receiving too much data (> 160 MB/s)
- alert: HostUnusualNetworkThroughputOut
expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) > 160e+06
for: 1h
labels:
severity: warning
annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: Host network interfaces are probably sending too much data (> 160 MB/s)
- alert: HostUnusualDiskReadRate
expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) > 50000000
for: 1h
labels:
severity: warning
annotations:
summary: Host unusual disk read rate (instance {{ $labels.instance }})
description: Disk is probably reading too much data (> 50 MB/s)
- alert: HostUnusualDiskWriteRate
expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) > 50000000
for: 1h
labels:
severity: warning
annotations:
summary: Host unusual disk write rate (instance {{ $labels.instance }})
description: Disk is probably writing too much data (> 50 MB/s)
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: Disk is almost full (< 10% left)
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostDiskWillFillIn24Hours
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
description: Filesystem is predicted to run out of space within the next 24 hours at current write rate
- alert: HostOutOfInodes
expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host out of inodes (instance {{ $labels.instance }})
description: Disk is almost running out of available inodes (< 10% left)
- alert: HostInodesWillFillIn24Hours
expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate
- alert: HostUnusualDiskReadLatency
expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk read latency (instance {{ $labels.instance }})
description: Disk latency is growing (read operations > 100ms)
- alert: HostUnusualDiskWriteLatency
expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write latency (instance {{ $labels.instance }})
description: Disk latency is growing (write operations > 100ms)
- alert: HostCpuStealNoisyNeighbor
expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description: CPU steal is > 10%. A noisy neighbor is killing VM performance, or a spot instance may be out of credit.
# 1000 context switches is an arbitrary number.
# Alert threshold depends on nature of application.
# Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
- alert: HostContextSwitching
expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 50000
for: 0m
labels:
severity: warning
annotations:
summary: Host context switching (instance {{ $labels.instance }})
description: Context switching is growing on node (> 50000 / s)
- alert: HostSwapIsEnabled
expr: node_memory_SwapTotal_bytes > 0
for: 0m
labels:
severity: warning
annotations:
summary: Swap is discouraged nowadays
- alert: HostPhysicalComponentTooHot
expr: node_hwmon_temp_celsius > 75
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
description: Physical hardware component too hot
- alert: HostNodeOvertemperatureAlarm
expr: node_hwmon_temp_alarm == 1
for: 0m
labels:
severity: critical
annotations:
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
description: Physical node temperature alarm triggered
- alert: HostRaidArrayGotInactive
expr: node_md_state{state="inactive"} > 0
for: 0m
labels:
severity: critical
annotations:
summary: Host RAID array got inactive (instance {{ $labels.instance }})
description: RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.
- alert: HostRaidDiskFailure
expr: node_md_disks{state="failed"} > 0
for: 2m
labels:
severity: warning
annotations:
summary: Host RAID disk failure (instance {{ $labels.instance }})
description: At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap
- alert: HostOomKillDetected
expr: increase(node_vmstat_oom_kill[1m]) > 0
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{ $labels.instance }})
description: OOM kill detected
- alert: HostEdacCorrectableErrorsDetected
expr: increase(node_edac_correctable_errors_total[1m]) > 0
for: 0m
labels:
severity: info
annotations:
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'
- alert: HostEdacUncorrectableErrorsDetected
expr: node_edac_uncorrectable_errors_total > 0
for: 0m
labels:
severity: warning
annotations:
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'
- alert: HostNetworkReceiveErrors
expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Receive Errors (instance {{ $labels.instance }})
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last five minutes.'
- alert: HostNetworkTransmitErrors
expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last five minutes.'
- alert: HostNetworkInterfaceSaturated
expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
for: 1m
labels:
severity: warning
annotations:
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
description: The network interface "{{ $labels.interface }}" on "{{ $labels.instance }}" is getting overloaded.
- alert: HostNetworkBondDegraded
expr: node_bonding_active != node_bonding_slaves {master=~"bond.*"}
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Bond Degraded
- alert: HostConntrackLimit
expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
for: 5m
labels:
severity: warning
annotations:
summary: Host conntrack limit (instance {{ $labels.instance }})
description: The number of conntrack entries is approaching the limit
- alert: HostClockSkew
expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
for: 2m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{ $labels.instance }})
description: Clock skew detected. Clock is out of sync.
- alert: HostClockNotSynchronising
expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{ $labels.instance }})
description: Clock not synchronising.
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: smart
spec:
groups:
- name: smart
rules:
- alert: SmartSSDWriteRateTooHigh
expr: rate(smartmon_total_lbas_written_raw_value[72h]) * 512 > 10000000
for: 5m
labels:
severity: warning
annotations:
summary: SSD write rate exceeds 10MB/s
description: At this rate the SSD will be worn out before warranty period expires
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: temperatures
spec:
groups:
- name: temperatures
rules:
- alert: HighDiskTemperature
expr: smartmon_airflow_temperature_cel_raw_value > 45 or smartmon_temperature_celsius_raw_value > 45
for: 10m
labels:
severity: critical
annotations:
summary: High HDD/SSD temperature indicates high ambient temperature
- alert: HighChipsetTemperature
expr: node_hwmon_temp_celsius > 65
for: 10m
labels:
severity: warning
annotations:
summary: High chipset (CPU, NB) temperature indicates insufficient or failing fans
- alert: LowDiskTemperature
expr: smartmon_airflow_temperature_cel_raw_value < 10 or smartmon_temperature_celsius_raw_value < 10
for: 10m
labels:
severity: critical
annotations:
summary: Low HDD/SSD temperature indicates low ambient temperature and stuck server room exhaust fan relay
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: node-exporter
spec:
selector:
matchLabels:
app: node-exporter
podMetricsEndpoints:
- port: web
scrapeTimeout: 30s
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: node-exporter
---
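# node_exporter runs on every node with hostNetwork/hostPID so it sees the
# node's real network and process view; it listens on 9101 and is scraped via
# the node-exporter PodMonitor above.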
apiVersion: apps/v1
kind: DaemonSet
metadata:
labels:
app: node-exporter
name: node-exporter
annotations:
keel.sh/policy: force
keel.sh/trigger: poll
keel.sh/pollSchedule: "@midnight"
spec:
selector:
matchLabels:
app: node-exporter
template:
metadata:
labels:
app: node-exporter
spec:
containers:
- name: node-exporter
args:
- --web.listen-address=0.0.0.0:9101
- --path.sysfs=/host/sys
- --path.rootfs=/host/root
- --no-collector.wifi
- --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)
- --collector.netclass.ignored-devices=^(veth.*|[a-f0-9]{15})$
- --collector.netdev.device-exclude=^(veth.*|[a-f0-9]{15})$
image: prom/node-exporter:v1.3.1
resources:
limits:
cpu: 50m
memory: 180Mi
requests:
cpu: 5m
memory: 20Mi
volumeMounts:
- mountPath: /host/sys
mountPropagation: HostToContainer
name: sys
readOnly: true
- mountPath: /host/root
mountPropagation: HostToContainer
name: root
readOnly: true
ports:
- containerPort: 9101
name: web
securityContext:
runAsGroup: 65532
runAsNonRoot: true
runAsUser: 65532
readOnlyRootFilesystem: true
hostNetwork: true
hostPID: true
securityContext:
runAsNonRoot: true
runAsUser: 65534
serviceAccountName: node-exporter
tolerations:
- operator: Exists
volumes:
- hostPath:
path: /sys
name: sys
- hostPath:
path: /
name: root


@@ -0,0 +1,172 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: snmp-exporter
spec:
replicas: 2
selector:
matchLabels:
app: snmp-exporter
template:
metadata:
labels:
app: snmp-exporter
spec:
containers:
- image: prom/snmp-exporter:latest
name: snmp-exporter
imagePullPolicy: Always
securityContext:
runAsNonRoot: true
runAsUser: 1000
readOnlyRootFilesystem: true
ports:
- containerPort: 9116
name: exporter
livenessProbe:
httpGet:
path: /health
port: exporter
readinessProbe:
httpGet:
path: /health
port: exporter
volumeMounts:
- name: snmp-exporter
mountPath: /etc/snmp_exporter
volumes:
- name: snmp-exporter
configMap:
name: snmp-exporter
nodeSelector:
dedicated: monitoring
tolerations:
- key: dedicated
operator: Equal
value: monitoring
effect: NoSchedule
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: app
operator: In
values:
- snmp-exporter
topologyKey: "kubernetes.io/hostname"
---
kind: Service
apiVersion: v1
metadata:
name: snmp-exporter
spec:
type: ClusterIP
ports:
- name: exporter
port: 9116
protocol: TCP
selector:
app: snmp-exporter
---
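# SNMP probes: the module names used below (rfc1628_ups, printer_mib,
# epson_beamer) must exist in the snmp.yml ConfigMap created per the README.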
kind: Probe
apiVersion: monitoring.coreos.com/v1
metadata:
name: ups
spec:
interval: 60s
module: rfc1628_ups
prober:
url: snmp-exporter:9116
path: /snmp
targets:
staticConfig:
static:
- ups-4.mgmt.k-space.ee
- ups-5.mgmt.k-space.ee
- ups-6.mgmt.k-space.ee
- ups-7.mgmt.k-space.ee
- ups-8.mgmt.k-space.ee
- ups-9.mgmt.k-space.ee
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: ups
spec:
groups:
- name: ups
rules:
- alert: UPSBatteryLost
annotations:
summary: One or more UPS-es have degraded batteries.
expr: snmp_upsBatteryStatus{upsBatteryStatus!="batteryNormal"} > 0
for: 1m
labels:
severity: critical
- alert: UPSPowerLost
annotations:
summary: One or more UPS-es are not in normal operation mode. This means either
power is lost or the UPS was overloaded and is now in bypass mode.
expr: sum(snmp_upsOutputSource { upsOutputSource = 'normal' }) < 6
for: 1m
labels:
severity: critical
- alert: UPSExcessivelyLoaded
annotations:
summary: One or more UPS-es are loaded more than 50%. Make sure load is balanced
across the UPS-es and that no UPS stays above 50% load.
expr: snmp_upsOutputPercentLoad > 80
for: 1h
labels:
severity: critical
---
kind: Probe
apiVersion: monitoring.coreos.com/v1
metadata:
name: printer
spec:
interval: 60s
scrapeTimeout: 50s
module: printer_mib
prober:
url: snmp-exporter:9116
path: /snmp
targets:
staticConfig:
static:
- mfp-cyber.pub.k-space.ee
- mfp-chaos.pub.k-space.ee
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: printer
spec:
groups:
- name: printer
rules:
- alert: PrinterNeedsAttention
annotations:
summary: Printer is in an error state. If the underlying reason is 'low on paper',
make sure there is enough paper near the printer. If not, drop a line to
accounting@k-space.ee to order more office supplies.
expr: snmp_hrPrinterDetectedErrorState == 1
for: 0m
labels:
severity: warning
---
kind: Probe
apiVersion: monitoring.coreos.com/v1
metadata:
name: beamer
spec:
interval: 60s
module: epson_beamer
prober:
url: snmp-exporter:9116
path: /snmp
targets:
staticConfig:
static:
- beamer-cyber.sec.k-space.ee

prometheus-operator/snmp.yml (new file, 1272 lines): diff suppressed because it is too large.