Migrate to Prometheus Operator

2022-09-11 16:24:35 +03:00
parent ee4b1ddf57
commit 1045ed2f26
30 changed files with 32403 additions and 129 deletions
--- a/argocd/applications/prometheus-operator.yml
+++ b/argocd/applications/prometheus-operator.yml
@@ -1,17 +1,14 @@
 apiVersion: argoproj.io/v1alpha1
 kind: Application
 metadata:
-  name: monitoring
+  name: prometheus-operator
  namespace: argocd
 spec:
  project: default
  source:
    repoURL: 'git@git.k-space.ee:k-space/kube.git'
-    path: monitoring
+    path: prometheus-operator
    targetRevision: HEAD
  destination:
    server: 'https://kubernetes.default.svc'
-    namespace: monitoring
+    namespace: prometheus-operator
  syncPolicy:
    syncOptions:
      - CreateNamespace=true
--- a/argocd/monitoring.yml
+++ b/argocd/monitoring.yml
@@ -0,0 +1,33 @@
 ---
 # Scrape targets for Argo CD: the selector is empty, so no label filtering
 # is applied; container ports named `metrics` and `controller` are scraped.
 apiVersion: monitoring.coreos.com/v1
 kind: PodMonitor
 metadata:
  name: argocd
 spec:
  podMetricsEndpoints:
  - port: metrics
  - port: controller
  selector: {}
 ---
 # Alerting rules derived from the argocd_app_info metric.
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
  name: argocd
 spec:
  groups:
  - name: argocd
    rules:
    # Any destination namespace with at least one non-synced app for 8 hours.
    - alert: ArgoNotSynced
      expr: 'sum by (dest_namespace) (argocd_app_info{sync_status!="Synced"}) > 0'
      for: 8h
      labels:
        severity: warning
      annotations:
        summary: Some applications in Argo are out of sync
    # Any application reporting a health status other than Healthy for 30 minutes.
    - alert: ArgoNotHealthy
      expr: 'argocd_app_info{health_status!="Healthy"}'
      for: 30m
      labels:
        severity: warning
      annotations:
        summary: Some applications in Argo are not healthy
--- a/argocd/values.yaml
+++ b/argocd/values.yaml
@@ -77,10 +77,6 @@ server:
  metrics:
    enabled: true
    service:
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8083"
 # We don't use ApplicationSet CRD-s (yet)
 applicationSet:
@@ -89,26 +85,14 @@ applicationSet:
 repoServer:
  metrics:
    enabled: true
    service:
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8084"
 notifications:
  metrics:
    enabled: true
    service:
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "9001"
 controller:
  metrics:
    enabled: true
    service:
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8082"
 configs:
  secret:
--- a/camtiler/application.yml
+++ b/camtiler/application.yml
@@ -10,11 +10,11 @@ spec:
  replicas: 2
  selector:
    matchLabels:
-      app: camtiler
+      app.kubernetes.io/name: camtiler
  template:
    metadata:
      labels:
-        app: camtiler
+        app.kubernetes.io/name: camtiler
        component: camtiler
    spec:
      serviceAccountName: camtiler
@@ -25,6 +25,9 @@ spec:
            readOnlyRootFilesystem: true
            runAsNonRoot: true
            runAsUser: 1000
          ports:
            - containerPort: 5000
              name: "http"
 ---
 apiVersion: apps/v1
 kind: Deployment
@@ -38,11 +41,11 @@ spec:
  replicas: 2
  selector:
    matchLabels:
-      app: log-viewer-frontend
+      app.kubernetes.io/name: log-viewer-frontend
  template:
    metadata:
      labels:
-        app: log-viewer-frontend
+        app.kubernetes.io/name: log-viewer-frontend
    spec:
      containers:
        - name: log-viewer-frontend
@@ -64,11 +67,11 @@ spec:
  replicas: 3
  selector:
    matchLabels:
-      app: log-viewer-backend
+      app.kubernetes.io/name: log-viewer-backend
  template:
    metadata:
      labels:
-        app: log-viewer-backend
+        app.kubernetes.io/name: log-viewer-backend
    spec:
      containers:
        - name: log-backend-backend
@@ -109,7 +112,7 @@ metadata:
 spec:
  type: ClusterIP
  selector:
-    app: log-viewer-frontend
+    app.kubernetes.io/name: log-viewer-frontend
  ports:
  - protocol: TCP
    port: 3003
@@ -121,7 +124,7 @@ metadata:
 spec:
  type: ClusterIP
  selector:
-    app: log-viewer-backend
+    app.kubernetes.io/name: log-viewer-backend
  ports:
  - protocol: TCP
    port: 3002
@@ -130,14 +133,12 @@ apiVersion: v1
 kind: Service
 metadata:
  name: camtiler
  annotations:
    prometheus.io/scrape: 'true'
  labels:
    component: camtiler
 spec:
  type: ClusterIP
  selector:
-    app: camtiler
+    app.kubernetes.io/name: camtiler
    component: camtiler
  ports:
  - protocol: TCP
@@ -254,7 +255,7 @@ spec:
          kubernetes.io/metadata.name: monitoring
      podSelector:
        matchLabels:
-          app: prometheus
+          app.kubernetes.io/name: prometheus
  egress:
    - to:
        - ipBlock:
@@ -263,7 +264,7 @@ spec:
    - to:
      - podSelector:
          matchLabels:
-            app: mongodb-svc
+            app.kubernetes.io/name: mongodb-svc
      ports:
      - port: 27017
    - to:
@@ -298,7 +299,7 @@ spec:
          kubernetes.io/metadata.name: monitoring
      podSelector:
        matchLabels:
-          app: prometheus
+          app.kubernetes.io/name: prometheus
  - from:
    - namespaceSelector:
        matchLabels:
@@ -314,7 +315,7 @@ metadata:
 spec:
  podSelector:
    matchLabels:
-      app: log-viewer-backend
+      app.kubernetes.io/name: log-viewer-backend
  policyTypes:
  - Ingress
  - Egress
@@ -322,13 +323,11 @@ spec:
    - to:
      - podSelector:
          matchLabels:
-            app: mongodb-svc
+            app.kubernetes.io/name: mongodb-svc
    - to:
-      - podSelector:
+      - ipBlock:
-          matchLabels:
+          # Minio is accessed thru public endpoint via Traefik
-            v1.min.io/tenant: minio
+          cidr: 193.40.103.0/24
      ports:
      - port: 9000
  ingress:
  - from:
    - namespaceSelector:
@@ -345,7 +344,7 @@ metadata:
 spec:
  podSelector:
    matchLabels:
-      app: log-viewer-frontend
+      app.kubernetes.io/name: log-viewer-frontend
  policyTypes:
  - Ingress
  - Egress
@@ -458,7 +457,6 @@ spec:
             required: ["target"]
         required: ["spec"]
 ---
 ---
 apiVersion: codemowers.io/v1alpha1
 kind: ClusterOperator
 metadata:
@@ -480,7 +478,7 @@ spec:
      spec:
        type: ClusterIP
        selector:
-          app: foobar
+          app.kubernetes.io/name: foobar
          component: camdetect
        ports:
        - protocol: TCP
@@ -506,14 +504,11 @@ spec:
            maxUnavailable: 1
        selector:
          matchLabels:
-            app: foobar
+            app.kubernetes.io/name: foobar
        template:
          metadata:
            annotations:
              prometheus.io/scrape: 'true'
              prometheus.io/port: '5000'
            labels:
-              app: foobar
+              app.kubernetes.io/name: foobar
              component: camdetect
          spec:
            containers:
@@ -590,9 +585,55 @@ spec:
              whenUnsatisfiable: DoNotSchedule
              labelSelector:
                matchLabels:
-                  app: foobar
+                  app.kubernetes.io/name: foobar
                  component: camdetect
 ---
 # Scrapes the container port named `http`; the empty selector applies no
 # label filtering to the pods considered.
 apiVersion: monitoring.coreos.com/v1
 kind: PodMonitor
 metadata:
  name: camtiler
 spec:
  podMetricsEndpoints:
  - port: http
  selector: {}
 ---
 # Alerts for the camera pipeline, all built on camdetect_* metrics.
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
  name: cameras
 spec:
  groups:
  - name: cameras
    rules:
    # Received-frame rate below 1/s over a 2 minute window.
    - alert: CameraLost
      annotations:
        summary: Camera feed stopped
      expr: rate(camdetect_rx_frames_total[2m]) < 1
      for: 2m
      labels:
        severity: warning
    # NOTE(review): this matcher relies on an `app` label being present on the
    # scraped series; confirm it still exists now that the pods are labelled
    # with app.kubernetes.io/name and discovered through a PodMonitor.
    - alert: CameraServerRoomMotion
      annotations:
        summary: Motion was detected in server room
      expr: camdetect_event_active {app="camdetect-server-room"} > 0
      for: 1m
      labels:
        severity: warning
    # More than one dropped upload frame per second, sustained for 5 minutes.
    - alert: CameraSlowUploads
      annotations:
        summary: Motion detect snapshots are piling up and not getting uploaded to S3
      expr: rate(camdetect_upload_dropped_frames_total[2m]) > 1
      for: 5m
      labels:
        severity: warning
    # More than one dropped download frame per second, sustained for 5 minutes.
    - alert: CameraSlowProcessing
      annotations:
        summary: Motion detection processing pipeline is not keeping up with incoming frames
      expr: rate(camdetect_download_dropped_frames_total[2m]) > 1
      for: 5m
      labels:
        severity: warning
 ---
 apiVersion: k-space.ee/v1alpha1
 kind: Camera
 metadata:
--- a/drone/application.yml
+++ b/drone/application.yml
@@ -42,9 +42,6 @@ spec:
    metadata:
      labels:
        app: drone
      annotations:
        prometheus.io/port: "80"
        prometheus.io/scrape: "true"
    spec:
      automountServiceAccountToken: false
      securityContext:
--- a/elastic-system/application.yml
+++ b/elastic-system/application.yml
@@ -10,6 +10,9 @@ spec:
  kibanaRef:
    name: kibana
  config:
    http:
      enabled: true
      port: 5066
    filebeat:
      autodiscover:
        providers:
@@ -81,6 +84,14 @@ spec:
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
        - name: exporter
          image: sepa/beats-exporter
          args:
            - -p=5066
          ports:
            - containerPort: 8080
              name: exporter
              protocol: TCP
        volumes:
        - name: varlogcontainers
          hostPath:
--- a/freescout/application.yml
+++ b/freescout/application.yml
@@ -0,0 +1,16 @@
 ---
 # Fires when `time() - wildduck_last_login` for the info@ or accounting@
 # mailbox stays above 300 for 10 minutes.
 # NOTE(review): presumably the metric is a Unix timestamp of the last login,
 # making the threshold 300 seconds; confirm against the exporter.
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
  name: freescout
 spec:
  groups:
    - name: freescout
      rules:
      - alert: FreescoutSyncBroken
        annotations:
          summary: Freescout mailbox synchronization is broken
        expr: 'time() - wildduck_last_login{email=~"(info|accounting)@k-space.ee"} > 300'
        for: 10m
        labels:
          severity: warning
--- a/kube-system/README.md
+++ b/kube-system/README.md
@@ -0,0 +1,3 @@
 ```
 kubectl apply -n kube-system -f kube-state-metrics.yml
 ```
--- a/kube-system/kube-state-metrics.yml
+++ b/kube-system/kube-state-metrics.yml
@@ -0,0 +1,221 @@
 ---
 # Identity for kube-state-metrics. Token automounting is disabled on the
 # account itself; the Deployment in this file sets it to true in its pod spec.
 apiVersion: v1
 kind: ServiceAccount
 metadata:
  labels:
    app.kubernetes.io/name: kube-state-metrics
  name: kube-state-metrics
 automountServiceAccountToken: false
 ---
 # Cluster-wide permissions for kube-state-metrics: list/watch on each
 # resource kind named below, plus `create` on token and access reviews.
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
  labels:
    app.kubernetes.io/name: kube-state-metrics
  name: kube-state-metrics
 rules:
 # Core API group.
 - apiGroups: [""]
  resources:
  - configmaps
  - secrets
  - nodes
  - pods
  - services
  - serviceaccounts
  - resourcequotas
  - replicationcontrollers
  - limitranges
  - persistentvolumeclaims
  - persistentvolumes
  - namespaces
  - endpoints
  verbs: [list, watch]
 - apiGroups: [apps]
  resources:
  - statefulsets
  - daemonsets
  - deployments
  - replicasets
  verbs: [list, watch]
 - apiGroups: [batch]
  resources:
  - cronjobs
  - jobs
  verbs: [list, watch]
 - apiGroups: [autoscaling]
  resources:
  - horizontalpodautoscalers
  verbs: [list, watch]
 # The two review APIs are the only rules granting `create`.
 - apiGroups: [authentication.k8s.io]
  resources:
  - tokenreviews
  verbs: [create]
 - apiGroups: [authorization.k8s.io]
  resources:
  - subjectaccessreviews
  verbs: [create]
 - apiGroups: [policy]
  resources:
  - poddisruptionbudgets
  verbs: [list, watch]
 - apiGroups: [certificates.k8s.io]
  resources:
  - certificatesigningrequests
  verbs: [list, watch]
 - apiGroups: [storage.k8s.io]
  resources:
  - storageclasses
  - volumeattachments
  verbs: [list, watch]
 - apiGroups: [admissionregistration.k8s.io]
  resources:
  - mutatingwebhookconfigurations
  - validatingwebhookconfigurations
  verbs: [list, watch]
 - apiGroups: [networking.k8s.io]
  resources:
  - networkpolicies
  - ingresses
  verbs: [list, watch]
 - apiGroups: [coordination.k8s.io]
  resources:
  - leases
  verbs: [list, watch]
 - apiGroups: [rbac.authorization.k8s.io]
  resources:
  - clusterrolebindings
  - clusterroles
  - rolebindings
  - roles
  verbs: [list, watch]
 ---
 # Grants the kube-state-metrics ClusterRole to the ServiceAccount of the
 # same name in the kube-system namespace.
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
 metadata:
  labels:
    app.kubernetes.io/name: kube-state-metrics
  name: kube-state-metrics
 subjects:
 - kind: ServiceAccount
  namespace: kube-system
  name: kube-state-metrics
 roleRef:
  kind: ClusterRole
  name: kube-state-metrics
  apiGroup: rbac.authorization.k8s.io
 ---
 # Single-replica kube-state-metrics v2.6.0. Port 8080 (`http-metrics`) serves
 # the liveness probe; port 8081 (`telemetry`) serves the readiness probe.
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  labels:
    app.kubernetes.io/name: kube-state-metrics
  name: kube-state-metrics
 spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: kube-state-metrics
  template:
    metadata:
      labels:
        app.kubernetes.io/name: kube-state-metrics
    spec:
      serviceAccountName: kube-state-metrics
      # Token mounting is switched on at the pod level.
      automountServiceAccountToken: true
      nodeSelector:
        kubernetes.io/os: linux
      containers:
      - name: kube-state-metrics
        image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.6.0
        ports:
        - name: http-metrics
          containerPort: 8080
        - name: telemetry
          containerPort: 8081
        livenessProbe:
          initialDelaySeconds: 5
          timeoutSeconds: 5
          httpGet:
            port: 8080
            path: /healthz
        readinessProbe:
          initialDelaySeconds: 5
          timeoutSeconds: 5
          httpGet:
            port: 8081
            path: /
        # Unprivileged container: fixed UID, read-only root, no capabilities.
        securityContext:
          runAsUser: 65534
          readOnlyRootFilesystem: true
          allowPrivilegeEscalation: false
          capabilities:
            drop:
            - ALL
 ---
 # Headless Service (clusterIP: None) exposing both kube-state-metrics ports
 # by name.
 apiVersion: v1
 kind: Service
 metadata:
  labels:
    app.kubernetes.io/name: kube-state-metrics
  name: kube-state-metrics
 spec:
  clusterIP: None
  selector:
    app.kubernetes.io/name: kube-state-metrics
  ports:
  - name: http-metrics
    targetPort: http-metrics
    port: 8080
  - name: telemetry
    targetPort: telemetry
    port: 8081
 ---
 # Scrapes /metrics on the `http-metrics` port of Services labelled
 # app.kubernetes.io/name=kube-state-metrics, with honorLabels enabled.
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
  name: kube-state-metrics
 spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: kube-state-metrics
  endpoints:
  - port: http-metrics
    path: /metrics
    honorLabels: true
--- a/longhorn-system/README.md
+++ b/longhorn-system/README.md
@@ -7,7 +7,7 @@ and then heavily modified.
 To deploy Longhorn use following:
 ```
-kubectl -n longhorn-system apply -f longhorn.yaml -f ingress.yml
+kubectl -n longhorn-system apply -f application.yml -f application-extras.yml
 ```
 After deploying specify `dedicated=storage:NoSchedule`
--- a/longhorn-system/application-extras.yml
+++ b/longhorn-system/application-extras.yml
@@ -0,0 +1,126 @@
 # Routes longhorn.k-space.ee to the longhorn-frontend Service on port 80,
 # with TLS from the longhorn-tls secret and the traefik-sso middleware.
 apiVersion: networking.k8s.io/v1
 kind: Ingress
 metadata:
  name: longhorn-dashboard
  namespace: longhorn-system
  annotations:
    cert-manager.io/cluster-issuer: default
    external-dns.alpha.kubernetes.io/target: traefik.k-space.ee
    kubernetes.io/ingress.class: traefik
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
    traefik.ingress.kubernetes.io/router.middlewares: traefik-sso@kubernetescrd
    # Kept quoted: annotation values must be strings.
    traefik.ingress.kubernetes.io/router.tls: "true"
 spec:
  tls:
  - secretName: longhorn-tls
    hosts:
    - longhorn.k-space.ee
  rules:
  - host: longhorn.k-space.ee
    http:
      paths:
      - path: '/'
        pathType: Prefix
        backend:
          service:
            name: longhorn-frontend
            port:
              number: 80
 ---
 # Scrapes the container port named `manager`; the empty selector applies no
 # label filtering to the pods considered.
 apiVersion: monitoring.coreos.com/v1
 kind: PodMonitor
 metadata:
  name: manager
 spec:
  podMetricsEndpoints:
    - port: manager
  selector: {}
 ---
 # Longhorn storage alerts. Every rule waits 5 minutes (`for: 5m`) before
 # firing; human-readable text lives in annotations, while labels stay stable
 # per series.
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
  name: longhorn
 spec:
  # Copied from https://longhorn.io/docs/1.2.4/monitoring/alert-rules-example/
  groups:
    - name: longhorn
      rules:
      - alert: LonghornVolumeActualSpaceUsedWarning
        annotations:
          description: The accumulated snapshots for volume use up more space than the volume's capacity
          summary: The actual used space of Longhorn volume is twice the size of the volume capacity.
        expr: longhorn_volume_actual_size_bytes > longhorn_volume_capacity_bytes * 2
        for: 5m
        labels:
          issue: The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high.
          severity: warning
      - alert: LonghornVolumeStatusCritical
        annotations:
          # The text now matches the 5m `for` clause below (it used to say 2 minutes).
          description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for
            more than 5 minutes.
          summary: Longhorn volume {{$labels.volume}} is Fault
        expr: longhorn_volume_robustness == 3
        for: 5m
        labels:
          issue: Longhorn volume {{$labels.volume}} is Fault.
          severity: critical
      - alert: LonghornVolumeStatusWarning
        annotations:
          description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded for
            more than 5 minutes.
          summary: Longhorn volume {{$labels.volume}} is Degraded
        expr: longhorn_volume_robustness == 2
        for: 5m
        labels:
          issue: Longhorn volume {{$labels.volume}} is Degraded.
          severity: warning
      - alert: LonghornNodeStorageWarning
        annotations:
          description: The used storage of node {{$labels.node}} is at {{$value}}% capacity for
            more than 5 minutes.
          summary: The used storage of node is over 70% of the capacity.
        expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70
        for: 5m
        labels:
          issue: The used storage of node {{$labels.node}} is high.
          severity: warning
      - alert: LonghornDiskStorageWarning
        annotations:
          description: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is at {{$value}}% capacity for
            more than 5 minutes.
          summary: The used storage of disk is over 70% of the capacity.
        expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70
        for: 5m
        labels:
          issue: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high.
          severity: warning
      - alert: LonghornNodeDown
        annotations:
          description: There are {{$value}} Longhorn nodes which have been offline for more than 5 minutes.
          summary: Longhorn nodes are offline
        expr: (avg(longhorn_node_count_total) or on() vector(0)) - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0)) > 0
        for: 5m
        labels:
          # Labels are part of an alert's identity, so the sample value must not
          # be templated into them: each change in the number of offline nodes
          # would create a new alert and restart the `for` timer. The count is
          # reported in the description annotation instead.
          issue: Longhorn nodes are offline.
          severity: critical
      # NOTE: "Intance" is a typo inherited from the upstream example; the alert
      # name is left unchanged so existing routes and silences keep matching.
      - alert: LonghornIntanceManagerCPUUsageWarning
        annotations:
          description: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for
            more than 5 minutes.
          summary: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is over 300%.
        expr: (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) * 100 > 300
        for: 5m
        labels:
          issue: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} consumes 3 times the CPU request.
          severity: warning
      - alert: LonghornNodeCPUUsageWarning
        annotations:
          description: Longhorn node {{$labels.node}} has CPU Usage / CPU capacity is {{$value}}% for
            more than 5 minutes.
          summary: Longhorn node {{$labels.node}} experiences high CPU pressure for more than 5m.
        expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90
        for: 5m
        labels:
          issue: Longhorn node {{$labels.node}} experiences high CPU pressure.
          severity: warning
--- a/longhorn-system/application.yml
+++ b/longhorn-system/application.yml
--- a/longhorn-system/ingress.yml
+++ b/longhorn-system/ingress.yml
@@ -1,28 +0,0 @@
 apiVersion: networking.k8s.io/v1
 kind: Ingress
 metadata:
  name: longhorn-dashboard
  namespace: longhorn-system
  annotations:
    kubernetes.io/ingress.class: traefik
    cert-manager.io/cluster-issuer: default
    external-dns.alpha.kubernetes.io/target: traefik.k-space.ee
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
    traefik.ingress.kubernetes.io/router.middlewares: traefik-sso@kubernetescrd
    traefik.ingress.kubernetes.io/router.tls: "true"
 spec:
  rules:
  - host: longhorn.k-space.ee
    http:
      paths:
      - pathType: Prefix
        path: "/"
        backend:
          service:
            name: longhorn-frontend
            port:
              number: 80
  tls:
  - hosts:
    - longhorn.k-space.ee
    secretName: longhorn-tls
--- a/longhorn-system/values.yaml
+++ b/longhorn-system/values.yaml
@@ -1,27 +0,0 @@
 persistence:
  defaultClassReplicaCount: 2
 defaultSettings:
  defaultDataLocality: best-effort
  taintToleration: "dedicated=storage:NoSchedule"
  systemManagedComponentsNodeSelector: "dedicated:storage"
 longhornDriver:
  tolerations:
  - key: dedicated
    operator: Equal
    value: storage
    effect: NoSchedule
 longhornUI:
  tolerations:
  - key: dedicated
    operator: Equal
    value: storage
    effect: NoSchedule
 ingress:
  enabled: true
  host: longhorn.k-space.ee
  tls: true
  tlsSecret: longhorn-tls
--- a/meta-operator/application.yml
+++ b/meta-operator/application.yml
@@ -67,6 +67,11 @@ spec:
                 items:
                   type: object
                   x-kubernetes-preserve-unknown-fields: true
               customresources:
                 type: array
                 items:
                   type: object
                   x-kubernetes-preserve-unknown-fields: true
         required: ["spec"]
 ---
 apiVersion: apps/v1
@@ -178,12 +183,21 @@ rules:
 - apiGroups:
  - codemowers.io
  resources:
  - bindzones
  - clusteroperators
  - keydbs
  verbs:
  - get
  - list
  - watch
 - apiGroups:
  - k-space.ee
  resources:
  - cams
  verbs:
  - get
  - list
  - watch
 ---
 apiVersion: v1
 kind: ServiceAccount
--- a/meta-operator/keydb.yml
+++ b/meta-operator/keydb.yml
@@ -120,7 +120,7 @@ spec:
        type: ClusterIP
        clusterIP: None
        ports:
-        - name: "server"
+        - name: redis
          port: 6379
          protocol: TCP
          targetPort: redis
@@ -137,14 +137,14 @@ spec:
      spec:
        type: ClusterIP
        ports:
-        - name: "server"
+        - name: redis
          port: 6379
          protocol: TCP
          targetPort: redis
-        - name: "redis-exporter"
+        - name: exporter
          port: 9121
          protocol: TCP
-          targetPort: redis-exporter
+          targetPort: exporter
        selector:
          app.kubernetes.io/name: foobar
        sessionAffinity: ClientIP
@@ -163,9 +163,6 @@ spec:
            app.kubernetes.io/name: foobar
        template:
          metadata:
            annotations:
              prometheus.io/port: "9121"
              prometheus.io/scrape: "true"
            labels:
              app.kubernetes.io/name: foobar
          spec:
@@ -237,10 +234,10 @@ spec:
              envFrom:
                - secretRef:
                    name: foobar-secrets
-            - name: redis-exporter
+            - name: exporter
              image: quay.io/oliver006/redis_exporter
              ports:
-              - name: metrics
+              - name: exporter
                containerPort: 9121
              envFrom:
                - secretRef:
--- a/metallb-system/application.yml
+++ b/metallb-system/application.yml
@@ -1,4 +1,14 @@
 ---
 # Scrapes the container port named `monitoring` in the metallb-system
 # namespace; the empty selector applies no label filtering.
 apiVersion: monitoring.coreos.com/v1
 kind: PodMonitor
 metadata:
  namespace: metallb-system
  name: monitoring
 spec:
  podMetricsEndpoints:
    - port: monitoring
  selector: {}
 ---
 apiVersion: metallb.io/v1beta1
 kind: MetalLB
 metadata:
--- a/prometheus-operator/README.md
+++ b/prometheus-operator/README.md
@@ -0,0 +1,19 @@
 # Prometheus operator
 ```
 curl -L https://github.com/prometheus-operator/prometheus-operator/releases/download/v0.59.0/bundle.yaml | sed -e 's/namespace: default/namespace: prometheus-operator/g' > bundle.yml
 kubectl create namespace prometheus-operator
 kubectl apply --server-side -n prometheus-operator -f bundle.yml
 kubectl delete -n prometheus-operator configmap snmp-exporter
 kubectl create -n prometheus-operator configmap snmp-exporter --from-file=snmp.yml
 kubectl apply -n prometheus-operator -f application.yml -f node-exporter.yml -f blackbox-exporter.yml -f snmp-exporter.yml -f mikrotik-exporter.yml
 ```
 # Mikrotik exporter
 ```
 # Creates the mikrotik-exporter secret. The router password is read from the
 # MIKROTIK_PASSWORD environment variable so that the literal value is never
 # committed to the repository; the bearer token is 30 random base64 characters.
 kubectl create -n prometheus-operator secret generic mikrotik-exporter \
  --from-literal=MIKROTIK_PASSWORD="$MIKROTIK_PASSWORD" \
  --from-literal=PROMETHEUS_BEARER_TOKEN=$(cat /dev/urandom | base64 | head -c 30)
 ```
--- a/prometheus-operator/application.yml
+++ b/prometheus-operator/application.yml
@@ -0,0 +1,762 @@
 ---
 # Generic PodMonitor: scrapes container ports named `exporter` or `metrics`
 # on any pod it selects (the label selector is empty).
 # NOTE(review): an empty namespaceSelector is expected to limit discovery to
 # this PodMonitor's own namespace; confirm that is the intent.
 apiVersion: monitoring.coreos.com/v1
 kind: PodMonitor
 metadata:
  name: metrics
 spec:
  podMetricsEndpoints:
    - port: exporter
    - port: metrics
  selector: {}
  namespaceSelector: {}
 ---
 # Three-replica Alertmanager pinned to nodes labelled dedicated=monitoring,
 # tolerating the matching NoSchedule taint, and running as a non-root user.
 apiVersion: monitoring.coreos.com/v1
 kind: Alertmanager
 metadata:
  name: alertmanager
 spec:
  replicas: 3
  serviceAccountName: alertmanager
  externalUrl: http://am.k-space.ee/
  routePrefix: '/'
  securityContext:
    runAsNonRoot: true
    runAsUser: 1000
    runAsGroup: 2000
    fsGroup: 2000
  nodeSelector:
    dedicated: monitoring
  tolerations:
    - key: dedicated
      operator: Equal
      value: monitoring
      effect: NoSchedule
 ---
 # Identity referenced by the Alertmanager resource's serviceAccountName.
 apiVersion: v1
 kind: ServiceAccount
 metadata:
  name: alertmanager
 ---
 # Two-replica, single-shard Prometheus pinned to dedicated=monitoring nodes.
 # Every selector below is empty, i.e. no label filtering is applied to the
 # ServiceMonitor, PodMonitor, Probe and PrometheusRule objects it considers.
 apiVersion: monitoring.coreos.com/v1
 kind: Prometheus
 metadata:
  name: prometheus
 spec:
  replicas: 2
  shards: 1
  serviceAccountName: prometheus
  externalUrl: 'http://prom.k-space.ee/'
  securityContext:
    runAsNonRoot: true
    runAsUser: 1000
    runAsGroup: 2000
    fsGroup: 2000
  nodeSelector:
    dedicated: monitoring
  tolerations:
    - key: dedicated
      operator: Equal
      value: monitoring
      effect: NoSchedule
  alerting:
    alertmanagers:
      # NOTE(review): this expects a Service named `alertmanager` exposing a
      # port named `http` in the prometheus-operator namespace; confirm it is
      # defined elsewhere.
      - name: alertmanager
        namespace: prometheus-operator
        port: http
        pathPrefix: '/'
        apiVersion: v2
  podMonitorNamespaceSelector: {}
  podMonitorSelector: {}
  serviceMonitorNamespaceSelector: {}
  serviceMonitorSelector: {}
  probeNamespaceSelector: {}
  probeSelector: {}
  ruleNamespaceSelector: {}
  ruleSelector: {}
  # Size-based retention is set below the 100Gi volume request.
  retentionSize: 80GB
  storage:
    volumeClaimTemplate:
      spec:
        storageClassName: local-path
        accessModes:
        - ReadWriteOnce
        resources:
          requests:
            storage: 100Gi
 ---
 # Identity referenced by the Prometheus resource's serviceAccountName and
 # bound to the `prometheus` ClusterRole by the ClusterRoleBinding in this file.
 apiVersion: v1
 kind: ServiceAccount
 metadata:
  name: prometheus
 ---
 # Read access for Prometheus: get/list/watch on the discovery resources,
 # `get` on configmaps, and `get` on the /metrics non-resource URL.
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
  name: prometheus
 rules:
 - apiGroups:
  - ""
  resources:
  - nodes
  - nodes/metrics
  - services
  - endpoints
  - pods
  verbs:
  - get
  - list
  - watch
 - apiGroups:
  - ""
  resources:
  - configmaps
  verbs:
  - get
 - apiGroups:
  - networking.k8s.io
  resources:
  - ingresses
  verbs:
  - get
  - list
  - watch
 - nonResourceURLs:
  - /metrics
  verbs:
  - get
 ---
 # Grants the `prometheus` ClusterRole to the `prometheus` ServiceAccount in
 # the prometheus-operator namespace.
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
 metadata:
  name: prometheus
 subjects:
 - kind: ServiceAccount
  namespace: prometheus-operator
  name: prometheus
 roleRef:
  kind: ClusterRole
  name: prometheus
  apiGroup: rbac.authorization.k8s.io
 ---
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
  name: prometheus
 spec:
  groups:
  - name: prometheus
    rules:
    - alert: PrometheusJobMissing
      annotations:
        description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}\n \
          \ LABELS = {{ $labels }}"
        summary: Prometheus job missing (instance {{ $labels.instance }})
      expr: absent(up{job="prometheus-operator/prometheus"})
      for: 0m
      labels:
        severity: warning
    - alert: PrometheusTargetMissing
      annotations:
        description: "A Prometheus target has disappeared. An exporter might be crashed.\n\
          \  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus target missing (instance {{ $labels.instance }})
      expr: up == 0
      for: 5m
      labels:
        severity: critical
    # Fires when every target of a job is down. `sum` adds up the 0/1 values
    # of `up`, so it reaches 0 only when no target is healthy; `count` would
    # count the series themselves and could never equal 0. The aggregation
    # keeps only the `job` label, so the summary reports the job.
    - alert: PrometheusAllTargetsMissing
      annotations:
        description: "A Prometheus job does not have living target anymore.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus all targets missing (job {{ $labels.job }})
      expr: sum by (job) (up) == 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusConfigurationReloadFailure
      annotations:
        description: "Prometheus configuration reload error\n  VALUE = {{ $value }}\n\
          \  LABELS = {{ $labels }}"
        summary: Prometheus configuration reload failure (instance {{ $labels.instance
          }})
      expr: prometheus_config_last_reload_successful != 1
      for: 0m
      labels:
        severity: warning
    # Fires when the process start time changed more than twice in 15 minutes.
    # Label regexes are fully anchored, so the optional `(.+/)?` prefix lets the
    # matcher accept namespaced job names such as
    # `prometheus-operator/prometheus` as well as the bare names.
    - alert: PrometheusTooManyRestarts
      annotations:
        description: "Prometheus has restarted more than twice in the last 15 minutes.\
          \ It might be crashlooping.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels\
          \ }}"
        summary: Prometheus too many restarts (instance {{ $labels.instance }})
      expr: >-
        changes(process_start_time_seconds{job=~"(.+/)?(prometheus|pushgateway|alertmanager)"}[15m])
        > 2
      for: 0m
      labels:
        severity: warning
    - alert: PrometheusAlertmanagerJobMissing
      annotations:
        description: "A Prometheus AlertManager job has disappeared\n  VALUE = {{\
          \ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus AlertManager job missing (instance {{ $labels.instance
          }})
      expr: absent(up{job="prometheus-operator/alertmanager"})
      for: 0m
      labels:
        severity: warning
    - alert: PrometheusAlertmanagerConfigurationReloadFailure
      annotations:
        description: "AlertManager configuration reload error\n  VALUE = {{ $value\
          \ }}\n  LABELS = {{ $labels }}"
        summary: Prometheus AlertManager configuration reload failure (instance {{
          $labels.instance }})
      expr: alertmanager_config_last_reload_successful != 1
      for: 0m
      labels:
        severity: warning
    - alert: PrometheusAlertmanagerConfigNotSynced
      annotations:
        description: "Configurations of AlertManager cluster instances are out of\
          \ sync\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus AlertManager config not synced (instance {{ $labels.instance
          }})
      expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
      for: 0m
      labels:
        severity: warning
    - alert: PrometheusNotConnectedToAlertmanager
      annotations:
        description: "Prometheus cannot connect the alertmanager\n  VALUE = {{ $value\
          \ }}\n  LABELS = {{ $labels }}"
        summary: Prometheus not connected to alertmanager (instance {{ $labels.instance
          }})
      expr: prometheus_notifications_alertmanagers_discovered < 1
      for: 0m
      labels:
        severity: critical
    # Any rule evaluation failure counted over the last 3 minutes.
    - alert: PrometheusRuleEvaluationFailures
      expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
        description: |-
          Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}
    # Any template text expansion failure counted over the last 3 minutes.
    - alert: PrometheusTemplateTextExpansionFailures
      expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
        description: |-
          Prometheus encountered {{ $value }} template text expansion failures
            VALUE = {{ $value }}
            LABELS = {{ $labels }}
    # A rule group's last evaluation took longer than its configured interval, sustained for 5 minutes.
    - alert: PrometheusRuleEvaluationSlow
      expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
        description: |-
          Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}
    # The notification queue length never dropped to zero during the last 10 minutes.
    - alert: PrometheusNotificationsBacklog
      expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Prometheus notifications backlog (instance {{ $labels.instance }})
        description: |-
          The Prometheus notification queue has not been empty for 10 minutes
            VALUE = {{ $value }}
            LABELS = {{ $labels }}
    # Alertmanager's failed-notification counter is increasing (1 minute rate).
    - alert: PrometheusAlertmanagerNotificationFailing
      expr: rate(alertmanager_notifications_failed_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
        description: |-
          Alertmanager is failing sending notifications
            VALUE = {{ $value }}
            LABELS = {{ $labels }}
    # A service discovery mechanism reports zero discovered targets.
    - alert: PrometheusTargetEmpty
      expr: prometheus_sd_discovered_targets == 0
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus target empty (instance {{ $labels.instance }})
        description: |-
          Prometheus has no target in service discovery
            VALUE = {{ $value }}
            LABELS = {{ $labels }}
    # More than 10 scrapes exceeded the sample limit within 10 minutes, sustained for 5 minutes.
    - alert: PrometheusLargeScrape
      expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Prometheus large scrape (instance {{ $labels.instance }})
        description: |-
          Prometheus has many scrapes that exceed the sample limit
            VALUE = {{ $value }}
            LABELS = {{ $labels }}
    # Samples were rejected for carrying a duplicate timestamp within the last 5 minutes.
    - alert: PrometheusTargetScrapeDuplicate
      expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
        description: |-
          Prometheus has many samples rejected due to duplicate timestamps but different values
            VALUE = {{ $value }}
            LABELS = {{ $labels }}
    # TSDB health: each rule below fires as soon as the matching failure counter increases.
    - alert: PrometheusTsdbCheckpointCreationFailures
      expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
        description: |-
          Prometheus encountered {{ $value }} checkpoint creation failures
            VALUE = {{ $value }}
            LABELS = {{ $labels }}
    - alert: PrometheusTsdbCheckpointDeletionFailures
      expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
        description: |-
          Prometheus encountered {{ $value }} checkpoint deletion failures
            VALUE = {{ $value }}
            LABELS = {{ $labels }}
    - alert: PrometheusTsdbCompactionsFailed
      expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
        description: |-
          Prometheus encountered {{ $value }} TSDB compactions failures
            VALUE = {{ $value }}
            LABELS = {{ $labels }}
    - alert: PrometheusTsdbHeadTruncationsFailed
      expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
        description: |-
          Prometheus encountered {{ $value }} TSDB head truncation failures
            VALUE = {{ $value }}
            LABELS = {{ $labels }}
    - alert: PrometheusTsdbReloadFailures
      expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
        description: |-
          Prometheus encountered {{ $value }} TSDB reload failures
            VALUE = {{ $value }}
            LABELS = {{ $labels }}
    # WAL corruptions are looked up over a 2 hour window, unlike the 1 minute window used above.
    - alert: PrometheusTsdbWalCorruptions
      expr: increase(prometheus_tsdb_wal_corruptions_total[2h]) > 0
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus TSDB WAL is corrupt, make sure there is enough disk space and wipe /data/wal
        description: |-
          Prometheus encountered {{ $value }} TSDB WAL corruptions
            VALUE = {{ $value }}
            LABELS = {{ $labels }}
    - alert: PrometheusTsdbWalTruncationsFailed
      expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
        description: |-
          Prometheus encountered {{ $value }} TSDB WAL truncation failures
            VALUE = {{ $value }}
            LABELS = {{ $labels }}
 ---
 # Publishes the prometheus-operated Service (port 9090) at prom.k-space.ee.
 apiVersion: networking.k8s.io/v1
 kind: Ingress
 metadata:
  name: prometheus
  annotations:
    # Certificate for the TLS secret below is requested from the "default" ClusterIssuer.
    cert-manager.io/cluster-issuer: default
    external-dns.alpha.kubernetes.io/target: traefik.k-space.ee
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
    # Presumably the SSO middleware defined in the traefik namespace; verify against the Traefik setup.
    traefik.ingress.kubernetes.io/router.middlewares: traefik-sso@kubernetescrd
    traefik.ingress.kubernetes.io/router.tls: "true"
 spec:
  tls:
  - secretName: prom-tls
    hosts:
    - prom.k-space.ee
  rules:
  - host: prom.k-space.ee
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: prometheus-operated
            port:
              number: 9090
 ---
 # Publishes the alertmanager-operated Service (port 9093) at am.k-space.ee.
 apiVersion: networking.k8s.io/v1
 kind: Ingress
 metadata:
  name: alertmanager
  annotations:
    # Certificate for the TLS secret below is requested from the "default" ClusterIssuer.
    cert-manager.io/cluster-issuer: default
    external-dns.alpha.kubernetes.io/target: traefik.k-space.ee
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
    # Presumably the SSO middleware defined in the traefik namespace; verify against the Traefik setup.
    traefik.ingress.kubernetes.io/router.middlewares: traefik-sso@kubernetescrd
    traefik.ingress.kubernetes.io/router.tls: "true"
 spec:
  tls:
  - secretName: alertmanager-tls
    hosts:
    - am.k-space.ee
  rules:
  - host: am.k-space.ee
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: alertmanager-operated
            port:
              number: 9093
 ---
 # Scrapes the "web" port of pods labelled app.kubernetes.io/name=prometheus.
 apiVersion: monitoring.coreos.com/v1
 kind: PodMonitor
 metadata:
  name: prometheus
 spec:
  podMetricsEndpoints:
  - port: web
  selector:
    matchLabels:
      app.kubernetes.io/name: prometheus
 ---
 # Scrapes the "web" port of pods labelled app.kubernetes.io/name=alertmanager.
 apiVersion: monitoring.coreos.com/v1
 kind: PodMonitor
 metadata:
  name: alertmanager
 spec:
  podMetricsEndpoints:
  - port: web
  selector:
    matchLabels:
      app.kubernetes.io/name: alertmanager
 ---
 # Scrapes the "http" port of pods labelled app.kubernetes.io/name=prometheus-operator.
 apiVersion: monitoring.coreos.com/v1
 kind: PodMonitor
 metadata:
  name: operator
 spec:
  podMetricsEndpoints:
  - port: http
  selector:
    matchLabels:
      app.kubernetes.io/name: prometheus-operator
 ---
 # Scrapes Services labelled app.kubernetes.io/name=kubelet in kube-system over HTTPS,
 # authenticating with the mounted service account token.
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
  name: kubelet
 spec:
  endpoints:
  # First endpoint: no explicit path is set, so the default metrics path of the port is scraped.
  - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
    honorLabels: true
    interval: 30s
    port: https-metrics
    scheme: https
    tlsConfig:
      # NOTE(review): the serving certificate is not verified for this endpoint.
      insecureSkipVerify: true
  # Second endpoint: same port, but the /metrics/cadvisor path.
  - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
    honorLabels: true
    interval: 30s
    path: /metrics/cadvisor
    port: https-metrics
    scheme: https
    tlsConfig:
      # NOTE(review): the serving certificate is not verified for this endpoint.
      insecureSkipVerify: true
  namespaceSelector:
    matchNames:
    - kube-system
  selector:
    matchLabels:
      app.kubernetes.io/name: kubelet
 ---
 # Alerting rules evaluated against Kubernetes object/state metrics (kube_*, kubelet_*, apiserver_*).
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
  name: kube-state-metrics
 spec:
  groups:
    - name: kube-state-metrics
      rules:
        # Node's Ready condition has not been "true" for 10 minutes.
        - alert: KubernetesNodeReady
          expr: kube_node_status_condition{condition="Ready",status="true"} == 0
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes Node ready (instance {{ $labels.instance }})
            description: "Node {{ $labels.node }} has been unready for a long time\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # Node reports the MemoryPressure condition for 2 minutes.
        - alert: KubernetesMemoryPressure
          expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes memory pressure (instance {{ $labels.instance }})
            description: "{{ $labels.node }} has MemoryPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # Node reports the DiskPressure condition for 2 minutes.
        - alert: KubernetesDiskPressure
          expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes disk pressure (instance {{ $labels.instance }})
            description: "{{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # Node reports the OutOfDisk condition for 2 minutes.
        # NOTE(review): confirm the cluster's nodes still expose an OutOfDisk condition.
        - alert: KubernetesOutOfDisk
          expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes out of disk (instance {{ $labels.instance }})
            description: "{{ $labels.node }} has OutOfDisk condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # Running pods per node exceed 90% of the node's allocatable pod count.
        - alert: KubernetesOutOfCapacity
          expr: sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes out of capacity (instance {{ $labels.instance }})
            description: "{{ $labels.node }} is out of capacity\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # Restart counter grew by at least 1 compared to 10 minutes ago while the last
        # termination reason stayed OOMKilled for that whole window.
        - alert: KubernetesContainerOomKiller
          expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes container oom killer (instance {{ $labels.instance }})
            description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # A Job reports at least one failed pod.
        - alert: KubernetesJobFailed
          expr: kube_job_status_failed > 0
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes Job failed (instance {{ $labels.instance }})
            description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # A CronJob has spec.suspend set.
        - alert: KubernetesCronjobSuspended
          expr: kube_cronjob_spec_suspend != 0
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes CronJob suspended (instance {{ $labels.instance }})
            description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # A PersistentVolumeClaim stays in the Pending phase for 2 minutes.
        - alert: KubernetesPersistentvolumeclaimPending
          expr: kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }})
            description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # Less than 10% of a volume's capacity is available (kubelet volume stats).
        - alert: KubernetesVolumeOutOfDiskSpace
          expr: kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes Volume out of disk space (instance {{ $labels.instance }})
            description: "Volume is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # Linear extrapolation of the last 6h of available bytes drops below zero within 4 days.
        # NOTE(review): $value is the predict_linear result (bytes), yet the description prints it as a percentage.
        - alert: KubernetesVolumeFullInFourDays
          expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes Volume full in four days (instance {{ $labels.instance }})
            description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # A PersistentVolume is in the Failed or Pending phase.
        # This selector pins job="kube-state-metrics"; confirm the scrape job really carries that name.
        - alert: KubernetesPersistentvolumeError
          expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }})
            description: "Persistent volume is in bad state\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # Ratio of ready to current StatefulSet replicas differs from 1 for 1 minute.
        - alert: KubernetesStatefulsetDown
          expr: (kube_statefulset_status_replicas_ready / kube_statefulset_status_replicas_current) != 1
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes StatefulSet down (instance {{ $labels.instance }})
            description: "A StatefulSet went down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # HPA condition AbleToScale is false for 2 minutes.
        - alert: KubernetesHpaScalingAbility
          expr: kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes HPA scaling ability (instance {{ $labels.instance }})
            description: "Pod is unable to scale\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # HPA condition ScalingActive is false.
        - alert: KubernetesHpaMetricAvailability
          expr: kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes HPA metric availability (instance {{ $labels.instance }})
            description: "HPA is not able to collect metrics\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # HPA desired replicas have reached the configured maximum for 2 minutes.
        - alert: KubernetesHpaScaleCapability
          expr: kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas
          for: 2m
          labels:
            severity: info
          annotations:
            summary: Kubernetes HPA scale capability (instance {{ $labels.instance }})
            description: "The maximum number of desired Pods has been hit\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # Pod was in phase Pending, Unknown or Failed at every 1-minute step of the last 15 minutes.
        # The result only keeps the namespace and pod labels.
        - alert: KubernetesPodNotHealthy
          expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[15m:1m]) > 0
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes Pod not healthy (instance {{ $labels.instance }})
            description: "Pod has been in a non-ready state for longer than 15 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # Container restart counter increased by more than 3 within 1 minute, sustained for 2 minutes.
        - alert: KubernetesPodCrashLooping
          expr: increase(kube_pod_container_status_restarts_total[1m]) > 3
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
            description: "Pod {{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # ReplicaSet's desired replica count differs from its ready replicas for 10 minutes.
        # The alert name keeps its original spelling so existing silences and routes still match;
        # only the human-readable texts are corrected (they previously said "ReplicasSet" and
        # reused the Deployment alert's description).
        - alert: KubernetesReplicassetMismatch
          expr: kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes ReplicaSet mismatch (instance {{ $labels.instance }})
            description: "ReplicaSet replicas mismatch\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # Deployment's desired replicas differ from its available replicas for 10 minutes.
        - alert: KubernetesDeploymentReplicasMismatch
          expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }})
            description: "Deployment Replicas mismatch\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # StatefulSet's ready replicas differ from its reported replicas for 10 minutes.
        - alert: KubernetesStatefulsetReplicasMismatch
          expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})
            description: "A StatefulSet does not match the expected number of replicas.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # Deployment's observed generation lags behind its metadata generation for 10 minutes.
        - alert: KubernetesDeploymentGenerationMismatch
          expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})
            description: "A Deployment has failed but has not been rolled back.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # StatefulSet's observed generation lags behind its metadata generation for 10 minutes.
        - alert: KubernetesStatefulsetGenerationMismatch
          expr: kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }})
            description: "A StatefulSet has failed but has not been rolled back.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # Current revision has no matching update revision and the updated replica count
        # differs from the desired replica count, for 10 minutes.
        - alert: KubernetesStatefulsetUpdateNotRolledOut
          expr: max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }})
            description: "StatefulSet update has not been rolled out.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # Fewer DaemonSet pods are ready than desired, or fewer are scheduled than desired, for 10 minutes.
        - alert: KubernetesDaemonsetRolloutStuck
          expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }})
            description: "Some Pods of DaemonSet are not scheduled or not ready\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # DaemonSet reports misscheduled pods for 1 minute.
        - alert: KubernetesDaemonsetMisscheduled
          expr: kube_daemonset_status_number_misscheduled > 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }})
            description: "Some DaemonSet Pods are running where they are not supposed to run\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # The CronJob's next schedule time lies more than 3600 seconds in the past.
        - alert: KubernetesCronjobTooLong
          expr: time() - kube_cronjob_next_schedule_time > 3600
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes CronJob too long (instance {{ $labels.instance }})
            description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # A Job has fewer succeeded pods than spec.completions for 12 hours.
        - alert: KubernetesJobSlowCompletion
          expr: kube_job_spec_completions - kube_job_status_succeeded > 0
          for: 12h
          labels:
            severity: critical
          annotations:
            summary: Kubernetes job slow completion (instance {{ $labels.instance }})
            description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # More than 3% of API server requests return a 5xx code (1 minute rate), for 2 minutes.
        # The rules below pin job="apiserver"; confirm a scrape job with that name exists.
        - alert: KubernetesApiServerErrors
          expr: sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes API server errors (instance {{ $labels.instance }})
            description: "Kubernetes API server is experiencing high error rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # More than 1% of a client's API requests return a 4xx/5xx code, per instance and job.
        - alert: KubernetesApiClientErrors
          expr: (sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes API client errors (instance {{ $labels.instance }})
            description: "Kubernetes API client is experiencing high error rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # 1st percentile of observed client certificate lifetimes is below 7 days (7*24*60*60 seconds).
        - alert: KubernetesClientCertificateExpiresNextWeek
          expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
            description: "A client certificate used to authenticate to the apiserver is expiring next week.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # Same check as above with a 24 hour threshold and critical severity.
        - alert: KubernetesClientCertificateExpiresSoon
          expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes client certificate expires soon (instance {{ $labels.instance }})
            description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        # 99th percentile API server request latency over 10 minutes exceeds 1 second for 2 minutes.
        # Long-running verbs (CONNECT/WATCH/WATCHLIST/PROXY) and log subresources are excluded.
        # Uses apiserver_request_duration_seconds_bucket, which is already expressed in seconds;
        # the former apiserver_request_latencies_bucket (microseconds, hence the old "/ 1e+06")
        # is no longer exported by current API servers, so the previous expression never matched.
        # NOTE(review): the aggregation drops the instance and resource labels, so those
        # placeholders in the annotations render empty.
        - alert: KubernetesApiServerLatency
          expr: histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) > 1
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes API server latency (instance {{ $labels.instance }})
            description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/prometheus-operator/blackbox-exporter.yml
+++ b/prometheus-operator/blackbox-exporter.yml
@@ -0,0 +1,258 @@
 ---
 # HTTP probes of the listed web endpoints, run through blackbox-exporter's /probe
 # endpoint with the http_2xx module.
 apiVersion: monitoring.coreos.com/v1
 kind: Probe
 metadata:
  name: websites
 spec:
  prober:
    # Host name of the blackbox-exporter Service; no port is given here.
    url: blackbox-exporter
    path: /probe
  module: http_2xx
  targets:
    staticConfig:
      static:
        - https://git.k-space.ee/
        - https://grafana.k-space.ee/
        - https://wiki.k-space.ee/
        - https://pad.k-space.ee/
        - https://members.k-space.ee/
        - https://nextcloud.k-space.ee/
        - http://minio.infra.k-space.ee:9001/login
 ---
 # DNS probes against the two listed server addresses using the dns_check_traefik module.
 # NOTE(review): the Probe is named k6.ee but does not use a dns_check_k6 module -
 # confirm that dns_check_traefik is the intended module.
 apiVersion: monitoring.coreos.com/v1
 kind: Probe
 metadata:
  name: k6.ee
 spec:
  prober:
    url: blackbox-exporter
    path: /probe
  module: dns_check_traefik
  targets:
    staticConfig:
      static:
        - 193.40.103.2
        - 62.65.250.2
 ---
 # TCP connect probes (tcp_connect module) against dc1-dc3.ad.k-space.ee on port 636.
 apiVersion: monitoring.coreos.com/v1
 kind: Probe
 metadata:
  name: samba-cluster
 spec:
  prober:
    url: blackbox-exporter
    # blackbox-exporter only executes a probe on /probe. /metrics returns the exporter's
    # own process metrics and ignores the module/target parameters, so probe_success
    # would never be produced for the targets below.
    path: /probe
  module: tcp_connect
  targets:
    staticConfig:
      static:
        - dc1.ad.k-space.ee:636
        - dc2.ad.k-space.ee:636
        - dc3.ad.k-space.ee:636
 ---
 # TCP connect probes (tcp_connect module) against assorted host:port endpoints.
 apiVersion: monitoring.coreos.com/v1
 kind: Probe
 metadata:
  name: misc
 spec:
  prober:
    url: blackbox-exporter
    # blackbox-exporter only executes a probe on /probe. /metrics returns the exporter's
    # own process metrics and ignores the module/target parameters, so probe_success
    # would never be produced for the targets below.
    path: /probe
  module: tcp_connect
  targets:
    staticConfig:
      static:
        - mail.k-space.ee:465
        - dev.k-space.ee:10648
        - mariadb.infra.k-space.ee:3306
 ---
 # Alerting rules over the probe_* metrics produced by blackbox-exporter probes.
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
  name: blackbox-exporter
 spec:
  # https://awesome-prometheus-alerts.grep.to/rules#blackbox
  groups:
  - name: blackbox
    rules:
    # Probe reports failure for 2 minutes.
    - alert: BlackboxProbeFailed
      expr: probe_success == 0
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Blackbox probe failed (instance {{ $labels.instance }})
        description: Probe failed
    # 1-minute average of total probe duration stays above 1 second for 5 minutes.
    - alert: BlackboxSlowProbe
      expr: avg_over_time(probe_duration_seconds[1m]) > 1
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Blackbox slow probe (instance {{ $labels.instance }})
        description: Blackbox probe took more than 1s to complete
    # 1-minute average of DNS lookup time stays above 1 second for 5 minutes.
    - alert: BlackboxSlowDNS
      expr: avg_over_time(probe_dns_lookup_time_seconds[1m]) > 1
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Blackbox slow DNS lookup (instance {{ $labels.instance }})
        description: Blackbox DNS lookup took more than 1s to complete.
          It seemed using IPv6 DNS servers in conjunction with Docker resulted
          in odd 5s latency bump. For now we're using 8.8.8.8 because of that
    # HTTP status code outside the 200-399 range for 5 minutes.
    - alert: BlackboxProbeHttpFailure
      expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
        description: HTTP status code is not 200-399
    # Earliest certificate in the chain expires in less than 30 days (warning).
    - alert: BlackboxSslCertificateWillExpireSoon
      expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
        description: SSL certificate expires in 30 days
    # Same alert name as above, but with a 3 day threshold and critical severity.
    - alert: BlackboxSslCertificateWillExpireSoon
      expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
        description: SSL certificate expires in 3 days
    # Earliest certificate expiry is now or in the past.
    - alert: BlackboxSslCertificateExpired
      expr: probe_ssl_earliest_cert_expiry - time() <= 0
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
        description: SSL certificate has expired already
    # 1-minute average of an HTTP phase duration stays above 1 second for 1 minute.
    - alert: BlackboxProbeSlowHttp
      expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
        description: HTTP request took more than 1s
    # 1-minute average of ICMP duration stays above 1 second for 1 minute.
    - alert: BlackboxProbeSlowPing
      expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: Blackbox probe slow ping (instance {{ $labels.instance }})
        description: Blackbox ping took more than 1s
 ---
 # Runs two blackbox-exporter replicas on nodes dedicated to monitoring, with the
 # exporter configuration mounted from the blackbox-exporter-config ConfigMap.
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: blackbox-exporter
 spec:
  # Old ReplicaSets are not retained.
  revisionHistoryLimit: 0
  replicas: 2
  selector:
    matchLabels:
      app: blackbox-exporter
  template:
    metadata:
      labels:
        app: blackbox-exporter
    spec:
      containers:
      - name: blackbox-exporter
        image: prom/blackbox-exporter:v0.20.0
        volumeMounts:
        # The ConfigMap's config.yml key appears as /etc/blackbox_exporter/config.yml.
        - name: blackbox-exporter-config
          mountPath: /etc/blackbox_exporter
      volumes:
        - name: blackbox-exporter-config
          configMap:
            name: blackbox-exporter-config
      # TODO: Results in odd 6s connection lag if scheduled in VLAN20
      nodeSelector:
        dedicated: monitoring
      # Tolerates the dedicated=monitoring:NoSchedule taint so the pods may land on those nodes.
      tolerations:
        - key: dedicated
          operator: Equal
          value: monitoring
          effect: NoSchedule
      # Hard anti-affinity: two app=blackbox-exporter pods are never placed on the same hostname.
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchExpressions:
              - key: app
                operator: In
                values:
                - blackbox-exporter
            topologyKey: "kubernetes.io/hostname"
 ---
 # ClusterIP Service for pods labelled app=blackbox-exporter: port 80 forwards to target port 9115.
 apiVersion: v1
 kind: Service
 metadata:
  name: blackbox-exporter
 spec:
  type: ClusterIP
  selector:
    app: blackbox-exporter
  ports:
  - name: http
    protocol: TCP
    port: 80
    targetPort: 9115
 ---
 # blackbox-exporter module definitions. Every module prefers IPv4 and disables
 # fallback to the other IP protocol.
 # Modules defined: http_2xx, http_post_2xx, tcp_connect, icmp, dns_check_traefik, dns_check_k6.
 # The two DNS modules query an A record and fail unless an answer matches 193.40.103.<n>.
 apiVersion: v1
 kind: ConfigMap
 metadata:
  name: blackbox-exporter-config
 data:
  # Literal block: everything below is the content of config.yml, so no comments are added inside it.
  config.yml: |-
    modules:
      http_2xx:
        prober: http
        http:
          preferred_ip_protocol: "ip4"
          ip_protocol_fallback: false
      http_post_2xx:
        prober: http
        http:
          method: POST
          preferred_ip_protocol: "ip4"
          ip_protocol_fallback: false
      tcp_connect:
        prober: tcp
        tcp:
          preferred_ip_protocol: "ip4"
          ip_protocol_fallback: false
      icmp:
        prober: icmp
        icmp:
          preferred_ip_protocol: "ip4"
          ip_protocol_fallback: false
      dns_check_traefik:
        prober: dns
        dns:
          query_name: "traefik.k-space.ee"
          query_type: "A"
          validate_answer_rrs:
            fail_if_not_matches_regexp:
             - "traefik\\.k-space\\.ee\\.\\t.*\\tIN\\tA\\t193\\.40\\.103\\.[1-9][0-9]*"
          preferred_ip_protocol: "ip4"
          ip_protocol_fallback: false
      dns_check_k6:
        prober: dns
        dns:
          query_name: "k6.ee"
          query_type: "A"
          validate_answer_rrs:
            fail_if_not_matches_regexp:
             - "k6\\.ee\\.\\t.*\\tIN\\tA\\t193\\.40\\.103\\.[1-9][0-9]*"
          preferred_ip_protocol: "ip4"
          ip_protocol_fallback: false
--- a/prometheus-operator/bundle.yml
+++ b/prometheus-operator/bundle.yml
--- a/prometheus-operator/mikrotik-exporter.yml
+++ b/prometheus-operator/mikrotik-exporter.yml
@@ -0,0 +1,104 @@
 ---
 # Probes every listed MikroTik device through the mikrotik-exporter prober,
 # sending the bearer token stored under PROMETHEUS_BEARER_TOKEN.
 apiVersion: monitoring.coreos.com/v1
 kind: Probe
 metadata:
  name: mikrotik
 spec:
  prober:
    url: mikrotik-exporter
    path: /metrics
  bearerTokenSecret:
    key: PROMETHEUS_BEARER_TOKEN
    name: mikrotik-exporter
  targets:
    staticConfig:
      # Management hostnames of the router and switches to poll.
      static:
        - router.mgmt.k-space.ee
        - sw_chaos.mgmt.k-space.ee
        - sw_poe.mgmt.k-space.ee
        - sw_mgmt.mgmt.k-space.ee
        - sw_core02.mgmt.k-space.ee
        - sw_cyber.mgmt.k-space.ee
        - sw_ha.mgmt.k-space.ee
        - sw_asocial.mgmt.k-space.ee
        - sw_kitchen.mgmt.k-space.ee
        - sw_core01.mgmt.k-space.ee
 ---
 # Alerts on MikroTik SFP+ port state and reported link rate.
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
  name: mikrotik
 spec:
  groups:
  - name: mikrotik
    rules:
    # Fires immediately when sfp-sfpplus1 or sfp-sfpplus2 is not running on any
    # instance other than the sw_core* and sw_mgmt* ones.
    - alert: MikrotikUplinkRedundancyLost
      annotations:
        summary: Switch uplink high availability lost
        description: One of the two 10Gb optical links is malfunctioning
      expr: mikrotik_interface_running{port=~"sfp-sfpplus[12]", instance!~"sw_core.*", instance!~"sw_mgmt.*"} == 0
      for: 0m
      labels:
        severity: error
    # Fires immediately when any sfp-sfpplus port reports a rate below 10^10.
    - alert: MikrotikLinkRateDegraded
      annotations:
        summary: 10Gb link degraded
        description: One of the 10Gb links is running at lower speed
      expr: mikrotik_interface_rate{port=~"sfp-sfpplus.*"} < 10000000000
      for: 0m
      labels:
        severity: error
 ---
 # Runs two mikrotik-exporter replicas on nodes labelled dedicated=monitoring.
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: mikrotik-exporter
 spec:
  revisionHistoryLimit: 0
  replicas: 2
  selector:
    matchLabels:
      app: mikrotik-exporter
  template:
    metadata:
      labels:
        app: mikrotik-exporter
      annotations:
        # Multiline log hints: a line matching '^  ' is joined after the
        # preceding line (presumably consumed by an Elastic log shipper).
        co.elastic.logs/multiline.pattern: '^  '
        co.elastic.logs/multiline.negate: "false"
        co.elastic.logs/multiline.match: after
    spec:
      containers:
      - name: mikrotik-exporter
        image: harbor.k-space.ee/k-space/mikrotik-exporter:latest
        env:
          - name: MIKROTIK_USER
            value: netpoller
        # Remaining settings are injected from the mikrotik-exporter Secret.
        envFrom:
          - secretRef:
              name: mikrotik-exporter
      nodeSelector:
        dedicated: monitoring
      tolerations:
      - key: dedicated
        operator: Equal
        value: monitoring
        effect: NoSchedule
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
          # The labelSelector is required for the rule to have any effect: a
          # term without one matches no pods, so both replicas could share a
          # node. Same shape as the blackbox-exporter and snmp-exporter rules.
          - labelSelector:
              matchExpressions:
              - key: app
                operator: In
                values:
                - mikrotik-exporter
            topologyKey: "kubernetes.io/hostname"
 ---
 # In-cluster Service in front of the mikrotik-exporter pods.
 apiVersion: v1
 kind: Service
 metadata:
  name: mikrotik-exporter
 spec:
  type: ClusterIP
  selector:
    app: mikrotik-exporter
  ports:
    # Service port 80 forwards to port 3001 on the selected pods.
    - name: http
      protocol: TCP
      port: 80
      targetPort: 3001
--- a/prometheus-operator/node-exporter.yml
+++ b/prometheus-operator/node-exporter.yml
@@ -0,0 +1,443 @@
 # Scrapes /metrics on port 9100 of the NAS and the Proxmox hosts.
 apiVersion: monitoring.coreos.com/v1
 kind: Probe
 metadata:
  name: nodes-proxmox
 spec:
  prober:
    path: /metrics
    url: localhost
  targets:
    staticConfig:
      # Copy the probe target into both the instance label and __address__.
      # Presumably this makes Prometheus scrape the host itself rather than
      # the prober URL above - confirm.
      relabelingConfigs:
      - targetLabel: instance
        sourceLabels: [__param_target]
      - targetLabel: __address__
        sourceLabels: [__param_target]
      static:
        - nas.mgmt.k-space.ee:9100
        - pve1.proxmox.infra.k-space.ee:9100
        - pve8.proxmox.infra.k-space.ee:9100
        - pve9.proxmox.infra.k-space.ee:9100
  # NOTE(review): confirm __address__ is still available at metric-relabel
  # time; if it is not, this rule produces an empty "target" label.
  metricRelabelings:
  - targetLabel: target
    sourceLabels: [__address__]
 ---
 # Scrapes /metrics on port 9100 of the miscellaneous infra hosts.
 apiVersion: monitoring.coreos.com/v1
 kind: Probe
 metadata:
  name: nodes-misc
 spec:
  prober:
    path: /metrics
    url: localhost
  targets:
    staticConfig:
      # Copy the probe target into both the instance label and __address__.
      # Presumably this makes Prometheus scrape the host itself rather than
      # the prober URL above - confirm.
      relabelingConfigs:
      - targetLabel: instance
        sourceLabels: [__param_target]
      - targetLabel: __address__
        sourceLabels: [__param_target]
      static:
        - sprucecone.infra.k-space.ee:9100
        - cedarcone.infra.k-space.ee:9100
  # NOTE(review): confirm __address__ is still available at metric-relabel
  # time; if it is not, this rule produces an empty "target" label.
  metricRelabelings:
  - targetLabel: target
    sourceLabels: [__address__]
 ---
 # Host-level alerting rules evaluated against node-exporter metrics.
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
  name: node-exporter
 spec:
  groups:
  - name: node-exporter
    rules:
    # A zpool has reported a state other than "online" for one minute.
    - alert: ZfsOfflinePool
      annotations:
        summary: ZFS offline pool (instance {{ $labels.instance }})
        description: "A ZFS zpool is in a unexpected state: {{ $labels.state }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      expr: node_zfs_zpool_state{state!="online"} > 0
      for: 1m
      labels:
        severity: critical
    # 1-minute load divided by the number of "user" CPU series per instance
    # has stayed above 2.5 for 15 minutes.
    - alert: HostHighLoad
      annotations:
        summary: Host under high load
        description: Many processes are queued up for execution
      expr: sum(node_load1{}) by (instance) / count(node_cpu_seconds_total{mode="user"}) by (instance) > 2.5
      for: 15m
      labels:
        severity: warning
    # Cached + buffers + free memory, as a percentage of total memory, has been
    # below 20 for two minutes. The description states the same threshold as
    # the expression so responders are not misled about how much is left.
    - alert: HostOutOfMemory
      expr: (node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_MemFree_bytes ) / node_memory_MemTotal_bytes * 100 < 20
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host out of memory (instance {{ $labels.instance }})
        description: Node memory is filling up (< 20% left)
    # Major page fault rate above 1000/s for two minutes.
    - alert: HostMemoryUnderMemoryPressure
      annotations:
        summary: Host memory under memory pressure (instance {{ $labels.instance }})
        description: The node is under heavy memory pressure. High rate of major page faults
      expr: rate(node_vmstat_pgmajfault[1m]) > 1000
      for: 2m
      labels:
        severity: warning
    # Summed receive rate per instance above 160e+06 bytes/s for one hour.
    - alert: HostUnusualNetworkThroughputIn
      annotations:
        summary: Host unusual network throughput in (instance {{ $labels.instance }})
        description: Host network interfaces are probably receiving too much data (> 160 MB/s)
      expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) > 160e+06
      for: 1h
      labels:
        severity: warning
    # Summed transmit rate per instance above 160e+06 bytes/s for one hour.
    - alert: HostUnusualNetworkThroughputOut
      annotations:
        summary: Host unusual network throughput out (instance {{ $labels.instance }})
        description: Host network interfaces are probably sending too much data (> 160 MB/s)
      expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) > 160e+06
      for: 1h
      labels:
        severity: warning
    # Summed disk read rate per instance above 50000000 bytes/s for one hour.
    - alert: HostUnusualDiskReadRate
      annotations:
        summary: Host unusual disk read rate (instance {{ $labels.instance }})
        description: Disk is probably reading too much data (> 50 MB/s)
      expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) > 50000000
      for: 1h
      labels:
        severity: warning
    # Summed disk write rate per instance above 50000000 bytes/s for one hour.
    - alert: HostUnusualDiskWriteRate
      annotations:
        summary: Host unusual disk write rate (instance {{ $labels.instance }})
        description: Disk is probably writing too much data (> 50 MB/s)
      expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) > 50000000
      for: 1h
      labels:
        severity: warning
    # Less than 10% of a writable filesystem is available for two minutes.
    # Mountpoints to ignore are configured on node_exporter itself; the
    # node-exporter DaemonSet in this file passes
    # "--collector.filesystem.mount-points-exclude=...".
    # The same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
    - alert: HostOutOfDiskSpace
      expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host out of disk space (instance {{ $labels.instance }})
        description: Disk is almost full (< 10% left)
    # Same 10% condition as above, additionally requiring that a linear
    # prediction over the last hour (tmpfs excluded) reaches zero within 24h.
    # Mountpoints to ignore are configured on node_exporter itself (see the
    # "--collector.filesystem.mount-points-exclude" argument of the DaemonSet).
    # The same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
    - alert: HostDiskWillFillIn24Hours
      expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
        description: Filesystem is predicted to run out of space within the next 24 hours at current write rate
    # Less than 10% of inodes free on the "/rootfs" mountpoint for two minutes.
    # NOTE(review): the DaemonSet in this file runs node_exporter with
    # --path.rootfs=/host/root - confirm a mountpoint="/rootfs" series actually
    # exists, otherwise this alert and the next one can never fire.
    - alert: HostOutOfInodes
      expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host out of inodes (instance {{ $labels.instance }})
        description: Disk is almost running out of available inodes (< 10% left)
    # Same inode condition, additionally requiring that a linear prediction
    # over the last hour reaches zero free inodes within 24h.
    - alert: HostInodesWillFillIn24Hours
      expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
        description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate
    # Average time per completed read above 0.1s while reads are happening.
    - alert: HostUnusualDiskReadLatency
      annotations:
        summary: Host unusual disk read latency (instance {{ $labels.instance }})
        description: Disk latency is growing (read operations > 100ms)
      expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
      for: 2m
      labels:
        severity: warning
    # Average time per completed write above 0.1s while writes are happening.
    - alert: HostUnusualDiskWriteLatency
      annotations:
        summary: Host unusual disk write latency (instance {{ $labels.instance }})
        description: Disk latency is growing (write operations > 100ms)
      expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
      for: 2m
      labels:
        severity: warning
    # Per-instance average "steal" CPU share above 10%; fires immediately.
    - alert: HostCpuStealNoisyNeighbor
      annotations:
        summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
        description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.
      expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
      for: 0m
      labels:
        severity: warning
    # Context switches per second divided by the number of idle-mode CPU
    # series. The 50000 threshold is arbitrary; the right value depends on the
    # workload. See https://github.com/samber/awesome-prometheus-alerts/issues/58
    - alert: HostContextSwitching
      annotations:
        summary: Host context switching (instance {{ $labels.instance }})
        description: Context switching is growing on node (> 50000 / s)
      expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 50000
      for: 0m
      labels:
        severity: warning
    # Any host reporting a non-zero swap total; fires immediately.
    - alert: HostSwapIsEnabled
      annotations:
        summary: Swap is discouraged nowadays
      expr: node_memory_SwapTotal_bytes > 0
      for: 0m
      labels:
        severity: warning
    # A hwmon temperature sensor above 75 for five minutes.
    - alert: HostPhysicalComponentTooHot
      annotations:
        summary: Host physical component too hot (instance {{ $labels.instance }})
        description: Physical hardware component too hot
      expr: node_hwmon_temp_celsius > 75
      for: 5m
      labels:
        severity: warning
    # A hwmon temperature alarm flag equals 1; fires immediately.
    - alert: HostNodeOvertemperatureAlarm
      annotations:
        summary: Host node overtemperature alarm (instance {{ $labels.instance }})
        description: Physical node temperature alarm triggered
      expr: node_hwmon_temp_alarm == 1
      for: 0m
      labels:
        severity: critical
    # An md array reports the "inactive" state; fires immediately.
    - alert: HostRaidArrayGotInactive
      annotations:
        summary: Host RAID array got inactive (instance {{ $labels.instance }})
        description: RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
      expr: node_md_state{state="inactive"} > 0
      for: 0m
      labels:
        severity: critical
    # An md array has had at least one failed disk for two minutes.
    - alert: HostRaidDiskFailure
      annotations:
        summary: Host RAID disk failure (instance {{ $labels.instance }})
        description: At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap
      expr: node_md_disks{state="failed"} > 0
      for: 2m
      labels:
        severity: warning
    # The OOM kill counter increased within the last minute; fires immediately.
    - alert: HostOomKillDetected
      annotations:
        summary: Host OOM kill detected (instance {{ $labels.instance }})
        description: OOM kill detected
      expr: increase(node_vmstat_oom_kill[1m]) > 0
      for: 0m
      labels:
        severity: warning
    # The four descriptions below are single-quoted on purpose: a plain YAML
    # scalar may not start with "{", so an unquoted value beginning with
    # "{{ ... }}" is read as a flow mapping and the manifest fails to parse.
    # Single quotes keep the embedded "%.0f" double quotes literal.

    # Correctable EDAC error counter increased within the last minute.
    - alert: HostEdacCorrectableErrorsDetected
      expr: increase(node_edac_correctable_errors_total[1m]) > 0
      for: 0m
      labels:
        severity: info
      annotations:
        summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
        # NOTE(review): text says 5 minutes, the expression uses a 1m window.
        description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'
    # Uncorrectable EDAC error counter is non-zero.
    - alert: HostEdacUncorrectableErrorsDetected
      expr: node_edac_uncorrectable_errors_total > 0
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
        # NOTE(review): the expression compares the raw counter, not a
        # 5-minute increase as the text suggests.
        description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'
    # More than 1% of received packets were errors over a 2m window.
    - alert: HostNetworkReceiveErrors
      expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host Network Receive Errors (instance {{ $labels.instance }})
        # NOTE(review): $value is an error ratio here, not an error count.
        description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last five minutes.'
    # More than 1% of transmitted packets were errors over a 2m window.
    - alert: HostNetworkTransmitErrors
      expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host Network Transmit Errors (instance {{ $labels.instance }})
        # NOTE(review): $value is an error ratio here, not an error count.
        description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last five minutes.'
    # Receive plus transmit rate exceeds 80% of the reported interface speed
    # for one minute; tap* devices are excluded.
    - alert: HostNetworkInterfaceSaturated
      expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: Host Network Interface Saturated (instance {{ $labels.instance }})
        # The series selected above are keyed by the "device" label, so that is
        # the label the template reads; "interface" would render as empty.
        description: 'The network interface "{{ $labels.device }}" on "{{ $labels.instance }}" is getting overloaded.'
    # Active member count differs from configured member count on a bond*
    # master for two minutes.
    - alert: HostNetworkBondDegraded
      expr: node_bonding_active != node_bonding_slaves {master=~"bond.*"}
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host Network Bond Degraded
    # Conntrack table more than 80% full for five minutes.
    - alert: HostConntrackLimit
      expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Host conntrack limit (instance {{ $labels.instance }})
        description: The number of conntrack is approaching limit
    # Clock offset beyond +/-0.05s and not shrinking back towards zero.
    - alert: HostClockSkew
      expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host clock skew (instance {{ $labels.instance }})
        description: Clock skew detected. Clock is out of sync.
    # Sync status was 0 at some point in the last minute and the maximum error
    # is at least 16 seconds.
    - alert: HostClockNotSynchronising
      expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host clock not synchronising (instance {{ $labels.instance }})
        description: Clock not synchronising.
 ---
 # Alert on sustained SSD write volume derived from SMART LBA counters.
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
  name: smart
 spec:
  groups:
    - name: smart
      rules:
      # 72h average of written LBAs/s times 512 exceeds 10000000 for 5 minutes.
      - alert: SmartSSDWriteRateTooHigh
        annotations:
          summary: SSD write rate exceeds 10MB/s
          description: At this rate the SSD will be worn out before warranty period expires
        expr: rate(smartmon_total_lbas_written_raw_value[72h]) * 512 > 10000000
        for: 5m
        labels:
          severity: warning
 ---
 # Disk and chipset temperature alerts; each must hold for ten minutes.
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
  name: temperatures
 spec:
  groups:
    - name: temperatures
      rules:
      # Either SMART temperature attribute reads above 45.
      - alert: HighDiskTemperature
        annotations:
          summary: High HDD/SSD temperature indicates high ambient temperature
        expr: smartmon_airflow_temperature_cel_raw_value > 45 or smartmon_temperature_celsius_raw_value > 45
        for: 10m
        labels:
          severity: critical
      # A hwmon temperature sensor reads above 65.
      - alert: HighChipsetTemperature
        annotations:
          summary: High chipset (CPU, NB) temperature indicates insufficient or failing fans
        expr: node_hwmon_temp_celsius > 65
        for: 10m
        labels:
          severity: warning
      # Either SMART temperature attribute reads below 10.
      - alert: LowDiskTemperature
        annotations:
          summary: Low HDD/SSD temperature indicates low ambient temperature and stuck server room exhaust fan relay
        expr: smartmon_airflow_temperature_cel_raw_value < 10 or smartmon_temperature_celsius_raw_value < 10
        for: 10m
        labels:
          severity: critical
 ---
 # Scrapes the "web" port of every pod labelled app=node-exporter.
 apiVersion: monitoring.coreos.com/v1
 kind: PodMonitor
 metadata:
  name: node-exporter
 spec:
  podMetricsEndpoints:
    - scrapeTimeout: 30s
      port: web
  selector:
    matchLabels:
      app: node-exporter
 ---
 # Service account referenced by the node-exporter DaemonSet.
 kind: ServiceAccount
 apiVersion: v1
 metadata:
  name: node-exporter
 ---
 # Runs node-exporter on every node (tolerates all taints) with host
 # networking and host PID, listening on port 9101.
 apiVersion: apps/v1
 kind: DaemonSet
 metadata:
  name: node-exporter
  labels:
    app: node-exporter
  annotations:
    keel.sh/policy: force
    keel.sh/trigger: poll
    keel.sh/pollSchedule: "@midnight"
 spec:
  selector:
    matchLabels:
      app: node-exporter
  template:
    metadata:
      labels:
        app: node-exporter
    spec:
      serviceAccountName: node-exporter
      hostNetwork: true
      hostPID: true
      # Pod-level defaults; the container below sets its own runAsUser.
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
      tolerations:
      - operator: Exists
      containers:
      - name: node-exporter
        image: prom/node-exporter:v1.3.1
        args:
        - --web.listen-address=0.0.0.0:9101
        # Host /sys and / are mounted read-only under /host (see volumeMounts).
        - --path.sysfs=/host/sys
        - --path.rootfs=/host/root
        - --no-collector.wifi
        - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)
        - --collector.netclass.ignored-devices=^(veth.*|[a-f0-9]{15})$
        - --collector.netdev.device-exclude=^(veth.*|[a-f0-9]{15})$
        ports:
        - name: web
          containerPort: 9101
        resources:
          requests:
            cpu: 5m
            memory: 20Mi
          limits:
            cpu: 50m
            memory: 180Mi
        securityContext:
          runAsNonRoot: true
          runAsUser: 65532
          runAsGroup: 65532
          readOnlyRootFilesystem: true
        volumeMounts:
        - name: sys
          mountPath: /host/sys
          mountPropagation: HostToContainer
          readOnly: true
        - name: root
          mountPath: /host/root
          mountPropagation: HostToContainer
          readOnly: true
      volumes:
      - name: sys
        hostPath:
          path: /sys
      - name: root
        hostPath:
          path: /
--- a/prometheus-operator/snmp-exporter.yml
+++ b/prometheus-operator/snmp-exporter.yml
@@ -0,0 +1,172 @@
 # Runs two snmp-exporter replicas on separate dedicated=monitoring nodes.
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: snmp-exporter
 spec:
  replicas: 2
  selector:
    matchLabels:
      app: snmp-exporter
  template:
    metadata:
      labels:
        app: snmp-exporter
    spec:
      nodeSelector:
        dedicated: monitoring
      tolerations:
      - key: dedicated
        operator: Equal
        value: monitoring
        effect: NoSchedule
      # Never co-locate two snmp-exporter pods on the same hostname.
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
          - topologyKey: "kubernetes.io/hostname"
            labelSelector:
              matchExpressions:
              - key: app
                operator: In
                values:
                - snmp-exporter
      containers:
        - name: snmp-exporter
          image: prom/snmp-exporter:latest
          imagePullPolicy: Always
          securityContext:
            runAsNonRoot: true
            runAsUser: 1000
            readOnlyRootFilesystem: true
          ports:
          - name: exporter
            containerPort: 9116
          # Both probes hit /health on the exporter port.
          livenessProbe:
            httpGet:
              port: exporter
              path: /health
          readinessProbe:
            httpGet:
              port: exporter
              path: /health
          # Exporter configuration comes from the snmp-exporter ConfigMap.
          volumeMounts:
          - name: snmp-exporter
            mountPath: /etc/snmp_exporter
      volumes:
        - name: snmp-exporter
          configMap:
            name: snmp-exporter
 ---
 # In-cluster Service exposing the snmp-exporter pods on port 9116.
 apiVersion: v1
 kind: Service
 metadata:
  name: snmp-exporter
 spec:
  type: ClusterIP
  selector:
    app: snmp-exporter
  ports:
    - name: exporter
      protocol: TCP
      port: 9116
 ---
 # Polls the six UPS units through snmp-exporter with the rfc1628_ups module.
 apiVersion: monitoring.coreos.com/v1
 kind: Probe
 metadata:
  name: ups
 spec:
  module: rfc1628_ups
  interval: 60s
  prober:
    url: snmp-exporter:9116
    path: /snmp
  targets:
    staticConfig:
      static:
        - ups-4.mgmt.k-space.ee
        - ups-5.mgmt.k-space.ee
        - ups-6.mgmt.k-space.ee
        - ups-7.mgmt.k-space.ee
        - ups-8.mgmt.k-space.ee
        - ups-9.mgmt.k-space.ee
 ---
 # Alerts on UPS battery state, output source and load.
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
  name: ups
 spec:
  groups:
  - name: ups
    rules:
    # Any battery status other than "batteryNormal" for one minute.
    - alert: UPSBatteryLost
      annotations:
        summary: One or more UPS-es have degraded batteries.
      expr: snmp_upsBatteryStatus{upsBatteryStatus!="batteryNormal"} > 0
      for: 1m
      labels:
        severity: critical
    # Fewer than 6 units report the "normal" output source. The constant 6
    # matches the number of targets in the "ups" Probe; keep them in sync.
    - alert: UPSPowerLost
      annotations:
        summary: One or more UPS-es is not in normal operation mode. This either means
          power is lost or UPS was loaded and it's now in bypass mode.
      expr: sum(snmp_upsOutputSource { upsOutputSource = 'normal' }) < 6
      for: 1m
      labels:
        severity: critical
    # Output load above 80% for one hour. The summary names the 80% trigger so
    # it agrees with the expression; the 50% figure is kept as the target to
    # rebalance towards.
    - alert: UPSExcessivelyLoaded
      annotations:
        summary: One or more UPS-es is loaded more than 80%. Make sure load on UPS-es
          is balanced and load for no UPS stays above 50%.
      expr: snmp_upsOutputPercentLoad > 80
      for: 1h
      labels:
        severity: critical
 ---
 # Polls both printers through snmp-exporter with the printer_mib module.
 apiVersion: monitoring.coreos.com/v1
 kind: Probe
 metadata:
  name: printer
 spec:
  module: printer_mib
  interval: 60s
  scrapeTimeout: 50s
  prober:
    url: snmp-exporter:9116
    path: /snmp
  targets:
    staticConfig:
      static:
        - mfp-cyber.pub.k-space.ee
        - mfp-chaos.pub.k-space.ee
 ---
 # Alert raised as soon as a printer reports a detected error state of 1.
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
  name: printer
 spec:
  groups:
  - name: printer
    rules:
    - alert: PrinterNeedsAttention
      annotations:
        # Wording fixed: the original read "It not drop a line".
        summary: Printer is in error state. If the underlying reason is 'low on paper'
          make sure there is enough paper near the printer. If not, drop a line at
          accounting@k-space.ee to order more office supplies.
      expr: snmp_hrPrinterDetectedErrorState == 1
      for: 0m
      labels:
        severity: warning
 ---
 # Polls the projector through snmp-exporter with the epson_beamer module.
 apiVersion: monitoring.coreos.com/v1
 kind: Probe
 metadata:
  name: beamer
 spec:
  module: epson_beamer
  interval: 60s
  prober:
    url: snmp-exporter:9116
    path: /snmp
  targets:
    staticConfig:
      static:
        - beamer-cyber.sec.k-space.ee
--- a/prometheus-operator/snmp.yml
+++ b/prometheus-operator/snmp.yml
--- a/rosdump/README.md
+++ b/rosdump/README.md
@@ -19,7 +19,7 @@ but it does not export Prometheus metrics either.
 To apply changes run in this directory:
 ```
-kubectl apply -n rosdump -f cronjob.yaml
+kubectl apply -n rosdump -f application.yml
 ```
 To trigger cronjob:
--- a/rosdump/application.yml
+++ b/rosdump/application.yml
@@ -87,7 +87,6 @@ spec:
                          path: ssh_known_hosts
                  - configMap:
                      name: rosdump-config
 ---
 apiVersion: networking.k8s.io/v1
 kind: NetworkPolicy
@@ -108,3 +107,19 @@ spec:
    ports:
    - protocol: TCP
      port: 22
 ---
 # Alert when the rosdump-cronjob CronJob stops reporting successful runs.
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
  name: rosdump
 spec:
  groups:
    - name: rosdump
      rules:
        # Fires after four hours in which the last-success metric is either
        # missing or more than 3600 seconds old.
        - alert: MikrotikBackupsBroken
          annotations:
            summary: Mikrotik backups are broken
          expr: absent(kube_cronjob_status_last_successful_time{cronjob="rosdump-cronjob"}) or time() - kube_cronjob_status_last_successful_time{cronjob="rosdump-cronjob"} > 3600
          for: 4h
          labels:
            severity: warning
--- a/traefik/README.md
+++ b/traefik/README.md
@@ -1,6 +1,7 @@
 Traefik Ingress Controller:
 ```
 kubectl create namespace traefik
 helm template --include-crds -n traefik --release-name k6 traefik/traefik -f values.yml > application.yml
-kubectl apply -n traefik -f namespace.yml -f application.yml -f application-extras.yml -f whoami.yml -f proxmox.yml -f voron.yml
+kubectl apply -n traefik -f application.yml -f application-extras.yml -f whoami.yml -f proxmox.yml -f voron.yml
 ```
--- a/traefik/application-extras.yml
+++ b/traefik/application-extras.yml
@@ -28,9 +28,6 @@ kind: Service
 metadata:
  name: traefik-metrics
  namespace: traefik
  annotations:
    prometheus.io/scrape: 'true'
    prometheus.io/port: '9100'
 spec:
  selector:
    app.kubernetes.io/name: traefik
@@ -92,6 +89,16 @@ spec:
  - Ingress
  - Egress
  ingress:
  - from:
    - namespaceSelector:
        matchLabels:
          kubernetes.io/metadata.name: prometheus-operator
      podSelector:
        matchLabels:
          app.kubernetes.io/name: prometheus
    ports:
    - protocol: TCP
      port: 9100
  - from:
    - ipBlock:
        cidr: 0.0.0.0/0
@@ -109,3 +116,14 @@ spec:
  replacePathRegex:
    regex: ^/metrics
    replacement: /
 ---
 # Scrapes the "metrics" port of pods labelled app.kubernetes.io/name=traefik.
 apiVersion: monitoring.coreos.com/v1
 kind: PodMonitor
 metadata:
  name: traefik
 spec:
  podMetricsEndpoints:
    - port: metrics
  selector:
    matchLabels:
      app.kubernetes.io/name: traefik
--- a/traefik/values.yml
+++ b/traefik/values.yml
@@ -17,9 +17,8 @@ deployment:
    keel.sh/trigger: patch
    keel.sh/pollSchedule: "@midnight"
-  podAnnotations:
+accessLog:
-    prometheus.io/scrape: 'true'
+  format: json
    prometheus.io/port: '9100'
 # Globally redirect to https://
 globalArguments: