longhorn-system: Updates

2024-08-14 07:36:31 +03:00
parent e5e4a07d01
commit cfc5a739a1
6 changed files with 126 additions and 4720 deletions
--- a/longhorn-system/application-extras.yml
+++ b/longhorn-system/application-extras.yml
@@ -1,3 +1,4 @@
+# yamllint disable rule:line-length
 ---
 apiVersion: codemowers.cloud/v1beta1
 kind: OIDCMiddlewareClient
@@ -27,19 +28,19 @@ metadata:
    traefik.ingress.kubernetes.io/router.tls: "true"
 spec:
  rules:
-  - host: longhorn.k-space.ee
-    http:
-      paths:
-      - pathType: Prefix
-        path: "/"
-        backend:
-          service:
-            name: longhorn-frontend
-            port:
-              number: 80
+    - host: longhorn.k-space.ee
+      http:
+        paths:
+          - pathType: Prefix
+            path: "/"
+            backend:
+              service:
+                name: longhorn-frontend
+                port:
+                  number: 80
  tls:
-  - hosts:
-    - "*.k-space.ee"
+    - hosts:
+        - "*.k-space.ee"
 ---
 apiVersion: monitoring.coreos.com/v1
 kind: PodMonitor
@@ -59,81 +60,81 @@ spec:
  groups:
    - name: longhorn
      rules:
-      - alert: LonghornVolumeActualSpaceUsedWarning
-        annotations:
-          description: The accumulated snapshots for volume use up more space than the volume's capacity
-          summary: The actual used space of Longhorn volume is twice the size of the volume capacity.
-        expr: longhorn_volume_actual_size_bytes > longhorn_volume_capacity_bytes * 2
-        for: 5m
-        labels:
-          issue: The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high.
-          severity: warning
-      - alert: LonghornVolumeStatusCritical
-        annotations:
-          description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for
-            more than 2 minutes.
-          summary: Longhorn volume {{$labels.volume}} is Fault
-        expr: longhorn_volume_robustness == 3
-        for: 5m
-        labels:
-          issue: Longhorn volume {{$labels.volume}} is Fault.
-          severity: critical
-      - alert: LonghornVolumeStatusWarning
-        annotations:
-          description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded for
-            more than 5 minutes.
-          summary: Longhorn volume {{$labels.volume}} is Degraded
-        expr: longhorn_volume_robustness == 2
-        for: 5m
-        labels:
-          issue: Longhorn volume {{$labels.volume}} is Degraded.
-          severity: warning
-      - alert: LonghornNodeStorageWarning
-        annotations:
-          description: The used storage of node {{$labels.node}} is at {{$value}}% capacity for
-            more than 5 minutes.
-          summary:  The used storage of node is over 70% of the capacity.
-        expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70
-        for: 5m
-        labels:
-          issue: The used storage of node {{$labels.node}} is high.
-          severity: warning
-      - alert: LonghornDiskStorageWarning
-        annotations:
-          description: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is at {{$value}}% capacity for
-            more than 5 minutes.
-          summary:  The used storage of disk is over 70% of the capacity.
-        expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70
-        for: 5m
-        labels:
-          issue: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high.
-          severity: warning
-      - alert: LonghornNodeDown
-        annotations:
-          description: There are {{$value}} Longhorn nodes which have been offline for more than 5 minutes.
-          summary: Longhorn nodes is offline
-        expr: (avg(longhorn_node_count_total) or on() vector(0)) - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0)) > 0
-        for: 5m
-        labels:
-          issue: There are {{$value}} Longhorn nodes are offline
-          severity: critical
-      - alert: LonghornIntanceManagerCPUUsageWarning
-        annotations:
-          description: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for
-            more than 5 minutes.
-          summary: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is over 300%.
-        expr: (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) * 100 > 300
-        for: 5m
-        labels:
-          issue: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} consumes 3 times the CPU request.
-          severity: warning
-      - alert: LonghornNodeCPUUsageWarning
-        annotations:
-          description: Longhorn node {{$labels.node}} has CPU Usage / CPU capacity is {{$value}}% for
-            more than 5 minutes.
-          summary: Longhorn node {{$labels.node}} experiences high CPU pressure for more than 5m.
-        expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90
-        for: 5m
-        labels:
-          issue: Longhorn node {{$labels.node}} experiences high CPU pressure.
-          severity: warning
+        - alert: LonghornVolumeActualSpaceUsedWarning
+          expr: longhorn_volume_actual_size_bytes > longhorn_volume_capacity_bytes * 2  # actual size above 2x the capacity
+          for: 5m
+          labels:
+            severity: warning
+            issue: "The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high."
+          annotations:
+            summary: "The actual used space of Longhorn volume is twice the size of the volume capacity."
+            description: "The accumulated snapshots for volume use up more space than the volume's capacity"
+        # Critical: volume robustness 3 ("Fault"). The description quotes the same 5m window that `for` enforces.
+        - alert: LonghornVolumeStatusCritical
+          annotations:
+            description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for more than 5 minutes.
+            summary: Longhorn volume {{$labels.volume}} is Fault
+          expr: longhorn_volume_robustness == 3
+          for: 5m
+          labels:
+            issue: Longhorn volume {{$labels.volume}} is Fault.
+            severity: critical
+        # Warning: volume robustness has stayed at 2 ("Degraded") for five minutes.
+        - alert: LonghornVolumeStatusWarning
+          expr: longhorn_volume_robustness == 2
+          for: 5m
+          labels:
+            severity: warning
+            issue: "Longhorn volume {{$labels.volume}} is Degraded."
+          annotations:
+            summary: "Longhorn volume {{$labels.volume}} is Degraded"
+            description: "Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded for more than 5 minutes."
+        # Warning: a node's used storage has stayed above 70% of its capacity for five minutes.
+        - alert: LonghornNodeStorageWarning
+          expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70
+          for: 5m
+          labels:
+            severity: warning
+            issue: "The used storage of node {{$labels.node}} is high."
+          annotations:
+            summary: "The used storage of node is over 70% of the capacity."
+            description: "The used storage of node {{$labels.node}} is at {{$value}}% capacity for more than 5 minutes."
+        # Warning: a disk's usage has stayed above 70% of its capacity for five minutes.
+        - alert: LonghornDiskStorageWarning
+          expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70
+          for: 5m
+          labels:
+            severity: warning
+            issue: "The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high."
+          annotations:
+            summary: "The used storage of disk is over 70% of the capacity."
+            description: "The used storage of disk {{$labels.disk}} on node {{$labels.node}} is at {{$value}}% capacity for more than 5 minutes."
+        - alert: LonghornNodeDown  # total node count minus ready node count is above zero
+          annotations:
+            description: There are {{$value}} Longhorn nodes which have been offline for more than 5 minutes.
+            summary: Longhorn nodes are offline
+          expr: (avg(longhorn_node_count_total) or on() vector(0)) - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0)) > 0
+          for: 5m
+          labels:  # keep {{$value}} out of labels: a changing label set starts a new alert and restarts `for`
+            issue: Longhorn nodes are offline
+            severity: critical
+        # Warning: CPU usage above 300% of the CPU request for 5m. NOTE(review): name is spelled "Intance"; kept verbatim.
+        - alert: LonghornIntanceManagerCPUUsageWarning
+          expr: (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) * 100 > 300
+          for: 5m
+          labels:
+            severity: warning
+            issue: "Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} consumes 3 times the CPU request."
+          annotations:
+            summary: "Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is over 300%."
+            description: "Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for more than 5 minutes."
+        # Warning: node CPU usage has stayed above 90% of node CPU capacity for five minutes.
+        - alert: LonghornNodeCPUUsageWarning
+          expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90
+          for: 5m
+          labels:
+            severity: warning
+            issue: "Longhorn node {{$labels.node}} experiences high CPU pressure."
+          annotations:
+            summary: "Longhorn node {{$labels.node}} experiences high CPU pressure for more than 5m."
+            description: "Longhorn node {{$labels.node}} has CPU Usage / CPU capacity is {{$value}}% for more than 5 minutes."