longhorn-system: Updates

2024-08-14 07:36:31 +03:00
parent e5e4a07d01
commit cfc5a739a1
6 changed files with 126 additions and 4720 deletions
--- a/longhorn-system/.gitignore
+++ b/longhorn-system/.gitignore
@@ -0,0 +1 @@
 longhorn.yaml
--- a/longhorn-system/README.md
+++ b/longhorn-system/README.md
@@ -1,19 +1,40 @@
 # Longhorn distributed block storage system
-Pull the manifest and apply changes
+## For users
 You should really avoid using Longhorn as it has over time proven to be
 an unreliable system. Prefer using remote databases in your application via
 the Kubernetes operator pattern.
 Use Longhorn for applications that need persistent storage, but are unable
 to provide replication in the application layer:
 * Applications that insist on writing to the filesystem
 * Applications that serve Git repositories (eg Gitea)
 * Applications that check out Git repositories (eg Woodpecker, Drone and CI systems)
 * Applications that need to use SQLite
 Instead of using built-in `longhorn` storage class, please add new storage class
 with suitable replication, data locality parameters and reclaim policy
 [here](https://git.k-space.ee/k-space/kube/src/branch/master/storage-class.yaml)
 Longhorn backups are made once per day and are configured to be uploaded to
 the Minio S3 bucket hosted at nas.k-space.ee
 ## For administrators
 Longhorn was last upgraded with following snippet:
 ```
-wget https://raw.githubusercontent.com/longhorn/longhorn/v1.5.1/deploy/longhorn.yaml -O application.yml
+wget https://raw.githubusercontent.com/longhorn/longhorn/v1.6.2/deploy/longhorn.yaml
 patch -p0 < changes.diff
 kubectl -n longhorn-system apply -f longhorn.yaml -f application-extras.yml -f backup.yaml
 ```
-To upgrade use following:
+After initial deployment `dedicated=storage:NoSchedule` was specified
 ```
 kubectl -n longhorn-system apply -f application.yml -f application-extras.yml
 ```
 After deploying specify `dedicated=storage:NoSchedule`
 for `Kubernetes Taint Toleration` under `Setting -> General` on
 [Longhorn Dashboard](https://longhorn.k-space.ee/).
-Proceed to tag suitable nodes with `storage` and disable Longhorn scheduling on others.
+Suitable nodes were tagged with `storage` and Longhorn scheduling was disabled on others.
 Refer to `backup.yaml` to see how backups are configured.
--- a/longhorn-system/application-extras.yml
+++ b/longhorn-system/application-extras.yml
@@ -1,3 +1,4 @@
 # yamllint disable rule:line-length
 ---
 apiVersion: codemowers.cloud/v1beta1
 kind: OIDCMiddlewareClient
@@ -27,19 +28,19 @@ metadata:
    traefik.ingress.kubernetes.io/router.tls: "true"
 spec:
  rules:
-  - host: longhorn.k-space.ee
+    - host: longhorn.k-space.ee
-    http:
+      http:
-      paths:
+        paths:
-      - pathType: Prefix
+          - pathType: Prefix
-        path: "/"
+            path: "/"
-        backend:
+            backend:
-          service:
+              service:
-            name: longhorn-frontend
+                name: longhorn-frontend
-            port:
+                port:
-              number: 80
+                  number: 80
  tls:
-  - hosts:
+    - hosts:
-    - "*.k-space.ee"
+        - "*.k-space.ee"
 ---
 apiVersion: monitoring.coreos.com/v1
 kind: PodMonitor
@@ -59,81 +60,81 @@ spec:
  groups:
    - name: longhorn
      rules:
-      - alert: LonghornVolumeActualSpaceUsedWarning
+        - alert: LonghornVolumeActualSpaceUsedWarning
-        annotations:
+          annotations:
-          description: The accumulated snapshots for volume use up more space than the volume's capacity
+            description: The accumulated snapshots for volume use up more space than the volume's capacity
-          summary: The actual used space of Longhorn volume is twice the size of the volume capacity.
+            summary: The actual used space of Longhorn volume is twice the size of the volume capacity.
-        expr: longhorn_volume_actual_size_bytes > longhorn_volume_capacity_bytes * 2
+          expr: longhorn_volume_actual_size_bytes > longhorn_volume_capacity_bytes * 2
-        for: 5m
+          for: 5m
-        labels:
+          labels:
-          issue: The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high.
+            issue: The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high.
-          severity: warning
+            severity: warning
-      - alert: LonghornVolumeStatusCritical
+        - alert: LonghornVolumeStatusCritical
-        annotations:
+          annotations:
-          description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for
+            description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for
-            more than 2 minutes.
+              more than 2 minutes.
-          summary: Longhorn volume {{$labels.volume}} is Fault
+            summary: Longhorn volume {{$labels.volume}} is Fault
-        expr: longhorn_volume_robustness == 3
+          expr: longhorn_volume_robustness == 3
-        for: 5m
+          for: 5m
-        labels:
+          labels:
-          issue: Longhorn volume {{$labels.volume}} is Fault.
+            issue: Longhorn volume {{$labels.volume}} is Fault.
-          severity: critical
+            severity: critical
-      - alert: LonghornVolumeStatusWarning
+        - alert: LonghornVolumeStatusWarning
-        annotations:
+          annotations:
-          description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded for
+            description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded for
-            more than 5 minutes.
+              more than 5 minutes.
-          summary: Longhorn volume {{$labels.volume}} is Degraded
+            summary: Longhorn volume {{$labels.volume}} is Degraded
-        expr: longhorn_volume_robustness == 2
+          expr: longhorn_volume_robustness == 2
-        for: 5m
+          for: 5m
-        labels:
+          labels:
-          issue: Longhorn volume {{$labels.volume}} is Degraded.
+            issue: Longhorn volume {{$labels.volume}} is Degraded.
-          severity: warning
+            severity: warning
-      - alert: LonghornNodeStorageWarning
+        - alert: LonghornNodeStorageWarning
-        annotations:
+          annotations:
-          description: The used storage of node {{$labels.node}} is at {{$value}}% capacity for
+            description: The used storage of node {{$labels.node}} is at {{$value}}% capacity for
-            more than 5 minutes.
+              more than 5 minutes.
-          summary:  The used storage of node is over 70% of the capacity.
+            summary: The used storage of node is over 70% of the capacity.
-        expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70
+          expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70
-        for: 5m
+          for: 5m
-        labels:
+          labels:
-          issue: The used storage of node {{$labels.node}} is high.
+            issue: The used storage of node {{$labels.node}} is high.
-          severity: warning
+            severity: warning
-      - alert: LonghornDiskStorageWarning
+        - alert: LonghornDiskStorageWarning
-        annotations:
+          annotations:
-          description: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is at {{$value}}% capacity for
+            description: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is at {{$value}}% capacity for
-            more than 5 minutes.
+              more than 5 minutes.
-          summary:  The used storage of disk is over 70% of the capacity.
+            summary: The used storage of disk is over 70% of the capacity.
-        expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70
+          expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70
-        for: 5m
+          for: 5m
-        labels:
+          labels:
-          issue: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high.
+            issue: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high.
-          severity: warning
+            severity: warning
-      - alert: LonghornNodeDown
+        - alert: LonghornNodeDown
-        annotations:
+          annotations:
-          description: There are {{$value}} Longhorn nodes which have been offline for more than 5 minutes.
+            description: There are {{$value}} Longhorn nodes which have been offline for more than 5 minutes.
-          summary: Longhorn nodes is offline
+            summary: Longhorn nodes is offline
-        expr: (avg(longhorn_node_count_total) or on() vector(0)) - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0)) > 0
+          expr: (avg(longhorn_node_count_total) or on() vector(0)) - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0)) > 0
-        for: 5m
+          for: 5m
-        labels:
+          labels:
-          issue: There are {{$value}} Longhorn nodes are offline
+            issue: There are {{$value}} Longhorn nodes are offline
-          severity: critical
+            severity: critical
-      - alert: LonghornIntanceManagerCPUUsageWarning
+        - alert: LonghornIntanceManagerCPUUsageWarning
-        annotations:
+          annotations:
-          description: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for
+            description: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for
-            more than 5 minutes.
+              more than 5 minutes.
-          summary: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is over 300%.
+            summary: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is over 300%.
-        expr: (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) * 100 > 300
+          expr: (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) * 100 > 300
-        for: 5m
+          for: 5m
-        labels:
+          labels:
-          issue: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} consumes 3 times the CPU request.
+            issue: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} consumes 3 times the CPU request.
-          severity: warning
+            severity: warning
-      - alert: LonghornNodeCPUUsageWarning
+        - alert: LonghornNodeCPUUsageWarning
-        annotations:
+          annotations:
-          description: Longhorn node {{$labels.node}} has CPU Usage / CPU capacity is {{$value}}% for
+            description: Longhorn node {{$labels.node}} has CPU Usage / CPU capacity is {{$value}}% for
-            more than 5 minutes.
+              more than 5 minutes.
-          summary: Longhorn node {{$labels.node}} experiences high CPU pressure for more than 5m.
+            summary: Longhorn node {{$labels.node}} experiences high CPU pressure for more than 5m.
-        expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90
+          expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90
-        for: 5m
+          for: 5m
-        labels:
+          labels:
-          issue: Longhorn node {{$labels.node}} experiences high CPU pressure.
+            issue: Longhorn node {{$labels.node}} experiences high CPU pressure.
-          severity: warning
+            severity: warning
--- a/longhorn-system/application.yml
+++ b/longhorn-system/application.yml
--- a/longhorn-system/backup.yaml
+++ b/longhorn-system/backup.yaml
@@ -24,7 +24,7 @@ value: 'miniobucket-backup-owner-secrets'
 apiVersion: longhorn.io/v1beta1
 kind: RecurringJob
 metadata:
-  name: backup 
+  name: backup
  namespace: longhorn-system
 spec:
  cron: "0 2 * * *"
--- a/longhorn-system/changes.diff
+++ b/longhorn-system/changes.diff
@@ -1,5 +1,5 @@
--- application.yml	2024-07-07 14:16:47.953593433 +0300
+--- longhorn.yaml	2024-07-07 14:16:47.953593433 +0300
-+++ application.modded	2024-07-07 14:18:51.103452617 +0300
+++ longhorn.modded	2024-07-07 14:18:51.103452617 +0300
@@ -86,14 +86,14 @@
         storageclass.kubernetes.io/is-default-class: "true"
     provisioner: driver.longhorn.io