longhorn-system: Updates

2024-08-14 07:36:31 +03:00
parent e5e4a07d01
commit cfc5a739a1
6 changed files with 126 additions and 4720 deletions
--- a/longhorn-system/.gitignore
+++ b/longhorn-system/.gitignore
@@ -0,0 +1 @@
+longhorn.yaml
--- a/longhorn-system/README.md
+++ b/longhorn-system/README.md
@@ -1,19 +1,40 @@
 # Longhorn distributed block storage system

-Pull the manifest and apply changes
+## For users
+
+You should really avoid using Longhorn, as over time it has proven to be
+an unreliable system. Prefer using remote databases in your application via
+the Kubernetes operator pattern.
+
+Use Longhorn for applications that need persistent storage, but are unable
+to provide replication in the application layer:
+
+* Applications that insist on writing to the filesystem
+* Applications that serve Git repositories (e.g. Gitea)
+* Applications that check out Git repositories (e.g. Woodpecker, Drone and other CI systems)
+* Applications that need to use SQLite
+
+Instead of using the built-in `longhorn` storage class, please add a new storage class
+with suitable replication, data locality parameters and reclaim policy
+[here](https://git.k-space.ee/k-space/kube/src/branch/master/storage-class.yaml).
+
+Longhorn backups are made once per day and are configured to be uploaded to
+the Minio S3 bucket hosted at nas.k-space.ee.
+
+
+## For administrators
+
+Longhorn was last upgraded with the following snippet:

 ```
-wget https://raw.githubusercontent.com/longhorn/longhorn/v1.5.1/deploy/longhorn.yaml -O application.yml
+wget https://raw.githubusercontent.com/longhorn/longhorn/v1.6.2/deploy/longhorn.yaml
 patch -p0 < changes.diff
+kubectl -n longhorn-system apply -f longhorn.yaml -f application-extras.yml -f backup.yaml
 ```

-To upgrade use following:
-
-```
-kubectl -n longhorn-system apply -f application.yml -f application-extras.yml
-```
-
-After deploying specify `dedicated=storage:NoSchedule`
+After initial deployment `dedicated=storage:NoSchedule` was specified
 for `Kubernetes Taint Toleration` under `Setting -> General` on
 [Longhorn Dashboard](https://longhorn.k-space.ee/).
-Proceed to tag suitable nodes with `storage` and disable Longhorn scheduling on others.
+Suitable nodes were tagged with `storage` and Longhorn scheduling was disabled on others.
+
+Refer to `backup.yaml` to see how backups are configured.
--- a/longhorn-system/application-extras.yml
+++ b/longhorn-system/application-extras.yml
@@ -1,3 +1,4 @@
+# yamllint disable rule:line-length
 ---
 apiVersion: codemowers.cloud/v1beta1
 kind: OIDCMiddlewareClient
@@ -27,19 +28,19 @@ metadata:
    traefik.ingress.kubernetes.io/router.tls: "true"
 spec:
  rules:
-  - host: longhorn.k-space.ee
-    http:
-      paths:
-      - pathType: Prefix
-        path: "/"
-        backend:
-          service:
-            name: longhorn-frontend
-            port:
-              number: 80
+    - host: longhorn.k-space.ee
+      http:
+        paths:
+          - pathType: Prefix
+            path: "/"
+            backend:
+              service:
+                name: longhorn-frontend
+                port:
+                  number: 80
  tls:
-  - hosts:
-    - "*.k-space.ee"
+    - hosts:
+        - "*.k-space.ee"
 ---
 apiVersion: monitoring.coreos.com/v1
 kind: PodMonitor
@@ -59,81 +60,81 @@ spec:
  groups:
    - name: longhorn
      rules:
-      - alert: LonghornVolumeActualSpaceUsedWarning
-        annotations:
-          description: The accumulated snapshots for volume use up more space than the volume's capacity
-          summary: The actual used space of Longhorn volume is twice the size of the volume capacity.
-        expr: longhorn_volume_actual_size_bytes > longhorn_volume_capacity_bytes * 2
-        for: 5m
-        labels:
-          issue: The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high.
-          severity: warning
-      - alert: LonghornVolumeStatusCritical
-        annotations:
-          description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for
-            more than 2 minutes.
-          summary: Longhorn volume {{$labels.volume}} is Fault
-        expr: longhorn_volume_robustness == 3
-        for: 5m
-        labels:
-          issue: Longhorn volume {{$labels.volume}} is Fault.
-          severity: critical
-      - alert: LonghornVolumeStatusWarning
-        annotations:
-          description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded for
-            more than 5 minutes.
-          summary: Longhorn volume {{$labels.volume}} is Degraded
-        expr: longhorn_volume_robustness == 2
-        for: 5m
-        labels:
-          issue: Longhorn volume {{$labels.volume}} is Degraded.
-          severity: warning
-      - alert: LonghornNodeStorageWarning
-        annotations:
-          description: The used storage of node {{$labels.node}} is at {{$value}}% capacity for
-            more than 5 minutes.
-          summary:  The used storage of node is over 70% of the capacity.
-        expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70
-        for: 5m
-        labels:
-          issue: The used storage of node {{$labels.node}} is high.
-          severity: warning
-      - alert: LonghornDiskStorageWarning
-        annotations:
-          description: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is at {{$value}}% capacity for
-            more than 5 minutes.
-          summary:  The used storage of disk is over 70% of the capacity.
-        expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70
-        for: 5m
-        labels:
-          issue: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high.
-          severity: warning
-      - alert: LonghornNodeDown
-        annotations:
-          description: There are {{$value}} Longhorn nodes which have been offline for more than 5 minutes.
-          summary: Longhorn nodes is offline
-        expr: (avg(longhorn_node_count_total) or on() vector(0)) - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0)) > 0
-        for: 5m
-        labels:
-          issue: There are {{$value}} Longhorn nodes are offline
-          severity: critical
-      - alert: LonghornIntanceManagerCPUUsageWarning
-        annotations:
-          description: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for
-            more than 5 minutes.
-          summary: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is over 300%.
-        expr: (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) * 100 > 300
-        for: 5m
-        labels:
-          issue: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} consumes 3 times the CPU request.
-          severity: warning
-      - alert: LonghornNodeCPUUsageWarning
-        annotations:
-          description: Longhorn node {{$labels.node}} has CPU Usage / CPU capacity is {{$value}}% for
-            more than 5 minutes.
-          summary: Longhorn node {{$labels.node}} experiences high CPU pressure for more than 5m.
-        expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90
-        for: 5m
-        labels:
-          issue: Longhorn node {{$labels.node}} experiences high CPU pressure.
-          severity: warning
+        - alert: LonghornVolumeActualSpaceUsedWarning
+          annotations:
+            description: The accumulated snapshots for volume use up more space than the volume's capacity
+            summary: The actual used space of Longhorn volume is twice the size of the volume capacity.
+          expr: longhorn_volume_actual_size_bytes > longhorn_volume_capacity_bytes * 2
+          for: 5m
+          labels:
+            issue: The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high.
+            severity: warning
+        - alert: LonghornVolumeStatusCritical
+          annotations:
+            description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for
+              more than 2 minutes.
+            summary: Longhorn volume {{$labels.volume}} is Fault
+          expr: longhorn_volume_robustness == 3
+          for: 5m
+          labels:
+            issue: Longhorn volume {{$labels.volume}} is Fault.
+            severity: critical
+        - alert: LonghornVolumeStatusWarning
+          annotations:
+            description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded for
+              more than 5 minutes.
+            summary: Longhorn volume {{$labels.volume}} is Degraded
+          expr: longhorn_volume_robustness == 2
+          for: 5m
+          labels:
+            issue: Longhorn volume {{$labels.volume}} is Degraded.
+            severity: warning
+        - alert: LonghornNodeStorageWarning
+          annotations:
+            description: The used storage of node {{$labels.node}} is at {{$value}}% capacity for
+              more than 5 minutes.
+            summary: The used storage of node is over 70% of the capacity.
+          expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70
+          for: 5m
+          labels:
+            issue: The used storage of node {{$labels.node}} is high.
+            severity: warning
+        - alert: LonghornDiskStorageWarning
+          annotations:
+            description: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is at {{$value}}% capacity for
+              more than 5 minutes.
+            summary: The used storage of disk is over 70% of the capacity.
+          expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70
+          for: 5m
+          labels:
+            issue: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high.
+            severity: warning
+        - alert: LonghornNodeDown
+          annotations:
+            description: There are {{$value}} Longhorn nodes which have been offline for more than 5 minutes.
+            summary: Longhorn nodes is offline
+          expr: (avg(longhorn_node_count_total) or on() vector(0)) - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0)) > 0
+          for: 5m
+          labels:
+            issue: There are {{$value}} Longhorn nodes are offline
+            severity: critical
+        - alert: LonghornIntanceManagerCPUUsageWarning
+          annotations:
+            description: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for
+              more than 5 minutes.
+            summary: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is over 300%.
+          expr: (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) * 100 > 300
+          for: 5m
+          labels:
+            issue: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} consumes 3 times the CPU request.
+            severity: warning
+        - alert: LonghornNodeCPUUsageWarning
+          annotations:
+            description: Longhorn node {{$labels.node}} has CPU Usage / CPU capacity is {{$value}}% for
+              more than 5 minutes.
+            summary: Longhorn node {{$labels.node}} experiences high CPU pressure for more than 5m.
+          expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90
+          for: 5m
+          labels:
+            issue: Longhorn node {{$labels.node}} experiences high CPU pressure.
+            severity: warning
--- a/longhorn-system/application.yml
+++ b/longhorn-system/application.yml
--- a/longhorn-system/changes.diff
+++ b/longhorn-system/changes.diff
@@ -1,5 +1,5 @@
--- application.yml	2024-07-07 14:16:47.953593433 +0300
-+++ application.modded	2024-07-07 14:18:51.103452617 +0300
+--- longhorn.yaml	2024-07-07 14:16:47.953593433 +0300
+++ longhorn.modded	2024-07-07 14:18:51.103452617 +0300
@@ -86,14 +86,14 @@
         storageclass.kubernetes.io/is-default-class: "true"
     provisioner: driver.longhorn.io