longhorn-system: Updates

This commit is contained in:
Lauri Võsandi 2024-08-14 07:36:31 +03:00
parent e5e4a07d01
commit cfc5a739a1
6 changed files with 126 additions and 4720 deletions

1
longhorn-system/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
longhorn.yaml

View File

@ -1,19 +1,40 @@
# Longhorn distributed block storage system # Longhorn distributed block storage system
Pull the manifest and apply changes ## For users
You should really avoid using Longhorn, as it has over time proven to be
an unreliable system. Prefer using remote databases in your application via
the Kubernetes operator pattern.
Use Longhorn for applications that need persistent storage, but are unable
to provide replication in the application layer:
* Applications that insist on writing to the filesystem
* Applications that serve Git repositories (eg Gitea)
* Applications that check out Git repositories (eg Woodpecker, Drone and CI systems)
* Applications that need to use SQLite
Instead of using the built-in `longhorn` storage class, please add a new storage class
with suitable replication, data locality parameters and reclaim policy
[here](https://git.k-space.ee/k-space/kube/src/branch/master/storage-class.yaml).
Longhorn backups are made once per day and are configured to be uploaded to
the MinIO S3 bucket hosted at nas.k-space.ee.
## For administrators
Longhorn was last upgraded with the following snippet:
``` ```
wget https://raw.githubusercontent.com/longhorn/longhorn/v1.5.1/deploy/longhorn.yaml -O application.yml wget https://raw.githubusercontent.com/longhorn/longhorn/v1.6.2/deploy/longhorn.yaml
patch -p0 < changes.diff patch -p0 < changes.diff
kubectl -n longhorn-system apply -f longhorn.yaml -f application-extras.yml -f backup.yaml
``` ```
To upgrade use following: After initial deployment `dedicated=storage:NoSchedule` was specified
```
kubectl -n longhorn-system apply -f application.yml -f application-extras.yml
```
After deploying specify `dedicated=storage:NoSchedule`
for `Kubernetes Taint Toleration` under `Setting -> General` on for `Kubernetes Taint Toleration` under `Setting -> General` on
[Longhorn Dashboard](https://longhorn.k-space.ee/). [Longhorn Dashboard](https://longhorn.k-space.ee/).
Proceed to tag suitable nodes with `storage` and disable Longhorn scheduling on others. Suitable nodes were tagged with `storage` and Longhorn scheduling was disabled on others.
Refer to `application.yaml` to see how backups are configured.

View File

@ -1,3 +1,4 @@
# yamllint disable rule:line-length
--- ---
apiVersion: codemowers.cloud/v1beta1 apiVersion: codemowers.cloud/v1beta1
kind: OIDCMiddlewareClient kind: OIDCMiddlewareClient
@ -27,19 +28,19 @@ metadata:
traefik.ingress.kubernetes.io/router.tls: "true" traefik.ingress.kubernetes.io/router.tls: "true"
spec: spec:
rules: rules:
- host: longhorn.k-space.ee - host: longhorn.k-space.ee
http: http:
paths: paths:
- pathType: Prefix - pathType: Prefix
path: "/" path: "/"
backend: backend:
service: service:
name: longhorn-frontend name: longhorn-frontend
port: port:
number: 80 number: 80
tls: tls:
- hosts: - hosts:
- "*.k-space.ee" - "*.k-space.ee"
--- ---
apiVersion: monitoring.coreos.com/v1 apiVersion: monitoring.coreos.com/v1
kind: PodMonitor kind: PodMonitor
@ -59,81 +60,81 @@ spec:
groups: groups:
- name: longhorn - name: longhorn
rules: rules:
- alert: LonghornVolumeActualSpaceUsedWarning - alert: LonghornVolumeActualSpaceUsedWarning
annotations: annotations:
description: The accumulated snapshots for volume use up more space than the volume's capacity description: The accumulated snapshots for volume use up more space than the volume's capacity
summary: The actual used space of Longhorn volume is twice the size of the volume capacity. summary: The actual used space of Longhorn volume is twice the size of the volume capacity.
expr: longhorn_volume_actual_size_bytes > longhorn_volume_capacity_bytes * 2 expr: longhorn_volume_actual_size_bytes > longhorn_volume_capacity_bytes * 2
for: 5m for: 5m
labels: labels:
issue: The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high. issue: The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high.
severity: warning severity: warning
- alert: LonghornVolumeStatusCritical - alert: LonghornVolumeStatusCritical
annotations: annotations:
description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for
more than 2 minutes. more than 2 minutes.
summary: Longhorn volume {{$labels.volume}} is Fault summary: Longhorn volume {{$labels.volume}} is Fault
expr: longhorn_volume_robustness == 3 expr: longhorn_volume_robustness == 3
for: 5m for: 5m
labels: labels:
issue: Longhorn volume {{$labels.volume}} is Fault. issue: Longhorn volume {{$labels.volume}} is Fault.
severity: critical severity: critical
- alert: LonghornVolumeStatusWarning - alert: LonghornVolumeStatusWarning
annotations: annotations:
description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded for description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded for
more than 5 minutes. more than 5 minutes.
summary: Longhorn volume {{$labels.volume}} is Degraded summary: Longhorn volume {{$labels.volume}} is Degraded
expr: longhorn_volume_robustness == 2 expr: longhorn_volume_robustness == 2
for: 5m for: 5m
labels: labels:
issue: Longhorn volume {{$labels.volume}} is Degraded. issue: Longhorn volume {{$labels.volume}} is Degraded.
severity: warning severity: warning
- alert: LonghornNodeStorageWarning - alert: LonghornNodeStorageWarning
annotations: annotations:
description: The used storage of node {{$labels.node}} is at {{$value}}% capacity for description: The used storage of node {{$labels.node}} is at {{$value}}% capacity for
more than 5 minutes. more than 5 minutes.
summary: The used storage of node is over 70% of the capacity. summary: The used storage of node is over 70% of the capacity.
expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70 expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70
for: 5m for: 5m
labels: labels:
issue: The used storage of node {{$labels.node}} is high. issue: The used storage of node {{$labels.node}} is high.
severity: warning severity: warning
- alert: LonghornDiskStorageWarning - alert: LonghornDiskStorageWarning
annotations: annotations:
description: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is at {{$value}}% capacity for description: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is at {{$value}}% capacity for
more than 5 minutes. more than 5 minutes.
summary: The used storage of disk is over 70% of the capacity. summary: The used storage of disk is over 70% of the capacity.
expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70 expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70
for: 5m for: 5m
labels: labels:
issue: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high. issue: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high.
severity: warning severity: warning
- alert: LonghornNodeDown - alert: LonghornNodeDown
annotations: annotations:
description: There are {{$value}} Longhorn nodes which have been offline for more than 5 minutes. description: There are {{$value}} Longhorn nodes which have been offline for more than 5 minutes.
summary: Longhorn nodes is offline summary: Longhorn nodes is offline
expr: (avg(longhorn_node_count_total) or on() vector(0)) - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0)) > 0 expr: (avg(longhorn_node_count_total) or on() vector(0)) - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0)) > 0
for: 5m for: 5m
labels: labels:
issue: There are {{$value}} Longhorn nodes are offline issue: There are {{$value}} Longhorn nodes are offline
severity: critical severity: critical
- alert: LonghornIntanceManagerCPUUsageWarning - alert: LonghornIntanceManagerCPUUsageWarning
annotations: annotations:
description: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for description: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for
more than 5 minutes. more than 5 minutes.
summary: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is over 300%. summary: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is over 300%.
expr: (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) * 100 > 300 expr: (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) * 100 > 300
for: 5m for: 5m
labels: labels:
issue: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} consumes 3 times the CPU request. issue: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} consumes 3 times the CPU request.
severity: warning severity: warning
- alert: LonghornNodeCPUUsageWarning - alert: LonghornNodeCPUUsageWarning
annotations: annotations:
description: Longhorn node {{$labels.node}} has CPU Usage / CPU capacity is {{$value}}% for description: Longhorn node {{$labels.node}} has CPU Usage / CPU capacity is {{$value}}% for
more than 5 minutes. more than 5 minutes.
summary: Longhorn node {{$labels.node}} experiences high CPU pressure for more than 5m. summary: Longhorn node {{$labels.node}} experiences high CPU pressure for more than 5m.
expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90 expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90
for: 5m for: 5m
labels: labels:
issue: Longhorn node {{$labels.node}} experiences high CPU pressure. issue: Longhorn node {{$labels.node}} experiences high CPU pressure.
severity: warning severity: warning

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
--- application.yml 2024-07-07 14:16:47.953593433 +0300 --- longhorn.yaml 2024-07-07 14:16:47.953593433 +0300
+++ application.modded 2024-07-07 14:18:51.103452617 +0300 +++ longhorn.modded 2024-07-07 14:18:51.103452617 +0300
@@ -86,14 +86,14 @@ @@ -86,14 +86,14 @@
storageclass.kubernetes.io/is-default-class: "true" storageclass.kubernetes.io/is-default-class: "true"
provisioner: driver.longhorn.io provisioner: driver.longhorn.io