longhorn-system: Updates

This commit is contained in:
Lauri Võsandi 2024-08-14 07:36:31 +03:00
parent e5e4a07d01
commit cfc5a739a1
6 changed files with 126 additions and 4720 deletions

1
longhorn-system/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
longhorn.yaml

View File

@ -1,19 +1,40 @@
# Longhorn distributed block storage system
Pull the manifest and apply changes
## For users
You should really avoid using Longhorn as it has over time proven to be
unreliable system. Prefer using remote databases in your application via
the Kubernetes operator pattern.
Use Longhorn for applications that need persistent storage, but are unable
to provide replication in the application layer:
* Applications that insist writing into filesystem
* Applications that serve Git repositories (eg Gitea)
* Applications that check out Git repositories (eg Woodpecker, Drone and CI systems)
* Applications that need to use SQLite
Instead of using built-in `longhorn` storage class, please add new storage class
with suitable replication, data locality parameters and reclaim policy
[https://git.k-space.ee/k-space/kube/src/branch/master/storage-class.yaml](here)
Longhorn backups are made once per day and it's configured to be uploaded to
the Minio S3 bucket hosted at nas.k-space.ee
## For administrators
Longhorn was last upgraded with following snippet:
```
wget https://raw.githubusercontent.com/longhorn/longhorn/v1.5.1/deploy/longhorn.yaml -O application.yml
wget https://raw.githubusercontent.com/longhorn/longhorn/v1.6.2/deploy/longhorn.yaml
patch -p0 < changes.diff
kubectl -n longhorn-system apply -f longhorn.yml -f application-extras.yml -f backup.yaml
```
To upgrade use following:
```
kubectl -n longhorn-system apply -f application.yml -f application-extras.yml
```
After deploying specify `dedicated=storage:NoSchedule`
After initial deployment `dedicated=storage:NoSchedule` was specified
for `Kubernetes Taint Toleration` under `Setting -> General` on
[Longhorn Dashboard](https://longhorn.k-space.ee/).
Proceed to tag suitable nodes with `storage` and disable Longhorn scheduling on others.
Suitable nodes were tagged with `storage` and Longhorn scheduling was disabled on others.
Refer to `application.yaml` to see how backups are configured.

View File

@ -1,3 +1,4 @@
# yamllint disable rule:line-length
---
apiVersion: codemowers.cloud/v1beta1
kind: OIDCMiddlewareClient
@ -27,19 +28,19 @@ metadata:
traefik.ingress.kubernetes.io/router.tls: "true"
spec:
rules:
- host: longhorn.k-space.ee
http:
paths:
- pathType: Prefix
path: "/"
backend:
service:
name: longhorn-frontend
port:
number: 80
- host: longhorn.k-space.ee
http:
paths:
- pathType: Prefix
path: "/"
backend:
service:
name: longhorn-frontend
port:
number: 80
tls:
- hosts:
- "*.k-space.ee"
- hosts:
- "*.k-space.ee"
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
@ -59,81 +60,81 @@ spec:
groups:
- name: longhorn
rules:
- alert: LonghornVolumeActualSpaceUsedWarning
annotations:
description: The accumulated snapshots for volume use up more space than the volume's capacity
summary: The actual used space of Longhorn volume is twice the size of the volume capacity.
expr: longhorn_volume_actual_size_bytes > longhorn_volume_capacity_bytes * 2
for: 5m
labels:
issue: The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high.
severity: warning
- alert: LonghornVolumeStatusCritical
annotations:
description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for
more than 2 minutes.
summary: Longhorn volume {{$labels.volume}} is Fault
expr: longhorn_volume_robustness == 3
for: 5m
labels:
issue: Longhorn volume {{$labels.volume}} is Fault.
severity: critical
- alert: LonghornVolumeStatusWarning
annotations:
description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded for
more than 5 minutes.
summary: Longhorn volume {{$labels.volume}} is Degraded
expr: longhorn_volume_robustness == 2
for: 5m
labels:
issue: Longhorn volume {{$labels.volume}} is Degraded.
severity: warning
- alert: LonghornNodeStorageWarning
annotations:
description: The used storage of node {{$labels.node}} is at {{$value}}% capacity for
more than 5 minutes.
summary: The used storage of node is over 70% of the capacity.
expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70
for: 5m
labels:
issue: The used storage of node {{$labels.node}} is high.
severity: warning
- alert: LonghornDiskStorageWarning
annotations:
description: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is at {{$value}}% capacity for
more than 5 minutes.
summary: The used storage of disk is over 70% of the capacity.
expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70
for: 5m
labels:
issue: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high.
severity: warning
- alert: LonghornNodeDown
annotations:
description: There are {{$value}} Longhorn nodes which have been offline for more than 5 minutes.
summary: Longhorn nodes is offline
expr: (avg(longhorn_node_count_total) or on() vector(0)) - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0)) > 0
for: 5m
labels:
issue: There are {{$value}} Longhorn nodes are offline
severity: critical
- alert: LonghornIntanceManagerCPUUsageWarning
annotations:
description: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for
more than 5 minutes.
summary: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is over 300%.
expr: (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) * 100 > 300
for: 5m
labels:
issue: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} consumes 3 times the CPU request.
severity: warning
- alert: LonghornNodeCPUUsageWarning
annotations:
description: Longhorn node {{$labels.node}} has CPU Usage / CPU capacity is {{$value}}% for
more than 5 minutes.
summary: Longhorn node {{$labels.node}} experiences high CPU pressure for more than 5m.
expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90
for: 5m
labels:
issue: Longhorn node {{$labels.node}} experiences high CPU pressure.
severity: warning
- alert: LonghornVolumeActualSpaceUsedWarning
annotations:
description: The accumulated snapshots for volume use up more space than the volume's capacity
summary: The actual used space of Longhorn volume is twice the size of the volume capacity.
expr: longhorn_volume_actual_size_bytes > longhorn_volume_capacity_bytes * 2
for: 5m
labels:
issue: The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high.
severity: warning
- alert: LonghornVolumeStatusCritical
annotations:
description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for
more than 2 minutes.
summary: Longhorn volume {{$labels.volume}} is Fault
expr: longhorn_volume_robustness == 3
for: 5m
labels:
issue: Longhorn volume {{$labels.volume}} is Fault.
severity: critical
- alert: LonghornVolumeStatusWarning
annotations:
description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded for
more than 5 minutes.
summary: Longhorn volume {{$labels.volume}} is Degraded
expr: longhorn_volume_robustness == 2
for: 5m
labels:
issue: Longhorn volume {{$labels.volume}} is Degraded.
severity: warning
- alert: LonghornNodeStorageWarning
annotations:
description: The used storage of node {{$labels.node}} is at {{$value}}% capacity for
more than 5 minutes.
summary: The used storage of node is over 70% of the capacity.
expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70
for: 5m
labels:
issue: The used storage of node {{$labels.node}} is high.
severity: warning
- alert: LonghornDiskStorageWarning
annotations:
description: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is at {{$value}}% capacity for
more than 5 minutes.
summary: The used storage of disk is over 70% of the capacity.
expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70
for: 5m
labels:
issue: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high.
severity: warning
- alert: LonghornNodeDown
annotations:
description: There are {{$value}} Longhorn nodes which have been offline for more than 5 minutes.
summary: Longhorn nodes is offline
expr: (avg(longhorn_node_count_total) or on() vector(0)) - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0)) > 0
for: 5m
labels:
issue: There are {{$value}} Longhorn nodes are offline
severity: critical
- alert: LonghornIntanceManagerCPUUsageWarning
annotations:
description: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for
more than 5 minutes.
summary: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is over 300%.
expr: (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) * 100 > 300
for: 5m
labels:
issue: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} consumes 3 times the CPU request.
severity: warning
- alert: LonghornNodeCPUUsageWarning
annotations:
description: Longhorn node {{$labels.node}} has CPU Usage / CPU capacity is {{$value}}% for
more than 5 minutes.
summary: Longhorn node {{$labels.node}} experiences high CPU pressure for more than 5m.
expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90
for: 5m
labels:
issue: Longhorn node {{$labels.node}} experiences high CPU pressure.
severity: warning

File diff suppressed because it is too large Load Diff

View File

@ -24,7 +24,7 @@ value: 'miniobucket-backup-owner-secrets'
apiVersion: longhorn.io/v1beta1
kind: RecurringJob
metadata:
name: backup
name: backup
namespace: longhorn-system
spec:
cron: "0 2 * * *"

View File

@ -1,5 +1,5 @@
--- application.yml 2024-07-07 14:16:47.953593433 +0300
+++ application.modded 2024-07-07 14:18:51.103452617 +0300
--- longhorn.yaml 2024-07-07 14:16:47.953593433 +0300
+++ longhorn.modded 2024-07-07 14:18:51.103452617 +0300
@@ -86,14 +86,14 @@
storageclass.kubernetes.io/is-default-class: "true"
provisioner: driver.longhorn.io