forked from k-space/kube
longhorn-system: Updates
This commit is contained in:
parent
e5e4a07d01
commit
cfc5a739a1
1
longhorn-system/.gitignore
vendored
Normal file
1
longhorn-system/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
longhorn.yaml
|
@ -1,19 +1,40 @@
|
|||||||
# Longhorn distributed block storage system
|
# Longhorn distributed block storage system
|
||||||
|
|
||||||
Pull the manifest and apply changes
|
## For users
|
||||||
|
|
||||||
|
You should really avoid using Longhorn as it has over time proven to be
|
||||||
|
an unreliable system. Prefer using remote databases in your application via
|
||||||
|
the Kubernetes operator pattern.
|
||||||
|
|
||||||
|
Use Longhorn for applications that need persistent storage, but are unable
|
||||||
|
to provide replication in the application layer:
|
||||||
|
|
||||||
|
* Applications that insist on writing to the filesystem
|
||||||
|
* Applications that serve Git repositories (eg Gitea)
|
||||||
|
* Applications that check out Git repositories (eg Woodpecker, Drone and CI systems)
|
||||||
|
* Applications that need to use SQLite
|
||||||
|
|
||||||
|
Instead of using the built-in `longhorn` storage class, please add a new storage class
|
||||||
|
with suitable replication, data locality parameters and reclaim policy
|
||||||
|
[here](https://git.k-space.ee/k-space/kube/src/branch/master/storage-class.yaml)
|
||||||
|
|
||||||
|
Longhorn backups are made once per day and are configured to be uploaded to
|
||||||
|
the Minio S3 bucket hosted at nas.k-space.ee
|
||||||
|
|
||||||
|
|
||||||
|
## For administrators
|
||||||
|
|
||||||
|
Longhorn was last upgraded with the following snippet:
|
||||||
|
|
||||||
```
|
```
|
||||||
wget https://raw.githubusercontent.com/longhorn/longhorn/v1.5.1/deploy/longhorn.yaml -O application.yml
|
wget https://raw.githubusercontent.com/longhorn/longhorn/v1.6.2/deploy/longhorn.yaml
|
||||||
patch -p0 < changes.diff
|
patch -p0 < changes.diff
|
||||||
|
kubectl -n longhorn-system apply -f longhorn.yaml -f application-extras.yml -f backup.yaml
|
||||||
```
|
```
|
||||||
|
|
||||||
To upgrade use following:
|
After initial deployment `dedicated=storage:NoSchedule` was specified
|
||||||
|
|
||||||
```
|
|
||||||
kubectl -n longhorn-system apply -f application.yml -f application-extras.yml
|
|
||||||
```
|
|
||||||
|
|
||||||
After deploying specify `dedicated=storage:NoSchedule`
|
|
||||||
for `Kubernetes Taint Toleration` under `Setting -> General` on
|
for `Kubernetes Taint Toleration` under `Setting -> General` on
|
||||||
[Longhorn Dashboard](https://longhorn.k-space.ee/).
|
[Longhorn Dashboard](https://longhorn.k-space.ee/).
|
||||||
Proceed to tag suitable nodes with `storage` and disable Longhorn scheduling on others.
|
Suitable nodes were tagged with `storage` and Longhorn scheduling was disabled on others.
|
||||||
|
|
||||||
|
Refer to `application.yaml` to see how backups are configured.
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
# yamllint disable rule:line-length
|
||||||
---
|
---
|
||||||
apiVersion: codemowers.cloud/v1beta1
|
apiVersion: codemowers.cloud/v1beta1
|
||||||
kind: OIDCMiddlewareClient
|
kind: OIDCMiddlewareClient
|
||||||
@ -27,19 +28,19 @@ metadata:
|
|||||||
traefik.ingress.kubernetes.io/router.tls: "true"
|
traefik.ingress.kubernetes.io/router.tls: "true"
|
||||||
spec:
|
spec:
|
||||||
rules:
|
rules:
|
||||||
- host: longhorn.k-space.ee
|
- host: longhorn.k-space.ee
|
||||||
http:
|
http:
|
||||||
paths:
|
paths:
|
||||||
- pathType: Prefix
|
- pathType: Prefix
|
||||||
path: "/"
|
path: "/"
|
||||||
backend:
|
backend:
|
||||||
service:
|
service:
|
||||||
name: longhorn-frontend
|
name: longhorn-frontend
|
||||||
port:
|
port:
|
||||||
number: 80
|
number: 80
|
||||||
tls:
|
tls:
|
||||||
- hosts:
|
- hosts:
|
||||||
- "*.k-space.ee"
|
- "*.k-space.ee"
|
||||||
---
|
---
|
||||||
apiVersion: monitoring.coreos.com/v1
|
apiVersion: monitoring.coreos.com/v1
|
||||||
kind: PodMonitor
|
kind: PodMonitor
|
||||||
@ -59,81 +60,81 @@ spec:
|
|||||||
groups:
|
groups:
|
||||||
- name: longhorn
|
- name: longhorn
|
||||||
rules:
|
rules:
|
||||||
- alert: LonghornVolumeActualSpaceUsedWarning
|
- alert: LonghornVolumeActualSpaceUsedWarning
|
||||||
annotations:
|
annotations:
|
||||||
description: The accumulated snapshots for volume use up more space than the volume's capacity
|
description: The accumulated snapshots for volume use up more space than the volume's capacity
|
||||||
summary: The actual used space of Longhorn volume is twice the size of the volume capacity.
|
summary: The actual used space of Longhorn volume is twice the size of the volume capacity.
|
||||||
expr: longhorn_volume_actual_size_bytes > longhorn_volume_capacity_bytes * 2
|
expr: longhorn_volume_actual_size_bytes > longhorn_volume_capacity_bytes * 2
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
issue: The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high.
|
issue: The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high.
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: LonghornVolumeStatusCritical
|
- alert: LonghornVolumeStatusCritical
|
||||||
annotations:
|
annotations:
|
||||||
description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for
|
description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for
|
||||||
more than 2 minutes.
|
more than 2 minutes.
|
||||||
summary: Longhorn volume {{$labels.volume}} is Fault
|
summary: Longhorn volume {{$labels.volume}} is Fault
|
||||||
expr: longhorn_volume_robustness == 3
|
expr: longhorn_volume_robustness == 3
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
issue: Longhorn volume {{$labels.volume}} is Fault.
|
issue: Longhorn volume {{$labels.volume}} is Fault.
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: LonghornVolumeStatusWarning
|
- alert: LonghornVolumeStatusWarning
|
||||||
annotations:
|
annotations:
|
||||||
description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded for
|
description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded for
|
||||||
more than 5 minutes.
|
more than 5 minutes.
|
||||||
summary: Longhorn volume {{$labels.volume}} is Degraded
|
summary: Longhorn volume {{$labels.volume}} is Degraded
|
||||||
expr: longhorn_volume_robustness == 2
|
expr: longhorn_volume_robustness == 2
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
issue: Longhorn volume {{$labels.volume}} is Degraded.
|
issue: Longhorn volume {{$labels.volume}} is Degraded.
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: LonghornNodeStorageWarning
|
- alert: LonghornNodeStorageWarning
|
||||||
annotations:
|
annotations:
|
||||||
description: The used storage of node {{$labels.node}} is at {{$value}}% capacity for
|
description: The used storage of node {{$labels.node}} is at {{$value}}% capacity for
|
||||||
more than 5 minutes.
|
more than 5 minutes.
|
||||||
summary: The used storage of node is over 70% of the capacity.
|
summary: The used storage of node is over 70% of the capacity.
|
||||||
expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70
|
expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
issue: The used storage of node {{$labels.node}} is high.
|
issue: The used storage of node {{$labels.node}} is high.
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: LonghornDiskStorageWarning
|
- alert: LonghornDiskStorageWarning
|
||||||
annotations:
|
annotations:
|
||||||
description: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is at {{$value}}% capacity for
|
description: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is at {{$value}}% capacity for
|
||||||
more than 5 minutes.
|
more than 5 minutes.
|
||||||
summary: The used storage of disk is over 70% of the capacity.
|
summary: The used storage of disk is over 70% of the capacity.
|
||||||
expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70
|
expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
issue: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high.
|
issue: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high.
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: LonghornNodeDown
|
- alert: LonghornNodeDown
|
||||||
annotations:
|
annotations:
|
||||||
description: There are {{$value}} Longhorn nodes which have been offline for more than 5 minutes.
|
description: There are {{$value}} Longhorn nodes which have been offline for more than 5 minutes.
|
||||||
summary: Longhorn nodes is offline
|
summary: Longhorn nodes is offline
|
||||||
expr: (avg(longhorn_node_count_total) or on() vector(0)) - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0)) > 0
|
expr: (avg(longhorn_node_count_total) or on() vector(0)) - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0)) > 0
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
issue: There are {{$value}} Longhorn nodes are offline
|
issue: There are {{$value}} Longhorn nodes are offline
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: LonghornIntanceManagerCPUUsageWarning
|
- alert: LonghornIntanceManagerCPUUsageWarning
|
||||||
annotations:
|
annotations:
|
||||||
description: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for
|
description: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for
|
||||||
more than 5 minutes.
|
more than 5 minutes.
|
||||||
summary: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is over 300%.
|
summary: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is over 300%.
|
||||||
expr: (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) * 100 > 300
|
expr: (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) * 100 > 300
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
issue: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} consumes 3 times the CPU request.
|
issue: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} consumes 3 times the CPU request.
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: LonghornNodeCPUUsageWarning
|
- alert: LonghornNodeCPUUsageWarning
|
||||||
annotations:
|
annotations:
|
||||||
description: Longhorn node {{$labels.node}} has CPU Usage / CPU capacity is {{$value}}% for
|
description: Longhorn node {{$labels.node}} has CPU Usage / CPU capacity is {{$value}}% for
|
||||||
more than 5 minutes.
|
more than 5 minutes.
|
||||||
summary: Longhorn node {{$labels.node}} experiences high CPU pressure for more than 5m.
|
summary: Longhorn node {{$labels.node}} experiences high CPU pressure for more than 5m.
|
||||||
expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90
|
expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
issue: Longhorn node {{$labels.node}} experiences high CPU pressure.
|
issue: Longhorn node {{$labels.node}} experiences high CPU pressure.
|
||||||
severity: warning
|
severity: warning
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -24,7 +24,7 @@ value: 'miniobucket-backup-owner-secrets'
|
|||||||
apiVersion: longhorn.io/v1beta1
|
apiVersion: longhorn.io/v1beta1
|
||||||
kind: RecurringJob
|
kind: RecurringJob
|
||||||
metadata:
|
metadata:
|
||||||
name: backup
|
name: backup
|
||||||
namespace: longhorn-system
|
namespace: longhorn-system
|
||||||
spec:
|
spec:
|
||||||
cron: "0 2 * * *"
|
cron: "0 2 * * *"
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
--- application.yml 2024-07-07 14:16:47.953593433 +0300
|
--- longhorn.yaml 2024-07-07 14:16:47.953593433 +0300
|
||||||
+++ application.modded 2024-07-07 14:18:51.103452617 +0300
|
+++ longhorn.modded 2024-07-07 14:18:51.103452617 +0300
|
||||||
@@ -86,14 +86,14 @@
|
@@ -86,14 +86,14 @@
|
||||||
storageclass.kubernetes.io/is-default-class: "true"
|
storageclass.kubernetes.io/is-default-class: "true"
|
||||||
provisioner: driver.longhorn.io
|
provisioner: driver.longhorn.io
|
||||||
|
Loading…
Reference in New Issue
Block a user