diff --git a/longhorn-system/.gitignore b/longhorn-system/.gitignore deleted file mode 100644 index d899503..0000000 --- a/longhorn-system/.gitignore +++ /dev/null @@ -1 +0,0 @@ -longhorn.yaml diff --git a/longhorn-system/README.md b/longhorn-system/README.md deleted file mode 100644 index fde44ef..0000000 --- a/longhorn-system/README.md +++ /dev/null @@ -1,41 +0,0 @@ -# Longhorn distributed block storage system - -## For users - -You should really avoid using Longhorn as it has over time -[proven to be unreliable system](https://www.reddit.com/r/kubernetes/comments/1cbggo8/longhorn_is_unreliable/). -Prefer using remote databases in your application via -the Kubernetes operator pattern. - -Use Longhorn for applications that need persistent storage, but are unable -to provide replication in the application layer: - -* Applications that insist writing into filesystem -* Applications that serve Git repositories (eg Gitea) -* Applications that check out Git repositories (eg Woodpecker, Drone and CI systems) -* Applications that need to use SQLite - -Instead of using built-in `longhorn` storage class, please add new storage class -with suitable replication, data locality parameters and reclaim policy -[here](https://git.k-space.ee/k-space/kube/src/branch/master/storage-class.yaml) - -Longhorn backups are made once per day and it's configured to be uploaded to -the Minio S3 bucket hosted at nas.k-space.ee - - -## For administrators - -Longhorn was last upgraded with following snippet: - -``` -wget https://raw.githubusercontent.com/longhorn/longhorn/v1.8.2/deploy/longhorn.yaml -patch -p0 < changes.diff -kubectl -n longhorn-system apply -f longhorn.yaml -f application-extras.yml -f backup.yaml -``` - -After initial deployment `dedicated=storage:NoSchedule` was specified -for `Kubernetes Taint Toleration` under `Setting -> General` on -[Longhorn Dashboard](https://longhorn.k-space.ee/). -Suitable nodes were tagged with `storage` and Longhorn scheduling was disabled on others. -This is to prevent scheduling Longhorn data on arbitrary Kubernetes nodes as -`storage[1-4].kube.k-space.ee` nodes are the ones which have additional 200G volume mounted at `/mnt/persistent/` diff --git a/longhorn-system/application-extras.yml b/longhorn-system/application-extras.yml deleted file mode 100644 index d55caf4..0000000 --- a/longhorn-system/application-extras.yml +++ /dev/null @@ -1,138 +0,0 @@ ---- -apiVersion: codemowers.cloud/v1beta1 -kind: OIDCMiddlewareClient -metadata: - name: ui -spec: - displayName: Longhorn - uri: 'https://longhorn.k-space.ee' - allowedGroups: - - k-space:kubernetes:admins - headerMapping: - email: Remote-Email - groups: Remote-Groups - name: Remote-Name - user: Remote-Username ---- -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: longhorn-dashboard - namespace: longhorn-system - annotations: - kubernetes.io/ingress.class: traefik - external-dns.alpha.kubernetes.io/target: traefik.k-space.ee - traefik.ingress.kubernetes.io/router.entrypoints: websecure - traefik.ingress.kubernetes.io/router.middlewares: longhorn-system-ui@kubernetescrd -spec: - rules: - - host: longhorn.k-space.ee - http: - paths: - - pathType: Prefix - path: "/" - backend: - service: - name: longhorn-frontend - port: - number: 80 - tls: - - hosts: - - "*.k-space.ee" ---- -apiVersion: monitoring.coreos.com/v1 -kind: PodMonitor -metadata: - name: manager -spec: - selector: {} - podMetricsEndpoints: - - port: manager ---- -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: longhorn -spec: - # Copied from https://longhorn.io/docs/1.2.4/monitoring/alert-rules-example/ - groups: - - name: longhorn - rules: - - alert: LonghornVolumeActualSpaceUsedWarning - annotations: - description: The accumulated snapshots for volume use up more space than the volume's capacity - summary: The actual used space of Longhorn volume is twice the size of the volume capacity. - expr: longhorn_volume_actual_size_bytes > longhorn_volume_capacity_bytes * 2 - for: 5m - labels: - issue: The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high. - severity: warning - - alert: LonghornVolumeStatusCritical - annotations: - description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for - more than 2 minutes. - summary: Longhorn volume {{$labels.volume}} is Fault - expr: longhorn_volume_robustness == 3 - for: 5m - labels: - issue: Longhorn volume {{$labels.volume}} is Fault. - severity: critical - - alert: LonghornVolumeStatusWarning - annotations: - description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded for - more than 5 minutes. - summary: Longhorn volume {{$labels.volume}} is Degraded - expr: longhorn_volume_robustness == 2 - for: 5m - labels: - issue: Longhorn volume {{$labels.volume}} is Degraded. - severity: warning - - alert: LonghornNodeStorageWarning - annotations: - description: The used storage of node {{$labels.node}} is at {{$value}}% capacity for - more than 5 minutes. - summary: The used storage of node is over 70% of the capacity. - expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70 - for: 5m - labels: - issue: The used storage of node {{$labels.node}} is high. - severity: warning - - alert: LonghornDiskStorageWarning - annotations: - description: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is at {{$value}}% capacity for - more than 5 minutes. - summary: The used storage of disk is over 70% of the capacity. - expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70 - for: 5m - labels: - issue: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high. - severity: warning - - alert: LonghornNodeDown - annotations: - description: There are {{$value}} Longhorn nodes which have been offline for more than 5 minutes. - summary: Longhorn nodes is offline - expr: (avg(longhorn_node_count_total) or on() vector(0)) - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0)) > 0 - for: 5m - labels: - issue: There are {{$value}} Longhorn nodes are offline - severity: critical - - alert: LonghornIntanceManagerCPUUsageWarning - annotations: - description: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for - more than 5 minutes. - summary: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is over 300%. - expr: (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) * 100 > 300 - for: 5m - labels: - issue: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} consumes 3 times the CPU request. - severity: warning - - alert: LonghornNodeCPUUsageWarning - annotations: - description: Longhorn node {{$labels.node}} has CPU Usage / CPU capacity is {{$value}}% for - more than 5 minutes. - summary: Longhorn node {{$labels.node}} experiences high CPU pressure for more than 5m. - expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90 - for: 5m - labels: - issue: Longhorn node {{$labels.node}} experiences high CPU pressure. - severity: warning diff --git a/longhorn-system/backup.yaml b/longhorn-system/backup.yaml deleted file mode 100644 index f34df76..0000000 --- a/longhorn-system/backup.yaml +++ /dev/null @@ -1,46 +0,0 @@ ---- -apiVersion: codemowers.cloud/v1beta1 -kind: MinioBucketClaim -metadata: - name: backup -spec: - capacity: 1Ti - class: external ---- -apiVersion: longhorn.io/v1beta2 -kind: Setting -metadata: - name: backup-target - namespace: longhorn-system -value: 's3://longhorn-system-a4b235c5-7919-4cb0-9949-259e60c579f1@us-east1/' ---- -apiVersion: longhorn.io/v1beta2 -kind: Setting -metadata: - name: backup-target-credential-secret - namespace: longhorn-system -value: 'miniobucket-backup-owner-secrets' ---- -apiVersion: longhorn.io/v1beta1 -kind: RecurringJob -metadata: - name: backup - namespace: longhorn-system -spec: - cron: "0 2 * * *" - task: backup - groups: - - default - retain: 1 - concurrency: 4 ---- -apiVersion: longhorn.io/v1beta1 -kind: RecurringJob -metadata: - name: trim - namespace: longhorn-system -spec: - cron: "0 * * * *" - task: trim - groups: - - default diff --git a/longhorn-system/changes.diff b/longhorn-system/changes.diff deleted file mode 100644 index 860c33f..0000000 --- a/longhorn-system/changes.diff +++ /dev/null @@ -1,53 +0,0 @@ ---- longhorn.yaml 2024-07-07 14:16:47.953593433 +0300 -+++ longhorn.modded 2024-07-07 14:18:51.103452617 +0300 -@@ -86,14 +86,14 @@ - storageclass.kubernetes.io/is-default-class: "true" - provisioner: driver.longhorn.io - allowVolumeExpansion: true -- reclaimPolicy: "Delete" -+ reclaimPolicy: "Retain" - volumeBindingMode: Immediate - parameters: -- numberOfReplicas: "3" -+ numberOfReplicas: "2" - staleReplicaTimeout: "30" - fromBackup: "" -- fsType: "ext4" -- dataLocality: "disabled" -+ fsType: "xfs" -+ dataLocality: "best-effort" - unmapMarkSnapChainRemoved: "ignored" - --- - # Source: longhorn/templates/crds.yaml -@@ -4379,6 +4379,15 @@ - app.kubernetes.io/version: v1.6.2 - app: longhorn-manager - spec: -+ tolerations: -+ - key: dedicated -+ operator: Equal -+ value: nvr -+ effect: NoSchedule -+ - key: arch -+ operator: Equal -+ value: arm64 -+ effect: NoSchedule - containers: - - name: longhorn-manager - image: longhornio/longhorn-manager:v1.6.2 -@@ -4484,6 +4493,15 @@ - app.kubernetes.io/version: v1.6.2 - app: longhorn-driver-deployer - spec: -+ tolerations: -+ - key: dedicated -+ operator: Equal -+ value: nvr -+ effect: NoSchedule -+ - key: arch -+ operator: Equal -+ value: arm64 -+ effect: NoSchedule - initContainers: - - name: wait-longhorn-manager - image: longhornio/longhorn-manager:v1.6.2