forked from k-space/kube
		
	longhorn-system: Updates
This commit is contained in:
		
							
								
								
									
										1
									
								
								longhorn-system/.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								longhorn-system/.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1 @@ | ||||
| longhorn.yaml | ||||
| @@ -1,19 +1,40 @@ | ||||
| # Longhorn distributed block storage system | ||||
|  | ||||
| Pull the manifest and apply changes | ||||
| ## For users | ||||
|  | ||||
| You should really avoid using Longhorn, as over time it has proven to be | ||||
| an unreliable system. Prefer using remote databases in your application via | ||||
| the Kubernetes operator pattern. | ||||
|  | ||||
| Use Longhorn for applications that need persistent storage, but are unable | ||||
| to provide replication in the application layer: | ||||
|  | ||||
| * Applications that insist on writing to the filesystem | ||||
| * Applications that serve Git repositories (eg Gitea) | ||||
| * Applications that check out Git repositories (eg Woodpecker, Drone and CI systems) | ||||
| * Applications that need to use SQLite | ||||
|  | ||||
| Instead of using the built-in `longhorn` storage class, please add a new storage class | ||||
| with suitable replication, data locality parameters and reclaim policy | ||||
| [here](https://git.k-space.ee/k-space/kube/src/branch/master/storage-class.yaml). | ||||
|  | ||||
| Longhorn backups are made once per day and are configured to be uploaded to | ||||
| the Minio S3 bucket hosted at nas.k-space.ee. | ||||
|  | ||||
|  | ||||
| ## For administrators | ||||
|  | ||||
| Longhorn was last upgraded with the following snippet: | ||||
|  | ||||
| ``` | ||||
| wget https://raw.githubusercontent.com/longhorn/longhorn/v1.5.1/deploy/longhorn.yaml -O application.yml | ||||
| wget https://raw.githubusercontent.com/longhorn/longhorn/v1.6.2/deploy/longhorn.yaml | ||||
| patch -p0 < changes.diff | ||||
| kubectl -n longhorn-system apply -f longhorn.yaml -f application-extras.yml -f backup.yaml | ||||
| ``` | ||||
|  | ||||
| To upgrade use following: | ||||
|  | ||||
| ``` | ||||
| kubectl -n longhorn-system apply -f application.yml -f application-extras.yml | ||||
| ``` | ||||
|  | ||||
| After deploying specify `dedicated=storage:NoSchedule` | ||||
| After initial deployment `dedicated=storage:NoSchedule` was specified | ||||
| for `Kubernetes Taint Toleration` under `Setting -> General` on | ||||
| [Longhorn Dashboard](https://longhorn.k-space.ee/). | ||||
| Proceed to tag suitable nodes with `storage` and disable Longhorn scheduling on others. | ||||
| Suitable nodes were tagged with `storage` and Longhorn scheduling was disabled on others. | ||||
|  | ||||
| Refer to `application.yaml` to see how backups are configured. | ||||
|   | ||||
| @@ -1,3 +1,4 @@ | ||||
| # yamllint disable rule:line-length | ||||
| --- | ||||
| apiVersion: codemowers.cloud/v1beta1 | ||||
| kind: OIDCMiddlewareClient | ||||
| @@ -27,19 +28,19 @@ metadata: | ||||
|     traefik.ingress.kubernetes.io/router.tls: "true" | ||||
| spec: | ||||
|   rules: | ||||
|   - host: longhorn.k-space.ee | ||||
|     http: | ||||
|       paths: | ||||
|       - pathType: Prefix | ||||
|         path: "/" | ||||
|         backend: | ||||
|           service: | ||||
|             name: longhorn-frontend | ||||
|             port: | ||||
|               number: 80 | ||||
|     - host: longhorn.k-space.ee | ||||
|       http: | ||||
|         paths: | ||||
|           - pathType: Prefix | ||||
|             path: "/" | ||||
|             backend: | ||||
|               service: | ||||
|                 name: longhorn-frontend | ||||
|                 port: | ||||
|                   number: 80 | ||||
|   tls: | ||||
|   - hosts: | ||||
|     - "*.k-space.ee" | ||||
|     - hosts: | ||||
|         - "*.k-space.ee" | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PodMonitor | ||||
| @@ -59,81 +60,81 @@ spec: | ||||
|   groups: | ||||
|     - name: longhorn | ||||
|       rules: | ||||
|       - alert: LonghornVolumeActualSpaceUsedWarning | ||||
|         annotations: | ||||
|           description: The accumulated snapshots for volume use up more space than the volume's capacity | ||||
|           summary: The actual used space of Longhorn volume is twice the size of the volume capacity. | ||||
|         expr: longhorn_volume_actual_size_bytes > longhorn_volume_capacity_bytes * 2 | ||||
|         for: 5m | ||||
|         labels: | ||||
|           issue: The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high. | ||||
|           severity: warning | ||||
|       - alert: LonghornVolumeStatusCritical | ||||
|         annotations: | ||||
|           description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for | ||||
|             more than 2 minutes. | ||||
|           summary: Longhorn volume {{$labels.volume}} is Fault | ||||
|         expr: longhorn_volume_robustness == 3 | ||||
|         for: 5m | ||||
|         labels: | ||||
|           issue: Longhorn volume {{$labels.volume}} is Fault. | ||||
|           severity: critical | ||||
|       - alert: LonghornVolumeStatusWarning | ||||
|         annotations: | ||||
|           description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded for | ||||
|             more than 5 minutes. | ||||
|           summary: Longhorn volume {{$labels.volume}} is Degraded | ||||
|         expr: longhorn_volume_robustness == 2 | ||||
|         for: 5m | ||||
|         labels: | ||||
|           issue: Longhorn volume {{$labels.volume}} is Degraded. | ||||
|           severity: warning | ||||
|       - alert: LonghornNodeStorageWarning | ||||
|         annotations: | ||||
|           description: The used storage of node {{$labels.node}} is at {{$value}}% capacity for | ||||
|             more than 5 minutes. | ||||
|           summary:  The used storage of node is over 70% of the capacity. | ||||
|         expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70 | ||||
|         for: 5m | ||||
|         labels: | ||||
|           issue: The used storage of node {{$labels.node}} is high. | ||||
|           severity: warning | ||||
|       - alert: LonghornDiskStorageWarning | ||||
|         annotations: | ||||
|           description: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is at {{$value}}% capacity for | ||||
|             more than 5 minutes. | ||||
|           summary:  The used storage of disk is over 70% of the capacity. | ||||
|         expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70 | ||||
|         for: 5m | ||||
|         labels: | ||||
|           issue: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high. | ||||
|           severity: warning | ||||
|       - alert: LonghornNodeDown | ||||
|         annotations: | ||||
|           description: There are {{$value}} Longhorn nodes which have been offline for more than 5 minutes. | ||||
|           summary: Longhorn nodes is offline | ||||
|         expr: (avg(longhorn_node_count_total) or on() vector(0)) - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0)) > 0 | ||||
|         for: 5m | ||||
|         labels: | ||||
|           issue: There are {{$value}} Longhorn nodes are offline | ||||
|           severity: critical | ||||
|       - alert: LonghornIntanceManagerCPUUsageWarning | ||||
|         annotations: | ||||
|           description: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for | ||||
|             more than 5 minutes. | ||||
|           summary: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is over 300%. | ||||
|         expr: (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) * 100 > 300 | ||||
|         for: 5m | ||||
|         labels: | ||||
|           issue: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} consumes 3 times the CPU request. | ||||
|           severity: warning | ||||
|       - alert: LonghornNodeCPUUsageWarning | ||||
|         annotations: | ||||
|           description: Longhorn node {{$labels.node}} has CPU Usage / CPU capacity is {{$value}}% for | ||||
|             more than 5 minutes. | ||||
|           summary: Longhorn node {{$labels.node}} experiences high CPU pressure for more than 5m. | ||||
|         expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90 | ||||
|         for: 5m | ||||
|         labels: | ||||
|           issue: Longhorn node {{$labels.node}} experiences high CPU pressure. | ||||
|           severity: warning | ||||
|         - alert: LonghornVolumeActualSpaceUsedWarning | ||||
|           annotations: | ||||
|             description: The accumulated snapshots for volume use up more space than the volume's capacity | ||||
|             summary: The actual used space of Longhorn volume is twice the size of the volume capacity. | ||||
|           expr: longhorn_volume_actual_size_bytes > longhorn_volume_capacity_bytes * 2 | ||||
|           for: 5m | ||||
|           labels: | ||||
|             issue: The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high. | ||||
|             severity: warning | ||||
|         - alert: LonghornVolumeStatusCritical | ||||
|           annotations: | ||||
|             description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for | ||||
|               more than 2 minutes. | ||||
|             summary: Longhorn volume {{$labels.volume}} is Fault | ||||
|           expr: longhorn_volume_robustness == 3 | ||||
|           for: 5m | ||||
|           labels: | ||||
|             issue: Longhorn volume {{$labels.volume}} is Fault. | ||||
|             severity: critical | ||||
|         - alert: LonghornVolumeStatusWarning | ||||
|           annotations: | ||||
|             description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded for | ||||
|               more than 5 minutes. | ||||
|             summary: Longhorn volume {{$labels.volume}} is Degraded | ||||
|           expr: longhorn_volume_robustness == 2 | ||||
|           for: 5m | ||||
|           labels: | ||||
|             issue: Longhorn volume {{$labels.volume}} is Degraded. | ||||
|             severity: warning | ||||
|         - alert: LonghornNodeStorageWarning | ||||
|           annotations: | ||||
|             description: The used storage of node {{$labels.node}} is at {{$value}}% capacity for | ||||
|               more than 5 minutes. | ||||
|             summary: The used storage of node is over 70% of the capacity. | ||||
|           expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70 | ||||
|           for: 5m | ||||
|           labels: | ||||
|             issue: The used storage of node {{$labels.node}} is high. | ||||
|             severity: warning | ||||
|         - alert: LonghornDiskStorageWarning | ||||
|           annotations: | ||||
|             description: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is at {{$value}}% capacity for | ||||
|               more than 5 minutes. | ||||
|             summary: The used storage of disk is over 70% of the capacity. | ||||
|           expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70 | ||||
|           for: 5m | ||||
|           labels: | ||||
|             issue: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high. | ||||
|             severity: warning | ||||
|         - alert: LonghornNodeDown | ||||
|           annotations: | ||||
|             description: There are {{$value}} Longhorn nodes which have been offline for more than 5 minutes. | ||||
|             summary: Longhorn nodes is offline | ||||
|           expr: (avg(longhorn_node_count_total) or on() vector(0)) - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0)) > 0 | ||||
|           for: 5m | ||||
|           labels: | ||||
|             issue: There are {{$value}} Longhorn nodes are offline | ||||
|             severity: critical | ||||
|         - alert: LonghornIntanceManagerCPUUsageWarning | ||||
|           annotations: | ||||
|             description: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for | ||||
|               more than 5 minutes. | ||||
|             summary: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is over 300%. | ||||
|           expr: (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) * 100 > 300 | ||||
|           for: 5m | ||||
|           labels: | ||||
|             issue: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} consumes 3 times the CPU request. | ||||
|             severity: warning | ||||
|         - alert: LonghornNodeCPUUsageWarning | ||||
|           annotations: | ||||
|             description: Longhorn node {{$labels.node}} has CPU Usage / CPU capacity is {{$value}}% for | ||||
|               more than 5 minutes. | ||||
|             summary: Longhorn node {{$labels.node}} experiences high CPU pressure for more than 5m. | ||||
|           expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90 | ||||
|           for: 5m | ||||
|           labels: | ||||
|             issue: Longhorn node {{$labels.node}} experiences high CPU pressure. | ||||
|             severity: warning | ||||
|   | ||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -24,7 +24,7 @@ value: 'miniobucket-backup-owner-secrets' | ||||
| apiVersion: longhorn.io/v1beta1 | ||||
| kind: RecurringJob | ||||
| metadata: | ||||
|   name: backup  | ||||
|   name: backup | ||||
|   namespace: longhorn-system | ||||
| spec: | ||||
|   cron: "0 2 * * *" | ||||
|   | ||||
| @@ -1,5 +1,5 @@ | ||||
| --- application.yml	2024-07-07 14:16:47.953593433 +0300 | ||||
| +++ application.modded	2024-07-07 14:18:51.103452617 +0300 | ||||
| --- longhorn.yaml	2024-07-07 14:16:47.953593433 +0300 | ||||
| +++ longhorn.modded	2024-07-07 14:18:51.103452617 +0300 | ||||
| @@ -86,14 +86,14 @@ | ||||
|          storageclass.kubernetes.io/is-default-class: "true" | ||||
|      provisioner: driver.longhorn.io | ||||
|   | ||||
		Reference in New Issue
	
	Block a user