forked from k-space/kube
		
	Migrate to Prometheus Operator
This commit is contained in:
		
							
								
								
									
										126
									
								
								longhorn-system/application-extras.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										126
									
								
								longhorn-system/application-extras.yml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,126 @@ | ||||
| # Ingress that exposes the longhorn-frontend Service at longhorn.k-space.ee. | ||||
| apiVersion: networking.k8s.io/v1 | ||||
| kind: Ingress | ||||
| metadata: | ||||
|   name: longhorn-dashboard | ||||
|   namespace: longhorn-system | ||||
|   annotations: | ||||
|     # Traefik routing: ingress class, entrypoint, TLS flag and the SSO middleware. | ||||
|     kubernetes.io/ingress.class: traefik | ||||
|     traefik.ingress.kubernetes.io/router.entrypoints: websecure | ||||
|     traefik.ingress.kubernetes.io/router.tls: 'true' | ||||
|     traefik.ingress.kubernetes.io/router.middlewares: traefik-sso@kubernetescrd | ||||
|     # cert-manager issuer and external-dns target for this host. | ||||
|     cert-manager.io/cluster-issuer: default | ||||
|     external-dns.alpha.kubernetes.io/target: traefik.k-space.ee | ||||
| spec: | ||||
|   tls: | ||||
|     - secretName: longhorn-tls | ||||
|       hosts: | ||||
|         - longhorn.k-space.ee | ||||
|   rules: | ||||
|     - host: longhorn.k-space.ee | ||||
|       http: | ||||
|         paths: | ||||
|           # Everything under "/" goes to the Longhorn frontend on port 80. | ||||
|           - path: / | ||||
|             pathType: Prefix | ||||
|             backend: | ||||
|               service: | ||||
|                 name: longhorn-frontend | ||||
|                 port: | ||||
|                   number: 80 | ||||
|  | ||||
| --- | ||||
| # PodMonitor that scrapes the container port named "manager". | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PodMonitor | ||||
| metadata: | ||||
|   name: manager | ||||
|   # Pinned explicitly: without a namespaceSelector a PodMonitor only discovers | ||||
|   # pods in its own namespace, so it must not depend on the kubectl context. | ||||
|   namespace: longhorn-system | ||||
| spec: | ||||
|   # Empty selector: every pod in the namespace is a candidate; the endpoint | ||||
|   # below restricts scraping to pods that expose a port named "manager". | ||||
|   selector: {} | ||||
|   podMetricsEndpoints: | ||||
|     - port: manager | ||||
| --- | ||||
| # Alerting rules for Longhorn volumes, nodes, disks and instance managers. | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PrometheusRule | ||||
| metadata: | ||||
|   name: longhorn | ||||
|   # Pinned explicitly so the rule does not depend on the kubectl context. | ||||
|   namespace: longhorn-system | ||||
| spec: | ||||
|   # Copied from https://longhorn.io/docs/1.2.4/monitoring/alert-rules-example/ | ||||
|   # Local deviations from that example are called out next to the affected rule. | ||||
|   groups: | ||||
|     - name: longhorn | ||||
|       rules: | ||||
|         # Volume uses more than twice its nominal capacity. | ||||
|         - alert: LonghornVolumeActualSpaceUsedWarning | ||||
|           expr: longhorn_volume_actual_size_bytes > longhorn_volume_capacity_bytes * 2 | ||||
|           for: 5m | ||||
|           labels: | ||||
|             severity: warning | ||||
|             issue: >- | ||||
|               The actual used space of Longhorn volume {{$labels.volume}} on | ||||
|               {{$labels.node}} is high. | ||||
|           annotations: | ||||
|             summary: >- | ||||
|               The actual used space of Longhorn volume is twice the size of the | ||||
|               volume capacity. | ||||
|             description: >- | ||||
|               The accumulated snapshots for volume use up more space than the | ||||
|               volume's capacity | ||||
|  | ||||
|         # longhorn_volume_robustness == 3 is reported as "Fault" by this rule. | ||||
|         - alert: LonghornVolumeStatusCritical | ||||
|           expr: longhorn_volume_robustness == 3 | ||||
|           for: 5m | ||||
|           labels: | ||||
|             severity: critical | ||||
|             issue: Longhorn volume {{$labels.volume}} is Fault. | ||||
|           annotations: | ||||
|             summary: Longhorn volume {{$labels.volume}} is Fault | ||||
|             # Text corrected from "2 minutes" to agree with `for: 5m` above. | ||||
|             description: >- | ||||
|               Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for | ||||
|               more than 5 minutes. | ||||
|  | ||||
|         # longhorn_volume_robustness == 2 is reported as "Degraded" by this rule. | ||||
|         - alert: LonghornVolumeStatusWarning | ||||
|           expr: longhorn_volume_robustness == 2 | ||||
|           for: 5m | ||||
|           labels: | ||||
|             severity: warning | ||||
|             issue: Longhorn volume {{$labels.volume}} is Degraded. | ||||
|           annotations: | ||||
|             summary: Longhorn volume {{$labels.volume}} is Degraded | ||||
|             description: >- | ||||
|               Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded for | ||||
|               more than 5 minutes. | ||||
|  | ||||
|         # Node-level storage usage above 70 % of capacity. | ||||
|         - alert: LonghornNodeStorageWarning | ||||
|           expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70 | ||||
|           for: 5m | ||||
|           labels: | ||||
|             severity: warning | ||||
|             issue: The used storage of node {{$labels.node}} is high. | ||||
|           annotations: | ||||
|             summary: The used storage of node is over 70% of the capacity. | ||||
|             description: >- | ||||
|               The used storage of node {{$labels.node}} is at {{$value}}% capacity for | ||||
|               more than 5 minutes. | ||||
|  | ||||
|         # Disk-level storage usage above 70 % of capacity. | ||||
|         - alert: LonghornDiskStorageWarning | ||||
|           expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70 | ||||
|           for: 5m | ||||
|           labels: | ||||
|             severity: warning | ||||
|             issue: >- | ||||
|               The used storage of disk {{$labels.disk}} on node {{$labels.node}} | ||||
|               is high. | ||||
|           annotations: | ||||
|             summary: The used storage of disk is over 70% of the capacity. | ||||
|             description: >- | ||||
|               The used storage of disk {{$labels.disk}} on node {{$labels.node}} is at {{$value}}% capacity for | ||||
|               more than 5 minutes. | ||||
|  | ||||
|         # Fires when the total node count exceeds the number of nodes whose | ||||
|         # "ready" condition is 1; `or on() vector(0)` substitutes 0 when a | ||||
|         # side of the subtraction returns no series. | ||||
|         - alert: LonghornNodeDown | ||||
|           expr: >- | ||||
|             (avg(longhorn_node_count_total) or on() vector(0)) | ||||
|             - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0)) | ||||
|             > 0 | ||||
|           for: 5m | ||||
|           labels: | ||||
|             severity: critical | ||||
|             # {{$value}} deliberately removed from this label: a label that | ||||
|             # changes with the value changes the alert's identity and restarts | ||||
|             # the `for` timer. The count is still shown in the description. | ||||
|             issue: Longhorn nodes are offline. | ||||
|           annotations: | ||||
|             summary: Longhorn nodes are offline | ||||
|             description: >- | ||||
|               There are {{$value}} Longhorn nodes which have been offline for more | ||||
|               than 5 minutes. | ||||
|  | ||||
|         # NOTE: the "Intance" spelling is kept as-is; the alert name is an | ||||
|         # identifier that routes, silences and dashboards may match on. | ||||
|         - alert: LonghornIntanceManagerCPUUsageWarning | ||||
|           expr: >- | ||||
|             (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) | ||||
|             * 100 > 300 | ||||
|           for: 5m | ||||
|           labels: | ||||
|             severity: warning | ||||
|             issue: >- | ||||
|               Longhorn instance manager {{$labels.instance_manager}} on | ||||
|               {{$labels.node}} consumes 3 times the CPU request. | ||||
|           annotations: | ||||
|             summary: >- | ||||
|               Longhorn instance manager {{$labels.instance_manager}} on | ||||
|               {{$labels.node}} has CPU Usage / CPU request is over 300%. | ||||
|             description: >- | ||||
|               Longhorn instance manager {{$labels.instance_manager}} on | ||||
|               {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for | ||||
|               more than 5 minutes. | ||||
|  | ||||
|         # Node CPU usage above 90 % of the node's CPU capacity. | ||||
|         - alert: LonghornNodeCPUUsageWarning | ||||
|           expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90 | ||||
|           for: 5m | ||||
|           labels: | ||||
|             severity: warning | ||||
|             issue: Longhorn node {{$labels.node}} experiences high CPU pressure. | ||||
|           annotations: | ||||
|             summary: >- | ||||
|               Longhorn node {{$labels.node}} experiences high CPU pressure for more | ||||
|               than 5m. | ||||
|             description: >- | ||||
|               Longhorn node {{$labels.node}} has CPU Usage / CPU capacity is {{$value}}% for | ||||
|               more than 5 minutes. | ||||
		Reference in New Issue
	
	Block a user