---
# OIDC gateway middleware client `ui` for the Longhorn dashboard.
# Only members of the listed group are allowed; the headerMapping block names
# the Remote-* headers used for each identity attribute.
# NOTE(review): no namespace is set here, while the Ingress below references
# the middleware as `longhorn-system-ui` - presumably this manifest is applied
# with the namespace set to longhorn-system; confirm.
apiVersion: codemowers.io/v1alpha1
kind: OIDCGWMiddlewareClient
metadata:
  name: ui
spec:
  displayName: Longhorn
  uri: 'https://longhorn.k-space.ee'
  allowedGroups:
    - k-space:kubernetes:admins
  headerMapping:
    email: Remote-Email
    groups: Remote-Groups
    name: Remote-Name
    user: Remote-Username
---
# Exposes the longhorn-frontend service (port 80) at longhorn.k-space.ee
# through Traefik's `websecure` entrypoint with TLS enabled on the router.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: longhorn-dashboard
  namespace: longhorn-system
  annotations:
    kubernetes.io/ingress.class: traefik
    external-dns.alpha.kubernetes.io/target: traefik.k-space.ee
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
    # Traefik CRD middleware reference, written as <namespace>-<name>@kubernetescrd.
    traefik.ingress.kubernetes.io/router.middlewares: longhorn-system-ui@kubernetescrd
    # Annotation values must be strings, so the boolean stays quoted.
    traefik.ingress.kubernetes.io/router.tls: "true"
spec:
  rules:
    - host: longhorn.k-space.ee
      http:
        paths:
          - pathType: Prefix
            path: "/"
            backend:
              service:
                name: longhorn-frontend
                port:
                  number: 80
  tls:
    - hosts:
        # Quoted because a plain scalar starting with `*` would parse as an alias.
        - "*.k-space.ee"
---
# Scrapes the container port named `manager`; the empty selector places no
# label restriction on which pods are selected.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: manager
spec:
  selector: {}
  podMetricsEndpoints:
    - port: manager
---
# Alerting rules for Longhorn volumes, nodes, disks and instance managers.
# Every rule must hold for 5 minutes (`for: 5m`) before it fires.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: longhorn
spec:
  # Copied from https://longhorn.io/docs/1.2.4/monitoring/alert-rules-example/
  groups:
    - name: longhorn
      rules:
        # Volume's actual size is more than twice its capacity.
        - alert: LonghornVolumeActualSpaceUsedWarning
          annotations:
            description: >-
              The accumulated snapshots for volume use up more space than the
              volume's capacity
            summary: >-
              The actual used space of Longhorn volume is twice the size of the
              volume capacity.
          expr: longhorn_volume_actual_size_bytes > longhorn_volume_capacity_bytes * 2
          for: 5m
          labels:
            issue: >-
              The actual used space of Longhorn volume {{$labels.volume}} on
              {{$labels.node}} is high.
            severity: warning
        # Robustness value 3 is reported as "Fault" by this rule.
        # The description states 5 minutes so that it agrees with `for: 5m`
        # (the copied example text said 2 minutes).
        - alert: LonghornVolumeStatusCritical
          annotations:
            description: >-
              Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for
              more than 5 minutes.
            summary: Longhorn volume {{$labels.volume}} is Fault
          expr: longhorn_volume_robustness == 3
          for: 5m
          labels:
            issue: Longhorn volume {{$labels.volume}} is Fault.
            severity: critical
        # Robustness value 2 is reported as "Degraded" by this rule.
        - alert: LonghornVolumeStatusWarning
          annotations:
            description: >-
              Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded
              for more than 5 minutes.
            summary: Longhorn volume {{$labels.volume}} is Degraded
          expr: longhorn_volume_robustness == 2
          for: 5m
          labels:
            issue: Longhorn volume {{$labels.volume}} is Degraded.
            severity: warning
        # Node-level storage usage above 70% of capacity.
        - alert: LonghornNodeStorageWarning
          annotations:
            description: >-
              The used storage of node {{$labels.node}} is at {{$value}}% capacity
              for more than 5 minutes.
            summary: The used storage of node is over 70% of the capacity.
          expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70
          for: 5m
          labels:
            issue: The used storage of node {{$labels.node}} is high.
            severity: warning
        # Per-disk storage usage above 70% of capacity.
        - alert: LonghornDiskStorageWarning
          annotations:
            description: >-
              The used storage of disk {{$labels.disk}} on node {{$labels.node}}
              is at {{$value}}% capacity for more than 5 minutes.
            summary: The used storage of disk is over 70% of the capacity.
          expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70
          for: 5m
          labels:
            issue: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high.
            severity: warning
        # Total node count minus the count of nodes whose "ready" condition is 1.
        # `or on() vector(0)` substitutes 0 when either side returns no series,
        # so the subtraction still yields a value.
        - alert: LonghornNodeDown
          annotations:
            description: >-
              There are {{$value}} Longhorn nodes which have been offline for more
              than 5 minutes.
            summary: Longhorn nodes is offline
          expr: >-
            (avg(longhorn_node_count_total) or on() vector(0))
            - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0))
            > 0
          for: 5m
          labels:
            issue: There are {{$value}} Longhorn nodes are offline
            severity: critical
        # Instance manager CPU usage above 300% of its CPU request.
        # NOTE(review): "Intance" is misspelled in the alert name; it is kept
        # unchanged because routing or silences may match on the exact name.
        - alert: LonghornIntanceManagerCPUUsageWarning
          annotations:
            description: >-
              Longhorn instance manager {{$labels.instance_manager}} on
              {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for more
              than 5 minutes.
            summary: >-
              Longhorn instance manager {{$labels.instance_manager}} on
              {{$labels.node}} has CPU Usage / CPU request is over 300%.
          expr: >-
            (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu)
            * 100 > 300
          for: 5m
          labels:
            issue: >-
              Longhorn instance manager {{$labels.instance_manager}} on
              {{$labels.node}} consumes 3 times the CPU request.
            severity: warning
        # Node CPU usage above 90% of the node's CPU capacity.
        - alert: LonghornNodeCPUUsageWarning
          annotations:
            description: >-
              Longhorn node {{$labels.node}} has CPU Usage / CPU capacity is
              {{$value}}% for more than 5 minutes.
            summary: >-
              Longhorn node {{$labels.node}} experiences high CPU pressure for
              more than 5m.
          expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90
          for: 5m
          labels:
            issue: Longhorn node {{$labels.node}} experiences high CPU pressure.
            severity: warning