# Forked from k-space/kube
---
# Ingress that publishes the longhorn-frontend Service (port 80) at
# longhorn.k-space.ee, with TLS material read from the longhorn-tls Secret.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: longhorn-dashboard
  namespace: longhorn-system
  annotations:
    # Ingress class is selected through the annotation rather than
    # spec.ingressClassName.
    kubernetes.io/ingress.class: traefik
    cert-manager.io/cluster-issuer: default
    external-dns.alpha.kubernetes.io/target: traefik.k-space.ee
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
    # NOTE(review): presumably the `sso` Middleware in the `traefik`
    # namespace (Traefik's <namespace>-<name>@kubernetescrd form) - confirm.
    traefik.ingress.kubernetes.io/router.middlewares: traefik-sso@kubernetescrd
    # Quoted on purpose: annotation values must be strings, not booleans.
    traefik.ingress.kubernetes.io/router.tls: "true"
spec:
  rules:
    - host: longhorn.k-space.ee
      http:
        paths:
          # Prefix match on "/" sends every request path to the backend.
          - path: "/"
            pathType: Prefix
            backend:
              service:
                name: longhorn-frontend
                port:
                  number: 80
  tls:
    - hosts:
        - longhorn.k-space.ee
      secretName: longhorn-tls
---
# PodMonitor that scrapes the container port named `manager`.
# NOTE(review): no namespace is set here, so it is supplied at apply time -
# presumably longhorn-system; confirm against the deployment tooling.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: manager
spec:
  # Empty selector: no label filtering is applied to candidate pods.
  selector: {}
  podMetricsEndpoints:
    - port: manager
---
# Alerting rules for Longhorn volumes, nodes, disks and instance managers.
# Every rule must hold for 5 minutes (`for: 5m`) before it fires.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: longhorn
spec:
  # Copied from https://longhorn.io/docs/1.2.4/monitoring/alert-rules-example/
  # Local deviations from that example are called out inline.
  groups:
    - name: longhorn
      rules:
        # Volume data plus snapshots occupy more than 2x the volume capacity.
        - alert: LonghornVolumeActualSpaceUsedWarning
          annotations:
            description: The accumulated snapshots for volume use up more space than the volume's capacity
            summary: The actual used space of Longhorn volume is twice the size of the volume capacity.
          expr: longhorn_volume_actual_size_bytes > longhorn_volume_capacity_bytes * 2
          for: 5m
          labels:
            issue: The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high.
            severity: warning

        # Fires when longhorn_volume_robustness equals 3.
        - alert: LonghornVolumeStatusCritical
          annotations:
            # Deviation: the example text said "2 minutes", which
            # contradicted the 5m hold time below.
            description: >-
              Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for
              more than 5 minutes.
            summary: Longhorn volume {{$labels.volume}} is Fault
          expr: longhorn_volume_robustness == 3
          for: 5m
          labels:
            issue: Longhorn volume {{$labels.volume}} is Fault.
            severity: critical

        # Fires when longhorn_volume_robustness equals 2.
        - alert: LonghornVolumeStatusWarning
          annotations:
            description: >-
              Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded for
              more than 5 minutes.
            summary: Longhorn volume {{$labels.volume}} is Degraded
          expr: longhorn_volume_robustness == 2
          for: 5m
          labels:
            issue: Longhorn volume {{$labels.volume}} is Degraded.
            severity: warning

        # Node-level storage usage above 70% of capacity.
        - alert: LonghornNodeStorageWarning
          annotations:
            description: >-
              The used storage of node {{$labels.node}} is at {{$value}}% capacity for
              more than 5 minutes.
            summary: The used storage of node is over 70% of the capacity.
          expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70
          for: 5m
          labels:
            issue: The used storage of node {{$labels.node}} is high.
            severity: warning

        # Per-disk storage usage above 70% of capacity.
        - alert: LonghornDiskStorageWarning
          annotations:
            description: >-
              The used storage of disk {{$labels.disk}} on node {{$labels.node}} is at {{$value}}% capacity for
              more than 5 minutes.
            summary: The used storage of disk is over 70% of the capacity.
          expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70
          for: 5m
          labels:
            issue: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high.
            severity: warning

        # Total node count minus the count of nodes reporting ready == 1.
        # `or on() vector(0)` substitutes 0 when either side has no samples.
        - alert: LonghornNodeDown
          annotations:
            description: There are {{$value}} Longhorn nodes which have been offline for more than 5 minutes.
            summary: Longhorn nodes is offline
          expr: >-
            (avg(longhorn_node_count_total) or on() vector(0))
            - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0))
            > 0
          for: 5m
          labels:
            # Deviation: the example templated {{$value}} into this label.
            # Labels are part of an alert's identity, so a changing count
            # would create a new alert and restart the `for` timer. The count
            # remains available in the description annotation.
            issue: Longhorn nodes are offline.
            severity: critical

        # Instance manager CPU usage above 300% of its CPU request.
        # The alert name's spelling ("Intance") is kept as-is so existing
        # silences and routes that match on it keep working.
        - alert: LonghornIntanceManagerCPUUsageWarning
          annotations:
            description: >-
              Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for
              more than 5 minutes.
            summary: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is over 300%.
          expr: (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) * 100 > 300
          for: 5m
          labels:
            issue: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} consumes 3 times the CPU request.
            severity: warning

        # Node CPU usage above 90% of node CPU capacity.
        - alert: LonghornNodeCPUUsageWarning
          annotations:
            description: >-
              Longhorn node {{$labels.node}} has CPU Usage / CPU capacity is {{$value}}% for
              more than 5 minutes.
            summary: Longhorn node {{$labels.node}} experiences high CPU pressure for more than 5m.
          expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90
          for: 5m
          labels:
            issue: Longhorn node {{$labels.node}} experiences high CPU pressure.
            severity: warning