---
# Probe: scrape node_exporter on Proxmox/NAS hosts via static targets.
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: nodes-proxmox
spec:
  targets:
    staticConfig:
      static:
        - nas.mgmt.k-space.ee:9100
        - pve1.proxmox.infra.k-space.ee:9100
        - pve8.proxmox.infra.k-space.ee:9100
        - pve9.proxmox.infra.k-space.ee:9100
      # Copy the probed target into both the instance label and the
      # scrape address, so metrics are attributed to the actual host.
      relabelingConfigs:
        - sourceLabels: [__param_target]
          targetLabel: instance
        - sourceLabels: [__param_target]
          targetLabel: __address__
  prober:
    # NOTE(review): "localhost" as prober URL looks like a placeholder —
    # confirm this matches the intended proxy/prober deployment.
    url: localhost
    path: /metrics
  metricRelabelings:
    - sourceLabels: [__address__]
      targetLabel: target
---
# Probe: scrape node_exporter on miscellaneous infra hosts.
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: nodes-misc
spec:
  targets:
    staticConfig:
      static:
        - sprucecone.infra.k-space.ee:9100
        - cedarcone.infra.k-space.ee:9100
      # Copy the probed target into both the instance label and the
      # scrape address, so metrics are attributed to the actual host.
      relabelingConfigs:
        - sourceLabels: [__param_target]
          targetLabel: instance
        - sourceLabels: [__param_target]
          targetLabel: __address__
  prober:
    # NOTE(review): "localhost" as prober URL looks like a placeholder —
    # confirm this matches the intended proxy/prober deployment.
    url: localhost
    path: /metrics
  metricRelabelings:
    - sourceLabels: [__address__]
      targetLabel: target
---
# Alerting rules for hosts scraped with node_exporter.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: node-exporter
spec:
  groups:
    - name: node-exporter
      rules:
        - alert: ZfsOfflinePool
          expr: node_zfs_zpool_state{state!="online"} > 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: ZFS offline pool (instance {{ $labels.instance }})
            description: "A ZFS zpool is in a unexpected state: {{ $labels.state }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
        - alert: HostHighLoad
          expr: sum(node_load1{}) by (instance) / count(node_cpu_seconds_total{mode="user"}) by (instance) > 2.5
          for: 15m
          labels:
            severity: warning
          annotations:
            summary: Host under high load
            description: Many processes are queued up for execution
        - alert: HostOutOfMemory
          expr: (node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_MemFree_bytes ) / node_memory_MemTotal_bytes * 100 < 20
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Host out of memory (instance {{ $labels.instance }})
            # Threshold in the expression is 20%; description kept in sync.
            description: Node memory is filling up (< 20% left)
        - alert: HostMemoryUnderMemoryPressure
          expr: rate(node_vmstat_pgmajfault[1m]) > 1000
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Host memory under memory pressure (instance {{ $labels.instance }})
            description: The node is under heavy memory pressure. High rate of major page faults
        - alert: HostUnusualNetworkThroughputIn
          expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) > 160e+06
          for: 1h
          labels:
            severity: warning
          annotations:
            summary: Host unusual network throughput in (instance {{ $labels.instance }})
            description: Host network interfaces are probably receiving too much data (> 160 MB/s)
        - alert: HostUnusualNetworkThroughputOut
          expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) > 160e+06
          for: 1h
          labels:
            severity: warning
          annotations:
            summary: Host unusual network throughput out (instance {{ $labels.instance }})
            description: Host network interfaces are probably sending too much data (> 160 MB/s)
        - alert: HostUnusualDiskReadRate
          expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) > 50000000
          for: 1h
          labels:
            severity: warning
          annotations:
            summary: Host unusual disk read rate (instance {{ $labels.instance }})
            description: Disk is probably reading too much data (> 50 MB/s)
        - alert: HostUnusualDiskWriteRate
          expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) > 50000000
          for: 1h
          labels:
            severity: warning
          annotations:
            summary: Host unusual disk write rate (instance {{ $labels.instance }})
            description: Disk is probably writing too much data (> 50 MB/s)
        # Please add ignored mountpoints in node_exporter parameters like
        # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
        # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
        - alert: HostOutOfDiskSpace
          expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Host out of disk space (instance {{ $labels.instance }})
            description: Disk is almost full (< 10% left)
        # Please add ignored mountpoints in node_exporter parameters like
        # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
        # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
        - alert: HostDiskWillFillIn24Hours
          expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
            description: Filesystem is predicted to run out of space within the next 24 hours at current write rate
        - alert: HostOutOfInodes
          expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Host out of inodes (instance {{ $labels.instance }})
            description: Disk is almost running out of available inodes (< 10% left)
        - alert: HostInodesWillFillIn24Hours
          expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
            description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate
        - alert: HostUnusualDiskReadLatency
          expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Host unusual disk read latency (instance {{ $labels.instance }})
            description: Disk latency is growing (read operations > 100ms)
        - alert: HostUnusualDiskWriteLatency
          expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Host unusual disk write latency (instance {{ $labels.instance }})
            description: Disk latency is growing (write operations > 100ms)
        - alert: HostCpuStealNoisyNeighbor
          expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
            description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.
        # 1000 context switches is an arbitrary number.
        # Alert threshold depends on nature of application.
        # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
        - alert: HostContextSwitching
          expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 50000
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Host context switching (instance {{ $labels.instance }})
            description: Context switching is growing on node (> 50000 / s)
        - alert: HostSwapIsEnabled
          expr: node_memory_SwapTotal_bytes > 0
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Swap is discouraged nowadays
        - alert: HostPhysicalComponentTooHot
          expr: node_hwmon_temp_celsius > 75
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: Host physical component too hot (instance {{ $labels.instance }})
            description: Physical hardware component too hot
        - alert: HostNodeOvertemperatureAlarm
          expr: node_hwmon_temp_alarm == 1
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: Host node overtemperature alarm (instance {{ $labels.instance }})
            description: Physical node temperature alarm triggered
        - alert: HostRaidArrayGotInactive
          expr: node_md_state{state="inactive"} > 0
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: Host RAID array got inactive (instance {{ $labels.instance }})
            description: RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
        - alert: HostRaidDiskFailure
          expr: node_md_disks{state="failed"} > 0
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Host RAID disk failure (instance {{ $labels.instance }})
            description: At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap
        - alert: HostOomKillDetected
          expr: increase(node_vmstat_oom_kill[1m]) > 0
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Host OOM kill detected (instance {{ $labels.instance }})
            description: OOM kill detected
        - alert: HostEdacCorrectableErrorsDetected
          expr: increase(node_edac_correctable_errors_total[1m]) > 0
          for: 0m
          labels:
            severity: info
          annotations:
            summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
            description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
        - alert: HostEdacUncorrectableErrorsDetected
          expr: node_edac_uncorrectable_errors_total > 0
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
            description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
        - alert: HostNetworkReceiveErrors
          expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Host Network Receive Errors (instance {{ $labels.instance }})
            description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
        - alert: HostNetworkTransmitErrors
          expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Host Network Transmit Errors (instance {{ $labels.instance }})
            description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
        - alert: HostNetworkInterfaceSaturated
          expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
          for: 1m
          labels:
            severity: warning
          annotations:
            summary: Host Network Interface Saturated (instance {{ $labels.instance }})
            description: "The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded."
        - alert: HostNetworkBondDegraded
          expr: node_bonding_active != node_bonding_slaves {master=~"bond.*"}
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Host Network Bond Degraded
        - alert: HostConntrackLimit
          expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: Host conntrack limit (instance {{ $labels.instance }})
            description: The number of conntrack is approching limit
        - alert: HostClockSkew
          expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Host clock skew (instance {{ $labels.instance }})
            description: Clock skew detected. Clock is out of sync.
        - alert: HostClockNotSynchronising
          expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Host clock not synchronising (instance {{ $labels.instance }})
            description: Clock not synchronising.
---
# Alerting rules based on smartmontools (SMART) metrics.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: smart
spec:
  groups:
    - name: smart
      rules:
        - alert: SmartSSDWriteRateTooHigh
          # LBAs are 512 bytes; averaged over 72h to smooth bursts.
          expr: rate(smartmon_total_lbas_written_raw_value[72h]) * 512 > 10000000
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: SSD write rate exceeds 10MB/s
            description: At this rate the SSD will be worn out before warranty period expires
---
# Temperature alerting rules (SMART disk sensors and hwmon chipset sensors).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: temperatures
spec:
  groups:
    - name: temperatures
      rules:
        - alert: HighDiskTemperature
          expr: smartmon_airflow_temperature_cel_raw_value > 45 or smartmon_temperature_celsius_raw_value > 45
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: High HDD/SSD temperature indicates high ambient temperature
        - alert: HighChipsetTemperature
          expr: node_hwmon_temp_celsius > 65
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: High chipset (CPU, NB) temperature indicates insufficient or failing fans
        - alert: LowDiskTemperature
          expr: smartmon_airflow_temperature_cel_raw_value < 10 or smartmon_temperature_celsius_raw_value < 10
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: Low HDD/SSD temperature indicates low ambient temperature and stuck server room exhaust fan relay
---
# PodMonitor: scrape the node-exporter DaemonSet pods on their "web" port.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: node-exporter
spec:
  selector:
    matchLabels:
      app: node-exporter
  podMetricsEndpoints:
    - port: web
      scrapeTimeout: 30s
---
# Dedicated service account for the node-exporter DaemonSet.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: node-exporter
---
# DaemonSet: run node-exporter on every node (tolerates all taints),
# with host network/PID and read-only host mounts for /sys and /.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  labels:
    app: node-exporter
  name: node-exporter
  annotations:
    keel.sh/policy: force
    keel.sh/trigger: poll
    keel.sh/pollSchedule: "@midnight"
spec:
  selector:
    matchLabels:
      app: node-exporter
  template:
    metadata:
      labels:
        app: node-exporter
    spec:
      containers:
        - name: node-exporter
          args:
            - --web.listen-address=0.0.0.0:9101
            - --path.sysfs=/host/sys
            - --path.rootfs=/host/root
            # Fixed: was "--no -collector.wifi" (stray space broke the flag).
            - --no-collector.wifi
            - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)
            - --collector.netclass.ignored-devices=^(veth.*|[a-f0-9]{15})$
            - --collector.netdev.device-exclude=^(veth.*|[a-f0-9]{15})$
          image: prom/node-exporter:v1.3.1
          resources:
            limits:
              cpu: 50m
              memory: 180Mi
            requests:
              cpu: 5m
              memory: 20Mi
          volumeMounts:
            - mountPath: /host/sys
              mountPropagation: HostToContainer
              name: sys
              readOnly: true
            - mountPath: /host/root
              mountPropagation: HostToContainer
              name: root
              readOnly: true
          ports:
            - containerPort: 9101
              name: web
          securityContext:
            runAsGroup: 65532
            runAsNonRoot: true
            runAsUser: 65532
            readOnlyRootFilesystem: true
      hostNetwork: true
      hostPID: true
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
      serviceAccountName: node-exporter
      tolerations:
        # Schedule onto every node regardless of taints (e.g. control plane).
        - operator: Exists
      volumes:
        - hostPath:
            path: /sys
          name: sys
        - hostPath:
            path: /
          name: root