From 7ae75f2f1ca121cdef0b461c42fd04a52a04aa18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lauri=20V=C3=B5sandi?= Date: Mon, 12 Sep 2022 07:44:06 +0300 Subject: [PATCH] prometheus-operator: Fix node exporter formatting --- prometheus-operator/node-exporter.yml | 683 +++++++++++++------------- 1 file changed, 342 insertions(+), 341 deletions(-) diff --git a/prometheus-operator/node-exporter.yml b/prometheus-operator/node-exporter.yml index 5a2f28d..3e3a131 100644 --- a/prometheus-operator/node-exporter.yml +++ b/prometheus-operator/node-exporter.yml @@ -1,3 +1,4 @@ +--- apiVersion: monitoring.coreos.com/v1 kind: Probe metadata: @@ -11,16 +12,16 @@ spec: - pve8.proxmox.infra.k-space.ee:9100 - pve9.proxmox.infra.k-space.ee:9100 relabelingConfigs: - - sourceLabels: [__param_target] - targetLabel: instance - - sourceLabels: [__param_target] - targetLabel: __address__ + - sourceLabels: [__param_target] + targetLabel: instance + - sourceLabels: [__param_target] + targetLabel: __address__ prober: url: localhost path: /metrics metricRelabelings: - - sourceLabels: [__address__] - targetLabel: target + - sourceLabels: [__address__] + targetLabel: target --- apiVersion: monitoring.coreos.com/v1 kind: Probe @@ -30,19 +31,19 @@ spec: targets: staticConfig: static: - - sprucecone.infra.k-space.ee:9100 - - cedarcone.infra.k-space.ee:9100 + - sprucecone.infra.k-space.ee:9100 + - cedarcone.infra.k-space.ee:9100 relabelingConfigs: - - sourceLabels: [__param_target] - targetLabel: instance - - sourceLabels: [__param_target] - targetLabel: __address__ + - sourceLabels: [__param_target] + targetLabel: instance + - sourceLabels: [__param_target] + targetLabel: __address__ prober: url: localhost path: /metrics metricRelabelings: - - sourceLabels: [__address__] - targetLabel: target + - sourceLabels: [__address__] + targetLabel: target --- apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule @@ -50,263 +51,263 @@ metadata: name: node-exporter spec: groups: - - name: node-exporter - rules: 
- - alert: ZfsOfflinePool - expr: node_zfs_zpool_state{state!="online"} > 0 - for: 1m - labels: - severity: critical - annotations: - summary: ZFS offline pool (instance {{ $labels.instance }}) - description: "A ZFS zpool is in a unexpected state: {{ $labels.state }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HostHighLoad - expr: sum(node_load1{}) by (instance) / count(node_cpu_seconds_total{mode="user"}) by (instance) > 2.5 - for: 15m - labels: - severity: warning - annotations: - summary: Host under high load - description: Many processes are queued up for execution - - alert: HostOutOfMemory - expr: (node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_MemFree_bytes ) / node_memory_MemTotal_bytes * 100 < 20 - for: 2m - labels: - severity: warning - annotations: - summary: Host out of memory (instance {{ $labels.instance }}) - description: Node memory is filling up (< 10% left) - - alert: HostMemoryUnderMemoryPressure - expr: rate(node_vmstat_pgmajfault[1m]) > 1000 - for: 2m - labels: - severity: warning - annotations: - summary: Host memory under memory pressure (instance {{ $labels.instance }}) - description: The node is under heavy memory pressure. 
High rate of major page faults - - alert: HostUnusualNetworkThroughputIn - expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) > 160e+06 - for: 1h - labels: - severity: warning - annotations: - summary: Host unusual network throughput in (instance {{ $labels.instance }}) - description: Host network interfaces are probably receiving too much data (> 160 MB/s) - - alert: HostUnusualNetworkThroughputOut - expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) > 160e+06 - for: 1h - labels: - severity: warning - annotations: - summary: Host unusual network throughput out (instance {{ $labels.instance }}) - description: Host network interfaces are probably sending too much data (> 160 MB/s) - - alert: HostUnusualDiskReadRate - expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) > 50000000 - for: 1h - labels: - severity: warning - annotations: - summary: Host unusual disk read rate (instance {{ $labels.instance }}) - description: Disk is probably reading too much data (> 50 MB/s) - - alert: HostUnusualDiskWriteRate - expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) > 50000000 - for: 1h - labels: - severity: warning - annotations: - summary: Host unusual disk write rate (instance {{ $labels.instance }}) - description: Disk is probably writing too much data (> 50 MB/s) - # Please add ignored mountpoints in node_exporter parameters like - # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". - # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. 
- - alert: HostOutOfDiskSpace - expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 - for: 2m - labels: - severity: warning - annotations: - summary: Host out of disk space (instance {{ $labels.instance }}) - description: Disk is almost full (< 10% left) - # Please add ignored mountpoints in node_exporter parameters like - # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". - # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. - - alert: HostDiskWillFillIn24Hours - expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 - for: 2m - labels: - severity: warning - annotations: - summary: Host disk will fill in 24 hours (instance {{ $labels.instance }}) - description: Filesystem is predicted to run out of space within the next 24 hours at current write rate - - alert: HostOutOfInodes - expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0 - for: 2m - labels: - severity: warning - annotations: - summary: Host out of inodes (instance {{ $labels.instance }}) - description: Disk is almost running out of available inodes (< 10% left) - - alert: HostInodesWillFillIn24Hours - expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0 - for: 2m - labels: - severity: warning - annotations: - summary: Host inodes will fill in 24 hours (instance {{ 
$labels.instance }}) - description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate - - alert: HostUnusualDiskReadLatency - expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0 - for: 2m - labels: - severity: warning - annotations: - summary: Host unusual disk read latency (instance {{ $labels.instance }}) - description: Disk latency is growing (read operations > 100ms) - - alert: HostUnusualDiskWriteLatency - expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0 - for: 2m - labels: - severity: warning - annotations: - summary: Host unusual disk write latency (instance {{ $labels.instance }}) - description: Disk latency is growing (write operations > 100ms) - - alert: HostCpuStealNoisyNeighbor - expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10 - for: 0m - labels: - severity: warning - annotations: - summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }}) - description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit. - # 1000 context switches is an arbitrary number. - # Alert threshold depends on nature of application. 
- # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58 - - alert: HostContextSwitching - expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 50000 - for: 0m - labels: - severity: warning - annotations: - summary: Host context switching (instance {{ $labels.instance }}) - description: Context switching is growing on node (> 50000 / s) - - alert: HostSwapIsEnabled - expr: node_memory_SwapTotal_bytes > 0 - for: 0m - labels: - severity: warning - annotations: - summary: Swap is discouraged nowadays - - alert: HostPhysicalComponentTooHot - expr: node_hwmon_temp_celsius > 75 - for: 5m - labels: - severity: warning - annotations: - summary: Host physical component too hot (instance {{ $labels.instance }}) - description: Physical hardware component too hot - - alert: HostNodeOvertemperatureAlarm - expr: node_hwmon_temp_alarm == 1 - for: 0m - labels: - severity: critical - annotations: - summary: Host node overtemperature alarm (instance {{ $labels.instance }}) - description: Physical node temperature alarm triggered - - alert: HostRaidArrayGotInactive - expr: node_md_state{state="inactive"} > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Host RAID array got inactive (instance {{ $labels.instance }}) - description: RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically. - - alert: HostRaidDiskFailure - expr: node_md_disks{state="failed"} > 0 - for: 2m - labels: - severity: warning - annotations: - summary: Host RAID disk failure (instance {{ $labels.instance }}) - description: At least one device in RAID array on {{ $labels.instance }} failed. 
Array {{ $labels.md_device }} needs attention and possibly a disk swap - - alert: HostOomKillDetected - expr: increase(node_vmstat_oom_kill[1m]) > 0 - for: 0m - labels: - severity: warning - annotations: - summary: Host OOM kill detected (instance {{ $labels.instance }}) - description: OOM kill detected - - alert: HostEdacCorrectableErrorsDetected - expr: increase(node_edac_correctable_errors_total[1m]) > 0 - for: 0m - labels: - severity: info - annotations: - summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) - description: {{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes. - - alert: HostEdacUncorrectableErrorsDetected - expr: node_edac_uncorrectable_errors_total > 0 - for: 0m - labels: - severity: warning - annotations: - summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) - description: {{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes. - - alert: HostNetworkReceiveErrors - expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 - for: 2m - labels: - severity: warning - annotations: - summary: Host Network Receive Errors (instance {{ $labels.instance }}) - description: {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last five minutes. - - alert: HostNetworkTransmitErrors - expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 - for: 2m - labels: - severity: warning - annotations: - summary: Host Network Transmit Errors (instance {{ $labels.instance }}) - description: {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last five minutes. 
- - alert: HostNetworkInterfaceSaturated - expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8 - for: 1m - labels: - severity: warning - annotations: - summary: Host Network Interface Saturated (instance {{ $labels.instance }}) - description: The network interface "{{ $labels.interface }}" on "{{ $labels.instance }}" is getting overloaded. - - alert: HostNetworkBondDegraded - expr: node_bonding_active != node_bonding_slaves {master=~"bond.*"} - for: 2m - labels: - severity: warning - annotations: - summary: Host Network Bond Degraded - - alert: HostConntrackLimit - expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8 - for: 5m - labels: - severity: warning - annotations: - summary: Host conntrack limit (instance {{ $labels.instance }}) - description: The number of conntrack is approching limit - - alert: HostClockSkew - expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0) - for: 2m - labels: - severity: warning - annotations: - summary: Host clock skew (instance {{ $labels.instance }}) - description: Clock skew detected. Clock is out of sync. - - alert: HostClockNotSynchronising - expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16 - for: 2m - labels: - severity: warning - annotations: - summary: Host clock not synchronising (instance {{ $labels.instance }}) - description: Clock not synchronising. 
+ - name: node-exporter + rules: + - alert: ZfsOfflinePool + expr: node_zfs_zpool_state{state!="online"} > 0 + for: 1m + labels: + severity: critical + annotations: + summary: ZFS offline pool (instance {{ $labels.instance }}) + description: "A ZFS zpool is in an unexpected state: {{ $labels.state }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: HostHighLoad + expr: sum(node_load1{}) by (instance) / count(node_cpu_seconds_total{mode="user"}) by (instance) > 2.5 + for: 15m + labels: + severity: warning + annotations: + summary: Host under high load + description: Many processes are queued up for execution + - alert: HostOutOfMemory + expr: (node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_MemFree_bytes ) / node_memory_MemTotal_bytes * 100 < 20 + for: 2m + labels: + severity: warning + annotations: + summary: Host out of memory (instance {{ $labels.instance }}) + description: Node memory is filling up (< 10% left) + - alert: HostMemoryUnderMemoryPressure + expr: rate(node_vmstat_pgmajfault[1m]) > 1000 + for: 2m + labels: + severity: warning + annotations: + summary: Host memory under memory pressure (instance {{ $labels.instance }}) + description: The node is under heavy memory pressure. 
High rate of major page faults + - alert: HostUnusualNetworkThroughputIn + expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) > 160e+06 + for: 1h + labels: + severity: warning + annotations: + summary: Host unusual network throughput in (instance {{ $labels.instance }}) + description: Host network interfaces are probably receiving too much data (> 160 MB/s) + - alert: HostUnusualNetworkThroughputOut + expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) > 160e+06 + for: 1h + labels: + severity: warning + annotations: + summary: Host unusual network throughput out (instance {{ $labels.instance }}) + description: Host network interfaces are probably sending too much data (> 160 MB/s) + - alert: HostUnusualDiskReadRate + expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) > 50000000 + for: 1h + labels: + severity: warning + annotations: + summary: Host unusual disk read rate (instance {{ $labels.instance }}) + description: Disk is probably reading too much data (> 50 MB/s) + - alert: HostUnusualDiskWriteRate + expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) > 50000000 + for: 1h + labels: + severity: warning + annotations: + summary: Host unusual disk write rate (instance {{ $labels.instance }}) + description: Disk is probably writing too much data (> 50 MB/s) + # Please add ignored mountpoints in node_exporter parameters like + # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". + # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. 
+ - alert: HostOutOfDiskSpace + expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 + for: 2m + labels: + severity: warning + annotations: + summary: Host out of disk space (instance {{ $labels.instance }}) + description: Disk is almost full (< 10% left) + # Please add ignored mountpoints in node_exporter parameters like + # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". + # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. + - alert: HostDiskWillFillIn24Hours + expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 + for: 2m + labels: + severity: warning + annotations: + summary: Host disk will fill in 24 hours (instance {{ $labels.instance }}) + description: Filesystem is predicted to run out of space within the next 24 hours at current write rate + - alert: HostOutOfInodes + expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0 + for: 2m + labels: + severity: warning + annotations: + summary: Host out of inodes (instance {{ $labels.instance }}) + description: Disk is almost running out of available inodes (< 10% left) + - alert: HostInodesWillFillIn24Hours + expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0 + for: 2m + labels: + severity: warning + annotations: + summary: Host inodes will fill in 24 hours (instance {{ 
$labels.instance }}) + description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate + - alert: HostUnusualDiskReadLatency + expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0 + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk read latency (instance {{ $labels.instance }}) + description: Disk latency is growing (read operations > 100ms) + - alert: HostUnusualDiskWriteLatency + expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0 + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk write latency (instance {{ $labels.instance }}) + description: Disk latency is growing (write operations > 100ms) + - alert: HostCpuStealNoisyNeighbor + expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10 + for: 0m + labels: + severity: warning + annotations: + summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }}) + description: CPU steal is > 10%. A noisy neighbor is killing VM performance or a spot instance may be out of credit. + # 1000 context switches is an arbitrary number. + # Alert threshold depends on nature of application. 
+ # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58 + - alert: HostContextSwitching + expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 50000 + for: 0m + labels: + severity: warning + annotations: + summary: Host context switching (instance {{ $labels.instance }}) + description: Context switching is growing on node (> 50000 / s) + - alert: HostSwapIsEnabled + expr: node_memory_SwapTotal_bytes > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Swap is discouraged nowadays + - alert: HostPhysicalComponentTooHot + expr: node_hwmon_temp_celsius > 75 + for: 5m + labels: + severity: warning + annotations: + summary: Host physical component too hot (instance {{ $labels.instance }}) + description: Physical hardware component too hot + - alert: HostNodeOvertemperatureAlarm + expr: node_hwmon_temp_alarm == 1 + for: 0m + labels: + severity: critical + annotations: + summary: Host node overtemperature alarm (instance {{ $labels.instance }}) + description: Physical node temperature alarm triggered + - alert: HostRaidArrayGotInactive + expr: node_md_state{state="inactive"} > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Host RAID array got inactive (instance {{ $labels.instance }}) + description: RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically. + - alert: HostRaidDiskFailure + expr: node_md_disks{state="failed"} > 0 + for: 2m + labels: + severity: warning + annotations: + summary: Host RAID disk failure (instance {{ $labels.instance }}) + description: At least one device in RAID array on {{ $labels.instance }} failed. 
Array {{ $labels.md_device }} needs attention and possibly a disk swap + - alert: HostOomKillDetected + expr: increase(node_vmstat_oom_kill[1m]) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Host OOM kill detected (instance {{ $labels.instance }}) + description: OOM kill detected + - alert: HostEdacCorrectableErrorsDetected + expr: increase(node_edac_correctable_errors_total[1m]) > 0 + for: 0m + labels: + severity: info + annotations: + summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: HostEdacUncorrectableErrorsDetected + expr: node_edac_uncorrectable_errors_total > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: HostNetworkReceiveErrors + expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Receive Errors (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: HostNetworkTransmitErrors + expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Transmit Errors (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} interface {{ $labels.device }} 
has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: HostNetworkInterfaceSaturated + expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8 + for: 1m + labels: + severity: warning + annotations: + summary: Host Network Interface Saturated (instance {{ $labels.instance }}) + description: "The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded." + - alert: HostNetworkBondDegraded + expr: node_bonding_active != node_bonding_slaves {master=~"bond.*"} + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Bond Degraded + - alert: HostConntrackLimit + expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8 + for: 5m + labels: + severity: warning + annotations: + summary: Host conntrack limit (instance {{ $labels.instance }}) + description: The number of conntrack is approaching limit + - alert: HostClockSkew + expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0) + for: 2m + labels: + severity: warning + annotations: + summary: Host clock skew (instance {{ $labels.instance }}) + description: Clock skew detected. Clock is out of sync. + - alert: HostClockNotSynchronising + expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16 + for: 2m + labels: + severity: warning + annotations: + summary: Host clock not synchronising (instance {{ $labels.instance }}) + description: Clock not synchronising. 
--- apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule @@ -316,14 +317,14 @@ spec: groups: - name: smart rules: - - alert: SmartSSDWriteRateTooHigh - expr: rate(smartmon_total_lbas_written_raw_value[72h]) * 512 > 10000000 - for: 5m - labels: - severity: warning - annotations: - summary: SSD write rate exceeds 10MB/s - description: At this rate the SSD will be worn out before warranty period expires + - alert: SmartSSDWriteRateTooHigh + expr: rate(smartmon_total_lbas_written_raw_value[72h]) * 512 > 10000000 + for: 5m + labels: + severity: warning + annotations: + summary: SSD write rate exceeds 10MB/s + description: At this rate the SSD will be worn out before warranty period expires --- apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule @@ -333,27 +334,27 @@ spec: groups: - name: temperatures rules: - - alert: HighDiskTemperature - expr: smartmon_airflow_temperature_cel_raw_value > 45 or smartmon_temperature_celsius_raw_value > 45 - for: 10m - labels: - severity: critical - annotations: - summary: High HDD/SSD temperature indicates high ambient temperature - - alert: HighChipsetTemperature - expr: node_hwmon_temp_celsius > 65 - for: 10m - labels: - severity: warning - annotations: - summary: High chipset (CPU, NB) temperature indicates insufficient or failing fans - - alert: LowDiskTemperature - expr: smartmon_airflow_temperature_cel_raw_value < 10 or smartmon_temperature_celsius_raw_value < 10 - for: 10m - labels: - severity: critical - annotations: - summary: Low HDD/SSD temperature indicates low ambient temperature and stuck server room exhaust fan relay + - alert: HighDiskTemperature + expr: smartmon_airflow_temperature_cel_raw_value > 45 or smartmon_temperature_celsius_raw_value > 45 + for: 10m + labels: + severity: critical + annotations: + summary: High HDD/SSD temperature indicates high ambient temperature + - alert: HighChipsetTemperature + expr: node_hwmon_temp_celsius > 65 + for: 10m + labels: + severity: warning + annotations: + summary: 
High chipset (CPU, NB) temperature indicates insufficient or failing fans + - alert: LowDiskTemperature + expr: smartmon_airflow_temperature_cel_raw_value < 10 or smartmon_temperature_celsius_raw_value < 10 + for: 10m + labels: + severity: critical + annotations: + summary: Low HDD/SSD temperature indicates low ambient temperature and stuck server room exhaust fan relay --- apiVersion: monitoring.coreos.com/v1 kind: PodMonitor @@ -392,40 +393,40 @@ spec: app: node-exporter spec: containers: - - name: node-exporter - args: - - --web.listen-address=0.0.0.0:9101 - - --path.sysfs=/host/sys - - --path.rootfs=/host/root - - --no-collector.wifi - - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/) - - --collector.netclass.ignored-devices=^(veth.*|[a-f0-9]{15})$ - - --collector.netdev.device-exclude=^(veth.*|[a-f0-9]{15})$ - image: prom/node-exporter:v1.3.1 - resources: - limits: - cpu: 50m - memory: 180Mi - requests: - cpu: 5m - memory: 20Mi - volumeMounts: - - mountPath: /host/sys - mountPropagation: HostToContainer - name: sys - readOnly: true - - mountPath: /host/root - mountPropagation: HostToContainer - name: root - readOnly: true - ports: - - containerPort: 9101 - name: web - securityContext: - runAsGroup: 65532 - runAsNonRoot: true - runAsUser: 65532 - readOnlyRootFilesystem: true + - name: node-exporter + args: + - --web.listen-address=0.0.0.0:9101 + - --path.sysfs=/host/sys + - --path.rootfs=/host/root + - --no-collector.wifi + - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/) + - --collector.netclass.ignored-devices=^(veth.*|[a-f0-9]{15})$ + - --collector.netdev.device-exclude=^(veth.*|[a-f0-9]{15})$ + image: prom/node-exporter:v1.3.1 + resources: + limits: + cpu: 50m + memory: 180Mi + requests: + cpu: 5m + memory: 20Mi + volumeMounts: + - mountPath: /host/sys + mountPropagation: HostToContainer + name: sys + readOnly: true + - mountPath: 
/host/root + mountPropagation: HostToContainer + name: root + readOnly: true + ports: + - containerPort: 9101 + name: web + securityContext: + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + readOnlyRootFilesystem: true hostNetwork: true hostPID: true securityContext: @@ -433,11 +434,11 @@ spec: runAsUser: 65534 serviceAccountName: node-exporter tolerations: - - operator: Exists + - operator: Exists volumes: - - hostPath: - path: /sys - name: sys - - hostPath: - path: / - name: root + - hostPath: + path: /sys + name: sys + - hostPath: + path: / + name: root