Migrate to Prometheus Operator

Commit 1045ed2f26 (parent ee4b1ddf57), 2022-09-11 16:24:35 +03:00
30 changed files with 32403 additions and 129 deletions


@@ -0,0 +1,19 @@
# Prometheus operator
```
curl -L https://github.com/prometheus-operator/prometheus-operator/releases/download/v0.59.0/bundle.yaml | sed -e 's/namespace: default/namespace: prometheus-operator/g' > bundle.yml
kubectl create namespace prometheus-operator
kubectl apply --server-side -n prometheus-operator -f bundle.yml
kubectl delete -n prometheus-operator configmap snmp-exporter
kubectl create -n prometheus-operator configmap snmp-exporter --from-file=snmp.yml
kubectl apply -n prometheus-operator -f application.yml -f node-exporter.yml -f blackbox-exporter.yml -f snmp-exporter.yml -f mikrotik-exporter.yml
```
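Once everything above has been applied, a quick sanity check could look like the following (a sketch, not part of the original instructions; the StatefulSet names follow the operator's `<kind>-<name>` convention for the `prometheus` and `alertmanager` objects defined in these manifests):
```
# Operator itself should roll out
kubectl -n prometheus-operator rollout status deployment/prometheus-operator
# CRDs installed by bundle.yml
kubectl get crd | grep monitoring.coreos.com
# StatefulSets the operator generates from the Prometheus and Alertmanager objects
kubectl -n prometheus-operator get statefulset prometheus-prometheus alertmanager-alertmanager
# Monitoring objects picked up for scraping
kubectl -n prometheus-operator get servicemonitors,podmonitors,probes,prometheusrules
```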
# Mikrotik exporter
```
kubectl create -n prometheus-operator secret generic mikrotik-exporter \
--from-literal=MIKROTIK_PASSWORD='f7W!H*Pu' \
--from-literal=PROMETHEUS_BEARER_TOKEN=$(cat /dev/urandom | base64 | head -c 30)
```
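To confirm the secret landed with the expected keys (a sketch using the names created above):
```
kubectl -n prometheus-operator get secret mikrotik-exporter -o jsonpath='{.data.PROMETHEUS_BEARER_TOKEN}' | base64 -d; echo
```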


@@ -0,0 +1,762 @@
---
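# Catch-all PodMonitor: the empty label selector matches every pod in this
# namespace (an empty namespaceSelector means "own namespace" for a PodMonitor)
# and scrapes any container port named "exporter" or "metrics".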
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: metrics
spec:
namespaceSelector: {}
selector: {}
podMetricsEndpoints:
- port: exporter
- port: metrics
---
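# Three Alertmanager replicas; the operator joins them into one cluster and
# exposes them via the alertmanager-operated Service used by the
# am.k-space.ee Ingress further down.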
apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
name: alertmanager
spec:
nodeSelector:
dedicated: monitoring
tolerations:
- key: dedicated
operator: Equal
value: monitoring
effect: NoSchedule
replicas: 3
serviceAccountName: alertmanager
externalUrl: http://am.k-space.ee/
routePrefix: "/"
securityContext:
fsGroup: 2000
runAsGroup: 2000
runAsNonRoot: true
runAsUser: 1000
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: alertmanager
---
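# With every *Selector and *NamespaceSelector below set to the empty selector
# {}, this Prometheus picks up ServiceMonitors, PodMonitors, Probes and
# PrometheusRules from all namespaces.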
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
name: prometheus
spec:
nodeSelector:
dedicated: monitoring
tolerations:
- key: dedicated
operator: Equal
value: monitoring
effect: NoSchedule
alerting:
alertmanagers:
- namespace: prometheus-operator
name: alertmanager
port: http
pathPrefix: "/"
apiVersion: v2
externalUrl: "http://prom.k-space.ee/"
replicas: 2
shards: 1
serviceAccountName: prometheus
securityContext:
fsGroup: 2000
runAsGroup: 2000
runAsNonRoot: true
runAsUser: 1000
serviceMonitorNamespaceSelector: {}
serviceMonitorSelector: {}
podMonitorNamespaceSelector: {}
podMonitorSelector: {}
probeNamespaceSelector: {}
probeSelector: {}
ruleNamespaceSelector: {}
ruleSelector: {}
retentionSize: 80GB
storage:
volumeClaimTemplate:
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 100Gi
storageClassName: local-path
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus
rules:
- apiGroups: [""]
resources:
- nodes
- nodes/metrics
- services
- endpoints
- pods
verbs: ["get", "list", "watch"]
- apiGroups: [""]
resources:
- configmaps
verbs: ["get"]
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: prometheus-operator
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: prometheus
spec:
groups:
- name: prometheus
rules:
- alert: PrometheusJobMissing
annotations:
description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n \
\ LABELS = {{ $labels }}"
summary: Prometheus job missing (instance {{ $labels.instance }})
expr: absent(up{job="prometheus-operator/prometheus"})
for: 0m
labels:
severity: warning
- alert: PrometheusTargetMissing
annotations:
description: "A Prometheus target has disappeared. An exporter might be crashed.\n\
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus target missing (instance {{ $labels.instance }})
expr: up == 0
for: 5m
labels:
severity: critical
- alert: PrometheusAllTargetsMissing
annotations:
description: "A Prometheus job does not have living target anymore.\n VALUE\
\ = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus all targets missing (instance {{ $labels.instance }})
expr: count by (job) (up) == 0
for: 0m
labels:
severity: critical
- alert: PrometheusConfigurationReloadFailure
annotations:
description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n\
\ LABELS = {{ $labels }}"
summary: Prometheus configuration reload failure (instance {{ $labels.instance
}})
expr: prometheus_config_last_reload_successful != 1
for: 0m
labels:
severity: warning
- alert: PrometheusTooManyRestarts
annotations:
description: "Prometheus has restarted more than twice in the last 15 minutes.\
\ It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels\
\ }}"
summary: Prometheus too many restarts (instance {{ $labels.instance }})
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m])
> 2
for: 0m
labels:
severity: warning
- alert: PrometheusAlertmanagerJobMissing
annotations:
description: "A Prometheus AlertManager job has disappeared\n VALUE = {{\
\ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus AlertManager job missing (instance {{ $labels.instance
}})
expr: absent(up{job="prometheus-operator/alertmanager"})
for: 0m
labels:
severity: warning
- alert: PrometheusAlertmanagerConfigurationReloadFailure
annotations:
description: "AlertManager configuration reload error\n VALUE = {{ $value\
\ }}\n LABELS = {{ $labels }}"
summary: Prometheus AlertManager configuration reload failure (instance {{
$labels.instance }})
expr: alertmanager_config_last_reload_successful != 1
for: 0m
labels:
severity: warning
- alert: PrometheusAlertmanagerConfigNotSynced
annotations:
description: "Configurations of AlertManager cluster instances are out of\
\ sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus AlertManager config not synced (instance {{ $labels.instance
}})
expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
for: 0m
labels:
severity: warning
- alert: PrometheusNotConnectedToAlertmanager
annotations:
description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value\
\ }}\n LABELS = {{ $labels }}"
summary: Prometheus not connected to alertmanager (instance {{ $labels.instance
}})
expr: prometheus_notifications_alertmanagers_discovered < 1
for: 0m
labels:
severity: critical
- alert: PrometheusRuleEvaluationFailures
annotations:
description: "Prometheus encountered {{ $value }} rule evaluation failures,\
\ leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS\
\ = {{ $labels }}"
summary: Prometheus rule evaluation failures (instance {{ $labels.instance
}})
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTemplateTextExpansionFailures
annotations:
description: "Prometheus encountered {{ $value }} template text expansion\
\ failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus template text expansion failures (instance {{ $labels.instance
}})
expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusRuleEvaluationSlow
annotations:
description: "Prometheus rule evaluation took more time than the scheduled\
\ interval. It indicates a slower storage backend access or too complex\
\ query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
for: 5m
labels:
severity: warning
- alert: PrometheusNotificationsBacklog
annotations:
description: "The Prometheus notification queue has not been empty for 10\
\ minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus notifications backlog (instance {{ $labels.instance }})
expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
for: 0m
labels:
severity: warning
- alert: PrometheusAlertmanagerNotificationFailing
annotations:
description: "Alertmanager is failing sending notifications\n VALUE = {{\
\ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus AlertManager notification failing (instance {{ $labels.instance
}})
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTargetEmpty
annotations:
description: "Prometheus has no target in service discovery\n VALUE = {{\
\ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus target empty (instance {{ $labels.instance }})
expr: prometheus_sd_discovered_targets == 0
for: 0m
labels:
severity: critical
- alert: PrometheusLargeScrape
annotations:
description: "Prometheus has many scrapes that exceed the sample limit\n \
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus large scrape (instance {{ $labels.instance }})
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) >
10
for: 5m
labels:
severity: warning
- alert: PrometheusTargetScrapeDuplicate
annotations:
description: "Prometheus has many samples rejected due to duplicate timestamps\
\ but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus target scrape duplicate (instance {{ $labels.instance
}})
expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m])
> 0
for: 0m
labels:
severity: warning
- alert: PrometheusTsdbCheckpointCreationFailures
annotations:
description: "Prometheus encountered {{ $value }} checkpoint creation failures\n\
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance
}})
expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTsdbCheckpointDeletionFailures
annotations:
description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n\
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance
}})
expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTsdbCompactionsFailed
annotations:
description: "Prometheus encountered {{ $value }} TSDB compactions failures\n\
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB compactions failed (instance {{ $labels.instance
}})
expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTsdbHeadTruncationsFailed
annotations:
description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n\
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance
}})
expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTsdbReloadFailures
annotations:
description: "Prometheus encountered {{ $value }} TSDB reload failures\n \
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTsdbWalCorruptions
annotations:
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n \
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB WAL is corrupt, make sure there is enough disk space
and wipe /data/wal
expr: increase(prometheus_tsdb_wal_corruptions_total[2h]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTsdbWalTruncationsFailed
annotations:
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n\
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance
}})
expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: prometheus
annotations:
cert-manager.io/cluster-issuer: default
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.tls: "true"
external-dns.alpha.kubernetes.io/target: traefik.k-space.ee
traefik.ingress.kubernetes.io/router.middlewares: traefik-sso@kubernetescrd
spec:
rules:
- host: prom.k-space.ee
http:
paths:
- pathType: Prefix
path: "/"
backend:
service:
name: prometheus-operated
port:
number: 9090
tls:
- hosts:
- prom.k-space.ee
secretName: prom-tls
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: alertmanager
annotations:
cert-manager.io/cluster-issuer: default
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.tls: "true"
external-dns.alpha.kubernetes.io/target: traefik.k-space.ee
traefik.ingress.kubernetes.io/router.middlewares: traefik-sso@kubernetescrd
spec:
rules:
- host: am.k-space.ee
http:
paths:
- pathType: Prefix
path: "/"
backend:
service:
name: alertmanager-operated
port:
number: 9093
tls:
- hosts:
- am.k-space.ee
secretName: alertmanager-tls
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: prometheus
spec:
selector:
matchLabels:
app.kubernetes.io/name: prometheus
podMetricsEndpoints:
- port: web
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: alertmanager
spec:
selector:
matchLabels:
app.kubernetes.io/name: alertmanager
podMetricsEndpoints:
- port: web
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: operator
spec:
selector:
matchLabels:
app.kubernetes.io/name: prometheus-operator
podMetricsEndpoints:
- port: http
---
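# Scrapes each kubelet's /metrics and /metrics/cadvisor endpoints over HTTPS,
# authenticating with the in-cluster service account token; certificate
# verification is skipped since kubelet serving certificates are typically
# self-signed.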
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: kubelet
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
interval: 30s
port: https-metrics
scheme: https
tlsConfig:
insecureSkipVerify: true
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
interval: 30s
path: /metrics/cadvisor
port: https-metrics
scheme: https
tlsConfig:
insecureSkipVerify: true
namespaceSelector:
matchNames:
- kube-system
selector:
matchLabels:
app.kubernetes.io/name: kubelet
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: kube-state-metrics
spec:
groups:
- name: kube-state-metrics
rules:
- alert: KubernetesNodeReady
expr: kube_node_status_condition{condition="Ready",status="true"} == 0
for: 10m
labels:
severity: critical
annotations:
summary: Kubernetes Node ready (instance {{ $labels.instance }})
description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesMemoryPressure
expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
for: 2m
labels:
severity: critical
annotations:
summary: Kubernetes memory pressure (instance {{ $labels.instance }})
description: "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDiskPressure
expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
for: 2m
labels:
severity: critical
annotations:
summary: Kubernetes disk pressure (instance {{ $labels.instance }})
description: "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesOutOfDisk
expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
for: 2m
labels:
severity: critical
annotations:
summary: Kubernetes out of disk (instance {{ $labels.instance }})
description: "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesOutOfCapacity
expr: sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90
for: 2m
labels:
severity: warning
annotations:
summary: Kubernetes out of capacity (instance {{ $labels.instance }})
description: "{{ $labels.node }} is out of capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesContainerOomKiller
expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
for: 0m
labels:
severity: warning
annotations:
summary: Kubernetes container oom killer (instance {{ $labels.instance }})
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesJobFailed
expr: kube_job_status_failed > 0
for: 0m
labels:
severity: warning
annotations:
summary: Kubernetes Job failed (instance {{ $labels.instance }})
description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesCronjobSuspended
expr: kube_cronjob_spec_suspend != 0
for: 0m
labels:
severity: warning
annotations:
summary: Kubernetes CronJob suspended (instance {{ $labels.instance }})
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPersistentvolumeclaimPending
expr: kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1
for: 2m
labels:
severity: warning
annotations:
summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }})
description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesVolumeOutOfDiskSpace
expr: kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10
for: 2m
labels:
severity: warning
annotations:
summary: Kubernetes Volume out of disk space (instance {{ $labels.instance }})
description: "Volume is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesVolumeFullInFourDays
expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0
for: 0m
labels:
severity: critical
annotations:
summary: Kubernetes Volume full in four days (instance {{ $labels.instance }})
description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPersistentvolumeError
expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0
for: 0m
labels:
severity: critical
annotations:
summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }})
description: "Persistent volume is in bad state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetDown
expr: (kube_statefulset_status_replicas_ready / kube_statefulset_status_replicas_current) != 1
for: 1m
labels:
severity: critical
annotations:
summary: Kubernetes StatefulSet down (instance {{ $labels.instance }})
description: "A StatefulSet went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesHpaScalingAbility
expr: kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1
for: 2m
labels:
severity: warning
annotations:
summary: Kubernetes HPA scaling ability (instance {{ $labels.instance }})
description: "Pod is unable to scale\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesHpaMetricAvailability
expr: kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1
for: 0m
labels:
severity: warning
annotations:
summary: Kubernetes HPA metric availability (instance {{ $labels.instance }})
description: "HPA is not able to collect metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesHpaScaleCapability
expr: kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas
for: 2m
labels:
severity: info
annotations:
summary: Kubernetes HPA scale capability (instance {{ $labels.instance }})
description: "The maximum number of desired Pods has been hit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPodNotHealthy
expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[15m:1m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: Kubernetes Pod not healthy (instance {{ $labels.instance }})
description: "Pod has been in a non-ready state for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPodCrashLooping
expr: increase(kube_pod_container_status_restarts_total[1m]) > 3
for: 2m
labels:
severity: warning
annotations:
summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
description: "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesReplicassetMismatch
expr: kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas
for: 10m
labels:
severity: warning
annotations:
summary: Kubernetes ReplicaSet mismatch (instance {{ $labels.instance }})
description: "ReplicaSet replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDeploymentReplicasMismatch
expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
for: 10m
labels:
severity: warning
annotations:
summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }})
description: "Deployment Replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetReplicasMismatch
expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas
for: 10m
labels:
severity: warning
annotations:
summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})
description: "A StatefulSet does not match the expected number of replicas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDeploymentGenerationMismatch
expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
for: 10m
labels:
severity: critical
annotations:
summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})
description: "A Deployment has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetGenerationMismatch
expr: kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation
for: 10m
labels:
severity: critical
annotations:
summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }})
description: "A StatefulSet has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetUpdateNotRolledOut
expr: max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)
for: 10m
labels:
severity: warning
annotations:
summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }})
description: "StatefulSet update has not been rolled out.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDaemonsetRolloutStuck
expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
for: 10m
labels:
severity: warning
annotations:
summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }})
description: "Some Pods of DaemonSet are not scheduled or not ready\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDaemonsetMisscheduled
expr: kube_daemonset_status_number_misscheduled > 0
for: 1m
labels:
severity: critical
annotations:
summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }})
description: "Some DaemonSet Pods are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesCronjobTooLong
expr: time() - kube_cronjob_next_schedule_time > 3600
for: 0m
labels:
severity: warning
annotations:
summary: Kubernetes CronJob too long (instance {{ $labels.instance }})
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesJobSlowCompletion
expr: kube_job_spec_completions - kube_job_status_succeeded > 0
for: 12h
labels:
severity: critical
annotations:
summary: Kubernetes job slow completion (instance {{ $labels.instance }})
description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesApiServerErrors
expr: sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3
for: 2m
labels:
severity: critical
annotations:
summary: Kubernetes API server errors (instance {{ $labels.instance }})
description: "Kubernetes API server is experiencing high error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesApiClientErrors
expr: (sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1
for: 2m
labels:
severity: critical
annotations:
summary: Kubernetes API client errors (instance {{ $labels.instance }})
description: "Kubernetes API client is experiencing high error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesClientCertificateExpiresNextWeek
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60
for: 0m
labels:
severity: warning
annotations:
summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
description: "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesClientCertificateExpiresSoon
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60
for: 0m
labels:
severity: critical
annotations:
summary: Kubernetes client certificate expires soon (instance {{ $labels.instance }})
description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesApiServerLatency
expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) / 1e+06 > 1
for: 2m
labels:
severity: warning
annotations:
summary: Kubernetes API server latency (instance {{ $labels.instance }})
description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"


@@ -0,0 +1,258 @@
---
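# Blackbox probes: each Probe points at the blackbox-exporter Service defined
# further down and references a module (http_2xx, tcp_connect,
# dns_check_traefik, ...) from its config.yml ConfigMap.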
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
name: websites
spec:
prober:
url: blackbox-exporter
path: /probe
module: http_2xx
targets:
staticConfig:
static:
- https://git.k-space.ee/
- https://grafana.k-space.ee/
- https://wiki.k-space.ee/
- https://pad.k-space.ee/
- https://members.k-space.ee/
- https://nextcloud.k-space.ee/
- http://minio.infra.k-space.ee:9001/login
---
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
name: k6.ee
spec:
prober:
url: blackbox-exporter
path: /probe
module: dns_check_traefik
targets:
staticConfig:
static:
- 193.40.103.2
- 62.65.250.2
---
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
name: samba-cluster
spec:
prober:
url: blackbox-exporter
path: /metrics
module: tcp_connect
targets:
staticConfig:
static:
- dc1.ad.k-space.ee:636
- dc2.ad.k-space.ee:636
- dc3.ad.k-space.ee:636
---
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
name: misc
spec:
prober:
url: blackbox-exporter
path: /metrics
module: tcp_connect
targets:
staticConfig:
static:
- mail.k-space.ee:465
- dev.k-space.ee:10648
- mariadb.infra.k-space.ee:3306
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: blackbox-exporter
spec:
# https://awesome-prometheus-alerts.grep.to/rules#blackbox
groups:
- name: blackbox
rules:
- alert: BlackboxProbeFailed
expr: probe_success == 0
for: 2m
labels:
severity: critical
annotations:
summary: Blackbox probe failed (instance {{ $labels.instance }})
description: Probe failed
- alert: BlackboxSlowProbe
expr: avg_over_time(probe_duration_seconds[1m]) > 1
for: 5m
labels:
severity: warning
annotations:
summary: Blackbox slow probe (instance {{ $labels.instance }})
description: Blackbox probe took more than 1s to complete
- alert: BlackboxSlowDNS
expr: avg_over_time(probe_dns_lookup_time_seconds[1m]) > 1
for: 5m
labels:
severity: warning
annotations:
summary: Blackbox slow DNS lookup (instance {{ $labels.instance }})
description: Blackbox DNS lookup took more than 1s to complete.
Using IPv6 DNS servers in conjunction with Docker seemed to result in an
odd 5s latency bump, so for now we're using 8.8.8.8.
- alert: BlackboxProbeHttpFailure
expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
for: 5m
labels:
severity: critical
annotations:
summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
description: HTTP status code is not 200-399
- alert: BlackboxSslCertificateWillExpireSoon
expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
for: 0m
labels:
severity: warning
annotations:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
description: SSL certificate expires in 30 days
- alert: BlackboxSslCertificateWillExpireSoon
expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
description: SSL certificate expires in 3 days
- alert: BlackboxSslCertificateExpired
expr: probe_ssl_earliest_cert_expiry - time() <= 0
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
description: SSL certificate has expired already
- alert: BlackboxProbeSlowHttp
expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
for: 1m
labels:
severity: warning
annotations:
summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
description: HTTP request took more than 1s
- alert: BlackboxProbeSlowPing
expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
for: 1m
labels:
severity: warning
annotations:
summary: Blackbox probe slow ping (instance {{ $labels.instance }})
description: Blackbox ping took more than 1s
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: blackbox-exporter
spec:
revisionHistoryLimit: 0
replicas: 2
selector:
matchLabels:
app: blackbox-exporter
template:
metadata:
labels:
app: blackbox-exporter
spec:
containers:
- name: blackbox-exporter
image: prom/blackbox-exporter:v0.20.0
volumeMounts:
- name: blackbox-exporter-config
mountPath: /etc/blackbox_exporter
volumes:
- name: blackbox-exporter-config
configMap:
name: blackbox-exporter-config
# TODO: Results in odd 6s connection lag if scheduled in VLAN20
nodeSelector:
dedicated: monitoring
tolerations:
- key: dedicated
operator: Equal
value: monitoring
effect: NoSchedule
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: app
operator: In
values:
- blackbox-exporter
topologyKey: "kubernetes.io/hostname"
---
kind: Service
apiVersion: v1
metadata:
name: blackbox-exporter
spec:
type: ClusterIP
ports:
- name: http
port: 80
protocol: TCP
targetPort: 9115
selector:
app: blackbox-exporter
---
apiVersion: v1
kind: ConfigMap
metadata:
name: blackbox-exporter-config
data:
config.yml: |-
modules:
http_2xx:
prober: http
http:
preferred_ip_protocol: "ip4"
ip_protocol_fallback: false
http_post_2xx:
prober: http
http:
method: POST
preferred_ip_protocol: "ip4"
ip_protocol_fallback: false
tcp_connect:
prober: tcp
tcp:
preferred_ip_protocol: "ip4"
ip_protocol_fallback: false
icmp:
prober: icmp
icmp:
preferred_ip_protocol: "ip4"
ip_protocol_fallback: false
dns_check_traefik:
prober: dns
dns:
query_name: "traefik.k-space.ee"
query_type: "A"
validate_answer_rrs:
fail_if_not_matches_regexp:
- "traefik\\.k-space\\.ee\\.\\t.*\\tIN\\tA\\t193\\.40\\.103\\.[1-9][0-9]*"
preferred_ip_protocol: "ip4"
ip_protocol_fallback: false
dns_check_k6:
prober: dns
dns:
query_name: "k6.ee"
query_type: "A"
validate_answer_rrs:
fail_if_not_matches_regexp:
- "k6\\.ee\\.\\t.*\\tIN\\tA\\t193\\.40\\.103\\.[1-9][0-9]*"
preferred_ip_protocol: "ip4"
ip_protocol_fallback: false

prometheus-operator/bundle.yml (new file, 28816 lines): diff suppressed because it is too large.


@@ -0,0 +1,104 @@
---
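# The scrape request to mikrotik-exporter carries the bearer token from the
# mikrotik-exporter secret created in the README.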
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
name: mikrotik
spec:
bearerTokenSecret:
name: mikrotik-exporter
key: PROMETHEUS_BEARER_TOKEN
prober:
path: /metrics
url: mikrotik-exporter
targets:
staticConfig:
static:
- router.mgmt.k-space.ee
- sw_chaos.mgmt.k-space.ee
- sw_poe.mgmt.k-space.ee
- sw_mgmt.mgmt.k-space.ee
- sw_core02.mgmt.k-space.ee
- sw_cyber.mgmt.k-space.ee
- sw_ha.mgmt.k-space.ee
- sw_asocial.mgmt.k-space.ee
- sw_kitchen.mgmt.k-space.ee
- sw_core01.mgmt.k-space.ee
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: mikrotik
spec:
groups:
- name: mikrotik
rules:
- alert: MikrotikUplinkRedundancyLost
expr: mikrotik_interface_running{port=~"sfp-sfpplus[12]", instance!~"sw_core.*", instance!~"sw_mgmt.*"} == 0
for: 0m
labels:
severity: error
annotations:
summary: Switch uplink high availability lost
description: One of the two 10Gb optical links is malfunctioning
- alert: MikrotikLinkRateDegraded
expr: mikrotik_interface_rate{port=~"sfp-sfpplus.*"} < 10000000000
for: 0m
labels:
severity: error
annotations:
summary: 10Gb link degraded
description: One of the 10Gb links is running at lower speed
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: mikrotik-exporter
spec:
revisionHistoryLimit: 0
replicas: 2
selector:
matchLabels:
app: mikrotik-exporter
template:
metadata:
labels:
app: mikrotik-exporter
annotations:
co.elastic.logs/multiline.pattern: '^ '
co.elastic.logs/multiline.negate: "false"
co.elastic.logs/multiline.match: after
spec:
containers:
- name: mikrotik-exporter
image: harbor.k-space.ee/k-space/mikrotik-exporter:latest
env:
- name: MIKROTIK_USER
value: netpoller
envFrom:
- secretRef:
name: mikrotik-exporter
nodeSelector:
dedicated: monitoring
tolerations:
- key: dedicated
operator: Equal
value: monitoring
effect: NoSchedule
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- topologyKey: "kubernetes.io/hostname"
---
kind: Service
apiVersion: v1
metadata:
name: mikrotik-exporter
spec:
type: ClusterIP
ports:
- name: http
port: 80
protocol: TCP
targetPort: 3001
selector:
app: mikrotik-exporter


@@ -0,0 +1,443 @@
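# Probes for standalone hosts running node_exporter: the relabeling below
# copies each static target into __address__, so Prometheus scrapes the listed
# host:port directly and the prober url is only a placeholder.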
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
name: nodes-proxmox
spec:
targets:
staticConfig:
static:
- nas.mgmt.k-space.ee:9100
- pve1.proxmox.infra.k-space.ee:9100
- pve8.proxmox.infra.k-space.ee:9100
- pve9.proxmox.infra.k-space.ee:9100
relabelingConfigs:
- sourceLabels: [__param_target]
targetLabel: instance
- sourceLabels: [__param_target]
targetLabel: __address__
prober:
url: localhost
path: /metrics
metricRelabelings:
- sourceLabels: [__address__]
targetLabel: target
---
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
name: nodes-misc
spec:
targets:
staticConfig:
static:
- sprucecone.infra.k-space.ee:9100
- cedarcone.infra.k-space.ee:9100
relabelingConfigs:
- sourceLabels: [__param_target]
targetLabel: instance
- sourceLabels: [__param_target]
targetLabel: __address__
prober:
url: localhost
path: /metrics
metricRelabelings:
- sourceLabels: [__address__]
targetLabel: target
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: node-exporter
spec:
groups:
- name: node-exporter
rules:
- alert: ZfsOfflinePool
expr: node_zfs_zpool_state{state!="online"} > 0
for: 1m
labels:
severity: critical
annotations:
summary: ZFS offline pool (instance {{ $labels.instance }})
description: "A ZFS zpool is in a unexpected state: {{ $labels.state }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostHighLoad
expr: sum(node_load1{}) by (instance) / count(node_cpu_seconds_total{mode="user"}) by (instance) > 2.5
for: 15m
labels:
severity: warning
annotations:
summary: Host under high load
description: Many processes are queued up for execution
- alert: HostOutOfMemory
expr: (node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_MemFree_bytes ) / node_memory_MemTotal_bytes * 100 < 20
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: Node memory is filling up (< 10% left)
- alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
for: 2m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: The node is under heavy memory pressure. High rate of major page faults
- alert: HostUnusualNetworkThroughputIn
expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) > 160e+06
for: 1h
labels:
severity: warning
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: Host network interfaces are probably receiving too much data (> 160 MB/s)
- alert: HostUnusualNetworkThroughputOut
expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) > 160e+06
for: 1h
labels:
severity: warning
annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: Host network interfaces are probably sending too much data (> 160 MB/s)
- alert: HostUnusualDiskReadRate
expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) > 50000000
for: 1h
labels:
severity: warning
annotations:
summary: Host unusual disk read rate (instance {{ $labels.instance }})
description: Disk is probably reading too much data (> 50 MB/s)
- alert: HostUnusualDiskWriteRate
expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) > 50000000
for: 1h
labels:
severity: warning
annotations:
summary: Host unusual disk write rate (instance {{ $labels.instance }})
description: Disk is probably writing too much data (> 50 MB/s)
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: Disk is almost full (< 10% left)
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostDiskWillFillIn24Hours
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
description: Filesystem is predicted to run out of space within the next 24 hours at current write rate
- alert: HostOutOfInodes
expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host out of inodes (instance {{ $labels.instance }})
description: Disk is almost running out of available inodes (< 10% left)
- alert: HostInodesWillFillIn24Hours
expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate
- alert: HostUnusualDiskReadLatency
expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk read latency (instance {{ $labels.instance }})
description: Disk latency is growing (read operations > 100ms)
- alert: HostUnusualDiskWriteLatency
expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write latency (instance {{ $labels.instance }})
description: Disk latency is growing (write operations > 100ms)
- alert: HostCpuStealNoisyNeighbor
expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description: CPU steal is > 10%. A noisy neighbor is killing VM performance, or a spot instance may be out of credit.
# 1000 context switches is an arbitrary number.
# Alert threshold depends on nature of application.
# Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
- alert: HostContextSwitching
expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 50000
for: 0m
labels:
severity: warning
annotations:
summary: Host context switching (instance {{ $labels.instance }})
description: Context switching is growing on node (> 50000 / s)
- alert: HostSwapIsEnabled
expr: node_memory_SwapTotal_bytes > 0
for: 0m
labels:
severity: warning
annotations:
summary: Swap is discouraged nowadays
- alert: HostPhysicalComponentTooHot
expr: node_hwmon_temp_celsius > 75
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
description: Physical hardware component too hot
- alert: HostNodeOvertemperatureAlarm
expr: node_hwmon_temp_alarm == 1
for: 0m
labels:
severity: critical
annotations:
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
description: Physical node temperature alarm triggered
- alert: HostRaidArrayGotInactive
expr: node_md_state{state="inactive"} > 0
for: 0m
labels:
severity: critical
annotations:
summary: Host RAID array got inactive (instance {{ $labels.instance }})
description: RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.
- alert: HostRaidDiskFailure
expr: node_md_disks{state="failed"} > 0
for: 2m
labels:
severity: warning
annotations:
summary: Host RAID disk failure (instance {{ $labels.instance }})
description: At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap
- alert: HostOomKillDetected
expr: increase(node_vmstat_oom_kill[1m]) > 0
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{ $labels.instance }})
description: OOM kill detected
- alert: HostEdacCorrectableErrorsDetected
expr: increase(node_edac_correctable_errors_total[1m]) > 0
for: 0m
labels:
severity: info
annotations:
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'
- alert: HostEdacUncorrectableErrorsDetected
expr: node_edac_uncorrectable_errors_total > 0
for: 0m
labels:
severity: warning
annotations:
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'
- alert: HostNetworkReceiveErrors
expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Receive Errors (instance {{ $labels.instance }})
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last five minutes.'
- alert: HostNetworkTransmitErrors
expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last five minutes.'
- alert: HostNetworkInterfaceSaturated
expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
for: 1m
labels:
severity: warning
annotations:
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
description: The network interface "{{ $labels.interface }}" on "{{ $labels.instance }}" is getting overloaded.
- alert: HostNetworkBondDegraded
expr: node_bonding_active != node_bonding_slaves {master=~"bond.*"}
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Bond Degraded
- alert: HostConntrackLimit
expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
for: 5m
labels:
severity: warning
annotations:
summary: Host conntrack limit (instance {{ $labels.instance }})
description: The number of conntrack entries is approaching the limit
- alert: HostClockSkew
expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
for: 2m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{ $labels.instance }})
description: Clock skew detected. Clock is out of sync.
- alert: HostClockNotSynchronising
expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{ $labels.instance }})
description: Clock not synchronising.
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: smart
spec:
groups:
- name: smart
rules:
- alert: SmartSSDWriteRateTooHigh
expr: rate(smartmon_total_lbas_written_raw_value[72h]) * 512 > 10000000
for: 5m
labels:
severity: warning
annotations:
summary: SSD write rate exceeds 10MB/s
description: At this rate the SSD will be worn out before warranty period expires
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: temperatures
spec:
groups:
- name: temperatures
rules:
- alert: HighDiskTemperature
expr: smartmon_airflow_temperature_cel_raw_value > 45 or smartmon_temperature_celsius_raw_value > 45
for: 10m
labels:
severity: critical
annotations:
summary: High HDD/SSD temperature indicates high ambient temperature
- alert: HighChipsetTemperature
expr: node_hwmon_temp_celsius > 65
for: 10m
labels:
severity: warning
annotations:
summary: High chipset (CPU, NB) temperature indicates insufficient or failing fans
- alert: LowDiskTemperature
expr: smartmon_airflow_temperature_cel_raw_value < 10 or smartmon_temperature_celsius_raw_value < 10
for: 10m
labels:
severity: critical
annotations:
summary: Low HDD/SSD temperature indicates low ambient temperature and stuck server room exhaust fan relay
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: node-exporter
spec:
selector:
matchLabels:
app: node-exporter
podMetricsEndpoints:
- port: web
scrapeTimeout: 30s
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: node-exporter
---
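# node_exporter runs on every node with hostNetwork/hostPID so it sees the
# node's real network and process view; it listens on 9101 and is scraped via
# the node-exporter PodMonitor above.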
apiVersion: apps/v1
kind: DaemonSet
metadata:
labels:
app: node-exporter
name: node-exporter
annotations:
keel.sh/policy: force
keel.sh/trigger: poll
keel.sh/pollSchedule: "@midnight"
spec:
selector:
matchLabels:
app: node-exporter
template:
metadata:
labels:
app: node-exporter
spec:
containers:
- name: node-exporter
args:
- --web.listen-address=0.0.0.0:9101
- --path.sysfs=/host/sys
- --path.rootfs=/host/root
- --no-collector.wifi
- --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)
- --collector.netclass.ignored-devices=^(veth.*|[a-f0-9]{15})$
- --collector.netdev.device-exclude=^(veth.*|[a-f0-9]{15})$
image: prom/node-exporter:v1.3.1
resources:
limits:
cpu: 50m
memory: 180Mi
requests:
cpu: 5m
memory: 20Mi
volumeMounts:
- mountPath: /host/sys
mountPropagation: HostToContainer
name: sys
readOnly: true
- mountPath: /host/root
mountPropagation: HostToContainer
name: root
readOnly: true
ports:
- containerPort: 9101
name: web
securityContext:
runAsGroup: 65532
runAsNonRoot: true
runAsUser: 65532
readOnlyRootFilesystem: true
hostNetwork: true
hostPID: true
securityContext:
runAsNonRoot: true
runAsUser: 65534
serviceAccountName: node-exporter
tolerations:
- operator: Exists
volumes:
- hostPath:
path: /sys
name: sys
- hostPath:
path: /
name: root


@@ -0,0 +1,172 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: snmp-exporter
spec:
replicas: 2
selector:
matchLabels:
app: snmp-exporter
template:
metadata:
labels:
app: snmp-exporter
spec:
containers:
- image: prom/snmp-exporter:latest
name: snmp-exporter
imagePullPolicy: Always
securityContext:
runAsNonRoot: true
runAsUser: 1000
readOnlyRootFilesystem: true
ports:
- containerPort: 9116
name: exporter
livenessProbe:
httpGet:
path: /health
port: exporter
readinessProbe:
httpGet:
path: /health
port: exporter
volumeMounts:
- name: snmp-exporter
mountPath: /etc/snmp_exporter
volumes:
- name: snmp-exporter
configMap:
name: snmp-exporter
nodeSelector:
dedicated: monitoring
tolerations:
- key: dedicated
operator: Equal
value: monitoring
effect: NoSchedule
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: app
operator: In
values:
- snmp-exporter
topologyKey: "kubernetes.io/hostname"
---
kind: Service
apiVersion: v1
metadata:
name: snmp-exporter
spec:
type: ClusterIP
ports:
- name: exporter
port: 9116
protocol: TCP
selector:
app: snmp-exporter
---
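# SNMP probes: the module names used below (rfc1628_ups, printer_mib,
# epson_beamer) must exist in the snmp.yml ConfigMap created per the README.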
kind: Probe
apiVersion: monitoring.coreos.com/v1
metadata:
name: ups
spec:
interval: 60s
module: rfc1628_ups
prober:
url: snmp-exporter:9116
path: /snmp
targets:
staticConfig:
static:
- ups-4.mgmt.k-space.ee
- ups-5.mgmt.k-space.ee
- ups-6.mgmt.k-space.ee
- ups-7.mgmt.k-space.ee
- ups-8.mgmt.k-space.ee
- ups-9.mgmt.k-space.ee
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: ups
spec:
groups:
- name: ups
rules:
- alert: UPSBatteryLost
annotations:
summary: One or more UPS-es have degraded batteries.
expr: snmp_upsBatteryStatus{upsBatteryStatus!="batteryNormal"} > 0
for: 1m
labels:
severity: critical
- alert: UPSPowerLost
annotations:
summary: One or more UPS-es are not in normal operation mode. This means either
power is lost or the UPS was overloaded and is now in bypass mode.
expr: sum(snmp_upsOutputSource { upsOutputSource = 'normal' }) < 6
for: 1m
labels:
severity: critical
- alert: UPSExcessivelyLoaded
annotations:
summary: One or more UPS-es are loaded more than 50%. Make sure load is balanced
across the UPS-es and that no UPS stays above 50% load.
expr: snmp_upsOutputPercentLoad > 80
for: 1h
labels:
severity: critical
---
kind: Probe
apiVersion: monitoring.coreos.com/v1
metadata:
name: printer
spec:
interval: 60s
scrapeTimeout: 50s
module: printer_mib
prober:
url: snmp-exporter:9116
path: /snmp
targets:
staticConfig:
static:
- mfp-cyber.pub.k-space.ee
- mfp-chaos.pub.k-space.ee
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: printer
spec:
groups:
- name: printer
rules:
- alert: PrinterNeedsAttention
annotations:
summary: Printer is in an error state. If the underlying reason is 'low on paper',
make sure there is enough paper near the printer. If not, drop a line to
accounting@k-space.ee to order more office supplies.
expr: snmp_hrPrinterDetectedErrorState == 1
for: 0m
labels:
severity: warning
---
kind: Probe
apiVersion: monitoring.coreos.com/v1
metadata:
name: beamer
spec:
interval: 60s
module: epson_beamer
prober:
url: snmp-exporter:9116
path: /snmp
targets:
staticConfig:
static:
- beamer-cyber.sec.k-space.ee

prometheus-operator/snmp.yml (new file, 1272 lines): diff suppressed because it is too large.