Move Prometheus instance to monitoring namespace

This commit is contained in:
2023-08-19 09:13:59 +03:00
parent 62661efc42
commit 6e2f353916
9 changed files with 98 additions and 86 deletions

View File

@@ -1,28 +1,11 @@
# Prometheus operator
To deploy Prometheus operator:
```
curl -L https://github.com/prometheus-operator/prometheus-operator/releases/download/v0.61.1/bundle.yaml | sed -e 's/namespace: default/namespace: prometheus-operator/g' > bundle.yml
kubectl create namespace prometheus-operator
kubectl apply --server-side -n prometheus-operator -f bundle.yml
kubectl delete -n prometheus-operator configmap snmp-exporter
kubectl create -n prometheus-operator configmap snmp-exporter --from-file=snmp.yml
kubectl apply -n prometheus-operator -f application.yml -f node-exporter.yml -f blackbox-exporter.yml -f snmp-exporter.yml -f mikrotik-exporter.yml
```
# Slack
```
kubectl create -n prometheus-operator secret generic slack-secrets \
--from-literal=webhook-url=https://hooks.slack.com/services/...
```
# Mikrotik exporter
```
kubectl create -n prometheus-operator secret generic mikrotik-exporter \
--from-literal=MIKROTIK_PASSWORD='<mikrotik-password>' \
--from-literal=PROMETHEUS_BEARER_TOKEN=$(cat /dev/urandom | base64 | head -c 30)
```
Note: Do not put any Prometheus instances or exporters in this namespace; deploy them in the `monitoring` namespace instead.

View File

@@ -1,547 +0,0 @@
---
# Alert routing: everything goes to the "null" receiver by default;
# alerts labelled severity=critical are sent to Slack instead.
apiVersion: monitoring.coreos.com/v1alpha1
kind: AlertmanagerConfig
metadata:
  name: alertmanager
  labels:
    app.kubernetes.io/name: alertmanager
spec:
  receivers:
    # Receiver without any integration; the name is quoted so it stays a string.
    - name: "null"
    - name: "slack-notifications"
      slackConfigs:
        - apiURL:
            # Webhook URL is read from this Secret key.
            key: webhook-url
            name: slack-secrets
          channel: "#kube-prod"
          sendResolved: true
  route:
    # Default receiver for alerts no child route picks up.
    receiver: "null"
    routes:
      - receiver: slack-notifications
        continue: false
        matchers:
          - name: severity
            matchType: '='
            value: critical
---
# Generic pod scrape: empty label selector, scraping the container ports
# named "exporter" and "metrics".
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: metrics
spec:
  podMetricsEndpoints:
    - port: exporter
    - port: metrics
  selector: {}
  namespaceSelector: {}
---
# Three-replica Alertmanager managed by the Prometheus operator, pinned to
# nodes labelled/tainted dedicated=monitoring.
apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
  name: alertmanager
spec:
  alertmanagerConfigMatcherStrategy:
    type: None
  alertmanagerConfigNamespaceSelector: {}
  alertmanagerConfigSelector: {}
  # Global configuration is taken from the AlertmanagerConfig of this name.
  alertmanagerConfiguration:
    name: alertmanager
  # Secrets made available to the Alertmanager pods.
  secrets:
    - slack-secrets
  nodeSelector:
    dedicated: monitoring
  tolerations:
    - key: dedicated
      operator: Equal
      value: monitoring
      effect: NoSchedule
  replicas: 3
  serviceAccountName: alertmanager
  # Public URL used in generated links. The Ingress for am.k-space.ee is
  # served on the TLS ("websecure") entrypoint and the OIDC client is
  # registered with https://, so the scheme here must be https as well.
  externalUrl: https://am.k-space.ee/
  routePrefix: "/"
  securityContext:
    fsGroup: 2000
    runAsGroup: 2000
    runAsNonRoot: true
    runAsUser: 1000
---
# Service account referenced by the Alertmanager resource (serviceAccountName).
kind: ServiceAccount
apiVersion: v1
metadata:
  name: alertmanager
---
# Two-replica Prometheus managed by the Prometheus operator, pinned to nodes
# labelled/tainted dedicated=monitoring.
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: prometheus
spec:
  nodeSelector:
    dedicated: monitoring
  tolerations:
    - key: dedicated
      operator: Equal
      value: monitoring
      effect: NoSchedule
  # Alerts are delivered to the operator-created Alertmanager service.
  alerting:
    alertmanagers:
      - namespace: prometheus-operator
        name: alertmanager-operated
        port: web
  # Public URL used in generated links. The Ingress for prom.k-space.ee is
  # served on the TLS ("websecure") entrypoint and the OIDC client is
  # registered with https://, so the scheme here must be https as well.
  externalUrl: "https://prom.k-space.ee/"
  replicas: 2
  shards: 1
  serviceAccountName: prometheus
  securityContext:
    fsGroup: 2000
    runAsGroup: 2000
    runAsNonRoot: true
    runAsUser: 1000
  # Empty selectors: no label filtering on monitors, probes or rules.
  serviceMonitorNamespaceSelector: {}
  serviceMonitorSelector: {}
  podMonitorNamespaceSelector: {}
  podMonitorSelector: {}
  probeNamespaceSelector: {}
  probeSelector: {}
  ruleNamespaceSelector: {}
  ruleSelector: {}
  # Size-based retention cap, kept below the 10Gi volume requested below.
  retentionSize: 8GB
  storage:
    volumeClaimTemplate:
      spec:
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 10Gi
        storageClassName: local-path
---
# Service account referenced by the Prometheus resource and bound to the
# "prometheus" ClusterRole.
kind: ServiceAccount
apiVersion: v1
metadata:
  name: prometheus
---
# Read-only cluster permissions for the "prometheus" role.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  # Core API objects: read and watch.
  - apiGroups:
      - ""
    resources:
      - nodes
      - nodes/metrics
      - services
      - endpoints
      - pods
    verbs:
      - get
      - list
      - watch
  # ConfigMaps: get only.
  - apiGroups:
      - ""
    resources:
      - configmaps
    verbs:
      - get
  - apiGroups:
      - networking.k8s.io
    resources:
      - ingresses
    verbs:
      - get
      - list
      - watch
  # Non-resource /metrics endpoint.
  - nonResourceURLs:
      - /metrics
    verbs:
      - get
---
# Grants the "prometheus" ClusterRole to the prometheus ServiceAccount in the
# prometheus-operator namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
subjects:
  - kind: ServiceAccount
    namespace: prometheus-operator
    name: prometheus
roleRef:
  kind: ClusterRole
  name: prometheus
  apiGroup: rbac.authorization.k8s.io
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: prometheus
spec:
groups:
- name: prometheus
rules:
- alert: PrometheusJobMissing
annotations:
description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n \
\ LABELS = {{ $labels }}"
summary: Prometheus job missing (instance {{ $labels.instance }})
expr: absent(up{job="prometheus-operator/prometheus"})
for: 0m
labels:
severity: warning
- alert: PrometheusTargetMissing
annotations:
description: "A Prometheus target has disappeared. An exporter might be crashed.\n\
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus target missing (instance {{ $labels.instance }})
expr: up == 0
for: 5m
labels:
severity: critical
- alert: PrometheusAllTargetsMissing
annotations:
description: "A Prometheus job does not have living target anymore.\n VALUE\
\ = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus all targets missing (instance {{ $labels.instance }})
expr: count by (job) (up) == 0
for: 0m
labels:
severity: critical
- alert: PrometheusConfigurationReloadFailure
annotations:
description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n\
\ LABELS = {{ $labels }}"
summary: Prometheus configuration reload failure (instance {{ $labels.instance
}})
expr: prometheus_config_last_reload_successful != 1
for: 0m
labels:
severity: warning
# Fires when the process start time of a Prometheus, Pushgateway or
# Alertmanager job changed more than twice within 15 minutes.
# Regex label matchers are fully anchored, so the bare names never matched
# the namespaced job labels used by this rule group (for example
# "prometheus-operator/prometheus"); an optional "<namespace>/" prefix is
# therefore accepted.
- alert: PrometheusTooManyRestarts
  annotations:
    description: "Prometheus has restarted more than twice in the last 15 minutes.\
      \ It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels\
      \ }}"
    summary: Prometheus too many restarts (instance {{ $labels.instance }})
  expr: changes(process_start_time_seconds{job=~"(.+/)?(prometheus|pushgateway|alertmanager)"}[15m]) > 2
  for: 0m
  labels:
    severity: warning
- alert: PrometheusAlertmanagerJobMissing
annotations:
description: "A Prometheus AlertManager job has disappeared\n VALUE = {{\
\ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus AlertManager job missing (instance {{ $labels.instance
}})
expr: absent(up{job="prometheus-operator/alertmanager"})
for: 0m
labels:
severity: warning
- alert: PrometheusAlertmanagerConfigurationReloadFailure
annotations:
description: "AlertManager configuration reload error\n VALUE = {{ $value\
\ }}\n LABELS = {{ $labels }}"
summary: Prometheus AlertManager configuration reload failure (instance {{
$labels.instance }})
expr: alertmanager_config_last_reload_successful != 1
for: 0m
labels:
severity: warning
- alert: PrometheusAlertmanagerConfigNotSynced
annotations:
description: "Configurations of AlertManager cluster instances are out of\
\ sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus AlertManager config not synced (instance {{ $labels.instance
}})
expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
for: 0m
labels:
severity: warning
- alert: PrometheusNotConnectedToAlertmanager
annotations:
description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value\
\ }}\n LABELS = {{ $labels }}"
summary: Prometheus not connected to alertmanager (instance {{ $labels.instance
}})
expr: prometheus_notifications_alertmanagers_discovered < 1
for: 0m
labels:
severity: critical
- alert: PrometheusRuleEvaluationFailures
annotations:
description: "Prometheus encountered {{ $value }} rule evaluation failures,\
\ leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS\
\ = {{ $labels }}"
summary: Prometheus rule evaluation failures (instance {{ $labels.instance
}})
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTemplateTextExpansionFailures
annotations:
description: "Prometheus encountered {{ $value }} template text expansion\
\ failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus template text expansion failures (instance {{ $labels.instance
}})
expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusRuleEvaluationSlow
annotations:
description: "Prometheus rule evaluation took more time than the scheduled\
\ interval. It indicates a slower storage backend access or too complex\
\ query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
for: 5m
labels:
severity: warning
- alert: PrometheusNotificationsBacklog
annotations:
description: "The Prometheus notification queue has not been empty for 10\
\ minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus notifications backlog (instance {{ $labels.instance }})
expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
for: 0m
labels:
severity: warning
- alert: PrometheusAlertmanagerNotificationFailing
annotations:
description: "Alertmanager is failing sending notifications\n VALUE = {{\
\ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus AlertManager notification failing (instance {{ $labels.instance
}})
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTargetEmpty
annotations:
description: "Prometheus has no target in service discovery\n VALUE = {{\
\ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus target empty (instance {{ $labels.instance }})
expr: prometheus_sd_discovered_targets == 0
for: 0m
labels:
severity: critical
- alert: PrometheusLargeScrape
annotations:
description: "Prometheus has many scrapes that exceed the sample limit\n \
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus large scrape (instance {{ $labels.instance }})
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) >
10
for: 5m
labels:
severity: warning
- alert: PrometheusTargetScrapeDuplicate
annotations:
description: "Prometheus has many samples rejected due to duplicate timestamps\
\ but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus target scrape duplicate (instance {{ $labels.instance
}})
expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m])
> 0
for: 0m
labels:
severity: warning
- alert: PrometheusTsdbCheckpointCreationFailures
annotations:
description: "Prometheus encountered {{ $value }} checkpoint creation failures\n\
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance
}})
expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTsdbCheckpointDeletionFailures
annotations:
description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n\
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance
}})
expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTsdbCompactionsFailed
annotations:
description: "Prometheus encountered {{ $value }} TSDB compactions failures\n\
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB compactions failed (instance {{ $labels.instance
}})
expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTsdbHeadTruncationsFailed
annotations:
description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n\
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance
}})
expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTsdbReloadFailures
annotations:
description: "Prometheus encountered {{ $value }} TSDB reload failures\n \
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTsdbWalCorruptions
annotations:
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n \
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB WAL is corrupt, make sure there is enough disk space
and wipe /data/wal
expr: increase(prometheus_tsdb_wal_corruptions_total[2h]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTsdbWalTruncationsFailed
annotations:
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n\
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance
}})
expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
---
# Exposes the operator-created Prometheus service at prom.k-space.ee through
# Traefik, behind the prometheus-operator-prometheus middleware.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: prometheus
  annotations:
    external-dns.alpha.kubernetes.io/target: traefik.k-space.ee
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
    traefik.ingress.kubernetes.io/router.middlewares: prometheus-operator-prometheus@kubernetescrd
    # Annotation values are strings, hence the quoting.
    traefik.ingress.kubernetes.io/router.tls: 'true'
spec:
  tls:
    - hosts:
        - '*.k-space.ee'
  rules:
    - host: prom.k-space.ee
      http:
        paths:
          - path: '/'
            pathType: Prefix
            backend:
              service:
                name: prometheus-operated
                port:
                  number: 9090
---
# Exposes the operator-created Alertmanager service at am.k-space.ee through
# Traefik, behind the prometheus-operator-alertmanager middleware.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: alertmanager
  annotations:
    external-dns.alpha.kubernetes.io/target: traefik.k-space.ee
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
    traefik.ingress.kubernetes.io/router.middlewares: prometheus-operator-alertmanager@kubernetescrd
    # Annotation values are strings, hence the quoting.
    traefik.ingress.kubernetes.io/router.tls: 'true'
spec:
  tls:
    - hosts:
        - '*.k-space.ee'
  rules:
    - host: am.k-space.ee
      http:
        paths:
          - path: '/'
            pathType: Prefix
            backend:
              service:
                name: alertmanager-operated
                port:
                  number: 9093
---
# Scrapes the "web" port of pods labelled app.kubernetes.io/name=prometheus.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: prometheus
spec:
  podMetricsEndpoints:
    - port: web
  selector:
    matchLabels:
      app.kubernetes.io/name: prometheus
---
# Scrapes the "web" port of pods labelled app.kubernetes.io/name=alertmanager.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: alertmanager
spec:
  podMetricsEndpoints:
    - port: web
  selector:
    matchLabels:
      app.kubernetes.io/name: alertmanager
---
# Scrapes the "http" port of pods labelled
# app.kubernetes.io/name=prometheus-operator.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: operator
spec:
  podMetricsEndpoints:
    - port: http
  selector:
    matchLabels:
      app.kubernetes.io/name: prometheus-operator
---
# Scrapes the kubelet service in kube-system over HTTPS: once on the default
# metrics path and once on /metrics/cadvisor.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kubelet
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: kubelet
  namespaceSelector:
    matchNames:
      - kube-system
  endpoints:
    # Endpoint without an explicit path.
    - port: https-metrics
      scheme: https
      interval: 30s
      honorLabels: true
      bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
      tlsConfig:
        # Certificate verification is disabled for this endpoint.
        insecureSkipVerify: true
    # cAdvisor endpoint on the same port.
    - port: https-metrics
      scheme: https
      path: /metrics/cadvisor
      interval: 30s
      honorLabels: true
      bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
      tlsConfig:
        insecureSkipVerify: true
---
# OIDC gateway client for https://prom.k-space.ee, restricted to the
# k-space:floor group.
apiVersion: codemowers.io/v1alpha1
kind: OIDCGWMiddlewareClient
metadata:
  name: prometheus
spec:
  uri: "https://prom.k-space.ee"
  displayName: Prometheus
  allowedGroups:
    - "k-space:floor"
  # Maps identity attributes to the listed header names.
  headerMapping:
    user: Remote-Username
    name: Remote-Name
    email: Remote-Email
    groups: Remote-Groups
---
# OIDC gateway client for https://am.k-space.ee, restricted to the
# k-space:kubernetes:admins group.
apiVersion: codemowers.io/v1alpha1
kind: OIDCGWMiddlewareClient
metadata:
  name: alertmanager
spec:
  uri: "https://am.k-space.ee"
  displayName: AlertManager
  allowedGroups:
    - "k-space:kubernetes:admins"
  # Maps identity attributes to the listed header names.
  headerMapping:
    user: Remote-Username
    name: Remote-Name
    email: Remote-Email
    groups: Remote-Groups

View File

@@ -1,258 +0,0 @@
---
# HTTP probes (blackbox module http_2xx) against the listed web endpoints.
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: websites
spec:
  module: http_2xx
  prober:
    path: /probe
    url: blackbox-exporter
  targets:
    staticConfig:
      static:
        - "https://git.k-space.ee/"
        - "https://grafana.k-space.ee/"
        - "https://wiki.k-space.ee/"
        - "https://pad.k-space.ee/"
        - "https://members.k-space.ee/"
        - "https://nextcloud.k-space.ee/"
        - "http://minio.infra.k-space.ee:9001/login"
---
# DNS probes (blackbox module dns_check_traefik) sent to the two listed
# addresses.
# NOTE(review): the probe is named k6.ee but uses dns_check_traefik; confirm
# that the traefik.k-space.ee query is the intended check here.
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: k6.ee
spec:
  module: dns_check_traefik
  prober:
    path: /probe
    url: blackbox-exporter
  targets:
    staticConfig:
      static:
        - "193.40.103.2"
        - "62.65.250.2"
---
# TCP connect probes (blackbox module tcp_connect) against port 636 of the
# three dc*.ad.k-space.ee hosts.
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: samba-cluster
spec:
  prober:
    url: blackbox-exporter
    # blackbox-exporter runs probe modules only on /probe; /metrics returns
    # the exporter's own metrics and ignores the module/target parameters,
    # so the targets below were never actually probed.
    path: /probe
  module: tcp_connect
  targets:
    staticConfig:
      static:
        - dc1.ad.k-space.ee:636
        - dc2.ad.k-space.ee:636
        - dc3.ad.k-space.ee:636
---
# TCP connect probes (blackbox module tcp_connect) against miscellaneous
# host:port endpoints.
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: misc
spec:
  prober:
    url: blackbox-exporter
    # blackbox-exporter runs probe modules only on /probe; /metrics returns
    # the exporter's own metrics and ignores the module/target parameters,
    # so the targets below were never actually probed.
    path: /probe
  module: tcp_connect
  targets:
    staticConfig:
      static:
        - mail.k-space.ee:465
        - dev.k-space.ee:10648
        - mariadb.infra.k-space.ee:3306
---
# Alerting rules evaluated on blackbox-exporter probe metrics.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: blackbox-exporter
spec:
  # https://awesome-prometheus-alerts.grep.to/rules#blackbox
  groups:
    - name: blackbox
      rules:
        - alert: BlackboxProbeFailed
          expr: probe_success == 0
          for: 2m
          annotations:
            summary: Blackbox probe failed (instance {{ $labels.instance }})
            description: Probe failed
          labels:
            severity: critical
        - alert: BlackboxSlowProbe
          expr: avg_over_time(probe_duration_seconds[1m]) > 1
          for: 5m
          annotations:
            summary: Blackbox slow probe (instance {{ $labels.instance }})
            description: Blackbox probe took more than 1s to complete
          labels:
            severity: warning
        - alert: BlackboxSlowDNS
          expr: avg_over_time(probe_dns_lookup_time_seconds[1m]) > 1
          for: 5m
          annotations:
            summary: Blackbox slow DNS lookup (instance {{ $labels.instance }})
            description: >-
              Blackbox DNS lookup took more than 1s to complete.
              It seemed using IPv6 DNS servers in conjunction with Docker resulted
              in odd 5s latency bump. For now we're using 8.8.8.8 because of that
          labels:
            severity: warning
        - alert: BlackboxProbeHttpFailure
          expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
          for: 5m
          annotations:
            summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
            description: HTTP status code is not 200-399
          labels:
            severity: critical
        # Two alerts share this name: warning at 30 days, critical at 3 days.
        - alert: BlackboxSslCertificateWillExpireSoon
          expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
          for: 0m
          annotations:
            summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
            description: SSL certificate expires in 30 days
          labels:
            severity: warning
        - alert: BlackboxSslCertificateWillExpireSoon
          expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
          for: 0m
          annotations:
            summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
            description: SSL certificate expires in 3 days
          labels:
            severity: critical
        - alert: BlackboxSslCertificateExpired
          expr: probe_ssl_earliest_cert_expiry - time() <= 0
          for: 0m
          annotations:
            summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
            description: SSL certificate has expired already
          labels:
            severity: critical
        - alert: BlackboxProbeSlowHttp
          expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
          for: 1m
          annotations:
            summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
            description: HTTP request took more than 1s
          labels:
            severity: warning
        - alert: BlackboxProbeSlowPing
          expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
          for: 1m
          annotations:
            summary: Blackbox probe slow ping (instance {{ $labels.instance }})
            description: Blackbox ping took more than 1s
          labels:
            severity: warning
---
# Three blackbox-exporter replicas on dedicated=monitoring nodes, at most one
# per host, configured from the blackbox-exporter-config ConfigMap.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: blackbox-exporter
spec:
  replicas: 3
  revisionHistoryLimit: 0
  selector:
    matchLabels:
      app: blackbox-exporter
  template:
    metadata:
      labels:
        app: blackbox-exporter
    spec:
      # TODO: Results in odd 6s connection lag if scheduled in VLAN20
      nodeSelector:
        dedicated: monitoring
      tolerations:
        - key: dedicated
          operator: Equal
          value: monitoring
          effect: NoSchedule
      affinity:
        podAntiAffinity:
          # Hard requirement: no two pods labelled app=blackbox-exporter on
          # the same hostname.
          requiredDuringSchedulingIgnoredDuringExecution:
            - topologyKey: 'kubernetes.io/hostname'
              labelSelector:
                matchExpressions:
                  - key: app
                    operator: In
                    values:
                      - blackbox-exporter
      volumes:
        - name: blackbox-exporter-config
          configMap:
            name: blackbox-exporter-config
      containers:
        - name: blackbox-exporter
          image: prom/blackbox-exporter:v0.20.0
          volumeMounts:
            - name: blackbox-exporter-config
              mountPath: /etc/blackbox_exporter
---
# ClusterIP service mapping port 80 to container port 9115 of the
# app=blackbox-exporter pods.
apiVersion: v1
kind: Service
metadata:
  name: blackbox-exporter
spec:
  type: ClusterIP
  selector:
    app: blackbox-exporter
  ports:
    - name: http
      protocol: TCP
      port: 80
      targetPort: 9115
---
# blackbox-exporter configuration, mounted by the blackbox-exporter Deployment.
# Defines the probe modules http_2xx, http_post_2xx, tcp_connect, icmp,
# dns_check_traefik and dns_check_k6; every module prefers IPv4 and disables
# IP protocol fallback. The two DNS modules query an A record and fail unless
# an answer matches the 193.40.103.x pattern below.
apiVersion: v1
kind: ConfigMap
metadata:
name: blackbox-exporter-config
data:
config.yml: |-
modules:
http_2xx:
prober: http
http:
preferred_ip_protocol: "ip4"
ip_protocol_fallback: false
http_post_2xx:
prober: http
http:
method: POST
preferred_ip_protocol: "ip4"
ip_protocol_fallback: false
tcp_connect:
prober: tcp
tcp:
preferred_ip_protocol: "ip4"
ip_protocol_fallback: false
icmp:
prober: icmp
icmp:
preferred_ip_protocol: "ip4"
ip_protocol_fallback: false
dns_check_traefik:
prober: dns
dns:
query_name: "traefik.k-space.ee"
query_type: "A"
validate_answer_rrs:
fail_if_not_matches_regexp:
- "traefik\\.k-space\\.ee\\.\\t.*\\tIN\\tA\\t193\\.40\\.103\\.[1-9][0-9]*"
preferred_ip_protocol: "ip4"
ip_protocol_fallback: false
dns_check_k6:
prober: dns
dns:
query_name: "k6.ee"
query_type: "A"
validate_answer_rrs:
fail_if_not_matches_regexp:
- "k6\\.ee\\.\\t.*\\tIN\\tA\\t193\\.40\\.103\\.[1-9][0-9]*"
preferred_ip_protocol: "ip4"
ip_protocol_fallback: false

View File

@@ -1,110 +0,0 @@
---
# Scrapes the listed MikroTik devices through the mikrotik-exporter service,
# authenticating with the bearer token stored in the mikrotik-exporter Secret.
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: mikrotik
spec:
  prober:
    url: mikrotik-exporter
    path: /metrics
  bearerTokenSecret:
    key: PROMETHEUS_BEARER_TOKEN
    name: mikrotik-exporter
  targets:
    staticConfig:
      static:
        - router.mgmt.k-space.ee
        - sw_chaos.mgmt.k-space.ee
        - sw_poe.mgmt.k-space.ee
        - sw_mgmt.mgmt.k-space.ee
        - sw_core02.mgmt.k-space.ee
        - sw_cyber.mgmt.k-space.ee
        - sw_ha.mgmt.k-space.ee
        - sw_asocial.mgmt.k-space.ee
        - sw_kitchen.mgmt.k-space.ee
        - sw_core01.mgmt.k-space.ee
---
# Alerting rules for the SFP+ links reported by mikrotik-exporter.
# NOTE(review): both rules use severity "error", whereas the other rule
# groups use info/warning/critical and the Alertmanager route forwards only
# severity=critical to Slack. Confirm that "error" is intended.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: mikrotik
spec:
  groups:
    - name: mikrotik
      rules:
        # sfp-sfpplus1/2 not running on any instance other than
        # sw_core* / sw_mgmt*.
        - alert: MikrotikUplinkRedundancyLost
          expr: mikrotik_interface_running{port=~"sfp-sfpplus[12]", instance!~"sw_core.*", instance!~"sw_mgmt.*"} == 0
          for: 0m
          annotations:
            summary: Switch uplink high availability lost
            description: One of the two 10Gb optical links is malfunctioning
          labels:
            severity: error
        # Any SFP+ port reporting a rate below 10000000000.
        - alert: MikrotikLinkRateDegraded
          expr: mikrotik_interface_rate{port=~"sfp-sfpplus.*"} < 10000000000
          for: 0m
          annotations:
            summary: 10Gb link degraded
            description: One of the 10Gb links is running at lower speed
          labels:
            severity: error
---
# Two mikrotik-exporter replicas on dedicated=monitoring nodes, at most one
# per host. Environment variables are loaded from the mikrotik-exporter
# Secret in addition to the explicit MIKROTIK_USER.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: mikrotik-exporter
spec:
  replicas: 2
  revisionHistoryLimit: 0
  selector:
    matchLabels:
      app: mikrotik-exporter
  template:
    metadata:
      annotations:
        co.elastic.logs/multiline.match: after
        co.elastic.logs/multiline.negate: 'false'
        co.elastic.logs/multiline.pattern: "^ "
      labels:
        app: mikrotik-exporter
    spec:
      nodeSelector:
        dedicated: monitoring
      tolerations:
        - key: dedicated
          operator: Equal
          value: monitoring
          effect: NoSchedule
      affinity:
        podAntiAffinity:
          # Hard requirement: no two pods labelled app=mikrotik-exporter on
          # the same hostname.
          requiredDuringSchedulingIgnoredDuringExecution:
            - topologyKey: 'kubernetes.io/hostname'
              labelSelector:
                matchExpressions:
                  - key: app
                    operator: In
                    values:
                      - mikrotik-exporter
      containers:
        - name: mikrotik-exporter
          image: harbor.k-space.ee/k-space/mikrotik-exporter:latest
          envFrom:
            - secretRef:
                name: mikrotik-exporter
          env:
            - name: MIKROTIK_USER
              value: netpoller
---
# ClusterIP service mapping port 80 to container port 3001 of the
# app=mikrotik-exporter pods.
apiVersion: v1
kind: Service
metadata:
  name: mikrotik-exporter
spec:
  type: ClusterIP
  selector:
    app: mikrotik-exporter
  ports:
    - name: http
      protocol: TCP
      port: 80
      targetPort: 3001

View File

@@ -1,430 +0,0 @@
---
# Scrapes port 9100 on the NAS and Proxmox hosts. The relabeling copies
# __param_target into both "instance" and __address__, which presumably makes
# Prometheus scrape each target directly instead of the "localhost" prober;
# confirm.
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: nodes-proxmox
spec:
  scrapeTimeout: 30s
  prober:
    path: /metrics
    url: localhost
  targets:
    staticConfig:
      static:
        - nas.mgmt.k-space.ee:9100
        - pve1.proxmox.infra.k-space.ee:9100
        - pve2.proxmox.infra.k-space.ee:9100
        - pve8.proxmox.infra.k-space.ee:9100
        - pve9.proxmox.infra.k-space.ee:9100
      relabelingConfigs:
        - targetLabel: instance
          sourceLabels:
            - __param_target
        - targetLabel: __address__
          sourceLabels:
            - __param_target
  metricRelabelings:
    - targetLabel: target
      sourceLabels:
        - __address__
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: node-exporter
spec:
groups:
- name: node-exporter
rules:
- alert: ZfsOfflinePool
expr: node_zfs_zpool_state{state!="online"} > 0
for: 1m
labels:
severity: critical
annotations:
summary: ZFS offline pool (instance {{ $labels.instance }})
description: "A ZFS zpool is in a unexpected state: {{ $labels.state }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostHighLoad
expr: sum(node_load1{}) by (instance) / count(node_cpu_seconds_total{mode="user"}) by (instance) > 2.5
for: 15m
labels:
severity: warning
annotations:
summary: Host under high load
description: Many processes are queued up for execution
# Fires when cached + buffered + free memory stays below 20 % of total memory
# for 2 minutes. The description previously said "< 10% left", which did not
# match the 20 % threshold in the expression.
- alert: HostOutOfMemory
  expr: (node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_MemFree_bytes ) / node_memory_MemTotal_bytes * 100 < 20
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host out of memory (instance {{ $labels.instance }})
    description: Node memory is filling up (< 20% left)
- alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
for: 2m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: The node is under heavy memory pressure. High rate of major page faults
- alert: HostUnusualNetworkThroughputIn
expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) > 800e+06
for: 1h
labels:
severity: warning
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: Host network interfaces are probably receiving too much data (> 800 MB/s)
- alert: HostUnusualNetworkThroughputOut
expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) > 800e+06
for: 1h
labels:
severity: warning
annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: Host network interfaces are probably sending too much data (> 800 MB/s)
- alert: HostUnusualDiskReadRate
expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) > 500e+06
for: 1h
labels:
severity: warning
annotations:
summary: Host unusual disk read rate (instance {{ $labels.instance }})
description: Disk is probably reading too much data (> 500 MB/s)
- alert: HostUnusualDiskWriteRate
expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) > 500e+06
for: 1h
labels:
severity: warning
annotations:
summary: Host unusual disk write rate (instance {{ $labels.instance }})
description: Disk is probably writing too much data (> 500 MB/s)
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: Disk is almost full (< 10% left)
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostDiskWillFillIn24Hours
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
description: Filesystem is predicted to run out of space within the next 24 hours at current write rate
- alert: HostOutOfInodes
expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host out of inodes (instance {{ $labels.instance }})
description: Disk is almost running out of available inodes (< 10% left)
- alert: HostInodesWillFillIn24Hours
expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate
- alert: HostUnusualDiskReadLatency
expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk read latency (instance {{ $labels.instance }})
description: Disk latency is growing (read operations > 100ms)
- alert: HostUnusualDiskWriteLatency
expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write latency (instance {{ $labels.instance }})
description: Disk latency is growing (write operations > 100ms)
- alert: HostCpuStealNoisyNeighbor
expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.
# 1000 context switches is an arbitrary number.
# Alert threshold depends on nature of application.
# Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
- alert: HostContextSwitching
expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 50000
for: 0m
labels:
severity: warning
annotations:
summary: Host context switching (instance {{ $labels.instance }})
description: Context switching is growing on node (> 50000 / s)
- alert: HostSwapIsEnabled
expr: node_memory_SwapTotal_bytes > 0
for: 0m
labels:
severity: warning
annotations:
summary: Swap is discouraged nowadays
- alert: HostPhysicalComponentTooHot
expr: node_hwmon_temp_celsius > 75
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
description: Physical hardware component too hot
- alert: HostNodeOvertemperatureAlarm
expr: node_hwmon_temp_alarm == 1
for: 0m
labels:
severity: critical
annotations:
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
description: Physical node temperature alarm triggered
- alert: HostRaidArrayGotInactive
expr: node_md_state{state="inactive"} > 0
for: 0m
labels:
severity: critical
annotations:
summary: Host RAID array got inactive (instance {{ $labels.instance }})
description: RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
# Fires when an md array has reported at least one failed disk for 2 minutes.
- alert: HostRaidDiskFailure
  expr: 'node_md_disks{state="failed"} > 0'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'Host RAID disk failure (instance {{ $labels.instance }})'
    # The array name is taken from the `device` label, the same label the
    # HostRaidArrayGotInactive rule uses for md metrics; `md_device` rendered
    # as an empty string.
    description: 'At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.device }} needs attention and possibly a disk swap'
# Fires immediately when the OOM-kill counter increased within the last minute.
- alert: HostOomKillDetected
  annotations:
    summary: 'Host OOM kill detected (instance {{ $labels.instance }})'
    description: 'OOM kill detected'
  labels:
    severity: warning
  for: 0m
  expr: 'increase(node_vmstat_oom_kill[1m]) > 0'
# Informational alert: the EDAC correctable-error counter increased within the
# 1-minute window of the expression.
- alert: HostEdacCorrectableErrorsDetected
  expr: 'increase(node_edac_correctable_errors_total[1m]) > 0'
  for: 0m
  labels:
    severity: info
  annotations:
    summary: 'Host EDAC Correctable Errors detected (instance {{ $labels.instance }})'
    # The reported window matches the [1m] range used in expr.
    description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires immediately whenever the EDAC uncorrectable-error counter is non-zero.
# The expression reads the raw counter, not a rate or increase.
- alert: HostEdacUncorrectableErrorsDetected
  expr: 'node_edac_uncorrectable_errors_total > 0'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: 'Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})'
    # $value is the counter's current total, so no time window is claimed.
    description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Ratio of receive errors to received packets (2m rates) above 1% for 2 minutes.
- alert: HostNetworkReceiveErrors
  expr: 'rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'Host Network Receive Errors (instance {{ $labels.instance }})'
    # $value is the error/packet ratio, not an error count, so it is rendered
    # as a percentage instead of being rounded to a whole number.
    description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Ratio of transmit errors to transmitted packets (2m rates) above 1% for
# 2 minutes.
- alert: HostNetworkTransmitErrors
  expr: 'rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'Host Network Transmit Errors (instance {{ $labels.instance }})'
    # $value is the error/packet ratio, not an error count, so it is rendered
    # as a percentage instead of being rounded to a whole number.
    description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Combined receive + transmit byte rate (1m) above 80% of the reported link
# speed for 1 minute. Devices whose name starts with "tap" are excluded.
- alert: HostNetworkInterfaceSaturated
  expr: '(rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8'
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: 'Host Network Interface Saturated (instance {{ $labels.instance }})'
    # The interface name lives in the `device` label (the one the expression
    # filters on); `interface` rendered as an empty string.
    description: "The network interface {{ $labels.device }} on {{ $labels.instance }} is getting overloaded."
# Fires when, for 2 minutes, the active count of a bond differs from its
# configured slave count (masters matching "bond.*" only).
- alert: HostNetworkBondDegraded
  annotations:
    summary: 'Host Network Bond Degraded'
  labels:
    severity: warning
  for: 2m
  expr: 'node_bonding_active != node_bonding_slaves {master=~"bond.*"}'
# Conntrack table usage above 80% of its limit for 5 minutes.
- alert: HostConntrackLimit
  expr: 'node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Host conntrack limit (instance {{ $labels.instance }})'
    description: 'The number of conntrack is approaching limit'
# Clock offset beyond +/-0.05s whose 5m derivative is not moving back towards
# zero (non-negative when ahead, non-positive when behind), held for 2 minutes.
- alert: HostClockSkew
  annotations:
    summary: 'Host clock skew (instance {{ $labels.instance }})'
    description: 'Clock skew detected. Clock is out of sync.'
  labels:
    severity: warning
  for: 2m
  expr: '(node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)'
# Sync status was 0 at some point in the last minute while the maximum clock
# error is at least 16 seconds, held for 2 minutes.
- alert: HostClockNotSynchronising
  annotations:
    summary: 'Host clock not synchronising (instance {{ $labels.instance }})'
    description: 'Clock not synchronising.'
  labels:
    severity: warning
  for: 2m
  expr: 'min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16'
---
# PrometheusRule "smart": alerting on smartmon_* disk metrics.
kind: PrometheusRule
apiVersion: monitoring.coreos.com/v1
metadata:
  name: smart
spec:
  groups:
    - name: smart
      rules:
        # 72h average rate of LBAs written, multiplied by 512 (presumably
        # bytes per LBA - confirm for the drives in use), above 10000000.
        - alert: SmartSSDWriteRateTooHigh
          annotations:
            summary: 'SSD write rate exceeds 10MB/s'
            description: 'At this rate the SSD will be worn out before warranty period expires'
          labels:
            severity: warning
          for: 5m
          expr: 'rate(smartmon_total_lbas_written_raw_value[72h]) * 512 > 10000000'
---
# PrometheusRule "temperatures": disk and chipset temperature alerts.
# Every rule must hold for 10 minutes before firing.
kind: PrometheusRule
apiVersion: monitoring.coreos.com/v1
metadata:
  name: temperatures
spec:
  groups:
    - name: temperatures
      rules:
        # Either SMART temperature attribute above 45.
        - alert: HighDiskTemperature
          annotations:
            summary: 'High HDD/SSD temperature indicates high ambient temperature'
          labels:
            severity: critical
          for: 10m
          expr: 'smartmon_airflow_temperature_cel_raw_value > 45 or smartmon_temperature_celsius_raw_value > 45'
        # Any hwmon temperature sensor above 65.
        - alert: HighChipsetTemperature
          annotations:
            summary: 'High chipset (CPU, NB) temperature indicates insufficient or failing fans'
          labels:
            severity: warning
          for: 10m
          expr: 'node_hwmon_temp_celsius > 65'
        # Either SMART temperature attribute below 10.
        - alert: LowDiskTemperature
          annotations:
            summary: 'Low HDD/SSD temperature indicates low ambient temperature and stuck server room exhaust fan relay'
          labels:
            severity: critical
          for: 10m
          expr: 'smartmon_airflow_temperature_cel_raw_value < 10 or smartmon_temperature_celsius_raw_value < 10'
---
# PodMonitor scraping the "web" port of pods labelled app=node-exporter.
kind: PodMonitor
apiVersion: monitoring.coreos.com/v1
metadata:
  name: node-exporter
spec:
  podMetricsEndpoints:
    - port: web
      scrapeTimeout: 30s
      relabelings:
        # Copy the name of the node the pod runs on into a `node` label.
        - targetLabel: node
          sourceLabels:
            - __meta_kubernetes_pod_node_name
  selector:
    matchLabels:
      app: node-exporter
---
# Service account referenced by the node-exporter DaemonSet.
kind: ServiceAccount
apiVersion: v1
metadata:
  name: node-exporter
---
# node-exporter DaemonSet: runs prom/node-exporter on the host network and PID
# namespace, with the host's /sys and / mounted read-only into the container.
kind: DaemonSet
apiVersion: apps/v1
metadata:
  name: node-exporter
  labels:
    app: node-exporter
  annotations:
    keel.sh/policy: force
    keel.sh/trigger: poll
    keel.sh/pollSchedule: "@midnight"
spec:
  selector:
    matchLabels:
      app: node-exporter
  template:
    metadata:
      labels:
        app: node-exporter
    spec:
      serviceAccountName: node-exporter
      priorityClassName: system-node-critical
      hostNetwork: true
      hostPID: true
      # Pod-level defaults; the container below sets its own user and group.
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
      # A toleration with only `operator: Exists` tolerates every taint.
      tolerations:
        - operator: Exists
      volumes:
        - name: sys
          hostPath:
            path: /sys
        - name: root
          hostPath:
            path: /
      containers:
        - name: node-exporter
          image: prom/node-exporter:v1.5.0
          args:
            # The listen port matches the `web` containerPort below.
            - '--web.listen-address=0.0.0.0:9101'
            # Point the exporter at the host filesystems mounted below.
            - '--path.sysfs=/host/sys'
            - '--path.rootfs=/host/root'
            - '--no-collector.wifi'
            - '--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)'
            # The same device prefixes are excluded from netclass and netdev.
            - '--collector.netclass.ignored-devices=^(veth|cali|vxlan|cni|vnet|tap|lo|wg)'
            - '--collector.netdev.device-exclude=^(veth|cali|vxlan|cni|vnet|tap|lo|wg)'
            - '--collector.diskstats.ignored-devices=^(sr[0-9][0-9]*)$'
          ports:
            - name: web
              containerPort: 9101
          resources:
            requests:
              cpu: 5m
              memory: 20Mi
            limits:
              cpu: 50m
              memory: 180Mi
          securityContext:
            readOnlyRootFilesystem: true
            runAsNonRoot: true
            runAsUser: 65532
            runAsGroup: 65532
          volumeMounts:
            - name: sys
              mountPath: /host/sys
              mountPropagation: HostToContainer
              readOnly: true
            - name: root
              mountPath: /host/root
              mountPropagation: HostToContainer
              readOnly: true

View File

@@ -1,184 +0,0 @@
# snmp-exporter Deployment: two replicas of prom/snmp-exporter, pinned to
# nodes labelled dedicated=monitoring and never co-located on one hostname.
kind: Deployment
apiVersion: apps/v1
metadata:
  name: snmp-exporter
spec:
  replicas: 2
  selector:
    matchLabels:
      app: snmp-exporter
  template:
    metadata:
      labels:
        app: snmp-exporter
    spec:
      nodeSelector:
        dedicated: monitoring
      tolerations:
        - key: dedicated
          operator: Equal
          value: monitoring
          effect: NoSchedule
      affinity:
        podAntiAffinity:
          # Hard requirement: no two app=snmp-exporter pods on one hostname.
          requiredDuringSchedulingIgnoredDuringExecution:
            - topologyKey: "kubernetes.io/hostname"
              labelSelector:
                matchExpressions:
                  - key: app
                    operator: In
                    values:
                      - snmp-exporter
      volumes:
        # Exporter configuration comes from the `snmp-exporter` ConfigMap.
        - name: snmp-exporter
          configMap:
            name: snmp-exporter
      containers:
        - name: snmp-exporter
          image: prom/snmp-exporter:v0.22.0
          imagePullPolicy: IfNotPresent
          ports:
            - name: exporter
              containerPort: 9116
          # Liveness and readiness both probe /health on the exporter port.
          livenessProbe:
            httpGet:
              path: /health
              port: exporter
          readinessProbe:
            httpGet:
              path: /health
              port: exporter
          securityContext:
            readOnlyRootFilesystem: true
            runAsNonRoot: true
            runAsUser: 1000
          volumeMounts:
            - name: snmp-exporter
              mountPath: /etc/snmp_exporter
---
# ClusterIP Service exposing TCP port 9116 of pods labelled app=snmp-exporter.
apiVersion: v1
kind: Service
metadata:
  name: snmp-exporter
spec:
  type: ClusterIP
  selector:
    app: snmp-exporter
  ports:
    - name: exporter
      protocol: TCP
      port: 9116
---
# Probe "ups": queries the listed UPS hosts through snmp-exporter's /snmp
# endpoint every 60s using the rfc1628_ups module.
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: ups
spec:
  module: rfc1628_ups
  interval: 60s
  prober:
    url: 'snmp-exporter:9116'
    path: /snmp
  targets:
    staticConfig:
      static:
        - ups-4.mgmt.k-space.ee
        - ups-7.mgmt.k-space.ee
        - ups-8.mgmt.k-space.ee
        - ups-9.mgmt.k-space.ee
  metricRelabelings:
    # Prefix every scraped metric name with "snmp_".
    - sourceLabels:
        - __name__
      regex: '(.*)'
      replacement: 'snmp_${1}'
      targetLabel: __name__
---
# PrometheusRule "ups": alerts on the snmp_ups* metrics.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: ups
spec:
  groups:
    - name: ups
      rules:
        # Any battery-status series other than "batteryNormal" is non-zero.
        - alert: UPSBatteryLost
          annotations:
            summary: One or more UPS-es have degraded batteries.
          expr: 'snmp_upsBatteryStatus{upsBatteryStatus!="batteryNormal"} > 0'
          for: 1m
          labels:
            severity: critical
        # The sum of "normal" output-source series must equal 4; presumably
        # the number of monitored UPS-es, so keep it in sync with the `ups`
        # Probe targets.
        - alert: UPSPowerLost
          annotations:
            summary: >-
              One or more UPS-es is not in normal operation mode. This either means
              power is lost or UPS was loaded and it's now in bypass mode.
          expr: sum(snmp_upsOutputSource { upsOutputSource = 'normal' }) != 4
          for: 1m
          labels:
            severity: critical
        # Fires after output load stays above 80% for one hour. The summary
        # states the 80% firing threshold; 50% is the load target it asks
        # operators to restore.
        - alert: UPSExcessivelyLoaded
          annotations:
            summary: >-
              One or more UPS-es is loaded more than 80%. Make sure load on UPS-es
              is balanced and load for no UPS stays above 50%.
          expr: 'snmp_upsOutputPercentLoad > 80'
          for: 1h
          labels:
            severity: critical
---
# Probe "printer": queries the printer through snmp-exporter's /snmp endpoint
# every 60s (50s scrape timeout) using the printer_mib module.
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: printer
spec:
  module: printer_mib
  interval: 60s
  scrapeTimeout: 50s
  prober:
    url: 'snmp-exporter:9116'
    path: /snmp
  targets:
    staticConfig:
      static:
        - mfp-chaos.pub.k-space.ee
  metricRelabelings:
    # Prefix every scraped metric name with "snmp_".
    - sourceLabels:
        - __name__
      regex: '(.*)'
      replacement: 'snmp_${1}'
      targetLabel: __name__
---
# PrometheusRule "printer": alerts on the printer's SNMP error state.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: printer
spec:
  groups:
    - name: printer
      rules:
        # Fires immediately when the detected-error-state metric equals 1.
        - alert: PrinterNeedsAttention
          annotations:
            summary: >-
              Printer is in error state. If the underlying reason is 'low on paper'
              make sure there is enough paper near the printer. If not, drop a line at
              accounting@k-space.ee to order more office supplies.
          expr: 'snmp_hrPrinterDetectedErrorState == 1'
          for: 0m
          labels:
            severity: warning
---
# Probe "beamer": queries the projector through snmp-exporter's /snmp endpoint
# every 60s using the epson_beamer module.
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: beamer
spec:
  module: epson_beamer
  interval: 60s
  prober:
    url: 'snmp-exporter:9116'
    path: /snmp
  targets:
    staticConfig:
      static:
        - beamer-cyber.sec.k-space.ee
  metricRelabelings:
    # Prefix every scraped metric name with "snmp_".
    - sourceLabels:
        - __name__
      regex: '(.*)'
      replacement: 'snmp_${1}'
      targetLabel: __name__

File diff suppressed because it is too large Load Diff