# kube/monitoring/blackbox-exporter.yaml
#
# 280 lines
# 7.4 KiB
# YAML

---
# Probe "websites": HTTP(S) checks of the listed URLs, executed by the
# prober reachable at `blackbox-exporter/probe` using the `http_2xx` module.
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: websites
spec:
  # `module` is a field of the Probe spec itself, not of `prober`.
  module: http_2xx
  prober:
    url: blackbox-exporter
    path: /probe
  targets:
    staticConfig:
      static:
        - https://git.k-space.ee/
        - https://grafana.k-space.ee/
        - https://wiki.k-space.ee/
        - https://pad.k-space.ee/
        - https://members.k-space.ee/
        - https://nextcloud.k-space.ee/
        # NOTE(review): the two console targets below use plain http while
        # every other target is https - confirm this is intended.
        - http://external-console.minio-clusters.k-space.ee/login
        - http://shared-console.minio-clusters.k-space.ee/login
---
# Probe "bind": hands each listed address to the `dns_check_traefik` module,
# which is declared as a `dns` prober in the blackbox-exporter-config
# ConfigMap of this file.
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: bind
spec:
  # `module` is a field of the Probe spec itself, not of `prober`.
  module: dns_check_traefik
  prober:
    url: blackbox-exporter
    path: /probe
  targets:
    staticConfig:
      static:
        - 193.40.103.2
        - 62.65.250.2
        - 172.20.53.1
        - 172.20.53.2
        - 172.20.53.3
---
# Probe "samba-cluster": TCP connect check (module `tcp_connect`) against
# port 636 of the listed host.
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: samba-cluster
spec:
  # `module` is a field of the Probe spec itself, not of `prober`.
  module: tcp_connect
  prober:
    url: blackbox-exporter
    path: /probe
  targets:
    staticConfig:
      static:
        - dc1.ad.k-space.ee:636
---
# Probe "misc": TCP connect checks (module `tcp_connect`) for assorted
# host:port endpoints.
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: misc
spec:
  # `module` is a field of the Probe spec itself, not of `prober`.
  module: tcp_connect
  prober:
    url: blackbox-exporter
    path: /probe
  targets:
    staticConfig:
      static:
        # NOTE(review): mail.k-space.ee:465 is also listed in the `wildduck`
        # Probe in this file - confirm the duplicate probing is intended.
        - mail.k-space.ee:465
        - dev.k-space.ee:10648
        - mariadb.infra.k-space.ee:3306
---
# Probe "wildduck": TCP connect checks (module `tcp_connect`) against ports
# 25, 465 and 993 of mail.k-space.ee.
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: wildduck
spec:
  # `module` is a field of the Probe spec itself, not of `prober`.
  module: tcp_connect
  prober:
    url: blackbox-exporter
    path: /probe
  targets:
    staticConfig:
      static:
        - mail.k-space.ee:25
        # NOTE(review): port 465 is also listed in the `misc` Probe in this
        # file - confirm the duplicate probing is intended.
        - mail.k-space.ee:465
        - mail.k-space.ee:993
---
# PrometheusRule "blackbox-exporter": alerting rules evaluated over the
# probe_* metrics (success, duration, DNS lookup time, HTTP status,
# certificate expiry).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: blackbox-exporter
spec:
  # https://awesome-prometheus-alerts.grep.to/rules#blackbox
  groups:
    - name: blackbox
      rules:
        # Probe reported failure continuously for 2 minutes.
        - alert: BlackboxProbeFailed
          expr: probe_success == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: Blackbox probe failed (instance {{ $labels.instance }})
            description: Probe failed

        # 1-minute average of the total probe duration stayed above 1s.
        - alert: BlackboxSlowProbe
          expr: avg_over_time(probe_duration_seconds[1m]) > 1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: Blackbox slow probe (instance {{ $labels.instance }})
            description: Blackbox probe took more than 1s to complete

        # 1-minute average of the DNS lookup time stayed above 1s.
        - alert: BlackboxSlowDNS
          expr: avg_over_time(probe_dns_lookup_time_seconds[1m]) > 1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: Blackbox slow DNS lookup (instance {{ $labels.instance }})
            # Folded scalar: the lines below are joined with single spaces.
            description: >-
              Blackbox DNS lookup took more than 1s to complete.
              It seemed using IPv6 DNS servers in conjunction with Docker resulted
              in odd 5s latency bump. For now we're using 8.8.8.8 because of that

        # HTTP status outside the 200-399 range.
        - alert: BlackboxProbeHttpFailure
          expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
            description: HTTP status code is not 200-399

        # Earliest certificate expiry is less than 30 days away.
        - alert: BlackboxSslCertificateWillExpireSoon
          expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
            description: SSL certificate expires in 30 days

        # Same alert name as the 30-day rule, with a 3-day threshold and
        # critical severity. The 30-day expression also matches whenever this
        # one does.
        - alert: BlackboxSslCertificateWillExpireSoon
          expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
            description: SSL certificate expires in 3 days

        # Earliest certificate expiry is now or in the past.
        - alert: BlackboxSslCertificateExpired
          expr: probe_ssl_earliest_cert_expiry - time() <= 0
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
            description: SSL certificate has expired already

        # 1-minute average of the HTTP request duration stayed above 1s.
        - alert: BlackboxProbeSlowHttp
          expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
          for: 1m
          labels:
            severity: warning
          annotations:
            summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
            description: HTTP request took more than 1s

        # 1-minute average of the ICMP round trip stayed above 1s.
        - alert: BlackboxProbeSlowPing
          expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
          for: 1m
          labels:
            severity: warning
          annotations:
            summary: Blackbox probe slow ping (instance {{ $labels.instance }})
            description: Blackbox ping took more than 1s
---
# Deployment "blackbox-exporter": two exporter replicas, pinned to nodes
# labelled `dedicated: monitoring` and kept on separate hosts.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: blackbox-exporter
spec:
  revisionHistoryLimit: 0
  replicas: 2
  selector:
    matchLabels:
      app: blackbox-exporter
  template:
    metadata:
      labels:
        app: blackbox-exporter
    spec:
      containers:
        - name: blackbox-exporter
          image: prom/blackbox-exporter:v0.20.0
          ports:
            - name: http
              containerPort: 9115
          # Keep the pod out of the Service endpoints until the exporter's
          # health endpoint answers.
          readinessProbe:
            httpGet:
              path: /-/healthy
              port: http
          volumeMounts:
            # Mounts the ConfigMap's config.yml under /etc/blackbox_exporter.
            - name: blackbox-exporter-config
              mountPath: /etc/blackbox_exporter
      volumes:
        - name: blackbox-exporter-config
          configMap:
            name: blackbox-exporter-config
      # TODO: Results in odd 6s connection lag if scheduled in VLAN20
      nodeSelector:
        dedicated: monitoring
      tolerations:
        - key: dedicated
          operator: Equal
          value: monitoring
          effect: NoSchedule
      affinity:
        # Hard requirement: never place two exporter pods on the same host.
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchExpressions:
                  - key: app
                    operator: In
                    values:
                      - blackbox-exporter
              topologyKey: "kubernetes.io/hostname"
---
# Service "blackbox-exporter": ClusterIP service that forwards port 80 to
# container port 9115 of pods labelled `app: blackbox-exporter`.
apiVersion: v1
kind: Service
metadata:
  name: blackbox-exporter
spec:
  type: ClusterIP
  selector:
    app: blackbox-exporter
  ports:
    - name: http
      protocol: TCP
      port: 80
      targetPort: 9115
---
# ConfigMap "blackbox-exporter-config": holds the exporter's module
# definitions as a single `config.yml` entry.
#
# Every module forces IPv4 (`preferred_ip_protocol: "ip4"`) with fallback
# disabled. The two dns_check_* modules query an A record and fail unless an
# answer matches the given regexp (an address in 193.40.103.x).
#
# No comments are placed inside the block scalar: its text is the file
# content handed to the exporter.
apiVersion: v1
kind: ConfigMap
metadata:
  name: blackbox-exporter-config
data:
  config.yml: |-
    modules:
      http_2xx:
        prober: http
        http:
          preferred_ip_protocol: "ip4"
          ip_protocol_fallback: false
      http_post_2xx:
        prober: http
        http:
          method: POST
          preferred_ip_protocol: "ip4"
          ip_protocol_fallback: false
      tcp_connect:
        prober: tcp
        tcp:
          preferred_ip_protocol: "ip4"
          ip_protocol_fallback: false
      icmp:
        prober: icmp
        icmp:
          preferred_ip_protocol: "ip4"
          ip_protocol_fallback: false
      dns_check_traefik:
        prober: dns
        dns:
          query_name: "traefik.k-space.ee"
          query_type: "A"
          validate_answer_rrs:
            fail_if_not_matches_regexp:
              - "traefik\\.k-space\\.ee\\.\\t.*\\tIN\\tA\\t193\\.40\\.103\\.[1-9][0-9]*"
          preferred_ip_protocol: "ip4"
          ip_protocol_fallback: false
      dns_check_k6:
        prober: dns
        dns:
          query_name: "k6.ee"
          query_type: "A"
          validate_answer_rrs:
            fail_if_not_matches_regexp:
              - "k6\\.ee\\.\\t.*\\tIN\\tA\\t193\\.40\\.103\\.[1-9][0-9]*"
          preferred_ip_protocol: "ip4"
          ip_protocol_fallback: false