---
# HTTP(S) availability checks for web front-ends, executed by the
# blackbox-exporter Service through its `http_2xx` module.
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: websites
spec:
  prober:
    url: blackbox-exporter
    path: /probe
  module: http_2xx
  targets:
    staticConfig:
      static:
        - https://git.k-space.ee/
        - https://grafana.k-space.ee/
        - https://wiki.k-space.ee/
        - https://pad.k-space.ee/
        - https://members.k-space.ee/
        - https://nextcloud.k-space.ee/
        - http://minio.infra.k-space.ee:9001/login
---
# DNS checks: each target below is used as the DNS server to query.
# NOTE(review): this Probe is named `k6.ee` but uses `dns_check_traefik`;
# the `dns_check_k6` module defined in the ConfigMap at the end of this file
# is not referenced by any Probe here - confirm which module is intended.
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: k6.ee
spec:
  prober:
    url: blackbox-exporter
    path: /probe
  module: dns_check_traefik
  targets:
    staticConfig:
      static:
        - 193.40.103.2
        - 62.65.250.2
---
# TCP connect checks against port 636 of the dc1-dc3 hosts.
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: samba-cluster
spec:
  prober:
    url: blackbox-exporter
    # Must be /probe: /metrics only exposes the exporter's own telemetry and
    # never runs the requested module, so no probe_success would be produced.
    path: /probe
  module: tcp_connect
  targets:
    staticConfig:
      static:
        - dc1.ad.k-space.ee:636
        - dc2.ad.k-space.ee:636
        - dc3.ad.k-space.ee:636
---
# TCP connect checks for assorted host:port endpoints.
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: misc
spec:
  prober:
    url: blackbox-exporter
    # Must be /probe: /metrics only exposes the exporter's own telemetry and
    # never runs the requested module, so no probe_success would be produced.
    path: /probe
  module: tcp_connect
  targets:
    staticConfig:
      static:
        - mail.k-space.ee:465
        - dev.k-space.ee:10648
        - mariadb.infra.k-space.ee:3306
---
# Alerting rules evaluated over the probe_* series produced by the Probes
# above.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: blackbox-exporter
spec:
  # https://awesome-prometheus-alerts.grep.to/rules#blackbox
  groups:
    - name: blackbox
      rules:
        - alert: BlackboxProbeFailed
          expr: probe_success == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: Blackbox probe failed (instance {{ $labels.instance }})
            description: Probe failed
        - alert: BlackboxSlowProbe
          expr: avg_over_time(probe_duration_seconds[1m]) > 1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: Blackbox slow probe (instance {{ $labels.instance }})
            description: Blackbox probe took more than 1s to complete
        - alert: BlackboxSlowDNS
          expr: avg_over_time(probe_dns_lookup_time_seconds[1m]) > 1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: Blackbox slow DNS lookup (instance {{ $labels.instance }})
            # Folded scalar: the lines below are joined with single spaces.
            description: >-
              Blackbox DNS lookup took more than 1s to complete.
              It seemed using IPv6 DNS servers in conjunction with Docker
              resulted in odd 5s latency bump.
              For now we're using 8.8.8.8 because of that
        - alert: BlackboxProbeHttpFailure
          expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
            description: HTTP status code is not 200-399
        # Early warning: certificate has less than 30 days left.
        - alert: BlackboxSslCertificateWillExpireSoon
          expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
            description: SSL certificate expires in 30 days
        # Shares its alert name with the 30-day rule above; only the threshold
        # (3 days) and the severity differ.
        - alert: BlackboxSslCertificateWillExpireSoon
          expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
            description: SSL certificate expires in 3 days
        - alert: BlackboxSslCertificateExpired
          expr: probe_ssl_earliest_cert_expiry - time() <= 0
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
            description: SSL certificate has expired already
        - alert: BlackboxProbeSlowHttp
          expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
          for: 1m
          labels:
            severity: warning
          annotations:
            summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
            description: HTTP request took more than 1s
        - alert: BlackboxProbeSlowPing
          expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
          for: 1m
          labels:
            severity: warning
          annotations:
            summary: Blackbox probe slow ping (instance {{ $labels.instance }})
            description: Blackbox ping took more than 1s
---
# Two blackbox-exporter replicas, restricted to nodes labelled
# dedicated=monitoring and forced onto different hosts by the anti-affinity
# rule.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: blackbox-exporter
spec:
  revisionHistoryLimit: 0
  replicas: 2
  selector:
    matchLabels:
      app: blackbox-exporter
  template:
    metadata:
      labels:
        app: blackbox-exporter
    spec:
      containers:
        - name: blackbox-exporter
          image: prom/blackbox-exporter:v0.20.0
          volumeMounts:
            # Exposes the ConfigMap's config.yml under /etc/blackbox_exporter.
            - name: blackbox-exporter-config
              mountPath: /etc/blackbox_exporter
      volumes:
        - name: blackbox-exporter-config
          configMap:
            name: blackbox-exporter-config
      # TODO: Results in odd 6s connection lag if scheduled in VLAN20
      nodeSelector:
        dedicated: monitoring
      tolerations:
        - key: dedicated
          operator: Equal
          value: monitoring
          effect: NoSchedule
      affinity:
        podAntiAffinity:
          # Hard requirement: no two blackbox-exporter pods on one hostname.
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchExpressions:
                  - key: app
                    operator: In
                    values:
                      - blackbox-exporter
              topologyKey: "kubernetes.io/hostname"
---
# Service referenced by the Probes' `prober.url`; forwards port 80 to the
# pods' port 9115.
kind: Service
apiVersion: v1
metadata:
  name: blackbox-exporter
spec:
  type: ClusterIP
  ports:
    - name: http
      port: 80
      protocol: TCP
      targetPort: 9115
  selector:
    app: blackbox-exporter
---
# blackbox-exporter module definitions. The body of `config.yml` is a literal
# block scalar handed to the exporter verbatim, so it carries no comments.
# Every module pins IPv4 with fallback disabled. The two DNS modules require
# an A record in 193.40.103.0/24 for the queried name.
apiVersion: v1
kind: ConfigMap
metadata:
  name: blackbox-exporter-config
data:
  config.yml: |-
    modules:
      http_2xx:
        prober: http
        http:
          preferred_ip_protocol: "ip4"
          ip_protocol_fallback: false
      http_post_2xx:
        prober: http
        http:
          method: POST
          preferred_ip_protocol: "ip4"
          ip_protocol_fallback: false
      tcp_connect:
        prober: tcp
        tcp:
          preferred_ip_protocol: "ip4"
          ip_protocol_fallback: false
      icmp:
        prober: icmp
        icmp:
          preferred_ip_protocol: "ip4"
          ip_protocol_fallback: false
      dns_check_traefik:
        prober: dns
        dns:
          query_name: "traefik.k-space.ee"
          query_type: "A"
          validate_answer_rrs:
            fail_if_not_matches_regexp:
              - "traefik\\.k-space\\.ee\\.\\t.*\\tIN\\tA\\t193\\.40\\.103\\.[1-9][0-9]*"
          preferred_ip_protocol: "ip4"
          ip_protocol_fallback: false
      dns_check_k6:
        prober: dns
        dns:
          query_name: "k6.ee"
          query_type: "A"
          validate_answer_rrs:
            fail_if_not_matches_regexp:
              - "k6\\.ee\\.\\t.*\\tIN\\tA\\t193\\.40\\.103\\.[1-9][0-9]*"
          preferred_ip_protocol: "ip4"
          ip_protocol_fallback: false