Move Prometheus instance to monitoring namespace
| @@ -1,28 +1,11 @@ | ||||
| # Prometheus operator | ||||
|  | ||||
| To deploy Prometheus operator: | ||||
|  | ||||
| ``` | ||||
| curl -L https://github.com/prometheus-operator/prometheus-operator/releases/download/v0.61.1/bundle.yaml | sed -e 's/namespace: default/namespace: prometheus-operator/g' > bundle.yml | ||||
| kubectl create namespace prometheus-operator | ||||
| kubectl apply --server-side -n prometheus-operator -f bundle.yml | ||||
| kubectl delete -n prometheus-operator configmap snmp-exporter | ||||
| kubectl create -n prometheus-operator configmap snmp-exporter --from-file=snmp.yml | ||||
| kubectl apply -n prometheus-operator -f application.yml -f node-exporter.yml -f blackbox-exporter.yml -f snmp-exporter.yml -f mikrotik-exporter.yml | ||||
| ``` | ||||
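|  | ||||
| To sanity-check the rollout (standard kubectl, nothing specific to this repo): | ||||
|  | ||||
| ``` | ||||
| kubectl -n prometheus-operator get pods | ||||
| kubectl get crds | grep monitoring.coreos.com | ||||
| ``` | ||||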
|  | ||||
|  | ||||
| # Slack | ||||
|  | ||||
| ``` | ||||
|  kubectl create -n prometheus-operator secret generic slack-secrets \ | ||||
|     --from-literal=webhook-url=https://hooks.slack.com/services/... | ||||
| ``` | ||||
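|  | ||||
| Before wiring it into Alertmanager, the webhook can be sanity-checked with a plain POST (Slack replies `ok`; the trailing `...` stands for the real webhook path): | ||||
|  | ||||
| ``` | ||||
| curl -X POST -H 'Content-Type: application/json' \ | ||||
|     -d '{"text": "alertmanager test"}' \ | ||||
|     https://hooks.slack.com/services/... | ||||
| ``` | ||||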
|  | ||||
|  | ||||
| # Mikrotik exporter | ||||
|  | ||||
| ``` | ||||
|  kubectl create -n prometheus-operator secret generic mikrotik-exporter \ | ||||
|   --from-literal=MIKROTIK_PASSWORD='f7W!H*Pu' \ | ||||
|   --from-literal=PROMETHEUS_BEARER_TOKEN=$(cat /dev/urandom | base64 | head -c 30) | ||||
| ``` | ||||
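|  | ||||
| The generated bearer token ends up in the `mikrotik-exporter` secret; to read it back for manual testing: | ||||
|  | ||||
| ``` | ||||
| kubectl -n prometheus-operator get secret mikrotik-exporter \ | ||||
|     -o jsonpath='{.data.PROMETHEUS_BEARER_TOKEN}' | base64 -d | ||||
| ``` | ||||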
|  | ||||
| Note: Do not put any Prometheus instances or exporters in this namespace; deploy them in the `monitoring` namespace instead. | ||||
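|  | ||||
| This works because the `Prometheus` spec below sets all `*NamespaceSelector` fields to `{}`, which matches every namespace. A minimal (hypothetical) exporter monitor living in `monitoring` would look like: | ||||
|  | ||||
| ``` | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: ServiceMonitor | ||||
| metadata: | ||||
|   name: my-exporter        # hypothetical name | ||||
|   namespace: monitoring | ||||
| spec: | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app: my-exporter     # assumes the exporter's Service carries this label | ||||
|   endpoints: | ||||
|     - port: metrics | ||||
| ``` | ||||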
|   | ||||
| @@ -1,547 +0,0 @@ | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1alpha1 | ||||
| kind: AlertmanagerConfig | ||||
| metadata: | ||||
|   name: alertmanager | ||||
|   labels: | ||||
|     app.kubernetes.io/name: alertmanager | ||||
| spec: | ||||
|   route: | ||||
|     routes: | ||||
|       - continue: false | ||||
|         receiver: slack-notifications | ||||
|         matchers: | ||||
|           - matchType: "=" | ||||
|             name: severity | ||||
|             value: critical | ||||
|     receiver: 'null' | ||||
|   receivers: | ||||
|   - name: 'null' | ||||
|   - name: 'slack-notifications' | ||||
|     slackConfigs: | ||||
|     - channel: '#kube-prod' | ||||
|       sendResolved: true | ||||
|       apiURL: | ||||
|         name: slack-secrets | ||||
|         key: webhook-url | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PodMonitor | ||||
| metadata: | ||||
|   name: metrics | ||||
| spec: | ||||
|   namespaceSelector: {} | ||||
|   selector: {} | ||||
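|   # empty selector: any pod in this namespace with a port named | ||||
|   # "exporter" or "metrics" gets scraped | ||||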
|   podMetricsEndpoints: | ||||
|     - port: exporter | ||||
|     - port: metrics | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: Alertmanager | ||||
| metadata: | ||||
|   name: alertmanager | ||||
| spec: | ||||
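|   # "None" takes the referenced AlertmanagerConfig as-is instead of | ||||
|   # forcing a namespace matcher onto its routes | ||||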
|   alertmanagerConfigMatcherStrategy: | ||||
|     type: None | ||||
|   alertmanagerConfigNamespaceSelector: {} | ||||
|   alertmanagerConfigSelector: {} | ||||
|   alertmanagerConfiguration: | ||||
|     name: alertmanager | ||||
|   secrets: | ||||
|     - slack-secrets | ||||
|   nodeSelector: | ||||
|     dedicated: monitoring | ||||
|   tolerations: | ||||
|     - key: dedicated | ||||
|       operator: Equal | ||||
|       value: monitoring | ||||
|       effect: NoSchedule | ||||
|   replicas: 3 | ||||
|   serviceAccountName: alertmanager | ||||
|   externalUrl: http://am.k-space.ee/ | ||||
|   routePrefix: "/" | ||||
|   securityContext: | ||||
|     fsGroup: 2000 | ||||
|     runAsGroup: 2000 | ||||
|     runAsNonRoot: true | ||||
|     runAsUser: 1000 | ||||
| --- | ||||
| apiVersion: v1 | ||||
| kind: ServiceAccount | ||||
| metadata: | ||||
|   name: alertmanager | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: Prometheus | ||||
| metadata: | ||||
|   name: prometheus | ||||
| spec: | ||||
|   nodeSelector: | ||||
|     dedicated: monitoring | ||||
|   tolerations: | ||||
|     - key: dedicated | ||||
|       operator: Equal | ||||
|       value: monitoring | ||||
|       effect: NoSchedule | ||||
|   alerting: | ||||
|     alertmanagers: | ||||
|       - namespace: prometheus-operator | ||||
|         name: alertmanager-operated | ||||
|         port: web | ||||
|   externalUrl: "http://prom.k-space.ee/" | ||||
|   replicas: 2 | ||||
|   shards: 1 | ||||
|   serviceAccountName: prometheus | ||||
|   securityContext: | ||||
|     fsGroup: 2000 | ||||
|     runAsGroup: 2000 | ||||
|     runAsNonRoot: true | ||||
|     runAsUser: 1000 | ||||
|   serviceMonitorNamespaceSelector: {} | ||||
|   serviceMonitorSelector: {} | ||||
|   podMonitorNamespaceSelector: {} | ||||
|   podMonitorSelector: {} | ||||
|   probeNamespaceSelector: {} | ||||
|   probeSelector: {} | ||||
|   ruleNamespaceSelector: {} | ||||
|   ruleSelector: {} | ||||
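|   # size-based retention; keep below the 10Gi volume claim below | ||||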
|   retentionSize: 8GB | ||||
|   storage: | ||||
|     volumeClaimTemplate: | ||||
|       spec: | ||||
|         accessModes: | ||||
|         - ReadWriteOnce | ||||
|         resources: | ||||
|           requests: | ||||
|             storage: 10Gi | ||||
|         storageClassName: local-path | ||||
| --- | ||||
| apiVersion: v1 | ||||
| kind: ServiceAccount | ||||
| metadata: | ||||
|   name: prometheus | ||||
| --- | ||||
| apiVersion: rbac.authorization.k8s.io/v1 | ||||
| kind: ClusterRole | ||||
| metadata: | ||||
|   name: prometheus | ||||
| rules: | ||||
| - apiGroups: [""] | ||||
|   resources: | ||||
|   - nodes | ||||
|   - nodes/metrics | ||||
|   - services | ||||
|   - endpoints | ||||
|   - pods | ||||
|   verbs: ["get", "list", "watch"] | ||||
| - apiGroups: [""] | ||||
|   resources: | ||||
|   - configmaps | ||||
|   verbs: ["get"] | ||||
| - apiGroups: | ||||
|   - networking.k8s.io | ||||
|   resources: | ||||
|   - ingresses | ||||
|   verbs: ["get", "list", "watch"] | ||||
| - nonResourceURLs: ["/metrics"] | ||||
|   verbs: ["get"] | ||||
| --- | ||||
| apiVersion: rbac.authorization.k8s.io/v1 | ||||
| kind: ClusterRoleBinding | ||||
| metadata: | ||||
|   name: prometheus | ||||
| roleRef: | ||||
|   apiGroup: rbac.authorization.k8s.io | ||||
|   kind: ClusterRole | ||||
|   name: prometheus | ||||
| subjects: | ||||
| - kind: ServiceAccount | ||||
|   name: prometheus | ||||
|   namespace: prometheus-operator | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PrometheusRule | ||||
| metadata: | ||||
|   name: prometheus | ||||
| spec: | ||||
|   groups: | ||||
|   - name: prometheus | ||||
|     rules: | ||||
|     - alert: PrometheusJobMissing | ||||
|       annotations: | ||||
|         description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}\n \ | ||||
|           \ LABELS = {{ $labels }}" | ||||
|         summary: Prometheus job missing (instance {{ $labels.instance }}) | ||||
|       expr: absent(up{job="prometheus-operator/prometheus"}) | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: warning | ||||
|     - alert: PrometheusTargetMissing | ||||
|       annotations: | ||||
|         description: "A Prometheus target has disappeared. An exporter might be crashed.\n\ | ||||
|           \  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus target missing (instance {{ $labels.instance }}) | ||||
|       expr: up == 0 | ||||
|       for: 5m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: PrometheusAllTargetsMissing | ||||
|       annotations: | ||||
|         description: "A Prometheus job does not have living target anymore.\n  VALUE\ | ||||
|           \ = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus all targets missing (instance {{ $labels.instance }}) | ||||
|       expr: count by (job) (up) == 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: PrometheusConfigurationReloadFailure | ||||
|       annotations: | ||||
|         description: "Prometheus configuration reload error\n  VALUE = {{ $value }}\n\ | ||||
|           \  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus configuration reload failure (instance {{ $labels.instance | ||||
|           }}) | ||||
|       expr: prometheus_config_last_reload_successful != 1 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: warning | ||||
|     - alert: PrometheusTooManyRestarts | ||||
|       annotations: | ||||
|         description: "Prometheus has restarted more than twice in the last 15 minutes.\ | ||||
|           \ It might be crashlooping.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels\ | ||||
|           \ }}" | ||||
|         summary: Prometheus too many restarts (instance {{ $labels.instance }}) | ||||
|       expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) | ||||
|         > 2 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: warning | ||||
|     - alert: PrometheusAlertmanagerJobMissing | ||||
|       annotations: | ||||
|         description: "A Prometheus AlertManager job has disappeared\n  VALUE = {{\ | ||||
|           \ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus AlertManager job missing (instance {{ $labels.instance | ||||
|           }}) | ||||
|       expr: absent(up{job="prometheus-operator/alertmanager"}) | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: warning | ||||
|     - alert: PrometheusAlertmanagerConfigurationReloadFailure | ||||
|       annotations: | ||||
|         description: "AlertManager configuration reload error\n  VALUE = {{ $value\ | ||||
|           \ }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus AlertManager configuration reload failure (instance {{ | ||||
|           $labels.instance }}) | ||||
|       expr: alertmanager_config_last_reload_successful != 1 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: warning | ||||
|     - alert: PrometheusAlertmanagerConfigNotSynced | ||||
|       annotations: | ||||
|         description: "Configurations of AlertManager cluster instances are out of\ | ||||
|           \ sync\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus AlertManager config not synced (instance {{ $labels.instance | ||||
|           }}) | ||||
|       expr: count(count_values("config_hash", alertmanager_config_hash)) > 1 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: warning | ||||
|     - alert: PrometheusNotConnectedToAlertmanager | ||||
|       annotations: | ||||
|         description: "Prometheus cannot connect the alertmanager\n  VALUE = {{ $value\ | ||||
|           \ }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus not connected to alertmanager (instance {{ $labels.instance | ||||
|           }}) | ||||
|       expr: prometheus_notifications_alertmanagers_discovered < 1 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: PrometheusRuleEvaluationFailures | ||||
|       annotations: | ||||
|         description: "Prometheus encountered {{ $value }} rule evaluation failures,\ | ||||
|           \ leading to potentially ignored alerts.\n  VALUE = {{ $value }}\n  LABELS\ | ||||
|           \ = {{ $labels }}" | ||||
|         summary: Prometheus rule evaluation failures (instance {{ $labels.instance | ||||
|           }}) | ||||
|       expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: PrometheusTemplateTextExpansionFailures | ||||
|       annotations: | ||||
|         description: "Prometheus encountered {{ $value }} template text expansion\ | ||||
|           \ failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus template text expansion failures (instance {{ $labels.instance | ||||
|           }}) | ||||
|       expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: PrometheusRuleEvaluationSlow | ||||
|       annotations: | ||||
|         description: "Prometheus rule evaluation took more time than the scheduled\ | ||||
|           \ interval. It indicates a slower storage backend access or too complex\ | ||||
|           \ query.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus rule evaluation slow (instance {{ $labels.instance }}) | ||||
|       expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds | ||||
|       for: 5m | ||||
|       labels: | ||||
|         severity: warning | ||||
|     - alert: PrometheusNotificationsBacklog | ||||
|       annotations: | ||||
|         description: "The Prometheus notification queue has not been empty for 10\ | ||||
|           \ minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus notifications backlog (instance {{ $labels.instance }}) | ||||
|       expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: warning | ||||
|     - alert: PrometheusAlertmanagerNotificationFailing | ||||
|       annotations: | ||||
|         description: "Alertmanager is failing sending notifications\n  VALUE = {{\ | ||||
|           \ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus AlertManager notification failing (instance {{ $labels.instance | ||||
|           }}) | ||||
|       expr: rate(alertmanager_notifications_failed_total[1m]) > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: PrometheusTargetEmpty | ||||
|       annotations: | ||||
|         description: "Prometheus has no target in service discovery\n  VALUE = {{\ | ||||
|           \ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus target empty (instance {{ $labels.instance }}) | ||||
|       expr: prometheus_sd_discovered_targets == 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: PrometheusLargeScrape | ||||
|       annotations: | ||||
|         description: "Prometheus has many scrapes that exceed the sample limit\n \ | ||||
|           \ VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus large scrape (instance {{ $labels.instance }}) | ||||
|       expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > | ||||
|         10 | ||||
|       for: 5m | ||||
|       labels: | ||||
|         severity: warning | ||||
|     - alert: PrometheusTargetScrapeDuplicate | ||||
|       annotations: | ||||
|         description: "Prometheus has many samples rejected due to duplicate timestamps\ | ||||
|           \ but different values\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus target scrape duplicate (instance {{ $labels.instance | ||||
|           }}) | ||||
|       expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) | ||||
|         > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: warning | ||||
|     - alert: PrometheusTsdbCheckpointCreationFailures | ||||
|       annotations: | ||||
|         description: "Prometheus encountered {{ $value }} checkpoint creation failures\n\ | ||||
|           \  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance | ||||
|           }}) | ||||
|       expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: PrometheusTsdbCheckpointDeletionFailures | ||||
|       annotations: | ||||
|         description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n\ | ||||
|           \  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance | ||||
|           }}) | ||||
|       expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: PrometheusTsdbCompactionsFailed | ||||
|       annotations: | ||||
|         description: "Prometheus encountered {{ $value }} TSDB compactions failures\n\ | ||||
|           \  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus TSDB compactions failed (instance {{ $labels.instance | ||||
|           }}) | ||||
|       expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: PrometheusTsdbHeadTruncationsFailed | ||||
|       annotations: | ||||
|         description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n\ | ||||
|           \  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance | ||||
|           }}) | ||||
|       expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: PrometheusTsdbReloadFailures | ||||
|       annotations: | ||||
|         description: "Prometheus encountered {{ $value }} TSDB reload failures\n \ | ||||
|           \ VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus TSDB reload failures (instance {{ $labels.instance }}) | ||||
|       expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: PrometheusTsdbWalCorruptions | ||||
|       annotations: | ||||
|         description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n \ | ||||
|           \ VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|       summary: Prometheus TSDB WAL is corrupt; make sure there is enough disk space | ||||
|           and wipe /data/wal | ||||
|       expr: increase(prometheus_tsdb_wal_corruptions_total[2h]) > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: PrometheusTsdbWalTruncationsFailed | ||||
|       annotations: | ||||
|         description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n\ | ||||
|           \  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance | ||||
|           }}) | ||||
|       expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
| --- | ||||
| apiVersion: networking.k8s.io/v1 | ||||
| kind: Ingress | ||||
| metadata: | ||||
|   name: prometheus | ||||
|   annotations: | ||||
|     traefik.ingress.kubernetes.io/router.entrypoints: websecure | ||||
|     traefik.ingress.kubernetes.io/router.tls: "true" | ||||
|     external-dns.alpha.kubernetes.io/target: traefik.k-space.ee | ||||
|     traefik.ingress.kubernetes.io/router.middlewares: prometheus-operator-prometheus@kubernetescrd | ||||
| spec: | ||||
|   rules: | ||||
|   - host: prom.k-space.ee | ||||
|     http: | ||||
|       paths: | ||||
|       - pathType: Prefix | ||||
|         path: "/" | ||||
|         backend: | ||||
|           service: | ||||
|             name: prometheus-operated | ||||
|             port: | ||||
|               number: 9090 | ||||
|   tls: | ||||
|   - hosts: | ||||
|     - "*.k-space.ee" | ||||
| --- | ||||
| apiVersion: networking.k8s.io/v1 | ||||
| kind: Ingress | ||||
| metadata: | ||||
|   name: alertmanager | ||||
|   annotations: | ||||
|     traefik.ingress.kubernetes.io/router.entrypoints: websecure | ||||
|     traefik.ingress.kubernetes.io/router.tls: "true" | ||||
|     external-dns.alpha.kubernetes.io/target: traefik.k-space.ee | ||||
|     traefik.ingress.kubernetes.io/router.middlewares: prometheus-operator-alertmanager@kubernetescrd | ||||
| spec: | ||||
|   rules: | ||||
|   - host: am.k-space.ee | ||||
|     http: | ||||
|       paths: | ||||
|       - pathType: Prefix | ||||
|         path: "/" | ||||
|         backend: | ||||
|           service: | ||||
|             name: alertmanager-operated | ||||
|             port: | ||||
|               number: 9093 | ||||
|   tls: | ||||
|   - hosts: | ||||
|     - "*.k-space.ee" | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PodMonitor | ||||
| metadata: | ||||
|   name: prometheus | ||||
| spec: | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app.kubernetes.io/name: prometheus | ||||
|   podMetricsEndpoints: | ||||
|     - port: web | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PodMonitor | ||||
| metadata: | ||||
|   name: alertmanager | ||||
| spec: | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app.kubernetes.io/name: alertmanager | ||||
|   podMetricsEndpoints: | ||||
|     - port: web | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PodMonitor | ||||
| metadata: | ||||
|   name: operator | ||||
| spec: | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app.kubernetes.io/name: prometheus-operator | ||||
|   podMetricsEndpoints: | ||||
|     - port: http | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: ServiceMonitor | ||||
| metadata: | ||||
|   name: kubelet | ||||
| spec: | ||||
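|   # kubelet serves metrics (and cAdvisor) over HTTPS with a self-signed | ||||
|   # cert, hence the service account token and insecureSkipVerify | ||||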
|   endpoints: | ||||
|   - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token | ||||
|     honorLabels: true | ||||
|     interval: 30s | ||||
|     port: https-metrics | ||||
|     scheme: https | ||||
|     tlsConfig: | ||||
|       insecureSkipVerify: true | ||||
|   - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token | ||||
|     honorLabels: true | ||||
|     interval: 30s | ||||
|     path: /metrics/cadvisor | ||||
|     port: https-metrics | ||||
|     scheme: https | ||||
|     tlsConfig: | ||||
|       insecureSkipVerify: true | ||||
|   namespaceSelector: | ||||
|     matchNames: | ||||
|     - kube-system | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app.kubernetes.io/name: kubelet | ||||
| --- | ||||
| apiVersion: codemowers.io/v1alpha1 | ||||
| kind: OIDCGWMiddlewareClient | ||||
| metadata: | ||||
|   name: prometheus | ||||
| spec: | ||||
|   displayName: Prometheus | ||||
|   uri: 'https://prom.k-space.ee' | ||||
|   allowedGroups: | ||||
|     - k-space:floor | ||||
|   headerMapping: | ||||
|     email: Remote-Email | ||||
|     groups: Remote-Groups | ||||
|     name: Remote-Name | ||||
|     user: Remote-Username | ||||
| --- | ||||
| apiVersion: codemowers.io/v1alpha1 | ||||
| kind: OIDCGWMiddlewareClient | ||||
| metadata: | ||||
|   name: alertmanager | ||||
| spec: | ||||
|   displayName: AlertManager | ||||
|   uri: 'https://am.k-space.ee' | ||||
|   allowedGroups: | ||||
|     - k-space:kubernetes:admins | ||||
|   headerMapping: | ||||
|     email: Remote-Email | ||||
|     groups: Remote-Groups | ||||
|     name: Remote-Name | ||||
|     user: Remote-Username | ||||
| @@ -1,258 +0,0 @@ | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: Probe | ||||
| metadata: | ||||
|   name: websites | ||||
| spec: | ||||
|   prober: | ||||
|     url: blackbox-exporter | ||||
|     path: /probe | ||||
|   module: http_2xx | ||||
|   targets: | ||||
|     staticConfig: | ||||
|       static: | ||||
|         - https://git.k-space.ee/ | ||||
|         - https://grafana.k-space.ee/ | ||||
|         - https://wiki.k-space.ee/ | ||||
|         - https://pad.k-space.ee/ | ||||
|         - https://members.k-space.ee/ | ||||
|         - https://nextcloud.k-space.ee/ | ||||
|         - http://minio.infra.k-space.ee:9001/login | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: Probe | ||||
| metadata: | ||||
|   name: k6.ee | ||||
| spec: | ||||
|   prober: | ||||
|     url: blackbox-exporter | ||||
|     path: /probe | ||||
|   module: dns_check_traefik | ||||
|   targets: | ||||
|     staticConfig: | ||||
|       static: | ||||
|         - 193.40.103.2 | ||||
|         - 62.65.250.2 | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: Probe | ||||
| metadata: | ||||
|   name: samba-cluster | ||||
| spec: | ||||
|   prober: | ||||
|     url: blackbox-exporter | ||||
|     path: /probe | ||||
|   module: tcp_connect | ||||
|   targets: | ||||
|     staticConfig: | ||||
|       static: | ||||
|         - dc1.ad.k-space.ee:636 | ||||
|         - dc2.ad.k-space.ee:636 | ||||
|         - dc3.ad.k-space.ee:636 | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: Probe | ||||
| metadata: | ||||
|   name: misc | ||||
| spec: | ||||
|   prober: | ||||
|     url: blackbox-exporter | ||||
|     path: /probe | ||||
|   module: tcp_connect | ||||
|   targets: | ||||
|     staticConfig: | ||||
|       static: | ||||
|         - mail.k-space.ee:465 | ||||
|         - dev.k-space.ee:10648 | ||||
|         - mariadb.infra.k-space.ee:3306 | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PrometheusRule | ||||
| metadata: | ||||
|   name: blackbox-exporter | ||||
| spec: | ||||
|   # https://awesome-prometheus-alerts.grep.to/rules#blackbox | ||||
|   groups: | ||||
|   - name: blackbox | ||||
|     rules: | ||||
|     - alert: BlackboxProbeFailed | ||||
|       expr: probe_success == 0 | ||||
|       for: 2m | ||||
|       labels: | ||||
|         severity: critical | ||||
|       annotations: | ||||
|         summary: Blackbox probe failed (instance {{ $labels.instance }}) | ||||
|         description: Probe failed | ||||
|     - alert: BlackboxSlowProbe | ||||
|       expr: avg_over_time(probe_duration_seconds[1m]) > 1 | ||||
|       for: 5m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Blackbox slow probe (instance {{ $labels.instance }}) | ||||
|         description: Blackbox probe took more than 1s to complete | ||||
|     - alert: BlackboxSlowDNS | ||||
|       expr: avg_over_time(probe_dns_lookup_time_seconds[1m]) > 1 | ||||
|       for: 5m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Blackbox slow DNS lookup (instance {{ $labels.instance }}) | ||||
|         description: Blackbox DNS lookup took more than 1s to complete. | ||||
|           Using IPv6 DNS servers in conjunction with Docker seemed to result in | ||||
|           an odd 5s latency bump, so for now we're using 8.8.8.8. | ||||
|     - alert: BlackboxProbeHttpFailure | ||||
|       expr: probe_http_status_code <= 199 or probe_http_status_code >= 400 | ||||
|       for: 5m | ||||
|       labels: | ||||
|         severity: critical | ||||
|       annotations: | ||||
|         summary: Blackbox probe HTTP failure (instance {{ $labels.instance }}) | ||||
|         description: HTTP status code is not 200-399 | ||||
|     - alert: BlackboxSslCertificateWillExpireSoon | ||||
|       expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}) | ||||
|         description: SSL certificate expires in 30 days | ||||
|     - alert: BlackboxSslCertificateWillExpireSoon | ||||
|       expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|       annotations: | ||||
|         summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}) | ||||
|         description: SSL certificate expires in 3 days | ||||
|     - alert: BlackboxSslCertificateExpired | ||||
|       expr: probe_ssl_earliest_cert_expiry - time() <= 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|       annotations: | ||||
|         summary: Blackbox SSL certificate expired (instance {{ $labels.instance }}) | ||||
|         description: SSL certificate has expired already | ||||
|     - alert: BlackboxProbeSlowHttp | ||||
|       expr: avg_over_time(probe_http_duration_seconds[1m]) > 1 | ||||
|       for: 1m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Blackbox probe slow HTTP (instance {{ $labels.instance }}) | ||||
|         description: HTTP request took more than 1s | ||||
|     - alert: BlackboxProbeSlowPing | ||||
|       expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1 | ||||
|       for: 1m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Blackbox probe slow ping (instance {{ $labels.instance }}) | ||||
|         description: Blackbox ping took more than 1s | ||||
| --- | ||||
| apiVersion: apps/v1 | ||||
| kind: Deployment | ||||
| metadata: | ||||
|   name: blackbox-exporter | ||||
| spec: | ||||
|   revisionHistoryLimit: 0 | ||||
|   replicas: 3 | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app: blackbox-exporter | ||||
|   template: | ||||
|     metadata: | ||||
|       labels: | ||||
|         app: blackbox-exporter | ||||
|     spec: | ||||
|       containers: | ||||
|       - name: blackbox-exporter | ||||
|         image: prom/blackbox-exporter:v0.20.0 | ||||
|         volumeMounts: | ||||
|         - name: blackbox-exporter-config | ||||
|           mountPath: /etc/blackbox_exporter | ||||
|       volumes: | ||||
|         - name: blackbox-exporter-config | ||||
|           configMap: | ||||
|             name: blackbox-exporter-config | ||||
|       # TODO: Results in odd 6s connection lag if scheduled in VLAN20 | ||||
|       nodeSelector: | ||||
|         dedicated: monitoring | ||||
|       tolerations: | ||||
|         - key: dedicated | ||||
|           operator: Equal | ||||
|           value: monitoring | ||||
|           effect: NoSchedule | ||||
|       affinity: | ||||
|         podAntiAffinity: | ||||
|           requiredDuringSchedulingIgnoredDuringExecution: | ||||
|           - labelSelector: | ||||
|               matchExpressions: | ||||
|               - key: app | ||||
|                 operator: In | ||||
|                 values: | ||||
|                 - blackbox-exporter | ||||
|             topologyKey: "kubernetes.io/hostname" | ||||
| --- | ||||
| kind: Service | ||||
| apiVersion: v1 | ||||
| metadata: | ||||
|   name: blackbox-exporter | ||||
| spec: | ||||
|   type: ClusterIP | ||||
|   ports: | ||||
|     - name: http | ||||
|       port: 80 | ||||
|       protocol: TCP | ||||
|       targetPort: 9115 | ||||
|   selector: | ||||
|     app: blackbox-exporter | ||||
| --- | ||||
| apiVersion: v1 | ||||
| kind: ConfigMap | ||||
| metadata: | ||||
|   name: blackbox-exporter-config | ||||
| data: | ||||
|   config.yml: |- | ||||
|     modules: | ||||
|       http_2xx: | ||||
|         prober: http | ||||
|         http: | ||||
|           preferred_ip_protocol: "ip4" | ||||
|           ip_protocol_fallback: false | ||||
|       http_post_2xx: | ||||
|         prober: http | ||||
|         http: | ||||
|           method: POST | ||||
|           preferred_ip_protocol: "ip4" | ||||
|           ip_protocol_fallback: false | ||||
|       tcp_connect: | ||||
|         prober: tcp | ||||
|         tcp: | ||||
|           preferred_ip_protocol: "ip4" | ||||
|           ip_protocol_fallback: false | ||||
|       icmp: | ||||
|         prober: icmp | ||||
|         icmp: | ||||
|           preferred_ip_protocol: "ip4" | ||||
|           ip_protocol_fallback: false | ||||
|       dns_check_traefik: | ||||
|         prober: dns | ||||
|         dns: | ||||
|           query_name: "traefik.k-space.ee" | ||||
|           query_type: "A" | ||||
|           validate_answer_rrs: | ||||
|             fail_if_not_matches_regexp: | ||||
|              - "traefik\\.k-space\\.ee\\.\\t.*\\tIN\\tA\\t193\\.40\\.103\\.[1-9][0-9]*" | ||||
|           preferred_ip_protocol: "ip4" | ||||
|           ip_protocol_fallback: false | ||||
|       dns_check_k6: | ||||
|         prober: dns | ||||
|         dns: | ||||
|           query_name: "k6.ee" | ||||
|           query_type: "A" | ||||
|           validate_answer_rrs: | ||||
|             fail_if_not_matches_regexp: | ||||
|              - "k6\\.ee\\.\\t.*\\tIN\\tA\\t193\\.40\\.103\\.[1-9][0-9]*" | ||||
|           preferred_ip_protocol: "ip4" | ||||
|           ip_protocol_fallback: false | ||||
| @@ -1,110 +0,0 @@ | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: Probe | ||||
| metadata: | ||||
|   name: mikrotik | ||||
| spec: | ||||
|   bearerTokenSecret: | ||||
|     name: mikrotik-exporter | ||||
|     key: PROMETHEUS_BEARER_TOKEN | ||||
|   prober: | ||||
|     path: /metrics | ||||
|     url: mikrotik-exporter | ||||
|   targets: | ||||
|     staticConfig: | ||||
|       static: | ||||
|         - router.mgmt.k-space.ee | ||||
|         - sw_chaos.mgmt.k-space.ee | ||||
|         - sw_poe.mgmt.k-space.ee | ||||
|         - sw_mgmt.mgmt.k-space.ee | ||||
|         - sw_core02.mgmt.k-space.ee | ||||
|         - sw_cyber.mgmt.k-space.ee | ||||
|         - sw_ha.mgmt.k-space.ee | ||||
|         - sw_asocial.mgmt.k-space.ee | ||||
|         - sw_kitchen.mgmt.k-space.ee | ||||
|         - sw_core01.mgmt.k-space.ee | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PrometheusRule | ||||
| metadata: | ||||
|   name: mikrotik | ||||
| spec: | ||||
|   groups: | ||||
|   - name: mikrotik | ||||
|     rules: | ||||
|     - alert: MikrotikUplinkRedundancyLost | ||||
|       expr: mikrotik_interface_running{port=~"sfp-sfpplus[12]", instance!~"sw_core.*", instance!~"sw_mgmt.*"} == 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: error | ||||
|       annotations: | ||||
|         summary: Switch uplink high availability lost | ||||
|         description: One of the two 10Gb optical links is malfunctioning | ||||
|     - alert: MikrotikLinkRateDegraded | ||||
|       expr: mikrotik_interface_rate{port=~"sfp-sfpplus.*"} < 10000000000 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: error | ||||
|       annotations: | ||||
|         summary: 10Gb link degraded | ||||
|         description: One of the 10Gb links is running at lower speed | ||||
| --- | ||||
| apiVersion: apps/v1 | ||||
| kind: Deployment | ||||
| metadata: | ||||
|   name: mikrotik-exporter | ||||
| spec: | ||||
|   revisionHistoryLimit: 0 | ||||
|   replicas: 2 | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app: mikrotik-exporter | ||||
|   template: | ||||
|     metadata: | ||||
|       labels: | ||||
|         app: mikrotik-exporter | ||||
|       annotations: | ||||
|         co.elastic.logs/multiline.pattern: '^  ' | ||||
|         co.elastic.logs/multiline.negate: "false" | ||||
|         co.elastic.logs/multiline.match: after | ||||
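|         # Filebeat hint: fold indented continuation lines (tracebacks) into the preceding event | ||||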
|     spec: | ||||
|       containers: | ||||
|       - name: mikrotik-exporter | ||||
|         image: harbor.k-space.ee/k-space/mikrotik-exporter:latest | ||||
|         env: | ||||
|           - name: MIKROTIK_USER | ||||
|             value: netpoller | ||||
|         envFrom: | ||||
|           - secretRef: | ||||
|               name: mikrotik-exporter | ||||
|       nodeSelector: | ||||
|         dedicated: monitoring | ||||
|       tolerations: | ||||
|       - key: dedicated | ||||
|         operator: Equal | ||||
|         value: monitoring | ||||
|         effect: NoSchedule | ||||
|       affinity: | ||||
|         podAntiAffinity: | ||||
|           requiredDuringSchedulingIgnoredDuringExecution: | ||||
|           - labelSelector: | ||||
|               matchExpressions: | ||||
|               - key: app | ||||
|                 operator: In | ||||
|                 values: | ||||
|                 - mikrotik-exporter | ||||
|             topologyKey: "kubernetes.io/hostname" | ||||
| --- | ||||
| kind: Service | ||||
| apiVersion: v1 | ||||
| metadata: | ||||
|   name: mikrotik-exporter | ||||
| spec: | ||||
|   type: ClusterIP | ||||
|   ports: | ||||
|     - name: http | ||||
|       port: 80 | ||||
|       protocol: TCP | ||||
|       targetPort: 3001 | ||||
|   selector: | ||||
|     app: mikrotik-exporter | ||||
| @@ -1,430 +0,0 @@ | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: Probe | ||||
| metadata: | ||||
|   name: nodes-proxmox | ||||
| spec: | ||||
|   scrapeTimeout: 30s | ||||
|   targets: | ||||
|     staticConfig: | ||||
|       static: | ||||
|         - nas.mgmt.k-space.ee:9100 | ||||
|         - pve1.proxmox.infra.k-space.ee:9100 | ||||
|         - pve2.proxmox.infra.k-space.ee:9100 | ||||
|         - pve8.proxmox.infra.k-space.ee:9100 | ||||
|         - pve9.proxmox.infra.k-space.ee:9100 | ||||
|       relabelingConfigs: | ||||
|         - sourceLabels: [__param_target] | ||||
|           targetLabel: instance | ||||
|         - sourceLabels: [__param_target] | ||||
|           targetLabel: __address__ | ||||
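|   # prober.url is a placeholder; the relabelings above point __address__ | ||||
|   # at each node-exporter target so Prometheus scrapes them directly | ||||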
|   prober: | ||||
|     url: localhost | ||||
|     path: /metrics | ||||
|   metricRelabelings: | ||||
|     - sourceLabels: [__address__] | ||||
|       targetLabel: target | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PrometheusRule | ||||
| metadata: | ||||
|   name: node-exporter | ||||
| spec: | ||||
|   groups: | ||||
|     - name: node-exporter | ||||
|       rules: | ||||
|         - alert: ZfsOfflinePool | ||||
|           expr: node_zfs_zpool_state{state!="online"} > 0 | ||||
|           for: 1m | ||||
|           labels: | ||||
|             severity: critical | ||||
|           annotations: | ||||
|             summary: ZFS offline pool (instance {{ $labels.instance }}) | ||||
|             description: "A ZFS zpool is in a unexpected state: {{ $labels.state }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: HostHighLoad | ||||
|           expr: sum(node_load1{}) by (instance) / count(node_cpu_seconds_total{mode="user"}) by (instance) > 2.5 | ||||
|           for: 15m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Host under high load | ||||
|             description: Many processes are queued up for execution | ||||
|         - alert: HostOutOfMemory | ||||
|           expr: (node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_MemFree_bytes ) / node_memory_MemTotal_bytes * 100 < 20 | ||||
|           for: 2m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Host out of memory (instance {{ $labels.instance }}) | ||||
|             description: Node memory is filling up (< 20% left) | ||||
|         - alert: HostMemoryUnderMemoryPressure | ||||
|           expr: rate(node_vmstat_pgmajfault[1m]) > 1000 | ||||
|           for: 2m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Host memory under memory pressure (instance {{ $labels.instance }}) | ||||
|             description: The node is under heavy memory pressure. High rate of major page faults | ||||
|         - alert: HostUnusualNetworkThroughputIn | ||||
|           expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) > 800e+06 | ||||
|           for: 1h | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Host unusual network throughput in (instance {{ $labels.instance }}) | ||||
|             description: Host network interfaces are probably receiving too much data (> 800 MB/s) | ||||
|         - alert: HostUnusualNetworkThroughputOut | ||||
|           expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) > 800e+06 | ||||
|           for: 1h | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Host unusual network throughput out (instance {{ $labels.instance }}) | ||||
|             description: Host network interfaces are probably sending too much data (> 800 MB/s) | ||||
|         - alert: HostUnusualDiskReadRate | ||||
|           expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) > 500e+06 | ||||
|           for: 1h | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Host unusual disk read rate (instance {{ $labels.instance }}) | ||||
|             description: Disk is probably reading too much data (> 500 MB/s) | ||||
|         - alert: HostUnusualDiskWriteRate | ||||
|           expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) > 500e+06 | ||||
|           for: 1h | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Host unusual disk write rate (instance {{ $labels.instance }}) | ||||
|             description: Disk is probably writing too much data (> 500 MB/s) | ||||
|         # Please add ignored mountpoints in node_exporter parameters like | ||||
|         # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". | ||||
|         # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. | ||||
|         - alert: HostOutOfDiskSpace | ||||
|           expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 | ||||
|           for: 2m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Host out of disk space (instance {{ $labels.instance }}) | ||||
|             description: Disk is almost full (< 10% left) | ||||
|         # Please add ignored mountpoints in node_exporter parameters like | ||||
|         # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". | ||||
|         # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. | ||||
|         - alert: HostDiskWillFillIn24Hours | ||||
|           expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 | ||||
|           for: 2m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Host disk will fill in 24 hours (instance {{ $labels.instance }}) | ||||
|             description: Filesystem is predicted to run out of space within the next 24 hours at current write rate | ||||
|         - alert: HostOutOfInodes | ||||
|           expr: node_filesystem_files_free{mountpoint="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0 | ||||
|           for: 2m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Host out of inodes (instance {{ $labels.instance }}) | ||||
|             description: Disk is almost running out of available inodes (< 10% left) | ||||
|         - alert: HostInodesWillFillIn24Hours | ||||
|           expr: node_filesystem_files_free{mountpoint="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0 | ||||
|           for: 2m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }}) | ||||
|             description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate | ||||
|         - alert: HostUnusualDiskReadLatency | ||||
|           expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0 | ||||
|           for: 2m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Host unusual disk read latency (instance {{ $labels.instance }}) | ||||
|             description: Disk latency is growing (read operations > 100ms) | ||||
|         - alert: HostUnusualDiskWriteLatency | ||||
|           expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0 | ||||
|           for: 2m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Host unusual disk write latency (instance {{ $labels.instance }}) | ||||
|             description: Disk latency is growing (write operations > 100ms) | ||||
|         - alert: HostCpuStealNoisyNeighbor | ||||
|           expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10 | ||||
|           for: 0m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }}) | ||||
|             description: CPU steal is > 10%. A noisy neighbor is killing VM performance or a spot instance may be out of credit. | ||||
|         # 1000 context switches is an arbitrary number. | ||||
|         # Alert threshold depends on nature of application. | ||||
|         # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58 | ||||
|         - alert: HostContextSwitching | ||||
|           expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 50000 | ||||
|           for: 0m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Host context switching (instance {{ $labels.instance }}) | ||||
|             description: Context switching is growing on node (> 50000 / s) | ||||
|         - alert: HostSwapIsEnabled | ||||
|           expr: node_memory_SwapTotal_bytes > 0 | ||||
|           for: 0m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Swap is discouraged nowadays | ||||
|         - alert: HostPhysicalComponentTooHot | ||||
|           expr: node_hwmon_temp_celsius > 75 | ||||
|           for: 5m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Host physical component too hot (instance {{ $labels.instance }}) | ||||
|             description: Physical hardware component too hot | ||||
|         - alert: HostNodeOvertemperatureAlarm | ||||
|           expr: node_hwmon_temp_alarm == 1 | ||||
|           for: 0m | ||||
|           labels: | ||||
|             severity: critical | ||||
|           annotations: | ||||
|             summary: Host node overtemperature alarm (instance {{ $labels.instance }}) | ||||
|             description: Physical node temperature alarm triggered | ||||
|         - alert: HostRaidArrayGotInactive | ||||
|           expr: node_md_state{state="inactive"} > 0 | ||||
|           for: 0m | ||||
|           labels: | ||||
|             severity: critical | ||||
|           annotations: | ||||
|             summary: Host RAID array got inactive (instance {{ $labels.instance }}) | ||||
|             description: RAID array {{ $labels.device }} is in degraded state due to one or more disk failures. Number of spare drives is insufficient to fix issue automatically. | ||||
|         - alert: HostRaidDiskFailure | ||||
|           expr: node_md_disks{state="failed"} > 0 | ||||
|           for: 2m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Host RAID disk failure (instance {{ $labels.instance }}) | ||||
|             description: At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap | ||||
|         - alert: HostOomKillDetected | ||||
|           expr: increase(node_vmstat_oom_kill[1m]) > 0 | ||||
|           for: 0m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Host OOM kill detected (instance {{ $labels.instance }}) | ||||
|             description: OOM kill detected | ||||
|         - alert: HostEdacCorrectableErrorsDetected | ||||
|           expr: increase(node_edac_correctable_errors_total[1m]) > 0 | ||||
|           for: 0m | ||||
|           labels: | ||||
|             severity: info | ||||
|           annotations: | ||||
|             summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) | ||||
|             description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: HostEdacUncorrectableErrorsDetected | ||||
|           expr: node_edac_uncorrectable_errors_total > 0 | ||||
|           for: 0m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) | ||||
|             description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: HostNetworkReceiveErrors | ||||
|           expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 | ||||
|           for: 2m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Host Network Receive Errors (instance {{ $labels.instance }}) | ||||
|             description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: HostNetworkTransmitErrors | ||||
|           expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 | ||||
|           for: 2m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Host Network Transmit Errors (instance {{ $labels.instance }}) | ||||
|             description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: HostNetworkInterfaceSaturated | ||||
|           expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8 | ||||
|           for: 1m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Host Network Interface Saturated (instance {{ $labels.instance }}) | ||||
|             description: "The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded." | ||||
|         - alert: HostNetworkBondDegraded | ||||
|       expr: node_bonding_active != node_bonding_slaves{master=~"bond.*"} | ||||
|           for: 2m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Host Network Bond Degraded | ||||
|         - alert: HostConntrackLimit | ||||
|           expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8 | ||||
|           for: 5m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Host conntrack limit (instance {{ $labels.instance }}) | ||||
|             description: The number of conntrack entries is approaching the limit | ||||
|         - alert: HostClockSkew | ||||
|           expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0) | ||||
|           for: 2m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Host clock skew (instance {{ $labels.instance }}) | ||||
|             description: Clock skew detected. Clock is out of sync. | ||||
|         - alert: HostClockNotSynchronising | ||||
|           expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16 | ||||
|           for: 2m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Host clock not synchronising (instance {{ $labels.instance }}) | ||||
|             description: Clock not synchronising. | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PrometheusRule | ||||
| metadata: | ||||
|   name: smart | ||||
| spec: | ||||
|   groups: | ||||
|     - name: smart | ||||
|       rules: | ||||
|         - alert: SmartSSDWriteRateTooHigh | ||||
|           expr: rate(smartmon_total_lbas_written_raw_value[72h]) * 512 > 10000000 | ||||
|           for: 5m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: SSD write rate exceeds 10MB/s | ||||
|             description: At this rate the SSD will be worn out before warranty period expires | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PrometheusRule | ||||
| metadata: | ||||
|   name: temperatures | ||||
| spec: | ||||
|   groups: | ||||
|     - name: temperatures | ||||
|       rules: | ||||
|         - alert: HighDiskTemperature | ||||
|           expr: smartmon_airflow_temperature_cel_raw_value > 45 or smartmon_temperature_celsius_raw_value > 45 | ||||
|           for: 10m | ||||
|           labels: | ||||
|             severity: critical | ||||
|           annotations: | ||||
|             summary: High HDD/SSD temperature indicates high ambient temperature | ||||
|         - alert: HighChipsetTemperature | ||||
|           expr: node_hwmon_temp_celsius > 65 | ||||
|           for: 10m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: High chipset (CPU, NB) temperature indicates insufficient or failing fans | ||||
|         - alert: LowDiskTemperature | ||||
|           expr: smartmon_airflow_temperature_cel_raw_value < 10 or smartmon_temperature_celsius_raw_value < 10 | ||||
|           for: 10m | ||||
|           labels: | ||||
|             severity: critical | ||||
|           annotations: | ||||
|             summary: Low HDD/SSD temperature indicates low ambient temperature and stuck server room exhaust fan relay | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PodMonitor | ||||
| metadata: | ||||
|   name: node-exporter | ||||
| spec: | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app: node-exporter | ||||
|   podMetricsEndpoints: | ||||
|     - port: web | ||||
|       scrapeTimeout: 30s | ||||
|       relabelings: | ||||
|         - sourceLabels: [__meta_kubernetes_pod_node_name] | ||||
|           targetLabel: node | ||||
| --- | ||||
| apiVersion: v1 | ||||
| kind: ServiceAccount | ||||
| metadata: | ||||
|   name: node-exporter | ||||
| --- | ||||
| apiVersion: apps/v1 | ||||
| kind: DaemonSet | ||||
| metadata: | ||||
|   labels: | ||||
|     app: node-exporter | ||||
|   name: node-exporter | ||||
|   annotations: | ||||
|     keel.sh/policy: force | ||||
|     keel.sh/trigger: poll | ||||
|     keel.sh/pollSchedule: "@midnight" | ||||
| spec: | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app: node-exporter | ||||
|   template: | ||||
|     metadata: | ||||
|       labels: | ||||
|         app: node-exporter | ||||
|     spec: | ||||
|       containers: | ||||
|         - name: node-exporter | ||||
|           args: | ||||
|             - --web.listen-address=0.0.0.0:9101 | ||||
|             - --path.sysfs=/host/sys | ||||
|             - --path.rootfs=/host/root | ||||
|             - --no-collector.wifi | ||||
|             - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/) | ||||
|             - --collector.netclass.ignored-devices=^(veth|cali|vxlan|cni|vnet|tap|lo|wg) | ||||
|             - --collector.netdev.device-exclude=^(veth|cali|vxlan|cni|vnet|tap|lo|wg) | ||||
|             - --collector.diskstats.ignored-devices=^(sr[0-9][0-9]*)$ | ||||
|           image: prom/node-exporter:v1.5.0 | ||||
|           resources: | ||||
|             limits: | ||||
|               cpu: 50m | ||||
|               memory: 180Mi | ||||
|             requests: | ||||
|               cpu: 5m | ||||
|               memory: 20Mi | ||||
|           volumeMounts: | ||||
|             - mountPath: /host/sys | ||||
|               mountPropagation: HostToContainer | ||||
|               name: sys | ||||
|               readOnly: true | ||||
|             - mountPath: /host/root | ||||
|               mountPropagation: HostToContainer | ||||
|               name: root | ||||
|               readOnly: true | ||||
|           ports: | ||||
|             - containerPort: 9101 | ||||
|               name: web | ||||
|           securityContext: | ||||
|             runAsGroup: 65532 | ||||
|             runAsNonRoot: true | ||||
|             runAsUser: 65532 | ||||
|             readOnlyRootFilesystem: true | ||||
|       hostNetwork: true | ||||
|       hostPID: true | ||||
|       priorityClassName: system-node-critical | ||||
|       securityContext: | ||||
|         runAsNonRoot: true | ||||
|         runAsUser: 65534 | ||||
|       serviceAccountName: node-exporter | ||||
|       tolerations: | ||||
|         - operator: Exists | ||||
|       volumes: | ||||
|         - hostPath: | ||||
|             path: /sys | ||||
|           name: sys | ||||
|         - hostPath: | ||||
|             path: / | ||||
|           name: root | ||||
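| # Note: with hostNetwork the exporter binds port 9101 directly on every node. | ||||
| # A minimal smoke test, assuming kubectl access and that these manifests were | ||||
| # applied to the `monitoring` namespace: | ||||
| #   kubectl -n monitoring get pods -l app=node-exporter -o wide | ||||
| #   curl -s http://<node-ip>:9101/metrics | grep -c '^node_' | ||||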
| @@ -1,184 +0,0 @@ | ||||
| apiVersion: apps/v1 | ||||
| kind: Deployment | ||||
| metadata: | ||||
|   name: snmp-exporter | ||||
| spec: | ||||
|   replicas: 2 | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app: snmp-exporter | ||||
|   template: | ||||
|     metadata: | ||||
|       labels: | ||||
|         app: snmp-exporter | ||||
|     spec: | ||||
|       containers: | ||||
|         - image: prom/snmp-exporter:v0.22.0 | ||||
|           name: snmp-exporter | ||||
|           imagePullPolicy: IfNotPresent | ||||
|           securityContext: | ||||
|             runAsNonRoot: true | ||||
|             runAsUser: 1000 | ||||
|             readOnlyRootFilesystem: true | ||||
|           ports: | ||||
|           - containerPort: 9116 | ||||
|             name: exporter | ||||
|           livenessProbe: | ||||
|             httpGet: | ||||
|               path: /health | ||||
|               port: exporter | ||||
|           readinessProbe: | ||||
|             httpGet: | ||||
|               path: /health | ||||
|               port: exporter | ||||
|           volumeMounts: | ||||
|           - name: snmp-exporter | ||||
|             mountPath: /etc/snmp_exporter | ||||
|       volumes: | ||||
|         - name: snmp-exporter | ||||
|           configMap: | ||||
|             name: snmp-exporter | ||||
|       nodeSelector: | ||||
|         dedicated: monitoring | ||||
|       tolerations: | ||||
|       - key: dedicated | ||||
|         operator: Equal | ||||
|         value: monitoring | ||||
|         effect: NoSchedule | ||||
|       affinity: | ||||
|         podAntiAffinity: | ||||
|           requiredDuringSchedulingIgnoredDuringExecution: | ||||
|           - labelSelector: | ||||
|               matchExpressions: | ||||
|               - key: app | ||||
|                 operator: In | ||||
|                 values: | ||||
|                 - snmp-exporter | ||||
|             topologyKey: "kubernetes.io/hostname" | ||||
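|       # The required podAntiAffinity above keeps the two replicas on different | ||||
|       # nodes, so losing one monitoring node does not stop SNMP scraping. Note | ||||
|       # that this needs at least two nodes labelled dedicated=monitoring, | ||||
|       # otherwise the second replica stays Pending. | ||||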
| --- | ||||
| kind: Service | ||||
| apiVersion: v1 | ||||
| metadata: | ||||
|   name: snmp-exporter | ||||
| spec: | ||||
|   type: ClusterIP | ||||
|   ports: | ||||
|     - name: exporter | ||||
|       port: 9116 | ||||
|       protocol: TCP | ||||
|   selector: | ||||
|     app: snmp-exporter | ||||
| --- | ||||
| kind: Probe | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| metadata: | ||||
|   name: ups | ||||
| spec: | ||||
|   interval: 60s | ||||
|   module: rfc1628_ups | ||||
|   prober: | ||||
|     url: snmp-exporter:9116 | ||||
|     path: /snmp | ||||
|   metricRelabelings: | ||||
|     - sourceLabels: [__name__] | ||||
|       regex: '(.*)' | ||||
|       replacement: 'snmp_${1}' | ||||
|       targetLabel: __name__ | ||||
|   targets: | ||||
|     staticConfig: | ||||
|       static: | ||||
|         - ups-4.mgmt.k-space.ee | ||||
|         - ups-7.mgmt.k-space.ee | ||||
|         - ups-8.mgmt.k-space.ee | ||||
|         - ups-9.mgmt.k-space.ee | ||||
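| # The metricRelabelings above prefix every metric from this module with | ||||
| # `snmp_` (e.g. upsOutputSource is scraped as snmp_upsOutputSource, which is | ||||
| # what the PrometheusRule below matches). To probe one target by hand from | ||||
| # inside the cluster: | ||||
| #   curl 'http://snmp-exporter:9116/snmp?module=rfc1628_ups&target=ups-4.mgmt.k-space.ee' | ||||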
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PrometheusRule | ||||
| metadata: | ||||
|   name: ups | ||||
| spec: | ||||
|   groups: | ||||
|   - name: ups | ||||
|     rules: | ||||
|     - alert: UPSBatteryLost | ||||
|       annotations: | ||||
|         summary: One or more UPSes have degraded batteries. | ||||
|       expr: snmp_upsBatteryStatus{upsBatteryStatus!="batteryNormal"} > 0 | ||||
|       for: 1m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: UPSPowerLost | ||||
|       annotations: | ||||
|         summary: One or more UPSes are not in normal operation mode. This means | ||||
|           either that input power is lost or that the UPS was overloaded and is | ||||
|           now running in bypass mode. | ||||
|       expr: sum(snmp_upsOutputSource{upsOutputSource='normal'}) != 4 | ||||
|       for: 1m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: UPSExcessivelyLoaded | ||||
|       annotations: | ||||
|         summary: One or more UPSes are loaded above 80%. Make sure the load is | ||||
|           balanced across the UPSes so that no single UPS stays above 80%. | ||||
|       expr: snmp_upsOutputPercentLoad > 80 | ||||
|       for: 1h | ||||
|       labels: | ||||
|         severity: critical | ||||
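| # Caveat: the UPSPowerLost expr compares against a hardcoded count of 4, which | ||||
| # must match the number of targets in the ups Probe above. Adding or removing | ||||
| # a UPS without updating that threshold silently breaks the alert. | ||||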
| --- | ||||
| kind: Probe | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| metadata: | ||||
|   name: printer | ||||
| spec: | ||||
|   interval: 60s | ||||
|   scrapeTimeout: 50s | ||||
|   module: printer_mib | ||||
|   prober: | ||||
|     url: snmp-exporter:9116 | ||||
|     path: /snmp | ||||
|   metricRelabelings: | ||||
|     - sourceLabels: [__name__] | ||||
|       regex: '(.*)' | ||||
|       replacement: 'snmp_${1}' | ||||
|       targetLabel: __name__ | ||||
|   targets: | ||||
|     staticConfig: | ||||
|       static: | ||||
|         - mfp-chaos.pub.k-space.ee | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PrometheusRule | ||||
| metadata: | ||||
|   name: printer | ||||
| spec: | ||||
|   groups: | ||||
|   - name: printer | ||||
|     rules: | ||||
|     - alert: PrinterNeedsAttention | ||||
|       annotations: | ||||
|         summary: Printer is in an error state. If the underlying reason is 'low | ||||
|           on paper', make sure there is enough paper near the printer. If not, | ||||
|           drop a line to accounting@k-space.ee to order more office supplies. | ||||
|       expr: snmp_hrPrinterDetectedErrorState == 1 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: warning | ||||
| --- | ||||
| kind: Probe | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| metadata: | ||||
|   name: beamer | ||||
| spec: | ||||
|   interval: 60s | ||||
|   module: epson_beamer | ||||
|   prober: | ||||
|     url: snmp-exporter:9116 | ||||
|     path: /snmp | ||||
|   metricRelabelings: | ||||
|     - sourceLabels: [__name__] | ||||
|       regex: '(.*)' | ||||
|       replacement: 'snmp_${1}' | ||||
|       targetLabel: __name__ | ||||
|   targets: | ||||
|     staticConfig: | ||||
|       static: | ||||
|         - beamer-cyber.sec.k-space.ee | ||||