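# Prometheus monitoring stack for k-space.ee, deployed via prometheus-operator
# into the `monitoring` namespace.
#
# Catch-all PodMonitor: pods exposing a container port named `exporter` or
# `metrics` are scraped without needing a dedicated monitor. As an
# illustration (hypothetical app, not part of this file), a pod template
# only needs:
#
#   ports:
#     - name: metrics
#       containerPort: 9100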
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: metrics
  namespace: monitoring
spec:
  namespaceSelector: {}
  selector: {}
  podMetricsEndpoints:
    - port: exporter
    - port: metrics
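# The Prometheus deployment itself: two replicas spread across availability
# zones and pinned to control-plane nodes, with an 8GB on-disk retention cap
# on a 10Gi volume per replica. The empty *Selector fields opt in to every
# ServiceMonitor/PodMonitor/Probe/PrometheusRule in the cluster.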
---
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: prometheus
  namespace: monitoring
spec:
  enableAdminAPI: true
  topologySpreadConstraints:
    - maxSkew: 1
      topologyKey: topology.kubernetes.io/zone
      whenUnsatisfiable: DoNotSchedule
      labelSelector:
        matchLabels:
          app.kubernetes.io/instance: prometheus
          app.kubernetes.io/name: prometheus
  nodeSelector:
    node-role.kubernetes.io/control-plane: ''
  tolerations:
    - key: node-role.kubernetes.io/control-plane
      operator: Equal
      value: ''
      effect: NoSchedule
  alerting:
    alertmanagers:
      - namespace: monitoring
        name: alertmanager-operated
        port: web
  externalUrl: "https://prom.k-space.ee/"
  replicas: 2
  shards: 1
  serviceAccountName: prometheus
  securityContext:
    fsGroup: 2000
    runAsGroup: 2000
    runAsNonRoot: true
    runAsUser: 1000
  serviceMonitorNamespaceSelector: {}
  serviceMonitorSelector: {}
  podMonitorNamespaceSelector: {}
  podMonitorSelector: {}
  probeNamespaceSelector: {}
  probeSelector: {}
  ruleNamespaceSelector: {}
  ruleSelector: {}
  retentionSize: 8GB
  resources:
    limits:
      cpu: 500m
      memory: 2Gi
    requests:
      cpu: 100m
      memory: 700Mi
  storage:
    volumeClaimTemplate:
      spec:
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 10Gi
        storageClassName: prometheus
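# RBAC: the service account Prometheus runs under, plus read-only
# cluster-wide access for Kubernetes service discovery (nodes, services,
# endpoints, pods, ingresses) and for scraping the non-resource /metrics
# endpoint.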
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  - resources:
      - nodes
      - nodes/metrics
      - services
      - endpoints
      - pods
    apiGroups:
      - ""
    verbs:
      - get
      - list
      - watch
  - resources:
      - configmaps
    apiGroups:
      - ""
    verbs:
      - get
  - resources:
      - ingresses
    apiGroups:
      - networking.k8s.io
    verbs:
      - get
      - list
      - watch
  - nonResourceURLs:
      - /metrics
    verbs:
      - get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: monitoring
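# Meta-monitoring rules: alert when Prometheus or Alertmanager themselves
# misbehave (missing jobs/targets, failed config reloads, rule evaluation
# problems, notification backlog, TSDB errors).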
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus
  namespace: monitoring
spec:
  groups:
  - name: prometheus
    rules:
    - alert: PrometheusJobMissing
      annotations:
        description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus job missing (instance {{ $labels.instance }})
      expr: absent(up{job="monitoring/prometheus"})
      for: 0m
      labels:
        severity: warning
    - alert: PrometheusTargetMissing
      annotations:
        description: "A Prometheus target has disappeared. An exporter might have crashed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus target missing (instance {{ $labels.instance }})
      expr: up == 0
      for: 5m
      labels:
        severity: critical
    - alert: PrometheusAllTargetsMissing
      annotations:
        description: "A Prometheus job no longer has any living targets.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus all targets missing (instance {{ $labels.instance }})
      expr: count by (job) (up) == 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusConfigurationReloadFailure
      annotations:
        description: "Prometheus configuration reload error\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
      expr: prometheus_config_last_reload_successful != 1
      for: 0m
      labels:
        severity: warning
    - alert: PrometheusTooManyRestarts
      annotations:
        description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus too many restarts (instance {{ $labels.instance }})
      expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
      for: 0m
      labels:
        severity: warning
    - alert: PrometheusAlertmanagerJobMissing
      annotations:
        description: "A Prometheus AlertManager job has disappeared\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
      expr: absent(up{job="monitoring/alertmanager"})
      for: 0m
      labels:
        severity: warning
    - alert: PrometheusAlertmanagerConfigurationReloadFailure
      annotations:
        description: "AlertManager configuration reload error\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
      expr: alertmanager_config_last_reload_successful != 1
      for: 0m
      labels:
        severity: warning
    - alert: PrometheusAlertmanagerConfigNotSynced
      annotations:
        description: "Configurations of AlertManager cluster instances are out of sync\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
      expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
      for: 0m
      labels:
        severity: warning
    - alert: PrometheusNotConnectedToAlertmanager
      annotations:
        description: "Prometheus cannot connect to the alertmanager\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
      expr: prometheus_notifications_alertmanagers_discovered < 1
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusRuleEvaluationFailures
      annotations:
        description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
      expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusTemplateTextExpansionFailures
      annotations:
        description: "Prometheus encountered {{ $value }} template text expansion failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
      expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusRuleEvaluationSlow
      annotations:
        description: "Prometheus rule evaluation took more time than the scheduled interval. This indicates slower storage backend access or a query that is too complex.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
      expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
      for: 5m
      labels:
        severity: warning
    - alert: PrometheusNotificationsBacklog
      annotations:
        description: "The Prometheus notification queue has not been empty for 10 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus notifications backlog (instance {{ $labels.instance }})
      expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
      for: 0m
      labels:
        severity: warning
    - alert: PrometheusAlertmanagerNotificationFailing
      annotations:
        description: "Alertmanager is failing to send notifications\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
      expr: rate(alertmanager_notifications_failed_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusTargetEmpty
      annotations:
        description: "Prometheus has no target in service discovery\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus target empty (instance {{ $labels.instance }})
      expr: prometheus_sd_discovered_targets == 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusLargeScrape
      annotations:
        description: "Prometheus has many scrapes that exceed the sample limit\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus large scrape (instance {{ $labels.instance }})
      expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
      for: 5m
      labels:
        severity: warning
    - alert: PrometheusTargetScrapeDuplicate
      annotations:
        description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
      expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
      for: 0m
      labels:
        severity: warning
    - alert: PrometheusTsdbCheckpointCreationFailures
      annotations:
        description: "Prometheus encountered {{ $value }} checkpoint creation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
      expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusTsdbCheckpointDeletionFailures
      annotations:
        description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
      expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusTsdbCompactionsFailed
      annotations:
        description: "Prometheus encountered {{ $value }} TSDB compaction failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
      expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusTsdbHeadTruncationsFailed
      annotations:
        description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
      expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusTsdbReloadFailures
      annotations:
        description: "Prometheus encountered {{ $value }} TSDB reload failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
      expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusTsdbWalCorruptions
      annotations:
        description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus TSDB WAL is corrupt; make sure there is enough disk space and wipe /data/wal
      expr: increase(prometheus_tsdb_wal_corruptions_total[2h]) > 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusTsdbWalTruncationsFailed
      annotations:
        description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
      expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
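# Expose the Prometheus UI at https://prom.k-space.ee through Traefik; the
# router.middlewares annotation chains in the `prometheus` OIDC middleware
# declared at the bottom of this file.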
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: prometheus
  namespace: monitoring
  annotations:
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
    traefik.ingress.kubernetes.io/router.tls: "true"
    external-dns.alpha.kubernetes.io/target: traefik.k-space.ee
    traefik.ingress.kubernetes.io/router.middlewares: monitoring-prometheus@kubernetescrd
spec:
  rules:
  - host: prom.k-space.ee
    http:
      paths:
      - pathType: Prefix
        path: "/"
        backend:
          service:
            name: prometheus-operated
            port:
              number: 9090
  tls:
  - hosts:
    - "*.k-space.ee"
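# Same pattern for the Alertmanager UI at https://am.k-space.ee.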
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: alertmanager
  namespace: monitoring
  annotations:
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
    traefik.ingress.kubernetes.io/router.tls: "true"
    external-dns.alpha.kubernetes.io/target: traefik.k-space.ee
    traefik.ingress.kubernetes.io/router.middlewares: monitoring-alertmanager@kubernetescrd
spec:
  rules:
  - host: am.k-space.ee
    http:
      paths:
      - pathType: Prefix
        path: "/"
        backend:
          service:
            name: alertmanager-operated
            port:
              number: 9093
  tls:
  - hosts:
    - "*.k-space.ee"
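# Self-monitoring: scrape Prometheus, Alertmanager and prometheus-operator
# through their own pod ports.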
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: prometheus
  namespace: monitoring
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: prometheus
  podMetricsEndpoints:
    - port: web
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: alertmanager
  namespace: monitoring
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: alertmanager
  podMetricsEndpoints:
    - port: web
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: operator
  namespace: monitoring
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: prometheus-operator
  podMetricsEndpoints:
    - port: http
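# Kubelet and cAdvisor (per-container) metrics, scraped over HTTPS with the
# service-account bearer token; the kubelet's self-signed certificate is not
# verified (insecureSkipVerify).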
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kubelet
  namespace: monitoring
spec:
  endpoints:
  - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
    honorLabels: true
    interval: 30s
    port: https-metrics
    scheme: https
    tlsConfig:
      insecureSkipVerify: true
  - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
    honorLabels: true
    interval: 30s
    path: /metrics/cadvisor
    port: https-metrics
    scheme: https
    tlsConfig:
      insecureSkipVerify: true
  namespaceSelector:
    matchNames:
    - kube-system
  selector:
    matchLabels:
      app.kubernetes.io/name: kubelet
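# OIDC login in front of the UIs (codemowers.cloud middleware operator):
# anyone in k-space:floor may reach Prometheus, while Alertmanager is limited
# to k-space:kubernetes:admins. The authenticated identity is forwarded to
# the backend in the Remote-* headers.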
---
apiVersion: codemowers.cloud/v1beta1
kind: OIDCMiddlewareClient
metadata:
  name: prometheus
  namespace: monitoring
spec:
  displayName: Prometheus
  uri: 'https://prom.k-space.ee'
  allowedGroups:
    - k-space:floor
  headerMapping:
    email: Remote-Email
    groups: Remote-Groups
    name: Remote-Name
    user: Remote-Username
---
apiVersion: codemowers.cloud/v1beta1
kind: OIDCMiddlewareClient
metadata:
  name: alertmanager
  namespace: monitoring
spec:
  displayName: AlertManager
  uri: 'https://am.k-space.ee'
  allowedGroups:
    - k-space:kubernetes:admins
  headerMapping:
    email: Remote-Email
    groups: Remote-Groups
    name: Remote-Name
    user: Remote-Username