Migrate to Prometheus Operator

This commit is contained in:
Lauri Võsandi 2022-09-11 16:24:35 +03:00
parent ee4b1ddf57
commit 1045ed2f26
30 changed files with 32403 additions and 129 deletions


@ -1,17 +1,14 @@
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: monitoring
name: prometheus-operator
namespace: argocd
spec:
project: default
source:
repoURL: 'git@git.k-space.ee:k-space/kube.git'
path: monitoring
path: prometheus-operator
targetRevision: HEAD
destination:
server: 'https://kubernetes.default.svc'
namespace: monitoring
syncPolicy:
syncOptions:
- CreateNamespace=true
namespace: prometheus-operator

33
argocd/monitoring.yml Normal file

@ -0,0 +1,33 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: argocd
spec:
selector: {}
podMetricsEndpoints:
- port: metrics
- port: controller
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: argocd
spec:
groups:
- name: argocd
rules:
- alert: ArgoNotSynced
annotations:
summary: Some applications in Argo are out of sync
expr: sum by (dest_namespace) (argocd_app_info{sync_status!="Synced"}) > 0
for: 8h
labels:
severity: warning
- alert: ArgoNotHealthy
annotations:
summary: Some applications in Argo are not healthy
expr: argocd_app_info{health_status!="Healthy"}
for: 30m
labels:
severity: warning


@ -77,10 +77,6 @@ server:
metrics:
enabled: true
service:
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8083"
# We don't use ApplicationSet CRDs (yet)
applicationSet:
@ -89,26 +85,14 @@ applicationSet:
repoServer:
metrics:
enabled: true
service:
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8084"
notifications:
metrics:
enabled: true
service:
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9001"
controller:
metrics:
enabled: true
service:
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8082"
configs:
secret:


@ -10,11 +10,11 @@ spec:
replicas: 2
selector:
matchLabels:
app: camtiler
app.kubernetes.io/name: camtiler
template:
metadata:
labels:
app: camtiler
app.kubernetes.io/name: camtiler
component: camtiler
spec:
serviceAccountName: camtiler
@ -25,6 +25,9 @@ spec:
readOnlyRootFilesystem: true
runAsNonRoot: true
runAsUser: 1000
ports:
- containerPort: 5000
name: "http"
---
apiVersion: apps/v1
kind: Deployment
@ -38,11 +41,11 @@ spec:
replicas: 2
selector:
matchLabels:
app: log-viewer-frontend
app.kubernetes.io/name: log-viewer-frontend
template:
metadata:
labels:
app: log-viewer-frontend
app.kubernetes.io/name: log-viewer-frontend
spec:
containers:
- name: log-viewer-frontend
@ -64,11 +67,11 @@ spec:
replicas: 3
selector:
matchLabels:
app: log-viewer-backend
app.kubernetes.io/name: log-viewer-backend
template:
metadata:
labels:
app: log-viewer-backend
app.kubernetes.io/name: log-viewer-backend
spec:
containers:
- name: log-backend-backend
@ -109,7 +112,7 @@ metadata:
spec:
type: ClusterIP
selector:
app: log-viewer-frontend
app.kubernetes.io/name: log-viewer-frontend
ports:
- protocol: TCP
port: 3003
@ -121,7 +124,7 @@ metadata:
spec:
type: ClusterIP
selector:
app: log-viewer-backend
app.kubernetes.io/name: log-viewer-backend
ports:
- protocol: TCP
port: 3002
@ -130,14 +133,12 @@ apiVersion: v1
kind: Service
metadata:
name: camtiler
annotations:
prometheus.io/scrape: 'true'
labels:
component: camtiler
spec:
type: ClusterIP
selector:
app: camtiler
app.kubernetes.io/name: camtiler
component: camtiler
ports:
- protocol: TCP
@ -254,7 +255,7 @@ spec:
kubernetes.io/metadata.name: monitoring
podSelector:
matchLabels:
app: prometheus
app.kubernetes.io/name: prometheus
egress:
- to:
- ipBlock:
@ -263,7 +264,7 @@ spec:
- to:
- podSelector:
matchLabels:
app: mongodb-svc
app.kubernetes.io/name: mongodb-svc
ports:
- port: 27017
- to:
@ -298,7 +299,7 @@ spec:
kubernetes.io/metadata.name: monitoring
podSelector:
matchLabels:
app: prometheus
app.kubernetes.io/name: prometheus
- from:
- namespaceSelector:
matchLabels:
@ -314,7 +315,7 @@ metadata:
spec:
podSelector:
matchLabels:
app: log-viewer-backend
app.kubernetes.io/name: log-viewer-backend
policyTypes:
- Ingress
- Egress
@ -322,13 +323,11 @@ spec:
- to:
- podSelector:
matchLabels:
app: mongodb-svc
app.kubernetes.io/name: mongodb-svc
- to:
- podSelector:
matchLabels:
v1.min.io/tenant: minio
ports:
- port: 9000
- ipBlock:
# Minio is accessed through the public endpoint via Traefik
cidr: 193.40.103.0/24
ingress:
- from:
- namespaceSelector:
@ -345,7 +344,7 @@ metadata:
spec:
podSelector:
matchLabels:
app: log-viewer-frontend
app.kubernetes.io/name: log-viewer-frontend
policyTypes:
- Ingress
- Egress
@ -458,7 +457,6 @@ spec:
required: ["target"]
required: ["spec"]
---
---
apiVersion: codemowers.io/v1alpha1
kind: ClusterOperator
metadata:
@ -480,7 +478,7 @@ spec:
spec:
type: ClusterIP
selector:
app: foobar
app.kubernetes.io/name: foobar
component: camdetect
ports:
- protocol: TCP
@ -506,14 +504,11 @@ spec:
maxUnavailable: 1
selector:
matchLabels:
app: foobar
app.kubernetes.io/name: foobar
template:
metadata:
annotations:
prometheus.io/scrape: 'true'
prometheus.io/port: '5000'
labels:
app: foobar
app.kubernetes.io/name: foobar
component: camdetect
spec:
containers:
@ -590,9 +585,55 @@ spec:
whenUnsatisfiable: DoNotSchedule
labelSelector:
matchLabels:
app: foobar
app.kubernetes.io/name: foobar
component: camdetect
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: camtiler
spec:
selector: {}
podMetricsEndpoints:
- port: http
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: cameras
spec:
groups:
- name: cameras
rules:
- alert: CameraLost
expr: rate(camdetect_rx_frames_total[2m]) < 1
for: 2m
labels:
severity: warning
annotations:
summary: Camera feed stopped
- alert: CameraServerRoomMotion
expr: camdetect_event_active {app="camdetect-server-room"} > 0
for: 1m
labels:
severity: warning
annotations:
summary: Motion was detected in server room
- alert: CameraSlowUploads
expr: rate(camdetect_upload_dropped_frames_total[2m]) > 1
for: 5m
labels:
severity: warning
annotations:
summary: Motion detect snapshots are piling up and not getting uploaded to S3
- alert: CameraSlowProcessing
expr: rate(camdetect_download_dropped_frames_total[2m]) > 1
for: 5m
labels:
severity: warning
annotations:
summary: Motion detection processing pipeline is not keeping up with incoming frames
---
apiVersion: k-space.ee/v1alpha1
kind: Camera
metadata:


@ -42,9 +42,6 @@ spec:
metadata:
labels:
app: drone
annotations:
prometheus.io/port: "80"
prometheus.io/scrape: "true"
spec:
automountServiceAccountToken: false
securityContext:


@ -10,6 +10,9 @@ spec:
kibanaRef:
name: kibana
config:
http:
enabled: true
port: 5066
filebeat:
autodiscover:
providers:
@ -81,6 +84,14 @@ spec:
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: exporter
image: sepa/beats-exporter
args:
- -p=5066
ports:
- containerPort: 8080
name: exporter
protocol: TCP
volumes:
- name: varlogcontainers
hostPath:

16
freescout/application.yml Normal file

@ -0,0 +1,16 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: freescout
spec:
groups:
- name: freescout
rules:
- alert: FreescoutSyncBroken
expr: time() - wildduck_last_login{email=~"(info|accounting)@k-space.ee"} > 300
for: 10m
labels:
severity: warning
annotations:
summary: Freescout mailbox synchronization is broken

3
kube-system/README.md Normal file

@ -0,0 +1,3 @@
```
kubectl apply -n kube-system -f kube-state-metrics.yml
```
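
A quick sanity check after applying (a sketch, assuming the Prometheus Operator CRDs are already installed in the cluster):
```
# Wait for the kube-state-metrics Deployment to become available
kubectl -n kube-system rollout status deployment/kube-state-metrics
# The ServiceMonitor from the same manifest should exist alongside it
kubectl -n kube-system get servicemonitor kube-state-metrics
```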


@ -0,0 +1,221 @@
---
apiVersion: v1
automountServiceAccountToken: false
kind: ServiceAccount
metadata:
name: kube-state-metrics
labels:
app.kubernetes.io/name: kube-state-metrics
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: kube-state-metrics
labels:
app.kubernetes.io/name: kube-state-metrics
rules:
- apiGroups:
- ""
resources:
- configmaps
- secrets
- nodes
- pods
- services
- serviceaccounts
- resourcequotas
- replicationcontrollers
- limitranges
- persistentvolumeclaims
- persistentvolumes
- namespaces
- endpoints
verbs:
- list
- watch
- apiGroups:
- apps
resources:
- statefulsets
- daemonsets
- deployments
- replicasets
verbs:
- list
- watch
- apiGroups:
- batch
resources:
- cronjobs
- jobs
verbs:
- list
- watch
- apiGroups:
- autoscaling
resources:
- horizontalpodautoscalers
verbs:
- list
- watch
- apiGroups:
- authentication.k8s.io
resources:
- tokenreviews
verbs:
- create
- apiGroups:
- authorization.k8s.io
resources:
- subjectaccessreviews
verbs:
- create
- apiGroups:
- policy
resources:
- poddisruptionbudgets
verbs:
- list
- watch
- apiGroups:
- certificates.k8s.io
resources:
- certificatesigningrequests
verbs:
- list
- watch
- apiGroups:
- storage.k8s.io
resources:
- storageclasses
- volumeattachments
verbs:
- list
- watch
- apiGroups:
- admissionregistration.k8s.io
resources:
- mutatingwebhookconfigurations
- validatingwebhookconfigurations
verbs:
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- networkpolicies
- ingresses
verbs:
- list
- watch
- apiGroups:
- coordination.k8s.io
resources:
- leases
verbs:
- list
- watch
- apiGroups:
- rbac.authorization.k8s.io
resources:
- clusterrolebindings
- clusterroles
- rolebindings
- roles
verbs:
- list
- watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: kube-state-metrics
labels:
app.kubernetes.io/name: kube-state-metrics
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: kube-state-metrics
subjects:
- kind: ServiceAccount
name: kube-state-metrics
namespace: kube-system
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: kube-state-metrics
labels:
app.kubernetes.io/name: kube-state-metrics
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: kube-state-metrics
template:
metadata:
labels:
app.kubernetes.io/name: kube-state-metrics
spec:
automountServiceAccountToken: true
containers:
- image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.6.0
livenessProbe:
httpGet:
path: /healthz
port: 8080
initialDelaySeconds: 5
timeoutSeconds: 5
name: kube-state-metrics
ports:
- containerPort: 8080
name: http-metrics
- containerPort: 8081
name: telemetry
readinessProbe:
httpGet:
path: /
port: 8081
initialDelaySeconds: 5
timeoutSeconds: 5
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsUser: 65534
nodeSelector:
kubernetes.io/os: linux
serviceAccountName: kube-state-metrics
---
apiVersion: v1
kind: Service
metadata:
name: kube-state-metrics
labels:
app.kubernetes.io/name: kube-state-metrics
spec:
clusterIP: None
ports:
- name: http-metrics
port: 8080
targetPort: http-metrics
- name: telemetry
port: 8081
targetPort: telemetry
selector:
app.kubernetes.io/name: kube-state-metrics
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: kube-state-metrics
spec:
endpoints:
- honorLabels: true
path: /metrics
port: http-metrics
selector:
matchLabels:
app.kubernetes.io/name: kube-state-metrics


@ -7,7 +7,7 @@ and then heavily modified.
To deploy Longhorn, use the following:
```
kubectl -n longhorn-system apply -f longhorn.yaml -f ingress.yml
kubectl -n longhorn-system apply -f application.yml -f application-extras.yml
```
After deploying specify `dedicated=storage:NoSchedule`
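
Presumably this refers to tainting the dedicated storage nodes so that only workloads tolerating the taint get scheduled there; a minimal sketch, assuming a hypothetical node name `storage1`:
```
# Taint a storage node; only pods with a matching toleration will be scheduled on it
kubectl taint nodes storage1 dedicated=storage:NoSchedule
# Verify the taint
kubectl describe node storage1 | grep Taints
```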


@ -0,0 +1,126 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: longhorn-dashboard
namespace: longhorn-system
annotations:
kubernetes.io/ingress.class: traefik
cert-manager.io/cluster-issuer: default
external-dns.alpha.kubernetes.io/target: traefik.k-space.ee
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.middlewares: traefik-sso@kubernetescrd
traefik.ingress.kubernetes.io/router.tls: "true"
spec:
rules:
- host: longhorn.k-space.ee
http:
paths:
- pathType: Prefix
path: "/"
backend:
service:
name: longhorn-frontend
port:
number: 80
tls:
- hosts:
- longhorn.k-space.ee
secretName: longhorn-tls
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: manager
spec:
selector: {}
podMetricsEndpoints:
- port: manager
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: longhorn
spec:
# Copied from https://longhorn.io/docs/1.2.4/monitoring/alert-rules-example/
groups:
- name: longhorn
rules:
- alert: LonghornVolumeActualSpaceUsedWarning
annotations:
description: The accumulated snapshots for volume use up more space than the volume's capacity
summary: The actual used space of Longhorn volume is twice the size of the volume capacity.
expr: longhorn_volume_actual_size_bytes > longhorn_volume_capacity_bytes * 2
for: 5m
labels:
issue: The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high.
severity: warning
- alert: LonghornVolumeStatusCritical
annotations:
description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for
more than 5 minutes.
summary: Longhorn volume {{$labels.volume}} is Fault
expr: longhorn_volume_robustness == 3
for: 5m
labels:
issue: Longhorn volume {{$labels.volume}} is Fault.
severity: critical
- alert: LonghornVolumeStatusWarning
annotations:
description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded for
more than 5 minutes.
summary: Longhorn volume {{$labels.volume}} is Degraded
expr: longhorn_volume_robustness == 2
for: 5m
labels:
issue: Longhorn volume {{$labels.volume}} is Degraded.
severity: warning
- alert: LonghornNodeStorageWarning
annotations:
description: The used storage of node {{$labels.node}} is at {{$value}}% capacity for
more than 5 minutes.
summary: The used storage of node is over 70% of the capacity.
expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70
for: 5m
labels:
issue: The used storage of node {{$labels.node}} is high.
severity: warning
- alert: LonghornDiskStorageWarning
annotations:
description: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is at {{$value}}% capacity for
more than 5 minutes.
summary: The used storage of disk is over 70% of the capacity.
expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70
for: 5m
labels:
issue: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high.
severity: warning
- alert: LonghornNodeDown
annotations:
description: There are {{$value}} Longhorn nodes which have been offline for more than 5 minutes.
summary: Longhorn nodes are offline
expr: (avg(longhorn_node_count_total) or on() vector(0)) - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0)) > 0
for: 5m
labels:
issue: There are {{$value}} Longhorn nodes offline
severity: critical
- alert: LonghornInstanceManagerCPUUsageWarning
annotations:
description: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has had CPU usage / CPU request at {{$value}}% for
more than 5 minutes.
summary: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} CPU usage / CPU request is over 300%.
expr: (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) * 100 > 300
for: 5m
labels:
issue: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} consumes 3 times the CPU request.
severity: warning
- alert: LonghornNodeCPUUsageWarning
annotations:
description: Longhorn node {{$labels.node}} has had CPU usage / CPU capacity at {{$value}}% for
more than 5 minutes.
summary: Longhorn node {{$labels.node}} experiences high CPU pressure for more than 5m.
expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90
for: 5m
labels:
issue: Longhorn node {{$labels.node}} experiences high CPU pressure.
severity: warning


@ -1,28 +0,0 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: longhorn-dashboard
namespace: longhorn-system
annotations:
kubernetes.io/ingress.class: traefik
cert-manager.io/cluster-issuer: default
external-dns.alpha.kubernetes.io/target: traefik.k-space.ee
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.middlewares: traefik-sso@kubernetescrd
traefik.ingress.kubernetes.io/router.tls: "true"
spec:
rules:
- host: longhorn.k-space.ee
http:
paths:
- pathType: Prefix
path: "/"
backend:
service:
name: longhorn-frontend
port:
number: 80
tls:
- hosts:
- longhorn.k-space.ee
secretName: longhorn-tls


@ -1,27 +0,0 @@
persistence:
defaultClassReplicaCount: 2
defaultSettings:
defaultDataLocality: best-effort
taintToleration: "dedicated=storage:NoSchedule"
systemManagedComponentsNodeSelector: "dedicated:storage"
longhornDriver:
tolerations:
- key: dedicated
operator: Equal
value: storage
effect: NoSchedule
longhornUI:
tolerations:
- key: dedicated
operator: Equal
value: storage
effect: NoSchedule
ingress:
enabled: true
host: longhorn.k-space.ee
tls: true
tlsSecret: longhorn-tls


@ -67,6 +67,11 @@ spec:
items:
type: object
x-kubernetes-preserve-unknown-fields: true
customresources:
type: array
items:
type: object
x-kubernetes-preserve-unknown-fields: true
required: ["spec"]
---
apiVersion: apps/v1
@ -178,12 +183,21 @@ rules:
- apiGroups:
- codemowers.io
resources:
- bindzones
- clusteroperators
- keydbs
verbs:
- get
- list
- watch
- apiGroups:
- k-space.ee
resources:
- cams
verbs:
- get
- list
- watch
---
apiVersion: v1
kind: ServiceAccount


@ -120,7 +120,7 @@ spec:
type: ClusterIP
clusterIP: None
ports:
- name: "server"
- name: redis
port: 6379
protocol: TCP
targetPort: redis
@ -137,14 +137,14 @@ spec:
spec:
type: ClusterIP
ports:
- name: "server"
- name: redis
port: 6379
protocol: TCP
targetPort: redis
- name: "redis-exporter"
- name: exporter
port: 9121
protocol: TCP
targetPort: redis-exporter
targetPort: exporter
selector:
app.kubernetes.io/name: foobar
sessionAffinity: ClientIP
@ -163,9 +163,6 @@ spec:
app.kubernetes.io/name: foobar
template:
metadata:
annotations:
prometheus.io/port: "9121"
prometheus.io/scrape: "true"
labels:
app.kubernetes.io/name: foobar
spec:
@ -237,10 +234,10 @@ spec:
envFrom:
- secretRef:
name: foobar-secrets
- name: redis-exporter
- name: exporter
image: quay.io/oliver006/redis_exporter
ports:
- name: metrics
- name: exporter
containerPort: 9121
envFrom:
- secretRef:


@ -1,4 +1,14 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: monitoring
namespace: metallb-system
spec:
selector: {}
podMetricsEndpoints:
- port: monitoring
---
apiVersion: metallb.io/v1beta1
kind: MetalLB
metadata:


@ -0,0 +1,19 @@
# Prometheus operator
```
curl -L https://github.com/prometheus-operator/prometheus-operator/releases/download/v0.59.0/bundle.yaml | sed -e 's/namespace: default/namespace: prometheus-operator/g' > bundle.yml
kubectl create namespace prometheus-operator
kubectl apply --server-side -n prometheus-operator -f bundle.yml
kubectl delete -n prometheus-operator configmap snmp-exporter
kubectl create -n prometheus-operator configmap snmp-exporter --from-file=snmp.yml
kubectl apply -n prometheus-operator -f application.yml -f node-exporter.yml -f blackbox-exporter.yml -f snmp-exporter.yml -f mikrotik-exporter.yml
```
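
A quick way to confirm the operator and its CRDs came up (a sketch, assuming the commands above completed without errors):
```
# Operator, Prometheus and Alertmanager pods should appear here
kubectl -n prometheus-operator get pods
# The monitoring.coreos.com CRDs (Prometheus, Alertmanager, PodMonitor, Probe, ...) should be registered
kubectl get crd | grep monitoring.coreos.com
```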
# Mikrotik exporter
```
kubectl create -n prometheus-operator secret generic mikrotik-exporter \
--from-literal=MIKROTIK_PASSWORD='f7W!H*Pu' \
--from-literal=PROMETHEUS_BEARER_TOKEN=$(cat /dev/urandom | base64 | head -c 30)
```
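
The generated `PROMETHEUS_BEARER_TOKEN` can be read back from the secret later, e.g. when wiring up the scraping side (a sketch, assuming the secret above was created):
```
# Decode the bearer token stored in the mikrotik-exporter secret
kubectl -n prometheus-operator get secret mikrotik-exporter \
  -o jsonpath='{.data.PROMETHEUS_BEARER_TOKEN}' | base64 -d; echo
```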


@ -0,0 +1,762 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: metrics
spec:
namespaceSelector: {}
selector: {}
podMetricsEndpoints:
- port: exporter
- port: metrics
---
apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
name: alertmanager
spec:
nodeSelector:
dedicated: monitoring
tolerations:
- key: dedicated
operator: Equal
value: monitoring
effect: NoSchedule
replicas: 3
serviceAccountName: alertmanager
externalUrl: http://am.k-space.ee/
routePrefix: "/"
securityContext:
fsGroup: 2000
runAsGroup: 2000
runAsNonRoot: true
runAsUser: 1000
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: alertmanager
---
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
name: prometheus
spec:
nodeSelector:
dedicated: monitoring
tolerations:
- key: dedicated
operator: Equal
value: monitoring
effect: NoSchedule
alerting:
alertmanagers:
- namespace: prometheus-operator
name: alertmanager
port: http
pathPrefix: "/"
apiVersion: v2
externalUrl: "http://prom.k-space.ee/"
replicas: 2
shards: 1
serviceAccountName: prometheus
securityContext:
fsGroup: 2000
runAsGroup: 2000
runAsNonRoot: true
runAsUser: 1000
serviceMonitorNamespaceSelector: {}
serviceMonitorSelector: {}
podMonitorNamespaceSelector: {}
podMonitorSelector: {}
probeNamespaceSelector: {}
probeSelector: {}
ruleNamespaceSelector: {}
ruleSelector: {}
retentionSize: 80GB
storage:
volumeClaimTemplate:
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 100Gi
storageClassName: local-path
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus
rules:
- apiGroups: [""]
resources:
- nodes
- nodes/metrics
- services
- endpoints
- pods
verbs: ["get", "list", "watch"]
- apiGroups: [""]
resources:
- configmaps
verbs: ["get"]
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: prometheus-operator
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: prometheus
spec:
groups:
- name: prometheus
rules:
- alert: PrometheusJobMissing
annotations:
description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n \
\ LABELS = {{ $labels }}"
summary: Prometheus job missing (instance {{ $labels.instance }})
expr: absent(up{job="prometheus-operator/prometheus"})
for: 0m
labels:
severity: warning
- alert: PrometheusTargetMissing
annotations:
description: "A Prometheus target has disappeared. An exporter might be crashed.\n\
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus target missing (instance {{ $labels.instance }})
expr: up == 0
for: 5m
labels:
severity: critical
- alert: PrometheusAllTargetsMissing
annotations:
description: "A Prometheus job does not have living target anymore.\n VALUE\
\ = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus all targets missing (instance {{ $labels.instance }})
expr: count by (job) (up) == 0
for: 0m
labels:
severity: critical
- alert: PrometheusConfigurationReloadFailure
annotations:
description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n\
\ LABELS = {{ $labels }}"
summary: Prometheus configuration reload failure (instance {{ $labels.instance
}})
expr: prometheus_config_last_reload_successful != 1
for: 0m
labels:
severity: warning
- alert: PrometheusTooManyRestarts
annotations:
description: "Prometheus has restarted more than twice in the last 15 minutes.\
\ It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels\
\ }}"
summary: Prometheus too many restarts (instance {{ $labels.instance }})
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m])
> 2
for: 0m
labels:
severity: warning
- alert: PrometheusAlertmanagerJobMissing
annotations:
description: "A Prometheus AlertManager job has disappeared\n VALUE = {{\
\ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus AlertManager job missing (instance {{ $labels.instance
}})
expr: absent(up{job="prometheus-operator/alertmanager"})
for: 0m
labels:
severity: warning
- alert: PrometheusAlertmanagerConfigurationReloadFailure
annotations:
description: "AlertManager configuration reload error\n VALUE = {{ $value\
\ }}\n LABELS = {{ $labels }}"
summary: Prometheus AlertManager configuration reload failure (instance {{
$labels.instance }})
expr: alertmanager_config_last_reload_successful != 1
for: 0m
labels:
severity: warning
- alert: PrometheusAlertmanagerConfigNotSynced
annotations:
description: "Configurations of AlertManager cluster instances are out of\
\ sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus AlertManager config not synced (instance {{ $labels.instance
}})
expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
for: 0m
labels:
severity: warning
- alert: PrometheusNotConnectedToAlertmanager
annotations:
description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value\
\ }}\n LABELS = {{ $labels }}"
summary: Prometheus not connected to alertmanager (instance {{ $labels.instance
}})
expr: prometheus_notifications_alertmanagers_discovered < 1
for: 0m
labels:
severity: critical
- alert: PrometheusRuleEvaluationFailures
annotations:
description: "Prometheus encountered {{ $value }} rule evaluation failures,\
\ leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS\
\ = {{ $labels }}"
summary: Prometheus rule evaluation failures (instance {{ $labels.instance
}})
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTemplateTextExpansionFailures
annotations:
description: "Prometheus encountered {{ $value }} template text expansion\
\ failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus template text expansion failures (instance {{ $labels.instance
}})
expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusRuleEvaluationSlow
annotations:
description: "Prometheus rule evaluation took more time than the scheduled\
\ interval. It indicates a slower storage backend access or too complex\
\ query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
for: 5m
labels:
severity: warning
- alert: PrometheusNotificationsBacklog
annotations:
description: "The Prometheus notification queue has not been empty for 10\
\ minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus notifications backlog (instance {{ $labels.instance }})
expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
for: 0m
labels:
severity: warning
- alert: PrometheusAlertmanagerNotificationFailing
annotations:
description: "Alertmanager is failing sending notifications\n VALUE = {{\
\ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus AlertManager notification failing (instance {{ $labels.instance
}})
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTargetEmpty
annotations:
description: "Prometheus has no target in service discovery\n VALUE = {{\
\ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus target empty (instance {{ $labels.instance }})
expr: prometheus_sd_discovered_targets == 0
for: 0m
labels:
severity: critical
- alert: PrometheusLargeScrape
annotations:
description: "Prometheus has many scrapes that exceed the sample limit\n \
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus large scrape (instance {{ $labels.instance }})
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) >
10
for: 5m
labels:
severity: warning
- alert: PrometheusTargetScrapeDuplicate
annotations:
description: "Prometheus has many samples rejected due to duplicate timestamps\
\ but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus target scrape duplicate (instance {{ $labels.instance
}})
expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m])
> 0
for: 0m
labels:
severity: warning
- alert: PrometheusTsdbCheckpointCreationFailures
annotations:
description: "Prometheus encountered {{ $value }} checkpoint creation failures\n\
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance
}})
expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTsdbCheckpointDeletionFailures
annotations:
description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n\
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance
}})
expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTsdbCompactionsFailed
annotations:
description: "Prometheus encountered {{ $value }} TSDB compactions failures\n\
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB compactions failed (instance {{ $labels.instance
}})
expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTsdbHeadTruncationsFailed
annotations:
description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n\
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance
}})
expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTsdbReloadFailures
annotations:
description: "Prometheus encountered {{ $value }} TSDB reload failures\n \
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTsdbWalCorruptions
annotations:
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n \
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB WAL is corrupt, make sure there is enough disk space
and wipe /data/wal
expr: increase(prometheus_tsdb_wal_corruptions_total[2h]) > 0
for: 0m
labels:
severity: critical
- alert: PrometheusTsdbWalTruncationsFailed
annotations:
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n\
\ VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance
}})
expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: prometheus
annotations:
cert-manager.io/cluster-issuer: default
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.tls: "true"
external-dns.alpha.kubernetes.io/target: traefik.k-space.ee
traefik.ingress.kubernetes.io/router.middlewares: traefik-sso@kubernetescrd
spec:
rules:
- host: prom.k-space.ee
http:
paths:
- pathType: Prefix
path: "/"
backend:
service:
name: prometheus-operated
port:
number: 9090
tls:
- hosts:
- prom.k-space.ee
secretName: prom-tls
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: alertmanager
annotations:
cert-manager.io/cluster-issuer: default
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.tls: "true"
external-dns.alpha.kubernetes.io/target: traefik.k-space.ee
traefik.ingress.kubernetes.io/router.middlewares: traefik-sso@kubernetescrd
spec:
rules:
- host: am.k-space.ee
http:
paths:
- pathType: Prefix
path: "/"
backend:
service:
name: alertmanager-operated
port:
number: 9093
tls:
- hosts:
- am.k-space.ee
secretName: alertmanager-tls
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: prometheus
spec:
selector:
matchLabels:
app.kubernetes.io/name: prometheus
podMetricsEndpoints:
- port: web
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: alertmanager
spec:
selector:
matchLabels:
app.kubernetes.io/name: alertmanager
podMetricsEndpoints:
- port: web
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: operator
spec:
selector:
matchLabels:
app.kubernetes.io/name: prometheus-operator
podMetricsEndpoints:
- port: http
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: kubelet
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
interval: 30s
port: https-metrics
scheme: https
tlsConfig:
insecureSkipVerify: true
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
interval: 30s
path: /metrics/cadvisor
port: https-metrics
scheme: https
tlsConfig:
insecureSkipVerify: true
namespaceSelector:
matchNames:
- kube-system
selector:
matchLabels:
app.kubernetes.io/name: kubelet
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: kube-state-metrics
spec:
groups:
- name: kube-state-metrics
rules:
- alert: KubernetesNodeReady
expr: kube_node_status_condition{condition="Ready",status="true"} == 0
for: 10m
labels:
severity: critical
annotations:
summary: Kubernetes Node ready (instance {{ $labels.instance }})
description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesMemoryPressure
expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
for: 2m
labels:
severity: critical
annotations:
summary: Kubernetes memory pressure (instance {{ $labels.instance }})
description: "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDiskPressure
expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
for: 2m
labels:
severity: critical
annotations:
summary: Kubernetes disk pressure (instance {{ $labels.instance }})
description: "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesOutOfDisk
expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
for: 2m
labels:
severity: critical
annotations:
summary: Kubernetes out of disk (instance {{ $labels.instance }})
description: "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesOutOfCapacity
expr: sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90
for: 2m
labels:
severity: warning
annotations:
summary: Kubernetes out of capacity (instance {{ $labels.instance }})
description: "{{ $labels.node }} is out of capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesContainerOomKiller
expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
for: 0m
labels:
severity: warning
annotations:
summary: Kubernetes container oom killer (instance {{ $labels.instance }})
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesJobFailed
expr: kube_job_status_failed > 0
for: 0m
labels:
severity: warning
annotations:
summary: Kubernetes Job failed (instance {{ $labels.instance }})
description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesCronjobSuspended
expr: kube_cronjob_spec_suspend != 0
for: 0m
labels:
severity: warning
annotations:
summary: Kubernetes CronJob suspended (instance {{ $labels.instance }})
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPersistentvolumeclaimPending
expr: kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1
for: 2m
labels:
severity: warning
annotations:
summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }})
description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesVolumeOutOfDiskSpace
expr: kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10
for: 2m
labels:
severity: warning
annotations:
summary: Kubernetes Volume out of disk space (instance {{ $labels.instance }})
description: "Volume is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesVolumeFullInFourDays
expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0
for: 0m
labels:
severity: critical
annotations:
summary: Kubernetes Volume full in four days (instance {{ $labels.instance }})
description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPersistentvolumeError
expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0
for: 0m
labels:
severity: critical
annotations:
summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }})
description: "Persistent volume is in bad state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetDown
expr: (kube_statefulset_status_replicas_ready / kube_statefulset_status_replicas_current) != 1
for: 1m
labels:
severity: critical
annotations:
summary: Kubernetes StatefulSet down (instance {{ $labels.instance }})
description: "A StatefulSet went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesHpaScalingAbility
expr: kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1
for: 2m
labels:
severity: warning
annotations:
summary: Kubernetes HPA scaling ability (instance {{ $labels.instance }})
description: "Pod is unable to scale\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesHpaMetricAvailability
expr: kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1
for: 0m
labels:
severity: warning
annotations:
summary: Kubernetes HPA metric availability (instance {{ $labels.instance }})
description: "HPA is not able to collect metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesHpaScaleCapability
expr: kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas
for: 2m
labels:
severity: info
annotations:
summary: Kubernetes HPA scale capability (instance {{ $labels.instance }})
description: "The maximum number of desired Pods has been hit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPodNotHealthy
expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[15m:1m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: Kubernetes Pod not healthy (instance {{ $labels.instance }})
description: "Pod has been in a non-ready state for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPodCrashLooping
expr: increase(kube_pod_container_status_restarts_total[1m]) > 3
for: 2m
labels:
severity: warning
annotations:
summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
description: "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesReplicasetMismatch
expr: kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas
for: 10m
labels:
severity: warning
annotations:
summary: Kubernetes ReplicaSet mismatch (instance {{ $labels.instance }})
description: "Deployment Replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDeploymentReplicasMismatch
expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
for: 10m
labels:
severity: warning
annotations:
summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }})
description: "Deployment Replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetReplicasMismatch
expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas
for: 10m
labels:
severity: warning
annotations:
summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})
description: "A StatefulSet does not match the expected number of replicas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDeploymentGenerationMismatch
expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
for: 10m
labels:
severity: critical
annotations:
summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})
description: "A Deployment has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetGenerationMismatch
expr: kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation
for: 10m
labels:
severity: critical
annotations:
summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }})
description: "A StatefulSet has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetUpdateNotRolledOut
expr: max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)
for: 10m
labels:
severity: warning
annotations:
summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }})
description: "StatefulSet update has not been rolled out.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDaemonsetRolloutStuck
expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
for: 10m
labels:
severity: warning
annotations:
summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }})
description: "Some Pods of DaemonSet are not scheduled or not ready\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDaemonsetMisscheduled
expr: kube_daemonset_status_number_misscheduled > 0
for: 1m
labels:
severity: critical
annotations:
summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }})
description: "Some DaemonSet Pods are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesCronjobTooLong
expr: time() - kube_cronjob_next_schedule_time > 3600
for: 0m
labels:
severity: warning
annotations:
summary: Kubernetes CronJob too long (instance {{ $labels.instance }})
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesJobSlowCompletion
expr: kube_job_spec_completions - kube_job_status_succeeded > 0
for: 12h
labels:
severity: critical
annotations:
summary: Kubernetes job slow completion (instance {{ $labels.instance }})
description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesApiServerErrors
expr: sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3
for: 2m
labels:
severity: critical
annotations:
summary: Kubernetes API server errors (instance {{ $labels.instance }})
description: "Kubernetes API server is experiencing high error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesApiClientErrors
expr: (sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1
for: 2m
labels:
severity: critical
annotations:
summary: Kubernetes API client errors (instance {{ $labels.instance }})
description: "Kubernetes API client is experiencing high error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesClientCertificateExpiresNextWeek
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60
for: 0m
labels:
severity: warning
annotations:
summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
description: "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesClientCertificateExpiresSoon
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60
for: 0m
labels:
severity: critical
annotations:
summary: Kubernetes client certificate expires soon (instance {{ $labels.instance }})
description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesApiServerLatency
expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) / 1e+06 > 1
for: 2m
labels:
severity: warning
annotations:
summary: Kubernetes API server latency (instance {{ $labels.instance }})
description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"


@ -0,0 +1,258 @@
---
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
name: websites
spec:
prober:
url: blackbox-exporter
path: /probe
module: http_2xx
targets:
staticConfig:
static:
- https://git.k-space.ee/
- https://grafana.k-space.ee/
- https://wiki.k-space.ee/
- https://pad.k-space.ee/
- https://members.k-space.ee/
- https://nextcloud.k-space.ee/
- http://minio.infra.k-space.ee:9001/login
---
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
name: k6.ee
spec:
prober:
url: blackbox-exporter
path: /probe
module: dns_check_traefik
targets:
staticConfig:
static:
- 193.40.103.2
- 62.65.250.2
---
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
name: samba-cluster
spec:
prober:
url: blackbox-exporter
path: /probe
module: tcp_connect
targets:
staticConfig:
static:
- dc1.ad.k-space.ee:636
- dc2.ad.k-space.ee:636
- dc3.ad.k-space.ee:636
---
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
name: misc
spec:
prober:
url: blackbox-exporter
path: /probe
module: tcp_connect
targets:
staticConfig:
static:
- mail.k-space.ee:465
- dev.k-space.ee:10648
- mariadb.infra.k-space.ee:3306
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: blackbox-exporter
spec:
# https://awesome-prometheus-alerts.grep.to/rules#blackbox
groups:
- name: blackbox
rules:
- alert: BlackboxProbeFailed
expr: probe_success == 0
for: 2m
labels:
severity: critical
annotations:
summary: Blackbox probe failed (instance {{ $labels.instance }})
description: Probe failed
- alert: BlackboxSlowProbe
expr: avg_over_time(probe_duration_seconds[1m]) > 1
for: 5m
labels:
severity: warning
annotations:
summary: Blackbox slow probe (instance {{ $labels.instance }})
description: Blackbox probe took more than 1s to complete
- alert: BlackboxSlowDNS
expr: avg_over_time(probe_dns_lookup_time_seconds[1m]) > 1
for: 5m
labels:
severity: warning
annotations:
summary: Blackbox slow DNS lookup (instance {{ $labels.instance }})
description: Blackbox DNS lookup took more than 1s to complete.
It seemed that using IPv6 DNS servers in conjunction with Docker resulted
in an odd 5s latency bump. For now we're using 8.8.8.8 because of that.
- alert: BlackboxProbeHttpFailure
expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
for: 5m
labels:
severity: critical
annotations:
summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
description: HTTP status code is not 200-399
- alert: BlackboxSslCertificateWillExpireSoon
expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
for: 0m
labels:
severity: warning
annotations:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
description: SSL certificate expires in 30 days
- alert: BlackboxSslCertificateWillExpireSoon
expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
description: SSL certificate expires in 3 days
- alert: BlackboxSslCertificateExpired
expr: probe_ssl_earliest_cert_expiry - time() <= 0
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
description: SSL certificate has expired already
- alert: BlackboxProbeSlowHttp
expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
for: 1m
labels:
severity: warning
annotations:
summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
description: HTTP request took more than 1s
- alert: BlackboxProbeSlowPing
expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
for: 1m
labels:
severity: warning
annotations:
summary: Blackbox probe slow ping (instance {{ $labels.instance }})
description: Blackbox ping took more than 1s
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: blackbox-exporter
spec:
revisionHistoryLimit: 0
replicas: 2
selector:
matchLabels:
app: blackbox-exporter
template:
metadata:
labels:
app: blackbox-exporter
spec:
containers:
- name: blackbox-exporter
image: prom/blackbox-exporter:v0.20.0
volumeMounts:
- name: blackbox-exporter-config
mountPath: /etc/blackbox_exporter
volumes:
- name: blackbox-exporter-config
configMap:
name: blackbox-exporter-config
# TODO: Results in odd 6s connection lag if scheduled in VLAN20
nodeSelector:
dedicated: monitoring
tolerations:
- key: dedicated
operator: Equal
value: monitoring
effect: NoSchedule
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: app
operator: In
values:
- blackbox-exporter
topologyKey: "kubernetes.io/hostname"
---
kind: Service
apiVersion: v1
metadata:
name: blackbox-exporter
spec:
type: ClusterIP
ports:
- name: http
port: 80
protocol: TCP
targetPort: 9115
selector:
app: blackbox-exporter
---
apiVersion: v1
kind: ConfigMap
metadata:
name: blackbox-exporter-config
data:
config.yml: |-
modules:
http_2xx:
prober: http
http:
preferred_ip_protocol: "ip4"
ip_protocol_fallback: false
http_post_2xx:
prober: http
http:
method: POST
preferred_ip_protocol: "ip4"
ip_protocol_fallback: false
tcp_connect:
prober: tcp
tcp:
preferred_ip_protocol: "ip4"
ip_protocol_fallback: false
icmp:
prober: icmp
icmp:
preferred_ip_protocol: "ip4"
ip_protocol_fallback: false
dns_check_traefik:
prober: dns
dns:
query_name: "traefik.k-space.ee"
query_type: "A"
validate_answer_rrs:
fail_if_not_matches_regexp:
- "traefik\\.k-space\\.ee\\.\\t.*\\tIN\\tA\\t193\\.40\\.103\\.[1-9][0-9]*"
preferred_ip_protocol: "ip4"
ip_protocol_fallback: false
dns_check_k6:
prober: dns
dns:
query_name: "k6.ee"
query_type: "A"
validate_answer_rrs:
fail_if_not_matches_regexp:
- "k6\\.ee\\.\\t.*\\tIN\\tA\\t193\\.40\\.103\\.[1-9][0-9]*"
preferred_ip_protocol: "ip4"
ip_protocol_fallback: false

28816
prometheus-operator/bundle.yml Normal file

File diff suppressed because it is too large


@ -0,0 +1,104 @@
---
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
name: mikrotik
spec:
bearerTokenSecret:
name: mikrotik-exporter
key: PROMETHEUS_BEARER_TOKEN
prober:
path: /metrics
url: mikrotik-exporter
targets:
staticConfig:
static:
- router.mgmt.k-space.ee
- sw_chaos.mgmt.k-space.ee
- sw_poe.mgmt.k-space.ee
- sw_mgmt.mgmt.k-space.ee
- sw_core02.mgmt.k-space.ee
- sw_cyber.mgmt.k-space.ee
- sw_ha.mgmt.k-space.ee
- sw_asocial.mgmt.k-space.ee
- sw_kitchen.mgmt.k-space.ee
- sw_core01.mgmt.k-space.ee
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: mikrotik
spec:
groups:
- name: mikrotik
rules:
- alert: MikrotikUplinkRedundancyLost
expr: mikrotik_interface_running{port=~"sfp-sfpplus[12]", instance!~"sw_core.*", instance!~"sw_mgmt.*"} == 0
for: 0m
labels:
severity: error
annotations:
summary: Switch uplink high availability lost
description: One of the two 10Gb optical links is malfunctioning
- alert: MikrotikLinkRateDegraded
expr: mikrotik_interface_rate{port=~"sfp-sfpplus.*"} < 10000000000
for: 0m
labels:
severity: error
annotations:
summary: 10Gb link degraded
description: One of the 10Gb links is running at lower speed
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: mikrotik-exporter
spec:
revisionHistoryLimit: 0
replicas: 2
selector:
matchLabels:
app: mikrotik-exporter
template:
metadata:
labels:
app: mikrotik-exporter
annotations:
co.elastic.logs/multiline.pattern: '^ '
co.elastic.logs/multiline.negate: "false"
co.elastic.logs/multiline.match: after
spec:
containers:
- name: mikrotik-exporter
image: harbor.k-space.ee/k-space/mikrotik-exporter:latest
env:
- name: MIKROTIK_USER
value: netpoller
envFrom:
- secretRef:
name: mikrotik-exporter
nodeSelector:
dedicated: monitoring
tolerations:
- key: dedicated
operator: Equal
value: monitoring
effect: NoSchedule
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- topologyKey: "kubernetes.io/hostname"
---
kind: Service
apiVersion: v1
metadata:
name: mikrotik-exporter
spec:
type: ClusterIP
ports:
- name: http
port: 80
protocol: TCP
targetPort: 3001
selector:
app: mikrotik-exporter


@ -0,0 +1,443 @@
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
name: nodes-proxmox
spec:
targets:
staticConfig:
static:
- nas.mgmt.k-space.ee:9100
- pve1.proxmox.infra.k-space.ee:9100
- pve8.proxmox.infra.k-space.ee:9100
- pve9.proxmox.infra.k-space.ee:9100
relabelingConfigs:
- sourceLabels: [__param_target]
targetLabel: instance
- sourceLabels: [__param_target]
targetLabel: __address__
prober:
url: localhost
path: /metrics
metricRelabelings:
- sourceLabels: [__address__]
targetLabel: target
---
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
name: nodes-misc
spec:
targets:
staticConfig:
static:
- sprucecone.infra.k-space.ee:9100
- cedarcone.infra.k-space.ee:9100
relabelingConfigs:
- sourceLabels: [__param_target]
targetLabel: instance
- sourceLabels: [__param_target]
targetLabel: __address__
prober:
url: localhost
path: /metrics
metricRelabelings:
- sourceLabels: [__address__]
targetLabel: target
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: node-exporter
spec:
groups:
- name: node-exporter
rules:
- alert: ZfsOfflinePool
expr: node_zfs_zpool_state{state!="online"} > 0
for: 1m
labels:
severity: critical
annotations:
summary: ZFS offline pool (instance {{ $labels.instance }})
description: "A ZFS zpool is in a unexpected state: {{ $labels.state }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostHighLoad
expr: sum(node_load1{}) by (instance) / count(node_cpu_seconds_total{mode="user"}) by (instance) > 2.5
for: 15m
labels:
severity: warning
annotations:
summary: Host under high load
description: Many processes are queued up for execution
- alert: HostOutOfMemory
expr: (node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_MemFree_bytes ) / node_memory_MemTotal_bytes * 100 < 20
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: Node memory is filling up (< 20% left)
- alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
for: 2m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: The node is under heavy memory pressure. High rate of major page faults
- alert: HostUnusualNetworkThroughputIn
expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) > 160e+06
for: 1h
labels:
severity: warning
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: Host network interfaces are probably receiving too much data (> 160 MB/s)
- alert: HostUnusualNetworkThroughputOut
expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) > 160e+06
for: 1h
labels:
severity: warning
annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: Host network interfaces are probably sending too much data (> 160 MB/s)
- alert: HostUnusualDiskReadRate
expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) > 50000000
for: 1h
labels:
severity: warning
annotations:
summary: Host unusual disk read rate (instance {{ $labels.instance }})
description: Disk is probably reading too much data (> 50 MB/s)
- alert: HostUnusualDiskWriteRate
expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) > 50000000
for: 1h
labels:
severity: warning
annotations:
summary: Host unusual disk write rate (instance {{ $labels.instance }})
description: Disk is probably writing too much data (> 50 MB/s)
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: Disk is almost full (< 10% left)
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostDiskWillFillIn24Hours
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
description: Filesystem is predicted to run out of space within the next 24 hours at current write rate
- alert: HostOutOfInodes
expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host out of inodes (instance {{ $labels.instance }})
description: Disk is almost running out of available inodes (< 10% left)
- alert: HostInodesWillFillIn24Hours
expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate
- alert: HostUnusualDiskReadLatency
expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk read latency (instance {{ $labels.instance }})
description: Disk latency is growing (read operations > 100ms)
- alert: HostUnusualDiskWriteLatency
expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write latency (instance {{ $labels.instance }})
description: Disk latency is growing (write operations > 100ms)
- alert: HostCpuStealNoisyNeighbor
expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.
# 1000 context switches is an arbitrary number.
# Alert threshold depends on nature of application.
# Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
- alert: HostContextSwitching
expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 50000
for: 0m
labels:
severity: warning
annotations:
summary: Host context switching (instance {{ $labels.instance }})
description: Context switching is growing on node (> 50000 / s)
- alert: HostSwapIsEnabled
expr: node_memory_SwapTotal_bytes > 0
for: 0m
labels:
severity: warning
annotations:
        summary: Swap is enabled on {{ $labels.instance }}; swap is discouraged nowadays
- alert: HostPhysicalComponentTooHot
expr: node_hwmon_temp_celsius > 75
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
description: Physical hardware component too hot
- alert: HostNodeOvertemperatureAlarm
expr: node_hwmon_temp_alarm == 1
for: 0m
labels:
severity: critical
annotations:
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
description: Physical node temperature alarm triggered
- alert: HostRaidArrayGotInactive
expr: node_md_state{state="inactive"} > 0
for: 0m
labels:
severity: critical
annotations:
summary: Host RAID array got inactive (instance {{ $labels.instance }})
description: RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
- alert: HostRaidDiskFailure
expr: node_md_disks{state="failed"} > 0
for: 2m
labels:
severity: warning
annotations:
summary: Host RAID disk failure (instance {{ $labels.instance }})
        description: At least one device in the RAID array on {{ $labels.instance }} failed. Array {{ $labels.device }} needs attention and possibly a disk swap
- alert: HostOomKillDetected
expr: increase(node_vmstat_oom_kill[1m]) > 0
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{ $labels.instance }})
description: OOM kill detected
- alert: HostEdacCorrectableErrorsDetected
expr: increase(node_edac_correctable_errors_total[1m]) > 0
for: 0m
labels:
severity: info
annotations:
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
        description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.'
- alert: HostEdacUncorrectableErrorsDetected
expr: node_edac_uncorrectable_errors_total > 0
for: 0m
labels:
severity: warning
annotations:
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
        description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.'
- alert: HostNetworkReceiveErrors
expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Receive Errors (instance {{ $labels.instance }})
        description: '{{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ printf "%.2f" $value }} over the last two minutes.'
- alert: HostNetworkTransmitErrors
expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
        description: '{{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of {{ printf "%.2f" $value }} over the last two minutes.'
- alert: HostNetworkInterfaceSaturated
expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
for: 1m
labels:
severity: warning
annotations:
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
        description: The network interface "{{ $labels.device }}" on "{{ $labels.instance }}" is getting overloaded.
- alert: HostNetworkBondDegraded
expr: node_bonding_active != node_bonding_slaves {master=~"bond.*"}
for: 2m
labels:
severity: warning
annotations:
        summary: Host network bond degraded (instance {{ $labels.instance }})
- alert: HostConntrackLimit
expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
for: 5m
labels:
severity: warning
annotations:
summary: Host conntrack limit (instance {{ $labels.instance }})
        description: The number of conntrack entries is approaching the limit
- alert: HostClockSkew
expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
for: 2m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{ $labels.instance }})
description: Clock skew detected. Clock is out of sync.
- alert: HostClockNotSynchronising
expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{ $labels.instance }})
description: Clock not synchronising.
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: smart
spec:
groups:
- name: smart
rules:
- alert: SmartSSDWriteRateTooHigh
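      # Assuming 512-byte LBAs: LBAs/s * 512 = bytes/s, so 10 000 000 B/s is
      # roughly 10 MB/s sustained over the 72h window.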
expr: rate(smartmon_total_lbas_written_raw_value[72h]) * 512 > 10000000
for: 5m
labels:
severity: warning
annotations:
summary: SSD write rate exceeds 10MB/s
        description: At this rate the SSD will wear out before the warranty period expires
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: temperatures
spec:
groups:
- name: temperatures
rules:
- alert: HighDiskTemperature
expr: smartmon_airflow_temperature_cel_raw_value > 45 or smartmon_temperature_celsius_raw_value > 45
for: 10m
labels:
severity: critical
annotations:
summary: High HDD/SSD temperature indicates high ambient temperature
- alert: HighChipsetTemperature
expr: node_hwmon_temp_celsius > 65
for: 10m
labels:
severity: warning
annotations:
summary: High chipset (CPU, NB) temperature indicates insufficient or failing fans
- alert: LowDiskTemperature
expr: smartmon_airflow_temperature_cel_raw_value < 10 or smartmon_temperature_celsius_raw_value < 10
for: 10m
labels:
severity: critical
annotations:
        summary: Low HDD/SSD temperature indicates low ambient temperature and a stuck server room exhaust fan relay
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: node-exporter
spec:
selector:
matchLabels:
app: node-exporter
podMetricsEndpoints:
- port: web
scrapeTimeout: 30s
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: node-exporter
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
labels:
app: node-exporter
name: node-exporter
annotations:
keel.sh/policy: force
keel.sh/trigger: poll
keel.sh/pollSchedule: "@midnight"
spec:
selector:
matchLabels:
app: node-exporter
template:
metadata:
labels:
app: node-exporter
spec:
containers:
- name: node-exporter
args:
- --web.listen-address=0.0.0.0:9101
- --path.sysfs=/host/sys
- --path.rootfs=/host/root
- --no-collector.wifi
- --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)
- --collector.netclass.ignored-devices=^(veth.*|[a-f0-9]{15})$
- --collector.netdev.device-exclude=^(veth.*|[a-f0-9]{15})$
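        # The 15-hex-character pattern presumably matches CNI-generated
        # interface names; veth pairs are excluded to keep per-pod interfaces
        # out of node-level network metrics.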
image: prom/node-exporter:v1.3.1
resources:
limits:
cpu: 50m
memory: 180Mi
requests:
cpu: 5m
memory: 20Mi
volumeMounts:
- mountPath: /host/sys
mountPropagation: HostToContainer
name: sys
readOnly: true
- mountPath: /host/root
mountPropagation: HostToContainer
name: root
readOnly: true
ports:
- containerPort: 9101
name: web
securityContext:
runAsGroup: 65532
runAsNonRoot: true
runAsUser: 65532
readOnlyRootFilesystem: true
hostNetwork: true
hostPID: true
securityContext:
runAsNonRoot: true
runAsUser: 65534
serviceAccountName: node-exporter
tolerations:
- operator: Exists
volumes:
- hostPath:
path: /sys
name: sys
- hostPath:
path: /
name: root

View File

@ -0,0 +1,172 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: snmp-exporter
spec:
replicas: 2
selector:
matchLabels:
app: snmp-exporter
template:
metadata:
labels:
app: snmp-exporter
spec:
containers:
- image: prom/snmp-exporter:latest
name: snmp-exporter
imagePullPolicy: Always
securityContext:
runAsNonRoot: true
runAsUser: 1000
readOnlyRootFilesystem: true
ports:
- containerPort: 9116
name: exporter
livenessProbe:
httpGet:
path: /health
port: exporter
readinessProbe:
httpGet:
path: /health
port: exporter
volumeMounts:
- name: snmp-exporter
mountPath: /etc/snmp_exporter
volumes:
- name: snmp-exporter
configMap:
name: snmp-exporter
nodeSelector:
dedicated: monitoring
tolerations:
- key: dedicated
operator: Equal
value: monitoring
effect: NoSchedule
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: app
operator: In
values:
- snmp-exporter
topologyKey: "kubernetes.io/hostname"
---
kind: Service
apiVersion: v1
metadata:
name: snmp-exporter
spec:
type: ClusterIP
ports:
- name: exporter
port: 9116
protocol: TCP
selector:
app: snmp-exporter
---
kind: Probe
apiVersion: monitoring.coreos.com/v1
metadata:
name: ups
spec:
interval: 60s
module: rfc1628_ups
prober:
url: snmp-exporter:9116
path: /snmp
targets:
staticConfig:
static:
- ups-4.mgmt.k-space.ee
- ups-5.mgmt.k-space.ee
- ups-6.mgmt.k-space.ee
- ups-7.mgmt.k-space.ee
- ups-8.mgmt.k-space.ee
- ups-9.mgmt.k-space.ee
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: ups
spec:
groups:
- name: ups
rules:
- alert: UPSBatteryLost
annotations:
        summary: One or more UPSes have degraded batteries.
expr: snmp_upsBatteryStatus{upsBatteryStatus!="batteryNormal"} > 0
for: 1m
labels:
severity: critical
- alert: UPSPowerLost
annotations:
        summary: One or more UPSes are not in normal operation mode. This means
          either utility power is lost or the UPS was overloaded and is now in
          bypass mode.
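      # Assumption: the hardcoded 6 equals the number of UPS targets listed in
      # the Probe above; keep it in sync when UPSes are added or removed.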
expr: sum(snmp_upsOutputSource { upsOutputSource = 'normal' }) < 6
for: 1m
labels:
severity: critical
- alert: UPSExcessivelyLoaded
annotations:
        summary: One or more UPSes are loaded above 80%. Make sure the load is
          balanced across the UPSes and that no single UPS stays above 50%.
expr: snmp_upsOutputPercentLoad > 80
for: 1h
labels:
severity: critical
---
kind: Probe
apiVersion: monitoring.coreos.com/v1
metadata:
name: printer
spec:
interval: 60s
scrapeTimeout: 50s
module: printer_mib
prober:
url: snmp-exporter:9116
path: /snmp
targets:
staticConfig:
static:
- mfp-cyber.pub.k-space.ee
- mfp-chaos.pub.k-space.ee
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: printer
spec:
groups:
- name: printer
rules:
- alert: PrinterNeedsAttention
annotations:
        summary: Printer is in an error state. If the underlying reason is 'low
          on paper', make sure there is enough paper near the printer. If not,
          drop a line to accounting@k-space.ee to order more office supplies.
expr: snmp_hrPrinterDetectedErrorState == 1
for: 0m
labels:
severity: warning
---
kind: Probe
apiVersion: monitoring.coreos.com/v1
metadata:
name: beamer
spec:
interval: 60s
module: epson_beamer
prober:
url: snmp-exporter:9116
path: /snmp
targets:
staticConfig:
static:
- beamer-cyber.sec.k-space.ee

prometheus-operator/snmp.yml (new file, 1272 lines)

File diff suppressed because it is too large

View File

@ -19,7 +19,7 @@ but it does not export Prometheus metrics either.
To apply changes run in this directory:
```
kubectl apply -n rosdump -f cronjob.yaml
kubectl apply -n rosdump -f application.yml
```
To trigger cronjob:

View File

@ -87,7 +87,6 @@ spec:
path: ssh_known_hosts
- configMap:
name: rosdump-config
---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
@ -108,3 +107,19 @@ spec:
ports:
- protocol: TCP
port: 22
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: rosdump
spec:
groups:
- name: rosdump
rules:
- alert: MikrotikBackupsBroken
expr: absent(kube_cronjob_status_last_successful_time{cronjob="rosdump-cronjob"}) or time() - kube_cronjob_status_last_successful_time{cronjob="rosdump-cronjob"} > 3600
for: 4h
labels:
severity: warning
annotations:
summary: Mikrotik backups are broken

View File

@ -1,6 +1,7 @@
Traefik Ingress Controller:
```
kubectl create namespace traefik
helm template --include-crds -n traefik --release-name k6 traefik/traefik -f values.yml > application.yml
kubectl apply -n traefik -f namespace.yml -f application.yml -f application-extras.yml -f whoami.yml -f proxmox.yml -f voron.yml
kubectl apply -n traefik -f application.yml -f application-extras.yml -f whoami.yml -f proxmox.yml -f voron.yml
```

View File

@ -28,9 +28,6 @@ kind: Service
metadata:
name: traefik-metrics
namespace: traefik
annotations:
prometheus.io/scrape: 'true'
prometheus.io/port: '9100'
spec:
selector:
app.kubernetes.io/name: traefik
@ -92,6 +89,16 @@ spec:
- Ingress
- Egress
ingress:
- from:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: prometheus-operator
podSelector:
matchLabels:
app.kubernetes.io/name: prometheus
ports:
- protocol: TCP
port: 9100
- from:
- ipBlock:
cidr: 0.0.0.0/0
@ -109,3 +116,14 @@ spec:
replacePathRegex:
regex: ^/metrics
replacement: /
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: traefik
spec:
selector:
matchLabels:
app.kubernetes.io/name: traefik
podMetricsEndpoints:
- port: metrics

View File

@ -17,9 +17,8 @@ deployment:
keel.sh/trigger: patch
keel.sh/pollSchedule: "@midnight"
podAnnotations:
prometheus.io/scrape: 'true'
prometheus.io/port: '9100'
accessLog:
format: json
# Globally redirect to https://
globalArguments: