forked from k-space/kube

Migrate to Prometheus Operator
@@ -1,17 +1,14 @@
 apiVersion: argoproj.io/v1alpha1
 kind: Application
 metadata:
-  name: monitoring
+  name: prometheus-operator
   namespace: argocd
 spec:
   project: default
   source:
     repoURL: 'git@git.k-space.ee:k-space/kube.git'
-    path: monitoring
+    path: prometheus-operator
     targetRevision: HEAD
   destination:
     server: 'https://kubernetes.default.svc'
-    namespace: monitoring
-  syncPolicy:
-    syncOptions:
-      - CreateNamespace=true
+    namespace: prometheus-operator
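Renaming the Application also renames the Argo CD object, so the old `monitoring` Application is not pruned automatically. A quick sanity check after sync, as a sketch assuming kubectl access to the `argocd` namespace:

```
# Confirm the renamed Application registered and reports Synced/Healthy:
kubectl -n argocd get application prometheus-operator
# If the old object lingers, it has to be removed by hand:
kubectl -n argocd delete application monitoring
```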
							
								
								
									
argocd/monitoring.yml (new file, 33 lines)
@@ -0,0 +1,33 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: argocd
spec:
  selector: {}
  podMetricsEndpoints:
  - port: metrics
  - port: controller
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: argocd
spec:
  groups:
  - name: argocd
    rules:
    - alert: ArgoNotSynced
      annotations:
        summary: Some applications in Argo are out of sync
      expr: sum by (dest_namespace) (argocd_app_info{sync_status!="Synced"}) > 0
      for: 8h
      labels:
        severity: warning
    - alert: ArgoNotHealthy
      annotations:
        summary: Some applications in Argo are not healthy
      expr: argocd_app_info{health_status!="Healthy"}
      for: 30m
      labels:
        severity: warning
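Note that `selector: {}` makes the PodMonitor match every pod in the Application's namespace, so both the `metrics` and `controller` named ports of the Argo CD pods get scraped. The embedded rule group can also be linted before committing; a sketch, assuming mikefarah's yq v4 and promtool are installed:

```
# Pull the rule group out of the PrometheusRule wrapper and lint it:
yq 'select(.kind == "PrometheusRule") | .spec' argocd/monitoring.yml \
  | promtool check rules /dev/stdin
```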
@@ -77,10 +77,6 @@ server:
 
   metrics:
     enabled: true
-    service:
-      annotations:
-        prometheus.io/scrape: "true"
-        prometheus.io/port: "8083"
 
 # We don't use ApplicationSet CRD-s (yet)
 applicationSet:
@@ -89,26 +85,14 @@ applicationSet:
 repoServer:
   metrics:
     enabled: true
-    service:
-      annotations:
-        prometheus.io/scrape: "true"
-        prometheus.io/port: "8084"
 
 notifications:
   metrics:
     enabled: true
-    service:
-      annotations:
-        prometheus.io/scrape: "true"
-        prometheus.io/port: "9001"
 
 controller:
   metrics:
     enabled: true
-    service:
-      annotations:
-        prometheus.io/scrape: "true"
-        prometheus.io/port: "8082"
 
 configs:
   secret:
@@ -10,11 +10,11 @@ spec:
   replicas: 2
   selector:
     matchLabels:
-      app: camtiler
+      app.kubernetes.io/name: camtiler
   template:
     metadata:
       labels:
-        app: camtiler
+        app.kubernetes.io/name: camtiler
         component: camtiler
     spec:
       serviceAccountName: camtiler
@@ -25,6 +25,9 @@ spec:
             readOnlyRootFilesystem: true
             runAsNonRoot: true
             runAsUser: 1000
+          ports:
+            - containerPort: 5000
+              name: "http"
 ---
 apiVersion: apps/v1
 kind: Deployment
@@ -38,11 +41,11 @@ spec:
   replicas: 2
   selector:
     matchLabels:
-      app: log-viewer-frontend
+      app.kubernetes.io/name: log-viewer-frontend
   template:
     metadata:
       labels:
-        app: log-viewer-frontend
+        app.kubernetes.io/name: log-viewer-frontend
     spec:
       containers:
         - name: log-viewer-frontend
@@ -64,11 +67,11 @@ spec:
   replicas: 3
   selector:
     matchLabels:
-      app: log-viewer-backend
+      app.kubernetes.io/name: log-viewer-backend
   template:
     metadata:
       labels:
-        app: log-viewer-backend
+        app.kubernetes.io/name: log-viewer-backend
     spec:
       containers:
         - name: log-backend-backend
@@ -109,7 +112,7 @@ metadata:
 spec:
   type: ClusterIP
   selector:
-    app: log-viewer-frontend
+    app.kubernetes.io/name: log-viewer-frontend
   ports:
   - protocol: TCP
     port: 3003
@@ -121,7 +124,7 @@ metadata:
 spec:
   type: ClusterIP
   selector:
-    app: log-viewer-backend
+    app.kubernetes.io/name: log-viewer-backend
   ports:
   - protocol: TCP
     port: 3002
@@ -130,14 +133,12 @@ apiVersion: v1
 kind: Service
 metadata:
   name: camtiler
-  annotations:
-    prometheus.io/scrape: 'true'
   labels:
     component: camtiler
 spec:
   type: ClusterIP
   selector:
-    app: camtiler
+    app.kubernetes.io/name: camtiler
     component: camtiler
   ports:
   - protocol: TCP
@@ -254,7 +255,7 @@ spec:
           kubernetes.io/metadata.name: monitoring
       podSelector:
         matchLabels:
-          app: prometheus
+          app.kubernetes.io/name: prometheus
   egress:
     - to:
         - ipBlock:
@@ -263,7 +264,7 @@ spec:
     - to:
       - podSelector:
          matchLabels:
-            app: mongodb-svc
+            app.kubernetes.io/name: mongodb-svc
       ports:
       - port: 27017
     - to:
@@ -298,7 +299,7 @@ spec:
           kubernetes.io/metadata.name: monitoring
       podSelector:
         matchLabels:
-          app: prometheus
+          app.kubernetes.io/name: prometheus
   - from:
     - namespaceSelector:
         matchLabels:
@@ -314,7 +315,7 @@ metadata:
 spec:
   podSelector:
     matchLabels:
-      app: log-viewer-backend
+      app.kubernetes.io/name: log-viewer-backend
   policyTypes:
   - Ingress
   - Egress
@@ -322,13 +323,11 @@ spec:
     - to:
       - podSelector:
           matchLabels:
-            app: mongodb-svc
+            app.kubernetes.io/name: mongodb-svc
     - to:
-      - podSelector:
-          matchLabels:
-            v1.min.io/tenant: minio
-      ports:
-      - port: 9000
+      - ipBlock:
+          # Minio is accessed thru public endpoint via Traefik
+          cidr: 193.40.103.0/24
   ingress:
   - from:
     - namespaceSelector:
@@ -345,7 +344,7 @@ metadata:
 spec:
   podSelector:
     matchLabels:
-      app: log-viewer-frontend
+      app.kubernetes.io/name: log-viewer-frontend
   policyTypes:
   - Ingress
   - Egress
@@ -458,7 +457,6 @@ spec:
              required: ["target"]
          required: ["spec"]
 ---
----
 apiVersion: codemowers.io/v1alpha1
 kind: ClusterOperator
 metadata:
@@ -480,7 +478,7 @@ spec:
       spec:
         type: ClusterIP
         selector:
-          app: foobar
+          app.kubernetes.io/name: foobar
           component: camdetect
         ports:
         - protocol: TCP
@@ -506,14 +504,11 @@ spec:
             maxUnavailable: 1
         selector:
           matchLabels:
-            app: foobar
+            app.kubernetes.io/name: foobar
         template:
           metadata:
-            annotations:
-              prometheus.io/scrape: 'true'
-              prometheus.io/port: '5000'
             labels:
-              app: foobar
+              app.kubernetes.io/name: foobar
               component: camdetect
           spec:
             containers:
@@ -590,9 +585,55 @@ spec:
               whenUnsatisfiable: DoNotSchedule
               labelSelector:
                 matchLabels:
-                  app: foobar
+                  app.kubernetes.io/name: foobar
                   component: camdetect
+---
+apiVersion: monitoring.coreos.com/v1
+kind: PodMonitor
+metadata:
+  name: camtiler
+spec:
+  selector: {}
+  podMetricsEndpoints:
+  - port: http
+---
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: cameras
+spec:
+    groups:
+    - name: cameras
+      rules:
+      - alert: CameraLost
+        expr: rate(camdetect_rx_frames_total[2m]) < 1
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Camera feed stopped
+      - alert: CameraServerRoomMotion
+        expr: camdetect_event_active {app="camdetect-server-room"} > 0
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: Motion was detected in server room
+      - alert: CameraSlowUploads
+        expr: rate(camdetect_upload_dropped_frames_total[2m]) > 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Motion detect snapshots are piling up and not getting uploaded to S3
+      - alert: CameraSlowProcessing
+        expr: rate(camdetect_download_dropped_frames_total[2m]) > 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Motion detection processing pipeline is not keeping up with incoming frames
 ---
 apiVersion: k-space.ee/v1alpha1
 kind: Camera
 metadata:
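The hunks above consistently swap the legacy `app` selector label for `app.kubernetes.io/name`. Worth noting: `spec.selector` of an existing Deployment is immutable, so re-applying these manifests over a live cluster may require deleting and recreating the Deployments. A verification sketch, assuming the resources live in a `camtiler` namespace:

```
# Pods should carry the new label and the PodMonitor should exist:
kubectl -n camtiler get pods -l app.kubernetes.io/name=camtiler
kubectl -n camtiler get podmonitor camtiler
```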
@@ -42,9 +42,6 @@ spec:
     metadata:
       labels:
         app: drone
-      annotations:
-        prometheus.io/port: "80"
-        prometheus.io/scrape: "true"
     spec:
       automountServiceAccountToken: false
       securityContext:
@@ -10,6 +10,9 @@ spec:
   kibanaRef:
     name: kibana
   config:
+    http:
+      enabled: true
+      port: 5066
     filebeat:
       autodiscover:
         providers:
@@ -81,6 +84,14 @@ spec:
               valueFrom:
                 fieldRef:
                   fieldPath: spec.nodeName
+        - name: exporter
+          image: sepa/beats-exporter
+          args:
+            - -p=5066
+          ports:
+            - containerPort: 8080
+              name: exporter
+              protocol: TCP
         volumes:
         - name: varlogcontainers
           hostPath:

freescout/application.yml (new file, 16 lines)
@@ -0,0 +1,16 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: freescout
spec:
  groups:
    - name: freescout
      rules:
      - alert: FreescoutSyncBroken
        expr: time() - wildduck_last_login{email=~"(info|accounting)@k-space.ee"} > 300
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Freescout mailbox synchronization is broken
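The alert logic can be unit-tested with `promtool test rules`. A sketch, assuming the group under `spec:` is first extracted into a plain `freescout-rules.yml`; the input series and the 20m evaluation point are made up for illustration:

```yaml
# freescout-rules-test.yml
rule_files:
  - freescout-rules.yml
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # last_login frozen at t=0 simulates a mailbox that stopped syncing
      - series: 'wildduck_last_login{email="info@k-space.ee"}'
        values: '0x30'
    alert_rule_test:
      - eval_time: 20m   # condition true since 5m, "for: 10m" elapsed at 15m
        alertname: FreescoutSyncBroken
        exp_alerts:
          - exp_labels:
              severity: warning
              email: info@k-space.ee
            exp_annotations:
              summary: Freescout mailbox synchronization is broken
```

Run it with `promtool test rules freescout-rules-test.yml`.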
							
								
								
									
kube-system/README.md (new file, 3 lines)
@@ -0,0 +1,3 @@
```
kubectl apply -n kube-system -f kube-state-metrics.yml
```
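A smoke test after applying, as a sketch (port-forwarding the headless Service defined below):

```
kubectl -n kube-system port-forward svc/kube-state-metrics 8080:http-metrics &
curl -s localhost:8080/metrics | grep -m5 kube_deployment_status_replicas
```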
							
								
								
									
kube-system/kube-state-metrics.yml (new file, 221 lines)
@@ -0,0 +1,221 @@
---
apiVersion: v1
automountServiceAccountToken: false
kind: ServiceAccount
metadata:
  name: kube-state-metrics
  labels:
    app.kubernetes.io/name: kube-state-metrics
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: kube-state-metrics
  labels:
    app.kubernetes.io/name: kube-state-metrics
rules:
- apiGroups:
  - ""
  resources:
  - configmaps
  - secrets
  - nodes
  - pods
  - services
  - serviceaccounts
  - resourcequotas
  - replicationcontrollers
  - limitranges
  - persistentvolumeclaims
  - persistentvolumes
  - namespaces
  - endpoints
  verbs:
  - list
  - watch
- apiGroups:
  - apps
  resources:
  - statefulsets
  - daemonsets
  - deployments
  - replicasets
  verbs:
  - list
  - watch
- apiGroups:
  - batch
  resources:
  - cronjobs
  - jobs
  verbs:
  - list
  - watch
- apiGroups:
  - autoscaling
  resources:
  - horizontalpodautoscalers
  verbs:
  - list
  - watch
- apiGroups:
  - authentication.k8s.io
  resources:
  - tokenreviews
  verbs:
  - create
- apiGroups:
  - authorization.k8s.io
  resources:
  - subjectaccessreviews
  verbs:
  - create
- apiGroups:
  - policy
  resources:
  - poddisruptionbudgets
  verbs:
  - list
  - watch
- apiGroups:
  - certificates.k8s.io
  resources:
  - certificatesigningrequests
  verbs:
  - list
  - watch
- apiGroups:
  - storage.k8s.io
  resources:
  - storageclasses
  - volumeattachments
  verbs:
  - list
  - watch
- apiGroups:
  - admissionregistration.k8s.io
  resources:
  - mutatingwebhookconfigurations
  - validatingwebhookconfigurations
  verbs:
  - list
  - watch
- apiGroups:
  - networking.k8s.io
  resources:
  - networkpolicies
  - ingresses
  verbs:
  - list
  - watch
- apiGroups:
  - coordination.k8s.io
  resources:
  - leases
  verbs:
  - list
  - watch
- apiGroups:
  - rbac.authorization.k8s.io
  resources:
  - clusterrolebindings
  - clusterroles
  - rolebindings
  - roles
  verbs:
  - list
  - watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: kube-state-metrics
  labels:
    app.kubernetes.io/name: kube-state-metrics
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kube-state-metrics
subjects:
- kind: ServiceAccount
  name: kube-state-metrics
  namespace: kube-system
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: kube-state-metrics
  labels:
    app.kubernetes.io/name: kube-state-metrics
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: kube-state-metrics
  template:
    metadata:
      labels:
        app.kubernetes.io/name: kube-state-metrics
    spec:
      automountServiceAccountToken: true
      containers:
      - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.6.0
        livenessProbe:
          httpGet:
            path: /healthz
            port: 8080
          initialDelaySeconds: 5
          timeoutSeconds: 5
        name: kube-state-metrics
        ports:
        - containerPort: 8080
          name: http-metrics
        - containerPort: 8081
          name: telemetry
        readinessProbe:
          httpGet:
            path: /
            port: 8081
          initialDelaySeconds: 5
          timeoutSeconds: 5
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop:
            - ALL
          readOnlyRootFilesystem: true
          runAsUser: 65534
      nodeSelector:
        kubernetes.io/os: linux
      serviceAccountName: kube-state-metrics
---
apiVersion: v1
kind: Service
metadata:
  name: kube-state-metrics
  labels:
    app.kubernetes.io/name: kube-state-metrics
spec:
  clusterIP: None
  ports:
  - name: http-metrics
    port: 8080
    targetPort: http-metrics
  - name: telemetry
    port: 8081
    targetPort: telemetry
  selector:
    app.kubernetes.io/name: kube-state-metrics
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kube-state-metrics
spec:
  endpoints:
  - honorLabels: true
    path: /metrics
    port: http-metrics
  selector:
    matchLabels:
      app.kubernetes.io/name: kube-state-metrics
@@ -7,7 +7,7 @@ and then heavily modified.
 To deploy Longhorn use following:
 
 ```
-kubectl -n longhorn-system apply -f longhorn.yaml -f ingress.yml
+kubectl -n longhorn-system apply -f application.yml -f application-extras.yml
 ```
 
 After deploying specify `dedicated=storage:NoSchedule`
							
								
								
									
longhorn-system/application-extras.yml (new file, 126 lines)
@@ -0,0 +1,126 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: longhorn-dashboard
  namespace: longhorn-system
  annotations:
    kubernetes.io/ingress.class: traefik
    cert-manager.io/cluster-issuer: default
    external-dns.alpha.kubernetes.io/target: traefik.k-space.ee
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
    traefik.ingress.kubernetes.io/router.middlewares: traefik-sso@kubernetescrd
    traefik.ingress.kubernetes.io/router.tls: "true"
spec:
  rules:
  - host: longhorn.k-space.ee
    http:
      paths:
      - pathType: Prefix
        path: "/"
        backend:
          service:
            name: longhorn-frontend
            port:
              number: 80
  tls:
  - hosts:
    - longhorn.k-space.ee
    secretName: longhorn-tls

---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: manager
spec:
  selector: {}
  podMetricsEndpoints:
    - port: manager
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: longhorn
spec:
  # Copied from https://longhorn.io/docs/1.2.4/monitoring/alert-rules-example/
  groups:
    - name: longhorn
      rules:
      - alert: LonghornVolumeActualSpaceUsedWarning
        annotations:
          description: The accumulated snapshots for volume use up more space than the volume's capacity
          summary: The actual used space of Longhorn volume is twice the size of the volume capacity.
        expr: longhorn_volume_actual_size_bytes > longhorn_volume_capacity_bytes * 2
        for: 5m
        labels:
          issue: The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high.
          severity: warning
      - alert: LonghornVolumeStatusCritical
        annotations:
          description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for
            more than 2 minutes.
          summary: Longhorn volume {{$labels.volume}} is Fault
        expr: longhorn_volume_robustness == 3
        for: 5m
        labels:
          issue: Longhorn volume {{$labels.volume}} is Fault.
          severity: critical
      - alert: LonghornVolumeStatusWarning
        annotations:
          description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded for
            more than 5 minutes.
          summary: Longhorn volume {{$labels.volume}} is Degraded
        expr: longhorn_volume_robustness == 2
        for: 5m
        labels:
          issue: Longhorn volume {{$labels.volume}} is Degraded.
          severity: warning
      - alert: LonghornNodeStorageWarning
        annotations:
          description: The used storage of node {{$labels.node}} is at {{$value}}% capacity for
            more than 5 minutes.
          summary: The used storage of node is over 70% of the capacity.
        expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70
        for: 5m
        labels:
          issue: The used storage of node {{$labels.node}} is high.
          severity: warning
      - alert: LonghornDiskStorageWarning
        annotations:
          description: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is at {{$value}}% capacity for
            more than 5 minutes.
          summary: The used storage of disk is over 70% of the capacity.
        expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70
        for: 5m
        labels:
          issue: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high.
          severity: warning
      - alert: LonghornNodeDown
        annotations:
          description: There are {{$value}} Longhorn nodes which have been offline for more than 5 minutes.
          summary: Longhorn nodes is offline
        expr: (avg(longhorn_node_count_total) or on() vector(0)) - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0)) > 0
        for: 5m
        labels:
          issue: There are {{$value}} Longhorn nodes are offline
          severity: critical
      - alert: LonghornIntanceManagerCPUUsageWarning
        annotations:
          description: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for
            more than 5 minutes.
          summary: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is over 300%.
        expr: (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) * 100 > 300
        for: 5m
        labels:
          issue: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} consumes 3 times the CPU request.
          severity: warning
      - alert: LonghornNodeCPUUsageWarning
        annotations:
          description: Longhorn node {{$labels.node}} has CPU Usage / CPU capacity is {{$value}}% for
            more than 5 minutes.
          summary: Longhorn node {{$labels.node}} experiences high CPU pressure for more than 5m.
        expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90
        for: 5m
        labels:
          issue: Longhorn node {{$labels.node}} experiences high CPU pressure.
          severity: warning
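Once Prometheus has picked up the rule, the expressions can be spot-checked against the live query API; a sketch, using the `prometheus-operated` headless Service the operator creates:

```
kubectl -n prometheus-operator port-forward svc/prometheus-operated 9090 &
curl -s 'http://localhost:9090/api/v1/query' \
  --data-urlencode 'query=longhorn_volume_robustness == 2'
```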
@@ -1,28 +0,0 @@
-apiVersion: networking.k8s.io/v1
-kind: Ingress
-metadata:
-  name: longhorn-dashboard
-  namespace: longhorn-system
-  annotations:
-    kubernetes.io/ingress.class: traefik
-    cert-manager.io/cluster-issuer: default
-    external-dns.alpha.kubernetes.io/target: traefik.k-space.ee
-    traefik.ingress.kubernetes.io/router.entrypoints: websecure
-    traefik.ingress.kubernetes.io/router.middlewares: traefik-sso@kubernetescrd
-    traefik.ingress.kubernetes.io/router.tls: "true"
-spec:
-  rules:
-  - host: longhorn.k-space.ee
-    http:
-      paths:
-      - pathType: Prefix
-        path: "/"
-        backend:
-          service:
-            name: longhorn-frontend
-            port:
-              number: 80
-  tls:
-  - hosts:
-    - longhorn.k-space.ee
-    secretName: longhorn-tls
@@ -1,27 +0,0 @@
-persistence:
-  defaultClassReplicaCount: 2
-
-defaultSettings:
-  defaultDataLocality: best-effort
-  taintToleration: "dedicated=storage:NoSchedule"
-  systemManagedComponentsNodeSelector: "dedicated:storage"
-
-longhornDriver:
-  tolerations:
-  - key: dedicated
-    operator: Equal
-    value: storage
-    effect: NoSchedule
-
-longhornUI:
-  tolerations:
-  - key: dedicated
-    operator: Equal
-    value: storage
-    effect: NoSchedule
-
-ingress:
-  enabled: true
-  host: longhorn.k-space.ee
-  tls: true
-  tlsSecret: longhorn-tls
@@ -67,6 +67,11 @@ spec:
                  items:
                    type: object
                    x-kubernetes-preserve-unknown-fields: true
+               customresources:
+                 type: array
+                 items:
+                   type: object
+                   x-kubernetes-preserve-unknown-fields: true
          required: ["spec"]
 ---
 apiVersion: apps/v1
@@ -178,12 +183,21 @@ rules:
 - apiGroups:
   - codemowers.io
   resources:
   - bindzones
+  - clusteroperators
   - keydbs
   verbs:
   - get
   - list
   - watch
+- apiGroups:
+  - k-space.ee
+  resources:
+  - cams
+  verbs:
+  - get
+  - list
+  - watch
 ---
 apiVersion: v1
 kind: ServiceAccount
@@ -120,7 +120,7 @@ spec:
         type: ClusterIP
         clusterIP: None
         ports:
-        - name: "server"
+        - name: redis
           port: 6379
           protocol: TCP
           targetPort: redis
@@ -137,14 +137,14 @@ spec:
       spec:
         type: ClusterIP
         ports:
-        - name: "server"
+        - name: redis
           port: 6379
           protocol: TCP
           targetPort: redis
-        - name: "redis-exporter"
+        - name: exporter
           port: 9121
           protocol: TCP
-          targetPort: redis-exporter
+          targetPort: exporter
         selector:
           app.kubernetes.io/name: foobar
         sessionAffinity: ClientIP
@@ -163,9 +163,6 @@ spec:
             app.kubernetes.io/name: foobar
         template:
           metadata:
-            annotations:
-              prometheus.io/port: "9121"
-              prometheus.io/scrape: "true"
             labels:
               app.kubernetes.io/name: foobar
           spec:
@@ -237,10 +234,10 @@ spec:
               envFrom:
                 - secretRef:
                     name: foobar-secrets
-            - name: redis-exporter
+            - name: exporter
               image: quay.io/oliver006/redis_exporter
               ports:
-              - name: metrics
+              - name: exporter
                 containerPort: 9121
               envFrom:
                 - secretRef:
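These renames are what replace the dropped `prometheus.io/*` annotations: the blanket PodMonitor in prometheus-operator/application.yml scrapes any container port whose name is `exporter` or `metrics`. A minimal sketch of the pairing (the pod itself is hypothetical, for illustration only):

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: demo                  # hypothetical pod
spec:
  containers:
    - name: exporter
      image: quay.io/oliver006/redis_exporter
      ports:
        - name: exporter      # matched by podMetricsEndpoints: [{port: exporter}]
          containerPort: 9121
```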
@@ -1,4 +1,14 @@
+---
+apiVersion: monitoring.coreos.com/v1
+kind: PodMonitor
+metadata:
+  name: monitoring
+  namespace: metallb-system
+spec:
+  selector: {}
+  podMetricsEndpoints:
+    - port: monitoring
 ---
 apiVersion: metallb.io/v1beta1
 kind: MetalLB
 metadata:
							
								
								
									
prometheus-operator/README.md (new file, 19 lines)
@@ -0,0 +1,19 @@
# Prometheus operator

```
curl -L https://github.com/prometheus-operator/prometheus-operator/releases/download/v0.59.0/bundle.yaml | sed -e 's/namespace: default/namespace: prometheus-operator/g' > bundle.yml
kubectl create namespace prometheus-operator
kubectl apply --server-side -n prometheus-operator -f bundle.yml
kubectl delete -n prometheus-operator configmap snmp-exporter
kubectl create -n prometheus-operator configmap snmp-exporter --from-file=snmp.yml
kubectl apply -n prometheus-operator -f application.yml -f node-exporter.yml -f blackbox-exporter.yml -f snmp-exporter.yml -f mikrotik-exporter.yml
```

# Mikrotik exporter

```
kubectl create -n prometheus-operator secret generic mikrotik-exporter \
  --from-literal=MIKROTIK_PASSWORD='f7W!H*Pu' \
  --from-literal=PROMETHEUS_BEARER_TOKEN=$(cat /dev/urandom | base64 | head -c 30)
```
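A rough way to verify the bundle and the custom resources reconciled, assuming the steps above ran cleanly:

```
kubectl -n prometheus-operator get prometheus,alertmanager,podmonitors
# The operator should have stamped out these StatefulSets:
kubectl -n prometheus-operator get statefulset prometheus-prometheus alertmanager-alertmanager
```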
							
								
								
									
prometheus-operator/application.yml (new file, 762 lines)
@@ -0,0 +1,762 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: metrics
spec:
  namespaceSelector: {}
  selector: {}
  podMetricsEndpoints:
    - port: exporter
    - port: metrics
---
apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
  name: alertmanager
spec:
  nodeSelector:
    dedicated: monitoring
  tolerations:
    - key: dedicated
      operator: Equal
      value: monitoring
      effect: NoSchedule
  replicas: 3
  serviceAccountName: alertmanager
  externalUrl: http://am.k-space.ee/
  routePrefix: "/"
  securityContext:
    fsGroup: 2000
    runAsGroup: 2000
    runAsNonRoot: true
    runAsUser: 1000
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: alertmanager
---
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: prometheus
spec:
  nodeSelector:
    dedicated: monitoring
  tolerations:
    - key: dedicated
      operator: Equal
      value: monitoring
      effect: NoSchedule
  alerting:
    alertmanagers:
      - namespace: prometheus-operator
        name: alertmanager
        port: http
        pathPrefix: "/"
        apiVersion: v2
  externalUrl: "http://prom.k-space.ee/"
  replicas: 2
  shards: 1
  serviceAccountName: prometheus
  securityContext:
    fsGroup: 2000
    runAsGroup: 2000
    runAsNonRoot: true
    runAsUser: 1000
  serviceMonitorNamespaceSelector: {}
  serviceMonitorSelector: {}
  podMonitorNamespaceSelector: {}
  podMonitorSelector: {}
  probeNamespaceSelector: {}
  probeSelector: {}
  ruleNamespaceSelector: {}
  ruleSelector: {}
  retentionSize: 80GB
  storage:
    volumeClaimTemplate:
      spec:
        accessModes:
        - ReadWriteOnce
        resources:
          requests:
            storage: 100Gi
        storageClassName: local-path
---
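All eight `*Selector`/`*NamespaceSelector` fields above are `{}`, i.e. match-all, which is why the per-application manifests in this commit only need to drop a PodMonitor or PrometheusRule into their own namespace. A sketch to enumerate everything this Prometheus will pick up:

```
kubectl get podmonitors,servicemonitors,prometheusrules --all-namespaces
```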
apiVersion: v1
 | 
			
		||||
kind: ServiceAccount
 | 
			
		||||
metadata:
 | 
			
		||||
  name: prometheus
 | 
			
		||||
---
 | 
			
		||||
apiVersion: rbac.authorization.k8s.io/v1
 | 
			
		||||
kind: ClusterRole
 | 
			
		||||
metadata:
 | 
			
		||||
  name: prometheus
 | 
			
		||||
rules:
 | 
			
		||||
- apiGroups: [""]
 | 
			
		||||
  resources:
 | 
			
		||||
  - nodes
 | 
			
		||||
  - nodes/metrics
 | 
			
		||||
  - services
 | 
			
		||||
  - endpoints
 | 
			
		||||
  - pods
 | 
			
		||||
  verbs: ["get", "list", "watch"]
 | 
			
		||||
- apiGroups: [""]
 | 
			
		||||
  resources:
 | 
			
		||||
  - configmaps
 | 
			
		||||
  verbs: ["get"]
 | 
			
		||||
- apiGroups:
 | 
			
		||||
  - networking.k8s.io
 | 
			
		||||
  resources:
 | 
			
		||||
  - ingresses
 | 
			
		||||
  verbs: ["get", "list", "watch"]
 | 
			
		||||
- nonResourceURLs: ["/metrics"]
 | 
			
		||||
  verbs: ["get"]
 | 
			
		||||
---
 | 
			
		||||
apiVersion: rbac.authorization.k8s.io/v1
 | 
			
		||||
kind: ClusterRoleBinding
 | 
			
		||||
metadata:
 | 
			
		||||
  name: prometheus
 | 
			
		||||
roleRef:
 | 
			
		||||
  apiGroup: rbac.authorization.k8s.io
 | 
			
		||||
  kind: ClusterRole
 | 
			
		||||
  name: prometheus
 | 
			
		||||
subjects:
 | 
			
		||||
- kind: ServiceAccount
 | 
			
		||||
  name: prometheus
 | 
			
		||||
  namespace: prometheus-operator
 | 
			
		||||
---
 | 
			
		||||
apiVersion: monitoring.coreos.com/v1
 | 
			
		||||
kind: PrometheusRule
 | 
			
		||||
metadata:
 | 
			
		||||
  name: prometheus
 | 
			
		||||
spec:
 | 
			
		||||
  groups:
 | 
			
		||||
  - name: prometheus
 | 
			
		||||
    rules:
 | 
			
		||||
    - alert: PrometheusJobMissing
 | 
			
		||||
      annotations:
 | 
			
		||||
        description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}\n \
 | 
			
		||||
          \ LABELS = {{ $labels }}"
 | 
			
		||||
        summary: Prometheus job missing (instance {{ $labels.instance }})
 | 
			
		||||
      expr: absent(up{job="prometheus-operator/prometheus"})
 | 
			
		||||
      for: 0m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
    - alert: PrometheusTargetMissing
 | 
			
		||||
      annotations:
 | 
			
		||||
        description: "A Prometheus target has disappeared. An exporter might be crashed.\n\
 | 
			
		||||
          \  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 | 
			
		||||
        summary: Prometheus target missing (instance {{ $labels.instance }})
 | 
			
		||||
      expr: up == 0
 | 
			
		||||
      for: 5m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: critical
 | 
			
		||||
    - alert: PrometheusAllTargetsMissing
 | 
			
		||||
      annotations:
 | 
			
		||||
        description: "A Prometheus job does not have living target anymore.\n  VALUE\
 | 
			
		||||
          \ = {{ $value }}\n  LABELS = {{ $labels }}"
 | 
			
		||||
        summary: Prometheus all targets missing (instance {{ $labels.instance }})
 | 
			
		||||
      expr: count by (job) (up) == 0
 | 
			
		||||
      for: 0m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: critical
 | 
			
		||||
    - alert: PrometheusConfigurationReloadFailure
 | 
			
		||||
      annotations:
 | 
			
		||||
        description: "Prometheus configuration reload error\n  VALUE = {{ $value }}\n\
 | 
			
		||||
          \  LABELS = {{ $labels }}"
 | 
			
		||||
        summary: Prometheus configuration reload failure (instance {{ $labels.instance
 | 
			
		||||
          }})
 | 
			
		||||
      expr: prometheus_config_last_reload_successful != 1
 | 
			
		||||
      for: 0m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
    - alert: PrometheusTooManyRestarts
 | 
			
		||||
      annotations:
 | 
			
		||||
        description: "Prometheus has restarted more than twice in the last 15 minutes.\
 | 
			
		||||
          \ It might be crashlooping.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels\
 | 
			
		||||
          \ }}"
 | 
			
		||||
        summary: Prometheus too many restarts (instance {{ $labels.instance }})
 | 
			
		||||
      expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m])
 | 
			
		||||
        > 2
 | 
			
		||||
      for: 0m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
    - alert: PrometheusAlertmanagerJobMissing
 | 
			
		||||
      annotations:
 | 
			
		||||
        description: "A Prometheus AlertManager job has disappeared\n  VALUE = {{\
 | 
			
		||||
          \ $value }}\n  LABELS = {{ $labels }}"
 | 
			
		||||
        summary: Prometheus AlertManager job missing (instance {{ $labels.instance
 | 
			
		||||
          }})
 | 
			
		||||
      expr: absent(up{job="prometheus-operator/alertmanager"})
 | 
			
		||||
      for: 0m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
    - alert: PrometheusAlertmanagerConfigurationReloadFailure
 | 
			
		||||
      annotations:
 | 
			
		||||
        description: "AlertManager configuration reload error\n  VALUE = {{ $value\
 | 
			
		||||
          \ }}\n  LABELS = {{ $labels }}"
 | 
			
		||||
        summary: Prometheus AlertManager configuration reload failure (instance {{
 | 
			
		||||
          $labels.instance }})
 | 
			
		||||
      expr: alertmanager_config_last_reload_successful != 1
 | 
			
		||||
      for: 0m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
    - alert: PrometheusAlertmanagerConfigNotSynced
 | 
			
		||||
      annotations:
 | 
			
		||||
        description: "Configurations of AlertManager cluster instances are out of\
 | 
			
		||||
          \ sync\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 | 
			
		||||
        summary: Prometheus AlertManager config not synced (instance {{ $labels.instance
 | 
			
		||||
          }})
 | 
			
		||||
      expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
 | 
			
		||||
      for: 0m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
    - alert: PrometheusNotConnectedToAlertmanager
 | 
			
		||||
      annotations:
 | 
			
		||||
        description: "Prometheus cannot connect the alertmanager\n  VALUE = {{ $value\
 | 
			
		||||
          \ }}\n  LABELS = {{ $labels }}"
 | 
			
		||||
        summary: Prometheus not connected to alertmanager (instance {{ $labels.instance
 | 
			
		||||
          }})
 | 
			
		||||
      expr: prometheus_notifications_alertmanagers_discovered < 1
 | 
			
		||||
      for: 0m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: critical
 | 
			
		||||
    - alert: PrometheusRuleEvaluationFailures
 | 
			
		||||
      annotations:
 | 
			
		||||
        description: "Prometheus encountered {{ $value }} rule evaluation failures,\
 | 
			
		||||
          \ leading to potentially ignored alerts.\n  VALUE = {{ $value }}\n  LABELS\
 | 
			
		||||
          \ = {{ $labels }}"
 | 
			
		||||
        summary: Prometheus rule evaluation failures (instance {{ $labels.instance
 | 
			
		||||
          }})
 | 
			
		||||
      expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
 | 
			
		||||
      for: 0m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: critical
 | 
			
		||||
    - alert: PrometheusTemplateTextExpansionFailures
 | 
			
		||||
      annotations:
 | 
			
		||||
        description: "Prometheus encountered {{ $value }} template text expansion\
 | 
			
		||||
          \ failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 | 
			
		||||
        summary: Prometheus template text expansion failures (instance {{ $labels.instance
 | 
			
		||||
          }})
 | 
			
		||||
      expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
 | 
			
		||||
      for: 0m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: critical
 | 
			
		||||
    - alert: PrometheusRuleEvaluationSlow
 | 
			
		||||
      annotations:
 | 
			
		||||
        description: "Prometheus rule evaluation took more time than the scheduled\
 | 
			
		||||
          \ interval. It indicates a slower storage backend access or too complex\
 | 
			
		||||
          \ query.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 | 
			
		||||
        summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
 | 
			
		||||
      expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
 | 
			
		||||
      for: 5m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
    - alert: PrometheusNotificationsBacklog
 | 
			
		||||
      annotations:
 | 
			
		||||
        description: "The Prometheus notification queue has not been empty for 10\
 | 
			
		||||
          \ minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 | 
			
		||||
        summary: Prometheus notifications backlog (instance {{ $labels.instance }})
 | 
			
		||||
      expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
 | 
			
		||||
      for: 0m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
    - alert: PrometheusAlertmanagerNotificationFailing
 | 
			
		||||
      annotations:
 | 
			
		||||
        description: "Alertmanager is failing sending notifications\n  VALUE = {{\
 | 
			
		||||
          \ $value }}\n  LABELS = {{ $labels }}"
 | 
			
		||||
        summary: Prometheus AlertManager notification failing (instance {{ $labels.instance
 | 
			
		||||
          }})
 | 
			
		||||
      expr: rate(alertmanager_notifications_failed_total[1m]) > 0
 | 
			
		||||
      for: 0m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: critical
 | 
			
		||||
    - alert: PrometheusTargetEmpty
 | 
			
		||||
      annotations:
 | 
			
		||||
        description: "Prometheus has no target in service discovery\n  VALUE = {{\
 | 
			
		||||
          \ $value }}\n  LABELS = {{ $labels }}"
 | 
			
		||||
        summary: Prometheus target empty (instance {{ $labels.instance }})
 | 
			
		||||
      expr: prometheus_sd_discovered_targets == 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusLargeScrape
      annotations:
        description: "Prometheus has many scrapes that exceed the sample limit\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus large scrape (instance {{ $labels.instance }})
      expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
      for: 5m
      labels:
        severity: warning
    - alert: PrometheusTargetScrapeDuplicate
      annotations:
        description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
      expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
      for: 0m
      labels:
        severity: warning
    - alert: PrometheusTsdbCheckpointCreationFailures
      annotations:
        description: "Prometheus encountered {{ $value }} checkpoint creation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
      expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusTsdbCheckpointDeletionFailures
      annotations:
        description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
      expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusTsdbCompactionsFailed
      annotations:
        description: "Prometheus encountered {{ $value }} TSDB compaction failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
      expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusTsdbHeadTruncationsFailed
      annotations:
        description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
      expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusTsdbReloadFailures
      annotations:
        description: "Prometheus encountered {{ $value }} TSDB reload failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
      expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusTsdbWalCorruptions
      annotations:
        description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus TSDB WAL is corrupt, make sure there is enough disk space and wipe /data/wal
      expr: increase(prometheus_tsdb_wal_corruptions_total[2h]) > 0
      for: 0m
      labels:
        severity: critical
    - alert: PrometheusTsdbWalTruncationsFailed
      annotations:
        description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
      expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
      for: 0m
      labels:
        severity: critical
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: prometheus
  annotations:
    cert-manager.io/cluster-issuer: default
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
    traefik.ingress.kubernetes.io/router.tls: "true"
    external-dns.alpha.kubernetes.io/target: traefik.k-space.ee
    traefik.ingress.kubernetes.io/router.middlewares: traefik-sso@kubernetescrd
spec:
  rules:
  - host: prom.k-space.ee
    http:
      paths:
      - pathType: Prefix
        path: "/"
        backend:
          service:
            name: prometheus-operated
            port:
              number: 9090
  tls:
  - hosts:
    - prom.k-space.ee
    secretName: prom-tls
---
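# NB: prometheus-operated (above) and alertmanager-operated (below) are the
# headless Services that prometheus-operator itself creates for the Prometheus
# and Alertmanager StatefulSets, so no hand-written Service is needed here.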
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: alertmanager
  annotations:
    cert-manager.io/cluster-issuer: default
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
    traefik.ingress.kubernetes.io/router.tls: "true"
    external-dns.alpha.kubernetes.io/target: traefik.k-space.ee
    traefik.ingress.kubernetes.io/router.middlewares: traefik-sso@kubernetescrd
spec:
  rules:
  - host: am.k-space.ee
    http:
      paths:
      - pathType: Prefix
        path: "/"
        backend:
          service:
            name: alertmanager-operated
            port:
              number: 9093
  tls:
  - hosts:
    - am.k-space.ee
    secretName: alertmanager-tls
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: prometheus
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: prometheus
  podMetricsEndpoints:
    - port: web
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: alertmanager
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: alertmanager
  podMetricsEndpoints:
    - port: web
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: operator
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: prometheus-operator
  podMetricsEndpoints:
    - port: http
---
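# The ServiceMonitor below scrapes every kubelet twice over HTTPS using the
# in-cluster ServiceAccount token: once for the kubelet's own metrics and once
# for the embedded cAdvisor container metrics under /metrics/cadvisor.
# insecureSkipVerify is needed because kubelet serving certificates are
# typically self-signed.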
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kubelet
spec:
  endpoints:
  - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
    honorLabels: true
    interval: 30s
    port: https-metrics
    scheme: https
    tlsConfig:
      insecureSkipVerify: true
  - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
    honorLabels: true
    interval: 30s
    path: /metrics/cadvisor
    port: https-metrics
    scheme: https
    tlsConfig:
      insecureSkipVerify: true
  namespaceSelector:
    matchNames:
    - kube-system
  selector:
    matchLabels:
      app.kubernetes.io/name: kubelet
---
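# Like the blackbox rules further down, these kube-state-metrics rules largely
# mirror the https://awesome-prometheus-alerts.grep.to collection.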
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-state-metrics
spec:
  groups:
    - name: kube-state-metrics
      rules:
        - alert: KubernetesNodeReady
          expr: kube_node_status_condition{condition="Ready",status="true"} == 0
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes node not ready (instance {{ $labels.instance }})
            description: "Node {{ $labels.node }} has been unready for a long time\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesMemoryPressure
          expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes memory pressure (instance {{ $labels.instance }})
            description: "{{ $labels.node }} has MemoryPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesDiskPressure
          expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes disk pressure (instance {{ $labels.instance }})
            description: "{{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesOutOfDisk
          expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes out of disk (instance {{ $labels.instance }})
            description: "{{ $labels.node }} has OutOfDisk condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesOutOfCapacity
          expr: sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes out of capacity (instance {{ $labels.instance }})
            description: "{{ $labels.node }} is out of capacity\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesContainerOomKiller
          expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes container oom killer (instance {{ $labels.instance }})
            description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesJobFailed
          expr: kube_job_status_failed > 0
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes Job failed (instance {{ $labels.instance }})
            description: "Job {{ $labels.namespace }}/{{ $labels.exported_job }} failed to complete\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesCronjobSuspended
          expr: kube_cronjob_spec_suspend != 0
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes CronJob suspended (instance {{ $labels.instance }})
            description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesPersistentvolumeclaimPending
          expr: kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }})
            description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesVolumeOutOfDiskSpace
          expr: kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes Volume out of disk space (instance {{ $labels.instance }})
            description: "Volume is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesVolumeFullInFourDays
          expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes Volume full in four days (instance {{ $labels.instance }})
            description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesPersistentvolumeError
          expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }})
            description: "Persistent volume is in bad state\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesStatefulsetDown
          expr: (kube_statefulset_status_replicas_ready / kube_statefulset_status_replicas_current) != 1
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes StatefulSet down (instance {{ $labels.instance }})
            description: "A StatefulSet went down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesHpaScalingAbility
          expr: kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes HPA scaling ability (instance {{ $labels.instance }})
            description: "Pod is unable to scale\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesHpaMetricAvailability
          expr: kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes HPA metric availability (instance {{ $labels.instance }})
            description: "HPA is not able to collect metrics\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesHpaScaleCapability
          expr: kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas
          for: 2m
          labels:
            severity: info
          annotations:
            summary: Kubernetes HPA scale capability (instance {{ $labels.instance }})
            description: "The maximum number of desired Pods has been hit\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesPodNotHealthy
          expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[15m:1m]) > 0
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes Pod not healthy (instance {{ $labels.instance }})
            description: "Pod has been in a non-ready state for longer than 15 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesPodCrashLooping
          expr: increase(kube_pod_container_status_restarts_total[1m]) > 3
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
            description: "Pod {{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesReplicassetMismatch
          expr: kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes ReplicaSet mismatch (instance {{ $labels.instance }})
            description: "ReplicaSet replicas mismatch\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesDeploymentReplicasMismatch
          expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }})
            description: "Deployment replicas mismatch\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesStatefulsetReplicasMismatch
          expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})
            description: "A StatefulSet does not match the expected number of replicas.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesDeploymentGenerationMismatch
          expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})
            description: "A Deployment has failed but has not been rolled back.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesStatefulsetGenerationMismatch
          expr: kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }})
            description: "A StatefulSet has failed but has not been rolled back.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesStatefulsetUpdateNotRolledOut
          expr: max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }})
            description: "StatefulSet update has not been rolled out.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesDaemonsetRolloutStuck
          expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }})
            description: "Some Pods of DaemonSet are not scheduled or not ready\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesDaemonsetMisscheduled
          expr: kube_daemonset_status_number_misscheduled > 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }})
            description: "Some DaemonSet Pods are running where they are not supposed to run\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesCronjobTooLong
          expr: time() - kube_cronjob_next_schedule_time > 3600
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes CronJob too long (instance {{ $labels.instance }})
            description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesJobSlowCompletion
          expr: kube_job_spec_completions - kube_job_status_succeeded > 0
          for: 12h
          labels:
            severity: critical
          annotations:
            summary: Kubernetes job slow completion (instance {{ $labels.instance }})
            description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesApiServerErrors
          expr: sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes API server errors (instance {{ $labels.instance }})
            description: "Kubernetes API server is experiencing high error rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesApiClientErrors
          expr: (sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes API client errors (instance {{ $labels.instance }})
            description: "Kubernetes API client is experiencing high error rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesClientCertificateExpiresNextWeek
          expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
            description: "A client certificate used to authenticate to the apiserver is expiring next week.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesClientCertificateExpiresSoon
          expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: Kubernetes client certificate expires soon (instance {{ $labels.instance }})
            description: "A client certificate used to authenticate to the apiserver is expiring in less than 24 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        - alert: KubernetesApiServerLatency
          expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}[10m])) WITHOUT (instance, resource)) / 1e+06 > 1
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Kubernetes API server latency (instance {{ $labels.instance }})
            description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
prometheus-operator/blackbox-exporter.yml (new file, 258 lines)
@@ -0,0 +1,258 @@
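# In the Probe objects below `url: blackbox-exporter` refers to the ClusterIP
# Service defined at the bottom of this file (port 80), and `module` must
# match a module key in the blackbox-exporter-config ConfigMap.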
---
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: websites
spec:
  prober:
    url: blackbox-exporter
    path: /probe
  module: http_2xx
  targets:
    staticConfig:
      static:
        - https://git.k-space.ee/
        - https://grafana.k-space.ee/
        - https://wiki.k-space.ee/
        - https://pad.k-space.ee/
        - https://members.k-space.ee/
        - https://nextcloud.k-space.ee/
        - http://minio.infra.k-space.ee:9001/login
---
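# For the DNS modules the probe targets are the DNS servers being queried,
# not the names being resolved; the queried name (traefik.k-space.ee) lives
# in the module definition in the ConfigMap.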
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: k6.ee
spec:
  prober:
    url: blackbox-exporter
    path: /probe
  module: dns_check_traefik
  targets:
    staticConfig:
      static:
        - 193.40.103.2
        - 62.65.250.2
---
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: samba-cluster
spec:
  prober:
    url: blackbox-exporter
    path: /probe
  module: tcp_connect
  targets:
    staticConfig:
      static:
        - dc1.ad.k-space.ee:636
        - dc2.ad.k-space.ee:636
        - dc3.ad.k-space.ee:636
---
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: misc
spec:
  prober:
    url: blackbox-exporter
    path: /probe
  module: tcp_connect
  targets:
    staticConfig:
      static:
        - mail.k-space.ee:465
        - dev.k-space.ee:10648
        - mariadb.infra.k-space.ee:3306
---
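# A minimal sketch of how a ping check could be added using the `icmp` module
# already defined in the ConfigMap below; the target host is hypothetical:
#
# apiVersion: monitoring.coreos.com/v1
# kind: Probe
# metadata:
#   name: ping-example
# spec:
#   prober:
#     url: blackbox-exporter
#     path: /probe
#   module: icmp
#   targets:
#     staticConfig:
#       static:
#         - example.k-space.ee  # hypothetical target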
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: blackbox-exporter
spec:
  # https://awesome-prometheus-alerts.grep.to/rules#blackbox
  groups:
  - name: blackbox
    rules:
    - alert: BlackboxProbeFailed
      expr: probe_success == 0
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Blackbox probe failed (instance {{ $labels.instance }})
        description: Probe failed
    - alert: BlackboxSlowProbe
      expr: avg_over_time(probe_duration_seconds[1m]) > 1
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Blackbox slow probe (instance {{ $labels.instance }})
        description: Blackbox probe took more than 1s to complete
    - alert: BlackboxSlowDNS
      expr: avg_over_time(probe_dns_lookup_time_seconds[1m]) > 1
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Blackbox slow DNS lookup (instance {{ $labels.instance }})
        description: Blackbox DNS lookup took more than 1s to complete.
          Using IPv6 DNS servers in conjunction with Docker seemed to result
          in an odd 5s latency bump; for now we're using 8.8.8.8 because of that.
    - alert: BlackboxProbeHttpFailure
      expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
        description: HTTP status code is not 200-399
    - alert: BlackboxSslCertificateWillExpireSoon
      expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
        description: SSL certificate expires in 30 days
    - alert: BlackboxSslCertificateWillExpireSoon
      expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
        description: SSL certificate expires in 3 days
    - alert: BlackboxSslCertificateExpired
      expr: probe_ssl_earliest_cert_expiry - time() <= 0
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
        description: SSL certificate has expired already
    - alert: BlackboxProbeSlowHttp
      expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
        description: HTTP request took more than 1s
    - alert: BlackboxProbeSlowPing
      expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: Blackbox probe slow ping (instance {{ $labels.instance }})
        description: Blackbox ping took more than 1s
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: blackbox-exporter
spec:
  revisionHistoryLimit: 0
  replicas: 2
  selector:
    matchLabels:
      app: blackbox-exporter
  template:
    metadata:
      labels:
        app: blackbox-exporter
    spec:
      containers:
      - name: blackbox-exporter
        image: prom/blackbox-exporter:v0.20.0
        volumeMounts:
        - name: blackbox-exporter-config
          mountPath: /etc/blackbox_exporter
      volumes:
        - name: blackbox-exporter-config
          configMap:
            name: blackbox-exporter-config
      # TODO: Results in odd 6s connection lag if scheduled in VLAN20
      nodeSelector:
        dedicated: monitoring
      tolerations:
        - key: dedicated
          operator: Equal
          value: monitoring
          effect: NoSchedule
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchExpressions:
              - key: app
                operator: In
                values:
                - blackbox-exporter
            topologyKey: "kubernetes.io/hostname"
---
kind: Service
apiVersion: v1
metadata:
  name: blackbox-exporter
spec:
  type: ClusterIP
  ports:
    - name: http
      port: 80
      protocol: TCP
      targetPort: 9115
  selector:
    app: blackbox-exporter
---
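# The Service maps port 80 to blackbox-exporter's default port 9115, which is
# why the Probes above can point at plain `url: blackbox-exporter` without a
# port number.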
apiVersion: v1
kind: ConfigMap
metadata:
  name: blackbox-exporter-config
data:
  config.yml: |-
    modules:
      http_2xx:
        prober: http
        http:
          preferred_ip_protocol: "ip4"
          ip_protocol_fallback: false
      http_post_2xx:
        prober: http
        http:
          method: POST
          preferred_ip_protocol: "ip4"
          ip_protocol_fallback: false
      tcp_connect:
        prober: tcp
        tcp:
          preferred_ip_protocol: "ip4"
          ip_protocol_fallback: false
      icmp:
        prober: icmp
        icmp:
          preferred_ip_protocol: "ip4"
          ip_protocol_fallback: false
      dns_check_traefik:
        prober: dns
        dns:
          query_name: "traefik.k-space.ee"
          query_type: "A"
          validate_answer_rrs:
            fail_if_not_matches_regexp:
              - "traefik\\.k-space\\.ee\\.\\t.*\\tIN\\tA\\t193\\.40\\.103\\.[1-9][0-9]*"
          preferred_ip_protocol: "ip4"
          ip_protocol_fallback: false
      dns_check_k6:
        prober: dns
        dns:
          query_name: "k6.ee"
          query_type: "A"
          validate_answer_rrs:
            fail_if_not_matches_regexp:
              - "k6\\.ee\\.\\t.*\\tIN\\tA\\t193\\.40\\.103\\.[1-9][0-9]*"
          preferred_ip_protocol: "ip4"
          ip_protocol_fallback: false
prometheus-operator/bundle.yml (new file, 28816 lines)
File diff suppressed because it is too large
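# bundle.yml is, by all appearances, the upstream prometheus-operator bundle
# manifest (the CRDs plus the operator Deployment) vendored verbatim, hence
# its size.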
											
										
									
								
							
							
								
								
									
prometheus-operator/mikrotik-exporter.yml (new file, 104 lines)
@@ -0,0 +1,104 @@
---
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: mikrotik
spec:
  bearerTokenSecret:
    name: mikrotik-exporter
    key: PROMETHEUS_BEARER_TOKEN
  prober:
    path: /metrics
    url: mikrotik-exporter
  targets:
    staticConfig:
      static:
        - router.mgmt.k-space.ee
        - sw_chaos.mgmt.k-space.ee
        - sw_poe.mgmt.k-space.ee
        - sw_mgmt.mgmt.k-space.ee
        - sw_core02.mgmt.k-space.ee
        - sw_cyber.mgmt.k-space.ee
        - sw_ha.mgmt.k-space.ee
        - sw_asocial.mgmt.k-space.ee
        - sw_kitchen.mgmt.k-space.ee
        - sw_core01.mgmt.k-space.ee
---
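# Unlike the blackbox Probes, the prober here is our own mikrotik-exporter
# (deployed below): Prometheus authenticates to it with the bearer token from
# the mikrotik-exporter Secret, and the exporter logs into each switch as
# MIKROTIK_USER to collect the metrics.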
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: mikrotik
spec:
  groups:
  - name: mikrotik
    rules:
    - alert: MikrotikUplinkRedundancyLost
      expr: mikrotik_interface_running{port=~"sfp-sfpplus[12]", instance!~"sw_core.*", instance!~"sw_mgmt.*"} == 0
      for: 0m
      labels:
        severity: error
      annotations:
        summary: Switch uplink high availability lost
        description: One of the two 10Gb optical links is malfunctioning
    - alert: MikrotikLinkRateDegraded
      expr: mikrotik_interface_rate{port=~"sfp-sfpplus.*"} < 10000000000
      for: 0m
      labels:
        severity: error
      annotations:
        summary: 10Gb link degraded
        description: One of the 10Gb links is running at a lower speed than expected
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: mikrotik-exporter
spec:
  revisionHistoryLimit: 0
  replicas: 2
  selector:
    matchLabels:
      app: mikrotik-exporter
  template:
    metadata:
      labels:
        app: mikrotik-exporter
      annotations:
        co.elastic.logs/multiline.pattern: '^  '
        co.elastic.logs/multiline.negate: "false"
        co.elastic.logs/multiline.match: after
    spec:
      containers:
      - name: mikrotik-exporter
        image: harbor.k-space.ee/k-space/mikrotik-exporter:latest
        env:
          - name: MIKROTIK_USER
            value: netpoller
        envFrom:
          - secretRef:
              name: mikrotik-exporter
      nodeSelector:
        dedicated: monitoring
      tolerations:
      - key: dedicated
        operator: Equal
        value: monitoring
        effect: NoSchedule
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
          # A labelSelector is needed for the anti-affinity to actually spread
          # the replicas; without one the term matches no pods and is a no-op.
          - labelSelector:
              matchLabels:
                app: mikrotik-exporter
            topologyKey: "kubernetes.io/hostname"
---
kind: Service
apiVersion: v1
metadata:
  name: mikrotik-exporter
spec:
  type: ClusterIP
  ports:
    - name: http
      port: 80
      protocol: TCP
      targetPort: 3001
  selector:
    app: mikrotik-exporter
prometheus-operator/node-exporter.yml (new file, 443 lines)
@@ -0,0 +1,443 @@
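# The bare-metal hosts below run a plain node_exporter on port 9100; the
# Probe objects scrape them directly (see the relabeling note after the
# first Probe).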
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: nodes-proxmox
spec:
  targets:
    staticConfig:
      static:
        - nas.mgmt.k-space.ee:9100
        - pve1.proxmox.infra.k-space.ee:9100
        - pve8.proxmox.infra.k-space.ee:9100
        - pve9.proxmox.infra.k-space.ee:9100
      relabelingConfigs:
      - sourceLabels: [__param_target]
        targetLabel: instance
      - sourceLabels: [__param_target]
        targetLabel: __address__
  prober:
    url: localhost
    path: /metrics
  metricRelabelings:
  - sourceLabels: [__address__]
    targetLabel: target
---
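# The relabelingConfigs above (repeated in nodes-misc below) copy each static
# target into both the `instance` label and `__address__`, so Prometheus
# scrapes the node_exporter on the target itself; the `url: localhost` prober
# placeholder is never actually contacted. This is the usual trick for
# scraping static targets through the Probe CRD without a blackbox exporter
# in between.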
apiVersion: monitoring.coreos.com/v1
kind: Probe
metadata:
  name: nodes-misc
spec:
  targets:
    staticConfig:
      static:
        - sprucecone.infra.k-space.ee:9100
        - cedarcone.infra.k-space.ee:9100
      relabelingConfigs:
      - sourceLabels: [__param_target]
        targetLabel: instance
      - sourceLabels: [__param_target]
        targetLabel: __address__
  prober:
    url: localhost
    path: /metrics
  metricRelabelings:
  - sourceLabels: [__address__]
    targetLabel: target
---
apiVersion: monitoring.coreos.com/v1
 | 
			
		||||
kind: PrometheusRule
 | 
			
		||||
metadata:
 | 
			
		||||
  name: node-exporter
 | 
			
		||||
spec:
 | 
			
		||||
  groups:
 | 
			
		||||
  - name: node-exporter
 | 
			
		||||
    rules:
 | 
			
		||||
    - alert: ZfsOfflinePool
 | 
			
		||||
      expr: node_zfs_zpool_state{state!="online"} > 0
 | 
			
		||||
      for: 1m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: critical
 | 
			
		||||
      annotations:
 | 
			
		||||
        summary: ZFS offline pool (instance {{ $labels.instance }})
 | 
			
		||||
        description: "A ZFS zpool is in a unexpected state: {{ $labels.state }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 | 
			
		||||
    - alert: HostHighLoad
 | 
			
		||||
      expr: sum(node_load1{}) by (instance) / count(node_cpu_seconds_total{mode="user"}) by (instance) > 2.5
 | 
			
		||||
      for: 15m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
      annotations:
 | 
			
		||||
        summary: Host under high load
 | 
			
		||||
        description: Many processes are queued up for execution
 | 
			
		||||
    - alert: HostOutOfMemory
 | 
			
		||||
      expr: (node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_MemFree_bytes ) / node_memory_MemTotal_bytes * 100 < 20
 | 
			
		||||
      for: 2m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
      annotations:
 | 
			
		||||
        summary: Host out of memory (instance {{ $labels.instance }})
 | 
			
		||||
        description: Node memory is filling up (< 10% left)
 | 
			
		||||
    - alert: HostMemoryUnderMemoryPressure
 | 
			
		||||
      expr: rate(node_vmstat_pgmajfault[1m]) > 1000
 | 
			
		||||
      for: 2m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
      annotations:
 | 
			
		||||
        summary: Host memory under memory pressure (instance {{ $labels.instance }})
 | 
			
		||||
        description: The node is under heavy memory pressure. High rate of major page faults
 | 
			
		||||
    - alert: HostUnusualNetworkThroughputIn
 | 
			
		||||
      expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) > 160e+06
 | 
			
		||||
      for: 1h
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
      annotations:
 | 
			
		||||
        summary: Host unusual network throughput in (instance {{ $labels.instance }})
 | 
			
		||||
        description: Host network interfaces are probably receiving too much data (> 160 MB/s)
 | 
			
		||||
    - alert: HostUnusualNetworkThroughputOut
 | 
			
		||||
      expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) > 160e+06
 | 
			
		||||
      for: 1h
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
      annotations:
 | 
			
		||||
        summary: Host unusual network throughput out (instance {{ $labels.instance }})
 | 
			
		||||
        description: Host network interfaces are probably sending too much data (> 160 MB/s)
 | 
			
		||||
    - alert: HostUnusualDiskReadRate
 | 
			
		||||
      expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) > 50000000
 | 
			
		||||
      for: 1h
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
      annotations:
 | 
			
		||||
        summary: Host unusual disk read rate (instance {{ $labels.instance }})
 | 
			
		||||
        description: Disk is probably reading too much data (> 50 MB/s)
 | 
			
		||||
    - alert: HostUnusualDiskWriteRate
 | 
			
		||||
      expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) > 50000000
 | 
			
		||||
      for: 1h
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
      annotations:
 | 
			
		||||
        summary: Host unusual disk write rate (instance {{ $labels.instance }})
 | 
			
		||||
        description: Disk is probably writing too much data (> 50 MB/s)
 | 
			
		||||
    # Please add ignored mountpoints in node_exporter parameters like
 | 
			
		||||
    # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
 | 
			
		||||
    # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
 | 
			
		||||
    - alert: HostOutOfDiskSpace
 | 
			
		||||
      expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
 | 
			
		||||
      for: 2m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
      annotations:
 | 
			
		||||
        summary: Host out of disk space (instance {{ $labels.instance }})
 | 
			
		||||
        description: Disk is almost full (< 10% left)
 | 
			
		||||
    # Please add ignored mountpoints in node_exporter parameters like
 | 
			
		||||
    # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
 | 
			
		||||
    # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
 | 
			
		||||
    - alert: HostDiskWillFillIn24Hours
 | 
			
		||||
      expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
 | 
			
		||||
      for: 2m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
      annotations:
 | 
			
		||||
        summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
 | 
			
		||||
        description: Filesystem is predicted to run out of space within the next 24 hours at current write rate
 | 
			
		||||
    - alert: HostOutOfInodes
 | 
			
		||||
      expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
 | 
			
		||||
      for: 2m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
      annotations:
 | 
			
		||||
        summary: Host out of inodes (instance {{ $labels.instance }})
 | 
			
		||||
        description: Disk is almost running out of available inodes (< 10% left)
 | 
			
		||||
    - alert: HostInodesWillFillIn24Hours
 | 
			
		||||
      expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
 | 
			
		||||
      for: 2m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
      annotations:
 | 
			
		||||
        summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
 | 
			
		||||
        description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate
 | 
			
		||||
    - alert: HostUnusualDiskReadLatency
 | 
			
		||||
      expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
 | 
			
		||||
      for: 2m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
      annotations:
 | 
			
		||||
        summary: Host unusual disk read latency (instance {{ $labels.instance }})
 | 
			
		||||
        description: Disk latency is growing (read operations > 100ms)
 | 
			
		||||
    - alert: HostUnusualDiskWriteLatency
 | 
			
		||||
      expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
 | 
			
		||||
      for: 2m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
      annotations:
 | 
			
		||||
        summary: Host unusual disk write latency (instance {{ $labels.instance }})
 | 
			
		||||
        description: Disk latency is growing (write operations > 100ms)
 | 
			
		||||
    - alert: HostCpuStealNoisyNeighbor
 | 
			
		||||
      expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
 | 
			
		||||
      for: 0m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
      annotations:
 | 
			
		||||
        summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
 | 
			
		||||
        description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.
 | 
			
		||||
    # 1000 context switches is an arbitrary number.
 | 
			
		||||
    # Alert threshold depends on nature of application.
 | 
			
		||||
    # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
 | 
			
		||||
    - alert: HostContextSwitching
 | 
			
		||||
      expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 50000
 | 
			
		||||
      for: 0m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
      annotations:
 | 
			
		||||
        summary: Host context switching (instance {{ $labels.instance }})
 | 
			
		||||
        description: Context switching is growing on node (> 50000 / s)
 | 
			
		||||
    - alert: HostSwapIsEnabled
 | 
			
		||||
      expr: node_memory_SwapTotal_bytes > 0
 | 
			
		||||
      for: 0m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
      annotations:
 | 
			
		||||
        summary: Swap is discouraged nowadays
 | 
			
		||||
    - alert: HostPhysicalComponentTooHot
 | 
			
		||||
      expr: node_hwmon_temp_celsius > 75
 | 
			
		||||
      for: 5m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
      annotations:
 | 
			
		||||
        summary: Host physical component too hot (instance {{ $labels.instance }})
 | 
			
		||||
        description: Physical hardware component too hot
 | 
			
		||||
    - alert: HostNodeOvertemperatureAlarm
 | 
			
		||||
      expr: node_hwmon_temp_alarm == 1
 | 
			
		||||
      for: 0m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: critical
 | 
			
		||||
      annotations:
 | 
			
		||||
        summary: Host node overtemperature alarm (instance {{ $labels.instance }})
 | 
			
		||||
        description: Physical node temperature alarm triggered
 | 
			
		||||
    - alert: HostRaidArrayGotInactive
      expr: node_md_state{state="inactive"} > 0
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Host RAID array got inactive (instance {{ $labels.instance }})
        description: RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.
    - alert: HostRaidDiskFailure
      expr: node_md_disks{state="failed"} > 0
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host RAID disk failure (instance {{ $labels.instance }})
        description: At least one device in the RAID array on {{ $labels.instance }} has failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.
    - alert: HostOomKillDetected
      expr: increase(node_vmstat_oom_kill[1m]) > 0
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Host OOM kill detected (instance {{ $labels.instance }})
        description: OOM kill detected
    - alert: HostEdacCorrectableErrorsDetected
      expr: increase(node_edac_correctable_errors_total[1m]) > 0
      for: 0m
      labels:
        severity: info
      annotations:
        summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
        description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.'
    - alert: HostEdacUncorrectableErrorsDetected
      expr: node_edac_uncorrectable_errors_total > 0
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
        description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.'
    - alert: HostNetworkReceiveErrors
      expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host Network Receive Errors (instance {{ $labels.instance }})
        description: '{{ $labels.instance }} interface {{ $labels.device }} has had a receive error rate above 1% over the last two minutes.'
    - alert: HostNetworkTransmitErrors
      expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host Network Transmit Errors (instance {{ $labels.instance }})
        description: '{{ $labels.instance }} interface {{ $labels.device }} has had a transmit error rate above 1% over the last two minutes.'
    - alert: HostNetworkInterfaceSaturated
      expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: Host Network Interface Saturated (instance {{ $labels.instance }})
        description: The network interface "{{ $labels.device }}" on "{{ $labels.instance }}" is getting overloaded.
    - alert: HostNetworkBondDegraded
      expr: node_bonding_active != node_bonding_slaves{master=~"bond.*"}
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host Network Bond Degraded (instance {{ $labels.instance }})
    - alert: HostConntrackLimit
      expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Host conntrack limit (instance {{ $labels.instance }})
        description: The number of conntrack entries is approaching the limit
    - alert: HostClockSkew
      expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host clock skew (instance {{ $labels.instance }})
        description: Clock skew detected. Clock is out of sync.
    - alert: HostClockNotSynchronising
      expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host clock not synchronising (instance {{ $labels.instance }})
        description: Clock not synchronising.
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: smart
spec:
  groups:
    - name: smart
      rules:
      - alert: SmartSSDWriteRateTooHigh
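        # Assuming 512-byte LBAs (true for most drives, though some report
        # other units), rate() * 512 below converts to bytes per second, so
        # the threshold is ~10 MB/s sustained over a 72h window.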
        expr: rate(smartmon_total_lbas_written_raw_value[72h]) * 512 > 10000000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: SSD write rate exceeds 10MB/s
          description: At this rate the SSD will be worn out before the warranty period expires
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: temperatures
spec:
  groups:
    - name: temperatures
      rules:
      - alert: HighDiskTemperature
        expr: smartmon_airflow_temperature_cel_raw_value > 45 or smartmon_temperature_celsius_raw_value > 45
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: High HDD/SSD temperature indicates high ambient temperature
      - alert: HighChipsetTemperature
        expr: node_hwmon_temp_celsius > 65
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: High chipset (CPU, NB) temperature indicates insufficient or failing fans
      - alert: LowDiskTemperature
        expr: smartmon_airflow_temperature_cel_raw_value < 10 or smartmon_temperature_celsius_raw_value < 10
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: Low HDD/SSD temperature indicates low ambient temperature and a stuck server room exhaust fan relay
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: node-exporter
spec:
  selector:
    matchLabels:
      app: node-exporter
  podMetricsEndpoints:
    - port: web
      scrapeTimeout: 30s
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: node-exporter
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  labels:
    app: node-exporter
  name: node-exporter
  annotations:
    keel.sh/policy: force
    keel.sh/trigger: poll
    keel.sh/pollSchedule: "@midnight"
spec:
  selector:
    matchLabels:
      app: node-exporter
  template:
    metadata:
      labels:
        app: node-exporter
    spec:
      containers:
      - name: node-exporter
        args:
        - --web.listen-address=0.0.0.0:9101
        - --path.sysfs=/host/sys
        - --path.rootfs=/host/root
        - --no-collector.wifi
        - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)
        - --collector.netclass.ignored-devices=^(veth.*|[a-f0-9]{15})$
        - --collector.netdev.device-exclude=^(veth.*|[a-f0-9]{15})$
        image: prom/node-exporter:v1.3.1
        resources:
          limits:
            cpu: 50m
            memory: 180Mi
          requests:
            cpu: 5m
            memory: 20Mi
        volumeMounts:
        - mountPath: /host/sys
          mountPropagation: HostToContainer
          name: sys
          readOnly: true
        - mountPath: /host/root
          mountPropagation: HostToContainer
          name: root
          readOnly: true
        ports:
        - containerPort: 9101
          name: web
        securityContext:
          runAsGroup: 65532
          runAsNonRoot: true
          runAsUser: 65532
          readOnlyRootFilesystem: true
      hostNetwork: true
      hostPID: true
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
      serviceAccountName: node-exporter
      tolerations:
      - operator: Exists
      volumes:
      - hostPath:
          path: /sys
        name: sys
      - hostPath:
          path: /
        name: root
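# With hostNetwork enabled the exporter listens on 0.0.0.0:9101 on every node,
# so a quick sanity check from any machine that can reach a node is, e.g.
# (<node-ip> is a placeholder):
#   curl -s http://<node-ip>:9101/metrics | head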
172 prometheus-operator/snmp-exporter.yml Normal file
@@ -0,0 +1,172 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: snmp-exporter
spec:
  replicas: 2
  selector:
    matchLabels:
      app: snmp-exporter
  template:
    metadata:
      labels:
        app: snmp-exporter
    spec:
      containers:
        - image: prom/snmp-exporter:latest
          name: snmp-exporter
          imagePullPolicy: Always
          securityContext:
            runAsNonRoot: true
            runAsUser: 1000
            readOnlyRootFilesystem: true
          ports:
          - containerPort: 9116
            name: exporter
          livenessProbe:
            httpGet:
              path: /health
              port: exporter
          readinessProbe:
            httpGet:
              path: /health
              port: exporter
          volumeMounts:
          - name: snmp-exporter
            mountPath: /etc/snmp_exporter
      volumes:
        - name: snmp-exporter
          configMap:
            name: snmp-exporter
      nodeSelector:
        dedicated: monitoring
      tolerations:
      - key: dedicated
        operator: Equal
        value: monitoring
        effect: NoSchedule
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchExpressions:
              - key: app
                operator: In
                values:
                - snmp-exporter
            topologyKey: "kubernetes.io/hostname"
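# Note: replicas: 2 combined with the required pod anti-affinity on
# kubernetes.io/hostname means the two exporter pods always land on distinct
# monitoring nodes, so losing one node does not interrupt SNMP scraping.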
---
kind: Service
apiVersion: v1
metadata:
  name: snmp-exporter
spec:
  type: ClusterIP
  ports:
    - name: exporter
      port: 9116
      protocol: TCP
  selector:
    app: snmp-exporter
---
kind: Probe
apiVersion: monitoring.coreos.com/v1
metadata:
  name: ups
spec:
  interval: 60s
  module: rfc1628_ups
  prober:
    url: snmp-exporter:9116
    path: /snmp
  targets:
    staticConfig:
      static:
        - ups-4.mgmt.k-space.ee
        - ups-5.mgmt.k-space.ee
        - ups-6.mgmt.k-space.ee
        - ups-7.mgmt.k-space.ee
        - ups-8.mgmt.k-space.ee
        - ups-9.mgmt.k-space.ee
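# To spot-check what this Probe collects, snmp_exporter's HTTP API can be
# queried directly from inside the cluster, e.g.:
#   curl 'http://snmp-exporter:9116/snmp?module=rfc1628_ups&target=ups-4.mgmt.k-space.ee'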
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: ups
spec:
  groups:
  - name: ups
    rules:
    - alert: UPSBatteryLost
      annotations:
        summary: One or more UPSes have degraded batteries.
      expr: snmp_upsBatteryStatus{upsBatteryStatus!="batteryNormal"} > 0
      for: 1m
      labels:
        severity: critical
    - alert: UPSPowerLost
      annotations:
        summary: One or more UPSes are not in normal operation mode. This means
          either input power is lost or a UPS was overloaded and is now in bypass mode.
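      # The constant 6 below matches the number of UPS targets listed in the
      # Probe above; update it whenever a UPS is added or removed.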
      expr: sum(snmp_upsOutputSource{upsOutputSource="normal"}) < 6
      for: 1m
      labels:
        severity: critical
    - alert: UPSExcessivelyLoaded
      annotations:
        summary: One or more UPSes are loaded more than 80%. Make sure the load
          is balanced across UPSes and that no UPS stays above 80% load.
      expr: snmp_upsOutputPercentLoad > 80
      for: 1h
      labels:
        severity: critical
---
kind: Probe
apiVersion: monitoring.coreos.com/v1
metadata:
  name: printer
spec:
  interval: 60s
  scrapeTimeout: 50s
  module: printer_mib
  prober:
    url: snmp-exporter:9116
    path: /snmp
  targets:
    staticConfig:
      static:
        - mfp-cyber.pub.k-space.ee
        - mfp-chaos.pub.k-space.ee
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: printer
spec:
  groups:
  - name: printer
    rules:
    - alert: PrinterNeedsAttention
      annotations:
        summary: Printer is in an error state. If the underlying reason is 'low
          on paper', make sure there is enough paper near the printer. If not,
          drop a line to accounting@k-space.ee to order more office supplies.
      expr: snmp_hrPrinterDetectedErrorState == 1
      for: 0m
      labels:
        severity: warning
---
kind: Probe
apiVersion: monitoring.coreos.com/v1
metadata:
  name: beamer
spec:
  interval: 60s
  module: epson_beamer
  prober:
    url: snmp-exporter:9116
    path: /snmp
  targets:
    staticConfig:
      static:
        - beamer-cyber.sec.k-space.ee
1272 prometheus-operator/snmp.yml Normal file
File diff suppressed because it is too large
@@ -19,7 +19,7 @@ but it does not export Prometheus metrics either.
To apply changes, run in this directory:

```
kubectl apply -n rosdump -f cronjob.yaml
kubectl apply -n rosdump -f application.yml
```

To trigger cronjob:

@@ -87,7 +87,6 @@ spec:
                          path: ssh_known_hosts
                  - configMap:
                      name: rosdump-config

---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
@@ -108,3 +107,19 @@ spec:
    ports:
    - protocol: TCP
      port: 22
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: rosdump
spec:
  groups:
    - name: rosdump
      rules:
        - alert: MikrotikBackupsBroken
          expr: absent(kube_cronjob_status_last_successful_time{cronjob="rosdump-cronjob"}) or time() - kube_cronjob_status_last_successful_time{cronjob="rosdump-cronjob"} > 3600
          for: 4h
          labels:
            severity: warning
          annotations:
            summary: Mikrotik backups are broken
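        # To exercise the backup path by hand (the job name below is
        # arbitrary):
        #   kubectl create job -n rosdump --from=cronjob/rosdump-cronjob manual-backup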

@@ -1,6 +1,7 @@
Traefik Ingress Controller:

```
kubectl create namespace traefik
helm template --include-crds -n traefik --release-name k6 traefik/traefik -f values.yml > application.yml
kubectl apply -n traefik -f namespace.yml -f application.yml -f application-extras.yml -f whoami.yml -f proxmox.yml -f voron.yml
kubectl apply -n traefik -f application.yml -f application-extras.yml -f whoami.yml -f proxmox.yml -f voron.yml
```

@@ -28,9 +28,6 @@ kind: Service
metadata:
  name: traefik-metrics
  namespace: traefik
  annotations:
    prometheus.io/scrape: 'true'
    prometheus.io/port: '9100'
spec:
  selector:
    app.kubernetes.io/name: traefik
@@ -92,6 +89,16 @@ spec:
  - Ingress
  - Egress
  ingress:
  - from:
    - namespaceSelector:
        matchLabels:
          kubernetes.io/metadata.name: prometheus-operator
      podSelector:
        matchLabels:
          app.kubernetes.io/name: prometheus
    ports:
    - protocol: TCP
      port: 9100
  - from:
    - ipBlock:
        cidr: 0.0.0.0/0
@@ -109,3 +116,14 @@ spec:
  replacePathRegex:
    regex: ^/metrics
    replacement: /
---
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: traefik
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: traefik
  podMetricsEndpoints:
    - port: metrics

@@ -17,9 +17,8 @@ deployment:
    keel.sh/trigger: patch
    keel.sh/pollSchedule: "@midnight"

  podAnnotations:
    prometheus.io/scrape: 'true'
    prometheus.io/port: '9100'
accessLog:
  format: json

# Globally redirect to https://
globalArguments: