forked from k-space/kube
		
	Migrate to Prometheus Operator
This commit is contained in:
		| @@ -1,17 +1,14 @@ | ||||
| apiVersion: argoproj.io/v1alpha1 | ||||
| kind: Application | ||||
| metadata: | ||||
|   name: monitoring | ||||
|   name: prometheus-operator | ||||
|   namespace: argocd | ||||
| spec: | ||||
|   project: default | ||||
|   source: | ||||
|     repoURL: 'git@git.k-space.ee:k-space/kube.git' | ||||
|     path: monitoring | ||||
|     path: prometheus-operator | ||||
|     targetRevision: HEAD | ||||
|   destination: | ||||
|     server: 'https://kubernetes.default.svc' | ||||
|     namespace: monitoring | ||||
|   syncPolicy: | ||||
|     syncOptions: | ||||
|       - CreateNamespace=true | ||||
|     namespace: prometheus-operator | ||||
							
								
								
									
										33
									
								
								argocd/monitoring.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										33
									
								
								argocd/monitoring.yml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,33 @@ | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PodMonitor | ||||
| metadata: | ||||
|   name: argocd | ||||
| spec: | ||||
|   selector: {} | ||||
|   podMetricsEndpoints: | ||||
|   - port: metrics | ||||
|   - port: controller | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PrometheusRule | ||||
| metadata: | ||||
|   name: argocd | ||||
| spec: | ||||
|   groups: | ||||
|   - name: argocd | ||||
|     rules: | ||||
|     - alert: ArgoNotSynced | ||||
|       annotations: | ||||
|         summary: Some applications in Argo are out of sync | ||||
|       expr: sum by (dest_namespace) (argocd_app_info{sync_status!="Synced"}) > 0 | ||||
|       for: 8h | ||||
|       labels: | ||||
|         severity: warning | ||||
|     - alert: ArgoNotHealthy | ||||
|       annotations: | ||||
|         summary: Some applications in Argo are not healthy | ||||
|       expr: argocd_app_info{health_status!="Healthy"} | ||||
|       for: 30m | ||||
|       labels: | ||||
|         severity: warning | ||||
| @@ -77,10 +77,6 @@ server: | ||||
|  | ||||
|   metrics: | ||||
|     enabled: true | ||||
|     service: | ||||
|       annotations: | ||||
|         prometheus.io/scrape: "true" | ||||
|         prometheus.io/port: "8083" | ||||
|  | ||||
| # We don't use ApplicationSet CRD-s (yet) | ||||
| applicationSet: | ||||
| @@ -89,26 +85,14 @@ applicationSet: | ||||
| repoServer: | ||||
|   metrics: | ||||
|     enabled: true | ||||
|     service: | ||||
|       annotations: | ||||
|         prometheus.io/scrape: "true" | ||||
|         prometheus.io/port: "8084" | ||||
|  | ||||
| notifications: | ||||
|   metrics: | ||||
|     enabled: true | ||||
|     service: | ||||
|       annotations: | ||||
|         prometheus.io/scrape: "true" | ||||
|         prometheus.io/port: "9001" | ||||
|  | ||||
| controller: | ||||
|   metrics: | ||||
|     enabled: true | ||||
|     service: | ||||
|       annotations: | ||||
|         prometheus.io/scrape: "true" | ||||
|         prometheus.io/port: "8082" | ||||
|  | ||||
| configs: | ||||
|   secret: | ||||
|   | ||||
| @@ -10,11 +10,11 @@ spec: | ||||
|   replicas: 2 | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app: camtiler | ||||
|       app.kubernetes.io/name: camtiler | ||||
|   template: | ||||
|     metadata: | ||||
|       labels: | ||||
|         app: camtiler | ||||
|         app.kubernetes.io/name: camtiler | ||||
|         component: camtiler | ||||
|     spec: | ||||
|       serviceAccountName: camtiler | ||||
| @@ -25,6 +25,9 @@ spec: | ||||
|             readOnlyRootFilesystem: true | ||||
|             runAsNonRoot: true | ||||
|             runAsUser: 1000 | ||||
|           ports: | ||||
|             - containerPort: 5000 | ||||
|               name: "http" | ||||
| --- | ||||
| apiVersion: apps/v1 | ||||
| kind: Deployment | ||||
| @@ -38,11 +41,11 @@ spec: | ||||
|   replicas: 2 | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app: log-viewer-frontend | ||||
|       app.kubernetes.io/name: log-viewer-frontend | ||||
|   template: | ||||
|     metadata: | ||||
|       labels: | ||||
|         app: log-viewer-frontend | ||||
|         app.kubernetes.io/name: log-viewer-frontend | ||||
|     spec: | ||||
|       containers: | ||||
|         - name: log-viewer-frontend | ||||
| @@ -64,11 +67,11 @@ spec: | ||||
|   replicas: 3 | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app: log-viewer-backend | ||||
|       app.kubernetes.io/name: log-viewer-backend | ||||
|   template: | ||||
|     metadata: | ||||
|       labels: | ||||
|         app: log-viewer-backend | ||||
|         app.kubernetes.io/name: log-viewer-backend | ||||
|     spec: | ||||
|       containers: | ||||
|         - name: log-backend-backend | ||||
| @@ -109,7 +112,7 @@ metadata: | ||||
| spec: | ||||
|   type: ClusterIP | ||||
|   selector: | ||||
|     app: log-viewer-frontend | ||||
|     app.kubernetes.io/name: log-viewer-frontend | ||||
|   ports: | ||||
|   - protocol: TCP | ||||
|     port: 3003 | ||||
| @@ -121,7 +124,7 @@ metadata: | ||||
| spec: | ||||
|   type: ClusterIP | ||||
|   selector: | ||||
|     app: log-viewer-backend | ||||
|     app.kubernetes.io/name: log-viewer-backend | ||||
|   ports: | ||||
|   - protocol: TCP | ||||
|     port: 3002 | ||||
| @@ -130,14 +133,12 @@ apiVersion: v1 | ||||
| kind: Service | ||||
| metadata: | ||||
|   name: camtiler | ||||
|   annotations: | ||||
|     prometheus.io/scrape: 'true' | ||||
|   labels: | ||||
|     component: camtiler | ||||
| spec: | ||||
|   type: ClusterIP | ||||
|   selector: | ||||
|     app: camtiler | ||||
|     app.kubernetes.io/name: camtiler | ||||
|     component: camtiler | ||||
|   ports: | ||||
|   - protocol: TCP | ||||
| @@ -254,7 +255,7 @@ spec: | ||||
|           kubernetes.io/metadata.name: monitoring | ||||
|       podSelector: | ||||
|         matchLabels: | ||||
|           app: prometheus | ||||
|           app.kubernetes.io/name: prometheus | ||||
|   egress: | ||||
|     - to: | ||||
|         - ipBlock: | ||||
| @@ -263,7 +264,7 @@ spec: | ||||
|     - to: | ||||
|       - podSelector: | ||||
|           matchLabels: | ||||
|             app: mongodb-svc | ||||
|             app.kubernetes.io/name: mongodb-svc | ||||
|       ports: | ||||
|       - port: 27017 | ||||
|     - to: | ||||
| @@ -298,7 +299,7 @@ spec: | ||||
|           kubernetes.io/metadata.name: monitoring | ||||
|       podSelector: | ||||
|         matchLabels: | ||||
|           app: prometheus | ||||
|           app.kubernetes.io/name: prometheus | ||||
|   - from: | ||||
|     - namespaceSelector: | ||||
|         matchLabels: | ||||
| @@ -314,7 +315,7 @@ metadata: | ||||
| spec: | ||||
|   podSelector: | ||||
|     matchLabels: | ||||
|       app: log-viewer-backend | ||||
|       app.kubernetes.io/name: log-viewer-backend | ||||
|   policyTypes: | ||||
|   - Ingress | ||||
|   - Egress | ||||
| @@ -322,13 +323,11 @@ spec: | ||||
|     - to: | ||||
|       - podSelector: | ||||
|           matchLabels: | ||||
|             app: mongodb-svc | ||||
|             app.kubernetes.io/name: mongodb-svc | ||||
|     - to: | ||||
|       - podSelector: | ||||
|           matchLabels: | ||||
|             v1.min.io/tenant: minio | ||||
|       ports: | ||||
|       - port: 9000 | ||||
|       - ipBlock: | ||||
|           # Minio is accessed thru public endpoint via Traefik | ||||
|           cidr: 193.40.103.0/24 | ||||
|   ingress: | ||||
|   - from: | ||||
|     - namespaceSelector: | ||||
| @@ -345,7 +344,7 @@ metadata: | ||||
| spec: | ||||
|   podSelector: | ||||
|     matchLabels: | ||||
|       app: log-viewer-frontend | ||||
|       app.kubernetes.io/name: log-viewer-frontend | ||||
|   policyTypes: | ||||
|   - Ingress | ||||
|   - Egress | ||||
| @@ -458,7 +457,6 @@ spec: | ||||
|              required: ["target"] | ||||
|          required: ["spec"] | ||||
| --- | ||||
| --- | ||||
| apiVersion: codemowers.io/v1alpha1 | ||||
| kind: ClusterOperator | ||||
| metadata: | ||||
| @@ -480,7 +478,7 @@ spec: | ||||
|       spec: | ||||
|         type: ClusterIP | ||||
|         selector: | ||||
|           app: foobar | ||||
|           app.kubernetes.io/name: foobar | ||||
|           component: camdetect | ||||
|         ports: | ||||
|         - protocol: TCP | ||||
| @@ -506,14 +504,11 @@ spec: | ||||
|             maxUnavailable: 1 | ||||
|         selector: | ||||
|           matchLabels: | ||||
|             app: foobar | ||||
|             app.kubernetes.io/name: foobar | ||||
|         template: | ||||
|           metadata: | ||||
|             annotations: | ||||
|               prometheus.io/scrape: 'true' | ||||
|               prometheus.io/port: '5000' | ||||
|             labels: | ||||
|               app: foobar | ||||
|               app.kubernetes.io/name: foobar | ||||
|               component: camdetect | ||||
|           spec: | ||||
|             containers: | ||||
| @@ -590,9 +585,55 @@ spec: | ||||
|               whenUnsatisfiable: DoNotSchedule | ||||
|               labelSelector: | ||||
|                 matchLabels: | ||||
|                   app: foobar | ||||
|                   app.kubernetes.io/name: foobar | ||||
|                   component: camdetect | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PodMonitor | ||||
| metadata: | ||||
|   name: camtiler | ||||
| spec: | ||||
|   selector: {} | ||||
|   podMetricsEndpoints: | ||||
|   - port: http | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PrometheusRule | ||||
| metadata: | ||||
|   name: cameras | ||||
| spec: | ||||
|     groups: | ||||
|     - name: cameras | ||||
|       rules: | ||||
|       - alert: CameraLost | ||||
|         expr: rate(camdetect_rx_frames_total[2m]) < 1 | ||||
|         for: 2m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           summary: Camera feed stopped | ||||
|       - alert: CameraServerRoomMotion | ||||
|         expr: camdetect_event_active {app="camdetect-server-room"} > 0 | ||||
|         for: 1m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           summary: Motion was detected in server room | ||||
|       - alert: CameraSlowUploads | ||||
|         expr: rate(camdetect_upload_dropped_frames_total[2m]) > 1 | ||||
|         for: 5m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           summary: Motion detect snapshots are piling up and not getting uploaded to S3 | ||||
|       - alert: CameraSlowProcessing | ||||
|         expr: rate(camdetect_download_dropped_frames_total[2m]) > 1 | ||||
|         for: 5m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           summary: Motion detection processing pipeline is not keeping up with incoming frames | ||||
| --- | ||||
| apiVersion: k-space.ee/v1alpha1 | ||||
| kind: Camera | ||||
| metadata: | ||||
|   | ||||
| @@ -42,9 +42,6 @@ spec: | ||||
|     metadata: | ||||
|       labels: | ||||
|         app: drone | ||||
|       annotations: | ||||
|         prometheus.io/port: "80" | ||||
|         prometheus.io/scrape: "true" | ||||
|     spec: | ||||
|       automountServiceAccountToken: false | ||||
|       securityContext: | ||||
|   | ||||
| @@ -10,6 +10,9 @@ spec: | ||||
|   kibanaRef: | ||||
|     name: kibana | ||||
|   config: | ||||
|     http: | ||||
|       enabled: true | ||||
|       port: 5066 | ||||
|     filebeat: | ||||
|       autodiscover: | ||||
|         providers: | ||||
| @@ -81,6 +84,14 @@ spec: | ||||
|               valueFrom: | ||||
|                 fieldRef: | ||||
|                   fieldPath: spec.nodeName | ||||
|         - name: exporter | ||||
|           image: sepa/beats-exporter | ||||
|           args: | ||||
|             - -p=5066 | ||||
|           ports: | ||||
|             - containerPort: 8080 | ||||
|               name: exporter | ||||
|               protocol: TCP | ||||
|         volumes: | ||||
|         - name: varlogcontainers | ||||
|           hostPath: | ||||
|   | ||||
							
								
								
									
										16
									
								
								freescout/application.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								freescout/application.yml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,16 @@ | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PrometheusRule | ||||
| metadata: | ||||
|   name: freescout | ||||
| spec: | ||||
|   groups: | ||||
|     - name: freescout | ||||
|       rules: | ||||
|       - alert: FreescoutSyncBroken | ||||
|         expr: time() - wildduck_last_login{email=~"(info|accounting)@k-space.ee"} > 300 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           summary: Freescout mailbox synchronization is broken | ||||
							
								
								
									
										3
									
								
								kube-system/README.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								kube-system/README.md
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,3 @@ | ||||
| ``` | ||||
| kubectl apply -n kube-system -f kube-state-metrics.yml | ||||
| ``` | ||||
							
								
								
									
										221
									
								
								kube-system/kube-state-metrics.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										221
									
								
								kube-system/kube-state-metrics.yml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,221 @@ | ||||
| --- | ||||
| apiVersion: v1 | ||||
| automountServiceAccountToken: false | ||||
| kind: ServiceAccount | ||||
| metadata: | ||||
|   name: kube-state-metrics | ||||
|   labels: | ||||
|     app.kubernetes.io/name: kube-state-metrics | ||||
| --- | ||||
| apiVersion: rbac.authorization.k8s.io/v1 | ||||
| kind: ClusterRole | ||||
| metadata: | ||||
|   name: kube-state-metrics | ||||
|   labels: | ||||
|     app.kubernetes.io/name: kube-state-metrics | ||||
| rules: | ||||
| - apiGroups: | ||||
|   - "" | ||||
|   resources: | ||||
|   - configmaps | ||||
|   - secrets | ||||
|   - nodes | ||||
|   - pods | ||||
|   - services | ||||
|   - serviceaccounts | ||||
|   - resourcequotas | ||||
|   - replicationcontrollers | ||||
|   - limitranges | ||||
|   - persistentvolumeclaims | ||||
|   - persistentvolumes | ||||
|   - namespaces | ||||
|   - endpoints | ||||
|   verbs: | ||||
|   - list | ||||
|   - watch | ||||
| - apiGroups: | ||||
|   - apps | ||||
|   resources: | ||||
|   - statefulsets | ||||
|   - daemonsets | ||||
|   - deployments | ||||
|   - replicasets | ||||
|   verbs: | ||||
|   - list | ||||
|   - watch | ||||
| - apiGroups: | ||||
|   - batch | ||||
|   resources: | ||||
|   - cronjobs | ||||
|   - jobs | ||||
|   verbs: | ||||
|   - list | ||||
|   - watch | ||||
| - apiGroups: | ||||
|   - autoscaling | ||||
|   resources: | ||||
|   - horizontalpodautoscalers | ||||
|   verbs: | ||||
|   - list | ||||
|   - watch | ||||
| - apiGroups: | ||||
|   - authentication.k8s.io | ||||
|   resources: | ||||
|   - tokenreviews | ||||
|   verbs: | ||||
|   - create | ||||
| - apiGroups: | ||||
|   - authorization.k8s.io | ||||
|   resources: | ||||
|   - subjectaccessreviews | ||||
|   verbs: | ||||
|   - create | ||||
| - apiGroups: | ||||
|   - policy | ||||
|   resources: | ||||
|   - poddisruptionbudgets | ||||
|   verbs: | ||||
|   - list | ||||
|   - watch | ||||
| - apiGroups: | ||||
|   - certificates.k8s.io | ||||
|   resources: | ||||
|   - certificatesigningrequests | ||||
|   verbs: | ||||
|   - list | ||||
|   - watch | ||||
| - apiGroups: | ||||
|   - storage.k8s.io | ||||
|   resources: | ||||
|   - storageclasses | ||||
|   - volumeattachments | ||||
|   verbs: | ||||
|   - list | ||||
|   - watch | ||||
| - apiGroups: | ||||
|   - admissionregistration.k8s.io | ||||
|   resources: | ||||
|   - mutatingwebhookconfigurations | ||||
|   - validatingwebhookconfigurations | ||||
|   verbs: | ||||
|   - list | ||||
|   - watch | ||||
| - apiGroups: | ||||
|   - networking.k8s.io | ||||
|   resources: | ||||
|   - networkpolicies | ||||
|   - ingresses | ||||
|   verbs: | ||||
|   - list | ||||
|   - watch | ||||
| - apiGroups: | ||||
|   - coordination.k8s.io | ||||
|   resources: | ||||
|   - leases | ||||
|   verbs: | ||||
|   - list | ||||
|   - watch | ||||
| - apiGroups: | ||||
|   - rbac.authorization.k8s.io | ||||
|   resources: | ||||
|   - clusterrolebindings | ||||
|   - clusterroles | ||||
|   - rolebindings | ||||
|   - roles | ||||
|   verbs: | ||||
|   - list | ||||
|   - watch | ||||
| --- | ||||
| apiVersion: rbac.authorization.k8s.io/v1 | ||||
| kind: ClusterRoleBinding | ||||
| metadata: | ||||
|   name: kube-state-metrics | ||||
|   labels: | ||||
|     app.kubernetes.io/name: kube-state-metrics | ||||
| roleRef: | ||||
|   apiGroup: rbac.authorization.k8s.io | ||||
|   kind: ClusterRole | ||||
|   name: kube-state-metrics | ||||
| subjects: | ||||
| - kind: ServiceAccount | ||||
|   name: kube-state-metrics | ||||
|   namespace: kube-system | ||||
| --- | ||||
| apiVersion: apps/v1 | ||||
| kind: Deployment | ||||
| metadata: | ||||
|   name: kube-state-metrics | ||||
|   labels: | ||||
|     app.kubernetes.io/name: kube-state-metrics | ||||
| spec: | ||||
|   replicas: 1 | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app.kubernetes.io/name: kube-state-metrics | ||||
|   template: | ||||
|     metadata: | ||||
|       labels: | ||||
|         app.kubernetes.io/name: kube-state-metrics | ||||
|     spec: | ||||
|       automountServiceAccountToken: true | ||||
|       containers: | ||||
|       - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.6.0 | ||||
|         livenessProbe: | ||||
|           httpGet: | ||||
|             path: /healthz | ||||
|             port: 8080 | ||||
|           initialDelaySeconds: 5 | ||||
|           timeoutSeconds: 5 | ||||
|         name: kube-state-metrics | ||||
|         ports: | ||||
|         - containerPort: 8080 | ||||
|           name: http-metrics | ||||
|         - containerPort: 8081 | ||||
|           name: telemetry | ||||
|         readinessProbe: | ||||
|           httpGet: | ||||
|             path: / | ||||
|             port: 8081 | ||||
|           initialDelaySeconds: 5 | ||||
|           timeoutSeconds: 5 | ||||
|         securityContext: | ||||
|           allowPrivilegeEscalation: false | ||||
|           capabilities: | ||||
|             drop: | ||||
|             - ALL | ||||
|           readOnlyRootFilesystem: true | ||||
|           runAsUser: 65534 | ||||
|       nodeSelector: | ||||
|         kubernetes.io/os: linux | ||||
|       serviceAccountName: kube-state-metrics | ||||
| --- | ||||
| apiVersion: v1 | ||||
| kind: Service | ||||
| metadata: | ||||
|   name: kube-state-metrics | ||||
|   labels: | ||||
|     app.kubernetes.io/name: kube-state-metrics | ||||
| spec: | ||||
|   clusterIP: None | ||||
|   ports: | ||||
|   - name: http-metrics | ||||
|     port: 8080 | ||||
|     targetPort: http-metrics | ||||
|   - name: telemetry | ||||
|     port: 8081 | ||||
|     targetPort: telemetry | ||||
|   selector: | ||||
|     app.kubernetes.io/name: kube-state-metrics | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: ServiceMonitor | ||||
| metadata: | ||||
|   name: kube-state-metrics | ||||
| spec: | ||||
|   endpoints: | ||||
|   - honorLabels: true | ||||
|     path: /metrics | ||||
|     port: http-metrics | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app.kubernetes.io/name: kube-state-metrics | ||||
| @@ -7,7 +7,7 @@ and then heavily modified. | ||||
| To deploy Longhorn use following: | ||||
|  | ||||
| ``` | ||||
| kubectl -n longhorn-system apply -f longhorn.yaml -f ingress.yml | ||||
| kubectl -n longhorn-system apply -f application.yml -f application-extras.yml | ||||
| ``` | ||||
|  | ||||
| After deploying specify `dedicated=storage:NoSchedule` | ||||
|   | ||||
							
								
								
									
										126
									
								
								longhorn-system/application-extras.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										126
									
								
								longhorn-system/application-extras.yml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,126 @@ | ||||
| apiVersion: networking.k8s.io/v1 | ||||
| kind: Ingress | ||||
| metadata: | ||||
|   name: longhorn-dashboard | ||||
|   namespace: longhorn-system | ||||
|   annotations: | ||||
|     kubernetes.io/ingress.class: traefik | ||||
|     cert-manager.io/cluster-issuer: default | ||||
|     external-dns.alpha.kubernetes.io/target: traefik.k-space.ee | ||||
|     traefik.ingress.kubernetes.io/router.entrypoints: websecure | ||||
|     traefik.ingress.kubernetes.io/router.middlewares: traefik-sso@kubernetescrd | ||||
|     traefik.ingress.kubernetes.io/router.tls: "true" | ||||
| spec: | ||||
|   rules: | ||||
|   - host: longhorn.k-space.ee | ||||
|     http: | ||||
|       paths: | ||||
|       - pathType: Prefix | ||||
|         path: "/" | ||||
|         backend: | ||||
|           service: | ||||
|             name: longhorn-frontend | ||||
|             port: | ||||
|               number: 80 | ||||
|   tls: | ||||
|   - hosts: | ||||
|     - longhorn.k-space.ee | ||||
|     secretName: longhorn-tls | ||||
|  | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PodMonitor | ||||
| metadata: | ||||
|   name: manager | ||||
| spec: | ||||
|   selector: {} | ||||
|   podMetricsEndpoints: | ||||
|     - port: manager | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PrometheusRule | ||||
| metadata: | ||||
|   name: longhorn | ||||
| spec: | ||||
|   # Copied from https://longhorn.io/docs/1.2.4/monitoring/alert-rules-example/ | ||||
|   groups: | ||||
|     - name: longhorn | ||||
|       rules: | ||||
|       - alert: LonghornVolumeActualSpaceUsedWarning | ||||
|         annotations: | ||||
|           description: The accumulated snapshots for volume use up more space than the volume's capacity | ||||
|           summary: The actual used space of Longhorn volume is twice the size of the volume capacity. | ||||
|         expr: longhorn_volume_actual_size_bytes > longhorn_volume_capacity_bytes * 2 | ||||
|         for: 5m | ||||
|         labels: | ||||
|           issue: The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high. | ||||
|           severity: warning | ||||
|       - alert: LonghornVolumeStatusCritical | ||||
|         annotations: | ||||
|           description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for | ||||
|             more than 5 minutes. | ||||
|           summary: Longhorn volume {{$labels.volume}} is Fault | ||||
|         expr: longhorn_volume_robustness == 3 | ||||
|         for: 5m | ||||
|         labels: | ||||
|           issue: Longhorn volume {{$labels.volume}} is Fault. | ||||
|           severity: critical | ||||
|       - alert: LonghornVolumeStatusWarning | ||||
|         annotations: | ||||
|           description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded for | ||||
|             more than 5 minutes. | ||||
|           summary: Longhorn volume {{$labels.volume}} is Degraded | ||||
|         expr: longhorn_volume_robustness == 2 | ||||
|         for: 5m | ||||
|         labels: | ||||
|           issue: Longhorn volume {{$labels.volume}} is Degraded. | ||||
|           severity: warning | ||||
|       - alert: LonghornNodeStorageWarning | ||||
|         annotations: | ||||
|           description: The used storage of node {{$labels.node}} is at {{$value}}% capacity for | ||||
|             more than 5 minutes. | ||||
|           summary:  The used storage of node is over 70% of the capacity. | ||||
|         expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70 | ||||
|         for: 5m | ||||
|         labels: | ||||
|           issue: The used storage of node {{$labels.node}} is high. | ||||
|           severity: warning | ||||
|       - alert: LonghornDiskStorageWarning | ||||
|         annotations: | ||||
|           description: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is at {{$value}}% capacity for | ||||
|             more than 5 minutes. | ||||
|           summary:  The used storage of disk is over 70% of the capacity. | ||||
|         expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70 | ||||
|         for: 5m | ||||
|         labels: | ||||
|           issue: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high. | ||||
|           severity: warning | ||||
|       - alert: LonghornNodeDown | ||||
|         annotations: | ||||
|           description: There are {{$value}} Longhorn nodes which have been offline for more than 5 minutes. | ||||
|           summary: Longhorn nodes are offline | ||||
|         expr: (avg(longhorn_node_count_total) or on() vector(0)) - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0)) > 0 | ||||
|         for: 5m | ||||
|         labels: | ||||
|           issue: There are {{$value}} Longhorn nodes are offline | ||||
|           severity: critical | ||||
|       - alert: LonghornIntanceManagerCPUUsageWarning | ||||
|         annotations: | ||||
|           description: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for | ||||
|             more than 5 minutes. | ||||
|           summary: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is over 300%. | ||||
|         expr: (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) * 100 > 300 | ||||
|         for: 5m | ||||
|         labels: | ||||
|           issue: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} consumes 3 times the CPU request. | ||||
|           severity: warning | ||||
|       - alert: LonghornNodeCPUUsageWarning | ||||
|         annotations: | ||||
|           description: Longhorn node {{$labels.node}} has CPU Usage / CPU capacity is {{$value}}% for | ||||
|             more than 5 minutes. | ||||
|           summary: Longhorn node {{$labels.node}} experiences high CPU pressure for more than 5m. | ||||
|         expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90 | ||||
|         for: 5m | ||||
|         labels: | ||||
|           issue: Longhorn node {{$labels.node}} experiences high CPU pressure. | ||||
|           severity: warning | ||||
| @@ -1,28 +0,0 @@ | ||||
| apiVersion: networking.k8s.io/v1 | ||||
| kind: Ingress | ||||
| metadata: | ||||
|   name: longhorn-dashboard | ||||
|   namespace: longhorn-system | ||||
|   annotations: | ||||
|     kubernetes.io/ingress.class: traefik | ||||
|     cert-manager.io/cluster-issuer: default | ||||
|     external-dns.alpha.kubernetes.io/target: traefik.k-space.ee | ||||
|     traefik.ingress.kubernetes.io/router.entrypoints: websecure | ||||
|     traefik.ingress.kubernetes.io/router.middlewares: traefik-sso@kubernetescrd | ||||
|     traefik.ingress.kubernetes.io/router.tls: "true" | ||||
| spec: | ||||
|   rules: | ||||
|   - host: longhorn.k-space.ee | ||||
|     http: | ||||
|       paths: | ||||
|       - pathType: Prefix | ||||
|         path: "/" | ||||
|         backend: | ||||
|           service: | ||||
|             name: longhorn-frontend | ||||
|             port: | ||||
|               number: 80 | ||||
|   tls: | ||||
|   - hosts: | ||||
|     - longhorn.k-space.ee | ||||
|     secretName: longhorn-tls | ||||
| @@ -1,27 +0,0 @@ | ||||
| persistence: | ||||
|   defaultClassReplicaCount: 2 | ||||
|  | ||||
| defaultSettings: | ||||
|   defaultDataLocality: best-effort | ||||
|   taintToleration: "dedicated=storage:NoSchedule" | ||||
|   systemManagedComponentsNodeSelector: "dedicated:storage" | ||||
|    | ||||
| longhornDriver: | ||||
|   tolerations: | ||||
|   - key: dedicated | ||||
|     operator: Equal | ||||
|     value: storage | ||||
|     effect: NoSchedule | ||||
|  | ||||
| longhornUI: | ||||
|   tolerations: | ||||
|   - key: dedicated | ||||
|     operator: Equal | ||||
|     value: storage | ||||
|     effect: NoSchedule | ||||
|  | ||||
| ingress: | ||||
|   enabled: true | ||||
|   host: longhorn.k-space.ee | ||||
|   tls: true | ||||
|   tlsSecret: longhorn-tls | ||||
| @@ -67,6 +67,11 @@ spec: | ||||
|                  items: | ||||
|                    type: object | ||||
|                    x-kubernetes-preserve-unknown-fields: true | ||||
|                customresources: | ||||
|                  type: array | ||||
|                  items: | ||||
|                    type: object | ||||
|                    x-kubernetes-preserve-unknown-fields: true | ||||
|          required: ["spec"] | ||||
| --- | ||||
| apiVersion: apps/v1 | ||||
| @@ -178,12 +183,21 @@ rules: | ||||
| - apiGroups: | ||||
|   - codemowers.io | ||||
|   resources: | ||||
|   - bindzones | ||||
|   - clusteroperators | ||||
|   - keydbs | ||||
|   verbs: | ||||
|   - get | ||||
|   - list | ||||
|   - watch | ||||
| - apiGroups: | ||||
|   - k-space.ee | ||||
|   resources: | ||||
|   - cams | ||||
|   verbs: | ||||
|   - get | ||||
|   - list | ||||
|   - watch | ||||
| --- | ||||
| apiVersion: v1 | ||||
| kind: ServiceAccount | ||||
|   | ||||
| @@ -120,7 +120,7 @@ spec: | ||||
|         type: ClusterIP | ||||
|         clusterIP: None | ||||
|         ports: | ||||
|         - name: "server" | ||||
|         - name: redis | ||||
|           port: 6379 | ||||
|           protocol: TCP | ||||
|           targetPort: redis | ||||
| @@ -137,14 +137,14 @@ spec: | ||||
|       spec: | ||||
|         type: ClusterIP | ||||
|         ports: | ||||
|         - name: "server" | ||||
|         - name: redis | ||||
|           port: 6379 | ||||
|           protocol: TCP | ||||
|           targetPort: redis | ||||
|         - name: "redis-exporter" | ||||
|         - name: exporter | ||||
|           port: 9121 | ||||
|           protocol: TCP | ||||
|           targetPort: redis-exporter | ||||
|           targetPort: exporter | ||||
|         selector: | ||||
|           app.kubernetes.io/name: foobar | ||||
|         sessionAffinity: ClientIP | ||||
| @@ -163,9 +163,6 @@ spec: | ||||
|             app.kubernetes.io/name: foobar | ||||
|         template: | ||||
|           metadata: | ||||
|             annotations: | ||||
|               prometheus.io/port: "9121" | ||||
|               prometheus.io/scrape: "true" | ||||
|             labels: | ||||
|               app.kubernetes.io/name: foobar | ||||
|           spec: | ||||
| @@ -237,10 +234,10 @@ spec: | ||||
|               envFrom: | ||||
|                 - secretRef: | ||||
|                     name: foobar-secrets | ||||
|             - name: redis-exporter | ||||
|             - name: exporter | ||||
|               image: quay.io/oliver006/redis_exporter | ||||
|               ports: | ||||
|               - name: metrics | ||||
|               - name: exporter | ||||
|                 containerPort: 9121 | ||||
|               envFrom: | ||||
|                 - secretRef: | ||||
|   | ||||
| @@ -1,4 +1,14 @@ | ||||
| --- | ||||
| # Scrapes MetalLB pods: every pod in metallb-system is selected (empty | ||||
| # selector) and the container port named "monitoring" is the scrape target. | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PodMonitor | ||||
| metadata: | ||||
|   name: monitoring | ||||
|   namespace: metallb-system | ||||
| spec: | ||||
|   selector: {} | ||||
|   podMetricsEndpoints: | ||||
|     - port: monitoring | ||||
| --- | ||||
| apiVersion: metallb.io/v1beta1 | ||||
| kind: MetalLB | ||||
| metadata: | ||||
|   | ||||
							
								
								
									
										19
									
								
								prometheus-operator/README.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								prometheus-operator/README.md
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,19 @@ | ||||
| # Prometheus operator | ||||
|  | ||||
| ``` | ||||
| curl -L https://github.com/prometheus-operator/prometheus-operator/releases/download/v0.59.0/bundle.yaml | sed -e 's/namespace: default/namespace: prometheus-operator/g' > bundle.yml | ||||
| kubectl create namespace prometheus-operator | ||||
| kubectl apply --server-side -n prometheus-operator -f bundle.yml | ||||
| kubectl delete -n prometheus-operator configmap snmp-exporter | ||||
| kubectl create -n prometheus-operator configmap snmp-exporter --from-file=snmp.yml | ||||
| kubectl apply -n prometheus-operator -f application.yml -f node-exporter.yml -f blackbox-exporter.yml -f snmp-exporter.yml -f mikrotik-exporter.yml | ||||
| ``` | ||||
|  | ||||
| # Mikrotik exporter | ||||
|  | ||||
| ``` | ||||
|  kubectl create -n prometheus-operator secret generic mikrotik-exporter \ | ||||
|   --from-literal=MIKROTIK_PASSWORD='f7W!H*Pu' \ | ||||
|   --from-literal=PROMETHEUS_BEARER_TOKEN=$(cat /dev/urandom | base64 | head -c 30) | ||||
| ``` | ||||
|  | ||||
							
								
								
									
										762
									
								
								prometheus-operator/application.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										762
									
								
								prometheus-operator/application.yml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,762 @@ | ||||
| --- | ||||
| # Catch-all PodMonitor: matches every pod (empty label selector) and scrapes | ||||
| # container ports named "exporter" or "metrics". | ||||
| # NOTE(review): an empty namespaceSelector appears to restrict discovery to the | ||||
| # namespace this PodMonitor is created in; cluster-wide discovery would need | ||||
| # `any: true` -- confirm which one is intended. | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PodMonitor | ||||
| metadata: | ||||
|   name: metrics | ||||
| spec: | ||||
|   namespaceSelector: {} | ||||
|   selector: {} | ||||
|   podMetricsEndpoints: | ||||
|     - port: exporter | ||||
|     - port: metrics | ||||
| --- | ||||
| # Alertmanager cluster (3 replicas) managed by the Prometheus Operator. | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: Alertmanager | ||||
| metadata: | ||||
|   name: alertmanager | ||||
| spec: | ||||
|   # Schedule only onto nodes labelled dedicated=monitoring and tolerate the | ||||
|   # matching NoSchedule taint. | ||||
|   nodeSelector: | ||||
|     dedicated: monitoring | ||||
|   tolerations: | ||||
|     - key: dedicated | ||||
|       operator: Equal | ||||
|       value: monitoring | ||||
|       effect: NoSchedule | ||||
|   replicas: 3 | ||||
|   serviceAccountName: alertmanager | ||||
|   # The Ingress for am.k-space.ee is served on the TLS (websecure) entrypoint | ||||
|   # only, so links generated in notifications must use https. | ||||
|   externalUrl: https://am.k-space.ee/ | ||||
|   routePrefix: "/" | ||||
|   # Run as an unprivileged user; fsGroup sets group ownership of mounted volumes. | ||||
|   securityContext: | ||||
|     fsGroup: 2000 | ||||
|     runAsGroup: 2000 | ||||
|     runAsNonRoot: true | ||||
|     runAsUser: 1000 | ||||
| --- | ||||
| # Identity referenced by the Alertmanager resource via serviceAccountName. | ||||
| apiVersion: v1 | ||||
| kind: ServiceAccount | ||||
| metadata: | ||||
|   name: alertmanager | ||||
| --- | ||||
| # Prometheus server (2 replicas, 1 shard) managed by the Prometheus Operator. | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: Prometheus | ||||
| metadata: | ||||
|   name: prometheus | ||||
| spec: | ||||
|   # Schedule only onto nodes labelled dedicated=monitoring and tolerate the | ||||
|   # matching NoSchedule taint. | ||||
|   nodeSelector: | ||||
|     dedicated: monitoring | ||||
|   tolerations: | ||||
|     - key: dedicated | ||||
|       operator: Equal | ||||
|       value: monitoring | ||||
|       effect: NoSchedule | ||||
|   alerting: | ||||
|     alertmanagers: | ||||
|       # Alertmanager is discovered through an Endpoints object. The operator | ||||
|       # creates the governing Service "alertmanager-operated" (also used by the | ||||
|       # Ingress) and the Alertmanager API port is named "web", not "http". | ||||
|       - namespace: prometheus-operator | ||||
|         name: alertmanager-operated | ||||
|         port: web | ||||
|         pathPrefix: "/" | ||||
|         apiVersion: v2 | ||||
|   # The Ingress for prom.k-space.ee is served on the TLS (websecure) entrypoint | ||||
|   # only, so generator links in alerts must use https. | ||||
|   externalUrl: "https://prom.k-space.ee/" | ||||
|   replicas: 2 | ||||
|   shards: 1 | ||||
|   serviceAccountName: prometheus | ||||
|   # Run as an unprivileged user; fsGroup sets group ownership of mounted volumes. | ||||
|   securityContext: | ||||
|     fsGroup: 2000 | ||||
|     runAsGroup: 2000 | ||||
|     runAsNonRoot: true | ||||
|     runAsUser: 1000 | ||||
|   # Empty selectors: pick up every ServiceMonitor, PodMonitor, Probe and | ||||
|   # PrometheusRule object, from every namespace. | ||||
|   serviceMonitorNamespaceSelector: {} | ||||
|   serviceMonitorSelector: {} | ||||
|   podMonitorNamespaceSelector: {} | ||||
|   podMonitorSelector: {} | ||||
|   probeNamespaceSelector: {} | ||||
|   probeSelector: {} | ||||
|   ruleNamespaceSelector: {} | ||||
|   ruleSelector: {} | ||||
|   # Size-based retention limit for the TSDB on the volume requested below. | ||||
|   retentionSize: 80GB | ||||
|   storage: | ||||
|     volumeClaimTemplate: | ||||
|       spec: | ||||
|         accessModes: | ||||
|         - ReadWriteOnce | ||||
|         resources: | ||||
|           requests: | ||||
|             storage: 100Gi | ||||
|         storageClassName: local-path | ||||
| --- | ||||
| # Identity referenced by the Prometheus resource via serviceAccountName and | ||||
| # bound to the "prometheus" ClusterRole by the ClusterRoleBinding in this file. | ||||
| apiVersion: v1 | ||||
| kind: ServiceAccount | ||||
| metadata: | ||||
|   name: prometheus | ||||
| --- | ||||
| # Read-only, cluster-wide permissions for the "prometheus" ServiceAccount: | ||||
| # target discovery objects, configmaps, ingresses and the /metrics endpoint. | ||||
| apiVersion: rbac.authorization.k8s.io/v1 | ||||
| kind: ClusterRole | ||||
| metadata: | ||||
|   name: prometheus | ||||
| rules: | ||||
|   # Core API group: objects used for target discovery. | ||||
|   - apiGroups: [""] | ||||
|     resources: ["nodes", "nodes/metrics", "services", "endpoints", "pods"] | ||||
|     verbs: ["get", "list", "watch"] | ||||
|   # Core API group: configmaps may only be fetched, not listed or watched. | ||||
|   - apiGroups: [""] | ||||
|     resources: ["configmaps"] | ||||
|     verbs: ["get"] | ||||
|   - apiGroups: ["networking.k8s.io"] | ||||
|     resources: ["ingresses"] | ||||
|     verbs: ["get", "list", "watch"] | ||||
|   # Non-resource URL: allows GET on /metrics of the API server. | ||||
|   - nonResourceURLs: ["/metrics"] | ||||
|     verbs: ["get"] | ||||
| --- | ||||
| # Grants the "prometheus" ClusterRole to the "prometheus" ServiceAccount in | ||||
| # the prometheus-operator namespace. | ||||
| apiVersion: rbac.authorization.k8s.io/v1 | ||||
| kind: ClusterRoleBinding | ||||
| metadata: | ||||
|   name: prometheus | ||||
| roleRef: | ||||
|   apiGroup: rbac.authorization.k8s.io | ||||
|   kind: ClusterRole | ||||
|   name: prometheus | ||||
| subjects: | ||||
| - kind: ServiceAccount | ||||
|   name: prometheus | ||||
|   namespace: prometheus-operator | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PrometheusRule | ||||
| metadata: | ||||
|   name: prometheus | ||||
| spec: | ||||
|   groups: | ||||
|   - name: prometheus | ||||
|     rules: | ||||
|     - alert: PrometheusJobMissing | ||||
|       annotations: | ||||
|         description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}\n \ | ||||
|           \ LABELS = {{ $labels }}" | ||||
|         summary: Prometheus job missing (instance {{ $labels.instance }}) | ||||
|       expr: absent(up{job="prometheus-operator/prometheus"}) | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: warning | ||||
|     - alert: PrometheusTargetMissing | ||||
|       annotations: | ||||
|         description: "A Prometheus target has disappeared. An exporter might be crashed.\n\ | ||||
|           \  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus target missing (instance {{ $labels.instance }}) | ||||
|       expr: up == 0 | ||||
|       for: 5m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: PrometheusAllTargetsMissing | ||||
|       annotations: | ||||
|         description: "A Prometheus job does not have living target anymore.\n  VALUE\ | ||||
|           \ = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus all targets missing (instance {{ $labels.instance }}) | ||||
|       expr: count by (job) (up) == 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: PrometheusConfigurationReloadFailure | ||||
|       annotations: | ||||
|         description: "Prometheus configuration reload error\n  VALUE = {{ $value }}\n\ | ||||
|           \  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus configuration reload failure (instance {{ $labels.instance | ||||
|           }}) | ||||
|       expr: prometheus_config_last_reload_successful != 1 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: warning | ||||
|     # Fires when a monitored process restarted more than twice in 15 minutes. | ||||
|     # Regex matchers are fully anchored and the jobs in this group are named | ||||
|     # "<namespace>/<name>" (e.g. prometheus-operator/prometheus), so the matcher | ||||
|     # accepts an optional namespace prefix in front of the component name. | ||||
|     - alert: PrometheusTooManyRestarts | ||||
|       annotations: | ||||
|         description: "Prometheus has restarted more than twice in the last 15 minutes.\ | ||||
|           \ It might be crashlooping.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels\ | ||||
|           \ }}" | ||||
|         summary: Prometheus too many restarts (instance {{ $labels.instance }}) | ||||
|       expr: changes(process_start_time_seconds{job=~"(.+/)?(prometheus|pushgateway|alertmanager)"}[15m]) | ||||
|         > 2 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: warning | ||||
|     - alert: PrometheusAlertmanagerJobMissing | ||||
|       annotations: | ||||
|         description: "A Prometheus AlertManager job has disappeared\n  VALUE = {{\ | ||||
|           \ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus AlertManager job missing (instance {{ $labels.instance | ||||
|           }}) | ||||
|       expr: absent(up{job="prometheus-operator/alertmanager"}) | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: warning | ||||
|     - alert: PrometheusAlertmanagerConfigurationReloadFailure | ||||
|       annotations: | ||||
|         description: "AlertManager configuration reload error\n  VALUE = {{ $value\ | ||||
|           \ }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus AlertManager configuration reload failure (instance {{ | ||||
|           $labels.instance }}) | ||||
|       expr: alertmanager_config_last_reload_successful != 1 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: warning | ||||
|     - alert: PrometheusAlertmanagerConfigNotSynced | ||||
|       annotations: | ||||
|         description: "Configurations of AlertManager cluster instances are out of\ | ||||
|           \ sync\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus AlertManager config not synced (instance {{ $labels.instance | ||||
|           }}) | ||||
|       expr: count(count_values("config_hash", alertmanager_config_hash)) > 1 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: warning | ||||
|     - alert: PrometheusNotConnectedToAlertmanager | ||||
|       annotations: | ||||
|         description: "Prometheus cannot connect the alertmanager\n  VALUE = {{ $value\ | ||||
|           \ }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus not connected to alertmanager (instance {{ $labels.instance | ||||
|           }}) | ||||
|       expr: prometheus_notifications_alertmanagers_discovered < 1 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: PrometheusRuleEvaluationFailures | ||||
|       annotations: | ||||
|         description: "Prometheus encountered {{ $value }} rule evaluation failures,\ | ||||
|           \ leading to potentially ignored alerts.\n  VALUE = {{ $value }}\n  LABELS\ | ||||
|           \ = {{ $labels }}" | ||||
|         summary: Prometheus rule evaluation failures (instance {{ $labels.instance | ||||
|           }}) | ||||
|       expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: PrometheusTemplateTextExpansionFailures | ||||
|       annotations: | ||||
|         description: "Prometheus encountered {{ $value }} template text expansion\ | ||||
|           \ failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus template text expansion failures (instance {{ $labels.instance | ||||
|           }}) | ||||
|       expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: PrometheusRuleEvaluationSlow | ||||
|       annotations: | ||||
|         description: "Prometheus rule evaluation took more time than the scheduled\ | ||||
|           \ interval. It indicates a slower storage backend access or too complex\ | ||||
|           \ query.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus rule evaluation slow (instance {{ $labels.instance }}) | ||||
|       expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds | ||||
|       for: 5m | ||||
|       labels: | ||||
|         severity: warning | ||||
|     - alert: PrometheusNotificationsBacklog | ||||
|       annotations: | ||||
|         description: "The Prometheus notification queue has not been empty for 10\ | ||||
|           \ minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus notifications backlog (instance {{ $labels.instance }}) | ||||
|       expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: warning | ||||
|     - alert: PrometheusAlertmanagerNotificationFailing | ||||
|       annotations: | ||||
|         description: "Alertmanager is failing sending notifications\n  VALUE = {{\ | ||||
|           \ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus AlertManager notification failing (instance {{ $labels.instance | ||||
|           }}) | ||||
|       expr: rate(alertmanager_notifications_failed_total[1m]) > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: PrometheusTargetEmpty | ||||
|       annotations: | ||||
|         description: "Prometheus has no target in service discovery\n  VALUE = {{\ | ||||
|           \ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus target empty (instance {{ $labels.instance }}) | ||||
|       expr: prometheus_sd_discovered_targets == 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: PrometheusLargeScrape | ||||
|       annotations: | ||||
|         description: "Prometheus has many scrapes that exceed the sample limit\n \ | ||||
|           \ VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus large scrape (instance {{ $labels.instance }}) | ||||
|       expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > | ||||
|         10 | ||||
|       for: 5m | ||||
|       labels: | ||||
|         severity: warning | ||||
|     - alert: PrometheusTargetScrapeDuplicate | ||||
|       annotations: | ||||
|         description: "Prometheus has many samples rejected due to duplicate timestamps\ | ||||
|           \ but different values\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus target scrape duplicate (instance {{ $labels.instance | ||||
|           }}) | ||||
|       expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) | ||||
|         > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: warning | ||||
|     - alert: PrometheusTsdbCheckpointCreationFailures | ||||
|       annotations: | ||||
|         description: "Prometheus encountered {{ $value }} checkpoint creation failures\n\ | ||||
|           \  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance | ||||
|           }}) | ||||
|       expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: PrometheusTsdbCheckpointDeletionFailures | ||||
|       annotations: | ||||
|         description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n\ | ||||
|           \  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance | ||||
|           }}) | ||||
|       expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: PrometheusTsdbCompactionsFailed | ||||
|       annotations: | ||||
|         description: "Prometheus encountered {{ $value }} TSDB compactions failures\n\ | ||||
|           \  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus TSDB compactions failed (instance {{ $labels.instance | ||||
|           }}) | ||||
|       expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: PrometheusTsdbHeadTruncationsFailed | ||||
|       annotations: | ||||
|         description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n\ | ||||
|           \  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance | ||||
|           }}) | ||||
|       expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: PrometheusTsdbReloadFailures | ||||
|       annotations: | ||||
|         description: "Prometheus encountered {{ $value }} TSDB reload failures\n \ | ||||
|           \ VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus TSDB reload failures (instance {{ $labels.instance }}) | ||||
|       expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: PrometheusTsdbWalCorruptions | ||||
|       annotations: | ||||
|         description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n \ | ||||
|           \ VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus TSDB WAL is corrupt, make sure there is enough disk space | ||||
|           and wipe /data/wal | ||||
|       expr: increase(prometheus_tsdb_wal_corruptions_total[2h]) > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: PrometheusTsdbWalTruncationsFailed | ||||
|       annotations: | ||||
|         description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n\ | ||||
|           \  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance | ||||
|           }}) | ||||
|       expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
| --- | ||||
| # Exposes the Prometheus UI at prom.k-space.ee, routing "/" to port 9090 of | ||||
| # the prometheus-operated Service. The annotations configure cert-manager, | ||||
| # Traefik (websecure entrypoint, TLS, traefik-sso middleware) and external-dns. | ||||
| apiVersion: networking.k8s.io/v1 | ||||
| kind: Ingress | ||||
| metadata: | ||||
|   name: prometheus | ||||
|   annotations: | ||||
|     cert-manager.io/cluster-issuer: default | ||||
|     traefik.ingress.kubernetes.io/router.entrypoints: websecure | ||||
|     traefik.ingress.kubernetes.io/router.tls: "true" | ||||
|     external-dns.alpha.kubernetes.io/target: traefik.k-space.ee | ||||
|     traefik.ingress.kubernetes.io/router.middlewares: traefik-sso@kubernetescrd | ||||
| spec: | ||||
|   rules: | ||||
|   - host: prom.k-space.ee | ||||
|     http: | ||||
|       paths: | ||||
|       - pathType: Prefix | ||||
|         path: "/" | ||||
|         backend: | ||||
|           service: | ||||
|             name: prometheus-operated | ||||
|             port: | ||||
|               number: 9090 | ||||
|   # TLS certificate for the host is read from the prom-tls Secret. | ||||
|   tls: | ||||
|   - hosts: | ||||
|     - prom.k-space.ee | ||||
|     secretName: prom-tls | ||||
| --- | ||||
| # Exposes the Alertmanager UI at am.k-space.ee, routing "/" to port 9093 of | ||||
| # the alertmanager-operated Service. The annotations configure cert-manager, | ||||
| # Traefik (websecure entrypoint, TLS, traefik-sso middleware) and external-dns. | ||||
| apiVersion: networking.k8s.io/v1 | ||||
| kind: Ingress | ||||
| metadata: | ||||
|   name: alertmanager | ||||
|   annotations: | ||||
|     cert-manager.io/cluster-issuer: default | ||||
|     traefik.ingress.kubernetes.io/router.entrypoints: websecure | ||||
|     traefik.ingress.kubernetes.io/router.tls: "true" | ||||
|     external-dns.alpha.kubernetes.io/target: traefik.k-space.ee | ||||
|     traefik.ingress.kubernetes.io/router.middlewares: traefik-sso@kubernetescrd | ||||
| spec: | ||||
|   rules: | ||||
|   - host: am.k-space.ee | ||||
|     http: | ||||
|       paths: | ||||
|       - pathType: Prefix | ||||
|         path: "/" | ||||
|         backend: | ||||
|           service: | ||||
|             name: alertmanager-operated | ||||
|             port: | ||||
|               number: 9093 | ||||
|   # TLS certificate for the host is read from the alertmanager-tls Secret. | ||||
|   tls: | ||||
|   - hosts: | ||||
|     - am.k-space.ee | ||||
|     secretName: alertmanager-tls | ||||
| --- | ||||
| # Self-monitoring: scrapes the "web" port of pods labelled | ||||
| # app.kubernetes.io/name=prometheus. | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PodMonitor | ||||
| metadata: | ||||
|   name: prometheus | ||||
| spec: | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app.kubernetes.io/name: prometheus | ||||
|   podMetricsEndpoints: | ||||
|     - port: web | ||||
| --- | ||||
| # Scrapes the "web" port of pods labelled | ||||
| # app.kubernetes.io/name=alertmanager. | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PodMonitor | ||||
| metadata: | ||||
|   name: alertmanager | ||||
| spec: | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app.kubernetes.io/name: alertmanager | ||||
|   podMetricsEndpoints: | ||||
|     - port: web | ||||
| --- | ||||
| # Scrapes the "http" port of pods labelled | ||||
| # app.kubernetes.io/name=prometheus-operator (the operator itself). | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PodMonitor | ||||
| metadata: | ||||
|   name: operator | ||||
| spec: | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app.kubernetes.io/name: prometheus-operator | ||||
|   podMetricsEndpoints: | ||||
|     - port: http | ||||
| --- | ||||
| # Scrapes Services labelled app.kubernetes.io/name=kubelet in kube-system. | ||||
| # Two endpoints share the "https-metrics" port: the default metrics path and | ||||
| # /metrics/cadvisor. | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: ServiceMonitor | ||||
| metadata: | ||||
|   name: kubelet | ||||
| spec: | ||||
|   endpoints: | ||||
|   # Endpoint 1: default metrics path. Authenticates with the mounted | ||||
|   # ServiceAccount token; the serving certificate is not verified. | ||||
|   - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token | ||||
|     honorLabels: true | ||||
|     interval: 30s | ||||
|     port: https-metrics | ||||
|     scheme: https | ||||
|     tlsConfig: | ||||
|       insecureSkipVerify: true | ||||
|   # Endpoint 2: same port and authentication, path /metrics/cadvisor. | ||||
|   - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token | ||||
|     honorLabels: true | ||||
|     interval: 30s | ||||
|     path: /metrics/cadvisor | ||||
|     port: https-metrics | ||||
|     scheme: https | ||||
|     tlsConfig: | ||||
|       insecureSkipVerify: true | ||||
|   namespaceSelector: | ||||
|     matchNames: | ||||
|     - kube-system | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app.kubernetes.io/name: kubelet | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PrometheusRule | ||||
| metadata: | ||||
|   name: kube-state-metrics | ||||
| spec: | ||||
|   groups: | ||||
|     - name: kube-state-metrics | ||||
|       rules: | ||||
|         - alert: KubernetesNodeReady | ||||
|           expr: kube_node_status_condition{condition="Ready",status="true"} == 0 | ||||
|           for: 10m | ||||
|           labels: | ||||
|             severity: critical | ||||
|           annotations: | ||||
|             summary: Kubernetes Node ready (instance {{ $labels.instance }}) | ||||
|             description: "Node {{ $labels.node }} has been unready for a long time\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesMemoryPressure | ||||
|           expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1 | ||||
|           for: 2m | ||||
|           labels: | ||||
|             severity: critical | ||||
|           annotations: | ||||
|             summary: Kubernetes memory pressure (instance {{ $labels.instance }}) | ||||
|             description: "{{ $labels.node }} has MemoryPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesDiskPressure | ||||
|           expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1 | ||||
|           for: 2m | ||||
|           labels: | ||||
|             severity: critical | ||||
|           annotations: | ||||
|             summary: Kubernetes disk pressure (instance {{ $labels.instance }}) | ||||
|             description: "{{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesOutOfDisk | ||||
|           expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1 | ||||
|           for: 2m | ||||
|           labels: | ||||
|             severity: critical | ||||
|           annotations: | ||||
|             summary: Kubernetes out of disk (instance {{ $labels.instance }}) | ||||
|             description: "{{ $labels.node }} has OutOfDisk condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesOutOfCapacity | ||||
|           expr: sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90 | ||||
|           for: 2m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Kubernetes out of capacity (instance {{ $labels.instance }}) | ||||
|             description: "{{ $labels.node }} is out of capacity\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesContainerOomKiller | ||||
|           expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1 | ||||
|           for: 0m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Kubernetes container oom killer (instance {{ $labels.instance }}) | ||||
|             description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesJobFailed | ||||
|           expr: kube_job_status_failed > 0 | ||||
|           for: 0m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Kubernetes Job failed (instance {{ $labels.instance }}) | ||||
|             description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesCronjobSuspended | ||||
|           expr: kube_cronjob_spec_suspend != 0 | ||||
|           for: 0m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Kubernetes CronJob suspended (instance {{ $labels.instance }}) | ||||
|             description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesPersistentvolumeclaimPending | ||||
|           expr: kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1 | ||||
|           for: 2m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }}) | ||||
|             description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesVolumeOutOfDiskSpace | ||||
|           expr: kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10 | ||||
|           for: 2m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Kubernetes Volume out of disk space (instance {{ $labels.instance }}) | ||||
|             description: "Volume is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesVolumeFullInFourDays | ||||
|           expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0 | ||||
|           for: 0m | ||||
|           labels: | ||||
|             severity: critical | ||||
|           annotations: | ||||
|             summary: Kubernetes Volume full in four days (instance {{ $labels.instance }}) | ||||
|             description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesPersistentvolumeError | ||||
|           expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0 | ||||
|           for: 0m | ||||
|           labels: | ||||
|             severity: critical | ||||
|           annotations: | ||||
|             summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }}) | ||||
|             description: "Persistent volume is in bad state\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesStatefulsetDown | ||||
|           expr: (kube_statefulset_status_replicas_ready / kube_statefulset_status_replicas_current) != 1 | ||||
|           for: 1m | ||||
|           labels: | ||||
|             severity: critical | ||||
|           annotations: | ||||
|             summary: Kubernetes StatefulSet down (instance {{ $labels.instance }}) | ||||
|             description: "A StatefulSet went down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesHpaScalingAbility | ||||
|           expr: kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1 | ||||
|           for: 2m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Kubernetes HPA scaling ability (instance {{ $labels.instance }}) | ||||
|             description: "Pod is unable to scale\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesHpaMetricAvailability | ||||
|           expr: kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1 | ||||
|           for: 0m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Kubernetes HPA metric availability (instance {{ $labels.instance }}) | ||||
|             description: "HPA is not able to collect metrics\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesHpaScaleCapability | ||||
|           expr: kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas | ||||
|           for: 2m | ||||
|           labels: | ||||
|             severity: info | ||||
|           annotations: | ||||
|             summary: Kubernetes HPA scale capability (instance {{ $labels.instance }}) | ||||
|             description: "The maximum number of desired Pods has been hit\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesPodNotHealthy | ||||
|           expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[15m:1m]) > 0 | ||||
|           for: 0m | ||||
|           labels: | ||||
|             severity: critical | ||||
|           annotations: | ||||
|             summary: Kubernetes Pod not healthy (instance {{ $labels.instance }}) | ||||
|             description: "Pod has been in a non-ready state for longer than 15 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesPodCrashLooping | ||||
|           expr: increase(kube_pod_container_status_restarts_total[1m]) > 3 | ||||
|           for: 2m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Kubernetes pod crash looping (instance {{ $labels.instance }}) | ||||
|             description: "Pod {{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         # Fires when a ReplicaSet has had fewer ready replicas than its spec | ||||
|         # requests for 10 minutes. The alert name (including its historical | ||||
|         # spelling) is kept unchanged so existing routes/silences keep matching. | ||||
|         - alert: KubernetesReplicassetMismatch | ||||
|           expr: kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas | ||||
|           for: 10m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Kubernetes ReplicaSet mismatch (instance {{ $labels.instance }}) | ||||
|             description: "ReplicaSet replicas mismatch\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesDeploymentReplicasMismatch | ||||
|           expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available | ||||
|           for: 10m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }}) | ||||
|             description: "Deployment Replicas mismatch\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesStatefulsetReplicasMismatch | ||||
|           expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas | ||||
|           for: 10m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }}) | ||||
|             description: "A StatefulSet does not match the expected number of replicas.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesDeploymentGenerationMismatch | ||||
|           expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation | ||||
|           for: 10m | ||||
|           labels: | ||||
|             severity: critical | ||||
|           annotations: | ||||
|             summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }}) | ||||
|             description: "A Deployment has failed but has not been rolled back.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesStatefulsetGenerationMismatch | ||||
|           expr: kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation | ||||
|           for: 10m | ||||
|           labels: | ||||
|             severity: critical | ||||
|           annotations: | ||||
|             summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }}) | ||||
|             description: "A StatefulSet has failed but has not been rolled back.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesStatefulsetUpdateNotRolledOut | ||||
|           expr: max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated) | ||||
|           for: 10m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }}) | ||||
|             description: "StatefulSet update has not been rolled out.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesDaemonsetRolloutStuck | ||||
|           expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0 | ||||
|           for: 10m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }}) | ||||
|             description: "Some Pods of DaemonSet are not scheduled or not ready\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesDaemonsetMisscheduled | ||||
|           expr: kube_daemonset_status_number_misscheduled > 0 | ||||
|           for: 1m | ||||
|           labels: | ||||
|             severity: critical | ||||
|           annotations: | ||||
|             summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }}) | ||||
|             description: "Some DaemonSet Pods are running where they are not supposed to run\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesCronjobTooLong | ||||
|           expr: time() - kube_cronjob_next_schedule_time > 3600 | ||||
|           for: 0m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Kubernetes CronJob too long (instance {{ $labels.instance }}) | ||||
|             description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesJobSlowCompletion | ||||
|           expr: kube_job_spec_completions - kube_job_status_succeeded > 0 | ||||
|           for: 12h | ||||
|           labels: | ||||
|             severity: critical | ||||
|           annotations: | ||||
|             summary: Kubernetes job slow completion (instance {{ $labels.instance }}) | ||||
|             description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesApiServerErrors | ||||
|           expr: sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3 | ||||
|           for: 2m | ||||
|           labels: | ||||
|             severity: critical | ||||
|           annotations: | ||||
|             summary: Kubernetes API server errors (instance {{ $labels.instance }}) | ||||
|             description: "Kubernetes API server is experiencing high error rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesApiClientErrors | ||||
|           expr: (sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1 | ||||
|           for: 2m | ||||
|           labels: | ||||
|             severity: critical | ||||
|           annotations: | ||||
|             summary: Kubernetes API client errors (instance {{ $labels.instance }}) | ||||
|             description: "Kubernetes API client is experiencing high error rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesClientCertificateExpiresNextWeek | ||||
|           expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60 | ||||
|           for: 0m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }}) | ||||
|             description: "A client certificate used to authenticate to the apiserver is expiring next week.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         - alert: KubernetesClientCertificateExpiresSoon | ||||
|           expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60 | ||||
|           for: 0m | ||||
|           labels: | ||||
|             severity: critical | ||||
|           annotations: | ||||
|             summary: Kubernetes client certificate expires soon (instance {{ $labels.instance }}) | ||||
|             description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|         # 99th percentile API server request latency above 1 second. | ||||
|         # Uses apiserver_request_duration_seconds_bucket (already in seconds); | ||||
|         # the older microsecond-based apiserver_request_latencies_bucket metric | ||||
|         # no longer exists on current Kubernetes releases, so a rule built on it | ||||
|         # could never fire. | ||||
|         - alert: KubernetesApiServerLatency | ||||
|           expr: histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) > 1 | ||||
|           for: 2m | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Kubernetes API server latency (instance {{ $labels.instance }}) | ||||
|             description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
							
								
								
									
										258
									
								
								prometheus-operator/blackbox-exporter.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										258
									
								
								prometheus-operator/blackbox-exporter.yml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,258 @@ | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: Probe | ||||
| metadata: | ||||
|   name: websites | ||||
| spec: | ||||
|   prober: | ||||
|     url: blackbox-exporter | ||||
|     path: /probe | ||||
|   module: http_2xx | ||||
|   targets: | ||||
|     staticConfig: | ||||
|       static: | ||||
|         - https://git.k-space.ee/ | ||||
|         - https://grafana.k-space.ee/ | ||||
|         - https://wiki.k-space.ee/ | ||||
|         - https://pad.k-space.ee/ | ||||
|         - https://members.k-space.ee/ | ||||
|         - https://nextcloud.k-space.ee/ | ||||
|         - http://minio.infra.k-space.ee:9001/login | ||||
| --- | ||||
| # DNS probe: asks each listed IP address (used as the DNS server target) | ||||
| # to resolve the record configured in the selected blackbox module. | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: Probe | ||||
| metadata: | ||||
|   name: k6.ee | ||||
| spec: | ||||
|   prober: | ||||
|     url: blackbox-exporter | ||||
|     path: /probe | ||||
|   # NOTE(review): this Probe is named k6.ee but selects dns_check_traefik; | ||||
|   # the blackbox config in this file also defines dns_check_k6, which no | ||||
|   # Probe references. Confirm which module is intended here. | ||||
|   module: dns_check_traefik | ||||
|   targets: | ||||
|     staticConfig: | ||||
|       static: | ||||
|         - 193.40.103.2 | ||||
|         - 62.65.250.2 | ||||
| --- | ||||
| # TCP reachability check of the domain controllers' port 636 through the | ||||
| # blackbox exporter (module tcp_connect). | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: Probe | ||||
| metadata: | ||||
|   name: samba-cluster | ||||
| spec: | ||||
|   prober: | ||||
|     url: blackbox-exporter | ||||
|     # Must be /probe: that endpoint runs the module against the target. | ||||
|     # /metrics only returns the exporter's own process metrics, so no | ||||
|     # probe_success series would exist for the alert rules to evaluate. | ||||
|     path: /probe | ||||
|   module: tcp_connect | ||||
|   targets: | ||||
|     staticConfig: | ||||
|       static: | ||||
|         - dc1.ad.k-space.ee:636 | ||||
|         - dc2.ad.k-space.ee:636 | ||||
|         - dc3.ad.k-space.ee:636 | ||||
| --- | ||||
| # TCP reachability checks for miscellaneous services through the blackbox | ||||
| # exporter (module tcp_connect). | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: Probe | ||||
| metadata: | ||||
|   name: misc | ||||
| spec: | ||||
|   prober: | ||||
|     url: blackbox-exporter | ||||
|     # Must be /probe: that endpoint runs the module against the target. | ||||
|     # /metrics only returns the exporter's own process metrics, so no | ||||
|     # probe_success series would exist for the alert rules to evaluate. | ||||
|     path: /probe | ||||
|   module: tcp_connect | ||||
|   targets: | ||||
|     staticConfig: | ||||
|       static: | ||||
|         - mail.k-space.ee:465 | ||||
|         - dev.k-space.ee:10648 | ||||
|         - mariadb.infra.k-space.ee:3306 | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PrometheusRule | ||||
| metadata: | ||||
|   name: blackbox-exporter | ||||
| spec: | ||||
|   # https://awesome-prometheus-alerts.grep.to/rules#blackbox | ||||
|   groups: | ||||
|   - name: blackbox | ||||
|     rules: | ||||
|     - alert: BlackboxProbeFailed | ||||
|       expr: probe_success == 0 | ||||
|       for: 2m | ||||
|       labels: | ||||
|         severity: critical | ||||
|       annotations: | ||||
|         summary: Blackbox probe failed (instance {{ $labels.instance }}) | ||||
|         description: Probe failed | ||||
|     - alert: BlackboxSlowProbe | ||||
|       expr: avg_over_time(probe_duration_seconds[1m]) > 1 | ||||
|       for: 5m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Blackbox slow probe (instance {{ $labels.instance }}) | ||||
|         description: Blackbox probe took more than 1s to complete | ||||
|     - alert: BlackboxSlowDNS | ||||
|       expr: avg_over_time(probe_dns_lookup_time_seconds[1m]) > 1 | ||||
|       for: 5m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Blackbox slow DNS lookup (instance {{ $labels.instance }}) | ||||
|         description: Blackbox DNS lookup took more than 1s to complete. | ||||
|           It seemed using IPv6 DNS servers in conjunction with Docker resulted | ||||
|           in odd 5s latency bump. For now we're using 8.8.8.8 because of that | ||||
|     - alert: BlackboxProbeHttpFailure | ||||
|       expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400 | ||||
|       for: 5m | ||||
|       labels: | ||||
|         severity: critical | ||||
|       annotations: | ||||
|         summary: Blackbox probe HTTP failure (instance {{ $labels.instance }}) | ||||
|         description: HTTP status code is not 200-399 | ||||
|     - alert: BlackboxSslCertificateWillExpireSoon | ||||
|       expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}) | ||||
|         description: SSL certificate expires in 30 days | ||||
|     - alert: BlackboxSslCertificateWillExpireSoon | ||||
|       expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|       annotations: | ||||
|         summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}) | ||||
|         description: SSL certificate expires in 3 days | ||||
|     - alert: BlackboxSslCertificateExpired | ||||
|       expr: probe_ssl_earliest_cert_expiry - time() <= 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|       annotations: | ||||
|         summary: Blackbox SSL certificate expired (instance {{ $labels.instance }}) | ||||
|         description: SSL certificate has expired already | ||||
|     - alert: BlackboxProbeSlowHttp | ||||
|       expr: avg_over_time(probe_http_duration_seconds[1m]) > 1 | ||||
|       for: 1m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Blackbox probe slow HTTP (instance {{ $labels.instance }}) | ||||
|         description: HTTP request took more than 1s | ||||
|     - alert: BlackboxProbeSlowPing | ||||
|       expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1 | ||||
|       for: 1m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Blackbox probe slow ping (instance {{ $labels.instance }}) | ||||
|         description: Blackbox ping took more than 1s | ||||
| --- | ||||
| apiVersion: apps/v1 | ||||
| kind: Deployment | ||||
| metadata: | ||||
|   name: blackbox-exporter | ||||
| spec: | ||||
|   revisionHistoryLimit: 0 | ||||
|   replicas: 2 | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app: blackbox-exporter | ||||
|   template: | ||||
|     metadata: | ||||
|       labels: | ||||
|         app: blackbox-exporter | ||||
|     spec: | ||||
|       containers: | ||||
|       - name: blackbox-exporter | ||||
|         image: prom/blackbox-exporter:v0.20.0 | ||||
|         volumeMounts: | ||||
|         - name: blackbox-exporter-config | ||||
|           mountPath: /etc/blackbox_exporter | ||||
|       volumes: | ||||
|         - name: blackbox-exporter-config | ||||
|           configMap: | ||||
|             name: blackbox-exporter-config | ||||
|       # TODO: Results in odd 6s connection lag if scheduled in VLAN20 | ||||
|       nodeSelector: | ||||
|         dedicated: monitoring | ||||
|       tolerations: | ||||
|         - key: dedicated | ||||
|           operator: Equal | ||||
|           value: monitoring | ||||
|           effect: NoSchedule | ||||
|       affinity: | ||||
|         podAntiAffinity: | ||||
|           requiredDuringSchedulingIgnoredDuringExecution: | ||||
|           - labelSelector: | ||||
|               matchExpressions: | ||||
|               - key: app | ||||
|                 operator: In | ||||
|                 values: | ||||
|                 - blackbox-exporter | ||||
|             topologyKey: "kubernetes.io/hostname" | ||||
| --- | ||||
| kind: Service | ||||
| apiVersion: v1 | ||||
| metadata: | ||||
|   name: blackbox-exporter | ||||
| spec: | ||||
|   type: ClusterIP | ||||
|   ports: | ||||
|     - name: http | ||||
|       port: 80 | ||||
|       protocol: TCP | ||||
|       targetPort: 9115 | ||||
|   selector: | ||||
|     app: blackbox-exporter | ||||
| --- | ||||
| apiVersion: v1 | ||||
| kind: ConfigMap | ||||
| metadata: | ||||
|   name: blackbox-exporter-config | ||||
| data: | ||||
|   config.yml: |- | ||||
|     modules: | ||||
|       http_2xx: | ||||
|         prober: http | ||||
|         http: | ||||
|           preferred_ip_protocol: "ip4" | ||||
|           ip_protocol_fallback: false | ||||
|       http_post_2xx: | ||||
|         prober: http | ||||
|         http: | ||||
|           method: POST | ||||
|           preferred_ip_protocol: "ip4" | ||||
|           ip_protocol_fallback: false | ||||
|       tcp_connect: | ||||
|         prober: tcp | ||||
|         tcp: | ||||
|           preferred_ip_protocol: "ip4" | ||||
|           ip_protocol_fallback: false | ||||
|       icmp: | ||||
|         prober: icmp | ||||
|         icmp: | ||||
|           preferred_ip_protocol: "ip4" | ||||
|           ip_protocol_fallback: false | ||||
|       dns_check_traefik: | ||||
|         prober: dns | ||||
|         dns: | ||||
|           query_name: "traefik.k-space.ee" | ||||
|           query_type: "A" | ||||
|           validate_answer_rrs: | ||||
|             fail_if_not_matches_regexp: | ||||
|              - "traefik\\.k-space\\.ee\\.\\t.*\\tIN\\tA\\t193\\.40\\.103\\.[1-9][0-9]*" | ||||
|           preferred_ip_protocol: "ip4" | ||||
|           ip_protocol_fallback: false | ||||
|       dns_check_k6: | ||||
|         prober: dns | ||||
|         dns: | ||||
|           query_name: "k6.ee" | ||||
|           query_type: "A" | ||||
|           validate_answer_rrs: | ||||
|             fail_if_not_matches_regexp: | ||||
|              - "k6\\.ee\\.\\t.*\\tIN\\tA\\t193\\.40\\.103\\.[1-9][0-9]*" | ||||
|           preferred_ip_protocol: "ip4" | ||||
|           ip_protocol_fallback: false | ||||
							
								
								
									
										28816
									
								
								prometheus-operator/bundle.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										28816
									
								
								prometheus-operator/bundle.yml
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										104
									
								
								prometheus-operator/mikrotik-exporter.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										104
									
								
								prometheus-operator/mikrotik-exporter.yml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,104 @@ | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: Probe | ||||
| metadata: | ||||
|   name: mikrotik | ||||
| spec: | ||||
|   bearerTokenSecret: | ||||
|     name: mikrotik-exporter | ||||
|     key: PROMETHEUS_BEARER_TOKEN | ||||
|   prober: | ||||
|     path: /metrics | ||||
|     url: mikrotik-exporter | ||||
|   targets: | ||||
|     staticConfig: | ||||
|       static: | ||||
|         - router.mgmt.k-space.ee | ||||
|         - sw_chaos.mgmt.k-space.ee | ||||
|         - sw_poe.mgmt.k-space.ee | ||||
|         - sw_mgmt.mgmt.k-space.ee | ||||
|         - sw_core02.mgmt.k-space.ee | ||||
|         - sw_cyber.mgmt.k-space.ee | ||||
|         - sw_ha.mgmt.k-space.ee | ||||
|         - sw_asocial.mgmt.k-space.ee | ||||
|         - sw_kitchen.mgmt.k-space.ee | ||||
|         - sw_core01.mgmt.k-space.ee | ||||
| --- | ||||
| # Alert rules evaluated on the mikrotik_interface_* series. | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PrometheusRule | ||||
| metadata: | ||||
|   name: mikrotik | ||||
| spec: | ||||
|   groups: | ||||
|   - name: mikrotik | ||||
|     rules: | ||||
|     # Fires immediately when sfp-sfpplus1 or sfp-sfpplus2 reports not running | ||||
|     # on any instance except those matching sw_core* or sw_mgmt*. | ||||
|     - alert: MikrotikUplinkRedundancyLost | ||||
|       expr: mikrotik_interface_running{port=~"sfp-sfpplus[12]", instance!~"sw_core.*", instance!~"sw_mgmt.*"} == 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         # NOTE(review): other rules in this diff use info/warning/critical; | ||||
|         # confirm that Alertmanager routing handles severity "error". | ||||
|         severity: error | ||||
|       annotations: | ||||
|         summary: Switch uplink high availability lost | ||||
|         description: One of the two 10Gb optical links is malfunctioning | ||||
|     # Fires immediately when any sfp-sfpplus* port reports a rate below 10^10. | ||||
|     - alert: MikrotikLinkRateDegraded | ||||
|       expr: mikrotik_interface_rate{port=~"sfp-sfpplus.*"} < 10000000000 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         # NOTE(review): see the severity note on MikrotikUplinkRedundancyLost. | ||||
|         severity: error | ||||
|       annotations: | ||||
|         summary: 10Gb link degraded | ||||
|         description: One of the 10Gb links is running at lower speed | ||||
| --- | ||||
| # Two replicas of the Mikrotik exporter, pinned to the dedicated monitoring | ||||
| # nodes and spread across distinct hosts. | ||||
| apiVersion: apps/v1 | ||||
| kind: Deployment | ||||
| metadata: | ||||
|   name: mikrotik-exporter | ||||
| spec: | ||||
|   revisionHistoryLimit: 0 | ||||
|   replicas: 2 | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app: mikrotik-exporter | ||||
|   template: | ||||
|     metadata: | ||||
|       labels: | ||||
|         app: mikrotik-exporter | ||||
|       annotations: | ||||
|         # Log shipper hints: lines starting with two spaces are appended to | ||||
|         # the preceding line. | ||||
|         co.elastic.logs/multiline.pattern: '^  ' | ||||
|         co.elastic.logs/multiline.negate: "false" | ||||
|         co.elastic.logs/multiline.match: after | ||||
|     spec: | ||||
|       containers: | ||||
|       - name: mikrotik-exporter | ||||
|         image: harbor.k-space.ee/k-space/mikrotik-exporter:latest | ||||
|         env: | ||||
|           - name: MIKROTIK_USER | ||||
|             value: netpoller | ||||
|         # Remaining settings are injected from the mikrotik-exporter Secret. | ||||
|         envFrom: | ||||
|           - secretRef: | ||||
|               name: mikrotik-exporter | ||||
|       nodeSelector: | ||||
|         dedicated: monitoring | ||||
|       tolerations: | ||||
|       - key: dedicated | ||||
|         operator: Equal | ||||
|         value: monitoring | ||||
|         effect: NoSchedule | ||||
|       affinity: | ||||
|         podAntiAffinity: | ||||
|           requiredDuringSchedulingIgnoredDuringExecution: | ||||
|           # A term without labelSelector matches no pods, which would make | ||||
|           # this anti-affinity a no-op; select this Deployment's own pods so | ||||
|           # the replicas land on different hosts. | ||||
|           - labelSelector: | ||||
|               matchExpressions: | ||||
|               - key: app | ||||
|                 operator: In | ||||
|                 values: | ||||
|                 - mikrotik-exporter | ||||
|             topologyKey: "kubernetes.io/hostname" | ||||
| --- | ||||
| kind: Service | ||||
| apiVersion: v1 | ||||
| metadata: | ||||
|   name: mikrotik-exporter | ||||
| spec: | ||||
|   type: ClusterIP | ||||
|   ports: | ||||
|     - name: http | ||||
|       port: 80 | ||||
|       protocol: TCP | ||||
|       targetPort: 3001 | ||||
|   selector: | ||||
|     app: mikrotik-exporter | ||||
							
								
								
									
										443
									
								
								prometheus-operator/node-exporter.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										443
									
								
								prometheus-operator/node-exporter.yml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,443 @@ | ||||
| # Scrapes the listed hosts on port 9100 using a Probe resource with a | ||||
| # static target list. | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: Probe | ||||
| metadata: | ||||
|   name: nodes-proxmox | ||||
| spec: | ||||
|   targets: | ||||
|     staticConfig: | ||||
|       static: | ||||
|         - nas.mgmt.k-space.ee:9100 | ||||
|         - pve1.proxmox.infra.k-space.ee:9100 | ||||
|         - pve8.proxmox.infra.k-space.ee:9100 | ||||
|         - pve9.proxmox.infra.k-space.ee:9100 | ||||
|       # Copy the probe target into both `instance` and `__address__`; | ||||
|       # overriding __address__ presumably makes Prometheus scrape the target | ||||
|       # itself instead of the prober URL below. | ||||
|       relabelingConfigs: | ||||
|       - sourceLabels: [__param_target] | ||||
|         targetLabel: instance | ||||
|       - sourceLabels: [__param_target] | ||||
|         targetLabel: __address__ | ||||
|   prober: | ||||
|     # Placeholder address, replaced by the __address__ relabeling above. | ||||
|     url: localhost | ||||
|     path: /metrics | ||||
|   # NOTE(review): __address__ is normally not available at metric-relabel | ||||
|   # time, so this rule may never produce a `target` label - confirm. | ||||
|   metricRelabelings: | ||||
|   - sourceLabels: [__address__] | ||||
|     targetLabel: target | ||||
| --- | ||||
| # Scrapes the listed hosts on port 9100 using a Probe resource with a | ||||
| # static target list. | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: Probe | ||||
| metadata: | ||||
|   name: nodes-misc | ||||
| spec: | ||||
|   targets: | ||||
|     staticConfig: | ||||
|       static: | ||||
|           - sprucecone.infra.k-space.ee:9100 | ||||
|           - cedarcone.infra.k-space.ee:9100 | ||||
|       # Copy the probe target into both `instance` and `__address__`; | ||||
|       # overriding __address__ presumably makes Prometheus scrape the target | ||||
|       # itself instead of the prober URL below. | ||||
|       relabelingConfigs: | ||||
|       - sourceLabels: [__param_target] | ||||
|         targetLabel: instance | ||||
|       - sourceLabels: [__param_target] | ||||
|         targetLabel: __address__ | ||||
|   prober: | ||||
|     # Placeholder address, replaced by the __address__ relabeling above. | ||||
|     url: localhost | ||||
|     path: /metrics | ||||
|   # NOTE(review): __address__ is normally not available at metric-relabel | ||||
|   # time, so this rule may never produce a `target` label - confirm. | ||||
|   metricRelabelings: | ||||
|   - sourceLabels: [__address__] | ||||
|     targetLabel: target | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PrometheusRule | ||||
| metadata: | ||||
|   name: node-exporter | ||||
| spec: | ||||
|   groups: | ||||
|   - name: node-exporter | ||||
|     rules: | ||||
|     - alert: ZfsOfflinePool | ||||
|       expr: node_zfs_zpool_state{state!="online"} > 0 | ||||
|       for: 1m | ||||
|       labels: | ||||
|         severity: critical | ||||
|       annotations: | ||||
|         summary: ZFS offline pool (instance {{ $labels.instance }}) | ||||
|         description: "A ZFS zpool is in a unexpected state: {{ $labels.state }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" | ||||
|     - alert: HostHighLoad | ||||
|       expr: sum(node_load1{}) by (instance) / count(node_cpu_seconds_total{mode="user"}) by (instance) > 2.5 | ||||
|       for: 15m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Host under high load | ||||
|         description: Many processes are queued up for execution | ||||
|     # Fires when cached + buffered + free memory has been below 20% of total | ||||
|     # memory for 2 minutes. The description states the same 20% threshold the | ||||
|     # expression uses. | ||||
|     - alert: HostOutOfMemory | ||||
|       expr: (node_memory_Cached_bytes + node_memory_Buffers_bytes + node_memory_MemFree_bytes ) / node_memory_MemTotal_bytes * 100 < 20 | ||||
|       for: 2m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Host out of memory (instance {{ $labels.instance }}) | ||||
|         description: Node memory is filling up (< 20% left) | ||||
|     - alert: HostMemoryUnderMemoryPressure | ||||
|       expr: rate(node_vmstat_pgmajfault[1m]) > 1000 | ||||
|       for: 2m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Host memory under memory pressure (instance {{ $labels.instance }}) | ||||
|         description: The node is under heavy memory pressure. High rate of major page faults | ||||
|     - alert: HostUnusualNetworkThroughputIn | ||||
|       expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) > 160e+06 | ||||
|       for: 1h | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Host unusual network throughput in (instance {{ $labels.instance }}) | ||||
|         description: Host network interfaces are probably receiving too much data (> 160 MB/s) | ||||
|     - alert: HostUnusualNetworkThroughputOut | ||||
|       expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) > 160e+06 | ||||
|       for: 1h | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Host unusual network throughput out (instance {{ $labels.instance }}) | ||||
|         description: Host network interfaces are probably sending too much data (> 160 MB/s) | ||||
|     - alert: HostUnusualDiskReadRate | ||||
|       expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) > 50000000 | ||||
|       for: 1h | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Host unusual disk read rate (instance {{ $labels.instance }}) | ||||
|         description: Disk is probably reading too much data (> 50 MB/s) | ||||
|     - alert: HostUnusualDiskWriteRate | ||||
|       expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) > 50000000 | ||||
|       for: 1h | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Host unusual disk write rate (instance {{ $labels.instance }}) | ||||
|         description: Disk is probably writing too much data (> 50 MB/s) | ||||
|     # Please add ignored mountpoints in node_exporter parameters like | ||||
|     # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". | ||||
|     # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. | ||||
|     - alert: HostOutOfDiskSpace | ||||
|       expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 | ||||
|       for: 2m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Host out of disk space (instance {{ $labels.instance }}) | ||||
|         description: Disk is almost full (< 10% left) | ||||
|     # Please add ignored mountpoints in node_exporter parameters like | ||||
|     # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". | ||||
|     # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. | ||||
|     - alert: HostDiskWillFillIn24Hours | ||||
|       expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 | ||||
|       for: 2m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Host disk will fill in 24 hours (instance {{ $labels.instance }}) | ||||
|         description: Filesystem is predicted to run out of space within the next 24 hours at current write rate | ||||
|     - alert: HostOutOfInodes | ||||
|       expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0 | ||||
|       for: 2m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Host out of inodes (instance {{ $labels.instance }}) | ||||
|         description: Disk is almost running out of available inodes (< 10% left) | ||||
|     # Fires when the host root filesystem is below 10% free inodes AND a linear | ||||
|     # extrapolation of the last hour predicts exhaustion within 24 hours. | ||||
|     # node-exporter is started with --path.rootfs, so the host root is exported | ||||
|     # as mountpoint="/" (a "/rootfs" selector matches nothing). | ||||
|     - alert: HostInodesWillFillIn24Hours | ||||
|       expr: node_filesystem_files_free{mountpoint="/"} / node_filesystem_files{mountpoint="/"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/"} == 0 | ||||
|       for: 2m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }}) | ||||
|         description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate | ||||
|     - alert: HostUnusualDiskReadLatency | ||||
|       expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0 | ||||
|       for: 2m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Host unusual disk read latency (instance {{ $labels.instance }}) | ||||
|         description: Disk latency is growing (read operations > 100ms) | ||||
|     - alert: HostUnusualDiskWriteLatency | ||||
|       expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0 | ||||
|       for: 2m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Host unusual disk write latency (instance {{ $labels.instance }}) | ||||
|         description: Disk latency is growing (write operations > 100ms) | ||||
|     - alert: HostCpuStealNoisyNeighbor | ||||
|       expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }}) | ||||
|         description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit. | ||||
|     # 50000 context switches per second per CPU core is an arbitrary number. | ||||
|     # Alert threshold depends on nature of application. | ||||
|     # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58 | ||||
|     - alert: HostContextSwitching | ||||
|       expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 50000 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Host context switching (instance {{ $labels.instance }}) | ||||
|         description: Context switching is growing on node (> 50000 / s) | ||||
|     - alert: HostSwapIsEnabled | ||||
|       expr: node_memory_SwapTotal_bytes > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Swap is discouraged nowadays | ||||
|     - alert: HostPhysicalComponentTooHot | ||||
|       expr: node_hwmon_temp_celsius > 75 | ||||
|       for: 5m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Host physical component too hot (instance {{ $labels.instance }}) | ||||
|         description: Physical hardware component too hot | ||||
|     - alert: HostNodeOvertemperatureAlarm | ||||
|       expr: node_hwmon_temp_alarm == 1 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|       annotations: | ||||
|         summary: Host node overtemperature alarm (instance {{ $labels.instance }}) | ||||
|         description: Physical node temperature alarm triggered | ||||
|     - alert: HostRaidArrayGotInactive | ||||
|       expr: node_md_state{state="inactive"} > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: critical | ||||
|       annotations: | ||||
|         summary: Host RAID array got inactive (instance {{ $labels.instance }}) | ||||
|         description: RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically. | ||||
|     # Fires when an MD RAID array reports at least one failed member disk for 2 minutes. | ||||
|     - alert: HostRaidDiskFailure | ||||
|       expr: node_md_disks{state="failed"} > 0 | ||||
|       for: 2m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Host RAID disk failure (instance {{ $labels.instance }}) | ||||
|         # The array name is carried in the "device" label of the node_md_* series. | ||||
|         description: At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.device }} needs attention and possibly a disk swap | ||||
|     - alert: HostOomKillDetected | ||||
|       expr: increase(node_vmstat_oom_kill[1m]) > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Host OOM kill detected (instance {{ $labels.instance }}) | ||||
|         description: OOM kill detected | ||||
|     # Informational alert: EDAC reported new correctable memory errors within the last minute. | ||||
|     - alert: HostEdacCorrectableErrorsDetected | ||||
|       expr: increase(node_edac_correctable_errors_total[1m]) > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: info | ||||
|       annotations: | ||||
|         summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) | ||||
|         # Must be quoted: a plain YAML scalar starting with "{" is parsed as a flow mapping. | ||||
|         description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.' | ||||
|     # Fires as soon as the EDAC uncorrectable error counter is non-zero. | ||||
|     # The expression reads the raw counter, so $value is a running total, not a windowed increase. | ||||
|     - alert: HostEdacUncorrectableErrorsDetected | ||||
|       expr: node_edac_uncorrectable_errors_total > 0 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) | ||||
|         # Must be quoted: a plain YAML scalar starting with "{" is parsed as a flow mapping. | ||||
|         description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in total.' | ||||
|     # Fires when more than 1% of received packets on an interface are errors (2m rate) for 2 minutes. | ||||
|     - alert: HostNetworkReceiveErrors | ||||
|       expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 | ||||
|       for: 2m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Host Network Receive Errors (instance {{ $labels.instance }}) | ||||
|         # Quoted because the value starts with "{"; $value is an error/packet ratio, so it is rendered as a percentage. | ||||
|         description: '{{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ $value | humanizePercentage }} over the last two minutes.' | ||||
|     # Fires when more than 1% of transmitted packets on an interface are errors (2m rate) for 2 minutes. | ||||
|     - alert: HostNetworkTransmitErrors | ||||
|       expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 | ||||
|       for: 2m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Host Network Transmit Errors (instance {{ $labels.instance }}) | ||||
|         # Quoted because the value starts with "{"; $value is an error/packet ratio, so it is rendered as a percentage. | ||||
|         description: '{{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of {{ $value | humanizePercentage }} over the last two minutes.' | ||||
|     # Fires when combined RX+TX throughput of a non-tap interface exceeds 80% of its link speed for 1 minute. | ||||
|     - alert: HostNetworkInterfaceSaturated | ||||
|       expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8 | ||||
|       for: 1m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Host Network Interface Saturated (instance {{ $labels.instance }}) | ||||
|         # The interface name lives in the "device" label (the same label the expr filters on). | ||||
|         description: The network interface "{{ $labels.device }}" on "{{ $labels.instance }}" is getting overloaded. | ||||
|     - alert: HostNetworkBondDegraded | ||||
|       expr: node_bonding_active != node_bonding_slaves {master=~"bond.*"} | ||||
|       for: 2m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Host Network Bond Degraded | ||||
|     # Fires when the conntrack table is more than 80% full for 5 minutes. | ||||
|     - alert: HostConntrackLimit | ||||
|       expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8 | ||||
|       for: 5m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Host conntrack limit (instance {{ $labels.instance }}) | ||||
|         description: The number of conntrack entries is approaching the limit | ||||
|     - alert: HostClockSkew | ||||
|       expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0) | ||||
|       for: 2m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Host clock skew (instance {{ $labels.instance }}) | ||||
|         description: Clock skew detected. Clock is out of sync. | ||||
|     - alert: HostClockNotSynchronising | ||||
|       expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16 | ||||
|       for: 2m | ||||
|       labels: | ||||
|         severity: warning | ||||
|       annotations: | ||||
|         summary: Host clock not synchronising (instance {{ $labels.instance }}) | ||||
|         description: Clock not synchronising. | ||||
| --- | ||||
| # Alerting rules based on SMART attributes (smartmon_* metrics). | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PrometheusRule | ||||
| metadata: | ||||
|   name: smart | ||||
| spec: | ||||
|   groups: | ||||
|     - name: smart | ||||
|       rules: | ||||
|       - alert: SmartSSDWriteRateTooHigh | ||||
|         # 72h average of LBAs written per second, times 512, compared against | ||||
|         # 10000000 (the "10MB/s" of the summary). | ||||
|         # NOTE(review): the factor 512 assumes 512-byte LBAs - confirm for the drives in use. | ||||
|         expr: rate(smartmon_total_lbas_written_raw_value[72h]) * 512 > 10000000 | ||||
|         for: 5m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           summary: SSD write rate exceeds 10MB/s | ||||
|           description: At this rate the SSD will be worn out before warranty period expires | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PrometheusRule | ||||
| metadata: | ||||
|   name: temperatures | ||||
| spec: | ||||
|   groups: | ||||
|     - name: temperatures | ||||
|       rules: | ||||
|       - alert: HighDiskTemperature | ||||
|         expr: smartmon_airflow_temperature_cel_raw_value > 45 or smartmon_temperature_celsius_raw_value > 45 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: critical | ||||
|         annotations: | ||||
|           summary: High HDD/SSD temperature indicates high ambient temperature | ||||
|       - alert: HighChipsetTemperature | ||||
|         expr: node_hwmon_temp_celsius > 65 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           summary: High chipset (CPU, NB) temperature indicates insufficient or failing fans | ||||
|       - alert: LowDiskTemperature | ||||
|         expr: smartmon_airflow_temperature_cel_raw_value < 10 or smartmon_temperature_celsius_raw_value < 10 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: critical | ||||
|         annotations: | ||||
|           summary: Low HDD/SSD temperature indicates low ambient temperature and stuck server room exhaust fan relay | ||||
| --- | ||||
| # Scrape configuration for node-exporter: selects pods labelled app=node-exporter | ||||
| # and scrapes their container port named "web". | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PodMonitor | ||||
| metadata: | ||||
|   name: node-exporter | ||||
| spec: | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app: node-exporter | ||||
|   podMetricsEndpoints: | ||||
|     - port: web | ||||
|       # Explicit 30s scrape timeout for this endpoint. | ||||
|       scrapeTimeout: 30s | ||||
| --- | ||||
| apiVersion: v1 | ||||
| kind: ServiceAccount | ||||
| metadata: | ||||
|   name: node-exporter | ||||
| --- | ||||
| # node-exporter DaemonSet: one exporter pod per node, running on the host | ||||
| # network and PID namespace, with the host's /sys and / mounted read-only. | ||||
| apiVersion: apps/v1 | ||||
| kind: DaemonSet | ||||
| metadata: | ||||
|   labels: | ||||
|     app: node-exporter | ||||
|   name: node-exporter | ||||
|   annotations: | ||||
|     # keel.sh annotations - presumably consumed by Keel for automated image | ||||
|     # updates on the "@midnight" poll schedule; confirm Keel is deployed. | ||||
|     keel.sh/policy: force | ||||
|     keel.sh/trigger: poll | ||||
|     keel.sh/pollSchedule: "@midnight" | ||||
| spec: | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app: node-exporter | ||||
|   template: | ||||
|     metadata: | ||||
|       labels: | ||||
|         # Must match spec.selector above; also the label a PodMonitor selecting app=node-exporter relies on. | ||||
|         app: node-exporter | ||||
|     spec: | ||||
|       containers: | ||||
|       - name: node-exporter | ||||
|         args: | ||||
|         # Listen port must agree with the containerPort named "web" below. | ||||
|         - --web.listen-address=0.0.0.0:9101 | ||||
|         # Host filesystems are read through the volume mounts declared below. | ||||
|         - --path.sysfs=/host/sys | ||||
|         - --path.rootfs=/host/root | ||||
|         - --no-collector.wifi | ||||
|         # Skip pseudo filesystems and per-container/per-pod mounts. | ||||
|         - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/) | ||||
|         # Skip veth* devices and devices named with 15 hex characters (presumably pod interfaces - confirm). | ||||
|         - --collector.netclass.ignored-devices=^(veth.*|[a-f0-9]{15})$ | ||||
|         - --collector.netdev.device-exclude=^(veth.*|[a-f0-9]{15})$ | ||||
|         image: prom/node-exporter:v1.3.1 | ||||
|         resources: | ||||
|           limits: | ||||
|             cpu: 50m | ||||
|             memory: 180Mi | ||||
|           requests: | ||||
|             cpu: 5m | ||||
|             memory: 20Mi | ||||
|         volumeMounts: | ||||
|         - mountPath: /host/sys | ||||
|           mountPropagation: HostToContainer | ||||
|           name: sys | ||||
|           readOnly: true | ||||
|         - mountPath: /host/root | ||||
|           mountPropagation: HostToContainer | ||||
|           name: root | ||||
|           readOnly: true | ||||
|         ports: | ||||
|         - containerPort: 9101 | ||||
|           name: web | ||||
|         # Container-level settings take precedence over the pod-level securityContext | ||||
|         # below, so the process runs as UID 65532 rather than 65534. | ||||
|         securityContext: | ||||
|           runAsGroup: 65532 | ||||
|           runAsNonRoot: true | ||||
|           runAsUser: 65532 | ||||
|           readOnlyRootFilesystem: true | ||||
|       hostNetwork: true | ||||
|       hostPID: true | ||||
|       securityContext: | ||||
|         runAsNonRoot: true | ||||
|         runAsUser: 65534 | ||||
|       serviceAccountName: node-exporter | ||||
|       # A toleration with only "operator: Exists" tolerates every taint, so the | ||||
|       # exporter is scheduled on all nodes. | ||||
|       tolerations: | ||||
|       - operator: Exists | ||||
|       volumes: | ||||
|       - hostPath: | ||||
|           path: /sys | ||||
|         name: sys | ||||
|       - hostPath: | ||||
|           path: / | ||||
|         name: root | ||||
							
								
								
									
										172
									
								
								prometheus-operator/snmp-exporter.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										172
									
								
								prometheus-operator/snmp-exporter.yml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,172 @@ | ||||
| # snmp-exporter Deployment: two replicas, forced onto different nodes of the | ||||
| # "dedicated: monitoring" node pool. | ||||
| apiVersion: apps/v1 | ||||
| kind: Deployment | ||||
| metadata: | ||||
|   name: snmp-exporter | ||||
| spec: | ||||
|   replicas: 2 | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app: snmp-exporter | ||||
|   template: | ||||
|     metadata: | ||||
|       labels: | ||||
|         app: snmp-exporter | ||||
|     spec: | ||||
|       containers: | ||||
|         # NOTE(review): the image is unpinned ("latest" with imagePullPolicy Always), | ||||
|         # unlike node-exporter which pins v1.3.1 - consider pinning a version. | ||||
|         - image: prom/snmp-exporter:latest | ||||
|           name: snmp-exporter | ||||
|           imagePullPolicy: Always | ||||
|           securityContext: | ||||
|             runAsNonRoot: true | ||||
|             runAsUser: 1000 | ||||
|             readOnlyRootFilesystem: true | ||||
|           ports: | ||||
|           - containerPort: 9116 | ||||
|             name: exporter | ||||
|           # Both probes hit /health on the named "exporter" port (9116). | ||||
|           livenessProbe: | ||||
|             httpGet: | ||||
|               path: /health | ||||
|               port: exporter | ||||
|           readinessProbe: | ||||
|             httpGet: | ||||
|               path: /health | ||||
|               port: exporter | ||||
|           volumeMounts: | ||||
|           # Exporter configuration comes from the "snmp-exporter" ConfigMap | ||||
|           # (not defined in this manifest - presumably in snmp.yml; confirm). | ||||
|           - name: snmp-exporter | ||||
|             mountPath: /etc/snmp_exporter | ||||
|       volumes: | ||||
|         - name: snmp-exporter | ||||
|           configMap: | ||||
|             name: snmp-exporter | ||||
|       # Run only on nodes labelled dedicated=monitoring and tolerate the matching taint. | ||||
|       nodeSelector: | ||||
|         dedicated: monitoring | ||||
|       tolerations: | ||||
|       - key: dedicated | ||||
|         operator: Equal | ||||
|         value: monitoring | ||||
|         effect: NoSchedule | ||||
|       # Hard anti-affinity: no two snmp-exporter pods may share a hostname. | ||||
|       affinity: | ||||
|         podAntiAffinity: | ||||
|           requiredDuringSchedulingIgnoredDuringExecution: | ||||
|           - labelSelector: | ||||
|               matchExpressions: | ||||
|               - key: app | ||||
|                 operator: In | ||||
|                 values: | ||||
|                 - snmp-exporter | ||||
|             topologyKey: "kubernetes.io/hostname" | ||||
| --- | ||||
| kind: Service | ||||
| apiVersion: v1 | ||||
| metadata: | ||||
|   name: snmp-exporter | ||||
| spec: | ||||
|   type: ClusterIP | ||||
|   ports: | ||||
|     - name: exporter | ||||
|       port: 9116 | ||||
|       protocol: TCP | ||||
|   selector: | ||||
|     app: snmp-exporter | ||||
| --- | ||||
| kind: Probe | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| metadata: | ||||
|   name: ups | ||||
| spec: | ||||
|   interval: 60s | ||||
|   module: rfc1628_ups | ||||
|   prober: | ||||
|     url: snmp-exporter:9116 | ||||
|     path: /snmp | ||||
|   targets: | ||||
|     staticConfig: | ||||
|       static: | ||||
|         - ups-4.mgmt.k-space.ee | ||||
|         - ups-5.mgmt.k-space.ee | ||||
|         - ups-6.mgmt.k-space.ee | ||||
|         - ups-7.mgmt.k-space.ee | ||||
|         - ups-8.mgmt.k-space.ee | ||||
|         - ups-9.mgmt.k-space.ee | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PrometheusRule | ||||
| metadata: | ||||
|   name: ups | ||||
| spec: | ||||
|   groups: | ||||
|   - name: ups | ||||
|     rules: | ||||
|     - alert: UPSBatteryLost | ||||
|       annotations: | ||||
|         summary: One or more UPS-es have degraded batteries. | ||||
|       expr: snmp_upsBatteryStatus{upsBatteryStatus!="batteryNormal"} > 0 | ||||
|       for: 1m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     - alert: UPSPowerLost | ||||
|       annotations: | ||||
|         summary: One or more UPS-es is not in normal operation mode. This either means | ||||
|           power is lost or UPS was loaded and it's now in bypass mode. | ||||
|       expr: sum(snmp_upsOutputSource { upsOutputSource = 'normal' }) < 6 | ||||
|       for: 1m | ||||
|       labels: | ||||
|         severity: critical | ||||
|     # Fires when any UPS reports an output load above 80% for a full hour. | ||||
|     # The summary's first sentence states the firing threshold (kept in sync with | ||||
|     # expr); the 50% figure is the operational target the summary asks for. | ||||
|     - alert: UPSExcessivelyLoaded | ||||
|       annotations: | ||||
|         summary: One or more UPS-es is loaded more than 80%. Make sure load on UPS-es | ||||
|           is balanced and load for no UPS stays above 50%. | ||||
|       expr: snmp_upsOutputPercentLoad > 80 | ||||
|       for: 1h | ||||
|       labels: | ||||
|         severity: critical | ||||
| --- | ||||
| kind: Probe | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| metadata: | ||||
|   name: printer | ||||
| spec: | ||||
|   interval: 60s | ||||
|   scrapeTimeout: 50s | ||||
|   module: printer_mib | ||||
|   prober: | ||||
|     url: snmp-exporter:9116 | ||||
|     path: /snmp | ||||
|   targets: | ||||
|     staticConfig: | ||||
|       static: | ||||
|         - mfp-cyber.pub.k-space.ee | ||||
|         - mfp-chaos.pub.k-space.ee | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PrometheusRule | ||||
| metadata: | ||||
|   name: printer | ||||
| spec: | ||||
|   groups: | ||||
|   - name: printer | ||||
|     rules: | ||||
|     # Fires immediately when a printer reports snmp_hrPrinterDetectedErrorState == 1. | ||||
|     - alert: PrinterNeedsAttention | ||||
|       annotations: | ||||
|         summary: Printer is in error state. If the underlying reason is 'low on paper' | ||||
|           make sure there is enough paper near the printer. If not, drop a line at | ||||
|           accounting@k-space.ee to order more office supplies. | ||||
|       expr: snmp_hrPrinterDetectedErrorState == 1 | ||||
|       for: 0m | ||||
|       labels: | ||||
|         severity: warning | ||||
| --- | ||||
| kind: Probe | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| metadata: | ||||
|   name: beamer | ||||
| spec: | ||||
|   interval: 60s | ||||
|   module: epson_beamer | ||||
|   prober: | ||||
|     url: snmp-exporter:9116 | ||||
|     path: /snmp | ||||
|   targets: | ||||
|     staticConfig: | ||||
|       static: | ||||
|         - beamer-cyber.sec.k-space.ee | ||||
							
								
								
									
										1272
									
								
								prometheus-operator/snmp.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1272
									
								
								prometheus-operator/snmp.yml
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -19,7 +19,7 @@ but it does not export Prometheus metrics either. | ||||
| To apply changes run in this directory: | ||||
|  | ||||
| ``` | ||||
| kubectl apply -n rosdump -f cronjob.yaml | ||||
| kubectl apply -n rosdump -f application.yml | ||||
| ``` | ||||
|  | ||||
| To trigger cronjob: | ||||
|   | ||||
| @@ -87,7 +87,6 @@ spec: | ||||
|                           path: ssh_known_hosts | ||||
|                   - configMap: | ||||
|                       name: rosdump-config | ||||
|  | ||||
| --- | ||||
| apiVersion: networking.k8s.io/v1 | ||||
| kind: NetworkPolicy | ||||
| @@ -108,3 +107,19 @@ spec: | ||||
|     ports: | ||||
|     - protocol: TCP | ||||
|       port: 22 | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PrometheusRule | ||||
| metadata: | ||||
|   name: rosdump | ||||
| spec: | ||||
|   groups: | ||||
|     - name: rosdump | ||||
|       rules: | ||||
|         - alert: MikrotikBackupsBroken | ||||
|           expr: absent(kube_cronjob_status_last_successful_time{cronjob="rosdump-cronjob"}) or time() - kube_cronjob_status_last_successful_time{cronjob="rosdump-cronjob"} > 3600 | ||||
|           for: 4h | ||||
|           labels: | ||||
|             severity: warning | ||||
|           annotations: | ||||
|             summary: Mikrotik backups are broken | ||||
|   | ||||
| @@ -1,6 +1,7 @@ | ||||
| Traefik Ingress Controller: | ||||
|  | ||||
| ``` | ||||
| kubectl create namespace traefik | ||||
| helm template --include-crds -n traefik --release-name k6 traefik/traefik -f values.yml > application.yml | ||||
| kubectl apply -n traefik -f namespace.yml -f application.yml -f application-extras.yml -f whoami.yml -f proxmox.yml -f voron.yml | ||||
| kubectl apply -n traefik -f application.yml -f application-extras.yml -f whoami.yml -f proxmox.yml -f voron.yml | ||||
| ``` | ||||
|   | ||||
| @@ -28,9 +28,6 @@ kind: Service | ||||
| metadata: | ||||
|   name: traefik-metrics | ||||
|   namespace: traefik | ||||
|   annotations: | ||||
|     prometheus.io/scrape: 'true' | ||||
|     prometheus.io/port: '9100' | ||||
| spec: | ||||
|   selector: | ||||
|     app.kubernetes.io/name: traefik | ||||
| @@ -92,6 +89,16 @@ spec: | ||||
|   - Ingress | ||||
|   - Egress | ||||
|   ingress: | ||||
|   - from: | ||||
|     - namespaceSelector: | ||||
|         matchLabels: | ||||
|           kubernetes.io/metadata.name: prometheus-operator | ||||
|       podSelector: | ||||
|         matchLabels: | ||||
|           app.kubernetes.io/name: prometheus | ||||
|     ports: | ||||
|     - protocol: TCP | ||||
|       port: 9100 | ||||
|   - from: | ||||
|     - ipBlock: | ||||
|         cidr: 0.0.0.0/0 | ||||
| @@ -109,3 +116,14 @@ spec: | ||||
|   replacePathRegex: | ||||
|     regex: ^/metrics | ||||
|     replacement: / | ||||
| --- | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: PodMonitor | ||||
| metadata: | ||||
|   name: traefik | ||||
| spec: | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app.kubernetes.io/name: traefik | ||||
|   podMetricsEndpoints: | ||||
|     - port: metrics | ||||
|   | ||||
| @@ -17,9 +17,8 @@ deployment: | ||||
|     keel.sh/trigger: patch | ||||
|     keel.sh/pollSchedule: "@midnight" | ||||
|  | ||||
|   podAnnotations: | ||||
|     prometheus.io/scrape: 'true' | ||||
|     prometheus.io/port: '9100' | ||||
| accessLog: | ||||
|   format: json | ||||
|  | ||||
| # Globally redirect to https:// | ||||
| globalArguments: | ||||
|   | ||||
		Reference in New Issue
	
	Block a user