monitoring: Specify resource limits

This commit is contained in:
Lauri Võsandi 2024-08-24 12:36:37 +03:00
parent 3fbecab179
commit d7287018ac
3 changed files with 68 additions and 23 deletions

View File

@ -1,15 +1,17 @@
## Monitoring namespace
# Monitoring namespace
Prometheus is accessible at [prom.k-space.ee](https://prom.k-space.ee/)
and the corresponding AlertManager is accessible at [am.k-space.ee](https://am.k-space.ee/).
Both are [deployed by ArgoCD](https://argocd.k-space.ee/applications/monitoring)
from this Git repo directory using Prometheus operator.
Note that Prometheus and other monitoring stack components should appropriate
Note that Prometheus and other monitoring stack components should use an appropriate
node selector to make sure the components get scheduled on nodes which are
hosted in a privileged VLAN where they have access to UPS SNMP targets,
Mikrotik router/switch APIs, etc.
## For users
To add monitoring targets inside the Kubernetes cluster, make use of
[PodMonitor](https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/user-guides/getting-started.md#using-podmonitors) or ServiceMonitor custom
resource definitions.
@ -30,6 +32,8 @@ Sample queries:
* [Disk space left](https://prom.k-space.ee/graph?g0.range_input=1h&g0.expr=node_filesystem_avail_bytes&g0.tab=1)
* Minio [s3 egress](https://prom.k-space.ee/graph?g0.expr=rate(minio_s3_traffic_sent_bytes%5B3m%5D)&g0.tab=0&g0.display_mode=lines&g0.show_exemplars=0&g0.range_input=6h), [internode egress](https://prom.k-space.ee/graph?g0.expr=rate(minio_inter_node_traffic_sent_bytes%5B2m%5D)&g0.tab=0&g0.display_mode=lines&g0.show_exemplars=0&g0.range_input=6h), [storage used](https://prom.k-space.ee/graph?g0.expr=minio_node_disk_used_bytes&g0.tab=0&g0.display_mode=lines&g0.show_exemplars=0&g0.range_input=6h)
## For administrators
To reconfigure SNMP targets etc:
```

View File

@ -3,6 +3,7 @@ apiVersion: monitoring.coreos.com/v1alpha1
kind: AlertmanagerConfig
metadata:
name: alertmanager
namespace: monitoring
labels:
app.kubernetes.io/name: alertmanager
spec:
@ -24,12 +25,12 @@ spec:
apiURL:
name: slack-secrets
key: webhook-url
---
apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
name: alertmanager
namespace: monitoring
spec:
topologySpreadConstraints:
- maxSkew: 1
@ -55,6 +56,13 @@ spec:
value: ''
effect: NoSchedule
replicas: 3
resources:
limits:
cpu: 100m
memory: 100Mi
requests:
cpu: 8m
memory: 35Mi
serviceAccountName: alertmanager
externalUrl: http://am.k-space.ee/
routePrefix: "/"
@ -68,3 +76,4 @@ apiVersion: v1
kind: ServiceAccount
metadata:
name: alertmanager
namespace: monitoring

View File

@ -3,6 +3,7 @@ apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: metrics
namespace: monitoring
spec:
namespaceSelector: {}
selector: {}
@ -14,6 +15,7 @@ apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
name: prometheus
namespace: monitoring
spec:
topologySpreadConstraints:
- maxSkew: 1
@ -53,11 +55,18 @@ spec:
ruleNamespaceSelector: {}
ruleSelector: {}
retentionSize: 8GB
resources:
limits:
cpu: 500m
memory: 2Gi
requests:
cpu: 100m
memory: 700Mi
storage:
volumeClaimTemplate:
spec:
accessModes:
- ReadWriteOnce
- ReadWriteOnce
resources:
requests:
storage: 10Gi
@ -67,36 +76,50 @@ apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus
namespace: monitoring
rules:
- apiGroups: [""]
resources:
- nodes
- nodes/metrics
- services
- endpoints
- pods
verbs: ["get", "list", "watch"]
- apiGroups: [""]
resources:
- configmaps
verbs: ["get"]
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
verbs: ["get"]
- resources:
- nodes
- nodes/metrics
- services
- endpoints
- pods
apiGroups:
- ""
verbs:
- get
- list
- watch
- resources:
- configmaps
apiGroups:
- ""
verbs:
- get
- resources:
- ingresses
apiGroups:
- networking.k8s.io
verbs:
- get
- list
- watch
- nonResourceURLs:
- /metrics
verbs:
- get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus
namespace: monitoring
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
@ -110,6 +133,7 @@ apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: prometheus
namespace: monitoring
spec:
groups:
- name: prometheus
@ -356,6 +380,7 @@ apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: prometheus
namespace: monitoring
annotations:
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.tls: "true"
@ -381,6 +406,7 @@ apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: alertmanager
namespace: monitoring
annotations:
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.tls: "true"
@ -406,6 +432,7 @@ apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: prometheus
namespace: monitoring
spec:
selector:
matchLabels:
@ -417,6 +444,7 @@ apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: alertmanager
namespace: monitoring
spec:
selector:
matchLabels:
@ -428,6 +456,7 @@ apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: operator
namespace: monitoring
spec:
selector:
matchLabels:
@ -439,6 +468,7 @@ apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: kubelet
namespace: monitoring
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
@ -467,6 +497,7 @@ apiVersion: codemowers.cloud/v1beta1
kind: OIDCMiddlewareClient
metadata:
name: prometheus
namespace: monitoring
spec:
displayName: Prometheus
uri: 'https://prom.k-space.ee'
@ -482,6 +513,7 @@ apiVersion: codemowers.cloud/v1beta1
kind: OIDCMiddlewareClient
metadata:
name: alertmanager
namespace: monitoring
spec:
displayName: AlertManager
uri: 'https://am.k-space.ee'