Expose volume stats as prometheus metrics
This should help in: keeping track of deleted PVs with the `Retain` reclaim policy, and detecting disk overprovisioning.
This commit is contained in:
parent
2b6a0a33b8
commit
877e90e034
@ -41,6 +41,9 @@ spec:
|
||||
- name: csi-driver
|
||||
image: "{{ .Values.controller.image.repository }}:{{ .Values.controller.image.tag }}"
|
||||
imagePullPolicy: {{ .Values.controller.image.pullPolicy }}
|
||||
args:
|
||||
- csi-driver
|
||||
- --disable-metrics
|
||||
env:
|
||||
- name: PROVISIONER_NAME
|
||||
value: "{{ .Values.provisionerName }}"
|
||||
|
@ -24,4 +24,4 @@ node:
|
||||
imagePullSecrets: []
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
interval: 15s
|
||||
interval: 1m
|
||||
|
34
metrics.py
34
metrics.py
@ -1,12 +1,40 @@
|
||||
from prometheus_client.core import REGISTRY
|
||||
from prometheus_client.exposition import start_http_server
|
||||
from prometheus_client.metrics_core import GaugeMetricFamily
|
||||
|
||||
from rawfile_util import get_capacity, get_volumes_stats
|
||||
|
||||
|
||||
class VolumeStatsCollector:
    """Prometheus custom collector exposing rawfile volume stats for one node.

    Emits three gauge families on every scrape: remaining provisioning
    capacity for the node, and per-volume used/total bytes.
    """

    def __init__(self, node):
        # Node name; attached as a label to every emitted sample.
        self.node = node

    def collect(self):
        """Build and return the gauge metric families for this scrape."""
        capacity_gauge = GaugeMetricFamily(
            "rawfile_remaining_capacity",
            "Remaining capacity for creating new volumes on this node",
            labels=["node"],
            unit="bytes",
        )
        used_gauge = GaugeMetricFamily(
            "rawfile_volume_used",
            "Actual amount of disk used space by volume",
            labels=["node", "volume"],
            unit="bytes",
        )
        total_gauge = GaugeMetricFamily(
            "rawfile_volume_total",
            "Amount of disk allocated to this volume",
            labels=["node", "volume"],
            unit="bytes",
        )
        capacity_gauge.add_metric([self.node], get_capacity())
        for vol_id, vol_stats in get_volumes_stats().items():
            used_gauge.add_metric([self.node, vol_id], vol_stats["used"])
            total_gauge.add_metric([self.node, vol_id], vol_stats["total"])
        return [capacity_gauge, used_gauge, total_gauge]
|
||||
def expose_metrics(node):
    """Register the per-node volume stats collector and serve metrics.

    Starts the prometheus_client HTTP exporter on port 9100; scrapes will
    invoke VolumeStatsCollector.collect() lazily.
    """
    collector = VolumeStatsCollector(node)
    REGISTRY.register(collector)
    start_http_server(9100)
|
@ -28,7 +28,7 @@ def cli(image_repository, image_tag):
|
||||
def csi_driver(endpoint, nodeid, enable_metrics):
|
||||
migrate_all_volume_schemas()
|
||||
if enable_metrics:
|
||||
expose_metrics()
|
||||
expose_metrics(nodeid)
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
|
||||
csi_pb2_grpc.add_IdentityServicer_to_server(
|
||||
bd2fs.Bd2FsIdentityServicer(rawfile_servicer.RawFileIdentityServicer()), server
|
||||
|
@ -6,6 +6,7 @@ import time
|
||||
|
||||
from consts import DATA_DIR
|
||||
from declarative import be_absent
|
||||
from fs_util import path_stats
|
||||
from volume_schema import migrate_to, LATEST_SCHEMA_VERSION
|
||||
from util import run, run_out
|
||||
|
||||
@ -113,3 +114,23 @@ def migrate_all_volume_schemas():
|
||||
def gc_all_volumes(dry_run=True):
    """Run garbage collection over every known volume.

    When dry_run is true, gc_if_needed only reports what it would do.
    """
    for vol_id in list_all_volumes():
        gc_if_needed(vol_id, dry_run=dry_run)
||||
|
||||
def get_volumes_stats() -> dict:
    """Return per-volume disk usage, keyed by volume id.

    Each value is a dict with:
      - "used": bytes physically allocated to the backing image file
        (``st_blocks`` is counted in 512-byte units per POSIX);
      - "total": the file's apparent size, i.e. the capacity promised to
        the volume (a sparse file may physically use less).

    Note: the original annotation was ``-> [dict]`` — a list literal, not a
    type — and wrongly suggested a list of dicts; the function returns a dict.
    """
    volumes_stats = {}
    for volume_id in list_all_volumes():
        file = img_file(volume_id=volume_id)
        stats = file.stat()
        volumes_stats[volume_id] = {
            "used": stats.st_blocks * 512,  # POSIX: st_blocks is in 512B blocks
            "total": stats.st_size,
        }
    return volumes_stats
|
||||
|
||||
def get_capacity():
    """Return the bytes available for provisioning new volumes on this node.

    Starts from the data filesystem's free space, then reserves the slice of
    every existing volume that is promised but not yet physically used
    (total - used of a sparse image), so thin-provisioned commitments are
    accounted for before advertising capacity.
    """
    capacity = path_stats(DATA_DIR)["fs_avail"]
    for stats in get_volumes_stats().values():
        capacity -= stats["total"] - stats["used"]
    return capacity
|
Loading…
Reference in New Issue
Block a user