From 877e90e03405e954f1e27501332995b63f208a28 Mon Sep 17 00:00:00 2001
From: Mehran Kholdi
Date: Fri, 2 Jul 2021 20:30:48 +0430
Subject: [PATCH] Expose volume stats as prometheus metrics

This should help in:
- Keeping track of deleted PVs with `Retain` policy
- Detecting disk overprovisioning
---
Revised after the original commit: get_volumes_stats() is annotated with its
real return type and skips volumes that vanish mid-scan, and collect() reuses
a single volume scan for all gauges. The commit id and the post-image blob
ids in the headers are those of the original revision.

 .../templates/01-controller-plugin.yaml       |  3 ++
 deploy/charts/rawfile-csi/values.yaml         |  2 +-
 metrics.py                                    | 41 +++++++++++++++++--
 rawfile.py                                    |  2 +-
 rawfile_util.py                               | 36 ++++++++++++++++
 5 files changed, 79 insertions(+), 5 deletions(-)

diff --git a/deploy/charts/rawfile-csi/templates/01-controller-plugin.yaml b/deploy/charts/rawfile-csi/templates/01-controller-plugin.yaml
index f55912b..694faf1 100644
--- a/deploy/charts/rawfile-csi/templates/01-controller-plugin.yaml
+++ b/deploy/charts/rawfile-csi/templates/01-controller-plugin.yaml
@@ -41,6 +41,9 @@ spec:
         - name: csi-driver
           image: "{{ .Values.controller.image.repository }}:{{ .Values.controller.image.tag }}"
           imagePullPolicy: {{ .Values.controller.image.pullPolicy }}
+          args:
+            - csi-driver
+            - --disable-metrics
           env:
             - name: PROVISIONER_NAME
               value: "{{ .Values.provisionerName }}"
diff --git a/deploy/charts/rawfile-csi/values.yaml b/deploy/charts/rawfile-csi/values.yaml
index b6df2ba..2ea5c52 100644
--- a/deploy/charts/rawfile-csi/values.yaml
+++ b/deploy/charts/rawfile-csi/values.yaml
@@ -24,4 +24,4 @@ node:
 imagePullSecrets: []
 serviceMonitor:
   enabled: true
-  interval: 15s
+  interval: 1m
diff --git a/metrics.py b/metrics.py
index 0830b22..8f4d6a7 100644
--- a/metrics.py
+++ b/metrics.py
@@ -1,12 +1,47 @@
 from prometheus_client.core import REGISTRY
 from prometheus_client.exposition import start_http_server
+from prometheus_client.metrics_core import GaugeMetricFamily
+
+from rawfile_util import get_capacity, get_volumes_stats
 
 
 class VolumeStatsCollector(object):
+    """Prometheus collector reporting this node's capacity and volume sizes."""
+
+    def __init__(self, node):
+        self.node = node
+
     def collect(self):
-        return []
+        """Return the capacity and per-volume gauges for one scrape."""
+        remaining_capacity = GaugeMetricFamily(
+            "rawfile_remaining_capacity",
+            "Remaining capacity for creating new volumes on this node",
+            labels=["node"],
+            unit="bytes",
+        )
+        volume_used = GaugeMetricFamily(
+            "rawfile_volume_used",
+            "Actual amount of disk used space by volume",
+            labels=["node", "volume"],
+            unit="bytes",
+        )
+        volume_total = GaugeMetricFamily(
+            "rawfile_volume_total",
+            "Amount of disk allocated to this volume",
+            labels=["node", "volume"],
+            unit="bytes",
+        )
+        # Scan the volumes once so every gauge in this scrape describes the
+        # same snapshot.
+        volumes_stats = get_volumes_stats()
+        remaining_capacity.add_metric([self.node], get_capacity(volumes_stats))
+        for volume_id, stats in volumes_stats.items():
+            volume_used.add_metric([self.node, volume_id], stats["used"])
+            volume_total.add_metric([self.node, volume_id], stats["total"])
+        return [remaining_capacity, volume_used, volume_total]
 
 
-def expose_metrics():
-    REGISTRY.register(VolumeStatsCollector())
+def expose_metrics(node):
+    """Register the volume stats collector and serve metrics on port 9100."""
+    REGISTRY.register(VolumeStatsCollector(node))
     start_http_server(9100)
diff --git a/rawfile.py b/rawfile.py
index 1eae50f..634a916 100755
--- a/rawfile.py
+++ b/rawfile.py
@@ -28,7 +28,7 @@ def cli(image_repository, image_tag):
 def csi_driver(endpoint, nodeid, enable_metrics):
     migrate_all_volume_schemas()
     if enable_metrics:
-        expose_metrics()
+        expose_metrics(nodeid)
     server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
     csi_pb2_grpc.add_IdentityServicer_to_server(
         bd2fs.Bd2FsIdentityServicer(rawfile_servicer.RawFileIdentityServicer()), server
diff --git a/rawfile_util.py b/rawfile_util.py
index 3577a3b..c4aa5cc 100644
--- a/rawfile_util.py
+++ b/rawfile_util.py
@@ -6,6 +6,7 @@ import time
 
 from consts import DATA_DIR
 from declarative import be_absent
+from fs_util import path_stats
 from volume_schema import migrate_to, LATEST_SCHEMA_VERSION
 from util import run, run_out
 
@@ -113,3 +114,38 @@ def migrate_all_volume_schemas():
 def gc_all_volumes(dry_run=True):
     for volume_id in list_all_volumes():
         gc_if_needed(volume_id, dry_run=dry_run)
+
+
+def get_volumes_stats() -> dict:
+    """Map each volume id to its "used" and "total" sizes in bytes."""
+    volumes_stats = {}
+    for volume_id in list_all_volumes():
+        try:
+            stats = img_file(volume_id=volume_id).stat()
+        except FileNotFoundError:
+            # The volume was removed after being listed; skip it rather
+            # than failing the whole scan.
+            continue
+        volumes_stats[volume_id] = {
+            # st_blocks is always counted in 512-byte units.
+            "used": stats.st_blocks * 512,
+            "total": stats.st_size,
+        }
+    return volumes_stats
+
+
+def get_capacity(volumes_stats=None):
+    """Return the bytes still available for provisioning new volumes.
+
+    Free filesystem space is reduced by the allocated-but-unwritten part of
+    every existing image ("total" minus "used").
+
+    Pass an existing ``get_volumes_stats()`` result as ``volumes_stats`` to
+    avoid scanning the volumes a second time.
+    """
+    capacity = path_stats(DATA_DIR)["fs_avail"]
+    if volumes_stats is None:
+        volumes_stats = get_volumes_stats()
+    for volume_stat in volumes_stats.values():
+        capacity -= volume_stat["total"] - volume_stat["used"]
+    return capacity