Expose volume stats as Prometheus metrics

This should help in:

- Keeping track of deleted PVs with the `Retain` reclaim policy
- Detecting disk overprovisioning
Mehran Kholdi 2021-07-02 20:30:48 +04:30
parent 2b6a0a33b8
commit 877e90e034
5 changed files with 57 additions and 5 deletions

View File

@@ -41,6 +41,9 @@ spec:
         - name: csi-driver
           image: "{{ .Values.controller.image.repository }}:{{ .Values.controller.image.tag }}"
           imagePullPolicy: {{ .Values.controller.image.pullPolicy }}
+          args:
+            - csi-driver
+            - --disable-metrics
           env:
             - name: PROVISIONER_NAME
               value: "{{ .Values.provisionerName }}"

View File

@@ -24,4 +24,4 @@ node:
 imagePullSecrets: []
 serviceMonitor:
   enabled: true
-  interval: 15s
+  interval: 1m

View File

@@ -1,12 +1,40 @@
 from prometheus_client.core import REGISTRY
 from prometheus_client.exposition import start_http_server
+from prometheus_client.metrics_core import GaugeMetricFamily
+
+from rawfile_util import get_capacity, get_volumes_stats
 
 
 class VolumeStatsCollector(object):
+    def __init__(self, node):
+        self.node = node
+
     def collect(self):
-        return []
+        remaining_capacity = GaugeMetricFamily(
+            "rawfile_remaining_capacity",
+            "Remaining capacity for creating new volumes on this node",
+            labels=["node"],
+            unit="bytes",
+        )
+        volume_used = GaugeMetricFamily(
+            "rawfile_volume_used",
+            "Actual amount of disk space used by the volume",
+            labels=["node", "volume"],
+            unit="bytes",
+        )
+        volume_total = GaugeMetricFamily(
+            "rawfile_volume_total",
+            "Amount of disk allocated to this volume",
+            labels=["node", "volume"],
+            unit="bytes",
+        )
+
+        remaining_capacity.add_metric([self.node], get_capacity())
+        for volume_id, stats in get_volumes_stats().items():
+            volume_used.add_metric([self.node, volume_id], stats["used"])
+            volume_total.add_metric([self.node, volume_id], stats["total"])
+
+        return [remaining_capacity, volume_used, volume_total]
 
 
-def expose_metrics():
-    REGISTRY.register(VolumeStatsCollector())
+def expose_metrics(node):
+    REGISTRY.register(VolumeStatsCollector(node))
     start_http_server(9100)
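
For reference, prometheus_client's custom-collector interface only needs a collect() method returning metric families, and a registered collector has collect() called afresh on every scrape. Note that unit="bytes" makes prometheus_client append a _bytes suffix, so the series are scraped as rawfile_remaining_capacity_bytes, rawfile_volume_used_bytes and rawfile_volume_total_bytes. A self-contained sketch of the same pattern with made-up values (illustrative only, not the real rawfile_util data):

from prometheus_client import generate_latest
from prometheus_client.core import CollectorRegistry, GaugeMetricFamily


class DemoVolumeStatsCollector:
    # Same shape as VolumeStatsCollector above, but fed with fake numbers.
    def __init__(self, node):
        self.node = node

    def collect(self):
        remaining = GaugeMetricFamily(
            "rawfile_remaining_capacity",
            "Remaining capacity for creating new volumes on this node",
            labels=["node"],
            unit="bytes",  # rendered as rawfile_remaining_capacity_bytes
        )
        remaining.add_metric([self.node], 42 * 1024 ** 3)  # illustrative value
        return [remaining]


registry = CollectorRegistry()
registry.register(DemoVolumeStatsCollector("worker-1"))
print(generate_latest(registry).decode())  # text exposition, ready for Prometheus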

View File

@@ -28,7 +28,7 @@ def cli(image_repository, image_tag):
 def csi_driver(endpoint, nodeid, enable_metrics):
     migrate_all_volume_schemas()
     if enable_metrics:
-        expose_metrics()
+        expose_metrics(nodeid)
     server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
     csi_pb2_grpc.add_IdentityServicer_to_server(
         bd2fs.Bd2FsIdentityServicer(rawfile_servicer.RawFileIdentityServicer()), server
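
Since start_http_server(9100) serves the registry over plain HTTP next to the gRPC server, the quickest sanity check is to fetch that endpoint from inside the pod (or through a port-forward); a small sketch, assuming the address and port from the code above:

import urllib.request

# Pull the exposition text from the node plugin and keep only rawfile_* series.
# 127.0.0.1:9100 assumes this runs inside the pod or behind a port-forward.
with urllib.request.urlopen("http://127.0.0.1:9100/metrics") as resp:
    for line in resp.read().decode().splitlines():
        if line.startswith("rawfile_"):
            print(line)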

View File

@@ -6,6 +6,7 @@ import time
 from consts import DATA_DIR
 from declarative import be_absent
+from fs_util import path_stats
 from volume_schema import migrate_to, LATEST_SCHEMA_VERSION
 from util import run, run_out
@@ -113,3 +114,23 @@ def migrate_all_volume_schemas():
 def gc_all_volumes(dry_run=True):
     for volume_id in list_all_volumes():
         gc_if_needed(volume_id, dry_run=dry_run)
+
+
+def get_volumes_stats() -> dict:
+    volumes_stats = {}
+    for volume_id in list_all_volumes():
+        file = img_file(volume_id=volume_id)
+        stats = file.stat()
+        volumes_stats[volume_id] = {
+            "used": stats.st_blocks * 512,
+            "total": stats.st_size,
+        }
+    return volumes_stats
+
+
+def get_capacity():
+    disk_free_size = path_stats(DATA_DIR)["fs_avail"]
+    capacity = disk_free_size
+    for volume_stat in get_volumes_stats().values():
+        capacity -= volume_stat["total"] - volume_stat["used"]
+    return capacity
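
get_volumes_stats() reads both numbers straight from stat(): st_size is the provisioned size of the sparse image file, while st_blocks * 512 is the space actually allocated on disk (on Linux, st_blocks counts 512-byte blocks). get_capacity() then subtracts every volume's unwritten remainder (total minus used) from the filesystem's free space, so space promised to thin-provisioned volumes is not offered again to new ones. A worked example with illustrative numbers:

# Illustrative numbers only, not taken from a real node.
GiB = 1024 ** 3
fs_avail = 100 * GiB  # free space reported for DATA_DIR's filesystem

volumes = {
    "pvc-a": {"total": 30 * GiB, "used": 5 * GiB},   # mostly sparse
    "pvc-b": {"total": 20 * GiB, "used": 20 * GiB},  # fully written
}

capacity = fs_avail
for stat in volumes.values():
    # Reserve the space each image may still grow into.
    capacity -= stat["total"] - stat["used"]

print(capacity // GiB)  # 75 GiB left for provisioning new volumes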