Expose volume stats as prometheus metrics
This should help in: keeping track of deleted PVs with the `Retain` reclaim policy, and detecting disk overprovisioning.
This commit is contained in:
parent
2b6a0a33b8
commit
877e90e034
@ -41,6 +41,9 @@ spec:
|
||||
- name: csi-driver
|
||||
image: "{{ .Values.controller.image.repository }}:{{ .Values.controller.image.tag }}"
|
||||
imagePullPolicy: {{ .Values.controller.image.pullPolicy }}
|
||||
args:
|
||||
- csi-driver
|
||||
- --disable-metrics
|
||||
env:
|
||||
- name: PROVISIONER_NAME
|
||||
value: "{{ .Values.provisionerName }}"
|
||||
|
@ -24,4 +24,4 @@ node:
|
||||
imagePullSecrets: []
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
interval: 15s
|
||||
interval: 1m
|
||||
|
34
metrics.py
34
metrics.py
@ -1,12 +1,40 @@
|
||||
from prometheus_client.core import REGISTRY
|
||||
from prometheus_client.exposition import start_http_server
|
||||
from prometheus_client.metrics_core import GaugeMetricFamily
|
||||
|
||||
from rawfile_util import get_capacity, get_volumes_stats
|
||||
|
||||
|
||||
class VolumeStatsCollector:
    """Prometheus custom collector exposing rawfile volume stats for one node.

    Emits three gauge families on every scrape: remaining provisioning
    capacity for the node, and per-volume used/total bytes.
    """

    def __init__(self, node):
        # Node name; attached as a label to every emitted sample.
        self.node = node

    def collect(self):
        """Build and return the gauge metric families for this scrape."""
        capacity_gauge = GaugeMetricFamily(
            "rawfile_remaining_capacity",
            "Remaining capacity for creating new volumes on this node",
            labels=["node"],
            unit="bytes",
        )
        used_gauge = GaugeMetricFamily(
            "rawfile_volume_used",
            "Actual amount of disk used space by volume",
            labels=["node", "volume"],
            unit="bytes",
        )
        total_gauge = GaugeMetricFamily(
            "rawfile_volume_total",
            "Amount of disk allocated to this volume",
            labels=["node", "volume"],
            unit="bytes",
        )
        capacity_gauge.add_metric([self.node], get_capacity())
        for vol_id, vol_stats in get_volumes_stats().items():
            used_gauge.add_metric([self.node, vol_id], vol_stats["used"])
            total_gauge.add_metric([self.node, vol_id], vol_stats["total"])
        return [capacity_gauge, used_gauge, total_gauge]
|
||||
def expose_metrics(node):
    """Register the per-node volume stats collector and serve metrics.

    Starts the prometheus_client HTTP exporter on port 9100; scrapes will
    invoke VolumeStatsCollector.collect() lazily.
    """
    collector = VolumeStatsCollector(node)
    REGISTRY.register(collector)
    start_http_server(9100)
|
@ -28,7 +28,7 @@ def cli(image_repository, image_tag):
|
||||
def csi_driver(endpoint, nodeid, enable_metrics):
|
||||
migrate_all_volume_schemas()
|
||||
if enable_metrics:
|
||||
expose_metrics()
|
||||
expose_metrics(nodeid)
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
|
||||
csi_pb2_grpc.add_IdentityServicer_to_server(
|
||||
bd2fs.Bd2FsIdentityServicer(rawfile_servicer.RawFileIdentityServicer()), server
|
||||
|
@ -6,6 +6,7 @@ import time
|
||||
|
||||
from consts import DATA_DIR
|
||||
from declarative import be_absent
|
||||
from fs_util import path_stats
|
||||
from volume_schema import migrate_to, LATEST_SCHEMA_VERSION
|
||||
from util import run, run_out
|
||||
|
||||
@ -113,3 +114,23 @@ def migrate_all_volume_schemas():
|
||||
def gc_all_volumes(dry_run=True):
    """Run garbage collection over every known volume.

    When dry_run is true, gc_if_needed only reports what it would do.
    """
    for vol_id in list_all_volumes():
        gc_if_needed(vol_id, dry_run=dry_run)
||||
|
||||
def get_volumes_stats() -> dict:
    """Return per-volume disk usage, keyed by volume id.

    Each value is a dict with:
      - "used": bytes physically allocated to the backing image file
        (``st_blocks`` is counted in 512-byte units per POSIX);
      - "total": the file's apparent size, i.e. the capacity promised to
        the volume (a sparse file may physically use less).

    Note: the original annotation was ``-> [dict]`` — a list literal, not a
    type — and wrongly suggested a list of dicts; the function returns a dict.
    """
    volumes_stats = {}
    for volume_id in list_all_volumes():
        file = img_file(volume_id=volume_id)
        stats = file.stat()
        volumes_stats[volume_id] = {
            "used": stats.st_blocks * 512,  # POSIX: st_blocks is in 512B blocks
            "total": stats.st_size,
        }
    return volumes_stats
|
||||
|
||||
def get_capacity():
    """Return the bytes available for provisioning new volumes on this node.

    Starts from the data filesystem's free space, then reserves the slice of
    every existing volume that is promised but not yet physically used
    (total - used of a sparse image), so thin-provisioned commitments are
    accounted for before advertising capacity.
    """
    capacity = path_stats(DATA_DIR)["fs_avail"]
    for stats in get_volumes_stats().values():
        capacity -= stats["total"] - stats["used"]
    return capacity
|
Loading…
Reference in New Issue
Block a user