Expose volume stats as prometheus metrics
This should help with keeping track of deleted PVs that use the `Retain` policy, and with detecting disk overprovisioning.
This commit is contained in:
parent
2b6a0a33b8
commit
877e90e034
@ -41,6 +41,9 @@ spec:
|
|||||||
- name: csi-driver
|
- name: csi-driver
|
||||||
image: "{{ .Values.controller.image.repository }}:{{ .Values.controller.image.tag }}"
|
image: "{{ .Values.controller.image.repository }}:{{ .Values.controller.image.tag }}"
|
||||||
imagePullPolicy: {{ .Values.controller.image.pullPolicy }}
|
imagePullPolicy: {{ .Values.controller.image.pullPolicy }}
|
||||||
|
args:
|
||||||
|
- csi-driver
|
||||||
|
- --disable-metrics
|
||||||
env:
|
env:
|
||||||
- name: PROVISIONER_NAME
|
- name: PROVISIONER_NAME
|
||||||
value: "{{ .Values.provisionerName }}"
|
value: "{{ .Values.provisionerName }}"
|
||||||
|
@ -24,4 +24,4 @@ node:
|
|||||||
imagePullSecrets: []
|
imagePullSecrets: []
|
||||||
serviceMonitor:
|
serviceMonitor:
|
||||||
enabled: true
|
enabled: true
|
||||||
interval: 15s
|
interval: 1m
|
||||||
|
34
metrics.py
34
metrics.py
@ -1,12 +1,40 @@
|
|||||||
from prometheus_client.core import REGISTRY
|
from prometheus_client.core import REGISTRY
|
||||||
from prometheus_client.exposition import start_http_server
|
from prometheus_client.exposition import start_http_server
|
||||||
|
from prometheus_client.metrics_core import GaugeMetricFamily
|
||||||
|
|
||||||
|
from rawfile_util import get_capacity, get_volumes_stats
|
||||||
|
|
||||||
|
|
||||||
class VolumeStatsCollector(object):
    """Prometheus collector reporting rawfile capacity and volume usage.

    Every sample is labelled with the node name given at construction time;
    per-volume samples additionally carry the volume id.
    """

    def __init__(self, node):
        """Remember the node name used as the ``node`` label value."""
        self.node = node

    def collect(self):
        """Build and return the three gauge families for the current state.

        Returns:
            A list with the remaining-capacity gauge, the per-volume used
            gauge and the per-volume total gauge, in that order.
        """
        capacity_gauge = GaugeMetricFamily(
            "rawfile_remaining_capacity",
            "Remaining capacity for creating new volumes on this node",
            labels=["node"],
            unit="bytes",
        )
        used_gauge = GaugeMetricFamily(
            "rawfile_volume_used",
            "Actual amount of disk used space by volume",
            labels=["node", "volume"],
            unit="bytes",
        )
        total_gauge = GaugeMetricFamily(
            "rawfile_volume_total",
            "Amount of disk allocated to this volume",
            labels=["node", "volume"],
            unit="bytes",
        )

        # Node-wide sample first, then one used/total pair per volume.
        capacity_gauge.add_metric([self.node], get_capacity())
        per_volume = get_volumes_stats()
        for vol_id in per_volume:
            vol_stats = per_volume[vol_id]
            label_values = [self.node, vol_id]
            used_gauge.add_metric(label_values, vol_stats["used"])
            total_gauge.add_metric(label_values, vol_stats["total"])

        return [capacity_gauge, used_gauge, total_gauge]
|
||||||
|
|
||||||
|
|
||||||
def expose_metrics(node):
    """Register the volume stats collector and serve metrics over HTTP.

    Args:
        node: Node name passed to ``VolumeStatsCollector``; it becomes the
            ``node`` label on every exported sample.
    """
    collector = VolumeStatsCollector(node)
    REGISTRY.register(collector)
    # The metrics endpoint listens on port 9100.
    start_http_server(9100)
|
||||||
|
@ -28,7 +28,7 @@ def cli(image_repository, image_tag):
|
|||||||
def csi_driver(endpoint, nodeid, enable_metrics):
|
def csi_driver(endpoint, nodeid, enable_metrics):
|
||||||
migrate_all_volume_schemas()
|
migrate_all_volume_schemas()
|
||||||
if enable_metrics:
|
if enable_metrics:
|
||||||
expose_metrics()
|
expose_metrics(nodeid)
|
||||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
|
server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
|
||||||
csi_pb2_grpc.add_IdentityServicer_to_server(
|
csi_pb2_grpc.add_IdentityServicer_to_server(
|
||||||
bd2fs.Bd2FsIdentityServicer(rawfile_servicer.RawFileIdentityServicer()), server
|
bd2fs.Bd2FsIdentityServicer(rawfile_servicer.RawFileIdentityServicer()), server
|
||||||
|
@ -6,6 +6,7 @@ import time
|
|||||||
|
|
||||||
from consts import DATA_DIR
|
from consts import DATA_DIR
|
||||||
from declarative import be_absent
|
from declarative import be_absent
|
||||||
|
from fs_util import path_stats
|
||||||
from volume_schema import migrate_to, LATEST_SCHEMA_VERSION
|
from volume_schema import migrate_to, LATEST_SCHEMA_VERSION
|
||||||
from util import run, run_out
|
from util import run, run_out
|
||||||
|
|
||||||
@ -113,3 +114,23 @@ def migrate_all_volume_schemas():
|
|||||||
def gc_all_volumes(dry_run=True):
|
def gc_all_volumes(dry_run=True):
|
||||||
for volume_id in list_all_volumes():
|
for volume_id in list_all_volumes():
|
||||||
gc_if_needed(volume_id, dry_run=dry_run)
|
gc_if_needed(volume_id, dry_run=dry_run)
|
||||||
|
|
||||||
|
|
||||||
|
def get_volumes_stats() -> dict:
    """Collect disk usage figures for every volume's image file.

    Returns:
        A dict keyed by volume id. Each value is a dict with two entries:
        ``"used"`` is the space actually allocated on disk
        (``st_blocks * 512``) and ``"total"`` is the apparent size of the
        image file (``st_size``). The two differ when the image is sparse.
    """
    volumes_stats = {}
    for volume_id in list_all_volumes():
        image_stat = img_file(volume_id=volume_id).stat()
        volumes_stats[volume_id] = {
            # st_blocks is counted in 512-byte units, independent of the
            # filesystem's own block size.
            "used": image_stat.st_blocks * 512,
            "total": image_stat.st_size,
        }
    return volumes_stats
|
||||||
|
|
||||||
|
|
||||||
|
def get_capacity():
    """Return the space still available for creating new volumes.

    Starts from the ``"fs_avail"`` figure that ``path_stats`` reports for
    ``DATA_DIR`` and subtracts, for each existing volume, the part of its
    total size that is not yet used on disk (``total - used``). The result
    can be negative when volumes are overprovisioned.
    """
    remaining = path_stats(DATA_DIR)["fs_avail"]
    for entry in get_volumes_stats().values():
        # Space promised to the volume but not yet written to disk.
        unused_reservation = entry["total"] - entry["used"]
        remaining -= unused_reservation
    return remaining
|
||||||
|
Loading…
Reference in New Issue
Block a user