Expose volume stats as prometheus metrics
This should help with: - Keeping track of deleted PVs with the `Retain` policy - Detecting disk overprovisioning
This commit is contained in:
		@@ -41,6 +41,9 @@ spec:
 | 
			
		||||
        - name: csi-driver
 | 
			
		||||
          image: "{{ .Values.controller.image.repository }}:{{ .Values.controller.image.tag }}"
 | 
			
		||||
          imagePullPolicy: {{ .Values.controller.image.pullPolicy }}
 | 
			
		||||
          args:
 | 
			
		||||
            - csi-driver
 | 
			
		||||
            - --disable-metrics
 | 
			
		||||
          env:
 | 
			
		||||
            - name: PROVISIONER_NAME
 | 
			
		||||
              value: "{{ .Values.provisionerName }}"
 | 
			
		||||
 
 | 
			
		||||
@@ -24,4 +24,4 @@ node:
 | 
			
		||||
imagePullSecrets: []
 | 
			
		||||
serviceMonitor:
 | 
			
		||||
  enabled: true
 | 
			
		||||
  interval: 15s
 | 
			
		||||
  interval: 1m
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										34
									
								
								metrics.py
									
									
									
									
									
								
							
							
						
						
									
										34
									
								
								metrics.py
									
									
									
									
									
								
							@@ -1,12 +1,40 @@
 | 
			
		||||
from prometheus_client.core import REGISTRY
 | 
			
		||||
from prometheus_client.exposition import start_http_server
 | 
			
		||||
from prometheus_client.metrics_core import GaugeMetricFamily
 | 
			
		||||
 | 
			
		||||
from rawfile_util import get_capacity, get_volumes_stats
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class VolumeStatsCollector(object):
    """Custom Prometheus collector that reports rawfile volume statistics.

    An instance is meant to be registered with a prometheus_client registry,
    which calls ``collect`` to obtain the current metric families.
    """

    def __init__(self, node):
        """Store ``node``; it is used as the ``node`` label value on every sample."""
        self.node = node

    def collect(self):
        """Build the capacity and per-volume gauges from the current state.

        Returns:
            A list of three ``GaugeMetricFamily`` objects: the remaining
            capacity of this node, and the used / total size of each volume
            reported by ``get_volumes_stats``.
        """
        remaining_capacity = GaugeMetricFamily(
            "rawfile_remaining_capacity",
            "Remaining capacity for creating new volumes on this node",
            labels=["node"],
            unit="bytes",
        )
        volume_used = GaugeMetricFamily(
            "rawfile_volume_used",
            "Actual amount of disk used space by volume",
            labels=["node", "volume"],
            unit="bytes",
        )
        volume_total = GaugeMetricFamily(
            "rawfile_volume_total",
            "Amount of disk allocated to this volume",
            labels=["node", "volume"],
            unit="bytes",
        )

        # One node-level sample, then one used/total sample pair per volume.
        remaining_capacity.add_metric([self.node], get_capacity())
        for volume_id, stats in get_volumes_stats().items():
            volume_used.add_metric([self.node, volume_id], stats["used"])
            volume_total.add_metric([self.node, volume_id], stats["total"])
        return [remaining_capacity, volume_used, volume_total]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def expose_metrics(node):
    """Register the volume stats collector and start the metrics HTTP server.

    Args:
        node: Identifier passed to ``VolumeStatsCollector``; it becomes the
            ``node`` label on the exported samples.
    """
    REGISTRY.register(VolumeStatsCollector(node))
    # Metrics are served by prometheus_client's built-in HTTP server on port 9100.
    start_http_server(9100)
 | 
			
		||||
 
 | 
			
		||||
@@ -28,7 +28,7 @@ def cli(image_repository, image_tag):
 | 
			
		||||
def csi_driver(endpoint, nodeid, enable_metrics):
 | 
			
		||||
    migrate_all_volume_schemas()
 | 
			
		||||
    if enable_metrics:
 | 
			
		||||
        expose_metrics()
 | 
			
		||||
        expose_metrics(nodeid)
 | 
			
		||||
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
 | 
			
		||||
    csi_pb2_grpc.add_IdentityServicer_to_server(
 | 
			
		||||
        bd2fs.Bd2FsIdentityServicer(rawfile_servicer.RawFileIdentityServicer()), server
 | 
			
		||||
 
 | 
			
		||||
@@ -6,6 +6,7 @@ import time
 | 
			
		||||
 | 
			
		||||
from consts import DATA_DIR
 | 
			
		||||
from declarative import be_absent
 | 
			
		||||
from fs_util import path_stats
 | 
			
		||||
from volume_schema import migrate_to, LATEST_SCHEMA_VERSION
 | 
			
		||||
from util import run, run_out
 | 
			
		||||
 | 
			
		||||
@@ -113,3 +114,23 @@ def migrate_all_volume_schemas():
 | 
			
		||||
def gc_all_volumes(dry_run=True):
    """Invoke ``gc_if_needed`` for every volume from ``list_all_volumes``.

    Args:
        dry_run: Forwarded unchanged to each ``gc_if_needed`` call.
    """
    for vol in list_all_volumes():
        gc_if_needed(vol, dry_run=dry_run)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_volumes_stats() -> dict:
    """Collect disk usage figures for every volume image file.

    Returns:
        A dict keyed by volume id. Each value is a dict with:
          - ``"used"``: bytes actually allocated on disk for the image file,
          - ``"total"``: apparent size of the image file in bytes.
    """
    volumes_stats = {}
    for volume_id in list_all_volumes():
        stats = img_file(volume_id=volume_id).stat()
        volumes_stats[volume_id] = {
            # st_blocks is counted in 512-byte units, independent of the
            # filesystem's own block size.
            "used": stats.st_blocks * 512,
            "total": stats.st_size,
        }
    return volumes_stats
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_capacity():
    """Return the space left for new volumes under ``DATA_DIR``.

    Starts from the ``fs_avail`` figure reported by ``path_stats`` and
    subtracts, for each existing volume, the part of its total size that is
    not yet used on disk.
    """
    fs_avail = path_stats(DATA_DIR)["fs_avail"]
    # Allocated-but-unwritten space is still owed to existing volumes.
    unused_allocated = sum(
        entry["total"] - entry["used"] for entry in get_volumes_stats().values()
    )
    return fs_avail - unused_allocated
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user