Fix race condition that was causing dangling loop devices

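It turns out to be wrong to assume that `DeleteVolume` is only called
after `UnstageVolume` has returned successfully. As a result, the disk
image file could be deleted while the volume was still mounted, which
prevented the loop device from being detached and, in turn, the disk
space from being reclaimed.

`DeleteVolume` now checks whether any loop device is still attached to
the image file and, if so, fails with `FAILED_PRECONDITION` ("Volume in
use") instead of deleting the image.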
This commit is contained in:
Mehran Kholdi 2021-11-19 18:59:49 +03:30
parent 9d5ed19d7b
commit 63c8eb44ba
3 changed files with 22 additions and 2 deletions

View File

@ -5,3 +5,4 @@ PROVISIONER_VERSION = "0.7.0"
DATA_DIR = "/data" DATA_DIR = "/data"
CONFIG = {} CONFIG = {}
RESOURCE_EXHAUSTED_EXIT_CODE = 101 RESOURCE_EXHAUSTED_EXIT_CODE = 101
VOLUME_IN_USE_EXIT_CODE = 102

View File

@ -5,7 +5,12 @@ import grpc
from google.protobuf.wrappers_pb2 import BoolValue from google.protobuf.wrappers_pb2 import BoolValue
import rawfile_util import rawfile_util
from consts import PROVISIONER_VERSION, PROVISIONER_NAME, RESOURCE_EXHAUSTED_EXIT_CODE from consts import (
PROVISIONER_VERSION,
PROVISIONER_NAME,
RESOURCE_EXHAUSTED_EXIT_CODE,
VOLUME_IN_USE_EXIT_CODE,
)
from csi import csi_pb2, csi_pb2_grpc from csi import csi_pb2, csi_pb2_grpc
from declarative import be_symlink, be_absent from declarative import be_symlink, be_absent
from fs_util import device_stats, mountpoint_to_dev from fs_util import device_stats, mountpoint_to_dev
@ -208,7 +213,13 @@ class RawFileControllerServicer(csi_pb2_grpc.ControllerServicer):
@log_grpc_request @log_grpc_request
def DeleteVolume(self, request, context): def DeleteVolume(self, request, context):
try:
scrub(volume_id=request.volume_id) scrub(volume_id=request.volume_id)
except CalledProcessError as exc:
if exc.returncode == VOLUME_IN_USE_EXIT_CODE:
context.abort(grpc.StatusCode.FAILED_PRECONDITION, "Volume in use")
else:
raise exc
return csi_pb2.DeleteVolumeResponse() return csi_pb2.DeleteVolumeResponse()
@log_grpc_request @log_grpc_request

View File

@ -3,12 +3,20 @@ from util import remote_fn
def scrub(volume_id): def scrub(volume_id):
import time import time
from subprocess import CalledProcessError
import rawfile_util import rawfile_util
from consts import VOLUME_IN_USE_EXIT_CODE
img_dir = rawfile_util.img_dir(volume_id) img_dir = rawfile_util.img_dir(volume_id)
if not img_dir.exists(): if not img_dir.exists():
return return
img_file = rawfile_util.img_file(volume_id)
loops = rawfile_util.attached_loops(img_file)
if len(loops) > 0:
raise CalledProcessError(returncode=VOLUME_IN_USE_EXIT_CODE, cmd="")
now = time.time() now = time.time()
deleted_at = now deleted_at = now
gc_at = now # TODO: GC sensitive PVCs later gc_at = now # TODO: GC sensitive PVCs later