From 47d5cdad2c9b99ae723d433c3c1b64523066f3a3 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Tue, 23 Jun 2026 13:12:06 -0700 Subject: [PATCH] Do not remove driver when gpu.deploy.operands label is set to false Previously, setting the nvidia.com/gpu.deploy.operands label to 'false' would remove all GPU Operator-related pods from a node. This commit makes it so that the driver no longer gets removed when the gpu.deploy.operands label is set to false. To manually remove the driver pod from a node, a user has to explicitly label the node with nvidia.com/gpu.deploy.driver=false. This change provides an extra guardrail for the driver pod, whose removal from a node is highly disruptive. Additionally, this change is motivated by our future plans to integrate the NVIDIA DRA Driver for GPUs with the GPU Operator. In particular, this change helps provide a possible migration story from the k8s-device-plugin software stack to the DRA driver software stack that leverages the nvidia.com/gpu.deploy.operands label to switch between the respective software components.
Signed-off-by: Christopher Desiniotis --- controllers/state_manager.go | 12 +++++-- controllers/state_manager_test.go | 45 ++++++++++++++++++++++++ tests/scripts/verify-disable-operands.sh | 3 +- 3 files changed, 55 insertions(+), 5 deletions(-) diff --git a/controllers/state_manager.go b/controllers/state_manager.go index 45b890df2d..d4359361f3 100644 --- a/controllers/state_manager.go +++ b/controllers/state_manager.go @@ -71,6 +71,7 @@ const ( gpuWorkloadConfigContainer = "container" gpuWorkloadConfigVMPassthrough = "vm-passthrough" gpuWorkloadConfigVMVgpu = "vm-vgpu" + driverDeployLabelKey = "nvidia.com/gpu.deploy.driver" kubevirtDevicePluginDeployLabelKey = "nvidia.com/gpu.deploy.sandbox-device-plugin" kataDevicePluginDeployLabelKey = "nvidia.com/gpu.deploy.kata-sandbox-device-plugin" podSecurityLabelPrefix = "pod-security.kubernetes.io/" @@ -395,10 +396,15 @@ func removeAllGPUStateLabels(labels map[string]string) bool { // updateGPUStateLabels returns true if the input labels map is modified. func (w *gpuWorkloadConfiguration) updateGPUStateLabels(labels map[string]string) bool { if hasOperandsDisabled(labels) { - // Operands are disabled, delete all GPU state labels + // Operands are disabled: remove all GPU state labels except the driver label. 
w.log.Info("Operands are disabled for node", "NodeName", w.node, "Label", commonOperandsLabelKey, "Value", "false") - w.log.Info("Disabling all operands for node", "NodeName", w.node) - return removeAllGPUStateLabels(labels) + w.log.Info("Disabling all operands for node (except the GPU driver)", "NodeName", w.node) + driverLabelValue, hasDriverLabel := labels[driverDeployLabelKey] + modified := removeAllGPUStateLabels(labels) + if hasDriverLabel { + labels[driverDeployLabelKey] = driverLabelValue + } + return modified } removed := w.removeGPUStateLabels(labels) added := w.addGPUStateLabels(labels) diff --git a/controllers/state_manager_test.go b/controllers/state_manager_test.go index 6585de1961..0858ed142d 100644 --- a/controllers/state_manager_test.go +++ b/controllers/state_manager_test.go @@ -21,6 +21,7 @@ import ( "errors" "testing" + "github.com/go-logr/logr" "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -207,6 +208,50 @@ func TestHasOperandsDisabled(t *testing.T) { } } +func TestUpdateGPUStateLabels_OperandsDisabled(t *testing.T) { + log := logr.Discard() + w := &gpuWorkloadConfiguration{config: gpuWorkloadConfigContainer, node: "test-node", log: log} + + t.Run("preserves driver label when operands disabled", func(t *testing.T) { + labels := map[string]string{ + commonOperandsLabelKey: "false", + driverDeployLabelKey: "true", + "nvidia.com/gpu.deploy.container-toolkit": "true", + "nvidia.com/gpu.deploy.device-plugin": "true", + "nvidia.com/gpu.deploy.gpu-feature-discovery": "true", + } + modified := w.updateGPUStateLabels(labels) + require.True(t, modified) + require.Equal(t, "true", labels[driverDeployLabelKey], "driver label must be preserved when operands disabled") + require.NotContains(t, labels, "nvidia.com/gpu.deploy.container-toolkit") + require.NotContains(t, labels, "nvidia.com/gpu.deploy.device-plugin") + require.NotContains(t, labels, 
"nvidia.com/gpu.deploy.gpu-feature-discovery") + }) + + t.Run("driver label not added if absent when operands disabled", func(t *testing.T) { + labels := map[string]string{ + commonOperandsLabelKey: "false", + "nvidia.com/gpu.deploy.container-toolkit": "true", + } + modified := w.updateGPUStateLabels(labels) + require.True(t, modified) + require.NotContains(t, labels, driverDeployLabelKey, "driver label must not be added if it was not present") + require.NotContains(t, labels, "nvidia.com/gpu.deploy.container-toolkit") + }) + + t.Run("preserves driver label set to false when operands disabled", func(t *testing.T) { + labels := map[string]string{ + commonOperandsLabelKey: "false", + driverDeployLabelKey: "false", + "nvidia.com/gpu.deploy.container-toolkit": "true", + } + modified := w.updateGPUStateLabels(labels) + require.True(t, modified) + require.Equal(t, "false", labels[driverDeployLabelKey], "explicitly-disabled driver label must be preserved") + require.NotContains(t, labels, "nvidia.com/gpu.deploy.container-toolkit") + }) +} + func TestHasNFDLabels(t *testing.T) { tests := []struct { labels map[string]string diff --git a/tests/scripts/verify-disable-operands.sh b/tests/scripts/verify-disable-operands.sh index 66c5bb85c9..c0aad843c8 100755 --- a/tests/scripts/verify-disable-operands.sh +++ b/tests/scripts/verify-disable-operands.sh @@ -11,8 +11,7 @@ source ${SCRIPT_DIR}/.definitions.sh # Import the check definitions source ${SCRIPT_DIR}/checks.sh -# We verify that all GPU Operator operands have been deleted -check_pod_deleted "nvidia-driver-daemonset" +# We verify that all GPU Operator operands have been deleted, except the driver. check_pod_deleted "nvidia-container-toolkit-daemonset" check_pod_deleted "nvidia-device-plugin-daemonset" check_pod_deleted "nvidia-dcgm-exporter"