Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions controllers/state_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ const (
gpuWorkloadConfigContainer = "container"
gpuWorkloadConfigVMPassthrough = "vm-passthrough"
gpuWorkloadConfigVMVgpu = "vm-vgpu"
driverDeployLabelKey = "nvidia.com/gpu.deploy.driver"
kubevirtDevicePluginDeployLabelKey = "nvidia.com/gpu.deploy.sandbox-device-plugin"
kataDevicePluginDeployLabelKey = "nvidia.com/gpu.deploy.kata-sandbox-device-plugin"
podSecurityLabelPrefix = "pod-security.kubernetes.io/"
Expand Down Expand Up @@ -395,10 +396,15 @@ func removeAllGPUStateLabels(labels map[string]string) bool {
// updateGPUStateLabels returns true if the input labels map is modified.
func (w *gpuWorkloadConfiguration) updateGPUStateLabels(labels map[string]string) bool {
if hasOperandsDisabled(labels) {
// Operands are disabled, delete all GPU state labels
// Operands are disabled: remove all GPU state labels except the driver label.
w.log.Info("Operands are disabled for node", "NodeName", w.node, "Label", commonOperandsLabelKey, "Value", "false")
w.log.Info("Disabling all operands for node", "NodeName", w.node)
return removeAllGPUStateLabels(labels)
w.log.Info("Disabling all operands for node (except the GPU driver)", "NodeName", w.node)
driverLabelValue, hasDriverLabel := labels[driverDeployLabelKey]
modified := removeAllGPUStateLabels(labels)
if hasDriverLabel {
labels[driverDeployLabelKey] = driverLabelValue
}
return modified
}
removed := w.removeGPUStateLabels(labels)
added := w.addGPUStateLabels(labels)
Expand Down
45 changes: 45 additions & 0 deletions controllers/state_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"errors"
"testing"

"github.com/go-logr/logr"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -207,6 +208,50 @@ func TestHasOperandsDisabled(t *testing.T) {
}
}

func TestUpdateGPUStateLabels_OperandsDisabled(t *testing.T) {
log := logr.Discard()
w := &gpuWorkloadConfiguration{config: gpuWorkloadConfigContainer, node: "test-node", log: log}

t.Run("preserves driver label when operands disabled", func(t *testing.T) {
labels := map[string]string{
commonOperandsLabelKey: "false",
driverDeployLabelKey: "true",
"nvidia.com/gpu.deploy.container-toolkit": "true",
"nvidia.com/gpu.deploy.device-plugin": "true",
"nvidia.com/gpu.deploy.gpu-feature-discovery": "true",
}
modified := w.updateGPUStateLabels(labels)
require.True(t, modified)
require.Equal(t, "true", labels[driverDeployLabelKey], "driver label must be preserved when operands disabled")
require.NotContains(t, labels, "nvidia.com/gpu.deploy.container-toolkit")
require.NotContains(t, labels, "nvidia.com/gpu.deploy.device-plugin")
require.NotContains(t, labels, "nvidia.com/gpu.deploy.gpu-feature-discovery")
})

t.Run("driver label not added if absent when operands disabled", func(t *testing.T) {
labels := map[string]string{
commonOperandsLabelKey: "false",
"nvidia.com/gpu.deploy.container-toolkit": "true",
}
modified := w.updateGPUStateLabels(labels)
require.True(t, modified)
require.NotContains(t, labels, driverDeployLabelKey, "driver label must not be added if it was not present")
require.NotContains(t, labels, "nvidia.com/gpu.deploy.container-toolkit")
})

t.Run("preserves driver label set to false when operands disabled", func(t *testing.T) {
labels := map[string]string{
commonOperandsLabelKey: "false",
driverDeployLabelKey: "false",
"nvidia.com/gpu.deploy.container-toolkit": "true",
}
modified := w.updateGPUStateLabels(labels)
require.True(t, modified)
require.Equal(t, "false", labels[driverDeployLabelKey], "explicitly-disabled driver label must be preserved")
require.NotContains(t, labels, "nvidia.com/gpu.deploy.container-toolkit")
})
}

func TestHasNFDLabels(t *testing.T) {
tests := []struct {
labels map[string]string
Expand Down
3 changes: 1 addition & 2 deletions tests/scripts/verify-disable-operands.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@ source ${SCRIPT_DIR}/.definitions.sh
# Import the check definitions
source ${SCRIPT_DIR}/checks.sh

# We verify that all GPU Operator operands have been deleted
check_pod_deleted "nvidia-driver-daemonset"
# We verify that all GPU Operator operands have been deleted, except the driver.
check_pod_deleted "nvidia-container-toolkit-daemonset"
check_pod_deleted "nvidia-device-plugin-daemonset"
check_pod_deleted "nvidia-dcgm-exporter"
Expand Down
Loading