Skip to content

Commit 3aa7aa4

Browse files
authored
[AKS] az aks nodepool add/update --enable-managed-gpu option (#9704)
1 parent 2718c5d commit 3aa7aa4

File tree

10 files changed

+365
-6
lines changed

10 files changed

+365
-6
lines changed

src/aks-preview/HISTORY.rst

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,10 @@ To release a new version, please select a new version number (usually plus 1 to
1212
Pending
1313
+++++++
1414
* Fix `match_condition` kwarg leaking to HTTP transport by overriding `put_mc` and `add_agentpool` to pass `if_match` / `if_none_match` directly to the vendored SDK. This change fixes the compatibility issue as azure-cli/acs module adopts TypeSpec emitted SDKs while azure-cli-extensions/aks-preview still uses the autorest emitted SDK.
15-
+ `az aks list-vm-skus`: New command to list available VM SKUs for AKS clusters in a given region.
15+
* `az aks list-vm-skus`: New command to list available VM SKUs for AKS clusters in a given region.
16+
* `az aks nodepool add` / `az aks nodepool update`: Add `--enable-managed-gpu` option to enable the managed GPU experience on a node pool.
17+
18+
1619

1720
19.0.0b27
1821
+++++++

src/aks-preview/azcli_aks_live_test/configs/ext_matrix_default.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,9 @@
2222
],
2323
"gpu, no quota": [
2424
"test_aks_nodepool_add_with_gpu_instance_profile",
25-
"test_aks_gpu_driver_type"
25+
"test_aks_gpu_driver_type",
26+
"test_aks_nodepool_add_with_enable_managed_gpu",
27+
"test_aks_nodepool_update_with_enable_managed_gpu"
2628
],
2729
"pod ip allocation mode static block, missing feature registration": [
2830
"test_aks_create_with_pod_ip_allocation_mode_static_block"

src/aks-preview/azext_aks_preview/_consts.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,10 @@
7676
CONST_GPU_DRIVER_INSTALL = "Install"
7777
CONST_GPU_DRIVER_NONE = "None"
7878

79+
# gpu management mode
80+
CONST_GPU_MANAGEMENT_MODE_MANAGED = "Managed"
81+
CONST_GPU_MANAGEMENT_MODE_UNMANAGED = "Unmanaged"
82+
7983
# consts for ManagedCluster
8084
# load balancer sku
8185
CONST_LOAD_BALANCER_SKU_BASIC = "basic"

src/aks-preview/azext_aks_preview/_help.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2203,6 +2203,9 @@
22032203
- name: --enable-artifact-streaming
22042204
type: bool
22052205
short-summary: Enable artifact streaming for VirtualMachineScaleSets managed by a node pool, to speed up the cold-start of containers on a node through on-demand image loading. To use this feature, container images must also enable artifact streaming on ACR. If not specified, the default is false.
2206+
- name: --enable-managed-gpu
2207+
type: bool
2208+
short-summary: Enable the Managed GPU experience, which installs additional components like DCGM metrics for monitoring on top of the GPU driver. For more details, visit aka.ms/aks/managed-gpu.
22062209
- name: --skip-gpu-driver-install
22072210
type: bool
22082211
short-summary: To skip GPU driver auto installation by AKS on a nodepool using GPU vm size if customers want to manage GPU driver installation by their own. If not specified, the default is false.
@@ -2419,6 +2422,9 @@
24192422
- name: --enable-artifact-streaming
24202423
type: bool
24212424
short-summary: Enable artifact streaming for VirtualMachineScaleSets managed by a node pool, to speed up the cold-start of containers on a node through on-demand image loading. To use this feature, container images must also enable artifact streaming on ACR. If not specified, the default is false.
2425+
- name: --enable-managed-gpu
2426+
type: bool
2427+
short-summary: Enable the Managed GPU experience, which installs additional components like DCGM metrics for monitoring on top of the GPU driver. For more details, visit aka.ms/aks/managed-gpu.
24222428
- name: --os-sku
24232429
type: string
24242430
short-summary: The os-sku of the agent node pool.

src/aks-preview/azext_aks_preview/_params.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2031,6 +2031,12 @@ def load_arguments(self, _):
20312031
validator=validate_artifact_streaming,
20322032
is_preview=True,
20332033
)
2034+
c.argument(
2035+
"enable_managed_gpu",
2036+
action="store_true",
2037+
is_preview=True,
2038+
help="Enable the Managed GPU experience.",
2039+
)
20342040
c.argument(
20352041
"node_public_ip_tags",
20362042
arg_type=tags_type,
@@ -2140,6 +2146,12 @@ def load_arguments(self, _):
21402146
validator=validate_artifact_streaming,
21412147
is_preview=True,
21422148
)
2149+
c.argument(
2150+
"enable_managed_gpu",
2151+
action="store_true",
2152+
is_preview=True,
2153+
help="Enable the Managed GPU experience.",
2154+
)
21432155
c.argument(
21442156
"os_sku",
21452157
arg_type=get_enum_type(node_os_skus_update),

src/aks-preview/azext_aks_preview/agentpool_decorator.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,10 @@
4444
CONST_DEFAULT_WINDOWS_VMS_VM_SIZE,
4545
CONST_MANAGED_CLUSTER_SKU_NAME_AUTOMATIC,
4646
CONST_SSH_ACCESS_LOCALUSER,
47+
CONST_GPU_DRIVER_INSTALL,
4748
CONST_GPU_DRIVER_NONE,
49+
CONST_GPU_MANAGEMENT_MODE_MANAGED,
50+
CONST_GPU_MANAGEMENT_MODE_UNMANAGED,
4851
CONST_NODEPOOL_MODE_MANAGEDSYSTEM,
4952
CONST_NODEPOOL_MODE_MACHINES,
5053
)
@@ -587,6 +590,27 @@ def get_enable_artifact_streaming(self) -> bool:
587590
enable_artifact_streaming = self.agentpool.artifact_streaming_profile.enabled
588591
return enable_artifact_streaming
589592

593+
def get_enable_managed_gpu(self) -> Union[bool, None]:
    """Obtain the value of enable_managed_gpu.

    The raw command parameter is used unless the decorator runs in create
    mode and the attached `agentpool` object already carries an NVIDIA
    management mode; in that case the result reflects whether that mode
    equals CONST_GPU_MANAGEMENT_MODE_MANAGED.

    :return: bool or None
    """
    # value exactly as supplied by the command (may be None when absent)
    raw_value = self.raw_param.get("enable_managed_gpu")

    # outside create mode the raw parameter is authoritative
    if self.decorator_mode != DecoratorMode.CREATE:
        return raw_value

    # walk agentpool -> gpu_profile -> nvidia -> management_mode, stopping at the first missing link
    gpu_profile = self.agentpool.gpu_profile if self.agentpool else None
    nvidia_profile = gpu_profile.nvidia if gpu_profile is not None else None
    management_mode = nvidia_profile.management_mode if nvidia_profile is not None else None

    if management_mode is None:
        return raw_value
    return management_mode == CONST_GPU_MANAGEMENT_MODE_MANAGED
613+
590614
def get_pod_ip_allocation_mode(self: bool = False) -> Union[str, None]:
591615
"""Get the value of pod_ip_allocation_mode.
592616
:return: str or None
@@ -1276,6 +1300,21 @@ def set_up_artifact_streaming(self, agentpool: AgentPool) -> AgentPool:
12761300
agentpool.artifact_streaming_profile.enabled = True
12771301
return agentpool
12781302

1303+
def set_up_managed_gpu(self, agentpool: AgentPool) -> AgentPool:
    """Set up managed GPU property for the AgentPool object.

    When the context reports managed GPU as enabled, the GPU profile and its
    NVIDIA sub-profile are created if missing, the management mode is set to
    CONST_GPU_MANAGEMENT_MODE_MANAGED and the driver to
    CONST_GPU_DRIVER_INSTALL. Otherwise the object is returned untouched.

    :return: the AgentPool object
    """
    self._ensure_agentpool(agentpool)

    # nothing to configure unless the option was turned on
    if not self.context.get_enable_managed_gpu():
        return agentpool

    if agentpool.gpu_profile is None:
        agentpool.gpu_profile = self.models.GPUProfile()  # pylint: disable=no-member
    gpu_profile = agentpool.gpu_profile

    if gpu_profile.nvidia is None:
        gpu_profile.nvidia = self.models.NvidiaGPUProfile()  # pylint: disable=no-member

    gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_MANAGED
    # enabling managed GPU also sets the driver to "Install"
    gpu_profile.driver = CONST_GPU_DRIVER_INSTALL
    return agentpool
1317+
12791318
def set_up_ssh_access(self, agentpool: AgentPool) -> AgentPool:
12801319
self._ensure_agentpool(agentpool)
12811320

@@ -1510,6 +1549,8 @@ def construct_agentpool_profile_preview(self) -> AgentPool:
15101549
agentpool = self.set_up_init_taints(agentpool)
15111550
# set up artifact streaming
15121551
agentpool = self.set_up_artifact_streaming(agentpool)
1552+
# set up managed gpu
1553+
agentpool = self.set_up_managed_gpu(agentpool)
15131554
# set up skip_gpu_driver_install
15141555
agentpool = self.set_up_skip_gpu_driver_install(agentpool)
15151556
# set up gpu profile
@@ -1704,6 +1745,29 @@ def update_artifact_streaming(self, agentpool: AgentPool) -> AgentPool:
17041745
agentpool.artifact_streaming_profile.enabled = True
17051746
return agentpool
17061747

1748+
def update_managed_gpu(self, agentpool: AgentPool) -> AgentPool:
    """Update managed GPU property for the AgentPool object.

    - None from the context: the agent pool is returned unchanged.
    - Truthy: the GPU/NVIDIA profiles are created if missing, the management
      mode is set to CONST_GPU_MANAGEMENT_MODE_MANAGED and the driver to
      CONST_GPU_DRIVER_INSTALL.
    - Falsy (not None): an existing NVIDIA profile is switched to
      CONST_GPU_MANAGEMENT_MODE_UNMANAGED; nothing is created.

    :return: the AgentPool object
    """
    self._ensure_agentpool(agentpool)

    requested = self.context.get_enable_managed_gpu()

    # NOTE(review): `aks_agentpool_update` declares `enable_managed_gpu=False`.
    # If that default reaches this method, this guard never triggers and every
    # update of a pool with an NVIDIA profile is switched to Unmanaged --
    # confirm that is intended.
    if requested is None:
        return agentpool

    if not requested:
        # only touch an NVIDIA profile that already exists
        if agentpool.gpu_profile and agentpool.gpu_profile.nvidia:
            agentpool.gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_UNMANAGED
        return agentpool

    if agentpool.gpu_profile is None:
        agentpool.gpu_profile = self.models.GPUProfile()  # pylint: disable=no-member
    if agentpool.gpu_profile.nvidia is None:
        agentpool.gpu_profile.nvidia = self.models.NvidiaGPUProfile()  # pylint: disable=no-member
    agentpool.gpu_profile.nvidia.management_mode = CONST_GPU_MANAGEMENT_MODE_MANAGED
    agentpool.gpu_profile.driver = CONST_GPU_DRIVER_INSTALL
    return agentpool
1770+
17071771
def update_os_sku(self, agentpool: AgentPool) -> AgentPool:
17081772
self._ensure_agentpool(agentpool)
17091773

@@ -1828,6 +1892,9 @@ def update_agentpool_profile_preview(self, agentpools: List[AgentPool] = None) -
18281892
# update artifact streaming
18291893
agentpool = self.update_artifact_streaming(agentpool)
18301894

1895+
# update managed gpu
1896+
agentpool = self.update_managed_gpu(agentpool)
1897+
18311898
# update secure boot
18321899
agentpool = self.update_secure_boot(agentpool)
18331900

src/aks-preview/azext_aks_preview/custom.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1919,6 +1919,7 @@ def aks_agentpool_add(
19191919
asg_ids=None,
19201920
node_public_ip_tags=None,
19211921
enable_artifact_streaming=False,
1922+
enable_managed_gpu=False,
19221923
skip_gpu_driver_install=False,
19231924
gpu_driver=None,
19241925
driver_type=None,
@@ -1993,6 +1994,7 @@ def aks_agentpool_update(
19931994
allowed_host_ports=None,
19941995
asg_ids=None,
19951996
enable_artifact_streaming=False,
1997+
enable_managed_gpu=False,
19961998
os_sku=None,
19971999
ssh_access=None,
19982000
yes=False,

0 commit comments

Comments
 (0)