6262 'The container image to use for the Kubernetes scale benchmark.'
6363 'If not specified, the default image will be used.' ,
6464)
# Optional guard: when set, the run is failed unless exactly this many node
# Ready events are observed during scale up. Left as None, no check happens.
EXPECTED_NODES_CREATED = flags.DEFINE_integer(
    name='kubernetes_scale_nodes_created',
    default=None,
    help=(
        'If defined, the benchmark will fail if there are not this many node'
        ' ready events during scale up.'
    ),
)
7171
# Relative path of the manifest template for the Kubernetes scale benchmark
# (presumably a Jinja2-templated YAML file, judging by the `.yaml.j2` suffix;
# the code that renders it is not visible in this section).
7272MANIFEST_TEMPLATE = 'container/kubernetes_scale/kubernetes_scale.yaml.j2'
@@ -196,7 +196,8 @@ def Run(bm_spec: benchmark_spec.BenchmarkSpec) -> list[sample.Sample]:
196196 'node' , start_time , resources_to_ignore = initial_nodes
197197 )
198198 samples += node_samples
199- CheckForNodeFailures (node_samples , initial_nodes )
199+ ValidateNodesCreated (node_samples )
200+ samples += GetStartEndCountSamples (initial_nodes , initial_pods )
200201 metadata = {
201202 'pod_memory' : MEMORY_PER_POD .value ,
202203 'pod_cpu' : CPUS_PER_POD .value ,
@@ -206,8 +207,8 @@ def Run(bm_spec: benchmark_spec.BenchmarkSpec) -> list[sample.Sample]:
206207 if virtual_machine .GPU_COUNT .value :
207208 metadata ['gpu_count' ] = virtual_machine .GPU_COUNT .value
208209 metadata ['gpu_type' ] = virtual_machine .GPU_TYPE .value
209- if VALIDATED_NUM_NODES .value :
210- metadata ['validated_num_nodes' ] = VALIDATED_NUM_NODES .value
210+ if EXPECTED_NODES_CREATED .value :
211+ metadata ['validated_num_nodes' ] = EXPECTED_NODES_CREATED .value
211212 for s in samples :
212213 s .metadata .update (metadata )
213214 return samples
@@ -321,43 +322,91 @@ def _GetSampleByMetricName(
321322 return next ((s for s in samples if s .metric == metric ), None )
322323
323324
324- def CheckForNodeFailures (
def GetStartEndCountSamples(
    initial_nodes: set[str], initial_pods: set[str]
) -> list[sample.Sample]:
  """Returns the number of nodes & pods before & after scale up as samples.

  Also logs a warning (never raises) when the final node count does not match
  the starting node count plus the expected number of created nodes.

  Args:
    initial_nodes: The nodes that were in the cluster before the scale up.
    initial_pods: The pods that were in the cluster before the scale up.

  Returns:
    The initial & final count samples for nodes, followed by those for pods.
  """
  final_nodes = set(
      kubernetes_commands.GetNodeNames(
          suppress_logging=_ShouldSuppressLogging()
      )
  )
  if EXPECTED_NODES_CREATED.value:
    # Base the expectation on how many nodes the cluster really started with
    # instead of assuming exactly one pre-existing node.
    expected_final_node_count = (
        len(initial_nodes) + EXPECTED_NODES_CREATED.value
    )
    if len(final_nodes) != expected_final_node_count:
      logging.warning(
          'Expected to have %d nodes after scale up, but there are %d nodes.'
          ' This is odd behavior, but not wholly unexpected for Autopilot'
          ' clusters.',
          expected_final_node_count,
          len(final_nodes),
      )
  final_pods = set(
      kubernetes_commands.GetPodNames(suppress_logging=_ShouldSuppressLogging())
  )
  samples = []
  samples.extend(_GetResourceCountSamples(initial_nodes, final_nodes, 'node'))
  samples.extend(_GetResourceCountSamples(initial_pods, final_pods, 'pod'))
  return samples
351+
352+
def _GetResourceCountSamples(
    initial_resources: set[str], final_resources: set[str], resource_type: str
) -> list[sample.Sample]:
  """Builds the before & after scale up count samples for one resource type.

  Args:
    initial_resources: The resources present before the scale up.
    final_resources: The resources present after the scale up.
    resource_type: Label used in the metric names & log message, e.g. 'node'.

  Returns:
    Two samples, `initial_<resource_type>_count` then
    `final_<resource_type>_count`, each with unit 'count'.
  """
  initial_count = len(initial_resources)
  final_count = len(final_resources)
  # A scale up that did not grow the resource count is only logged, not fatal.
  if final_count <= initial_count:
    logging.warning(
        'Started with %d %ss and ended with %d %ss after scale up. Expected to '
        'add resources with scale up, but that did not happen. This is odd '
        'behavior, but might not be wholly unexpected for Autopilot clusters.',
        initial_count,
        resource_type,
        final_count,
        resource_type,
    )
  return [
      sample.Sample(f'{stage}_{resource_type}_count', count, 'count')
      for stage, count in (('initial', initial_count), ('final', final_count))
  ]
380+
381+
def ValidateNodesCreated(
    node_samples: list[sample.Sample],
):
  """Fails the benchmark when the node Ready event count is not as expected.

  Does nothing when the expected-nodes-created flag is unset (None).

  Args:
    node_samples: The samples from node transition times, which are searched
      for the 'node_Ready_count' metric.

  Raises:
    RunError: If no node Ready count sample exists, or if its value differs
      from the expected number of created nodes.
  """
  expected_created = EXPECTED_NODES_CREATED.value
  if expected_created is None:
    return

  ready_count_sample = _GetSampleByMetricName(node_samples, 'node_Ready_count')
  if ready_count_sample is None:
    raise errors.Benchmarks.RunError(
        'No node ready events were found & we attempted to scale up to'
        f' {expected_created} nodes.'
    )

  if ready_count_sample.value == expected_created:
    return
  # %d formatting is kept deliberately so a float sample value prints as an
  # integer, exactly as before.
  raise errors.Benchmarks.RunError(
      'Expected %d nodes to be created, but saw %d node ready events.'
      % (expected_created, ready_count_sample.value)
  )
363412
0 commit comments