
Commit 9b68f5d

hubatish authored and copybara-github committed
Loosen node validation requirements but add start/end node/pod counts
Prior validation was failing for Autopilot because the node count goes a little wild. This still works for the simple "ensure I get 1 node ready event for the scale -> 1, then -> 2" test I want it to cover, but allows the actual node count to vary.

PiperOrigin-RevId: 904538165
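As a usage sketch (an assumption, not shown in this diff: PKB is launched via pkb.py and the benchmark registers as kubernetes_scale; only the flag name comes from this change), the 1 -> 2 node scenario above would be configured with:

./pkb.py --benchmarks=kubernetes_scale --kubernetes_scale_nodes_created=1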
1 parent 313bb88 commit 9b68f5d

2 files changed: 102 additions & 39 deletions


perfkitbenchmarker/linux_benchmarks/kubernetes_scale_benchmark.py

Lines changed: 73 additions & 24 deletions
@@ -62,11 +62,11 @@
     'The container image to use for the Kubernetes scale benchmark.'
     'If not specified, the default image will be used.',
 )
-VALIDATED_NUM_NODES = flags.DEFINE_integer(
-    'kubernetes_scale_validated_num_nodes',
+EXPECTED_NODES_CREATED = flags.DEFINE_integer(
+    'kubernetes_scale_nodes_created',
     None,
-    'If defined, the benchmark will fail if the number of nodes is not equal'
-    ' to this value after the scale up.',
+    'If defined, the benchmark will fail if there are not this many node ready'
+    ' events during scale up.',
 )
 
 MANIFEST_TEMPLATE = 'container/kubernetes_scale/kubernetes_scale.yaml.j2'
@@ -196,7 +196,8 @@ def Run(bm_spec: benchmark_spec.BenchmarkSpec) -> list[sample.Sample]:
       'node', start_time, resources_to_ignore=initial_nodes
   )
   samples += node_samples
-  CheckForNodeFailures(node_samples, initial_nodes)
+  ValidateNodesCreated(node_samples)
+  samples += GetStartEndCountSamples(initial_nodes, initial_pods)
   metadata = {
       'pod_memory': MEMORY_PER_POD.value,
       'pod_cpu': CPUS_PER_POD.value,
@@ -206,8 +207,8 @@ def Run(bm_spec: benchmark_spec.BenchmarkSpec) -> list[sample.Sample]:
   if virtual_machine.GPU_COUNT.value:
     metadata['gpu_count'] = virtual_machine.GPU_COUNT.value
     metadata['gpu_type'] = virtual_machine.GPU_TYPE.value
-  if VALIDATED_NUM_NODES.value:
-    metadata['validated_num_nodes'] = VALIDATED_NUM_NODES.value
+  if EXPECTED_NODES_CREATED.value:
+    metadata['validated_num_nodes'] = EXPECTED_NODES_CREATED.value
   for s in samples:
     s.metadata.update(metadata)
   return samples
@@ -321,43 +322,91 @@ def _GetSampleByMetricName(
   return next((s for s in samples if s.metric == metric), None)
 
 
-def CheckForNodeFailures(
+def GetStartEndCountSamples(
+    initial_nodes: set[str], initial_pods: set[str]
+) -> list[sample.Sample]:
+  """Returns the number of nodes & pods before & after scale up as samples."""
+  final_nodes = set(
+      kubernetes_commands.GetNodeNames(
+          suppress_logging=_ShouldSuppressLogging()
+      )
+  )
+  if (
+      EXPECTED_NODES_CREATED.value
+      and len(final_nodes) != EXPECTED_NODES_CREATED.value + 1
+  ):
+    logging.warning(
+        'Expected to have %d nodes after scale up, but there are %d nodes. This'
+        ' is odd behavior, but not wholly unexpected for Autopilot clusters.',
+        EXPECTED_NODES_CREATED.value + 1,
+        len(final_nodes),
+    )
+  final_pods = set(
+      kubernetes_commands.GetPodNames(suppress_logging=_ShouldSuppressLogging())
+  )
+  samples = []
+  samples.extend(_GetResourceCountSamples(initial_nodes, final_nodes, 'node'))
+  samples.extend(_GetResourceCountSamples(initial_pods, final_pods, 'pod'))
+  return samples
+
+
+def _GetResourceCountSamples(
+    initial_resources: set[str], final_resources: set[str], resource_type: str
+) -> list[sample.Sample]:
+  """Returns the number of resources before & after scale up as samples."""
+  if len(initial_resources) >= len(final_resources):
+    logging.warning(
+        'Started with %d %ss and ended with %d %ss after scale up. Expected to '
+        'add resources with scale up, but that did not happen. This is odd '
+        'behavior, but might not be wholly unexpected for Autopilot clusters.',
+        len(initial_resources),
+        resource_type,
+        len(final_resources),
+        resource_type,
+    )
+  samples = [
+      sample.Sample(
+          f'initial_{resource_type}_count',
+          len(initial_resources),
+          'count',
+      ),
+      sample.Sample(
+          f'final_{resource_type}_count',
+          len(final_resources),
+          'count',
+      ),
+  ]
+  return samples
+
+
+def ValidateNodesCreated(
     node_samples: list[sample.Sample],
-    initial_nodes: set[str],
 ):
-  """Fails the benchmark if the wrong number of nodes are present.
+  """Fails the benchmark if the wrong number of nodes were created.
 
   Args:
     node_samples: The samples from node transition times which includes node
       Ready count.
-    initial_nodes: The initial nodes in the cluster.
 
   Raises:
-    RunError: If the number of nodes is not equal to the expected number of
-      nodes.
+    RunError: If the number of node ready events is not as expected.
   """
-  if VALIDATED_NUM_NODES.value is None:
+  if EXPECTED_NODES_CREATED.value is None:
     return
   node_ready_count_sample = _GetSampleByMetricName(
       node_samples, 'node_Ready_count'
   )
   if node_ready_count_sample is None:
     raise errors.Benchmarks.RunError(
         'No node ready events were found & we attempted to scale up to'
-        f' {VALIDATED_NUM_NODES.value} nodes.'
+        f' {EXPECTED_NODES_CREATED.value} nodes.'
     )
-  expected_num_nodes = VALIDATED_NUM_NODES.value - len(initial_nodes)
-  if node_ready_count_sample.value != expected_num_nodes:
+  if node_ready_count_sample.value != EXPECTED_NODES_CREATED.value:
     raise errors.Benchmarks.RunError(
-        'Expected %d nodes to be created, but %d nodes were created &'
-        ' ready.Expected count %d comes from validated num nodes %d - initial'
-        ' nodes %d.'
+        'Expected %d nodes to be created, but saw %d node ready events.'
         % (
-            expected_num_nodes,
+            EXPECTED_NODES_CREATED.value,
            node_ready_count_sample.value,
-            expected_num_nodes,
-            VALIDATED_NUM_NODES.value,
-            len(initial_nodes),
        )
    )
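
To make the loosened check concrete, here is a minimal standalone Python sketch (illustrative names only, not PKB code) of the behavior the new helpers implement for the 1 -> 2 node scenario from the commit message:

# Illustrative sketch: absolute node counts are only reported (and logged as
# warnings when they look odd), while the hard failure is based solely on
# node Ready events.
initial_nodes = {'node-a'}            # nodes present before scale up (assumed)
final_nodes = {'node-a', 'node-b'}    # nodes present after scale up (assumed)
node_ready_events = 1                 # value of the node_Ready_count sample
expected_nodes_created = 1            # --kubernetes_scale_nodes_created=1

# Reported as samples, never a hard failure:
print('initial_node_count:', len(initial_nodes))  # 1
print('final_node_count:', len(final_nodes))      # 2

# The only remaining hard validation, mirroring ValidateNodesCreated:
if node_ready_events != expected_nodes_created:
  raise RuntimeError(
      'Expected %d nodes to be created, but saw %d node ready events.'
      % (expected_nodes_created, node_ready_events)
  )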

tests/linux_benchmarks/kubernetes_scale_benchmark_test.py

Lines changed: 29 additions & 15 deletions
@@ -11,6 +11,7 @@
 from perfkitbenchmarker.linux_benchmarks import kubernetes_scale_benchmark
 from perfkitbenchmarker.resources.container_service import kubectl
 from perfkitbenchmarker.resources.container_service import kubernetes_cluster
+from perfkitbenchmarker.resources.container_service import kubernetes_commands
 from perfkitbenchmarker.resources.container_service import kubernetes_events
 from tests import pkb_common_test_case
 
@@ -491,38 +492,51 @@ def testCheckFailuresThrowsQuotaExceeded(self):
         9,
     )
 
-  @flagsaver.flagsaver(kubernetes_scale_validated_num_nodes=None)
+  @flagsaver.flagsaver(kubernetes_scale_nodes_created=None)
   def testCheckForNodeFailures_NoValidatedNumNodes(self):
-    kubernetes_scale_benchmark.CheckForNodeFailures([], set())
+    kubernetes_scale_benchmark.ValidateNodesCreated([])
 
-  @flagsaver.flagsaver(kubernetes_scale_validated_num_nodes=10)
+  @flagsaver.flagsaver(kubernetes_scale_nodes_created=10)
   def testCheckForNodeFailures_NoSample(self):
     with self.assertRaisesRegex(
         errors.Benchmarks.RunError, 'No node ready events were found'
     ):
-      kubernetes_scale_benchmark.CheckForNodeFailures([], set())
+      kubernetes_scale_benchmark.ValidateNodesCreated([])
 
-  @flagsaver.flagsaver(kubernetes_scale_validated_num_nodes=10)
+  @flagsaver.flagsaver(kubernetes_scale_nodes_created=10)
   def testCheckForNodeFailures_Mismatch(self):
     with self.assertRaises(
         errors.Benchmarks.RunError,
     ):
-      kubernetes_scale_benchmark.CheckForNodeFailures(
+      kubernetes_scale_benchmark.ValidateNodesCreated(
           [sample.Sample('node_Ready_count', 5, 'count')],
-          set(),
       )
 
-  @flagsaver.flagsaver(kubernetes_scale_validated_num_nodes=10)
+  @flagsaver.flagsaver(kubernetes_scale_nodes_created=10)
   def testCheckForNodeFailures_Match(self):
-    kubernetes_scale_benchmark.CheckForNodeFailures(
-        [sample.Sample('node_Ready_count', 10, 'count')], set()
+    kubernetes_scale_benchmark.ValidateNodesCreated(
+        [sample.Sample('node_Ready_count', 10, 'count')]
     )
 
-  @flagsaver.flagsaver(kubernetes_scale_validated_num_nodes=2)
-  def testCheckForNodeFailures_MatchWithInitialNodes(self):
-    kubernetes_scale_benchmark.CheckForNodeFailures(
-        [sample.Sample('node_Ready_count', 1, 'count')], set(['node1'])
-    )
+  def testGetStartEndCountSamples(self):
+    initial_nodes = set(['node1'])
+    initial_pods = set(['pod1'])
+    final_nodes = ['node1', 'node2']
+    final_pods = ['pod1', 'pod2', 'pod3']
+
+    with mock.patch.object(
+        kubernetes_commands, 'GetNodeNames', return_value=final_nodes
+    ), mock.patch.object(
+        kubernetes_commands, 'GetPodNames', return_value=final_pods
+    ):
+      samples = kubernetes_scale_benchmark.GetStartEndCountSamples(
+          initial_nodes, initial_pods
+      )
+    samples_by_metric = _SamplesByMetric(samples)
+    self.assertEqual(samples_by_metric['initial_node_count'].value, 1)
+    self.assertEqual(samples_by_metric['final_node_count'].value, 2)
+    self.assertEqual(samples_by_metric['initial_pod_count'].value, 1)
+    self.assertEqual(samples_by_metric['final_pod_count'].value, 3)
 
 
 if __name__ == '__main__':
