Skip to content

Commit c20eb25

Browse files
James-QiuHaoran (Haoran Qiu) and goiri
authored
Derive multi-requests configurations like initial replicas and per request latency (#309)
Missing PR: #294 Co-authored-by: Haoran Qiu <haoranqiu@microsoft.com> Co-authored-by: Íñigo Goiri <elgoiri@gmail.com>
1 parent 8efe895 commit c20eb25

6 files changed

Lines changed: 596 additions & 525 deletions

File tree

simulator/cost_estimator_multirequests.ipynb

Lines changed: 205 additions & 371 deletions
Large diffs are not rendered by default.

simulator/multirequests.py

Lines changed: 179 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -1,136 +1,205 @@
11
from __future__ import annotations
22

33
import math
4+
import os
5+
from dataclasses import replace
46

57
from sim_types import GPUType
68
from sim_types import Model
79
from sim_types import QualityLevel
810
from sim_types import RESOLUTION_PIXELS
11+
from sim_types import Result
912
from sim_types import WorkflowConfig
1013
from sim_types import LatencyData
1114

15+
from data_loading import load_latency_data
16+
from data_loading import load_power_data
17+
from data_loading import load_adaptive_quality_data
18+
19+
from workflows import PODCAST_WORKFLOW
20+
21+
from policies import STREAMWISE_POLICY
22+
23+
from auto_model_allocator import AutoModelAllocator
24+
1225

1326
# Queries per minute
1427
QPM_LIST = [0.1, 1, 2, 5, 10, 20, 30, 50, 100]
1528

29+
# Resolve the data directory relative to this file so imports work from any cwd.
30+
_DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
1631

17-
# Single quality
18-
TIME_PER_REQ: dict[GPUType, dict[Model, float]] = {
19-
GPUType.A100: {
20-
Model.FLUX: 9.75,
21-
Model.HF: 123.08,
22-
Model.HF_VAE: 114.32,
23-
Model.FT: 130.82,
24-
Model.FT_VAE: 52.4, # TODO proper value
25-
Model.UPSCALER: 126.83,
26-
Model.GEMMA: 6.5 + 42 * 0.6, # First scene + per scene
27-
Model.OTHERS: 43 * 0.6, # Kokoro: 43 scenes at 0.6 seconds each
28-
},
29-
GPUType.H100: {
30-
Model.FT: 130.82 / 2.2, # hunyuanframepackf1_time_per_req
31-
Model.FT_VAE: 52.4 / 2.2, # TODO proper value
32-
}
32+
33+
# ---------------------------------------------------------------------------
34+
# Hardware budget — the Pareto-optimal operating point used in the paper.
35+
# ---------------------------------------------------------------------------
36+
HARDWARE_BUDGET: dict[GPUType, int] = {
37+
GPUType.A100: 256,
38+
GPUType.H100: 64,
3339
}
3440

3541

36-
# Optimal point in Pareto Frontier for StreamWise
37-
INIT_REPLICAS: dict[GPUType, dict[Model, int]] = {
38-
GPUType.A100: {
39-
Model.OTHERS: 1,
40-
Model.GEMMA: 1,
41-
Model.FLUX: 1,
42-
Model.HF: 12,
43-
Model.HF_VAE: 3,
44-
Model.FT: 172,
45-
Model.FT_VAE: 10, # TODO proper value
46-
Model.UPSCALER: 21,
47-
},
48-
GPUType.H100: {
49-
Model.FT: 78,
50-
Model.FT_VAE: 1, # TODO proper value
51-
}
52-
}
42+
# ---------------------------------------------------------------------------
43+
# Derivation helpers
44+
# ---------------------------------------------------------------------------
45+
46+
def _extract_from_result(
47+
result: Result,
48+
) -> tuple[dict[GPUType, dict[Model, int]], dict[GPUType, dict[Model, float]]]:
49+
"""Extract init_replicas (GPU counts) and time_per_req from a simulation result.
50+
51+
Returns
52+
-------
53+
init_replicas:
54+
``{gpu_type: {model: total_gpus}}`` — total GPU count allocated to each
55+
model on each GPU type (i.e. ``devices × replicas`` summed across instances).
56+
time_per_req:
57+
``{gpu_type: {model: seconds}}`` — wall-clock time for the model to process
58+
one full request (10-min video) given the allocated resources. When a model
59+
has multiple instances on the same GPU type, we take the *maximum* time
60+
(the bottleneck).
61+
"""
62+
init_replicas: dict[GPUType, dict[Model, int]] = {}
63+
time_per_req: dict[GPUType, dict[Model, float]] = {}
64+
65+
for gpu_type, model_allocs in result.models.items():
66+
init_replicas[gpu_type] = {}
67+
time_per_req[gpu_type] = {}
68+
for model, allocs in model_allocs.items():
69+
total_gpus = sum(a.get_num_gpus() for a in allocs)
70+
times = [a.time for a in allocs if a.get_num_gpus() > 0]
71+
if total_gpus > 0:
72+
init_replicas[gpu_type][model] = total_gpus
73+
time_per_req[gpu_type][model] = max(times) if times else 0.0
74+
75+
return init_replicas, time_per_req
76+
77+
78+
def derive_multirequest_params(
79+
budget: dict[GPUType, int] | None = None,
80+
data_dir: str = _DATA_DIR,
81+
) -> tuple[dict[GPUType, dict[Model, int]], dict[GPUType, dict[Model, float]]]:
82+
"""Run the StreamWise simulator and derive multi-request parameters.
83+
84+
Runs the greedy allocator with ``STREAMWISE_POLICY`` on ``PODCAST_WORKFLOW``
85+
at the given hardware *budget* and extracts:
86+
87+
* **init_replicas** — total GPU count per model per GPU type
88+
* **time_per_req** — total time (seconds) per request per model per GPU type
89+
90+
Parameters
91+
----------
92+
budget:
93+
``{GPUType: num_gpus}`` hardware budget to allocate.
94+
Defaults to ``HARDWARE_BUDGET`` when ``None``.
95+
data_dir:
96+
Path to the latency/power CSV data directory.
97+
"""
98+
if budget is None:
99+
budget = dict(HARDWARE_BUDGET)
100+
latency_data = load_latency_data(data_dir=data_dir)
101+
power_data = load_power_data(data_dir=data_dir)
102+
103+
allocator = AutoModelAllocator(
104+
workflow=PODCAST_WORKFLOW,
105+
latency_data=latency_data,
106+
power_data=power_data,
107+
policy=STREAMWISE_POLICY,
108+
)
109+
result = allocator.allocate(
110+
num_gpus=budget,
111+
verbose=False,
112+
)
53113

54-
# Adaptive quality
55-
# Time per request in seconds
56-
TIME_PER_REQ_ADAPTIVE: dict[GPUType, dict[Model, dict[QualityLevel, float]]] = {
57-
GPUType.A100: {
58-
Model.GEMMA: {
59-
# Same quality for all levels: First + per scene
60-
QualityLevel.LOW: 2.3 + 42 * 0.176,
61-
QualityLevel.MEDIUM: 2.3 + 42 * 0.176,
62-
QualityLevel.HIGH: 2.3 + 42 * 0.176,
63-
},
64-
Model.OTHERS: {
65-
# Kokoro: 42 scenes at 0.6 seconds each
66-
QualityLevel.LOW: 43 * 0.6,
67-
QualityLevel.MEDIUM: 43 * 0.6,
68-
QualityLevel.HIGH: 43 * 0.6,
69-
},
70-
Model.FLUX: {
71-
QualityLevel.LOW: 0.10,
72-
QualityLevel.MEDIUM: 0.81,
73-
QualityLevel.HIGH: 0.95,
74-
},
75-
Model.HF: {
76-
QualityLevel.LOW: 3.41,
77-
QualityLevel.MEDIUM: 8.06,
78-
QualityLevel.HIGH: 27.1,
79-
},
80-
Model.HF_VAE: {
81-
QualityLevel.LOW: 0.75,
82-
QualityLevel.MEDIUM: 3.18,
83-
QualityLevel.HIGH: 52.4,
84-
},
85-
Model.UPSCALER: {
86-
QualityLevel.LOW: 2.01,
87-
QualityLevel.MEDIUM: 8.30,
88-
QualityLevel.HIGH: 34.4,
89-
},
90-
},
91-
GPUType.H100: {
92-
Model.HF_VAE: {
93-
QualityLevel.LOW: 0.75,
94-
QualityLevel.MEDIUM: 3.18,
95-
QualityLevel.HIGH: 52.4,
96-
},
97-
Model.FT: {
98-
QualityLevel.LOW: 8.74,
99-
QualityLevel.MEDIUM: 39.62,
100-
QualityLevel.HIGH: 131.14,
101-
},
102-
Model.FT_VAE: { # TODO proper values
103-
QualityLevel.LOW: 0.75,
104-
QualityLevel.MEDIUM: 3.18,
105-
QualityLevel.HIGH: 52.4,
106-
},
107-
Model.UPSCALER: {
108-
QualityLevel.LOW: 2.01,
109-
QualityLevel.MEDIUM: 8.30,
110-
QualityLevel.HIGH: 34.4,
111-
},
112-
}
113-
}
114+
return _extract_from_result(result)
115+
116+
117+
def derive_adaptive_params(
118+
budget: dict[GPUType, int] | None = None,
119+
data_dir: str = _DATA_DIR,
120+
) -> tuple[
121+
dict[GPUType, dict[Model, int]],
122+
dict[GPUType, dict[Model, dict[QualityLevel, float]]],
123+
]:
124+
"""Run the simulator at each quality level and derive adaptive parameters.
125+
126+
Returns
127+
-------
128+
init_replicas_adaptive:
129+
``{gpu_type: {model: total_gpus}}`` from the HIGH-quality simulation run
130+
(the worst-case / most-demanding quality level sets the base allocation).
131+
time_per_req_adaptive:
132+
``{gpu_type: {model: {quality: seconds}}}`` — per-quality time per request,
133+
every ``(gpu_type, model)`` in ``init_replicas_adaptive`` has a timing
134+
entry for every quality level.
135+
"""
136+
if budget is None:
137+
budget = dict(HARDWARE_BUDGET)
138+
139+
power_data = load_power_data(data_dir=data_dir)
140+
141+
qualities = [QualityLevel.HIGH, QualityLevel.MEDIUM, QualityLevel.LOW]
142+
results_by_quality: dict[QualityLevel, Result] = {}
143+
for quality in qualities:
144+
policy = replace(STREAMWISE_POLICY)
145+
policy.name = f"{STREAMWISE_POLICY.name} {quality.value}"
146+
147+
latency_data = load_adaptive_quality_data(
148+
data_dir=data_dir,
149+
level=quality,
150+
)
114151

152+
allocator = AutoModelAllocator(
153+
workflow=PODCAST_WORKFLOW,
154+
latency_data=latency_data,
155+
power_data=power_data,
156+
policy=policy,
157+
)
158+
result = allocator.allocate(
159+
num_gpus=budget,
160+
verbose=False,
161+
)
162+
results_by_quality[quality] = result
115163

116-
# This is a point in the Pareto Frontier found via simulation
117-
INIT_REPLICAS_ADAPTIVE: dict[GPUType, dict[Model, int]] = {
118-
GPUType.A100: {
119-
Model.OTHERS: 1, # Kokoro
120-
Model.GEMMA: 8,
121-
Model.FLUX: 16,
122-
Model.HF: 25,
123-
Model.HF_VAE: 10,
124-
Model.UPSCALER: 5,
125-
},
126-
GPUType.H100: {
127-
Model.HF_VAE: 1,
128-
Model.FT: 96,
129-
Model.FT_VAE: 1, # Proper values
130-
Model.UPSCALER: 38,
131-
}
132-
}
164+
init_replicas_adaptive, time_per_req_high = _extract_from_result(
165+
results_by_quality[QualityLevel.HIGH],
166+
)
133167

168+
time_per_req_by_quality: dict[QualityLevel, dict[GPUType, dict[Model, float]]] = {}
169+
for quality, result in results_by_quality.items():
170+
_, time_per_req_q = _extract_from_result(result)
171+
time_per_req_by_quality[quality] = time_per_req_q
172+
173+
time_per_req_adaptive: dict[GPUType, dict[Model, dict[QualityLevel, float]]] = {}
174+
for gpu_type, models in init_replicas_adaptive.items():
175+
time_per_req_adaptive[gpu_type] = {}
176+
for model in models:
177+
high_time = time_per_req_high[gpu_type][model]
178+
quality_times: dict[QualityLevel, float] = {}
179+
for quality in qualities:
180+
quality_times[quality] = (
181+
time_per_req_by_quality
182+
.get(quality, {})
183+
.get(gpu_type, {})
184+
.get(model, high_time)
185+
)
186+
time_per_req_adaptive[gpu_type][model] = quality_times
187+
188+
return init_replicas_adaptive, time_per_req_adaptive
189+
190+
191+
# ---------------------------------------------------------------------------
192+
# Derived constants — computed by running the simulator at HARDWARE_BUDGET.
193+
#
194+
# TIME_PER_REQ / INIT_REPLICAS: single (HIGH) quality operating point.
195+
# TIME_PER_REQ_ADAPTIVE / INIT_REPLICAS_ADAPTIVE: per-quality-level values.
196+
# ---------------------------------------------------------------------------
197+
INIT_REPLICAS, TIME_PER_REQ = derive_multirequest_params(budget=dict(HARDWARE_BUDGET))
198+
INIT_REPLICAS_ADAPTIVE, TIME_PER_REQ_ADAPTIVE = derive_adaptive_params(budget=dict(HARDWARE_BUDGET))
199+
200+
# Allocation of video frames across quality levels for a 10-minute output
201+
# to fulfill a TTFF SLO. These are configurable weights used in adaptive
202+
# quality cost aggregation.
134203
QUALITY_PORTIONS = {
135204
QualityLevel.LOW: 112,
136205
QualityLevel.MEDIUM: 305,

tests/simulator/test_evaluator.py

Lines changed: 20 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
# Add current path
66
sys.path.append(os.getcwd())
77

8+
from tests.test_utils import assert_equals_approx
89
from tests.test_utils import temp_sys_path
910

1011
with temp_sys_path("simulator"):
@@ -103,11 +104,11 @@ def test_8A() -> None:
103104

104105
assert result.gpus_used == {GPUType.A100: 8}
105106
assert result.gpus_total == {GPUType.A100: 8}
106-
_assert_equals_approx(result.total_time_s, 15953.68)
107-
_assert_equals_approx(result.ttff_s, 15353.68)
108-
_assert_equals_approx(result.first_chunk_time, 169.34)
109-
_assert_equals_approx(result.total_energy / SECONDS_IN_HOUR / 1000, 4.31)
110-
_assert_equals_approx(result.cost, 37.93)
107+
assert_equals_approx(result.total_time_s, 15953.68)
108+
assert_equals_approx(result.ttff_s, 15353.68)
109+
assert_equals_approx(result.first_chunk_time, 169.34)
110+
assert_equals_approx(result.total_energy / SECONDS_IN_HOUR / 1000, 4.31)
111+
assert_equals_approx(result.cost, 37.93)
111112

112113

113114
def test_16H() -> None:
@@ -151,11 +152,11 @@ def test_16H() -> None:
151152
assert result is not None
152153
assert result.gpus_used == {GPUType.H200: 14}
153154
assert result.gpus_total == {GPUType.H200: 16}
154-
_assert_equals_approx(result.total_time_s, 4062.7)
155-
_assert_equals_approx(result.ttff_s, 3462.7)
156-
_assert_equals_approx(result.first_chunk_time, 51.68)
157-
_assert_equals_approx(result.total_energy / SECONDS_IN_HOUR / 1000, 2.87)
158-
_assert_equals_approx(result.cost, 66.67)
155+
assert_equals_approx(result.total_time_s, 4062.7)
156+
assert_equals_approx(result.ttff_s, 3462.7)
157+
assert_equals_approx(result.first_chunk_time, 51.68)
158+
assert_equals_approx(result.total_energy / SECONDS_IN_HOUR / 1000, 2.87)
159+
assert_equals_approx(result.cost, 66.67)
159160

160161

161162
def test_cost_optimal() -> None:
@@ -274,27 +275,19 @@ def test_cost_optimal() -> None:
274275
GPUType.A100: 256,
275276
GPUType.H200: 64,
276277
}
277-
_assert_equals_approx(result.total_time_s, 304.40)
278-
_assert_equals_approx(result.ttff_s, 21.60)
279-
_assert_equals_approx(result.first_chunk_time, 21.60)
280-
_assert_equals_approx(result.total_energy / SECONDS_IN_HOUR / 1000, 5.01)
281-
_assert_equals_approx(result.cost, 46.00)
278+
assert_equals_approx(result.total_time_s, 304.40)
279+
assert_equals_approx(result.ttff_s, 21.60)
280+
assert_equals_approx(result.first_chunk_time, 21.60)
281+
assert_equals_approx(result.total_energy / SECONDS_IN_HOUR / 1000, 5.01)
282+
assert_equals_approx(result.cost, 46.00)
282283

283284
assert models[GPUType.A100][Model.OTHERS][0].devices == 1
284285
assert models[GPUType.A100][Model.OTHERS][0].replicas == 1
285286
assert models[GPUType.A100][Model.OTHERS][0].time_first == 0.60
286287

287288
assert models[GPUType.H200][Model.FT][1].devices == 24
288289
assert models[GPUType.H200][Model.FT][1].replicas == 1
289-
_assert_equals_approx(models[GPUType.H200][Model.FT][1].time, 191.93)
290-
_assert_equals_approx(models[GPUType.H200][Model.FT][1].time_first, 14.78)
291-
_assert_equals_approx(models[GPUType.H200][Model.FT][1].energy / SECONDS_IN_HOUR / 1000, 0.72)
292-
_assert_equals_approx(models[GPUType.H200][Model.FT][1].cost, 8.56)
293-
294-
295-
def _assert_equals_approx(
296-
a: float,
297-
b: float,
298-
tol: float = 0.01
299-
) -> None:
300-
assert abs(a - b) < tol, f"Expected {a:.2f} to be approximately equal to {b:.2f} within tolerance {tol}"
290+
assert_equals_approx(models[GPUType.H200][Model.FT][1].time, 191.93)
291+
assert_equals_approx(models[GPUType.H200][Model.FT][1].time_first, 14.78)
292+
assert_equals_approx(models[GPUType.H200][Model.FT][1].energy / SECONDS_IN_HOUR / 1000, 0.72)
293+
assert_equals_approx(models[GPUType.H200][Model.FT][1].cost, 8.56)

0 commit comments

Comments (0)