from __future__ import annotations

import math
+import os
+from dataclasses import replace

from sim_types import GPUType
from sim_types import Model
from sim_types import QualityLevel
from sim_types import RESOLUTION_PIXELS
+from sim_types import Result
from sim_types import WorkflowConfig
from sim_types import LatencyData

+from data_loading import load_latency_data
+from data_loading import load_power_data
+from data_loading import load_adaptive_quality_data
+
+from workflows import PODCAST_WORKFLOW
+
+from policies import STREAMWISE_POLICY
+
+from auto_model_allocator import AutoModelAllocator
+

# Queries per minute
QPM_LIST = [0.1, 1, 2, 5, 10, 20, 30, 50, 100]

+# Resolve the data directory relative to this file so data loading works
+# regardless of the current working directory.
+_DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")

-# Single quality
-TIME_PER_REQ: dict[GPUType, dict[Model, float]] = {
-    GPUType.A100: {
-        Model.FLUX: 9.75,
-        Model.HF: 123.08,
-        Model.HF_VAE: 114.32,
-        Model.FT: 130.82,
-        Model.FT_VAE: 52.4,  # TODO proper value
-        Model.UPSCALER: 126.83,
-        Model.GEMMA: 6.5 + 42 * 0.6,  # First scene + per scene
-        Model.OTHERS: 43 * 0.6,  # Kokoro: 43 scenes at 0.6 seconds each
-    },
-    GPUType.H100: {
-        Model.FT: 130.82 / 2.2,  # hunyuanframepackf1_time_per_req
-        Model.FT_VAE: 52.4 / 2.2,  # TODO proper value
-    }
+
+# ---------------------------------------------------------------------------
+# Hardware budget — the Pareto-optimal operating point used in the paper.
+# ---------------------------------------------------------------------------
+HARDWARE_BUDGET: dict[GPUType, int] = {
+    GPUType.A100: 256,
+    GPUType.H100: 64,
}


-# Optimal point in Pareto Frontier for StreamWise
-INIT_REPLICAS: dict[GPUType, dict[Model, int]] = {
-    GPUType.A100: {
-        Model.OTHERS: 1,
-        Model.GEMMA: 1,
-        Model.FLUX: 1,
-        Model.HF: 12,
-        Model.HF_VAE: 3,
-        Model.FT: 172,
-        Model.FT_VAE: 10,  # TODO proper value
-        Model.UPSCALER: 21,
-    },
-    GPUType.H100: {
-        Model.FT: 78,
-        Model.FT_VAE: 1,  # TODO proper value
-    }
-}
+# ---------------------------------------------------------------------------
+# Derivation helpers
+# ---------------------------------------------------------------------------
+
+def _extract_from_result(
+    result: Result,
+) -> tuple[dict[GPUType, dict[Model, int]], dict[GPUType, dict[Model, float]]]:
+    """Extract init_replicas (GPU counts) and time_per_req from a simulation result.
+
+    Returns
+    -------
+    init_replicas:
+        ``{gpu_type: {model: total_gpus}}`` — total GPU count allocated to each
+        model on each GPU type (i.e. ``devices × replicas`` summed across instances).
+    time_per_req:
+        ``{gpu_type: {model: seconds}}`` — wall-clock time for the model to process
+        one full request (10-min video) given the allocated resources. When a model
+        has multiple instances on the same GPU type, we take the *maximum* time
+        (the bottleneck).
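+
+    Example
+    -------
+    Illustrative shapes only (hypothetical values, not real simulator output)::
+
+        init_replicas == {GPUType.A100: {Model.FLUX: 2}}
+        time_per_req == {GPUType.A100: {Model.FLUX: 4.9}}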
+    """
+    init_replicas: dict[GPUType, dict[Model, int]] = {}
+    time_per_req: dict[GPUType, dict[Model, float]] = {}
+
+    for gpu_type, model_allocs in result.models.items():
+        init_replicas[gpu_type] = {}
+        time_per_req[gpu_type] = {}
+        for model, allocs in model_allocs.items():
+            total_gpus = sum(a.get_num_gpus() for a in allocs)
+            times = [a.time for a in allocs if a.get_num_gpus() > 0]
+            if total_gpus > 0:
+                init_replicas[gpu_type][model] = total_gpus
+                time_per_req[gpu_type][model] = max(times) if times else 0.0
+
+    return init_replicas, time_per_req
+
+
+def derive_multirequest_params(
+    budget: dict[GPUType, int] | None = None,
+    data_dir: str = _DATA_DIR,
+) -> tuple[dict[GPUType, dict[Model, int]], dict[GPUType, dict[Model, float]]]:
+    """Run the StreamWise simulator and derive multi-request parameters.
+
+    Runs the greedy allocator with ``STREAMWISE_POLICY`` on ``PODCAST_WORKFLOW``
+    at the given hardware *budget* and extracts:
+
+    * **init_replicas** — total GPU count per model per GPU type
+    * **time_per_req** — per-request time in seconds, per model per GPU type
+
+    Parameters
+    ----------
+    budget:
+        ``{GPUType: num_gpus}`` hardware budget to allocate.
+        Defaults to ``HARDWARE_BUDGET`` when ``None``.
+    data_dir:
+        Path to the latency/power CSV data directory.
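+
+    Example
+    -------
+    A sketch of calling this at a smaller, hypothetical budget::
+
+        replicas, times = derive_multirequest_params(
+            budget={GPUType.A100: 64, GPUType.H100: 16},
+        )
+        ft_seconds = times[GPUType.H100][Model.FT]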
+    """
+    if budget is None:
+        budget = dict(HARDWARE_BUDGET)
+    latency_data = load_latency_data(data_dir=data_dir)
+    power_data = load_power_data(data_dir=data_dir)
+
+    allocator = AutoModelAllocator(
+        workflow=PODCAST_WORKFLOW,
+        latency_data=latency_data,
+        power_data=power_data,
+        policy=STREAMWISE_POLICY,
+    )
+    result = allocator.allocate(
+        num_gpus=budget,
+        verbose=False,
+    )

-# Adaptive quality
-# Time per request in seconds
-TIME_PER_REQ_ADAPTIVE: dict[GPUType, dict[Model, dict[QualityLevel, float]]] = {
-    GPUType.A100: {
-        Model.GEMMA: {
-            # Same quality for all levels: First + per scene
-            QualityLevel.LOW: 2.3 + 42 * 0.176,
-            QualityLevel.MEDIUM: 2.3 + 42 * 0.176,
-            QualityLevel.HIGH: 2.3 + 42 * 0.176,
-        },
-        Model.OTHERS: {
-            # Kokoro: 42 scenes at 0.6 seconds each
-            QualityLevel.LOW: 43 * 0.6,
-            QualityLevel.MEDIUM: 43 * 0.6,
-            QualityLevel.HIGH: 43 * 0.6,
-        },
-        Model.FLUX: {
-            QualityLevel.LOW: 0.10,
-            QualityLevel.MEDIUM: 0.81,
-            QualityLevel.HIGH: 0.95,
-        },
-        Model.HF: {
-            QualityLevel.LOW: 3.41,
-            QualityLevel.MEDIUM: 8.06,
-            QualityLevel.HIGH: 27.1,
-        },
-        Model.HF_VAE: {
-            QualityLevel.LOW: 0.75,
-            QualityLevel.MEDIUM: 3.18,
-            QualityLevel.HIGH: 52.4,
-        },
-        Model.UPSCALER: {
-            QualityLevel.LOW: 2.01,
-            QualityLevel.MEDIUM: 8.30,
-            QualityLevel.HIGH: 34.4,
-        },
-    },
-    GPUType.H100: {
-        Model.HF_VAE: {
-            QualityLevel.LOW: 0.75,
-            QualityLevel.MEDIUM: 3.18,
-            QualityLevel.HIGH: 52.4,
-        },
-        Model.FT: {
-            QualityLevel.LOW: 8.74,
-            QualityLevel.MEDIUM: 39.62,
-            QualityLevel.HIGH: 131.14,
-        },
-        Model.FT_VAE: {  # TODO proper values
-            QualityLevel.LOW: 0.75,
-            QualityLevel.MEDIUM: 3.18,
-            QualityLevel.HIGH: 52.4,
-        },
-        Model.UPSCALER: {
-            QualityLevel.LOW: 2.01,
-            QualityLevel.MEDIUM: 8.30,
-            QualityLevel.HIGH: 34.4,
-        },
-    }
-}
+    return _extract_from_result(result)
+
+
+def derive_adaptive_params(
+    budget: dict[GPUType, int] | None = None,
+    data_dir: str = _DATA_DIR,
+) -> tuple[
+    dict[GPUType, dict[Model, int]],
+    dict[GPUType, dict[Model, dict[QualityLevel, float]]],
+]:
+    """Run the simulator at each quality level and derive adaptive parameters.
+
+    Returns
+    -------
+    init_replicas_adaptive:
+        ``{gpu_type: {model: total_gpus}}`` from the HIGH-quality simulation run
+        (the worst-case / most-demanding quality level sets the base allocation).
+    time_per_req_adaptive:
+        ``{gpu_type: {model: {quality: seconds}}}`` — per-quality time per request;
+        every ``(gpu_type, model)`` present in ``init_replicas_adaptive`` has a
+        timing entry for every quality level.
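+
+    Example
+    -------
+    Illustrative access pattern (shapes only, hypothetical values)::
+
+        replicas, times = derive_adaptive_params()
+        low_ft = times[GPUType.H100][Model.FT][QualityLevel.LOW]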
+    """
+    if budget is None:
+        budget = dict(HARDWARE_BUDGET)
+
+    power_data = load_power_data(data_dir=data_dir)
+
+    qualities = [QualityLevel.HIGH, QualityLevel.MEDIUM, QualityLevel.LOW]
+    results_by_quality: dict[QualityLevel, Result] = {}
+    for quality in qualities:
+        # Build an updated copy via replace() so this also works if the
+        # policy dataclass is frozen.
+        policy = replace(
+            STREAMWISE_POLICY,
+            name=f"{STREAMWISE_POLICY.name} {quality.value}",
+        )
+
+        latency_data = load_adaptive_quality_data(
+            data_dir=data_dir,
+            level=quality,
+        )

+        allocator = AutoModelAllocator(
+            workflow=PODCAST_WORKFLOW,
+            latency_data=latency_data,
+            power_data=power_data,
+            policy=policy,
+        )
+        result = allocator.allocate(
+            num_gpus=budget,
+            verbose=False,
+        )
+        results_by_quality[quality] = result

-# This is a point in the Pareto Frontier found via simulation
-INIT_REPLICAS_ADAPTIVE: dict[GPUType, dict[Model, int]] = {
-    GPUType.A100: {
-        Model.OTHERS: 1,  # Kokoro
-        Model.GEMMA: 8,
-        Model.FLUX: 16,
-        Model.HF: 25,
-        Model.HF_VAE: 10,
-        Model.UPSCALER: 5,
-    },
-    GPUType.H100: {
-        Model.HF_VAE: 1,
-        Model.FT: 96,
-        Model.FT_VAE: 1,  # Proper values
-        Model.UPSCALER: 38,
-    }
-}
+    init_replicas_adaptive, time_per_req_high = _extract_from_result(
+        results_by_quality[QualityLevel.HIGH],
+    )

+    time_per_req_by_quality: dict[QualityLevel, dict[GPUType, dict[Model, float]]] = {}
+    for quality, result in results_by_quality.items():
+        _, time_per_req_q = _extract_from_result(result)
+        time_per_req_by_quality[quality] = time_per_req_q
+
+    time_per_req_adaptive: dict[GPUType, dict[Model, dict[QualityLevel, float]]] = {}
+    for gpu_type, models in init_replicas_adaptive.items():
+        time_per_req_adaptive[gpu_type] = {}
+        for model in models:
+            # Fall back to the HIGH-quality time when a lower-quality run did
+            # not allocate this (gpu_type, model) pair.
+            high_time = time_per_req_high[gpu_type][model]
+            quality_times: dict[QualityLevel, float] = {}
+            for quality in qualities:
+                quality_times[quality] = (
+                    time_per_req_by_quality
+                    .get(quality, {})
+                    .get(gpu_type, {})
+                    .get(model, high_time)
+                )
+            time_per_req_adaptive[gpu_type][model] = quality_times
+
+    return init_replicas_adaptive, time_per_req_adaptive
+
+
+# ---------------------------------------------------------------------------
+# Derived constants — computed by running the simulator at HARDWARE_BUDGET.
+#
+# TIME_PER_REQ / INIT_REPLICAS: single (HIGH) quality operating point.
+# TIME_PER_REQ_ADAPTIVE / INIT_REPLICAS_ADAPTIVE: per-quality-level values.
+# ---------------------------------------------------------------------------
+INIT_REPLICAS, TIME_PER_REQ = derive_multirequest_params(budget=dict(HARDWARE_BUDGET))
+INIT_REPLICAS_ADAPTIVE, TIME_PER_REQ_ADAPTIVE = derive_adaptive_params(
+    budget=dict(HARDWARE_BUDGET),
+)
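+
+# Illustrative access pattern (keys are hypothetical; what is present depends
+# on the allocation result):
+#     TIME_PER_REQ[GPUType.A100][Model.FLUX]  # seconds per request
+#     TIME_PER_REQ_ADAPTIVE[GPUType.A100][Model.FLUX][QualityLevel.LOW]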
+
+# Allocation of video frames across quality levels for a 10-minute output,
+# chosen to meet the TTFF SLO. These are configurable weights used in
+# adaptive-quality cost aggregation.
QUALITY_PORTIONS = {
    QualityLevel.LOW: 112,
    QualityLevel.MEDIUM: 305,