Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,47 @@ def __init__(
self.engine_args = engine_args
self.model = self.engine_args.model

# Check for encoder device override via environment variable
# DYN_ENCODER_DEVICE can be: "auto", "cpu", "cuda", "xpu", or specific device like "cuda:0"
encoder_device = os.getenv("DYN_ENCODER_DEVICE", "auto")

self.image_loader = ImageLoader(cache_size=CACHE_SIZE_MAXIMUM)
self.image_processor = AutoImageProcessor.from_pretrained(
self.model, trust_remote_code=True
)
self.vision_model = load_vision_model(
self.model, enforce_eager=self.engine_args.enforce_eager
self.model,
enforce_eager=self.engine_args.enforce_eager,
device=encoder_device,
)

# Device verification logging
logger.info("=" * 60)
logger.info("ENCODER DEVICE VERIFICATION")
logger.info("=" * 60)
logger.info(f"Requested encoder device (DYN_ENCODER_DEVICE): {encoder_device}")

# Check what device the vision model is on
if hasattr(self.vision_model, "device"):
logger.info(f"Vision model device: {self.vision_model.device}")
else:
# Try to get device from first parameter
try:
first_param_device = next(self.vision_model.parameters()).device
logger.info(
f"Vision model device (from parameters): {first_param_device}"
)
except (StopIteration, AttributeError):
logger.info("Vision model device: Unable to determine")

# Check CUDA availability and visibility
logger.info(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
logger.info(f"CUDA device count: {torch.cuda.device_count()}")
logger.info(f"Current CUDA device: {torch.cuda.current_device()}")

logger.info("=" * 60)

hidden_size = getattr(self.vision_model, "out_hidden_size", None)
if hidden_size is None:
hidden_size = getattr(
Expand Down Expand Up @@ -248,7 +282,19 @@ async def generate(
# [gluo FIXME] This is specific to qwen vision processing..
# Split concatenated embeddings for each image item.
if is_qwen_vl_model(self.model):
merge_size = self.vision_encoder.spatial_merge_size
# For vLLM encoder: spatial_merge_size is directly on vision_encoder
# For HuggingFace: it's on vision_encoder.visual.spatial_merge_size
if hasattr(self.vision_encoder, "spatial_merge_size"):
merge_size = self.vision_encoder.spatial_merge_size
elif hasattr(self.vision_encoder, "visual") and hasattr(
self.vision_encoder.visual, "spatial_merge_size"
):
merge_size = self.vision_encoder.visual.spatial_merge_size
else:
# Fallback to config
merge_size = getattr(
self.vision_encoder.config, "spatial_merge_size", 2
)
sizes = (
image_embeds["image_grid_thw"].prod(-1)
// merge_size
Expand Down
10 changes: 10 additions & 0 deletions components/src/dynamo/vllm/multimodal_utils/encode_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,16 @@ def encode_image_embeddings(
NotImplementedError: If model is not supported
"""
with torch.no_grad():
# Log encoder device during inference
logger = logging.getLogger(__name__)
try:
encoder_device = next(vision_encoder.parameters()).device
logger.info(
f"[ENCODE] Vision encoder device during inference: {encoder_device}"
)
except (StopIteration, AttributeError):
logger.info("[ENCODE] Unable to determine vision encoder device")

# Route through the correct encoder based on model
if is_model_supported(model_name, SupportedModels.LLAVA_1_5_7B):
pixel_values = image_embeds["pixel_values"].to(vision_encoder.device)
Expand Down
17 changes: 14 additions & 3 deletions components/src/dynamo/vllm/multimodal_utils/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,11 +156,22 @@ def is_qwen_vl_model(model_name: str) -> bool:
)


def load_vision_model(model_id: str, enforce_eager: bool = False) -> torch.nn.Module:
def load_vision_model(
model_id: str, enforce_eager: bool = False, device: str = "auto"
) -> torch.nn.Module:
"""
Load a vision model from a HuggingFace model ID.

Args:
model_id: The model identifier
enforce_eager: Whether to enforce eager execution
device: Device to load the model on. Options: "auto", "cpu", "cuda", "xpu", or specific device like "cuda:0"
"""
if VLLM_ENCODER and is_qwen_vl_model(model_id):
# When device="cpu" is explicitly requested, skip vLLM encoder and use HuggingFace path
# because vLLM doesn't support CPU-only mode for encoder workers
use_vllm_encoder = VLLM_ENCODER and is_qwen_vl_model(model_id) and device != "cpu"

if use_vllm_encoder:
# Disable to get ViT from the same process
update_environment_variables(
{
Expand All @@ -185,7 +196,7 @@ def load_vision_model(model_id: str, enforce_eager: bool = False) -> torch.nn.Mo
vllm_model.llm_engine.engine_core.engine_core.model_executor.driver_worker.worker.model_runner.model.visual
)
return AutoModel.from_pretrained(
model_id, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True
model_id, device_map=device, torch_dtype=torch.float16, trust_remote_code=True
)


Expand Down
156 changes: 156 additions & 0 deletions examples/backends/vllm/launch/xpu/cpu_encoder_for_epd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT

SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../../common/launch_utils.sh"

# Default values
MODEL_NAME="llava-hf/llava-1.5-7b-hf"

# --single-gpu: Packs all 3 workers (encode, prefill, decode) onto a single GPU.
# This is intended for functional testing with small models (e.g. 2B) where CI
# only has 1 GPU available. It reduces performance by:
# - Enabling --enforce-eager (disables torch.compile and CUDA graph capture)
# - Hardcoding P/D KV cache to 512 MB (skips all memory profiling)
# - Limiting --max-model-len to 4096 tokens on P/D workers
# - Limiting P/D workers to image=1,video=0,audio=0 (--limit-mm-per-prompt)
# - Using lower gpu-memory-utilization fractions to share the GPU
SINGLE_GPU=false

# Parse command line arguments
while [[ $# -gt 0 ]]; do
case $1 in
--model)
MODEL_NAME=$2
shift 2
;;
--single-gpu)
SINGLE_GPU=true
shift
;;
-h|--help)
echo "Usage: $0 [OPTIONS]"
echo ""
echo "Disaggregated multimodal serving with separate Encode/Prefill/Decode workers"
echo ""
echo "Options:"
echo " --model <model_name> Specify the VLM model to use (default: $MODEL_NAME)"
echo " LLaVA 1.5 7B, Qwen2.5-VL, and Phi3V models have predefined templates"
echo " --single-gpu Pack all 3 workers on 1 GPU (for small models, e.g. 2B)"
echo " -h, --help Show this help message"
echo ""
echo "Examples:"
echo " $0 --model llava-hf/llava-1.5-7b-hf"
echo " $0 --model microsoft/Phi-3.5-vision-instruct"
echo " $0 --model Qwen/Qwen2.5-VL-7B-Instruct"
echo " $0 --model Qwen/Qwen3-VL-2B-Instruct --single-gpu"
echo ""
exit 0
;;
*)
echo "Unknown option: $1"
echo "Use --help for usage information"
exit 1
;;
esac
done

# Device platform and affinity env name.
# DEVICE_PLATFORM supports: cuda, xpu
DEVICE_PLATFORM="${DEVICE_PLATFORM:-cuda}"
if [[ -z "${DEVICE_AFFINITY_ENV:-}" ]]; then
if [[ "${DEVICE_PLATFORM,,}" == "xpu" ]]; then
DEVICE_AFFINITY_ENV="ZE_AFFINITY_MASK"
else
DEVICE_AFFINITY_ENV="CUDA_VISIBLE_DEVICES"
fi
fi

HTTP_PORT="${DYN_HTTP_PORT:-8000}"
if [[ "$SINGLE_GPU" == "true" ]]; then
GPU_LABEL="1 GPU"
else
GPU_LABEL="3 GPUs"
fi
print_launch_banner --multimodal "Launching Disaggregated Multimodal E/P/D ($GPU_LABEL)" "$MODEL_NAME" "$HTTP_PORT"


# Start frontend (no router mode)
echo "Starting frontend..."
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &

EXTRA_ARGS=""
PD_EXTRA_ARGS=""

# GPU assignments (override via environment variables)
# Encoder uses GPU 0 for vLLM infrastructure, but vision model loads on CPU via DYN_ENCODER_DEVICE
DYN_ENCODE_WORKER_GPU=${DYN_ENCODE_WORKER_GPU:-0}
DYN_PREFILL_WORKER_GPU=${DYN_PREFILL_WORKER_GPU:-1}
DYN_DECODE_WORKER_GPU=${DYN_DECODE_WORKER_GPU:-2}

# GPU memory utilization for workers.
# NOTE: --kv-cache-memory-bytes (set below for P/D workers) overrides
# --gpu-memory-utilization for KV cache sizing. Per vLLM CacheConfig:
# "kv_cache_memory_bytes (when not-None) ignores gpu_memory_utilization"
# Ref: https://docs.vllm.ai/en/stable/api/vllm/config/cache/
# Therefore _PROFILE_PYTEST_VRAM_FRAC_OVERRIDE has no effect on actual VRAM
# usage when --kv-cache-memory-bytes is set.
if [[ -n "${_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE:-}" ]]; then
echo "WARNING: _PROFILE_PYTEST_VRAM_FRAC_OVERRIDE is set but has no effect here because" >&2
echo " --kv-cache-memory-bytes overrides --gpu-memory-utilization in vLLM." >&2
fi
DYN_ENCODE_GPU_MEM=${DYN_ENCODE_GPU_MEM:-0.9}
DYN_PREFILL_GPU_MEM=${DYN_PREFILL_GPU_MEM:-0.9}
DYN_DECODE_GPU_MEM=${DYN_DECODE_GPU_MEM:-0.9}

# 512 MB KV cache per P/D worker. Setting --kv-cache-memory-bytes bypasses vLLM's
# memory profiling entirely (both language model and multimodal encoder), which avoids
# OOM during profiling when 3 workers share a GPU. 512 MB covers the
# minimum vLLM requires for max_model_len=4096 on Qwen3-VL-2B.
PD_KV_CACHE_BYTES=$((512 * 1024 * 1024))

if [[ "$SINGLE_GPU" == "true" ]]; then
EXTRA_ARGS="--enforce-eager"
PD_EXTRA_ARGS="--max-model-len 4096 --kv-cache-memory-bytes $PD_KV_CACHE_BYTES --limit-mm-per-prompt {\"image\":1,\"video\":0,\"audio\":0}"
fi

if [[ "${DEVICE_PLATFORM,,}" == "xpu" ]]; then
# xpu requires a larger block size for the KV cache layout.
EXTRA_ARGS="$EXTRA_ARGS --block-size 64"
# NOTE(review): this assignment OVERWRITES (does not append to) PD_EXTRA_ARGS.
# If --single-gpu and DEVICE_PLATFORM=xpu are combined, the
# --kv-cache-memory-bytes and --limit-mm-per-prompt settings applied above are
# silently dropped and --max-model-len becomes 10240 instead of 4096.
# Confirm whether this clobber is intentional; if not, use
# PD_EXTRA_ARGS="$PD_EXTRA_ARGS ..." to append instead.
PD_EXTRA_ARGS="--max-model-len 10240"
fi

# Start encode worker with CPU vision model
echo "Starting encode worker with CPU vision model (vLLM on GPU $DYN_ENCODE_WORKER_GPU)..."
# DYN_ENCODER_DEVICE=cpu forces the vision model to load on CPU (device_map="cpu")
# VLLM_ENCODER=0 ensures HuggingFace encoding path is used (not vLLM encoder)
# vLLM infrastructure still runs on GPU to maintain compatibility
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
DYN_ENCODER_DEVICE=cpu \
VLLM_ENCODER=0 \
env $DEVICE_AFFINITY_ENV=$DYN_ENCODE_WORKER_GPU \
python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME --gpu-memory-utilization $DYN_ENCODE_GPU_MEM $EXTRA_ARGS --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both","kv_buffer_device": "cpu"}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &

# Start prefill worker (also handles encode routing via --route-to-encoder)
echo "Starting prefill worker on GPU $DYN_PREFILL_WORKER_GPU (GPU mem: $DYN_PREFILL_GPU_MEM)..."
VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
env $DEVICE_AFFINITY_ENV=$DYN_PREFILL_WORKER_GPU \
python -m dynamo.vllm --multimodal-worker --route-to-encoder --disaggregation-mode prefill --enable-multimodal --enable-mm-embeds --model $MODEL_NAME --gpu-memory-utilization $DYN_PREFILL_GPU_MEM $EXTRA_ARGS $PD_EXTRA_ARGS --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both","kv_buffer_device": "'"$DEVICE_PLATFORM"'"}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &

# Start decode worker.
# --disaggregation-mode decode is required in addition to --multimodal-decode-worker:
# the latter only selects the component type, while disaggregation_mode otherwise
# defaults to AGGREGATED, which conflicts with this script's E/P/D split and with
# the prefill worker above (which passes --disaggregation-mode prefill).
echo "Starting decode worker on GPU $DYN_DECODE_WORKER_GPU (GPU mem: $DYN_DECODE_GPU_MEM)..."
VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \
env $DEVICE_AFFINITY_ENV=$DYN_DECODE_WORKER_GPU \
python -m dynamo.vllm --multimodal-decode-worker --disaggregation-mode decode --enable-multimodal --enable-mm-embeds --model $MODEL_NAME --gpu-memory-utilization $DYN_DECODE_GPU_MEM $EXTRA_ARGS $PD_EXTRA_ARGS --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both","kv_buffer_device": "'"$DEVICE_PLATFORM"'"}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}' &
Comment on lines +144 to +148
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Check how other decode workers are configured in the repo
rg -n --type=sh 'multimodal-decode-worker|disaggregation-mode decode' examples/

Repository: ai-dynamo/dynamo

Length of output: 3720


🏁 Script executed:

rg -n 'multimodal-decode-worker|disaggregation-mode' --type=py dynamo/vllm | head -50

Repository: ai-dynamo/dynamo

Length of output: 111


🏁 Script executed:

find . -type f -name "*.py" -path "*/vllm/*" | head -20

Repository: ai-dynamo/dynamo

Length of output: 1127


🏁 Script executed:

rg -n 'multimodal-decode-worker|disaggregation-mode' ./components/src/dynamo/vllm/

Repository: ai-dynamo/dynamo

Length of output: 2884


🏁 Script executed:

sed -n '20,110p' ./components/src/dynamo/vllm/backend_args.py

Repository: ai-dynamo/dynamo

Length of output: 3683


🏁 Script executed:

sed -n '220,300p' ./components/src/dynamo/vllm/backend_args.py

Repository: ai-dynamo/dynamo

Length of output: 3609


🏁 Script executed:

rg -n 'multimodal_decode_worker' ./components/src/dynamo/vllm/ -A 3 -B 1

Repository: ai-dynamo/dynamo

Length of output: 6465


🏁 Script executed:

sed -n '150,180p' ./components/src/dynamo/vllm/args.py

Repository: ai-dynamo/dynamo

Length of output: 1296


Add --disaggregation-mode decode to the decode worker command.

The decode worker at line 148 uses --multimodal-decode-worker but omits --disaggregation-mode decode. These flags are independent—--multimodal-decode-worker only sets the component type, not the disaggregation mode. Without the explicit flag, disaggregation_mode defaults to AGGREGATED, which conflicts with the intended decode-only behavior. All other decode workers in the repository explicitly specify --disaggregation-mode decode for consistency. Add this flag to match the sibling script disagg_multimodal_epd_xpu.sh:142.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@examples/backends/vllm/launch/xpu/cpu_encoder_for_epd.sh` around lines 144 -
148, Update the decode worker launch command that starts with
"VLLM_NIXL_SIDE_CHANNEL_PORT=20099 env
$DEVICE_AFFINITY_ENV=$DYN_DECODE_WORKER_GPU python -m dynamo.vllm
--multimodal-decode-worker ..." to include the explicit flag
"--disaggregation-mode decode" so the worker runs in decode-only disaggregation
mode (matching other decode worker scripts); ensure the new flag is placed among
the existing CLI flags (alongside --enable-multimodal, --model $MODEL_NAME,
etc.) so disaggregation_mode is not left at the default AGGREGATED.



echo "=================================================="
echo "All components started. Waiting for initialization..."
echo "=================================================="

# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ set -e
trap 'echo Cleaning up...; kill 0' EXIT

SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
source "$SCRIPT_DIR/../../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../../common/launch_utils.sh"

# Default values
MODEL_NAME="llava-hf/llava-1.5-7b-hf"
Expand Down
Loading