feat(vllm): add CPU Encode for dual/multiple encoder EPD #7667
Open · ZhengHongming888 wants to merge 2 commits into ai-dynamo:main from ZhengHongming888:cpu_encode_for_epd
+230 −7
examples/backends/vllm/launch/xpu/cpu_encoder_for_epd.sh (156 additions, 0 deletions)
```bash
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT

SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../../common/launch_utils.sh"

# Default values
MODEL_NAME="llava-hf/llava-1.5-7b-hf"

# --single-gpu: Packs all 3 workers (encode, prefill, decode) onto a single GPU.
# This is intended for functional testing with small models (e.g. 2B) where CI
# only has 1 GPU available. It reduces performance by:
#   - Enabling --enforce-eager (disables torch.compile and CUDA graph capture)
#   - Hardcoding P/D KV cache to 512 MB (skips all memory profiling)
#   - Limiting --max-model-len to 4096 tokens on P/D workers
#   - Limiting P/D workers to image=1,video=0,audio=0 (--limit-mm-per-prompt)
#   - Using lower gpu-memory-utilization fractions to share the GPU
SINGLE_GPU=false

# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --model)
            MODEL_NAME=$2
            shift 2
            ;;
        --single-gpu)
            SINGLE_GPU=true
            shift
            ;;
        -h|--help)
            echo "Usage: $0 [OPTIONS]"
            echo ""
            echo "Disaggregated multimodal serving with separate Encode/Prefill/Decode workers"
            echo ""
            echo "Options:"
            echo "  --model <model_name>  Specify the VLM model to use (default: $MODEL_NAME)"
            echo "                        LLaVA 1.5 7B, Qwen2.5-VL, and Phi3V models have predefined templates"
            echo "  --single-gpu          Pack all 3 workers on 1 GPU (for small models, e.g. 2B)"
            echo "  -h, --help            Show this help message"
            echo ""
            echo "Examples:"
            echo "  $0 --model llava-hf/llava-1.5-7b-hf"
            echo "  $0 --model microsoft/Phi-3.5-vision-instruct"
            echo "  $0 --model Qwen/Qwen2.5-VL-7B-Instruct"
            echo "  $0 --model Qwen/Qwen3-VL-2B-Instruct --single-gpu"
            echo ""
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            echo "Use --help for usage information"
            exit 1
            ;;
    esac
done

# Device platform and affinity env name.
# DEVICE_PLATFORM supports: cuda, xpu
DEVICE_PLATFORM="${DEVICE_PLATFORM:-cuda}"
if [[ -z "${DEVICE_AFFINITY_ENV:-}" ]]; then
    if [[ "${DEVICE_PLATFORM,,}" == "xpu" ]]; then
        DEVICE_AFFINITY_ENV="ZE_AFFINITY_MASK"
    else
        DEVICE_AFFINITY_ENV="CUDA_VISIBLE_DEVICES"
    fi
fi

HTTP_PORT="${DYN_HTTP_PORT:-8000}"
if [[ "$SINGLE_GPU" == "true" ]]; then
    GPU_LABEL="1 GPU"
else
    GPU_LABEL="3 GPUs"
fi
print_launch_banner --multimodal "Launching Disaggregated Multimodal E/P/D ($GPU_LABEL)" "$MODEL_NAME" "$HTTP_PORT"

# Start frontend (no router mode)
echo "Starting frontend..."
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &

EXTRA_ARGS=""
PD_EXTRA_ARGS=""

# GPU assignments (override via environment variables)
# Encoder uses GPU 0 for vLLM infrastructure, but vision model loads on CPU via DYN_ENCODER_DEVICE
DYN_ENCODE_WORKER_GPU=${DYN_ENCODE_WORKER_GPU:-0}
DYN_PREFILL_WORKER_GPU=${DYN_PREFILL_WORKER_GPU:-1}
DYN_DECODE_WORKER_GPU=${DYN_DECODE_WORKER_GPU:-2}

# GPU memory utilization for workers.
# NOTE: --kv-cache-memory-bytes (set below for P/D workers) overrides
# --gpu-memory-utilization for KV cache sizing. Per vLLM CacheConfig:
#   "kv_cache_memory_bytes (when not-None) ignores gpu_memory_utilization"
# Ref: https://docs.vllm.ai/en/stable/api/vllm/config/cache/
# Therefore _PROFILE_PYTEST_VRAM_FRAC_OVERRIDE has no effect on actual VRAM
# usage when --kv-cache-memory-bytes is set.
if [[ -n "${_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE:-}" ]]; then
    echo "WARNING: _PROFILE_PYTEST_VRAM_FRAC_OVERRIDE is set but has no effect here because" >&2
    echo "         --kv-cache-memory-bytes overrides --gpu-memory-utilization in vLLM." >&2
fi
DYN_ENCODE_GPU_MEM=${DYN_ENCODE_GPU_MEM:-0.9}
DYN_PREFILL_GPU_MEM=${DYN_PREFILL_GPU_MEM:-0.9}
DYN_DECODE_GPU_MEM=${DYN_DECODE_GPU_MEM:-0.9}

# 512 MB KV cache per P/D worker. Setting --kv-cache-memory-bytes bypasses vLLM's
# memory profiling entirely (both language model and multimodal encoder), which avoids
# OOM during profiling when 3 workers share a GPU. 512 MB covers the
# minimum vLLM requires for max_model_len=4096 on Qwen3-VL-2B.
PD_KV_CACHE_BYTES=$((512 * 1024 * 1024))

if [[ "$SINGLE_GPU" == "true" ]]; then
    EXTRA_ARGS="--enforce-eager"
    PD_EXTRA_ARGS="--max-model-len 4096 --kv-cache-memory-bytes $PD_KV_CACHE_BYTES --limit-mm-per-prompt {\"image\":1,\"video\":0,\"audio\":0}"
fi

if [[ "${DEVICE_PLATFORM,,}" == "xpu" ]]; then
    EXTRA_ARGS="$EXTRA_ARGS --block-size 64"
    PD_EXTRA_ARGS="--max-model-len 10240"
fi

# Start encode worker with CPU vision model
echo "Starting encode worker with CPU vision model (vLLM on GPU $DYN_ENCODE_WORKER_GPU)..."
# DYN_ENCODER_DEVICE=cpu forces the vision model to load on CPU (device_map="cpu")
# VLLM_ENCODER=0 ensures HuggingFace encoding path is used (not vLLM encoder)
# vLLM infrastructure still runs on GPU to maintain compatibility
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
DYN_ENCODER_DEVICE=cpu \
VLLM_ENCODER=0 \
env $DEVICE_AFFINITY_ENV=$DYN_ENCODE_WORKER_GPU \
python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME --gpu-memory-utilization $DYN_ENCODE_GPU_MEM $EXTRA_ARGS --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both","kv_buffer_device": "cpu"}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &

# Start prefill worker (also handles encode routing via --route-to-encoder)
echo "Starting prefill worker on GPU $DYN_PREFILL_WORKER_GPU (GPU mem: $DYN_PREFILL_GPU_MEM)..."
VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
env $DEVICE_AFFINITY_ENV=$DYN_PREFILL_WORKER_GPU \
python -m dynamo.vllm --multimodal-worker --route-to-encoder --disaggregation-mode prefill --enable-multimodal --enable-mm-embeds --model $MODEL_NAME --gpu-memory-utilization $DYN_PREFILL_GPU_MEM $EXTRA_ARGS $PD_EXTRA_ARGS --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both","kv_buffer_device": "'"$DEVICE_PLATFORM"'"}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &

# Start decode worker
echo "Starting decode worker on GPU $DYN_DECODE_WORKER_GPU (GPU mem: $DYN_DECODE_GPU_MEM)..."
VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \
env $DEVICE_AFFINITY_ENV=$DYN_DECODE_WORKER_GPU \
python -m dynamo.vllm --multimodal-decode-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME --gpu-memory-utilization $DYN_DECODE_GPU_MEM $EXTRA_ARGS $PD_EXTRA_ARGS --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both","kv_buffer_device": "'"$DEVICE_PLATFORM"'"}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}' &

echo "=================================================="
echo "All components started. Waiting for initialization..."
echo "=================================================="

# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
```
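The script's case-insensitive platform check relies on bash's lowercase parameter expansion `${var,,}` (bash 4.0+) to choose between the Intel and NVIDIA device-affinity variables. A standalone sketch of that selection logic, with `DEVICE_PLATFORM` hardcoded here purely for illustration:

```shell
# Sketch of the platform -> affinity-env selection used by the script.
# ${DEVICE_PLATFORM,,} lowercases the value (bash 4+) before comparison,
# so "XPU", "Xpu", and "xpu" all select the Intel path.
DEVICE_PLATFORM="XPU"   # hardcoded for this demo; the script defaults to "cuda"
if [[ "${DEVICE_PLATFORM,,}" == "xpu" ]]; then
    DEVICE_AFFINITY_ENV="ZE_AFFINITY_MASK"      # Intel XPU device selection
else
    DEVICE_AFFINITY_ENV="CUDA_VISIBLE_DEVICES"  # NVIDIA GPU device selection
fi
echo "$DEVICE_AFFINITY_ENV"
```

The chosen variable name is then injected per worker via `env $DEVICE_AFFINITY_ENV=$DYN_..._WORKER_GPU`, which is what lets one script pin workers to devices on both CUDA and XPU hosts.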
🧩 Analysis chain: the reviewer verified the flag semantics by running searches against the ai-dynamo/dynamo repository, including:

```shell
rg -n 'multimodal-decode-worker|disaggregation-mode' ./components/src/dynamo/vllm/
sed -n '20,110p' ./components/src/dynamo/vllm/backend_args.py
sed -n '220,300p' ./components/src/dynamo/vllm/backend_args.py
rg -n 'multimodal_decode_worker' ./components/src/dynamo/vllm/ -A 3 -B 1
sed -n '150,180p' ./components/src/dynamo/vllm/args.py
```
**Add `--disaggregation-mode decode` to the decode worker command.**

The decode worker at line 148 uses `--multimodal-decode-worker` but omits `--disaggregation-mode decode`. These flags are independent: `--multimodal-decode-worker` only sets the component type, not the disaggregation mode. Without the explicit flag, `disaggregation_mode` defaults to AGGREGATED, which conflicts with the intended decode-only behavior. All other decode workers in the repository explicitly specify `--disaggregation-mode decode` for consistency. Add this flag to match the sibling script `disagg_multimodal_epd_xpu.sh:142`.
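For illustration, a sketch of the decode worker launch line with the suggested flag applied. This mirrors the decode command in the diff with only `--disaggregation-mode decode` inserted; it shows the reviewer's suggestion, not the author's committed change:

```shell
# Decode worker launch with the explicit disaggregation mode (suggested fix).
VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \
env $DEVICE_AFFINITY_ENV=$DYN_DECODE_WORKER_GPU \
python -m dynamo.vllm --multimodal-decode-worker --disaggregation-mode decode \
    --enable-multimodal --enable-mm-embeds --model $MODEL_NAME \
    --gpu-memory-utilization $DYN_DECODE_GPU_MEM $EXTRA_ARGS $PD_EXTRA_ARGS \
    --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both","kv_buffer_device": "'"$DEVICE_PLATFORM"'"}' \
    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}' &
```

This matches the pattern already used by the prefill worker in the same script, which passes `--disaggregation-mode prefill` explicitly.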