Skip to content

Commit 54b707a

Browse files
author
Hatem Elseidy
committed
feat(vllm): add TTS audio generation endpoint via vLLM Omni
Add a POST /v1/audio/speech endpoint supporting text-to-speech with vLLM Omni, following the OpenAI audio API convention. Includes Rust protocol types, stream aggregation, model discovery for audio-capable workers, and a Python handler that builds TTS engine inputs with prompt length estimation and WAV encoding.

Signed-off-by: Hatem Elseidy <hatem.elseidy@epicgames.com>
1 parent 06f1701 commit 54b707a

File tree

22 files changed

+1090
-19
lines changed

22 files changed

+1090
-19
lines changed

components/src/dynamo/common/protocols/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,22 @@
55
66
This module provides protocol types for various modalities:
77
- video_protocol: NvCreateVideoRequest, NvVideosResponse for video generation
8+
- audio_protocol: NvCreateAudioSpeechRequest, NvAudiosResponse for audio generation
89
"""
910

11+
from dynamo.common.protocols.audio_protocol import (
12+
NvAudiosResponse,
13+
NvCreateAudioSpeechRequest,
14+
)
1015
from dynamo.common.protocols.video_protocol import (
1116
NvCreateVideoRequest,
1217
NvVideosResponse,
1318
VideoData,
1419
)
1520

1621
__all__ = [
22+
"NvAudiosResponse",
23+
"NvCreateAudioSpeechRequest",
1724
"NvCreateVideoRequest",
1825
"NvVideosResponse",
1926
"VideoData",
components/src/dynamo/common/protocols/audio_protocol.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Protocol types for audio speech generation.
5+
6+
These types match the Rust protocol types in lib/llm/src/protocols/openai/audios.rs
7+
to ensure compatibility with the Dynamo HTTP frontend.
8+
"""
9+
# TODO: Replace these Pydantic models with Python bindings to the Rust protocol types once PyO3 bindings are available.
10+
11+
from typing import Optional
12+
13+
from pydantic import BaseModel
14+
15+
16+
class AudioNvExt(BaseModel):
17+
"""NVIDIA extensions for audio speech generation requests.
18+
19+
Matches Rust NvExt in lib/llm/src/protocols/openai/audios/nvext.rs.
20+
"""
21+
22+
annotations: Optional[list[str]] = None
23+
"""Annotations for SSE stream events."""
24+
25+
task_type: Optional[str] = None
26+
"""Task type (e.g. 'tts', 'voice_clone')."""
27+
28+
language: Optional[str] = None
29+
"""Language code (e.g. 'en', 'zh')."""
30+
31+
instructions: Optional[str] = None
32+
"""Additional instructions for speech generation."""
33+
34+
ref_audio: Optional[str] = None
35+
"""Base64-encoded reference audio for voice cloning."""
36+
37+
ref_text: Optional[str] = None
38+
"""Reference text corresponding to ref_audio."""
39+
40+
max_new_tokens: Optional[int] = None
41+
"""Maximum number of tokens to generate."""
42+
43+
seed: Optional[int] = None
44+
"""Random seed for reproducibility."""
45+
46+
47+
class NvCreateAudioSpeechRequest(BaseModel):
    """Request body for the /v1/audio/speech speech-generation endpoint.

    Mirrors the Rust ``NvCreateAudioSpeechRequest`` type in
    lib/llm/src/protocols/openai/audios.rs. ``input``, ``model`` and
    ``voice`` are required; the remaining fields default to ``None``.
    """

    # --- Required ---

    input: str
    """Text that should be turned into audio."""

    model: str
    """Model to use for audio generation."""

    voice: str
    """Voice to use for the generated speech."""

    # --- Optional ---

    response_format: Optional[str] = None
    """Audio format: mp3, wav, opus, aac, flac, pcm."""

    speed: Optional[float] = None
    """Playback speed (0.25 to 4.0, default 1.0)."""

    nvext: Optional[AudioNvExt] = None
    """NVIDIA-specific extensions."""
72+
73+
74+
class NvAudiosResponse(BaseModel):
    """Response payload produced by audio speech generation.

    Mirrors the Rust ``NvAudiosResponse`` type in
    lib/llm/src/protocols/openai/audios.rs. Audio travels base64-encoded on
    the internal transport; the HTTP handler decodes it so that clients
    receive raw binary audio.
    """

    audio_b64: str
    """Audio bytes, base64-encoded."""

    content_type: str
    """MIME content type of the audio (e.g. 'audio/mpeg', 'audio/wav')."""

    model: str
    """Model that produced the audio."""

    created: int
    """Creation time as a Unix timestamp."""

components/src/dynamo/common/utils/output_modalities.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from pydantic import BaseModel
88

9+
from dynamo.common.protocols.audio_protocol import NvCreateAudioSpeechRequest
910
from dynamo.common.protocols.image_protocol import NvCreateImageRequest
1011
from dynamo.common.protocols.video_protocol import NvCreateVideoRequest
1112
from dynamo.llm import ModelType
@@ -91,8 +92,7 @@ def parse_request_type(
9192
return NvCreateVideoRequest(**raw_request), RequestType.VIDEO_GENERATION
9293

9394
if modality is OutputModality.AUDIO:
94-
# Audio protocol types are not yet defined; pass through the raw dict.
95-
return raw_request, RequestType.AUDIO_GENERATION
95+
return NvCreateAudioSpeechRequest(**raw_request), RequestType.AUDIO_GENERATION
9696

9797
# Text Modality
9898
return raw_request, RequestType.CHAT_COMPLETION

0 commit comments

Comments
 (0)