Skip to content

Commit 2949f39

Browse files
authored
Merge branch 'main' into sniffio-missing
2 parents 55a2bee + 3a21569 commit 2949f39

File tree

6 files changed

+205
-20
lines changed

6 files changed

+205
-20
lines changed

examples/audio_speech_example.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
from zai import ZaiClient
2+
import os
3+
import traceback
4+
import uuid
5+
6+
7+
# The examples assume the project root as the working directory; move there,
# taking the root to be the parent of the directory containing this script.
script_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(script_dir)
os.chdir(project_root)
11+
12+
13+
def text_to_speech_non_stream():
    """Run a non-streaming text-to-speech request and save the audio to a file."""
    client = ZaiClient()

    # Supported response formats: wav, pcm (default)
    response_format = 'pcm'

    # Unique output name so repeated runs never overwrite each other
    output_file = f"audio_speech_{uuid.uuid4()}.{response_format}"

    try:
        speech = client.audio.speech(
            model='glm-tts',
            input='Hello, this is a test for text-to-speech functionality.',
            voice='tongtong',
            response_format=response_format,
            stream=False,
        )

        # Persist the raw audio bytes returned by the API
        with open(output_file, 'wb') as audio_out:
            audio_out.write(speech.content)

        print(f"Audio saved to {os.path.abspath(output_file)}")
    except Exception as e:
        print(f"Exception: {e}\nTraceback: {traceback.format_exc()}")
        raise
40+
41+
42+
def text_to_speech_stream():
    """Run a streaming text-to-speech request and print each received chunk."""
    client = ZaiClient()

    # Streaming only supports the pcm format
    response_format = 'pcm'

    try:
        stream = client.audio.speech(
            model='glm-tts',
            input='Hello, this is a test for text-to-speech functionality.',
            voice='tongtong',
            response_format=response_format,
            stream=True,
        )

        chunk_index = 0
        for chunk in stream:
            try:
                first_choice = chunk.choices[0]
                # A None delta marks the end of the stream
                if first_choice.delta is None:
                    break
                if first_choice.delta.content:
                    print(f"[Chunk {chunk_index}] {first_choice.delta.content}")
                    chunk_index += 1
            except (AttributeError, IndexError) as e:
                print(f"Exception: {e}\nTraceback: {traceback.format_exc()}")
    except Exception as e:
        print(f"Exception: {e}\nTraceback: {traceback.format_exc()}")
        raise
75+
76+
77+
if __name__ == '__main__':
    # Run the non-streaming example; uncomment below to try the streaming one
    text_to_speech_non_stream()
    # text_to_speech_stream()
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
from zai import ZaiClient
2+
import os
3+
import traceback
4+
5+
6+
# The examples assume the project root as the working directory; move there,
# taking the root to be the parent of the directory containing this script.
script_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(script_dir)
os.chdir(project_root)
10+
11+
12+
def audio_transcription_non_stream():
    """Transcribe a local audio file with a single (non-streaming) request."""
    client = ZaiClient()

    # Supported formats: .wav, .mp3
    # Limits: file size <= 25 MB, duration <= 30 seconds
    audio_file_path = "tests/integration_tests/asr.wav"

    if not os.path.exists(audio_file_path):
        print(f"Audio file not found: {audio_file_path}")
        return

    try:
        # Upload the audio and request a full transcription
        with open(audio_file_path, 'rb') as audio_file:
            result = client.audio.transcriptions.create(
                model='glm-asr-2512',
                file=audio_file,
                stream=False,
            )
        print(result.text)
    except Exception as e:
        print(f"Exception: {e}\nTraceback: {traceback.format_exc()}")
        raise
40+
41+
42+
def audio_transcription_stream():
    """Transcribe a local audio file and print incremental streaming results."""
    client = ZaiClient()

    # Supported formats: .wav, .mp3
    # Limits: file size <= 25 MB, duration <= 30 seconds
    audio_file_path = "tests/integration_tests/asr.wav"

    if not os.path.exists(audio_file_path):
        print(f"Audio file not found: {audio_file_path}")
        return

    try:
        # Keep the file open for the lifetime of the streaming request
        with open(audio_file_path, 'rb') as audio_file:
            stream = client.audio.transcriptions.create(
                model='glm-asr-2512',
                file=audio_file,
                stream=True,
            )

            print("Streaming transcription:")
            for chunk in stream:
                try:
                    # Chunks without a (truthy) delta carry no text to show
                    delta = getattr(chunk, 'delta', None)
                    if delta:
                        print(delta, flush=True)
                except (AttributeError, IndexError) as e:
                    print(f"Exception: {e}\nTraceback: {traceback.format_exc()}")
    except Exception as e:
        print(f"Exception: {e}\nTraceback: {traceback.format_exc()}")
        raise
76+
77+
78+
if __name__ == '__main__':
    # Run the non-streaming example; uncomment below to try the streaming one
    audio_transcription_non_stream()
    # audio_transcription_stream()

src/zai/api_resource/audio/audio.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ def speech(
5151
input: str = None,
5252
voice: str = None,
5353
response_format: str = None,
54+
watermark_enabled: Optional[bool] | NotGiven = NOT_GIVEN,
5455
sensitive_word_check: Optional[SensitiveWordCheckRequest] | NotGiven = NOT_GIVEN,
5556
request_id: str = None,
5657
user_id: str = None,
@@ -66,13 +67,18 @@ def speech(
6667
Generate speech audio from text input
6768
6869
Arguments:
69-
model (str): The model to use for speech generation
70-
input (str): The text to convert to speech
71-
voice (str): The voice to use for speech generation
72-
response_format (str): The format of the response audio
70+
model (str): The model to use for speech generation (e.g., 'glm-tts')
71+
input (str): The text to convert to speech (max length: 1024 characters)
72+
voice (str): The voice to use for speech generation (e.g., 'tongtong', 'chuichui', 'xiaochen', etc.)
73+
response_format (str): The format of the response audio ('wav' or 'pcm', default 'pcm')
74+
watermark_enabled (Optional[bool]): Whether to enable watermark on generated audio
7375
sensitive_word_check (Optional[SensitiveWordCheckRequest]): Sensitive word check configuration
7476
request_id (str): Unique identifier for the request
7577
user_id (str): User identifier
78+
encode_format (str): Encoding format for streaming response ('base64' or 'hex', default 'base64')
79+
speed (float): Speech speed, default 1.0, valid range [0.5, 2]
80+
volume (float): Audio volume, default 1.0, valid range (0, 10]
81+
stream (bool): Whether to use streaming output (default False)
7682
extra_headers (Headers): Additional headers to send
7783
extra_body (Body): Additional body parameters
7884
timeout (float | httpx.Timeout): Request timeout
@@ -83,6 +89,8 @@ def speech(
8389
'input': input,
8490
'voice': voice,
8591
'response_format': response_format,
92+
'watermark_enabled': watermark_enabled,
93+
'sensitive_word_check': sensitive_word_check,
8694
'encode_format': encode_format,
8795
'request_id': request_id,
8896
'user_id': user_id,

src/zai/api_resource/audio/transcriptions.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from __future__ import annotations
22

33
import logging
4-
from typing import TYPE_CHECKING, Mapping, Optional, cast
4+
from typing import TYPE_CHECKING, List, Mapping, Optional, cast
55

66
import httpx
77
from typing_extensions import Literal
@@ -43,10 +43,12 @@ def create(
4343
*,
4444
file: FileTypes,
4545
model: str,
46+
file_base64: Optional[str] | NotGiven = NOT_GIVEN,
47+
prompt: Optional[str] | NotGiven = NOT_GIVEN,
48+
hotwords: Optional[List[str]] | NotGiven = NOT_GIVEN,
4649
request_id: Optional[str] | NotGiven = NOT_GIVEN,
4750
user_id: Optional[str] | NotGiven = NOT_GIVEN,
4851
stream: Optional[Literal[False]] | Literal[True] | NotGiven = NOT_GIVEN,
49-
temperature: Optional[float] | NotGiven = NOT_GIVEN,
5052
sensitive_word_check: Optional[SensitiveWordCheckRequest] | NotGiven = NOT_GIVEN,
5153
extra_headers: Headers | None = None,
5254
extra_body: Body | None = None,
@@ -58,28 +60,26 @@ def create(
5860
Arguments:
5961
file (FileTypes): Audio file to transcribe
6062
model (str): The model to use for transcription
63+
file_base64 (Optional[str]): Base64 encoded audio file (alternative to file)
64+
prompt (Optional[str]): Previous transcription result for context
65+
hotwords (Optional[List[str]]): Hot words to improve recognition rate
6166
request_id (Optional[str]): Unique identifier for the request
6267
user_id (Optional[str]): User identifier
6368
stream (Optional[Literal[False]] | Literal[True]): Whether to stream the response
64-
temperature (Optional[float]): Sampling temperature for transcription
6569
sensitive_word_check (Optional[SensitiveWordCheckRequest]): Sensitive word check configuration
6670
extra_headers (Headers): Additional headers to send
6771
extra_body (Body): Additional body parameters
6872
timeout (float | httpx.Timeout): Request timeout
6973
"""
70-
if temperature is not None and temperature != NOT_GIVEN:
71-
if temperature <= 0:
72-
temperature = 0.01
73-
if temperature >= 1:
74-
temperature = 0.99
75-
7674
body = deepcopy_minimal(
7775
{
7876
'model': model,
7977
'file': file,
78+
'file_base64': file_base64,
79+
'prompt': prompt,
80+
'hotwords': hotwords,
8081
'request_id': request_id,
8182
'user_id': user_id,
82-
'temperature': temperature,
8383
'sensitive_word_check': sensitive_word_check,
8484
'stream': stream,
8585
}

src/zai/types/audio/audio_speech_params.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,11 @@ class AudioSpeechParams(TypedDict, total=False):
1616
input (str): Text to be converted to speech
1717
voice (str): Voice tone for speech generation
1818
response_format (str): Format of the generated audio file
19+
watermark_enabled (Optional[bool]): Whether to enable watermark on generated audio
20+
encode_format (str): Encoding format for streaming response (base64 or hex)
21+
speed (float): Speech speed, default 1.0, range [0.5, 2]
22+
volume (float): Audio volume, default 1.0, range (0, 10]
23+
stream (bool): Whether to use streaming output
1924
sensitive_word_check (Optional[SensitiveWordCheckRequest]): Sensitive word check configuration
2025
request_id (str): Request ID passed by client, must be unique; used to distinguish each request,
2126
platform will generate default if not provided by client
@@ -26,10 +31,11 @@ class AudioSpeechParams(TypedDict, total=False):
2631
input: str
2732
voice: str
2833
response_format: str
29-
sensitive_word_check: Optional[SensitiveWordCheckRequest]
30-
request_id: str
31-
user_id: str
34+
watermark_enabled: Optional[bool]
3235
encode_format: str
3336
speed: float
3437
volume: float
3538
stream: bool
39+
sensitive_word_check: Optional[SensitiveWordCheckRequest]
40+
request_id: str
41+
user_id: str

src/zai/types/audio/transcriptions_create_param.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from __future__ import annotations
22

3-
from typing import Optional
3+
from typing import List, Optional
44

55
from typing_extensions import TypedDict
66

@@ -13,7 +13,10 @@ class TranscriptionsParam(TypedDict, total=False):
1313
1414
Attributes:
1515
model (str): Model encoding.
16-
temperature (float): Sampling temperature.
16+
file (str): Audio file to transcribe.
17+
file_base64 (str): Base64 encoded audio file (alternative to file).
18+
prompt (str): Previous transcription result for context in long text scenarios.
19+
hotwords (List[str]): Hot words to improve recognition rate for specific domain vocabulary.
1720
stream (bool): Whether to use streaming output.
1821
sensitive_word_check (Optional[SensitiveWordCheckRequest]): Sensitive word check configuration.
1922
request_id (str): Passed by the client, must ensure uniqueness; used to distinguish
@@ -23,7 +26,10 @@ class TranscriptionsParam(TypedDict, total=False):
2326
"""
2427

2528
model: str
26-
temperature: float
29+
file: str
30+
file_base64: str
31+
prompt: str
32+
hotwords: List[str]
2733
stream: bool
2834
sensitive_word_check: Optional[SensitiveWordCheckRequest]
2935
request_id: str

0 commit comments

Comments
 (0)