Skip to content

Commit 17f0029

Browse files
fix(stainless): handle [DONE] SSE terminator in streaming responses
1 parent f1a093b commit 17f0029

File tree

5 files changed

+112
-4
lines changed

5 files changed

+112
-4
lines changed

.stats.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
configured_endpoints: 108
22
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-ef864e1fd05e4fda60d0c67ea0f8d49db03efcfaddb0b28c561962a695510f6e.yml
33
openapi_spec_hash: fd0140251c983c3788c9da642426f1ba
4-
config_hash: 6aa61d4143c3e3df785972c0287d1370
4+
config_hash: ef1f9b33e203c71cfc10d91890c1ed2d

README.md

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,50 @@ async def main() -> None:
128128
asyncio.run(main())
129129
```
130130

131+
## Streaming responses
132+
133+
We provide support for streaming responses using Server-Sent Events (SSE).
134+
135+
```python
136+
from llama_stack_client import LlamaStackClient
137+
138+
client = LlamaStackClient()
139+
140+
stream = client.chat.completions.create(
141+
messages=[
142+
{
143+
"content": "string",
144+
"role": "user",
145+
}
146+
],
147+
model="model",
148+
stream=True,
149+
)
150+
for completion in stream:
151+
print(completion.id)
152+
```
153+
154+
The async client uses the exact same interface.
155+
156+
```python
157+
from llama_stack_client import AsyncLlamaStackClient
158+
159+
client = AsyncLlamaStackClient()
160+
161+
stream = await client.chat.completions.create(
162+
messages=[
163+
{
164+
"content": "string",
165+
"role": "user",
166+
}
167+
],
168+
model="model",
169+
stream=True,
170+
)
171+
async for completion in stream:
172+
print(completion.id)
173+
```
174+
131175
## Using types
132176

133177
Nested request parameters are [TypedDicts](https://docs.python.org/3/library/typing.html#typing.TypedDict). Responses are [Pydantic models](https://docs.pydantic.dev) which also provide helper methods for things like:

src/llama_stack_client/_client.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,8 @@ def __init__(
158158
_strict_response_validation=_strict_response_validation,
159159
)
160160

161+
self._default_stream_cls = Stream
162+
161163
@cached_property
162164
def toolgroups(self) -> ToolgroupsResource:
163165
from .resources.toolgroups import ToolgroupsResource
@@ -515,6 +517,8 @@ def __init__(
515517
_strict_response_validation=_strict_response_validation,
516518
)
517519

520+
self._default_stream_cls = AsyncStream
521+
518522
@cached_property
519523
def toolgroups(self) -> AsyncToolgroupsResource:
520524
from .resources.toolgroups import AsyncToolgroupsResource

src/llama_stack_client/_streaming.py

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515

1616
import httpx
1717

18-
from ._utils import extract_type_var_from_base
18+
from ._utils import is_mapping, extract_type_var_from_base
19+
from ._exceptions import APIError
1920

2021
if TYPE_CHECKING:
2122
from ._client import LlamaStackClient, AsyncLlamaStackClient
@@ -65,7 +66,25 @@ def __stream__(self) -> Iterator[_T]:
6566

6667
try:
6768
for sse in iterator:
68-
yield process_data(data=sse.json(), cast_to=cast_to, response=response)
69+
if sse.data.startswith("[DONE]"):
70+
break
71+
72+
data = sse.json()
73+
if is_mapping(data) and data.get("error"):
74+
message = None
75+
error = data.get("error")
76+
if is_mapping(error):
77+
message = error.get("message")
78+
if not message or not isinstance(message, str):
79+
message = "An error occurred during streaming"
80+
81+
raise APIError(
82+
message=message,
83+
request=self.response.request,
84+
body=data["error"],
85+
)
86+
87+
yield process_data(data=data, cast_to=cast_to, response=response)
6988
finally:
7089
# Ensure the response is closed even if the consumer doesn't read all data
7190
response.close()
@@ -131,7 +150,25 @@ async def __stream__(self) -> AsyncIterator[_T]:
131150

132151
try:
133152
async for sse in iterator:
134-
yield process_data(data=sse.json(), cast_to=cast_to, response=response)
153+
if sse.data.startswith("[DONE]"):
154+
break
155+
156+
data = sse.json()
157+
if is_mapping(data) and data.get("error"):
158+
message = None
159+
error = data.get("error")
160+
if is_mapping(error):
161+
message = error.get("message")
162+
if not message or not isinstance(message, str):
163+
message = "An error occurred during streaming"
164+
165+
raise APIError(
166+
message=message,
167+
request=self.response.request,
168+
body=data["error"],
169+
)
170+
171+
yield process_data(data=data, cast_to=cast_to, response=response)
135172
finally:
136173
# Ensure the response is closed even if the consumer doesn't read all data
137174
await response.aclose()

tests/test_client.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from llama_stack_client._types import Omit
3030
from llama_stack_client._utils import asyncify
3131
from llama_stack_client._models import BaseModel, FinalRequestOptions
32+
from llama_stack_client._streaming import Stream, AsyncStream
3233
from llama_stack_client._exceptions import APIStatusError, APITimeoutError, APIResponseValidationError
3334
from llama_stack_client._base_client import (
3435
DEFAULT_TIMEOUT,
@@ -773,6 +774,17 @@ def test_client_max_retries_validation(self) -> None:
773774
with pytest.raises(TypeError, match=r"max_retries cannot be None"):
774775
LlamaStackClient(base_url=base_url, _strict_response_validation=True, max_retries=cast(Any, None))
775776

777+
@pytest.mark.respx(base_url=base_url)
778+
def test_default_stream_cls(self, respx_mock: MockRouter, client: LlamaStackClient) -> None:
779+
class Model(BaseModel):
780+
name: str
781+
782+
respx_mock.post("/foo").mock(return_value=httpx.Response(200, json={"foo": "bar"}))
783+
784+
stream = client.post("/foo", cast_to=Model, stream=True, stream_cls=Stream[Model])
785+
assert isinstance(stream, Stream)
786+
stream.response.close()
787+
776788
@pytest.mark.respx(base_url=base_url)
777789
def test_received_text_for_expected_json(self, respx_mock: MockRouter) -> None:
778790
class Model(BaseModel):
@@ -1685,6 +1697,17 @@ async def test_client_max_retries_validation(self) -> None:
16851697
with pytest.raises(TypeError, match=r"max_retries cannot be None"):
16861698
AsyncLlamaStackClient(base_url=base_url, _strict_response_validation=True, max_retries=cast(Any, None))
16871699

1700+
@pytest.mark.respx(base_url=base_url)
1701+
async def test_default_stream_cls(self, respx_mock: MockRouter, async_client: AsyncLlamaStackClient) -> None:
1702+
class Model(BaseModel):
1703+
name: str
1704+
1705+
respx_mock.post("/foo").mock(return_value=httpx.Response(200, json={"foo": "bar"}))
1706+
1707+
stream = await async_client.post("/foo", cast_to=Model, stream=True, stream_cls=AsyncStream[Model])
1708+
assert isinstance(stream, AsyncStream)
1709+
await stream.response.aclose()
1710+
16881711
@pytest.mark.respx(base_url=base_url)
16891712
async def test_received_text_for_expected_json(self, respx_mock: MockRouter) -> None:
16901713
class Model(BaseModel):

0 commit comments

Comments (0)