Skip to content

Commit 9cce844

Browse files
authored
Merge pull request #1119 from asimurka/query_refactor
LCORE-1213: Refactor of query handlers
2 parents 15754b4 + 6bd6ec2 commit 9cce844

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

49 files changed

+8383
-6275
lines changed

docs/openapi.json

Lines changed: 82 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1293,11 +1293,11 @@
12931293
"/v1/query": {
12941294
"post": {
12951295
"tags": [
1296-
"query_v1"
1296+
"query"
12971297
],
1298-
"summary": "Query Endpoint Handler V1",
1299-
"description": "Handle request to the /query endpoint using Responses API.\n\nThis is a wrapper around query_endpoint_handler_base that provides\nthe Responses API specific retrieve_response and get_topic_summary functions.\n\nReturns:\n QueryResponse: Contains the conversation ID and the LLM-generated response.",
1300-
"operationId": "query_endpoint_handler_v2_v1_query_post",
1298+
"summary": "Query Endpoint Handler",
1299+
"description": "Handle request to the /query endpoint using Responses API.\n\nProcesses a POST request to a query endpoint, forwarding the\nuser's query to a selected Llama Stack LLM and returning the generated response.\n\nReturns:\n QueryResponse: Contains the conversation ID and the LLM-generated response.\n\nRaises:\n HTTPException:\n - 401: Unauthorized - Missing or invalid credentials\n - 403: Forbidden - Insufficient permissions or model override not allowed\n - 404: Not Found - Conversation, model, or provider not found\n - 413: Prompt too long - Prompt exceeded model's context window size\n - 422: Unprocessable Entity - Request validation failed\n - 429: Quota limit exceeded - The token quota for model or user has been exceeded\n - 500: Internal Server Error - Configuration not loaded or other server errors\n - 503: Service Unavailable - Unable to connect to Llama Stack backend",
1300+
"operationId": "query_endpoint_handler_v1_query_post",
13011301
"requestBody": {
13021302
"content": {
13031303
"application/json": {
@@ -1453,6 +1453,26 @@
14531453
}
14541454
}
14551455
},
1456+
"413": {
1457+
"description": "Prompt is too long",
1458+
"content": {
1459+
"application/json": {
1460+
"schema": {
1461+
"$ref": "#/components/schemas/PromptTooLongResponse"
1462+
},
1463+
"examples": {
1464+
"prompt too long": {
1465+
"value": {
1466+
"detail": {
1467+
"cause": "The prompt exceeds the maximum allowed length.",
1468+
"response": "Prompt is too long"
1469+
}
1470+
}
1471+
}
1472+
}
1473+
}
1474+
}
1475+
},
14561476
"422": {
14571477
"description": "Request validation failed",
14581478
"content": {
@@ -1480,7 +1500,7 @@
14801500
"invalid value": {
14811501
"value": {
14821502
"detail": {
1483-
"cause": "Invalid attatchment type: must be one of ['text/plain', 'application/json', 'application/yaml', 'application/xml']",
1503+
"cause": "Invalid attachment type: must be one of ['text/plain', 'application/json', 'application/yaml', 'application/xml']",
14841504
"response": "Invalid attribute value"
14851505
}
14861506
}
@@ -1603,11 +1623,11 @@
16031623
"/v1/streaming_query": {
16041624
"post": {
16051625
"tags": [
1606-
"streaming_query_v1"
1626+
"streaming_query"
16071627
],
1608-
"summary": "Streaming Query Endpoint Handler V1",
1609-
"description": "Handle request to the /streaming_query endpoint using Responses API.\n\nReturns a streaming response using Server-Sent Events (SSE) format with\ncontent type text/event-stream.\n\nReturns:\n StreamingResponse: An HTTP streaming response yielding\n SSE-formatted events for the query lifecycle with content type\n text/event-stream.\n\nRaises:\n HTTPException:\n - 401: Unauthorized - Missing or invalid credentials\n - 403: Forbidden - Insufficient permissions or model override not allowed\n - 404: Not Found - Conversation, model, or provider not found\n - 422: Unprocessable Entity - Request validation failed\n - 429: Too Many Requests - Quota limit exceeded\n - 500: Internal Server Error - Configuration not loaded or other server errors\n - 503: Service Unavailable - Unable to connect to Llama Stack backend",
1610-
"operationId": "streaming_query_endpoint_handler_v2_v1_streaming_query_post",
1628+
"summary": "Streaming Query Endpoint Handler",
1629+
"description": "Handle request to the /streaming_query endpoint using Responses API.\n\nReturns a streaming response using Server-Sent Events (SSE) format with\ncontent type text/event-stream.\n\nReturns:\n SSE-formatted events for the query lifecycle.\n\nRaises:\n HTTPException:\n - 401: Unauthorized - Missing or invalid credentials\n - 403: Forbidden - Insufficient permissions or model override not allowed\n - 404: Not Found - Conversation, model, or provider not found\n - 413: Prompt too long - Prompt exceeded model's context window size\n - 422: Unprocessable Entity - Request validation failed\n - 429: Quota limit exceeded - The token quota for model or user has been exceeded\n - 500: Internal Server Error - Configuration not loaded or other server errors\n - 503: Service Unavailable - Unable to connect to Llama Stack backend",
1630+
"operationId": "streaming_query_endpoint_handler_v1_streaming_query_post",
16111631
"requestBody": {
16121632
"content": {
16131633
"application/json": {
@@ -1731,6 +1751,26 @@
17311751
}
17321752
}
17331753
},
1754+
"413": {
1755+
"description": "Prompt is too long",
1756+
"content": {
1757+
"application/json": {
1758+
"schema": {
1759+
"$ref": "#/components/schemas/PromptTooLongResponse"
1760+
},
1761+
"examples": {
1762+
"prompt too long": {
1763+
"value": {
1764+
"detail": {
1765+
"cause": "The prompt exceeds the maximum allowed length.",
1766+
"response": "Prompt is too long"
1767+
}
1768+
}
1769+
}
1770+
}
1771+
}
1772+
}
1773+
},
17341774
"422": {
17351775
"description": "Request validation failed",
17361776
"content": {
@@ -1758,7 +1798,7 @@
17581798
"invalid value": {
17591799
"value": {
17601800
"detail": {
1761-
"cause": "Invalid attatchment type: must be one of ['text/plain', 'application/json', 'application/yaml', 'application/xml']",
1801+
"cause": "Invalid attachment type: must be one of ['text/plain', 'application/json', 'application/yaml', 'application/xml']",
17621802
"response": "Invalid attribute value"
17631803
}
17641804
}
@@ -3756,7 +3796,7 @@
37563796
"invalid value": {
37573797
"value": {
37583798
"detail": {
3759-
"cause": "Invalid attatchment type: must be one of ['text/plain', 'application/json', 'application/yaml', 'application/xml']",
3799+
"cause": "Invalid attachment type: must be one of ['text/plain', 'application/json', 'application/yaml', 'application/xml']",
37603800
"response": "Invalid attribute value"
37613801
}
37623802
}
@@ -4278,7 +4318,7 @@
42784318
],
42794319
"summary": "Handle A2A Jsonrpc",
42804320
"description": "Handle A2A JSON-RPC requests following the A2A protocol specification.\n\nThis endpoint uses the DefaultRequestHandler from the A2A SDK to handle\nall JSON-RPC requests including message/send, message/stream, etc.\n\nThe A2A SDK application is created per-request to include authentication\ncontext while still leveraging FastAPI's authorization middleware.\n\nAutomatically detects streaming requests (message/stream JSON-RPC method)\nand returns a StreamingResponse to enable real-time chunk delivery.\n\nArgs:\n request: FastAPI request object\n auth: Authentication tuple\n mcp_headers: MCP headers for context propagation\n\nReturns:\n JSON-RPC response or streaming response",
4281-
"operationId": "handle_a2a_jsonrpc_a2a_get",
4321+
"operationId": "handle_a2a_jsonrpc_a2a_post",
42824322
"responses": {
42834323
"200": {
42844324
"description": "Successful Response",
@@ -4296,7 +4336,7 @@
42964336
],
42974337
"summary": "Handle A2A Jsonrpc",
42984338
"description": "Handle A2A JSON-RPC requests following the A2A protocol specification.\n\nThis endpoint uses the DefaultRequestHandler from the A2A SDK to handle\nall JSON-RPC requests including message/send, message/stream, etc.\n\nThe A2A SDK application is created per-request to include authentication\ncontext while still leveraging FastAPI's authorization middleware.\n\nAutomatically detects streaming requests (message/stream JSON-RPC method)\nand returns a StreamingResponse to enable real-time chunk delivery.\n\nArgs:\n request: FastAPI request object\n auth: Authentication tuple\n mcp_headers: MCP headers for context propagation\n\nReturns:\n JSON-RPC response or streaming response",
4299-
"operationId": "handle_a2a_jsonrpc_a2a_get",
4339+
"operationId": "handle_a2a_jsonrpc_a2a_post",
43004340
"responses": {
43014341
"200": {
43024342
"description": "Successful Response",
@@ -7519,6 +7559,33 @@
75197559
"title": "PostgreSQLDatabaseConfiguration",
75207560
"description": "PostgreSQL database configuration.\n\nPostgreSQL database is used by Lightspeed Core Stack service for storing\ninformation about conversation IDs. It can also be leveraged to store\nconversation history and information about quota usage.\n\nUseful resources:\n\n- [Psycopg: connection classes](https://www.psycopg.org/psycopg3/docs/api/connections.html)\n- [PostgreSQL connection strings](https://www.connectionstrings.com/postgresql/)\n- [How to Use PostgreSQL in Python](https://www.freecodecamp.org/news/postgresql-in-python/)"
75217561
},
7562+
"PromptTooLongResponse": {
7563+
"properties": {
7564+
"status_code": {
7565+
"type": "integer",
7566+
"title": "Status Code"
7567+
},
7568+
"detail": {
7569+
"$ref": "#/components/schemas/DetailModel"
7570+
}
7571+
},
7572+
"type": "object",
7573+
"required": [
7574+
"status_code",
7575+
"detail"
7576+
],
7577+
"title": "PromptTooLongResponse",
7578+
"description": "413 Payload Too Large - Prompt is too long.",
7579+
"examples": [
7580+
{
7581+
"detail": {
7582+
"cause": "The prompt exceeds the maximum allowed length.",
7583+
"response": "Prompt is too long"
7584+
},
7585+
"label": "prompt too long"
7586+
}
7587+
]
7588+
},
75227589
"ProviderHealthStatus": {
75237590
"properties": {
75247591
"provider_id": {
@@ -7967,7 +8034,7 @@
79678034
"truncated": {
79688035
"type": "boolean",
79698036
"title": "Truncated",
7970-
"description": "Whether conversation history was truncated",
8037+
"description": "Deprecated: Whether conversation history was truncated",
79718038
"default": false,
79728039
"examples": [
79738040
false,
@@ -9329,7 +9396,7 @@
93299396
},
93309397
{
93319398
"detail": {
9332-
"cause": "Invalid attatchment type: must be one of ['text/plain', 'application/json', 'application/yaml', 'application/xml']",
9399+
"cause": "Invalid attachment type: must be one of ['text/plain', 'application/json', 'application/yaml', 'application/xml']",
93339400
"response": "Invalid attribute value"
93349401
},
93359402
"label": "invalid value"

src/app/endpoints/README.md

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,12 @@ Handler for REST API call to authorized endpoint.
1212
## [config.py](config.py)
1313
Handler for REST API call to retrieve service configuration.
1414

15-
## [conversations.py](conversations.py)
16-
Handler for REST API calls to manage conversation history.
15+
## [conversations_v1.py](conversations_v1.py)
16+
Handler for REST API calls to manage conversation history using Conversations API.
1717

1818
## [conversations_v2.py](conversations_v2.py)
1919
Handler for REST API calls to manage conversation history.
2020

21-
## [conversations_v3.py](conversations_v3.py)
22-
Handler for REST API calls to manage conversation history using Conversations API.
23-
2421
## [feedback.py](feedback.py)
2522
Handler for REST API endpoint for user feedback.
2623

@@ -43,10 +40,7 @@ Handler for REST API call to list available models.
4340
Handler for REST API calls to list and retrieve available providers.
4441

4542
## [query.py](query.py)
46-
Handler for REST API call to provide answer to query.
47-
48-
## [query_v2.py](query_v2.py)
49-
Handler for REST API call to provide answer to query using Response API.
43+
Handler for REST API call to provide answer to query using Responses API.
5044

5145
## [rags.py](rags.py)
5246
Handler for REST API calls to list and retrieve available RAGs.
@@ -61,10 +55,7 @@ Handler for the / endpoint.
6155
Handler for REST API call to list available shields.
6256

6357
## [streaming_query.py](streaming_query.py)
64-
Handler for REST API call to provide answer to streaming query.
65-
66-
## [streaming_query_v2.py](streaming_query_v2.py)
67-
Streaming query handler using Responses API (v2).
58+
Handler for REST API call to provide answer to streaming query using Responses API.
6859

6960
## [tools.py](tools.py)
7061
Handler for REST API call to list available tools from MCP servers.

src/app/endpoints/a2a.py

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,7 @@
3636
from starlette.responses import Response, StreamingResponse
3737

3838
from a2a_storage import A2AContextStore, A2AStorageFactory
39-
from app.endpoints.query import (
40-
evaluate_model_hints,
41-
select_model_and_provider_id,
42-
)
43-
from app.endpoints.streaming_query_v2 import retrieve_response
39+
4440
from authentication import get_auth_dependency
4541
from authentication.interface import AuthTuple
4642
from authorization.middleware import authorize
@@ -49,7 +45,11 @@
4945
from models.config import Action
5046
from models.requests import QueryRequest
5147
from utils.mcp_headers import mcp_headers_dependency
52-
from utils.responses import extract_text_from_response_output_item
48+
from utils.responses import (
49+
extract_text_from_response_output_item,
50+
prepare_responses_params,
51+
)
52+
from utils.suid import normalize_conversation_id
5353
from version import __version__
5454

5555
logger = logging.getLogger("app.endpoints.handlers")
@@ -317,23 +317,17 @@ async def _process_task_streaming( # pylint: disable=too-many-locals
317317
# Get LLM client and select model
318318
client = AsyncLlamaStackClientHolder().get_client()
319319
try:
320-
llama_stack_model_id, _model_id, _provider_id = (
321-
select_model_and_provider_id(
322-
await client.models.list(),
323-
*evaluate_model_hints(
324-
user_conversation=None, query_request=query_request
325-
),
326-
)
327-
)
328-
329-
# Stream response from LLM using the Responses API
330-
stream, conversation_id = await retrieve_response(
320+
responses_params = await prepare_responses_params(
331321
client,
332-
llama_stack_model_id,
333322
query_request,
323+
None,
334324
self.auth_token,
335-
mcp_headers=self.mcp_headers,
325+
self.mcp_headers,
326+
stream=True,
327+
store=True,
336328
)
329+
# Stream response from LLM using the Responses API
330+
stream = await client.responses.create(**responses_params.model_dump())
337331
except APIConnectionError as e:
338332
error_message = (
339333
f"Unable to connect to Llama Stack backend service: {str(e)}. "
@@ -356,6 +350,9 @@ async def _process_task_streaming( # pylint: disable=too-many-locals
356350
return
357351

358352
# Persist conversation_id for next turn in same A2A context
353+
conversation_id = conversation_id or normalize_conversation_id(
354+
responses_params.conversation
355+
)
359356
if conversation_id:
360357
await context_store.set(a2a_context_id, conversation_id)
361358
logger.info(
@@ -379,7 +376,7 @@ async def _process_task_streaming( # pylint: disable=too-many-locals
379376
context_id=context_id,
380377
final=False,
381378
metadata={
382-
"model": llama_stack_model_id,
379+
"model": responses_params.model,
383380
"conversation_id": conversation_id,
384381
},
385382
)

0 commit comments

Comments
 (0)