Skip to content

Commit 4baa5a7

Browse files
authored
Merge pull request #107 from redis/feat/litellm-llmclient-abstraction
Feat/litellm llmclient abstraction — replaces the custom LLM wrapper classes in llms.py with a unified LLMClient abstraction layer backed by LiteLLM. All LLM operations — chat completions, embeddings, and LangChain Embeddings instances — now go through a single entry point, reducing maintenance burden and enabling support for multiple providers without code changes. Issue: #105
2 parents 6141eed + 37f4dea commit 4baa5a7

32 files changed: +2,185 additions, −1,898 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,3 +237,4 @@ TASK_MEMORY.md
237237
*.code-workspace
238238
/agent-memory-client/agent-memory-client-java/.gradle/
239239
augment*.md
240+
dev_docs/

CLAUDE.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,9 @@ query = VectorQuery(vector=embedding, vector_field_name="vector", return_fields=
110110

111111
## Critical Rules
112112

113+
### Import Placement
114+
Place all imports at the top of modules, not inside functions. Inline imports should only be used when strictly necessary (e.g., avoiding circular dependencies, optional dependencies, or significant startup performance concerns).
115+
113116
### Authentication
114117
- **PRODUCTION**: Never set `DISABLE_AUTH=true` in production
115118
- **DEVELOPMENT**: Use `DISABLE_AUTH=true` for local testing only
@@ -149,7 +152,11 @@ agent_memory_server/
149152
├── summarization.py # Conversation summarization
150153
├── extraction.py # Topic and entity extraction
151154
├── filters.py # Search filtering logic
152-
├── llms.py # LLM provider integrations
155+
├── llm/ # LLM client package (LiteLLM-based)
156+
│ ├── __init__.py # Re-exports for clean imports
157+
│ ├── client.py # LLMClient class with chat/embedding methods
158+
│ ├── types.py # ChatCompletionResponse, EmbeddingResponse, LLMBackend
159+
│ └── exceptions.py # LLMClientError, ModelValidationError, APIKeyMissingError
153160
├── migrations.py # Database schema migrations
154161
├── docket_tasks.py # Background task definitions
155162
├── cli.py # Command-line interface

agent_memory_server/api.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from agent_memory_server.config import settings
1212
from agent_memory_server.dependencies import HybridBackgroundTasks
1313
from agent_memory_server.filters import SessionId, UserId
14-
from agent_memory_server.llms import get_model_client, get_model_config
14+
from agent_memory_server.llm import LLMClient
1515
from agent_memory_server.logging import get_logger
1616
from agent_memory_server.models import (
1717
AckResponse,
@@ -101,7 +101,7 @@ def _get_effective_token_limit(
101101
return context_window_max
102102
# If model_name is provided, get its max_tokens from our config
103103
if model_name is not None:
104-
model_config = get_model_config(model_name)
104+
model_config = LLMClient.get_model_config(model_name)
105105
return model_config.max_tokens
106106
# Otherwise use a conservative default (GPT-3.5 context window)
107107
return 16000 # Conservative default
@@ -238,9 +238,8 @@ async def _summarize_working_memory(
238238
if current_tokens <= token_threshold:
239239
return memory
240240

241-
# Get model client for summarization
242-
client = await get_model_client(model)
243-
model_config = get_model_config(model)
241+
# Get model config for summarization
242+
model_config = LLMClient.get_model_config(model)
244243
summarization_max_tokens = model_config.max_tokens
245244

246245
# Token allocation for summarization (same logic as original summarize_session)
@@ -305,7 +304,6 @@ async def _summarize_working_memory(
305304
# Generate summary
306305
summary, summary_tokens_used = await _incremental_summary(
307306
model,
308-
client,
309307
memory.context, # Use existing context as base
310308
messages_to_summarize,
311309
)

agent_memory_server/extraction.py

Lines changed: 5 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,7 @@
99
# Lazy-import transformers in get_ner_model to avoid heavy deps at startup
1010
from agent_memory_server.config import settings
1111
from agent_memory_server.filters import DiscreteMemoryExtracted, MemoryType
12-
from agent_memory_server.llms import (
13-
AnthropicClientWrapper,
14-
BedrockClientWrapper,
15-
OpenAIClientWrapper,
16-
get_model_client,
17-
)
12+
from agent_memory_server.llm import LLMClient
1813
from agent_memory_server.logging import get_logger
1914
from agent_memory_server.models import MemoryRecord
2015

@@ -128,15 +123,10 @@ def extract_entities(text: str) -> list[str]:
128123
async def extract_topics_llm(
129124
text: str,
130125
num_topics: int | None = None,
131-
client: OpenAIClientWrapper
132-
| AnthropicClientWrapper
133-
| BedrockClientWrapper
134-
| None = None,
135126
) -> list[str]:
136127
"""
137128
Extract topics from text using the LLM model.
138129
"""
139-
_client = client or await get_model_client(settings.topic_model)
140130
_num_topics = num_topics if num_topics is not None else settings.top_k_topics
141131

142132
prompt = f"""
@@ -152,17 +142,15 @@ async def extract_topics_llm(
152142

153143
async for attempt in AsyncRetrying(stop=stop_after_attempt(3)):
154144
with attempt:
155-
response = await _client.create_chat_completion(
145+
response = await LLMClient.create_chat_completion(
156146
model=settings.generation_model,
157-
prompt=prompt,
147+
messages=[{"role": "user", "content": prompt}],
158148
response_format={"type": "json_object"},
159149
)
160150
try:
161-
topics = json.loads(response.choices[0].message.content)["topics"]
151+
topics = json.loads(response.content)["topics"]
162152
except (json.JSONDecodeError, KeyError):
163-
logger.error(
164-
f"Error decoding JSON: {response.choices[0].message.content}"
165-
)
153+
logger.error(f"Error decoding JSON: {response.content}")
166154
topics = []
167155
if topics:
168156
topics = topics[:_num_topics]
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
"""
2+
LLM client package for unified LLM operations.
3+
4+
This package provides a single entry point for all LLM interactions,
5+
abstracting away the underlying provider (OpenAI, Anthropic, Bedrock, etc.).
6+
7+
Usage:
8+
from agent_memory_server.llm import LLMClient, ChatCompletionResponse
9+
10+
response = await LLMClient.create_chat_completion(
11+
model="gpt-4o",
12+
messages=[{"role": "user", "content": "Hello"}],
13+
)
14+
"""
15+
16+
from agent_memory_server.llm.client import (
17+
LLMClient,
18+
get_model_config,
19+
optimize_query_for_vector_search,
20+
)
21+
from agent_memory_server.llm.exceptions import (
22+
APIKeyMissingError,
23+
LLMClientError,
24+
ModelValidationError,
25+
)
26+
from agent_memory_server.llm.types import (
27+
ChatCompletionResponse,
28+
EmbeddingResponse,
29+
)
30+
31+
32+
__all__ = [
33+
# Client
34+
"LLMClient",
35+
# Convenience functions
36+
"get_model_config",
37+
"optimize_query_for_vector_search",
38+
# Exceptions
39+
"LLMClientError",
40+
"ModelValidationError",
41+
"APIKeyMissingError",
42+
# Types
43+
"ChatCompletionResponse",
44+
"EmbeddingResponse",
45+
]

0 commit comments

Comments (0)