Skip to content

Commit 749f737

Browse files
fix(openai-agents): restore Speech/Transcription/SpeechGroup handlers, remove AgentSpanData misfeature
- _hooks.py: remove the misplaced catch-all `elif span_data:` branch that was shadowing the SpeechSpanData, TranscriptionSpanData, SpeechGroupSpanData, and AgentSpanData branches
- _hooks.py: remove the AgentSpanData handler that incorrectly propagated model settings onto agent spans (per the test spec, agent spans must NOT carry gen_ai.request.* parameters)
- _hooks.py: replace the hardcoded "openai.agent.model.frequency_penalty" string with the GenAIAttributes.GEN_AI_REQUEST_FREQUENCY_PENALTY constant
- tests: fix the dead "llm.usage.*" prefix check, the vestigial "gen_ai.prompt" scan, the hardcoded frequency_penalty string, and an over-long line in test_realtime_session.py

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent d10ff26 commit 749f737

File tree

4 files changed

+7
-43
lines changed

4 files changed

+7
-43
lines changed

packages/opentelemetry-instrumentation-openai-agents/opentelemetry/instrumentation/openai_agents/_hooks.py

Lines changed: 0 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -624,16 +624,6 @@ def on_span_end(self, span):
624624
model_settings = _extract_response_attributes(otel_span, response, trace_content)
625625
self._last_model_settings = model_settings
626626

627-
# Legacy fallback for other span types
628-
elif span_data:
629-
input_data = getattr(span_data, "input", [])
630-
_extract_prompt_attributes(otel_span, input_data, trace_content)
631-
632-
response = getattr(span_data, "response", None)
633-
if response:
634-
model_settings = _extract_response_attributes(otel_span, response, trace_content)
635-
self._last_model_settings = model_settings
636-
637627
elif (
638628
_has_realtime_spans
639629
and SpeechSpanData
@@ -687,33 +677,6 @@ def on_span_end(self, span):
687677
json.dumps([{"role": "user", "content": input_text}]),
688678
)
689679

690-
elif span_data and type(span_data).__name__ == "AgentSpanData":
691-
# For agent spans, add the model settings we stored from the response span
692-
if hasattr(self, "_last_model_settings") and self._last_model_settings:
693-
for key, value in self._last_model_settings.items():
694-
if key == "temperature":
695-
otel_span.set_attribute(
696-
GenAIAttributes.GEN_AI_REQUEST_TEMPERATURE, value
697-
)
698-
elif key == "max_tokens":
699-
otel_span.set_attribute(
700-
GenAIAttributes.GEN_AI_REQUEST_MAX_TOKENS, value
701-
)
702-
elif key == "top_p":
703-
otel_span.set_attribute(
704-
GenAIAttributes.GEN_AI_REQUEST_TOP_P, value
705-
)
706-
elif key == "model":
707-
otel_span.set_attribute(
708-
GenAIAttributes.GEN_AI_REQUEST_MODEL, value
709-
)
710-
elif key == "frequency_penalty":
711-
otel_span.set_attribute(
712-
"openai.agent.model.frequency_penalty", value
713-
)
714-
# Note: prompt_attributes, completion_attributes, and usage tokens are now
715-
# on response spans only
716-
717680
if hasattr(span, "error") and span.error:
718681
otel_span.set_status(Status(StatusCode.ERROR, str(span.error)))
719682
else:

packages/opentelemetry-instrumentation-openai-agents/tests/test_openai_agents.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,11 +50,11 @@ def test_dict_content_serialization(exporter):
5050

5151
spans = exporter.get_finished_spans()
5252

53-
# Look for any spans with prompt/content attributes
53+
# Look for any spans with message content attributes
5454
for span in spans:
5555
for attr_name, attr_value in span.attributes.items():
56-
prompt_content_check = ("prompt" in attr_name and "content" in attr_name) or (
57-
"gen_ai.prompt" in attr_name and "content" in attr_name
56+
prompt_content_check = (
57+
attr_name in ("gen_ai.input.messages", "gen_ai.output.messages")
5858
)
5959
if prompt_content_check:
6060
# All content attributes should be strings, not dicts
@@ -98,7 +98,7 @@ def test_agent_spans(exporter, test_agent):
9898
assert GenAIAttributes.GEN_AI_REQUEST_TEMPERATURE not in agent_span.attributes
9999
assert GenAIAttributes.GEN_AI_REQUEST_MAX_TOKENS not in agent_span.attributes
100100
assert GenAIAttributes.GEN_AI_REQUEST_TOP_P not in agent_span.attributes
101-
assert "openai.agent.model.frequency_penalty" not in agent_span.attributes
101+
assert GenAIAttributes.GEN_AI_REQUEST_FREQUENCY_PENALTY not in agent_span.attributes
102102

103103
# Find the response span (openai.response) - this should contain prompts/completions/usage
104104
response_spans = [s for s in spans if s.name == "openai.response"]

packages/opentelemetry-instrumentation-openai-agents/tests/test_realtime_session.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -697,5 +697,6 @@ def test_response_done_without_usage_still_captures_completion(self, tracer, tra
697697
spans = exporter.get_finished_spans()
698698
llm_spans = [s for s in spans if s.name == "openai.realtime"]
699699
assert len(llm_spans) == 1
700-
assert json.loads(llm_spans[0].attributes.get("gen_ai.output.messages"))[0]["content"] == "Why did the chicken cross the road?"
700+
output = json.loads(llm_spans[0].attributes.get("gen_ai.output.messages"))
701+
assert output[0]["content"] == "Why did the chicken cross the road?"
701702
assert llm_spans[0].attributes.get("gen_ai.usage.input_tokens") is None

packages/opentelemetry-instrumentation-openai-agents/tests/test_recipe_agents_hierarchy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,7 @@ async def test_recipe_agents_hierarchy(exporter, recipe_agents):
286286

287287
# Check for usage
288288
has_usage = any(
289-
key.startswith("gen_ai.usage.") or key.startswith("llm.usage.") for key in response_span.attributes.keys()
289+
key.startswith("gen_ai.usage.") for key in response_span.attributes.keys()
290290
)
291291
assert has_usage, (
292292
f"Response span {i} should have usage attributes, attributes: {dict(response_span.attributes)}"

0 commit comments

Comments (0)