fix: #2806 sanitize AnyLLM responses replay input before validation (#2813)

seratch · web-flow · commit 40aada1a8be9 · 2026-03-31T19:35:22.000+09:00
diff --git a/src/agents/extensions/models/any_llm_model.py b/src/agents/extensions/models/any_llm_model.py
@@ -813,7 +813,7 @@ async def _fetch_responses_response(
 
         list_input = ItemHelpers.input_to_new_input_list(input)
         list_input = _to_dump_compatible(list_input)
-        list_input = self._remove_openai_responses_api_incompatible_fields(list_input)
+        list_input = self._sanitize_any_llm_responses_input(list_input)
 
         parallel_tool_calls = (
             True
@@ -1095,31 +1095,51 @@ def _make_any_llm_responses_params(payload: dict[str, Any]) -> Any:
         AnyLLMResponsesParams = any_llm_responses.ResponsesParams
         return AnyLLMResponsesParams(**payload)
 
-    def _remove_openai_responses_api_incompatible_fields(self, list_input: list[Any]) -> list[Any]:
-        has_provider_data = any(
-            isinstance(item, dict) and item.get("provider_data") for item in list_input
-        )
-        if not has_provider_data:
-            return list_input
+    def _sanitize_any_llm_responses_input(self, list_input: list[Any]) -> list[Any]:
+        """Normalize replayed Responses input into a shape accepted by any-llm.
 
+        any-llm validates replayed items against OpenAI-style input models before the request is
+        handed to the underlying provider. SDK-produced replay items can legitimately carry
+        adapter-only fields such as provider_data or explicit nulls like status=None, which those
+        models reject. Strip those and fake response ids; drop reasoning items with provider_data.
+        """
         result: list[Any] = []
         for item in list_input:
-            cleaned = self._clean_item_for_openai(item)
+            cleaned = self._sanitize_any_llm_responses_value(item)
             if cleaned is not None:
                 result.append(cleaned)
         return result
 
-    def _clean_item_for_openai(self, item: Any) -> Any | None:
-        if not isinstance(item, dict):
-            return item
+    def _sanitize_any_llm_responses_value(self, value: Any) -> Any | None:
+        if isinstance(value, list):
+            sanitized_list = []
+            for item in value:
+                cleaned_item = self._sanitize_any_llm_responses_value(item)
+                if cleaned_item is not None:
+                    sanitized_list.append(cleaned_item)
+            return sanitized_list
+
+        if not isinstance(value, dict):
+            return value
 
-        if item.get("type") == "reasoning" and item.get("provider_data"):
+        # Provider-specific reasoning payloads are not replay-safe across adapter boundaries.
+        if value.get("type") == "reasoning" and value.get("provider_data"):
             return None
-        if item.get("id") == FAKE_RESPONSES_ID:
-            del item["id"]
-        if "provider_data" in item:
-            del item["provider_data"]
-        return item
+
+        cleaned: dict[str, Any] = {}
+        for key, item_value in value.items():
+            if key == "provider_data":
+                continue
+            if key == "id" and item_value == FAKE_RESPONSES_ID:
+                continue
+            if item_value is None:
+                continue
+
+            sanitized = self._sanitize_any_llm_responses_value(item_value)
+            if sanitized is not None:
+                cleaned[key] = sanitized
+
+        return cleaned
 
     def _attach_logprobs_to_output(self, output_items: list[Any], logprobs: list[Any]) -> None:
         from openai.types.responses import ResponseOutputMessage, ResponseOutputText
diff --git a/tests/models/test_any_llm_model.py b/tests/models/test_any_llm_model.py
@@ -4,7 +4,7 @@
 import sys
 import types as pytypes
 from collections.abc import AsyncIterator
-from typing import Any
+from typing import Any, Literal, cast
 
 import pytest
 from openai.types.chat import (
@@ -25,9 +25,18 @@
 )
 from pydantic import BaseModel
 
-from agents import ModelSettings, ModelTracing, __version__
+from agents import (
+    Agent,
+    Handoff,
+    ModelSettings,
+    ModelTracing,
+    Tool,
+    TResponseInputItem,
+    __version__,
+)
 from agents.exceptions import UserError
 from agents.models.chatcmpl_helpers import HEADERS_OVERRIDE
+from agents.models.fake_id import FAKE_RESPONSES_ID
 
 
 class FakeAnyLLMProvider:
@@ -583,6 +592,140 @@ async def test_any_llm_prompt_requests_fail_fast(monkeypatch) -> None:
         )
 
 
+def test_any_llm_responses_input_sanitizer_strips_none_fields_from_reasoning_items() -> None:
+    pytest.importorskip(
+        "any_llm",
+        reason="`any-llm-sdk` is only available when the optional dependency is installed.",
+    )
+    from agents.extensions.models.any_llm_model import AnyLLMModel
+
+    model = AnyLLMModel(model="openai/gpt-5.4-mini")
+    raw_input = [
+        {
+            "id": "rid1",
+            "summary": [{"text": "why", "type": "summary_text"}],
+            "type": "reasoning",
+            "content": [{"type": "reasoning_text", "text": "thinking"}],
+            "status": None,
+            "encrypted_content": None,
+        }
+    ]
+
+    cleaned = model._sanitize_any_llm_responses_input(raw_input)
+
+    assert cleaned == [
+        {
+            "id": "rid1",
+            "summary": [{"text": "why", "type": "summary_text"}],
+            "type": "reasoning",
+            "content": [{"type": "reasoning_text", "text": "thinking"}],
+        }
+    ]
+
+    ResponsesParams = importlib.import_module("any_llm.types.responses").ResponsesParams
+    params = ResponsesParams(model="dummy", input=cleaned)
+    assert isinstance(params.input, list)
+
+
+@pytest.mark.allow_call_model_methods
+@pytest.mark.asyncio
+async def test_any_llm_responses_path_sanitizes_replayed_items_before_validation() -> None:
+    pytest.importorskip(
+        "any_llm",
+        reason="`any-llm-sdk` is only available when the optional dependency is installed.",
+    )
+    from agents.extensions.models.any_llm_model import AnyLLMModel
+
+    class ValidatingProvider:
+        SUPPORTS_RESPONSES = True
+
+        def __init__(self) -> None:
+            self.private_responses_calls: list[dict[str, Any]] = []
+
+        async def aresponses(self, **kwargs: Any) -> Any:
+            raise AssertionError("public aresponses path should not be used in this test")
+
+        async def _aresponses(self, params: Any, **kwargs: Any) -> Response:
+            self.private_responses_calls.append({"params": params, "kwargs": kwargs})
+            return _response("Hello from sanitized replay")
+
+    class TestAnyLLMModel(AnyLLMModel):
+        def __init__(self, provider: ValidatingProvider) -> None:
+            super().__init__(model="openai/gpt-5.4-mini", api="responses")
+            self._provider = provider
+
+        def _get_provider(self) -> Any:
+            return self._provider
+
+    provider = ValidatingProvider()
+    model = TestAnyLLMModel(provider)
+    tools: list[Tool] = []
+    handoffs: list[Handoff[Any, Agent[Any]]] = []
+    stream_flag: Literal[False] = False
+
+    replay_input = cast(
+        list[TResponseInputItem],
+        [
+            {"role": "user", "content": "What's the weather in Tokyo?"},
+            {
+                "id": FAKE_RESPONSES_ID,
+                "summary": [
+                    {"text": "I should call the weather tool first.", "type": "summary_text"}
+                ],
+                "type": "reasoning",
+                "content": [{"type": "reasoning_text", "text": "thinking"}],
+                "status": None,
+                "provider_data": {"model": "anthropic/fake-responses-model"},
+            },
+            {
+                "id": FAKE_RESPONSES_ID,
+                "arguments": '{"city": "Tokyo"}',
+                "call_id": "call_weather_123",
+                "name": "get_weather",
+                "type": "function_call",
+                "status": None,
+                "provider_data": {"model": "anthropic/fake-responses-model"},
+            },
+            {
+                "type": "function_call_output",
+                "call_id": "call_weather_123",
+                "output": "The weather in Tokyo is sunny and 22°C.",
+            },
+        ],
+    )
+
+    response = await model._fetch_responses_response(
+        system_instructions=None,
+        input=replay_input,
+        model_settings=ModelSettings(),
+        tools=tools,
+        output_schema=None,
+        handoffs=handoffs,
+        previous_response_id=None,
+        conversation_id=None,
+        stream=stream_flag,
+        prompt=None,
+    )
+
+    assert response.id == "resp_123"
+    assert len(provider.private_responses_calls) == 1
+    params = provider.private_responses_calls[0]["params"]
+    assert params.input == [
+        {"role": "user", "content": "What's the weather in Tokyo?"},
+        {
+            "arguments": '{"city": "Tokyo"}',
+            "call_id": "call_weather_123",
+            "name": "get_weather",
+            "type": "function_call",
+        },
+        {
+            "type": "function_call_output",
+            "call_id": "call_weather_123",
+            "output": "The weather in Tokyo is sunny and 22°C.",
+        },
+    ]
+
+
 def test_any_llm_provider_passes_api_override() -> None:
     pytest.importorskip(
         "any_llm",