fix: #2503 filter reasoning items from nested handoff input (#2508)

seratch · web-flow · commit 6375b974c677 · 2026-02-18T07:45:49.000+09:00
diff --git a/src/agents/handoffs/history.py b/src/agents/handoffs/history.py
@@ -32,6 +32,8 @@
 _SUMMARY_ONLY_INPUT_TYPES = {
     "function_call",
     "function_call_output",
+    # Reasoning items can become orphaned after other summarized items are filtered.
+    "reasoning",
 }
 
 
diff --git a/tests/test_agent_runner.py b/tests/test_agent_runner.py
@@ -13,6 +13,7 @@
 from openai import BadRequestError
 from openai.types.responses import ResponseFunctionToolCall
 from openai.types.responses.response_output_text import AnnotationFileCitation, ResponseOutputText
+from openai.types.responses.response_reasoning_item import ResponseReasoningItem, Summary
 from typing_extensions import TypedDict
 
 from agents import (
@@ -585,6 +586,60 @@ def capture_model_input(data):
     assert has_function_call_output
 
 
+@pytest.mark.asyncio
+async def test_nested_handoff_filters_reasoning_items_from_model_input():
+    model = FakeModel()
+    delegate = Agent(
+        name="delegate",
+        model=model,
+    )
+    triage = Agent(
+        name="triage",
+        model=model,
+        handoffs=[delegate],
+    )
+
+    model.add_multiple_turn_outputs(
+        [
+            [
+                ResponseReasoningItem(
+                    id="reasoning_1",
+                    type="reasoning",
+                    summary=[Summary(text="Thinking about a handoff.", type="summary_text")],
+                ),
+                get_handoff_tool_call(delegate),
+            ],
+            [get_text_message("done")],
+        ]
+    )
+
+    captured_inputs: list[list[dict[str, Any]]] = []
+
+    def capture_model_input(data):
+        if isinstance(data.model_data.input, list):
+            captured_inputs.append(
+                [item for item in data.model_data.input if isinstance(item, dict)]
+            )
+        return data.model_data
+
+    result = await Runner.run(
+        triage,
+        input="user_message",
+        run_config=RunConfig(
+            nest_handoff_history=True,
+            call_model_input_filter=capture_model_input,
+        ),
+    )
+
+    assert result.final_output == "done"
+    assert len(captured_inputs) >= 2
+    handoff_input = captured_inputs[1]
+    handoff_input_types = [
+        item["type"] for item in handoff_input if isinstance(item.get("type"), str)
+    ]
+    assert "reasoning" not in handoff_input_types
+
+
 @pytest.mark.asyncio
 async def test_resume_preserves_filtered_model_input_after_handoff():
     model = FakeModel()
diff --git a/tests/test_agent_runner_streamed.py b/tests/test_agent_runner_streamed.py
@@ -6,6 +6,7 @@
 
 import pytest
 from openai.types.responses import ResponseFunctionToolCall
+from openai.types.responses.response_reasoning_item import ResponseReasoningItem, Summary
 from typing_extensions import TypedDict
 
 from agents import (
@@ -333,6 +334,61 @@ async def test_handoff_filters():
     )
 
 
+@pytest.mark.asyncio
+async def test_streamed_nested_handoff_filters_reasoning_items_from_model_input():
+    model = FakeModel()
+    delegate = Agent(
+        name="delegate",
+        model=model,
+    )
+    triage = Agent(
+        name="triage",
+        model=model,
+        handoffs=[delegate],
+    )
+
+    model.add_multiple_turn_outputs(
+        [
+            [
+                ResponseReasoningItem(
+                    id="reasoning_1",
+                    type="reasoning",
+                    summary=[Summary(text="Thinking about a handoff.", type="summary_text")],
+                ),
+                get_handoff_tool_call(delegate),
+            ],
+            [get_text_message("done")],
+        ]
+    )
+
+    captured_inputs: list[list[dict[str, Any]]] = []
+
+    def capture_model_input(data):
+        if isinstance(data.model_data.input, list):
+            captured_inputs.append(
+                [item for item in data.model_data.input if isinstance(item, dict)]
+            )
+        return data.model_data
+
+    result = Runner.run_streamed(
+        triage,
+        input="user_message",
+        run_config=RunConfig(
+            nest_handoff_history=True,
+            call_model_input_filter=capture_model_input,
+        ),
+    )
+    await consume_stream(result)
+
+    assert result.final_output == "done"
+    assert len(captured_inputs) >= 2
+    handoff_input = captured_inputs[1]
+    handoff_input_types = [
+        item["type"] for item in handoff_input if isinstance(item.get("type"), str)
+    ]
+    assert "reasoning" not in handoff_input_types
+
+
 @pytest.mark.asyncio
 async def test_async_input_filter_supported():
     # DO NOT rename this without updating pyproject.toml
diff --git a/tests/test_handoff_history_duplication.py b/tests/test_handoff_history_duplication.py
@@ -12,13 +12,15 @@
     ResponseOutputMessage,
     ResponseOutputText,
 )
+from openai.types.responses.response_reasoning_item import ResponseReasoningItem, Summary
 
 from agents import Agent
 from agents.handoffs import HandoffInputData, nest_handoff_history
 from agents.items import (
     HandoffCallItem,
     HandoffOutputItem,
     MessageOutputItem,
+    ReasoningItem,
     ToolApprovalItem,
     ToolCallItem,
     ToolCallOutputItem,
@@ -97,6 +99,16 @@ def _create_message_item(agent: Agent) -> MessageOutputItem:
     return MessageOutputItem(agent=agent, raw_item=raw_item, type="message_output_item")
 
 
+def _create_reasoning_item(agent: Agent) -> ReasoningItem:
+    """Create a mock ReasoningItem."""
+    raw_item = ResponseReasoningItem(
+        id="reasoning_123",
+        type="reasoning",
+        summary=[Summary(text="Thinking about handoff", type="summary_text")],
+    )
+    return ReasoningItem(agent=agent, raw_item=raw_item, type="reasoning_item")
+
+
 def _create_tool_approval_item(agent: Agent) -> ToolApprovalItem:
     """Create a mock ToolApprovalItem."""
     raw_item = {
@@ -157,6 +169,28 @@ def test_tool_approval_items_are_skipped(self):
         assert len(nested.pre_handoff_items) == 0
         assert nested.input_items == ()
 
+    def test_pre_handoff_reasoning_items_are_filtered(self):
+        """Verify ReasoningItem in pre_handoff_items is filtered.
+
+        Reasoning is represented in the summary transcript and should not be
+        forwarded as a raw item.
+        """
+        agent = _create_mock_agent()
+
+        handoff_data = HandoffInputData(
+            input_history=({"role": "user", "content": "Hello"},),
+            pre_handoff_items=(_create_reasoning_item(agent),),
+            new_items=(),
+        )
+
+        nested = nest_handoff_history(handoff_data)
+
+        assert len(nested.pre_handoff_items) == 0
+        first_item = nested.input_history[0]
+        assert isinstance(first_item, dict)
+        summary = str(first_item.get("content", ""))
+        assert "reasoning" in summary
+
     def test_new_items_handoff_output_is_filtered_for_input(self):
         """Verify HandoffOutputItem in new_items is filtered from input_items.
 
@@ -209,6 +243,35 @@ def test_message_items_are_preserved_in_new_items(self):
         assert len(nested.input_items) == 1, "MessageOutputItem should be preserved in input_items"
         assert isinstance(nested.input_items[0], MessageOutputItem)
 
+    def test_reasoning_items_are_filtered_from_input_items(self):
+        """Verify ReasoningItem in new_items is filtered from input_items.
+
+        Reasoning is summarized in the conversation transcript and should not be
+        forwarded verbatim in nested handoff model input.
+        """
+        agent = _create_mock_agent()
+
+        handoff_data = HandoffInputData(
+            input_history=({"role": "user", "content": "Hello"},),
+            pre_handoff_items=(),
+            new_items=(
+                _create_reasoning_item(agent),
+                _create_handoff_call_item(agent),
+                _create_handoff_output_item(agent),
+            ),
+        )
+
+        nested = nest_handoff_history(handoff_data)
+
+        assert nested.input_items is not None
+        has_reasoning = any(isinstance(item, ReasoningItem) for item in nested.input_items)
+        assert not has_reasoning, "ReasoningItem should be filtered from input_items"
+
+        first_item = nested.input_history[0]
+        assert isinstance(first_item, dict)
+        summary = str(first_item.get("content", ""))
+        assert "reasoning" in summary
+
     def test_summary_contains_filtered_items_as_text(self):
         """Verify the summary message contains the filtered tool items as text.
 

Original file line number	Diff line number	Diff line change
`@@ -32,6 +32,8 @@`
`32`	`32`	`_SUMMARY_ONLY_INPUT_TYPES = {`
`33`	`33`	`    "function_call",`
`34`	`34`	`    "function_call_output",`
	`35`	`+    # Reasoning items can become orphaned after other summarized items are filtered.`
	`36`	`+    "reasoning",`
`35`	`37`	`}`
`36`	`38`
`37`	`39`