fix: #2258 add normalized to_input_list mode for filtered handoff follow-ups (#2667)

seratch · web-flow · commit e88bcf2e2b71 · 2026-03-14T10:03:51.000+09:00
diff --git a/src/agents/result.py b/src/agents/result.py
@@ -29,7 +29,7 @@
 )
 from .logger import logger
 from .run_context import RunContextWrapper
-from .run_internal.items import run_item_to_input_item
+from .run_internal.items import run_items_to_input_items
 from .run_internal.run_steps import (
     NextStepInterruption,
     ProcessedResponse,
@@ -110,6 +110,40 @@ def _populate_state_from_result(
     return state
 
 
+ToInputListMode = Literal["preserve_all", "normalized"]
+
+
+def _input_items_for_result(
+    result: RunResultBase,
+    *,
+    mode: ToInputListMode,
+    reasoning_item_id_policy: Literal["preserve", "omit"] | None,
+) -> list[TResponseInputItem]:
+    """Return input items for the requested result view.
+
+    ``preserve_all`` keeps the full converted history from ``new_items``. ``normalized`` returns
+    the canonical continuation input when handoff filtering rewrote model history, otherwise it
+    falls back to the same converted history.
+    """
+    session_items = run_items_to_input_items(result.new_items, reasoning_item_id_policy)
+    if mode == "preserve_all":
+        return session_items
+    if mode != "normalized":
+        raise ValueError(f"Unsupported to_input_list mode: {mode}")
+    if not getattr(result, "_replay_from_model_input_items", False):
+        # Most runs never rewrite continuation history, so normalized stays identical to the
+        # historical preserve-all view unless the runner explicitly marked a divergence.
+        return session_items
+
+    model_input_items = getattr(result, "_model_input_items", None)
+    if not isinstance(model_input_items, list):
+        return session_items
+
+    # When the runner marks a divergence, _model_input_items (copied from generated_items) already
+    # reflects the continuation input chosen for the next local run after handoff/input filtering.
+    return run_items_to_input_items(model_input_items, reasoning_item_id_policy)
+
+
 @dataclass
 class RunResultBase(abc.ABC):
     input: str | list[TResponseInputItem]
@@ -145,6 +179,12 @@ class RunResultBase(abc.ABC):
 
     _trace_state: TraceState | None = field(default=None, init=False, repr=False)
     """Serialized trace metadata captured during the run."""
+    _replay_from_model_input_items: bool = field(default=False, init=False, repr=False)
+    """Whether replay helpers should prefer `_model_input_items` over `new_items`.
+
+    This is only set when the runner preserved extra session history items that should not be
+    replayed into the next local run, such as nested handoff history or filtered handoff input.
+    """
 
     @classmethod
     def __get_pydantic_core_schema__(
@@ -208,18 +248,25 @@ def final_output_as(self, cls: type[T], raise_if_incorrect_type: bool = False) -
 
         return cast(T, self.final_output)
 
-    def to_input_list(self) -> list[TResponseInputItem]:
-        """Creates a new input list, merging the original input with all the new items generated."""
+    def to_input_list(
+        self,
+        *,
+        mode: ToInputListMode = "preserve_all",
+    ) -> list[TResponseInputItem]:
+        """Create an input-item view of this run.
+
+        ``mode="preserve_all"`` keeps the historical behavior of converting ``new_items`` into a
+        full plain-item history. ``mode="normalized"`` prefers the canonical continuation input
+        when handoff filtering rewrote model history, while remaining identical for ordinary runs.
+        """
         original_items: list[TResponseInputItem] = ItemHelpers.input_to_new_input_list(self.input)
-        new_items: list[TResponseInputItem] = []
         reasoning_item_id_policy = getattr(self, "_reasoning_item_id_policy", None)
-        for item in self.new_items:
-            converted = run_item_to_input_item(item, reasoning_item_id_policy)
-            if converted is None:
-                continue
-            new_items.append(converted)
-
-        return original_items + new_items
+        replay_items = _input_items_for_result(
+            self,
+            mode=mode,
+            reasoning_item_id_policy=reasoning_item_id_policy,
+        )
+        return original_items + replay_items
 
     @property
     def agent_tool_invocation(self) -> AgentToolInvocation | None:
diff --git a/src/agents/run.py b/src/agents/run.py
@@ -798,6 +798,11 @@ def _with_reasoning_item_id_policy(result: RunResult) -> RunResult:
                                 )
                                 result._current_turn = current_turn
                                 result._model_input_items = list(generated_items)
+                                # Keep normalized replay aligned with the model-facing
+                                # continuation whenever session history preserved extra items.
+                                result._replay_from_model_input_items = list(
+                                    generated_items
+                                ) != list(session_items)
                                 if run_state is not None:
                                     result._trace_state = run_state._trace_state
                                 if session_persistence_enabled:
@@ -932,6 +937,9 @@ def _with_reasoning_item_id_policy(result: RunResult) -> RunResult:
                         )
                         result._current_turn = max_turns
                         result._model_input_items = list(generated_items)
+                        result._replay_from_model_input_items = list(generated_items) != list(
+                            session_items
+                        )
                         if run_state is not None:
                             result._trace_state = run_state._trace_state
                         if session_persistence_enabled and include_in_history:
@@ -1200,6 +1208,9 @@ def _with_reasoning_item_id_policy(result: RunResult) -> RunResult:
                             )
                             result._current_turn = current_turn
                             result._model_input_items = list(generated_items)
+                            result._replay_from_model_input_items = list(generated_items) != list(
+                                session_items
+                            )
                             if run_state is not None:
                                 result._current_turn_persisted_item_count = (
                                     run_state._current_turn_persisted_item_count
@@ -1591,6 +1602,11 @@ def run_streamed(
         streamed_result._model_input_items = (
             list(run_state._generated_items) if run_state is not None else []
         )
+        streamed_result._replay_from_model_input_items = (
+            list(run_state._generated_items) != list(run_state._session_items)
+            if run_state is not None
+            else False
+        )
         streamed_result._reasoning_item_id_policy = resolved_reasoning_item_id_policy
         if run_state is not None:
             streamed_result._trace_state = run_state._trace_state
diff --git a/src/agents/run_internal/agent_runner_helpers.py b/src/agents/run_internal/agent_runner_helpers.py
@@ -271,6 +271,7 @@ def build_interruption_result(
     )
     result._current_turn = current_turn
     result._model_input_items = list(generated_items)
+    result._replay_from_model_input_items = list(generated_items) != list(session_items)
     if run_state is not None:
         result._current_turn_persisted_item_count = run_state._current_turn_persisted_item_count
         result._trace_state = run_state._trace_state
diff --git a/src/agents/run_internal/run_loop.py b/src/agents/run_internal/run_loop.py
@@ -483,6 +483,11 @@ def _sync_conversation_tracking_from_tracker() -> None:
         streamed_result._state = run_state
     if run_state is not None:
         streamed_result._model_input_items = list(run_state._generated_items)
+        # Streamed follow-ups need the same normalized replay signal as sync runs when the
+        # runner's continuation differs from the richer session history.
+        streamed_result._replay_from_model_input_items = list(run_state._generated_items) != list(
+            run_state._session_items
+        )
 
     if run_state is not None:
         run_state._conversation_id = conversation_id
@@ -627,6 +632,9 @@ async def _save_stream_items_without_count(
                     )
                     streamed_result._model_input_items = generated_items
                     streamed_result.new_items = base_session_items + list(turn_session_items)
+                    streamed_result._replay_from_model_input_items = list(
+                        streamed_result._model_input_items
+                    ) != list(streamed_result.new_items)
                     if run_state is not None:
                         update_run_state_after_resume(
                             run_state,
@@ -914,6 +922,9 @@ async def _save_stream_items_without_count(
                 )
                 turn_session_items = session_items_for_turn(turn_result)
                 streamed_result.new_items.extend(turn_session_items)
+                streamed_result._replay_from_model_input_items = list(
+                    streamed_result._model_input_items
+                ) != list(streamed_result.new_items)
                 store_setting = current_agent.model_settings.resolve(
                     run_config.model_settings
                 ).store
diff --git a/tests/test_agent_runner.py b/tests/test_agent_runner.py
@@ -1146,6 +1146,10 @@ async def test_structured_output():
         "should have input: conversation summary, function call, function call result, message, "
         "handoff, handoff output, preamble message, tool call, tool call result, final output"
     )
+    assert len(result.to_input_list(mode="normalized")) == 6, (
+        "should have normalized replay input: conversation summary, carried-forward message, "
+        "preamble message, tool call, tool call result, final output"
+    )
 
     assert result.last_agent == agent_1, "should have handed off to agent_1"
     assert result.final_output == Foo(bar="baz"), "should have structured output"
diff --git a/tests/test_agent_runner_streamed.py b/tests/test_agent_runner_streamed.py
@@ -669,6 +669,10 @@ async def test_structured_output():
         "should have input: conversation summary, function call, function call result, message, "
         "handoff, handoff output, preamble message, tool call, tool call result, final output"
     )
+    assert len(result.to_input_list(mode="normalized")) == 6, (
+        "should have normalized replay input: conversation summary, carried-forward message, "
+        "preamble message, tool call, tool call result, final output"
+    )
 
     assert result.last_agent == agent_1, "should have handed off to agent_1"
     assert result.final_output == Foo(bar="baz"), "should have structured output"
@@ -1398,6 +1402,10 @@ async def test_streaming_events():
         "should have input: conversation summary, function call, function call result, message, "
         "handoff, handoff output, tool call, tool call result, final output"
     )
+    assert len(result.to_input_list(mode="normalized")) == 5, (
+        "should have normalized replay input: conversation summary, carried-forward message, "
+        "tool call, tool call result, final output"
+    )
 
     assert result.last_agent == agent_1, "should have handed off to agent_1"
     assert result.final_output == Foo(bar="baz"), "should have structured output"
diff --git a/tests/test_handoff_history_duplication.py b/tests/test_handoff_history_duplication.py

Original file line number	Diff line number	Diff line change
`@@ -271,6 +271,7 @@ def build_interruption_result(`
`271`	`271`	`)`
`272`	`272`	`result._current_turn = current_turn`
`273`	`273`	`result._model_input_items = list(generated_items)`
	`274`	`+ result._replay_from_model_input_items = list(generated_items) != list(session_items)`
`274`	`275`	`if run_state is not None:`
`275`	`276`	`result._current_turn_persisted_item_count = run_state._current_turn_persisted_item_count`
`276`	`277`	`result._trace_state = run_state._trace_state`