Finish reasoning summaries before text deltas (#2609)

lavish0000 · web-flow · commit 7351bf650110 · 2026-03-07T00:50:53.000+09:00
diff --git a/src/agents/models/chatcmpl_stream_handler.py b/src/agents/models/chatcmpl_stream_handler.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from collections.abc import AsyncIterator
+from collections.abc import AsyncIterator, Iterator
 from dataclasses import dataclass, field
 from typing import Any
 
@@ -60,6 +60,8 @@ class StreamingState:
     text_content_index_and_output: tuple[int, ResponseOutputText] | None = None
     refusal_content_index_and_output: tuple[int, ResponseOutputRefusal] | None = None
     reasoning_content_index_and_output: tuple[int, ResponseReasoningItem] | None = None
+    active_reasoning_summary_index: int | None = None
+    reasoning_item_done: bool = False
     function_calls: dict[int, ResponseFunctionToolCall] = field(default_factory=dict)
     # Fields for real-time function call streaming
     function_call_streaming: dict[int, bool] = field(default_factory=dict)
@@ -82,6 +84,67 @@ def get_and_increment(self) -> int:
 
 
 class ChatCmplStreamHandler:
+    @classmethod
+    def _finish_reasoning_summary_part(
+        cls,
+        state: StreamingState,
+        sequence_number: SequenceNumber,
+    ) -> Iterator[TResponseStreamEvent]:
+        """Yield the part-done event for the active reasoning summary part, if any.
+
+        Does nothing when there is no reasoning item or no active part. Otherwise the
+        active index is cleared; an event is emitted only if it names a stored summary.
+        """
+        active_index = state.active_reasoning_summary_index
+        if not state.reasoning_content_index_and_output or active_index is None:
+            return
+
+        summaries = state.reasoning_content_index_and_output[1].summary
+        if summaries and active_index < len(summaries):
+            yield ResponseReasoningSummaryPartDoneEvent(
+                item_id=FAKE_RESPONSES_ID,
+                output_index=0,
+                summary_index=active_index,
+                part=DoneEventPart(
+                    text=summaries[active_index].text,
+                    type="summary_text",
+                ),
+                type="response.reasoning_summary_part.done",
+                sequence_number=sequence_number.get_and_increment(),
+            )
+        # The part is closed (or was never valid); nothing is active any more.
+        state.active_reasoning_summary_index = None
+
+    @classmethod
+    def _finish_reasoning_item(
+        cls,
+        state: StreamingState,
+        sequence_number: SequenceNumber,
+    ) -> Iterator[TResponseStreamEvent]:
+        """Emit the reasoning item's closing events exactly once, if an item exists."""
+        if not state.reasoning_content_index_and_output or state.reasoning_item_done:
+            return
+        reasoning_item = state.reasoning_content_index_and_output[1]
+        if reasoning_item.summary:
+            yield from cls._finish_reasoning_summary_part(state, sequence_number)
+        elif reasoning_item.content:
+            # Truthiness check: an empty content list must not reach content[0] (IndexError).
+            yield ResponseReasoningTextDoneEvent(
+                item_id=FAKE_RESPONSES_ID,
+                output_index=0,
+                content_index=0,
+                text=reasoning_item.content[0].text,
+                type="response.reasoning_text.done",
+                sequence_number=sequence_number.get_and_increment(),
+            )
+
+        yield ResponseOutputItemDoneEvent(
+            item=reasoning_item,
+            output_index=0,
+            type="response.output_item.done",
+            sequence_number=sequence_number.get_and_increment(),
+        )
+        state.reasoning_item_done = True
+
     @classmethod
     async def handle_stream(
         cls,
@@ -149,7 +212,7 @@ async def handle_stream(
                 if reasoning_content and not state.reasoning_content_index_and_output:
                     reasoning_item = ResponseReasoningItem(
                         id=FAKE_RESPONSES_ID,
-                        summary=[Summary(text="", type="summary_text")],
+                        summary=[],
                         type="reasoning",
                     )
                     if state.provider_data:
@@ -162,36 +225,37 @@ async def handle_stream(
                         sequence_number=sequence_number.get_and_increment(),
                     )
 
-                    yield ResponseReasoningSummaryPartAddedEvent(
-                        item_id=FAKE_RESPONSES_ID,
-                        output_index=0,
-                        summary_index=0,
-                        part=AddedEventPart(text="", type="summary_text"),
-                        type="response.reasoning_summary_part.added",
-                        sequence_number=sequence_number.get_and_increment(),
-                    )
-
                 if reasoning_content and state.reasoning_content_index_and_output:
-                    # Ensure summary list has at least one element
-                    if not state.reasoning_content_index_and_output[1].summary:
-                        state.reasoning_content_index_and_output[1].summary = [
-                            Summary(text="", type="summary_text")
-                        ]
+                    reasoning_item = state.reasoning_content_index_and_output[1]
+                    if state.active_reasoning_summary_index is None:
+                        summary_index = len(reasoning_item.summary)
+                        reasoning_item.summary.append(Summary(text="", type="summary_text"))
+                        state.active_reasoning_summary_index = summary_index
+
+                        yield ResponseReasoningSummaryPartAddedEvent(
+                            item_id=FAKE_RESPONSES_ID,
+                            output_index=0,
+                            summary_index=summary_index,
+                            part=AddedEventPart(text="", type="summary_text"),
+                            type="response.reasoning_summary_part.added",
+                            sequence_number=sequence_number.get_and_increment(),
+                        )
+
+                    summary_index = state.active_reasoning_summary_index
 
                     yield ResponseReasoningSummaryTextDeltaEvent(
                         delta=reasoning_content,
                         item_id=FAKE_RESPONSES_ID,
                         output_index=0,
-                        summary_index=0,
+                        summary_index=summary_index,
                         type="response.reasoning_summary_text.delta",
                         sequence_number=sequence_number.get_and_increment(),
                     )
 
-                    # Create a new summary with updated text
-                    current_content = state.reasoning_content_index_and_output[1].summary[0]
+                    current_content = reasoning_item.summary[summary_index]
                     updated_text = current_content.text + reasoning_content
                     new_content = Summary(text=updated_text, type="summary_text")
-                    state.reasoning_content_index_and_output[1].summary[0] = new_content
+                    reasoning_item.summary[summary_index] = new_content
 
             # Handle reasoning content from 3rd party platforms
             if hasattr(delta, "reasoning"):
@@ -233,6 +297,19 @@ async def handle_stream(
                     new_text_content = Content(text=updated_text, type="reasoning_text")
                     state.reasoning_content_index_and_output[1].content[0] = new_text_content
 
+            if (
+                state.reasoning_content_index_and_output
+                and state.active_reasoning_summary_index is not None
+                and not (hasattr(delta, "reasoning_content") and delta.reasoning_content)
+                and (
+                    delta.content is not None
+                    or (hasattr(delta, "refusal") and delta.refusal)
+                    or bool(delta.tool_calls)
+                )
+            ):
+                for event in cls._finish_reasoning_summary_part(state, sequence_number):
+                    yield event
+
             # Handle regular content
             if delta.content is not None:
                 if not state.text_content_index_and_output:
@@ -513,37 +590,8 @@ async def handle_stream(
                             sequence_number=sequence_number.get_and_increment(),
                         )
 
-        if state.reasoning_content_index_and_output:
-            if (
-                state.reasoning_content_index_and_output[1].summary
-                and len(state.reasoning_content_index_and_output[1].summary) > 0
-            ):
-                yield ResponseReasoningSummaryPartDoneEvent(
-                    item_id=FAKE_RESPONSES_ID,
-                    output_index=0,
-                    summary_index=0,
-                    part=DoneEventPart(
-                        text=state.reasoning_content_index_and_output[1].summary[0].text,
-                        type="summary_text",
-                    ),
-                    type="response.reasoning_summary_part.done",
-                    sequence_number=sequence_number.get_and_increment(),
-                )
-            elif state.reasoning_content_index_and_output[1].content is not None:
-                yield ResponseReasoningTextDoneEvent(
-                    item_id=FAKE_RESPONSES_ID,
-                    output_index=0,
-                    content_index=0,
-                    text=state.reasoning_content_index_and_output[1].content[0].text,
-                    type="response.reasoning_text.done",
-                    sequence_number=sequence_number.get_and_increment(),
-                )
-            yield ResponseOutputItemDoneEvent(
-                item=state.reasoning_content_index_and_output[1],
-                output_index=0,
-                type="response.output_item.done",
-                sequence_number=sequence_number.get_and_increment(),
-            )
+        for event in cls._finish_reasoning_item(state, sequence_number):
+            yield event
 
         function_call_starting_index = 0
         if state.reasoning_content_index_and_output:
diff --git a/tests/test_reasoning_content.py b/tests/test_reasoning_content.py
@@ -142,6 +142,18 @@ async def patched_fetch_response(self, *args, **kwargs):
     assert reasoning_delta_events[0].delta == "Let me think"
     assert reasoning_delta_events[1].delta == " about this"
 
+    reasoning_done_index = next(
+        index
+        for index, event in enumerate(output_events)
+        if event.type == "response.reasoning_summary_part.done"
+    )
+    first_text_delta_index = next(
+        index
+        for index, event in enumerate(output_events)
+        if event.type == "response.output_text.delta"
+    )
+    assert reasoning_done_index < first_text_delta_index
+
     # verify regular content events were emitted
     content_delta_events = [e for e in output_events if e.type == "response.output_text.delta"]
     assert len(content_delta_events) == 2
@@ -163,6 +175,88 @@ async def patched_fetch_response(self, *args, **kwargs):
     assert response_event.response.output[1].content[0].text == "The answer is 42"
 
 
+@pytest.mark.allow_call_model_methods
+@pytest.mark.asyncio
+async def test_stream_response_keeps_reasoning_item_open_across_interleaved_text(
+    monkeypatch,
+) -> None:
+    """Interleaved reasoning/text deltas yield one summary part per reasoning run.
+
+    The first part must be closed before any text delta is emitted, while the
+    reasoning item itself stays open until the second reasoning run has streamed.
+    """
+    stream_chunks = [
+        create_chunk(create_reasoning_delta("Let me think")),
+        create_chunk(create_content_delta("The answer")),
+        create_chunk(create_reasoning_delta(" more carefully")),
+        create_chunk(create_content_delta(" is 42"), include_usage=True),
+    ]
+
+    async def patched_fetch_response(self, *args, **kwargs):
+        placeholder = Response(
+            id="resp-id",
+            created_at=0,
+            model="fake-model",
+            object="response",
+            output=[],
+            tool_choice="none",
+            tools=[],
+            parallel_tool_calls=False,
+        )
+        return placeholder, create_fake_stream(stream_chunks)
+
+    monkeypatch.setattr(OpenAIChatCompletionsModel, "_fetch_response", patched_fetch_response)
+    model = OpenAIProvider(use_responses=False).get_model("gpt-4")
+    output_events = [
+        event
+        async for event in model.stream_response(
+            system_instructions=None,
+            input="",
+            model_settings=ModelSettings(),
+            tools=[],
+            output_schema=None,
+            handoffs=[],
+            tracing=ModelTracing.DISABLED,
+            previous_response_id=None,
+            conversation_id=None,
+            prompt=None,
+        )
+    ]
+
+    def events_of(event_type):
+        return [ev for ev in output_events if ev.type == event_type]
+
+    def first_position(matches):
+        return next(pos for pos, ev in enumerate(output_events) if matches(ev))
+
+    part_added = events_of("response.reasoning_summary_part.added")
+    part_done = events_of("response.reasoning_summary_part.done")
+    assert [ev.summary_index for ev in part_added] == [0, 1]
+    assert [ev.summary_index for ev in part_done] == [0, 1]
+
+    first_part_done_at = output_events.index(part_done[0])
+    first_text_delta_at = first_position(lambda ev: ev.type == "response.output_text.delta")
+    second_reasoning_delta_at = first_position(
+        lambda ev: ev.type == "response.reasoning_summary_text.delta" and ev.summary_index == 1
+    )
+    reasoning_item_done_at = first_position(
+        lambda ev: ev.type == "response.output_item.done" and ev.item.type == "reasoning"
+    )
+
+    assert first_part_done_at < first_text_delta_at
+    assert first_text_delta_at < second_reasoning_delta_at
+    assert second_reasoning_delta_at < reasoning_item_done_at
+
+    final_event = output_events[-1]
+    assert final_event.type == "response.completed"
+    reasoning_output = final_event.response.output[0]
+    assert isinstance(reasoning_output, ResponseReasoningItem)
+    assert [part.text for part in reasoning_output.summary] == [
+        "Let me think",
+        " more carefully",
+    ]
+
+
 @pytest.mark.allow_call_model_methods
 @pytest.mark.asyncio
 async def test_get_response_with_reasoning_content(monkeypatch) -> None: