-
Notifications
You must be signed in to change notification settings - Fork 3.7k
Expand file tree
/
Copy pathtest_anthropic_thinking_blocks.py
More file actions
582 lines (502 loc) · 22.2 KB
/
test_anthropic_thinking_blocks.py
File metadata and controls
582 lines (502 loc) · 22.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
"""
Test for Anthropic thinking blocks in conversation history.
This test validates the fix for issue #1704:
- Thinking blocks are properly preserved from Anthropic responses
- Reasoning items are stored in session but not sent back in conversation history
- Non-reasoning models are unaffected
- Token usage is not increased for non-reasoning scenarios
"""
from __future__ import annotations
import json
from typing import Any, cast
from openai.types.chat import ChatCompletionMessageToolCall
from openai.types.chat.chat_completion_message_tool_call import Function
from openai.types.responses import ResponseReasoningItem
from agents.extensions.models.litellm_model import InternalChatCompletionMessage
from agents.models.chatcmpl_converter import Converter
def create_mock_anthropic_response_with_thinking() -> InternalChatCompletionMessage:
    """Build a fake Anthropic assistant message carrying a thinking block,
    mirroring the shape of a real provider response."""
    # The signature is an opaque provider token; it must round-trip verbatim.
    thinking_block = {
        "type": "thinking",
        "thinking": "I need to call the weather function for Paris",
        "signature": "EqMDCkYIBxgCKkBAFZO8EyZwN1hiLctq0YjZnP0KeKgprr+C0PzgDv4GSggnFwrPQHIZ9A5s+paH+DrQBI1+Vnfq3mLAU5lJnoetEgzUEWx/Cv1022ieAvcaDCXdmg1XkMK0tZ8uCCIwURYAAX0uf2wFdnWt9n8whkhmy8ARQD5G2za4R8X5vTqBq8jpJ15T3c1Jcf3noKMZKooCWFVf0/W5VQqpZTgwDkqyTau7XraS+u48YlmJGSfyWMPO8snFLMZLGaGmVJgHfEI5PILhOEuX/R2cEeLuC715f51LMVuxTNzlOUV/037JV6P2ten7D66FnWU9JJMMJJov+DjMb728yQFHwHz4roBJ5ePHaaFP6mDwpqYuG/hai6pVv2TAK1IdKUui/oXrYtU+0gxb6UF2kS1bspqDuN++R8JdL7CMSU5l28pQ8TsH1TpVF4jZpsFbp1Du4rQIULFsCFFg+Edf9tPgyKZOq6xcskIjT7oylAPO37/jhdNknDq2S82PaSKtke3ViOigtM5uJfG521ZscBJQ1K3kwoI/repIdV9PatjOYdsYAQ==",  # noqa: E501
    }
    return InternalChatCompletionMessage(
        role="assistant",
        content="I'll check the weather in Paris for you.",
        reasoning_content="I need to call the weather function for Paris",
        thinking_blocks=[thinking_block],
    )
def test_converter_skips_reasoning_items():
    """
    Unit test to verify that reasoning items are skipped when converting items to messages.
    """
    # A user turn, a reasoning item, and an assistant reply.
    history: list[dict[str, Any]] = [
        {"role": "user", "content": "Hello"},
        {
            "id": "reasoning_123",
            "type": "reasoning",
            "summary": [{"text": "User said hello", "type": "summary_text"}],
        },
        {
            "id": "msg_123",
            "type": "message",
            "role": "assistant",
            "content": [{"type": "output_text", "text": "Hi there!"}],
            "status": "completed",
        },
    ]

    converted = Converter.items_to_messages(history)  # type: ignore[arg-type]

    # Only the user and assistant messages survive; the reasoning item is dropped.
    assert len(converted) == 2
    assert converted[0]["role"] == "user"
    assert converted[1]["role"] == "assistant"

    # The assistant message must not carry any thinking parts.
    body = converted[1].get("content")
    if isinstance(body, list):
        assert all(part.get("type") != "thinking" for part in body)
def test_reasoning_items_preserved_in_message_conversion():
    """
    Test that reasoning content and thinking blocks are properly extracted
    from Anthropic responses and stored in reasoning items.
    """
    # Run the mock provider response through the converter.
    output_items = Converter.message_to_output_items(
        create_mock_anthropic_response_with_thinking()
    )

    # Exactly one reasoning item should come out alongside the message items.
    reasoning = [
        item for item in output_items if getattr(item, "type", None) == "reasoning"
    ]
    assert len(reasoning) == 1
    reasoning_item = reasoning[0]
    assert reasoning_item.summary[0].text == "I need to call the weather function for Paris"

    # When the thinking text is carried through, it lands in content[0].
    if getattr(reasoning_item, "content", None):
        first_part = reasoning_item.content[0]
        assert first_part.type == "reasoning_text"
        assert first_part.text == "I need to call the weather function for Paris"
def test_anthropic_thinking_blocks_with_tool_calls():
    """
    Test for models with extended thinking and interleaved thinking with tool calls.

    This test verifies the Anthropic API's requirement that thinking blocks
    be the first content in assistant messages when reasoning is enabled and
    tool calls are present.
    """
    # Create a message with reasoning, thinking blocks and tool calls
    message = InternalChatCompletionMessage(
        role="assistant",
        content="I'll check the weather for you.",
        reasoning_content="The user wants weather information, I need to call the weather function",
        thinking_blocks=[
            {
                "type": "thinking",
                "thinking": (
                    "The user is asking about weather. "
                    "Let me use the weather tool to get this information."
                ),
                "signature": "TestSignature123",
            },
            {
                "type": "thinking",
                "thinking": "We should use the city Tokyo as the city.",
                "signature": "TestSignature456",
            },
        ],
        tool_calls=[
            ChatCompletionMessageToolCall(
                id="call_123",
                type="function",
                function=Function(name="get_weather", arguments='{"city": "Tokyo"}'),
            )
        ],
    )

    # Step 1: Convert message to output items
    output_items = Converter.message_to_output_items(message)

    # Verify reasoning item exists and contains thinking blocks.
    reasoning_items = [
        item for item in output_items if hasattr(item, "type") and item.type == "reasoning"
    ]
    # Fixed assertion message: the check is for ONE reasoning item (both thinking
    # blocks are folded into a single reasoning item), not two.
    assert len(reasoning_items) == 1, "Should have exactly one reasoning item"
    reasoning_item = reasoning_items[0]

    # Verify thinking text is stored in content
    assert hasattr(reasoning_item, "content") and reasoning_item.content, (
        "Reasoning item should have content"
    )
    assert reasoning_item.content[0].type == "reasoning_text", (
        "Content should be reasoning_text type"
    )

    # Verify full blocks are stored as JSON in encrypted_content so that both
    # thinking and redacted_thinking blocks survive the round-trip verbatim.
    assert isinstance(reasoning_item, ResponseReasoningItem)
    assert reasoning_item.encrypted_content is not None, (
        "Reasoning item should have encrypted_content"
    )
    stored_blocks = json.loads(reasoning_item.encrypted_content)
    assert stored_blocks[0]["signature"] == "TestSignature123", (
        "Signature of first block should be preserved"
    )
    assert stored_blocks[1]["signature"] == "TestSignature456", (
        "Signature of second block should be preserved"
    )

    # Verify tool calls are present
    tool_call_items = [
        item for item in output_items if hasattr(item, "type") and item.type == "function_call"
    ]
    assert len(tool_call_items) == 1, "Should have exactly one tool call"

    # Step 2: Convert output items back to messages
    # Convert items to dicts for the converter (simulating serialization/deserialization)
    items_as_dicts: list[dict[str, Any]] = []
    for item in output_items:
        if hasattr(item, "model_dump"):
            items_as_dicts.append(item.model_dump())
        else:
            items_as_dicts.append(cast(dict[str, Any], item))
    messages = Converter.items_to_messages(
        items_as_dicts,  # type: ignore[arg-type]
        model="anthropic/claude-4-opus",
        preserve_thinking_blocks=True,
    )

    # Find the assistant message with tool calls
    assistant_messages = [
        msg for msg in messages if msg.get("role") == "assistant" and msg.get("tool_calls")
    ]
    assert len(assistant_messages) == 1, "Should have exactly one assistant message with tool calls"
    assistant_msg = assistant_messages[0]

    # Content must start with thinking blocks, not text
    content = assistant_msg.get("content")
    assert content is not None, "Assistant message should have content"
    assert isinstance(content, list) and len(content) > 0, (
        "Assistant message content should be a non-empty list"
    )
    first_content = content[0]
    assert first_content.get("type") == "thinking", (
        f"First content must be 'thinking' type for Anthropic compatibility, "
        f"but got '{first_content.get('type')}'"
    )
    expected_first_thinking = (
        "The user is asking about weather. Let me use the weather tool to get this information."
    )
    assert first_content.get("thinking") == expected_first_thinking, (
        "Thinking content should be preserved"
    )
    # Signature should also be preserved
    assert first_content.get("signature") == "TestSignature123", (
        "Signature should be preserved in thinking block"
    )

    second_content = content[1]
    assert second_content.get("type") == "thinking", (
        f"Second content must be 'thinking' type for Anthropic compatibility, "
        f"but got '{second_content.get('type')}'"
    )
    assert second_content.get("thinking") == "We should use the city Tokyo as the city.", (
        "Thinking content should be preserved"
    )
    # Signature should also be preserved
    assert second_content.get("signature") == "TestSignature456", (
        "Signature should be preserved in thinking block"
    )

    # Fixed assertion message: content[2] is the THIRD entry (the text part
    # that follows both thinking blocks), not the first.
    last_content = content[2]
    assert last_content.get("type") == "text", (
        f"Third content must be 'text' type but got '{last_content.get('type')}'"
    )
    assert last_content.get("text") == "I'll check the weather for you.", (
        "Content text should be preserved"
    )

    # Verify tool calls are preserved
    tool_calls = assistant_msg.get("tool_calls", [])
    assert len(cast(list[Any], tool_calls)) == 1, "Tool calls should be preserved"
    assert cast(list[Any], tool_calls)[0]["function"]["name"] == "get_weather"
def test_items_to_messages_preserves_positional_bool_arguments():
    """
    Preserve positional compatibility for the released items_to_messages signature.
    """
    # Assemble a reasoning response carrying one thinking block and a tool call.
    thinking = {
        "type": "thinking",
        "thinking": (
            "The user is asking about weather. "
            "Let me use the weather tool to get this information."
        ),
        "signature": "TestSignature123",
    }
    weather_call = ChatCompletionMessageToolCall(
        id="call_123",
        type="function",
        function=Function(name="get_weather", arguments='{"city": "Tokyo"}'),
    )
    message = InternalChatCompletionMessage(
        role="assistant",
        content="I'll check the weather for you.",
        reasoning_content="The user wants weather information, I need to call the weather function",
        thinking_blocks=[thinking],
        tool_calls=[weather_call],
    )

    # Round-trip through dicts, as session serialization would.
    serialized: list[dict[str, Any]] = [
        item.model_dump() if hasattr(item, "model_dump") else cast(dict[str, Any], item)
        for item in Converter.message_to_output_items(message)
    ]

    # Deliberately pass everything positionally, exercising the released signature.
    messages = Converter.items_to_messages(
        serialized,  # type: ignore[arg-type]
        "anthropic/claude-4-opus",
        True,
        True,
    )

    with_tool_calls = [
        m for m in messages if m.get("role") == "assistant" and m.get("tool_calls")
    ]
    assert len(with_tool_calls) == 1, "Should have exactly one assistant message with tool calls"
    content = with_tool_calls[0].get("content")
    assert isinstance(content, list) and len(content) > 0, (
        "Positional bool arguments should still preserve thinking blocks"
    )
    assert content[0].get("type") == "thinking", (
        "The third positional argument must continue to map to preserve_thinking_blocks"
    )
def test_anthropic_thinking_blocks_without_tool_calls():
    """
    Test for models with extended thinking WITHOUT tool calls.

    This test verifies that thinking blocks are properly attached to assistant
    messages even when there are no tool calls (fixes issue #2195).
    """
    # A reasoning response with a thinking block but no tool calls at all.
    response = InternalChatCompletionMessage(
        role="assistant",
        content="The weather in Paris is sunny with a temperature of 22°C.",
        reasoning_content="The user wants to know about the weather in Paris.",
        thinking_blocks=[
            {
                "type": "thinking",
                "thinking": "Let me think about the weather in Paris.",
                "signature": "TestSignatureNoTools123",
            }
        ],
        tool_calls=None,  # No tool calls
    )

    # Step 1: provider message -> output items.
    output_items = Converter.message_to_output_items(response)
    reasoning = [i for i in output_items if getattr(i, "type", None) == "reasoning"]
    assert len(reasoning) == 1, "Should have exactly one reasoning item"
    reasoning_item = reasoning[0]

    # The thinking text must land in the reasoning item's content.
    assert hasattr(reasoning_item, "content") and reasoning_item.content, (
        "Reasoning item should have content"
    )
    assert reasoning_item.content[0].type == "reasoning_text", (
        "Content should be reasoning_text type"
    )
    assert reasoning_item.content[0].text == "Let me think about the weather in Paris.", (
        "Thinking text should be preserved"
    )

    # Verify full blocks are stored as JSON in encrypted_content.
    assert isinstance(reasoning_item, ResponseReasoningItem)
    assert reasoning_item.encrypted_content is not None, (
        "Reasoning item should have encrypted_content"
    )
    stored = json.loads(reasoning_item.encrypted_content)
    assert stored[0]["signature"] == "TestSignatureNoTools123", (
        "Signature should be preserved"
    )

    # Exactly one message item accompanies the reasoning item.
    message_count = sum(1 for i in output_items if getattr(i, "type", None) == "message")
    assert message_count == 1, "Should have exactly one message item"

    # Step 2: output items -> next-turn messages with preserve_thinking_blocks=True.
    serialized: list[dict[str, Any]] = [
        i.model_dump() if hasattr(i, "model_dump") else cast(dict[str, Any], i)
        for i in output_items
    ]
    messages = Converter.items_to_messages(
        serialized,  # type: ignore[arg-type]
        model="anthropic/claude-4-opus",
        preserve_thinking_blocks=True,
    )
    assistants = [m for m in messages if m.get("role") == "assistant"]
    assert len(assistants) == 1, "Should have exactly one assistant message"
    content = assistants[0].get("content")

    # Content must start with a thinking block even WITHOUT tool calls.
    assert content is not None, "Assistant message should have content"
    assert isinstance(content, list), (
        f"Assistant message content should be a list when thinking blocks are present, "
        f"but got {type(content)}"
    )
    assert len(content) >= 2, (
        f"Assistant message should have at least 2 content items "
        f"(thinking + text), got {len(content)}"
    )
    head = content[0]
    assert head.get("type") == "thinking", (
        f"First content must be 'thinking' type for Anthropic compatibility, "
        f"but got '{head.get('type')}'"
    )
    assert head.get("thinking") == "Let me think about the weather in Paris.", (
        "Thinking content should be preserved"
    )
    assert head.get("signature") == "TestSignatureNoTools123", (
        "Signature should be preserved in thinking block"
    )
    tail = content[1]
    assert tail.get("type") == "text", (
        f"Second content must be 'text' type, but got '{tail.get('type')}'"
    )
    assert tail.get("text") == "The weather in Paris is sunny with a temperature of 22°C.", (
        "Text content should be preserved"
    )
def test_redacted_thinking_blocks_preserved_across_turns():
    """
    Regression test for Bedrock redacted_thinking blocks being dropped.

    When Claude (via Bedrock) returns redacted_thinking blocks the previous
    serialisation only stored thinking/signature pairs and silently discarded
    any block whose type is "redacted_thinking" (they carry a "data" field
    instead of "thinking"/"signature"). Bedrock then rejected the next turn
    with: "thinking or redacted_thinking blocks in the latest assistant message
    cannot be modified".

    The fix serialises the complete block list as JSON so every block type
    survives the round-trip verbatim.
    """
    redacted_data = "SGVsbG8gV29ybGQ="  # base64 stand-in for encrypted content
    response = InternalChatCompletionMessage(
        role="assistant",
        content="I've investigated the cluster.",
        reasoning_content="Thinking was redacted by the provider.",
        thinking_blocks=[{"type": "redacted_thinking", "data": redacted_data}],
        tool_calls=None,
    )

    # Step 1: model response → output items
    output_items = Converter.message_to_output_items(response)
    reasoning = [i for i in output_items if getattr(i, "type", None) == "reasoning"]
    assert len(reasoning) == 1
    reasoning_item = cast(ResponseReasoningItem, reasoning[0])

    # encrypted_content must be present (the block has no "thinking" text, so
    # content will be empty — encrypted_content is the only carrier).
    assert reasoning_item.encrypted_content is not None, (
        "encrypted_content must be set even for redacted_thinking blocks"
    )
    blocks = json.loads(reasoning_item.encrypted_content)
    assert len(blocks) == 1
    assert blocks[0]["type"] == "redacted_thinking", "Block type must be preserved verbatim"
    assert blocks[0]["data"] == redacted_data, "Encrypted data must be preserved verbatim"

    # Step 2: output items → next-turn messages
    serialized: list[dict[str, Any]] = [
        i.model_dump() if hasattr(i, "model_dump") else cast(dict[str, Any], i)
        for i in output_items
    ]
    messages = Converter.items_to_messages(
        serialized,  # type: ignore[arg-type]
        model="anthropic/claude-sonnet-4-5",
        preserve_thinking_blocks=True,
    )
    assistants = [m for m in messages if m.get("role") == "assistant"]
    assert len(assistants) == 1
    content = assistants[0].get("content")
    assert isinstance(content, list) and len(content) >= 1, (
        "Assistant message must contain the redacted_thinking block"
    )
    leading_block = content[0]
    assert leading_block.get("type") == "redacted_thinking", (
        f"Expected redacted_thinking block, got {leading_block.get('type')}"
    )
    assert leading_block.get("data") == redacted_data, (
        "data field of redacted_thinking block must be preserved verbatim"
    )
def test_mixed_thinking_and_redacted_thinking_blocks_preserved():
    """
    When a response contains both thinking and redacted_thinking blocks,
    all blocks must survive the round-trip in their original order and with
    their original fields intact.
    """
    response = InternalChatCompletionMessage(
        role="assistant",
        content="Done.",
        reasoning_content="Mixed thinking blocks.",
        thinking_blocks=[
            {
                "type": "thinking",
                "thinking": "First, let me check the pods.",
                "signature": "SigAAA",
            },
            {
                "type": "redacted_thinking",
                "data": "cmVkYWN0ZWQ=",
            },
            {
                "type": "thinking",
                "thinking": "Now summarising findings.",
                "signature": "SigBBB",
            },
        ],
        tool_calls=None,
    )

    output_items = Converter.message_to_output_items(response)
    reasoning = [i for i in output_items if getattr(i, "type", None) == "reasoning"]
    assert len(reasoning) == 1
    reasoning_item = cast(ResponseReasoningItem, reasoning[0])
    assert reasoning_item.encrypted_content is not None

    # Fresh literals (not aliases of the input dicts) so the comparison cannot
    # be fooled by in-place mutation of the originals.
    expected_blocks = [
        {
            "type": "thinking",
            "thinking": "First, let me check the pods.",
            "signature": "SigAAA",
        },
        {"type": "redacted_thinking", "data": "cmVkYWN0ZWQ="},
        {
            "type": "thinking",
            "thinking": "Now summarising findings.",
            "signature": "SigBBB",
        },
    ]
    stored = json.loads(reasoning_item.encrypted_content)
    assert len(stored) == 3
    assert stored == expected_blocks

    serialized: list[dict[str, Any]] = [
        i.model_dump() if hasattr(i, "model_dump") else cast(dict[str, Any], i)
        for i in output_items
    ]
    messages = Converter.items_to_messages(
        serialized,  # type: ignore[arg-type]
        model="bedrock/anthropic.claude-sonnet-4-5",
        preserve_thinking_blocks=True,
    )
    assistants = [m for m in messages if m.get("role") == "assistant"]
    assert len(assistants) == 1
    content = assistants[0].get("content")
    assert isinstance(content, list)
    # First three entries are the thinking blocks (in original order)
    assert content[:3] == expected_blocks