From f9459ac120d2622007c34348207d4436c6f57e6b Mon Sep 17 00:00:00 2001 From: Tom Beckenham <34339192+tombeckenham@users.noreply.github.com> Date: Fri, 26 Jun 2026 18:14:59 +1000 Subject: [PATCH 1/2] fix(anthropic): default max_tokens to the model's output ceiling (#849) Anthropic's Messages API requires `max_tokens`, so the text adapter must always send a value. It previously hard-coded `?? 1024` when the caller didn't pass one, silently truncating any non-trivial generation mid-stream with `stop_reason: "max_tokens"`. Now default to the resolved model's real `max_output_tokens` from model-meta (e.g. 64K Sonnet, 128K Opus), falling back to 64K for unrecognized ids. `max_tokens` is a ceiling, not a reservation, so this costs nothing extra. Also log a warning when a response is truncated while using the defaulted cap, so it isn't silently read as the model "doing nothing"; callers that set `max_tokens` explicitly are unaffected. The new id -> max_output_tokens map is kept in lockstep with ANTHROPIC_MODELS by `scripts/sync-provider-models.ts`, so a freshly-synced model resolves to its real ceiling rather than the fallback. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .changeset/anthropic-max-tokens-default.md | 15 ++++ docs/adapters/anthropic.md | 4 + docs/config.json | 3 +- packages/ai-anthropic/src/adapters/text.ts | 30 ++++++- packages/ai-anthropic/src/model-meta.ts | 55 ++++++++++++ .../tests/anthropic-adapter.test.ts | 87 ++++++++++++++++++- .../ai-anthropic/tests/model-meta.test.ts | 29 ++++++- .../ai-core/adapter-configuration/SKILL.md | 10 +++ scripts/sync-provider-models.ts | 58 +++++++++++++ 9 files changed, 285 insertions(+), 6 deletions(-) create mode 100644 .changeset/anthropic-max-tokens-default.md diff --git a/.changeset/anthropic-max-tokens-default.md b/.changeset/anthropic-max-tokens-default.md new file mode 100644 index 000000000..11017103d --- /dev/null +++ b/.changeset/anthropic-max-tokens-default.md @@ -0,0 +1,15 @@ +--- +'@tanstack/ai-anthropic': patch +--- + +Default Anthropic `max_tokens` to the selected model's real output ceiling +(`max_output_tokens` from model metadata — e.g. 64K for Sonnet, 128K for Opus) +when the caller doesn't pass one, instead of a hard-coded `1024` that silently +truncated long responses with `stop_reason: "max_tokens"` (#849). Unknown +models fall back to a safe constant. `max_tokens` is a ceiling, not a +reservation, so this costs nothing unless the model genuinely produces more. + +The adapter also now logs a warning when a response is truncated while using the +defaulted (caller-unspecified) cap, so the truncation isn't silently attributed +to the model "doing nothing". Callers that set `modelOptions.max_tokens` +explicitly are unaffected. 
diff --git a/docs/adapters/anthropic.md b/docs/adapters/anthropic.md index a44da5cfd..8e5b6f85d 100644 --- a/docs/adapters/anthropic.md +++ b/docs/adapters/anthropic.md @@ -136,6 +136,10 @@ const stream = chat({ > If you previously passed `temperature` / `topP` / `maxTokens` at the root of `chat()`, see [Moving Sampling Options into modelOptions](../migration/sampling-options-to-model-options). +#### `max_tokens` default + +Anthropic's Messages API _requires_ `max_tokens` on every request, so the adapter always sends a value. When you don't set `modelOptions.max_tokens`, it defaults to the selected model's full output ceiling (`max_output_tokens` from the model metadata — e.g. 64K for Sonnet, 128K for Opus), falling back to a safe constant for unrecognized models. `max_tokens` is a ceiling, not a reservation — billing is on tokens actually generated — so this default costs nothing extra and avoids the silent mid-response truncation (`stop_reason: "max_tokens"`) that a low default would cause. Set `max_tokens` explicitly only when you want to _cap_ output below the model ceiling. If a response is truncated while using the default cap, the adapter logs a warning (visible with [debug logging](../advanced/debug-logging) enabled). + ### Thinking (Extended Thinking) Enable extended thinking with a token budget. 
This allows Claude to show its reasoning process, which is streamed as `thinking` chunks: diff --git a/docs/config.json b/docs/config.json index 0e8982869..0074c6638 100644 --- a/docs/config.json +++ b/docs/config.json @@ -507,7 +507,8 @@ { "label": "Anthropic", "to": "adapters/anthropic", - "addedAt": "2026-04-15" + "addedAt": "2026-04-15", + "updatedAt": "2026-06-26" }, { "label": "Google Gemini", diff --git a/packages/ai-anthropic/src/adapters/text.ts b/packages/ai-anthropic/src/adapters/text.ts index bdffea41c..5af486fdc 100644 --- a/packages/ai-anthropic/src/adapters/text.ts +++ b/packages/ai-anthropic/src/adapters/text.ts @@ -10,7 +10,10 @@ import { generateId, getAnthropicApiKeyFromEnv, } from '../utils' -import { ANTHROPIC_COMBINED_TOOLS_AND_SCHEMA_MODELS } from '../model-meta' +import { + ANTHROPIC_COMBINED_TOOLS_AND_SCHEMA_MODELS, + getAnthropicDefaultMaxTokens, +} from '../model-meta' import type { ANTHROPIC_MODELS, AnthropicChatModelProviderOptionsByName, @@ -420,7 +423,14 @@ export class AnthropicTextAdapter< validProviderOptions.thinking?.type === 'enabled' ? validProviderOptions.thinking.budget_tokens : undefined - const defaultMaxTokens = modelOptions?.max_tokens ?? 1024 + // Anthropic's Messages API *requires* `max_tokens`, so we must always send a + // value. When the caller doesn't specify one, default to the resolved + // model's real output ceiling (from model-meta) rather than a low constant + // that silently truncates long responses with `stop_reason: "max_tokens"` + // (issue #849). `max_tokens` is a ceiling, not a reservation — billing is on + // tokens actually generated, so a higher default costs nothing extra. + const defaultMaxTokens = + modelOptions?.max_tokens ?? getAnthropicDefaultMaxTokens(this.model) const maxTokens = thinkingBudget && thinkingBudget >= defaultMaxTokens ? 
thinkingBudget + 1 @@ -1181,6 +1191,22 @@ export class AnthropicTextAdapter< break } case 'max_tokens': { + // Surface a warning when the truncating cap was the + // adapter-supplied default (caller didn't pass `max_tokens`), so + // the truncation isn't silently attributed to the model "doing + // nothing" (issue #849). When the caller set `max_tokens` + // themselves, hitting it is their own deliberate ceiling. + if (options.modelOptions?.max_tokens == null) { + const defaultedMaxTokens = getAnthropicDefaultMaxTokens(model) + logger.warn( + `anthropic response truncated at the default max_tokens (${defaultedMaxTokens}) for model=${model}; pass maxTokens (or modelOptions.max_tokens) to raise the output ceiling`, + { + source: 'anthropic.processAnthropicStream', + model, + defaultedMaxTokens, + }, + ) + } yield { type: EventType.RUN_ERROR, model, diff --git a/packages/ai-anthropic/src/model-meta.ts b/packages/ai-anthropic/src/model-meta.ts index e951d9edb..13785e9fa 100644 --- a/packages/ai-anthropic/src/model-meta.ts +++ b/packages/ai-anthropic/src/model-meta.ts @@ -814,6 +814,61 @@ export const ANTHROPIC_MODELS = [ CLAUDE_SONNET_5.id, ] as const +/** + * Fallback `max_tokens` ceiling for a model whose metadata carries no + * `max_output_tokens` (e.g. an unrecognized model id). Anthropic's Messages + * API *requires* `max_tokens`, so the adapter must always send a value. 64K is + * the output ceiling of the current mainstream Claude tier (Sonnet/Haiku 4.5), + * so it's a sane default for an unknown — almost certainly modern — model and + * avoids silently truncating long generations (issue #849). Recognized models + * use their exact `max_output_tokens` from {@link ANTHROPIC_MODEL_MAX_OUTPUT_TOKENS} + * (e.g. 128K for Opus), so this fallback only ever applies to ids not in the + * map. + */ +export const ANTHROPIC_DEFAULT_MAX_OUTPUT_TOKENS = 64_000 + +/** + * Runtime lookup of each model's maximum output-token ceiling, keyed by model + * id. 
Lets the text adapter default the required `max_tokens` request field to + * the model's real ceiling when the caller doesn't specify one, rather than a + * low constant that truncates responses mid-stream (issue #849). + * + * Kept in sync with {@link ANTHROPIC_MODELS} by `scripts/sync-provider-models.ts` + * — when that script adds a model it also inserts the model's `max_output_tokens` + * here, so a freshly-synced model resolves to its real ceiling rather than the + * fallback above. + */ +const ANTHROPIC_MODEL_MAX_OUTPUT_TOKENS: Record = { + [CLAUDE_OPUS_4_6.id]: CLAUDE_OPUS_4_6.max_output_tokens, + [CLAUDE_OPUS_4_5.id]: CLAUDE_OPUS_4_5.max_output_tokens, + [CLAUDE_SONNET_4_6.id]: CLAUDE_SONNET_4_6.max_output_tokens, + [CLAUDE_SONNET_4_5.id]: CLAUDE_SONNET_4_5.max_output_tokens, + [CLAUDE_HAIKU_4_5.id]: CLAUDE_HAIKU_4_5.max_output_tokens, + [CLAUDE_OPUS_4_1.id]: CLAUDE_OPUS_4_1.max_output_tokens, + [CLAUDE_SONNET_4.id]: CLAUDE_SONNET_4.max_output_tokens, + [CLAUDE_SONNET_3_7.id]: CLAUDE_SONNET_3_7.max_output_tokens, + [CLAUDE_OPUS_4.id]: CLAUDE_OPUS_4.max_output_tokens, + [CLAUDE_HAIKU_3_5.id]: CLAUDE_HAIKU_3_5.max_output_tokens, + [CLAUDE_HAIKU_3.id]: CLAUDE_HAIKU_3.max_output_tokens, + [CLAUDE_OPUS_4_6_FAST.id]: CLAUDE_OPUS_4_6_FAST.max_output_tokens, + [CLAUDE_OPUS_4_7.id]: CLAUDE_OPUS_4_7.max_output_tokens, + [CLAUDE_OPUS_4_7_FAST.id]: CLAUDE_OPUS_4_7_FAST.max_output_tokens, + [CLAUDE_OPUS_4_8.id]: CLAUDE_OPUS_4_8.max_output_tokens, + [CLAUDE_OPUS_4_8_FAST.id]: CLAUDE_OPUS_4_8_FAST.max_output_tokens, +} + +/** + * Resolve the default `max_tokens` for a model: its known `max_output_tokens` + * ceiling, or {@link ANTHROPIC_DEFAULT_MAX_OUTPUT_TOKENS} for unknown models. + * Callers that pass an explicit `max_tokens` bypass this entirely. + */ +export function getAnthropicDefaultMaxTokens(model: string): number { + return ( + ANTHROPIC_MODEL_MAX_OUTPUT_TOKENS[model] ?? 
+ ANTHROPIC_DEFAULT_MAX_OUTPUT_TOKENS + ) +} + /** * Anthropic models that support combining `tools` + JSON-Schema-constrained * output in a single streaming Messages request (per issue #605). GA'd diff --git a/packages/ai-anthropic/tests/anthropic-adapter.test.ts b/packages/ai-anthropic/tests/anthropic-adapter.test.ts index 611c38a33..3dda7c9d9 100644 --- a/packages/ai-anthropic/tests/anthropic-adapter.test.ts +++ b/packages/ai-anthropic/tests/anthropic-adapter.test.ts @@ -444,7 +444,7 @@ describe('Anthropic adapter option mapping', () => { expect(payload.top_p).toBe(0.7) }) - it('defaults max_tokens to 1024 when not provided via modelOptions', async () => { + it("defaults max_tokens to the model's max_output_tokens when not provided via modelOptions (#849)", async () => { mocks.betaMessagesCreate.mockResolvedValueOnce(createTextStream('ok')) const adapter = createAdapter('claude-3-7-sonnet') @@ -457,7 +457,90 @@ describe('Anthropic adapter option mapping', () => { } const [payload] = mocks.betaMessagesCreate.mock.calls[0]! - expect(payload.max_tokens).toBe(1024) + // claude-3-7-sonnet's model-meta max_output_tokens is 64_000 — not the old + // hard-coded 1024 floor that silently truncated long responses. + expect(payload.max_tokens).toBe(64_000) + }) + + it('warns when the default max_tokens cap truncates the response (#849)', async () => { + // Stream that ends with stop_reason: "max_tokens" — the model hit the cap. 
+ const truncatedStream = (async function* () { + yield { + type: 'content_block_start', + index: 0, + content_block: { type: 'text', text: '' }, + } + yield { + type: 'content_block_delta', + index: 0, + delta: { type: 'text_delta', text: 'partial output' }, + } + yield { type: 'content_block_stop', index: 0 } + yield { + type: 'message_delta', + delta: { stop_reason: 'max_tokens' }, + usage: { output_tokens: 64_000 }, + } + yield { type: 'message_stop' } + })() + mocks.betaMessagesCreate.mockResolvedValueOnce(truncatedStream) + + const adapter = createAdapter('claude-3-7-sonnet') + + const logger = { + debug: vi.fn(), + info: vi.fn(), + warn: vi.fn(), + error: vi.fn(), + } + + for await (const _ of chat({ + adapter, + messages: [{ role: 'user', content: 'Write a long essay' }], + debug: { logger, errors: true }, + })) { + // consume stream + } + + const truncationWarning = logger.warn.mock.calls.find((call) => + String(call[0]).includes('truncated at the default max_tokens'), + ) + expect(truncationWarning).toBeDefined() + }) + + it('does not warn about truncation when the caller set max_tokens explicitly (#849)', async () => { + const truncatedStream = (async function* () { + yield { + type: 'message_delta', + delta: { stop_reason: 'max_tokens' }, + usage: { output_tokens: 100 }, + } + yield { type: 'message_stop' } + })() + mocks.betaMessagesCreate.mockResolvedValueOnce(truncatedStream) + + const adapter = createAdapter('claude-3-7-sonnet') + + const logger = { + debug: vi.fn(), + info: vi.fn(), + warn: vi.fn(), + error: vi.fn(), + } + + for await (const _ of chat({ + adapter, + messages: [{ role: 'user', content: 'Hi' }], + modelOptions: { max_tokens: 100 } satisfies AnthropicTextProviderOptions, + debug: { logger, errors: true }, + })) { + // consume stream + } + + const truncationWarning = logger.warn.mock.calls.find((call) => + String(call[0]).includes('truncated at the default max_tokens'), + ) + expect(truncationWarning).toBeUndefined() }) it('native 
combined mode (#605): wires outputSchema into output_format alongside tools on Claude 4.5+', async () => { diff --git a/packages/ai-anthropic/tests/model-meta.test.ts b/packages/ai-anthropic/tests/model-meta.test.ts index 3a8c5bc44..50fc3b282 100644 --- a/packages/ai-anthropic/tests/model-meta.test.ts +++ b/packages/ai-anthropic/tests/model-meta.test.ts @@ -1,4 +1,8 @@ -import { describe, expectTypeOf, it } from 'vitest' +import { describe, expect, expectTypeOf, it } from 'vitest' +import { + ANTHROPIC_DEFAULT_MAX_OUTPUT_TOKENS, + getAnthropicDefaultMaxTokens, +} from '../src/model-meta' import type { AnthropicChatModelProviderOptionsByName, AnthropicModelInputModalitiesByName, @@ -780,3 +784,26 @@ describe('Anthropic Model Input Modality Type Assertions', () => { }) }) }) + +describe('getAnthropicDefaultMaxTokens (#849)', () => { + it("returns the model's max_output_tokens for known models", () => { + expect(getAnthropicDefaultMaxTokens('claude-opus-4.8')).toBe(128_000) + expect(getAnthropicDefaultMaxTokens('claude-opus-4-6')).toBe(128_000) + expect(getAnthropicDefaultMaxTokens('claude-sonnet-4-6')).toBe(64_000) + expect(getAnthropicDefaultMaxTokens('claude-3-7-sonnet')).toBe(64_000) + expect(getAnthropicDefaultMaxTokens('claude-3-haiku')).toBe(4_000) + }) + + it('falls back to the safe constant for unknown models', () => { + expect(ANTHROPIC_DEFAULT_MAX_OUTPUT_TOKENS).toBe(64_000) + expect(getAnthropicDefaultMaxTokens('some-future-claude-model')).toBe( + ANTHROPIC_DEFAULT_MAX_OUTPUT_TOKENS, + ) + }) + + it('never returns the old hard-coded 1024 floor for a known model', () => { + expect(getAnthropicDefaultMaxTokens('claude-opus-4.8')).toBeGreaterThan( + 1024, + ) + }) +}) diff --git a/packages/ai/skills/ai-core/adapter-configuration/SKILL.md b/packages/ai/skills/ai-core/adapter-configuration/SKILL.md index 2faa4ecd4..f6be3f7b4 100644 --- a/packages/ai/skills/ai-core/adapter-configuration/SKILL.md +++ b/packages/ai/skills/ai-core/adapter-configuration/SKILL.md @@ 
-297,6 +297,16 @@ Per-provider sampling keys (all live inside `modelOptions`): some sampling options use provider-native names. Ollama nests all sampling under `modelOptions.options`. +> **Anthropic `max_tokens` default:** Anthropic's API _requires_ `max_tokens`, +> so the adapter always sends one. When you omit `modelOptions.max_tokens`, it +> defaults to the selected model's full output ceiling (its `max_output_tokens` +> from model metadata — e.g. 64K for Sonnet, 128K for Opus), not a low constant. +> `max_tokens` is a ceiling, not a reservation (billing is per token generated), +> so leaving it unset is the right default for codegen / agentic / long-form +> output and avoids silent `stop_reason: "max_tokens"` truncation. Set it only to +> cap output below the model ceiling. Other providers treat token limits as +> optional and don't apply this default. + ### 6. Capability Flag: `supportsCombinedToolsAndSchema` Adapters can declare an optional capability method: diff --git a/scripts/sync-provider-models.ts b/scripts/sync-provider-models.ts index efeaf25a5..3dc8233b5 100644 --- a/scripts/sync-provider-models.ts +++ b/scripts/sync-provider-models.ts @@ -43,6 +43,13 @@ interface ProviderConfig { providerOptionsTypeName: string /** Name of the input modalities type map */ inputModalitiesTypeName: string + /** + * Name of the runtime `Record` mapping model id → + * `max_output_tokens`, if the provider maintains one. Anthropic uses this to + * default the required `max_tokens` request field to the model's real ceiling + * (issue #849); other providers treat token limits as optional and omit it. 
+ */ + maxOutputTokensMapName?: string /** The supports block template (minus input modalities, which come from OpenRouter) */ referenceSupportsBody: string /** Valid input modality types for this provider's ModelMeta interface */ @@ -95,6 +102,7 @@ const PROVIDER_MAP: Record = { chatArrayName: 'ANTHROPIC_MODELS', providerOptionsTypeName: 'AnthropicChatModelProviderOptionsByName', inputModalitiesTypeName: 'AnthropicModelInputModalitiesByName', + maxOutputTokensMapName: 'ANTHROPIC_MODEL_MAX_OUTPUT_TOKENS', validInputModalities: ['text', 'image', 'audio', 'video', 'document'], referenceSupportsBody: ` extended_thinking: true, priority_tier: true, @@ -500,6 +508,34 @@ function addToTypeMap( return content.replace(pattern, () => `${match[1]}\n${newEntries}${match[2]}`) } +/** + * Add entries to a runtime object literal like: + * const MAP_NAME: Record = { + * ...existing entries... + * } + * Used for the Anthropic id → max_output_tokens map (issue #849), which is a + * value declaration rather than a `type` alias. + */ +function addToObjectMap( + content: string, + mapName: string, + entries: Array, +): string { + // Match: const MAP_NAME: Record = { ... \n} + const pattern = new RegExp( + `(const ${mapName}: Record = \\{[\\s\\S]*?)(\\n\\})`, + ) + const match = pattern.exec(content) + if (!match) { + console.warn(` Warning: Could not find object map '${mapName}' in file`) + return content + } + + const newEntries = entries.join('\n') + // Use replacer function to prevent $-character interpretation in replacement string + return content.replace(pattern, () => `${match[1]}\n${newEntries}${match[2]}`) +} + // --------------------------------------------------------------------------- // Git-based change detection // --------------------------------------------------------------------------- @@ -697,6 +733,28 @@ async function main() { ) } + // Add to the id → max_output_tokens runtime map (Anthropic only). 
Only + // models whose generated constant actually carries `max_output_tokens` + // (i.e. OpenRouter reported a `max_completion_tokens`) get an entry; the + // rest correctly fall through to the map's constant default. Keeps the map + // in lockstep with the chat-model array so a synced model resolves to its + // real ceiling instead of the fallback (issue #849). + if (config.maxOutputTokensMapName) { + const maxOutputEntries = chatModels + .filter(({ model }) => model.top_provider.max_completion_tokens) + .map( + ({ constName }) => + ` [${constName}${config.arrayRef}]: ${constName}.max_output_tokens,`, + ) + if (maxOutputEntries.length > 0) { + content = addToObjectMap( + content, + config.maxOutputTokensMapName, + maxOutputEntries, + ) + } + } + // Write the modified file await writeFile(config.metaFile, content, 'utf-8') console.log(` Wrote updated file: ${config.metaFile}`) From 92f5b767abd3f841527fbc44a0f602b3f3c60801 Mon Sep 17 00:00:00 2001 From: Tom Beckenham <34339192+tombeckenham@users.noreply.github.com> Date: Fri, 26 Jun 2026 19:02:00 +1000 Subject: [PATCH 2/2] fix(anthropic): clamp non-streaming structured-output max_tokens default (#849) The #849 default of the model's full output ceiling broke the non-streaming `structuredOutput()` path: the Anthropic SDK refuses a non-streaming request whose `max_tokens` could exceed its 10-minute timeout (~21,333 tokens), so `chat({ outputSchema })` on any fallback-path model threw "Streaming is required for operations that may take longer than 10 minutes". `getAnthropicDefaultMaxTokens(model, { stream })` now clamps the default to `ANTHROPIC_MAX_NONSTREAMING_TOKENS` when `stream: false`; the streaming chat path keeps the model's full ceiling. An explicit oversized `max_tokens` still surfaces the SDK's "use streaming" error. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .changeset/anthropic-max-tokens-default.md | 8 ++++ docs/adapters/anthropic.md | 2 + packages/ai-anthropic/src/adapters/text.ts | 14 +++++- packages/ai-anthropic/src/model-meta.ts | 32 +++++++++++-- .../tests/anthropic-adapter.test.ts | 46 +++++++++++++++++++ .../ai-anthropic/tests/model-meta.test.ts | 38 +++++++++++++++ 6 files changed, 135 insertions(+), 5 deletions(-) diff --git a/.changeset/anthropic-max-tokens-default.md b/.changeset/anthropic-max-tokens-default.md index 11017103d..57eef68dd 100644 --- a/.changeset/anthropic-max-tokens-default.md +++ b/.changeset/anthropic-max-tokens-default.md @@ -13,3 +13,11 @@ The adapter also now logs a warning when a response is truncated while using the defaulted (caller-unspecified) cap, so the truncation isn't silently attributed to the model "doing nothing". Callers that set `modelOptions.max_tokens` explicitly are unaffected. + +The non-streaming structured-output path (`structuredOutput()`) clamps this +default to the Anthropic SDK's non-streaming-safe limit (~21K tokens). The SDK +refuses a non-streaming request whose `max_tokens` could exceed its 10-minute +timeout, so without the clamp the full-ceiling default would make every +`chat({ outputSchema })` call on a fallback-path model throw "Streaming is +required for operations that may take longer than 10 minutes". The streaming +chat path keeps the model's full ceiling. diff --git a/docs/adapters/anthropic.md b/docs/adapters/anthropic.md index 8e5b6f85d..0a42a8c2c 100644 --- a/docs/adapters/anthropic.md +++ b/docs/adapters/anthropic.md @@ -140,6 +140,8 @@ const stream = chat({ Anthropic's Messages API _requires_ `max_tokens` on every request, so the adapter always sends a value. When you don't set `modelOptions.max_tokens`, it defaults to the selected model's full output ceiling (`max_output_tokens` from the model metadata — e.g. 
64K for Sonnet, 128K for Opus), falling back to a safe constant for unrecognized models. `max_tokens` is a ceiling, not a reservation — billing is on tokens actually generated — so this default costs nothing extra and avoids the silent mid-response truncation (`stop_reason: "max_tokens"`) that a low default would cause. Set `max_tokens` explicitly only when you want to _cap_ output below the model ceiling. If a response is truncated while using the default cap, the adapter logs a warning (visible with [debug logging](../advanced/debug-logging) enabled). +One exception: structured output (`chat({ outputSchema })`) on models that use the non-streaming finalization path clamps this default to ~21K tokens. The Anthropic SDK rejects a non-streaming request whose `max_tokens` could exceed its 10-minute timeout, so the full ceiling can't be used there. Streaming chat is unaffected. Passing `stream: true` to `chat()` does not lift this clamp on those models, because the final structured-output request is still non-streaming; an explicit `max_tokens` above the limit is not clamped and surfaces the SDK's "Streaming is required" error instead. + ### Thinking (Extended Thinking) Enable extended thinking with a token budget. This allows Claude to show its reasoning process, which is streamed as `thinking` chunks: diff --git a/packages/ai-anthropic/src/adapters/text.ts b/packages/ai-anthropic/src/adapters/text.ts index 5af486fdc..0c9c8bcdd 100644 --- a/packages/ai-anthropic/src/adapters/text.ts +++ b/packages/ai-anthropic/src/adapters/text.ts @@ -266,7 +266,12 @@ export class AnthropicTextAdapter< const { chatOptions, outputSchema } = options const { logger } = chatOptions - const requestParams = this.mapCommonOptionsToAnthropic(chatOptions) + // `structuredOutput()` issues a non-streaming `messages.create({ stream: + // false })` below, so the defaulted `max_tokens` must stay under the SDK's + // non-streaming 10-minute guard (issue #849) — pass `stream: false`. 
+ const requestParams = this.mapCommonOptionsToAnthropic(chatOptions, { + stream: false, + }) // Create a tool that will capture the structured output // Anthropic's SDK requires input_schema with type: 'object' literal @@ -355,6 +360,7 @@ export class AnthropicTextAdapter< private mapCommonOptionsToAnthropic( options: TextOptions, + { stream = true }: { stream?: boolean } = {}, ) { const modelOptions = options.modelOptions @@ -429,8 +435,12 @@ export class AnthropicTextAdapter< // that silently truncates long responses with `stop_reason: "max_tokens"` // (issue #849). `max_tokens` is a ceiling, not a reservation — billing is on // tokens actually generated, so a higher default costs nothing extra. + // For non-streaming requests (the `structuredOutput()` path) the default is + // clamped to the SDK's non-streaming-safe limit so it doesn't trip the + // "streaming required" 10-minute guard — see getAnthropicDefaultMaxTokens. const defaultMaxTokens = - modelOptions?.max_tokens ?? getAnthropicDefaultMaxTokens(this.model) + modelOptions?.max_tokens ?? + getAnthropicDefaultMaxTokens(this.model, { stream }) const maxTokens = thinkingBudget && thinkingBudget >= defaultMaxTokens ? thinkingBudget + 1 diff --git a/packages/ai-anthropic/src/model-meta.ts b/packages/ai-anthropic/src/model-meta.ts index 13785e9fa..4cb9f3978 100644 --- a/packages/ai-anthropic/src/model-meta.ts +++ b/packages/ai-anthropic/src/model-meta.ts @@ -857,16 +857,42 @@ const ANTHROPIC_MODEL_MAX_OUTPUT_TOKENS: Record = { [CLAUDE_OPUS_4_8_FAST.id]: CLAUDE_OPUS_4_8_FAST.max_output_tokens, } +/** + * Largest `max_tokens` the Anthropic SDK permits on a **non-streaming** + * request. The SDK refuses to make a non-streaming call it estimates could + * exceed its 10-minute timeout, computed as + * `(60min * max_tokens) / 128_000 > 10min` — i.e. 
it throws + * `"Streaming is required for operations that may take longer than 10 minutes"` + * once `max_tokens > 128_000 * 10 / 60 ≈ 21_333` + * (`@anthropic-ai/sdk`'s `calculateNonstreamingTimeout`). The text adapter's + * only non-streaming call is the forced-tool `structuredOutput()` request, so + * its defaulted ceiling must stay at or below this; the streaming chat path + * keeps the model's full {@link getAnthropicDefaultMaxTokens} ceiling. We sit + * just under the boundary (`21_333` would round-trip to exactly 10min). This + * caps only the *default* — an explicit oversized `max_tokens` from the caller + * still surfaces the SDK's "use streaming" error, which is the correct signal. + */ +export const ANTHROPIC_MAX_NONSTREAMING_TOKENS = 21_000 + /** * Resolve the default `max_tokens` for a model: its known `max_output_tokens` * ceiling, or {@link ANTHROPIC_DEFAULT_MAX_OUTPUT_TOKENS} for unknown models. * Callers that pass an explicit `max_tokens` bypass this entirely. + * + * Pass `stream: false` for non-streaming requests (the `structuredOutput()` + * path): the result is then clamped to {@link ANTHROPIC_MAX_NONSTREAMING_TOKENS} + * so the defaulted ceiling doesn't trip the SDK's non-streaming 10-minute guard + * (issue #849). Streaming requests (the default) are unaffected and get the + * model's full ceiling. */ -export function getAnthropicDefaultMaxTokens(model: string): number { - return ( +export function getAnthropicDefaultMaxTokens( + model: string, + { stream = true }: { stream?: boolean } = {}, +): number { + const ceiling = ANTHROPIC_MODEL_MAX_OUTPUT_TOKENS[model] ?? ANTHROPIC_DEFAULT_MAX_OUTPUT_TOKENS - ) + return stream ? 
ceiling : Math.min(ceiling, ANTHROPIC_MAX_NONSTREAMING_TOKENS) } /** diff --git a/packages/ai-anthropic/tests/anthropic-adapter.test.ts b/packages/ai-anthropic/tests/anthropic-adapter.test.ts index 3dda7c9d9..8a86baf3f 100644 --- a/packages/ai-anthropic/tests/anthropic-adapter.test.ts +++ b/packages/ai-anthropic/tests/anthropic-adapter.test.ts @@ -2,6 +2,7 @@ import { describe, it, expect, beforeEach, vi } from 'vitest' import { chat, type Tool, type StreamChunk } from '@tanstack/ai' import { AnthropicTextAdapter } from '../src/adapters/text' import type { AnthropicTextProviderOptions } from '../src/adapters/text' +import { ANTHROPIC_MAX_NONSTREAMING_TOKENS } from '../src/model-meta' import { z } from 'zod' const mocks = vi.hoisted(() => { @@ -543,6 +544,51 @@ describe('Anthropic adapter option mapping', () => { expect(truncationWarning).toBeUndefined() }) + it('clamps the default max_tokens on the non-streaming structured-output path so it never trips the SDK 10-minute guard (#849)', async () => { + // The structured-output fallback issues a NON-streaming + // `messages.create({ stream: false })`. The Anthropic SDK throws + // "Streaming is required for operations that may take longer than 10 + // minutes" once max_tokens exceeds ~21_333, so the defaulted ceiling must + // be clamped here even though the streaming chat path keeps the full 64K. 
+ mocks.betaMessagesCreate.mockResolvedValueOnce({ + id: 'msg_structured', + type: 'message', + role: 'assistant', + model: 'claude-3-7-sonnet', + content: [ + { + type: 'tool_use', + id: 'toolu_structured_output', + name: 'structured_output', + input: { recommendation: 'Strat', price: 1299 }, + }, + ], + stop_reason: 'tool_use', + usage: { input_tokens: 10, output_tokens: 20 }, + }) + + const adapter = createAdapter('claude-3-7-sonnet') + + for await (const _ of chat({ + adapter, + messages: [{ role: 'user', content: 'recommend a guitar as json' }], + outputSchema: z.object({ + recommendation: z.string(), + price: z.number(), + }), + stream: true, + })) { + // consume stream + } + + const [payload] = mocks.betaMessagesCreate.mock.calls[0]! + expect(payload.stream).toBe(false) + // Clamped to the non-streaming limit — NOT claude-3-7-sonnet's full 64K + // streaming ceiling, which would make the SDK throw before the request. + expect(payload.max_tokens).toBe(ANTHROPIC_MAX_NONSTREAMING_TOKENS) + expect(payload.max_tokens).toBeLessThanOrEqual(21_333) + }) + it('native combined mode (#605): wires outputSchema into output_format alongside tools on Claude 4.5+', async () => { // Final-turn JSON the model emits when output_format is in play. 
const finalJson = JSON.stringify({ city: 'Berlin', temp: 18 }) diff --git a/packages/ai-anthropic/tests/model-meta.test.ts b/packages/ai-anthropic/tests/model-meta.test.ts index 50fc3b282..283a47368 100644 --- a/packages/ai-anthropic/tests/model-meta.test.ts +++ b/packages/ai-anthropic/tests/model-meta.test.ts @@ -1,6 +1,7 @@ import { describe, expect, expectTypeOf, it } from 'vitest' import { ANTHROPIC_DEFAULT_MAX_OUTPUT_TOKENS, + ANTHROPIC_MAX_NONSTREAMING_TOKENS, getAnthropicDefaultMaxTokens, } from '../src/model-meta' import type { @@ -806,4 +807,41 @@ describe('getAnthropicDefaultMaxTokens (#849)', () => { 1024, ) }) + + it('clamps the default to the non-streaming limit for non-streaming requests (#849)', () => { + // The Anthropic SDK refuses non-streaming requests whose `max_tokens` + // could exceed its 10-minute timeout (~21_333). The streaming path keeps + // the full ceiling; the non-streaming (`structuredOutput`) path must clamp. + expect(ANTHROPIC_MAX_NONSTREAMING_TOKENS).toBeLessThanOrEqual(21_333) + + // Opus 128K and Sonnet 64K both exceed the non-streaming limit → clamped. + expect( + getAnthropicDefaultMaxTokens('claude-opus-4.8', { stream: false }), + ).toBe(ANTHROPIC_MAX_NONSTREAMING_TOKENS) + expect( + getAnthropicDefaultMaxTokens('claude-sonnet-4-6', { stream: false }), + ).toBe(ANTHROPIC_MAX_NONSTREAMING_TOKENS) + // Unknown model fallback (64K) is also above the limit → clamped. + expect( + getAnthropicDefaultMaxTokens('some-future-claude-model', { + stream: false, + }), + ).toBe(ANTHROPIC_MAX_NONSTREAMING_TOKENS) + }) + + it('does not clamp a model whose ceiling is already below the non-streaming limit (#849)', () => { + // claude-3-haiku's 4K ceiling is under the non-streaming limit, so the + // non-streaming path returns the real ceiling, not the (larger) cap. 
+ expect( + getAnthropicDefaultMaxTokens('claude-3-haiku', { stream: false }), + ).toBe(4_000) + }) + + it('keeps the full ceiling for streaming requests (default) (#849)', () => { + expect( + getAnthropicDefaultMaxTokens('claude-opus-4.8', { stream: true }), + ).toBe(128_000) + // Omitting the option defaults to streaming. + expect(getAnthropicDefaultMaxTokens('claude-opus-4.8')).toBe(128_000) + }) })