From 5d296eac8b1b4e0f3be38efd28874e3a273bcaec Mon Sep 17 00:00:00 2001
From: betegon <miguelbetegongarcia@gmail.com>
Date: Mon, 11 May 2026 18:10:02 +0200
Subject: [PATCH 1/8] fix(init): recover from stale-step resume when workflow
 already advanced
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the CLI resumes a step (e.g. select-features), the server runs it
and immediately starts the next step inline (plan-codemods runs its LLM
agent, ~15s). If the HTTP response drops before reaching the CLI (network
blip, CF timeout), the wizard's retry logic resumes the same step again —
but Mastra now returns 500 "step X was not suspended" because the workflow
has already moved on. Every retry hits the same 500, generating up to 12
Sentry events per incident.

Two fixes:

1. Pass `retries: 0` to MastraClient so the client's built-in retry logic
   doesn't stack on top of resumeWithRetry's own retries (was 4×4 = 16
   attempts; now 1×4 = 4).

2. When resumeAsync throws a "not suspended" error, call
   workflow.runById() to fetch the current run state and return it to the
   main loop. The loop then continues from whichever step is actually
   suspended — turning what was a fatal loop into a transparent recovery.

Fixes CLI-SERVER-9.
---
 src/lib/init/wizard-runner.ts | 58 +++++++++++++++++++++++++++++++++--
 1 file changed, 56 insertions(+), 2 deletions(-)
diff --git a/src/lib/init/wizard-runner.ts b/src/lib/init/wizard-runner.ts
index b8f79c283..d0cb20ee3 100644
--- a/src/lib/init/wizard-runner.ts
+++ b/src/lib/init/wizard-runner.ts
@@ -423,17 +423,54 @@ const MAX_RESUME_RETRIES = 3;
 const RETRY_BACKOFF_MS = [2000, 4000, 8000];
 
 type ResumeRetryArgs = {
-  run: { resumeAsync: (args: Record<string, unknown>) => Promise<unknown> };
+  run: {
+    resumeAsync: (args: Record<string, unknown>) => Promise<unknown>;
+    readonly runId: string;
+  };
+  workflow: {
+    runById: (runId: string, opts?: { fields?: string[] }) => Promise<unknown>;
+  };
   stepId: string;
   resumeData: Record<string, unknown>;
   tracingOptions: Record<string, unknown>;
   ui: WizardUI;
 };
 
+/**
+ * Detect Mastra's "step not suspended" 500 — means the server already
+ * processed this step (our previous request succeeded but the response was
+ * dropped before we received it). The MastraClientError message embeds the
+ * server body, e.g.:
+ *   "HTTP error! status: 500 - {"error":"This workflow step 'X' was not suspended..."}"
+ */
+function isStepAlreadyAdvancedError(err: unknown): boolean {
+  return err instanceof Error && err.message.includes("not suspended");
+}
+
+/**
+ * Recover from a stale-step retry by fetching the current run state.
+ * If the workflow has already advanced (e.g. plan-codemods is now suspended),
+ * the returned WorkflowRunResult lets the main loop continue from the right step.
+ */
+async function tryRecoverCurrentRunState(
+  workflow: ResumeRetryArgs["workflow"],
+  runId: string
+): Promise<WorkflowRunResult | null> {
+  try {
+    const state = await workflow.runById(runId, {
+      fields: ["steps", "activeStepsPath"],
+    });
+    return assertWorkflowResult(state);
+  } catch {
+    return null;
+  }
+}
+
+// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: retry loop branches across transient errors, stale-step recovery, and backoff
 async function resumeWithRetry(
   args: ResumeRetryArgs
 ): Promise<WorkflowRunResult> {
-  const { run, stepId, resumeData, tracingOptions, ui } = args;
+  const { run, workflow, stepId, resumeData, tracingOptions, ui } = args;
   let lastError: unknown;
   for (let attempt = 0; attempt <= MAX_RESUME_RETRIES; attempt++) {
     try {
@@ -458,6 +495,18 @@ async function resumeWithRetry(
       return assertWorkflowResult(raw);
     } catch (err) {
       lastError = err;
+      // "Step not suspended" means the server processed our step but the
+      // response was dropped (network blip, CF response timeout, etc.).
+      // Retrying the same step will always 500. Fetch the current run state
+      // so the main loop can continue from whichever step is actually suspended.
+      if (isStepAlreadyAdvancedError(err)) {
+        ui.clearOverlay?.();
+        const recovered = await tryRecoverCurrentRunState(workflow, run.runId);
+        if (recovered) {
+          return recovered;
+        }
+        // Recovery failed — fall through to normal retry.
+      }
       if (attempt === MAX_RESUME_RETRIES) {
         ui.clearOverlay?.();
         throw err;
@@ -546,6 +595,10 @@ export async function runWizard(initialOptions: WizardOptions): Promise<void> {
     baseUrl: MASTRA_API_URL,
     headers: token ? { Authorization: `Bearer ${token}` } : {},
     abortSignal: abortController.signal,
+    // Disable Mastra's built-in retries — resumeWithRetry is the sole retry
+    // layer. Without this, each CLI retry attempt triggers 4 Mastra-internal
+    // attempts, producing up to 12 Sentry events per single wizard failure.
+    retries: 0,
     fetch: ((url, init) => {
       const traceData = getTraceData();
       // Preserve `init.signal` via the spread — MastraClient may pass its
@@ -680,6 +733,7 @@ export async function runWizard(initialOptions: WizardOptions): Promise<void> {
 
       result = await resumeWithRetry({
         run,
+        workflow,
         stepId: extracted.stepId,
         resumeData,
         tracingOptions,

From c80ac694b063441c2dcdaee67507127a4c65ee13 Mon Sep 17 00:00:00 2001
From: betegon <miguelbetegongarcia@gmail.com>
Date: Tue, 12 May 2026 16:44:11 +0200
Subject: [PATCH 2/8] test(init): add wizard-runner coverage for retry recovery
 and edge cases
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Increases line coverage from 80% to 92.5%.

New tests cover:
- resumeWithRetry stale-step recovery: server already advanced (runById
  returns new state), recovery fails + normal retry takes over, and the
  recoveryAttempted guard preventing duplicate runById calls
- workflow exit code mapping (20→CONFIG, 30→DEPS, 40/41→CODEMOD, 50→VERIFY)
- start-request failure path (Connection failed spinner + WizardError)
- assertWorkflowResult with unrecognised status
- assertSuspendPayload with non-object payload
- extractSuspendPayload fallback loop (payload in alternate step)
- setStep completed/in_progress transitions across two steps
- detect-platform custom spinner label when existingProject.platform is set

Also adds the recoveryAttempted guard to resumeWithRetry so state recovery
is only attempted once per call even when multiple retries hit the same
"not suspended" error.
---
 src/lib/init/wizard-runner.ts       |   5 +-
 test/lib/init/wizard-runner.test.ts | 309 +++++++++++++++++++++++++++-
 2 files changed, 312 insertions(+), 2 deletions(-)

diff --git a/src/lib/init/wizard-runner.ts b/src/lib/init/wizard-runner.ts
index d0cb20ee3..953b3ad1c 100644
--- a/src/lib/init/wizard-runner.ts
+++ b/src/lib/init/wizard-runner.ts
@@ -472,6 +472,7 @@ async function resumeWithRetry(
 ): Promise<WorkflowRunResult> {
   const { run, workflow, stepId, resumeData, tracingOptions, ui } = args;
   let lastError: unknown;
+  let recoveryAttempted = false;
   for (let attempt = 0; attempt <= MAX_RESUME_RETRIES; attempt++) {
     try {
       if (attempt > 0) {
@@ -499,7 +500,9 @@ async function resumeWithRetry(
       // response was dropped (network blip, CF response timeout, etc.).
       // Retrying the same step will always 500. Fetch the current run state
       // so the main loop can continue from whichever step is actually suspended.
-      if (isStepAlreadyAdvancedError(err)) {
+      // Only attempt recovery once — if it fails, fall through to normal retries.
+      if (isStepAlreadyAdvancedError(err) && !recoveryAttempted) {
+        recoveryAttempted = true;
         ui.clearOverlay?.();
         const recovered = await tryRecoverCurrentRunState(workflow, run.runId);
         if (recovered) {
diff --git a/test/lib/init/wizard-runner.test.ts b/test/lib/init/wizard-runner.test.ts
index 70b882a75..068c113c8 100644
--- a/test/lib/init/wizard-runner.test.ts
+++ b/test/lib/init/wizard-runner.test.ts
@@ -12,7 +12,7 @@ import { MastraClient } from "@mastra/client-js";
 import * as banner from "../../../src/lib/banner.js";
 import { ENV_VAR_AGENTS } from "../../../src/lib/detect-agent.js";
 import { setEnv } from "../../../src/lib/env.js";
-import { WizardError } from "../../../src/lib/errors.js";
+import { EXIT, WizardError } from "../../../src/lib/errors.js";
 import { WizardCancelledError } from "../../../src/lib/init/clack-utils.js";
 // biome-ignore lint/performance/noNamespaceImport: spyOn requires object reference
 import * as fmt from "../../../src/lib/init/formatters.js";
@@ -92,6 +92,8 @@ let mockStartResult: WorkflowRunResult;
 let mockResumeResults: WorkflowRunResult[];
 let resumeCallCount = 0;
 let startAsyncMock: ReturnType<typeof mock>;
+let mockRunByIdResult: WorkflowRunResult | Error;
+let runByIdMock: ReturnType<typeof mock>;
 
 let getUISpy: ReturnType<typeof spyOn>;
 let formatBannerSpy: ReturnType<typeof spyOn>;
@@ -151,6 +153,7 @@ beforeEach(() => {
   mockStartResult = { status: "success", result: { platform: "React" } };
   mockResumeResults = [];
   resumeCallCount = 0;
+  mockRunByIdResult = new Error("runById not configured");
   process.exitCode = 0;
 
   spinnerMock.start.mockClear();
@@ -212,7 +215,13 @@ beforeEach(() => {
   );
 
   startAsyncMock = mock(() => Promise.resolve(mockStartResult));
+  runByIdMock = mock(() =>
+    mockRunByIdResult instanceof Error
+      ? Promise.reject(mockRunByIdResult)
+      : Promise.resolve(mockRunByIdResult)
+  );
   const run = {
+    runId: "test-run-id",
     startAsync: startAsyncMock,
     resumeAsync: mock(() => {
       const result = mockResumeResults[resumeCallCount] ?? {
@@ -224,6 +233,7 @@ beforeEach(() => {
   };
   const workflow = {
     createRun: mock(() => Promise.resolve(run)),
+    runById: runByIdMock,
   };
   capturedClientOptions = [];
   getWorkflowSpy = spyOn(
@@ -890,3 +900,300 @@ describe("runWizard — MastraClient lifecycle", () => {
     expect(capturedClientOptions[0]?.abortSignal?.aborted).toBe(true);
   });
 });
+
+// ─── Additional coverage tests ───────────────────────────────────────────────
+
+describe("runWizard — workflow exit codes", () => {
+  // handleFinalResult calls mapWorkflowExitCode when the workflow result
+  // carries a non-zero exitCode. Each case maps a server-internal code to
+  // the CLI's semantic EXIT constant.
+  test.each([
+    [20, EXIT.CONFIG],
+    [30, EXIT.WIZARD_DEPS],
+    [40, EXIT.WIZARD_CODEMOD],
+    [41, EXIT.WIZARD_CODEMOD],
+    [50, EXIT.WIZARD_VERIFY],
+    // 999 is an unknown code; also exercises the default branch of mapWorkflowExitCode
+    [999, EXIT.WIZARD],
+  ])("maps workflow exit code %s to the expected EXIT constant", async (workflowCode, expectedExitCode) => {
+    mockStartResult = {
+      status: "success",
+      result: { exitCode: workflowCode },
+    };
+
+    const err = await runWizard(makeOptions()).catch((e) => e);
+
+    expect(err).toBeInstanceOf(WizardError);
+    expect((err as WizardError).exitCode).toBe(expectedExitCode);
+  });
+});
+
+describe("runWizard — resumeWithRetry stale-step recovery", () => {
+  const toolPayload: ToolPayload = {
+    type: "tool",
+    operation: "run-commands",
+    cwd: "/tmp/test",
+    params: { commands: ["npm install"] },
+  };
+
+  function makeStaleStepRun(resumeAsyncImpl: () => Promise<WorkflowRunResult>) {
+    let runByIdRef: ReturnType<typeof mock>;
+    getWorkflowSpy.mockImplementation(function (this: MastraClient) {
+      capturedClientOptions.push(
+        (this as unknown as { options: { abortSignal?: AbortSignal } }).options
+      );
+      runByIdRef = runByIdMock;
+      return {
+        createRun: mock(() =>
+          Promise.resolve({
+            runId: "test-run-id",
+            startAsync: startAsyncMock,
+            resumeAsync: mock(resumeAsyncImpl),
+          })
+        ),
+        runById: runByIdRef,
+      } as any;
+    });
+  }
+
+  function staleStepError(): Error {
+    return new Error(
+      "HTTP error! status: 500 - " +
+        JSON.stringify({
+          error:
+            "This workflow step 'tool-step' was not suspended. Available suspended steps: [next-step]",
+        })
+    );
+  }
+
+  test("recovers when server has already advanced to the next step", async () => {
+    mockStartResult = {
+      status: "suspended",
+      suspended: [["tool-step"]],
+      steps: { "tool-step": { suspendPayload: toolPayload } },
+    };
+    // runById returns a finished workflow — the wizard should complete cleanly.
+    mockRunByIdResult = { status: "success" };
+
+    let resumeCount = 0;
+    makeStaleStepRun(() => {
+      resumeCount += 1;
+      if (resumeCount === 1) {
+        return Promise.reject(staleStepError());
+      }
+      return Promise.resolve({ status: "success" });
+    });
+
+    await runWizard(makeOptions());
+
+    expect(formatResultSpy).toHaveBeenCalled();
+    expect(runByIdMock).toHaveBeenCalledWith(
+      "test-run-id",
+      expect.objectContaining({ fields: expect.any(Array) })
+    );
+    // Recovery succeeded on the first attempt — resumeAsync was not called again.
+    expect(resumeCount).toBe(1);
+  });
+
+  test("falls through to normal retry when runById fails", async () => {
+    mockStartResult = {
+      status: "suspended",
+      suspended: [["tool-step"]],
+      steps: { "tool-step": { suspendPayload: toolPayload } },
+    };
+    // runById is unreachable — recovery fails, retry path takes over.
+    mockRunByIdResult = new Error("runById network error");
+
+    let resumeCount = 0;
+    makeStaleStepRun(() => {
+      resumeCount += 1;
+      if (resumeCount === 1) {
+        return Promise.reject(staleStepError());
+      }
+      return Promise.resolve({ status: "success" });
+    });
+
+    await runWizard(makeOptions());
+
+    expect(formatResultSpy).toHaveBeenCalled();
+    // Recovery attempted once, then normal retry kicked in.
+    expect(runByIdMock).toHaveBeenCalledTimes(1);
+    expect(resumeCount).toBe(2);
+  });
+
+  // Attempts 0 and 1 both get "not suspended"; attempt 2 succeeds.
+  // The recoveryAttempted guard means runById is only called once (on attempt 0),
+  // not twice. Total backoff: 2s + 4s = 6s, so we use a 10s timeout.
+  test("recovery is attempted only once across multiple stale-step retries", async () => {
+    mockStartResult = {
+      status: "suspended",
+      suspended: [["tool-step"]],
+      steps: { "tool-step": { suspendPayload: toolPayload } },
+    };
+    mockRunByIdResult = new Error("offline");
+
+    let resumeCount = 0;
+    makeStaleStepRun(() => {
+      resumeCount += 1;
+      if (resumeCount <= 2) {
+        return Promise.reject(staleStepError());
+      }
+      return Promise.resolve({ status: "success" });
+    });
+
+    await runWizard(makeOptions());
+
+    expect(formatResultSpy).toHaveBeenCalled();
+    // Despite two "not suspended" failures, runById was only called once.
+    expect(runByIdMock).toHaveBeenCalledTimes(1);
+  }, 10_000);
+});
+
+describe("runWizard — additional coverage", () => {
+  test("throws WizardError and stops spinner when workflow start fails", async () => {
+    startAsyncMock.mockRejectedValue(new Error("connection refused"));
+
+    await expect(runWizard(makeOptions())).rejects.toThrow(WizardError);
+
+    expect(spinnerMock.stop).toHaveBeenCalledWith("Connection failed", 1);
+    expect(lastCancelMessage()).toBe("Setup failed");
+  });
+
+  test("throws when the workflow response has an unrecognised status", async () => {
+    startAsyncMock.mockResolvedValue({ status: "bailed" });
+
+    await expect(runWizard(makeOptions())).rejects.toThrow(
+      /Unexpected workflow status/
+    );
+  });
+
+  test("throws when a suspend payload is a non-object truthy value", async () => {
+    mockStartResult = {
+      status: "suspended",
+      suspended: [["detect-platform"]],
+      steps: {
+        "detect-platform": {
+          // 42 is truthy, so extractSuspendPayload passes it to
+          // assertSuspendPayload, which rejects non-objects.
+          suspendPayload: 42,
+        },
+      },
+    };
+
+    await expect(runWizard(makeOptions())).rejects.toThrow(WizardError);
+  });
+
+  test("finds suspend payload via fallback loop when primary step has none", async () => {
+    const payload: ToolPayload = {
+      type: "tool",
+      operation: "run-commands",
+      cwd: "/tmp/test",
+      params: { commands: ["echo hi"] },
+    };
+    // `suspended` points to "step-a", but its payload is missing.
+    // extractSuspendPayload falls back to iterating all steps and finds
+    // the payload in "step-b".
+    mockStartResult = {
+      status: "suspended",
+      suspended: [["step-a"]],
+      steps: {
+        "step-a": {},
+        "step-b": { suspendPayload: payload },
+      },
+    };
+    mockResumeResults = [{ status: "success" }];
+
+    await runWizard(makeOptions());
+
+    expect(executeToolSpy).toHaveBeenCalledWith(payload, makeContext());
+  });
+
+  test("marks the previous step completed when the workflow advances", async () => {
+    const payloadA: ToolPayload = {
+      type: "tool",
+      operation: "list-dir",
+      cwd: "/tmp/test",
+      params: { path: "." },
+    };
+    const payloadB: ToolPayload = {
+      type: "tool",
+      operation: "read-files",
+      cwd: "/tmp/test",
+      params: { paths: ["package.json"] },
+    };
+
+    mockStartResult = {
+      status: "suspended",
+      suspended: [["discover-context"]],
+      steps: { "discover-context": { suspendPayload: payloadA } },
+    };
+    mockResumeResults = [
+      {
+        status: "suspended",
+        suspended: [["detect-platform"]],
+        steps: { "detect-platform": { suspendPayload: payloadB } },
+      },
+      { status: "success" },
+    ];
+
+    await runWizard(makeOptions());
+
+    const stepCalls = mockUICalls.filter((c) => c.kind === "setStep");
+    expect(stepCalls).toContainEqual({
+      kind: "setStep",
+      stepId: "discover-context",
+      status: "in_progress",
+    });
+    expect(stepCalls).toContainEqual({
+      kind: "setStep",
+      stepId: "discover-context",
+      status: "completed",
+    });
+    expect(stepCalls).toContainEqual({
+      kind: "setStep",
+      stepId: "detect-platform",
+      status: "in_progress",
+    });
+    const inProgressIdx = stepCalls.findIndex(
+      (c) =>
+        c.kind === "setStep" &&
+        c.stepId === "discover-context" &&
+        c.status === "in_progress"
+    );
+    const completedIdx = stepCalls.findIndex(
+      (c) =>
+        c.kind === "setStep" &&
+        c.stepId === "discover-context" &&
+        c.status === "completed"
+    );
+    expect(inProgressIdx).toBeLessThan(completedIdx);
+  });
+
+  test("uses existing platform name in detect-platform spinner label", async () => {
+    resolveInitContextSpy.mockResolvedValue(
+      makeContext({ existingProject: { platform: "javascript-nextjs" } })
+    );
+    mockStartResult = {
+      status: "suspended",
+      suspended: [["detect-platform"]],
+      steps: {
+        "detect-platform": {
+          suspendPayload: {
+            type: "tool",
+            operation: "list-dir",
+            cwd: "/tmp/test",
+            params: { path: "." },
+          },
+        },
+      },
+    };
+    mockResumeResults = [{ status: "success" }];
+
+    await runWizard(makeOptions());
+
+    const messages = spinnerMock.message.mock.calls.map(
+      (c: unknown[]) => c[0] as string
+    );
+    expect(messages.some((m) => m.includes("javascript-nextjs"))).toBe(true);
+  });
+});

From 658e6ded91d15a0a966f6b9e8edca97c8bf727ba Mon Sep 17 00:00:00 2001
From: betegon <miguelbetegongarcia@gmail.com>
Date: Tue, 12 May 2026 16:55:42 +0200
Subject: [PATCH 3/8] fix(init): address Bugbot comments on stale-step recovery
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two fixes from Cursor Bugbot review of PR #949:

1. Remove retries: 0 — it was added to reduce Sentry event noise from
   the retry storm, but it also stripped the Mastra client's built-in
   retry safety net from workflow.createRun() and run.startAsync(), which
   have no custom retry wrapper. A single transient error during startup
   now immediately fails. Restoring the default (3 retries) keeps those
   calls protected; "not suspended" incidents still go from 12 events
   down to 4 (vs 1 before), which is still a significant improvement.

2. Wrap workflow.runById in withTimeout and add "result" to fields —
   tryRecoverCurrentRunState previously called runById without a timeout,
   so on an unreliable network it could hang for 30-120s. Consistent with
   resumeAsync and startAsync which both use API_TIMEOUT_MS. Also adds
   "result" to the requested fields so that if the workflow already
   completed with a non-zero exitCode, handleFinalResult maps it to the
   correct EXIT constant instead of the generic EXIT.WIZARD fallback.
---
 src/lib/init/wizard-runner.ts | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/lib/init/wizard-runner.ts b/src/lib/init/wizard-runner.ts
index 953b3ad1c..4ac132c3c 100644
--- a/src/lib/init/wizard-runner.ts
+++ b/src/lib/init/wizard-runner.ts
@@ -457,9 +457,13 @@ async function tryRecoverCurrentRunState(
   runId: string
 ): Promise<WorkflowRunResult | null> {
   try {
-    const state = await workflow.runById(runId, {
-      fields: ["steps", "activeStepsPath"],
-    });
+    const state = await withTimeout(
+      workflow.runById(runId, {
+        fields: ["steps", "activeStepsPath", "result"],
+      }),
+      API_TIMEOUT_MS,
+      "Run state recovery"
+    );
     return assertWorkflowResult(state);
   } catch {
     return null;
@@ -598,10 +602,6 @@ export async function runWizard(initialOptions: WizardOptions): Promise<void> {
     baseUrl: MASTRA_API_URL,
     headers: token ? { Authorization: `Bearer ${token}` } : {},
     abortSignal: abortController.signal,
-    // Disable Mastra's built-in retries — resumeWithRetry is the sole retry
-    // layer. Without this, each CLI retry attempt triggers 4 Mastra-internal
-    // attempts, producing up to 12 Sentry events per single wizard failure.
-    retries: 0,
     fetch: ((url, init) => {
       const traceData = getTraceData();
       // Preserve `init.signal` via the spread — MastraClient may pass its

From 90e8017a286f2b1bfac9398f70c2b466886e6cb0 Mon Sep 17 00:00:00 2001
From: betegon <miguelbetegongarcia@gmail.com>
Date: Tue, 12 May 2026 17:15:27 +0200
Subject: [PATCH 4/8] fix(init): throw immediately when stale-step recovery
 fails

The previous implementation fell through to normal retries after a failed
recovery attempt, contradicting its own comment ("Retrying the same step
will always 500"). The recoveryAttempted guard blocked the only useful
action (another runById call) while allowing useless ones (3 more retries
of the stale step with 2+4+8 = 14s of wasted backoff).

Instead, throw immediately after failed recovery. The step is confirmed
not suspended, so there is nothing a retry can accomplish. Transient
errors (network timeout, auth flap) continue to use normal backoff retry
since they never enter the isStepAlreadyAdvancedError branch.

Also removes the now-unnecessary recoveryAttempted flag.
---
 src/lib/init/wizard-runner.ts       |  9 ++++----
 test/lib/init/wizard-runner.test.ts | 33 +++++++++++------------------
 2 files changed, 16 insertions(+), 26 deletions(-)

diff --git a/src/lib/init/wizard-runner.ts b/src/lib/init/wizard-runner.ts
index 4ac132c3c..0f97faee6 100644
--- a/src/lib/init/wizard-runner.ts
+++ b/src/lib/init/wizard-runner.ts
@@ -476,7 +476,6 @@ async function resumeWithRetry(
 ): Promise<WorkflowRunResult> {
   const { run, workflow, stepId, resumeData, tracingOptions, ui } = args;
   let lastError: unknown;
-  let recoveryAttempted = false;
   for (let attempt = 0; attempt <= MAX_RESUME_RETRIES; attempt++) {
     try {
       if (attempt > 0) {
@@ -504,15 +503,15 @@ async function resumeWithRetry(
       // response was dropped (network blip, CF response timeout, etc.).
       // Retrying the same step will always 500. Fetch the current run state
       // so the main loop can continue from whichever step is actually suspended.
-      // Only attempt recovery once — if it fails, fall through to normal retries.
-      if (isStepAlreadyAdvancedError(err) && !recoveryAttempted) {
-        recoveryAttempted = true;
+      if (isStepAlreadyAdvancedError(err)) {
         ui.clearOverlay?.();
         const recovered = await tryRecoverCurrentRunState(workflow, run.runId);
         if (recovered) {
           return recovered;
         }
-        // Recovery failed — fall through to normal retry.
+        // Recovery failed — the step is confirmed not suspended and retrying
+        // it will always 500. Throw immediately instead of wasting 14s.
+        throw err;
       }
       if (attempt === MAX_RESUME_RETRIES) {
         ui.clearOverlay?.();
diff --git a/test/lib/init/wizard-runner.test.ts b/test/lib/init/wizard-runner.test.ts
index 068c113c8..dfcd35430 100644
--- a/test/lib/init/wizard-runner.test.ts
+++ b/test/lib/init/wizard-runner.test.ts
@@ -995,36 +995,30 @@ describe("runWizard — resumeWithRetry stale-step recovery", () => {
     expect(resumeCount).toBe(1);
   });
 
-  test("falls through to normal retry when runById fails", async () => {
+  test("throws immediately when stale-step error occurs and runById fails", async () => {
     mockStartResult = {
       status: "suspended",
       suspended: [["tool-step"]],
       steps: { "tool-step": { suspendPayload: toolPayload } },
     };
-    // runById is unreachable — recovery fails, retry path takes over.
+    // runById is unreachable — recovery fails, wizard throws without retrying.
     mockRunByIdResult = new Error("runById network error");
 
     let resumeCount = 0;
     makeStaleStepRun(() => {
       resumeCount += 1;
-      if (resumeCount === 1) {
-        return Promise.reject(staleStepError());
-      }
-      return Promise.resolve({ status: "success" });
+      return Promise.reject(staleStepError());
     });
 
-    await runWizard(makeOptions());
+    await expect(runWizard(makeOptions())).rejects.toThrow(WizardError);
 
-    expect(formatResultSpy).toHaveBeenCalled();
-    // Recovery attempted once, then normal retry kicked in.
+    // Threw immediately after recovery failed — no futile retries of the stale step.
+    expect(resumeCount).toBe(1);
     expect(runByIdMock).toHaveBeenCalledTimes(1);
-    expect(resumeCount).toBe(2);
   });
 
   // Attempts 0 and 1 both get "not suspended"; attempt 2 succeeds.
-  // The recoveryAttempted guard means runById is only called once (on attempt 0),
-  // not twice. Total backoff: 2s + 4s = 6s, so we use a 10s timeout.
-  test("recovery is attempted only once across multiple stale-step retries", async () => {
+  test("throws immediately after failed recovery without further retries", async () => {
     mockStartResult = {
       status: "suspended",
       suspended: [["tool-step"]],
@@ -1035,18 +1029,15 @@ describe("runWizard — resumeWithRetry stale-step recovery", () => {
     let resumeCount = 0;
     makeStaleStepRun(() => {
       resumeCount += 1;
-      if (resumeCount <= 2) {
-        return Promise.reject(staleStepError());
-      }
-      return Promise.resolve({ status: "success" });
+      return Promise.reject(staleStepError());
     });
 
-    await runWizard(makeOptions());
+    await expect(runWizard(makeOptions())).rejects.toThrow(WizardError);
 
-    expect(formatResultSpy).toHaveBeenCalled();
-    // Despite two "not suspended" failures, runById was only called once.
+    // Threw immediately — runById tried once, resumeAsync tried once.
+    expect(resumeCount).toBe(1);
     expect(runByIdMock).toHaveBeenCalledTimes(1);
-  }, 10_000);
+  });
 });
 
 describe("runWizard — additional coverage", () => {

From 15c21c138a93f4e04520efdafb81f60fcc1ea231 Mon Sep 17 00:00:00 2001
From: betegon <miguelbetegongarcia@gmail.com>
Date: Tue, 12 May 2026 22:03:18 +0200
Subject: [PATCH 5/8] fix(init): tighten stale-step detection and show
 reconnecting feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two review recommendations from /review:

1. Tighten isStepAlreadyAdvancedError to match "was not suspended" instead
   of "not suspended" — both Mastra error variants use the more specific
   phrase, which reduces the false-positive surface area.

2. Show "Reconnecting..." spinner message before the runById recovery call
   so the user sees feedback during a potentially slow state fetch instead
   of watching a stale spinner message. Passes spin through ResumeRetryArgs
   to give resumeWithRetry access to the running spinner handle.
---
 src/lib/init/wizard-runner.ts | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/lib/init/wizard-runner.ts b/src/lib/init/wizard-runner.ts
index 0f97faee6..d8b5482d6 100644
--- a/src/lib/init/wizard-runner.ts
+++ b/src/lib/init/wizard-runner.ts
@@ -433,6 +433,7 @@ type ResumeRetryArgs = {
   stepId: string;
   resumeData: Record<string, unknown>;
   tracingOptions: Record<string, unknown>;
+  spin: SpinnerHandle;
   ui: WizardUI;
 };
 
@@ -444,7 +445,7 @@ type ResumeRetryArgs = {
  *   "HTTP error! status: 500 - {"error":"This workflow step 'X' was not suspended..."}"
  */
 function isStepAlreadyAdvancedError(err: unknown): boolean {
-  return err instanceof Error && err.message.includes("not suspended");
+  return err instanceof Error && err.message.includes("was not suspended");
 }
 
 /**
@@ -474,7 +475,7 @@ async function tryRecoverCurrentRunState(
 async function resumeWithRetry(
   args: ResumeRetryArgs
 ): Promise<WorkflowRunResult> {
-  const { run, workflow, stepId, resumeData, tracingOptions, ui } = args;
+  const { run, workflow, stepId, resumeData, tracingOptions, spin, ui } = args;
   let lastError: unknown;
   for (let attempt = 0; attempt <= MAX_RESUME_RETRIES; attempt++) {
     try {
@@ -505,6 +506,7 @@ async function resumeWithRetry(
       // so the main loop can continue from whichever step is actually suspended.
       if (isStepAlreadyAdvancedError(err)) {
         ui.clearOverlay?.();
+        spin.message("Reconnecting...");
         const recovered = await tryRecoverCurrentRunState(workflow, run.runId);
         if (recovered) {
           return recovered;
@@ -739,6 +741,7 @@ export async function runWizard(initialOptions: WizardOptions): Promise<void> {
         stepId: extracted.stepId,
         resumeData,
         tracingOptions,
+        spin,
         ui,
       });
     }

From a382a02ceed553c811da3b5dcdb0fb794aeee457 Mon Sep 17 00:00:00 2001
From: betegon <miguelbetegongarcia@gmail.com>
Date: Tue, 12 May 2026 22:24:43 +0200
Subject: [PATCH 6/8] fix(init): add Sentry observability to stale-step
 recovery path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously the recovery path had no observability — there was no way
to tell from Sentry whether the fix was working in production.

- addBreadcrumb when recovery succeeds: appears in subsequent traces
  so we can confirm the recovery mechanism is actually being triggered
  and seamlessly continuing the wizard.

- captureException (warning) when recovery fails: creates a separate
  trackable event tagged wizard.stale_step_recovery=failed with the
  step ID and run ID so we can investigate edge cases where runById
  is unreachable or returns unexpected data.

The server-side CLI-SERVER-9 events (from withWorkflowErrorCapture)
continue to fire on each "step not suspended" 500, but these new
client-side events tell us the outcome of the recovery attempt.
---
 src/lib/init/wizard-runner.ts | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/lib/init/wizard-runner.ts b/src/lib/init/wizard-runner.ts
index d8b5482d6..ed8d9fe1a 100644
--- a/src/lib/init/wizard-runner.ts
+++ b/src/lib/init/wizard-runner.ts
@@ -15,6 +15,7 @@ import { randomBytes } from "node:crypto";
 
 import { MastraClient } from "@mastra/client-js";
 import {
+  addBreadcrumb,
   captureException,
   getTraceData,
   setTag,
@@ -509,10 +510,24 @@ async function resumeWithRetry(
         spin.message("Reconnecting...");
         const recovered = await tryRecoverCurrentRunState(workflow, run.runId);
         if (recovered) {
+          addBreadcrumb({
+            category: "wizard",
+            message: `stale-step recovery succeeded for ${stepId}`,
+            level: "info",
+            data: { stepId, runId: run.runId },
+          });
           return recovered;
         }
         // Recovery failed — the step is confirmed not suspended and retrying
         // it will always 500. Throw immediately instead of wasting 14s.
+        captureException(err, {
+          level: "warning",
+          tags: {
+            "wizard.stale_step_recovery": "failed",
+            "wizard.resume_step": stepId,
+          },
+          extra: { runId: run.runId },
+        });
         throw err;
       }
       if (attempt === MAX_RESUME_RETRIES) {

From b40823ee0ad4e17ea7935fd6061fc5d9238c7f1c Mon Sep 17 00:00:00 2001
From: betegon <miguelbetegongarcia@gmail.com>
Date: Wed, 13 May 2026 10:40:14 +0200
Subject: [PATCH 7/8] test(init): remove duplicate stale-step recovery test

---
 test/lib/init/wizard-runner.test.ts | 21 ---------------------
 1 file changed, 21 deletions(-)

diff --git a/test/lib/init/wizard-runner.test.ts b/test/lib/init/wizard-runner.test.ts
index dfcd35430..5fa8f09e2 100644
--- a/test/lib/init/wizard-runner.test.ts
+++ b/test/lib/init/wizard-runner.test.ts
@@ -1017,27 +1017,6 @@ describe("runWizard — resumeWithRetry stale-step recovery", () => {
     expect(runByIdMock).toHaveBeenCalledTimes(1);
   });
 
-  // Attempts 0 and 1 both get "not suspended"; attempt 2 succeeds.
-  test("throws immediately after failed recovery without further retries", async () => {
-    mockStartResult = {
-      status: "suspended",
-      suspended: [["tool-step"]],
-      steps: { "tool-step": { suspendPayload: toolPayload } },
-    };
-    mockRunByIdResult = new Error("offline");
-
-    let resumeCount = 0;
-    makeStaleStepRun(() => {
-      resumeCount += 1;
-      return Promise.reject(staleStepError());
-    });
-
-    await expect(runWizard(makeOptions())).rejects.toThrow(WizardError);
-
-    // Threw immediately — runById tried once, resumeAsync tried once.
-    expect(resumeCount).toBe(1);
-    expect(runByIdMock).toHaveBeenCalledTimes(1);
-  });
 });
 
 describe("runWizard — additional coverage", () => {

From 0f4835e8ecf98e4758306d257f253622a58cd41e Mon Sep 17 00:00:00 2001
From: betegon <miguelbetegongarcia@gmail.com>
Date: Wed, 13 May 2026 11:02:16 +0200
Subject: [PATCH 8/8] fix(init): derive suspended field from activeStepsPath in
 recovery
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

runById returns the stored D1 state, which does not include the
suspended field — that's built at runtime inside resumeAsync by
filtering step results for status === "suspended". Without it,
the main loop falls back to stepId = "unknown" and iterates all
steps in the run to find one with a suspendPayload. A completed
step that still carries a historical suspendPayload in D1 could
be selected instead of the currently-suspended one.

activeStepsPath (Record<stepId, executionPath>) IS returned and
already requested. Its keys are exactly the currently-active
step IDs. Derive suspended = [[stepId]] from those keys so the
main loop finds the right step deterministically.

Also fixes a trailing blank line left by the duplicate-test removal.
---
 src/lib/init/wizard-runner.ts       | 15 ++++++++++++++-
 test/lib/init/wizard-runner.test.ts |  1 -
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/lib/init/wizard-runner.ts b/src/lib/init/wizard-runner.ts
index ed8d9fe1a..baf06e26b 100644
--- a/src/lib/init/wizard-runner.ts
+++ b/src/lib/init/wizard-runner.ts
@@ -459,13 +459,26 @@ async function tryRecoverCurrentRunState(
   runId: string
 ): Promise<WorkflowRunResult | null> {
   try {
-    const state = await withTimeout(
+    const raw = await withTimeout(
       workflow.runById(runId, {
         fields: ["steps", "activeStepsPath", "result"],
       }),
       API_TIMEOUT_MS,
       "Run state recovery"
     );
+    // runById returns activeStepsPath (Record<stepId, executionPath>) but
+    // not suspended (string[][]). The main loop reads result.suspended to
+    // find the active step; without it, stepId falls back to "unknown" and
+    // extractSuspendPayload iterates all steps — picking the first with any
+    // suspendPayload, which could be a completed step with stale D1 data.
+    // Derive suspended from the activeStepsPath keys so the lookup is
+    // deterministic: those keys are exactly the currently-active step IDs.
+    const state = raw as Record<string, unknown>;
+    if (!state.suspended && state.activeStepsPath) {
+      state.suspended = Object.keys(
+        state.activeStepsPath as Record<string, unknown>
+      ).map((id) => [id]);
+    }
     return assertWorkflowResult(state);
   } catch {
     return null;
diff --git a/test/lib/init/wizard-runner.test.ts b/test/lib/init/wizard-runner.test.ts
index 5fa8f09e2..36a91066c 100644
--- a/test/lib/init/wizard-runner.test.ts
+++ b/test/lib/init/wizard-runner.test.ts
@@ -1016,7 +1016,6 @@ describe("runWizard — resumeWithRetry stale-step recovery", () => {
     expect(resumeCount).toBe(1);
     expect(runByIdMock).toHaveBeenCalledTimes(1);
   });
-
 });
 
 describe("runWizard — additional coverage", () => {