triggerdotdev · deepshekhardas · Feb 2, 2026 · Feb 2, 2026 · Feb 2, 2026 · Feb 2, 2026
diff --git a/.changeset/fix-console-interceptor-2900.md b/.changeset/fix-console-interceptor-2900.md
@@ -0,0 +1,5 @@
+---
+"@trigger.dev/core": patch
+---
+
+Fix: ConsoleInterceptor now delegates to original console methods to preserve log chain when other interceptors (like Sentry) are present. (#2900)
diff --git a/.changeset/fix-docker-hub-rate-limit-2911.md b/.changeset/fix-docker-hub-rate-limit-2911.md
@@ -0,0 +1,5 @@
+---
+"@trigger.dev/cli-v3": patch
+---
+
+Fix: Native build server failed with Docker Hub rate limits. Added support for checking checking `DOCKER_USERNAME` and `DOCKER_PASSWORD` in environment variables and logging into Docker Hub before building. (#2911)
diff --git a/.changeset/fix-github-install-node-version-2913.md b/.changeset/fix-github-install-node-version-2913.md
@@ -0,0 +1,5 @@
+---
+"@trigger.dev/cli-v3": patch
+---
+
+Fix: Ignore engine checks during deployment install phase to prevent failure on build server when Node version mismatch exists. (#2913)
diff --git a/.changeset/fix-orphaned-workers-2909.md b/.changeset/fix-orphaned-workers-2909.md
@@ -0,0 +1,5 @@
+---
+"@trigger.dev/cli-v3": patch
+---
+
+Fix: `trigger.dev dev` command left orphaned worker processes when exited via Ctrl+C (SIGINT). Added signal handlers to ensure proper cleanup of child processes and lockfiles. (#2909)
diff --git a/.changeset/fix-sentry-oom-2920.md b/.changeset/fix-sentry-oom-2920.md
@@ -0,0 +1,5 @@
+---
+"@trigger.dev/cli-v3": patch
+---
+
+Fix Sentry OOM: Allow disabling `source-map-support` via `TRIGGER_SOURCE_MAPS=false`. Also supports `node` for native source maps. (#2920)
diff --git a/.server-changes/replication-error-recovery.md b/.server-changes/replication-error-recovery.md
@@ -0,0 +1,6 @@
+---
+area: webapp
+type: fix
+---
+
+Runs and sessions replication services now auto-recover from stream errors (e.g. after a Postgres failover) instead of silently leaving replication stopped. Behaviour is configurable per service — reconnect (default), exit so a process supervisor can restart the host, or log.
diff --git a/apps/webapp/app/components/runs/v3/AIFilterInput.tsx b/apps/webapp/app/components/runs/v3/AIFilterInput.tsx
@@ -38,7 +38,7 @@ export function AIFilterInput() {
   const fetcher = useFetcher<AIFilterResult>();
 
   useEffect(() => {
-    if (fetcher.data?.success && fetcher.state === "loading") {
+    if (fetcher.data?.success && fetcher.state === "idle") {
       setText("");
       setIsFocused(false);
 

diff --git a/apps/webapp/app/components/runs/v3/ReplayRunDialog.tsx b/apps/webapp/app/components/runs/v3/ReplayRunDialog.tsx
@@ -93,7 +93,7 @@ function ReplayContent({ runFriendlyId, failedRedirect }: ReplayRunDialogProps)
         }/env/${envSlug}/queues?${searchParams.toString()}`
       );
     }
-  }, [params.organizationSlug, params.projectParam, params.envParam, environmentIdOverride]);
+  }, [params.organizationSlug, params.projectParam, params.envParam, environmentIdOverride, replayDataFetcher.data]);
 
   const customQueues = useMemo(() => {
     return queueFetcher.data?.queues ?? [];

diff --git a/apps/webapp/app/components/runs/v3/agent/AgentView.tsx b/apps/webapp/app/components/runs/v3/agent/AgentView.tsx
@@ -554,6 +554,15 @@ function useAgentSessionMessages({
     };
   }, [sessionId, apiOrigin, orgSlug, projectSlug, envSlug, snapshotPresignedUrl]);
 
+  // Reset refs when session changes to prevent stale data from bleeding
+  // across sessions (e.g. navigating between runs with different agent sessions)
+  useEffect(() => {
+    pendingRef.current = new Map(seedMessages.map((m) => [m.id, m]));
+    timestampsRef.current = new Map(seedMessages.map((m) => [m.id, INITIAL_PAYLOAD_TIMESTAMP]));
+    orchestrationRef.current = new Map();
+    setMessagesById(new Map(pendingRef.current));
+  }, [sessionId, seedMessages]);
+
   return useMemo(() => {
     const timestamps = timestampsRef.current;
     const arr = Array.from(messagesById.values());

diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts
@@ -1330,6 +1330,16 @@ const EnvironmentSchema = z
     RUN_REPLICATION_INSERT_STRATEGY: z.enum(["insert", "insert_async"]).default("insert"),
     RUN_REPLICATION_DISABLE_PAYLOAD_INSERT: z.string().default("0"),
     RUN_REPLICATION_DISABLE_ERROR_FINGERPRINTING: z.string().default("0"),
+    // What to do when the runs replication client errors (e.g. after a
+    // Postgres failover). `reconnect` (default) re-subscribes in-process with
+    // exponential backoff; `exit` exits the process so a supervisor restarts
+    // it; `log` preserves the old no-op behaviour. Reconnect tuning is
+    // shared across both replication services via REPLICATION_RECONNECT_*.
+    RUN_REPLICATION_ERROR_STRATEGY: z
+      .enum(["reconnect", "exit", "log"])
+      .default("reconnect"),
+    RUN_REPLICATION_EXIT_DELAY_MS: z.coerce.number().int().min(0).default(5_000),
+    RUN_REPLICATION_EXIT_CODE: z.coerce.number().int().min(0).max(255).default(1),
 
     // Session replication (Postgres → ClickHouse sessions_v1). Shares Redis
     // with the runs replicator for leader locking but has its own slot and
@@ -1362,6 +1372,19 @@ const EnvironmentSchema = z
     SESSION_REPLICATION_INSERT_MAX_RETRIES: z.coerce.number().int().default(3),
     SESSION_REPLICATION_INSERT_BASE_DELAY_MS: z.coerce.number().int().default(100),
     SESSION_REPLICATION_INSERT_MAX_DELAY_MS: z.coerce.number().int().default(2000),
+    // Error recovery — same semantics as RUN_REPLICATION_ERROR_STRATEGY.
+    SESSION_REPLICATION_ERROR_STRATEGY: z
+      .enum(["reconnect", "exit", "log"])
+      .default("reconnect"),
+    SESSION_REPLICATION_EXIT_DELAY_MS: z.coerce.number().int().min(0).default(5_000),
+    SESSION_REPLICATION_EXIT_CODE: z.coerce.number().int().min(0).max(255).default(1),
+
+    // Reconnect tuning shared across both replication services. Only
+    // applies when error strategy is `reconnect`. Max attempts of 0 means
+    // unlimited (default).
+    REPLICATION_RECONNECT_INITIAL_DELAY_MS: z.coerce.number().int().min(0).default(1_000),
+    REPLICATION_RECONNECT_MAX_DELAY_MS: z.coerce.number().int().min(0).default(60_000),
+    REPLICATION_RECONNECT_MAX_ATTEMPTS: z.coerce.number().int().min(0).default(0),
 
     // Clickhouse
     CLICKHOUSE_URL: z.string(),

diff --git a/apps/webapp/app/presenters/v3/LogsListPresenter.server.ts b/apps/webapp/app/presenters/v3/LogsListPresenter.server.ts
@@ -215,12 +215,6 @@ export class LogsListPresenter extends BasePresenter {
       );
     }
 
-    if (store === EVENT_STORE_TYPES.CLICKHOUSE) {
-      throw new ServiceValidationError(
-        "Logs are not available for ClickHouse event store. Please contact support."
-      );
-    }
-
     const queryBuilder = this.clickhouse.taskEventsSearch.logsListQueryBuilder();
 
     // This should be removed once we clear the old inserts, 30 DAYS, the materialized view excludes events without trace_id)

diff --git a/apps/webapp/app/services/replicationErrorRecovery.server.ts b/apps/webapp/app/services/replicationErrorRecovery.server.ts
@@ -0,0 +1,207 @@
+import { Logger } from "@trigger.dev/core/logger";
+
+// When the LogicalReplicationClient's WAL stream errors (e.g. after a
+// Postgres failover) it calls stop() on itself and stays stopped. The host
+// service has to decide how to recover. Three strategies are available:
+//
+// - "reconnect" — re-subscribe in-process with exponential backoff. Default;
+//   works without a process supervisor.
+// - "exit"      — exit the process so an external supervisor (Docker
+//   restart=always, ECS, systemd, k8s, ...) replaces it. Recommended when a
+//   supervisor is present because it gets a clean slate every time.
+// - "log"       — preserve the historical no-op behaviour. Useful for
+//   debugging or in test environments where you want to observe the
+//   silent-death failure mode.
+export type ReplicationErrorRecoveryStrategy =
+  | {
+      type: "reconnect";
+      initialDelayMs?: number;
+      maxDelayMs?: number;
+      // 0 (or undefined) means retry forever.
+      maxAttempts?: number;
+    }
+  | {
+      type: "exit";
+      exitDelayMs?: number;
+      exitCode?: number;
+    }
+  | { type: "log" };
+
+export type ReplicationErrorRecoveryDeps = {
+  strategy: ReplicationErrorRecoveryStrategy;
+  logger: Logger;
+  // Re-subscribe the underlying replication client. Implementations should
+  // call client.subscribe(...) and resolve once the stream is started.
+  reconnect: () => Promise<void>;
+  // True once the host service has begun graceful shutdown — recovery
+  // suppresses all work in that state.
+  isShuttingDown: () => boolean;
+};
+
+export type ReplicationErrorRecovery = {
+  // Called from the replication client's "error" event handler.
+  handle(error: unknown): void;
+  // Called from the replication client's "start" event handler. Resets the
+  // reconnect attempt counter so the next failure starts from initialDelayMs.
+  notifyStreamStarted(): void;
+  // Called from the replication client's "leaderElection" event handler with
+  // isLeader=false. Only the reconnect strategy acts on this; exit and log
+  // strategies treat losing the lock as a normal multi-instance state (an
+  // "exit" instance would otherwise restart-loop whenever a peer holds it).
+  notifyLeaderElectionLost(error: unknown): void;
+  // Cancel any pending reconnect/exit timer. Called from shutdown().
+  dispose(): void;
+};
+
+export function createReplicationErrorRecovery(
+  deps: ReplicationErrorRecoveryDeps
+): ReplicationErrorRecovery {
+  const { strategy, logger, reconnect, isShuttingDown } = deps;
+  let attempt = 0;
+  let pendingReconnect: NodeJS.Timeout | null = null;
+  let pendingExit: NodeJS.Timeout | null = null;
+  let exiting = false;
+
+  function scheduleReconnect(error: unknown): void {
+    if (strategy.type !== "reconnect") return;
+    if (pendingReconnect) return;
+
+    attempt += 1;
+    const maxAttempts = strategy.maxAttempts ?? 0;
+    if (maxAttempts > 0 && attempt > maxAttempts) {
+      logger.error("Replication reconnect exceeded maxAttempts; giving up", {
+        attempt,
+        maxAttempts,
+        error,
+      });
+      return;
+    }
+
+    const initialDelay = strategy.initialDelayMs ?? 1_000;
+    const maxDelay = strategy.maxDelayMs ?? 60_000;
+    const delay = Math.min(initialDelay * Math.pow(2, attempt - 1), maxDelay);
+
+    logger.error("Replication stream lost — scheduling reconnect", {
+      attempt,
+      delayMs: delay,
+      error,
+    });
+
+    pendingReconnect = setTimeout(async () => {
+      pendingReconnect = null;
+      if (isShuttingDown()) return;
+
+      try {
+        await reconnect();
+        // Success path is handled by notifyStreamStarted, which fires from
+        // the replication client's "start" event after the stream is live.
+      } catch (err) {
+        // subscribe() can throw without first emitting an "error" event —
+        // notably when the initial pg client.connect() fails because Postgres
+        // is still unreachable mid-failover. Schedule the next attempt
+        // ourselves so recovery doesn't silently stop. If subscribe() did
+        // also emit an "error" event, handle() will call scheduleReconnect()
+        // first; the guard on pendingReconnect makes this idempotent.
+        logger.error("Replication reconnect attempt failed", {
+          attempt,
+          error: err,
+        });
+        scheduleReconnect(err);
+      }
+    }, delay);
+  }
+
+  function scheduleExit(): void {
+    if (strategy.type !== "exit") return;
+    if (exiting) return;
+    exiting = true;
+
+    const delay = strategy.exitDelayMs ?? 5_000;
+    const code = strategy.exitCode ?? 1;
+
+    logger.error("Fatal replication error — exiting to let process supervisor restart", {
+      exitCode: code,
+      exitDelayMs: delay,
+    });
+
+    pendingExit = setTimeout(() => {
+      // eslint-disable-next-line no-process-exit
+      process.exit(code);
+    }, delay);
+    // Don't hold a clean shutdown back on this timer.
+    pendingExit.unref();
+  }
+
+  return {
+    handle(error) {
+      if (isShuttingDown()) return;
+      switch (strategy.type) {
+        case "log":
+          return;
+        case "exit":
+          return scheduleExit();
+        case "reconnect":
+          return scheduleReconnect(error);
+      }
+    },
+    notifyStreamStarted() {
+      if (attempt > 0) {
+        logger.info("Replication reconnect succeeded", { attempt });
+        attempt = 0;
+      }
+    },
+    notifyLeaderElectionLost(error) {
+      if (isShuttingDown()) return;
+      // Only the reconnect strategy should react. For exit, losing the
+      // lock to a peer would otherwise trigger a restart loop. For log,
+      // we keep historical no-op semantics.
+      if (strategy.type !== "reconnect") return;
+      scheduleReconnect(error);
+    },
+    dispose() {
+      if (pendingReconnect) {
+        clearTimeout(pendingReconnect);
+        pendingReconnect = null;
+      }
+      if (pendingExit) {
+        clearTimeout(pendingExit);
+        pendingExit = null;
+      }
+    },
+  };
+}
+
+// Shape of the env-driven configuration object the instance bootstrap files
+// build from process.env. Kept separate from the strategy union above so the
+// instance code can pass a single object regardless of which strategy is set.
+export type ReplicationErrorRecoveryEnv = {
+  strategy: "reconnect" | "exit" | "log";
+  reconnectInitialDelayMs?: number;
+  reconnectMaxDelayMs?: number;
+  reconnectMaxAttempts?: number;
+  exitDelayMs?: number;
+  exitCode?: number;
+};
+
+export function strategyFromEnv(
+  env: ReplicationErrorRecoveryEnv
+): ReplicationErrorRecoveryStrategy {
+  switch (env.strategy) {
+    case "exit":
+      return {
+        type: "exit",
+        exitDelayMs: env.exitDelayMs,
+        exitCode: env.exitCode,
+      };
+    case "log":
+      return { type: "log" };
+    case "reconnect":
+    default:
+      return {
+        type: "reconnect",
+        initialDelayMs: env.reconnectInitialDelayMs,
+        maxDelayMs: env.reconnectMaxDelayMs,
+        maxAttempts: env.reconnectMaxAttempts,
+      };
+  }
+}
diff --git a/apps/webapp/app/services/runsReplicationInstance.server.ts b/apps/webapp/app/services/runsReplicationInstance.server.ts
@@ -3,6 +3,7 @@ import invariant from "tiny-invariant";
 import { env } from "~/env.server";
 import { singleton } from "~/utils/singleton";
 import { meter, provider } from "~/v3/tracer.server";
+import { strategyFromEnv } from "./replicationErrorRecovery.server";
 import { RunsReplicationService } from "./runsReplicationService.server";
 import { signalsEmitter } from "./signals.server";
 
@@ -69,6 +70,14 @@ function initializeRunsReplicationInstance() {
     insertStrategy: env.RUN_REPLICATION_INSERT_STRATEGY,
     disablePayloadInsert: env.RUN_REPLICATION_DISABLE_PAYLOAD_INSERT === "1",
     disableErrorFingerprinting: env.RUN_REPLICATION_DISABLE_ERROR_FINGERPRINTING === "1",
+    errorRecovery: strategyFromEnv({
+      strategy: env.RUN_REPLICATION_ERROR_STRATEGY,
+      reconnectInitialDelayMs: env.REPLICATION_RECONNECT_INITIAL_DELAY_MS,
+      reconnectMaxDelayMs: env.REPLICATION_RECONNECT_MAX_DELAY_MS,
+      reconnectMaxAttempts: env.REPLICATION_RECONNECT_MAX_ATTEMPTS,
+      exitDelayMs: env.RUN_REPLICATION_EXIT_DELAY_MS,
+      exitCode: env.RUN_REPLICATION_EXIT_CODE,
+    }),
   });
 
   if (env.RUN_REPLICATION_ENABLED === "1") {