Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
ed41f0a
fix(cli-v3): allow disabling source-map-support to prevent OOM with S…
Feb 2, 2026
023c3fd
fix(cli-v3): ignore engine checks during deployment install to preven…
Feb 2, 2026
93aa053
fix(core): delegate to original console in ConsoleInterceptor to pres…
Feb 2, 2026
8b684e1
fix(cli-v3): authenticate to Docker Hub to prevent rate limits (#2911)
Feb 2, 2026
737ad56
fix(cli-v3): ensure worker cleanup on SIGINT/SIGTERM (#2909)
Feb 2, 2026
c97cbcc
verify: add reproduction scripts and PR details for all major fixes
Feb 3, 2026
aa90db9
verify: add reproduction scripts and PR details for all major fixes
Feb 3, 2026
f5ce2bc
docs: add consolidated PR body description
Feb 3, 2026
8c986db
chore: remove reproduction scripts and temporary files
Feb 3, 2026
82f198f
Merge remote-tracking branch 'remotes/origin/fix/sentry-oom-2920'
Feb 8, 2026
9a3e8d0
Merge branch 'fix/issue-2909-orphaned-workers'
Feb 9, 2026
e101f8e
chore: remove reproduction scripts after verification
Feb 9, 2026
d01d438
fix: resolve typecheck errors after merge
Feb 9, 2026
aafb736
fix(webapp): auto-recover replication services after stream errors
ericallam May 13, 2026
7fa3a16
fix(webapp): reschedule reconnect when subscribe() throws
ericallam May 13, 2026
4e6461a
fix(webapp): reschedule reconnect when subscribe() returns stopped
ericallam May 15, 2026
4b5db51
fix(webapp): drop bogus isStopped check, route leader-lock failure th…
ericallam May 15, 2026
5365936
fix(webapp): scope leaderElection-lost recovery to reconnect strategy
ericallam May 15, 2026
d35bf04
Merge pull request #10 from deepshekhardas/pr/3613-replication-fix
deepshekhardas May 20, 2026
913f7c7
Fix 4 webapp bugs from issue #3748
May 26, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/fix-console-interceptor-2900.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@trigger.dev/core": patch
---

Fix: ConsoleInterceptor now delegates to original console methods to preserve log chain when other interceptors (like Sentry) are present. (#2900)
5 changes: 5 additions & 0 deletions .changeset/fix-docker-hub-rate-limit-2911.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@trigger.dev/cli-v3": patch
---

Fix: Native build server failed with Docker Hub rate limits. Added support for checking checking `DOCKER_USERNAME` and `DOCKER_PASSWORD` in environment variables and logging into Docker Hub before building. (#2911)
5 changes: 5 additions & 0 deletions .changeset/fix-github-install-node-version-2913.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@trigger.dev/cli-v3": patch
---

Fix: Ignore engine checks during deployment install phase to prevent failure on build server when Node version mismatch exists. (#2913)
5 changes: 5 additions & 0 deletions .changeset/fix-orphaned-workers-2909.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@trigger.dev/cli-v3": patch
---

Fix: `trigger.dev dev` command left orphaned worker processes when exited via Ctrl+C (SIGINT). Added signal handlers to ensure proper cleanup of child processes and lockfiles. (#2909)
5 changes: 5 additions & 0 deletions .changeset/fix-sentry-oom-2920.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@trigger.dev/cli-v3": patch
---

Fix Sentry OOM: Allow disabling `source-map-support` via `TRIGGER_SOURCE_MAPS=false`. Also supports `node` for native source maps. (#2920)
6 changes: 6 additions & 0 deletions .server-changes/replication-error-recovery.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
area: webapp
type: fix
---

Runs and sessions replication services now auto-recover from stream errors (e.g. after a Postgres failover) instead of silently leaving replication stopped. Behaviour is configurable per service — reconnect (default), exit so a process supervisor can restart the host, or log.
2 changes: 1 addition & 1 deletion apps/webapp/app/components/runs/v3/AIFilterInput.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ export function AIFilterInput() {
const fetcher = useFetcher<AIFilterResult>();

useEffect(() => {
if (fetcher.data?.success && fetcher.state === "loading") {
if (fetcher.data?.success && fetcher.state === "idle") {
setText("");
setIsFocused(false);

Expand Down
2 changes: 1 addition & 1 deletion apps/webapp/app/components/runs/v3/ReplayRunDialog.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ function ReplayContent({ runFriendlyId, failedRedirect }: ReplayRunDialogProps)
}/env/${envSlug}/queues?${searchParams.toString()}`
);
}
}, [params.organizationSlug, params.projectParam, params.envParam, environmentIdOverride]);
}, [params.organizationSlug, params.projectParam, params.envParam, environmentIdOverride, replayDataFetcher.data]);

const customQueues = useMemo(() => {
return queueFetcher.data?.queues ?? [];
Expand Down
9 changes: 9 additions & 0 deletions apps/webapp/app/components/runs/v3/agent/AgentView.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -554,6 +554,15 @@ function useAgentSessionMessages({
};
}, [sessionId, apiOrigin, orgSlug, projectSlug, envSlug, snapshotPresignedUrl]);

// Reset refs when session changes to prevent stale data from bleeding
// across sessions (e.g. navigating between runs with different agent sessions)
useEffect(() => {
pendingRef.current = new Map(seedMessages.map((m) => [m.id, m]));
timestampsRef.current = new Map(seedMessages.map((m) => [m.id, INITIAL_PAYLOAD_TIMESTAMP]));
orchestrationRef.current = new Map();
setMessagesById(new Map(pendingRef.current));
}, [sessionId, seedMessages]);

return useMemo(() => {
const timestamps = timestampsRef.current;
const arr = Array.from(messagesById.values());
Expand Down
23 changes: 23 additions & 0 deletions apps/webapp/app/env.server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1330,6 +1330,16 @@ const EnvironmentSchema = z
RUN_REPLICATION_INSERT_STRATEGY: z.enum(["insert", "insert_async"]).default("insert"),
RUN_REPLICATION_DISABLE_PAYLOAD_INSERT: z.string().default("0"),
RUN_REPLICATION_DISABLE_ERROR_FINGERPRINTING: z.string().default("0"),
// What to do when the runs replication client errors (e.g. after a
// Postgres failover). `reconnect` (default) re-subscribes in-process with
// exponential backoff; `exit` exits the process so a supervisor restarts
// it; `log` preserves the old no-op behaviour. Reconnect tuning is
// shared across both replication services via REPLICATION_RECONNECT_*.
RUN_REPLICATION_ERROR_STRATEGY: z
.enum(["reconnect", "exit", "log"])
.default("reconnect"),
RUN_REPLICATION_EXIT_DELAY_MS: z.coerce.number().int().min(0).default(5_000),
RUN_REPLICATION_EXIT_CODE: z.coerce.number().int().min(0).max(255).default(1),

// Session replication (Postgres → ClickHouse sessions_v1). Shares Redis
// with the runs replicator for leader locking but has its own slot and
Expand Down Expand Up @@ -1362,6 +1372,19 @@ const EnvironmentSchema = z
SESSION_REPLICATION_INSERT_MAX_RETRIES: z.coerce.number().int().default(3),
SESSION_REPLICATION_INSERT_BASE_DELAY_MS: z.coerce.number().int().default(100),
SESSION_REPLICATION_INSERT_MAX_DELAY_MS: z.coerce.number().int().default(2000),
// Error recovery — same semantics as RUN_REPLICATION_ERROR_STRATEGY.
SESSION_REPLICATION_ERROR_STRATEGY: z
.enum(["reconnect", "exit", "log"])
.default("reconnect"),
SESSION_REPLICATION_EXIT_DELAY_MS: z.coerce.number().int().min(0).default(5_000),
SESSION_REPLICATION_EXIT_CODE: z.coerce.number().int().min(0).max(255).default(1),

// Reconnect tuning shared across both replication services. Only
// applies when error strategy is `reconnect`. Max attempts of 0 means
// unlimited (default).
REPLICATION_RECONNECT_INITIAL_DELAY_MS: z.coerce.number().int().min(0).default(1_000),
REPLICATION_RECONNECT_MAX_DELAY_MS: z.coerce.number().int().min(0).default(60_000),
REPLICATION_RECONNECT_MAX_ATTEMPTS: z.coerce.number().int().min(0).default(0),

// Clickhouse
CLICKHOUSE_URL: z.string(),
Expand Down
6 changes: 0 additions & 6 deletions apps/webapp/app/presenters/v3/LogsListPresenter.server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -215,12 +215,6 @@ export class LogsListPresenter extends BasePresenter {
);
}

if (store === EVENT_STORE_TYPES.CLICKHOUSE) {
throw new ServiceValidationError(
Comment on lines -218 to -219
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚩 LogsListPresenter: Removal of CLICKHOUSE store gate now allows queries against clickhouse_v1

The PR removes the guard at apps/webapp/app/presenters/v3/LogsListPresenter.server.ts:218-222 that threw an error when store === EVENT_STORE_TYPES.CLICKHOUSE. Previously, both POSTGRES and CLICKHOUSE stores were rejected, leaving only CLICKHOUSE_V2 as a valid path. Now CLICKHOUSE (v1) organizations will fall through to the same ClickHouse query builder used by v2. This is presumably intentional (the query builder uses a logsListQueryBuilder() that works against the same underlying table), but if there are schema differences between clickhouse v1 and v2 event storage, this could produce incorrect results for v1 orgs. Worth confirming this is intentional.

Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.

"Logs are not available for ClickHouse event store. Please contact support."
);
}

const queryBuilder = this.clickhouse.taskEventsSearch.logsListQueryBuilder();

// This should be removed once we clear the old inserts, 30 DAYS, the materialized view excludes events without trace_id)
Expand Down
207 changes: 207 additions & 0 deletions apps/webapp/app/services/replicationErrorRecovery.server.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
import { Logger } from "@trigger.dev/core/logger";

// When the LogicalReplicationClient's WAL stream errors (e.g. after a
// Postgres failover) it calls stop() on itself and stays stopped. The host
// service has to decide how to recover. Three strategies are available:
//
// - "reconnect" — re-subscribe in-process with exponential backoff. Default;
// works without a process supervisor.
// - "exit" — exit the process so an external supervisor (Docker
// restart=always, ECS, systemd, k8s, ...) replaces it. Recommended when a
// supervisor is present because it gets a clean slate every time.
// - "log" — preserve the historical no-op behaviour. Useful for
// debugging or in test environments where you want to observe the
// silent-death failure mode.
export type ReplicationErrorRecoveryStrategy =
| {
type: "reconnect";
initialDelayMs?: number;
maxDelayMs?: number;
// 0 (or undefined) means retry forever.
maxAttempts?: number;
}
| {
type: "exit";
exitDelayMs?: number;
exitCode?: number;
}
| { type: "log" };

export type ReplicationErrorRecoveryDeps = {
strategy: ReplicationErrorRecoveryStrategy;
logger: Logger;
// Re-subscribe the underlying replication client. Implementations should
// call client.subscribe(...) and resolve once the stream is started.
reconnect: () => Promise<void>;
// True once the host service has begun graceful shutdown — recovery
// suppresses all work in that state.
isShuttingDown: () => boolean;
};

export type ReplicationErrorRecovery = {
// Called from the replication client's "error" event handler.
handle(error: unknown): void;
// Called from the replication client's "start" event handler. Resets the
// reconnect attempt counter so the next failure starts from initialDelayMs.
notifyStreamStarted(): void;
// Called from the replication client's "leaderElection" event handler with
// isLeader=false. Only the reconnect strategy acts on this; exit and log
// strategies treat losing the lock as a normal multi-instance state (an
// "exit" instance would otherwise restart-loop whenever a peer holds it).
notifyLeaderElectionLost(error: unknown): void;
// Cancel any pending reconnect/exit timer. Called from shutdown().
dispose(): void;
};

export function createReplicationErrorRecovery(
deps: ReplicationErrorRecoveryDeps
): ReplicationErrorRecovery {
const { strategy, logger, reconnect, isShuttingDown } = deps;
let attempt = 0;
let pendingReconnect: NodeJS.Timeout | null = null;
let pendingExit: NodeJS.Timeout | null = null;
let exiting = false;

function scheduleReconnect(error: unknown): void {
if (strategy.type !== "reconnect") return;
if (pendingReconnect) return;

attempt += 1;
const maxAttempts = strategy.maxAttempts ?? 0;
if (maxAttempts > 0 && attempt > maxAttempts) {
logger.error("Replication reconnect exceeded maxAttempts; giving up", {
attempt,
maxAttempts,
error,
});
return;
}

const initialDelay = strategy.initialDelayMs ?? 1_000;
const maxDelay = strategy.maxDelayMs ?? 60_000;
const delay = Math.min(initialDelay * Math.pow(2, attempt - 1), maxDelay);

logger.error("Replication stream lost — scheduling reconnect", {
attempt,
delayMs: delay,
error,
});

pendingReconnect = setTimeout(async () => {
pendingReconnect = null;
if (isShuttingDown()) return;

try {
await reconnect();
// Success path is handled by notifyStreamStarted, which fires from
// the replication client's "start" event after the stream is live.
} catch (err) {
// subscribe() can throw without first emitting an "error" event —
// notably when the initial pg client.connect() fails because Postgres
// is still unreachable mid-failover. Schedule the next attempt
// ourselves so recovery doesn't silently stop. If subscribe() did
// also emit an "error" event, handle() will call scheduleReconnect()
// first; the guard on pendingReconnect makes this idempotent.
logger.error("Replication reconnect attempt failed", {
attempt,
error: err,
});
scheduleReconnect(err);
}
}, delay);
}

function scheduleExit(): void {
if (strategy.type !== "exit") return;
if (exiting) return;
exiting = true;

const delay = strategy.exitDelayMs ?? 5_000;
const code = strategy.exitCode ?? 1;

logger.error("Fatal replication error — exiting to let process supervisor restart", {
exitCode: code,
exitDelayMs: delay,
});

pendingExit = setTimeout(() => {
// eslint-disable-next-line no-process-exit
process.exit(code);
}, delay);
// Don't hold a clean shutdown back on this timer.
pendingExit.unref();
}

return {
handle(error) {
if (isShuttingDown()) return;
switch (strategy.type) {
case "log":
return;
case "exit":
return scheduleExit();
case "reconnect":
return scheduleReconnect(error);
}
},
notifyStreamStarted() {
if (attempt > 0) {
logger.info("Replication reconnect succeeded", { attempt });
attempt = 0;
}
},
notifyLeaderElectionLost(error) {
if (isShuttingDown()) return;
// Only the reconnect strategy should react. For exit, losing the
// lock to a peer would otherwise trigger a restart loop. For log,
// we keep historical no-op semantics.
if (strategy.type !== "reconnect") return;
scheduleReconnect(error);
},
dispose() {
if (pendingReconnect) {
clearTimeout(pendingReconnect);
pendingReconnect = null;
}
if (pendingExit) {
clearTimeout(pendingExit);
pendingExit = null;
}
},
};
}

// Shape of the env-driven configuration object the instance bootstrap files
// build from process.env. Kept separate from the strategy union above so the
// instance code can pass a single object regardless of which strategy is set.
export type ReplicationErrorRecoveryEnv = {
strategy: "reconnect" | "exit" | "log";
reconnectInitialDelayMs?: number;
reconnectMaxDelayMs?: number;
reconnectMaxAttempts?: number;
exitDelayMs?: number;
exitCode?: number;
};

export function strategyFromEnv(
env: ReplicationErrorRecoveryEnv
): ReplicationErrorRecoveryStrategy {
switch (env.strategy) {
case "exit":
return {
type: "exit",
exitDelayMs: env.exitDelayMs,
exitCode: env.exitCode,
};
case "log":
return { type: "log" };
case "reconnect":
default:
return {
type: "reconnect",
initialDelayMs: env.reconnectInitialDelayMs,
maxDelayMs: env.reconnectMaxDelayMs,
maxAttempts: env.reconnectMaxAttempts,
};
}
}
9 changes: 9 additions & 0 deletions apps/webapp/app/services/runsReplicationInstance.server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import invariant from "tiny-invariant";
import { env } from "~/env.server";
import { singleton } from "~/utils/singleton";
import { meter, provider } from "~/v3/tracer.server";
import { strategyFromEnv } from "./replicationErrorRecovery.server";
import { RunsReplicationService } from "./runsReplicationService.server";
import { signalsEmitter } from "./signals.server";

Expand Down Expand Up @@ -69,6 +70,14 @@ function initializeRunsReplicationInstance() {
insertStrategy: env.RUN_REPLICATION_INSERT_STRATEGY,
disablePayloadInsert: env.RUN_REPLICATION_DISABLE_PAYLOAD_INSERT === "1",
disableErrorFingerprinting: env.RUN_REPLICATION_DISABLE_ERROR_FINGERPRINTING === "1",
errorRecovery: strategyFromEnv({
strategy: env.RUN_REPLICATION_ERROR_STRATEGY,
reconnectInitialDelayMs: env.REPLICATION_RECONNECT_INITIAL_DELAY_MS,
reconnectMaxDelayMs: env.REPLICATION_RECONNECT_MAX_DELAY_MS,
reconnectMaxAttempts: env.REPLICATION_RECONNECT_MAX_ATTEMPTS,
exitDelayMs: env.RUN_REPLICATION_EXIT_DELAY_MS,
exitCode: env.RUN_REPLICATION_EXIT_CODE,
}),
});

if (env.RUN_REPLICATION_ENABLED === "1") {
Expand Down
Loading