feat(gateway): add exponential backoff with jitter to restart loop
This commit is contained in:
parent
5ef33d9359
commit
5d77b603e6
@ -6,6 +6,9 @@ import {
|
|||||||
} from "../../infra/restart.js";
|
} from "../../infra/restart.js";
|
||||||
import { createSubsystemLogger } from "../../logging/subsystem.js";
|
import { createSubsystemLogger } from "../../logging/subsystem.js";
|
||||||
import type { defaultRuntime } from "../../runtime.js";
|
import type { defaultRuntime } from "../../runtime.js";
|
||||||
|
import { calculateBackoffMs, applyJitter } from "./backoff.js";
|
||||||
|
import { recordCrash, classifyError } from "./crash-tracker.js";
|
||||||
|
import { killAllChildrenSync } from "../../infra/child-registry.js";
|
||||||
|
|
||||||
const gatewayLog = createSubsystemLogger("gateway");
|
const gatewayLog = createSubsystemLogger("gateway");
|
||||||
|
|
||||||
@ -18,7 +21,7 @@ export async function runGatewayLoop(params: {
|
|||||||
const lock = await acquireGatewayLock();
|
const lock = await acquireGatewayLock();
|
||||||
let server: Awaited<ReturnType<typeof startGatewayServer>> | null = null;
|
let server: Awaited<ReturnType<typeof startGatewayServer>> | null = null;
|
||||||
let shuttingDown = false;
|
let shuttingDown = false;
|
||||||
let restartResolver: (() => void) | null = null;
|
let restartResolver: ((reason: { isUserInitiated: boolean }) => void) | null = null;
|
||||||
|
|
||||||
const cleanupSignals = () => {
|
const cleanupSignals = () => {
|
||||||
process.removeListener("SIGTERM", onSigterm);
|
process.removeListener("SIGTERM", onSigterm);
|
||||||
@ -54,7 +57,7 @@ export async function runGatewayLoop(params: {
|
|||||||
server = null;
|
server = null;
|
||||||
if (isRestart) {
|
if (isRestart) {
|
||||||
shuttingDown = false;
|
shuttingDown = false;
|
||||||
restartResolver?.();
|
restartResolver?.({ isUserInitiated: action === "restart" });
|
||||||
} else {
|
} else {
|
||||||
cleanupSignals();
|
cleanupSignals();
|
||||||
params.runtime.exit(0);
|
params.runtime.exit(0);
|
||||||
@ -87,15 +90,79 @@ export async function runGatewayLoop(params: {
|
|||||||
process.on("SIGINT", onSigint);
|
process.on("SIGINT", onSigint);
|
||||||
process.on("SIGUSR1", onSigusr1);
|
process.on("SIGUSR1", onSigusr1);
|
||||||
|
|
||||||
|
// Register exit handler for crash scenarios (sync only - can't await in 'exit' handler)
|
||||||
|
process.on("exit", () => {
|
||||||
|
killAllChildrenSync();
|
||||||
|
});
|
||||||
|
|
||||||
|
let consecutiveFailures = 0;
|
||||||
|
const STABILITY_THRESHOLD_MS = 60_000;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Keep process alive; SIGUSR1 triggers an in-process restart (no supervisor required).
|
// Keep process alive; SIGUSR1 triggers an in-process restart (no supervisor required).
|
||||||
// SIGTERM/SIGINT still exit after a graceful shutdown.
|
// SIGTERM/SIGINT still exit after a graceful shutdown.
|
||||||
// eslint-disable-next-line no-constant-condition
|
// eslint-disable-next-line no-constant-condition
|
||||||
while (true) {
|
while (true) {
|
||||||
server = await params.start();
|
// Calculate and apply backoff with jitter
|
||||||
await new Promise<void>((resolve) => {
|
const baseBackoffMs = calculateBackoffMs(consecutiveFailures);
|
||||||
|
const backoffMs = applyJitter(baseBackoffMs);
|
||||||
|
|
||||||
|
if (backoffMs > 0) {
|
||||||
|
gatewayLog.warn(
|
||||||
|
`Restarting gateway in ${backoffMs}ms after failure (attempt ${consecutiveFailures + 1})`,
|
||||||
|
);
|
||||||
|
await new Promise((resolve) => setTimeout(resolve, backoffMs));
|
||||||
|
}
|
||||||
|
|
||||||
|
const startAttemptMs = Date.now();
|
||||||
|
|
||||||
|
try {
|
||||||
|
server = await params.start();
|
||||||
|
} catch (err) {
|
||||||
|
gatewayLog.error(`Gateway startup failed: ${String(err)}`);
|
||||||
|
recordCrash({
|
||||||
|
errorType: classifyError(err),
|
||||||
|
errorMessage: err instanceof Error ? err.message : String(err),
|
||||||
|
uptimeMs: 0,
|
||||||
|
backoffMs,
|
||||||
|
consecutiveFailures: consecutiveFailures + 1,
|
||||||
|
});
|
||||||
|
consecutiveFailures++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Server started successfully - wait for restart signal
|
||||||
|
const restartReason = await new Promise<{ isUserInitiated: boolean }>((resolve) => {
|
||||||
restartResolver = resolve;
|
restartResolver = resolve;
|
||||||
});
|
});
|
||||||
|
|
||||||
|
const uptimeMs = Date.now() - startAttemptMs;
|
||||||
|
|
||||||
|
// Determine backoff reset behavior based on uptime and restart type
|
||||||
|
if (restartReason.isUserInitiated) {
|
||||||
|
// User-initiated restart (SIGUSR1): no backoff
|
||||||
|
consecutiveFailures = 0;
|
||||||
|
} else if (uptimeMs >= STABILITY_THRESHOLD_MS) {
|
||||||
|
// Crashed after stable uptime: reset to minimal backoff
|
||||||
|
recordCrash({
|
||||||
|
errorType: "runtime_error",
|
||||||
|
errorMessage: "crashed after stable uptime",
|
||||||
|
uptimeMs,
|
||||||
|
backoffMs: calculateBackoffMs(1),
|
||||||
|
consecutiveFailures: 1,
|
||||||
|
});
|
||||||
|
consecutiveFailures = 1;
|
||||||
|
} else {
|
||||||
|
// Crashed during startup or early runtime: increment backoff
|
||||||
|
recordCrash({
|
||||||
|
errorType: "runtime_error",
|
||||||
|
errorMessage: "crashed during early runtime",
|
||||||
|
uptimeMs,
|
||||||
|
backoffMs: calculateBackoffMs(consecutiveFailures + 1),
|
||||||
|
consecutiveFailures: consecutiveFailures + 1,
|
||||||
|
});
|
||||||
|
consecutiveFailures++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
await lock?.release();
|
await lock?.release();
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user