fix(gateway): gracefully handle AbortError and transient network errors
Addresses issues #1851, #1997, and #2034.

During config reload (SIGUSR1), in-flight requests are aborted, causing AbortError exceptions. Similarly, transient network errors (fetch failed, ECONNRESET, ETIMEDOUT, etc.) can crash the gateway unnecessarily.

This change:
- Adds isAbortError() to detect intentional cancellations
- Adds isTransientNetworkError() to detect temporary connectivity issues
- Logs these errors appropriately instead of crashing
- Handles nested cause chains and AggregateError

AbortError is logged as a warning (expected during shutdown). Network errors are logged as non-fatal errors (will resolve on their own).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
c120aa8a2e
commit
a2fd1cefff
129
src/infra/unhandled-rejections.test.ts
Normal file
129
src/infra/unhandled-rejections.test.ts
Normal file
@ -0,0 +1,129 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
|
||||
import { isAbortError, isTransientNetworkError } from "./unhandled-rejections.js";
|
||||
|
||||
describe("isAbortError", () => {
  // Builds an Error whose `name` has been overridden after construction.
  const named = (message: string, name: string): Error => {
    const err = new Error(message);
    err.name = name;
    return err;
  };

  it("returns true for error with name AbortError", () => {
    expect(isAbortError(named("aborted", "AbortError"))).toBe(true);
  });

  it('returns true for error with "This operation was aborted" message', () => {
    const abortedByMessage = new Error("This operation was aborted");
    expect(isAbortError(abortedByMessage)).toBe(true);
  });

  it("returns true for undici-style AbortError", () => {
    // Node's undici reports aborts with exactly this message and name.
    const undiciAbort = named("This operation was aborted", "AbortError");
    expect(isAbortError(undiciAbort)).toBe(true);
  });

  it("returns true for object with AbortError name", () => {
    const errorLike = { name: "AbortError", message: "test" };
    expect(isAbortError(errorLike)).toBe(true);
  });

  it("returns false for regular errors", () => {
    const ordinary = [
      new Error("Something went wrong"),
      new TypeError("Cannot read property"),
      new RangeError("Invalid array length"),
    ];
    for (const err of ordinary) {
      expect(isAbortError(err)).toBe(false);
    }
  });

  it("returns false for errors with similar but different messages", () => {
    // Only the exact undici message counts; near misses must not match.
    const nearMisses = ["Operation aborted", "aborted", "Request was aborted"];
    for (const message of nearMisses) {
      expect(isAbortError(new Error(message))).toBe(false);
    }
  });

  it("returns false for null and undefined", () => {
    for (const empty of [null, undefined]) {
      expect(isAbortError(empty)).toBe(false);
    }
  });

  it("returns false for non-error values", () => {
    for (const primitive of ["string error", 42]) {
      expect(isAbortError(primitive)).toBe(false);
    }
  });

  it("returns false for plain objects without AbortError name", () => {
    const plain = { message: "plain object" };
    expect(isAbortError(plain)).toBe(false);
  });
});
|
||||
|
||||
describe("isTransientNetworkError", () => {
  // Attaches extra fields (such as `code` or `cause`) to an existing error instance.
  const tagged = <T extends Error>(base: T, extras: Record<string, unknown>): T =>
    Object.assign(base, extras);

  it("returns true for errors with transient network codes", () => {
    const transientCodes = [
      "ECONNRESET",
      "ECONNREFUSED",
      "ENOTFOUND",
      "ETIMEDOUT",
      "ESOCKETTIMEDOUT",
      "ECONNABORTED",
      "EPIPE",
      "EHOSTUNREACH",
      "ENETUNREACH",
      "EAI_AGAIN",
      "UND_ERR_CONNECT_TIMEOUT",
      "UND_ERR_SOCKET",
      "UND_ERR_HEADERS_TIMEOUT",
      "UND_ERR_BODY_TIMEOUT",
    ];

    transientCodes.forEach((code) => {
      const failure = tagged(new Error("test"), { code });
      // The label identifies which code failed if the assertion trips.
      expect(isTransientNetworkError(failure), `code: ${code}`).toBe(true);
    });
  });

  it('returns true for TypeError with "fetch failed" message', () => {
    expect(isTransientNetworkError(new TypeError("fetch failed"))).toBe(true);
  });

  it("returns true for fetch failed with network cause", () => {
    const dnsFailure = tagged(new Error("getaddrinfo ENOTFOUND"), { code: "ENOTFOUND" });
    const fetchFailure = tagged(new TypeError("fetch failed"), { cause: dnsFailure });
    expect(isTransientNetworkError(fetchFailure)).toBe(true);
  });

  it("returns true for nested cause chain with network error", () => {
    // Three levels deep: fetch failed -> wrapper -> ECONNRESET.
    const reset = tagged(new Error("connection reset"), { code: "ECONNRESET" });
    const wrapper = tagged(new Error("wrapper"), { cause: reset });
    const fetchFailure = tagged(new TypeError("fetch failed"), { cause: wrapper });
    expect(isTransientNetworkError(fetchFailure)).toBe(true);
  });

  it("returns true for AggregateError containing network errors", () => {
    const timedOut = tagged(new Error("timeout"), { code: "ETIMEDOUT" });
    const aggregate = new AggregateError([timedOut], "Multiple errors");
    expect(isTransientNetworkError(aggregate)).toBe(true);
  });

  it("returns false for regular errors without network codes", () => {
    const ordinary = [
      new Error("Something went wrong"),
      new TypeError("Cannot read property"),
      new RangeError("Invalid array length"),
    ];
    for (const err of ordinary) {
      expect(isTransientNetworkError(err)).toBe(false);
    }
  });

  it("returns false for errors with non-network codes", () => {
    const configFailure = tagged(new Error("test"), { code: "INVALID_CONFIG" });
    expect(isTransientNetworkError(configFailure)).toBe(false);
  });

  it("returns false for null and undefined", () => {
    for (const empty of [null, undefined]) {
      expect(isTransientNetworkError(empty)).toBe(false);
    }
  });

  it("returns false for non-error values", () => {
    const nonErrors: unknown[] = ["string error", 42, { message: "plain object" }];
    for (const value of nonErrors) {
      expect(isTransientNetworkError(value)).toBe(false);
    }
  });

  it("returns false for AggregateError with only non-network errors", () => {
    const aggregate = new AggregateError([new Error("regular error")], "Multiple errors");
    expect(isTransientNetworkError(aggregate)).toBe(false);
  });
});
|
||||
@ -1,11 +1,88 @@
|
||||
import process from "node:process";
|
||||
|
||||
import { formatErrorMessage, formatUncaughtError } from "./errors.js";
|
||||
import { formatUncaughtError } from "./errors.js";
|
||||
|
||||
type UnhandledRejectionHandler = (reason: unknown) => boolean;
|
||||
|
||||
const handlers = new Set<UnhandledRejectionHandler>();
|
||||
|
||||
/**
 * Reports whether `err` represents an aborted operation.
 *
 * A value matches when it is an object whose `name` is the string "AbortError", or whose
 * `message` is exactly "This operation was aborted" (the message used by Node's undici).
 * Aborts are typically intentional cancellations (e.g., during shutdown) and shouldn't crash.
 *
 * Non-string `name`/`message` values are compared as-is rather than coerced to a string:
 * coercing an arbitrary rejection value (for example a null-prototype object) can itself
 * throw, and this predicate is meant to be safe to call on any rejection reason.
 *
 * @param err - Any thrown or rejected value.
 * @returns `true` when `err` looks like an abort, otherwise `false`.
 */
export function isAbortError(err: unknown): boolean {
  if (typeof err !== "object" || err === null) return false;

  const { name, message } = err as { name?: unknown; message?: unknown };

  // Strict equality only matches primitive strings, so no coercion (and no throw) can occur.
  return name === "AbortError" || message === "This operation was aborted";
}
|
||||
|
||||
// Network error codes that indicate transient failures (shouldn't crash the gateway).
// Includes Node-style system codes (E*) and the UND_ERR_* codes reported by undici.
const TRANSIENT_NETWORK_CODES = new Set([
  "ECONNRESET",
  "ECONNREFUSED",
  "ENOTFOUND",
  "ETIMEDOUT",
  "ESOCKETTIMEDOUT",
  "ECONNABORTED",
  "EPIPE",
  "EHOSTUNREACH",
  "ENETUNREACH",
  "EAI_AGAIN",
  "UND_ERR_CONNECT_TIMEOUT",
  "UND_ERR_SOCKET",
  "UND_ERR_HEADERS_TIMEOUT",
  "UND_ERR_BODY_TIMEOUT",
]);

/** Returns the string `code` property of an object, or `undefined` when absent or not a string. */
function getErrorCode(err: unknown): string | undefined {
  if (!err || typeof err !== "object") return undefined;
  const code = (err as { code?: unknown }).code;
  return typeof code === "string" ? code : undefined;
}

/** Returns the `cause` property of an object, or `undefined` for non-object values. */
function getErrorCause(err: unknown): unknown {
  if (!err || typeof err !== "object") return undefined;
  return (err as { cause?: unknown }).cause;
}

/**
 * Checks if an error is a transient network error that shouldn't crash the gateway.
 * These are typically temporary connectivity issues that will resolve on their own.
 *
 * An error is considered transient when any of the following holds:
 * - its `code` is one of `TRANSIENT_NETWORK_CODES`;
 * - it is a `TypeError` with message "fetch failed" and either has no cause or has a
 *   transient cause;
 * - an error anywhere in its `cause` chain is transient;
 * - it is an `AggregateError` and at least one of its `errors` is transient.
 *
 * Cyclic `cause` / `errors` graphs are tolerated: each value is inspected at most once.
 *
 * @param err - Any thrown or rejected value.
 * @returns `true` when `err` (or something it wraps) is a transient network error.
 */
export function isTransientNetworkError(err: unknown): boolean {
  return hasTransientNetworkError(err, new Set<unknown>());
}

/** Recursive worker for `isTransientNetworkError`; `visited` breaks cycles in the error graph. */
function hasTransientNetworkError(err: unknown, visited: Set<unknown>): boolean {
  if (!err) return false;

  // A value seen earlier in this walk cannot contribute anything new; stopping here
  // prevents unbounded recursion on cyclic cause chains.
  if (visited.has(err)) return false;
  visited.add(err);

  // Check the error itself
  const code = getErrorCode(err);
  if (code !== undefined && TRANSIENT_NETWORK_CODES.has(code)) return true;

  // A cause that points back at the error itself carries no extra information.
  const rawCause = getErrorCause(err);
  const cause = rawCause === err ? undefined : rawCause;

  // "fetch failed" TypeError from undici (Node's native fetch)
  if (err instanceof TypeError && err.message === "fetch failed") {
    // Even without a cause, "fetch failed" is typically a network issue.
    if (!cause) return true;
    // The cause often contains the actual network error, so it decides the outcome.
    return hasTransientNetworkError(cause, visited);
  }

  // Check the cause chain recursively. A non-transient cause does not end the search,
  // so the AggregateError members below are still inspected.
  if (cause && hasTransientNetworkError(cause, visited)) return true;

  // AggregateError may wrap multiple causes
  if (err instanceof AggregateError && err.errors?.length) {
    return err.errors.some((inner: unknown) => hasTransientNetworkError(inner, visited));
  }

  return false;
}
|
||||
|
||||
export function registerUnhandledRejectionHandler(handler: UnhandledRejectionHandler): () => void {
|
||||
handlers.add(handler);
|
||||
return () => {
|
||||
@ -13,36 +90,6 @@ export function registerUnhandledRejectionHandler(handler: UnhandledRejectionHan
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Decides whether an unhandled rejection reason is recoverable, i.e. should not crash
 * the process.
 *
 * An `Error` named "AbortError" is always recoverable. Otherwise the reason's message
 * (or its formatted form for non-Error values) is lowercased and searched for known
 * network/abort substrings. Matching is by substring, so any message containing one of
 * the fragments (for example "aborted") qualifies.
 */
function isRecoverableError(reason: unknown): boolean {
  if (!reason) return false;

  // Abort signals are identified by error name.
  if (reason instanceof Error && reason.name === "AbortError") return true;

  const rawText = reason instanceof Error ? reason.message : formatErrorMessage(reason);
  const haystack = rawText.toLowerCase();

  // Lowercase fragments checked in order; the first hit makes the reason recoverable.
  const recoverableFragments = [
    "fetch failed",
    "network request",
    "econnrefused",
    "econnreset",
    "etimedout",
    "socket hang up",
    "enotfound",
    "network error",
    "getaddrinfo",
    "client network socket disconnected",
    "this operation was aborted",
    "aborted",
  ];

  return recoverableFragments.some((fragment) => haystack.includes(fragment));
}
|
||||
|
||||
export function isUnhandledRejectionHandled(reason: unknown): boolean {
|
||||
for (const handler of handlers) {
|
||||
try {
|
||||
@ -61,9 +108,17 @@ export function installUnhandledRejectionHandler(): void {
|
||||
process.on("unhandledRejection", (reason, _promise) => {
|
||||
if (isUnhandledRejectionHandled(reason)) return;
|
||||
|
||||
// Don't crash on recoverable/transient errors - log them and continue
|
||||
if (isRecoverableError(reason)) {
|
||||
console.error("[clawdbot] Recoverable error (not crashing):", formatUncaughtError(reason));
|
||||
// AbortError is typically an intentional cancellation (e.g., during shutdown)
|
||||
// Log it but don't crash - these are expected during graceful shutdown
|
||||
if (isAbortError(reason)) {
|
||||
console.warn("[clawdbot] Suppressed AbortError:", formatUncaughtError(reason));
|
||||
return;
|
||||
}
|
||||
|
||||
// Transient network errors (fetch failed, connection reset, etc.) shouldn't crash
|
||||
// These are temporary connectivity issues that will resolve on their own
|
||||
if (isTransientNetworkError(reason)) {
|
||||
console.error("[clawdbot] Network error (non-fatal):", formatUncaughtError(reason));
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user