fix(gateway): gracefully handle AbortError and transient network errors
Addresses issues #1851, #1997, and #2034.

During a config reload (SIGUSR1), in-flight requests are aborted, which raises AbortError exceptions. Similarly, transient network errors (fetch failed, ECONNRESET, ETIMEDOUT, etc.) can crash the gateway unnecessarily.

This change:
- Adds isAbortError() to detect intentional cancellations
- Adds isTransientNetworkError() to detect temporary connectivity issues
- Logs these errors appropriately instead of crashing
- Handles nested cause chains and AggregateError

AbortError is logged as a warning (it is expected during shutdown). Network errors are logged as non-fatal errors (they will resolve on their own).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
c120aa8a2e
commit
a2fd1cefff
129
src/infra/unhandled-rejections.test.ts
Normal file
129
src/infra/unhandled-rejections.test.ts
Normal file
@ -0,0 +1,129 @@
|
|||||||
|
import { describe, expect, it } from "vitest";
|
||||||
|
|
||||||
|
import { isAbortError, isTransientNetworkError } from "./unhandled-rejections.js";
|
||||||
|
|
||||||
|
// Behavioural spec for isAbortError: it accepts anything named "AbortError" or
// carrying undici's exact abort message, and rejects everything else.
describe("isAbortError", () => {
  it("returns true for error with name AbortError", () => {
    const renamed = Object.assign(new Error("aborted"), { name: "AbortError" });
    expect(isAbortError(renamed)).toBe(true);
  });

  it('returns true for error with "This operation was aborted" message', () => {
    expect(isAbortError(new Error("This operation was aborted"))).toBe(true);
  });

  it("returns true for undici-style AbortError", () => {
    // Node's undici throws errors with this exact message
    const undiciAbort = new Error("This operation was aborted");
    undiciAbort.name = "AbortError";
    expect(isAbortError(undiciAbort)).toBe(true);
  });

  it("returns true for object with AbortError name", () => {
    const errorLike = { name: "AbortError", message: "test" };
    expect(isAbortError(errorLike)).toBe(true);
  });

  it("returns false for regular errors", () => {
    const ordinary = [
      new Error("Something went wrong"),
      new TypeError("Cannot read property"),
      new RangeError("Invalid array length"),
    ];
    for (const candidate of ordinary) {
      expect(isAbortError(candidate)).toBe(false);
    }
  });

  it("returns false for errors with similar but different messages", () => {
    // Only the exact undici wording counts; near-misses must not match.
    const nearMisses = ["Operation aborted", "aborted", "Request was aborted"];
    for (const text of nearMisses) {
      expect(isAbortError(new Error(text))).toBe(false);
    }
  });

  it("returns false for null and undefined", () => {
    for (const empty of [null, undefined]) {
      expect(isAbortError(empty)).toBe(false);
    }
  });

  it("returns false for non-error values", () => {
    for (const primitive of ["string error", 42]) {
      expect(isAbortError(primitive)).toBe(false);
    }
  });

  it("returns false for plain objects without AbortError name", () => {
    const anonymous = { message: "plain object" };
    expect(isAbortError(anonymous)).toBe(false);
  });
});
|
||||||
|
|
||||||
|
// Behavioural spec for isTransientNetworkError: known network codes, undici's
// "fetch failed", cause chains and AggregateError members count as transient.
describe("isTransientNetworkError", () => {
  it("returns true for errors with transient network codes", () => {
    const transientCodes = [
      "ECONNRESET",
      "ECONNREFUSED",
      "ENOTFOUND",
      "ETIMEDOUT",
      "ESOCKETTIMEDOUT",
      "ECONNABORTED",
      "EPIPE",
      "EHOSTUNREACH",
      "ENETUNREACH",
      "EAI_AGAIN",
      "UND_ERR_CONNECT_TIMEOUT",
      "UND_ERR_SOCKET",
      "UND_ERR_HEADERS_TIMEOUT",
      "UND_ERR_BODY_TIMEOUT",
    ];

    transientCodes.forEach((code) => {
      const coded = Object.assign(new Error("test"), { code });
      expect(isTransientNetworkError(coded), `code: ${code}`).toBe(true);
    });
  });

  it('returns true for TypeError with "fetch failed" message', () => {
    expect(isTransientNetworkError(new TypeError("fetch failed"))).toBe(true);
  });

  it("returns true for fetch failed with network cause", () => {
    const dnsFailure = Object.assign(new Error("getaddrinfo ENOTFOUND"), { code: "ENOTFOUND" });
    const fetchFailure = Object.assign(new TypeError("fetch failed"), { cause: dnsFailure });
    expect(isTransientNetworkError(fetchFailure)).toBe(true);
  });

  it("returns true for nested cause chain with network error", () => {
    // fetch failed -> wrapper -> ECONNRESET: the code sits two levels down.
    const reset = Object.assign(new Error("connection reset"), { code: "ECONNRESET" });
    const wrapper = Object.assign(new Error("wrapper"), { cause: reset });
    const fetchFailure = Object.assign(new TypeError("fetch failed"), { cause: wrapper });
    expect(isTransientNetworkError(fetchFailure)).toBe(true);
  });

  it("returns true for AggregateError containing network errors", () => {
    const timedOut = Object.assign(new Error("timeout"), { code: "ETIMEDOUT" });
    const aggregate = new AggregateError([timedOut], "Multiple errors");
    expect(isTransientNetworkError(aggregate)).toBe(true);
  });

  it("returns false for regular errors without network codes", () => {
    const ordinary = [
      new Error("Something went wrong"),
      new TypeError("Cannot read property"),
      new RangeError("Invalid array length"),
    ];
    for (const candidate of ordinary) {
      expect(isTransientNetworkError(candidate)).toBe(false);
    }
  });

  it("returns false for errors with non-network codes", () => {
    const configError = Object.assign(new Error("test"), { code: "INVALID_CONFIG" });
    expect(isTransientNetworkError(configError)).toBe(false);
  });

  it("returns false for null and undefined", () => {
    for (const empty of [null, undefined]) {
      expect(isTransientNetworkError(empty)).toBe(false);
    }
  });

  it("returns false for non-error values", () => {
    const nonErrors: unknown[] = ["string error", 42, { message: "plain object" }];
    for (const value of nonErrors) {
      expect(isTransientNetworkError(value)).toBe(false);
    }
  });

  it("returns false for AggregateError with only non-network errors", () => {
    const aggregate = new AggregateError([new Error("regular error")], "Multiple errors");
    expect(isTransientNetworkError(aggregate)).toBe(false);
  });
});
|
||||||
@ -1,11 +1,88 @@
|
|||||||
import process from "node:process";
|
import process from "node:process";
|
||||||
|
|
||||||
import { formatErrorMessage, formatUncaughtError } from "./errors.js";
|
import { formatUncaughtError } from "./errors.js";
|
||||||
|
|
||||||
type UnhandledRejectionHandler = (reason: unknown) => boolean;
|
type UnhandledRejectionHandler = (reason: unknown) => boolean;
|
||||||
|
|
||||||
const handlers = new Set<UnhandledRejectionHandler>();
|
const handlers = new Set<UnhandledRejectionHandler>();
|
||||||
|
|
||||||
|
/**
 * Checks if an error is an AbortError.
 * These are typically intentional cancellations (e.g., during shutdown) and shouldn't crash.
 *
 * A value matches when it is an object whose `name` is exactly the string
 * "AbortError", or whose `message` is exactly "This operation was aborted"
 * (the wording Node's undici uses). Near-miss messages such as "aborted" do
 * not match.
 *
 * The comparison never coerces `name` to a string: coercion can throw (for
 * example on a null-prototype object), and this predicate must stay safe on
 * arbitrary rejection values.
 *
 * @param err - Any rejection reason or thrown value.
 * @returns `true` when the value looks like an abort/cancellation error.
 */
export function isAbortError(err: unknown): boolean {
  if (!err || typeof err !== "object") return false;

  const { name, message } = err as { name?: unknown; message?: unknown };

  // Strict equality against a string literal rejects non-string names without coercion.
  if (name === "AbortError") return true;

  // Check for "This operation was aborted" message from Node's undici
  return message === "This operation was aborted";
}
|
||||||
|
|
||||||
|
// Network error codes that indicate transient failures (shouldn't crash the gateway)
const TRANSIENT_NETWORK_CODES = new Set([
  "ECONNRESET",
  "ECONNREFUSED",
  "ENOTFOUND",
  "ETIMEDOUT",
  "ESOCKETTIMEDOUT",
  "ECONNABORTED",
  "EPIPE",
  "EHOSTUNREACH",
  "ENETUNREACH",
  "EAI_AGAIN",
  "UND_ERR_CONNECT_TIMEOUT",
  "UND_ERR_SOCKET",
  "UND_ERR_HEADERS_TIMEOUT",
  "UND_ERR_BODY_TIMEOUT",
]);

/** Returns the `code` property of an object when it is a string, otherwise `undefined`. */
function getErrorCode(err: unknown): string | undefined {
  if (!err || typeof err !== "object") return undefined;
  const code = (err as { code?: unknown }).code;
  return typeof code === "string" ? code : undefined;
}

/** Returns the `cause` property of an object, or `undefined` for non-objects. */
function getErrorCause(err: unknown): unknown {
  if (!err || typeof err !== "object") return undefined;
  return (err as { cause?: unknown }).cause;
}

/**
 * Returns the member errors of an AggregateError-style value: any `Error`
 * carrying an `errors` array. Duck-typing (rather than `instanceof
 * AggregateError`) also covers polyfilled and cross-realm aggregates.
 */
function getAggregatedErrors(err: unknown): readonly unknown[] {
  if (!(err instanceof Error)) return [];
  const errors: unknown = (err as Error & { errors?: unknown }).errors;
  return Array.isArray(errors) ? errors : [];
}

/**
 * Recursive worker for {@link isTransientNetworkError}.
 *
 * `ancestors` holds the values currently being inspected on the path from the
 * root to `err`; a cause or member already on that path is a cycle and is not
 * followed again, so self-referential error graphs terminate.
 */
function hasTransientNetworkError(err: unknown, ancestors: Set<unknown>): boolean {
  if (!err) return false;

  // Check the error itself
  const code = getErrorCode(err);
  if (code && TRANSIENT_NETWORK_CODES.has(code)) return true;

  ancestors.add(err);
  try {
    const cause = getErrorCause(err);
    // A cause that is already an ancestor (including `err` itself) adds no information.
    const freshCause = cause && !ancestors.has(cause) ? cause : undefined;

    // "fetch failed" TypeError from undici (Node's native fetch)
    if (err instanceof TypeError && err.message === "fetch failed") {
      // The cause often contains the actual network error, so it decides the verdict.
      // Even without a usable cause, "fetch failed" is typically a network issue.
      return freshCause ? hasTransientNetworkError(freshCause, ancestors) : true;
    }

    // Check the cause chain recursively; a non-transient cause must not hide
    // transient members of an aggregate, so fall through on `false`.
    if (freshCause && hasTransientNetworkError(freshCause, ancestors)) return true;

    // AggregateError may wrap multiple causes
    return getAggregatedErrors(err).some(
      (member) => !ancestors.has(member) && hasTransientNetworkError(member, ancestors),
    );
  } finally {
    ancestors.delete(err);
  }
}

/**
 * Checks if an error is a transient network error that shouldn't crash the gateway.
 * These are typically temporary connectivity issues that will resolve on their own.
 *
 * A value is considered transient when any of the following holds:
 * - its string `code` is one of `TRANSIENT_NETWORK_CODES`;
 * - it is a `TypeError` with message "fetch failed" and either has no cause or
 *   its cause is itself transient;
 * - something in its `cause` chain is transient;
 * - it is an AggregateError-style error and at least one member is transient.
 *
 * Cyclic cause chains are tolerated: a value already being inspected is never
 * followed a second time.
 *
 * @param err - Any rejection reason or thrown value.
 * @returns `true` when the value (or something it wraps) is a transient network failure.
 */
export function isTransientNetworkError(err: unknown): boolean {
  return hasTransientNetworkError(err, new Set<unknown>());
}
|
||||||
|
|
||||||
export function registerUnhandledRejectionHandler(handler: UnhandledRejectionHandler): () => void {
|
export function registerUnhandledRejectionHandler(handler: UnhandledRejectionHandler): () => void {
|
||||||
handlers.add(handler);
|
handlers.add(handler);
|
||||||
return () => {
|
return () => {
|
||||||
@ -13,36 +90,6 @@ export function registerUnhandledRejectionHandler(handler: UnhandledRejectionHan
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Check if an error is a recoverable/transient error that shouldn't crash the process.
|
|
||||||
* These include network errors and abort signals during shutdown.
|
|
||||||
*/
|
|
||||||
function isRecoverableError(reason: unknown): boolean {
|
|
||||||
if (!reason) return false;
|
|
||||||
|
|
||||||
// Check error name for AbortError
|
|
||||||
if (reason instanceof Error && reason.name === "AbortError") {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
const message = reason instanceof Error ? reason.message : formatErrorMessage(reason);
|
|
||||||
const lowerMessage = message.toLowerCase();
|
|
||||||
return (
|
|
||||||
lowerMessage.includes("fetch failed") ||
|
|
||||||
lowerMessage.includes("network request") ||
|
|
||||||
lowerMessage.includes("econnrefused") ||
|
|
||||||
lowerMessage.includes("econnreset") ||
|
|
||||||
lowerMessage.includes("etimedout") ||
|
|
||||||
lowerMessage.includes("socket hang up") ||
|
|
||||||
lowerMessage.includes("enotfound") ||
|
|
||||||
lowerMessage.includes("network error") ||
|
|
||||||
lowerMessage.includes("getaddrinfo") ||
|
|
||||||
lowerMessage.includes("client network socket disconnected") ||
|
|
||||||
lowerMessage.includes("this operation was aborted") ||
|
|
||||||
lowerMessage.includes("aborted")
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
export function isUnhandledRejectionHandled(reason: unknown): boolean {
|
export function isUnhandledRejectionHandled(reason: unknown): boolean {
|
||||||
for (const handler of handlers) {
|
for (const handler of handlers) {
|
||||||
try {
|
try {
|
||||||
@ -61,9 +108,17 @@ export function installUnhandledRejectionHandler(): void {
|
|||||||
process.on("unhandledRejection", (reason, _promise) => {
|
process.on("unhandledRejection", (reason, _promise) => {
|
||||||
if (isUnhandledRejectionHandled(reason)) return;
|
if (isUnhandledRejectionHandled(reason)) return;
|
||||||
|
|
||||||
// Don't crash on recoverable/transient errors - log them and continue
|
// AbortError is typically an intentional cancellation (e.g., during shutdown)
|
||||||
if (isRecoverableError(reason)) {
|
// Log it but don't crash - these are expected during graceful shutdown
|
||||||
console.error("[clawdbot] Recoverable error (not crashing):", formatUncaughtError(reason));
|
if (isAbortError(reason)) {
|
||||||
|
console.warn("[clawdbot] Suppressed AbortError:", formatUncaughtError(reason));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Transient network errors (fetch failed, connection reset, etc.) shouldn't crash
|
||||||
|
// These are temporary connectivity issues that will resolve on their own
|
||||||
|
if (isTransientNetworkError(reason)) {
|
||||||
|
console.error("[clawdbot] Network error (non-fatal):", formatUncaughtError(reason));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user