fix(gateway): gracefully handle AbortError and transient network errors

Addresses issues #1851, #1997, and #2034.

During config reload (SIGUSR1), in-flight requests are aborted, causing
AbortError exceptions. Similarly, transient network errors (fetch failed,
ECONNRESET, ETIMEDOUT, etc.) can crash the gateway unnecessarily.

This change:
- Adds isAbortError() to detect intentional cancellations
- Adds isTransientNetworkError() to detect temporary connectivity issues
- Logs these errors appropriately instead of crashing
- Handles nested cause chains and AggregateError

AbortError is logged as a warning (expected during shutdown).
Network errors are logged as non-fatal errors (will resolve on their own).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Glucksberg 2026-01-26 23:50:20 +00:00 committed by Shadow
parent c120aa8a2e
commit a2fd1cefff
No known key found for this signature in database
2 changed files with 218 additions and 34 deletions

View File

@ -0,0 +1,129 @@
import { describe, expect, it } from "vitest";
import { isAbortError, isTransientNetworkError } from "./unhandled-rejections.js";
// Specification for isAbortError: matches on the "AbortError" name or on the
// exact "This operation was aborted" message, and rejects everything else.
describe("isAbortError", () => {
  it("returns true for error with name AbortError", () => {
    const renamed = new Error("aborted");
    renamed.name = "AbortError";

    expect(isAbortError(renamed)).toBe(true);
  });

  it('returns true for error with "This operation was aborted" message', () => {
    expect(isAbortError(new Error("This operation was aborted"))).toBe(true);
  });

  it("returns true for undici-style AbortError", () => {
    // Node's undici throws errors with this exact message
    const undiciStyle = new Error("This operation was aborted");
    Object.assign(undiciStyle, { name: "AbortError" });

    expect(isAbortError(undiciStyle)).toBe(true);
  });

  it("returns true for object with AbortError name", () => {
    // A plain object (not an Error instance) is accepted as long as the name matches.
    const abortShaped = { name: "AbortError", message: "test" };

    expect(isAbortError(abortShaped)).toBe(true);
  });

  it("returns false for regular errors", () => {
    const ordinaryErrors = [
      new Error("Something went wrong"),
      new TypeError("Cannot read property"),
      new RangeError("Invalid array length"),
    ];

    for (const candidate of ordinaryErrors) {
      expect(isAbortError(candidate)).toBe(false);
    }
  });

  it("returns false for errors with similar but different messages", () => {
    // Only the exact message counts; near misses must not be treated as aborts.
    const nearMisses = ["Operation aborted", "aborted", "Request was aborted"];

    for (const text of nearMisses) {
      expect(isAbortError(new Error(text))).toBe(false);
    }
  });

  it("returns false for null and undefined", () => {
    for (const missing of [null, undefined]) {
      expect(isAbortError(missing)).toBe(false);
    }
  });

  it("returns false for non-error values", () => {
    for (const primitive of ["string error", 42]) {
      expect(isAbortError(primitive)).toBe(false);
    }
  });

  it("returns false for plain objects without AbortError name", () => {
    const unnamed = { message: "plain object" };

    expect(isAbortError(unnamed)).toBe(false);
  });
});
// Specification for isTransientNetworkError: known network error codes, bare
// "fetch failed" TypeErrors, cause chains and AggregateError members.
describe("isTransientNetworkError", () => {
  it("returns true for errors with transient network codes", () => {
    const codes = [
      "ECONNRESET",
      "ECONNREFUSED",
      "ENOTFOUND",
      "ETIMEDOUT",
      "ESOCKETTIMEDOUT",
      "ECONNABORTED",
      "EPIPE",
      "EHOSTUNREACH",
      "ENETUNREACH",
      "EAI_AGAIN",
      "UND_ERR_CONNECT_TIMEOUT",
      "UND_ERR_SOCKET",
      "UND_ERR_HEADERS_TIMEOUT",
      "UND_ERR_BODY_TIMEOUT",
    ];

    for (const code of codes) {
      const coded = Object.assign(new Error("test"), { code });
      // The second argument labels a failing assertion with the offending code.
      expect(isTransientNetworkError(coded), `code: ${code}`).toBe(true);
    }
  });

  it('returns true for TypeError with "fetch failed" message', () => {
    expect(isTransientNetworkError(new TypeError("fetch failed"))).toBe(true);
  });

  it("returns true for fetch failed with network cause", () => {
    const dnsFailure = Object.assign(new Error("getaddrinfo ENOTFOUND"), { code: "ENOTFOUND" });
    const fetchFailure = Object.assign(new TypeError("fetch failed"), { cause: dnsFailure });

    expect(isTransientNetworkError(fetchFailure)).toBe(true);
  });

  it("returns true for nested cause chain with network error", () => {
    // Three levels deep: fetch failed -> wrapper -> ECONNRESET.
    const reset = Object.assign(new Error("connection reset"), { code: "ECONNRESET" });
    const wrapper = Object.assign(new Error("wrapper"), { cause: reset });
    const fetchFailure = Object.assign(new TypeError("fetch failed"), { cause: wrapper });

    expect(isTransientNetworkError(fetchFailure)).toBe(true);
  });

  it("returns true for AggregateError containing network errors", () => {
    const timedOut = Object.assign(new Error("timeout"), { code: "ETIMEDOUT" });
    const aggregate = new AggregateError([timedOut], "Multiple errors");

    expect(isTransientNetworkError(aggregate)).toBe(true);
  });

  it("returns false for regular errors without network codes", () => {
    const ordinaryErrors = [
      new Error("Something went wrong"),
      new TypeError("Cannot read property"),
      new RangeError("Invalid array length"),
    ];

    for (const candidate of ordinaryErrors) {
      expect(isTransientNetworkError(candidate)).toBe(false);
    }
  });

  it("returns false for errors with non-network codes", () => {
    const misconfigured = Object.assign(new Error("test"), { code: "INVALID_CONFIG" });

    expect(isTransientNetworkError(misconfigured)).toBe(false);
  });

  it("returns false for null and undefined", () => {
    for (const missing of [null, undefined]) {
      expect(isTransientNetworkError(missing)).toBe(false);
    }
  });

  it("returns false for non-error values", () => {
    const nonErrors = ["string error", 42, { message: "plain object" }];

    for (const value of nonErrors) {
      expect(isTransientNetworkError(value)).toBe(false);
    }
  });

  it("returns false for AggregateError with only non-network errors", () => {
    const aggregate = new AggregateError([new Error("regular error")], "Multiple errors");

    expect(isTransientNetworkError(aggregate)).toBe(false);
  });
});

View File

@ -1,11 +1,88 @@
import process from "node:process";
import { formatErrorMessage, formatUncaughtError } from "./errors.js";
import { formatUncaughtError } from "./errors.js";
// Callback shape for handlers that are consulted with the rejection reason
// when a promise rejection goes unhandled.
// NOTE(review): the boolean presumably means "this handler dealt with it" — confirm
// against isUnhandledRejectionHandled, whose body decides how the result is used.
type UnhandledRejectionHandler = (reason: unknown) => boolean;
// Module-level registry of handlers; registerUnhandledRejectionHandler adds to it.
const handlers = new Set<UnhandledRejectionHandler>();
/**
 * Tells whether a thrown value represents an aborted operation.
 *
 * Aborts are usually deliberate cancellations (for example in-flight requests
 * cancelled during shutdown), so callers can log them instead of crashing.
 *
 * @param err - Any thrown or rejected value.
 * @returns `true` when `err` is an object whose name stringifies to
 *   "AbortError", or whose message is exactly "This operation was aborted"
 *   (the message used by Node's undici); `false` otherwise, including for
 *   primitives, `null` and `undefined`.
 */
export function isAbortError(err: unknown): boolean {
  if (typeof err !== "object" || err === null) return false;

  const candidate = err as { name?: unknown; message?: unknown };

  // The name is checked first; the message is only read when the name does not match.
  return (
    String(candidate.name) === "AbortError" ||
    candidate.message === "This operation was aborted"
  );
}
// Network error codes that indicate transient failures (shouldn't crash the gateway)
const TRANSIENT_NETWORK_CODES = new Set([
  "ECONNRESET",
  "ECONNREFUSED",
  "ENOTFOUND",
  "ETIMEDOUT",
  "ESOCKETTIMEDOUT",
  "ECONNABORTED",
  "EPIPE",
  "EHOSTUNREACH",
  "ENETUNREACH",
  "EAI_AGAIN",
  "UND_ERR_CONNECT_TIMEOUT",
  "UND_ERR_SOCKET",
  "UND_ERR_HEADERS_TIMEOUT",
  "UND_ERR_BODY_TIMEOUT",
]);

/**
 * Reads the `code` property of an error-like object.
 *
 * @returns The code when `err` is an object whose `code` is a string; otherwise `undefined`.
 */
function getErrorCode(err: unknown): string | undefined {
  if (!err || typeof err !== "object") return undefined;
  const code = (err as { code?: unknown }).code;
  return typeof code === "string" ? code : undefined;
}

/**
 * Reads the `cause` property of an error-like object.
 *
 * @returns Whatever is stored in `cause`, or `undefined` when `err` is not an object.
 */
function getErrorCause(err: unknown): unknown {
  if (!err || typeof err !== "object") return undefined;
  return (err as { cause?: unknown }).cause;
}

/**
 * Recursive worker behind isTransientNetworkError.
 *
 * @param err - The value currently being inspected.
 * @param visited - Values already inspected during this walk. A value that is
 *   seen again is treated as non-transient, which keeps cyclic `cause` /
 *   `errors` graphs from recursing without bound.
 */
function hasTransientNetworkError(err: unknown, visited: Set<unknown>): boolean {
  if (!err) return false;
  if (visited.has(err)) return false;
  visited.add(err);

  // Check the error itself
  const code = getErrorCode(err);
  if (code && TRANSIENT_NETWORK_CODES.has(code)) return true;

  const cause = getErrorCause(err);
  // A cause that points back at the error itself carries no extra information.
  const hasDistinctCause = Boolean(cause) && cause !== err;

  // "fetch failed" TypeError from undici (Node's native fetch)
  if (err instanceof TypeError && err.message === "fetch failed") {
    // The cause often contains the actual network error, so it decides the outcome.
    // Even without a cause, "fetch failed" is typically a network issue.
    return hasDistinctCause ? hasTransientNetworkError(cause, visited) : true;
  }

  // Check the cause chain recursively. A non-transient cause does not end the
  // search: an AggregateError may still carry transient errors in `errors`.
  if (hasDistinctCause && hasTransientNetworkError(cause, visited)) return true;

  // AggregateError may wrap multiple causes
  if (err instanceof AggregateError && err.errors?.length) {
    return err.errors.some((inner) => hasTransientNetworkError(inner, visited));
  }

  return false;
}

/**
 * Checks if an error is a transient network error that shouldn't crash the gateway.
 * These are typically temporary connectivity issues that will resolve on their own.
 *
 * An error counts as transient when it, anything in its `cause` chain, or any
 * member of an AggregateError's `errors` has a code listed in
 * TRANSIENT_NETWORK_CODES, or is a `TypeError("fetch failed")` without a cause.
 * A "fetch failed" TypeError that does have a cause is judged by that cause alone.
 *
 * @param err - Any thrown or rejected value.
 * @returns `true` for transient network failures; `false` otherwise, including
 *   for falsy values, primitives, and cyclic error graphs with no transient member.
 */
export function isTransientNetworkError(err: unknown): boolean {
  return hasTransientNetworkError(err, new Set<unknown>());
}
export function registerUnhandledRejectionHandler(handler: UnhandledRejectionHandler): () => void {
handlers.add(handler);
return () => {
@ -13,36 +90,6 @@ export function registerUnhandledRejectionHandler(handler: UnhandledRejectionHan
};
}
/**
 * Heuristic for rejection reasons that should be logged rather than crash the process.
 *
 * A reason is considered recoverable when it is an Error named "AbortError",
 * or when its lower-cased message contains any of the substrings listed below.
 * For non-Error reasons the message text comes from formatErrorMessage.
 *
 * @param reason - The rejection reason to classify.
 * @returns `true` when the reason matches one of the rules above; `false` for
 *   falsy reasons and anything that matches none of them.
 */
function isRecoverableError(reason: unknown): boolean {
  if (!reason) return false;

  // Check error name for AbortError
  if (reason instanceof Error) {
    if (reason.name === "AbortError") return true;
  }

  const rawMessage = reason instanceof Error ? reason.message : formatErrorMessage(reason);
  const haystack = rawMessage.toLowerCase();

  // Matching is by substring, so each marker also matches longer messages that contain it.
  const markers = [
    "fetch failed",
    "network request",
    "econnrefused",
    "econnreset",
    "etimedout",
    "socket hang up",
    "enotfound",
    "network error",
    "getaddrinfo",
    "client network socket disconnected",
    "this operation was aborted",
    "aborted",
  ];

  return markers.some((marker) => haystack.includes(marker));
}
export function isUnhandledRejectionHandled(reason: unknown): boolean {
for (const handler of handlers) {
try {
@ -61,9 +108,17 @@ export function installUnhandledRejectionHandler(): void {
process.on("unhandledRejection", (reason, _promise) => {
if (isUnhandledRejectionHandled(reason)) return;
// Don't crash on recoverable/transient errors - log them and continue
if (isRecoverableError(reason)) {
console.error("[clawdbot] Recoverable error (not crashing):", formatUncaughtError(reason));
// AbortError is typically an intentional cancellation (e.g., during shutdown)
// Log it but don't crash - these are expected during graceful shutdown
if (isAbortError(reason)) {
console.warn("[clawdbot] Suppressed AbortError:", formatUncaughtError(reason));
return;
}
// Transient network errors (fetch failed, connection reset, etc.) shouldn't crash
// These are temporary connectivity issues that will resolve on their own
if (isTransientNetworkError(reason)) {
console.error("[clawdbot] Network error (non-fatal):", formatUncaughtError(reason));
return;
}