diff --git a/src/infra/unhandled-rejections.test.ts b/src/infra/unhandled-rejections.test.ts
new file mode 100644
index 000000000..1ec144ba1
--- /dev/null
+++ b/src/infra/unhandled-rejections.test.ts
@@ -0,0 +1,150 @@
+import { describe, expect, it } from "vitest";
+
+import { isAbortError, isTransientNetworkError } from "./unhandled-rejections.js";
+
+describe("isAbortError", () => {
+  it("returns true for error with name AbortError", () => {
+    const error = new Error("aborted");
+    error.name = "AbortError";
+    expect(isAbortError(error)).toBe(true);
+  });
+
+  it('returns true for error with "This operation was aborted" message', () => {
+    const error = new Error("This operation was aborted");
+    expect(isAbortError(error)).toBe(true);
+  });
+
+  it("returns true for undici-style AbortError", () => {
+    // Node's undici throws errors with this exact message
+    const error = Object.assign(new Error("This operation was aborted"), { name: "AbortError" });
+    expect(isAbortError(error)).toBe(true);
+  });
+
+  it("returns true for object with AbortError name", () => {
+    expect(isAbortError({ name: "AbortError", message: "test" })).toBe(true);
+  });
+
+  it("returns false for regular errors", () => {
+    expect(isAbortError(new Error("Something went wrong"))).toBe(false);
+    expect(isAbortError(new TypeError("Cannot read property"))).toBe(false);
+    expect(isAbortError(new RangeError("Invalid array length"))).toBe(false);
+  });
+
+  it("returns false for errors with similar but different messages", () => {
+    expect(isAbortError(new Error("Operation aborted"))).toBe(false);
+    expect(isAbortError(new Error("aborted"))).toBe(false);
+    expect(isAbortError(new Error("Request was aborted"))).toBe(false);
+  });
+
+  it("returns false for null and undefined", () => {
+    expect(isAbortError(null)).toBe(false);
+    expect(isAbortError(undefined)).toBe(false);
+  });
+
+  it("returns false for non-error values", () => {
+    expect(isAbortError("string error")).toBe(false);
+    expect(isAbortError(42)).toBe(false);
+  });
+
+  it("returns false for plain objects without AbortError name", () => {
+    expect(isAbortError({ message: "plain object" })).toBe(false);
+  });
+});
+
+describe("isTransientNetworkError", () => {
+  it("returns true for errors with transient network codes", () => {
+    const codes = [
+      "ECONNRESET",
+      "ECONNREFUSED",
+      "ENOTFOUND",
+      "ETIMEDOUT",
+      "ESOCKETTIMEDOUT",
+      "ECONNABORTED",
+      "EPIPE",
+      "EHOSTUNREACH",
+      "ENETUNREACH",
+      "EAI_AGAIN",
+      "UND_ERR_CONNECT_TIMEOUT",
+      "UND_ERR_SOCKET",
+      "UND_ERR_HEADERS_TIMEOUT",
+      "UND_ERR_BODY_TIMEOUT",
+    ];
+
+    for (const code of codes) {
+      const error = Object.assign(new Error("test"), { code });
+      expect(isTransientNetworkError(error), `code: ${code}`).toBe(true);
+    }
+  });
+
+  it('returns true for TypeError with "fetch failed" message', () => {
+    const error = new TypeError("fetch failed");
+    expect(isTransientNetworkError(error)).toBe(true);
+  });
+
+  it("returns true for fetch failed with network cause", () => {
+    const cause = Object.assign(new Error("getaddrinfo ENOTFOUND"), { code: "ENOTFOUND" });
+    const error = Object.assign(new TypeError("fetch failed"), { cause });
+    expect(isTransientNetworkError(error)).toBe(true);
+  });
+
+  it("returns true for nested cause chain with network error", () => {
+    const innerCause = Object.assign(new Error("connection reset"), { code: "ECONNRESET" });
+    const outerCause = Object.assign(new Error("wrapper"), { cause: innerCause });
+    const error = Object.assign(new TypeError("fetch failed"), { cause: outerCause });
+    expect(isTransientNetworkError(error)).toBe(true);
+  });
+
+  it("returns true for AggregateError containing network errors", () => {
+    const networkError = Object.assign(new Error("timeout"), { code: "ETIMEDOUT" });
+    const error = new AggregateError([networkError], "Multiple errors");
+    expect(isTransientNetworkError(error)).toBe(true);
+  });
+
+  it("returns true for AggregateError whose cause is not a network error", () => {
+    const networkError = Object.assign(new Error("timeout"), { code: "ETIMEDOUT" });
+    const error = Object.assign(new AggregateError([networkError], "Multiple errors"), {
+      cause: new Error("unrelated"),
+    });
+    expect(isTransientNetworkError(error)).toBe(true);
+  });
+
+  it("returns false for regular errors without network codes", () => {
+    expect(isTransientNetworkError(new Error("Something went wrong"))).toBe(false);
+    expect(isTransientNetworkError(new TypeError("Cannot read property"))).toBe(false);
+    expect(isTransientNetworkError(new RangeError("Invalid array length"))).toBe(false);
+  });
+
+  it("returns false for errors with non-network codes", () => {
+    const error = Object.assign(new Error("test"), { code: "INVALID_CONFIG" });
+    expect(isTransientNetworkError(error)).toBe(false);
+  });
+
+  it("returns false for null and undefined", () => {
+    expect(isTransientNetworkError(null)).toBe(false);
+    expect(isTransientNetworkError(undefined)).toBe(false);
+  });
+
+  it("returns false for non-error values", () => {
+    expect(isTransientNetworkError("string error")).toBe(false);
+    expect(isTransientNetworkError(42)).toBe(false);
+    expect(isTransientNetworkError({ message: "plain object" })).toBe(false);
+  });
+
+  it("returns false for AggregateError with only non-network errors", () => {
+    const error = new AggregateError([new Error("regular error")], "Multiple errors");
+    expect(isTransientNetworkError(error)).toBe(false);
+  });
+
+  it("returns false for cyclic cause chains instead of overflowing the stack", () => {
+    const first = new Error("first");
+    const second = Object.assign(new Error("second"), { cause: first });
+    Object.assign(first, { cause: second });
+    expect(isTransientNetworkError(first)).toBe(false);
+  });
+
+  it("returns false for a fetch failure that is its own cause", () => {
+    const error = new TypeError("fetch failed");
+    Object.assign(error, { cause: error });
+    expect(isTransientNetworkError(error)).toBe(false);
+  });
+});
diff --git a/src/infra/unhandled-rejections.ts b/src/infra/unhandled-rejections.ts
index ac7ac91d5..86e80e9a3 100644
--- a/src/infra/unhandled-rejections.ts
+++ b/src/infra/unhandled-rejections.ts
@@ -1,11 +1,101 @@
 import process from "node:process";
 
-import { formatErrorMessage, formatUncaughtError } from "./errors.js";
+import { formatUncaughtError } from "./errors.js";
 
 type UnhandledRejectionHandler = (reason: unknown) => boolean;
 
 const handlers = new Set();
 
+/**
+ * Checks if an error is an AbortError.
+ * These are typically intentional cancellations (e.g., during shutdown) and shouldn't crash.
+ */
+export function isAbortError(err: unknown): boolean {
+  if (!err || typeof err !== "object") return false;
+  const name = "name" in err ? String(err.name) : "";
+  if (name === "AbortError") return true;
+  // Check for "This operation was aborted" message from Node's undici
+  const message = "message" in err && typeof err.message === "string" ? err.message : "";
+  if (message === "This operation was aborted") return true;
+  return false;
+}
+
+// Network error codes that indicate transient failures (shouldn't crash the gateway)
+const TRANSIENT_NETWORK_CODES = new Set([
+  "ECONNRESET",
+  "ECONNREFUSED",
+  "ENOTFOUND",
+  "ETIMEDOUT",
+  "ESOCKETTIMEDOUT",
+  "ECONNABORTED",
+  "EPIPE",
+  "EHOSTUNREACH",
+  "ENETUNREACH",
+  "EAI_AGAIN",
+  "UND_ERR_CONNECT_TIMEOUT",
+  "UND_ERR_SOCKET",
+  "UND_ERR_HEADERS_TIMEOUT",
+  "UND_ERR_BODY_TIMEOUT",
+]);
+
+// Returns the string `code` property of an error-like object, or undefined
+function getErrorCode(err: unknown): string | undefined {
+  if (!err || typeof err !== "object") return undefined;
+  const code = (err as { code?: unknown }).code;
+  return typeof code === "string" ? code : undefined;
+}
+
+// Returns the `cause` property of an error-like object, or undefined
+function getErrorCause(err: unknown): unknown {
+  if (!err || typeof err !== "object") return undefined;
+  return (err as { cause?: unknown }).cause;
+}
+
+/**
+ * Checks if an error is a transient network error that shouldn't crash the gateway.
+ * These are typically temporary connectivity issues that will resolve on their own.
+ *
+ * Inspects the error's own `code`, then follows `cause` links and the members of any
+ * `AggregateError`. Cyclic references are tolerated: each object is inspected at most once.
+ */
+export function isTransientNetworkError(err: unknown): boolean {
+  return hasTransientNetworkCode(err, new Set<unknown>());
+}
+
+/**
+ * Recursive worker for `isTransientNetworkError`.
+ * `seen` holds every value already inspected so that cyclic `cause` / `errors`
+ * references terminate instead of overflowing the stack.
+ */
+function hasTransientNetworkCode(err: unknown, seen: Set<unknown>): boolean {
+  if (!err || seen.has(err)) return false;
+  seen.add(err);
+
+  // Check the error itself
+  const code = getErrorCode(err);
+  if (code && TRANSIENT_NETWORK_CODES.has(code)) return true;
+
+  const cause = getErrorCause(err);
+
+  // "fetch failed" TypeError from undici (Node's native fetch)
+  if (err instanceof TypeError && err.message === "fetch failed") {
+    // The cause often contains the actual network error, so it decides when present.
+    // Even without a cause, "fetch failed" is typically a network issue.
+    return cause ? hasTransientNetworkCode(cause, seen) : true;
+  }
+
+  // Check the cause chain recursively, but keep going when it is not transient:
+  // an AggregateError can carry both a `cause` and network errors in `errors`.
+  if (cause && hasTransientNetworkCode(cause, seen)) return true;
+
+  // AggregateError may wrap multiple causes
+  if (err instanceof AggregateError && Array.isArray(err.errors)) {
+    return err.errors.some((e) => hasTransientNetworkCode(e, seen));
+  }
+
+  return false;
+}
+
 export function registerUnhandledRejectionHandler(handler: UnhandledRejectionHandler): () => void {
   handlers.add(handler);
   return () => {
@@ -13,36 +103,6 @@ export function registerUnhandledRejectionHandler(handler: UnhandledRejectionHan
   };
 }
 
-/**
- * Check if an error is a recoverable/transient error that shouldn't crash the process.
- * These include network errors and abort signals during shutdown.
- */
-function isRecoverableError(reason: unknown): boolean {
-  if (!reason) return false;
-
-  // Check error name for AbortError
-  if (reason instanceof Error && reason.name === "AbortError") {
-    return true;
-  }
-
-  const message = reason instanceof Error ? reason.message : formatErrorMessage(reason);
-  const lowerMessage = message.toLowerCase();
-  return (
-    lowerMessage.includes("fetch failed") ||
-    lowerMessage.includes("network request") ||
-    lowerMessage.includes("econnrefused") ||
-    lowerMessage.includes("econnreset") ||
-    lowerMessage.includes("etimedout") ||
-    lowerMessage.includes("socket hang up") ||
-    lowerMessage.includes("enotfound") ||
-    lowerMessage.includes("network error") ||
-    lowerMessage.includes("getaddrinfo") ||
-    lowerMessage.includes("client network socket disconnected") ||
-    lowerMessage.includes("this operation was aborted") ||
-    lowerMessage.includes("aborted")
-  );
-}
-
 export function isUnhandledRejectionHandled(reason: unknown): boolean {
   for (const handler of handlers) {
     try {
@@ -61,9 +121,17 @@ export function installUnhandledRejectionHandler(): void {
   process.on("unhandledRejection", (reason, _promise) => {
     if (isUnhandledRejectionHandled(reason)) return;
 
-    // Don't crash on recoverable/transient errors - log them and continue
-    if (isRecoverableError(reason)) {
-      console.error("[clawdbot] Recoverable error (not crashing):", formatUncaughtError(reason));
+    // AbortError is typically an intentional cancellation (e.g., during shutdown)
+    // Log it but don't crash - these are expected during graceful shutdown
+    if (isAbortError(reason)) {
+      console.warn("[clawdbot] Suppressed AbortError:", formatUncaughtError(reason));
+      return;
+    }
+
+    // Transient network errors (fetch failed, connection reset, etc.) shouldn't crash
+    // These are temporary connectivity issues that will resolve on their own
+    if (isTransientNetworkError(reason)) {
+      console.error("[clawdbot] Network error (non-fatal):", formatUncaughtError(reason));
       return;
     }
 