From 244ebc1b569d65f5228194f69285cb54330cd875 Mon Sep 17 00:00:00 2001
From: mbp-2013
Date: Thu, 29 Jan 2026 23:32:50 -0800
Subject: [PATCH] fix: prevent gateway crash when all auth profiles are in cooldown

- Add AllModelsFailedError class with cooldown detection
- Modify runWithModelFallback() to throw typed error with retry timing
- Add handler in unhandled-rejections.ts to log warning instead of exit

Fixes #2811
---
 CHANGELOG.md                            |  1 +
 src/agents/model-fallback-error.test.ts | 91 +++++++++++++++++++++++++
 src/agents/model-fallback-error.ts      | 54 +++++++++++++++
 src/agents/model-fallback.ts            | 58 ++++++++++++----
 src/infra/unhandled-rejections.ts       | 25 +++++++
 5 files changed, 221 insertions(+), 8 deletions(-)
 create mode 100644 src/agents/model-fallback-error.test.ts
 create mode 100644 src/agents/model-fallback-error.ts

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4c0549c16..e628b3399 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -72,6 +72,7 @@ Status: stable.
 - **BREAKING:** Gateway auth mode "none" is removed; gateway now requires token/password (Tailscale Serve identity still allowed).
 
 ### Fixes
+- Gateway: prevent crash when all auth profiles are in cooldown; log warning and continue. (#2811)
 - Telegram: avoid silent empty replies by tracking normalization skips before fallback. (#3796)
 - Telegram: scope native skill commands to bound agent per bot. (#4360) Thanks @robhparker.
 - Mentions: honor mentionPatterns even when explicit mentions are present. (#3303) Thanks @HirokiKobayashi-R.
diff --git a/src/agents/model-fallback-error.test.ts b/src/agents/model-fallback-error.test.ts
new file mode 100644
index 000000000..8968ee383
--- /dev/null
+++ b/src/agents/model-fallback-error.test.ts
@@ -0,0 +1,91 @@
+import { describe, it, expect } from "vitest";
+import { AllModelsFailedError, isAllModelsFailedError } from "./model-fallback-error.js";
+
+describe("AllModelsFailedError", () => {
+  it("creates error with cooldown-only flag", () => {
+    const error = new AllModelsFailedError("All models failed", {
+      attempts: [
+        {
+          provider: "anthropic",
+          model: "claude-3-5",
+          error: "cooldown",
+          reason: "rate_limit",
+        },
+      ],
+      allInCooldown: true,
+      retryAfterMs: 300000,
+    });
+
+    expect(error.name).toBe("AllModelsFailedError");
+    expect(error.allInCooldown).toBe(true);
+    expect(error.isCooldownOnly()).toBe(true);
+    expect(isAllModelsFailedError(error)).toBe(true);
+    expect(error.retryAfterMs).toBe(300000);
+  });
+
+  it("distinguishes mixed failures", () => {
+    const error = new AllModelsFailedError("msg", {
+      attempts: [
+        { provider: "anthropic", model: "c", error: "cooldown", reason: "rate_limit" },
+        { provider: "openai", model: "gpt-4", error: "auth", reason: "auth" },
+      ],
+      allInCooldown: false,
+    });
+    expect(error.isCooldownOnly()).toBe(false);
+    expect(error.allInCooldown).toBe(false);
+  });
+
+  it("returns false for isCooldownOnly when no attempts", () => {
+    const error = new AllModelsFailedError("msg", {
+      attempts: [],
+      allInCooldown: true,
+    });
+    expect(error.isCooldownOnly()).toBe(false);
+  });
+
+  it("preserves cause in error chain", () => {
+    const cause = new Error("Original error");
+    const error = new AllModelsFailedError("All models failed", {
+      attempts: [{ provider: "anthropic", model: "c", error: "cooldown", reason: "rate_limit" }],
+      allInCooldown: true,
+      cause,
+    });
+
+    expect(error.cause).toBe(cause);
+  });
+
+  it("includes all attempt details", () => {
+    const attempts = [
+      {
+        provider: "anthropic",
+        model: "c",
+        error: "cooldown",
+        reason: "rate_limit" as const,
+        status: 429,
+        code: "rate_limit",
+      },
+      { provider: "openai", model: "gpt-4", error: "auth", reason: "auth" as const, status: 401 },
+    ];
+    const error = new AllModelsFailedError("msg", {
+      attempts,
+      allInCooldown: false,
+    });
+
+    expect(error.attempts).toEqual(attempts);
+  });
+
+  it("type guard works correctly", () => {
+    const error = new AllModelsFailedError("msg", {
+      attempts: [{ provider: "anthropic", model: "c", error: "c", reason: "rate_limit" }],
+      allInCooldown: true,
+    });
+    expect(isAllModelsFailedError(error)).toBe(true);
+
+    const regularError = new Error("regular");
+    expect(isAllModelsFailedError(regularError)).toBe(false);
+
+    expect(isAllModelsFailedError(null)).toBe(false);
+    expect(isAllModelsFailedError(undefined)).toBe(false);
+    expect(isAllModelsFailedError("string")).toBe(false);
+  });
+});
diff --git a/src/agents/model-fallback-error.ts b/src/agents/model-fallback-error.ts
new file mode 100644
index 000000000..1a0a6f7d4
--- /dev/null
+++ b/src/agents/model-fallback-error.ts
@@ -0,0 +1,54 @@
+import type { FailoverReason } from "./pi-embedded-helpers/types.js";
+
+/**
+ * Thrown by `runWithModelFallback()` when every candidate model failed.
+ *
+ * Keeps the per-attempt details so handlers can tell an all-cooldown
+ * outage (transient, retryable) apart from mixed failures.
+ */
+export class AllModelsFailedError extends Error {
+  /** One entry per failed provider/model attempt. */
+  readonly attempts: Array<{
+    provider: string;
+    model: string;
+    error: string;
+    reason?: FailoverReason;
+    status?: number;
+    code?: string;
+  }>;
+  /** Whether the thrower classified every attempt as a cooldown failure. */
+  readonly allInCooldown: boolean;
+  /** Milliseconds until the earliest cooldown expiry, when it is known. */
+  readonly retryAfterMs?: number;
+
+  /**
+   * @param message - Human-readable summary of the failures.
+   * @param params - Attempt details, cooldown flag, optional retry delay
+   *   and the underlying cause (forwarded to `Error`'s `cause` option).
+   */
+  constructor(
+    message: string,
+    params: {
+      attempts: AllModelsFailedError["attempts"];
+      allInCooldown: boolean;
+      retryAfterMs?: number;
+      cause?: unknown;
+    },
+  ) {
+    super(message, { cause: params.cause });
+    this.name = "AllModelsFailedError";
+    this.attempts = params.attempts;
+    this.allInCooldown = params.allInCooldown;
+    this.retryAfterMs = params.retryAfterMs;
+  }
+
+  /** True only when `allInCooldown` is set and at least one attempt exists. */
+  isCooldownOnly(): boolean {
+    return this.allInCooldown && this.attempts.length > 0;
+  }
+}
+
+/** Type guard: true only for `AllModelsFailedError` instances. */
+export function isAllModelsFailedError(err: unknown): err is AllModelsFailedError {
+  return err instanceof AllModelsFailedError;
+}
diff --git a/src/agents/model-fallback.ts b/src/agents/model-fallback.ts
index b99a104d5..1ce1a5a4f 100644
--- a/src/agents/model-fallback.ts
+++ b/src/agents/model-fallback.ts
@@ -1,4 +1,9 @@
 import type { OpenClawConfig } from "../config/config.js";
+import {
+  ensureAuthProfileStore,
+  isProfileInCooldown,
+  resolveAuthProfileOrder,
+} from "./auth-profiles.js";
 import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js";
 import {
   coerceToFailoverError,
@@ -6,6 +11,7 @@ import {
   isFailoverError,
   isTimeoutError,
 } from "./failover-error.js";
+import { AllModelsFailedError } from "./model-fallback-error.js";
 import {
   buildModelAliasIndex,
   modelKey,
@@ -14,11 +20,6 @@ import {
   resolveModelRefFromString,
 } from "./model-selection.js";
 import type { FailoverReason } from "./pi-embedded-helpers.js";
-import {
-  ensureAuthProfileStore,
-  isProfileInCooldown,
-  resolveAuthProfileOrder,
-} from "./auth-profiles.js";
 
 type ModelCandidate = {
   provider: string;
@@ -293,9 +294,50 @@ export async function runWithModelFallback(params: {
           )
           .join(" | ")
       : "unknown";
-  throw new Error(`All models failed (${attempts.length || candidates.length}): ${summary}`, {
-    cause: lastError instanceof Error ? lastError : undefined,
-  });
+
+  // Treat the outage as "cooldown only" when every recorded attempt failed
+  // with a rate_limit reason; an empty attempt list never qualifies.
+  const allCooldown = attempts.length > 0 && attempts.every((a) => a.reason === "rate_limit");
+
+  // When everything is cooling down, report how long until the first auth
+  // profile becomes usable again so callers can schedule a retry.
+  let retryAfterMs: number | undefined;
+  if (allCooldown && authStore) {
+    const profileIds = new Set<string>();
+    for (const candidate of candidates) {
+      const profiles = resolveAuthProfileOrder({
+        cfg: params.cfg,
+        store: authStore,
+        provider: candidate.provider,
+      });
+      profiles.forEach((id) => profileIds.add(id));
+    }
+
+    let earliest: number | null = null;
+    for (const id of profileIds) {
+      const stats = authStore.usageStats?.[id];
+      if (!stats) continue;
+      // A profile is unusable until both its cooldown and its disable window end.
+      const unusableUntil = Math.max(stats.cooldownUntil ?? 0, stats.disabledUntil ?? 0);
+      if (unusableUntil > 0 && (earliest === null || unusableUntil < earliest)) {
+        earliest = unusableUntil;
+      }
+    }
+    if (earliest !== null) {
+      // Clamp to 0: the earliest window may already have elapsed.
+      retryAfterMs = Math.max(0, earliest - Date.now());
+    }
+  }
+
+  throw new AllModelsFailedError(
+    `All models failed (${attempts.length || candidates.length}): ${summary}`,
+    {
+      attempts,
+      allInCooldown: allCooldown,
+      retryAfterMs,
+      cause: lastError instanceof Error ? lastError : undefined,
+    },
+  );
 }
 
 export async function runWithImageModelFallback(params: {
diff --git a/src/infra/unhandled-rejections.ts b/src/infra/unhandled-rejections.ts
index 4d2a48d23..84e230bdd 100644
--- a/src/infra/unhandled-rejections.ts
+++ b/src/infra/unhandled-rejections.ts
@@ -1,5 +1,6 @@
 import process from "node:process";
 
+import { isAllModelsFailedError } from "../agents/model-fallback-error.js";
 import { extractErrorCode, formatUncaughtError } from "./errors.js";
 
 type UnhandledRejectionHandler = (reason: unknown) => boolean;
@@ -134,6 +135,30 @@ export function installUnhandledRejectionHandler(): void {
       return;
     }
 
+    // A total model outage is reported but must not take the gateway down.
+    if (isAllModelsFailedError(reason)) {
+      if (reason.allInCooldown) {
+        // `retryAfterMs` may legitimately be 0 ("retry now"), so test for
+        // undefined rather than truthiness; round up so a sub-minute wait
+        // is not reported as "0min".
+        const retryHint =
+          reason.retryAfterMs === undefined
+            ? "unknown"
+            : `${Math.ceil(reason.retryAfterMs / 60000)}min`;
+        // Several failed models may share one provider; list each once.
+        const providers = [...new Set(reason.attempts.map((a) => a.provider))].join(", ");
+        console.warn(
+          `[openclaw] All models in cooldown - gateway continuing. ` +
+            `Retry after ${retryHint}. ` +
+            `Providers: ${providers}`,
+        );
+        return; // Don't exit
+      }
+      // Mixed failures (not all cooldown) - log but don't crash
+      console.warn("[openclaw] All models failed (mixed reasons):", formatUncaughtError(reason));
+      return;
+    }
+
     if (isFatalError(reason)) {
       console.error("[openclaw] FATAL unhandled rejection:", formatUncaughtError(reason));
       process.exit(1);