fix: prevent gateway crash when all auth profiles are in cooldown

- Add AllModelsFailedError class with cooldown detection
- Modify runWithModelFallback() to throw a typed error with retry timing
- Add handler in unhandled-rejections.ts to log a warning instead of exiting

Fixes #2811
2026-01-29 23:32:50 -08:00 · 2026-01-29 23:32:50 -08:00 · 244ebc1b56
commit 244ebc1b56
parent 9025da2296
5 changed files with 193 additions and 8 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -72,6 +72,7 @@ Status: stable.
 - **BREAKING:** Gateway auth mode "none" is removed; gateway now requires token/password (Tailscale Serve identity still allowed).

 ### Fixes
+- Gateway: prevent crash when all auth profiles are in cooldown; log warning and continue. (#2811)
 - Telegram: avoid silent empty replies by tracking normalization skips before fallback. (#3796)
 - Telegram: scope native skill commands to bound agent per bot. (#4360) Thanks @robhparker.
 - Mentions: honor mentionPatterns even when explicit mentions are present. (#3303) Thanks @HirokiKobayashi-R.
--- a/src/agents/model-fallback-error.test.ts
+++ b/src/agents/model-fallback-error.test.ts
@ -0,0 +1,91 @@
+import { describe, it, expect } from "vitest";
+import { AllModelsFailedError, isAllModelsFailedError } from "./model-fallback-error.js";
+
+describe("AllModelsFailedError", () => {
+  // Reusable attempt records: a rate-limited provider and an auth failure.
+  const rateLimited = {
+    provider: "anthropic",
+    model: "c",
+    error: "cooldown",
+    reason: "rate_limit" as const,
+  };
+  const authFailure = {
+    provider: "openai",
+    model: "gpt-4",
+    error: "auth",
+    reason: "auth" as const,
+  };
+
+  it("creates error with cooldown-only flag", () => {
+    const retryAfterMs = 300000;
+    const error = new AllModelsFailedError("All models failed", {
+      attempts: [{ ...rateLimited, model: "claude-3-5" }],
+      allInCooldown: true,
+      retryAfterMs,
+    });
+
+    // Name, both cooldown views, the type guard, and the retry hint are all checked.
+    expect(error.name).toBe("AllModelsFailedError");
+    expect(error.allInCooldown).toBe(true);
+    expect(error.isCooldownOnly()).toBe(true);
+    expect(isAllModelsFailedError(error)).toBe(true);
+    expect(error.retryAfterMs).toBe(retryAfterMs);
+  });
+
+  it("distinguishes mixed failures", () => {
+    const error = new AllModelsFailedError("msg", {
+      attempts: [rateLimited, authFailure],
+      allInCooldown: false,
+    });
+    expect(error.isCooldownOnly()).toBe(false);
+    expect(error.allInCooldown).toBe(false);
+  });
+
+  it("returns false for isCooldownOnly when no attempts", () => {
+    // The flag alone is not enough: with zero attempts the helper reports false.
+    const error = new AllModelsFailedError("msg", { attempts: [], allInCooldown: true });
+    expect(error.isCooldownOnly()).toBe(false);
+  });
+
+  it("preserves cause in error chain", () => {
+    const cause = new Error("Original error");
+    const error = new AllModelsFailedError("All models failed", {
+      attempts: [rateLimited],
+      allInCooldown: true,
+      cause,
+    });
+    expect(error.cause).toBe(cause);
+  });
+
+  it("includes all attempt details", () => {
+    // Attempts carrying the optional status/code fields must compare equal afterwards.
+    const attempts = [
+      { ...rateLimited, status: 429, code: "rate_limit" },
+      { ...authFailure, status: 401 },
+    ];
+    const error = new AllModelsFailedError("msg", { attempts, allInCooldown: false });
+    expect(error.attempts).toEqual(attempts);
+  });
+
+  it("type guard works correctly", () => {
+    const error = new AllModelsFailedError("msg", {
+      attempts: [{ ...rateLimited, error: "c" }],
+      allInCooldown: true,
+    });
+    expect(isAllModelsFailedError(error)).toBe(true);
+
+    // A plain Error and non-error values must all be rejected by the guard.
+    const rejected: unknown[] = [new Error("regular"), null, undefined, "string"];
+    for (const value of rejected) {
+      expect(isAllModelsFailedError(value)).toBe(false);
+    }
+  });
+});
--- a/src/agents/model-fallback-error.ts
+++ b/src/agents/model-fallback-error.ts
@ -0,0 +1,38 @@
+import type { FailoverReason } from "./pi-embedded-helpers/types.js";
+
+/**
+ * Error describing a run in which every model attempt failed.
+ *
+ * Stores the per-attempt records together with the caller-supplied cooldown
+ * flag and retry hint so handlers can inspect them without parsing the message.
+ */
+export class AllModelsFailedError extends Error {
+  /** One record per failed attempt, exactly as supplied by the caller. */
+  readonly attempts: Array<{
+    provider: string;
+    model: string;
+    error: string;
+    reason?: FailoverReason;
+    status?: number;
+    code?: string;
+  }>;
+  /** Caller-supplied flag: every recorded failure was a cooldown. */
+  readonly allInCooldown: boolean;
+  /** Caller-supplied retry hint in milliseconds; absent when not provided. */
+  readonly retryAfterMs?: number;
+
+  /**
+   * @param message Human-readable summary passed to `Error`.
+   * @param params Attempt records, cooldown flag, optional retry hint, and optional cause.
+   */
+  constructor(
+    message: string,
+    params: {
+      attempts: AllModelsFailedError["attempts"];
+      allInCooldown: boolean;
+      retryAfterMs?: number;
+      cause?: unknown;
+    },
+  ) {
+    // Pass options only when a cause was supplied: `{ cause: undefined }` still
+    // installs an own `cause` property, which shows up as `[cause]: undefined`.
+    super(message, params.cause === undefined ? undefined : { cause: params.cause });
+    this.name = "AllModelsFailedError";
+    this.attempts = params.attempts;
+    this.allInCooldown = params.allInCooldown;
+    this.retryAfterMs = params.retryAfterMs;
+  }
+
+  /** @returns `true` only when `allInCooldown` is set and at least one attempt was recorded. */
+  isCooldownOnly(): boolean {
+    return this.allInCooldown && this.attempts.length > 0;
+  }
+}
+
+/** Type guard: narrows an unknown value to `AllModelsFailedError` via an `instanceof` check. */
+export function isAllModelsFailedError(err: unknown): err is AllModelsFailedError {
+  if (err instanceof AllModelsFailedError) {
+    return true;
+  }
+  return false;
+}
--- a/src/agents/model-fallback.ts
+++ b/src/agents/model-fallback.ts
@ -1,4 +1,9 @@
 import type { OpenClawConfig } from "../config/config.js";
+import {
+  ensureAuthProfileStore,
+  isProfileInCooldown,
+  resolveAuthProfileOrder,
+} from "./auth-profiles.js";
 import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js";
 import {
  coerceToFailoverError,
@ -6,6 +11,7 @@ import {
  isFailoverError,
  isTimeoutError,
 } from "./failover-error.js";
+import { AllModelsFailedError } from "./model-fallback-error.js";
 import {
  buildModelAliasIndex,
  modelKey,
@ -14,11 +20,6 @@ import {
  resolveModelRefFromString,
 } from "./model-selection.js";
 import type { FailoverReason } from "./pi-embedded-helpers.js";
-import {
-  ensureAuthProfileStore,
-  isProfileInCooldown,
-  resolveAuthProfileOrder,
-} from "./auth-profiles.js";

 type ModelCandidate = {
  provider: string;
@ -293,9 +294,46 @@ export async function runWithModelFallback<T>(params: {
          )
          .join(" | ")
      : "unknown";
-  throw new Error(`All models failed (${attempts.length || candidates.length}): ${summary}`, {
-    cause: lastError instanceof Error ? lastError : undefined,
-  });
+
+  // Determine if all failures are due to cooldown (rate_limit)
+  const allCooldown = attempts.length > 0 && attempts.every((a) => a.reason === "rate_limit");
+
+  // Calculate earliest retry time if all in cooldown
+  let retryAfterMs: number | undefined;
+  if (allCooldown && authStore) {
+    const profileIds = new Set<string>();
+    for (const candidate of candidates) {
+      const profiles = resolveAuthProfileOrder({
+        cfg: params.cfg,
+        store: authStore,
+        provider: candidate.provider,
+      });
+      profiles.forEach((id) => profileIds.add(id));
+    }
+
+    let earliest: number | null = null;
+    for (const id of profileIds) {
+      const stats = authStore.usageStats?.[id];
+      if (!stats) continue;
+      const unusableUntil = Math.max(stats.cooldownUntil ?? 0, stats.disabledUntil ?? 0);
+      if (unusableUntil > 0 && (earliest === null || unusableUntil < earliest)) {
+        earliest = unusableUntil;
+      }
+    }
+    if (earliest) {
+      retryAfterMs = Math.max(0, earliest - Date.now());
+    }
+  }
+
+  throw new AllModelsFailedError(
+    `All models failed (${attempts.length || candidates.length}): ${summary}`,
+    {
+      attempts,
+      allInCooldown: allCooldown,
+      retryAfterMs,
+      cause: lastError instanceof Error ? lastError : undefined,
+    },
+  );
 }

 export async function runWithImageModelFallback<T>(params: {
--- a/src/infra/unhandled-rejections.ts
+++ b/src/infra/unhandled-rejections.ts
@ -1,5 +1,6 @@
 import process from "node:process";

+import { isAllModelsFailedError } from "../agents/model-fallback-error.js";
 import { extractErrorCode, formatUncaughtError } from "./errors.js";

 type UnhandledRejectionHandler = (reason: unknown) => boolean;
@ -134,6 +135,22 @@ export function installUnhandledRejectionHandler(): void {
      return;
    }

+    // Handle AllModelsFailedError - don't crash on cooldown
+    if (isAllModelsFailedError(reason)) {
+      if (reason.allInCooldown) {
+        const mins = reason.retryAfterMs ? Math.round(reason.retryAfterMs / 60000) : "unknown";
+        console.warn(
+          `[openclaw] All models in cooldown - gateway continuing. ` +
+            `Retry after ${mins}min. ` +
+            `Providers: ${reason.attempts.map((a) => a.provider).join(", ")}`,
+        );
+        return; // Don't exit
+      }
+      // Mixed failures (not all cooldown) - log but don't crash
+      console.warn("[openclaw] All models failed (mixed reasons):", formatUncaughtError(reason));
+      return;
+    }
+
    if (isFatalError(reason)) {
      console.error("[openclaw] FATAL unhandled rejection:", formatUncaughtError(reason));
      process.exit(1);