fix: prevent gateway crash when all auth profiles are in cooldown
- Add AllModelsFailedError class with cooldown detection.
- Modify runWithModelFallback() to throw a typed error with retry timing.
- Add a handler in unhandled-rejections.ts to log a warning instead of exiting.

Fixes #2811
This commit is contained in:
parent
9025da2296
commit
244ebc1b56
@ -72,6 +72,7 @@ Status: stable.
|
||||
- **BREAKING:** Gateway auth mode "none" is removed; gateway now requires token/password (Tailscale Serve identity still allowed).
|
||||
|
||||
### Fixes
|
||||
- Gateway: prevent crash when all auth profiles are in cooldown; log warning and continue. (#2811)
|
||||
- Telegram: avoid silent empty replies by tracking normalization skips before fallback. (#3796)
|
||||
- Telegram: scope native skill commands to bound agent per bot. (#4360) Thanks @robhparker.
|
||||
- Mentions: honor mentionPatterns even when explicit mentions are present. (#3303) Thanks @HirokiKobayashi-R.
|
||||
|
||||
91
src/agents/model-fallback-error.test.ts
Normal file
91
src/agents/model-fallback-error.test.ts
Normal file
@ -0,0 +1,91 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { AllModelsFailedError, isAllModelsFailedError } from "./model-fallback-error.js";
|
||||
|
||||
type Attempt = AllModelsFailedError["attempts"][number];

// Suite for AllModelsFailedError: construction, cooldown classification,
// cause chaining, attempt bookkeeping, and the isAllModelsFailedError guard.
describe("AllModelsFailedError", () => {
  // Builds a rate-limited Anthropic attempt; individual fields can be overridden.
  const rateLimitedAttempt = (overrides: Partial<Attempt> = {}): Attempt => ({
    provider: "anthropic",
    model: "c",
    error: "cooldown",
    reason: "rate_limit",
    ...overrides,
  });

  // Builds an auth-failure OpenAI attempt; individual fields can be overridden.
  const authAttempt = (overrides: Partial<Attempt> = {}): Attempt => ({
    provider: "openai",
    model: "gpt-4",
    error: "auth",
    reason: "auth",
    ...overrides,
  });

  it("creates error with cooldown-only flag", () => {
    const failure = new AllModelsFailedError("All models failed", {
      attempts: [rateLimitedAttempt({ model: "claude-3-5" })],
      allInCooldown: true,
      retryAfterMs: 300000,
    });

    expect(failure.name).toBe("AllModelsFailedError");
    expect(failure.allInCooldown).toBe(true);
    expect(failure.isCooldownOnly()).toBe(true);
    expect(isAllModelsFailedError(failure)).toBe(true);
    expect(failure.retryAfterMs).toBe(300000);
  });

  it("distinguishes mixed failures", () => {
    const failure = new AllModelsFailedError("msg", {
      attempts: [rateLimitedAttempt(), authAttempt()],
      allInCooldown: false,
    });

    expect(failure.isCooldownOnly()).toBe(false);
    expect(failure.allInCooldown).toBe(false);
  });

  it("returns false for isCooldownOnly when no attempts", () => {
    // The flag alone is not enough: with zero attempts there is nothing in cooldown.
    const failure = new AllModelsFailedError("msg", {
      attempts: [],
      allInCooldown: true,
    });

    expect(failure.isCooldownOnly()).toBe(false);
  });

  it("preserves cause in error chain", () => {
    const rootCause = new Error("Original error");
    const failure = new AllModelsFailedError("All models failed", {
      attempts: [rateLimitedAttempt()],
      allInCooldown: true,
      cause: rootCause,
    });

    expect(failure.cause).toBe(rootCause);
  });

  it("includes all attempt details", () => {
    const recorded = [
      rateLimitedAttempt({ status: 429, code: "rate_limit" }),
      authAttempt({ status: 401 }),
    ];
    const failure = new AllModelsFailedError("msg", {
      attempts: recorded,
      allInCooldown: false,
    });

    expect(failure.attempts).toEqual(recorded);
  });

  it("type guard works correctly", () => {
    const failure = new AllModelsFailedError("msg", {
      attempts: [rateLimitedAttempt({ error: "c" })],
      allInCooldown: true,
    });
    expect(isAllModelsFailedError(failure)).toBe(true);

    // Plain errors and non-error values must all be rejected by the guard.
    const nonMatches: unknown[] = [new Error("regular"), null, undefined, "string"];
    for (const candidate of nonMatches) {
      expect(isAllModelsFailedError(candidate)).toBe(false);
    }
  });
});
|
||||
38
src/agents/model-fallback-error.ts
Normal file
38
src/agents/model-fallback-error.ts
Normal file
@ -0,0 +1,38 @@
|
||||
import type { FailoverReason } from "./pi-embedded-helpers/types.js";
|
||||
|
||||
/**
 * Error thrown when every model candidate in a fallback run has failed.
 *
 * Besides the message, it carries the per-attempt failure records and a flag
 * telling whether the caller classified all failures as cooldowns, so handlers
 * can react to "temporarily unavailable" differently from other failures.
 */
export class AllModelsFailedError extends Error {
  /** Failure record for each attempted provider/model, exactly as passed in. */
  readonly attempts: Array<{
    provider: string;
    model: string;
    error: string;
    reason?: FailoverReason;
    status?: number;
    code?: string;
  }>;

  /** Caller-supplied flag: true when all failures were classified as cooldowns. */
  readonly allInCooldown: boolean;

  /** Milliseconds until a retry may succeed, when the caller could determine it. */
  readonly retryAfterMs?: number;

  /**
   * @param message - Human-readable summary of the failure.
   * @param params - Attempt records, the cooldown flag, an optional retry
   *   delay in milliseconds, and an optional underlying cause.
   */
  constructor(
    message: string,
    params: {
      attempts: AllModelsFailedError["attempts"];
      allInCooldown: boolean;
      retryAfterMs?: number;
      cause?: unknown;
    },
  ) {
    // Forward `cause` only when one was supplied. Passing `{ cause: undefined }`
    // still installs an own `cause` property (the Error constructor checks for
    // the property's presence, not its value), which is misleading in dumps.
    super(message, params.cause === undefined ? undefined : { cause: params.cause });
    this.name = "AllModelsFailedError";
    this.attempts = params.attempts;
    this.allInCooldown = params.allInCooldown;
    this.retryAfterMs = params.retryAfterMs;
  }

  /**
   * Reports whether this failure is purely a cooldown situation.
   *
   * @returns `true` only when `allInCooldown` is set and at least one attempt
   *   was recorded; an empty attempt list never counts as cooldown-only.
   */
  isCooldownOnly(): boolean {
    return this.allInCooldown && this.attempts.length > 0;
  }
}
|
||||
|
||||
/**
 * Type guard for {@link AllModelsFailedError}.
 *
 * @param err - Any caught or rejected value.
 * @returns `true` when `err` is an instance of `AllModelsFailedError`.
 */
export function isAllModelsFailedError(err: unknown): err is AllModelsFailedError {
  const isMatch = err instanceof AllModelsFailedError;
  return isMatch;
}
|
||||
@ -1,4 +1,9 @@
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import {
|
||||
ensureAuthProfileStore,
|
||||
isProfileInCooldown,
|
||||
resolveAuthProfileOrder,
|
||||
} from "./auth-profiles.js";
|
||||
import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js";
|
||||
import {
|
||||
coerceToFailoverError,
|
||||
@ -6,6 +11,7 @@ import {
|
||||
isFailoverError,
|
||||
isTimeoutError,
|
||||
} from "./failover-error.js";
|
||||
import { AllModelsFailedError } from "./model-fallback-error.js";
|
||||
import {
|
||||
buildModelAliasIndex,
|
||||
modelKey,
|
||||
@ -14,11 +20,6 @@ import {
|
||||
resolveModelRefFromString,
|
||||
} from "./model-selection.js";
|
||||
import type { FailoverReason } from "./pi-embedded-helpers.js";
|
||||
import {
|
||||
ensureAuthProfileStore,
|
||||
isProfileInCooldown,
|
||||
resolveAuthProfileOrder,
|
||||
} from "./auth-profiles.js";
|
||||
|
||||
type ModelCandidate = {
|
||||
provider: string;
|
||||
@ -293,9 +294,46 @@ export async function runWithModelFallback<T>(params: {
|
||||
)
|
||||
.join(" | ")
|
||||
: "unknown";
|
||||
throw new Error(`All models failed (${attempts.length || candidates.length}): ${summary}`, {
|
||||
cause: lastError instanceof Error ? lastError : undefined,
|
||||
});
|
||||
|
||||
// Determine if all failures are due to cooldown (rate_limit)
|
||||
const allCooldown = attempts.length > 0 && attempts.every((a) => a.reason === "rate_limit");
|
||||
|
||||
// Calculate earliest retry time if all in cooldown
|
||||
let retryAfterMs: number | undefined;
|
||||
if (allCooldown && authStore) {
|
||||
const profileIds = new Set<string>();
|
||||
for (const candidate of candidates) {
|
||||
const profiles = resolveAuthProfileOrder({
|
||||
cfg: params.cfg,
|
||||
store: authStore,
|
||||
provider: candidate.provider,
|
||||
});
|
||||
profiles.forEach((id) => profileIds.add(id));
|
||||
}
|
||||
|
||||
let earliest: number | null = null;
|
||||
for (const id of profileIds) {
|
||||
const stats = authStore.usageStats?.[id];
|
||||
if (!stats) continue;
|
||||
const unusableUntil = Math.max(stats.cooldownUntil ?? 0, stats.disabledUntil ?? 0);
|
||||
if (unusableUntil > 0 && (earliest === null || unusableUntil < earliest)) {
|
||||
earliest = unusableUntil;
|
||||
}
|
||||
}
|
||||
if (earliest) {
|
||||
retryAfterMs = Math.max(0, earliest - Date.now());
|
||||
}
|
||||
}
|
||||
|
||||
throw new AllModelsFailedError(
|
||||
`All models failed (${attempts.length || candidates.length}): ${summary}`,
|
||||
{
|
||||
attempts,
|
||||
allInCooldown: allCooldown,
|
||||
retryAfterMs,
|
||||
cause: lastError instanceof Error ? lastError : undefined,
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
export async function runWithImageModelFallback<T>(params: {
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
import process from "node:process";
|
||||
|
||||
import { isAllModelsFailedError } from "../agents/model-fallback-error.js";
|
||||
import { extractErrorCode, formatUncaughtError } from "./errors.js";
|
||||
|
||||
type UnhandledRejectionHandler = (reason: unknown) => boolean;
|
||||
@ -134,6 +135,22 @@ export function installUnhandledRejectionHandler(): void {
|
||||
return;
|
||||
}
|
||||
|
||||
// Handle AllModelsFailedError - don't crash on cooldown
|
||||
if (isAllModelsFailedError(reason)) {
|
||||
if (reason.allInCooldown) {
|
||||
const mins = reason.retryAfterMs ? Math.round(reason.retryAfterMs / 60000) : "unknown";
|
||||
console.warn(
|
||||
`[openclaw] All models in cooldown - gateway continuing. ` +
|
||||
`Retry after ${mins}min. ` +
|
||||
`Providers: ${reason.attempts.map((a) => a.provider).join(", ")}`,
|
||||
);
|
||||
return; // Don't exit
|
||||
}
|
||||
// Mixed failures (not all cooldown) - log but don't crash
|
||||
console.warn("[openclaw] All models failed (mixed reasons):", formatUncaughtError(reason));
|
||||
return;
|
||||
}
|
||||
|
||||
if (isFatalError(reason)) {
|
||||
console.error("[openclaw] FATAL unhandled rejection:", formatUncaughtError(reason));
|
||||
process.exit(1);
|
||||
|
||||
Loading…
Reference in New Issue
Block a user