From 2e0dd2ce72d3401e2232a2a2471195b7696bf0bb Mon Sep 17 00:00:00 2001 From: Charles-Henri ROBICHE Date: Thu, 29 Jan 2026 00:26:44 +0100 Subject: [PATCH] fix(litellm): complete prompt caching support for Claude models This commit completes the LiteLLM prompt caching implementation by: 1. Applying cacheControlTtl defaults to litellm/claude-* models in addition to anthropic/* models. Previously, only direct Anthropic models received the default 1h cache TTL, causing LiteLLM Claude models to skip caching. 2. Setting api: "anthropic-messages" for Claude models during onboarding. LiteLLM was using openai-completions API which doesn't support Anthropic's cache control headers. The anthropic-messages API is required for proper prompt caching functionality. Result: 90% cost reduction for LiteLLM Claude usage (from $0.47 to $0.05 per message with ~94K token conversations), matching direct Anthropic API costs. Co-Authored-By: Claude (claude-sonnet-4-5) --- src/commands/onboard-auth.config-core.ts | 4 +++ src/config/defaults.ts | 36 ++++++++++++++++-------- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/src/commands/onboard-auth.config-core.ts b/src/commands/onboard-auth.config-core.ts index 2cb457ebb..6835d4156 100644 --- a/src/commands/onboard-auth.config-core.ts +++ b/src/commands/onboard-auth.config-core.ts @@ -436,9 +436,13 @@ export function applyLitellmProviderConfig( const providers = { ...cfg.models?.providers }; const existingProvider = providers.litellm; const existingModels = Array.isArray(existingProvider?.models) ? existingProvider.models : []; + // Detect Claude models and use Anthropic Messages API for proper cache control support + const isClaude = params.modelId.toLowerCase().startsWith("claude-"); const newModel = { id: params.modelId, name: params.modelName ?? params.modelId, + // Claude models through LiteLLM should use anthropic-messages API for cache control + ...(isClaude ? { api: "anthropic-messages" as const } : {}), reasoning: false, input: ["text"] as ("text" | "image")[], cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, diff --git a/src/config/defaults.ts b/src/config/defaults.ts index 82aada474..15644abe8 100644 --- a/src/config/defaults.ts +++ b/src/config/defaults.ts @@ -323,7 +323,12 @@ export function applyContextPruningDefaults(cfg: MoltbotConfig): MoltbotConfig { for (const [key, entry] of Object.entries(nextModels)) { const parsed = parseModelRef(key, "anthropic"); - if (!parsed || parsed.provider !== "anthropic") continue; + if (!parsed) continue; + // Apply cache control to Anthropic models and LiteLLM Claude models + const isAnthropicProvider = parsed.provider === "anthropic"; + const isLitellmClaude = + parsed.provider === "litellm" && parsed.model.toLowerCase().startsWith("claude-"); + if (!isAnthropicProvider && !isLitellmClaude) continue; const current = entry ?? {}; const params = (current as { params?: Record }).params ?? {}; if (typeof params.cacheControlTtl === "string") continue; @@ -337,17 +342,24 @@ export function applyContextPruningDefaults(cfg: MoltbotConfig): MoltbotConfig { const primary = resolvePrimaryModelRef(defaults.model?.primary ?? undefined); if (primary) { const parsedPrimary = parseModelRef(primary, "anthropic"); - if (parsedPrimary?.provider === "anthropic") { - const key = `${parsedPrimary.provider}/${parsedPrimary.model}`; - const entry = nextModels[key]; - const current = entry ?? {}; - const params = (current as { params?: Record }).params ?? {}; - if (typeof params.cacheControlTtl !== "string") { - nextModels[key] = { - ...(current as Record), - params: { ...params, cacheControlTtl: "1h" }, - }; - modelsMutated = true; + if (parsedPrimary) { + // Apply cache control to Anthropic models and LiteLLM Claude models + const isAnthropicProvider = parsedPrimary.provider === "anthropic"; + const isLitellmClaude = + parsedPrimary.provider === "litellm" && + parsedPrimary.model.toLowerCase().startsWith("claude-"); + if (isAnthropicProvider || isLitellmClaude) { + const key = `${parsedPrimary.provider}/${parsedPrimary.model}`; + const entry = nextModels[key]; + const current = entry ?? {}; + const params = (current as { params?: Record }).params ?? {}; + if (typeof params.cacheControlTtl !== "string") { + nextModels[key] = { + ...(current as Record), + params: { ...params, cacheControlTtl: "1h" }, + }; + modelsMutated = true; + } } } }