fix(litellm): complete prompt caching support for Claude models

This commit completes the LiteLLM prompt caching implementation by:

1. Applying cacheControlTtl defaults to litellm/claude-* models in addition to anthropic/* models. Previously, only direct Anthropic models received the default 1h cache TTL, so LiteLLM Claude models skipped caching.
2. Setting api: "anthropic-messages" for Claude models during onboarding. LiteLLM was using the openai-completions API, which doesn't support Anthropic's cache control headers; the anthropic-messages API is required for prompt caching to work.

Result: roughly 90% cost reduction for LiteLLM Claude usage (from $0.47 to $0.05 per message with ~94K-token conversations), matching direct Anthropic API costs.

Co-Authored-By: Claude (claude-sonnet-4-5) <noreply@anthropic.com>
2026-01-29 00:26:44 +01:00 · 2026-01-29 00:26:44 +01:00 · 2e0dd2ce72
commit 2e0dd2ce72
parent 920fe168de
2 changed files with 28 additions and 12 deletions
--- a/src/commands/onboard-auth.config-core.ts
+++ b/src/commands/onboard-auth.config-core.ts
@ -436,9 +436,13 @@ export function applyLitellmProviderConfig(
  const providers = { ...cfg.models?.providers };
  const existingProvider = providers.litellm;
  const existingModels = Array.isArray(existingProvider?.models) ? existingProvider.models : [];
  // Detect Claude models and use Anthropic Messages API for proper cache control support
  const isClaude = params.modelId.toLowerCase().startsWith("claude-");
  const newModel = {
    id: params.modelId,
    name: params.modelName ?? params.modelId,
    // Claude models through LiteLLM should use anthropic-messages API for cache control
    ...(isClaude ? { api: "anthropic-messages" as const } : {}),
    reasoning: false,
    input: ["text"] as ("text" | "image")[],
    cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
--- a/src/config/defaults.ts
+++ b/src/config/defaults.ts
@ -323,7 +323,12 @@ export function applyContextPruningDefaults(cfg: MoltbotConfig): MoltbotConfig {
    for (const [key, entry] of Object.entries(nextModels)) {
      const parsed = parseModelRef(key, "anthropic");
-      if (!parsed || parsed.provider !== "anthropic") continue;
+      if (!parsed) continue;
      // Apply cache control to Anthropic models and LiteLLM Claude models
      const isAnthropicProvider = parsed.provider === "anthropic";
      const isLitellmClaude =
        parsed.provider === "litellm" && parsed.model.toLowerCase().startsWith("claude-");
      if (!isAnthropicProvider && !isLitellmClaude) continue;
      const current = entry ?? {};
      const params = (current as { params?: Record<string, unknown> }).params ?? {};
      if (typeof params.cacheControlTtl === "string") continue;
@ -337,7 +342,13 @@ export function applyContextPruningDefaults(cfg: MoltbotConfig): MoltbotConfig {
    const primary = resolvePrimaryModelRef(defaults.model?.primary ?? undefined);
    if (primary) {
      const parsedPrimary = parseModelRef(primary, "anthropic");
-      if (parsedPrimary?.provider === "anthropic") {
+      if (parsedPrimary) {
        // Apply cache control to Anthropic models and LiteLLM Claude models
        const isAnthropicProvider = parsedPrimary.provider === "anthropic";
        const isLitellmClaude =
          parsedPrimary.provider === "litellm" &&
          parsedPrimary.model.toLowerCase().startsWith("claude-");
        if (isAnthropicProvider || isLitellmClaude) {
          const key = `${parsedPrimary.provider}/${parsedPrimary.model}`;
          const entry = nextModels[key];
          const current = entry ?? {};
@ -351,6 +362,7 @@ export function applyContextPruningDefaults(cfg: MoltbotConfig): MoltbotConfig {
          }
        }
      }
    }
    if (modelsMutated) {
      nextDefaults.models = nextModels;