fix(voice-call): prevent audio overlap with TTS queue

Add a TTS queue to serialize audio playback and prevent overlapping speech during voice calls. Previously, concurrent speak() calls could send audio chunks simultaneously, causing garbled/choppy output. Changes: - Add queueTts() to MediaStreamHandler for sequential TTS playback - Wrap playTtsViaStream() audio sending in the queue - Clear queue on barge-in (when user starts speaking) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-25 01:36:24 -05:00 · 2026-01-25 01:36:24 -05:00 · 76014685eb
commit 76014685eb
parent 875b018ea1
3 changed files with 85 additions and 11 deletions
--- a/extensions/voice-call/src/media-stream.ts
+++ b/extensions/voice-call/src/media-stream.ts
@ -51,6 +51,11 @@ export class MediaStreamHandler {
  private sessions = new Map<string, StreamSession>();
  private config: MediaStreamConfig;

+  /** TTS playback queues per stream (serialize audio to prevent overlap) */
+  private ttsQueues = new Map<string, Array<() => Promise<void>>>();
+  /** Whether TTS is currently playing per stream */
+  private ttsPlaying = new Map<string, boolean>();
+
  constructor(config: MediaStreamConfig) {
    this.config = config;
  }
@ -228,6 +233,51 @@ export class MediaStreamHandler {
    this.sendToStream(streamSid, { event: "clear", streamSid });
  }

+  /**
+   * Queue a TTS operation for sequential playback.
+   * Only one TTS operation plays at a time per stream to prevent overlap.
+   */
+  async queueTts(streamSid: string, playFn: () => Promise<void>): Promise<void> {
+    if (!this.ttsQueues.has(streamSid)) {
+      this.ttsQueues.set(streamSid, []);
+    }
+    const queue = this.ttsQueues.get(streamSid)!;
+    queue.push(playFn);
+
+    // Process queue if not already playing
+    if (!this.ttsPlaying.get(streamSid)) {
+      await this.processQueue(streamSid);
+    }
+  }
+
+  /**
+   * Process the TTS queue for a stream.
+   */
+  private async processQueue(streamSid: string): Promise<void> {
+    const queue = this.ttsQueues.get(streamSid);
+    if (!queue || queue.length === 0) {
+      this.ttsPlaying.set(streamSid, false);
+      return;
+    }
+
+    this.ttsPlaying.set(streamSid, true);
+    const playFn = queue.shift()!;
+
+    try {
+      await playFn();
+    } finally {
+      await this.processQueue(streamSid);
+    }
+  }
+
+  /**
+   * Clear TTS queue and interrupt current playback (barge-in).
+   */
+  clearTtsQueue(streamSid: string): void {
+    this.ttsQueues.set(streamSid, []);
+    this.clearAudio(streamSid);
+  }
+
  /**
   * Get active session by call ID.
   */
--- a/extensions/voice-call/src/providers/twilio.ts
+++ b/extensions/voice-call/src/providers/twilio.ts
@ -135,6 +135,17 @@ export class TwilioProvider implements VoiceCallProvider {
    this.callStreamMap.delete(callSid);
  }

+  /**
+   * Clear TTS queue for a call (barge-in).
+   * Used when user starts speaking to interrupt current TTS playback.
+   */
+  clearTtsQueue(callSid: string): void {
+    const streamSid = this.callStreamMap.get(callSid);
+    if (streamSid && this.mediaStreamHandler) {
+      this.mediaStreamHandler.clearTtsQueue(streamSid);
+    }
+  }
+
  /**
   * Make an authenticated request to the Twilio API.
   */
@ -504,7 +515,7 @@ export class TwilioProvider implements VoiceCallProvider {
  /**
   * Play TTS via core TTS and Twilio Media Streams.
   * Generates audio with core TTS, converts to mu-law, and streams via WebSocket.
-   * Uses a jitter buffer to smooth out timing variations.
+   * Uses a queue to serialize playback and prevent overlapping audio.
   */
  private async playTtsViaStream(
    text: string,
@ -514,22 +525,30 @@ export class TwilioProvider implements VoiceCallProvider {
      throw new Error("TTS provider and media stream handler required");
    }

-    // Generate audio with core TTS (returns mu-law at 8kHz)
-    const muLawAudio = await this.ttsProvider.synthesizeForTelephony(text);
-
    // Stream audio in 20ms chunks (160 bytes at 8kHz mu-law)
    const CHUNK_SIZE = 160;
    const CHUNK_DELAY_MS = 20;

-    for (const chunk of chunkAudio(muLawAudio, CHUNK_SIZE)) {
-      this.mediaStreamHandler.sendAudio(streamSid, chunk);
+    const handler = this.mediaStreamHandler;
+    const ttsProvider = this.ttsProvider;
+    await handler.queueTts(streamSid, async (signal) => {
+      // Generate audio with core TTS (returns mu-law at 8kHz)
+      const muLawAudio = await ttsProvider.synthesizeForTelephony(text);

-      // Pace the audio to match real-time playback
-      await new Promise((resolve) => setTimeout(resolve, CHUNK_DELAY_MS));
-    }
+      for (const chunk of chunkAudio(muLawAudio, CHUNK_SIZE)) {
+        if (signal.aborted) break;
+        handler.sendAudio(streamSid, chunk);

-    // Send a mark to track when audio finishes
-    this.mediaStreamHandler.sendMark(streamSid, `tts-${Date.now()}`);
+        // Pace the audio to match real-time playback
+        await new Promise((resolve) => setTimeout(resolve, CHUNK_DELAY_MS));
+        if (signal.aborted) break;
+      }
+
+      if (!signal.aborted) {
+        // Send a mark to track when audio finishes
+        handler.sendMark(streamSid, `tts-${Date.now()}`);
+      }
+    });
  }

  /**
--- a/extensions/voice-call/src/webhook.ts
+++ b/extensions/voice-call/src/webhook.ts
@ -78,6 +78,11 @@ export class VoiceCallWebhookServer {
          `[voice-call] Transcript for ${providerCallId}: ${transcript}`,
        );

+        // Clear TTS queue on barge-in (user started speaking, interrupt current playback)
+        if (this.provider.name === "twilio") {
+          (this.provider as TwilioProvider).clearTtsQueue(providerCallId);
+        }
+
        // Look up our internal call ID from the provider call ID
        const call = this.manager.getCallByProviderCallId(providerCallId);
        if (!call) {