diff --git a/extensions/voice-call/src/media-stream.ts b/extensions/voice-call/src/media-stream.ts index 252b6b331..b2a7e6d65 100644 --- a/extensions/voice-call/src/media-stream.ts +++ b/extensions/voice-call/src/media-stream.ts @@ -51,6 +51,11 @@ export class MediaStreamHandler { private sessions = new Map(); private config: MediaStreamConfig; + /** TTS playback queues per stream (serialize audio to prevent overlap) */ + private ttsQueues = new Map Promise>>(); + /** Whether TTS is currently playing per stream */ + private ttsPlaying = new Map(); + constructor(config: MediaStreamConfig) { this.config = config; } @@ -228,6 +233,51 @@ export class MediaStreamHandler { this.sendToStream(streamSid, { event: "clear", streamSid }); } + /** + * Queue a TTS operation for sequential playback. + * Only one TTS operation plays at a time per stream to prevent overlap. + */ + async queueTts(streamSid: string, playFn: () => Promise): Promise { + if (!this.ttsQueues.has(streamSid)) { + this.ttsQueues.set(streamSid, []); + } + const queue = this.ttsQueues.get(streamSid)!; + queue.push(playFn); + + // Process queue if not already playing + if (!this.ttsPlaying.get(streamSid)) { + await this.processQueue(streamSid); + } + } + + /** + * Process the TTS queue for a stream. + */ + private async processQueue(streamSid: string): Promise { + const queue = this.ttsQueues.get(streamSid); + if (!queue || queue.length === 0) { + this.ttsPlaying.set(streamSid, false); + return; + } + + this.ttsPlaying.set(streamSid, true); + const playFn = queue.shift()!; + + try { + await playFn(); + } finally { + await this.processQueue(streamSid); + } + } + + /** + * Clear TTS queue and interrupt current playback (barge-in). + */ + clearTtsQueue(streamSid: string): void { + this.ttsQueues.set(streamSid, []); + this.clearAudio(streamSid); + } + /** * Get active session by call ID. */ diff --git a/extensions/voice-call/src/providers/twilio.ts b/extensions/voice-call/src/providers/twilio.ts index 8e400f82f..d77b29f6c 100644 --- a/extensions/voice-call/src/providers/twilio.ts +++ b/extensions/voice-call/src/providers/twilio.ts @@ -135,6 +135,17 @@ export class TwilioProvider implements VoiceCallProvider { this.callStreamMap.delete(callSid); } + /** + * Clear TTS queue for a call (barge-in). + * Used when user starts speaking to interrupt current TTS playback. + */ + clearTtsQueue(callSid: string): void { + const streamSid = this.callStreamMap.get(callSid); + if (streamSid && this.mediaStreamHandler) { + this.mediaStreamHandler.clearTtsQueue(streamSid); + } + } + /** * Make an authenticated request to the Twilio API. */ @@ -504,7 +515,7 @@ export class TwilioProvider implements VoiceCallProvider { /** * Play TTS via core TTS and Twilio Media Streams. * Generates audio with core TTS, converts to mu-law, and streams via WebSocket. - * Uses a jitter buffer to smooth out timing variations. + * Uses a queue to serialize playback and prevent overlapping audio. */ private async playTtsViaStream( text: string, @@ -514,22 +525,30 @@ export class TwilioProvider implements VoiceCallProvider { throw new Error("TTS provider and media stream handler required"); } - // Generate audio with core TTS (returns mu-law at 8kHz) - const muLawAudio = await this.ttsProvider.synthesizeForTelephony(text); - // Stream audio in 20ms chunks (160 bytes at 8kHz mu-law) const CHUNK_SIZE = 160; const CHUNK_DELAY_MS = 20; - for (const chunk of chunkAudio(muLawAudio, CHUNK_SIZE)) { - this.mediaStreamHandler.sendAudio(streamSid, chunk); + const handler = this.mediaStreamHandler; + const ttsProvider = this.ttsProvider; + await handler.queueTts(streamSid, async (signal) => { + // Generate audio with core TTS (returns mu-law at 8kHz) + const muLawAudio = await ttsProvider.synthesizeForTelephony(text); - // Pace the audio to match real-time playback - await new Promise((resolve) => setTimeout(resolve, CHUNK_DELAY_MS)); - } + for (const chunk of chunkAudio(muLawAudio, CHUNK_SIZE)) { + if (signal.aborted) break; + handler.sendAudio(streamSid, chunk); - // Send a mark to track when audio finishes - this.mediaStreamHandler.sendMark(streamSid, `tts-${Date.now()}`); + // Pace the audio to match real-time playback + await new Promise((resolve) => setTimeout(resolve, CHUNK_DELAY_MS)); + if (signal.aborted) break; + } + + if (!signal.aborted) { + // Send a mark to track when audio finishes + handler.sendMark(streamSid, `tts-${Date.now()}`); + } + }); } /** diff --git a/extensions/voice-call/src/webhook.ts b/extensions/voice-call/src/webhook.ts index c69436d77..56e104ff8 100644 --- a/extensions/voice-call/src/webhook.ts +++ b/extensions/voice-call/src/webhook.ts @@ -78,6 +78,11 @@ export class VoiceCallWebhookServer { `[voice-call] Transcript for ${providerCallId}: ${transcript}`, ); + // Clear TTS queue on barge-in (user started speaking, interrupt current playback) + if (this.provider.name === "twilio") { + (this.provider as TwilioProvider).clearTtsQueue(providerCallId); + } + // Look up our internal call ID from the provider call ID const call = this.manager.getCallByProviderCallId(providerCallId); if (!call) {