fix(voice-call): prevent audio overlap with TTS queue

Add a TTS queue to serialize audio playback and prevent overlapping
speech during voice calls. Previously, concurrent speak() calls could
send audio chunks simultaneously, causing garbled/choppy output.

Changes:
- Add queueTts() to MediaStreamHandler for sequential TTS playback
- Wrap playTtsViaStream() audio sending in the queue
- Clear queue on barge-in (when user starts speaking)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Dan Guido 2026-01-25 01:36:24 -05:00 committed by Peter Steinberger
parent 875b018ea1
commit 76014685eb
3 changed files with 85 additions and 11 deletions

View File

@ -51,6 +51,11 @@ export class MediaStreamHandler {
private sessions = new Map<string, StreamSession>(); private sessions = new Map<string, StreamSession>();
private config: MediaStreamConfig; private config: MediaStreamConfig;
/** TTS playback queues per stream (serialize audio to prevent overlap) */
private ttsQueues = new Map<string, Array<() => Promise<void>>>();
/** Whether TTS is currently playing per stream */
private ttsPlaying = new Map<string, boolean>();
constructor(config: MediaStreamConfig) { constructor(config: MediaStreamConfig) {
this.config = config; this.config = config;
} }
@ -228,6 +233,51 @@ export class MediaStreamHandler {
this.sendToStream(streamSid, { event: "clear", streamSid }); this.sendToStream(streamSid, { event: "clear", streamSid });
} }
/** Abort controller shared by the TTS operations queued on a stream since its last barge-in */
private ttsAbortControllers = new Map<string, AbortController>();

/**
 * Queue a TTS operation for sequential playback.
 * Only one TTS operation plays at a time per stream to prevent overlap.
 *
 * @param streamSid - Stream whose playback queue the operation joins.
 * @param playFn - Performs the playback. It receives an AbortSignal that is
 *   aborted when clearTtsQueue() is called for the stream; it should stop
 *   sending audio once the signal is aborted.
 * @returns A promise that settles when THIS operation is done: it resolves
 *   after playFn completes or after the operation was cancelled by a
 *   barge-in, and rejects with playFn's own error otherwise.
 */
async queueTts(
  streamSid: string,
  playFn: (signal: AbortSignal) => Promise<void>,
): Promise<void> {
  const queue = this.ttsQueues.get(streamSid) ?? [];
  this.ttsQueues.set(streamSid, queue);

  const controller = this.ttsAbortControllers.get(streamSid) ?? new AbortController();
  this.ttsAbortControllers.set(streamSid, controller);
  const { signal } = controller;

  const settled = new Promise<void>((resolve, reject) => {
    // The queued task settles this caller's promise itself and never rejects,
    // so one failing operation cannot stop the queue or surface in the
    // wrong caller.
    queue.push(async () => {
      if (signal.aborted) {
        // Cancelled by barge-in before it started: release the caller
        // without playing anything.
        resolve();
        return;
      }
      try {
        await playFn(signal);
        resolve();
      } catch (error: unknown) {
        // Errors raised while winding down an interrupted playback are
        // treated as a normal cancellation.
        if (signal.aborted) {
          resolve();
        } else {
          reject(error);
        }
      }
    });
  });

  // Start draining only if nothing is playing; otherwise the running drain
  // loop picks the new task up when it reaches it.
  if (!this.ttsPlaying.get(streamSid)) {
    void this.processQueue(streamSid);
  }

  await settled;
}

/**
 * Process the TTS queue for a stream.
 * Runs queued tasks one after another until the queue is empty, then drops
 * the per-stream bookkeeping so the maps do not keep an entry for a stream
 * that has nothing left to play.
 */
private async processQueue(streamSid: string): Promise<void> {
  this.ttsPlaying.set(streamSid, true);

  // Re-read the queue from the map on every iteration: tasks can be added
  // or removed while the current one is playing.
  let next = this.ttsQueues.get(streamSid)?.shift();
  while (next) {
    await next();
    next = this.ttsQueues.get(streamSid)?.shift();
  }

  this.ttsPlaying.delete(streamSid);
  this.ttsQueues.delete(streamSid);
  this.ttsAbortControllers.delete(streamSid);
}

/**
 * Clear TTS queue and interrupt current playback (barge-in).
 * Aborts the signal handed to the playing and pending operations, releases
 * the callers of the pending ones, and asks the stream to drop buffered audio.
 */
clearTtsQueue(streamSid: string): void {
  const controller = this.ttsAbortControllers.get(streamSid);
  if (controller) {
    // Forget the controller first so operations queued after this barge-in
    // get a fresh, un-aborted signal.
    this.ttsAbortControllers.delete(streamSid);
    controller.abort();
  }

  // Empty the pending queue in place. Each removed task sees its aborted
  // signal, resolves its caller's promise and returns without playing.
  const dropped = this.ttsQueues.get(streamSid)?.splice(0) ?? [];
  for (const release of dropped) {
    void release();
  }

  this.clearAudio(streamSid);
}
/** /**
* Get active session by call ID. * Get active session by call ID.
*/ */

View File

@ -135,6 +135,17 @@ export class TwilioProvider implements VoiceCallProvider {
this.callStreamMap.delete(callSid); this.callStreamMap.delete(callSid);
} }
/**
 * Barge-in hook for a call: discards the TTS queued for it.
 * Looks up the media stream mapped to the call and forwards the clear
 * request to the media stream handler. Does nothing when the call has no
 * mapped stream or no handler is set.
 */
clearTtsQueue(callSid: string): void {
  const mappedStreamSid = this.callStreamMap.get(callSid);
  if (!mappedStreamSid) {
    return;
  }

  const handler = this.mediaStreamHandler;
  if (!handler) {
    return;
  }

  handler.clearTtsQueue(mappedStreamSid);
}
/** /**
* Make an authenticated request to the Twilio API. * Make an authenticated request to the Twilio API.
*/ */
@ -504,7 +515,7 @@ export class TwilioProvider implements VoiceCallProvider {
/** /**
* Play TTS via core TTS and Twilio Media Streams. * Play TTS via core TTS and Twilio Media Streams.
* Generates audio with core TTS, converts to mu-law, and streams via WebSocket. * Generates audio with core TTS, converts to mu-law, and streams via WebSocket.
* Uses a jitter buffer to smooth out timing variations. * Uses a queue to serialize playback and prevent overlapping audio.
*/ */
private async playTtsViaStream( private async playTtsViaStream(
text: string, text: string,
@ -514,22 +525,30 @@ export class TwilioProvider implements VoiceCallProvider {
throw new Error("TTS provider and media stream handler required"); throw new Error("TTS provider and media stream handler required");
} }
// Generate audio with core TTS (returns mu-law at 8kHz)
const muLawAudio = await this.ttsProvider.synthesizeForTelephony(text);
// Stream audio in 20ms chunks (160 bytes at 8kHz mu-law) // Stream audio in 20ms chunks (160 bytes at 8kHz mu-law)
const CHUNK_SIZE = 160; const CHUNK_SIZE = 160;
const CHUNK_DELAY_MS = 20; const CHUNK_DELAY_MS = 20;
for (const chunk of chunkAudio(muLawAudio, CHUNK_SIZE)) { const handler = this.mediaStreamHandler;
this.mediaStreamHandler.sendAudio(streamSid, chunk); const ttsProvider = this.ttsProvider;
await handler.queueTts(streamSid, async (signal) => {
// Generate audio with core TTS (returns mu-law at 8kHz)
const muLawAudio = await ttsProvider.synthesizeForTelephony(text);
// Pace the audio to match real-time playback for (const chunk of chunkAudio(muLawAudio, CHUNK_SIZE)) {
await new Promise((resolve) => setTimeout(resolve, CHUNK_DELAY_MS)); if (signal.aborted) break;
} handler.sendAudio(streamSid, chunk);
// Send a mark to track when audio finishes // Pace the audio to match real-time playback
this.mediaStreamHandler.sendMark(streamSid, `tts-${Date.now()}`); await new Promise((resolve) => setTimeout(resolve, CHUNK_DELAY_MS));
if (signal.aborted) break;
}
if (!signal.aborted) {
// Send a mark to track when audio finishes
handler.sendMark(streamSid, `tts-${Date.now()}`);
}
});
} }
/** /**

View File

@ -78,6 +78,11 @@ export class VoiceCallWebhookServer {
`[voice-call] Transcript for ${providerCallId}: ${transcript}`, `[voice-call] Transcript for ${providerCallId}: ${transcript}`,
); );
// Clear TTS queue on barge-in (user started speaking, interrupt current playback)
if (this.provider.name === "twilio") {
(this.provider as TwilioProvider).clearTtsQueue(providerCallId);
}
// Look up our internal call ID from the provider call ID // Look up our internal call ID from the provider call ID
const call = this.manager.getCallByProviderCallId(providerCallId); const call = this.manager.getCallByProviderCallId(providerCallId);
if (!call) { if (!call) {