fix(voice-call): prevent audio overlap with TTS queue
Add a TTS queue to serialize audio playback and prevent overlapping speech during voice calls. Previously, concurrent speak() calls could send audio chunks simultaneously, causing garbled/choppy output. Changes: - Add queueTts() to MediaStreamHandler for sequential TTS playback - Wrap playTtsViaStream() audio sending in the queue - Clear queue on barge-in (when user starts speaking) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
875b018ea1
commit
76014685eb
@ -51,6 +51,11 @@ export class MediaStreamHandler {
|
|||||||
private sessions = new Map<string, StreamSession>();
|
private sessions = new Map<string, StreamSession>();
|
||||||
private config: MediaStreamConfig;
|
private config: MediaStreamConfig;
|
||||||
|
|
||||||
|
/** TTS playback queues per stream (serialize audio to prevent overlap) */
|
||||||
|
private ttsQueues = new Map<string, Array<() => Promise<void>>>();
|
||||||
|
/** Whether TTS is currently playing per stream */
|
||||||
|
private ttsPlaying = new Map<string, boolean>();
|
||||||
|
|
||||||
constructor(config: MediaStreamConfig) {
|
constructor(config: MediaStreamConfig) {
|
||||||
this.config = config;
|
this.config = config;
|
||||||
}
|
}
|
||||||
@ -228,6 +233,51 @@ export class MediaStreamHandler {
|
|||||||
this.sendToStream(streamSid, { event: "clear", streamSid });
|
this.sendToStream(streamSid, { event: "clear", streamSid });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Queue a TTS operation for sequential playback.
|
||||||
|
* Only one TTS operation plays at a time per stream to prevent overlap.
|
||||||
|
*/
|
||||||
|
async queueTts(streamSid: string, playFn: () => Promise<void>): Promise<void> {
|
||||||
|
if (!this.ttsQueues.has(streamSid)) {
|
||||||
|
this.ttsQueues.set(streamSid, []);
|
||||||
|
}
|
||||||
|
const queue = this.ttsQueues.get(streamSid)!;
|
||||||
|
queue.push(playFn);
|
||||||
|
|
||||||
|
// Process queue if not already playing
|
||||||
|
if (!this.ttsPlaying.get(streamSid)) {
|
||||||
|
await this.processQueue(streamSid);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Process the TTS queue for a stream.
|
||||||
|
*/
|
||||||
|
private async processQueue(streamSid: string): Promise<void> {
|
||||||
|
const queue = this.ttsQueues.get(streamSid);
|
||||||
|
if (!queue || queue.length === 0) {
|
||||||
|
this.ttsPlaying.set(streamSid, false);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
this.ttsPlaying.set(streamSid, true);
|
||||||
|
const playFn = queue.shift()!;
|
||||||
|
|
||||||
|
try {
|
||||||
|
await playFn();
|
||||||
|
} finally {
|
||||||
|
await this.processQueue(streamSid);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Clear TTS queue and interrupt current playback (barge-in).
|
||||||
|
*/
|
||||||
|
clearTtsQueue(streamSid: string): void {
|
||||||
|
this.ttsQueues.set(streamSid, []);
|
||||||
|
this.clearAudio(streamSid);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get active session by call ID.
|
* Get active session by call ID.
|
||||||
*/
|
*/
|
||||||
|
|||||||
@ -135,6 +135,17 @@ export class TwilioProvider implements VoiceCallProvider {
|
|||||||
this.callStreamMap.delete(callSid);
|
this.callStreamMap.delete(callSid);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Clear TTS queue for a call (barge-in).
|
||||||
|
* Used when user starts speaking to interrupt current TTS playback.
|
||||||
|
*/
|
||||||
|
clearTtsQueue(callSid: string): void {
|
||||||
|
const streamSid = this.callStreamMap.get(callSid);
|
||||||
|
if (streamSid && this.mediaStreamHandler) {
|
||||||
|
this.mediaStreamHandler.clearTtsQueue(streamSid);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Make an authenticated request to the Twilio API.
|
* Make an authenticated request to the Twilio API.
|
||||||
*/
|
*/
|
||||||
@ -504,7 +515,7 @@ export class TwilioProvider implements VoiceCallProvider {
|
|||||||
/**
|
/**
|
||||||
* Play TTS via core TTS and Twilio Media Streams.
|
* Play TTS via core TTS and Twilio Media Streams.
|
||||||
* Generates audio with core TTS, converts to mu-law, and streams via WebSocket.
|
* Generates audio with core TTS, converts to mu-law, and streams via WebSocket.
|
||||||
* Uses a jitter buffer to smooth out timing variations.
|
* Uses a queue to serialize playback and prevent overlapping audio.
|
||||||
*/
|
*/
|
||||||
private async playTtsViaStream(
|
private async playTtsViaStream(
|
||||||
text: string,
|
text: string,
|
||||||
@ -514,22 +525,30 @@ export class TwilioProvider implements VoiceCallProvider {
|
|||||||
throw new Error("TTS provider and media stream handler required");
|
throw new Error("TTS provider and media stream handler required");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Generate audio with core TTS (returns mu-law at 8kHz)
|
|
||||||
const muLawAudio = await this.ttsProvider.synthesizeForTelephony(text);
|
|
||||||
|
|
||||||
// Stream audio in 20ms chunks (160 bytes at 8kHz mu-law)
|
// Stream audio in 20ms chunks (160 bytes at 8kHz mu-law)
|
||||||
const CHUNK_SIZE = 160;
|
const CHUNK_SIZE = 160;
|
||||||
const CHUNK_DELAY_MS = 20;
|
const CHUNK_DELAY_MS = 20;
|
||||||
|
|
||||||
for (const chunk of chunkAudio(muLawAudio, CHUNK_SIZE)) {
|
const handler = this.mediaStreamHandler;
|
||||||
this.mediaStreamHandler.sendAudio(streamSid, chunk);
|
const ttsProvider = this.ttsProvider;
|
||||||
|
await handler.queueTts(streamSid, async (signal) => {
|
||||||
|
// Generate audio with core TTS (returns mu-law at 8kHz)
|
||||||
|
const muLawAudio = await ttsProvider.synthesizeForTelephony(text);
|
||||||
|
|
||||||
// Pace the audio to match real-time playback
|
for (const chunk of chunkAudio(muLawAudio, CHUNK_SIZE)) {
|
||||||
await new Promise((resolve) => setTimeout(resolve, CHUNK_DELAY_MS));
|
if (signal.aborted) break;
|
||||||
}
|
handler.sendAudio(streamSid, chunk);
|
||||||
|
|
||||||
// Send a mark to track when audio finishes
|
// Pace the audio to match real-time playback
|
||||||
this.mediaStreamHandler.sendMark(streamSid, `tts-${Date.now()}`);
|
await new Promise((resolve) => setTimeout(resolve, CHUNK_DELAY_MS));
|
||||||
|
if (signal.aborted) break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!signal.aborted) {
|
||||||
|
// Send a mark to track when audio finishes
|
||||||
|
handler.sendMark(streamSid, `tts-${Date.now()}`);
|
||||||
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@ -78,6 +78,11 @@ export class VoiceCallWebhookServer {
|
|||||||
`[voice-call] Transcript for ${providerCallId}: ${transcript}`,
|
`[voice-call] Transcript for ${providerCallId}: ${transcript}`,
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Clear TTS queue on barge-in (user started speaking, interrupt current playback)
|
||||||
|
if (this.provider.name === "twilio") {
|
||||||
|
(this.provider as TwilioProvider).clearTtsQueue(providerCallId);
|
||||||
|
}
|
||||||
|
|
||||||
// Look up our internal call ID from the provider call ID
|
// Look up our internal call ID from the provider call ID
|
||||||
const call = this.manager.getCallByProviderCallId(providerCallId);
|
const call = this.manager.getCallByProviderCallId(providerCallId);
|
||||||
if (!call) {
|
if (!call) {
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user