import crypto from "node:crypto";
import type { TwilioConfig } from "../config.js";
import type { MediaStreamHandler } from "../media-stream.js";
import type {
  HangupCallInput,
  InitiateCallInput,
  InitiateCallResult,
  NormalizedEvent,
  PlayTtsInput,
  ProviderWebhookParseResult,
  StartListeningInput,
  StopListeningInput,
  WebhookContext,
  WebhookVerificationResult,
} from "../types.js";
import { escapeXml, mapVoiceToPolly } from "../voice-mapping.js";
import type { VoiceCallProvider } from "./base.js";
import type { OpenAITTSProvider } from "./tts-openai.js";
import { chunkAudio } from "./tts-openai.js";
import { twilioApiRequest } from "./twilio/api.js";
import { verifyTwilioProviderWebhook } from "./twilio/webhook.js";

/**
 * Twilio Voice API provider implementation.
 *
 * Uses Twilio Programmable Voice API with Media Streams for real-time
 * bidirectional audio streaming.
 *
 * @see https://www.twilio.com/docs/voice
 * @see https://www.twilio.com/docs/voice/media-streams
 */

/** Construction-time options for {@link TwilioProvider}. */
export interface TwilioProviderOptions {
  /** Allow ngrok free tier compatibility mode (less secure) */
  allowNgrokFreeTier?: boolean;
  /** Override public URL for signature verification */
  publicUrl?: string;
  /** Path for media stream WebSocket (e.g., /voice/stream) */
  streamPath?: string;
  /** Skip webhook signature verification (development only) */
  skipVerification?: boolean;
}

/**
 * Voice-call provider backed by the Twilio REST API and TwiML webhooks.
 *
 * Keeps two per-call lookup tables keyed by Twilio call SID: the webhook URL
 * used when the call was created, and the media stream SID registered for it.
 */
export class TwilioProvider implements VoiceCallProvider {
  readonly name = "twilio" as const;

  private readonly accountSid: string;
  private readonly authToken: string;
  private readonly baseUrl: string;
  /** Map of call SID to the webhook URL the call was created with */
  private readonly callWebhookUrls = new Map<string, string>();
  private readonly options: TwilioProviderOptions;

  /** Current public webhook URL (set when tunnel starts or from config) */
  private currentPublicUrl: string | null = null;

  /** Optional OpenAI TTS provider for streaming TTS */
  private ttsProvider: OpenAITTSProvider | null = null;

  /** Optional media stream handler for sending audio */
  private mediaStreamHandler: MediaStreamHandler | null = null;

  /** Map of call SID to stream SID for media streams */
  private callStreamMap = new Map<string, string>();

  /**
   * @param config - Twilio credentials; both `accountSid` and `authToken` are required.
   * @param options - Optional behaviour switches; `publicUrl` seeds the current public URL.
   * @throws Error when the account SID or auth token is missing.
   */
  constructor(config: TwilioConfig, options: TwilioProviderOptions = {}) {
    if (!config.accountSid) {
      throw new Error("Twilio Account SID is required");
    }
    if (!config.authToken) {
      throw new Error("Twilio Auth Token is required");
    }

    this.accountSid = config.accountSid;
    this.authToken = config.authToken;
    this.baseUrl = `https://api.twilio.com/2010-04-01/Accounts/${this.accountSid}`;
    this.options = options;

    if (options.publicUrl) {
      this.currentPublicUrl = options.publicUrl;
    }
  }

  /** Replace the public URL used for stream URL derivation and webhook verification. */
  setPublicUrl(url: string): void {
    this.currentPublicUrl = url;
  }

  /** Return the current public URL, or `null` when none has been set. */
  getPublicUrl(): string | null {
    return this.currentPublicUrl;
  }

  /** Install the TTS provider used for streamed speech output. */
  setTTSProvider(provider: OpenAITTSProvider): void {
    this.ttsProvider = provider;
  }

  /** Install the handler used to push audio and marks onto media streams. */
  setMediaStreamHandler(handler: MediaStreamHandler): void {
    this.mediaStreamHandler = handler;
  }

  /** Record which media stream SID belongs to a call SID. */
  registerCallStream(callSid: string, streamSid: string): void {
    this.callStreamMap.set(callSid, streamSid);
  }

  /** Forget the media stream SID recorded for a call SID. */
  unregisterCallStream(callSid: string): void {
    this.callStreamMap.delete(callSid);
  }

  /**
   * Make an authenticated request to the Twilio API.
   *
   * @param endpoint - Path appended to the account base URL (e.g. `/Calls.json`).
   * @param params - Form parameters sent as the request body.
   * @param options - `allowNotFound` is forwarded unchanged to `twilioApiRequest`.
   * @returns The response produced by `twilioApiRequest`, typed as `T`.
   */
  private async apiRequest<T = unknown>(
    endpoint: string,
    params: Record<string, string>,
    options?: { allowNotFound?: boolean },
  ): Promise<T> {
    return await twilioApiRequest({
      baseUrl: this.baseUrl,
      accountSid: this.accountSid,
      authToken: this.authToken,
      endpoint,
      body: params,
      allowNotFound: options?.allowNotFound,
    });
  }

  /**
   * Verify Twilio webhook signature using HMAC-SHA1.
   *
   * Handles reverse proxy scenarios (Tailscale, nginx, ngrok) by reconstructing
   * the public URL from forwarding headers.
   *
   * @see https://www.twilio.com/docs/usage/webhooks/webhooks-security
   */
  verifyWebhook(ctx: WebhookContext): WebhookVerificationResult {
    return verifyTwilioProviderWebhook({
      ctx,
      authToken: this.authToken,
      currentPublicUrl: this.currentPublicUrl,
      options: this.options,
    });
  }

  /**
   * Parse Twilio webhook event into normalized format.
   *
   * @returns The normalized event (if any) plus the TwiML body to send back;
   *   a bare 400 result when parsing or TwiML generation throws.
   */
  parseWebhookEvent(ctx: WebhookContext): ProviderWebhookParseResult {
    try {
      const params = new URLSearchParams(ctx.rawBody);
      const callIdFromQuery =
        typeof ctx.query?.callId === "string" && ctx.query.callId.trim()
          ? ctx.query.callId.trim()
          : undefined;
      const event = this.normalizeEvent(params, callIdFromQuery);

      // For Twilio, we must return TwiML. Most actions are driven by Calls API updates,
      // so the webhook response is typically a pause to keep the call alive.
      const twiml = this.generateTwimlResponse(ctx);

      return {
        events: event ? [event] : [],
        providerResponseBody: twiml,
        providerResponseHeaders: { "Content-Type": "application/xml" },
        statusCode: 200,
      };
    } catch {
      return { events: [], statusCode: 400 };
    }
  }

  /**
   * Parse Twilio direction to normalized format.
   */
  private static parseDirection(
    direction: string | null,
  ): "inbound" | "outbound" | undefined {
    if (direction === "inbound") return "inbound";
    if (direction === "outbound-api" || direction === "outbound-dial")
      return "outbound";
    return undefined;
  }

  /**
   * Convert Twilio webhook params to normalized event format.
   *
   * Precedence: a speech result wins over DTMF digits, which win over a call
   * status change. Returns `null` when none of them yields an event.
   *
   * @param params - Form-encoded webhook body.
   * @param callIdOverride - Internal call id (from the `callId` query param);
   *   falls back to the Twilio call SID when absent.
   */
  private normalizeEvent(
    params: URLSearchParams,
    callIdOverride?: string,
  ): NormalizedEvent | null {
    const callSid = params.get("CallSid") || "";

    const baseEvent = {
      id: crypto.randomUUID(),
      callId: callIdOverride || callSid,
      providerCallId: callSid,
      timestamp: Date.now(),
      direction: TwilioProvider.parseDirection(params.get("Direction")),
      from: params.get("From") || undefined,
      to: params.get("To") || undefined,
    };

    // Handle speech result (from <Gather>)
    const speechResult = params.get("SpeechResult");
    if (speechResult) {
      return {
        ...baseEvent,
        type: "call.speech",
        transcript: speechResult,
        isFinal: true,
        confidence: parseFloat(params.get("Confidence") || "0.9"),
      };
    }

    // Handle DTMF
    const digits = params.get("Digits");
    if (digits) {
      return { ...baseEvent, type: "call.dtmf", digits };
    }

    // Handle call status changes
    const callStatus = params.get("CallStatus");
    switch (callStatus) {
      case "initiated":
        return { ...baseEvent, type: "call.initiated" };
      case "ringing":
        return { ...baseEvent, type: "call.ringing" };
      case "in-progress":
        return { ...baseEvent, type: "call.answered" };
      case "completed":
      case "busy":
      case "no-answer":
      case "failed":
        return { ...baseEvent, type: "call.ended", reason: callStatus };
      case "canceled":
        return { ...baseEvent, type: "call.ended", reason: "hangup-bot" };
      default:
        return null;
    }
  }

  /** TwiML document with no verbs: acknowledges the webhook without acting. */
  private static readonly EMPTY_TWIML =
    '<?xml version="1.0" encoding="UTF-8"?><Response></Response>';

  /** TwiML document that pauses, keeping the call alive when no stream URL is available. */
  private static readonly PAUSE_TWIML = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Pause length="30"/>
</Response>`;

  /**
   * Generate TwiML response for webhook.
   * When a call is answered, connects to media stream for bidirectional audio.
   *
   * Inbound calls are connected immediately; other calls only once their
   * status is `in-progress`. Without a stream URL the pause document is used.
   */
  private generateTwimlResponse(ctx?: WebhookContext): string {
    if (!ctx) return TwilioProvider.EMPTY_TWIML;

    const params = new URLSearchParams(ctx.rawBody);
    const callStatus = params.get("CallStatus");
    const direction = params.get("Direction");

    console.log(
      `[voice-call] generateTwimlResponse: status=${callStatus} direction=${direction}`,
    );

    // Inbound calls are answered straight away; outbound calls wait for in-progress.
    const shouldConnect =
      direction === "inbound" || callStatus === "in-progress";
    if (!shouldConnect) {
      return TwilioProvider.EMPTY_TWIML;
    }

    const streamUrl = this.getStreamUrl();
    return streamUrl
      ? this.getStreamConnectXml(streamUrl)
      : TwilioProvider.PAUSE_TWIML;
  }

  /**
   * Get the WebSocket URL for media streaming.
   * Derives from the public URL origin + stream path.
   *
   * @returns `null` when either the public URL or the stream path is unset.
   */
  private getStreamUrl(): string | null {
    if (!this.currentPublicUrl || !this.options.streamPath) {
      return null;
    }

    // Extract just the origin (host) from the public URL, ignoring any path
    const url = new URL(this.currentPublicUrl);
    const origin = url.origin;

    // Convert https:// to wss:// (and http:// to ws://) for WebSocket
    const wsOrigin = origin
      .replace(/^https:\/\//, "wss://")
      .replace(/^http:\/\//, "ws://");

    // Append the stream path, normalizing a missing leading slash
    const path = this.options.streamPath.startsWith("/")
      ? this.options.streamPath
      : `/${this.options.streamPath}`;

    return `${wsOrigin}${path}`;
  }

  /**
   * Generate TwiML to connect a call to a WebSocket media stream.
   * This enables bidirectional audio streaming for real-time STT/TTS.
   *
   * @param streamUrl - WebSocket URL (wss://...) for the media stream
   */
  getStreamConnectXml(streamUrl: string): string {
    // The URL is attribute content, so it must be XML-escaped.
    return `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Connect>
    <Stream url="${escapeXml(streamUrl)}" />
  </Connect>
</Response>`;
  }

  /**
   * Initiate an outbound call via Twilio API.
   * If inlineTwiml is provided, uses that directly (for notify mode).
   * Otherwise, uses webhook URL for dynamic TwiML.
   *
   * The webhook URL (with the internal `callId` appended as a query parameter)
   * is remembered per call SID for later TwiML updates.
   */
  async initiateCall(input: InitiateCallInput): Promise<InitiateCallResult> {
    const url = new URL(input.webhookUrl);
    url.searchParams.set("callId", input.callId);

    // Build request params
    const params: Record<string, string> = {
      To: input.to,
      From: input.from,
      StatusCallback: url.toString(),
      StatusCallbackEvent: "initiated ringing answered completed",
      Timeout: "30",
    };

    // Use inline TwiML for notify mode (simpler, no webhook needed)
    if (input.inlineTwiml) {
      params.Twiml = input.inlineTwiml;
    } else {
      params.Url = url.toString();
    }

    const result = await this.apiRequest<TwilioCallResponse>(
      "/Calls.json",
      params,
    );

    this.callWebhookUrls.set(result.sid, url.toString());

    return {
      providerCallId: result.sid,
      status: result.status === "queued" ? "queued" : "initiated",
    };
  }

  /**
   * Hang up a call via Twilio API.
   *
   * Drops the stored webhook URL first, then marks the call completed;
   * `allowNotFound` is passed through to the API helper.
   */
  async hangupCall(input: HangupCallInput): Promise<void> {
    this.callWebhookUrls.delete(input.providerCallId);

    await this.apiRequest(
      `/Calls/${input.providerCallId}.json`,
      { Status: "completed" },
      { allowNotFound: true },
    );
  }

  /**
   * Play TTS audio via Twilio.
   *
   * Two modes:
   * 1. OpenAI TTS + Media Streams: If TTS provider and media stream are available,
   *    generates audio via OpenAI and streams it through WebSocket (preferred).
   * 2. TwiML <Say>: Falls back to Twilio's native TTS with Polly voices.
   *    Note: This may not work on all Twilio accounts.
   *
   * @throws Error when the fallback is needed but no webhook URL is stored for the call.
   */
  async playTts(input: PlayTtsInput): Promise<void> {
    // Try OpenAI TTS via media stream first (if configured)
    const streamSid = this.callStreamMap.get(input.providerCallId);
    if (this.ttsProvider && this.mediaStreamHandler && streamSid) {
      try {
        await this.playTtsViaStream(input.text, streamSid);
        return;
      } catch (err) {
        console.warn(
          `[voice-call] OpenAI TTS failed, falling back to Twilio <Say>:`,
          err instanceof Error ? err.message : err,
        );
        // Fall through to TwiML <Say> fallback
      }
    }

    // Fall back to TwiML <Say> (may not work on all accounts)
    const webhookUrl = this.callWebhookUrls.get(input.providerCallId);
    if (!webhookUrl) {
      throw new Error(
        "Missing webhook URL for this call (provider state not initialized)",
      );
    }

    console.warn(
      "[voice-call] Using TwiML <Say> fallback - OpenAI TTS not configured or media stream not active",
    );

    const pollyVoice = mapVoiceToPolly(input.voice);
    // Speak the text, then gather speech so the reply is posted back to the webhook.
    const twiml = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Say voice="${pollyVoice}">${escapeXml(input.text)}</Say>
  <Gather input="speech" speechTimeout="auto" action="${escapeXml(webhookUrl)}" method="POST">
    <Say>.</Say>
  </Gather>
</Response>`;

    await this.apiRequest(`/Calls/${input.providerCallId}.json`, {
      Twiml: twiml,
    });
  }

  /**
   * Play TTS via OpenAI and Twilio Media Streams.
   * Generates audio with the TTS provider's `synthesizeForTwilio`, then sends
   * it over the media stream in fixed-size chunks, waiting 20 ms after each
   * chunk to approximate real-time playback. A mark is sent after the last chunk.
   *
   * @throws Error when the TTS provider or media stream handler is not set.
   */
  private async playTtsViaStream(
    text: string,
    streamSid: string,
  ): Promise<void> {
    if (!this.ttsProvider || !this.mediaStreamHandler) {
      throw new Error("TTS provider and media stream handler required");
    }

    // Generate audio with OpenAI TTS (returns mu-law at 8kHz)
    const muLawAudio = await this.ttsProvider.synthesizeForTwilio(text);

    // Stream audio in 20ms chunks (160 bytes at 8kHz mu-law)
    const CHUNK_SIZE = 160;
    const CHUNK_DELAY_MS = 20;

    for (const chunk of chunkAudio(muLawAudio, CHUNK_SIZE)) {
      this.mediaStreamHandler.sendAudio(streamSid, chunk);

      // Pace the audio to match real-time playback
      await new Promise<void>((resolve) => setTimeout(resolve, CHUNK_DELAY_MS));
    }

    // Send a mark to track when audio finishes
    this.mediaStreamHandler.sendMark(streamSid, `tts-${Date.now()}`);
  }

  /**
   * Start listening for speech via Twilio <Gather>.
   *
   * Updates the live call with a speech <Gather> whose result is posted to
   * the webhook URL stored for the call. No `language` attribute is set, so
   * Twilio's default recognition language applies.
   *
   * @throws Error when no webhook URL is stored for the call.
   */
  async startListening(input: StartListeningInput): Promise<void> {
    const webhookUrl = this.callWebhookUrls.get(input.providerCallId);
    if (!webhookUrl) {
      throw new Error(
        "Missing webhook URL for this call (provider state not initialized)",
      );
    }

    const twiml = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Gather input="speech" speechTimeout="auto" action="${escapeXml(webhookUrl)}" method="POST">
  </Gather>
</Response>`;

    await this.apiRequest(`/Calls/${input.providerCallId}.json`, {
      Twiml: twiml,
    });
  }

  /**
   * Stop listening - for Twilio this is a no-op as <Gather> auto-ends.
   */
  async stopListening(_input: StopListeningInput): Promise<void> {
    // Twilio's <Gather> automatically stops on speech end
    // No explicit action needed
  }
}

// -----------------------------------------------------------------------------
// Twilio-specific types
// -----------------------------------------------------------------------------

/** Fields read from the Twilio Calls API response. */
interface TwilioCallResponse {
  sid: string;
  status: string;
  direction: string;
  from: string;
  to: string;
  uri: string;
}