From 5a9fd4de5ce18e07422cec43909e3d7a944c89be Mon Sep 17 00:00:00 2001 From: "chenglun.hu" Date: Thu, 29 Jan 2026 10:15:11 +0800 Subject: [PATCH] fix(voice-call): align OpenAI Realtime STT with GA API format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The OpenAI Realtime STT provider was using an outdated Beta API format that doesn't produce transcriptions when using the GA endpoint. Changes: - Event type: transcription_session.update → session.update - Session type: Added type: "transcription" - Structure: Flat session → nested session.audio.input - Format: g711_ulaw → audio/pcmu (MIME type) Fixes #3447 Co-Authored-By: Claude Opus 4.5 --- .../src/providers/stt-openai-realtime.ts | 29 ++++++++++++------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/extensions/voice-call/src/providers/stt-openai-realtime.ts b/extensions/voice-call/src/providers/stt-openai-realtime.ts index 5cd52658d..8c80fe9ac 100644 --- a/extensions/voice-call/src/providers/stt-openai-realtime.ts +++ b/extensions/voice-call/src/providers/stt-openai-realtime.ts @@ -124,19 +124,26 @@ class OpenAIRealtimeSTTSession implements RealtimeSTTSession { this.connected = true; this.reconnectAttempts = 0; - // Configure the transcription session + // Configure the transcription session (GA API format) this.sendEvent({ - type: "transcription_session.update", + type: "session.update", session: { - input_audio_format: "g711_ulaw", - input_audio_transcription: { - model: this.model, - }, - turn_detection: { - type: "server_vad", - threshold: this.vadThreshold, - prefix_padding_ms: 300, - silence_duration_ms: this.silenceDurationMs, + type: "transcription", + audio: { + input: { + format: { + type: "audio/pcmu", + }, + transcription: { + model: this.model, + }, + turn_detection: { + type: "server_vad", + threshold: this.vadThreshold, + prefix_padding_ms: 300, + silence_duration_ms: this.silenceDurationMs, + }, + }, }, }, });