openclaw/apps/ios/Sources/Voice/TalkModeManager.swift
Chris Herold 6e33f3f0f3
iOS: implement dual-connection gateway architecture with deadlock fix
Aligns the iOS app with the Clawnet refactor by implementing proper role
separation for gateway connections. Uses separate operator and node sessions
to match the gateway's authorization requirements.

Changes:
- New GatewayOperatorSession: Wraps GatewayChannelActor for operator-role
  RPC requests (chat.*, health, sessions.list) without invoke handling
- Dual-connection architecture: Operator session for requests, node session
  for node.event calls (e.g., chat.subscribe)
- Separate websocket sessions: Each connection gets its own URLSession to
  prevent response cross-talk
- Updated chat transport: IOSGatewayChatTransport uses operator session for
  requests, node session for subscriptions

ClawdbotKit (shared):
- Deadlock fix in GatewayChannel.swift: Moved connection finalization
  (listen(), connected=true, isConnecting=false, waiter resumption) to occur
  before calling pushHandler. This fixes a latent bug where requests made
  from onConnected callbacks would deadlock. Does not affect macOS (its
  callback doesn't make requests).
- Package.swift: Fixed argument order for Swift 6.2 compatibility

iOS chat is now working. This is the base PR to unlock further work on
the iOS app.
2026-01-27 14:46:27 -08:00

736 lines
31 KiB
Swift

import AVFAudio
import MoltbotKit
import MoltbotProtocol
import Foundation
import Observation
import OSLog
import Speech
@MainActor
@Observable
final class TalkModeManager: NSObject {
private typealias SpeechRequest = SFSpeechAudioBufferRecognitionRequest
private static let defaultModelIdFallback = "eleven_v3"
var isEnabled: Bool = false
var isListening: Bool = false
var isSpeaking: Bool = false
var statusText: String = "Off"
private let audioEngine = AVAudioEngine()
private var speechRecognizer: SFSpeechRecognizer?
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
private var recognitionTask: SFSpeechRecognitionTask?
private var silenceTask: Task<Void, Never>?
private var lastHeard: Date?
private var lastTranscript: String = ""
private var lastSpokenText: String?
private var lastInterruptedAtSeconds: Double?
private var defaultVoiceId: String?
private var currentVoiceId: String?
private var defaultModelId: String?
private var currentModelId: String?
private var voiceOverrideActive = false
private var modelOverrideActive = false
private var defaultOutputFormat: String?
private var apiKey: String?
private var voiceAliases: [String: String] = [:]
private var interruptOnSpeech: Bool = true
private var mainSessionKey: String = "main"
private var fallbackVoiceId: String?
private var lastPlaybackWasPCM: Bool = false
var pcmPlayer: PCMStreamingAudioPlaying = PCMStreamingAudioPlayer.shared
var mp3Player: StreamingAudioPlaying = StreamingAudioPlayer.shared
private var gateway: GatewayOperatorSession?
private var nodeSession: GatewayNodeSession?
private let silenceWindow: TimeInterval = 0.7
private var chatSubscribedSessionKeys = Set<String>()
private let logger = Logger(subsystem: "bot.molt", category: "TalkMode")
func attachGateway(_ gateway: GatewayOperatorSession, nodeSession: GatewayNodeSession? = nil) {
self.gateway = gateway
self.nodeSession = nodeSession
}
func updateMainSessionKey(_ sessionKey: String?) {
let trimmed = (sessionKey ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
guard !trimmed.isEmpty else { return }
if SessionKey.isCanonicalMainSessionKey(self.mainSessionKey) { return }
self.mainSessionKey = trimmed
}
func setEnabled(_ enabled: Bool) {
self.isEnabled = enabled
if enabled {
self.logger.info("enabled")
Task { await self.start() }
} else {
self.logger.info("disabled")
self.stop()
}
}
func start() async {
guard self.isEnabled else { return }
if self.isListening { return }
self.logger.info("start")
self.statusText = "Requesting permissions…"
let micOk = await Self.requestMicrophonePermission()
guard micOk else {
self.logger.warning("start blocked: microphone permission denied")
self.statusText = "Microphone permission denied"
return
}
let speechOk = await Self.requestSpeechPermission()
guard speechOk else {
self.logger.warning("start blocked: speech permission denied")
self.statusText = "Speech recognition permission denied"
return
}
await self.reloadConfig()
do {
try Self.configureAudioSession()
try self.startRecognition()
self.isListening = true
self.statusText = "Listening"
self.startSilenceMonitor()
await self.subscribeChatIfNeeded(sessionKey: self.mainSessionKey)
self.logger.info("listening")
} catch {
self.isListening = false
self.statusText = "Start failed: \(error.localizedDescription)"
self.logger.error("start failed: \(error.localizedDescription, privacy: .public)")
}
}
func stop() {
self.isEnabled = false
self.isListening = false
self.statusText = "Off"
self.lastTranscript = ""
self.lastHeard = nil
self.silenceTask?.cancel()
self.silenceTask = nil
self.stopRecognition()
self.stopSpeaking()
self.lastInterruptedAtSeconds = nil
TalkSystemSpeechSynthesizer.shared.stop()
do {
try AVAudioSession.sharedInstance().setActive(false, options: [.notifyOthersOnDeactivation])
} catch {
self.logger.warning("audio session deactivate failed: \(error.localizedDescription, privacy: .public)")
}
Task { await self.unsubscribeAllChats() }
}
func userTappedOrb() {
self.stopSpeaking()
}
private func startRecognition() throws {
#if targetEnvironment(simulator)
throw NSError(domain: "TalkMode", code: 2, userInfo: [
NSLocalizedDescriptionKey: "Talk mode is not supported on the iOS simulator",
])
#endif
self.stopRecognition()
self.speechRecognizer = SFSpeechRecognizer()
guard let recognizer = self.speechRecognizer else {
throw NSError(domain: "TalkMode", code: 1, userInfo: [
NSLocalizedDescriptionKey: "Speech recognizer unavailable",
])
}
self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
self.recognitionRequest?.shouldReportPartialResults = true
guard let request = self.recognitionRequest else { return }
let input = self.audioEngine.inputNode
let format = input.outputFormat(forBus: 0)
guard format.sampleRate > 0, format.channelCount > 0 else {
throw NSError(domain: "TalkMode", code: 3, userInfo: [
NSLocalizedDescriptionKey: "Invalid audio input format",
])
}
input.removeTap(onBus: 0)
let tapBlock = Self.makeAudioTapAppendCallback(request: request)
input.installTap(onBus: 0, bufferSize: 2048, format: format, block: tapBlock)
self.audioEngine.prepare()
try self.audioEngine.start()
self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self] result, error in
guard let self else { return }
if let error {
if !self.isSpeaking {
self.statusText = "Speech error: \(error.localizedDescription)"
}
self.logger.debug("speech recognition error: \(error.localizedDescription, privacy: .public)")
}
guard let result else { return }
let transcript = result.bestTranscription.formattedString
Task { @MainActor in
await self.handleTranscript(transcript: transcript, isFinal: result.isFinal)
}
}
}
private func stopRecognition() {
self.recognitionTask?.cancel()
self.recognitionTask = nil
self.recognitionRequest?.endAudio()
self.recognitionRequest = nil
self.audioEngine.inputNode.removeTap(onBus: 0)
self.audioEngine.stop()
self.speechRecognizer = nil
}
private nonisolated static func makeAudioTapAppendCallback(request: SpeechRequest) -> AVAudioNodeTapBlock {
{ buffer, _ in
request.append(buffer)
}
}
private func handleTranscript(transcript: String, isFinal: Bool) async {
let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines)
if self.isSpeaking, self.interruptOnSpeech {
if self.shouldInterrupt(with: trimmed) {
self.stopSpeaking()
}
return
}
guard self.isListening else { return }
if !trimmed.isEmpty {
self.lastTranscript = trimmed
self.lastHeard = Date()
}
if isFinal {
self.lastTranscript = trimmed
}
}
private func startSilenceMonitor() {
self.silenceTask?.cancel()
self.silenceTask = Task { [weak self] in
guard let self else { return }
while self.isEnabled {
try? await Task.sleep(nanoseconds: 200_000_000)
await self.checkSilence()
}
}
}
private func checkSilence() async {
guard self.isListening, !self.isSpeaking else { return }
let transcript = self.lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
guard !transcript.isEmpty else { return }
guard let lastHeard else { return }
if Date().timeIntervalSince(lastHeard) < self.silenceWindow { return }
await self.finalizeTranscript(transcript)
}
private func finalizeTranscript(_ transcript: String) async {
self.isListening = false
self.statusText = "Thinking…"
self.lastTranscript = ""
self.lastHeard = nil
self.stopRecognition()
await self.reloadConfig()
let prompt = self.buildPrompt(transcript: transcript)
guard let gateway else {
self.statusText = "Gateway not connected"
self.logger.warning("finalize: gateway not connected")
await self.start()
return
}
do {
let startedAt = Date().timeIntervalSince1970
let sessionKey = self.mainSessionKey
await self.subscribeChatIfNeeded(sessionKey: sessionKey)
self.logger.info(
"chat.send start sessionKey=\(sessionKey, privacy: .public) chars=\(prompt.count, privacy: .public)")
let runId = try await self.sendChat(prompt, gateway: gateway)
self.logger.info("chat.send ok runId=\(runId, privacy: .public)")
let completion = await self.waitForChatCompletion(runId: runId, gateway: gateway, timeoutSeconds: 120)
if completion == .timeout {
self.logger.warning(
"chat completion timeout runId=\(runId, privacy: .public); attempting history fallback")
} else if completion == .aborted {
self.statusText = "Aborted"
self.logger.warning("chat completion aborted runId=\(runId, privacy: .public)")
await self.start()
return
} else if completion == .error {
self.statusText = "Chat error"
self.logger.warning("chat completion error runId=\(runId, privacy: .public)")
await self.start()
return
}
guard let assistantText = try await self.waitForAssistantText(
gateway: gateway,
since: startedAt,
timeoutSeconds: completion == .final ? 12 : 25)
else {
self.statusText = "No reply"
self.logger.warning("assistant text timeout runId=\(runId, privacy: .public)")
await self.start()
return
}
self.logger.info("assistant text ok chars=\(assistantText.count, privacy: .public)")
await self.playAssistant(text: assistantText)
} catch {
self.statusText = "Talk failed: \(error.localizedDescription)"
self.logger.error("finalize failed: \(error.localizedDescription, privacy: .public)")
}
await self.start()
}
private func subscribeChatIfNeeded(sessionKey: String) async {
let key = sessionKey.trimmingCharacters(in: .whitespacesAndNewlines)
guard !key.isEmpty else { return }
// Use nodeSession for node.event (chat.subscribe).
guard let nodeSession else { return }
guard !self.chatSubscribedSessionKeys.contains(key) else { return }
let payload = "{\"sessionKey\":\"\(key)\"}"
await nodeSession.sendEvent(event: "chat.subscribe", payloadJSON: payload)
self.chatSubscribedSessionKeys.insert(key)
self.logger.info("chat.subscribe ok sessionKey=\(key, privacy: .public)")
}
private func unsubscribeAllChats() async {
guard let nodeSession else { return }
let keys = self.chatSubscribedSessionKeys
self.chatSubscribedSessionKeys.removeAll()
for key in keys {
let payload = "{\"sessionKey\":\"\(key)\"}"
await nodeSession.sendEvent(event: "chat.unsubscribe", payloadJSON: payload)
}
}
private func buildPrompt(transcript: String) -> String {
let interrupted = self.lastInterruptedAtSeconds
self.lastInterruptedAtSeconds = nil
return TalkPromptBuilder.build(transcript: transcript, interruptedAtSeconds: interrupted)
}
private enum ChatCompletionState: CustomStringConvertible {
case final
case aborted
case error
case timeout
var description: String {
switch self {
case .final: "final"
case .aborted: "aborted"
case .error: "error"
case .timeout: "timeout"
}
}
}
private func sendChat(_ message: String, gateway: GatewayOperatorSession) async throws -> String {
struct SendResponse: Decodable { let runId: String }
let payload: [String: Any] = [
"sessionKey": self.mainSessionKey,
"message": message,
"thinking": "low",
"timeoutMs": 30000,
"idempotencyKey": UUID().uuidString,
]
let data = try JSONSerialization.data(withJSONObject: payload)
guard let json = String(bytes: data, encoding: .utf8) else {
throw NSError(
domain: "TalkModeManager",
code: 1,
userInfo: [NSLocalizedDescriptionKey: "Failed to encode chat payload"])
}
let res = try await gateway.request(method: "chat.send", paramsJSON: json, timeoutSeconds: 30)
let decoded = try JSONDecoder().decode(SendResponse.self, from: res)
return decoded.runId
}
private func waitForChatCompletion(
runId: String,
gateway: GatewayOperatorSession,
timeoutSeconds: Int = 120) async -> ChatCompletionState
{
let stream = await gateway.subscribeServerEvents(bufferingNewest: 200)
return await withTaskGroup(of: ChatCompletionState.self) { group in
group.addTask { [runId] in
for await evt in stream {
if Task.isCancelled { return .timeout }
guard evt.event == "chat", let payload = evt.payload else { continue }
guard let chatEvent = try? GatewayPayloadDecoding.decode(payload, as: ChatEvent.self) else {
continue
}
guard chatEvent.runid == runId else { continue }
if let state = chatEvent.state.value as? String {
switch state {
case "final": return .final
case "aborted": return .aborted
case "error": return .error
default: break
}
}
}
return .timeout
}
group.addTask {
try? await Task.sleep(nanoseconds: UInt64(timeoutSeconds) * 1_000_000_000)
return .timeout
}
let result = await group.next() ?? .timeout
group.cancelAll()
return result
}
}
private func waitForAssistantText(
gateway: GatewayOperatorSession,
since: Double,
timeoutSeconds: Int) async throws -> String?
{
let deadline = Date().addingTimeInterval(TimeInterval(timeoutSeconds))
while Date() < deadline {
if let text = try await self.fetchLatestAssistantText(gateway: gateway, since: since) {
return text
}
try? await Task.sleep(nanoseconds: 300_000_000)
}
return nil
}
private func fetchLatestAssistantText(
gateway: GatewayOperatorSession,
since: Double? = nil) async throws -> String?
{
let res = try await gateway.request(
method: "chat.history",
paramsJSON: "{\"sessionKey\":\"\(self.mainSessionKey)\"}",
timeoutSeconds: 15)
guard let json = try JSONSerialization.jsonObject(with: res) as? [String: Any] else { return nil }
guard let messages = json["messages"] as? [[String: Any]] else { return nil }
for msg in messages.reversed() {
guard (msg["role"] as? String) == "assistant" else { continue }
if let since, let timestamp = msg["timestamp"] as? Double,
TalkHistoryTimestamp.isAfter(timestamp, sinceSeconds: since) == false
{
continue
}
guard let content = msg["content"] as? [[String: Any]] else { continue }
let text = content.compactMap { $0["text"] as? String }.joined(separator: "\n")
let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
if !trimmed.isEmpty { return trimmed }
}
return nil
}
private func playAssistant(text: String) async {
let parsed = TalkDirectiveParser.parse(text)
let directive = parsed.directive
let cleaned = parsed.stripped.trimmingCharacters(in: .whitespacesAndNewlines)
guard !cleaned.isEmpty else { return }
let requestedVoice = directive?.voiceId?.trimmingCharacters(in: .whitespacesAndNewlines)
let resolvedVoice = self.resolveVoiceAlias(requestedVoice)
if requestedVoice?.isEmpty == false, resolvedVoice == nil {
self.logger.warning("unknown voice alias \(requestedVoice ?? "?", privacy: .public)")
}
if let voice = resolvedVoice {
if directive?.once != true {
self.currentVoiceId = voice
self.voiceOverrideActive = true
}
}
if let model = directive?.modelId {
if directive?.once != true {
self.currentModelId = model
self.modelOverrideActive = true
}
}
self.statusText = "Generating voice…"
self.isSpeaking = true
self.lastSpokenText = cleaned
do {
let started = Date()
let language = ElevenLabsTTSClient.validatedLanguage(directive?.language)
let resolvedKey =
(self.apiKey?.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? self.apiKey : nil) ??
ProcessInfo.processInfo.environment["ELEVENLABS_API_KEY"]
let apiKey = resolvedKey?.trimmingCharacters(in: .whitespacesAndNewlines)
let preferredVoice = resolvedVoice ?? self.currentVoiceId ?? self.defaultVoiceId
let voiceId: String? = if let apiKey, !apiKey.isEmpty {
await self.resolveVoiceId(preferred: preferredVoice, apiKey: apiKey)
} else {
nil
}
let canUseElevenLabs = (voiceId?.isEmpty == false) && (apiKey?.isEmpty == false)
if canUseElevenLabs, let voiceId, let apiKey {
let desiredOutputFormat = (directive?.outputFormat ?? self.defaultOutputFormat)?
.trimmingCharacters(in: .whitespacesAndNewlines)
let requestedOutputFormat = (desiredOutputFormat?.isEmpty == false) ? desiredOutputFormat : nil
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(requestedOutputFormat ?? "pcm_44100")
if outputFormat == nil, let requestedOutputFormat {
self.logger.warning(
"talk output_format unsupported for local playback: \(requestedOutputFormat, privacy: .public)")
}
let modelId = directive?.modelId ?? self.currentModelId ?? self.defaultModelId
func makeRequest(outputFormat: String?) -> ElevenLabsTTSRequest {
ElevenLabsTTSRequest(
text: cleaned,
modelId: modelId,
outputFormat: outputFormat,
speed: TalkTTSValidation.resolveSpeed(speed: directive?.speed, rateWPM: directive?.rateWPM),
stability: TalkTTSValidation.validatedStability(directive?.stability, modelId: modelId),
similarity: TalkTTSValidation.validatedUnit(directive?.similarity),
style: TalkTTSValidation.validatedUnit(directive?.style),
speakerBoost: directive?.speakerBoost,
seed: TalkTTSValidation.validatedSeed(directive?.seed),
normalize: ElevenLabsTTSClient.validatedNormalize(directive?.normalize),
language: language,
latencyTier: TalkTTSValidation.validatedLatencyTier(directive?.latencyTier))
}
let request = makeRequest(outputFormat: outputFormat)
let client = ElevenLabsTTSClient(apiKey: apiKey)
let stream = client.streamSynthesize(voiceId: voiceId, request: request)
if self.interruptOnSpeech {
do {
try self.startRecognition()
} catch {
self.logger.warning(
"startRecognition during speak failed: \(error.localizedDescription, privacy: .public)")
}
}
self.statusText = "Speaking…"
let sampleRate = TalkTTSValidation.pcmSampleRate(from: outputFormat)
let result: StreamingPlaybackResult
if let sampleRate {
self.lastPlaybackWasPCM = true
var playback = await self.pcmPlayer.play(stream: stream, sampleRate: sampleRate)
if !playback.finished, playback.interruptedAt == nil {
let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100")
self.logger.warning("pcm playback failed; retrying mp3")
self.lastPlaybackWasPCM = false
let mp3Stream = client.streamSynthesize(
voiceId: voiceId,
request: makeRequest(outputFormat: mp3Format))
playback = await self.mp3Player.play(stream: mp3Stream)
}
result = playback
} else {
self.lastPlaybackWasPCM = false
result = await self.mp3Player.play(stream: stream)
}
let duration = Date().timeIntervalSince(started)
self.logger.info("elevenlabs stream finished=\(result.finished, privacy: .public) dur=\(duration, privacy: .public)s")
if !result.finished, let interruptedAt = result.interruptedAt {
self.lastInterruptedAtSeconds = interruptedAt
}
} else {
self.logger.warning("tts unavailable; falling back to system voice (missing key or voiceId)")
if self.interruptOnSpeech {
do {
try self.startRecognition()
} catch {
self.logger.warning(
"startRecognition during speak failed: \(error.localizedDescription, privacy: .public)")
}
}
self.statusText = "Speaking (System)…"
try await TalkSystemSpeechSynthesizer.shared.speak(text: cleaned, language: language)
}
} catch {
self.logger.error(
"tts failed: \(error.localizedDescription, privacy: .public); falling back to system voice")
do {
if self.interruptOnSpeech {
do {
try self.startRecognition()
} catch {
self.logger.warning(
"startRecognition during speak failed: \(error.localizedDescription, privacy: .public)")
}
}
self.statusText = "Speaking (System)…"
let language = ElevenLabsTTSClient.validatedLanguage(directive?.language)
try await TalkSystemSpeechSynthesizer.shared.speak(text: cleaned, language: language)
} catch {
self.statusText = "Speak failed: \(error.localizedDescription)"
self.logger.error("system voice failed: \(error.localizedDescription, privacy: .public)")
}
}
self.stopRecognition()
self.isSpeaking = false
}
private func stopSpeaking(storeInterruption: Bool = true) {
guard self.isSpeaking else { return }
let interruptedAt = self.lastPlaybackWasPCM
? self.pcmPlayer.stop()
: self.mp3Player.stop()
if storeInterruption {
self.lastInterruptedAtSeconds = interruptedAt
}
_ = self.lastPlaybackWasPCM
? self.mp3Player.stop()
: self.pcmPlayer.stop()
TalkSystemSpeechSynthesizer.shared.stop()
self.isSpeaking = false
}
private func shouldInterrupt(with transcript: String) -> Bool {
let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines)
guard trimmed.count >= 3 else { return false }
if let spoken = self.lastSpokenText?.lowercased(), spoken.contains(trimmed.lowercased()) {
return false
}
return true
}
private func resolveVoiceAlias(_ value: String?) -> String? {
let trimmed = (value ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
guard !trimmed.isEmpty else { return nil }
let normalized = trimmed.lowercased()
if let mapped = self.voiceAliases[normalized] { return mapped }
if self.voiceAliases.values.contains(where: { $0.caseInsensitiveCompare(trimmed) == .orderedSame }) {
return trimmed
}
return Self.isLikelyVoiceId(trimmed) ? trimmed : nil
}
private func resolveVoiceId(preferred: String?, apiKey: String) async -> String? {
let trimmed = preferred?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
if !trimmed.isEmpty {
if let resolved = self.resolveVoiceAlias(trimmed) { return resolved }
self.logger.warning("unknown voice alias \(trimmed, privacy: .public)")
}
if let fallbackVoiceId { return fallbackVoiceId }
do {
let voices = try await ElevenLabsTTSClient(apiKey: apiKey).listVoices()
guard let first = voices.first else {
self.logger.warning("elevenlabs voices list empty")
return nil
}
self.fallbackVoiceId = first.voiceId
if self.defaultVoiceId == nil {
self.defaultVoiceId = first.voiceId
}
if !self.voiceOverrideActive {
self.currentVoiceId = first.voiceId
}
let name = first.name ?? "unknown"
self.logger
.info("default voice selected \(name, privacy: .public) (\(first.voiceId, privacy: .public))")
return first.voiceId
} catch {
self.logger.error("elevenlabs list voices failed: \(error.localizedDescription, privacy: .public)")
return nil
}
}
private static func isLikelyVoiceId(_ value: String) -> Bool {
guard value.count >= 10 else { return false }
return value.allSatisfy { $0.isLetter || $0.isNumber || $0 == "-" || $0 == "_" }
}
private func reloadConfig() async {
guard let gateway else { return }
do {
let res = try await gateway.request(method: "config.get", paramsJSON: "{}", timeoutSeconds: 8)
guard let json = try JSONSerialization.jsonObject(with: res) as? [String: Any] else { return }
guard let config = json["config"] as? [String: Any] else { return }
let talk = config["talk"] as? [String: Any]
let session = config["session"] as? [String: Any]
let mainKey = SessionKey.normalizeMainKey(session?["mainKey"] as? String)
if !SessionKey.isCanonicalMainSessionKey(self.mainSessionKey) {
self.mainSessionKey = mainKey
}
self.defaultVoiceId = (talk?["voiceId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
if let aliases = talk?["voiceAliases"] as? [String: Any] {
var resolved: [String: String] = [:]
for (key, value) in aliases {
guard let id = value as? String else { continue }
let normalizedKey = key.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
let trimmedId = id.trimmingCharacters(in: .whitespacesAndNewlines)
guard !normalizedKey.isEmpty, !trimmedId.isEmpty else { continue }
resolved[normalizedKey] = trimmedId
}
self.voiceAliases = resolved
} else {
self.voiceAliases = [:]
}
if !self.voiceOverrideActive {
self.currentVoiceId = self.defaultVoiceId
}
let model = (talk?["modelId"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
self.defaultModelId = (model?.isEmpty == false) ? model : Self.defaultModelIdFallback
if !self.modelOverrideActive {
self.currentModelId = self.defaultModelId
}
self.defaultOutputFormat = (talk?["outputFormat"] as? String)?
.trimmingCharacters(in: .whitespacesAndNewlines)
self.apiKey = (talk?["apiKey"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines)
if let interrupt = talk?["interruptOnSpeech"] as? Bool {
self.interruptOnSpeech = interrupt
}
} catch {
self.defaultModelId = Self.defaultModelIdFallback
if !self.modelOverrideActive {
self.currentModelId = self.defaultModelId
}
}
}
private static func configureAudioSession() throws {
let session = AVAudioSession.sharedInstance()
try session.setCategory(.playAndRecord, mode: .voiceChat, options: [
.duckOthers,
.mixWithOthers,
.allowBluetoothHFP,
.defaultToSpeaker,
])
try session.setActive(true, options: [])
}
private nonisolated static func requestMicrophonePermission() async -> Bool {
await withCheckedContinuation(isolation: nil) { cont in
AVAudioApplication.requestRecordPermission { ok in
cont.resume(returning: ok)
}
}
}
private nonisolated static func requestSpeechPermission() async -> Bool {
await withCheckedContinuation(isolation: nil) { cont in
SFSpeechRecognizer.requestAuthorization { status in
cont.resume(returning: status == .authorized)
}
}
}
}