Fixed an index out of bounds crash in VoiceWakeRuntime.trimmedAfterTrigger that occurred when processing voice transcripts. The issue was caused by attempting to subscript a string with an index that could exceed the string's endIndex when using indices from a lowercased version of the string. Added a guard statement to check that the index is within bounds before attempting to subscript the string. If the index is out of bounds, the function continues to the next trigger instead of crashing. Fixes the crash reported in crash.txt at line 743.
807 lines
32 KiB
Swift
807 lines
32 KiB
Swift
import AVFoundation
|
|
import Foundation
|
|
import OSLog
|
|
import Speech
|
|
import SwabbleKit
|
|
#if canImport(AppKit)
|
|
import AppKit
|
|
#endif
|
|
|
|
/// Background listener that keeps the voice-wake pipeline alive outside the settings test view.
|
|
actor VoiceWakeRuntime {
|
|
static let shared = VoiceWakeRuntime()
|
|
|
|
enum ListeningState { case idle, voiceWake, pushToTalk }
|
|
|
|
private let logger = Logger(subsystem: "bot.molt", category: "voicewake.runtime")
|
|
|
|
private var recognizer: SFSpeechRecognizer?
|
|
// Lazily created on start to avoid creating an AVAudioEngine at app launch, which can switch Bluetooth
|
|
// headphones into the low-quality headset profile even if Voice Wake is disabled.
|
|
private var audioEngine: AVAudioEngine?
|
|
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
|
|
private var recognitionTask: SFSpeechRecognitionTask?
|
|
private var recognitionGeneration: Int = 0 // drop stale callbacks after restarts
|
|
private var lastHeard: Date?
|
|
private var noiseFloorRMS: Double = 1e-4
|
|
private var captureStartedAt: Date?
|
|
private var captureTask: Task<Void, Never>?
|
|
private var capturedTranscript: String = ""
|
|
private var isCapturing: Bool = false
|
|
private var heardBeyondTrigger: Bool = false
|
|
private var triggerChimePlayed: Bool = false
|
|
private var committedTranscript: String = ""
|
|
private var volatileTranscript: String = ""
|
|
private var cooldownUntil: Date?
|
|
private var currentConfig: RuntimeConfig?
|
|
private var listeningState: ListeningState = .idle
|
|
private var overlayToken: UUID?
|
|
private var activeTriggerEndTime: TimeInterval?
|
|
private var scheduledRestartTask: Task<Void, Never>?
|
|
private var lastLoggedText: String?
|
|
private var lastLoggedAt: Date?
|
|
private var lastTapLogAt: Date?
|
|
private var lastCallbackLogAt: Date?
|
|
private var lastTranscript: String?
|
|
private var lastTranscriptAt: Date?
|
|
private var preDetectTask: Task<Void, Never>?
|
|
private var isStarting: Bool = false
|
|
private var triggerOnlyTask: Task<Void, Never>?
|
|
|
|
// Tunables
|
|
// Silence threshold once we've captured user speech (post-trigger).
|
|
private let silenceWindow: TimeInterval = 2.0
|
|
// Silence threshold when we only heard the trigger but no post-trigger speech yet.
|
|
private let triggerOnlySilenceWindow: TimeInterval = 5.0
|
|
// Maximum capture duration from trigger until we force-send, to avoid runaway sessions.
|
|
private let captureHardStop: TimeInterval = 120.0
|
|
private let debounceAfterSend: TimeInterval = 0.35
|
|
// Voice activity detection parameters (RMS-based).
|
|
private let minSpeechRMS: Double = 1e-3
|
|
private let speechBoostFactor: Double = 6.0 // how far above noise floor we require to mark speech
|
|
private let preDetectSilenceWindow: TimeInterval = 1.0
|
|
private let triggerPauseWindow: TimeInterval = 0.55
|
|
|
|
/// Stops the active Speech pipeline without clearing the stored config, so we can restart cleanly.
|
|
private func haltRecognitionPipeline() {
|
|
// Bump generation first so any in-flight callbacks from the cancelled task get dropped.
|
|
self.recognitionGeneration &+= 1
|
|
self.recognitionTask?.cancel()
|
|
self.recognitionTask = nil
|
|
self.recognitionRequest?.endAudio()
|
|
self.recognitionRequest = nil
|
|
self.audioEngine?.inputNode.removeTap(onBus: 0)
|
|
self.audioEngine?.stop()
|
|
// Release the engine so we also release any audio session/resources when Voice Wake is idle.
|
|
self.audioEngine = nil
|
|
}
|
|
|
|
struct RuntimeConfig: Equatable {
|
|
let triggers: [String]
|
|
let micID: String?
|
|
let localeID: String?
|
|
let triggerChime: VoiceWakeChime
|
|
let sendChime: VoiceWakeChime
|
|
}
|
|
|
|
private struct RecognitionUpdate {
|
|
let transcript: String?
|
|
let segments: [WakeWordSegment]
|
|
let isFinal: Bool
|
|
let error: Error?
|
|
let generation: Int
|
|
}
|
|
|
|
func refresh(state: AppState) async {
|
|
let snapshot = await MainActor.run { () -> (Bool, RuntimeConfig) in
|
|
let enabled = state.swabbleEnabled
|
|
let config = RuntimeConfig(
|
|
triggers: sanitizeVoiceWakeTriggers(state.swabbleTriggerWords),
|
|
micID: state.voiceWakeMicID.isEmpty ? nil : state.voiceWakeMicID,
|
|
localeID: state.voiceWakeLocaleID.isEmpty ? nil : state.voiceWakeLocaleID,
|
|
triggerChime: state.voiceWakeTriggerChime,
|
|
sendChime: state.voiceWakeSendChime)
|
|
return (enabled, config)
|
|
}
|
|
|
|
guard voiceWakeSupported, snapshot.0 else {
|
|
self.stop()
|
|
return
|
|
}
|
|
|
|
guard PermissionManager.voiceWakePermissionsGranted() else {
|
|
self.logger.debug("voicewake runtime not starting: permissions missing")
|
|
self.stop()
|
|
return
|
|
}
|
|
|
|
let config = snapshot.1
|
|
|
|
if self.isStarting {
|
|
return
|
|
}
|
|
|
|
if self.scheduledRestartTask != nil, config == self.currentConfig, self.recognitionTask == nil {
|
|
return
|
|
}
|
|
|
|
if self.scheduledRestartTask != nil {
|
|
self.scheduledRestartTask?.cancel()
|
|
self.scheduledRestartTask = nil
|
|
}
|
|
|
|
if config == self.currentConfig, self.recognitionTask != nil {
|
|
return
|
|
}
|
|
|
|
self.stop()
|
|
await self.start(with: config)
|
|
}
|
|
|
|
private func start(with config: RuntimeConfig) async {
|
|
if self.isStarting {
|
|
return
|
|
}
|
|
self.isStarting = true
|
|
defer { self.isStarting = false }
|
|
do {
|
|
self.recognitionGeneration &+= 1
|
|
let generation = self.recognitionGeneration
|
|
|
|
self.configureSession(localeID: config.localeID)
|
|
|
|
guard let recognizer, recognizer.isAvailable else {
|
|
self.logger.error("voicewake runtime: speech recognizer unavailable")
|
|
return
|
|
}
|
|
|
|
self.recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
|
|
self.recognitionRequest?.shouldReportPartialResults = true
|
|
self.recognitionRequest?.taskHint = .dictation
|
|
guard let request = self.recognitionRequest else { return }
|
|
|
|
// Lazily create the engine here so app launch doesn't grab audio resources / trigger Bluetooth HFP.
|
|
if self.audioEngine == nil {
|
|
self.audioEngine = AVAudioEngine()
|
|
}
|
|
guard let audioEngine = self.audioEngine else { return }
|
|
|
|
let input = audioEngine.inputNode
|
|
let format = input.outputFormat(forBus: 0)
|
|
guard format.channelCount > 0, format.sampleRate > 0 else {
|
|
throw NSError(
|
|
domain: "VoiceWakeRuntime",
|
|
code: 1,
|
|
userInfo: [NSLocalizedDescriptionKey: "No audio input available"])
|
|
}
|
|
input.removeTap(onBus: 0)
|
|
input.installTap(onBus: 0, bufferSize: 2048, format: format) { [weak self, weak request] buffer, _ in
|
|
request?.append(buffer)
|
|
guard let rms = Self.rmsLevel(buffer: buffer) else { return }
|
|
Task.detached { [weak self] in
|
|
await self?.noteAudioLevel(rms: rms)
|
|
await self?.noteAudioTap(rms: rms)
|
|
}
|
|
}
|
|
|
|
audioEngine.prepare()
|
|
try audioEngine.start()
|
|
|
|
self.currentConfig = config
|
|
self.lastHeard = Date()
|
|
// Preserve any existing cooldownUntil so the debounce after send isn't wiped by a restart.
|
|
|
|
self.recognitionTask = recognizer.recognitionTask(with: request) { [weak self, generation] result, error in
|
|
guard let self else { return }
|
|
let transcript = result?.bestTranscription.formattedString
|
|
let segments = result.flatMap { result in
|
|
transcript
|
|
.map { WakeWordSpeechSegments.from(transcription: result.bestTranscription, transcript: $0) }
|
|
} ?? []
|
|
let isFinal = result?.isFinal ?? false
|
|
Task { await self.noteRecognitionCallback(transcript: transcript, isFinal: isFinal, error: error) }
|
|
let update = RecognitionUpdate(
|
|
transcript: transcript,
|
|
segments: segments,
|
|
isFinal: isFinal,
|
|
error: error,
|
|
generation: generation)
|
|
Task { await self.handleRecognition(update, config: config) }
|
|
}
|
|
|
|
let preferred = config.micID?.isEmpty == false ? config.micID! : "system-default"
|
|
self.logger.info(
|
|
"voicewake runtime input preferred=\(preferred, privacy: .public) " +
|
|
"\(AudioInputDeviceObserver.defaultInputDeviceSummary(), privacy: .public)")
|
|
self.logger.info("voicewake runtime started")
|
|
DiagnosticsFileLog.shared.log(category: "voicewake.runtime", event: "started", fields: [
|
|
"locale": config.localeID ?? "",
|
|
"micID": config.micID ?? "",
|
|
])
|
|
} catch {
|
|
self.logger.error("voicewake runtime failed to start: \(error.localizedDescription, privacy: .public)")
|
|
self.stop()
|
|
}
|
|
}
|
|
|
|
private func stop(dismissOverlay: Bool = true, cancelScheduledRestart: Bool = true) {
|
|
if cancelScheduledRestart {
|
|
self.scheduledRestartTask?.cancel()
|
|
self.scheduledRestartTask = nil
|
|
}
|
|
self.captureTask?.cancel()
|
|
self.captureTask = nil
|
|
self.isCapturing = false
|
|
self.capturedTranscript = ""
|
|
self.captureStartedAt = nil
|
|
self.triggerChimePlayed = false
|
|
self.lastTranscript = nil
|
|
self.lastTranscriptAt = nil
|
|
self.preDetectTask?.cancel()
|
|
self.preDetectTask = nil
|
|
self.triggerOnlyTask?.cancel()
|
|
self.triggerOnlyTask = nil
|
|
self.haltRecognitionPipeline()
|
|
self.recognizer = nil
|
|
self.currentConfig = nil
|
|
self.listeningState = .idle
|
|
self.activeTriggerEndTime = nil
|
|
self.logger.debug("voicewake runtime stopped")
|
|
DiagnosticsFileLog.shared.log(category: "voicewake.runtime", event: "stopped")
|
|
|
|
let token = self.overlayToken
|
|
self.overlayToken = nil
|
|
guard dismissOverlay else { return }
|
|
Task { @MainActor in
|
|
if let token {
|
|
VoiceSessionCoordinator.shared.dismiss(token: token, reason: .explicit, outcome: .empty)
|
|
} else {
|
|
VoiceWakeOverlayController.shared.dismiss()
|
|
}
|
|
}
|
|
}
|
|
|
|
private func configureSession(localeID: String?) {
|
|
let locale = localeID.flatMap { Locale(identifier: $0) } ?? Locale(identifier: Locale.current.identifier)
|
|
self.recognizer = SFSpeechRecognizer(locale: locale)
|
|
self.recognizer?.defaultTaskHint = .dictation
|
|
}
|
|
|
|
private func handleRecognition(_ update: RecognitionUpdate, config: RuntimeConfig) async {
|
|
if update.generation != self.recognitionGeneration {
|
|
return // stale callback from a superseded recognizer session
|
|
}
|
|
if let error = update.error {
|
|
self.logger.debug("voicewake recognition error: \(error.localizedDescription, privacy: .public)")
|
|
}
|
|
|
|
guard let transcript = update.transcript else { return }
|
|
|
|
let now = Date()
|
|
if !transcript.isEmpty {
|
|
self.lastHeard = now
|
|
if !self.isCapturing {
|
|
self.lastTranscript = transcript
|
|
self.lastTranscriptAt = now
|
|
}
|
|
if self.isCapturing {
|
|
self.maybeLogRecognition(
|
|
transcript: transcript,
|
|
segments: update.segments,
|
|
triggers: config.triggers,
|
|
isFinal: update.isFinal,
|
|
match: nil,
|
|
usedFallback: false,
|
|
capturing: true)
|
|
let trimmed = Self.commandAfterTrigger(
|
|
transcript: transcript,
|
|
segments: update.segments,
|
|
triggerEndTime: self.activeTriggerEndTime,
|
|
triggers: config.triggers)
|
|
self.capturedTranscript = trimmed
|
|
self.updateHeardBeyondTrigger(withTrimmed: trimmed)
|
|
if update.isFinal {
|
|
self.committedTranscript = trimmed
|
|
self.volatileTranscript = ""
|
|
} else {
|
|
self.volatileTranscript = Self.delta(after: self.committedTranscript, current: trimmed)
|
|
}
|
|
|
|
let attributed = Self.makeAttributed(
|
|
committed: self.committedTranscript,
|
|
volatile: self.volatileTranscript,
|
|
isFinal: update.isFinal)
|
|
let snapshot = self.committedTranscript + self.volatileTranscript
|
|
if let token = self.overlayToken {
|
|
await MainActor.run {
|
|
VoiceSessionCoordinator.shared.updatePartial(
|
|
token: token,
|
|
text: snapshot,
|
|
attributed: attributed)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if self.isCapturing { return }
|
|
|
|
let gateConfig = WakeWordGateConfig(triggers: config.triggers)
|
|
var usedFallback = false
|
|
var match = WakeWordGate.match(transcript: transcript, segments: update.segments, config: gateConfig)
|
|
if match == nil, update.isFinal {
|
|
match = self.textOnlyFallbackMatch(
|
|
transcript: transcript,
|
|
triggers: config.triggers,
|
|
config: gateConfig)
|
|
usedFallback = match != nil
|
|
}
|
|
self.maybeLogRecognition(
|
|
transcript: transcript,
|
|
segments: update.segments,
|
|
triggers: config.triggers,
|
|
isFinal: update.isFinal,
|
|
match: match,
|
|
usedFallback: usedFallback,
|
|
capturing: false)
|
|
|
|
if let match {
|
|
if let cooldown = cooldownUntil, now < cooldown {
|
|
return
|
|
}
|
|
if usedFallback {
|
|
self.logger.info("voicewake runtime detected (text-only fallback) len=\(match.command.count)")
|
|
} else {
|
|
self.logger.info("voicewake runtime detected len=\(match.command.count)")
|
|
}
|
|
await self.beginCapture(command: match.command, triggerEndTime: match.triggerEndTime, config: config)
|
|
} else if !transcript.isEmpty, update.error == nil {
|
|
if self.isTriggerOnly(transcript: transcript, triggers: config.triggers) {
|
|
self.preDetectTask?.cancel()
|
|
self.preDetectTask = nil
|
|
self.scheduleTriggerOnlyPauseCheck(triggers: config.triggers, config: config)
|
|
} else {
|
|
self.triggerOnlyTask?.cancel()
|
|
self.triggerOnlyTask = nil
|
|
self.schedulePreDetectSilenceCheck(
|
|
triggers: config.triggers,
|
|
gateConfig: gateConfig,
|
|
config: config)
|
|
}
|
|
}
|
|
}
|
|
|
|
private func maybeLogRecognition(
|
|
transcript: String,
|
|
segments: [WakeWordSegment],
|
|
triggers: [String],
|
|
isFinal: Bool,
|
|
match: WakeWordGateMatch?,
|
|
usedFallback: Bool,
|
|
capturing: Bool)
|
|
{
|
|
guard !transcript.isEmpty else { return }
|
|
let level = self.logger.logLevel
|
|
guard level == .debug || level == .trace else { return }
|
|
if transcript == self.lastLoggedText, !isFinal {
|
|
if let last = self.lastLoggedAt, Date().timeIntervalSince(last) < 0.25 {
|
|
return
|
|
}
|
|
}
|
|
self.lastLoggedText = transcript
|
|
self.lastLoggedAt = Date()
|
|
|
|
let textOnly = WakeWordGate.matchesTextOnly(text: transcript, triggers: triggers)
|
|
let timingCount = segments.count(where: { $0.start > 0 || $0.duration > 0 })
|
|
let matchSummary = match.map {
|
|
"match=true gap=\(String(format: "%.2f", $0.postGap))s cmdLen=\($0.command.count)"
|
|
} ?? "match=false"
|
|
let segmentSummary = segments.map { seg in
|
|
let start = String(format: "%.2f", seg.start)
|
|
let end = String(format: "%.2f", seg.end)
|
|
return "\(seg.text)@\(start)-\(end)"
|
|
}.joined(separator: ", ")
|
|
|
|
self.logger.debug(
|
|
"voicewake runtime transcript='\(transcript, privacy: .private)' textOnly=\(textOnly) " +
|
|
"isFinal=\(isFinal) timing=\(timingCount)/\(segments.count) " +
|
|
"capturing=\(capturing) fallback=\(usedFallback) " +
|
|
"\(matchSummary) segments=[\(segmentSummary, privacy: .private)]")
|
|
}
|
|
|
|
private func noteAudioTap(rms: Double) {
|
|
let now = Date()
|
|
if let last = self.lastTapLogAt, now.timeIntervalSince(last) < 1.0 {
|
|
return
|
|
}
|
|
self.lastTapLogAt = now
|
|
let db = 20 * log10(max(rms, 1e-7))
|
|
self.logger.debug(
|
|
"voicewake runtime audio tap rms=\(String(format: "%.6f", rms)) " +
|
|
"db=\(String(format: "%.1f", db)) capturing=\(self.isCapturing)")
|
|
}
|
|
|
|
private func noteRecognitionCallback(transcript: String?, isFinal: Bool, error: Error?) {
|
|
guard transcript?.isEmpty ?? true else { return }
|
|
let now = Date()
|
|
if let last = self.lastCallbackLogAt, now.timeIntervalSince(last) < 1.0 {
|
|
return
|
|
}
|
|
self.lastCallbackLogAt = now
|
|
let errorSummary = error?.localizedDescription ?? "none"
|
|
self.logger.debug(
|
|
"voicewake runtime callback empty transcript isFinal=\(isFinal) error=\(errorSummary, privacy: .public)")
|
|
}
|
|
|
|
private func scheduleTriggerOnlyPauseCheck(triggers: [String], config: RuntimeConfig) {
|
|
self.triggerOnlyTask?.cancel()
|
|
let lastSeenAt = self.lastTranscriptAt
|
|
let lastText = self.lastTranscript
|
|
let windowNanos = UInt64(self.triggerPauseWindow * 1_000_000_000)
|
|
self.triggerOnlyTask = Task { [weak self, lastSeenAt, lastText] in
|
|
try? await Task.sleep(nanoseconds: windowNanos)
|
|
guard let self else { return }
|
|
await self.triggerOnlyPauseCheck(
|
|
lastSeenAt: lastSeenAt,
|
|
lastText: lastText,
|
|
triggers: triggers,
|
|
config: config)
|
|
}
|
|
}
|
|
|
|
private func schedulePreDetectSilenceCheck(
|
|
triggers: [String],
|
|
gateConfig: WakeWordGateConfig,
|
|
config: RuntimeConfig)
|
|
{
|
|
self.preDetectTask?.cancel()
|
|
let lastSeenAt = self.lastTranscriptAt
|
|
let lastText = self.lastTranscript
|
|
let windowNanos = UInt64(self.preDetectSilenceWindow * 1_000_000_000)
|
|
self.preDetectTask = Task { [weak self, lastSeenAt, lastText] in
|
|
try? await Task.sleep(nanoseconds: windowNanos)
|
|
guard let self else { return }
|
|
await self.preDetectSilenceCheck(
|
|
lastSeenAt: lastSeenAt,
|
|
lastText: lastText,
|
|
triggers: triggers,
|
|
gateConfig: gateConfig,
|
|
config: config)
|
|
}
|
|
}
|
|
|
|
private func triggerOnlyPauseCheck(
|
|
lastSeenAt: Date?,
|
|
lastText: String?,
|
|
triggers: [String],
|
|
config: RuntimeConfig) async
|
|
{
|
|
guard !Task.isCancelled else { return }
|
|
guard !self.isCapturing else { return }
|
|
guard let lastSeenAt, let lastText else { return }
|
|
guard self.lastTranscriptAt == lastSeenAt, self.lastTranscript == lastText else { return }
|
|
guard self.isTriggerOnly(transcript: lastText, triggers: triggers) else { return }
|
|
if let cooldown = self.cooldownUntil, Date() < cooldown {
|
|
return
|
|
}
|
|
self.logger.info("voicewake runtime detected (trigger-only pause)")
|
|
await self.beginCapture(command: "", triggerEndTime: nil, config: config)
|
|
}
|
|
|
|
private func textOnlyFallbackMatch(
|
|
transcript: String,
|
|
triggers: [String],
|
|
config: WakeWordGateConfig) -> WakeWordGateMatch?
|
|
{
|
|
guard let command = VoiceWakeTextUtils.textOnlyCommand(
|
|
transcript: transcript,
|
|
triggers: triggers,
|
|
minCommandLength: config.minCommandLength,
|
|
trimWake: Self.trimmedAfterTrigger)
|
|
else { return nil }
|
|
return WakeWordGateMatch(triggerEndTime: 0, postGap: 0, command: command)
|
|
}
|
|
|
|
private func isTriggerOnly(transcript: String, triggers: [String]) -> Bool {
|
|
guard WakeWordGate.matchesTextOnly(text: transcript, triggers: triggers) else { return false }
|
|
guard VoiceWakeTextUtils.startsWithTrigger(transcript: transcript, triggers: triggers) else { return false }
|
|
return Self.trimmedAfterTrigger(transcript, triggers: triggers).isEmpty
|
|
}
|
|
|
|
private func preDetectSilenceCheck(
|
|
lastSeenAt: Date?,
|
|
lastText: String?,
|
|
triggers: [String],
|
|
gateConfig: WakeWordGateConfig,
|
|
config: RuntimeConfig) async
|
|
{
|
|
guard !Task.isCancelled else { return }
|
|
guard !self.isCapturing else { return }
|
|
guard let lastSeenAt, let lastText else { return }
|
|
guard self.lastTranscriptAt == lastSeenAt, self.lastTranscript == lastText else { return }
|
|
guard let match = self.textOnlyFallbackMatch(
|
|
transcript: lastText,
|
|
triggers: triggers,
|
|
config: gateConfig)
|
|
else { return }
|
|
if let cooldown = self.cooldownUntil, Date() < cooldown {
|
|
return
|
|
}
|
|
self.logger.info("voicewake runtime detected (silence fallback) len=\(match.command.count)")
|
|
await self.beginCapture(
|
|
command: match.command,
|
|
triggerEndTime: match.triggerEndTime,
|
|
config: config)
|
|
}
|
|
|
|
private func beginCapture(command: String, triggerEndTime: TimeInterval?, config: RuntimeConfig) async {
|
|
self.listeningState = .voiceWake
|
|
self.isCapturing = true
|
|
DiagnosticsFileLog.shared.log(category: "voicewake.runtime", event: "beginCapture")
|
|
self.capturedTranscript = command
|
|
self.committedTranscript = ""
|
|
self.volatileTranscript = command
|
|
self.captureStartedAt = Date()
|
|
self.cooldownUntil = nil
|
|
self.heardBeyondTrigger = !command.isEmpty
|
|
self.triggerChimePlayed = false
|
|
self.activeTriggerEndTime = triggerEndTime
|
|
self.preDetectTask?.cancel()
|
|
self.preDetectTask = nil
|
|
self.triggerOnlyTask?.cancel()
|
|
self.triggerOnlyTask = nil
|
|
|
|
if config.triggerChime != .none, !self.triggerChimePlayed {
|
|
self.triggerChimePlayed = true
|
|
await MainActor.run { VoiceWakeChimePlayer.play(config.triggerChime, reason: "voicewake.trigger") }
|
|
}
|
|
|
|
let snapshot = self.committedTranscript + self.volatileTranscript
|
|
let attributed = Self.makeAttributed(
|
|
committed: self.committedTranscript,
|
|
volatile: self.volatileTranscript,
|
|
isFinal: false)
|
|
self.overlayToken = await MainActor.run {
|
|
VoiceSessionCoordinator.shared.startSession(
|
|
source: .wakeWord,
|
|
text: snapshot,
|
|
attributed: attributed,
|
|
forwardEnabled: true)
|
|
}
|
|
|
|
// Keep the "ears" boosted for the capture window so the status icon animates while recording.
|
|
await MainActor.run { AppStateStore.shared.triggerVoiceEars(ttl: nil) }
|
|
|
|
self.captureTask?.cancel()
|
|
self.captureTask = Task { [weak self] in
|
|
guard let self else { return }
|
|
await self.monitorCapture(config: config)
|
|
}
|
|
}
|
|
|
|
private func monitorCapture(config: RuntimeConfig) async {
|
|
let start = self.captureStartedAt ?? Date()
|
|
let hardStop = start.addingTimeInterval(self.captureHardStop)
|
|
|
|
while self.isCapturing {
|
|
let now = Date()
|
|
if now >= hardStop {
|
|
// Hard-stop after a maximum duration so we never leave the recognizer pinned open.
|
|
await self.finalizeCapture(config: config)
|
|
return
|
|
}
|
|
|
|
let silenceThreshold = self.heardBeyondTrigger ? self.silenceWindow : self.triggerOnlySilenceWindow
|
|
if let last = self.lastHeard, now.timeIntervalSince(last) >= silenceThreshold {
|
|
await self.finalizeCapture(config: config)
|
|
return
|
|
}
|
|
|
|
try? await Task.sleep(nanoseconds: 200_000_000)
|
|
}
|
|
}
|
|
|
|
private func finalizeCapture(config: RuntimeConfig) async {
|
|
guard self.isCapturing else { return }
|
|
self.isCapturing = false
|
|
// Disarm trigger matching immediately (before halting recognition) to avoid double-trigger
|
|
// races from late callbacks that arrive after isCapturing is cleared.
|
|
self.cooldownUntil = Date().addingTimeInterval(self.debounceAfterSend)
|
|
self.captureTask?.cancel()
|
|
self.captureTask = nil
|
|
|
|
let finalTranscript = self.capturedTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
|
|
DiagnosticsFileLog.shared.log(category: "voicewake.runtime", event: "finalizeCapture", fields: [
|
|
"finalLen": "\(finalTranscript.count)",
|
|
])
|
|
// Stop further recognition events so we don't retrigger immediately with buffered audio.
|
|
self.haltRecognitionPipeline()
|
|
self.capturedTranscript = ""
|
|
self.captureStartedAt = nil
|
|
self.lastHeard = nil
|
|
self.heardBeyondTrigger = false
|
|
self.triggerChimePlayed = false
|
|
self.activeTriggerEndTime = nil
|
|
self.lastTranscript = nil
|
|
self.lastTranscriptAt = nil
|
|
self.preDetectTask?.cancel()
|
|
self.preDetectTask = nil
|
|
self.triggerOnlyTask?.cancel()
|
|
self.triggerOnlyTask = nil
|
|
|
|
await MainActor.run { AppStateStore.shared.stopVoiceEars() }
|
|
if let token = self.overlayToken {
|
|
await MainActor.run { VoiceSessionCoordinator.shared.updateLevel(token: token, 0) }
|
|
}
|
|
|
|
let delay: TimeInterval = 0.0
|
|
let sendChime = finalTranscript.isEmpty ? .none : config.sendChime
|
|
if let token = self.overlayToken {
|
|
await MainActor.run {
|
|
VoiceSessionCoordinator.shared.finalize(
|
|
token: token,
|
|
text: finalTranscript,
|
|
sendChime: sendChime,
|
|
autoSendAfter: delay)
|
|
}
|
|
} else if !finalTranscript.isEmpty {
|
|
if sendChime != .none {
|
|
await MainActor.run { VoiceWakeChimePlayer.play(sendChime, reason: "voicewake.send") }
|
|
}
|
|
Task.detached {
|
|
await VoiceWakeForwarder.forward(transcript: finalTranscript)
|
|
}
|
|
}
|
|
self.overlayToken = nil
|
|
self.scheduleRestartRecognizer()
|
|
}
|
|
|
|
// MARK: - Audio level handling
|
|
|
|
private func noteAudioLevel(rms: Double) {
|
|
guard self.isCapturing else { return }
|
|
|
|
// Update adaptive noise floor: faster when lower energy (quiet), slower when loud.
|
|
let alpha: Double = rms < self.noiseFloorRMS ? 0.08 : 0.01
|
|
self.noiseFloorRMS = max(1e-7, self.noiseFloorRMS + (rms - self.noiseFloorRMS) * alpha)
|
|
|
|
let threshold = max(self.minSpeechRMS, self.noiseFloorRMS * self.speechBoostFactor)
|
|
if rms >= threshold {
|
|
self.lastHeard = Date()
|
|
}
|
|
|
|
// Normalize against the adaptive threshold so the UI meter stays roughly 0...1 across devices.
|
|
let clamped = min(1.0, max(0.0, rms / max(self.minSpeechRMS, threshold)))
|
|
if let token = self.overlayToken {
|
|
Task { @MainActor in
|
|
VoiceSessionCoordinator.shared.updateLevel(token: token, clamped)
|
|
}
|
|
}
|
|
}
|
|
|
|
private static func rmsLevel(buffer: AVAudioPCMBuffer) -> Double? {
|
|
guard let channelData = buffer.floatChannelData?.pointee else { return nil }
|
|
let frameCount = Int(buffer.frameLength)
|
|
guard frameCount > 0 else { return nil }
|
|
var sum: Double = 0
|
|
for i in 0..<frameCount {
|
|
let sample = Double(channelData[i])
|
|
sum += sample * sample
|
|
}
|
|
return sqrt(sum / Double(frameCount))
|
|
}
|
|
|
|
private func restartRecognizer() {
|
|
// Restart the recognizer so we listen for the next trigger with a clean buffer.
|
|
let current = self.currentConfig
|
|
self.stop(dismissOverlay: false, cancelScheduledRestart: false)
|
|
if let current {
|
|
Task { await self.start(with: current) }
|
|
}
|
|
}
|
|
|
|
private func restartRecognizerIfIdleAndOverlayHidden() async {
|
|
if self.isCapturing { return }
|
|
self.restartRecognizer()
|
|
}
|
|
|
|
private func scheduleRestartRecognizer(delay: TimeInterval = 0.7) {
|
|
self.scheduledRestartTask?.cancel()
|
|
self.scheduledRestartTask = Task { [weak self] in
|
|
let nanos = UInt64(max(0, delay) * 1_000_000_000)
|
|
try? await Task.sleep(nanoseconds: nanos)
|
|
guard let self else { return }
|
|
await self.consumeScheduledRestart()
|
|
await self.restartRecognizerIfIdleAndOverlayHidden()
|
|
}
|
|
}
|
|
|
|
private func consumeScheduledRestart() {
|
|
self.scheduledRestartTask = nil
|
|
}
|
|
|
|
func applyPushToTalkCooldown() {
|
|
self.cooldownUntil = Date().addingTimeInterval(self.debounceAfterSend)
|
|
}
|
|
|
|
func pauseForPushToTalk() {
|
|
self.listeningState = .pushToTalk
|
|
self.stop(dismissOverlay: false)
|
|
}
|
|
|
|
private func updateHeardBeyondTrigger(withTrimmed trimmed: String) {
|
|
if !self.heardBeyondTrigger, !trimmed.isEmpty {
|
|
self.heardBeyondTrigger = true
|
|
}
|
|
}
|
|
|
|
private static func trimmedAfterTrigger(_ text: String, triggers: [String]) -> String {
|
|
let lower = text.lowercased()
|
|
for trigger in triggers {
|
|
let token = trigger.lowercased().trimmingCharacters(in: .whitespacesAndNewlines)
|
|
guard !token.isEmpty, let range = lower.range(of: token) else { continue }
|
|
let after = range.upperBound
|
|
// Guard against index out of bounds when the trigger is at the end of the string
|
|
guard after <= text.endIndex else { continue }
|
|
let trimmed = text[after...].trimmingCharacters(in: .whitespacesAndNewlines)
|
|
return String(trimmed)
|
|
}
|
|
return text
|
|
}
|
|
|
|
private static func commandAfterTrigger(
|
|
transcript: String,
|
|
segments: [WakeWordSegment],
|
|
triggerEndTime: TimeInterval?,
|
|
triggers: [String]) -> String
|
|
{
|
|
guard let triggerEndTime else {
|
|
return self.trimmedAfterTrigger(transcript, triggers: triggers)
|
|
}
|
|
let trimmed = WakeWordGate.commandText(
|
|
transcript: transcript,
|
|
segments: segments,
|
|
triggerEndTime: triggerEndTime)
|
|
return trimmed.isEmpty ? self.trimmedAfterTrigger(transcript, triggers: triggers) : trimmed
|
|
}
|
|
|
|
#if DEBUG
|
|
static func _testTrimmedAfterTrigger(_ text: String, triggers: [String]) -> String {
|
|
self.trimmedAfterTrigger(text, triggers: triggers)
|
|
}
|
|
|
|
static func _testHasContentAfterTrigger(_ text: String, triggers: [String]) -> Bool {
|
|
!self.trimmedAfterTrigger(text, triggers: triggers).isEmpty
|
|
}
|
|
|
|
static func _testAttributedColor(isFinal: Bool) -> NSColor {
|
|
self.makeAttributed(committed: "sample", volatile: "", isFinal: isFinal)
|
|
.attribute(.foregroundColor, at: 0, effectiveRange: nil) as? NSColor ?? .clear
|
|
}
|
|
|
|
#endif
|
|
|
|
private static func delta(after committed: String, current: String) -> String {
|
|
if current.hasPrefix(committed) {
|
|
let start = current.index(current.startIndex, offsetBy: committed.count)
|
|
return String(current[start...])
|
|
}
|
|
return current
|
|
}
|
|
|
|
private static func makeAttributed(committed: String, volatile: String, isFinal: Bool) -> NSAttributedString {
|
|
let full = NSMutableAttributedString()
|
|
let committedAttr: [NSAttributedString.Key: Any] = [
|
|
.foregroundColor: NSColor.labelColor,
|
|
.font: NSFont.systemFont(ofSize: 13, weight: .regular),
|
|
]
|
|
full.append(NSAttributedString(string: committed, attributes: committedAttr))
|
|
let volatileColor: NSColor = isFinal ? .labelColor : NSColor.tertiaryLabelColor
|
|
let volatileAttr: [NSAttributedString.Key: Any] = [
|
|
.foregroundColor: volatileColor,
|
|
.font: NSFont.systemFont(ofSize: 13, weight: .regular),
|
|
]
|
|
full.append(NSAttributedString(string: volatile, attributes: volatileAttr))
|
|
return full
|
|
}
|
|
}
|