From 67bbacb3b79f603d5b662e1b0b080ce212ac96a2 Mon Sep 17 00:00:00 2001 From: nullrunner Date: Thu, 29 Jan 2026 23:45:40 +0100 Subject: [PATCH 1/2] fix(media): skip audio files in extractFileBlocks text extraction Audio files (especially OGG/Opus from Telegram voice messages) were being misidentified as text by looksLikeUtf8Text() because OGG headers contain >85% printable ASCII. This caused guessDelimitedMime() to classify them as text/tab-separated-values, injecting raw binary into the model context. Add audio to the skip list alongside image and video in extractFileBlocks() so audio attachments are routed to the transcription pipeline instead of being treated as text files. Fixes #1989 --- src/media-understanding/apply.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts index 7c2a18006..1bb041cc6 100644 --- a/src/media-understanding/apply.ts +++ b/src/media-understanding/apply.ts @@ -216,7 +216,7 @@ async function extractFileBlocks(params: { } const forcedTextMime = resolveTextMimeFromName(attachment.path ?? attachment.url ?? ""); const kind = forcedTextMime ? "document" : resolveAttachmentKind(attachment); - if (!forcedTextMime && (kind === "image" || kind === "video")) { + if (!forcedTextMime && (kind === "image" || kind === "audio" || kind === "video")) { continue; } if (!limits.allowUrl && attachment.url && !attachment.path) { From 586cf569fd4f43e7334f0ef32d4b1f45c7d22368 Mon Sep 17 00:00:00 2001 From: nullrunner Date: Fri, 30 Jan 2026 00:01:03 +0100 Subject: [PATCH 2/2] fix(media): add hasBinaryAudioMagic + audio skip in extractFileBlocks Combines two complementary defenses against audio misidentification: 1. Skip audio kind early in extractFileBlocks() to avoid unnecessary buffer reads (original fix from #4235) 2. 
Add hasBinaryAudioMagic() to detect OGG and ID3v2-tagged MP3 files by magic bytes, preventing looksLikeUtf8Text() false positives (approach from #3904) Together these provide defense-in-depth: the kind check handles the common case efficiently, while magic bytes catch edge cases where attachment kind resolution fails. Fixes #1989 --- src/media-understanding/apply.ts | 48 +++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts index 1bb041cc6..14e2a6905 100644 --- a/src/media-understanding/apply.ts +++ b/src/media-understanding/apply.ts @@ -162,6 +162,48 @@ function looksLikeUtf8Text(buffer?: Buffer): boolean { return printable / total > 0.85; } +/** + * Detects binary audio/video formats by magic bytes (file signatures). + * + * OGG files (used by Telegram voice messages as OGG Opus) have ASCII-heavy + * headers that can pass looksLikeUtf8Text() due to Vorbis/Opus comment metadata + * containing printable strings. This causes them to be misidentified as text. + * + * Magic bytes provide authoritative format detection independent of MIME type + * or file extension, following established file format specifications. + * + * References: + * - OGG container: RFC 3533 (https://datatracker.ietf.org/doc/html/rfc3533) + * Section 6: "OggS" capture pattern at byte offset 0 + * - MP3 ID3v2: id3.org spec (https://id3.org/id3v2.4.0-structure) + * Section 3.1: "ID3" identifier at file start + * + * @see https://github.com/moltbot/moltbot/issues/1989 + */ +function hasBinaryAudioMagic(buffer?: Buffer): boolean { + if (!buffer || buffer.length < 4) return false; + // OGG container format: "OggS" signature (RFC 3533 Section 6) + // Covers OGG Vorbis, OGG Opus (Telegram voice), OGG Theora, etc.
+ if ( + buffer[0] === 0x4f && // 'O' + buffer[1] === 0x67 && // 'g' + buffer[2] === 0x67 && // 'g' + buffer[3] === 0x53 // 'S' + ) { + return true; + } + // MP3 with ID3v2 tag: "ID3" signature (id3.org spec Section 3.1) + // ID3v2 tags can contain large amounts of ASCII text (lyrics, comments) + if ( + buffer[0] === 0x49 && // 'I' + buffer[1] === 0x44 && // 'D' + buffer[2] === 0x33 // '3' + ) { + return true; + } + return false; +} + function decodeTextSample(buffer?: Buffer): string { if (!buffer || buffer.length === 0) return ""; const sample = buffer.subarray(0, Math.min(buffer.length, 8192)); @@ -242,7 +284,11 @@ async function extractFileBlocks(params: { const forcedTextMimeResolved = forcedTextMime ?? resolveTextMimeFromName(nameHint ?? ""); const utf16Charset = resolveUtf16Charset(bufferResult?.buffer); const textSample = decodeTextSample(bufferResult?.buffer); - const textLike = Boolean(utf16Charset) || looksLikeUtf8Text(bufferResult?.buffer); + // Check if content looks like text, but exclude files with known binary audio magic bytes + // OGG files can pass looksLikeUtf8Text() due to ASCII-heavy headers (>85% printable) + const textLike = + (Boolean(utf16Charset) || looksLikeUtf8Text(bufferResult?.buffer)) && + !hasBinaryAudioMagic(bufferResult?.buffer); if (!forcedTextMimeResolved && kind === "audio" && !textLike) { continue; }