From 67bbacb3b79f603d5b662e1b0b080ce212ac96a2 Mon Sep 17 00:00:00 2001
From: nullrunner <Nicholas.mariani@hotmail.it>
Date: Thu, 29 Jan 2026 23:45:40 +0100
Subject: [PATCH 1/2] fix(media): skip audio files in extractFileBlocks text
 extraction

Audio files (especially OGG/Opus from Telegram voice messages) were being
misidentified as text by looksLikeUtf8Text() because OGG headers contain
>85% printable ASCII. This caused guessDelimitedMime() to classify them
as text/tab-separated-values, injecting raw binary into the model context.

Add audio to the skip list alongside image and video in
extractFileBlocks() so audio attachments are routed to the transcription
pipeline instead of being treated as text files.

Fixes #1989
---
 src/media-understanding/apply.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts
index 7c2a18006..1bb041cc6 100644
--- a/src/media-understanding/apply.ts
+++ b/src/media-understanding/apply.ts
@@ -216,7 +216,7 @@ async function extractFileBlocks(params: {
     }
     const forcedTextMime = resolveTextMimeFromName(attachment.path ?? attachment.url ?? "");
     const kind = forcedTextMime ? "document" : resolveAttachmentKind(attachment);
-    if (!forcedTextMime && (kind === "image" || kind === "video")) {
+    if (!forcedTextMime && (kind === "image" || kind === "audio" || kind === "video")) {
       continue;
     }
     if (!limits.allowUrl && attachment.url && !attachment.path) {

From 586cf569fd4f43e7334f0ef32d4b1f45c7d22368 Mon Sep 17 00:00:00 2001
From: nullrunner <Nicholas.mariani@hotmail.it>
Date: Fri, 30 Jan 2026 00:01:03 +0100
Subject: [PATCH 2/2] fix(media): add hasBinaryAudioMagic + audio skip in
 extractFileBlocks

Combines two complementary defenses against audio misidentification:

1. Skip audio kind early in extractFileBlocks() to avoid unnecessary
   buffer reads (original fix from #4235; applied in patch 1/2)
2. Add hasBinaryAudioMagic() to detect OGG and ID3v2-tagged MP3 by magic
   bytes, preventing looksLikeUtf8Text() false positives (approach from
   #3904; added by this patch)

Together these provide defense-in-depth: the kind check handles the
common case efficiently, while magic bytes catch edge cases where
attachment kind resolution fails.

Fixes #1989
---
 src/media-understanding/apply.ts | 48 +++++++++++++++++++++++++++++++-
 1 file changed, 47 insertions(+), 1 deletion(-)

diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts
index 1bb041cc6..14e2a6905 100644
--- a/src/media-understanding/apply.ts
+++ b/src/media-understanding/apply.ts
@@ -162,6 +162,48 @@ function looksLikeUtf8Text(buffer?: Buffer): boolean {
   return printable / total > 0.85;
 }
 
+/**
+ * Reports whether a buffer starts with a known binary audio file signature.
+ *
+ * OGG files (Telegram voice messages are OGG Opus) carry printable Vorbis/Opus
+ * comment metadata, so their headers can pass looksLikeUtf8Text(). The caller
+ * uses this check to veto that verdict. Only "OggS" and "ID3" prefixes are
+ * recognized; audio without either (e.g. MP3 with no ID3v2 tag) is not detected.
+ *
+ * References:
+ * - OGG container: RFC 3533 (https://datatracker.ietf.org/doc/html/rfc3533)
+ *   Section 6: "OggS" capture pattern at byte offset 0
+ * - MP3 ID3v2: id3.org spec (https://id3.org/id3v2.4.0-structure)
+ *   Section 3.1: "ID3" identifier at file start
+ *
+ * @param buffer - Attachment bytes to inspect; may be undefined.
+ * @returns true for an "OggS" or "ID3" prefix; false otherwise or under 4 bytes.
+ * @see https://github.com/moltbot/moltbot/issues/1989
+ */
+function hasBinaryAudioMagic(buffer?: Buffer): boolean {
+  if (!buffer || buffer.length < 4) return false;
+  // OGG container format: "OggS" signature (RFC 3533 Section 6)
+  // Covers OGG Vorbis, OGG Opus (Telegram voice), OGG Theora, etc.
+  if (
+    buffer[0] === 0x4f && // 'O'
+    buffer[1] === 0x67 && // 'g'
+    buffer[2] === 0x67 && // 'g'
+    buffer[3] === 0x53 // 'S'
+  ) {
+    return true;
+  }
+  // MP3 with ID3v2 tag: "ID3" signature (id3.org spec Section 3.1)
+  // ID3v2 tags can contain large amounts of ASCII text (lyrics, comments)
+  if (
+    buffer[0] === 0x49 && // 'I'
+    buffer[1] === 0x44 && // 'D'
+    buffer[2] === 0x33 // '3'
+  ) {
+    return true;
+  }
+  return false;
+}
+
 function decodeTextSample(buffer?: Buffer): string {
   if (!buffer || buffer.length === 0) return "";
   const sample = buffer.subarray(0, Math.min(buffer.length, 8192));
@@ -242,7 +284,11 @@ async function extractFileBlocks(params: {
     const forcedTextMimeResolved = forcedTextMime ?? resolveTextMimeFromName(nameHint ?? "");
     const utf16Charset = resolveUtf16Charset(bufferResult?.buffer);
     const textSample = decodeTextSample(bufferResult?.buffer);
-    const textLike = Boolean(utf16Charset) || looksLikeUtf8Text(bufferResult?.buffer);
+    // Check if content looks like text, but exclude files with known binary audio magic bytes
+    // OGG files can pass looksLikeUtf8Text() due to ASCII-heavy headers (>85% printable)
+    const textLike =
+      (Boolean(utf16Charset) || looksLikeUtf8Text(bufferResult?.buffer)) &&
+      !hasBinaryAudioMagic(bufferResult?.buffer);
     if (!forcedTextMimeResolved && kind === "audio" && !textLike) {
       continue;
     }