diff --git a/src/media-understanding/apply.test.ts b/src/media-understanding/apply.test.ts index 7a4d68136..d17db473f 100644 --- a/src/media-understanding/apply.test.ts +++ b/src/media-understanding/apply.test.ts @@ -547,6 +547,73 @@ describe("applyMediaUnderstanding", () => { expect(ctx.Body).toContain("a\tb\tc"); }); + it("does not treat OGG audio as text even with ASCII-heavy headers (issue #1989)", async () => { + const { applyMediaUnderstanding } = await loadApply(); + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-media-")); + const oggPath = path.join(dir, "voice.ogg"); + // Simulate OGG file with "OggS" magic bytes followed by ASCII-heavy content + // that would pass looksLikeUtf8Text() if not for magic byte detection + // Real OGG Opus files (Telegram voice) have similar structure + const oggMagic = Buffer.from([0x4f, 0x67, 0x67, 0x53]); // "OggS" + const fakeMetadata = Buffer.from("ENCODER=test\tVERSION=1\nTITLE=hello\t"); + const oggBuffer = Buffer.concat([oggMagic, fakeMetadata]); + await fs.writeFile(oggPath, oggBuffer); + + const ctx: MsgContext = { + Body: "", + MediaPath: oggPath, + MediaType: "audio/ogg", + }; + const cfg: MoltbotConfig = { + tools: { + media: { + audio: { enabled: false }, + image: { enabled: false }, + video: { enabled: false }, + }, + }, + }; + + const result = await applyMediaUnderstanding({ ctx, cfg }); + + // OGG should NOT be treated as a text file + expect(result.appliedFile).toBe(false); + expect(ctx.Body).not.toContain(" { + const { applyMediaUnderstanding } = await loadApply(); + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-media-")); + const mp3Path = path.join(dir, "song.mp3"); + // Simulate MP3 file with ID3v2 tag followed by ASCII-heavy metadata + // ID3 tags can contain lyrics, comments, and other text that passes looksLikeUtf8Text() + const id3Magic = Buffer.from([0x49, 0x44, 0x33]); // "ID3" + const fakeMetadata = Buffer.from("TIT2=Song Title\tTPE1=Artist Name\nTALB=Album\t"); + const 
mp3Buffer = Buffer.concat([id3Magic, fakeMetadata]); + await fs.writeFile(mp3Path, mp3Buffer); + + const ctx: MsgContext = { + Body: "", + MediaPath: mp3Path, + MediaType: "audio/mpeg", + }; + const cfg: MoltbotConfig = { + tools: { + media: { + audio: { enabled: false }, + image: { enabled: false }, + video: { enabled: false }, + }, + }, + }; + + const result = await applyMediaUnderstanding({ ctx, cfg }); + + // MP3 with ID3 should NOT be treated as a text file + expect(result.appliedFile).toBe(false); + expect(ctx.Body).not.toContain(" { const { applyMediaUnderstanding } = await loadApply(); const dir = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-media-")); diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts index 7c2a18006..bcb4e40f8 100644 --- a/src/media-understanding/apply.ts +++ b/src/media-understanding/apply.ts @@ -162,6 +162,48 @@ function looksLikeUtf8Text(buffer?: Buffer): boolean { return printable / total > 0.85; } +/** + * Detects binary audio/video formats by magic bytes (file signatures). + * + * OGG files (used by Telegram voice messages as OGG Opus) have ASCII-heavy + * headers that can pass looksLikeUtf8Text() due to Vorbis/Opus comment metadata + * containing printable strings. This causes them to be misidentified as text. + * + * Magic bytes provide authoritative format detection independent of MIME type + * or file extension, following established file format specifications. 
/**
 * Leading byte signatures ("magic bytes") of binary audio containers that must
 * never be classified as text, even when their headers are mostly printable.
 *
 * - "OggS": OGG capture pattern at byte offset 0 (RFC 3533, Section 6).
 * - "ID3":  ID3v2 tag identifier at the start of an MP3 file
 *           (id3.org ID3v2.4.0 structure, Section 3.1).
 *
 * Each entry is matched as a plain prefix; add a new row to recognise
 * another format.
 */
const BINARY_AUDIO_SIGNATURES: ReadonlyArray<ReadonlyArray<number>> = [
  [0x4f, 0x67, 0x67, 0x53], // "OggS"
  [0x49, 0x44, 0x33], // "ID3"
];

/**
 * Detects binary audio formats by their magic bytes (file signatures).
 *
 * OGG and ID3-tagged MP3 files carry ASCII-heavy metadata (Vorbis/Opus
 * comments, ID3 text frames) that can exceed the printable-character ratio
 * used by looksLikeUtf8Text(), which would cause them to be misidentified as
 * text. Checking the signature first is independent of MIME type and file
 * extension.
 *
 * The length requirement is evaluated per signature, so a buffer holding only
 * the three bytes "ID3" is still recognised; a single up-front
 * `length < 4` guard would wrongly reject it.
 *
 * References:
 * - OGG container: RFC 3533 (https://datatracker.ietf.org/doc/html/rfc3533)
 * - MP3 ID3v2: https://id3.org/id3v2.4.0-structure
 *
 * @see https://github.com/moltbot/moltbot/issues/1989
 *
 * @param buffer - Leading bytes of the file; may be undefined or empty.
 * @returns true when the buffer starts with a known binary audio signature,
 *          false otherwise (including for a missing or too-short buffer).
 */
function hasBinaryAudioMagic(buffer?: Buffer): boolean {
  if (!buffer) return false;
  return BINARY_AUDIO_SIGNATURES.some(
    (signature) =>
      // Only compare when the buffer is long enough to hold this signature.
      buffer.length >= signature.length &&
      signature.every((byte, offset) => buffer[offset] === byte),
  );
}