Compare commits

...

3 Commits

Author SHA1 Message Date
Peter Steinberger
f6e8a76aab test: update whatsapp reply quote assertions 2025-12-23 02:27:46 +01:00
Peter Steinberger
a3c191006e fix: add whatsapp reply context 2025-12-23 02:26:11 +01:00
Peter Steinberger
dd35ed97b8 🤖 codex: add telegram reply context
# Conflicts:
#	src/telegram/bot.ts
2025-12-23 02:25:26 +01:00
10 changed files with 291 additions and 23 deletions

View File

@ -1,5 +1,10 @@
# Changelog
## Unreleased — 2025-12-23
### Fixes
- Telegram/WhatsApp: native replies now target the original inbound message; reply context is captured in `ReplyTo*` fields for templates. (Thanks @joshp123 for the PR and follow-up question.)
## 2.0.0-beta2 — 2025-12-21
Second beta focused on bundled gateway packaging, skills management, onboarding polish, and provider reliability.

View File

@ -10,6 +10,7 @@ Updated: 2025-12-07
Goal: make replies deterministic per channel while keeping one shared context for direct chats.
- **Surfaces** (channel labels): `whatsapp`, `webchat`, `telegram`, `voice`, etc. Add `Surface` to inbound `MsgContext` so templates/agents can log which channel a turn came from. Routing is fixed: replies go back to the origin surface; the model doesnt choose.
- **Reply context (optional):** inbound replies may include `ReplyToId`, `ReplyToBody`, and `ReplyToSender` so templates can surface the quoted context when needed.
- **Canonical direct session:** All direct chats collapse into the single `main` session by default (no config needed). Groups stay `group:<jid>`, so they remain isolated.
- **Session store:** Keys are resolved via `resolveSessionKey(scope, ctx, mainKey)`; the agent JSONL path lives under `~/.clawdis/sessions/<SessionId>.jsonl`.
- **WebChat:** Always attaches to `main`, loads the full session transcript so desktop reflects cross-surface history, and writes new turns back to the same session.

View File

@ -30,11 +30,11 @@ Status: ready for bot-mode use with grammY (long-polling by default; webhook sup
- Sees only messages sent after its added to a chat; no pre-history access.
- Cannot DM users first; they must initiate. Channels are receive-only unless the bot is an admin poster.
- File size caps follow Telegram Bot API (up to 2 GB for documents; smaller for some media types).
- Typing indicators (`sendChatAction`) supported; inline reply/threading supported where Telegram allows.
- Typing indicators (`sendChatAction`) supported; outbound replies are sent as native replies to the triggering message (threaded where Telegram allows).
## Planned implementation details
- Library: grammY is the only client for send + gateway (fetch fallback removed); grammY throttler is enabled by default to stay under Bot API limits.
- Inbound normalization: maps Bot API updates to `MsgContext` with `Surface: "telegram"`, `ChatType: direct|group`, `SenderName`, `MediaPath`/`MediaType` when attachments arrive, and `Timestamp`; groups require @bot mention by default.
- Inbound normalization: maps Bot API updates to `MsgContext` with `Surface: "telegram"`, `ChatType: direct|group`, `SenderName`, `MediaPath`/`MediaType` when attachments arrive, `Timestamp`, and reply-to metadata (`ReplyToId`, `ReplyToBody`, `ReplyToSender`) when the user replies; groups require @bot mention by default.
- Outbound: text and media (photo/video/audio/document) with optional caption; chunked to limits. Typing cue sent best-effort.
- Config: `TELEGRAM_BOT_TOKEN` env or `telegram.botToken` required; `telegram.requireMention`, `telegram.allowFrom`, `telegram.mediaMaxMb`, `telegram.proxy`, `telegram.webhookSecret`, `telegram.webhookUrl`, `telegram.webhookPath` supported.

View File

@ -3,6 +3,9 @@ export type MsgContext = {
From?: string;
To?: string;
MessageSid?: string;
ReplyToId?: string;
ReplyToBody?: string;
ReplyToSender?: string;
MediaPath?: string;
MediaUrl?: string;
MediaType?: string;

View File

@ -6,13 +6,16 @@ const useSpy = vi.fn();
const onSpy = vi.fn();
const stopSpy = vi.fn();
const sendChatActionSpy = vi.fn();
const sendMessageSpy = vi.fn(async () => ({ message_id: 77 }));
type ApiStub = {
config: { use: (arg: unknown) => void };
sendChatAction: typeof sendChatActionSpy;
sendMessage: typeof sendMessageSpy;
};
const apiStub: ApiStub = {
config: { use: useSpy },
sendChatAction: sendChatActionSpy,
sendMessage: sendMessageSpy,
};
vi.mock("grammy", () => ({
@ -107,4 +110,70 @@ describe("createTelegramBot", () => {
expect(sendChatActionSpy).toHaveBeenCalledWith(42, "typing");
});
it("includes reply-to context when a Telegram reply is received", async () => {
onSpy.mockReset();
sendMessageSpy.mockReset();
const replySpy = replyModule.__replySpy as unknown as ReturnType<
typeof vi.fn
>;
replySpy.mockReset();
createTelegramBot({ token: "tok" });
const handler = onSpy.mock.calls[0][1] as (
ctx: Record<string, unknown>,
) => Promise<void>;
await handler({
message: {
chat: { id: 7, type: "private" },
text: "Sure, see below",
date: 1736380800,
reply_to_message: {
message_id: 9001,
text: "Can you summarize this?",
from: { first_name: "Ada" },
},
},
me: { username: "clawdis_bot" },
getFile: async () => ({ download: async () => new Uint8Array() }),
});
expect(replySpy).toHaveBeenCalledTimes(1);
const payload = replySpy.mock.calls[0][0];
expect(payload.Body).not.toContain("Reply to Ada: Can you summarize this?");
expect(payload.ReplyToId).toBe("9001");
expect(payload.ReplyToBody).toBe("Can you summarize this?");
expect(payload.ReplyToSender).toBe("Ada");
});
it("sends replies as native replies without chaining", async () => {
onSpy.mockReset();
sendMessageSpy.mockReset();
const replySpy = replyModule.__replySpy as unknown as ReturnType<
typeof vi.fn
>;
replySpy.mockReset();
replySpy.mockResolvedValue({ text: "a".repeat(4500) });
createTelegramBot({ token: "tok" });
const handler = onSpy.mock.calls[0][1] as (
ctx: Record<string, unknown>,
) => Promise<void>;
await handler({
message: {
chat: { id: 5, type: "private" },
text: "hi",
date: 1736380800,
message_id: 101,
},
me: { username: "clawdis_bot" },
getFile: async () => ({ download: async () => new Uint8Array() }),
});
expect(sendMessageSpy.mock.calls.length).toBeGreaterThan(1);
for (const call of sendMessageSpy.mock.calls) {
expect(call[2]?.reply_to_message_id).toBe(101);
}
});
});

View File

@ -11,8 +11,7 @@ import { getReplyFromConfig } from "../auto-reply/reply.js";
import type { ReplyPayload } from "../auto-reply/types.js";
import { loadConfig } from "../config/config.js";
import { resolveStorePath, updateLastRoute } from "../config/sessions.js";
import { danger, logVerbose } from "../globals.js";
import { formatErrorMessage } from "../infra/errors.js";
import { danger, isVerbose, logVerbose } from "../globals.js";
import { getChildLogger } from "../logging.js";
import { mediaKindFromMime } from "../media/constants.js";
import { detectMime } from "../media/mime.js";
@ -117,6 +116,7 @@ export function createTelegramBot(opts: TelegramBotOptions) {
opts.token,
opts.proxyFetch,
);
const replyTarget = describeReplyTarget(msg);
const rawBody = (
msg.text ??
msg.caption ??
@ -124,7 +124,6 @@ export function createTelegramBot(opts: TelegramBotOptions) {
""
).trim();
if (!rawBody) return;
const body = formatAgentEnvelope({
surface: "Telegram",
from: isGroup
@ -143,12 +142,22 @@ export function createTelegramBot(opts: TelegramBotOptions) {
SenderName: buildSenderName(msg),
Surface: "telegram",
MessageSid: String(msg.message_id),
ReplyToId: replyTarget?.id,
ReplyToBody: replyTarget?.body,
ReplyToSender: replyTarget?.sender,
Timestamp: msg.date ? msg.date * 1000 : undefined,
MediaPath: media?.path,
MediaType: media?.contentType,
MediaUrl: media?.path,
};
if (replyTarget && isVerbose()) {
const preview = replyTarget.body.replace(/\s+/g, " ").slice(0, 120);
logVerbose(
`telegram reply-context: replyToId=${replyTarget.id} replyToSender=${replyTarget.sender} replyToBody="${preview}"`,
);
}
if (!isGroup) {
const sessionCfg = cfg.inbound?.session;
const mainKey = (sessionCfg?.mainKey ?? "main").trim() || "main";
@ -161,7 +170,7 @@ export function createTelegramBot(opts: TelegramBotOptions) {
});
}
if (logVerbose()) {
if (isVerbose()) {
const preview = body.slice(0, 200).replace(/\n/g, "\\n");
logVerbose(
`telegram inbound: chatId=${chatId} from=${ctxPayload.From} len=${body.length} preview="${preview}"`,
@ -186,6 +195,7 @@ export function createTelegramBot(opts: TelegramBotOptions) {
token: opts.token,
runtime,
bot,
replyToMessageId: msg.message_id,
});
} catch (err) {
runtime.error?.(danger(`Telegram handler failed: ${String(err)}`));
@ -208,8 +218,10 @@ async function deliverReplies(params: {
token: string;
runtime: RuntimeEnv;
bot: Bot;
replyToMessageId?: number;
}) {
const { replies, chatId, runtime, bot } = params;
const replyTarget = params.replyToMessageId;
for (const reply of replies) {
if (!reply?.text && !reply?.mediaUrl && !(reply?.mediaUrls?.length ?? 0)) {
runtime.error?.(danger("Telegram reply missing text/media"));
@ -220,9 +232,14 @@ async function deliverReplies(params: {
: reply.mediaUrl
? [reply.mediaUrl]
: [];
if (replyTarget && isVerbose()) {
logVerbose(
`telegram reply-send: chatId=${chatId} replyToMessageId=${replyTarget} kind=${mediaList.length ? "media" : "text"}`,
);
}
if (mediaList.length === 0) {
for (const chunk of chunkText(reply.text || "", 4000)) {
await sendTelegramText(bot, chatId, chunk, runtime);
await sendTelegramText(bot, chatId, chunk, runtime, replyTarget);
}
continue;
}
@ -234,14 +251,18 @@ async function deliverReplies(params: {
const file = new InputFile(media.buffer, media.fileName ?? "file");
const caption = first ? (reply.text ?? undefined) : undefined;
first = false;
const replyOpts = replyTarget ? { reply_to_message_id: replyTarget } : {};
if (kind === "image") {
await bot.api.sendPhoto(chatId, file, { caption });
await bot.api.sendPhoto(chatId, file, { caption, ...replyOpts });
} else if (kind === "video") {
await bot.api.sendVideo(chatId, file, { caption });
await bot.api.sendVideo(chatId, file, { caption, ...replyOpts });
} else if (kind === "audio") {
await bot.api.sendAudio(chatId, file, { caption });
await bot.api.sendAudio(chatId, file, { caption, ...replyOpts });
} else {
await bot.api.sendDocument(chatId, file, { caption });
await bot.api.sendDocument(chatId, file, {
caption,
...replyOpts,
});
}
}
}
@ -338,18 +359,47 @@ async function sendTelegramText(
chatId: string,
text: string,
runtime: RuntimeEnv,
) {
replyToMessageId?: number,
): Promise<number | undefined> {
try {
await bot.api.sendMessage(chatId, text, { parse_mode: "Markdown" });
const res = await bot.api.sendMessage(chatId, text, {
parse_mode: "Markdown",
reply_to_message_id: replyToMessageId,
});
return res.message_id;
} catch (err) {
const errText = formatErrorMessage(err);
if (PARSE_ERR_RE.test(errText)) {
if (PARSE_ERR_RE.test(String(err ?? ""))) {
runtime.log?.(
`telegram markdown parse failed; retrying without formatting: ${errText}`,
`telegram markdown parse failed; retrying without formatting: ${String(
err,
)}`,
);
await bot.api.sendMessage(chatId, text);
return;
const res = await bot.api.sendMessage(chatId, text, {
reply_to_message_id: replyToMessageId,
});
return res.message_id;
}
throw err;
}
}
function describeReplyTarget(msg: TelegramMessage) {
const reply = msg.reply_to_message as TelegramMessage | undefined;
if (!reply) return null;
const replyBody = (reply.text ?? reply.caption ?? "").trim();
let body = replyBody;
if (!body) {
if (reply.photo) body = "<media:image>";
else if (reply.video) body = "<media:video>";
else if (reply.audio || reply.voice) body = "<media:audio>";
else if (reply.document) body = "<media:document>";
}
if (!body) return null;
const sender = buildSenderName(reply);
const senderLabel = sender ? `${sender}` : "unknown sender";
return {
id: reply.message_id ? String(reply.message_id) : undefined,
sender: senderLabel,
body,
};
}

View File

@ -1751,6 +1751,47 @@ describe("web auto-reply", () => {
expect(callArg?.Body).toContain("hello");
});
it("forwards reply-to context to resolver", async () => {
let capturedOnMessage:
| ((msg: import("./inbound.js").WebInboundMessage) => Promise<void>)
| undefined;
const listenerFactory = async (opts: {
onMessage: (
msg: import("./inbound.js").WebInboundMessage,
) => Promise<void>;
}) => {
capturedOnMessage = opts.onMessage;
return { close: vi.fn() };
};
const resolver = vi.fn().mockResolvedValue({ text: "reply" });
await monitorWebProvider(false, listenerFactory, false, resolver);
expect(capturedOnMessage).toBeDefined();
await capturedOnMessage?.({
body: "hello",
from: "+1555",
to: "+2666",
id: "msg1",
replyToId: "q1",
replyToBody: "original",
replyToSender: "+1999",
sendComposing: vi.fn(),
reply: vi.fn(),
sendMedia: vi.fn(),
});
const callArg = resolver.mock.calls[0]?.[0] as {
ReplyToId?: string;
ReplyToBody?: string;
ReplyToSender?: string;
};
expect(callArg.ReplyToId).toBe("q1");
expect(callArg.ReplyToBody).toBe("original");
expect(callArg.ReplyToSender).toBe("+1999");
});
it("applies responsePrefix to regular replies", async () => {
setLoadConfigMock(() => ({
inbound: {

View File

@ -1107,6 +1107,9 @@ export async function monitorWebProvider(
From: msg.from,
To: msg.to,
MessageSid: msg.id,
ReplyToId: msg.replyToId,
ReplyToBody: msg.replyToBody,
ReplyToSender: msg.replyToSender,
MediaPath: msg.mediaPath,
MediaUrl: msg.mediaUrl,
MediaType: msg.mediaType,

View File

@ -39,6 +39,9 @@ export type WebInboundMessage = {
senderJid?: string;
senderE164?: string;
senderName?: string;
replyToId?: string;
replyToBody?: string;
replyToSender?: string;
groupSubject?: string;
groupParticipants?: string[];
mentionedJids?: string[];
@ -187,6 +190,9 @@ export async function monitorWebInbox(options: {
body = extractMediaPlaceholder(msg.message ?? undefined);
if (!body) continue;
}
const replyContext = describeReplyContext(
msg.message as proto.IMessage | undefined,
);
let mediaPath: string | undefined;
let mediaType: string | undefined;
try {
@ -211,10 +217,10 @@ export async function monitorWebInbox(options: {
}
};
const reply = async (text: string) => {
await sock.sendMessage(chatJid, { text });
await sock.sendMessage(chatJid, { text }, { quoted: msg });
};
const sendMedia = async (payload: AnyMessageContent) => {
await sock.sendMessage(chatJid, payload);
await sock.sendMessage(chatJid, payload, { quoted: msg });
};
const timestamp = msg.messageTimestamp
? Number(msg.messageTimestamp) * 1000
@ -249,6 +255,9 @@ export async function monitorWebInbox(options: {
senderJid: participantJid,
senderE164: senderE164 ?? undefined,
senderName,
replyToId: replyContext?.id,
replyToBody: replyContext?.body,
replyToSender: replyContext?.sender,
groupSubject,
groupParticipants,
mentionedJids: mentionedJids ?? undefined,
@ -443,6 +452,36 @@ export function extractMediaPlaceholder(
return undefined;
}
function describeReplyContext(rawMessage: proto.IMessage | undefined): {
id?: string;
body: string;
sender: string;
} | null {
const message = unwrapMessage(rawMessage);
if (!message) return null;
const contextInfo =
message.extendedTextMessage?.contextInfo ??
message.imageMessage?.contextInfo ??
message.videoMessage?.contextInfo ??
message.documentMessage?.contextInfo ??
message.audioMessage?.contextInfo ??
message.stickerMessage?.contextInfo ??
message.buttonsResponseMessage?.contextInfo ??
message.listResponseMessage?.contextInfo;
const quoted = contextInfo?.quotedMessage as proto.IMessage | undefined;
if (!quoted) return null;
const body = extractText(quoted) ?? extractMediaPlaceholder(quoted);
if (!body) return null;
const senderJid = contextInfo?.participant ?? undefined;
const senderE164 = senderJid ? jidToE164(senderJid) ?? senderJid : undefined;
const sender = senderE164 ?? "unknown sender";
return {
id: contextInfo?.stanzaId ? String(contextInfo.stanzaId) : undefined,
body,
sender,
};
}
async function downloadInboundMedia(
msg: proto.IWebMessageInfo,
sock: Awaited<ReturnType<typeof createWaSocket>>,

View File

@ -107,9 +107,15 @@ describe("web monitor inbox", () => {
"composing",
"999@s.whatsapp.net",
);
expect(sock.sendMessage).toHaveBeenCalledWith("999@s.whatsapp.net", {
text: "pong",
});
expect(sock.sendMessage).toHaveBeenCalledWith(
"999@s.whatsapp.net",
{ text: "pong" },
{
quoted: expect.objectContaining({
key: expect.objectContaining({ id: "abc" }),
}),
},
);
await listener.close();
});
@ -151,6 +157,57 @@ describe("web monitor inbox", () => {
await listener.close();
});
it("captures reply context from quoted messages", async () => {
const onMessage = vi.fn(async (msg) => {
await msg.reply("pong");
});
const listener = await monitorWebInbox({ verbose: false, onMessage });
const sock = await createWaSocket();
const upsert = {
type: "notify",
messages: [
{
key: { id: "abc", fromMe: false, remoteJid: "999@s.whatsapp.net" },
message: {
extendedTextMessage: {
text: "reply",
contextInfo: {
stanzaId: "q1",
participant: "111@s.whatsapp.net",
quotedMessage: { conversation: "original" },
},
},
},
messageTimestamp: 1_700_000_000,
pushName: "Tester",
},
],
};
sock.ev.emit("messages.upsert", upsert);
await new Promise((resolve) => setImmediate(resolve));
expect(onMessage).toHaveBeenCalledWith(
expect.objectContaining({
replyToId: "q1",
replyToBody: "original",
replyToSender: "+111",
}),
);
expect(sock.sendMessage).toHaveBeenCalledWith(
"999@s.whatsapp.net",
{ text: "pong" },
{
quoted: expect.objectContaining({
key: expect.objectContaining({ id: "abc" }),
}),
},
);
await listener.close();
});
it("captures media path for image messages", async () => {
const onMessage = vi.fn();
const listener = await monitorWebInbox({ verbose: false, onMessage });