exec: run coding agents in resilient screen sessions
This commit is contained in:
parent
2930ebfd43
commit
eb99426c0f
35
src/agents/bash-tools.exec.resilient.test.ts
Normal file
35
src/agents/bash-tools.exec.resilient.test.ts
Normal file
@ -0,0 +1,35 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
|
||||
import { __testing } from "./bash-tools.exec.js";
|
||||
|
||||
describe("exec resilient coding agent detection", () => {
|
||||
it("detects common coding agent binaries", () => {
|
||||
expect(__testing.isLikelyCodingAgentCommand("claude --version")).toBe(true);
|
||||
expect(__testing.isLikelyCodingAgentCommand("codex exec --full-auto 'hi'")).toBe(true);
|
||||
expect(__testing.isLikelyCodingAgentCommand("opencode run 'hi'")).toBe(true);
|
||||
expect(__testing.isLikelyCodingAgentCommand("pi 'hi'")).toBe(true);
|
||||
});
|
||||
|
||||
it("handles leading env assignments", () => {
|
||||
expect(__testing.isLikelyCodingAgentCommand("FOO=bar BAR=baz claude --version")).toBe(true);
|
||||
expect(__testing.resolveLeadingCommandBinary("FOO=bar claude --version")).toBe("claude");
|
||||
});
|
||||
|
||||
it("avoids double-wrapping already-resilient commands", () => {
|
||||
expect(
|
||||
__testing.isLikelyCodingAgentCommand(
|
||||
"bash ~/.clawdbot/skills/coding-agent/scripts/resilient-spawn.sh ~/proj claude 'hi' test",
|
||||
),
|
||||
).toBe(false);
|
||||
expect(__testing.isLikelyCodingAgentCommand("screen -dmS foo claude --version")).toBe(false);
|
||||
expect(
|
||||
__testing.isLikelyCodingAgentCommand("tmux new-session -d -s foo 'claude --version'"),
|
||||
).toBe(false);
|
||||
});
|
||||
|
||||
it("rejects non-coding commands", () => {
|
||||
expect(__testing.isLikelyCodingAgentCommand("")).toBe(false);
|
||||
expect(__testing.isLikelyCodingAgentCommand("echo hi")).toBe(false);
|
||||
expect(__testing.isLikelyCodingAgentCommand('bash -lc "claude --version"')).toBe(false);
|
||||
});
|
||||
});
|
||||
@ -1,5 +1,7 @@
|
||||
import crypto from "node:crypto";
|
||||
import type { ChildProcessWithoutNullStreams } from "node:child_process";
|
||||
import * as fs from "node:fs/promises";
|
||||
import { homedir } from "node:os";
|
||||
import path from "node:path";
|
||||
import type { AgentTool, AgentToolResult } from "@mariozechner/pi-agent-core";
|
||||
import { Type } from "@sinclair/typebox";
|
||||
@ -201,6 +203,14 @@ export type ExecToolDetails =
|
||||
cwd?: string;
|
||||
tail?: string;
|
||||
}
|
||||
| {
|
||||
status: "running";
|
||||
mode: "resilient-screen";
|
||||
screenSession: string;
|
||||
logFile: string;
|
||||
rawLogFile: string;
|
||||
workdir: string;
|
||||
}
|
||||
| {
|
||||
status: "completed" | "failed";
|
||||
exitCode: number | null;
|
||||
@ -303,6 +313,335 @@ function applyShellPath(env: Record<string, string>, shellPath?: string | null)
|
||||
if (merged) env.PATH = merged;
|
||||
}
|
||||
|
||||
const CODING_AGENT_BINS = new Set(["claude", "codex", "opencode", "pi"]);
|
||||
|
||||
function tokenizeCommand(command: string) {
|
||||
return (command.match(/(?:[^\s"']+|"(?:\\.|[^"])*"|'(?:\\.|[^'])*')+/gu) ?? []).map((token) =>
|
||||
stripQuotes(token),
|
||||
);
|
||||
}
|
||||
|
||||
function stripQuotes(value: string) {
|
||||
const trimmed = value.trim();
|
||||
if (
|
||||
(trimmed.startsWith('"') && trimmed.endsWith('"')) ||
|
||||
(trimmed.startsWith("'") && trimmed.endsWith("'"))
|
||||
) {
|
||||
return trimmed.slice(1, -1);
|
||||
}
|
||||
return trimmed;
|
||||
}
|
||||
|
||||
function isEnvAssignmentToken(token: string) {
|
||||
return /^[A-Za-z_][A-Za-z0-9_]*=/.test(token);
|
||||
}
|
||||
|
||||
function resolveLeadingCommandBinary(command: string) {
|
||||
const tokens = tokenizeCommand(command);
|
||||
for (const token of tokens) {
|
||||
if (!token) continue;
|
||||
if (isEnvAssignmentToken(token)) continue;
|
||||
return token;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
function isAlreadyResilientCodingCommand(command: string) {
|
||||
const lowered = command.toLowerCase();
|
||||
if (lowered.includes("resilient-spawn.sh")) return true;
|
||||
if (lowered.includes("spawn-monitored.sh")) return true;
|
||||
// Avoid double-wrapping when a user explicitly uses screen/tmux already.
|
||||
if (lowered.includes("screen -dm") || lowered.includes("screen\t-dm")) return true;
|
||||
if (lowered.includes("tmux new-session") || lowered.includes("tmux\tnew-session")) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
function isLikelyCodingAgentCommand(command: string) {
|
||||
const trimmed = command.trim();
|
||||
if (!trimmed) return false;
|
||||
if (isAlreadyResilientCodingCommand(trimmed)) return false;
|
||||
const verb = resolveLeadingCommandBinary(trimmed);
|
||||
if (!verb) return false;
|
||||
const base = path.basename(verb);
|
||||
return CODING_AGENT_BINS.has(base);
|
||||
}
|
||||
|
||||
function resolveCodingAgentLogDir() {
|
||||
const stateDir =
|
||||
(process.env.CLAWDBOT_STATE_DIR ?? "").trim() || path.join(homedir(), ".clawdbot");
|
||||
return path.join(stateDir, "logs", "coding-agent");
|
||||
}
|
||||
|
||||
function buildResilientCodingAgentWrapperScript() {
|
||||
// bash 3.2 compatible (macOS default)
|
||||
return `#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
SESSION_KEY="$1"
|
||||
AGENT_ID="$2"
|
||||
WORKDIR="$3"
|
||||
CMD_FILE="$4"
|
||||
LOG_FILE="$5"
|
||||
RAW_LOG_FILE="$6"
|
||||
MARKER_FILE="$7"
|
||||
META_FILE="$8"
|
||||
SCREEN_SESSION="$9"
|
||||
|
||||
INTERVAL="\${CLAWDBOT_CODING_AGENT_UPDATE_SEC:-180}"
|
||||
NOTIFY_MODE="\${CLAWDBOT_CODING_AGENT_NOTIFY_MODE:-send}" # send|dry-run|off
|
||||
|
||||
STATE_DIR="\${CLAWDBOT_STATE_DIR:-$HOME/.clawdbot}"
|
||||
SESSIONS_FILE="$STATE_DIR/agents/$AGENT_ID/sessions/sessions.json"
|
||||
|
||||
MSG_CHANNEL=""
|
||||
MSG_TARGET=""
|
||||
MSG_ACCOUNT=""
|
||||
|
||||
if command -v jq >/dev/null 2>&1 && [[ -f "$SESSIONS_FILE" ]] && [[ -n "$SESSION_KEY" ]]; then
|
||||
MSG_CHANNEL="$(jq -r --arg k "$SESSION_KEY" '.[$k].deliveryContext.channel // empty' "$SESSIONS_FILE" 2>/dev/null || true)"
|
||||
MSG_TARGET="$(jq -r --arg k "$SESSION_KEY" '.[$k].deliveryContext.to // empty' "$SESSIONS_FILE" 2>/dev/null || true)"
|
||||
MSG_ACCOUNT="$(jq -r --arg k "$SESSION_KEY" '.[$k].deliveryContext.accountId // empty' "$SESSIONS_FILE" 2>/dev/null || true)"
|
||||
fi
|
||||
|
||||
notify_once() {
|
||||
local msg="$1"
|
||||
if [[ "$NOTIFY_MODE" == "off" ]]; then
|
||||
return 0
|
||||
fi
|
||||
if [[ -n "$MSG_CHANNEL" && -n "$MSG_TARGET" ]]; then
|
||||
local args=(message send --channel "$MSG_CHANNEL" --target "$MSG_TARGET" --message "$msg")
|
||||
if [[ -n "$MSG_ACCOUNT" ]]; then
|
||||
args+=(--account "$MSG_ACCOUNT")
|
||||
fi
|
||||
if [[ "$NOTIFY_MODE" == "dry-run" ]]; then
|
||||
args+=(--dry-run)
|
||||
fi
|
||||
clawdbot "\${args[@]}" >/dev/null 2>&1
|
||||
return $?
|
||||
fi
|
||||
# Fallback to system events (main session). Note: if HEARTBEAT.md is empty, these may not fire immediately.
|
||||
if [[ "$NOTIFY_MODE" == "dry-run" ]]; then
|
||||
echo "[notify dry-run] $msg" >> "$LOG_FILE" 2>/dev/null || true
|
||||
return 0
|
||||
fi
|
||||
clawdbot system event --text "$msg" --mode now >/dev/null 2>&1
|
||||
return $?
|
||||
}
|
||||
|
||||
notify_best_effort() {
|
||||
notify_once "$1" || true
|
||||
}
|
||||
|
||||
notify_critical() {
|
||||
local msg="$1"
|
||||
local attempts="\${2:-360}"
|
||||
local sleep_sec="\${3:-5}"
|
||||
local i=1
|
||||
while [[ "$i" -le "$attempts" ]]; do
|
||||
if notify_once "$msg"; then
|
||||
return 0
|
||||
fi
|
||||
sleep "$sleep_sec" || true
|
||||
i=$((i + 1))
|
||||
done
|
||||
echo "[notify failed] $msg" >> "$LOG_FILE" 2>/dev/null || true
|
||||
return 0
|
||||
}
|
||||
|
||||
START_ISO="$(date '+%Y-%m-%dT%H:%M:%S%z')"
|
||||
mkdir -p "$(dirname "$LOG_FILE")" 2>/dev/null || true
|
||||
touch "$MARKER_FILE" 2>/dev/null || true
|
||||
|
||||
CMD="$(cat "$CMD_FILE" 2>/dev/null || true)"
|
||||
|
||||
{
|
||||
echo "=== clawdbot resilient coding agent ==="
|
||||
echo "session: $SCREEN_SESSION"
|
||||
echo "started: $START_ISO"
|
||||
echo "workdir: $WORKDIR"
|
||||
echo "command: $CMD"
|
||||
echo "log: $LOG_FILE"
|
||||
echo "raw: $RAW_LOG_FILE"
|
||||
echo "meta: $META_FILE"
|
||||
echo "======================================="
|
||||
echo ""
|
||||
} >> "$LOG_FILE"
|
||||
|
||||
if command -v jq >/dev/null 2>&1; then
|
||||
jq -n \\
|
||||
--arg session "$SCREEN_SESSION" \\
|
||||
--arg startedAt "$START_ISO" \\
|
||||
--arg workdir "$WORKDIR" \\
|
||||
--arg command "$CMD" \\
|
||||
--arg logFile "$LOG_FILE" \\
|
||||
--arg rawLogFile "$RAW_LOG_FILE" \\
|
||||
--arg sessionKey "$SESSION_KEY" \\
|
||||
--arg agentId "$AGENT_ID" \\
|
||||
'{session:$session, startedAt:$startedAt, workdir:$workdir, command:$command, logFile:$logFile, rawLogFile:$rawLogFile, sessionKey:$sessionKey, agentId:$agentId}' \\
|
||||
> "$META_FILE" 2>/dev/null || true
|
||||
fi
|
||||
|
||||
notify_best_effort "🚀 Starting coding agent session '$SCREEN_SESSION'\\n📁 $WORKDIR\\n📄 $LOG_FILE"
|
||||
|
||||
SHELL_BIN="\${CLAWDBOT_CODING_AGENT_SHELL:-\${SHELL:-/bin/bash}}"
|
||||
|
||||
(
|
||||
script -q "$RAW_LOG_FILE" "$SHELL_BIN" -lc "$CMD" 2>&1 | tee -a "$LOG_FILE"
|
||||
) &
|
||||
PIPE_PID=$!
|
||||
|
||||
LAST_BYTES=0
|
||||
SILENT=0
|
||||
|
||||
while kill -0 "$PIPE_PID" >/dev/null 2>&1; do
|
||||
sleep "$INTERVAL" || true
|
||||
if ! kill -0 "$PIPE_PID" >/dev/null 2>&1; then
|
||||
break
|
||||
fi
|
||||
|
||||
if [[ -f "$LOG_FILE" ]]; then
|
||||
BYTES="$(stat -f%z "$LOG_FILE" 2>/dev/null || stat -c%s "$LOG_FILE" 2>/dev/null || echo 0)"
|
||||
LINES="$(wc -l < "$LOG_FILE" 2>/dev/null || echo 0)"
|
||||
|
||||
if [[ "$BYTES" -le "$LAST_BYTES" ]]; then
|
||||
SILENT=$((SILENT + 1))
|
||||
else
|
||||
SILENT=0
|
||||
fi
|
||||
LAST_BYTES="$BYTES"
|
||||
|
||||
CHANGED_FILES=0
|
||||
if [[ -f "$MARKER_FILE" ]]; then
|
||||
CHANGED_FILES="$(find "$WORKDIR" -type f -newer "$MARKER_FILE" 2>/dev/null | wc -l | tr -d ' ' || echo 0)"
|
||||
fi
|
||||
|
||||
if [[ "$SILENT" -ge 3 ]]; then
|
||||
MIN=$(( (SILENT * INTERVAL) / 60 ))
|
||||
notify_best_effort "⏸️ [$SCREEN_SESSION] No output for ~\${MIN}m. Still running. (log: \${LINES} lines, \${CHANGED_FILES} files touched)"
|
||||
SILENT=0
|
||||
else
|
||||
notify_best_effort "⏳ [$SCREEN_SESSION] Still running. (log: \${LINES} lines, \${CHANGED_FILES} files touched)"
|
||||
fi
|
||||
else
|
||||
notify_best_effort "⏳ [$SCREEN_SESSION] Still running. (log pending)"
|
||||
fi
|
||||
done
|
||||
|
||||
wait "$PIPE_PID"
|
||||
EXIT_CODE=$?
|
||||
|
||||
CHANGED_TOTAL=0
|
||||
CHANGED_LIST=""
|
||||
if [[ -f "$MARKER_FILE" ]]; then
|
||||
CHANGED_TOTAL="$(find "$WORKDIR" -type f -newer "$MARKER_FILE" 2>/dev/null | wc -l | tr -d ' ' || echo 0)"
|
||||
CHANGED_LIST="$(find "$WORKDIR" -type f -newer "$MARKER_FILE" 2>/dev/null | sed "s#^$WORKDIR/##" | sort | head -n 12 | paste -sd ',' - | sed 's/,/, /g' || true)"
|
||||
fi
|
||||
|
||||
TAIL_LINE="$(tail -n 1 "$LOG_FILE" 2>/dev/null | tr -d '\\\\r' | head -c 200 || true)"
|
||||
|
||||
if [[ "$EXIT_CODE" -eq 0 ]]; then
|
||||
MSG="✅ [$SCREEN_SESSION] Done (exit 0)."
|
||||
else
|
||||
MSG="❌ [$SCREEN_SESSION] Exited (code $EXIT_CODE)."
|
||||
fi
|
||||
|
||||
if [[ "$CHANGED_TOTAL" -gt 0 ]]; then
|
||||
MSG="$MSG\\n🧾 Changed files: $CHANGED_TOTAL"
|
||||
if [[ -n "$CHANGED_LIST" ]]; then
|
||||
MSG="$MSG\\n• $CHANGED_LIST"
|
||||
if [[ "$CHANGED_TOTAL" -gt 12 ]]; then
|
||||
MSG="$MSG, …"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ -n "$TAIL_LINE" ]]; then
|
||||
MSG="$MSG\\n🪵 Last log line: $TAIL_LINE"
|
||||
fi
|
||||
|
||||
MSG="$MSG\\n📄 Log: $LOG_FILE\\n🔎 Attach: screen -r $SCREEN_SESSION"
|
||||
notify_critical "$MSG"
|
||||
|
||||
if command -v jq >/dev/null 2>&1 && [[ -f "$META_FILE" ]]; then
|
||||
tmp="$META_FILE.tmp"
|
||||
jq --argjson code "$EXIT_CODE" '. + {exitCode:$code, endedAt:(now|todateiso8601)}' "$META_FILE" > "$tmp" 2>/dev/null && mv "$tmp" "$META_FILE" || true
|
||||
fi
|
||||
|
||||
exit "$EXIT_CODE"
|
||||
`;
|
||||
}
|
||||
|
||||
async function spawnResilientCodingAgentSession(opts: {
|
||||
command: string;
|
||||
workdir: string;
|
||||
env: Record<string, string>;
|
||||
sessionKey: string;
|
||||
agentId: string;
|
||||
warnings: string[];
|
||||
}) {
|
||||
const logDir = resolveCodingAgentLogDir();
|
||||
await fs.mkdir(logDir, { recursive: true });
|
||||
|
||||
const slug = createSessionSlug();
|
||||
const screenSession = `coding-${slug}`;
|
||||
|
||||
const cmdFile = path.join(logDir, `${screenSession}.cmd`);
|
||||
const logFile = path.join(logDir, `${screenSession}.log`);
|
||||
const rawLogFile = path.join(logDir, `${screenSession}.raw.log`);
|
||||
const markerFile = path.join(logDir, `${screenSession}.start`);
|
||||
const metaFile = path.join(logDir, `${screenSession}.json`);
|
||||
const wrapperFile = path.join(logDir, `${screenSession}.sh`);
|
||||
|
||||
await fs.writeFile(cmdFile, opts.command, "utf8");
|
||||
await fs.writeFile(wrapperFile, buildResilientCodingAgentWrapperScript(), "utf8");
|
||||
await fs.chmod(wrapperFile, 0o755);
|
||||
|
||||
const env = { ...opts.env };
|
||||
if (!env.CLAWDBOT_CODING_AGENT_SHELL) {
|
||||
env.CLAWDBOT_CODING_AGENT_SHELL = getShellConfig().shell;
|
||||
}
|
||||
|
||||
const { child } = await spawnWithFallback({
|
||||
argv: [
|
||||
"screen",
|
||||
"-dmS",
|
||||
screenSession,
|
||||
wrapperFile,
|
||||
opts.sessionKey,
|
||||
opts.agentId,
|
||||
opts.workdir,
|
||||
cmdFile,
|
||||
logFile,
|
||||
rawLogFile,
|
||||
markerFile,
|
||||
metaFile,
|
||||
screenSession,
|
||||
],
|
||||
options: {
|
||||
cwd: opts.workdir,
|
||||
env,
|
||||
detached: process.platform !== "win32",
|
||||
stdio: "ignore",
|
||||
windowsHide: true,
|
||||
},
|
||||
fallbacks: [
|
||||
{
|
||||
label: "no-detach",
|
||||
options: { detached: false },
|
||||
},
|
||||
],
|
||||
onFallback: (err, fallback) => {
|
||||
const errText = formatSpawnError(err);
|
||||
const warning = `Warning: spawn failed (${errText}); retrying with ${fallback.label}.`;
|
||||
logWarn(`exec: resilient spawn failed (${errText}); retrying with ${fallback.label}.`);
|
||||
opts.warnings.push(warning);
|
||||
},
|
||||
});
|
||||
child.unref?.();
|
||||
|
||||
return { screenSession, logFile, rawLogFile };
|
||||
}
|
||||
|
||||
function maybeNotifyOnExit(session: ProcessSession, status: "completed" | "failed") {
|
||||
if (!session.backgrounded || !session.notifyOnExit || session.exitNotified) return;
|
||||
const sessionKey = session.sessionKey?.trim();
|
||||
@ -885,6 +1224,59 @@ export function createExecTool(
|
||||
}
|
||||
applyPathPrepend(env, defaultPathPrepend);
|
||||
|
||||
// Resilient coding agents: long-running PTY CLIs (claude/codex/opencode/pi) die when the
|
||||
// gateway restarts because their PTY is owned by the gateway process. When the model
|
||||
// backgrounds a coding agent, run it inside a detached `screen` session with its own PTY
|
||||
// and a lightweight notifier so the job survives restarts and users get progress updates.
|
||||
if (
|
||||
!sandbox &&
|
||||
host !== "node" &&
|
||||
allowBackground &&
|
||||
notifySessionKey &&
|
||||
yieldWindow !== null &&
|
||||
isLikelyCodingAgentCommand(params.command) &&
|
||||
process.platform !== "win32"
|
||||
) {
|
||||
try {
|
||||
const started = await spawnResilientCodingAgentSession({
|
||||
command: params.command,
|
||||
workdir,
|
||||
env,
|
||||
sessionKey: notifySessionKey,
|
||||
agentId: agentId ?? "main",
|
||||
warnings,
|
||||
});
|
||||
|
||||
const warningText = warnings.length ? `${warnings.join("\n")}\n\n` : "";
|
||||
return {
|
||||
content: [
|
||||
{
|
||||
type: "text",
|
||||
text:
|
||||
`${warningText}Resilient coding agent started in screen session "${started.screenSession}".\n` +
|
||||
`Workdir: ${workdir}\n` +
|
||||
`Log: ${started.logFile}\n` +
|
||||
`Attach: screen -r ${started.screenSession}\n` +
|
||||
`Tail: tail -f ${started.logFile}`,
|
||||
},
|
||||
],
|
||||
details: {
|
||||
status: "running",
|
||||
mode: "resilient-screen",
|
||||
screenSession: started.screenSession,
|
||||
logFile: started.logFile,
|
||||
rawLogFile: started.rawLogFile,
|
||||
workdir,
|
||||
},
|
||||
} satisfies AgentToolResult<ExecToolDetails>;
|
||||
} catch (err) {
|
||||
const errText = err instanceof Error ? err.message : String(err);
|
||||
warnings.push(
|
||||
`Warning: resilient coding agent spawn failed (${errText}); falling back to normal exec.`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if (host === "node") {
|
||||
const approvals = resolveExecApprovals(agentId, { security, ask });
|
||||
const hostSecurity = minSecurity(security, approvals.agent.security);
|
||||
@ -1492,4 +1884,11 @@ export function createExecTool(
|
||||
};
|
||||
}
|
||||
|
||||
export const __testing = {
|
||||
isLikelyCodingAgentCommand,
|
||||
isAlreadyResilientCodingCommand,
|
||||
resolveLeadingCommandBinary,
|
||||
tokenizeCommand,
|
||||
};
|
||||
|
||||
export const execTool = createExecTool();
|
||||
|
||||
Loading…
Reference in New Issue
Block a user