From eb99426c0f44b39136c913158aee4c390fa6abf9 Mon Sep 17 00:00:00 2001
From: clawd4xiom
Date: Tue, 27 Jan 2026 15:34:19 -0600
Subject: [PATCH] exec: run coding agents in resilient screen sessions

Backgrounded coding-agent CLIs (claude/codex/opencode/pi) are launched in a
detached screen session with a notifier wrapper so they survive gateway
restarts and users get progress updates.
---
 src/agents/bash-tools.exec.resilient.test.ts |  35 ++
 src/agents/bash-tools.exec.ts                | 440 +++++++++++++++++++
 2 files changed, 475 insertions(+)
 create mode 100644 src/agents/bash-tools.exec.resilient.test.ts

diff --git a/src/agents/bash-tools.exec.resilient.test.ts b/src/agents/bash-tools.exec.resilient.test.ts
new file mode 100644
index 000000000..084ef72f8
--- /dev/null
+++ b/src/agents/bash-tools.exec.resilient.test.ts
@@ -0,0 +1,35 @@
+import { describe, expect, it } from "vitest";
+
+import { __testing } from "./bash-tools.exec.js";
+
+describe("exec resilient coding agent detection", () => {
+  it("detects common coding agent binaries", () => {
+    expect(__testing.isLikelyCodingAgentCommand("claude --version")).toBe(true);
+    expect(__testing.isLikelyCodingAgentCommand("codex exec --full-auto 'hi'")).toBe(true);
+    expect(__testing.isLikelyCodingAgentCommand("opencode run 'hi'")).toBe(true);
+    expect(__testing.isLikelyCodingAgentCommand("pi 'hi'")).toBe(true);
+  });
+
+  it("handles leading env assignments", () => {
+    expect(__testing.isLikelyCodingAgentCommand("FOO=bar BAR=baz claude --version")).toBe(true);
+    expect(__testing.resolveLeadingCommandBinary("FOO=bar claude --version")).toBe("claude");
+  });
+
+  it("avoids double-wrapping already-resilient commands", () => {
+    expect(
+      __testing.isLikelyCodingAgentCommand(
+        "bash ~/.clawdbot/skills/coding-agent/scripts/resilient-spawn.sh ~/proj claude 'hi' test",
+      ),
+    ).toBe(false);
+    expect(__testing.isLikelyCodingAgentCommand("screen -dmS foo claude --version")).toBe(false);
+    expect(
+      __testing.isLikelyCodingAgentCommand("tmux new-session -d -s foo 'claude --version'"),
+    ).toBe(false);
+  });
+
+  it("rejects non-coding commands", () => {
+    expect(__testing.isLikelyCodingAgentCommand("")).toBe(false);
+    expect(__testing.isLikelyCodingAgentCommand("echo hi")).toBe(false);
+    expect(__testing.isLikelyCodingAgentCommand('bash -lc "claude --version"')).toBe(false);
+  });
+});
diff --git a/src/agents/bash-tools.exec.ts b/src/agents/bash-tools.exec.ts
index b9de81872..73ffea27d 100644
--- a/src/agents/bash-tools.exec.ts
+++ b/src/agents/bash-tools.exec.ts
@@ -1,5 +1,7 @@
 import crypto from "node:crypto";
 import type { ChildProcessWithoutNullStreams } from "node:child_process";
+import * as fs from "node:fs/promises";
+import { homedir } from "node:os";
 import path from "node:path";
 import type { AgentTool, AgentToolResult } from "@mariozechner/pi-agent-core";
 import { Type } from "@sinclair/typebox";
@@ -201,6 +203,14 @@ export type ExecToolDetails =
       cwd?: string;
       tail?: string;
     }
+  | {
+      status: "running";
+      mode: "resilient-screen";
+      screenSession: string;
+      logFile: string;
+      rawLogFile: string;
+      workdir: string;
+    }
   | {
       status: "completed" | "failed";
       exitCode: number | null;
@@ -303,6 +313,374 @@ function applyShellPath(env: Record<string, string>, shellPath?: string | null)
   if (merged) env.PATH = merged;
 }
 
+/** Binary names treated as long-running coding-agent CLIs. */
+const CODING_AGENT_BINS = new Set(["claude", "codex", "opencode", "pi"]);
+
+/**
+ * Splits a command line on unquoted whitespace, keeping quoted runs inside one token.
+ * Quotes are stripped only from tokens that are quoted from end to end.
+ */
+function tokenizeCommand(command: string) {
+  return (command.match(/(?:[^\s"']+|"(?:\\.|[^"])*"|'(?:\\.|[^'])*')+/gu) ?? []).map((token) =>
+    stripQuotes(token),
+  );
+}
+
+/** Removes one pair of matching quotes when they wrap the whole (trimmed) value. */
+function stripQuotes(value: string) {
+  const trimmed = value.trim();
+  if (
+    (trimmed.startsWith('"') && trimmed.endsWith('"')) ||
+    (trimmed.startsWith("'") && trimmed.endsWith("'"))
+  ) {
+    return trimmed.slice(1, -1);
+  }
+  return trimmed;
+}
+
+/** True for `NAME=value` style environment assignment tokens. */
+function isEnvAssignmentToken(token: string) {
+  return /^[A-Za-z_][A-Za-z0-9_]*=/.test(token);
+}
+
+/** Returns the first token that is not an env assignment, or null when there is none. */
+function resolveLeadingCommandBinary(command: string) {
+  const tokens = tokenizeCommand(command);
+  for (const token of tokens) {
+    if (!token) continue;
+    if (isEnvAssignmentToken(token)) continue;
+    return token;
+  }
+  return null;
+}
+
+/** True when the command text mentions a resilient wrapper script or a screen/tmux launch. */
+function isAlreadyResilientCodingCommand(command: string) {
+  const lowered = command.toLowerCase();
+  if (lowered.includes("resilient-spawn.sh")) return true;
+  if (lowered.includes("spawn-monitored.sh")) return true;
+  // Avoid double-wrapping when a user explicitly uses screen/tmux already.
+  if (lowered.includes("screen -dm") || lowered.includes("screen\t-dm")) return true;
+  if (lowered.includes("tmux new-session") || lowered.includes("tmux\tnew-session")) return true;
+  return false;
+}
+
+/**
+ * True when the leading binary (after env assignments) is a known coding agent and the
+ * command is not already wrapped. Agents nested inside another shell invocation, such as
+ * `bash -lc "claude ..."`, are not detected because only the first token is inspected.
+ */
+function isLikelyCodingAgentCommand(command: string) {
+  const trimmed = command.trim();
+  if (!trimmed) return false;
+  if (isAlreadyResilientCodingCommand(trimmed)) return false;
+  const verb = resolveLeadingCommandBinary(trimmed);
+  if (!verb) return false;
+  const base = path.basename(verb);
+  return CODING_AGENT_BINS.has(base);
+}
+
+/** Log directory: `logs/coding-agent` under CLAWDBOT_STATE_DIR, else under `~/.clawdbot`. */
+function resolveCodingAgentLogDir() {
+  const stateDir =
+    (process.env.CLAWDBOT_STATE_DIR ?? "").trim() || path.join(homedir(), ".clawdbot");
+  return path.join(stateDir, "logs", "coding-agent");
+}
+
+/**
+ * Returns the bash wrapper executed inside the detached screen session. It records the
+ * command under script(1), sends periodic progress notifications, and reports the exit
+ * status. Positional args: session key, agent id, workdir, command file, log file, raw
+ * log file, marker file, meta file, screen session name.
+ */
+function buildResilientCodingAgentWrapperScript() {
+  // bash 3.2 compatible (macOS default)
+  return `#!/bin/bash
+set -euo pipefail
+
+SESSION_KEY="$1"
+AGENT_ID="$2"
+WORKDIR="$3"
+CMD_FILE="$4"
+LOG_FILE="$5"
+RAW_LOG_FILE="$6"
+MARKER_FILE="$7"
+META_FILE="$8"
+SCREEN_SESSION="$9"
+
+INTERVAL="\${CLAWDBOT_CODING_AGENT_UPDATE_SEC:-180}"
+NOTIFY_MODE="\${CLAWDBOT_CODING_AGENT_NOTIFY_MODE:-send}" # send|dry-run|off
+
+STATE_DIR="\${CLAWDBOT_STATE_DIR:-$HOME/.clawdbot}"
+SESSIONS_FILE="$STATE_DIR/agents/$AGENT_ID/sessions/sessions.json"
+
+MSG_CHANNEL=""
+MSG_TARGET=""
+MSG_ACCOUNT=""
+
+if command -v jq >/dev/null 2>&1 && [[ -f "$SESSIONS_FILE" ]] && [[ -n "$SESSION_KEY" ]]; then
+  MSG_CHANNEL="$(jq -r --arg k "$SESSION_KEY" '.[$k].deliveryContext.channel // empty' "$SESSIONS_FILE" 2>/dev/null || true)"
+  MSG_TARGET="$(jq -r --arg k "$SESSION_KEY" '.[$k].deliveryContext.to // empty' "$SESSIONS_FILE" 2>/dev/null || true)"
+  MSG_ACCOUNT="$(jq -r --arg k "$SESSION_KEY" '.[$k].deliveryContext.accountId // empty' "$SESSIONS_FILE" 2>/dev/null || true)"
+fi
+
+notify_once() {
+  local msg="$1"
+  if [[ "$NOTIFY_MODE" == "off" ]]; then
+    return 0
+  fi
+  if [[ -n "$MSG_CHANNEL" && -n "$MSG_TARGET" ]]; then
+    local args=(message send --channel "$MSG_CHANNEL" --target "$MSG_TARGET" --message "$msg")
+    if [[ -n "$MSG_ACCOUNT" ]]; then
+      args+=(--account "$MSG_ACCOUNT")
+    fi
+    if [[ "$NOTIFY_MODE" == "dry-run" ]]; then
+      args+=(--dry-run)
+    fi
+    clawdbot "\${args[@]}" >/dev/null 2>&1
+    return $?
+  fi
+  # Fallback to system events (main session). Note: if HEARTBEAT.md is empty, these may not fire immediately.
+  if [[ "$NOTIFY_MODE" == "dry-run" ]]; then
+    echo "[notify dry-run] $msg" >> "$LOG_FILE" 2>/dev/null || true
+    return 0
+  fi
+  clawdbot system event --text "$msg" --mode now >/dev/null 2>&1
+  return $?
+}
+
+notify_best_effort() {
+  notify_once "$1" || true
+}
+
+notify_critical() {
+  local msg="$1"
+  local attempts="\${2:-360}"
+  local sleep_sec="\${3:-5}"
+  local i=1
+  while [[ "$i" -le "$attempts" ]]; do
+    if notify_once "$msg"; then
+      return 0
+    fi
+    sleep "$sleep_sec" || true
+    i=$((i + 1))
+  done
+  echo "[notify failed] $msg" >> "$LOG_FILE" 2>/dev/null || true
+  return 0
+}
+
+START_ISO="$(date '+%Y-%m-%dT%H:%M:%S%z')"
+mkdir -p "$(dirname "$LOG_FILE")" 2>/dev/null || true
+touch "$MARKER_FILE" 2>/dev/null || true
+
+CMD="$(cat "$CMD_FILE" 2>/dev/null || true)"
+
+{
+  echo "=== clawdbot resilient coding agent ==="
+  echo "session: $SCREEN_SESSION"
+  echo "started: $START_ISO"
+  echo "workdir: $WORKDIR"
+  echo "command: $CMD"
+  echo "log: $LOG_FILE"
+  echo "raw: $RAW_LOG_FILE"
+  echo "meta: $META_FILE"
+  echo "======================================="
+  echo ""
+} >> "$LOG_FILE"
+
+if command -v jq >/dev/null 2>&1; then
+  jq -n \\
+    --arg session "$SCREEN_SESSION" \\
+    --arg startedAt "$START_ISO" \\
+    --arg workdir "$WORKDIR" \\
+    --arg command "$CMD" \\
+    --arg logFile "$LOG_FILE" \\
+    --arg rawLogFile "$RAW_LOG_FILE" \\
+    --arg sessionKey "$SESSION_KEY" \\
+    --arg agentId "$AGENT_ID" \\
+    '{session:$session, startedAt:$startedAt, workdir:$workdir, command:$command, logFile:$logFile, rawLogFile:$rawLogFile, sessionKey:$sessionKey, agentId:$agentId}' \\
+    > "$META_FILE" 2>/dev/null || true
+fi
+
+notify_best_effort "🚀 Starting coding agent session '$SCREEN_SESSION'\\n📁 $WORKDIR\\n📄 $LOG_FILE"
+
+SHELL_BIN="\${CLAWDBOT_CODING_AGENT_SHELL:-\${SHELL:-/bin/bash}}"
+
+# script(1) takes the command differently on util-linux (Linux) and BSD/macOS.
+run_agent() {
+  if script --version >/dev/null 2>&1; then
+    # util-linux: the command goes through -c; -e makes script return its exit code.
+    CLAWDBOT_AGENT_SHELL_BIN="$SHELL_BIN" CLAWDBOT_AGENT_CMD="$CMD" \\
+      script -q -e -c 'exec "$CLAWDBOT_AGENT_SHELL_BIN" -lc "$CLAWDBOT_AGENT_CMD"' "$RAW_LOG_FILE"
+  else
+    script -q "$RAW_LOG_FILE" "$SHELL_BIN" -lc "$CMD"
+  fi
+}
+
+(
+  run_agent 2>&1 | tee -a "$LOG_FILE"
+) &
+PIPE_PID=$!
+
+LAST_BYTES=0
+SILENT=0
+
+while kill -0 "$PIPE_PID" >/dev/null 2>&1; do
+  sleep "$INTERVAL" || true
+  if ! kill -0 "$PIPE_PID" >/dev/null 2>&1; then
+    break
+  fi
+
+  if [[ -f "$LOG_FILE" ]]; then
+    BYTES="$(wc -c 2>/dev/null < "$LOG_FILE" | tr -d ' ' || true)"
+    LOG_LINES="$(wc -l 2>/dev/null < "$LOG_FILE" | tr -d ' ' || true)"
+
+    if [[ "\${BYTES:-0}" -le "$LAST_BYTES" ]]; then
+      SILENT=$((SILENT + 1))
+    else
+      SILENT=0
+    fi
+    LAST_BYTES="\${BYTES:-0}"
+
+    CHANGED_FILES=0
+    if [[ -f "$MARKER_FILE" ]]; then
+      CHANGED_FILES="$(find "$WORKDIR" -type f -newer "$MARKER_FILE" 2>/dev/null | wc -l | tr -d ' ' || true)"
+    fi
+
+    if [[ "$SILENT" -ge 3 ]]; then
+      MIN=$(( (SILENT * INTERVAL) / 60 ))
+      notify_best_effort "⏸️ [$SCREEN_SESSION] No output for ~\${MIN}m. Still running. (log: \${LOG_LINES} lines, \${CHANGED_FILES} files touched)"
+      SILENT=0
+    else
+      notify_best_effort "⏳ [$SCREEN_SESSION] Still running. (log: \${LOG_LINES} lines, \${CHANGED_FILES} files touched)"
+    fi
+  else
+    notify_best_effort "⏳ [$SCREEN_SESSION] Still running. (log pending)"
+  fi
+done
+
+# Capture the status without tripping set -e, so failures still reach notify_critical.
+EXIT_CODE=0
+wait "$PIPE_PID" || EXIT_CODE=$?
+
+CHANGED_TOTAL=0
+CHANGED_LIST=""
+if [[ -f "$MARKER_FILE" ]]; then
+  CHANGED_TOTAL="$(find "$WORKDIR" -type f -newer "$MARKER_FILE" 2>/dev/null | wc -l | tr -d ' ' || true)"
+  CHANGED_LIST="$(find "$WORKDIR" -type f -newer "$MARKER_FILE" 2>/dev/null | sed "s#^$WORKDIR/##" | sort | head -n 12 | paste -sd ',' - | sed 's/,/, /g' || true)"
+fi
+
+TAIL_LINE="$(tail -n 1 "$LOG_FILE" 2>/dev/null | tr -d '\\r' | head -c 200 || true)"
+
+if [[ "$EXIT_CODE" -eq 0 ]]; then
+  MSG="✅ [$SCREEN_SESSION] Done (exit 0)."
+else
+  MSG="❌ [$SCREEN_SESSION] Exited (code $EXIT_CODE)."
+fi
+
+if [[ "$CHANGED_TOTAL" -gt 0 ]]; then
+  MSG="$MSG\\n🧾 Changed files: $CHANGED_TOTAL"
+  if [[ -n "$CHANGED_LIST" ]]; then
+    MSG="$MSG\\n• $CHANGED_LIST"
+    if [[ "$CHANGED_TOTAL" -gt 12 ]]; then
+      MSG="$MSG, …"
+    fi
+  fi
+fi
+
+if [[ -n "$TAIL_LINE" ]]; then
+  MSG="$MSG\\n🪵 Last log line: $TAIL_LINE"
+fi
+
+MSG="$MSG\\n📄 Log: $LOG_FILE\\n🔎 Attach: screen -r $SCREEN_SESSION"
+notify_critical "$MSG"
+
+if command -v jq >/dev/null 2>&1 && [[ -f "$META_FILE" ]]; then
+  tmp="$META_FILE.tmp"
+  jq --argjson code "$EXIT_CODE" '. + {exitCode:$code, endedAt:(now|todateiso8601)}' "$META_FILE" > "$tmp" 2>/dev/null && mv "$tmp" "$META_FILE" || true
+fi
+
+exit "$EXIT_CODE"
+`;
+}
+
+/**
+ * Writes the command and wrapper script into the log directory, then launches the wrapper
+ * in a detached `screen` session. Resolves once `screen` has been spawned, not when the
+ * agent exits. Errors from the file writes or the spawn propagate to the caller.
+ */
+async function spawnResilientCodingAgentSession(opts: {
+  command: string;
+  workdir: string;
+  env: Record<string, string>;
+  sessionKey: string;
+  agentId: string;
+  warnings: string[];
+}) {
+  const logDir = resolveCodingAgentLogDir();
+  await fs.mkdir(logDir, { recursive: true });
+
+  const slug = createSessionSlug();
+  const screenSession = `coding-${slug}`;
+
+  const cmdFile = path.join(logDir, `${screenSession}.cmd`);
+  const logFile = path.join(logDir, `${screenSession}.log`);
+  const rawLogFile = path.join(logDir, `${screenSession}.raw.log`);
+  const markerFile = path.join(logDir, `${screenSession}.start`);
+  const metaFile = path.join(logDir, `${screenSession}.json`);
+  const wrapperFile = path.join(logDir, `${screenSession}.sh`);
+
+  // The command may embed secrets (e.g. leading env assignments); keep it owner-only.
+  await fs.writeFile(cmdFile, opts.command, { encoding: "utf8", mode: 0o600 });
+  await fs.writeFile(wrapperFile, buildResilientCodingAgentWrapperScript(), "utf8");
+  await fs.chmod(wrapperFile, 0o755);
+
+  const env = { ...opts.env };
+  if (!env.CLAWDBOT_CODING_AGENT_SHELL) {
+    env.CLAWDBOT_CODING_AGENT_SHELL = getShellConfig().shell;
+  }
+
+  const { child } = await spawnWithFallback({
+    argv: [
+      "screen",
+      "-dmS",
+      screenSession,
+      wrapperFile,
+      opts.sessionKey,
+      opts.agentId,
+      opts.workdir,
+      cmdFile,
+      logFile,
+      rawLogFile,
+      markerFile,
+      metaFile,
+      screenSession,
+    ],
+    options: {
+      cwd: opts.workdir,
+      env,
+      detached: process.platform !== "win32",
+      stdio: "ignore",
+      windowsHide: true,
+    },
+    fallbacks: [
+      {
+        label: "no-detach",
+        options: { detached: false },
+      },
+    ],
+    onFallback: (err, fallback) => {
+      const errText = formatSpawnError(err);
+      const warning = `Warning: spawn failed (${errText}); retrying with ${fallback.label}.`;
+      logWarn(`exec: resilient spawn failed (${errText}); retrying with ${fallback.label}.`);
+      opts.warnings.push(warning);
+    },
+  });
+  child.unref?.();
+
+  return { screenSession, logFile, rawLogFile };
+}
+
 function maybeNotifyOnExit(session: ProcessSession, status: "completed" | "failed") {
   if (!session.backgrounded || !session.notifyOnExit || session.exitNotified) return;
   const sessionKey = session.sessionKey?.trim();
@@ -885,6 +1263,61 @@ export function createExecTool(
       }
       applyPathPrepend(env, defaultPathPrepend);
 
+      // Resilient coding agents: long-running PTY CLIs (claude/codex/opencode/pi) die when the
+      // gateway restarts because their PTY is owned by the gateway process. When the model
+      // backgrounds a coding agent, run it inside a detached `screen` session with its own PTY
+      // and a lightweight notifier so the job survives restarts and users get progress updates.
+      // NOTE(review): this branch returns before any approval/allowlist checks that may follow
+      // for non-node hosts (not visible in this hunk) - confirm that skipping them is intended.
+      if (
+        !sandbox &&
+        host !== "node" &&
+        allowBackground &&
+        notifySessionKey &&
+        yieldWindow !== null &&
+        isLikelyCodingAgentCommand(params.command) &&
+        process.platform !== "win32"
+      ) {
+        try {
+          const started = await spawnResilientCodingAgentSession({
+            command: params.command,
+            workdir,
+            env,
+            sessionKey: notifySessionKey,
+            agentId: agentId ?? "main",
+            warnings,
+          });
+
+          const warningText = warnings.length ? `${warnings.join("\n")}\n\n` : "";
+          return {
+            content: [
+              {
+                type: "text",
+                text:
+                  `${warningText}Resilient coding agent started in screen session "${started.screenSession}".\n` +
+                  `Workdir: ${workdir}\n` +
+                  `Log: ${started.logFile}\n` +
+                  `Attach: screen -r ${started.screenSession}\n` +
+                  `Tail: tail -f ${started.logFile}`,
+              },
+            ],
+            details: {
+              status: "running",
+              mode: "resilient-screen",
+              screenSession: started.screenSession,
+              logFile: started.logFile,
+              rawLogFile: started.rawLogFile,
+              workdir,
+            },
+          } satisfies AgentToolResult<ExecToolDetails>;
+        } catch (err) {
+          const errText = err instanceof Error ? err.message : String(err);
+          warnings.push(
+            `Warning: resilient coding agent spawn failed (${errText}); falling back to normal exec.`,
+          );
+        }
+      }
+
       if (host === "node") {
         const approvals = resolveExecApprovals(agentId, { security, ask });
         const hostSecurity = minSecurity(security, approvals.agent.security);
@@ -1492,4 +1925,11 @@ export function createExecTool(
   };
 }
 
+export const __testing = {
+  isLikelyCodingAgentCommand,
+  isAlreadyResilientCodingCommand,
+  resolveLeadingCommandBinary,
+  tokenizeCommand,
+};
+
 export const execTool = createExecTool();