diff --git a/src/agents/session-write-lock.test.ts b/src/agents/session-write-lock.test.ts index 27e793d5a..c6cb9d3cb 100644 --- a/src/agents/session-write-lock.test.ts +++ b/src/agents/session-write-lock.test.ts @@ -159,4 +159,77 @@ describe("acquireSessionWriteLock", () => { expect(process.listeners("SIGINT")).toContain(keepAlive); process.off("SIGINT", keepAlive); }); + + it("reclaims stale lock from same PID (container restart scenario)", async () => { + // Simulates: container crashes, lock file remains with pid=process.pid, + // container restarts and gets the same PID due to Docker PID reuse. + // The lock should be reclaimed since it's from the same hostname but different instanceId. + const root = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-lock-pid-reuse-")); + try { + const sessionFile = path.join(root, "sessions.json"); + const lockPath = `${sessionFile}.lock`; + + // Write a stale lock file with current hostname and PID but different instanceId + // (simulating a previous process incarnation) + await fs.writeFile( + lockPath, + JSON.stringify({ + pid: process.pid, + createdAt: new Date(Date.now() - 10_000).toISOString(), + hostname: os.hostname(), + instanceId: `${os.hostname()}-${process.pid}-0`, // Different start time + }), + "utf8", + ); + + // Should be able to acquire the lock: same hostname+pid but different instanceId + const lock = await acquireSessionWriteLock({ sessionFile, timeoutMs: 2000 }); + + // Verify the lock was reclaimed (new timestamp and instanceId) + const raw = await fs.readFile(lockPath, "utf8"); + const payload = JSON.parse(raw) as { pid: number; createdAt: string; instanceId: string }; + expect(payload.pid).toBe(process.pid); + const lockAge = Date.now() - Date.parse(payload.createdAt); + expect(lockAge).toBeLessThan(1000); // Fresh lock, created just now + + await lock.release(); + } finally { + await fs.rm(root, { recursive: true, force: true }); + } + }); + + it("does NOT reclaim lock from different hostname (multi-container scenario)", async () => { + // Simulates: two containers with shared volume, both have PID 1. + // Container A holds the lock, Container B should NOT reclaim it. + const root = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-lock-multi-container-")); + try { + const sessionFile = path.join(root, "sessions.json"); + const lockPath = `${sessionFile}.lock`; + + // Write a lock file from "another container" with same PID but different hostname + await fs.writeFile( + lockPath, + JSON.stringify({ + pid: process.pid, // Same PID (e.g., both containers are PID 1) + createdAt: new Date(Date.now() - 10_000).toISOString(), + hostname: "other-container-hostname", // Different hostname + instanceId: "other-container-hostname-1-12345", + }), + "utf8", + ); + + // Should NOT be able to acquire the lock because hostname differs + // The lock will timeout because isAlive(process.pid) returns true + await expect(acquireSessionWriteLock({ sessionFile, timeoutMs: 500 })).rejects.toThrow( + /timeout/, + ); + + // Verify the original lock file is still there (not deleted) + const raw = await fs.readFile(lockPath, "utf8"); + const payload = JSON.parse(raw) as { hostname: string }; + expect(payload.hostname).toBe("other-container-hostname"); + } finally { + await fs.rm(root, { recursive: true, force: true }); + } + }); }); diff --git a/src/agents/session-write-lock.ts b/src/agents/session-write-lock.ts index 82a2428da..31e9a3b81 100644 --- a/src/agents/session-write-lock.ts +++ b/src/agents/session-write-lock.ts @@ -1,10 +1,13 @@ import fsSync from "node:fs"; import fs from "node:fs/promises"; +import os from "node:os"; import path from "node:path"; type LockFilePayload = { pid: number; createdAt: string; + instanceId?: string; // Unique identifier: hostname-pid-startTime + hostname?: string; // Hostname of the process holding the lock }; type HeldLock = { @@ -15,6 +18,13 @@ type HeldLock = { const HELD_LOCKS = new Map(); const CLEANUP_SIGNALS = ["SIGINT", "SIGTERM", "SIGQUIT", "SIGABRT"] as const; + +// Unique identifier for this process instance, combining hostname, PID, and start time. +// This distinguishes between: +// 1. Same host, PID reuse after restart: same hostname+pid, different start time +// 2. Different containers with same PID: different hostname +const PROCESS_START_TIME = Math.floor(Date.now() - process.uptime() * 1000); +const PROCESS_INSTANCE_ID = `${os.hostname()}-${process.pid}-${PROCESS_START_TIME}`; type CleanupSignal = (typeof CLEANUP_SIGNALS)[number]; const cleanupHandlers = new Map void>(); @@ -93,7 +103,12 @@ async function readLockPayload(lockPath: string): Promise; if (typeof parsed.pid !== "number") return null; if (typeof parsed.createdAt !== "string") return null; - return { pid: parsed.pid, createdAt: parsed.createdAt }; + return { + pid: parsed.pid, + createdAt: parsed.createdAt, + instanceId: parsed.instanceId, + hostname: parsed.hostname, + }; } catch { return null; } @@ -144,7 +159,16 @@ export async function acquireSessionWriteLock(params: { try { const handle = await fs.open(lockPath, "wx"); await handle.writeFile( - JSON.stringify({ pid: process.pid, createdAt: new Date().toISOString() }, null, 2), + JSON.stringify( + { + pid: process.pid, + createdAt: new Date().toISOString(), + instanceId: PROCESS_INSTANCE_ID, + hostname: os.hostname(), + }, + null, + 2, + ), "utf8", ); HELD_LOCKS.set(normalizedSessionFile, { count: 1, handle, lockPath }); @@ -164,7 +188,35 @@ export async function acquireSessionWriteLock(params: { if (code !== "EEXIST") throw err; const payload = await readLockPayload(lockPath); const createdAt = payload?.createdAt ? Date.parse(payload.createdAt) : NaN; - const stale = !Number.isFinite(createdAt) || Date.now() - createdAt > staleMs; + const lockAgeMs = Number.isFinite(createdAt) ? Date.now() - createdAt : Infinity; + + // If the lock file claims to be held by our own hostname and PID but we + // don't have it in memory, and the instanceId differs, it's a stale lock + // from a previous process incarnation (e.g., container restart where PID + // got reused). Clean it up. + // + // We require hostname to match to avoid accidentally deleting locks from + // other containers that happen to share the same PID (e.g., multiple + // containers with PID 1 on a shared volume - each has a different hostname). + // + // We add a 5-second grace period to avoid accidentally deleting a lock that + // was just created but not yet registered in HELD_LOCKS (race condition). + const ownPidGraceMs = 5_000; + const sameHost = payload?.hostname === os.hostname(); + const samePid = payload?.pid === process.pid; + const differentInstance = payload?.instanceId !== PROCESS_INSTANCE_ID; + if ( + sameHost && + samePid && + differentInstance && + !HELD_LOCKS.has(normalizedSessionFile) && + lockAgeMs > ownPidGraceMs + ) { + await fs.rm(lockPath, { force: true }); + continue; + } + + const stale = lockAgeMs > staleMs; const alive = payload?.pid ? isAlive(payload.pid) : false; if (stale || !alive) { await fs.rm(lockPath, { force: true });