fix: handle PID reuse in container restart scenarios
When a container restarts, the new process may get the same PID as the previous one. If the old process left a lock file behind, the new process would see a lock held by its own PID but not tracked in memory. This change detects this scenario by adding hostname and instanceId to the lock payload: - Only reclaims locks from the same hostname (prevents multi-container conflicts) - Uses instanceId (hostname-pid-startTime) to distinguish process incarnations - Adds 5-second grace period to prevent race conditions Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
5f4715acfc
commit
d8d75d9665
@ -159,4 +159,77 @@ describe("acquireSessionWriteLock", () => {
|
||||
expect(process.listeners("SIGINT")).toContain(keepAlive);
|
||||
process.off("SIGINT", keepAlive);
|
||||
});
|
||||
|
||||
it("reclaims stale lock from same PID (container restart scenario)", async () => {
|
||||
// Simulates: container crashes, lock file remains with pid=process.pid,
|
||||
// container restarts and gets the same PID due to Docker PID reuse.
|
||||
// The lock should be reclaimed since it's from the same hostname but different instanceId.
|
||||
const root = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-lock-pid-reuse-"));
|
||||
try {
|
||||
const sessionFile = path.join(root, "sessions.json");
|
||||
const lockPath = `${sessionFile}.lock`;
|
||||
|
||||
// Write a stale lock file with current hostname and PID but different instanceId
|
||||
// (simulating a previous process incarnation)
|
||||
await fs.writeFile(
|
||||
lockPath,
|
||||
JSON.stringify({
|
||||
pid: process.pid,
|
||||
createdAt: new Date(Date.now() - 10_000).toISOString(),
|
||||
hostname: os.hostname(),
|
||||
instanceId: `${os.hostname()}-${process.pid}-0`, // Different start time
|
||||
}),
|
||||
"utf8",
|
||||
);
|
||||
|
||||
// Should be able to acquire the lock: same hostname+pid but different instanceId
|
||||
const lock = await acquireSessionWriteLock({ sessionFile, timeoutMs: 2000 });
|
||||
|
||||
// Verify the lock was reclaimed (new timestamp and instanceId)
|
||||
const raw = await fs.readFile(lockPath, "utf8");
|
||||
const payload = JSON.parse(raw) as { pid: number; createdAt: string; instanceId: string };
|
||||
expect(payload.pid).toBe(process.pid);
|
||||
const lockAge = Date.now() - Date.parse(payload.createdAt);
|
||||
expect(lockAge).toBeLessThan(1000); // Fresh lock, created just now
|
||||
|
||||
await lock.release();
|
||||
} finally {
|
||||
await fs.rm(root, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
|
||||
it("does NOT reclaim lock from different hostname (multi-container scenario)", async () => {
|
||||
// Simulates: two containers with shared volume, both have PID 1.
|
||||
// Container A holds the lock, Container B should NOT reclaim it.
|
||||
const root = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-lock-multi-container-"));
|
||||
try {
|
||||
const sessionFile = path.join(root, "sessions.json");
|
||||
const lockPath = `${sessionFile}.lock`;
|
||||
|
||||
// Write a lock file from "another container" with same PID but different hostname
|
||||
await fs.writeFile(
|
||||
lockPath,
|
||||
JSON.stringify({
|
||||
pid: process.pid, // Same PID (e.g., both containers are PID 1)
|
||||
createdAt: new Date(Date.now() - 10_000).toISOString(),
|
||||
hostname: "other-container-hostname", // Different hostname
|
||||
instanceId: "other-container-hostname-1-12345",
|
||||
}),
|
||||
"utf8",
|
||||
);
|
||||
|
||||
// Should NOT be able to acquire the lock because hostname differs
|
||||
// The lock will timeout because isAlive(process.pid) returns true
|
||||
await expect(acquireSessionWriteLock({ sessionFile, timeoutMs: 500 })).rejects.toThrow(
|
||||
/timeout/,
|
||||
);
|
||||
|
||||
// Verify the original lock file is still there (not deleted)
|
||||
const raw = await fs.readFile(lockPath, "utf8");
|
||||
const payload = JSON.parse(raw) as { hostname: string };
|
||||
expect(payload.hostname).toBe("other-container-hostname");
|
||||
} finally {
|
||||
await fs.rm(root, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
@ -1,10 +1,13 @@
|
||||
import fsSync from "node:fs";
|
||||
import fs from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
|
||||
type LockFilePayload = {
|
||||
pid: number;
|
||||
createdAt: string;
|
||||
instanceId?: string; // Unique identifier: hostname-pid-startTime
|
||||
hostname?: string; // Hostname of the process holding the lock
|
||||
};
|
||||
|
||||
type HeldLock = {
|
||||
@ -15,6 +18,13 @@ type HeldLock = {
|
||||
|
||||
const HELD_LOCKS = new Map<string, HeldLock>();
|
||||
const CLEANUP_SIGNALS = ["SIGINT", "SIGTERM", "SIGQUIT", "SIGABRT"] as const;
|
||||
|
||||
// Unique identifier for this process instance, combining hostname, PID, and start time.
|
||||
// This distinguishes between:
|
||||
// 1. Same host, PID reuse after restart: same hostname+pid, different start time
|
||||
// 2. Different containers with same PID: different hostname
|
||||
const PROCESS_START_TIME = Math.floor(Date.now() - process.uptime() * 1000);
|
||||
const PROCESS_INSTANCE_ID = `${os.hostname()}-${process.pid}-${PROCESS_START_TIME}`;
|
||||
type CleanupSignal = (typeof CLEANUP_SIGNALS)[number];
|
||||
const cleanupHandlers = new Map<CleanupSignal, () => void>();
|
||||
|
||||
@ -93,7 +103,12 @@ async function readLockPayload(lockPath: string): Promise<LockFilePayload | null
|
||||
const parsed = JSON.parse(raw) as Partial<LockFilePayload>;
|
||||
if (typeof parsed.pid !== "number") return null;
|
||||
if (typeof parsed.createdAt !== "string") return null;
|
||||
return { pid: parsed.pid, createdAt: parsed.createdAt };
|
||||
return {
|
||||
pid: parsed.pid,
|
||||
createdAt: parsed.createdAt,
|
||||
instanceId: parsed.instanceId,
|
||||
hostname: parsed.hostname,
|
||||
};
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
@ -144,7 +159,16 @@ export async function acquireSessionWriteLock(params: {
|
||||
try {
|
||||
const handle = await fs.open(lockPath, "wx");
|
||||
await handle.writeFile(
|
||||
JSON.stringify({ pid: process.pid, createdAt: new Date().toISOString() }, null, 2),
|
||||
JSON.stringify(
|
||||
{
|
||||
pid: process.pid,
|
||||
createdAt: new Date().toISOString(),
|
||||
instanceId: PROCESS_INSTANCE_ID,
|
||||
hostname: os.hostname(),
|
||||
},
|
||||
null,
|
||||
2,
|
||||
),
|
||||
"utf8",
|
||||
);
|
||||
HELD_LOCKS.set(normalizedSessionFile, { count: 1, handle, lockPath });
|
||||
@ -164,7 +188,35 @@ export async function acquireSessionWriteLock(params: {
|
||||
if (code !== "EEXIST") throw err;
|
||||
const payload = await readLockPayload(lockPath);
|
||||
const createdAt = payload?.createdAt ? Date.parse(payload.createdAt) : NaN;
|
||||
const stale = !Number.isFinite(createdAt) || Date.now() - createdAt > staleMs;
|
||||
const lockAgeMs = Number.isFinite(createdAt) ? Date.now() - createdAt : Infinity;
|
||||
|
||||
// If the lock file claims to be held by our own hostname and PID but we
|
||||
// don't have it in memory, and the instanceId differs, it's a stale lock
|
||||
// from a previous process incarnation (e.g., container restart where PID
|
||||
// got reused). Clean it up.
|
||||
//
|
||||
// We require hostname to match to avoid accidentally deleting locks from
|
||||
// other containers that happen to share the same PID (e.g., multiple
|
||||
// containers with PID 1 on a shared volume - each has a different hostname).
|
||||
//
|
||||
// We add a 5-second grace period to avoid accidentally deleting a lock that
|
||||
// was just created but not yet registered in HELD_LOCKS (race condition).
|
||||
const ownPidGraceMs = 5_000;
|
||||
const sameHost = payload?.hostname === os.hostname();
|
||||
const samePid = payload?.pid === process.pid;
|
||||
const differentInstance = payload?.instanceId !== PROCESS_INSTANCE_ID;
|
||||
if (
|
||||
sameHost &&
|
||||
samePid &&
|
||||
differentInstance &&
|
||||
!HELD_LOCKS.has(normalizedSessionFile) &&
|
||||
lockAgeMs > ownPidGraceMs
|
||||
) {
|
||||
await fs.rm(lockPath, { force: true });
|
||||
continue;
|
||||
}
|
||||
|
||||
const stale = lockAgeMs > staleMs;
|
||||
const alive = payload?.pid ? isAlive(payload.pid) : false;
|
||||
if (stale || !alive) {
|
||||
await fs.rm(lockPath, { force: true });
|
||||
|
||||
Loading…
Reference in New Issue
Block a user