This commit is contained in:
Tianxiang.Zheng 2026-01-29 22:05:31 -08:00 committed by GitHub
commit d5d7fdaac2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 128 additions and 3 deletions

View File

@ -159,4 +159,77 @@ describe("acquireSessionWriteLock", () => {
expect(process.listeners("SIGINT")).toContain(keepAlive);
process.off("SIGINT", keepAlive);
});
it("reclaims stale lock from same PID (container restart scenario)", async () => {
// Simulates: container crashes, lock file remains with pid=process.pid,
// container restarts and gets the same PID due to Docker PID reuse.
// The lock should be reclaimed since it's from the same hostname but different instanceId.
const root = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-lock-pid-reuse-"));
try {
const sessionFile = path.join(root, "sessions.json");
const lockPath = `${sessionFile}.lock`;
// Write a stale lock file with current hostname and PID but different instanceId
// (simulating a previous process incarnation)
await fs.writeFile(
lockPath,
JSON.stringify({
pid: process.pid,
createdAt: new Date(Date.now() - 10_000).toISOString(),
hostname: os.hostname(),
instanceId: `${os.hostname()}-${process.pid}-0`, // Different start time
}),
"utf8",
);
// Should be able to acquire the lock: same hostname+pid but different instanceId
const lock = await acquireSessionWriteLock({ sessionFile, timeoutMs: 2000 });
// Verify the lock was reclaimed (new timestamp and instanceId)
const raw = await fs.readFile(lockPath, "utf8");
const payload = JSON.parse(raw) as { pid: number; createdAt: string; instanceId: string };
expect(payload.pid).toBe(process.pid);
const lockAge = Date.now() - Date.parse(payload.createdAt);
expect(lockAge).toBeLessThan(1000); // Fresh lock, created just now
await lock.release();
} finally {
await fs.rm(root, { recursive: true, force: true });
}
});
it("does NOT reclaim lock from different hostname (multi-container scenario)", async () => {
// Simulates: two containers with shared volume, both have PID 1.
// Container A holds the lock, Container B should NOT reclaim it.
const root = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-lock-multi-container-"));
try {
const sessionFile = path.join(root, "sessions.json");
const lockPath = `${sessionFile}.lock`;
// Write a lock file from "another container" with same PID but different hostname
await fs.writeFile(
lockPath,
JSON.stringify({
pid: process.pid, // Same PID (e.g., both containers are PID 1)
createdAt: new Date(Date.now() - 10_000).toISOString(),
hostname: "other-container-hostname", // Different hostname
instanceId: "other-container-hostname-1-12345",
}),
"utf8",
);
// Should NOT be able to acquire the lock because hostname differs
// The lock will timeout because isAlive(process.pid) returns true
await expect(acquireSessionWriteLock({ sessionFile, timeoutMs: 500 })).rejects.toThrow(
/timeout/,
);
// Verify the original lock file is still there (not deleted)
const raw = await fs.readFile(lockPath, "utf8");
const payload = JSON.parse(raw) as { hostname: string };
expect(payload.hostname).toBe("other-container-hostname");
} finally {
await fs.rm(root, { recursive: true, force: true });
}
});
});

View File

@ -1,10 +1,13 @@
import fsSync from "node:fs";
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
type LockFilePayload = {
pid: number;
createdAt: string;
instanceId?: string; // Unique identifier: hostname-pid-startTime
hostname?: string; // Hostname of the process holding the lock
};
type HeldLock = {
@ -15,6 +18,13 @@ type HeldLock = {
const HELD_LOCKS = new Map<string, HeldLock>();
const CLEANUP_SIGNALS = ["SIGINT", "SIGTERM", "SIGQUIT", "SIGABRT"] as const;
// Unique identifier for this process instance, combining hostname, PID, and start time.
// This distinguishes between:
// 1. Same host, PID reuse after restart: same hostname+pid, different start time
// 2. Different containers with same PID: different hostname
const PROCESS_START_TIME = Math.floor(Date.now() - process.uptime() * 1000);
const PROCESS_INSTANCE_ID = `${os.hostname()}-${process.pid}-${PROCESS_START_TIME}`;
type CleanupSignal = (typeof CLEANUP_SIGNALS)[number];
const cleanupHandlers = new Map<CleanupSignal, () => void>();
@ -93,7 +103,12 @@ async function readLockPayload(lockPath: string): Promise<LockFilePayload | null
const parsed = JSON.parse(raw) as Partial<LockFilePayload>;
if (typeof parsed.pid !== "number") return null;
if (typeof parsed.createdAt !== "string") return null;
return { pid: parsed.pid, createdAt: parsed.createdAt };
return {
pid: parsed.pid,
createdAt: parsed.createdAt,
instanceId: parsed.instanceId,
hostname: parsed.hostname,
};
} catch {
return null;
}
@ -144,7 +159,16 @@ export async function acquireSessionWriteLock(params: {
try {
const handle = await fs.open(lockPath, "wx");
await handle.writeFile(
JSON.stringify({ pid: process.pid, createdAt: new Date().toISOString() }, null, 2),
JSON.stringify(
{
pid: process.pid,
createdAt: new Date().toISOString(),
instanceId: PROCESS_INSTANCE_ID,
hostname: os.hostname(),
},
null,
2,
),
"utf8",
);
HELD_LOCKS.set(normalizedSessionFile, { count: 1, handle, lockPath });
@ -164,7 +188,35 @@ export async function acquireSessionWriteLock(params: {
if (code !== "EEXIST") throw err;
const payload = await readLockPayload(lockPath);
const createdAt = payload?.createdAt ? Date.parse(payload.createdAt) : NaN;
const stale = !Number.isFinite(createdAt) || Date.now() - createdAt > staleMs;
const lockAgeMs = Number.isFinite(createdAt) ? Date.now() - createdAt : Infinity;
// If the lock file claims to be held by our own hostname and PID but we
// don't have it in memory, and the instanceId differs, it's a stale lock
// from a previous process incarnation (e.g., container restart where PID
// got reused). Clean it up.
//
// We require hostname to match to avoid accidentally deleting locks from
// other containers that happen to share the same PID (e.g., multiple
// containers with PID 1 on a shared volume - each has a different hostname).
//
// We add a 5-second grace period to avoid accidentally deleting a lock that
// was just created but not yet registered in HELD_LOCKS (race condition).
const ownPidGraceMs = 5_000;
const sameHost = payload?.hostname === os.hostname();
const samePid = payload?.pid === process.pid;
const differentInstance = payload?.instanceId !== PROCESS_INSTANCE_ID;
if (
sameHost &&
samePid &&
differentInstance &&
!HELD_LOCKS.has(normalizedSessionFile) &&
lockAgeMs > ownPidGraceMs
) {
await fs.rm(lockPath, { force: true });
continue;
}
const stale = lockAgeMs > staleMs;
const alive = payload?.pid ? isAlive(payload.pid) : false;
if (stale || !alive) {
await fs.rm(lockPath, { force: true });