Merge c00695c132 into 87267fad4f
This commit is contained in:
commit
d5d7fdaac2
@ -159,4 +159,77 @@ describe("acquireSessionWriteLock", () => {
|
|||||||
expect(process.listeners("SIGINT")).toContain(keepAlive);
|
expect(process.listeners("SIGINT")).toContain(keepAlive);
|
||||||
process.off("SIGINT", keepAlive);
|
process.off("SIGINT", keepAlive);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("reclaims stale lock from same PID (container restart scenario)", async () => {
|
||||||
|
// Simulates: container crashes, lock file remains with pid=process.pid,
|
||||||
|
// container restarts and gets the same PID due to Docker PID reuse.
|
||||||
|
// The lock should be reclaimed since it's from the same hostname but different instanceId.
|
||||||
|
const root = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-lock-pid-reuse-"));
|
||||||
|
try {
|
||||||
|
const sessionFile = path.join(root, "sessions.json");
|
||||||
|
const lockPath = `${sessionFile}.lock`;
|
||||||
|
|
||||||
|
// Write a stale lock file with current hostname and PID but different instanceId
|
||||||
|
// (simulating a previous process incarnation)
|
||||||
|
await fs.writeFile(
|
||||||
|
lockPath,
|
||||||
|
JSON.stringify({
|
||||||
|
pid: process.pid,
|
||||||
|
createdAt: new Date(Date.now() - 10_000).toISOString(),
|
||||||
|
hostname: os.hostname(),
|
||||||
|
instanceId: `${os.hostname()}-${process.pid}-0`, // Different start time
|
||||||
|
}),
|
||||||
|
"utf8",
|
||||||
|
);
|
||||||
|
|
||||||
|
// Should be able to acquire the lock: same hostname+pid but different instanceId
|
||||||
|
const lock = await acquireSessionWriteLock({ sessionFile, timeoutMs: 2000 });
|
||||||
|
|
||||||
|
// Verify the lock was reclaimed (new timestamp and instanceId)
|
||||||
|
const raw = await fs.readFile(lockPath, "utf8");
|
||||||
|
const payload = JSON.parse(raw) as { pid: number; createdAt: string; instanceId: string };
|
||||||
|
expect(payload.pid).toBe(process.pid);
|
||||||
|
const lockAge = Date.now() - Date.parse(payload.createdAt);
|
||||||
|
expect(lockAge).toBeLessThan(1000); // Fresh lock, created just now
|
||||||
|
|
||||||
|
await lock.release();
|
||||||
|
} finally {
|
||||||
|
await fs.rm(root, { recursive: true, force: true });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
it("does NOT reclaim lock from different hostname (multi-container scenario)", async () => {
|
||||||
|
// Simulates: two containers with shared volume, both have PID 1.
|
||||||
|
// Container A holds the lock, Container B should NOT reclaim it.
|
||||||
|
const root = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-lock-multi-container-"));
|
||||||
|
try {
|
||||||
|
const sessionFile = path.join(root, "sessions.json");
|
||||||
|
const lockPath = `${sessionFile}.lock`;
|
||||||
|
|
||||||
|
// Write a lock file from "another container" with same PID but different hostname
|
||||||
|
await fs.writeFile(
|
||||||
|
lockPath,
|
||||||
|
JSON.stringify({
|
||||||
|
pid: process.pid, // Same PID (e.g., both containers are PID 1)
|
||||||
|
createdAt: new Date(Date.now() - 10_000).toISOString(),
|
||||||
|
hostname: "other-container-hostname", // Different hostname
|
||||||
|
instanceId: "other-container-hostname-1-12345",
|
||||||
|
}),
|
||||||
|
"utf8",
|
||||||
|
);
|
||||||
|
|
||||||
|
// Should NOT be able to acquire the lock because hostname differs
|
||||||
|
// The lock will timeout because isAlive(process.pid) returns true
|
||||||
|
await expect(acquireSessionWriteLock({ sessionFile, timeoutMs: 500 })).rejects.toThrow(
|
||||||
|
/timeout/,
|
||||||
|
);
|
||||||
|
|
||||||
|
// Verify the original lock file is still there (not deleted)
|
||||||
|
const raw = await fs.readFile(lockPath, "utf8");
|
||||||
|
const payload = JSON.parse(raw) as { hostname: string };
|
||||||
|
expect(payload.hostname).toBe("other-container-hostname");
|
||||||
|
} finally {
|
||||||
|
await fs.rm(root, { recursive: true, force: true });
|
||||||
|
}
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
@ -1,10 +1,13 @@
|
|||||||
import fsSync from "node:fs";
|
import fsSync from "node:fs";
|
||||||
import fs from "node:fs/promises";
|
import fs from "node:fs/promises";
|
||||||
|
import os from "node:os";
|
||||||
import path from "node:path";
|
import path from "node:path";
|
||||||
|
|
||||||
type LockFilePayload = {
|
type LockFilePayload = {
|
||||||
pid: number;
|
pid: number;
|
||||||
createdAt: string;
|
createdAt: string;
|
||||||
|
instanceId?: string; // Unique identifier: hostname-pid-startTime
|
||||||
|
hostname?: string; // Hostname of the process holding the lock
|
||||||
};
|
};
|
||||||
|
|
||||||
type HeldLock = {
|
type HeldLock = {
|
||||||
@ -15,6 +18,13 @@ type HeldLock = {
|
|||||||
|
|
||||||
const HELD_LOCKS = new Map<string, HeldLock>();
|
const HELD_LOCKS = new Map<string, HeldLock>();
|
||||||
const CLEANUP_SIGNALS = ["SIGINT", "SIGTERM", "SIGQUIT", "SIGABRT"] as const;
|
const CLEANUP_SIGNALS = ["SIGINT", "SIGTERM", "SIGQUIT", "SIGABRT"] as const;
|
||||||
|
|
||||||
|
// Unique identifier for this process instance, combining hostname, PID, and start time.
|
||||||
|
// This distinguishes between:
|
||||||
|
// 1. Same host, PID reuse after restart: same hostname+pid, different start time
|
||||||
|
// 2. Different containers with same PID: different hostname
|
||||||
|
const PROCESS_START_TIME = Math.floor(Date.now() - process.uptime() * 1000);
|
||||||
|
const PROCESS_INSTANCE_ID = `${os.hostname()}-${process.pid}-${PROCESS_START_TIME}`;
|
||||||
type CleanupSignal = (typeof CLEANUP_SIGNALS)[number];
|
type CleanupSignal = (typeof CLEANUP_SIGNALS)[number];
|
||||||
const cleanupHandlers = new Map<CleanupSignal, () => void>();
|
const cleanupHandlers = new Map<CleanupSignal, () => void>();
|
||||||
|
|
||||||
@ -93,7 +103,12 @@ async function readLockPayload(lockPath: string): Promise<LockFilePayload | null
|
|||||||
const parsed = JSON.parse(raw) as Partial<LockFilePayload>;
|
const parsed = JSON.parse(raw) as Partial<LockFilePayload>;
|
||||||
if (typeof parsed.pid !== "number") return null;
|
if (typeof parsed.pid !== "number") return null;
|
||||||
if (typeof parsed.createdAt !== "string") return null;
|
if (typeof parsed.createdAt !== "string") return null;
|
||||||
return { pid: parsed.pid, createdAt: parsed.createdAt };
|
return {
|
||||||
|
pid: parsed.pid,
|
||||||
|
createdAt: parsed.createdAt,
|
||||||
|
instanceId: parsed.instanceId,
|
||||||
|
hostname: parsed.hostname,
|
||||||
|
};
|
||||||
} catch {
|
} catch {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
@ -144,7 +159,16 @@ export async function acquireSessionWriteLock(params: {
|
|||||||
try {
|
try {
|
||||||
const handle = await fs.open(lockPath, "wx");
|
const handle = await fs.open(lockPath, "wx");
|
||||||
await handle.writeFile(
|
await handle.writeFile(
|
||||||
JSON.stringify({ pid: process.pid, createdAt: new Date().toISOString() }, null, 2),
|
JSON.stringify(
|
||||||
|
{
|
||||||
|
pid: process.pid,
|
||||||
|
createdAt: new Date().toISOString(),
|
||||||
|
instanceId: PROCESS_INSTANCE_ID,
|
||||||
|
hostname: os.hostname(),
|
||||||
|
},
|
||||||
|
null,
|
||||||
|
2,
|
||||||
|
),
|
||||||
"utf8",
|
"utf8",
|
||||||
);
|
);
|
||||||
HELD_LOCKS.set(normalizedSessionFile, { count: 1, handle, lockPath });
|
HELD_LOCKS.set(normalizedSessionFile, { count: 1, handle, lockPath });
|
||||||
@ -164,7 +188,35 @@ export async function acquireSessionWriteLock(params: {
|
|||||||
if (code !== "EEXIST") throw err;
|
if (code !== "EEXIST") throw err;
|
||||||
const payload = await readLockPayload(lockPath);
|
const payload = await readLockPayload(lockPath);
|
||||||
const createdAt = payload?.createdAt ? Date.parse(payload.createdAt) : NaN;
|
const createdAt = payload?.createdAt ? Date.parse(payload.createdAt) : NaN;
|
||||||
const stale = !Number.isFinite(createdAt) || Date.now() - createdAt > staleMs;
|
const lockAgeMs = Number.isFinite(createdAt) ? Date.now() - createdAt : Infinity;
|
||||||
|
|
||||||
|
// If the lock file claims to be held by our own hostname and PID but we
|
||||||
|
// don't have it in memory, and the instanceId differs, it's a stale lock
|
||||||
|
// from a previous process incarnation (e.g., container restart where PID
|
||||||
|
// got reused). Clean it up.
|
||||||
|
//
|
||||||
|
// We require hostname to match to avoid accidentally deleting locks from
|
||||||
|
// other containers that happen to share the same PID (e.g., multiple
|
||||||
|
// containers with PID 1 on a shared volume - each has a different hostname).
|
||||||
|
//
|
||||||
|
// We add a 5-second grace period to avoid accidentally deleting a lock that
|
||||||
|
// was just created but not yet registered in HELD_LOCKS (race condition).
|
||||||
|
const ownPidGraceMs = 5_000;
|
||||||
|
const sameHost = payload?.hostname === os.hostname();
|
||||||
|
const samePid = payload?.pid === process.pid;
|
||||||
|
const differentInstance = payload?.instanceId !== PROCESS_INSTANCE_ID;
|
||||||
|
if (
|
||||||
|
sameHost &&
|
||||||
|
samePid &&
|
||||||
|
differentInstance &&
|
||||||
|
!HELD_LOCKS.has(normalizedSessionFile) &&
|
||||||
|
lockAgeMs > ownPidGraceMs
|
||||||
|
) {
|
||||||
|
await fs.rm(lockPath, { force: true });
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
const stale = lockAgeMs > staleMs;
|
||||||
const alive = payload?.pid ? isAlive(payload.pid) : false;
|
const alive = payload?.pid ? isAlive(payload.pid) : false;
|
||||||
if (stale || !alive) {
|
if (stale || !alive) {
|
||||||
await fs.rm(lockPath, { force: true });
|
await fs.rm(lockPath, { force: true });
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user