Merge 1d3e300098 into 09be5d45d5
This commit is contained in:
commit
2ca00ae564
@ -3,7 +3,11 @@ import os from "node:os";
|
|||||||
import path from "node:path";
|
import path from "node:path";
|
||||||
import { describe, expect, it } from "vitest";
|
import { describe, expect, it } from "vitest";
|
||||||
|
|
||||||
import { __testing, acquireSessionWriteLock } from "./session-write-lock.js";
|
import {
|
||||||
|
__testing,
|
||||||
|
acquireSessionWriteLock,
|
||||||
|
releaseAllSessionWriteLocks,
|
||||||
|
} from "./session-write-lock.js";
|
||||||
|
|
||||||
describe("acquireSessionWriteLock", () => {
|
describe("acquireSessionWriteLock", () => {
|
||||||
it("reuses locks across symlinked session paths", async () => {
|
it("reuses locks across symlinked session paths", async () => {
|
||||||
@ -159,4 +163,150 @@ describe("acquireSessionWriteLock", () => {
|
|||||||
expect(process.listeners("SIGINT")).toContain(keepAlive);
|
expect(process.listeners("SIGINT")).toContain(keepAlive);
|
||||||
process.off("SIGINT", keepAlive);
|
process.off("SIGINT", keepAlive);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("reclaims lock with same PID but different nonce (stale from prior boot)", async () => {
|
||||||
|
const root = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-lock-nonce-"));
|
||||||
|
try {
|
||||||
|
const sessionFile = path.join(root, "sessions.json");
|
||||||
|
const lockPath = `${sessionFile}.lock`;
|
||||||
|
|
||||||
|
// Simulate a lock left by a previous boot iteration: same PID, different nonce
|
||||||
|
await fs.writeFile(
|
||||||
|
lockPath,
|
||||||
|
JSON.stringify({
|
||||||
|
pid: process.pid,
|
||||||
|
nonce: "stale-nonce-from-previous-boot",
|
||||||
|
createdAt: new Date().toISOString(),
|
||||||
|
}),
|
||||||
|
"utf8",
|
||||||
|
);
|
||||||
|
|
||||||
|
// Should reclaim the lock despite same PID and fresh createdAt
|
||||||
|
const lock = await acquireSessionWriteLock({ sessionFile, timeoutMs: 2000 });
|
||||||
|
const raw = await fs.readFile(lockPath, "utf8");
|
||||||
|
const payload = JSON.parse(raw) as { pid: number; nonce: string };
|
||||||
|
|
||||||
|
expect(payload.pid).toBe(process.pid);
|
||||||
|
expect(payload.nonce).toBe(__testing.instanceNonce);
|
||||||
|
await lock.release();
|
||||||
|
} finally {
|
||||||
|
await fs.rm(root, { recursive: true, force: true });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
it("does not reclaim lock with matching nonce (held by current boot)", async () => {
|
||||||
|
const root = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-lock-nonce-current-"));
|
||||||
|
try {
|
||||||
|
const sessionFile = path.join(root, "sessions.json");
|
||||||
|
|
||||||
|
// Acquire normally — writes current nonce
|
||||||
|
const lock = await acquireSessionWriteLock({ sessionFile, timeoutMs: 500 });
|
||||||
|
|
||||||
|
// A second acquire on a different normalized path pointing to the same
|
||||||
|
// file should reuse the in-memory held lock (already tested elsewhere),
|
||||||
|
// but if we manually simulate a contention scenario by NOT going through
|
||||||
|
// the in-memory path, the nonce check should NOT treat our own lock as
|
||||||
|
// stale. We verify indirectly: the lock file should contain our nonce.
|
||||||
|
const lockPath = `${sessionFile}.lock`;
|
||||||
|
const raw = await fs.readFile(lockPath, "utf8");
|
||||||
|
const payload = JSON.parse(raw) as { pid: number; nonce: string };
|
||||||
|
|
||||||
|
expect(payload.nonce).toBe(__testing.instanceNonce);
|
||||||
|
await lock.release();
|
||||||
|
} finally {
|
||||||
|
await fs.rm(root, { recursive: true, force: true });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
it("falls back to pid+staleMs for locks without a nonce (backward compat)", async () => {
|
||||||
|
const root = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-lock-no-nonce-"));
|
||||||
|
try {
|
||||||
|
const sessionFile = path.join(root, "sessions.json");
|
||||||
|
const lockPath = `${sessionFile}.lock`;
|
||||||
|
|
||||||
|
// Old-format lock: no nonce, same PID, recent timestamp
|
||||||
|
// This should NOT be reclaimed (backward compat: no nonce → skip nonce check)
|
||||||
|
await fs.writeFile(
|
||||||
|
lockPath,
|
||||||
|
JSON.stringify({
|
||||||
|
pid: process.pid,
|
||||||
|
createdAt: new Date().toISOString(),
|
||||||
|
}),
|
||||||
|
"utf8",
|
||||||
|
);
|
||||||
|
|
||||||
|
// With a short timeout this should fail — lock appears valid (same PID, alive, not stale)
|
||||||
|
await expect(
|
||||||
|
acquireSessionWriteLock({ sessionFile, timeoutMs: 200, staleMs: 60_000 }),
|
||||||
|
).rejects.toThrow(/locked/);
|
||||||
|
} finally {
|
||||||
|
await fs.rm(root, { recursive: true, force: true });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
it("rotates instance nonce on releaseAllSessionWriteLocks", async () => {
|
||||||
|
const root = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-lock-nonce-rotate-"));
|
||||||
|
try {
|
||||||
|
const sessionFile = path.join(root, "sessions.json");
|
||||||
|
|
||||||
|
// Acquire a lock — captures current nonce
|
||||||
|
const nonceBefore = __testing.instanceNonce;
|
||||||
|
await acquireSessionWriteLock({ sessionFile, timeoutMs: 500 });
|
||||||
|
|
||||||
|
// Release all — should rotate the nonce
|
||||||
|
await releaseAllSessionWriteLocks();
|
||||||
|
const nonceAfter = __testing.instanceNonce;
|
||||||
|
|
||||||
|
expect(nonceAfter).not.toBe(nonceBefore);
|
||||||
|
|
||||||
|
// A stale lock file with the OLD nonce should now be reclaimable
|
||||||
|
const lockPath = `${sessionFile}.lock`;
|
||||||
|
await fs.writeFile(
|
||||||
|
lockPath,
|
||||||
|
JSON.stringify({
|
||||||
|
pid: process.pid,
|
||||||
|
nonce: nonceBefore,
|
||||||
|
createdAt: new Date().toISOString(),
|
||||||
|
}),
|
||||||
|
"utf8",
|
||||||
|
);
|
||||||
|
|
||||||
|
const lock = await acquireSessionWriteLock({ sessionFile, timeoutMs: 2000 });
|
||||||
|
const raw = await fs.readFile(lockPath, "utf8");
|
||||||
|
const payload = JSON.parse(raw) as { pid: number; nonce: string };
|
||||||
|
expect(payload.nonce).toBe(nonceAfter);
|
||||||
|
await lock.release();
|
||||||
|
} finally {
|
||||||
|
await fs.rm(root, { recursive: true, force: true });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
it("releaseAllSessionWriteLocks removes all held locks", async () => {
|
||||||
|
const root = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-lock-release-all-"));
|
||||||
|
try {
|
||||||
|
const sessionA = path.join(root, "a.json");
|
||||||
|
const sessionB = path.join(root, "b.json");
|
||||||
|
const lockA = `${sessionA}.lock`;
|
||||||
|
const lockB = `${sessionB}.lock`;
|
||||||
|
|
||||||
|
await acquireSessionWriteLock({ sessionFile: sessionA, timeoutMs: 500 });
|
||||||
|
await acquireSessionWriteLock({ sessionFile: sessionB, timeoutMs: 500 });
|
||||||
|
|
||||||
|
// Both lock files should exist
|
||||||
|
await expect(fs.access(lockA)).resolves.toBeUndefined();
|
||||||
|
await expect(fs.access(lockB)).resolves.toBeUndefined();
|
||||||
|
|
||||||
|
await releaseAllSessionWriteLocks();
|
||||||
|
|
||||||
|
// Both lock files should be removed
|
||||||
|
await expect(fs.access(lockA)).rejects.toThrow();
|
||||||
|
await expect(fs.access(lockB)).rejects.toThrow();
|
||||||
|
|
||||||
|
// Should be able to re-acquire after release
|
||||||
|
const lock = await acquireSessionWriteLock({ sessionFile: sessionA, timeoutMs: 500 });
|
||||||
|
await lock.release();
|
||||||
|
} finally {
|
||||||
|
await fs.rm(root, { recursive: true, force: true });
|
||||||
|
}
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
@ -1,10 +1,28 @@
|
|||||||
|
import { randomBytes } from "node:crypto";
|
||||||
import fsSync from "node:fs";
|
import fsSync from "node:fs";
|
||||||
import fs from "node:fs/promises";
|
import fs from "node:fs/promises";
|
||||||
import path from "node:path";
|
import path from "node:path";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Instance nonce — regenerated each time the gateway server starts within
|
||||||
|
* the process. ESM modules are cached for the process lifetime, so this
|
||||||
|
* must be mutable and explicitly reset via `resetInstanceNonce()` during
|
||||||
|
* shutdown. After an in-process restart (SIGUSR1), lock files written by
|
||||||
|
* the previous server iteration carry a stale nonce, letting us detect
|
||||||
|
* them even when the PID hasn't changed (common in containers where
|
||||||
|
* PID = 1).
|
||||||
|
*/
|
||||||
|
let instanceNonce: string = randomBytes(12).toString("hex");
|
||||||
|
|
||||||
|
function resetInstanceNonce(): void {
|
||||||
|
instanceNonce = randomBytes(12).toString("hex");
|
||||||
|
}
|
||||||
|
|
||||||
type LockFilePayload = {
|
type LockFilePayload = {
|
||||||
pid: number;
|
pid: number;
|
||||||
createdAt: string;
|
createdAt: string;
|
||||||
|
/** Instance nonce — absent in lock files written by older versions. */
|
||||||
|
nonce?: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
type HeldLock = {
|
type HeldLock = {
|
||||||
@ -93,7 +111,11 @@ async function readLockPayload(lockPath: string): Promise<LockFilePayload | null
|
|||||||
const parsed = JSON.parse(raw) as Partial<LockFilePayload>;
|
const parsed = JSON.parse(raw) as Partial<LockFilePayload>;
|
||||||
if (typeof parsed.pid !== "number") return null;
|
if (typeof parsed.pid !== "number") return null;
|
||||||
if (typeof parsed.createdAt !== "string") return null;
|
if (typeof parsed.createdAt !== "string") return null;
|
||||||
return { pid: parsed.pid, createdAt: parsed.createdAt };
|
return {
|
||||||
|
pid: parsed.pid,
|
||||||
|
createdAt: parsed.createdAt,
|
||||||
|
nonce: typeof parsed.nonce === "string" ? parsed.nonce : undefined,
|
||||||
|
};
|
||||||
} catch {
|
} catch {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
@ -144,7 +166,11 @@ export async function acquireSessionWriteLock(params: {
|
|||||||
try {
|
try {
|
||||||
const handle = await fs.open(lockPath, "wx");
|
const handle = await fs.open(lockPath, "wx");
|
||||||
await handle.writeFile(
|
await handle.writeFile(
|
||||||
JSON.stringify({ pid: process.pid, createdAt: new Date().toISOString() }, null, 2),
|
JSON.stringify(
|
||||||
|
{ pid: process.pid, nonce: instanceNonce, createdAt: new Date().toISOString() },
|
||||||
|
null,
|
||||||
|
2,
|
||||||
|
),
|
||||||
"utf8",
|
"utf8",
|
||||||
);
|
);
|
||||||
HELD_LOCKS.set(normalizedSessionFile, { count: 1, handle, lockPath });
|
HELD_LOCKS.set(normalizedSessionFile, { count: 1, handle, lockPath });
|
||||||
@ -166,7 +192,18 @@ export async function acquireSessionWriteLock(params: {
|
|||||||
const createdAt = payload?.createdAt ? Date.parse(payload.createdAt) : NaN;
|
const createdAt = payload?.createdAt ? Date.parse(payload.createdAt) : NaN;
|
||||||
const stale = !Number.isFinite(createdAt) || Date.now() - createdAt > staleMs;
|
const stale = !Number.isFinite(createdAt) || Date.now() - createdAt > staleMs;
|
||||||
const alive = payload?.pid ? isAlive(payload.pid) : false;
|
const alive = payload?.pid ? isAlive(payload.pid) : false;
|
||||||
if (stale || !alive) {
|
|
||||||
|
// Nonce mismatch: lock was written by a previous server iteration of the
|
||||||
|
// same process (e.g. PID 1 in a container after SIGUSR1). The old
|
||||||
|
// iteration is gone even though the PID is still alive, so treat the
|
||||||
|
// lock as stale. If the lock has no nonce (written by an older version)
|
||||||
|
// we fall through to the existing pid+staleMs checks for backward compat.
|
||||||
|
const nonceMismatch =
|
||||||
|
payload?.nonce !== undefined &&
|
||||||
|
payload.nonce !== instanceNonce &&
|
||||||
|
payload.pid === process.pid;
|
||||||
|
|
||||||
|
if (stale || !alive || nonceMismatch) {
|
||||||
await fs.rm(lockPath, { force: true });
|
await fs.rm(lockPath, { force: true });
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -181,8 +218,41 @@ export async function acquireSessionWriteLock(params: {
|
|||||||
throw new Error(`session file locked (timeout ${timeoutMs}ms): ${owner} ${lockPath}`);
|
throw new Error(`session file locked (timeout ${timeoutMs}ms): ${owner} ${lockPath}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Release all held session write locks.
|
||||||
|
*
|
||||||
|
* Call this during gateway shutdown (e.g., before an in-process SIGUSR1
|
||||||
|
* restart) to ensure on-disk `.lock` files are removed. Without this,
|
||||||
|
* locks owned by PID 1 survive an in-process restart because
|
||||||
|
* `isAlive(1)` returns true for the same process, making the lock
|
||||||
|
* appear valid for up to `staleMs` (30 min).
|
||||||
|
*/
|
||||||
|
export async function releaseAllSessionWriteLocks(): Promise<void> {
|
||||||
|
for (const [sessionFile, held] of HELD_LOCKS) {
|
||||||
|
try {
|
||||||
|
await held.handle.close();
|
||||||
|
} catch {
|
||||||
|
// Best effort - handle may already be closed.
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
await fs.rm(held.lockPath, { force: true });
|
||||||
|
} catch {
|
||||||
|
// Best effort - file may already be removed.
|
||||||
|
}
|
||||||
|
HELD_LOCKS.delete(sessionFile);
|
||||||
|
}
|
||||||
|
// Rotate the instance nonce AFTER all locks are cleaned up, so that any
|
||||||
|
// lock files that survive this cleanup (e.g. due to fs errors) are
|
||||||
|
// detectable as stale by the next server iteration. Rotating before
|
||||||
|
// cleanup would let a concurrent acquirer see the old nonce as stale
|
||||||
|
// and reclaim a lock we haven't finished releasing yet.
|
||||||
|
resetInstanceNonce();
|
||||||
|
}
|
||||||
export const __testing = {
|
export const __testing = {
|
||||||
cleanupSignals: [...CLEANUP_SIGNALS],
|
cleanupSignals: [...CLEANUP_SIGNALS],
|
||||||
handleTerminationSignal,
|
handleTerminationSignal,
|
||||||
releaseAllLocksSync,
|
releaseAllLocksSync,
|
||||||
|
get instanceNonce() {
|
||||||
|
return instanceNonce;
|
||||||
|
},
|
||||||
};
|
};
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
import type { Server as HttpServer } from "node:http";
|
import type { Server as HttpServer } from "node:http";
|
||||||
import type { WebSocketServer } from "ws";
|
import type { WebSocketServer } from "ws";
|
||||||
|
import { releaseAllSessionWriteLocks } from "../agents/session-write-lock.js";
|
||||||
import type { CanvasHostHandler, CanvasHostServer } from "../canvas-host/server.js";
|
import type { CanvasHostHandler, CanvasHostServer } from "../canvas-host/server.js";
|
||||||
import { type ChannelId, listChannelPlugins } from "../channels/plugins/index.js";
|
import { type ChannelId, listChannelPlugins } from "../channels/plugins/index.js";
|
||||||
import { stopGmailWatcher } from "../hooks/gmail-watcher.js";
|
import { stopGmailWatcher } from "../hooks/gmail-watcher.js";
|
||||||
@ -96,6 +97,20 @@ export function createGatewayCloseHandler(params: {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
params.chatRunState.clear();
|
params.chatRunState.clear();
|
||||||
|
|
||||||
|
// Release all held session write locks before restarting.
|
||||||
|
// Without this, in-process restarts (SIGUSR1) leave on-disk .lock files
|
||||||
|
// owned by PID 1, which appear valid to the new server iteration because
|
||||||
|
// isAlive(1) returns true (same process). This blocks session access for
|
||||||
|
// up to staleMs (30 min) until the lock expires.
|
||||||
|
try {
|
||||||
|
await releaseAllSessionWriteLocks();
|
||||||
|
} catch {
|
||||||
|
// Best effort — don't let lock cleanup failure prevent shutdown.
|
||||||
|
// The instance nonce mechanism will detect surviving stale locks
|
||||||
|
// on the next server iteration.
|
||||||
|
}
|
||||||
|
|
||||||
for (const c of params.clients) {
|
for (const c of params.clients) {
|
||||||
try {
|
try {
|
||||||
c.socket.close(1012, "service restart");
|
c.socket.close(1012, "service restart");
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user