fix: add instance nonce to session locks for stale detection after SIGUSR1

The releaseAllSessionWriteLocks() call during shutdown is best-effort. If it fails (crash, timeout, etc.), stale lock files survive with the same PID. Since isAlive(pid) returns true for the still-running process, the lock appears valid for up to staleMs (30 min). Add a per-instance nonce to lock file payloads. The nonce is rotated via resetInstanceNonce() when releaseAllSessionWriteLocks() is called during shutdown, so the next server iteration within the same process gets a fresh nonce. When acquiring a lock, if the on-disk nonce differs from the current instance nonce and the PID matches, the lock is treated as stale and immediately reclaimed. The nonce is mutable (not const) because ESM modules are cached for the process lifetime and are NOT re-evaluated on in-process SIGUSR1 restart. Backward compatible: lock files without a nonce (from older versions) fall through to the existing pid + staleMs checks unchanged. Tests: - Reclaims lock with same PID but different nonce - Preserves lock with matching nonce (current instance) - Falls back to pid+staleMs for nonce-less locks (backward compat) - Verifies nonce rotation on releaseAllSessionWriteLocks
2026-01-29 14:07:14 +00:00 · 2026-01-29 14:07:14 +00:00 · 1d3e300098
commit 1d3e300098
parent 23ec0bceb6
3 changed files with 173 additions and 5 deletions
--- a/src/agents/session-write-lock.test.ts
+++ b/src/agents/session-write-lock.test.ts
@ -164,6 +164,123 @@ describe("acquireSessionWriteLock", () => {
    process.off("SIGINT", keepAlive);
  });

+  it("reclaims lock with same PID but different nonce (stale from prior boot)", async () => {
+    const root = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-lock-nonce-"));
+    try {
+      const sessionFile = path.join(root, "sessions.json");
+      const lockPath = `${sessionFile}.lock`;
+
+      // Simulate a lock left by a previous boot iteration: same PID, different nonce
+      await fs.writeFile(
+        lockPath,
+        JSON.stringify({
+          pid: process.pid,
+          nonce: "stale-nonce-from-previous-boot",
+          createdAt: new Date().toISOString(),
+        }),
+        "utf8",
+      );
+
+      // Should reclaim the lock despite same PID and fresh createdAt
+      const lock = await acquireSessionWriteLock({ sessionFile, timeoutMs: 2000 });
+      const raw = await fs.readFile(lockPath, "utf8");
+      const payload = JSON.parse(raw) as { pid: number; nonce: string };
+
+      expect(payload.pid).toBe(process.pid);
+      expect(payload.nonce).toBe(__testing.instanceNonce);
+      await lock.release();
+    } finally {
+      await fs.rm(root, { recursive: true, force: true });
+    }
+  });
+
+  it("does not reclaim lock with matching nonce (held by current boot)", async () => {
+    const root = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-lock-nonce-current-"));
+    try {
+      const sessionFile = path.join(root, "sessions.json");
+
+      // Acquire normally — writes current nonce
+      const lock = await acquireSessionWriteLock({ sessionFile, timeoutMs: 500 });
+
+      // A second acquire on a different normalized path pointing to the same
+      // file should reuse the in-memory held lock (already tested elsewhere),
+      // but if we manually simulate a contention scenario by NOT going through
+      // the in-memory path, the nonce check should NOT treat our own lock as
+      // stale. We verify indirectly: the lock file should contain our nonce.
+      const lockPath = `${sessionFile}.lock`;
+      const raw = await fs.readFile(lockPath, "utf8");
+      const payload = JSON.parse(raw) as { pid: number; nonce: string };
+
+      expect(payload.nonce).toBe(__testing.instanceNonce);
+      await lock.release();
+    } finally {
+      await fs.rm(root, { recursive: true, force: true });
+    }
+  });
+
+  it("falls back to pid+staleMs for locks without a nonce (backward compat)", async () => {
+    const root = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-lock-no-nonce-"));
+    try {
+      const sessionFile = path.join(root, "sessions.json");
+      const lockPath = `${sessionFile}.lock`;
+
+      // Old-format lock: no nonce, same PID, recent timestamp
+      // This should NOT be reclaimed (backward compat: no nonce → skip nonce check)
+      await fs.writeFile(
+        lockPath,
+        JSON.stringify({
+          pid: process.pid,
+          createdAt: new Date().toISOString(),
+        }),
+        "utf8",
+      );
+
+      // With a short timeout this should fail — lock appears valid (same PID, alive, not stale)
+      await expect(
+        acquireSessionWriteLock({ sessionFile, timeoutMs: 200, staleMs: 60_000 }),
+      ).rejects.toThrow(/locked/);
+    } finally {
+      await fs.rm(root, { recursive: true, force: true });
+    }
+  });
+
+  it("rotates instance nonce on releaseAllSessionWriteLocks", async () => {
+    const root = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-lock-nonce-rotate-"));
+    try {
+      const sessionFile = path.join(root, "sessions.json");
+
+      // Acquire a lock — captures current nonce
+      const nonceBefore = __testing.instanceNonce;
+      await acquireSessionWriteLock({ sessionFile, timeoutMs: 500 });
+
+      // Release all — should rotate the nonce
+      await releaseAllSessionWriteLocks();
+      const nonceAfter = __testing.instanceNonce;
+
+      expect(nonceAfter).not.toBe(nonceBefore);
+
+      // A stale lock file with the OLD nonce should now be reclaimable
+      const lockPath = `${sessionFile}.lock`;
+      await fs.writeFile(
+        lockPath,
+        JSON.stringify({
+          pid: process.pid,
+          nonce: nonceBefore,
+          createdAt: new Date().toISOString(),
+        }),
+        "utf8",
+      );
+
+      const lock = await acquireSessionWriteLock({ sessionFile, timeoutMs: 2000 });
+      const raw = await fs.readFile(lockPath, "utf8");
+      const payload = JSON.parse(raw) as { pid: number; nonce: string };
+      expect(payload.nonce).toBe(nonceAfter);
+      await lock.release();
+    } finally {
+      await fs.rm(root, { recursive: true, force: true });
+    }
+  });
+
  it("releaseAllSessionWriteLocks removes all held locks", async () => {
    const root = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-lock-release-all-"));
    try {
--- a/src/agents/session-write-lock.ts
+++ b/src/agents/session-write-lock.ts
@ -1,10 +1,28 @@
+import { randomBytes } from "node:crypto";
 import fsSync from "node:fs";
 import fs from "node:fs/promises";
 import path from "node:path";

+/**
+ * Instance nonce — regenerated each time the gateway server starts within
+ * the process. ESM modules are cached for the process lifetime, so this
+ * must be mutable and explicitly reset via `resetInstanceNonce()` during
+ * shutdown. After an in-process restart (SIGUSR1), lock files written by
+ * the previous server iteration carry a stale nonce, letting us detect
+ * them even when the PID hasn't changed (common in containers where
+ * PID = 1).
+ */
+let instanceNonce: string = randomBytes(12).toString("hex");
+
+function resetInstanceNonce(): void {
+  instanceNonce = randomBytes(12).toString("hex");
+}
+
 type LockFilePayload = {
  pid: number;
  createdAt: string;
+  /** Instance nonce — absent in lock files written by older versions. */
+  nonce?: string;
 };

 type HeldLock = {
@ -93,7 +111,11 @@ async function readLockPayload(lockPath: string): Promise<LockFilePayload | null
    const parsed = JSON.parse(raw) as Partial<LockFilePayload>;
    if (typeof parsed.pid !== "number") return null;
    if (typeof parsed.createdAt !== "string") return null;
-    return { pid: parsed.pid, createdAt: parsed.createdAt };
+    return {
+      pid: parsed.pid,
+      createdAt: parsed.createdAt,
+      nonce: typeof parsed.nonce === "string" ? parsed.nonce : undefined,
+    };
  } catch {
    return null;
  }
@ -144,7 +166,11 @@ export async function acquireSessionWriteLock(params: {
    try {
      const handle = await fs.open(lockPath, "wx");
      await handle.writeFile(
-        JSON.stringify({ pid: process.pid, createdAt: new Date().toISOString() }, null, 2),
+        JSON.stringify(
+          { pid: process.pid, nonce: instanceNonce, createdAt: new Date().toISOString() },
+          null,
+          2,
+        ),
        "utf8",
      );
      HELD_LOCKS.set(normalizedSessionFile, { count: 1, handle, lockPath });
@ -166,7 +192,18 @@ export async function acquireSessionWriteLock(params: {
      const createdAt = payload?.createdAt ? Date.parse(payload.createdAt) : NaN;
      const stale = !Number.isFinite(createdAt) || Date.now() - createdAt > staleMs;
      const alive = payload?.pid ? isAlive(payload.pid) : false;
-      if (stale || !alive) {
+
+      // Nonce mismatch: lock was written by a previous server iteration of the
+      // same process (e.g. PID 1 in a container after SIGUSR1).  The old
+      // iteration is gone even though the PID is still alive, so treat the
+      // lock as stale.  If the lock has no nonce (written by an older version)
+      // we fall through to the existing pid+staleMs checks for backward compat.
+      const nonceMismatch =
+        payload?.nonce !== undefined &&
+        payload.nonce !== instanceNonce &&
+        payload.pid === process.pid;
+
+      if (stale || !alive || nonceMismatch) {
        await fs.rm(lockPath, { force: true });
        continue;
      }
@ -181,7 +218,6 @@ export async function acquireSessionWriteLock(params: {
  throw new Error(`session file locked (timeout ${timeoutMs}ms): ${owner} ${lockPath}`);
 }

-
 /**
 * Release all held session write locks.
 *
@ -205,9 +241,18 @@ export async function releaseAllSessionWriteLocks(): Promise<void> {
    }
    HELD_LOCKS.delete(sessionFile);
  }
+  // Rotate the instance nonce AFTER all locks are cleaned up, so that any
+  // lock files that survive this cleanup (e.g. due to fs errors) are
+  // detectable as stale by the next server iteration. Rotating before
+  // cleanup would let a concurrent acquirer see the old nonce as stale
+  // and reclaim a lock we haven't finished releasing yet.
+  resetInstanceNonce();
 }
 export const __testing = {
  cleanupSignals: [...CLEANUP_SIGNALS],
  handleTerminationSignal,
  releaseAllLocksSync,
+  get instanceNonce() {
+    return instanceNonce;
+  },
 };
--- a/src/gateway/server-close.ts
+++ b/src/gateway/server-close.ts
@ -103,7 +103,13 @@ export function createGatewayCloseHandler(params: {
    // owned by PID 1, which appear valid to the new server iteration because
    // isAlive(1) returns true (same process). This blocks session access for
    // up to staleMs (30 min) until the lock expires.
-    await releaseAllSessionWriteLocks();
+    try {
+      await releaseAllSessionWriteLocks();
+    } catch {
+      // Best effort — don't let lock cleanup failure prevent shutdown.
+      // The instance nonce mechanism will detect surviving stale locks
+      // on the next server iteration.
+    }

    for (const c of params.clients) {
      try {