fix: release session locks on gateway restart (SIGUSR1)

When the gateway restarts via SIGUSR1, server.close() kills running
agent turns (chatRunState.clear()) but never releases their session
write locks. The on-disk .lock files persist with PID 1, and
isAlive(1) returns true because it is the same process. This blocks
session access for up to staleMs (30 min) until the lock expires.

- Export releaseAllSessionWriteLocks() from session-write-lock.ts
- Call it in server-close.ts after chatRunState.clear()
- Add test for releaseAllSessionWriteLocks()
This commit is contained in:
Merlin 2026-01-29 13:04:04 +00:00
parent 5f4715acfc
commit 23ec0bceb6
3 changed files with 68 additions and 1 deletions

View File

@ -3,7 +3,11 @@ import os from "node:os";
import path from "node:path";
import { describe, expect, it } from "vitest";
import { __testing, acquireSessionWriteLock } from "./session-write-lock.js";
import {
__testing,
acquireSessionWriteLock,
releaseAllSessionWriteLocks,
} from "./session-write-lock.js";
describe("acquireSessionWriteLock", () => {
it("reuses locks across symlinked session paths", async () => {
@ -159,4 +163,33 @@ describe("acquireSessionWriteLock", () => {
expect(process.listeners("SIGINT")).toContain(keepAlive);
process.off("SIGINT", keepAlive);
});
it("releaseAllSessionWriteLocks removes all held locks", async () => {
const root = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-lock-release-all-"));
try {
const sessionA = path.join(root, "a.json");
const sessionB = path.join(root, "b.json");
const lockA = `${sessionA}.lock`;
const lockB = `${sessionB}.lock`;
await acquireSessionWriteLock({ sessionFile: sessionA, timeoutMs: 500 });
await acquireSessionWriteLock({ sessionFile: sessionB, timeoutMs: 500 });
// Both lock files should exist
await expect(fs.access(lockA)).resolves.toBeUndefined();
await expect(fs.access(lockB)).resolves.toBeUndefined();
await releaseAllSessionWriteLocks();
// Both lock files should be removed
await expect(fs.access(lockA)).rejects.toThrow();
await expect(fs.access(lockB)).rejects.toThrow();
// Should be able to re-acquire after release
const lock = await acquireSessionWriteLock({ sessionFile: sessionA, timeoutMs: 500 });
await lock.release();
} finally {
await fs.rm(root, { recursive: true, force: true });
}
});
});

View File

@ -181,6 +181,31 @@ export async function acquireSessionWriteLock(params: {
throw new Error(`session file locked (timeout ${timeoutMs}ms): ${owner} ${lockPath}`);
}
/**
* Release all held session write locks.
*
* Call this during gateway shutdown (e.g., before an in-process SIGUSR1
* restart) to ensure on-disk `.lock` files are removed. Without this,
* locks owned by PID 1 survive an in-process restart because
* `isAlive(1)` returns true for the same process, making the lock
* appear valid for up to `staleMs` (30 min).
*/
export async function releaseAllSessionWriteLocks(): Promise<void> {
for (const [sessionFile, held] of HELD_LOCKS) {
try {
await held.handle.close();
} catch {
// Best effort - handle may already be closed.
}
try {
await fs.rm(held.lockPath, { force: true });
} catch {
// Best effort - file may already be removed.
}
HELD_LOCKS.delete(sessionFile);
}
}
export const __testing = {
cleanupSignals: [...CLEANUP_SIGNALS],
handleTerminationSignal,

View File

@ -1,5 +1,6 @@
import type { Server as HttpServer } from "node:http";
import type { WebSocketServer } from "ws";
import { releaseAllSessionWriteLocks } from "../agents/session-write-lock.js";
import type { CanvasHostHandler, CanvasHostServer } from "../canvas-host/server.js";
import { type ChannelId, listChannelPlugins } from "../channels/plugins/index.js";
import { stopGmailWatcher } from "../hooks/gmail-watcher.js";
@ -96,6 +97,14 @@ export function createGatewayCloseHandler(params: {
}
}
params.chatRunState.clear();
// Release all held session write locks before restarting.
// Without this, in-process restarts (SIGUSR1) leave on-disk .lock files
// owned by PID 1, which appear valid to the new server iteration because
// isAlive(1) returns true (same process). This blocks session access for
// up to staleMs (30 min) until the lock expires.
await releaseAllSessionWriteLocks();
for (const c of params.clients) {
try {
c.socket.close(1012, "service restart");