fix: harden gateway lock validation (#1572) (thanks @steipete)

This commit is contained in:
Peter Steinberger 2026-01-24 08:14:30 +00:00
parent 90685ef814
commit 3fff943ba1
5 changed files with 221 additions and 28 deletions

View File

@ -21,6 +21,7 @@ Docs: https://docs.clawd.bot
- TTS: move Telegram TTS into core with auto-replies, commands, and gateway methods. (#1559) Thanks @Glucksberg. - TTS: move Telegram TTS into core with auto-replies, commands, and gateway methods. (#1559) Thanks @Glucksberg.
### Fixes ### Fixes
- Gateway: compare Linux process start time to avoid PID recycling lock loops; keep locks unless stale. (#1572) Thanks @steipete.
- Skills: gate bird Homebrew install to macOS. (#1569) Thanks @bradleypriest. - Skills: gate bird Homebrew install to macOS. (#1569) Thanks @bradleypriest.
- Agents: ignore IDENTITY.md template placeholders when parsing identity to avoid placeholder replies. (#1556) - Agents: ignore IDENTITY.md template placeholders when parsing identity to avoid placeholder replies. (#1556)
- Docker: update gateway command in docker-compose and Hetzner guide. (#1514) - Docker: update gateway command in docker-compose and Hetzner guide. (#1514)

View File

@ -1045,6 +1045,7 @@
"platforms/android", "platforms/android",
"platforms/windows", "platforms/windows",
"platforms/linux", "platforms/linux",
"platforms/fly",
"platforms/hetzner", "platforms/hetzner",
"platforms/exe-dev" "platforms/exe-dev"
] ]

View File

@ -23,6 +23,7 @@ Native companion apps for Windows are also planned; the Gateway is recommended v
## VPS & hosting ## VPS & hosting
- Fly.io: [Fly.io](/platforms/fly)
- Hetzner (Docker): [Hetzner](/platforms/hetzner) - Hetzner (Docker): [Hetzner](/platforms/hetzner)
- exe.dev (VM + HTTPS proxy): [exe.dev](/platforms/exe-dev) - exe.dev (VM + HTTPS proxy): [exe.dev](/platforms/exe-dev)

View File

@ -1,10 +1,13 @@
import { createHash } from "node:crypto";
import fsSync from "node:fs";
import fs from "node:fs/promises"; import fs from "node:fs/promises";
import os from "node:os"; import os from "node:os";
import path from "node:path"; import path from "node:path";
import { describe, expect, it } from "vitest"; import { describe, expect, it, vi } from "vitest";
import { acquireGatewayLock, GatewayLockError } from "./gateway-lock.js"; import { acquireGatewayLock, GatewayLockError } from "./gateway-lock.js";
import { resolveConfigPath, resolveStateDir } from "../config/paths.js";
async function makeEnv() { async function makeEnv() {
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-gateway-lock-")); const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-gateway-lock-"));
@ -22,6 +25,41 @@ async function makeEnv() {
}; };
} }
/**
 * Computes where the gateway lock file lives for the given test environment.
 *
 * The file is named `gateway.<hash>.lock` inside the resolved state dir, where
 * `<hash>` is the first 8 hex characters of the SHA-1 of the config path.
 * NOTE(review): presumably mirrors the path logic in gateway-lock.ts — confirm.
 *
 * @param env - Environment used to resolve the state dir and config path.
 * @returns The lock file path together with the config path it was derived from.
 */
function resolveLockPath(env: NodeJS.ProcessEnv) {
  const stateDir = resolveStateDir(env);
  const configPath = resolveConfigPath(env, stateDir);
  const digest = createHash("sha1").update(configPath).digest("hex");
  const lockName = `gateway.${digest.slice(0, 8)}.lock`;
  return { lockPath: path.join(stateDir, lockName), configPath };
}
/**
 * Builds a fake `/proc/<pid>/stat` line for tests.
 *
 * Layout after the `(node)` comm field: the state `R`, 18 filler fields of
 * `1`, the start time, then 2 more filler fields. This puts the start time at
 * index 19 of the whitespace-split remainder after the closing parenthesis.
 *
 * @param pid - PID written as the first field.
 * @param startTime - Value placed in the start-time slot.
 * @returns A single space-separated stat line.
 */
function makeProcStat(pid: number, startTime: number) {
  const filler = (count: number): string[] => Array.from({ length: count }, () => "1");
  const afterComm = ["R", ...filler(18), String(startTime), ...filler(2)];
  return `${pid} (node) ${afterComm.join(" ")}`;
}
describe("gateway lock", () => { describe("gateway lock", () => {
it("blocks concurrent acquisition until release", async () => { it("blocks concurrent acquisition until release", async () => {
const { env, cleanup } = await makeEnv(); const { env, cleanup } = await makeEnv();
@ -52,4 +90,98 @@ describe("gateway lock", () => {
await lock2?.release(); await lock2?.release();
await cleanup(); await cleanup();
}); });
// A lock owned by our own (live) PID but recorded with a different start time
// must be treated as left behind by a recycled PID, so acquisition succeeds.
it("treats recycled linux pid as stale when start time mismatches", async () => {
  const { env, cleanup } = await makeEnv();
  try {
    const { lockPath, configPath } = resolveLockPath(env);
    const payload = {
      pid: process.pid,
      createdAt: new Date().toISOString(),
      configPath,
      startTime: 111,
    };
    await fs.writeFile(lockPath, JSON.stringify(payload), "utf8");

    // Keep a reference to the real implementation so every path other than
    // our own /proc stat file is passed through untouched.
    const readFileSync = fsSync.readFileSync;
    const statValue = makeProcStat(process.pid, 222);
    const spy = vi.spyOn(fsSync, "readFileSync").mockImplementation((filePath, encoding) => {
      if (filePath === `/proc/${process.pid}/stat`) {
        return statValue;
      }
      return readFileSync(filePath as never, encoding as never) as never;
    });

    try {
      const lock = await acquireGatewayLock({
        env,
        allowInTests: true,
        timeoutMs: 200,
        pollIntervalMs: 20,
        platform: "linux",
      });
      expect(lock).not.toBeNull();

      await lock?.release();
    } finally {
      // Always undo the fs patch, even when acquisition or the assertion
      // fails, so a failure here cannot leak the mock into other tests.
      spy.mockRestore();
    }
  } finally {
    // Remove the temp state dir regardless of the test outcome.
    await cleanup();
  }
});
// When /proc cannot be read the owner status is unknown: a fresh lock must be
// kept (acquisition times out), while a lock older than staleMs is taken over.
it("keeps lock on linux when proc access fails unless stale", async () => {
  const { env, cleanup } = await makeEnv();
  try {
    const { lockPath, configPath } = resolveLockPath(env);
    const payload = {
      pid: process.pid,
      createdAt: new Date().toISOString(),
      configPath,
      startTime: 111,
    };
    await fs.writeFile(lockPath, JSON.stringify(payload), "utf8");

    // Keep a reference to the real implementation so every path other than
    // our own /proc stat file is passed through untouched.
    const readFileSync = fsSync.readFileSync;
    // One spy covers both phases: the mock is identical in each, and only the
    // lock payload is rewritten in between.
    const spy = vi.spyOn(fsSync, "readFileSync").mockImplementation((filePath, encoding) => {
      if (filePath === `/proc/${process.pid}/stat`) {
        throw new Error("EACCES");
      }
      return readFileSync(filePath as never, encoding as never) as never;
    });

    try {
      // Phase 1: fresh lock + unreadable /proc => lock is kept, acquire fails.
      await expect(
        acquireGatewayLock({
          env,
          allowInTests: true,
          timeoutMs: 120,
          pollIntervalMs: 20,
          staleMs: 10_000,
          platform: "linux",
        }),
      ).rejects.toBeInstanceOf(GatewayLockError);

      // Phase 2: same owner, but the lock is now far older than staleMs.
      const stalePayload = {
        ...payload,
        createdAt: new Date(0).toISOString(),
      };
      await fs.writeFile(lockPath, JSON.stringify(stalePayload), "utf8");

      const lock = await acquireGatewayLock({
        env,
        allowInTests: true,
        timeoutMs: 200,
        pollIntervalMs: 20,
        staleMs: 1,
        platform: "linux",
      });
      expect(lock).not.toBeNull();

      await lock?.release();
    } finally {
      // Always undo the fs patch, even when an assertion fails, so the
      // throwing mock cannot leak into other tests.
      spy.mockRestore();
    }
  } finally {
    // Remove the temp state dir regardless of the test outcome.
    await cleanup();
  }
});
}); });

View File

@ -13,6 +13,7 @@ type LockPayload = {
pid: number; pid: number;
createdAt: string; createdAt: string;
configPath: string; configPath: string;
startTime?: number;
}; };
export type GatewayLockHandle = { export type GatewayLockHandle = {
@ -27,6 +28,7 @@ export type GatewayLockOptions = {
pollIntervalMs?: number; pollIntervalMs?: number;
staleMs?: number; staleMs?: number;
allowInTests?: boolean; allowInTests?: boolean;
platform?: NodeJS.Platform;
}; };
export class GatewayLockError extends Error { export class GatewayLockError extends Error {
@ -39,6 +41,8 @@ export class GatewayLockError extends Error {
} }
} }
type LockOwnerStatus = "alive" | "dead" | "unknown";
function isAlive(pid: number): boolean { function isAlive(pid: number): boolean {
if (!Number.isFinite(pid) || pid <= 0) return false; if (!Number.isFinite(pid) || pid <= 0) return false;
try { try {
@ -49,32 +53,78 @@ function isAlive(pid: number): boolean {
} }
} }
/** function normalizeProcArg(arg: string): string {
* Check if a PID is actually a clawdbot gateway process. return arg.replaceAll("\\", "/").toLowerCase();
* This handles PID recycling in containers where a different process }
* might have the same PID after a restart.
*/
function isGatewayProcess(pid: number): boolean {
if (!isAlive(pid)) return false;
// On Linux, check /proc/PID/cmdline to verify it's actually clawdbot function parseProcCmdline(raw: string): string[] {
if (process.platform === "linux") { return raw
try { .split("\0")
const cmdline = fsSync.readFileSync(`/proc/${pid}/cmdline`, "utf8"); .map((entry) => entry.trim())
// cmdline uses null bytes as separators .filter(Boolean);
const args = cmdline.split("\0").join(" ").toLowerCase(); }
// Check if this is actually a clawdbot gateway process
return args.includes("clawdbot") || args.includes("gateway"); function isGatewayArgv(args: string[]): boolean {
} catch { const normalized = args.map(normalizeProcArg);
// Can't read cmdline - process might have exited or we lack permissions if (!normalized.includes("gateway")) return false;
// Fall back to assuming it's not our process (safer in containers)
return false; const entryCandidates = [
} "dist/index.js",
"dist/index.mjs",
"dist/entry.js",
"dist/entry.mjs",
"scripts/run-node.mjs",
"src/index.ts",
];
if (normalized.some((arg) => entryCandidates.some((entry) => arg.endsWith(entry)))) {
return true;
} }
// On non-Linux (macOS, Windows), trust the PID check const exe = normalized[0] ?? "";
// PID recycling is less of an issue outside containers return exe.endsWith("/clawdbot") || exe === "clawdbot";
return true; }
/**
 * Reads `/proc/<pid>/cmdline` and returns it as parsed arguments.
 *
 * @param pid - Process id whose command line should be read.
 * @returns The parsed argument list, or `null` when reading or parsing throws.
 */
function readLinuxCmdline(pid: number): string[] | null {
  const cmdlinePath = `/proc/${pid}/cmdline`;
  try {
    return parseProcCmdline(fsSync.readFileSync(cmdlinePath, "utf8"));
  } catch {
    // Best effort: any failure is reported as null.
    return null;
  }
}
/**
 * Reads the start-time field from `/proc/<pid>/stat`.
 *
 * The comm field (2nd field) is wrapped in parentheses and may itself contain
 * spaces or ")" characters, so parsing begins after the LAST ")". Per proc(5),
 * starttime is field 22 overall, i.e. index 19 of the fields that follow comm.
 *
 * @param pid - Process id whose stat file should be read.
 * @returns The parsed start time, or `null` when the file cannot be read, has
 *   no ")" or the field is not a number.
 */
function readLinuxStartTime(pid: number): number | null {
  const START_TIME_INDEX = 19;
  try {
    const stat = fsSync.readFileSync(`/proc/${pid}/stat`, "utf8").trim();
    const commEnd = stat.lastIndexOf(")");
    if (commEnd === -1) return null;

    const afterComm = stat
      .slice(commEnd + 1)
      .trim()
      .split(/\s+/);
    const ticks = Number.parseInt(afterComm[START_TIME_INDEX] ?? "", 10);
    if (!Number.isFinite(ticks)) return null;
    return ticks;
  } catch {
    // Best effort: an unreadable stat file is reported as null.
    return null;
  }
}
/**
 * Classifies the process recorded in a lock file as the live owner or not.
 *
 * - `"dead"`: the PID is not running, or it is running but is not the process
 *   that wrote the lock (start time differs / argv is not a gateway).
 * - `"alive"`: the PID is running and, on Linux, matches the lock's owner.
 *   On other platforms a running PID is always reported as alive.
 * - `"unknown"`: on Linux, the /proc data needed to decide could not be read.
 *
 * @param pid - Owner PID taken from the lock payload.
 * @param payload - Parsed lock payload, if any; its `startTime` is preferred
 *   over the command-line check when present.
 * @param platform - Platform to evaluate for.
 */
function resolveGatewayOwnerStatus(
  pid: number,
  payload: LockPayload | null,
  platform: NodeJS.Platform,
): LockOwnerStatus {
  if (!isAlive(pid)) return "dead";
  if (platform !== "linux") return "alive";

  const recordedStart = payload?.startTime;
  if (!Number.isFinite(recordedStart)) {
    // No usable start time in the lock: fall back to inspecting argv.
    const argv = readLinuxCmdline(pid);
    if (argv === null) return "unknown";
    return isGatewayArgv(argv) ? "alive" : "dead";
  }

  // Same PID with a different start time means the PID was reused.
  const liveStart = readLinuxStartTime(pid);
  if (liveStart == null) return "unknown";
  return liveStart === recordedStart ? "alive" : "dead";
}
async function readLockPayload(lockPath: string): Promise<LockPayload | null> { async function readLockPayload(lockPath: string): Promise<LockPayload | null> {
@ -84,10 +134,12 @@ async function readLockPayload(lockPath: string): Promise<LockPayload | null> {
if (typeof parsed.pid !== "number") return null; if (typeof parsed.pid !== "number") return null;
if (typeof parsed.createdAt !== "string") return null; if (typeof parsed.createdAt !== "string") return null;
if (typeof parsed.configPath !== "string") return null; if (typeof parsed.configPath !== "string") return null;
const startTime = typeof parsed.startTime === "number" ? parsed.startTime : undefined;
return { return {
pid: parsed.pid, pid: parsed.pid,
createdAt: parsed.createdAt, createdAt: parsed.createdAt,
configPath: parsed.configPath, configPath: parsed.configPath,
startTime,
}; };
} catch { } catch {
return null; return null;
@ -117,6 +169,7 @@ export async function acquireGatewayLock(
const timeoutMs = opts.timeoutMs ?? DEFAULT_TIMEOUT_MS; const timeoutMs = opts.timeoutMs ?? DEFAULT_TIMEOUT_MS;
const pollIntervalMs = opts.pollIntervalMs ?? DEFAULT_POLL_INTERVAL_MS; const pollIntervalMs = opts.pollIntervalMs ?? DEFAULT_POLL_INTERVAL_MS;
const staleMs = opts.staleMs ?? DEFAULT_STALE_MS; const staleMs = opts.staleMs ?? DEFAULT_STALE_MS;
const platform = opts.platform ?? process.platform;
const { lockPath, configPath } = resolveGatewayLockPath(env); const { lockPath, configPath } = resolveGatewayLockPath(env);
await fs.mkdir(path.dirname(lockPath), { recursive: true }); await fs.mkdir(path.dirname(lockPath), { recursive: true });
@ -126,11 +179,15 @@ export async function acquireGatewayLock(
while (Date.now() - startedAt < timeoutMs) { while (Date.now() - startedAt < timeoutMs) {
try { try {
const handle = await fs.open(lockPath, "wx"); const handle = await fs.open(lockPath, "wx");
const startTime = platform === "linux" ? readLinuxStartTime(process.pid) : null;
const payload: LockPayload = { const payload: LockPayload = {
pid: process.pid, pid: process.pid,
createdAt: new Date().toISOString(), createdAt: new Date().toISOString(),
configPath, configPath,
}; };
if (typeof startTime === "number" && Number.isFinite(startTime)) {
payload.startTime = startTime;
}
await handle.writeFile(JSON.stringify(payload), "utf8"); await handle.writeFile(JSON.stringify(payload), "utf8");
return { return {
lockPath, lockPath,
@ -148,13 +205,14 @@ export async function acquireGatewayLock(
lastPayload = await readLockPayload(lockPath); lastPayload = await readLockPayload(lockPath);
const ownerPid = lastPayload?.pid; const ownerPid = lastPayload?.pid;
// Use isGatewayProcess to handle PID recycling in containers const ownerStatus = ownerPid
const ownerAlive = ownerPid ? isGatewayProcess(ownerPid) : false; ? resolveGatewayOwnerStatus(ownerPid, lastPayload, platform)
if (!ownerAlive && ownerPid) { : "unknown";
if (ownerStatus === "dead" && ownerPid) {
await fs.rm(lockPath, { force: true }); await fs.rm(lockPath, { force: true });
continue; continue;
} }
if (!ownerAlive) { if (ownerStatus !== "alive") {
let stale = false; let stale = false;
if (lastPayload?.createdAt) { if (lastPayload?.createdAt) {
const createdAt = Date.parse(lastPayload.createdAt); const createdAt = Date.parse(lastPayload.createdAt);