fix: harden gateway lock validation (#1572) (thanks @steipete)
This commit is contained in:
parent
90685ef814
commit
3fff943ba1
@ -21,6 +21,7 @@ Docs: https://docs.clawd.bot
|
|||||||
- TTS: move Telegram TTS into core with auto-replies, commands, and gateway methods. (#1559) Thanks @Glucksberg.
|
- TTS: move Telegram TTS into core with auto-replies, commands, and gateway methods. (#1559) Thanks @Glucksberg.
|
||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
- Gateway: compare Linux process start time to avoid PID recycling lock loops; keep locks unless stale. (#1572) Thanks @steipete.
|
||||||
- Skills: gate bird Homebrew install to macOS. (#1569) Thanks @bradleypriest.
|
- Skills: gate bird Homebrew install to macOS. (#1569) Thanks @bradleypriest.
|
||||||
- Agents: ignore IDENTITY.md template placeholders when parsing identity to avoid placeholder replies. (#1556)
|
- Agents: ignore IDENTITY.md template placeholders when parsing identity to avoid placeholder replies. (#1556)
|
||||||
- Docker: update gateway command in docker-compose and Hetzner guide. (#1514)
|
- Docker: update gateway command in docker-compose and Hetzner guide. (#1514)
|
||||||
|
|||||||
@ -1045,6 +1045,7 @@
|
|||||||
"platforms/android",
|
"platforms/android",
|
||||||
"platforms/windows",
|
"platforms/windows",
|
||||||
"platforms/linux",
|
"platforms/linux",
|
||||||
|
"platforms/fly",
|
||||||
"platforms/hetzner",
|
"platforms/hetzner",
|
||||||
"platforms/exe-dev"
|
"platforms/exe-dev"
|
||||||
]
|
]
|
||||||
|
|||||||
@ -23,6 +23,7 @@ Native companion apps for Windows are also planned; the Gateway is recommended v
|
|||||||
|
|
||||||
## VPS & hosting
|
## VPS & hosting
|
||||||
|
|
||||||
|
- Fly.io: [Fly.io](/platforms/fly)
|
||||||
- Hetzner (Docker): [Hetzner](/platforms/hetzner)
|
- Hetzner (Docker): [Hetzner](/platforms/hetzner)
|
||||||
- exe.dev (VM + HTTPS proxy): [exe.dev](/platforms/exe-dev)
|
- exe.dev (VM + HTTPS proxy): [exe.dev](/platforms/exe-dev)
|
||||||
|
|
||||||
|
|||||||
@ -1,10 +1,13 @@
|
|||||||
|
import { createHash } from "node:crypto";
|
||||||
|
import fsSync from "node:fs";
|
||||||
import fs from "node:fs/promises";
|
import fs from "node:fs/promises";
|
||||||
import os from "node:os";
|
import os from "node:os";
|
||||||
import path from "node:path";
|
import path from "node:path";
|
||||||
|
|
||||||
import { describe, expect, it } from "vitest";
|
import { describe, expect, it, vi } from "vitest";
|
||||||
|
|
||||||
import { acquireGatewayLock, GatewayLockError } from "./gateway-lock.js";
|
import { acquireGatewayLock, GatewayLockError } from "./gateway-lock.js";
|
||||||
|
import { resolveConfigPath, resolveStateDir } from "../config/paths.js";
|
||||||
|
|
||||||
async function makeEnv() {
|
async function makeEnv() {
|
||||||
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-gateway-lock-"));
|
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-gateway-lock-"));
|
||||||
@ -22,6 +25,41 @@ async function makeEnv() {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Test helper: computes where the gateway lock file for `env` is expected.
 *
 * The file name is `gateway.<hash>.lock`, where `<hash>` is the first eight
 * hex characters of the SHA-1 digest of the resolved config path, placed in
 * the resolved state directory.
 * NOTE(review): presumably mirrors the path logic in gateway-lock.ts — confirm.
 *
 * @returns the lock file path together with the config path it was derived from
 */
function resolveLockPath(env: NodeJS.ProcessEnv) {
  const stateDir = resolveStateDir(env);
  const configPath = resolveConfigPath(env, stateDir);
  const digest = createHash("sha1").update(configPath).digest("hex");
  const lockFileName = `gateway.${digest.slice(0, 8)}.lock`;
  return { lockPath: path.join(stateDir, lockFileName), configPath };
}
|
||||||
|
|
||||||
|
/**
 * Test helper: builds a synthetic `/proc/<pid>/stat` line.
 *
 * The line has the shape `<pid> (<comm>) <field> <field> ...`. Of the fields
 * that follow the parenthesised command name, the first is the process state
 * ("R") and the one at zero-based index 19 carries `startTime`; every other
 * field is the filler value "1".
 *
 * @param pid process id written at the start of the line
 * @param startTime value placed in the start-time slot
 * @param comm command name placed inside the parentheses (defaults to "node")
 * @returns the formatted stat line
 */
function makeProcStat(pid: number, startTime: number, comm = "node") {
  // Zero-based position of the start time among the fields after "(comm)".
  const startTimeIndex = 19;
  // Two filler fields trail the start time, giving 22 fields in total.
  const fieldCount = startTimeIndex + 3;

  const fields = new Array<string>(fieldCount).fill("1");
  fields[0] = "R";
  fields[startTimeIndex] = String(startTime);

  return `${pid} (${comm}) ${fields.join(" ")}`;
}
|
||||||
|
|
||||||
describe("gateway lock", () => {
|
describe("gateway lock", () => {
|
||||||
it("blocks concurrent acquisition until release", async () => {
|
it("blocks concurrent acquisition until release", async () => {
|
||||||
const { env, cleanup } = await makeEnv();
|
const { env, cleanup } = await makeEnv();
|
||||||
@ -52,4 +90,98 @@ describe("gateway lock", () => {
|
|||||||
await lock2?.release();
|
await lock2?.release();
|
||||||
await cleanup();
|
await cleanup();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("treats recycled linux pid as stale when start time mismatches", async () => {
  const { env, cleanup } = await makeEnv();
  try {
    // Seed a lock that claims to be owned by this very process, but with a
    // start time (111) that differs from what /proc will report (222).
    const { lockPath, configPath } = resolveLockPath(env);
    const payload = {
      pid: process.pid,
      createdAt: new Date().toISOString(),
      configPath,
      startTime: 111,
    };
    await fs.writeFile(lockPath, JSON.stringify(payload), "utf8");

    const readFileSync = fsSync.readFileSync;
    const statValue = makeProcStat(process.pid, 222);
    const spy = vi.spyOn(fsSync, "readFileSync").mockImplementation((filePath, encoding) => {
      if (filePath === `/proc/${process.pid}/stat`) {
        return statValue;
      }
      // Everything else goes to the real implementation.
      return readFileSync(filePath as never, encoding as never) as never;
    });

    try {
      const lock = await acquireGatewayLock({
        env,
        allowInTests: true,
        timeoutMs: 200,
        pollIntervalMs: 20,
        platform: "linux",
      });
      expect(lock).not.toBeNull();

      await lock?.release();
    } finally {
      // Always undo the mock so a failure here cannot leak into other tests.
      spy.mockRestore();
    }
  } finally {
    // Always remove the temp state dir, even when the test fails.
    await cleanup();
  }
});
|
||||||
|
|
||||||
|
it("keeps lock on linux when proc access fails unless stale", async () => {
  const { env, cleanup } = await makeEnv();
  try {
    const { lockPath, configPath } = resolveLockPath(env);
    const payload = {
      pid: process.pid,
      createdAt: new Date().toISOString(),
      configPath,
      startTime: 111,
    };
    await fs.writeFile(lockPath, JSON.stringify(payload), "utf8");

    const readFileSync = fsSync.readFileSync;
    // Installs a mock that makes reading this process's /proc stat file fail
    // while passing every other read through to the real implementation.
    const denyProcStat = () =>
      vi.spyOn(fsSync, "readFileSync").mockImplementation((filePath, encoding) => {
        if (filePath === `/proc/${process.pid}/stat`) {
          throw new Error("EACCES");
        }
        return readFileSync(filePath as never, encoding as never) as never;
      });

    // Phase 1: a fresh lock whose owner cannot be inspected must be kept.
    const spy = denyProcStat();
    try {
      await expect(
        acquireGatewayLock({
          env,
          allowInTests: true,
          timeoutMs: 120,
          pollIntervalMs: 20,
          staleMs: 10_000,
          platform: "linux",
        }),
      ).rejects.toBeInstanceOf(GatewayLockError);
    } finally {
      spy.mockRestore();
    }

    // Phase 2: the same lock, but old enough to be stale, may be taken over.
    const stalePayload = {
      ...payload,
      createdAt: new Date(0).toISOString(),
    };
    await fs.writeFile(lockPath, JSON.stringify(stalePayload), "utf8");

    const staleSpy = denyProcStat();
    try {
      const lock = await acquireGatewayLock({
        env,
        allowInTests: true,
        timeoutMs: 200,
        pollIntervalMs: 20,
        staleMs: 1,
        platform: "linux",
      });
      expect(lock).not.toBeNull();

      await lock?.release();
    } finally {
      staleSpy.mockRestore();
    }
  } finally {
    // Always remove the temp state dir, even when an assertion fails.
    await cleanup();
  }
});
|
||||||
});
|
});
|
||||||
|
|||||||
@ -13,6 +13,7 @@ type LockPayload = {
|
|||||||
pid: number;
|
pid: number;
|
||||||
createdAt: string;
|
createdAt: string;
|
||||||
configPath: string;
|
configPath: string;
|
||||||
|
startTime?: number;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type GatewayLockHandle = {
|
export type GatewayLockHandle = {
|
||||||
@ -27,6 +28,7 @@ export type GatewayLockOptions = {
|
|||||||
pollIntervalMs?: number;
|
pollIntervalMs?: number;
|
||||||
staleMs?: number;
|
staleMs?: number;
|
||||||
allowInTests?: boolean;
|
allowInTests?: boolean;
|
||||||
|
platform?: NodeJS.Platform;
|
||||||
};
|
};
|
||||||
|
|
||||||
export class GatewayLockError extends Error {
|
export class GatewayLockError extends Error {
|
||||||
@ -39,6 +41,8 @@ export class GatewayLockError extends Error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Verdict on the process recorded in a gateway lock file: "alive" and "dead"
 * are definite answers, "unknown" means the check could not be completed.
 */
type LockOwnerStatus = "alive" | "dead" | "unknown";
|
||||||
|
|
||||||
function isAlive(pid: number): boolean {
|
function isAlive(pid: number): boolean {
|
||||||
if (!Number.isFinite(pid) || pid <= 0) return false;
|
if (!Number.isFinite(pid) || pid <= 0) return false;
|
||||||
try {
|
try {
|
||||||
@ -49,32 +53,78 @@ function isAlive(pid: number): boolean {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
function normalizeProcArg(arg: string): string {
|
||||||
* Check if a PID is actually a clawdbot gateway process.
|
return arg.replaceAll("\\", "/").toLowerCase();
|
||||||
* This handles PID recycling in containers where a different process
|
}
|
||||||
* might have the same PID after a restart.
|
|
||||||
*/
|
|
||||||
function isGatewayProcess(pid: number): boolean {
|
|
||||||
if (!isAlive(pid)) return false;
|
|
||||||
|
|
||||||
// On Linux, check /proc/PID/cmdline to verify it's actually clawdbot
|
function parseProcCmdline(raw: string): string[] {
|
||||||
if (process.platform === "linux") {
|
return raw
|
||||||
try {
|
.split("\0")
|
||||||
const cmdline = fsSync.readFileSync(`/proc/${pid}/cmdline`, "utf8");
|
.map((entry) => entry.trim())
|
||||||
// cmdline uses null bytes as separators
|
.filter(Boolean);
|
||||||
const args = cmdline.split("\0").join(" ").toLowerCase();
|
}
|
||||||
// Check if this is actually a clawdbot gateway process
|
|
||||||
return args.includes("clawdbot") || args.includes("gateway");
|
function isGatewayArgv(args: string[]): boolean {
|
||||||
} catch {
|
const normalized = args.map(normalizeProcArg);
|
||||||
// Can't read cmdline - process might have exited or we lack permissions
|
if (!normalized.includes("gateway")) return false;
|
||||||
// Fall back to assuming it's not our process (safer in containers)
|
|
||||||
return false;
|
const entryCandidates = [
|
||||||
}
|
"dist/index.js",
|
||||||
|
"dist/index.mjs",
|
||||||
|
"dist/entry.js",
|
||||||
|
"dist/entry.mjs",
|
||||||
|
"scripts/run-node.mjs",
|
||||||
|
"src/index.ts",
|
||||||
|
];
|
||||||
|
if (normalized.some((arg) => entryCandidates.some((entry) => arg.endsWith(entry)))) {
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// On non-Linux (macOS, Windows), trust the PID check
|
const exe = normalized[0] ?? "";
|
||||||
// PID recycling is less of an issue outside containers
|
return exe.endsWith("/clawdbot") || exe === "clawdbot";
|
||||||
return true;
|
}
|
||||||
|
|
||||||
|
/**
 * Reads `/proc/<pid>/cmdline` and returns it split into arguments by
 * `parseProcCmdline`.
 *
 * @param pid process id whose command line should be read
 * @returns the parsed argument list, or `null` when reading or parsing throws
 */
function readLinuxCmdline(pid: number): string[] | null {
  let argv: string[] | null;
  try {
    const cmdlinePath = `/proc/${pid}/cmdline`;
    argv = parseProcCmdline(fsSync.readFileSync(cmdlinePath, "utf8"));
  } catch {
    argv = null;
  }
  return argv;
}
|
||||||
|
|
||||||
|
/**
 * Reads the start-time field of `/proc/<pid>/stat`.
 *
 * The stat line has the shape `<pid> (<comm>) <field> <field> ...`. Because
 * the command name may itself contain spaces or parentheses, the fields are
 * located relative to the LAST ")" in the line; the start time is the field at
 * zero-based index 19 after it (`starttime` in proc(5)).
 *
 * @param pid process id to inspect
 * @returns the start time as a non-negative integer, or `null` when the file
 *   cannot be read, the line is malformed, or the field is not a plain
 *   decimal integer that fits a safe JavaScript integer
 */
function readLinuxStartTime(pid: number): number | null {
  try {
    const raw = fsSync.readFileSync(`/proc/${pid}/stat`, "utf8").trim();
    const closeParen = raw.lastIndexOf(")");
    if (closeParen < 0) return null;

    const fields = raw
      .slice(closeParen + 1)
      .trim()
      .split(/\s+/);
    const token = fields[19];
    // Require digits only: parseInt would accept "12abc" or "-5" and turn a
    // malformed or misaligned line into a plausible-looking start time.
    if (token === undefined || !/^\d+$/.test(token)) return null;

    const startTime = Number(token);
    // Beyond 2^53 distinct values collapse, which would defeat equality checks.
    return Number.isSafeInteger(startTime) ? startTime : null;
  } catch {
    // Missing /proc entry, permission error, etc.: the start time is unknown.
    return null;
  }
}
|
||||||
|
|
||||||
|
/**
 * Decides whether the process recorded in a gateway lock still owns it.
 *
 * - A pid that `isAlive` rejects is "dead".
 * - On any platform other than "linux" a live pid is "alive".
 * - On Linux, when the payload carries a finite `startTime`, it is compared
 *   with the start time currently read for the pid: equal means "alive",
 *   different means "dead", and an unreadable value means "unknown".
 * - On Linux without a recorded start time, the pid's command line is checked
 *   with `isGatewayArgv`; an unreadable command line means "unknown".
 *
 * @param pid owner pid taken from the lock file
 * @param payload parsed lock payload, or `null` when none is available
 * @param platform platform whose rules apply
 */
function resolveGatewayOwnerStatus(
  pid: number,
  payload: LockPayload | null,
  platform: NodeJS.Platform,
): LockOwnerStatus {
  if (!isAlive(pid)) return "dead";
  if (platform !== "linux") return "alive";

  const recordedStartTime = payload?.startTime;
  if (!Number.isFinite(recordedStartTime)) {
    // No usable start time in the lock: fall back to inspecting the argv.
    const argv = readLinuxCmdline(pid);
    if (!argv) return "unknown";
    return isGatewayArgv(argv) ? "alive" : "dead";
  }

  const liveStartTime = readLinuxStartTime(pid);
  if (liveStartTime == null) return "unknown";
  return liveStartTime === recordedStartTime ? "alive" : "dead";
}
|
||||||
|
|
||||||
async function readLockPayload(lockPath: string): Promise<LockPayload | null> {
|
async function readLockPayload(lockPath: string): Promise<LockPayload | null> {
|
||||||
@ -84,10 +134,12 @@ async function readLockPayload(lockPath: string): Promise<LockPayload | null> {
|
|||||||
if (typeof parsed.pid !== "number") return null;
|
if (typeof parsed.pid !== "number") return null;
|
||||||
if (typeof parsed.createdAt !== "string") return null;
|
if (typeof parsed.createdAt !== "string") return null;
|
||||||
if (typeof parsed.configPath !== "string") return null;
|
if (typeof parsed.configPath !== "string") return null;
|
||||||
|
const startTime = typeof parsed.startTime === "number" ? parsed.startTime : undefined;
|
||||||
return {
|
return {
|
||||||
pid: parsed.pid,
|
pid: parsed.pid,
|
||||||
createdAt: parsed.createdAt,
|
createdAt: parsed.createdAt,
|
||||||
configPath: parsed.configPath,
|
configPath: parsed.configPath,
|
||||||
|
startTime,
|
||||||
};
|
};
|
||||||
} catch {
|
} catch {
|
||||||
return null;
|
return null;
|
||||||
@ -117,6 +169,7 @@ export async function acquireGatewayLock(
|
|||||||
const timeoutMs = opts.timeoutMs ?? DEFAULT_TIMEOUT_MS;
|
const timeoutMs = opts.timeoutMs ?? DEFAULT_TIMEOUT_MS;
|
||||||
const pollIntervalMs = opts.pollIntervalMs ?? DEFAULT_POLL_INTERVAL_MS;
|
const pollIntervalMs = opts.pollIntervalMs ?? DEFAULT_POLL_INTERVAL_MS;
|
||||||
const staleMs = opts.staleMs ?? DEFAULT_STALE_MS;
|
const staleMs = opts.staleMs ?? DEFAULT_STALE_MS;
|
||||||
|
const platform = opts.platform ?? process.platform;
|
||||||
const { lockPath, configPath } = resolveGatewayLockPath(env);
|
const { lockPath, configPath } = resolveGatewayLockPath(env);
|
||||||
await fs.mkdir(path.dirname(lockPath), { recursive: true });
|
await fs.mkdir(path.dirname(lockPath), { recursive: true });
|
||||||
|
|
||||||
@ -126,11 +179,15 @@ export async function acquireGatewayLock(
|
|||||||
while (Date.now() - startedAt < timeoutMs) {
|
while (Date.now() - startedAt < timeoutMs) {
|
||||||
try {
|
try {
|
||||||
const handle = await fs.open(lockPath, "wx");
|
const handle = await fs.open(lockPath, "wx");
|
||||||
|
const startTime = platform === "linux" ? readLinuxStartTime(process.pid) : null;
|
||||||
const payload: LockPayload = {
|
const payload: LockPayload = {
|
||||||
pid: process.pid,
|
pid: process.pid,
|
||||||
createdAt: new Date().toISOString(),
|
createdAt: new Date().toISOString(),
|
||||||
configPath,
|
configPath,
|
||||||
};
|
};
|
||||||
|
if (typeof startTime === "number" && Number.isFinite(startTime)) {
|
||||||
|
payload.startTime = startTime;
|
||||||
|
}
|
||||||
await handle.writeFile(JSON.stringify(payload), "utf8");
|
await handle.writeFile(JSON.stringify(payload), "utf8");
|
||||||
return {
|
return {
|
||||||
lockPath,
|
lockPath,
|
||||||
@ -148,13 +205,14 @@ export async function acquireGatewayLock(
|
|||||||
|
|
||||||
lastPayload = await readLockPayload(lockPath);
|
lastPayload = await readLockPayload(lockPath);
|
||||||
const ownerPid = lastPayload?.pid;
|
const ownerPid = lastPayload?.pid;
|
||||||
// Use isGatewayProcess to handle PID recycling in containers
|
const ownerStatus = ownerPid
|
||||||
const ownerAlive = ownerPid ? isGatewayProcess(ownerPid) : false;
|
? resolveGatewayOwnerStatus(ownerPid, lastPayload, platform)
|
||||||
if (!ownerAlive && ownerPid) {
|
: "unknown";
|
||||||
|
if (ownerStatus === "dead" && ownerPid) {
|
||||||
await fs.rm(lockPath, { force: true });
|
await fs.rm(lockPath, { force: true });
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (!ownerAlive) {
|
if (ownerStatus !== "alive") {
|
||||||
let stale = false;
|
let stale = false;
|
||||||
if (lastPayload?.createdAt) {
|
if (lastPayload?.createdAt) {
|
||||||
const createdAt = Date.parse(lastPayload.createdAt);
|
const createdAt = Date.parse(lastPayload.createdAt);
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user