feat(gateway): add crash tracking module

This commit is contained in:
Trevin Chow 2026-01-29 12:11:47 -08:00 committed by Trevin Chow
parent 5a3f915641
commit 5ef33d9359
2 changed files with 211 additions and 0 deletions

View File

@ -0,0 +1,109 @@
// src/cli/gateway-cli/crash-tracker.test.ts
import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
import {
recordCrash,
getRecentCrashes,
getCrashesInLastHour,
clearCrashes,
classifyError,
} from "./crash-tracker.js";
// Covers the in-memory crash history: recording, the size cap, and the
// rolling one-hour count. The clock is driven through vi.useFakeTimers /
// vi.setSystemTime so the timestamps under test are deterministic.
describe("crash-tracker", () => {
  beforeEach(() => {
    // Start every test from an empty history before taking over the clock.
    clearCrashes();
    vi.useFakeTimers();
  });

  afterEach(() => {
    vi.useRealTimers();
  });

  it("records a crash with timestamp", () => {
    vi.setSystemTime(new Date("2026-01-29T12:00:00Z"));

    recordCrash({
      errorType: "fetch_failed",
      errorMessage: "ECONNREFUSED",
      uptimeMs: 5000,
      backoffMs: 2000,
      consecutiveFailures: 1,
    });

    const history = getRecentCrashes();
    expect(history).toHaveLength(1);

    const first = history[0];
    expect(first.timestamp).toBe(Date.now());
    expect(first.errorType).toBe("fetch_failed");
  });

  it("limits to MAX_CRASH_HISTORY entries", () => {
    // Record 25 crashes numbered 0..24; only the newest 20 should be kept.
    Array.from({ length: 25 }, (_, attempt) => attempt).forEach((attempt) => {
      recordCrash({
        errorType: "network_error",
        errorMessage: `Error ${attempt}`,
        uptimeMs: 0,
        backoffMs: 2000,
        consecutiveFailures: attempt + 1,
      });
    });

    const history = getRecentCrashes();
    expect(history).toHaveLength(20);
    // Entries 0..4 were evicted, so the oldest survivor is number 5.
    expect(history[0].errorMessage).toBe("Error 5");
  });

  it("counts crashes in last hour correctly", () => {
    const timeline = [
      { at: "2026-01-29T12:00:00Z", errorMessage: "a", backoffMs: 2000, consecutiveFailures: 1 },
      { at: "2026-01-29T12:30:00Z", errorMessage: "b", backoffMs: 4000, consecutiveFailures: 2 },
    ];

    for (const { at, errorMessage, backoffMs, consecutiveFailures } of timeline) {
      vi.setSystemTime(new Date(at));
      recordCrash({
        errorType: "fetch_failed",
        errorMessage,
        uptimeMs: 0,
        backoffMs,
        consecutiveFailures,
      });
    }

    // One second short of 13:30: "a" is just under 90 minutes old (outside
    // the window) while "b" is just under 60 minutes old (still inside).
    vi.setSystemTime(new Date("2026-01-29T13:29:59Z"));
    expect(getCrashesInLastHour()).toBe(1);
  });
});
// Checks the message-based categories returned by classifyError, plus its
// handling of empty and non-Error inputs.
describe("classifyError", () => {
  it("classifies fetch failed errors", () => {
    const messages = ["TypeError: fetch failed", "connect ECONNREFUSED 127.0.0.1:443"];
    for (const message of messages) {
      expect(classifyError(new Error(message))).toBe("fetch_failed");
    }
  });

  it("classifies network errors", () => {
    const messages = ["read ECONNRESET", "connect ETIMEDOUT", "network unreachable"];
    for (const message of messages) {
      expect(classifyError(new Error(message))).toBe("network_error");
    }
  });

  it("classifies startup errors", () => {
    const messages = ["startup failed: missing config", "init error: bad credentials"];
    for (const message of messages) {
      expect(classifyError(new Error(message))).toBe("startup_error");
    }
  });

  it("defaults to runtime_error for unrecognized errors", () => {
    const messages = ["something went wrong", "unexpected condition"];
    for (const message of messages) {
      expect(classifyError(new Error(message))).toBe("runtime_error");
    }
  });

  it("handles null/undefined safely", () => {
    for (const missing of [null, undefined]) {
      expect(classifyError(missing)).toBe("unknown");
    }
  });

  it("handles non-Error objects", () => {
    // A bare string and a plain object carrying a `message` property.
    const nonErrors: unknown[] = ["string error", { message: "object error" }];
    for (const value of nonErrors) {
      expect(classifyError(value)).toBe("runtime_error");
    }
  });
});

View File

@ -0,0 +1,102 @@
// src/cli/gateway-cli/crash-tracker.ts
import { createSubsystemLogger } from "../../logging/subsystem.js";
import { isTransientNetworkError } from "../../infra/unhandled-rejections.js";
// Logger created for the "gateway" subsystem; recordCrash reports each crash through log.error.
const log = createSubsystemLogger("gateway");
/**
 * Category attached to a recorded crash; these are the values classifyError returns.
 *
 * - "fetch_failed" / "network_error": connectivity-related failures.
 * - "startup_error": the message mentions startup or initialization.
 * - "runtime_error": fallback when nothing more specific matches.
 * - "unknown": no error value was available to classify (falsy input).
 */
export type CrashErrorType =
| "fetch_failed"
| "network_error"
| "startup_error"
| "runtime_error"
| "unknown";
/**
 * One entry in the in-memory crash history.
 * Callers of recordCrash supply every field except `timestamp`.
 */
export type CrashRecord = {
// Date.now() at the moment recordCrash stored the entry.
timestamp: number;
// Category of the failure (see CrashErrorType).
errorType: CrashErrorType;
// Message text for the failure; logged verbatim by recordCrash.
errorMessage: string;
// NOTE(review): presumably how long the gateway ran before this crash, in ms — confirm with the caller.
uptimeMs: number;
// NOTE(review): presumably the restart delay chosen after this crash, in ms — confirm with the caller.
backoffMs: number;
// NOTE(review): presumably the count of back-to-back failures including this one — confirm with the caller.
consecutiveFailures: number;
};
// Upper bound on retained crash records; recordCrash drops the oldest entry once it is exceeded.
const MAX_CRASH_HISTORY = 20;
// Module-level crash history. recordCrash appends to the end, so the oldest entry is at index 0.
const recentCrashes: CrashRecord[] = [];
/**
 * Stores a crash in the in-memory history and logs it as a "gateway_crash" error.
 *
 * The entry is stamped with the current time, appended to the history, and the
 * oldest entry is evicted if the history grew past MAX_CRASH_HISTORY. The log
 * payload also carries the number of crashes seen in the last hour, computed
 * after this crash has been added.
 *
 * @param record - Crash details without a timestamp; the timestamp is filled in here.
 */
export function recordCrash(record: Omit<CrashRecord, "timestamp">): void {
  const stamped: CrashRecord = { ...record, timestamp: Date.now() };
  recentCrashes.push(stamped);

  // Exactly one entry is added per call, so dropping one restores the cap.
  const overCapacity = recentCrashes.length > MAX_CRASH_HISTORY;
  if (overCapacity) {
    recentCrashes.shift();
  }

  const { errorType, errorMessage, uptimeMs, backoffMs, consecutiveFailures } = record;
  log.error("gateway_crash", {
    errorType,
    errorMessage,
    uptimeMs,
    backoffMs,
    consecutiveFailures,
    crashesInLastHour: getCrashesInLastHour(),
  });
}
/**
 * Returns the retained crash records, oldest first.
 *
 * The result is a shallow snapshot rather than the tracker's own array: the
 * `readonly` in the return type is erased at runtime, so handing out the
 * internal array would let a caller mutate tracker state through a cast, and a
 * held result would change underneath the caller when later crashes are
 * recorded or the history is cleared.
 *
 * @returns A new array holding the current crash records (the record objects themselves are shared).
 */
export function getRecentCrashes(): readonly CrashRecord[] {
  return [...recentCrashes];
}
/**
 * Counts the retained crashes whose timestamp is strictly newer than one hour
 * before the current time.
 *
 * @returns The number of matching records in the in-memory history.
 */
export function getCrashesInLastHour(): number {
  const cutoff = Date.now() - 60 * 60 * 1000;

  let recent = 0;
  for (const crash of recentCrashes) {
    if (crash.timestamp > cutoff) {
      recent += 1;
    }
  }
  return recent;
}
/**
 * Removes every record from the in-memory crash history.
 *
 * The array is emptied in place because `recentCrashes` is a `const` binding
 * shared by the other functions in this module.
 */
export function clearCrashes(): void {
  recentCrashes.splice(0, recentCrashes.length);
}
// Substrings that classifyError searches for in the lowercased error message.
// Entries must therefore be lowercase. The fetch list is consulted before the
// network list, so a message matching both is reported as "fetch_failed".

// A match here classifies the error as "fetch_failed".
const FETCH_FAILED_PATTERNS = ["fetch failed", "econnrefused"];
// A match here classifies the error as "network_error".
const NETWORK_ERROR_PATTERNS = [
"econnreset",
"etimedout",
"enotfound",
"ehostunreach",
"enetunreach",
"network unreachable",
"socket hang up",
];
/**
 * Extracts a message string from an arbitrary thrown value.
 *
 * @param err - Anything that may have been thrown or rejected.
 * @returns The string itself, the `message` of an Error, the string `message`
 *   property of a non-null object, or "" when none of those apply.
 */
function getErrorMessage(err: unknown): string {
  if (typeof err === "string") {
    return err;
  }
  if (err instanceof Error) {
    return err.message;
  }
  // Primitives other than strings, null and undefined carry no message.
  if (typeof err !== "object" || err === null) {
    return "";
  }
  if (!("message" in err)) {
    return "";
  }
  return typeof err.message === "string" ? err.message : "";
}
// Matches "startup" anywhere in a lowercased message, and "init" only when it
// is not preceded by a letter. A plain substring test for "init" also hits
// unrelated words such as "definition" or "infinite", which would mislabel
// ordinary runtime failures as startup errors. Forms like "init error",
// "initialization failed" or "db_init" still match.
const STARTUP_MESSAGE_PATTERN = /startup|(?<![a-z])init/;

/**
 * Maps an arbitrary thrown value to a crash category.
 *
 * Resolution order:
 * 1. Falsy input (null, undefined, "", 0, false) is "unknown".
 * 2. A message containing a fetch-failure pattern is "fetch_failed".
 * 3. An error the imported isTransientNetworkError helper accepts, or a message
 *    containing a network pattern, is "network_error".
 * 4. A message mentioning startup/initialization is "startup_error".
 * 5. Everything else is "runtime_error".
 *
 * @param err - The thrown or rejected value to classify.
 * @returns The category to store on the crash record.
 */
export function classifyError(err: unknown): CrashErrorType {
  if (!err) {
    return "unknown";
  }

  // Patterns are lowercase, so compare against a lowercased message.
  const message = getErrorMessage(err).toLowerCase();
  const mentionsAny = (patterns: readonly string[]): boolean =>
    patterns.some((pattern) => message.includes(pattern));

  // Reuse the shared transient-network detection so errors identified by means
  // other than their message text are still treated as network failures.
  const transient = isTransientNetworkError(err);

  // A fetch-failure message wins whether or not the helper flagged the error.
  if (mentionsAny(FETCH_FAILED_PATTERNS)) {
    return "fetch_failed";
  }
  if (transient || mentionsAny(NETWORK_ERROR_PATTERNS)) {
    return "network_error";
  }
  if (STARTUP_MESSAGE_PATTERN.test(message)) {
    return "startup_error";
  }
  return "runtime_error";
}