feat(observability): add Prometheus metrics and OpenTelemetry tracing

- Add Prometheus metrics endpoints (GET /metrics)
- Add health checks (GET /health, GET /ready)
- Add OpenTelemetry SDK with automatic HTTP instrumentation
- Expose update metrics for messages, AI calls, and errors
This commit is contained in:
lluviaoscuradeldoce-design 2026-01-29 16:32:05 -05:00
parent f712388584
commit bb97993755
6 changed files with 1393 additions and 121 deletions

View File

@ -167,6 +167,12 @@
"@mariozechner/pi-coding-agent": "0.49.3", "@mariozechner/pi-coding-agent": "0.49.3",
"@mariozechner/pi-tui": "0.49.3", "@mariozechner/pi-tui": "0.49.3",
"@mozilla/readability": "^0.6.0", "@mozilla/readability": "^0.6.0",
"@opentelemetry/api": "^1.9.0",
"@opentelemetry/auto-instrumentations-node": "^0.69.0",
"@opentelemetry/exporter-prometheus": "^0.211.0",
"@opentelemetry/resources": "^2.5.0",
"@opentelemetry/sdk-node": "^0.211.0",
"@opentelemetry/semantic-conventions": "^1.39.0",
"@sinclair/typebox": "0.34.47", "@sinclair/typebox": "0.34.47",
"@slack/bolt": "^4.6.0", "@slack/bolt": "^4.6.0",
"@slack/web-api": "^7.13.0", "@slack/web-api": "^7.13.0",
@ -185,7 +191,7 @@
"express": "^5.2.1", "express": "^5.2.1",
"file-type": "^21.3.0", "file-type": "^21.3.0",
"grammy": "^1.39.3", "grammy": "^1.39.3",
"hono": "4.11.4", "hono": "4.11.7",
"jiti": "^2.6.1", "jiti": "^2.6.1",
"json5": "^2.2.3", "json5": "^2.2.3",
"jszip": "^3.10.1", "jszip": "^3.10.1",
@ -196,11 +202,12 @@
"osc-progress": "^0.3.0", "osc-progress": "^0.3.0",
"pdfjs-dist": "^5.4.530", "pdfjs-dist": "^5.4.530",
"playwright-core": "1.58.0", "playwright-core": "1.58.0",
"prom-client": "^15.1.3",
"proper-lockfile": "^4.1.2", "proper-lockfile": "^4.1.2",
"qrcode-terminal": "^0.12.0", "qrcode-terminal": "^0.12.0",
"sharp": "^0.34.5", "sharp": "^0.34.5",
"sqlite-vec": "0.1.7-alpha.2", "sqlite-vec": "0.1.7-alpha.2",
"tar": "7.5.4", "tar": "7.5.7",
"tslog": "^4.10.2", "tslog": "^4.10.2",
"undici": "^7.19.0", "undici": "^7.19.0",
"ws": "^8.19.0", "ws": "^8.19.0",
@ -241,14 +248,14 @@
"wireit": "^0.14.12" "wireit": "^0.14.12"
}, },
"overrides": { "overrides": {
"tar": "7.5.4" "tar": "7.5.7"
}, },
"pnpm": { "pnpm": {
"minimumReleaseAge": 2880, "minimumReleaseAge": 2880,
"overrides": { "overrides": {
"@sinclair/typebox": "0.34.47", "@sinclair/typebox": "0.34.47",
"hono": "4.11.4", "hono": "4.11.7",
"tar": "7.5.4" "tar": "7.5.7"
} }
}, },
"vitest": { "vitest": {
@ -283,4 +290,4 @@
"dist/Moltbot.app/**" "dist/Moltbot.app/**"
] ]
} }
} }

1075
pnpm-lock.yaml generated

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,97 @@
/**
* HTTP handler for Prometheus metrics endpoint.
*
* Exposes metrics at GET /metrics in Prometheus text format.
*/
import type { IncomingMessage, ServerResponse } from 'node:http';
import { getMetricsText, getMetricsContentType } from '../infra/metrics.js';
/**
* Handle GET /metrics requests for Prometheus scraping.
*/
export async function handleMetricsRequest(
_req: IncomingMessage,
res: ServerResponse,
): Promise<void> {
try {
const metricsText = await getMetricsText();
res.writeHead(200, {
'Content-Type': getMetricsContentType(),
'Cache-Control': 'no-cache, no-store, must-revalidate',
});
res.end(metricsText);
} catch (error) {
res.writeHead(500, { 'Content-Type': 'text/plain' });
res.end('Error collecting metrics');
}
}
/**
* Handle GET /health liveness probe.
* Always returns 200 if the server is responding.
*/
export function handleHealthRequest(
_req: IncomingMessage,
res: ServerResponse,
): void {
res.writeHead(200, { 'Content-Type': 'application/json' });
res.end(JSON.stringify({
status: 'ok',
timestamp: new Date().toISOString(),
}));
}
/**
* Readiness check function type.
* Returns true if the service is ready to accept traffic.
*/
export type ReadinessCheck = () => Promise<boolean> | boolean;
/**
* Create a readiness handler with custom checks.
*
* @param checks - Array of check functions that must all pass
*/
export function createReadinessHandler(
checks: ReadinessCheck[] = [],
): (req: IncomingMessage, res: ServerResponse) => Promise<void> {
return async (_req: IncomingMessage, res: ServerResponse): Promise<void> => {
try {
const results = await Promise.all(checks.map((check) => check()));
const isReady = results.every(Boolean);
if (isReady) {
res.writeHead(200, { 'Content-Type': 'application/json' });
res.end(JSON.stringify({
status: 'ready',
timestamp: new Date().toISOString(),
}));
} else {
res.writeHead(503, { 'Content-Type': 'application/json' });
res.end(JSON.stringify({
status: 'not_ready',
timestamp: new Date().toISOString(),
}));
}
} catch (error) {
res.writeHead(503, { 'Content-Type': 'application/json' });
res.end(JSON.stringify({
status: 'error',
timestamp: new Date().toISOString(),
}));
}
};
}
/**
* Simple readiness handler that always returns ready.
* Use createReadinessHandler for custom checks.
*/
export async function handleReadyRequest(
req: IncomingMessage,
res: ServerResponse,
): Promise<void> {
const handler = createReadinessHandler([]);
return handler(req, res);
}

View File

@ -30,6 +30,11 @@ import { applyHookMappings } from "./hooks-mapping.js";
import { handleOpenAiHttpRequest } from "./openai-http.js"; import { handleOpenAiHttpRequest } from "./openai-http.js";
import { handleOpenResponsesHttpRequest } from "./openresponses-http.js"; import { handleOpenResponsesHttpRequest } from "./openresponses-http.js";
import { handleToolsInvokeHttpRequest } from "./tools-invoke-http.js"; import { handleToolsInvokeHttpRequest } from "./tools-invoke-http.js";
import {
handleMetricsRequest,
handleHealthRequest,
handleReadyRequest,
} from "./metrics-http.js";
type SubsystemLogger = ReturnType<typeof createSubsystemLogger>; type SubsystemLogger = ReturnType<typeof createSubsystemLogger>;
@ -86,8 +91,8 @@ export function createHooksRequestHandler(
if (fromQuery) { if (fromQuery) {
logHooks.warn( logHooks.warn(
"Hook token provided via query parameter is deprecated for security reasons. " + "Hook token provided via query parameter is deprecated for security reasons. " +
"Tokens in URLs appear in logs, browser history, and referrer headers. " + "Tokens in URLs appear in logs, browser history, and referrer headers. " +
"Use Authorization: Bearer <token> or X-Moltbot-Token header instead.", "Use Authorization: Bearer <token> or X-Moltbot-Token header instead.",
); );
} }
@ -225,17 +230,32 @@ export function createGatewayHttpServer(opts: {
} = opts; } = opts;
const httpServer: HttpServer = opts.tlsOptions const httpServer: HttpServer = opts.tlsOptions
? createHttpsServer(opts.tlsOptions, (req, res) => { ? createHttpsServer(opts.tlsOptions, (req, res) => {
void handleRequest(req, res); void handleRequest(req, res);
}) })
: createHttpServer((req, res) => { : createHttpServer((req, res) => {
void handleRequest(req, res); void handleRequest(req, res);
}); });
async function handleRequest(req: IncomingMessage, res: ServerResponse) { async function handleRequest(req: IncomingMessage, res: ServerResponse) {
// Don't interfere with WebSocket upgrades; ws handles the 'upgrade' event. // Don't interfere with WebSocket upgrades; ws handles the 'upgrade' event.
if (String(req.headers.upgrade ?? "").toLowerCase() === "websocket") return; if (String(req.headers.upgrade ?? "").toLowerCase() === "websocket") return;
try { try {
// Observability endpoints - no auth required for health checks
const url = new URL(req.url ?? "/", `http://${req.headers.host ?? "localhost"}`);
if (req.method === "GET" && url.pathname === "/metrics") {
await handleMetricsRequest(req, res);
return;
}
if (req.method === "GET" && url.pathname === "/health") {
handleHealthRequest(req, res);
return;
}
if (req.method === "GET" && url.pathname === "/ready") {
await handleReadyRequest(req, res);
return;
}
const configSnapshot = loadConfig(); const configSnapshot = loadConfig();
const trustedProxies = configSnapshot.gateway?.trustedProxies ?? []; const trustedProxies = configSnapshot.gateway?.trustedProxies ?? [];
if (await handleHooksRequest(req, res)) return; if (await handleHooksRequest(req, res)) return;

158
src/infra/metrics.ts Normal file
View File

@ -0,0 +1,158 @@
/**
* Prometheus metrics for Moltbot observability.
*
* Usage:
* import { metrics } from '../infra/metrics.js';
* metrics.messagesTotal.inc({ channel: 'whatsapp', direction: 'inbound' });
*/
import * as promClient from 'prom-client';
// Create a custom registry to avoid global state pollution
export const metricsRegistry = new promClient.Registry();
// Add default metrics (process CPU, memory, etc.)
promClient.collectDefaultMetrics({ register: metricsRegistry });
// =============================================================================
// COUNTERS - Monotonically increasing values
// =============================================================================
/** Total messages processed by channel and direction */
export const messagesTotal = new promClient.Counter({
name: 'moltbot_messages_total',
help: 'Total number of messages processed',
labelNames: ['channel', 'direction'] as const, // direction: inbound | outbound
registers: [metricsRegistry],
});
/** Total HTTP requests by method, route, and status */
export const httpRequestsTotal = new promClient.Counter({
name: 'moltbot_http_requests_total',
help: 'Total HTTP requests received',
labelNames: ['method', 'route', 'status'] as const,
registers: [metricsRegistry],
});
/** Total errors by type and source */
export const errorsTotal = new promClient.Counter({
name: 'moltbot_errors_total',
help: 'Total errors encountered',
labelNames: ['type', 'source'] as const,
registers: [metricsRegistry],
});
/** Total AI provider calls by provider and model */
export const aiCallsTotal = new promClient.Counter({
name: 'moltbot_ai_calls_total',
help: 'Total AI provider API calls',
labelNames: ['provider', 'model'] as const,
registers: [metricsRegistry],
});
// =============================================================================
// HISTOGRAMS - Request duration distributions
// =============================================================================
/** HTTP request duration in seconds */
export const httpRequestDuration = new promClient.Histogram({
name: 'moltbot_http_request_duration_seconds',
help: 'HTTP request duration in seconds',
labelNames: ['method', 'route'] as const,
buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
registers: [metricsRegistry],
});
/** AI provider response latency in seconds */
export const aiLatency = new promClient.Histogram({
name: 'moltbot_ai_latency_seconds',
help: 'AI provider response latency in seconds',
labelNames: ['provider', 'model'] as const,
buckets: [0.1, 0.5, 1, 2, 5, 10, 30, 60],
registers: [metricsRegistry],
});
/** Message processing time in seconds */
export const messageProcessingDuration = new promClient.Histogram({
name: 'moltbot_message_processing_duration_seconds',
help: 'Time to process a message end-to-end',
labelNames: ['channel'] as const,
buckets: [0.1, 0.5, 1, 2, 5, 10, 30],
registers: [metricsRegistry],
});
// =============================================================================
// GAUGES - Values that can go up and down
// =============================================================================
/** Number of active sessions in memory */
export const activeSessions = new promClient.Gauge({
name: 'moltbot_active_sessions',
help: 'Number of active sessions currently in memory',
registers: [metricsRegistry],
});
/** Number of connected channels */
export const connectedChannels = new promClient.Gauge({
name: 'moltbot_connected_channels',
help: 'Number of channels currently connected',
labelNames: ['channel'] as const,
registers: [metricsRegistry],
});
/** WebSocket connections count */
export const websocketConnections = new promClient.Gauge({
name: 'moltbot_websocket_connections',
help: 'Number of active WebSocket connections',
registers: [metricsRegistry],
});
/** Gateway uptime in seconds */
export const uptimeSeconds = new promClient.Gauge({
name: 'moltbot_uptime_seconds',
help: 'Gateway uptime in seconds',
registers: [metricsRegistry],
});
// =============================================================================
// Helper functions
// =============================================================================
const startTime = Date.now();
/** Update the uptime gauge - call periodically */
export function updateUptime(): void {
uptimeSeconds.set((Date.now() - startTime) / 1000);
}
/** Get metrics in Prometheus text format */
export async function getMetricsText(): Promise<string> {
updateUptime();
return metricsRegistry.metrics();
}
/** Get the content type for Prometheus metrics */
export function getMetricsContentType(): string {
return metricsRegistry.contentType;
}
// Convenience export for common usage patterns
export const metrics = {
messagesTotal,
httpRequestsTotal,
errorsTotal,
aiCallsTotal,
httpRequestDuration,
aiLatency,
messageProcessingDuration,
activeSessions,
connectedChannels,
websocketConnections,
uptimeSeconds,
getMetricsText,
getMetricsContentType,
updateUptime,
registry: metricsRegistry,
};
export default metrics;

133
src/infra/otel.ts Normal file
View File

@ -0,0 +1,133 @@
/**
* OpenTelemetry SDK initialization for Moltbot.
*
* This module sets up distributed tracing with automatic HTTP instrumentation.
* Import this at the very start of your application entry point.
*
* Usage:
* import './infra/otel.js'; // Must be first import
* import { trace } from '@opentelemetry/api';
*/
import { NodeSDK } from '@opentelemetry/sdk-node';
import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';
import { trace, context, SpanStatusCode, type Span } from '@opentelemetry/api';
// SDK configuration - only enable if OTEL_ENABLED is set
const isOtelEnabled = process.env.OTEL_ENABLED === 'true' || process.env.OTEL_ENABLED === '1';
let sdk: NodeSDK | null = null;
if (isOtelEnabled) {
sdk = new NodeSDK({
serviceName: 'moltbot',
instrumentations: [
getNodeAutoInstrumentations({
// Disable some noisy instrumentations
'@opentelemetry/instrumentation-fs': { enabled: false },
'@opentelemetry/instrumentation-dns': { enabled: false },
}),
],
});
// Start the SDK
sdk.start();
// Graceful shutdown
process.on('SIGTERM', () => {
sdk?.shutdown()
.then(() => console.log('[otel] Tracing terminated'))
.catch((error) => console.error('[otel] Error terminating tracing', error))
.finally(() => process.exit(0));
});
console.log('[otel] OpenTelemetry SDK initialized');
} else {
console.log('[otel] OpenTelemetry disabled (set OTEL_ENABLED=true to enable)');
}
// =============================================================================
// Helper functions for manual instrumentation
// =============================================================================
/**
* Get the current tracer for manual span creation.
*/
export function getTracer(name: string = 'moltbot') {
return trace.getTracer(name);
}
/**
* Get the current span context for correlation.
*/
export function getCurrentSpan() {
return trace.getActiveSpan();
}
/**
* Get the current trace ID for log correlation.
*/
export function getCurrentTraceId(): string | undefined {
const span = getCurrentSpan();
if (!span) return undefined;
return span.spanContext().traceId;
}
/**
* Get the current span ID for log correlation.
*/
export function getCurrentSpanId(): string | undefined {
const span = getCurrentSpan();
if (!span) return undefined;
return span.spanContext().spanId;
}
/**
* Create a custom span for a specific operation.
*
* @example
* await withSpan('ai.completion', { provider: 'openai' }, async (span) => {
* const result = await callOpenAI();
* span.setAttribute('tokens', result.usage.total_tokens);
* return result;
* });
*/
export async function withSpan<T>(
name: string,
attributes: Record<string, string | number | boolean> = {},
fn: (span: ReturnType<typeof trace.getTracer>['startSpan'] extends (name: string) => infer R ? R : never) => Promise<T>,
): Promise<T> {
const tracer = getTracer();
const span = tracer.startSpan(name);
for (const [key, value] of Object.entries(attributes)) {
span.setAttribute(key, value);
}
try {
const result = await context.with(trace.setSpan(context.active(), span), () => fn(span));
span.setStatus({ code: SpanStatusCode.OK });
return result;
} catch (error) {
span.setStatus({
code: SpanStatusCode.ERROR,
message: error instanceof Error ? error.message : 'Unknown error',
});
span.recordException(error instanceof Error ? error : new Error(String(error)));
throw error;
} finally {
span.end();
}
}
/**
* Add trace context to a log record.
*/
export function getTraceContext(): { traceId?: string; spanId?: string } {
return {
traceId: getCurrentTraceId(),
spanId: getCurrentSpanId(),
};
}
export { sdk as otelSdk };