feat(observability): add Prometheus metrics and OpenTelemetry tracing

- Add Prometheus metrics endpoints (GET /metrics) - Add health checks (GET /health, GET /ready) - Add OpenTelemetry SDK with automatic HTTP instrumentation - Expose update metrics for messages, AI calls, and errors
2026-01-29 16:32:05 -05:00 · 2026-01-29 16:32:05 -05:00 · bb97993755
commit bb97993755
parent f712388584
6 changed files with 1393 additions and 121 deletions
--- a/package.json
+++ b/package.json
@ -167,6 +167,12 @@
    "@mariozechner/pi-coding-agent": "0.49.3",
    "@mariozechner/pi-tui": "0.49.3",
    "@mozilla/readability": "^0.6.0",
+    "@opentelemetry/api": "^1.9.0",
+    "@opentelemetry/auto-instrumentations-node": "^0.69.0",
+    "@opentelemetry/exporter-prometheus": "^0.211.0",
+    "@opentelemetry/resources": "^2.5.0",
+    "@opentelemetry/sdk-node": "^0.211.0",
+    "@opentelemetry/semantic-conventions": "^1.39.0",
    "@sinclair/typebox": "0.34.47",
    "@slack/bolt": "^4.6.0",
    "@slack/web-api": "^7.13.0",
@ -185,7 +191,7 @@
    "express": "^5.2.1",
    "file-type": "^21.3.0",
    "grammy": "^1.39.3",
-    "hono": "4.11.4",
+    "hono": "4.11.7",
    "jiti": "^2.6.1",
    "json5": "^2.2.3",
    "jszip": "^3.10.1",
@ -196,11 +202,12 @@
    "osc-progress": "^0.3.0",
    "pdfjs-dist": "^5.4.530",
    "playwright-core": "1.58.0",
+    "prom-client": "^15.1.3",
    "proper-lockfile": "^4.1.2",
    "qrcode-terminal": "^0.12.0",
    "sharp": "^0.34.5",
    "sqlite-vec": "0.1.7-alpha.2",
-    "tar": "7.5.4",
+    "tar": "7.5.7",
    "tslog": "^4.10.2",
    "undici": "^7.19.0",
    "ws": "^8.19.0",
@ -241,14 +248,14 @@
    "wireit": "^0.14.12"
  },
  "overrides": {
-    "tar": "7.5.4"
+    "tar": "7.5.7"
  },
  "pnpm": {
    "minimumReleaseAge": 2880,
    "overrides": {
      "@sinclair/typebox": "0.34.47",
-      "hono": "4.11.4",
-      "tar": "7.5.4"
+      "hono": "4.11.7",
+      "tar": "7.5.7"
    }
  },
  "vitest": {
@ -283,4 +290,4 @@
      "dist/Moltbot.app/**"
    ]
  }
-}
+}
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
--- a/src/gateway/metrics-http.ts
+++ b/src/gateway/metrics-http.ts
@ -0,0 +1,97 @@
+/**
+ * HTTP handler for Prometheus metrics endpoint.
+ * 
+ * Exposes metrics at GET /metrics in Prometheus text format.
+ */
+
+import type { IncomingMessage, ServerResponse } from 'node:http';
+import { getMetricsText, getMetricsContentType } from '../infra/metrics.js';
+
+/**
+ * Handle GET /metrics requests for Prometheus scraping.
+ */
+export async function handleMetricsRequest(
+    _req: IncomingMessage,
+    res: ServerResponse,
+): Promise<void> {
+    try {
+        const metricsText = await getMetricsText();
+        res.writeHead(200, {
+            'Content-Type': getMetricsContentType(),
+            'Cache-Control': 'no-cache, no-store, must-revalidate',
+        });
+        res.end(metricsText);
+    } catch (error) {
+        res.writeHead(500, { 'Content-Type': 'text/plain' });
+        res.end('Error collecting metrics');
+    }
+}
+
+/**
+ * Handle GET /health liveness probe.
+ * Always returns 200 if the server is responding.
+ */
+export function handleHealthRequest(
+    _req: IncomingMessage,
+    res: ServerResponse,
+): void {
+    res.writeHead(200, { 'Content-Type': 'application/json' });
+    res.end(JSON.stringify({
+        status: 'ok',
+        timestamp: new Date().toISOString(),
+    }));
+}
+
+/**
+ * Readiness check function type.
+ * Returns true if the service is ready to accept traffic.
+ */
+export type ReadinessCheck = () => Promise<boolean> | boolean;
+
+/**
+ * Create a readiness handler with custom checks.
+ * 
+ * @param checks - Array of check functions that must all pass
+ */
+export function createReadinessHandler(
+    checks: ReadinessCheck[] = [],
+): (req: IncomingMessage, res: ServerResponse) => Promise<void> {
+    return async (_req: IncomingMessage, res: ServerResponse): Promise<void> => {
+        try {
+            const results = await Promise.all(checks.map((check) => check()));
+            const isReady = results.every(Boolean);
+
+            if (isReady) {
+                res.writeHead(200, { 'Content-Type': 'application/json' });
+                res.end(JSON.stringify({
+                    status: 'ready',
+                    timestamp: new Date().toISOString(),
+                }));
+            } else {
+                res.writeHead(503, { 'Content-Type': 'application/json' });
+                res.end(JSON.stringify({
+                    status: 'not_ready',
+                    timestamp: new Date().toISOString(),
+                }));
+            }
+        } catch (error) {
+            res.writeHead(503, { 'Content-Type': 'application/json' });
+            res.end(JSON.stringify({
+                status: 'error',
+                timestamp: new Date().toISOString(),
+            }));
+        }
+    };
+}
+
+/**
+ * Simple readiness handler that always returns ready.
+ * Use createReadinessHandler for custom checks.
+ */
+export async function handleReadyRequest(
+    req: IncomingMessage,
+    res: ServerResponse,
+): Promise<void> {
+    const handler = createReadinessHandler([]);
+    return handler(req, res);
+}
--- a/src/gateway/server-http.ts
+++ b/src/gateway/server-http.ts
@ -30,6 +30,11 @@ import { applyHookMappings } from "./hooks-mapping.js";
 import { handleOpenAiHttpRequest } from "./openai-http.js";
 import { handleOpenResponsesHttpRequest } from "./openresponses-http.js";
 import { handleToolsInvokeHttpRequest } from "./tools-invoke-http.js";
+import {
+  handleMetricsRequest,
+  handleHealthRequest,
+  handleReadyRequest,
+} from "./metrics-http.js";

 type SubsystemLogger = ReturnType<typeof createSubsystemLogger>;

@ -86,8 +91,8 @@ export function createHooksRequestHandler(
    if (fromQuery) {
      logHooks.warn(
        "Hook token provided via query parameter is deprecated for security reasons. " +
-          "Tokens in URLs appear in logs, browser history, and referrer headers. " +
-          "Use Authorization: Bearer <token> or X-Moltbot-Token header instead.",
+        "Tokens in URLs appear in logs, browser history, and referrer headers. " +
+        "Use Authorization: Bearer <token> or X-Moltbot-Token header instead.",
      );
    }

@ -225,17 +230,32 @@ export function createGatewayHttpServer(opts: {
  } = opts;
  const httpServer: HttpServer = opts.tlsOptions
    ? createHttpsServer(opts.tlsOptions, (req, res) => {
-        void handleRequest(req, res);
-      })
+      void handleRequest(req, res);
+    })
    : createHttpServer((req, res) => {
-        void handleRequest(req, res);
-      });
+      void handleRequest(req, res);
+    });

  async function handleRequest(req: IncomingMessage, res: ServerResponse) {
    // Don't interfere with WebSocket upgrades; ws handles the 'upgrade' event.
    if (String(req.headers.upgrade ?? "").toLowerCase() === "websocket") return;

    try {
+      // Observability endpoints - no auth required for health checks
+      const url = new URL(req.url ?? "/", `http://${req.headers.host ?? "localhost"}`);
+      if (req.method === "GET" && url.pathname === "/metrics") {
+        await handleMetricsRequest(req, res);
+        return;
+      }
+      if (req.method === "GET" && url.pathname === "/health") {
+        handleHealthRequest(req, res);
+        return;
+      }
+      if (req.method === "GET" && url.pathname === "/ready") {
+        await handleReadyRequest(req, res);
+        return;
+      }
+
      const configSnapshot = loadConfig();
      const trustedProxies = configSnapshot.gateway?.trustedProxies ?? [];
      if (await handleHooksRequest(req, res)) return;
--- a/src/infra/metrics.ts
+++ b/src/infra/metrics.ts
@ -0,0 +1,158 @@
+/**
+ * Prometheus metrics for Moltbot observability.
+ * 
+ * Usage:
+ *   import { metrics } from '../infra/metrics.js';
+ *   metrics.messagesTotal.inc({ channel: 'whatsapp', direction: 'inbound' });
+ */
+
+import * as promClient from 'prom-client';
+
+// Create a custom registry to avoid global state pollution
+export const metricsRegistry = new promClient.Registry();
+
+// Add default metrics (process CPU, memory, etc.)
+promClient.collectDefaultMetrics({ register: metricsRegistry });
+
+// =============================================================================
+// COUNTERS - Monotonically increasing values
+// =============================================================================
+
+/** Total messages processed by channel and direction */
+export const messagesTotal = new promClient.Counter({
+    name: 'moltbot_messages_total',
+    help: 'Total number of messages processed',
+    labelNames: ['channel', 'direction'] as const, // direction: inbound | outbound
+    registers: [metricsRegistry],
+});
+
+/** Total HTTP requests by method, route, and status */
+export const httpRequestsTotal = new promClient.Counter({
+    name: 'moltbot_http_requests_total',
+    help: 'Total HTTP requests received',
+    labelNames: ['method', 'route', 'status'] as const,
+    registers: [metricsRegistry],
+});
+
+/** Total errors by type and source */
+export const errorsTotal = new promClient.Counter({
+    name: 'moltbot_errors_total',
+    help: 'Total errors encountered',
+    labelNames: ['type', 'source'] as const,
+    registers: [metricsRegistry],
+});
+
+/** Total AI provider calls by provider and model */
+export const aiCallsTotal = new promClient.Counter({
+    name: 'moltbot_ai_calls_total',
+    help: 'Total AI provider API calls',
+    labelNames: ['provider', 'model'] as const,
+    registers: [metricsRegistry],
+});
+
+// =============================================================================
+// HISTOGRAMS - Request duration distributions
+// =============================================================================
+
+/** HTTP request duration in seconds */
+export const httpRequestDuration = new promClient.Histogram({
+    name: 'moltbot_http_request_duration_seconds',
+    help: 'HTTP request duration in seconds',
+    labelNames: ['method', 'route'] as const,
+    buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
+    registers: [metricsRegistry],
+});
+
+/** AI provider response latency in seconds */
+export const aiLatency = new promClient.Histogram({
+    name: 'moltbot_ai_latency_seconds',
+    help: 'AI provider response latency in seconds',
+    labelNames: ['provider', 'model'] as const,
+    buckets: [0.1, 0.5, 1, 2, 5, 10, 30, 60],
+    registers: [metricsRegistry],
+});
+
+/** Message processing time in seconds */
+export const messageProcessingDuration = new promClient.Histogram({
+    name: 'moltbot_message_processing_duration_seconds',
+    help: 'Time to process a message end-to-end',
+    labelNames: ['channel'] as const,
+    buckets: [0.1, 0.5, 1, 2, 5, 10, 30],
+    registers: [metricsRegistry],
+});
+
+// =============================================================================
+// GAUGES - Values that can go up and down
+// =============================================================================
+
+/** Number of active sessions in memory */
+export const activeSessions = new promClient.Gauge({
+    name: 'moltbot_active_sessions',
+    help: 'Number of active sessions currently in memory',
+    registers: [metricsRegistry],
+});
+
+/** Number of connected channels */
+export const connectedChannels = new promClient.Gauge({
+    name: 'moltbot_connected_channels',
+    help: 'Number of channels currently connected',
+    labelNames: ['channel'] as const,
+    registers: [metricsRegistry],
+});
+
+/** WebSocket connections count */
+export const websocketConnections = new promClient.Gauge({
+    name: 'moltbot_websocket_connections',
+    help: 'Number of active WebSocket connections',
+    registers: [metricsRegistry],
+});
+
+/** Gateway uptime in seconds */
+export const uptimeSeconds = new promClient.Gauge({
+    name: 'moltbot_uptime_seconds',
+    help: 'Gateway uptime in seconds',
+    registers: [metricsRegistry],
+});
+
+// =============================================================================
+// Helper functions
+// =============================================================================
+
+const startTime = Date.now();
+
+/** Update the uptime gauge - call periodically */
+export function updateUptime(): void {
+    uptimeSeconds.set((Date.now() - startTime) / 1000);
+}
+
+/** Get metrics in Prometheus text format */
+export async function getMetricsText(): Promise<string> {
+    updateUptime();
+    return metricsRegistry.metrics();
+}
+
+/** Get the content type for Prometheus metrics */
+export function getMetricsContentType(): string {
+    return metricsRegistry.contentType;
+}
+
+// Convenience export for common usage patterns
+export const metrics = {
+    messagesTotal,
+    httpRequestsTotal,
+    errorsTotal,
+    aiCallsTotal,
+    httpRequestDuration,
+    aiLatency,
+    messageProcessingDuration,
+    activeSessions,
+    connectedChannels,
+    websocketConnections,
+    uptimeSeconds,
+    getMetricsText,
+    getMetricsContentType,
+    updateUptime,
+    registry: metricsRegistry,
+};
+
+export default metrics;
--- a/src/infra/otel.ts
+++ b/src/infra/otel.ts
@ -0,0 +1,133 @@
+/**
+ * OpenTelemetry SDK initialization for Moltbot.
+ * 
+ * This module sets up distributed tracing with automatic HTTP instrumentation.
+ * Import this at the very start of your application entry point.
+ * 
+ * Usage:
+ *   import './infra/otel.js'; // Must be first import
+ *   import { trace } from '@opentelemetry/api';
+ */
+
+import { NodeSDK } from '@opentelemetry/sdk-node';
+import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';
+import { trace, context, SpanStatusCode, type Span } from '@opentelemetry/api';
+
+// SDK configuration - only enable if OTEL_ENABLED is set
+const isOtelEnabled = process.env.OTEL_ENABLED === 'true' || process.env.OTEL_ENABLED === '1';
+
+let sdk: NodeSDK | null = null;
+
+if (isOtelEnabled) {
+    sdk = new NodeSDK({
+        serviceName: 'moltbot',
+        instrumentations: [
+            getNodeAutoInstrumentations({
+                // Disable some noisy instrumentations
+                '@opentelemetry/instrumentation-fs': { enabled: false },
+                '@opentelemetry/instrumentation-dns': { enabled: false },
+            }),
+        ],
+    });
+
+    // Start the SDK
+    sdk.start();
+
+    // Graceful shutdown
+    process.on('SIGTERM', () => {
+        sdk?.shutdown()
+            .then(() => console.log('[otel] Tracing terminated'))
+            .catch((error) => console.error('[otel] Error terminating tracing', error))
+            .finally(() => process.exit(0));
+    });
+
+    console.log('[otel] OpenTelemetry SDK initialized');
+} else {
+    console.log('[otel] OpenTelemetry disabled (set OTEL_ENABLED=true to enable)');
+}
+
+// =============================================================================
+// Helper functions for manual instrumentation
+// =============================================================================
+
+/**
+ * Get the current tracer for manual span creation.
+ */
+export function getTracer(name: string = 'moltbot') {
+    return trace.getTracer(name);
+}
+
+/**
+ * Get the current span context for correlation.
+ */
+export function getCurrentSpan() {
+    return trace.getActiveSpan();
+}
+
+/**
+ * Get the current trace ID for log correlation.
+ */
+export function getCurrentTraceId(): string | undefined {
+    const span = getCurrentSpan();
+    if (!span) return undefined;
+    return span.spanContext().traceId;
+}
+
+/**
+ * Get the current span ID for log correlation.
+ */
+export function getCurrentSpanId(): string | undefined {
+    const span = getCurrentSpan();
+    if (!span) return undefined;
+    return span.spanContext().spanId;
+}
+
+/**
+ * Create a custom span for a specific operation.
+ * 
+ * @example
+ * await withSpan('ai.completion', { provider: 'openai' }, async (span) => {
+ *   const result = await callOpenAI();
+ *   span.setAttribute('tokens', result.usage.total_tokens);
+ *   return result;
+ * });
+ */
+export async function withSpan<T>(
+    name: string,
+    attributes: Record<string, string | number | boolean> = {},
+    fn: (span: ReturnType<typeof trace.getTracer>['startSpan'] extends (name: string) => infer R ? R : never) => Promise<T>,
+): Promise<T> {
+    const tracer = getTracer();
+    const span = tracer.startSpan(name);
+
+    for (const [key, value] of Object.entries(attributes)) {
+        span.setAttribute(key, value);
+    }
+
+    try {
+        const result = await context.with(trace.setSpan(context.active(), span), () => fn(span));
+        span.setStatus({ code: SpanStatusCode.OK });
+        return result;
+    } catch (error) {
+        span.setStatus({
+            code: SpanStatusCode.ERROR,
+            message: error instanceof Error ? error.message : 'Unknown error',
+        });
+        span.recordException(error instanceof Error ? error : new Error(String(error)));
+        throw error;
+    } finally {
+        span.end();
+    }
+}
+
+/**
+ * Add trace context to a log record.
+ */
+export function getTraceContext(): { traceId?: string; spanId?: string } {
+    return {
+        traceId: getCurrentTraceId(),
+        spanId: getCurrentSpanId(),
+    };
+}
+
+export { sdk as otelSdk };