feat(observability): add Prometheus metrics and OpenTelemetry tracing

- Add Prometheus metrics endpoints (GET /metrics) - Add health checks (GET /health, GET /ready) - Add OpenTelemetry SDK with automatic HTTP instrumentation - Expose update metrics for messages, AI calls, and errors
2026-01-29 16:32:05 -05:00 · 2026-01-29 16:32:05 -05:00 · bb97993755
commit bb97993755
parent f712388584
6 changed files with 1393 additions and 121 deletions
--- a/package.json
+++ b/package.json
@ -167,6 +167,12 @@
    "@mariozechner/pi-coding-agent": "0.49.3",
    "@mariozechner/pi-tui": "0.49.3",
    "@mozilla/readability": "^0.6.0",
    "@opentelemetry/api": "^1.9.0",
    "@opentelemetry/auto-instrumentations-node": "^0.69.0",
    "@opentelemetry/exporter-prometheus": "^0.211.0",
    "@opentelemetry/resources": "^2.5.0",
    "@opentelemetry/sdk-node": "^0.211.0",
    "@opentelemetry/semantic-conventions": "^1.39.0",
    "@sinclair/typebox": "0.34.47",
    "@slack/bolt": "^4.6.0",
    "@slack/web-api": "^7.13.0",
@ -185,7 +191,7 @@
    "express": "^5.2.1",
    "file-type": "^21.3.0",
    "grammy": "^1.39.3",
-    "hono": "4.11.4",
+    "hono": "4.11.7",
    "jiti": "^2.6.1",
    "json5": "^2.2.3",
    "jszip": "^3.10.1",
@ -196,11 +202,12 @@
    "osc-progress": "^0.3.0",
    "pdfjs-dist": "^5.4.530",
    "playwright-core": "1.58.0",
    "prom-client": "^15.1.3",
    "proper-lockfile": "^4.1.2",
    "qrcode-terminal": "^0.12.0",
    "sharp": "^0.34.5",
    "sqlite-vec": "0.1.7-alpha.2",
-    "tar": "7.5.4",
+    "tar": "7.5.7",
    "tslog": "^4.10.2",
    "undici": "^7.19.0",
    "ws": "^8.19.0",
@ -241,14 +248,14 @@
    "wireit": "^0.14.12"
  },
  "overrides": {
-    "tar": "7.5.4"
+    "tar": "7.5.7"
  },
  "pnpm": {
    "minimumReleaseAge": 2880,
    "overrides": {
      "@sinclair/typebox": "0.34.47",
-      "hono": "4.11.4",
+      "hono": "4.11.7",
-      "tar": "7.5.4"
+      "tar": "7.5.7"
    }
  },
  "vitest": {
@ -283,4 +290,4 @@
      "dist/Moltbot.app/**"
    ]
  }
-}
+}
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
--- a/src/gateway/metrics-http.ts
+++ b/src/gateway/metrics-http.ts
@ -0,0 +1,97 @@
 /**
 * HTTP handler for Prometheus metrics endpoint.
 * 
 * Exposes metrics at GET /metrics in Prometheus text format.
 */
 import type { IncomingMessage, ServerResponse } from 'node:http';
 import { getMetricsText, getMetricsContentType } from '../infra/metrics.js';
 /**
 * Handle GET /metrics requests for Prometheus scraping.
 */
 export async function handleMetricsRequest(
    _req: IncomingMessage,
    res: ServerResponse,
 ): Promise<void> {
    try {
        const metricsText = await getMetricsText();
        res.writeHead(200, {
            'Content-Type': getMetricsContentType(),
            'Cache-Control': 'no-cache, no-store, must-revalidate',
        });
        res.end(metricsText);
    } catch (error) {
        res.writeHead(500, { 'Content-Type': 'text/plain' });
        res.end('Error collecting metrics');
    }
 }
 /**
 * Handle GET /health liveness probe.
 * Always returns 200 if the server is responding.
 */
 export function handleHealthRequest(
    _req: IncomingMessage,
    res: ServerResponse,
 ): void {
    res.writeHead(200, { 'Content-Type': 'application/json' });
    res.end(JSON.stringify({
        status: 'ok',
        timestamp: new Date().toISOString(),
    }));
 }
 /**
 * Readiness check function type.
 * Returns true if the service is ready to accept traffic.
 */
 export type ReadinessCheck = () => Promise<boolean> | boolean;
 /**
 * Create a readiness handler with custom checks.
 * 
 * @param checks - Array of check functions that must all pass
 */
 export function createReadinessHandler(
    checks: ReadinessCheck[] = [],
 ): (req: IncomingMessage, res: ServerResponse) => Promise<void> {
    return async (_req: IncomingMessage, res: ServerResponse): Promise<void> => {
        try {
            const results = await Promise.all(checks.map((check) => check()));
            const isReady = results.every(Boolean);
            if (isReady) {
                res.writeHead(200, { 'Content-Type': 'application/json' });
                res.end(JSON.stringify({
                    status: 'ready',
                    timestamp: new Date().toISOString(),
                }));
            } else {
                res.writeHead(503, { 'Content-Type': 'application/json' });
                res.end(JSON.stringify({
                    status: 'not_ready',
                    timestamp: new Date().toISOString(),
                }));
            }
        } catch (error) {
            res.writeHead(503, { 'Content-Type': 'application/json' });
            res.end(JSON.stringify({
                status: 'error',
                timestamp: new Date().toISOString(),
            }));
        }
    };
 }
 /**
 * Simple readiness handler that always returns ready.
 * Use createReadinessHandler for custom checks.
 */
 export async function handleReadyRequest(
    req: IncomingMessage,
    res: ServerResponse,
 ): Promise<void> {
    const handler = createReadinessHandler([]);
    return handler(req, res);
 }
--- a/src/gateway/server-http.ts
+++ b/src/gateway/server-http.ts
@ -30,6 +30,11 @@ import { applyHookMappings } from "./hooks-mapping.js";
 import { handleOpenAiHttpRequest } from "./openai-http.js";
 import { handleOpenResponsesHttpRequest } from "./openresponses-http.js";
 import { handleToolsInvokeHttpRequest } from "./tools-invoke-http.js";
 import {
  handleMetricsRequest,
  handleHealthRequest,
  handleReadyRequest,
 } from "./metrics-http.js";
 type SubsystemLogger = ReturnType<typeof createSubsystemLogger>;
@ -86,8 +91,8 @@ export function createHooksRequestHandler(
    if (fromQuery) {
      logHooks.warn(
        "Hook token provided via query parameter is deprecated for security reasons. " +
-          "Tokens in URLs appear in logs, browser history, and referrer headers. " +
+        "Tokens in URLs appear in logs, browser history, and referrer headers. " +
-          "Use Authorization: Bearer <token> or X-Moltbot-Token header instead.",
+        "Use Authorization: Bearer <token> or X-Moltbot-Token header instead.",
      );
    }
@ -225,17 +230,32 @@ export function createGatewayHttpServer(opts: {
  } = opts;
  const httpServer: HttpServer = opts.tlsOptions
    ? createHttpsServer(opts.tlsOptions, (req, res) => {
-        void handleRequest(req, res);
+      void handleRequest(req, res);
-      })
+    })
    : createHttpServer((req, res) => {
-        void handleRequest(req, res);
+      void handleRequest(req, res);
-      });
+    });
  async function handleRequest(req: IncomingMessage, res: ServerResponse) {
    // Don't interfere with WebSocket upgrades; ws handles the 'upgrade' event.
    if (String(req.headers.upgrade ?? "").toLowerCase() === "websocket") return;
    try {
      // Observability endpoints - no auth required for health checks
      const url = new URL(req.url ?? "/", `http://${req.headers.host ?? "localhost"}`);
      if (req.method === "GET" && url.pathname === "/metrics") {
        await handleMetricsRequest(req, res);
        return;
      }
      if (req.method === "GET" && url.pathname === "/health") {
        handleHealthRequest(req, res);
        return;
      }
      if (req.method === "GET" && url.pathname === "/ready") {
        await handleReadyRequest(req, res);
        return;
      }
      const configSnapshot = loadConfig();
      const trustedProxies = configSnapshot.gateway?.trustedProxies ?? [];
      if (await handleHooksRequest(req, res)) return;
--- a/src/infra/metrics.ts
+++ b/src/infra/metrics.ts
@ -0,0 +1,158 @@
 /**
 * Prometheus metrics for Moltbot observability.
 * 
 * Usage:
 *   import { metrics } from '../infra/metrics.js';
 *   metrics.messagesTotal.inc({ channel: 'whatsapp', direction: 'inbound' });
 */
 import * as promClient from 'prom-client';
 // Create a custom registry to avoid global state pollution
 export const metricsRegistry = new promClient.Registry();
 // Add default metrics (process CPU, memory, etc.)
 promClient.collectDefaultMetrics({ register: metricsRegistry });
 // =============================================================================
 // COUNTERS - Monotonically increasing values
 // =============================================================================
 /** Total messages processed by channel and direction */
 export const messagesTotal = new promClient.Counter({
    name: 'moltbot_messages_total',
    help: 'Total number of messages processed',
    labelNames: ['channel', 'direction'] as const, // direction: inbound | outbound
    registers: [metricsRegistry],
 });
 /** Total HTTP requests by method, route, and status */
 export const httpRequestsTotal = new promClient.Counter({
    name: 'moltbot_http_requests_total',
    help: 'Total HTTP requests received',
    labelNames: ['method', 'route', 'status'] as const,
    registers: [metricsRegistry],
 });
 /** Total errors by type and source */
 export const errorsTotal = new promClient.Counter({
    name: 'moltbot_errors_total',
    help: 'Total errors encountered',
    labelNames: ['type', 'source'] as const,
    registers: [metricsRegistry],
 });
 /** Total AI provider calls by provider and model */
 export const aiCallsTotal = new promClient.Counter({
    name: 'moltbot_ai_calls_total',
    help: 'Total AI provider API calls',
    labelNames: ['provider', 'model'] as const,
    registers: [metricsRegistry],
 });
 // =============================================================================
 // HISTOGRAMS - Request duration distributions
 // =============================================================================
 /** HTTP request duration in seconds */
 export const httpRequestDuration = new promClient.Histogram({
    name: 'moltbot_http_request_duration_seconds',
    help: 'HTTP request duration in seconds',
    labelNames: ['method', 'route'] as const,
    buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
    registers: [metricsRegistry],
 });
 /** AI provider response latency in seconds */
 export const aiLatency = new promClient.Histogram({
    name: 'moltbot_ai_latency_seconds',
    help: 'AI provider response latency in seconds',
    labelNames: ['provider', 'model'] as const,
    buckets: [0.1, 0.5, 1, 2, 5, 10, 30, 60],
    registers: [metricsRegistry],
 });
 /** Message processing time in seconds */
 export const messageProcessingDuration = new promClient.Histogram({
    name: 'moltbot_message_processing_duration_seconds',
    help: 'Time to process a message end-to-end',
    labelNames: ['channel'] as const,
    buckets: [0.1, 0.5, 1, 2, 5, 10, 30],
    registers: [metricsRegistry],
 });
 // =============================================================================
 // GAUGES - Values that can go up and down
 // =============================================================================
 /** Number of active sessions in memory */
 export const activeSessions = new promClient.Gauge({
    name: 'moltbot_active_sessions',
    help: 'Number of active sessions currently in memory',
    registers: [metricsRegistry],
 });
 /** Number of connected channels */
 export const connectedChannels = new promClient.Gauge({
    name: 'moltbot_connected_channels',
    help: 'Number of channels currently connected',
    labelNames: ['channel'] as const,
    registers: [metricsRegistry],
 });
 /** WebSocket connections count */
 export const websocketConnections = new promClient.Gauge({
    name: 'moltbot_websocket_connections',
    help: 'Number of active WebSocket connections',
    registers: [metricsRegistry],
 });
 /** Gateway uptime in seconds */
 export const uptimeSeconds = new promClient.Gauge({
    name: 'moltbot_uptime_seconds',
    help: 'Gateway uptime in seconds',
    registers: [metricsRegistry],
 });
 // =============================================================================
 // Helper functions
 // =============================================================================
 const startTime = Date.now();
 /** Update the uptime gauge - call periodically */
 export function updateUptime(): void {
    uptimeSeconds.set((Date.now() - startTime) / 1000);
 }
 /** Get metrics in Prometheus text format */
 export async function getMetricsText(): Promise<string> {
    updateUptime();
    return metricsRegistry.metrics();
 }
 /** Get the content type for Prometheus metrics */
 export function getMetricsContentType(): string {
    return metricsRegistry.contentType;
 }
 // Convenience export for common usage patterns
 export const metrics = {
    messagesTotal,
    httpRequestsTotal,
    errorsTotal,
    aiCallsTotal,
    httpRequestDuration,
    aiLatency,
    messageProcessingDuration,
    activeSessions,
    connectedChannels,
    websocketConnections,
    uptimeSeconds,
    getMetricsText,
    getMetricsContentType,
    updateUptime,
    registry: metricsRegistry,
 };
 export default metrics;
--- a/src/infra/otel.ts
+++ b/src/infra/otel.ts
@ -0,0 +1,133 @@
 /**
 * OpenTelemetry SDK initialization for Moltbot.
 * 
 * This module sets up distributed tracing with automatic HTTP instrumentation.
 * Import this at the very start of your application entry point.
 * 
 * Usage:
 *   import './infra/otel.js'; // Must be first import
 *   import { trace } from '@opentelemetry/api';
 */
 import { NodeSDK } from '@opentelemetry/sdk-node';
 import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';
 import { trace, context, SpanStatusCode, type Span } from '@opentelemetry/api';
 // SDK configuration - only enable if OTEL_ENABLED is set
 const isOtelEnabled = process.env.OTEL_ENABLED === 'true' || process.env.OTEL_ENABLED === '1';
 let sdk: NodeSDK | null = null;
 if (isOtelEnabled) {
    sdk = new NodeSDK({
        serviceName: 'moltbot',
        instrumentations: [
            getNodeAutoInstrumentations({
                // Disable some noisy instrumentations
                '@opentelemetry/instrumentation-fs': { enabled: false },
                '@opentelemetry/instrumentation-dns': { enabled: false },
            }),
        ],
    });
    // Start the SDK
    sdk.start();
    // Graceful shutdown
    process.on('SIGTERM', () => {
        sdk?.shutdown()
            .then(() => console.log('[otel] Tracing terminated'))
            .catch((error) => console.error('[otel] Error terminating tracing', error))
            .finally(() => process.exit(0));
    });
    console.log('[otel] OpenTelemetry SDK initialized');
 } else {
    console.log('[otel] OpenTelemetry disabled (set OTEL_ENABLED=true to enable)');
 }
 // =============================================================================
 // Helper functions for manual instrumentation
 // =============================================================================
 /**
 * Get the current tracer for manual span creation.
 */
 export function getTracer(name: string = 'moltbot') {
    return trace.getTracer(name);
 }
 /**
 * Get the current span context for correlation.
 */
 export function getCurrentSpan() {
    return trace.getActiveSpan();
 }
 /**
 * Get the current trace ID for log correlation.
 */
 export function getCurrentTraceId(): string | undefined {
    const span = getCurrentSpan();
    if (!span) return undefined;
    return span.spanContext().traceId;
 }
 /**
 * Get the current span ID for log correlation.
 */
 export function getCurrentSpanId(): string | undefined {
    const span = getCurrentSpan();
    if (!span) return undefined;
    return span.spanContext().spanId;
 }
 /**
 * Create a custom span for a specific operation.
 * 
 * @example
 * await withSpan('ai.completion', { provider: 'openai' }, async (span) => {
 *   const result = await callOpenAI();
 *   span.setAttribute('tokens', result.usage.total_tokens);
 *   return result;
 * });
 */
 export async function withSpan<T>(
    name: string,
    attributes: Record<string, string | number | boolean> = {},
    fn: (span: ReturnType<typeof trace.getTracer>['startSpan'] extends (name: string) => infer R ? R : never) => Promise<T>,
 ): Promise<T> {
    const tracer = getTracer();
    const span = tracer.startSpan(name);
    for (const [key, value] of Object.entries(attributes)) {
        span.setAttribute(key, value);
    }
    try {
        const result = await context.with(trace.setSpan(context.active(), span), () => fn(span));
        span.setStatus({ code: SpanStatusCode.OK });
        return result;
    } catch (error) {
        span.setStatus({
            code: SpanStatusCode.ERROR,
            message: error instanceof Error ? error.message : 'Unknown error',
        });
        span.recordException(error instanceof Error ? error : new Error(String(error)));
        throw error;
    } finally {
        span.end();
    }
 }
 /**
 * Add trace context to a log record.
 */
 export function getTraceContext(): { traceId?: string; spanId?: string } {
    return {
        traceId: getCurrentTraceId(),
        spanId: getCurrentSpanId(),
    };
 }
 export { sdk as otelSdk };