feat(observability): add Prometheus metrics and OpenTelemetry tracing
- Add Prometheus metrics endpoints (GET /metrics) - Add health checks (GET /health, GET /ready) - Add OpenTelemetry SDK with automatic HTTP instrumentation - Expose update metrics for messages, AI calls, and errors
This commit is contained in:
parent
f712388584
commit
bb97993755
19
package.json
19
package.json
@ -167,6 +167,12 @@
|
||||
"@mariozechner/pi-coding-agent": "0.49.3",
|
||||
"@mariozechner/pi-tui": "0.49.3",
|
||||
"@mozilla/readability": "^0.6.0",
|
||||
"@opentelemetry/api": "^1.9.0",
|
||||
"@opentelemetry/auto-instrumentations-node": "^0.69.0",
|
||||
"@opentelemetry/exporter-prometheus": "^0.211.0",
|
||||
"@opentelemetry/resources": "^2.5.0",
|
||||
"@opentelemetry/sdk-node": "^0.211.0",
|
||||
"@opentelemetry/semantic-conventions": "^1.39.0",
|
||||
"@sinclair/typebox": "0.34.47",
|
||||
"@slack/bolt": "^4.6.0",
|
||||
"@slack/web-api": "^7.13.0",
|
||||
@ -185,7 +191,7 @@
|
||||
"express": "^5.2.1",
|
||||
"file-type": "^21.3.0",
|
||||
"grammy": "^1.39.3",
|
||||
"hono": "4.11.4",
|
||||
"hono": "4.11.7",
|
||||
"jiti": "^2.6.1",
|
||||
"json5": "^2.2.3",
|
||||
"jszip": "^3.10.1",
|
||||
@ -196,11 +202,12 @@
|
||||
"osc-progress": "^0.3.0",
|
||||
"pdfjs-dist": "^5.4.530",
|
||||
"playwright-core": "1.58.0",
|
||||
"prom-client": "^15.1.3",
|
||||
"proper-lockfile": "^4.1.2",
|
||||
"qrcode-terminal": "^0.12.0",
|
||||
"sharp": "^0.34.5",
|
||||
"sqlite-vec": "0.1.7-alpha.2",
|
||||
"tar": "7.5.4",
|
||||
"tar": "7.5.7",
|
||||
"tslog": "^4.10.2",
|
||||
"undici": "^7.19.0",
|
||||
"ws": "^8.19.0",
|
||||
@ -241,14 +248,14 @@
|
||||
"wireit": "^0.14.12"
|
||||
},
|
||||
"overrides": {
|
||||
"tar": "7.5.4"
|
||||
"tar": "7.5.7"
|
||||
},
|
||||
"pnpm": {
|
||||
"minimumReleaseAge": 2880,
|
||||
"overrides": {
|
||||
"@sinclair/typebox": "0.34.47",
|
||||
"hono": "4.11.4",
|
||||
"tar": "7.5.4"
|
||||
"hono": "4.11.7",
|
||||
"tar": "7.5.7"
|
||||
}
|
||||
},
|
||||
"vitest": {
|
||||
@ -283,4 +290,4 @@
|
||||
"dist/Moltbot.app/**"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
1075
pnpm-lock.yaml
generated
1075
pnpm-lock.yaml
generated
File diff suppressed because it is too large
Load Diff
97
src/gateway/metrics-http.ts
Normal file
97
src/gateway/metrics-http.ts
Normal file
@ -0,0 +1,97 @@
|
||||
/**
|
||||
* HTTP handler for Prometheus metrics endpoint.
|
||||
*
|
||||
* Exposes metrics at GET /metrics in Prometheus text format.
|
||||
*/
|
||||
|
||||
import type { IncomingMessage, ServerResponse } from 'node:http';
|
||||
import { getMetricsText, getMetricsContentType } from '../infra/metrics.js';
|
||||
|
||||
/**
|
||||
* Handle GET /metrics requests for Prometheus scraping.
|
||||
*/
|
||||
export async function handleMetricsRequest(
|
||||
_req: IncomingMessage,
|
||||
res: ServerResponse,
|
||||
): Promise<void> {
|
||||
try {
|
||||
const metricsText = await getMetricsText();
|
||||
res.writeHead(200, {
|
||||
'Content-Type': getMetricsContentType(),
|
||||
'Cache-Control': 'no-cache, no-store, must-revalidate',
|
||||
});
|
||||
res.end(metricsText);
|
||||
} catch (error) {
|
||||
res.writeHead(500, { 'Content-Type': 'text/plain' });
|
||||
res.end('Error collecting metrics');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle GET /health liveness probe.
|
||||
* Always returns 200 if the server is responding.
|
||||
*/
|
||||
export function handleHealthRequest(
|
||||
_req: IncomingMessage,
|
||||
res: ServerResponse,
|
||||
): void {
|
||||
res.writeHead(200, { 'Content-Type': 'application/json' });
|
||||
res.end(JSON.stringify({
|
||||
status: 'ok',
|
||||
timestamp: new Date().toISOString(),
|
||||
}));
|
||||
}
|
||||
|
||||
/**
|
||||
* Readiness check function type.
|
||||
* Returns true if the service is ready to accept traffic.
|
||||
*/
|
||||
export type ReadinessCheck = () => Promise<boolean> | boolean;
|
||||
|
||||
/**
|
||||
* Create a readiness handler with custom checks.
|
||||
*
|
||||
* @param checks - Array of check functions that must all pass
|
||||
*/
|
||||
export function createReadinessHandler(
|
||||
checks: ReadinessCheck[] = [],
|
||||
): (req: IncomingMessage, res: ServerResponse) => Promise<void> {
|
||||
return async (_req: IncomingMessage, res: ServerResponse): Promise<void> => {
|
||||
try {
|
||||
const results = await Promise.all(checks.map((check) => check()));
|
||||
const isReady = results.every(Boolean);
|
||||
|
||||
if (isReady) {
|
||||
res.writeHead(200, { 'Content-Type': 'application/json' });
|
||||
res.end(JSON.stringify({
|
||||
status: 'ready',
|
||||
timestamp: new Date().toISOString(),
|
||||
}));
|
||||
} else {
|
||||
res.writeHead(503, { 'Content-Type': 'application/json' });
|
||||
res.end(JSON.stringify({
|
||||
status: 'not_ready',
|
||||
timestamp: new Date().toISOString(),
|
||||
}));
|
||||
}
|
||||
} catch (error) {
|
||||
res.writeHead(503, { 'Content-Type': 'application/json' });
|
||||
res.end(JSON.stringify({
|
||||
status: 'error',
|
||||
timestamp: new Date().toISOString(),
|
||||
}));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple readiness handler that always returns ready.
|
||||
* Use createReadinessHandler for custom checks.
|
||||
*/
|
||||
export async function handleReadyRequest(
|
||||
req: IncomingMessage,
|
||||
res: ServerResponse,
|
||||
): Promise<void> {
|
||||
const handler = createReadinessHandler([]);
|
||||
return handler(req, res);
|
||||
}
|
||||
@ -30,6 +30,11 @@ import { applyHookMappings } from "./hooks-mapping.js";
|
||||
import { handleOpenAiHttpRequest } from "./openai-http.js";
|
||||
import { handleOpenResponsesHttpRequest } from "./openresponses-http.js";
|
||||
import { handleToolsInvokeHttpRequest } from "./tools-invoke-http.js";
|
||||
import {
|
||||
handleMetricsRequest,
|
||||
handleHealthRequest,
|
||||
handleReadyRequest,
|
||||
} from "./metrics-http.js";
|
||||
|
||||
type SubsystemLogger = ReturnType<typeof createSubsystemLogger>;
|
||||
|
||||
@ -86,8 +91,8 @@ export function createHooksRequestHandler(
|
||||
if (fromQuery) {
|
||||
logHooks.warn(
|
||||
"Hook token provided via query parameter is deprecated for security reasons. " +
|
||||
"Tokens in URLs appear in logs, browser history, and referrer headers. " +
|
||||
"Use Authorization: Bearer <token> or X-Moltbot-Token header instead.",
|
||||
"Tokens in URLs appear in logs, browser history, and referrer headers. " +
|
||||
"Use Authorization: Bearer <token> or X-Moltbot-Token header instead.",
|
||||
);
|
||||
}
|
||||
|
||||
@ -225,17 +230,32 @@ export function createGatewayHttpServer(opts: {
|
||||
} = opts;
|
||||
const httpServer: HttpServer = opts.tlsOptions
|
||||
? createHttpsServer(opts.tlsOptions, (req, res) => {
|
||||
void handleRequest(req, res);
|
||||
})
|
||||
void handleRequest(req, res);
|
||||
})
|
||||
: createHttpServer((req, res) => {
|
||||
void handleRequest(req, res);
|
||||
});
|
||||
void handleRequest(req, res);
|
||||
});
|
||||
|
||||
async function handleRequest(req: IncomingMessage, res: ServerResponse) {
|
||||
// Don't interfere with WebSocket upgrades; ws handles the 'upgrade' event.
|
||||
if (String(req.headers.upgrade ?? "").toLowerCase() === "websocket") return;
|
||||
|
||||
try {
|
||||
// Observability endpoints - no auth required for health checks
|
||||
const url = new URL(req.url ?? "/", `http://${req.headers.host ?? "localhost"}`);
|
||||
if (req.method === "GET" && url.pathname === "/metrics") {
|
||||
await handleMetricsRequest(req, res);
|
||||
return;
|
||||
}
|
||||
if (req.method === "GET" && url.pathname === "/health") {
|
||||
handleHealthRequest(req, res);
|
||||
return;
|
||||
}
|
||||
if (req.method === "GET" && url.pathname === "/ready") {
|
||||
await handleReadyRequest(req, res);
|
||||
return;
|
||||
}
|
||||
|
||||
const configSnapshot = loadConfig();
|
||||
const trustedProxies = configSnapshot.gateway?.trustedProxies ?? [];
|
||||
if (await handleHooksRequest(req, res)) return;
|
||||
|
||||
158
src/infra/metrics.ts
Normal file
158
src/infra/metrics.ts
Normal file
@ -0,0 +1,158 @@
|
||||
/**
|
||||
* Prometheus metrics for Moltbot observability.
|
||||
*
|
||||
* Usage:
|
||||
* import { metrics } from '../infra/metrics.js';
|
||||
* metrics.messagesTotal.inc({ channel: 'whatsapp', direction: 'inbound' });
|
||||
*/
|
||||
|
||||
import * as promClient from 'prom-client';
|
||||
|
||||
// Create a custom registry to avoid global state pollution
|
||||
export const metricsRegistry = new promClient.Registry();
|
||||
|
||||
// Add default metrics (process CPU, memory, etc.)
|
||||
promClient.collectDefaultMetrics({ register: metricsRegistry });
|
||||
|
||||
// =============================================================================
|
||||
// COUNTERS - Monotonically increasing values
|
||||
// =============================================================================
|
||||
|
||||
/** Total messages processed by channel and direction */
|
||||
export const messagesTotal = new promClient.Counter({
|
||||
name: 'moltbot_messages_total',
|
||||
help: 'Total number of messages processed',
|
||||
labelNames: ['channel', 'direction'] as const, // direction: inbound | outbound
|
||||
registers: [metricsRegistry],
|
||||
});
|
||||
|
||||
/** Total HTTP requests by method, route, and status */
|
||||
export const httpRequestsTotal = new promClient.Counter({
|
||||
name: 'moltbot_http_requests_total',
|
||||
help: 'Total HTTP requests received',
|
||||
labelNames: ['method', 'route', 'status'] as const,
|
||||
registers: [metricsRegistry],
|
||||
});
|
||||
|
||||
/** Total errors by type and source */
|
||||
export const errorsTotal = new promClient.Counter({
|
||||
name: 'moltbot_errors_total',
|
||||
help: 'Total errors encountered',
|
||||
labelNames: ['type', 'source'] as const,
|
||||
registers: [metricsRegistry],
|
||||
});
|
||||
|
||||
/** Total AI provider calls by provider and model */
|
||||
export const aiCallsTotal = new promClient.Counter({
|
||||
name: 'moltbot_ai_calls_total',
|
||||
help: 'Total AI provider API calls',
|
||||
labelNames: ['provider', 'model'] as const,
|
||||
registers: [metricsRegistry],
|
||||
});
|
||||
|
||||
// =============================================================================
|
||||
// HISTOGRAMS - Request duration distributions
|
||||
// =============================================================================
|
||||
|
||||
/** HTTP request duration in seconds */
|
||||
export const httpRequestDuration = new promClient.Histogram({
|
||||
name: 'moltbot_http_request_duration_seconds',
|
||||
help: 'HTTP request duration in seconds',
|
||||
labelNames: ['method', 'route'] as const,
|
||||
buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
|
||||
registers: [metricsRegistry],
|
||||
});
|
||||
|
||||
/** AI provider response latency in seconds */
|
||||
export const aiLatency = new promClient.Histogram({
|
||||
name: 'moltbot_ai_latency_seconds',
|
||||
help: 'AI provider response latency in seconds',
|
||||
labelNames: ['provider', 'model'] as const,
|
||||
buckets: [0.1, 0.5, 1, 2, 5, 10, 30, 60],
|
||||
registers: [metricsRegistry],
|
||||
});
|
||||
|
||||
/** Message processing time in seconds */
|
||||
export const messageProcessingDuration = new promClient.Histogram({
|
||||
name: 'moltbot_message_processing_duration_seconds',
|
||||
help: 'Time to process a message end-to-end',
|
||||
labelNames: ['channel'] as const,
|
||||
buckets: [0.1, 0.5, 1, 2, 5, 10, 30],
|
||||
registers: [metricsRegistry],
|
||||
});
|
||||
|
||||
// =============================================================================
|
||||
// GAUGES - Values that can go up and down
|
||||
// =============================================================================
|
||||
|
||||
/** Number of active sessions in memory */
|
||||
export const activeSessions = new promClient.Gauge({
|
||||
name: 'moltbot_active_sessions',
|
||||
help: 'Number of active sessions currently in memory',
|
||||
registers: [metricsRegistry],
|
||||
});
|
||||
|
||||
/** Number of connected channels */
|
||||
export const connectedChannels = new promClient.Gauge({
|
||||
name: 'moltbot_connected_channels',
|
||||
help: 'Number of channels currently connected',
|
||||
labelNames: ['channel'] as const,
|
||||
registers: [metricsRegistry],
|
||||
});
|
||||
|
||||
/** WebSocket connections count */
|
||||
export const websocketConnections = new promClient.Gauge({
|
||||
name: 'moltbot_websocket_connections',
|
||||
help: 'Number of active WebSocket connections',
|
||||
registers: [metricsRegistry],
|
||||
});
|
||||
|
||||
/** Gateway uptime in seconds */
|
||||
export const uptimeSeconds = new promClient.Gauge({
|
||||
name: 'moltbot_uptime_seconds',
|
||||
help: 'Gateway uptime in seconds',
|
||||
registers: [metricsRegistry],
|
||||
});
|
||||
|
||||
// =============================================================================
|
||||
// Helper functions
|
||||
// =============================================================================
|
||||
|
||||
const startTime = Date.now();
|
||||
|
||||
/** Update the uptime gauge - call periodically */
|
||||
export function updateUptime(): void {
|
||||
uptimeSeconds.set((Date.now() - startTime) / 1000);
|
||||
}
|
||||
|
||||
/** Get metrics in Prometheus text format */
|
||||
export async function getMetricsText(): Promise<string> {
|
||||
updateUptime();
|
||||
return metricsRegistry.metrics();
|
||||
}
|
||||
|
||||
/** Get the content type for Prometheus metrics */
|
||||
export function getMetricsContentType(): string {
|
||||
return metricsRegistry.contentType;
|
||||
}
|
||||
|
||||
// Convenience export for common usage patterns
|
||||
export const metrics = {
|
||||
messagesTotal,
|
||||
httpRequestsTotal,
|
||||
errorsTotal,
|
||||
aiCallsTotal,
|
||||
httpRequestDuration,
|
||||
aiLatency,
|
||||
messageProcessingDuration,
|
||||
activeSessions,
|
||||
connectedChannels,
|
||||
websocketConnections,
|
||||
uptimeSeconds,
|
||||
getMetricsText,
|
||||
getMetricsContentType,
|
||||
updateUptime,
|
||||
registry: metricsRegistry,
|
||||
};
|
||||
|
||||
export default metrics;
|
||||
133
src/infra/otel.ts
Normal file
133
src/infra/otel.ts
Normal file
@ -0,0 +1,133 @@
|
||||
/**
|
||||
* OpenTelemetry SDK initialization for Moltbot.
|
||||
*
|
||||
* This module sets up distributed tracing with automatic HTTP instrumentation.
|
||||
* Import this at the very start of your application entry point.
|
||||
*
|
||||
* Usage:
|
||||
* import './infra/otel.js'; // Must be first import
|
||||
* import { trace } from '@opentelemetry/api';
|
||||
*/
|
||||
|
||||
import { NodeSDK } from '@opentelemetry/sdk-node';
|
||||
import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';
|
||||
import { trace, context, SpanStatusCode, type Span } from '@opentelemetry/api';
|
||||
|
||||
// SDK configuration - only enable if OTEL_ENABLED is set
|
||||
const isOtelEnabled = process.env.OTEL_ENABLED === 'true' || process.env.OTEL_ENABLED === '1';
|
||||
|
||||
let sdk: NodeSDK | null = null;
|
||||
|
||||
if (isOtelEnabled) {
|
||||
sdk = new NodeSDK({
|
||||
serviceName: 'moltbot',
|
||||
instrumentations: [
|
||||
getNodeAutoInstrumentations({
|
||||
// Disable some noisy instrumentations
|
||||
'@opentelemetry/instrumentation-fs': { enabled: false },
|
||||
'@opentelemetry/instrumentation-dns': { enabled: false },
|
||||
}),
|
||||
],
|
||||
});
|
||||
|
||||
// Start the SDK
|
||||
sdk.start();
|
||||
|
||||
// Graceful shutdown
|
||||
process.on('SIGTERM', () => {
|
||||
sdk?.shutdown()
|
||||
.then(() => console.log('[otel] Tracing terminated'))
|
||||
.catch((error) => console.error('[otel] Error terminating tracing', error))
|
||||
.finally(() => process.exit(0));
|
||||
});
|
||||
|
||||
console.log('[otel] OpenTelemetry SDK initialized');
|
||||
} else {
|
||||
console.log('[otel] OpenTelemetry disabled (set OTEL_ENABLED=true to enable)');
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Helper functions for manual instrumentation
|
||||
// =============================================================================
|
||||
|
||||
/**
|
||||
* Get the current tracer for manual span creation.
|
||||
*/
|
||||
export function getTracer(name: string = 'moltbot') {
|
||||
return trace.getTracer(name);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the current span context for correlation.
|
||||
*/
|
||||
export function getCurrentSpan() {
|
||||
return trace.getActiveSpan();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the current trace ID for log correlation.
|
||||
*/
|
||||
export function getCurrentTraceId(): string | undefined {
|
||||
const span = getCurrentSpan();
|
||||
if (!span) return undefined;
|
||||
return span.spanContext().traceId;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the current span ID for log correlation.
|
||||
*/
|
||||
export function getCurrentSpanId(): string | undefined {
|
||||
const span = getCurrentSpan();
|
||||
if (!span) return undefined;
|
||||
return span.spanContext().spanId;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a custom span for a specific operation.
|
||||
*
|
||||
* @example
|
||||
* await withSpan('ai.completion', { provider: 'openai' }, async (span) => {
|
||||
* const result = await callOpenAI();
|
||||
* span.setAttribute('tokens', result.usage.total_tokens);
|
||||
* return result;
|
||||
* });
|
||||
*/
|
||||
export async function withSpan<T>(
|
||||
name: string,
|
||||
attributes: Record<string, string | number | boolean> = {},
|
||||
fn: (span: ReturnType<typeof trace.getTracer>['startSpan'] extends (name: string) => infer R ? R : never) => Promise<T>,
|
||||
): Promise<T> {
|
||||
const tracer = getTracer();
|
||||
const span = tracer.startSpan(name);
|
||||
|
||||
for (const [key, value] of Object.entries(attributes)) {
|
||||
span.setAttribute(key, value);
|
||||
}
|
||||
|
||||
try {
|
||||
const result = await context.with(trace.setSpan(context.active(), span), () => fn(span));
|
||||
span.setStatus({ code: SpanStatusCode.OK });
|
||||
return result;
|
||||
} catch (error) {
|
||||
span.setStatus({
|
||||
code: SpanStatusCode.ERROR,
|
||||
message: error instanceof Error ? error.message : 'Unknown error',
|
||||
});
|
||||
span.recordException(error instanceof Error ? error : new Error(String(error)));
|
||||
throw error;
|
||||
} finally {
|
||||
span.end();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add trace context to a log record.
|
||||
*/
|
||||
export function getTraceContext(): { traceId?: string; spanId?: string } {
|
||||
return {
|
||||
traceId: getCurrentTraceId(),
|
||||
spanId: getCurrentSpanId(),
|
||||
};
|
||||
}
|
||||
|
||||
export { sdk as otelSdk };
|
||||
Loading…
Reference in New Issue
Block a user