diff --git a/packages/metamcp-observability.patch b/packages/metamcp-observability.patch new file mode 100644 index 0000000..f846424 --- /dev/null +++ b/packages/metamcp-observability.patch @@ -0,0 +1,441 @@ +diff -urN metamcp-2.4.22.orig/apps/backend/src/index.ts metamcp-2.4.22/apps/backend/src/index.ts +--- a/apps/backend/src/index.ts 2026-05-23 23:45:36.313040968 +0300 ++++ b/apps/backend/src/index.ts 2026-05-23 23:50:43.286804066 +0300 +@@ -1,6 +1,14 @@ + import express from "express"; + + import { auth } from "./auth"; ++import { ++ mcpCancellationTotal, ++ mcpHopDurationSeconds, ++ mcpTraceparentSynthesizedTotal, ++ mcpUncaughtThrowTotal, ++ renderMetricsExposition, ++} from "./lib/observability/metrics"; ++import { bindFromHeader, HOP_NAME, traceLog, traceStore } from "./lib/observability/trace"; + import { initializeIdleServers } from "./lib/startup"; + import mcpProxyRouter from "./routers/mcp-proxy"; + import oauthRouter from "./routers/oauth"; +@@ -9,6 +17,42 @@ + + const app = express(); + ++// cluster#49 — bind W3C trace context for every request and stamp every ++// downstream log via traceStore.run(). cluster#50 — observe wall time ++// per hop and count cancellations. ++app.use((req, res, next) => { ++ const raw = req.headers.traceparent as string | undefined; ++ const { ctx, event } = bindFromHeader(raw); ++ if (event === "traceparent_synthesized") { ++ mcpTraceparentSynthesizedTotal.inc({ hop: HOP_NAME }); ++ } ++ const t0 = process.hrtime.bigint(); ++ let status: "success" | "error" | "cancelled" = "success"; ++ res.on("close", () => { ++ // res.writableEnded is true when we finished writing; false here ++ // means the downstream client hung up mid-response. ++ if (!res.writableEnded) { ++ status = "cancelled"; ++ mcpCancellationTotal.inc({ hop: HOP_NAME, source: "downstream" }); ++ } ++ const dt = Number(process.hrtime.bigint() - t0) / 1e9; ++ mcpHopDurationSeconds.observe( ++ { hop: HOP_NAME, tool_name: "", status }, ++ dt, ++ ); ++ }); ++ traceStore.run(ctx, () => { ++ if (event) traceLog("info", `event=${event}`); ++ next(); ++ }); ++}); ++ ++// cluster#50 — Prometheus exposition. Schema in Specs/mcp-metric-schema. ++app.get("/metrics", (_req, res) => { ++ res.set("Content-Type", "text/plain; version=0.0.4"); ++ res.send(renderMetricsExposition()); ++}); ++ + // Global JSON middleware for non-proxy routes + app.use((req, res, next) => { + if (req.path.startsWith("/mcp-proxy/") || req.path.startsWith("/metamcp/")) { +@@ -108,3 +152,39 @@ + status: "ok", + }); + }); ++ ++// cluster#50 — Express error tail. Any unhandled error in middleware / ++// handlers funnels here; we count it so the uncaught-throw alert fires. ++app.use( ++ ( ++ err: unknown, ++ _req: express.Request, ++ res: express.Response, ++ next: express.NextFunction, ++ ) => { ++ mcpUncaughtThrowTotal.inc({ hop: HOP_NAME, component: "express" }); ++ traceLog("error", "event=uncaught_throw", { ++ error: err instanceof Error ? err.message : String(err), ++ }); ++ if (res.headersSent) return next(err); ++ res.status(500).json({ error: "internal error" }); ++ }, ++); ++ ++// cluster#50 — last-resort process traps so unhandled rejections / ++// uncaught exceptions still feed the alert. Counting only; existing ++// behaviour (log + continue) is preserved. ++process.on("unhandledRejection", (err) => { ++ mcpUncaughtThrowTotal.inc({ hop: HOP_NAME, component: "unhandledRejection" }); ++ traceLog("error", "event=uncaught_throw", { ++ component: "unhandledRejection", ++ error: err instanceof Error ? err.message : String(err), ++ }); ++}); ++process.on("uncaughtException", (err) => { ++ mcpUncaughtThrowTotal.inc({ hop: HOP_NAME, component: "uncaughtException" }); ++ traceLog("error", "event=uncaught_throw", { ++ component: "uncaughtException", ++ error: err.message, ++ }); ++}); +diff -urN metamcp-2.4.22.orig/apps/backend/src/lib/observability/metrics.ts metamcp-2.4.22/apps/backend/src/lib/observability/metrics.ts +--- a/apps/backend/src/lib/observability/metrics.ts 1970-01-01 03:00:00.000000000 +0300 ++++ b/apps/backend/src/lib/observability/metrics.ts 2026-05-23 23:49:42.080145070 +0300 +@@ -0,0 +1,160 @@ ++// Hand-rolled Prometheus metrics for the metamcp aggregator. ++// ++// We avoid adding `prom-client` to package.json so the pnpm lockfile and ++// the Nix `pnpmDeps` hash stay untouched (cluster#50). This module ++// emits the canonical schema from Specs/mcp-metric-schema: ++// mcp_hop_duration_seconds (histogram) ++// mcp_cancellation_total (counter) ++// mcp_uncaught_throw_total (counter) ++// mcp_traceparent_synthesized_total (counter) ++// ++// Exposition is text format v0.0.4. The /metrics endpoint is registered ++// in index.ts. ++ ++type LabelSet = Record; ++ ++function escapeLabel(v: string): string { ++ return v.replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/\n/g, "\\n"); ++} ++ ++function renderLabels(labels: LabelSet): string { ++ const parts: string[] = []; ++ for (const k of Object.keys(labels).sort()) { ++ parts.push(`${k}="${escapeLabel(labels[k])}"`); ++ } ++ return parts.length ? `{${parts.join(",")}}` : ""; ++} ++ ++function labelKey(labels: LabelSet): string { ++ // Stable key for the labels map. Sort keys so the order is deterministic. ++ const ks = Object.keys(labels).sort(); ++ return ks.map((k) => `${k}=${labels[k]}`).join("\x1f"); ++} ++ ++export class Counter { ++ private values = new Map(); ++ ++ constructor( ++ public readonly name: string, ++ public readonly help: string, ++ public readonly labelNames: readonly string[], ++ ) {} ++ ++ inc(labels: LabelSet, by: number = 1): void { ++ const key = labelKey(labels); ++ const cur = this.values.get(key); ++ if (cur) { ++ cur.value += by; ++ } else { ++ this.values.set(key, { labels: { ...labels }, value: by }); ++ } ++ } ++ ++ render(): string { ++ const lines: string[] = []; ++ lines.push(`# HELP ${this.name} ${this.help}`); ++ lines.push(`# TYPE ${this.name} counter`); ++ for (const { labels, value } of this.values.values()) { ++ lines.push(`${this.name}${renderLabels(labels)} ${value}`); ++ } ++ return lines.join("\n"); ++ } ++} ++ ++export class Histogram { ++ private series = new Map< ++ string, ++ { ++ labels: LabelSet; ++ bucketCounts: number[]; // counts per bucket boundary (cumulative on render) ++ sum: number; ++ count: number; ++ } ++ >(); ++ ++ constructor( ++ public readonly name: string, ++ public readonly help: string, ++ public readonly labelNames: readonly string[], ++ public readonly buckets: readonly number[], ++ ) {} ++ ++ observe(labels: LabelSet, value: number): void { ++ const key = labelKey(labels); ++ let s = this.series.get(key); ++ if (!s) { ++ s = { ++ labels: { ...labels }, ++ bucketCounts: new Array(this.buckets.length).fill(0), ++ sum: 0, ++ count: 0, ++ }; ++ this.series.set(key, s); ++ } ++ for (let i = 0; i < this.buckets.length; i++) { ++ if (value <= this.buckets[i]) s.bucketCounts[i] += 1; ++ } ++ s.sum += value; ++ s.count += 1; ++ } ++ ++ render(): string { ++ const lines: string[] = []; ++ lines.push(`# HELP ${this.name} ${this.help}`); ++ lines.push(`# TYPE ${this.name} histogram`); ++ for (const s of this.series.values()) { ++ for (let i = 0; i < this.buckets.length; i++) { ++ const le = String(this.buckets[i]); ++ const lbls = renderLabels({ ...s.labels, le }); ++ lines.push(`${this.name}_bucket${lbls} ${s.bucketCounts[i]}`); ++ } ++ const lblsInf = renderLabels({ ...s.labels, le: "+Inf" }); ++ lines.push(`${this.name}_bucket${lblsInf} ${s.count}`); ++ lines.push(`${this.name}_sum${renderLabels(s.labels)} ${s.sum}`); ++ lines.push(`${this.name}_count${renderLabels(s.labels)} ${s.count}`); ++ } ++ return lines.join("\n"); ++ } ++} ++ ++// --- Schema instances -------------------------------------------------------- ++ ++const DURATION_BUCKETS = [ ++ 0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60, 120, 300, ++]; ++ ++export const mcpHopDurationSeconds = new Histogram( ++ "mcp_hop_duration_seconds", ++ "Wall time spent in this hop processing one MCP request", ++ ["hop", "tool_name", "status"], ++ DURATION_BUCKETS, ++); ++ ++export const mcpCancellationTotal = new Counter( ++ "mcp_cancellation_total", ++ "Cancellations observed per hop", ++ ["hop", "source"], ++); ++ ++export const mcpUncaughtThrowTotal = new Counter( ++ "mcp_uncaught_throw_total", ++ "Uncaught exceptions per component (today's smoking gun, see cluster#44)", ++ ["hop", "component"], ++); ++ ++export const mcpTraceparentSynthesizedTotal = new Counter( ++ "mcp_traceparent_synthesized_total", ++ "Inbound MCP requests missing a traceparent header", ++ ["hop"], ++); ++ ++export function renderMetricsExposition(): string { ++ return ( ++ [ ++ mcpHopDurationSeconds.render(), ++ mcpCancellationTotal.render(), ++ mcpUncaughtThrowTotal.render(), ++ mcpTraceparentSynthesizedTotal.render(), ++ ].join("\n") + "\n" ++ ); ++} +diff -urN metamcp-2.4.22.orig/apps/backend/src/lib/observability/trace.ts metamcp-2.4.22/apps/backend/src/lib/observability/trace.ts +--- a/apps/backend/src/lib/observability/trace.ts 1970-01-01 03:00:00.000000000 +0300 ++++ b/apps/backend/src/lib/observability/trace.ts 2026-05-23 23:49:11.700817887 +0300 +@@ -0,0 +1,127 @@ ++// W3C Trace Context propagation for the metamcp aggregator. ++// ++// Implements the contract from cluster#45 (Specs/mcp-request-id): ++// traceparent header: 00--- ++// Bound per-request via AsyncLocalStorage. Inbound HTTP header is read by ++// the express middleware in index.ts; outbound JSON-RPC to stdio children ++// reads currentTraceparentForUpstream() and injects it under ++// params._meta.traceparent (process-managed-transport.send). ++ ++import { AsyncLocalStorage } from "node:async_hooks"; ++import { randomBytes } from "node:crypto"; ++ ++export const HOP_NAME = "metamcp"; ++ ++export interface TraceCtx { ++ trace_id: string; ++ span_id: string; ++ parent_span_id: string; ++ hop: string; ++ tool_name: string; ++} ++ ++const TRACEPARENT_RE = /^00-([0-9a-f]{32})-([0-9a-f]{16})-[0-9a-f]{2}$/; ++ ++export const traceStore = new AsyncLocalStorage(); ++ ++export function newTraceId(): string { ++ return randomBytes(16).toString("hex"); ++} ++ ++export function newSpanId(): string { ++ return randomBytes(8).toString("hex"); ++} ++ ++export type ParsedTraceparent = { ++ trace_id: string; ++ parent_span_id: string; ++ malformed: boolean; ++}; ++ ++export function parseTraceparent(raw: string | undefined): ParsedTraceparent { ++ if (typeof raw !== "string") { ++ return { trace_id: "", parent_span_id: "", malformed: false }; ++ } ++ const m = TRACEPARENT_RE.exec(raw.trim()); ++ if (!m) { ++ return { trace_id: newTraceId(), parent_span_id: "", malformed: true }; ++ } ++ return { trace_id: m[1], parent_span_id: m[2], malformed: false }; ++} ++ ++export type BindResult = { ++ ctx: TraceCtx; ++ // One of: "" (normal), "traceparent_synthesized", "traceparent_malformed". ++ // Caller emits the reserved event log so the dashboard can count it. ++ event: "" | "traceparent_synthesized" | "traceparent_malformed"; ++}; ++ ++export function bindFromHeader( ++ raw: string | undefined, ++ tool_name: string = "", ++): BindResult { ++ let event: BindResult["event"] = ""; ++ let trace_id = ""; ++ let parent_span_id = ""; ++ if (raw == null) { ++ trace_id = newTraceId(); ++ event = "traceparent_synthesized"; ++ } else { ++ const p = parseTraceparent(raw); ++ trace_id = p.trace_id; ++ parent_span_id = p.parent_span_id; ++ if (p.malformed) event = "traceparent_malformed"; ++ } ++ const ctx: TraceCtx = { ++ trace_id, ++ span_id: newSpanId(), ++ parent_span_id, ++ hop: HOP_NAME, ++ tool_name, ++ }; ++ return { ctx, event }; ++} ++ ++// Build a traceparent literal for outbound calls. Our span-id becomes the ++// downstream hop's parent_span_id (standard W3C propagation). Falls back ++// to synthesizing a fresh context when called outside any bound request ++// (e.g. session-pool keepalive paths that originate inside the server). ++export function currentTraceparentForUpstream(): string { ++ let ctx = traceStore.getStore(); ++ if (!ctx || !ctx.trace_id) { ++ ctx = { ++ trace_id: newTraceId(), ++ span_id: newSpanId(), ++ parent_span_id: "", ++ hop: HOP_NAME, ++ tool_name: "", ++ }; ++ } ++ return `00-${ctx.trace_id}-${ctx.span_id}-01`; ++} ++ ++// Single structured log line stamped with the current trace fields. Use ++// from any code path that has a bound trace context. Outside a context, ++// emits with empty fields so existing call sites do not crash. ++export function traceLog( ++ level: "info" | "warn" | "error", ++ msg: string, ++ extra: Record = {}, ++): void { ++ const ctx = traceStore.getStore(); ++ const rec: Record = { ++ ts: Date.now() / 1000, ++ level, ++ msg, ++ hop: HOP_NAME, ++ trace_id: ctx?.trace_id ?? "", ++ span_id: ctx?.span_id ?? "", ++ parent_span_id: ctx?.parent_span_id ?? "", ++ tool_name: ctx?.tool_name ?? "", ++ ...extra, ++ }; ++ const line = JSON.stringify(rec); ++ if (level === "error") console.error(line); ++ else if (level === "warn") console.warn(line); ++ else console.log(line); ++} +diff -urN metamcp-2.4.22.orig/apps/backend/src/lib/stdio-transport/process-managed-transport.ts metamcp-2.4.22/apps/backend/src/lib/stdio-transport/process-managed-transport.ts +--- a/apps/backend/src/lib/stdio-transport/process-managed-transport.ts 2026-05-23 23:45:36.315112630 +0300 ++++ b/apps/backend/src/lib/stdio-transport/process-managed-transport.ts 2026-05-23 23:51:05.898047455 +0300 +@@ -6,6 +6,7 @@ + import { JSONRPCMessage } from "@modelcontextprotocol/sdk/types.js"; + import spawn from "cross-spawn"; + ++import { currentTraceparentForUpstream } from "../observability/trace"; + import { ReadBuffer, serializeMessage } from "./shared"; + + export type StdioServerParameters = { +@@ -279,7 +280,32 @@ + throw new Error("Not connected"); + } + +- const json = serializeMessage(message); ++ // cluster#49 — propagate W3C trace context to the stdio child via ++ // params._meta.traceparent (the MCP spec's standard escape hatch ++ // for transport-agnostic metadata). Only stamp on messages with ++ // params (notifications/requests); JSON-RPC responses ride back ++ // up so don't need it. Idempotent: existing _meta is preserved. ++ const tracedMessage = (() => { ++ const m = message as JSONRPCMessage & { ++ params?: Record; ++ }; ++ if (!m || typeof m !== "object" || !("params" in m)) return message; ++ if (m.params == null || typeof m.params !== "object") return message; ++ const meta = (m.params as Record)._meta; ++ const traceparent = currentTraceparentForUpstream(); ++ return { ++ ...m, ++ params: { ++ ...m.params, ++ _meta: { ++ ...(typeof meta === "object" && meta !== null ? meta : {}), ++ traceparent, ++ }, ++ }, ++ } as JSONRPCMessage; ++ })(); ++ ++ const json = serializeMessage(tracedMessage); + if (this._process.stdin.write(json)) { + resolve(); + } else { diff --git a/packages/metamcp.nix b/packages/metamcp.nix index eed2648..f63628e 100644 --- a/packages/metamcp.nix +++ b/packages/metamcp.nix @@ -67,6 +67,15 @@ stdenv.mkDerivation (finalAttrs: { if (pkg.engines) delete pkg.engines.pnpm; fs.writeFileSync("package.json", JSON.stringify(pkg, null, 2) + "\n"); ' + + # cluster#49, cluster#50 — observability patch: W3C traceparent + # propagation (HTTP header → params._meta.traceparent on stdio + # children) + hand-rolled Prometheus metrics on /metrics, no new + # dependency so pnpmDeps stays untouched. Two new source files + # under apps/backend/src/lib/observability/ and minimal edits to + # apps/backend/src/index.ts and process-managed-transport.ts. + # Retire when this lands upstream. + patch -p1 < ${./metamcp-observability.patch} ''; # pnpmConfigHook places node_modules; build the workspace with Turbo.