metamcp: observability patch — W3C traceparent + Prometheus /metrics (cluster#49, cluster#50)
Adds a postPatch step that applies metamcp-observability.patch to
the upstream metatool-ai/metamcp v2.4.22 source before pnpm build.
The patch:
* Drops in apps/backend/src/lib/observability/{trace,metrics}.ts:
AsyncLocalStorage trace context + parser/synthesizer per the W3C
contract (Specs/mcp-request-id), plus a hand-rolled
Counter/Histogram so we don't have to touch pnpm-lock.yaml /
pnpmDeps hash (no new npm dependency).
* Wires a top-level express middleware in apps/backend/src/index.ts
that binds trace context, observes mcp_hop_duration_seconds on
response close, and counts mcp_cancellation_total when the
downstream client hangs up mid-response.
* Adds /metrics to the Express app and last-resort process traps
(unhandledRejection / uncaughtException) feeding
mcp_uncaught_throw_total — the smoking-gun signal from
cluster#44.
* Patches process-managed-transport.send() to inject
params._meta.traceparent on every outbound JSON-RPC bound for a
stdio child (MCP _meta convention, Specs/mcp-request-id).
Retire when this lands upstream.
This commit is contained in:
@@ -0,0 +1,441 @@
|
||||
diff -urN metamcp-2.4.22.orig/apps/backend/src/index.ts metamcp-2.4.22/apps/backend/src/index.ts
|
||||
--- a/apps/backend/src/index.ts 2026-05-23 23:45:36.313040968 +0300
|
||||
+++ b/apps/backend/src/index.ts 2026-05-23 23:50:43.286804066 +0300
|
||||
@@ -1,6 +1,14 @@
|
||||
import express from "express";
|
||||
|
||||
import { auth } from "./auth";
|
||||
+import {
|
||||
+ mcpCancellationTotal,
|
||||
+ mcpHopDurationSeconds,
|
||||
+ mcpTraceparentSynthesizedTotal,
|
||||
+ mcpUncaughtThrowTotal,
|
||||
+ renderMetricsExposition,
|
||||
+} from "./lib/observability/metrics";
|
||||
+import { bindFromHeader, HOP_NAME, traceLog, traceStore } from "./lib/observability/trace";
|
||||
import { initializeIdleServers } from "./lib/startup";
|
||||
import mcpProxyRouter from "./routers/mcp-proxy";
|
||||
import oauthRouter from "./routers/oauth";
|
||||
@@ -9,6 +17,42 @@
|
||||
|
||||
const app = express();
|
||||
|
||||
+// cluster#49 — bind W3C trace context for every request and stamp every
|
||||
+// downstream log via traceStore.run(). cluster#50 — observe wall time
|
||||
+// per hop and count cancellations.
|
||||
+app.use((req, res, next) => {
|
||||
+ const raw = req.headers.traceparent as string | undefined;
|
||||
+ const { ctx, event } = bindFromHeader(raw);
|
||||
+ if (event === "traceparent_synthesized") {
|
||||
+ mcpTraceparentSynthesizedTotal.inc({ hop: HOP_NAME });
|
||||
+ }
|
||||
+ const t0 = process.hrtime.bigint();
|
||||
+ let status: "success" | "error" | "cancelled" = "success";
|
||||
+ res.on("close", () => {
|
||||
+ // res.writableEnded is true when we finished writing; false here
|
||||
+ // means the downstream client hung up mid-response.
|
||||
+ if (!res.writableEnded) {
|
||||
+ status = "cancelled";
|
||||
+ mcpCancellationTotal.inc({ hop: HOP_NAME, source: "downstream" });
|
||||
+ }
|
||||
+ const dt = Number(process.hrtime.bigint() - t0) / 1e9;
|
||||
+ mcpHopDurationSeconds.observe(
|
||||
+ { hop: HOP_NAME, tool_name: "", status },
|
||||
+ dt,
|
||||
+ );
|
||||
+ });
|
||||
+ traceStore.run(ctx, () => {
|
||||
+ if (event) traceLog("info", `event=${event}`);
|
||||
+ next();
|
||||
+ });
|
||||
+});
|
||||
+
|
||||
+// cluster#50 — Prometheus exposition. Schema in Specs/mcp-metric-schema.
|
||||
+app.get("/metrics", (_req, res) => {
|
||||
+ res.set("Content-Type", "text/plain; version=0.0.4");
|
||||
+ res.send(renderMetricsExposition());
|
||||
+});
|
||||
+
|
||||
// Global JSON middleware for non-proxy routes
|
||||
app.use((req, res, next) => {
|
||||
if (req.path.startsWith("/mcp-proxy/") || req.path.startsWith("/metamcp/")) {
|
||||
@@ -108,3 +152,39 @@
|
||||
status: "ok",
|
||||
});
|
||||
});
|
||||
+
|
||||
+// cluster#50 — Express error tail. Any unhandled error in middleware /
|
||||
+// handlers funnels here; we count it so the uncaught-throw alert fires.
|
||||
+app.use(
|
||||
+ (
|
||||
+ err: unknown,
|
||||
+ _req: express.Request,
|
||||
+ res: express.Response,
|
||||
+ next: express.NextFunction,
|
||||
+ ) => {
|
||||
+ mcpUncaughtThrowTotal.inc({ hop: HOP_NAME, component: "express" });
|
||||
+ traceLog("error", "event=uncaught_throw", {
|
||||
+ error: err instanceof Error ? err.message : String(err),
|
||||
+ });
|
||||
+ if (res.headersSent) return next(err);
|
||||
+ res.status(500).json({ error: "internal error" });
|
||||
+ },
|
||||
+);
|
||||
+
|
||||
+// cluster#50 — last-resort process traps so unhandled rejections /
|
||||
+// uncaught exceptions still feed the alert. Counting only; existing
|
||||
+// behaviour (log + continue) is preserved.
|
||||
+process.on("unhandledRejection", (err) => {
|
||||
+ mcpUncaughtThrowTotal.inc({ hop: HOP_NAME, component: "unhandledRejection" });
|
||||
+ traceLog("error", "event=uncaught_throw", {
|
||||
+ component: "unhandledRejection",
|
||||
+ error: err instanceof Error ? err.message : String(err),
|
||||
+ });
|
||||
+});
|
||||
+process.on("uncaughtException", (err) => {
|
||||
+ mcpUncaughtThrowTotal.inc({ hop: HOP_NAME, component: "uncaughtException" });
|
||||
+ traceLog("error", "event=uncaught_throw", {
|
||||
+ component: "uncaughtException",
|
||||
+ error: err.message,
|
||||
+ });
|
||||
+});
|
||||
diff -urN metamcp-2.4.22.orig/apps/backend/src/lib/observability/metrics.ts metamcp-2.4.22/apps/backend/src/lib/observability/metrics.ts
|
||||
--- a/apps/backend/src/lib/observability/metrics.ts 1970-01-01 03:00:00.000000000 +0300
|
||||
+++ b/apps/backend/src/lib/observability/metrics.ts 2026-05-23 23:49:42.080145070 +0300
|
||||
@@ -0,0 +1,160 @@
|
||||
+// Hand-rolled Prometheus metrics for the metamcp aggregator.
|
||||
+//
|
||||
+// We avoid adding `prom-client` to package.json so the pnpm lockfile and
|
||||
+// the Nix `pnpmDeps` hash stay untouched (cluster#50). This module
|
||||
+// emits the canonical schema from Specs/mcp-metric-schema:
|
||||
+// mcp_hop_duration_seconds (histogram)
|
||||
+// mcp_cancellation_total (counter)
|
||||
+// mcp_uncaught_throw_total (counter)
|
||||
+// mcp_traceparent_synthesized_total (counter)
|
||||
+//
|
||||
+// Exposition is text format v0.0.4. The /metrics endpoint is registered
|
||||
+// in index.ts.
|
||||
+
|
||||
+type LabelSet = Record<string, string>;
|
||||
+
|
||||
+function escapeLabel(v: string): string {
|
||||
+ return v.replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/\n/g, "\\n");
|
||||
+}
|
||||
+
|
||||
+function renderLabels(labels: LabelSet): string {
|
||||
+ const parts: string[] = [];
|
||||
+ for (const k of Object.keys(labels).sort()) {
|
||||
+ parts.push(`${k}="${escapeLabel(labels[k])}"`);
|
||||
+ }
|
||||
+ return parts.length ? `{${parts.join(",")}}` : "";
|
||||
+}
|
||||
+
|
||||
+function labelKey(labels: LabelSet): string {
|
||||
+ // Stable key for the labels map. Sort keys so the order is deterministic.
|
||||
+ const ks = Object.keys(labels).sort();
|
||||
+ return ks.map((k) => `${k}=${labels[k]}`).join("\x1f");
|
||||
+}
|
||||
+
|
||||
+export class Counter {
|
||||
+ private values = new Map<string, { labels: LabelSet; value: number }>();
|
||||
+
|
||||
+ constructor(
|
||||
+ public readonly name: string,
|
||||
+ public readonly help: string,
|
||||
+ public readonly labelNames: readonly string[],
|
||||
+ ) {}
|
||||
+
|
||||
+ inc(labels: LabelSet, by: number = 1): void {
|
||||
+ const key = labelKey(labels);
|
||||
+ const cur = this.values.get(key);
|
||||
+ if (cur) {
|
||||
+ cur.value += by;
|
||||
+ } else {
|
||||
+ this.values.set(key, { labels: { ...labels }, value: by });
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ render(): string {
|
||||
+ const lines: string[] = [];
|
||||
+ lines.push(`# HELP ${this.name} ${this.help}`);
|
||||
+ lines.push(`# TYPE ${this.name} counter`);
|
||||
+ for (const { labels, value } of this.values.values()) {
|
||||
+ lines.push(`${this.name}${renderLabels(labels)} ${value}`);
|
||||
+ }
|
||||
+ return lines.join("\n");
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+export class Histogram {
|
||||
+ private series = new Map<
|
||||
+ string,
|
||||
+ {
|
||||
+ labels: LabelSet;
|
||||
+ bucketCounts: number[]; // counts per bucket boundary (cumulative on render)
|
||||
+ sum: number;
|
||||
+ count: number;
|
||||
+ }
|
||||
+ >();
|
||||
+
|
||||
+ constructor(
|
||||
+ public readonly name: string,
|
||||
+ public readonly help: string,
|
||||
+ public readonly labelNames: readonly string[],
|
||||
+ public readonly buckets: readonly number[],
|
||||
+ ) {}
|
||||
+
|
||||
+ observe(labels: LabelSet, value: number): void {
|
||||
+ const key = labelKey(labels);
|
||||
+ let s = this.series.get(key);
|
||||
+ if (!s) {
|
||||
+ s = {
|
||||
+ labels: { ...labels },
|
||||
+ bucketCounts: new Array(this.buckets.length).fill(0),
|
||||
+ sum: 0,
|
||||
+ count: 0,
|
||||
+ };
|
||||
+ this.series.set(key, s);
|
||||
+ }
|
||||
+ for (let i = 0; i < this.buckets.length; i++) {
|
||||
+ if (value <= this.buckets[i]) s.bucketCounts[i] += 1;
|
||||
+ }
|
||||
+ s.sum += value;
|
||||
+ s.count += 1;
|
||||
+ }
|
||||
+
|
||||
+ render(): string {
|
||||
+ const lines: string[] = [];
|
||||
+ lines.push(`# HELP ${this.name} ${this.help}`);
|
||||
+ lines.push(`# TYPE ${this.name} histogram`);
|
||||
+ for (const s of this.series.values()) {
|
||||
+ for (let i = 0; i < this.buckets.length; i++) {
|
||||
+ const le = String(this.buckets[i]);
|
||||
+ const lbls = renderLabels({ ...s.labels, le });
|
||||
+ lines.push(`${this.name}_bucket${lbls} ${s.bucketCounts[i]}`);
|
||||
+ }
|
||||
+ const lblsInf = renderLabels({ ...s.labels, le: "+Inf" });
|
||||
+ lines.push(`${this.name}_bucket${lblsInf} ${s.count}`);
|
||||
+ lines.push(`${this.name}_sum${renderLabels(s.labels)} ${s.sum}`);
|
||||
+ lines.push(`${this.name}_count${renderLabels(s.labels)} ${s.count}`);
|
||||
+ }
|
||||
+ return lines.join("\n");
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+// --- Schema instances --------------------------------------------------------
|
||||
+
|
||||
+const DURATION_BUCKETS = [
|
||||
+ 0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60, 120, 300,
|
||||
+];
|
||||
+
|
||||
+export const mcpHopDurationSeconds = new Histogram(
|
||||
+ "mcp_hop_duration_seconds",
|
||||
+ "Wall time spent in this hop processing one MCP request",
|
||||
+ ["hop", "tool_name", "status"],
|
||||
+ DURATION_BUCKETS,
|
||||
+);
|
||||
+
|
||||
+export const mcpCancellationTotal = new Counter(
|
||||
+ "mcp_cancellation_total",
|
||||
+ "Cancellations observed per hop",
|
||||
+ ["hop", "source"],
|
||||
+);
|
||||
+
|
||||
+export const mcpUncaughtThrowTotal = new Counter(
|
||||
+ "mcp_uncaught_throw_total",
|
||||
+ "Uncaught exceptions per component (today's smoking gun, see cluster#44)",
|
||||
+ ["hop", "component"],
|
||||
+);
|
||||
+
|
||||
+export const mcpTraceparentSynthesizedTotal = new Counter(
|
||||
+ "mcp_traceparent_synthesized_total",
|
||||
+ "Inbound MCP requests missing a traceparent header",
|
||||
+ ["hop"],
|
||||
+);
|
||||
+
|
||||
+export function renderMetricsExposition(): string {
|
||||
+ return (
|
||||
+ [
|
||||
+ mcpHopDurationSeconds.render(),
|
||||
+ mcpCancellationTotal.render(),
|
||||
+ mcpUncaughtThrowTotal.render(),
|
||||
+ mcpTraceparentSynthesizedTotal.render(),
|
||||
+ ].join("\n") + "\n"
|
||||
+ );
|
||||
+}
|
||||
diff -urN metamcp-2.4.22.orig/apps/backend/src/lib/observability/trace.ts metamcp-2.4.22/apps/backend/src/lib/observability/trace.ts
|
||||
--- a/apps/backend/src/lib/observability/trace.ts 1970-01-01 03:00:00.000000000 +0300
|
||||
+++ b/apps/backend/src/lib/observability/trace.ts 2026-05-23 23:49:11.700817887 +0300
|
||||
@@ -0,0 +1,127 @@
|
||||
+// W3C Trace Context propagation for the metamcp aggregator.
|
||||
+//
|
||||
+// Implements the contract from cluster#45 (Specs/mcp-request-id):
|
||||
+// traceparent header: 00-<trace-id:32-hex>-<span-id:16-hex>-<flags:2-hex>
|
||||
+// Bound per-request via AsyncLocalStorage. Inbound HTTP header is read by
|
||||
+// the express middleware in index.ts; outbound JSON-RPC to stdio children
|
||||
+// reads currentTraceparentForUpstream() and injects it under
|
||||
+// params._meta.traceparent (process-managed-transport.send).
|
||||
+
|
||||
+import { AsyncLocalStorage } from "node:async_hooks";
|
||||
+import { randomBytes } from "node:crypto";
|
||||
+
|
||||
+export const HOP_NAME = "metamcp";
|
||||
+
|
||||
+export interface TraceCtx {
|
||||
+ trace_id: string;
|
||||
+ span_id: string;
|
||||
+ parent_span_id: string;
|
||||
+ hop: string;
|
||||
+ tool_name: string;
|
||||
+}
|
||||
+
|
||||
+const TRACEPARENT_RE = /^00-([0-9a-f]{32})-([0-9a-f]{16})-[0-9a-f]{2}$/;
|
||||
+
|
||||
+export const traceStore = new AsyncLocalStorage<TraceCtx>();
|
||||
+
|
||||
+export function newTraceId(): string {
|
||||
+ return randomBytes(16).toString("hex");
|
||||
+}
|
||||
+
|
||||
+export function newSpanId(): string {
|
||||
+ return randomBytes(8).toString("hex");
|
||||
+}
|
||||
+
|
||||
+export type ParsedTraceparent = {
|
||||
+ trace_id: string;
|
||||
+ parent_span_id: string;
|
||||
+ malformed: boolean;
|
||||
+};
|
||||
+
|
||||
+export function parseTraceparent(raw: string | undefined): ParsedTraceparent {
|
||||
+ if (typeof raw !== "string") {
|
||||
+ return { trace_id: "", parent_span_id: "", malformed: false };
|
||||
+ }
|
||||
+ const m = TRACEPARENT_RE.exec(raw.trim());
|
||||
+ if (!m) {
|
||||
+ return { trace_id: newTraceId(), parent_span_id: "", malformed: true };
|
||||
+ }
|
||||
+ return { trace_id: m[1], parent_span_id: m[2], malformed: false };
|
||||
+}
|
||||
+
|
||||
+export type BindResult = {
|
||||
+ ctx: TraceCtx;
|
||||
+ // One of: "" (normal), "traceparent_synthesized", "traceparent_malformed".
|
||||
+ // Caller emits the reserved event log so the dashboard can count it.
|
||||
+ event: "" | "traceparent_synthesized" | "traceparent_malformed";
|
||||
+};
|
||||
+
|
||||
+export function bindFromHeader(
|
||||
+ raw: string | undefined,
|
||||
+ tool_name: string = "",
|
||||
+): BindResult {
|
||||
+ let event: BindResult["event"] = "";
|
||||
+ let trace_id = "";
|
||||
+ let parent_span_id = "";
|
||||
+ if (raw == null) {
|
||||
+ trace_id = newTraceId();
|
||||
+ event = "traceparent_synthesized";
|
||||
+ } else {
|
||||
+ const p = parseTraceparent(raw);
|
||||
+ trace_id = p.trace_id;
|
||||
+ parent_span_id = p.parent_span_id;
|
||||
+ if (p.malformed) event = "traceparent_malformed";
|
||||
+ }
|
||||
+ const ctx: TraceCtx = {
|
||||
+ trace_id,
|
||||
+ span_id: newSpanId(),
|
||||
+ parent_span_id,
|
||||
+ hop: HOP_NAME,
|
||||
+ tool_name,
|
||||
+ };
|
||||
+ return { ctx, event };
|
||||
+}
|
||||
+
|
||||
+// Build a traceparent literal for outbound calls. Our span-id becomes the
|
||||
+// downstream hop's parent_span_id (standard W3C propagation). Falls back
|
||||
+// to synthesizing a fresh context when called outside any bound request
|
||||
+// (e.g. session-pool keepalive paths that originate inside the server).
|
||||
+export function currentTraceparentForUpstream(): string {
|
||||
+ let ctx = traceStore.getStore();
|
||||
+ if (!ctx || !ctx.trace_id) {
|
||||
+ ctx = {
|
||||
+ trace_id: newTraceId(),
|
||||
+ span_id: newSpanId(),
|
||||
+ parent_span_id: "",
|
||||
+ hop: HOP_NAME,
|
||||
+ tool_name: "",
|
||||
+ };
|
||||
+ }
|
||||
+ return `00-${ctx.trace_id}-${ctx.span_id}-01`;
|
||||
+}
|
||||
+
|
||||
+// Single structured log line stamped with the current trace fields. Use
|
||||
+// from any code path that has a bound trace context. Outside a context,
|
||||
+// emits with empty fields so existing call sites do not crash.
|
||||
+export function traceLog(
|
||||
+ level: "info" | "warn" | "error",
|
||||
+ msg: string,
|
||||
+ extra: Record<string, unknown> = {},
|
||||
+): void {
|
||||
+ const ctx = traceStore.getStore();
|
||||
+ const rec: Record<string, unknown> = {
|
||||
+ ts: Date.now() / 1000,
|
||||
+ level,
|
||||
+ msg,
|
||||
+ hop: HOP_NAME,
|
||||
+ trace_id: ctx?.trace_id ?? "",
|
||||
+ span_id: ctx?.span_id ?? "",
|
||||
+ parent_span_id: ctx?.parent_span_id ?? "",
|
||||
+ tool_name: ctx?.tool_name ?? "",
|
||||
+ ...extra,
|
||||
+ };
|
||||
+ const line = JSON.stringify(rec);
|
||||
+ if (level === "error") console.error(line);
|
||||
+ else if (level === "warn") console.warn(line);
|
||||
+ else console.log(line);
|
||||
+}
|
||||
diff -urN metamcp-2.4.22.orig/apps/backend/src/lib/stdio-transport/process-managed-transport.ts metamcp-2.4.22/apps/backend/src/lib/stdio-transport/process-managed-transport.ts
|
||||
--- a/apps/backend/src/lib/stdio-transport/process-managed-transport.ts 2026-05-23 23:45:36.315112630 +0300
|
||||
+++ b/apps/backend/src/lib/stdio-transport/process-managed-transport.ts 2026-05-23 23:51:05.898047455 +0300
|
||||
@@ -6,6 +6,7 @@
|
||||
import { JSONRPCMessage } from "@modelcontextprotocol/sdk/types.js";
|
||||
import spawn from "cross-spawn";
|
||||
|
||||
+import { currentTraceparentForUpstream } from "../observability/trace";
|
||||
import { ReadBuffer, serializeMessage } from "./shared";
|
||||
|
||||
export type StdioServerParameters = {
|
||||
@@ -279,7 +280,32 @@
|
||||
throw new Error("Not connected");
|
||||
}
|
||||
|
||||
- const json = serializeMessage(message);
|
||||
+ // cluster#49 — propagate W3C trace context to the stdio child via
|
||||
+ // params._meta.traceparent (the MCP spec's standard escape hatch
|
||||
+ // for transport-agnostic metadata). Only stamp on messages with
|
||||
+ // params (notifications/requests); JSON-RPC responses ride back
|
||||
+ // up so don't need it. Idempotent: existing _meta is preserved.
|
||||
+ const tracedMessage = (() => {
|
||||
+ const m = message as JSONRPCMessage & {
|
||||
+ params?: Record<string, unknown>;
|
||||
+ };
|
||||
+ if (!m || typeof m !== "object" || !("params" in m)) return message;
|
||||
+ if (m.params == null || typeof m.params !== "object") return message;
|
||||
+ const meta = (m.params as Record<string, unknown>)._meta;
|
||||
+ const traceparent = currentTraceparentForUpstream();
|
||||
+ return {
|
||||
+ ...m,
|
||||
+ params: {
|
||||
+ ...m.params,
|
||||
+ _meta: {
|
||||
+ ...(typeof meta === "object" && meta !== null ? meta : {}),
|
||||
+ traceparent,
|
||||
+ },
|
||||
+ },
|
||||
+ } as JSONRPCMessage;
|
||||
+ })();
|
||||
+
|
||||
+ const json = serializeMessage(tracedMessage);
|
||||
if (this._process.stdin.write(json)) {
|
||||
resolve();
|
||||
} else {
|
||||
@@ -67,6 +67,15 @@ stdenv.mkDerivation (finalAttrs: {
|
||||
if (pkg.engines) delete pkg.engines.pnpm;
|
||||
fs.writeFileSync("package.json", JSON.stringify(pkg, null, 2) + "\n");
|
||||
'
|
||||
|
||||
# cluster#49, cluster#50 — observability patch: W3C traceparent
|
||||
# propagation (HTTP header → params._meta.traceparent on stdio
|
||||
# children) + hand-rolled Prometheus metrics on /metrics, no new
|
||||
# dependency so pnpmDeps stays untouched. Two new source files
|
||||
# under apps/backend/src/lib/observability/ and minimal edits to
|
||||
# apps/backend/src/index.ts and process-managed-transport.ts.
|
||||
# Retire when this lands upstream.
|
||||
patch -p1 < ${./metamcp-observability.patch}
|
||||
'';
|
||||
|
||||
# pnpmConfigHook places node_modules; build the workspace with Turbo.
|
||||
|
||||
Reference in New Issue
Block a user