Files
flake-hub/packages/metamcp-observability.patch
Oleks cb369d3be3
ci/woodpecker/push/arm64 Pipeline failed
ci/woodpecker/push/amd64 Pipeline failed
metamcp: observability patch — W3C traceparent + Prometheus /metrics (cluster#49, cluster#50)
Adds a postPatch step that applies metamcp-observability.patch to
the upstream metatool-ai/metamcp v2.4.22 source before pnpm build.
The patch:

  * Drops in apps/backend/src/lib/observability/{trace,metrics}.ts:
    AsyncLocalStorage trace context + parser/synthesizer per the W3C
    contract (Specs/mcp-request-id), plus a hand-rolled
    Counter/Histogram so we don't have to touch pnpm-lock.yaml /
    pnpmDeps hash (no new npm dependency).

  * Wires a top-level express middleware in apps/backend/src/index.ts
    that binds trace context, observes mcp_hop_duration_seconds on
    response close, and counts mcp_cancellation_total when the
    downstream client hangs up mid-response.

  * Adds /metrics to the Express app and last-resort process traps
    (unhandledRejection / uncaughtException) feeding
    mcp_uncaught_throw_total — the smoking-gun signal from
    cluster#44.

  * Patches process-managed-transport.send() to inject
    params._meta.traceparent on every outbound JSON-RPC bound for a
    stdio child (MCP _meta convention, Specs/mcp-request-id).

Retire when this lands upstream.
2026-05-24 05:34:47 +03:00

442 lines
15 KiB
Diff

diff -urN metamcp-2.4.22.orig/apps/backend/src/index.ts metamcp-2.4.22/apps/backend/src/index.ts
--- a/apps/backend/src/index.ts 2026-05-23 23:45:36.313040968 +0300
+++ b/apps/backend/src/index.ts 2026-05-23 23:50:43.286804066 +0300
@@ -1,6 +1,14 @@
import express from "express";
import { auth } from "./auth";
+import {
+ mcpCancellationTotal,
+ mcpHopDurationSeconds,
+ mcpTraceparentSynthesizedTotal,
+ mcpUncaughtThrowTotal,
+ renderMetricsExposition,
+} from "./lib/observability/metrics";
+import { bindFromHeader, HOP_NAME, traceLog, traceStore } from "./lib/observability/trace";
import { initializeIdleServers } from "./lib/startup";
import mcpProxyRouter from "./routers/mcp-proxy";
import oauthRouter from "./routers/oauth";
@@ -9,6 +17,42 @@
const app = express();
+// cluster#50 — Prometheus exposition. Schema in Specs/mcp-metric-schema.
+// Registered BEFORE the tracing middleware so that scrapes are not bound
+// to a trace, do not bump mcp_traceparent_synthesized_total, and are not
+// observed in mcp_hop_duration_seconds (both metrics describe MCP
+// requests, not monitoring traffic).
+app.get("/metrics", (_req, res) => {
+  res.set("Content-Type", "text/plain; version=0.0.4");
+  res.send(renderMetricsExposition());
+});
+
+// cluster#49 — bind W3C trace context for every request and stamp every
+// downstream log via traceStore.run(). cluster#50 — observe wall time
+// per hop and count cancellations.
+app.use((req, res, next) => {
+  // Node types non-standard headers as string | string[]; narrow at
+  // runtime instead of asserting the type.
+  const header = req.headers.traceparent;
+  const raw = Array.isArray(header) ? header[0] : header;
+  const { ctx, event } = bindFromHeader(raw);
+  if (event === "traceparent_synthesized") {
+    mcpTraceparentSynthesizedTotal.inc({ hop: HOP_NAME });
+  }
+  const t0 = process.hrtime.bigint();
+  res.on("close", () => {
+    let status: "success" | "error" | "cancelled";
+    if (!res.writableEnded) {
+      // res.writableEnded is true when we finished writing; false here
+      // means the downstream client hung up mid-response.
+      status = "cancelled";
+      mcpCancellationTotal.inc({ hop: HOP_NAME, source: "downstream" });
+    } else if (res.statusCode >= 500) {
+      // HTTP-level failures only. A JSON-RPC error carried in a 2xx body
+      // is not visible here and is still recorded as "success".
+      status = "error";
+    } else {
+      status = "success";
+    }
+    // hrtime.bigint() is in nanoseconds; the histogram is in seconds.
+    const dt = Number(process.hrtime.bigint() - t0) / 1e9;
+    mcpHopDurationSeconds.observe(
+      { hop: HOP_NAME, tool_name: "", status },
+      dt,
+    );
+  });
+  // Everything reached through next() runs inside the bound context, so
+  // traceLog() calls further down the chain pick up these trace fields.
+  traceStore.run(ctx, () => {
+    if (event) traceLog("info", `event=${event}`);
+    next();
+  });
+});
+
// Global JSON middleware for non-proxy routes
app.use((req, res, next) => {
if (req.path.startsWith("/mcp-proxy/") || req.path.startsWith("/metamcp/")) {
@@ -108,3 +152,39 @@
status: "ok",
});
});
+
+// cluster#50 — Express error tail. Any unhandled error in middleware /
+// handlers funnels here; we count it so the uncaught-throw alert fires.
+// Express recognises error handlers by their four-parameter signature,
+// so the unused `_req` parameter must stay.
+app.use(
+  (
+    err: unknown,
+    _req: express.Request,
+    res: express.Response,
+    next: express.NextFunction,
+  ) => {
+    mcpUncaughtThrowTotal.inc({ hop: HOP_NAME, component: "express" });
+    // Log the same `component` value used as the metric label, matching
+    // the records emitted by the process-level traps.
+    traceLog("error", "event=uncaught_throw", {
+      component: "express",
+      error: err instanceof Error ? err.message : String(err),
+    });
+    // Once headers are out we cannot send a 500; hand the error on to
+    // the next error handler instead.
+    if (res.headersSent) return next(err);
+    res.status(500).json({ error: "internal error" });
+  },
+);
+
+// cluster#50 — last-resort process traps so unhandled rejections /
+// uncaught exceptions still feed the alert. Counting only: these must
+// not change what the process does after the failure.
+//
+// NOTE(review): Node has no monitor-style event for unhandled
+// rejections. If no other "unhandledRejection" listener is installed,
+// registering this one stops Node from escalating the rejection to an
+// uncaught exception — confirm upstream installs its own listener.
+process.on("unhandledRejection", (reason) => {
+  mcpUncaughtThrowTotal.inc({ hop: HOP_NAME, component: "unhandledRejection" });
+  traceLog("error", "event=uncaught_throw", {
+    component: "unhandledRejection",
+    error: reason instanceof Error ? reason.message : String(reason),
+  });
+});
+// "uncaughtExceptionMonitor" runs before any "uncaughtException"
+// listener and does not itself keep the process alive, so observing
+// here cannot turn a crash into a silent continue the way an
+// "uncaughtException" listener would.
+process.on("uncaughtExceptionMonitor", (err) => {
+  mcpUncaughtThrowTotal.inc({ hop: HOP_NAME, component: "uncaughtException" });
+  traceLog("error", "event=uncaught_throw", {
+    component: "uncaughtException",
+    // Anything can be thrown, not only Error instances.
+    error: err instanceof Error ? err.message : String(err),
+  });
+});
diff -urN metamcp-2.4.22.orig/apps/backend/src/lib/observability/metrics.ts metamcp-2.4.22/apps/backend/src/lib/observability/metrics.ts
--- a/apps/backend/src/lib/observability/metrics.ts 1970-01-01 03:00:00.000000000 +0300
+++ b/apps/backend/src/lib/observability/metrics.ts 2026-05-23 23:49:42.080145070 +0300
@@ -0,0 +1,160 @@
+// Hand-rolled Prometheus metrics for the metamcp aggregator.
+//
+// We avoid adding `prom-client` to package.json so the pnpm lockfile and
+// the Nix `pnpmDeps` hash stay untouched (cluster#50). This module
+// emits the canonical schema from Specs/mcp-metric-schema:
+// mcp_hop_duration_seconds (histogram)
+// mcp_cancellation_total (counter)
+// mcp_uncaught_throw_total (counter)
+// mcp_traceparent_synthesized_total (counter)
+//
+// Exposition is text format v0.0.4. The /metrics endpoint is registered
+// in index.ts.
+
+type LabelSet = Record<string, string>;
+
+/**
+ * Escape a label value for the text exposition format: backslash,
+ * double quote and line feed each get a backslash escape.
+ */
+function escapeLabel(v: string): string {
+  // Single pass over the three characters that need escaping.
+  return v.replace(/[\\"\n]/g, (ch) => (ch === "\n" ? "\\n" : `\\${ch}`));
+}
+
+/**
+ * Render a label set as `{k1="v1",k2="v2"}` with names in sorted order
+ * and values escaped. An empty label set renders as the empty string.
+ */
+function renderLabels(labels: LabelSet): string {
+  const names = Object.keys(labels).sort();
+  if (names.length === 0) return "";
+  const pairs = names.map((name) => `${name}="${escapeLabel(labels[name])}"`);
+  return `{${pairs.join(",")}}`;
+}
+
+/**
+ * Build the map key that identifies one label set within a metric.
+ *
+ * Names are sorted so insertion order does not matter. The sorted
+ * [name, value] pairs are JSON-encoded so that a value containing a
+ * separator character cannot produce the same key as a different label
+ * set, which would silently merge two series.
+ */
+function labelKey(labels: LabelSet): string {
+  const pairs = Object.keys(labels)
+    .sort()
+    .map((name) => [name, labels[name]]);
+  return JSON.stringify(pairs);
+}
+
+/**
+ * Monotonic counter with one running total per distinct label set.
+ *
+ * `labelNames` records the declared label names; it is not enforced
+ * against the label sets passed to `inc`.
+ */
+export class Counter {
+  private readonly values = new Map<string, { labels: LabelSet; value: number }>();
+
+  constructor(
+    public readonly name: string,
+    public readonly help: string,
+    public readonly labelNames: readonly string[],
+  ) {}
+
+  /**
+   * Add `by` (default 1) to the series identified by `labels`, creating
+   * the series on first use.
+   *
+   * @throws RangeError if `by` is negative or not finite. A counter may
+   *   only go up, and a NaN increment would poison the series for good.
+   */
+  inc(labels: LabelSet, by: number = 1): void {
+    if (!Number.isFinite(by) || by < 0) {
+      throw new RangeError(
+        `${this.name}: counter increment must be finite and >= 0, got ${by}`,
+      );
+    }
+    const key = labelKey(labels);
+    const cur = this.values.get(key);
+    if (cur) {
+      cur.value += by;
+    } else {
+      // Copy the labels so later mutation by the caller cannot alter the series.
+      this.values.set(key, { labels: { ...labels }, value: by });
+    }
+  }
+
+  /**
+   * Render the HELP and TYPE lines followed by one sample line per
+   * series. The result has no trailing newline.
+   */
+  render(): string {
+    const lines: string[] = [];
+    lines.push(`# HELP ${this.name} ${this.help}`);
+    lines.push(`# TYPE ${this.name} counter`);
+    for (const { labels, value } of this.values.values()) {
+      lines.push(`${this.name}${renderLabels(labels)} ${value}`);
+    }
+    return lines.join("\n");
+  }
+}
+
+/**
+ * Histogram with fixed upper bounds and one series per distinct label
+ * set.
+ *
+ * `labelNames` records the declared label names; it is not enforced
+ * against the label sets passed to `observe`.
+ */
+export class Histogram {
+  private readonly series = new Map<
+    string,
+    {
+      labels: LabelSet;
+      // Already cumulative: bucketCounts[i] is the number of observations
+      // <= buckets[i], so render() can emit the values as they are.
+      bucketCounts: number[];
+      sum: number;
+      count: number;
+    }
+  >();
+
+  constructor(
+    public readonly name: string,
+    public readonly help: string,
+    public readonly labelNames: readonly string[],
+    public readonly buckets: readonly number[],
+  ) {}
+
+  /**
+   * Record one observation in the series identified by `labels`,
+   * creating the series on first use.
+   *
+   * @throws RangeError if `value` is not finite. A NaN would land in no
+   *   bucket yet still turn the series sum into NaN permanently.
+   */
+  observe(labels: LabelSet, value: number): void {
+    if (!Number.isFinite(value)) {
+      throw new RangeError(
+        `${this.name}: observed value must be finite, got ${value}`,
+      );
+    }
+    const key = labelKey(labels);
+    let s = this.series.get(key);
+    if (!s) {
+      s = {
+        // Copy the labels so later mutation by the caller cannot alter the series.
+        labels: { ...labels },
+        bucketCounts: new Array<number>(this.buckets.length).fill(0),
+        sum: 0,
+        count: 0,
+      };
+      this.series.set(key, s);
+    }
+    // Increment every bucket whose bound admits the value; this is what
+    // keeps bucketCounts cumulative.
+    for (let i = 0; i < this.buckets.length; i++) {
+      if (value <= this.buckets[i]) s.bucketCounts[i] += 1;
+    }
+    s.sum += value;
+    s.count += 1;
+  }
+
+  /**
+   * Render the HELP and TYPE lines, then for each series one `_bucket`
+   * line per bound, a `+Inf` bucket equal to the observation count, and
+   * the `_sum` and `_count` lines. The result has no trailing newline.
+   */
+  render(): string {
+    const lines: string[] = [];
+    lines.push(`# HELP ${this.name} ${this.help}`);
+    lines.push(`# TYPE ${this.name} histogram`);
+    for (const s of this.series.values()) {
+      for (let i = 0; i < this.buckets.length; i++) {
+        const le = String(this.buckets[i]);
+        const lbls = renderLabels({ ...s.labels, le });
+        lines.push(`${this.name}_bucket${lbls} ${s.bucketCounts[i]}`);
+      }
+      const lblsInf = renderLabels({ ...s.labels, le: "+Inf" });
+      lines.push(`${this.name}_bucket${lblsInf} ${s.count}`);
+      lines.push(`${this.name}_sum${renderLabels(s.labels)} ${s.sum}`);
+      lines.push(`${this.name}_count${renderLabels(s.labels)} ${s.count}`);
+    }
+    return lines.join("\n");
+  }
+}
+
+// --- Schema instances --------------------------------------------------------
+
+// Histogram upper bounds shared by the duration metric below. The metric
+// is named in seconds, so the range runs from 10 ms to 300 s.
+const DURATION_BUCKETS = [
+ 0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60, 120, 300,
+];
+
+// Per-request wall time. Declared labels: hop, tool_name, status.
+export const mcpHopDurationSeconds = new Histogram(
+ "mcp_hop_duration_seconds",
+ "Wall time spent in this hop processing one MCP request",
+ ["hop", "tool_name", "status"],
+ DURATION_BUCKETS,
+);
+
+// Cancellation count. Declared labels: hop, source.
+export const mcpCancellationTotal = new Counter(
+ "mcp_cancellation_total",
+ "Cancellations observed per hop",
+ ["hop", "source"],
+);
+
+// Uncaught error count. Declared labels: hop, component.
+export const mcpUncaughtThrowTotal = new Counter(
+ "mcp_uncaught_throw_total",
+ "Uncaught exceptions per component (today's smoking gun, see cluster#44)",
+ ["hop", "component"],
+);
+
+// Count of requests that arrived without a traceparent. Declared label: hop.
+export const mcpTraceparentSynthesizedTotal = new Counter(
+ "mcp_traceparent_synthesized_total",
+ "Inbound MCP requests missing a traceparent header",
+ ["hop"],
+);
+
+/**
+ * Render all four schema metrics as one exposition document: each
+ * metric's block in declaration order, joined by newlines and ending
+ * with a newline.
+ */
+export function renderMetricsExposition(): string {
+  const blocks = [
+    mcpHopDurationSeconds,
+    mcpCancellationTotal,
+    mcpUncaughtThrowTotal,
+    mcpTraceparentSynthesizedTotal,
+  ].map((metric) => metric.render());
+  return `${blocks.join("\n")}\n`;
+}
diff -urN metamcp-2.4.22.orig/apps/backend/src/lib/observability/trace.ts metamcp-2.4.22/apps/backend/src/lib/observability/trace.ts
--- a/apps/backend/src/lib/observability/trace.ts 1970-01-01 03:00:00.000000000 +0300
+++ b/apps/backend/src/lib/observability/trace.ts 2026-05-23 23:49:11.700817887 +0300
@@ -0,0 +1,127 @@
+// W3C Trace Context propagation for the metamcp aggregator.
+//
+// Implements the contract from cluster#45 (Specs/mcp-request-id):
+// traceparent header: 00-<trace-id:32-hex>-<span-id:16-hex>-<flags:2-hex>
+// Bound per-request via AsyncLocalStorage. Inbound HTTP header is read by
+// the express middleware in index.ts; outbound JSON-RPC to stdio children
+// reads currentTraceparentForUpstream() and injects it under
+// params._meta.traceparent (process-managed-transport.send).
+
+import { AsyncLocalStorage } from "node:async_hooks";
+import { randomBytes } from "node:crypto";
+
+// Name this process reports for itself in trace contexts, log records
+// and metric labels.
+export const HOP_NAME = "metamcp";
+
+// Trace fields bound to one request through traceStore.
+export interface TraceCtx {
+ // 32 lowercase hex characters: taken from the inbound traceparent, or
+ // freshly generated when the header is absent or malformed.
+ trace_id: string;
+ // 16 lowercase hex characters generated for this hop.
+ span_id: string;
+ // Span id field of the inbound traceparent; "" when there was none or
+ // it was malformed.
+ parent_span_id: string;
+ hop: string;
+ tool_name: string;
+}
+
+// Accepts only version "00" with lowercase hex fields. Captures the
+// trace id (group 1) and span id (group 2); the two-digit flags field
+// is matched but not captured.
+const TRACEPARENT_RE = /^00-([0-9a-f]{32})-([0-9a-f]{16})-[0-9a-f]{2}$/;
+
+// Per-request storage for the trace context; getStore() returns
+// undefined outside a traceStore.run() scope.
+export const traceStore = new AsyncLocalStorage<TraceCtx>();
+
+/** Generate a random trace id: 16 random bytes as 32 lowercase hex characters. */
+export function newTraceId(): string {
+  const bytes = randomBytes(16);
+  return bytes.toString("hex");
+}
+
+/** Generate a random span id: 8 random bytes as 16 lowercase hex characters. */
+export function newSpanId(): string {
+  const bytes = randomBytes(8);
+  return bytes.toString("hex");
+}
+
+/** Result of parsing an inbound traceparent header value. */
+export type ParsedTraceparent = {
+  trace_id: string;
+  parent_span_id: string;
+  malformed: boolean;
+};
+
+// W3C Trace Context declares an all-zero trace-id or parent-id invalid.
+const ALL_ZERO_RE = /^0+$/;
+
+/**
+ * Parse a traceparent header value.
+ *
+ * @param raw Header value, or undefined when the header is absent.
+ * @returns
+ *   - non-string input: empty ids with `malformed: false`; the caller
+ *     decides how to synthesize a context;
+ *   - a value that fails the format check, or whose trace id or span id
+ *     is all zeros: a freshly generated trace id, an empty parent span
+ *     id and `malformed: true`;
+ *   - otherwise the parsed ids with `malformed: false`.
+ */
+export function parseTraceparent(raw: string | undefined): ParsedTraceparent {
+  if (typeof raw !== "string") {
+    return { trace_id: "", parent_span_id: "", malformed: false };
+  }
+  const m = TRACEPARENT_RE.exec(raw.trim());
+  // Propagating an all-zero id would hand every later hop an identifier
+  // the spec tells it to reject, so treat it like any other bad value.
+  if (!m || ALL_ZERO_RE.test(m[1]) || ALL_ZERO_RE.test(m[2])) {
+    return { trace_id: newTraceId(), parent_span_id: "", malformed: true };
+  }
+  return { trace_id: m[1], parent_span_id: m[2], malformed: false };
+}
+
+/** Trace context for one request plus the event the caller should report. */
+export type BindResult = {
+  ctx: TraceCtx;
+  // One of: "" (normal), "traceparent_synthesized", "traceparent_malformed".
+  // Caller emits the reserved event log so the dashboard can count it.
+  event: "" | "traceparent_synthesized" | "traceparent_malformed";
+};
+
+/**
+ * Build the trace context for an inbound request.
+ *
+ * @param raw Inbound traceparent header value; null or undefined means
+ *   the header was absent, so a new trace id is generated and the event
+ *   is "traceparent_synthesized".
+ * @param tool_name Stored on the context as given; defaults to "".
+ * @returns The context, always with a fresh span id for this hop, and
+ *   the event: "traceparent_malformed" when parsing flagged the value,
+ *   "" when it parsed cleanly.
+ */
+export function bindFromHeader(
+  raw: string | undefined,
+  tool_name: string = "",
+): BindResult {
+  const finish = (
+    trace_id: string,
+    parent_span_id: string,
+    event: BindResult["event"],
+  ): BindResult => ({
+    ctx: { trace_id, span_id: newSpanId(), parent_span_id, hop: HOP_NAME, tool_name },
+    event,
+  });
+
+  if (raw == null) {
+    return finish(newTraceId(), "", "traceparent_synthesized");
+  }
+  const parsed = parseTraceparent(raw);
+  return finish(
+    parsed.trace_id,
+    parsed.parent_span_id,
+    parsed.malformed ? "traceparent_malformed" : "",
+  );
+}
+
+// Produce the traceparent string to attach to an outbound call. The
+// bound context's span id goes in the parent-id position so the next hop
+// records this hop as its parent. When no context is bound, or the bound
+// one has an empty trace id, a fresh trace id and span id are generated
+// instead. The flags field is always "01".
+export function currentTraceparentForUpstream(): string {
+  const bound = traceStore.getStore();
+  if (bound?.trace_id) {
+    return `00-${bound.trace_id}-${bound.span_id}-01`;
+  }
+  return `00-${newTraceId()}-${newSpanId()}-01`;
+}
+
+// Write one JSON log line carrying the trace fields of the currently
+// bound context. With no bound context the trace fields are empty
+// strings. Keys in `extra` are spread last, so they override the
+// built-in fields of the same name. "error" goes to console.error,
+// "warn" to console.warn, everything else to console.log.
+export function traceLog(
+  level: "info" | "warn" | "error",
+  msg: string,
+  extra: Record<string, unknown> = {},
+): void {
+  const bound = traceStore.getStore();
+  const line = JSON.stringify({
+    ts: Date.now() / 1000,
+    level,
+    msg,
+    hop: HOP_NAME,
+    trace_id: bound?.trace_id ?? "",
+    span_id: bound?.span_id ?? "",
+    parent_span_id: bound?.parent_span_id ?? "",
+    tool_name: bound?.tool_name ?? "",
+    ...extra,
+  });
+  switch (level) {
+    case "error":
+      console.error(line);
+      break;
+    case "warn":
+      console.warn(line);
+      break;
+    default:
+      console.log(line);
+  }
+}
diff -urN metamcp-2.4.22.orig/apps/backend/src/lib/stdio-transport/process-managed-transport.ts metamcp-2.4.22/apps/backend/src/lib/stdio-transport/process-managed-transport.ts
--- a/apps/backend/src/lib/stdio-transport/process-managed-transport.ts 2026-05-23 23:45:36.315112630 +0300
+++ b/apps/backend/src/lib/stdio-transport/process-managed-transport.ts 2026-05-23 23:51:05.898047455 +0300
@@ -6,6 +6,7 @@
import { JSONRPCMessage } from "@modelcontextprotocol/sdk/types.js";
import spawn from "cross-spawn";
+import { currentTraceparentForUpstream } from "../observability/trace";
import { ReadBuffer, serializeMessage } from "./shared";
export type StdioServerParameters = {
@@ -279,7 +280,32 @@
throw new Error("Not connected");
}
- const json = serializeMessage(message);
+ // cluster#49 — propagate W3C trace context to the stdio child via
+ // params._meta.traceparent (the MCP spec's standard escape hatch
+ // for transport-agnostic metadata). Only stamp on messages with
+ // params (notifications/requests); JSON-RPC responses ride back
+ // up so don't need it. Idempotent: existing _meta is preserved.
+ const tracedMessage = (() => {
+ const m = message as JSONRPCMessage & {
+ params?: Record<string, unknown>;
+ };
+ if (!m || typeof m !== "object" || !("params" in m)) return message;
+ if (m.params == null || typeof m.params !== "object") return message;
+ const meta = (m.params as Record<string, unknown>)._meta;
+ const traceparent = currentTraceparentForUpstream();
+ return {
+ ...m,
+ params: {
+ ...m.params,
+ _meta: {
+ ...(typeof meta === "object" && meta !== null ? meta : {}),
+ traceparent,
+ },
+ },
+ } as JSONRPCMessage;
+ })();
+
+ const json = serializeMessage(tracedMessage);
if (this._process.stdin.write(json)) {
resolve();
} else {