From e12b624cf7a9e9c7997195c450e69228aea33bea Mon Sep 17 00:00:00 2001 From: pi Date: Fri, 26 Jun 2026 00:22:21 +0200 Subject: [PATCH] feat(pi-ext): self-healing respawn + scoped init timeout for mempalace-mcp A stall-kill (or any crash) of mempalace-mcp was a permanent latch: available flipped off and stayed off until pi restart. Now the next tool call transparently respawns the server and retries. - ensureAlive(): bounded respawn with capped exponential backoff (MEMPALACE_MCP_MAX_RESPAWNS, default 2; MEMPALACE_MCP_RESPAWN_BACKOFF_MS, default 1000). Respawn budget resets on any successful JSON-RPC response, so a recovered server regains full patience while a persistently-broken one hits the cap and stays down (no hot-loop). - Init timeout default raised 120000 -> 300000 (scoped to init only): a genuine virtiofs cold-open shouldn't be killed mid-progress only to respawn and re-pay the same cost. Per-call timeout stays 60000. - Concurrency hardening: generation counter so a late exit from a killed old process can't tear down a fresh respawn; explicit healthy flag replaces racy proc!=null liveness check. - README: document self-heal, new env vars, and why generous-init + bounded-respawn compose rather than overlap. --- extensions/pi/README.md | 35 ++++++++- extensions/pi/mempalace.ts | 146 +++++++++++++++++++++++++++++++++---- 2 files changed, 163 insertions(+), 18 deletions(-) diff --git a/extensions/pi/README.md b/extensions/pi/README.md index e1dbf76..e2462e5 100644 --- a/extensions/pi/README.md +++ b/extensions/pi/README.md @@ -60,15 +60,44 @@ wedged server (classically: an OrbStack/virtiofs cold-open of a large *forever*, which freezes the pi TUI — ESC cancels the LLM stream, not a pending tool `execute()`. On timeout the extension rejects the request **and** kills the stalled child (SIGTERM→SIGKILL), so pi gets a clear -error instead of hanging and later calls fail fast (`available` flips off; -restart pi to retry). This is a per-REQUEST timeout, not a process-lifetime +error instead of hanging. This is a per-REQUEST timeout, not a process-lifetime one — the long-lived server is only killed when a request genuinely stalls. - `MEMPALACE_MCP_TIMEOUT_MS` — tool-call/request timeout. Default `60000`. + Kept short on purpose: a *query* taking this long is genuinely wedged. - `MEMPALACE_MCP_INIT_TIMEOUT_MS` — `initialize` + `tools/list` handshake - timeout (cold-open is expected to be slower here). Default `120000`. + timeout. Default `300000`. Deliberately generous: a genuine first + cold-open over virtiofs can legitimately take minutes, and killing a + still-progressing init only to respawn and re-pay the same cold cost is + strictly worse than waiting. - Set either to `0` to disable (legacy unbounded behavior). +### Self-heal (respawn instead of a permanent latch) + +A stall-kill (or any crash) used to be a **permanent** latch: `available` +flipped off and stayed off until you restarted pi. It is now self-healing — +the next tool call transparently respawns `mempalace-mcp` and retries. + +- Respawns use **capped exponential backoff** so a persistently-broken + server can't hot-loop: `MEMPALACE_MCP_MAX_RESPAWNS` attempts (default + `2`; set `0` to disable self-heal and keep the old fail-fast latch), + with `MEMPALACE_MCP_RESPAWN_BACKOFF_MS` (default `1000`) doubled per + attempt. +- The budget **resets on any successful JSON-RPC response** — proof the + server is actually live — so a server that recovers regains full + patience, while one that keeps dying hits the cap and stays down (then + restart pi). +- Why the long init timeout and bounded respawn compose rather than + overlap: once a server has opened the palace once, the OS page cache is + warm, so respawn cold-opens are fast. The long init timeout prevents + killing a healthy *first* cold-open; the respawn handles a genuinely + dead server cheaply afterwards. (Note the HNSW deserialize is CPU work + that isn't page-cacheable across spawns, which is exactly why we can't + rely on respawn-warming alone and keep the generous init budget.) +- The initial startup is tolerant too: if the very first `start()` fails, + the extension runs the same bounded respawn before falling back to + fail-soft (pi keeps working without palace tools). + ## Debugging - `MEMPALACE_EXT_DEBUG=1` — surface `mempalace-mcp` stderr into pi's diff --git a/extensions/pi/mempalace.ts b/extensions/pi/mempalace.ts index e032464..40d7f62 100644 --- a/extensions/pi/mempalace.ts +++ b/extensions/pi/mempalace.ts @@ -30,9 +30,24 @@ * This is a per-REQUEST timeout, not a process-lifetime one — the * long-lived server is only killed when a request genuinely stalls. * - MEMPALACE_MCP_TIMEOUT_MS tool-call/request timeout (default 60000) - * - MEMPALACE_MCP_INIT_TIMEOUT_MS initialize+tools/list timeout (default 120000) + * - MEMPALACE_MCP_INIT_TIMEOUT_MS initialize+tools/list timeout (default 300000) * Set either to 0 to disable (legacy unbounded behavior). * + * Self-heal (respawn): a stall-kill (or any crash) is no longer a permanent + * latch. The next tool call transparently respawns `mempalace-mcp` and + * retries, with capped exponential backoff so a persistently-broken server + * can't hot-loop. The respawn budget resets on ANY successful JSON-RPC + * response (proof the server is actually live), so a recovered server + * regains full patience. The two timeouts above are deliberately split: + * the long INIT timeout lets a genuine first cold-open finish without being + * killed (the original incident), while the short per-call timeout still + * aggressively kills a stuck query. After a server has opened the palace + * once, the OS page cache is warm, so respawn cold-opens are fast — which + * is exactly why a generous INIT timeout + bounded respawn compose well + * instead of overlapping. + * - MEMPALACE_MCP_MAX_RESPAWNS respawn attempts before giving up (default 2; 0 disables self-heal) + * - MEMPALACE_MCP_RESPAWN_BACKOFF_MS base backoff, doubled per attempt (default 1000) + * * Debug: set MEMPALACE_EXT_DEBUG=1 to surface mempalace-mcp stderr. */ @@ -58,6 +73,12 @@ const num = (envVal: string | undefined, fallback: number): number => { return Number.isFinite(n) && n >= 0 ? n : fallback; }; +const sleep = (ms: number): Promise => + new Promise((res) => { + const t = setTimeout(res, ms); + if (typeof t.unref === "function") t.unref(); + }); + class McpClient { private proc: ChildProcessWithoutNullStreams | null = null; private nextId = 1; @@ -68,18 +89,42 @@ class McpClient { // Per-request timeouts (ms). 0 = disabled (unbounded, legacy behavior). private requestTimeoutMs = num(process.env.MEMPALACE_MCP_TIMEOUT_MS, 60_000); - private initTimeoutMs = num(process.env.MEMPALACE_MCP_INIT_TIMEOUT_MS, 120_000); + private initTimeoutMs = num(process.env.MEMPALACE_MCP_INIT_TIMEOUT_MS, 300_000); + // Self-heal (respawn) controls. + private maxRespawns = num(process.env.MEMPALACE_MCP_MAX_RESPAWNS, 2); + private respawnBackoffMs = num(process.env.MEMPALACE_MCP_RESPAWN_BACKOFF_MS, 1_000); + private respawns = 0; // consecutive respawn attempts; reset on any success + private reviving: Promise | null = null; + // Liveness: true only between a completed init and the matching death. + // Inferring from `proc` is racy (non-null during the SIGTERM→exit window). + private healthy = false; + // Spawn generation. Each (re)spawn bumps it; death handlers carry the gen + // they were attached for and no-op if a newer server has since taken over + // (prevents a stale OLD-proc 'exit' from clobbering a freshly respawned one). + private gen = 0; + // Spawn args remembered so a respawn can reuse them. + private command = "mempalace-mcp"; + private args: string[] = []; // Fired when the child process dies (exit or stall-kill). Lets the // extension flip `available` so later tool calls fail fast. public onExit: (() => void) | null = null; + /** True only when a server has completed init and not since died. */ + get alive(): boolean { + return this.healthy; + } + async start(command: string, args: string[] = []): Promise { if (this.ready) return this.ready; + this.command = command; + this.args = args; this.ready = (async () => { - this.proc = spawn(command, args, { stdio: ["pipe", "pipe", "pipe"] }); - this.proc.on("error", (err) => this.handleDeath(err)); - this.proc.on("exit", (code) => - this.handleDeath(new Error(`mempalace-mcp exited (code=${code})`)), + const myGen = ++this.gen; + const child = spawn(command, args, { stdio: ["pipe", "pipe", "pipe"] }); + this.proc = child; + child.on("error", (err) => this.handleDeath(myGen, err)); + child.on("exit", (code) => + this.handleDeath(myGen, new Error(`mempalace-mcp exited (code=${code})`)), ); this.proc.stdout.setEncoding("utf8"); this.proc.stdout.on("data", (chunk: string) => this.onStdout(chunk)); @@ -108,6 +153,9 @@ class McpClient { const listed = await this.request("tools/list", {}, this.initTimeoutMs); this.tools = (listed?.tools as McpTool[]) ?? []; + // Init complete and the process is still ours — mark live so ensureAlive() + // reports true. Guard against a death that raced in during init. + if (myGen === this.gen) this.healthy = true; })(); return this.ready; } @@ -129,7 +177,12 @@ class McpClient { const p = this.pending.get(msg.id)!; this.settle(msg.id); if (msg.error) p.reject(new Error(msg.error.message ?? "MCP error")); - else p.resolve(msg.result); + else { + // A successful response proves the server is live — restore the + // full respawn budget for any future (unrelated) stall. + this.respawns = 0; + p.resolve(msg.result); + } } // notifications (no id) ignored for now. } @@ -150,15 +203,60 @@ class McpClient { } } - /** Child died (exit, spawn error, or stall-kill): reject everything. */ - private handleDeath(err: Error) { + /** + * Child died (exit, spawn error, or stall-kill): reject everything. + * Ignored if a newer generation has already taken over — a late 'exit' + * from a killed old process must not tear down a fresh respawn. + */ + private handleDeath(gen: number, err: Error) { + if (gen !== this.gen) return; // stale handler from a superseded process this.proc = null; + this.healthy = false; + // Clear the memoized start() promise so a subsequent start()/ensureAlive() + // spawns a fresh server instead of returning the dead one's promise. + this.ready = null; this.failAll(err); try { this.onExit?.(); } catch {} } + /** + * Ensure a live, initialized server — respawning a dead one with capped + * exponential backoff. Returns whether the server is alive afterwards. + * Concurrent callers share a single in-flight revive. The attempt counter + * is reset by `onStdout` on any successful response, so a server that comes + * back and works regains its full budget; a server that keeps dying hits + * `maxRespawns` and stays down (restart pi) rather than hot-looping. + */ + async ensureAlive(): Promise { + if (this.alive) return true; + if (this.reviving) return this.reviving; + this.reviving = (async () => { + while (!this.alive && this.respawns < this.maxRespawns) { + this.respawns++; + await sleep(this.respawnBackoffMs * 2 ** (this.respawns - 1)); + // Force a fresh spawn: drop any settled (rejected/dead) start() + // promise so start() doesn't short-circuit on a stale memo. Safe + // because ensureAlive is serialized (single `reviving`) and only + // runs while not healthy — no useful in-flight start() can exist. + this.ready = null; + try { + await this.start(this.command, this.args); + } catch { + // start() rejected (e.g. respawn cold-open also stalled and was + // killed). Loop until the budget is exhausted. + } + } + return this.alive; + })(); + try { + return await this.reviving; + } finally { + this.reviving = null; + } + } + private write(obj: unknown) { if (!this.proc) throw new Error("MCP process not started"); this.proc.stdin.write(`${JSON.stringify(obj)}\n`); @@ -180,7 +278,7 @@ class McpClient { new Error( `mempalace-mcp request '${method}' timed out after ${timeoutMs}ms ` + `(server wedged — likely cold storage open). Terminating the ` + - `stalled server; restart pi to retry palace tools.`, + `stalled server; it will be respawned on the next call.`, ), ); // Kill the wedged child so subsequent calls fail fast instead of @@ -241,17 +339,26 @@ export default async function mempalaceExtension(pi: ExtensionAPI) { try { client.onExit = () => { - // Child died (stall-kill or crash): later tool calls return a clear - // error instead of trying to talk to a dead/respawn-less server. + // Child died (stall-kill or crash): mark unavailable. The next tool + // call will attempt a bounded respawn via client.ensureAlive(). available = false; }; await client.start("mempalace-mcp"); available = true; } catch (err) { + // First cold-open stalled/crashed. Give the bounded self-heal a chance + // before giving up entirely — a respawn often succeeds against a now-warm + // page cache. Only fail-soft if even that is exhausted. process.stderr.write( - `[mempalace ext] failed to start mempalace-mcp: ${(err as Error).message}\n`, + `[mempalace ext] mempalace-mcp start failed: ${(err as Error).message} — attempting respawn\n`, ); - return; // fail-soft: pi keeps working without palace tools + available = await client.ensureAlive(); + if (!available) { + process.stderr.write( + "[mempalace ext] mempalace-mcp unavailable after retries; continuing without palace tools\n", + ); + return; // fail-soft: pi keeps working without palace tools + } } // Register MCP tools as pi tools. Pass the MCP `inputSchema` through as @@ -270,9 +377,18 @@ export default async function mempalaceExtension(pi: ExtensionAPI) { description: tool.description ?? `MemPalace tool: ${tool.name}`, parameters: schema, async execute(_toolCallId, params) { + if (!available) { + // Stall-kill/crash is not a permanent latch: try a bounded respawn. + available = await client.ensureAlive(); + } if (!available) { return { - content: [{ type: "text", text: "mempalace-mcp not available" }], + content: [ + { + type: "text", + text: "mempalace-mcp not available (respawn budget exhausted) — restart pi to retry palace tools", + }, + ], details: {}, isError: true, };