diff --git a/extensions/pi/README.md b/extensions/pi/README.md index 076d372..e1dbf76 100644 --- a/extensions/pi/README.md +++ b/extensions/pi/README.md @@ -52,6 +52,23 @@ to `"pi"`. First diary write against that identity creates `wing_` in the palace. Set the env var if you want to run pi under a distinct identity on a given machine (e.g. `pi-laptop` vs `pi-server`). +## Stall protection (per-request timeout) + +Every JSON-RPC request to `mempalace-mcp` carries a timeout. Without it, a +wedged server (classically: an OrbStack/virtiofs cold-open of a large +`chroma.sqlite3` or an HNSW load) leaves the awaiting promise pending +*forever*, which freezes the pi TUI — ESC cancels the LLM stream, not a +pending tool `execute()`. On timeout the extension rejects the request +**and** kills the stalled child (SIGTERM→SIGKILL), so pi gets a clear +error instead of hanging and later calls fail fast (`available` flips off; +restart pi to retry). This is a per-REQUEST timeout, not a process-lifetime +one — the long-lived server is only killed when a request genuinely stalls. + +- `MEMPALACE_MCP_TIMEOUT_MS` — tool-call/request timeout. Default `60000`. +- `MEMPALACE_MCP_INIT_TIMEOUT_MS` — `initialize` + `tools/list` handshake + timeout (cold-open is expected to be slower here). Default `120000`. +- Set either to `0` to disable (legacy unbounded behavior). + ## Debugging - `MEMPALACE_EXT_DEBUG=1` — surface `mempalace-mcp` stderr into pi's diff --git a/extensions/pi/mempalace.ts b/extensions/pi/mempalace.ts index 410021d..e032464 100644 --- a/extensions/pi/mempalace.ts +++ b/extensions/pi/mempalace.ts @@ -21,6 +21,18 @@ * Fail-soft: if the MCP subprocess can't start, pi keeps working without * palace tools (warning on stderr only). * + * Stall protection: every JSON-RPC request carries a timeout. If + * `mempalace-mcp` wedges (e.g. OrbStack virtiofs cold-open of a large + * chroma.sqlite3 / HNSW load), the awaiting promise would otherwise hang + * forever and freeze the pi TUI (ESC cancels the LLM stream, not a pending + * tool execution). On timeout we reject the request AND kill the wedged + * child, so pi gets an error instead of hanging and later calls fail fast. + * This is a per-REQUEST timeout, not a process-lifetime one — the + * long-lived server is only killed when a request genuinely stalls. + * - MEMPALACE_MCP_TIMEOUT_MS tool-call/request timeout (default 60000) + * - MEMPALACE_MCP_INIT_TIMEOUT_MS initialize+tools/list timeout (default 120000) + * Set either to 0 to disable (legacy unbounded behavior). + * * Debug: set MEMPALACE_EXT_DEBUG=1 to surface mempalace-mcp stderr. */ @@ -35,21 +47,39 @@ interface McpTool { inputSchema?: unknown; } +interface Pending { + resolve: (v: any) => void; + reject: (e: Error) => void; + timer: ReturnType | null; +} + +const num = (envVal: string | undefined, fallback: number): number => { + const n = envVal !== undefined ? Number(envVal) : Number.NaN; + return Number.isFinite(n) && n >= 0 ? n : fallback; +}; + class McpClient { private proc: ChildProcessWithoutNullStreams | null = null; private nextId = 1; - private pending = new Map void; reject: (e: Error) => void }>(); + private pending = new Map(); private stdoutBuf = ""; private ready: Promise | null = null; public tools: McpTool[] = []; + // Per-request timeouts (ms). 0 = disabled (unbounded, legacy behavior). + private requestTimeoutMs = num(process.env.MEMPALACE_MCP_TIMEOUT_MS, 60_000); + private initTimeoutMs = num(process.env.MEMPALACE_MCP_INIT_TIMEOUT_MS, 120_000); + // Fired when the child process dies (exit or stall-kill). Lets the + // extension flip `available` so later tool calls fail fast. + public onExit: (() => void) | null = null; + async start(command: string, args: string[] = []): Promise { if (this.ready) return this.ready; this.ready = (async () => { this.proc = spawn(command, args, { stdio: ["pipe", "pipe", "pipe"] }); - this.proc.on("error", (err) => this.failAll(err)); + this.proc.on("error", (err) => this.handleDeath(err)); this.proc.on("exit", (code) => - this.failAll(new Error(`mempalace-mcp exited (code=${code})`)), + this.handleDeath(new Error(`mempalace-mcp exited (code=${code})`)), ); this.proc.stdout.setEncoding("utf8"); this.proc.stdout.on("data", (chunk: string) => this.onStdout(chunk)); @@ -63,15 +93,20 @@ class McpClient { this.proc.stderr.resume(); // drain without logging } - // MCP initialize handshake. - await this.request("initialize", { - protocolVersion: "2024-11-05", - capabilities: {}, - clientInfo: { name: "pi-mempalace-ext", version: "0.1.0" }, - }); + // MCP initialize handshake. Cold-open over virtiofs can be slow, so + // use the (longer) init timeout here rather than the per-call one. + await this.request( + "initialize", + { + protocolVersion: "2024-11-05", + capabilities: {}, + clientInfo: { name: "pi-mempalace-ext", version: "0.1.0" }, + }, + this.initTimeoutMs, + ); this.notify("notifications/initialized", {}); - const listed = await this.request("tools/list", {}); + const listed = await this.request("tools/list", {}, this.initTimeoutMs); this.tools = (listed?.tools as McpTool[]) ?? []; })(); return this.ready; @@ -91,18 +126,37 @@ class McpClient { continue; } if (typeof msg.id === "number" && this.pending.has(msg.id)) { - const { resolve, reject } = this.pending.get(msg.id)!; - this.pending.delete(msg.id); - if (msg.error) reject(new Error(msg.error.message ?? "MCP error")); - else resolve(msg.result); + const p = this.pending.get(msg.id)!; + this.settle(msg.id); + if (msg.error) p.reject(new Error(msg.error.message ?? "MCP error")); + else p.resolve(msg.result); } // notifications (no id) ignored for now. } } + /** Remove a pending request and clear its timeout timer. */ + private settle(id: number) { + const p = this.pending.get(id); + if (p?.timer) clearTimeout(p.timer); + this.pending.delete(id); + } + private failAll(err: Error) { - for (const { reject } of this.pending.values()) reject(err); - this.pending.clear(); + for (const id of [...this.pending.keys()]) { + const p = this.pending.get(id)!; + this.settle(id); + p.reject(err); + } + } + + /** Child died (exit, spawn error, or stall-kill): reject everything. */ + private handleDeath(err: Error) { + this.proc = null; + this.failAll(err); + try { + this.onExit?.(); + } catch {} } private write(obj: unknown) { @@ -114,11 +168,36 @@ class McpClient { this.write({ jsonrpc: "2.0", method, params }); } - request(method: string, params: unknown): Promise { + request(method: string, params: unknown, timeoutMs = this.requestTimeoutMs): Promise { const id = this.nextId++; return new Promise((resolve, reject) => { - this.pending.set(id, { resolve, reject }); - this.write({ jsonrpc: "2.0", id, method, params }); + let timer: ReturnType | null = null; + if (timeoutMs > 0) { + timer = setTimeout(() => { + if (!this.pending.has(id)) return; + this.settle(id); + reject( + new Error( + `mempalace-mcp request '${method}' timed out after ${timeoutMs}ms ` + + `(server wedged — likely cold storage open). Terminating the ` + + `stalled server; restart pi to retry palace tools.`, + ), + ); + // Kill the wedged child so subsequent calls fail fast instead of + // stacking up behind a dead server. The 'exit' handler + // (handleDeath) rejects any other pending requests. + this.kill(); + }, timeoutMs); + // Don't let a pending MCP timer keep the event loop alive. + if (typeof timer.unref === "function") timer.unref(); + } + this.pending.set(id, { resolve, reject, timer }); + try { + this.write({ jsonrpc: "2.0", id, method, params }); + } catch (err) { + this.settle(id); + reject(err as Error); + } }); } @@ -126,6 +205,21 @@ class McpClient { return this.request("tools/call", { name, arguments: args }); } + /** SIGTERM then SIGKILL grace, for stall recovery. */ + private kill() { + const proc = this.proc; + if (!proc) return; + try { + proc.kill("SIGTERM"); + } catch {} + const t = setTimeout(() => { + try { + proc.kill("SIGKILL"); + } catch {} + }, 2_000); + if (typeof t.unref === "function") t.unref(); + } + stop() { if (this.proc) { try { @@ -146,6 +240,11 @@ export default async function mempalaceExtension(pi: ExtensionAPI) { let wokeUp = false; try { + client.onExit = () => { + // Child died (stall-kill or crash): later tool calls return a clear + // error instead of trying to talk to a dead/respawn-less server. + available = false; + }; await client.start("mempalace-mcp"); available = true; } catch (err) {