feat(pi-ext): per-request timeout + stall-kill for mempalace-mcp
A wedged mempalace-mcp (classically an OrbStack virtiofs cold-open of a large chroma.sqlite3 / HNSW load) left the awaiting JSON-RPC promise pending forever, freezing the pi TUI uninterruptibly: ESC cancels the LLM stream, not a pending tool execute(). The JSON-RPC client now arms a per-request timer. On expiry it rejects the request AND kills the stalled child (SIGTERM->SIGKILL), so pi gets an error instead of hanging; the extension flips available=false so later calls fail fast (restart pi to retry). Per-REQUEST, not per-process: the long-lived server only dies on a genuine stall. Knobs: MEMPALACE_MCP_TIMEOUT_MS (default 60000), MEMPALACE_MCP_INIT_TIMEOUT_MS (default 120000), 0 = disable. This supersedes the planned standalone stdio-watchdog shim: the extension already owns request/response correlation, so a separate framing-reparsing shim is unnecessary.
This commit is contained in:
@@ -52,6 +52,23 @@ to `"pi"`. First diary write against that identity creates `wing_<name>`
|
||||
in the palace. Set the env var if you want to run pi under a distinct
|
||||
identity on a given machine (e.g. `pi-laptop` vs `pi-server`).
|
||||
|
||||
## Stall protection (per-request timeout)
|
||||
|
||||
Every JSON-RPC request to `mempalace-mcp` carries a timeout. Without it, a
|
||||
wedged server (classically: an OrbStack/virtiofs cold-open of a large
|
||||
`chroma.sqlite3` or an HNSW load) leaves the awaiting promise pending
|
||||
*forever*, which freezes the pi TUI — ESC cancels the LLM stream, not a
|
||||
pending tool `execute()`. On timeout the extension rejects the request
|
||||
**and** kills the stalled child (SIGTERM→SIGKILL), so pi gets a clear
|
||||
error instead of hanging and later calls fail fast (`available` flips off;
|
||||
restart pi to retry). This is a per-REQUEST timeout, not a process-lifetime
|
||||
one — the long-lived server is only killed when a request genuinely stalls.
|
||||
|
||||
- `MEMPALACE_MCP_TIMEOUT_MS` — tool-call/request timeout. Default `60000`.
|
||||
- `MEMPALACE_MCP_INIT_TIMEOUT_MS` — `initialize` + `tools/list` handshake
|
||||
timeout (cold-open is expected to be slower here). Default `120000`.
|
||||
- Set either to `0` to disable (legacy unbounded behavior).
|
||||
|
||||
## Debugging
|
||||
|
||||
- `MEMPALACE_EXT_DEBUG=1` — surface `mempalace-mcp` stderr into pi's
|
||||
|
||||
+114
-15
@@ -21,6 +21,18 @@
|
||||
* Fail-soft: if the MCP subprocess can't start, pi keeps working without
|
||||
* palace tools (warning on stderr only).
|
||||
*
|
||||
* Stall protection: every JSON-RPC request carries a timeout. If
|
||||
* `mempalace-mcp` wedges (e.g. OrbStack virtiofs cold-open of a large
|
||||
* chroma.sqlite3 / HNSW load), the awaiting promise would otherwise hang
|
||||
* forever and freeze the pi TUI (ESC cancels the LLM stream, not a pending
|
||||
* tool execution). On timeout we reject the request AND kill the wedged
|
||||
* child, so pi gets an error instead of hanging and later calls fail fast.
|
||||
* This is a per-REQUEST timeout, not a process-lifetime one — the
|
||||
* long-lived server is only killed when a request genuinely stalls.
|
||||
* - MEMPALACE_MCP_TIMEOUT_MS tool-call/request timeout (default 60000)
|
||||
* - MEMPALACE_MCP_INIT_TIMEOUT_MS initialize+tools/list timeout (default 120000)
|
||||
* Set either to 0 to disable (legacy unbounded behavior).
|
||||
*
|
||||
* Debug: set MEMPALACE_EXT_DEBUG=1 to surface mempalace-mcp stderr.
|
||||
*/
|
||||
|
||||
@@ -35,21 +47,39 @@ interface McpTool {
|
||||
inputSchema?: unknown;
|
||||
}
|
||||
|
||||
interface Pending {
|
||||
resolve: (v: any) => void;
|
||||
reject: (e: Error) => void;
|
||||
timer: ReturnType<typeof setTimeout> | null;
|
||||
}
|
||||
|
||||
const num = (envVal: string | undefined, fallback: number): number => {
|
||||
const n = envVal !== undefined ? Number(envVal) : Number.NaN;
|
||||
return Number.isFinite(n) && n >= 0 ? n : fallback;
|
||||
};
|
||||
|
||||
class McpClient {
|
||||
private proc: ChildProcessWithoutNullStreams | null = null;
|
||||
private nextId = 1;
|
||||
private pending = new Map<number, { resolve: (v: any) => void; reject: (e: Error) => void }>();
|
||||
private pending = new Map<number, Pending>();
|
||||
private stdoutBuf = "";
|
||||
private ready: Promise<void> | null = null;
|
||||
public tools: McpTool[] = [];
|
||||
|
||||
// Per-request timeouts (ms). 0 = disabled (unbounded, legacy behavior).
|
||||
private requestTimeoutMs = num(process.env.MEMPALACE_MCP_TIMEOUT_MS, 60_000);
|
||||
private initTimeoutMs = num(process.env.MEMPALACE_MCP_INIT_TIMEOUT_MS, 120_000);
|
||||
// Fired when the child process dies (exit or stall-kill). Lets the
|
||||
// extension flip `available` so later tool calls fail fast.
|
||||
public onExit: (() => void) | null = null;
|
||||
|
||||
async start(command: string, args: string[] = []): Promise<void> {
|
||||
if (this.ready) return this.ready;
|
||||
this.ready = (async () => {
|
||||
this.proc = spawn(command, args, { stdio: ["pipe", "pipe", "pipe"] });
|
||||
this.proc.on("error", (err) => this.failAll(err));
|
||||
this.proc.on("error", (err) => this.handleDeath(err));
|
||||
this.proc.on("exit", (code) =>
|
||||
this.failAll(new Error(`mempalace-mcp exited (code=${code})`)),
|
||||
this.handleDeath(new Error(`mempalace-mcp exited (code=${code})`)),
|
||||
);
|
||||
this.proc.stdout.setEncoding("utf8");
|
||||
this.proc.stdout.on("data", (chunk: string) => this.onStdout(chunk));
|
||||
@@ -63,15 +93,20 @@ class McpClient {
|
||||
this.proc.stderr.resume(); // drain without logging
|
||||
}
|
||||
|
||||
// MCP initialize handshake.
|
||||
await this.request("initialize", {
|
||||
// MCP initialize handshake. Cold-open over virtiofs can be slow, so
|
||||
// use the (longer) init timeout here rather than the per-call one.
|
||||
await this.request(
|
||||
"initialize",
|
||||
{
|
||||
protocolVersion: "2024-11-05",
|
||||
capabilities: {},
|
||||
clientInfo: { name: "pi-mempalace-ext", version: "0.1.0" },
|
||||
});
|
||||
},
|
||||
this.initTimeoutMs,
|
||||
);
|
||||
this.notify("notifications/initialized", {});
|
||||
|
||||
const listed = await this.request("tools/list", {});
|
||||
const listed = await this.request("tools/list", {}, this.initTimeoutMs);
|
||||
this.tools = (listed?.tools as McpTool[]) ?? [];
|
||||
})();
|
||||
return this.ready;
|
||||
@@ -91,18 +126,37 @@ class McpClient {
|
||||
continue;
|
||||
}
|
||||
if (typeof msg.id === "number" && this.pending.has(msg.id)) {
|
||||
const { resolve, reject } = this.pending.get(msg.id)!;
|
||||
this.pending.delete(msg.id);
|
||||
if (msg.error) reject(new Error(msg.error.message ?? "MCP error"));
|
||||
else resolve(msg.result);
|
||||
const p = this.pending.get(msg.id)!;
|
||||
this.settle(msg.id);
|
||||
if (msg.error) p.reject(new Error(msg.error.message ?? "MCP error"));
|
||||
else p.resolve(msg.result);
|
||||
}
|
||||
// notifications (no id) ignored for now.
|
||||
}
|
||||
}
|
||||
|
||||
/** Remove a pending request and clear its timeout timer. */
|
||||
private settle(id: number) {
|
||||
const p = this.pending.get(id);
|
||||
if (p?.timer) clearTimeout(p.timer);
|
||||
this.pending.delete(id);
|
||||
}
|
||||
|
||||
private failAll(err: Error) {
|
||||
for (const { reject } of this.pending.values()) reject(err);
|
||||
this.pending.clear();
|
||||
for (const id of [...this.pending.keys()]) {
|
||||
const p = this.pending.get(id)!;
|
||||
this.settle(id);
|
||||
p.reject(err);
|
||||
}
|
||||
}
|
||||
|
||||
/** Child died (exit, spawn error, or stall-kill): reject everything. */
|
||||
private handleDeath(err: Error) {
|
||||
this.proc = null;
|
||||
this.failAll(err);
|
||||
try {
|
||||
this.onExit?.();
|
||||
} catch {}
|
||||
}
|
||||
|
||||
private write(obj: unknown) {
|
||||
@@ -114,11 +168,36 @@ class McpClient {
|
||||
this.write({ jsonrpc: "2.0", method, params });
|
||||
}
|
||||
|
||||
request(method: string, params: unknown): Promise<any> {
|
||||
request(method: string, params: unknown, timeoutMs = this.requestTimeoutMs): Promise<any> {
|
||||
const id = this.nextId++;
|
||||
return new Promise((resolve, reject) => {
|
||||
this.pending.set(id, { resolve, reject });
|
||||
let timer: ReturnType<typeof setTimeout> | null = null;
|
||||
if (timeoutMs > 0) {
|
||||
timer = setTimeout(() => {
|
||||
if (!this.pending.has(id)) return;
|
||||
this.settle(id);
|
||||
reject(
|
||||
new Error(
|
||||
`mempalace-mcp request '${method}' timed out after ${timeoutMs}ms ` +
|
||||
`(server wedged — likely cold storage open). Terminating the ` +
|
||||
`stalled server; restart pi to retry palace tools.`,
|
||||
),
|
||||
);
|
||||
// Kill the wedged child so subsequent calls fail fast instead of
|
||||
// stacking up behind a dead server. The 'exit' handler
|
||||
// (handleDeath) rejects any other pending requests.
|
||||
this.kill();
|
||||
}, timeoutMs);
|
||||
// Don't let a pending MCP timer keep the event loop alive.
|
||||
if (typeof timer.unref === "function") timer.unref();
|
||||
}
|
||||
this.pending.set(id, { resolve, reject, timer });
|
||||
try {
|
||||
this.write({ jsonrpc: "2.0", id, method, params });
|
||||
} catch (err) {
|
||||
this.settle(id);
|
||||
reject(err as Error);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -126,6 +205,21 @@ class McpClient {
|
||||
return this.request("tools/call", { name, arguments: args });
|
||||
}
|
||||
|
||||
/** SIGTERM then SIGKILL grace, for stall recovery. */
|
||||
private kill() {
|
||||
const proc = this.proc;
|
||||
if (!proc) return;
|
||||
try {
|
||||
proc.kill("SIGTERM");
|
||||
} catch {}
|
||||
const t = setTimeout(() => {
|
||||
try {
|
||||
proc.kill("SIGKILL");
|
||||
} catch {}
|
||||
}, 2_000);
|
||||
if (typeof t.unref === "function") t.unref();
|
||||
}
|
||||
|
||||
stop() {
|
||||
if (this.proc) {
|
||||
try {
|
||||
@@ -146,6 +240,11 @@ export default async function mempalaceExtension(pi: ExtensionAPI) {
|
||||
let wokeUp = false;
|
||||
|
||||
try {
|
||||
client.onExit = () => {
|
||||
// Child died (stall-kill or crash): later tool calls return a clear
|
||||
// error instead of trying to talk to a dead/respawn-less server.
|
||||
available = false;
|
||||
};
|
||||
await client.start("mempalace-mcp");
|
||||
available = true;
|
||||
} catch (err) {
|
||||
|
||||
Reference in New Issue
Block a user