mempalace-toolkit/bin/mempalace-session

#!/usr/bin/env bash
# mempalace-session — mine opencode session history into MemPalace
#
# Opencode persists every session (verbatim user/assistant turns + tool calls)
# in a local SQLite DB at ~/.local/share/opencode/opencode.db. There is
# currently no opencode session-stopping hook upstream, so the diary-based
# auto-save is best-effort; this wrapper closes the gap by mining the SQLite
# directly.
#
# Strategy:
#   1. Read opencode.db and export each qualifying session to a Claude Code
#      JSONL file (format the mempalace normalizer already understands).
#   2. Stage exports under ~/.cache/mempalace-session/<wing>/.
#   3. Run `mempalace mine --mode convos` against the staging dir.
#
# Dedup: mempalace convos mode keys on source_file (absolute staging path).
# The staging path is deterministic (per-wing under XDG_CACHE_HOME) so re-runs
# are idempotent as long as session content hasn't changed.
#
# Session filter: sessions with fewer than --min-messages messages (default 3)
# are skipped to avoid filing throwaway /exit'd sessions.
#
# Usage:
#   mempalace-session
#   mempalace-session --wing <name>
#   mempalace-session --session <id>
#   mempalace-session --since 2026-04-01
#   mempalace-session --min-messages 6
#   mempalace-session --dry-run
#   mempalace-session --help
#
# Exit codes:
#   0  success
#   1  usage / argument error
#   2  opencode.db missing or unreadable
#   3  mempalace CLI not installed
#   4  mine failed
#
# Dependencies: bash, awk, python3 (stdlib sqlite3), mempalace (v3.3.3+)

set -euo pipefail

# ── Defaults ─────────────────────────────────────────────────────────
AGENT="${USER:-mempalace}"
WING="wing_conversations"
SESSION_ID=""
SINCE=""
MIN_MESSAGES=3
DRY_RUN=0
DO_REPAIR=0
OPENCODE_DB="${OPENCODE_DB:-$HOME/.local/share/opencode/opencode.db}"

# ── Usage ────────────────────────────────────────────────────────────
# usage — print the full help text (options, idempotency notes, transcript
# shape, dedup behaviour, rationale) to stdout.
# The heredoc delimiter is quoted, so `$USER`, `$OPENCODE_DB` and the
# backticks in the text are printed literally rather than expanded.
# Callers redirect the output to stderr themselves when reporting an
# argument error.
usage() {
  cat <<'EOF'
mempalace-session — mine opencode session history into MemPalace

Usage:
  mempalace-session [options]

Options:
  --wing <name>          Target wing (default: wing_conversations)
  --session <id>         Export one session only (default: all qualifying)
  --since <YYYY-MM-DD>   Only sessions with time_updated on/after this date
  --min-messages <N>     Skip sessions with fewer than N messages (default: 3)
  --agent <name>         Agent name recorded on drawers (default: $USER)
  --db <path>            Path to opencode.db (default: $OPENCODE_DB or
                         ~/.local/share/opencode/opencode.db)
  --dry-run              Export + list; do not mine into palace. Each session
                         is tagged [NEW] or [SKIP] based on whether its
                         source_file is already present in the palace.
  --repair               Run `mempalace repair` after mining (opt-in).
                         WARNING: repair does a destructive in-place HNSW
                         rebuild. If it races a live MCP connection or
                         crashes mid-rebuild, it can wipe the collection.
                         Only pass this from a quiet, interactive context.
                         Not safe for unattended cron/launchd schedules.
  --no-repair            (Deprecated; no-repair is now the default.)
  -h, --help             Show this help

Idempotency:
  Re-running on the same corpus is safe. The export step always writes every
  qualifying session to the cache, but the mine step dedups on source_file
  path — already-filed sessions are skipped without re-embedding. A --dry-run
  summary shows exactly how many of the exported files are new vs already
  filed, so you can see in advance what a real run would do.

What gets mined:
  - Each qualifying session → one Claude Code JSONL file
  - Staged under ~/.cache/mempalace-session/<wing>/
  - Filed via `mempalace mine --mode convos`

Transcript shape per session:
  - Synthetic header as first user turn:
      [session: <title> | <directory> | <YYYY-MM-DD> | source: opencode]
  - User/assistant messages extracted from message.data + part.data
  - Tool calls → Claude Code `tool_use` blocks
  - Tool outputs → `tool_result` blocks (folded into the assistant turn by the
    mempalace normalizer)
  - `step-start` / `step-finish` parts are dropped (noise)
  - `reasoning` parts prefixed with `[reasoning]` and kept as text

Dedup:
  - source_file = absolute staging path (deterministic per session ID)
  - Re-runs skip unchanged sessions. To force re-mining, delete the staging
    dir: rm -rf ~/.cache/mempalace-session/<wing>/

Rationale:
  Opencode lacks a session-stopping hook (upstream PRs #16598, #16769 still
  open). Until that lands + mempalace hooks_cli.py gains an opencode harness,
  this wrapper is how we get automatic session capture.
EOF
}

# ── Parse args ───────────────────────────────────────────────────────
# parse_args <argv...>
# Fills the option globals (WING, SESSION_ID, SINCE, MIN_MESSAGES, AGENT,
# OPENCODE_DB, DRY_RUN, DO_REPAIR) from the command line.
# Exits 0 after printing help. Exits 1 on an unknown option, a stray
# positional argument, or a value-taking option that is the last word on the
# command line. Anything after a literal `--` is ignored.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -h|--help) usage; exit 0 ;;
      --wing|--session|--since|--min-messages|--agent|--db)
        # Without this guard `shift 2` fails when the flag has no value and
        # `set -e` terminates the script without printing anything.
        if [[ $# -lt 2 ]]; then
          echo "error: $1 requires a value" >&2
          exit 1
        fi
        case "$1" in
          --wing) WING="$2" ;;
          --session) SESSION_ID="$2" ;;
          --since) SINCE="$2" ;;
          --min-messages) MIN_MESSAGES="$2" ;;
          --agent) AGENT="$2" ;;
          --db) OPENCODE_DB="$2" ;;
        esac
        shift 2
        ;;
      --dry-run) DRY_RUN=1; shift ;;
      --repair) DO_REPAIR=1; shift ;;
      --no-repair) shift ;;  # deprecated alias; no-repair is the default
      --) shift; break ;;
      -*) echo "error: unknown option: $1" >&2; usage >&2; exit 1 ;;
      *) echo "error: unexpected arg: $1" >&2; exit 1 ;;
    esac
  done
}

parse_args "$@"

# ── Preflight ────────────────────────────────────────────────────────
# Validate the environment and the parsed options before any work is done.
# Exit codes follow the table in the file header:
#   2 — opencode.db missing or unreadable
#   3 — mempalace CLI not on PATH
#   1 — bad option value
if [[ ! -f "$OPENCODE_DB" ]]; then
  echo "error: opencode.db not found at $OPENCODE_DB" >&2
  echo "       override with --db <path> or OPENCODE_DB env var" >&2
  exit 2
fi
# Existence alone is not enough: an unreadable file would otherwise surface
# later as a Python traceback with the wrong exit status.
if [[ ! -r "$OPENCODE_DB" ]]; then
  echo "error: opencode.db at $OPENCODE_DB is not readable" >&2
  exit 2
fi
if ! command -v mempalace >/dev/null 2>&1; then
  echo "error: mempalace CLI not found in PATH" >&2
  exit 3
fi
if ! [[ "$MIN_MESSAGES" =~ ^[0-9]+$ ]]; then
  echo "error: --min-messages must be an integer" >&2
  exit 1
fi
# The wing name becomes the last component of the staging directory. An empty
# name, "." or ".." would make the staging dir the cache root (or its parent),
# and that whole directory would then be handed to `mempalace mine`.
if [[ -z "$WING" || "$WING" == "." || "$WING" == ".." ]]; then
  echo "error: --wing must be a non-empty name other than '.' or '..'" >&2
  exit 1
fi

# ── Staging dir ──────────────────────────────────────────────────────
# Deterministic per-wing path so source_file dedup works across re-runs.
# Honour XDG_CACHE_HOME when it is set and non-empty; otherwise fall back to
# ~/.cache. The per-wing subdirectory is created on demand.
if [[ -n "${XDG_CACHE_HOME:-}" ]]; then
  CACHE_ROOT="${XDG_CACHE_HOME}/mempalace-session"
else
  CACHE_ROOT="${HOME}/.cache/mempalace-session"
fi
STAGE="${CACHE_ROOT}/${WING}"
mkdir -p "$STAGE"

# ── Export sessions (Python heredoc) ────────────────────────────────
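# Writes one JSONL file per qualifying session into $STAGE.
# Prints `EXPORTED <count>` and `ALREADY_FILED <count>` on stdout (the latter
# is omitted when no session matched the filters); the per-session status
# lines and `SKIPPED_SHORT <count>` go to stderr.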
#
# If the palace is reachable, also classifies each export as NEW or ALREADY
# FILED (matching by source_file path) so --dry-run can report the true
# mine-set size, not just the export-set size. Classification is advisory
# only — the real mine step delegates dedup to `mempalace mine --mode convos`,
# which is the authoritative source of truth.
# export_sessions <db> <stage_dir> <session_id> <since> <min_messages>
#
# Runs the embedded Python exporter. For every session in <db> that passes the
# filters it writes <stage_dir>/<slug>_<id>.jsonl.
#   <session_id>    empty string → all sessions
#   <since>         empty string or YYYY-MM-DD (compared as UTC midnight)
#   <min_messages>  integer; sessions with fewer message rows are skipped
# Stdout: "EXPORTED <n>" and "ALREADY_FILED <n>" (only "EXPORTED 0" when no
#         session matched the filters).
# Stderr: one "[NEW ]"/"[SKIP]" line per exported session, "SKIPPED_SHORT <n>".
# Status: 0 on success, 1 for a malformed <since>, 2 when <db> cannot be
#         opened or queried.
export_sessions() {
  python3 - "$@" <<'PY'
import json
import os
import sqlite3
import sys
from contextlib import closing
from datetime import datetime, timezone
from pathlib import Path


def ro_connect(path):
    """Open a SQLite file read-only.

    The path is turned into a percent-encoded file: URI so that characters
    such as '?', '#' or '%' in it cannot be mistaken for URI syntax.
    """
    uri = Path(path).resolve().as_uri() + "?mode=ro"
    return sqlite3.connect(uri, uri=True)


def load_json_object(raw):
    """Decode a JSON column; return None unless it holds a JSON object."""
    try:
        obj = json.loads(raw)
    except (TypeError, ValueError):  # NULL column or malformed JSON
        return None
    return obj if isinstance(obj, dict) else None


def load_already_filed(stage):
    """Best-effort, read-only lookup of source_file values already filed.

    Reads chroma.sqlite3 under $MEMPALACE_PATH (default ~/.mempalace/palace).
    If the palace is absent or unreadable an empty (or partial) set is
    returned; the result is advisory only, the mine step does the real dedup.
    """
    filed = set()
    palace_path = os.environ.get(
        "MEMPALACE_PATH", os.path.expanduser("~/.mempalace/palace")
    )
    chroma_db = Path(palace_path) / "chroma.sqlite3"
    if not chroma_db.is_file():
        return filed
    try:
        with closing(ro_connect(chroma_db)) as pcon:
            # LIKE is only a coarse pre-filter ('_' and '%' in the staging
            # path act as wildcards); callers test exact membership.
            rows = pcon.execute(
                "SELECT DISTINCT string_value FROM embedding_metadata "
                "WHERE key='source_file' AND string_value LIKE ?",
                (f"{stage}%",),
            )
            filed.update(sf for (sf,) in rows if sf)
    except sqlite3.Error:
        pass  # palace unreachable → treat exports as new (miner will dedup)
    return filed


def collect_blocks(parts):
    """Convert one message's part rows into (content blocks, tool results)."""
    blocks = []
    tool_results = []
    for p in parts:
        pd = load_json_object(p["data"])
        if pd is None:
            continue
        kind = pd.get("type")
        if kind == "text":
            txt = (pd.get("text") or "").strip()
            if txt:
                blocks.append({"type": "text", "text": txt})
        elif kind == "tool":
            # opencode tool part → tool_use block + deferred tool_result
            state = pd.get("state")
            if not isinstance(state, dict):
                state = {}
            call_id = pd.get("callID") or p["id"]
            blocks.append({
                "type": "tool_use",
                "id": call_id,
                "name": pd.get("tool") or "Unknown",
                "input": state.get("input") or {},
            })
            tool_output = state.get("output")
            if tool_output:
                tool_results.append({
                    "type": "tool_result",
                    "tool_use_id": call_id,
                    "content": str(tool_output),
                })
        elif kind == "reasoning":
            rtext = (pd.get("text") or "").strip()
            if rtext:
                blocks.append({"type": "text", "text": f"[reasoning] {rtext}"})
        # step-start / step-finish and any other part type are dropped.
    return blocks, tool_results


def session_turns(cur, sess):
    """Build the list of JSONL records (header first) for one session."""
    sid = sess["id"]
    cur.execute(
        "SELECT * FROM message WHERE session_id=? ORDER BY time_created", (sid,)
    )
    messages = [dict(r) for r in cur.fetchall()]
    cur.execute(
        "SELECT * FROM part WHERE session_id=? ORDER BY time_created", (sid,)
    )
    parts_by_msg = {}
    for r in cur.fetchall():
        d = dict(r)
        parts_by_msg.setdefault(d["message_id"], []).append(d)

    # Synthetic header as first user turn — injects title/directory/date
    # into the transcript so semantic search can find sessions by topic,
    # not just by session-id filename.
    title = sess.get("title") or "(untitled)"
    directory = sess.get("directory") or "?"
    date_str = datetime.fromtimestamp(
        sess["time_created"] / 1000, tz=timezone.utc
    ).strftime("%Y-%m-%d")
    header = f"[session: {title} | {directory} | {date_str} | source: opencode]"
    turns = [{"type": "user", "message": {"content": header}}]

    for msg in messages:
        # A corrupt message row is skipped, same as a corrupt part row,
        # instead of aborting the whole export.
        mdata = load_json_object(msg["data"])
        if mdata is None:
            continue
        role = mdata.get("role")
        if role not in ("user", "assistant"):
            continue

        blocks, tool_results = collect_blocks(parts_by_msg.get(msg["id"], []))
        if not blocks:
            continue

        # Simplify single-text-block messages to a bare string (more tolerant
        # of normalizer edge cases; mempalace accepts either shape).
        if len(blocks) == 1 and blocks[0]["type"] == "text":
            content = blocks[0]["text"]
        else:
            content = blocks
        turns.append({"type": role, "message": {"content": content}})

        # For assistants, follow up with a synthetic human tool_result message
        # per tool call. The mempalace normalizer's `is_tool_only` branch
        # folds these back into the assistant turn (see normalize.py:211-214).
        if role == "assistant" and tool_results:
            turns.append({"type": "human", "message": {"content": tool_results}})
    return turns


def main():
    db_path, stage, session_filter, since, min_messages = sys.argv[1:6]
    min_messages = int(min_messages)
    stage = Path(stage)

    # Convert --since YYYY-MM-DD to epoch ms (opencode uses ms timestamps)
    since_ms = None
    if since:
        try:
            since_ms = int(
                datetime.strptime(since, "%Y-%m-%d")
                .replace(tzinfo=timezone.utc)
                .timestamp() * 1000
            )
        except ValueError:
            print(f"error: --since must be YYYY-MM-DD, got {since!r}",
                  file=sys.stderr)
            return 1

    already_filed = load_already_filed(stage)

    exported = 0
    skipped_short = 0
    skipped_already_filed = 0
    try:
        with closing(ro_connect(db_path)) as conn:
            conn.row_factory = sqlite3.Row
            cur = conn.cursor()

            q = "SELECT * FROM session WHERE 1=1"
            params = []
            if session_filter:
                q += " AND id = ?"
                params.append(session_filter)
            if since_ms is not None:
                q += " AND time_updated >= ?"
                params.append(since_ms)
            q += " ORDER BY time_updated"
            cur.execute(q, params)
            sessions = [dict(r) for r in cur.fetchall()]

            if not sessions:
                print("EXPORTED 0")
                return 0

            for sess in sessions:
                sid = sess["id"]
                cur.execute(
                    "SELECT COUNT(*) FROM message WHERE session_id=?", (sid,)
                )
                msg_count = cur.fetchone()[0]
                if msg_count < min_messages:
                    skipped_short += 1
                    continue

                turns = session_turns(cur, sess)
                # Must have at least 2 turns for the normalizer to accept it
                if len(turns) < 2:
                    skipped_short += 1
                    continue

                # The slug comes from the database; keep it a single path
                # component so the file always lands inside the staging dir.
                slug = (sess.get("slug") or "session").replace(os.sep, "_")
                out_path = stage / f"{slug}_{sid}.jsonl"
                with open(out_path, "w", encoding="utf-8") as f:
                    for obj in turns:
                        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

                # Set mtime to session time_updated so dedup sees a stable
                # value. Best-effort: a bad timestamp must not fail the export.
                try:
                    ts = sess["time_updated"] / 1000
                    os.utime(out_path, (ts, ts))
                except (OSError, TypeError, ValueError, OverflowError):
                    pass

                exported += 1
                is_filed = str(out_path) in already_filed
                if is_filed:
                    skipped_already_filed += 1
                status = "SKIP" if is_filed else "NEW "
                print(
                    f"  [{status}] {out_path.name}  "
                    f"({msg_count} msgs, {len(turns)} turns)",
                    file=sys.stderr,
                )
    except sqlite3.Error as exc:
        # Matches the documented exit code for a missing/unreadable database.
        print(f"error: cannot read opencode.db at {db_path}: {exc}",
              file=sys.stderr)
        return 2

    print(f"EXPORTED {exported}")
    print(f"ALREADY_FILED {skipped_already_filed}")
    if skipped_short:
        print(f"SKIPPED_SHORT {skipped_short}", file=sys.stderr)
    return 0


sys.exit(main())
PY
}

# A non-zero exporter status aborts the script with that status (set -e).
export_count=$(export_sessions "$OPENCODE_DB" "$STAGE" "$SESSION_ID" "$SINCE" "$MIN_MESSAGES")

# Pull the two counters out of the exporter's stdout. Either line may be
# missing, in which case the counter falls back to 0.
count="$(awk 'index($0, "EXPORTED ") == 1 { print $2 }' <<<"$export_count")"
: "${count:=0}"
already_filed="$(awk 'index($0, "ALREADY_FILED ") == 1 { print $2 }' <<<"$export_count")"
: "${already_filed:=0}"
to_file=$(( count - already_filed ))

# Nothing exported → nothing to report or mine.
if (( count == 0 )); then
  printf '%s\n' "no sessions qualified for export"
  exit 0
fi

cat <<EOF

Exported $count session(s) to $STAGE
  $to_file new   → will be filed on mine
  $already_filed already filed → will be skipped (dedup by source_file)
EOF

# In dry-run mode, describe what a real run would do and stop before mining.
if (( DRY_RUN == 1 )); then
  printf '\n'
  if (( to_file == 0 )); then
    printf '%s\n' "--dry-run: no new sessions to mine. A real run would skip all $count."
  else
    printf '%s\n' "--dry-run: skipping mine step. A real run would file $to_file new session(s)."
  fi
  exit 0
fi

# ── Run the mine ─────────────────────────────────────────────────────
# Hand the staging directory to mempalace; any failure maps to exit code 4.
printf '\n%s\n' "Mining into wing '$WING'..."
mempalace mine "$STAGE" --mode convos --wing "$WING" --agent "$AGENT" || {
  echo "error: mempalace mine failed" >&2
  exit 4
}

# ── Repair index ─────────────────────────────────────────────────────
# Opt-in only (--repair). The pause gives an interactive user a last chance
# to interrupt before the rebuild starts.
if (( DO_REPAIR == 1 )); then
  cat <<'EOF'

WARNING: --repair runs an in-place HNSW rebuild that has wiped
         live palaces on past runs. Proceeding in 3 seconds...
EOF
  sleep 3
  printf '%s\n' "Rebuilding HNSW index..."
  mempalace repair --yes
fi

printf '\n%s\n' "Done. Wing '$WING' updated. Remember to reconnect any live MCP sessions."