diff --git a/README.md b/README.md index c86ecee..190b0fe 100644 --- a/README.md +++ b/README.md @@ -276,6 +276,18 @@ mempalace-session --help **Dedup:** staging at `~/.cache/mempalace-session//` with deterministic per-session filenames (`_.jsonl`). The convos miner keys on `source_file`, so re-runs skip unchanged sessions. To force re-mining a session, delete its JSONL from the staging dir. +**`--dry-run` is dedup-aware.** Each session is tagged `[NEW]` (would be filed) or `[SKIP]` (already in the palace), and the summary breaks down the count: + +``` +Exported 62 session(s) to ~/.cache/mempalace-session/wing_conversations + 0 new → will be filed on mine + 62 already filed → will be skipped (dedup by source_file) + +--dry-run: no new sessions to mine. A real run would skip all 62. +``` + +If the palace is unreachable (fresh install, moved, permission-denied), the dry-run classification falls back to "everything is new". This affects only the report: the real mine step delegates dedup to `mempalace mine --mode convos`, which is always the source of truth. So running `mempalace-session` twice in a row is never destructive or wasteful: the second run's only cost is the post-mine HNSW `repair` step (~5 min on a ~5k-drawer palace), which `--no-repair` skips. + **Filter:** sessions with fewer than `--min-messages` messages (default 3) are skipped — drops throwaway `/exit`'d sessions that would otherwise flood the palace. On a reference 140-session corpus, 78 were filtered this way. **Cost profile:** ~20 minutes per 60-session batch. Scales roughly linearly with message count. Dedup re-run: mine step instant, only the post-mine `repair` runs (~5 min on 5k drawers). diff --git a/SKILL.md b/SKILL.md index c9686b6..356b57c 100644 --- a/SKILL.md +++ b/SKILL.md @@ -90,6 +90,16 @@ A docs-heavy repo should produce ~5–10 drawers per file. >15 drawers/file on a Second run immediately after first → 0 new drawers, only the post-mine `repair` step runs (~5 min on 5k drawers). 
+**`mempalace-session --dry-run` is dedup-aware.** Each session listed is tagged `[NEW]` (would be filed) or `[SKIP]` (already in the palace), and the summary reports the split: + +``` +Exported 62 session(s) to ~/.cache/... + 0 new → will be filed on mine + 62 already filed → will be skipped (dedup by source_file) +``` + +So when a user asks "will it mine the same sessions again?" — point them at `mempalace-session --dry-run` and read the summary lines. If the summary reports `0 new`, nothing will be re-filed. The classification check is best-effort (falls back to "everything is new" if the palace is unreachable); the real mine step delegates to `mempalace mine --mode convos`, which is always the authoritative dedup source. + ### Incremental catch-up ```bash diff --git a/bin/mempalace-session b/bin/mempalace-session index cb9ab6b..797a8b6 100755 --- a/bin/mempalace-session +++ b/bin/mempalace-session @@ -66,10 +66,19 @@ Options: --agent Agent name recorded on drawers (default: $USER) --db Path to opencode.db (default: $OPENCODE_DB or ~/.local/share/opencode/opencode.db) - --dry-run Export + list; do not mine into palace + --dry-run Export + list; do not mine into palace. Each session + is tagged [NEW] or [SKIP] based on whether its + source_file is already present in the palace. --no-repair Skip `mempalace repair` after mining -h, --help Show this help +Idempotency: + Re-running on the same corpus is safe. The export step always writes every + qualifying session to the cache, but the mine step dedups on source_file + path — already-filed sessions are skipped without re-embedding. A --dry-run + summary shows exactly how many of the exported files are new vs already + filed, so you can see in advance what a real run would do. 
+ What gets mined: - Each qualifying session → one Claude Code JSONL file - Staged under ~/.cache/mempalace-session// @@ -139,6 +148,12 @@ mkdir -p "$STAGE" # ── Export sessions (Python heredoc) ──────────────────────────────── # Writes one JSONL file per qualifying session into $STAGE. # Prints: EXPORTED on stdout, plus per-session lines. +# +# If the palace is reachable, also classifies each export as NEW or ALREADY +# FILED (matching by source_file path) so --dry-run can report the true +# mine-set size, not just the export-set size. Classification is advisory +# only — the real mine step delegates dedup to `mempalace mine --mode convos`, +# which is the authoritative source of truth. export_count=$(python3 - "$OPENCODE_DB" "$STAGE" "$SESSION_ID" "$SINCE" "$MIN_MESSAGES" <<'PY' import sqlite3, json, sys, os from datetime import datetime, timezone @@ -157,6 +172,28 @@ if since: print(f"error: --since must be YYYY-MM-DD, got {since!r}", file=sys.stderr) sys.exit(1) +# ── Load palace's already-filed source_files (best-effort, read-only) ── +# Key the dedup check on absolute staging path. The palace stores these in +# chroma.sqlite3 under embedding_metadata.key='source_file'. If the palace +# isn't reachable (first install, moved, permission-denied), we fall through +# to "everything is new" — the mine step will do the real dedup anyway. 
+already_filed = set() +palace_path = os.environ.get("MEMPALACE_PATH", os.path.expanduser("~/.mempalace/palace")) +chroma_db = Path(palace_path) / "chroma.sqlite3" +if chroma_db.is_file(): + try: + pcon = sqlite3.connect(f"file:{chroma_db}?mode=ro", uri=True) + for (sf,) in pcon.execute( + "SELECT DISTINCT string_value FROM embedding_metadata " + "WHERE key='source_file' AND string_value LIKE ?", + (f"{stage}%",), + ): + if sf: + already_filed.add(sf) + pcon.close() + except sqlite3.Error: + pass # palace unreachable → treat all exports as new (miner will dedup) + conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True) conn.row_factory = sqlite3.Row cur = conn.cursor() @@ -181,6 +218,7 @@ if not sessions: # Prefetch messages + parts for qualifying sessions exported = 0 skipped_short = 0 +skipped_already_filed = 0 for sess in sessions: sid = sess["id"] cur.execute("SELECT COUNT(*) FROM message WHERE session_id=?", (sid,)) @@ -303,19 +341,26 @@ for sess in sessions: pass exported += 1 - print(f" {out_path.name} ({msg_count} msgs, {len(out_lines)} turns)", + is_filed = str(out_path) in already_filed + if is_filed: + skipped_already_filed += 1 + status = "SKIP" if is_filed else "NEW " + print(f" [{status}] {out_path.name} ({msg_count} msgs, {len(out_lines)} turns)", file=sys.stderr) print(f"EXPORTED {exported}") +print(f"ALREADY_FILED {skipped_already_filed}") if skipped_short: print(f"SKIPPED_SHORT {skipped_short}", file=sys.stderr) PY ) -# Parse count from stdout -count="${export_count##*EXPORTED }" -count="${count%%[!0-9]*}" +# Parse counts from stdout +count="$(printf '%s\n' "$export_count" | awk '/^EXPORTED / { print $2 }')" count="${count:-0}" +already_filed="$(printf '%s\n' "$export_count" | awk '/^ALREADY_FILED / { print $2 }')" +already_filed="${already_filed:-0}" +to_file=$(( count - already_filed )) if [[ "$count" -eq 0 ]]; then echo "no sessions qualified for export" @@ -324,9 +369,17 @@ fi echo "" echo "Exported $count session(s) to $STAGE" +echo " 
$to_file new → will be filed on mine" +echo " $already_filed already filed → will be skipped (dedup by source_file)" if [[ $DRY_RUN -eq 1 ]]; then - echo "--dry-run: skipping mine step" + if [[ "$to_file" -eq 0 ]]; then + echo "" + echo "--dry-run: no new sessions to mine. A real run would skip all $count." + else + echo "" + echo "--dry-run: skipping mine step. A real run would file $to_file new session(s)." + fi exit 0 fi