#!/usr/bin/env bash
# mempalace-docs — mine a project into MemPalace with docs-only filtering
#
# Works around the fact that upstream `mempalace mine` has a hardcoded
# READABLE_EXTENSIONS list that includes .py / .ts / .js / .go / .rs etc,
# which pollutes the palace with low-signal code-fragment drawers.
#
# Strategy: stage a copy of only docs/config/script files into a per-wing
# cache directory (${XDG_CACHE_HOME:-$HOME/.cache}/mempalace-docs/<wing>),
# then run `mempalace mine` against that staging dir.
#
# Wing name precedence:
#   1. --wing <name>
#   2. `wing:` value in <dir>/mempalace.yaml
#   3. source directory basename with hyphens replaced by underscores
#
# Once MemPalace PR #1213 (exclude_patterns in mempalace.yaml) lands, this
# wrapper becomes a thin shim over `mempalace mine` with a default
# exclude_patterns injected.
#
# Usage:
#   mempalace-docs <dir>
#   mempalace-docs <dir> --wing <name>
#   mempalace-docs <dir> --agent <name>
#   mempalace-docs <dir> --dry-run
#   mempalace-docs <dir> --no-repair
#   mempalace-docs --help
#
# Exit codes:
#   0  success
#   1  usage / argument error
#   2  source directory missing
#   3  mempalace CLI not installed
#   4  mine failed
#
# Dependencies: bash, standard POSIX utilities (find, cp, awk, ...),
#               python3 (with the sqlite3 module), mempalace (v3.3.3+)

set -euo pipefail

# ── Defaults ─────────────────────────────────────────────────────────
# These five are overwritten by the argument parser, so they stay mutable.
AGENT="${USER:-mempalace}"   # agent name recorded on drawers
WING=""                      # empty → derived later (see precedence above)
SRC=""                       # source directory (required positional arg)
DRY_RUN=0                    # 1 → list files only
NO_REPAIR=0                  # 1 → skip `mempalace repair` after mining

# File patterns to include. Docs + config + intent-bearing scripts.
# Everything else (code) is excluded by omission.
readonly INCLUDE_GLOBS=(
  # docs
  '*.md' '*.mdx' '*.rst' '*.txt'
  # config
  '*.yml' '*.yaml' '*.toml'
  '*.json'   # includes package.json, pyproject companions; lockfiles filtered below
  '*.conf' '*.cfg' '*.ini'
  # scripts / build recipes
  '*.sh' '*.bash' '*.zsh' '*.fish'
  'Dockerfile*' 'Makefile*' 'Containerfile*'
  # legal / attribution
  'LICENSE*' 'COPYING*' 'NOTICE*' 'AUTHORS*' 'CONTRIBUTORS*'
)

# Path segments to always skip (in addition to .gitignore).
# '.DS_Store' is a file rather than a directory; it is listed here because
# the skip test is applied to every path component, files included.
readonly SKIP_DIRS=(
  '.git' '.venv' 'venv' '__pycache__' 'node_modules'
  '.mypy_cache' '.pytest_cache' '.ruff_cache' '.tox' '.nox'
  'dist' 'build' '.next' '.nuxt' 'target' 'coverage'
  '.DS_Store'
)

# Filename patterns to skip even if caught by an include glob.
SKIP_FILES=(
  'package-lock.json' 'yarn.lock' 'pnpm-lock.yaml' 'poetry.lock'
  'Cargo.lock' 'Gemfile.lock' 'composer.lock'
  '.gitignore' '.dockerignore'
)

# ── Usage ────────────────────────────────────────────────────────────
# Print the help text on stdout; error paths redirect it to stderr.
usage() {
  cat <<'EOF'
mempalace-docs — mine a project into MemPalace, docs/config/scripts only

Usage:
  mempalace-docs <dir> [options]

Options:
  --wing <name>    Override wing name (default: source directory name)
  --agent <name>   Agent name recorded on drawers (default: $USER)
  --dry-run        List files that would be mined; do not file
  --no-repair      Skip `mempalace repair` after mining
  -h, --help       Show this help

What gets mined:
  Docs:     *.md *.mdx *.rst *.txt
  Config:   *.yml *.yaml *.toml *.json *.conf *.cfg *.ini
  Scripts:  *.sh *.bash *.zsh *.fish Dockerfile* Makefile*
  Legal:    LICENSE* COPYING* NOTICE* AUTHORS*

What gets skipped (by design):
  Source code:   .py .ts .tsx .js .jsx .go .rs .java .cpp .c .rb .kt .swift
  Caches / deps: .git .venv venv node_modules __pycache__ .mypy_cache
                 .pytest_cache .ruff_cache dist build .next target coverage
  Lockfiles:     package-lock.json yarn.lock poetry.lock Cargo.lock ...

Rationale:
  The palace is for context and intent. Agents read code directly via
  grep/glob/Read — mining it creates a parallel, lossier, drift-prone
  copy that pollutes semantic search. This wrapper is a bridge until
  MemPalace PR #1213 (exclude_patterns) lands upstream.
EOF
}

# ── Parse args ───────────────────────────────────────────────────────
while [[ $# -gt 0 ]]; do
  case "$1" in
    -h|--help)
      usage
      exit 0
      ;;
    --wing|--agent)
      # Without this check a trailing `--wing` makes `shift 2` fail and the
      # script dies under `set -e` with no message at all.
      if [[ $# -lt 2 ]]; then
        echo "error: $1 requires a value" >&2
        exit 1
      fi
      if [[ "$1" == --wing ]]; then WING="$2"; else AGENT="$2"; fi
      shift 2
      ;;
    --dry-run)
      DRY_RUN=1
      shift
      ;;
    --no-repair)
      NO_REPAIR=1
      shift
      ;;
    --)
      shift
      break
      ;;
    -*)
      echo "error: unknown option: $1" >&2
      usage >&2
      exit 1
      ;;
    *)
      if [[ -n "$SRC" ]]; then
        echo "error: unexpected arg: $1" >&2
        exit 1
      fi
      SRC="$1"
      shift
      ;;
  esac
done

# Whatever follows `--` is positional, even if it starts with a dash.
for arg in "$@"; do
  if [[ -n "$SRC" ]]; then
    echo "error: unexpected arg: $arg" >&2
    exit 1
  fi
  SRC="$arg"
done

if [[ -z "$SRC" ]]; then
  usage >&2
  exit 1
fi
if [[ ! -d "$SRC" ]]; then
  echo "error: not a directory: $SRC" >&2
  exit 2
fi
if ! command -v mempalace >/dev/null 2>&1; then
  echo "error: mempalace CLI not found in PATH" >&2
  exit 3
fi

# Absolutize. CDPATH is cleared because a CDPATH hit makes `cd` print the
# resolved directory, which would end up inside the captured value.
SRC="$(CDPATH='' cd -- "$SRC" && pwd)" || {
  echo "error: cannot enter directory: $SRC" >&2
  exit 2
}

# Determine wing name with the following precedence:
#   1. explicit --wing flag (user override)
#   2. `wing:` value in $SRC/mempalace.yaml (respect existing project config)
#   3. sanitized source directory basename (hyphens → underscores, matching
#      mempalace's convention for implicit wing names)
if [[ -z "$WING" && -f "$SRC/mempalace.yaml" ]]; then
  # Best effort: an unreadable or malformed file just falls through to the
  # basename rule. \047 is the portable octal spelling of a single quote.
  WING="$(awk -F': *' '/^wing:/ { gsub(/["\047 \t\r]/, "", $2); print $2; exit }' \
    "$SRC/mempalace.yaml" 2>/dev/null || true)"
fi
if [[ -z "$WING" ]]; then
  WING="${SRC##*/}"
  WING="${WING//-/_}"
fi

# The wing becomes a path component that is later removed with `rm -rf`,
# so it must not be empty or able to escape the cache root.
case "$WING" in
  '' | . | .. | */* | *$'\n'*)
    echo "error: invalid wing name: '$WING'" >&2
    exit 1
    ;;
esac

# ── Discover files ───────────────────────────────────────────────────
# find runs from inside $SRC so the skip list is only tested against path
# components below the project root (a project living in e.g. ~/build/foo
# must not be excluded wholesale), and skipped directories are pruned
# rather than traversed and filtered afterwards.
find_cmd=(find . '(')
joiner=''
for d in "${SKIP_DIRS[@]}"; do
  # shellcheck disable=SC2206 — ${joiner:+-o} must vanish when empty
  find_cmd+=(${joiner:+-o} -name "$d")
  joiner=1
done
find_cmd+=(')' -prune -o -type f '(')
joiner=''
for g in "${INCLUDE_GLOBS[@]}"; do
  # shellcheck disable=SC2206 — ${joiner:+-o} must vanish when empty
  find_cmd+=(${joiner:+-o} -name "$g")
  joiner=1
done
find_cmd+=(')' -print0)

# Collect matches (NUL-delimited, safe for any filename) as paths relative
# to $SRC, dropping anything whose basename is listed in SKIP_FILES.
filtered=()
while IFS= read -r -d '' f; do
  f="${f#./}"
  base="${f##*/}"
  skip=0
  for sf in "${SKIP_FILES[@]}"; do
    if [[ "$base" == "$sf" ]]; then
      skip=1
      break
    fi
  done
  if (( skip == 0 )); then
    filtered+=("$f")
  fi
done < <(cd -- "$SRC" && "${find_cmd[@]}")

count="${#filtered[@]}"
if [[ $count -eq 0 ]]; then
  echo "no matching files found in $SRC" >&2
  exit 0
fi

if [[ $DRY_RUN -eq 1 ]]; then
  echo "Would mine $count files into wing '$WING':"
  printf '  %s\n' "${filtered[@]}"
  exit 0
fi

# ── Build staging directory ──────────────────────────────────────────
# Use a deterministic, per-wing cache path so re-runs produce the same
# source_file paths the miner saw last time. This is critical: mempalace
# dedup keys on source_file + source_mtime, so a mktemp path would cause
# every run to re-file the entire wing.
# The stage is only created past this point, so --dry-run touches nothing.
CACHE_ROOT="${XDG_CACHE_HOME:-$HOME/.cache}/mempalace-docs"
STAGE="$CACHE_ROOT/$WING"

# Only clean up the per-wing stage on exit — leave $CACHE_ROOT itself
# alone in case other wings are staging concurrently.
cleanup() { rm -rf -- "${STAGE:?}"; }
trap cleanup EXIT
# A signal handler that merely cleans up would let the script carry on
# with the next command; exit instead and let the EXIT trap do the cleanup.
trap 'exit 130' INT
trap 'exit 143' TERM

mkdir -p -- "$CACHE_ROOT"
rm -rf -- "${STAGE:?}"
mkdir -p -- "$STAGE"

# Copy into staging, preserving mtime (critical for mempalace dedup —
# the miner compares stored mtime against the staged copy's mtime).
for rel in "${filtered[@]}"; do
  dest="$STAGE/$rel"
  mkdir -p -- "${dest%/*}"
  cp -p -- "$SRC/$rel" "$dest"
done

# Purge any drawers in this wing that came from the original source
# directory. The miner records source_file = absolute path from the
# staging dir; this differs from a prior `mempalace mine <dir>` run,
# so without this purge the wing would accumulate duplicates every time
# we switch between upstream `mempalace mine` and this wrapper.
# We only purge source_file paths matching $SRC/*, leaving other wings
# and other sources alone.
python3 - "$WING" "$SRC" <<'PY'
import os
import sqlite3
import sys

wing, src = sys.argv[1], sys.argv[2].rstrip("/")
db_path = os.path.expanduser("~/.mempalace/palace/chroma.sqlite3")
if not os.path.exists(db_path):
    sys.exit(0)

# Stay well below SQLite's bound-parameter limit (999 in older builds).
BATCH = 500

db = sqlite3.connect(db_path)
try:
    cur = db.cursor()
    prefix = src + "/"
    # Find embedding ids in the target wing whose source_file is $SRC itself
    # or lies under $SRC/. An exact prefix comparison is used instead of LIKE
    # so that '_' and '%' in the path are not treated as wildcards.
    q = """
        SELECT DISTINCT w.id
        FROM embedding_metadata w
        JOIN embedding_metadata s
          ON w.id = s.id AND s.key = 'source_file'
        WHERE w.key = 'wing' AND w.string_value = ?
          AND (s.string_value = ?
               OR substr(s.string_value, 1, ?) = ?)
    """
    ids = [r[0] for r in cur.execute(q, (wing, src, len(prefix), prefix))]
    if ids:
        for tbl in ("embedding_metadata", "embeddings"):
            try:
                for start in range(0, len(ids), BATCH):
                    chunk = ids[start:start + BATCH]
                    ph = ",".join("?" * len(chunk))
                    cur.execute(f"DELETE FROM {tbl} WHERE id IN ({ph})", chunk)
            except sqlite3.OperationalError as exc:
                # Best effort (e.g. table absent in this schema version),
                # but say so instead of failing silently.
                print(f"  warning: could not purge from {tbl}: {exc}",
                      file=sys.stderr)
        db.commit()
        print(f"  purged {len(ids)} pre-existing drawers for {src} from wing '{wing}'")
finally:
    db.close()
PY

# Write mempalace.yaml into staging dir so the miner uses the right wing.
# NOTE(review): the body of this heredoc and the `mempalace mine` invocation
# below were unreadable in the reviewed copy of this script. They are
# reconstructed from context (the wing variable, the --agent option, exit
# code 4 for a failed mine) — confirm against the canonical source.
cat > "$STAGE/mempalace.yaml" <<EOF
wing: $WING
EOF

# ── Mine ─────────────────────────────────────────────────────────────
echo "Mining $count files into wing '$WING'..."
if ! mempalace mine "$STAGE" --agent "$AGENT"; then
  echo "error: mempalace mine failed" >&2
  exit 4
fi

# ── Repair index ─────────────────────────────────────────────────────
if [[ $NO_REPAIR -eq 0 ]]; then
  echo ""
  echo "Rebuilding HNSW index..."
  mempalace repair --yes
fi

echo ""
echo "Done. Wing '$WING' is ready. Remember to reconnect any live MCP sessions."