#!/usr/bin/env bash
# mempalace-docs — mine a project into MemPalace with docs-only filtering
#
# Works around the fact that upstream `mempalace mine` has a hardcoded
# READABLE_EXTENSIONS list that includes .py / .ts / .js / .go / .rs etc,
# which pollutes the palace with low-signal code-fragment drawers.
#
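# Strategy: stage a copy of only docs/config/script files into a per-wing
# cache directory (${XDG_CACHE_HOME:-$HOME/.cache}/mempalace-docs/<wing>),
# then run `mempalace mine` against that staging dir. The wing is taken
# from --wing if given, else from `wing:` in <directory>/mempalace.yaml,
# else from the source directory name (hyphens become underscores).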
#
# Once MemPalace PR #1213 (exclude_patterns in mempalace.yaml) lands, this
# wrapper becomes a thin shim over `mempalace mine` with a default
# exclude_patterns injected.
#
# Usage:
#   mempalace-docs <directory>
#   mempalace-docs <directory> --wing <name>
#   mempalace-docs <directory> --agent <name>
#   mempalace-docs <directory> --dry-run
#   mempalace-docs <directory> --no-repair
#   mempalace-docs --help
#
# Exit codes:
#   0  success
#   1  usage / argument error
#   2  source directory missing
#   3  mempalace CLI not installed
#   4  mine failed
#
# Dependencies: bash, find, cp, awk, sed, python3 (stdlib sqlite3), mempalace (v3.3.3+)

# Strict mode: abort on errors, on unset variables, and on failures
# anywhere inside a pipeline.
set -o errexit -o nounset -o pipefail

# ── Defaults ─────────────────────────────────────────────────────────
# Flags (0 = off, 1 = on); flipped by the argument parser.
DRY_RUN=0
NO_REPAIR=0

# Positional source directory and optional wing override; both start empty.
SRC=""
WING=""

# Agent name passed to `mempalace mine`; falls back to a fixed name when
# $USER is unset or empty.
AGENT="${USER:-mempalace}"

# Name patterns that are eligible for mining. Anything that matches none
# of these (i.e. ordinary source code) is left out simply by omission.
INCLUDE_GLOBS=(
  # prose
  '*.md'
  '*.mdx'
  '*.rst'
  '*.txt'
  # structured config
  '*.yml'
  '*.yaml'
  '*.toml'
  '*.json'          # lockfiles that match are removed again via SKIP_FILES
  # shell scripts
  '*.sh'
  '*.bash'
  '*.zsh'
  '*.fish'
  # build / container recipes
  'Dockerfile*'
  'Makefile*'
  'Containerfile*'
  # ini-style config
  '*.conf'
  '*.cfg'
  '*.ini'
  # legal / attribution
  'LICENSE*'
  'COPYING*'
  'NOTICE*'
  'AUTHORS*'
  'CONTRIBUTORS*'
)

# Path segment names that are always excluded from the scan.
SKIP_DIRS=(
  '.git'
  '.venv'
  'venv'
  '__pycache__'
  'node_modules'
  '.mypy_cache'
  '.pytest_cache'
  '.ruff_cache'
  '.tox'
  '.nox'
  'dist'
  'build'
  '.next'
  '.nuxt'
  'target'
  'coverage'
  '.DS_Store'
)

# Exact file names that are dropped even when an include pattern matches.
SKIP_FILES=(
  'package-lock.json'
  'yarn.lock'
  'pnpm-lock.yaml'
  'poetry.lock'
  'Cargo.lock'
  'Gemfile.lock'
  'composer.lock'
  '.gitignore'
  '.dockerignore'
)

# ── Usage ────────────────────────────────────────────────────────────
# usage
#   Print the help text to stdout. Takes no arguments and never exits;
#   callers redirect to stderr and choose the exit status themselves.
#   The lists below are kept in step with INCLUDE_GLOBS / SKIP_DIRS /
#   SKIP_FILES and with the exit codes documented in the file header.
usage() {
  cat <<'EOF'
mempalace-docs — mine a project into MemPalace, docs/config/scripts only

Usage:
  mempalace-docs <directory> [options]

Options:
  --wing <name>    Override wing name (default: `wing:` from
                   <directory>/mempalace.yaml, else the directory name)
  --agent <name>   Agent name recorded on drawers (default: $USER)
  --dry-run        List files that would be mined; do not file
  --no-repair      Skip `mempalace repair` after mining
  -h, --help       Show this help

What gets mined:
  Docs:    *.md *.mdx *.rst *.txt
  Config:  *.yml *.yaml *.toml *.json *.conf *.cfg *.ini
  Scripts: *.sh *.bash *.zsh *.fish Dockerfile* Makefile* Containerfile*
  Legal:   LICENSE* COPYING* NOTICE* AUTHORS* CONTRIBUTORS*

What gets skipped (by design):
  Source code: .py .ts .tsx .js .jsx .go .rs .java .cpp .c .rb .kt .swift
  Caches / deps: .git .venv venv node_modules __pycache__ .mypy_cache
                 .pytest_cache .ruff_cache .tox .nox dist build .next
                 .nuxt target coverage
  Lockfiles:   package-lock.json yarn.lock poetry.lock Cargo.lock ...

Exit codes:
  0  success
  1  usage / argument error
  2  source directory missing
  3  mempalace CLI not installed
  4  mine failed

Rationale:
  The palace is for context and intent. Agents read code directly via
  grep/glob/Read — mining it creates a parallel, lossier, drift-prone
  copy that pollutes semantic search.

  This wrapper is a bridge until MemPalace PR #1213 (exclude_patterns)
  lands upstream.
EOF
}

# ── Parse args ───────────────────────────────────────────────────────
# Walk the command line. Options may appear before or after the single
# positional <directory>. `--` ends option parsing; whatever follows it
# is treated as positional (so a directory whose name starts with `-`
# can still be passed).
while [[ $# -gt 0 ]]; do
  case "$1" in
    -h|--help)
      usage
      exit 0
      ;;
    --wing|--agent)
      # Without this check a trailing `--wing` would make `shift 2` fail
      # and, under `set -e`, kill the script without any message.
      if [[ $# -lt 2 ]]; then
        echo "error: $1 requires a value" >&2
        usage >&2
        exit 1
      fi
      if [[ "$1" == "--wing" ]]; then WING="$2"; else AGENT="$2"; fi
      shift 2
      ;;
    --dry-run)
      DRY_RUN=1
      shift
      ;;
    --no-repair)
      NO_REPAIR=1
      shift
      ;;
    --)
      shift
      break
      ;;
    -*)
      echo "error: unknown option: $1" >&2
      usage >&2
      exit 1
      ;;
    *)
      if [[ -n "$SRC" ]]; then
        echo "error: unexpected arg: $1" >&2
        exit 1
      fi
      SRC="$1"
      shift
      ;;
  esac
done

# Arguments left over after `--` are positional only.
while [[ $# -gt 0 ]]; do
  if [[ -n "$SRC" ]]; then
    echo "error: unexpected arg: $1" >&2
    exit 1
  fi
  SRC="$1"
  shift
done

# Validate inputs; exit codes follow the table in the file header.
if [[ -z "$SRC" ]]; then
  usage >&2
  exit 1
fi
if [[ ! -d "$SRC" ]]; then
  echo "error: not a directory: $SRC" >&2
  exit 2
fi
if ! command -v mempalace >/dev/null 2>&1; then
  echo "error: mempalace CLI not found in PATH" >&2
  exit 3
fi

# Canonicalise SRC to an absolute path. CDPATH is cleared so `cd` can
# neither jump to a same-named directory elsewhere nor echo the target
# into the captured output; `--` protects names starting with `-`.
SRC="$(CDPATH='' cd -- "$SRC" && pwd)"

# _wing_from_yaml <file>
#   Print the value of the first top-level `wing:` key in <file>, with a
#   trailing YAML comment, quotes, blanks and CRs removed. Prints nothing
#   (and still succeeds) when the key or the file is missing.
_wing_from_yaml() {
  awk '
    /^wing:/ {
      v = $0
      sub(/^wing:[ \t]*/, "", v)        # drop the key itself
      sub(/(^|[ \t]+)#.*$/, "", v)      # drop an inline comment
      gsub(/["\047 \t\r]/, "", v)       # drop quotes (\047 is the single quote), blanks, CR
      print v
      exit
    }
  ' "$1" 2>/dev/null || true
}

# Determine wing name with the following precedence:
#   1. explicit --wing flag (user override)
#   2. `wing:` value in $SRC/mempalace.yaml (respect existing project config)
#   3. source directory basename with hyphens turned into underscores
if [[ -z "$WING" && -f "$SRC/mempalace.yaml" ]]; then
  WING="$(_wing_from_yaml "$SRC/mempalace.yaml")"
fi
if [[ -z "$WING" ]]; then
  src_base="$(basename -- "$SRC")"
  WING="${src_base//-/_}"
fi

# ── Build staging directory ──────────────────────────────────────────
# Use a deterministic, per-wing cache path so re-runs produce the same
# source_file paths the miner saw last time. This is critical: mempalace
# dedup keys on source_file + source_mtime, so a mktemp path would cause
# every run to re-file the entire wing.
CACHE_ROOT="${XDG_CACHE_HOME:-$HOME/.cache}/mempalace-docs"

# The wing name becomes a path component that is handed to `rm -rf`.
# Refuse anything that would resolve to the cache root itself or to a
# location outside it.
case "$WING" in
  ''|.|..|*/*)
    echo "error: invalid wing name: '$WING'" >&2
    exit 1
    ;;
esac

STAGE="$CACHE_ROOT/$WING"
mkdir -p -- "$CACHE_ROOT"
rm -rf -- "$STAGE"
mkdir -p -- "$STAGE"

# Only clean up the per-wing stage on exit — leave $CACHE_ROOT itself
# alone in case other wings are staging concurrently.
cleanup_stage() {
  rm -rf -- "$STAGE"
}
trap cleanup_stage EXIT
# A signal handler that merely returns lets the script carry on with the
# next command. Exit explicitly instead (128 + signal number); that also
# runs the EXIT trap above, so the stage is still removed.
trap 'exit 130' INT
trap 'exit 143' TERM

# collect_doc_files
#   Fill the global array `filtered` with the absolute paths of every
#   regular file under $SRC whose name matches one of INCLUDE_GLOBS, is
#   not inside (or itself named like) a SKIP_DIRS entry, and whose
#   basename is not listed in SKIP_FILES.
#   Globals: SRC, INCLUDE_GLOBS, SKIP_DIRS, SKIP_FILES (read);
#            filtered (written).
collect_doc_files() {
  local -a prune_expr=() include_expr=() find_cmd=()
  local d g f base sf skip

  # OR together the names to prune and the names to include.
  for d in "${SKIP_DIRS[@]}"; do
    if (( ${#prune_expr[@]} > 0 )); then prune_expr+=(-o); fi
    prune_expr+=(-name "$d")
  done
  for g in "${INCLUDE_GLOBS[@]}"; do
    if (( ${#include_expr[@]} > 0 )); then include_expr+=(-o); fi
    include_expr+=(-name "$g")
  done

  # -mindepth 1 keeps the tests off $SRC itself, and -name only looks at
  # the entry being visited, so a project that merely lives below a
  # directory called e.g. "build" or "target" is not filtered out
  # wholesale. -prune also stops find from walking excluded trees.
  find_cmd=(
    find "$SRC" -mindepth 1
    '(' "${prune_expr[@]}" ')' -prune
    -o -type f '(' "${include_expr[@]}" ')' -print0
  )

  # NUL-delimited so that file names containing newlines stay intact.
  filtered=()
  while IFS= read -r -d '' f; do
    base="${f##*/}"
    skip=0
    for sf in "${SKIP_FILES[@]}"; do
      if [[ "$base" == "$sf" ]]; then
        skip=1
        break
      fi
    done
    if (( skip == 0 )); then
      filtered+=("$f")
    fi
  done < <("${find_cmd[@]}")
}

# Gather matches into `filtered`
collect_doc_files

count="${#filtered[@]}"

# Nothing to do is not an error: report on stderr and leave with 0.
if [[ $count -eq 0 ]]; then
  echo "no matching files found in $SRC" >&2
  exit 0
fi

# Dry run: list the candidates relative to $SRC and stop before anything
# is staged or filed.
if [[ $DRY_RUN -eq 1 ]]; then
  echo "Would mine $count files into wing '$WING':"
  for f in "${filtered[@]}"; do
    # Strip the prefix with a quoted (literal) expansion rather than sed,
    # so `#` or regex metacharacters in $SRC cannot break the listing.
    printf '  %s\n' "${f#"$SRC"/}"
  done
  exit 0
fi

# stage_filtered_files
#   Copy every path in `filtered` into $STAGE, keeping its location
#   relative to $SRC. `cp -p` preserves mtime (critical for mempalace
#   dedup — the miner compares stored mtime against the staged copy's
#   mtime).
#   Globals: filtered, SRC, STAGE (read).
stage_filtered_files() {
  local f rel dest
  for f in "${filtered[@]}"; do
    # Quote $SRC inside the expansion so it is stripped as a literal
    # prefix; unquoted, characters such as [ ] * ? in the path would be
    # interpreted as a glob and the prefix would not be removed.
    rel="${f#"$SRC"/}"
    dest="$STAGE/$rel"
    mkdir -p -- "${dest%/*}"
    cp -p -- "$f" "$dest"
  done
}

stage_filtered_files

# Purge any drawers in this wing that came from the original source
# directory. The miner records source_file = absolute path from the
# staging dir; this differs from a prior `mempalace mine <source>` run,
# so without this purge the wing would accumulate duplicates every time
# we switch between upstream `mempalace mine` and this wrapper.
# We only purge source_file paths equal to $SRC or below $SRC/, leaving
# other wings and other sources alone.
#
# purge_source_drawers <wing> <src> [db_path]
#   <wing>     wing whose drawers are examined
#   <src>      absolute source directory; trailing slashes are ignored
#   [db_path]  SQLite file to operate on; defaults to
#              ~/.mempalace/palace/chroma.sqlite3
#   Does nothing when the database file does not exist.
purge_source_drawers() {
  python3 - "$1" "$2" "${3:-}" <<'PY'
import os
import sqlite3
import sys

# Stay well below SQLite's per-statement bound-parameter limit.
BATCH = 500

wing, src = sys.argv[1], sys.argv[2].rstrip("/")
override = sys.argv[3] if len(sys.argv) > 3 else ""
db_path = override or os.path.expanduser("~/.mempalace/palace/chroma.sqlite3")
if not wing or not os.path.exists(db_path):
    sys.exit(0)

prefix = src + "/"
db = sqlite3.connect(db_path)
try:
    cur = db.cursor()
    # Fetch (id, source_file) for the target wing and match the path in
    # Python: exact, case-sensitive prefix comparison. SQL LIKE would treat
    # "_" and "%" inside the path as wildcards and ignore ASCII case.
    rows = cur.execute(
        """
        SELECT DISTINCT w.id, s.string_value
        FROM embedding_metadata w
        JOIN embedding_metadata s ON w.id = s.id AND s.key = 'source_file'
        WHERE w.key = 'wing'
          AND w.string_value = ?
        """,
        (wing,),
    ).fetchall()
    ids = list({
        row_id
        for row_id, path in rows
        if isinstance(path, str) and (path == src or path.startswith(prefix))
    })
    if ids:
        for tbl in ("embedding_metadata", "embeddings"):
            for start in range(0, len(ids), BATCH):
                batch = ids[start:start + BATCH]
                ph = ",".join("?" * len(batch))
                try:
                    cur.execute(f"DELETE FROM {tbl} WHERE id IN ({ph})", batch)
                except sqlite3.OperationalError as exc:
                    # Best effort per table, but say so instead of hiding it.
                    print(f"  warning: could not purge from {tbl}: {exc}", file=sys.stderr)
                    break
        db.commit()
        print(f"  purged {len(ids)} pre-existing drawers for {src} from wing '{wing}'")
finally:
    db.close()
PY
}

purge_source_drawers "$WING" "$SRC"

# Give the staging dir its own mempalace.yaml so the miner files
# everything under the chosen wing, in a single catch-all room.
{
  printf 'wing: %s\n' "$WING"
  printf 'rooms:\n'
  printf '  - name: general\n'
  printf '    description: Docs, config, and scripts from %s\n' "$WING"
  printf '    keywords: [general]\n'
} > "$STAGE/mempalace.yaml"

echo "Staging $count files into wing '$WING'..."

# ── Run the mine ─────────────────────────────────────────────────────
# A failing mine maps to exit code 4.
mempalace mine "$STAGE" --agent "$AGENT" --wing "$WING" || {
  echo "error: mempalace mine failed" >&2
  exit 4
}

# ── Repair index ─────────────────────────────────────────────────────
# Runs unless --no-repair was given.
if (( NO_REPAIR == 0 )); then
  echo ""
  echo "Rebuilding HNSW index..."
  mempalace repair --yes
fi

echo ""
echo "Done. Wing '$WING' is ready. Remember to reconnect any live MCP sessions."
