# brahman/scripts/index-gioser-docs.py

#!/usr/bin/env python3
"""Indexador de docs/ de gioser-web → Qdrant.

Recorre crates/apps/gioser-web/docs/, parsea YAML frontmatter,
trocea cada documento en fragmentos de párrafo, pide embeddings al
servicio agnóstico y hace upsert a Qdrant.

Uso:
    python scripts/index-gioser-docs.py                          # usa defaults
    python scripts/index-gioser-docs.py --rebuild                # recrea colección
    python scripts/index-gioser-docs.py --docs ./docs --rebuild  # docs custom
"""

from __future__ import annotations

import argparse
import hashlib
import os
import re
import sys
import uuid
from dataclasses import dataclass
from pathlib import Path

import httpx
import yaml
from qdrant_client import QdrantClient
from qdrant_client.http import models as qm


# Defaults for the command-line options. The three service settings honour an
# environment-variable override; the docs directory is resolved relative to
# this file's grandparent directory.
DEFAULT_DOCS = Path(__file__).resolve().parent.parent / "crates/apps/gioser-web/docs"
DEFAULT_QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")
DEFAULT_EMBED_URL = os.environ.get("EMBEDDINGS_URL", "http://localhost:8001")
DEFAULT_COLLECTION = os.environ.get("QDRANT_COLLECTION", "gioser")

# Accepted values for a document's "camino"; documents whose camino is not in
# this set are skipped during discovery.
VALID_CAMINOS = {
    "logos",
    "uku",
    "kay",
    "nomos",
    "aire",
    "fuego",
    "tierra",
    "agua",
    "cuerpo",
    "sombra",
    "cosmos",
    "practica",
    "olvido",
}

# Matches a leading "---"-delimited frontmatter section.
# Group 1 is the frontmatter text, group 2 is everything after the closing "---" line.
FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?\n)---\s*\n(.*)$", flags=re.DOTALL)


@dataclass
class Chunk:
    """One indexable fragment of a document plus the metadata stored with it."""

    doc_id: str  # frontmatter "id", or a short SHA-1 prefix of the file path
    chunk_index: int  # 0-based position of the fragment within its document
    text: str  # fragment text; this is what gets sent to the embeddings service
    title: str  # document title; fragments after the first carry a " · §N" suffix
    camino: str  # lowercased "camino" value, validated against VALID_CAMINOS
    tags: list[str]  # frontmatter "tags"; empty list when absent


def parse_md(path: Path) -> tuple[dict, str]:
    """Read a Markdown file and split it into frontmatter metadata and body.

    Args:
        path: Markdown file to read (UTF-8, with or without a BOM).

    Returns:
        ``(meta, body)``. ``meta`` is the parsed YAML frontmatter mapping, or
        an empty dict when the file has no frontmatter or the frontmatter is
        empty. ``body`` is the text after the closing ``---`` line, or the
        whole file when no frontmatter is present.

    Raises:
        ValueError: if the frontmatter parses to something other than a
            mapping (for example a list or a bare scalar).
        yaml.YAMLError: propagated from ``yaml.safe_load`` when the
            frontmatter is not valid YAML.
    """
    # "utf-8-sig" drops a leading byte-order mark. Left in place, the BOM sits
    # before the opening "---" and the frontmatter would be treated as body.
    raw = path.read_text(encoding="utf-8-sig")
    m = FRONTMATTER_RE.match(raw)
    if m is None:
        return {}, raw

    meta = yaml.safe_load(m.group(1)) or {}
    # Callers use meta.get(...); fail here with the file name instead of an
    # AttributeError far from the cause.
    if not isinstance(meta, dict):
        raise ValueError(
            f"{path}: el frontmatter debe ser un mapeo YAML, no {type(meta).__name__}"
        )
    return meta, m.group(2)


def chunk_body(body: str, min_chars: int = 200, max_chars: int = 900) -> list[str]:
    """Split *body* into paragraph-aligned fragments.

    Paragraphs are runs of text separated by blank lines. They are
    accumulated in a pending buffer that is emitted as soon as it reaches
    ``min_chars``. If appending a paragraph would bring the buffer to
    ``max_chars`` or more, the buffer is emitted as it stands and the
    paragraph starts a new one. A trailing remainder shorter than
    ``min_chars`` is appended to the last fragment.

    Paragraphs are never split, so a single paragraph of ``max_chars`` or
    more produces an oversized fragment, and the merged tail may also exceed
    ``max_chars``.

    Args:
        body: Document text without frontmatter.
        min_chars: Length at which the pending buffer is emitted.
        max_chars: Length a merged buffer must stay below.

    Returns:
        The fragments in document order; empty when *body* has no text.
    """
    pieces: list[str] = []
    pending = ""

    for block in re.split(r"\n\s*\n", body):
        para = block.strip()
        if not para:
            continue

        merged = f"{pending}\n\n{para}".strip() if pending else para

        if len(merged) < max_chars:
            if len(merged) >= min_chars:
                pieces.append(merged)
                pending = ""
            else:
                pending = merged
            continue

        # Merging would reach the maximum: flush what was pending and let
        # this paragraph open the next buffer.
        if pending:
            pieces.append(pending)
        pending = para

    if not pending:
        return pieces

    if pieces and len(pending) < min_chars:
        # Too short to stand alone: attach it to the previous fragment.
        pieces[-1] = f"{pieces[-1]}\n\n{pending}"
    else:
        pieces.append(pending)
    return pieces


def discover_chunks(docs_dir: Path) -> list[Chunk]:
    """Collect the fragments of every valid ``*.md`` file under *docs_dir*.

    Files are visited recursively in sorted path order. A file is skipped,
    with a warning on stderr, when its "camino" (frontmatter value, or the
    file stem when absent) is not in ``VALID_CAMINOS``.

    Args:
        docs_dir: Root directory to search.

    Returns:
        One ``Chunk`` per fragment, grouped by file, in fragment order.
    """
    out: list[Chunk] = []
    for path in sorted(docs_dir.rglob("*.md")):
        meta, body = parse_md(path)
        # str() keeps a non-string YAML value (e.g. ``camino: 1``) from
        # crashing on .lower(); it then fails validation and is skipped.
        camino = str(meta.get("camino") or path.stem).lower()
        if camino not in VALID_CAMINOS:
            print(f"  ⚠ saltando {path}: camino '{camino}' inválido", file=sys.stderr)
            continue
        title = meta.get("title") or path.stem.replace("-", " ").title()
        raw_tags = meta.get("tags") or []
        # A scalar ``tags: foo`` loads as a str; list() would explode it into
        # single characters, so treat it as one tag.
        tags = [raw_tags] if isinstance(raw_tags, str) else list(raw_tags)
        # NOTE(review): the fallback id hashes the path exactly as given, so
        # it changes with the checkout location or with how --docs is
        # spelled. Confirm whether ids should be derived from the path
        # relative to docs_dir (doing so would change existing point ids).
        doc_id = meta.get("id") or hashlib.sha1(str(path).encode()).hexdigest()[:12]
        for i, chunk in enumerate(chunk_body(body)):
            out.append(
                Chunk(
                    doc_id=doc_id,
                    chunk_index=i,
                    text=chunk,
                    # Only the first fragment keeps the bare title.
                    title=title if i == 0 else f"{title} · §{i + 1}",
                    camino=camino,
                    tags=tags,
                )
            )
    return out


def embed_batches(http: httpx.Client, embed_url: str, texts: list[str], batch: int = 32) -> list[list[float]]:
    """Embed *texts* through the embeddings service, *batch* texts per request.

    Each request POSTs to ``{embed_url}/embed`` with ``kind="passage"`` and
    ``normalize=True`` and reads the ``"vectors"`` field of the JSON reply.

    Args:
        http: Client used to send the requests.
        embed_url: Base URL of the embeddings service.
        texts: Texts to embed.
        batch: Maximum number of texts per request; must be at least 1.

    Returns:
        The vectors from all responses, concatenated in request order. The
        service is assumed to return vectors in the same order as the texts
        it was sent.

    Raises:
        ValueError: if *batch* is smaller than 1.
        RuntimeError: if a response does not contain exactly one vector per
            text sent.
        httpx.HTTPStatusError: if the service answers with an error status.
    """
    if batch < 1:
        # A negative step would make range() empty and silently return [].
        raise ValueError(f"batch debe ser >= 1, no {batch}")

    out: list[list[float]] = []
    for start in range(0, len(texts), batch):
        group = texts[start : start + batch]
        r = http.post(
            f"{embed_url}/embed",
            json={"texts": group, "kind": "passage", "normalize": True},
            timeout=120.0,
        )
        r.raise_for_status()
        vectors = r.json()["vectors"]
        # Callers pair vectors with texts by position; a short or long reply
        # would misalign or silently drop entries.
        if len(vectors) != len(group):
            raise RuntimeError(
                f"el servicio de embeddings devolvió {len(vectors)} vectores "
                f"para {len(group)} textos"
            )
        out.extend(vectors)
    return out


def ensure_collection(qdrant: QdrantClient, name: str, dim: int, rebuild: bool):
    """Make sure collection *name* exists, optionally recreating it.

    An existing collection is deleted first when *rebuild* is true. A missing
    (or just deleted) collection is created with cosine distance and vectors
    of size *dim*. An existing collection is otherwise left untouched; its
    vector size is not checked against *dim*.

    Args:
        qdrant: Client connected to the Qdrant instance.
        name: Collection name.
        dim: Vector size used when the collection is created.
        rebuild: Whether to drop an existing collection before creating it.
    """
    present = any(c.name == name for c in qdrant.get_collections().collections)

    if present and rebuild:
        qdrant.delete_collection(name)
        present = False

    if present:
        return

    params = qm.VectorParams(size=dim, distance=qm.Distance.COSINE)
    qdrant.create_collection(collection_name=name, vectors_config=params)


def main():
    """Command-line entry point: discover, embed and upsert the documents.

    Steps: parse the arguments, collect the fragments under ``--docs``, ask
    the embeddings service for its vector size and for the vectors, make
    sure the collection exists (recreating it with ``--rebuild``) and upsert
    one point per fragment.

    Embeddings are computed before the collection is touched, so a failing
    embeddings service cannot leave a ``--rebuild`` run with a deleted
    collection and nothing to put in it.

    Exits with an error message when the docs directory does not exist, when
    no fragments are found, or when the number of vectors does not match the
    number of fragments.
    """
    ap = argparse.ArgumentParser(description="Indexa docs/ de gioser-web en Qdrant")
    ap.add_argument("--docs", default=str(DEFAULT_DOCS))
    ap.add_argument("--qdrant", default=DEFAULT_QDRANT_URL)
    ap.add_argument("--embed", default=DEFAULT_EMBED_URL)
    ap.add_argument("--collection", default=DEFAULT_COLLECTION)
    ap.add_argument("--rebuild", action="store_true", help="borra y recrea la colección")
    args = ap.parse_args()

    docs_dir = Path(args.docs)
    if not docs_dir.is_dir():
        sys.exit(f"docs no existe: {docs_dir}")

    chunks = discover_chunks(docs_dir)
    if not chunks:
        sys.exit("no se encontraron docs para indexar")
    print(f"→ {len(chunks)} fragmentos descubiertos")

    with httpx.Client() as http:
        resp = http.get(f"{args.embed}/health", timeout=10.0)
        # Surface an HTTP error as such instead of a KeyError on "dim" below.
        resp.raise_for_status()
        health = resp.json()
        dim = int(health["dim"])
        print(f"→ embeddings: {health['model']} (dim={dim})")

        vectors = embed_batches(http, args.embed, [c.text for c in chunks])

    # zip() stops at the shorter sequence, so a mismatch would silently drop
    # fragments from the index.
    if len(vectors) != len(chunks):
        sys.exit(
            f"el servicio de embeddings devolvió {len(vectors)} vectores "
            f"para {len(chunks)} fragmentos"
        )

    # Only now touch Qdrant: with --rebuild this deletes the collection, which
    # must not happen before the replacement data is known to be available.
    qdrant = QdrantClient(url=args.qdrant)
    ensure_collection(qdrant, args.collection, dim, rebuild=args.rebuild)

    points = [
        qm.PointStruct(
            # Deterministic id: re-indexing the same fragment overwrites its
            # point instead of adding a duplicate.
            id=str(uuid.uuid5(uuid.NAMESPACE_URL, f"{c.doc_id}:{c.chunk_index}")),
            vector=v,
            payload={
                "doc_id": c.doc_id,
                "chunk_index": c.chunk_index,
                "title": c.title,
                "text": c.text,
                "camino": c.camino,
                "tags": c.tags,
                "source": "gioser-web",
            },
        )
        for c, v in zip(chunks, vectors)
    ]
    qdrant.upsert(collection_name=args.collection, points=points)
    print(f"✓ {len(points)} puntos en colección '{args.collection}'")

    if args.rebuild:
        print("  ✔ colección recreada")


# Run the indexer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()