gioser-web: add docs/ dir with frontmatter for Qdrant indexing

- 4 md files (aire, fuego, tierra, agua) with YAML frontmatter
- caminos mapped: logos, nomos, kay, uku
- original md/ unchanged
- add scripts/index-gioser-docs.py (adapted from gioserv)
2026-05-23 14:36:20 +00:00
parent 12e3b1d4d0
commit b17149c528
5 changed files with 340 additions and 0 deletions
@@ -0,0 +1,189 @@
+#!/usr/bin/env python3
+"""Indexer for the gioser-web docs/ directory → Qdrant.
+
+Walks crates/apps/gioser-web/docs/, parses the YAML frontmatter,
+splits each document into paragraph-based fragments, requests embeddings
+from the agnostic embeddings service and upserts them into Qdrant.
+
+Usage:
+    python scripts/index-gioser-docs.py                          # use defaults
+    python scripts/index-gioser-docs.py --rebuild                # recreate collection
+    python scripts/index-gioser-docs.py --docs ./docs --rebuild  # custom docs
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import os
+import re
+import sys
+import uuid
+from dataclasses import dataclass
+from pathlib import Path
+
+import httpx
+import yaml
+from qdrant_client import QdrantClient
+from qdrant_client.http import models as qm
+
+
+# Default docs directory, located relative to the directory above the one
+# that contains this script.
+DEFAULT_DOCS = Path(__file__).resolve().parent.parent / "crates/apps/gioser-web/docs"
+# Service endpoints and collection name; each one can be overridden through
+# the environment variable named in its getenv() call.
+DEFAULT_QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
+DEFAULT_EMBED_URL = os.getenv("EMBEDDINGS_URL", "http://localhost:8001")
+DEFAULT_COLLECTION = os.getenv("QDRANT_COLLECTION", "gioser")
+
+# Accepted "camino" values; documents whose camino is not listed are skipped.
+VALID_CAMINOS = {"logos", "uku", "kay", "nomos", "aire", "fuego", "tierra", "agua"}
+# Matches a file that starts with a `---` delimited block: group 1 is the text
+# between the delimiters (the YAML frontmatter), group 2 is everything after.
+FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?\n)---\s*\n(.*)$", re.DOTALL)
+
+
+@dataclass
+class Chunk:
+    """One fragment of a document, carrying the metadata stored with it in Qdrant."""
+
+    # Frontmatter "id" of the source document, or a hash derived from its path.
+    doc_id: str
+    # Zero-based position of this fragment within its document.
+    chunk_index: int
+    # Fragment text; this is what gets embedded.
+    text: str
+    # Document title; fragments after the first carry a " · §N" suffix.
+    title: str
+    # Lowercased camino of the document (one of VALID_CAMINOS).
+    camino: str
+    # Tags taken from the frontmatter; empty when none are declared.
+    tags: list[str]
+
+
+def parse_md(path: Path) -> tuple[dict, str]:
+    """Read a Markdown file and split it into frontmatter metadata and body.
+
+    Args:
+        path: Markdown file to read.
+
+    Returns:
+        ``(meta, body)``. ``meta`` is the parsed frontmatter mapping, or an
+        empty dict when the file has no frontmatter, the frontmatter is empty,
+        or it does not parse to a YAML mapping. ``body`` is the text after the
+        closing ``---`` line, or the whole file when no frontmatter is found.
+
+    Raises:
+        yaml.YAMLError: if the frontmatter block is not valid YAML.
+    """
+    # utf-8-sig drops a leading BOM, which would otherwise keep the
+    # frontmatter pattern from matching at the very start of the file.
+    raw = path.read_text(encoding="utf-8-sig")
+    m = FRONTMATTER_RE.match(raw)
+    if m is None:
+        return {}, raw
+
+    meta = yaml.safe_load(m.group(1)) or {}
+    if not isinstance(meta, dict):
+        # Callers read the metadata with meta.get(...); a list or scalar
+        # frontmatter would crash them, so it is reported and ignored.
+        print(f"  ⚠ {path}: frontmatter no es un mapeo YAML, se ignora", file=sys.stderr)
+        meta = {}
+    return meta, m.group(2)
+
+
+def chunk_body(body: str, min_chars: int = 200, max_chars: int = 900) -> list[str]:
+    """Group the paragraphs of *body* into fragments.
+
+    Paragraphs are separated by blank lines. Consecutive paragraphs are joined
+    with a blank line until the accumulated text reaches ``min_chars``; a
+    paragraph that would push the accumulated text to ``max_chars`` or beyond
+    starts a new fragment instead. A short leftover at the end is appended to
+    the last fragment rather than emitted on its own.
+
+    Args:
+        body: Document text without frontmatter.
+        min_chars: Length at which an accumulated fragment is emitted.
+        max_chars: Length at which joining another paragraph is refused.
+
+    Returns:
+        The fragments in document order; empty when *body* has no text.
+    """
+    pieces: list[str] = []
+    pending = ""
+    for raw in re.split(r"\n\s*\n", body):
+        para = raw.strip()
+        if not para:
+            continue
+        merged = "\n\n".join((pending, para)) if pending else para
+        if len(merged) < max_chars:
+            if len(merged) >= min_chars:
+                # Long enough: emit and start accumulating from scratch.
+                pieces.append(merged)
+                pending = ""
+            else:
+                pending = merged
+            continue
+        # Joining would reach the maximum: flush what was pending and let the
+        # current paragraph open the next fragment.
+        if pending:
+            pieces.append(pending)
+        pending = para
+
+    if not pending:
+        return pieces
+    if pieces and len(pending) < min_chars:
+        # A short tail is folded into the previous fragment.
+        pieces[-1] = f"{pieces[-1]}\n\n{pending}"
+    else:
+        pieces.append(pending)
+    return pieces
+
+
+def discover_chunks(docs_dir: Path) -> list[Chunk]:
+    """Collect the fragments of every Markdown file under *docs_dir*.
+
+    Files are visited recursively in sorted path order. The camino comes from
+    the frontmatter ``camino`` key, falling back to the file stem; files whose
+    camino is not in ``VALID_CAMINOS`` are reported on stderr and skipped.
+
+    Args:
+        docs_dir: Root directory to scan for ``*.md`` files.
+
+    Returns:
+        One ``Chunk`` per fragment, grouped by document in visiting order.
+    """
+    out: list[Chunk] = []
+    for path in sorted(docs_dir.rglob("*.md")):
+        meta, body = parse_md(path)
+        # str() guards against a non-string YAML scalar (e.g. a number).
+        camino = str(meta.get("camino") or path.stem).lower()
+        if camino not in VALID_CAMINOS:
+            print(f"  ⚠ saltando {path}: camino '{camino}' inválido", file=sys.stderr)
+            continue
+        title = str(meta.get("title") or path.stem.replace("-", " ").title())
+        raw_tags = meta.get("tags") or []
+        # A single string tag must not be exploded into its characters.
+        tags = [raw_tags] if isinstance(raw_tags, str) else list(raw_tags)
+        # Hash the path relative to docs_dir so the fallback id does not depend
+        # on how the docs directory was spelled (absolute, relative, per host);
+        # otherwise re-indexing would create duplicate points.
+        rel_path = path.relative_to(docs_dir).as_posix()
+        doc_id = str(meta.get("id") or hashlib.sha1(rel_path.encode()).hexdigest()[:12])
+        out.extend(
+            Chunk(
+                doc_id=doc_id,
+                chunk_index=i,
+                text=chunk,
+                title=title if i == 0 else f"{title} · §{i + 1}",
+                camino=camino,
+                tags=tags,
+            )
+            for i, chunk in enumerate(chunk_body(body))
+        )
+    return out
+
+
+def embed_batches(http: httpx.Client, embed_url: str, texts: list[str], batch: int = 32) -> list[list[float]]:
+    """Request embeddings for *texts* from the embeddings service, in batches.
+
+    Each batch is POSTed to ``{embed_url}/embed`` and the vectors are read from
+    the ``"vectors"`` key of the JSON response.
+
+    Args:
+        http: Client used to send the requests.
+        embed_url: Base URL of the embeddings service.
+        texts: Texts to embed.
+        batch: Maximum number of texts per request; must be at least 1.
+
+    Returns:
+        One vector per input text, in the same order as *texts*.
+
+    Raises:
+        ValueError: if *batch* is smaller than 1.
+        RuntimeError: if a response does not contain one vector per text sent.
+        httpx.HTTPStatusError: if the service answers with an error status.
+    """
+    if batch < 1:
+        # A negative step would make range() empty and silently embed nothing.
+        raise ValueError(f"batch debe ser >= 1, recibido {batch}")
+    out: list[list[float]] = []
+    for i in range(0, len(texts), batch):
+        chunk = texts[i : i + batch]
+        r = http.post(
+            f"{embed_url}/embed",
+            json={"texts": chunk, "kind": "passage", "normalize": True},
+            timeout=120.0,
+        )
+        r.raise_for_status()
+        vectors = r.json()["vectors"]
+        if len(vectors) != len(chunk):
+            # A short or long answer would misalign every later text/vector pair.
+            raise RuntimeError(
+                f"el servicio devolvió {len(vectors)} vectores para {len(chunk)} textos"
+            )
+        out.extend(vectors)
+    return out
+
+
+def ensure_collection(qdrant: QdrantClient, name: str, dim: int, rebuild: bool):
+    """Make sure collection *name* exists, recreating it when *rebuild* is set.
+
+    An existing collection is left untouched unless *rebuild* is true, in which
+    case it is deleted first. A missing (or just deleted) collection is created
+    with vectors of size *dim* and cosine distance.
+    """
+    present = any(c.name == name for c in qdrant.get_collections().collections)
+    if present and rebuild:
+        qdrant.delete_collection(name)
+        present = False
+    if present:
+        return
+    vector_params = qm.VectorParams(size=dim, distance=qm.Distance.COSINE)
+    qdrant.create_collection(collection_name=name, vectors_config=vector_params)
+
+
+def main():
+    """Command-line entry point: discover, embed and upsert the docs fragments.
+
+    Exits with an error message when the docs directory does not exist, when
+    no fragments are found, or when the number of vectors returned does not
+    match the number of fragments. HTTP error statuses from the embeddings
+    service propagate as exceptions.
+    """
+    ap = argparse.ArgumentParser(description="Indexa docs/ de gioser-web en Qdrant")
+    ap.add_argument("--docs", default=str(DEFAULT_DOCS))
+    ap.add_argument("--qdrant", default=DEFAULT_QDRANT_URL)
+    ap.add_argument("--embed", default=DEFAULT_EMBED_URL)
+    ap.add_argument("--collection", default=DEFAULT_COLLECTION)
+    ap.add_argument("--rebuild", action="store_true", help="borra y recrea la colección")
+    args = ap.parse_args()
+
+    docs_dir = Path(args.docs)
+    if not docs_dir.is_dir():
+        sys.exit(f"docs no existe: {docs_dir}")
+
+    chunks = discover_chunks(docs_dir)
+    if not chunks:
+        sys.exit("no se encontraron docs para indexar")
+    print(f"→ {len(chunks)} fragmentos descubiertos")
+
+    with httpx.Client() as http:
+        health_response = http.get(f"{args.embed}/health", timeout=10.0)
+        # Fail on an error status instead of parsing an error body as health data.
+        health_response.raise_for_status()
+        health = health_response.json()
+        dim = int(health["dim"])
+        print(f"→ embeddings: {health['model']} (dim={dim})")
+
+        # Embed before touching Qdrant: with --rebuild the collection is
+        # deleted, and a failure here must not leave the index destroyed.
+        vectors = embed_batches(http, args.embed, [c.text for c in chunks])
+
+    if len(vectors) != len(chunks):
+        # zip() below would silently drop the unmatched fragments.
+        sys.exit(
+            f"el servicio devolvió {len(vectors)} vectores para {len(chunks)} fragmentos"
+        )
+
+    qdrant = QdrantClient(url=args.qdrant)
+    ensure_collection(qdrant, args.collection, dim, rebuild=args.rebuild)
+
+    points = [
+        qm.PointStruct(
+            # Deterministic id, so re-indexing the same fragment overwrites it.
+            id=str(uuid.uuid5(uuid.NAMESPACE_URL, f"{c.doc_id}:{c.chunk_index}")),
+            vector=v,
+            payload={
+                "doc_id": c.doc_id,
+                "chunk_index": c.chunk_index,
+                "title": c.title,
+                "text": c.text,
+                "camino": c.camino,
+                "tags": c.tags,
+                "source": "gioser-web",
+            },
+        )
+        for c, v in zip(chunks, vectors)
+    ]
+    qdrant.upsert(collection_name=args.collection, points=points)
+    print(f"✓ {len(points)} puntos en colección '{args.collection}'")
+
+    if args.rebuild:
+        print("  ✔ colección recreada")
+
+
+# Run the indexer only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()