#!/usr/bin/env python3 """Indexador de docs/ de gioser-web → Qdrant. Recorre crates/apps/gioser-web/docs/, parsea YAML frontmatter, trocea cada documento en fragmentos de párrafo, pide embeddings al servicio agnóstico y hace upsert a Qdrant. Uso: python scripts/index-gioser-docs.py # usa defaults python scripts/index-gioser-docs.py --rebuild # recrea colección python scripts/index-gioser-docs.py --docs ./docs --rebuild # docs custom """ from __future__ import annotations import argparse import hashlib import os import re import sys import uuid from dataclasses import dataclass from pathlib import Path import httpx import yaml from qdrant_client import QdrantClient from qdrant_client.http import models as qm DEFAULT_DOCS = Path(__file__).resolve().parent.parent / "crates/apps/gioser-web/docs" DEFAULT_QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333") DEFAULT_EMBED_URL = os.getenv("EMBEDDINGS_URL", "http://localhost:8001") DEFAULT_COLLECTION = os.getenv("QDRANT_COLLECTION", "gioser") VALID_CAMINOS = {"logos", "uku", "kay", "nomos", "aire", "fuego", "tierra", "agua"} FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?\n)---\s*\n(.*)$", re.DOTALL) @dataclass class Chunk: doc_id: str chunk_index: int text: str title: str camino: str tags: list[str] def parse_md(path: Path) -> tuple[dict, str]: raw = path.read_text(encoding="utf-8") m = FRONTMATTER_RE.match(raw) if m: meta = yaml.safe_load(m.group(1)) or {} body = m.group(2) else: meta = {} body = raw return meta, body def chunk_body(body: str, min_chars: int = 200, max_chars: int = 900) -> list[str]: """Fragmenta por párrafos respetando un mínimo y un máximo.""" paragraphs = [p.strip() for p in re.split(r"\n\s*\n", body) if p.strip()] chunks: list[str] = [] buf = "" for p in paragraphs: candidate = f"{buf}\n\n{p}".strip() if buf else p if len(candidate) >= max_chars: if buf: chunks.append(buf) buf = p else: buf = candidate if len(buf) >= min_chars: chunks.append(buf) buf = "" if buf: if chunks and len(buf) < min_chars: chunks[-1] = f"{chunks[-1]}\n\n{buf}" else: chunks.append(buf) return chunks def discover_chunks(docs_dir: Path) -> list[Chunk]: out: list[Chunk] = [] for path in sorted(docs_dir.rglob("*.md")): meta, body = parse_md(path) camino = (meta.get("camino") or path.stem).lower() if camino not in VALID_CAMINOS: print(f" ⚠ saltando {path}: camino '{camino}' inválido", file=sys.stderr) continue title = meta.get("title") or path.stem.replace("-", " ").title() tags = list(meta.get("tags") or []) doc_id = meta.get("id") or hashlib.sha1(str(path).encode()).hexdigest()[:12] for i, chunk in enumerate(chunk_body(body)): out.append( Chunk( doc_id=doc_id, chunk_index=i, text=chunk, title=title if i == 0 else f"{title} · §{i + 1}", camino=camino, tags=tags, ) ) return out def embed_batches(http: httpx.Client, embed_url: str, texts: list[str], batch: int = 32) -> list[list[float]]: out: list[list[float]] = [] for i in range(0, len(texts), batch): chunk = texts[i : i + batch] r = http.post( f"{embed_url}/embed", json={"texts": chunk, "kind": "passage", "normalize": True}, timeout=120.0, ) r.raise_for_status() out.extend(r.json()["vectors"]) return out def ensure_collection(qdrant: QdrantClient, name: str, dim: int, rebuild: bool): existing = {c.name for c in qdrant.get_collections().collections} if name in existing and rebuild: qdrant.delete_collection(name) existing.discard(name) if name not in existing: qdrant.create_collection( collection_name=name, vectors_config=qm.VectorParams(size=dim, distance=qm.Distance.COSINE), ) def main(): ap = argparse.ArgumentParser(description="Indexa docs/ de gioser-web en Qdrant") ap.add_argument("--docs", default=str(DEFAULT_DOCS)) ap.add_argument("--qdrant", default=DEFAULT_QDRANT_URL) ap.add_argument("--embed", default=DEFAULT_EMBED_URL) ap.add_argument("--collection", default=DEFAULT_COLLECTION) ap.add_argument("--rebuild", action="store_true", help="borra y recrea la colección") args = ap.parse_args() docs_dir = Path(args.docs) if not docs_dir.is_dir(): sys.exit(f"docs no existe: {docs_dir}") chunks = discover_chunks(docs_dir) if not chunks: sys.exit("no se encontraron docs para indexar") print(f"→ {len(chunks)} fragmentos descubiertos") with httpx.Client() as http: health = http.get(f"{args.embed}/health", timeout=10.0).json() dim = int(health["dim"]) print(f"→ embeddings: {health['model']} (dim={dim})") qdrant = QdrantClient(url=args.qdrant) ensure_collection(qdrant, args.collection, dim, rebuild=args.rebuild) vectors = embed_batches(http, args.embed, [c.text for c in chunks]) points = [ qm.PointStruct( id=str(uuid.uuid5(uuid.NAMESPACE_URL, f"{c.doc_id}:{c.chunk_index}")), vector=v, payload={ "doc_id": c.doc_id, "chunk_index": c.chunk_index, "title": c.title, "text": c.text, "camino": c.camino, "tags": c.tags, "source": "gioser-web", }, ) for c, v in zip(chunks, vectors) ] qdrant.upsert(collection_name=args.collection, points=points) print(f"✓ {len(points)} puntos en colección '{args.collection}'") if args.rebuild: print(" ✔ colección recreada") if __name__ == "__main__": main()