190 lines
6.1 KiB
Python
190 lines
6.1 KiB
Python
#!/usr/bin/env python3
|
|
"""Indexer for the gioser-web docs/ directory → Qdrant.

Walks crates/apps/gioser-web/docs/, parses the YAML frontmatter,
splits each document into paragraph-based fragments, requests embeddings
from the agnostic embedding service and upserts them into Qdrant.

Usage:
    python scripts/index-gioser-docs.py                          # use defaults
    python scripts/index-gioser-docs.py --rebuild                # recreate the collection
    python scripts/index-gioser-docs.py --docs ./docs --rebuild  # custom docs directory
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import hashlib
|
|
import os
|
|
import re
|
|
import sys
|
|
import uuid
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
import httpx
|
|
import yaml
|
|
from qdrant_client import QdrantClient
|
|
from qdrant_client.http import models as qm
|
|
|
|
|
|
# Default docs root: two directories above this script, then
# crates/apps/gioser-web/docs.
DEFAULT_DOCS = Path(__file__).resolve().parents[1] / "crates/apps/gioser-web/docs"

# Service endpoints and collection name; each one can be overridden through
# the environment variable named in the lookup.
DEFAULT_QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")
DEFAULT_EMBED_URL = os.environ.get("EMBEDDINGS_URL", "http://localhost:8001")
DEFAULT_COLLECTION = os.environ.get("QDRANT_COLLECTION", "gioser")

# Closed set of accepted "camino" values (compared in lowercase).
VALID_CAMINOS = {
    "logos",
    "uku",
    "kay",
    "nomos",
    "aire",
    "fuego",
    "tierra",
    "agua",
    "cuerpo",
    "sombra",
    "cosmos",
    "practica",
    "olvido",
}

# Leading `---` ... `---` frontmatter block: group 1 is the text between the
# fences, group 2 is everything after the closing fence.
FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?\n)---\s*\n(.*)$", flags=re.S)
|
|
|
|
|
|
@dataclass
class Chunk:
    """One indexable fragment of a document plus the metadata stored with it."""

    # Identifier of the source document, shared by all of its fragments.
    doc_id: str
    # Zero-based position of this fragment within its document.
    chunk_index: int
    # Fragment text; this is what gets embedded.
    text: str
    # Display title for the fragment.
    title: str
    # Lowercased "camino" category the document belongs to.
    camino: str
    # Tags taken from the document's frontmatter.
    tags: list[str]
|
|
|
|
|
|
def parse_md(path: Path) -> tuple[dict, str]:
    """Read a Markdown file and split it into YAML frontmatter and body.

    Args:
        path: Markdown file to read.

    Returns:
        A ``(meta, body)`` pair. ``meta`` is the frontmatter mapping, or an
        empty dict when the file has no leading ``---`` block, the block is
        empty, or it does not parse to a YAML mapping. ``body`` is the text
        after the frontmatter, or the whole file when no usable frontmatter
        was found.

    Raises:
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
        yaml.YAMLError: If the frontmatter block is not valid YAML.
    """
    # "utf-8-sig" reads plain UTF-8 as well, but drops a leading BOM that
    # would otherwise keep FRONTMATTER_RE from matching at the file start.
    raw = path.read_text(encoding="utf-8-sig")
    m = FRONTMATTER_RE.match(raw)
    if m is None:
        return {}, raw

    meta = yaml.safe_load(m.group(1)) or {}
    if not isinstance(meta, dict):
        # The fenced block was not a mapping (e.g. plain text between two
        # `---` rules): treat the file as having no frontmatter so callers
        # can rely on `meta.get(...)` and no text is dropped from the body.
        return {}, raw
    return meta, m.group(2)
|
|
|
|
|
|
def chunk_body(body: str, min_chars: int = 200, max_chars: int = 900) -> list[str]:
    """Split *body* into paragraph-based fragments.

    Paragraphs (separated by blank lines) are accumulated, joined by a blank
    line, until the accumulated text reaches ``min_chars``; it is then emitted
    as a fragment. If adding a paragraph would bring the accumulated text to
    ``max_chars`` or more, the text gathered so far is emitted first and the
    paragraph starts a new accumulation. A single paragraph is never split,
    so a fragment can exceed ``max_chars``.

    A short leftover at the end is appended to the last fragment, or emitted
    on its own when there is no previous fragment.

    Args:
        body: Document text without frontmatter.
        min_chars: Length at which the accumulated text is emitted.
        max_chars: Length at which a paragraph is no longer merged into the
            accumulated text.

    Returns:
        The fragments in document order; empty when *body* has no text.
    """
    pieces: list[str] = []
    pending = ""

    for block in re.split(r"\n\s*\n", body):
        para = block.strip()
        if not para:
            continue

        # Both sides are already stripped, so a plain join needs no trimming.
        merged = pending + "\n\n" + para if pending else para
        if len(merged) < max_chars:
            pending = merged
        else:
            # Merging would reach the maximum: emit what we have and let
            # this paragraph start a new accumulation.
            if pending:
                pieces.append(pending)
            pending = para

        if len(pending) >= min_chars:
            pieces.append(pending)
            pending = ""

    if not pending:
        return pieces

    if pieces and len(pending) < min_chars:
        # Short tail: attach it to the previous fragment.
        pieces[-1] = pieces[-1] + "\n\n" + pending
    else:
        pieces.append(pending)
    return pieces
|
|
|
|
|
|
def discover_chunks(docs_dir: Path) -> list[Chunk]:
    """Collect the indexable fragments of every ``*.md`` file under *docs_dir*.

    Files are visited recursively in sorted path order. A file whose
    ``camino`` (frontmatter value, or the file stem when absent) is not in
    ``VALID_CAMINOS`` is skipped with a warning on stderr.

    Args:
        docs_dir: Root directory to scan.

    Returns:
        One ``Chunk`` per fragment, grouped by file in visiting order. The
        first fragment of a document carries the plain title; later ones get
        a `` · §N`` suffix.
    """
    out: list[Chunk] = []
    for path in sorted(docs_dir.rglob("*.md")):
        meta, body = parse_md(path)
        if not isinstance(meta, dict):
            # Defensive: everything below relies on mapping access.
            meta = {}

        # str() keeps a non-string YAML value (e.g. `camino: 3`) from
        # crashing on .lower(); it simply fails the validation below.
        camino = str(meta.get("camino") or path.stem).lower()
        if camino not in VALID_CAMINOS:
            print(f"  ⚠ saltando {path}: camino '{camino}' inválido", file=sys.stderr)
            continue

        title = meta.get("title") or path.stem.replace("-", " ").title()

        # `tags: foo` parses to a scalar; list("foo") would explode it into
        # single characters, so wrap scalars instead of iterating them.
        raw_tags = meta.get("tags") or []
        if isinstance(raw_tags, (list, tuple, set)):
            tags = list(raw_tags)
        else:
            tags = [str(raw_tags)]

        # NOTE(review): the fallback id hashes the path exactly as produced
        # by rglob, so it changes with how/where docs_dir is given — confirm
        # whether ids are expected to be stable across machines.
        doc_id = meta.get("id") or hashlib.sha1(str(path).encode()).hexdigest()[:12]

        for i, chunk in enumerate(chunk_body(body)):
            out.append(
                Chunk(
                    doc_id=doc_id,
                    chunk_index=i,
                    text=chunk,
                    title=title if i == 0 else f"{title} · §{i + 1}",
                    camino=camino,
                    tags=tags,
                )
            )
    return out
|
|
|
|
|
|
def embed_batches(http: httpx.Client, embed_url: str, texts: list[str], batch: int = 32) -> list[list[float]]:
    """Request embeddings for *texts* from the embedding service, in batches.

    Each batch is POSTed to ``{embed_url}/embed`` with ``kind="passage"`` and
    ``normalize=True``; the ``"vectors"`` field of each JSON response is
    concatenated in request order.

    Args:
        http: HTTP client used for the requests.
        embed_url: Base URL of the embedding service (no trailing ``/embed``).
        texts: Texts to embed.
        batch: Maximum number of texts per request; must be at least 1.

    Returns:
        One vector per input text, in the same order as *texts*.

    Raises:
        ValueError: If *batch* is smaller than 1, or the service returns a
            different number of vectors than texts sent in a batch.
        httpx.HTTPStatusError: If the service answers with an error status.
        KeyError: If a response has no ``"vectors"`` field.
    """
    if batch < 1:
        # A negative step would make range() empty and silently embed nothing.
        raise ValueError(f"batch debe ser >= 1, recibido {batch}")

    out: list[list[float]] = []
    for start in range(0, len(texts), batch):
        chunk = texts[start : start + batch]
        r = http.post(
            f"{embed_url}/embed",
            json={"texts": chunk, "kind": "passage", "normalize": True},
            timeout=120.0,
        )
        r.raise_for_status()
        vectors = r.json()["vectors"]
        # Callers pair vectors with texts by position, so a short or long
        # answer must fail loudly instead of shifting every later pair.
        if len(vectors) != len(chunk):
            raise ValueError(
                f"el servicio de embeddings devolvió {len(vectors)} vectores para {len(chunk)} textos"
            )
        out.extend(vectors)
    return out
|
|
|
|
|
|
def ensure_collection(qdrant: QdrantClient, name: str, dim: int, rebuild: bool):
    """Make sure collection *name* exists, optionally recreating it.

    Args:
        qdrant: Client used to list, delete and create collections.
        name: Collection name.
        dim: Vector size used when the collection is created.
        rebuild: When true, an existing collection is deleted first and then
            created again.

    An existing collection is left untouched when *rebuild* is false; its
    vector size is not compared against *dim*.
    """
    present = any(col.name == name for col in qdrant.get_collections().collections)

    if present and rebuild:
        qdrant.delete_collection(name)
        present = False

    if present:
        return

    # New collections use cosine distance over `dim`-sized vectors.
    params = qm.VectorParams(size=dim, distance=qm.Distance.COSINE)
    qdrant.create_collection(collection_name=name, vectors_config=params)
|
|
|
|
|
|
def main():
    """Command-line entry point: discover, embed and upsert the docs.

    Steps:
      1. Parse CLI options (docs directory, service URLs, collection name,
         ``--rebuild``).
      2. Discover the fragments under the docs directory.
      3. Query the embedding service's ``/health`` for model and dimension,
         then embed every fragment.
      4. Ensure the Qdrant collection exists (recreating it on
         ``--rebuild``) and upsert one point per fragment.

    Exits with an error message when the docs directory does not exist, no
    fragments are found, or the number of vectors does not match the number
    of fragments.
    """
    # Points sent per upsert request, to keep request bodies bounded.
    upsert_batch = 256

    ap = argparse.ArgumentParser(description="Indexa docs/ de gioser-web en Qdrant")
    ap.add_argument("--docs", default=str(DEFAULT_DOCS))
    ap.add_argument("--qdrant", default=DEFAULT_QDRANT_URL)
    ap.add_argument("--embed", default=DEFAULT_EMBED_URL)
    ap.add_argument("--collection", default=DEFAULT_COLLECTION)
    ap.add_argument("--rebuild", action="store_true", help="borra y recrea la colección")
    args = ap.parse_args()

    docs_dir = Path(args.docs)
    if not docs_dir.is_dir():
        sys.exit(f"docs no existe: {docs_dir}")

    chunks = discover_chunks(docs_dir)
    if not chunks:
        sys.exit("no se encontraron docs para indexar")
    print(f"→ {len(chunks)} fragmentos descubiertos")

    with httpx.Client() as http:
        health_resp = http.get(f"{args.embed}/health", timeout=10.0)
        # Fail on an HTTP error here rather than on a confusing KeyError below.
        health_resp.raise_for_status()
        health = health_resp.json()
        dim = int(health["dim"])
        print(f"→ embeddings: {health['model']} (dim={dim})")

        # Embed BEFORE touching Qdrant: with --rebuild the collection is
        # deleted, so an embedding failure must not leave it empty.
        vectors = embed_batches(http, args.embed, [c.text for c in chunks])

    # zip() would silently drop fragments on a mismatch.
    if len(vectors) != len(chunks):
        sys.exit(f"embeddings incompletos: {len(vectors)} vectores para {len(chunks)} fragmentos")

    qdrant = QdrantClient(url=args.qdrant)
    ensure_collection(qdrant, args.collection, dim, rebuild=args.rebuild)

    points = [
        qm.PointStruct(
            # Deterministic id: re-indexing the same fragment overwrites it.
            id=str(uuid.uuid5(uuid.NAMESPACE_URL, f"{c.doc_id}:{c.chunk_index}")),
            vector=v,
            payload={
                "doc_id": c.doc_id,
                "chunk_index": c.chunk_index,
                "title": c.title,
                "text": c.text,
                "camino": c.camino,
                "tags": c.tags,
                "source": "gioser-web",
            },
        )
        for c, v in zip(chunks, vectors)
    ]
    for start in range(0, len(points), upsert_batch):
        qdrant.upsert(
            collection_name=args.collection,
            points=points[start : start + upsert_batch],
        )
    print(f"✓ {len(points)} puntos en colección '{args.collection}'")

    if args.rebuild:
        print("  ✔ colección recreada")
|
|
|
|
|
|
# Run the indexer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|