""" منطق اصلی indexing: embed کردن chunks و ذخیره در ChromaDB. """ from pathlib import Path from .chunks import build_all_chunks from .rag_settings import RAGConfig from .embeddings import get_embedder COLLECTION_NAME = "croplogic_kb" def build_index(config: RAGConfig) -> int: """ ساخت/بازسازی کامل index پایگاه دانش. chunks را از soil_data، sensor_data و فایل لحن تولید، embed و در ChromaDB ذخیره می‌کند. Returns: تعداد documentهای اضافه شده. """ tone_path = Path(config.tone_file) chunks = build_all_chunks( tone_path=tone_path, max_chunk_tokens=config.chunking.max_chunk_tokens, overlap_tokens=config.chunking.overlap_tokens, ) if not chunks: return 0 texts = [t for t, _ in chunks] metadatas = [m for _, m in chunks] # تبدیل metadata به فرمت ChromaDB (فقط str, int, float) def _serialize_meta(m: dict) -> dict: out = {} for k, v in m.items(): if v is None: continue if isinstance(v, (str, int, float, bool)): out[k] = v else: out[k] = str(v) return out metadatas = [_serialize_meta(m) for m in metadatas] embedder = get_embedder(config) batch_size = config.embedding.batch_size all_embeddings = [] for i in range(0, len(texts), batch_size): batch = texts[i : i + batch_size] embs = embedder.encode(batch, batch_size=batch_size) all_embeddings.extend(embs) # ChromaDB persist_dir = Path(config.chromadb.persist_directory) persist_dir.mkdir(parents=True, exist_ok=True) import chromadb from chromadb.config import Settings as ChromaSettings client = chromadb.PersistentClient( path=str(persist_dir), settings=ChromaSettings(anonymized_telemetry=False), ) collection_name = config.chromadb.collection_name or COLLECTION_NAME try: client.delete_collection(collection_name) except Exception: pass collection = client.create_collection( name=collection_name, metadata={"hnsw:space": "cosine"}, ) ids = [f"doc_{i}" for i in range(len(texts))] collection.add( ids=ids, embeddings=all_embeddings, documents=texts, metadatas=metadatas, ) return len(texts)