91 lines
2.4 KiB
Python
91 lines
2.4 KiB
Python
|
|
"""
|
||
|
|
منطق اصلی indexing: embed کردن chunks و ذخیره در ChromaDB.
|
||
|
|
"""
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
from .chunks import build_all_chunks
|
||
|
|
from .rag_settings import RAGConfig
|
||
|
|
from .embeddings import get_embedder
|
||
|
|
|
||
|
|
|
||
|
|
# Fallback collection name; build_index uses it when
# config.chromadb.collection_name is empty or otherwise falsy.
COLLECTION_NAME = "croplogic_kb"
|
||
|
|
|
||
|
|
|
||
|
|
def build_index(config: RAGConfig) -> int:
    """
    Build (or fully rebuild) the knowledge-base index.

    Chunks are produced by ``build_all_chunks`` (per the original docstring
    they come from soil_data, sensor_data and the tone file), embedded in
    batches and stored in a freshly created ChromaDB collection. An existing
    collection with the same name is dropped before the new one is created.

    Args:
        config: RAG configuration. Fields read here: ``tone_file``,
            ``chunking.max_chunk_tokens``, ``chunking.overlap_tokens``,
            ``embedding.batch_size``, ``chromadb.persist_directory`` and
            ``chromadb.collection_name``.

    Returns:
        Number of documents added. ``0`` when no chunks were produced, in
        which case ChromaDB is not touched at all.

    Raises:
        ValueError: If ``config.embedding.batch_size`` is missing or less
            than 1, or if the embedder returned a different number of
            vectors than there are texts.
    """
    chunks = build_all_chunks(
        tone_path=Path(config.tone_file),
        max_chunk_tokens=config.chunking.max_chunk_tokens,
        overlap_tokens=config.chunking.overlap_tokens,
    )
    if not chunks:
        return 0

    # Each chunk is a (text, metadata) pair.
    texts = [text for text, _ in chunks]
    metadatas = [_serialize_meta(meta) for _, meta in chunks]

    # Validate up front: a zero step makes range() raise an opaque error and a
    # negative step silently yields no batches at all.
    batch_size = config.embedding.batch_size
    if not batch_size or batch_size < 1:
        raise ValueError(
            f"config.embedding.batch_size must be >= 1, got {batch_size!r}"
        )

    embedder = get_embedder(config)
    all_embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start : start + batch_size]
        all_embeddings.extend(embedder.encode(batch, batch_size=batch_size))

    # Check before the existing collection is dropped, so a faulty embedder
    # cannot leave the store without an index.
    if len(all_embeddings) != len(texts):
        raise ValueError(
            f"embedder returned {len(all_embeddings)} vectors "
            f"for {len(texts)} texts"
        )

    # ChromaDB
    persist_dir = Path(config.chromadb.persist_directory)
    persist_dir.mkdir(parents=True, exist_ok=True)

    # Imported lazily, as in the original, so the early return above does not
    # require chromadb to be importable.
    import logging

    import chromadb
    from chromadb.config import Settings as ChromaSettings

    logger = logging.getLogger(__name__)

    client = chromadb.PersistentClient(
        path=str(persist_dir),
        settings=ChromaSettings(anonymized_telemetry=False),
    )

    collection_name = config.chromadb.collection_name or COLLECTION_NAME

    # Full rebuild: drop any previous collection. Deletion stays best-effort
    # because it is expected to fail when the collection does not exist yet.
    # NOTE(review): the exact exception type raised for a missing collection
    # is not visible from this file - confirm against the pinned chromadb
    # version and narrow the except clause accordingly.
    try:
        client.delete_collection(collection_name)
    except Exception:
        logger.debug(
            "Could not delete collection %r before rebuild",
            collection_name,
            exc_info=True,
        )

    collection = client.create_collection(
        name=collection_name,
        metadata={"hnsw:space": "cosine"},
    )

    ids = [f"doc_{i}" for i in range(len(texts))]
    collection.add(
        ids=ids,
        embeddings=all_embeddings,
        documents=texts,
        metadatas=metadatas,
    )

    return len(texts)


def _serialize_meta(meta: dict) -> dict:
    """Convert one metadata dict to the scalar-only form stored in ChromaDB.

    Keys whose value is ``None`` are dropped; ``str``, ``int``, ``float`` and
    ``bool`` values are kept unchanged; every other value is replaced by its
    ``str()`` representation.
    """
    return {
        key: value if isinstance(value, (str, int, float, bool)) else str(value)
        for key, value in meta.items()
        if value is not None
    }
|