Add Qdrant and ChromaDB support to the project
- Added Qdrant service to both docker-compose files for production and development. - Updated environment variables in .env.example and settings.py to include Qdrant configuration. - Included necessary dependencies for Qdrant and ChromaDB in requirements.txt. - Updated .gitignore to exclude ChromaDB data files.
This commit is contained in:
@@ -0,0 +1,90 @@
|
||||
"""
|
||||
منطق اصلی indexing: embed کردن chunks و ذخیره در ChromaDB.
|
||||
"""
|
||||
from pathlib import Path
|
||||
|
||||
from .chunks import build_all_chunks
|
||||
from .rag_settings import RAGConfig
|
||||
from .embeddings import get_embedder
|
||||
|
||||
|
||||
# Fallback ChromaDB collection name, used by build_index when
# config.chromadb.collection_name is empty or unset.
COLLECTION_NAME = "croplogic_kb"
|
||||
|
||||
|
||||
# Upper bound on the number of records sent per collection.add() call.
# ChromaDB rejects oversized batches; the exact limit depends on the
# ChromaDB/SQLite build, so stay well below it.
_ADD_BATCH_SIZE = 1000


def _serialize_meta(meta: dict) -> dict:
    """Return a copy of *meta* holding only ChromaDB-compatible values.

    ``None`` values are dropped, str/int/float/bool values are kept as-is,
    and any other value is stored as ``str(value)``.
    """
    return {
        key: value if isinstance(value, (str, int, float, bool)) else str(value)
        for key, value in meta.items()
        if value is not None
    }


def build_index(config: RAGConfig) -> int:
    """Build (or fully rebuild) the knowledge-base index.

    Chunks are produced by ``build_all_chunks`` (the tone file path and the
    chunking limits come from *config*), embedded in batches, and stored in a
    freshly created ChromaDB collection. Any existing collection with the
    same name is dropped first, so the result is always a full rebuild.

    Args:
        config: RAG configuration; this function reads ``tone_file``,
            ``chunking``, ``embedding.batch_size`` and ``chromadb``.

    Returns:
        The number of documents added to the collection (0 when there are no
        chunks, in which case the existing collection is left untouched).

    Raises:
        ValueError: If ``embedding.batch_size`` is not positive, or if the
            embedder returns a different number of vectors than texts.
    """
    import logging

    logger = logging.getLogger(__name__)

    chunks = build_all_chunks(
        tone_path=Path(config.tone_file),
        max_chunk_tokens=config.chunking.max_chunk_tokens,
        overlap_tokens=config.chunking.overlap_tokens,
    )
    if not chunks:
        return 0

    texts = [text for text, _ in chunks]
    # NOTE(review): a chunk whose metadata values are all None serializes to
    # {}; some ChromaDB versions reject empty metadata dicts - confirm
    # against the pinned version.
    metadatas = [_serialize_meta(meta) for _, meta in chunks]

    # Validate up front: a zero step makes range() fail with an opaque
    # message, and a negative step would silently embed nothing.
    batch_size = config.embedding.batch_size
    if batch_size <= 0:
        raise ValueError(
            f"embedding.batch_size must be a positive integer, got {batch_size!r}"
        )

    embedder = get_embedder(config)
    all_embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start : start + batch_size]
        all_embeddings.extend(embedder.encode(batch, batch_size=batch_size))

    if len(all_embeddings) != len(texts):
        raise ValueError(
            f"embedder returned {len(all_embeddings)} vectors "
            f"for {len(texts)} texts"
        )

    # ChromaDB is imported lazily so importing this module does not require it.
    persist_dir = Path(config.chromadb.persist_directory)
    persist_dir.mkdir(parents=True, exist_ok=True)

    import chromadb
    from chromadb.config import Settings as ChromaSettings

    client = chromadb.PersistentClient(
        path=str(persist_dir),
        settings=ChromaSettings(anonymized_telemetry=False),
    )

    collection_name = config.chromadb.collection_name or COLLECTION_NAME
    try:
        client.delete_collection(collection_name)
    except Exception as exc:
        # Best effort: the collection usually just does not exist yet, and the
        # exception type raised for that differs between ChromaDB versions.
        # Any real failure resurfaces in create_collection below.
        logger.debug(
            "Collection %r was not deleted before rebuild: %s",
            collection_name,
            exc,
        )

    collection = client.create_collection(
        name=collection_name,
        metadata={"hnsw:space": "cosine"},
    )

    ids = [f"doc_{i}" for i in range(len(texts))]
    # Write in bounded slices so large knowledge bases do not exceed
    # ChromaDB's per-call record limit.
    for start in range(0, len(texts), _ADD_BATCH_SIZE):
        stop = start + _ADD_BATCH_SIZE
        collection.add(
            ids=ids[start:stop],
            embeddings=all_embeddings[start:stop],
            documents=texts[start:stop],
            metadatas=metadatas[start:stop],
        )

    return len(texts)
|
||||
Reference in New Issue
Block a user