This commit is contained in:
2026-05-05 21:02:12 +03:30
parent 5301071df5
commit 1679825ae2
47 changed files with 1347 additions and 1403 deletions
+41 -23
View File
@@ -12,6 +12,7 @@ from pathlib import Path
from .chunker import chunk_text, chunk_texts
from .config import load_rag_config, RAGConfig
from .embedding import embed_texts
from .observability import classify_exception, log_event, observe_operation, record_metric
from .user_data import load_user_sources, build_user_weather_text
from .vector_store import QdrantVectorStore
@@ -36,7 +37,19 @@ def _load_file(path: Path) -> str | None:
return None
try:
return path.read_text(encoding="utf-8").strip()
except Exception:
except Exception as exc:
failure = classify_exception(exc)
log_event(
level=40,
message="rag ingest file load failed",
source="rag.ingest",
provider=None,
operation="load_file",
result_status="error",
error_code=failure.error_code,
path=str(path),
)
record_metric("rag.ingest.file_load_failure", error_code=failure.error_code)
return None
@@ -122,12 +135,14 @@ def ingest(
"""
cfg = config or load_rag_config()
store = QdrantVectorStore(config=cfg)
if recreate:
store.ensure_collection(recreate=True)
with observe_operation(source="rag.ingest", provider=cfg.embedding.provider, operation="ingest"):
if recreate:
store.ensure_collection(recreate=True)
sources = load_sources(config=cfg, kb_name=kb_name)
if not sources:
return {"chunks_added": 0, "sources": [], "error": "هیچ منبعی یافت نشد"}
sources = load_sources(config=cfg, kb_name=kb_name)
if not sources:
record_metric("rag.ingest.empty_sources", kb_name=kb_name)
return {"chunks_added": 0, "sources": [], "error": "هیچ منبعی یافت نشد"}
all_chunks: list[str] = []
all_metas: list[dict] = []
@@ -146,24 +161,27 @@ def ingest(
"kb_name": src_kb,
})
if not all_chunks:
return {"chunks_added": 0, "sources": [s[0] for s in sources], "error": "هیچ چانکی ساخته نشد"}
if not all_chunks:
record_metric("rag.ingest.empty_chunks", kb_name=kb_name)
return {"chunks_added": 0, "sources": [s[0] for s in sources], "error": "هیچ چانکی ساخته نشد"}
embeddings = embed_texts(all_chunks, config=cfg)
if len(embeddings) != len(all_chunks):
embeddings = embed_texts(all_chunks, config=cfg)
if len(embeddings) != len(all_chunks):
record_metric("rag.ingest.embedding_mismatch", kb_name=kb_name)
return {
"chunks_added": 0,
"sources": [s[0] for s in sources],
"error": f"تعداد embed با چانک‌ها مطابقت ندارد: {len(embeddings)} vs {len(all_chunks)}",
}
store.add_documents(
ids=all_ids,
embeddings=embeddings,
documents=all_chunks,
metadatas=all_metas,
)
record_metric("rag.ingest.success", kb_name=kb_name, chunks=len(all_chunks))
return {
"chunks_added": 0,
"chunks_added": len(all_chunks),
"sources": [s[0] for s in sources],
"error": f"تعداد embed با چانک‌ها مطابقت ندارد: {len(embeddings)} vs {len(all_chunks)}",
}
store.add_documents(
ids=all_ids,
embeddings=embeddings,
documents=all_chunks,
metadatas=all_metas,
)
return {
"chunks_added": len(all_chunks),
"sources": [s[0] for s in sources],
}