"""
RAG ingestion pipeline: read, chunk, embed, and store into the vector store —
with support for multiple knowledge bases.

Sources:
1. Per-KB tone file — sensor_uuid=__global__, kb_name=chat|irrigation|fertilization
2. The three knowledge bases — sensor_uuid=__global__, kb_name=chat|irrigation|fertilization
3. Per-user soil + weather data from the DB — sensor_uuid=uuid, kb_name=__all__
"""
import uuid
from pathlib import Path

from .chunker import chunk_text, chunk_texts
from .config import load_rag_config, RAGConfig
from .embedding import embed_texts
from .observability import classify_exception, log_event, observe_operation, record_metric
from .user_data import load_user_sources, build_user_weather_text
from .vector_store import QdrantVectorStore
# File suffixes treated as ingestable plain-text sources.
TEXT_EXTENSIONS = {".txt", ".md", ".rst", ".json"}

# Sentinel sensor_uuid for knowledge-base content shared by all users.
SENSOR_UUID_GLOBAL = "__global__"

# Sentinel kb_name for per-user data that is visible to every knowledge base.
KB_NAME_ALL = "__all__"
def _resolve_path(base: Path, p: str) -> Path:
|
|
"""تبدیل مسیر نسبی به مطلق نسبت به base پروژه."""
|
|
path = Path(p)
|
|
if not path.is_absolute():
|
|
path = base / path
|
|
return path
|
|
|
|
|
|
def _load_file(path: Path) -> str | None:
|
|
"""خواندن یک فایل متنی."""
|
|
if not path.exists() or not path.is_file():
|
|
return None
|
|
try:
|
|
return path.read_text(encoding="utf-8").strip()
|
|
except Exception as exc:
|
|
failure = classify_exception(exc)
|
|
log_event(
|
|
level=40,
|
|
message="rag ingest file load failed",
|
|
source="rag.ingest",
|
|
provider=None,
|
|
operation="load_file",
|
|
result_status="error",
|
|
error_code=failure.error_code,
|
|
path=str(path),
|
|
)
|
|
record_metric("rag.ingest.file_load_failure", error_code=failure.error_code)
|
|
return None
|
|
|
|
|
|
def _load_files_from_dir(dir_path: Path, prefix: str = "kb") -> list[tuple[str, str]]:
    """
    Collect every readable text file under *dir_path*, recursively.

    Returns:
        [(source_id, content), ...] where source_id is "<prefix>:<relative path>".
    """
    if not (dir_path.exists() and dir_path.is_dir()):
        return []
    results: list[tuple[str, str]] = []
    # Sorted traversal keeps source_id ordering deterministic across runs.
    for entry in sorted(dir_path.rglob("*")):
        if not entry.is_file() or entry.suffix.lower() not in TEXT_EXTENSIONS:
            continue
        body = _load_file(entry)
        if body:
            results.append((f"{prefix}:{entry.relative_to(dir_path)}", body))
    return results
def load_sources(
    config: RAGConfig | None = None,
    kb_name: str | None = None,
) -> list[tuple[str, str, str, str]]:
    """
    Load all ingestion sources: per-KB tone files, the knowledge bases
    themselves, and per-user data from the DB helpers.

    Args:
        config: RAG configuration; loaded via load_rag_config() when None.
        kb_name: when given, only that knowledge base is loaded.

    Returns:
        [(source_id, content, sensor_uuid, kb_name), ...]
    """
    cfg = config or load_rag_config()
    # Project base: this file lives one package level below it.
    base = Path(__file__).resolve().parent.parent
    sources: list[tuple[str, str, str, str]] = []

    kbs_to_load = cfg.knowledge_bases.items()
    if kb_name:
        kbs_to_load = [(k, v) for k, v in kbs_to_load if k == kb_name]

    for kbn, kb_cfg in kbs_to_load:
        # 1) Tone file for this knowledge base (global scope).
        tone_path = _resolve_path(base, kb_cfg.tone_file)
        content = _load_file(tone_path)
        if content:
            sources.append((f"tone:{kbn}", content, SENSOR_UUID_GLOBAL, kbn))

        # 2) KB content: a directory of text files, or a single file.
        kb_path = _resolve_path(base, kb_cfg.path)
        for sid, c in _load_files_from_dir(kb_path, prefix=f"kb:{kbn}"):
            sources.append((sid, c, SENSOR_UUID_GLOBAL, kbn))
        if kb_path.is_file():
            content = _load_file(kb_path)
            if content:
                sources.append((f"kb:{kbn}:{kb_path.name}", content, SENSOR_UUID_GLOBAL, kbn))

    # 3) Per-user sources; the sensor UUID is encoded in the source id.
    for sid, content in load_user_sources():
        # BUG FIX: str.replace() would strip the marker anywhere in the id,
        # corrupting any id that happens to contain "user:"/"weather:" later
        # in the string. removeprefix() only removes the leading tag.
        if sid.startswith("user:"):
            sensor_uuid = sid.removeprefix("user:")
        elif sid.startswith("weather:"):
            sensor_uuid = sid.removeprefix("weather:")
        else:
            sensor_uuid = sid
        sources.append((sid, content, sensor_uuid, KB_NAME_ALL))

    return sources
def ingest(
    recreate: bool = False,
    config: RAGConfig | None = None,
    kb_name: str | None = None,
) -> dict:
    """
    Full ingestion pipeline: load sources, chunk, embed, and upsert into the
    vector store.

    Args:
        recreate: when True, rebuild the collection from scratch first.
        config: RAG configuration (defaults loaded when None).
        kb_name: optional knowledge-base name (chat/irrigation/fertilization);
            when given only that KB is ingested.

    Returns:
        Ingestion stats: chunk count, source ids, and an "error" key on failure.
    """
    cfg = config or load_rag_config()
    store = QdrantVectorStore(config=cfg)
    with observe_operation(source="rag.ingest", provider=cfg.embedding.provider, operation="ingest"):
        if recreate:
            store.ensure_collection(recreate=True)

        sources = load_sources(config=cfg, kb_name=kb_name)
        if not sources:
            record_metric("rag.ingest.empty_sources", kb_name=kb_name)
            return {"chunks_added": 0, "sources": [], "error": "هیچ منبعی یافت نشد"}

        # Accumulate chunk ids, texts and per-chunk metadata in lockstep.
        chunk_ids: list[str] = []
        chunk_bodies: list[str] = []
        chunk_metas: list[dict] = []
        for source_id, content, sensor_uuid, src_kb in sources:
            for idx, piece in enumerate(chunk_text(content, config=cfg)):
                chunk_ids.append(str(uuid.uuid4()))
                chunk_bodies.append(piece)
                chunk_metas.append(
                    {
                        "source": source_id,
                        "chunk_index": idx,
                        "sensor_uuid": sensor_uuid,
                        "kb_name": src_kb,
                    }
                )

        if not chunk_bodies:
            record_metric("rag.ingest.empty_chunks", kb_name=kb_name)
            return {"chunks_added": 0, "sources": [s[0] for s in sources], "error": "هیچ چانکی ساخته نشد"}

        embeddings = embed_texts(chunk_bodies, config=cfg)
        if len(embeddings) != len(chunk_bodies):
            record_metric("rag.ingest.embedding_mismatch", kb_name=kb_name)
            return {
                "chunks_added": 0,
                "sources": [s[0] for s in sources],
                "error": f"تعداد embed با چانکها مطابقت ندارد: {len(embeddings)} vs {len(chunk_bodies)}",
            }

        store.add_documents(
            ids=chunk_ids,
            embeddings=embeddings,
            documents=chunk_bodies,
            metadatas=chunk_metas,
        )
        record_metric("rag.ingest.success", kb_name=kb_name, chunks=len(chunk_bodies))
        return {
            "chunks_added": len(chunk_bodies),
            "sources": [s[0] for s in sources],
        }