rag/ingest.py

"""
پایپ‌لاین ورودی RAG: خواندن، چانک، embed و ذخیره در vector store — با پشتیبانی از چند پایگاه دانش

منابع:
۱. لحن هر پایگاه دانش (tone) — sensor_uuid=__global__, kb_name=chat|irrigation|fertilization
۲. پایگاه‌های دانش سه‌گانه — sensor_uuid=__global__, kb_name=chat|irrigation|fertilization
۳. دیتای خاک + هواشناسی هر کاربر از DB — sensor_uuid=uuid, kb_name=__all__
"""
import uuid
from pathlib import Path

from .chunker import chunk_text, chunk_texts
from .config import load_rag_config, RAGConfig
from .embedding import embed_texts
from .observability import classify_exception, log_event, observe_operation, record_metric
from .user_data import load_user_sources, build_user_weather_text
from .vector_store import QdrantVectorStore

TEXT_EXTENSIONS = {".txt", ".md", ".rst", ".json"}

SENSOR_UUID_GLOBAL = "__global__"

KB_NAME_ALL = "__all__"


def _resolve_path(base: Path, p: str) -> Path:
    """Return ``p`` as a path, anchored under ``base`` when it is relative.

    Absolute paths are returned as given. No filesystem access or
    normalization is performed.
    """
    candidate = Path(p)
    return candidate if candidate.is_absolute() else base / candidate


def _load_file(path: Path) -> str | None:
    """خواندن یک فایل متنی."""
    if not path.exists() or not path.is_file():
        return None
    try:
        return path.read_text(encoding="utf-8").strip()
    except Exception as exc:
        failure = classify_exception(exc)
        log_event(
            level=40,
            message="rag ingest file load failed",
            source="rag.ingest",
            provider=None,
            operation="load_file",
            result_status="error",
            error_code=failure.error_code,
            path=str(path),
        )
        record_metric("rag.ingest.file_load_failure", error_code=failure.error_code)
        return None


def _load_files_from_dir(dir_path: Path, prefix: str = "kb") -> list[tuple[str, str]]:
    """Recursively load every text file found under ``dir_path``.

    Only regular files whose lower-cased suffix is in ``TEXT_EXTENSIONS`` are
    read. Files that fail to load or are empty after stripping are skipped.
    Entries are visited in sorted path order.

    Args:
        dir_path: Directory to scan.
        prefix: Tag placed before the file's path relative to ``dir_path`` to
            form the source id (``"<prefix>:<relative path>"``).

    Returns:
        A list of ``(source_id, content)`` pairs; empty when ``dir_path`` is
        missing or is not a directory.
    """
    if not (dir_path.exists() and dir_path.is_dir()):
        return []

    collected: list[tuple[str, str]] = []
    for candidate in sorted(dir_path.rglob("*")):
        if not candidate.is_file():
            continue
        if candidate.suffix.lower() not in TEXT_EXTENSIONS:
            continue
        text = _load_file(candidate)
        if not text:
            continue
        collected.append((f"{prefix}:{candidate.relative_to(dir_path)}", text))
    return collected


def load_sources(
    config: RAGConfig | None = None,
    kb_name: str | None = None,
) -> list[tuple[str, str, str, str]]:
    """Collect every ingestable source: tone files, knowledge-base files and user data.

    For each configured knowledge base, the tone file and the KB content
    (a directory scanned recursively, or a single file) are loaded and tagged
    with ``SENSOR_UUID_GLOBAL``. Entries returned by ``load_user_sources()``
    are appended afterwards and tagged with ``KB_NAME_ALL``; they are appended
    even when ``kb_name`` narrows the knowledge bases.

    Args:
        config: RAG configuration; loaded via ``load_rag_config()`` when omitted.
        kb_name: When given, only the knowledge base with exactly this name is
            loaded.

    Returns:
        A list of ``(source_id, content, sensor_uuid, kb_name)`` tuples.
    """
    cfg = config or load_rag_config()
    # Relative paths from the config are resolved against the parent of this
    # package's directory.
    base = Path(__file__).resolve().parent.parent
    sources: list[tuple[str, str, str, str]] = []

    kbs_to_load = cfg.knowledge_bases.items()
    if kb_name:
        kbs_to_load = [(k, v) for k, v in kbs_to_load if k == kb_name]

    for kbn, kb_cfg in kbs_to_load:
        tone_path = _resolve_path(base, kb_cfg.tone_file)
        content = _load_file(tone_path)
        if content:
            sources.append((f"tone:{kbn}", content, SENSOR_UUID_GLOBAL, kbn))

        kb_path = _resolve_path(base, kb_cfg.path)
        # A directory yields one source per text file; a plain file yields itself.
        for sid, c in _load_files_from_dir(kb_path, prefix=f"kb:{kbn}"):
            sources.append((sid, c, SENSOR_UUID_GLOBAL, kbn))
        if kb_path.is_file():
            content = _load_file(kb_path)
            if content:
                sources.append((f"kb:{kbn}:{kb_path.name}", content, SENSOR_UUID_GLOBAL, kbn))

    for sid, content in load_user_sources():
        # Strip only the leading tag. str.replace() would also delete any later
        # occurrence of "user:" / "weather:" inside the id and corrupt it.
        sensor_uuid = sid
        for tag in ("user:", "weather:"):
            if sid.startswith(tag):
                sensor_uuid = sid.removeprefix(tag)
                break
        sources.append((sid, content, sensor_uuid, KB_NAME_ALL))

    return sources


def ingest(
    recreate: bool = False,
    config: RAGConfig | None = None,
    kb_name: str | None = None,
) -> dict:
    """Run the full ingest pipeline: load sources, chunk, embed and store.

    All sources are chunked first; the combined chunk list is then embedded in
    one call and written to the vector store in one call. Every chunk gets a
    fresh random UUID as its id, so ids are not stable across runs.

    Args:
        recreate: When True, the collection is recreated before ingesting.
        config: RAG configuration; loaded via ``load_rag_config()`` when omitted.
        kb_name: Optional knowledge-base name; when given, only that knowledge
            base is loaded (see ``load_sources``).

    Returns:
        A dict with ``chunks_added`` and ``sources`` (the ids of all loaded
        sources). When nothing was stored, ``chunks_added`` is 0 and an
        ``error`` message is included.
    """
    cfg = config or load_rag_config()
    store = QdrantVectorStore(config=cfg)
    # The whole pipeline runs inside the observed operation so that chunking,
    # embedding and storing are covered, not only source loading.
    with observe_operation(source="rag.ingest", provider=cfg.embedding.provider, operation="ingest"):
        if recreate:
            store.ensure_collection(recreate=True)

        sources = load_sources(config=cfg, kb_name=kb_name)
        if not sources:
            record_metric("rag.ingest.empty_sources", kb_name=kb_name)
            return {"chunks_added": 0, "sources": [], "error": "هیچ منبعی یافت نشد"}

        source_ids = [s[0] for s in sources]

        all_chunks: list[str] = []
        all_metas: list[dict] = []
        all_ids: list[str] = []

        for source_id, content, sensor_uuid, src_kb in sources:
            for i, ch in enumerate(chunk_text(content, config=cfg)):
                all_ids.append(str(uuid.uuid4()))
                all_chunks.append(ch)
                all_metas.append({
                    "source": source_id,
                    "chunk_index": i,
                    "sensor_uuid": sensor_uuid,
                    "kb_name": src_kb,
                })

        # Everything below must run once, after ALL sources have been chunked.
        # Nesting it in the loop above would return after the first source.
        if not all_chunks:
            record_metric("rag.ingest.empty_chunks", kb_name=kb_name)
            return {"chunks_added": 0, "sources": source_ids, "error": "هیچ چانکی ساخته نشد"}

        embeddings = embed_texts(all_chunks, config=cfg)
        if len(embeddings) != len(all_chunks):
            # Storing misaligned vectors would attach embeddings to the wrong text.
            record_metric("rag.ingest.embedding_mismatch", kb_name=kb_name)
            return {
                "chunks_added": 0,
                "sources": source_ids,
                "error": f"تعداد embed با چانک‌ها مطابقت ندارد: {len(embeddings)} vs {len(all_chunks)}",
            }

        store.add_documents(
            ids=all_ids,
            embeddings=embeddings,
            documents=all_chunks,
            metadatas=all_metas,
        )
        record_metric("rag.ingest.success", kb_name=kb_name, chunks=len(all_chunks))
        return {
            "chunks_added": len(all_chunks),
            "sources": source_ids,
        }
Add Qdrant and ChromaDB support to the project 2026-02-27 19:37:02 +03:30			`"""`
first commit 2026-03-19 22:54:29 +03:30			`پایپ‌لاین ورودی RAG: خواندن، چانک، embed و ذخیره در vector store — با پشتیبانی از چند پایگاه دانش`
Add Qdrant and ChromaDB support to the project 2026-02-27 19:37:02 +03:30
first commit 2026-03-19 22:54:29 +03:30			`منابع:`
			`۱. لحن هر پایگاه دانش (tone) — sensor_uuid=__global__, kb_name=chat\|irrigation\|fertilization`
			`۲. پایگاه‌های دانش سه‌گانه — sensor_uuid=__global__, kb_name=chat\|irrigation\|fertilization`
			`۳. دیتای خاک + هواشناسی هر کاربر از DB — sensor_uuid=uuid, kb_name=__all__`
Add Qdrant and ChromaDB support to the project 2026-02-27 19:37:02 +03:30			`"""`
			`import uuid`
			`from pathlib import Path`

			`from .chunker import chunk_text, chunk_texts`
			`from .config import load_rag_config, RAGConfig`
			`from .embedding import embed_texts`
UPDATE 2026-05-05 21:02:12 +03:30			`from .observability import classify_exception, log_event, observe_operation, record_metric`
first commit 2026-03-19 22:54:29 +03:30			`from .user_data import load_user_sources, build_user_weather_text`
Add Qdrant and ChromaDB support to the project 2026-02-27 19:37:02 +03:30			`from .vector_store import QdrantVectorStore`

			`TEXT_EXTENSIONS = {".txt", ".md", ".rst", ".json"}`

Refactor user data handling and enhance chat functionality 2026-02-27 20:06:46 +03:30			`SENSOR_UUID_GLOBAL = "__global__"`

first commit 2026-03-19 22:54:29 +03:30			`KB_NAME_ALL = "__all__"`

Add Qdrant and ChromaDB support to the project 2026-02-27 19:37:02 +03:30
			`def _resolve_path(base: Path, p: str) -> Path:`
			`"""تبدیل مسیر نسبی به مطلق نسبت به base پروژه."""`
			`path = Path(p)`
			`if not path.is_absolute():`
			`path = base / path`
			`return path`


			`def _load_file(path: Path) -> str \| None:`
			`"""خواندن یک فایل متنی."""`
			`if not path.exists() or not path.is_file():`
			`return None`
			`try:`
			`return path.read_text(encoding="utf-8").strip()`
UPDATE 2026-05-05 21:02:12 +03:30			`except Exception as exc:`
			`failure = classify_exception(exc)`
			`log_event(`
			`level=40,`
			`message="rag ingest file load failed",`
			`source="rag.ingest",`
			`provider=None,`
			`operation="load_file",`
			`result_status="error",`
			`error_code=failure.error_code,`
			`path=str(path),`
			`)`
			`record_metric("rag.ingest.file_load_failure", error_code=failure.error_code)`
Add Qdrant and ChromaDB support to the project 2026-02-27 19:37:02 +03:30			`return None`


			`def _load_files_from_dir(dir_path: Path, prefix: str = "kb") -> list[tuple[str, str]]:`
			`"""`
			`خواندن همه فایل‌های متنی از یک دایرکتوری.`
			`Returns: [(source_id, content), ...]`
			`"""`
			`if not dir_path.exists() or not dir_path.is_dir():`
			`return []`
			`out: list[tuple[str, str]] = []`
			`for f in sorted(dir_path.rglob("*")):`
			`if f.is_file() and f.suffix.lower() in TEXT_EXTENSIONS:`
			`rel = f.relative_to(dir_path)`
			`source_id = f"{prefix}:{rel}"`
			`content = _load_file(f)`
			`if content:`
			`out.append((source_id, content))`
			`return out`


first commit 2026-03-19 22:54:29 +03:30			`def load_sources(`
			`config: RAGConfig \| None = None,`
			`kb_name: str \| None = None,`
			`) -> list[tuple[str, str, str, str]]:`
Add Qdrant and ChromaDB support to the project 2026-02-27 19:37:02 +03:30			`"""`
first commit 2026-03-19 22:54:29 +03:30			`بارگذاری منابع: لحن‌ها، پایگاه‌های دانش سه‌گانه، دیتای کاربران.`
			`اگر kb_name مشخص شود، فقط آن پایگاه دانش لود می‌شود.`
Add Qdrant and ChromaDB support to the project 2026-02-27 19:37:02 +03:30
			`Returns:`
first commit 2026-03-19 22:54:29 +03:30			`[(source_id, content, sensor_uuid, kb_name), ...]`
Add Qdrant and ChromaDB support to the project 2026-02-27 19:37:02 +03:30			`"""`
			`cfg = config or load_rag_config()`
			`base = Path(__file__).resolve().parent.parent`
first commit 2026-03-19 22:54:29 +03:30			`sources: list[tuple[str, str, str, str]] = []`

			`kbs_to_load = cfg.knowledge_bases.items()`
			`if kb_name:`
			`kbs_to_load = [(k, v) for k, v in kbs_to_load if k == kb_name]`

			`for kbn, kb_cfg in kbs_to_load:`
			`tone_path = _resolve_path(base, kb_cfg.tone_file)`
			`content = _load_file(tone_path)`
Add Qdrant and ChromaDB support to the project 2026-02-27 19:37:02 +03:30			`if content:`
first commit 2026-03-19 22:54:29 +03:30			`sources.append((f"tone:{kbn}", content, SENSOR_UUID_GLOBAL, kbn))`

			`kb_path = _resolve_path(base, kb_cfg.path)`
			`for sid, c in _load_files_from_dir(kb_path, prefix=f"kb:{kbn}"):`
			`sources.append((sid, c, SENSOR_UUID_GLOBAL, kbn))`
			`if kb_path.is_file():`
			`content = _load_file(kb_path)`
			`if content:`
			`sources.append((f"kb:{kbn}:{kb_path.name}", content, SENSOR_UUID_GLOBAL, kbn))`
Refactor user data handling and enhance chat functionality 2026-02-27 20:06:46 +03:30
			`for sid, content in load_user_sources():`
first commit 2026-03-19 22:54:29 +03:30			`if sid.startswith("user:"):`
			`sensor_uuid = sid.replace("user:", "")`
			`elif sid.startswith("weather:"):`
			`sensor_uuid = sid.replace("weather:", "")`
			`else:`
			`sensor_uuid = sid`
			`sources.append((sid, content, sensor_uuid, KB_NAME_ALL))`
Add Qdrant and ChromaDB support to the project 2026-02-27 19:37:02 +03:30
			`return sources`


first commit 2026-03-19 22:54:29 +03:30			`def ingest(`
			`recreate: bool = False,`
			`config: RAGConfig \| None = None,`
			`kb_name: str \| None = None,`
			`) -> dict:`
Add Qdrant and ChromaDB support to the project 2026-02-27 19:37:02 +03:30			`"""`
first commit 2026-03-19 22:54:29 +03:30			`ورودی کامل: منابع را می‌خواند، چانک، embed و به vector store می‌فرستد.`
			`kb_name اختیاری: اگر مشخص شود فقط آن پایگاه دانش ingest می‌شود.`
Add Qdrant and ChromaDB support to the project 2026-02-27 19:37:02 +03:30
			`Args:`
			`recreate: اگر True باشد، collection را از نو می‌سازد`
			`config: تنظیمات RAG`
first commit 2026-03-19 22:54:29 +03:30			`kb_name: نام پایگاه دانش (chat/irrigation/fertilization) — اختیاری`
Add Qdrant and ChromaDB support to the project 2026-02-27 19:37:02 +03:30
			`Returns:`
			`آمار ورودی (تعداد چانک، منبع‌ها، خطاها)`
			`"""`
			`cfg = config or load_rag_config()`
			`store = QdrantVectorStore(config=cfg)`
UPDATE 2026-05-05 21:02:12 +03:30			`with observe_operation(source="rag.ingest", provider=cfg.embedding.provider, operation="ingest"):`
			`if recreate:`
			`store.ensure_collection(recreate=True)`
Add Qdrant and ChromaDB support to the project 2026-02-27 19:37:02 +03:30
UPDATE 2026-05-05 21:02:12 +03:30			`sources = load_sources(config=cfg, kb_name=kb_name)`
			`if not sources:`
			`record_metric("rag.ingest.empty_sources", kb_name=kb_name)`
			`return {"chunks_added": 0, "sources": [], "error": "هیچ منبعی یافت نشد"}`
Add Qdrant and ChromaDB support to the project 2026-02-27 19:37:02 +03:30
			`all_chunks: list[str] = []`
			`all_metas: list[dict] = []`
			`all_ids: list[str] = []`

first commit 2026-03-19 22:54:29 +03:30			`for source_id, content, sensor_uuid, src_kb in sources:`
Add Qdrant and ChromaDB support to the project 2026-02-27 19:37:02 +03:30			`chunks = chunk_text(content, config=cfg)`
			`for i, ch in enumerate(chunks):`
			`uid = str(uuid.uuid4())`
			`all_ids.append(uid)`
			`all_chunks.append(ch)`
Refactor user data handling and enhance chat functionality 2026-02-27 20:06:46 +03:30			`all_metas.append({`
			`"source": source_id,`
			`"chunk_index": i,`
			`"sensor_uuid": sensor_uuid,`
first commit 2026-03-19 22:54:29 +03:30			`"kb_name": src_kb,`
Refactor user data handling and enhance chat functionality 2026-02-27 20:06:46 +03:30			`})`
Add Qdrant and ChromaDB support to the project 2026-02-27 19:37:02 +03:30
UPDATE 2026-05-05 21:02:12 +03:30			`if not all_chunks:`
			`record_metric("rag.ingest.empty_chunks", kb_name=kb_name)`
			`return {"chunks_added": 0, "sources": [s[0] for s in sources], "error": "هیچ چانکی ساخته نشد"}`

			`embeddings = embed_texts(all_chunks, config=cfg)`
			`if len(embeddings) != len(all_chunks):`
			`record_metric("rag.ingest.embedding_mismatch", kb_name=kb_name)`
			`return {`
			`"chunks_added": 0,`
			`"sources": [s[0] for s in sources],`
			`"error": f"تعداد embed با چانک‌ها مطابقت ندارد: {len(embeddings)} vs {len(all_chunks)}",`
			`}`

			`store.add_documents(`
			`ids=all_ids,`
			`embeddings=embeddings,`
			`documents=all_chunks,`
			`metadatas=all_metas,`
			`)`
			`record_metric("rag.ingest.success", kb_name=kb_name, chunks=len(all_chunks))`
Add Qdrant and ChromaDB support to the project 2026-02-27 19:37:02 +03:30			`return {`
UPDATE 2026-05-05 21:02:12 +03:30			`"chunks_added": len(all_chunks),`
Add Qdrant and ChromaDB support to the project 2026-02-27 19:37:02 +03:30			`"sources": [s[0] for s in sources],`
			`}`