Refactor user data handling and enhance chat functionality

- Removed deprecated user_info files and paths from the configuration.
- Integrated user soil data into the chat context to improve response accuracy.
- Updated the build_rag_context and chat_rag_stream functions to accept sensor_uuid for user-specific data retrieval.
- Enhanced the load_sources function to load user data from the database.
- Implemented filtering in search_with_query and QdrantVectorStore to isolate user data based on sensor_uuid.
- Introduced a Celery Beat schedule for periodic user data ingestion.
2026-02-27 20:06:46 +03:30
parent 94355af62b
commit 2c42ebe01c
13 changed files with 246 additions and 89 deletions
@@ -2,9 +2,9 @@
 پایپ‌لاین ورودی RAG: خواندن، چانک، embed و ذخیره در vector store

 سه منبع:
-۱. لحن (tone)
-۲. پایگاه دانش (knowledge base)
-۳. اطلاعات هر کاربر (user info)
+۱. لحن (tone) — sensor_uuid=__global__
+۲. پایگاه دانش (knowledge base) — sensor_uuid=__global__
+۳. دیتای خاک هر کاربر از DB (sensor_data + soil_data) — sensor_uuid=uuid
 """
 import uuid
 from pathlib import Path
@@ -12,11 +12,14 @@ from pathlib import Path
 from .chunker import chunk_text, chunk_texts
 from .config import load_rag_config, RAGConfig
 from .embedding import embed_texts
+from .user_data import load_user_sources
 from .vector_store import QdrantVectorStore

 # پسوندهای قابل خواندن
 TEXT_EXTENSIONS = {".txt", ".md", ".rst", ".json"}

+SENSOR_UUID_GLOBAL = "__global__"
+

 def _resolve_path(base: Path, p: str) -> Path:
    """تبدیل مسیر نسبی به مطلق نسبت به base پروژه."""
@@ -54,41 +57,37 @@ def _load_files_from_dir(dir_path: Path, prefix: str = "kb") -> list[tuple[str,
    return out


-def load_sources(config: RAGConfig | None = None) -> list[tuple[str, str]]:
+def load_sources(config: RAGConfig | None = None) -> list[tuple[str, str, str]]:
    """
-    بارگذاری سه منبع: لحن، پایگاه دانش، اطلاعات کاربر.
+    بارگذاری سه منبع: لحن، پایگاه دانش، دیتای کاربر از DB.

    Returns:
-        [(source_id, content), ...]
-        source_id مثال: tone, kb:file.txt, user:profile.txt
+        [(source_id, content, sensor_uuid), ...]
+        sensor_uuid: __global__ برای tone/kb، uuid سنسور برای user
    """
    cfg = config or load_rag_config()
    base = Path(__file__).resolve().parent.parent
-    sources: list[tuple[str, str]] = []
+    sources: list[tuple[str, str, str]] = []

    # ۱. لحن
    tone_path = _resolve_path(base, cfg.tone_file)
    content = _load_file(tone_path)
    if content:
-        sources.append(("tone", content))
+        sources.append(("tone", content, SENSOR_UUID_GLOBAL))

    # ۲. پایگاه دانش
    kb_path = _resolve_path(base, cfg.knowledge_base_path)
    for sid, c in _load_files_from_dir(kb_path, prefix="kb"):
-        sources.append((sid, c))
+        sources.append((sid, c, SENSOR_UUID_GLOBAL))
    if kb_path.is_file():
        content = _load_file(kb_path)
        if content:
-            sources.append((f"kb:{kb_path.name}", content))
+            sources.append((f"kb:{kb_path.name}", content, SENSOR_UUID_GLOBAL))

-    # ۳. اطلاعات کاربر
-    user_path = _resolve_path(base, cfg.user_info_path)
-    for sid, c in _load_files_from_dir(user_path, prefix="user"):
-        sources.append((sid, c))
-    if user_path.is_file():
-        content = _load_file(user_path)
-        if content:
-            sources.append((f"user:{user_path.name}", content))
+    # ۳. دیتای کاربران از sensor_data + soil_data
+    for sid, content in load_user_sources():
+        sensor_uuid = sid.replace("user:", "")
+        sources.append((sid, content, sensor_uuid))

    return sources

@@ -96,6 +95,7 @@ def load_sources(config: RAGConfig | None = None) -> list[tuple[str, str]]:
 def ingest(recreate: bool = False, config: RAGConfig | None = None) -> dict:
    """
    ورودی کامل: منابع را می‌خواند، چانک می‌کند، embed می‌کند و به vector store می‌فرستد.
+    دیتای هر کاربر (sensor_uuid) جدا embed و با metadata ذخیره می‌شود.

    Args:
        recreate: اگر True باشد، collection را از نو می‌سازد
@@ -117,13 +117,17 @@ def ingest(recreate: bool = False, config: RAGConfig | None = None) -> dict:
    all_metas: list[dict] = []
    all_ids: list[str] = []

-    for source_id, content in sources:
+    for source_id, content, sensor_uuid in sources:
        chunks = chunk_text(content, config=cfg)
        for i, ch in enumerate(chunks):
            uid = str(uuid.uuid4())
            all_ids.append(uid)
            all_chunks.append(ch)
-            all_metas.append({"source": source_id, "chunk_index": i})
+            all_metas.append({
+                "source": source_id,
+                "chunk_index": i,
+                "sensor_uuid": sensor_uuid,
+            })

    if not all_chunks:
        return {"chunks_added": 0, "sources": [s[0] for s in sources], "error": "هیچ چانکی ساخته نشد"}