Add Qdrant and ChromaDB support to the project
- Added Qdrant service to both docker-compose files for production and development. - Updated environment variables in .env.example and settings.py to include Qdrant configuration. - Included necessary dependencies for Qdrant and ChromaDB in requirements.txt. - Updated .gitignore to exclude ChromaDB data files.
This commit is contained in:
@@ -0,0 +1,25 @@
|
||||
"""
|
||||
ماژول RAG — پایگاه دانش CropLogic
|
||||
فاز یک: Qdrant بهعنوان vector store
|
||||
"""
|
||||
|
||||
from .chunker import chunk_text, chunk_texts
|
||||
from .client import get_qdrant_client
|
||||
from .config import load_rag_config
|
||||
from .embedding import embed_single, embed_texts
|
||||
from .ingest import ingest, load_sources
|
||||
from .retrieve import search_with_query
|
||||
from .vector_store import QdrantVectorStore
|
||||
|
||||
# Public wildcard-import surface of the rag package.
__all__ = [
    "chunk_text",
    "chunk_texts",
    "embed_single",
    "embed_texts",
    "get_qdrant_client",
    "ingest",
    "load_rag_config",
    "load_sources",
    "QdrantVectorStore",
    "search_with_query",
]
|
||||
@@ -0,0 +1,7 @@
|
||||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class RagConfig(AppConfig):
    """Django AppConfig for the ``rag`` app (RAG knowledge base)."""

    default_auto_field = "django.db.models.BigAutoField"
    name = "rag"
    verbose_name = "RAG - پایگاه دانش"  # display name (Persian: "knowledge base")
|
||||
@@ -0,0 +1,65 @@
|
||||
"""
|
||||
تکهتکه کردن متن (Chunking) برای RAG
|
||||
"""
|
||||
from .config import load_rag_config, RAGConfig
|
||||
|
||||
|
||||
# تقریب: هر توکن حدود ۳–۴ نویسه برای فارسی/انگلیسی
|
||||
CHARS_PER_TOKEN = 3.5
|
||||
|
||||
|
||||
def chunk_text(
|
||||
text: str,
|
||||
config: RAGConfig | None = None,
|
||||
max_chunk_tokens: int | None = None,
|
||||
overlap_tokens: int | None = None,
|
||||
) -> list[str]:
|
||||
"""
|
||||
تکهتکه کردن متن بر اساس توکن (تقریبی با نویسه).
|
||||
|
||||
Args:
|
||||
text: متن ورودی
|
||||
config: تنظیمات RAG
|
||||
max_chunk_tokens: حداکثر توکن هر چانک (override)
|
||||
overlap_tokens: تعداد توکن همپوشانی بین چانکها (override)
|
||||
|
||||
Returns:
|
||||
لیست چانکها
|
||||
"""
|
||||
cfg = config or load_rag_config()
|
||||
max_tok = max_chunk_tokens if max_chunk_tokens is not None else cfg.chunking.max_chunk_tokens
|
||||
overlap = overlap_tokens if overlap_tokens is not None else cfg.chunking.overlap_tokens
|
||||
|
||||
max_chars = int(max_tok * CHARS_PER_TOKEN)
|
||||
overlap_chars = int(overlap * CHARS_PER_TOKEN)
|
||||
step = max_chars - overlap_chars
|
||||
|
||||
if step <= 0:
|
||||
step = max_chars
|
||||
|
||||
text = text.strip()
|
||||
if not text:
|
||||
return []
|
||||
|
||||
chunks: list[str] = []
|
||||
start = 0
|
||||
while start < len(text):
|
||||
end = start + max_chars
|
||||
chunk = text[start:end].strip()
|
||||
if chunk:
|
||||
chunks.append(chunk)
|
||||
start += step
|
||||
|
||||
return chunks
|
||||
|
||||
|
||||
def chunk_texts(
    texts: list[str],
    config: RAGConfig | None = None,
    **kwargs,
) -> list[str]:
    """Chunk several texts and return every chunk in one flat list."""
    return [piece for text in texts for piece in chunk_text(text, config=config, **kwargs)]
|
||||
@@ -0,0 +1,19 @@
|
||||
"""
|
||||
کلاینت Qdrant — اتصال به دیتابیس وکتور
|
||||
"""
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.http import models as qmodels
|
||||
|
||||
from .config import QdrantConfig, load_rag_config
|
||||
|
||||
|
||||
def get_qdrant_client(config: QdrantConfig | None = None) -> QdrantClient:
    """
    Build a Qdrant client.

    When no config is given, settings are loaded from rag_config.yaml.
    """
    cfg = config if config is not None else load_rag_config().qdrant
    return QdrantClient(host=cfg.host, port=cfg.port)
|
||||
@@ -0,0 +1,93 @@
|
||||
"""
|
||||
بارگذاری تنظیمات RAG از rag_config.yaml
|
||||
"""
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
|
||||
|
||||
@dataclass
class EmbeddingConfig:
    """Settings for the text-embedding backend (OpenAI-compatible API)."""

    provider: str  # backend name, e.g. "sentence_transformers"
    model: str  # embedding model identifier, e.g. "text-embedding-3-small"
    batch_size: int = 32  # texts sent per embedding API call
    api_key_env: str | None = None  # env var holding the API key (caller falls back to a default)
    base_url: str | None = None  # endpoint override for the OpenAI-compatible API
|
||||
|
||||
|
||||
@dataclass
class QdrantConfig:
    """Connection and collection settings for Qdrant."""

    host: str = "localhost"
    port: int = 6333
    collection_name: str = "croplogic_kb"
    # NOTE(review): this default (384) disagrees with load_rag_config(), whose
    # YAML fallback is 1536 — confirm which vector size is intended.
    vector_size: int = 384
|
||||
|
||||
|
||||
@dataclass
class ChunkingConfig:
    """Chunk sizing for text splitting (token counts are approximations)."""

    max_chunk_tokens: int = 500  # upper bound of tokens per chunk
    overlap_tokens: int = 50  # tokens shared between consecutive chunks
|
||||
|
||||
|
||||
@dataclass
class RAGConfig:
    """Top-level RAG settings aggregated from rag_config.yaml."""

    embedding: EmbeddingConfig
    qdrant: QdrantConfig
    chunking: ChunkingConfig
    tone_file: str = "config/tone.txt"  # path to the tone/voice text
    knowledge_base_path: str = "config/knowledge_base"  # dir (or single file) of KB documents
    user_info_path: str = "config/user_info"  # dir (or single file) of per-user documents
    chromadb: dict[str, Any] = field(default_factory=dict)  # raw "chromadb" YAML section, kept as-is
|
||||
|
||||
|
||||
def load_rag_config(config_path: str | Path | None = None) -> RAGConfig:
    """
    Load RAG settings from YAML plus environment overrides.

    QDRANT_HOST and QDRANT_PORT environment variables take precedence
    over the YAML values.
    """
    if config_path is None:
        project_root = Path(__file__).resolve().parent.parent
        config_path = project_root / "config" / "rag_config.yaml"

    cfg_file = Path(config_path)
    if not cfg_file.exists():
        raise FileNotFoundError(f"RAG config not found: {cfg_file}")

    with open(cfg_file, encoding="utf-8") as fh:
        raw = yaml.safe_load(fh) or {}

    emb_section = raw.get("embedding", {})
    qd_section = raw.get("qdrant", {})
    chunk_section = raw.get("chunking", {})

    return RAGConfig(
        embedding=EmbeddingConfig(
            provider=emb_section.get("provider", "sentence_transformers"),
            model=emb_section.get("model", "text-embedding-3-small"),
            batch_size=emb_section.get("batch_size", 32),
            api_key_env=emb_section.get("api_key_env"),
            base_url=emb_section.get("base_url"),
        ),
        qdrant=QdrantConfig(
            host=os.environ.get("QDRANT_HOST", qd_section.get("host", "localhost")),
            port=int(os.environ.get("QDRANT_PORT", qd_section.get("port", 6333))),
            collection_name=qd_section.get("collection_name", "croplogic_kb"),
            # NOTE(review): this 1536 fallback disagrees with QdrantConfig's
            # declared default of 384 — confirm which is intended.
            vector_size=qd_section.get("vector_size", 1536),
        ),
        chunking=ChunkingConfig(
            max_chunk_tokens=chunk_section.get("max_chunk_tokens", 500),
            overlap_tokens=chunk_section.get("overlap_tokens", 50),
        ),
        tone_file=raw.get("tone_file", "config/tone.txt"),
        knowledge_base_path=raw.get("knowledge_base_path", "config/knowledge_base"),
        user_info_path=raw.get("user_info_path", "config/user_info"),
        chromadb=raw.get("chromadb", {}),
    )
|
||||
@@ -0,0 +1,71 @@
|
||||
"""
|
||||
سرویس تعبیهسازی متن با Avalai API (OpenAI-compatible)
|
||||
"""
|
||||
import os
|
||||
from typing import overload
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
from .config import load_rag_config, RAGConfig
|
||||
|
||||
|
||||
def _get_avalai_client(config: RAGConfig | None) -> OpenAI:
    """Build an OpenAI client pointed at the Avalai endpoint."""
    cfg = config or load_rag_config()
    settings = cfg.embedding
    api_key = os.environ.get(settings.api_key_env or "AVALAI_API_KEY")
    fallback_url = os.environ.get("AVALAI_BASE_URL", "https://api.avalai.ir/v1")
    return OpenAI(api_key=api_key, base_url=settings.base_url or fallback_url)
|
||||
|
||||
|
||||
def embed_texts(
    texts: list[str],
    config: RAGConfig | None = None,
    model: str | None = None,
    dimensions: int | None = None,
) -> list[list[float]]:
    """
    Embed a list of texts via the Avalai (OpenAI-compatible) API.

    Args:
        texts: input strings
        config: RAG settings (default: load_rag_config)
        model: model-name override
        dimensions: embedding dimensionality (only for models that support it)

    Returns:
        One vector per input text, in input order.
    """
    if not texts:
        return []

    cfg = config or load_rag_config()
    client = _get_avalai_client(cfg)
    chosen_model = model or cfg.embedding.model
    step = cfg.embedding.batch_size

    request_kwargs: dict = {}
    if dimensions is not None:
        request_kwargs["dimensions"] = dimensions

    vectors: list[list[float]] = []
    for offset in range(0, len(texts), step):
        response = client.embeddings.create(
            model=chosen_model,
            input=texts[offset : offset + step],
            **request_kwargs,
        )
        # Sort each batch by the reported index so vectors align with inputs.
        vectors.extend(
            item.embedding for item in sorted(response.data, key=lambda d: d.index)
        )

    return vectors
|
||||
|
||||
|
||||
def embed_single(text: str, config: RAGConfig | None = None, **kwargs) -> list[float]:
    """Embed one text; the result is the vector itself ([] when nothing came back)."""
    result = embed_texts([text], config=config, **kwargs)
    if not result:
        return []
    return result[0]
|
||||
+148
@@ -0,0 +1,148 @@
|
||||
"""
|
||||
پایپلاین ورودی RAG: خواندن، چانک، embed و ذخیره در vector store
|
||||
|
||||
سه منبع:
|
||||
۱. لحن (tone)
|
||||
۲. پایگاه دانش (knowledge base)
|
||||
۳. اطلاعات هر کاربر (user info)
|
||||
"""
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
from .chunker import chunk_text, chunk_texts
|
||||
from .config import load_rag_config, RAGConfig
|
||||
from .embedding import embed_texts
|
||||
from .vector_store import QdrantVectorStore
|
||||
|
||||
# File suffixes treated as readable text sources.
TEXT_EXTENSIONS = {".txt", ".md", ".rst", ".json"}
|
||||
|
||||
|
||||
def _resolve_path(base: Path, p: str) -> Path:
|
||||
"""تبدیل مسیر نسبی به مطلق نسبت به base پروژه."""
|
||||
path = Path(p)
|
||||
if not path.is_absolute():
|
||||
path = base / path
|
||||
return path
|
||||
|
||||
|
||||
def _load_file(path: Path) -> str | None:
|
||||
"""خواندن یک فایل متنی."""
|
||||
if not path.exists() or not path.is_file():
|
||||
return None
|
||||
try:
|
||||
return path.read_text(encoding="utf-8").strip()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _load_files_from_dir(dir_path: Path, prefix: str = "kb") -> list[tuple[str, str]]:
    """
    Read every text file under a directory (recursively).

    Returns: [(source_id, content), ...] where source_id is "<prefix>:<relative path>".
    """
    if not (dir_path.exists() and dir_path.is_dir()):
        return []
    results: list[tuple[str, str]] = []
    for entry in sorted(dir_path.rglob("*")):
        if not entry.is_file() or entry.suffix.lower() not in TEXT_EXTENSIONS:
            continue
        body = _load_file(entry)
        if body:
            results.append((f"{prefix}:{entry.relative_to(dir_path)}", body))
    return results
|
||||
|
||||
|
||||
def _collect_source(path: Path, prefix: str) -> list[tuple[str, str]]:
    """Load one source that may be a directory of text files or a single file."""
    items = _load_files_from_dir(path, prefix=prefix)
    if path.is_file():
        content = _load_file(path)
        if content:
            items.append((f"{prefix}:{path.name}", content))
    return items


def load_sources(config: RAGConfig | None = None) -> list[tuple[str, str]]:
    """
    Load the three sources: tone, knowledge base, and per-user info.

    Returns:
        [(source_id, content), ...]
        source_id examples: tone, kb:file.txt, user:profile.txt
    """
    cfg = config or load_rag_config()
    base = Path(__file__).resolve().parent.parent
    sources: list[tuple[str, str]] = []

    # 1. Tone — always a single file.
    tone = _load_file(_resolve_path(base, cfg.tone_file))
    if tone:
        sources.append(("tone", tone))

    # 2. Knowledge base and 3. user info share identical dir-or-file handling,
    # previously duplicated inline; factored into _collect_source.
    sources.extend(_collect_source(_resolve_path(base, cfg.knowledge_base_path), "kb"))
    sources.extend(_collect_source(_resolve_path(base, cfg.user_info_path), "user"))

    return sources
|
||||
|
||||
|
||||
def ingest(recreate: bool = False, config: RAGConfig | None = None) -> dict:
    """
    Full ingestion: read the sources, chunk, embed, and push to the vector store.

    Args:
        recreate: when True, the collection is rebuilt from scratch
        config: RAG settings

    Returns:
        Ingestion stats (chunk count, source ids, errors).
    """
    cfg = config or load_rag_config()
    store = QdrantVectorStore(config=cfg)
    if recreate:
        store.ensure_collection(recreate=True)

    sources = load_sources(config=cfg)
    if not sources:
        return {"chunks_added": 0, "sources": [], "error": "هیچ منبعی یافت نشد"}

    ids: list[str] = []
    texts: list[str] = []
    metas: list[dict] = []
    for source_id, content in sources:
        for idx, piece in enumerate(chunk_text(content, config=cfg)):
            ids.append(str(uuid.uuid4()))
            texts.append(piece)
            metas.append({"source": source_id, "chunk_index": idx})

    source_names = [sid for sid, _ in sources]
    if not texts:
        return {"chunks_added": 0, "sources": source_names, "error": "هیچ چانکی ساخته نشد"}

    vectors = embed_texts(texts, config=cfg)
    if len(vectors) != len(texts):
        return {
            "chunks_added": 0,
            "sources": source_names,
            "error": f"تعداد embed با چانکها مطابقت ندارد: {len(vectors)} vs {len(texts)}",
        }

    store.add_documents(
        ids=ids,
        embeddings=vectors,
        documents=texts,
        metadatas=metas,
    )
    return {
        "chunks_added": len(texts),
        "sources": source_names,
    }
|
||||
@@ -0,0 +1,30 @@
|
||||
"""
|
||||
ورودی RAG: لحن، پایگاه دانش و اطلاعات کاربر را embed و به Qdrant میفرستد.
|
||||
اجرا: python manage.py rag_ingest [--recreate]
|
||||
"""
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
from rag.ingest import ingest
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Management command: embed tone, knowledge base and user info into Qdrant."""

    help = "Embed لحن، پایگاه دانش و اطلاعات کاربر و ذخیره در Qdrant"

    def add_arguments(self, parser):
        # --recreate drops and rebuilds the collection before ingesting.
        parser.add_argument(
            "--recreate",
            action="store_true",
            help="collection را از نو بساز (حذف و ایجاد مجدد)",
        )

    def handle(self, *args, **options):
        result = ingest(recreate=options.get("recreate", False))
        if "error" in result:
            self.stderr.write(self.style.ERROR(result["error"]))
            return
        summary = f"✓ {result['chunks_added']} چانک از منابع {result['sources']} ذخیره شد."
        self.stdout.write(self.style.SUCCESS(summary))
|
||||
@@ -0,0 +1,28 @@
|
||||
"""
|
||||
بازیابی RAG: embed کوئری و جستجو در vector store
|
||||
"""
|
||||
from .config import load_rag_config, RAGConfig
|
||||
from .embedding import embed_single
|
||||
from .vector_store import QdrantVectorStore
|
||||
|
||||
|
||||
def search_with_query(
    query: str,
    limit: int = 5,
    score_threshold: float | None = None,
    config: RAGConfig | None = None,
) -> list[dict]:
    """
    Embed the query and run a similarity search against the vector store.

    Returns:
        Result dicts with id, score, text, metadata.
    """
    cfg = config or load_rag_config()
    vector = embed_single(query, config=cfg)
    return QdrantVectorStore(config=cfg).search(
        query_vector=vector,
        limit=limit,
        score_threshold=score_threshold,
    )
|
||||
@@ -0,0 +1,117 @@
|
||||
"""
|
||||
Qdrant Vector Store — ذخیره و جستجوی وکتورها
|
||||
"""
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.http import models as qmodels
|
||||
|
||||
from .client import get_qdrant_client
|
||||
from .config import load_rag_config, RAGConfig
|
||||
|
||||
|
||||
class QdrantVectorStore:
    """
    Store and search documents in Qdrant.

    The client is created lazily, so constructing the store never touches
    the network.
    """

    def __init__(self, config: RAGConfig | None = None):
        self.config = config or load_rag_config()
        self.qdrant = self.config.qdrant
        self._client: QdrantClient | None = None

    @property
    def client(self) -> QdrantClient:
        """Lazily-initialized Qdrant client."""
        if self._client is None:
            self._client = get_qdrant_client(self.qdrant)
        return self._client

    def _create_collection(self, name: str, size: int) -> None:
        """Create *name* with cosine distance and the configured vector size."""
        self.client.create_collection(
            collection_name=name,
            vectors_config=qmodels.VectorParams(
                size=size,
                distance=qmodels.Distance.COSINE,
            ),
        )

    def ensure_collection(self, recreate: bool = False) -> None:
        """
        Make sure the collection exists with the configured name and size.

        Args:
            recreate: drop and recreate the collection if it already exists.
        """
        name = self.qdrant.collection_name
        size = self.qdrant.vector_size

        try:
            self.client.get_collection(name)
        except Exception:
            # get_collection raises when the collection is missing: create it.
            # Only the existence check is guarded, so real delete/create
            # failures surface instead of triggering a blind second create.
            self._create_collection(name, size)
            return

        if recreate:
            self.client.delete_collection(name)
            self._create_collection(name, size)

    def add_documents(
        self,
        ids: list[str],
        embeddings: list[list[float]],
        documents: list[str],
        metadatas: list[dict] | None = None,
    ) -> int:
        """
        Upsert documents into the collection.

        Metadata values of type str/int/float/bool are stored as-is; other
        values are stringified and None values are dropped.

        Returns:
            Number of ids submitted.
        """
        self.ensure_collection()
        # One fresh dict per point: `[{}] * n` would alias a single shared dict.
        metas = metadatas or [{} for _ in ids]

        def _serialize(meta: dict) -> dict:
            clean = {}
            for key, value in meta.items():
                if value is None:
                    continue
                clean[key] = value if isinstance(value, (str, int, float, bool)) else str(value)
            return clean

        points = [
            qmodels.PointStruct(
                id=point_id,
                vector=vector,
                payload={"text": doc, "doc_id": point_id, **_serialize(meta)},
            )
            for point_id, vector, doc, meta in zip(ids, embeddings, documents, metas)
        ]
        self.client.upsert(
            collection_name=self.qdrant.collection_name,
            points=points,
        )
        return len(ids)

    def search(
        self,
        query_vector: list[float],
        limit: int = 5,
        score_threshold: float | None = None,
    ) -> list[dict]:
        """
        Similarity search by query vector.

        Returns:
            Hit dicts with id, score, text, and the remaining payload as metadata.
        """
        hits = self.client.search(
            collection_name=self.qdrant.collection_name,
            query_vector=query_vector,
            limit=limit,
            score_threshold=score_threshold,
        )

        results = []
        for hit in hits:
            payload = hit.payload or {}  # guard: payload can be None on a hit
            results.append(
                {
                    "id": str(hit.id),
                    "score": hit.score,
                    "text": payload.get("text", ""),
                    "metadata": {k: v for k, v in payload.items() if k != "text"},
                }
            )
        return results
|
||||
Reference in New Issue
Block a user