From 1a178f39b7ba822c698ee46938c14ccebb5ebc3e Mon Sep 17 00:00:00 2001 From: Mohammad Sajad Pourajam Date: Fri, 27 Feb 2026 20:09:51 +0330 Subject: [PATCH] Remove knowledge_base module files - Deleted unused files from the knowledge_base module, including __init__.py, apps.py, chunks.py, embeddings.py, indexer.py, and management commands. - This cleanup helps streamline the codebase by removing obsolete components. --- knowledge_base/__init__.py | 0 knowledge_base/apps.py | 7 - knowledge_base/chunks.py | 202 ------------------ knowledge_base/config/__init__.py | 0 knowledge_base/config/rag_config.example.yaml | 21 -- knowledge_base/embeddings.py | 84 -------- knowledge_base/indexer.py | 90 -------- knowledge_base/management/__init__.py | 0 .../management/commands/__init__.py | 0 .../commands/build_knowledge_base.py | 47 ---- 10 files changed, 451 deletions(-) delete mode 100644 knowledge_base/__init__.py delete mode 100644 knowledge_base/apps.py delete mode 100644 knowledge_base/chunks.py delete mode 100644 knowledge_base/config/__init__.py delete mode 100644 knowledge_base/config/rag_config.example.yaml delete mode 100644 knowledge_base/embeddings.py delete mode 100644 knowledge_base/indexer.py delete mode 100644 knowledge_base/management/__init__.py delete mode 100644 knowledge_base/management/commands/__init__.py delete mode 100644 knowledge_base/management/commands/build_knowledge_base.py diff --git a/knowledge_base/__init__.py b/knowledge_base/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/knowledge_base/apps.py b/knowledge_base/apps.py deleted file mode 100644 index 470c7cf..0000000 --- a/knowledge_base/apps.py +++ /dev/null @@ -1,7 +0,0 @@ -from django.apps import AppConfig - - -class KnowledgeBaseConfig(AppConfig): - default_auto_field = "django.db.models.BigAutoField" - name = "knowledge_base" - verbose_name = "Knowledge Base" diff --git a/knowledge_base/chunks.py b/knowledge_base/chunks.py deleted file mode 100644 index a37054a..0000000 --- a/knowledge_base/chunks.py +++ /dev/null @@ -1,202 +0,0 @@ -""" -تولید chunk متنی از داده‌های sensor_data، soil_data و فایل لحن. -""" -import re -from pathlib import Path -from typing import Iterator - -from django.db.models import Prefetch - -from sensor_data.models import SensorData -from soil_data.models import SoilDepthData, SoilLocation - - -DEPTH_LABELS_FA = { - "0-5cm": "۰–۵ سانتی‌متر", - "5-15cm": "۵–۱۵ سانتی‌متر", - "15-30cm": "۱۵–۳۰ سانتی‌متر", -} - -SOIL_FIELD_NAMES_FA = { - "bdod": "چگالی توده خاک", - "cec": "ظرفیت تبادل کاتیونی", - "cfvo": "حجم کسر ریزدانه", - "clay": "رس", - "nitrogen": "نیتروژن", - "ocd": "کربن آلی خاک", - "ocs": "ذخیره کربن آلی", - "phh2o": "pH خاک", - "sand": "ماسه", - "silt": "لای", - "soc": "کربن آلی خاک", - "wv0010": "آب موجود در ۱۰ kPa", - "wv0033": "آب موجود در ۳۳ kPa", - "wv1500": "آب موجود در ۱۵۰۰ kPa", -} - - -def _fmt(val: float | None) -> str: - if val is None: - return "ندارد" - return f"{val:.2f}" - - -def _soil_depth_to_text(depth: SoilDepthData) -> str: - """تبدیل یک SoilDepthData به متن توضیحی.""" - parts = [] - for field in ["phh2o", "nitrogen", "clay", "sand", "silt", "cec", "soc", "bdod"]: - val = getattr(depth, field, None) - if val is not None: - name = SOIL_FIELD_NAMES_FA.get(field, field) - parts.append(f"{name}={_fmt(val)}") - if not parts: - return "داده خاک موجود نیست." - return "، ".join(parts) - - -def _location_to_text(location: SoilLocation) -> str: - """ - تبدیل یک SoilLocation به همراه depths و sensor_data به متن. - """ - lat = float(location.latitude) - lon = float(location.longitude) - lines = [f"موقعیت جغرافیایی: عرض {lat}، طول {lon}."] - - depths = list(location.depths.order_by("depth_label")) - for d in depths: - label_fa = DEPTH_LABELS_FA.get(d.depth_label, d.depth_label) - lines.append(f"داده‌های خاک عمق {label_fa}: {_soil_depth_to_text(d)}.") - - sensors = list(location.sensor_data.all()) - if sensors: - for s in sensors: - parts = [] - if s.soil_moisture is not None: - parts.append(f"رطوبت خاک={_fmt(s.soil_moisture)}") - if s.soil_temperature is not None: - parts.append(f"دما={_fmt(s.soil_temperature)}") - if s.soil_ph is not None: - parts.append(f"pH={_fmt(s.soil_ph)}") - if s.electrical_conductivity is not None: - parts.append(f"هدایت الکتریکی={_fmt(s.electrical_conductivity)}") - if s.nitrogen is not None: - parts.append(f"نیتروژن={_fmt(s.nitrogen)}") - if s.phosphorus is not None: - parts.append(f"فسفر={_fmt(s.phosphorus)}") - if s.potassium is not None: - parts.append(f"پتاسیم={_fmt(s.potassium)}") - if parts: - lines.append( - f"داده سنسور (location_id={location.id}): " - + "، ".join(parts) - + "." - ) - - return "\n".join(lines) - - -def _load_tone_file(path: str | Path) -> str: - """بارگذاری محتوای فایل لحن.""" - path = Path(path) - if not path.exists(): - return "" - return path.read_text(encoding="utf-8").strip() - - -def _simple_token_count(text: str) -> int: - """تخمین تعداد توکن با تقسیم بر حدود ۴ کاراکتر.""" - return max(1, len(text) // 4) - - -def _chunk_text( - text: str, - max_tokens: int = 500, - overlap_tokens: int = 50, -) -> list[str]: - """ - تقسیم متن به chunkها بر اساس تخمین توکن. - از پاراگراف‌ها (خطوط خالی) به عنوان مرز استفاده می‌کند. - """ - if not text.strip(): - return [] - if _simple_token_count(text) <= max_tokens: - return [text.strip()] - - chunks = [] - paragraphs = re.split(r"\n\s*\n", text) - current = [] - current_tokens = 0 - - for para in paragraphs: - para = para.strip() - if not para: - continue - pt = _simple_token_count(para) - if current_tokens + pt > max_tokens and current: - chunks.append("\n\n".join(current)) - overlap_text = [] - overlap_sofar = 0 - for p in reversed(current): - if overlap_sofar + _simple_token_count(p) > overlap_tokens: - break - overlap_text.insert(0, p) - overlap_sofar += _simple_token_count(p) - current = overlap_text - current_tokens = overlap_sofar - current.append(para) - current_tokens += pt - - if current: - chunks.append("\n\n".join(current)) - return chunks - - -def iter_soil_chunks() -> Iterator[tuple[str, dict]]: - """ - تولید chunkهای متنی از soil_data و sensor_data. - هر chunk: (text, metadata) - """ - locations = ( - SoilLocation.objects.prefetch_related( - Prefetch("depths", queryset=SoilDepthData.objects.order_by("depth_label")), - "sensor_data", - ) - .order_by("id") - ) - - for loc in locations: - text = _location_to_text(loc) - if not text.strip(): - continue - yield text, { - "source": "soil_data", - "location_id": loc.id, - } - - -def iter_tone_chunks(tone_path: str | Path, max_tokens: int = 500, overlap: int = 50) -> Iterator[tuple[str, dict]]: - """تولید chunkهای فایل لحن.""" - content = _load_tone_file(tone_path) - if not content: - return - for chunk in _chunk_text(content, max_tokens=max_tokens, overlap_tokens=overlap): - yield chunk, {"source": "tone"} - - -def build_all_chunks( - tone_path: str | Path, - max_chunk_tokens: int = 500, - overlap_tokens: int = 50, -) -> list[tuple[str, dict]]: - """ - ساخت همه chunkها از soil_data، sensor_data و فایل لحن. - خروجی: لیست (text, metadata) - """ - out = [] - for text, meta in iter_soil_chunks(): - out.append((text, meta)) - for text, meta in iter_tone_chunks( - tone_path, max_tokens=max_chunk_tokens, overlap_tokens=overlap_tokens - ): - out.append((text, meta)) - return out diff --git a/knowledge_base/config/__init__.py b/knowledge_base/config/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/knowledge_base/config/rag_config.example.yaml b/knowledge_base/config/rag_config.example.yaml deleted file mode 100644 index 5618dd8..0000000 --- a/knowledge_base/config/rag_config.example.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# نمونه تنظیمات RAG برای پایگاه دانش CropLogic -# کپی به rag_config.yaml و در صورت نیاز ویرایش کنید - -embedding: - provider: "sentence_transformers" # یا openai - model: "paraphrase-multilingual-MiniLM-L12-v2" - # برای OpenAI: - # provider: "openai" - # model: "text-embedding-3-small" - # api_key_env: "OPENAI_API_KEY" - batch_size: 32 - -chromadb: - persist_directory: "data/chromadb" - collection_name: "croplogic_kb" - -chunking: - max_chunk_tokens: 500 - overlap_tokens: 50 - -tone_file: "config/tone.txt" diff --git a/knowledge_base/embeddings.py b/knowledge_base/embeddings.py deleted file mode 100644 index 785bbd2..0000000 --- a/knowledge_base/embeddings.py +++ /dev/null @@ -1,84 +0,0 @@ -""" -لایه Embedding سازگار با چند provider (sentence_transformers، openai). -""" -from typing import Protocol - -from .rag_settings import EmbeddingConfig, RAGConfig - - -class Embedder(Protocol): - """پروتکل embedder.""" - - def encode(self, texts: list[str], batch_size: int | None = None) -> list[list[float]]: - ... - - -class SentenceTransformerEmbedder: - """Embedder با استفاده از sentence-transformers.""" - - def __init__(self, model_name: str): - from sentence_transformers import SentenceTransformer - - self._model = SentenceTransformer(model_name) - - def encode(self, texts: list[str], batch_size: int | None = None) -> list[list[float]]: - embeddings = self._model.encode( - texts, - batch_size=batch_size or 32, - show_progress_bar=len(texts) > 50, - convert_to_numpy=True, - ) - return embeddings.tolist() - - -class OpenAIEmbedder: - """Embedder با استفاده از OpenAI API.""" - - def __init__(self, model_name: str, api_key: str | None = None): - import os - - from openai import OpenAI - - key = api_key or os.environ.get("OPENAI_API_KEY") - if not key: - raise ValueError( - "OpenAI API key required. Set OPENAI_API_KEY env or pass api_key." - ) - self._client = OpenAI(api_key=key) - self._model = model_name - - def encode(self, texts: list[str], batch_size: int | None = None) -> list[list[float]]: - # OpenAI limits batch size (max ~2048 inputs); we use smaller batches - batch_size = min(batch_size or 100, 100) - all_embeddings = [] - for i in range(0, len(texts), batch_size): - batch = texts[i : i + batch_size] - resp = self._client.embeddings.create( - model=self._model, - input=batch, - ) - for e in resp.data: - all_embeddings.append(e.embedding) - return all_embeddings - - -def get_embedder(config: RAGConfig | EmbeddingConfig) -> Embedder: - """ - بر اساس config، embedder مناسب را برمی‌گرداند. - """ - if isinstance(config, RAGConfig): - cfg = config.embedding - else: - cfg = config - - if cfg.provider == "sentence_transformers": - return SentenceTransformerEmbedder(model_name=cfg.model) - if cfg.provider == "openai": - api_key = None - if cfg.api_key_env: - import os - - api_key = os.environ.get(cfg.api_key_env) - return OpenAIEmbedder(model_name=cfg.model, api_key=api_key) - - raise ValueError(f"Unknown embedding provider: {cfg.provider}") diff --git a/knowledge_base/indexer.py b/knowledge_base/indexer.py deleted file mode 100644 index 1ba8dfa..0000000 --- a/knowledge_base/indexer.py +++ /dev/null @@ -1,90 +0,0 @@ -""" -منطق اصلی indexing: embed کردن chunks و ذخیره در ChromaDB. -""" -from pathlib import Path - -from .chunks import build_all_chunks -from .rag_settings import RAGConfig -from .embeddings import get_embedder - - -COLLECTION_NAME = "croplogic_kb" - - -def build_index(config: RAGConfig) -> int: - """ - ساخت/بازسازی کامل index پایگاه دانش. - chunks را از soil_data، sensor_data و فایل لحن تولید، embed و در ChromaDB ذخیره می‌کند. - - Returns: - تعداد documentهای اضافه شده. - """ - tone_path = Path(config.tone_file) - - chunks = build_all_chunks( - tone_path=tone_path, - max_chunk_tokens=config.chunking.max_chunk_tokens, - overlap_tokens=config.chunking.overlap_tokens, - ) - - if not chunks: - return 0 - - texts = [t for t, _ in chunks] - metadatas = [m for _, m in chunks] - - # تبدیل metadata به فرمت ChromaDB (فقط str, int, float) - def _serialize_meta(m: dict) -> dict: - out = {} - for k, v in m.items(): - if v is None: - continue - if isinstance(v, (str, int, float, bool)): - out[k] = v - else: - out[k] = str(v) - return out - - metadatas = [_serialize_meta(m) for m in metadatas] - - embedder = get_embedder(config) - batch_size = config.embedding.batch_size - - all_embeddings = [] - for i in range(0, len(texts), batch_size): - batch = texts[i : i + batch_size] - embs = embedder.encode(batch, batch_size=batch_size) - all_embeddings.extend(embs) - - # ChromaDB - persist_dir = Path(config.chromadb.persist_directory) - persist_dir.mkdir(parents=True, exist_ok=True) - - import chromadb - from chromadb.config import Settings as ChromaSettings - - client = chromadb.PersistentClient( - path=str(persist_dir), - settings=ChromaSettings(anonymized_telemetry=False), - ) - - collection_name = config.chromadb.collection_name or COLLECTION_NAME - try: - client.delete_collection(collection_name) - except Exception: - pass - - collection = client.create_collection( - name=collection_name, - metadata={"hnsw:space": "cosine"}, - ) - - ids = [f"doc_{i}" for i in range(len(texts))] - collection.add( - ids=ids, - embeddings=all_embeddings, - documents=texts, - metadatas=metadatas, - ) - - return len(texts) diff --git a/knowledge_base/management/__init__.py b/knowledge_base/management/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/knowledge_base/management/commands/__init__.py b/knowledge_base/management/commands/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/knowledge_base/management/commands/build_knowledge_base.py b/knowledge_base/management/commands/build_knowledge_base.py deleted file mode 100644 index 0557868..0000000 --- a/knowledge_base/management/commands/build_knowledge_base.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -دستور CLI برای ساخت index پایگاه دانش. -""" -from pathlib import Path - -from django.core.management.base import BaseCommand - -from knowledge_base.rag_settings import RAGConfig -from knowledge_base.indexer import build_index - - -class Command(BaseCommand): - help = "ساخت/بازسازی پایگاه دانش RAG از sensor_data، soil_data و فایل لحن" - - def add_arguments(self, parser): - parser.add_argument( - "--config", - type=str, - default="config/rag_config.yaml", - help="مسیر فایل config یامل (پیش‌فرض: config/rag_config.yaml)", - ) - - def handle(self, *args, **options): - config_path = options["config"] - path = Path(config_path) - - if not path.is_absolute(): - path = Path.cwd() / config_path - - if not path.exists(): - self.stderr.write( - self.style.ERROR(f"فایل config یافت نشد: {path}") - ) - self.stderr.write( - "یک فایل config از روی config/rag_config.yaml بسازید یا از config/rag_config.example.yaml کپی کنید." - ) - return - - self.stdout.write("در حال بارگذاری config...") - config = RAGConfig.load(path) - - self.stdout.write("در حال تولید chunks از soil_data و sensor_data...") - count = build_index(config) - - self.stdout.write( - self.style.SUCCESS(f"پایگاه دانش با {count} سند ساخته شد.") - )