Remove knowledge_base module files
- Deleted unused files from the knowledge_base module, including __init__.py, apps.py, chunks.py, embeddings.py, indexer.py, and management commands. - This cleanup helps streamline the codebase by removing obsolete components.
This commit is contained in:
@@ -1,7 +0,0 @@
|
|||||||
from django.apps import AppConfig
|
|
||||||
|
|
||||||
|
|
||||||
class KnowledgeBaseConfig(AppConfig):
    """Django application configuration for the ``knowledge_base`` app."""

    name = "knowledge_base"
    verbose_name = "Knowledge Base"
    default_auto_field = "django.db.models.BigAutoField"
|
|
||||||
@@ -1,202 +0,0 @@
|
|||||||
"""
|
|
||||||
تولید chunk متنی از دادههای sensor_data، soil_data و فایل لحن.
|
|
||||||
"""
|
|
||||||
import re
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Iterator
|
|
||||||
|
|
||||||
from django.db.models import Prefetch
|
|
||||||
|
|
||||||
from sensor_data.models import SensorData
|
|
||||||
from soil_data.models import SoilDepthData, SoilLocation
|
|
||||||
|
|
||||||
|
|
||||||
# Persian display labels for soil depth ranges, keyed by the ``depth_label``
# value of a depth row. Labels not listed here are shown as-is by the caller.
DEPTH_LABELS_FA = {
    "0-5cm": "۰–۵ سانتیمتر",
    "5-15cm": "۵–۱۵ سانتیمتر",
    "15-30cm": "۱۵–۳۰ سانتیمتر",
}
|
|
||||||
|
|
||||||
# Persian display names for soil property fields, keyed by attribute name.
# NOTE(review): the keys look like SoilGrids property codes. If so, ``cfvo``
# is the volumetric fraction of coarse fragments and ``ocd`` is organic carbon
# density, which the labels below do not seem to say (``ocd`` and ``soc``
# also share one label) — confirm with the data owner before relying on them.
SOIL_FIELD_NAMES_FA = {
    "bdod": "چگالی توده خاک",
    "cec": "ظرفیت تبادل کاتیونی",
    "cfvo": "حجم کسر ریزدانه",
    "clay": "رس",
    "nitrogen": "نیتروژن",
    "ocd": "کربن آلی خاک",
    "ocs": "ذخیره کربن آلی",
    "phh2o": "pH خاک",
    "sand": "ماسه",
    "silt": "لای",
    "soc": "کربن آلی خاک",
    "wv0010": "آب موجود در ۱۰ kPa",
    "wv0033": "آب موجود در ۳۳ kPa",
    "wv1500": "آب موجود در ۱۵۰۰ kPa",
}
|
|
||||||
|
|
||||||
|
|
||||||
def _fmt(val: float | None) -> str:
|
|
||||||
if val is None:
|
|
||||||
return "ندارد"
|
|
||||||
return f"{val:.2f}"
|
|
||||||
|
|
||||||
|
|
||||||
def _soil_depth_to_text(depth: SoilDepthData) -> str:
    """Describe one ``SoilDepthData`` row as a ``name=value`` list.

    Only a fixed subset of soil properties is reported. Properties whose
    value is ``None`` (or that are absent on the object) are left out. When
    nothing is left, a fixed Persian "no soil data" sentence is returned.
    """
    reported_fields = ("phh2o", "nitrogen", "clay", "sand", "silt", "cec", "soc", "bdod")
    readings = ((name, getattr(depth, name, None)) for name in reported_fields)
    described = [
        f"{SOIL_FIELD_NAMES_FA.get(name, name)}={_fmt(value)}"
        for name, value in readings
        if value is not None
    ]
    return "، ".join(described) if described else "داده خاک موجود نیست."
|
|
||||||
|
|
||||||
|
|
||||||
def _location_to_text(location: SoilLocation) -> str:
    """Render a ``SoilLocation`` with its depth rows and sensor readings as text.

    The result has one line for the coordinates, one line per related depth
    row (ordered by ``depth_label``) and one line per sensor reading that has
    at least one non-null measurement.

    Args:
        location: The location to describe; its ``depths`` and ``sensor_data``
            relations are read.

    Returns:
        The lines joined with a newline character.
    """
    # (attribute on the sensor row, Persian label used in the output)
    sensor_fields = (
        ("soil_moisture", "رطوبت خاک"),
        ("soil_temperature", "دما"),
        ("soil_ph", "pH"),
        ("electrical_conductivity", "هدایت الکتریکی"),
        ("nitrogen", "نیتروژن"),
        ("phosphorus", "فسفر"),
        ("potassium", "پتاسیم"),
    )

    lat = float(location.latitude)
    lon = float(location.longitude)
    lines = [f"موقعیت جغرافیایی: عرض {lat}، طول {lon}."]

    # Read through .all() and sort in Python: calling order_by() on the
    # related manager always issues a new query and ignores rows already
    # loaded by prefetch_related (see iter_soil_chunks), which costs one
    # extra query per location.
    depths = sorted(location.depths.all(), key=lambda row: row.depth_label or "")
    for depth in depths:
        label_fa = DEPTH_LABELS_FA.get(depth.depth_label, depth.depth_label)
        lines.append(f"دادههای خاک عمق {label_fa}: {_soil_depth_to_text(depth)}.")

    for sensor in location.sensor_data.all():
        parts = []
        for attr, label in sensor_fields:
            value = getattr(sensor, attr)
            if value is not None:
                parts.append(f"{label}={_fmt(value)}")
        # A reading with no measurements at all adds no line.
        if parts:
            lines.append(
                f"داده سنسور (location_id={location.id}): "
                + "، ".join(parts)
                + "."
            )

    return "\n".join(lines)
|
|
||||||
|
|
||||||
|
|
||||||
def _load_tone_file(path: str | Path) -> str:
|
|
||||||
"""بارگذاری محتوای فایل لحن."""
|
|
||||||
path = Path(path)
|
|
||||||
if not path.exists():
|
|
||||||
return ""
|
|
||||||
return path.read_text(encoding="utf-8").strip()
|
|
||||||
|
|
||||||
|
|
||||||
def _simple_token_count(text: str) -> int:
    """Estimate the token count of ``text`` as one token per four characters.

    The estimate never drops below 1, even for an empty string.
    """
    estimate = len(text) // 4
    return estimate if estimate > 1 else 1
|
|
||||||
|
|
||||||
|
|
||||||
def _chunk_text(
    text: str,
    max_tokens: int = 500,
    overlap_tokens: int = 50,
) -> list[str]:
    """Split ``text`` into chunks using the rough token estimate.

    Blank lines are the only split points, so a paragraph is never cut.
    Blank input gives an empty list, and text that already fits within
    ``max_tokens`` is returned as a single stripped chunk. Otherwise
    paragraphs are packed into chunks; when the next paragraph would push a
    non-empty chunk past ``max_tokens`` the chunk is closed, and its trailing
    paragraphs that fit within ``overlap_tokens`` are repeated at the start
    of the next chunk.
    """
    stripped = text.strip()
    if not stripped:
        return []
    if _simple_token_count(text) <= max_tokens:
        return [stripped]

    result = []
    pending = []  # (paragraph, estimated tokens) pairs of the open chunk
    pending_tokens = 0

    for raw in re.split(r"\n\s*\n", text):
        paragraph = raw.strip()
        if not paragraph:
            continue
        cost = _simple_token_count(paragraph)

        if pending and pending_tokens + cost > max_tokens:
            result.append("\n\n".join(item for item, _ in pending))

            # Walk backwards from the end of the closed chunk, carrying
            # paragraphs over while they still fit the overlap budget.
            carried = []
            carried_tokens = 0
            while pending:
                tail, tail_cost = pending[-1]
                if carried_tokens + tail_cost > overlap_tokens:
                    break
                pending.pop()
                carried.append((tail, tail_cost))
                carried_tokens += tail_cost
            carried.reverse()
            pending = carried
            pending_tokens = carried_tokens

        pending.append((paragraph, cost))
        pending_tokens += cost

    if pending:
        result.append("\n\n".join(item for item, _ in pending))
    return result
|
|
||||||
|
|
||||||
|
|
||||||
def iter_soil_chunks() -> Iterator[tuple[str, dict]]:
    """Yield one ``(text, metadata)`` pair per soil location.

    Locations are visited in ``id`` order, with their depth rows (ordered by
    ``depth_label``) and sensor readings prefetched. Locations whose rendered
    text is blank are skipped. The metadata holds the ``"soil_data"`` source
    tag and the location id.
    """
    ordered_depths = SoilDepthData.objects.order_by("depth_label")
    queryset = SoilLocation.objects.prefetch_related(
        Prefetch("depths", queryset=ordered_depths),
        "sensor_data",
    ).order_by("id")

    for location in queryset:
        rendered = _location_to_text(location)
        if rendered.strip():
            yield rendered, {"source": "soil_data", "location_id": location.id}
|
|
||||||
|
|
||||||
|
|
||||||
def iter_tone_chunks(tone_path: str | Path, max_tokens: int = 500, overlap: int = 50) -> Iterator[tuple[str, dict]]:
    """Yield ``(chunk, metadata)`` pairs for the tone file.

    Nothing is yielded when the tone file yields no content. Every chunk is
    tagged with ``{"source": "tone"}``.
    """
    tone_text = _load_tone_file(tone_path)
    if tone_text:
        pieces = _chunk_text(tone_text, max_tokens=max_tokens, overlap_tokens=overlap)
        yield from ((piece, {"source": "tone"}) for piece in pieces)
|
|
||||||
|
|
||||||
|
|
||||||
def build_all_chunks(
    tone_path: str | Path,
    max_chunk_tokens: int = 500,
    overlap_tokens: int = 50,
) -> list[tuple[str, dict]]:
    """Collect every chunk from the soil/sensor data and the tone file.

    Args:
        tone_path: Location of the tone file.
        max_chunk_tokens: Token budget per tone chunk.
        overlap_tokens: Token budget for the overlap between tone chunks.

    Returns:
        A list of ``(text, metadata)`` pairs: all soil chunks first, followed
        by the tone chunks.
    """
    out = list(iter_soil_chunks())
    # iter_tone_chunks names this parameter ``overlap``; passing it as
    # ``overlap_tokens=`` raised TypeError on every call.
    out.extend(
        iter_tone_chunks(tone_path, max_tokens=max_chunk_tokens, overlap=overlap_tokens)
    )
    return out
|
|
||||||
@@ -1,21 +0,0 @@
|
|||||||
# نمونه تنظیمات RAG برای پایگاه دانش CropLogic
|
|
||||||
# کپی به rag_config.yaml و در صورت نیاز ویرایش کنید
|
|
||||||
|
|
||||||
embedding:
|
|
||||||
provider: "sentence_transformers" # یا openai
|
|
||||||
model: "paraphrase-multilingual-MiniLM-L12-v2"
|
|
||||||
# برای OpenAI:
|
|
||||||
# provider: "openai"
|
|
||||||
# model: "text-embedding-3-small"
|
|
||||||
# api_key_env: "OPENAI_API_KEY"
|
|
||||||
batch_size: 32
|
|
||||||
|
|
||||||
chromadb:
|
|
||||||
persist_directory: "data/chromadb"
|
|
||||||
collection_name: "croplogic_kb"
|
|
||||||
|
|
||||||
chunking:
|
|
||||||
max_chunk_tokens: 500
|
|
||||||
overlap_tokens: 50
|
|
||||||
|
|
||||||
tone_file: "config/tone.txt"
|
|
||||||
@@ -1,84 +0,0 @@
|
|||||||
"""
|
|
||||||
لایه Embedding سازگار با چند provider (sentence_transformers، openai).
|
|
||||||
"""
|
|
||||||
from typing import Protocol
|
|
||||||
|
|
||||||
from .rag_settings import EmbeddingConfig, RAGConfig
|
|
||||||
|
|
||||||
|
|
||||||
class Embedder(Protocol):
|
|
||||||
"""پروتکل embedder."""
|
|
||||||
|
|
||||||
def encode(self, texts: list[str], batch_size: int | None = None) -> list[list[float]]:
|
|
||||||
...
|
|
||||||
|
|
||||||
|
|
||||||
class SentenceTransformerEmbedder:
|
|
||||||
"""Embedder با استفاده از sentence-transformers."""
|
|
||||||
|
|
||||||
def __init__(self, model_name: str):
|
|
||||||
from sentence_transformers import SentenceTransformer
|
|
||||||
|
|
||||||
self._model = SentenceTransformer(model_name)
|
|
||||||
|
|
||||||
def encode(self, texts: list[str], batch_size: int | None = None) -> list[list[float]]:
|
|
||||||
embeddings = self._model.encode(
|
|
||||||
texts,
|
|
||||||
batch_size=batch_size or 32,
|
|
||||||
show_progress_bar=len(texts) > 50,
|
|
||||||
convert_to_numpy=True,
|
|
||||||
)
|
|
||||||
return embeddings.tolist()
|
|
||||||
|
|
||||||
|
|
||||||
class OpenAIEmbedder:
|
|
||||||
"""Embedder با استفاده از OpenAI API."""
|
|
||||||
|
|
||||||
def __init__(self, model_name: str, api_key: str | None = None):
|
|
||||||
import os
|
|
||||||
|
|
||||||
from openai import OpenAI
|
|
||||||
|
|
||||||
key = api_key or os.environ.get("OPENAI_API_KEY")
|
|
||||||
if not key:
|
|
||||||
raise ValueError(
|
|
||||||
"OpenAI API key required. Set OPENAI_API_KEY env or pass api_key."
|
|
||||||
)
|
|
||||||
self._client = OpenAI(api_key=key)
|
|
||||||
self._model = model_name
|
|
||||||
|
|
||||||
def encode(self, texts: list[str], batch_size: int | None = None) -> list[list[float]]:
|
|
||||||
# OpenAI limits batch size (max ~2048 inputs); we use smaller batches
|
|
||||||
batch_size = min(batch_size or 100, 100)
|
|
||||||
all_embeddings = []
|
|
||||||
for i in range(0, len(texts), batch_size):
|
|
||||||
batch = texts[i : i + batch_size]
|
|
||||||
resp = self._client.embeddings.create(
|
|
||||||
model=self._model,
|
|
||||||
input=batch,
|
|
||||||
)
|
|
||||||
for e in resp.data:
|
|
||||||
all_embeddings.append(e.embedding)
|
|
||||||
return all_embeddings
|
|
||||||
|
|
||||||
|
|
||||||
def get_embedder(config: RAGConfig | EmbeddingConfig) -> Embedder:
    """Build the embedder selected by ``config``.

    Accepts either a full ``RAGConfig`` (its ``embedding`` section is used)
    or an ``EmbeddingConfig`` directly.

    Raises:
        ValueError: If the configured provider is neither
            ``"sentence_transformers"`` nor ``"openai"``.
    """
    settings = config.embedding if isinstance(config, RAGConfig) else config
    provider = settings.provider

    if provider == "sentence_transformers":
        return SentenceTransformerEmbedder(model_name=settings.model)

    if provider != "openai":
        raise ValueError(f"Unknown embedding provider: {settings.provider}")

    key = None
    env_name = settings.api_key_env
    if env_name:
        import os

        # The config names the environment variable, not the key itself.
        key = os.environ.get(env_name)
    return OpenAIEmbedder(model_name=settings.model, api_key=key)
|
|
||||||
@@ -1,90 +0,0 @@
|
|||||||
"""
|
|
||||||
منطق اصلی indexing: embed کردن chunks و ذخیره در ChromaDB.
|
|
||||||
"""
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from .chunks import build_all_chunks
|
|
||||||
from .rag_settings import RAGConfig
|
|
||||||
from .embeddings import get_embedder
|
|
||||||
|
|
||||||
|
|
||||||
# Fallback ChromaDB collection name, used by build_index when the config
# does not provide one.
COLLECTION_NAME = "croplogic_kb"
|
|
||||||
|
|
||||||
|
|
||||||
def build_index(config: RAGConfig) -> int:
    """Build, or fully rebuild, the knowledge-base index.

    Chunks produced by ``build_all_chunks`` are embedded and written to a
    persistent ChromaDB collection. A collection of the same name is deleted
    first, so the result only contains the freshly built documents.

    Returns:
        Number of documents added. When there are no chunks this is 0 and
        neither the embedder nor ChromaDB is touched.
    """
    chunk_pairs = build_all_chunks(
        tone_path=Path(config.tone_file),
        max_chunk_tokens=config.chunking.max_chunk_tokens,
        overlap_tokens=config.chunking.overlap_tokens,
    )
    if not chunk_pairs:
        return 0

    def _serialize_meta(m: dict) -> dict:
        """Drop ``None`` values and stringify anything that is not a plain scalar."""
        scalar_types = (str, int, float, bool)
        return {
            key: (value if isinstance(value, scalar_types) else str(value))
            for key, value in m.items()
            if value is not None
        }

    documents = []
    metadatas = []
    for chunk_text, chunk_meta in chunk_pairs:
        documents.append(chunk_text)
        metadatas.append(_serialize_meta(chunk_meta))

    embedder = get_embedder(config)
    step = config.embedding.batch_size
    vectors = []
    for start in range(0, len(documents), step):
        vectors.extend(embedder.encode(documents[start : start + step], batch_size=step))

    # ChromaDB storage
    storage_dir = Path(config.chromadb.persist_directory)
    storage_dir.mkdir(parents=True, exist_ok=True)

    import chromadb
    from chromadb.config import Settings as ChromaSettings

    client = chromadb.PersistentClient(
        path=str(storage_dir),
        settings=ChromaSettings(anonymized_telemetry=False),
    )

    target_name = config.chromadb.collection_name or COLLECTION_NAME
    try:
        client.delete_collection(target_name)
    except Exception:
        # Best effort: any failure to delete is ignored here.
        pass

    collection = client.create_collection(
        name=target_name,
        metadata={"hnsw:space": "cosine"},
    )
    collection.add(
        ids=[f"doc_{position}" for position in range(len(documents))],
        embeddings=vectors,
        documents=documents,
        metadatas=metadatas,
    )

    return len(documents)
|
|
||||||
@@ -1,47 +0,0 @@
|
|||||||
"""
|
|
||||||
دستور CLI برای ساخت index پایگاه دانش.
|
|
||||||
"""
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from django.core.management.base import BaseCommand
|
|
||||||
|
|
||||||
from knowledge_base.rag_settings import RAGConfig
|
|
||||||
from knowledge_base.indexer import build_index
|
|
||||||
|
|
||||||
|
|
||||||
class Command(BaseCommand):
    """Management command that builds or rebuilds the RAG knowledge-base index."""

    help = "ساخت/بازسازی پایگاه دانش RAG از sensor_data، soil_data و فایل لحن"

    def add_arguments(self, parser):
        """Register the ``--config`` option holding the YAML config path."""
        parser.add_argument(
            "--config",
            default="config/rag_config.yaml",
            type=str,
            help="مسیر فایل config یامل (پیشفرض: config/rag_config.yaml)",
        )

    def handle(self, *args, **options):
        """Load the config file and build the index.

        A relative ``--config`` path is resolved against the current working
        directory. When the file does not exist, two messages are written to
        stderr and the command returns without building anything.
        """
        raw_path = options["config"]
        config_file = Path(raw_path)
        if not config_file.is_absolute():
            config_file = Path.cwd() / raw_path

        if not config_file.exists():
            not_found = self.style.ERROR(f"فایل config یافت نشد: {config_file}")
            self.stderr.write(not_found)
            self.stderr.write(
                "یک فایل config از روی config/rag_config.yaml بسازید یا از config/rag_config.example.yaml کپی کنید."
            )
            return

        self.stdout.write("در حال بارگذاری config...")
        rag_config = RAGConfig.load(config_file)

        self.stdout.write("در حال تولید chunks از soil_data و sensor_data...")
        indexed = build_index(rag_config)

        done = self.style.SUCCESS(f"پایگاه دانش با {indexed} سند ساخته شد.")
        self.stdout.write(done)
|
|
||||||
Reference in New Issue
Block a user