"""Generate text chunks from sensor_data, soil_data, and the tone file."""

import re
from pathlib import Path
from typing import Iterator

from django.db.models import Prefetch

from sensor_data.models import SensorData
from soil_data.models import SoilDepthData, SoilLocation

# Persian display labels for the known soil depth ranges; unknown labels are
# emitted as-is by the callers.
DEPTH_LABELS_FA = {
    "0-5cm": "۰–۵ سانتی‌متر",
    "5-15cm": "۵–۱۵ سانتی‌متر",
    "15-30cm": "۱۵–۳۰ سانتی‌متر",
}

# Persian display names for soil property field names.
SOIL_FIELD_NAMES_FA = {
    "bdod": "چگالی توده خاک",
    "cec": "ظرفیت تبادل کاتیونی",
    "cfvo": "حجم کسر ریزدانه",
    "clay": "رس",
    "nitrogen": "نیتروژن",
    "ocd": "کربن آلی خاک",
    "ocs": "ذخیره کربن آلی",
    "phh2o": "pH خاک",
    "sand": "ماسه",
    "silt": "لای",
    "soc": "کربن آلی خاک",
    "wv0010": "آب موجود در ۱۰ kPa",
    "wv0033": "آب موجود در ۳۳ kPa",
    "wv1500": "آب موجود در ۱۵۰۰ kPa",
}

# Soil properties rendered by _soil_depth_to_text, in output order.
_SOIL_SUMMARY_FIELDS = (
    "phh2o",
    "nitrogen",
    "clay",
    "sand",
    "silt",
    "cec",
    "soc",
    "bdod",
)

# (attribute name, Persian label) pairs rendered for each sensor record,
# in output order.
_SENSOR_FIELDS = (
    ("soil_moisture", "رطوبت خاک"),
    ("soil_temperature", "دما"),
    ("soil_ph", "pH"),
    ("electrical_conductivity", "هدایت الکتریکی"),
    ("nitrogen", "نیتروژن"),
    ("phosphorus", "فسفر"),
    ("potassium", "پتاسیم"),
)


def _fmt(val: float | None) -> str:
    """Format a number with two decimals, or a Persian placeholder for None."""
    if val is None:
        return "ندارد"
    return f"{val:.2f}"


def _soil_depth_to_text(depth: SoilDepthData) -> str:
    """Convert one SoilDepthData record into a descriptive text fragment.

    Only the properties listed in ``_SOIL_SUMMARY_FIELDS`` are rendered, as
    ``name=value`` pairs; attributes that are missing or None are skipped.
    Returns a fixed Persian "no soil data" sentence when nothing is available.
    """
    parts = []
    for field in _SOIL_SUMMARY_FIELDS:
        val = getattr(depth, field, None)
        if val is not None:
            name = SOIL_FIELD_NAMES_FA.get(field, field)
            parts.append(f"{name}={_fmt(val)}")
    if not parts:
        return "داده خاک موجود نیست."
    return "، ".join(parts)


def _location_to_text(location: SoilLocation) -> str:
    """Convert a SoilLocation with its depths and sensor_data into text.

    The result has one line for the coordinates, one line per depth record
    (ordered by ``depth_label``), and one line per sensor record that has at
    least one non-None reading. Lines are joined with a newline.
    """
    lat = float(location.latitude)
    lon = float(location.longitude)
    lines = [f"موقعیت جغرافیایی: عرض {lat}، طول {lon}."]

    # Read through .all() and sort in Python: calling .order_by() on the
    # related manager would bypass any prefetch cache and issue one extra
    # query per location.
    depths = sorted(location.depths.all(), key=lambda d: d.depth_label)
    for depth in depths:
        label_fa = DEPTH_LABELS_FA.get(depth.depth_label, depth.depth_label)
        lines.append(f"داده‌های خاک عمق {label_fa}: {_soil_depth_to_text(depth)}.")

    for sensor in location.sensor_data.all():
        parts = [
            f"{label}={_fmt(value)}"
            for attr, label in _SENSOR_FIELDS
            if (value := getattr(sensor, attr)) is not None
        ]
        if parts:
            lines.append(
                f"داده سنسور (location_id={location.id}): " + "، ".join(parts) + "."
            )
    return "\n".join(lines)


def _load_tone_file(path: str | Path) -> str:
    """Return the stripped UTF-8 content of the tone file, or "" if absent."""
    path = Path(path)
    if not path.exists():
        return ""
    return path.read_text(encoding="utf-8").strip()


def _simple_token_count(text: str) -> int:
    """Estimate the token count as one token per 4 characters (minimum 1)."""
    return max(1, len(text) // 4)


def _chunk_text(
    text: str,
    max_tokens: int = 500,
    overlap_tokens: int = 50,
) -> list[str]:
    """Split text into chunks based on the estimated token count.

    Paragraphs (separated by blank lines) are used as boundaries and are never
    split, so a chunk may exceed ``max_tokens`` when a single paragraph, or the
    carried overlap plus the next paragraph, is larger than the limit.

    Args:
        text: The text to split.
        max_tokens: Estimated token budget per chunk.
        overlap_tokens: Budget of trailing paragraphs from the previous chunk
            that are repeated at the start of the next one.

    Returns:
        The list of chunks; empty when ``text`` is blank, and a single stripped
        chunk when the whole text fits within ``max_tokens``.
    """
    if not text.strip():
        return []
    if _simple_token_count(text) <= max_tokens:
        return [text.strip()]

    chunks: list[str] = []
    current: list[str] = []
    current_tokens = 0
    for para in re.split(r"\n\s*\n", text):
        para = para.strip()
        if not para:
            continue
        para_tokens = _simple_token_count(para)
        if current and current_tokens + para_tokens > max_tokens:
            chunks.append("\n\n".join(current))
            # Seed the next chunk with as many trailing paragraphs of the one
            # just emitted as fit within the overlap budget.
            carried: list[str] = []
            carried_tokens = 0
            for prev in reversed(current):
                prev_tokens = _simple_token_count(prev)
                if carried_tokens + prev_tokens > overlap_tokens:
                    break
                carried.insert(0, prev)
                carried_tokens += prev_tokens
            current = carried
            current_tokens = carried_tokens
        current.append(para)
        current_tokens += para_tokens
    if current:
        chunks.append("\n\n".join(current))
    return chunks


def iter_soil_chunks() -> Iterator[tuple[str, dict]]:
    """Yield text chunks built from soil_data and sensor_data.

    One ``(text, metadata)`` pair is yielded per SoilLocation, ordered by id,
    with ``depths`` (ordered by ``depth_label``) and ``sensor_data``
    prefetched. The metadata holds ``source`` and ``location_id``.
    """
    locations = (
        SoilLocation.objects.prefetch_related(
            Prefetch("depths", queryset=SoilDepthData.objects.order_by("depth_label")),
            "sensor_data",
        )
        .order_by("id")
    )
    for loc in locations:
        text = _location_to_text(loc)
        if not text.strip():
            continue
        yield text, {
            "source": "soil_data",
            "location_id": loc.id,
        }


def iter_tone_chunks(
    tone_path: str | Path,
    max_tokens: int = 500,
    overlap: int = 50,
) -> Iterator[tuple[str, dict]]:
    """Yield ``(text, metadata)`` chunks of the tone file.

    Yields nothing when the file does not exist or is empty after stripping.
    """
    content = _load_tone_file(tone_path)
    if not content:
        return
    for chunk in _chunk_text(content, max_tokens=max_tokens, overlap_tokens=overlap):
        yield chunk, {"source": "tone"}


def build_all_chunks(
    tone_path: str | Path,
    max_chunk_tokens: int = 500,
    overlap_tokens: int = 50,
) -> list[tuple[str, dict]]:
    """Build every chunk from soil_data, sensor_data, and the tone file.

    Returns:
        A list of ``(text, metadata)`` pairs: all soil chunks first, then the
        tone-file chunks.
    """
    out = list(iter_soil_chunks())
    # iter_tone_chunks names its overlap parameter ``overlap``; passing
    # ``overlap_tokens=`` would raise TypeError at call time.
    out.extend(
        iter_tone_chunks(
            tone_path,
            max_tokens=max_chunk_tokens,
            overlap=overlap_tokens,
        )
    )
    return out