"""
|
||
|
|
تولید chunk متنی از دادههای sensor_data، soil_data و فایل لحن.
|
||
|
|
"""
|
||
|
|
import re
|
||
|
|
from pathlib import Path
|
||
|
|
from typing import Iterator
|
||
|
|
|
||
|
|
from django.db.models import Prefetch
|
||
|
|
|
||
|
|
from sensor_data.models import SensorData
|
||
|
|
from soil_data.models import SoilDepthData, SoilLocation
|
||
|
|
|
||
|
|
|
||
|
|
DEPTH_LABELS_FA = {
    "0-5cm": "۰–۵ سانتی‌متر",
    "5-15cm": "۵–۱۵ سانتی‌متر",
    "15-30cm": "۱۵–۳۰ سانتی‌متر",
}

SOIL_FIELD_NAMES_FA = {
    "bdod": "چگالی توده خاک",
    "cec": "ظرفیت تبادل کاتیونی",
    "cfvo": "حجم قطعات درشت خاک",
    "clay": "رس",
    "nitrogen": "نیتروژن",
    "ocd": "چگالی کربن آلی",
    "ocs": "ذخیره کربن آلی",
    "phh2o": "pH خاک",
    "sand": "ماسه",
    "silt": "لای",
    "soc": "کربن آلی خاک",
    "wv0010": "آب موجود در ۱۰ kPa",
    "wv0033": "آب موجود در ۳۳ kPa",
    "wv1500": "آب موجود در ۱۵۰۰ kPa",
}


def _fmt(val: float | None) -> str:
    """Format a numeric value to two decimals; return "ندارد" (no data) when the value is missing."""
    if val is None:
        return "ندارد"
    return f"{val:.2f}"


def _soil_depth_to_text(depth: SoilDepthData) -> str:
    """Render a single SoilDepthData record as a descriptive string."""
    parts = []
    for field in ["phh2o", "nitrogen", "clay", "sand", "silt", "cec", "soc", "bdod"]:
        val = getattr(depth, field, None)
        if val is not None:
            name = SOIL_FIELD_NAMES_FA.get(field, field)
            parts.append(f"{name}={_fmt(val)}")
    if not parts:
        return "داده خاک موجود نیست."
    return "، ".join(parts)


def _location_to_text(location: SoilLocation) -> str:
    """
    Render a SoilLocation, together with its depths and sensor_data, as text.
    """
    lat = float(location.latitude)
    lon = float(location.longitude)
    lines = [f"موقعیت جغرافیایی: عرض {lat}، طول {lon}."]

    depths = list(location.depths.order_by("depth_label"))
    for d in depths:
        label_fa = DEPTH_LABELS_FA.get(d.depth_label, d.depth_label)
        lines.append(f"داده‌های خاک عمق {label_fa}: {_soil_depth_to_text(d)}.")

    sensors = list(location.sensor_data.all())
    if sensors:
        for s in sensors:
            parts = []
            if s.soil_moisture is not None:
                parts.append(f"رطوبت خاک={_fmt(s.soil_moisture)}")
            if s.soil_temperature is not None:
                parts.append(f"دما={_fmt(s.soil_temperature)}")
            if s.soil_ph is not None:
                parts.append(f"pH={_fmt(s.soil_ph)}")
            if s.electrical_conductivity is not None:
                parts.append(f"هدایت الکتریکی={_fmt(s.electrical_conductivity)}")
            if s.nitrogen is not None:
                parts.append(f"نیتروژن={_fmt(s.nitrogen)}")
            if s.phosphorus is not None:
                parts.append(f"فسفر={_fmt(s.phosphorus)}")
            if s.potassium is not None:
                parts.append(f"پتاسیم={_fmt(s.potassium)}")
            if parts:
                lines.append(
                    f"داده سنسور (location_id={location.id}): "
                    + "، ".join(parts)
                    + "."
                )

    return "\n".join(lines)


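# Illustrative output of _location_to_text for a hypothetical location (the
# numbers below are invented for the example, not taken from real records):
#
#   موقعیت جغرافیایی: عرض 35.7، طول 51.4.
#   داده‌های خاک عمق ۰–۵ سانتی‌متر: pH خاک=6.50، نیتروژن=1.20، رس=24.00.
#   داده سنسور (location_id=1): رطوبت خاک=31.50، دما=22.10.

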
def _load_tone_file(path: str | Path) -> str:
    """Load the contents of the tone file."""
    path = Path(path)
    if not path.exists():
        return ""
    return path.read_text(encoding="utf-8").strip()


def _simple_token_count(text: str) -> int:
    """Rough token estimate: roughly one token per 4 characters."""
    return max(1, len(text) // 4)


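# For example, _simple_token_count("a" * 400) == 100 and _simple_token_count("ab") == 1,
# so the 500-token defaults used below correspond to roughly 2000 characters of text.

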
def _chunk_text(
    text: str,
    max_tokens: int = 500,
    overlap_tokens: int = 50,
) -> list[str]:
    """
    Split text into chunks based on the token estimate.
    Paragraphs (blank lines) are used as boundaries.
    """
    if not text.strip():
        return []
    if _simple_token_count(text) <= max_tokens:
        return [text.strip()]

    chunks = []
    paragraphs = re.split(r"\n\s*\n", text)
    current = []
    current_tokens = 0

    for para in paragraphs:
        para = para.strip()
        if not para:
            continue
        pt = _simple_token_count(para)
        if current_tokens + pt > max_tokens and current:
            chunks.append("\n\n".join(current))
            # Carry the trailing paragraphs of the finished chunk forward as
            # overlap, up to roughly `overlap_tokens` estimated tokens.
            overlap_text = []
            overlap_sofar = 0
            for p in reversed(current):
                if overlap_sofar + _simple_token_count(p) > overlap_tokens:
                    break
                overlap_text.insert(0, p)
                overlap_sofar += _simple_token_count(p)
            current = overlap_text
            current_tokens = overlap_sofar
        current.append(para)
        current_tokens += pt

    if current:
        chunks.append("\n\n".join(current))
    return chunks


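# Rough sketch of the chunking behaviour under the ~4-characters-per-token
# estimate above (figures are approximate, not measured): a tone file of about
# 4000 characters (~1000 estimated tokens) split over several paragraphs would
# typically produce two chunks of up to 500 tokens each, with the last ~50
# tokens' worth of paragraphs from the first chunk repeated at the start of
# the second.

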
def iter_soil_chunks() -> Iterator[tuple[str, dict]]:
    """
    Yield text chunks built from soil_data and sensor_data.
    Each chunk is a (text, metadata) pair.
    """
    locations = (
        SoilLocation.objects.prefetch_related(
            Prefetch("depths", queryset=SoilDepthData.objects.order_by("depth_label")),
            "sensor_data",
        )
        .order_by("id")
    )

    for loc in locations:
        text = _location_to_text(loc)
        if not text.strip():
            continue
        yield text, {
            "source": "soil_data",
            "location_id": loc.id,
        }


def iter_tone_chunks(
    tone_path: str | Path,
    max_tokens: int = 500,
    overlap: int = 50,
) -> Iterator[tuple[str, dict]]:
    """Yield chunks of the tone file."""
    content = _load_tone_file(tone_path)
    if not content:
        return
    for chunk in _chunk_text(content, max_tokens=max_tokens, overlap_tokens=overlap):
        yield chunk, {"source": "tone"}


def build_all_chunks(
    tone_path: str | Path,
    max_chunk_tokens: int = 500,
    overlap_tokens: int = 50,
) -> list[tuple[str, dict]]:
    """
    Build all chunks from soil_data, sensor_data, and the tone file.
    Output: a list of (text, metadata) pairs.
    """
    out = []
    for text, meta in iter_soil_chunks():
        out.append((text, meta))
    for text, meta in iter_tone_chunks(
        tone_path, max_tokens=max_chunk_tokens, overlap=overlap_tokens
    ):
        out.append((text, meta))
    return out
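

# Illustrative usage (a sketch, not part of this module's public contract):
# in a context where Django is already configured, such as a management
# command or a `manage.py shell` session, the chunks can be built and handed
# to an embedding/indexing step. The tone-file path below is a placeholder.
#
#     chunks = build_all_chunks("data/tone.txt")
#     for text, meta in chunks:
#         print(meta["source"], meta.get("location_id"), text[:80])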