# Ai/location_data/data_driven_subdivision.py
from __future__ import annotations
from io import BytesIO
import math
import os
from pathlib import Path
from dataclasses import dataclass
import json
import logging
from typing import Any
from django.conf import settings
from django.core.files.base import ContentFile
from django.db import transaction
from .block_subdivision import detect_elbow_point, render_elbow_plot
from .models import (
AnalysisGridObservation,
BlockSubdivision,
RemoteSensingClusterAssignment,
RemoteSensingRun,
RemoteSensingSubdivisionResult,
SoilLocation,
)
DEFAULT_CLUSTER_FEATURES = [
"ndvi",
"ndwi",
"soil_vv_db",
]
SUPPORTED_CLUSTER_FEATURES = tuple(DEFAULT_CLUSTER_FEATURES)
DEFAULT_RANDOM_STATE = 42
DEFAULT_MAX_K = 10
DEFAULT_REMOTE_SENSING_DIAGNOSTIC_DIR = "artifacts/remote_sensing_charts"
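
# Note: _build_remote_sensing_diagnostic_dir (below) lets the
# REMOTE_SENSING_DIAGNOSTIC_DIR environment variable override this default,
# and resolves relative paths against settings.BASE_DIR (falling back to the
# current working directory when BASE_DIR is unset).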
logger = logging.getLogger(__name__)
class DataDrivenSubdivisionError(Exception):
"""Raised when remote-sensing-driven subdivision can not be computed."""
class EmptyObservationDatasetError(DataDrivenSubdivisionError):
"""Raised when upstream persistence completes without usable clustering features."""
@dataclass
class ClusteringDataset:
observations: list[AnalysisGridObservation]
selected_features: list[str]
raw_feature_rows: list[list[float | None]]
raw_feature_maps: list[dict[str, float | None]]
skipped_cell_codes: list[str]
used_cell_codes: list[str]
imputed_matrix: list[list[float]]
scaled_matrix: list[list[float]]
imputer_statistics: dict[str, float | None]
scaler_means: dict[str, float]
scaler_scales: dict[str, float]
missing_value_counts: dict[str, int]
skipped_reasons: dict[str, list[str]]
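
# ClusteringDataset invariant: observations, raw_feature_rows, raw_feature_maps,
# used_cell_codes, and the rows of imputed_matrix/scaled_matrix are parallel --
# index i in each refers to the same grid cell. skipped_cell_codes (with
# skipped_reasons) records the cells that were dropped before clustering.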
def create_remote_sensing_subdivision_result(
*,
location: SoilLocation,
run: RemoteSensingRun,
observations: list[AnalysisGridObservation],
block_subdivision: BlockSubdivision | None = None,
block_code: str = "",
selected_features: list[str] | None = None,
explicit_k: int | None = None,
max_k: int = DEFAULT_MAX_K,
random_state: int = DEFAULT_RANDOM_STATE,
) -> RemoteSensingSubdivisionResult:
"""
Build a data-driven subdivision result from stored remote sensing observations.
KMeans is applied on actual per-cell feature vectors, not geometric points.
"""
dataset = build_clustering_dataset(
observations=observations,
selected_features=selected_features,
run=run,
location=location,
)
if not dataset.observations:
        raise DataDrivenSubdivisionError("No usable observations remained for clustering.")
optimal_k, inertia_curve = choose_cluster_count(
scaled_matrix=dataset.scaled_matrix,
explicit_k=explicit_k,
max_k=max_k,
random_state=random_state,
)
cluster_selection_strategy = "explicit_k" if explicit_k is not None else "elbow"
labels = run_kmeans_labels(
scaled_matrix=dataset.scaled_matrix,
cluster_count=optimal_k,
random_state=random_state,
)
cluster_summaries = build_cluster_summaries(
observations=dataset.observations,
labels=labels,
)
with transaction.atomic():
result, _created = RemoteSensingSubdivisionResult.objects.update_or_create(
run=run,
defaults={
"soil_location": location,
"block_subdivision": block_subdivision,
"block_code": block_code,
"chunk_size_sqm": run.chunk_size_sqm,
"temporal_start": run.temporal_start,
"temporal_end": run.temporal_end,
"cluster_count": optimal_k,
"selected_features": dataset.selected_features,
"skipped_cell_codes": dataset.skipped_cell_codes,
"metadata": {
"cell_count": len(observations),
"used_cell_count": len(dataset.observations),
"skipped_cell_count": len(dataset.skipped_cell_codes),
"used_cell_codes": dataset.used_cell_codes,
"skipped_reasons": dataset.skipped_reasons,
"selected_features": dataset.selected_features,
"imputer_strategy": "median",
"imputer_statistics": dataset.imputer_statistics,
"missing_value_counts": dataset.missing_value_counts,
"scaler_means": dataset.scaler_means,
"scaler_scales": dataset.scaler_scales,
"kmeans_params": {
"random_state": random_state,
"explicit_k": explicit_k,
"selected_k": optimal_k,
"max_k": max_k,
"n_init": 10,
"selection_strategy": cluster_selection_strategy,
},
"inertia_curve": inertia_curve,
"cluster_summaries": cluster_summaries,
},
},
)
result.assignments.all().delete()
assignment_rows = []
for index, observation in enumerate(dataset.observations):
assignment_rows.append(
RemoteSensingClusterAssignment(
result=result,
cell=observation.cell,
cluster_label=int(labels[index]),
raw_feature_values=dataset.raw_feature_maps[index],
scaled_feature_values={
feature_name: round(dataset.scaled_matrix[index][feature_index], 6)
for feature_index, feature_name in enumerate(dataset.selected_features)
},
)
)
RemoteSensingClusterAssignment.objects.bulk_create(assignment_rows)
diagnostic_artifacts = _persist_remote_sensing_diagnostic_artifacts(
result=result,
observations=dataset.observations,
labels=labels,
cluster_summaries=cluster_summaries,
selected_features=dataset.selected_features,
scaled_matrix=dataset.scaled_matrix,
inertia_curve=inertia_curve,
)
if diagnostic_artifacts:
metadata = dict(result.metadata or {})
metadata["diagnostic_artifacts"] = diagnostic_artifacts
result.metadata = metadata
result.save(update_fields=["metadata", "updated_at"])
if block_subdivision is not None:
sync_block_subdivision_with_result(
block_subdivision=block_subdivision,
result=result,
observations=observations,
cluster_summaries=cluster_summaries,
)
sync_location_block_layout_with_result(
location=location,
result=result,
cluster_summaries=cluster_summaries,
)
return result
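
# Minimal call sketch (illustrative; assumes a reverse relation named
# "observations" on RemoteSensingRun, which may be named differently in the
# real models):
#
#     result = create_remote_sensing_subdivision_result(
#         location=location,
#         run=run,
#         observations=list(run.observations.select_related("cell")),
#         explicit_k=None,  # None -> pick k with the elbow heuristic
#     )
#     print(result.cluster_count, result.metadata["kmeans_params"])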
def build_clustering_dataset(
*,
observations: list[AnalysisGridObservation],
selected_features: list[str] | None = None,
run: RemoteSensingRun | None = None,
location: SoilLocation | None = None,
) -> ClusteringDataset:
selected_features = list(selected_features or DEFAULT_CLUSTER_FEATURES)
invalid_features = [
feature_name
for feature_name in selected_features
if feature_name not in SUPPORTED_CLUSTER_FEATURES
]
if invalid_features:
raise DataDrivenSubdivisionError(
"ویژگی‌های نامعتبر برای خوشه‌بندی: "
+ ", ".join(sorted(invalid_features))
)
log_context = _build_clustering_log_context(
observations=observations,
selected_features=selected_features,
run=run,
location=location,
)
logger.info(
"Preparing clustering dataset: %s",
_serialize_log_payload(
{
**log_context,
"total_observations": len(observations),
"non_null_counts": _count_non_null_features(observations),
}
),
)
raw_rows: list[list[float | None]] = []
raw_maps: list[dict[str, float | None]] = []
usable_observations: list[AnalysisGridObservation] = []
skipped_cell_codes: list[str] = []
used_cell_codes: list[str] = []
missing_value_counts = {feature_name: 0 for feature_name in selected_features}
skipped_reasons = {"all_features_missing": []}
for observation in observations:
feature_map = {
feature_name: _coerce_float(getattr(observation, feature_name, None))
for feature_name in selected_features
}
for feature_name, value in feature_map.items():
if value is None:
missing_value_counts[feature_name] += 1
if all(value is None for value in feature_map.values()):
logger.debug(
"Skipping observation cell=%s: all clustering features are null | context=%s",
observation.cell.cell_code,
_serialize_log_payload(log_context),
)
skipped_cell_codes.append(observation.cell.cell_code)
skipped_reasons["all_features_missing"].append(observation.cell.cell_code)
continue
usable_observations.append(observation)
used_cell_codes.append(observation.cell.cell_code)
raw_maps.append(feature_map)
raw_rows.append([feature_map[feature_name] for feature_name in selected_features])
logger.info(
"Clustering dataset filtered observations: %s",
_serialize_log_payload(
{
**log_context,
"remaining_observations": len(usable_observations),
"removed_observations": len(observations) - len(usable_observations),
}
)
)
    zero_usable_feature_names = [
        feature_name
        for feature_name, missing_count in missing_value_counts.items()
        if missing_count == len(observations)
    ]
if zero_usable_feature_names and len(zero_usable_feature_names) < len(selected_features):
for feature_name in zero_usable_feature_names:
logger.warning(
"Feature %s has zero usable values in dataset | context=%s",
feature_name,
_serialize_log_payload(log_context),
)
if not usable_observations:
error_context = {
**log_context,
"total_observations": len(observations),
"removed_observations": len(observations),
"null_counts_per_feature": missing_value_counts,
"selected_features": selected_features,
}
logger.error(
"No usable observations available for clustering: %s",
_serialize_log_payload(error_context),
)
raise EmptyObservationDatasetError(
"Upstream processing completed but no usable feature values were persisted."
)
try:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
except ImportError as exc: # pragma: no cover - runtime dependency guard
raise DataDrivenSubdivisionError(
"scikit-learn و numpy برای خوشه‌بندی داده‌محور لازم هستند."
) from exc
raw_matrix = np.array(raw_rows, dtype=float)
imputer = SimpleImputer(strategy="median")
imputed_matrix = imputer.fit_transform(raw_matrix)
scaler = StandardScaler()
scaled_matrix = scaler.fit_transform(imputed_matrix)
return ClusteringDataset(
observations=usable_observations,
selected_features=selected_features,
raw_feature_rows=raw_rows,
raw_feature_maps=raw_maps,
skipped_cell_codes=skipped_cell_codes,
used_cell_codes=used_cell_codes,
imputed_matrix=imputed_matrix.tolist(),
scaled_matrix=scaled_matrix.tolist(),
imputer_statistics={
feature_name: _coerce_float(imputer.statistics_[index])
for index, feature_name in enumerate(selected_features)
},
scaler_means={
feature_name: float(scaler.mean_[index])
for index, feature_name in enumerate(selected_features)
},
scaler_scales={
feature_name: float(scaler.scale_[index] or 1.0)
for index, feature_name in enumerate(selected_features)
},
missing_value_counts=missing_value_counts,
skipped_reasons=skipped_reasons,
)
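
# Preprocessing sketch: each selected feature column is median-imputed, then
# standardised to zero mean and unit variance. A single column [1.0, None, 3.0]
# imputes the gap to the median 2.0 and scales to roughly
# [-1.2247, 0.0, 1.2247], since StandardScaler divides by the population
# standard deviation (~0.8165 here).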
def choose_cluster_count(
*,
scaled_matrix: list[list[float]],
explicit_k: int | None,
max_k: int,
random_state: int,
) -> tuple[int, list[dict[str, float]]]:
sample_count = len(scaled_matrix)
if sample_count == 0:
        raise DataDrivenSubdivisionError("No samples are available for clustering.")
if sample_count == 1:
return 1, [{"k": 1, "sse": 0.0}]
if explicit_k is not None:
if explicit_k <= 0:
            raise DataDrivenSubdivisionError("cluster_count must be greater than zero.")
return min(explicit_k, sample_count), []
try:
from sklearn.cluster import KMeans
except ImportError as exc: # pragma: no cover
        raise DataDrivenSubdivisionError("scikit-learn is required to choose the cluster count.") from exc
max_allowed_k = min(max_k, sample_count)
inertia_curve = []
for k in range(1, max_allowed_k + 1):
model = KMeans(n_clusters=k, n_init=10, random_state=random_state)
model.fit(scaled_matrix)
inertia_curve.append({"k": k, "sse": round(float(model.inertia_), 6)})
return detect_elbow_point(inertia_curve), inertia_curve
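
# Illustrative inertia curve (values invented): given
#     [{"k": 1, "sse": 120.0}, {"k": 2, "sse": 40.0},
#      {"k": 3, "sse": 30.0}, {"k": 4, "sse": 27.0}]
# detect_elbow_point -- defined in block_subdivision, so the exact heuristic
# lives there -- should favour k=2, where the SSE drop flattens out.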
def run_kmeans_labels(
*,
scaled_matrix: list[list[float]],
cluster_count: int,
random_state: int,
) -> list[int]:
if cluster_count <= 0:
        raise DataDrivenSubdivisionError("cluster_count must be greater than zero.")
if len(scaled_matrix) == 1:
return [0]
try:
from sklearn.cluster import KMeans
except ImportError as exc: # pragma: no cover
        raise DataDrivenSubdivisionError("scikit-learn is required to run KMeans.") from exc
model = KMeans(n_clusters=cluster_count, n_init=10, random_state=random_state)
return [int(label) for label in model.fit_predict(scaled_matrix)]
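
# With random_state and n_init fixed, fit_predict is deterministic for a given
# input matrix and scikit-learn version, so persisted cluster assignments are
# reproducible across reruns.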
def build_cluster_summaries(
*,
observations: list[AnalysisGridObservation],
labels: list[int],
) -> list[dict[str, Any]]:
clusters: dict[int, dict[str, Any]] = {}
for observation, label in zip(observations, labels):
cluster = clusters.setdefault(
int(label),
{
"cluster_label": int(label),
"cell_codes": [],
"centroid_lat_sum": 0.0,
"centroid_lon_sum": 0.0,
"cell_count": 0,
},
)
cluster["cell_codes"].append(observation.cell.cell_code)
cluster["centroid_lat_sum"] += float(observation.cell.centroid_lat)
cluster["centroid_lon_sum"] += float(observation.cell.centroid_lon)
cluster["cell_count"] += 1
summaries = []
for cluster_label in sorted(clusters):
cluster = clusters[cluster_label]
cell_count = cluster["cell_count"] or 1
summaries.append(
{
"cluster_label": cluster_label,
"cell_count": cluster["cell_count"],
"centroid_lat": round(cluster["centroid_lat_sum"] / cell_count, 6),
"centroid_lon": round(cluster["centroid_lon_sum"] / cell_count, 6),
"cell_codes": cluster["cell_codes"],
}
)
return summaries
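
# Shape of one summary entry (values illustrative):
#     {
#         "cluster_label": 0,
#         "cell_count": 12,
#         "centroid_lat": 35.715298,  # mean latitude of member-cell centroids
#         "centroid_lon": 51.404343,
#         "cell_codes": ["cell-001", "cell-002"],
#     }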
def sync_location_block_layout_with_result(
*,
location: SoilLocation,
result: RemoteSensingSubdivisionResult,
cluster_summaries: list[dict[str, Any]],
) -> None:
layout = dict(location.block_layout or {})
blocks = list(layout.get("blocks") or [])
target_block = None
for block in blocks:
if block.get("block_code") == result.block_code:
target_block = block
break
if target_block is None:
target_block = {
"block_code": result.block_code,
"order": len(blocks) + 1,
"source": "remote_sensing",
"needs_subdivision": None,
"sub_blocks": [],
}
blocks.append(target_block)
target_block["needs_subdivision"] = result.cluster_count > 1
target_block["sub_blocks"] = [
{
"sub_block_code": f"cluster-{cluster['cluster_label']}",
"cluster_label": cluster["cluster_label"],
"centroid_lat": cluster["centroid_lat"],
"centroid_lon": cluster["centroid_lon"],
"cell_count": cluster["cell_count"],
}
for cluster in cluster_summaries
]
target_block["subdivision_summary"] = {
"type": "data_driven_remote_sensing",
"cluster_count": result.cluster_count,
"selected_features": result.selected_features,
"used_cell_count": result.metadata.get("used_cell_count", 0),
"skipped_cell_count": result.metadata.get("skipped_cell_count", 0),
"run_id": result.run_id,
}
layout["blocks"] = blocks
layout["algorithm_status"] = "completed"
location.block_layout = layout
location.save(update_fields=["block_layout", "updated_at"])
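
# After syncing, location.block_layout looks roughly like (values illustrative):
#     {
#         "blocks": [
#             {
#                 "block_code": "B1",
#                 "source": "remote_sensing",
#                 "needs_subdivision": True,
#                 "sub_blocks": [{"sub_block_code": "cluster-0", ...}],
#                 "subdivision_summary": {"type": "data_driven_remote_sensing", ...},
#             },
#         ],
#         "algorithm_status": "completed",
#     }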
def sync_block_subdivision_with_result(
*,
block_subdivision: BlockSubdivision,
result: RemoteSensingSubdivisionResult,
observations: list[AnalysisGridObservation],
cluster_summaries: list[dict[str, Any]],
) -> None:
metadata = dict(block_subdivision.metadata or {})
metadata["data_driven_subdivision"] = {
"run_id": result.run_id,
"result_id": result.id,
"cluster_count": result.cluster_count,
"selected_features": result.selected_features,
"used_cell_count": result.metadata.get("used_cell_count", 0),
"skipped_cell_count": result.metadata.get("skipped_cell_count", 0),
"temporal_extent": {
"start_date": result.temporal_start.isoformat() if result.temporal_start else None,
"end_date": result.temporal_end.isoformat() if result.temporal_end else None,
},
"inertia_curve": result.metadata.get("inertia_curve", []),
"diagnostic_artifacts": result.metadata.get("diagnostic_artifacts", {}),
}
block_subdivision.grid_points = [
{
"cell_code": observation.cell.cell_code,
"centroid_lat": round(float(observation.cell.centroid_lat), 6),
"centroid_lon": round(float(observation.cell.centroid_lon), 6),
}
for observation in observations
]
block_subdivision.centroid_points = [
{
"sub_block_code": f"cluster-{cluster['cluster_label']}",
"cluster_label": cluster["cluster_label"],
"centroid_lat": cluster["centroid_lat"],
"centroid_lon": cluster["centroid_lon"],
"cell_count": cluster["cell_count"],
"cell_codes": cluster["cell_codes"],
}
for cluster in cluster_summaries
]
block_subdivision.grid_point_count = len(observations)
block_subdivision.centroid_count = len(cluster_summaries)
block_subdivision.status = "subdivided"
block_subdivision.metadata = metadata
plot_content = render_elbow_plot(
inertia_curve=result.metadata.get("inertia_curve", []),
optimal_k=result.cluster_count,
block_code=result.block_code or block_subdivision.block_code,
)
if plot_content is not None:
block_subdivision.elbow_plot.save(
f"remote-sensing-{result.soil_location_id}-{result.block_code or block_subdivision.block_code}-elbow.png",
plot_content,
save=False,
)
block_subdivision.save(
update_fields=[
"grid_points",
"centroid_points",
"grid_point_count",
"centroid_count",
"status",
"metadata",
"elbow_plot",
"updated_at",
]
)
return
block_subdivision.save(
update_fields=[
"grid_points",
"centroid_points",
"grid_point_count",
"centroid_count",
"status",
"metadata",
"updated_at",
]
)
def _coerce_float(value: Any) -> float | None:
if value is None:
return None
try:
return float(value)
except (TypeError, ValueError):
return None
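
# Examples: _coerce_float("0.42") -> 0.42, _coerce_float(Decimal("1.5")) -> 1.5,
# _coerce_float("n/a") -> None, _coerce_float(None) -> None.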
def _count_non_null_features(observations: list[AnalysisGridObservation]) -> dict[str, int]:
counts = {feature_name: 0 for feature_name in DEFAULT_CLUSTER_FEATURES}
for observation in observations:
for feature_name in DEFAULT_CLUSTER_FEATURES:
if _coerce_float(getattr(observation, feature_name, None)) is not None:
counts[feature_name] += 1
return counts
def _persist_remote_sensing_diagnostic_artifacts(
*,
result: RemoteSensingSubdivisionResult,
observations: list[AnalysisGridObservation],
labels: list[int],
cluster_summaries: list[dict[str, Any]],
selected_features: list[str],
scaled_matrix: list[list[float]],
inertia_curve: list[dict[str, float]],
) -> dict[str, Any]:
try:
artifact_dir = _build_remote_sensing_diagnostic_dir(result=result)
artifact_dir.mkdir(parents=True, exist_ok=True)
specs = [
(
"elbow_plot",
render_elbow_plot(
inertia_curve=inertia_curve,
optimal_k=result.cluster_count,
block_code=result.block_code or "farm",
),
"elbow",
),
(
"cluster_map",
_render_cluster_map_plot(
observations=observations,
labels=labels,
block_code=result.block_code or "farm",
),
"cluster-map",
),
(
"cluster_sizes",
_render_cluster_size_plot(
cluster_summaries=cluster_summaries,
block_code=result.block_code or "farm",
),
"cluster-sizes",
),
(
"feature_pairs",
_render_feature_pair_plot(
selected_features=selected_features,
scaled_matrix=scaled_matrix,
labels=labels,
block_code=result.block_code or "farm",
),
"feature-pairs",
),
]
files: dict[str, str] = {}
for artifact_key, content, suffix in specs:
if content is None:
continue
target_path = artifact_dir / f"{_build_remote_sensing_artifact_stem(result=result)}__{suffix}.png"
_write_content_file(target_path=target_path, content=content)
files[artifact_key] = _to_project_relative_path(target_path)
return {
"directory": _to_project_relative_path(artifact_dir),
"files": files,
}
except (DataDrivenSubdivisionError, OSError) as exc:
logger.warning(
"Failed to persist remote sensing diagnostic artifacts for result_id=%s: %s",
result.id,
exc,
)
return {}
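
# On success the returned mapping looks like (paths illustrative):
#     {
#         "directory": "artifacts/remote_sensing_charts/location-3/run-9-B1",
#         "files": {
#             "elbow_plot": ".../location-3__run-9__B1__elbow.png",
#             "cluster_map": ".../location-3__run-9__B1__cluster-map.png",
#             "cluster_sizes": ".../location-3__run-9__B1__cluster-sizes.png",
#             "feature_pairs": ".../location-3__run-9__B1__feature-pairs.png",
#         },
#     }
# An empty dict means persistence failed and a warning was logged.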
def _build_remote_sensing_diagnostic_dir(*, result: RemoteSensingSubdivisionResult) -> Path:
configured_dir = str(
os.environ.get("REMOTE_SENSING_DIAGNOSTIC_DIR", DEFAULT_REMOTE_SENSING_DIAGNOSTIC_DIR)
).strip()
base_dir = Path(getattr(settings, "BASE_DIR", Path.cwd()))
target_dir = Path(configured_dir)
if not target_dir.is_absolute():
target_dir = base_dir / target_dir
block_component = _sanitize_path_component(result.block_code or "farm")
return target_dir / f"location-{result.soil_location_id}" / f"run-{result.run_id}-{block_component}"
def _build_remote_sensing_artifact_stem(*, result: RemoteSensingSubdivisionResult) -> str:
return (
f"location-{result.soil_location_id}"
f"__run-{result.run_id}"
f"__{_sanitize_path_component(result.block_code or 'farm')}"
)
def _write_content_file(*, target_path: Path, content: ContentFile) -> None:
target_path.parent.mkdir(parents=True, exist_ok=True)
content.open("rb")
try:
target_path.write_bytes(content.read())
finally:
content.close()
def _to_project_relative_path(path: Path) -> str:
base_dir = Path(getattr(settings, "BASE_DIR", Path.cwd()))
try:
return str(path.relative_to(base_dir))
except ValueError:
return str(path)
def _sanitize_path_component(value: str) -> str:
text = str(value or "").strip() or "unknown"
sanitized = "".join(character if character.isalnum() or character in {"-", "_", "."} else "_" for character in text)
return sanitized or "unknown"
def _render_cluster_map_plot(
*,
observations: list[AnalysisGridObservation],
labels: list[int],
block_code: str,
) -> ContentFile | None:
if not observations:
return None
plt = _import_matplotlib_pyplot()
unique_labels = sorted(set(int(label) for label in labels))
colors = plt.cm.get_cmap("tab10", max(len(unique_labels), 1))
fig, ax = plt.subplots(figsize=(8, 6))
buffer = BytesIO()
try:
for color_index, cluster_label in enumerate(unique_labels):
cluster_points = [
(float(observation.cell.centroid_lon), float(observation.cell.centroid_lat))
for observation, label in zip(observations, labels)
if int(label) == cluster_label
]
if not cluster_points:
continue
xs = [point[0] for point in cluster_points]
ys = [point[1] for point in cluster_points]
ax.scatter(
xs,
ys,
s=70,
alpha=0.9,
color=colors(color_index),
edgecolors="white",
linewidths=0.8,
label=f"Cluster {cluster_label}",
)
ax.set_title(f"KMeans Spatial Cluster Map - {block_code}")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.grid(True, linestyle="--", linewidth=0.5, alpha=0.4)
if unique_labels:
ax.legend()
fig.tight_layout()
fig.savefig(buffer, format="png", dpi=150)
buffer.seek(0)
return ContentFile(buffer.getvalue())
finally:
buffer.close()
plt.close(fig)
def _render_cluster_size_plot(
*,
cluster_summaries: list[dict[str, Any]],
block_code: str,
) -> ContentFile | None:
if not cluster_summaries:
return None
plt = _import_matplotlib_pyplot()
labels = [f"C{int(cluster['cluster_label'])}" for cluster in cluster_summaries]
counts = [int(cluster["cell_count"]) for cluster in cluster_summaries]
fig, ax = plt.subplots(figsize=(8, 5))
buffer = BytesIO()
try:
bars = ax.bar(labels, counts, color="#2f6fed", alpha=0.85)
for bar, count in zip(bars, counts):
ax.text(
bar.get_x() + bar.get_width() / 2.0,
bar.get_height(),
str(count),
ha="center",
va="bottom",
fontsize=9,
)
ax.set_title(f"Cluster Sizes - {block_code}")
ax.set_xlabel("Cluster")
ax.set_ylabel("Cell Count")
ax.grid(True, axis="y", linestyle="--", linewidth=0.5, alpha=0.4)
fig.tight_layout()
fig.savefig(buffer, format="png", dpi=150)
buffer.seek(0)
return ContentFile(buffer.getvalue())
finally:
buffer.close()
plt.close(fig)
def _render_feature_pair_plot(
*,
selected_features: list[str],
scaled_matrix: list[list[float]],
labels: list[int],
block_code: str,
) -> ContentFile | None:
if not scaled_matrix or not selected_features:
return None
plt = _import_matplotlib_pyplot()
feature_count = len(selected_features)
    if feature_count == 1:
        pair_indexes = [(0, 0)]
    else:
        pair_indexes = [
            (left_index, right_index)
            for left_index in range(feature_count)
            for right_index in range(left_index + 1, feature_count)
        ]
subplot_count = len(pair_indexes)
columns = 2 if subplot_count > 1 else 1
rows = math.ceil(subplot_count / columns)
fig, axes = plt.subplots(rows, columns, figsize=(7 * columns, 5 * rows))
axes_list = axes.flatten().tolist() if hasattr(axes, "flatten") else [axes]
unique_labels = sorted(set(int(label) for label in labels))
colors = plt.cm.get_cmap("tab10", max(len(unique_labels), 1))
buffer = BytesIO()
try:
for axis, (left_index, right_index) in zip(axes_list, pair_indexes):
if feature_count == 1:
xs = list(range(1, len(scaled_matrix) + 1))
ys = [row[0] for row in scaled_matrix]
for color_index, cluster_label in enumerate(unique_labels):
filtered = [
(x_value, y_value)
for x_value, y_value, label in zip(xs, ys, labels)
if int(label) == cluster_label
]
axis.scatter(
[item[0] for item in filtered],
[item[1] for item in filtered],
s=55,
color=colors(color_index),
alpha=0.85,
label=f"Cluster {cluster_label}",
)
axis.set_xlabel("Observation Index")
axis.set_ylabel(f"{selected_features[0]} (scaled)")
axis.set_title(f"{selected_features[0]} distribution")
else:
x_values = [row[left_index] for row in scaled_matrix]
y_values = [row[right_index] for row in scaled_matrix]
for color_index, cluster_label in enumerate(unique_labels):
filtered = [
(x_value, y_value)
for x_value, y_value, label in zip(x_values, y_values, labels)
if int(label) == cluster_label
]
axis.scatter(
[item[0] for item in filtered],
[item[1] for item in filtered],
s=55,
color=colors(color_index),
alpha=0.85,
label=f"Cluster {cluster_label}",
)
axis.set_xlabel(f"{selected_features[left_index]} (scaled)")
axis.set_ylabel(f"{selected_features[right_index]} (scaled)")
axis.set_title(
f"{selected_features[left_index]} vs {selected_features[right_index]}"
)
axis.grid(True, linestyle="--", linewidth=0.5, alpha=0.4)
for axis in axes_list[subplot_count:]:
axis.remove()
if unique_labels and axes_list:
axes_list[0].legend()
fig.suptitle(f"KMeans Feature Diagnostics - {block_code}", fontsize=14)
fig.tight_layout(rect=(0, 0, 1, 0.97))
fig.savefig(buffer, format="png", dpi=150)
buffer.seek(0)
return ContentFile(buffer.getvalue())
finally:
buffer.close()
plt.close(fig)
def _import_matplotlib_pyplot():
try:
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
except ImportError as exc: # pragma: no cover - runtime dependency guard
        raise DataDrivenSubdivisionError("matplotlib is required to save the KMeans diagnostic charts.") from exc
return plt
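
# The Agg backend is selected before pyplot is imported so figures render
# off-screen, which keeps chart generation working on headless servers.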
def _build_clustering_log_context(
*,
observations: list[AnalysisGridObservation],
selected_features: list[str],
run: RemoteSensingRun | None,
location: SoilLocation | None,
) -> dict[str, Any]:
first_observation = observations[0] if observations else None
observation_metadata = dict(getattr(first_observation, "metadata", {}) or {})
resolved_run = run or getattr(first_observation, "run", None)
resolved_location = location or getattr(getattr(first_observation, "cell", None), "soil_location", None)
temporal_start = getattr(resolved_run, "temporal_start", None) or getattr(first_observation, "temporal_start", None)
temporal_end = getattr(resolved_run, "temporal_end", None) or getattr(first_observation, "temporal_end", None)
return {
"run_id": getattr(resolved_run, "id", None),
"job_ref": observation_metadata.get("job_refs", {}),
"region_id": getattr(resolved_location, "id", None),
"date_range": {
"temporal_start": temporal_start.isoformat() if hasattr(temporal_start, "isoformat") else temporal_start,
"temporal_end": temporal_end.isoformat() if hasattr(temporal_end, "isoformat") else temporal_end,
},
"selected_features": selected_features,
}
def _serialize_log_payload(payload: dict[str, Any]) -> str:
return json.dumps(payload, ensure_ascii=True, default=str, sort_keys=True)
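
# default=str keeps non-JSON values (dates, Decimals, model instances) from
# raising, and sort_keys=True gives stable output, e.g.:
#     _serialize_log_payload({"b": date(2026, 5, 9), "a": 1})
#     -> '{"a": 1, "b": "2026-05-09"}'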