# Ai/location_data/data_driven_subdivision.py
from __future__ import annotations
from io import BytesIO
import math
import os
from pathlib import Path
from dataclasses import dataclass
import json
import logging
from typing import Any
from django.conf import settings
from django.core.files.base import ContentFile
from django.db import transaction
from .block_subdivision import detect_elbow_point, render_elbow_plot
from .models import (
AnalysisGridObservation,
BlockSubdivision,
RemoteSensingClusterAssignment,
RemoteSensingRun,
RemoteSensingSubdivisionResult,
SoilLocation,
)
DEFAULT_CLUSTER_FEATURES = [
"ndvi",
"ndwi",
"soil_vv_db",
]
SUPPORTED_CLUSTER_FEATURES = tuple(DEFAULT_CLUSTER_FEATURES)
DEFAULT_RANDOM_STATE = 42
DEFAULT_MAX_K = 10
DEFAULT_REMOTE_SENSING_DIAGNOSTIC_DIR = "artifacts/remote_sensing_charts"
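
# Note: _build_remote_sensing_diagnostic_dir (below) lets the
# REMOTE_SENSING_DIAGNOSTIC_DIR environment variable override this default,
# and resolves relative paths against settings.BASE_DIR (falling back to the
# current working directory when BASE_DIR is unset).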
logger = logging.getLogger(__name__)
class DataDrivenSubdivisionError(Exception):
"""Raised when remote-sensing-driven subdivision can not be computed."""
class EmptyObservationDatasetError(DataDrivenSubdivisionError):
"""Raised when upstream persistence completes without usable clustering features."""
@dataclass
class ClusteringDataset:
observations: list[AnalysisGridObservation]
selected_features: list[str]
raw_feature_rows: list[list[float | None]]
raw_feature_maps: list[dict[str, float | None]]
skipped_cell_codes: list[str]
used_cell_codes: list[str]
imputed_matrix: list[list[float]]
scaled_matrix: list[list[float]]
imputer_statistics: dict[str, float | None]
scaler_means: dict[str, float]
scaler_scales: dict[str, float]
missing_value_counts: dict[str, int]
skipped_reasons: dict[str, list[str]]
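
# ClusteringDataset invariant: observations, raw_feature_rows, raw_feature_maps,
# used_cell_codes, and the rows of imputed_matrix/scaled_matrix are parallel --
# index i in each refers to the same grid cell. skipped_cell_codes (with
# skipped_reasons) records the cells that were dropped before clustering.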
def create_remote_sensing_subdivision_result(
*,
location: SoilLocation,
run: RemoteSensingRun,
observations: list[AnalysisGridObservation],
block_subdivision: BlockSubdivision | None = None,
block_code: str = "",
selected_features: list[str] | None = None,
explicit_k: int | None = None,
max_k: int = DEFAULT_MAX_K,
random_state: int = DEFAULT_RANDOM_STATE,
) -> RemoteSensingSubdivisionResult:
"""
Build a data-driven subdivision result from stored remote sensing observations.
KMeans is applied on actual per-cell feature vectors, not geometric points.
"""
dataset = build_clustering_dataset(
observations=observations,
selected_features=selected_features,
run=run,
location=location,
)
if not dataset.observations:
        raise DataDrivenSubdivisionError("No usable observations remained for clustering.")
optimal_k, inertia_curve = choose_cluster_count(
scaled_matrix=dataset.scaled_matrix,
explicit_k=explicit_k,
max_k=max_k,
random_state=random_state,
)
cluster_selection_strategy = "explicit_k" if explicit_k is not None else "elbow"
labels = run_kmeans_labels(
scaled_matrix=dataset.scaled_matrix,
cluster_count=optimal_k,
random_state=random_state,
)
cluster_summaries = build_cluster_summaries(
observations=dataset.observations,
labels=labels,
)
with transaction.atomic():
result, _created = RemoteSensingSubdivisionResult.objects.update_or_create(
run=run,
defaults={
"soil_location": location,
"block_subdivision": block_subdivision,
"block_code": block_code,
"chunk_size_sqm": run.chunk_size_sqm,
"temporal_start": run.temporal_start,
"temporal_end": run.temporal_end,
"cluster_count": optimal_k,
"selected_features": dataset.selected_features,
"skipped_cell_codes": dataset.skipped_cell_codes,
"metadata": {
"cell_count": len(observations),
"used_cell_count": len(dataset.observations),
"skipped_cell_count": len(dataset.skipped_cell_codes),
"used_cell_codes": dataset.used_cell_codes,
"skipped_reasons": dataset.skipped_reasons,
"selected_features": dataset.selected_features,
"imputer_strategy": "median",
"imputer_statistics": dataset.imputer_statistics,
"missing_value_counts": dataset.missing_value_counts,
"scaler_means": dataset.scaler_means,
"scaler_scales": dataset.scaler_scales,
"kmeans_params": {
"random_state": random_state,
"explicit_k": explicit_k,
"selected_k": optimal_k,
"max_k": max_k,
"n_init": 10,
"selection_strategy": cluster_selection_strategy,
},
"inertia_curve": inertia_curve,
"cluster_summaries": cluster_summaries,
},
},
)
result.assignments.all().delete()
assignment_rows = []
for index, observation in enumerate(dataset.observations):
assignment_rows.append(
RemoteSensingClusterAssignment(
result=result,
cell=observation.cell,
cluster_label=int(labels[index]),
raw_feature_values=dataset.raw_feature_maps[index],
scaled_feature_values={
feature_name: round(dataset.scaled_matrix[index][feature_index], 6)
for feature_index, feature_name in enumerate(dataset.selected_features)
},
)
)
RemoteSensingClusterAssignment.objects.bulk_create(assignment_rows)
diagnostic_artifacts = _persist_remote_sensing_diagnostic_artifacts(
result=result,
observations=dataset.observations,
labels=labels,
cluster_summaries=cluster_summaries,
selected_features=dataset.selected_features,
scaled_matrix=dataset.scaled_matrix,
inertia_curve=inertia_curve,
)
if diagnostic_artifacts:
metadata = dict(result.metadata or {})
metadata["diagnostic_artifacts"] = diagnostic_artifacts
result.metadata = metadata
result.save(update_fields=["metadata", "updated_at"])
if block_subdivision is not None:
sync_block_subdivision_with_result(
block_subdivision=block_subdivision,
result=result,
observations=observations,
cluster_summaries=cluster_summaries,
)
sync_location_block_layout_with_result(
location=location,
result=result,
cluster_summaries=cluster_summaries,
)
return result
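
# Minimal call sketch (illustrative; assumes a reverse relation named
# "observations" on RemoteSensingRun, which may be named differently in the
# real models):
#
#     result = create_remote_sensing_subdivision_result(
#         location=location,
#         run=run,
#         observations=list(run.observations.select_related("cell")),
#         explicit_k=None,  # None -> pick k with the elbow heuristic
#     )
#     print(result.cluster_count, result.metadata["kmeans_params"])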
def build_clustering_dataset(
*,
observations: list[AnalysisGridObservation],
selected_features: list[str] | None = None,
run: RemoteSensingRun | None = None,
location: SoilLocation | None = None,
) -> ClusteringDataset:
selected_features = list(selected_features or DEFAULT_CLUSTER_FEATURES)
invalid_features = [
feature_name
for feature_name in selected_features
if feature_name not in SUPPORTED_CLUSTER_FEATURES
]
if invalid_features:
raise DataDrivenSubdivisionError(
"ویژگی‌های نامعتبر برای خوشه‌بندی: "
+ ", ".join(sorted(invalid_features))
)
log_context = _build_clustering_log_context(
observations=observations,
selected_features=selected_features,
run=run,
location=location,
)
logger.info(
"Preparing clustering dataset: %s",
_serialize_log_payload(
{
**log_context,
"total_observations": len(observations),
"non_null_counts": _count_non_null_features(observations),
}
),
)
raw_rows: list[list[float | None]] = []
raw_maps: list[dict[str, float | None]] = []
usable_observations: list[AnalysisGridObservation] = []
skipped_cell_codes: list[str] = []
used_cell_codes: list[str] = []
missing_value_counts = {feature_name: 0 for feature_name in selected_features}
skipped_reasons = {"all_features_missing": []}
for observation in observations:
feature_map = {
feature_name: _coerce_float(getattr(observation, feature_name, None))
for feature_name in selected_features
}
for feature_name, value in feature_map.items():
if value is None:
missing_value_counts[feature_name] += 1
if all(value is None for value in feature_map.values()):
logger.debug(
"Skipping observation cell=%s: all clustering features are null | context=%s",
observation.cell.cell_code,
_serialize_log_payload(log_context),
)
skipped_cell_codes.append(observation.cell.cell_code)
skipped_reasons["all_features_missing"].append(observation.cell.cell_code)
continue
usable_observations.append(observation)
used_cell_codes.append(observation.cell.cell_code)
raw_maps.append(feature_map)
raw_rows.append([feature_map[feature_name] for feature_name in selected_features])
logger.info(
"Clustering dataset filtered observations: %s",
_serialize_log_payload(
{
**log_context,
"remaining_observations": len(usable_observations),
"removed_observations": len(observations) - len(usable_observations),
}
)
)
    zero_usable_feature_names = [
        feature_name
        for feature_name, missing_count in missing_value_counts.items()
        if missing_count == len(observations)
    ]
if zero_usable_feature_names and len(zero_usable_feature_names) < len(selected_features):
for feature_name in zero_usable_feature_names:
logger.warning(
"Feature %s has zero usable values in dataset | context=%s",
feature_name,
_serialize_log_payload(log_context),
)
if not usable_observations:
error_context = {
**log_context,
"total_observations": len(observations),
"removed_observations": len(observations),
"null_counts_per_feature": missing_value_counts,
"selected_features": selected_features,
}
logger.error(
"No usable observations available for clustering: %s",
_serialize_log_payload(error_context),
)
raise EmptyObservationDatasetError(
"Upstream processing completed but no usable feature values were persisted."
)
try:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
except ImportError as exc: # pragma: no cover - runtime dependency guard
raise DataDrivenSubdivisionError(
"scikit-learn و numpy برای خوشه‌بندی داده‌محور لازم هستند."
) from exc
raw_matrix = np.array(raw_rows, dtype=float)
imputer = SimpleImputer(strategy="median")
imputed_matrix = imputer.fit_transform(raw_matrix)
scaler = StandardScaler()
scaled_matrix = scaler.fit_transform(imputed_matrix)
return ClusteringDataset(
observations=usable_observations,
selected_features=selected_features,
raw_feature_rows=raw_rows,
raw_feature_maps=raw_maps,
skipped_cell_codes=skipped_cell_codes,
used_cell_codes=used_cell_codes,
imputed_matrix=imputed_matrix.tolist(),
scaled_matrix=scaled_matrix.tolist(),
imputer_statistics={
feature_name: _coerce_float(imputer.statistics_[index])
for index, feature_name in enumerate(selected_features)
},
scaler_means={
feature_name: float(scaler.mean_[index])
for index, feature_name in enumerate(selected_features)
},
scaler_scales={
feature_name: float(scaler.scale_[index] or 1.0)
for index, feature_name in enumerate(selected_features)
},
missing_value_counts=missing_value_counts,
skipped_reasons=skipped_reasons,
)
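
# Preprocessing sketch: each selected feature column is median-imputed, then
# standardised to zero mean and unit variance. A single column [1.0, None, 3.0]
# imputes the gap to the median 2.0 and scales to roughly
# [-1.2247, 0.0, 1.2247], since StandardScaler divides by the population
# standard deviation (~0.8165 here).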
def choose_cluster_count(
*,
scaled_matrix: list[list[float]],
explicit_k: int | None,
max_k: int,
random_state: int,
) -> tuple[int, list[dict[str, float]]]:
sample_count = len(scaled_matrix)
if sample_count == 0:
        raise DataDrivenSubdivisionError("No samples are available for clustering.")
if sample_count == 1:
return 1, [{"k": 1, "sse": 0.0}]
if explicit_k is not None:
if explicit_k <= 0:
            raise DataDrivenSubdivisionError("cluster_count must be greater than zero.")
return min(explicit_k, sample_count), []
try:
from sklearn.cluster import KMeans
except ImportError as exc: # pragma: no cover
        raise DataDrivenSubdivisionError("scikit-learn is required to choose the cluster count.") from exc
max_allowed_k = min(max_k, sample_count)
inertia_curve = []
for k in range(1, max_allowed_k + 1):
model = KMeans(n_clusters=k, n_init=10, random_state=random_state)
model.fit(scaled_matrix)
inertia_curve.append({"k": k, "sse": round(float(model.inertia_), 6)})
return detect_elbow_point(inertia_curve), inertia_curve
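
# Illustrative inertia curve (values invented): given
#     [{"k": 1, "sse": 120.0}, {"k": 2, "sse": 40.0},
#      {"k": 3, "sse": 30.0}, {"k": 4, "sse": 27.0}]
# detect_elbow_point -- defined in block_subdivision, so the exact heuristic
# lives there -- should favour k=2, where the SSE drop flattens out.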
def run_kmeans_labels(
*,
scaled_matrix: list[list[float]],
cluster_count: int,
random_state: int,
) -> list[int]:
if cluster_count <= 0:
        raise DataDrivenSubdivisionError("cluster_count must be greater than zero.")
if len(scaled_matrix) == 1:
return [0]
try:
from sklearn.cluster import KMeans
except ImportError as exc: # pragma: no cover
        raise DataDrivenSubdivisionError("scikit-learn is required to run KMeans.") from exc
model = KMeans(n_clusters=cluster_count, n_init=10, random_state=random_state)
return [int(label) for label in model.fit_predict(scaled_matrix)]
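
# With random_state and n_init fixed, fit_predict is deterministic for a given
# input matrix and scikit-learn version, so persisted cluster assignments are
# reproducible across reruns.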
def build_cluster_summaries(
*,
observations: list[AnalysisGridObservation],
labels: list[int],
) -> list[dict[str, Any]]:
clusters: dict[int, dict[str, Any]] = {}
for observation, label in zip(observations, labels):
cluster = clusters.setdefault(
int(label),
{
"cluster_label": int(label),
"cell_codes": [],
"centroid_lat_sum": 0.0,
"centroid_lon_sum": 0.0,
"cell_count": 0,
},
)
cluster["cell_codes"].append(observation.cell.cell_code)
cluster["centroid_lat_sum"] += float(observation.cell.centroid_lat)
cluster["centroid_lon_sum"] += float(observation.cell.centroid_lon)
cluster["cell_count"] += 1
summaries = []
for cluster_label in sorted(clusters):
cluster = clusters[cluster_label]
cell_count = cluster["cell_count"] or 1
summaries.append(
{
"cluster_label": cluster_label,
"cell_count": cluster["cell_count"],
"centroid_lat": round(cluster["centroid_lat_sum"] / cell_count, 6),
"centroid_lon": round(cluster["centroid_lon_sum"] / cell_count, 6),
"cell_codes": cluster["cell_codes"],
}
)
return summaries
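
# Shape of one summary entry (values illustrative):
#     {
#         "cluster_label": 0,
#         "cell_count": 12,
#         "centroid_lat": 35.715298,  # mean latitude of member-cell centroids
#         "centroid_lon": 51.404343,
#         "cell_codes": ["cell-001", "cell-002"],
#     }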
def sync_location_block_layout_with_result(
*,
location: SoilLocation,
result: RemoteSensingSubdivisionResult,
cluster_summaries: list[dict[str, Any]],
) -> None:
layout = dict(location.block_layout or {})
blocks = list(layout.get("blocks") or [])
target_block = None
for block in blocks:
if block.get("block_code") == result.block_code:
target_block = block
break
if target_block is None:
target_block = {
"block_code": result.block_code,
"order": len(blocks) + 1,
"source": "remote_sensing",
"needs_subdivision": None,
"sub_blocks": [],
}
blocks.append(target_block)
target_block["needs_subdivision"] = result.cluster_count > 1
target_block["sub_blocks"] = [
{
"sub_block_code": f"cluster-{cluster['cluster_label']}",
"cluster_label": cluster["cluster_label"],
"centroid_lat": cluster["centroid_lat"],
"centroid_lon": cluster["centroid_lon"],
"cell_count": cluster["cell_count"],
}
for cluster in cluster_summaries
]
target_block["subdivision_summary"] = {
"type": "data_driven_remote_sensing",
"cluster_count": result.cluster_count,
"selected_features": result.selected_features,
"used_cell_count": result.metadata.get("used_cell_count", 0),
"skipped_cell_count": result.metadata.get("skipped_cell_count", 0),
"run_id": result.run_id,
}
layout["blocks"] = blocks
layout["algorithm_status"] = "completed"
location.block_layout = layout
location.save(update_fields=["block_layout", "updated_at"])
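
# After syncing, location.block_layout looks roughly like (values illustrative):
#     {
#         "blocks": [
#             {
#                 "block_code": "B1",
#                 "source": "remote_sensing",
#                 "needs_subdivision": True,
#                 "sub_blocks": [{"sub_block_code": "cluster-0", ...}],
#                 "subdivision_summary": {"type": "data_driven_remote_sensing", ...},
#             },
#         ],
#         "algorithm_status": "completed",
#     }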
def sync_block_subdivision_with_result(
*,
block_subdivision: BlockSubdivision,
result: RemoteSensingSubdivisionResult,
observations: list[AnalysisGridObservation],
cluster_summaries: list[dict[str, Any]],
) -> None:
metadata = dict(block_subdivision.metadata or {})
metadata["data_driven_subdivision"] = {
"run_id": result.run_id,
"result_id": result.id,
"cluster_count": result.cluster_count,
"selected_features": result.selected_features,
"used_cell_count": result.metadata.get("used_cell_count", 0),
"skipped_cell_count": result.metadata.get("skipped_cell_count", 0),
"temporal_extent": {
"start_date": result.temporal_start.isoformat() if result.temporal_start else None,
"end_date": result.temporal_end.isoformat() if result.temporal_end else None,
},
"inertia_curve": result.metadata.get("inertia_curve", []),
"diagnostic_artifacts": result.metadata.get("diagnostic_artifacts", {}),
}
block_subdivision.grid_points = [
{
"cell_code": observation.cell.cell_code,
"centroid_lat": round(float(observation.cell.centroid_lat), 6),
"centroid_lon": round(float(observation.cell.centroid_lon), 6),
}
for observation in observations
]
block_subdivision.centroid_points = [
{
"sub_block_code": f"cluster-{cluster['cluster_label']}",
"cluster_label": cluster["cluster_label"],
"centroid_lat": cluster["centroid_lat"],
"centroid_lon": cluster["centroid_lon"],
"cell_count": cluster["cell_count"],
"cell_codes": cluster["cell_codes"],
}
for cluster in cluster_summaries
]
block_subdivision.grid_point_count = len(observations)
block_subdivision.centroid_count = len(cluster_summaries)
block_subdivision.status = "subdivided"
block_subdivision.metadata = metadata
plot_content = render_elbow_plot(
inertia_curve=result.metadata.get("inertia_curve", []),
optimal_k=result.cluster_count,
block_code=result.block_code or block_subdivision.block_code,
)
if plot_content is not None:
block_subdivision.elbow_plot.save(
f"remote-sensing-{result.soil_location_id}-{result.block_code or block_subdivision.block_code}-elbow.png",
plot_content,
save=False,
)
block_subdivision.save(
update_fields=[
"grid_points",
"centroid_points",
"grid_point_count",
"centroid_count",
"status",
"metadata",
"elbow_plot",
"updated_at",
]
)
return
block_subdivision.save(
update_fields=[
"grid_points",
"centroid_points",
"grid_point_count",
"centroid_count",
"status",
"metadata",
"updated_at",
]
)
def _coerce_float(value: Any) -> float | None:
if value is None:
return None
try:
return float(value)
except (TypeError, ValueError):
return None
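
# Examples: _coerce_float("0.42") -> 0.42, _coerce_float(Decimal("1.5")) -> 1.5,
# _coerce_float("n/a") -> None, _coerce_float(None) -> None.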
def _count_non_null_features(observations: list[AnalysisGridObservation]) -> dict[str, int]:
counts = {feature_name: 0 for feature_name in DEFAULT_CLUSTER_FEATURES}
for observation in observations:
for feature_name in DEFAULT_CLUSTER_FEATURES:
if _coerce_float(getattr(observation, feature_name, None)) is not None:
counts[feature_name] += 1
return counts
def _persist_remote_sensing_diagnostic_artifacts(
*,
result: RemoteSensingSubdivisionResult,
observations: list[AnalysisGridObservation],
labels: list[int],
cluster_summaries: list[dict[str, Any]],
selected_features: list[str],
scaled_matrix: list[list[float]],
inertia_curve: list[dict[str, float]],
) -> dict[str, Any]:
try:
artifact_dir = _build_remote_sensing_diagnostic_dir(result=result)
artifact_dir.mkdir(parents=True, exist_ok=True)
specs = [
(
"elbow_plot",
render_elbow_plot(
inertia_curve=inertia_curve,
optimal_k=result.cluster_count,
block_code=result.block_code or "farm",
),
"elbow",
),
(
"cluster_map",
_render_cluster_map_plot(
observations=observations,
labels=labels,
block_code=result.block_code or "farm",
),
"cluster-map",
),
(
"cluster_sizes",
_render_cluster_size_plot(
cluster_summaries=cluster_summaries,
block_code=result.block_code or "farm",
),
"cluster-sizes",
),
(
"feature_pairs",
_render_feature_pair_plot(
selected_features=selected_features,
scaled_matrix=scaled_matrix,
labels=labels,
block_code=result.block_code or "farm",
),
"feature-pairs",
),
]
files: dict[str, str] = {}
for artifact_key, content, suffix in specs:
if content is None:
continue
target_path = artifact_dir / f"{_build_remote_sensing_artifact_stem(result=result)}__{suffix}.png"
_write_content_file(target_path=target_path, content=content)
files[artifact_key] = _to_project_relative_path(target_path)
return {
"directory": _to_project_relative_path(artifact_dir),
"files": files,
}
except (DataDrivenSubdivisionError, OSError) as exc:
logger.warning(
"Failed to persist remote sensing diagnostic artifacts for result_id=%s: %s",
result.id,
exc,
)
return {}
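
# On success the returned mapping looks like (paths illustrative):
#     {
#         "directory": "artifacts/remote_sensing_charts/location-3/run-9-B1",
#         "files": {
#             "elbow_plot": ".../location-3__run-9__B1__elbow.png",
#             "cluster_map": ".../location-3__run-9__B1__cluster-map.png",
#             "cluster_sizes": ".../location-3__run-9__B1__cluster-sizes.png",
#             "feature_pairs": ".../location-3__run-9__B1__feature-pairs.png",
#         },
#     }
# An empty dict means persistence failed and a warning was logged.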
def _build_remote_sensing_diagnostic_dir(*, result: RemoteSensingSubdivisionResult) -> Path:
configured_dir = str(
os.environ.get("REMOTE_SENSING_DIAGNOSTIC_DIR", DEFAULT_REMOTE_SENSING_DIAGNOSTIC_DIR)
).strip()
base_dir = Path(getattr(settings, "BASE_DIR", Path.cwd()))
target_dir = Path(configured_dir)
if not target_dir.is_absolute():
target_dir = base_dir / target_dir
block_component = _sanitize_path_component(result.block_code or "farm")
return target_dir / f"location-{result.soil_location_id}" / f"run-{result.run_id}-{block_component}"
def _build_remote_sensing_artifact_stem(*, result: RemoteSensingSubdivisionResult) -> str:
return (
f"location-{result.soil_location_id}"
f"__run-{result.run_id}"
f"__{_sanitize_path_component(result.block_code or 'farm')}"
)
def _write_content_file(*, target_path: Path, content: ContentFile) -> None:
target_path.parent.mkdir(parents=True, exist_ok=True)
content.open("rb")
try:
target_path.write_bytes(content.read())
finally:
content.close()
def _to_project_relative_path(path: Path) -> str:
base_dir = Path(getattr(settings, "BASE_DIR", Path.cwd()))
try:
return str(path.relative_to(base_dir))
except ValueError:
return str(path)
def _sanitize_path_component(value: str) -> str:
text = str(value or "").strip() or "unknown"
sanitized = "".join(character if character.isalnum() or character in {"-", "_", "."} else "_" for character in text)
return sanitized or "unknown"
def _render_cluster_map_plot(
*,
observations: list[AnalysisGridObservation],
labels: list[int],
block_code: str,
) -> ContentFile | None:
if not observations:
return None
plt = _import_matplotlib_pyplot()
unique_labels = sorted(set(int(label) for label in labels))
colors = plt.cm.get_cmap("tab10", max(len(unique_labels), 1))
fig, ax = plt.subplots(figsize=(8, 6))
buffer = BytesIO()
try:
for color_index, cluster_label in enumerate(unique_labels):
cluster_points = [
(float(observation.cell.centroid_lon), float(observation.cell.centroid_lat))
for observation, label in zip(observations, labels)
if int(label) == cluster_label
]
if not cluster_points:
continue
xs = [point[0] for point in cluster_points]
ys = [point[1] for point in cluster_points]
ax.scatter(
xs,
ys,
s=70,
alpha=0.9,
color=colors(color_index),
edgecolors="white",
linewidths=0.8,
label=f"Cluster {cluster_label}",
)
ax.set_title(f"KMeans Spatial Cluster Map - {block_code}")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.grid(True, linestyle="--", linewidth=0.5, alpha=0.4)
if unique_labels:
ax.legend()
fig.tight_layout()
fig.savefig(buffer, format="png", dpi=150)
buffer.seek(0)
return ContentFile(buffer.getvalue())
finally:
buffer.close()
plt.close(fig)
def _render_cluster_size_plot(
*,
cluster_summaries: list[dict[str, Any]],
block_code: str,
) -> ContentFile | None:
if not cluster_summaries:
return None
plt = _import_matplotlib_pyplot()
labels = [f"C{int(cluster['cluster_label'])}" for cluster in cluster_summaries]
counts = [int(cluster["cell_count"]) for cluster in cluster_summaries]
fig, ax = plt.subplots(figsize=(8, 5))
buffer = BytesIO()
try:
bars = ax.bar(labels, counts, color="#2f6fed", alpha=0.85)
for bar, count in zip(bars, counts):
ax.text(
bar.get_x() + bar.get_width() / 2.0,
bar.get_height(),
str(count),
ha="center",
va="bottom",
fontsize=9,
)
ax.set_title(f"Cluster Sizes - {block_code}")
ax.set_xlabel("Cluster")
ax.set_ylabel("Cell Count")
ax.grid(True, axis="y", linestyle="--", linewidth=0.5, alpha=0.4)
fig.tight_layout()
fig.savefig(buffer, format="png", dpi=150)
buffer.seek(0)
return ContentFile(buffer.getvalue())
finally:
buffer.close()
plt.close(fig)
def _render_feature_pair_plot(
*,
selected_features: list[str],
scaled_matrix: list[list[float]],
labels: list[int],
block_code: str,
) -> ContentFile | None:
if not scaled_matrix or not selected_features:
return None
plt = _import_matplotlib_pyplot()
feature_count = len(selected_features)
    if feature_count == 1:
        pair_indexes = [(0, 0)]
    else:
        pair_indexes = [
            (left_index, right_index)
            for left_index in range(feature_count)
            for right_index in range(left_index + 1, feature_count)
        ]
subplot_count = len(pair_indexes)
columns = 2 if subplot_count > 1 else 1
rows = math.ceil(subplot_count / columns)
fig, axes = plt.subplots(rows, columns, figsize=(7 * columns, 5 * rows))
axes_list = axes.flatten().tolist() if hasattr(axes, "flatten") else [axes]
unique_labels = sorted(set(int(label) for label in labels))
colors = plt.cm.get_cmap("tab10", max(len(unique_labels), 1))
buffer = BytesIO()
try:
for axis, (left_index, right_index) in zip(axes_list, pair_indexes):
if feature_count == 1:
xs = list(range(1, len(scaled_matrix) + 1))
ys = [row[0] for row in scaled_matrix]
for color_index, cluster_label in enumerate(unique_labels):
filtered = [
(x_value, y_value)
for x_value, y_value, label in zip(xs, ys, labels)
if int(label) == cluster_label
]
axis.scatter(
[item[0] for item in filtered],
[item[1] for item in filtered],
s=55,
color=colors(color_index),
alpha=0.85,
label=f"Cluster {cluster_label}",
)
axis.set_xlabel("Observation Index")
axis.set_ylabel(f"{selected_features[0]} (scaled)")
axis.set_title(f"{selected_features[0]} distribution")
else:
x_values = [row[left_index] for row in scaled_matrix]
y_values = [row[right_index] for row in scaled_matrix]
for color_index, cluster_label in enumerate(unique_labels):
filtered = [
(x_value, y_value)
for x_value, y_value, label in zip(x_values, y_values, labels)
if int(label) == cluster_label
]
axis.scatter(
[item[0] for item in filtered],
[item[1] for item in filtered],
s=55,
color=colors(color_index),
alpha=0.85,
label=f"Cluster {cluster_label}",
)
axis.set_xlabel(f"{selected_features[left_index]} (scaled)")
axis.set_ylabel(f"{selected_features[right_index]} (scaled)")
axis.set_title(
f"{selected_features[left_index]} vs {selected_features[right_index]}"
)
axis.grid(True, linestyle="--", linewidth=0.5, alpha=0.4)
for axis in axes_list[subplot_count:]:
axis.remove()
if unique_labels and axes_list:
axes_list[0].legend()
fig.suptitle(f"KMeans Feature Diagnostics - {block_code}", fontsize=14)
fig.tight_layout(rect=(0, 0, 1, 0.97))
fig.savefig(buffer, format="png", dpi=150)
buffer.seek(0)
return ContentFile(buffer.getvalue())
finally:
buffer.close()
plt.close(fig)
def _import_matplotlib_pyplot():
try:
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
except ImportError as exc: # pragma: no cover - runtime dependency guard
        raise DataDrivenSubdivisionError("matplotlib is required to save the KMeans diagnostic charts.") from exc
return plt
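
# The Agg backend is selected before pyplot is imported so figures render
# off-screen, which keeps chart generation working on headless servers.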
def _build_clustering_log_context(
*,
observations: list[AnalysisGridObservation],
selected_features: list[str],
run: RemoteSensingRun | None,
location: SoilLocation | None,
) -> dict[str, Any]:
first_observation = observations[0] if observations else None
observation_metadata = dict(getattr(first_observation, "metadata", {}) or {})
resolved_run = run or getattr(first_observation, "run", None)
resolved_location = location or getattr(getattr(first_observation, "cell", None), "soil_location", None)
temporal_start = getattr(resolved_run, "temporal_start", None) or getattr(first_observation, "temporal_start", None)
temporal_end = getattr(resolved_run, "temporal_end", None) or getattr(first_observation, "temporal_end", None)
return {
"run_id": getattr(resolved_run, "id", None),
"job_ref": observation_metadata.get("job_refs", {}),
"region_id": getattr(resolved_location, "id", None),
"date_range": {
"temporal_start": temporal_start.isoformat() if hasattr(temporal_start, "isoformat") else temporal_start,
"temporal_end": temporal_end.isoformat() if hasattr(temporal_end, "isoformat") else temporal_end,
},
"selected_features": selected_features,
}
def _serialize_log_payload(payload: dict[str, Any]) -> str:
return json.dumps(payload, ensure_ascii=True, default=str, sort_keys=True)
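
# default=str keeps non-JSON values (dates, Decimals, model instances) from
# raising, and sort_keys=True gives stable output, e.g.:
#     _serialize_log_payload({"b": date(2026, 5, 9), "a": 1})
#     -> '{"a": 1, "b": "2026-05-09"}'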