2026-05-10 22:49:07 +03:30
parent 2d1f7da89e
commit 2a6321a263
15 changed files with 2667 additions and 162 deletions
+106 -16
@@ -1,6 +1,8 @@
from __future__ import annotations
from dataclasses import dataclass
import json
import logging
from typing import Any
from django.db import transaction
@@ -21,18 +23,22 @@ DEFAULT_CLUSTER_FEATURES = [
"ndwi",
"lst_c",
"soil_vv_db",
"dem_m",
"slope_deg",
]
SUPPORTED_CLUSTER_FEATURES = tuple(DEFAULT_CLUSTER_FEATURES)
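# Assumed semantics: the fixed seed keeps clustering runs reproducible, and
# DEFAULT_MAX_K caps how many candidate cluster counts are tried (the search
# itself sits outside this hunk).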
DEFAULT_RANDOM_STATE = 42
DEFAULT_MAX_K = 10
logger = logging.getLogger(__name__)
class DataDrivenSubdivisionError(Exception):
    """Raised when remote-sensing-driven subdivision cannot be computed."""


class EmptyObservationDatasetError(DataDrivenSubdivisionError):
    """Raised when upstream persistence completes without usable clustering features."""


@dataclass
class ClusteringDataset:
    observations: list[AnalysisGridObservation]
@@ -70,6 +76,8 @@ def create_remote_sensing_subdivision_result(
    dataset = build_clustering_dataset(
        observations=observations,
        selected_features=selected_features,
        run=run,
        location=location,
    )
    if not dataset.observations:
        raise DataDrivenSubdivisionError("No usable observations remained for clustering.")
@@ -164,6 +172,8 @@ def build_clustering_dataset(
    *,
    observations: list[AnalysisGridObservation],
    selected_features: list[str] | None = None,
    run: RemoteSensingRun | None = None,
    location: SoilLocation | None = None,
) -> ClusteringDataset:
    selected_features = list(selected_features or DEFAULT_CLUSTER_FEATURES)
    invalid_features = [
@@ -176,6 +186,22 @@ def build_clustering_dataset(
"ویژگی‌های نامعتبر برای خوشه‌بندی: "
+ ", ".join(sorted(invalid_features))
)
    log_context = _build_clustering_log_context(
        observations=observations,
        selected_features=selected_features,
        run=run,
        location=location,
    )
    logger.info(
        "Preparing clustering dataset: %s",
        _serialize_log_payload(
            {
                **log_context,
                "total_observations": len(observations),
                "non_null_counts": _count_non_null_features(observations),
            }
        ),
    )
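    # Build the raw feature matrix observation by observation, recording which
    # cells are skipped and why, so the log payloads below can explain any loss.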
    raw_rows: list[list[float | None]] = []
    raw_maps: list[dict[str, float | None]] = []
    usable_observations: list[AnalysisGridObservation] = []
@@ -193,6 +219,11 @@ def build_clustering_dataset(
            if value is None:
                missing_value_counts[feature_name] += 1
        if all(value is None for value in feature_map.values()):
            logger.debug(
                "Skipping observation cell=%s: all clustering features are null | context=%s",
                observation.cell.cell_code,
                _serialize_log_payload(log_context),
            )
            skipped_cell_codes.append(observation.cell.cell_code)
            skipped_reasons["all_features_missing"].append(observation.cell.cell_code)
            continue
@@ -201,21 +232,42 @@ def build_clustering_dataset(
        raw_maps.append(feature_map)
        raw_rows.append([feature_map[feature_name] for feature_name in selected_features])
    logger.info(
        "Clustering dataset filtered observations: %s",
        _serialize_log_payload(
            {
                **log_context,
                "remaining_observations": len(usable_observations),
                "removed_observations": len(observations) - len(usable_observations),
            }
        ),
    )
    zero_usable_feature_names = [
        feature_name
        for feature_name, missing_count in missing_value_counts.items()
        if missing_count == len(observations)
    ]
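    # Warn per feature only while at least one selected feature still has data;
    # if every feature is fully null, no rows survive and the error path below
    # raises EmptyObservationDatasetError instead.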
    if zero_usable_feature_names and len(zero_usable_feature_names) < len(selected_features):
        for feature_name in zero_usable_feature_names:
            logger.warning(
                "Feature %s has zero usable values in dataset | context=%s",
                feature_name,
                _serialize_log_payload(log_context),
            )
    if not usable_observations:
        error_context = {
            **log_context,
            "total_observations": len(observations),
            "removed_observations": len(observations),
            "null_counts_per_feature": missing_value_counts,
            "selected_features": selected_features,
        }
        logger.error(
            "No usable observations available for clustering: %s",
            _serialize_log_payload(error_context),
        )
        raise EmptyObservationDatasetError(
            "Upstream processing completed but no usable feature values were persisted."
        )
    try:
@@ -487,3 +539,41 @@ def _coerce_float(value: Any) -> float | None:
        return float(value)
    except (TypeError, ValueError):
        return None
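# Counts, per feature, how many observations carry a value that coerces to
# float, so the "Preparing clustering dataset" log can report non-null
# coverage up front.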
def _count_non_null_features(observations: list[AnalysisGridObservation]) -> dict[str, int]:
    counts = {feature_name: 0 for feature_name in DEFAULT_CLUSTER_FEATURES}
    for observation in observations:
        for feature_name in DEFAULT_CLUSTER_FEATURES:
            if _coerce_float(getattr(observation, feature_name, None)) is not None:
                counts[feature_name] += 1
    return counts
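# Prefers the explicitly passed run/location and falls back to whatever the
# first observation links to, so callers that lack either still get a
# labelled log context.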
def _build_clustering_log_context(
    *,
    observations: list[AnalysisGridObservation],
    selected_features: list[str],
    run: RemoteSensingRun | None,
    location: SoilLocation | None,
) -> dict[str, Any]:
    first_observation = observations[0] if observations else None
    observation_metadata = dict(getattr(first_observation, "metadata", {}) or {})
    resolved_run = run or getattr(first_observation, "run", None)
    resolved_location = location or getattr(getattr(first_observation, "cell", None), "soil_location", None)
    temporal_start = getattr(resolved_run, "temporal_start", None) or getattr(first_observation, "temporal_start", None)
    temporal_end = getattr(resolved_run, "temporal_end", None) or getattr(first_observation, "temporal_end", None)
    return {
        "run_id": getattr(resolved_run, "id", None),
        "job_ref": observation_metadata.get("job_refs", {}),
        "region_id": getattr(resolved_location, "id", None),
        "date_range": {
            "temporal_start": temporal_start.isoformat() if hasattr(temporal_start, "isoformat") else temporal_start,
            "temporal_end": temporal_end.isoformat() if hasattr(temporal_end, "isoformat") else temporal_end,
        },
        "selected_features": selected_features,
    }
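# sort_keys keeps successive log payloads diff-friendly, default=str guards
# against values json cannot encode (dates, Decimals, model instances), and
# ensure_ascii keeps the output safe for ASCII-only log pipelines.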
def _serialize_log_payload(payload: dict[str, Any]) -> str:
    return json.dumps(payload, ensure_ascii=True, default=str, sort_keys=True)