2026-05-10 22:49:07 +03:30
parent 2d1f7da89e
commit 2a6321a263
15 changed files with 2667 additions and 162 deletions
+106 -16
@@ -1,6 +1,8 @@
from __future__ import annotations
from dataclasses import dataclass
import json
import logging
from typing import Any
from django.db import transaction
@@ -21,18 +23,22 @@ DEFAULT_CLUSTER_FEATURES = [
"ndwi",
"lst_c",
"soil_vv_db",
"dem_m",
"slope_deg",
]
SUPPORTED_CLUSTER_FEATURES = tuple(DEFAULT_CLUSTER_FEATURES)
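# Assumed semantics: the fixed seed keeps clustering runs reproducible, and
# DEFAULT_MAX_K caps how many candidate cluster counts are tried (the search
# itself sits outside this hunk).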
DEFAULT_RANDOM_STATE = 42
DEFAULT_MAX_K = 10
logger = logging.getLogger(__name__)
class DataDrivenSubdivisionError(Exception):
    """Raised when remote-sensing-driven subdivision cannot be computed."""


class EmptyObservationDatasetError(DataDrivenSubdivisionError):
    """Raised when upstream persistence completes without usable clustering features."""


@dataclass
class ClusteringDataset:
    observations: list[AnalysisGridObservation]
@@ -70,6 +76,8 @@ def create_remote_sensing_subdivision_result(
    dataset = build_clustering_dataset(
        observations=observations,
        selected_features=selected_features,
        run=run,
        location=location,
    )
    if not dataset.observations:
        raise DataDrivenSubdivisionError("No usable observations remained for clustering.")
@@ -164,6 +172,8 @@ def build_clustering_dataset(
    *,
    observations: list[AnalysisGridObservation],
    selected_features: list[str] | None = None,
    run: RemoteSensingRun | None = None,
    location: SoilLocation | None = None,
) -> ClusteringDataset:
    selected_features = list(selected_features or DEFAULT_CLUSTER_FEATURES)
    invalid_features = [
@@ -176,6 +186,22 @@ def build_clustering_dataset(
"ویژگی‌های نامعتبر برای خوشه‌بندی: "
+ ", ".join(sorted(invalid_features))
)
    log_context = _build_clustering_log_context(
        observations=observations,
        selected_features=selected_features,
        run=run,
        location=location,
    )
    logger.info(
        "Preparing clustering dataset: %s",
        _serialize_log_payload(
            {
                **log_context,
                "total_observations": len(observations),
                "non_null_counts": _count_non_null_features(observations),
            }
        ),
    )
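    # Build the raw feature matrix observation by observation, recording which
    # cells are skipped and why, so the log payloads below can explain any loss.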
    raw_rows: list[list[float | None]] = []
    raw_maps: list[dict[str, float | None]] = []
    usable_observations: list[AnalysisGridObservation] = []
@@ -193,6 +219,11 @@ def build_clustering_dataset(
            if value is None:
                missing_value_counts[feature_name] += 1
        if all(value is None for value in feature_map.values()):
            logger.debug(
                "Skipping observation cell=%s: all clustering features are null | context=%s",
                observation.cell.cell_code,
                _serialize_log_payload(log_context),
            )
            skipped_cell_codes.append(observation.cell.cell_code)
            skipped_reasons["all_features_missing"].append(observation.cell.cell_code)
            continue
@@ -201,21 +232,42 @@ def build_clustering_dataset(
        raw_maps.append(feature_map)
        raw_rows.append([feature_map[feature_name] for feature_name in selected_features])
    logger.info(
        "Clustering dataset filtered observations: %s",
        _serialize_log_payload(
            {
                **log_context,
                "remaining_observations": len(usable_observations),
                "removed_observations": len(observations) - len(usable_observations),
            }
        ),
    )
    zero_usable_feature_names = [
        feature_name
        for feature_name, missing_count in missing_value_counts.items()
        if missing_count == len(observations)
    ]
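    # Warn per feature only while at least one selected feature still has data;
    # if every feature is fully null, no rows survive and the error path below
    # raises EmptyObservationDatasetError instead.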
    if zero_usable_feature_names and len(zero_usable_feature_names) < len(selected_features):
        for feature_name in zero_usable_feature_names:
            logger.warning(
                "Feature %s has zero usable values in dataset | context=%s",
                feature_name,
                _serialize_log_payload(log_context),
            )
    if not usable_observations:
        error_context = {
            **log_context,
            "total_observations": len(observations),
            "removed_observations": len(observations),
            "null_counts_per_feature": missing_value_counts,
            "selected_features": selected_features,
        }
        logger.error(
            "No usable observations available for clustering: %s",
            _serialize_log_payload(error_context),
        )
        raise EmptyObservationDatasetError(
            "Upstream processing completed but no usable feature values were persisted."
        )
    try:
@@ -487,3 +539,41 @@ def _coerce_float(value: Any) -> float | None:
        return float(value)
    except (TypeError, ValueError):
        return None
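# Counts, per feature, how many observations carry a value that coerces to
# float, so the "Preparing clustering dataset" log can report non-null
# coverage up front.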
def _count_non_null_features(observations: list[AnalysisGridObservation]) -> dict[str, int]:
    counts = {feature_name: 0 for feature_name in DEFAULT_CLUSTER_FEATURES}
    for observation in observations:
        for feature_name in DEFAULT_CLUSTER_FEATURES:
            if _coerce_float(getattr(observation, feature_name, None)) is not None:
                counts[feature_name] += 1
    return counts
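# Prefers the explicitly passed run/location and falls back to whatever the
# first observation links to, so callers that lack either still get a
# labelled log context.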
def _build_clustering_log_context(
    *,
    observations: list[AnalysisGridObservation],
    selected_features: list[str],
    run: RemoteSensingRun | None,
    location: SoilLocation | None,
) -> dict[str, Any]:
    first_observation = observations[0] if observations else None
    observation_metadata = dict(getattr(first_observation, "metadata", {}) or {})
    resolved_run = run or getattr(first_observation, "run", None)
    resolved_location = location or getattr(getattr(first_observation, "cell", None), "soil_location", None)
    temporal_start = getattr(resolved_run, "temporal_start", None) or getattr(first_observation, "temporal_start", None)
    temporal_end = getattr(resolved_run, "temporal_end", None) or getattr(first_observation, "temporal_end", None)
    return {
        "run_id": getattr(resolved_run, "id", None),
        "job_ref": observation_metadata.get("job_refs", {}),
        "region_id": getattr(resolved_location, "id", None),
        "date_range": {
            "temporal_start": temporal_start.isoformat() if hasattr(temporal_start, "isoformat") else temporal_start,
            "temporal_end": temporal_end.isoformat() if hasattr(temporal_end, "isoformat") else temporal_end,
        },
        "selected_features": selected_features,
    }
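# sort_keys keeps successive log payloads diff-friendly, default=str guards
# against values json cannot encode (dates, Decimals, model instances), and
# ensure_ascii keeps the output safe for ASCII-only log pipelines.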
def _serialize_log_payload(payload: dict[str, Any]) -> str:
    return json.dumps(payload, ensure_ascii=True, default=str, sort_keys=True)