@@ -1,6 +1,8 @@
from __future__ import annotations

from dataclasses import dataclass
import json
import logging
from typing import Any

from django.db import transaction
@@ -21,18 +23,22 @@ DEFAULT_CLUSTER_FEATURES = [
    "ndwi",
    "lst_c",
    "soil_vv_db",
    "dem_m",
    "slope_deg",
]
SUPPORTED_CLUSTER_FEATURES = tuple(DEFAULT_CLUSTER_FEATURES)
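# Presumably consumed by the clustering model search elsewhere in this module:
# a fixed seed for reproducibility and an upper bound on candidate k.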
DEFAULT_RANDOM_STATE = 42
DEFAULT_MAX_K = 10

logger = logging.getLogger(__name__)


class DataDrivenSubdivisionError(Exception):
    """Raised when remote-sensing-driven subdivision cannot be computed."""


class EmptyObservationDatasetError(DataDrivenSubdivisionError):
    """Raised when upstream persistence completes without usable clustering features."""


@dataclass
class ClusteringDataset:
    observations: list[AnalysisGridObservation]
@@ -70,6 +76,8 @@ def create_remote_sensing_subdivision_result(
    dataset = build_clustering_dataset(
        observations=observations,
        selected_features=selected_features,
        run=run,
        location=location,
    )
    if not dataset.observations:
        raise DataDrivenSubdivisionError("No usable observations remained for clustering.")
@@ -164,6 +172,8 @@ def build_clustering_dataset(
    *,
    observations: list[AnalysisGridObservation],
    selected_features: list[str] | None = None,
    run: RemoteSensingRun | None = None,
    location: SoilLocation | None = None,
) -> ClusteringDataset:
    selected_features = list(selected_features or DEFAULT_CLUSTER_FEATURES)
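    # Fall back to the defaults and copy, so later mutation of the feature list
    # cannot leak back to the caller.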
    invalid_features = [
@@ -176,6 +186,22 @@ def build_clustering_dataset(
            "Invalid features for clustering: "
            + ", ".join(sorted(invalid_features))
        )
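    # The same structured context is attached to every log line below, so one
    # clustering run can be traced across INFO/DEBUG/WARNING/ERROR entries.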
    log_context = _build_clustering_log_context(
        observations=observations,
        selected_features=selected_features,
        run=run,
        location=location,
    )
    logger.info(
        "Preparing clustering dataset: %s",
        _serialize_log_payload(
            {
                **log_context,
                "total_observations": len(observations),
                "non_null_counts": _count_non_null_features(observations),
            }
        ),
    )
    raw_rows: list[list[float | None]] = []
    raw_maps: list[dict[str, float | None]] = []
    usable_observations: list[AnalysisGridObservation] = []
@@ -193,6 +219,11 @@ def build_clustering_dataset(
            if value is None:
                missing_value_counts[feature_name] += 1
        if all(value is None for value in feature_map.values()):
            logger.debug(
                "Skipping observation cell=%s: all clustering features are null | context=%s",
                observation.cell.cell_code,
                _serialize_log_payload(log_context),
            )
            skipped_cell_codes.append(observation.cell.cell_code)
            skipped_reasons["all_features_missing"].append(observation.cell.cell_code)
            continue
@@ -201,21 +232,42 @@ def build_clustering_dataset(
        raw_maps.append(feature_map)
        raw_rows.append([feature_map[feature_name] for feature_name in selected_features])

    logger.info(
        "Clustering dataset filtered observations: %s",
        _serialize_log_payload(
            {
                **log_context,
                "remaining_observations": len(usable_observations),
                "removed_observations": len(observations) - len(usable_observations),
            }
        )
    )

    zero_usable_feature_names = [
        feature_name for feature_name, missing_count in missing_value_counts.items() if missing_count == len(observations)
    ]
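    # Warn per dead feature only while at least one feature still carries data;
    # if every feature is empty, no observation survives the filter above and
    # the empty-dataset error path below fires instead.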
    if zero_usable_feature_names and len(zero_usable_feature_names) < len(selected_features):
        for feature_name in zero_usable_feature_names:
            logger.warning(
                "Feature %s has zero usable values in dataset | context=%s",
                feature_name,
                _serialize_log_payload(log_context),
            )

    if not usable_observations:
-        return ClusteringDataset(
-            observations=[],
-            selected_features=selected_features,
-            raw_feature_rows=[],
-            raw_feature_maps=[],
-            skipped_cell_codes=skipped_cell_codes,
-            used_cell_codes=[],
-            imputed_matrix=[],
-            scaled_matrix=[],
-            imputer_statistics={feature_name: None for feature_name in selected_features},
-            scaler_means={feature_name: 0.0 for feature_name in selected_features},
-            scaler_scales={feature_name: 1.0 for feature_name in selected_features},
-            missing_value_counts=missing_value_counts,
-            skipped_reasons=skipped_reasons,
-        )
+        error_context = {
+            **log_context,
+            "total_observations": len(observations),
+            "removed_observations": len(observations),
+            "null_counts_per_feature": missing_value_counts,
+            "selected_features": selected_features,
+        }
+        logger.error(
+            "No usable observations available for clustering: %s",
+            _serialize_log_payload(error_context),
+        )
+        raise EmptyObservationDatasetError(
+            "Upstream processing completed but no usable feature values were persisted."
+        )

    try:
@@ -487,3 +539,41 @@ def _coerce_float(value: Any) -> float | None:
        return float(value)
    except (TypeError, ValueError):
        return None


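# Feeds the non_null_counts diagnostic in the "Preparing clustering dataset" log entry.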
def _count_non_null_features(observations: list[AnalysisGridObservation]) -> dict[str, int]:
    counts = {feature_name: 0 for feature_name in DEFAULT_CLUSTER_FEATURES}
    for observation in observations:
        for feature_name in DEFAULT_CLUSTER_FEATURES:
            if _coerce_float(getattr(observation, feature_name, None)) is not None:
                counts[feature_name] += 1
    return counts


def _build_clustering_log_context(
    *,
    observations: list[AnalysisGridObservation],
    selected_features: list[str],
    run: RemoteSensingRun | None,
    location: SoilLocation | None,
) -> dict[str, Any]:
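    # Prefer the explicit run/location; otherwise borrow them (and the time
    # window) from the first observation so the log context stays populated.
    # All lookups are getattr-based, so partially populated rows never raise.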
    first_observation = observations[0] if observations else None
    observation_metadata = dict(getattr(first_observation, "metadata", {}) or {})
    resolved_run = run or getattr(first_observation, "run", None)
    resolved_location = location or getattr(getattr(first_observation, "cell", None), "soil_location", None)
    temporal_start = getattr(resolved_run, "temporal_start", None) or getattr(first_observation, "temporal_start", None)
    temporal_end = getattr(resolved_run, "temporal_end", None) or getattr(first_observation, "temporal_end", None)
    return {
        "run_id": getattr(resolved_run, "id", None),
        "job_ref": observation_metadata.get("job_refs", {}),
        "region_id": getattr(resolved_location, "id", None),
        "date_range": {
            "temporal_start": temporal_start.isoformat() if hasattr(temporal_start, "isoformat") else temporal_start,
            "temporal_end": temporal_end.isoformat() if hasattr(temporal_end, "isoformat") else temporal_end,
        },
        "selected_features": selected_features,
    }


def _serialize_log_payload(payload: dict[str, Any]) -> str:
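    # default=str covers dates and Decimals; sort_keys keeps payloads stable
    # and diff-friendly across log lines.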
    return json.dumps(payload, ensure_ascii=True, default=str, sort_keys=True)