from __future__ import annotations from dataclasses import dataclass import json import logging from typing import Any from django.db import transaction from .block_subdivision import detect_elbow_point, render_elbow_plot from .models import ( AnalysisGridObservation, BlockSubdivision, RemoteSensingClusterAssignment, RemoteSensingRun, RemoteSensingSubdivisionResult, SoilLocation, ) DEFAULT_CLUSTER_FEATURES = [ "ndvi", "ndwi", "lst_c", "soil_vv_db", ] SUPPORTED_CLUSTER_FEATURES = tuple(DEFAULT_CLUSTER_FEATURES) DEFAULT_RANDOM_STATE = 42 DEFAULT_MAX_K = 10 logger = logging.getLogger(__name__) class DataDrivenSubdivisionError(Exception): """Raised when remote-sensing-driven subdivision can not be computed.""" class EmptyObservationDatasetError(DataDrivenSubdivisionError): """Raised when upstream persistence completes without usable clustering features.""" @dataclass class ClusteringDataset: observations: list[AnalysisGridObservation] selected_features: list[str] raw_feature_rows: list[list[float | None]] raw_feature_maps: list[dict[str, float | None]] skipped_cell_codes: list[str] used_cell_codes: list[str] imputed_matrix: list[list[float]] scaled_matrix: list[list[float]] imputer_statistics: dict[str, float | None] scaler_means: dict[str, float] scaler_scales: dict[str, float] missing_value_counts: dict[str, int] skipped_reasons: dict[str, list[str]] def create_remote_sensing_subdivision_result( *, location: SoilLocation, run: RemoteSensingRun, observations: list[AnalysisGridObservation], block_subdivision: BlockSubdivision | None = None, block_code: str = "", selected_features: list[str] | None = None, explicit_k: int | None = None, max_k: int = DEFAULT_MAX_K, random_state: int = DEFAULT_RANDOM_STATE, ) -> RemoteSensingSubdivisionResult: """ Build a data-driven subdivision result from stored remote sensing observations. KMeans is applied on actual per-cell feature vectors, not geometric points. """ dataset = build_clustering_dataset( observations=observations, selected_features=selected_features, run=run, location=location, ) if not dataset.observations: raise DataDrivenSubdivisionError("هیچ observation قابل استفاده‌ای برای خوشه‌بندی باقی نماند.") optimal_k, inertia_curve = choose_cluster_count( scaled_matrix=dataset.scaled_matrix, explicit_k=explicit_k, max_k=max_k, random_state=random_state, ) cluster_selection_strategy = "explicit_k" if explicit_k is not None else "elbow" labels = run_kmeans_labels( scaled_matrix=dataset.scaled_matrix, cluster_count=optimal_k, random_state=random_state, ) cluster_summaries = build_cluster_summaries( observations=dataset.observations, labels=labels, ) with transaction.atomic(): result, _created = RemoteSensingSubdivisionResult.objects.update_or_create( run=run, defaults={ "soil_location": location, "block_subdivision": block_subdivision, "block_code": block_code, "chunk_size_sqm": run.chunk_size_sqm, "temporal_start": run.temporal_start, "temporal_end": run.temporal_end, "cluster_count": optimal_k, "selected_features": dataset.selected_features, "skipped_cell_codes": dataset.skipped_cell_codes, "metadata": { "cell_count": len(observations), "used_cell_count": len(dataset.observations), "skipped_cell_count": len(dataset.skipped_cell_codes), "used_cell_codes": dataset.used_cell_codes, "skipped_reasons": dataset.skipped_reasons, "selected_features": dataset.selected_features, "imputer_strategy": "median", "imputer_statistics": dataset.imputer_statistics, "missing_value_counts": dataset.missing_value_counts, "scaler_means": dataset.scaler_means, "scaler_scales": dataset.scaler_scales, "kmeans_params": { "random_state": random_state, "explicit_k": explicit_k, "selected_k": optimal_k, "max_k": max_k, "n_init": 10, "selection_strategy": cluster_selection_strategy, }, "inertia_curve": inertia_curve, "cluster_summaries": cluster_summaries, }, }, ) result.assignments.all().delete() assignment_rows = [] for index, observation in enumerate(dataset.observations): assignment_rows.append( RemoteSensingClusterAssignment( result=result, cell=observation.cell, cluster_label=int(labels[index]), raw_feature_values=dataset.raw_feature_maps[index], scaled_feature_values={ feature_name: round(dataset.scaled_matrix[index][feature_index], 6) for feature_index, feature_name in enumerate(dataset.selected_features) }, ) ) RemoteSensingClusterAssignment.objects.bulk_create(assignment_rows) if block_subdivision is not None: sync_block_subdivision_with_result( block_subdivision=block_subdivision, result=result, observations=observations, cluster_summaries=cluster_summaries, ) sync_location_block_layout_with_result( location=location, result=result, cluster_summaries=cluster_summaries, ) return result def build_clustering_dataset( *, observations: list[AnalysisGridObservation], selected_features: list[str] | None = None, run: RemoteSensingRun | None = None, location: SoilLocation | None = None, ) -> ClusteringDataset: selected_features = list(selected_features or DEFAULT_CLUSTER_FEATURES) invalid_features = [ feature_name for feature_name in selected_features if feature_name not in SUPPORTED_CLUSTER_FEATURES ] if invalid_features: raise DataDrivenSubdivisionError( "ویژگی‌های نامعتبر برای خوشه‌بندی: " + ", ".join(sorted(invalid_features)) ) log_context = _build_clustering_log_context( observations=observations, selected_features=selected_features, run=run, location=location, ) logger.info( "Preparing clustering dataset: %s", _serialize_log_payload( { **log_context, "total_observations": len(observations), "non_null_counts": _count_non_null_features(observations), } ), ) raw_rows: list[list[float | None]] = [] raw_maps: list[dict[str, float | None]] = [] usable_observations: list[AnalysisGridObservation] = [] skipped_cell_codes: list[str] = [] used_cell_codes: list[str] = [] missing_value_counts = {feature_name: 0 for feature_name in selected_features} skipped_reasons = {"all_features_missing": []} for observation in observations: feature_map = { feature_name: _coerce_float(getattr(observation, feature_name, None)) for feature_name in selected_features } for feature_name, value in feature_map.items(): if value is None: missing_value_counts[feature_name] += 1 if all(value is None for value in feature_map.values()): logger.debug( "Skipping observation cell=%s: all clustering features are null | context=%s", observation.cell.cell_code, _serialize_log_payload(log_context), ) skipped_cell_codes.append(observation.cell.cell_code) skipped_reasons["all_features_missing"].append(observation.cell.cell_code) continue usable_observations.append(observation) used_cell_codes.append(observation.cell.cell_code) raw_maps.append(feature_map) raw_rows.append([feature_map[feature_name] for feature_name in selected_features]) logger.info( "Clustering dataset filtered observations: %s", _serialize_log_payload( { **log_context, "remaining_observations": len(usable_observations), "removed_observations": len(observations) - len(usable_observations), } ) ) zero_usable_feature_names = [ feature_name for feature_name, missing_count in missing_value_counts.items() if missing_count == len(observations) ] if zero_usable_feature_names and len(zero_usable_feature_names) < len(selected_features): for feature_name in zero_usable_feature_names: logger.warning( "Feature %s has zero usable values in dataset | context=%s", feature_name, _serialize_log_payload(log_context), ) if not usable_observations: error_context = { **log_context, "total_observations": len(observations), "removed_observations": len(observations), "null_counts_per_feature": missing_value_counts, "selected_features": selected_features, } logger.error( "No usable observations available for clustering: %s", _serialize_log_payload(error_context), ) raise EmptyObservationDatasetError( "Upstream processing completed but no usable feature values were persisted." ) try: import numpy as np from sklearn.impute import SimpleImputer from sklearn.preprocessing import StandardScaler except ImportError as exc: # pragma: no cover - runtime dependency guard raise DataDrivenSubdivisionError( "scikit-learn و numpy برای خوشه‌بندی داده‌محور لازم هستند." ) from exc raw_matrix = np.array(raw_rows, dtype=float) imputer = SimpleImputer(strategy="median") imputed_matrix = imputer.fit_transform(raw_matrix) scaler = StandardScaler() scaled_matrix = scaler.fit_transform(imputed_matrix) return ClusteringDataset( observations=usable_observations, selected_features=selected_features, raw_feature_rows=raw_rows, raw_feature_maps=raw_maps, skipped_cell_codes=skipped_cell_codes, used_cell_codes=used_cell_codes, imputed_matrix=imputed_matrix.tolist(), scaled_matrix=scaled_matrix.tolist(), imputer_statistics={ feature_name: _coerce_float(imputer.statistics_[index]) for index, feature_name in enumerate(selected_features) }, scaler_means={ feature_name: float(scaler.mean_[index]) for index, feature_name in enumerate(selected_features) }, scaler_scales={ feature_name: float(scaler.scale_[index] or 1.0) for index, feature_name in enumerate(selected_features) }, missing_value_counts=missing_value_counts, skipped_reasons=skipped_reasons, ) def choose_cluster_count( *, scaled_matrix: list[list[float]], explicit_k: int | None, max_k: int, random_state: int, ) -> tuple[int, list[dict[str, float]]]: sample_count = len(scaled_matrix) if sample_count == 0: raise DataDrivenSubdivisionError("هیچ نمونه‌ای برای خوشه‌بندی وجود ندارد.") if sample_count == 1: return 1, [{"k": 1, "sse": 0.0}] if explicit_k is not None: if explicit_k <= 0: raise DataDrivenSubdivisionError("cluster_count باید بزرگ‌تر از صفر باشد.") return min(explicit_k, sample_count), [] try: from sklearn.cluster import KMeans except ImportError as exc: # pragma: no cover raise DataDrivenSubdivisionError("scikit-learn برای انتخاب تعداد خوشه لازم است.") from exc max_allowed_k = min(max_k, sample_count) inertia_curve = [] for k in range(1, max_allowed_k + 1): model = KMeans(n_clusters=k, n_init=10, random_state=random_state) model.fit(scaled_matrix) inertia_curve.append({"k": k, "sse": round(float(model.inertia_), 6)}) return detect_elbow_point(inertia_curve), inertia_curve def run_kmeans_labels( *, scaled_matrix: list[list[float]], cluster_count: int, random_state: int, ) -> list[int]: if cluster_count <= 0: raise DataDrivenSubdivisionError("cluster_count باید بزرگ‌تر از صفر باشد.") if len(scaled_matrix) == 1: return [0] try: from sklearn.cluster import KMeans except ImportError as exc: # pragma: no cover raise DataDrivenSubdivisionError("scikit-learn برای اجرای KMeans لازم است.") from exc model = KMeans(n_clusters=cluster_count, n_init=10, random_state=random_state) return [int(label) for label in model.fit_predict(scaled_matrix)] def build_cluster_summaries( *, observations: list[AnalysisGridObservation], labels: list[int], ) -> list[dict[str, Any]]: clusters: dict[int, dict[str, Any]] = {} for observation, label in zip(observations, labels): cluster = clusters.setdefault( int(label), { "cluster_label": int(label), "cell_codes": [], "centroid_lat_sum": 0.0, "centroid_lon_sum": 0.0, "cell_count": 0, }, ) cluster["cell_codes"].append(observation.cell.cell_code) cluster["centroid_lat_sum"] += float(observation.cell.centroid_lat) cluster["centroid_lon_sum"] += float(observation.cell.centroid_lon) cluster["cell_count"] += 1 summaries = [] for cluster_label in sorted(clusters): cluster = clusters[cluster_label] cell_count = cluster["cell_count"] or 1 summaries.append( { "cluster_label": cluster_label, "cell_count": cluster["cell_count"], "centroid_lat": round(cluster["centroid_lat_sum"] / cell_count, 6), "centroid_lon": round(cluster["centroid_lon_sum"] / cell_count, 6), "cell_codes": cluster["cell_codes"], } ) return summaries def sync_location_block_layout_with_result( *, location: SoilLocation, result: RemoteSensingSubdivisionResult, cluster_summaries: list[dict[str, Any]], ) -> None: layout = dict(location.block_layout or {}) blocks = list(layout.get("blocks") or []) target_block = None for block in blocks: if block.get("block_code") == result.block_code: target_block = block break if target_block is None: target_block = { "block_code": result.block_code, "order": len(blocks) + 1, "source": "remote_sensing", "needs_subdivision": None, "sub_blocks": [], } blocks.append(target_block) target_block["needs_subdivision"] = result.cluster_count > 1 target_block["sub_blocks"] = [ { "sub_block_code": f"cluster-{cluster['cluster_label']}", "cluster_label": cluster["cluster_label"], "centroid_lat": cluster["centroid_lat"], "centroid_lon": cluster["centroid_lon"], "cell_count": cluster["cell_count"], } for cluster in cluster_summaries ] target_block["subdivision_summary"] = { "type": "data_driven_remote_sensing", "cluster_count": result.cluster_count, "selected_features": result.selected_features, "used_cell_count": result.metadata.get("used_cell_count", 0), "skipped_cell_count": result.metadata.get("skipped_cell_count", 0), "run_id": result.run_id, } layout["blocks"] = blocks layout["algorithm_status"] = "completed" location.block_layout = layout location.save(update_fields=["block_layout", "updated_at"]) def sync_block_subdivision_with_result( *, block_subdivision: BlockSubdivision, result: RemoteSensingSubdivisionResult, observations: list[AnalysisGridObservation], cluster_summaries: list[dict[str, Any]], ) -> None: metadata = dict(block_subdivision.metadata or {}) metadata["data_driven_subdivision"] = { "run_id": result.run_id, "result_id": result.id, "cluster_count": result.cluster_count, "selected_features": result.selected_features, "used_cell_count": result.metadata.get("used_cell_count", 0), "skipped_cell_count": result.metadata.get("skipped_cell_count", 0), "temporal_extent": { "start_date": result.temporal_start.isoformat() if result.temporal_start else None, "end_date": result.temporal_end.isoformat() if result.temporal_end else None, }, "inertia_curve": result.metadata.get("inertia_curve", []), } block_subdivision.grid_points = [ { "cell_code": observation.cell.cell_code, "centroid_lat": round(float(observation.cell.centroid_lat), 6), "centroid_lon": round(float(observation.cell.centroid_lon), 6), } for observation in observations ] block_subdivision.centroid_points = [ { "sub_block_code": f"cluster-{cluster['cluster_label']}", "cluster_label": cluster["cluster_label"], "centroid_lat": cluster["centroid_lat"], "centroid_lon": cluster["centroid_lon"], "cell_count": cluster["cell_count"], "cell_codes": cluster["cell_codes"], } for cluster in cluster_summaries ] block_subdivision.grid_point_count = len(observations) block_subdivision.centroid_count = len(cluster_summaries) block_subdivision.status = "subdivided" block_subdivision.metadata = metadata plot_content = render_elbow_plot( inertia_curve=result.metadata.get("inertia_curve", []), optimal_k=result.cluster_count, block_code=result.block_code or block_subdivision.block_code, ) if plot_content is not None: block_subdivision.elbow_plot.save( f"remote-sensing-{result.soil_location_id}-{result.block_code or block_subdivision.block_code}-elbow.png", plot_content, save=False, ) block_subdivision.save( update_fields=[ "grid_points", "centroid_points", "grid_point_count", "centroid_count", "status", "metadata", "elbow_plot", "updated_at", ] ) return block_subdivision.save( update_fields=[ "grid_points", "centroid_points", "grid_point_count", "centroid_count", "status", "metadata", "updated_at", ] ) def _coerce_float(value: Any) -> float | None: if value is None: return None try: return float(value) except (TypeError, ValueError): return None def _count_non_null_features(observations: list[AnalysisGridObservation]) -> dict[str, int]: counts = {feature_name: 0 for feature_name in DEFAULT_CLUSTER_FEATURES} for observation in observations: for feature_name in DEFAULT_CLUSTER_FEATURES: if _coerce_float(getattr(observation, feature_name, None)) is not None: counts[feature_name] += 1 return counts def _build_clustering_log_context( *, observations: list[AnalysisGridObservation], selected_features: list[str], run: RemoteSensingRun | None, location: SoilLocation | None, ) -> dict[str, Any]: first_observation = observations[0] if observations else None observation_metadata = dict(getattr(first_observation, "metadata", {}) or {}) resolved_run = run or getattr(first_observation, "run", None) resolved_location = location or getattr(getattr(first_observation, "cell", None), "soil_location", None) temporal_start = getattr(resolved_run, "temporal_start", None) or getattr(first_observation, "temporal_start", None) temporal_end = getattr(resolved_run, "temporal_end", None) or getattr(first_observation, "temporal_end", None) return { "run_id": getattr(resolved_run, "id", None), "job_ref": observation_metadata.get("job_refs", {}), "region_id": getattr(resolved_location, "id", None), "date_range": { "temporal_start": temporal_start.isoformat() if hasattr(temporal_start, "isoformat") else temporal_start, "temporal_end": temporal_end.isoformat() if hasattr(temporal_end, "isoformat") else temporal_end, }, "selected_features": selected_features, } def _serialize_log_payload(payload: dict[str, Any]) -> str: return json.dumps(payload, ensure_ascii=True, default=str, sort_keys=True)