UPDATE

2026-05-10 22:49:07 +03:30
parent 2d1f7da89e
commit 2a6321a263
15 changed files with 2667 additions and 162 deletions
@@ -42,6 +42,17 @@ else:

 logger = logging.getLogger(__name__)

+REMOTE_SENSING_TASK_MAX_RETRIES = 5  # max_retries of run_remote_sensing_analysis_task
+REMOTE_SENSING_TASK_RETRY_DELAY_SECONDS = 60  # default retry delay; base of the exponential backoff
+REMOTE_SENSING_TASK_RETRY_BACKOFF_MAX_SECONDS = 600  # upper bound on the computed retry countdown
+PERSISTED_OBSERVATION_FEATURES = (  # fields checked on upsert to detect a fully-null observation
+    "ndvi",
+    "ndwi",
+    "lst_c",
+    "soil_vv",
+    "soil_vv_db",
+)
+

 def run_remote_sensing_analysis(
    *,
@@ -122,58 +133,83 @@ def run_remote_sensing_analysis(
        )

        if not cells_to_process:
-            _record_run_stage(
-                run,
-                "using_cached_observations",
-                {"source": "database"},
-            )
            observations = _load_observations(
                location=location,
                block_code=resolved_block_code,
                temporal_start=start_date,
                temporal_end=end_date,
            )
-            subdivision_result = _ensure_subdivision_result(
-                location=location,
-                run=run,
-                subdivision=subdivision,
-                block_code=resolved_block_code,
+            if not _has_usable_observations(
                observations=observations,
-                cluster_count=cluster_count,
-                selected_features=selected_features,
-            )
-            _record_run_stage(
-                run,
-                "clustering_completed",
-                _build_clustering_stage_metadata(subdivision_result),
-            )
-            summary = {
-                "status": "completed",
-                "source": "database",
-                "run_id": run.id,
-                "processed_cell_count": 0,
-                "created_observation_count": 0,
-                "updated_observation_count": 0,
-                "existing_observation_count": len(all_cells),
-                "failed_metric_count": 0,
-                "chunk_size_sqm": grid_summary["chunk_size_sqm"],
-                "block_code": resolved_block_code,
-                "cell_count": len(all_cells),
-                "subdivision_result_id": getattr(subdivision_result, "id", None),
-                "cluster_count": getattr(subdivision_result, "cluster_count", 0),
-            }
-            _mark_run_success(run, summary)
-            return summary
+                selected_features=selected_features or list(DEFAULT_CLUSTER_FEATURES),
+            ):
+                logger.warning(
+                    "Cached observations are fully null, refetching remote metrics for run_id=%s",
+                    run.id,
+                )
+                _record_run_stage(
+                    run,
+                    "using_cached_observations",
+                    {"source": "database", "usable": False, "refetching": True},
+                )
+                cells_to_process = all_cells
+            else:
+                _record_run_stage(
+                    run,
+                    "using_cached_observations",
+                    {"source": "database", "usable": True, "refetching": False},
+                )
+                subdivision_result = _ensure_subdivision_result(
+                    location=location,
+                    run=run,
+                    subdivision=subdivision,
+                    block_code=resolved_block_code,
+                    observations=observations,
+                    cluster_count=cluster_count,
+                    selected_features=selected_features,
+                )
+                _record_run_stage(
+                    run,
+                    "clustering_completed",
+                    _build_clustering_stage_metadata(subdivision_result),
+                )
+                summary = {
+                    "status": "completed",
+                    "source": "database",
+                    "run_id": run.id,
+                    "processed_cell_count": 0,
+                    "created_observation_count": 0,
+                    "updated_observation_count": 0,
+                    "existing_observation_count": len(all_cells),
+                    "failed_metric_count": 0,
+                    "chunk_size_sqm": grid_summary["chunk_size_sqm"],
+                    "block_code": resolved_block_code,
+                    "cell_count": len(all_cells),
+                    "subdivision_result_id": getattr(subdivision_result, "id", None),
+                    "cluster_count": getattr(subdivision_result, "cluster_count", 0),
+                }
+                _mark_run_success(run, summary)
+                return summary

        _record_run_stage(
            run,
            "fetching_remote_metrics",
-            {"requested_cell_count": len(cells_to_process)},
+            _build_remote_metric_stage_details(
+                cells=cells_to_process,
+                selected_features=selected_features,
+            ),
+        )
+        progress_callback = _build_remote_metric_progress_callback(
+            run=run,
+            cells=cells_to_process,
+            selected_features=selected_features,
        )
        remote_payload = compute_remote_sensing_metrics(
            cells_to_process,
            temporal_start=start_date,
            temporal_end=end_date,
+            selected_features=selected_features or list(DEFAULT_CLUSTER_FEATURES),
+            progress_callback=progress_callback,
        )
        _record_run_stage(
            run,
@@ -242,7 +278,11 @@ def run_remote_sensing_analysis(
        raise


-@app.task(bind=True, max_retries=3, default_retry_delay=60)
+@app.task(
+    bind=True,
+    max_retries=REMOTE_SENSING_TASK_MAX_RETRIES,
+    default_retry_delay=REMOTE_SENSING_TASK_RETRY_DELAY_SECONDS,
+)
 def run_remote_sensing_analysis_task(
    self,
    soil_location_id: int,
@@ -287,17 +327,30 @@ def run_remote_sensing_analysis_task(
        )
        raise
    except (OpenEOExecutionError, OpenEOServiceError, RequestException, DataDrivenSubdivisionError) as exc:
+        retry_count = self.request.retries + 1
+        countdown = min(
+            REMOTE_SENSING_TASK_RETRY_DELAY_SECONDS * (2 ** self.request.retries),
+            REMOTE_SENSING_TASK_RETRY_BACKOFF_MAX_SECONDS,
+        )
+        _mark_run_retrying(
+            run_id=run_id,
+            task_id=self.request.id,
+            error_message=str(exc),
+            retry_count=retry_count,
+            retry_delay_seconds=countdown,
+        )
        logger.warning(
            "Transient remote sensing failure, retrying task",
            extra={
                "task_id": self.request.id,
                "soil_location_id": soil_location_id,
                "block_code": block_code,
-                "retry_count": self.request.retries,
+                "retry_count": retry_count,
+                "retry_delay_seconds": countdown,
                "error": str(exc),
            },
        )
-        raise self.retry(exc=exc)
+        raise self.retry(exc=exc, countdown=countdown)


 def _normalize_temporal_date(value: Any, field_name: str):
@@ -442,8 +495,20 @@ def _mark_run_success(

 def _mark_run_failure(run: RemoteSensingRun, error_message: str) -> None:
    metadata = dict(run.metadata or {})
+    failed_stage = str(metadata.get("stage") or "").strip() or None
+    stage_details = dict(metadata.get("stage_details") or {})
    metadata["status_label"] = "failed"
+    metadata["stage"] = "failed"
+    metadata["failed_stage"] = failed_stage
    metadata["failure_reason"] = error_message[:4000]
+    metadata["stage_details"] = {
+        **stage_details,
+        "failed": {
+            "failed_stage": failed_stage,
+            "error_message": error_message[:4000],
+            "failed_stage_details": stage_details.get(failed_stage, {}) if failed_stage else {},
+        },
+    }
    metadata["timestamps"] = {
        **dict(metadata.get("timestamps") or {}),
        "failed_at": timezone.now().isoformat(),
@@ -467,6 +532,51 @@ def _mark_run_failure(run: RemoteSensingRun, error_message: str) -> None:
    )


+def _mark_run_retrying(
+    *,
+    run_id: int | None,
+    task_id: str,
+    error_message: str,
+    retry_count: int,
+    retry_delay_seconds: int,
+) -> None:
+    """Record on the matching run that the task is about to be retried.
+
+    The run is looked up by ``run_id`` first and otherwise by the task id stored in
+    its metadata; nothing is written when no run matches. ``failed_stage`` keeps a
+    previously recorded value, falling back to the stage current at call time.
+    """
+    run = RemoteSensingRun.objects.filter(pk=run_id).first() if run_id is not None else None
+    if run is None and task_id:
+        # Fall back to matching the task id through the metadata__task_id lookup.
+        run = RemoteSensingRun.objects.filter(metadata__task_id=str(task_id)).first()
+    if run is None:
+        return
+
+    meta = dict(run.metadata or {})
+    prior_details = dict(meta.get("stage_details") or {})
+    stage_name = str(meta.get("failed_stage") or meta.get("stage") or "").strip() or None
+    # Snapshot of this failure, stored under stage_details["retrying"].
+    retry_details = {
+        "retry_count": retry_count,
+        "retry_delay_seconds": retry_delay_seconds,
+        "last_error": error_message[:4000],
+        "failed_stage": stage_name,
+        "failed_stage_details": prior_details.get(stage_name, {}) if stage_name else {},
+    }
+    meta.update(status_label="retrying", stage="retrying", failed_stage=stage_name)
+    meta.pop("failure_reason", None)
+    meta["stage_details"] = {**prior_details, "retrying": retry_details}
+    stamps = dict(meta.get("timestamps") or {})
+    stamps["retrying_at"] = timezone.now().isoformat()
+    meta["timestamps"] = stamps
+    # The run is kept in the RUNNING status and its error text is cleared.
+    run.status = RemoteSensingRun.STATUS_RUNNING
+    run.error_message = ""
+    run.metadata = meta
+    run.save(update_fields=["status", "error_message", "metadata", "updated_at"])
+
+
 def _load_grid_cells(location: SoilLocation, block_code: str) -> list[AnalysisGridCell]:
    queryset = AnalysisGridCell.objects.filter(soil_location=location)
    queryset = queryset.filter(block_code=block_code or "")
@@ -513,6 +623,17 @@ def _select_cells_for_processing(
    return [cell for cell in all_cells if cell.id not in existing_ids]


+def _has_usable_observations(
+    *,
+    observations: list[AnalysisGridObservation],
+    selected_features: list[str],
+) -> bool:
+    """Return True when any observation has a non-None value for a selected feature."""
+    return any(
+        getattr(row, name, None) is not None for row in observations for name in selected_features
+    )
+
+
 def _upsert_grid_observations(
    *,
    cells: list[AnalysisGridCell],
@@ -521,19 +642,47 @@ def _upsert_grid_observations(
    temporal_end,
    metric_payload: dict[str, Any],
 ) -> dict[str, int]:
+    result_by_cell = metric_payload.get("results", {})
+    payload_diagnostics = metric_payload["metadata"].get("payload_diagnostics", {})
+    payload_cell_codes = sorted(str(cell_code) for cell_code in result_by_cell.keys())
+    db_cell_codes = [cell.cell_code for cell in cells]
+    matched_cell_codes = sorted(set(db_cell_codes) & set(payload_cell_codes))
+    unmatched_db_cell_codes = sorted(set(db_cell_codes) - set(payload_cell_codes))
+    unmatched_payload_cell_codes = sorted(set(payload_cell_codes) - set(db_cell_codes))
+    available_features = _collect_available_features(
+        result_by_cell=result_by_cell,
+        payload_diagnostics=payload_diagnostics,
+    )
+    payload_keys_sample = payload_cell_codes[:5]
+
    metadata_template = {
        "backend_name": metric_payload["metadata"].get("backend"),
        "backend_url": metric_payload["metadata"].get("backend_url"),
        "collections_used": metric_payload["metadata"].get("collections_used", []),
-        "slope_supported": metric_payload["metadata"].get("slope_supported", False),
        "job_refs": metric_payload["metadata"].get("job_refs", {}),
        "failed_metrics": metric_payload["metadata"].get("failed_metrics", []),
+        "payload_diagnostics": payload_diagnostics,
        "run_id": run.id,
    }
-    result_by_cell = metric_payload.get("results", {})
+
+    logger.info(
+        "Remote sensing payload/DB cell comparison: %s",
+        {
+            "run_id": run.id,
+            "db_cell_count": len(db_cell_codes),
+            "payload_cell_count": len(payload_cell_codes),
+            "matched_cell_count": len(matched_cell_codes),
+            "unmatched_db_cell_codes": unmatched_db_cell_codes,
+            "unmatched_payload_cell_codes": unmatched_payload_cell_codes,
+        },
+    )
+    if not matched_cell_codes:
+        logger.error("No payload cells matched DB cell_codes for run_id=%s", run.id)

    created_count = 0
    updated_count = 0
+    usable_observation_count = 0
+    fully_null_observation_count = 0
    with transaction.atomic():
        for cell in cells:
            values = result_by_cell.get(cell.cell_code, {})
@@ -544,10 +693,19 @@ def _upsert_grid_observations(
                "lst_c": values.get("lst_c"),
                "soil_vv": values.get("soil_vv"),
                "soil_vv_db": values.get("soil_vv_db"),
-                "dem_m": values.get("dem_m"),
-                "slope_deg": values.get("slope_deg"),
                "metadata": metadata_template,
            }
+            persisted_values = [defaults[feature_name] for feature_name in PERSISTED_OBSERVATION_FEATURES]
+            usable_values = [defaults[feature_name] for feature_name in DEFAULT_CLUSTER_FEATURES]
+            if all(value is None for value in persisted_values):
+                fully_null_observation_count += 1
+                logger.warning(
+                    "Persisting empty observation for cell=%s, run_id=%s",
+                    cell.cell_code,
+                    run.id,
+                )
+            if any(value is not None for value in usable_values):
+                usable_observation_count += 1
            observation, created = AnalysisGridObservation.objects.update_or_create(
                cell=cell,
                temporal_start=temporal_start,
@@ -558,7 +716,179 @@ def _upsert_grid_observations(
                created_count += 1
            else:
                updated_count += 1
-    return {"created_count": created_count, "updated_count": updated_count}
+
+    summary = {
+        "created_count": created_count,
+        "updated_count": updated_count,
+        "total_observation_count": len(cells),
+        "usable_observation_count": usable_observation_count,
+        "fully_null_observation_count": fully_null_observation_count,
+        "matched_cell_count": len(matched_cell_codes),
+        "matched_cell_codes": matched_cell_codes,
+        "unmatched_db_cell_codes": unmatched_db_cell_codes,
+        "unmatched_payload_cell_codes": unmatched_payload_cell_codes,
+        "payload_keys_sample": payload_keys_sample,
+        "available_features": available_features,
+    }
+    logger.info("Grid observation upsert summary: %s", summary)
+    if usable_observation_count == 0:
+        diagnostics = {
+            "job_ref": metadata_template["job_refs"],
+            "total_cells": len(cells),
+            "matched_cells": len(matched_cell_codes),
+            "payload_keys_sample": payload_keys_sample,
+            "available_features": available_features,
+        }
+        logger.error("All persisted observations are empty for run_id=%s", run.id)
+        _store_empty_observation_diagnostics(run=run, diagnostics=diagnostics)
+        summary["empty_observation_diagnostics"] = diagnostics
+
+    return summary
+
+
+def _collect_available_features(
+    *,
+    result_by_cell: dict[str, dict[str, Any]],
+    payload_diagnostics: dict[str, Any],
+) -> list[str]:
+    """Return the sorted, de-duplicated feature names seen in the payload.
+
+    Counts non-None cell values and names listed by dict diagnostics entries; None entries are skipped.
+    """
+    names = {str(k) for row in result_by_cell.values() for k, v in (row or {}).items() if v is not None}
+    for entry in (payload_diagnostics or {}).values():
+        if isinstance(entry, dict):
+            names.update(str(name) for name in entry.get("available_features") or [])
+    return sorted(names)
+
+
+def _store_empty_observation_diagnostics(*, run: RemoteSensingRun, diagnostics: dict[str, Any]) -> None:
+    """Save ``diagnostics`` under ``metadata["diagnostics"]["empty_observations"]`` of the run."""
+    run_metadata = dict(run.metadata or {})
+    existing = dict(run_metadata.get("diagnostics") or {})
+    existing["empty_observations"] = diagnostics
+    run_metadata["diagnostics"] = existing
+    run.metadata = run_metadata
+    run.save(update_fields=["metadata", "updated_at"])
+
+
+def _build_remote_metric_stage_details(
+    *,
+    cells: list[AnalysisGridCell],
+    selected_features: list[str] | None,
+    active_metric: str | None = None,
+    completed_metrics: list[str] | None = None,
+    failed_metrics: list[dict[str, Any]] | None = None,
+    metric_states: list[dict[str, Any]] | None = None,
+) -> dict[str, Any]:
+    """Build the details payload recorded for the remote metric fetch stage.
+
+    Contains the target cells and a ``metric_progress`` summary. A non-empty
+    ``metric_states`` is used as given; otherwise each feature gets a status with
+    precedence completed > failed > running (``active_metric``) > pending.
+    """
+    features = list(selected_features or DEFAULT_CLUSTER_FEATURES)
+    completed = list(completed_metrics or [])
+    failed = list(failed_metrics or [])
+
+    def status_of(metric_name: str) -> str:
+        if metric_name in completed:
+            return "completed"
+        if any(item.get("metric") == metric_name for item in failed):
+            return "failed"
+        return "running" if metric_name == active_metric else "pending"
+
+    states = metric_states or [{"metric": name, "status": status_of(name)} for name in features]
+    return {
+        "requested_cell_count": len(cells),
+        "target_cells": [
+            {
+                "cell_code": cell.cell_code,
+                "block_code": cell.block_code,
+                "centroid_lat": str(cell.centroid_lat),
+                "centroid_lon": str(cell.centroid_lon),
+                "chunk_size_sqm": cell.chunk_size_sqm,
+            }
+            for cell in cells
+        ],
+        "metric_progress": {
+            "total_metrics": len(features),
+            "completed_metric_count": len(completed),
+            "active_metric": active_metric,
+            "completed_metrics": completed,
+            "failed_metrics": failed,
+            "states": states,
+        },
+    }
+
+
+def _normalize_progress_metric_name(metric_name: str, features: list[str]) -> str:
+    """Map a source metric to its derived feature name when that feature is tracked.
+
+    ``soil_vv`` is reported as ``soil_vv_db`` if ``soil_vv_db`` is in ``features``;
+    every other name is returned unchanged.
+    """
+    alias = {"soil_vv": "soil_vv_db"}.get(metric_name, metric_name)
+    return alias if alias in features else metric_name
+
+
+def _resolve_progress_job_ref(candidate: str, job_refs: dict[str, Any]) -> Any:
+    """Return the job ref for ``candidate``, else the ref of its source metric, else None."""
+    if candidate in job_refs:
+        return job_refs[candidate]
+    # Not keyed directly: fall back to the source metric (soil_vv_db -> soil_vv).
+    source_metric = {"soil_vv_db": "soil_vv"}.get(candidate, candidate)
+    return job_refs.get(source_metric)
+
+
+def _build_remote_metric_progress_callback(
+    *,
+    run: RemoteSensingRun,
+    cells: list[AnalysisGridCell],
+    selected_features: list[str] | None,
+):
+    """Return a callback that tracks metric progress and re-records the fetch stage on each call."""
+    features = list(selected_features or DEFAULT_CLUSTER_FEATURES)
+    completed_metrics: list[str] = []
+    failed_metrics: list[dict[str, Any]] = []
+
+    def callback(*, metric_name: str, state: str, metadata: dict[str, Any], metric_payload=None, error: str = "") -> None:
+        current = _normalize_progress_metric_name(metric_name, features)
+        is_running = state == "running"
+        if state == "completed" and current not in completed_metrics:
+            completed_metrics.append(current)
+        # Each (metric, error) failure pair is remembered only once.
+        if state == "failed" and not any(
+            item.get("metric") == current and item.get("error") == error for item in failed_metrics
+        ):
+            failed_metrics.append({"metric": current, "error": error})
+
+        def status_of(candidate: str) -> str:
+            if candidate in completed_metrics:
+                return "completed"
+            if any(item.get("metric") == candidate for item in failed_metrics):
+                return "failed"
+            return "running" if is_running and candidate == current else "pending"
+
+        states = [
+            {
+                "metric": candidate,
+                "status": status_of(candidate),
+                "job_ref": _resolve_progress_job_ref(candidate, metadata.get("job_refs", {})),
+            }
+            for candidate in features
+        ]
+        stage_details = _build_remote_metric_stage_details(
+            cells=cells,
+            selected_features=features,
+            active_metric=current if is_running else None,
+            completed_metrics=completed_metrics,
+            failed_metrics=failed_metrics,
+            metric_states=states,
+        )
+        _record_run_stage(run, "fetching_remote_metrics", stage_details)
+
+    return callback


 def _ensure_subdivision_result(