from __future__ import annotations

from dataclasses import dataclass
import json
import logging
from typing import Any

from django.db import transaction

from .block_subdivision import detect_elbow_point, render_elbow_plot
from .models import (
    AnalysisGridObservation,
    BlockSubdivision,
    RemoteSensingClusterAssignment,
    RemoteSensingRun,
    RemoteSensingSubdivisionResult,
    SoilLocation,
)


DEFAULT_CLUSTER_FEATURES = [
    "ndvi",
    "ndwi",
    "lst_c",
    "soil_vv_db",
]
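# Feature name glossary (standard remote-sensing abbreviations; the exact data
# sources are an assumption, not confirmed by this module):
#   ndvi       normalized difference vegetation index
#   ndwi       normalized difference water index
#   lst_c      land surface temperature, in degrees Celsius
#   soil_vv_db radar VV-polarization backscatter over soil, in decibels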
SUPPORTED_CLUSTER_FEATURES = tuple(DEFAULT_CLUSTER_FEATURES)
DEFAULT_RANDOM_STATE = 42
DEFAULT_MAX_K = 10

logger = logging.getLogger(__name__)


class DataDrivenSubdivisionError(Exception):
    """Raised when remote-sensing-driven subdivision cannot be computed."""


class EmptyObservationDatasetError(DataDrivenSubdivisionError):
    """Raised when upstream persistence completes without usable clustering features."""


@dataclass
class ClusteringDataset:
    """Prepared clustering inputs together with the preprocessing artifacts used to build them."""

    observations: list[AnalysisGridObservation]
    selected_features: list[str]
    raw_feature_rows: list[list[float | None]]
    raw_feature_maps: list[dict[str, float | None]]
    skipped_cell_codes: list[str]
    used_cell_codes: list[str]
    imputed_matrix: list[list[float]]
    scaled_matrix: list[list[float]]
    imputer_statistics: dict[str, float | None]
    scaler_means: dict[str, float]
    scaler_scales: dict[str, float]
    missing_value_counts: dict[str, int]
    skipped_reasons: dict[str, list[str]]


def create_remote_sensing_subdivision_result(
    *,
    location: SoilLocation,
    run: RemoteSensingRun,
    observations: list[AnalysisGridObservation],
    block_subdivision: BlockSubdivision | None = None,
    block_code: str = "",
    selected_features: list[str] | None = None,
    explicit_k: int | None = None,
    max_k: int = DEFAULT_MAX_K,
    random_state: int = DEFAULT_RANDOM_STATE,
) -> RemoteSensingSubdivisionResult:
    """
    Build a data-driven subdivision result from stored remote sensing observations.

    KMeans is applied to actual per-cell feature vectors, not geometric points.
    """
    dataset = build_clustering_dataset(
        observations=observations,
        selected_features=selected_features,
        run=run,
        location=location,
    )
    if not dataset.observations:
        raise DataDrivenSubdivisionError("No usable observations remained for clustering.")

    optimal_k, inertia_curve = choose_cluster_count(
        scaled_matrix=dataset.scaled_matrix,
        explicit_k=explicit_k,
        max_k=max_k,
        random_state=random_state,
    )
    cluster_selection_strategy = "explicit_k" if explicit_k is not None else "elbow"
    labels = run_kmeans_labels(
        scaled_matrix=dataset.scaled_matrix,
        cluster_count=optimal_k,
        random_state=random_state,
    )
    cluster_summaries = build_cluster_summaries(
        observations=dataset.observations,
        labels=labels,
    )
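
    # Persist the result row and its per-cell assignments atomically so a failed
    # bulk insert cannot leave a result without matching assignments.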
    with transaction.atomic():
        result, _created = RemoteSensingSubdivisionResult.objects.update_or_create(
            run=run,
            defaults={
                "soil_location": location,
                "block_subdivision": block_subdivision,
                "block_code": block_code,
                "chunk_size_sqm": run.chunk_size_sqm,
                "temporal_start": run.temporal_start,
                "temporal_end": run.temporal_end,
                "cluster_count": optimal_k,
                "selected_features": dataset.selected_features,
                "skipped_cell_codes": dataset.skipped_cell_codes,
                "metadata": {
                    "cell_count": len(observations),
                    "used_cell_count": len(dataset.observations),
                    "skipped_cell_count": len(dataset.skipped_cell_codes),
                    "used_cell_codes": dataset.used_cell_codes,
                    "skipped_reasons": dataset.skipped_reasons,
                    "selected_features": dataset.selected_features,
                    "imputer_strategy": "median",
                    "imputer_statistics": dataset.imputer_statistics,
                    "missing_value_counts": dataset.missing_value_counts,
                    "scaler_means": dataset.scaler_means,
                    "scaler_scales": dataset.scaler_scales,
                    "kmeans_params": {
                        "random_state": random_state,
                        "explicit_k": explicit_k,
                        "selected_k": optimal_k,
                        "max_k": max_k,
                        "n_init": 10,
                        "selection_strategy": cluster_selection_strategy,
                    },
                    "inertia_curve": inertia_curve,
                    "cluster_summaries": cluster_summaries,
                },
            },
        )
        result.assignments.all().delete()
        assignment_rows = []
        for index, observation in enumerate(dataset.observations):
            assignment_rows.append(
                RemoteSensingClusterAssignment(
                    result=result,
                    cell=observation.cell,
                    cluster_label=int(labels[index]),
                    raw_feature_values=dataset.raw_feature_maps[index],
                    scaled_feature_values={
                        feature_name: round(dataset.scaled_matrix[index][feature_index], 6)
                        for feature_index, feature_name in enumerate(dataset.selected_features)
                    },
                )
            )
        RemoteSensingClusterAssignment.objects.bulk_create(assignment_rows)
        if block_subdivision is not None:
            sync_block_subdivision_with_result(
                block_subdivision=block_subdivision,
                result=result,
                observations=observations,
                cluster_summaries=cluster_summaries,
            )
        sync_location_block_layout_with_result(
            location=location,
            result=result,
            cluster_summaries=cluster_summaries,
        )
    return result


def build_clustering_dataset(
    *,
    observations: list[AnalysisGridObservation],
    selected_features: list[str] | None = None,
    run: RemoteSensingRun | None = None,
    location: SoilLocation | None = None,
) -> ClusteringDataset:
    selected_features = list(selected_features or DEFAULT_CLUSTER_FEATURES)
    invalid_features = [
        feature_name
        for feature_name in selected_features
        if feature_name not in SUPPORTED_CLUSTER_FEATURES
    ]
    if invalid_features:
        raise DataDrivenSubdivisionError(
            "Invalid clustering features: " + ", ".join(sorted(invalid_features))
        )
    log_context = _build_clustering_log_context(
        observations=observations,
        selected_features=selected_features,
        run=run,
        location=location,
    )
    logger.info(
        "Preparing clustering dataset: %s",
        _serialize_log_payload(
            {
                **log_context,
                "total_observations": len(observations),
                "non_null_counts": _count_non_null_features(observations),
            }
        ),
    )
    raw_rows: list[list[float | None]] = []
    raw_maps: list[dict[str, float | None]] = []
    usable_observations: list[AnalysisGridObservation] = []
    skipped_cell_codes: list[str] = []
    used_cell_codes: list[str] = []
    missing_value_counts = {feature_name: 0 for feature_name in selected_features}
    skipped_reasons = {"all_features_missing": []}
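
    # A cell is skipped only when every selected feature is null; rows with
    # partial missingness are kept and filled by the median imputer below.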
    for observation in observations:
        feature_map = {
            feature_name: _coerce_float(getattr(observation, feature_name, None))
            for feature_name in selected_features
        }
        for feature_name, value in feature_map.items():
            if value is None:
                missing_value_counts[feature_name] += 1
        if all(value is None for value in feature_map.values()):
            logger.debug(
                "Skipping observation cell=%s: all clustering features are null | context=%s",
                observation.cell.cell_code,
                _serialize_log_payload(log_context),
            )
            skipped_cell_codes.append(observation.cell.cell_code)
            skipped_reasons["all_features_missing"].append(observation.cell.cell_code)
            continue
        usable_observations.append(observation)
        used_cell_codes.append(observation.cell.cell_code)
        raw_maps.append(feature_map)
        raw_rows.append([feature_map[feature_name] for feature_name in selected_features])

    logger.info(
        "Clustering dataset filtered observations: %s",
        _serialize_log_payload(
            {
                **log_context,
                "remaining_observations": len(usable_observations),
                "removed_observations": len(observations) - len(usable_observations),
            }
        ),
    )

    zero_usable_feature_names = [
        feature_name
        for feature_name, missing_count in missing_value_counts.items()
        if missing_count == len(observations)
    ]
    if zero_usable_feature_names and len(zero_usable_feature_names) < len(selected_features):
        for feature_name in zero_usable_feature_names:
            logger.warning(
                "Feature %s has zero usable values in dataset | context=%s",
                feature_name,
                _serialize_log_payload(log_context),
            )

    if not usable_observations:
        error_context = {
            **log_context,
            "total_observations": len(observations),
            "removed_observations": len(observations),
            "null_counts_per_feature": missing_value_counts,
            "selected_features": selected_features,
        }
        logger.error(
            "No usable observations available for clustering: %s",
            _serialize_log_payload(error_context),
        )
        raise EmptyObservationDatasetError(
            "Upstream processing completed but no usable feature values were persisted."
        )

    try:
        import numpy as np
        from sklearn.impute import SimpleImputer
        from sklearn.preprocessing import StandardScaler
    except ImportError as exc:  # pragma: no cover - runtime dependency guard
        raise DataDrivenSubdivisionError(
            "scikit-learn and numpy are required for data-driven clustering."
        ) from exc
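
    # Preprocess: fill missing entries with the per-feature median, then
    # standardize each column to z = (x - mean) / std so no single feature
    # dominates the KMeans distance. This assumes every selected feature has at
    # least one non-null value; SimpleImputer silently drops all-NaN columns,
    # which would misalign the feature-name mappings built below.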
    raw_matrix = np.array(raw_rows, dtype=float)
    imputer = SimpleImputer(strategy="median")
    imputed_matrix = imputer.fit_transform(raw_matrix)
    scaler = StandardScaler()
    scaled_matrix = scaler.fit_transform(imputed_matrix)

    return ClusteringDataset(
        observations=usable_observations,
        selected_features=selected_features,
        raw_feature_rows=raw_rows,
        raw_feature_maps=raw_maps,
        skipped_cell_codes=skipped_cell_codes,
        used_cell_codes=used_cell_codes,
        imputed_matrix=imputed_matrix.tolist(),
        scaled_matrix=scaled_matrix.tolist(),
        imputer_statistics={
            feature_name: _coerce_float(imputer.statistics_[index])
            for index, feature_name in enumerate(selected_features)
        },
        scaler_means={
            feature_name: float(scaler.mean_[index])
            for index, feature_name in enumerate(selected_features)
        },
        scaler_scales={
            feature_name: float(scaler.scale_[index] or 1.0)
            for index, feature_name in enumerate(selected_features)
        },
        missing_value_counts=missing_value_counts,
        skipped_reasons=skipped_reasons,
    )


def choose_cluster_count(
    *,
    scaled_matrix: list[list[float]],
    explicit_k: int | None,
    max_k: int,
    random_state: int,
) -> tuple[int, list[dict[str, float]]]:
    sample_count = len(scaled_matrix)
    if sample_count == 0:
        raise DataDrivenSubdivisionError("There are no samples to cluster.")
    if sample_count == 1:
        return 1, [{"k": 1, "sse": 0.0}]

    if explicit_k is not None:
        if explicit_k <= 0:
            raise DataDrivenSubdivisionError("cluster_count must be greater than zero.")
        return min(explicit_k, sample_count), []

    try:
        from sklearn.cluster import KMeans
    except ImportError as exc:  # pragma: no cover
        raise DataDrivenSubdivisionError("scikit-learn is required to select the cluster count.") from exc

    max_allowed_k = min(max_k, sample_count)
    inertia_curve = []
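    # Record the KMeans inertia (SSE: the sum of squared distances from each
    # sample to its nearest centroid) for k = 1..max_allowed_k. SSE decreases
    # monotonically in k; detect_elbow_point picks the k where the marginal
    # improvement levels off.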
    for k in range(1, max_allowed_k + 1):
        model = KMeans(n_clusters=k, n_init=10, random_state=random_state)
        model.fit(scaled_matrix)
        inertia_curve.append({"k": k, "sse": round(float(model.inertia_), 6)})
    return detect_elbow_point(inertia_curve), inertia_curve


def run_kmeans_labels(
    *,
    scaled_matrix: list[list[float]],
    cluster_count: int,
    random_state: int,
) -> list[int]:
    if cluster_count <= 0:
        raise DataDrivenSubdivisionError("cluster_count must be greater than zero.")
    if len(scaled_matrix) == 1:
        return [0]
    try:
        from sklearn.cluster import KMeans
    except ImportError as exc:  # pragma: no cover
        raise DataDrivenSubdivisionError("scikit-learn is required to run KMeans.") from exc
    model = KMeans(n_clusters=cluster_count, n_init=10, random_state=random_state)
    return [int(label) for label in model.fit_predict(scaled_matrix)]


def build_cluster_summaries(
    *,
    observations: list[AnalysisGridObservation],
    labels: list[int],
) -> list[dict[str, Any]]:
    clusters: dict[int, dict[str, Any]] = {}
    for observation, label in zip(observations, labels):
        cluster = clusters.setdefault(
            int(label),
            {
                "cluster_label": int(label),
                "cell_codes": [],
                "centroid_lat_sum": 0.0,
                "centroid_lon_sum": 0.0,
                "cell_count": 0,
            },
        )
        cluster["cell_codes"].append(observation.cell.cell_code)
        cluster["centroid_lat_sum"] += float(observation.cell.centroid_lat)
        cluster["centroid_lon_sum"] += float(observation.cell.centroid_lon)
        cluster["cell_count"] += 1

    summaries = []
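    # Cluster centroids are the arithmetic mean of member-cell centroids, an
    # adequate approximation at field scale where lat/lon curvature is
    # negligible. The `or 1` below is a defensive guard; setdefault above makes
    # a zero cell_count unreachable.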
    for cluster_label in sorted(clusters):
        cluster = clusters[cluster_label]
        cell_count = cluster["cell_count"] or 1
        summaries.append(
            {
                "cluster_label": cluster_label,
                "cell_count": cluster["cell_count"],
                "centroid_lat": round(cluster["centroid_lat_sum"] / cell_count, 6),
                "centroid_lon": round(cluster["centroid_lon_sum"] / cell_count, 6),
                "cell_codes": cluster["cell_codes"],
            }
        )
    return summaries


def sync_location_block_layout_with_result(
    *,
    location: SoilLocation,
    result: RemoteSensingSubdivisionResult,
    cluster_summaries: list[dict[str, Any]],
) -> None:
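    # block_layout is assumed to be a JSON document shaped roughly like:
    #     {"algorithm_status": "...",
    #      "blocks": [{"block_code": "...", "order": 1, "sub_blocks": [...]}]}
    # Upsert the block matching result.block_code and rewrite its sub-blocks
    # from the clustering output.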
    layout = dict(location.block_layout or {})
    blocks = list(layout.get("blocks") or [])
    target_block = None
    for block in blocks:
        if block.get("block_code") == result.block_code:
            target_block = block
            break

    if target_block is None:
        target_block = {
            "block_code": result.block_code,
            "order": len(blocks) + 1,
            "source": "remote_sensing",
            "needs_subdivision": None,
            "sub_blocks": [],
        }
        blocks.append(target_block)

    target_block["needs_subdivision"] = result.cluster_count > 1
    target_block["sub_blocks"] = [
        {
            "sub_block_code": f"cluster-{cluster['cluster_label']}",
            "cluster_label": cluster["cluster_label"],
            "centroid_lat": cluster["centroid_lat"],
            "centroid_lon": cluster["centroid_lon"],
            "cell_count": cluster["cell_count"],
        }
        for cluster in cluster_summaries
    ]
    target_block["subdivision_summary"] = {
        "type": "data_driven_remote_sensing",
        "cluster_count": result.cluster_count,
        "selected_features": result.selected_features,
        "used_cell_count": result.metadata.get("used_cell_count", 0),
        "skipped_cell_count": result.metadata.get("skipped_cell_count", 0),
        "run_id": result.run_id,
    }
    layout["blocks"] = blocks
    layout["algorithm_status"] = "completed"
    location.block_layout = layout
    location.save(update_fields=["block_layout", "updated_at"])


def sync_block_subdivision_with_result(
    *,
    block_subdivision: BlockSubdivision,
    result: RemoteSensingSubdivisionResult,
    observations: list[AnalysisGridObservation],
    cluster_summaries: list[dict[str, Any]],
) -> None:
    metadata = dict(block_subdivision.metadata or {})
    metadata["data_driven_subdivision"] = {
        "run_id": result.run_id,
        "result_id": result.id,
        "cluster_count": result.cluster_count,
        "selected_features": result.selected_features,
        "used_cell_count": result.metadata.get("used_cell_count", 0),
        "skipped_cell_count": result.metadata.get("skipped_cell_count", 0),
        "temporal_extent": {
            "start_date": result.temporal_start.isoformat() if result.temporal_start else None,
            "end_date": result.temporal_end.isoformat() if result.temporal_end else None,
        },
        "inertia_curve": result.metadata.get("inertia_curve", []),
    }

    block_subdivision.grid_points = [
        {
            "cell_code": observation.cell.cell_code,
            "centroid_lat": round(float(observation.cell.centroid_lat), 6),
            "centroid_lon": round(float(observation.cell.centroid_lon), 6),
        }
        for observation in observations
    ]
    block_subdivision.centroid_points = [
        {
            "sub_block_code": f"cluster-{cluster['cluster_label']}",
            "cluster_label": cluster["cluster_label"],
            "centroid_lat": cluster["centroid_lat"],
            "centroid_lon": cluster["centroid_lon"],
            "cell_count": cluster["cell_count"],
            "cell_codes": cluster["cell_codes"],
        }
        for cluster in cluster_summaries
    ]
    block_subdivision.grid_point_count = len(observations)
    block_subdivision.centroid_count = len(cluster_summaries)
    block_subdivision.status = "subdivided"
    block_subdivision.metadata = metadata

    update_fields = [
        "grid_points",
        "centroid_points",
        "grid_point_count",
        "centroid_count",
        "status",
        "metadata",
        "updated_at",
    ]
    plot_content = render_elbow_plot(
        inertia_curve=result.metadata.get("inertia_curve", []),
        optimal_k=result.cluster_count,
        block_code=result.block_code or block_subdivision.block_code,
    )
    if plot_content is not None:
        block_subdivision.elbow_plot.save(
            f"remote-sensing-{result.soil_location_id}-{result.block_code or block_subdivision.block_code}-elbow.png",
            plot_content,
            save=False,
        )
        update_fields.append("elbow_plot")
    block_subdivision.save(update_fields=update_fields)


def _coerce_float(value: Any) -> float | None:
    if value is None:
        return None
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def _count_non_null_features(observations: list[AnalysisGridObservation]) -> dict[str, int]:
    counts = {feature_name: 0 for feature_name in DEFAULT_CLUSTER_FEATURES}
    for observation in observations:
        for feature_name in DEFAULT_CLUSTER_FEATURES:
            if _coerce_float(getattr(observation, feature_name, None)) is not None:
                counts[feature_name] += 1
    return counts


def _build_clustering_log_context(
    *,
    observations: list[AnalysisGridObservation],
    selected_features: list[str],
    run: RemoteSensingRun | None,
    location: SoilLocation | None,
) -> dict[str, Any]:
    first_observation = observations[0] if observations else None
    observation_metadata = dict(getattr(first_observation, "metadata", {}) or {})
    resolved_run = run or getattr(first_observation, "run", None)
    resolved_location = location or getattr(getattr(first_observation, "cell", None), "soil_location", None)
    temporal_start = getattr(resolved_run, "temporal_start", None) or getattr(first_observation, "temporal_start", None)
    temporal_end = getattr(resolved_run, "temporal_end", None) or getattr(first_observation, "temporal_end", None)
    return {
        "run_id": getattr(resolved_run, "id", None),
        "job_ref": observation_metadata.get("job_refs", {}),
        "region_id": getattr(resolved_location, "id", None),
        "date_range": {
            "temporal_start": temporal_start.isoformat() if hasattr(temporal_start, "isoformat") else temporal_start,
            "temporal_end": temporal_end.isoformat() if hasattr(temporal_end, "isoformat") else temporal_end,
        },
        "selected_features": selected_features,
    }


def _serialize_log_payload(payload: dict[str, Any]) -> str:
    return json.dumps(payload, ensure_ascii=True, default=str, sort_keys=True)
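

# Log payloads are serialized with sort_keys=True and default=str so lines stay
# stable and grep-able even when values are dates or model instances;
# ensure_ascii=True keeps multi-byte content escaped in plain-ASCII logs.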