Skip to content

Commit 7335d52

Browse files
committed
Fix MLflow experiment tracker crash with non-existent runs
Previously, the MLflow experiment tracker would crash with a RESOURCE_DOES_NOT_EXIST error when attempting to resume a run that existed in ZenML's cache but not on the MLflow server. This was particularly problematic with Azure ML deployments. The fix validates that a cached run_id actually exists on the MLflow server before attempting to resume it. If validation fails, the code gracefully creates a new run instead of crashing. Changes: - Added MlflowException import - Added run existence validation in prepare_step_run() - Log warning when creating new run instead of resuming stale one - Added test to verify graceful handling of missing runs Fixes #4207
1 parent 00de1ad commit 7335d52

File tree

2 files changed

+81
-0
lines changed

2 files changed

+81
-0
lines changed

src/zenml/integrations/mlflow/experiment_trackers/mlflow_experiment_tracker.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@

 import mlflow
 from mlflow.entities import Experiment, Run
+from mlflow.exceptions import MlflowException
 from mlflow.store.db.db_types import DATABASE_ENGINES

 import zenml
@@ -195,6 +196,18 @@ def prepare_step_run(self, info: "StepRunInfo") -> None:
            experiment_name=experiment_name, run_name=info.run_name
        )

+        # Validate that the run exists before attempting to resume it
+        if run_id:
+            try:
+                mlflow.get_run(run_id)
+            except MlflowException as e:
+                # Run doesn't exist on the MLflow server, create a new one
+                logger.warning(
+                    f"Run with id {run_id} not found in MLflow tracking server. "
+                    f"Creating a new run instead. Error: {e}"
+                )
+                run_id = None
+
        tags = settings.tags.copy()
        tags.update(self._get_internal_tags())

tests/integration/integrations/mlflow/experiment_trackers/test_mlflow_experiment_tracker.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,11 @@
 import os
 from contextlib import ExitStack as does_not_raise
 from datetime import datetime
+from unittest.mock import MagicMock, patch
 from uuid import uuid4

 import pytest
+from mlflow.exceptions import MlflowException
 from pydantic import ValidationError

 from zenml.enums import StackComponentType
@@ -245,3 +247,69 @@ def test_mlflow_experiment_tracker_set_config(local_stack: Stack) -> None:
245247
assert os.environ[DATABRICKS_PASSWORD] == "password"
246248
assert os.environ[DATABRICKS_TOKEN] == "token1234"
247249
assert os.environ[DATABRICKS_HOST] == "https://databricks.com"
250+
251+
252+
@patch("mlflow.start_run")
253+
@patch("mlflow.get_run")
254+
@patch("mlflow.get_experiment_by_name")
255+
@patch("mlflow.set_experiment")
256+
def test_mlflow_experiment_tracker_handles_missing_run(
257+
mock_set_experiment: MagicMock,
258+
mock_get_experiment: MagicMock,
259+
mock_get_run: MagicMock,
260+
mock_start_run: MagicMock,
261+
) -> None:
262+
"""Tests that the MLflow experiment tracker handles missing runs gracefully.
263+
264+
This test verifies the fix for issue #4207 where MLflow would crash
265+
when trying to resume a run that doesn't exist on the server.
266+
"""
267+
# Setup mocks
268+
mock_experiment = MagicMock()
269+
mock_experiment.experiment_id = "test_experiment_id"
270+
mock_get_experiment.return_value = mock_experiment
271+
272+
# Simulate a run that doesn't exist on the MLflow server
273+
mock_get_run.side_effect = MlflowException("RESOURCE_DOES_NOT_EXIST")
274+
275+
# Create experiment tracker
276+
tracker = MLFlowExperimentTracker(
277+
name="test_tracker",
278+
id=uuid4(),
279+
config=MLFlowExperimentTrackerConfig(
280+
tracking_uri="file:///tmp/mlflow",
281+
),
282+
flavor="mlflow",
283+
type=StackComponentType.EXPERIMENT_TRACKER,
284+
user=uuid4(),
285+
created=datetime.now(),
286+
updated=datetime.now(),
287+
)
288+
289+
# Create a mock StepRunInfo
290+
mock_step_info = MagicMock()
291+
mock_step_info.pipeline.name = "test_pipeline"
292+
mock_step_info.run_name = "test_run"
293+
mock_step_info.pipeline_step_name = "test_step"
294+
295+
# Mock get_run_id to return a stale run_id
296+
with patch.object(tracker, "get_run_id", return_value="stale_run_id"):
297+
with patch.object(
298+
tracker,
299+
"get_settings",
300+
return_value=MagicMock(
301+
experiment_name=None,
302+
tags={},
303+
nested=False,
304+
),
305+
):
306+
# This should not raise an exception, even though the run doesn't exist
307+
tracker.prepare_step_run(mock_step_info)
308+
309+
# Verify that start_run was called with run_id=None (creating a new run)
310+
mock_start_run.assert_called_once()
311+
call_kwargs = mock_start_run.call_args[1]
312+
assert call_kwargs["run_id"] is None, (
313+
"Expected run_id to be None when run doesn't exist"
314+
)
315+
assert call_kwargs["run_name"] == "test_run"

0 commit comments

Comments (0)