exospherehost · NiveditJain · Sep 16, 2025 · Sep 16, 2025 · Sep 16, 2025 · Sep 16, 2025
diff --git a/state-manager/.coverage b/state-manager/.coverage
diff --git a/state-manager/app/controller/manual_retry_state.py b/state-manager/app/controller/manual_retry_state.py
@@ -0,0 +1,49 @@
+from pymongo.errors import DuplicateKeyError
+from app.models.manual_retry import ManualRetryRequestModel, ManualRetryResponseModel
+from beanie import PydanticObjectId
+from app.singletons.logs_manager import LogsManager
+from app.models.state_status_enum import StateStatusEnum
+from fastapi import HTTPException, status
+from app.models.db.state import State
+
+
+logger = LogsManager().get_logger()
+
+async def manual_retry_state(namespace_name: str, state_id: PydanticObjectId, body: ManualRetryRequestModel, x_exosphere_request_id: str):
+    try:
+        logger.info(f"Manual retry state {state_id} for namespace {namespace_name}", x_exosphere_request_id=x_exosphere_request_id)
+
+        state = await State.find_one(State.id == state_id, State.namespace_name == namespace_name)
+        if not state:
+            raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="State not found")
+
+        try:
+            retry_state = State(
+                node_name=state.node_name,
+                namespace_name=state.namespace_name,
+                identifier=state.identifier,
+                graph_name=state.graph_name,
+                run_id=state.run_id,
+                status=StateStatusEnum.CREATED,
+                inputs=state.inputs,
+                outputs={},
+                error=None,
+                parents=state.parents,
+                does_unites=state.does_unites,
+                fanout_id=body.fanout_id # this will ensure that multiple unwanted retries are not formed because of index in database
+            )
-            retry_state = State(
-                node_name=state.node_name,
-                namespace_name=state.namespace_name,
-                identifier=state.identifier,
-                graph_name=state.graph_name,
-                run_id=state.run_id,
-                status=StateStatusEnum.CREATED,
-                inputs=state.inputs,
-                outputs={},
-                error=None,
-                parents=state.parents,
-                does_unites=state.does_unites,
-                fanout_id=body.fanout_id # this will ensure that multiple unwanted retries are not formed because of index in database
-            )
+            retry_state = State(
+                node_name=state.node_name,
+                namespace_name=state.namespace_name,
+                identifier=state.identifier,
+                graph_name=state.graph_name,
+                run_id=state.run_id,
+                status=StateStatusEnum.CREATED,
+                inputs=state.inputs,
+                outputs={},
+                error=None,
+                parents=state.parents,
+                does_unites=state.does_unites,
+                retry_count=(getattr(state, "retry_count", 0) + 1),
+                fanout_id=body.fanout_id # this will ensure that multiple unwanted retries are not formed because of index in database
+            )
-            retry_state = State(
-                node_name=state.node_name,
-                namespace_name=state.namespace_name,
-                identifier=state.identifier,
-                graph_name=state.graph_name,
-                run_id=state.run_id,
-                status=StateStatusEnum.CREATED,
-                inputs=state.inputs,
-                outputs={},
-                error=None,
-                parents=state.parents,
-                does_unites=state.does_unites,
-                fanout_id=body.fanout_id # this will ensure that multiple unwanted retries are not formed because of index in database
-            )
+            retry_state = State(
+                node_name=state.node_name,
+                namespace_name=state.namespace_name,
+                identifier=state.identifier,
+                graph_name=state.graph_name,
+                run_id=state.run_id,
+                status=StateStatusEnum.CREATED,
+                inputs=state.inputs,
+                outputs={},
+                error=None,
+                parents=state.parents,
+                does_unites=state.does_unites,
+                retry_count=(getattr(state, "retry_count", 0) + 1),
+                fanout_id=body.fanout_id # this will ensure that multiple unwanted retries are not formed because of index in database
+            )
+            retry_state = await retry_state.insert()
+            logger.info(f"Retry state {retry_state.id} created for state {state_id}", x_exosphere_request_id=x_exosphere_request_id)
+
+            state.status = StateStatusEnum.RETRY_CREATED
+            await state.save()
+
+            return ManualRetryResponseModel(id=str(retry_state.id), status=retry_state.status)
+        except DuplicateKeyError:
+            logger.info(f"Duplicate retry state detected for state {state_id}. A retry state with the same unique key already exists.", x_exosphere_request_id=x_exosphere_request_id)
+            raise HTTPException(status_code=status.HTTP_409_CONFLICT, detail="Duplicate retry state detected")
+
+
+    except Exception as _:
+        logger.error(f"Error manual retry state {state_id} for namespace {namespace_name}", x_exosphere_request_id=x_exosphere_request_id)
+        raise
-    except Exception as _:
-        logger.error(f"Error manual retry state {state_id} for namespace {namespace_name}", x_exosphere_request_id=x_exosphere_request_id)
-        raise
+    except HTTPException:
+        # Re-raise HTTPException unchanged: the 404/409 raised in this function are expected outcomes, so they are not logged at error severity
+        raise
+    except Exception:
+        logger.exception(
+            f"Error creating manual retry state {state_id} for namespace {namespace_name}",
+            x_exosphere_request_id=x_exosphere_request_id,
+        )
+        raise
-    except Exception as _:
-        logger.error(f"Error manual retry state {state_id} for namespace {namespace_name}", x_exosphere_request_id=x_exosphere_request_id)
-        raise
+    except HTTPException:
+        # Re-raise HTTPException unchanged: the 404/409 raised in this function are expected outcomes, so they are not logged at error severity
+        raise
+    except Exception:
+        logger.exception(
+            f"Error creating manual retry state {state_id} for namespace {namespace_name}",
+            x_exosphere_request_id=x_exosphere_request_id,
+        )
+        raise
diff --git a/state-manager/app/models/manual_retry.py b/state-manager/app/models/manual_retry.py
@@ -0,0 +1,11 @@
+from pydantic import BaseModel, Field
+from .state_status_enum import StateStatusEnum
+
+
+class ManualRetryRequestModel(BaseModel):
+    fanout_id: str = Field(..., description="Fanout ID of the state")
+
+
+class ManualRetryResponseModel(BaseModel):
+    id: str = Field(..., description="ID of the state")
+    status: StateStatusEnum = Field(..., description="Status of the state")
diff --git a/state-manager/app/routes.py b/state-manager/app/routes.py
@@ -50,6 +50,10 @@
 from .models.signal_models import ReEnqueueAfterRequestModel
 from .controller.re_queue_after_signal import re_queue_after_signal
 
+# manual_retry
+from .models.manual_retry import ManualRetryRequestModel, ManualRetryResponseModel
+from .controller.manual_retry_state import manual_retry_state
+
 
 logger = LogsManager().get_logger()
 
@@ -176,6 +180,24 @@ async def re_enqueue_after_state_route(namespace_name: str, state_id: str, body:
 
     return await re_queue_after_signal(namespace_name, PydanticObjectId(state_id), body, x_exosphere_request_id)
 
+@router.post(
+    "/state/{state_id}/manual-retry",
+    response_model=ManualRetryResponseModel,
+    status_code=status.HTTP_200_OK,
+    response_description="State manual retry successfully",
+    tags=["state"]
+)
+async def manual_retry_state_route(namespace_name: str, state_id: str, body: ManualRetryRequestModel, request: Request, api_key: str = Depends(check_api_key)):
+    x_exosphere_request_id = getattr(request.state, "x_exosphere_request_id", str(uuid4()))
+
+    if api_key:
+        logger.info(f"API key is valid for namespace {namespace_name}", x_exosphere_request_id=x_exosphere_request_id)
+    else:
+        logger.error(f"API key is invalid for namespace {namespace_name}", x_exosphere_request_id=x_exosphere_request_id)
+        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid API key")
+
+    return await manual_retry_state(namespace_name, PydanticObjectId(state_id), body, x_exosphere_request_id)
+
 
 @router.put(
     "/graph/{graph_name}",

diff --git a/state-manager/tests/README.md b/state-manager/tests/README.md
@@ -14,6 +14,7 @@ tests/
 │       ├── test_errored_state.py
 │       ├── test_get_graph_template.py
 │       ├── test_get_secrets.py
+│       ├── test_manual_retry_state.py
 │       ├── test_register_nodes.py
 │       └── test_upsert_graph_template.py
 └── README.md
@@ -80,7 +81,21 @@ The unit tests cover all controller functions in the state-manager:
 - ✅ Complex schema handling
 - ✅ Database error handling
 
-### 8. `upsert_graph_template.py`
+### 8. `manual_retry_state.py`
+- ✅ Successful manual retry state creation
+- ✅ State not found scenarios
+- ✅ Duplicate retry state detection (DuplicateKeyError)
+- ✅ Different fanout_id handling
+- ✅ Complex inputs and multiple parents preservation
+- ✅ Database errors during state lookup
+- ✅ Database errors during state save
+- ✅ Database errors during retry state insert
+- ✅ Empty inputs and parents handling
+- ✅ Namespace mismatch scenarios
+- ✅ Field preservation and reset logic
+- ✅ Logging verification
+
-### 8. `manual_retry_state.py`
- ✅ Successful manual retry state creation
- ✅ State not found scenarios
- ✅ Duplicate retry state detection (DuplicateKeyError)
- ✅ Different fanout_id handling
- ✅ Complex inputs and multiple parents preservation
- ✅ Database errors during state lookup
- ✅ Database errors during state save
- ✅ Database errors during retry state insert
- ✅ Empty inputs and parents handling
- ✅ Namespace mismatch scenarios
- ✅ Field preservation and reset logic
- ✅ Logging verification
+
+### 8. `manual_retry_state.py`
+
+- ✅ Successful manual retry state creation
+- ✅ State not found scenarios
+- ✅ Duplicate retry state detection (DuplicateKeyError)
+- ✅ Different fanout_id handling
+- ✅ Complex inputs and multiple parents preservation
+- ✅ Database errors during state lookup
+- ✅ Database errors during state save
+- ✅ Database errors during retry state insert
+- ✅ Empty inputs and parents handling
+- ✅ Namespace mismatch scenarios
+- ✅ Field preservation and reset logic
+- ✅ Logging verification
+
-### 8. `manual_retry_state.py`
- ✅ Successful manual retry state creation
- ✅ State not found scenarios
- ✅ Duplicate retry state detection (DuplicateKeyError)
- ✅ Different fanout_id handling
- ✅ Complex inputs and multiple parents preservation
- ✅ Database errors during state lookup
- ✅ Database errors during state save
- ✅ Database errors during retry state insert
- ✅ Empty inputs and parents handling
- ✅ Namespace mismatch scenarios
- ✅ Field preservation and reset logic
- ✅ Logging verification
+
+### 8. `manual_retry_state.py`
+
+- ✅ Successful manual retry state creation
+- ✅ State not found scenarios
+- ✅ Duplicate retry state detection (DuplicateKeyError)
+- ✅ Different fanout_id handling
+- ✅ Complex inputs and multiple parents preservation
+- ✅ Database errors during state lookup
+- ✅ Database errors during state save
+- ✅ Database errors during retry state insert
+- ✅ Empty inputs and parents handling
+- ✅ Namespace mismatch scenarios
+- ✅ Field preservation and reset logic
+- ✅ Logging verification
+
+### 9. `upsert_graph_template.py`
-### 9. `upsert_graph_template.py`
+
+### 9. `upsert_graph_template.py`
-### 9. `upsert_graph_template.py`
+
+### 9. `upsert_graph_template.py`
 - ✅ Existing template updates
 - ✅ New template creation
 - ✅ Empty nodes handling