diff --git a/.github/workflows/publish-state-mangaer.yml b/.github/workflows/publish-state-mangaer.yml index 883fd9b8..b3926336 100644 --- a/.github/workflows/publish-state-mangaer.yml +++ b/.github/workflows/publish-state-mangaer.yml @@ -111,30 +111,3 @@ jobs: push: true tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} - - deploy-to-k8s: - needs: publish-image - runs-on: ubuntu-latest - - steps: - - name: Deploy to K8s - run: | - echo "${{ secrets.KUBE_CONFIG }}" | base64 -d > kubeconfig.yaml - export KUBECONFIG=$PWD/kubeconfig.yaml - kubectl get nodes - - echo "selected image: ${{ fromJson(needs.publish-image.outputs.json).tags[1] }}" - - kubectl set image deployment/exosphere-state-manager exosphere-state-manager=${{fromJson(needs.publish-image.outputs.json).tags[1]}} - - kubectl rollout status deployment/exosphere-state-manager - - status=$(kubectl rollout status deployment/exosphere-state-manager) - - echo "$status" - - if [[ "$status" != *"successfully rolled out"* ]]; then - kubectl rollout undo deployment/exosphere-state-manager - echo "❌ Deployment failed. Rolled back." >&2 - exit 1 - fi \ No newline at end of file diff --git a/docs/docs/exosphere/create-graph.md b/docs/docs/exosphere/create-graph.md index 11d20395..178debad 100644 --- a/docs/docs/exosphere/create-graph.md +++ b/docs/docs/exosphere/create-graph.md @@ -51,7 +51,13 @@ One can define a graph on Exosphere through a simple json config, which specifie }, "next_nodes": [] } - ] + ], + "retry_policy": { + "max_retries": 3, + "strategy": "EXPONENTIAL", + "backoff_factor": 2000, + "exponent": 2 + } } ``` @@ -126,6 +132,24 @@ Use the `${{ ... }}` syntax to map outputs from previous nodes: - **`${{ node_identifier.outputs.field_name }}`**: Maps output from a specific node - **`initial`**: Static value provided when the graph is triggered - **Direct values**: String values. In v1, numbers/booleans must be string-encoded (e.g., "42", "true"). + +### Retry Policy + +Graphs can include a retry policy to handle transient failures automatically. The retry policy is configured at the graph level and applies to all nodes within the graph. + +```json +{ + "retry_policy": { + "max_retries": 3, + "strategy": "EXPONENTIAL", + "backoff_factor": 2000, // milliseconds + "exponent": 2 + } +} +``` + +For detailed information about retry policies, including all available strategies and configuration options, see the [Retry Policy](retry-policy.md) documentation. + ## Creating Graph Templates The recommended way to create graph templates is using the Exosphere Python SDK, which provides a clean interface to the State Manager API. @@ -156,7 +180,13 @@ async def create_graph_template(): result = await state_manager.upsert_graph( graph_name="my-workflow", graph_nodes=graph_nodes, - secrets=secrets + secrets=secrets, + retry_policy={ + "max_retries": 3, + "strategy": "EXPONENTIAL", + "backoff_factor": 2000, + "exponent": 2 + } ) print("Graph template created successfully!") print(f"Validation status: {result['validation_status']}") @@ -268,7 +298,13 @@ The state manager validates your graph template: result = await state_manager.upsert_graph( graph_name="my-workflow", graph_nodes=updated_nodes, - secrets=updated_secrets + secrets=updated_secrets, + retry_policy={ + "max_retries": 3, + "strategy": "EXPONENTIAL", + "backoff_factor": 2000, + "exponent": 2 + } ) print("Graph template updated successfully!") print(f"Validation status: {result['validation_status']}") diff --git a/docs/docs/exosphere/retry-policy.md b/docs/docs/exosphere/retry-policy.md new file mode 100644 index 00000000..639306dc --- /dev/null +++ b/docs/docs/exosphere/retry-policy.md @@ -0,0 +1,397 @@ +# Retry Policy + +!!! beta "Beta Feature" + Retry Policy is currently available in beta. The API and functionality may change in future releases. + +The Retry Policy feature in Exosphere provides sophisticated retry mechanisms for handling transient failures in your workflow nodes. When a node execution fails, the retry policy automatically determines when and how to retry the execution based on configurable strategies. + +## Overview + +Retry policies are configured at the graph level and apply to all nodes within that graph. When a node fails with an error, the state manager automatically creates a retry state with a calculated delay before the next execution attempt. + +## Configuration + +Retry policies are defined in your graph template configuration: + +```json +{ + "secrets": { + "api_key": "your-api-key" + }, + "nodes": [ + { + "node_name": "MyNode", + "namespace": "MyProject", + "identifier": "my_node", + "inputs": { + "data": "initial" + }, + "next_nodes": [] + } + ], + "retry_policy": { + "max_retries": 3, + "strategy": "EXPONENTIAL", + "backoff_factor": 2000, + "exponent": 2, + "max_delay": 3600000 + } +} +``` + +## Parameters + +### max_retries + +- **Type**: `int` +- **Default**: `3` +- **Description**: The maximum number of retry attempts before giving up +- **Constraints**: Must be >= 0 + +### strategy + +- **Type**: `string` +- **Default**: `"EXPONENTIAL"` +- **Description**: The retry strategy to use for calculating delays +- **Options**: See [Retry Strategies](#retry-strategies) below + +### backoff_factor + +- **Type**: `int` (milliseconds) +- **Default**: `2000` (2 seconds) +- **Description**: The base delay factor in milliseconds +- **Constraints**: Must be > 0 + +### exponent + +- **Type**: `int` +- **Default**: `2` +- **Description**: The exponent used for exponential strategies +- **Constraints**: Must be > 0 + +### max_delay + +- **Type**: `int | null` (milliseconds) +- **Default**: `null` (no maximum delay) +- **Description**: The maximum delay in milliseconds that any retry attempt can have. When set, all calculated delays are capped at this value using the `_cap` function +- **Constraints**: Must be > 0 when not null +- **Example**: `3600000` (1 hour) would cap all delays to a maximum of 1 hour + +## Retry Strategies + +Exosphere supports three main categories of retry strategies, each with jitter variants to prevent thundering herd problems. + +### Exponential Strategies + +Exponential strategies increase the delay exponentially with each retry attempt. + +#### EXPONENTIAL + +Standard exponential backoff without jitter. + +**Formula**: `backoff_factor * (exponent ^ (retry_count - 1))` + +**Example**: + +- Retry 1: 2000ms (2 seconds) +- Retry 2: 4000ms (4 seconds) +- Retry 3: 8000ms (8 seconds) + +#### EXPONENTIAL_FULL_JITTER + +Exponential backoff with full jitter (random delay between 0 and calculated delay). + +**Formula**: `random(0, backoff_factor * (exponent ^ (retry_count - 1)))` + +*Note: `random(a, b)` denotes a uniform random draw over the inclusive range [a, b].* + +**Example**: + +- Retry 1: 0-2000ms (random) +- Retry 2: 0-4000ms (random) +- Retry 3: 0-8000ms (random) + +#### EXPONENTIAL_EQUAL_JITTER + +Exponential backoff with equal jitter (random delay around half the calculated delay). + +**Formula**: `(backoff_factor * (exponent ^ (retry_count - 1))) / 2 + random(0, (backoff_factor * (exponent ^ (retry_count - 1))) / 2)` + +*Note: `random(a, b)` denotes a uniform random draw over the inclusive range [a, b].* + +**Example**: + +- Retry 1: 1000-2000ms (random) +- Retry 2: 2000-4000ms (random) +- Retry 3: 4000-8000ms (random) + +### Linear Strategies + +Linear strategies increase the delay linearly with each retry attempt. + +#### LINEAR + +Standard linear backoff without jitter. + +**Formula**: `backoff_factor * retry_count` + +**Example**: + +- Retry 1: 2000ms (2 seconds) +- Retry 2: 4000ms (4 seconds) +- Retry 3: 6000ms (6 seconds) + +#### LINEAR_FULL_JITTER + +Linear backoff with full jitter. + +**Formula**: `random(0, backoff_factor * retry_count)` + +*Note: `random(a, b)` denotes a uniform random draw over the inclusive range [a, b].* + +**Example**: + +- Retry 1: 0-2000ms (random) +- Retry 2: 0-4000ms (random) +- Retry 3: 0-6000ms (random) + +#### LINEAR_EQUAL_JITTER + +Linear backoff with equal jitter. + +**Formula**: `(backoff_factor * retry_count) / 2 + random(0, (backoff_factor * retry_count) / 2)` + +*Note: `random(a, b)` denotes a uniform random draw over the inclusive range [a, b].* + +**Example**: + +- Retry 1: 1000-2000ms (random) +- Retry 2: 2000-4000ms (random) +- Retry 3: 3000-6000ms (random) + +### Fixed Strategies + +Fixed strategies use a constant delay for all retry attempts. + +#### FIXED + +Standard fixed delay without jitter. + +**Formula**: `backoff_factor` + +**Example**: + +- Retry 1: 2000ms (2 seconds) +- Retry 2: 2000ms (2 seconds) +- Retry 3: 2000ms (2 seconds) + +#### FIXED_FULL_JITTER + +Fixed delay with full jitter. + +**Formula**: `random(0, backoff_factor)` + +*Note: `random(a, b)` denotes a uniform random draw over the inclusive range [a, b].* + +**Example**: + +- Retry 1: 0-2000ms (random) +- Retry 2: 0-2000ms (random) +- Retry 3: 0-2000ms (random) + +#### FIXED_EQUAL_JITTER + +Fixed delay with equal jitter. + +**Formula**: `backoff_factor / 2 + random(0, backoff_factor / 2)` + +*Note: `random(a, b)` denotes a uniform random draw over the inclusive range [a, b].* + +**Example**: + +- Retry 1: 1000-2000ms (random) +- Retry 2: 1000-2000ms (random) +- Retry 3: 1000-2000ms (random) + +## Delay Capping + +The retry policy includes a built-in delay capping mechanism through the `_cap` function and `max_delay` parameter. This ensures that retry delays never exceed a specified maximum value, even with aggressive exponential backoff strategies. + +### How Delay Capping Works + +The `_cap` function is applied to all calculated delays: + +```python +def _cap(value: int) -> int: + if self.max_delay is not None: + return min(value, self.max_delay) + return value +``` + +**Behavior:** + +- If `max_delay` is set, all calculated delays are capped at this value +- If `max_delay` is `null` (default), no capping is applied +- The capping is applied after all strategy calculations. + +### Example with Delay Capping + +Consider an exponential strategy with `backoff_factor: 2000`, `exponent: 2`, and `max_delay: 10000`: + +**With capping:** + +- Retry 1: 2000ms +- Retry 2: 4000ms +- Retry 3: 8000ms +- Retry 4: 10000ms (capped at max_delay) + +### When to Use Delay Capping + +- **Long-running workflows**: Prevent excessive delays that could impact overall workflow completion time +- **User-facing applications**: Ensure retries don't create unacceptable wait times +- **Resource management**: Control resource consumption by limiting retry delays +- **Predictable behavior**: Create more predictable retry patterns for monitoring and alerting + +## Usage Examples + +### Basic Exponential Retry + +```json +{ + "retry_policy": { + "max_retries": 3, + "strategy": "EXPONENTIAL", + "backoff_factor": 1000, + "exponent": 2 + } +} +``` + +### Aggressive Retry with Jitter + +```json +{ + "retry_policy": { + "max_retries": 5, + "strategy": "EXPONENTIAL_FULL_JITTER", + "backoff_factor": 500, + "exponent": 3 + } +} +``` + +### Conservative Linear Retry + +```json +{ + "retry_policy": { + "max_retries": 2, + "strategy": "LINEAR", + "backoff_factor": 5000 + } +} +``` + +### Fixed Retry for Rate Limiting + +```json +{ + "retry_policy": { + "max_retries": 10, + "strategy": "FIXED_EQUAL_JITTER", + "backoff_factor": 1000 + } +} +``` + +### Exponential Retry with Delay Capping + +```json +{ + "retry_policy": { + "max_retries": 5, + "strategy": "EXPONENTIAL", + "backoff_factor": 2000, + "exponent": 2, + "max_delay": 30000 + } +} +``` + +### Conservative Retry with Maximum Delay + +```json +{ + "retry_policy": { + "max_retries": 3, + "strategy": "EXPONENTIAL_FULL_JITTER", + "backoff_factor": 1000, + "exponent": 3, + "max_delay": 60000 + } +} +``` + +## When Retries Are Triggered + +Retries are automatically triggered when: + +1. A node execution fails with an error +2. The current retry count is less than `max_retries` +3. The state status is `QUEUED` + +The retry mechanism: + +- Creates a new state with `retry_count` incremented by 1 +- Sets `enqueue_after` to the current time plus the calculated delay +- Sets the original state status to `ERRORED` with the error message + +## Best Practices + +### Choose the Right Strategy + +- **EXPONENTIAL**: Best for most transient failures (network issues, temporary service unavailability) +- **LINEAR**: Good for predictable, consistent delays +- **FIXED**: Useful for rate limiting scenarios + +### Use Jitter for High Concurrency + +- **FULL_JITTER**: Best for high concurrency to prevent thundering herd +- **EQUAL_JITTER**: Good balance between predictability and randomization +- **No Jitter**: Use only when you need deterministic behavior + +### Set Appropriate Limits + +- **max_retries**: Consider the nature of your failures and downstream dependencies +- **backoff_factor**: Balance between responsiveness and resource usage +- **exponent**: Higher values create more aggressive backoff +- **max_delay**: Set a reasonable maximum delay to prevent excessive wait times, especially for exponential strategies + +### Monitor Retry Patterns + +- Track retry counts in your monitoring system +- Set up alerts for graphs with high retry rates +- Analyze retry patterns to identify systemic issues + +## Limitations + +- Retry policies apply to all nodes in a graph uniformly +- Individual node-level retry policies are not supported +- Retry delays are calculated in milliseconds +- Maximum delay can be capped using the `max_delay` parameter (recommended for long-running workflows) + +## Error Handling + +If a retry policy configuration is invalid: + +- The graph template validation will fail +- An error will be returned during graph creation +- The graph will not be saved until the configuration is corrected + +## Integration with Signals + +Retry policies work alongside Exosphere's signaling system: + +- Nodes can still raise `PruneSignal` to stop retries immediately +- Nodes can raise `ReQueueAfterSignal` to re-queue after some time. This will not mark nodes as failures. +- When a node is re-queued using `ReQueueAfterSignal`, the `retry_count` is not incremented. The existing count is carried over to the new state. diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 51df18d8..5ebcf23f 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -101,6 +101,7 @@ plugins: - exosphere/register-node.md - exosphere/create-runtime.md - exosphere/create-graph.md + - exosphere/retry-policy.md - exosphere/trigger-graph.md - exosphere/dashboard.md - exosphere/signals.md @@ -129,6 +130,7 @@ nav: - Register Node: exosphere/register-node.md - Create Runtime: exosphere/create-runtime.md - Create Graph: exosphere/create-graph.md + - Retry Policy: exosphere/retry-policy.md - Trigger Graph: exosphere/trigger-graph.md - Dashboard: exosphere/dashboard.md - Signals: exosphere/signals.md diff --git a/state-manager/app/controller/errored_state.py b/state-manager/app/controller/errored_state.py index b59e2573..f798cec8 100644 --- a/state-manager/app/controller/errored_state.py +++ b/state-manager/app/controller/errored_state.py @@ -1,10 +1,14 @@ +import time + from app.models.errored_models import ErroredRequestModel, ErroredResponseModel from fastapi import HTTPException, status from beanie import PydanticObjectId +from pymongo.errors import DuplicateKeyError from app.models.db.state import State from app.models.state_status_enum import StateStatusEnum from app.singletons.logs_manager import LogsManager +from app.models.db.graph_template_model import GraphTemplate logger = LogsManager().get_logger() @@ -23,11 +27,45 @@ async def errored_state(namespace_name: str, state_id: PydanticObjectId, body: E if state.status == StateStatusEnum.EXECUTED: raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="State is already executed") + try: + graph_template = await GraphTemplate.get(namespace_name, state.graph_name) + except Exception as e: + logger.error(f"Error getting graph template {state.graph_name} for namespace {namespace_name}", x_exosphere_request_id=x_exosphere_request_id, error=e) + if isinstance(e, ValueError) and "Graph template not found" in str(e): + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Graph template not found") + raise e + + retry_created = False + + if state.retry_count < graph_template.retry_policy.max_retries: + try: + retry_state = State( + node_name=state.node_name, + namespace_name=state.namespace_name, + identifier=state.identifier, + graph_name=state.graph_name, + run_id=state.run_id, + status=StateStatusEnum.CREATED, + inputs=state.inputs, + outputs={}, + error=None, + parents=state.parents, + does_unites=state.does_unites, + enqueue_after= int(time.time() * 1000) + graph_template.retry_policy.compute_delay(state.retry_count + 1), + retry_count=state.retry_count + 1, + fanout_id=state.fanout_id + ) + retry_state = await retry_state.insert() + logger.info(f"Retry state {retry_state.id} created for state {state_id}", x_exosphere_request_id=x_exosphere_request_id) + retry_created = True + except DuplicateKeyError: + logger.info(f"Duplicate retry state detected for state {state_id}. A retry state with the same unique key already exists.", x_exosphere_request_id=x_exosphere_request_id) + state.status = StateStatusEnum.ERRORED state.error = body.error await state.save() - return ErroredResponseModel(status=StateStatusEnum.ERRORED) + return ErroredResponseModel(status=StateStatusEnum.ERRORED, retry_created=retry_created) except Exception as e: logger.error(f"Error errored state {state_id} for namespace {namespace_name}", x_exosphere_request_id=x_exosphere_request_id, error=e) diff --git a/state-manager/app/controller/upsert_graph_template.py b/state-manager/app/controller/upsert_graph_template.py index 99a178ae..16882018 100644 --- a/state-manager/app/controller/upsert_graph_template.py +++ b/state-manager/app/controller/upsert_graph_template.py @@ -27,7 +27,8 @@ async def upsert_graph_template(namespace_name: str, graph_name: str, body: Upse Set({ GraphTemplate.nodes: body.nodes, # type: ignore GraphTemplate.validation_status: GraphTemplateValidationStatus.PENDING, # type: ignore - GraphTemplate.validation_errors: [] # type: ignore + GraphTemplate.validation_errors: [], # type: ignore + GraphTemplate.retry_policy: body.retry_policy # type: ignore }) ) @@ -44,7 +45,8 @@ async def upsert_graph_template(namespace_name: str, graph_name: str, body: Upse namespace=namespace_name, nodes=body.nodes, validation_status=GraphTemplateValidationStatus.PENDING, - validation_errors=[] + validation_errors=[], + retry_policy=body.retry_policy ).set_secrets(body.secrets) ) except ValueError as e: @@ -58,6 +60,7 @@ async def upsert_graph_template(namespace_name: str, graph_name: str, body: Upse validation_status=graph_template.validation_status, validation_errors=graph_template.validation_errors, secrets={secret_name: True for secret_name in graph_template.get_secrets().keys()}, + retry_policy=graph_template.retry_policy, created_at=graph_template.created_at, updated_at=graph_template.updated_at ) diff --git a/state-manager/app/models/db/graph_template_model.py b/state-manager/app/models/db/graph_template_model.py index 999d5389..4b3a4731 100644 --- a/state-manager/app/models/db/graph_template_model.py +++ b/state-manager/app/models/db/graph_template_model.py @@ -11,7 +11,7 @@ from ..node_template_model import NodeTemplate from app.utils.encrypter import get_encrypter from app.models.dependent_string import DependentString - +from app.models.retry_policy_model import RetryPolicyModel class GraphTemplate(BaseDatabaseModel): name: str = Field(..., description="Name of the graph") @@ -20,6 +20,7 @@ class GraphTemplate(BaseDatabaseModel): validation_status: GraphTemplateValidationStatus = Field(..., description="Validation status of the graph") validation_errors: List[str] = Field(default_factory=list, description="Validation errors of the graph") secrets: Dict[str, str] = Field(default_factory=dict, description="Secrets of the graph") + retry_policy: RetryPolicyModel = Field(default_factory=RetryPolicyModel, description="Retry policy of the graph") _node_by_identifier: Dict[str, NodeTemplate] | None = PrivateAttr(default=None) _parents_by_identifier: Dict[str, set[str]] | None = PrivateAttr(default=None) # type: ignore diff --git a/state-manager/app/models/db/state.py b/state-manager/app/models/db/state.py index 6ed350d2..05441ec3 100644 --- a/state-manager/app/models/db/state.py +++ b/state-manager/app/models/db/state.py @@ -8,6 +8,7 @@ import hashlib import json import time +import uuid class State(BaseDatabaseModel): node_name: str = Field(..., description="Name of the node of the state") @@ -24,7 +25,9 @@ class State(BaseDatabaseModel): does_unites: bool = Field(default=False, description="Whether this state unites other states") state_fingerprint: str = Field(default="", description="Fingerprint of the state") enqueue_after: int = Field(default_factory=lambda: int(time.time() * 1000), gt=0, description="Unix time in milliseconds after which the state should be enqueued") - + retry_count: int = Field(default=0, description="Number of times the state has been retried") + fanout_id: str = Field(default_factory=lambda: str(uuid.uuid4()), description="Fanout ID of the state") + @before_event([Insert, Replace, Save]) def _generate_fingerprint(self): if not self.does_unites: @@ -37,6 +40,7 @@ def _generate_fingerprint(self): "identifier": self.identifier, "graph_name": self.graph_name, "run_id": self.run_id, + "retry_count": self.retry_count, "parents": {k: str(v) for k, v in self.parents.items()}, } payload = json.dumps( @@ -76,5 +80,18 @@ class Settings: ("node_name", 1), ], name="enqueue_query" + ), + IndexModel( + [ + ("node_name", 1), + ("namespace_name", 1), + ("graph_name", 1), + ("identifier", 1), + ("run_id", 1), + ("retry_count", 1), + ("fanout_id", 1), + ], + unique=True, + name="uniq_fanout_retry" ) ] \ No newline at end of file diff --git a/state-manager/app/models/errored_models.py b/state-manager/app/models/errored_models.py index 5acfaa34..8814d56a 100644 --- a/state-manager/app/models/errored_models.py +++ b/state-manager/app/models/errored_models.py @@ -7,4 +7,5 @@ class ErroredRequestModel(BaseModel): class ErroredResponseModel(BaseModel): - status: StateStatusEnum = Field(..., description="Status of the state") \ No newline at end of file + status: StateStatusEnum = Field(..., description="Status of the state") + retry_created: bool = Field(default=False, description="Whether a retry state was created") \ No newline at end of file diff --git a/state-manager/app/models/graph_models.py b/state-manager/app/models/graph_models.py index 1fd80bd1..8e67cb2d 100644 --- a/state-manager/app/models/graph_models.py +++ b/state-manager/app/models/graph_models.py @@ -3,16 +3,19 @@ from typing import Dict, List, Optional from datetime import datetime from .graph_template_validation_status import GraphTemplateValidationStatus +from .retry_policy_model import RetryPolicyModel class UpsertGraphTemplateRequest(BaseModel): secrets: Dict[str, str] = Field(..., description="Dictionary of secrets that are used while graph execution") nodes: List[NodeTemplate] = Field(..., description="List of node templates that define the graph structure") + retry_policy: RetryPolicyModel = Field(default_factory=RetryPolicyModel, description="Retry policy of the graph") class UpsertGraphTemplateResponse(BaseModel): nodes: List[NodeTemplate] = Field(..., description="List of node templates that define the graph structure") secrets: Dict[str, bool] = Field(..., description="Dictionary of secrets that are used while graph execution") + retry_policy: RetryPolicyModel = Field(default_factory=RetryPolicyModel, description="Retry policy of the graph") created_at: datetime = Field(..., description="Timestamp when the graph template was created") updated_at: datetime = Field(..., description="Timestamp when the graph template was last updated") validation_status: GraphTemplateValidationStatus = Field(..., description="Current validation status of the graph template") diff --git a/state-manager/app/models/retry_policy_model.py b/state-manager/app/models/retry_policy_model.py new file mode 100644 index 00000000..be719176 --- /dev/null +++ b/state-manager/app/models/retry_policy_model.py @@ -0,0 +1,69 @@ +from pydantic import BaseModel, Field +from enum import Enum +import random + +class RetryStrategy(str, Enum): + EXPONENTIAL = "EXPONENTIAL" + EXPONENTIAL_FULL_JITTER = "EXPONENTIAL_FULL_JITTER" + EXPONENTIAL_EQUAL_JITTER = "EXPONENTIAL_EQUAL_JITTER" + + LINEAR = "LINEAR" + LINEAR_FULL_JITTER = "LINEAR_FULL_JITTER" + LINEAR_EQUAL_JITTER = "LINEAR_EQUAL_JITTER" + + FIXED = "FIXED" + FIXED_FULL_JITTER = "FIXED_FULL_JITTER" + FIXED_EQUAL_JITTER = "FIXED_EQUAL_JITTER" + +class RetryPolicyModel(BaseModel): + max_retries: int = Field(default=3, description="The maximum number of retries", ge=0) + strategy: RetryStrategy = Field(default=RetryStrategy.EXPONENTIAL, description="The method of retry") + backoff_factor: int = Field(default=2000, description="The backoff factor in milliseconds (default: 2000 = 2 seconds)", gt=0) + exponent: int = Field(default=2, description="The exponent for the exponential retry strategy", gt=0) + max_delay: int | None = Field(default=None, description="The maximum delay in milliseconds (no default limit when None)", gt=0) + + def compute_delay(self, retry_count: int) -> int: + + def _cap(value: int) -> int: + if self.max_delay is not None: + return min(value, self.max_delay) + return value + + if retry_count < 1: + raise ValueError(f"Retry count must be greater than or equal to 1, got {retry_count}") + + if self.strategy == RetryStrategy.EXPONENTIAL: + return _cap(self.backoff_factor * (self.exponent ** (retry_count - 1))) + + elif self.strategy == RetryStrategy.EXPONENTIAL_FULL_JITTER: + base = self.backoff_factor * (self.exponent ** (retry_count - 1)) + return _cap(int(random.uniform(0, base))) + + elif self.strategy == RetryStrategy.EXPONENTIAL_EQUAL_JITTER: + base = self.backoff_factor * (self.exponent ** (retry_count - 1)) + return _cap(int(base/2 + random.uniform(0, base / 2))) + + elif self.strategy == RetryStrategy.LINEAR: + return _cap(self.backoff_factor * retry_count) + + elif self.strategy == RetryStrategy.LINEAR_FULL_JITTER: + base = self.backoff_factor * retry_count + return _cap(int(random.uniform(0, base))) + + elif self.strategy == RetryStrategy.LINEAR_EQUAL_JITTER: + base = self.backoff_factor * retry_count + return _cap(int(base/2 + random.uniform(0, base / 2))) + + elif self.strategy == RetryStrategy.FIXED: + return _cap(self.backoff_factor) + + elif self.strategy == RetryStrategy.FIXED_FULL_JITTER: + base = self.backoff_factor + return _cap(int(random.uniform(0, base))) + + elif self.strategy == RetryStrategy.FIXED_EQUAL_JITTER: + base = self.backoff_factor + return _cap(int(base/2 + random.uniform(0, base / 2))) + + else: + raise ValueError(f"Invalid retry strategy: {self.strategy}") \ No newline at end of file diff --git a/state-manager/tests/unit/controller/test_errored_state.py b/state-manager/tests/unit/controller/test_errored_state.py index b1fc7df5..032571bf 100644 --- a/state-manager/tests/unit/controller/test_errored_state.py +++ b/state-manager/tests/unit/controller/test_errored_state.py @@ -2,6 +2,7 @@ from unittest.mock import AsyncMock, MagicMock, patch from fastapi import HTTPException, status from beanie import PydanticObjectId +from pymongo.errors import DuplicateKeyError from app.controller.errored_state import errored_state from app.models.errored_models import ErroredRequestModel @@ -34,6 +35,16 @@ def mock_state_queued(self): state = MagicMock() state.id = PydanticObjectId() state.status = StateStatusEnum.QUEUED + state.graph_name = "test_graph" + state.retry_count = 0 + state.node_name = "test_node" + state.namespace_name = "test_namespace" + state.identifier = "test_identifier" + state.run_id = "test_run_id" + state.inputs = {} + state.parents = [] + state.does_unites = False + state.fanout_id = None return state @pytest.fixture @@ -41,11 +52,23 @@ def mock_state_executed(self): state = MagicMock() state.id = PydanticObjectId() state.status = StateStatusEnum.EXECUTED + state.graph_name = "test_graph" + state.retry_count = 0 + state.node_name = "test_node" + state.namespace_name = "test_namespace" + state.identifier = "test_identifier" + state.run_id = "test_run_id" + state.inputs = {} + state.parents = [] + state.does_unites = False + state.fanout_id = None return state @patch('app.controller.errored_state.State') + @patch('app.controller.errored_state.GraphTemplate') async def test_errored_state_success_queued( self, + mock_graph_template_class, mock_state_class, mock_namespace, mock_state_id, @@ -55,10 +78,18 @@ async def test_errored_state_success_queued( ): """Test successful error marking of queued state""" - mock_state_queued.save = AsyncMock() + # Mock GraphTemplate.get to return a valid graph template + mock_graph_template = MagicMock() + mock_graph_template.retry_policy.max_retries = 3 + mock_graph_template.retry_policy.compute_delay = MagicMock(return_value=1000) + mock_graph_template_class.get = AsyncMock(return_value=mock_graph_template) - mock_state_queued.status = StateStatusEnum.QUEUED - mock_state_queued.save = AsyncMock() + # Mock State constructor and insert method + mock_retry_state = MagicMock() + mock_retry_state.insert = AsyncMock(return_value=mock_retry_state) + mock_state_class.return_value = mock_retry_state + + mock_state_queued.save = AsyncMock() mock_state_class.find_one = AsyncMock(return_value=mock_state_queued) # Act @@ -75,8 +106,10 @@ async def test_errored_state_success_queued( @patch('app.controller.errored_state.State') + @patch('app.controller.errored_state.GraphTemplate') async def test_errored_state_success_executed( self, + mock_graph_template_class, mock_state_class, mock_namespace, mock_state_id, @@ -84,26 +117,22 @@ async def test_errored_state_success_executed( mock_state_executed, mock_request_id ): - """Test successful error marking of executed state""" - - mock_state_executed.save = AsyncMock() - - mock_state_executed.status = StateStatusEnum.QUEUED - mock_state_executed.save = AsyncMock() + """Test that executed states cannot be marked as errored""" + # Arrange + mock_state_executed.status = StateStatusEnum.EXECUTED mock_state_class.find_one = AsyncMock(return_value=mock_state_executed) - # Act - result = await errored_state( - mock_namespace, - mock_state_id, - mock_errored_request, - mock_request_id - ) - - # Assert - assert result.status == StateStatusEnum.ERRORED - assert mock_state_class.find_one.call_count == 1 # Called once for finding + # Act & Assert + with pytest.raises(HTTPException) as exc_info: + await errored_state( + mock_namespace, + mock_state_id, + mock_errored_request, + mock_request_id + ) + assert exc_info.value.status_code == status.HTTP_400_BAD_REQUEST + assert exc_info.value.detail == "State is already executed" @patch('app.controller.errored_state.State') async def test_errored_state_not_found( @@ -197,6 +226,7 @@ async def test_errored_state_already_executed( # Arrange mock_state = MagicMock() mock_state.status = StateStatusEnum.EXECUTED + mock_state.graph_name = "test_graph" mock_state_class.find_one = AsyncMock(return_value=mock_state) # Act & Assert @@ -222,7 +252,7 @@ async def test_errored_state_database_error( ): """Test handling of database errors""" # Arrange - mock_state_class.find_one = MagicMock(side_effect=Exception("Database error")) + mock_state_class.find_one = AsyncMock(side_effect=Exception("Database error")) # Act & Assert with pytest.raises(Exception) as exc_info: @@ -236,31 +266,42 @@ async def test_errored_state_database_error( assert str(exc_info.value) == "Database error" @patch('app.controller.errored_state.State') + @patch('app.controller.errored_state.GraphTemplate') async def test_errored_state_with_different_error_message( self, + mock_graph_template_class, mock_state_class, mock_namespace, mock_state_id, + mock_errored_request, mock_state_queued, mock_request_id ): """Test error marking with different error message""" # Arrange - errored_request = ErroredRequestModel( + different_error_request = ErroredRequestModel( error="Different error message" - ) + ) + + # Mock GraphTemplate.get to return a valid graph template + mock_graph_template = MagicMock() + mock_graph_template.retry_policy.max_retries = 3 + mock_graph_template.retry_policy.compute_delay = MagicMock(return_value=1000) + mock_graph_template_class.get = AsyncMock(return_value=mock_graph_template) + + # Mock State constructor and insert method + mock_retry_state = MagicMock() + mock_retry_state.insert = AsyncMock(return_value=mock_retry_state) + mock_state_class.return_value = mock_retry_state mock_state_queued.save = AsyncMock() - - mock_state_queued.status = StateStatusEnum.QUEUED - mock_state_queued.set = AsyncMock() mock_state_class.find_one = AsyncMock(return_value=mock_state_queued) # Act result = await errored_state( mock_namespace, mock_state_id, - errored_request, + different_error_request, mock_request_id ) @@ -269,3 +310,183 @@ async def test_errored_state_with_different_error_message( assert mock_state_class.find_one.call_count == 1 # Called once for finding assert mock_state_queued.error == "Different error message" + @patch('app.controller.errored_state.State') + @patch('app.controller.errored_state.GraphTemplate') + async def test_errored_state_graph_template_not_found( + self, + mock_graph_template_class, + mock_state_class, + mock_namespace, + mock_state_id, + mock_errored_request, + mock_state_queued, + mock_request_id + ): + """Test when graph template is not found""" + # Arrange + mock_state_queued.save = AsyncMock() + mock_state_class.find_one = AsyncMock(return_value=mock_state_queued) + + # Mock GraphTemplate.get to raise ValueError with "Graph template not found" + mock_graph_template_class.get = AsyncMock(side_effect=ValueError("Graph template not found")) + + # Act & Assert + with pytest.raises(HTTPException) as exc_info: + await errored_state( + mock_namespace, + mock_state_id, + mock_errored_request, + mock_request_id + ) + + assert exc_info.value.status_code == status.HTTP_404_NOT_FOUND + assert exc_info.value.detail == "Graph template not found" + + @patch('app.controller.errored_state.State') + @patch('app.controller.errored_state.GraphTemplate') + async def test_errored_state_graph_template_other_error( + self, + mock_graph_template_class, + mock_state_class, + mock_namespace, + mock_state_id, + mock_errored_request, + mock_state_queued, + mock_request_id + ): + """Test when graph template raises other exceptions""" + # Arrange + mock_state_queued.save = AsyncMock() + mock_state_class.find_one = AsyncMock(return_value=mock_state_queued) + + # Mock GraphTemplate.get to raise a different exception + mock_graph_template_class.get = AsyncMock(side_effect=Exception("Database connection error")) + + # Act & Assert + with pytest.raises(Exception) as exc_info: + await errored_state( + mock_namespace, + mock_state_id, + mock_errored_request, + mock_request_id + ) + + assert str(exc_info.value) == "Database connection error" + + @patch('app.controller.errored_state.State') + @patch('app.controller.errored_state.GraphTemplate') + async def test_errored_state_duplicate_key_error( + self, + mock_graph_template_class, + mock_state_class, + mock_namespace, + mock_state_id, + mock_errored_request, + mock_state_queued, + mock_request_id + ): + """Test when creating retry state encounters DuplicateKeyError""" + # Arrange + mock_state_queued.save = AsyncMock() + mock_state_class.find_one = AsyncMock(return_value=mock_state_queued) + + # Mock GraphTemplate.get to return a valid graph template + mock_graph_template = MagicMock() + mock_graph_template.retry_policy.max_retries = 3 + mock_graph_template.retry_policy.compute_delay = MagicMock(return_value=1000) + mock_graph_template_class.get = AsyncMock(return_value=mock_graph_template) + + # Mock State constructor and insert method to raise DuplicateKeyError + mock_retry_state = MagicMock() + mock_retry_state.insert = AsyncMock(side_effect=DuplicateKeyError("Duplicate key error")) + mock_state_class.return_value = mock_retry_state + + # Act + result = await errored_state( + mock_namespace, + mock_state_id, + mock_errored_request, + mock_request_id + ) + + # Assert + assert result.status == StateStatusEnum.ERRORED + assert not result.retry_created + assert mock_state_queued.status == StateStatusEnum.ERRORED + assert mock_state_queued.error == mock_errored_request.error + + @patch('app.controller.errored_state.State') + @patch('app.controller.errored_state.GraphTemplate') + async def test_errored_state_max_retries_reached( + self, + mock_graph_template_class, + mock_state_class, + mock_namespace, + mock_state_id, + mock_errored_request, + mock_request_id + ): + """Test when state has reached max retries""" + # Arrange + mock_state = MagicMock() + mock_state.id = PydanticObjectId() + mock_state.status = StateStatusEnum.QUEUED + mock_state.graph_name = "test_graph" + mock_state.retry_count = 3 # Already at max retries + mock_state.node_name = "test_node" + mock_state.namespace_name = "test_namespace" + mock_state.identifier = "test_identifier" + mock_state.run_id = "test_run_id" + mock_state.inputs = {} + mock_state.parents = [] + mock_state.does_unites = False + mock_state.fanout_id = None + mock_state.save = AsyncMock() + + mock_state_class.find_one = AsyncMock(return_value=mock_state) + + # Mock GraphTemplate.get to return a valid graph template with max_retries = 3 + mock_graph_template = MagicMock() + mock_graph_template.retry_policy.max_retries = 3 + mock_graph_template_class.get = AsyncMock(return_value=mock_graph_template) + + # Act + result = await errored_state( + mock_namespace, + mock_state_id, + mock_errored_request, + mock_request_id + ) + + # Assert + assert result.status == StateStatusEnum.ERRORED + assert not result.retry_created + assert mock_state.status == StateStatusEnum.ERRORED + assert mock_state.error == mock_errored_request.error + # Verify that State constructor was not called (no retry created) + mock_state_class.assert_not_called() + + @patch('app.controller.errored_state.State') + async def test_errored_state_general_exception( + self, + mock_state_class, + mock_namespace, + mock_state_id, + mock_errored_request, + mock_request_id + ): + """Test handling of general exceptions in the main try-catch block""" + # Arrange + mock_state_class.find_one = AsyncMock(side_effect=Exception("Unexpected error")) + + # Act & Assert + with pytest.raises(Exception) as exc_info: + await errored_state( + mock_namespace, + mock_state_id, + mock_errored_request, + mock_request_id + ) + + assert str(exc_info.value) == "Unexpected error" + diff --git a/state-manager/tests/unit/controller/test_upsert_graph_template.py b/state-manager/tests/unit/controller/test_upsert_graph_template.py index 2d309376..00832b7f 100644 --- a/state-manager/tests/unit/controller/test_upsert_graph_template.py +++ b/state-manager/tests/unit/controller/test_upsert_graph_template.py @@ -7,6 +7,7 @@ from app.models.graph_models import UpsertGraphTemplateRequest from app.models.graph_template_validation_status import GraphTemplateValidationStatus from app.models.node_template_model import NodeTemplate +from app.models.retry_policy_model import RetryPolicyModel class TestUpsertGraphTemplate: @@ -74,6 +75,14 @@ def mock_existing_template(self, mock_nodes, mock_secrets): template.updated_at = datetime(2023, 1, 2, 12, 0, 0) template.get_secrets.return_value = mock_secrets template.set_secrets.return_value = template + + # Add proper retry_policy using real RetryPolicyModel + template.retry_policy = RetryPolicyModel( + max_retries=3, + backoff_factor=1000, + max_delay=30000 + ) + return template @patch('app.controller.upsert_graph_template.GraphTemplate') @@ -148,6 +157,14 @@ async def test_upsert_graph_template_create_new( mock_new_template.get_secrets.return_value = mock_upsert_request.secrets mock_new_template.set_secrets.return_value = mock_new_template + # Add proper retry_policy mock + mock_retry_policy = RetryPolicyModel( + max_retries=3, + backoff_factor=1000, + max_delay=30000 + ) + mock_new_template.retry_policy = mock_retry_policy + mock_graph_template_class.insert = AsyncMock(return_value=mock_new_template) # Act @@ -225,6 +242,14 @@ async def test_upsert_graph_template_with_empty_nodes( mock_existing_template.get_secrets.return_value = {} mock_existing_template.set_secrets.return_value = mock_existing_template + # Add proper retry_policy mock + mock_retry_policy = RetryPolicyModel( + max_retries=3, + backoff_factor=1000, + max_delay=30000 + ) + mock_existing_template.retry_policy = mock_retry_policy + mock_existing_template.update = AsyncMock() mock_graph_template_class.find_one = AsyncMock(return_value=mock_existing_template) @@ -269,6 +294,14 @@ async def test_upsert_graph_template_with_validation_errors( mock_existing_template.get_secrets.return_value = mock_upsert_request.secrets mock_existing_template.set_secrets.return_value = mock_existing_template + # Add proper retry_policy mock + mock_retry_policy = RetryPolicyModel( + max_retries=3, + backoff_factor=1000, + max_delay=30000 + ) + mock_existing_template.retry_policy = mock_retry_policy + mock_existing_template.update = AsyncMock() mock_graph_template_class.find_one = AsyncMock(return_value=mock_existing_template) diff --git a/state-manager/tests/unit/models/test_retry_policy_model.py b/state-manager/tests/unit/models/test_retry_policy_model.py new file mode 100644 index 00000000..038f3adc --- /dev/null +++ b/state-manager/tests/unit/models/test_retry_policy_model.py @@ -0,0 +1,377 @@ +import pytest +from app.models.retry_policy_model import RetryPolicyModel, RetryStrategy + + +class TestRetryPolicyModel: + """Test cases for RetryPolicyModel""" + + def test_default_initialization(self): + """Test RetryPolicyModel with default values""" + policy = RetryPolicyModel() + + assert policy.max_retries == 3 + assert policy.strategy == RetryStrategy.EXPONENTIAL + assert policy.backoff_factor == 2000 + assert policy.exponent == 2 + assert policy.max_delay is None + + def test_custom_initialization(self): + """Test RetryPolicyModel with custom values""" + policy = RetryPolicyModel( + max_retries=5, + strategy=RetryStrategy.LINEAR, + backoff_factor=1000, + exponent=3, + max_delay=10000 + ) + + assert policy.max_retries == 5 + assert policy.strategy == RetryStrategy.LINEAR + assert policy.backoff_factor == 1000 + assert policy.exponent == 3 + assert policy.max_delay == 10000 + + def test_exponential_strategy(self): + """Test exponential retry strategy""" + policy = RetryPolicyModel( + strategy=RetryStrategy.EXPONENTIAL, + backoff_factor=1000, + exponent=2 + ) + + # Test retry count 1 + delay = policy.compute_delay(1) + assert delay == 1000 # 1000 * 2^0 + + # Test retry count 2 + delay = policy.compute_delay(2) + assert delay == 2000 # 1000 * 2^1 + + # Test retry count 3 + delay = policy.compute_delay(3) + assert delay == 4000 # 1000 * 2^2 + + def test_exponential_strategy_with_max_delay(self): + """Test exponential retry strategy with max delay cap""" + policy = RetryPolicyModel( + strategy=RetryStrategy.EXPONENTIAL, + backoff_factor=1000, + exponent=2, + max_delay=3000 + ) + + # Test retry count 1 + delay = policy.compute_delay(1) + assert delay == 1000 # 1000 * 2^0 + + # Test retry count 2 + delay = policy.compute_delay(2) + assert delay == 2000 # 1000 * 2^1 + + # Test retry count 3 (should be capped at max_delay) + delay = policy.compute_delay(3) + assert delay == 3000 # Capped at max_delay + + def test_linear_strategy(self): + """Test linear retry strategy""" + policy = RetryPolicyModel( + strategy=RetryStrategy.LINEAR, + backoff_factor=1000 + ) + + # Test retry count 1 + delay = policy.compute_delay(1) + assert delay == 1000 # 1000 * 1 + + # Test retry count 2 + delay = policy.compute_delay(2) + assert delay == 2000 # 1000 * 2 + + # Test retry count 3 + delay = policy.compute_delay(3) + assert delay == 3000 # 1000 * 3 + + def test_fixed_strategy(self): + """Test fixed retry strategy""" + policy = RetryPolicyModel( + strategy=RetryStrategy.FIXED, + backoff_factor=1000 + ) + + # Test retry count 1 + delay = policy.compute_delay(1) + assert delay == 1000 # Always 1000 + + # Test retry count 2 + delay = policy.compute_delay(2) + assert delay == 1000 # Always 1000 + + # Test retry count 3 + delay = policy.compute_delay(3) + assert delay == 1000 # Always 1000 + + def test_exponential_full_jitter_strategy(self): + """Test exponential full jitter retry strategy""" + policy = RetryPolicyModel( + strategy=RetryStrategy.EXPONENTIAL_FULL_JITTER, + backoff_factor=1000, + exponent=2 + ) + + # Test retry count 1 + delay = policy.compute_delay(1) + assert 0 <= delay <= 1000 # Random between 0 and 1000 + + # Test retry count 2 + delay = policy.compute_delay(2) + assert 0 <= delay <= 2000 # Random between 0 and 2000 + + def test_exponential_equal_jitter_strategy(self): + """Test exponential equal jitter retry strategy""" + policy = RetryPolicyModel( + strategy=RetryStrategy.EXPONENTIAL_EQUAL_JITTER, + backoff_factor=1000, + exponent=2 + ) + + # Test retry count 1 + delay = policy.compute_delay(1) + assert 500 <= delay <= 1000 # Random between 500 and 1000 + + # Test retry count 2 + delay = policy.compute_delay(2) + assert 1000 <= delay <= 2000 # Random between 1000 and 2000 + + def test_linear_full_jitter_strategy(self): + """Test linear full jitter retry strategy""" + policy = RetryPolicyModel( + strategy=RetryStrategy.LINEAR_FULL_JITTER, + backoff_factor=1000 + ) + + # Test retry count 1 + delay = policy.compute_delay(1) + assert 0 <= delay <= 1000 # Random between 0 and 1000 + + # Test retry count 2 + delay = policy.compute_delay(2) + assert 0 <= delay <= 2000 # Random between 0 and 2000 + + def test_linear_equal_jitter_strategy(self): + """Test linear equal jitter retry strategy""" + policy = RetryPolicyModel( + strategy=RetryStrategy.LINEAR_EQUAL_JITTER, + backoff_factor=1000 + ) + + # Test retry count 1 + delay = policy.compute_delay(1) + assert 500 <= delay <= 1000 # Random between 500 and 1000 + + # Test retry count 2 + delay = policy.compute_delay(2) + assert 1000 <= delay <= 2000 # Random between 1000 and 2000 + + def test_fixed_full_jitter_strategy(self): + """Test fixed full jitter retry strategy""" + policy = RetryPolicyModel( + strategy=RetryStrategy.FIXED_FULL_JITTER, + backoff_factor=1000 + ) + + # Test retry count 1 + delay = policy.compute_delay(1) + assert 0 <= delay <= 1000 # Random between 0 and 1000 + + # Test retry count 2 + delay = policy.compute_delay(2) + assert 0 <= delay <= 1000 # Random between 0 and 1000 + + def test_fixed_equal_jitter_strategy(self): + """Test fixed equal jitter retry strategy""" + policy = RetryPolicyModel( + strategy=RetryStrategy.FIXED_EQUAL_JITTER, + backoff_factor=1000 + ) + + # Test retry count 1 + delay = policy.compute_delay(1) + assert 500 <= delay <= 1000 # Random between 500 and 1000 + + # Test retry count 2 + delay = policy.compute_delay(2) + assert 500 <= delay <= 1000 # Random between 500 and 1000 + + def test_invalid_retry_count(self): + """Test that invalid retry count raises ValueError""" + policy = RetryPolicyModel() + + # Test retry count 0 + with pytest.raises(ValueError, match="Retry count must be greater than or equal to 1"): + policy.compute_delay(0) + + # Test retry count -1 + with pytest.raises(ValueError, match="Retry count must be greater than or equal to 1"): + policy.compute_delay(-1) + + def test_max_delay_capping(self): + """Test that max_delay properly caps the delay""" + policy = RetryPolicyModel( + strategy=RetryStrategy.EXPONENTIAL, + backoff_factor=1000, + exponent=2, + max_delay=1500 + ) + + # Test retry count 1 + delay = policy.compute_delay(1) + assert delay == 1000 # Not capped + + # Test retry count 2 + delay = policy.compute_delay(2) + assert delay == 1500 # Capped at max_delay + + # Test retry count 3 + delay = policy.compute_delay(3) + assert delay == 1500 # Capped at max_delay + + def test_jitter_strategies_with_max_delay(self): + """Test jitter strategies with max delay capping""" + policy = RetryPolicyModel( + strategy=RetryStrategy.EXPONENTIAL_FULL_JITTER, + backoff_factor=1000, + exponent=2, + max_delay=1500 + ) + + # Test multiple calls to ensure max_delay is respected + for _ in range(10): + delay = policy.compute_delay(3) + assert delay <= 1500 # Should never exceed max_delay + + def test_different_exponents(self): + """Test different exponent values""" + policy = RetryPolicyModel( + strategy=RetryStrategy.EXPONENTIAL, + backoff_factor=1000, + exponent=3 + ) + + # Test retry count 1 + delay = policy.compute_delay(1) + assert delay == 1000 # 1000 * 3^0 + + # Test retry count 2 + delay = policy.compute_delay(2) + assert delay == 3000 # 1000 * 3^1 + + # Test retry count 3 + delay = policy.compute_delay(3) + assert delay == 9000 # 1000 * 3^2 + + def test_different_backoff_factors(self): + """Test different backoff factor values""" + policy = RetryPolicyModel( + strategy=RetryStrategy.LINEAR, + backoff_factor=500 + ) + + # Test retry count 1 + delay = policy.compute_delay(1) + assert delay == 500 # 500 * 1 + + # Test retry count 2 + delay = policy.compute_delay(2) + assert delay == 1000 # 500 * 2 + + def test_model_validation(self): + """Test Pydantic model validation""" + # Test valid model + RetryPolicyModel( + max_retries=5, + strategy=RetryStrategy.EXPONENTIAL, + backoff_factor=1000, + exponent=2, + max_delay=10000 + ) + + # Test invalid max_retries (negative) + with pytest.raises(ValueError): + RetryPolicyModel(max_retries=-1) + + # Test invalid backoff_factor (non-positive) + with pytest.raises(ValueError): + RetryPolicyModel(backoff_factor=0) + + # Test invalid exponent (non-positive) + with pytest.raises(ValueError): + RetryPolicyModel(exponent=0) + + # Test invalid max_delay (non-positive) + with pytest.raises(ValueError): + RetryPolicyModel(max_delay=0) + + def test_strategy_enum_values(self): + """Test all RetryStrategy enum values""" + strategies = [ + RetryStrategy.EXPONENTIAL, + RetryStrategy.EXPONENTIAL_FULL_JITTER, + RetryStrategy.EXPONENTIAL_EQUAL_JITTER, + RetryStrategy.LINEAR, + RetryStrategy.LINEAR_FULL_JITTER, + RetryStrategy.LINEAR_EQUAL_JITTER, + RetryStrategy.FIXED, + RetryStrategy.FIXED_FULL_JITTER, + RetryStrategy.FIXED_EQUAL_JITTER + ] + + for strategy in strategies: + policy = RetryPolicyModel(strategy=strategy) + assert policy.strategy == strategy + # Should not raise any exceptions + delay = policy.compute_delay(1) + assert isinstance(delay, int) + assert delay >= 0 + + def test_edge_case_large_numbers(self): + """Test edge cases with large numbers""" + policy = RetryPolicyModel( + strategy=RetryStrategy.EXPONENTIAL, + backoff_factor=1000000, + exponent=10 + ) + + # Test that large numbers don't cause overflow + delay = policy.compute_delay(3) + assert isinstance(delay, int) + assert delay > 0 + + def test_consistency_across_calls(self): + """Test that non-jitter strategies are consistent""" + policy = RetryPolicyModel( + strategy=RetryStrategy.EXPONENTIAL, + backoff_factor=1000, + exponent=2 + ) + + # Multiple calls should return the same result for non-jitter strategies + delay1 = policy.compute_delay(2) + delay2 = policy.compute_delay(2) + assert delay1 == delay2 + + def test_jitter_variability(self): + """Test that jitter strategies produce different results""" + policy = RetryPolicyModel( + strategy=RetryStrategy.EXPONENTIAL_FULL_JITTER, + backoff_factor=1000, + exponent=2 + ) + + # Multiple calls should return different results for jitter strategies + delays = set() + for _ in range(100): + delay = policy.compute_delay(2) + delays.add(delay) + + # Should have multiple different values (not all the same) + assert len(delays) > 1 \ No newline at end of file