Source code for feldera.rest.retry

from dataclasses import dataclass, field
from typing import FrozenSet


# HTTP statuses that trigger a retry when a `RetryConfig` is built without an
# explicit `retryable_status_codes` value.
_DEFAULT_RETRYABLE_STATUS_CODES: FrozenSet[int] = frozenset(
    (408, 429, 502, 503, 504)
)



[docs]
@dataclass(frozen=True)
class RetryConfig:
    """
    Configures retry behavior for the Feldera HTTP client.

    Retries are attempted on transient failures: connection/read timeouts and
    the HTTP statuses listed in `retryable_status_codes` (408, 429, 502, 503,
    504 by default).

    Wait strategies:
      - 408, 429, 503, 504 and connection/read timeouts use exponential
        backoff: `min(initial_backoff * (multiplier ** n), max_backoff)`,
        plus a uniform random `[0, jitter)` term, where `n` is the
        zero-based retry index.
      - 502 uses cluster-aware backoff: the client probes
        `/cluster_healthz`; if the cluster is healthy, the 502 is treated as
        spurious and the next retry runs immediately (wait = 0). If the
        cluster reports unhealthy (e.g. an upgrade is in progress), the next
        retry waits `unhealthy_backoff` seconds.
      - A server-supplied `Retry-After` header always overrides the computed
        wait (capped at `max_backoff`).

    Instances are immutable (frozen dataclass). All numeric fields are
    validated on construction, and `retryable_status_codes` is normalized to
    a `frozenset`, so any iterable of status codes may be passed.

    :param max_retries: Number of retries to attempt after the initial request.
        A value of `3` means up to `4` total attempts. Must be `>= 0`.
        Default: `3`.
    :param initial_backoff: Base wait in seconds before the first retry.
        Must be `>= 0`. Default: `2.0`.
    :param max_backoff: Maximum wait in seconds between retries. The computed
        exponential wait is clamped to this value. Must be `>= 0`.
        Default: `64.0`.
    :param multiplier: Exponential base applied to `initial_backoff` for each
        successive retry. Must be `> 0`. Default: `2.0`.
    :param jitter: Maximum random extra wait in seconds added to each
        exponential backoff (drawn uniformly from `[0, jitter)`). Helps avoid
        thundering-herd retries when many clients fail at once.
        Must be `>= 0`. Default: `0.0` (no jitter).
    :param unhealthy_backoff: Flat wait in seconds between 502 retries when
        the cluster reports unhealthy on `/cluster_healthz`. The cluster is
        likely upgrading/restarting, so a flat pause is preferable to an
        exponential ramp. Must be `>= 0`. Default: `90.0`.
    :param retryable_status_codes: HTTP status codes that should trigger a
        retry. Default: `{408, 429, 502, 503, 504}`.
    :raises ValueError: If `max_retries`, `initial_backoff`, `max_backoff`,
        `jitter` or `unhealthy_backoff` is negative or NaN, or if
        `multiplier` is zero, negative or NaN.
    """

    max_retries: int = 3
    initial_backoff: float = 2.0
    max_backoff: float = 64.0
    multiplier: float = 2.0
    jitter: float = 0.0
    unhealthy_backoff: float = 90.0
    # The factory is evaluated lazily, once per instance that omits the
    # argument; the shared default is an immutable frozenset, so handing out
    # the same object every time is safe.
    retryable_status_codes: FrozenSet[int] = field(
        default_factory=lambda: _DEFAULT_RETRYABLE_STATUS_CODES
    )

    def __post_init__(self) -> None:
        """
        Validate the numeric fields and normalize `retryable_status_codes`.

        :raises ValueError: If any numeric field is outside its documented
            range or is NaN.
        """
        # Each check is deliberately written as `not (x >= bound)` rather
        # than `x < bound`: every ordering comparison against NaN evaluates
        # to False, so only the negated form rejects NaN instead of letting
        # it slip through into the backoff arithmetic.
        if not (self.max_retries >= 0):
            raise ValueError("max_retries must be >= 0")
        if not (self.initial_backoff >= 0):
            raise ValueError("initial_backoff must be >= 0")
        if not (self.max_backoff >= 0):
            raise ValueError("max_backoff must be >= 0")
        if not (self.multiplier > 0):
            raise ValueError("multiplier must be > 0")
        if not (self.jitter >= 0):
            raise ValueError("jitter must be >= 0")
        if not (self.unhealthy_backoff >= 0):
            raise ValueError("unhealthy_backoff must be >= 0")
        # Coerce to frozenset so callers can pass a set/list without surprises,
        # and so equality comparisons against the default behave intuitively.
        # The dataclass is frozen, so plain attribute assignment would raise;
        # object.__setattr__ bypasses the frozen guard for this one-time
        # normalization.
        if not isinstance(self.retryable_status_codes, frozenset):
            object.__setattr__(
                self, "retryable_status_codes", frozenset(self.retryable_status_codes)
            )