From 6021d30f003a3a81499e7aa69f81c13c458a96a0 Mon Sep 17 00:00:00 2001 From: Anas Khan <83116240+anxkhn@users.noreply.github.com> Date: Sun, 28 Jun 2026 23:14:15 +0530 Subject: [PATCH 1/2] fix: Ignore negative `Retry-After` delay-seconds in `parse_retry_after_header` The `delay-seconds` branch accepted a negative value (e.g. `Retry-After: -5`) and returned a negative `timedelta`, while the HTTP-date branch already rejects non-positive delays. RFC 7231 defines `delay-seconds` as a non-negative integer. A negative delta flows into `ThrottlingRequestManager.record_domain_delay`, setting `throttled_until` in the past, so `_is_domain_throttled` returns False and the server's HTTP 429 backoff is silently skipped. Guard the integer branch to ignore negative values (falling through to `None`), consistent with the HTTP-date branch. `0` ("retry immediately") and positive values are unchanged. --- src/crawlee/_utils/http.py | 6 +++++- tests/unit/test_throttling_request_manager.py | 10 ++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/crawlee/_utils/http.py b/src/crawlee/_utils/http.py index 64084343b4..c83e3a4037 100644 --- a/src/crawlee/_utils/http.py +++ b/src/crawlee/_utils/http.py @@ -25,7 +25,11 @@ def parse_retry_after_header(value: str | None) -> timedelta | None: return None try: - return timedelta(seconds=int(value)) + seconds = int(value) + # `delay-seconds` is a non-negative integer per RFC 7231; ignore malformed negative values, + # consistent with the HTTP-date branch below which also rejects non-positive delays. 
+ if seconds >= 0: + return timedelta(seconds=seconds) except ValueError: pass diff --git a/tests/unit/test_throttling_request_manager.py b/tests/unit/test_throttling_request_manager.py index 7277e7773a..0451297fff 100644 --- a/tests/unit/test_throttling_request_manager.py +++ b/tests/unit/test_throttling_request_manager.py @@ -543,6 +543,16 @@ def test_parse_retry_after_integer_seconds() -> None: assert result == timedelta(seconds=120) +def test_parse_retry_after_zero_seconds() -> None: + """A delay of `0` ("retry immediately") is valid and must yield a zero delta, not None.""" + assert parse_retry_after_header('0') == timedelta(0) + + +def test_parse_retry_after_negative_seconds() -> None: + """`delay-seconds` is non-negative per RFC 7231; a malformed negative value must be ignored.""" + assert parse_retry_after_header('-5') is None + + def test_parse_retry_after_invalid_value() -> None: assert parse_retry_after_header('not-a-date-or-number') is None From 553d47369769f3ebdd3e65f4e75998ab4d629d4b Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 30 Jun 2026 11:51:14 +0200 Subject: [PATCH 2/2] refactor: handle negative Retry-After explicitly via try/except/else --- src/crawlee/_utils/http.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/crawlee/_utils/http.py b/src/crawlee/_utils/http.py index c83e3a4037..4713f66dea 100644 --- a/src/crawlee/_utils/http.py +++ b/src/crawlee/_utils/http.py @@ -24,15 +24,20 @@ def parse_retry_after_header(value: str | None) -> timedelta | None: if not value: return None + # Numeric form: `delay-seconds`, a non-negative integer per RFC 7231 §7.1.3. try: seconds = int(value) - # `delay-seconds` is a non-negative integer per RFC 7231; ignore malformed negative values, - # consistent with the HTTP-date branch below which also rejects non-positive delays. 
- if seconds >= 0: - return timedelta(seconds=seconds) except ValueError: - pass - + pass # Not an integer, fall through to the HTTP-date form below. + else: + if seconds < 0: + # A negative delay is malformed. Reject it instead of returning a negative `timedelta`, which would + # push `throttled_until` into the past and silently disable the 429 back-off downstream. + logger.debug(f'Retry-After delay-seconds {value!r} is negative; ignoring.') + return None + return timedelta(seconds=seconds) + + # HTTP-date form, e.g. "Wed, 21 Oct 2015 07:28:00 GMT". try: retry_date = parsedate_to_datetime(value) # `parsedate_to_datetime` may return a naive datetime when the input has no timezone info.