Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,10 @@
from crawlee.errors import (
ContextPipelineInitializationError,
ContextPipelineInterruptedError,
CriticalError,
HttpClientStatusCodeError,
HttpStatusCodeError,
NonRetryableError,
RequestCollisionError,
RequestHandlerError,
SessionError,
Expand Down Expand Up @@ -961,6 +963,9 @@ def _should_retry_request(self, context: BasicCrawlingContext, error: Exception)
if context.request.no_retry:
return False

if isinstance(error, NonRetryableError):
return False

# Do not retry on client errors.
if isinstance(error, HttpClientStatusCodeError):
return False
Expand Down Expand Up @@ -1470,6 +1475,13 @@ async def __run_task_function(self) -> None:
'RequestHandlerError[TCrawlingContext]', primary_error
) # valid thanks to ContextPipeline

if isinstance(primary_error.wrapped_exception, CriticalError):
self._logger.critical(
'A CriticalError occurred in the user-defined request handler. Crawling will be terminated.',
exc_info=primary_error.wrapped_exception,
)
raise primary_error.wrapped_exception

self._logger.debug(
'An exception occurred in the user-defined request handler',
exc_info=primary_error.wrapped_exception,
Expand Down
12 changes: 12 additions & 0 deletions src/crawlee/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@
'ContextPipelineFinalizationError',
'ContextPipelineInitializationError',
'ContextPipelineInterruptedError',
'CriticalError',
'HttpClientStatusCodeError',
'HttpStatusCodeError',
'NonRetryableError',
'ProxyError',
'RequestCollisionError',
'RequestHandlerError',
Expand All @@ -33,6 +35,16 @@ class UserHandlerTimeoutError(UserDefinedErrorHandlerError):
"""Raised when a router fails due to user raised timeout. This is different from user-defined handler timing out."""


@docs_group('Errors')
class CriticalError(Exception):
    """Raised for severe errors where the crawl should be immediately aborted or gracefully shut down.

    When this error escapes a user-defined request handler, the crawler logs it at
    critical level and re-raises it, terminating the whole crawl instead of retrying
    the request.
    """


@docs_group('Errors')
class NonRetryableError(Exception):
    """Raised when a request failed and it is known that retrying will not resolve the issue.

    The crawler's retry logic treats this error as final: the request is not
    re-attempted, regardless of the configured `max_request_retries`.
    """


@docs_group('Errors')
class SessionError(Exception):
"""Errors of `SessionError` type will trigger a session rotation.
Expand Down
52 changes: 52 additions & 0 deletions tests/stress_test_fixes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from __future__ import annotations

import asyncio

import pytest

from typing import Any

from crawlee import Request
from crawlee.crawlers import BasicCrawler
from crawlee.errors import CriticalError, NonRetryableError


async def test_non_retryable_error_not_retried() -> None:
    """Verify that a handler raising NonRetryableError executes exactly once.

    Even though the crawler allows up to 5 retries, a NonRetryableError must
    short-circuit the retry logic so the request is never re-attempted.
    """
    invocations: list[int] = []

    async def _failing_handler(context: Any) -> None:
        # Record the attempt, then fail in a way that forbids any retry.
        invocations.append(1)
        raise NonRetryableError("This request should not be retried under any circumstances.")

    crawler = BasicCrawler(request_handler=_failing_handler, max_request_retries=5)

    await crawler.run(['http://tests.crawlee.com/non-retryable'])

    # The crawler should process the URL exactly once, ignoring max_request_retries.
    runs = len(invocations)
    assert runs == 1, f"Expected 1 run, but handler was executed {runs} times."


async def test_critical_error_aborts_crawler() -> None:
    """Verify that a CriticalError raised in the handler aborts the whole crawl.

    The error must surface out of `crawler.run()` after a single handler call,
    with no retries performed despite max_request_retries=3.
    """
    attempts: list[int] = []

    async def _critical_handler(context: Any) -> None:
        # Record the attempt, then simulate an unrecoverable system failure.
        attempts.append(1)
        raise CriticalError("System-level critical failure simulation.")

    crawler = BasicCrawler(request_handler=_critical_handler, max_request_retries=3)

    # A CriticalError should escape the internal loop and cause the run to fail by surfacing
    with pytest.raises(CriticalError, match="System-level critical failure simulation."):
        await crawler.run(['http://tests.crawlee.com/critical'])

    runs = len(attempts)
    assert runs == 1, f"Expected crawler to abort instantly, but ran {runs} times."