diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 973e0ad430..f82cb18ee7 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -53,8 +53,10 @@ from crawlee.errors import ( ContextPipelineInitializationError, ContextPipelineInterruptedError, + CriticalError, HttpClientStatusCodeError, HttpStatusCodeError, + NonRetryableError, RequestCollisionError, RequestHandlerError, SessionError, @@ -961,6 +963,9 @@ def _should_retry_request(self, context: BasicCrawlingContext, error: Exception) if context.request.no_retry: return False + if isinstance(error, NonRetryableError): + return False + # Do not retry on client errors. if isinstance(error, HttpClientStatusCodeError): return False @@ -1470,6 +1475,13 @@ async def __run_task_function(self) -> None: 'RequestHandlerError[TCrawlingContext]', primary_error ) # valid thanks to ContextPipeline + if isinstance(primary_error.wrapped_exception, CriticalError): + self._logger.critical( + 'A CriticalError occurred in the user-defined request handler. Crawling will be terminated.', + exc_info=primary_error.wrapped_exception, + ) + raise primary_error.wrapped_exception + self._logger.debug( 'An exception occurred in the user-defined request handler', exc_info=primary_error.wrapped_exception, diff --git a/src/crawlee/errors.py b/src/crawlee/errors.py index 539bcf7711..7199f00a4e 100644 --- a/src/crawlee/errors.py +++ b/src/crawlee/errors.py @@ -11,8 +11,10 @@ 'ContextPipelineFinalizationError', 'ContextPipelineInitializationError', 'ContextPipelineInterruptedError', + 'CriticalError', 'HttpClientStatusCodeError', 'HttpStatusCodeError', + 'NonRetryableError', 'ProxyError', 'RequestCollisionError', 'RequestHandlerError', @@ -33,6 +35,16 @@ class UserHandlerTimeoutError(UserDefinedErrorHandlerError): """Raised when a router fails due to user raised timeout. 
This is different from user-defined handler timing out.""" +@docs_group('Errors') +class CriticalError(Exception): + """Raised for severe errors where the crawl should be immediately aborted or gracefully shut down.""" + + +@docs_group('Errors') +class NonRetryableError(Exception): + """Raised when a request failed and it is known that retrying will not resolve the issue.""" + + @docs_group('Errors') class SessionError(Exception): """Errors of `SessionError` type will trigger a session rotation. diff --git a/tests/stress_test_fixes.py b/tests/stress_test_fixes.py new file mode 100644 index 0000000000..73eb7db5fe --- /dev/null +++ b/tests/stress_test_fixes.py @@ -0,0 +1,52 @@ +from __future__ import annotations + +import asyncio + +import pytest + +from typing import Any + +from crawlee import Request +from crawlee.crawlers import BasicCrawler +from crawlee.errors import CriticalError, NonRetryableError + + +async def test_non_retryable_error_not_retried() -> None: + """Stress test: Ensure NonRetryableError prevents subsequent retries instantly.""" + runs = 0 + + async def _handler(context: Any) -> None: + nonlocal runs + runs += 1 + raise NonRetryableError("This request should not be retried under any circumstances.") + + crawler = BasicCrawler( + request_handler=_handler, + max_request_retries=5, + ) + + await crawler.run(['http://tests.crawlee.com/non-retryable']) + + # The crawler should process the URL exactly once, ignoring max_request_retries. + assert runs == 1, f"Expected 1 run, but handler was executed {runs} times." 
+
+
+async def test_critical_error_aborts_crawler() -> None:
+    """Ensure that a CriticalError raised in the request handler aborts the whole crawler run immediately."""
+    runs = 0
+
+    async def _handler(context: Any) -> None:
+        nonlocal runs
+        runs += 1
+        raise CriticalError("System-level critical failure simulation.")
+
+    crawler = BasicCrawler(
+        request_handler=_handler,
+        max_request_retries=3,
+    )
+
+    # The CriticalError should escape the retry logic and surface directly from `crawler.run`, aborting the crawl.
+    with pytest.raises(CriticalError, match="System-level critical failure simulation."):
+        await crawler.run(['http://tests.crawlee.com/critical'])
+
+    assert runs == 1, f"Expected crawler to abort instantly, but ran {runs} times."