diff --git a/src/crawlee/crawlers/_playwright/_playwright_http_client.py b/src/crawlee/crawlers/_playwright/_playwright_http_client.py index e522b6d63a..83cdf843ef 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_http_client.py +++ b/src/crawlee/crawlers/_playwright/_playwright_http_client.py @@ -7,6 +7,7 @@ from typing_extensions import override from crawlee._types import HttpHeaders +from crawlee._utils.urls import validate_http_url from crawlee.crawlers._playwright._types import PlaywrightHttpResponse from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse @@ -79,6 +80,8 @@ async def send_request( # TODO: Use `session` to restore all the fingerprint headers according to the `BrowserContext`, after resolved # https://github.com/apify/crawlee-python/issues/1055 + validate_http_url(url) + if isinstance(headers, dict) or headers is None: headers = HttpHeaders(headers or {}) diff --git a/src/crawlee/http_clients/_curl_impersonate.py b/src/crawlee/http_clients/_curl_impersonate.py index b22a807de3..a2a3e691d7 100644 --- a/src/crawlee/http_clients/_curl_impersonate.py +++ b/src/crawlee/http_clients/_curl_impersonate.py @@ -19,6 +19,7 @@ from crawlee._types import HttpHeaders, HttpMethod, HttpPayload from crawlee._utils.blocked import ROTATE_PROXY_ERRORS from crawlee._utils.docs import docs_group +from crawlee._utils.urls import validate_http_url from crawlee.errors import ProxyError from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse @@ -197,6 +198,8 @@ async def send_request( proxy_info: ProxyInfo | None = None, timeout: timedelta | None = None, ) -> HttpResponse: + validate_http_url(url) + if isinstance(headers, dict) or headers is None: headers = HttpHeaders(headers or {}) @@ -238,6 +241,8 @@ async def stream( proxy_info: ProxyInfo | None = None, timeout: timedelta | None = None, ) -> AsyncGenerator[HttpResponse]: + validate_http_url(url) + if isinstance(headers, dict) or headers is None: headers = HttpHeaders(headers or {}) diff --git a/src/crawlee/http_clients/_httpx.py b/src/crawlee/http_clients/_httpx.py index a1f79826c0..f2c2565125 100644 --- a/src/crawlee/http_clients/_httpx.py +++ b/src/crawlee/http_clients/_httpx.py @@ -11,6 +11,7 @@ from crawlee._types import HttpHeaders from crawlee._utils.blocked import ROTATE_PROXY_ERRORS from crawlee._utils.docs import docs_group +from crawlee._utils.urls import validate_http_url from crawlee.errors import ProxyError from crawlee.fingerprint_suite import HeaderGenerator from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse @@ -192,6 +193,8 @@ async def send_request( proxy_info: ProxyInfo | None = None, timeout: timedelta | None = None, ) -> HttpResponse: + validate_http_url(url) + client = self._get_client(proxy_info.url if proxy_info else None) http_request = self._build_request( @@ -228,6 +231,8 @@ async def stream( proxy_info: ProxyInfo | None = None, timeout: timedelta | None = None, ) -> AsyncGenerator[HttpResponse]: + validate_http_url(url) + client = self._get_client(proxy_info.url if proxy_info else None) http_request = self._build_request( diff --git a/src/crawlee/http_clients/_impit.py b/src/crawlee/http_clients/_impit.py index 7ff441d5d3..708a5c363e 100644 --- a/src/crawlee/http_clients/_impit.py +++ b/src/crawlee/http_clients/_impit.py @@ -13,6 +13,7 @@ from crawlee._types import HttpHeaders from crawlee._utils.blocked import ROTATE_PROXY_ERRORS from crawlee._utils.docs import docs_group +from crawlee._utils.urls import validate_http_url from crawlee.errors import ProxyError from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse @@ -163,6 +164,8 @@ async def send_request( proxy_info: ProxyInfo | None = None, timeout: timedelta | None = None, ) -> HttpResponse: + validate_http_url(url) + if isinstance(headers, dict) or headers is None: headers = HttpHeaders(headers or {}) @@ -198,6 +201,8 @@ async def stream( proxy_info: ProxyInfo | None = None, timeout: timedelta | None = None, ) -> AsyncGenerator[HttpResponse]: + validate_http_url(url) + client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None) try: diff --git a/tests/unit/_utils/test_urls.py b/tests/unit/_utils/test_urls.py index bd07cfb09e..659ef09226 100644 --- a/tests/unit/_utils/test_urls.py +++ b/tests/unit/_utils/test_urls.py @@ -39,6 +39,19 @@ def test_validate_http_url() -> None: valid_url = 'https://example.com' assert validate_http_url(valid_url) == valid_url - invalid_url = 'htp://invalid-url' + +@pytest.mark.parametrize( + 'invalid_url', + [ + 'htp://invalid-url', + 'gopher://127.0.0.1:6379/_PING', + 'file:///etc/passwd', + 'dict://127.0.0.1:11211/stat', + 'ftp://example.com/secret.txt', + 'javascript:alert(1)', + 'example.com/path', + ], +) +def test_validate_http_url_rejects_non_http_scheme(invalid_url: str) -> None: with pytest.raises(ValidationError): validate_http_url(invalid_url) diff --git a/tests/unit/http_clients/test_http_clients.py b/tests/unit/http_clients/test_http_clients.py index bbb13846ec..fb1ea43eca 100644 --- a/tests/unit/http_clients/test_http_clients.py +++ b/tests/unit/http_clients/test_http_clients.py @@ -5,6 +5,7 @@ import pytest from curl_cffi import CurlHttpVersion +from pydantic import ValidationError from crawlee import Request from crawlee.errors import ProxyError @@ -251,3 +252,14 @@ async def test_compressed_chunked_stream(http_client: HttpClient, server_url: UR content_body += chunk assert content_body == HELLO_WORLD * 1000 + + +async def test_send_request_rejects_non_http_scheme(http_client: HttpClient) -> None: + with pytest.raises(ValidationError): + await http_client.send_request('gopher://127.0.0.1:6379/_PING') + + +async def test_stream_rejects_non_http_scheme(http_client: HttpClient) -> None: + with pytest.raises(ValidationError): + async with http_client.stream('gopher://127.0.0.1:6379/_PING'): + pass