3 changes: 3 additions & 0 deletions src/crawlee/crawlers/_playwright/_playwright_http_client.py
@@ -7,6 +7,7 @@
 from typing_extensions import override

 from crawlee._types import HttpHeaders
+from crawlee._utils.urls import validate_http_url
 from crawlee.crawlers._playwright._types import PlaywrightHttpResponse
 from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse

@@ -79,6 +80,8 @@ async def send_request(
         # TODO: Use `session` to restore all the fingerprint headers according to the `BrowserContext`, after resolved
         # https://github.com/apify/crawlee-python/issues/1055

+        validate_http_url(url)
+
         if isinstance(headers, dict) or headers is None:
             headers = HttpHeaders(headers or {})

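The guard added here, and repeated in every client below, delegates to `validate_http_url` from `crawlee._utils.urls`. The tests further down expect it to raise `pydantic.ValidationError` for non-http(s) URLs and to return valid URLs unchanged, so a minimal sketch consistent with that contract could look as follows (an illustration only; the actual helper in `crawlee._utils.urls` may differ):

from pydantic import AnyHttpUrl, TypeAdapter

# AnyHttpUrl only permits the http and https schemes, so validation fails
# for gopher://, file://, dict://, ftp://, javascript:, and scheme-less strings.
_http_url_adapter = TypeAdapter(AnyHttpUrl)


def validate_http_url(value: str | None) -> str | None:
    """Raise pydantic.ValidationError if `value` is not an absolute http(s) URL."""
    if value is not None:
        _http_url_adapter.validate_python(value)
    return value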
5 changes: 5 additions & 0 deletions src/crawlee/http_clients/_curl_impersonate.py
@@ -19,6 +19,7 @@
 from crawlee._types import HttpHeaders, HttpMethod, HttpPayload
 from crawlee._utils.blocked import ROTATE_PROXY_ERRORS
 from crawlee._utils.docs import docs_group
+from crawlee._utils.urls import validate_http_url
 from crawlee.errors import ProxyError
 from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse

@@ -197,6 +198,8 @@ async def send_request(
         proxy_info: ProxyInfo | None = None,
         timeout: timedelta | None = None,
     ) -> HttpResponse:
+        validate_http_url(url)
+
         if isinstance(headers, dict) or headers is None:
             headers = HttpHeaders(headers or {})

@@ -238,6 +241,8 @@ async def stream(
         proxy_info: ProxyInfo | None = None,
         timeout: timedelta | None = None,
     ) -> AsyncGenerator[HttpResponse]:
+        validate_http_url(url)
+
         if isinstance(headers, dict) or headers is None:
             headers = HttpHeaders(headers or {})

5 changes: 5 additions & 0 deletions src/crawlee/http_clients/_httpx.py
@@ -11,6 +11,7 @@
 from crawlee._types import HttpHeaders
 from crawlee._utils.blocked import ROTATE_PROXY_ERRORS
 from crawlee._utils.docs import docs_group
+from crawlee._utils.urls import validate_http_url
 from crawlee.errors import ProxyError
 from crawlee.fingerprint_suite import HeaderGenerator
 from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse
@@ -192,6 +193,8 @@ async def send_request(
         proxy_info: ProxyInfo | None = None,
         timeout: timedelta | None = None,
     ) -> HttpResponse:
+        validate_http_url(url)
+
         client = self._get_client(proxy_info.url if proxy_info else None)

         http_request = self._build_request(
@@ -228,6 +231,8 @@ async def stream(
         proxy_info: ProxyInfo | None = None,
         timeout: timedelta | None = None,
     ) -> AsyncGenerator[HttpResponse]:
+        validate_http_url(url)
+
         client = self._get_client(proxy_info.url if proxy_info else None)

         http_request = self._build_request(
5 changes: 5 additions & 0 deletions src/crawlee/http_clients/_impit.py
@@ -13,6 +13,7 @@
 from crawlee._types import HttpHeaders
 from crawlee._utils.blocked import ROTATE_PROXY_ERRORS
 from crawlee._utils.docs import docs_group
+from crawlee._utils.urls import validate_http_url
 from crawlee.errors import ProxyError
 from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse

@@ -163,6 +164,8 @@ async def send_request(
         proxy_info: ProxyInfo | None = None,
         timeout: timedelta | None = None,
     ) -> HttpResponse:
+        validate_http_url(url)
+
         if isinstance(headers, dict) or headers is None:
             headers = HttpHeaders(headers or {})

@@ -198,6 +201,8 @@ async def stream(
         proxy_info: ProxyInfo | None = None,
         timeout: timedelta | None = None,
     ) -> AsyncGenerator[HttpResponse]:
+        validate_http_url(url)
+
         client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None)

         try:
15 changes: 14 additions & 1 deletion tests/unit/_utils/test_urls.py
@@ -39,6 +39,19 @@ def test_validate_http_url() -> None:
     valid_url = 'https://example.com'
     assert validate_http_url(valid_url) == valid_url

-    invalid_url = 'htp://invalid-url'
+
+@pytest.mark.parametrize(
+    'invalid_url',
+    [
+        'htp://invalid-url',
+        'gopher://127.0.0.1:6379/_PING',
+        'file:///etc/passwd',
+        'dict://127.0.0.1:11211/stat',
+        'ftp://example.com/secret.txt',
+        'javascript:alert(1)',
+        'example.com/path',
+    ],
+)
+def test_validate_http_url_rejects_non_http_scheme(invalid_url: str) -> None:
     with pytest.raises(ValidationError):
         validate_http_url(invalid_url)
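Each rejected URL exercises a different failure mode: `htp://` is a typo'd scheme, `gopher://` and `dict://` are classic SSRF vectors for smuggling commands to services such as Redis or memcached, `file://` would read local files, `ftp://` and `javascript:` are simply not HTTP, and `example.com/path` lacks a scheme entirely. A quick illustration of the validator's contract, assuming the pydantic-backed behavior sketched earlier:

from pydantic import ValidationError

from crawlee._utils.urls import validate_http_url

# Valid URLs pass through unchanged (as asserted in the test above).
assert validate_http_url('https://example.com') == 'https://example.com'

try:
    validate_http_url('file:///etc/passwd')
except ValidationError:
    print('rejected: only http and https URLs are accepted')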
12 changes: 12 additions & 0 deletions tests/unit/http_clients/test_http_clients.py
@@ -5,6 +5,7 @@

 import pytest
 from curl_cffi import CurlHttpVersion
+from pydantic import ValidationError

 from crawlee import Request
 from crawlee.errors import ProxyError
@@ -251,3 +252,14 @@ async def test_compressed_chunked_stream(http_client: HttpClient, server_url: URL
             content_body += chunk

     assert content_body == HELLO_WORLD * 1000
+
+
+async def test_send_request_rejects_non_http_scheme(http_client: HttpClient) -> None:
+    with pytest.raises(ValidationError):
+        await http_client.send_request('gopher://127.0.0.1:6379/_PING')
+
+
+async def test_stream_rejects_non_http_scheme(http_client: HttpClient) -> None:
+    with pytest.raises(ValidationError):
+        async with http_client.stream('gopher://127.0.0.1:6379/_PING'):
+            pass
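Because validation now runs in `send_request` and `stream` before any connection is opened, code that feeds user-supplied URLs into a client should be prepared to catch `ValidationError` alongside the usual network errors. A hedged sketch of caller-side handling, using `HttpxHttpClient` as one example (any of the clients patched above should behave the same here):

import asyncio

from pydantic import ValidationError

from crawlee.http_clients import HttpxHttpClient


async def fetch(url: str) -> None:
    client = HttpxHttpClient()
    try:
        response = await client.send_request(url)
    except ValidationError:
        # Raised before any network I/O when the scheme is not http(s).
        print(f'Refusing to fetch {url!r}: not an http/https URL')
        return
    print(response.status_code)


asyncio.run(fetch('gopher://127.0.0.1:6379/_PING'))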