-
-
Notifications
You must be signed in to change notification settings - Fork 27
Expand file tree
/
Copy pathlychee.toml
More file actions
67 lines (53 loc) · 2.03 KB
/
lychee.toml
File metadata and controls
67 lines (53 loc) · 2.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# Interface
# Output and caching settings for the link checker.
# Log level is set to "info".
verbose = "info"
# NOTE(review): presumably turns off the interactive progress display -
# confirm against the lychee documentation.
no_progress = true
# Caching is switched on and limited by a maximum cache age of "10m".
# NOTE(review): "10m" presumably means ten minutes - confirm the duration syntax.
cache = true
max_cache_age = "10m"
# Stealth: identify as a desktop Firefox browser.
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:136.0) Gecko/20100101 Firefox/136.0"
# Browser-like headers, one dotted key per header. Keep these three lines
# together: they all populate the same `header` table.
header.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"
header.Accept-Language = "en-US,en;q=0.5"
header.Accept-Encoding = "gzip, deflate, br"
# Concurrency is capped at 16 to prevent the 'Too Many Open Files' error.
max_concurrency = 16
# Slower retries give rate-limited servers time to cool down.
# NOTE(review): the unit is presumably seconds - confirm against the lychee docs.
retry_wait_time = 5
# Also check links inside `<code>` and `<pre>` blocks and Markdown code blocks.
include_verbatim = true
# Email addresses are not checked; checking them proved problematic.
# NOTE(review): the concrete failure mode is not recorded here - worth adding.
include_mail = false
# Status codes that count as "not broken".
accept = [
    # Informational and success ranges
    "100..=103",
    "200..=299",
    # Forbidden - the most common false positive from anti-bot systems
    "403",
    # Too Many Requests
    "429",
    # LinkedIn's custom anti-bot rejection code
    "999",
]
# Be benevolent: allow up to 5 retries and a timeout of 40.
# NOTE(review): the timeout unit is presumably seconds - confirm against the lychee docs.
max_retries = 5
timeout = 40
# Root directory: `public`, the same directory the `exclude_path`
# patterns in this file are anchored at.
# NOTE(review): presumably used to resolve root-relative links - confirm.
root_dir = "public"
# Excluded paths - background in https://github.com/lycheeverse/lychee/discussions/1909
# Patterns are regexes written as literal strings, so backslashes are not doubled.
# They match the jobs index page plus every `index.html` exactly one
# directory below `jobs/` and `news/`.
exclude_path = [
    '^public/jobs/index\.html$',
    '^public/jobs/[^/]+/index\.html$',
    '^public/news/[^/]+/index\.html$',
]
# Excluded URLs. Patterns are regexes written as literal strings, so
# backslashes are not doubled.
exclude = [
    # Specification namespaces
    '^https://ogp\.me/ns/website',

    # Internet Archive: assume it works and spare their servers
    '^https://web\.archive\.org/web/.*',

    # Infrastructure endpoints
    '^https://juniorguru\.memberful\.com',
    '^https://js\.memberful\.com',
    '^https://assets\.memberful\.com',
    '^https://js\.stripe\.com',
    '^https://m\.stripe\.network',
    '^https://sa\.junior\.guru',

    # Webcal scheme links
    '^webcal://',

    # Sites with anti-scraping protections
    '^https://(www\.)?(linkedin|facebook|reddit)\.com',
    '^https://(www\.)?(beeit|glancemedia|prace\.rovnou|keyguru)\.cz',
    '^https://([^\.]+\.)?discord\.com',

    # Club member avatars and company logos that are not deployed yet
    '^https://junior\.guru/static/avatars-club/[^\.]+\.png',
    '^https://junior\.guru/static/logos-jobs/[^\.]+\.webp',
]