diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 0b3874d0c68..4e745acc91f 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -92,7 +92,7 @@ jobs:
VORTEX_EXPERIMENTAL_PATCHED_ARRAY: "1"
FLAT_LAYOUT_INLINE_ARRAY_NODE: "1"
run: |
- bash scripts/bench-taskset.sh target/release_debug/${{ matrix.benchmark.id }} --formats ${{ matrix.benchmark.formats }} -d gh-json -o results.json
+ bash scripts/bench-taskset.sh target/release_debug/${{ matrix.benchmark.id }} --formats ${{ matrix.benchmark.formats }} -d gh-json -o results.json --gh-json-v3 results.v3.jsonl
- name: Setup AWS CLI
uses: aws-actions/configure-aws-credentials@v6
@@ -105,6 +105,19 @@ jobs:
run: |
bash scripts/cat-s3.sh vortex-ci-benchmark-results data.json.gz results.json
+ - name: Ingest results to v3 server
+ if: vars.V3_INGEST_URL != ''
+ continue-on-error: true
+ shell: bash
+ env:
+ INGEST_BEARER_TOKEN: ${{ secrets.INGEST_BEARER_TOKEN }}
+ run: |
+ python3 scripts/post-ingest.py results.v3.jsonl \
+ --server "${{ vars.V3_INGEST_URL }}" \
+ --commit-sha "${{ github.sha }}" \
+ --benchmark-id "${{ matrix.benchmark.id }}" \
+ --repo-url "${{ github.server_url }}/${{ github.repository }}"
+
- name: Alert incident.io
if: failure()
uses: ./.github/actions/alert-incident-io
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c6b146d35b2..08e3be97996 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -358,7 +358,8 @@ jobs:
if: matrix.os == 'windows-x64'
run: |
cargo nextest run --cargo-profile ci --locked --workspace --all-features --no-fail-fast `
- --exclude vortex-bench --exclude vortex-python --exclude vortex-duckdb `
+ --exclude vortex-bench --exclude vortex-bench-server `
+ --exclude vortex-python --exclude vortex-duckdb `
--exclude vortex-fuzz --exclude vortex-cuda --exclude vortex-nvcomp `
--exclude vortex-cub --exclude vortex-test-e2e-cuda --exclude duckdb-bench `
--exclude lance-bench --exclude datafusion-bench --exclude random-access-bench `
diff --git a/.github/workflows/publish-benchmarks-website.yml b/.github/workflows/publish-bench-server.yml
similarity index 58%
rename from .github/workflows/publish-benchmarks-website.yml
rename to .github/workflows/publish-bench-server.yml
index e7eeefb8ecc..0bfcb6d3293 100644
--- a/.github/workflows/publish-benchmarks-website.yml
+++ b/.github/workflows/publish-bench-server.yml
@@ -1,18 +1,23 @@
-name: Publish Benchmarks Website
+name: Publish Bench Server
on:
push:
branches: [develop]
paths:
- - "benchmarks-website/**"
+ - "benchmarks-website/server/**"
+ - "vortex-bench/**"
+ - "Cargo.lock"
+ - ".github/workflows/publish-bench-server.yml"
+ workflow_dispatch:
jobs:
publish:
runs-on: ubuntu-latest
- timeout-minutes: 10
+ timeout-minutes: 30
permissions:
contents: read
packages: write
+ id-token: write
steps:
- uses: actions/checkout@v6
@@ -32,7 +37,10 @@ jobs:
- name: Build and push
uses: docker/build-push-action@v7
with:
- context: ./benchmarks-website
+ context: .
+ file: ./benchmarks-website/server/Dockerfile
platforms: linux/arm64
push: true
- tags: ghcr.io/${{ github.repository }}/benchmarks-website:latest
+ tags: |
+ ghcr.io/${{ github.repository }}/vortex-bench-server:latest
+ ghcr.io/${{ github.repository }}/vortex-bench-server:${{ github.sha }}
diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml
index 8dcb56bceda..d8eef8cd246 100644
--- a/.github/workflows/sql-benchmarks.yml
+++ b/.github/workflows/sql-benchmarks.yml
@@ -376,6 +376,7 @@ jobs:
bash scripts/bench-taskset.sh uv run --project bench-orchestrator vx-bench run "${{ matrix.subcommand }}" \
--targets-json '${{ steps.targets.outputs.targets_json }}' \
--output results.json \
+ --gh-json-v3 results.v3.jsonl \
--no-build \
--runner "ec2_${{ inputs.machine_type }}" \
${{ matrix.iterations && format('--iterations {0}', matrix.iterations) || '' }} \
@@ -395,6 +396,7 @@ jobs:
bash scripts/bench-taskset.sh uv run --project bench-orchestrator vx-bench run "${{ matrix.subcommand }}" \
--targets-json '${{ steps.targets.outputs.targets_json }}' \
--output results.json \
+ --gh-json-v3 results.v3.jsonl \
--no-build \
--runner "ec2_${{ inputs.machine_type }}" \
${{ matrix.iterations && format('--iterations {0}', matrix.iterations) || '' }} \
@@ -499,6 +501,19 @@ jobs:
run: |
bash scripts/cat-s3.sh vortex-ci-benchmark-results data.json.gz results.json
+ - name: Ingest results to v3 server
+ if: inputs.mode == 'develop' && vars.V3_INGEST_URL != ''
+ continue-on-error: true
+ shell: bash
+ env:
+ INGEST_BEARER_TOKEN: ${{ secrets.INGEST_BEARER_TOKEN }}
+ run: |
+ python3 scripts/post-ingest.py results.v3.jsonl \
+ --server "${{ vars.V3_INGEST_URL }}" \
+ --commit-sha "${{ github.sha }}" \
+ --benchmark-id "${{ matrix.id }}" \
+ --repo-url "${{ github.server_url }}/${{ github.repository }}"
+
- name: Upload File Sizes
if: inputs.mode == 'develop' && matrix.remote_storage == null
shell: bash
diff --git a/.github/workflows/v3-commit-metadata.yml b/.github/workflows/v3-commit-metadata.yml
new file mode 100644
index 00000000000..8f18be93e53
--- /dev/null
+++ b/.github/workflows/v3-commit-metadata.yml
@@ -0,0 +1,35 @@
+# Posts a v3 ingest envelope with no records on every push to develop, so the
+# `commits` dim stays populated even when no benchmark ran.
+
+name: v3 commit metadata
+
+on:
+ push:
+ branches: [develop]
+ workflow_dispatch:
+
+permissions:
+ contents: read
+
+jobs:
+ commit-metadata:
+ runs-on: ubuntu-latest
+ timeout-minutes: 10
+ steps:
+ - uses: actions/checkout@v6
+ with:
+ fetch-depth: 2
+
+ - name: Ingest commit metadata to v3 server
+ if: vars.V3_INGEST_URL != ''
+ continue-on-error: true
+ shell: bash
+ env:
+ INGEST_BEARER_TOKEN: ${{ secrets.INGEST_BEARER_TOKEN }}
+ run: |
+ echo -n > empty.jsonl
+ python3 scripts/post-ingest.py empty.jsonl \
+ --server "${{ vars.V3_INGEST_URL }}" \
+ --commit-sha "${{ github.sha }}" \
+ --benchmark-id "commit-metadata" \
+ --repo-url "${{ github.server_url }}/${{ github.repository }}"
diff --git a/.gitignore b/.gitignore
index 7fa79fb2162..bcc8ef746ee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -242,3 +242,6 @@ trace*.pb
# pytest-benchmark output
vortex-python/.benchmarks/
+# For local benchmarks website server and things like the WAL
+**.duckdb*
+.bench-env
diff --git a/Cargo.lock b/Cargo.lock
index 92b8f535503..fbf7c8dbcfd 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -197,6 +197,9 @@ name = "arbitrary"
version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1"
+dependencies = [
+ "derive_arbitrary",
+]
[[package]]
name = "arc-swap"
@@ -687,9 +690,9 @@ dependencies = [
[[package]]
name = "async-compression"
-version = "0.4.41"
+version = "0.4.42"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d0f9ee0f6e02ffd7ad5816e9464499fba7b3effd01123b515c41d1697c43dad1"
+checksum = "e79b3f8a79cccc2898f31920fc69f304859b3bd567490f75ebf51ae1c792a9ac"
dependencies = [
"compression-codecs",
"compression-core",
@@ -900,6 +903,58 @@ dependencies = [
"fs_extra",
]
+[[package]]
+name = "axum"
+version = "0.8.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "31b698c5f9a010f6573133b09e0de5408834d0c82f8d7475a89fc1867a71cd90"
+dependencies = [
+ "axum-core",
+ "bytes",
+ "form_urlencoded",
+ "futures-util",
+ "http",
+ "http-body",
+ "http-body-util",
+ "hyper",
+ "hyper-util",
+ "itoa",
+ "matchit",
+ "memchr",
+ "mime",
+ "percent-encoding",
+ "pin-project-lite",
+ "serde_core",
+ "serde_json",
+ "serde_path_to_error",
+ "serde_urlencoded",
+ "sync_wrapper",
+ "tokio",
+ "tower",
+ "tower-layer",
+ "tower-service",
+ "tracing",
+]
+
+[[package]]
+name = "axum-core"
+version = "0.5.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08c78f31d7b1291f7ee735c1c6780ccde7785daae9a9206026862dab7d8792d1"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "http",
+ "http-body",
+ "http-body-util",
+ "mime",
+ "pin-project-lite",
+ "sync_wrapper",
+ "tower-layer",
+ "tower-service",
+ "tracing",
+]
+
[[package]]
name = "base16ct"
version = "1.0.0"
@@ -1025,9 +1080,9 @@ dependencies = [
[[package]]
name = "blake3"
-version = "1.8.4"
+version = "1.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4d2d5991425dfd0785aed03aedcf0b321d61975c9b5b3689c774a2610ae0b51e"
+checksum = "0aa83c34e62843d924f905e0f5c866eb1dd6545fc4d719e803d9ba6030371fce"
dependencies = [
"arrayref",
"arrayvec",
@@ -1298,9 +1353,9 @@ dependencies = [
[[package]]
name = "cc"
-version = "1.2.60"
+version = "1.2.61"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "43c5703da9466b66a946814e1adf53ea2c90f10063b86290cc9eb67ce3478a20"
+checksum = "d16d90359e986641506914ba71350897565610e87ce0ad9e6f28569db3dd5c6d"
dependencies = [
"find-msvc-tools",
"jobserver",
@@ -1314,12 +1369,6 @@ version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4f4c707c6a209cbe82d10abd08e1ea8995e9ea937d2550646e02798948992be0"
-[[package]]
-name = "cesu8"
-version = "1.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c"
-
[[package]]
name = "cexpr"
version = "0.6.0"
@@ -1606,10 +1655,11 @@ dependencies = [
[[package]]
name = "comfy-table"
-version = "7.2.2"
+version = "7.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "958c5d6ecf1f214b4c2bbbbf6ab9523a864bd136dcf71a7e8904799acfe1ad47"
+checksum = "4a65ebfec4fb190b6f90e944a817d60499ee0744e582530e2c9900a22e591d9a"
dependencies = [
+ "crossterm 0.28.1",
"unicode-segmentation",
"unicode-width 0.2.2",
]
@@ -1652,9 +1702,9 @@ dependencies = [
[[package]]
name = "compression-codecs"
-version = "0.4.37"
+version = "0.4.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb7b51a7d9c967fc26773061ba86150f19c50c0d65c887cb1fbe295fd16619b7"
+checksum = "ce2548391e9c1929c21bf6aa2680af86fe4c1b33e6cea9ac1cfeec0bd11218cf"
dependencies = [
"bzip2",
"compression-core",
@@ -1667,9 +1717,9 @@ dependencies = [
[[package]]
name = "compression-core"
-version = "0.4.31"
+version = "0.4.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "75984efb6ed102a0d42db99afb6c1948f0380d1d91808d5529916e6c08b49d8d"
+checksum = "cc14f565cf027a105f7a44ccf9e5b424348421a1d8952a8fc9d499d313107789"
[[package]]
name = "concurrent-queue"
@@ -1910,6 +1960,19 @@ version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
+[[package]]
+name = "crossterm"
+version = "0.28.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "829d955a0bb380ef178a640b91779e3987da38c9aea133b20614cfed8cdea9c6"
+dependencies = [
+ "bitflags",
+ "crossterm_winapi",
+ "parking_lot",
+ "rustix 0.38.44",
+ "winapi",
+]
+
[[package]]
name = "crossterm"
version = "0.29.0"
@@ -3550,6 +3613,17 @@ dependencies = [
"serde_core",
]
+[[package]]
+name = "derive_arbitrary"
+version = "1.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
[[package]]
name = "derive_more"
version = "2.1.1"
@@ -3660,6 +3734,25 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab23e69df104e2fd85ee63a533a22d2132ef5975dc6b36f9f3e5a7305e4a8ed7"
+[[package]]
+name = "duckdb"
+version = "1.10502.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fdc796383b176dd5a45353fbb5e64583c0ee4da12cb62c9e510b785324b2488"
+dependencies = [
+ "arrow 58.1.0",
+ "cast",
+ "comfy-table",
+ "fallible-iterator",
+ "fallible-streaming-iterator",
+ "hashlink",
+ "libduckdb-sys",
+ "num",
+ "num-integer",
+ "rust_decimal",
+ "strum 0.27.2",
+]
+
[[package]]
name = "duckdb-bench"
version = "0.1.0"
@@ -3861,6 +3954,18 @@ dependencies = [
"ext-trait",
]
+[[package]]
+name = "fallible-iterator"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649"
+
+[[package]]
+name = "fallible-streaming-iterator"
+version = "0.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
+
[[package]]
name = "fast-float2"
version = "0.2.3"
@@ -4003,9 +4108,9 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
[[package]]
name = "fsst"
-version = "4.0.0"
+version = "4.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2195cc7f87e84bd695586137de99605e7e9579b26ec5e01b82960ddb4d0922f2"
+checksum = "2b3a6f3550e61b999febd7168d462db953948eff4fc3448276b3d10d10324dbb"
dependencies = [
"arrow-array 57.3.0",
"rand 0.9.4",
@@ -4356,6 +4461,15 @@ dependencies = [
"foldhash 0.2.0",
]
+[[package]]
+name = "hashlink"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7382cf6263419f2d8df38c55d7da83da5c18aef87fc7a7fc1fb1e344edfe14c1"
+dependencies = [
+ "hashbrown 0.15.5",
+]
+
[[package]]
name = "heck"
version = "0.5.0"
@@ -4428,6 +4542,12 @@ version = "1.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"
+[[package]]
+name = "httpdate"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
+
[[package]]
name = "humansize"
version = "2.1.3"
@@ -4445,9 +4565,9 @@ checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424"
[[package]]
name = "hybrid-array"
-version = "0.4.10"
+version = "0.4.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3944cf8cf766b40e2a1a333ee5e9b563f854d5fa49d6a8ca2764e97c6eddb214"
+checksum = "08d46837a0ed51fe95bd3b05de33cd64a1ee88fc797477ca48446872504507c5"
dependencies = [
"typenum",
]
@@ -4466,6 +4586,7 @@ dependencies = [
"http",
"http-body",
"httparse",
+ "httpdate",
"itoa",
"pin-project-lite",
"smallvec",
@@ -4487,6 +4608,7 @@ dependencies = [
"tokio",
"tokio-rustls",
"tower-service",
+ "webpki-roots",
]
[[package]]
@@ -4852,9 +4974,9 @@ dependencies = [
[[package]]
name = "jiff"
-version = "0.2.23"
+version = "0.2.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a3546dc96b6d42c5f24902af9e2538e82e39ad350b0c766eb3fbf2d8f3d8359"
+checksum = "f00b5dbd620d61dfdcb6007c9c1f6054ebd75319f163d886a9055cec1155073d"
dependencies = [
"jiff-static",
"jiff-tzdb-platform",
@@ -4867,9 +4989,9 @@ dependencies = [
[[package]]
name = "jiff-static"
-version = "0.2.23"
+version = "0.2.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2a8c8b344124222efd714b73bb41f8b5120b27a7cc1c75593a6ff768d9d05aa4"
+checksum = "e000de030ff8022ea1da3f466fbb0f3a809f5e51ed31f6dd931c35181ad8e6d7"
dependencies = [
"proc-macro2",
"quote",
@@ -4891,22 +5013,6 @@ dependencies = [
"jiff-tzdb",
]
-[[package]]
-name = "jni"
-version = "0.21.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a87aa2bb7d2af34197c04845522473242e1aa17c12f4935d5856491a7fb8c97"
-dependencies = [
- "cesu8",
- "cfg-if",
- "combine",
- "jni-sys 0.3.1",
- "log",
- "thiserror 1.0.69",
- "walkdir",
- "windows-sys 0.45.0",
-]
-
[[package]]
name = "jni"
version = "0.22.4"
@@ -4917,7 +5023,7 @@ dependencies = [
"combine",
"java-locator",
"jni-macros",
- "jni-sys 0.4.1",
+ "jni-sys",
"libloading",
"log",
"simd_cesu8",
@@ -4939,15 +5045,6 @@ dependencies = [
"syn 2.0.117",
]
-[[package]]
-name = "jni-sys"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "41a652e1f9b6e0275df1f15b32661cf0d4b78d4d87ddec5e0c3c20f097433258"
-dependencies = [
- "jni-sys 0.4.1",
-]
-
[[package]]
name = "jni-sys"
version = "0.4.1"
@@ -5047,9 +5144,9 @@ checksum = "a4933f3f57a8e9d9da04db23fb153356ecaf00cbd14aee46279c33dc80925c37"
[[package]]
name = "lance"
-version = "4.0.0"
+version = "4.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "efe6c3ddd79cdfd2b7e1c23cafae52806906bc40fbd97de9e8cf2f8c7a75fc04"
+checksum = "f63e285ceee2b4ca8eb3a8742266cc1ac8161599767a8ecb4d8c2f9fd43d8b29"
dependencies = [
"arrow 57.3.0",
"arrow-arith 57.3.0",
@@ -5113,9 +5210,9 @@ dependencies = [
[[package]]
name = "lance-arrow"
-version = "4.0.0"
+version = "4.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5d9f5d95bdda2a2b790f1fb8028b5b6dcf661abeb3133a8bca0f3d24b054af87"
+checksum = "5c55e62fc04422ef4cd4af6f863ada32641ae23124f9b2e9c567a40d617e8c97"
dependencies = [
"arrow-array 57.3.0",
"arrow-buffer 57.3.0",
@@ -5153,9 +5250,9 @@ dependencies = [
[[package]]
name = "lance-bitpacking"
-version = "4.0.0"
+version = "4.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f827d6ab9f8f337a9509d5ad66a12f3314db8713868260521c344ef6135eb4e4"
+checksum = "a48d232a2908645af0040f96c60a6387fea2df75e762d7033e93e17bb420c6a1"
dependencies = [
"arrayref",
"paste",
@@ -5164,9 +5261,9 @@ dependencies = [
[[package]]
name = "lance-core"
-version = "4.0.0"
+version = "4.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0f1e25df6a79bf72ee6bcde0851f19b1cd36c5848c1b7db83340882d3c9fdecb"
+checksum = "ce071baaff88fcdcf67f1dd0af54e17656f52ae75aaeb75f25f9cf4da29241f2"
dependencies = [
"arrow-array 57.3.0",
"arrow-buffer 57.3.0",
@@ -5203,9 +5300,9 @@ dependencies = [
[[package]]
name = "lance-datafusion"
-version = "4.0.0"
+version = "4.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "93146de8ae720cb90edef81c2f2d0a1b065fc2f23ecff2419546f389b0fa70a4"
+checksum = "11ebc97ee94fa8e1af6fd0520066c7e7e0eab38a100e750ba9aabad644c5aa57"
dependencies = [
"arrow 57.3.0",
"arrow-array 57.3.0",
@@ -5235,9 +5332,9 @@ dependencies = [
[[package]]
name = "lance-datagen"
-version = "4.0.0"
+version = "4.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ccec8ce4d8e0a87a99c431dab2364398029f2ffb649c1a693c60c79e05ed30dd"
+checksum = "9b90dbb2829875b3a3d00f88fd3a3e39a9e4c7d34c266f67da6550fcda54c76e"
dependencies = [
"arrow 57.3.0",
"arrow-array 57.3.0",
@@ -5255,9 +5352,9 @@ dependencies = [
[[package]]
name = "lance-encoding"
-version = "4.0.0"
+version = "4.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c1aec0bbbac6bce829bc10f1ba066258126100596c375fb71908ecf11c2c2a5"
+checksum = "65ec429cc2e18ad1b7e43cc7ec57a2f2e49229cfbd934da45e619751a886b8cd"
dependencies = [
"arrow-arith 57.3.0",
"arrow-array 57.3.0",
@@ -5294,9 +5391,9 @@ dependencies = [
[[package]]
name = "lance-file"
-version = "4.0.0"
+version = "4.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14a8c548804f5b17486dc2d3282356ed1957095a852780283bc401fdd69e9075"
+checksum = "418afe3f82487615fa09222b95a4b5853103f3f0425996d24a537ca750381f83"
dependencies = [
"arrow-arith 57.3.0",
"arrow-array 57.3.0",
@@ -5328,9 +5425,9 @@ dependencies = [
[[package]]
name = "lance-index"
-version = "4.0.0"
+version = "4.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2da212f0090ea59f79ac3686660f596520c167fe1cb5f408900cf71d215f0e03"
+checksum = "936b3deeb6ee075646d18f27b01cf2d2e846c3f5f6c5fa45b30aa41dd5b4c4e2"
dependencies = [
"arrow 57.3.0",
"arrow-arith 57.3.0",
@@ -5394,9 +5491,9 @@ dependencies = [
[[package]]
name = "lance-io"
-version = "4.0.0"
+version = "4.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "41d958eb4b56f03bbe0f5f85eb2b4e9657882812297b6f711f201ffc995f259f"
+checksum = "4103e4cebe146af15bfb198c8142d6ea37d5b25fa04158bf2d9be4597bf174d3"
dependencies = [
"arrow 57.3.0",
"arrow-arith 57.3.0",
@@ -5433,9 +5530,9 @@ dependencies = [
[[package]]
name = "lance-linalg"
-version = "4.0.0"
+version = "4.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0285b70da35def7ed95e150fae1d5308089554e1290470403ed3c50cb235bc5e"
+checksum = "c00c7ad71eca93635404519e77add6689947c9342134bb2133578f81249bf809"
dependencies = [
"arrow-array 57.3.0",
"arrow-buffer 57.3.0",
@@ -5451,9 +5548,9 @@ dependencies = [
[[package]]
name = "lance-namespace"
-version = "4.0.0"
+version = "4.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f78e2a828b654e062a495462c6e3eb4fcf0e7e907d761b8f217fc09ccd3ceac"
+checksum = "e0c59a574e72a4b72da8096bcaaa1b1e5b44f6a83da164cc714c286fab30c369"
dependencies = [
"arrow 57.3.0",
"async-trait",
@@ -5479,9 +5576,9 @@ dependencies = [
[[package]]
name = "lance-table"
-version = "4.0.0"
+version = "4.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3df9c4adca3eb2074b3850432a9fb34248a3d90c3d6427d158b13ff9355664ee"
+checksum = "943b9c503f23ebab9e0dbee356f528bc4cbcafded87a6848451f205b0bb473d7"
dependencies = [
"arrow 57.3.0",
"arrow-array 57.3.0",
@@ -5634,9 +5731,26 @@ checksum = "b3a6a8c165077efc8f3a971534c50ea6a1a18b329ef4a66e897a7e3a1494565f"
[[package]]
name = "libc"
-version = "0.2.185"
+version = "0.2.186"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "52ff2c0fe9bc6cb6b14a0592c2ff4fa9ceb83eea9db979b0487cd054946a2b8f"
+checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
+
+[[package]]
+name = "libduckdb-sys"
+version = "1.10502.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8d7401630ae2abcff642f7156294289e50f2d222e061c026ad797b01bf20c215"
+dependencies = [
+ "cc",
+ "flate2",
+ "pkg-config",
+ "reqwest 0.12.28",
+ "serde",
+ "serde_json",
+ "tar",
+ "vcpkg",
+ "zip 6.0.0",
+]
[[package]]
name = "libfuzzer-sys"
@@ -5888,6 +6002,12 @@ dependencies = [
"regex-automata",
]
+[[package]]
+name = "matchit"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3"
+
[[package]]
name = "matrixmultiply"
version = "0.3.10"
@@ -5901,6 +6021,30 @@ dependencies = [
"thread-tree",
]
+[[package]]
+name = "maud"
+version = "0.27.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8156733e27020ea5c684db5beac5d1d611e1272ab17901a49466294b84fc217e"
+dependencies = [
+ "axum-core",
+ "http",
+ "itoa",
+ "maud_macros",
+]
+
+[[package]]
+name = "maud_macros"
+version = "0.27.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7261b00f3952f617899bc012e3dbd56e4f0110a038175929fa5d18e5a19913ca"
+dependencies = [
+ "proc-macro2",
+ "proc-macro2-diagnostics",
+ "quote",
+ "syn 2.0.117",
+]
+
[[package]]
name = "md-5"
version = "0.10.6"
@@ -6211,6 +6355,20 @@ dependencies = [
"windows-sys 0.61.2",
]
+[[package]]
+name = "num"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23"
+dependencies = [
+ "num-bigint",
+ "num-complex",
+ "num-integer",
+ "num-iter",
+ "num-rational",
+ "num-traits",
+]
+
[[package]]
name = "num-bigint"
version = "0.4.6"
@@ -6246,6 +6404,28 @@ dependencies = [
"num-traits",
]
+[[package]]
+name = "num-iter"
+version = "0.1.45"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf"
+dependencies = [
+ "autocfg",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-rational"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
+dependencies = [
+ "num-bigint",
+ "num-integer",
+ "num-traits",
+]
+
[[package]]
name = "num-traits"
version = "0.2.19"
@@ -7054,6 +7234,18 @@ dependencies = [
"unicode-ident",
]
+[[package]]
+name = "proc-macro2-diagnostics"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+ "version_check",
+]
+
[[package]]
name = "prost"
version = "0.12.6"
@@ -7585,7 +7777,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "577c9b9f652b4c121fb25c6a391dd06406d3b092ba68827e6d2f09550edc54b3"
dependencies = [
"cfg-if",
- "crossterm",
+ "crossterm 0.29.0",
"instability",
"ratatui-core",
]
@@ -7824,13 +8016,14 @@ dependencies = [
"wasm-bindgen-futures",
"wasm-streams 0.4.2",
"web-sys",
+ "webpki-roots",
]
[[package]]
name = "reqwest"
-version = "0.13.2"
+version = "0.13.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ab3f43e3283ab1488b624b44b0e988d0acea0b3214e694730a055cb6b2efa801"
+checksum = "62e0021ea2c22aed41653bc7e1419abb2c97e038ff2c33d0e1309e49a97deec0"
dependencies = [
"base64",
"bytes",
@@ -7854,6 +8047,8 @@ dependencies = [
"rustls",
"rustls-pki-types",
"rustls-platform-verifier",
+ "serde",
+ "serde_json",
"sync_wrapper",
"tokio",
"tokio-rustls",
@@ -7913,9 +8108,9 @@ dependencies = [
[[package]]
name = "roaring"
-version = "0.11.3"
+version = "0.11.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8ba9ce64a8f45d7fc86358410bb1a82e8c987504c0d4900e9141d69a9f26c885"
+checksum = "1dedc5658c6ecb3bdb5ef5f3295bb9253f42dcf3fd1402c03f6b1f7659c3c4a9"
dependencies = [
"bytemuck",
"byteorder",
@@ -8047,9 +8242,9 @@ dependencies = [
[[package]]
name = "rustls"
-version = "0.23.38"
+version = "0.23.39"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "69f9466fb2c14ea04357e91413efb882e2a6d4a406e625449bc0a5d360d53a21"
+checksum = "7c2c118cb077cca2822033836dfb1b975355dfb784b5e8da48f7b6c5db74e60e"
dependencies = [
"aws-lc-rs",
"once_cell",
@@ -8074,9 +8269,9 @@ dependencies = [
[[package]]
name = "rustls-pki-types"
-version = "1.14.0"
+version = "1.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd"
+checksum = "30a7197ae7eb376e574fe940d068c30fe0462554a3ddbe4eca7838e049c937a9"
dependencies = [
"web-time",
"zeroize",
@@ -8084,13 +8279,13 @@ dependencies = [
[[package]]
name = "rustls-platform-verifier"
-version = "0.6.2"
+version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1d99feebc72bae7ab76ba994bb5e121b8d83d910ca40b36e0921f53becc41784"
+checksum = "26d1e2536ce4f35f4846aa13bff16bd0ff40157cdb14cc056c7b14ba41233ba0"
dependencies = [
"core-foundation 0.10.1",
"core-foundation-sys",
- "jni 0.21.1",
+ "jni",
"log",
"once_cell",
"rustls",
@@ -8329,6 +8524,17 @@ dependencies = [
"zmij",
]
+[[package]]
+name = "serde_path_to_error"
+version = "0.1.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "10a9ff822e371bb5403e391ecd83e182e0e77ba7f6fe0160b795797109d1b457"
+dependencies = [
+ "itoa",
+ "serde",
+ "serde_core",
+]
+
[[package]]
name = "serde_repr"
version = "0.1.20"
@@ -9228,7 +9434,7 @@ dependencies = [
"chrono",
"num_cpus",
"ping",
- "reqwest 0.13.2",
+ "reqwest 0.13.3",
"sysinfo",
"test-with-derive",
"uzers",
@@ -9249,7 +9455,7 @@ dependencies = [
"proc-macro2",
"quote",
"regex",
- "reqwest 0.13.2",
+ "reqwest 0.13.3",
"syn 2.0.117",
"sysinfo",
"uzers",
@@ -9590,6 +9796,7 @@ dependencies = [
"tokio",
"tower-layer",
"tower-service",
+ "tracing",
]
[[package]]
@@ -9613,6 +9820,7 @@ dependencies = [
"tower",
"tower-layer",
"tower-service",
+ "tracing",
]
[[package]]
@@ -9948,6 +10156,12 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65"
+[[package]]
+name = "vcpkg"
+version = "0.2.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
+
[[package]]
name = "vector-search-bench"
version = "0.1.0"
@@ -10138,6 +10352,7 @@ dependencies = [
"glob",
"humansize",
"indicatif",
+ "insta",
"itertools 0.14.0",
"mimalloc",
"noodles-bgzf",
@@ -10146,7 +10361,7 @@ dependencies = [
"parquet 58.1.0",
"rand 0.10.1",
"regex",
- "reqwest 0.13.2",
+ "reqwest 0.13.3",
"serde",
"serde_json",
"sysinfo",
@@ -10166,6 +10381,53 @@ dependencies = [
"vortex-tensor",
]
+[[package]]
+name = "vortex-bench-migrate"
+version = "0.1.0-alpha.0"
+dependencies = [
+ "anyhow",
+ "arrow-array 58.1.0",
+ "arrow-buffer 58.1.0",
+ "arrow-schema 58.1.0",
+ "clap",
+ "duckdb",
+ "flate2",
+ "reqwest 0.13.3",
+ "rstest",
+ "serde",
+ "serde_json",
+ "tempfile",
+ "tokio",
+ "tracing",
+ "tracing-subscriber",
+ "vortex-bench-server",
+ "vortex-utils",
+]
+
+[[package]]
+name = "vortex-bench-server"
+version = "0.1.0-alpha.0"
+dependencies = [
+ "anyhow",
+ "axum",
+ "base64",
+ "duckdb",
+ "insta",
+ "maud",
+ "reqwest 0.13.3",
+ "serde",
+ "serde_json",
+ "subtle",
+ "tempfile",
+ "thiserror 2.0.18",
+ "tokio",
+ "tower",
+ "tower-http",
+ "tracing",
+ "tracing-subscriber",
+ "twox-hash",
+]
+
[[package]]
name = "vortex-btrblocks"
version = "0.1.0"
@@ -10242,7 +10504,7 @@ dependencies = [
"clap",
"futures",
"parquet 58.1.0",
- "reqwest 0.13.2",
+ "reqwest 0.13.3",
"serde",
"serde_json",
"sha2 0.11.0",
@@ -10424,7 +10686,7 @@ dependencies = [
"object_store 0.13.2",
"parking_lot",
"paste",
- "reqwest 0.13.2",
+ "reqwest 0.13.3",
"rstest",
"tempfile",
"tracing",
@@ -10434,7 +10696,7 @@ dependencies = [
"vortex-runend",
"vortex-sequence",
"vortex-utils",
- "zip",
+ "zip 8.6.0",
]
[[package]]
@@ -10646,7 +10908,7 @@ dependencies = [
"arrow-array 58.1.0",
"arrow-schema 58.1.0",
"futures",
- "jni 0.22.4",
+ "jni",
"object_store 0.13.2",
"parking_lot",
"thiserror 2.0.18",
@@ -10726,7 +10988,7 @@ dependencies = [
"bindgen",
"libloading",
"liblzma",
- "reqwest 0.13.2",
+ "reqwest 0.13.3",
"tar",
"vortex-cuda-macros",
]
@@ -10937,7 +11199,7 @@ dependencies = [
"arrow-schema 58.1.0",
"clap",
"console_error_panic_hook",
- "crossterm",
+ "crossterm 0.29.0",
"datafusion 53.1.0",
"env_logger",
"flatbuffers",
@@ -11070,6 +11332,7 @@ dependencies = [
"cfg-if",
"once_cell",
"rustversion",
+ "serde",
"wasm-bindgen-macro",
"wasm-bindgen-shared",
]
@@ -11205,6 +11468,15 @@ dependencies = [
"rustls-pki-types",
]
+[[package]]
+name = "webpki-roots"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d"
+dependencies = [
+ "rustls-pki-types",
+]
+
[[package]]
name = "which"
version = "8.0.2"
@@ -11357,15 +11629,6 @@ dependencies = [
"windows-link",
]
-[[package]]
-name = "windows-sys"
-version = "0.45.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
-dependencies = [
- "windows-targets 0.42.2",
-]
-
[[package]]
name = "windows-sys"
version = "0.52.0"
@@ -11402,21 +11665,6 @@ dependencies = [
"windows-link",
]
-[[package]]
-name = "windows-targets"
-version = "0.42.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071"
-dependencies = [
- "windows_aarch64_gnullvm 0.42.2",
- "windows_aarch64_msvc 0.42.2",
- "windows_i686_gnu 0.42.2",
- "windows_i686_msvc 0.42.2",
- "windows_x86_64_gnu 0.42.2",
- "windows_x86_64_gnullvm 0.42.2",
- "windows_x86_64_msvc 0.42.2",
-]
-
[[package]]
name = "windows-targets"
version = "0.52.6"
@@ -11459,12 +11707,6 @@ dependencies = [
"windows-link",
]
-[[package]]
-name = "windows_aarch64_gnullvm"
-version = "0.42.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8"
-
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.6"
@@ -11477,12 +11719,6 @@ version = "0.53.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53"
-[[package]]
-name = "windows_aarch64_msvc"
-version = "0.42.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43"
-
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.6"
@@ -11495,12 +11731,6 @@ version = "0.53.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006"
-[[package]]
-name = "windows_i686_gnu"
-version = "0.42.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f"
-
[[package]]
name = "windows_i686_gnu"
version = "0.52.6"
@@ -11525,12 +11755,6 @@ version = "0.53.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c"
-[[package]]
-name = "windows_i686_msvc"
-version = "0.42.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060"
-
[[package]]
name = "windows_i686_msvc"
version = "0.52.6"
@@ -11543,12 +11767,6 @@ version = "0.53.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2"
-[[package]]
-name = "windows_x86_64_gnu"
-version = "0.42.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36"
-
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.6"
@@ -11561,12 +11779,6 @@ version = "0.53.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499"
-[[package]]
-name = "windows_x86_64_gnullvm"
-version = "0.42.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3"
-
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.6"
@@ -11579,12 +11791,6 @@ version = "0.53.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1"
-[[package]]
-name = "windows_x86_64_msvc"
-version = "0.42.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0"
-
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.6"
@@ -11882,6 +12088,20 @@ dependencies = [
"num-traits",
]
+[[package]]
+name = "zip"
+version = "6.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eb2a05c7c36fde6c09b08576c9f7fb4cda705990f73b58fe011abf7dfb24168b"
+dependencies = [
+ "arbitrary",
+ "crc32fast",
+ "flate2",
+ "indexmap",
+ "memchr",
+ "zopfli",
+]
+
[[package]]
name = "zip"
version = "8.6.0"
diff --git a/Cargo.toml b/Cargo.toml
index 4ddcfbe3d43..d56fc893658 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -60,6 +60,9 @@ members = [
"benchmarks/duckdb-bench",
"benchmarks/random-access-bench",
"benchmarks/vector-search-bench",
+ # Benchmarks website v3 (alpha) - leaf binary, not part of vortex-* API
+ "benchmarks-website/server",
+ "benchmarks-website/migrate",
]
exclude = ["java/testfiles", "wasm-test"]
resolver = "2"
diff --git a/REUSE.toml b/REUSE.toml
index 161f6e3086a..8e406c95c90 100644
--- a/REUSE.toml
+++ b/REUSE.toml
@@ -36,7 +36,7 @@ SPDX-FileCopyrightText = "Copyright the Vortex contributors"
SPDX-License-Identifier = "CC-BY-4.0"
[[annotations]]
-path = ["**/.gitignore", ".gitmodules", ".python-version", "**/*.lock", "**/*.lockfile", "**/*.toml", "**/*.json", ".idea/**", ".github/**", "codecov.yml", "java/gradle/wrapper/gradle-wrapper.properties"]
+path = ["**/.gitignore", ".gitmodules", ".python-version", "**/*.lock", "**/*.lockfile", "**/*.toml", "**/*.json", ".idea/**", ".github/**", "codecov.yml", "java/gradle/wrapper/gradle-wrapper.properties", "**.duckdb*"]
precedence = "override"
SPDX-FileCopyrightText = "Copyright the Vortex contributors"
SPDX-License-Identifier = "Apache-2.0"
diff --git a/_typos.toml b/_typos.toml
index 62c3b0d6358..2b9bc571e76 100644
--- a/_typos.toml
+++ b/_typos.toml
@@ -8,7 +8,7 @@ extend-ignore-re = [
]
[files]
-extend-exclude = ["/vortex-bench/**", "/docs/references.bib", "benchmarks/**", "vortex-sqllogictest/slt/**", "encodings/fsst/src/dfa/tests.rs", "encodings/fsst/src/dfa/flat_contains.rs"]
+extend-exclude = ["/vortex-bench/**", "/docs/references.bib", "benchmarks/**", "vortex-sqllogictest/slt/**", "encodings/fsst/src/dfa/tests.rs", "encodings/fsst/src/dfa/flat_contains.rs", "benchmarks-website/server/static/**"]
[type.py]
extend-ignore-identifiers-re = [
diff --git a/bench-orchestrator/bench_orchestrator/cli.py b/bench-orchestrator/bench_orchestrator/cli.py
index d497d85ed13..6c200015182 100644
--- a/bench-orchestrator/bench_orchestrator/cli.py
+++ b/bench-orchestrator/bench_orchestrator/cli.py
@@ -210,6 +210,10 @@ def run(
Path | None,
typer.Option("--output", help="Optional path for compatibility JSONL output"),
] = None,
+ gh_json_v3: Annotated[
+ Path | None,
+ typer.Option("--gh-json-v3", help="Optional path for v3 JSONL records emitted by the benchmark binary"),
+ ] = None,
options: Annotated[list[str] | None, typer.Option("--opt", help="Engine or benchmark specific options")] = None,
) -> None:
"""Run benchmarks with specified configuration."""
@@ -294,6 +298,7 @@ def run(
sample_rate=sample_rate,
tracing=tracing,
runner=runner,
+ gh_json_v3=gh_json_v3,
on_result=lambda line, store_writer=ctx.write_raw_json, compatibility=compatibility_file: (
write_result_line(
line,
diff --git a/bench-orchestrator/bench_orchestrator/runner/executor.py b/bench-orchestrator/bench_orchestrator/runner/executor.py
index b895afdc2e1..32ed9c91132 100644
--- a/bench-orchestrator/bench_orchestrator/runner/executor.py
+++ b/bench-orchestrator/bench_orchestrator/runner/executor.py
@@ -40,6 +40,7 @@ def build_command(
sample_rate: int | None = None,
tracing: bool = False,
runner: str | None = None,
+ gh_json_v3: Path | None = None,
) -> list[str]:
"""Build the command used to execute a benchmark binary."""
cmd = [
@@ -67,6 +68,8 @@ def build_command(
cmd.append("--tracing")
if runner:
cmd.extend(["--runner", runner])
+ if gh_json_v3 is not None:
+ cmd.extend(["--gh-json-v3", str(gh_json_v3)])
if options:
for key, value in options.items():
cmd.extend(["--opt", f"{key}={value}"])
@@ -98,6 +101,7 @@ def run(
sample_rate: int | None = None,
tracing: bool = False,
runner: str | None = None,
+ gh_json_v3: Path | None = None,
on_result: Callable[[str], None] | None = None,
) -> list[str]:
"""
@@ -128,6 +132,7 @@ def run(
sample_rate=sample_rate,
tracing=tracing,
runner=runner,
+ gh_json_v3=gh_json_v3,
)
if self.verbose:
diff --git a/bench-orchestrator/tests/test_executor.py b/bench-orchestrator/tests/test_executor.py
index ade3dde1a67..dd3253a22ff 100644
--- a/bench-orchestrator/tests/test_executor.py
+++ b/bench-orchestrator/tests/test_executor.py
@@ -48,6 +48,31 @@ def test_build_command_omits_formats_for_lance_backend() -> None:
assert "1,3" in cmd
+def test_build_command_includes_gh_json_v3_when_set() -> None:
+ executor = BenchmarkExecutor(Path("/tmp/duckdb-bench"), Engine.DUCKDB)
+
+ cmd = executor.build_command(
+ benchmark=Benchmark.TPCH,
+ formats=[Format.PARQUET],
+ gh_json_v3=Path("results.v3.jsonl"),
+ )
+
+ assert "--gh-json-v3" in cmd
+ flag_idx = cmd.index("--gh-json-v3")
+ assert cmd[flag_idx + 1] == "results.v3.jsonl"
+
+
+def test_build_command_omits_gh_json_v3_when_unset() -> None:
+ executor = BenchmarkExecutor(Path("/tmp/duckdb-bench"), Engine.DUCKDB)
+
+ cmd = executor.build_command(
+ benchmark=Benchmark.TPCH,
+ formats=[Format.PARQUET],
+ )
+
+ assert "--gh-json-v3" not in cmd
+
+
def test_run_streams_logs_without_counting_them(tmp_path: Path) -> None:
script = tmp_path / "fake-bench.py"
script.write_text(
diff --git a/benchmarks-website/Dockerfile b/benchmarks-website/Dockerfile
deleted file mode 100644
index 1f87a7148b5..00000000000
--- a/benchmarks-website/Dockerfile
+++ /dev/null
@@ -1,16 +0,0 @@
-FROM node:24-alpine AS build
-WORKDIR /app
-COPY package.json package-lock.json ./
-RUN npm ci
-COPY . .
-RUN npm run build
-
-FROM node:24-alpine
-WORKDIR /app
-COPY package.json package-lock.json ./
-RUN npm ci --omit=dev
-COPY --from=build /app/dist ./dist
-COPY server.js .
-COPY src/config.js ./src/config.js
-EXPOSE 3000
-CMD ["node", "server.js"]
diff --git a/benchmarks-website/docker-compose.yml b/benchmarks-website/docker-compose.yml
deleted file mode 100644
index 4c2e9682329..00000000000
--- a/benchmarks-website/docker-compose.yml
+++ /dev/null
@@ -1,15 +0,0 @@
-services:
- benchmarks-website:
- image: ghcr.io/vortex-data/vortex/benchmarks-website:latest
- ports:
- - "80:3000"
- restart: unless-stopped
-
- watchtower:
- image: containrrr/watchtower
- volumes:
- - /var/run/docker.sock:/var/run/docker.sock
- environment:
- - WATCHTOWER_POLL_INTERVAL=60
- - WATCHTOWER_CLEANUP=true
- restart: unless-stopped
diff --git a/benchmarks-website/ec2-init.txt b/benchmarks-website/ec2-init.txt
deleted file mode 100644
index 1c2459b3bee..00000000000
--- a/benchmarks-website/ec2-init.txt
+++ /dev/null
@@ -1,17 +0,0 @@
- 1. Install Docker
- # Amazon Linux 2023
- sudo yum install -y docker
- sudo systemctl enable --now docker
- sudo usermod -aG docker $USER
- newgrp docker
-
- 2. Install Docker Compose plugin
- sudo mkdir -p /usr/local/lib/docker/cli-plugins
- sudo curl -SL https://github.com/docker/compose/releases/latest/download/docker-compose-linux-aarch64 -o /usr/local/lib/docker/cli-plugins/docker-compose
- sudo chmod +x /usr/local/lib/docker/cli-plugins/docker-compose
-
- 3. Set up and start the app
- sudo mkdir -p /opt/benchmarks-website
- sudo cp docker-compose.yml /opt/benchmarks-website/
- cd /opt/benchmarks-website
- docker compose up -d
\ No newline at end of file
diff --git a/benchmarks-website/index.html b/benchmarks-website/index.html
deleted file mode 100644
index e475f3ad254..00000000000
--- a/benchmarks-website/index.html
+++ /dev/null
@@ -1,36 +0,0 @@
-
-
-
-
-
- Vortex Benchmarks
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/benchmarks-website/migrate/Cargo.toml b/benchmarks-website/migrate/Cargo.toml
new file mode 100644
index 00000000000..45a752df397
--- /dev/null
+++ b/benchmarks-website/migrate/Cargo.toml
@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+[package]
+name = "vortex-bench-migrate"
+version = "0.1.0-alpha.0"
+edition = "2024"
+rust-version = "1.91.0"
+license = "Apache-2.0"
+description = "One-shot historical migrator from the v2 benchmarks S3 dataset to a v3 DuckDB file"
+publish = false
+
+[[bin]]
+name = "vortex-bench-migrate"
+path = "src/main.rs"
+
+# Throwaway binary, not part of the vortex-* public API surface.
+# Errors use anyhow, and the crate is intentionally outside the
+# workspace public-api lockfile set.
+
+[dependencies]
+anyhow = { workspace = true }
+arrow-array = { workspace = true }
+arrow-buffer = { workspace = true }
+arrow-schema = { workspace = true }
+clap = { workspace = true, features = ["derive"] }
+# track vortex-duckdb's bundled engine version (build.rs)
+duckdb = { version = "1.10502", features = ["bundled", "appender-arrow"] }
+flate2 = "1.1"
+reqwest = { workspace = true, features = ["json"] }
+serde = { workspace = true, features = ["derive"] }
+serde_json = { workspace = true }
+tokio = { workspace = true, features = ["rt-multi-thread", "macros"] }
+tracing = { workspace = true, features = ["std"] }
+tracing-subscriber = { workspace = true, features = ["env-filter", "fmt"] }
+vortex-bench-server = { path = "../server" }
+vortex-utils = { workspace = true }
+
+[dev-dependencies]
+rstest = { workspace = true }
+tempfile = { workspace = true }
diff --git a/benchmarks-website/migrate/src/classifier.rs b/benchmarks-website/migrate/src/classifier.rs
new file mode 100644
index 00000000000..dfbdb75705b
--- /dev/null
+++ b/benchmarks-website/migrate/src/classifier.rs
@@ -0,0 +1,818 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Bug-for-bug port of v2's `getGroup`, `formatQuery`, and
+//! `normalizeChartName` from the v2 Express server, plus the
+//! mapping from v2 group + name pattern to a v3 fact-table bin.
+//!
+//! The v2 classifier was the source of truth for what historical
+//! records mean. It groups records by name prefix into one of:
+//! "Random Access", "Compression", "Compression Size", or one of the
+//! SQL query suites (with optional fan-out by storage and scale
+//! factor for TPC-H/TPC-DS). This module reproduces that logic and
+//! then hops to a v3 fact-table bin, since v3 stores dim values as
+//! columns instead of name fragments.
+//!
+//! Engine and format strings stored in v3 columns are pulled from the
+//! raw, pre-rename v2 record name. v2's `ENGINE_RENAMES` was a v2
+//! read-time UI concern (e.g. `vortex-file-compressed` rendered as
+//! `vortex` and `parquet-tokio-local-disk` rendered as `parquet-nvme`).
+//! v3 stores canonical `Format::name()` strings to match what the v3
+//! live emitter writes, so historical and live records share series.
+
+use crate::v2::V2Record;
+use crate::v2::dataset_scale_factor;
+
+/// Static port of v2's `QUERY_SUITES`.
+pub const QUERY_SUITES: &[QuerySuite] = &[
+ QuerySuite {
+ prefix: "clickbench",
+ display_name: "Clickbench",
+ query_prefix: "CLICKBENCH",
+ dataset_key: None,
+ fan_out: false,
+ skip: false,
+ },
+ QuerySuite {
+ prefix: "statpopgen",
+ display_name: "Statistical and Population Genetics",
+ query_prefix: "STATPOPGEN",
+ dataset_key: None,
+ fan_out: false,
+ skip: false,
+ },
+ QuerySuite {
+ prefix: "polarsignals",
+ display_name: "PolarSignals Profiling",
+ query_prefix: "POLARSIGNALS",
+ dataset_key: None,
+ fan_out: false,
+ skip: false,
+ },
+ QuerySuite {
+ prefix: "gharchive",
+ display_name: "GhArchive",
+ query_prefix: "GHARCHIVE",
+ dataset_key: None,
+ fan_out: false,
+ skip: false,
+ },
+ QuerySuite {
+ prefix: "tpch",
+ display_name: "TPC-H",
+ query_prefix: "TPC-H",
+ dataset_key: Some("tpch"),
+ fan_out: true,
+ skip: false,
+ },
+ QuerySuite {
+ prefix: "tpcds",
+ display_name: "TPC-DS",
+ query_prefix: "TPC-DS",
+ dataset_key: Some("tpcds"),
+ fan_out: true,
+ skip: false,
+ },
+ QuerySuite {
+ prefix: "fineweb",
+ display_name: "Fineweb",
+ query_prefix: "FINEWEB",
+ dataset_key: None,
+ fan_out: false,
+ skip: false,
+ },
+];
+
+/// Static port of v2's `ENGINE_RENAMES`. Applied to the "series" half
+/// of a benchmark name (the part after the first `/`) before splitting
+/// on `:` into engine/format. Order doesn't matter — keys are unique.
+const ENGINE_RENAMES: &[(&str, &str)] = &[
+ ("datafusion:vortex-file-compressed", "datafusion:vortex"),
+ ("datafusion:parquet", "datafusion:parquet"),
+ ("datafusion:arrow", "datafusion:in-memory-arrow"),
+ ("datafusion:lance", "datafusion:lance"),
+ ("datafusion:vortex-compact", "datafusion:vortex-compact"),
+ ("duckdb:vortex-file-compressed", "duckdb:vortex"),
+ ("duckdb:parquet", "duckdb:parquet"),
+ ("duckdb:duckdb", "duckdb:duckdb"),
+ ("duckdb:vortex-compact", "duckdb:vortex-compact"),
+ ("vortex-tokio-local-disk", "vortex-nvme"),
+ ("vortex-compact-tokio-local-disk", "vortex-compact-nvme"),
+ ("lance-tokio-local-disk", "lance-nvme"),
+ ("parquet-tokio-local-disk", "parquet-nvme"),
+ ("lance", "lance"),
+];
+
+/// One entry of `QUERY_SUITES`.
+#[derive(Debug, Clone, Copy)]
+pub struct QuerySuite {
+ pub prefix: &'static str,
+ pub display_name: &'static str,
+ pub query_prefix: &'static str,
+ pub dataset_key: Option<&'static str>,
+ pub fan_out: bool,
+ pub skip: bool,
+}
+
+/// Group a v2 record falls into. Mirrors v2's `getGroup`,
+/// including the fan-out group naming for TPC-H/TPC-DS.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum V2Group {
+ RandomAccess,
+ Compression,
+ CompressionSize,
+ Query {
+ suite_index: usize,
+ /// `Some` for fan-out suites only.
+ storage: Option,
+ /// `Some` for fan-out suites only.
+ scale_factor: Option,
+ },
+}
+
+impl V2Group {
+ /// Display name as v2 served it from `/api/metadata`.
+ pub fn display_name(&self) -> String {
+ match self {
+ V2Group::RandomAccess => "Random Access".into(),
+ V2Group::Compression => "Compression".into(),
+ V2Group::CompressionSize => "Compression Size".into(),
+ V2Group::Query {
+ suite_index,
+ storage,
+ scale_factor,
+ } => {
+ let suite = &QUERY_SUITES[*suite_index];
+ if let (Some(storage), Some(sf)) = (storage, scale_factor) {
+ format!("{} ({}) (SF={})", suite.display_name, storage, sf)
+ } else {
+ suite.display_name.to_string()
+ }
+ }
+ }
+ }
+}
+
+/// Apply v2's `ENGINE_RENAMES`. Reproduces the JS `rename`:
+/// `RENAMES[s.toLowerCase()] || RENAMES[s] || s`.
+pub fn rename_engine(s: &str) -> String {
+ let lower = s.to_lowercase();
+ for (k, v) in ENGINE_RENAMES {
+ if *k == lower {
+ return (*v).to_string();
+ }
+ }
+ for (k, v) in ENGINE_RENAMES {
+ if *k == s {
+ return (*v).to_string();
+ }
+ }
+ s.to_string()
+}
+
+/// Faithful port of v2's `formatQuery`: maps `clickbench_q07` →
+/// `"CLICKBENCH Q7"`. Returns the original (uppercased,
+/// `-` and `_` replaced with spaces) when no suite matches.
+pub fn format_query(q: &str) -> String {
+ let lower = q.to_lowercase();
+ for suite in QUERY_SUITES {
+ if suite.skip {
+ continue;
+ }
+ let prefix = suite.prefix;
+ if let Some(rest) = lower.strip_prefix(prefix)
+ && let Some(idx) = parse_query_index(rest)
+ {
+ return format!("{} Q{}", suite.query_prefix, idx);
+ }
+ }
+ let mut out = q.to_uppercase();
+ out = out.replace(['_', '-'], " ");
+ out
+}
+
+/// Parse the `_q07` / ` q7` / `q42` tail used by `format_query`.
+/// Returns the integer query index if the tail matches the v2 regex
+/// `^[_ ]?q(\d+)`.
+fn parse_query_index(rest: &str) -> Option<u32> {
+ let after_sep = rest
+ .strip_prefix('_')
+ .or_else(|| rest.strip_prefix(' '))
+ .unwrap_or(rest);
+ let after_q = after_sep
+ .strip_prefix('q')
+ .or_else(|| after_sep.strip_prefix('Q'))?;
+ let digits: String = after_q.chars().take_while(|c| c.is_ascii_digit()).collect();
+ if digits.is_empty() {
+ return None;
+ }
+ digits.parse().ok()
+}
+
+/// Faithful port of v2's `normalizeChartName`.
+pub fn normalize_chart_name(group: &V2Group, chart_name: &str) -> String {
+ if matches!(group, V2Group::CompressionSize) && chart_name == "VORTEX FILE COMPRESSED SIZE" {
+ return "VORTEX SIZE".into();
+ }
+ chart_name.to_string()
+}
+
+/// Port of v2's `getGroup`. Returns `None` for skipped suites
+/// (e.g. `fineweb`) or names that match nothing.
+pub fn get_group(record: &V2Record) -> Option<V2Group> {
+ let lower = record.name.to_lowercase();
+
+ if lower.starts_with("random-access/") || lower.starts_with("random access/") {
+ return Some(V2Group::RandomAccess);
+ }
+
+ if lower.starts_with("vortex size/")
+ || lower.starts_with("vortex-file-compressed size/")
+ || lower.starts_with("parquet size/")
+ || lower.starts_with("parquet-zstd size/")
+ || lower.starts_with("lance size/")
+ || lower.contains(":raw size/")
+ || lower.contains(":parquet-zstd size/")
+ || lower.contains(":lance size/")
+ {
+ return Some(V2Group::CompressionSize);
+ }
+
+ if lower.starts_with("compress time/")
+ || lower.starts_with("decompress time/")
+ || lower.starts_with("parquet_rs-zstd compress")
+ || lower.starts_with("parquet_rs-zstd decompress")
+ || lower.starts_with("lance compress")
+ || lower.starts_with("lance decompress")
+ || lower.starts_with("vortex:lance ratio")
+ || lower.starts_with("vortex:parquet-zstd ratio")
+ // Typo'd v2 emitter wrote `parquet-zst` (no `d`) for some
+ // ratio records; match both spellings so they classify as
+ // derived ratios instead of falling through to Unknown.
+ || lower.starts_with("vortex:parquet-zst ratio")
+ || lower.starts_with("vortex:raw ratio")
+ {
+ return Some(V2Group::Compression);
+ }
+
+ for (i, suite) in QUERY_SUITES.iter().enumerate() {
+ let prefix_q = format!("{}_q", suite.prefix);
+ let prefix_slash = format!("{}/", suite.prefix);
+ if !lower.starts_with(&prefix_q) && !lower.starts_with(&prefix_slash) {
+ continue;
+ }
+ if suite.skip {
+ return None;
+ }
+ if !suite.fan_out {
+ return Some(V2Group::Query {
+ suite_index: i,
+ storage: None,
+ scale_factor: None,
+ });
+ }
+ let storage = match record.storage.as_deref().map(str::to_uppercase).as_deref() {
+ Some("S3") => "S3",
+ _ => "NVMe",
+ };
+ let dataset_key = suite.dataset_key.unwrap_or(suite.prefix);
+ let raw_sf = record
+ .dataset
+ .as_ref()
+ .and_then(|d| dataset_scale_factor(d, dataset_key));
+ let sf = raw_sf
+ .as_deref()
+ .and_then(|s| s.parse::<f64>().ok())
+ .map(|f| f.round() as i64)
+ .unwrap_or(1);
+ return Some(V2Group::Query {
+ suite_index: i,
+ storage: Some(storage.into()),
+ scale_factor: Some(sf.to_string()),
+ });
+ }
+
+ None
+}
+
+/// Group + chart + series breakdown for a v2 record, using the same
+/// rules the v2 server applies in `refresh()`. Equivalent to v2's
+/// `(group, chartName, seriesName)` triple after rename / skip rules.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct V2Classification {
+ pub group: V2Group,
+ pub chart: String,
+ pub series: String,
+}
+
+/// Apply the same chart / series naming v2's `refresh()` does, plus
+/// the throughput / `PARQUET-UNC` skip rules.
+pub fn classify_v2(record: &V2Record) -> Option<V2Classification> {
+ if record.name.contains(" throughput") {
+ return None;
+ }
+ let group = get_group(record)?;
+ let parts: Vec<&str> = record.name.split('/').collect();
+ let (chart, series) = match (&group, parts.len()) {
+ (V2Group::RandomAccess, 4) => {
+ let chart = format!("{}/{}", parts[1], parts[2])
+ .to_uppercase()
+ .replace(['_', '-'], " ");
+ let series = rename_engine(if parts[3].is_empty() {
+ "default"
+ } else {
+ parts[3]
+ });
+ (chart, series)
+ }
+ (V2Group::RandomAccess, 2) => (
+ "RANDOM ACCESS".to_string(),
+ rename_engine(if parts[1].is_empty() {
+ "default"
+ } else {
+ parts[1]
+ }),
+ ),
+ (V2Group::RandomAccess, _) => return None,
+ _ => {
+ let series_raw = if parts.len() >= 2 && !parts[1].is_empty() {
+ parts[1]
+ } else {
+ "default"
+ };
+ let series = rename_engine(series_raw);
+ let chart = format_query(parts[0]);
+ (chart, series)
+ }
+ };
+ let chart = normalize_chart_name(&group, &chart);
+ if chart.contains("PARQUET-UNC") {
+ return None;
+ }
+ Some(V2Classification {
+ group,
+ chart,
+ series,
+ })
+}
+
+/// Mapping target: which v3 fact table a v2 record lands in, plus the
+/// dim values that table needs.
+#[derive(Debug, Clone, PartialEq)]
+pub enum V3Bin {
+ Query {
+ dataset: String,
+ dataset_variant: Option<String>,
+ scale_factor: Option<String>,
+ query_idx: i32,
+ storage: String,
+ engine: String,
+ format: String,
+ },
+ CompressionTime {
+ dataset: String,
+ dataset_variant: Option<String>,
+ format: String,
+ op: String,
+ },
+ CompressionSize {
+ dataset: String,
+ dataset_variant: Option<String>,
+ format: String,
+ },
+ RandomAccess {
+ dataset: String,
+ format: String,
+ },
+}
+
+/// Top-level entry point. Combines `classify_v2` with the v3 fact-table
+/// mapping. Returns `None` for records that:
+///
+/// - Don't match any v2 group (uncategorized prefix).
+/// - Are explicitly skipped by v2 (throughput, PARQUET-UNC, fineweb).
+/// - Are computed-at-read-time ratios that v3 derives from
+/// `compression_sizes` (`vortex:parquet-zstd ratio …`,
+/// `vortex:lance ratio …`, `vortex:raw ratio …`,
+/// `vortex:* size/…`).
+pub fn classify(record: &V2Record) -> Option<V3Bin> {
+ let cls = classify_v2(record)?;
+ match &cls.group {
+ V2Group::RandomAccess => bin_random_access(&cls, record),
+ V2Group::Compression => bin_compression_time(&cls, record),
+ V2Group::CompressionSize => bin_compression_size(&cls, record),
+ V2Group::Query { .. } => bin_query(&cls, record),
+ }
+}
+
+/// Reason the classifier dropped a record. Intentional skips (v2
+/// patterns v3 deliberately doesn't store) are NOT errors; they don't
+/// count against the uncategorized gate.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Skip {
+ /// `vortex:* ratio …` and `vortex:* size` — derived in v3 from
+ /// `compression_sizes` joined to itself.
+ DerivedRatio,
+ /// `throughput` records — v2 derived these from latencies.
+ Throughput,
+ /// A v2 query suite marked `skip: true` in QUERY_SUITES.
+ SkippedSuite,
+ /// random-access record with an unsupported part count.
+ UnsupportedShape,
+ /// Record had no `value` field.
+ NoValue,
+ /// Dim outside the v3 emitter's allowlist (e.g. `parquet-zstd`,
+ /// historical-only suites no longer in CI).
+ Deprecated,
+ /// v2 memory measurements (`*_memory/*` records). Carry top-level
+ /// `peak_physical_memory` / `peak_virtual_memory` /
+ /// `physical_memory_delta` / `virtual_memory_delta` fields that
+ /// `V2Record` doesn't deserialize. Not migrated for alpha; merging
+ /// into the corresponding QueryMeasurement row is future work.
+ HistoricalMemory,
+}
+
+/// Engines the v3 emitter produces today. Anything else is historical
+/// and gets bucketed as `Skip::Deprecated`.
+///
+/// ORCHESTRATOR NOTE: confirm against `vortex-bench`'s `Engine` enum
+/// before handing off; edit if the live set differs.
+const V3_ENGINES: &[&str] = &["datafusion", "duckdb", "vortex", "arrow"];
+
+/// Formats the v3 emitter produces today (`Format::name()` values).
+///
+/// ORCHESTRATOR NOTE: confirm against `vortex-bench/src/lib.rs`
+/// `Format::name()` before handing off.
+const V3_FORMATS: &[&str] = &[
+ "vortex-file-compressed",
+ "vortex-compact",
+ "parquet",
+ "lance",
+ "csv",
+ "arrow",
+ "duckdb",
+];
+
+/// Query suites the v3 CI runs today. Suites outside this list still
+/// classify (so historical analyses stay coherent) but get bucketed
+/// as `Skip::Deprecated` so they don't render as orphan charts in v3.
+///
+/// `fineweb` is included because `.github/workflows/sql-benchmarks.yml`
+/// still has `fineweb` and `fineweb-s3` matrix entries. `gharchive`
+/// stays excluded — it's defined in `vortex-bench` but no current
+/// workflow runs it.
+const V3_QUERY_SUITES: &[&str] = &[
+ "clickbench",
+ "tpch",
+ "tpcds",
+ "statpopgen",
+ "polarsignals",
+ "fineweb",
+];
+
+/// Returns true if every dim that v3 stores as a column is on the
+/// emitter's current allowlist. Dim values outside the allowlist mean
+/// historical-only formats / engines that the v3 UI has nothing to
+/// render against.
+fn is_v3_dim(bin: &V3Bin) -> bool {
+ match bin {
+ V3Bin::Query { engine, format, .. } => {
+ V3_ENGINES.contains(&engine.as_str()) && V3_FORMATS.contains(&format.as_str())
+ }
+ V3Bin::CompressionTime { format, .. }
+ | V3Bin::CompressionSize { format, .. }
+ | V3Bin::RandomAccess { format, .. } => V3_FORMATS.contains(&format.as_str()),
+ }
+}
+
+/// Outcome of running the classifier on a v2 record. Distinguishes
+/// "we know we don't want this" (`Skip`) from "we don't recognize this"
+/// (`Unknown`); the migrator's 5% gate fires only on the latter.
+#[derive(Debug, Clone)]
+pub enum Outcome {
+ Bin(V3Bin),
+ Skip(Skip),
+ Unknown,
+}
+
+/// Like [`classify`], but reports *why* a record was dropped. Intended
+/// for the migrator so the 5% uncategorized gate doesn't trip on
+/// records v2 deliberately doesn't render (ratios, throughput,
+/// skipped suites).
+pub fn classify_outcome(record: &V2Record) -> Outcome {
+ if record.name.contains(" throughput") {
+ return Outcome::Skip(Skip::Throughput);
+ }
+ // v2 memory records: e.g. "clickbench_q07_memory/datafusion:parquet".
+ // Match the `_memory/` infix BEFORE the engine/format split, so they
+ // route to a known Skip variant instead of slipping through to
+ // Outcome::Unknown and tripping the 5% gate.
+ let lower = record.name.to_lowercase();
+ if let Some((head, _)) = lower.split_once('/')
+ && head.ends_with("_memory")
+ {
+ return Outcome::Skip(Skip::HistoricalMemory);
+ }
+ let Some(group) = get_group(record) else {
+ return Outcome::Unknown;
+ };
+ if let V2Group::Query { suite_index, .. } = &group
+ && QUERY_SUITES[*suite_index].skip
+ {
+ return Outcome::Skip(Skip::SkippedSuite);
+ }
+ let Some(cls) = classify_v2(record) else {
+ // get_group succeeded but classify_v2 didn't — shape mismatch.
+ return Outcome::Skip(Skip::UnsupportedShape);
+ };
+ let derived = match &cls.group {
+ V2Group::Compression => {
+ let lc = cls.chart.to_lowercase();
+ lc.contains("ratio") || lc.contains(':')
+ }
+ V2Group::CompressionSize => cls.chart.to_lowercase().contains(':'),
+ _ => false,
+ };
+ if derived {
+ return Outcome::Skip(Skip::DerivedRatio);
+ }
+ let bin = match &cls.group {
+ V2Group::RandomAccess => bin_random_access(&cls, record),
+ V2Group::Compression => bin_compression_time(&cls, record),
+ V2Group::CompressionSize => bin_compression_size(&cls, record),
+ V2Group::Query { .. } => bin_query(&cls, record),
+ };
+ let Some(bin) = bin else {
+ return Outcome::Unknown;
+ };
+ if !is_v3_dim(&bin) {
+ return Outcome::Skip(Skip::Deprecated);
+ }
+ if let V2Group::Query { suite_index, .. } = &group
+ && !V3_QUERY_SUITES.contains(&QUERY_SUITES[*suite_index].prefix)
+ {
+ return Outcome::Skip(Skip::Deprecated);
+ }
+ Outcome::Bin(bin)
+}
+
+fn bin_random_access(cls: &V2Classification, record: &V2Record) -> Option<V3Bin> {
+ // v2 chart name shape: "RANDOM ACCESS" or "DATASET/PATTERN" (uppercase).
+ // We store it as the v3 dataset value verbatim, lowercased so
+ // `/api/groups` returns canonical lowercase names.
+ let dataset = cls.chart.to_lowercase();
+ if dataset.is_empty() {
+ return None;
+ }
+ // Pull format from the raw, pre-rename v2 name so v3 stores the
+ // canonical `Format::name()` string (matching what the v3 live
+ // emitter writes). Raw shape is
+ // `random-access/<dataset>/<pattern>/<format>-tokio-local-disk`
+ // (4-part) or `random-access/<format>-tokio-local-disk` (2-part
+ // legacy). After stripping the `-tokio-local-disk` suffix, map the
+ // v2 random-access ext label (`vortex`, from `Format::ext()`) to
+ // the canonical name (`vortex-file-compressed`, from
+ // `Format::name()`). `parquet` and `lance` match between ext and
+ // name. The `vortex` ext is shared by both `OnDiskVortex` (name
+ // `vortex-file-compressed`) and `VortexCompact` (name
+ // `vortex-compact`), but v2's random-access bench only emitted
+ // `OnDiskVortex`, so mapping to `vortex-file-compressed` is
+ // correct for all historical data.
+ let parts: Vec<&str> = record.name.split('/').collect();
+ let raw = match parts.len() {
+ 4 => parts[3],
+ 2 => parts[1],
+ _ => return None,
+ };
+ if raw.is_empty() || raw == "default" {
+ return None;
+ }
+ let stripped = raw.strip_suffix("-tokio-local-disk").unwrap_or(raw);
+ let format = match stripped {
+ "vortex" => "vortex-file-compressed".to_string(),
+ other => other.to_lowercase(),
+ };
+ Some(V3Bin::RandomAccess { dataset, format })
+}
+
+fn bin_compression_time(cls: &V2Classification, _record: &V2Record) -> Option<V3Bin> {
+ // v2 compression chart names look like (after format_query):
+ // "COMPRESS TIME" [vortex/encode]
+ // "DECOMPRESS TIME" [vortex/decode]
+ // "PARQUET RS ZSTD COMPRESS TIME" [parquet/encode]
+ // "PARQUET RS ZSTD DECOMPRESS TIME" [parquet/decode]
+ // "LANCE COMPRESS TIME" [lance/encode]
+ // "LANCE DECOMPRESS TIME" [lance/decode]
+ // "VORTEX:LANCE RATIO COMPRESS TIME" [drop]
+ // "VORTEX:PARQUET-ZSTD RATIO COMPRESS TIME" [drop]
+ // "VORTEX:RAW RATIO COMPRESS TIME" [drop]
+ let lc = cls.chart.to_lowercase();
+ if lc.contains("ratio") || lc.contains(':') {
+ // Ratios are computed at read time from compression_sizes.
+ return None;
+ }
+ let (format, op) = if lc.starts_with("compress time") {
+ ("vortex-file-compressed", "encode")
+ } else if lc.starts_with("decompress time") {
+ ("vortex-file-compressed", "decode")
+ } else if lc.starts_with("parquet rs zstd compress time") {
+ ("parquet", "encode")
+ } else if lc.starts_with("parquet rs zstd decompress time") {
+ ("parquet", "decode")
+ } else if lc.starts_with("lance compress time") {
+ ("lance", "encode")
+ } else if lc.starts_with("lance decompress time") {
+ ("lance", "decode")
+ } else {
+ return None;
+ };
+ let dataset = cls.series.to_lowercase();
+ if dataset.is_empty() || dataset == "default" {
+ return None;
+ }
+ Some(V3Bin::CompressionTime {
+ dataset,
+ dataset_variant: None,
+ format: format.to_string(),
+ op: op.to_string(),
+ })
+}
+
+fn bin_compression_size(cls: &V2Classification, record: &V2Record) -> Option<V3Bin> {
+ let lc = cls.chart.to_lowercase();
+ // Ratios like "VORTEX:PARQUET ZSTD SIZE" / "VORTEX:LANCE SIZE" /
+ // "VORTEX:RAW SIZE" are derived from compression_sizes at read
+ // time, not stored.
+ if lc.contains(':') {
+ return None;
+ }
+ // `parquet-zstd size` shares a leading "parquet" with `parquet size`,
+ // so check the more specific prefix first. `format_query` upper-cases
+ // and replaces `-`/`_` with spaces, so the chart we match against is
+ // `"PARQUET ZSTD SIZE"` (no hyphen) — same convention as the existing
+ // `"parquet rs zstd compress time"` branches above.
+ let format = if lc.starts_with("vortex size") {
+ "vortex-file-compressed"
+ } else if lc.starts_with("parquet zstd size") {
+ "parquet-zstd"
+ } else if lc.starts_with("parquet size") {
+ "parquet"
+ } else if lc.starts_with("lance size") {
+ "lance"
+ } else {
+ return None;
+ };
+ let dataset = cls.series.to_lowercase();
+ if dataset.is_empty() || dataset == "default" {
+ return None;
+ }
+ // Mirror the file-sizes ingest path's dataset_variant derivation
+ // (see `migrate::migrate_file_sizes`): pull the SF out of the v2
+ // record's `dataset` object when present, drop empty / "1.0".
+ // Without this both code paths produce the same `mid` only by
+ // accident, so SF=10 file-sizes rows wouldn't merge with the
+ // matching data.json.gz "vortex size/tpch" rows.
+ let dataset_variant = record
+ .dataset
+ .as_ref()
+ .and_then(|d| crate::v2::dataset_scale_factor(d, dataset.as_str()))
+ .filter(|s| !s.is_empty() && s.as_str() != "1.0");
+ Some(V3Bin::CompressionSize {
+ dataset,
+ dataset_variant,
+ format: format.to_string(),
+ })
+}
+
+fn bin_query(cls: &V2Classification, record: &V2Record) -> Option<V3Bin> {
+ let V2Group::Query {
+ suite_index,
+ storage,
+ scale_factor,
+ } = &cls.group
+ else {
+ return None;
+ };
+ let suite = &QUERY_SUITES[*suite_index];
+
+ // Pull the query index from the *raw* name's first part instead of
+ // the formatted chart, so we don't have to round-trip "Q07".
+ let raw_first = record.name.split('/').next().unwrap_or("");
+ let query_idx = parse_query_index_from_first(raw_first)?;
+
+ // Pull engine:format from the raw, pre-rename second segment so v3
+ // stores canonical `Format::name()` strings (e.g.
+ // `vortex-file-compressed`) that match what the v3 live emitter
+ // writes. `cls.series` has been through v2's `ENGINE_RENAMES` for
+ // UI display and is not appropriate for v3 columns.
+ //
+ // Older v2 records emitted display-case engines (e.g. `DataFusion`,
+ // `DuckDB`); newer ones emit lowercase. Lowercase here so dedup
+ // collapses both spellings into a single canonical row.
+ let raw_series = record.name.split('/').nth(1)?;
+ let (engine, format) = split_engine_format(raw_series)?;
+ let engine = engine.to_lowercase();
+ let format = format.to_lowercase();
+
+ let storage_v3 = match storage.as_deref() {
+ Some("S3") => "s3".to_string(),
+ Some("NVMe") => "nvme".to_string(),
+ _ => "nvme".to_string(),
+ };
+
+ // ClickBench's "flavor" lives in dataset_variant per benchmark-mapping.md
+ // - we don't have it from a v2 name string, so we leave it None.
+ Some(V3Bin::Query {
+ dataset: suite.prefix.to_string(),
+ dataset_variant: None,
+ scale_factor: scale_factor.clone(),
+ query_idx,
+ storage: storage_v3,
+ engine,
+ format,
+ })
+}
+
+/// Pull the integer query index out of the leading name part, which is
+/// always `<suite>_q<idx>` or `<suite> q<idx>` for SQL query records.
+fn parse_query_index_from_first(first: &str) -> Option<i32> {
+ let lower = first.to_lowercase();
+ for suite in QUERY_SUITES {
+ if let Some(rest) = lower.strip_prefix(suite.prefix)
+ && let Some(idx) = parse_query_index(rest)
+ {
+ return Some(idx as i32);
+ }
+ }
+ None
+}
+
+/// Split a renamed series like `datafusion:parquet` into
+/// `(engine, format)`. Returns `None` for series with no `:` since
+/// v3 requires both columns.
+fn split_engine_format(series: &str) -> Option<(String, String)> {
+ let mut split = series.splitn(2, ':');
+ let engine = split.next()?.trim().to_string();
+ let format = split.next()?.trim().to_string();
+ if engine.is_empty() || format.is_empty() {
+ return None;
+ }
+ Some((engine, format))
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ fn record(name: &str) -> V2Record {
+ V2Record {
+ name: name.to_string(),
+ commit_id: Some("deadbeef".into()),
+ unit: None,
+ value: None,
+ storage: None,
+ dataset: None,
+ all_runtimes: None,
+ env_triple: None,
+ }
+ }
+
+ #[test]
+ fn format_query_round_trips() {
+ assert_eq!(format_query("clickbench_q07"), "CLICKBENCH Q7");
+ assert_eq!(format_query("tpch_q01"), "TPC-H Q1");
+ assert_eq!(format_query("tpcds_q42"), "TPC-DS Q42");
+ assert_eq!(format_query("statpopgen_q3"), "STATPOPGEN Q3");
+ assert_eq!(format_query("foo bar"), "FOO BAR");
+ }
+
+ #[test]
+ fn rename_engine_canonicalizes_disk_names() {
+ assert_eq!(rename_engine("vortex-tokio-local-disk"), "vortex-nvme");
+ assert_eq!(
+ rename_engine("datafusion:vortex-file-compressed"),
+ "datafusion:vortex"
+ );
+ assert_eq!(rename_engine("unknown-engine"), "unknown-engine");
+ }
+
+ #[test]
+ fn parse_query_index_handles_separators() {
+ assert_eq!(parse_query_index("_q07"), Some(7));
+ assert_eq!(parse_query_index(" q7"), Some(7));
+ assert_eq!(parse_query_index("q42"), Some(42));
+ assert_eq!(parse_query_index("xq7"), None);
+ }
+
+ #[test]
+ fn random_access_bins_dataset_pattern() {
+ let bin = classify(&record("random-access/taxi/take/parquet")).unwrap();
+ assert_eq!(
+ bin,
+ V3Bin::RandomAccess {
+ dataset: "taxi/take".into(),
+ format: "parquet".into(),
+ }
+ );
+ }
+}
diff --git a/benchmarks-website/migrate/src/commits.rs b/benchmarks-website/migrate/src/commits.rs
new file mode 100644
index 00000000000..28d63a5bd19
--- /dev/null
+++ b/benchmarks-website/migrate/src/commits.rs
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Commit upserts. Adapts a [`crate::v2::V2Commit`] into the v3
+//! `commits` row shape (a [`vortex_bench_server::records::CommitInfo`]).
+
+use anyhow::Context as _;
+use anyhow::Result;
+use duckdb::Transaction;
+use duckdb::params;
+
+use crate::v2::V2Commit;
+
+/// Insert a v3 `commits` row for one v2 commit. Missing fields are
+/// filled with the empty string, matching the v3 schema's `NOT NULL`
+/// constraints; the call site logs a warning for each fallback so
+/// the operator can spot bad inputs.
+pub fn upsert_commit(tx: &Transaction<'_>, commit: &V2Commit) -> Result<UpsertOutcome> {
+ let mut warnings = Vec::new();
+ let timestamp = require_field(&commit.timestamp, "timestamp", &commit.id, &mut warnings);
+ let message = require_field(&commit.message, "message", &commit.id, &mut warnings);
+ let author_name = require_field(
+ &commit.author.as_ref().and_then(|p| p.name.clone()),
+ "author.name",
+ &commit.id,
+ &mut warnings,
+ );
+ let author_email = require_field(
+ &commit.author.as_ref().and_then(|p| p.email.clone()),
+ "author.email",
+ &commit.id,
+ &mut warnings,
+ );
+ let committer_name = require_field(
+ &commit.committer.as_ref().and_then(|p| p.name.clone()),
+ "committer.name",
+ &commit.id,
+ &mut warnings,
+ );
+ let committer_email = require_field(
+ &commit.committer.as_ref().and_then(|p| p.email.clone()),
+ "committer.email",
+ &commit.id,
+ &mut warnings,
+ );
+ let tree_sha = require_field(&commit.tree_id, "tree_id", &commit.id, &mut warnings);
+ let url = require_field(&commit.url, "url", &commit.id, &mut warnings);
+
+ tx.execute(
+ r#"
+ INSERT INTO commits (
+ commit_sha, timestamp, message, author_name, author_email,
+ committer_name, committer_email, tree_sha, url
+ ) VALUES (?, CAST(? AS TIMESTAMPTZ), ?, ?, ?, ?, ?, ?, ?)
+ ON CONFLICT (commit_sha) DO UPDATE SET
+ timestamp = excluded.timestamp,
+ message = excluded.message,
+ author_name = excluded.author_name,
+ author_email = excluded.author_email,
+ committer_name = excluded.committer_name,
+ committer_email = excluded.committer_email,
+ tree_sha = excluded.tree_sha,
+ url = excluded.url
+ "#,
+ params![
+ commit.id,
+ timestamp,
+ message,
+ author_name,
+ author_email,
+ committer_name,
+ committer_email,
+ tree_sha,
+ url,
+ ],
+ )
+ .with_context(|| format!("upserting commit {}", commit.id))?;
+ Ok(UpsertOutcome { warnings })
+}
+
+fn require_field(
+    field: &Option<String>,
+    name: &str,
+    sha: &str,
+    warnings: &mut Vec<String>,
+) -> String {
+ match field {
+ Some(s) => s.clone(),
+ None => {
+ warnings.push(format!("commit {sha} missing {name}"));
+ String::new()
+ }
+ }
+}
+
+/// Per-call warning bag returned to the caller for logging.
+#[derive(Debug, Default)]
+pub struct UpsertOutcome {
+    pub warnings: Vec<String>,
+}
diff --git a/benchmarks-website/migrate/src/lib.rs b/benchmarks-website/migrate/src/lib.rs
new file mode 100644
index 00000000000..f02db73b4b7
--- /dev/null
+++ b/benchmarks-website/migrate/src/lib.rs
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! One-shot historical migrator from v2's S3-hosted benchmark dataset
+//! to a v3 DuckDB file.
+//!
+//! The v2 dataset is JSONL of bare benchmark records keyed by name string.
+//! v3 uses five typed fact tables with explicit dim columns. This crate
+//! ports v2's `getGroup` classifier from the v2 Express server
+//! bug-for-bug so that historical rows survive the migration with the
+//! same group / chart / series structure as the live v2 server.
+//!
+//! The migrator is throwaway: once v3 cuts over, both the binary and
+//! the classifier go away.
+
+pub mod classifier;
+pub mod commits;
+pub mod migrate;
+pub mod source;
+pub mod v2;
+pub mod verify;
diff --git a/benchmarks-website/migrate/src/main.rs b/benchmarks-website/migrate/src/main.rs
new file mode 100644
index 00000000000..366834ed441
--- /dev/null
+++ b/benchmarks-website/migrate/src/main.rs
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! `vortex-bench-migrate` CLI: a one-shot historical migrator from
+//! v2's S3 dataset into a v3 DuckDB file, plus a structural diff
+//! against the live v2 `/api/metadata` endpoint for spotting
+//! classifier regressions.
+
+use std::path::PathBuf;
+use std::process::ExitCode;
+
+use anyhow::Context as _;
+use anyhow::Result;
+use clap::Parser;
+use clap::Subcommand;
+use clap::ValueEnum;
+use tracing_subscriber::EnvFilter;
+use vortex_bench_migrate::migrate;
+use vortex_bench_migrate::source::Source;
+use vortex_bench_migrate::verify;
+
+/// One-shot historical migrator from v2's S3 dataset to v3 DuckDB.
+#[derive(Debug, Parser)]
+#[command(name = "vortex-bench-migrate", version, about)]
+struct Cli {
+ #[command(subcommand)]
+ command: Command,
+}
+
+#[derive(Debug, Subcommand)]
+enum Command {
+ /// Read v2's data.json.gz / commits.json / file-sizes-*.json.gz
+ /// and write a fully populated v3 DuckDB at `--output`.
+ Run {
+ /// Path to write the v3 DuckDB to. Created if absent.
+ #[arg(long)]
+ output: PathBuf,
+ /// Where to fetch v2 dumps from.
+ #[arg(long, value_enum, default_value_t = SourceKind::PublicS3)]
+ source: SourceKind,
+ /// For `--source=local`, the directory containing
+ /// `data.json.gz`, `commits.json`, and `file-sizes-*.json.gz`.
+ #[arg(long, required_if_eq("source", "local"))]
+        source_dir: Option<PathBuf>,
+ },
+ /// Diff a migrated DuckDB against the live v2 `/api/metadata`
+ /// endpoint. Exits 0 if every v2 group is present in v3, 1
+ /// otherwise so this can gate a CI step.
+ Verify {
+ /// HTTPS root of a running v2 server (e.g. `https://bench.vortex.dev`).
+ #[arg(long)]
+ against: String,
+ /// Path to the migrated v3 DuckDB.
+ #[arg(long)]
+ duckdb: PathBuf,
+ },
+}
+
+#[derive(Debug, Clone, Copy, ValueEnum)]
+enum SourceKind {
+ PublicS3,
+ Local,
+}
+
+fn main() -> ExitCode {
+ if let Err(err) = run() {
+ eprintln!("error: {err:#}");
+ return ExitCode::from(2);
+ }
+ ExitCode::SUCCESS
+}
+
+fn run() -> Result<()> {
+ tracing_subscriber::fmt()
+ .with_env_filter(
+ EnvFilter::try_from_env("VORTEX_BENCH_LOG").unwrap_or_else(|_| EnvFilter::new("info")),
+ )
+ .init();
+
+ let cli = Cli::parse();
+ match cli.command {
+ Command::Run {
+ output,
+ source,
+ source_dir,
+ } => {
+ let source = match source {
+ SourceKind::PublicS3 => Source::PublicS3,
+ SourceKind::Local => {
+ Source::Local(source_dir.context("--source=local requires --source-dir")?)
+ }
+ };
+ let summary = migrate::run(&source, &output)?;
+ print!("{summary}");
+ if summary.uncategorized_fraction() > 0.05 {
+ anyhow::bail!(
+ "uncategorized records ({:.2}%) exceed the 5% gate; \
+ stop and report unmatched prefixes (see summary above) \
+ before proceeding",
+ 100.0 * summary.uncategorized_fraction()
+ );
+ }
+ Ok(())
+ }
+ Command::Verify { against, duckdb } => {
+ let report = verify::run(&against, &duckdb)?;
+ print!("{report}");
+ if !report.v2_groups_covered() {
+ std::process::exit(1);
+ }
+ Ok(())
+ }
+ }
+}
diff --git a/benchmarks-website/migrate/src/migrate.rs b/benchmarks-website/migrate/src/migrate.rs
new file mode 100644
index 00000000000..7b3b32bb51c
--- /dev/null
+++ b/benchmarks-website/migrate/src/migrate.rs
@@ -0,0 +1,836 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! End-to-end migration of one v2 dataset into a v3 DuckDB file.
+//!
+//! Streams `data.json.gz` line-by-line, runs each record through the
+//! [`classifier`], and writes one row per record into the appropriate v3 fact table.
+//! Every row's `measurement_id` is computed via the server's `measurement_id_*` functions so the
+//! result is byte-compatible with what fresh `/api/ingest` would have produced.
+//!
+//! Bulk-load shape: rows are accumulated in memory as parallel column
+//! vectors, deduplicated by `measurement_id`, then flushed to DuckDB
+//! via `Appender::append_record_batch` as one Arrow `RecordBatch` per
+//! fact table.
+
+use std::collections::BTreeMap;
+use std::io::BufRead;
+use std::path::Path;
+use std::sync::Arc;
+use std::time::Duration;
+use std::time::Instant;
+
+use anyhow::Context as _;
+use anyhow::Result;
+use arrow_array::ArrayRef;
+use arrow_array::Int32Array;
+use arrow_array::Int64Array;
+use arrow_array::ListArray;
+use arrow_array::RecordBatch;
+use arrow_array::StringArray;
+use arrow_buffer::OffsetBuffer;
+use arrow_schema::DataType;
+use arrow_schema::Field;
+use arrow_schema::Schema;
+use duckdb::Connection;
+use tracing::info;
+use tracing::warn;
+use vortex_bench_server::db::measurement_id_compression_size;
+use vortex_bench_server::db::measurement_id_compression_time;
+use vortex_bench_server::db::measurement_id_query;
+use vortex_bench_server::db::measurement_id_random_access;
+use vortex_bench_server::records::CompressionSize;
+use vortex_bench_server::records::CompressionTime;
+use vortex_bench_server::records::QueryMeasurement;
+use vortex_bench_server::records::RandomAccessTime;
+use vortex_bench_server::schema::SCHEMA_DDL;
+use vortex_utils::aliases::hash_map::HashMap;
+
+use crate::classifier;
+use crate::classifier::V3Bin;
+use crate::commits::upsert_commit;
+use crate::source::Source;
+use crate::v2::V2Commit;
+use crate::v2::V2FileSize;
+use crate::v2::V2Record;
+use crate::v2::index_commits;
+use crate::v2::runtime_as_i64;
+use crate::v2::value_as_f64;
+
+/// Per-table insert counts, plus skip / missing counts.
+#[derive(Debug, Default, Clone)]
+pub struct MigrationSummary {
+ pub records_read: u64,
+ pub query_inserted: u64,
+ pub compression_time_inserted: u64,
+ pub compression_size_inserted: u64,
+ pub random_access_inserted: u64,
+ pub file_size_inserted: u64,
+ pub uncategorized: u64,
+    pub uncategorized_prefixes: BTreeMap<String, u64>,
+ pub missing_commit: u64,
+ pub commit_warnings: u64,
+ pub skipped_no_value: u64,
+ pub skipped_intentional: u64,
+ pub commits_inserted: u64,
+ pub deduped: u64,
+ /// Number of records dropped by dedup whose `value_ns` (or
+ /// `value_bytes` for compression_sizes' replace path) differed
+ /// from the kept row's. Non-zero is a smell worth investigating.
+ pub deduped_with_conflict: u64,
+}
+
+impl MigrationSummary {
+ /// Total `data.json.gz` records that landed in some v3 fact table.
+ pub fn total_inserted(&self) -> u64 {
+ self.query_inserted
+ + self.compression_time_inserted
+ + self.compression_size_inserted
+ + self.random_access_inserted
+ }
+
+ /// Fraction of records that were uncategorized. The orchestrator
+ /// stops if this exceeds the documented 5% threshold.
+ pub fn uncategorized_fraction(&self) -> f64 {
+ if self.records_read == 0 {
+ return 0.0;
+ }
+ self.uncategorized as f64 / self.records_read as f64
+ }
+}
+
+/// Open or create a DuckDB at `path` and apply the v3 schema. The
+/// migrator is a one-shot fresh load; the bulk-append flush is pure
+/// insert (no `ON CONFLICT`), so any stale rows in `path` would clash
+/// with the next run on the same primary keys. Delete both the
+/// database file and its WAL companion up front so every run starts
+/// from a known-empty state.
+pub fn open_target_db(path: &Path) -> Result<Connection> {
+ remove_if_exists(path)?;
+ let wal = wal_path(path);
+ remove_if_exists(&wal)?;
+ let conn =
+ Connection::open(path).with_context(|| format!("opening DuckDB at {}", path.display()))?;
+ conn.execute_batch(SCHEMA_DDL)
+ .context("applying v3 schema DDL")?;
+ Ok(conn)
+}
+
+fn remove_if_exists(path: &Path) -> Result<()> {
+ match std::fs::remove_file(path) {
+ Ok(()) => {
+ info!(path = %path.display(), "removed pre-existing target file");
+ Ok(())
+ }
+ Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(()),
+ Err(e) => Err(e).with_context(|| format!("removing {}", path.display())),
+ }
+}
+
+/// DuckDB writes its write-ahead log next to the database file with a
+/// `.wal` suffix appended (e.g. `v3.duckdb` -> `v3.duckdb.wal`).
+fn wal_path(path: &Path) -> std::path::PathBuf {
+ let mut name = path.as_os_str().to_owned();
+ name.push(".wal");
+ std::path::PathBuf::from(name)
+}
+
+/// Run the whole migration: commits, data.json.gz, and every
+/// file-sizes-*.json.gz under the source.
+pub fn run(source: &Source, target: &Path) -> Result<MigrationSummary> {
+ let mut conn = open_target_db(target)?;
+ let mut summary = MigrationSummary::default();
+
+ info!(source = %source.describe(), "Reading commits.json");
+ let commits = read_commits(source)?;
+ info!(commits = commits.len(), "Loaded commits");
+ summary.commits_inserted = upsert_all_commits(&mut conn, &commits, &mut summary)?;
+
+ let mut q = QueryAccum::default();
+ let mut ct = CompressionTimeAccum::default();
+ let mut cs = CompressionSizeAccum::default();
+ let mut ra = RandomAccessAccum::default();
+
+ info!("Migrating data.json.gz");
+ migrate_data_jsonl(
+ source,
+ &commits,
+ &mut summary,
+ &mut q,
+ &mut ct,
+ &mut cs,
+ &mut ra,
+ )?;
+ info!(records = summary.records_read, "data.json.gz done");
+
+ for name in source.list_file_sizes()? {
+ info!(name = %name, "Migrating file-sizes");
+ if let Err(e) = migrate_file_sizes(source, &name, &commits, &mut summary, &mut cs) {
+ warn!("file-sizes file {name} failed: {e:#}");
+ }
+ }
+
+ info!("Flushing accumulators to DuckDB");
+ summary.query_inserted = q.measurement_id.len() as u64;
+ summary.compression_time_inserted = ct.measurement_id.len() as u64;
+ summary.random_access_inserted = ra.measurement_id.len() as u64;
+ summary.compression_size_inserted = cs.rows.len() as u64;
+
+ flush(&conn, "query_measurements", build_query_batch(q)?)?;
+ flush(
+ &conn,
+ "compression_times",
+ build_compression_time_batch(ct)?,
+ )?;
+ flush(&conn, "random_access_times", build_random_access_batch(ra)?)?;
+ flush(
+ &conn,
+ "compression_sizes",
+ build_compression_size_batch(cs)?,
+ )?;
+
+ Ok(summary)
+}
+
+fn read_commits(source: &Source) -> Result<BTreeMap<String, V2Commit>> {
+ let reader = source.open_commits_jsonl()?;
+    let mut commits: Vec<V2Commit> = Vec::new();
+ for line in reader.lines() {
+ let line = line?;
+ let trimmed = line.trim();
+ if trimmed.is_empty() {
+ continue;
+ }
+        match serde_json::from_str::<V2Commit>(trimmed) {
+ Ok(c) => commits.push(c),
+ Err(e) => warn!("skipping malformed commits.json line: {e}"),
+ }
+ }
+ Ok(index_commits(commits))
+}
+
+fn upsert_all_commits(
+    conn: &mut Connection,
+    commits: &BTreeMap<String, V2Commit>,
+    summary: &mut MigrationSummary,
+) -> Result<u64> {
+ let tx = conn.transaction().context("begin commits transaction")?;
+ let mut count = 0u64;
+ for commit in commits.values() {
+ let outcome = upsert_commit(&tx, commit)?;
+ for w in outcome.warnings {
+ warn!("{w}");
+ summary.commit_warnings += 1;
+ }
+ count += 1;
+ }
+ tx.commit().context("commit commits transaction")?;
+ Ok(count)
+}
+
+/// Stream `data.json.gz` and push classified records into the
+/// per-table accumulators. Dedup happens inside each accumulator's
+/// `push` method by `measurement_id`.
+fn migrate_data_jsonl(
+ source: &Source,
+    commits: &BTreeMap<String, V2Commit>,
+ summary: &mut MigrationSummary,
+ q: &mut QueryAccum,
+ ct: &mut CompressionTimeAccum,
+ cs: &mut CompressionSizeAccum,
+ ra: &mut RandomAccessAccum,
+) -> Result<()> {
+ let reader = source.open_data_jsonl()?;
+ let started = Instant::now();
+ let mut last_log = Instant::now();
+ for line in reader.lines() {
+ let line = line?;
+ let trimmed = line.trim();
+ if trimmed.is_empty() {
+ continue;
+ }
+ summary.records_read += 1;
+ let record: V2Record = match serde_json::from_str(trimmed) {
+ Ok(r) => r,
+ Err(e) => {
+ warn!("skipping malformed data.json line: {e}");
+ continue;
+ }
+ };
+ apply_v2_record(&record, commits, summary, q, ct, cs, ra);
+ if last_log.elapsed() >= Duration::from_secs(5) {
+ let elapsed = started.elapsed().as_secs_f64();
+ let rate = summary.records_read as f64 / elapsed.max(0.001);
+ info!(
+ records = summary.records_read,
+ rate = format!("{rate:.0}/s"),
+ query = q.measurement_id.len(),
+ compression_time = ct.measurement_id.len(),
+ compression_size = cs.rows.len(),
+ random_access = ra.measurement_id.len(),
+ "migration progress",
+ );
+ last_log = Instant::now();
+ }
+ }
+ Ok(())
+}
+
+fn apply_v2_record(
+ record: &V2Record,
+    commits: &BTreeMap<String, V2Commit>,
+ summary: &mut MigrationSummary,
+ q: &mut QueryAccum,
+ ct: &mut CompressionTimeAccum,
+ cs: &mut CompressionSizeAccum,
+ ra: &mut RandomAccessAccum,
+) {
+ let Some(sha) = record.commit_id.clone() else {
+ summary.missing_commit += 1;
+ return;
+ };
+ if !commits.contains_key(&sha) {
+ summary.missing_commit += 1;
+ return;
+ }
+
+ let bin = match classifier::classify_outcome(record) {
+ classifier::Outcome::Bin(b) => b,
+ classifier::Outcome::Skip(_) => {
+ summary.skipped_intentional += 1;
+ return;
+ }
+ classifier::Outcome::Unknown => {
+ summary.uncategorized += 1;
+ let prefix = record.name.split('/').next().unwrap_or("").to_string();
+ *summary.uncategorized_prefixes.entry(prefix).or_insert(0) += 1;
+ return;
+ }
+ };
+
+ let env_triple = record.env_triple.as_ref().and_then(|t| t.to_triple());
+ let runtimes = record
+ .all_runtimes
+ .as_ref()
+        .map(|v| v.iter().filter_map(runtime_as_i64).collect::<Vec<_>>())
+ .unwrap_or_default();
+ let value_f64 = match record.value.as_ref().and_then(value_as_f64) {
+ Some(v) => v,
+ None => {
+ summary.skipped_no_value += 1;
+ return;
+ }
+ };
+
+ match bin {
+ V3Bin::Query {
+ dataset,
+ dataset_variant,
+ scale_factor,
+ query_idx,
+ storage,
+ engine,
+ format,
+ } => {
+ let qm = QueryMeasurement {
+ commit_sha: sha,
+ dataset,
+ dataset_variant,
+ scale_factor,
+ query_idx,
+ storage,
+ engine,
+ format,
+ value_ns: value_f64 as i64,
+ all_runtimes_ns: runtimes,
+ peak_physical: None,
+ peak_virtual: None,
+ physical_delta: None,
+ virtual_delta: None,
+ env_triple,
+ };
+ let mid = measurement_id_query(&qm);
+ q.push(mid, qm, summary);
+ }
+ V3Bin::CompressionTime {
+ dataset,
+ dataset_variant,
+ format,
+ op,
+ } => {
+ let ctr = CompressionTime {
+ commit_sha: sha,
+ dataset,
+ dataset_variant,
+ format,
+ op,
+ value_ns: value_f64 as i64,
+ all_runtimes_ns: runtimes,
+ env_triple,
+ };
+ let mid = measurement_id_compression_time(&ctr);
+ ct.push(mid, ctr, summary);
+ }
+ V3Bin::CompressionSize {
+ dataset,
+ dataset_variant,
+ format,
+ } => {
+ let csr = CompressionSize {
+ commit_sha: sha,
+ dataset,
+ dataset_variant,
+ format,
+ value_bytes: value_f64 as i64,
+ };
+ let mid = measurement_id_compression_size(&csr);
+ cs.push_replace(mid, csr, summary);
+ }
+ V3Bin::RandomAccess { dataset, format } => {
+ let rar = RandomAccessTime {
+ commit_sha: sha,
+ dataset,
+ format,
+ value_ns: value_f64 as i64,
+ all_runtimes_ns: runtimes,
+ env_triple,
+ };
+ let mid = measurement_id_random_access(&rar);
+ ra.push(mid, rar, summary);
+ }
+ }
+}
+
+fn migrate_file_sizes(
+ source: &Source,
+ name: &str,
+    commits: &BTreeMap<String, V2Commit>,
+ summary: &mut MigrationSummary,
+ cs: &mut CompressionSizeAccum,
+) -> Result<()> {
+ let reader = source.open_file_sizes(name)?;
+ let dataset_fallback = name
+ .strip_prefix("file-sizes-")
+ .and_then(|s| s.strip_suffix(".json.gz"))
+ .unwrap_or(name)
+ .to_string();
+ let started = Instant::now();
+ let mut last_log = Instant::now();
+ for line in reader.lines() {
+ let line = line?;
+ let trimmed = line.trim();
+ if trimmed.is_empty() {
+ continue;
+ }
+ let sz: V2FileSize = match serde_json::from_str(trimmed) {
+ Ok(r) => r,
+ Err(e) => {
+ warn!("skipping malformed {name} line: {e}");
+ continue;
+ }
+ };
+ if !commits.contains_key(&sz.commit_id) {
+ summary.missing_commit += 1;
+ continue;
+ }
+ let dataset = if sz.benchmark.is_empty() {
+ dataset_fallback.clone()
+ } else {
+ sz.benchmark.clone()
+ };
+ let dataset_variant = sz
+ .scale_factor
+ .as_ref()
+ .filter(|s| !s.is_empty() && s.as_str() != "1.0")
+ .cloned();
+ let csr = CompressionSize {
+ commit_sha: sz.commit_id.clone(),
+ dataset,
+ dataset_variant,
+ format: sz.format.clone(),
+ value_bytes: sz.size_bytes,
+ };
+ let mid = measurement_id_compression_size(&csr);
+ cs.push_sum(mid, csr);
+ summary.file_size_inserted += 1;
+ if last_log.elapsed() >= Duration::from_secs(5) {
+ let elapsed = started.elapsed().as_secs_f64();
+ let rate = summary.file_size_inserted as f64 / elapsed.max(0.001);
+ info!(
+ name = %name,
+ file_sizes = summary.file_size_inserted,
+ rate = format!("{rate:.0}/s"),
+ "file-sizes progress",
+ );
+ last_log = Instant::now();
+ }
+ }
+ Ok(())
+}
+
+/// Append an Arrow `RecordBatch` to a DuckDB table via `Appender`.
+fn flush(conn: &Connection, table: &str, batch: RecordBatch) -> Result<()> {
+ let mut app = conn
+ .appender(table)
+ .with_context(|| format!("opening appender for {table}"))?;
+ app.append_record_batch(batch)
+ .with_context(|| format!("appending record batch to {table}"))?;
+ drop(app);
+ Ok(())
+}
+
+#[derive(Default)]
+struct QueryAccum {
+ measurement_id: Vec,
+ commit_sha: Vec,
+ dataset: Vec,
+ dataset_variant: Vec