diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml index 229f5e1ebf3..4e6da589658 100644 --- a/.github/workflows/codspeed.yml +++ b/.github/workflows/codspeed.yml @@ -65,3 +65,43 @@ jobs: run: bash scripts/bench-taskset.sh cargo codspeed run token: ${{ secrets.CODSPEED_TOKEN }} mode: "simulation" + + bench-codspeed-cuda: + if: github.repository == 'vortex-data/vortex' + strategy: + matrix: + include: + - { shard: 1, name: "Bitpacked", benches: "bitpacked_cuda" } + - { shard: 2, name: "Dynamic dispatch", benches: "dynamic_dispatch_cuda" } + - { shard: 3, name: "Standalone kernels", benches: "alp_cuda date_time_parts_cuda dict_cuda for_cuda runend_cuda throughput_cuda" } + - { shard: 4, name: "NVIDIA kernels", benches: "filter_cuda zstd_cuda" } + name: "Benchmark with Codspeed (CUDA Shard #${{ matrix.shard }} - ${{ matrix.name }})" + timeout-minutes: 30 + runs-on: runs-on=${{ github.run_id }}/family=g5/image=ubuntu24-gpu-x64/tag=bench-codspeed-cuda-${{ matrix.shard }} + steps: + - uses: runs-on/action@v2 + with: + sccache: s3 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: ./.github/actions/setup-rust + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + - name: Display NVIDIA SMI details + run: | + nvidia-smi + nvidia-smi -L + nvidia-smi -q -d Memory + - name: Install Codspeed + uses: taiki-e/cache-cargo-install-action@66c9585ef5ca780ee69399975a5e911f47905995 + with: + tool: cargo-codspeed + - name: Build benchmarks + run: cargo codspeed build -m walltime -p vortex-cuda --profile bench + - name: Run benchmarks + uses: CodSpeedHQ/action@d872884a306dd4853acf0f584f4b706cf0cc72a2 + env: + CARGO_MANIFEST_DIR: ${{ github.workspace }}/vortex-cuda + with: + run: cargo codspeed run $(printf -- '--bench %s ' ${{ matrix.benches }}) + token: ${{ secrets.CODSPEED_TOKEN }} + mode: "walltime" diff --git a/vortex-cuda/benches/alp_cuda.rs b/vortex-cuda/benches/alp_cuda.rs index 0f006018a58..b5c1dd5f1e1 100644 --- 
a/vortex-cuda/benches/alp_cuda.rs +++ b/vortex-cuda/benches/alp_cuda.rs @@ -6,7 +6,8 @@ #![expect(clippy::unwrap_used)] #![expect(clippy::cast_possible_truncation)] -mod common; +mod bench_config; +mod timed_launch_strategy; use std::mem::size_of; use std::sync::Arc; @@ -37,7 +38,7 @@ use vortex_cuda::executor::CudaArrayExt; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; -use crate::common::TimedLaunchStrategy; +use crate::timed_launch_strategy::TimedLaunchStrategy; const N_ROWS: usize = 100_000_000; @@ -133,11 +134,7 @@ fn benchmark_alp_decode(c: &mut Criterion) { criterion::criterion_group! { name = benches; - config = Criterion::default().without_plots() - .sample_size(10) - .warm_up_time(Duration::from_nanos(1)) - .measurement_time(Duration::from_nanos(1)) - .nresamples(10); + config = bench_config::cuda_bench_config(); targets = benchmark_alp_decode } diff --git a/vortex-cuda/benches/bench_config/mod.rs b/vortex-cuda/benches/bench_config/mod.rs new file mode 100644 index 00000000000..fd8db1cbf63 --- /dev/null +++ b/vortex-cuda/benches/bench_config/mod.rs @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::time::Duration; + +use criterion::Criterion; + +/// Returns a [`Criterion`] configuration tuned for CUDA benchmarks. +/// +/// All benchmarks use `iter_custom` with precise CUDA event timing. +/// criterion's iteration planner estimates `iters` from **wall time** during +/// warmup, which includes GPU context setup and memory copies — not just +/// the kernel. Setting `measurement_time = 1ns` forces `iters = 1` so +/// each sample is exactly one `iter_custom` call returning GPU-timed duration. +/// Stability comes from independent samples (one kernel launch each) +/// rather than many iterations per sample. 
+/// +/// `warm_up_time` runs at least one full iteration before sampling, giving +/// the GPU a chance to reach steady state (clock boost, cache warming). +/// If a single launch exceeds the warm-up budget, criterion still completes +/// it before moving on. +pub(super) fn cuda_bench_config() -> Criterion { + // Number of independent kernel launches. + let sample_size = 10; + + Criterion::default() + .without_plots() + .sample_size(sample_size) + // A 1 ns budget still runs at least one full warm-up iteration — + // criterion always finishes the in-flight iteration even if the + // budget is exceeded — which JIT-compiles kernels and warms GPU caches. + .warm_up_time(Duration::from_nanos(1)) + // Forces `iters = 1`: criterion's planner estimates iteration cost + // from wall time (which includes GPU context setup), not the + // GPU-timed duration returned by `iter_custom`. A real + // measurement_time would cause wildly inflated iteration counts. + .measurement_time(Duration::from_nanos(1)) +} diff --git a/vortex-cuda/benches/bitpacked_cuda.rs b/vortex-cuda/benches/bitpacked_cuda.rs index 7568db82ec5..ab8fb3e32f3 100644 --- a/vortex-cuda/benches/bitpacked_cuda.rs +++ b/vortex-cuda/benches/bitpacked_cuda.rs @@ -6,7 +6,8 @@ #![expect(clippy::unwrap_used)] #![expect(clippy::cast_possible_truncation)] -mod common; +mod bench_config; +mod timed_launch_strategy; use std::mem::size_of; use std::ops::Add; @@ -37,7 +38,7 @@ use vortex_cuda::executor::CudaArrayExt; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; -use crate::common::TimedLaunchStrategy; +use crate::timed_launch_strategy::TimedLaunchStrategy; const N_ROWS: usize = 100_000_000; @@ -199,11 +200,7 @@ fn benchmark_bitunpack_with_patches(c: &mut Criterion) { criterion::criterion_group! 
{ name = benches; - config = Criterion::default().without_plots() - .sample_size(10) - .warm_up_time(Duration::from_nanos(1)) - .measurement_time(Duration::from_nanos(1)) - .nresamples(10); + config = bench_config::cuda_bench_config(); targets = benchmark_bitunpack, benchmark_bitunpack_with_patches } diff --git a/vortex-cuda/benches/date_time_parts_cuda.rs b/vortex-cuda/benches/date_time_parts_cuda.rs index 59a8c8aa9df..74e29757558 100644 --- a/vortex-cuda/benches/date_time_parts_cuda.rs +++ b/vortex-cuda/benches/date_time_parts_cuda.rs @@ -6,7 +6,8 @@ #![expect(clippy::unwrap_used)] #![expect(clippy::cast_possible_truncation)] -mod common; +mod bench_config; +mod timed_launch_strategy; use std::mem::size_of; use std::sync::Arc; @@ -36,7 +37,7 @@ use vortex_cuda::executor::CudaArrayExt; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; -use crate::common::TimedLaunchStrategy; +use crate::timed_launch_strategy::TimedLaunchStrategy; fn make_datetimeparts_array(len: usize, time_unit: TimeUnit) -> DateTimePartsArray { let days: Vec = (0..len).map(|i| (i / 1000) as i16).collect(); @@ -89,11 +90,7 @@ fn benchmark_datetimeparts(c: &mut Criterion) { criterion::criterion_group! 
{ name = benches; - config = Criterion::default().without_plots() - .sample_size(10) - .warm_up_time(Duration::from_nanos(1)) - .measurement_time(Duration::from_nanos(1)) - .nresamples(10); + config = bench_config::cuda_bench_config(); targets = benchmark_datetimeparts } diff --git a/vortex-cuda/benches/dict_cuda.rs b/vortex-cuda/benches/dict_cuda.rs index a7371f3b3ad..8f4b366fcf1 100644 --- a/vortex-cuda/benches/dict_cuda.rs +++ b/vortex-cuda/benches/dict_cuda.rs @@ -6,7 +6,8 @@ #![expect(clippy::unwrap_used)] #![expect(clippy::cast_possible_truncation)] -mod common; +mod bench_config; +mod timed_launch_strategy; use std::fmt::Debug; use std::mem::size_of; @@ -33,7 +34,7 @@ use vortex_cuda::executor::CudaArrayExt; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; -use crate::common::TimedLaunchStrategy; +use crate::timed_launch_strategy::TimedLaunchStrategy; const BENCH_ARGS: &[(usize, &str)] = &[(10_000_000, "10M")]; @@ -160,11 +161,7 @@ fn benchmark_dict(c: &mut Criterion) { criterion::criterion_group! { name = benches; - config = Criterion::default().without_plots() - .sample_size(10) - .warm_up_time(Duration::from_nanos(1)) - .measurement_time(Duration::from_nanos(1)) - .nresamples(10); + config = bench_config::cuda_bench_config(); targets = benchmark_dict } diff --git a/vortex-cuda/benches/dynamic_dispatch_cuda.rs b/vortex-cuda/benches/dynamic_dispatch_cuda.rs index f4b8a4e392b..b5d950f7219 100644 --- a/vortex-cuda/benches/dynamic_dispatch_cuda.rs +++ b/vortex-cuda/benches/dynamic_dispatch_cuda.rs @@ -5,6 +5,8 @@ #![expect(clippy::cast_possible_truncation)] #![expect(clippy::expect_used)] +mod bench_config; + use std::marker::PhantomData; use std::mem::size_of; use std::sync::Arc; @@ -650,11 +652,7 @@ fn benchmark_dynamic_dispatch(c: &mut Criterion) { criterion::criterion_group! 
{ name = benches; - config = Criterion::default().without_plots() - .sample_size(10) - .warm_up_time(Duration::from_nanos(1)) - .measurement_time(Duration::from_nanos(1)) - .nresamples(10); + config = bench_config::cuda_bench_config(); targets = benchmark_dynamic_dispatch } diff --git a/vortex-cuda/benches/filter_cuda.rs b/vortex-cuda/benches/filter_cuda.rs index 3ea9b2433f6..bf517b9f2f3 100644 --- a/vortex-cuda/benches/filter_cuda.rs +++ b/vortex-cuda/benches/filter_cuda.rs @@ -6,6 +6,8 @@ #![expect(clippy::unwrap_used)] #![expect(clippy::cast_possible_truncation)] +mod bench_config; + use std::ffi::c_void; use std::fmt::Debug; use std::mem::size_of; @@ -226,11 +228,7 @@ fn benchmark_filter(c: &mut Criterion) { criterion::criterion_group! { name = benches; - config = Criterion::default().without_plots() - .sample_size(10) - .warm_up_time(Duration::from_nanos(1)) - .measurement_time(Duration::from_nanos(1)) - .nresamples(10); + config = bench_config::cuda_bench_config(); targets = benchmark_filter } diff --git a/vortex-cuda/benches/for_cuda.rs b/vortex-cuda/benches/for_cuda.rs index c1c098f1f4b..b602a819747 100644 --- a/vortex-cuda/benches/for_cuda.rs +++ b/vortex-cuda/benches/for_cuda.rs @@ -6,7 +6,8 @@ #![expect(clippy::unwrap_used)] #![expect(clippy::cast_possible_truncation)] -mod common; +mod bench_config; +mod timed_launch_strategy; use std::mem::size_of; use std::ops::Add; @@ -39,7 +40,7 @@ use vortex_cuda::executor::CudaArrayExt; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; -use crate::common::TimedLaunchStrategy; +use crate::timed_launch_strategy::TimedLaunchStrategy; const BENCH_ARGS: &[(usize, &str)] = &[(10_000_000, "10M")]; const REFERENCE_VALUE: u8 = 10; @@ -166,11 +167,7 @@ fn benchmark_ffor(c: &mut Criterion) { criterion::criterion_group! 
{ name = benches; - config = Criterion::default().without_plots() - .sample_size(10) - .warm_up_time(Duration::from_nanos(1)) - .measurement_time(Duration::from_nanos(1)) - .nresamples(10); + config = bench_config::cuda_bench_config(); targets = benchmark_for, benchmark_ffor } diff --git a/vortex-cuda/benches/runend_cuda.rs b/vortex-cuda/benches/runend_cuda.rs index 0b963a10698..5c9597b83bc 100644 --- a/vortex-cuda/benches/runend_cuda.rs +++ b/vortex-cuda/benches/runend_cuda.rs @@ -6,7 +6,8 @@ #![expect(clippy::unwrap_used)] #![expect(clippy::cast_possible_truncation)] -mod common; +mod bench_config; +mod timed_launch_strategy; use std::mem::size_of; use std::sync::Arc; @@ -32,7 +33,7 @@ use vortex_cuda::executor::CudaArrayExt; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; -use crate::common::TimedLaunchStrategy; +use crate::timed_launch_strategy::TimedLaunchStrategy; /// Creates a run-end encoded array with the specified output length and average run length. fn make_runend_array_typed( @@ -117,11 +118,7 @@ fn benchmark_runend(c: &mut Criterion) { criterion::criterion_group! { name = benches; - config = Criterion::default().without_plots() - .sample_size(10) - .warm_up_time(Duration::from_nanos(1)) - .measurement_time(Duration::from_nanos(1)) - .nresamples(10); + config = bench_config::cuda_bench_config(); targets = benchmark_runend } diff --git a/vortex-cuda/benches/throughput_cuda.rs b/vortex-cuda/benches/throughput_cuda.rs index 6a332fa6a4e..5b8c1763b55 100644 --- a/vortex-cuda/benches/throughput_cuda.rs +++ b/vortex-cuda/benches/throughput_cuda.rs @@ -8,6 +8,8 @@ #![expect(clippy::unwrap_used)] +mod bench_config; + use std::time::Duration; use criterion::BenchmarkId; @@ -122,11 +124,7 @@ fn benchmark_transfer_throughput(c: &mut Criterion) { criterion::criterion_group! 
{ name = benches; - config = Criterion::default().without_plots() - .sample_size(10) - .warm_up_time(Duration::from_nanos(1)) - .measurement_time(Duration::from_nanos(1)) - .nresamples(10); + config = bench_config::cuda_bench_config(); targets = benchmark_transfer_throughput } diff --git a/vortex-cuda/benches/common/mod.rs b/vortex-cuda/benches/timed_launch_strategy/mod.rs similarity index 96% rename from vortex-cuda/benches/common/mod.rs rename to vortex-cuda/benches/timed_launch_strategy/mod.rs index d17c5a41b68..9535f6e1c1c 100644 --- a/vortex-cuda/benches/common/mod.rs +++ b/vortex-cuda/benches/timed_launch_strategy/mod.rs @@ -31,6 +31,7 @@ impl LaunchStrategy for TimedLaunchStrategy { fn on_complete(&self, events: &CudaKernelEvents, _len: usize) -> VortexResult<()> { // NOTE: as long as the duration < 584 years this cast is safe. + #[allow(clippy::cast_possible_truncation)] let elapsed_nanos = events.duration()?.as_nanos() as u64; self.total_time_ns .fetch_add(elapsed_nanos, Ordering::Relaxed); diff --git a/vortex-cuda/benches/zstd_cuda.rs b/vortex-cuda/benches/zstd_cuda.rs index 68e411a2253..fa30cffd442 100644 --- a/vortex-cuda/benches/zstd_cuda.rs +++ b/vortex-cuda/benches/zstd_cuda.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +mod bench_config; + use std::time::Duration; use criterion::BenchmarkId; @@ -173,11 +175,7 @@ fn benchmark_zstd_cuda_decompress(c: &mut Criterion) { criterion::criterion_group! { name = benches; - config = Criterion::default().without_plots() - .sample_size(10) - .warm_up_time(Duration::from_nanos(1)) - .measurement_time(Duration::from_nanos(1)) - .nresamples(10); + config = bench_config::cuda_bench_config(); targets = benchmark_zstd_cuda_decompress }