Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions .github/workflows/codspeed.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,43 @@ jobs:
run: bash scripts/bench-taskset.sh cargo codspeed run
token: ${{ secrets.CODSPEED_TOKEN }}
mode: "simulation"

bench-codspeed-cuda:
if: github.repository == 'vortex-data/vortex'
strategy:
matrix:
include:
- { shard: 1, name: "Bitpacked", benches: "bitpacked_cuda" }
- { shard: 2, name: "Dynamic dispatch", benches: "dynamic_dispatch_cuda" }
- { shard: 3, name: "Standalone kernels", benches: "alp_cuda date_time_parts_cuda dict_cuda for_cuda runend_cuda throughput_cuda" }
- { shard: 4, name: "NVIDIA kernels", benches: "filter_cuda zstd_cuda" }
name: "Benchmark with Codspeed (CUDA Shard #${{ matrix.shard }} - ${{ matrix.name }})"
timeout-minutes: 30
runs-on: runs-on=${{ github.run_id }}/family=g5/image=ubuntu24-gpu-x64/tag=bench-codspeed-cuda-${{ matrix.shard }}
steps:
- uses: runs-on/action@v2
with:
sccache: s3
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
- uses: ./.github/actions/setup-rust
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
- name: Display NVIDIA SMI details
run: |
nvidia-smi
nvidia-smi -L
nvidia-smi -q -d Memory
- name: Install Codspeed
uses: taiki-e/cache-cargo-install-action@66c9585ef5ca780ee69399975a5e911f47905995
with:
tool: cargo-codspeed
- name: Build benchmarks
run: cargo codspeed build -m walltime -p vortex-cuda --profile bench
- name: Run benchmarks
uses: CodSpeedHQ/action@d872884a306dd4853acf0f584f4b706cf0cc72a2
env:
CARGO_MANIFEST_DIR: ${{ github.workspace }}/vortex-cuda
with:
run: cargo codspeed run $(printf -- '--bench %s ' ${{ matrix.benches }})
token: ${{ secrets.CODSPEED_TOKEN }}
mode: "walltime"
11 changes: 4 additions & 7 deletions vortex-cuda/benches/alp_cuda.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
#![expect(clippy::unwrap_used)]
#![expect(clippy::cast_possible_truncation)]

mod common;
mod bench_config;
mod timed_launch_strategy;

use std::mem::size_of;
use std::sync::Arc;
Expand Down Expand Up @@ -37,7 +38,7 @@ use vortex_cuda::executor::CudaArrayExt;
use vortex_cuda_macros::cuda_available;
use vortex_cuda_macros::cuda_not_available;

use crate::common::TimedLaunchStrategy;
use crate::timed_launch_strategy::TimedLaunchStrategy;

const N_ROWS: usize = 100_000_000;

Expand Down Expand Up @@ -133,11 +134,7 @@ fn benchmark_alp_decode(c: &mut Criterion) {

criterion::criterion_group! {
name = benches;
config = Criterion::default().without_plots()
.sample_size(10)
.warm_up_time(Duration::from_nanos(1))
.measurement_time(Duration::from_nanos(1))
.nresamples(10);
config = bench_config::cuda_bench_config();
targets = benchmark_alp_decode
}

Expand Down
38 changes: 38 additions & 0 deletions vortex-cuda/benches/bench_config/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

use std::time::Duration;

use criterion::Criterion;

/// Returns a [`Criterion`] configuration tuned for CUDA benchmarks.
///
/// All benchmarks use `iter_custom` with precise CUDA event timing.
/// criterion's iteration planner estimates `iters` from **wall time** during
/// warmup, which includes GPU context setup and memory copies — not just
/// the kernel. Setting `measurement_time = 1ns` forces `iters = 1` so
/// each sample is exactly one `iter_custom` call returning GPU-timed duration.
/// Stability comes from a high `sample_size` (many independent launches)
/// rather than many iterations per sample.
///
/// `warm_up_time` runs at least one full iteration before sampling, giving
/// the GPU a chance to reach steady state (clock boost, cache warming).
/// If a single launch exceeds the warm-up budget, criterion still completes
/// it before moving on.
/// Builds the shared [`Criterion`] configuration for the CUDA benchmark suites.
///
/// Every CUDA bench measures with `iter_custom`, returning a duration taken
/// from CUDA events rather than wall time. Criterion's planner, however, sizes
/// `iters` from *wall time* observed during warmup — which includes GPU
/// context setup and host/device copies, not just the kernel. Pinning
/// `measurement_time` to one nanosecond forces `iters = 1`, so each sample is
/// exactly one `iter_custom` call and carries a pure GPU-timed duration.
///
/// `warm_up_time` of one nanosecond still guarantees at least one complete
/// iteration before sampling begins (criterion always finishes the in-flight
/// iteration), which lets the GPU reach steady state: kernels get
/// JIT-compiled, clocks boost, and caches warm.
pub(super) fn cuda_bench_config() -> Criterion {
    // Each sample is one independent kernel launch; ten samples is the
    // minimum criterion accepts.
    const NUM_SAMPLES: usize = 10;

    let config = Criterion::default().without_plots();
    let config = config.sample_size(NUM_SAMPLES);
    // A 1 ns warm-up budget: criterion completes the iteration already in
    // flight even after the budget is exceeded, so one full warm launch
    // always happens.
    let config = config.warm_up_time(Duration::from_nanos(1));
    // A 1 ns measurement budget forces `iters = 1` per sample. Any realistic
    // budget would make the wall-time-based planner request wildly inflated
    // iteration counts, since wall time per launch far exceeds the GPU-timed
    // duration `iter_custom` reports.
    config.measurement_time(Duration::from_nanos(1))
}
11 changes: 4 additions & 7 deletions vortex-cuda/benches/bitpacked_cuda.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
#![expect(clippy::unwrap_used)]
#![expect(clippy::cast_possible_truncation)]

mod common;
mod bench_config;
mod timed_launch_strategy;

use std::mem::size_of;
use std::ops::Add;
Expand Down Expand Up @@ -37,7 +38,7 @@ use vortex_cuda::executor::CudaArrayExt;
use vortex_cuda_macros::cuda_available;
use vortex_cuda_macros::cuda_not_available;

use crate::common::TimedLaunchStrategy;
use crate::timed_launch_strategy::TimedLaunchStrategy;

const N_ROWS: usize = 100_000_000;

Expand Down Expand Up @@ -199,11 +200,7 @@ fn benchmark_bitunpack_with_patches(c: &mut Criterion) {

criterion::criterion_group! {
name = benches;
config = Criterion::default().without_plots()
.sample_size(10)
.warm_up_time(Duration::from_nanos(1))
.measurement_time(Duration::from_nanos(1))
.nresamples(10);
config = bench_config::cuda_bench_config();
targets = benchmark_bitunpack, benchmark_bitunpack_with_patches
}

Expand Down
11 changes: 4 additions & 7 deletions vortex-cuda/benches/date_time_parts_cuda.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
#![expect(clippy::unwrap_used)]
#![expect(clippy::cast_possible_truncation)]

mod common;
mod bench_config;
mod timed_launch_strategy;

use std::mem::size_of;
use std::sync::Arc;
Expand Down Expand Up @@ -36,7 +37,7 @@ use vortex_cuda::executor::CudaArrayExt;
use vortex_cuda_macros::cuda_available;
use vortex_cuda_macros::cuda_not_available;

use crate::common::TimedLaunchStrategy;
use crate::timed_launch_strategy::TimedLaunchStrategy;

fn make_datetimeparts_array(len: usize, time_unit: TimeUnit) -> DateTimePartsArray {
let days: Vec<i16> = (0..len).map(|i| (i / 1000) as i16).collect();
Expand Down Expand Up @@ -89,11 +90,7 @@ fn benchmark_datetimeparts(c: &mut Criterion) {

criterion::criterion_group! {
name = benches;
config = Criterion::default().without_plots()
.sample_size(10)
.warm_up_time(Duration::from_nanos(1))
.measurement_time(Duration::from_nanos(1))
.nresamples(10);
config = bench_config::cuda_bench_config();
targets = benchmark_datetimeparts
}

Expand Down
11 changes: 4 additions & 7 deletions vortex-cuda/benches/dict_cuda.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
#![expect(clippy::unwrap_used)]
#![expect(clippy::cast_possible_truncation)]

mod common;
mod bench_config;
mod timed_launch_strategy;

use std::fmt::Debug;
use std::mem::size_of;
Expand All @@ -33,7 +34,7 @@ use vortex_cuda::executor::CudaArrayExt;
use vortex_cuda_macros::cuda_available;
use vortex_cuda_macros::cuda_not_available;

use crate::common::TimedLaunchStrategy;
use crate::timed_launch_strategy::TimedLaunchStrategy;

const BENCH_ARGS: &[(usize, &str)] = &[(10_000_000, "10M")];

Expand Down Expand Up @@ -160,11 +161,7 @@ fn benchmark_dict(c: &mut Criterion) {

criterion::criterion_group! {
name = benches;
config = Criterion::default().without_plots()
.sample_size(10)
.warm_up_time(Duration::from_nanos(1))
.measurement_time(Duration::from_nanos(1))
.nresamples(10);
config = bench_config::cuda_bench_config();
targets = benchmark_dict
}

Expand Down
8 changes: 3 additions & 5 deletions vortex-cuda/benches/dynamic_dispatch_cuda.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#![expect(clippy::cast_possible_truncation)]
#![expect(clippy::expect_used)]

mod bench_config;

use std::marker::PhantomData;
use std::mem::size_of;
use std::sync::Arc;
Expand Down Expand Up @@ -650,11 +652,7 @@ fn benchmark_dynamic_dispatch(c: &mut Criterion) {

criterion::criterion_group! {
name = benches;
config = Criterion::default().without_plots()
.sample_size(10)
.warm_up_time(Duration::from_nanos(1))
.measurement_time(Duration::from_nanos(1))
.nresamples(10);
config = bench_config::cuda_bench_config();
targets = benchmark_dynamic_dispatch
}

Expand Down
8 changes: 3 additions & 5 deletions vortex-cuda/benches/filter_cuda.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#![expect(clippy::unwrap_used)]
#![expect(clippy::cast_possible_truncation)]

mod bench_config;

use std::ffi::c_void;
use std::fmt::Debug;
use std::mem::size_of;
Expand Down Expand Up @@ -226,11 +228,7 @@ fn benchmark_filter(c: &mut Criterion) {

criterion::criterion_group! {
name = benches;
config = Criterion::default().without_plots()
.sample_size(10)
.warm_up_time(Duration::from_nanos(1))
.measurement_time(Duration::from_nanos(1))
.nresamples(10);
config = bench_config::cuda_bench_config();
targets = benchmark_filter
}

Expand Down
11 changes: 4 additions & 7 deletions vortex-cuda/benches/for_cuda.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
#![expect(clippy::unwrap_used)]
#![expect(clippy::cast_possible_truncation)]

mod common;
mod bench_config;
mod timed_launch_strategy;

use std::mem::size_of;
use std::ops::Add;
Expand Down Expand Up @@ -39,7 +40,7 @@ use vortex_cuda::executor::CudaArrayExt;
use vortex_cuda_macros::cuda_available;
use vortex_cuda_macros::cuda_not_available;

use crate::common::TimedLaunchStrategy;
use crate::timed_launch_strategy::TimedLaunchStrategy;

const BENCH_ARGS: &[(usize, &str)] = &[(10_000_000, "10M")];
const REFERENCE_VALUE: u8 = 10;
Expand Down Expand Up @@ -166,11 +167,7 @@ fn benchmark_ffor(c: &mut Criterion) {

criterion::criterion_group! {
name = benches;
config = Criterion::default().without_plots()
.sample_size(10)
.warm_up_time(Duration::from_nanos(1))
.measurement_time(Duration::from_nanos(1))
.nresamples(10);
config = bench_config::cuda_bench_config();
targets = benchmark_for, benchmark_ffor
}

Expand Down
11 changes: 4 additions & 7 deletions vortex-cuda/benches/runend_cuda.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
#![expect(clippy::unwrap_used)]
#![expect(clippy::cast_possible_truncation)]

mod common;
mod bench_config;
mod timed_launch_strategy;

use std::mem::size_of;
use std::sync::Arc;
Expand All @@ -32,7 +33,7 @@ use vortex_cuda::executor::CudaArrayExt;
use vortex_cuda_macros::cuda_available;
use vortex_cuda_macros::cuda_not_available;

use crate::common::TimedLaunchStrategy;
use crate::timed_launch_strategy::TimedLaunchStrategy;

/// Creates a run-end encoded array with the specified output length and average run length.
fn make_runend_array_typed<T>(
Expand Down Expand Up @@ -117,11 +118,7 @@ fn benchmark_runend(c: &mut Criterion) {

criterion::criterion_group! {
name = benches;
config = Criterion::default().without_plots()
.sample_size(10)
.warm_up_time(Duration::from_nanos(1))
.measurement_time(Duration::from_nanos(1))
.nresamples(10);
config = bench_config::cuda_bench_config();
targets = benchmark_runend
}

Expand Down
8 changes: 3 additions & 5 deletions vortex-cuda/benches/throughput_cuda.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

#![expect(clippy::unwrap_used)]

mod bench_config;

use std::time::Duration;

use criterion::BenchmarkId;
Expand Down Expand Up @@ -122,11 +124,7 @@ fn benchmark_transfer_throughput(c: &mut Criterion) {

criterion::criterion_group! {
name = benches;
config = Criterion::default().without_plots()
.sample_size(10)
.warm_up_time(Duration::from_nanos(1))
.measurement_time(Duration::from_nanos(1))
.nresamples(10);
config = bench_config::cuda_bench_config();
targets = benchmark_transfer_throughput
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ impl LaunchStrategy for TimedLaunchStrategy {

fn on_complete(&self, events: &CudaKernelEvents, _len: usize) -> VortexResult<()> {
// NOTE: as long as the duration < 584 years this cast is safe.
#[allow(clippy::cast_possible_truncation)]
let elapsed_nanos = events.duration()?.as_nanos() as u64;
self.total_time_ns
.fetch_add(elapsed_nanos, Ordering::Relaxed);
Expand Down
8 changes: 3 additions & 5 deletions vortex-cuda/benches/zstd_cuda.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

mod bench_config;

use std::time::Duration;

use criterion::BenchmarkId;
Expand Down Expand Up @@ -173,11 +175,7 @@ fn benchmark_zstd_cuda_decompress(c: &mut Criterion) {

criterion::criterion_group! {
name = benches;
config = Criterion::default().without_plots()
.sample_size(10)
.warm_up_time(Duration::from_nanos(1))
.measurement_time(Duration::from_nanos(1))
.nresamples(10);
config = bench_config::cuda_bench_config();
targets = benchmark_zstd_cuda_decompress
}

Expand Down
Loading