diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml index 229f5e1ebf3..4e6da589658 100644 --- a/.github/workflows/codspeed.yml +++ b/.github/workflows/codspeed.yml @@ -65,3 +65,43 @@ jobs: run: bash scripts/bench-taskset.sh cargo codspeed run token: ${{ secrets.CODSPEED_TOKEN }} mode: "simulation" + + bench-codspeed-cuda: + if: github.repository == 'vortex-data/vortex' + strategy: + matrix: + include: + - { shard: 1, name: "Bitpacked", benches: "bitpacked_cuda" } + - { shard: 2, name: "Dynamic dispatch", benches: "dynamic_dispatch_cuda" } + - { shard: 3, name: "Standalone kernels", benches: "alp_cuda date_time_parts_cuda dict_cuda for_cuda runend_cuda throughput_cuda" } + - { shard: 4, name: "NVIDIA kernels", benches: "filter_cuda zstd_cuda" } + name: "Benchmark with Codspeed (CUDA Shard #${{ matrix.shard }} - ${{ matrix.name }})" + timeout-minutes: 30 + runs-on: runs-on=${{ github.run_id }}/family=g5/image=ubuntu24-gpu-x64/tag=bench-codspeed-cuda-${{ matrix.shard }} + steps: + - uses: runs-on/action@v2 + with: + sccache: s3 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: ./.github/actions/setup-rust + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + - name: Display NVIDIA SMI details + run: | + nvidia-smi + nvidia-smi -L + nvidia-smi -q -d Memory + - name: Install Codspeed + uses: taiki-e/cache-cargo-install-action@66c9585ef5ca780ee69399975a5e911f47905995 + with: + tool: cargo-codspeed + - name: Build benchmarks + run: cargo codspeed build -m walltime -p vortex-cuda --profile bench + - name: Run benchmarks + uses: CodSpeedHQ/action@d872884a306dd4853acf0f584f4b706cf0cc72a2 + env: + CARGO_MANIFEST_DIR: ${{ github.workspace }}/vortex-cuda + with: + run: cargo codspeed run $(printf -- '--bench %s ' ${{ matrix.benches }}) + token: ${{ secrets.CODSPEED_TOKEN }} + mode: "walltime" diff --git a/vortex-cuda/benches/alp_cuda.rs b/vortex-cuda/benches/alp_cuda.rs index 0f006018a58..b5c1dd5f1e1 100644 --- 
a/vortex-cuda/benches/alp_cuda.rs +++ b/vortex-cuda/benches/alp_cuda.rs @@ -6,7 +6,8 @@ #![expect(clippy::unwrap_used)] #![expect(clippy::cast_possible_truncation)] -mod common; +mod bench_config; +mod timed_launch_strategy; use std::mem::size_of; use std::sync::Arc; @@ -37,7 +38,7 @@ use vortex_cuda::executor::CudaArrayExt; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; -use crate::common::TimedLaunchStrategy; +use crate::timed_launch_strategy::TimedLaunchStrategy; const N_ROWS: usize = 100_000_000; @@ -133,11 +134,7 @@ fn benchmark_alp_decode(c: &mut Criterion) { criterion::criterion_group! { name = benches; - config = Criterion::default().without_plots() - .sample_size(10) - .warm_up_time(Duration::from_nanos(1)) - .measurement_time(Duration::from_nanos(1)) - .nresamples(10); + config = bench_config::cuda_bench_config(); targets = benchmark_alp_decode } diff --git a/vortex-cuda/benches/bench_config/mod.rs b/vortex-cuda/benches/bench_config/mod.rs new file mode 100644 index 00000000000..fd8db1cbf63 --- /dev/null +++ b/vortex-cuda/benches/bench_config/mod.rs @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::time::Duration; + +use criterion::Criterion; + +/// Returns a [`Criterion`] configuration tuned for CUDA benchmarks. +/// +/// All benchmarks use `iter_custom` with precise CUDA event timing. +/// criterion's iteration planner estimates `iters` from **wall time** during +/// warmup, which includes GPU context setup and memory copies — not just +/// the kernel. Setting `measurement_time = 1ns` forces `iters = 1` so +/// each sample is exactly one `iter_custom` call returning GPU-timed duration. +/// Stability comes from independent samples (one kernel launch each) +/// rather than many iterations per sample. 
+/// +/// `warm_up_time` runs at least one full iteration before sampling, giving +/// the GPU a chance to reach steady state (clock boost, cache warming). +/// If a single launch exceeds the warm-up budget, criterion still completes +/// it before moving on. +pub(super) fn cuda_bench_config() -> Criterion { + // Number of independent kernel launches. + let sample_size = 10; + + Criterion::default() + .without_plots() + .sample_size(sample_size) + // A 1 ns budget still runs at least one full warm-up iteration — + // criterion always finishes the in-flight iteration even if the + // budget is exceeded — which JIT-compiles kernels and warms GPU caches. + .warm_up_time(Duration::from_nanos(1)) + // Forces `iters = 1`: criterion's planner estimates iteration cost + // from wall time (which includes GPU context setup), not the + // GPU-timed duration returned by `iter_custom`. A real + // measurement_time would cause wildly inflated iteration counts. + .measurement_time(Duration::from_nanos(1)) +} diff --git a/vortex-cuda/benches/bitpacked_cuda.rs b/vortex-cuda/benches/bitpacked_cuda.rs index 7568db82ec5..ab8fb3e32f3 100644 --- a/vortex-cuda/benches/bitpacked_cuda.rs +++ b/vortex-cuda/benches/bitpacked_cuda.rs @@ -6,7 +6,8 @@ #![expect(clippy::unwrap_used)] #![expect(clippy::cast_possible_truncation)] -mod common; +mod bench_config; +mod timed_launch_strategy; use std::mem::size_of; use std::ops::Add; @@ -37,7 +38,7 @@ use vortex_cuda::executor::CudaArrayExt; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; -use crate::common::TimedLaunchStrategy; +use crate::timed_launch_strategy::TimedLaunchStrategy; const N_ROWS: usize = 100_000_000; @@ -199,11 +200,7 @@ fn benchmark_bitunpack_with_patches(c: &mut Criterion) { criterion::criterion_group! 
{ name = benches; - config = Criterion::default().without_plots() - .sample_size(10) - .warm_up_time(Duration::from_nanos(1)) - .measurement_time(Duration::from_nanos(1)) - .nresamples(10); + config = bench_config::cuda_bench_config(); targets = benchmark_bitunpack, benchmark_bitunpack_with_patches } diff --git a/vortex-cuda/benches/date_time_parts_cuda.rs b/vortex-cuda/benches/date_time_parts_cuda.rs index 59a8c8aa9df..74e29757558 100644 --- a/vortex-cuda/benches/date_time_parts_cuda.rs +++ b/vortex-cuda/benches/date_time_parts_cuda.rs @@ -6,7 +6,8 @@ #![expect(clippy::unwrap_used)] #![expect(clippy::cast_possible_truncation)] -mod common; +mod bench_config; +mod timed_launch_strategy; use std::mem::size_of; use std::sync::Arc; @@ -36,7 +37,7 @@ use vortex_cuda::executor::CudaArrayExt; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; -use crate::common::TimedLaunchStrategy; +use crate::timed_launch_strategy::TimedLaunchStrategy; fn make_datetimeparts_array(len: usize, time_unit: TimeUnit) -> DateTimePartsArray { let days: Vec = (0..len).map(|i| (i / 1000) as i16).collect(); @@ -89,11 +90,7 @@ fn benchmark_datetimeparts(c: &mut Criterion) { criterion::criterion_group! 
{ name = benches; - config = Criterion::default().without_plots() - .sample_size(10) - .warm_up_time(Duration::from_nanos(1)) - .measurement_time(Duration::from_nanos(1)) - .nresamples(10); + config = bench_config::cuda_bench_config(); targets = benchmark_datetimeparts } diff --git a/vortex-cuda/benches/dict_cuda.rs b/vortex-cuda/benches/dict_cuda.rs index a7371f3b3ad..8f4b366fcf1 100644 --- a/vortex-cuda/benches/dict_cuda.rs +++ b/vortex-cuda/benches/dict_cuda.rs @@ -6,7 +6,8 @@ #![expect(clippy::unwrap_used)] #![expect(clippy::cast_possible_truncation)] -mod common; +mod bench_config; +mod timed_launch_strategy; use std::fmt::Debug; use std::mem::size_of; @@ -33,7 +34,7 @@ use vortex_cuda::executor::CudaArrayExt; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; -use crate::common::TimedLaunchStrategy; +use crate::timed_launch_strategy::TimedLaunchStrategy; const BENCH_ARGS: &[(usize, &str)] = &[(10_000_000, "10M")]; @@ -160,11 +161,7 @@ fn benchmark_dict(c: &mut Criterion) { criterion::criterion_group! { name = benches; - config = Criterion::default().without_plots() - .sample_size(10) - .warm_up_time(Duration::from_nanos(1)) - .measurement_time(Duration::from_nanos(1)) - .nresamples(10); + config = bench_config::cuda_bench_config(); targets = benchmark_dict } diff --git a/vortex-cuda/benches/dynamic_dispatch_cuda.rs b/vortex-cuda/benches/dynamic_dispatch_cuda.rs index f4b8a4e392b..b5d950f7219 100644 --- a/vortex-cuda/benches/dynamic_dispatch_cuda.rs +++ b/vortex-cuda/benches/dynamic_dispatch_cuda.rs @@ -5,6 +5,8 @@ #![expect(clippy::cast_possible_truncation)] #![expect(clippy::expect_used)] +mod bench_config; + use std::marker::PhantomData; use std::mem::size_of; use std::sync::Arc; @@ -650,11 +652,7 @@ fn benchmark_dynamic_dispatch(c: &mut Criterion) { criterion::criterion_group! 
{ name = benches; - config = Criterion::default().without_plots() - .sample_size(10) - .warm_up_time(Duration::from_nanos(1)) - .measurement_time(Duration::from_nanos(1)) - .nresamples(10); + config = bench_config::cuda_bench_config(); targets = benchmark_dynamic_dispatch } diff --git a/vortex-cuda/benches/filter_cuda.rs b/vortex-cuda/benches/filter_cuda.rs index 3ea9b2433f6..bf517b9f2f3 100644 --- a/vortex-cuda/benches/filter_cuda.rs +++ b/vortex-cuda/benches/filter_cuda.rs @@ -6,6 +6,8 @@ #![expect(clippy::unwrap_used)] #![expect(clippy::cast_possible_truncation)] +mod bench_config; + use std::ffi::c_void; use std::fmt::Debug; use std::mem::size_of; @@ -226,11 +228,7 @@ fn benchmark_filter(c: &mut Criterion) { criterion::criterion_group! { name = benches; - config = Criterion::default().without_plots() - .sample_size(10) - .warm_up_time(Duration::from_nanos(1)) - .measurement_time(Duration::from_nanos(1)) - .nresamples(10); + config = bench_config::cuda_bench_config(); targets = benchmark_filter } diff --git a/vortex-cuda/benches/for_cuda.rs b/vortex-cuda/benches/for_cuda.rs index c1c098f1f4b..b602a819747 100644 --- a/vortex-cuda/benches/for_cuda.rs +++ b/vortex-cuda/benches/for_cuda.rs @@ -6,7 +6,8 @@ #![expect(clippy::unwrap_used)] #![expect(clippy::cast_possible_truncation)] -mod common; +mod bench_config; +mod timed_launch_strategy; use std::mem::size_of; use std::ops::Add; @@ -39,7 +40,7 @@ use vortex_cuda::executor::CudaArrayExt; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; -use crate::common::TimedLaunchStrategy; +use crate::timed_launch_strategy::TimedLaunchStrategy; const BENCH_ARGS: &[(usize, &str)] = &[(10_000_000, "10M")]; const REFERENCE_VALUE: u8 = 10; @@ -166,11 +167,7 @@ fn benchmark_ffor(c: &mut Criterion) { criterion::criterion_group! 
{ name = benches; - config = Criterion::default().without_plots() - .sample_size(10) - .warm_up_time(Duration::from_nanos(1)) - .measurement_time(Duration::from_nanos(1)) - .nresamples(10); + config = bench_config::cuda_bench_config(); targets = benchmark_for, benchmark_ffor } diff --git a/vortex-cuda/benches/runend_cuda.rs b/vortex-cuda/benches/runend_cuda.rs index 0b963a10698..5c9597b83bc 100644 --- a/vortex-cuda/benches/runend_cuda.rs +++ b/vortex-cuda/benches/runend_cuda.rs @@ -6,7 +6,8 @@ #![expect(clippy::unwrap_used)] #![expect(clippy::cast_possible_truncation)] -mod common; +mod bench_config; +mod timed_launch_strategy; use std::mem::size_of; use std::sync::Arc; @@ -32,7 +33,7 @@ use vortex_cuda::executor::CudaArrayExt; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; -use crate::common::TimedLaunchStrategy; +use crate::timed_launch_strategy::TimedLaunchStrategy; /// Creates a run-end encoded array with the specified output length and average run length. fn make_runend_array_typed( @@ -117,11 +118,7 @@ fn benchmark_runend(c: &mut Criterion) { criterion::criterion_group! { name = benches; - config = Criterion::default().without_plots() - .sample_size(10) - .warm_up_time(Duration::from_nanos(1)) - .measurement_time(Duration::from_nanos(1)) - .nresamples(10); + config = bench_config::cuda_bench_config(); targets = benchmark_runend } diff --git a/vortex-cuda/benches/throughput_cuda.rs b/vortex-cuda/benches/throughput_cuda.rs index 6a332fa6a4e..5b8c1763b55 100644 --- a/vortex-cuda/benches/throughput_cuda.rs +++ b/vortex-cuda/benches/throughput_cuda.rs @@ -8,6 +8,8 @@ #![expect(clippy::unwrap_used)] +mod bench_config; + use std::time::Duration; use criterion::BenchmarkId; @@ -122,11 +124,7 @@ fn benchmark_transfer_throughput(c: &mut Criterion) { criterion::criterion_group! 
{ name = benches; - config = Criterion::default().without_plots() - .sample_size(10) - .warm_up_time(Duration::from_nanos(1)) - .measurement_time(Duration::from_nanos(1)) - .nresamples(10); + config = bench_config::cuda_bench_config(); targets = benchmark_transfer_throughput } diff --git a/vortex-cuda/benches/common/mod.rs b/vortex-cuda/benches/timed_launch_strategy/mod.rs similarity index 96% rename from vortex-cuda/benches/common/mod.rs rename to vortex-cuda/benches/timed_launch_strategy/mod.rs index d17c5a41b68..9535f6e1c1c 100644 --- a/vortex-cuda/benches/common/mod.rs +++ b/vortex-cuda/benches/timed_launch_strategy/mod.rs @@ -31,6 +31,7 @@ impl LaunchStrategy for TimedLaunchStrategy { fn on_complete(&self, events: &CudaKernelEvents, _len: usize) -> VortexResult<()> { // NOTE: as long as the duration < 584 years this cast is safe. + #[allow(clippy::cast_possible_truncation)] let elapsed_nanos = events.duration()?.as_nanos() as u64; self.total_time_ns .fetch_add(elapsed_nanos, Ordering::Relaxed); diff --git a/vortex-cuda/benches/zstd_cuda.rs b/vortex-cuda/benches/zstd_cuda.rs index 68e411a2253..fa30cffd442 100644 --- a/vortex-cuda/benches/zstd_cuda.rs +++ b/vortex-cuda/benches/zstd_cuda.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +mod bench_config; + use std::time::Duration; use criterion::BenchmarkId; @@ -173,11 +175,7 @@ fn benchmark_zstd_cuda_decompress(c: &mut Criterion) { criterion::criterion_group! { name = benches; - config = Criterion::default().without_plots() - .sample_size(10) - .warm_up_time(Duration::from_nanos(1)) - .measurement_time(Duration::from_nanos(1)) - .nresamples(10); + config = bench_config::cuda_bench_config(); targets = benchmark_zstd_cuda_decompress }