Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
8423533
arrow dtype extension conversion
palaska Apr 22, 2026
47efbdb
use arrow canonical extension name
palaska Apr 22, 2026
ad80b53
externalize canonical aliases
palaska Apr 23, 2026
c4d1079
reuse ids
palaska Apr 24, 2026
62d70e2
cached id
palaska Apr 27, 2026
c3c455d
arcswap
palaska Apr 27, 2026
426b8dd
merge
palaska Apr 27, 2026
a91c861
clippy
palaska Apr 27, 2026
aa042f3
Merge branch 'develop' into bp/arrow-ext
palaska Apr 27, 2026
343139a
refactor tests, dont copy metadata bytes
palaska Apr 27, 2026
fd6654b
rm unnecessary stuff
palaska Apr 27, 2026
80f52f1
better name
palaska Apr 27, 2026
784d283
newtype, static ID
palaska Apr 27, 2026
b291a5c
Merge branch 'develop' into bp/arrow-ext
palaska Apr 27, 2026
3b67a4d
keep on-disk metadata as proto, convert to json only at the arrow bou…
palaska Apr 27, 2026
dc8bd4a
Merge branch 'bp/arrow-ext' of github.com:vortex-data/vortex into bp/…
palaska Apr 27, 2026
1ddb32b
revert comments
palaska Apr 27, 2026
76ef82e
public api, fmt
palaska Apr 27, 2026
498830e
Merge branch 'develop' into bp/arrow-ext
palaska Apr 27, 2026
2978add
refactor registry
palaska Apr 27, 2026
ff7e1ac
Merge branch 'develop' into bp/arrow-ext
palaska Apr 28, 2026
b61239c
nit
palaska Apr 28, 2026
ce9b58b
Merge branch 'bp/arrow-ext' of github.com:vortex-data/vortex into bp/…
palaska Apr 28, 2026
2b2a9a3
unwrap extension to storage in execute_arrow
palaska Apr 28, 2026
2155f20
clippy
palaska Apr 28, 2026
69838a9
Merge branch 'develop' into bp/arrow-ext
palaska Apr 28, 2026
990a4b8
recover extension identity in FromArrowArray
palaska Apr 28, 2026
c6333ef
Merge branch 'bp/arrow-ext' of github.com:vortex-data/vortex into bp/…
palaska Apr 28, 2026
0923b1b
Merge branch 'develop' into bp/arrow-ext
palaska Apr 28, 2026
fdaf5d7
clippy
palaska Apr 28, 2026
dfdbf10
simpler, no registry
palaska Apr 29, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ async-lock = "3.4"
async-stream = "0.3.6"
async-trait = "0.1.89"
base16ct = "1.0.0"
base64 = "0.22"
bigdecimal = "0.4.8"
bindgen = "0.72.0"
bit-vec = "0.9.0"
Expand Down
1 change: 1 addition & 0 deletions vortex-array/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ arrow-schema = { workspace = true }
arrow-select = { workspace = true }
arrow-string = { workspace = true }
async-lock = { workspace = true }
base64 = { workspace = true }
bytes = { workspace = true }
cfg-if = { workspace = true }
cudarc = { workspace = true, optional = true }
Expand Down
236 changes: 234 additions & 2 deletions vortex-array/public-api.lock

Large diffs are not rendered by default.

144 changes: 121 additions & 23 deletions vortex-array/src/arrow/convert.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ use arrow_buffer::ScalarBuffer;
use arrow_buffer::buffer::NullBuffer;
use arrow_buffer::buffer::OffsetBuffer;
use arrow_schema::DataType;
use arrow_schema::Field;
use arrow_schema::TimeUnit as ArrowTimeUnit;
use itertools::Itertools;
use vortex_buffer::Alignment;
Expand All @@ -66,12 +67,15 @@ use vortex_error::VortexExpect as _;
use vortex_error::VortexResult;
use vortex_error::vortex_bail;
use vortex_error::vortex_panic;
use vortex_session::VortexSession;

use crate::ArrayRef;
use crate::IntoArray;
use crate::LEGACY_SESSION;
use crate::arrays::BoolArray;
use crate::arrays::DecimalArray;
use crate::arrays::DictArray;
use crate::arrays::ExtensionArray;
use crate::arrays::FixedSizeListArray;
use crate::arrays::ListArray;
use crate::arrays::ListViewArray;
Expand All @@ -87,7 +91,9 @@ use crate::dtype::DecimalDType;
use crate::dtype::IntegerPType;
use crate::dtype::NativePType;
use crate::dtype::PType;
use crate::dtype::arrow::resolve_extension_dtype;
use crate::dtype::i256;
use crate::dtype::session::DTypeSessionExt;
use crate::extension::datetime::TimeUnit;
use crate::validity::Validity;

Expand Down Expand Up @@ -380,23 +386,33 @@ fn remove_nulls(data: arrow_data::ArrayData) -> arrow_data::ArrayData {

impl FromArrowArray<&ArrowStructArray> for ArrayRef {
fn from_arrow(value: &ArrowStructArray, nullable: bool) -> VortexResult<Self> {
Self::from_arrow_with_session(value, nullable, &LEGACY_SESSION)
}

fn from_arrow_with_session(
value: &ArrowStructArray,
nullable: bool,
session: &VortexSession,
) -> VortexResult<Self> {
let columns = value
.columns()
.iter()
.zip(value.fields())
.map(|(c, field)| {
// Arrow pushes down nulls, even into non-nullable fields. So we strip them
// out here because Vortex is a little more strict.
let storage = if c.null_count() > 0 && !field.is_nullable() {
let stripped = make_array(remove_nulls(c.into_data()));
Self::from_arrow_with_session(stripped.as_ref(), false, session)?
} else {
Self::from_arrow_with_session(c.as_ref(), field.is_nullable(), session)?
};
wrap_extension_if_field_has_metadata(storage, field.as_ref(), session)
})
.collect::<VortexResult<Vec<_>>>()?;
Ok(StructArray::try_new(
value.column_names().iter().copied().collect(),
value
.columns()
.iter()
.zip(value.fields())
.map(|(c, field)| {
// Arrow pushes down nulls, even into non-nullable fields. So we strip them
// out here because Vortex is a little more strict.
if c.null_count() > 0 && !field.is_nullable() {
let stripped = make_array(remove_nulls(c.into_data()));
Self::from_arrow(stripped.as_ref(), false)
} else {
Self::from_arrow(c.as_ref(), field.is_nullable())
}
})
.collect::<VortexResult<Vec<_>>>()?,
columns,
value.len(),
nulls(value.nulls(), nullable),
)?
Expand All @@ -406,14 +422,27 @@ impl FromArrowArray<&ArrowStructArray> for ArrayRef {

impl<O: IntegerPType + OffsetSizeTrait> FromArrowArray<&GenericListArray<O>> for ArrayRef {
fn from_arrow(value: &GenericListArray<O>, nullable: bool) -> VortexResult<Self> {
// Extract the validity of the underlying element array.
let elements_are_nullable = match value.data_type() {
DataType::List(field) => field.is_nullable(),
DataType::LargeList(field) => field.is_nullable(),
Self::from_arrow_with_session(value, nullable, &LEGACY_SESSION)
}

fn from_arrow_with_session(
value: &GenericListArray<O>,
nullable: bool,
session: &VortexSession,
) -> VortexResult<Self> {
let elements_field: &Field = match value.data_type() {
DataType::List(field) => field.as_ref(),
DataType::LargeList(field) => field.as_ref(),
dt => vortex_panic!("Invalid data type for ListArray: {dt}"),
};

let elements = Self::from_arrow(value.values().as_ref(), elements_are_nullable)?;
let elements_storage = Self::from_arrow_with_session(
value.values().as_ref(),
elements_field.is_nullable(),
session,
)?;
let elements =
wrap_extension_if_field_has_metadata(elements_storage, elements_field, session)?;

// `offsets` are always non-nullable.
let offsets = value.offsets().clone().into_array();
Expand Down Expand Up @@ -445,12 +474,25 @@ impl<O: OffsetSizeTrait + NativePType> FromArrowArray<&GenericListViewArray<O>>

impl FromArrowArray<&ArrowFixedSizeListArray> for ArrayRef {
fn from_arrow(array: &ArrowFixedSizeListArray, nullable: bool) -> VortexResult<Self> {
Self::from_arrow_with_session(array, nullable, &LEGACY_SESSION)
}

fn from_arrow_with_session(
array: &ArrowFixedSizeListArray,
nullable: bool,
session: &VortexSession,
) -> VortexResult<Self> {
let DataType::FixedSizeList(field, list_size) = array.data_type() else {
vortex_panic!("Invalid data type for ListArray: {}", array.data_type());
};

let elements_storage =
Self::from_arrow_with_session(array.values().as_ref(), field.is_nullable(), session)?;
let elements =
wrap_extension_if_field_has_metadata(elements_storage, field.as_ref(), session)?;

Ok(FixedSizeListArray::try_new(
Self::from_arrow(array.values().as_ref(), field.is_nullable())?,
elements,
*list_size as u32,
nulls(array.nulls(), nullable),
array.len(),
Expand Down Expand Up @@ -494,6 +536,30 @@ fn nulls(nulls: Option<&NullBuffer>, nullable: bool) -> Validity {
}

impl FromArrowArray<&dyn ArrowArray> for ArrayRef {
fn from_arrow_with_session(
array: &dyn ArrowArray,
nullable: bool,
session: &VortexSession,
) -> VortexResult<Self> {
match array.data_type() {
DataType::Struct(_) => {
Self::from_arrow_with_session(array.as_struct(), nullable, session)
}
DataType::List(_) => {
Self::from_arrow_with_session(array.as_list::<i32>(), nullable, session)
}
DataType::LargeList(_) => {
Self::from_arrow_with_session(array.as_list::<i64>(), nullable, session)
}
DataType::FixedSizeList(..) => {
Self::from_arrow_with_session(array.as_fixed_size_list(), nullable, session)
}
// Other arrays don't carry child Fields, so session-aware dispatch is identical to
// the legacy path; fall through to `from_arrow`.
_ => Self::from_arrow(array, nullable),
}
}

fn from_arrow(array: &dyn ArrowArray, nullable: bool) -> VortexResult<Self> {
match array.data_type() {
DataType::Boolean => Self::from_arrow(array.as_boolean(), nullable),
Expand Down Expand Up @@ -617,13 +683,45 @@ impl FromArrowArray<&dyn ArrowArray> for ArrayRef {

impl FromArrowArray<RecordBatch> for ArrayRef {
fn from_arrow(array: RecordBatch, nullable: bool) -> VortexResult<Self> {
ArrayRef::from_arrow(&arrow_array::StructArray::from(array), nullable)
Self::from_arrow_with_session(array, nullable, &LEGACY_SESSION)
}

fn from_arrow_with_session(
array: RecordBatch,
nullable: bool,
session: &VortexSession,
) -> VortexResult<Self> {
Self::from_arrow_with_session(&arrow_array::StructArray::from(array), nullable, session)
}
}

impl FromArrowArray<&RecordBatch> for ArrayRef {
fn from_arrow(array: &RecordBatch, nullable: bool) -> VortexResult<Self> {
Self::from_arrow(array.clone(), nullable)
Self::from_arrow_with_session(array, nullable, &LEGACY_SESSION)
}

fn from_arrow_with_session(
array: &RecordBatch,
nullable: bool,
session: &VortexSession,
) -> VortexResult<Self> {
Self::from_arrow_with_session(array.clone(), nullable, session)
}
}

/// Inverse of `field_from_dtype` (in `dtype/arrow.rs`): if `field` carries
/// `ARROW:extension:name` metadata for a registered extension, rewrap `storage` as an
/// `ExtensionArray`; otherwise fall through to `storage`. Diagnostic warnings live in
/// [`resolve_extension_dtype`].
fn wrap_extension_if_field_has_metadata(
storage: ArrayRef,
field: &Field,
session: &VortexSession,
) -> VortexResult<ArrayRef> {
let dtypes = session.dtypes();
match resolve_extension_dtype(field, &dtypes, storage.dtype()) {
Some(ext_dtype) => Ok(ExtensionArray::try_new(ext_dtype, storage)?.into_array()),
None => Ok(storage),
}
}

Expand Down
42 changes: 42 additions & 0 deletions vortex-array/src/arrow/executor/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,10 @@ use vortex_error::vortex_bail;
use vortex_error::vortex_ensure;

use crate::ArrayRef;
use crate::arrays::ExtensionArray;
use crate::arrays::List;
use crate::arrays::VarBin;
use crate::arrays::extension::ExtensionArrayExt;
use crate::arrays::list::ListArrayExt;
use crate::arrays::varbin::VarBinArrayExt;
use crate::arrow::executor::bool::to_arrow_bool;
Expand Down Expand Up @@ -87,6 +89,12 @@ impl ArrowArrayExecutor for ArrayRef {
data_type: Option<&DataType>,
ctx: &mut ExecutionCtx,
) -> VortexResult<ArrowArrayRef> {
// Extension identity lives on Field metadata; dispatch on the storage array.
if matches!(self.dtype(), DType::Extension(_)) {
let ext = self.execute::<ExtensionArray>(ctx)?;
return ext.storage_array().clone().execute_arrow(data_type, ctx);
}

let len = self.len();

// Resolve the DataType if it is a leaf type
Expand Down Expand Up @@ -228,3 +236,37 @@ fn preferred_arrow_type(array: &ArrayRef) -> VortexResult<DataType> {
// Everything else: use canonical dtype conversion
array.dtype().to_arrow_dtype()
}

#[cfg(test)]
mod tests {
use arrow_array::cast::AsArray;
use arrow_array::types::UInt64Type;
use arrow_schema::DataType;

use super::*;
use crate::LEGACY_SESSION;
use crate::VortexSessionExecute;
use crate::array::IntoArray;
use crate::arrays::ExtensionArray;
use crate::arrays::PrimitiveArray;
use crate::extension::tests::divisible_int::DivisibleInt;
use crate::extension::tests::divisible_int::Divisor;

#[test]
fn execute_arrow_unwraps_extension_to_storage() {
let storage = PrimitiveArray::from_iter(0u64..6).into_array();
let ext = ExtensionArray::try_new_from_vtable(DivisibleInt, Divisor(1), storage)
.unwrap()
.into_array();

let arrow = ext
.execute_arrow(
Some(&DataType::UInt64),
&mut LEGACY_SESSION.create_execution_ctx(),
)
.unwrap();

let primitives = arrow.as_primitive::<UInt64Type>();
assert_eq!(primitives.values(), &[0, 1, 2, 3, 4, 5]);
}
}
18 changes: 14 additions & 4 deletions vortex-array/src/arrow/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
use arrow_array::ArrayRef as ArrowArrayRef;
use arrow_schema::DataType;
use vortex_error::VortexResult;
use vortex_session::VortexSession;

mod convert;
mod datum;
Expand All @@ -24,10 +25,19 @@ use crate::ArrayRef;
use crate::LEGACY_SESSION;
use crate::VortexSessionExecute;

pub trait FromArrowArray<A> {
fn from_arrow(array: A, nullable: bool) -> VortexResult<Self>
where
Self: Sized;
pub trait FromArrowArray<A>: Sized {
fn from_arrow(array: A, nullable: bool) -> VortexResult<Self>;

/// Same conversion, with session for resolving `ARROW:extension:name` field metadata to
/// registered extension dtypes. The default ignores the session — override on impls that
/// see Arrow `Field`s (RecordBatch, Struct, List, FSL).
fn from_arrow_with_session(
array: A,
nullable: bool,
_session: &VortexSession,
) -> VortexResult<Self> {
Self::from_arrow(array, nullable)
}
}

#[deprecated(note = "Use `execute_arrow(None, ctx)` or `execute_arrow(Some(dt), ctx)` instead")]
Expand Down
14 changes: 13 additions & 1 deletion vortex-array/src/arrow/record_batch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use arrow_array::cast::AsArray;
use arrow_schema::DataType;
use arrow_schema::Schema;
use vortex_error::VortexResult;
use vortex_session::VortexSession;

use crate::LEGACY_SESSION;
use crate::VortexSessionExecute;
Expand All @@ -17,11 +18,22 @@ impl StructArray {
pub fn into_record_batch_with_schema(
self,
schema: impl AsRef<Schema>,
) -> VortexResult<RecordBatch> {
self.into_record_batch_with_schema_with_session(schema, &LEGACY_SESSION)
}

/// Same as [`Self::into_record_batch_with_schema`], but routes execution through `session`
/// so canonical Arrow extension aliases declared on registered vtables apply uniformly to
/// both schema construction and array conversion.
pub fn into_record_batch_with_schema_with_session(
self,
schema: impl AsRef<Schema>,
session: &VortexSession,
) -> VortexResult<RecordBatch> {
let data_type = DataType::Struct(schema.as_ref().fields.clone());
let array_ref = self
.into_array()
.execute_arrow(Some(&data_type), &mut LEGACY_SESSION.create_execution_ctx())?;
.execute_arrow(Some(&data_type), &mut session.create_execution_ctx())?;
Ok(RecordBatch::from(array_ref.as_struct()))
}
}
Expand Down
Loading
Loading