Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 94 additions & 3 deletions datafusion/functions-nested/src/resize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,14 @@ fn general_list_resize<O: OffsetSizeTrait + TryInto<i64>>(
}
}

if output_values_len > max_resize_values(&data_type)
|| O::from_usize(output_values_len).is_none()
{
return exec_err!(
"array_resize: resulting array of {output_values_len} elements exceeds the maximum array size"
);
}

// The fast path is valid when at least one row grows and every row would
// use the same fill value.
let use_bulk_fill = max_extra > 0
Expand Down Expand Up @@ -342,12 +350,26 @@ where
)?))
}

/// Largest element count whose eager value buffer stays within `isize::MAX`
/// bytes, so `array_resize` rejects oversized results instead of panicking.
/// Only primitive and `FixedSizeBinary` leaves are byte-exact.
fn max_resize_values(value_type: &DataType) -> usize {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the compiler prob materialize it, but would it be better to prematerialize it manually through

static LazyLock<HashMap<DataType, usize>> ?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since there are parameterized types I would need to keep match right? I'm not sure it would be better but can't say I've a strong opinion. If you say go for it I can change it that way

let element_width = match value_type {
DataType::FixedSizeBinary(size) if *size > 0 => *size as usize,
_ => value_type.primitive_width().unwrap_or(size_of::<u128>()),
};

(isize::MAX as usize) / element_width.max(1)
}

#[cfg(test)]
mod tests {
use super::array_resize_inner;
use arrow::array::{ArrayRef, AsArray, Int64Array, ListArray};
use arrow::buffer::{NullBuffer, ScalarBuffer};
use arrow::datatypes::Int32Type;
use arrow::array::{
ArrayRef, AsArray, FixedSizeBinaryArray, Int64Array, LargeListArray, ListArray,
};
use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer};
use arrow::datatypes::{DataType, Field, Int32Type, Int64Type};
use datafusion_common::Result;
use std::sync::Arc;

Expand All @@ -373,4 +395,73 @@ mod tests {

Ok(())
}

#[test]
fn test_array_resize_large_size_errors_without_panicking() {
let array: ArrayRef =
Arc::new(ListArray::from_iter_primitive::<Int64Type, _, _>(vec![
Some(vec![Some(1)]),
]));
let size: ArrayRef = Arc::new(Int64Array::from(vec![i64::MAX]));
let fill: ArrayRef = Arc::new(Int64Array::from(vec![0]));

let err = array_resize_inner(&[array, size, fill]).unwrap_err();
assert!(
err.to_string().contains("exceeds the maximum array size"),
"unexpected error: {err}"
);
}

#[test]
fn test_array_resize_fixed_size_binary_large_size_errors_without_panicking() {
let values =
FixedSizeBinaryArray::try_from_iter(vec![vec![0u8; 32]].into_iter()).unwrap();
let elem_field =
Arc::new(Field::new_list_field(DataType::FixedSizeBinary(32), true));
let offsets = OffsetBuffer::<i64>::new(vec![0i64, 1].into());
let array: ArrayRef = Arc::new(LargeListArray::new(
elem_field,
offsets,
Arc::new(values) as ArrayRef,
None,
));
// Passes the width-16 bound (isize::MAX / 16) but overflows at width 32.
let size: ArrayRef = Arc::new(Int64Array::from(vec![400_000_000_000_000_000i64]));

let err = array_resize_inner(&[array, size]).unwrap_err();
assert!(
err.to_string().contains("exceeds the maximum array size"),
"unexpected error: {err}"
);
}

#[test]
fn test_array_resize_accumulates_values_across_rows() {
// Each row's target (6e17) is individually under the width-8 cap
// (isize::MAX / 8), but their sum (1.2e18) exceeds it, so the guard
// must reject based on the accumulated total rather than per row.
let values = Int64Array::from(vec![1, 2]);
let offsets = OffsetBuffer::<i64>::new(vec![0i64, 1, 2].into());
let elem_field = Arc::new(Field::new_list_field(DataType::Int64, true));
let array: ArrayRef = Arc::new(LargeListArray::new(
elem_field,
offsets,
Arc::new(values) as ArrayRef,
None,
));
let size: ArrayRef = Arc::new(Int64Array::from(vec![
600_000_000_000_000_000i64,
600_000_000_000_000_000i64,
]));

let err = array_resize_inner(&[array, size]).unwrap_err();
assert!(
err.to_string().contains("1200000000000000000"),
"expected accumulated total in error: {err}"
);
assert!(
err.to_string().contains("exceeds the maximum array size"),
"unexpected error: {err}"
);
}
}
17 changes: 17 additions & 0 deletions datafusion/sqllogictest/test_files/array/array_resize.slt
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,23 @@ select array_resize(arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)'), 5, 4);
query error
select array_resize(make_array(1, 2, 3), -5, 2);

# array_resize with a very large size should error instead of panicking (capacity overflow)
query error DataFusion error: Execution error: array_resize: resulting array of 9223372036854775807 elements exceeds the maximum array size
select array_resize(make_array(1), 9223372036854775807, 0);

query error DataFusion error: Execution error: array_resize: resulting array of 9223372036854775807 elements exceeds the maximum array size
select array_resize(arrow_cast(make_array(1), 'LargeList(Int64)'), 9223372036854775807, 0);

# List size above i32::MAX must error via the offset-type guard instead of
# silently truncating the offsets or attempting a multi-GB allocation
query error DataFusion error: Execution error: array_resize: resulting array of 3000000000 elements exceeds the maximum array size
select array_resize(make_array(1), 3000000000, 0);

# Non-primitive element types (e.g. Utf8) fall back to a conservative byte
# width; a size above that cap must error instead of attempting a huge allocation.
query error DataFusion error: Execution error: array_resize: resulting array of 600000000000000000 elements exceeds the maximum array size
select array_resize(arrow_cast(make_array('a'), 'LargeList(Utf8)'), 600000000000000000);

# array_resize scalar function #5
query ?
select array_resize(make_array(1.1, 2.2, 3.3), 10, 9.9);
Expand Down
Loading