Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: Optimize array and list gather #19327

Merged
merged 5 commits into from
Oct 21, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
wrap-up
  • Loading branch information
ritchie46 committed Oct 21, 2024
commit f79b8570bab8881f0fea9a375bf72b46c91f9586
107 changes: 106 additions & 1 deletion crates/polars-arrow/src/array/fixed_size_list/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use super::{new_empty_array, new_null_array, Array, Splitable};
use super::{new_empty_array, new_null_array, Array, ArrayRef, Splitable};
use crate::bitmap::Bitmap;
use crate::datatypes::{ArrowDataType, Field};

Expand All @@ -9,8 +9,11 @@ mod iterator;
mod mutable;
pub use mutable::*;
use polars_error::{polars_bail, polars_ensure, PolarsResult};
use polars_utils::format_tuple;
use polars_utils::pl_str::PlSmallStr;

use crate::datatypes::reshape::{Dimension, ReshapeDimension};

/// The Arrow's equivalent to an immutable `Vec<Option<[T; size]>>` where `T` is an Arrow type.
/// Cloning and slicing this struct is `O(1)`.
#[derive(Clone)]
Expand Down Expand Up @@ -120,6 +123,108 @@ impl FixedSizeListArray {
let values = new_null_array(field.dtype().clone(), length * size);
Self::new(dtype, length, values, Some(Bitmap::new_zeroed(length)))
}

/// Wraps `leaf_array` in nested fixed-size-list levels so that it has the given shape.
///
/// `dimensions[0]` is the outer length (height) of the result and does not produce a
/// list level of its own. Every following entry becomes one fixed-size-list level of
/// that width, with the last entry being the innermost level directly around
/// `leaf_array`. At most one entry may be [`ReshapeDimension::Infer`]; it is resolved
/// from the number of leaf values (and resolves to `0` when `leaf_array` is empty).
///
/// # Errors
///
/// Returns an `InvalidOperation` error when:
/// * `dimensions` is empty,
/// * more than one dimension is `Infer`,
/// * `leaf_array` is empty but the shape has neither a zero nor an inferred dimension,
/// * `leaf_array` is non-empty but the shape contains a zero dimension,
/// * the number of leaf values does not fit the shape: with an inferred dimension the
///   leaf count must be a multiple of the product of the specified dimensions, and
///   without one it must be exactly equal to that product.
pub fn from_shape(
    leaf_array: ArrayRef,
    dimensions: &[ReshapeDimension],
) -> PolarsResult<ArrayRef> {
    polars_ensure!(
        !dimensions.is_empty(),
        InvalidOperation: "at least one dimension must be specified"
    );
    let size = leaf_array.len();

    // Product of all explicitly specified dimensions. Saturate instead of wrapping so
    // that an absurdly large shape is rejected by the size checks below rather than
    // aliasing to some small (possibly matching) product.
    let mut total_dim_size: usize = 1;
    let mut num_infers = 0;
    for &dim in dimensions {
        match dim {
            ReshapeDimension::Infer => num_infers += 1,
            ReshapeDimension::Specified(dim) => {
                total_dim_size = total_dim_size.saturating_mul(dim.get() as usize)
            },
        }
    }

    polars_ensure!(num_infers <= 1, InvalidOperation: "can only specify one inferred dimension");

    if size == 0 {
        polars_ensure!(
            num_infers > 0 || total_dim_size == 0,
            InvalidOperation: "cannot reshape empty array into shape without zero dimension: {}",
            format_tuple!(dimensions),
        );

        let mut prev_arrow_dtype = leaf_array.dtype().clone();
        let mut prev_array = leaf_array;

        // @NOTE: We need to collect the iterator here because it is lazily processed.
        // The length of each level is the product of all dimensions in front of it.
        let mut current_length = dimensions[0].get_or_infer(0);
        let len_iter = dimensions[1..]
            .iter()
            .map(|d| {
                let length = current_length as usize;
                current_length *= d.get_or_infer(0);
                length
            })
            .collect::<Vec<_>>();

        // We pop the outer dimension as that is the height of the series.
        for (dim, length) in dimensions[1..].iter().zip(len_iter).rev() {
            // Infer dimension if needed
            let dim = dim.get_or_infer(0);
            prev_arrow_dtype = prev_arrow_dtype.to_fixed_size_list(dim as usize, true);

            prev_array =
                FixedSizeListArray::new(prev_arrow_dtype.clone(), length, prev_array, None)
                    .boxed();
        }

        return Ok(prev_array);
    }

    polars_ensure!(
        total_dim_size > 0,
        InvalidOperation: "cannot reshape non-empty array into shape containing a zero dimension: {}",
        format_tuple!(dimensions)
    );

    // With an inferred dimension the leaves only have to divide evenly; a fully
    // specified shape has to account for every leaf exactly, otherwise the requested
    // outer length would be silently ignored.
    let shape_fits = if num_infers == 0 {
        size == total_dim_size
    } else {
        size % total_dim_size == 0
    };
    polars_ensure!(
        shape_fits,
        InvalidOperation: "cannot reshape array of size {} into shape {}", size, format_tuple!(dimensions)
    );

    let mut prev_arrow_dtype = leaf_array.dtype().clone();
    let mut prev_array = leaf_array;

    // We pop the outer dimension as that is the height of the series.
    for dim in dimensions[1..].iter().rev() {
        // Infer dimension if needed
        let dim = dim.get_or_infer((size / total_dim_size) as u64);
        prev_arrow_dtype = prev_arrow_dtype.to_fixed_size_list(dim as usize, true);

        prev_array = FixedSizeListArray::new(
            prev_arrow_dtype.clone(),
            prev_array.len() / dim as usize,
            prev_array,
            None,
        )
        .boxed();
    }
    Ok(prev_array)
}

pub fn get_dims(&self) -> Vec<Dimension> {
let mut dims = vec![
Dimension::new(self.length as _),
Dimension::new(self.size as _),
];

let mut prev_array = &self.values;

while let Some(a) = prev_array.as_any().downcast_ref::<FixedSizeListArray>() {
dims.push(Dimension::new(a.size as _));
prev_array = &a.values;
}
dims
}
}

// must use
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-arrow/src/array/growable/list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ unsafe fn extend_offset_values<O: Offset>(
start: usize,
len: usize,
) {
let array = growable.arrays[index];
let array = growable.arrays.get_unchecked_release(index);
let offsets = array.offsets();

growable
Expand Down
109 changes: 36 additions & 73 deletions crates/polars-arrow/src/compute/take/fixed_size_list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,16 @@
// specific language governing permissions and limitations
// under the License.

use polars_utils::itertools::Itertools;

use super::Index;
use crate::array::growable::{Growable, GrowableFixedSizeList};
use crate::array::{Array, ArrayRef, FixedSizeListArray, PrimitiveArray};
use crate::bitmap::MutableBitmap;
use crate::datatypes::reshape::{Dimension, ReshapeDimension};
use crate::datatypes::{ArrowDataType, PhysicalType};
use crate::legacy::prelude::FromData;
use crate::{with_match_primitive_type};
use crate::with_match_primitive_type;

pub(super) unsafe fn take_unchecked_slow<O: Index>(
values: &FixedSizeListArray,
Expand Down Expand Up @@ -124,7 +127,6 @@ unsafe fn from_buffer(mut buf: Vec<u8>, dtype: &ArrowDataType) -> ArrayRef {
}
}


// Use an alignedvec so the alignment always fits the actual type
// That way we can operate on bytes and reduce monomorphization.
#[repr(C, align(256))]
Expand All @@ -149,14 +151,6 @@ unsafe fn aligned_vec(n_bytes: usize) -> Vec<u8> {
)
}

/// Rebuilds `arr` with the same nesting of fixed-size-list levels but with `leaves` as
/// the innermost values.
///
/// Each level keeps its own dtype; its length is recomputed from the number of child
/// values divided by the list width (or `0` when the width is `0`). No validity is
/// attached to any rebuilt level.
fn replace_leaves(arr: &FixedSizeListArray, leaves: ArrayRef) -> FixedSizeListArray {
    // Rebuild the child first. Returning the recursive result directly would drop
    // every outer list level and hand back only the innermost list.
    let child = match arr.values().as_any().downcast_ref::<FixedSizeListArray>() {
        Some(inner) => replace_leaves(inner, leaves).boxed(),
        None => leaves,
    };

    let length = if arr.size() == 0 {
        0
    } else {
        child.len() / arr.size()
    };

    FixedSizeListArray::new(arr.dtype().clone(), length, child, None)
}

fn no_inner_validities(values: &ArrayRef) -> bool {
if let Some(arr) = values.as_any().downcast_ref::<FixedSizeListArray>() {
arr.validity().is_none() && no_inner_validities(arr.values())
Expand All @@ -169,8 +163,7 @@ fn no_inner_validities(values: &ArrayRef) -> bool {
pub(super) unsafe fn take_unchecked<O: Index>(
values: &FixedSizeListArray,
indices: &PrimitiveArray<O>,
) -> FixedSizeListArray {

) -> ArrayRef {
let (stride, leaf_type) = get_stride_and_leaf_type(values.dtype(), 1);
if leaf_type.to_physical_type().is_primitive() && no_inner_validities(values.values()) {
let leaves = get_leaves(values);
Expand All @@ -186,27 +179,35 @@ pub(super) unsafe fn take_unchecked<O: Index>(

let mut count = 0;
let validity = if indices.null_count() == 0 {
dbg!("no-null");
for i in indices.values().iter() {
let i = i.to_usize();

std::ptr::copy_nonoverlapping(leaves_buf.as_ptr().add(i * bytes_per_element), dst.as_mut_ptr().add(count * bytes_per_element) as *mut _, bytes_per_element);
std::ptr::copy_nonoverlapping(
leaves_buf.as_ptr().add(i * bytes_per_element),
dst.as_mut_ptr().add(count * bytes_per_element) as *mut _,
bytes_per_element,
);
count += 1;
}
None
} else {
dbg!("null");
let mut new_validity = MutableBitmap::with_capacity(indices.len());
let validity = indices.validity().unwrap();
for i in indices.values().iter() {
let i = i.to_usize();

if validity.get_bit_unchecked(i) {
new_validity.push_unchecked(true);
std::ptr::copy_nonoverlapping(leaves_buf.as_ptr().add(i * bytes_per_element), dst.as_mut_ptr().add(count * bytes_per_element) as *mut _, bytes_per_element);
new_validity.extend_constant(indices.len(), true);
for i in indices.iter() {
if let Some(i) = i {
let i = i.to_usize();
std::ptr::copy_nonoverlapping(
leaves_buf.as_ptr().add(i * bytes_per_element),
dst.as_mut_ptr().add(count * bytes_per_element) as *mut _,
bytes_per_element,
);
} else {
new_validity.push_unchecked(false);
std::ptr::write_bytes(dst.as_mut_ptr().add(count * bytes_per_element) as *mut _, 0, bytes_per_element);
new_validity.set_unchecked(count, false);
std::ptr::write_bytes(
dst.as_mut_ptr().add(count * bytes_per_element) as *mut _,
0,
bytes_per_element,
);
}

count += 1;
Expand All @@ -217,56 +218,18 @@ pub(super) unsafe fn take_unchecked<O: Index>(

buf.set_len(total_bytes);


let leaves = from_buffer(buf, leaves.dtype());
replace_leaves(&values, leaves).with_validity(validity)

let mut shape = values.get_dims();
shape[0] = Dimension::new(indices.len() as _);
let shape = shape
.into_iter()
.map(ReshapeDimension::Specified)
.collect_vec();

FixedSizeListArray::from_shape(leaves.clone(), &shape)
.unwrap()
.with_validity(validity)
} else {
dbg!("slow");
take_unchecked_slow(values, indices)
take_unchecked_slow(values, indices).boxed()
}





}


// Smoke test for the fixed-size-list gather kernel.
// NOTE(review): both cases only print their result with `dbg!`; nothing is asserted,
// so this test can only fail if `take_unchecked` panics.
#[cfg(test)]
mod test {
use polars_utils::pl_str::PlSmallStr;
use crate::datatypes::Field;
use super::*;

#[test]
fn test_gather_fixed_size_list() {

// Single-level case: a fixed-size list of width 2 over nullable Int16 leaves.
let s = PlSmallStr::EMPTY;
let f = Field::new(s, ArrowDataType::Int16, true);
let dt = ArrowDataType::FixedSizeList(Box::new(f), 2);

// Eight leaf values form four rows of width 2, with no validity bitmap.
let values = PrimitiveArray::from_data_default(vec![0i16, 1, 2, 3, 4, 5, 6, 7].into(), None);
let arr = FixedSizeListArray::new(dt.clone(), 4, values.boxed(), None);


// Indices without nulls; rows are requested out of order and more than once.
let idx = PrimitiveArray::from_data_default(vec![2u32, 1, 0, 0, 1, 2].into(), None);

unsafe {
dbg!(take_unchecked(&arr, &idx));
}

// Nested case: wrap the 4-row array above in another width-2 list, giving 2 outer rows.
let f = Field::new(PlSmallStr::EMPTY, dt, true);
let dt = ArrowDataType::FixedSizeList(Box::new(f), 2);
let arr = FixedSizeListArray::new(dt, 2, arr.boxed(), None);

dbg!(&arr);
let idx = PrimitiveArray::from_data_default(vec![0u32, 1, 0].into(), None);

unsafe {
dbg!(take_unchecked(&arr, &idx));
}


}
}
2 changes: 1 addition & 1 deletion crates/polars-arrow/src/compute/take/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ pub unsafe fn take_unchecked(values: &dyn Array, indices: &IdxArr) -> Box<dyn Ar
},
FixedSizeList => {
let array = values.as_any().downcast_ref().unwrap();
Box::new(fixed_size_list::take_unchecked(array, indices))
fixed_size_list::take_unchecked(array, indices)
},
BinaryView => {
take_binview_unchecked(values.as_any().downcast_ref().unwrap(), indices).boxed()
Expand Down
26 changes: 14 additions & 12 deletions crates/polars-arrow/src/datatypes/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

mod field;
mod physical_type;
pub mod reshape;
mod schema;

use std::collections::BTreeMap;
Expand Down Expand Up @@ -367,19 +368,20 @@ impl ArrowDataType {

pub fn is_numeric(&self) -> bool {
use ArrowDataType as D;
matches!(self,
matches!(
self,
D::Int8
| D::Int16
| D::Int32
| D::Int64
| D::UInt8
| D::UInt16
| D::UInt32
| D::UInt64
| D::Float32
| D::Float64
| D::Decimal(_, _)
| D::Decimal256(_, _)
| D::Int16
| D::Int32
| D::Int64
| D::UInt8
| D::UInt16
| D::UInt32
| D::UInt64
| D::Float32
| D::Float64
| D::Decimal(_, _)
| D::Decimal256(_, _)
)
}

Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/chunked_array/ops/append.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ where

impl<T> ChunkedArray<T>
where
T: PolarsDataType<IsNested = FalseT>,
T: PolarsDataType<IsNested = FalseT, IsObject = FalseT>,
for<'a> T::Physical<'a>: TotalOrd,
{
/// Append in place. This is done by adding the chunks of `other` to this [`ChunkedArray`].
Expand Down
Loading
Loading