Skip to content

Commit

Permalink
feat: allow round-tripping of dictionary data through the v2 format (l…
Browse files Browse the repository at this point in the history
  • Loading branch information
westonpace authored Aug 1, 2024
1 parent 9e7d97f commit 526cdd1
Show file tree
Hide file tree
Showing 7 changed files with 572 additions and 103 deletions.
56 changes: 56 additions & 0 deletions python/python/tests/test_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,18 @@ def check_round_trip(tmp_path, table):


def test_different_types(tmp_path):
dict_values = pa.array(["foo", "bar", "baz"], pa.string())
dict_indices = pa.array([2, 1, 0], pa.uint8())

check_round_trip(
tmp_path,
pa.table(
{
"large_string": pa.array(["foo", "bar", "baz"], pa.large_string()),
"large_binary": pa.array([b"foo", b"bar", b"baz"], pa.large_binary()),
"dict_string": pa.DictionaryArray.from_arrays(
dict_indices, dict_values
),
}
),
)
Expand Down Expand Up @@ -189,3 +195,53 @@ def test_list_field_name(tmp_path):

assert round_tripped == table
assert round_tripped.schema.field("list_str").type == weird_string_type


def test_dictionary(tmp_path):
# Basic round trip
dictionary = pa.array(["foo", "bar", "baz"], pa.string())
indices = pa.array([0, 1, 2, 0, 1, 2], pa.int32())
dict_arr = pa.DictionaryArray.from_arrays(indices, dictionary)

def round_trip(arr):
table = pa.table({"dict": arr})

path = tmp_path / "foo.lance"
with LanceFileWriter(str(path)) as writer:
writer.write_batch(table)

reader = LanceFileReader(str(path))
table2 = reader.read_all().to_table()
return table2.column("dict").chunk(0)

round_tripped = round_trip(dict_arr)

assert round_tripped == dict_arr
assert round_tripped.type == dict_arr.type

# Dictionary that doesn't use all values
dictionary = pa.array(["foo", "bar", "baz"], pa.string())
indices = pa.array([0, 0, 1, 1], pa.int32())
dict_arr = pa.DictionaryArray.from_arrays(indices, dictionary)

round_tripped = round_trip(dict_arr)

assert round_tripped.dictionary == dictionary

# different indices types
dictionary = pa.array(["foo", "bar", "baz"], pa.string())
for data_type in [
pa.uint8(),
pa.uint16(),
pa.uint32(),
pa.uint64(),
pa.int8(),
pa.int16(),
pa.int32(),
pa.int64(),
]:
indices = pa.array([0, 1, 2, 0, 1, 2], data_type)
dict_arr = pa.DictionaryArray.from_arrays(indices, dictionary)
round_tripped = round_trip(dict_arr)
assert round_tripped == dict_arr
assert round_tripped.type == dict_arr.type
55 changes: 55 additions & 0 deletions rust/lance-encoding/src/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,22 @@
use std::{ops::Deref, ptr::NonNull, sync::Arc};

use arrow_buffer::Buffer;
use snafu::{location, Location};

use lance_core::{Error, Result};

/// A copy-on-write byte buffer
///
/// It can be created from read-only buffers (e.g. bytes::Bytes or arrow_buffer::Buffer), e.g. "borrowed"
/// or from writeable buffers (e.g. Vec<u8>), e.g. "owned"
///
/// The buffer can switch to borrowed mode without a copy of the data
///
/// LanceBuffer does not implement Clone because doing could potentially silently trigger a copy of the data
/// and we want to make sure that the user is aware of this operation.
///
/// If you need to clone a LanceBuffer you can use borrow_and_clone() which will make sure that the buffer
/// is in borrowed mode before cloning. This is a zero copy operation (but requires &mut self).
#[derive(Debug)]
pub enum LanceBuffer {
Borrowed(Buffer),
Expand Down Expand Up @@ -59,6 +70,50 @@ impl LanceBuffer {
}
}
}

/// Convert into a borrowed buffer, this is a zero-copy operation
///
/// This is often called before cloning the buffer
pub fn into_borrowed(self) -> Self {
match self {
Self::Borrowed(_) => self,
Self::Owned(buffer) => Self::Borrowed(Buffer::from_vec(buffer)),
}
}

/// Creates an owned copy of the buffer, will always involve a full copy of the bytes
pub fn to_owned(&self) -> Self {
match self {
Self::Borrowed(buffer) => Self::Owned(buffer.to_vec()),
Self::Owned(buffer) => Self::Owned(buffer.clone()),
}
}

/// Creates a clone of the buffer but also puts the buffer into borrowed mode
///
/// This is a zero-copy operation
pub fn borrow_and_clone(&mut self) -> Self {
match self {
Self::Borrowed(buffer) => Self::Borrowed(buffer.clone()),
Self::Owned(buffer) => {
let buf_data = std::mem::take(buffer);
let buffer = Buffer::from_vec(buf_data);
*self = Self::Borrowed(buffer.clone());
Self::Borrowed(buffer)
}
}
}

/// Clones the buffer but fails if the buffer is in owned mode
pub fn try_clone(&self) -> Result<Self> {
match self {
Self::Borrowed(buffer) => Ok(Self::Borrowed(buffer.clone())),
Self::Owned(_) => Err(Error::Internal {
message: "try_clone called on an owned buffer".to_string(),
location: location!(),
}),
}
}
}

impl AsRef<[u8]> for LanceBuffer {
Expand Down
156 changes: 155 additions & 1 deletion rust/lance-encoding/src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,23 @@ use crate::buffer::LanceBuffer;
/// A DataBlock can be converted into an Arrow ArrayData (and then Array) for a given array type.
/// For example, a FixedWidthDataBlock can be converted into any primitive type or a fixed size
/// list of a primitive type.
pub trait DataBlock: Any {
pub trait DataBlock: Any + std::fmt::Debug + Send + Sync {
/// Get a reference to the Any trait object
fn as_any(&self) -> &dyn Any;
/// Convert self into a Box<dyn Any>
fn as_any_box(self: Box<Self>) -> Box<dyn Any>;
/// Convert self into an Arrow ArrayData
fn into_arrow(self: Box<Self>, data_type: DataType, validate: bool) -> Result<ArrayData>;
/// Converts the data buffers into borrowed mode and clones the block
///
/// This is a zero-copy operation but requires a mutable reference to self and, afterwards,
/// all buffers will be in Borrowed mode.
fn borrow_and_clone(&mut self) -> Box<dyn DataBlock>;
/// Try and clone the block
///
/// This will fail if any buffers are in owned mode. You can call borrow_and_clone() to
/// ensure that all buffers are in borrowed mode before calling this method.
fn try_clone(&self) -> Result<Box<dyn DataBlock>>;
}

/// Extension trait for DataBlock
Expand All @@ -51,6 +61,7 @@ impl DataBlockExt for Box<dyn DataBlock> {
}

/// A data block with no buffers where everything is null
#[derive(Debug)]
pub struct AllNullDataBlock {
/// The number of values represented by this block
pub num_values: u64,
Expand All @@ -68,9 +79,22 @@ impl DataBlock for AllNullDataBlock {
fn into_arrow(self: Box<Self>, data_type: DataType, _validate: bool) -> Result<ArrayData> {
Ok(ArrayData::new_null(&data_type, self.num_values as usize))
}

fn borrow_and_clone(&mut self) -> Box<dyn DataBlock> {
Box::new(Self {
num_values: self.num_values,
})
}

fn try_clone(&self) -> Result<Box<dyn DataBlock>> {
Ok(Box::new(Self {
num_values: self.num_values,
}))
}
}

/// Wraps a data block and adds nullability information to it
#[derive(Debug)]
pub struct NullableDataBlock {
/// The underlying data
pub data: Box<dyn DataBlock>,
Expand All @@ -97,9 +121,24 @@ impl DataBlock for NullableDataBlock {
Ok(unsafe { data.build_unchecked() })
}
}

fn borrow_and_clone(&mut self) -> Box<dyn DataBlock> {
Box::new(Self {
data: self.data.borrow_and_clone(),
nulls: self.nulls.borrow_and_clone(),
})
}

fn try_clone(&self) -> Result<Box<dyn DataBlock>> {
Ok(Box::new(Self {
data: self.data.try_clone()?,
nulls: self.nulls.try_clone()?,
}))
}
}

/// A data block for a single buffer of data where each element has a fixed number of bits
#[derive(Debug)]
pub struct FixedWidthDataBlock {
/// The data buffer
pub data: LanceBuffer,
Expand Down Expand Up @@ -155,9 +194,26 @@ impl DataBlock for FixedWidthDataBlock {
let root_num_values = self.num_values;
self.do_into_arrow(data_type, root_num_values, validate)
}

fn borrow_and_clone(&mut self) -> Box<dyn DataBlock> {
Box::new(Self {
data: self.data.borrow_and_clone(),
bits_per_value: self.bits_per_value,
num_values: self.num_values,
})
}

fn try_clone(&self) -> Result<Box<dyn DataBlock>> {
Ok(Box::new(Self {
data: self.data.try_clone()?,
bits_per_value: self.bits_per_value,
num_values: self.num_values,
}))
}
}

/// A data block for variable-width data (e.g. strings, packed rows, etc.)
#[derive(Debug)]
pub struct VariableWidthBlock {
/// The data buffer
pub data: LanceBuffer,
Expand Down Expand Up @@ -192,9 +248,28 @@ impl DataBlock for VariableWidthBlock {
Ok(unsafe { builder.build_unchecked() })
}
}

fn borrow_and_clone(&mut self) -> Box<dyn DataBlock> {
Box::new(Self {
data: self.data.borrow_and_clone(),
offsets: self.offsets.borrow_and_clone(),
bits_per_offset: self.bits_per_offset,
num_values: self.num_values,
})
}

fn try_clone(&self) -> Result<Box<dyn DataBlock>> {
Ok(Box::new(Self {
data: self.data.try_clone()?,
offsets: self.offsets.try_clone()?,
bits_per_offset: self.bits_per_offset,
num_values: self.num_values,
}))
}
}

/// A data block representing a struct
#[derive(Debug)]
pub struct StructDataBlock {
/// The child arrays
pub children: Vec<Box<dyn DataBlock>>,
Expand Down Expand Up @@ -231,4 +306,83 @@ impl DataBlock for StructDataBlock {
})
}
}

fn borrow_and_clone(&mut self) -> Box<dyn DataBlock> {
Box::new(Self {
children: self
.children
.iter_mut()
.map(|c| c.borrow_and_clone())
.collect(),
})
}

fn try_clone(&self) -> Result<Box<dyn DataBlock>> {
Ok(Box::new(Self {
children: self
.children
.iter()
.map(|c| c.try_clone())
.collect::<Result<_>>()?,
}))
}
}

/// A data block for dictionary encoded data
#[derive(Debug)]
pub struct DictionaryDataBlock {
/// The indices buffer
pub indices: Box<dyn DataBlock>,
/// The dictionary itself
pub dictionary: Box<dyn DataBlock>,
}

impl DataBlock for DictionaryDataBlock {
fn as_any(&self) -> &dyn Any {
self
}

fn as_any_box(self: Box<Self>) -> Box<dyn Any> {
self
}

fn into_arrow(self: Box<Self>, data_type: DataType, validate: bool) -> Result<ArrayData> {
let (key_type, value_type) = if let DataType::Dictionary(key_type, value_type) = &data_type
{
(key_type.as_ref().clone(), value_type.as_ref().clone())
} else {
return Err(Error::Internal {
message: format!("Expected Dictionary, got {:?}", data_type),
location: location!(),
});
};

let indices = self.indices.into_arrow(key_type, validate)?;
let dictionary = self.dictionary.into_arrow(value_type, validate)?;

let builder = indices
.into_builder()
.add_child_data(dictionary)
.data_type(data_type);

if validate {
Ok(builder.build()?)
} else {
Ok(unsafe { builder.build_unchecked() })
}
}

fn borrow_and_clone(&mut self) -> Box<dyn DataBlock> {
Box::new(Self {
indices: self.indices.borrow_and_clone(),
dictionary: self.dictionary.borrow_and_clone(),
})
}

fn try_clone(&self) -> Result<Box<dyn DataBlock>> {
Ok(Box::new(Self {
indices: self.indices.try_clone()?,
dictionary: self.dictionary.try_clone()?,
}))
}
}
Loading

0 comments on commit 526cdd1

Please sign in to comment.