Skip to content

Commit 184e1bf

Browse files
committed
Add variant_get compute kernel
In parquet-variant: - Add a new function `Variant::get_path`: this traverses the path to create a new Variant (does not cast any of it). - Add a new module `parquet_variant::path`: adds structs/enums to define a path to access a variant value deeply. In parquet-variant-compute: - Add a new compute kernel `variant_get`: does the path traversal over a `VariantArray`. In the future, this would also cast the values to a specified type. - Includes some basic unit tests. Not comprehensive. - Includes a simple micro-benchmark for reference. Current limitations: - It can only return another VariantArray. Casts are not implemented yet. - Only top-level object/list access is supported. It panics on finding a nested object/list. Needs apache#7914 to fix this. - Perf is a TODO.
1 parent daf31be commit 184e1bf

File tree

7 files changed

+286
-0
lines changed

7 files changed

+286
-0
lines changed

parquet-variant-compute/Cargo.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,3 +42,9 @@ name = "parquet_variant_compute"
4242
bench = false
4343

4444
[dev-dependencies]
45+
criterion = { version = "0.6", default-features = false }
46+
rand = { version = "0.9.1" }
47+
48+
[[bench]]
49+
name = "variant_get"
50+
harness = false
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
use std::sync::Arc;
2+
3+
use arrow::array::ArrayRef;
4+
use criterion::{criterion_group, criterion_main, Criterion};
5+
use parquet_variant::{Variant, VariantBuilder};
6+
use parquet_variant_compute::{
7+
variant_get::{variant_get, GetOptions},
8+
VariantArray, VariantArrayBuilder,
9+
};
10+
use rand::{rngs::StdRng, Rng, SeedableRng};
11+
12+
fn create_primitive_variant(size: usize) -> VariantArray {
13+
let mut rng = StdRng::seed_from_u64(42);
14+
15+
let mut variant_builder = VariantArrayBuilder::new(1);
16+
17+
for _ in 0..size {
18+
let mut builder = VariantBuilder::new();
19+
builder.append_value(rng.random::<i64>());
20+
let (metadata, value) = builder.finish();
21+
variant_builder.append_variant(Variant::try_new(&metadata, &value).unwrap());
22+
}
23+
24+
variant_builder.build()
25+
}
26+
27+
pub fn variant_get_bench(c: &mut Criterion) {
28+
let variant_array = create_primitive_variant(8192);
29+
let input: ArrayRef = Arc::new(variant_array);
30+
31+
let options = GetOptions {
32+
path: vec![].into(),
33+
as_type: None,
34+
cast_options: Default::default(),
35+
};
36+
37+
c.bench_function("variant_get_primitive", |b| {
38+
b.iter(|| variant_get(&input.clone(), options.clone()))
39+
});
40+
}
41+
42+
criterion_group!(benches, variant_get_bench);
43+
criterion_main!(benches);

parquet-variant-compute/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ mod from_json;
1919
mod to_json;
2020
mod variant_array;
2121
mod variant_array_builder;
22+
pub mod variant_get;
2223

2324
pub use variant_array::VariantArray;
2425
pub use variant_array_builder::VariantArrayBuilder;
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
use std::sync::Arc;
2+
3+
use arrow::{
4+
array::{Array, ArrayRef},
5+
compute::CastOptions,
6+
error::Result,
7+
};
8+
use arrow_schema::{ArrowError, Field};
9+
use parquet_variant::path::VariantPath;
10+
11+
use crate::{VariantArray, VariantArrayBuilder};
12+
13+
/// Returns an array with the specified path extracted from the variant values.
14+
///
15+
/// The return array type depends on the `as_type` field of the options parameter
16+
/// 1. `as_type: None`: a VariantArray is returned. The values in this new VariantArray will point
17+
/// to the specified path.
18+
/// 2. `as_type: Some(<specific field>)`: an array of the specified type is returned.
19+
pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result<ArrayRef> {
20+
let variant_array: &VariantArray = input.as_any().downcast_ref().ok_or_else(|| {
21+
ArrowError::InvalidArgumentError(
22+
"expected a VariantArray as the input for variant_get".to_owned(),
23+
)
24+
})?;
25+
26+
if let Some(as_type) = options.as_type {
27+
return Err(ArrowError::NotYetImplemented(format!(
28+
"getting a {} from a VariantArray is not implemented yet",
29+
as_type
30+
)));
31+
}
32+
33+
let mut builder = VariantArrayBuilder::new(variant_array.len());
34+
for i in 0..variant_array.len() {
35+
let new_variant = variant_array.value(i);
36+
// TODO: perf?
37+
let new_variant = new_variant.get_path(&options.path);
38+
if let Some(new_variant) = new_variant {
39+
// TODO: we're decoding the value and doing a copy into a variant value again. This
40+
// copy can be much smarter.
41+
builder.append_variant(new_variant);
42+
} else {
43+
builder.append_null();
44+
}
45+
}
46+
47+
Ok(Arc::new(builder.build()))
48+
}
49+
50+
/// Controls the action of the variant_get kernel.
51+
#[derive(Debug, Clone)]
52+
pub struct GetOptions<'a> {
53+
/// What path to extract
54+
pub path: VariantPath,
55+
/// if `as_type` is None, the returned array will itself be a VariantArray.
56+
///
57+
/// if `as_type` is `Some(type)` the field is returned as the specified type if possible. To specify returning
58+
/// a Variant, pass a Field with variant type in the metadata.
59+
pub as_type: Option<Field>,
60+
/// Controls the casting behavior (e.g. error vs substituting null on cast error).
61+
pub cast_options: CastOptions<'a>,
62+
}
63+
64+
impl<'a> GetOptions<'a> {
65+
/// Construct options to get the specified path as a variant.
66+
pub fn new_with_path(path: VariantPath) -> Self {
67+
Self {
68+
path,
69+
as_type: None,
70+
cast_options: Default::default(),
71+
}
72+
}
73+
}
74+
75+
#[cfg(test)]
76+
mod test {
77+
use std::sync::Arc;
78+
79+
use arrow::array::{Array, ArrayRef};
80+
use parquet_variant::{path::VariantPathElement, VariantBuilder};
81+
82+
use crate::{VariantArray, VariantArrayBuilder};
83+
84+
use super::{variant_get, GetOptions};
85+
86+
#[test]
87+
fn get_primitive_variant() {
88+
let mut builder = VariantBuilder::new();
89+
builder.add_field_name("some_field");
90+
let mut object = builder.new_object();
91+
object.insert("some_field", 1234i64);
92+
object.finish().unwrap();
93+
let (metadata, value) = builder.finish();
94+
95+
let mut builder = VariantArrayBuilder::new(1);
96+
builder.append_variant_buffers(&metadata, &value);
97+
98+
let variant_array = builder.build();
99+
100+
let input = Arc::new(variant_array) as ArrayRef;
101+
102+
let result = variant_get(
103+
&input,
104+
GetOptions::new_with_path(
105+
vec![VariantPathElement::field("some_field".to_owned())].into(),
106+
),
107+
)
108+
.unwrap();
109+
110+
let result: &VariantArray = result.as_any().downcast_ref().unwrap();
111+
assert!(result.nulls().is_none());
112+
let result = result.value(0);
113+
assert_eq!(result.as_int64().unwrap(), 1234);
114+
}
115+
116+
#[test]
117+
#[should_panic(
118+
expected = "Nested values are handled specially by ObjectBuilder and ListBuilder"
119+
)]
120+
fn get_complex_variant() {
121+
let mut builder = VariantBuilder::new();
122+
builder.add_field_name("top_level_field");
123+
builder.add_field_name("inner_field");
124+
125+
let mut object = builder.new_object();
126+
let mut inner_object = object.new_object("top_level_field");
127+
inner_object.insert("inner_field", 1234i64);
128+
inner_object.finish().unwrap();
129+
object.finish().unwrap();
130+
let (metadata, value) = builder.finish();
131+
132+
let mut builder = VariantArrayBuilder::new(1);
133+
builder.append_variant_buffers(&metadata, &value);
134+
135+
let variant_array = builder.build();
136+
137+
let input = Arc::new(variant_array) as ArrayRef;
138+
139+
let result = variant_get(
140+
&input,
141+
GetOptions::new_with_path(
142+
vec![VariantPathElement::field("top_level_field".to_owned())].into(),
143+
),
144+
)
145+
.unwrap();
146+
147+
// uncomment once implemented
148+
todo!("{:?}", result);
149+
// let result: &VariantArray = result.as_any().downcast_ref().unwrap();
150+
// assert!(result.nulls().is_none());
151+
// let result = result.value(0);
152+
// let result = result.as_object().unwrap();
153+
// let binding = result.get("top_level_field").unwrap();
154+
// let result = binding.as_object().unwrap();
155+
// let result = result.get("inner_field").unwrap();
156+
// assert_eq!(result.as_int64().unwrap(), 1234);
157+
}
158+
}

parquet-variant/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ mod builder;
3131
mod decoder;
3232
mod utils;
3333
mod variant;
34+
pub mod path;
3435

3536
pub use builder::*;
3637
pub use variant::*;

parquet-variant/src/path.rs

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
use std::ops::Deref;
2+
3+
/// Represents a qualified path to a potential subfield or index of a variant value.
4+
#[derive(Debug, Clone)]
5+
pub struct VariantPath(Vec<VariantPathElement>);
6+
7+
impl VariantPath {
8+
pub fn new(path: Vec<VariantPathElement>) -> Self {
9+
Self(path)
10+
}
11+
12+
pub fn path(&self) -> &Vec<VariantPathElement> {
13+
&self.0
14+
}
15+
}
16+
17+
impl From<Vec<VariantPathElement>> for VariantPath {
18+
fn from(value: Vec<VariantPathElement>) -> Self {
19+
Self::new(value)
20+
}
21+
}
22+
23+
impl Deref for VariantPath {
24+
type Target = Vec<VariantPathElement>;
25+
26+
fn deref(&self) -> &Self::Target {
27+
&self.0
28+
}
29+
}
30+
31+
/// Element of a path
32+
#[derive(Debug, Clone)]
33+
pub enum VariantPathElement {
34+
/// Access field with name `name`
35+
Field { name: String },
36+
/// Access the list element at `index`
37+
Index { index: usize },
38+
}
39+
40+
impl VariantPathElement {
41+
pub fn field(name: String) -> VariantPathElement {
42+
VariantPathElement::Field { name }
43+
}
44+
45+
pub fn index(index: usize) -> VariantPathElement {
46+
VariantPathElement::Index { index }
47+
}
48+
}

parquet-variant/src/variant.rs

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ pub use self::object::VariantObject;
2222
use crate::decoder::{
2323
self, get_basic_type, get_primitive_type, VariantBasicType, VariantPrimitiveType,
2424
};
25+
use crate::path::{VariantPath, VariantPathElement};
2526
use crate::utils::{first_byte_from_slice, slice_from_slice};
2627
use std::ops::Deref;
2728

@@ -1063,6 +1064,34 @@ impl<'m, 'v> Variant<'m, 'v> {
10631064
_ => None,
10641065
}
10651066
}
1067+
1068+
/// Return a new Variant with the path followed.
1069+
///
1070+
/// If the path is not found, `None` is returned.
1071+
pub fn get_path(&self, path: &VariantPath) -> Option<Variant> {
1072+
let mut output = self.clone();
1073+
for element in path.deref() {
1074+
match element {
1075+
VariantPathElement::Field { name } => {
1076+
let Variant::Object(variant_object) = output else {
1077+
return None;
1078+
};
1079+
let field = variant_object.get(name);
1080+
let field = field?;
1081+
output = field;
1082+
}
1083+
VariantPathElement::Index { index } => {
1084+
let Variant::List(variant_list) = output else {
1085+
return None;
1086+
};
1087+
let index = variant_list.get(*index);
1088+
let index = index?;
1089+
output = index;
1090+
}
1091+
}
1092+
}
1093+
Some(output)
1094+
}
10661095
}
10671096

10681097
impl From<()> for Variant<'_, '_> {

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy