This repository has been archived by the owner on Feb 18, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 224
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
985dddb
commit e773c10
Showing
4 changed files
with
675 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
use super::levels::to_length; | ||
use super::pages::Nested; | ||
|
||
/// Object-safe shorthand for "an iterator of `usize` that is also `Debug`".
///
/// It allows differently-typed length iterators to be stored behind
/// `Box<dyn DebugIter>` while the owning type can still derive `Debug`.
trait DebugIter: Iterator<Item = usize> + std::fmt::Debug {}

/// Blanket implementation: every `Debug` iterator over `usize` is a `DebugIter`.
impl<I> DebugIter for I where I: Iterator<Item = usize> + std::fmt::Debug {}
|
||
fn iter<'a>(nested: &'a [Nested]) -> Vec<Box<dyn DebugIter + 'a>> { | ||
nested | ||
.iter() | ||
.filter_map(|nested| match nested { | ||
Nested::Primitive(_, _) => None, | ||
Nested::List(nested) => Some(Box::new(to_length(nested.offsets)) as Box<dyn DebugIter>), | ||
Nested::LargeList(nested) => { | ||
Some(Box::new(to_length(nested.offsets)) as Box<dyn DebugIter>) | ||
} | ||
Nested::Struct(_, _) => None, | ||
}) | ||
.collect() | ||
} | ||
|
||
fn num_values(nested: &[Nested]) -> usize { | ||
let iterators = iter(nested); | ||
let depth = iterators.len(); | ||
|
||
iterators | ||
.into_iter() | ||
.enumerate() | ||
.map(|(index, lengths)| { | ||
let length = if index == depth - 1 { | ||
lengths | ||
.map(|length| if length == 0 { 1 } else { length }) | ||
.sum::<usize>() | ||
} else { | ||
lengths | ||
.map(|length| if length == 0 { 1 } else { 0 }) | ||
.sum::<usize>() | ||
}; | ||
length | ||
}) | ||
.sum() | ||
} | ||
|
||
/// Iterator adapter of parquet / dremel repetition levels | ||
#[derive(Debug)] | ||
pub struct RepLevelsIter<'a> { | ||
// iterators of lengths. E.g. [[[a,b,c], [d,e,f,g]], [[h], [i,j]]] -> [[2, 2], [3, 4, 1, 2]] | ||
iter: Vec<Box<dyn DebugIter + 'a>>, | ||
// vector containing the remaining number of values of each iterator. | ||
// e.g. the iters [[2, 2], [3, 4, 1, 2]] after the first iteration will return [2, 3], | ||
// and remaining will be [2, 3]. | ||
// on the second iteration, it will be `[2, 2]` (since iterations consume the last items) | ||
remaining: Vec<usize>, /* < remaining.len() == iter.len() */ | ||
// cache of the first `remaining` that is non-zero. Examples: | ||
// * `remaining = [2, 2] => current_level = 2` | ||
// * `remaining = [2, 0] => current_level = 1` | ||
// * `remaining = [0, 0] => current_level = 0` | ||
current_level: usize, /* < iter.len() */ | ||
// the number to discount due to being the first element of the iterators. | ||
total: usize, /* < iter.len() */ | ||
|
||
// the total number of items that this iterator will return | ||
remaining_values: usize, | ||
} | ||
|
||
impl<'a> RepLevelsIter<'a> { | ||
pub fn new(nested: &'a [Nested]) -> Self { | ||
let remaining_values = num_values(nested); | ||
|
||
let iter = iter(nested); | ||
let remaining = std::iter::repeat(0).take(iter.len()).collect(); | ||
|
||
Self { | ||
iter, | ||
remaining, | ||
total: 0, | ||
current_level: 0, | ||
remaining_values, | ||
} | ||
} | ||
} | ||
|
||
impl<'a> Iterator for RepLevelsIter<'a> { | ||
type Item = u32; | ||
|
||
fn next(&mut self) -> Option<Self::Item> { | ||
if *self.remaining.last().unwrap() > 0 { | ||
*self.remaining.last_mut().unwrap() -= 1; | ||
|
||
let total = self.total; | ||
self.total = 0; | ||
let r = Some((self.current_level - total) as u32); | ||
|
||
for level in 0..self.current_level - 1 { | ||
let level = self.remaining.len() - level - 1; | ||
if self.remaining[level] == 0 { | ||
self.current_level -= 1; | ||
self.remaining[level.saturating_sub(1)] -= 1; | ||
} | ||
} | ||
if self.remaining[0] == 0 { | ||
self.current_level -= 1; | ||
} | ||
self.remaining_values -= 1; | ||
return r; | ||
} | ||
|
||
self.total = 0; | ||
for (iter, remaining) in self | ||
.iter | ||
.iter_mut() | ||
.zip(self.remaining.iter_mut()) | ||
.skip(self.current_level) | ||
{ | ||
let length: usize = iter.next()?; | ||
if length == 0 { | ||
self.remaining_values -= 1; | ||
return Some(self.current_level as u32); | ||
} | ||
*remaining = length; | ||
self.current_level += 1; | ||
self.total += 1; | ||
} | ||
self.next() | ||
} | ||
|
||
fn size_hint(&self) -> (usize, Option<usize>) { | ||
let length = self.remaining_values; | ||
(length, Some(length)) | ||
} | ||
} | ||
|
||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::io::parquet::write::levels::NestedInfo;

    /// Two required list levels without empty lists.
    #[test]
    fn test_rep_levels() {
        let outer_offsets: &[i32] = &[0, 2, 4];
        let inner_offsets: &[i32] = &[0, 3, 7, 8, 10];
        let expected: Vec<u32> = vec![0, 2, 2, 1, 2, 2, 2, 0, 1, 2];

        let nested = vec![
            Nested::List(NestedInfo::<i32> {
                is_optional: false,
                offsets: outer_offsets,
                validity: None,
            }),
            Nested::List(NestedInfo::<i32> {
                is_optional: false,
                offsets: inner_offsets,
                validity: None,
            }),
            Nested::Primitive(None, false),
        ];

        let mut levels = RepLevelsIter::new(&nested);
        assert_eq!(levels.size_hint().0, 10);
        let result: Vec<u32> = levels.by_ref().collect();
        assert_eq!(result, expected);
        // Once drained, the iterator must report that nothing is left.
        assert_eq!(levels.size_hint().0, 0);
    }

    /// Same shape as above, but the outer level contains one empty list.
    #[test]
    fn test_rep_levels_with_zero() {
        let outer_offsets: &[i32] = &[0, 2, 2, 4];
        let inner_offsets: &[i32] = &[0, 3, 7, 8, 10];
        let expected: Vec<u32> = vec![0, 2, 2, 1, 2, 2, 2, 0, 0, 1, 2];

        let nested = vec![
            Nested::List(NestedInfo::<i32> {
                is_optional: false,
                offsets: outer_offsets,
                validity: None,
            }),
            Nested::List(NestedInfo::<i32> {
                is_optional: false,
                offsets: inner_offsets,
                validity: None,
            }),
            Nested::Primitive(None, false),
        ];

        let mut levels = RepLevelsIter::new(&nested);
        assert_eq!(levels.size_hint().0, 11);
        let result: Vec<u32> = levels.by_ref().collect();
        assert_eq!(result, expected);
        // Once drained, the iterator must report that nothing is left.
        assert_eq!(levels.size_hint().0, 0);
    }
}
Oops, something went wrong.