This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit: Docs and lint (#860)
jorgecarleitao committed on Feb 23, 2022
1 parent f70116d · commit 73c6bf3

Showing 16 changed files with 72 additions and 152 deletions.
9 changes: 4 additions & 5 deletions benches/avro_read.rs
@@ -24,12 +24,11 @@ fn schema() -> AvroSchema {
 fn write(size: usize, has_codec: bool) -> Result<Vec<u8>> {
     let avro = schema();
     // a writer needs a schema and something to write to
-    let mut writer: Writer<Vec<u8>>;
-    if has_codec {
-        writer = Writer::with_codec(&avro, Vec::new(), Codec::Deflate);
+    let mut writer = if has_codec {
+        Writer::with_codec(&avro, Vec::new(), Codec::Deflate)
     } else {
-        writer = Writer::new(&avro, Vec::new());
-    }
+        Writer::new(&avro, Vec::new())
+    };

     (0..size).for_each(|_| {
         let mut record = Record::new(writer.schema()).unwrap();
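Note: the rewrite uses Rust's `if` as an expression, so the binding is initialized exactly once instead of being declared and assigned later. A minimal standalone sketch of the same pattern (not from the commit):

    fn main() {
        let compressed = true;
        // each branch yields a value of the same type; `label` is initialized once
        let label = if compressed { "deflate" } else { "null" };
        println!("codec: {}", label);
    }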
3 changes: 1 addition & 2 deletions benches/write_parquet.rs
@@ -1,4 +1,3 @@
-use std::io::Cursor;
 use std::sync::Arc;

 use criterion::{criterion_group, criterion_main, Criterion};
@@ -29,7 +28,7 @@ fn write(array: &dyn Array, encoding: Encoding) -> Result<()> {
         vec![encoding],
     )?;

-    let mut writer = vec![];
+    let writer = vec![];

     let mut writer = FileWriter::try_new(writer, schema, options)?;
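Note: the first `writer` binding is only moved into `FileWriter::try_new`, never mutated, so the `mut` was an `unused_mut` warning; the shadowing binding is the one that needs it. A std-only sketch of the same shadowing pattern:

    use std::io::{Cursor, Write};

    fn main() {
        // the buffer is never mutated through this binding, only moved
        let buffer: Vec<u8> = vec![];
        // the shadowing binding wraps it in the writer that actually needs `mut`
        let mut writer = Cursor::new(buffer);
        writer.write_all(b"hello").unwrap();
        println!("{} bytes written", writer.get_ref().len());
    }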
19 changes: 5 additions & 14 deletions examples/ffi.rs
@@ -9,6 +9,7 @@ unsafe fn export(
     array_ptr: *mut ffi::ArrowArray,
     schema_ptr: *mut ffi::ArrowSchema,
 ) {
+    // exporting an array requires an associated field so that the consumer knows its datatype
     let field = Field::new("a", array.data_type().clone(), true);
     ffi::export_array_to_c(array, array_ptr);
     ffi::export_field_to_c(&field, schema_ptr);
@@ -25,23 +26,13 @@ fn main() -> Result<()> {

     // the goal is to export this array and import it back via FFI.
     // to import, we initialize the structs that will receive the data
-    let array_ptr = Box::new(ffi::ArrowArray::empty());
-    let schema_ptr = Box::new(ffi::ArrowSchema::empty());
-
-    // since FFIs work in raw pointers, let's temporarily relinquish ownership so that producers
-    // can write into it in a thread-safe manner
-    let array_ptr = Box::into_raw(array_ptr);
-    let schema_ptr = Box::into_raw(schema_ptr);
+    let mut array_ptr = Box::new(ffi::ArrowArray::empty());
+    let mut schema_ptr = Box::new(ffi::ArrowSchema::empty());

     // this is where a producer (in this case also us ^_^) writes to the pointers' location.
     // `array` here could be anything or not even be available, if this was e.g. from Python.
-    // Safety: we just allocated the pointers correctly.
-    unsafe { export(array.clone(), array_ptr, schema_ptr) };
-
-    // we can now take ownership back, since we are responsible for deallocating this memory.
-    // Safety: we just into_raw them.
-    let array_ptr = unsafe { Box::from_raw(array_ptr) };
-    let schema_ptr = unsafe { Box::from_raw(schema_ptr) };
+    // Safety: we just allocated the pointers
+    unsafe { export(array.clone(), &mut *array_ptr, &mut *schema_ptr) };

     // and finally interpret the written memory into a new array.
     // Safety: we used `export`, which is a valid exporter to the C data interface
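Note: the new version keeps the `Box`es owned by Rust for the whole call; `&mut *boxed` coerces to a `*mut` raw pointer at the call site, removing the error-prone `Box::into_raw`/`Box::from_raw` round-trip. A self-contained sketch of the coercion, with a hypothetical `produce` standing in for the C-side producer:

    #[derive(Default)]
    struct CStruct {
        value: i32,
    }

    // stand-in for a C producer that writes through a raw pointer
    unsafe fn produce(ptr: *mut CStruct) {
        (*ptr).value = 42;
    }

    fn main() {
        // the Box retains ownership; there is nothing to reclaim afterwards
        let mut slot = Box::new(CStruct::default());
        // `&mut *slot` coerces to `*mut CStruct` for the FFI call
        unsafe { produce(&mut *slot) };
        assert_eq!(slot.value, 42);
    } // `slot` is dropped and freed normally here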
2 changes: 0 additions & 2 deletions examples/parquet_read_async.rs
@@ -2,8 +2,6 @@ use std::sync::Arc;
 use std::time::SystemTime;

 use futures::future::BoxFuture;
-use futures::FutureExt;
-use tokio;
 use tokio::fs::File;
 use tokio::io::BufReader;
 use tokio_util::compat::*;
3 changes: 2 additions & 1 deletion src/bitmap/bitmap_ops.rs
@@ -7,6 +7,7 @@ use super::{
     Bitmap,
 };

+/// Creates a [`Vec<u8>`] from an [`Iterator`] of [`BitChunk`].
 /// # Safety
 /// The iterator must be [`TrustedLen`].
 pub unsafe fn from_chunk_iter_unchecked<T: BitChunk, I: Iterator<Item = T>>(
@@ -35,7 +36,7 @@ pub unsafe fn from_chunk_iter_unchecked<T: BitChunk, I: Iterator<Item = T>>(
     buffer
 }

-/// Creates a Vec<u8> from a [`TrustedLen`] of [`BitChunk`],
+/// Creates a [`Vec<u8>`] from a [`TrustedLen`] of [`BitChunk`].
 pub fn chunk_iter_to_vec<T: BitChunk, I: TrustedLen<Item = T>>(iter: I) -> Vec<u8> {
     unsafe { from_chunk_iter_unchecked(iter) }
 }
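Note: the doc fixes adopt rustdoc's intra-doc link convention, where a backticked path inside brackets renders as code and resolves to the item's documentation. A small sketch of the style, using a hypothetical helper:

    /// Counts the set bits in a byte buffer such as a [`Vec<u8>`].
    ///
    /// The backticked bracket form lets `cargo doc` resolve the target as an
    /// intra-doc link instead of rendering plain bracketed text.
    pub fn count_ones(bytes: &[u8]) -> u32 {
        bytes.iter().map(|b| b.count_ones()).sum()
    }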
4 changes: 2 additions & 2 deletions src/bitmap/mutable.rs
@@ -58,7 +58,7 @@ impl MutableBitmap {
         }
     }

-    /// Initializes an a pre-allocated [`MutableBitmap`] with capacity for `capacity` bits.
+    /// Initializes a pre-allocated [`MutableBitmap`] with capacity for `capacity` bits.
     #[inline]
     pub fn with_capacity(capacity: usize) -> Self {
         Self {
@@ -67,7 +67,7 @@ impl MutableBitmap {
         }
     }

-    /// Initializes an a pre-allocated [`MutableBitmap`] with capacity for `capacity` bits.
+    /// Reserves `additional` bits in the [`MutableBitmap`], potentially re-allocating its buffer.
     #[inline(always)]
     pub fn reserve(&mut self, additional: usize) {
         self.buffer
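Note: the old `reserve` doc was a copy-paste of `with_capacity`; the two have different semantics, mirroring `Vec`'s. A std-only sketch of the distinction:

    fn main() {
        // with_capacity pre-allocates a fresh container
        let mut v: Vec<u8> = Vec::with_capacity(8);
        assert!(v.capacity() >= 8);
        // reserve grows an existing one by `additional` elements (here from len 0)
        v.reserve(100);
        assert!(v.capacity() >= 100);
    }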
16 changes: 9 additions & 7 deletions src/compute/like.rs
@@ -17,6 +17,10 @@ fn is_like_pattern(c: char) -> bool {
     c == '%' || c == '_'
 }

+fn replace_pattern(pattern: &str) -> String {
+    pattern.replace('%', ".*").replace('_', ".")
+}
+
 #[inline]
 fn a_like_utf8<O: Offset, F: Fn(bool) -> bool>(
     lhs: &Utf8Array<O>,
@@ -40,7 +44,7 @@ fn a_like_utf8<O: Offset, F: Fn(bool) -> bool>(
     let pattern = if let Some(pattern) = map.get(pattern) {
         pattern
     } else {
-        let re_pattern = pattern.replace("%", ".*").replace("_", ".");
+        let re_pattern = replace_pattern(pattern);
         let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
             ArrowError::InvalidArgumentError(format!(
                 "Unable to build regex from LIKE pattern: {}",
@@ -113,7 +117,7 @@ fn a_like_utf8_scalar<O: Offset, F: Fn(bool) -> bool>(
         let ends_with = &rhs[1..];
         Bitmap::from_trusted_len_iter(lhs.values_iter().map(|x| op(x.ends_with(ends_with))))
     } else {
-        let re_pattern = rhs.replace("%", ".*").replace("_", ".");
+        let re_pattern = replace_pattern(rhs);
         let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
             ArrowError::InvalidArgumentError(format!(
                 "Unable to build regex from LIKE pattern: {}",
@@ -187,10 +191,8 @@ fn a_like_binary<O: Offset, F: Fn(bool) -> bool>(
     let pattern = if let Some(pattern) = map.get(pattern) {
         pattern
     } else {
-        let re_pattern = simdutf8::basic::from_utf8(pattern)
-            .unwrap()
-            .replace("%", ".*")
-            .replace("_", ".");
+        let re_pattern = simdutf8::basic::from_utf8(pattern).unwrap();
+        let re_pattern = replace_pattern(re_pattern);
         let re = BytesRegex::new(&format!("^{}$", re_pattern)).map_err(|e| {
             ArrowError::InvalidArgumentError(format!(
                 "Unable to build regex from LIKE pattern: {}",
@@ -270,7 +272,7 @@ fn a_like_binary_scalar<O: Offset, F: Fn(bool) -> bool>(
         let ends_with = &rhs[1..];
         Bitmap::from_trusted_len_iter(lhs.values_iter().map(|x| op(x.ends_with(ends_with))))
     } else {
-        let re_pattern = pattern.replace("%", ".*").replace("_", ".");
+        let re_pattern = replace_pattern(pattern);
         let re = BytesRegex::new(&format!("^{}$", re_pattern)).map_err(|e| {
             ArrowError::InvalidArgumentError(format!(
                 "Unable to build regex from LIKE pattern: {}",
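Note: the commit deduplicates the LIKE-to-regex translation into `replace_pattern`; `replace` with `char` arguments is also the form clippy's `single_char_pattern` lint prefers over one-character strings. A standalone, std-only sketch of the helper and how the callers anchor its output:

    fn replace_pattern(pattern: &str) -> String {
        // `%` matches any run of characters, `_` exactly one
        pattern.replace('%', ".*").replace('_', ".")
    }

    fn main() {
        assert_eq!(replace_pattern("ba%"), "ba.*");
        assert_eq!(replace_pattern("b_t"), "b.t");
        // the callers wrap the result as ^...$ before compiling it
        assert_eq!(format!("^{}$", replace_pattern("ba%")), "^ba.*$");
    }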
2 changes: 1 addition & 1 deletion src/compute/sort/mod.rs
@@ -355,7 +355,7 @@ where
         values.chain(null_indices.into_iter()).collect::<Vec<I>>()
     };

-    values.truncate(limit.unwrap_or_else(|| values.len()));
+    values.truncate(limit.unwrap_or(values.len()));

     let data_type = I::PRIMITIVE.into();
     PrimitiveArray::<I>::from_data(data_type, values.into(), None)
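Note: `values.len()` is cheap and side-effect free, so the lazy closure bought nothing; this is the pattern clippy's `unnecessary_lazy_evaluations` lint flags. A minimal sketch:

    fn main() {
        let mut values = vec![3_usize, 1, 2];
        let limit: Option<usize> = None;
        // no closure needed for a cheap, effect-free default
        let n = limit.unwrap_or(values.len());
        values.truncate(n);
        assert_eq!(values, vec![3, 1, 2]);
    }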
3 changes: 1 addition & 2 deletions src/io/ipc/read/common.rs
@@ -121,8 +121,7 @@ pub fn read_record_batch<R: Read + Seek>(
                     Ok(None)
                 }
             })
-            .map(|x| x.transpose())
-            .flatten()
+            .filter_map(|x| x.transpose())
             .collect::<Result<Vec<_>>>()?
     } else {
         fields
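Note: `transpose` flips `Result<Option<T>, E>` into `Option<Result<T, E>>`, so `filter_map` can drop the `None`s in a single pass; clippy's `map_flatten` lint flags the old two-step form. A self-contained sketch of the idiom:

    fn main() {
        let items: Vec<Result<Option<i32>, String>> =
            vec![Ok(Some(1)), Ok(None), Ok(Some(2))];
        // Nones vanish; any Err would short-circuit the final collect
        let collected: Result<Vec<i32>, String> =
            items.into_iter().filter_map(|x| x.transpose()).collect();
        assert_eq!(collected, Ok(vec![1, 2]));
    }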
3 changes: 1 addition & 2 deletions src/io/json/read/infer_schema.rs
@@ -125,8 +125,7 @@ pub fn infer_rows(rows: &[Value]) -> Result<DataType> {
     // discard None values and deduplicate entries
     let types = types
         .into_iter()
-        .map(|x| x.transpose())
-        .flatten()
+        .filter_map(|x| x.transpose())
         .collect::<Result<HashSet<_>>>()?;

     Ok(if !types.is_empty() {
9 changes: 3 additions & 6 deletions src/io/json_integration/read/array.rs
@@ -170,8 +170,7 @@ fn to_binary<O: Offset>(json_col: &ArrowJsonColumn, data_type: DataType) -> Arc<
         .as_ref()
         .unwrap()
         .iter()
-        .map(|value| value.as_str().map(|x| hex::decode(x).unwrap()).unwrap())
-        .flatten()
+        .flat_map(|value| value.as_str().map(|x| hex::decode(x).unwrap()).unwrap())
         .collect();
     Arc::new(BinaryArray::from_data(data_type, offsets, values, validity))
 }
@@ -184,8 +183,7 @@ fn to_utf8<O: Offset>(json_col: &ArrowJsonColumn, data_type: DataType) -> Arc<dy
         .as_ref()
         .unwrap()
         .iter()
-        .map(|value| value.as_str().unwrap().as_bytes().to_vec())
-        .flatten()
+        .flat_map(|value| value.as_str().unwrap().as_bytes().to_vec())
         .collect();
     Arc::new(Utf8Array::from_data(data_type, offsets, values, validity))
 }
@@ -309,8 +307,7 @@ pub fn to_array(
         .as_ref()
         .unwrap()
         .iter()
-        .map(|value| value.as_str().map(|x| hex::decode(x).unwrap()).unwrap())
-        .flatten()
+        .flat_map(|value| value.as_str().map(|x| hex::decode(x).unwrap()).unwrap())
         .collect();
     Ok(Arc::new(FixedSizeBinaryArray::from_data(
         data_type, values, validity,
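Note: `flat_map` fuses the `map` + `flatten` pair: each closure call returns something iterable (here a `Vec<u8>`) whose items are chained into one stream. A std-only sketch with a hypothetical one-byte decoder standing in for `hex::decode`:

    fn main() {
        let rows = vec!["0a", "ff"];
        let bytes: Vec<u8> = rows
            .iter()
            // stand-in for hex::decode: each item expands to a Vec of bytes
            .flat_map(|s| vec![u8::from_str_radix(s, 16).unwrap()])
            .collect();
        assert_eq!(bytes, vec![0x0a, 0xff]);
    }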
8 changes: 2 additions & 6 deletions src/io/parquet/read/schema/convert.rs
@@ -12,7 +12,7 @@ use crate::datatypes::{DataType, Field, IntervalUnit, TimeUnit};
 /// Converts [`ParquetType`]s to a [`Field`], ignoring parquet fields that do not contain
 /// any physical column.
 pub fn parquet_to_arrow_schema(fields: &[ParquetType]) -> Vec<Field> {
-    fields.iter().map(to_field).flatten().collect::<Vec<_>>()
+    fields.iter().filter_map(to_field).collect::<Vec<_>>()
 }

 fn from_int32(
@@ -224,11 +224,7 @@ fn non_repeated_group(
 /// Converts a parquet group type to an arrow [`DataType::Struct`].
 /// Returns [`None`] if all its fields are empty
 fn to_struct(fields: &[ParquetType]) -> Option<DataType> {
-    let fields = fields
-        .iter()
-        .map(to_field)
-        .flatten()
-        .collect::<Vec<Field>>();
+    let fields = fields.iter().filter_map(to_field).collect::<Vec<Field>>();
     if fields.is_empty() {
         None
     } else {
2 changes: 1 addition & 1 deletion src/io/parquet/read/statistics/mod.rs
@@ -101,7 +101,7 @@ fn get_fields(field: &Field) -> Vec<&Field> {
     match field.data_type.to_logical_type() {
         DataType::List(inner) => get_fields(inner),
         DataType::LargeList(inner) => get_fields(inner),
-        DataType::Struct(fields) => fields.iter().map(get_fields).flatten().collect(),
+        DataType::Struct(fields) => fields.iter().flat_map(get_fields).collect(),
         _ => vec![field],
     }
 }
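Note: the same `flat_map` rewrite applied to a recursive walk: the closure is the recursive function itself, and each call's `Vec` is chained into the result. A self-contained sketch on a toy schema tree (names and types are illustrative, not arrow2's):

    enum Node {
        Leaf(&'static str),
        Group(Vec<Node>),
    }

    // collect the name of every leaf, recursing through groups
    fn names(node: &Node) -> Vec<&'static str> {
        match node {
            Node::Group(children) => children.iter().flat_map(names).collect(),
            Node::Leaf(name) => vec![*name],
        }
    }

    fn main() {
        let tree = Node::Group(vec![
            Node::Leaf("a"),
            Node::Group(vec![Node::Leaf("b")]),
        ]);
        assert_eq!(names(&tree), vec!["a", "b"]);
    }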
22 changes: 9 additions & 13 deletions src/io/parquet/write/dictionary.rs
@@ -60,19 +60,15 @@ fn encode_keys<K: DictionaryKey>(
     // encode indices
     // compute the required number of bits
     if let Some(validity) = validity {
-        let keys = array
-            .iter()
-            .flatten()
-            .map(|x| {
-                let index = x.to_usize().unwrap();
-                // discard indices whose values are null, since they are part of the def levels.
-                if validity.get_bit(index) {
-                    Some(index as u32)
-                } else {
-                    None
-                }
-            })
-            .flatten();
+        let keys = array.iter().flatten().filter_map(|x| {
+            let index = x.to_usize().unwrap();
+            // discard indices whose values are null, since they are part of the def levels.
+            if validity.get_bit(index) {
+                Some(index as u32)
+            } else {
+                None
+            }
+        });
         let num_bits = utils::get_bit_width(keys.clone().max().unwrap_or(0) as u64) as u8;

         let keys = utils::ExactSizedIter::new(keys, array.len() - null_count);
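Note: after the `filter_map` cleanup, the interesting step is sizing the encoding from the largest surviving key. A hedged, std-only sketch of that computation, with `bit_width` as a stand-in for arrow2's `utils::get_bit_width` (not its actual implementation):

    // stand-in for utils::get_bit_width: bits needed to represent `max`
    fn bit_width(max: u64) -> u8 {
        (64 - max.leading_zeros()) as u8
    }

    fn main() {
        let keys = [0_u32, 3, 5];
        // mirrors `keys.clone().max().unwrap_or(0)` in the diff
        let num_bits = bit_width(keys.iter().copied().max().unwrap_or(0) as u64);
        assert_eq!(num_bits, 3); // 5 = 0b101 needs 3 bits
    }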