diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index db2f64861ca59..ab04528fa2a49 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -42,6 +42,27 @@ env: ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} jobs: + lint: + name: Lint with clippy + strategy: + matrix: + rust: [nightly-2020-04-22] + if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + runs-on: ubuntu-latest + steps: + - name: Checkout Arrow + uses: actions/checkout@v2 + with: + fetch-depth: 0 + - name: Install Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: ${{ matrix.rust }} + override: true + components: clippy + - name: Run Clippy + shell: bash + run: ci/scripts/rust_lint.sh debian: name: AMD64 Debian 10 Rust ${{ matrix.rust }} diff --git a/ci/scripts/rust_lint.sh b/ci/scripts/rust_lint.sh new file mode 100755 index 0000000000000..fa8c3cacd0cca --- /dev/null +++ b/ci/scripts/rust_lint.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e + +pushd rust/arrow + max_error_cnt=3 + max_warn_cnt=9 + + error_msg=$(cargo clippy -- -A clippy::redundant_field_names 2>&1 | grep 'error: aborting due to') + + error_cnt=$(echo "${error_msg}" | awk '{print $5}') + [[ ${error_cnt} -gt ${max_error_cnt} ]] && { + echo "More clippy errors introduced, got: ${error_cnt}, was: ${max_error_cnt}" + exit 1 + } + + warn_cnt=$(echo "${error_msg}" | awk '{print $8}') + [[ ${warn_cnt} -gt ${max_warn_cnt} ]] && { + echo "More clippy warnings introduced, got: ${warn_cnt}, was: ${max_warn_cnt}" + exit 1 + } +popd diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs index 64a3d5a2cff98..6e8f196c2ccfa 100644 --- a/rust/arrow/src/array/builder.rs +++ b/rust/arrow/src/array/builder.rs @@ -1143,7 +1143,7 @@ fn append_binary_data( DataType::List(Box::new(DataType::UInt8)), array.len(), None, - array.null_buffer().map(|buf| buf.clone()), + array.null_buffer().cloned(), array.offset(), vec![(&array.buffers()[0]).clone()], vec![int_data], @@ -1195,7 +1195,7 @@ impl ArrayBuilder for FixedSizeBinaryBuilder { DataType::FixedSizeList(Box::new(DataType::UInt8), self.builder.list_len), array.len(), None, - array.null_buffer().map(|buf| buf.clone()), + array.null_buffer().cloned(), array.offset(), vec![], vec![int_data], diff --git a/rust/arrow/src/array/data.rs b/rust/arrow/src/array/data.rs index 876c2d554f5f7..409fd5c026d0c 100644 --- a/rust/arrow/src/array/data.rs +++ b/rust/arrow/src/array/data.rs @@ -146,6 +146,10 @@ impl ArrayData { self.len } + pub fn is_empty(&self) -> bool { + self.len == 0 + } + /// Returns the offset of this array pub fn offset(&self) -> usize { self.offset diff --git a/rust/arrow/src/array/mod.rs b/rust/arrow/src/array/mod.rs index dd69b7fee2030..c2bb03db1811a 100644 --- a/rust/arrow/src/array/mod.rs +++ b/rust/arrow/src/array/mod.rs @@ -81,6 +81,7 @@ //! ) //! ``` +#[allow(clippy::module_inception)] mod array; mod builder; mod cast; diff --git a/rust/arrow/src/array/ord.rs b/rust/arrow/src/array/ord.rs index 84e5ceda079cb..ab8a31f89a965 100644 --- a/rust/arrow/src/array/ord.rs +++ b/rust/arrow/src/array/ord.rs @@ -69,63 +69,61 @@ impl OrdArray for NullArray { } /// Convert ArrayRef to OrdArray trait object -pub fn as_ordarray(values: &ArrayRef) -> Result> { +pub fn as_ordarray(values: &ArrayRef) -> Result<&OrdArray> { match values.data_type() { - DataType::Boolean => Ok(Box::new(as_boolean_array(&values))), - DataType::Utf8 => Ok(Box::new(as_string_array(&values))), - DataType::Null => Ok(Box::new(as_null_array(&values))), - DataType::Int8 => Ok(Box::new(as_primitive_array::(&values))), - DataType::Int16 => Ok(Box::new(as_primitive_array::(&values))), - DataType::Int32 => Ok(Box::new(as_primitive_array::(&values))), - DataType::Int64 => Ok(Box::new(as_primitive_array::(&values))), - DataType::UInt8 => Ok(Box::new(as_primitive_array::(&values))), - DataType::UInt16 => Ok(Box::new(as_primitive_array::(&values))), - DataType::UInt32 => Ok(Box::new(as_primitive_array::(&values))), - DataType::UInt64 => Ok(Box::new(as_primitive_array::(&values))), - DataType::Date32(_) => Ok(Box::new(as_primitive_array::(&values))), - DataType::Date64(_) => Ok(Box::new(as_primitive_array::(&values))), - DataType::Time32(Second) => { - Ok(Box::new(as_primitive_array::(&values))) + DataType::Boolean => Ok(as_boolean_array(&values)), + DataType::Utf8 => Ok(as_string_array(&values)), + DataType::Null => Ok(as_null_array(&values)), + DataType::Int8 => Ok(as_primitive_array::(&values)), + DataType::Int16 => Ok(as_primitive_array::(&values)), + DataType::Int32 => Ok(as_primitive_array::(&values)), + DataType::Int64 => Ok(as_primitive_array::(&values)), + DataType::UInt8 => Ok(as_primitive_array::(&values)), + DataType::UInt16 => Ok(as_primitive_array::(&values)), + DataType::UInt32 => Ok(as_primitive_array::(&values)), + DataType::UInt64 => Ok(as_primitive_array::(&values)), + DataType::Date32(_) => Ok(as_primitive_array::(&values)), + DataType::Date64(_) => Ok(as_primitive_array::(&values)), + DataType::Time32(Second) => Ok(as_primitive_array::(&values)), + DataType::Time32(Millisecond) => { + Ok(as_primitive_array::(&values)) + } + DataType::Time64(Microsecond) => { + Ok(as_primitive_array::(&values)) + } + DataType::Time64(Nanosecond) => { + Ok(as_primitive_array::(&values)) } - DataType::Time32(Millisecond) => Ok(Box::new(as_primitive_array::< - Time32MillisecondType, - >(&values))), - DataType::Time64(Microsecond) => Ok(Box::new(as_primitive_array::< - Time64MicrosecondType, - >(&values))), - DataType::Time64(Nanosecond) => Ok(Box::new(as_primitive_array::< - Time64NanosecondType, - >(&values))), DataType::Timestamp(Second, _) => { - Ok(Box::new(as_primitive_array::(&values))) + Ok(as_primitive_array::(&values)) + } + DataType::Timestamp(Millisecond, _) => { + Ok(as_primitive_array::(&values)) + } + DataType::Timestamp(Microsecond, _) => { + Ok(as_primitive_array::(&values)) + } + DataType::Timestamp(Nanosecond, _) => { + Ok(as_primitive_array::(&values)) + } + DataType::Interval(IntervalUnit::YearMonth) => { + Ok(as_primitive_array::(&values)) } - DataType::Timestamp(Millisecond, _) => Ok(Box::new(as_primitive_array::< - TimestampMillisecondType, - >(&values))), - DataType::Timestamp(Microsecond, _) => Ok(Box::new(as_primitive_array::< - TimestampMicrosecondType, - >(&values))), - DataType::Timestamp(Nanosecond, _) => Ok(Box::new(as_primitive_array::< - TimestampNanosecondType, - >(&values))), - DataType::Interval(IntervalUnit::YearMonth) => Ok(Box::new( - as_primitive_array::(&values), - )), DataType::Interval(IntervalUnit::DayTime) => { - Ok(Box::new(as_primitive_array::(&values))) + Ok(as_primitive_array::(&values)) } DataType::Duration(TimeUnit::Second) => { - Ok(Box::new(as_primitive_array::(&values))) + Ok(as_primitive_array::(&values)) + } + DataType::Duration(TimeUnit::Millisecond) => { + Ok(as_primitive_array::(&values)) + } + DataType::Duration(TimeUnit::Microsecond) => { + Ok(as_primitive_array::(&values)) + } + DataType::Duration(TimeUnit::Nanosecond) => { + Ok(as_primitive_array::(&values)) } - DataType::Duration(TimeUnit::Millisecond) => Ok(Box::new(as_primitive_array::< - DurationMillisecondType, - >(&values))), - DataType::Duration(TimeUnit::Microsecond) => Ok(Box::new(as_primitive_array::< - DurationMicrosecondType, - >(&values))), - DataType::Duration(TimeUnit::Nanosecond) => Ok(Box::new(as_primitive_array::< - DurationNanosecondType, - >(&values))), t => Err(ArrowError::ComputeError(format!( "Lexical Sort not supported for data type {:?}", t diff --git a/rust/arrow/src/array/union.rs b/rust/arrow/src/array/union.rs index 695237af28585..21c88a1245bc1 100644 --- a/rust/arrow/src/array/union.rs +++ b/rust/arrow/src/array/union.rs @@ -172,7 +172,7 @@ impl UnionArray { .iter() .filter(|i| *i < &0) .collect::>(); - if invalid_type_ids.len() > 0 { + if !invalid_type_ids.is_empty() { return Err(ArrowError::InvalidArgumentError(format!( "Type Ids must be positive and cannot be greater than the number of \ child arrays, found:\n{:?}", @@ -188,7 +188,7 @@ impl UnionArray { .iter() .filter(|i| *i < &0 || *i > &max_len) .collect::>(); - if invalid_offsets.len() > 0 { + if !invalid_offsets.is_empty() { return Err(ArrowError::InvalidArgumentError(format!( "Offsets must be positive and within the length of the Array, \ found:\n{:?}", @@ -325,7 +325,7 @@ impl fmt::Debug for UnionArray { column.data_type() )?; fmt::Debug::fmt(column, f)?; - writeln!(f, "")?; + writeln!(f)?; } writeln!(f, "]") } @@ -484,7 +484,7 @@ impl UnionBuilder { /// Appends a null to this builder. pub fn append_null(&mut self) -> Result<()> { - if let None = self.bitmap_builder { + if self.bitmap_builder.is_none() { let mut builder = BooleanBufferBuilder::new(self.len + 1); for _ in 0..self.len { builder.append(true)?; @@ -499,7 +499,7 @@ impl UnionBuilder { self.type_id_builder.append(i8::default())?; // Handle sparse union - if let None = self.value_offset_builder { + if self.value_offset_builder.is_none() { for (_, fd) in self.fields.iter_mut() { fd.append_null_dynamic()?; } @@ -518,25 +518,22 @@ impl UnionBuilder { let mut field_data = match self.fields.remove(&type_name) { Some(data) => data, - None => { - let field_data = match self.value_offset_builder { - Some(_) => { - FieldData::new(self.fields.len() as i8, T::get_data_type(), None) - } - None => { - let mut fd = FieldData::new( - self.fields.len() as i8, - T::get_data_type(), - Some(BooleanBufferBuilder::new(1)), - ); - for _ in 0..self.len { - fd.append_null::()?; - } - fd + None => match self.value_offset_builder { + Some(_) => { + FieldData::new(self.fields.len() as i8, T::get_data_type(), None) + } + None => { + let mut fd = FieldData::new( + self.fields.len() as i8, + T::get_data_type(), + Some(BooleanBufferBuilder::new(1)), + ); + for _ in 0..self.len { + fd.append_null::()?; } - }; - field_data - } + fd + } + }, }; self.type_id_builder.append(field_data.type_id)?; diff --git a/rust/arrow/src/bitmap.rs b/rust/arrow/src/bitmap.rs index 83ac5427c5537..06412af664f2e 100644 --- a/rust/arrow/src/bitmap.rs +++ b/rust/arrow/src/bitmap.rs @@ -51,6 +51,10 @@ impl Bitmap { self.bits.len() } + pub fn is_empty(&self) -> bool { + self.bits.is_empty() + } + pub fn is_set(&self, i: usize) -> bool { assert!(i < (self.bits.len() << 3)); unsafe { bit_util::get_bit_raw(self.bits.raw_data(), i) } @@ -60,7 +64,7 @@ impl Bitmap { &self.bits } - pub fn to_buffer(self) -> Buffer { + pub fn into_buffer(self) -> Buffer { self.bits } } diff --git a/rust/arrow/src/compute/kernels/sort.rs b/rust/arrow/src/compute/kernels/sort.rs index b612e4fc81e8c..cc2583a2bf3e6 100644 --- a/rust/arrow/src/compute/kernels/sort.rs +++ b/rust/arrow/src/compute/kernels/sort.rs @@ -271,7 +271,7 @@ pub fn lexsort_to_indices(columns: &Vec) -> Result { // convert ArrayRefs to OrdArray trait objects and perform row count check let flat_columns = columns .iter() - .map(|column| -> Result<(Box<&OrdArray>, SortOptions)> { + .map(|column| -> Result<(&OrdArray, SortOptions)> { // row count check let curr_row_count = column.values.len() - column.values.offset(); match row_count { @@ -292,7 +292,7 @@ pub fn lexsort_to_indices(columns: &Vec) -> Result { column.options.unwrap_or_default(), )) }) - .collect::, SortOptions)>>>()?; + .collect::>>()?; let lex_comparator = |a_idx: &usize, b_idx: &usize| -> Ordering { for column in flat_columns.iter() { diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs index 4e43e6af979c9..9341190387315 100644 --- a/rust/arrow/src/csv/reader.rs +++ b/rust/arrow/src/csv/reader.rs @@ -187,7 +187,7 @@ fn infer_file_schema( /// /// If `max_read_records` is not set, all files will be read fully to infer the schema. pub fn infer_schema_from_files( - files: &Vec, + files: &[String], delimiter: u8, max_read_records: Option, has_header: bool, @@ -207,7 +207,7 @@ pub fn infer_schema_from_files( } schemas.push(schema.clone()); records_to_read -= records_read; - if records_to_read <= 0 { + if records_to_read == 0 { break; } } @@ -283,11 +283,8 @@ impl Reader { let mut reader_builder = csv_crate::ReaderBuilder::new(); reader_builder.has_headers(has_header); - match delimiter { - Some(c) => { - reader_builder.delimiter(c); - } - _ => (), + if let Some(c) = delimiter { + reader_builder.delimiter(c); } let csv_reader = reader_builder.from_reader(buf_reader); @@ -302,6 +299,7 @@ impl Reader { } /// Read the next batch of rows + #[allow(clippy::should_implement_trait)] pub fn next(&mut self) -> Result> { // read a batch of rows into memory let mut rows: Vec = Vec::with_capacity(self.batch_size); diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index 3b025c2aef99f..1bc5027bb41f7 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -1218,11 +1218,6 @@ impl Field { } } - /// Converts to a `String` representation of the `Field` - pub fn to_string(&self) -> String { - format!("{}: {:?}", self.name, self.data_type) - } - /// Merge field into self if it is compatible. Struct will be merged recursively. /// /// Example: @@ -1251,7 +1246,7 @@ impl Field { DataType::Struct(from_nested_fields) => { for from_field in from_nested_fields { let mut is_new_field = true; - for self_field in nested_fields.into_iter() { + for self_field in nested_fields.iter_mut() { if self_field.name != from_field.name { continue; } @@ -1274,7 +1269,7 @@ impl Field { DataType::Union(from_nested_fields) => { for from_field in from_nested_fields { let mut is_new_field = true; - for self_field in nested_fields.into_iter() { + for self_field in nested_fields.iter_mut() { if from_field == self_field { is_new_field = false; break; @@ -1336,7 +1331,7 @@ impl Field { impl fmt::Display for Field { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}", self.to_string()) + write!(f, "{}: {:?}", self.name, self.data_type) } } @@ -1429,7 +1424,7 @@ impl Schema { /// ]), /// ); /// ``` - pub fn try_merge(schemas: &Vec) -> Result { + pub fn try_merge(schemas: &[Self]) -> Result { let mut merged = Self::empty(); for schema in schemas { diff --git a/rust/arrow/src/ipc/reader.rs b/rust/arrow/src/ipc/reader.rs index e37ba8567492d..cb7fb4cb4100e 100644 --- a/rust/arrow/src/ipc/reader.rs +++ b/rust/arrow/src/ipc/reader.rs @@ -631,6 +631,7 @@ impl FileReader { } /// Read the next record batch + #[allow(clippy::should_implement_trait)] pub fn next(&mut self) -> Result> { // get current block if self.current_block < self.total_blocks { @@ -778,6 +779,7 @@ impl StreamReader { } /// Read the next record batch + #[allow(clippy::should_implement_trait)] pub fn next(&mut self) -> Result> { if self.finished { return Ok(None); diff --git a/rust/arrow/src/json/reader.rs b/rust/arrow/src/json/reader.rs index dfab56282554d..f9fde68022ee4 100644 --- a/rust/arrow/src/json/reader.rs +++ b/rust/arrow/src/json/reader.rs @@ -426,6 +426,7 @@ impl Reader { } /// Read the next batch of records + #[allow(clippy::should_implement_trait)] pub fn next(&mut self) -> Result> { let mut rows: Vec = Vec::with_capacity(self.batch_size); let mut line = String::new(); @@ -445,7 +446,7 @@ impl Reader { } let rows = &rows[..]; - let projection = self.projection.clone().unwrap_or_else(|| vec![]); + let projection = self.projection.clone().unwrap_or_else(Vec::new); let arrays: Result> = self .schema .clone() diff --git a/rust/arrow/src/memory.rs b/rust/arrow/src/memory.rs index c719c96028082..7aa8f66c76071 100644 --- a/rust/arrow/src/memory.rs +++ b/rust/arrow/src/memory.rs @@ -149,12 +149,31 @@ pub fn allocate_aligned(size: usize) -> *mut u8 { } } +/// # Safety +/// +/// This function is unsafe because undefined behavior can result if the caller does not ensure all +/// of the following: +/// +/// * ptr must denote a block of memory currently allocated via this allocator, +/// +/// * size must be the same size that was used to allocate that block of memory, pub unsafe fn free_aligned(ptr: *mut u8, size: usize) { if size != 0x00 && ptr != BYPASS_PTR.as_ptr() { std::alloc::dealloc(ptr, Layout::from_size_align_unchecked(size, ALIGNMENT)); } } +/// # Safety +/// +/// This function is unsafe because undefined behavior can result if the caller does not ensure all +/// of the following: +/// +/// * ptr must be currently allocated via this allocator, +/// +/// * new_size must be greater than zero. +/// +/// * new_size, when rounded up to the nearest multiple of [ALIGNMENT], must not overflow (i.e., +/// the rounded value must be less than usize::MAX). pub unsafe fn reallocate(ptr: *mut u8, old_size: usize, new_size: usize) -> *mut u8 { if ptr == BYPASS_PTR.as_ptr() { allocate_aligned(new_size) @@ -171,6 +190,19 @@ pub unsafe fn reallocate(ptr: *mut u8, old_size: usize, new_size: usize) -> *mut } } +/// # Safety +/// +/// Behavior is undefined if any of the following conditions are violated: +/// +/// * `src` must be [valid] for reads of `len * size_of::()` bytes. +/// +/// * `dst` must be [valid] for writes of `len * size_of::()` bytes. +/// +/// * Both `src` and `dst` must be properly aligned. +/// +/// `memcpy` creates a bitwise copy of `T`, regardless of whether `T` is [`Copy`]. If `T` is not +/// [`Copy`], using both the values in the region beginning at `*src` and the region beginning at +/// `*dst` can [violate memory safety][read-ownership]. pub unsafe fn memcpy(dst: *mut u8, src: *const u8, len: usize) { if len != 0x00 && src != BYPASS_PTR.as_ptr() { std::ptr::copy_nonoverlapping(src, dst, len) diff --git a/rust/arrow/src/util/bit_util.rs b/rust/arrow/src/util/bit_util.rs index 0c69c19f9e622..a2ada2c03237d 100644 --- a/rust/arrow/src/util/bit_util.rs +++ b/rust/arrow/src/util/bit_util.rs @@ -56,6 +56,8 @@ pub fn get_bit(data: &[u8], i: usize) -> bool { /// Returns whether bit at position `i` in `data` is set or not. /// +/// # Safety +/// /// Note this doesn't do any bound checking, for performance reason. The caller is /// responsible to guarantee that `i` is within bounds. #[inline] @@ -71,6 +73,8 @@ pub fn set_bit(data: &mut [u8], i: usize) { /// Sets bit at position `i` for `data` /// +/// # Safety +/// /// Note this doesn't do any bound checking, for performance reason. The caller is /// responsible to guarantee that `i` is within bounds. #[inline] @@ -80,6 +84,8 @@ pub unsafe fn set_bit_raw(data: *mut u8, i: usize) { /// Sets bits in the non-inclusive range `start..end` for `data` /// +/// # Safety +/// /// Note this doesn't do any bound checking, for performance reason. The caller is /// responsible to guarantee that both `start` and `end` are within bounds. #[inline] @@ -158,6 +164,8 @@ pub fn ceil(value: usize, divisor: usize) -> usize { /// Performs SIMD bitwise binary operations. /// +/// # Safety +/// /// Note that each slice should be 64 bytes and it is the callers responsibility to ensure /// that this is the case. If passed slices larger than 64 bytes the operation will only /// be performed on the first 64 bytes. Slices less than 64 bytes will panic. diff --git a/rust/arrow/src/util/string_writer.rs b/rust/arrow/src/util/string_writer.rs index e6419f03bbc2e..a582b6891cc00 100644 --- a/rust/arrow/src/util/string_writer.rs +++ b/rust/arrow/src/util/string_writer.rs @@ -74,6 +74,12 @@ impl StringWriter { } } +impl Default for StringWriter { + fn default() -> Self { + Self::new() + } +} + impl ToString for StringWriter { fn to_string(&self) -> String { self.data.clone()