Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(function): Support Semi-structured function GET/GET_IGNORE_CASE/GET_PATH #4684

Merged
merged 1 commit into from
Apr 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions common/datavalues/src/columns/column.rs
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,9 @@ impl std::fmt::Debug for dyn Column + '_ {
Struct => {
fmt_dyn!(col, StructColumn, f)
},
Variant | VariantArray | VariantObject => {
fmt_dyn!(col, JsonColumn, f)
}
_ => {
unimplemented!()
}
Expand Down
18 changes: 18 additions & 0 deletions common/datavalues/src/types/type_id.rs
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,24 @@ impl TypeID {
)
}

/// Returns `true` when this type id is any of the semi-structured ids:
/// `Variant`, `VariantArray` or `VariantObject`.
#[inline]
pub fn is_variant(&self) -> bool {
matches!(
self,
TypeID::Variant | TypeID::VariantArray | TypeID::VariantObject
)
}

/// Returns `true` when this type id is `Variant` or `VariantArray`
/// (`VariantObject` is excluded).
#[inline]
pub fn is_variant_or_array(&self) -> bool {
matches!(self, TypeID::Variant | TypeID::VariantArray)
}

/// Returns `true` when this type id is `Variant` or `VariantObject`
/// (`VariantArray` is excluded).
#[inline]
pub fn is_variant_or_object(&self) -> bool {
matches!(self, TypeID::Variant | TypeID::VariantObject)
}

#[inline]
pub fn numeric_byte_size(&self) -> Result<usize> {
match self {
Expand Down
1 change: 1 addition & 0 deletions common/functions/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ serde_json = "1.0.79"
sha1 = "0.10.1"
sha2 = "0.10.2"
simdutf8 = "0.1.4"
sqlparser = { git = "https://github.com/datafuse-extras/sqlparser-rs", rev = "1c8d3f1" }
strength_reduce = "0.2.3"
twox-hash = "1.6.2"
uuid = { version = "0.8.2", features = ["v4"] }
Expand Down
251 changes: 251 additions & 0 deletions common/functions/src/scalars/semi_structureds/get.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
// Copyright 2022 Datafuse Labs.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::fmt;
use std::sync::Arc;

use common_datavalues::prelude::*;
use common_exception::ErrorCode;
use common_exception::Result;
use serde_json::Value as JsonValue;
use sqlparser::ast::Value;
use sqlparser::dialect::GenericDialect;
use sqlparser::parser::Parser;
use sqlparser::tokenizer::Tokenizer;

use crate::scalars::Function;
use crate::scalars::FunctionDescription;
use crate::scalars::FunctionFeatures;

/// `GET`: looks a value up by a single key taken verbatim from the second
/// argument (`BY_PATH = false`), matching object keys exactly
/// (`IGNORE_CASE = false`).
pub type GetFunction = GetFunctionImpl<false, false>;

/// `GET_IGNORE_CASE`: like `GET`, but when no exact key matches, object keys
/// are compared case-insensitively (`IGNORE_CASE = true`).
pub type GetIgnoreCaseFunction = GetFunctionImpl<false, true>;

/// `GET_PATH`: the second argument is a path expression that is parsed into a
/// sequence of keys (`BY_PATH = true`) which are then followed one by one.
pub type GetPathFunction = GetFunctionImpl<true, false>;

/// Shared implementation behind the `GET`, `GET_IGNORE_CASE` and `GET_PATH`
/// scalar functions.
///
/// * `BY_PATH` - when `true`, the second argument is parsed as a path made of
///   several keys; when `false`, each value of the second argument is used as
///   a single key.
/// * `IGNORE_CASE` - when `true`, a failed exact key lookup on an object falls
///   back to a case-insensitive comparison of the keys.
#[derive(Clone)]
pub struct GetFunctionImpl<const BY_PATH: bool, const IGNORE_CASE: bool> {
// Name reported by `Function::name` and, upper-cased, by `Display`.
display_name: String,
}

impl<const BY_PATH: bool, const IGNORE_CASE: bool> GetFunctionImpl<BY_PATH, IGNORE_CASE> {
    /// Creates a boxed instance of this function that reports `display_name`
    /// as its name.
    pub fn try_create(display_name: &str) -> Result<Box<dyn Function>> {
        let function = Self {
            display_name: display_name.to_owned(),
        };
        Ok(Box::new(function))
    }

    /// Builds the description used to register this function: it is created
    /// through [`Self::try_create`], flagged as deterministic, and declared
    /// with `num_arguments(2)`.
    pub fn desc() -> FunctionDescription {
        let features = FunctionFeatures::default()
            .deterministic()
            .num_arguments(2);
        FunctionDescription::creator(Box::new(Self::try_create)).features(features)
    }
}

impl<const BY_PATH: bool, const IGNORE_CASE: bool> Function
    for GetFunctionImpl<BY_PATH, IGNORE_CASE>
{
    /// Returns the name this function was created with.
    fn name(&self) -> &str {
        self.display_name.as_str()
    }

    /// Validates the argument types and returns the result type, which is
    /// always a nullable variant.
    ///
    /// Accepted argument types:
    /// * data argument: any variant type, or only `Variant`/`VariantObject`
    ///   when `IGNORE_CASE` is set;
    /// * path argument: a string, or additionally an unsigned integer when
    ///   neither `IGNORE_CASE` nor `BY_PATH` is set.
    ///
    /// Any other combination yields `ErrorCode::IllegalDataType`.
    fn return_type(&self, args: &[&DataTypePtr]) -> Result<DataTypePtr> {
        let data_type = args[0];
        let path_type = args[1];
        let data_id = data_type.data_type_id();
        let path_id = path_type.data_type_id();

        // `is_variant_or_object` is a strict subset of `is_variant`, so the
        // case-insensitive flavour only needs the narrower check.
        let data_ok = if IGNORE_CASE {
            data_id.is_variant_or_object()
        } else {
            data_id.is_variant()
        };
        // Only plain GET accepts an unsigned integer key.
        let path_ok = if IGNORE_CASE || BY_PATH {
            path_id.is_string()
        } else {
            path_id.is_string() || path_id.is_unsigned_integer()
        };

        if data_ok && path_ok {
            return Ok(Arc::new(NullableType::create(VariantType::arc())));
        }

        Err(ErrorCode::IllegalDataType(format!(
            "Invalid argument types for function '{}': ({:?}, {:?})",
            self.display_name.to_uppercase(),
            data_type,
            path_type
        )))
    }

    /// Turns the second column into per-row key paths (parsed when `BY_PATH`
    /// is set, taken verbatim otherwise) and extracts the matching values
    /// from the first column.
    fn eval(&self, columns: &ColumnsWithField, input_rows: usize) -> Result<ColumnRef> {
        let path_column = columns[1].column();
        let path_keys = if BY_PATH {
            parse_path_keys(path_column)
        } else {
            build_path_keys(path_column)
        }?;

        extract_value_by_path(columns[0].column(), path_keys, input_rows, IGNORE_CASE)
    }
}

impl<const BY_PATH: bool, const IGNORE_CASE: bool> fmt::Display
    for GetFunctionImpl<BY_PATH, IGNORE_CASE>
{
    /// Writes the function name in upper case.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let upper_name = self.display_name.to_uppercase();
        f.write_str(&upper_name)
    }
}

/// Parses every value of a string column as a `GET_PATH` path expression.
///
/// A constant column is unwrapped first, so it contributes a single path.
/// Each path string is tokenized and handed to the SQL parser's map-key
/// parser; the resulting keys are converted to `DataValue`s:
/// numbers through `DataValue::try_from_literal`, quoted / colon / period
/// strings to `DataValue::String`, anything else to `DataValue::Null`.
///
/// # Errors
///
/// Returns an error instead of panicking when
/// * a path is empty,
/// * a path is not valid UTF-8,
/// * a path cannot be tokenized or parsed,
/// * a numeric key cannot be converted to a `DataValue`.
fn parse_path_keys(column: &ColumnRef) -> Result<Vec<Vec<DataValue>>> {
    let column: &StringColumn = if column.is_const() {
        let const_column: &ConstColumn = Series::check_get(column)?;
        Series::check_get(const_column.inner())?
    } else {
        Series::check_get(column)?
    };

    let dialect = &GenericDialect {};
    let mut path_keys: Vec<Vec<DataValue>> = Vec::new();
    for v in column.iter() {
        if v.is_empty() {
            return Err(ErrorCode::SyntaxException(
                "Bad compound object's field path name: '' in GET_PATH",
            ));
        }

        // The column holds raw bytes supplied by the user, so invalid UTF-8
        // must surface as an error rather than a panic.
        let definition = std::str::from_utf8(v).map_err(|utf8_error| {
            ErrorCode::SyntaxException(format!(
                "Bad compound object's field path name in GET_PATH, invalid UTF-8: {}",
                utf8_error
            ))
        })?;

        let mut tokenizer = Tokenizer::new(dialect, definition);
        let (tokens, position_map) = tokenizer.tokenize().map_err(|tokenize_error| {
            ErrorCode::SyntaxException(format!(
                "Can not tokenize definition: {}, Error: {:?}",
                definition, tokenize_error
            ))
        })?;

        let values = Parser::new(tokens, position_map, dialect)
            .parse_map_keys()
            .map_err(ErrorCode::from)?;

        // Collecting into `Result` stops at the first key that fails to
        // convert and propagates its error.
        let path_key = values
            .iter()
            .map(|v| match v {
                Value::Number(value, _) => DataValue::try_from_literal(value, None),
                Value::SingleQuotedString(value)
                | Value::ColonString(value)
                | Value::PeriodString(value) => {
                    Ok(DataValue::String(value.clone().into_bytes()))
                }
                _ => Ok(DataValue::Null),
            })
            .collect::<Result<Vec<DataValue>>>()?;

        path_keys.push(path_key);
    }
    Ok(path_keys)
}

/// Wraps every value of `column` into its own single-element key path.
///
/// A constant column is unwrapped first, so the result is built from its
/// inner column.
fn build_path_keys(column: &ColumnRef) -> Result<Vec<Vec<DataValue>>> {
    if column.is_const() {
        let const_column: &ConstColumn = Series::check_get(column)?;
        return build_path_keys(const_column.inner());
    }

    let path_keys = (0..column.len())
        .map(|row| vec![column.get(row)])
        .collect();
    Ok(path_keys)
}

fn extract_value_by_path(
column: &ColumnRef,
path_keys: Vec<Vec<DataValue>>,
input_rows: usize,
ignore_case: bool,
) -> Result<ColumnRef> {
let column: &JsonColumn = if column.is_const() {
let const_column: &ConstColumn = Series::check_get(column)?;
Series::check_get(const_column.inner())?
} else {
Series::check_get(column)?
};

let mut builder = NullableColumnBuilder::<JsonValue>::with_capacity(input_rows);
for path_key in path_keys.iter() {
if path_key.is_empty() {
for _ in 0..column.len() {
builder.append_null();
}
continue;
}
for v in column.iter() {
let mut found_value = true;
Copy link
Member

@sundy-li sundy-li Apr 7, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will found_value defaults to false make this more simple?

Copy link
Member Author

@b41sh b41sh Apr 7, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The default value of found_value is not simpler to set as false, because the lookup is recursive. If we can find a value we need to set it to true, otherwise we set it to false. If the default value is false, the following set found_value = false can't be omitted, but need to add some code to set found_value = true.

let mut value = v;
for key in path_key.iter() {
match key {
DataValue::UInt64(k) => match value.get(*k as usize) {
Some(child_value) => value = child_value,
None => {
found_value = false;
break;
}
},
DataValue::String(k) => match String::from_utf8(k.to_vec()) {
Ok(k) => match value.get(&k) {
Some(child_value) => value = child_value,
None => {
// if no exact match value found, return one of the ambiguous matches
if ignore_case && value.is_object() {
let mut ignore_case_found_value = false;
let obj = value.as_object().unwrap();
for (_, (child_key, child_value)) in obj.iter().enumerate() {
if k.to_lowercase() == child_key.to_lowercase() {
ignore_case_found_value = true;
value = child_value;
break;
}
}
if ignore_case_found_value {
continue;
}
}
found_value = false;
break;
}
},
Err(_) => {
found_value = false;
break;
}
},
_ => {
found_value = false;
break;
}
}
}
if found_value {
builder.append(value, true);
} else {
builder.append_null();
}
}
}
Ok(builder.build(input_rows))
}
4 changes: 4 additions & 0 deletions common/functions/src/scalars/semi_structureds/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,14 @@
// limitations under the License.

mod check_json;
mod get;
mod parse_json;
mod semi_structured;

pub use check_json::CheckJsonFunction;
pub use get::GetFunction;
pub use get::GetIgnoreCaseFunction;
pub use get::GetPathFunction;
pub use parse_json::ParseJsonFunction;
pub use parse_json::TryParseJsonFunction;
pub use semi_structured::SemiStructuredFunction;
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use super::get::GetFunction;
use super::get::GetIgnoreCaseFunction;
use super::get::GetPathFunction;
use super::parse_json::ParseJsonFunction;
use super::parse_json::TryParseJsonFunction;
use crate::scalars::CheckJsonFunction;
Expand All @@ -24,5 +27,8 @@ impl SemiStructuredFunction {
factory.register("parse_json", ParseJsonFunction::desc());
factory.register("try_parse_json", TryParseJsonFunction::desc());
factory.register("check_json", CheckJsonFunction::desc());
factory.register("get", GetFunction::desc());
factory.register("get_ignore_case", GetIgnoreCaseFunction::desc());
factory.register("get_path", GetPathFunction::desc());
}
}
Loading