Skip to content

Commit

Permalink
Merge pull request #4771 from nange/feat/regexp_substr_function
Browse files Browse the repository at this point in the history
feature: support REGEXP_SUBSTR function
  • Loading branch information
BohuTANG authored Apr 9, 2022
2 parents 65f8e5a + ab3f941 commit a279982
Show file tree
Hide file tree
Showing 12 changed files with 515 additions and 18 deletions.
2 changes: 2 additions & 0 deletions common/functions/src/scalars/strings/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ mod pad;
mod quote;
mod regexp_instr;
mod regexp_like;
mod regexp_substr;
mod repeat;
mod replace;
mod reverse;
Expand Down Expand Up @@ -83,6 +84,7 @@ pub use pad::RightPadFunction;
pub use quote::QuoteFunction;
pub use regexp_instr::RegexpInStrFunction;
pub use regexp_like::RegexpLikeFunction;
pub use regexp_substr::RegexpSubStrFunction;
pub use repeat::RepeatFunction;
pub use replace::ReplaceFunction;
pub use reverse::ReverseFunction;
Expand Down
40 changes: 27 additions & 13 deletions common/functions/src/scalars/strings/regexp_instr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,15 @@ impl RegexpInStrFunction {
#[inline]
fn regexp_instr(s: &[u8], re: &Regex, pos: i64, occur: i64, ro: i64) -> u64 {
let occur = if occur < 1 { 1 } else { occur };
let mut pos = if pos < 1 { 0 } else { (pos - 1) as usize };
let pos = if pos < 1 { 0 } else { (pos - 1) as usize };

// `pos` is a character index,
// so we iterate over the characters to find the corresponding byte index.
let mut pos = match s.char_indices().nth(pos) {
Some((start, _, _)) => start,
None => return 0,
};

let mut i = 1_i64;
let m = loop {
let m = re.find_at(s, pos);
Expand All @@ -233,23 +241,29 @@ fn regexp_instr(s: &[u8], re: &Regex, pos: i64, occur: i64, ro: i64) -> u64 {

i += 1;
if let Some(m) = m {
pos += m.end();
}
if pos >= s.len() {
break None;
// set the start position of the `find_at` call to the position following the matched substring
pos = m.end();
}
};

let instr = match m {
Some(m) => {
if ro == 0 {
m.start() + 1
} else {
m.end() + 1
if m.is_none() {
return 0;
}

// the matched result is the byte index, but the 'regexp_instr' function returns the character index,
// so we should iterate the character to find the character index.
let mut instr = 0_usize;
for (p, (start, end, _)) in s.char_indices().enumerate() {
if ro == 0 {
if start == m.unwrap().start() {
instr = p + 1;
break;
}
} else if end == m.unwrap().end() {
instr = p + 2;
break;
}
None => 0,
};
}

instr as u64
}
Expand Down
244 changes: 244 additions & 0 deletions common/functions/src/scalars/strings/regexp_substr.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
// Copyright 2022 Datafuse Labs.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::collections::HashMap;
use std::fmt;
use std::sync::Arc;

use bstr::ByteSlice;
use common_datavalues::prelude::*;
use common_exception::ErrorCode;
use common_exception::Result;
use itertools::izip;
use regex::bytes::Regex;

use crate::scalars::assert_string;
use crate::scalars::cast_column_field;
use crate::scalars::strings::regexp_like::build_regexp_from_pattern;
use crate::scalars::Function;
use crate::scalars::FunctionDescription;
use crate::scalars::FunctionFeatures;

/// Scalar function object implementing `REGEXP_SUBSTR`.
///
/// It accepts between two and five arguments and evaluates to a nullable
/// string column.
#[derive(Clone)]
pub struct RegexpSubStrFunction {
// Name the function was created with; returned by `name()` and printed by `Display`.
display_name: String,
}

impl RegexpSubStrFunction {
    /// Builds a boxed instance of the function that reports `display_name`
    /// as its name.
    pub fn try_create(display_name: &str) -> Result<Box<dyn Function>> {
        let function = Self {
            display_name: display_name.to_string(),
        };
        Ok(Box::new(function))
    }

    /// Describes the function for registration: it is deterministic and
    /// takes between 2 and 5 arguments.
    pub fn desc() -> FunctionDescription {
        let features = FunctionFeatures::default()
            .deterministic()
            .variadic_arguments(2, 5);

        FunctionDescription::creator(Box::new(Self::try_create)).features(features)
    }
}

impl Function for RegexpSubStrFunction {
    /// Name this function instance was created with.
    fn name(&self) -> &str {
        &self.display_name
    }

    /// Validates the argument types and reports the result type.
    ///
    /// Arguments 0, 1 and 4 (source, pattern, match type) must be strings;
    /// the remaining ones (position, occurrence) may be integer, string or
    /// null. The result is always a nullable string.
    fn return_type(&self, args: &[&DataTypePtr]) -> Result<DataTypePtr> {
        for (idx, arg) in args.iter().enumerate() {
            if matches!(idx, 0 | 1 | 4) {
                assert_string(*arg)?;
                continue;
            }

            let accepted = arg.data_type_id().is_integer()
                || arg.data_type_id().is_string()
                || arg.data_type_id().is_null();
            if !accepted {
                return Err(ErrorCode::IllegalDataType(format!(
                    "Expected integer or string or null, but got {}",
                    arg.data_type_id()
                )));
            }
        }

        Ok(Arc::new(NullableType::create(StringType::arc())))
    }

    // Notes: https://dev.mysql.com/doc/refman/8.0/en/regexp.html#function_regexp-substr
    /// Evaluates the function over `columns`.
    ///
    /// Missing optional arguments fall back to constants: position `1`,
    /// occurrence `1` and an empty match type. When both the pattern and the
    /// match type are constant columns the regex is compiled once; otherwise
    /// it is resolved per row.
    fn eval(&self, columns: &ColumnsWithField, input_rows: usize) -> Result<ColumnRef> {
        let arg_count = columns.len();

        let pos = if arg_count > 2 {
            cast_column_field(&columns[2], &Int64Type::arc())?
        } else {
            ConstColumn::new(Series::from_data(vec![1_i64]), input_rows).arc()
        };
        let occurrence = if arg_count > 3 {
            cast_column_field(&columns[3], &Int64Type::arc())?
        } else {
            ConstColumn::new(Series::from_data(vec![1_i64]), input_rows).arc()
        };
        let match_type = if arg_count > 4 {
            cast_column_field(&columns[4], &StringType::arc())?
        } else {
            ConstColumn::new(Series::from_data(vec![""]), input_rows).arc()
        };

        let source = columns[0].column();
        let pat = columns[1].column();

        // Per-row path: the pattern and/or the match type vary between rows.
        if !(pat.is_const() && match_type.is_const()) {
            return self.a_regexp_substr_binary(
                source,
                pat,
                &pos,
                &occurrence,
                &match_type,
                input_rows,
            );
        }

        // Constant path: read the single pattern / match type value up front.
        let pat_value = pat.get_string(0)?;
        let mt_value = match_type.get_string(0)?;

        self.a_regexp_substr_binary_scalar(
            source,
            &pat_value,
            &pos,
            &occurrence,
            &mt_value,
            input_rows,
        )
    }
}

impl RegexpSubStrFunction {
    /// Evaluates the function when the pattern `pat` and match type `mt` are
    /// the same for every row, so the regex is built exactly once.
    ///
    /// A row yields null when its source string or the pattern is empty, or
    /// when no match is found.
    fn a_regexp_substr_binary_scalar(
        &self,
        source: &ColumnRef,
        pat: &[u8],
        pos: &ColumnRef,
        occurrence: &ColumnRef,
        mt: &[u8],
        input_rows: usize,
    ) -> Result<ColumnRef> {
        let mut builder = NullableColumnBuilder::<Vu8>::with_capacity(source.len());

        let source = Vu8::try_create_viewer(source)?;
        let pos = i64::try_create_viewer(pos)?;
        let occur = i64::try_create_viewer(occurrence)?;

        let re = build_regexp_from_pattern(self.name(), pat, Some(mt))?;

        for (s_value, pos_value, occur_value) in izip!(source, pos, occur) {
            let matched = if s_value.is_empty() || pat.is_empty() {
                None
            } else {
                regexp_substr(s_value, &re, pos_value, occur_value)
            };

            if let Some(ss) = matched {
                builder.append(ss, true);
            } else {
                builder.append_null();
            }
        }

        Ok(builder.build(input_rows))
    }

    /// Evaluates the function when the pattern and/or match type may differ
    /// from row to row.
    ///
    /// Compiled regexes are cached for the duration of the call, keyed by
    /// `<pattern>-<match type>`. A match type starting with `-` is rejected
    /// with `BadArguments`. A row yields null when its source string or
    /// pattern is empty, or when no match is found.
    fn a_regexp_substr_binary(
        &self,
        source: &ColumnRef,
        pat: &ColumnRef,
        pos: &ColumnRef,
        occurrence: &ColumnRef,
        match_type: &ColumnRef,
        input_rows: usize,
    ) -> Result<ColumnRef> {
        let mut builder = NullableColumnBuilder::<Vu8>::with_capacity(source.len());

        // Cache of compiled regexes plus a reusable buffer for the cache key.
        let mut compiled: HashMap<Vec<u8>, Regex> = HashMap::new();
        let mut cache_key: Vec<u8> = Vec::new();

        let source = Vu8::try_create_viewer(source)?;
        let pat = Vu8::try_create_viewer(pat)?;
        let pos = i64::try_create_viewer(pos)?;
        let occur = i64::try_create_viewer(occurrence)?;
        let mt = Vu8::try_create_viewer(match_type)?;

        for (s_value, pat_value, pos_value, occur_value, mt_value) in
            izip!(source, pat, pos, occur, mt)
        {
            if mt_value.starts_with_str("-") {
                return Err(ErrorCode::BadArguments(format!(
                    "Incorrect arguments to {} match type: {}",
                    self.name(),
                    mt_value.to_str_lossy(),
                )));
            }
            if s_value.is_empty() || pat_value.is_empty() {
                builder.append_null();
                continue;
            }

            cache_key.clear();
            cache_key.extend_from_slice(pat_value);
            cache_key.push(b'-');
            cache_key.extend_from_slice(mt_value);

            if !compiled.contains_key(&cache_key) {
                let fresh = build_regexp_from_pattern(self.name(), pat_value, Some(mt_value))?;
                compiled.insert(cache_key.clone(), fresh);
            }
            let re = &compiled[&cache_key];

            match regexp_substr(s_value, re, pos_value, occur_value) {
                Some(ss) => builder.append(ss, true),
                None => builder.append_null(),
            }
        }

        Ok(builder.build(input_rows))
    }
}

/// Returns the `occur`-th match of `re` in `s`, starting the search at the
/// 1-based character position `pos`.
///
/// * `pos` values below 1 are treated as 1; a `pos` beyond the last character
///   yields `None`.
/// * `occur` values below 1 are treated as 1.
///
/// Returns `None` when there are fewer than `occur` matches.
///
/// After an empty match the search resumes one character further on, so a
/// pattern that can match the empty string does not report the same empty
/// match for every occurrence.
#[inline]
fn regexp_substr<'a>(s: &'a [u8], re: &Regex, pos: i64, occur: i64) -> Option<&'a [u8]> {
    let occur = occur.max(1);
    let skipped_chars = if pos < 1 { 0 } else { (pos - 1) as usize };

    // `pos` counts characters while the regex engine works on byte offsets,
    // so walk the characters to translate it into a byte index.
    let (mut from, _, _) = s.char_indices().nth(skipped_chars)?;

    let mut found = re.find_at(s, from)?;
    for _ in 1..occur {
        from = if found.start() == found.end() {
            // Empty match: restarting at `found.end()` would find it again,
            // so step over the next character (none left => no more matches).
            let (_, width, _) = s[found.end()..].char_indices().next()?;
            found.end() + width
        } else {
            // Resume right after the matched substring.
            found.end()
        };
        found = re.find_at(s, from)?;
    }

    Some(found.as_bytes())
}

impl fmt::Display for RegexpSubStrFunction {
    /// Prints the name this function instance was created with.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str(&self.display_name)
    }
}
2 changes: 2 additions & 0 deletions common/functions/src/scalars/strings/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ use crate::scalars::QuoteFunction;
use crate::scalars::RTrimFunction;
use crate::scalars::RegexpInStrFunction;
use crate::scalars::RegexpLikeFunction;
use crate::scalars::RegexpSubStrFunction;
use crate::scalars::RepeatFunction;
use crate::scalars::ReplaceFunction;
use crate::scalars::ReverseFunction;
Expand Down Expand Up @@ -84,6 +85,7 @@ impl StringFunction {
factory.register("length", LengthFunction::desc());
factory.register("regexp_instr", RegexpInStrFunction::desc());
factory.register("regexp_like", RegexpLikeFunction::desc());
factory.register("regexp_substr", RegexpSubStrFunction::desc());
factory.register("bin", BinFunction::desc());
factory.register("oct", OctFunction::desc());
factory.register("hex", HexFunction::desc());
Expand Down
1 change: 1 addition & 0 deletions common/functions/tests/it/scalars/strings/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
mod lower;
mod regexp_instr;
mod regexp_like;
mod regexp_substr;
mod substring;
mod trim;

Expand Down
27 changes: 22 additions & 5 deletions common/functions/tests/it/scalars/strings/regexp_instr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,14 @@ fn test_regexp_instr_function() -> Result<()> {
columns: vec![
Series::from_data(vec![
"dog cat dog",
"aa aaa aaaa aa aaa aaaa",
"aa aaa aaaa aa aaa aaaa",
"aa aa aa aaaa aaaa aaaa",
"aa aa aa aaaa aaaa aaaa",
]),
Series::from_data(vec!["dog", "a{2}", "a{4}"]),
Series::from_data(vec![1_i64, 2, 1]),
Series::from_data(vec![2_i64, 2, 2]),
Series::from_data(vec![1_i64, 1, 9]),
Series::from_data(vec![2_i64, 3, 2]),
],
expect: Series::from_data(vec![9_u64, 8, 20]),
expect: Series::from_data(vec![9_u64, 7, 15]),
error: "",
},
ScalarFunctionTest {
Expand Down Expand Up @@ -89,6 +89,23 @@ fn test_regexp_instr_function() -> Result<()> {
expect: Series::from_data(vec![9_u64, 0, 24]),
error: "",
},
ScalarFunctionTest {
name: "regexp-instr-multi-byte-character-passed",
columns: vec![
Series::from_data(vec![
"周 周周 周周周 周周周周",
"周 周周 周周周 周周周周",
"周 周周 周周周 周周周周",
"周 周周 周周周 周周周周",
]),
Series::from_data(vec!["周+", "周+", "周+", "周+"]),
Series::from_data(vec![1_i64, 2, 3, 5]),
Series::from_data(vec![1_i64, 2, 3, 1]),
Series::from_data(vec![0_i64, 1, 1, 1]),
],
expect: Series::from_data(vec![1_u64, 9, 14, 9]),
error: "",
},
ScalarFunctionTest {
name: "regexp-instr-return-option-error",
columns: vec![
Expand Down
Loading

1 comment on commit a279982

@vercel
Copy link

@vercel vercel bot commented on a279982 Apr 9, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Successfully deployed to the following URLs:

databend – ./

databend.rs
databend.vercel.app
databend-git-main-databend.vercel.app
databend-databend.vercel.app

Please sign in to comment.