Merge pull request #4771 from nange/feat/regexp_substr_function

feature: support REGEXP_SUBSTR function
databendlabs · Apr 9, 2022 · a279982 · a279982 · vercel · Apr 9, 2022
2 parents 65f8e5a + ab3f941
commit a279982
Show file tree

Hide file tree

Showing 12 changed files with 515 additions and 18 deletions.
diff --git a/common/functions/src/scalars/strings/mod.rs b/common/functions/src/scalars/strings/mod.rs
@@ -38,6 +38,7 @@ mod pad;
 mod quote;
 mod regexp_instr;
 mod regexp_like;
+mod regexp_substr;
 mod repeat;
 mod replace;
 mod reverse;
@@ -83,6 +84,7 @@ pub use pad::RightPadFunction;
 pub use quote::QuoteFunction;
 pub use regexp_instr::RegexpInStrFunction;
 pub use regexp_like::RegexpLikeFunction;
+pub use regexp_substr::RegexpSubStrFunction;
 pub use repeat::RepeatFunction;
 pub use replace::ReplaceFunction;
 pub use reverse::ReverseFunction;

diff --git a/common/functions/src/scalars/strings/regexp_instr.rs b/common/functions/src/scalars/strings/regexp_instr.rs
@@ -223,7 +223,15 @@ impl RegexpInStrFunction {
 #[inline]
 fn regexp_instr(s: &[u8], re: &Regex, pos: i64, occur: i64, ro: i64) -> u64 {
     let occur = if occur < 1 { 1 } else { occur };
-    let mut pos = if pos < 1 { 0 } else { (pos - 1) as usize };
+    let pos = if pos < 1 { 0 } else { (pos - 1) as usize };
+
+    // 'pos' is a character index, not a byte offset,
+    // so iterate over the characters to find the byte index.
+    let mut pos = match s.char_indices().nth(pos) {
+        Some((start, _, _)) => start,
+        None => return 0,
+    };
+
     let mut i = 1_i64;
     let m = loop {
         let m = re.find_at(s, pos);
@@ -233,23 +241,29 @@ fn regexp_instr(s: &[u8], re: &Regex, pos: i64, occur: i64, ro: i64) -> u64 {
 
         i += 1;
         if let Some(m) = m {
-            pos += m.end();
-        }
-        if pos >= s.len() {
-            break None;
+            // resume the next 'find_at' search from the position right after the matched substring
+            pos = m.end();
         }
     };
 
-    let instr = match m {
-        Some(m) => {
-            if ro == 0 {
-                m.start() + 1
-            } else {
-                m.end() + 1
+    if m.is_none() {
+        return 0;
+    }
+
+    // the match is reported as byte offsets, but 'regexp_instr' returns a character index,
+    // so iterate over the characters to translate the byte offset into a character index.
+    let mut instr = 0_usize;
+    for (p, (start, end, _)) in s.char_indices().enumerate() {
+        if ro == 0 {
+            if start == m.unwrap().start() {
+                instr = p + 1;
+                break;
             }
+        } else if end == m.unwrap().end() {
+            instr = p + 2;
+            break;
         }
-        None => 0,
-    };
+    }
 
     instr as u64
 }

diff --git a/common/functions/src/scalars/strings/regexp_substr.rs b/common/functions/src/scalars/strings/regexp_substr.rs
@@ -0,0 +1,244 @@
+// Copyright 2022 Datafuse Labs.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::HashMap;
+use std::fmt;
+use std::sync::Arc;
+
+use bstr::ByteSlice;
+use common_datavalues::prelude::*;
+use common_exception::ErrorCode;
+use common_exception::Result;
+use itertools::izip;
+use regex::bytes::Regex;
+
+use crate::scalars::assert_string;
+use crate::scalars::cast_column_field;
+use crate::scalars::strings::regexp_like::build_regexp_from_pattern;
+use crate::scalars::Function;
+use crate::scalars::FunctionDescription;
+use crate::scalars::FunctionFeatures;
+
+/// Scalar function implementing SQL `REGEXP_SUBSTR`.
+///
+/// The struct only carries the name the function was created under.
+#[derive(Clone)]
+pub struct RegexpSubStrFunction {
+    display_name: String,
+}
+
+impl RegexpSubStrFunction {
+    /// Builds a boxed instance that remembers `display_name`.
+    pub fn try_create(display_name: &str) -> Result<Box<dyn Function>> {
+        let function = Self {
+            display_name: display_name.to_owned(),
+        };
+        Ok(Box::new(function))
+    }
+
+    /// Describes the function for registration: deterministic, with a variadic
+    /// argument count declared as `(2, 5)`.
+    pub fn desc() -> FunctionDescription {
+        let features = FunctionFeatures::default()
+            .deterministic()
+            .variadic_arguments(2, 5);
+        FunctionDescription::creator(Box::new(Self::try_create)).features(features)
+    }
+}
+
+impl Function for RegexpSubStrFunction {
+    /// Returns the name this instance was created with.
+    fn name(&self) -> &str {
+        self.display_name.as_str()
+    }
+
+    /// Validates the argument types and reports the result type.
+    ///
+    /// Arguments 0, 1 and 4 must pass `assert_string`; every other argument must be
+    /// an integer, a string or null, otherwise `ErrorCode::IllegalDataType` is returned.
+    /// The result type is always a nullable string.
+    fn return_type(&self, args: &[&DataTypePtr]) -> Result<DataTypePtr> {
+        for (idx, arg) in args.iter().enumerate() {
+            if matches!(idx, 0 | 1 | 4) {
+                assert_string(*arg)?;
+                continue;
+            }
+
+            let acceptable = arg.data_type_id().is_integer()
+                || arg.data_type_id().is_string()
+                || arg.data_type_id().is_null();
+            if !acceptable {
+                return Err(ErrorCode::IllegalDataType(format!(
+                    "Expected integer or string or null, but got {}",
+                    args[idx].data_type_id()
+                )));
+            }
+        }
+
+        Ok(Arc::new(NullableType::create(StringType::arc())))
+    }
+
+    // Notes: https://dev.mysql.com/doc/refman/8.0/en/regexp.html#function_regexp-substr
+    /// Evaluates the function over `input_rows` rows.
+    ///
+    /// `columns[0]` is the source string and `columns[1]` the pattern. The optional
+    /// `columns[2]`, `columns[3]` and `columns[4]` are the position, the occurrence and
+    /// the match type; when omitted they default to the constants `1`, `1` and `""`.
+    fn eval(&self, columns: &ColumnsWithField, input_rows: usize) -> Result<ColumnRef> {
+        let given = columns.len();
+
+        let pos = if given > 2 {
+            cast_column_field(&columns[2], &Int64Type::arc())?
+        } else {
+            ConstColumn::new(Series::from_data(vec![1_i64]), input_rows).arc()
+        };
+        let occurrence = if given > 3 {
+            cast_column_field(&columns[3], &Int64Type::arc())?
+        } else {
+            ConstColumn::new(Series::from_data(vec![1_i64]), input_rows).arc()
+        };
+        let match_type = if given > 4 {
+            cast_column_field(&columns[4], &StringType::arc())?
+        } else {
+            ConstColumn::new(Series::from_data(vec![""]), input_rows).arc()
+        };
+
+        let pat = columns[1].column();
+        let source = columns[0].column();
+
+        // Without a constant pattern and match type the regex has to be resolved per row.
+        if !(pat.is_const() && match_type.is_const()) {
+            return self.a_regexp_substr_binary(
+                source,
+                pat,
+                &pos,
+                &occurrence,
+                &match_type,
+                input_rows,
+            );
+        }
+
+        // Constant pattern and match type: read them once from row 0.
+        let pat_value = pat.get_string(0)?;
+        let mt_value = match_type.get_string(0)?;
+
+        self.a_regexp_substr_binary_scalar(
+            source,
+            &pat_value,
+            &pos,
+            &occurrence,
+            &mt_value,
+            input_rows,
+        )
+    }
+}
+
+impl RegexpSubStrFunction {
+    /// Evaluates the function when the pattern and the match type are the same for every row.
+    ///
+    /// The regular expression is compiled once from `pat` and `mt` and reused for all rows.
+    /// A row yields NULL when its source string or the pattern is empty, or when
+    /// `regexp_substr` finds no match.
+    fn a_regexp_substr_binary_scalar(
+        &self,
+        source: &ColumnRef,
+        pat: &[u8],
+        pos: &ColumnRef,
+        occurrence: &ColumnRef,
+        mt: &[u8],
+        input_rows: usize,
+    ) -> Result<ColumnRef> {
+        let mut output = NullableColumnBuilder::<Vu8>::with_capacity(source.len());
+
+        let sources = Vu8::try_create_viewer(source)?;
+        let positions = i64::try_create_viewer(pos)?;
+        let occurrences = i64::try_create_viewer(occurrence)?;
+
+        let regex = build_regexp_from_pattern(self.name(), pat, Some(mt))?;
+
+        for (text, start, nth) in izip!(sources, positions, occurrences) {
+            let matched = if text.is_empty() || pat.is_empty() {
+                None
+            } else {
+                regexp_substr(text, &regex, start, nth)
+            };
+
+            if let Some(bytes) = matched {
+                output.append(bytes, true);
+            } else {
+                output.append_null();
+            }
+        }
+
+        Ok(output.build(input_rows))
+    }
+
+    /// Evaluates the function when the pattern and/or the match type vary per row.
+    ///
+    /// Compiled regular expressions are cached in a map keyed by
+    /// `<pattern>-<match type>`, so each distinct combination is compiled only once.
+    /// A match type starting with `-` is rejected with `ErrorCode::BadArguments`.
+    /// A row yields NULL when its source string or pattern is empty, or when
+    /// `regexp_substr` finds no match.
+    fn a_regexp_substr_binary(
+        &self,
+        source: &ColumnRef,
+        pat: &ColumnRef,
+        pos: &ColumnRef,
+        occurrence: &ColumnRef,
+        match_type: &ColumnRef,
+        input_rows: usize,
+    ) -> Result<ColumnRef> {
+        let mut output = NullableColumnBuilder::<Vu8>::with_capacity(source.len());
+
+        let mut cache: HashMap<Vec<u8>, Regex> = HashMap::new();
+        // Scratch buffer for the cache key, reused across rows.
+        let mut cache_key: Vec<u8> = Vec::new();
+
+        let sources = Vu8::try_create_viewer(source)?;
+        let patterns = Vu8::try_create_viewer(pat)?;
+        let positions = i64::try_create_viewer(pos)?;
+        let occurrences = i64::try_create_viewer(occurrence)?;
+        let match_types = Vu8::try_create_viewer(match_type)?;
+
+        for (text, pattern, start, nth, flags) in
+            izip!(sources, patterns, positions, occurrences, match_types)
+        {
+            // NOTE(review): presumably rejected because '-' is also the separator of the
+            // cache key built below - confirm.
+            if flags.starts_with_str("-") {
+                return Err(ErrorCode::BadArguments(format!(
+                    "Incorrect arguments to {} match type: {}",
+                    self.name(),
+                    flags.to_str_lossy(),
+                )));
+            }
+
+            if text.is_empty() || pattern.is_empty() {
+                output.append_null();
+                continue;
+            }
+
+            cache_key.extend_from_slice(pattern);
+            cache_key.extend_from_slice(b"-");
+            cache_key.extend_from_slice(flags);
+
+            if !cache.contains_key(&cache_key) {
+                let compiled = build_regexp_from_pattern(self.name(), pattern, Some(flags))?;
+                cache.insert(cache_key.clone(), compiled);
+            }
+            let regex = &cache[&cache_key];
+
+            match regexp_substr(text, regex, start, nth) {
+                Some(bytes) => output.append(bytes, true),
+                None => output.append_null(),
+            }
+
+            cache_key.clear();
+        }
+
+        Ok(output.build(input_rows))
+    }
+}
+
+/// Returns the `occur`-th match of `re` in `s`, searching from the `pos`-th character.
+///
+/// * `pos` is a 1-based character (not byte) index; values below 1 are treated as 1.
+///   `None` is returned when `s` has fewer than `pos` characters.
+/// * `occur` is 1-based; values below 1 are treated as 1.
+///
+/// Returns the bytes of the requested match, or `None` when there are fewer than
+/// `occur` matches.
+///
+/// After a zero-length match the search resumes one character further on. Without
+/// that step the same empty match would be found again for every later occurrence,
+/// and the loop would spin `occur` times.
+#[inline]
+fn regexp_substr<'a>(s: &'a [u8], re: &Regex, pos: i64, occur: i64) -> Option<&'a [u8]> {
+    let occur = occur.max(1);
+    let char_pos = if pos < 1 { 0 } else { (pos - 1) as usize };
+
+    // 'pos' is a character index, not a byte offset,
+    // so iterate over the characters to find the byte index.
+    let (mut start, _, _) = s.char_indices().nth(char_pos)?;
+
+    let mut found = 1_i64;
+    loop {
+        // No further match means the requested occurrence does not exist.
+        let m = re.find_at(s, start)?;
+        if found == occur {
+            return Some(m.as_bytes());
+        }
+        found += 1;
+
+        start = if m.start() == m.end() {
+            // A zero-length match consumed no input: step over the next character so
+            // the following search makes progress. At the end of the input there is
+            // nothing left to step over, hence no further occurrence.
+            let (_, width, _) = s[m.end()..].char_indices().next()?;
+            m.end() + width
+        } else {
+            // resume the search from the position right after the matched substring
+            m.end()
+        };
+    }
+}
+
+impl fmt::Display for RegexpSubStrFunction {
+    /// Writes the name this instance was created with.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        f.write_str(&self.display_name)
+    }
+}
diff --git a/common/functions/src/scalars/strings/string.rs b/common/functions/src/scalars/strings/string.rs
@@ -44,6 +44,7 @@ use crate::scalars::QuoteFunction;
 use crate::scalars::RTrimFunction;
 use crate::scalars::RegexpInStrFunction;
 use crate::scalars::RegexpLikeFunction;
+use crate::scalars::RegexpSubStrFunction;
 use crate::scalars::RepeatFunction;
 use crate::scalars::ReplaceFunction;
 use crate::scalars::ReverseFunction;
@@ -84,6 +85,7 @@ impl StringFunction {
         factory.register("length", LengthFunction::desc());
         factory.register("regexp_instr", RegexpInStrFunction::desc());
         factory.register("regexp_like", RegexpLikeFunction::desc());
+        factory.register("regexp_substr", RegexpSubStrFunction::desc());
         factory.register("bin", BinFunction::desc());
         factory.register("oct", OctFunction::desc());
         factory.register("hex", HexFunction::desc());

diff --git a/common/functions/tests/it/scalars/strings/mod.rs b/common/functions/tests/it/scalars/strings/mod.rs
@@ -16,6 +16,7 @@
 mod lower;
 mod regexp_instr;
 mod regexp_like;
+mod regexp_substr;
 mod substring;
 mod trim;
 

diff --git a/common/functions/tests/it/scalars/strings/regexp_instr.rs b/common/functions/tests/it/scalars/strings/regexp_instr.rs
@@ -46,14 +46,14 @@ fn test_regexp_instr_function() -> Result<()> {
             columns: vec![
                 Series::from_data(vec![
                     "dog cat dog",
-                    "aa aaa aaaa aa aaa aaaa",
-                    "aa aaa aaaa aa aaa aaaa",
+                    "aa aa aa aaaa aaaa aaaa",
+                    "aa aa aa aaaa aaaa aaaa",
                 ]),
                 Series::from_data(vec!["dog", "a{2}", "a{4}"]),
-                Series::from_data(vec![1_i64, 2, 1]),
-                Series::from_data(vec![2_i64, 2, 2]),
+                Series::from_data(vec![1_i64, 1, 9]),
+                Series::from_data(vec![2_i64, 3, 2]),
             ],
-            expect: Series::from_data(vec![9_u64, 8, 20]),
+            expect: Series::from_data(vec![9_u64, 7, 15]),
             error: "",
         },
         ScalarFunctionTest {
@@ -89,6 +89,23 @@ fn test_regexp_instr_function() -> Result<()> {
             expect: Series::from_data(vec![9_u64, 0, 24]),
             error: "",
         },
+        ScalarFunctionTest {
+            name: "regexp-instr-multi-byte-character-passed",
+            columns: vec![
+                Series::from_data(vec![
+                    "周 周周 周周周 周周周周",
+                    "周 周周 周周周 周周周周",
+                    "周 周周 周周周 周周周周",
+                    "周 周周 周周周 周周周周",
+                ]),
+                Series::from_data(vec!["周+", "周+", "周+", "周+"]),
+                Series::from_data(vec![1_i64, 2, 3, 5]),
+                Series::from_data(vec![1_i64, 2, 3, 1]),
+                Series::from_data(vec![0_i64, 1, 1, 1]),
+            ],
+            expect: Series::from_data(vec![1_u64, 9, 14, 9]),
+            error: "",
+        },
         ScalarFunctionTest {
             name: "regexp-instr-return-option-error",
             columns: vec![