From c57200f57fe03147f353e701d9f8d80cda2dc626 Mon Sep 17 00:00:00 2001 From: skantor Date: Wed, 6 Sep 2023 15:53:01 +0300 Subject: [PATCH 1/4] add databricks dialect --- .../dialect_libraries/databricks_library.py | 31 +++++++++++ compiler/dialects.py | 54 ++++++++++++++++++- 2 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 compiler/dialect_libraries/databricks_library.py diff --git a/compiler/dialect_libraries/databricks_library.py b/compiler/dialect_libraries/databricks_library.py new file mode 100644 index 00000000..e500de4b --- /dev/null +++ b/compiler/dialect_libraries/databricks_library.py @@ -0,0 +1,31 @@ +""" +Take snowflake dialect as a reference and implement the same set of functions for Databricks +Once done, test every function and their combinations thoroughly +""" + +library = """ +->(left:, right:) = {arg: left, value: right}; +ArgMin(a) = SqlExpr( + "(ARRAY_AGG({arg} order by {value}))[1]", + {arg: a.arg, value: a.value}); +ArgMax(a) = SqlExpr( + "(ARRAY_AGG({arg} order by {value} desc))[1]", + {arg: a.arg, value: a.value}); +ArgMaxK(a, l) = SqlExpr( + "SLICE(ARRAY_AGG({arg} order by {value} desc), 1, {lim})", + {arg: a.arg, value: a.value, lim: l}); +ArgMinK(a, l) = SqlExpr( + "SLICE(ARRAY_AGG({arg} order by {value}), 1, {lim})", + {arg: a.arg, value: a.value, lim: l}); +RMatch(s, p) = SqlExpr( + "REGEXP_LIKE({s}, {p})", + {s: s, p: p}); +RExtract(s, p, g) = SqlExpr( + "REGEXP_SUBSTR({s}, {p}, 1, 1, 'c', {g})", + {s: s, p: p, g: g}); + +Array(a) = SqlExpr( + "ARRAY_AGG({value} order by {arg})", + {arg: a.arg, value: a.value}); + +""" \ No newline at end of file diff --git a/compiler/dialects.py b/compiler/dialects.py index aa23d8bd..a76e4601 100755 --- a/compiler/dialects.py +++ b/compiler/dialects.py @@ -24,12 +24,14 @@ from compiler.dialect_libraries import sqlite_library from compiler.dialect_libraries import trino_library from compiler.dialect_libraries import presto_library + from compiler.dialect_libraries import 
databricks_library else: from ..compiler.dialect_libraries import bq_library from ..compiler.dialect_libraries import psql_library from ..compiler.dialect_libraries import sqlite_library from ..compiler.dialect_libraries import trino_library from ..compiler.dialect_libraries import presto_library + from ..compiler.dialect_libraries import databricks_library def Get(engine): return DIALECTS[engine]() @@ -308,11 +310,61 @@ def DecorateCombineRule(self, rule, var): return rule +class Databricks(Dialect): + """Databricks dialect""" + + def Name(self): + return 'Databricks' + + def BuiltInFunctions(self): + return { + 'ToString': 'CAST(%s AS STRING)', + 'ToInt64': 'CAST(%s AS BIGINT)', + 'ToFloat64': 'CAST(%s AS DOUBLE)', + 'AnyValue': 'ANY_VALUE(%s)', + 'ILike': '({0}::string ILIKE {1})', + 'Like': '({0}::string LIKE {1})', + 'Replace': 'REPLACE({0}::string, {1}, {2})', + 'ArrayConcat': 'ARRAY_JOIN({0}, {1})', + 'JsonExtract': 'GET_JSON_OBJECT({0}, {1})', + 'JsonExtractScalar': 'GET_JSON_OBJECT({0}, {1})', + 'Length': 'ARRAY_SIZE(%s)', + 'DateDiff': 'DATEDIFF({0}, {1}, {2})', + 'IsNull': '({0} IS NULL)', + 'LogicalOr': 'BOOL_OR(%s)', + 'LogicalAnd': 'BOOL_AND(%s)' + } + + def InfixOperators(self): + return { + '++': 'CONCAT(%s, %s)', + 'in': 'ARRAY_CONTAINS(%s, %s)' + } + + def Subscript(self, record, subscript): + return '%s.%s' % (record, subscript) + + def LibraryProgram(self): + return databricks_library.library + + def UnnestPhrase(self): + return 'explode({0}) AS pushkin({1})' + + def ArrayPhrase(self): + return 'ARRAY(%s)' + + def GroupBySpecBy(self): + return 'index' + + def DecorateCombineRule(self, rule, var): + return rule + DIALECTS = { 'bigquery': BigQueryDialect, 'sqlite': SqLiteDialect, 'psql': PostgreSQL, 'presto': Presto, - 'trino': Trino + 'trino': Trino, + 'databricks': Databricks } From 908f659a78e2be66f9bf556fde5ed3129c2b15d3 Mon Sep 17 00:00:00 2001 From: skantor Date: Wed, 6 Sep 2023 15:56:40 +0300 Subject: [PATCH 2/4] minor fix --- 
compiler/dialect_libraries/databricks_library.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/compiler/dialect_libraries/databricks_library.py b/compiler/dialect_libraries/databricks_library.py index e500de4b..56872300 100644 --- a/compiler/dialect_libraries/databricks_library.py +++ b/compiler/dialect_libraries/databricks_library.py @@ -1,8 +1,3 @@ -""" -Take snowflake dialect as a reference and implement the same set of functions for Databricks -Once done, test every function and their combinations thoroughly -""" - library = """ ->(left:, right:) = {arg: left, value: right}; ArgMin(a) = SqlExpr( From 594472470e4a4492d63f42763442f652342c5ba3 Mon Sep 17 00:00:00 2001 From: skantor Date: Mon, 18 Sep 2023 13:31:59 +0300 Subject: [PATCH 3/4] add comment for Apache compliance --- compiler/dialect_libraries/databricks_library.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/compiler/dialect_libraries/databricks_library.py b/compiler/dialect_libraries/databricks_library.py index 56872300..73d70a77 100644 --- a/compiler/dialect_libraries/databricks_library.py +++ b/compiler/dialect_libraries/databricks_library.py @@ -1,3 +1,19 @@ +#!/usr/bin/python +# +# Copyright 2023 Logica Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ library = """ ->(left:, right:) = {arg: left, value: right}; ArgMin(a) = SqlExpr( From 6f75c2b4b32404f338961f3b9a95e825d0a54975 Mon Sep 17 00:00:00 2001 From: skantor Date: Wed, 20 Sep 2023 15:23:57 +0300 Subject: [PATCH 4/4] add todo --- compiler/dialects.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/compiler/dialects.py b/compiler/dialects.py index a76e4601..c1e0950e 100755 --- a/compiler/dialects.py +++ b/compiler/dialects.py @@ -57,7 +57,7 @@ def InfixOperators(self): def Subscript(self, record, subscript): return '%s.%s' % (record, subscript) - + def LibraryProgram(self): return bq_library.library @@ -105,7 +105,7 @@ def DecorateCombineRule(self, rule, var): """Resolving ambiguity of aggregation scope.""" # Entangling result of aggregation with a variable that comes from a list # unnested inside a combine expression, to make it clear that aggregation - # must be done in the combine. + # must be done in the combine. rule = copy.deepcopy(rule) rule['head']['record']['field_value'][0]['value'][ @@ -121,7 +121,7 @@ def DecorateCombineRule(self, rule, var): 'field': 0, 'value': rule['head']['record']['field_value'][0]['value'][ 'aggregation']['expression']['call'][ - 'record']['field_value'][0]['value'] + 'record']['field_value'][0]['value'] }, { 'field': 1, @@ -166,7 +166,7 @@ def DecorateCombineRule(self, rule, var): } } } - } + } ) return rule @@ -180,7 +180,7 @@ def InfixOperators(self): def Subscript(self, record, subscript): return 'JSON_EXTRACT(%s, "$.%s")' % (record, subscript) - + def LibraryProgram(self): return sqlite_library.library @@ -215,7 +215,7 @@ def InfixOperators(self): def Subscript(self, record, subscript): return '(%s).%s' % (record, subscript) - + def LibraryProgram(self): return psql_library.library @@ -255,7 +255,7 @@ def InfixOperators(self): def Subscript(self, record, subscript): return '%s.%s' % (record, subscript) - + def LibraryProgram(self): return trino_library.library @@ -276,7 +276,7 
@@ class Presto(Dialect): def Name(self): return 'Presto' - + def BuiltInFunctions(self): return { 'Range': 'SEQUENCE(0, %s - 1)', @@ -293,7 +293,7 @@ def InfixOperators(self): def Subscript(self, record, subscript): return '%s.%s' % (record, subscript) - + def LibraryProgram(self): return presto_library.library @@ -313,6 +313,8 @@ def DecorateCombineRule(self, rule, var): class Databricks(Dialect): """Databricks dialect""" + #TODO: add DATEDIFF and NOW function + def Name(self): return 'Databricks'