From 26c4133c24ec9539628e66e24e8d72c63b75ebe1 Mon Sep 17 00:00:00 2001 From: Albin Antony Date: Thu, 16 Nov 2023 15:25:53 +0530 Subject: [PATCH] rgw/s3select: json output format for csv, json & parquet Signed-off-by: Albin Antony --- requirements.txt | 2 + s3tests_boto3/functional/test_s3select.py | 382 ++++++++++++++++------ 2 files changed, 275 insertions(+), 109 deletions(-) diff --git a/requirements.txt b/requirements.txt index 7742d8fb9..a75019247 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,3 +13,5 @@ httplib2 lxml pytest tox +pandas +pyarrow diff --git a/s3tests_boto3/functional/test_s3select.py b/s3tests_boto3/functional/test_s3select.py index 1ce4fa3ea..3f37c6ece 100644 --- a/s3tests_boto3/functional/test_s3select.py +++ b/s3tests_boto3/functional/test_s3select.py @@ -5,6 +5,10 @@ import json from botocore.exceptions import ClientError +import pandas as pd +import pyarrow as pa +import pyarrow.parquet as pq + import uuid from . import ( @@ -235,30 +239,45 @@ def create_random_json_object(rows,columns,col_delim=",",record_delim="\n",csv_s return result -def csv_to_json(obj, field_split=",",row_split="\n",csv_schema=""): +def create_parquet_object(parquet_size): + # Initialize lists with random integers + a = [random.randint(1, 10000) for _ in range(parquet_size)] + b = [random.randint(1, 10000) for _ in range(parquet_size)] + c = [random.randint(1, 10000) for _ in range(parquet_size)] + d = [random.randint(1, 10000) for _ in range(parquet_size)] + + # Create DataFrame + df3 = pd.DataFrame({'a': a, 'b': b, 'c': c, 'd': d}) + + # Create Parquet object + table = pa.Table.from_pandas(df3, preserve_index=False) + obj = pa.BufferOutputStream() + pq.write_table(table, obj) + + return obj.getvalue().to_pybytes() + +def csv_to_json(obj, field_split=",", row_split="\n", csv_schema=""): result = "{\"root\" : [" - result += row_split - if len(csv_schema)>0 : - result = csv_schema + row_split - for rec in obj.split(row_split): - row = "" - num = 0 - row += "{" - for col in rec.split(field_split): - if col == "": - break - num += 1 - row = row + "\"c" + str(num) + "\"" + ": " "{}{}".format(col,field_split) - row = row[:-1] - row += "}" - row += "," - result += row + row_split + rows = obj.split(row_split) + for rec in rows: + if rec.strip() == "": + continue - result = result[:-5] - result += row_split - result += "]" + "}" + row = "{" + columns = rec.split(field_split) + for i, col in enumerate(columns): + if col.strip() == "": + continue + if col.isdigit() or (col.replace('.', '', 1).isdigit() and col.count('.') < 2): + row += "\"c{}\": {}, ".format(i + 1, col) + else: + row += "\"c{}\": \"{}\", ".format(i + 1, col) + row = row.rstrip(', ') + "}," + result += row + row_split + result = result.rstrip(',\n') + result += "]}" return result def upload_object(bucket_name,new_key,obj): @@ -272,6 +291,12 @@ def upload_object(bucket_name,new_key,obj): response = c2.get_object(Bucket=bucket_name, Key=new_key) assert response['Body'].read().decode('utf-8') == obj, 's3select error[ downloaded object not equal to uploaded objecy' +def upload_parquet_object(bucket_name,parquet_obj_name,obj): + + client = get_client() + client.create_bucket(Bucket=bucket_name) + client.put_object(Bucket=bucket_name, Key=parquet_obj_name, Body=obj) + def run_s3select(bucket,key,query,column_delim=",",row_delim="\n",quot_char='"',esc_char='\\',csv_header_info="NONE", progress = False): s3 = get_client() @@ -359,6 +384,48 @@ def run_s3select_json(bucket,key,query, op_row_delim = "\n"): return result +def run_s3select_csv_json_format(bucket,key,query,column_delim=",",row_delim="\n",quot_char='"',esc_char='\\',csv_header_info="NONE", progress = False): + + s3 = get_client() + + r = s3.select_object_content( + Bucket=bucket, + Key=key, + ExpressionType='SQL', + InputSerialization = {"CSV": {"RecordDelimiter" : row_delim, "FieldDelimiter" : column_delim,"QuoteEscapeCharacter": esc_char, "QuoteCharacter": quot_char, "FileHeaderInfo": csv_header_info}, "CompressionType": "NONE"}, + OutputSerialization = {"JSON": {}}, + Expression=query,) + #Record delimiter optional in output serialization + + result = "" + for event in r['Payload']: + if 'Records' in event: + records = event['Records']['Payload'].decode('utf-8') + result += records + + return result + +def run_s3select_parquet_json_format(bucket,key,query, op_row_delim = "\n"): + + s3 = get_client() + + r = s3.select_object_content( + Bucket=bucket, + Key=key, + ExpressionType='SQL', + InputSerialization = {'Parquet': {}}, + OutputSerialization = {"JSON": {}}, + Expression=query,) + #Record delimiter optional in output serialization + + result = "" + for event in r['Payload']: + if 'Records' in event: + records = event['Records']['Payload'].decode('utf-8') + result += records + + return result + def remove_xml_tags_from_result(obj): result = "" for rec in obj.split("\n"): @@ -388,6 +455,13 @@ def create_list_of_int(column_pos,obj,field_split=",",row_split="\n"): return list_of_int +def get_max_from_parquet_column(parquet_obj, column_name): + table = pq.read_table(pa.BufferReader(parquet_obj)) + + df = table.to_pandas() + + return df[column_name].max() + @pytest.mark.s3select def test_count_operation(): csv_obj_name = get_random_string() @@ -396,7 +470,7 @@ def test_count_operation(): obj_to_load = create_random_csv_object(num_of_rows,10) upload_object(bucket_name,csv_obj_name,obj_to_load) res = remove_xml_tags_from_result( run_s3select(bucket_name,csv_obj_name,"select count(0) from s3object;") ).replace(",","") - + s3select_assert_result( num_of_rows, int( res )) @pytest.mark.s3select @@ -407,191 +481,179 @@ def test_count_json_operation(): num_of_rows = 1 obj_to_load = create_random_json_object(num_of_rows,10) upload_object(bucket_name,json_obj_name,obj_to_load) - res = remove_xml_tags_from_result(run_s3select_json(bucket_name,json_obj_name,"select count(0) from s3object[*];")) - s3select_assert_result( 1, int(res)) + res = run_s3select_json(bucket_name,json_obj_name,"select count(0) from s3object[*];") + s3select_assert_result( '{"_1":1}\n', res) - res = remove_xml_tags_from_result(run_s3select_json(bucket_name,json_obj_name,"select count(0) from s3object[*].root;")) - s3select_assert_result( 1, int(res)) + res = run_s3select_json(bucket_name,json_obj_name,"select count(0) from s3object[*].root;") + s3select_assert_result( '{"_1":1}\n', res) + json_obj_name = get_random_string() obj_to_load = create_random_json_object(3,10) upload_object(bucket_name,json_obj_name,obj_to_load) - res = remove_xml_tags_from_result(run_s3select_json(bucket_name,json_obj_name,"select count(0) from s3object[*].root;")) - s3select_assert_result( 3, int(res)) + res = run_s3select_json(bucket_name,json_obj_name,"select count(0) from s3object[*].root;") + s3select_assert_result( '{"_1":3}\n', res) @pytest.mark.s3select -def test_json_column_sum_min_max(): +def test_column_sum_min_max(): csv_obj = create_random_csv_object(10000,10) - json_obj = csv_to_json(csv_obj); - - json_obj_name = get_random_string() + csv_obj_name = get_random_string() bucket_name = get_new_bucket_name() - - upload_object(bucket_name,json_obj_name,json_obj) - json_obj_name_2 = get_random_string() + upload_object(bucket_name,csv_obj_name,csv_obj) + + csv_obj_name_2 = get_random_string() bucket_name_2 = "testbuck2" - upload_object(bucket_name_2,json_obj_name_2,json_obj) + upload_object(bucket_name_2,csv_obj_name_2,csv_obj) - res_s3select = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select min(_1.c1) from s3object[*].root;") ).replace(",","") + res_s3select = remove_xml_tags_from_result( run_s3select(bucket_name,csv_obj_name,"select min(int(_1)) from s3object;") ).replace(",","") list_int = create_list_of_int( 1 , csv_obj ) res_target = min( list_int ) s3select_assert_result( int(res_s3select), int(res_target)) - res_s3select = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select min(_1.c4) from s3object[*].root;") ).replace(",","") + res_s3select = remove_xml_tags_from_result( run_s3select(bucket_name,csv_obj_name,"select min(int(_4)) from s3object;") ).replace(",","") list_int = create_list_of_int( 4 , csv_obj ) res_target = min( list_int ) s3select_assert_result( int(res_s3select), int(res_target)) - res_s3select = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select avg(_1.c6) from s3object[*].root;") ).replace(",","") + res_s3select = remove_xml_tags_from_result( run_s3select(bucket_name,csv_obj_name,"select avg(int(_6)) from s3object;") ).replace(",","") list_int = create_list_of_int( 6 , csv_obj ) res_target = float(sum(list_int ))/10000 s3select_assert_result( float(res_s3select), float(res_target)) - res_s3select = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select max(_1.c4) from s3object[*].root;") ).replace(",","") + res_s3select = remove_xml_tags_from_result( run_s3select(bucket_name,csv_obj_name,"select max(int(_4)) from s3object;") ).replace(",","") list_int = create_list_of_int( 4 , csv_obj ) res_target = max( list_int ) s3select_assert_result( int(res_s3select), int(res_target)) - res_s3select = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select max(_1.c7) from s3object[*].root;") ).replace(",","") + res_s3select = remove_xml_tags_from_result( run_s3select(bucket_name,csv_obj_name,"select max(int(_7)) from s3object;") ).replace(",","") list_int = create_list_of_int( 7 , csv_obj ) res_target = max( list_int ) s3select_assert_result( int(res_s3select), int(res_target)) - res_s3select = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select sum(_1.c4) from s3object[*].root;") ).replace(",","") + res_s3select = remove_xml_tags_from_result( run_s3select(bucket_name,csv_obj_name,"select sum(int(_4)) from s3object;") ).replace(",","") list_int = create_list_of_int( 4 , csv_obj ) res_target = sum( list_int ) s3select_assert_result( int(res_s3select), int(res_target)) - res_s3select = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select sum(_1.c7) from s3object[*].root;") ).replace(",","") + res_s3select = remove_xml_tags_from_result( run_s3select(bucket_name,csv_obj_name,"select sum(int(_7)) from s3object;") ).replace(",","") list_int = create_list_of_int( 7 , csv_obj ) res_target = sum( list_int ) s3select_assert_result( int(res_s3select) , int(res_target) ) # the following queries, validates on *random* input an *accurate* relation between condition result,sum operation and count operation. - res_s3select = remove_xml_tags_from_result( run_s3select_json(bucket_name_2,json_obj_name_2,"select count(0),sum(_1.c1),sum(_1.c2) from s3object[*].root where (_1.c1-_1.c2) = 2;" ) ) + res_s3select = remove_xml_tags_from_result( run_s3select(bucket_name_2,csv_obj_name_2,"select count(0),sum(int(_1)),sum(int(_2)) from s3object where (int(_1)-int(_2)) = 2;" ) ) count,sum1,sum2 = res_s3select.split(",") s3select_assert_result( int(count)*2 , int(sum1)-int(sum2 ) ) - res_s3select = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select count(0),sum(_1.c1),sum(_1.c2) from s3object[*].root where (_1.c1-_1.c2) = 4;" ) ) + res_s3select = remove_xml_tags_from_result( run_s3select(bucket_name,csv_obj_name,"select count(0),sum(int(_1)),sum(int(_2)) from s3object where (int(_1)-int(_2)) = 4;" ) ) count,sum1,sum2 = res_s3select.split(",") s3select_assert_result( int(count)*4 , int(sum1)-int(sum2) ) @pytest.mark.s3select -def test_json_nullif_expressions(): - - json_obj = create_random_json_object(10000,10) +def test_csv_json_format_column_sum_min_max(): + csv_obj = create_random_csv_object(10000,10) - json_obj_name = get_random_string() + csv_obj_name = get_random_string() bucket_name = get_new_bucket_name() + + upload_object(bucket_name,csv_obj_name,csv_obj) + + csv_obj_name_2 = get_random_string() + bucket_name_2 = "testbuck2" + upload_object(bucket_name_2,csv_obj_name_2,csv_obj) + + res_s3select = run_s3select_csv_json_format(bucket_name,csv_obj_name,"select max(int(_1)) from s3object;") + list_int = create_list_of_int( 1 , csv_obj ) + res_target = max( list_int ) - upload_object(bucket_name,json_obj_name,json_obj) - - res_s3select_nullif = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select count(0) from s3object[*].root where nullif(_1.c1,_1.c2) is null ;") ).replace("\n","") - - res_s3select = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select count(0) from s3object[*].root where _1.c1 = _1.c2 ;") ).replace("\n","") - - s3select_assert_result( res_s3select_nullif, res_s3select) - - res_s3select_nullif = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select (nullif(_1.c1,_1.c2) is null) from s3object[*].root ;") ).replace("\n","") - - res_s3select = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select (_1.c1 = _1.c2) from s3object[*].root ;") ).replace("\n","") - - s3select_assert_result( res_s3select_nullif, res_s3select) - - res_s3select_nullif = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select count(0) from s3object[*].root where not nullif(_1.c1,_1.c2) is null ;") ).replace("\n","") + s3select_assert_result(res_s3select, '{{"_1":{}}}\n'.format(res_target)) - res_s3select = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select count(0) from s3object[*].root where _1.c1 != _1.c2 ;") ).replace("\n","") +@pytest.mark.s3select +def test_parquet_json_format_column_sum_min_max(): - s3select_assert_result( res_s3select_nullif, res_s3select) + a = [random.randint(1, 10000) for _ in range(100)] - res_s3select_nullif = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select (nullif(_1.c1,_1.c2) is not null) from s3object[*].root ;") ).replace("\n","") + df3 = pd.DataFrame({'a': a}) - res_s3select = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select (_1.c1 != _1.c2) from s3object[*].root ;") ).replace("\n","") + table = pa.Table.from_pandas(df3, preserve_index=False) + obj = pa.BufferOutputStream() + pq.write_table(table, obj) - s3select_assert_result( res_s3select_nullif, res_s3select) + parquet_obj = obj.getvalue().to_pybytes() - res_s3select_nullif = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select count(0) from s3object[*].root where nullif(_1.c1,_1.c2) = _1.c1 ;") ).replace("\n","") + parquet_obj_name = "4col.parquet" + bucket_name = get_new_bucket_name() - res_s3select = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select count(0) from s3object[*].root where _1.c1 != _1.c2 ;") ).replace("\n","") + upload_parquet_object(bucket_name,parquet_obj_name,parquet_obj) + max_value = get_max_from_parquet_column(parquet_obj, 'a') - s3select_assert_result( res_s3select_nullif, res_s3select) + res_s3select = run_s3select_parquet_json_format(bucket_name,parquet_obj_name,'select max(a) from s3object ;') + s3select_assert_result( res_s3select, '{{"_1":{}}}\n'.format(max_value)) @pytest.mark.s3select -def test_column_sum_min_max(): - csv_obj = create_random_csv_object(10000,10) +def test_json_column_sum_min_max(): + csv_obj = create_random_csv_object(10,10) + print(csv_obj) - csv_obj_name = get_random_string() + json_obj = csv_to_json(csv_obj) + print(json_obj) + + json_obj_name = get_random_string() bucket_name = get_new_bucket_name() - - upload_object(bucket_name,csv_obj_name,csv_obj) - - csv_obj_name_2 = get_random_string() - bucket_name_2 = "testbuck2" - upload_object(bucket_name_2,csv_obj_name_2,csv_obj) - - res_s3select = remove_xml_tags_from_result( run_s3select(bucket_name,csv_obj_name,"select min(int(_1)) from s3object;") ).replace(",","") + + upload_object(bucket_name,json_obj_name,json_obj) + + res_s3select = run_s3select_json(bucket_name,json_obj_name,"select min(_1.c1) from s3object[*].root;") list_int = create_list_of_int( 1 , csv_obj ) + print(list_int) res_target = min( list_int ) + print("target") + print(res_target) + print(res_s3select) - s3select_assert_result( int(res_s3select), int(res_target)) + s3select_assert_result( res_s3select, '{{"_1":{}}}\n'.format(res_target)) - res_s3select = remove_xml_tags_from_result( run_s3select(bucket_name,csv_obj_name,"select min(int(_4)) from s3object;") ).replace(",","") + res_s3select = run_s3select_json(bucket_name,json_obj_name,"select min(_1.c4) from s3object[*].root;") list_int = create_list_of_int( 4 , csv_obj ) res_target = min( list_int ) - s3select_assert_result( int(res_s3select), int(res_target)) - - res_s3select = remove_xml_tags_from_result( run_s3select(bucket_name,csv_obj_name,"select avg(int(_6)) from s3object;") ).replace(",","") - list_int = create_list_of_int( 6 , csv_obj ) - res_target = float(sum(list_int ))/10000 - - s3select_assert_result( float(res_s3select), float(res_target)) + s3select_assert_result( res_s3select, '{{"_1":{}}}\n'.format(res_target)) - res_s3select = remove_xml_tags_from_result( run_s3select(bucket_name,csv_obj_name,"select max(int(_4)) from s3object;") ).replace(",","") + res_s3select = run_s3select_json(bucket_name,json_obj_name,"select max(_1.c4) from s3object[*].root;") list_int = create_list_of_int( 4 , csv_obj ) res_target = max( list_int ) - s3select_assert_result( int(res_s3select), int(res_target)) + s3select_assert_result( res_s3select, '{{"_1":{}}}\n'.format(res_target)) - res_s3select = remove_xml_tags_from_result( run_s3select(bucket_name,csv_obj_name,"select max(int(_7)) from s3object;") ).replace(",","") + res_s3select = run_s3select_json(bucket_name,json_obj_name,"select max(_1.c7) from s3object[*].root;") list_int = create_list_of_int( 7 , csv_obj ) res_target = max( list_int ) - s3select_assert_result( int(res_s3select), int(res_target)) + s3select_assert_result( res_s3select, '{{"_1":{}}}\n'.format(res_target)) - res_s3select = remove_xml_tags_from_result( run_s3select(bucket_name,csv_obj_name,"select sum(int(_4)) from s3object;") ).replace(",","") + res_s3select = run_s3select_json(bucket_name,json_obj_name,"select sum(_1.c4) from s3object[*].root;") list_int = create_list_of_int( 4 , csv_obj ) res_target = sum( list_int ) - s3select_assert_result( int(res_s3select), int(res_target)) + s3select_assert_result( res_s3select, '{{"_1":{}}}\n'.format(res_target)) - res_s3select = remove_xml_tags_from_result( run_s3select(bucket_name,csv_obj_name,"select sum(int(_7)) from s3object;") ).replace(",","") + res_s3select = run_s3select_json(bucket_name,json_obj_name,"select sum(_1.c7) from s3object[*].root;") list_int = create_list_of_int( 7 , csv_obj ) res_target = sum( list_int ) - s3select_assert_result( int(res_s3select) , int(res_target) ) - - # the following queries, validates on *random* input an *accurate* relation between condition result,sum operation and count operation. - res_s3select = remove_xml_tags_from_result( run_s3select(bucket_name_2,csv_obj_name_2,"select count(0),sum(int(_1)),sum(int(_2)) from s3object where (int(_1)-int(_2)) = 2;" ) ) - count,sum1,sum2 = res_s3select.split(",") + s3select_assert_result( res_s3select, '{{"_1":{}}}\n'.format(res_target)) - s3select_assert_result( int(count)*2 , int(sum1)-int(sum2 ) ) - - res_s3select = remove_xml_tags_from_result( run_s3select(bucket_name,csv_obj_name,"select count(0),sum(int(_1)),sum(int(_2)) from s3object where (int(_1)-int(_2)) = 4;" ) ) - count,sum1,sum2 = res_s3select.split(",") - - s3select_assert_result( int(count)*4 , int(sum1)-int(sum2) ) @pytest.mark.s3select def test_nullif_expressions(): @@ -649,6 +711,46 @@ def test_nullif_expressions(): s3select_assert_result( res_s3select_nullif, res_s3select) +@pytest.mark.s3select +def test_json_nullif_expressions(): + + json_obj = create_random_json_object(10000,10) + + json_obj_name = get_random_string() + bucket_name = get_new_bucket_name() + + upload_object(bucket_name,json_obj_name,json_obj) + + res_s3select_nullif = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select count(0) from s3object[*].root where nullif(_1.c1,_1.c2) is null ;") ).replace("\n","") + + res_s3select = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select count(0) from s3object[*].root where _1.c1 = _1.c2 ;") ).replace("\n","") + + s3select_assert_result( res_s3select_nullif, res_s3select) + + res_s3select_nullif = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select (nullif(_1.c1,_1.c2) is null) from s3object[*].root ;") ).replace("\n","") + + res_s3select = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select (_1.c1 = _1.c2) from s3object[*].root ;") ).replace("\n","") + + s3select_assert_result( res_s3select_nullif, res_s3select) + + res_s3select_nullif = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select count(0) from s3object[*].root where not nullif(_1.c1,_1.c2) is null ;") ).replace("\n","") + + res_s3select = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select count(0) from s3object[*].root where _1.c1 != _1.c2 ;") ).replace("\n","") + + s3select_assert_result( res_s3select_nullif, res_s3select) + + res_s3select_nullif = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select (nullif(_1.c1,_1.c2) is not null) from s3object[*].root ;") ).replace("\n","") + + res_s3select = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select (_1.c1 != _1.c2) from s3object[*].root ;") ).replace("\n","") + + s3select_assert_result( res_s3select_nullif, res_s3select) + + res_s3select_nullif = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select count(0) from s3object[*].root where nullif(_1.c1,_1.c2) = _1.c1 ;") ).replace("\n","") + + res_s3select = remove_xml_tags_from_result( run_s3select_json(bucket_name,json_obj_name,"select count(0) from s3object[*].root where _1.c1 != _1.c2 ;") ).replace("\n","") + + s3select_assert_result( res_s3select_nullif, res_s3select) + @pytest.mark.s3select def test_nulliftrue_expressions(): @@ -717,6 +819,43 @@ def test_lowerupper_expressions(): s3select_assert_result( res_s3select, "AB12CD$$") +@pytest.mark.s3select +def test_json_lowerupper_expressions(): + + json_obj = create_random_json_object(1,10) + + json_obj_name = get_random_string() + bucket_name = get_new_bucket_name() + + upload_object(bucket_name,json_obj_name,json_obj) + + res_s3select = run_s3select_json(bucket_name,json_obj_name,'select lower("AB12cd$$") from s3object[*] ;') + + s3select_assert_result( res_s3select, '{"_1":ab12cd$$}\n') + + res_s3select = run_s3select_json(bucket_name,json_obj_name,'select upper("ab12CD$$") from s3object[*] ;') + + s3select_assert_result( res_s3select, '{"_1":AB12CD$$}\n') + +@pytest.mark.s3select +def test_parquet_lowerupper_expressions(): + + parquet_obj = create_parquet_object(1) + + parquet_obj_name = "4col.parquet" + bucket_name = get_new_bucket_name() + + upload_parquet_object(bucket_name,parquet_obj_name,parquet_obj) + + res_s3select = run_s3select_parquet_json_format(bucket_name,parquet_obj_name,'select lower("AB12cd$$") from s3object ;') + + s3select_assert_result( res_s3select, '{"_1":ab12cd$$}\n') + + res_s3select = run_s3select_parquet_json_format(bucket_name,parquet_obj_name,'select upper("ab12CD$$") from s3object ;') + + s3select_assert_result( res_s3select, '{"_1":AB12CD$$}\n') + + @pytest.mark.s3select def test_in_expressions(): @@ -922,6 +1061,31 @@ def test_like_expressions(): s3select_assert_result( res_s3select_like, res_s3select ) +@pytest.mark.s3select +def test_json_like_expressions(): + + csv_obj = create_random_csv_object_string(1,10) + print(csv_obj) + json_obj = csv_to_json(csv_obj) + print(json_obj) + + json_obj_name = get_random_string() + bucket_name = get_new_bucket_name() + + upload_object(bucket_name,json_obj_name,json_obj) + + res_s3select_like = remove_xml_tags_from_result(run_s3select_json(bucket_name,json_obj_name,"select count(0) from s3object[*].root where _1.c1 like \"%aeio%\";")).replace("\n","") + + res_s3select = remove_xml_tags_from_result(run_s3select_json(bucket_name,json_obj_name, "select count(0) from s3object[*].root where substring(_1.c1,11,4) = \"aeio\" ;")).replace("\n","") + + s3select_assert_result( res_s3select_like, res_s3select ) + + res_s3select_like = run_s3select_json(bucket_name,json_obj_name,'select (_1.c1 like "%aeio%") from s3object[*].root ;') + + res_s3select = run_s3select_json(bucket_name,json_obj_name, 'select (substring(_1.c1,11,4) = "aeio") from s3object[*].root ;') + + s3select_assert_result( res_s3select_like, res_s3select ) + @pytest.mark.s3select def test_truefalselike_expressions():