Skip to content

Commit

Permalink
use array_slice and array_unique_agg
Browse files (browse the repository at this point in the history)
  • Loading branch information
sfc-gh-cnivera committed Oct 31, 2024
1 parent edbeb4d commit 3b64e22
Showing 1 changed file with 13 additions and 6 deletions.
19 changes: 13 additions & 6 deletions semantic_model_generator/snowflake_utils/snowflake_connector.py
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
import concurrent.futures
import json
from collections import defaultdict
from contextlib import contextmanager
from typing import Any, Dict, Generator, List, Optional, TypeVar
Expand Down Expand Up @@ -195,19 +196,25 @@ def _get_column_representation(
try:
cursor = conn.cursor(DictCursor)
assert cursor is not None, "Cursor is unexpectedly None"
cursor_execute = cursor.execute(
f'select distinct "{column_name}" from {schema_name}.{table_name} limit {ndv}'
)
query = f"""
SELECT ARRAY_SLICE(ARRAY_UNIQUE_AGG("{column_name}"), 0, {ndv}) AS unique_values
FROM (
SELECT "{column_name}"
FROM {schema_name}.{table_name}
SAMPLE (1000 ROWS)
)
"""
cursor_execute = cursor.execute(query)
assert cursor_execute is not None, "cursor_execute should not be none "
res = cursor_execute.fetchall()
# Cast all values to strings so that the list is JSON-serializable.
# A better solution would be to identify the types that are not
# JSON-serializable (e.g. datetime objects) and apply the appropriate
# cast in just those cases.
if len(res) > 0:
if isinstance(res[0], dict):
col_key = [k for k in res[0].keys()][0]
column_values = [str(r[col_key]) for r in res]
if res and isinstance(res[0], dict) and "UNIQUE_VALUES" in res[0]:
unique_values_list = json.loads(res[0]["UNIQUE_VALUES"])
column_values = [str(value) for value in unique_values_list]
else:
raise ValueError(
f"Expected the first item of res to be a dict. Instead passed {res}"
Expand Down

0 comments on commit 3b64e22

Please sign in to comment.