From b9e4027ac56bbb585a27eaa3c3a52fc40568139c Mon Sep 17 00:00:00 2001 From: yafimvo Date: Mon, 27 Feb 2023 15:24:49 +0200 Subject: [PATCH 01/23] table profile added --- CHANGELOG.md | 1 + doc/_toc.yml | 1 + doc/user-guide/explore-tables.md | 88 ++++++++++++++++++ src/sql/inspect.py | 154 ++++++++++++++++++++++++++++++- src/sql/magic_cmd.py | 28 +++++- src/tests/test_magic_cmd.py | 53 +++++++++++ 6 files changed, 323 insertions(+), 2 deletions(-) create mode 100644 doc/user-guide/explore-tables.md diff --git a/CHANGELOG.md b/CHANGELOG.md index b50b9a050..51ef6da9f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # CHANGELOG ## 0.5.7dev +* [Feature] Adds `%sqlcmd profile` (#66) ## 0.5.6 (2023-02-16) diff --git a/doc/_toc.yml b/doc/_toc.yml index 7180cedd4..fcc5ed168 100644 --- a/doc/_toc.yml +++ b/doc/_toc.yml @@ -13,6 +13,7 @@ parts: - file: compose - file: user-guide/tables-columns - file: plot-legacy + - file: user-guide/explore-tables - caption: Integrations chapters: diff --git a/doc/user-guide/explore-tables.md b/doc/user-guide/explore-tables.md new file mode 100644 index 000000000..9d37dd027 --- /dev/null +++ b/doc/user-guide/explore-tables.md @@ -0,0 +1,88 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.14.4 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + +# Explore tables + +When dealing with a new dataset, it's crucial for practitioners to have a comprehensive understanding of the data in a timely manner. This involves exploring and summarizing the dataset efficiently to extract valuable insights. However, this can be a time-consuming process. Fortunately, `%sqlcmd profile` offers an easy way to generate statistics and descriptive information, enabling practitioners to quickly gain a deeper understanding of the dataset. + +Availble statistics: + +* The count of non empty values +* The number of unique values +* The top (most frequent) value +* The frequency of your top value +* The mean, standard deviation, min and max values +* The percentiles of your data: 25%, 50% and 75%. + + +## Examples + +### Simple example with SQLite + +```{code-cell} ipython3 +:tags: [hide-output] + +%load_ext sql +%sql sqlite:// +``` + +Let's create our table + +```{code-cell} ipython3 +:tags: [hide-output] + +%%sql sqlite:// +CREATE TABLE example_table (rating, price, number, symbol); +INSERT INTO example_table VALUES (14.44, 2.48, 82, 'a'); +INSERT INTO example_table VALUES (13.13, 1.50, 93, 'b'); +INSERT INTO example_table VALUES (12.59, 0.20, 98, 'a'); +INSERT INTO example_table VALUES (11.54, 0.41, 89, 'a'); +INSERT INTO example_table VALUES (10.532, 0.1, 88, 'c'); +INSERT INTO example_table VALUES (11.5, 0.2, 84, 'b'); +INSERT INTO example_table VALUES (11.1, 0.3, 90, 'a'); +INSERT INTO example_table VALUES (12.9, 0.31, 86, ''); +INSERT INTO example_table VALUES (12.9, 0.31, 86, ' '); +``` + +```{code-cell} ipython3 +%sqlcmd profile -t example_table +``` + +### Large datasets + +We can easily explore large SQlite database using DuckDB. 
+ +```{code-cell} ipython3 +:tags: [hide-output] + +import urllib.request +from pathlib import Path + +if not Path("example.db").is_file(): + url = "https://raw.githubusercontent.com/lerocha/chinook-database/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite" # noqa + urllib.request.urlretrieve(url, "example.db") +``` + + +```{code-cell} ipython3 +:tags: [hide-output] + +%%sql duckdb:/// +INSTALL 'sqlite_scanner'; +LOAD 'sqlite_scanner'; +CALL sqlite_attach('example.db'); +``` + +```{code-cell} ipython3 +%sqlcmd profile -t track +``` \ No newline at end of file diff --git a/src/sql/inspect.py b/src/sql/inspect.py index 751f86466..d6c16c9ab 100644 --- a/src/sql/inspect.py +++ b/src/sql/inspect.py @@ -1,9 +1,10 @@ from sqlalchemy import inspect from prettytable import PrettyTable from ploomber_core.exceptions import modify_exceptions - from sql.connection import Connection from sql.telemetry import telemetry +import sql.run +import math def _get_inspector(conn): @@ -73,6 +74,146 @@ def __init__(self, name, schema, conn=None) -> None: self._table_txt = self._table.get_string() +@modify_exceptions +class TableDescription(DatabaseInspection): + """ + Generates descriptive statistics. + + Descriptive statistics are: + + Count - Number of all non empty values + + Mean - Mean of the values + + Max - Maximum of the values in the object. + + Min - Minimum of the values in the object. + + STD - Standard deviation of the observations + + 25h, 50h and 75h percentiles + + Unique - Number of unique values + + Top - The most frequent value + + Freq - Frequency of the top value + + """ + + def __init__(self, table_name, config=None, user_ns=None) -> None: + result_table_columns = sql.run.run( + Connection.current, f"SELECT * FROM {table_name} WHERE 1=0", config, user_ns + ) + + columns = result_table_columns.keys + + table_stats = dict({}) + + for column in columns: + table_stats[column] = dict() + result_col_unique_values = sql.run.run( + Connection.current, + f"SELECT COUNT(DISTINCT {column}) as unique_count FROM {table_name}", + config, + user_ns, + ) + + result_col_freq_values = sql.run.run( + Connection.current, + f"""SELECT {column}, COUNT({column}) as frequency FROM {table_name} + GROUP BY {column} ORDER BY Count({column}) Desc""", + config, + user_ns, + ) + + result_non_empty_values = sql.run.run( + Connection.current, + f"""SELECT {column} FROM {table_name} WHERE {column} + IS NOT NULL AND TRIM({column}) <> '' + ORDER BY {column} ASC + """, + config, + user_ns, + ) + + col_values = result_non_empty_values.dict()[column] + count = len(col_values) + table_stats[column]["count"] = count + table_stats[column]["freq"] = result_col_freq_values.dict()["frequency"][0] + table_stats[column]["unique"] = result_col_unique_values.dict()[ + "unique_count" + ][0] + table_stats[column]["top"] = result_col_freq_values.dict()[column][0] + table_stats[column]["min"] = col_values[0] + table_stats[column]["max"] = col_values[count - 1] + + try: + mean = sum(col_values) / count + table_stats[column]["mean"] = mean + + values_sum = sum([(math.pow((v - mean), 2)) for v in col_values]) + std = math.sqrt(values_sum / (count - 1)) + + table_stats[column]["std"] = std + + table_stats[column]["25%"] = self._get_n_percentile(25, col_values) + table_stats[column]["50%"] = self._get_n_percentile(50, col_values) + table_stats[column]["75%"] = self._get_n_percentile(75, col_values) + + except TypeError: + # for non numeric values + table_stats[column]["mean"] = math.nan + table_stats[column]["std"] = math.nan + 
table_stats[column]["25%"] = math.nan + table_stats[column]["50%"] = math.nan + table_stats[column]["75%"] = math.nan + + self._table = PrettyTable() + self._table.field_names = [" "] + list(table_stats.keys()) + + rows = list(table_stats.items())[0][1].keys() + + for row in rows: + values = [row] + for column in table_stats: + value = table_stats[column][row] + values.append(value) + + self._table.add_row(values) + + self._table_html = self._table.get_html_string() + self._table_txt = self._table.get_string() + + def _get_n_percentile(self, n, list) -> float: + """ + Calculates the nth percentile of the given data. + + Parameters + ---------- + n : int + The Nth percentile to comupte. Must be between 0 and 100 inclusive. + + list : list of numeric values + An ordered list of numeric values + + Returns + ------- + nth percentile of the list + """ + if n < 0 or n > 100: + raise ValueError("N must be between 0 and 100 inclusive") + + count = len(list) + lp = ((count + 1) * n) / 100 + index = math.floor(lp) + if index - 1 >= 0: + diff = list[index] - list[index - 1] + distance = lp - index + + return list[index - 1] + distance * diff + + @telemetry.log_call() def get_table_names(schema=None): """Get table names for a given connection""" @@ -83,3 +224,14 @@ def get_table_names(schema=None): def get_columns(name, schema=None): """Get column names for a given connection""" return Columns(name, schema) + + +@telemetry.log_call() +def get_table_statistics(name, config=None, user_ns=None): + """Get table statistics for a given connection. + + For all data types the results will include `count`, `mean`, `std`, `min` + `max`, `25`, `50` and `75` percentiles. It will also include `unique`, `top` + and `freq` statistics. + """ + return TableDescription(name, config=config, user_ns=user_ns) diff --git a/src/sql/magic_cmd.py b/src/sql/magic_cmd.py index aff07f05c..7b795c9da 100644 --- a/src/sql/magic_cmd.py +++ b/src/sql/magic_cmd.py @@ -6,6 +6,7 @@ Magics, line_magic, magics_class, + needs_local_scope, ) from IPython.core.magic_arguments import argument, magic_arguments from IPython.core.error import UsageError @@ -33,6 +34,17 @@ def error(self, message): class SqlCmdMagic(Magics, Configurable): """%sqlcmd magic""" + displaycon = True + autolimit = None + style = "DEFAULT" + short_errors = True + displaylimit = None + autopandas = False + column_local_vars = False + feedback = False + autocommit = False + + @needs_local_scope @line_magic("sqlcmd") @magic_arguments() @argument("line", default="", type=str, help="Command name") @@ -65,8 +77,22 @@ def execute(self, line="", cell="", local_ns=None): args = parser.parse_args(others) return inspect.get_columns(name=args.table, schema=args.schema) + + elif cmd_name == "profile": + parser = CmdParser() + + parser.add_argument( + "-t", "--table", type=str, help="Table name", required=True + ) + + args = parser.parse_args(others) + + user_ns = self.shell.user_ns.copy() + user_ns.update(local_ns) + + return inspect.get_table_statistics(name=args.table, config=self, user_ns=user_ns) else: raise UsageError( f"%sqlcmd has no command: {cmd_name!r}. 
" - "Valid commands are: 'tables', 'columns'" + "Valid commands are: 'tables', 'columns', 'profile' " ) diff --git a/src/tests/test_magic_cmd.py b/src/tests/test_magic_cmd.py index 7fc189d03..6e7a6cb86 100644 --- a/src/tests/test_magic_cmd.py +++ b/src/tests/test_magic_cmd.py @@ -70,3 +70,56 @@ def test_columns_with_schema(ip, tmp_empty): ).result._repr_html_() assert "some_number" in out + + +def test_table_profile(ip): + ip.run_cell(""" + %%sql sqlite:// + CREATE TABLE numbers (rating, price, number, word); + INSERT INTO numbers VALUES (14.44, 2.48, 82, 'a'); + INSERT INTO numbers VALUES (13.13, 1.50, 93, 'b'); + INSERT INTO numbers VALUES (12.59, 0.20, 98, 'a'); + INSERT INTO numbers VALUES (11.54, 0.41, 89, 'a'); + INSERT INTO numbers VALUES (10.532, 0.1, 88, 'c'); + INSERT INTO numbers VALUES (11.5, 0.2, 84, ' '); + INSERT INTO numbers VALUES (11.1, 0.3, 90, 'a'); + INSERT INTO numbers VALUES (12.9, 0.31, 86, ''); + """) + + expected = { + "count": [8, 8, 8, 6], + "mean": [12.2165, 0.6875, 88.75, float("NaN")], + "min": [10.532, 0.1, 82, float("NaN")], + "max": [14.44, 2.48, 98, float("NaN")], + "std": [1.2784055917989632, 0.8504914545636036, + 5.092010548749033, float("NaN")], + "25%": [11.2, 0.2, 84.5, float("NaN")], + "50%": [12.065, 0.305, 88.5, float("NaN")], + "75%": [13.072500000000002, 1.2275, 92.25, float("NaN")], + "unique": [8, 7, 8, 4], + "freq": [1, 2, 1, 4], + "top": [14.44, 0.2, 98, "a"], + + } + + out = ip.run_cell("%sqlcmd profile -t numbers").result + + stats_table = out._table + + for row in stats_table: + criteria = row.get_string( + fields=[" "], border=False).strip() + + rating = row.get_string( + fields=["rating"], border=False, header=False).strip() + + price = row.get_string( + fields=["price"], border=False, header=False).strip() + + number = row.get_string( + fields=["number"], border=False, header=False).strip() + + if criteria in expected: + assert rating == str(expected[criteria][0]) + assert price == str(expected[criteria][1]) + assert number == str(expected[criteria][2]) From 0fa35327168e1c7d05374b2f73df04d9b866f174 Mon Sep 17 00:00:00 2001 From: yafimvo Date: Mon, 27 Feb 2023 15:29:36 +0200 Subject: [PATCH 02/23] lint --- src/sql/magic_cmd.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/sql/magic_cmd.py b/src/sql/magic_cmd.py index 7b795c9da..2697f9cd6 100644 --- a/src/sql/magic_cmd.py +++ b/src/sql/magic_cmd.py @@ -90,7 +90,9 @@ def execute(self, line="", cell="", local_ns=None): user_ns = self.shell.user_ns.copy() user_ns.update(local_ns) - return inspect.get_table_statistics(name=args.table, config=self, user_ns=user_ns) + return inspect.get_table_statistics( + name=args.table, config=self, user_ns=user_ns + ) else: raise UsageError( f"%sqlcmd has no command: {cmd_name!r}. " From eca6957cd2a58bde7b9130faadac68ca6ce6afa1 Mon Sep 17 00:00:00 2001 From: yafimvo Date: Mon, 27 Feb 2023 15:43:22 +0200 Subject: [PATCH 03/23] test fixed --- src/sql/magic_cmd.py | 2 +- src/tests/test_magic_cmd.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sql/magic_cmd.py b/src/sql/magic_cmd.py index 2697f9cd6..e3c0acba2 100644 --- a/src/sql/magic_cmd.py +++ b/src/sql/magic_cmd.py @@ -96,5 +96,5 @@ def execute(self, line="", cell="", local_ns=None): else: raise UsageError( f"%sqlcmd has no command: {cmd_name!r}. 
" - "Valid commands are: 'tables', 'columns', 'profile' " + "Valid commands are: 'tables', 'columns', 'profile'" ) diff --git a/src/tests/test_magic_cmd.py b/src/tests/test_magic_cmd.py index 6e7a6cb86..4ab0b72fd 100644 --- a/src/tests/test_magic_cmd.py +++ b/src/tests/test_magic_cmd.py @@ -10,7 +10,7 @@ [ "%sqlcmd stuff", UsageError, - "%sqlcmd has no command: 'stuff'. Valid commands are: 'tables', 'columns'", + "%sqlcmd has no command: 'stuff'. Valid commands are: 'tables', 'columns', 'profile'", ], [ "%sqlcmd columns", From a400a036fa75b46f7d52d00deebc72320105c2a3 Mon Sep 17 00:00:00 2001 From: yafimvo Date: Mon, 27 Feb 2023 15:52:54 +0200 Subject: [PATCH 04/23] lint --- src/tests/test_magic_cmd.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/tests/test_magic_cmd.py b/src/tests/test_magic_cmd.py index 4ab0b72fd..e367dd80f 100644 --- a/src/tests/test_magic_cmd.py +++ b/src/tests/test_magic_cmd.py @@ -10,7 +10,8 @@ [ "%sqlcmd stuff", UsageError, - "%sqlcmd has no command: 'stuff'. Valid commands are: 'tables', 'columns', 'profile'", + "%sqlcmd has no command: 'stuff'. Valid commands are: 'tables', " + "'columns', 'profile'", ], [ "%sqlcmd columns", @@ -73,7 +74,8 @@ def test_columns_with_schema(ip, tmp_empty): def test_table_profile(ip): - ip.run_cell(""" + ip.run_cell( + """ %%sql sqlite:// CREATE TABLE numbers (rating, price, number, word); INSERT INTO numbers VALUES (14.44, 2.48, 82, 'a'); @@ -84,22 +86,26 @@ def test_table_profile(ip): INSERT INTO numbers VALUES (11.5, 0.2, 84, ' '); INSERT INTO numbers VALUES (11.1, 0.3, 90, 'a'); INSERT INTO numbers VALUES (12.9, 0.31, 86, ''); - """) + """ + ) expected = { "count": [8, 8, 8, 6], "mean": [12.2165, 0.6875, 88.75, float("NaN")], "min": [10.532, 0.1, 82, float("NaN")], "max": [14.44, 2.48, 98, float("NaN")], - "std": [1.2784055917989632, 0.8504914545636036, - 5.092010548749033, float("NaN")], + "std": [ + 1.2784055917989632, + 0.8504914545636036, + 5.092010548749033, + float("NaN"), + ], "25%": [11.2, 0.2, 84.5, float("NaN")], "50%": [12.065, 0.305, 88.5, float("NaN")], "75%": [13.072500000000002, 1.2275, 92.25, float("NaN")], "unique": [8, 7, 8, 4], "freq": [1, 2, 1, 4], "top": [14.44, 0.2, 98, "a"], - } out = ip.run_cell("%sqlcmd profile -t numbers").result @@ -107,17 +113,13 @@ def test_table_profile(ip): stats_table = out._table for row in stats_table: - criteria = row.get_string( - fields=[" "], border=False).strip() + criteria = row.get_string(fields=[" "], border=False).strip() - rating = row.get_string( - fields=["rating"], border=False, header=False).strip() + rating = row.get_string(fields=["rating"], border=False, header=False).strip() - price = row.get_string( - fields=["price"], border=False, header=False).strip() + price = row.get_string(fields=["price"], border=False, header=False).strip() - number = row.get_string( - fields=["number"], border=False, header=False).strip() + number = row.get_string(fields=["number"], border=False, header=False).strip() if criteria in expected: assert rating == str(expected[criteria][0]) From 704108123d27788caeb983854b786c1c75895803 Mon Sep 17 00:00:00 2001 From: yafimvo Date: Mon, 27 Feb 2023 16:14:49 +0200 Subject: [PATCH 05/23] autopolars property added to config --- src/sql/magic_cmd.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sql/magic_cmd.py b/src/sql/magic_cmd.py index e3c0acba2..25a4e684d 100644 --- a/src/sql/magic_cmd.py +++ b/src/sql/magic_cmd.py @@ -43,6 +43,7 @@ class SqlCmdMagic(Magics, Configurable): 
column_local_vars = False feedback = False autocommit = False + autopolars = False @needs_local_scope @line_magic("sqlcmd") From 9ccd1cc62fa37e21776c5226378a5800ae4f1eef Mon Sep 17 00:00:00 2001 From: yafimvo Date: Mon, 27 Feb 2023 17:48:10 +0200 Subject: [PATCH 06/23] save report added --- doc/user-guide/explore-tables.md | 10 +++++++++- src/sql/inspect.py | 5 +++-- src/sql/magic_cmd.py | 12 +++++++++++- src/tests/test_magic_cmd.py | 30 ++++++++++++++++++++++++++---- 4 files changed, 49 insertions(+), 8 deletions(-) diff --git a/doc/user-guide/explore-tables.md b/doc/user-guide/explore-tables.md index 9d37dd027..2a9a82b05 100644 --- a/doc/user-guide/explore-tables.md +++ b/doc/user-guide/explore-tables.md @@ -85,4 +85,12 @@ CALL sqlite_attach('example.db'); ```{code-cell} ipython3 %sqlcmd profile -t track -``` \ No newline at end of file +``` + +### Saving report as HTML + +To save the generated report as an HTML file, use the `--output`/`-o` attribute followed by the desired file name + +``` +%sqlcmd profile -t track --output my-report.html +``` diff --git a/src/sql/inspect.py b/src/sql/inspect.py index d6c16c9ab..d7d2c3836 100644 --- a/src/sql/inspect.py +++ b/src/sql/inspect.py @@ -207,11 +207,12 @@ def _get_n_percentile(self, n, list) -> float: count = len(list) lp = ((count + 1) * n) / 100 index = math.floor(lp) - if index - 1 >= 0: + if index - 1 >= 0 and index < len(list): diff = list[index] - list[index - 1] distance = lp - index - return list[index - 1] + distance * diff + else: + return None @telemetry.log_call() diff --git a/src/sql/magic_cmd.py b/src/sql/magic_cmd.py index 25a4e684d..0ad8249c7 100644 --- a/src/sql/magic_cmd.py +++ b/src/sql/magic_cmd.py @@ -86,14 +86,24 @@ def execute(self, line="", cell="", local_ns=None): "-t", "--table", type=str, help="Table name", required=True ) + parser.add_argument( + "-o", "--output", type=str, help="Store report location", required=False + ) + args = parser.parse_args(others) user_ns = self.shell.user_ns.copy() user_ns.update(local_ns) - return inspect.get_table_statistics( + report = inspect.get_table_statistics( name=args.table, config=self, user_ns=user_ns ) + + if args.output: + with open(args.output, "w") as f: + f.write(report._repr_html_()) + + return report else: raise UsageError( f"%sqlcmd has no command: {cmd_name!r}. 
" diff --git a/src/tests/test_magic_cmd.py b/src/tests/test_magic_cmd.py index e367dd80f..cdae9f2f2 100644 --- a/src/tests/test_magic_cmd.py +++ b/src/tests/test_magic_cmd.py @@ -2,6 +2,7 @@ import pytest from IPython.core.error import UsageError +from pathlib import Path @pytest.mark.parametrize( @@ -73,7 +74,7 @@ def test_columns_with_schema(ip, tmp_empty): assert "some_number" in out -def test_table_profile(ip): +def test_table_profile(ip, tmp_empty): ip.run_cell( """ %%sql sqlite:// @@ -92,8 +93,8 @@ def test_table_profile(ip): expected = { "count": [8, 8, 8, 6], "mean": [12.2165, 0.6875, 88.75, float("NaN")], - "min": [10.532, 0.1, 82, float("NaN")], - "max": [14.44, 2.48, 98, float("NaN")], + "min": [10.532, 0.1, 82, "a"], + "max": [14.44, 2.48, 98, "c"], "std": [ 1.2784055917989632, 0.8504914545636036, @@ -103,7 +104,7 @@ def test_table_profile(ip): "25%": [11.2, 0.2, 84.5, float("NaN")], "50%": [12.065, 0.305, 88.5, float("NaN")], "75%": [13.072500000000002, 1.2275, 92.25, float("NaN")], - "unique": [8, 7, 8, 4], + "unique": [8, 7, 8, 5], "freq": [1, 2, 1, 4], "top": [14.44, 0.2, 98, "a"], } @@ -121,7 +122,28 @@ def test_table_profile(ip): number = row.get_string(fields=["number"], border=False, header=False).strip() + word = row.get_string(fields=["word"], border=False, header=False).strip() + if criteria in expected: assert rating == str(expected[criteria][0]) assert price == str(expected[criteria][1]) assert number == str(expected[criteria][2]) + assert word == str(expected[criteria][3]) + + +def test_table_profile_store(ip, tmp_empty): + ip.run_cell( + """ + %%sql sqlite:// + CREATE TABLE test_store (rating, price, number, symbol); + INSERT INTO test_store VALUES (14.44, 2.48, 82, 'a'); + INSERT INTO test_store VALUES (13.13, 1.50, 93, 'b'); + INSERT INTO test_store VALUES (12.59, 0.20, 98, 'a'); + INSERT INTO test_store VALUES (11.54, 0.41, 89, 'a'); + """ + ) + + ip.run_cell("%sqlcmd profile -t test_store --output test_report.html") + + report = Path("test_report.html") + assert report.is_file() From 9a0dc82e11f805a4dbcf0360119e159da6b7a14c Mon Sep 17 00:00:00 2001 From: yafimvo Date: Sun, 5 Mar 2023 18:08:22 +0200 Subject: [PATCH 07/23] percentile_disc added, schema added, docs updated --- doc/_toc.yml | 2 +- doc/user-guide/data-profiling.md | 158 +++++++++++++++++++++++++++++++ doc/user-guide/explore-tables.md | 96 ------------------- src/sql/inspect.py | 68 ++++++++----- src/sql/magic_cmd.py | 6 +- src/sql/util.py | 34 +++++++ src/tests/test_magic_cmd.py | 60 ++++++++++-- 7 files changed, 298 insertions(+), 126 deletions(-) create mode 100644 doc/user-guide/data-profiling.md delete mode 100644 doc/user-guide/explore-tables.md create mode 100644 src/sql/util.py diff --git a/doc/_toc.yml b/doc/_toc.yml index 0909c5175..2275954ac 100644 --- a/doc/_toc.yml +++ b/doc/_toc.yml @@ -14,7 +14,7 @@ parts: - file: user-guide/tables-columns - file: plot-legacy - file: user-guide/template - - file: user-guide/explore-tables + - file: user-guide/data-profiling - caption: Integrations chapters: diff --git a/doc/user-guide/data-profiling.md b/doc/user-guide/data-profiling.md new file mode 100644 index 000000000..c9ffa8289 --- /dev/null +++ b/doc/user-guide/data-profiling.md @@ -0,0 +1,158 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.14.4 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + +# Data profiling + +When dealing with a new dataset, it's crucial for practitioners 
to have a comprehensive understanding of the data in a timely manner. This involves exploring and summarizing the dataset efficiently to extract valuable insights. However, this can be a time-consuming process. Fortunately, `%sqlcmd profile` offers an easy way to generate statistics and descriptive information, enabling practitioners to quickly gain a deeper understanding of the dataset.

Available statistics:

* The count of non-empty values
* The number of unique values
* The top (most frequent) value
* The frequency of your top value
* The mean, standard deviation, min and max values
* The percentiles of your data: 25%, 50% and 75%.


## Examples

### DuckDB

In this example we'll demonstrate the process of profiling a sample dataset that contains historical taxi data from NYC, using DuckDB. However, the code used here is compatible with all major databases.

Download the data:

```{code-cell} ipython3
from pathlib import Path
from urllib.request import urlretrieve

url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet"

if not Path("yellow_tripdata_2021-01.parquet").is_file():
    urlretrieve(url, "yellow_tripdata_2021-01.parquet")
```

Setup

```{note}
this example requires duckdb-engine: `pip install duckdb-engine`
```

Load the extension and connect to an in-memory DuckDB database:

```{code-cell} ipython3
%load_ext sql
```

```{code-cell} ipython3
%sql duckdb://
```

Profile table

```{code-cell} ipython3
%sqlcmd profile --table "yellow_tripdata_2021-01.parquet"
```

### SQLite

We can easily explore a large SQLite database using DuckDB.

```{code-cell} ipython3
:tags: [hide-output]

import urllib.request
from pathlib import Path

if not Path("example.db").is_file():
    url = "https://raw.githubusercontent.com/lerocha/chinook-database/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite" # noqa
    urllib.request.urlretrieve(url, "example.db")
```


```{code-cell} ipython3
:tags: [hide-output]

%%sql duckdb:///
INSTALL 'sqlite_scanner';
LOAD 'sqlite_scanner';
CALL sqlite_attach('example.db');
```

```{code-cell} ipython3
%sqlcmd profile -t track
```

### Saving report as HTML

To save the generated report as an HTML file, use the `--output`/`-o` option followed by the desired file name:

```{code-cell} ipython3
:tags: [hide-output]

%sqlcmd profile -t track --output my-report.html
```

```{code-cell} ipython3
from IPython.display import HTML
HTML("my-report.html")
```

### Use schemas

To profile a table that lives in a specific schema, we can use the `--schema`/`-s` option.
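
Under the hood, passing `--schema` simply qualifies the table name before the statistics queries run. A minimal sketch of the logic this PR adds in `sql/inspect.py`:

```python
# sketch: how the profiled table is resolved when a schema is given
if schema:
    table_name = f"{schema}.{table_name}"  # queries then target schema.table
```
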
+ +```{code-cell} ipython3 +:tags: [hide-output] + +import sqlite3 + +with sqlite3.connect("a.db") as conn: + conn.execute("CREATE TABLE my_numbers (number FLOAT)") + conn.execute("INSERT INTO my_numbers VALUES (1)") + conn.execute("INSERT INTO my_numbers VALUES (2)") + conn.execute("INSERT INTO my_numbers VALUES (3)") +``` + +```{code-cell} ipython3 +:tags: [hide-output] + +%%sql +ATTACH DATABASE 'a.db' AS a_schema +``` + +```{code-cell} ipython3 +:tags: [hide-output] + +import sqlite3 + +with sqlite3.connect("b.db") as conn: + conn.execute("CREATE TABLE my_numbers (number FLOAT)") + conn.execute("INSERT INTO my_numbers VALUES (11)") + conn.execute("INSERT INTO my_numbers VALUES (22)") + conn.execute("INSERT INTO my_numbers VALUES (33)") +``` + +```{code-cell} ipython3 +:tags: [hide-output] + +%%sql +ATTACH DATABASE 'b.db' AS b_schema +``` + +Let's profile `my_numbers` of `b_schema` + +```{code-cell} ipython3 +%sqlcmd profile --table my_numbers --schema b_schema +``` diff --git a/doc/user-guide/explore-tables.md b/doc/user-guide/explore-tables.md deleted file mode 100644 index 2a9a82b05..000000000 --- a/doc/user-guide/explore-tables.md +++ /dev/null @@ -1,96 +0,0 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.14.4 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - -# Explore tables - -When dealing with a new dataset, it's crucial for practitioners to have a comprehensive understanding of the data in a timely manner. This involves exploring and summarizing the dataset efficiently to extract valuable insights. However, this can be a time-consuming process. Fortunately, `%sqlcmd profile` offers an easy way to generate statistics and descriptive information, enabling practitioners to quickly gain a deeper understanding of the dataset. - -Availble statistics: - -* The count of non empty values -* The number of unique values -* The top (most frequent) value -* The frequency of your top value -* The mean, standard deviation, min and max values -* The percentiles of your data: 25%, 50% and 75%. - - -## Examples - -### Simple example with SQLite - -```{code-cell} ipython3 -:tags: [hide-output] - -%load_ext sql -%sql sqlite:// -``` - -Let's create our table - -```{code-cell} ipython3 -:tags: [hide-output] - -%%sql sqlite:// -CREATE TABLE example_table (rating, price, number, symbol); -INSERT INTO example_table VALUES (14.44, 2.48, 82, 'a'); -INSERT INTO example_table VALUES (13.13, 1.50, 93, 'b'); -INSERT INTO example_table VALUES (12.59, 0.20, 98, 'a'); -INSERT INTO example_table VALUES (11.54, 0.41, 89, 'a'); -INSERT INTO example_table VALUES (10.532, 0.1, 88, 'c'); -INSERT INTO example_table VALUES (11.5, 0.2, 84, 'b'); -INSERT INTO example_table VALUES (11.1, 0.3, 90, 'a'); -INSERT INTO example_table VALUES (12.9, 0.31, 86, ''); -INSERT INTO example_table VALUES (12.9, 0.31, 86, ' '); -``` - -```{code-cell} ipython3 -%sqlcmd profile -t example_table -``` - -### Large datasets - -We can easily explore large SQlite database using DuckDB. 
- -```{code-cell} ipython3 -:tags: [hide-output] - -import urllib.request -from pathlib import Path - -if not Path("example.db").is_file(): - url = "https://raw.githubusercontent.com/lerocha/chinook-database/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite" # noqa - urllib.request.urlretrieve(url, "example.db") -``` - - -```{code-cell} ipython3 -:tags: [hide-output] - -%%sql duckdb:/// -INSTALL 'sqlite_scanner'; -LOAD 'sqlite_scanner'; -CALL sqlite_attach('example.db'); -``` - -```{code-cell} ipython3 -%sqlcmd profile -t track -``` - -### Saving report as HTML - -To save the generated report as an HTML file, use the `--output`/`-o` attribute followed by the desired file name - -``` -%sqlcmd profile -t track --output my-report.html -``` diff --git a/src/sql/inspect.py b/src/sql/inspect.py index d7d2c3836..6ba3370fc 100644 --- a/src/sql/inspect.py +++ b/src/sql/inspect.py @@ -5,6 +5,7 @@ from sql.telemetry import telemetry import sql.run import math +from sql.util import convert_to_scientific def _get_inspector(conn): @@ -101,7 +102,11 @@ class TableDescription(DatabaseInspection): """ - def __init__(self, table_name, config=None, user_ns=None) -> None: + def __init__(self, table_name, schema=None, config=None, user_ns=None) -> None: + + if schema: + table_name = f"{schema}.{table_name}" + result_table_columns = sql.run.run( Connection.current, f"SELECT * FROM {table_name} WHERE 1=0", config, user_ns ) @@ -157,9 +162,15 @@ def __init__(self, table_name, config=None, user_ns=None) -> None: table_stats[column]["std"] = std - table_stats[column]["25%"] = self._get_n_percentile(25, col_values) - table_stats[column]["50%"] = self._get_n_percentile(50, col_values) - table_stats[column]["75%"] = self._get_n_percentile(75, col_values) + table_stats[column]["25%"] = self._get_n_percentile( + 25, table_name, column, config, user_ns + ) + table_stats[column]["50%"] = self._get_n_percentile( + 50, table_name, column, config, user_ns + ) + table_stats[column]["75%"] = self._get_n_percentile( + 75, table_name, column, config, user_ns + ) except TypeError: # for non numeric values @@ -169,6 +180,11 @@ def __init__(self, table_name, config=None, user_ns=None) -> None: table_stats[column]["50%"] = math.nan table_stats[column]["75%"] = math.nan + except BaseException: + # Failed to run sql command. + # We ignore the cell stats for such case. + pass + self._table = PrettyTable() self._table.field_names = [" "] + list(table_stats.keys()) @@ -178,6 +194,7 @@ def __init__(self, table_name, config=None, user_ns=None) -> None: values = [row] for column in table_stats: value = table_stats[column][row] + value = convert_to_scientific(value) values.append(value) self._table.add_row(values) @@ -185,34 +202,41 @@ def __init__(self, table_name, config=None, user_ns=None) -> None: self._table_html = self._table.get_html_string() self._table_txt = self._table.get_string() - def _get_n_percentile(self, n, list) -> float: + def _get_n_percentile( + self, percentile, table_name, column, config, user_ns + ) -> float: """ - Calculates the nth percentile of the given data. + Uses percentile_disc SQL query to compute the nth percentile of a + specified column in a specified table. Parameters ---------- n : int The Nth percentile to comupte. Must be between 0 and 100 inclusive. 
- list : list of numeric values - An ordered list of numeric values + table_name : str + Name of SQL table + + column : str + Name of the column in table Returns ------- - nth percentile of the list + Nth percentile of the list """ - if n < 0 or n > 100: - raise ValueError("N must be between 0 and 100 inclusive") + percentile = percentile / 100 + + percentile = sql.run.run( + Connection.current, + f""" + SELECT percentile_disc({percentile}) WITHIN GROUP (ORDER BY {column}) + as percentile, FROM {table_name} + """, + config, + user_ns, + ) - count = len(list) - lp = ((count + 1) * n) / 100 - index = math.floor(lp) - if index - 1 >= 0 and index < len(list): - diff = list[index] - list[index - 1] - distance = lp - index - return list[index - 1] + distance * diff - else: - return None + return percentile.dict()["percentile"][0] @telemetry.log_call() @@ -228,11 +252,11 @@ def get_columns(name, schema=None): @telemetry.log_call() -def get_table_statistics(name, config=None, user_ns=None): +def get_table_statistics(name, schema=None, config=None, user_ns=None): """Get table statistics for a given connection. For all data types the results will include `count`, `mean`, `std`, `min` `max`, `25`, `50` and `75` percentiles. It will also include `unique`, `top` and `freq` statistics. """ - return TableDescription(name, config=config, user_ns=user_ns) + return TableDescription(name, schema=schema, config=config, user_ns=user_ns) diff --git a/src/sql/magic_cmd.py b/src/sql/magic_cmd.py index 0ad8249c7..4471b8ddd 100644 --- a/src/sql/magic_cmd.py +++ b/src/sql/magic_cmd.py @@ -86,6 +86,10 @@ def execute(self, line="", cell="", local_ns=None): "-t", "--table", type=str, help="Table name", required=True ) + parser.add_argument( + "-s", "--schema", type=str, help="Schema name", required=False + ) + parser.add_argument( "-o", "--output", type=str, help="Store report location", required=False ) @@ -96,7 +100,7 @@ def execute(self, line="", cell="", local_ns=None): user_ns.update(local_ns) report = inspect.get_table_statistics( - name=args.table, config=self, user_ns=user_ns + schema=args.schema, name=args.table, config=self, user_ns=user_ns ) if args.output: diff --git a/src/sql/util.py b/src/sql/util.py new file mode 100644 index 000000000..347f302e4 --- /dev/null +++ b/src/sql/util.py @@ -0,0 +1,34 @@ +import numpy as np + + +def convert_to_scientific(value): + """ + Converts value to scientific notation if necessary + + Parameters + ---------- + value : any + Value to format. + """ + if ( + isinstance(value, (int, float)) + and not isinstance(value, bool) + and _is_long_number(value) + ): + new_value = np.format_float_scientific(value, exp_digits=2, precision=3) + + else: + new_value = value + + return new_value + + +def _is_long_number(num) -> bool: + """ + Checks if num's digits > 10 + """ + if "." 
in str(num): + split_by_decimal = str(num).split(".") + if len(split_by_decimal[0]) > 10 or len(split_by_decimal[1]) > 10: + return True + return False diff --git a/src/tests/test_magic_cmd.py b/src/tests/test_magic_cmd.py index cdae9f2f2..c9e3a9bd6 100644 --- a/src/tests/test_magic_cmd.py +++ b/src/tests/test_magic_cmd.py @@ -96,19 +96,21 @@ def test_table_profile(ip, tmp_empty): "min": [10.532, 0.1, 82, "a"], "max": [14.44, 2.48, 98, "c"], "std": [ - 1.2784055917989632, - 0.8504914545636036, - 5.092010548749033, + "1.278e+00", + "8.505e-01", + "5.092e+00", float("NaN"), ], - "25%": [11.2, 0.2, 84.5, float("NaN")], - "50%": [12.065, 0.305, 88.5, float("NaN")], - "75%": [13.072500000000002, 1.2275, 92.25, float("NaN")], + # "25%": [11.2, 0.2, 84.5, float("NaN")], + # "50%": [12.065, 0.305, 88.5, float("NaN")], + # "75%": [13.072500000000002, 1.2275, 92.25, float("NaN")], "unique": [8, 7, 8, 5], "freq": [1, 2, 1, 4], "top": [14.44, 0.2, 98, "a"], } + # note : We ignote Nth percentile since sqlite doesn't support `percentile_disc` + out = ip.run_cell("%sqlcmd profile -t numbers").result stats_table = out._table @@ -131,6 +133,52 @@ def test_table_profile(ip, tmp_empty): assert word == str(expected[criteria][3]) +def test_table_schema_profile(ip, tmp_empty): + + with sqlite3.connect("a.db") as conn: + conn.execute("CREATE TABLE t (n FLOAT)") + conn.execute("INSERT INTO t VALUES (1)") + conn.execute("INSERT INTO t VALUES (2)") + conn.execute("INSERT INTO t VALUES (3)") + + with sqlite3.connect("b.db") as conn: + conn.execute("CREATE TABLE t (n FLOAT)") + conn.execute("INSERT INTO t VALUES (11)") + conn.execute("INSERT INTO t VALUES (22)") + conn.execute("INSERT INTO t VALUES (33)") + + ip.run_cell( + """ + %%sql sqlite:// + ATTACH DATABASE 'a.db' AS a_schema; + ATTACH DATABASE 'b.db' AS b_schema; + """ + ) + + expected = { + "count": [3], + "mean": [22.0], + "min": [11.0], + "max": [33.0], + "std": [11.0], + "unique": [3], + "freq": [1], + "top": [33.0], + } + + out = ip.run_cell("%sqlcmd profile -t t --schema b_schema").result + + stats_table = out._table + + for row in stats_table: + criteria = row.get_string(fields=[" "], border=False).strip() + + cell = row.get_string(fields=["n"], border=False, header=False).strip() + + if criteria in expected: + assert cell == str(expected[criteria][0]) + + def test_table_profile_store(ip, tmp_empty): ip.run_cell( """ From 56e3d2eb80e68105d076c1630cebef2f0656913d Mon Sep 17 00:00:00 2001 From: yafimvo Date: Sun, 5 Mar 2023 18:14:16 +0200 Subject: [PATCH 08/23] numpy added to setup --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 1234d0991..af595b434 100644 --- a/setup.py +++ b/setup.py @@ -24,6 +24,7 @@ "jinja2", "ploomber-core>=0.2.4", 'importlib-metadata;python_version<"3.8"', + "numpy" ] DEV = [ From 431d2fb7c1c5b298dd634a28ffd58d86501a7865 Mon Sep 17 00:00:00 2001 From: yafimvo Date: Tue, 7 Mar 2023 19:29:00 +0200 Subject: [PATCH 09/23] np removed, run_raw added, queries updated, test fixed --- doc/user-guide/data-profiling.md | 3 +- setup.py | 3 +- src/sql/inspect.py | 157 +++++++++++++------------------ src/sql/magic_cmd.py | 22 +---- src/sql/run.py | 17 +++- src/sql/util.py | 5 +- src/tests/test_magic_cmd.py | 29 ++---- 7 files changed, 96 insertions(+), 140 deletions(-) diff --git a/doc/user-guide/data-profiling.md b/doc/user-guide/data-profiling.md index c9ffa8289..b36359038 100644 --- a/doc/user-guide/data-profiling.md +++ b/doc/user-guide/data-profiling.md @@ -24,7 +24,6 @@ Availble statistics: * The 
mean, standard deviation, min and max values * The percentiles of your data: 25%, 50% and 75%. - ## Examples ### DuckDB @@ -46,7 +45,7 @@ if not Path("yellow_tripdata_2021-01.parquet").is_file(): Setup ```{note} -this example requires duckdb-engine: `pip install duckdb-engine` +This example requires duckdb-engine: `pip install duckdb-engine` ``` Load the extension and connect to an in-memory DuckDB database: diff --git a/setup.py b/setup.py index af595b434..51849f922 100644 --- a/setup.py +++ b/setup.py @@ -23,8 +23,7 @@ "ipython-genutils>=0.1.0", "jinja2", "ploomber-core>=0.2.4", - 'importlib-metadata;python_version<"3.8"', - "numpy" + 'importlib-metadata;python_version<"3.8"' ] DEV = [ diff --git a/src/sql/inspect.py b/src/sql/inspect.py index 6ba3370fc..cb5ffe5cb 100644 --- a/src/sql/inspect.py +++ b/src/sql/inspect.py @@ -82,7 +82,7 @@ class TableDescription(DatabaseInspection): Descriptive statistics are: - Count - Number of all non empty values + Count - Number of all not None values Mean - Mean of the values @@ -94,7 +94,7 @@ class TableDescription(DatabaseInspection): 25h, 50h and 75h percentiles - Unique - Number of unique values + Unique - Number of not None unique values Top - The most frequent value @@ -102,75 +102,83 @@ class TableDescription(DatabaseInspection): """ - def __init__(self, table_name, schema=None, config=None, user_ns=None) -> None: - + def __init__(self, table_name, schema=None, config=None) -> None: if schema: table_name = f"{schema}.{table_name}" - result_table_columns = sql.run.run( - Connection.current, f"SELECT * FROM {table_name} WHERE 1=0", config, user_ns - ) - - columns = result_table_columns.keys + columns = sql.run.run_raw( + Connection.current, f"SELECT * FROM {table_name} WHERE 1=0", config + ).keys table_stats = dict({}) for column in columns: table_stats[column] = dict() - result_col_unique_values = sql.run.run( - Connection.current, - f"SELECT COUNT(DISTINCT {column}) as unique_count FROM {table_name}", - config, - user_ns, - ) - - result_col_freq_values = sql.run.run( + result_col_freq_values = sql.run.run_raw( Connection.current, - f"""SELECT {column}, COUNT({column}) as frequency FROM {table_name} + f"""SELECT {column} as top, + COUNT({column}) as frequency FROM {table_name} GROUP BY {column} ORDER BY Count({column}) Desc""", config, - user_ns, - ) + ).dict() - result_non_empty_values = sql.run.run( + # get all non None values, min, max and avg. 
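+            # note: TRIM({column}) <> '' below also filters out blank strings,
+            # so empty values are excluded from count, unique, min and max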
+ result_value_values = sql.run.run_raw( Connection.current, - f"""SELECT {column} FROM {table_name} WHERE {column} - IS NOT NULL AND TRIM({column}) <> '' - ORDER BY {column} ASC + f""" + SELECT MIN({column}) AS min, + MAX({column}) AS max, + COUNT(DISTINCT {column}) AS unique_count, + COUNT({column}) AS total + FROM {table_name} + WHERE {column} IS NOT NULL AND TRIM({column}) <> '' """, config, - user_ns, - ) - - col_values = result_non_empty_values.dict()[column] - count = len(col_values) - table_stats[column]["count"] = count - table_stats[column]["freq"] = result_col_freq_values.dict()["frequency"][0] - table_stats[column]["unique"] = result_col_unique_values.dict()[ - "unique_count" - ][0] - table_stats[column]["top"] = result_col_freq_values.dict()[column][0] - table_stats[column]["min"] = col_values[0] - table_stats[column]["max"] = col_values[count - 1] + ).dict() - try: - mean = sum(col_values) / count - table_stats[column]["mean"] = mean + table_stats[column]["freq"] = result_col_freq_values["frequency"][0] + table_stats[column]["top"] = result_col_freq_values["top"][0] + table_stats[column]["count"] = result_value_values["total"][0] + table_stats[column]["unique"] = result_value_values["unique_count"][0] + table_stats[column]["min"] = result_value_values["min"][0] + table_stats[column]["max"] = result_value_values["max"][0] - values_sum = sum([(math.pow((v - mean), 2)) for v in col_values]) - std = math.sqrt(values_sum / (count - 1)) + avg = None + try: + results_avg = sql.run.run_raw( + Connection.current, + f""" + SELECT AVG({column}) AS avg + FROM {table_name} + WHERE {column} IS NOT NULL AND TRIM({column}) <> '' + """, + config, + ).dict() + avg = results_avg["avg"][0] + except BaseException: + avg = math.nan - table_stats[column]["std"] = std + table_stats[column]["mean"] = avg - table_stats[column]["25%"] = self._get_n_percentile( - 25, table_name, column, config, user_ns - ) - table_stats[column]["50%"] = self._get_n_percentile( - 50, table_name, column, config, user_ns - ) - table_stats[column]["75%"] = self._get_n_percentile( - 75, table_name, column, config, user_ns - ) + try: + # Note: This STDEV and PERCENTILE_DISC will work only on DuckDB + result = sql.run.run_raw( + Connection.current, + f""" + SELECT + stddev_pop({column}) as std, + percentile_disc(0.25) WITHIN GROUP (ORDER BY {column}) as p25, + percentile_disc(0.50) WITHIN GROUP (ORDER BY {column}) as p50, + percentile_disc(0.75) WITHIN GROUP (ORDER BY {column}) as p75 + FROM {table_name} + """, + config, + ).dict() + + table_stats[column]["std"] = result["std"][0] + table_stats[column]["25%"] = result["p25"][0] + table_stats[column]["50%"] = result["p50"][0] + table_stats[column]["75%"] = result["p75"][0] except TypeError: # for non numeric values @@ -193,7 +201,10 @@ def __init__(self, table_name, schema=None, config=None, user_ns=None) -> None: for row in rows: values = [row] for column in table_stats: - value = table_stats[column][row] + if row in table_stats[column]: + value = table_stats[column][row] + else: + value = "" value = convert_to_scientific(value) values.append(value) @@ -202,42 +213,6 @@ def __init__(self, table_name, schema=None, config=None, user_ns=None) -> None: self._table_html = self._table.get_html_string() self._table_txt = self._table.get_string() - def _get_n_percentile( - self, percentile, table_name, column, config, user_ns - ) -> float: - """ - Uses percentile_disc SQL query to compute the nth percentile of a - specified column in a specified table. 
- - Parameters - ---------- - n : int - The Nth percentile to comupte. Must be between 0 and 100 inclusive. - - table_name : str - Name of SQL table - - column : str - Name of the column in table - - Returns - ------- - Nth percentile of the list - """ - percentile = percentile / 100 - - percentile = sql.run.run( - Connection.current, - f""" - SELECT percentile_disc({percentile}) WITHIN GROUP (ORDER BY {column}) - as percentile, FROM {table_name} - """, - config, - user_ns, - ) - - return percentile.dict()["percentile"][0] - @telemetry.log_call() def get_table_names(schema=None): @@ -252,11 +227,11 @@ def get_columns(name, schema=None): @telemetry.log_call() -def get_table_statistics(name, schema=None, config=None, user_ns=None): +def get_table_statistics(name, schema=None, config=None): """Get table statistics for a given connection. For all data types the results will include `count`, `mean`, `std`, `min` `max`, `25`, `50` and `75` percentiles. It will also include `unique`, `top` and `freq` statistics. """ - return TableDescription(name, schema=schema, config=config, user_ns=user_ns) + return TableDescription(name, schema=schema, config=config) diff --git a/src/sql/magic_cmd.py b/src/sql/magic_cmd.py index 4471b8ddd..bb9d1a079 100644 --- a/src/sql/magic_cmd.py +++ b/src/sql/magic_cmd.py @@ -5,8 +5,7 @@ from IPython.core.magic import ( Magics, line_magic, - magics_class, - needs_local_scope, + magics_class ) from IPython.core.magic_arguments import argument, magic_arguments from IPython.core.error import UsageError @@ -34,22 +33,10 @@ def error(self, message): class SqlCmdMagic(Magics, Configurable): """%sqlcmd magic""" - displaycon = True - autolimit = None - style = "DEFAULT" - short_errors = True - displaylimit = None - autopandas = False - column_local_vars = False - feedback = False - autocommit = False - autopolars = False - - @needs_local_scope @line_magic("sqlcmd") @magic_arguments() @argument("line", default="", type=str, help="Command name") - def execute(self, line="", cell="", local_ns=None): + def execute(self, line="", cell=""): """ Command """ @@ -96,11 +83,8 @@ def execute(self, line="", cell="", local_ns=None): args = parser.parse_args(others) - user_ns = self.shell.user_ns.copy() - user_ns.update(local_ns) - report = inspect.get_table_statistics( - schema=args.schema, name=args.table, config=self, user_ns=user_ns + schema=args.schema, name=args.table, config=self.config ) if args.output: diff --git a/src/sql/run.py b/src/sql/run.py index 77969db01..7fad057ae 100644 --- a/src/sql/run.py +++ b/src/sql/run.py @@ -106,13 +106,18 @@ def __init__(self, sqlaproxy, config): self.keys = {} if sqlaproxy.returns_rows: self.keys = sqlaproxy.keys() - if config.autolimit: + if isinstance(config.autolimit, bool): list.__init__(self, sqlaproxy.fetchmany(size=config.autolimit)) else: list.__init__(self, sqlaproxy.fetchall()) self.field_names = unduplicate_field_names(self.keys) + + _style = None + if isinstance(config.style, str): + _style = prettytable.__dict__[config.style.upper()] + self.pretty = PrettyTable( - self.field_names, style=prettytable.__dict__[config.style.upper()] + self.field_names, style=_style ) else: list.__init__(self, []) @@ -347,7 +352,7 @@ def from_list(self, source_list): def fetchmany(size): pos = 0 while pos < len(source_list): - yield source_list[pos : pos + size] + yield source_list[pos: pos + size] pos += size self.fetchmany = fetchmany @@ -415,6 +420,12 @@ def run(conn, sql, config, user_namespace): return "Connected: %s" % conn.name +def run_raw(conn, 
sql, config): + result = conn.session.execute(sql) + resultset = ResultSet(result, config) + return resultset + + class PrettyTable(prettytable.PrettyTable): def __init__(self, *args, **kwargs): self.row_count = 0 diff --git a/src/sql/util.py b/src/sql/util.py index 347f302e4..f606391ca 100644 --- a/src/sql/util.py +++ b/src/sql/util.py @@ -1,6 +1,3 @@ -import numpy as np - - def convert_to_scientific(value): """ Converts value to scientific notation if necessary @@ -15,7 +12,7 @@ def convert_to_scientific(value): and not isinstance(value, bool) and _is_long_number(value) ): - new_value = np.format_float_scientific(value, exp_digits=2, precision=3) + new_value = "{:,.3e}".format(value) else: new_value = value diff --git a/src/tests/test_magic_cmd.py b/src/tests/test_magic_cmd.py index c9e3a9bd6..91ea7ce9c 100644 --- a/src/tests/test_magic_cmd.py +++ b/src/tests/test_magic_cmd.py @@ -78,7 +78,7 @@ def test_table_profile(ip, tmp_empty): ip.run_cell( """ %%sql sqlite:// - CREATE TABLE numbers (rating, price, number, word); + CREATE TABLE numbers (rating float, price float, number int, word varchar(50)); INSERT INTO numbers VALUES (14.44, 2.48, 82, 'a'); INSERT INTO numbers VALUES (13.13, 1.50, 93, 'b'); INSERT INTO numbers VALUES (12.59, 0.20, 98, 'a'); @@ -92,29 +92,20 @@ def test_table_profile(ip, tmp_empty): expected = { "count": [8, 8, 8, 6], - "mean": [12.2165, 0.6875, 88.75, float("NaN")], + "mean": [12.2165, "6.875e-01", 88.75, 0.0], "min": [10.532, 0.1, 82, "a"], "max": [14.44, 2.48, 98, "c"], - "std": [ - "1.278e+00", - "8.505e-01", - "5.092e+00", - float("NaN"), - ], - # "25%": [11.2, 0.2, 84.5, float("NaN")], - # "50%": [12.065, 0.305, 88.5, float("NaN")], - # "75%": [13.072500000000002, 1.2275, 92.25, float("NaN")], - "unique": [8, 7, 8, 5], + "unique": [8, 7, 8, 3], "freq": [1, 2, 1, 4], "top": [14.44, 0.2, 98, "a"], } - # note : We ignote Nth percentile since sqlite doesn't support `percentile_disc` - out = ip.run_cell("%sqlcmd profile -t numbers").result stats_table = out._table + assert len(stats_table.rows) == len(expected) + for row in stats_table: criteria = row.get_string(fields=[" "], border=False).strip() @@ -126,11 +117,11 @@ def test_table_profile(ip, tmp_empty): word = row.get_string(fields=["word"], border=False, header=False).strip() - if criteria in expected: - assert rating == str(expected[criteria][0]) - assert price == str(expected[criteria][1]) - assert number == str(expected[criteria][2]) - assert word == str(expected[criteria][3]) + assert criteria in expected + assert rating == str(expected[criteria][0]) + assert price == str(expected[criteria][1]) + assert number == str(expected[criteria][2]) + assert word == str(expected[criteria][3]) def test_table_schema_profile(ip, tmp_empty): From fafa53343324330b56b6d1bcda2f687f9330509d Mon Sep 17 00:00:00 2001 From: yafimvo Date: Tue, 7 Mar 2023 20:02:46 +0200 Subject: [PATCH 10/23] test fixed --- src/sql/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sql/run.py b/src/sql/run.py index 7fad057ae..ff45543f3 100644 --- a/src/sql/run.py +++ b/src/sql/run.py @@ -106,7 +106,7 @@ def __init__(self, sqlaproxy, config): self.keys = {} if sqlaproxy.returns_rows: self.keys = sqlaproxy.keys() - if isinstance(config.autolimit, bool): + if isinstance(config.autolimit, int): list.__init__(self, sqlaproxy.fetchmany(size=config.autolimit)) else: list.__init__(self, sqlaproxy.fetchall()) From 122f1067c9f8c6951d95b5778c6b823597a80e83 Mon Sep 17 00:00:00 2001 From: yafimvo Date: Wed, 8 Mar 2023 10:07:44 
+0200 Subject: [PATCH 11/23] config.autolimit check fixed --- src/sql/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sql/run.py b/src/sql/run.py index ff45543f3..462492140 100644 --- a/src/sql/run.py +++ b/src/sql/run.py @@ -106,7 +106,7 @@ def __init__(self, sqlaproxy, config): self.keys = {} if sqlaproxy.returns_rows: self.keys = sqlaproxy.keys() - if isinstance(config.autolimit, int): + if isinstance(config.autolimit, int) and config.autolimit > 0: list.__init__(self, sqlaproxy.fetchmany(size=config.autolimit)) else: list.__init__(self, sqlaproxy.fetchall()) From 83b9dd39430905d5a057acd76cfa2087a0261cf4 Mon Sep 17 00:00:00 2001 From: yafimvo Date: Wed, 8 Mar 2023 20:35:54 +0200 Subject: [PATCH 12/23] integration tests added --- src/sql/inspect.py | 137 +++++++++++------- .../integration/test_generic_db_opeations.py | 99 +++++++++++++ 2 files changed, 183 insertions(+), 53 deletions(-) diff --git a/src/sql/inspect.py b/src/sql/inspect.py index cb5ffe5cb..18e0c1e8a 100644 --- a/src/sql/inspect.py +++ b/src/sql/inspect.py @@ -111,84 +111,115 @@ def __init__(self, table_name, schema=None, config=None) -> None: ).keys table_stats = dict({}) + columns_to_include_in_report = set() for column in columns: table_stats[column] = dict() - result_col_freq_values = sql.run.run_raw( - Connection.current, - f"""SELECT {column} as top, - COUNT({column}) as frequency FROM {table_name} - GROUP BY {column} ORDER BY Count({column}) Desc""", - config, - ).dict() - - # get all non None values, min, max and avg. - result_value_values = sql.run.run_raw( - Connection.current, - f""" - SELECT MIN({column}) AS min, - MAX({column}) AS max, - COUNT(DISTINCT {column}) AS unique_count, - COUNT({column}) AS total - FROM {table_name} - WHERE {column} IS NOT NULL AND TRIM({column}) <> '' - """, - config, - ).dict() - - table_stats[column]["freq"] = result_col_freq_values["frequency"][0] - table_stats[column]["top"] = result_col_freq_values["top"][0] - table_stats[column]["count"] = result_value_values["total"][0] - table_stats[column]["unique"] = result_value_values["unique_count"][0] - table_stats[column]["min"] = result_value_values["min"][0] - table_stats[column]["max"] = result_value_values["max"][0] - - avg = None + + # index is reserved word in sqlite so we use + # brackets to make it work. + if column == "index": + _column = "[index]" + else: + _column = column + + try: + result_col_freq_values = sql.run.run_raw( + Connection.current, + f"""SELECT {_column} as top, + COUNT({_column}) as frequency FROM {table_name} + GROUP BY {_column} ORDER BY Count({_column}) Desc""", + config, + ).dict() + + table_stats[column]["freq"] = result_col_freq_values["frequency"][0] + table_stats[column]["top"] = result_col_freq_values["top"][0] + + columns_to_include_in_report.update(["freq", "top"]) + + except Exception: + pass + + try: + # get all non None values, min, max and avg. 
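+                # if this query fails for a column (e.g., TRIM on an
+                # incompatible type), the except below skips these stats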
+ result_value_values = sql.run.run_raw( + Connection.current, + f""" + SELECT MIN({_column}) AS min, + MAX({_column}) AS max, + COUNT(DISTINCT {_column}) AS unique_count, + COUNT({_column}) AS count + FROM {table_name} + WHERE {_column} IS NOT NULL AND TRIM({_column}) <> '' + """, + config, + ).dict() + + table_stats[column]["count"] = result_value_values["count"][0] + table_stats[column]["unique"] = result_value_values["unique_count"][0] + table_stats[column]["min"] = result_value_values["min"][0] + table_stats[column]["max"] = result_value_values["max"][0] + + columns_to_include_in_report.update(["count", "unique", "min", "max"]) + + except Exception: + pass + try: results_avg = sql.run.run_raw( Connection.current, f""" - SELECT AVG({column}) AS avg + SELECT AVG({_column}) AS avg FROM {table_name} - WHERE {column} IS NOT NULL AND TRIM({column}) <> '' + WHERE {_column} IS NOT NULL AND TRIM({_column}) <> '' """, config, ).dict() - avg = results_avg["avg"][0] - except BaseException: - avg = math.nan + table_stats[column]["mean"] = results_avg["avg"][0] + columns_to_include_in_report.update(["mean"]) + + except Exception: + table_stats[column]["mean"] = math.nan - table_stats[column]["mean"] = avg + # These keys are numeric and work only on duckdb + special_numeric_keys = ["std", "25%", "50%", "75%"] try: - # Note: This STDEV and PERCENTILE_DISC will work only on DuckDB + # Note: stddev_pop and PERCENTILE_DISC will work only on DuckDB result = sql.run.run_raw( Connection.current, f""" SELECT - stddev_pop({column}) as std, - percentile_disc(0.25) WITHIN GROUP (ORDER BY {column}) as p25, - percentile_disc(0.50) WITHIN GROUP (ORDER BY {column}) as p50, - percentile_disc(0.75) WITHIN GROUP (ORDER BY {column}) as p75 + stddev_pop({_column}) as key_std, + percentile_disc(0.25) WITHIN GROUP + (ORDER BY {_column}) as key_25, + percentile_disc(0.50) WITHIN GROUP + (ORDER BY {_column}) as key_50, + percentile_disc(0.75) WITHIN GROUP + (ORDER BY {_column}) as key_75 FROM {table_name} """, config, ).dict() - table_stats[column]["std"] = result["std"][0] - table_stats[column]["25%"] = result["p25"][0] - table_stats[column]["50%"] = result["p50"][0] - table_stats[column]["75%"] = result["p75"][0] + for key in special_numeric_keys: + r_key = f'key_{key.replace("%", "")}' + table_stats[column][key] = result[r_key][0] + + columns_to_include_in_report.update(special_numeric_keys) except TypeError: # for non numeric values - table_stats[column]["mean"] = math.nan - table_stats[column]["std"] = math.nan - table_stats[column]["25%"] = math.nan - table_stats[column]["50%"] = math.nan - table_stats[column]["75%"] = math.nan + for key in special_numeric_keys: + table_stats[column][key] = math.nan + + except Exception as e: + # We tried to apply numeric function on + # non numeric value, i.e: DateTime + if "duckdb.BinderException" in str(e): + for key in special_numeric_keys: + table_stats[column][key] = math.nan - except BaseException: # Failed to run sql command. # We ignore the cell stats for such case. 
pass @@ -196,8 +227,8 @@ def __init__(self, table_name, schema=None, config=None) -> None: self._table = PrettyTable() self._table.field_names = [" "] + list(table_stats.keys()) - rows = list(table_stats.items())[0][1].keys() - + rows = list(columns_to_include_in_report) + rows.sort(reverse=True) for row in rows: values = [row] for column in table_stats: diff --git a/src/tests/integration/test_generic_db_opeations.py b/src/tests/integration/test_generic_db_opeations.py index 53f37e676..e7bbdbe1c 100644 --- a/src/tests/integration/test_generic_db_opeations.py +++ b/src/tests/integration/test_generic_db_opeations.py @@ -3,6 +3,7 @@ import warnings from sql.telemetry import telemetry from unittest.mock import ANY, Mock +import math @pytest.fixture(autouse=True) @@ -144,3 +145,101 @@ def test_telemetry_execute_command_has_connection_info( }, }, ) + + +@pytest.mark.parametrize( + "ip_with_dynamic_db, table, table_columns, expected", + [ + # ("ip_with_postgreSQL", + # "taxi", + # ["index", "taxi_driver_name"], + # { + # "count": [45, 45], + # "mean": [22.0, 0.0], + # "min": [0, "Eric Ken"], + # "max": [44, "Kevin Kelly"], + # "unique": [45, 3], + # "freq": [1, 15], + # "top": [0, "Kevin Kelly"], + # } + # ), + # ("ip_with_mySQL", + # "taxi", + # ["index", "taxi_driver_name"], + # { + # "count": [45, 45], + # "mean": [22.0, 0.0], + # "min": [0, "Eric Ken"], + # "max": [44, "Kevin Kelly"], + # "unique": [45, 3], + # "freq": [1, 15], + # "top": [0, "Kevin Kelly"], + # } + # ), + # ("ip_with_mariaDB", + # "taxi", + # ["index", "taxi_driver_name"], + # { + # "count": [45, 45], + # "mean": [22.0, 0.0], + # "min": [0, "Eric Ken"], + # "max": [44, "Kevin Kelly"], + # "unique": [45, 3], + # "freq": [1, 15], + # "top": [0, "Kevin Kelly"], + # } + # ), + ("ip_with_SQLite", + "taxi", + ["index", "taxi_driver_name"], + { + "count": [45, 45], + "mean": [22.0, 0.0], + "min": [0, "Eric Ken"], + "max": [44, "Kevin Kelly"], + "unique": [45, 3], + "freq": [1, 15], + "top": [0, "Kevin Kelly"], + } + ), + ("ip_with_duckDB", + "yellow_tripdata_2021-01.parquet", + ["VendorID", "tpep_pickup_datetime", "passenger_count"], + { + "count": [1369769, 1369769, 1271417], + "mean": ["1.722e+00", math.nan, "1.412e+00"], + "min": [1, "2008-12-31 23:05:14", 0.0], + "max": [6, "2021-02-22 16:52:16", 8.0], + "unique": [3, 939020, 9], + "freq": [937141, 13, 966236], + "top": [2, "2021-01-14 13:52:00", 1.0], + + "std": ["5.925e-01", math.nan, "1.060e+00"], + "25%": [1, math.nan, 1.0], + "50%": [2, math.nan, 1.0], + "75%": [2, math.nan, 1.0], + }), + ], +) +def test_profile_query(request, ip_with_dynamic_db, table, table_columns, expected): + ip_with_dynamic_db = request.getfixturevalue(ip_with_dynamic_db) + + out = ip_with_dynamic_db.run_cell( + f""" + %sqlcmd profile --table "{table}" + """ + ).result + + stats_table = out._table + + assert len(stats_table.rows) == len(expected) + + for row in stats_table: + criteria = row.get_string(fields=[" "], border=False).strip() + + for i, column in enumerate(table_columns): + cell_value = row.get_string( + fields=[column], border=False, header=False).strip() + + assert criteria in expected + assert cell_value == str(expected[criteria][i]) From 4d0f84de5a60fe73f2f9737f8286a3eef8374845 Mon Sep 17 00:00:00 2001 From: yafimvo Date: Wed, 8 Mar 2023 21:16:14 +0200 Subject: [PATCH 13/23] integration tests fixed --- src/sql/inspect.py | 36 +++--- .../integration/test_generic_db_opeations.py | 113 +++++++++--------- 2 files changed, 72 insertions(+), 77 deletions(-) diff --git a/src/sql/inspect.py 
b/src/sql/inspect.py index 18e0c1e8a..2a67fcaec 100644 --- a/src/sql/inspect.py +++ b/src/sql/inspect.py @@ -116,19 +116,13 @@ def __init__(self, table_name, schema=None, config=None) -> None: for column in columns: table_stats[column] = dict() - # index is reserved word in sqlite so we use - # brackets to make it work. - if column == "index": - _column = "[index]" - else: - _column = column - + # Note: index is reserved word in sqlite try: result_col_freq_values = sql.run.run_raw( Connection.current, - f"""SELECT {_column} as top, - COUNT({_column}) as frequency FROM {table_name} - GROUP BY {_column} ORDER BY Count({_column}) Desc""", + f"""SELECT {column} as top, + COUNT({column}) as frequency FROM {table_name} + GROUP BY {column} ORDER BY Count({column}) Desc""", config, ).dict() @@ -145,12 +139,12 @@ def __init__(self, table_name, schema=None, config=None) -> None: result_value_values = sql.run.run_raw( Connection.current, f""" - SELECT MIN({_column}) AS min, - MAX({_column}) AS max, - COUNT(DISTINCT {_column}) AS unique_count, - COUNT({_column}) AS count + SELECT MIN({column}) AS min, + MAX({column}) AS max, + COUNT(DISTINCT {column}) AS unique_count, + COUNT({column}) AS count FROM {table_name} - WHERE {_column} IS NOT NULL AND TRIM({_column}) <> '' + WHERE {column} IS NOT NULL AND TRIM({column}) <> '' """, config, ).dict() @@ -169,9 +163,9 @@ def __init__(self, table_name, schema=None, config=None) -> None: results_avg = sql.run.run_raw( Connection.current, f""" - SELECT AVG({_column}) AS avg + SELECT AVG({column}) AS avg FROM {table_name} - WHERE {_column} IS NOT NULL AND TRIM({_column}) <> '' + WHERE {column} IS NOT NULL AND TRIM({column}) <> '' """, config, ).dict() @@ -190,13 +184,13 @@ def __init__(self, table_name, schema=None, config=None) -> None: Connection.current, f""" SELECT - stddev_pop({_column}) as key_std, + stddev_pop({column}) as key_std, percentile_disc(0.25) WITHIN GROUP - (ORDER BY {_column}) as key_25, + (ORDER BY {column}) as key_25, percentile_disc(0.50) WITHIN GROUP - (ORDER BY {_column}) as key_50, + (ORDER BY {column}) as key_50, percentile_disc(0.75) WITHIN GROUP - (ORDER BY {_column}) as key_75 + (ORDER BY {column}) as key_75 FROM {table_name} """, config, diff --git a/src/tests/integration/test_generic_db_opeations.py b/src/tests/integration/test_generic_db_opeations.py index e7bbdbe1c..e3765ce4c 100644 --- a/src/tests/integration/test_generic_db_opeations.py +++ b/src/tests/integration/test_generic_db_opeations.py @@ -150,46 +150,33 @@ def test_telemetry_execute_command_has_connection_info( @pytest.mark.parametrize( "ip_with_dynamic_db, table, table_columns, expected", [ - # ("ip_with_postgreSQL", - # "taxi", - # ["index", "taxi_driver_name"], - # { - # "count": [45, 45], - # "mean": [22.0, 0.0], - # "min": [0, "Eric Ken"], - # "max": [44, "Kevin Kelly"], - # "unique": [45, 3], - # "freq": [1, 15], - # "top": [0, "Kevin Kelly"], - # } - # ), - # ("ip_with_mySQL", - # "taxi", - # ["index", "taxi_driver_name"], - # { - # "count": [45, 45], - # "mean": [22.0, 0.0], - # "min": [0, "Eric Ken"], - # "max": [44, "Kevin Kelly"], - # "unique": [45, 3], - # "freq": [1, 15], - # "top": [0, "Kevin Kelly"], - # } - # ), - # ("ip_with_mariaDB", - # "taxi", - # ["index", "taxi_driver_name"], - # { - # "count": [45, 45], - # "mean": [22.0, 0.0], - # "min": [0, "Eric Ken"], - # "max": [44, "Kevin Kelly"], - # "unique": [45, 3], - # "freq": [1, 15], - # "top": [0, "Kevin Kelly"], - # } - # ), - ("ip_with_SQLite", + ("ip_with_postgreSQL", + "taxi", + ["index", 
"taxi_driver_name"], + { + "count": [45, 45], + "mean": [22.0, 0.0], + "min": [0, "Eric Ken"], + "max": [44, "Kevin Kelly"], + "unique": [45, 3], + "freq": [1, 15], + "top": [0, "Kevin Kelly"], + } + ), + ("ip_with_mySQL", + "taxi", + ["index", "taxi_driver_name"], + { + "count": [45, 45], + "mean": [22.0, 0.0], + "min": [0, "Eric Ken"], + "max": [44, "Kevin Kelly"], + "unique": [45, 3], + "freq": [1, 15], + "top": [0, "Kevin Kelly"], + } + ), + ("ip_with_mariaDB", "taxi", ["index", "taxi_driver_name"], { @@ -202,26 +189,40 @@ def test_telemetry_execute_command_has_connection_info( "top": [0, "Kevin Kelly"], } ), + ("ip_with_SQLite", + "taxi", + ["taxi_driver_name"], + { + "count": [45], + "mean": [0.0], + "min": ["Eric Ken"], + "max": ["Kevin Kelly"], + "unique": [3], + "freq": [15], + "top": ["Kevin Kelly"], + } + ), ("ip_with_duckDB", - "yellow_tripdata_2021-01.parquet", - ["VendorID", "tpep_pickup_datetime", "passenger_count"], + "taxi", + ["index", "taxi_driver_name"], { - "count": [1369769, 1369769, 1271417], - "mean": ["1.722e+00", math.nan, "1.412e+00"], - "min": [1, "2008-12-31 23:05:14", 0.0], - "max": [6, "2021-02-22 16:52:16", 8.0], - "unique": [3, 939020, 9], - "freq": [937141, 13, 966236], - "top": [2, "2021-01-14 13:52:00", 1.0], - - "std": ["5.925e-01", math.nan, "1.060e+00"], - "25%": [1, math.nan, 1.0], - "50%": [2, math.nan, 1.0], - "75%": [2, math.nan, 1.0], - }), + "count": [45, 45], + "mean": [22.0, math.nan], + "min": [0, "Eric Ken"], + "max": [44, "Kevin Kelly"], + "unique": [45, 3], + "freq": [1, 15], + "top": [0, "Eric Ken"], + "std": ["1.299e+01", math.nan], + "25%": [11, math.nan], + "50%": [22, math.nan], + "75%": [33, math.nan], + + } + ), ], ) -def test_profile_query(request, ip_with_dynamic_db, table, table_columns, expected): +def test_profile_query(request, ip_with_dynamic_db, table, table_columns, expected, capsys): ip_with_dynamic_db = request.getfixturevalue(ip_with_dynamic_db) out = ip_with_dynamic_db.run_cell( From 105aa3d957a43b6acef0958f16163d6ac7baf8e5 Mon Sep 17 00:00:00 2001 From: yafimvo Date: Wed, 8 Mar 2023 21:17:28 +0200 Subject: [PATCH 14/23] lint --- src/tests/integration/test_generic_db_opeations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/integration/test_generic_db_opeations.py b/src/tests/integration/test_generic_db_opeations.py index e3765ce4c..3ff399663 100644 --- a/src/tests/integration/test_generic_db_opeations.py +++ b/src/tests/integration/test_generic_db_opeations.py @@ -222,7 +222,7 @@ def test_telemetry_execute_command_has_connection_info( ), ], ) -def test_profile_query(request, ip_with_dynamic_db, table, table_columns, expected, capsys): +def test_profile_query(request, ip_with_dynamic_db, table, table_columns, expected): ip_with_dynamic_db = request.getfixturevalue(ip_with_dynamic_db) out = ip_with_dynamic_db.run_cell( From 8e4aac33cbebed59de6f15fb84419613998812ac Mon Sep 17 00:00:00 2001 From: yafimvo Date: Wed, 8 Mar 2023 21:26:20 +0200 Subject: [PATCH 15/23] index removed from integration tests --- .../integration/test_generic_db_opeations.py | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/src/tests/integration/test_generic_db_opeations.py b/src/tests/integration/test_generic_db_opeations.py index 3ff399663..2c6bd3af1 100644 --- a/src/tests/integration/test_generic_db_opeations.py +++ b/src/tests/integration/test_generic_db_opeations.py @@ -152,41 +152,41 @@ def test_telemetry_execute_command_has_connection_info( [ ("ip_with_postgreSQL", "taxi", - 
["index", "taxi_driver_name"], + ["taxi_driver_name"], { - "count": [45, 45], - "mean": [22.0, 0.0], - "min": [0, "Eric Ken"], - "max": [44, "Kevin Kelly"], - "unique": [45, 3], - "freq": [1, 15], - "top": [0, "Kevin Kelly"], + "count": [45], + "mean": [0.0], + "min": ["Eric Ken"], + "max": ["Kevin Kelly"], + "unique": [3], + "freq": [15], + "top": ["Kevin Kelly"], } ), ("ip_with_mySQL", "taxi", - ["index", "taxi_driver_name"], + ["taxi_driver_name"], { - "count": [45, 45], - "mean": [22.0, 0.0], - "min": [0, "Eric Ken"], - "max": [44, "Kevin Kelly"], - "unique": [45, 3], - "freq": [1, 15], - "top": [0, "Kevin Kelly"], + "count": [45], + "mean": [0.0], + "min": ["Eric Ken"], + "max": ["Kevin Kelly"], + "unique": [3], + "freq": [15], + "top": ["Kevin Kelly"], } ), ("ip_with_mariaDB", "taxi", - ["index", "taxi_driver_name"], + ["taxi_driver_name"], { - "count": [45, 45], - "mean": [22.0, 0.0], - "min": [0, "Eric Ken"], - "max": [44, "Kevin Kelly"], - "unique": [45, 3], - "freq": [1, 15], - "top": [0, "Kevin Kelly"], + "count": [45], + "mean": [0.0], + "min": ["Eric Ken"], + "max": ["Kevin Kelly"], + "unique": [3], + "freq": [15], + "top": ["Kevin Kelly"], } ), ("ip_with_SQLite", From 829352d2814397c9842ab5b3d785b8bcfed0b054 Mon Sep 17 00:00:00 2001 From: yafimvo Date: Wed, 8 Mar 2023 21:39:07 +0200 Subject: [PATCH 16/23] postgres, mysql and maria excluded from profile test --- .../integration/test_generic_db_opeations.py | 85 ++++++++++--------- 1 file changed, 46 insertions(+), 39 deletions(-) diff --git a/src/tests/integration/test_generic_db_opeations.py b/src/tests/integration/test_generic_db_opeations.py index 2c6bd3af1..4d01466b1 100644 --- a/src/tests/integration/test_generic_db_opeations.py +++ b/src/tests/integration/test_generic_db_opeations.py @@ -150,45 +150,52 @@ def test_telemetry_execute_command_has_connection_info( @pytest.mark.parametrize( "ip_with_dynamic_db, table, table_columns, expected", [ - ("ip_with_postgreSQL", - "taxi", - ["taxi_driver_name"], - { - "count": [45], - "mean": [0.0], - "min": ["Eric Ken"], - "max": ["Kevin Kelly"], - "unique": [3], - "freq": [15], - "top": ["Kevin Kelly"], - } - ), - ("ip_with_mySQL", - "taxi", - ["taxi_driver_name"], - { - "count": [45], - "mean": [0.0], - "min": ["Eric Ken"], - "max": ["Kevin Kelly"], - "unique": [3], - "freq": [15], - "top": ["Kevin Kelly"], - } - ), - ("ip_with_mariaDB", - "taxi", - ["taxi_driver_name"], - { - "count": [45], - "mean": [0.0], - "min": ["Eric Ken"], - "max": ["Kevin Kelly"], - "unique": [3], - "freq": [15], - "top": ["Kevin Kelly"], - } - ), + pytest.param("ip_with_postgreSQL", + "taxi", + ["taxi_driver_name"], + { + "count": [45], + "mean": [0.0], + "min": ["Eric Ken"], + "max": ["Kevin Kelly"], + "unique": [3], + "freq": [15], + "top": ["Kevin Kelly"], + }, + marks=pytest.mark.xfail( + reason="Need to parse results"), + ), + + pytest.param("ip_with_mySQL", + "taxi", + ["taxi_driver_name"], + { + "count": [45], + "mean": [0.0], + "min": ["Eric Ken"], + "max": ["Kevin Kelly"], + "unique": [3], + "freq": [15], + "top": ["Kevin Kelly"], + }, + marks=pytest.mark.xfail( + reason="Need to get column names from table with a different query"), + ), + pytest.param("ip_with_mariaDB", + "taxi", + ["taxi_driver_name"], + { + "count": [45], + "mean": [0.0], + "min": ["Eric Ken"], + "max": ["Kevin Kelly"], + "unique": [3], + "freq": [15], + "top": ["Kevin Kelly"], + }, + marks=pytest.mark.xfail( + reason="Need to get column names from table with a different query"), + ), ("ip_with_SQLite", "taxi", 
["taxi_driver_name"], From 823cc61f57a41274bb94a56839593037b9e6cc1c Mon Sep 17 00:00:00 2001 From: yafimvo Date: Wed, 8 Mar 2023 21:39:48 +0200 Subject: [PATCH 17/23] lint --- .../integration/test_generic_db_opeations.py | 161 +++++++++--------- 1 file changed, 83 insertions(+), 78 deletions(-) diff --git a/src/tests/integration/test_generic_db_opeations.py b/src/tests/integration/test_generic_db_opeations.py index 4d01466b1..ebc63d1db 100644 --- a/src/tests/integration/test_generic_db_opeations.py +++ b/src/tests/integration/test_generic_db_opeations.py @@ -150,83 +150,87 @@ def test_telemetry_execute_command_has_connection_info( @pytest.mark.parametrize( "ip_with_dynamic_db, table, table_columns, expected", [ - pytest.param("ip_with_postgreSQL", - "taxi", - ["taxi_driver_name"], - { - "count": [45], - "mean": [0.0], - "min": ["Eric Ken"], - "max": ["Kevin Kelly"], - "unique": [3], - "freq": [15], - "top": ["Kevin Kelly"], - }, - marks=pytest.mark.xfail( - reason="Need to parse results"), - ), - - pytest.param("ip_with_mySQL", - "taxi", - ["taxi_driver_name"], - { - "count": [45], - "mean": [0.0], - "min": ["Eric Ken"], - "max": ["Kevin Kelly"], - "unique": [3], - "freq": [15], - "top": ["Kevin Kelly"], - }, - marks=pytest.mark.xfail( - reason="Need to get column names from table with a different query"), - ), - pytest.param("ip_with_mariaDB", - "taxi", - ["taxi_driver_name"], - { - "count": [45], - "mean": [0.0], - "min": ["Eric Ken"], - "max": ["Kevin Kelly"], - "unique": [3], - "freq": [15], - "top": ["Kevin Kelly"], - }, - marks=pytest.mark.xfail( - reason="Need to get column names from table with a different query"), - ), - ("ip_with_SQLite", - "taxi", - ["taxi_driver_name"], - { - "count": [45], - "mean": [0.0], - "min": ["Eric Ken"], - "max": ["Kevin Kelly"], - "unique": [3], - "freq": [15], - "top": ["Kevin Kelly"], - } - ), - ("ip_with_duckDB", - "taxi", - ["index", "taxi_driver_name"], - { - "count": [45, 45], - "mean": [22.0, math.nan], - "min": [0, "Eric Ken"], - "max": [44, "Kevin Kelly"], - "unique": [45, 3], - "freq": [1, 15], - "top": [0, "Eric Ken"], - "std": ["1.299e+01", math.nan], - "25%": [11, math.nan], - "50%": [22, math.nan], - "75%": [33, math.nan], - - } - ), + pytest.param( + "ip_with_postgreSQL", + "taxi", + ["taxi_driver_name"], + { + "count": [45], + "mean": [0.0], + "min": ["Eric Ken"], + "max": ["Kevin Kelly"], + "unique": [3], + "freq": [15], + "top": ["Kevin Kelly"], + }, + marks=pytest.mark.xfail(reason="Need to parse results"), + ), + pytest.param( + "ip_with_mySQL", + "taxi", + ["taxi_driver_name"], + { + "count": [45], + "mean": [0.0], + "min": ["Eric Ken"], + "max": ["Kevin Kelly"], + "unique": [3], + "freq": [15], + "top": ["Kevin Kelly"], + }, + marks=pytest.mark.xfail( + reason="Need to get column names from table with a different query" + ), + ), + pytest.param( + "ip_with_mariaDB", + "taxi", + ["taxi_driver_name"], + { + "count": [45], + "mean": [0.0], + "min": ["Eric Ken"], + "max": ["Kevin Kelly"], + "unique": [3], + "freq": [15], + "top": ["Kevin Kelly"], + }, + marks=pytest.mark.xfail( + reason="Need to get column names from table with a different query" + ), + ), + ( + "ip_with_SQLite", + "taxi", + ["taxi_driver_name"], + { + "count": [45], + "mean": [0.0], + "min": ["Eric Ken"], + "max": ["Kevin Kelly"], + "unique": [3], + "freq": [15], + "top": ["Kevin Kelly"], + }, + ), + ( + "ip_with_duckDB", + "taxi", + ["index", "taxi_driver_name"], + { + "count": [45, 45], + "mean": [22.0, math.nan], + "min": [0, "Eric Ken"], + "max": [44, "Kevin 
Kelly"], + "unique": [45, 3], + "freq": [1, 15], + "top": [0, "Eric Ken"], + "std": ["1.299e+01", math.nan], + "25%": [11, math.nan], + "50%": [22, math.nan], + "75%": [33, math.nan], + }, + ), ], ) def test_profile_query(request, ip_with_dynamic_db, table, table_columns, expected): @@ -247,7 +251,8 @@ def test_profile_query(request, ip_with_dynamic_db, table, table_columns, expect for i, column in enumerate(table_columns): cell_value = row.get_string( - fields=[column], border=False, header=False).strip() + fields=[column], border=False, header=False + ).strip() assert criteria in expected assert cell_value == str(expected[criteria][i]) From 29492a12cd66d5d91b130c8a52a615a8792beca8 Mon Sep 17 00:00:00 2001 From: yafimvo Date: Thu, 9 Mar 2023 10:49:34 +0200 Subject: [PATCH 18/23] postgresql fixed --- src/sql/inspect.py | 12 +++---- .../integration/test_generic_db_opeations.py | 33 ++++++++++--------- src/tests/test_magic_cmd.py | 6 ++-- 3 files changed, 27 insertions(+), 24 deletions(-) diff --git a/src/sql/inspect.py b/src/sql/inspect.py index 2a67fcaec..d2383aa83 100644 --- a/src/sql/inspect.py +++ b/src/sql/inspect.py @@ -120,7 +120,7 @@ def __init__(self, table_name, schema=None, config=None) -> None: try: result_col_freq_values = sql.run.run_raw( Connection.current, - f"""SELECT {column} as top, + f"""SELECT DISTINCT {column} as top, COUNT({column}) as frequency FROM {table_name} GROUP BY {column} ORDER BY Count({column}) Desc""", config, @@ -144,7 +144,7 @@ def __init__(self, table_name, schema=None, config=None) -> None: COUNT(DISTINCT {column}) AS unique_count, COUNT({column}) AS count FROM {table_name} - WHERE {column} IS NOT NULL AND TRIM({column}) <> '' + WHERE {column} IS NOT NULL """, config, ).dict() @@ -165,11 +165,11 @@ def __init__(self, table_name, schema=None, config=None) -> None: f""" SELECT AVG({column}) AS avg FROM {table_name} - WHERE {column} IS NOT NULL AND TRIM({column}) <> '' + WHERE {column} IS NOT NULL """, config, ).dict() - table_stats[column]["mean"] = results_avg["avg"][0] + table_stats[column]["mean"] = float(results_avg["avg"][0]) columns_to_include_in_report.update(["mean"]) except Exception: @@ -198,7 +198,7 @@ def __init__(self, table_name, schema=None, config=None) -> None: for key in special_numeric_keys: r_key = f'key_{key.replace("%", "")}' - table_stats[column][key] = result[r_key][0] + table_stats[column][key] = float(result[r_key][0]) columns_to_include_in_report.update(special_numeric_keys) @@ -214,7 +214,7 @@ def __init__(self, table_name, schema=None, config=None) -> None: for key in special_numeric_keys: table_stats[column][key] = math.nan - # Failed to run sql command. + # Failed to run sql command/func (e.g stddev_pop). # We ignore the cell stats for such case. 
pass

diff --git a/src/tests/integration/test_generic_db_opeations.py b/src/tests/integration/test_generic_db_opeations.py
index ebc63d1db..019e4395a 100644
--- a/src/tests/integration/test_generic_db_opeations.py
+++ b/src/tests/integration/test_generic_db_opeations.py
@@ -150,20 +150,23 @@ def test_telemetry_execute_command_has_connection_info(
 @pytest.mark.parametrize(
     "ip_with_dynamic_db, table, table_columns, expected",
     [
-        pytest.param(
+        (
             "ip_with_postgreSQL",
             "taxi",
-            ["taxi_driver_name"],
+            ["index", "taxi_driver_name"],
             {
-                "count": [45],
-                "mean": [0.0],
-                "min": ["Eric Ken"],
-                "max": ["Kevin Kelly"],
-                "unique": [3],
-                "freq": [15],
-                "top": ["Kevin Kelly"],
+                "count": [45, 45],
+                "mean": [22.0, math.nan],
+                "min": [0, "Eric Ken"],
+                "max": [44, "Kevin Kelly"],
+                "unique": [45, 3],
+                "freq": [1, 15],
+                "top": [0, "Eric Ken"],
+                "std": ["1.299e+01", ""],
+                "25%": [11.0, ""],
+                "50%": [22.0, ""],
+                "75%": [33.0, ""],
             },
-            marks=pytest.mark.xfail(reason="Need to parse results"),
         ),
         pytest.param(
             "ip_with_mySQL",
             "taxi",
             ["taxi_driver_name"],
             {
                 "count": [45],
                 "mean": [0.0],
                 "min": ["Eric Ken"],
                 "max": ["Kevin Kelly"],
                 "unique": [3],
                 "freq": [15],
                 "top": ["Kevin Kelly"],
             },
             marks=pytest.mark.xfail(
-                reason="Need to get column names from table with a different query"
+                reason="Need to get column names with a different query"
             ),
         ),
         pytest.param(
             "ip_with_mariaDB",
             "taxi",
             ["taxi_driver_name"],
             {
                 "count": [45],
                 "mean": [0.0],
                 "min": ["Eric Ken"],
                 "max": ["Kevin Kelly"],
                 "unique": [3],
                 "freq": [15],
                 "top": ["Kevin Kelly"],
             },
             marks=pytest.mark.xfail(
-                reason="Need to get column names from table with a different query"
+                reason="Need to get column names with a different query"
             ),
         ),
         (
             "ip_with_SQLite",
             "taxi",
             ["taxi_driver_name"],
             {
                 "count": [45],
                 "mean": [0.0],
                 "min": ["Eric Ken"],
                 "max": ["Kevin Kelly"],
                 "unique": [3],
                 "freq": [15],
                 "top": ["Kevin Kelly"],
             },
         ),
         (
             "ip_with_duckDB",
             "taxi",
             ["index", "taxi_driver_name"],
             {
                 "count": [45, 45],
                 "mean": [22.0, math.nan],
                 "min": [0, "Eric Ken"],
                 "max": [44, "Kevin Kelly"],
                 "unique": [45, 3],
                 "freq": [1, 15],
                 "top": [0, "Eric Ken"],
                 "std": ["1.299e+01", math.nan],
-                "25%": [11, math.nan],
-                "50%": [22, math.nan],
-                "75%": [33, math.nan],
+                "25%": [11.0, math.nan],
+                "50%": [22.0, math.nan],
+                "75%": [33.0, math.nan],
             },
         ),
     ],
diff --git a/src/tests/test_magic_cmd.py b/src/tests/test_magic_cmd.py
index 91ea7ce9c..20242df4c 100644
--- a/src/tests/test_magic_cmd.py
+++ b/src/tests/test_magic_cmd.py
@@ -91,11 +91,11 @@ def test_table_profile(ip, tmp_empty):
     )

     expected = {
-        "count": [8, 8, 8, 6],
+        "count": [8, 8, 8, 8],
         "mean": [12.2165, "6.875e-01", 88.75, 0.0],
-        "min": [10.532, 0.1, 82, "a"],
+        "min": [10.532, 0.1, 82, ""],
         "max": [14.44, 2.48, 98, "c"],
-        "unique": [8, 7, 8, 3],
+        "unique": [8, 7, 8, 5],
         "freq": [1, 2, 1, 4],
         "top": [14.44, 0.2, 98, "a"],
     }

From abeb44af9bd65702b2baa922109fc186891bffd5 Mon Sep 17 00:00:00 2001
From: yafimvo
Date: Thu, 9 Mar 2023 11:13:15 +0200
Subject: [PATCH 19/23] postgresql nan values fixed

---
 src/sql/inspect.py                                 |  2 +-
 src/tests/integration/test_generic_db_opeations.py | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/sql/inspect.py b/src/sql/inspect.py
index d2383aa83..d1d5c4807 100644
--- a/src/sql/inspect.py
+++ b/src/sql/inspect.py
@@ -210,7 +210,7 @@ def __init__(self, table_name, schema=None, config=None) -> None:
         except Exception as e:
             # We tried to apply numeric function on
             # non numeric value, i.e: DateTime
-            if "duckdb.BinderException" in str(e):
+            if "duckdb.BinderException" in str(e) or "add explicit type casts" in str(e):
                 for key in special_numeric_keys:
                     table_stats[column][key] = math.nan

diff --git a/src/tests/integration/test_generic_db_opeations.py b/src/tests/integration/test_generic_db_opeations.py
index 019e4395a..0f5908705 100644
--- a/src/tests/integration/test_generic_db_opeations.py
+++ b/src/tests/integration/test_generic_db_opeations.py
@@ -162,10 +162,10 @@ def
test_telemetry_execute_command_has_connection_info( "unique": [45, 3], "freq": [1, 15], "top": [0, "Eric Ken"], - "std": ["1.299e+01", ""], - "25%": [11.0, ""], - "50%": [22.0, ""], - "75%": [33.0, ""], + "std": ["1.299e+01", math.nan], + "25%": [11.0, math.nan], + "50%": [22.0, math.nan], + "75%": [33.0, math.nan], }, ), pytest.param( @@ -182,7 +182,7 @@ def test_telemetry_execute_command_has_connection_info( "top": ["Kevin Kelly"], }, marks=pytest.mark.xfail( - reason="Need to get column names with a different query" + reason="Need to get column names from table with a different query" ), ), pytest.param( @@ -199,7 +199,7 @@ def test_telemetry_execute_command_has_connection_info( "top": ["Kevin Kelly"], }, marks=pytest.mark.xfail( - reason="Need to get column names with a different query" + reason="Need to get column names from table with a different query" ), ), ( From 606b9bb5368d0ce436640854f0850d80f9cd09ce Mon Sep 17 00:00:00 2001 From: yafimvo Date: Tue, 14 Mar 2023 15:37:52 +0200 Subject: [PATCH 20/23] rebase --- .github/workflows/ci.yaml | 32 +++++++++++++++++++++++++++ CHANGELOG.md | 7 +++++- doc/community/vs.md | 12 ++++++++-- doc/integrations/duckdb.md | 45 ++++++++++++++++++++++++++++++++++++++ setup.py | 2 +- src/sql/__init__.py | 2 +- src/sql/command.py | 1 + src/sql/connection.py | 35 +++++++++++++++++++---------- src/sql/magic.py | 2 +- src/sql/run.py | 1 + src/tests/test_magic.py | 13 +++++++++++ tasks.py | 2 +- 12 files changed, 135 insertions(+), 19 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 0e3adec95..99e07b0e3 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -43,6 +43,38 @@ jobs: run: | pytest --durations-min=5 --ignore=src/tests/integration + test-sqlalchemy-v1: + strategy: + matrix: + python-version: ['3.11'] + os: [ubuntu-latest, macos-latest, windows-latest] + + runs-on: ${{ matrix.os }} + + steps: + - uses: actions/checkout@v2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Lint with flake8 + run: | + python -m pip install --upgrade pip + # run flake8 on .py files + pip install flake8 + flake8 + # run flake8 on notebooks (.ipynb, .md, etc) + pip install jupytext nbqa + nbqa flake8 . 
+      - name: Install dependencies
+        run: |
+          pip install "sqlalchemy<2"
+          pip install ".[dev]"
+      - name: Test with pytest
+        run: |
+          pytest --durations-min=5 --ignore=src/tests/integration

   # run: pkgmt check
   check:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index cee954d8c..95526129e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,13 @@
 # CHANGELOG

-## 0.6.4dev
+## 0.6.5dev

 * [Feature] Adds `%sqlcmd profile` (#66)

+## 0.6.4 (2023-03-12)
+
+* [Fix] Adds support for SQLAlchemy 2.0
+* [Doc] Summary section on jupysql vs ipython-sql
+
 ## 0.6.3 (2023-03-06)

 * [Fix] Displaying variable substitution warning only when the variable to expand exists in the user's namespace
diff --git a/doc/community/vs.md b/doc/community/vs.md
index 808774a4d..759dd38e0 100644
--- a/doc/community/vs.md
+++ b/doc/community/vs.md
@@ -4,7 +4,15 @@ JupySQL is an actively maintained fork of [ipython-sql](https://github.com/cathe

 ## Incompatibilities

-If you're migrating from `ipython-sql` to JupySQL, these are the differences (it most cases, no code changes are needed):
+If you're migrating from `ipython-sql` to JupySQL, these are the differences (in most cases, no code changes are needed):

 - Since `0.6` JupySQL no longer supports old versions of IPython
-- Variable expansion is being replaced from `{variable}`, `${variable}` to `{{variable}}`
\ No newline at end of file
+- Variable expansion is being replaced from `{variable}`, `${variable}` to `{{variable}}`
+
+## New features
+
+- [Plotting](../plot) module that allows you to efficiently plot massive datasets without running out of memory.
+- JupySQL allows you to break queries into multiple cells with the help of CTEs. [Click here](../compose) to learn more.
+- Using `%sqlcmd tables` and `%sqlcmd columns --table/-t`, users can quickly explore tables in the database and the columns each table has. [Click here](../user-guide/tables-columns) to learn more.
+- [Polars Integration](../integrations/polars) to convert query results to `polars.DataFrame`. `%config SqlMagic.autopolars` can be used to automatically return Polars DataFrames instead of regular result sets.
+- Integration tests with PostgreSQL, MariaDB, MySQL, SQLite and DuckDB.
\ No newline at end of file
diff --git a/doc/integrations/duckdb.md b/doc/integrations/duckdb.md
index 78da10885..f310345fd 100644
--- a/doc/integrations/duckdb.md
+++ b/doc/integrations/duckdb.md
@@ -272,3 +272,48 @@ some_engine = create_engine(
 %sql some_engine
 ```

+## Listing Tables
+
+This section demonstrates how to list tables from both the `.csv` and `.parquet` files introduced in the previous sections.
+
+### Listing tables from a `.csv` file
+
+The data from the `.csv` file must first be registered as a table before it can be listed.
+
+```{code-cell} ipython3
+%%sql
+CREATE TABLE penguins AS SELECT * FROM penguins.csv
+```
+
+With the table registered, we can now list it:
+
+```{code-cell} ipython3
+%sqlcmd tables
+```
+
+### Listing tables from a `.parquet` file
+
+Likewise, to list the data from a `.parquet` file, the data must first be registered as a table; before registering, you can query the file directly, as sketched below.
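+
+The check below is a minimal sketch: it assumes the `yellow_tripdata_2021-01.parquet` file from the previous sections is in the working directory, and relies on DuckDB being able to scan `.parquet` files directly, without registering them first.
+
+```{code-cell} ipython3
+%%sql
+SELECT COUNT(*) FROM "yellow_tripdata_2021-01.parquet"
+```
+
+The file itself will not show up in `%sqlcmd tables` until it is registered, which the next cell does: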
+
+```{code-cell} ipython3
+%%sql
+CREATE TABLE tripdata AS SELECT * FROM "yellow_tripdata_2021-01.parquet"
+```
+
+The data can now be listed as a table:
+
+```{code-cell} ipython3
+%sqlcmd tables
+```
+
+## Listing Columns
+
+After registering the data from the `.csv` or `.parquet` files as tables, their respective columns can be listed with the following code:
+
+```{code-cell} ipython3
+%sqlcmd columns -t penguins
+```
+
+```{code-cell} ipython3
+%sqlcmd columns -t tripdata
+```
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 027499004..60ba9fdfb 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@
 install_requires = [
     "prettytable",
     "ipython>=1.0",
-    "sqlalchemy>=0.6.7,<2.0",
+    "sqlalchemy",
     "sqlparse",
     "ipython-genutils>=0.1.0",
     "jinja2",
diff --git a/src/sql/__init__.py b/src/sql/__init__.py
index 478cfa53c..0923e9e5c 100644
--- a/src/sql/__init__.py
+++ b/src/sql/__init__.py
@@ -1,6 +1,6 @@
 from .magic import RenderMagic, SqlMagic, load_ipython_extension

-__version__ = "0.6.4dev"
+__version__ = "0.6.5dev"


 __all__ = [
diff --git a/src/sql/command.py b/src/sql/command.py
index 6fb840ba5..ec7d7990f 100644
--- a/src/sql/command.py
+++ b/src/sql/command.py
@@ -90,6 +90,7 @@ def result_var(self):

     def _var_expand(self, sql, user_ns, magic):
         sql = Template(sql).render(user_ns)
+
         parsed_sql = magic.shell.var_expand(sql, depth=2)

         has_SQLAlchemy_var_expand = ":" in sql and any(
diff --git a/src/sql/connection.py b/src/sql/connection.py
index 516d79fd9..ea340f06d 100644
--- a/src/sql/connection.py
+++ b/src/sql/connection.py
@@ -11,6 +11,7 @@
     "For technical support: https://ploomber.io/community"
     "\nDocumentation: https://jupysql.ploomber.io/en/latest/connecting.html"
 )
+IS_SQLALCHEMY_ONE = int(sqlalchemy.__version__.split(".")[0]) == 1

 # Check Full List: https://docs.sqlalchemy.org/en/20/dialects
 MISSING_PACKAGE_LIST_EXCEPT_MATCHERS = {
@@ -193,11 +194,23 @@ def _error_module_not_found(cls, e):
         return ModuleNotFoundError("test")

     def __init__(self, engine, alias=None):
-        self.dialect = engine.url.get_dialect()
-        self.metadata = sqlalchemy.MetaData(bind=engine)
+        self.url = engine.url
         self.name = self.assign_name(engine)
+        self.dialect = self.url.get_dialect()
         self.session = engine.connect()
-        self.connections[alias or repr(self.metadata.bind.url)] = self
+
+        if IS_SQLALCHEMY_ONE:
+            self.metadata = sqlalchemy.MetaData(bind=engine)
+
+        self.connections[
+            alias
+            or (
+                repr(sqlalchemy.MetaData(bind=engine).bind.url)
+                if IS_SQLALCHEMY_ONE
+                else repr(engine.url)
+            )
+        ] = self
+
         self.connect_args = None
         self.alias = alias
         Connection.current = self
@@ -298,7 +311,7 @@ def connection_list(cls):
         result = []
         for key in sorted(cls.connections):
             conn = cls.connections[key]
-            engine_url = conn.metadata.bind.url  # type: sqlalchemy.engine.url.URL
+            engine_url = conn.metadata.bind.url if IS_SQLALCHEMY_ONE else conn.url

             prefix = "* " if conn == cls.current else "  "

@@ -312,7 +325,7 @@ def connection_list(cls):
         return "\n".join(result)

     @classmethod
-    def _close(cls, descriptor):
+    def close(cls, descriptor):
         if isinstance(descriptor, Connection):
             conn = descriptor
         else:
@@ -328,12 +341,10 @@ def _close(cls, descriptor):
         if descriptor in cls.connections:
             cls.connections.pop(descriptor)
         else:
-            cls.connections.pop(str(conn.metadata.bind.url))
-
-        conn.session.close()
-
-    def close(self):
-        self.__class__._close(self)
+            cls.connections.pop(
+                str(conn.metadata.bind.url) if IS_SQLALCHEMY_ONE else str(conn.url)
+            )
+            conn.session.close()
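+            # The IS_SQLALCHEMY_ONE branches above exist because SQLAlchemy 2.x
+            # removed MetaData(bind=...): under 2.x the connection key and URL
+            # come straight from engine.url rather than from a bound MetaData
+            # object.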
@classmethod def _get_curr_connection_info(cls): @@ -341,7 +352,7 @@ def _get_curr_connection_info(cls): if not cls.current: return None - engine = cls.current.metadata.bind + engine = cls.current.metadata.bind if IS_SQLALCHEMY_ONE else cls.current return { "dialect": getattr(engine.dialect, "name", None), "driver": getattr(engine.dialect, "driver", None), diff --git a/src/sql/magic.py b/src/sql/magic.py index c15cdcd8e..5c8e50381 100644 --- a/src/sql/magic.py +++ b/src/sql/magic.py @@ -266,7 +266,7 @@ def _execute(self, payload, line, cell, local_ns): if args.connections: return sql.connection.Connection.connections elif args.close: - return sql.connection.Connection._close(args.close) + return sql.connection.Connection.close(args.close) connect_arg = command.connection diff --git a/src/sql/run.py b/src/sql/run.py index 462492140..bec6b06e5 100644 --- a/src/sql/run.py +++ b/src/sql/run.py @@ -408,6 +408,7 @@ def run(conn, sql, config, user_namespace): _commit(conn=conn, config=config) if result and config.feedback: print(interpret_rowcount(result.rowcount)) + resultset = ResultSet(result, config) if config.autopandas: return resultset.DataFrame() diff --git a/src/tests/test_magic.py b/src/tests/test_magic.py index 8b0086c42..b52e5f21e 100644 --- a/src/tests/test_magic.py +++ b/src/tests/test_magic.py @@ -590,3 +590,16 @@ def test_jupysql_alias(): "line": {"jupysql": "execute", "sql": "execute"}, "cell": {"jupysql": "execute", "sql": "execute"}, } + + +@pytest.mark.xfail(reason="will be fixed once we deprecate the $name parametrization") +def test_columns_with_dollar_sign(ip_empty): + ip_empty.run_cell("%sql sqlite://") + result = ip_empty.run_cell( + """ + %sql SELECT $2 FROM (VALUES (1, 'one'), (2, 'two'), (3, 'three'))""" + ) + + html = result.result._repr_html_() + + assert "$2" in html diff --git a/tasks.py b/tasks.py index a84a784c1..8cdea7e48 100644 --- a/tasks.py +++ b/tasks.py @@ -32,7 +32,7 @@ def setup(c, version=None, doc=False): @task(aliases=["d"]) def doc(c): - with c.cd('doc'): + with c.cd("doc"): c.run( "python3 -m sphinx -T -E -W --keep-going -b html \ -d _build/doctrees -D language=en . _build/html" From ea81d9e39caabd0c1df246a0c1f07567c667efc2 Mon Sep 17 00:00:00 2001 From: yafimvo Date: Tue, 14 Mar 2023 16:32:46 +0200 Subject: [PATCH 21/23] naming changed --- src/sql/inspect.py | 10 +++++----- src/sql/run.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/sql/inspect.py b/src/sql/inspect.py index d1d5c4807..59c2f39d7 100644 --- a/src/sql/inspect.py +++ b/src/sql/inspect.py @@ -106,7 +106,7 @@ def __init__(self, table_name, schema=None, config=None) -> None: if schema: table_name = f"{schema}.{table_name}" - columns = sql.run.run_raw( + columns = sql.run.raw_run( Connection.current, f"SELECT * FROM {table_name} WHERE 1=0", config ).keys @@ -118,7 +118,7 @@ def __init__(self, table_name, schema=None, config=None) -> None: # Note: index is reserved word in sqlite try: - result_col_freq_values = sql.run.run_raw( + result_col_freq_values = sql.run.raw_run( Connection.current, f"""SELECT DISTINCT {column} as top, COUNT({column}) as frequency FROM {table_name} @@ -136,7 +136,7 @@ def __init__(self, table_name, schema=None, config=None) -> None: try: # get all non None values, min, max and avg. 
- result_value_values = sql.run.run_raw( + result_value_values = sql.run.raw_run( Connection.current, f""" SELECT MIN({column}) AS min, @@ -160,7 +160,7 @@ def __init__(self, table_name, schema=None, config=None) -> None: pass try: - results_avg = sql.run.run_raw( + results_avg = sql.run.raw_run( Connection.current, f""" SELECT AVG({column}) AS avg @@ -180,7 +180,7 @@ def __init__(self, table_name, schema=None, config=None) -> None: try: # Note: stddev_pop and PERCENTILE_DISC will work only on DuckDB - result = sql.run.run_raw( + result = sql.run.raw_run( Connection.current, f""" SELECT diff --git a/src/sql/run.py b/src/sql/run.py index bec6b06e5..a655f3d56 100644 --- a/src/sql/run.py +++ b/src/sql/run.py @@ -421,7 +421,7 @@ def run(conn, sql, config, user_namespace): return "Connected: %s" % conn.name -def run_raw(conn, sql, config): +def raw_run(conn, sql, config): result = conn.session.execute(sql) resultset = ResultSet(result, config) return resultset From 1f5bea07ccd019a0201331205b8058dc6a7078c0 Mon Sep 17 00:00:00 2001 From: yafimvo Date: Thu, 16 Mar 2023 17:00:15 +0200 Subject: [PATCH 22/23] sqlalchemy downgraded to 1 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 60ba9fdfb..646bf009e 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ install_requires = [ "prettytable", "ipython>=1.0", - "sqlalchemy", + "sqlalchemy<2", "sqlparse", "ipython-genutils>=0.1.0", "jinja2", From a0398f18505540f61cc85d99cdb41247d61a65ed Mon Sep 17 00:00:00 2001 From: yafimvo Date: Sun, 19 Mar 2023 12:47:07 +0200 Subject: [PATCH 23/23] config removed from raw_run --- src/sql/inspect.py | 51 +++++++++++++++++++++----------------------- src/sql/magic_cmd.py | 2 +- src/sql/run.py | 6 ++---- 3 files changed, 27 insertions(+), 32 deletions(-) diff --git a/src/sql/inspect.py b/src/sql/inspect.py index 59c2f39d7..a8970ae22 100644 --- a/src/sql/inspect.py +++ b/src/sql/inspect.py @@ -102,13 +102,13 @@ class TableDescription(DatabaseInspection): """ - def __init__(self, table_name, schema=None, config=None) -> None: + def __init__(self, table_name, schema=None) -> None: if schema: table_name = f"{schema}.{table_name}" columns = sql.run.raw_run( - Connection.current, f"SELECT * FROM {table_name} WHERE 1=0", config - ).keys + Connection.current, f"SELECT * FROM {table_name} WHERE 1=0" + ).keys() table_stats = dict({}) columns_to_include_in_report = set() @@ -122,12 +122,11 @@ def __init__(self, table_name, schema=None, config=None) -> None: Connection.current, f"""SELECT DISTINCT {column} as top, COUNT({column}) as frequency FROM {table_name} - GROUP BY {column} ORDER BY Count({column}) Desc""", - config, - ).dict() + GROUP BY {column} ORDER BY Count({column}) Desc""" + ).fetchall() - table_stats[column]["freq"] = result_col_freq_values["frequency"][0] - table_stats[column]["top"] = result_col_freq_values["top"][0] + table_stats[column]["freq"] = result_col_freq_values[0][1] + table_stats[column]["top"] = result_col_freq_values[0][0] columns_to_include_in_report.update(["freq", "top"]) @@ -145,14 +144,13 @@ def __init__(self, table_name, schema=None, config=None) -> None: COUNT({column}) AS count FROM {table_name} WHERE {column} IS NOT NULL - """, - config, - ).dict() + """ + ).fetchall() - table_stats[column]["count"] = result_value_values["count"][0] - table_stats[column]["unique"] = result_value_values["unique_count"][0] - table_stats[column]["min"] = result_value_values["min"][0] - table_stats[column]["max"] = result_value_values["max"][0] + 
table_stats[column]["min"] = result_value_values[0][0] + table_stats[column]["max"] = result_value_values[0][1] + table_stats[column]["unique"] = result_value_values[0][2] + table_stats[column]["count"] = result_value_values[0][3] columns_to_include_in_report.update(["count", "unique", "min", "max"]) @@ -166,10 +164,10 @@ def __init__(self, table_name, schema=None, config=None) -> None: SELECT AVG({column}) AS avg FROM {table_name} WHERE {column} IS NOT NULL - """, - config, - ).dict() - table_stats[column]["mean"] = float(results_avg["avg"][0]) + """ + ).fetchall() + + table_stats[column]["mean"] = float(results_avg[0][0]) columns_to_include_in_report.update(["mean"]) except Exception: @@ -192,13 +190,12 @@ def __init__(self, table_name, schema=None, config=None) -> None: percentile_disc(0.75) WITHIN GROUP (ORDER BY {column}) as key_75 FROM {table_name} - """, - config, - ).dict() + """ + ).fetchall() - for key in special_numeric_keys: - r_key = f'key_{key.replace("%", "")}' - table_stats[column][key] = float(result[r_key][0]) + for i, key in enumerate(special_numeric_keys): + # r_key = f'key_{key.replace("%", "")}' + table_stats[column][key] = float(result[0][i]) columns_to_include_in_report.update(special_numeric_keys) @@ -252,11 +249,11 @@ def get_columns(name, schema=None): @telemetry.log_call() -def get_table_statistics(name, schema=None, config=None): +def get_table_statistics(name, schema=None): """Get table statistics for a given connection. For all data types the results will include `count`, `mean`, `std`, `min` `max`, `25`, `50` and `75` percentiles. It will also include `unique`, `top` and `freq` statistics. """ - return TableDescription(name, schema=schema, config=config) + return TableDescription(name, schema=schema) diff --git a/src/sql/magic_cmd.py b/src/sql/magic_cmd.py index bb9d1a079..6e8cff4ad 100644 --- a/src/sql/magic_cmd.py +++ b/src/sql/magic_cmd.py @@ -84,7 +84,7 @@ def execute(self, line="", cell=""): args = parser.parse_args(others) report = inspect.get_table_statistics( - schema=args.schema, name=args.table, config=self.config + schema=args.schema, name=args.table ) if args.output: diff --git a/src/sql/run.py b/src/sql/run.py index b33a65330..3efec6382 100644 --- a/src/sql/run.py +++ b/src/sql/run.py @@ -439,10 +439,8 @@ def run(conn, sql, config): return "Connected: %s" % conn.name -def raw_run(conn, sql, config): - result = conn.session.execute(sql) - resultset = ResultSet(result, config) - return resultset +def raw_run(conn, sql): + return conn.session.execute(sql) class PrettyTable(prettytable.PrettyTable):
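
After this last patch, the profiling flow reduces to a simple pattern: `raw_run` hands back the raw SQLAlchemy result, and `TableDescription` reads the aggregates positionally from `fetchall()`. The following is a minimal sketch of that pattern — the in-memory engine and the table `t` are hypothetical stand-ins, and SQLAlchemy 1.x is assumed, matching the `sqlalchemy<2` pin above:

```python
import sqlalchemy

# hypothetical in-memory database standing in for a user connection
engine = sqlalchemy.create_engine("sqlite://")
session = engine.connect()
session.execute("CREATE TABLE t (x INTEGER)")
session.execute("INSERT INTO t VALUES (1), (2), (2)")


def raw_run(session, sql):
    # same shape as the final helper in src/sql/run.py: no config, no ResultSet
    return session.execute(sql)


row = raw_run(
    session,
    "SELECT MIN(x), MAX(x), COUNT(DISTINCT x), COUNT(x) FROM t WHERE x IS NOT NULL",
).fetchall()[0]

# positional access, mirroring TableDescription: min, max, unique, count
print(row[0], row[1], row[2], row[3])  # 1 2 2 3
```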