diff --git a/chdb/dataframe/__init__.py b/chdb/dataframe/__init__.py index e74e32a30af..8df975ac3c2 100644 --- a/chdb/dataframe/__init__.py +++ b/chdb/dataframe/__init__.py @@ -14,5 +14,6 @@ from .query import Table, pandas_read_parquet # noqa: C0413 query = Table.queryStatic +sql = Table.queryStatic -__all__ = ['Table', 'query', 'pandas_read_parquet'] +__all__ = ["Table", "query", "sql", "pandas_read_parquet"] diff --git a/src/Processors/Sources/PythonSource.cpp b/src/Processors/Sources/PythonSource.cpp index 035f0956b81..6915cb710e8 100644 --- a/src/Processors/Sources/PythonSource.cpp +++ b/src/Processors/Sources/PythonSource.cpp @@ -392,6 +392,10 @@ Chunk PythonSource::scanDataToChunk() columns[i] = convert_and_insert_array(col, cursor, count); else if (which.isDateTime64()) columns[i] = convert_and_insert_array(col, cursor, count); + else if (which.isDate32()) + columns[i] = convert_and_insert_array(col, cursor, count); + else if (which.isDate()) + columns[i] = convert_and_insert_array(col, cursor, count); else if (which.isString()) columns[i] = convert_and_insert_array(col, cursor, count); else diff --git a/src/Storages/StoragePython.cpp b/src/Storages/StoragePython.cpp index a92d2b83ed0..368acdf28a5 100644 --- a/src/Storages/StoragePython.cpp +++ b/src/Storages/StoragePython.cpp @@ -263,7 +263,8 @@ std::vector> PyReader::getSchemaFromPyObj(co if (!py::hasattr(data, "__class__")) { throw Exception( - ErrorCodes::UNKNOWN_FORMAT, "Unknown data type for schema inference. Consider inheriting PyReader and overriding getSchema()."); + ErrorCodes::UNKNOWN_FORMAT, + "Unknown data type for schema inference. Consider inheriting PyReader and overriding get_schema()."); } auto type_name = data.attr("__class__").attr("__name__").cast(); diff --git a/tests/test_query_py.py b/tests/test_query_py.py index e6ed33dfd42..3184d3e567d 100644 --- a/tests/test_query_py.py +++ b/tests/test_query_py.py @@ -1,10 +1,12 @@ #!python3 -from io import StringIO +import io +import random import unittest import numpy as np import pandas as pd import pyarrow as pa +from pyarrow import csv import chdb @@ -27,6 +29,19 @@ 717410,0.6095994785374601,draw """ +SCORES_CSV = """score,result,dateOfBirth +758270,lose,1983-07-24 +355079,win,2000-11-27 +451231,lose,1980-03-11 +854953,lose,1996-08-10 +294257,lose,1966-12-12 +756327,lose,1997-08-29 +379755,lose,1981-10-24 +916108,lose,1950-08-30 +467033,win,2007-09-15 +639860,win,1989-06-30 +""" + class myReader(chdb.PyReader): def __init__(self, data): self.data = data @@ -43,6 +58,17 @@ def read(self, col_names, count): class TestQueryPy(unittest.TestCase): + # def test_query_np(self): + # t3 = { + # "a": np.array([1, 2, 3, 4, 5, 6]), + # "b": np.array(["tom", "jerry", "auxten", "tom", "jerry", "auxten"]), + # } + + # ret = chdb.query( + # "SELECT b, sum(a) FROM Python(t3) GROUP BY b ORDER BY b", "debug" + # ) + # self.assertEqual(str(ret), EXPECTED) + def test_query_py(self): reader = myReader( { @@ -74,7 +100,7 @@ def test_query_arrow(self): ) ret = chdb.query( - "SELECT b, sum(a) FROM Python(table) GROUP BY b ORDER BY b", "debug" + "SELECT b, sum(a) FROM Python(table) GROUP BY b ORDER BY b" ) self.assertEqual(str(ret), EXPECTED) @@ -87,20 +113,38 @@ def test_query_arrow2(self): ) ret = chdb.query( - "SELECT b, sum(a) FROM Python(t2) GROUP BY b ORDER BY b", "debug" + "SELECT b, sum(a) FROM Python(t2) GROUP BY b ORDER BY b" ) self.assertEqual(str(ret), EXPECTED) - # def test_query_np(self): - # t3 = { - # "a": np.array([1, 2, 3, 4, 5, 6]), - # "b": np.array(["tom", "jerry", "auxten", "tom", "jerry", "auxten"]), - # } + def test_query_arrow3(self): + table = csv.read_csv(io.BytesIO(SCORES_CSV.encode())) + ret = chdb.query( + """ + SELECT sum(score), avg(score), median(score), + avgIf(score, dateOfBirth > '1980-01-01') as avgIf, + countIf(result = 'win') AS wins, + countIf(result = 'draw') AS draws, + countIf(result = 'lose') AS losses, + count() + FROM Python(table) + """, + ) + self.assertEqual( + str(ret), + "5872873,587287.3,553446.5,470878.25,3,0,7,10\n", + ) - # ret = chdb.query( - # "SELECT b, sum(a) FROM Python(t3) GROUP BY b ORDER BY b", "debug" - # ) - # self.assertEqual(str(ret), EXPECTED) + def test_random_float(self): + x = {"col1": [random.uniform(0, 1) for _ in range(0, 100000)]} + ret = chdb.sql( + """ + select avg(col1) + FROM Python(x) + """ + ) + print(ret.bytes()) + self.assertAlmostEqual(float(ret.bytes()), 0.5, delta=0.01) def test_query_dict(self): data = { @@ -109,29 +153,29 @@ def test_query_dict(self): } ret = chdb.query( - "SELECT b, sum(a) FROM Python(data) GROUP BY b ORDER BY b", "debug" + "SELECT b, sum(a) FROM Python(data) GROUP BY b ORDER BY b" ) self.assertEqual(str(ret), EXPECTED) - # def test_query_dict_int(self): - # data = { - # "a": [1, 2, 3, 4, 5, 6], - # "b": [1, 2, 3, 1, 2, 3], - # } - - # ret = chdb.query( - # "SELECT b, sum(a) FROM Python(data) GROUP BY b ORDER BY b", "debug" - # ) - # self.assertEqual( - # str(ret), - # """1,5 - # 2,7 - # 3,9 - # """, - # ) + def test_query_dict_int(self): + data = { + "a": [1, 2, 3, 4, 5, 6], + "b": [1, 2, 3, 1, 2, 3], + } + + ret = chdb.query( + "SELECT b, sum(a) FROM Python(data) GROUP BY b ORDER BY b" + ) + self.assertEqual( + str(ret), + """1,5 +2,7 +3,9 +""", + ) def test_query_pd_csv(self): - csv_data = pd.read_csv(StringIO(SMALL_CSV)) + csv_data = pd.read_csv(io.StringIO(SMALL_CSV)) ret = chdb.query( """ SELECT sum(score1), avg(score1), median(score1), @@ -145,8 +189,7 @@ def test_query_pd_csv(self): ) self.assertEqual( str(ret), - """4099877,409987.7,414399.5,6.128691345453262,0.6128691345453262,0.5693101584911346,1,5,4,10 -""", + "4099877,409987.7,414399.5,6.128691345453262,0.6128691345453262,0.5693101584911346,1,5,4,10\n", )