Skip to content

Commit

Permalink
Fix COUNT(*) ERROR for PDF SELECT Queries (#1376)
Browse files Browse the repository at this point in the history
#1170

COUNT(*) did not work with GROUP BY, while COUNT(column_name) and
COUNT(SEGMENT(sth)) did work with GROUP BY.
The issue was due to a wrong column name assignment while creating a tuple
value expression.

I added a test case and a sample PDF that is the same as the one from the
issue.
  • Loading branch information
jkim3663 authored Nov 22, 2023
1 parent e00887c commit 3d0b647
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 0 deletions.
2 changes: 2 additions & 0 deletions evadb/parser/lark_visitor/_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,8 @@ def aggregate_windowed_function(self, tree):
# Support for COUNT(*)
if token != "*":
agg_func_name = token
elif token == "*":
agg_func_arg = TupleValueExpression(name="_row_id")
else:
agg_func_arg = TupleValueExpression(name="id")

Expand Down
Binary file not shown.
32 changes: 32 additions & 0 deletions test/integration_tests/short/test_select_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,38 @@ def test_select_and_groupby_with_sample(self):
expected_batch.project(["FIRST.id", "SEGMENT.data"]),
)

def test_select_and_groupby_and_aggregate_with_pdf(self):
    """Check that COUNT(*) works together with GROUP BY on a PDF table.

    Loads a sample PDF into ``MyPDFs``, groups its rows into segments of
    ``GROUPBY_SIZE`` paragraphs and verifies that every returned group
    reports a count equal to ``GROUPBY_SIZE``.
    """
    GROUPBY_SIZE = 8
    execute_query_fetch_all(self.evadb, "DROP TABLE IF EXISTS MyPDFs;")
    try:
        # load from directory
        pdf_path = (
            "test/data/uadetrac/small-data/pdf_data/fall_2023_orientation_document.pdf"
        )
        load_query = f"LOAD PDF '{pdf_path}' INTO MyPDFs;"
        execute_query_fetch_all(self.evadb, load_query)
        select_all_query = "SELECT * FROM MyPDFs;"
        all_pdf_batch = execute_query_fetch_all(self.evadb, select_all_query)

        select_query = (
            f"SELECT COUNT(*) FROM MyPDFs GROUP BY '{GROUPBY_SIZE} paragraphs';"
        )
        actual_batch = execute_query_fetch_all(self.evadb, select_query)

        # The grouped row total may differ from the full table by less
        # than one group, hence the tolerance of GROUPBY_SIZE.
        self.assertAlmostEqual(
            len(all_pdf_batch),
            len(actual_batch) * actual_batch.frames.iloc[0, 0],
            delta=GROUPBY_SIZE,
        )
        self.assertEqual(len(actual_batch), 99)

        # Every group must report exactly GROUPBY_SIZE rows.
        group_total = len(actual_batch)
        for row in range(group_total):
            self.assertEqual(actual_batch.frames.iloc[row, 0], GROUPBY_SIZE)
    finally:
        # tear down: runs even when an assertion above fails, so the
        # table does not leak into subsequent tests.
        execute_query_fetch_all(self.evadb, "DROP TABLE IF EXISTS MyPDFs;")

def test_lateral_join_with_unnest_and_sample(self):
query = """SELECT id, label
FROM MyVideo SAMPLE 2 JOIN LATERAL
Expand Down

0 comments on commit 3d0b647

Please sign in to comment.