From b0dce99725ca3ffa941921501f3ee8833429b452 Mon Sep 17 00:00:00 2001 From: xzdandy Date: Sat, 12 Aug 2023 17:31:29 -0400 Subject: [PATCH 1/8] Fix problems found when running hugging face text summarization model on large input. --- evadb/optimizer/rules/rules.py | 2 ++ evadb/udfs/abstract/hf_abstract_udf.py | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/evadb/optimizer/rules/rules.py b/evadb/optimizer/rules/rules.py index 883a68e3df..2ac2679146 100644 --- a/evadb/optimizer/rules/rules.py +++ b/evadb/optimizer/rules/rules.py @@ -872,6 +872,7 @@ def apply(self, before: LogicalGet, context: OptimizerContext): # read in a batch from storage engine. # Todo: Experiment heuristics. after = SeqScanPlan(None, before.target_list, before.alias) + batch_mem_size = context.db.config.get_value("executor", "batch_mem_size") after.append_child( StoragePlan( before.table_obj, @@ -880,6 +881,7 @@ def apply(self, before: LogicalGet, context: OptimizerContext): sampling_rate=before.sampling_rate, sampling_type=before.sampling_type, chunk_params=before.chunk_params, + batch_mem_size = batch_mem_size ) ) yield after diff --git a/evadb/udfs/abstract/hf_abstract_udf.py b/evadb/udfs/abstract/hf_abstract_udf.py index e85c3b2786..2ac1c44198 100644 --- a/evadb/udfs/abstract/hf_abstract_udf.py +++ b/evadb/udfs/abstract/hf_abstract_udf.py @@ -97,7 +97,10 @@ def output_formatter(self, outputs: Any): def forward(self, inputs, *args, **kwargs) -> pd.DataFrame: hf_input = self.input_formatter(inputs) - hf_output = self.hf_udf_obj(hf_input, *args, **kwargs) + # Use truncation=True to handle the case where num of tokens is larger + # than limit + # Ref: https://stackoverflow.com/questions/66954682/token-indices-sequence-length-is-longer-than-the-specified-maximum-sequence-leng + hf_output = self.hf_udf_obj(hf_input, *args, **kwargs, truncation=True) evadb_output = self.output_formatter(hf_output) return evadb_output From 3d09e9eef5917eb890edfbb947d42e60f3661a73 Mon Sep 17
00:00:00 2001 From: xzdandy Date: Sat, 12 Aug 2023 17:59:14 -0400 Subject: [PATCH 2/8] Add testcases for hugging face with large text input --- test/udfs/test_hugging_face.py | 44 ++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 test/udfs/test_hugging_face.py diff --git a/test/udfs/test_hugging_face.py b/test/udfs/test_hugging_face.py new file mode 100644 index 0000000000..e0b93261f5 --- /dev/null +++ b/test/udfs/test_hugging_face.py @@ -0,0 +1,44 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from mock import MagicMock +import pandas as pd + +from evadb.third_party.huggingface.model import TextHFModel + +class TestTextHFModel(TextHFModel): + + @property + def default_pipeline_args(self) -> dict: + # We need to improve the hugging face interface, passing + # UdfCatalogEntry into UDF is not ideal. 
+ return {"task": "summarization", + "model": "sshleifer/distilbart-cnn-12-6", + "min_length": 5, + "max_length": 100} + +class HuggingFaceTest(unittest.TestCase): + + def test_hugging_face_with_large_input(self): + udf_obj = MagicMock() + udf_obj.metadata = [] + text_summarization_model = TestTextHFModel(udf_obj) + + large_text = pd.DataFrame([{"text": "hello" * 4096}]) + try: + text_summarization_model(large_text) + except IndexError: + self.fail("hugging face with large input raised IndexError.") From 1879cb8113dccb02c59db66ffaadbdcd50520a1b Mon Sep 17 00:00:00 2001 From: xzdandy Date: Sat, 12 Aug 2023 18:00:16 -0400 Subject: [PATCH 3/8] Linter --- evadb/optimizer/rules/rules.py | 2 +- test/udfs/test_hugging_face.py | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/evadb/optimizer/rules/rules.py b/evadb/optimizer/rules/rules.py index 2ac2679146..3c5f39c3a6 100644 --- a/evadb/optimizer/rules/rules.py +++ b/evadb/optimizer/rules/rules.py @@ -881,7 +881,7 @@ def apply(self, before: LogicalGet, context: OptimizerContext): sampling_rate=before.sampling_rate, sampling_type=before.sampling_type, chunk_params=before.chunk_params, - batch_mem_size = batch_mem_size + batch_mem_size=batch_mem_size, ) ) yield after diff --git a/test/udfs/test_hugging_face.py b/test/udfs/test_hugging_face.py index e0b93261f5..a2e537a8b6 100644 --- a/test/udfs/test_hugging_face.py +++ b/test/udfs/test_hugging_face.py @@ -14,24 +14,27 @@ # limitations under the License. import unittest -from mock import MagicMock + import pandas as pd +from mock import MagicMock from evadb.third_party.huggingface.model import TextHFModel + class TestTextHFModel(TextHFModel): - @property def default_pipeline_args(self) -> dict: - # We need to improve the hugging face interface, passing + # We need to improve the hugging face interface, passing # UdfCatalogEntry into UDF is not ideal. 
- return {"task": "summarization", - "model": "sshleifer/distilbart-cnn-12-6", - "min_length": 5, - "max_length": 100} + return { + "task": "summarization", + "model": "sshleifer/distilbart-cnn-12-6", + "min_length": 5, + "max_length": 100, + } + class HuggingFaceTest(unittest.TestCase): - def test_hugging_face_with_large_input(self): udf_obj = MagicMock() udf_obj.metadata = [] From 7d78b80de942ffb62841f62b8781a83d0370a45c Mon Sep 17 00:00:00 2001 From: xzdandy Date: Sat, 12 Aug 2023 18:06:39 -0400 Subject: [PATCH 4/8] Move truncation=True for only TextModels --- evadb/third_party/huggingface/model.py | 6 ++++++ evadb/udfs/abstract/hf_abstract_udf.py | 5 +---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/evadb/third_party/huggingface/model.py b/evadb/third_party/huggingface/model.py index 9e66284b98..3b853acbeb 100644 --- a/evadb/third_party/huggingface/model.py +++ b/evadb/third_party/huggingface/model.py @@ -32,6 +32,12 @@ class TextHFModel(AbstractHFUdf): """ Base Model for all HF Models that take in text as input """ + + def __call__(self, *args, **kwargs): + # Use truncation=True to handle the case where num of tokens is larger + # than limit + # Ref: https://stackoverflow.com/questions/66954682/token-indices-sequence-length-is-longer-than-the-specified-maximum-sequence-leng + return self.forward(args[0], truncation=True) def input_formatter(self, inputs: Any): return inputs.values.flatten().tolist() diff --git a/evadb/udfs/abstract/hf_abstract_udf.py b/evadb/udfs/abstract/hf_abstract_udf.py index 2ac1c44198..e85c3b2786 100644 --- a/evadb/udfs/abstract/hf_abstract_udf.py +++ b/evadb/udfs/abstract/hf_abstract_udf.py @@ -97,10 +97,7 @@ def output_formatter(self, outputs: Any): def forward(self, inputs, *args, **kwargs) -> pd.DataFrame: hf_input = self.input_formatter(inputs) - # Use truncation=True to handle the case where num of tokens is larger - # than limit - # Ref: 
https://stackoverflow.com/questions/66954682/token-indices-sequence-length-is-longer-than-the-specified-maximum-sequence-leng - hf_output = self.hf_udf_obj(hf_input, *args, **kwargs, truncation=True) + hf_output = self.hf_udf_obj(hf_input, *args, **kwargs) evadb_output = self.output_formatter(hf_output) return evadb_output From d9d2f20c200fd7ce26167e407499f442e27423d8 Mon Sep 17 00:00:00 2001 From: xzdandy Date: Sun, 13 Aug 2023 00:21:33 -0400 Subject: [PATCH 5/8] Add testcases to make sure batch mem size is passed correctly --- evadb/third_party/huggingface/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evadb/third_party/huggingface/model.py b/evadb/third_party/huggingface/model.py index 3b853acbeb..04d182f485 100644 --- a/evadb/third_party/huggingface/model.py +++ b/evadb/third_party/huggingface/model.py @@ -32,7 +32,7 @@ class TextHFModel(AbstractHFUdf): """ Base Model for all HF Models that take in text as input """ - + def __call__(self, *args, **kwargs): # Use truncation=True to handle the case where num of tokens is larger # than limit From 71c5686d7430e3b73e65ee24535b93f255fe6674 Mon Sep 17 00:00:00 2001 From: xzdandy Date: Sun, 13 Aug 2023 00:22:01 -0400 Subject: [PATCH 6/8] add missing file --- test/optimizer/rules/test_batch_mem_size.py | 58 +++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 test/optimizer/rules/test_batch_mem_size.py diff --git a/test/optimizer/rules/test_batch_mem_size.py b/test/optimizer/rules/test_batch_mem_size.py new file mode 100644 index 0000000000..03bbe59a37 --- /dev/null +++ b/test/optimizer/rules/test_batch_mem_size.py @@ -0,0 +1,58 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest +from test.util import get_evadb_for_testing + +from mock import patch, ANY + +from evadb.server.command_handler import execute_query_fetch_all +from evadb.storage.sqlite_storage_engine import SQLStorageEngine + +class BatchMemSizeTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.evadb = get_evadb_for_testing() + # reset the catalog manager before running each test + cls.evadb.catalog().reset() + + @classmethod + def tearDownClass(cls): + execute_query_fetch_all(cls.evadb, "DROP TABLE IF EXISTS MyCSV;") + + def test_batch_mem_size_for_sqlite_storage_engine(self): + """ + This testcase make sure that the `batch_mem_size` is correctly passed to + the storage engine. 
+ """ + test_batch_mem_size = 100 + self.evadb.config.update_value("executor", "batch_mem_size", test_batch_mem_size) + create_table_query = """ + CREATE TABLE IF NOT EXISTS MyCSV ( + id INTEGER UNIQUE, + frame_id INTEGER, + video_id INTEGER, + dataset_name TEXT(30), + label TEXT(30), + bbox NDARRAY FLOAT32(4), + object_id INTEGER + );""" + execute_query_fetch_all(self.evadb, create_table_query) + + select_table_query = "SELECT * FROM MyCSV;" + with patch.object(SQLStorageEngine, 'read') as mock_read: + mock_read.__iter__.return_value = [] + execute_query_fetch_all(self.evadb, select_table_query) + mock_read.assert_called_with(ANY, test_batch_mem_size) + From 4283de739d40f80e40d6939753cd5fe31a9b4db0 Mon Sep 17 00:00:00 2001 From: xzdandy Date: Sun, 13 Aug 2023 00:22:22 -0400 Subject: [PATCH 7/8] linter --- test/optimizer/rules/test_batch_mem_size.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/test/optimizer/rules/test_batch_mem_size.py b/test/optimizer/rules/test_batch_mem_size.py index 03bbe59a37..70033b014f 100644 --- a/test/optimizer/rules/test_batch_mem_size.py +++ b/test/optimizer/rules/test_batch_mem_size.py @@ -15,11 +15,12 @@ import unittest from test.util import get_evadb_for_testing -from mock import patch, ANY +from mock import ANY, patch from evadb.server.command_handler import execute_query_fetch_all from evadb.storage.sqlite_storage_engine import SQLStorageEngine + class BatchMemSizeTest(unittest.TestCase): @classmethod def setUpClass(cls): @@ -33,11 +34,13 @@ def tearDownClass(cls): def test_batch_mem_size_for_sqlite_storage_engine(self): """ - This testcase make sure that the `batch_mem_size` is correctly passed to + This testcase make sure that the `batch_mem_size` is correctly passed to the storage engine. 
- """ + """ test_batch_mem_size = 100 - self.evadb.config.update_value("executor", "batch_mem_size", test_batch_mem_size) + self.evadb.config.update_value( + "executor", "batch_mem_size", test_batch_mem_size + ) create_table_query = """ CREATE TABLE IF NOT EXISTS MyCSV ( id INTEGER UNIQUE, @@ -51,8 +54,7 @@ def test_batch_mem_size_for_sqlite_storage_engine(self): execute_query_fetch_all(self.evadb, create_table_query) select_table_query = "SELECT * FROM MyCSV;" - with patch.object(SQLStorageEngine, 'read') as mock_read: + with patch.object(SQLStorageEngine, "read") as mock_read: mock_read.__iter__.return_value = [] execute_query_fetch_all(self.evadb, select_table_query) mock_read.assert_called_with(ANY, test_batch_mem_size) - From c70056de128ca2c64e62028feb35a2ad0e0177bb Mon Sep 17 00:00:00 2001 From: xzdandy Date: Sun, 13 Aug 2023 01:19:51 -0400 Subject: [PATCH 8/8] Fix linter --- evadb/executor/delete_executor.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/evadb/executor/delete_executor.py b/evadb/executor/delete_executor.py index 2bf2f8cf61..7668227466 100644 --- a/evadb/executor/delete_executor.py +++ b/evadb/executor/delete_executor.py @@ -45,31 +45,31 @@ def predicate_node_to_filter_clause( left = predicate_node.get_child(0) right = predicate_node.get_child(1) - if type(left) == TupleValueExpression: + if isinstance(left, TupleValueExpression): column = left.name x = table.columns[column] - elif type(left) == ConstantValueExpression: + elif isinstance(left, ConstantValueExpression): value = left.value x = value else: left_filter_clause = self.predicate_node_to_filter_clause(table, left) - if type(right) == TupleValueExpression: + if isinstance(right, TupleValueExpression): column = right.name y = table.columns[column] - elif type(right) == ConstantValueExpression: + elif isinstance(right, ConstantValueExpression): value = right.value y = value else: right_filter_clause = self.predicate_node_to_filter_clause(table, right) - 
if type(predicate_node) == LogicalExpression: + if isinstance(predicate_node, LogicalExpression): if predicate_node.etype == ExpressionType.LOGICAL_AND: filter_clause = and_(left_filter_clause, right_filter_clause) elif predicate_node.etype == ExpressionType.LOGICAL_OR: filter_clause = or_(left_filter_clause, right_filter_clause) - elif type(predicate_node) == ComparisonExpression: + elif isinstance(predicate_node, ComparisonExpression): assert ( predicate_node.etype != ExpressionType.COMPARE_CONTAINS and predicate_node.etype != ExpressionType.COMPARE_IS_CONTAINED