From eb6f0fec7d140337b197e11ec1655c3abcd85ba4 Mon Sep 17 00:00:00 2001 From: lindsay stevens Date: Fri, 21 Jan 2022 20:04:15 +1100 Subject: [PATCH] fix: performance improvements for large forms - missing translations check can still work while only looking at first row, which is a little faster and still passes the tests. - while profiling, found an inefficient unique check in section.py. The benefit is only apparent with forms >1000 questions, but it is significant: q=5000 28s vs. 11s, q=10000 103s vs. 21s. --- pyxform/section.py | 9 ++-- .../pyxform/missing_translations_check.py | 13 ++--- tests/test_translations.py | 51 ++++++++++--------- 3 files changed, 40 insertions(+), 33 deletions(-) diff --git a/pyxform/section.py b/pyxform/section.py index 609d8233..bd91096f 100644 --- a/pyxform/section.py +++ b/pyxform/section.py @@ -18,15 +18,16 @@ def validate(self): # there's a stronger test of this when creating the xpath # dictionary for a survey. def _validate_uniqueness_of_element_names(self): - element_slugs = [] + element_slugs = set() for element in self.children: - if any(element.name.lower() == s.lower() for s in element_slugs): + elem_lower = element.name.lower() + if elem_lower in element_slugs: raise PyXFormError( "There are more than one survey elements named '%s' " "(case-insensitive) in the section named '%s'." - % (element.name.lower(), self.name) + % (elem_lower, self.name) ) - element_slugs.append(element.name) + element_slugs.add(elem_lower) def xml_instance(self, **kwargs): """ diff --git a/pyxform/validators/pyxform/missing_translations_check.py b/pyxform/validators/pyxform/missing_translations_check.py index 02c43551..261a2985 100644 --- a/pyxform/validators/pyxform/missing_translations_check.py +++ b/pyxform/validators/pyxform/missing_translations_check.py @@ -55,10 +55,8 @@ def find_missing_translations( each language (including the default / unspecified language) that is used for any other translatable column. - This could be more efficient by not looking at every row, but that's how the data is - arranged by the time it passes through workbook_to_json(). On the bright side it - means this function could be adapted to warn about specific items lacking - translations, even when there are no missing translation columns. + This checks the first row only since it is concerned with the presence of columns, not + individual cells. It therefore assumes that each row object has the same structure. :param sheet_data: The survey or choices sheet data. :param translatable_columns: The translatable columns for a sheet. The structure @@ -80,12 +78,15 @@ def process_cell(typ, cell): translations_seen[lng].append(name) translation_columns_seen.add(name) - for row in sheet_data: - for column_type, cell_content in row.items(): + if 0 < len(sheet_data): + # e.g. ("name", "q1"), ("label", {"en": "Hello", "fr": "Bonjour"}) + for column_type, cell_content in sheet_data[0].items(): if column_type == constants.MEDIA: + # e.g. ("audio", {"eng": "my.mp3"}) for media_type, media_cell in cell_content.items(): process_cell(typ=media_type, cell=media_cell) if column_type == constants.BIND: + # e.g. ("jr:constraintMsg", "Try again") for bind_type, bind_cell in cell_content.items(): process_cell(typ=bind_type, cell=bind_cell) else: diff --git a/tests/test_translations.py b/tests/test_translations.py index 18f49cb5..a58c8cf0 100644 --- a/tests/test_translations.py +++ b/tests/test_translations.py @@ -458,12 +458,16 @@ def test_missing_translation__one_lang_all_cols(self): @unittest.skip("Slow performance test. Un-skip to run as needed.") def test_missing_translations_check_performance(self): - """Should find the translations check costs a fraction of a second for a giant form. + """ + Should find the translations check costs a fraction of a second for large forms. - Results with Python 3.8.9 on VM with 4CPU 8GB RAM, 500 questions with 2 choices - each, average of 20 runs (diff = 0.059s): - - with check (seconds): 1.351347613078542 - - without check (seconds): 1.292487204639474 + Results with Python 3.8.9 on VM with 4CPU 8GB RAM, x questions with 2 choices + each, average of 10 runs (seconds), with and without the check: + | num | with | without | + | 500 | 1.0192 | 0.9950 | + | 1000 | 2.0054 | 2.1026 | + | 2000 | 4.0714 | 4.0926 | + | 3000 | 6.0266 | 6.2476 | """ survey_header = """ | survey | | | | | @@ -480,24 +484,25 @@ def test_missing_translations_check_performance(self): | | c{i} | na | la-d | la-e | | | c{i} | nb | lb-d | lb-e | """ - questions = "\n".join((question.format(i=i) for i in range(1, 500))) - choice_lists = "\n".join((choice_list.format(i=i) for i in range(1, 500))) - md = "".join((survey_header, questions, choices_header, choice_lists)) - - def run(name): - runs = 0 - results = [] - while runs < 20: - start = perf_counter() - self.assertPyxformXform(md=md) - results.append(perf_counter() - start) - runs += 1 - print(name, sum(results) / len(results)) - - run(name="with check (seconds):") - - with patch("pyxform.xls2json.missing_translations_check", return_value=[]): - run(name="without check (seconds):") + for count in (500, 1000, 2000): + questions = "\n".join((question.format(i=i) for i in range(1, count))) + choice_lists = "\n".join((choice_list.format(i=i) for i in range(1, count))) + md = "".join((survey_header, questions, choices_header, choice_lists)) + + def run(name): + runs = 0 + results = [] + while runs < 10: + start = perf_counter() + self.assertPyxformXform(md=md) + results.append(perf_counter() - start) + runs += 1 + print(name, sum(results) / len(results)) + + run(name=f"questions={count}, with check (seconds):") + + with patch("pyxform.xls2json.missing_translations_check", return_value=[]): + run(name=f"questions={count}, without check (seconds):") class TestTranslationsSurvey(PyxformTestCase):