From eb6f0fec7d140337b197e11ec1655c3abcd85ba4 Mon Sep 17 00:00:00 2001
From: lindsay stevens <lindsay.stevens.au@gmail.com>
Date: Fri, 21 Jan 2022 20:04:15 +1100
Subject: [PATCH] fix: performance improvements for large forms

- missing translations check can still work while only looking at first
  row, which is a little faster and still passes the tests.
- while profiling, found an inefficient unique check in section.py. The
  benefit is only apparent with forms >1000 questions, but it is
  significant: q=5000 28s vs. 11s, q=10000 103s vs. 21s.
---
 pyxform/section.py                            |  9 ++--
 .../pyxform/missing_translations_check.py     | 13 ++---
 tests/test_translations.py                    | 51 ++++++++++---------
 3 files changed, 40 insertions(+), 33 deletions(-)

diff --git a/pyxform/section.py b/pyxform/section.py
index 609d8233..bd91096f 100644
--- a/pyxform/section.py
+++ b/pyxform/section.py
@@ -18,15 +18,16 @@ def validate(self):
     # there's a stronger test of this when creating the xpath
     # dictionary for a survey.
     def _validate_uniqueness_of_element_names(self):
-        element_slugs = []
+        element_slugs = set()
         for element in self.children:
-            if any(element.name.lower() == s.lower() for s in element_slugs):
+            elem_lower = element.name.lower()
+            if elem_lower in element_slugs:
                 raise PyXFormError(
                     "There are more than one survey elements named '%s' "
                     "(case-insensitive) in the section named '%s'."
-                    % (element.name.lower(), self.name)
+                    % (elem_lower, self.name)
                 )
-            element_slugs.append(element.name)
+            element_slugs.add(elem_lower)
 
     def xml_instance(self, **kwargs):
         """
diff --git a/pyxform/validators/pyxform/missing_translations_check.py b/pyxform/validators/pyxform/missing_translations_check.py
index 02c43551..261a2985 100644
--- a/pyxform/validators/pyxform/missing_translations_check.py
+++ b/pyxform/validators/pyxform/missing_translations_check.py
@@ -55,10 +55,8 @@ def find_missing_translations(
     each language (including the default / unspecified language) that is used for any
     other translatable column.
 
-    This could be more efficient by not looking at every row, but that's how the data is
-    arranged by the time it passes through workbook_to_json(). On the bright side it
-    means this function could be adapted to warn about specific items lacking
-    translations, even when there are no missing translation columns.
+    This checks the first row only since it is concerned with the presence of columns, not
+    individual cells. It therefore assumes that each row object has the same structure.
 
     :param sheet_data: The survey or choices sheet data.
     :param translatable_columns: The translatable columns for a sheet. The structure
@@ -80,12 +78,15 @@ def process_cell(typ, cell):
                         translations_seen[lng].append(name)
                         translation_columns_seen.add(name)
 
-    for row in sheet_data:
-        for column_type, cell_content in row.items():
+    if 0 < len(sheet_data):
+        # e.g. ("name", "q1"), ("label", {"en": "Hello", "fr": "Bonjour"})
+        for column_type, cell_content in sheet_data[0].items():
             if column_type == constants.MEDIA:
+                # e.g. ("audio", {"eng": "my.mp3"})
                 for media_type, media_cell in cell_content.items():
                     process_cell(typ=media_type, cell=media_cell)
             if column_type == constants.BIND:
+                # e.g. ("jr:constraintMsg", "Try again")
                 for bind_type, bind_cell in cell_content.items():
                     process_cell(typ=bind_type, cell=bind_cell)
             else:
diff --git a/tests/test_translations.py b/tests/test_translations.py
index 18f49cb5..a58c8cf0 100644
--- a/tests/test_translations.py
+++ b/tests/test_translations.py
@@ -458,12 +458,16 @@ def test_missing_translation__one_lang_all_cols(self):
 
     @unittest.skip("Slow performance test. Un-skip to run as needed.")
     def test_missing_translations_check_performance(self):
-        """Should find the translations check costs a fraction of a second for a giant form.
+        """
+        Should find the translations check costs a fraction of a second for large forms.
 
-        Results with Python 3.8.9 on VM with 4CPU 8GB RAM, 500 questions with 2 choices
-        each, average of 20 runs (diff = 0.059s):
-        - with check (seconds):    1.351347613078542
-        - without check (seconds): 1.292487204639474
+        Results with Python 3.8.9 on VM with 4CPU 8GB RAM, x questions with 2 choices
+        each, average of 10 runs (seconds), with and without the check:
+        | num  | with   | without |
+        |  500 | 1.0192 |  0.9950 |
+        | 1000 | 2.0054 |  2.1026 |
+        | 2000 | 4.0714 |  4.0926 |
+        | 3000 | 6.0266 |  6.2476 |
         """
         survey_header = """
         | survey |                 |        |                |               |
@@ -480,24 +484,25 @@ def test_missing_translations_check_performance(self):
         |         | c{i}        | na   | la-d  | la-e       |
         |         | c{i}        | nb   | lb-d  | lb-e       |
         """
-        questions = "\n".join((question.format(i=i) for i in range(1, 500)))
-        choice_lists = "\n".join((choice_list.format(i=i) for i in range(1, 500)))
-        md = "".join((survey_header, questions, choices_header, choice_lists))
-
-        def run(name):
-            runs = 0
-            results = []
-            while runs < 20:
-                start = perf_counter()
-                self.assertPyxformXform(md=md)
-                results.append(perf_counter() - start)
-                runs += 1
-            print(name, sum(results) / len(results))
-
-        run(name="with check (seconds):")
-
-        with patch("pyxform.xls2json.missing_translations_check", return_value=[]):
-            run(name="without check (seconds):")
+        for count in (500, 1000, 2000):
+            questions = "\n".join((question.format(i=i) for i in range(1, count)))
+            choice_lists = "\n".join((choice_list.format(i=i) for i in range(1, count)))
+            md = "".join((survey_header, questions, choices_header, choice_lists))
+
+            def run(name):
+                runs = 0
+                results = []
+                while runs < 10:
+                    start = perf_counter()
+                    self.assertPyxformXform(md=md)
+                    results.append(perf_counter() - start)
+                    runs += 1
+                print(name, sum(results) / len(results))
+
+            run(name=f"questions={count}, with check (seconds):")
+
+            with patch("pyxform.xls2json.missing_translations_check", return_value=[]):
+                run(name=f"questions={count}, without check (seconds):")
 
 
 class TestTranslationsSurvey(PyxformTestCase):