facebookresearch · stephenroller · Mar 20, 2022 · Mar 8, 2022 · Mar 8, 2022 · Mar 12, 2022
diff --git a/parlai/tasks/lccc/test.py b/parlai/tasks/lccc/test.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-from parlai.utils.testing import AutoTeacherTest  # noqa: F401
+from parlai.utils.testing import AutoTeacherTest
 
 
 class TestDefaultTeacher(AutoTeacherTest):

diff --git a/parlai/utils/conversations.py b/parlai/utils/conversations.py
@@ -9,7 +9,7 @@
 import datetime
 import json
 import os
-import random
+import itertools
 
 from parlai.utils.io import PathManager
 from parlai.core.metrics import dict_report
@@ -196,26 +196,34 @@ class Conversations:
     """
 
     def __init__(self, datapath):
-        self.conversations = self._load_conversations(datapath)
+        self._datapath = datapath
         self.metadata = self._load_metadata(datapath)
 
     def __len__(self):
-        return len(self.conversations)
+        return sum(1 for _ in self._load_raw(self._datapath))
 
-    def _load_conversations(self, datapath):
+    def _load_raw(self, datapath):
+        """
+        Yield each line of the data file as a raw, unparsed string.
+
+        Useful for fast IO, such as random indexing without parsing every line.
+        """
         if not PathManager.exists(datapath):
             raise RuntimeError(
                 f'Conversations at path {datapath} not found. '
                 'Double check your path.'
             )
 
-        conversations = []
         with PathManager.open(datapath, 'r') as f:
             lines = f.read().splitlines()
             for line in lines:
-                conversations.append(Conversation(json.loads(line)))
+                yield line
+
+    def _parse(self, line):
+        return Conversation(json.loads(line))
 
-        return conversations
+    def _load_conversations(self, datapath):
+        return (self._parse(line) for line in self._load_raw(datapath))
 
     def _load_metadata(self, datapath):
         """
@@ -225,7 +233,7 @@ def _load_metadata(self, datapath):
         Metadata should be of the following format:
         {
             'date': <date collected>,
-            'opt': <opt used to collect the data,
+            'opt': <opt used to collect the data>,
             'speakers': <identity of speakers>,
             ...
             Other arguments.
@@ -235,7 +243,7 @@ def _load_metadata(self, datapath):
             metadata = Metadata(datapath)
             return metadata
         except RuntimeError:
-            logging.error('Metadata does not exist. Please double check your datapath.')
+            logging.debug('Metadata does not exist. Please double check your datapath.')
             return None
 
     def read_metadata(self):
@@ -245,31 +253,12 @@ def read_metadata(self):
             logging.warning('No metadata available.')
 
     def __getitem__(self, index):
-        return self.conversations[index]
+        raw = self._load_raw(self._datapath)
+        item = list(itertools.islice(raw, index, index + 1))[0]
+        return self._parse(item)
 
     def __iter__(self):
-        self.iterator_idx = 0
-        return self
-
-    def __next__(self):
-        """
-        Return the next conversation.
-        """
-        if self.iterator_idx >= len(self):
-            raise StopIteration
-
-        conv = self.conversations[self.iterator_idx]
-        self.iterator_idx += 1
-
-        return conv
-
-    def read_conv_idx(self, idx):
-        convo = self.conversations[idx]
-        logging.info(convo)
-
-    def read_rand_conv(self):
-        idx = random.choice(range(len(self)))
-        self.read_conv_idx(idx)
+        return self._load_conversations(self._datapath)
 
     @staticmethod
     def _get_path(datapath):

diff --git a/tests/test_conversations.py b/tests/test_conversations.py
@@ -8,7 +8,6 @@
 import unittest
 
 from parlai.utils.conversations import Conversations
-import parlai.utils.logging as logging
 import tempfile
 
 
@@ -78,17 +77,6 @@ def test_conversations(self):
         # test kwargs
         self.assertEqual({'other_info': 'Blah blah blah'}, convos.metadata.extra_data)
 
-        # test reading conversations
-        with self.assertLogs(logger=logging.logger, level='DEBUG') as cm:
-            convos.read_conv_idx(0)
-            str_version = (
-                'Emily: Hello, do you like this test?\n'
-                'Stephen: Why yes! I love this test!\n'
-                'Emily: So will you stamp this diff?\n'
-                'Stephen: Yes, I will do it right now!\n'
-            )
-            self.assertIn(str_version, "\n".join(cm.output))
-
         # test getting a specific turn
         first = convos[0]  # Conversation
         self.assertEqual(first[0].id, 'Emily')