Skip to content

Commit

Permalink
Revert "Convokit 3.0 Mega Pull Request (#197)"
Browse files Browse the repository at this point in the history
This reverts commit 3935d4e.
  • Loading branch information
cristiandnm authored Jul 11, 2023
1 parent 3935d4e commit 6465639
Show file tree
Hide file tree
Showing 18 changed files with 122 additions and 198 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
[![Slack Community](https://img.shields.io/static/v1?logo=slack&style=flat&color=red&label=slack&message=community)](https://join.slack.com/t/convokit/shared_invite/zt-1axq34qrp-1hDXQrvSXClIbJOqw4S03Q)


This toolkit contains tools to extract conversational features and analyze social phenomena in conversations, using a [single unified interface](https://convokit.cornell.edu/documentation/architecture.html) inspired by (and compatible with) scikit-learn. Several large [conversational datasets](https://github.com/CornellNLP/ConvoKit#datasets) are included together with scripts exemplifying the use of the toolkit on these datasets. The latest version is [3.0.0](https://github.com/CornellNLP/ConvoKit/releases/tag/v3.0.0) (released 1 June 2023); follow the [project on GitHub](https://github.com/CornellNLP/ConvoKit) to keep track of updates.
This toolkit contains tools to extract conversational features and analyze social phenomena in conversations, using a [single unified interface](https://convokit.cornell.edu/documentation/architecture.html) inspired by (and compatible with) scikit-learn. Several large [conversational datasets](https://github.com/CornellNLP/ConvoKit#datasets) are included together with scripts exemplifying the use of the toolkit on these datasets. The latest version is [2.5.3](https://github.com/CornellNLP/ConvoKit/releases/tag/v2.5.3) (released 16 Jan 2022); follow the [project on GitHub](https://github.com/CornellNLP/ConvoKit) to keep track of updates.

Read our [documentation](https://convokit.cornell.edu/documentation) or try ConvoKit in our [interactive tutorial](https://colab.research.google.com/github/CornellNLP/ConvoKit/blob/master/examples/Introduction_to_ConvoKit.ipynb).

Expand Down
16 changes: 2 additions & 14 deletions convokit/coordination/coordination.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from collections import defaultdict
from typing import Callable, Tuple, List, Dict, Optional, Collection, Union
import copy

import pkg_resources

Expand Down Expand Up @@ -109,22 +108,11 @@ def transform(self, corpus: Corpus) -> Corpus:
utterance_thresh_func=self.utterance_thresh_func,
)

# Keep record of all score update for all (speakers, target) pairs to avoid redundant operations
todo = {}

for (speaker, target), score in pair_scores.items():
if self.coordination_attribute_name not in speaker.meta:
speaker.meta[self.coordination_attribute_name] = {}
key = (speaker, target.id)
todo.update({key: score})

for key, score in todo.items():
speaker = key[0]
target = key[1]
# For avoiding mutability for the sake of DB corpus
temp_dict = copy.deepcopy(speaker.meta[self.coordination_attribute_name])
temp_dict[target] = score
speaker.meta[self.coordination_attribute_name] = temp_dict
speaker.meta[self.coordination_attribute_name][target.id] = score

assert isinstance(speaker, Speaker)

return corpus
Expand Down
12 changes: 1 addition & 11 deletions convokit/model/convoKitMeta.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from .convoKitIndex import ConvoKitIndex
import json
from typing import Union
import copy

# See reference: https://stackoverflow.com/questions/7760916/correct-usage-of-a-getter-setter-for-dictionary-values

Expand All @@ -31,18 +30,9 @@ def storage_key(self) -> str:
return f"{self.obj_type}_{self.owner.id}"

def __getitem__(self, item):
# in DB mode, metadata field mutation would not be updated. (ex. mutating dict/list metadata fields)
# we align MEM mode behavior and DB mode by making deepcopy of metadata fields, so mutation no longer
# affect corpus metadata storage, but only acting on the copy of it.
item = self._get_storage().get_data(
return self._get_storage().get_data(
"meta", self.storage_key, item, self.index.get_index(self.obj_type)
)
immutable_types = (int, float, bool, complex, str, tuple, frozenset)
if isinstance(item, immutable_types):
return item
else:
# return copy.deepcopy(item) if item is not common python immutable type
return copy.deepcopy(item)

def _get_storage(self):
# special case for Corpus meta since that's the only time owner is not a CorpusComponent
Expand Down
55 changes: 25 additions & 30 deletions convokit/model/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from .convoKitMatrix import ConvoKitMatrix
from .corpusUtil import *
from .corpus_helpers import *
from .backendMapper import BackendMapper
from .storageManager import StorageManager


class Corpus:
Expand All @@ -19,8 +19,6 @@ class Corpus:
:param filename: Path to a folder containing a Corpus or to an utterances.jsonl / utterances.json file to load
:param utterances: list of utterances to initialize Corpus from
:param db_collection_prefix: if a db backend is used, this determines how the database will be named. If not specified, a random name will be used.
:param db_host: if specified, and a db backend is used, connect to the database at this URL. If not specified, will default to the db_host in the ConvoKit global configuration file.
:param preload_vectors: list of names of vectors to be preloaded from directory; by default,
no vectors are loaded but can be loaded any time after corpus initialization (i.e. vectors are lazy-loaded).
:param utterance_start_index: if loading from directory and the corpus folder contains utterances.jsonl, specify the
Expand All @@ -38,9 +36,6 @@ class Corpus:
index.json is already accurate and disabling it will allow for a faster corpus load. This parameter is set to
True by default, i.e. type-checking is not carried out.
:param backend: specify the backend type, either "mem" or "db", default to "mem".
:param backend_mapper: (advanced usage only) if provided, use this as the BackendMapper instance instead of initializing a new one.
:ivar meta_index: index of Corpus metadata
:ivar vectors: the vectors stored in the Corpus
:ivar corpus_dirpath: path to the directory the corpus was loaded from
Expand All @@ -61,24 +56,24 @@ def __init__(
exclude_speaker_meta: Optional[List[str]] = None,
exclude_overall_meta: Optional[List[str]] = None,
disable_type_check=True,
backend: Optional[str] = None,
backend_mapper: Optional[BackendMapper] = None,
storage_type: Optional[str] = None,
storage: Optional[StorageManager] = None,
):
self.config = ConvoKitConfig()
self.corpus_dirpath = get_corpus_dirpath(filename)

# configure corpus ID (optional for mem mode, required for DB mode)
if backend is None:
backend = self.config.default_storage_mode
if db_collection_prefix is None and filename is None and backend == "db":
if storage_type is None:
storage_type = self.config.default_storage_mode
if db_collection_prefix is None and filename is None and storage_type == "db":
db_collection_prefix = create_safe_id()
warn(
"You are in DB mode, but no collection prefix was specified and no filename was given from which to infer one."
"Will use a randomly generated unique prefix " + db_collection_prefix
)
self.id = get_corpus_id(db_collection_prefix, filename, backend)
self.backend = backend
self.backend_mapper = initialize_storage(self, backend_mapper, backend, db_host)
self.id = get_corpus_id(db_collection_prefix, filename, storage_type)
self.storage_type = storage_type
self.storage = initialize_storage(self, storage, storage_type, db_host)

self.meta_index = ConvoKitIndex(self)
self.meta = ConvoKitMeta(self, self.meta_index, "corpus")
Expand All @@ -96,10 +91,10 @@ def __init__(
if exclude_overall_meta is None:
exclude_overall_meta = []

if filename is not None and backend == "db":
if filename is not None and storage_type == "db":
# JSON-to-DB construction mode uses a specialized code branch, which
# optimizes for this use case by using direct batch insertions into the
# DB rather than going through the BackendMapper, hence improving
# DB rather than going through the StorageManager, hence improving
# efficiency.

with open(os.path.join(filename, "index.json"), "r") as f:
Expand All @@ -109,7 +104,7 @@ def __init__(
# populate the DB with the contents of the source file
ids_in_db = populate_db_from_file(
filename,
self.backend_mapper.db,
self.storage.db,
self.id,
self.meta_index,
utterance_start_index,
Expand All @@ -120,7 +115,7 @@ def __init__(
exclude_overall_meta,
)

# with the BackendMapper's DB now populated, initialize the corresponding
# with the StorageManager's DB now populated, initialize the corresponding
# CorpusComponent instances.
init_corpus_from_storage_manager(self, ids_in_db)

Expand Down Expand Up @@ -221,8 +216,8 @@ def reconnect_to_db(cls, db_collection_prefix: str, db_host: Optional[str] = Non
resume where you left off.
"""
# create a blank Corpus that will hold the data
result = cls(db_collection_prefix=db_collection_prefix, db_host=db_host, backend="db")
# through the constructor, the blank Corpus' BackendMapper is now connected
result = cls(db_collection_prefix=db_collection_prefix, db_host=db_host, storage_type="db")
# through the constructor, the blank Corpus' StorageManager is now connected
# to the DB. Next use the DB contents to populate the corpus components.
init_corpus_from_storage_manager(result)

Expand Down Expand Up @@ -626,7 +621,7 @@ def filter_conversations_by(self, selector: Callable[[Conversation], bool]):
meta_ids.append(convo.meta.storage_key)
for speaker in self.iter_speakers():
meta_ids.append(speaker.meta.storage_key)
self.backend_mapper.purge_obsolete_entries(
self.storage.purge_obsolete_entries(
self.get_utterance_ids(), self.get_conversation_ids(), self.get_speaker_ids(), meta_ids
)

Expand All @@ -650,8 +645,8 @@ def filter_utterances(source_corpus: "Corpus", selector: Callable[[Utterance], b
convo.meta.update(source_corpus.get_conversation(convo.id).meta)

# original Corpus is invalidated and no longer usable; clear all data from
# its now-orphaned BackendMapper to avoid having duplicates in memory
source_corpus.backend_mapper.clear_all_data()
# its now-orphaned StorageManager to avoid having duplicates in memory
source_corpus.storage.clear_all_data()

return new_corpus

Expand Down Expand Up @@ -725,8 +720,8 @@ def reindex_conversations(
print(missing_convo_roots)

# original Corpus is invalidated and no longer usable; clear all data from
# its now-orphaned BackendMapper to avoid having duplicates in memory
source_corpus.backend_mapper.clear_all_data()
# its now-orphaned StorageManager to avoid having duplicates in memory
source_corpus.storage.clear_all_data()

return new_corpus

Expand Down Expand Up @@ -1032,10 +1027,10 @@ def merge(primary: "Corpus", secondary: "Corpus", warnings: bool = True):
new_corpus.reinitialize_index()

# source corpora are now invalidated and all needed data has been copied
# into the new merged corpus; clear the source corpora's backend mapper to
# into the new merged corpus; clear the source corpora's storage to
# prevent having duplicates in memory
primary.backend_mapper.clear_all_data()
secondary.backend_mapper.clear_all_data()
primary.storage.clear_all_data()
secondary.storage.clear_all_data()

return new_corpus

Expand Down Expand Up @@ -1300,9 +1295,9 @@ def load_info(self, obj_type, fields=None, dir_name=None):
for field in fields:
# self.aux_info[field] = self.load_jsonlist_to_dict(
# os.path.join(dir_name, 'feat.%s.jsonl' % field))
if self.backend == "mem":
if self.storage_type == "mem":
load_info_to_mem(self, dir_name, obj_type, field)
elif self.backend == "db":
elif self.storage_type == "db":
load_info_to_db(self, dir_name, obj_type, field)

def dump_info(self, obj_type, fields, dir_name=None):
Expand Down
4 changes: 2 additions & 2 deletions convokit/model/corpusComponent.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def set_owner(self, owner):
self._owner = owner
if owner is not None:
# when a new owner Corpus is assigned, we must take the following steps:
# (1) transfer this component's data to the new owner's BackendMapper
# (1) transfer this component's data to the new owner's StorageManager
# (2) avoid duplicates by removing the data from the old owner (or temp storage if there was no prior owner)
# (3) reinitialize the metadata instance
data_dict = (
Expand All @@ -71,7 +71,7 @@ def set_owner(self, owner):
def init_meta(self, meta, overwrite=False):
if self._owner is None:
# ConvoKitMeta instances are not allowed for ownerless (standalone)
# components since they must be backed by a BackendMapper. In this
# components since they must be backed by a StorageManager. In this
# case we must forcibly convert the ConvoKitMeta instance to dict
if isinstance(meta, ConvoKitMeta):
meta = meta.to_dict()
Expand Down
12 changes: 6 additions & 6 deletions convokit/model/corpus_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from .convoKitIndex import ConvoKitIndex
from .convoKitMeta import ConvoKitMeta
from .speaker import Speaker
from .backendMapper import BackendMapper, MemMapper, DBMapper
from .storageManager import StorageManager, MemStorageManager, DBStorageManager
from .utterance import Utterance

BIN_DELIM_L, BIN_DELIM_R = "<##bin{", "}&&@**>"
Expand Down Expand Up @@ -83,17 +83,17 @@ def get_corpus_dirpath(filename: str) -> Optional[str]:


def initialize_storage(
corpus: "Corpus", storage: Optional[BackendMapper], storage_type: str, db_host: Optional[str]
corpus: "Corpus", storage: Optional[StorageManager], storage_type: str, db_host: Optional[str]
):
if storage is not None:
return storage
else:
if storage_type == "mem":
return MemMapper()
return MemStorageManager()
elif storage_type == "db":
if db_host is None:
db_host = corpus.config.db_host
return DBMapper(corpus.id, db_host)
return DBStorageManager(corpus.id, db_host)
else:
raise ValueError(
f"Unrecognized setting '{storage_type}' for storage type; should be either 'mem' or 'db'."
Expand Down Expand Up @@ -886,7 +886,7 @@ def populate_db_from_file(
):
"""
Populate all necessary collections of a MongoDB database so that it can be
used by a DBMapper, sourcing data from the valid ConvoKit Corpus
used by a DBStorageManager, sourcing data from the valid ConvoKit Corpus
data pointed to by the filename parameter.
"""
binary_meta, updated_exclude_meta = load_binary_metadata(
Expand Down Expand Up @@ -983,5 +983,5 @@ def init_corpus_from_storage_manager(corpus, utt_ids=None):
corpus.meta_index.enable_type_check()
corpus.update_speakers_data()

# restore the BackendMapper's init behavior to default
# restore the StorageManager's init behavior to default
corpus.storage.bypass_init = False
Loading

0 comments on commit 6465639

Please sign in to comment.