Skip to content

Commit

Permalink
Revert "Convokit 3.0 Mega Pull Request (#197)"
Browse files Browse the repository at this point in the history
This reverts commit 3935d4e.
  • Loading branch information
cristiandnm authored Jul 11, 2023
1 parent 3935d4e commit 6465639
Show file tree
Hide file tree
Showing 18 changed files with 122 additions and 198 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
[![Slack Community](https://img.shields.io/static/v1?logo=slack&style=flat&color=red&label=slack&message=community)](https://join.slack.com/t/convokit/shared_invite/zt-1axq34qrp-1hDXQrvSXClIbJOqw4S03Q)


This toolkit contains tools to extract conversational features and analyze social phenomena in conversations, using a [single unified interface](https://convokit.cornell.edu/documentation/architecture.html) inspired by (and compatible with) scikit-learn. Several large [conversational datasets](https://github.com/CornellNLP/ConvoKit#datasets) are included together with scripts exemplifying the use of the toolkit on these datasets. The latest version is [3.0.0](https://github.com/CornellNLP/ConvoKit/releases/tag/v3.0.0) (released 1 June 2023); follow the [project on GitHub](https://github.com/CornellNLP/ConvoKit) to keep track of updates.
This toolkit contains tools to extract conversational features and analyze social phenomena in conversations, using a [single unified interface](https://convokit.cornell.edu/documentation/architecture.html) inspired by (and compatible with) scikit-learn. Several large [conversational datasets](https://github.com/CornellNLP/ConvoKit#datasets) are included together with scripts exemplifying the use of the toolkit on these datasets. The latest version is [2.5.3](https://github.com/CornellNLP/ConvoKit/releases/tag/v2.5.3) (released 16 Jan 2022); follow the [project on GitHub](https://github.com/CornellNLP/ConvoKit) to keep track of updates.

Read our [documentation](https://convokit.cornell.edu/documentation) or try ConvoKit in our [interactive tutorial](https://colab.research.google.com/github/CornellNLP/ConvoKit/blob/master/examples/Introduction_to_ConvoKit.ipynb).

Expand Down
16 changes: 2 additions & 14 deletions convokit/coordination/coordination.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from collections import defaultdict
from typing import Callable, Tuple, List, Dict, Optional, Collection, Union
import copy

import pkg_resources

Expand Down Expand Up @@ -109,22 +108,11 @@ def transform(self, corpus: Corpus) -> Corpus:
utterance_thresh_func=self.utterance_thresh_func,
)

# Keep record of all score update for all (speakers, target) pairs to avoid redundant operations
todo = {}

for (speaker, target), score in pair_scores.items():
if self.coordination_attribute_name not in speaker.meta:
speaker.meta[self.coordination_attribute_name] = {}
key = (speaker, target.id)
todo.update({key: score})

for key, score in todo.items():
speaker = key[0]
target = key[1]
# For avoiding mutability for the sake of DB corpus
temp_dict = copy.deepcopy(speaker.meta[self.coordination_attribute_name])
temp_dict[target] = score
speaker.meta[self.coordination_attribute_name] = temp_dict
speaker.meta[self.coordination_attribute_name][target.id] = score

assert isinstance(speaker, Speaker)

return corpus
Expand Down
12 changes: 1 addition & 11 deletions convokit/model/convoKitMeta.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from .convoKitIndex import ConvoKitIndex
import json
from typing import Union
import copy

# See reference: https://stackoverflow.com/questions/7760916/correct-usage-of-a-getter-setter-for-dictionary-values

Expand All @@ -31,18 +30,9 @@ def storage_key(self) -> str:
return f"{self.obj_type}_{self.owner.id}"

def __getitem__(self, item):
# in DB mode, metadata field mutation would not be updated. (ex. mutating dict/list metadata fields)
# we align MEM mode behavior and DB mode by making deepcopy of metadata fields, so mutation no longer
# affect corpus metadata storage, but only acting on the copy of it.
item = self._get_storage().get_data(
return self._get_storage().get_data(
"meta", self.storage_key, item, self.index.get_index(self.obj_type)
)
immutable_types = (int, float, bool, complex, str, tuple, frozenset)
if isinstance(item, immutable_types):
return item
else:
# return copy.deepcopy(item) if item is not common python immutable type
return copy.deepcopy(item)

def _get_storage(self):
# special case for Corpus meta since that's the only time owner is not a CorpusComponent
Expand Down
55 changes: 25 additions & 30 deletions convokit/model/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from .convoKitMatrix import ConvoKitMatrix
from .corpusUtil import *
from .corpus_helpers import *
from .backendMapper import BackendMapper
from .storageManager import StorageManager


class Corpus:
Expand All @@ -19,8 +19,6 @@ class Corpus:
:param filename: Path to a folder containing a Corpus or to an utterances.jsonl / utterances.json file to load
:param utterances: list of utterances to initialize Corpus from
:param db_collection_prefix: if a db backend is used, this determines how the database will be named. If not specified, a random name will be used.
:param db_host: if specified, and a db backend is used, connect to the database at this URL. If not specified, will default to the db_host in the ConvoKit global configuration file.
:param preload_vectors: list of names of vectors to be preloaded from directory; by default,
no vectors are loaded but can be loaded any time after corpus initialization (i.e. vectors are lazy-loaded).
:param utterance_start_index: if loading from directory and the corpus folder contains utterances.jsonl, specify the
Expand All @@ -38,9 +36,6 @@ class Corpus:
index.json is already accurate and disabling it will allow for a faster corpus load. This parameter is set to
True by default, i.e. type-checking is not carried out.
:param backend: specify the backend type, either "mem" or "db", default to "mem".
:param backend_mapper: (advanced usage only) if provided, use this as the BackendMapper instance instead of initializing a new one.
:ivar meta_index: index of Corpus metadata
:ivar vectors: the vectors stored in the Corpus
:ivar corpus_dirpath: path to the directory the corpus was loaded from
Expand All @@ -61,24 +56,24 @@ def __init__(
exclude_speaker_meta: Optional[List[str]] = None,
exclude_overall_meta: Optional[List[str]] = None,
disable_type_check=True,
backend: Optional[str] = None,
backend_mapper: Optional[BackendMapper] = None,
storage_type: Optional[str] = None,
storage: Optional[StorageManager] = None,
):
self.config = ConvoKitConfig()
self.corpus_dirpath = get_corpus_dirpath(filename)

# configure corpus ID (optional for mem mode, required for DB mode)
if backend is None:
backend = self.config.default_storage_mode
if db_collection_prefix is None and filename is None and backend == "db":
if storage_type is None:
storage_type = self.config.default_storage_mode
if db_collection_prefix is None and filename is None and storage_type == "db":
db_collection_prefix = create_safe_id()
warn(
"You are in DB mode, but no collection prefix was specified and no filename was given from which to infer one."
"Will use a randomly generated unique prefix " + db_collection_prefix
)
self.id = get_corpus_id(db_collection_prefix, filename, backend)
self.backend = backend
self.backend_mapper = initialize_storage(self, backend_mapper, backend, db_host)
self.id = get_corpus_id(db_collection_prefix, filename, storage_type)
self.storage_type = storage_type
self.storage = initialize_storage(self, storage, storage_type, db_host)

self.meta_index = ConvoKitIndex(self)
self.meta = ConvoKitMeta(self, self.meta_index, "corpus")
Expand All @@ -96,10 +91,10 @@ def __init__(
if exclude_overall_meta is None:
exclude_overall_meta = []

if filename is not None and backend == "db":
if filename is not None and storage_type == "db":
# JSON-to-DB construction mode uses a specialized code branch, which
# optimizes for this use case by using direct batch insertions into the
# DB rather than going through the BackendMapper, hence improving
# DB rather than going through the StorageManager, hence improving
# efficiency.

with open(os.path.join(filename, "index.json"), "r") as f:
Expand All @@ -109,7 +104,7 @@ def __init__(
# populate the DB with the contents of the source file
ids_in_db = populate_db_from_file(
filename,
self.backend_mapper.db,
self.storage.db,
self.id,
self.meta_index,
utterance_start_index,
Expand All @@ -120,7 +115,7 @@ def __init__(
exclude_overall_meta,
)

# with the BackendMapper's DB now populated, initialize the corresponding
# with the StorageManager's DB now populated, initialize the corresponding
# CorpusComponent instances.
init_corpus_from_storage_manager(self, ids_in_db)

Expand Down Expand Up @@ -221,8 +216,8 @@ def reconnect_to_db(cls, db_collection_prefix: str, db_host: Optional[str] = Non
resume where you left off.
"""
# create a blank Corpus that will hold the data
result = cls(db_collection_prefix=db_collection_prefix, db_host=db_host, backend="db")
# through the constructor, the blank Corpus' BackendMapper is now connected
result = cls(db_collection_prefix=db_collection_prefix, db_host=db_host, storage_type="db")
# through the constructor, the blank Corpus' StorageManager is now connected
# to the DB. Next use the DB contents to populate the corpus components.
init_corpus_from_storage_manager(result)

Expand Down Expand Up @@ -626,7 +621,7 @@ def filter_conversations_by(self, selector: Callable[[Conversation], bool]):
meta_ids.append(convo.meta.storage_key)
for speaker in self.iter_speakers():
meta_ids.append(speaker.meta.storage_key)
self.backend_mapper.purge_obsolete_entries(
self.storage.purge_obsolete_entries(
self.get_utterance_ids(), self.get_conversation_ids(), self.get_speaker_ids(), meta_ids
)

Expand All @@ -650,8 +645,8 @@ def filter_utterances(source_corpus: "Corpus", selector: Callable[[Utterance], b
convo.meta.update(source_corpus.get_conversation(convo.id).meta)

# original Corpus is invalidated and no longer usable; clear all data from
# its now-orphaned BackendMapper to avoid having duplicates in memory
source_corpus.backend_mapper.clear_all_data()
# its now-orphaned StorageManager to avoid having duplicates in memory
source_corpus.storage.clear_all_data()

return new_corpus

Expand Down Expand Up @@ -725,8 +720,8 @@ def reindex_conversations(
print(missing_convo_roots)

# original Corpus is invalidated and no longer usable; clear all data from
# its now-orphaned BackendMapper to avoid having duplicates in memory
source_corpus.backend_mapper.clear_all_data()
# its now-orphaned StorageManager to avoid having duplicates in memory
source_corpus.storage.clear_all_data()

return new_corpus

Expand Down Expand Up @@ -1032,10 +1027,10 @@ def merge(primary: "Corpus", secondary: "Corpus", warnings: bool = True):
new_corpus.reinitialize_index()

# source corpora are now invalidated and all needed data has been copied
# into the new merged corpus; clear the source corpora's backend mapper to
# into the new merged corpus; clear the source corpora's storage to
# prevent having duplicates in memory
primary.backend_mapper.clear_all_data()
secondary.backend_mapper.clear_all_data()
primary.storage.clear_all_data()
secondary.storage.clear_all_data()

return new_corpus

Expand Down Expand Up @@ -1300,9 +1295,9 @@ def load_info(self, obj_type, fields=None, dir_name=None):
for field in fields:
# self.aux_info[field] = self.load_jsonlist_to_dict(
# os.path.join(dir_name, 'feat.%s.jsonl' % field))
if self.backend == "mem":
if self.storage_type == "mem":
load_info_to_mem(self, dir_name, obj_type, field)
elif self.backend == "db":
elif self.storage_type == "db":
load_info_to_db(self, dir_name, obj_type, field)

def dump_info(self, obj_type, fields, dir_name=None):
Expand Down
4 changes: 2 additions & 2 deletions convokit/model/corpusComponent.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def set_owner(self, owner):
self._owner = owner
if owner is not None:
# when a new owner Corpus is assigned, we must take the following steps:
# (1) transfer this component's data to the new owner's BackendMapper
# (1) transfer this component's data to the new owner's StorageManager
# (2) avoid duplicates by removing the data from the old owner (or temp storage if there was no prior owner)
# (3) reinitialize the metadata instance
data_dict = (
Expand All @@ -71,7 +71,7 @@ def set_owner(self, owner):
def init_meta(self, meta, overwrite=False):
if self._owner is None:
# ConvoKitMeta instances are not allowed for ownerless (standalone)
# components since they must be backed by a BackendMapper. In this
# components since they must be backed by a StorageManager. In this
# case we must forcibly convert the ConvoKitMeta instance to dict
if isinstance(meta, ConvoKitMeta):
meta = meta.to_dict()
Expand Down
12 changes: 6 additions & 6 deletions convokit/model/corpus_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from .convoKitIndex import ConvoKitIndex
from .convoKitMeta import ConvoKitMeta
from .speaker import Speaker
from .backendMapper import BackendMapper, MemMapper, DBMapper
from .storageManager import StorageManager, MemStorageManager, DBStorageManager
from .utterance import Utterance

BIN_DELIM_L, BIN_DELIM_R = "<##bin{", "}&&@**>"
Expand Down Expand Up @@ -83,17 +83,17 @@ def get_corpus_dirpath(filename: str) -> Optional[str]:


def initialize_storage(
corpus: "Corpus", storage: Optional[BackendMapper], storage_type: str, db_host: Optional[str]
corpus: "Corpus", storage: Optional[StorageManager], storage_type: str, db_host: Optional[str]
):
if storage is not None:
return storage
else:
if storage_type == "mem":
return MemMapper()
return MemStorageManager()
elif storage_type == "db":
if db_host is None:
db_host = corpus.config.db_host
return DBMapper(corpus.id, db_host)
return DBStorageManager(corpus.id, db_host)
else:
raise ValueError(
f"Unrecognized setting '{storage_type}' for storage type; should be either 'mem' or 'db'."
Expand Down Expand Up @@ -886,7 +886,7 @@ def populate_db_from_file(
):
"""
Populate all necessary collections of a MongoDB database so that it can be
used by a DBMapper, sourcing data from the valid ConvoKit Corpus
used by a DBStorageManager, sourcing data from the valid ConvoKit Corpus
data pointed to by the filename parameter.
"""
binary_meta, updated_exclude_meta = load_binary_metadata(
Expand Down Expand Up @@ -983,5 +983,5 @@ def init_corpus_from_storage_manager(corpus, utt_ids=None):
corpus.meta_index.enable_type_check()
corpus.update_speakers_data()

# restore the BackendMapper's init behavior to default
# restore the StorageManager's init behavior to default
corpus.storage.bypass_init = False
Loading

0 comments on commit 6465639

Please sign in to comment.