Commit

merge with dev
xehu committed Aug 15, 2024
2 parents 12bbd8d + 34ea876 commit a6a6081
Showing 71 changed files with 85 additions and 42,552 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -13,6 +13,8 @@ We are a research project created by the [Computational Social Science Lab at UP

[![View - Documentation](https://img.shields.io/badge/view-Documentation-blue?style=for-the-badge)](https://conversational-featurizer.readthedocs.io/en/latest/ "Go to project documentation")

The Team Communication Toolkit is an academic project and is intended to be used for academic purposes only.

</div>

# Getting Started
6 changes: 4 additions & 2 deletions pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "team_comm_tools"
version = "0.1.1"
version = "0.1.2"
requires-python = ">= 3.10"
dependencies = [
"chardet>=3.0.4",
@@ -61,6 +61,8 @@ where = ["src"]

[tool.setuptools.package-data]
"team_comm_tools" = [
'features/lexicons/**/*',
'features/lexicons/dale_chall.txt',
'features/lexicons/function_words.txt',
'features/lexicons/question_words.txt',
'features/assets/*'
]
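
The package-data entries above are what ship the lexicon text files and pickled assets inside the installed package. The toolkit itself resolves these files with `os.path` relative to each module's `__file__` (as in `certainty.py` below); the snippet that follows is only an illustrative sketch of how a downstream script could locate the same packaged files after a standard install. The file names mirror the entries listed above, and nothing here is part of the toolkit's public API.

```python
# Illustrative sketch (not part of the toolkit's API): locating the packaged
# lexicon files declared in [tool.setuptools.package-data] after installation.
from importlib.resources import files

lexicon_dir = files("team_comm_tools") / "features" / "lexicons"
dale_chall_path = lexicon_dir / "dale_chall.txt"  # listed in pyproject.toml above

with dale_chall_path.open("r", encoding="utf-8") as f:
    dale_chall_words = [line.strip() for line in f if line.strip()]

print(f"Loaded {len(dale_chall_words)} Dale-Chall words")
```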
Binary file added src/team_comm_tools/features/assets/certainty.pkl
Binary file not shown.
22 changes: 11 additions & 11 deletions src/team_comm_tools/features/certainty.py
@@ -7,17 +7,6 @@

# Note: This feature requires the message WITH punctuation.

# parse certainty lexicon, compile into master regex, delimited by |
# Construct the absolute path to certainty.txt using the current script directory
current_dir = os.path.dirname(__file__)
certainty_file_pkl_path = os.path.join(current_dir, './lexicons/certainty.pkl')
certainty_file_pkl_path = os.path.abspath(certainty_file_pkl_path)
with open(certainty_file_pkl_path, 'rb') as f:
certainty_data = pickle.load(f) # Load pickled data
certainty = pd.read_csv(io.StringIO(certainty_data), sep = ",")
certainty = certainty.sort_values(["NumWords", "NumCharacters"], ascending=False)
master_regex = certainty["Word"].str.cat(sep='\\b|') + "\\b"

def get_certainty(chat):
""" Calculates a score of how "certain" a given expression is, using the Certainty Lexicon.
@@ -41,6 +30,17 @@ def get_certainty(chat):
float: The certainty score of the utterance.
"""

# parse certainty lexicon, compile into master regex, delimited by |
# Construct the absolute path to certainty.pkl using the current script directory
current_dir = os.path.dirname(__file__)
certainty_file_pkl_path = os.path.join(current_dir, './assets/certainty.pkl')
certainty_file_pkl_path = os.path.abspath(certainty_file_pkl_path)
with open(certainty_file_pkl_path, 'rb') as f:
certainty_data = pickle.load(f) # Load pickled data
certainty = pd.read_csv(io.StringIO(certainty_data), sep = ",")
certainty = certainty.sort_values(["NumWords", "NumCharacters"], ascending=False)
master_regex = certainty["Word"].str.cat(sep='\\b|') + "\\b"

# default certainty value is 4.5; aka a "neutral" statement in the event we don't find anything
DEFAULT_CERTAINTY = 4.5

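With this change, the certainty lexicon is read from `./assets/certainty.pkl` inside `get_certainty` at call time rather than at module import. Below is a minimal usage sketch from a hypothetical caller (not code from the repository); it assumes the packaged `assets/certainty.pkl` is present and relies on the documented behavior that 4.5 is the neutral default when nothing in the lexicon matches.

```python
# Hypothetical usage sketch: scoring a few utterances with get_certainty.
# Assumes team_comm_tools is installed with features/assets/certainty.pkl.
from team_comm_tools.features.certainty import get_certainty

messages = [
    "I am absolutely sure this will work.",  # may match certainty terms in the lexicon
    "asdf qwerty",                           # if nothing matches, the default 4.5 is returned
]

for msg in messages:
    print(f"{get_certainty(msg):.2f}  {msg}")
```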
31 changes: 17 additions & 14 deletions src/team_comm_tools/features/lexical_features_v2.py
@@ -42,17 +42,20 @@ def liwc_features(chat_df: pd.DataFrame, message_col) -> pd.DataFrame:
pd.DataFrame: Dataframe of the lexical features stacked as columns.
"""
# Load the preprocessed lexical regular expressions
current_dir = os.path.dirname(__file__)
lexicon_pkl_file_path = os.path.join(current_dir, './assets/lexicons_dict.pkl')
lexicon_pkl_file_path = os.path.abspath(lexicon_pkl_file_path)
with open(lexicon_pkl_file_path, "rb") as lexicons_pickle_file:
lexicons_dict = pickle.load(lexicons_pickle_file)

# Return the lexical features stacked as columns
return pd.concat(
# Finding the # of occurrences of lexicons of each type for all the messages.
[pd.DataFrame(chat_df[message_col + "_original"].apply(lambda chat: get_liwc_rate(regex, chat)))\
.rename({message_col + "_original": lexicon_type + "_lexical_per_100"}, axis=1)\
for lexicon_type, regex in lexicons_dict.items()],
axis=1
)
try:
current_dir = os.path.dirname(__file__)
lexicon_pkl_file_path = os.path.join(current_dir, './assets/lexicons_dict.pkl')
lexicon_pkl_file_path = os.path.abspath(lexicon_pkl_file_path)
with open(lexicon_pkl_file_path, "rb") as lexicons_pickle_file:
lexicons_dict = pickle.load(lexicons_pickle_file)

# Return the lexical features stacked as columns
return pd.concat(
# Finding the # of occurrences of lexicons of each type for all the messages.
[pd.DataFrame(chat_df[message_col + "_original"].apply(lambda chat: get_liwc_rate(regex, chat)))\
.rename({message_col + "_original": lexicon_type + "_lexical_per_100"}, axis=1)\
for lexicon_type, regex in lexicons_dict.items()],
axis=1
)
except:
print("WARNING: Lexicons not found. Skipping feature...")
98 changes: 0 additions & 98 deletions src/team_comm_tools/output/chat/jury_TINY_output_chat_level.csv

This file was deleted.

352 changes: 0 additions & 352 deletions src/team_comm_tools/output/chat/negotiation_pilot_02_07_24.csv

This file was deleted.

3 changes: 0 additions & 3 deletions src/team_comm_tools/output/chat/poke_politeness_chat.csv

This file was deleted.

