Merge pull request #255 from Watts-Lab/xehu/issue211

Close Issue #211
Watts-Lab · Jul 9, 2024 · 6fb1649 · 6fb1649
2 parents fa7abdc + 41b5bc7
commit 6fb1649
Show file tree

Hide file tree

Showing 4 changed files with 77 additions and 11 deletions.
diff --git a/feature_engine/feature_builder.py b/feature_engine/feature_builder.py
@@ -91,6 +91,13 @@ def __init__(
  self.output_file_path_conv_level = output_file_path_conv_level
  self.output_file_path_user_level = output_file_path_user_level
 
+ # Basic error detection
+ # user didn't specify a file name, or specified one with only nonalphanumeric chars
+ if not bool(self.output_file_path_conv_level) or not bool(re.sub('[^A-Za-z0-9_]', '', self.output_file_path_conv_level)):
+ raise ValueError("ERROR: Improper conversation-level output file name detected.")
+ if not bool(self.output_file_path_user_level) or not bool(re.sub('[^A-Za-z0-9_]', '', self.output_file_path_user_level)):
+ raise ValueError("ERROR: Improper user (speaker)-level output file name detected.")
+
  # Set first pct of conversation you want to analyze
  assert(all(0 <= x <= 1 for x in analyze_first_pct)) # first, type check that this is a list of numbers between 0 and 1
  self.first_pct = analyze_first_pct
@@ -147,12 +154,59 @@ def __init__(
  df_type = df_type + "/cumulative/within_task/"
  df_type = df_type + "/cumulative/"
 
- ## TODO: the FeatureBuilder assumes that we are passing in an output file path that contains either "chat" or "turn"
- ### in the name, as it saves the featurized content into either a "chat" folder or "turn" folder based on user
- ### specifications. See: https://github.com/Watts-Lab/team-process-map/issues/211
- self.output_file_path_chat_level = re.sub('chat', 'turn', output_file_path_chat_level) if self.turns else output_file_path_chat_level
+ """
+ File path cleanup and assumptions:
+ -----
+ - By design, we save data into a folder called 'output/' (and add it if not already present in the path)
+ - Within 'output/', we save data within the following subfolders:
+ - chat/ for chat-level data
+ - turn/ for turn-level data
+ - conv/ for conversation-level data
+ - user/ for user-level data
+ - We always output files as a csv (and add '.csv' if not present)
+ - We consider the "base file name" to be the file name of the chat-level data, and we use this to name the file
+ containing the vector encodings
+ - The inputted file name must be a valid, non-empty string
+ - The inputted file name must not contain only special characters with no alphanumeric component
+ """
  # We assume that the base file name is the last item in the output path; we will use this to name the stored vectors.
- base_file_name = self.output_file_path_chat_level.split("/")[-1]
+ try:
+ base_file_name = output_file_path_chat_level.split("/")[-1]
+ except:
+ raise ValueError("ERROR: Improper chat-level output file name detected.") 
+
+ if not bool(base_file_name) or not bool(re.sub('[^A-Za-z0-9_]', '', base_file_name)): # user didn't specify a file name, or specified one with only nonalphanumeric chars
+ raise ValueError("ERROR: Improper chat-level output file name detected.")
+
+ try:
+ folder_type_name = output_file_path_chat_level.split("/")[-2]
+ except IndexError: # user didn't specify a folder, so we will have to append it for them
+ folder_type_name = "turn" if self.turns else "chat"
+ output_file_path_chat_level = '/'.join(output_file_path_chat_level.split("/")[:-1]) + '/' + folder_type_name + '/' + base_file_name
+
+ # We check whether the second to last item is a "folder type": either chat or turn.
+ if folder_type_name not in ["chat", "turn"]: # user didn't specify the folder type, so we will append it for them
+ folder_type_name = "turn" if self.turns else "chat"
+ output_file_path_chat_level = '/'.join(output_file_path_chat_level.split("/")[:-1]) + '/' + folder_type_name + '/' + base_file_name
+
+ # Set file paths, ensuring correct subfolder type is added.
+ self.output_file_path_chat_level = re.sub('chat', 'turn', output_file_path_chat_level) if self.turns else output_file_path_chat_level
+ if self.output_file_path_chat_level.split(".")[-1] != "csv": self.output_file_path_chat_level = self.output_file_path_chat_level + ".csv"
+ if not re.match("(.*\/|^)conv\/", self.output_file_path_conv_level):
+ self.output_file_path_conv_level = "/".join(self.output_file_path_conv_level.split("/")[:-1]) + "/conv/" + self.output_file_path_conv_level.split("/")[-1]
+ if self.output_file_path_conv_level.split(".")[-1] != "csv": self.output_file_path_conv_level = self.output_file_path_conv_level + ".csv"
+ if not re.match("(.*\/|^)user\/", self.output_file_path_user_level):
+ self.output_file_path_user_level = "/".join(self.output_file_path_user_level.split("/")[:-1]) + "/user/" + self.output_file_path_user_level.split("/")[-1]
+ if self.output_file_path_user_level.split(".")[-1] != "csv": self.output_file_path_user_level = self.output_file_path_user_level + ".csv"
+
+ # Ensure output/ is added before the subfolder.
+ if not re.match("(.*\/|^)output\/", self.output_file_path_chat_level):
+ self.output_file_path_chat_level = re.sub('/' + folder_type_name + '/', '/output/' + folder_type_name + '/', self.output_file_path_chat_level)
+ if not re.match("(.*\/|^)output\/", self.output_file_path_conv_level):
+ self.output_file_path_conv_level = re.sub('/conv/', '/output/conv/', self.output_file_path_conv_level)
+ if not re.match("(.*\/|^)output\/", self.output_file_path_user_level):
+ self.output_file_path_user_level = re.sub('/user/', '/output/user/', self.output_file_path_user_level)
+
  self.vect_path = vector_directory + "sentence/" + ("turns" if self.turns else "chats") + "/" + base_file_name
  self.bert_path = vector_directory + "sentiment/" + ("turns" if self.turns else "chats") + "/" + base_file_name
 

diff --git a/feature_engine/featurize.py b/feature_engine/featurize.py
@@ -27,7 +27,7 @@
  csopII_df = pd.read_csv("../feature_engine/tpm-data/cleaned_data/csopII_conversations_withblanks.csv", encoding='utf-8')
 
  # TINY / TEST DATASETS -------------------------------#
- 
+
  # Tiny Juries
  tiny_juries_feature_builder = FeatureBuilder(
  input_df = tiny_juries_df,

diff --git a/feature_engine/testing/run_package_grouping_tests.py b/feature_engine/testing/run_package_grouping_tests.py
@@ -28,17 +28,17 @@
  Here, we use a test dataset that has a different conversation ID, speaker ID, message column, and timestamp
  column compared to the defaults, and ensure that nothing breaks.
  """
- print("TESTING CASE 1 ......")
+ print("TESTING CASE 1 + FILE PATH ROBUSTNESS ......")
  testing_package_task_1 = FeatureBuilder(
  input_df = tiny_multi_task_renamed_df,
  conversation_id_col = "roundId",
  speaker_id_col = "speakerId",
  message_col = "text",
  timestamp_col = "time",
  vector_directory = "../tpm-data/vector_data/",
- output_file_path_chat_level = "../output/chat/tiny_multi_task_PT1_level_chat.csv",
- output_file_path_user_level = "../output/user/tiny_multi_task_PT1_level_user.csv",
- output_file_path_conv_level = "../output/conv/tiny_multi_task_PT1_level_conv.csv",
+ output_file_path_chat_level = "../tiny_multi_task_PT1_level_chat",
+ output_file_path_user_level = "../tiny_multi_task_PT1_level_user",
+ output_file_path_conv_level = "../tiny_multi_task_PT1_level_conv",
  turns = False,
  )
  testing_package_task_1.featurize(col="message")

diff --git a/feature_engine/testing/test_package.py b/feature_engine/testing/test_package.py
@@ -6,16 +6,28 @@
 
 
 input_data = pd.read_csv("data/cleaned_data/multi_task_TINY_cols_renamed.csv", encoding='utf-8')
-case1_chatdf = pd.read_csv("../output/chat/tiny_multi_task_PT1_level_chat.csv")
+case1_chatdf = None # starts out as None as reading this file in is a test unto itself!
 case2_chatdf = pd.read_csv("../output/chat/tiny_multi_task_case2_level_chat.csv")
 case3a_chatdf = pd.read_csv("../output/chat/tiny_multi_task_case3a_level_chat.csv")
 case3b_chatdf = pd.read_csv("../output/chat/tiny_multi_task_case3b_level_chat.csv")
 case3c_chatdf = pd.read_csv("../output/chat/tiny_multi_task_case3c_level_chat.csv")
 impropercase_chatdf = pd.read_csv("../output/chat/tiny_multi_task_improper_level_chat.csv")
 
+def test_path_robustness():
+ # Path-robustness check: attempt to read the Case 1 chat-level CSV from "../output/chat/".
+ # case 1 was specified without the necessary 'output/', 'chat/', and '.csv' in its path. Ensure it works!
+ try:
+ # NOTE(review): no `global` statement is visible in this function, so this assignment binds a
+ # function-local name rather than the module-level `case1_chatdf` — confirm that is intended.
+ case1_chatdf = pd.read_csv("../output/chat/tiny_multi_task_PT1_level_chat.csv")
+ except:
+ # Bare except: any exception raised by the read lands here, not only a missing file.
+ # Append (mode 'a') a failure entry to test.log.
+ with open('test.log', 'a') as file:
+ file.write("\n")
+ file.write("------TEST FAILED------\n")
+ file.write(f"Case 1 file (../output/chat/tiny_multi_task_PT1_level_chat.csv) not found: Path robustness test failed.\n")
+ # Bare raise: re-raises the exception caught above so it propagates to the caller.
+ raise
+
 def test_case_1():
 
  try:
+ case1_chatdf = pd.read_csv("../output/chat/tiny_multi_task_PT1_level_chat.csv")
  # Case 1 should have the same number of rows as the input df
  assert(input_data.shape[0] == case1_chatdf.shape[0])
  except AssertionError: