Skip to content

Commit

Permalink
Merge pull request #255 from Watts-Lab/xehu/issue211
Browse files Browse the repository at this point in the history
Close Issue #211
  • Loading branch information
xehu committed Jul 9, 2024
2 parents fa7abdc + 41b5bc7 commit 6fb1649
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 11 deletions.
64 changes: 59 additions & 5 deletions feature_engine/feature_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,13 @@ def __init__(
self.output_file_path_conv_level = output_file_path_conv_level
self.output_file_path_user_level = output_file_path_user_level

# Basic error detection
# user didn't specify a file name, or specified one with only nonalphanumeric chars
if not bool(self.output_file_path_conv_level) or not bool(re.sub('[^A-Za-z0-9_]', '', self.output_file_path_conv_level)):
raise ValueError("ERROR: Improper conversation-level output file name detected.")
if not bool(self.output_file_path_user_level) or not bool(re.sub('[^A-Za-z0-9_]', '', self.output_file_path_user_level)):
raise ValueError("ERROR: Improper user (speaker)-level output file name detected.")

# Set first pct of conversation you want to analyze
assert(all(0 <= x <= 1 for x in analyze_first_pct)) # first, type check that this is a list of numbers between 0 and 1
self.first_pct = analyze_first_pct
Expand Down Expand Up @@ -147,12 +154,59 @@ def __init__(
df_type = df_type + "/cumulative/within_task/"
df_type = df_type + "/cumulative/"

## TODO: the FeatureBuilder assumes that we are passing in an output file path that contains either "chat" or "turn"
### in the name, as it saves the featurized content into either a "chat" folder or "turn" folder based on user
### specifications. See: https://github.com/Watts-Lab/team-process-map/issues/211
self.output_file_path_chat_level = re.sub('chat', 'turn', output_file_path_chat_level) if self.turns else output_file_path_chat_level
"""
File path cleanup and assumptions:
-----
- By design, we save data into a folder called 'output/' (and add it if not already present in the path)
- Within 'output/', we save data within the following subfolders:
- chat/ for chat-level data
- turn/ for turn-level data
- conv/ for conversation-level data
- user/ for user-level data
- We always output files as a csv (and add '.csv' if not present)
- We consider the "base file name" to be the file name of the chat-level data, and we use this to name the file
containing the vector encodings
- The inputted file name must be a valid, non-empty string
- The inputted file name must not contain only special characters with no alphanumeric component
"""
# We assume that the base file name is the last item in the output path; we will use this to name the stored vectors.
base_file_name = self.output_file_path_chat_level.split("/")[-1]
try:
base_file_name = output_file_path_chat_level.split("/")[-1]
except:
raise ValueError("ERROR: Improper chat-level output file name detected.")

if not bool(base_file_name) or not bool(re.sub('[^A-Za-z0-9_]', '', base_file_name)): # user didn't specify a file name, or specified one with only nonalphanumeric chars
raise ValueError("ERROR: Improper chat-level output file name detected.")

try:
folder_type_name = output_file_path_chat_level.split("/")[-2]
except IndexError: # user didn't specify a folder, so we will have to append it for them
folder_type_name = "turn" if self.turns else "chat"
output_file_path_chat_level = '/'.join(output_file_path_chat_level.split("/")[:-1]) + '/' + folder_type_name + '/' + base_file_name

# We check whether the second to last item is a "folder type": either chat or turn.
if folder_type_name not in ["chat", "turn"]: # user didn't specify the folder type, so we will append it for them
folder_type_name = "turn" if self.turns else "chat"
output_file_path_chat_level = '/'.join(output_file_path_chat_level.split("/")[:-1]) + '/' + folder_type_name + '/' + base_file_name

# Set file paths, ensuring correct subfolder type is added.
self.output_file_path_chat_level = re.sub('chat', 'turn', output_file_path_chat_level) if self.turns else output_file_path_chat_level
if self.output_file_path_chat_level.split(".")[-1] != "csv": self.output_file_path_chat_level = self.output_file_path_chat_level + ".csv"
if not re.match("(.*\/|^)conv\/", self.output_file_path_conv_level):
self.output_file_path_conv_level = "/".join(self.output_file_path_conv_level.split("/")[:-1]) + "/conv/" + self.output_file_path_conv_level.split("/")[-1]
if self.output_file_path_conv_level.split(".")[-1] != "csv": self.output_file_path_conv_level = self.output_file_path_conv_level + ".csv"
if not re.match("(.*\/|^)user\/", self.output_file_path_user_level):
self.output_file_path_user_level = "/".join(self.output_file_path_user_level.split("/")[:-1]) + "/user/" + self.output_file_path_user_level.split("/")[-1]
if self.output_file_path_user_level.split(".")[-1] != "csv": self.output_file_path_user_level = self.output_file_path_user_level + ".csv"

# Ensure output/ is added before the subfolder.
if not re.match("(.*\/|^)output\/", self.output_file_path_chat_level):
self.output_file_path_chat_level = re.sub('/' + folder_type_name + '/', '/output/' + folder_type_name + '/', self.output_file_path_chat_level)
if not re.match("(.*\/|^)output\/", self.output_file_path_conv_level):
self.output_file_path_conv_level = re.sub('/conv/', '/output/conv/', self.output_file_path_conv_level)
if not re.match("(.*\/|^)output\/", self.output_file_path_user_level):
self.output_file_path_user_level = re.sub('/user/', '/output/user/', self.output_file_path_user_level)

self.vect_path = vector_directory + "sentence/" + ("turns" if self.turns else "chats") + "/" + base_file_name
self.bert_path = vector_directory + "sentiment/" + ("turns" if self.turns else "chats") + "/" + base_file_name

Expand Down
2 changes: 1 addition & 1 deletion feature_engine/featurize.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
csopII_df = pd.read_csv("../feature_engine/tpm-data/cleaned_data/csopII_conversations_withblanks.csv", encoding='utf-8')

# TINY / TEST DATASETS -------------------------------#

# Tiny Juries
tiny_juries_feature_builder = FeatureBuilder(
input_df = tiny_juries_df,
Expand Down
8 changes: 4 additions & 4 deletions feature_engine/testing/run_package_grouping_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,17 +28,17 @@
Here, we use a test dataset that has a different conversation ID, speaker ID, message column, and timestamp
column compared to the defaults, and ensure that nothing breaks.
"""
print("TESTING CASE 1 ......")
print("TESTING CASE 1 + FILE PATH ROBUSTNESS ......")
testing_package_task_1 = FeatureBuilder(
input_df = tiny_multi_task_renamed_df,
conversation_id_col = "roundId",
speaker_id_col = "speakerId",
message_col = "text",
timestamp_col = "time",
vector_directory = "../tpm-data/vector_data/",
output_file_path_chat_level = "../output/chat/tiny_multi_task_PT1_level_chat.csv",
output_file_path_user_level = "../output/user/tiny_multi_task_PT1_level_user.csv",
output_file_path_conv_level = "../output/conv/tiny_multi_task_PT1_level_conv.csv",
output_file_path_chat_level = "../tiny_multi_task_PT1_level_chat",
output_file_path_user_level = "../tiny_multi_task_PT1_level_user",
output_file_path_conv_level = "../tiny_multi_task_PT1_level_conv",
turns = False,
)
testing_package_task_1.featurize(col="message")
Expand Down
14 changes: 13 additions & 1 deletion feature_engine/testing/test_package.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,28 @@


input_data = pd.read_csv("data/cleaned_data/multi_task_TINY_cols_renamed.csv", encoding='utf-8')
case1_chatdf = pd.read_csv("../output/chat/tiny_multi_task_PT1_level_chat.csv")
case1_chatdf = None # starts out as None as reading this file in is a test unto itself!
case2_chatdf = pd.read_csv("../output/chat/tiny_multi_task_case2_level_chat.csv")
case3a_chatdf = pd.read_csv("../output/chat/tiny_multi_task_case3a_level_chat.csv")
case3b_chatdf = pd.read_csv("../output/chat/tiny_multi_task_case3b_level_chat.csv")
case3c_chatdf = pd.read_csv("../output/chat/tiny_multi_task_case3c_level_chat.csv")
impropercase_chatdf = pd.read_csv("../output/chat/tiny_multi_task_improper_level_chat.csv")

def test_path_robustness():
# case 1 was specified without the necessary 'output/', 'chat/', and '.csv' in its path. Ensure it works!
try:
case1_chatdf = pd.read_csv("../output/chat/tiny_multi_task_PT1_level_chat.csv")
except:
with open('test.log', 'a') as file:
file.write("\n")
file.write("------TEST FAILED------\n")
file.write(f"Case 1 file (../output/chat/tiny_multi_task_PT1_level_chat.csv) not found: Path robustness test failed.\n")
raise

def test_case_1():

try:
case1_chatdf = pd.read_csv("../output/chat/tiny_multi_task_PT1_level_chat.csv")
# Case 1 should have the same number of rows as the input df
assert(input_data.shape[0] == case1_chatdf.shape[0])
except AssertionError:
Expand Down

0 comments on commit 6fb1649

Please sign in to comment.