package aggregation

Watts-Lab · Sep 11, 2024 · a65eb05 · a65eb05
1 parent dc78b09
commit a65eb05
Show file tree

Hide file tree

Showing 7 changed files with 740 additions and 175 deletions.
diff --git a/examples/featurize.py b/examples/featurize.py
@@ -37,29 +37,84 @@
 
 	"""
 
-	# Tiny Juries
-	tiny_juries_feature_builder = FeatureBuilder(
+	# # Tiny Juries with base features
+	# print("Basic Tiny Juries...")
+	# tiny_juries_feature_builder_basic = FeatureBuilder(
+	# 	input_df = tiny_juries_df,
+	# 	grouping_keys = ["batch_num", "round_num"],
+	# 	vector_directory = "./vector_data/",
+	# 	output_file_path_chat_level = "./jury_TINY_output_chat_level.csv",
+	# 	output_file_path_user_level = "./jury_TINY_output_user_level.csv",
+	# 	output_file_path_conv_level = "./jury_TINY_output_conversation_level.csv",
+	# 	turns = False,
+	# )
+	# tiny_juries_feature_builder_basic.featurize(col="message")
+
+	# # Tiny Juries with custom features
+	# print("Tiny Juries with all Custom SBERT-Dependent Features...")
+	# tiny_juries_feature_builder_custom = FeatureBuilder(
+	# 	input_df = tiny_juries_df,
+	# 	grouping_keys = ["batch_num", "round_num"],
+	# 	vector_directory = "./vector_data/",
+	# 	output_file_path_chat_level = "./jury_TINY_output_chat_level_custom.csv",
+	# 	output_file_path_user_level = "./jury_TINY_output_user_level_custom.csv",
+	# 	output_file_path_conv_level = "./jury_TINY_output_conversation_level_custom.csv",
+	# 	custom_features = [ # This will generate SBERT vectors and compute the following features, which depend on the SBERT vectors.
+    #         "(BERT) Mimicry",
+    #         "Moving Mimicry",
+    #         "Forward Flow",
+    #         "Discursive Diversity"],
+	# 	turns = False,
+	# )
+	# tiny_juries_feature_builder_custom.featurize(col="message")
+
+	# # Tiny Juries with NO aggregations
+	# print("Tiny Juries with No Aggregation...")
+	# tiny_juries_feature_builder_no_aggregation = FeatureBuilder(
+	# 	input_df = tiny_juries_df,
+	# 	grouping_keys = ["batch_num", "round_num"],
+	# 	vector_directory = "./vector_data/",
+	# 	output_file_path_chat_level = "./jury_TINY_output_chat_level_no_agg.csv",
+	# 	output_file_path_user_level = "./jury_TINY_output_user_level_no_agg.csv",
+	# 	output_file_path_conv_level = "./jury_TINY_output_conversation_level_no_agg.csv",
+	# 	convo_aggregation = False, # This will turn all aggregations OFF.
+	# 	user_aggregation = False,
+	# 	turns = False,
+	# )
+	# tiny_juries_feature_builder_no_aggregation.featurize(col="message")
+
+	# Tiny Juries with custom Aggregations
+	print("Tiny Juries with Custom Aggregation...")
+	tiny_juries_feature_builder_custom_aggregation = FeatureBuilder(
 		input_df = tiny_juries_df,
 		grouping_keys = ["batch_num", "round_num"],
 		vector_directory = "./vector_data/",
-		output_file_path_chat_level = "./jury_TINY_output_chat_level.csv",
-		output_file_path_user_level = "./jury_TINY_output_user_level.csv",
-		output_file_path_conv_level = "./jury_TINY_output_conversation_level.csv",
-		turns = False
+		output_file_path_chat_level = "./jury_TINY_output_chat_level_custom_agg.csv",
+		output_file_path_user_level = "./jury_TINY_output_user_level_custom_agg.csv",
+		output_file_path_conv_level = "./jury_TINY_output_conversation_level_custom_agg.csv",
+		convo_methods = ['mean'], # This will aggregate ONLY "positive_bert" at the conversation level, using mean; at the speaker/user level, it will aggregate "positive_bert" and "negative_bert", using mean and max.
+		convo_columns = ['positive_bert'],
+		user_methods = ['mean', 'max'],
+		user_columns = ['positive_bert', 'negative_bert'],
+		# user_methods = ['max'],
+		# user_columns = ['negative_bert'],
+		turns = False,
 	)
-	tiny_juries_feature_builder.featurize(col="message")
+	tiny_juries_feature_builder_custom_aggregation.featurize(col="message")	
 
-	# Tiny multi-task
-	tiny_multi_task_feature_builder = FeatureBuilder(
-		input_df = tiny_multi_task_df,
-		conversation_id_col = "stageId",
-		vector_directory = "./vector_data/",
-		output_file_path_chat_level = "./multi_task_TINY_output_chat_level_stageId_cumulative.csv",
-		output_file_path_user_level = "./multi_task_TINY_output_user_level_stageId_cumulative.csv",
-		output_file_path_conv_level = "./multi_task_TINY_output_conversation_level_stageId_cumulative.csv",
-		turns = False
-	)
-	tiny_multi_task_feature_builder.featurize(col="message")
+
+	# # Tiny multi-task
+	# print("Tiny Multi-Task...")
+	# tiny_multi_task_feature_builder = FeatureBuilder(
+	# 	input_df = tiny_multi_task_df,
+	# 	conversation_id_col = "stageId",
+	# 	vector_directory = "./vector_data/",
+	# 	output_file_path_chat_level = "./multi_task_TINY_output_chat_level_stageId_cumulative.csv",
+	# 	output_file_path_user_level = "./multi_task_TINY_output_user_level_stageId_cumulative.csv",
+	# 	output_file_path_conv_level = "./multi_task_TINY_output_conversation_level_stageId_cumulative.csv",
+	# 	turns = False
+	# )
+	# tiny_multi_task_feature_builder.featurize(col="message")
 
 	# FULL DATASETS BELOW ------------------------------------- #