# This script does feature extraction AND linear regression for plotting.
# See main_FeatureExtractionInpatient_JustLinReg.py if feature extraction is complete, and you just need linear regression!
PAT_NOW = "S24_224"
PAT_SHORT_NAME = "S_224"
print(f'[LOG] Patient Now: {PAT_NOW}')
MOOD_TRACKING_SHEET_PATH = '/home/jgopal/NAS/Analysis/AudioFacialEEG/Behavioral Labeling/Mood_Tracking.xlsx'
BEHAVIORAL_LABELS_SHEET_PATH = '/home/jgopal/NAS/Analysis/AudioFacialEEG/Behavioral Labeling/Behavior_Labeling.xlsx'
VIDEO_TIMESTAMPS_SHEET_PATH = f'/home/jgopal/NAS/Analysis/AudioFacialEEG/Behavioral Labeling/videoDateTimes/VideoDatetimes{PAT_SHORT_NAME[1:]}.xlsx'
OPENFACE_OUTPUT_DIRECTORY = f'/home/jgopal/NAS/Analysis/outputs_OpenFace/{PAT_NOW}/'
COMBINED_OUTPUT_DIRECTORY = f'/home/jgopal/NAS/Analysis/outputs_Combined/{PAT_NOW}/'
RUNTIME_VAR_PATH = '/home/jgopal/NAS/Analysis/AudioFacialEEG/Runtime_Vars/'
RESULTS_PATH_BASE = f'/home/jgopal/NAS/Analysis/AudioFacialEEG/Results/{PAT_SHORT_NAME}/'
FEATURE_VIS_PATH = f'/home/jgopal/NAS/Analysis/AudioFacialEEG/Feature_Visualization/{PAT_SHORT_NAME}/'
FEATURE_LABEL_PATH = '/home/jgopal/NAS/Analysis/AudioFacialEEG/Feature_Labels/'
QC_PATH = '/home/jgopal/NAS/Analysis/AudioFacialEEG/Quality_Control/'
EMO_FEATURE_SETTING = 2
# 0 - Our Custom AU --> Emotions, with all emotions
# 1 - Our Custom AU --> Emotions, with just OpenDBM's emotions
# 2 - OpenDBM's AU --> Emotions
STATS_FEATURE_SETTING = 3
# 0 - Our new features (including autocorrelation, kurtosis, etc.)
# 1 - Our new features, excluding extras like autocorrelation and kurtosis
# 2 - Just pres_pct
# 3 - Our new features, excluding extras. Do NOT threshold AUs before computing metrics. HSE gets 5 event features. OGAU gets num events and presence percent.
NORMALIZE_DATA = 0
# 0 - No time series normalization
# 1 - Yes time series normalization (for each time window)
import os
import warnings

import numpy as np
import pandas as pd

# Silence pandas chained-assignment warnings, then ignore all remaining warnings
pd.set_option('mode.chained_assignment', None)
warnings.filterwarnings('ignore')
# SAVE VARIABLES
import pickle
def get_var_name(our_variable):
namespace = globals()
for name, obj in namespace.items():
if obj is our_variable:
return name
return None
# Save the dictionary to a file using pickle
def save_var(our_variable, RUNTIME_VAR_PATH=RUNTIME_VAR_PATH, forced_name=None):
if forced_name is None:
name_now = get_var_name(our_variable)
else:
name_now = forced_name
# Construct the full path including the file name
full_path = os.path.join(RUNTIME_VAR_PATH, f'{name_now}.pkl')
# Ensure the directory exists, including any nested folders in name_now
os.makedirs(os.path.dirname(full_path), exist_ok=True)
# Save the variable
with open(full_path, 'wb') as file:
pickle.dump(our_variable, file)
def load_var(variable_name, RUNTIME_VAR_PATH=RUNTIME_VAR_PATH):
    # Load from the file (os.path.join, to mirror save_var)
    with open(os.path.join(RUNTIME_VAR_PATH, f'{variable_name}.pkl'), 'rb') as file:
        return pickle.load(file)
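# Minimal usage sketch of the save/load pair above (hypothetical variable name):
#   metrics = {'auc': 0.91}
#   save_var(metrics, forced_name='metrics_demo')  # -> RUNTIME_VAR_PATH/metrics_demo.pkl
#   restored = load_var('metrics_demo')            # restored == metrics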
print('[LOG] Starter Functions Defined')
df = pd.read_excel(MOOD_TRACKING_SHEET_PATH, sheet_name=f'{PAT_SHORT_NAME}')
## Preprocess the mood tracking sheet
# Replace the P_number mood headers with just the mood
# df.columns = df.columns.str.replace('P[0-9]+ ', '')
# Properly deal with the missing values
df = df.replace('', np.nan).replace(' ', np.nan).fillna(value=np.nan)
df_moodTracking = df
df_moodTracking = df_moodTracking.drop(columns=['Notes'], errors='ignore')
df_moodTracking['Datetime'] = pd.to_datetime(df_moodTracking['Datetime']).dt.strftime('%-m/%-d/%Y %H:%M:%S')  # note: '%-m' / '%-d' (no zero-padding) are POSIX-only strftime codes
# create lists to hold the positive and negative affect items
pos_items = [1, 3, 5, 9, 10, 12, 14, 16, 17, 19]
neg_items = [2, 4, 6, 7, 8, 11, 13, 15, 18, 20]
# get all columns that start with 'P' and split them into pos and neg groups
P_cols = [col for col in df_moodTracking.columns if col.startswith('P') and not(col.startswith('Pain')) and not(col.startswith('PANAS')) and not(col.startswith('Positive'))]
pos_cols = [col for col in P_cols if int(col[1:3]) in pos_items]
neg_cols = [col for col in P_cols if int(col[1:3]) in neg_items]
# create new columns for the summed scores
df_moodTracking['Positive Affect Score'] = df_moodTracking[pos_cols].fillna(0).astype(int).sum(axis=1, skipna=True)
df_moodTracking['Negative Affect Score'] = df_moodTracking[neg_cols].fillna(0).astype(int).sum(axis=1, skipna=True)
df_moodTracking['Overall Affect Score'] = df_moodTracking[['Positive Affect Score', 'Negative Affect Score']].fillna(0).astype(int).sum(axis=1, skipna=True)
# replace 0s with NaNs in all three score columns (a 0 means no items were answered)
df_moodTracking[['Positive Affect Score', 'Negative Affect Score', 'Overall Affect Score']] = \
df_moodTracking[['Positive Affect Score', 'Negative Affect Score', 'Overall Affect Score']].replace(0, np.nan)
# drop the original P columns used to create the scores
df_moodTracking.drop(columns=pos_cols + neg_cols, inplace=True)
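# Sanity checks on the PANAS bookkeeping above (toy values, not a real sheet):
# a header like 'P03 Interested' parses to item 3, a positive-affect item, and
# a respondent rating every positive item 3 scores 30 before the 0 -> NaN step.
assert int('P03 Interested'[1:3]) in pos_items
assert sum([3] * len(pos_items)) == 30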
from sklearn.preprocessing import MinMaxScaler, StandardScaler
def normalize_columns(df, method=1):
# Create a copy of the DataFrame
normalized_df = df.copy()
# Get the column names excluding 'Datetime'
columns_to_normalize = [col for col in normalized_df.columns if col != 'Datetime']
if method == 1:
# No scaling or normalization
pass
elif method == 2:
# MinMax scaling to range 0 to 10
scaler = MinMaxScaler(feature_range=(0, 10))
normalized_df[columns_to_normalize] = scaler.fit_transform(normalized_df[columns_to_normalize])
elif method == 3:
# MinMax scaling to range 0 to 1
scaler = MinMaxScaler(feature_range=(0, 1))
normalized_df[columns_to_normalize] = scaler.fit_transform(normalized_df[columns_to_normalize])
elif method == 4:
# Log scaling
normalized_df[columns_to_normalize] = normalized_df[columns_to_normalize].astype(float)
normalized_df[columns_to_normalize] = np.log1p(normalized_df[columns_to_normalize])
elif method == 5:
# Standard normalization (Z-score normalization)
scaler = StandardScaler()
normalized_df[columns_to_normalize] = scaler.fit_transform(normalized_df[columns_to_normalize])
else:
raise ValueError("Invalid method. Choose a value between 1 and 5.")
return normalized_df
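# A minimal sketch of method=2 on toy values (not patient data): a 0-50 column
# rescales onto 0-10, so the midpoint 25 maps to 5.0.
_demo_df = normalize_columns(pd.DataFrame({'Datetime': ['t0', 't1', 't2'],
                                           'Mood': [0, 25, 50]}), method=2)
assert list(_demo_df['Mood']) == [0.0, 5.0, 10.0]
del _demo_df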
df_moodTracking = normalize_columns(df_moodTracking, method=2)
if PAT_SHORT_NAME == 'S_214':
df_moodTracking = df_moodTracking.drop(1).reset_index(drop=True)
df_videoTimestamps = pd.read_excel(VIDEO_TIMESTAMPS_SHEET_PATH, sheet_name=f'VideoDatetimes_{PAT_SHORT_NAME.split("_")[-1]}')
df_videoTimestamps['Filename'] = df_videoTimestamps['Filename'].str.replace('.m2t', '')
if PAT_SHORT_NAME == 'S_199':
# There's no H01 video, so let's drop that filename
df_videoTimestamps = df_videoTimestamps.drop(211)
print('[LOG] Labels Processed')
# Check for any missing videos!
def print_difference(list1, list2):
for item in list1:
if item not in list2:
print(item)
filenames_master_list = list(df_videoTimestamps['Filename'].values)
filenames_we_have = [i[:-4] for i in os.listdir(COMBINED_OUTPUT_DIRECTORY)]  # strip the '.mp4' extension
print_difference(filenames_master_list, filenames_we_have)
# DICTIONARY OF SEPARATE DFS
def get_dict_openface(output_dir):
# Create an empty dictionary to hold the DataFrames
dfs_openface = {}
# Get a list of all the CSV files in the directory
csv_files = sorted([f for f in os.listdir(output_dir) if f.endswith('.csv')])
# List of columns to keep
columns_to_keep = [
'frame', 'timestamp', 'success',
'AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r',
'AU09_r', 'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r',
'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r', 'AU45_r',
'AU01_c', 'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c',
'AU09_c', 'AU10_c', 'AU12_c', 'AU14_c', 'AU15_c', 'AU17_c',
'AU20_c', 'AU23_c', 'AU25_c', 'AU26_c', 'AU45_c'
]
failed_files = []
# Loop through the CSV files
for csv_file in csv_files:
try:
# Load data into a pandas DataFrame
csv_file_path = os.path.join(output_dir, csv_file)
df_temp = pd.read_csv(csv_file_path)
df_temp.columns = df_temp.columns.str.strip()
# Keep every 6th row so it's 5 fps!
X = 6
df_temp = df_temp[df_temp.index % X == 0]
# Filter DataFrame to keep only columns in list
df_temp = df_temp.loc[:, columns_to_keep]
# Fix column names to not have leading or trailing spaces
df_temp = df_temp.rename(columns=lambda x: x.strip())
# Store the DataFrame in the dictionary with the csv file name as the key
# Remove the '.csv' by doing csv_file[:-4]
dfs_openface[csv_file[:-4]] = df_temp
except Exception as e:
print(f"Failed to load {csv_file}: {str(e)}")
failed_files.append(csv_file)
if failed_files:
raise Exception(f"Errors occurred while processing the following files: {', '.join(failed_files)}")
return dfs_openface
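# The modulo-6 decimation above assumes the source video is 30 fps, so keeping
# every 6th row yields 5 fps. Quick check on a toy frame index (not real data):
assert len([i for i in range(1800) if i % 6 == 0]) == 300  # 60 s at 30 fps -> 5 fps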
def get_dict_openface_extras(output_dir):
# Create an empty dictionary to hold the DataFrames
dfs_openface = {}
# Get a list of all the CSV files in the directory
csv_files = sorted([f for f in os.listdir(output_dir) if f.endswith('.csv')])
# list of columns to keep
columns_to_keep = ['frame', ' timestamp', ' success',
'gaze_0_x',
'gaze_0_y',
'gaze_0_z',
'gaze_1_x',
'gaze_1_y',
'gaze_1_z',
'pose_Tx',
'pose_Ty',
'pose_Tz',
'pose_Rx',
'pose_Ry',
'pose_Rz']
columns_to_keep = columns_to_keep + [f"eye_lmk_X_{i}" for i in range(56)] + [f"eye_lmk_Y_{i}" for i in range(56)] + [f"eye_lmk_Z_{i}" for i in range(56)]
columns_to_keep = columns_to_keep + [f"X_{i}" for i in range(68)] + [f"Y_{i}" for i in range(68)] + [f"Z_{i}" for i in range(68)]
    # strip the leading spaces some OpenFace header names carry
    columns_to_keep = [one_str.replace(' ', '') for one_str in columns_to_keep]
# Loop through the CSV files
for csv_file in csv_files:
# Load data into a pandas df
csv_file_path = os.path.join(output_dir, csv_file)
df_temp = pd.read_csv(csv_file_path)
df_temp.columns = df_temp.columns.str.strip()
# keep every 6th row such that it's 5 fps!
X = 6
df_temp = df_temp[df_temp.index % X == 0]
        # filter DataFrame to keep only the columns in the list
        df_temp = df_temp.loc[:, columns_to_keep]
# fix column names to not have leading or trailing spaces!
df_temp = df_temp.rename(columns=lambda x: x.strip())
# Store the DataFrame in the dictionary with the csv file name as the key
# remove the '.csv' by doing csv_file[:-4]
dfs_openface[csv_file[:-4]] = df_temp
del df_temp
return dfs_openface
def only_successful_frames(df):
# get frames where AU/emotion detection was successful!
return df[df['success'] == 1]
def apply_function_to_dict(dictionary, func, **kwargs):
"""
Apply a function to each DataFrame in a dictionary and return a modified copy of the dictionary.
Args:
dictionary (dict): The dictionary containing DataFrames.
func (function): The function to apply to each DataFrame.
**kwargs: Additional keyword arguments to pass to the function.
Returns:
dict: A modified copy of the dictionary with the function applied to each DataFrame.
"""
return {key: func(df, **kwargs) for key, df in dictionary.items()}
print('[LOG] Loading in OpenFace Outputs')
# Check and load or generate dfs_openface
if not os.path.exists(RUNTIME_VAR_PATH + f'dfs_openface_{PAT_SHORT_NAME}.pkl'):
# Generate dfs_openface if not already saved
dfs_openface = get_dict_openface(OPENFACE_OUTPUT_DIRECTORY)
dfs_openface = apply_function_to_dict(dfs_openface, only_successful_frames)
save_var(dfs_openface, forced_name=f'dfs_openface_{PAT_SHORT_NAME}')
else:
# Load dfs_openface if it already exists
dfs_openface = load_var(f'dfs_openface_{PAT_SHORT_NAME}')
# Check and load or generate dfs_openface_extras
if not os.path.exists(RUNTIME_VAR_PATH + f'dfs_openface_extras_{PAT_NOW}.pkl'):
# Generate dfs_openface_extras if not already saved
dfs_openface_extras = get_dict_openface_extras(OPENFACE_OUTPUT_DIRECTORY)
dfs_openface_extras = apply_function_to_dict(dfs_openface_extras, only_successful_frames)
save_var(dfs_openface_extras, forced_name=f'dfs_openface_extras_{PAT_NOW}')
else:
# Load dfs_openface_extras if it already exists
dfs_openface_extras = load_var(f'dfs_openface_extras_{PAT_NOW}')
print('[LOG] OpenFace Outputs Loaded In')
def get_dict(output_dir, file_now='outputs_hse.csv', filterOutLR=True):
# Initialize an empty dictionary to store the dataframes
df_dict = {}
# Loop through the subfolders in alphabetical order
for subfolder_name in sorted(os.listdir(output_dir)):
# Check if the subfolder contains CSV files
subfolder_path = os.path.join(output_dir, subfolder_name)
if not os.path.isdir(subfolder_path):
continue
# Load the first CSV file in the subfolder into a dataframe
csv_file_path = os.path.join(subfolder_path, file_now)
if not os.path.isfile(csv_file_path):
continue
try:
df_temp = pd.read_csv(csv_file_path)
        except Exception:
            # unreadable CSV: fall back to an empty frame with the expected OpenGraphAU columns
df_temp = pd.DataFrame(columns=['frame', 'timestamp', 'success', 'AU1', 'AU2', 'AU4', 'AU5', 'AU6', 'AU7', 'AU9',
'AU10', 'AU11', 'AU12', 'AU13', 'AU14', 'AU15', 'AU16', 'AU17', 'AU18',
'AU19', 'AU20', 'AU22', 'AU23', 'AU24', 'AU25', 'AU26', 'AU27', 'AU32',
'AU38', 'AU39'])
# OpenGraphAU - we are filtering out L and R!
if filterOutLR:
df_temp = df_temp.filter(regex='^(?!AUL|AUR)')
# Add the dataframe to the dictionary with the subfolder name as the key
# We do [:-4] to remove '.mp4' from the end of the string
df_dict[subfolder_name[:-4]] = df_temp
return df_dict
def create_binary_columns(df, threshold):
df_copy = df.copy()
# adds classification columns to opengraphAU
for col in df_copy.columns:
if col.startswith('AU'):
# Add _c to the column name for the new column
new_col_name = col + '_c'
# Apply the binary classification to the new column
df_copy[new_col_name] = df_copy[col].apply(lambda x: 1 if x >= threshold else 0)
# Add _r to the original column name
df_copy = df_copy.rename(columns={col: col + '_r'}, inplace=False)
return df_copy
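# A minimal sketch on toy activation scores (not model output): with the 0.5
# threshold, each AU column becomes a renamed raw column plus a binary one.
_demo_df = create_binary_columns(pd.DataFrame({'AU1': [0.2, 0.9]}), threshold=0.5)
assert list(_demo_df.columns) == ['AU1_r', 'AU1_c']
assert list(_demo_df['AU1_c']) == [0, 1]
del _demo_df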
def remove_columns_ending_with_r(df):
columns_to_drop = [col for col in df.columns if col.endswith('_r')]
df = df.drop(columns=columns_to_drop, inplace=False)
return df
# NOTE: only_successful_frames and apply_function_to_dict are defined above and reused here
print('[LOG] Loading in HSE Outputs')
# Check and load or generate dfs_hsemotion
if not os.path.exists(RUNTIME_VAR_PATH + f'dfs_hsemotion_{PAT_SHORT_NAME}.pkl'):
# Generate dfs_hsemotion if not already saved
dfs_hsemotion = get_dict(COMBINED_OUTPUT_DIRECTORY, file_now='outputs_hse.csv')
dfs_hsemotion = apply_function_to_dict(dfs_hsemotion, only_successful_frames)
save_var(dfs_hsemotion, forced_name=f'dfs_hsemotion_{PAT_SHORT_NAME}')
else:
# Load dfs_hsemotion if it already exists
dfs_hsemotion = load_var(f'dfs_hsemotion_{PAT_SHORT_NAME}')
print('[LOG] HSE Outputs Loaded In')
print('[LOG] Loading in OGAU Outputs')
# Check and load or generate dfs_opengraphau
if not os.path.exists(RUNTIME_VAR_PATH + f'dfs_opengraphau_{PAT_SHORT_NAME}.pkl'):
# Generate dfs_opengraphau if not already saved
OPENGRAPHAU_THRESHOLD = 0.5
dfs_opengraphau = get_dict(COMBINED_OUTPUT_DIRECTORY, file_now='outputs_ogau.csv')
dfs_opengraphau = apply_function_to_dict(dfs_opengraphau, create_binary_columns, threshold=OPENGRAPHAU_THRESHOLD)
dfs_opengraphau = apply_function_to_dict(dfs_opengraphau, only_successful_frames)
dfs_opengraphau = apply_function_to_dict(dfs_opengraphau, remove_columns_ending_with_r)
save_var(dfs_opengraphau, forced_name=f'dfs_opengraphau_{PAT_SHORT_NAME}')
else:
# Load dfs_opengraphau if it already exists
dfs_opengraphau = load_var(f'dfs_opengraphau_{PAT_SHORT_NAME}')
print('[LOG] OGAU Outputs Loaded In')
def get_data_within_duration(dfs_dict, df_video_timestamps, datetime, duration):
# Takes in:
# dfs_dict -- a dictionary of dataframes containing csv data from one of the pipelines
    # df_video_timestamps -- the VideoDatetimes sheet for this patient
# datetime -- a pd.datetime value to center our extraction
# duration -- a duration (in minutes) BEFORE the datetime to extract
# Outputs:
# One dataframe with all rows we want, with timestamps converted into correct datetimes
start_datetime = datetime - pd.Timedelta(minutes=duration)
end_datetime = datetime
relevant_keys = df_video_timestamps.loc[(pd.to_datetime(df_video_timestamps['VideoEnd']) >= start_datetime) &
(pd.to_datetime(df_video_timestamps['VideoStart']) <= end_datetime), 'Filename'].values
relevant_dfs = []
for key in relevant_keys:
if key in dfs_dict:
video_start = pd.to_datetime(df_video_timestamps.loc[df_video_timestamps['Filename'] == key, 'VideoStart'].values[0])
video_end = pd.to_datetime(df_video_timestamps.loc[df_video_timestamps['Filename'] == key, 'VideoEnd'].values[0])
time_mask = ((dfs_dict[key]['timestamp'] >= (start_datetime - video_start).total_seconds()) &
(dfs_dict[key]['timestamp'] <= (end_datetime - video_start).total_seconds()))
df = dfs_dict[key].loc[time_mask].copy()
df['timestamp'] = video_start + pd.to_timedelta(df['timestamp'], unit='s')
relevant_dfs.append(df)
if relevant_dfs:
df_combined = pd.concat(relevant_dfs, ignore_index=True, sort=False)
df_combined = df_combined.drop(columns='frame')
return df_combined
print(f"MAJOR ERROR! ZERO RELEVANT DFS!! DATETIME: {datetime}")
return pd.DataFrame()
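# Usage sketch (hypothetical report time, not from the mood sheet): pull the
# 30 minutes of frames preceding a self-report, mapped onto wall-clock time.
#   report_time = pd.Timestamp('2024-01-01 14:00:00')
#   window_df = get_data_within_duration(dfs_openface, df_videoTimestamps,
#                                        report_time, duration=30)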
def get_radius_dict(TIME_RADIUS_IN_MINUTES, INPUT_DF, df_videoTimestamps, df_moodTracking, takeAll=True):
# takes in the:
# --time radius,
# --input dataframe dict (e.g. is it from OpenFace? HSEmotion?)
# --df with video timestamps
# --df with mood tracking patient reports
# --takeAll - are we taking all reports, or filtering out values w/o mood (e.g. anxiety)? True = no filtering
# returns dictionary of timestamp : df with relevant frames
# We'll make a dictionary, with the relevant df for each datetime we have a report
radius_df_dict = {}
for oneIndex in range(len(df_moodTracking)):
# Let's make sure there's a value collected (or takeAll = True)!
if takeAll:
dt_now = get_moodTracking_datetime(oneIndex, df_moodTracking=df_moodTracking)
filtered_df = get_data_within_duration(INPUT_DF, df_videoTimestamps, dt_now, TIME_RADIUS_IN_MINUTES)
radius_df_dict[dt_now] = filtered_df
else:
            val_now = df_moodTracking.loc[oneIndex, 'Anxiety']
if isinstance(val_now, str):
# Value was collected
dt_now = get_moodTracking_datetime(oneIndex, df_moodTracking=df_moodTracking)
filtered_df = get_data_within_duration(INPUT_DF, df_videoTimestamps, dt_now, TIME_RADIUS_IN_MINUTES)
radius_df_dict[dt_now] = filtered_df
else:
# No value collected!
print('No value for Anxiety for index ', oneIndex, f'corresponding to {get_moodTracking_datetime(oneIndex, df_moodTracking=df_moodTracking)}')
return radius_df_dict
def generate_number_list(start, interval, count):
number_list = [start + i * interval for i in range(count)]
return number_list
def get_moodTracking_datetime(index, df_moodTracking):
    # round-trip through strftime to drop any sub-second component
    temp_var = pd.to_datetime(pd.to_datetime(df_moodTracking[index:index+1]['Datetime']).dt.strftime('%d-%b-%Y %H:%M:%S'))
return pd.Timestamp(temp_var[index])
# EMOTION DETECTION & AFFECT
takeAll = True # we are taking all patient reports
# start and interval are in minutes
TIME_RADIUS_LIST = generate_number_list(start=15, interval=15, count=16)
#TIME_RADIUS_LIST = [60, 120, 180, 240]
ENABLE_OPENFACE = True
if ENABLE_OPENFACE:
openface_radius_dict = {}
openface_extras_radius_dict = {}
hsemotion_radius_dict = {}
opengraphau_radius_dict = {}
print('[LOG] Creating Time Radius Dicts')
for i in TIME_RADIUS_LIST:
if ENABLE_OPENFACE:
openface_radius_now = get_radius_dict(i, dfs_openface, df_videoTimestamps, df_moodTracking, takeAll=takeAll)
openface_radius_dict[f'{i}'] = openface_radius_now
openface_extras_radius_now = get_radius_dict(i, dfs_openface_extras, df_videoTimestamps, df_moodTracking, takeAll=takeAll)
openface_extras_radius_dict[f'{i}'] = openface_extras_radius_now
hsemotion_radius_now = get_radius_dict(i, dfs_hsemotion, df_videoTimestamps, df_moodTracking, takeAll=takeAll)
hsemotion_radius_dict[f'{i}'] = hsemotion_radius_now
opengraphau_radius_now = get_radius_dict(i, dfs_opengraphau, df_videoTimestamps, df_moodTracking, takeAll=takeAll)
opengraphau_radius_dict[f'{i}'] = opengraphau_radius_now
print('[LOG] Time Radius Dicts Created')
def time_splitter(input_dict, splitter_times):
# Initialize the output dictionary
output_dict = {}
# Frame rate: 5 frames per second
frame_rate = 5
# Iterate over each split time
for split_time in splitter_times:
# Initialize the dictionary for the current split time
output_dict[split_time] = {}
# Calculate the number of rows per split
rows_per_split = split_time * 60 * frame_rate
# Iterate over the outer dictionary
for outer_key, inner_dict in input_dict.items():
# Initialize the dictionary for the current outer key
output_dict[split_time][outer_key] = {}
# Iterate over the inner dictionary
for timestamp, df in inner_dict.items():
# Split the DataFrame into chunks of the specified size
split_dfs = [df.iloc[i:i+rows_per_split] for i in range(0, len(df), rows_per_split)]
# Assign the list of split DataFrames to the appropriate location in the output dictionary
output_dict[split_time][outer_key][timestamp] = split_dfs
return output_dict
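# At 5 fps, a 5 minute split is 5 * 60 * 5 = 1500 rows per chunk; a toy
# 3200-row window (not real data) would split into chunks of 1500/1500/200,
# mirroring the iloc slicing above:
assert [min(1500, 3200 - i) for i in range(0, 3200, 1500)] == [1500, 1500, 200]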
# Functions to apply feature processing to the inpatient dictionary structure
def apply_function_to_dict_list(dictionary, func, **kwargs):
"""
Apply a function to each DataFrame in a dictionary where values are LISTS of dfs and return a modified copy of the dictionary.
Args:
dictionary (dict): The dictionary containing DataFrames.
func (function): The function to apply to each DataFrame.
**kwargs: Additional keyword arguments to pass to the function.
Returns:
dict: A modified copy of the dictionary with the function applied to each DataFrame.
"""
new_dict = {}
for split_time, outer_dict in dictionary.items():
new_dict[split_time] = {}
for outer_key, inner_dict in outer_dict.items():
new_dict[split_time][outer_key] = {}
for timestamp, df_list in inner_dict.items():
new_dict[split_time][outer_key][timestamp] = [func(df, **kwargs) for df in df_list]
return new_dict
def average_inner_dfs(dictionary):
"""
Replace each list of DataFrames in a nested dictionary with the average of the DataFrames in each list.
For columns with strings, convert to numbers if possible and take the string from the first DataFrame otherwise.
Args:
dictionary (dict): The dictionary containing lists of DataFrames.
Returns:
dict: A modified copy of the dictionary with the average of each list of DataFrames.
"""
    def process_columns(df_list):
        """
        Average numeric columns positionally across the DataFrames in df_list.
        For non-numeric columns, keep the first DataFrame's first value.
        """
        if not df_list:
            # If df_list is empty, return an empty DataFrame
            return pd.DataFrame()
        # Reset indices so row i of every DataFrame shares the same label, then
        # group by that label to average positionally across the list
        combined_df = pd.concat([d.reset_index(drop=True) for d in df_list])
        avg_df = pd.DataFrame(index=pd.RangeIndex(max(len(d) for d in df_list)))
        for column in combined_df.columns:
            # Try to convert the column to numeric
            numeric_series = pd.to_numeric(combined_df[column], errors='coerce')
            if numeric_series.notna().all():
                # All values numeric: average row i across the DataFrames
                avg_df[column] = numeric_series.groupby(level=0).mean()
            else:
                # Non-numeric column: repeat the first DataFrame's first value
                avg_df[column] = df_list[0][column].iloc[0]
        return avg_df
def create_empty_df_like(sample_df):
"""
Create a DataFrame with the same columns as sample_df but filled with zeros (or equivalent) based on datatype.
"""
return pd.DataFrame({col: np.zeros(sample_df.shape[0], dtype=sample_df[col].dtype) for col in sample_df.columns})
    # Find one non-empty list of DataFrames up front; its first DataFrame is the
    # template for zero-filling any empty lists encountered below (this avoids
    # silently reusing a stale avg_df from a previous iteration)
    template_df = None
    for outer_dict in dictionary.values():
        for inner_dict in outer_dict.values():
            for df_list in inner_dict.values():
                if df_list:  # ensure the list is not empty
                    template_df = df_list[0]
                    break
            if template_df is not None:
                break
        if template_df is not None:
            break
    new_dict = {}
    for split_time, outer_dict in dictionary.items():
        new_dict[split_time] = {}
        for outer_key, inner_dict in outer_dict.items():
            new_dict[split_time][outer_key] = {}
            for timestamp, df_list in inner_dict.items():
                if df_list:
                    # If the df_list is not empty, process normally
                    avg_df = process_columns(df_list)
                elif template_df is not None:
                    # Empty list: substitute a zero-filled DataFrame with the
                    # template's structure
                    avg_df = create_empty_df_like(template_df)
                else:
                    avg_df = pd.DataFrame()
                new_dict[split_time][outer_key][timestamp] = avg_df
    return new_dict
print('[LOG] Applying 5 and 10 Min Time Splits to Radius Dicts')
openface_radius_dict = time_splitter(openface_radius_dict, [5, 10])
save_var(openface_radius_dict, forced_name=f'openface_radius_dict_{PAT_SHORT_NAME}')
hsemotion_radius_dict = time_splitter(hsemotion_radius_dict, [5, 10])
save_var(hsemotion_radius_dict, forced_name=f'hsemotion_radius_dict_{PAT_SHORT_NAME}')
opengraphau_radius_dict = time_splitter(opengraphau_radius_dict, [5, 10])
save_var(opengraphau_radius_dict, forced_name=f'opengraphau_radius_dict_{PAT_SHORT_NAME}')
openface_extras_radius_dict = time_splitter(openface_extras_radius_dict, [5, 10])
save_var(openface_extras_radius_dict, forced_name=f'openface_extras_radius_dict_{PAT_SHORT_NAME}')
print('[LOG] 5 and 10 Min Time Splits Applied to Radius Dicts')
print('[LOG] Beginning Feature Extraction')
# Define emotion to AU mapping
# OpenDBM:
emo_AUs = {'Happiness': [6, 12],
'Sadness': [1, 4, 15],
'Surprise': [1, 2, 5, 26],
'Fear': [1, 2, 4, 5, 7, 20, 26],
'Anger': [4, 5, 7, 23],
'Disgust': [9, 15, 16],
'Contempt': [12, 14]}
# Define AU to lower/upper
# OpenDBM:
AU_lower = [12, 15, 26, 20, 23, 14]
AU_upper = [6, 1, 4, 2, 5, 7, 9]
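# e.g. under this mapping, Happiness is read from AU6 (cheek raiser) and
# AU12 (lip corner puller), following the OpenDBM emotional-expressivity scheme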
# NOTE: only_successful_frames is defined above and reused here
from scipy.stats import skew, kurtosis
from statsmodels.tsa.stattools import acf
def binarize_cols(df, threshold=0.5):
new_df = df.copy()
emotions = [col for col in new_df.columns if col not in ['frame', 'success', 'timestamp']]
for emotion in emotions:
new_df[f'{emotion}_Raw'] = new_df[emotion]
new_df[f'{emotion}_Binary'] = (new_df[f'{emotion}_Raw'] >= threshold).astype(int)
new_df = new_df.drop(columns=emotions, inplace=False)
return new_df
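# A minimal sketch on toy probabilities (not model output): each emotion column
# splits into an '_Raw' copy and a '_Binary' column thresholded at 0.5.
_demo_df = binarize_cols(pd.DataFrame({'timestamp': [0.0, 0.2], 'Happiness': [0.4, 0.8]}))
assert list(_demo_df['Happiness_Binary']) == [0, 1]
del _demo_df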
def fill_empty_dfs_lists(dictionary):
"""
Fill empty DataFrames in a nested dictionary structure with a DataFrame of zeros.
Args:
dictionary (dict): The dictionary containing nested dictionaries with lists of DataFrames.
Returns:
dict: A modified copy of the dictionary with empty DataFrames filled with zeros.
"""
# Find the first non-empty DataFrame to use as a template for filling empty DataFrames
non_empty_df = None
for split_time, outer_dict in dictionary.items():
for outer_key, inner_dict in outer_dict.items():
for timestamp, df_list in inner_dict.items():
for df in df_list:
if not df.empty:
non_empty_df = df
break
if non_empty_df is not None:
break
if non_empty_df is not None:
break
if non_empty_df is not None:
break
# If no non-empty DataFrame is found, return the original dictionary
if non_empty_df is None:
return dictionary
# Create the modified dictionary
modified_dictionary = {}
for split_time, outer_dict in dictionary.items():
modified_dictionary[split_time] = {}
for outer_key, inner_dict in outer_dict.items():
modified_dictionary[split_time][outer_key] = {}
for timestamp, df_list in inner_dict.items():
modified_df_list = []
for df in df_list:
if df.empty:
modified_df = pd.DataFrame(0, index=non_empty_df.index, columns=non_empty_df.columns)
# Preserve string columns from the non-empty DataFrame
for column in non_empty_df.columns:
if non_empty_df[column].dtype == object:
modified_df[column] = non_empty_df[column]
else:
modified_df = df.copy()
modified_df_list.append(modified_df)
modified_dictionary[split_time][outer_key][timestamp] = modified_df_list
return modified_dictionary
def analyze_emotion_events_v2(df, max_frame_gap=10, event_minimum_num_frames=1, method='HSE'):
df = df.reset_index(drop=True)
# Emotions to analyze
emotions_raw = [col for col in df.columns if col not in ['frame', 'success', 'timestamp']]
# Removing "_Raw" or "_Binary" from each string
processed_strings = [s.replace("_Raw", "").replace("_Binary", "") for s in emotions_raw]
# Eliminating duplicates
emotions = list(set(processed_strings))
# Create DataFrame for results
if STATS_FEATURE_SETTING == 0:
results_df = pd.DataFrame(index=['avg_event_length', 'avg_event_duration', 'total_num_events', 'avg_probability', 'std', 'skewness', 'kurtosis', 'autocorrelation', 'pres_pct'])
elif STATS_FEATURE_SETTING == 1 or (STATS_FEATURE_SETTING == 3 and method == 'HSE'):
results_df = pd.DataFrame(index=['avg_event_length', 'total_num_events', 'avg_probability', 'std', 'pres_pct'])
elif STATS_FEATURE_SETTING == 2:
results_df = pd.DataFrame(index=['pres_pct'])
elif STATS_FEATURE_SETTING == 3 and (method == 'OGAU' or method=='OF'):
results_df = pd.DataFrame(index=['pres_pct', 'total_num_events'])
def detect_events(emotion_binary_col):
probThreshold = 0.5 # irrelevant because it's a binary column
minInterval = max_frame_gap
minDuration = event_minimum_num_frames
probBinary = emotion_binary_col > probThreshold
# Using np.diff to find changes in the binary array
changes = np.diff(probBinary.astype(int))
# Identify start (1) and stop (-1) points
starts = np.where(changes == 1)[0] + 1 # +1 to correct the index shift caused by diff
stops = np.where(changes == -1)[0] + 1
# Adjust for edge cases
if probBinary.iloc[0]:
starts = np.insert(starts, 0, 0)
if probBinary.iloc[-1]:
stops = np.append(stops, len(probBinary))
# Merge close events and filter by duration
events = []
for start, stop in zip(starts, stops):
# Construct the event considering only indices where probBinary is 1
event = np.arange(start, stop)[probBinary[start:stop].values]
# Check if there is a previous event to potentially merge with
if events and event.size > 0 and events[-1][-1] >= start - minInterval:
# Merge with the previous event
events[-1] = np.unique(np.concatenate([events[-1], event]))
elif event.size >= event_minimum_num_frames:
events.append(event)
# Filter events by minimum duration
valid_events = [event for event in events if len(event) >= minDuration]
return valid_events
for emotion in emotions:
# Identify events
emotion_binary_col = df[f'{emotion}_Binary']
emotion_presence = df[f'{emotion}_Binary'].sum()
pres_pct = emotion_presence / len(df) * 100 # Percentage of frames where emotion is present
events = detect_events(emotion_binary_col)
if not(STATS_FEATURE_SETTING == 2):
# Calculate features for each event
if events:
event_lengths = [len(event) for event in events]
event_durations = [event[-1] - event[0] + 1 for event in events]
probabilities = [df.loc[event, f'{emotion}_Raw'].values for event in events]
probabilities_flattened = np.concatenate(probabilities)
avg_event_length = np.mean(event_lengths)
avg_event_duration = np.mean(event_durations)
total_num_events = len(events)
# NORMALIZE TOTAL NUM EVENTS BASED ON DF SIZE
# total_num_events = len(events) * 1000 / df.shape[0]
avg_probability = np.mean(probabilities_flattened)
std_dev = np.std(probabilities_flattened)
skewness_val = skew(probabilities_flattened)
kurtosis_val = kurtosis(probabilities_flattened)
autocorr = acf(probabilities_flattened, fft=True, nlags=1)[1] if len(probabilities_flattened) > 1 else 0
else:
avg_event_length = 0
avg_event_duration = 0
total_num_events = 0
avg_probability = 0
std_dev = 0
skewness_val = 0
kurtosis_val = 0
autocorr = 0
# Add results to the DataFrame
if STATS_FEATURE_SETTING == 0:
results_df[emotion] = [avg_event_length, avg_event_duration, total_num_events, avg_probability, std_dev, skewness_val, kurtosis_val, autocorr, pres_pct]
elif STATS_FEATURE_SETTING == 1 or (STATS_FEATURE_SETTING == 3 and method == 'HSE'):
results_df[emotion] = [avg_event_length, total_num_events, avg_probability, std_dev, pres_pct]
elif STATS_FEATURE_SETTING == 2:
results_df[emotion] = [pres_pct]
elif STATS_FEATURE_SETTING == 3 and (method == 'OGAU' or method=='OF'):
results_df[emotion] = [pres_pct, total_num_events]
# Replace NaN values with 0
results_df.fillna(0, inplace=True)
return results_df
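# Live toy check of the event logic above (synthetic values, not patient data):
# the binary run 1,1,0,0,1 has two raw onsets, but the 2-frame gap falls within
# max_frame_gap, so the runs merge into one event covering 3 of 5 frames.
_demo_df = analyze_emotion_events_v2(
    pd.DataFrame({'Happiness_Raw': [0.9, 0.8, 0.1, 0.1, 0.9],
                  'Happiness_Binary': [1, 1, 0, 0, 1]}))
assert _demo_df.loc['total_num_events', 'Happiness'] == 1
assert _demo_df.loc['pres_pct', 'Happiness'] == 60.0
del _demo_df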
import scipy.stats as stats
def detect_emotions(df, method, emo_AUs, additional_filter=None):
# INPUT:
# df -- dataframe with AUs for each frame
# method -- must be 'OpenFace'
# emo_AUs -- the hash table
# additional_filter -- are we just doing lower half? upper half? This is None or a list of ints (which AUs to keep)
    # OUTPUT:
    # 3 dataframes, each with emotion values for each frame:
    # emo_hard, emo_soft, emo_binary (see OpenDBM docs for details)
if df.empty:
return (df, df, df)
# We start by mapping AUs to emotions for each of our two methods
# Using this mapping: https://aicure.github.io/open_dbm/docs/emotional-expressivity
if method == 'OpenFace':
        columns = ['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r',
                   'AU09_r', 'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r',
                   'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r', 'AU45_r',
                   'AU01_c', 'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c',
                   'AU09_c', 'AU10_c', 'AU12_c', 'AU14_c', 'AU15_c', 'AU17_c',
                   'AU20_c', 'AU23_c', 'AU25_c', 'AU26_c', 'AU45_c']
# hash tables for presence and intensity
emo_AUs_presence = {}
emo_AUs_intensity = {}
for key in emo_AUs.keys(): # loop through emotion strings
new_values_r = [] # regression
new_values_c = [] # classification
for value in emo_AUs[key]:
if isinstance(value, int):
AU_key_r = "AU{:02d}_r".format(value)
AU_key_c = "AU{:02d}_c".format(value)
                    # append the AU column unless an additional_filter excludes it
                    if AU_key_r in columns and (additional_filter is None or value in additional_filter):
                        new_values_r.append(AU_key_r)
                    if AU_key_c in columns and (additional_filter is None or value in additional_filter):
                        new_values_c.append(AU_key_c)
if new_values_r:
emo_AUs_intensity[key] = new_values_r
if new_values_c:
emo_AUs_presence[key] = new_values_c
else:
# if the method specified is not OpenFace or OpenGraphAU, raise an error (pipeline doesn't support others yet)