-
Notifications
You must be signed in to change notification settings - Fork 0
/
vowel_articulation_expert_rating_corr_PCGITA.py
177 lines (167 loc) · 9.09 KB
/
vowel_articulation_expert_rating_corr_PCGITA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#!/usr/bin/env python
# coding: utf-8
# This program is to compute the correlation between vowel articulation features (automatic and manual) and experts' ratings on speech and voice impairment for PC-GITA read speech.
# LIU Yuanyuan, 2021-3-23.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats
from scipy.stats import ttest_ind
import seaborn as sns
import os
def vowel_articulation_expert_rating_corr_PCGITA(filepath, task):
# filepath = '/home/yuanyuan/Documents/VAI_data_2020/exp/PC-GITA_read/'
# feature file and rating file should sort according to speaker first.
filepath_task = os.path.join(filepath, 'exp', task)
df_features = pd.read_excel(os.path.join(filepath_task, 'speaker_formants_stat_auto_None_ipa.xlsx'))
if task == 'PC-GITA_read':
df_ratings = pd.read_excel(os.path.join(filepath_task, 'PCGITA_metadata.xlsx'))
UPDRS = df_ratings['UPDRS']
PD_labels = np.zeros(np.array(UPDRS).shape)
PD_labels[UPDRS != 0] = 1
speech = df_ratings['UPDRS-speech']
speakers_rating = df_ratings['NAME']
speakers_feature = df_features['speaker']
groups = ['VAI', 'VSA', 'FCR', 'F2IU']
his = ['', '50', '70', '90']
feat_names = []
for group in groups:
for hi in his:
if hi == '':
feat_name = group
else:
feat_name = group + '[' + hi + ']'
feat_names.append(feat_name)
# print(feat_names)
columns = ['control(mean)', 'control(std)', 'PD(mean)', 'PD(std)', 'diff_ratio', 'ttest', 'pvalue']
df_summary = pd.DataFrame(np.arange(len(groups) * len(his) * 7).reshape(len(groups) * len(his), 7), index=feat_names, columns=columns)
df_corr = pd.DataFrame(np.arange(len(feat_names) * 4).reshape(len(feat_names), 4), index=feat_names,
columns=['r_updrs', 'p_updrs', 'r_speech', 'p_speech'])
if speakers_rating.all() == speakers_feature.all():
print('equal speaker order.')
for feat_name in feat_names:
feat = df_features[feat_name]
feat_control = feat.loc[UPDRS == 0]
feat_dys = feat.loc[UPDRS != 0]
ttest, p = ttest_ind(feat_control, feat_dys)
feat_control_mean = round(np.average(feat_control), 2)
feat_control_std = round(np.std(feat_control), 2)
feat_dys_mean = round(np.average(feat_dys), 2)
feat_dys_std = round(np.std(feat_dys), 2)
diff_ratio = round((feat_control_mean - feat_dys_mean) / feat_dys_mean, 3) * 100
df_summary.loc[feat_name, 'control(mean)'] = feat_control_mean
df_summary.loc[feat_name, 'control(std)'] = feat_control_std
df_summary.loc[feat_name, 'PD(mean)'] = feat_dys_mean
df_summary.loc[feat_name, 'PD(std)'] = feat_dys_std
df_summary.loc[feat_name, 'diff_ratio'] = diff_ratio
df_summary.loc[feat_name, 'ttest'] = round(ttest, 3)
df_summary.loc[feat_name, 'pvalue'] = round(p, 5)
data = pd.DataFrame({'PD_labels': PD_labels, feat_name: feat})
fig, ax = plt.subplots(1, 1)
ax = sns.stripplot(x='PD_labels', y=feat_name, data=data, jitter=True, color='.3')
ax = sns.boxplot(x='PD_labels', y=feat_name, data=data, whis=np.inf)
levels_uni = ['control', 'PD']
x_range = np.arange(len(levels_uni))
ax.set_xticks(x_range)
ax.set_xticklabels(levels_uni)
ax.set_xlabel('')
# ax.get_legend().remove()
fig.savefig(filepath_task + feat_name + '_scatter_boxplot.pdf')
# scatter plot with UPDRS (and speech) ratings
# fig, ax = plt.subplots(1, 1)
x = UPDRS
y_auto = feat
r, p = scipy.stats.pearsonr(y_auto, x)
df_corr.loc[feat_name, 'r_updrs'] = round(r, 3)
df_corr.loc[feat_name, 'p_updrs'] = round(p, 5)
# ax.scatter(x[UPDRS == 0], y_auto[UPDRS == 0], c='b', label='control')
# ax.scatter(x[UPDRS != 0], y_auto[UPDRS != 0], c='r', label='PD')
# legend = ax.legend(loc='upper right')
# ax.yaxis.get_major_formatter().set_powerlimits((0, 1))
# ax.set_xlabel('UPDRS')
# ax.set_ylabel(feat_name+' (automatic)')
# fig.savefig(filepath_task+'scatter_plot_'+feat_name+'_UPDRS_auto.pdf')
# plt.close()
# scatter plot with speech ratings
# fig, ax = plt.subplots(1, 1)
x = speech
y_auto = feat
r, p = scipy.stats.pearsonr(y_auto, x)
df_corr.loc[feat_name, 'r_speech'] = round(r, 3)
df_corr.loc[feat_name, 'p_speech'] = round(p, 5)
# ax.scatter(x[UPDRS == 0], y_auto[UPDRS == 0], c='b', label='control')
# ax.scatter(x[UPDRS != 0], y_auto[UPDRS != 0], c='r', label='PD')
# legend = ax.legend(loc='upper right')
# ax.yaxis.get_major_formatter().set_powerlimits((0, 1))
# ax.set_xlabel('UPDRS-speech')
# ax.set_ylabel(feat_name+' (automatic)')
# fig.savefig(filepath_task+'scatter_plot_'+feat_name+'_speech_auto.pdf')
# plt.close()
df_corr.to_excel(filepath_task + 'corr_vowel_articulation_ratings.xlsx')
df_summary.to_excel(filepath_task + 'control_PD_summary.xlsx')
## formants distribution
for hi in his:
if hi == '':
F1a = df_features['f1_a']
F2a = df_features['f2_a']
F1i = df_features['f1_i']
F2i = df_features['f2_i']
F1u = df_features['f1_u']
F2u = df_features['f2_u']
else:
lo = str(100 - int(hi))
F1a = df_features['f1_a[' + hi + ']']
F2a = df_features['f2_a[' + lo + ']']
F1i = df_features['f1_i[' + lo + ']']
F2i = df_features['f2_i[' + hi + ']']
F1u = df_features['f1_u[' + lo + ']']
F2u = df_features['f2_u[' + lo + ']']
F1_min = np.floor(np.min(F1u) / 100) * 100
F1_max = np.ceil(np.max(F1a) / 100) * 100
F2_min = np.floor(np.min(F2u) / 100) * 100
F2_max = np.ceil(np.max(F2i) / 100) * 100
# fig, axs = plt.subplots(1, 2, figsize=(15, 6))
fig, axs = plt.subplots()
if hi == '':
print('Control: Formants distribution (mean of frame-level formants)')
else:
print('Control: Formants distribution (percentiles: ' + hi + '/' + str(100 - int(hi)) + ')')
axs.plot(F2a.loc[UPDRS == 0], F1a.loc[UPDRS == 0], 'ro', label='/a/')
axs.plot(F2i.loc[UPDRS == 0], F1i.loc[UPDRS == 0], 'g^', label='/i/')
axs.plot(F2u.loc[UPDRS == 0], F1u.loc[UPDRS == 0], 'bs', label='/u/')
axs.plot(np.average(F2a.loc[UPDRS == 0]), np.average(F1a.loc[UPDRS == 0]), 'ko', markersize=12)
axs.plot(np.average(F2i.loc[UPDRS == 0]), np.average(F1i.loc[UPDRS == 0]), 'k^', markersize=12)
axs.plot(np.average(F2u.loc[UPDRS == 0]), np.average(F1u.loc[UPDRS == 0]), 'ks', markersize=12)
legend = axs.legend(loc='upper right')
axs.set(xlabel='F2/Hz', ylabel='F1/Hz')
# axs[0].set_title('Control')
axs.set_xlim(F2_min, F2_max)
axs.set_ylim(F1_min, F1_max)
plt.grid(True)
if hi == '':
fig.savefig(filepath_task + 'Formants_vowel_auto_None_ipa_mean_formants_control.pdf')
else:
fig.savefig(filepath_task + 'Formants_vowel_auto_None_ipa_' + hi + '_' + str(100 - int(hi)) + '_control.pdf')
plt.close()
fig, axs = plt.subplots()
if hi == '':
print('PD: Formants distribution (mean of frame-level formants)')
else:
print('PD: Formants distribution (percentiles: ' + hi + '/' + str(100 - int(hi)) + ')')
axs.plot(F2a.loc[UPDRS != 0], F1a.loc[UPDRS != 0], 'ro', label='/a/')
axs.plot(F2i.loc[UPDRS != 0], F1i.loc[UPDRS != 0], 'g^', label='/i/')
axs.plot(F2u.loc[UPDRS != 0], F1u.loc[UPDRS != 0], 'bs', label='/u/')
axs.plot(np.average(F2a.loc[UPDRS != 0]), np.average(F1a.loc[UPDRS != 0]), 'ko', markersize=12)
axs.plot(np.average(F2i.loc[UPDRS != 0]), np.average(F1i.loc[UPDRS != 0]), 'k^', markersize=12)
axs.plot(np.average(F2u.loc[UPDRS != 0]), np.average(F1u.loc[UPDRS != 0]), 'ks', markersize=12)
legend = axs.legend(loc='upper right')
axs.set(xlabel='F2/Hz', ylabel='F1/Hz')
# axs[0].set_title('Dysarthric')
axs.set_xlim(F2_min, F2_max)
axs.set_ylim(F1_min, F1_max)
plt.grid(True)
if hi == '':
fig.savefig(filepath_task + 'Formants_vowel_auto_None_ipa_mean_formants_PD.pdf')
else:
fig.savefig(filepath_task + 'Formants_vowel_auto_None_ipa_' + hi + '_' + str(100 - int(hi)) + '_PD.pdf')
plt.close()