Commit 77e19d0

Merge pull request #311 from YeoLab/splicing_min20
Use at least 20 samples for all splicing calculations
2 parents: dcdf8c7 + 769d7a3

10 files changed, +255 -244 lines


Makefile

+1 -1

@@ -7,7 +7,7 @@ test:
 
 coverage:
 	cp testing/matplotlibrc .
-	coverage run --source flotilla --omit=test --module py.test
+	coverage run --source flotilla --omit=test --module py.test -v
 	rm matplotlibrc
 
 lint:

doc/releases/v0.2.8.txt

+11 -2

@@ -1,5 +1,11 @@
 v0.2.8 (........)
-------------------------
+-----------------
+
+New features
+~~~~~~~~~~~~
+
+- Added ``Study.modality_log2bf()``, which gets the log2 Bayes factor
+  for each splicing event's fit to each modality in each phenotype
 
 Bug fixes
 ~~~~~~~~~
@@ -15,4 +21,7 @@ Miscellaneous
 - Change modality estimation to a two-step process: estimate :math:`\Psi\sim 0` and :math:`\Psi\sim 1`
   first, which change one parameter of the Beta distribution at a time,
   then bimodal and middle, which change both parameters of the Beta
-  distribution at once.
+  distribution at once.
+- Make sure both modality estimation and NMF space calculation use at least
+  20 samples per event
+- Get rid of ``big_nmf_space_transitions`` for now
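
For context on the ``Study.modality_log2bf()`` entry above: a log2 Bayes factor compares the evidence that an event's Psi distribution came from one of the Beta-parameterized modality models against the uniform null. Below is a minimal sketch of that comparison, assuming a grid of Beta parameterizations per modality with a uniform prior over the grid; it is an illustration only, not flotilla's actual ``logsumexp_logliks`` implementation.

    import numpy as np
    from scipy import stats
    from scipy.special import logsumexp

    def log2_bayes_factor(psi, alphas, betas):
        # psi: 1-d array of Psi scores in [0, 1] for one splicing event
        # alphas, betas: a grid of Beta parameterizations for one modality
        psi = psi[np.isfinite(psi)]
        logliks = [stats.beta.logpdf(psi, a, b).sum()
                   for a, b in zip(alphas, betas)]
        # Marginal log-likelihood of the modality: logsumexp over the grid,
        # minus log(grid size) for the uniform prior over parameterizations
        log_marginal = logsumexp(logliks) - np.log(len(logliks))
        # The Uniform(0, 1) null has density 1 everywhere, so its
        # log-likelihood is 0; the log Bayes factor is the marginal itself,
        # converted from natural log to log2
        return log_marginal / np.log(2)

    # A Psi~0-style modality: first Beta parameter pinned at 1, second swept
    # upward (this particular grid is an assumption for the example)
    psi = np.random.beta(1, 10, size=30)
    print(log2_bayes_factor(psi, alphas=np.ones(10), betas=np.arange(2, 12)))

An event is then assigned whichever modality has the largest factor, falling back to 'ambiguous' when none clears the threshold, as in the diff below.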

flotilla/compute/splicing.py

+69 -37

@@ -107,20 +107,52 @@ def _logsumexp(self, logliks):
         logsumexps['ambiguous'] = self.logbf_thresh
         return logsumexps
 
-    def _guess_modality(self, logsumexps):
-        """Guess the most likely modality.
+    def assign_modalities(self, log2_bayes_factors, reset_index=False):
+        """Guess the most likely modality for each event
 
-        If no modalilites have logsumexp'd logliks greater than the log Bayes
-        factor threshold, then they are assigned the 'uniform' modality,
-        which is the null hypothesis
-        """
+        For each event that has at least one non-NA value, if no modalities
+        have logsumexp'd logliks greater than the log Bayes factor threshold,
+        then it is assigned the 'ambiguous' modality, because we cannot
+        reject the null hypothesis that it came from the uniform
+        distribution.
+
+        Parameters
+        ----------
+        log2_bayes_factors : pandas.DataFrame
+            A (4, n_events) dataframe with Bayes factors for the Psi~1,
+            Psi~0, bimodal, and middle modalities. If an event has no Bayes
+            factors for any of those modalities, it is ignored
+        reset_index : bool
+            If True, remove the first level of the index from the dataframe.
+            Useful if you are using this function to apply to a grouped
+            dataframe where the first level is something other than the
+            modality, e.g. the celltype
+
+        Returns
+        -------
+        modalities : pandas.Series
+            A (n_events,) series with the most likely modality for each event
 
-        if all(logsumexps[self.one_param_models.keys()] > self.logbf_thresh):
-            return logsumexps[self.one_param_models.keys()].idxmax()
+        """
+        if reset_index:
+            x = log2_bayes_factors.reset_index(level=0, drop=True)
+        else:
+            x = log2_bayes_factors
+        not_na = (x.notnull() > 0).any()
+        not_na_columns = not_na[not_na].index
+        x.ix['ambiguous', not_na_columns] = self.logbf_thresh
+        return x.idxmax()
+
+    def _fit_transform_one_step(self, data, models):
+        non_na = data.count() > 0
+        non_na_columns = non_na[non_na].index
+        data_non_na = data[non_na_columns]
+        if data_non_na.empty:
+            return pd.DataFrame()
         else:
-            other_models = logsumexps.index.difference(
-                self.one_param_models.keys())
-            return logsumexps[other_models].idxmax()
+            return data_non_na.apply(lambda x: pd.Series(
+                {k: v.logsumexp_logliks(x)
+                 for k, v in models.iteritems()}), axis=0)
 
     def fit_transform(self, data):
         """Get the modality assignments of each splicing event in the data
@@ -133,9 +165,9 @@ def fit_transform(self, data):
 
         Returns
         -------
-        modality_assignments : pandas.Series
-            A (n_events,) series of the estimated modality for each splicing
-            event
+        log2_bayes_factors : pandas.DataFrame
+            A (n_modalities, n_events) dataframe of the estimated log2
+            Bayes factor for each splicing event, for each modality
 
         Raises
         ------
@@ -145,29 +177,29 @@ def fit_transform(self, data):
         assert np.all(data.values.flat[np.isfinite(data.values.flat)] <= 1)
         assert np.all(data.values.flat[np.isfinite(data.values.flat)] >= 0)
 
-        # Estimate Psi~0/Psi~1 first
-        logsumexp_logliks1 = data.apply(
-            lambda x: pd.Series(
-                {k: v.logsumexp_logliks(x)
-                 for k, v in self.one_param_models.iteritems()}), axis=0)
-        logsumexp_logliks1.ix['ambiguous'] = self.logbf_thresh
-        modality_assignments1 = logsumexp_logliks1.idxmax()
-
-        # Take everything that was ambiguous for included/excluded and
-        # estimate bimodal and middle
-        data2 = data.ix[:, modality_assignments1 == 'ambiguous']
-        logsumexp_logliks2 = data2.apply(
-            lambda x: pd.Series(
-                {k: v.logsumexp_logliks(x)
-                 for k, v in self.two_param_models.iteritems()}), axis=0)
-        logsumexp_logliks2.ix['ambiguous'] = self.logbf_thresh
-        modality_assignments2 = logsumexp_logliks2.idxmax()
-
-        # Combine the results
-        modality_assignments = modality_assignments1
-        modality_assignments[modality_assignments2.index] = \
-            modality_assignments2.values
-        return modality_assignments
+        # Estimate Psi~0/Psi~1 first (only one parameter changes with each
+        # parameterization)
+        logbf_one_param = self._fit_transform_one_step(data,
+                                                       self.one_param_models)
+
+        # Take everything that was below the threshold for included/excluded
+        # and estimate bimodal and middle (two parameters change in each
+        # parameterization)
+        ind = (logbf_one_param < self.logbf_thresh).all()
+        ambiguous_columns = ind[ind].index
+        data2 = data.ix[:, ambiguous_columns]
+        logbf_two_param = self._fit_transform_one_step(data2,
+                                                       self.two_param_models)
+        log2_bayes_factors = pd.concat([logbf_one_param, logbf_two_param],
+                                       axis=0)
+
+        # Make sure the returned dataframe has the same number of columns
+        empty = data.count() == 0
+        empty_columns = empty[empty].index
+        empty_df = pd.DataFrame(np.nan, index=log2_bayes_factors.index,
+                                columns=empty_columns)
+        log2_bayes_factors = pd.concat([log2_bayes_factors, empty_df], axis=1)
+        return log2_bayes_factors
 
 
 def switchy_score(array):
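
Taken together, the new ``fit_transform`` and ``assign_modalities`` implement a two-pass scheme: every event is scored against the one-parameter models first, only events whose one-parameter log2 Bayes factors all fall below ``logbf_thresh`` proceed to the two-parameter models, and anything that never clears the threshold ends up 'ambiguous'. A rough end-to-end sketch of that control flow on toy data; the threshold value and the stand-in scoring functions here are invented for illustration, whereas flotilla's real scores come from fitted Beta models via ``logsumexp_logliks``:

    import numpy as np
    import pandas as pd

    logbf_thresh = 3  # assumed threshold: 2**3 = 8-fold evidence vs. uniform

    def fit_one_step(data, models):
        # Score each column (event) against each model in `models`
        return data.apply(lambda x: pd.Series(
            {name: score(x) for name, score in models.items()}), axis=0)

    # Hypothetical stand-ins for the fitted models' scoring methods
    one_param_models = {'Psi~0': lambda x: 8 * (x < 0.2).mean(),
                        'Psi~1': lambda x: 8 * (x > 0.8).mean()}
    two_param_models = {'bimodal': lambda x: 8 * ((x - 0.5).abs() > 0.3).mean(),
                        'middle': lambda x: 8 * ((x - 0.5).abs() < 0.2).mean()}

    data = pd.DataFrame({'mostly_excluded': np.random.beta(1, 10, 50),
                         'middle_ish': np.random.beta(5, 5, 50)})

    # Step one: one-parameter models (Psi~0, Psi~1)
    logbf = fit_one_step(data, one_param_models)

    # Step two: only events still below the threshold everywhere get the
    # two-parameter models (bimodal, middle)
    ambiguous = (logbf < logbf_thresh).all()
    logbf2 = fit_one_step(data.loc[:, ambiguous[ambiguous].index],
                          two_param_models)
    log2_bayes_factors = pd.concat([logbf, logbf2], axis=0)

    # Assignment: the 'ambiguous' row acts as a threshold floor for idxmax
    log2_bayes_factors.loc['ambiguous'] = logbf_thresh
    print(log2_bayes_factors.idxmax())  # most likely modality per event

Running the two-parameter step only on the one-parameter leftovers keeps the more expensive scoring off events already explained by Psi~0 or Psi~1; ``assign_modalities``' ``reset_index`` flag exists so the same thresholding can be applied per group, e.g. per celltype, of a grouped Bayes-factor dataframe.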

flotilla/data_model/base.py

+9 -80

@@ -1053,7 +1053,7 @@ def plot_feature(self, feature_id, sample_ids=None,
                      phenotype_to_color=None,
                      phenotype_to_marker=None, nmf_xlabel=None,
                      nmf_ylabel=None,
-                     nmf_space=False, fig=None, axesgrid=None):
+                     nmf_space=False, fig=None, axesgrid=None, n=20):
         """
         Plot the violinplot of a feature. Have the option to show NMF movement
         """
@@ -1101,13 +1101,13 @@ def plot_feature(self, feature_id, sample_ids=None,
                         phenotype_to_color=phenotype_to_color,
                         phenotype_to_marker=phenotype_to_marker,
                         order=phenotype_order, ax=axes[1],
-                        xlabel=nmf_xlabel, ylabel=nmf_ylabel)
+                        xlabel=nmf_xlabel, ylabel=nmf_ylabel, n=n)
             except KeyError:
                 continue
         sns.despine()
         fig.tight_layout()
 
-    def nmf_space_positions(self, groupby, n=0.5):
+    def nmf_space_positions(self, groupby, n=20):
         """Calculate NMF-space position of splicing events in phenotype groups
 
         Parameters
@@ -1138,16 +1138,17 @@ def nmf_space_positions(self, groupby, n=0.5):
         #     lambda x: x if x.count() >= n else pd.Series(np.nan,
         #                                                   index=x.index))
         df = at_least_n_per_group_per_event.groupby(groupby).apply(
-            lambda x: self.binned_nmf_reduced(data=x))
+            lambda x: self.binned_nmf_reduced(data=x) if
+            x.notnull().sum().sum() > 0 else pd.DataFrame())
         df = df.swaplevel(0, 1)
         df = df.sort_index()
         return df
 
     def plot_nmf_space_transitions(self, feature_id, groupby,
                                    phenotype_to_color,
                                    phenotype_to_marker, order, ax=None,
-                                   xlabel=None, ylabel=None):
-        nmf_space_positions = self.nmf_space_positions(groupby)
+                                   xlabel=None, ylabel=None, n=20):
+        nmf_space_positions = self.nmf_space_positions(groupby, n=n)
 
         nmf_space_transitions(nmf_space_positions, feature_id,
                               phenotype_to_color,
@@ -1156,7 +1157,7 @@ def plot_nmf_space_transitions(self, feature_id, groupby,
 
     @staticmethod
     def transition_distances(positions, transitions):
-        """Get NMF distance of features between phenotype transitions
+        """Get Cartesian distance of phenotype transitions in NMF space
 
         Parameters
         ----------
@@ -1186,7 +1187,7 @@ def transition_distances(positions, transitions):
                 pass
         return distances
 
-    def nmf_space_transitions(self, groupby, phenotype_transitions, n=0.5):
+    def nmf_space_transitions(self, groupby, phenotype_transitions, n=20):
         """Get distance in NMF space of different splicing events
 
         Parameters
@@ -1225,78 +1226,6 @@ def nmf_space_transitions(self, groupby, phenotype_transitions, n=0.5):
             axis=0)
         return nmf_space_transitions
 
-    def big_nmf_space_transitions(self, groupby, phenotype_transitions, n=0.5):
-        """Get features whose change in NMF space between phenotypes is large
-
-        Parameters
-        ----------
-        groupby : mappable
-            A sample id to phenotype group mapping
-        phenotype_transitions : list of length-2 tuples of str
-            List of ('phenotype1', 'phenotype2') transitions whose change in
-            distribution you are interested in
-        n : int
-            Minimum number of samples per phenotype, per event
-
-        Returns
-        -------
-        big_transitions : pandas.DataFrame
-            A (n_events, n_transitions) dataframe of the NMF distances between
-            splicing events
-        """
-        nmf_space_transitions = self.nmf_space_transitions(
-            groupby, phenotype_transitions, n=n)
-
-        # get the mean and standard dev of the whole array
-        n = nmf_space_transitions.count().sum()
-        mean = nmf_space_transitions.sum().sum() / n
-        std = np.sqrt(np.square(nmf_space_transitions - mean).sum().sum() / n)
-
-        big_transitions = nmf_space_transitions[
-            nmf_space_transitions > (mean + std)].dropna(how='all')
-        return big_transitions
-
-    def plot_big_nmf_space_transitions(self, phenotype_groupby,
-                                       phenotype_transitions,
-                                       phenotype_order, color,
-                                       phenotype_to_color,
-                                       phenotype_to_marker, n=0.5):
-        """Violinplots and NMF transitions of features different in phenotypes
-
-        Plot violinplots and NMF-space transitions of features that have large
-        NMF-space transitions between different phenotypes
-
-        Parameters
-        ----------
-        n : int
-            Minimum number of samples per phenotype, per event
-
-
-        Returns
-        -------
-
-
-        Raises
-        ------
-        """
-        big_transitions = self.big_nmf_space_transitions(phenotype_groupby,
-                                                         phenotype_transitions,
-                                                         n=n)
-        nrows = big_transitions.shape[0]
-        ncols = 2
-        figsize = 4 * ncols, 4 * nrows
-
-        fig, axesgrid = plt.subplots(nrows=nrows, ncols=ncols,
-                                     figsize=figsize)
-        if nrows == 1:
-            axesgrid = [axesgrid]
-        for feature_id in big_transitions.index:
-            self.plot_feature(feature_id, phenotype_groupby=phenotype_groupby,
-                              phenotype_order=phenotype_order, color=color,
-                              phenotype_to_color=phenotype_to_color,
-                              phenotype_to_marker=phenotype_to_marker,
-                              nmf_space=True, fig=fig, axesgrid=axesgrid)
-
     def plot_two_samples(self, sample1, sample2, fillna=None,
                          **kwargs):
         """