Commit 77e19d0

Merge pull request #311 from YeoLab/splicing_min20
Use at least 20 samples for all splicing calculations
2 parents: dcdf8c7 + 769d7a3

10 files changed, +255 -244 lines


Makefile

+1 -1

@@ -7,7 +7,7 @@ test:
 
 coverage:
 	cp testing/matplotlibrc .
-	coverage run --source flotilla --omit=test --module py.test
+	coverage run --source flotilla --omit=test --module py.test -v
 	rm matplotlibrc
 
 lint:

doc/releases/v0.2.8.txt

+11 -2

@@ -1,5 +1,11 @@
 v0.2.8 (........)
-------------------------
+-----------------
+
+New features
+~~~~~~~~~~~~
+
+- Added ``Study.modality_log2bf()``, which gets the log2 Bayes factor
+  for each splicing event's fit to each modality in each phenotype
 
 Bug fixes
 ~~~~~~~~~
@@ -15,4 +21,7 @@ Miscellaneous
 - Change modality estimation to a two-step process: estimate :math:`\Psi\sim 0` and :math:`\Psi\sim 1`
   first, which change one parameter of the Beta distribution at a time,
   then bimodal and middle, which change both parameters of the Beta
-  distribution at once.
+  distribution at once.
+- Make sure both modality estimation and NMF space calculation use at least
+  20 samples per event
+- Get rid of ``big_nmf_space_transitions`` for now
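
For context on the ``Study.modality_log2bf()`` entry above: a log2 Bayes factor compares the evidence that an event's Psi distribution came from one of the Beta-parameterized modality models against the uniform null. Below is a minimal sketch of that comparison, assuming a grid of Beta parameterizations per modality with a uniform prior over the grid; it is an illustration only, not flotilla's actual ``logsumexp_logliks`` implementation.

    import numpy as np
    from scipy import stats
    from scipy.special import logsumexp

    def log2_bayes_factor(psi, alphas, betas):
        # psi: 1-d array of Psi scores in [0, 1] for one splicing event
        # alphas, betas: a grid of Beta parameterizations for one modality
        psi = psi[np.isfinite(psi)]
        logliks = [stats.beta.logpdf(psi, a, b).sum()
                   for a, b in zip(alphas, betas)]
        # Marginal log-likelihood of the modality: logsumexp over the grid,
        # minus log(grid size) for the uniform prior over parameterizations
        log_marginal = logsumexp(logliks) - np.log(len(logliks))
        # The Uniform(0, 1) null has density 1 everywhere, so its
        # log-likelihood is 0; the log Bayes factor is the marginal itself,
        # converted from natural log to log2
        return log_marginal / np.log(2)

    # A Psi~0-style modality: first Beta parameter pinned at 1, second swept
    # upward (this particular grid is an assumption for the example)
    psi = np.random.beta(1, 10, size=30)
    print(log2_bayes_factor(psi, alphas=np.ones(10), betas=np.arange(2, 12)))

An event is then assigned whichever modality has the largest factor, falling back to 'ambiguous' when none clears the threshold, as in the diff below.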

flotilla/compute/splicing.py

+69 -37

@@ -107,20 +107,52 @@ def _logsumexp(self, logliks):
         logsumexps['ambiguous'] = self.logbf_thresh
         return logsumexps
 
-    def _guess_modality(self, logsumexps):
-        """Guess the most likely modality.
+    def assign_modalities(self, log2_bayes_factors, reset_index=False):
+        """Guess the most likely modality for each event
 
-        If no modalilites have logsumexp'd logliks greater than the log Bayes
-        factor threshold, then they are assigned the 'uniform' modality,
-        which is the null hypothesis
-        """
+        For each event that has at least one non-NA value, if no modalities
+        have logsumexp'd logliks greater than the log Bayes factor threshold,
+        then it is assigned the 'ambiguous' modality, because we cannot
+        reject the null hypothesis that it came from the uniform
+        distribution.
+
+        Parameters
+        ----------
+        log2_bayes_factors : pandas.DataFrame
+            A (4, n_events) dataframe with Bayes factors for the Psi~1,
+            Psi~0, bimodal, and middle modalities. If an event has no Bayes
+            factors for any of those modalities, it is ignored
+        reset_index : bool
+            If True, remove the first level of the index from the dataframe.
+            Useful if you are using this function to apply to a grouped
+            dataframe where the first level is something other than the
+            modality, e.g. the celltype
+
+        Returns
+        -------
+        modalities : pandas.Series
+            A (n_events,) series with the most likely modality for each event
 
-        if all(logsumexps[self.one_param_models.keys()] > self.logbf_thresh):
-            return logsumexps[self.one_param_models.keys()].idxmax()
+        """
+        if reset_index:
+            x = log2_bayes_factors.reset_index(level=0, drop=True)
+        else:
+            x = log2_bayes_factors
+        not_na = (x.notnull() > 0).any()
+        not_na_columns = not_na[not_na].index
+        x.ix['ambiguous', not_na_columns] = self.logbf_thresh
+        return x.idxmax()
+
+    def _fit_transform_one_step(self, data, models):
+        non_na = data.count() > 0
+        non_na_columns = non_na[non_na].index
+        data_non_na = data[non_na_columns]
+        if data_non_na.empty:
+            return pd.DataFrame()
         else:
-            other_models = logsumexps.index.difference(
-                self.one_param_models.keys())
-            return logsumexps[other_models].idxmax()
+            return data_non_na.apply(lambda x: pd.Series(
+                {k: v.logsumexp_logliks(x)
+                 for k, v in models.iteritems()}), axis=0)
 
     def fit_transform(self, data):
         """Get the modality assignments of each splicing event in the data
@@ -133,9 +165,9 @@ def fit_transform(self, data):
 
         Returns
         -------
-        modality_assignments : pandas.Series
-            A (n_events,) series of the estimated modality for each splicing
-            event
+        log2_bayes_factors : pandas.DataFrame
+            A (n_modalities, n_events) dataframe of the estimated log2
+            Bayes factor for each splicing event, for each modality
 
         Raises
         ------
@@ -145,29 +177,29 @@ def fit_transform(self, data):
         assert np.all(data.values.flat[np.isfinite(data.values.flat)] <= 1)
         assert np.all(data.values.flat[np.isfinite(data.values.flat)] >= 0)
 
-        # Estimate Psi~0/Psi~1 first
-        logsumexp_logliks1 = data.apply(
-            lambda x: pd.Series(
-                {k: v.logsumexp_logliks(x)
-                 for k, v in self.one_param_models.iteritems()}), axis=0)
-        logsumexp_logliks1.ix['ambiguous'] = self.logbf_thresh
-        modality_assignments1 = logsumexp_logliks1.idxmax()
-
-        # Take everything that was ambiguous for included/excluded and
-        # estimate bimodal and middle
-        data2 = data.ix[:, modality_assignments1 == 'ambiguous']
-        logsumexp_logliks2 = data2.apply(
-            lambda x: pd.Series(
-                {k: v.logsumexp_logliks(x)
-                 for k, v in self.two_param_models.iteritems()}), axis=0)
-        logsumexp_logliks2.ix['ambiguous'] = self.logbf_thresh
-        modality_assignments2 = logsumexp_logliks2.idxmax()
-
-        # Combine the results
-        modality_assignments = modality_assignments1
-        modality_assignments[modality_assignments2.index] = \
-            modality_assignments2.values
-        return modality_assignments
+        # Estimate Psi~0/Psi~1 first (only one parameter changes with each
+        # parameterization)
+        logbf_one_param = self._fit_transform_one_step(data,
+                                                       self.one_param_models)
+
+        # Take everything that was below the threshold for included/excluded
+        # and estimate bimodal and middle (two parameters change in each
+        # parameterization)
+        ind = (logbf_one_param < self.logbf_thresh).all()
+        ambiguous_columns = ind[ind].index
+        data2 = data.ix[:, ambiguous_columns]
+        logbf_two_param = self._fit_transform_one_step(data2,
+                                                       self.two_param_models)
+        log2_bayes_factors = pd.concat([logbf_one_param, logbf_two_param],
+                                       axis=0)
+
+        # Make sure the returned dataframe has the same number of columns
+        empty = data.count() == 0
+        empty_columns = empty[empty].index
+        empty_df = pd.DataFrame(np.nan, index=log2_bayes_factors.index,
+                                columns=empty_columns)
+        log2_bayes_factors = pd.concat([log2_bayes_factors, empty_df], axis=1)
+        return log2_bayes_factors
 
 
 def switchy_score(array):
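
Taken together, the new ``fit_transform`` and ``assign_modalities`` implement a two-pass scheme: every event is scored against the one-parameter models first, only events whose one-parameter log2 Bayes factors all fall below ``logbf_thresh`` proceed to the two-parameter models, and anything that never clears the threshold ends up 'ambiguous'. A rough end-to-end sketch of that control flow on toy data; the threshold value and the stand-in scoring functions here are invented for illustration, whereas flotilla's real scores come from fitted Beta models via ``logsumexp_logliks``:

    import numpy as np
    import pandas as pd

    logbf_thresh = 3  # assumed threshold: 2**3 = 8-fold evidence vs. uniform

    def fit_one_step(data, models):
        # Score each column (event) against each model in `models`
        return data.apply(lambda x: pd.Series(
            {name: score(x) for name, score in models.items()}), axis=0)

    # Hypothetical stand-ins for the fitted models' scoring methods
    one_param_models = {'Psi~0': lambda x: 8 * (x < 0.2).mean(),
                        'Psi~1': lambda x: 8 * (x > 0.8).mean()}
    two_param_models = {'bimodal': lambda x: 8 * ((x - 0.5).abs() > 0.3).mean(),
                        'middle': lambda x: 8 * ((x - 0.5).abs() < 0.2).mean()}

    data = pd.DataFrame({'mostly_excluded': np.random.beta(1, 10, 50),
                         'middle_ish': np.random.beta(5, 5, 50)})

    # Step one: one-parameter models (Psi~0, Psi~1)
    logbf = fit_one_step(data, one_param_models)

    # Step two: only events still below the threshold everywhere get the
    # two-parameter models (bimodal, middle)
    ambiguous = (logbf < logbf_thresh).all()
    logbf2 = fit_one_step(data.loc[:, ambiguous[ambiguous].index],
                          two_param_models)
    log2_bayes_factors = pd.concat([logbf, logbf2], axis=0)

    # Assignment: the 'ambiguous' row acts as a threshold floor for idxmax
    log2_bayes_factors.loc['ambiguous'] = logbf_thresh
    print(log2_bayes_factors.idxmax())  # most likely modality per event

Running the two-parameter step only on the one-parameter leftovers keeps the more expensive scoring off events already explained by Psi~0 or Psi~1; ``assign_modalities``' ``reset_index`` flag exists so the same thresholding can be applied per group, e.g. per celltype, of a grouped Bayes-factor dataframe.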

flotilla/data_model/base.py

+9 -80

@@ -1053,7 +1053,7 @@ def plot_feature(self, feature_id, sample_ids=None,
                      phenotype_to_color=None,
                      phenotype_to_marker=None, nmf_xlabel=None,
                      nmf_ylabel=None,
-                     nmf_space=False, fig=None, axesgrid=None):
+                     nmf_space=False, fig=None, axesgrid=None, n=20):
         """
         Plot the violinplot of a feature. Have the option to show NMF movement
         """
@@ -1101,13 +1101,13 @@ def plot_feature(self, feature_id, sample_ids=None,
                         phenotype_to_color=phenotype_to_color,
                         phenotype_to_marker=phenotype_to_marker,
                         order=phenotype_order, ax=axes[1],
-                        xlabel=nmf_xlabel, ylabel=nmf_ylabel)
+                        xlabel=nmf_xlabel, ylabel=nmf_ylabel, n=n)
             except KeyError:
                 continue
         sns.despine()
         fig.tight_layout()
 
-    def nmf_space_positions(self, groupby, n=0.5):
+    def nmf_space_positions(self, groupby, n=20):
         """Calculate NMF-space position of splicing events in phenotype groups
 
         Parameters
@@ -1138,16 +1138,17 @@ def nmf_space_positions(self, groupby, n=0.5):
         #     lambda x: x if x.count() >= n else pd.Series(np.nan,
         #                                                   index=x.index))
         df = at_least_n_per_group_per_event.groupby(groupby).apply(
-            lambda x: self.binned_nmf_reduced(data=x))
+            lambda x: self.binned_nmf_reduced(data=x) if
+            x.notnull().sum().sum() > 0 else pd.DataFrame())
         df = df.swaplevel(0, 1)
         df = df.sort_index()
         return df
 
     def plot_nmf_space_transitions(self, feature_id, groupby,
                                    phenotype_to_color,
                                    phenotype_to_marker, order, ax=None,
-                                   xlabel=None, ylabel=None):
-        nmf_space_positions = self.nmf_space_positions(groupby)
+                                   xlabel=None, ylabel=None, n=20):
+        nmf_space_positions = self.nmf_space_positions(groupby, n=n)
 
         nmf_space_transitions(nmf_space_positions, feature_id,
                               phenotype_to_color,
@@ -1156,7 +1157,7 @@ def plot_nmf_space_transitions(self, feature_id, groupby,
 
     @staticmethod
     def transition_distances(positions, transitions):
-        """Get NMF distance of features between phenotype transitions
+        """Get Cartesian distance of phenotype transitions in NMF space
 
         Parameters
         ----------
@@ -1186,7 +1187,7 @@ def transition_distances(positions, transitions):
                 pass
         return distances
 
-    def nmf_space_transitions(self, groupby, phenotype_transitions, n=0.5):
+    def nmf_space_transitions(self, groupby, phenotype_transitions, n=20):
         """Get distance in NMF space of different splicing events
 
         Parameters
@@ -1225,78 +1226,6 @@ def nmf_space_transitions(self, groupby, phenotype_transitions, n=0.5):
             axis=0)
         return nmf_space_transitions
 
-    def big_nmf_space_transitions(self, groupby, phenotype_transitions, n=0.5):
-        """Get features whose change in NMF space between phenotypes is large
-
-        Parameters
-        ----------
-        groupby : mappable
-            A sample id to phenotype group mapping
-        phenotype_transitions : list of length-2 tuples of str
-            List of ('phenotype1', 'phenotype2') transitions whose change in
-            distribution you are interested in
-        n : int
-            Minimum number of samples per phenotype, per event
-
-        Returns
-        -------
-        big_transitions : pandas.DataFrame
-            A (n_events, n_transitions) dataframe of the NMF distances between
-            splicing events
-        """
-        nmf_space_transitions = self.nmf_space_transitions(
-            groupby, phenotype_transitions, n=n)
-
-        # get the mean and standard dev of the whole array
-        n = nmf_space_transitions.count().sum()
-        mean = nmf_space_transitions.sum().sum() / n
-        std = np.sqrt(np.square(nmf_space_transitions - mean).sum().sum() / n)
-
-        big_transitions = nmf_space_transitions[
-            nmf_space_transitions > (mean + std)].dropna(how='all')
-        return big_transitions
-
-    def plot_big_nmf_space_transitions(self, phenotype_groupby,
-                                       phenotype_transitions,
-                                       phenotype_order, color,
-                                       phenotype_to_color,
-                                       phenotype_to_marker, n=0.5):
-        """Violinplots and NMF transitions of features different in phenotypes
-
-        Plot violinplots and NMF-space transitions of features that have large
-        NMF-space transitions between different phenotypes
-
-        Parameters
-        ----------
-        n : int
-            Minimum number of samples per phenotype, per event
-
-
-        Returns
-        -------
-
-
-        Raises
-        ------
-        """
-        big_transitions = self.big_nmf_space_transitions(phenotype_groupby,
-                                                         phenotype_transitions,
-                                                         n=n)
-        nrows = big_transitions.shape[0]
-        ncols = 2
-        figsize = 4 * ncols, 4 * nrows
-
-        fig, axesgrid = plt.subplots(nrows=nrows, ncols=ncols,
-                                     figsize=figsize)
-        if nrows == 1:
-            axesgrid = [axesgrid]
-        for feature_id in big_transitions.index:
-            self.plot_feature(feature_id, phenotype_groupby=phenotype_groupby,
-                              phenotype_order=phenotype_order, color=color,
-                              phenotype_to_color=phenotype_to_color,
-                              phenotype_to_marker=phenotype_to_marker,
-                              nmf_space=True, fig=fig, axesgrid=axesgrid)
-
     def plot_two_samples(self, sample1, sample2, fillna=None,
                          **kwargs):
         """