diff --git a/.travis.yml b/.travis.yml
index e38d678f..a55e2e1f 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -17,6 +17,7 @@ install:
- source activate testenv
- conda install --yes --file conda_requirements.txt
- pip install -r requirements.txt
+- pip install coverage
- pip install coveralls
before_script:
- git config --global user.email "olga.botvinnik@gmail.com"
diff --git a/Makefile b/Makefile
index 75bf6bb9..8487faff 100644
--- a/Makefile
+++ b/Makefile
@@ -7,7 +7,7 @@ test:
coverage:
cp testing/matplotlibrc .
- py.test --durations=20 --cov flotilla --cov-report term-missing flotilla/test/
+ coverage run --source flotilla --omit=test --module py.test
rm matplotlibrc
lint:
diff --git a/README.rst b/README.rst
index 141ae472..2f4e5236 100644
--- a/README.rst
+++ b/README.rst
@@ -1,115 +1,38 @@
-|Build Status|\ |Coverage Status|
-
flotilla
========
-.. figure:: flotilla.png
- :alt: flotilla Logo
-
- flotilla Logo
-Installation instructions
-=========================
-
-From a clean install of Mavericks 10.9.4, follow these steps.
-
-All others must fend for themselves to install matplotlib, scipy and
-their third-party dependencies.
-
-*This part only needs to be done once*
-
-- `Install anaconda `__
-- `Install Xcode (this can take an
- hour) `__
-- Open Xcode and agree to terms and services (it is very important to
- read them thoroughly)
-- Install `homebrew `__
-
- ``ruby -e "$(curl -fsSL https://raw.github.com/Homebrew/homebrew/go/install)"``
-
-- Install freetype:
-
- ``brew install freetype``
-
-- Install heavy packages (this can take an hour or more)
-
-::
-
- conda install pip numpy scipy cython matplotlib nose six scikit-learn ipython networkx pandas tornado statsmodels setuptools pytest pyzmq jinja2 pyyaml`
-
-- Create a virtual environment
- ``conda create -n flotilla_env pip numpy scipy cython matplotlib nose six scikit-learn ipython networkx pandas tornado statsmodels setuptools pytest pyzmq jinja2 pyyaml```
-
-- Switch to virtual environment
-
- ``source activate flotilla_env``
-
-- Install flotilla and its dependencies (this can take a few minutes):
-
- ``pip install git+https://github.com/YeoLab/flotilla.git``
-
-- Create a scratch space for your work
-
- ``mkdir ~/flotilla_scratch``
-
-- Make a place to store flotilla projects
-
- ``mkdir ~/flotilla_projects``
-
-- Go back to the real world
-
- ``source deactivate``
-
-Start using flotilla:
-=====================
-
-Use the above instructions to create a flotilla-friendly environment,
-then:
-
-- switch to virtual environment
-
- ``source activate flotilla_env``
-
-- start an ipython notebook:
-
- ``ipython notebook --notebook-dir=~/flotilla_scratch``
-
-- create a new notebook by clicking ``New Notebook``
-- rename your notebook from "Untitled" to something more informative by
- clicking the title panel.
-- load matplotlib backend using every notebook must use this to display
- inline output
-
- ``%matplotlib inline``
-
-Test interactive features with example data:
---------------------------------------------
-
-We have prepared a slice of the full dataset for testing and
-demonstration purposes.
-
-Run each of the following code lines in its own ipython notebook cell
-for an interactive feature.
-
-::
-
- import flotilla
- test_study = flotilla.embark('http://sauron.ucsd.edu/flotilla_projects/neural_diff_chr22/datapackage.json')
-
- test_study.interactive_pca()
-
- test_study.interactive_graph()
-
- test_study.interactive_classifier()
-
- test_study.interactive_lavalamp_pooled_inconsistent()
-
-IMPORTANT NOTE: for this test,several failures are expected since the
-test set is small. Adjust parameters to explore valid parameter spaces.
-For example, you can manually select ``all_genes`` as the
-``feature_subset`` from the drop-down menu that appears after running
-these interactive functions.
-
-.. |Build Status| image:: https://travis-ci.org/YeoLab/flotilla.svg?branch=master
- :target: https://travis-ci.org/YeoLab/flotilla
-.. |Coverage Status| image:: https://img.shields.io/coveralls/YeoLab/flotilla.svg
- :target: https://coveralls.io/r/YeoLab/flotilla?branch=master
+``flotilla`` is a Python package for visualizing transcriptome (RNA expression) data from hundreds of
+samples. We include utilities to perform common tasks on these large data matrices, including:
+
+* Dimensionality reduction
+* Classification and Regression
+* Outlier detection
+* Network graphs from covariance
+* Hierarchical clustering
+
+And common tasks for biological data including:
+
+* Renaming database features to gene symbols
+* Coloring/marking samples based on experimental phenotype
+* Removing poor-quality samples (technical outliers)
+
+
+Finally, ``flotilla`` is a platform for active collaboration between bioinformatics scientists and
+traditional "wet lab" scientists. Leveraging `interactive widgets `_
+in the `IPython Notebook `_,
+we have created tools for simple and streamlined data exploration including:
+
+* Subsetting sample groups and feature (genes/splicing events) groups
+* Dynamically adjusting parameters for analysis
+* Integrating external lists of features from the web or local files
+
+These empower "wet lab" scientists to ask questions on their own and give bioinformatics
+scientists a platform to share their analysis tools.
+
+
+What flotilla is **not**
+------------------------
+
+``flotilla`` is not a genomics pipeline. We expect that you have already generated
+data tables for gene expression, isoform expression and metadata. ``flotilla`` only makes
+it easy to integrate all those data parts together once you have the pieces.
diff --git a/doc/releases/v0.2.7txt b/doc/releases/v0.2.7.txt
similarity index 100%
rename from doc/releases/v0.2.7txt
rename to doc/releases/v0.2.7.txt
diff --git a/doc/releases/v0.2.8.txt b/doc/releases/v0.2.8.txt
new file mode 100644
index 00000000..0a63b769
--- /dev/null
+++ b/doc/releases/v0.2.8.txt
@@ -0,0 +1,14 @@
+v0.2.8 (........)
+------------------------
+
+Bug fixes
+~~~~~~~~~
+
+- ``Study.tidy_splicing_with_expression`` now handles splicing events that
+ map to multiple gene names. Fixes #304 with #309.
+
+Miscellaneous
+~~~~~~~~~~~~~
+
+- Rasterize lavalamp plot for visualizing many splicing events at once,
+ otherwise the image is too big. PR #308
\ No newline at end of file
diff --git a/doc/whatsnew.rst b/doc/whatsnew.rst
index 15423d5c..6cffa2ac 100644
--- a/doc/whatsnew.rst
+++ b/doc/whatsnew.rst
@@ -7,6 +7,10 @@ What's new in the package
A catalog of new features, improvements, and bug-fixes in each release.
+.. include:: releases/v0.2.8.txt
+.. include:: releases/v0.2.7.txt
+.. include:: releases/v0.2.6.txt
+.. include:: releases/v0.2.5.txt
.. include:: releases/v0.2.4.txt
.. include:: releases/v0.2.3.txt
.. include:: releases/v0.2.2.txt
diff --git a/flotilla/data_model/study.py b/flotilla/data_model/study.py
index 87f2d106..400a6b9e 100644
--- a/flotilla/data_model/study.py
+++ b/flotilla/data_model/study.py
@@ -3,6 +3,7 @@
heavier in terms of data load
"""
import inspect
+import itertools
import json
import os
import sys
@@ -1796,6 +1797,16 @@ def tidy_splicing_with_expression(self):
id_vars=splicing_index_name,
value_name='psi',
var_name=splicing_columns_name)
+
+ s = splicing_common_id.dropna()
+
+ event_name_to_ensembl_ids = list(itertools.chain(
+ *[zip([k] * len(v.split(',')), v.split(',')) for k, v in
+ s.iteritems()]))
+ index, data = zip(*event_name_to_ensembl_ids)
+ event_name_to_ensembl_ids = pd.Series(data, index=index,
+ name=self._common_id)
+
rename_columns = {}
if splicing_index_name == 'index':
rename_columns[splicing_index_name] = self._sample_id
@@ -1804,20 +1815,13 @@ def tidy_splicing_with_expression(self):
splicing_columns_name = self._event_name
splicing_tidy = splicing_tidy.rename(columns=rename_columns)
- # Create a column of the common id on which to join splicing
- # and expression
- splicing_names = splicing_tidy[splicing_columns_name]
- if isinstance(splicing_names, pd.Series):
- splicing_tidy[self._common_id] = splicing_tidy[
- splicing_columns_name].map(splicing_common_id)
- else:
- # Splicing ids are a multi-index, so the feature renamer will get
- # the name of the feature.
- splicing_tidy[self._common_id] = [
- self.splicing.feature_renamer(x)
- for x in splicing_names.itertuples(index=False)]
+ splicing_tidy = splicing_tidy.set_index(splicing_columns_name)
+ splicing_tidy = splicing_tidy.ix[event_name_to_ensembl_ids.index]
+ splicing_tidy = splicing_tidy.join(event_name_to_ensembl_ids)
- splicing_tidy = splicing_tidy.dropna()
+ splicing_tidy = splicing_tidy.dropna().reset_index()
+ splicing_tidy = splicing_tidy.rename(
+ columns={'index': self._event_name})
# Tidify expression
expression = self.expression.data_original
@@ -1833,11 +1837,10 @@ def tidy_splicing_with_expression(self):
columns={'index': self._sample_id})
expression_tidy = expression_tidy.dropna()
- splicing_tidy.set_index([self._sample_id, self._common_id],
- inplace=True)
- expression_tidy.set_index([self._sample_id, self._common_id],
- inplace=True)
- return splicing_tidy.join(expression_tidy, how='inner').reset_index()
+ splicing_tidy_with_expression = splicing_tidy.merge(
+ expression_tidy, left_on=[self._sample_id, self._common_id],
+ right_on=[self._sample_id, self._common_id])
+ return splicing_tidy_with_expression
def filter_splicing_on_expression(self, expression_thresh,
sample_subset=None):
diff --git a/flotilla/external/combat.py b/flotilla/external/combat.py
index 32fe117e..5210cb22 100755
--- a/flotilla/external/combat.py
+++ b/flotilla/external/combat.py
@@ -1,180 +1,121 @@
import sys
-import pandas as pa
+import pandas as pd
import patsy
import numpy.linalg as la
import numpy as np
-"""
-README from: https://github.com/brentp/combat.py
-
-ComBat is an R package for removing batch effects from data.
-This is a python version that matches the output from the ComBat function
-in SVA (http://www.bioconductor.org/packages/release/bioc/html/sva.html).
-This code is completely copied from the ComBat function in that package.
-
-
-Compare
-=======
-
-To test, run this R code (requires sva and bladderbatch from bioconductor):
-
-```Shell
-
-Rscript R-combat.R
-
-```
-
-Then, from the same directory, run
-
-```Shell
-
-python combat.py
-
-```
-
-you can then run this python code to see the differences:
-
-```Python
-import pandas as pa
-p = pa.read_table('py-batch.txt', index_col=0)
-r = pa.read_table('r-batch.txt', index_col=0)
-
-print (p - r).max().max()
-```
-This outputs 3.9423421307e-05 on my machine. Indicating that
-that is the largest difference between the 22,283*57 values generated by the
-R version and those generated in this version.
-
-Performance
-===========
-In the example above, the combat function runs in < 1 second in python
-and about 15 seconds in R.
-
-On an identical dataset, of 30K rows * 190 samples, this python version finishes in 10.008s
-as measured by unix `time`.
-The R version takes 4m0.681s with output identical to 3 decimal places. This is a speed-up
-of about *24x.*
-
-The speed improvement seems to be larger for larger datasets.
-
-Function
-========
-
-The python version is usable as a module, the function has the signature:
-
-```Python
-
- combat(dat, batch, mod, numCovs=None)
-
-```
-
-which is the same as the R function except the non-parametric version is not supported.
-
- + dat is the expression/methylation data.
- + batch is a list containing the batch variable
- + mod is the model matrix (can use patsy for this from python)
- + numCovs is a list like ["age", "height"], that gives the column name or number
- of numeric variables in batch (otherwise they will be converted to factors).
-
-Read
-====
-
- Johnson WE, Rabinovic A, Li C (2007). Adjusting batch effects in microarray
- expression data using Empirical Bayes methods. Biostatistics 8:118-127.
-
- Jeffrey T. Leek, W. Evan Johnson, Hilary S. Parker, Andrew E. Jaffe
- and John D. Storey (). sva: Surrogate Variable Analysis. R package
- version 3.4.0.
-
-"""
-
-def adjust_nums(numCovs, drop_idxs):
+def adjust_nums(numerical_covariates, drop_idxs):
# if we dropped some values, have to adjust those with a larger index.
- if numCovs is None: return drop_idxs
- return [nc - sum(nc < di for di in drop_idxs) for nc in numCovs]
+ if numerical_covariates is None: return drop_idxs
+ return [nc - sum(nc < di for di in drop_idxs) for nc in numerical_covariates]
-def design_mat(mod, numCovs, batch_levels):
+def design_mat(mod, numerical_covariates, batch_levels):
# require levels to make sure they are in the same order as we use in the
# rest of the script.
design = patsy.dmatrix("~ 0 + C(batch, levels=%s)" % str(batch_levels),
mod, return_type="dataframe")
mod = mod.drop(["batch"], axis=1)
- numCovs = list(numCovs)
- print >>sys.stderr, "found %i batches" % design.shape[1]
- other_cols = [c for i, c in enumerate(mod.columns) if not i in numCovs]
+ numerical_covariates = list(numerical_covariates)
+ sys.stderr.write("found %i batches\n" % design.shape[1])
+ other_cols = [c for i, c in enumerate(mod.columns)
+ if not i in numerical_covariates]
factor_matrix = mod[other_cols]
- design = pa.concat((design, factor_matrix), axis=1)
- if numCovs is not None:
- print >>sys.stderr, "found %i numerical covariates..." % len(numCovs)
- for i, nC in enumerate(numCovs):
+ design = pd.concat((design, factor_matrix), axis=1)
+ if numerical_covariates is not None:
+ sys.stderr.write("found %i numerical covariates...\n"
+ % len(numerical_covariates))
+ for i, nC in enumerate(numerical_covariates):
cname = mod.columns[nC]
- print >>sys.stderr, "\t", cname
+ sys.stderr.write("\t{0}\n".format(cname))
design[cname] = mod[mod.columns[nC]]
- print >>sys.stderr, "found %i categorical variables:" % len(other_cols)
- print >>sys.stderr, "\t" + ", ".join(other_cols)
+ sys.stderr.write("found %i categorical variables:" % len(other_cols))
+ sys.stderr.write("\t" + ", ".join(other_cols) + '\n')
return design
-def combat(dat, batch, mod, numCovs=None):
- if isinstance(numCovs, basestring):
- numCovs = [numCovs]
- if numCovs is None:
- numCovs = []
- if mod:
- mod["batch"] = list(batch)
+def combat(data, batch, model=None, numerical_covariates=None):
+ """Correct for batch effects in a dataset
+
+ Parameters
+ ----------
+ data : pandas.DataFrame
+ A (n_features, n_samples) dataframe of the expression or methylation
+ data to batch correct
+ batch : List-like
+ A column corresponding to the batches in the data, in the same order
+ as the samples in ``data``
+ model : pandas.DataFrame, optional
+ A model matrix describing metadata on the samples which could be
+ causing batch effects. If not provided, then will attempt to coarsely
+ correct just from the information provided in ``batch``
+ numerical_covariates : list-like
+ List of covariates in the model which are numerical, rather than
+ categorical
+
+ Returns
+ -------
+ corrected : pandas.DataFrame
+ A (n_features, n_samples) dataframe of the batch-corrected data
+ """
+ if isinstance(numerical_covariates, str):
+ numerical_covariates = [numerical_covariates]
+ if numerical_covariates is None:
+ numerical_covariates = []
+
+ if model is not None and isinstance(model, pd.DataFrame):
+ model["batch"] = list(batch)
else:
- mod = pa.DataFrame({'batch': batch})
+ model = pd.DataFrame({'batch': batch})
- batch_items = mod.groupby("batch").groups.items()
+ batch_items = model.groupby("batch").groups.items()
batch_levels = [k for k, v in batch_items]
batch_info = [v for k, v in batch_items]
n_batch = len(batch_info)
-
n_batches = np.array([len(v) for v in batch_info])
n_array = float(sum(n_batches))
# drop intercept
- drop_cols = [cname for cname, inter in ((mod == 1).all()).iterkv() if inter == True]
- drop_idxs = [list(mod.columns).index(cdrop) for cdrop in drop_cols]
- mod = mod[[c for c in mod.columns if not c in drop_cols]]
- numCovs = [list(mod.columns).index(c) if isinstance(c, basestring) else c
- for c in numCovs if not c in drop_cols]
+ drop_cols = [cname for cname, inter in ((model == 1).all()).iterkv() if inter == True]
+ drop_idxs = [list(model.columns).index(cdrop) for cdrop in drop_cols]
+ model = model[[c for c in model.columns if not c in drop_cols]]
+ numerical_covariates = [list(model.columns).index(c) if isinstance(c, str) else c
+ for c in numerical_covariates if not c in drop_cols]
- design = design_mat(mod, numCovs, batch_levels)
+ design = design_mat(model, numerical_covariates, batch_levels)
- print >>sys.stderr, "Standardizing Data across genes."
- B_hat = np.dot(np.dot(la.inv(np.dot(design.T, design)), design.T), dat.T)
+ sys.stderr.write("Standardizing Data across genes.\n")
+ B_hat = np.dot(np.dot(la.inv(np.dot(design.T, design)), design.T), data.T)
grand_mean = np.dot((n_batches / n_array).T, B_hat[:n_batch,:])
- var_pooled = np.dot(((dat - np.dot(design, B_hat).T)**2), np.ones((n_array, 1)) / n_array)
+ var_pooled = np.dot(((data - np.dot(design, B_hat).T)**2), np.ones((n_array, 1)) / n_array)
stand_mean = np.dot(grand_mean.T.reshape((len(grand_mean), 1)), np.ones((1, n_array)))
tmp = np.array(design.copy())
tmp[:,:n_batch] = 0
stand_mean += np.dot(tmp, B_hat).T
- s_data = ((dat - stand_mean) / np.dot(np.sqrt(var_pooled), np.ones((1, n_array))))
+ s_data = ((data - stand_mean) / np.dot(np.sqrt(var_pooled), np.ones((1, n_array))))
- print >>sys.stderr, "Fitting L/S model and finding priors"
+ sys.stderr.write("Fitting L/S model and finding priors\n")
batch_design = design[design.columns[:n_batch]]
gamma_hat = np.dot(np.dot(la.inv(np.dot(batch_design.T, batch_design)), batch_design.T), s_data.T)
delta_hat = []
for i, batch_idxs in enumerate(batch_info):
- #batches = [list(mod.columns).index(b) for b in batches]
+ #batches = [list(model.columns).index(b) for b in batches]
delta_hat.append(s_data[batch_idxs].var(axis=1))
gamma_bar = gamma_hat.mean(axis=1)
t2 = gamma_hat.var(axis=1)
- a_prior = map(aprior, delta_hat)
- b_prior = map(bprior, delta_hat)
+ a_prior = list(map(aprior, delta_hat))
+ b_prior = list(map(bprior, delta_hat))
- print >>sys.stderr, ("Finding parametric adjustments")
+ sys.stderr.write("Finding parametric adjustments\n")
gamma_star, delta_star = [], []
for i, batch_idxs in enumerate(batch_info):
#print '18 20 22 28 29 31 32 33 35 40 46'
@@ -186,7 +127,7 @@ def combat(dat, batch, mod, numCovs=None):
gamma_star.append(temp[0])
delta_star.append(temp[1])
- print("adjusting data")
+ sys.stdout.write("Adjusting data\n")
bayesdata = s_data
gamma_star = np.array(gamma_star)
delta_star = np.array(delta_star)
@@ -196,7 +137,7 @@ def combat(dat, batch, mod, numCovs=None):
dsq = np.sqrt(delta_star[j,:])
dsq = dsq.reshape((len(dsq), 1))
- denom = np.dot(dsq, np.ones((1, n_batches[j])))
+ denom = np.dot(dsq, np.ones((1, n_batches[j])))
numer = np.array(bayesdata[batch_idxs] - np.dot(batch_design.ix[batch_idxs], gamma_star).T)
bayesdata[batch_idxs] = numer / denom
@@ -247,7 +188,10 @@ def postvar(sum2, n, a, b):
if __name__ == "__main__":
# NOTE: run this first to get the bladder batch stuff written to files.
- """
+ """
+ source("http://bioconductor.org/biocLite.R")
+ biocLite("sva")
+
library("sva")
options(stringsAsFactors=FALSE)
@@ -260,25 +204,26 @@ def postvar(sum2, n, a, b):
write.table(data.frame(cel=rownames(pheno), pheno), row.names=F, quote=F, sep="\t", file="bladder-pheno.txt")
edata = exprs(bladderEset)
- write.table(edata, row.names=T, quote=F, sep="\t", file="bladder-expr.txt")
+ write.table(edata, row.names=T, quote=F, sep="\t", file="bladder-expr.txt")
# use dataframe instead of matrix
mod = model.matrix(~as.factor(cancer) + age, data=pheno)
- t = Sys.time()
+ t = Sys.time()
cdata = ComBat(dat=edata, batch=as.factor(pheno$batch), mod=mod, numCov=match("age", colnames(mod)))
- print(Sys.time() - t)
- print(cdata[1:5, 1:5])
- """
+ print(Sys.time() - t)
+ print(cdata[1:5, 1:5])
+ write.table(cdata, row.names=True, quote=F, sep="\t", file="r-batch.txt")
+ """
- pheno = pa.read_table('bladder-pheno.txt', index_col=0)
- dat = pa.read_table('bladder-expr.txt', index_col=0)
+ pheno = pd.read_table('bladder-pheno.txt', index_col=0)
+ dat = pd.read_table('bladder-expr.txt', index_col=0)
mod = patsy.dmatrix("~ age + cancer", pheno, return_type="dataframe")
import time
t = time.time()
ebat = combat(dat, pheno.batch, mod, "age")
- print "%.2f seconds" % (time.time() - t)
-
- print ebat.ix[:5, :5]
+ sys.stdout.write("%.2f seconds\n" % (time.time() - t))
+
+ sys.stdout.write(str(ebat.ix[:5, :5]))
ebat.to_csv("py-batch.txt", sep="\t")
diff --git a/flotilla/test/data_model/test_study.py b/flotilla/test/data_model/test_study.py
index ad14ca9b..615f81ea 100644
--- a/flotilla/test/data_model/test_study.py
+++ b/flotilla/test/data_model/test_study.py
@@ -3,6 +3,7 @@
computation or visualization tests yet.
"""
from collections import Iterable
+import itertools
import json
import matplotlib.pyplot as plt
@@ -267,6 +268,82 @@ def test_sample_subset_to_sample_ids(self, study, sample_subset):
pdt.assert_array_equal(true_sample_subset, test_sample_subset)
+ @pytest.fixture(params=[True, False])
+ def multiple_genes_per_event(self, request):
+ return request.param
+
+ def test_tidy_splicing_with_expression(self, study, monkeypatch,
+ multiple_genes_per_event):
+ if multiple_genes_per_event:
+ df = study.splicing.feature_data.copy()
+ events = df.index[:5]
+ column = study.splicing.feature_expression_id_col
+ df.ix[events, column] = '{},{}'.format(
+ *study.expression.data.columns[:2])
+ monkeypatch.setattr(study.splicing, 'feature_data', df)
+ test = study.tidy_splicing_with_expression
+
+ splicing_common_id = study.splicing.feature_data[
+ study.splicing.feature_expression_id_col]
+
+ # Tidify splicing
+ splicing = study.splicing.data
+ splicing_index_name = study._maybe_get_axis_name(splicing, axis=0)
+ splicing_columns_name = study._maybe_get_axis_name(splicing, axis=1)
+
+ splicing_tidy = pd.melt(splicing.reset_index(),
+ id_vars=splicing_index_name,
+ value_name='psi',
+ var_name=splicing_columns_name)
+
+ s = splicing_common_id.dropna()
+
+ event_name_to_ensembl_ids = list(itertools.chain(
+ *[zip([k] * len(v.split(',')), v.split(',')) for k, v in
+ s.iteritems()]))
+ index, data = zip(*event_name_to_ensembl_ids)
+ event_name_to_ensembl_ids = pd.Series(data, index=index,
+ name=study._common_id)
+
+ rename_columns = {}
+ if splicing_index_name == 'index':
+ rename_columns[splicing_index_name] = study._sample_id
+ if splicing_columns_name == 'columns':
+ rename_columns[splicing_columns_name] = study._event_name
+ splicing_columns_name = study._event_name
+ splicing_tidy = splicing_tidy.rename(columns=rename_columns)
+
+ splicing_tidy = splicing_tidy.set_index(splicing_columns_name)
+ splicing_tidy = splicing_tidy.ix[event_name_to_ensembl_ids.index]
+ splicing_tidy = splicing_tidy.join(event_name_to_ensembl_ids)
+
+ splicing_tidy = splicing_tidy.dropna().reset_index()
+ splicing_tidy = splicing_tidy.rename(
+ columns={'index': study._event_name})
+
+ # Tidify expression
+ expression = study.expression.data_original
+ expression_index_name = study._maybe_get_axis_name(expression, axis=0)
+
+ expression_tidy = pd.melt(expression.reset_index(),
+ id_vars=expression_index_name,
+ value_name='expression',
+ var_name=study._common_id)
+ # This will only do anything if there is a column named "index" so
+ # no need to check anything
+ expression_tidy = expression_tidy.rename(
+ columns={'index': study._sample_id})
+ expression_tidy = expression_tidy.dropna()
+
+ true = splicing_tidy.merge(
+ expression_tidy, left_on=[study._sample_id, study._common_id],
+ right_on=[study._sample_id, study._common_id])
+ pdt.assert_frame_equal(test, true)
+ assert 'event_name' in test
+ assert 'event_name' in true
+ assert 'common_id' in true
+ assert 'common_id' in test
+
def test_filter_splicing_on_expression(self, study):
expression_thresh = 5
sample_subset = None
diff --git a/flotilla/visualize/splicing.py b/flotilla/visualize/splicing.py
index 38598735..fd80872d 100644
--- a/flotilla/visualize/splicing.py
+++ b/flotilla/visualize/splicing.py
@@ -209,6 +209,7 @@ def lavalamp(psi, color=None, x_offset=0, title='', ax=None,
plot_kws.setdefault('linestyle', 'None')
plot_kws.setdefault('markeredgecolor', '#262626')
plot_kws.setdefault('markeredgewidth', .1)
+ plot_kws.setdefault('rasterized', True)
y = as_numpy(psi.dropna(how='all', axis=1))