From 681e5a689e5fee517c65e208eb377026a15a6647 Mon Sep 17 00:00:00 2001
From: Camille Scott
Date: Wed, 12 Dec 2018 16:50:10 -0800
Subject: [PATCH] Handle empty results properly and bump version.

---
 codemeta.json                  |   2 +-
 requirements.txt               |   2 +-
 setup.py                       |   2 +-
 shmlast/VERSION                |   2 +-
 shmlast/app.py                 |   8 +-
 shmlast/crbl.py                |  23 +++-
 shmlast/hits.py                |   1 +
 shmlast/last.py                | 122 +------------------
 shmlast/maf.py                 | 212 +++++++++++++++++++++++++++++++++
 shmlast/tests/test_besthits.py |  18 ++-
 shmlast/tests/test_last.py     |   2 +-
 shmlast/tests/utils.py         |  10 +-
 12 files changed, 263 insertions(+), 141 deletions(-)
 create mode 100644 shmlast/maf.py

diff --git a/codemeta.json b/codemeta.json
index 54d5c86..414dee1 100644
--- a/codemeta.json
+++ b/codemeta.json
@@ -19,5 +19,5 @@
     "keywords": "bioinformatics, orthology, alignment, LAST, BLAST, python",
     "license": "BSD-3-Clause",
     "title": "shmlast",
-    "version": "v1.1"
+    "version": "v1.3"
 }
diff --git a/requirements.txt b/requirements.txt
index 8eefe04..b672921 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 doit>=0.29.0
-ficus>=0.3.2
+ficus>=0.5
 matplotlib>=1.5.1
 numpy>=1.9.0
 pandas>=0.17.0
diff --git a/setup.py b/setup.py
index f6c86d2..44e7281 100755
--- a/setup.py
+++ b/setup.py
@@ -36,7 +36,7 @@ def main():
         tests_require = ['pytest', 'codecov'],
         install_requires = ['doit>=0.29.0',
-                            'ficus>=0.3.2',
+                            'ficus>=0.5',
                             'matplotlib>=1.5.1',
                             'numpy>=1.9.0',
                             'pandas>=0.17.0',
diff --git a/shmlast/VERSION b/shmlast/VERSION
index 6085e94..7e32cd5 100644
--- a/shmlast/VERSION
+++ b/shmlast/VERSION
@@ -1 +1 @@
-1.2.1
+1.3
diff --git a/shmlast/app.py b/shmlast/app.py
index b004ee0..5b0d7c1 100644
--- a/shmlast/app.py
+++ b/shmlast/app.py
@@ -9,9 +9,9 @@
 from .crbl import (get_reciprocal_best_last_translated,
                    backmap_names, scale_evalues, fit_crbh_model,
                    filter_hits_from_model, plot_crbh_fit)
-
+from .last import lastdb_task, lastal_task
+from .maf import MafParser
 from .profile import StartProfiler, profile_task
-from .last import lastdb_task, lastal_task, MafParser
 from .translate import translate_task, rename_task
 from .util import ShortenedPythonAction, title, hidden_fn
 from .util import create_doit_task as doit_task
@@ -228,10 +228,10 @@ def do_crbl_fit_and_filter():
                                                       self.db_x_query_fn)
             model_df = fit_crbh_model(rbh_df)
             model_df.to_csv(self.model_fn, index=False)
-            model_df = pd.read_csv(self.model_fn)
+            #model_df = pd.read_csv(self.model_fn)
             filtered_df = filter_hits_from_model(model_df, rbh_df, hits_df)
-            results = pd.concat([rbh_df, filtered_df], axis=0)
+            results = pd.concat([rbh_df, filtered_df], axis=0, sort=True)
 
             results, scaled_col = scale_evalues(results, inplace=True)
             del results['translated_q_name']
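A minimal standalone sketch (not shmlast code) of why sort=True is passed to pd.concat above: rbh_df and filtered_df can carry slightly different column sets, and pandas >= 0.23 warns on such concatenations unless the column-sorting behavior is made explicit. The column names below are illustrative only:

    import pandas as pd

    a = pd.DataFrame({'q_name': ['tr1'], 'E': [1e-30]})
    b = pd.DataFrame({'q_name': ['tr2'], 'E': [1e-10], 'extra': [1]})

    # Without an explicit sort=..., pandas 0.23+ emits a FutureWarning here;
    # sort=True unions the differing columns in sorted, deterministic order.
    merged = pd.concat([a, b], axis=0, sort=True)
    print(list(merged.columns))  # ['E', 'extra', 'q_name']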
diff --git a/shmlast/crbl.py b/shmlast/crbl.py
index cd96baa..e76344e 100644
--- a/shmlast/crbl.py
+++ b/shmlast/crbl.py
@@ -10,7 +10,7 @@
 import seaborn as sns
 
 from .hits import BestHits
-from .last import MafParser
+from .maf import MafParser
 
 float_info = np.finfo(float)
@@ -27,14 +27,21 @@ def get_reciprocal_best_last_translated(query_maf, database_maf):
     '''
     bh = BestHits(comparison_cols=['E', 'EG2'])
 
     qvd_df = MafParser(query_maf).read()
-    qvd_df[['qg_name', 'q_frame']] = qvd_df.q_name.str.partition('_')[[0,2]]
+    try:
+        qvd_df[['qg_name', 'q_frame']] = qvd_df.q_name.str.partition('_')[[0,2]]
+    except KeyError:
+        # could have been empty
+        qvd_df = qvd_df.assign(qg_name=[], q_frame=[])
     qvd_df.rename(columns={'q_name': 'translated_q_name',
                            'qg_name': 'q_name'},
                   inplace=True)
     qvd_df['ID'] = qvd_df.index
 
     dvq_df = MafParser(database_maf).read()
-    dvq_df[['sg_name', 'frame']] = dvq_df.s_name.str.partition('_')[[0,2]]
+    try:
+        dvq_df[['sg_name', 'frame']] = dvq_df.s_name.str.partition('_')[[0,2]]
+    except KeyError:
+        dvq_df = dvq_df.assign(sg_name=[], frame=[])
 
     dvq_df.rename(columns={'s_name': 'translated_s_name',
                            'sg_name': 's_name'},
                   inplace=True)
@@ -111,8 +118,9 @@ def fit_crbh_model(rbh_df, length_col='s_aln_len', feature_col='E'):
     _, feature_col = scale_evalues(data, name=feature_col, inplace=True)
 
     # create a DataFrame for the model, staring with the alignment lengths
-    fit = pd.DataFrame(np.arange(10, data['length'].max()),
-                       columns=['center'], dtype=int)
+    stop = data['length'].max()
+    fit = pd.DataFrame(np.arange(10, stop if not np.isnan(stop) else 11),
+                       columns=['center'], dtype=np.int64)
 
     # create the bins
     fit['size'] = fit['center'] * 0.1
@@ -125,6 +133,7 @@ def fit_crbh_model(rbh_df, length_col='s_aln_len', feature_col='E'):
     def bin_mean(fit_row, df):
         hits = df[(df['length'] >= fit_row.left) & (df['length'] <= fit_row.right)]
         return hits[feature_col].mean()
+
     fit['fit'] = fit.apply(bin_mean, args=(data,), axis=1)
 
     model_df = fit.dropna()
@@ -150,6 +159,7 @@ def filter_hits_from_model(model_df, rbh_df, hits_df, feature_col='E',
     rbh_df, scaled_feature_col = scale_evalues(rbh_df, name=feature_col,
                                                inplace=False)
 
     # Merge the model into the subset of the hits which aren't in RBH
+    print(hits_df[length_col].dtype, model_df['center'].dtype)
     comp_df = pd.merge(hits_df[hits_df[id_col].isin(rbh_df[id_col]) == False],
                        model_df, left_on=length_col, right_on='center')
@@ -173,6 +183,9 @@ def plot_crbh_fit(model_df, hits_df, model_plot_fn, show=False,
     with FigureManager(model_plot_fn, show=show,
                        figsize=figsize, **fig_kwds) as (fig, ax):
 
+        if not len(hits_df):
+            return
+
         sample_size = min(len(hits_df), 5000)
         hits_df, scaled_col = scale_evalues(hits_df, name=feature_col,
                                             inplace=False)
diff --git a/shmlast/hits.py b/shmlast/hits.py
index 4c753b1..5b52e9a 100644
--- a/shmlast/hits.py
+++ b/shmlast/hits.py
@@ -3,6 +3,7 @@
 
 import pandas as pd
 
+
 class BestHits(object):
 
     def __init__(self, comparison_cols=['E'], query_name_col='q_name',
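The try/except KeyError added above handles MAF files with no alignments. A standalone sketch of the failure mode it guards against, assuming the pandas behavior current when this patch was written: on a frame with zero rows, .str.partition() yields none of the usual 0/1/2 result columns, so selecting [[0, 2]] raises KeyError:

    import pandas as pd

    qvd_df = pd.DataFrame({'q_name': pd.Series([], dtype=str)})
    try:
        qvd_df[['qg_name', 'q_frame']] = qvd_df.q_name.str.partition('_')[[0, 2]]
    except KeyError:
        # same fallback as the patch: the columns exist, with zero rows
        qvd_df = qvd_df.assign(qg_name=[], q_frame=[])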
diff --git a/shmlast/last.py b/shmlast/last.py
index e4fa97c..f43fa33 100644
--- a/shmlast/last.py
+++ b/shmlast/last.py
@@ -8,6 +8,7 @@
 import os
 import pandas as pd
 
+from .maf import MafParser
 from .profile import profile_task
 from .util import create_doit_task as doit_task
 from .util import which, parallel_fasta, title
@@ -113,124 +114,3 @@ def lastal_task(query, db, out_fn, translate=False,
             'targets': [out_fn],
             'file_dep': [query, db + '.prj'],
             'clean': [clean_targets]}
-
-
-def next_or_raise(fp):
-    counter = count()
-    def func(raise_exc=True):
-        line = fp.readline()
-        n = next(counter)
-        if raise_exc is True and line == '':
-            raise RuntimeError('Malformed MAF file (line {0})'.format(n))
-        return line
-    return func
-
-
-class MafParser(object):
-
-    def __init__(self, filename, aln_strings=False, chunksize=10000, **kwargs):
-        '''Parser for LAST's MAF output.
-
-        Args:
-            filename (str): The MAF file.
-            aln_strings (bool): If True, parse the alignment strings as well.
-            chunksize (int): Size of chunks to parse.
-        '''
-        self.chunksize = chunksize
-        self.filename = filename
-        self.aln_strings = aln_strings
-        self.LAMBDA = None
-        self.K = None
-
-    def read(self):
-        '''Read the entire file at once and return as a single DataFrame.
-        '''
-        return pd.concat(iter(self), ignore_index=True)
-
-    def __iter__(self):
-        '''Iterator yielding DataFrames of length chunksize holding MAF alignments.
-
-        An extra column is added for bitscore, using the equation described here:
-            http://last.cbrc.jp/doc/last-evalues.html
-
-        Args:
-            fn (str): Path to the MAF alignment file.
-            chunksize (int): Alignments to parse per iteration.
-        Yields:
-            DataFrame: Pandas DataFrame with the alignments.
-        '''
-        data = []
-        with open(self.filename) as fp:
-            guarded_next = next_or_raise(fp)
-            n = 0
-            while (True):
-                line = guarded_next(raise_exc=False)
-                if line == '':
-                    break
-                line = line.strip()
-                if not line:
-                    continue
-                if line.startswith('#'):
-                    if 'lambda' in line:
-                        meta = line.strip(' #').split()
-                        meta = {k:v for k, _, v in map(lambda x: x.partition('='), meta)}
-                        self.LAMBDA = float(meta['lambda'])
-                        self.K = float(meta['K'])
-                    else:
-                        continue
-                if line.startswith('a'):
-                    cur_aln = {}
-
-                    # Alignment info
-                    tokens = line.split()
-                    for token in tokens[1:]:
-                        key, _, val = token.strip().partition('=')
-                        cur_aln[key] = float(val)
-
-                    # First sequence info
-                    line = guarded_next()
-                    tokens = line.split()
-                    cur_aln['s_name'] = tokens[1]
-                    cur_aln['s_start'] = int(tokens[2])
-                    cur_aln['s_aln_len'] = int(tokens[3])
-                    cur_aln['s_strand'] = tokens[4]
-                    cur_aln['s_len'] = int(tokens[5])
-                    if self.aln_strings:
-                        cur_aln['s_aln'] = tokens[6]
-
-                    # First sequence info
-                    line = guarded_next()
-                    tokens = line.split()
-                    cur_aln['q_name'] = tokens[1]
-                    cur_aln['q_start'] = int(tokens[2])
-                    cur_aln['q_aln_len'] = int(tokens[3])
-                    cur_aln['q_strand'] = tokens[4]
-                    cur_aln['q_len'] = int(tokens[5])
-                    if self.aln_strings:
-                        cur_aln['q_aln'] = tokens[6]
-
-                    data.append(cur_aln)
-                    if len(data) >= self.chunksize:
-                        if self.LAMBDA is None:
-                            raise RuntimeError("old version of lastal; please update")
-                        yield self._build_df(data)
-                        data = []
-
-        if data:
-            yield self._build_df(data)
-
-    def _build_df(self, data):
-
-        def _fix_sname(name):
-            new, _, _ = name.partition(',')
-            return new
-
-        df = pd.DataFrame(data)
-        df['s_name'] = df['s_name'].apply(_fix_sname)
-        setattr(df, 'LAMBDA', self.LAMBDA)
-        setattr(df, 'K', self.K)
-        df['bitscore'] = (self.LAMBDA * df['score'] - np.log(self.K)) / np.log(2)
-        # Sometimes lastal spits out e-values less than zero...
-        df.loc[df['E'] < 0, 'E'] = float_info.tiny
-        return df
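MafParser and its next_or_raise helper move wholesale into the new shmlast/maf.py below; for downstream code the only visible change is the import path. A one-line migration sketch:

    # from shmlast.last import MafParser    # shmlast <= 1.2.x
    from shmlast.maf import MafParser       # shmlast >= 1.3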
diff --git a/shmlast/maf.py b/shmlast/maf.py
new file mode 100644
index 0000000..5e55480
--- /dev/null
+++ b/shmlast/maf.py
@@ -0,0 +1,212 @@
+# Copyright (C) 2015-2018 Camille Scott
+# All rights reserved.
+#
+# This software may be modified and distributed under the terms
+# of the BSD license. See the LICENSE file for details.
+#
+# Note: this code came from dammit (https://github.com/dib-lab/dammit)
+# because Camille was too lazy to spin off a third package for the
+# parsers only.
+#
+
+from itertools import count
+from sys import stderr
+
+import pandas as pd
+import numpy as np
+
+
+class EmptyFile(Exception):
+    pass
+
+
+def warn_empty(msg):
+    '''Warn that a file is empty.'''
+    print('\nWARNING: Empty file: {0}\n'.format(msg), file=stderr)
+
+
+def next_or_raise(fp):
+    '''Get the next line and raise an exception if it's empty.
+    '''
+    counter = count()
+    def func(raise_exc=True):
+        line = fp.readline()
+        n = next(counter)
+        if raise_exc is True and line == '':
+            raise RuntimeError('Malformed file (line {0})'.format(n))
+        return line
+    return func
+
+
+def convert_dtypes(df, dtypes):
+    '''Convert the columns of a DataFrame to the types specified
+    in the given dictionary, inplace.
+
+    Args:
+        df (DataFrame): The DataFrame to convert.
+        dtypes (dict): Dictionary mapping columns to types.
+    '''
+
+    for c in df.columns:
+        try:
+            df[c] = df[c].astype(dtypes[c])
+        except KeyError:
+            pass
+
+
+class BaseParser(object):
+
+    def __init__(self, filename):
+        self.filename = filename
+
+    def raise_empty(self):
+        raise EmptyFile('Empty file: {0}'.format(self.filename))
+
+
+class ChunkParser(BaseParser):
+
+    def __init__(self, filename, chunksize=10000):
+        '''
+        Args:
+            filename (str): Path to the file to parse.
+            chunksize (int): Number of entries to yield per call.
+        '''
+
+        self.chunksize = chunksize
+        super(ChunkParser, self).__init__(filename)
+
+    def __iter__(self):
+        raise NotImplementedError()
+        yield
+
+    def read(self):
+        '''Read the entire file at once and return as a single DataFrame.
+        '''
+        try:
+            return pd.concat(self, ignore_index=True)
+        except (EmptyFile, ValueError) as e:
+            # no objects, return an empty dataframe
+            return self.empty()
+
+    def empty(self):
+        '''Get an empty DataFrame with the appropriate columns.
+        '''
+
+        df = pd.DataFrame(columns=[k for k, _ in self.columns])
+        convert_dtypes(df, dict(self.columns))
+        return df
+
+
+class MafParser(ChunkParser):
+
+    columns = [('E', float),
+               ('EG2', float),
+               ('q_aln_len', int),
+               ('q_len', int),
+               ('q_name', str),
+               ('q_start', int),
+               ('q_strand', str),
+               ('s_aln_len', int),
+               ('s_len', int),
+               ('s_name', str),
+               ('s_start', int),
+               ('s_strand', str),
+               ('score', float),
+               ('bitscore', float)]
+
+    def __init__(self, filename, aln_strings=False, chunksize=10000, **kwargs):
+        self.aln_strings = aln_strings
+        self.LAMBDA = None
+        self.K = None
+        super(MafParser, self).__init__(filename, chunksize=chunksize, **kwargs)
+
+    def __iter__(self):
+        '''Iterator yielding DataFrames of length chunksize holding MAF alignments.
+
+        An extra column is added for bitscore, using the equation described here:
+            http://last.cbrc.jp/doc/last-evalues.html
+
+        Args:
+            fn (str): Path to the MAF alignment file.
+            chunksize (int): Alignments to parse per iteration.
+        Yields:
+            DataFrame: Pandas DataFrame with the alignments.
+        '''
+        data = []
+        n_entries = 0
+        with open(self.filename) as fp:
+            guarded_next = next_or_raise(fp)
+            while (True):
+                line = guarded_next(raise_exc=False)
+                if line == '':
+                    break
+                line = line.strip()
+                if not line:
+                    continue
+                if line.startswith('#'):
+                    if 'lambda' in line:
+                        meta = line.strip(' #').split()
+                        meta = {k:v for k, _, v in map(lambda x: x.partition('='), meta)}
+                        self.LAMBDA = float(meta['lambda'])
+                        self.K = float(meta['K'])
+                    else:
+                        continue
+                if line.startswith('a'):
+                    n_entries += 1
+                    cur_aln = {}
+
+                    # Alignment info
+                    tokens = line.split()
+                    for token in tokens[1:]:
+                        key, _, val = token.strip().partition('=')
+                        cur_aln[key] = float(val)
+
+                    # First sequence info
+                    line = guarded_next()
+                    tokens = line.split()
+                    cur_aln['s_name'] = tokens[1]
+                    cur_aln['s_start'] = int(tokens[2])
+                    cur_aln['s_aln_len'] = int(tokens[3])
+                    cur_aln['s_strand'] = tokens[4]
+                    cur_aln['s_len'] = int(tokens[5])
+                    if self.aln_strings:
+                        cur_aln['s_aln'] = tokens[6]
+
+                    # Second sequence info
+                    line = guarded_next()
+                    tokens = line.split()
+                    cur_aln['q_name'] = tokens[1]
+                    cur_aln['q_start'] = int(tokens[2])
+                    cur_aln['q_aln_len'] = int(tokens[3])
+                    cur_aln['q_strand'] = tokens[4]
+                    cur_aln['q_len'] = int(tokens[5])
+                    if self.aln_strings:
+                        cur_aln['q_aln'] = tokens[6]
+
+                    data.append(cur_aln)
+                    if len(data) >= self.chunksize:
+                        if self.LAMBDA is None:
+                            raise RuntimeError("old version of lastal; please update")
+                        yield self._build_df(data)
+                        data = []
+
+        if n_entries == 0:
+            self.raise_empty()
+        if data:
+            yield self._build_df(data)
+
+    def _build_df(self, data):
+        if not data:
+            self.raise_empty()
+
+        def _fix_sname(name):
+            new, _, _ = name.partition(',')
+            return new
+
+        df = pd.DataFrame(data)
+        df['s_name'] = df['s_name'].apply(_fix_sname)
+        setattr(df, 'LAMBDA', self.LAMBDA)
+        setattr(df, 'K', self.K)
+        df['bitscore'] = (self.LAMBDA * df['score'] - np.log(self.K)) / np.log(2)
+
+        return df
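A hedged usage sketch of the parser above: via ChunkParser.read() and empty(), a MAF file with no alignments now yields an empty DataFrame that still carries the expected, correctly typed columns, rather than raising out of pd.concat. The file name is hypothetical:

    from shmlast.maf import MafParser

    df = MafParser('alignments.maf').read()   # hypothetical path
    if df.empty:
        # columns such as 'E', 'q_name', and 's_name' exist even with no rows
        print('no alignments found:', list(df.columns))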
diff --git a/shmlast/tests/test_besthits.py b/shmlast/tests/test_besthits.py
index 59c9edd..a3fdda7 100644
--- a/shmlast/tests/test_besthits.py
+++ b/shmlast/tests/test_besthits.py
@@ -1,9 +1,10 @@
 import pytest
 import pandas as pd
 
-from shmlast.tests.utils import datadir
+from shmlast.tests.utils import datadir, run_task, run_tasks
 from shmlast.hits import BestHits
 from shmlast.crbl import scale_evalues
+from shmlast.app import CRBL
 
 def check_df_equals(dfA, dfB, col='E'):
@@ -53,6 +54,7 @@ def test_reciprocal_best_hits(datadir):
 
     assert check_df_equals(results_df, expected_df)
 
+
 def test_scale_evalues():
     test_df = pd.DataFrame({'E': [.1, 0.01, 0.0]})
     expected_df = pd.DataFrame({'E_scaled': [1.0, 2.0, 307.652655568]})
@@ -60,3 +62,17 @@ def test_scale_evalues():
 
     assert list(result_df[col_name]) == \
             pytest.approx(list(expected_df['E_scaled']))
+
+
+def test_crbl_tasks(tmpdir, datadir):
+    with tmpdir.as_cwd():
+        input_fa = datadir('pom.single.fa')
+        pep_fa = datadir('odb_subset.fa')
+        results_fn = tmpdir.join('result.csv').strpath
+
+        crbl = CRBL(input_fa,
+                    pep_fa,
+                    results_fn)
+        result = run_tasks([tsk for tsk in crbl.tasks()], ['run'])
+
+        assert result == 0
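The new test_crbl_tasks drives the full CRBL pipeline through doit. A condensed sketch of the same pattern (the FASTA names are the fixtures shipped in shmlast's test data and must be resolvable paths when run outside the fixture machinery):

    from shmlast.app import CRBL
    from shmlast.tests.utils import run_tasks

    crbl = CRBL('pom.single.fa', 'odb_subset.fa', 'result.csv')
    # run_tasks returns the doit exit status; 0 means every task succeeded
    assert run_tasks(list(crbl.tasks()), ['run']) == 0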
diff --git a/shmlast/tests/test_last.py b/shmlast/tests/test_last.py
index 5fdf03c..d359fdc 100644
--- a/shmlast/tests/test_last.py
+++ b/shmlast/tests/test_last.py
@@ -10,7 +10,7 @@
 from shmlast.tests.utils import datadir, run_task, run_tasks, check_status, touch
 from shmlast.last import lastal_task
 from shmlast.last import lastdb_task
-from shmlast.last import MafParser
+from shmlast.maf import MafParser
 
 LASTDB_EXTENSIONS = ['.bck', '.des',
                      '.prj', '.sds', '.ssp', '.suf', '.tis']
diff --git a/shmlast/tests/utils.py b/shmlast/tests/utils.py
index 1016ed6..26a204e 100644
--- a/shmlast/tests/utils.py
+++ b/shmlast/tests/utils.py
@@ -10,7 +10,6 @@
 from distutils import dir_util
 from pytest import fixture
-
 from doit.cmd_base import TaskLoader
 from doit.doit_cmd import DoitMain
 from doit.dependency import Dependency, DbmDB
@@ -26,13 +25,14 @@ def datadir(tmpdir, request):
     '''
     Fixture responsible for locating the test data directory and copying it
    into a temporary directory.
     '''
-    filename = request.module.__file__
-    test_dir = os.path.dirname(filename)
-    data_dir = os.path.join(test_dir, 'data')
-    dir_util.copy_tree(data_dir, str(tmpdir))
+    filename = request.module.__file__
+    test_dir = os.path.dirname(filename)
+    data_dir = os.path.join(test_dir, 'data')
 
     def getter(filename, as_str=True):
         filepath = tmpdir.join(filename)
+        shutil.copyfile(os.path.join(data_dir, filename),
+                        filepath)
         if as_str:
             return str(filepath)
         return filepath
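With the rework above, the datadir fixture copies test files into tmpdir one at a time, on request, instead of mirroring the whole data directory up front (this assumes shutil is imported near the top of utils.py, outside the hunk shown). A hypothetical test using the getter:

    def test_example(tmpdir, datadir):        # hypothetical test
        maf_fn = datadir('example.maf')       # copies only this file into tmpdir
        assert maf_fn.endswith('example.maf')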