From 681e5a689e5fee517c65e208eb377026a15a6647 Mon Sep 17 00:00:00 2001
From: Camille Scott
Date: Wed, 12 Dec 2018 16:50:10 -0800
Subject: [PATCH] Handle empty results properly and bump version.

---
 codemeta.json                  |   2 +-
 requirements.txt               |   2 +-
 setup.py                       |   2 +-
 shmlast/VERSION                |   2 +-
 shmlast/app.py                 |   8 +-
 shmlast/crbl.py                |  23 +++-
 shmlast/hits.py                |   1 +
 shmlast/last.py                | 122 +------------------
 shmlast/maf.py                 | 212 +++++++++++++++++++++++++++++++++
 shmlast/tests/test_besthits.py |  18 ++-
 shmlast/tests/test_last.py     |   2 +-
 shmlast/tests/utils.py         |  10 +-
 12 files changed, 263 insertions(+), 141 deletions(-)
 create mode 100644 shmlast/maf.py

diff --git a/codemeta.json b/codemeta.json
index 54d5c86..414dee1 100644
--- a/codemeta.json
+++ b/codemeta.json
@@ -19,5 +19,5 @@
     "keywords": "bioinformatics, orthology, alignment, LAST, BLAST, python",
     "license": "BSD-3-Clause",
     "title": "shmlast",
-    "version": "v1.1"
+    "version": "v1.3"
 }
diff --git a/requirements.txt b/requirements.txt
index 8eefe04..b672921 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 doit>=0.29.0
-ficus>=0.3.2
+ficus>=0.5
 matplotlib>=1.5.1
 numpy>=1.9.0
 pandas>=0.17.0
diff --git a/setup.py b/setup.py
index f6c86d2..44e7281 100755
--- a/setup.py
+++ b/setup.py
@@ -36,7 +36,7 @@ def main():
         tests_require = ['pytest', 'codecov'],
         install_requires = ['doit>=0.29.0',
-                            'ficus>=0.3.2',
+                            'ficus>=0.5',
                             'matplotlib>=1.5.1',
                             'numpy>=1.9.0',
                             'pandas>=0.17.0',
diff --git a/shmlast/VERSION b/shmlast/VERSION
index 6085e94..7e32cd5 100644
--- a/shmlast/VERSION
+++ b/shmlast/VERSION
@@ -1 +1 @@
-1.2.1
+1.3
diff --git a/shmlast/app.py b/shmlast/app.py
index b004ee0..5b0d7c1 100644
--- a/shmlast/app.py
+++ b/shmlast/app.py
@@ -9,9 +9,9 @@
 from .crbl import (get_reciprocal_best_last_translated,
                    backmap_names, scale_evalues, fit_crbh_model,
                    filter_hits_from_model, plot_crbh_fit)
-
+from .last import lastdb_task, lastal_task
+from .maf import MafParser
 from .profile import StartProfiler, profile_task
-from .last import lastdb_task, lastal_task, MafParser
 from .translate import translate_task, rename_task
 from .util import ShortenedPythonAction, title, hidden_fn
 from .util import create_doit_task as doit_task
@@ -228,10 +228,10 @@ def do_crbl_fit_and_filter():
                                                       self.db_x_query_fn)
             model_df = fit_crbh_model(rbh_df)
             model_df.to_csv(self.model_fn, index=False)
-            model_df = pd.read_csv(self.model_fn)
+            #model_df = pd.read_csv(self.model_fn)
             filtered_df = filter_hits_from_model(model_df, rbh_df, hits_df)
-            results = pd.concat([rbh_df, filtered_df], axis=0)
+            results = pd.concat([rbh_df, filtered_df], axis=0, sort=True)
 
             results, scaled_col = scale_evalues(results, inplace=True)
             del results['translated_q_name']
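A minimal standalone sketch (not shmlast code) of why sort=True is passed to pd.concat above: rbh_df and filtered_df can carry slightly different column sets, and pandas >= 0.23 warns on such concatenations unless the column-sorting behavior is made explicit. The column names below are illustrative only:

    import pandas as pd

    a = pd.DataFrame({'q_name': ['tr1'], 'E': [1e-30]})
    b = pd.DataFrame({'q_name': ['tr2'], 'E': [1e-10], 'extra': [1]})

    # Without an explicit sort=..., pandas 0.23+ emits a FutureWarning here;
    # sort=True unions the differing columns in sorted, deterministic order.
    merged = pd.concat([a, b], axis=0, sort=True)
    print(list(merged.columns))  # ['E', 'extra', 'q_name']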
diff --git a/shmlast/crbl.py b/shmlast/crbl.py
index cd96baa..e76344e 100644
--- a/shmlast/crbl.py
+++ b/shmlast/crbl.py
@@ -10,7 +10,7 @@
 import seaborn as sns
 
 from .hits import BestHits
-from .last import MafParser
+from .maf import MafParser
 
 float_info = np.finfo(float)
@@ -27,14 +27,21 @@ def get_reciprocal_best_last_translated(query_maf, database_maf):
     '''
     bh = BestHits(comparison_cols=['E', 'EG2'])
 
     qvd_df = MafParser(query_maf).read()
-    qvd_df[['qg_name', 'q_frame']] = qvd_df.q_name.str.partition('_')[[0,2]]
+    try:
+        qvd_df[['qg_name', 'q_frame']] = qvd_df.q_name.str.partition('_')[[0,2]]
+    except KeyError:
+        # could have been empty
+        qvd_df = qvd_df.assign(qg_name=[], q_frame=[])
     qvd_df.rename(columns={'q_name': 'translated_q_name',
                            'qg_name': 'q_name'},
                   inplace=True)
     qvd_df['ID'] = qvd_df.index
 
     dvq_df = MafParser(database_maf).read()
-    dvq_df[['sg_name', 'frame']] = dvq_df.s_name.str.partition('_')[[0,2]]
+    try:
+        dvq_df[['sg_name', 'frame']] = dvq_df.s_name.str.partition('_')[[0,2]]
+    except KeyError:
+        dvq_df = dvq_df.assign(sg_name=[], frame=[])
 
     dvq_df.rename(columns={'s_name': 'translated_s_name',
                            'sg_name': 's_name'},
                   inplace=True)
@@ -111,8 +118,9 @@ def fit_crbh_model(rbh_df, length_col='s_aln_len', feature_col='E'):
     _, feature_col = scale_evalues(data, name=feature_col, inplace=True)
 
     # create a DataFrame for the model, staring with the alignment lengths
-    fit = pd.DataFrame(np.arange(10, data['length'].max()),
-                       columns=['center'], dtype=int)
+    stop = data['length'].max()
+    fit = pd.DataFrame(np.arange(10, stop if not np.isnan(stop) else 11),
+                       columns=['center'], dtype=np.int64)
 
     # create the bins
     fit['size'] = fit['center'] * 0.1
@@ -125,6 +133,7 @@ def fit_crbh_model(rbh_df, length_col='s_aln_len', feature_col='E'):
     def bin_mean(fit_row, df):
         hits = df[(df['length'] >= fit_row.left) & (df['length'] <= fit_row.right)]
         return hits[feature_col].mean()
+
     fit['fit'] = fit.apply(bin_mean, args=(data,), axis=1)
 
     model_df = fit.dropna()
@@ -150,6 +159,7 @@ def filter_hits_from_model(model_df, rbh_df, hits_df, feature_col='E',
     rbh_df, scaled_feature_col = scale_evalues(rbh_df, name=feature_col,
                                                inplace=False)
 
     # Merge the model into the subset of the hits which aren't in RBH
+    print(hits_df[length_col].dtype, model_df['center'].dtype)
     comp_df = pd.merge(hits_df[hits_df[id_col].isin(rbh_df[id_col]) == False],
                        model_df, left_on=length_col, right_on='center')
@@ -173,6 +183,9 @@ def plot_crbh_fit(model_df, hits_df, model_plot_fn, show=False,
     with FigureManager(model_plot_fn, show=show,
                        figsize=figsize, **fig_kwds) as (fig, ax):
 
+        if not len(hits_df):
+            return
+
         sample_size = min(len(hits_df), 5000)
         hits_df, scaled_col = scale_evalues(hits_df, name=feature_col,
                                             inplace=False)
diff --git a/shmlast/hits.py b/shmlast/hits.py
index 4c753b1..5b52e9a 100644
--- a/shmlast/hits.py
+++ b/shmlast/hits.py
@@ -3,6 +3,7 @@
 
 import pandas as pd
 
+
 class BestHits(object):
 
     def __init__(self, comparison_cols=['E'], query_name_col='q_name',
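The try/except KeyError added above handles MAF files with no alignments. A standalone sketch of the failure mode it guards against, assuming the pandas behavior current when this patch was written: on a frame with zero rows, .str.partition() yields none of the usual 0/1/2 result columns, so selecting [[0, 2]] raises KeyError:

    import pandas as pd

    qvd_df = pd.DataFrame({'q_name': pd.Series([], dtype=str)})
    try:
        qvd_df[['qg_name', 'q_frame']] = qvd_df.q_name.str.partition('_')[[0, 2]]
    except KeyError:
        # same fallback as the patch: the columns exist, with zero rows
        qvd_df = qvd_df.assign(qg_name=[], q_frame=[])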
diff --git a/shmlast/last.py b/shmlast/last.py
index e4fa97c..f43fa33 100644
--- a/shmlast/last.py
+++ b/shmlast/last.py
@@ -8,6 +8,7 @@
 import os
 import pandas as pd
 
+from .maf import MafParser
 from .profile import profile_task
 from .util import create_doit_task as doit_task
 from .util import which, parallel_fasta, title
@@ -113,124 +114,3 @@ def lastal_task(query, db, out_fn, translate=False,
             'targets': [out_fn],
             'file_dep': [query, db + '.prj'],
             'clean': [clean_targets]}
-
-
-def next_or_raise(fp):
-    counter = count()
-    def func(raise_exc=True):
-        line = fp.readline()
-        n = next(counter)
-        if raise_exc is True and line == '':
-            raise RuntimeError('Malformed MAF file (line {0})'.format(n))
-        return line
-    return func
-
-
-class MafParser(object):
-
-    def __init__(self, filename, aln_strings=False, chunksize=10000, **kwargs):
-        '''Parser for LAST's MAF output.
-
-        Args:
-            filename (str): The MAF file.
-            aln_strings (bool): If True, parse the alignment strings as well.
-            chunksize (int): Size of chunks to parse.
-        '''
-        self.chunksize = chunksize
-        self.filename = filename
-        self.aln_strings = aln_strings
-        self.LAMBDA = None
-        self.K = None
-
-    def read(self):
-        '''Read the entire file at once and return as a single DataFrame.
-        '''
-        return pd.concat(iter(self), ignore_index=True)
-
-    def __iter__(self):
-        '''Iterator yielding DataFrames of length chunksize holding MAF alignments.
-
-        An extra column is added for bitscore, using the equation described here:
-            http://last.cbrc.jp/doc/last-evalues.html
-
-        Args:
-            fn (str): Path to the MAF alignment file.
-            chunksize (int): Alignments to parse per iteration.
-        Yields:
-            DataFrame: Pandas DataFrame with the alignments.
-        '''
-        data = []
-        with open(self.filename) as fp:
-            guarded_next = next_or_raise(fp)
-            n = 0
-            while (True):
-                line = guarded_next(raise_exc=False)
-                if line == '':
-                    break
-                line = line.strip()
-                if not line:
-                    continue
-                if line.startswith('#'):
-                    if 'lambda' in line:
-                        meta = line.strip(' #').split()
-                        meta = {k:v for k, _, v in map(lambda x: x.partition('='), meta)}
-                        self.LAMBDA = float(meta['lambda'])
-                        self.K = float(meta['K'])
-                    else:
-                        continue
-                if line.startswith('a'):
-                    cur_aln = {}
-
-                    # Alignment info
-                    tokens = line.split()
-                    for token in tokens[1:]:
-                        key, _, val = token.strip().partition('=')
-                        cur_aln[key] = float(val)
-
-                    # First sequence info
-                    line = guarded_next()
-                    tokens = line.split()
-                    cur_aln['s_name'] = tokens[1]
-                    cur_aln['s_start'] = int(tokens[2])
-                    cur_aln['s_aln_len'] = int(tokens[3])
-                    cur_aln['s_strand'] = tokens[4]
-                    cur_aln['s_len'] = int(tokens[5])
-                    if self.aln_strings:
-                        cur_aln['s_aln'] = tokens[6]
-
-                    # First sequence info
-                    line = guarded_next()
-                    tokens = line.split()
-                    cur_aln['q_name'] = tokens[1]
-                    cur_aln['q_start'] = int(tokens[2])
-                    cur_aln['q_aln_len'] = int(tokens[3])
-                    cur_aln['q_strand'] = tokens[4]
-                    cur_aln['q_len'] = int(tokens[5])
-                    if self.aln_strings:
-                        cur_aln['q_aln'] = tokens[6]
-
-                    data.append(cur_aln)
-                    if len(data) >= self.chunksize:
-                        if self.LAMBDA is None:
-                            raise RuntimeError("old version of lastal; please update")
-                        yield self._build_df(data)
-                        data = []
-
-        if data:
-            yield self._build_df(data)
-
-    def _build_df(self, data):
-
-        def _fix_sname(name):
-            new, _, _ = name.partition(',')
-            return new
-
-        df = pd.DataFrame(data)
-        df['s_name'] = df['s_name'].apply(_fix_sname)
-        setattr(df, 'LAMBDA', self.LAMBDA)
-        setattr(df, 'K', self.K)
-        df['bitscore'] = (self.LAMBDA * df['score'] - np.log(self.K)) / np.log(2)
-        # Sometimes lastal spits out e-values less than zero...
-        df.loc[df['E'] < 0, 'E'] = float_info.tiny
-        return df
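MafParser and its next_or_raise helper move wholesale into the new shmlast/maf.py below; for downstream code the only visible change is the import path. A one-line migration sketch:

    # from shmlast.last import MafParser    # shmlast <= 1.2.x
    from shmlast.maf import MafParser       # shmlast >= 1.3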
diff --git a/shmlast/maf.py b/shmlast/maf.py
new file mode 100644
index 0000000..5e55480
--- /dev/null
+++ b/shmlast/maf.py
@@ -0,0 +1,212 @@
+# Copyright (C) 2015-2018 Camille Scott
+# All rights reserved.
+#
+# This software may be modified and distributed under the terms
+# of the BSD license. See the LICENSE file for details.
+#
+# Note: this code came from dammit (https://github.com/dib-lab/dammit)
+# because Camille was too lazy to spin off a third package for the
+# parsers only.
+#
+
+from itertools import count
+from sys import stderr
+
+import pandas as pd
+import numpy as np
+
+
+class EmptyFile(Exception):
+    pass
+
+
+def warn_empty(msg):
+    '''Warn that a file is empty.'''
+    print('\nWARNING: Empty file: {0}\n'.format(msg), file=stderr)
+
+
+def next_or_raise(fp):
+    '''Get the next line and raise an exception if it's empty.
+    '''
+    counter = count()
+    def func(raise_exc=True):
+        line = fp.readline()
+        n = next(counter)
+        if raise_exc is True and line == '':
+            raise RuntimeError('Malformed file (line {0})'.format(n))
+        return line
+    return func
+
+
+def convert_dtypes(df, dtypes):
+    '''Convert the columns of a DataFrame to the types specified
+    in the given dictionary, inplace.
+
+    Args:
+        df (DataFrame): The DataFrame to convert.
+        dtypes (dict): Dictionary mapping columns to types.
+    '''
+
+    for c in df.columns:
+        try:
+            df[c] = df[c].astype(dtypes[c])
+        except KeyError:
+            pass
+
+
+class BaseParser(object):
+
+    def __init__(self, filename):
+        self.filename = filename
+
+    def raise_empty(self):
+        raise EmptyFile('Empty file: {0}'.format(self.filename))
+
+
+class ChunkParser(BaseParser):
+
+    def __init__(self, filename, chunksize=10000):
+        '''
+        Args:
+            filename (str): Path to the file to parse.
+            chunksize (int): Number of entries to yield per call.
+        '''
+
+        self.chunksize = chunksize
+        super(ChunkParser, self).__init__(filename)
+
+    def __iter__(self):
+        raise NotImplementedError()
+        yield
+
+    def read(self):
+        '''Read the entire file at once and return as a single DataFrame.
+        '''
+        try:
+            return pd.concat(self, ignore_index=True)
+        except (EmptyFile, ValueError) as e:
+            # no objects, return an empty dataframe
+            return self.empty()
+
+    def empty(self):
+        '''Get an empty DataFrame with the appropriate columns.
+        '''
+
+        df = pd.DataFrame(columns=[k for k, _ in self.columns])
+        convert_dtypes(df, dict(self.columns))
+        return df
+
+
+class MafParser(ChunkParser):
+
+    columns = [('E', float),
+               ('EG2', float),
+               ('q_aln_len', int),
+               ('q_len', int),
+               ('q_name', str),
+               ('q_start', int),
+               ('q_strand', str),
+               ('s_aln_len', int),
+               ('s_len', int),
+               ('s_name', str),
+               ('s_start', int),
+               ('s_strand', str),
+               ('score', float),
+               ('bitscore', float)]
+
+    def __init__(self, filename, aln_strings=False, chunksize=10000, **kwargs):
+        self.aln_strings = aln_strings
+        self.LAMBDA = None
+        self.K = None
+        super(MafParser, self).__init__(filename, chunksize=chunksize, **kwargs)
+
+    def __iter__(self):
+        '''Iterator yielding DataFrames of length chunksize holding MAF alignments.
+
+        An extra column is added for bitscore, using the equation described here:
+            http://last.cbrc.jp/doc/last-evalues.html
+
+        Args:
+            fn (str): Path to the MAF alignment file.
+            chunksize (int): Alignments to parse per iteration.
+        Yields:
+            DataFrame: Pandas DataFrame with the alignments.
+        '''
+        data = []
+        n_entries = 0
+        with open(self.filename) as fp:
+            guarded_next = next_or_raise(fp)
+            while (True):
+                line = guarded_next(raise_exc=False)
+                if line == '':
+                    break
+                line = line.strip()
+                if not line:
+                    continue
+                if line.startswith('#'):
+                    if 'lambda' in line:
+                        meta = line.strip(' #').split()
+                        meta = {k:v for k, _, v in map(lambda x: x.partition('='), meta)}
+                        self.LAMBDA = float(meta['lambda'])
+                        self.K = float(meta['K'])
+                    else:
+                        continue
+                if line.startswith('a'):
+                    n_entries += 1
+                    cur_aln = {}
+
+                    # Alignment info
+                    tokens = line.split()
+                    for token in tokens[1:]:
+                        key, _, val = token.strip().partition('=')
+                        cur_aln[key] = float(val)
+
+                    # First sequence info
+                    line = guarded_next()
+                    tokens = line.split()
+                    cur_aln['s_name'] = tokens[1]
+                    cur_aln['s_start'] = int(tokens[2])
+                    cur_aln['s_aln_len'] = int(tokens[3])
+                    cur_aln['s_strand'] = tokens[4]
+                    cur_aln['s_len'] = int(tokens[5])
+                    if self.aln_strings:
+                        cur_aln['s_aln'] = tokens[6]
+
+                    # Second sequence info
+                    line = guarded_next()
+                    tokens = line.split()
+                    cur_aln['q_name'] = tokens[1]
+                    cur_aln['q_start'] = int(tokens[2])
+                    cur_aln['q_aln_len'] = int(tokens[3])
+                    cur_aln['q_strand'] = tokens[4]
+                    cur_aln['q_len'] = int(tokens[5])
+                    if self.aln_strings:
+                        cur_aln['q_aln'] = tokens[6]
+
+                    data.append(cur_aln)
+                    if len(data) >= self.chunksize:
+                        if self.LAMBDA is None:
+                            raise RuntimeError("old version of lastal; please update")
+                        yield self._build_df(data)
+                        data = []
+
+        if n_entries == 0:
+            self.raise_empty()
+        if data:
+            yield self._build_df(data)
+
+    def _build_df(self, data):
+        if not data:
+            self.raise_empty()
+
+        def _fix_sname(name):
+            new, _, _ = name.partition(',')
+            return new
+
+        df = pd.DataFrame(data)
+        df['s_name'] = df['s_name'].apply(_fix_sname)
+        setattr(df, 'LAMBDA', self.LAMBDA)
+        setattr(df, 'K', self.K)
+        df['bitscore'] = (self.LAMBDA * df['score'] - np.log(self.K)) / np.log(2)
+
+        return df
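A hedged usage sketch of the parser above: via ChunkParser.read() and empty(), a MAF file with no alignments now yields an empty DataFrame that still carries the expected, correctly typed columns, rather than raising out of pd.concat. The file name is hypothetical:

    from shmlast.maf import MafParser

    df = MafParser('alignments.maf').read()   # hypothetical path
    if df.empty:
        # columns such as 'E', 'q_name', and 's_name' exist even with no rows
        print('no alignments found:', list(df.columns))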
diff --git a/shmlast/tests/test_besthits.py b/shmlast/tests/test_besthits.py
index 59c9edd..a3fdda7 100644
--- a/shmlast/tests/test_besthits.py
+++ b/shmlast/tests/test_besthits.py
@@ -1,9 +1,10 @@
 import pytest
 import pandas as pd
 
-from shmlast.tests.utils import datadir
+from shmlast.tests.utils import datadir, run_task, run_tasks
 from shmlast.hits import BestHits
 from shmlast.crbl import scale_evalues
+from shmlast.app import CRBL
 
 def check_df_equals(dfA, dfB, col='E'):
@@ -53,6 +54,7 @@ def test_reciprocal_best_hits(datadir):
 
     assert check_df_equals(results_df, expected_df)
 
+
 def test_scale_evalues():
     test_df = pd.DataFrame({'E': [.1, 0.01, 0.0]})
     expected_df = pd.DataFrame({'E_scaled': [1.0, 2.0, 307.652655568]})
@@ -60,3 +62,17 @@ def test_scale_evalues():
 
     assert list(result_df[col_name]) == \
             pytest.approx(list(expected_df['E_scaled']))
+
+
+def test_crbl_tasks(tmpdir, datadir):
+    with tmpdir.as_cwd():
+        input_fa = datadir('pom.single.fa')
+        pep_fa = datadir('odb_subset.fa')
+        results_fn = tmpdir.join('result.csv').strpath
+
+        crbl = CRBL(input_fa,
+                    pep_fa,
+                    results_fn)
+        result = run_tasks([tsk for tsk in crbl.tasks()], ['run'])
+
+        assert result == 0
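The new test_crbl_tasks drives the full CRBL pipeline through doit. A condensed sketch of the same pattern (the FASTA names are the fixtures shipped in shmlast's test data and must be resolvable paths when run outside the fixture machinery):

    from shmlast.app import CRBL
    from shmlast.tests.utils import run_tasks

    crbl = CRBL('pom.single.fa', 'odb_subset.fa', 'result.csv')
    # run_tasks returns the doit exit status; 0 means every task succeeded
    assert run_tasks(list(crbl.tasks()), ['run']) == 0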
diff --git a/shmlast/tests/test_last.py b/shmlast/tests/test_last.py
index 5fdf03c..d359fdc 100644
--- a/shmlast/tests/test_last.py
+++ b/shmlast/tests/test_last.py
@@ -10,7 +10,7 @@
 from shmlast.tests.utils import datadir, run_task, run_tasks, check_status, touch
 from shmlast.last import lastal_task
 from shmlast.last import lastdb_task
-from shmlast.last import MafParser
+from shmlast.maf import MafParser
 
 LASTDB_EXTENSIONS = ['.bck', '.des',
                      '.prj', '.sds', '.ssp', '.suf', '.tis']
diff --git a/shmlast/tests/utils.py b/shmlast/tests/utils.py
index 1016ed6..26a204e 100644
--- a/shmlast/tests/utils.py
+++ b/shmlast/tests/utils.py
@@ -10,7 +10,6 @@
 from distutils import dir_util
 from pytest import fixture
-
 from doit.cmd_base import TaskLoader
 from doit.doit_cmd import DoitMain
 from doit.dependency import Dependency, DbmDB
@@ -26,13 +25,14 @@ def datadir(tmpdir, request):
     '''
     Fixture responsible for locating the test data directory and copying it
    into a temporary directory.
     '''
-    filename = request.module.__file__
-    test_dir = os.path.dirname(filename)
-    data_dir = os.path.join(test_dir, 'data')
-    dir_util.copy_tree(data_dir, str(tmpdir))
+    filename = request.module.__file__
+    test_dir = os.path.dirname(filename)
+    data_dir = os.path.join(test_dir, 'data')
 
     def getter(filename, as_str=True):
         filepath = tmpdir.join(filename)
+        shutil.copyfile(os.path.join(data_dir, filename),
+                        filepath)
         if as_str:
             return str(filepath)
         return filepath
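With the rework above, the datadir fixture copies test files into tmpdir one at a time, on request, instead of mirroring the whole data directory up front (this assumes shutil is imported near the top of utils.py, outside the hunk shown). A hypothetical test using the getter:

    def test_example(tmpdir, datadir):        # hypothetical test
        maf_fn = datadir('example.maf')       # copies only this file into tmpdir
        assert maf_fn.endswith('example.maf')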