Handle empty results properly and bump version.
camillescott committed Dec 13, 2018
1 parent 03229b8 commit 681e5a6
Showing 12 changed files with 263 additions and 141 deletions.
2 changes: 1 addition & 1 deletion codemeta.json
@@ -19,5 +19,5 @@
"keywords": "bioinformatics, orthology, alignment, LAST, BLAST, python",
"license": "BSD-3-Clause",
"title": "shmlast",
"version": "v1.1"
"version": "v1.3"
}
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,5 +1,5 @@
doit>=0.29.0
ficus>=0.3.2
ficus>=0.5
matplotlib>=1.5.1
numpy>=1.9.0
pandas>=0.17.0
2 changes: 1 addition & 1 deletion setup.py
@@ -36,7 +36,7 @@ def main():
tests_require = ['pytest',
'codecov'],
install_requires = ['doit>=0.29.0',
'ficus>=0.3.2',
'ficus>=0.5',
'matplotlib>=1.5.1',
'numpy>=1.9.0',
'pandas>=0.17.0',
2 changes: 1 addition & 1 deletion shmlast/VERSION
@@ -1 +1 @@
1.2.1
1.3
8 changes: 4 additions & 4 deletions shmlast/app.py
@@ -9,9 +9,9 @@
from .crbl import (get_reciprocal_best_last_translated, backmap_names,
scale_evalues, fit_crbh_model, filter_hits_from_model,
plot_crbh_fit)

from .last import lastdb_task, lastal_task
from .maf import MafParser
from .profile import StartProfiler, profile_task
from .last import lastdb_task, lastal_task, MafParser
from .translate import translate_task, rename_task
from .util import ShortenedPythonAction, title, hidden_fn
from .util import create_doit_task as doit_task
@@ -228,10 +228,10 @@ def do_crbl_fit_and_filter():
self.db_x_query_fn)
model_df = fit_crbh_model(rbh_df)
model_df.to_csv(self.model_fn, index=False)
model_df = pd.read_csv(self.model_fn)
#model_df = pd.read_csv(self.model_fn)

filtered_df = filter_hits_from_model(model_df, rbh_df, hits_df)
results = pd.concat([rbh_df, filtered_df], axis=0)
results = pd.concat([rbh_df, filtered_df], axis=0, sort=True)
results, scaled_col = scale_evalues(results, inplace=True)
del results['translated_q_name']

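The sort=True added to the pd.concat call above is worth a note: rbh_df and the model-filtered hits do not necessarily share an identical column set, and pandas >= 0.23 either warns or reorders columns when concatenating misaligned frames. A minimal sketch with made-up column names:

import pandas as pd

# Frames with overlapping but not identical columns, roughly as rbh_df and
# filtered_df can be after model filtering (column names are illustrative).
rbh_df = pd.DataFrame({'q_name': ['tr1'], 'E': [1e-30]})
filtered_df = pd.DataFrame({'q_name': ['tr2'], 'E': [1e-10], 'center': [55]})

# sort=True gives a deterministic, sorted union of columns and silences the
# FutureWarning pandas emits for unaligned concatenation.
results = pd.concat([rbh_df, filtered_df], axis=0, sort=True)
print(results.columns.tolist())  # ['E', 'center', 'q_name']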
23 changes: 18 additions & 5 deletions shmlast/crbl.py
@@ -10,7 +10,7 @@
import seaborn as sns

from .hits import BestHits
from .last import MafParser
from .maf import MafParser

float_info = np.finfo(float)

@@ -27,14 +27,21 @@ def get_reciprocal_best_last_translated(query_maf, database_maf):
'''
bh = BestHits(comparison_cols=['E', 'EG2'])
qvd_df = MafParser(query_maf).read()
qvd_df[['qg_name', 'q_frame']] = qvd_df.q_name.str.partition('_')[[0,2]]
try:
qvd_df[['qg_name', 'q_frame']] = qvd_df.q_name.str.partition('_')[[0,2]]
except KeyError:
# could have been empty
qvd_df = qvd_df.assign(qg_name=[], q_frame=[])
qvd_df.rename(columns={'q_name': 'translated_q_name',
'qg_name': 'q_name'},
inplace=True)
qvd_df['ID'] = qvd_df.index

dvq_df = MafParser(database_maf).read()
dvq_df[['sg_name', 'frame']] = dvq_df.s_name.str.partition('_')[[0,2]]
try:
dvq_df[['sg_name', 'frame']] = dvq_df.s_name.str.partition('_')[[0,2]]
except KeyError:
dvq_df = dvq_df.assign(sg_name=[], frame=[])
dvq_df.rename(columns={'s_name': 'translated_s_name',
'sg_name': 's_name'},
inplace=True)
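The new try/except blocks above guard the case where lastal produced no alignments at all. A minimal sketch of the failure mode, using a hypothetical empty result frame (whether the KeyError actually fires depends on the pandas version, which is exactly why the fallback exists):

import pandas as pd

# Empty LAST result: the q_name column exists but holds no rows.
qvd_df = pd.DataFrame({'q_name': pd.Series([], dtype=object)})

try:
    # On an empty column, str.partition may not return the usual
    # 0/1/2 columns, so selecting [[0, 2]] can raise KeyError.
    qvd_df[['qg_name', 'q_frame']] = qvd_df.q_name.str.partition('_')[[0, 2]]
except KeyError:
    # Fall back to explicitly empty name/frame columns, as the fix does.
    qvd_df = qvd_df.assign(qg_name=[], q_frame=[])

# Either way, the frame ends up with the columns downstream code expects.
print(qvd_df.columns.tolist())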
@@ -111,8 +118,9 @@ def fit_crbh_model(rbh_df, length_col='s_aln_len', feature_col='E'):
_, feature_col = scale_evalues(data, name=feature_col, inplace=True)

# create a DataFrame for the model, starting with the alignment lengths
fit = pd.DataFrame(np.arange(10, data['length'].max()),
columns=['center'], dtype=int)
stop = data['length'].max()
fit = pd.DataFrame(np.arange(10, stop if not np.isnan(stop) else 11),
columns=['center'], dtype=np.int64)

# create the bins
fit['size'] = fit['center'] * 0.1
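The np.isnan check added above covers the same empty-results case inside fit_crbh_model: with no reciprocal best hits, data['length'].max() is NaN and cannot be used as a stop value for np.arange. A small sketch of the fallback behaviour, assuming an empty length column:

import numpy as np
import pandas as pd

# No hits at all: max() of an empty column is NaN.
data = pd.DataFrame({'length': pd.Series([], dtype=float)})
stop = data['length'].max()

# Fall back to a stop of 11, which yields a single bin center at 10
# instead of handing NaN to np.arange.
centers = np.arange(10, stop if not np.isnan(stop) else 11)
print(centers)  # [10]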
@@ -125,6 +133,7 @@ def fit_crbh_model(rbh_df, length_col='s_aln_len', feature_col='E'):
def bin_mean(fit_row, df):
hits = df[(df['length'] >= fit_row.left) & (df['length'] <= fit_row.right)]
return hits[feature_col].mean()

fit['fit'] = fit.apply(bin_mean, args=(data,), axis=1)
model_df = fit.dropna()

@@ -150,6 +159,7 @@ def filter_hits_from_model(model_df, rbh_df, hits_df, feature_col='E',
rbh_df, scaled_feature_col = scale_evalues(rbh_df, name=feature_col, inplace=False)

# Merge the model into the subset of the hits which aren't in RBH
print(hits_df[length_col].dtype, model_df['center'].dtype)
comp_df = pd.merge(hits_df[hits_df[id_col].isin(rbh_df[id_col]) == False],
model_df, left_on=length_col, right_on='center')

@@ -173,6 +183,9 @@ def plot_crbh_fit(model_df, hits_df, model_plot_fn, show=False,
with FigureManager(model_plot_fn, show=show,
figsize=figsize, **fig_kwds) as (fig, ax):

if not len(hits_df):
return

sample_size = min(len(hits_df), 5000)
hits_df, scaled_col = scale_evalues(hits_df, name=feature_col,
inplace=False)
1 change: 1 addition & 0 deletions shmlast/hits.py
@@ -3,6 +3,7 @@

import pandas as pd


class BestHits(object):

def __init__(self, comparison_cols=['E'], query_name_col='q_name',
122 changes: 1 addition & 121 deletions shmlast/last.py
@@ -8,6 +8,7 @@
import os
import pandas as pd

from .maf import MafParser
from .profile import profile_task
from .util import create_doit_task as doit_task
from .util import which, parallel_fasta, title
@@ -113,124 +114,3 @@ def lastal_task(query, db, out_fn, translate=False,
'targets': [out_fn],
'file_dep': [query, db + '.prj'],
'clean': [clean_targets]}


def next_or_raise(fp):
counter = count()
def func(raise_exc=True):
line = fp.readline()
n = next(counter)
if raise_exc is True and line == '':
raise RuntimeError('Malformed MAF file (line {0})'.format(n))
return line
return func


class MafParser(object):

def __init__(self, filename, aln_strings=False, chunksize=10000, **kwargs):
'''Parser for LAST's MAF output.
Args:
filename (str): The MAF file.
aln_strings (bool): If True, parse the alignment strings as well.
chunksize (int): Size of chunks to parse.
'''
self.chunksize = chunksize
self.filename = filename
self.aln_strings = aln_strings
self.LAMBDA = None
self.K = None

def read(self):
'''Read the entire file at once and return as a single DataFrame.
'''
return pd.concat(iter(self), ignore_index=True)

def __iter__(self):
'''Iterator yielding DataFrames of length chunksize holding MAF alignments.
An extra column is added for bitscore, using the equation described here:
http://last.cbrc.jp/doc/last-evalues.html
Args:
fn (str): Path to the MAF alignment file.
chunksize (int): Alignments to parse per iteration.
Yields:
DataFrame: Pandas DataFrame with the alignments.
'''
data = []
with open(self.filename) as fp:
guarded_next = next_or_raise(fp)
n = 0
while (True):
line = guarded_next(raise_exc=False)
if line == '':
break
line = line.strip()
if not line:
continue
if line.startswith('#'):
if 'lambda' in line:
meta = line.strip(' #').split()
meta = {k:v for k, _, v in map(lambda x: x.partition('='), meta)}
self.LAMBDA = float(meta['lambda'])
self.K = float(meta['K'])
else:
continue
if line.startswith('a'):
cur_aln = {}

# Alignment info
tokens = line.split()
for token in tokens[1:]:
key, _, val = token.strip().partition('=')
cur_aln[key] = float(val)

# First sequence info
line = guarded_next()
tokens = line.split()
cur_aln['s_name'] = tokens[1]
cur_aln['s_start'] = int(tokens[2])
cur_aln['s_aln_len'] = int(tokens[3])
cur_aln['s_strand'] = tokens[4]
cur_aln['s_len'] = int(tokens[5])
if self.aln_strings:
cur_aln['s_aln'] = tokens[6]

# Second sequence info
line = guarded_next()
tokens = line.split()
cur_aln['q_name'] = tokens[1]
cur_aln['q_start'] = int(tokens[2])
cur_aln['q_aln_len'] = int(tokens[3])
cur_aln['q_strand'] = tokens[4]
cur_aln['q_len'] = int(tokens[5])
if self.aln_strings:
cur_aln['q_aln'] = tokens[6]

data.append(cur_aln)
if len(data) >= self.chunksize:
if self.LAMBDA is None:
raise RuntimeError("old version of lastal; please update")
yield self._build_df(data)
data = []

if data:
yield self._build_df(data)

def _build_df(self, data):

def _fix_sname(name):
new, _, _ = name.partition(',')
return new

df = pd.DataFrame(data)
df['s_name'] = df['s_name'].apply(_fix_sname)
setattr(df, 'LAMBDA', self.LAMBDA)
setattr(df, 'K', self.K)
df['bitscore'] = (self.LAMBDA * df['score'] - np.log(self.K)) / np.log(2)
# Sometimes lastal spits out e-values less than zero...
df.loc[df['E'] < 0, 'E'] = float_info.tiny
return df
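
For reference, the MafParser removed from last.py here is evidently relocated to shmlast/maf.py (the new "from .maf import MafParser" lines above point there). It reads lastal's lambda/K header values and converts each raw alignment score to a bit score with the formula from http://last.cbrc.jp/doc/last-evalues.html, bitscore = (lambda * score - ln K) / ln 2, as in _build_df above. A worked sketch with made-up header values:

import numpy as np

# Hypothetical values of the kind lastal reports in its '# lambda=... K=...'
# header, plus a raw score from an 'a score=...' alignment line.
LAMBDA, K = 0.275, 0.065
score = 200.0

bitscore = (LAMBDA * score - np.log(K)) / np.log(2)
print(round(bitscore, 1))  # about 83.3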
