Handle empty results properly and bump version.
camillescott committed Dec 13, 2018
1 parent 03229b8 commit 681e5a6
Showing 12 changed files with 263 additions and 141 deletions.
2 changes: 1 addition & 1 deletion codemeta.json
@@ -19,5 +19,5 @@
"keywords": "bioinformatics, orthology, alignment, LAST, BLAST, python",
"license": "BSD-3-Clause",
"title": "shmlast",
"version": "v1.1"
"version": "v1.3"
}
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,5 +1,5 @@
doit>=0.29.0
ficus>=0.3.2
ficus>=0.5
matplotlib>=1.5.1
numpy>=1.9.0
pandas>=0.17.0
2 changes: 1 addition & 1 deletion setup.py
@@ -36,7 +36,7 @@ def main():
tests_require = ['pytest',
'codecov'],
install_requires = ['doit>=0.29.0',
'ficus>=0.3.2',
'ficus>=0.5',
'matplotlib>=1.5.1',
'numpy>=1.9.0',
'pandas>=0.17.0',
2 changes: 1 addition & 1 deletion shmlast/VERSION
@@ -1 +1 @@
1.2.1
1.3
8 changes: 4 additions & 4 deletions shmlast/app.py
@@ -9,9 +9,9 @@
from .crbl import (get_reciprocal_best_last_translated, backmap_names,
scale_evalues, fit_crbh_model, filter_hits_from_model,
plot_crbh_fit)

from .last import lastdb_task, lastal_task
from .maf import MafParser
from .profile import StartProfiler, profile_task
from .last import lastdb_task, lastal_task, MafParser
from .translate import translate_task, rename_task
from .util import ShortenedPythonAction, title, hidden_fn
from .util import create_doit_task as doit_task
@@ -228,10 +228,10 @@ def do_crbl_fit_and_filter():
self.db_x_query_fn)
model_df = fit_crbh_model(rbh_df)
model_df.to_csv(self.model_fn, index=False)
model_df = pd.read_csv(self.model_fn)
#model_df = pd.read_csv(self.model_fn)

filtered_df = filter_hits_from_model(model_df, rbh_df, hits_df)
results = pd.concat([rbh_df, filtered_df], axis=0)
results = pd.concat([rbh_df, filtered_df], axis=0, sort=True)
results, scaled_col = scale_evalues(results, inplace=True)
del results['translated_q_name']

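The sort=True added to the pd.concat call above is worth a note: rbh_df and the model-filtered hits do not necessarily share an identical column set, and pandas >= 0.23 either warns or reorders columns when concatenating misaligned frames. A minimal sketch with made-up column names:

import pandas as pd

# Frames with overlapping but not identical columns, roughly as rbh_df and
# filtered_df can be after model filtering (column names are illustrative).
rbh_df = pd.DataFrame({'q_name': ['tr1'], 'E': [1e-30]})
filtered_df = pd.DataFrame({'q_name': ['tr2'], 'E': [1e-10], 'center': [55]})

# sort=True gives a deterministic, sorted union of columns and silences the
# FutureWarning pandas emits for unaligned concatenation.
results = pd.concat([rbh_df, filtered_df], axis=0, sort=True)
print(results.columns.tolist())  # ['E', 'center', 'q_name']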
23 changes: 18 additions & 5 deletions shmlast/crbl.py
@@ -10,7 +10,7 @@
import seaborn as sns

from .hits import BestHits
from .last import MafParser
from .maf import MafParser

float_info = np.finfo(float)

@@ -27,14 +27,21 @@ def get_reciprocal_best_last_translated(query_maf, database_maf):
'''
bh = BestHits(comparison_cols=['E', 'EG2'])
qvd_df = MafParser(query_maf).read()
qvd_df[['qg_name', 'q_frame']] = qvd_df.q_name.str.partition('_')[[0,2]]
try:
qvd_df[['qg_name', 'q_frame']] = qvd_df.q_name.str.partition('_')[[0,2]]
except KeyError:
# could have been empty
qvd_df = qvd_df.assign(qg_name=[], q_frame=[])
qvd_df.rename(columns={'q_name': 'translated_q_name',
'qg_name': 'q_name'},
inplace=True)
qvd_df['ID'] = qvd_df.index

dvq_df = MafParser(database_maf).read()
dvq_df[['sg_name', 'frame']] = dvq_df.s_name.str.partition('_')[[0,2]]
try:
dvq_df[['sg_name', 'frame']] = dvq_df.s_name.str.partition('_')[[0,2]]
except KeyError:
dvq_df = dvq_df.assign(sg_name=[], frame=[])
dvq_df.rename(columns={'s_name': 'translated_s_name',
'sg_name': 's_name'},
inplace=True)
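The new try/except blocks above guard the case where lastal produced no alignments at all. A minimal sketch of the failure mode, using a hypothetical empty result frame (whether the KeyError actually fires depends on the pandas version, which is exactly why the fallback exists):

import pandas as pd

# Empty LAST result: the q_name column exists but holds no rows.
qvd_df = pd.DataFrame({'q_name': pd.Series([], dtype=object)})

try:
    # On an empty column, str.partition may not return the usual
    # 0/1/2 columns, so selecting [[0, 2]] can raise KeyError.
    qvd_df[['qg_name', 'q_frame']] = qvd_df.q_name.str.partition('_')[[0, 2]]
except KeyError:
    # Fall back to explicitly empty name/frame columns, as the fix does.
    qvd_df = qvd_df.assign(qg_name=[], q_frame=[])

# Either way, the frame ends up with the columns downstream code expects.
print(qvd_df.columns.tolist())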
@@ -111,8 +118,9 @@ def fit_crbh_model(rbh_df, length_col='s_aln_len', feature_col='E'):
_, feature_col = scale_evalues(data, name=feature_col, inplace=True)

# create a DataFrame for the model, starting with the alignment lengths
fit = pd.DataFrame(np.arange(10, data['length'].max()),
columns=['center'], dtype=int)
stop = data['length'].max()
fit = pd.DataFrame(np.arange(10, stop if not np.isnan(stop) else 11),
columns=['center'], dtype=np.int64)

# create the bins
fit['size'] = fit['center'] * 0.1
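The np.isnan check added above covers the same empty-results case inside fit_crbh_model: with no reciprocal best hits, data['length'].max() is NaN and cannot be used as a stop value for np.arange. A small sketch of the fallback behaviour, assuming an empty length column:

import numpy as np
import pandas as pd

# No hits at all: max() of an empty column is NaN.
data = pd.DataFrame({'length': pd.Series([], dtype=float)})
stop = data['length'].max()

# Fall back to a stop of 11, which yields a single bin center at 10
# instead of handing NaN to np.arange.
centers = np.arange(10, stop if not np.isnan(stop) else 11)
print(centers)  # [10]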
@@ -125,6 +133,7 @@ def fit_crbh_model(rbh_df, length_col='s_aln_len', feature_col='E'):
def bin_mean(fit_row, df):
hits = df[(df['length'] >= fit_row.left) & (df['length'] <= fit_row.right)]
return hits[feature_col].mean()

fit['fit'] = fit.apply(bin_mean, args=(data,), axis=1)
model_df = fit.dropna()

@@ -150,6 +159,7 @@ def filter_hits_from_model(model_df, rbh_df, hits_df, feature_col='E',
rbh_df, scaled_feature_col = scale_evalues(rbh_df, name=feature_col, inplace=False)

# Merge the model into the subset of the hits which aren't in RBH
print(hits_df[length_col].dtype, model_df['center'].dtype)
comp_df = pd.merge(hits_df[hits_df[id_col].isin(rbh_df[id_col]) == False],
model_df, left_on=length_col, right_on='center')

@@ -173,6 +183,9 @@ def plot_crbh_fit(model_df, hits_df, model_plot_fn, show=False,
with FigureManager(model_plot_fn, show=show,
figsize=figsize, **fig_kwds) as (fig, ax):

if not len(hits_df):
return

sample_size = min(len(hits_df), 5000)
hits_df, scaled_col = scale_evalues(hits_df, name=feature_col,
inplace=False)
1 change: 1 addition & 0 deletions shmlast/hits.py
@@ -3,6 +3,7 @@

import pandas as pd


class BestHits(object):

def __init__(self, comparison_cols=['E'], query_name_col='q_name',
122 changes: 1 addition & 121 deletions shmlast/last.py
@@ -8,6 +8,7 @@
import os
import pandas as pd

from .maf import MafParser
from .profile import profile_task
from .util import create_doit_task as doit_task
from .util import which, parallel_fasta, title
@@ -113,124 +114,3 @@ def lastal_task(query, db, out_fn, translate=False,
'targets': [out_fn],
'file_dep': [query, db + '.prj'],
'clean': [clean_targets]}


def next_or_raise(fp):
counter = count()
def func(raise_exc=True):
line = fp.readline()
n = next(counter)
if raise_exc is True and line == '':
raise RuntimeError('Malformed MAF file (line {0})'.format(n))
return line
return func


class MafParser(object):

def __init__(self, filename, aln_strings=False, chunksize=10000, **kwargs):
'''Parser for LAST's MAF output.
Args:
filename (str): The MAF file.
aln_strings (bool): If True, parse the alignment strings as well.
chunksize (int): Size of chunks to parse.
'''
self.chunksize = chunksize
self.filename = filename
self.aln_strings = aln_strings
self.LAMBDA = None
self.K = None

def read(self):
'''Read the entire file at once and return as a single DataFrame.
'''
return pd.concat(iter(self), ignore_index=True)

def __iter__(self):
'''Iterator yielding DataFrames of length chunksize holding MAF alignments.
An extra column is added for bitscore, using the equation described here:
http://last.cbrc.jp/doc/last-evalues.html
Args:
fn (str): Path to the MAF alignment file.
chunksize (int): Alignments to parse per iteration.
Yields:
DataFrame: Pandas DataFrame with the alignments.
'''
data = []
with open(self.filename) as fp:
guarded_next = next_or_raise(fp)
n = 0
while (True):
line = guarded_next(raise_exc=False)
if line == '':
break
line = line.strip()
if not line:
continue
if line.startswith('#'):
if 'lambda' in line:
meta = line.strip(' #').split()
meta = {k:v for k, _, v in map(lambda x: x.partition('='), meta)}
self.LAMBDA = float(meta['lambda'])
self.K = float(meta['K'])
else:
continue
if line.startswith('a'):
cur_aln = {}

# Alignment info
tokens = line.split()
for token in tokens[1:]:
key, _, val = token.strip().partition('=')
cur_aln[key] = float(val)

# First sequence info
line = guarded_next()
tokens = line.split()
cur_aln['s_name'] = tokens[1]
cur_aln['s_start'] = int(tokens[2])
cur_aln['s_aln_len'] = int(tokens[3])
cur_aln['s_strand'] = tokens[4]
cur_aln['s_len'] = int(tokens[5])
if self.aln_strings:
cur_aln['s_aln'] = tokens[6]

# Second sequence info
line = guarded_next()
tokens = line.split()
cur_aln['q_name'] = tokens[1]
cur_aln['q_start'] = int(tokens[2])
cur_aln['q_aln_len'] = int(tokens[3])
cur_aln['q_strand'] = tokens[4]
cur_aln['q_len'] = int(tokens[5])
if self.aln_strings:
cur_aln['q_aln'] = tokens[6]

data.append(cur_aln)
if len(data) >= self.chunksize:
if self.LAMBDA is None:
raise RuntimeError("old version of lastal; please update")
yield self._build_df(data)
data = []

if data:
yield self._build_df(data)

def _build_df(self, data):

def _fix_sname(name):
new, _, _ = name.partition(',')
return new

df = pd.DataFrame(data)
df['s_name'] = df['s_name'].apply(_fix_sname)
setattr(df, 'LAMBDA', self.LAMBDA)
setattr(df, 'K', self.K)
df['bitscore'] = (self.LAMBDA * df['score'] - np.log(self.K)) / np.log(2)
# Sometimes lastal spits out e-values less than zero...
df.loc[df['E'] < 0, 'E'] = float_info.tiny
return df
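
For reference, the MafParser removed from last.py here is evidently relocated to shmlast/maf.py (the new "from .maf import MafParser" lines above point there). It reads lastal's lambda/K header values and converts each raw alignment score to a bit score with the formula from http://last.cbrc.jp/doc/last-evalues.html, bitscore = (lambda * score - ln K) / ln 2, as in _build_df above. A worked sketch with made-up header values:

import numpy as np

# Hypothetical values of the kind lastal reports in its '# lambda=... K=...'
# header, plus a raw score from an 'a score=...' alignment line.
LAMBDA, K = 0.275, 0.065
score = 200.0

bitscore = (LAMBDA * score - np.log(K)) / np.log(2)
print(round(bitscore, 1))  # about 83.3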
