Skip to content

Commit

Permalink
Merge branch 'master' of github.com:ged-lab/khmer into fix/spacechecks
Browse files Browse the repository at this point in the history
Conflicts:
	ChangeLog
  • Loading branch information
SensibleSalmon committed Jul 20, 2015
2 parents 9fd52c9 + 5695b9f commit af68d9d
Show file tree
Hide file tree
Showing 4 changed files with 163 additions and 66 deletions.
10 changes: 9 additions & 1 deletion ChangeLog
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
2015-07-18 Jacob Fenton <bocajnotnef@gmail.com>
2015-07-20 Jacob Fenton <bocajnotnef@gmail.com>

* khmer/{kfile,khmer_args}.py: refactored information passing, made it so
space checks happen in the right directory
Expand All @@ -8,6 +8,14 @@
sample-reads-randomly,trim-low-abund}.py,tests/test_script_arguments.py:
changed to use new arg structure

2015-07-20 Titus Brown <titus@idyll.org>

* khmer/__init__.py: cleaned up FP rate reporting.
* scripts/normalize-by-median.py: corrected epilog; refactored reporting
to be a bit cleaner; use CSV for reporting file;
added --report-frequency arg.
* tests/test_normalize_by_median.py: updated/added tests for reporting.

2015-07-17 Jacob Fenton <bocajnotnef@gmail.com>
* oxli/{functions,build_graph}.py,scripts/{load-graph,normalize-by-median,
abundance-dist}.py,tests/test_{normalize_by_median,subset_graph,hashbits,
Expand Down
4 changes: 2 additions & 2 deletions khmer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,8 +172,8 @@ def calc_expected_collisions(hashtable, force=False, max_false_pos=.2):
print("** Do not use these results!!", file=sys.stderr)
print("**", file=sys.stderr)
print("** (estimated false positive rate of %.3f;" % fp_all,
file=sys.stderr)
print("max allowable %.3f" % max_false_pos, file=sys.stderr)
file=sys.stderr, end=' ')
print("max recommended %.3f)" % max_false_pos, file=sys.stderr)
print("**", file=sys.stderr)

if not force:
Expand Down
152 changes: 93 additions & 59 deletions scripts/normalize-by-median.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,43 +38,87 @@
DEFAULT_DESIRED_COVERAGE = 20


def with_diagnostics(ifilename, norm, reader, fp_file):
class WithDiagnostics(object):
"""
Generator/context manager to do boilerplate output of statistics.
uses a Normalizer object.
"""
# per read diagnostic output
for _, record in enumerate(norm(reader)):

if norm.total % 100000 == 0:
print('... kept {kept} of {total} or {perc:2}% so far'
.format(kept=norm.total - norm.discarded,
total=norm.total,
perc=int(100. - norm.discarded /
float(norm.total) * 100.)),
file=sys.stderr)
def __init__(self, norm, report_fp=None, report_frequency=100000):
self.norm = norm
self.report_fp = report_fp
if report_fp:
report_fp.write('total,kept,f_kept\n')

print('... in file ' + ifilename, file=sys.stderr)
self.total = 0
self.kept = 0

self.report_frequency = report_frequency
self.next_report_at = self.report_frequency
self.last_report_at = self.report_frequency

def __call__(self, reader, ifilename):
norm = self.norm
report_fp = self.report_fp

reads_start = self.total
total = self.total
kept = self.kept

try:
for _, is_paired, read0, read1 in reader:
if is_paired:
total += 2
else:
total += 1

# do diginorm
for record in norm(is_paired, read0, read1):
kept += 1
yield record

yield record
# report!
if total >= self.next_report_at:
self.next_report_at += self.report_frequency
self.last_report_at = total

perc_kept = kept / float(total)

print('... kept {kept} of {tot} or {perc_kept:.1%} so far'
.format(kept=kept, tot=total, perc_kept=perc_kept),
file=sys.stderr)
print('... in file ' + ifilename, file=sys.stderr)

if report_fp:
print("{total},{kept},{f_kept:.4}"
.format(total=total, f_kept=perc_kept,
kept=kept),
file=report_fp)
report_fp.flush()
finally:
self.total = total
self.kept = kept

# per file diagnostic output
if total == reads_start:
print('SKIPPED empty file ' + ifilename, file=sys.stderr)
else:
perc_kept = kept / float(total)

# per file diagnostic output
if norm.total == 0:
print('SKIPPED empty file ' + ifilename, file=sys.stderr)
else:
print('DONE with {inp}; kept {kept} of {total} or {perc:2}%'
.format(inp=ifilename, kept=norm.total - norm.discarded,
total=norm.total, perc=int(100. - norm.discarded /
float(norm.total) * 100.)),
file=sys.stderr)
print('DONE with {inp}; kept {kept} of {total} or {perc_kept:.1%}'
.format(inp=ifilename, kept=kept, total=total,
perc_kept=perc_kept),
file=sys.stderr)

if fp_file:
print("{total} {kept} {discarded}"
.format(total=norm.total, kept=norm.total - norm.discarded,
discarded=1. - (norm.discarded / float(norm.total))),
file=fp_file)
fp_file.flush()
# make sure there's at least one report per file, at the end of each
# file.
if report_fp and total != self.last_report_at:
perc_kept = kept / float(total)

print("{total},{kept},{f_kept:.4}"
.format(total=total, f_kept=perc_kept, kept=kept),
file=report_fp)
report_fp.flush()


class Normalizer(object):
Expand All @@ -85,10 +129,7 @@ def __init__(self, desired_coverage, htable):
self.htable = htable
self.desired_coverage = desired_coverage

self.total = 0
self.discarded = 0

def __call__(self, reader):
def __call__(self, is_paired, read0, read1):
"""
Actually does digital normalization - the core algorithm.
Expand All @@ -100,31 +141,23 @@ def __call__(self, reader):
"""
desired_coverage = self.desired_coverage

for _, is_paired, read0, read1 in reader:
passed_filter = False
passed_filter = False

self.total += 1
batch = []
batch.append(read0)
if read1 is not None:
batch.append(read1)

if is_paired:
self.total += 1

batch = []
batch.append(read0)
if read1 is not None:
batch.append(read1)
for record in batch:
seq = record.sequence.replace('N', 'A')
if not self.htable.median_at_least(seq, desired_coverage):
passed_filter = True

if passed_filter:
for record in batch:
seq = record.sequence.replace('N', 'A')
if not self.htable.median_at_least(seq, desired_coverage):
passed_filter = True

if passed_filter:
for record in batch:
seq = record.sequence.replace('N', 'A')
self.htable.consume(seq)
yield record
else:
self.discarded += len(batch)
self.htable.consume(seq)
yield record


@contextmanager
Expand Down Expand Up @@ -166,10 +199,7 @@ def get_parser():
With :option:`-s`/:option:`--savetable`, the k-mer counting table
will be saved to the specified file after all sequences have been
processed. With :option:`-d`, the k-mer counting table will be
saved every d files for multifile runs; if :option:`-s` is set,
the specified name will be used, and if not, the name `backup.ct`
will be used. :option:`-l`/:option:`--loadtable` will load the
processed. :option:`-l`/:option:`--loadtable` will load the
specified k-mer counting table before processing the specified
files. Note that these tables are in the same format as those
produced by :program:`load-into-counting.py` and consumed by
Expand Down Expand Up @@ -220,9 +250,12 @@ def get_parser():
'reads are loaded.')
parser.add_argument('-R', '--report',
metavar='filename', type=argparse.FileType('w'))
parser.add_argument('--report-frequency',
metavar='report_frequency', type=int,
default=100000)
parser.add_argument('-f', '--force', dest='force',
help='continue on next file if read errors are \
encountered', action='store_true')
help='continue past file reading errors',
action='store_true')
parser.add_argument('-o', '--out', metavar="filename",
dest='single_output_file',
type=argparse.FileType('w'),
Expand Down Expand Up @@ -291,6 +324,7 @@ def main(): # pylint: disable=too-many-branches,too-many-statements

# create an object to handle diginorm of all files
norm = Normalizer(args.cutoff, htable)
with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency)

# make a list of all filenames and if they're paired or not;
# if we don't know if they're paired, default to allowing but not
Expand Down Expand Up @@ -331,7 +365,7 @@ def main(): # pylint: disable=too-many-branches,too-many-statements
require_paired=require_paired)

# actually do diginorm
for record in with_diagnostics(filename, norm, reader, report_fp):
for record in with_diagnostics(reader, filename):
if record is not None:
write_record(record, outfp)

Expand Down
63 changes: 59 additions & 4 deletions tests/test_normalize_by_median.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,8 +210,62 @@ def test_normalize_by_median_known_good():
assert False


@attr('huge')
def test_normalize_by_median_report_fp():
# this tests basic reporting of diginorm stats => report.out, including
# a test of aggregate stats for two input files.

infile = utils.get_temp_filename('test.fa')
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
infile2 = utils.get_temp_filename('test2.fa')
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile2)

in_dir = os.path.dirname(infile)
outfile = utils.get_temp_filename('report.out')

script = 'normalize-by-median.py'
args = ['-C', '1', '-k', '17', '-R', outfile, infile, infile2]
(status, out, err) = utils.runscript(script, args, in_dir)

assert os.path.exists(outfile)
report = open(outfile, 'r')
line = report.readline().strip()
assert line == 'total,kept,f_kept', line
line = report.readline().strip()
assert line == '1001,1,0.000999', line
line = report.readline().strip()
assert line == '2002,1,0.0004995', line


def test_normalize_by_median_report_fp_hifreq():
# this tests high-frequency reporting of diginorm stats for a single
# file => report.out.

infile = utils.get_temp_filename('test.fa')
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

in_dir = os.path.dirname(infile)
outfile = utils.get_temp_filename('report.out')

script = 'normalize-by-median.py'
args = ['-C', '1', '-k', '17', '-R', outfile, infile,
'--report-frequency', '100']
(status, out, err) = utils.runscript(script, args, in_dir)

assert os.path.exists(outfile)
report = open(outfile, 'r')
line = report.readline().strip()
assert line == 'total,kept,f_kept', line
line = report.readline().strip()
assert line == '100,1,0.01', line
line = report.readline().strip()
assert line == '200,1,0.005', line


@attr('huge')
def test_normalize_by_median_report_fp_huge():
# this tests reporting of diginorm stats => report.out for a large
# file, with the default reporting interval of once every 100k.

infile = utils.get_temp_filename('test.fa')
in_dir = os.path.dirname(infile)
outfile = utils.get_temp_filename('report.out')
Expand All @@ -224,8 +278,9 @@ def test_normalize_by_median_report_fp():

assert "fp rate estimated to be 0.623" in err, err
report = open(outfile, 'r')
line = report.readline() # skip header
line = report.readline()
assert "100000 25261 0.25261" in line, line
assert "100000,25261,0.2526" in line, line


def test_normalize_by_median_unpaired_and_paired():
Expand Down Expand Up @@ -262,12 +317,12 @@ def test_normalize_by_median_count_kmers_PE():
args = ['-C', CUTOFF, '-k', '17', '--force-single', infile]
(status, out, err) = utils.runscript(script, args, in_dir)
assert 'Total number of unique k-mers: 98' in err, err
assert 'kept 1 of 2 or 50%' in err, err
assert 'kept 1 of 2 or 50.0%' in err, err

args = ['-C', CUTOFF, '-k', '17', '-p', infile]
(status, out, err) = utils.runscript(script, args, in_dir)
assert 'Total number of unique k-mers: 99' in err, err
assert 'kept 2 of 2 or 100%' in err, err
assert 'kept 2 of 2 or 100.0%' in err, err


def test_normalize_by_median_double_file_name():
Expand Down

0 comments on commit af68d9d

Please sign in to comment.