diff --git a/CITATION b/CITATION index 1e84c0141b..31b407d05c 100644 --- a/CITATION +++ b/CITATION @@ -43,8 +43,8 @@ If you use the khmer software, you must cite: url = "http://dx.doi.org/10.12688/f1000research.6924.1" } -If you use any of our published scientific methods, you should *also* -cite the relevant paper(s), as directed below. Additionally some scripts use +If you use any of our published scientific methods you should *also* +cite the relevant paper(s) as directed below. Additionally some scripts use the `SeqAn library `_ for read parsing: the full citation for that library is also included below. diff --git a/ChangeLog b/ChangeLog index 19194ebca0..cde43d1951 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,19 @@ -2015-08-11 Michael R. Crusoe +2015-08-12 Jacob Fenton + + * doc/dev/{codebase-guide,coding-guidelines-and-review,development, + for-khmer-developers,getting-started,release,scripts-and-sandbox, + binary-file-formats}.rst,doc/{index,introduction,whats-new-2.0, + contributors}.rst,doc/user/{blog-posts,choosing-table-sizes,galaxy, + getting-help,guide,install,known-issues,partitioning-big-data, + scripts}.rst,CITATION: Cleaned up documentation + * scripts/*.py, khmer/khmer_args.py: added epilog sanitation + * scripts/{load-into-counting,load-graph,load-into-countgraph, + load-into-nodegraph}.py, tests/{test_scripts,test_normalize_by_median, + test_streaming_io,test_countgraph}: renamed load-into-counting -> + load-into-countgraph, load-graph -> load-into-nodegraph, fixed tests to not + bork + +2015-08-12 Michael R. Crusoe * CITATION, doc/{index,introduction,user/scripts}.rst, khmer/khmer_args.py: formatting fixes and new citation for the software as a whole diff --git a/doc/LICENSE.rst b/doc/LICENSE.rst index 097af7b120..a9f1b8c8e5 100644 --- a/doc/LICENSE.rst +++ b/doc/LICENSE.rst @@ -5,6 +5,7 @@ License ======= Copyright (c) 2010-2014, Michigan State University. All rights reserved. 
+Copyright (c) 2015, The Regents of the University of California Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/doc/contributors.rst b/doc/contributors.rst index 76a4134a64..a9c612bb20 100644 --- a/doc/contributors.rst +++ b/doc/contributors.rst @@ -31,4 +31,4 @@ dramatically. Michael R. Crusoe took over maintainership June, 2013. -MRC 2015-07-31 +Last updated by MRC on 2015-07-31 diff --git a/doc/dev/binary-file-formats.rst b/doc/dev/binary-file-formats.rst index 8359af3906..f32c6bae13 100644 --- a/doc/dev/binary-file-formats.rst +++ b/doc/dev/binary-file-formats.rst @@ -56,6 +56,7 @@ Use Bigcount 1 6 ``0x01`` if bigcounts is used, else ``0x00`` K-size 4 7 k-mer length, ``ht._ksize``. [``uint32_t``] Number of Tables 1 11 Number of Count-min Sketch tables, ``ht._n_tables``. [``uint8_t``] +Occupied Bins 8 12 Number of occupied bins ================== ===== ===== ============================================== Then follows the Countgraph's tables. For each table: @@ -100,6 +101,7 @@ File Type 1 5 ``0x02`` (``SAVED_HASHBITS``) K-size 4 6 k-mer length, ``ht._ksize``. [``unsigned int``] Number of Tables 1 10 Number of Nodegraph tables. ``ht._n_tables``. [``uint8_t``] +Occupied Bins 8 11 Number of occupied bins ================== ===== ===== ============================================== Then follows the Nodegraph's tables. For each table: diff --git a/doc/dev/codebase-guide.rst b/doc/dev/codebase-guide.rst index c5c77e4421..9ff2ad0527 100644 --- a/doc/dev/codebase-guide.rst +++ b/doc/dev/codebase-guide.rst @@ -9,10 +9,10 @@ The ChangeLog file lists changes to the codebase, most recent first. The lib/ directory contains all of the C++ code. -The khmer/ directory contains the khmer package (khmer/__init__.py etc) -and the C++-to-Python bridge (khmer/_khmermodule.cc). 
+The `khmer/` directory contains the `khmer` package (`khmer/__init__.py`, etc) +and the C++-to-Python bridge (`khmer/_khmermodule.cc`). -The scripts/ and sandbox/ directory contain Python command-line scripts. +The `scripts/` and `sandbox/` directory contain Python command-line scripts. -The tests/ directory contains all of the tests. Each test is a function in -one of the tests/test*.py files. +The `tests/` directory contains all of the tests. Each test is a function in +one of the `tests/test*.py` files. diff --git a/doc/dev/coding-guidelines-and-review.rst b/doc/dev/coding-guidelines-and-review.rst index 9bfa820513..543c1e6de8 100644 --- a/doc/dev/coding-guidelines-and-review.rst +++ b/doc/dev/coding-guidelines-and-review.rst @@ -109,6 +109,7 @@ ready for review:: http://en.wikipedia.org/wiki/Changelog#Format - [ ] Was a spellchecker run on the source code and documentation after changes were made? + - [ ] Does the script respect streaming IO? (Is it tested for streaming IO?) - [ ] Is the Copyright year up to date? **Note** that after you submit the comment you can check and uncheck diff --git a/doc/dev/development.rst b/doc/dev/development.rst index 414f9140bc..4ee494e92f 100644 --- a/doc/dev/development.rst +++ b/doc/dev/development.rst @@ -15,7 +15,7 @@ tag. Build framework --------------- -'make' should build everything, including tests and "development" code. +`make` should build everything, including tests and "development" code. git and GitHub strategies ------------------------- @@ -54,13 +54,13 @@ Pipelines --------- All khmer scripts used by a published recommended analysis pipeline must be -included in scripts/ and meet the standards therein implied. +included in `scripts/` and meet the standards therein implied. Command line scripts -------------------- Python command-line scripts should use '-' instead of '_' in the name. -(Only filenames containing code for import imported should use _.) +(Only filenames containing code for import should use _.) 
Please follow the command-line conventions used under scripts/. This includes most especially standardization of '-x' to be hash table size, @@ -83,19 +83,12 @@ Command line thoughts: ---- -All code in scripts/ must have automated tests; see tests/test_scripts.py. -Otherwise it belongs in sandbox/. +All code in `scripts/` must have automated tests; see `tests/test_scripts.py`. +Otherwise it belongs in `sandbox/`. When files are overwritten, they should only be opened to be overwritten after the input files have been shown to exist. That prevents stupid -command like mistakes from trashing important files. - -It would be nice to allow piping from one command to another where possible. -But this seems complicated. - -CTB: should we squash output files (overwrite them if they exist), or not? -So far, leaning towards 'not', as that way no one is surprised and loses -their data. +command line mistakes from trashing important files. A general error should be signaled by exit code `1` and success by `0`. Linux supports exit codes from `0` to `255` where the value `1` means a general @@ -115,7 +108,7 @@ Python / C integration ---------------------- The Python extension that wraps the C++ core of khmer lives in -khmer/_khmermodule.CC +`khmer/_khmermodule.cc` This wrapper code is tedious and annoying so we use a static analysis tool to check for correctness. diff --git a/doc/dev/for-khmer-developers.rst b/doc/dev/for-khmer-developers.rst index bbf6368f21..16f96f9a5c 100644 --- a/doc/dev/for-khmer-developers.rst +++ b/doc/dev/for-khmer-developers.rst @@ -13,7 +13,7 @@ rule can be broken under specific conditions when doing a release; see :doc:`release`. Second, need to force another continuous integration run? Put "test -this please" in a comment. This can be used to ask our continuous +this please" in a comment. 
This is used to ask our continuous integration system to run on someone else's pull request -- by default, it only runs on commits from people who have write privileges to khmer, so you may need to do this if you're reviewing someone else's @@ -21,6 +21,6 @@ pull request. Third, we ask that all contributors set up standing Pull Requests while they are working something. (This is a **requirement** if -you're in the GED lab.) This lets us track what's going on. On the +you're in the DIB lab.) This lets us track what's going on. On the flip side, please do not review pull requests until they are indicated as "ready for review". diff --git a/doc/dev/getting-started.rst b/doc/dev/getting-started.rst index 164c65544c..196b8627f4 100644 --- a/doc/dev/getting-started.rst +++ b/doc/dev/getting-started.rst @@ -195,7 +195,7 @@ Building khmer and running the tests You should see lots of output, with something like:: - Ran 360 tests in 10.403s + Ran 633 tests in 47.446s OK @@ -230,6 +230,33 @@ Claiming an issue and starting to develop (This pulls in all of the latest changes from whatever we've been doing on dib-lab.) + It is possible that when you do a `git pull` you will get a "merge + conflict"--This is what happens when something changed in the branch you're + pulling in, in the same place you made a change in your local copy. This + frequently happens in the `Changelog` file. + + Git will complain loudly about merges and tell you specifically in which + files they occurred. If you open the file, you'll see something vaguely + like this in the place where the merge occurred:: + + <<<<<<< HEAD + Changes made on the branch that is being merged into. In most cases, + this is the branch that you have currently checked out + ======= + Changes made on the branch that is being merged in, almost certainly + master. 
+ >>>>>>> abcde1234 + + Though there are a variety of tools to assist with resolving merge + conflicts they can be quite complicated at first glance and it is usually + easy enough to manually resolve the conflict. + + To resolve the conflict you simply have to manually 'meld' the changes + together and remove the merge markers. + + After this you'll have to add and commit the merge just like any other set + of changes. It's also recommended that you run tests. + #. Create a new branch and link it to your fork on GitHub:: git checkout -b fix/brief_issue_description @@ -242,9 +269,42 @@ Claiming an issue and starting to develop #. Make some changes and commit them. - This will be issue dependent ;). + Though this will largely be issue-dependent, the basics of committing are + simple. After you've made a cohesive set of changes, run the command `git + status`. This will display a list of all the files git has noticed you + changed. Files in the 'untracked' section are files that haven't existed + previously in the repository but git has noticed. + + To commit changes you have to 'stage' them--this is done by issuing the + following command:: + + git add path/to/file + + If you have a large quantity of changes and you don't want to add each file + manually you can do `git add --all`--but be careful! This can sometimes add + changes you didn't intend to commit, such as extraneous test data--You can + do a `git diff` to see the difference between the current state of your + branch and the last time you made a commit. + + Once you have staged your changes, it's time to make a commit:: + + git commit + + Git will then open your default console text editor to write a commit + message--this is a short (typically 1-3 sentence) description of the + changes you've made. Please make your commit message informative but + concise--these messages become part of the 'official' history of the + project. 
+ + Once your changes have been committed, push them up to the remote branch:: + + git push + + If this is your first commit on a new branch git will error out, telling + you the remote branch doesn't exist--This is fine, as it will also provide + the command to create the branch. Copy/paste/run and you should be set. - (You should visit and read :doc:`coding-guidelines-and-review`.) + You should also visit and read :doc:`coding-guidelines-and-review`. #. Periodically update your branch from the main khmer master branch:: @@ -256,7 +316,7 @@ Claiming an issue and starting to develop #. Run the tests and/or build the docs *before* pushing to GitHub:: - make doc test pep8 + make doc test pep8 diff-cover Make sure they all pass! diff --git a/doc/dev/release.rst b/doc/dev/release.rst index cb3da630a7..73cebfc3c7 100644 --- a/doc/dev/release.rst +++ b/doc/dev/release.rst @@ -40,7 +40,7 @@ release makers, following this checklist by MRC. #. Review the issue list for any new bugs that will not be fixed in this release. Add them to ``doc/known-issues.txt`` -#. Verify that the build is clean: http://ci.ged.msu.edu/job/khmer-master/ +#. Verify that the build is clean: http://ci.oxli.org/job/khmer-master/ #. Submit a build to Coverity Scan if it hasn't been done recently. You can get the token from @@ -221,7 +221,7 @@ cross-platform testing environment. 
Setuptools Bootstrap -------------------- -ez_setup.py is from https://bitbucket.org/pypa/setuptools/raw/bootstrap/ +`ez_setup.py` is from https://bitbucket.org/pypa/setuptools/raw/bootstrap/ Before major releases it should be examined to see if there are new versions available and if the change would be useful diff --git a/doc/dev/scripts-and-sandbox.rst b/doc/dev/scripts-and-sandbox.rst index 20d6bf2af2..42ecd21a72 100644 --- a/doc/dev/scripts-and-sandbox.rst +++ b/doc/dev/scripts-and-sandbox.rst @@ -78,6 +78,15 @@ Our current Copyright message is:: # Contact: khmer-project@idyll.org # +Some files are copyright University of California Regents:: + + # + # This file is part of khmer, https://github.com/dib-lab/khmer/, and is + # Copyright (C) The Regents of the University of California, 2015. + # It is licensed under the three-clause BSD license; see doc/LICENSE.txt. + # Contact: khmer-project@idyll.org + # + The beginning year should be the first year that this file existed in the repo; the end year should be the last year a coding change was made in the file. diff --git a/doc/index.rst b/doc/index.rst index ed5896716d..b1196b2dba 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -4,10 +4,23 @@ khmer -- k-mer counting & filtering FTW ####################################### -:Authors: Michael R. Crusoe, Greg Edvenson, Jordan Fish, Adina Howe, - Luiz Irber, Eric McDonald, Joshua Nahum, Kaben Nanlohy, Humberto - Ortiz-Zuazaga, Jason Pell, Jared Simpson, Camille Scott, Ramakrishnan - Rajaram Srinivasan, Qingpeng Zhang, and C. Titus Brown +:Authors: Michael R. Crusoe, ACharbonneau, James A. Stapleton, Sherine + Awad, Elmar Bucher, Adam Caldwell, Reed Cartwright, Bede Constantinides, + Peter Dave Hello, Kevin D. Murray, Greg Edvenson, Hussien F. 
Alameldin, + Scott Fay, Jacob Fenton, Thomas Fenzl, Jordan Fish, Leonor + Garcia-Gutierrez, Phillip Garland, Jonathan Gluck, Iván González, Sarah + Guermond, Jiarong Guo, Aditi Gupta, Andreas Härpfer, Adina Howe, + Alex Hyer, Luiz Irber, Alexander Johan Nederbragt, Rhys Kidd, David Lin, + Justin Lippi, Heather L. Wiencko, Tamer Mansour, Pamela McA'Nulty, Eric + McDonald, Jessica Mizzi, Kevin Murray, Kaben Nanlohy, Humberto + Ortiz-Zuazaga, Jeramia Ory, Jason Pell, Charles Pepe-Ranney, Rodney + Picett, Ryan R. Boyce, Michael R. Crusoe, Joshua R. Herr, Joshua R. + Nahum, Erich Schwarz, Camille Scott, Josiah Seaman, Scott Sievert, Jared + Simpson, James Spencer, Ramakrishnan Srinivasan, Daniel Standage, Joe + Stein, Susan Steinman, Benjamin Taylor, C. Titus Brown, Will Trimble, + Connor T. Skennerton, Michael Wright, Brian Wyss, Qingpeng Zhang, en + zyme, C. Titus Brown + :Contact: khmer-project@idyll.org :GitHub: https://github.com/dib-lab/khmer @@ -16,7 +29,7 @@ khmer -- k-mer counting & filtering FTW khmer is a library and suite of command line tools for working with -DNA sequence. It is primarily aimed at short-read sequencing data +DNA sequences. It is primarily aimed at short-read sequencing data such as that produced by the Illumina platform. khmer takes a k-mer-centric approach to sequence analysis, hence the name. @@ -32,7 +45,8 @@ the following URLs: * Announcements: http://lists.idyll.org/listinfo/khmer-announce -The archives for the khmer list are available at: http://lists.idyll.org/pipermail/khmer/ +The archives for the khmer mailing list are available at: +http://lists.idyll.org/pipermail/khmer/ khmer development has largely been supported by AFRI Competitive Grant no. `2010-65205-20361 diff --git a/doc/introduction.rst b/doc/introduction.rst index 9b25700535..589a65c1f4 100644 --- a/doc/introduction.rst +++ b/doc/introduction.rst @@ -11,7 +11,7 @@ khmer is a library and toolkit for doing k-mer-based dataset analysis and transformations. 
Our focus in developing it has been on scaling assembly of metagenomes and mRNA. -khmer can be used for a number of transformations, include inexact +khmer can be used for a number of transformations, including inexact transformations (abundance filtering and error trimming) and exact transformations (graph-size filtering, to throw away disconnected reads; and partitioning, to split reads into disjoint sets). Of these, only partitioning @@ -34,16 +34,16 @@ will never incorrectly report a k-mer as being absent when it *is* present. This one-sided error makes the Bloom filter very useful for certain kinds of operations. -khmer is also independent of K, and currently works for K <= 32. We will be -integrating code for up to K=64 soon. +khmer is also independent of a specific k-size (K), and currently works for +K <= 32. We will be integrating code for K<=64 soon. khmer is implemented in C++ with a Python wrapper, which is what all of the scripts use. -Some important documentation for khmer is provided on the Web sites for +Documentation for khmer is provided on the Web sites for `khmer-protocols `__ and `khmer-recipes `__. khmer-protocols provides detailed -protocols for using khmer to analyze either a transcriptome or a metagenome; +protocols for using khmer to analyze either a transcriptome or a metagenome. khmer-recipes provides individual recipes for using khmer in a variety of sequence-oriented tasks such as extracting reads by coverage, estimating a genome or metagenome size from unassembled reads, and error-trimming reads via @@ -71,7 +71,7 @@ immediately useful for a few different operations, including: - optimizing assemblies on various parameters; - - converting FASTA to FASTQ; + - converting FASTQ to FASTA; and a few other random functions. 
@@ -94,6 +94,7 @@ Copyright and license ===================== Portions of khmer are Copyright California Institute of Technology, -where the exact counting code was first developed; the remainder is -Copyright Michigan State University. The code is freely available for -use and re-use under the BSD License. +where the exact counting code was first developed. All other code developed +through 2014 is copyright Michigan State University. Portions are copyright +Michigan State University and Regents of the University of California. +All the code is freely available for use and re-use under the BSD License. diff --git a/doc/user/blog-posts.rst b/doc/user/blog-posts.rst index ab939c7729..c0eb192787 100644 --- a/doc/user/blog-posts.rst +++ b/doc/user/blog-posts.rst @@ -32,7 +32,7 @@ scripts can be used to generate the k-mer abundance profile data, after loading all the k-mer counts into a .ct file:: # first, load all the k-mer counts: - load-into-counting.py -k 20 -x 1e7 25k.ct data/25k.fq.gz + load-into-countgraph.py -k 20 -x 1e7 25k.ct data/25k.fq.gz # then, build the '.freq' file that contains all of the counts by position python sandbox/fasta-to-abundance-hist.py 25k.ct data/25k.fq.gz diff --git a/doc/user/choosing-table-sizes.rst b/doc/user/choosing-table-sizes.rst index caba889af4..260e5d00cd 100644 --- a/doc/user/choosing-table-sizes.rst +++ b/doc/user/choosing-table-sizes.rst @@ -18,7 +18,7 @@ details.) This is what the :option:`-M` parameter does. If you set it too low, khmer will warn you to set it higher at the end. See below for some good choices for various kinds of data. -**Note for khmer 1.x users:** as of khmer 2.0, the :option:`-M` +**Note for khmer 1.x users:** As of khmer 2.0, the :option:`-M` parameter sets the :option:`-N`/:option:`--n_tables` and :option:`-x`/:option:`--max_tablesize` parameters automatically. You can still set these parameters directly if you wish. @@ -57,7 +57,7 @@ memory, decrease the table size. 
Also see the rules of thumb, below. -The real full version +The long version ===================== khmer's scripts, at their heart, represents k-mers in a very memory diff --git a/doc/user/galaxy.rst b/doc/user/galaxy.rst index 763b958964..6127b50dcb 100644 --- a/doc/user/galaxy.rst +++ b/doc/user/galaxy.rst @@ -26,14 +26,14 @@ Single Output Usage For one or more files into a single file: #. Choose 'Normalize By Median' from the 'khmer protocols' section of the -'Tools' menu. + 'Tools' menu. #. Compatible files already uploaded to your Galaxy instance should be listed. -If not then you may need to `set their datatype manually -`__. + If not then you may need to `set their datatype manually + `__. #. After selecting the input files specify if they are paired-interleaved -or not. + or not. #. Specify the sample type or show the advanced parameters to set the tablesize -yourself. Consult :doc:`../user/choosing-table-sizes` for assistance. + yourself. Consult :doc:`../user/choosing-table-sizes` for assistance. diff --git a/doc/user/getting-help.rst b/doc/user/getting-help.rst index e84ff672ab..9a36218b59 100644 --- a/doc/user/getting-help.rst +++ b/doc/user/getting-help.rst @@ -27,7 +27,8 @@ Mailing List ------------ The primary way to get help is through the khmer discussion list: -http://lists.idyll.org/listinfo/khmer +http://lists.idyll.org/listinfo/khmer, though we are also available for +closer-to-realtime support via `Gitter `_. Asking a question ----------------- @@ -50,5 +51,5 @@ Asking a question GitHub ------ -You are also welcome to report an issue you are having using GitHub:: +You are also welcome to report an issue you are having using GitHub: https://github.com/dib-lab/khmer/issues/new diff --git a/doc/user/guide.rst b/doc/user/guide.rst index 05cf2e623d..be9bfd08c6 100644 --- a/doc/user/guide.rst +++ b/doc/user/guide.rst @@ -11,7 +11,7 @@ about. 
Moreover, our assembly strategies themselves are also under constant evolution as we do more research and find ever-wider applicability of our approaches. -Note, this is an exact copy of `Titus' blog post, here +Note, this is modified from `Titus' blog post, here `__ -- go check the bottom of that for comments. @@ -25,12 +25,6 @@ Pavangadkar, Likit Preeyanon, and others. Introduction ~~~~~~~~~~~~ -khmer is a general `framework for low-memory k-mer counting, filtering, -and advanced trickery `__. - -The latest source is always available `here -`__. - khmer is really focused on short read data, and, more specifically, Illumina, because that's where we have a too-much-data problem. However, a lot of the prescriptions below can be adapted to longer @@ -39,51 +33,21 @@ read technologies such as 454 and Ion Torrent without much effort. Don't try to use our k-mer approaches with PacBio -- the error rate is too high. -There are currently two papers available on khmer: the `partitioning -paper -`__ and -the `digital normalization paper `__. - There are many blog posts about this stuff on `Titus Brown's blog `__. We will try to link them in where appropriate. -Asking for help -~~~~~~~~~~~~~~~ - -There's some documentation here: - - https://khmer.readthedocs.org/en/latest/ - -There's also a khmer mailing list at lists.idyll.org that you can use to -get help with khmer. To sign up, just go to -`the khmer lists page `__ and -subscribe. - Preparing your sequences ~~~~~~~~~~~~~~~~~~~~~~~~ Do all the quality filtering, trimming, etc. that you think you should do. -Most of the khmer tools currently work "out of the box" on interleaved -paired-end data. Ask on the list if you're not sure. +The khmer tools work "out of the box" on interleaved paired-end data. All of our scripts will take in .fq or .fastq files as FASTQ, and all other files as FASTA. gzip files are always accepted. Let us know if not; that's a bug! -Most scripts *output* FASTA, and some mangle headers. Sorry. 
We're -working on outputting FASTQ for FASTQ input, and removing any header -mangling. - -Picking k-mer table sizes and k parameters -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -For k-mer table sizes, read :doc:`choosing-table-sizes` - -For k-mer sizes, we recommend k=20 for digital normalization and k=32 -for partitioning; then assemble with a variety of k parameters. - Genome assembly, including MDA samples and highly polymorphic genomes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -102,10 +66,10 @@ For low-coverage libraries (< 50x) do single-pass digital normalization: run normalize-by-median to C=10. 2. Extract any remaining paired-end reads and lump remaining orphan -reads into singletons using strip-and-split-for-assembly + reads into singletons using strip-and-split-for-assembly 3. Then assemble as normal, with appropriate insert size specs -etc. for the paired end reads. + etc. for the paired end reads. You can read about this process in the `digital normalization paper `__. @@ -114,14 +78,13 @@ mRNAseq assembly ~~~~~~~~~~~~~~~~ 1. Apply single-pass digital normalization. - -Run normalize-by-median to C=20. + Run normalize-by-median to C=20. 2. Extract any remaining paired-end reads and lump remaining orphan -reads into singletons using strip-and-split-for-assembly + reads into singletons using strip-and-split-for-assembly 3. Then assemble as normal, with appropriate insert size specs -etc. for the paired end reads. + etc. for the paired end reads. You can read about this process in the `digital normalization paper `__. @@ -130,17 +93,16 @@ Metagenome assembly ~~~~~~~~~~~~~~~~~~~ 1. Apply single-pass digital normalization. - -Run normalize-by-median to C=20 (we've also found C=10 works fine). + Run normalize-by-median to C=20 (we've also found C=10 works fine). 2. Run filter-below-abund with C=50 (if you diginormed to C=10) or -C=100 (if you diginormed to C=20); + C=100 (if you diginormed to C=20); 3. 
Partition reads with load-graph, etc. etc. 4. Assemble groups as normal, extracting paired-end reads and lumping -remaining orphan reads into singletons using -strip-and-split-for-assembly. + remaining orphan reads into singletons using + strip-and-split-for-assembly. (We actually use Velvet at this point, but there should be no harm in using a metagenome assembler such as MetaVelvet or MetaIDBA or @@ -186,7 +148,7 @@ Quantifying mRNAseq or metagenomes assembled with digital normalization For now, khmer only deals with assembly! So: assemble. Then, go back to your original, unnormalized reads, and map those to your assembly -with e.g. bowtie. Then count as you normally would :). +with e.g. bowtie. Then count as you normally would). Philosophy of digital normalization ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -199,16 +161,16 @@ you should load in paired end reads, or longer reads, first. Iterative and independent normalization ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -You can use :option:`--loadtable` and :option:`--savetable` to do iterative +You can use :option:`--loadgraph` and :option:`--savegraph` to do iterative normalizations on multiple files in multiple steps. For example, break :: normalize-by-median.py [ ... ] file1.fa file2.fa file3.fa into multiple steps like so:: - normalize-by-median.py [ ... ] --savetable file1.ct file1.fa - normalize-by-median.py [ ... ] --loadtable file1.ct --savetable file2.ct file2.fa - normalize-by-median.py [ ... ] --loadtable file2.ct --savetable file3.ct file3.fa + normalize-by-median.py [ ... ] --savegraph file1.ct file1.fa + normalize-by-median.py [ ... ] --loadgraph file1.ct --savegraph file2.ct file2.fa + normalize-by-median.py [ ... ] --loadgraph file2.ct --savegraph file3.ct file3.fa The results should be identical! 
diff --git a/doc/user/install.rst b/doc/user/install.rst index c3b49db4bb..1643bbce89 100644 --- a/doc/user/install.rst +++ b/doc/user/install.rst @@ -62,9 +62,11 @@ Installing khmer inside the virtualenv Run the tests ^^^^^^^^^^^^^ -After installing you can run the embedded test suite:: +After installing you can run the embedded test suite. If you are running an +OSX system you should also add `!linux` to prevent linux-specific tests from +failing.:: - nosetests khmer --attr '!known_failing' + nosetests khmer --attr '!known_failing,!huge' If the nosetests binary isn't installed then:: diff --git a/doc/user/partitioning-big-data.rst b/doc/user/partitioning-big-data.rst index 1b5dec7385..4fb8e140f6 100644 --- a/doc/user/partitioning-big-data.rst +++ b/doc/user/partitioning-big-data.rst @@ -55,7 +55,7 @@ https://s3.amazonaws.com/public.ged.msu.edu/khmer/iowa-corn-50m.fa.gz # the next command will create a '50m.ct' and a '50m.tagset', # representing the de Bruijn graph - load-graph.py -k 32 -N 4 -x 16e9 50m iowa-corn-50m.fa.gz + load-into-nodegraph.py -k 32 -N 4 -x 16e9 50m iowa-corn-50m.fa.gz # this will then partition that graph. should take a while. # update threads to something higher if you have more cores. @@ -81,7 +81,7 @@ https://s3.amazonaws.com/public.ged.msu.edu/khmer/iowa-corn-50m.fa.gz mv iowa-corn-50m.group0005.fa corn-50m.lump.fa # create graph, - load-graph.py -x 8e9 lump corn-50m.lump.fa + load-into-nodegraph.py -x 8e9 lump corn-50m.lump.fa # create an initial set of stoptags to help in knot-traversal; otherwise, # partitioning and knot-traversal (which is systematic) is really expensive. 
@@ -102,7 +102,7 @@ https://s3.amazonaws.com/public.ged.msu.edu/khmer/iowa-corn-50m.fa.gz # fasta, therefore if your files are fastq formatted you need # to append 'fastq' to the name so that 'load-graph.py' # will parse the file correctly - load-graph.py -x 8e9 lumpfilt corn-50m.lump.fa.stopfilt + load-into-nodegraph.py -x 8e9 lumpfilt corn-50m.lump.fa.stopfilt partition-graph.py -T 4 lumpfilt merge-partitions.py lumpfilt annotate-partitions.py lumpfilt corn-50m.lump.fa.stopfilt diff --git a/doc/user/scripts.rst b/doc/user/scripts.rst index 963c941464..acb8fa33cb 100644 --- a/doc/user/scripts.rst +++ b/doc/user/scripts.rst @@ -27,15 +27,14 @@ This should generally be set as high as possible; see output the same. Some scripts may only recognize FASTQ if the file ending is '.fq' or '.fastq', at least for now. - Files ending with '.gz' will be treated as gzipped files, and - files ending with '.bz2' will be treated as bzip2'd files. + Gzip and bzip2 compressed files are detected using magic bits. .. _scripts-counting: k-mer counting and abundance filtering ====================================== -.. autoprogram:: load-into-counting:get_parser() +.. autoprogram:: load-into-countgraph:get_parser() :prog: load-into-counting.py .. autoprogram:: abundance-dist:get_parser() @@ -67,7 +66,7 @@ Partitioning .. autoprogram:: do-partition:get_parser() :prog: do-partition.py -.. autoprogram:: load-graph:get_parser() +.. autoprogram:: load-into-nodegraph:get_parser() :prog: load-graph.py See :program:`extract-partitions.py` for a complete workflow. diff --git a/doc/whats-new-2.0.rst b/doc/whats-new-2.0.rst index 2fedcf8fa2..6ada4e2f4e 100644 --- a/doc/whats-new-2.0.rst +++ b/doc/whats-new-2.0.rst @@ -54,8 +54,8 @@ automatically to match the desired memory usage. (:option:`--min-tablesize` was also renamed to :option:`--max-tablesize` to reflect this more desirable behavior.) -Binary file formats have changed! 
---------------------------------- +Binary file formats have changed +-------------------------------- All binary khmer formats (presence tables, counting tables, tag sets, stop tags, and partition subsets) have changed. Files are now diff --git a/khmer/khmer_args.py b/khmer/khmer_args.py index 7e820cda1c..b280bccc2c 100644 --- a/khmer/khmer_args.py +++ b/khmer/khmer_args.py @@ -391,6 +391,13 @@ def add_threading_args(parser): parser.add_argument('--threads', '-T', default=DEFAULT_N_THREADS, type=int, help='Number of simultaneous threads to execute') + +def sanitize_epilog(parser): + parser.epilog = parser.epilog.replace( + '//', '/').replace(':option:', '').replace( + ':program:', '').replace('::', ':') + return parser + _algorithms = { 'software': 'MR Crusoe et al., ' '2015. http://dx.doi.org/10.12688/f1000research.6924.1', diff --git a/sandbox/collect-reads.py b/sandbox/collect-reads.py index ddf1c25a14..05f886b29e 100755 --- a/sandbox/collect-reads.py +++ b/sandbox/collect-reads.py @@ -22,7 +22,7 @@ import khmer from khmer import khmer_args from khmer.khmer_args import (build_counting_args, report_on_config, info, - calculate_graphsize) + calculate_graphsize, sanitize_epilog) from khmer.kfile import check_input_files, check_space from khmer.kfile import check_space_for_graph import argparse @@ -68,7 +68,7 @@ def get_parser(): def main(): info('collect-reads.py', ['counting']) - args = get_parser().parse_args() + args = sanitize_epilog(get_parser()).parse_args() report_on_config(args) base = args.output_countgraph_filename diff --git a/sandbox/correct-reads.py b/sandbox/correct-reads.py index 46e70d9681..7a14f5bff8 100755 --- a/sandbox/correct-reads.py +++ b/sandbox/correct-reads.py @@ -26,7 +26,7 @@ import argparse from khmer.khmer_args import (build_counting_args, info, add_loadgraph_args, - report_on_config) + report_on_config, sanitize_epilog) from khmer.utils import write_record, write_record_pair, broken_paired_reader from khmer.kfile import (check_space, 
check_space_for_graph, check_valid_file_exists) @@ -114,8 +114,7 @@ def get_parser(): def main(): info('correct-reads.py', ['streaming']) - parser = get_parser() - args = parser.parse_args() + args = sanitize_epilog(get_parser()).parse_args() ### diff --git a/sandbox/estimate_optimal_hash.py b/sandbox/estimate_optimal_hash.py index 2b8d45056b..0c6f8b21a8 100755 --- a/sandbox/estimate_optimal_hash.py +++ b/sandbox/estimate_optimal_hash.py @@ -29,7 +29,7 @@ from __future__ import print_function import argparse import khmer, oxli -from khmer.khmer_args import info, optimal_size +from khmer.khmer_args import info, optimal_size, sanitize_epilog import textwrap import sys @@ -70,7 +70,7 @@ def get_parser(): def main(): info('estimate_optimal_hash.py', ['counting']) - args = get_parser().parse_args() + args = sanitize_epilog(get_parser()).parse_args() N = args.N if args.M: M = args.M diff --git a/sandbox/saturate-by-median.py b/sandbox/saturate-by-median.py index 05784701f6..1e28f01117 100755 --- a/sandbox/saturate-by-median.py +++ b/sandbox/saturate-by-median.py @@ -21,7 +21,8 @@ import textwrap from khmer.khmer_args import (build_counting_args, add_loadgraph_args, - report_on_config, info, create_countgraph) + report_on_config, info, create_countgraph, + sanitize_epilog) import argparse from khmer.kfile import (check_space, check_space_for_graph, check_valid_file_exists) @@ -177,7 +178,8 @@ def get_parser(): def main(): # pylint: disable=too-many-branches,too-many-statements info('saturate-by-median.py', ['diginorm']) - args = get_parser().parse_args() + parser = sanitize_epilog(get_parser()) + args = parser.parse_args() report_on_config(args) diff --git a/sandbox/sweep-files.py b/sandbox/sweep-files.py index 4f12d43e67..55a13a9fff 100755 --- a/sandbox/sweep-files.py +++ b/sandbox/sweep-files.py @@ -36,7 +36,8 @@ import os import time import khmer -from khmer.khmer_args import (build_nodegraph_args, report_on_config, info) +from khmer.khmer_args import 
(build_nodegraph_args, report_on_config, info, + sanitize_epilog) DEFAULT_OUT_PREF = 'reads' DEFAULT_RANGE = -1 @@ -100,7 +101,7 @@ def clear(self): def main(): #info('sweep-files.py', ['sweep']) - parser = get_parser() + parser = sanitize_epilog(get_parser()) args = parser.parse_args() if args.max_tablesize < MIN_HSIZE: diff --git a/sandbox/sweep-reads.py b/sandbox/sweep-reads.py index fa878e656d..b5b9e801fd 100755 --- a/sandbox/sweep-reads.py +++ b/sandbox/sweep-reads.py @@ -38,7 +38,8 @@ import os import time import khmer -from khmer.khmer_args import (build_nodegraph_args, report_on_config, info) +from khmer.khmer_args import (build_nodegraph_args, report_on_config, info, + sanitize_epilog) from khmer.kfile import (check_input_files, check_valid_file_exists, check_space) @@ -205,7 +206,7 @@ def get_parser(): def main(): info('sweep-reads-buffered.py', ['sweep']) - parser = get_parser() + parser = sanitize_epilog(get_parser()) args = parser.parse_args() if args.max_tablesize < MAX_HSIZE: diff --git a/scripts/abundance-dist-single.py b/scripts/abundance-dist-single.py index a032e8a831..0ce40e8fe1 100755 --- a/scripts/abundance-dist-single.py +++ b/scripts/abundance-dist-single.py @@ -24,7 +24,8 @@ import textwrap from khmer import khmer_args from khmer.khmer_args import (build_counting_args, add_threading_args, - report_on_config, info, calculate_graphsize) + report_on_config, info, calculate_graphsize, + sanitize_epilog) from khmer.kfile import (check_input_files, check_space_for_graph) @@ -51,7 +52,7 @@ def get_parser(): '(4) fraction of total distinct k-mers.') parser.add_argument('-z', '--no-zero', dest='output_zero', default=True, action='store_false', - help='Do not output 0-count bins') + help='Do not output zero-count bins') parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True, action='store_false', help='Do not count k-mers past 255') @@ -68,7 +69,7 @@ def get_parser(): def main(): # pylint: disable=too-many-locals,too-many-branches 
info('abundance-dist-single.py', ['counting', 'SeqAn']) - args = get_parser().parse_args() + args = sanitize_epilog(get_parser()).parse_args() report_on_config(args) check_input_files(args.input_sequence_filename, args.force) diff --git a/scripts/abundance-dist.py b/scripts/abundance-dist.py index ebbeedfe14..ad40d4534c 100755 --- a/scripts/abundance-dist.py +++ b/scripts/abundance-dist.py @@ -39,7 +39,7 @@ def get_parser(): 'count, (4) fraction of total distinct k-mers.') parser.add_argument('-z', '--no-zero', dest='output_zero', default=True, action='store_false', - help='Do not output 0-count bins') + help='Do not output zero-count bins') parser.add_argument('-s', '--squash', dest='squash_output', default=False, action='store_true', help='Overwrite existing output_histogram_filename') diff --git a/scripts/annotate-partitions.py b/scripts/annotate-partitions.py index a5e6d71247..b9fed70b6d 100755 --- a/scripts/annotate-partitions.py +++ b/scripts/annotate-partitions.py @@ -37,7 +37,7 @@ def get_parser(): Example (results will be in ``random-20-a.fa.part``):: - load-graph.py -k 20 example tests/test-data/random-20-a.fa + load-into-nodegraph.py -k 20 example tests/test-data/random-20-a.fa partition-graph.py example merge-partitions.py -k 20 example annotate-partitions.py -k 20 example tests/test-data/random-20-a.fa diff --git a/scripts/count-median.py b/scripts/count-median.py index 76714b3788..a5b8e73eaf 100755 --- a/scripts/count-median.py +++ b/scripts/count-median.py @@ -30,7 +30,7 @@ import khmer from khmer.kfile import check_input_files, check_space -from khmer.khmer_args import info +from khmer.khmer_args import info, sanitize_epilog def get_parser(): @@ -68,7 +68,7 @@ def get_parser(): def main(): info('count-median.py', ['diginorm']) - args = get_parser().parse_args() + args = sanitize_epilog(get_parser()).parse_args() htfile = args.countgraph input_filename = args.input diff --git a/scripts/do-partition.py b/scripts/do-partition.py index 
f74109f64d..abccee5bca 100755 --- a/scripts/do-partition.py +++ b/scripts/do-partition.py @@ -24,7 +24,7 @@ import textwrap from khmer import khmer_args from khmer.khmer_args import (build_nodegraph_args, report_on_config, info, - add_threading_args) + add_threading_args, sanitize_epilog) import glob from khmer.kfile import check_input_files, check_space import re @@ -72,11 +72,12 @@ def get_parser(): Load in a set of sequences, partition them, merge the partitions, and annotate the original sequences files with the partition information. - This script combines the functionality of :program:`load-graph.py`, - :program:`partition-graph.py`, :program:`merge-partitions.py`, and - :program:`annotate-partitions.py` into one script. This is convenient - but should probably not be used for large data sets, because - :program:`do-partition.py` doesn't provide save/resume functionality. + This script combines the functionality of + :program:`load-into-nodegraph.py`, :program:`partition-graph.py`, + :program:`merge-partitions.py`, and :program:`annotate-partitions.py` into + one script. This is convenient but should probably not be used for large + data sets, because :program:`do-partition.py` doesn't provide save/resume + functionality. 
""" parser = build_nodegraph_args( descr='Load, partition, and annotate FAST[AQ] sequences', @@ -102,7 +103,7 @@ def get_parser(): # pylint: disable=too-many-branches def main(): # pylint: disable=too-many-locals,too-many-statements info('do-partition.py', ['graph']) - args = get_parser().parse_args() + args = sanitize_epilog(get_parser()).parse_args() report_on_config(args, graphtype='nodegraph') @@ -120,7 +121,7 @@ def main(): # pylint: disable=too-many-locals,too-many-statements print('N THREADS', args.threads, file=sys.stderr) print('--', file=sys.stderr) - # load-graph + # load-into-nodegraph print('making nodegraph', file=sys.stderr) nodegraph = khmer_args.create_nodegraph(args) diff --git a/scripts/extract-paired-reads.py b/scripts/extract-paired-reads.py index ba750dc081..c7241f0905 100755 --- a/scripts/extract-paired-reads.py +++ b/scripts/extract-paired-reads.py @@ -24,7 +24,7 @@ import argparse import khmer from khmer.kfile import check_input_files, check_space -from khmer.khmer_args import info +from khmer.khmer_args import info, sanitize_epilog from khmer.kfile import add_output_compression_type from khmer.kfile import get_file_writer @@ -41,8 +41,8 @@ def get_parser(): normalization in non-paired mode, or partitioning) and separates the interleaved reads from the orphaned reads. - The default output is two files, .pe and .se, placed in the current directory. The .pe file contains + The default output is two files, `.pe` and `.se`, placed in the current directory. The .pe file contains interleaved and properly paired sequences, while the .se file contains orphan sequences. 
@@ -85,7 +85,7 @@ def get_parser(): def main(): info('extract-paired-reads.py') - args = get_parser().parse_args() + args = sanitize_epilog(get_parser()).parse_args() infile = args.infile check_input_files(infile, args.force) diff --git a/scripts/extract-partitions.py b/scripts/extract-partitions.py index 9d7093dcd9..f6f6cd440f 100755 --- a/scripts/extract-partitions.py +++ b/scripts/extract-partitions.py @@ -28,7 +28,7 @@ from khmer.kfile import (check_input_files, check_space, add_output_compression_type, get_file_writer) -from khmer.khmer_args import info +from khmer.khmer_args import info, sanitize_epilog from khmer.utils import write_record DEFAULT_MAX_SIZE = int(1e6) @@ -45,7 +45,7 @@ def get_parser(): epilog = """ Example (results will be in ``example.group0000.fa``):: - load-graph.py -k 20 example tests/test-data/random-20-a.fa + load-into-nodegraph.py -k 20 example tests/test-data/random-20-a.fa partition-graph.py example merge-partitions.py -k 20 example annotate-partitions.py -k 20 example tests/test-data/random-20-a.fa @@ -86,7 +86,7 @@ def get_parser(): # pylint: disable=too-many-statements def main(): # pylint: disable=too-many-locals,too-many-branches info('extract-partitions.py', ['graph']) - args = get_parser().parse_args() + args = sanitize_epilog(get_parser()).parse_args() distfilename = args.prefix + '.dist' diff --git a/scripts/filter-abund-single.py b/scripts/filter-abund-single.py index 8a82555085..5be844b447 100755 --- a/scripts/filter-abund-single.py +++ b/scripts/filter-abund-single.py @@ -26,7 +26,8 @@ from khmer.thread_utils import ThreadedSequenceProcessor, verbose_loader from khmer import khmer_args from khmer.khmer_args import (build_counting_args, report_on_config, - add_threading_args, info, calculate_graphsize) + add_threading_args, info, calculate_graphsize, + sanitize_epilog) from khmer.kfile import (check_input_files, check_space, check_space_for_graph, add_output_compression_type, @@ -42,7 +43,7 @@ def get_parser(): This script 
is constant memory. To trim reads based on k-mer abundance across multiple files, use - :program:`load-into-counting.py` and :program:`filter-abund.py`. + :program:`load-into-countgraph.py` and :program:`filter-abund.py`. Example:: @@ -68,7 +69,7 @@ def get_parser(): def main(): info('filter-abund-single.py', ['counting', 'SeqAn']) - args = get_parser().parse_args() + args = sanitize_epilog(get_parser()).parse_args() check_input_files(args.datafile, args.force) check_space([args.datafile], args.force) diff --git a/scripts/filter-abund.py b/scripts/filter-abund.py index 1f79a9e2c9..f23ae077da 100755 --- a/scripts/filter-abund.py +++ b/scripts/filter-abund.py @@ -23,7 +23,8 @@ import argparse import sys from khmer.thread_utils import ThreadedSequenceProcessor, verbose_loader -from khmer.khmer_args import (ComboFormatter, add_threading_args, info) +from khmer.khmer_args import (ComboFormatter, add_threading_args, info, + sanitize_epilog) from khmer.kfile import (check_input_files, check_space, add_output_compression_type, get_file_writer) from khmer import __version__ @@ -34,13 +35,13 @@ def get_parser(): epilog = """ - Trimmed sequences will be placed in ${input_sequence_filename}.abundfilt + Trimmed sequences will be placed in `${input_sequence_filename}.abundfilt` for each input sequence file. If the input sequences are from RNAseq or metagenome sequencing then :option:`--variable-coverage` should be used. 
Example:: - load-into-counting.py -k 20 -x 5e7 countgraph data/100k-filtered.fa + load-into-countgraph.py -k 20 -x 5e7 countgraph data/100k-filtered.fa filter-abund.py -C 2 countgraph data/100k-filtered.fa """ parser = argparse.ArgumentParser( @@ -79,7 +80,7 @@ def get_parser(): def main(): info('filter-abund.py', ['counting']) - args = get_parser().parse_args() + args = sanitize_epilog(get_parser()).parse_args() check_input_files(args.input_graph, args.force) infiles = args.input_filename diff --git a/scripts/filter-stoptags.py b/scripts/filter-stoptags.py index e2d1cb8c39..65999b9af5 100755 --- a/scripts/filter-stoptags.py +++ b/scripts/filter-stoptags.py @@ -25,7 +25,7 @@ import sys from khmer.thread_utils import ThreadedSequenceProcessor, verbose_loader from khmer.kfile import check_input_files, check_space -from khmer.khmer_args import info +from khmer.khmer_args import info, sanitize_epilog # @CTB K should be loaded from file... DEFAULT_K = 32 @@ -33,9 +33,9 @@ def get_parser(): epilog = """ - Load stoptags in from the given .stoptags file and use them to trim - or remove the sequences in . Trimmed sequences will be placed in - .stopfilt. + Load stoptags in from the given `.stoptags` file and use them to trim + or remove the sequences in ``. Trimmed sequences will be placed + in `.stopfilt`. 
""" parser = argparse.ArgumentParser( description="Trim sequences at stoptags.", @@ -55,7 +55,7 @@ def get_parser(): def main(): info('filter-stoptags.py', ['graph']) - args = get_parser().parse_args() + args = sanitize_epilog(get_parser()).parse_args() stoptags = args.stoptags_file infiles = args.input_filenames diff --git a/scripts/find-knots.py b/scripts/find-knots.py index 368336927c..d8a8eee12d 100755 --- a/scripts/find-knots.py +++ b/scripts/find-knots.py @@ -24,7 +24,7 @@ from khmer.kfile import check_input_files, check_space from khmer import khmer_args from khmer.khmer_args import (build_counting_args, info, add_loadgraph_args, - report_on_config) + report_on_config, sanitize_epilog) # counting hash parameters. DEFAULT_COUNTING_HT_SIZE = 3e6 # number of bytes @@ -49,8 +49,9 @@ def get_parser(): epilog = """ - Load an k-mer nodegraph/tagset pair created by load-graph, and a set - of pmap files created by partition-graph. Go through each pmap file, + Load an k-mer nodegraph/tagset pair created by + :program:`load-into-nodegraph.py`, and a set of pmap files created by + :program:`partition-graph.py`. Go through each pmap file, select the largest partition in each, and do the same kind of traversal as in :program:`make-initial-stoptags.py` from each of the waypoints in that partition; this should identify all of the HCKs in that partition. 
These diff --git a/scripts/interleave-reads.py b/scripts/interleave-reads.py index 574ec36ed6..873dab5d80 100755 --- a/scripts/interleave-reads.py +++ b/scripts/interleave-reads.py @@ -25,7 +25,7 @@ import argparse import khmer from khmer.kfile import check_input_files, check_space, is_block -from khmer.khmer_args import info +from khmer.khmer_args import info, sanitize_epilog from khmer.kfile import (add_output_compression_type, get_file_writer, describe_file_handle) from khmer.utils import (write_record_pair, check_is_left, check_is_right, @@ -45,11 +45,11 @@ def get_parser(): As a "bonus", this file ensures that if read names are not already formatted properly, they are reformatted consistently, such that - they look like the pre-1.8 Casava format (@name/1, @name/2). + they look like the pre-1.8 Casava format (`@name/1`, `@name/2`). Example:: - interleave-reads.py tests/test-data/paired.fq.1 \ + interleave-reads.py tests/test-data/paired.fq.1 \\ tests/test-data/paired.fq.2 -o paired.fq""" parser = argparse.ArgumentParser( description='Produce interleaved files from R1/R2 paired files', @@ -71,7 +71,7 @@ def get_parser(): def main(): info('interleave-reads.py') - args = get_parser().parse_args() + args = sanitize_epilog(get_parser()).parse_args() check_input_files(args.left, args.force) check_input_files(args.right, args.force) diff --git a/scripts/load-into-counting.py b/scripts/load-into-countgraph.py similarity index 90% rename from scripts/load-into-counting.py rename to scripts/load-into-countgraph.py index ddb63f0c88..bcf0db278b 100755 --- a/scripts/load-into-counting.py +++ b/scripts/load-into-countgraph.py @@ -8,7 +8,7 @@ """ Build a counting Bloom filter from the given sequences, save in . -% load-into-counting.py [ <...> ] +% load-into-countgraph.py [ <...> ] Use '-h' for parameter help. 
""" @@ -21,8 +21,9 @@ import textwrap import khmer from khmer import khmer_args -from khmer.khmer_args import build_counting_args, report_on_config, info,\ - add_threading_args, calculate_graphsize +from khmer.khmer_args import (build_counting_args, report_on_config, info, + add_threading_args, calculate_graphsize, + sanitize_epilog) from khmer.kfile import check_file_writable from khmer.kfile import check_input_files from khmer.kfile import check_space_for_graph @@ -38,14 +39,14 @@ def get_parser(): Example:: - load-into-counting.py -k 20 -x 5e7 out.ct data/100k-filtered.fa + load-into-countgraph.py -k 20 -x 5e7 out.ct data/100k-filtered.fa Multiple threads can be used to accelerate the process, if you have extra cores to spare. Example:: - load-into-counting.py -k 20 -x 5e7 -T 4 out.ct data/100k-filtered.fa + load-into-countgraph.py -k 20 -x 5e7 -T 4 out.ct data/100k-filtered.fa """ parser = build_counting_args("Build a k-mer countgraph from the given" @@ -63,7 +64,8 @@ def get_parser(): parser.add_argument('--summary-info', '-s', type=str, default=None, metavar="FORMAT", choices=[str('json'), str('tsv')], help="What format should the machine readable run " - "summary be in? (json or tsv, disabled by default)") + "summary be in? 
(`json` or `tsv`, disabled by" + " default)") parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') return parser @@ -71,9 +73,9 @@ def get_parser(): def main(): - info('load-into-counting.py', ['counting', 'SeqAn']) + info('load-into-countgraph.py', ['counting', 'SeqAn']) - args = get_parser().parse_args() + args = sanitize_epilog(get_parser()).parse_args() report_on_config(args) base = args.output_countgraph_filename diff --git a/scripts/load-graph.py b/scripts/load-into-nodegraph.py similarity index 91% rename from scripts/load-graph.py rename to scripts/load-into-nodegraph.py index f9ae753d59..974e888985 100755 --- a/scripts/load-graph.py +++ b/scripts/load-into-nodegraph.py @@ -9,7 +9,7 @@ """ Build a graph from the given sequences, save in . -% python scripts/load-graph.py [ <...> ] +% python scripts/load-into-nodegraph.py [ <...> ] Use '-h' for parameter help. """ diff --git a/scripts/make-initial-stoptags.py b/scripts/make-initial-stoptags.py index 5d15cd1ffd..8212780d80 100755 --- a/scripts/make-initial-stoptags.py +++ b/scripts/make-initial-stoptags.py @@ -17,7 +17,7 @@ import textwrap import khmer from khmer import khmer_args -from khmer.khmer_args import (build_counting_args, info) +from khmer.khmer_args import (build_counting_args, info, sanitize_epilog) from khmer.kfile import check_input_files DEFAULT_SUBSET_SIZE = int(1e4) @@ -42,12 +42,13 @@ def get_parser(): epilog = """ - Loads a k-mer nodegraph/tagset pair created by load-graph.py, and does - a small set of traversals from graph waypoints; on these traversals, looks - for k-mers that are repeatedly traversed in high-density regions of the - graph, i.e. are highly connected. Outputs those k-mers as an initial set of - stoptags, which can be fed into partition-graph.py, find-knots.py, and - filter-stoptags.py. 
+ Loads a k-mer nodegraph/tagset pair created by + :program:`load-into-nodegraph.py`, and + does a small set of traversals from graph waypoints; on these traversals, + looks for k-mers that are repeatedly traversed in high-density regions of + the graph, i.e. are highly connected. Outputs those k-mers as an initial + set of stoptags, which can be fed into :program:`partition-graph.py`, + :program:`find-knots.py`, and :program:`filter-stoptags.py`. The k-mer countgraph size options parameters are for a k-mer countgraph to keep track of repeatedly-traversed k-mers. The subset size option @@ -72,7 +73,7 @@ def get_parser(): def main(): info('make-initial-stoptags.py', ['graph']) - args = get_parser().parse_args() + args = sanitize_epilog(get_parser()).parse_args() graphbase = args.graphbase diff --git a/scripts/merge-partitions.py b/scripts/merge-partitions.py index 7d4c172862..8a33271710 100755 --- a/scripts/merge-partitions.py +++ b/scripts/merge-partitions.py @@ -23,15 +23,16 @@ import khmer import sys from khmer.kfile import check_input_files, check_space -from khmer.khmer_args import info +from khmer.khmer_args import info, sanitize_epilog DEFAULT_K = 32 def get_parser(): epilog = """ - Take the ${graphbase}.subset.#.pmap files and merge them all into a single - ${graphbase}.pmap.merged file for :program:`annotate-partitions.py` to use. + Take the `${graphbase}.subset.#.pmap` files and merge them all into a + single ${graphbase}.pmap.merged file for :program:`annotate-partitions.py` + to use. 
""" parser = argparse.ArgumentParser( description="Merge partition map '.pmap' files.", @@ -52,7 +53,7 @@ def get_parser(): def main(): info('merge-partitions.py', ['graph']) - args = get_parser().parse_args() + args = sanitize_epilog(get_parser()).parse_args() output_file = args.graphbase + '.pmap.merged' pmap_files = glob.glob(args.graphbase + '.subset.*.pmap') diff --git a/scripts/normalize-by-median.py b/scripts/normalize-by-median.py index 3a5a9082c9..3553772ae6 100755 --- a/scripts/normalize-by-median.py +++ b/scripts/normalize-by-median.py @@ -27,7 +27,8 @@ from khmer import khmer_args from contextlib import contextmanager from khmer.khmer_args import (build_counting_args, add_loadgraph_args, - report_on_config, info, calculate_graphsize) + report_on_config, info, calculate_graphsize, + sanitize_epilog) import argparse from khmer.kfile import (check_space, check_space_for_graph, check_valid_file_exists, add_output_compression_type, @@ -190,7 +191,7 @@ def get_parser(): reads from a fragment, and helps with retention of repeats.) Unpaired reads are treated individually. - If :option:`-p`/`--paired` is set, then proper pairing is required + If :option:`-p`/:option:`--paired` is set, then proper pairing is required and the script will exit on unpaired reads, although :option:`--unpaired-reads` can be used to supply a file of orphan reads to be read after the paired reads. @@ -203,7 +204,7 @@ def get_parser(): processed. :option:`-l`/:option:`--loadgraph` will load the specified k-mer countgraph before processing the specified files. Note that these graphs are are in the same format as those - produced by :program:`load-into-counting.py` and consumed by + produced by :program:`load-into-countgraph.py` and consumed by :program:`abundance-dist.py`. 
To append reads to an output file (rather than overwriting it), send output @@ -277,10 +278,7 @@ def get_parser(): def main():  # pylint: disable=too-many-branches,too-many-statements +    parser = sanitize_epilog(get_parser()) -    parser = get_parser() -    parser.epilog = parser.epilog.replace( -        '//', '/').replace(':option:', '').replace( -        ':program:', '').replace('::', ':') args = parser.parse_args() configure_logging(args.quiet) info('normalize-by-median.py', ['diginorm']) diff --git a/scripts/partition-graph.py b/scripts/partition-graph.py index b322b6fb9b..af3b2ee2f1 100755 --- a/scripts/partition-graph.py +++ b/scripts/partition-graph.py @@ -23,7 +23,7 @@ import argparse import khmer import sys -from khmer.khmer_args import (add_threading_args, info) +from khmer.khmer_args import (add_threading_args, info, sanitize_epilog) from khmer.kfile import check_input_files # stdlib queue module was renamed on Python 3 @@ -64,7 +64,7 @@ def worker(queue, basename, stop_big_traversals): def get_parser(): epilog = """ - The resulting partition maps are saved as '${basename}.subset.#.pmap' + The resulting partition maps are saved as `${basename}.subset.#.pmap` files. """ parser = argparse.ArgumentParser( @@ -92,7 +92,7 @@ def get_parser(): def main(): info('partition-graph.py', ['graph']) - args = get_parser().parse_args() + args = sanitize_epilog(get_parser()).parse_args() basename = args.basename filenames = [basename, basename + '.tagset'] diff --git a/scripts/readstats.py b/scripts/readstats.py index 570d8cfeba..5eabe94817 100755 --- a/scripts/readstats.py +++ b/scripts/readstats.py @@ -20,6 +20,8 @@ import argparse import textwrap +from khmer.khmer_args import sanitize_epilog + def get_parser(): descr = "Display summary statistics for one or more FASTA/FASTQ files." 
@@ -141,7 +143,7 @@ def analyze_file(filename): def main(): """Main function - run when executed as a script.""" - parser = get_parser() + parser = sanitize_epilog(get_parser()) args = parser.parse_args() total_bp = 0 diff --git a/scripts/sample-reads-randomly.py b/scripts/sample-reads-randomly.py index 8e2525d84d..4c998800c6 100755 --- a/scripts/sample-reads-randomly.py +++ b/scripts/sample-reads-randomly.py @@ -29,7 +29,7 @@ import khmer from khmer.kfile import (check_input_files, add_output_compression_type, get_file_writer) -from khmer.khmer_args import info +from khmer.khmer_args import info, sanitize_epilog from khmer.utils import write_record, broken_paired_reader DEFAULT_NUM_READS = int(1e5) @@ -47,7 +47,7 @@ def get_parser(): but take :option:`-S`/:option:`--samples` samples if specified. The output is placed in :option:`-o`/:option:`--output` - (for a single sample) or in .subset.0 to .subset.S-1 + (for a single sample) or in `.subset.0` to `.subset.S-1` (for more than one sample). 
This script uses the `reservoir sampling @@ -66,7 +66,8 @@ def get_parser(): default=DEFAULT_MAX_READS) parser.add_argument('-S', '--samples', type=int, dest='num_samples', default=1) - parser.add_argument('-R', '--random-seed', type=int, dest='random_seed') + parser.add_argument('-R', '--random-seed', type=int, dest='random_seed', + help='Provide a random seed for the generator') parser.add_argument('--force_single', default=False, action='store_true', help='Ignore read pair information if present') parser.add_argument('-o', '--output', dest='output_file', @@ -82,7 +83,7 @@ def get_parser(): def main(): info('sample-reads-randomly.py') - args = get_parser().parse_args() + args = sanitize_epilog(get_parser()).parse_args() for _ in args.filenames: check_input_files(_, args.force) diff --git a/scripts/split-paired-reads.py b/scripts/split-paired-reads.py index 994db33099..0c2e34b200 100755 --- a/scripts/split-paired-reads.py +++ b/scripts/split-paired-reads.py @@ -23,7 +23,7 @@ import textwrap import argparse import khmer -from khmer.khmer_args import info +from khmer.khmer_args import info, sanitize_epilog from khmer.utils import (write_record, broken_paired_reader, UnpairedReadsError) from khmer.kfile import (check_input_files, check_space, @@ -92,7 +92,7 @@ def get_parser(): def main(): info('split-paired-reads.py') - args = get_parser().parse_args() + args = sanitize_epilog(get_parser()).parse_args() infile = args.infile diff --git a/scripts/trim-low-abund.py b/scripts/trim-low-abund.py index d7d2c9a0cc..e42c04156c 100755 --- a/scripts/trim-low-abund.py +++ b/scripts/trim-low-abund.py @@ -28,7 +28,8 @@ from khmer import khmer_args from khmer.khmer_args import (build_counting_args, info, add_loadgraph_args, - report_on_config, calculate_graphsize) + report_on_config, calculate_graphsize, + sanitize_epilog) from khmer.utils import write_record, write_record_pair, broken_paired_reader from khmer.kfile import (check_space, check_space_for_graph, 
check_valid_file_exists, add_output_compression_type, @@ -61,7 +62,7 @@ def get_parser(): Note that the output reads will not necessarily be in the same order as the reads in the input files; if this is an important consideration, - use ``load-into-counting.py`` and ``filter-abund.py``. However, read + use ``load-into-countgraph.py`` and ``filter-abund.py``. However, read pairs will be kept together, in "broken-paired" format; you can use ``extract-paired-reads.py`` to extract read pairs and orphans. @@ -112,7 +113,7 @@ def get_parser(): def main(): info('trim-low-abund.py', ['streaming']) - parser = get_parser() + parser = sanitize_epilog(get_parser()) args = parser.parse_args() ### diff --git a/scripts/unique-kmers.py b/scripts/unique-kmers.py index 9ac283c998..ad871fc8e3 100755 --- a/scripts/unique-kmers.py +++ b/scripts/unique-kmers.py @@ -24,7 +24,7 @@ import khmer from khmer.khmer_args import (DEFAULT_K, info, ComboFormatter, - _VersionStdErrAction) + _VersionStdErrAction, sanitize_epilog) from khmer.utils import write_record from khmer.khmer_args import graphsize_args_report from khmer import __version__ @@ -109,7 +109,7 @@ def get_parser(): def main(): info('unique-kmers.py', ['SeqAn', 'hll']) - args = get_parser().parse_args() + args = sanitize_epilog(get_parser()).parse_args() total_hll = khmer.HLLCounter(args.error_rate, args.ksize) diff --git a/tests/test_countgraph.py b/tests/test_countgraph.py index c0aec9fa2e..70448b20c0 100644 --- a/tests/test_countgraph.py +++ b/tests/test_countgraph.py @@ -1328,7 +1328,7 @@ def test_find_all_tags_list_error(): def test_abund_dist_gz_bigcount(): infile = utils.get_temp_filename('test.fa') shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile) - script = 'load-into-counting.py' + script = 'load-into-countgraph.py' htfile = utils.get_temp_filename('test_ct') args = ['-x', str(1e7), '-N', str(2), '-k', str(2), htfile, infile] utils.runscript(script, args) # create a bigcount table @@ -1365,7 +1365,7 @@ 
def test_abund_dist_gz_bigcount(): def test_abund_dist_gz_bigcount_compressed_first(): infile = utils.get_temp_filename('test.fa') shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile) - script = 'load-into-counting.py' + script = 'load-into-countgraph.py' htfile = utils.get_temp_filename('test_ct.gz') args = ['-x', str(1e7), '-N', str(2), '-k', str(2), htfile, infile] utils.runscript(script, args) # create a bigcount table diff --git a/tests/test_normalize_by_median.py b/tests/test_normalize_by_median.py index 0e620c812b..2ce6dad2c5 100644 --- a/tests/test_normalize_by_median.py +++ b/tests/test_normalize_by_median.py @@ -37,7 +37,7 @@ def test_normalize_by_median_loadgraph_with_args(): tablefile = utils.get_temp_filename("table") in_dir = os.path.dirname(tablefile) - script = "load-into-counting.py" + script = "load-into-countgraph.py" args = [tablefile, infile] (status, out, err) = utils.runscript(script, args) diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 9790f600ad..1e6d646255 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -42,7 +42,7 @@ def test_check_space(): def test_load_into_counting(): - script = 'load-into-counting.py' + script = 'load-into-countgraph.py' args = ['-x', '1e3', '-N', '2', '-k', '20'] outfile = utils.get_temp_filename('out.ct') @@ -56,7 +56,7 @@ def test_load_into_counting(): def test_load_into_counting_autoargs_0(): - script = 'load-into-counting.py' + script = 'load-into-countgraph.py' outfile = utils.get_temp_filename('table') infile = utils.get_test_data('test-abund-read-2.fa') @@ -72,7 +72,7 @@ def test_load_into_counting_autoargs_0(): def test_load_into_counting_autoargs_1(): - script = 'load-into-counting.py' + script = 'load-into-countgraph.py' outfile = utils.get_temp_filename('table') infile = utils.get_test_data('test-abund-read-2.fa') @@ -86,7 +86,7 @@ def test_load_into_counting_autoargs_1(): def test_load_into_count_graphsize_warning(): - script = 'load-into-counting.py' + 
script = 'load-into-countgraph.py' args = ['-k', '20'] outfile = utils.get_temp_filename('out.ct') @@ -100,7 +100,7 @@ def test_load_into_count_graphsize_warning(): def test_load_into_counting_max_memory_usage_parameter(): - script = 'load-into-counting.py' + script = 'load-into-countgraph.py' args = ['-M', '2e3', '-k', '20'] outfile = utils.get_temp_filename('out.ct') @@ -117,7 +117,7 @@ def test_load_into_counting_max_memory_usage_parameter(): def test_load_into_counting_abundance_dist_nobig(): - script = 'load-into-counting.py' + script = 'load-into-countgraph.py' args = ['-x', '1e3', '-N', '2', '-k', '20', '-b'] outfile = utils.get_temp_filename('out.ct') @@ -143,7 +143,7 @@ def test_load_into_counting_abundance_dist_squashing(): infile = utils.get_test_data('test-abund-read-2.fa') args = [graphfile, infile] - script = 'load-into-counting.py' + script = 'load-into-countgraph.py' utils.runscript(script, args) histogram = utils.get_temp_filename('histogram') @@ -176,7 +176,7 @@ def test_load_into_counting_abundance_dist_squashing(): def test_load_into_counting_nonwritable(): - script = 'load-into-counting.py' + script = 'load-into-countgraph.py' args = ['-x', '1e3', '-N', '2', '-k', '20'] outfile = utils.get_temp_filename('test-nonwritable') @@ -195,7 +195,7 @@ def test_load_into_counting_nonwritable(): @attr('huge') def test_load_into_counting_toobig(): - script = 'load-into-counting.py' + script = 'load-into-countgraph.py' args = ['-x', '1e12', '-N', '2', '-k', '20', '--force'] outfile = utils.get_temp_filename('out.kh') @@ -209,7 +209,7 @@ def test_load_into_counting_toobig(): def test_load_into_counting_fail(): - script = 'load-into-counting.py' + script = 'load-into-countgraph.py' args = ['-x', '1e2', '-N', '2', '-k', '20'] # use small HT outfile = utils.get_temp_filename('out.ct') @@ -224,7 +224,7 @@ def test_load_into_counting_fail(): def test_load_into_counting_multifile(): - script = 'load-into-counting.py' + script = 'load-into-countgraph.py' args = 
['-x', '1e7', '-N', '2', '-k', '20'] outfile = utils.get_temp_filename('out.kh') @@ -239,7 +239,7 @@ def test_load_into_counting_multifile(): def test_load_into_counting_tsv(): - script = 'load-into-counting.py' + script = 'load-into-countgraph.py' args = ['-x', '1e7', '-N', '2', '-k', '20', '-s', 'tsv'] outfile = utils.get_temp_filename('out.ct') @@ -262,7 +262,7 @@ def test_load_into_counting_tsv(): def test_load_into_counting_json(): - script = 'load-into-counting.py' + script = 'load-into-countgraph.py' args = ['-x', '1e7', '-N', '2', '-k', '20', '-s', 'json'] outfile = utils.get_temp_filename('out.ct') @@ -293,7 +293,7 @@ def test_load_into_counting_json(): def test_load_into_counting_bad_summary_fmt(): - script = 'load-into-counting.py' + script = 'load-into-countgraph.py' args = ['-x', '1e7', '-N', '2', '-k', '20', '-s', 'badfmt'] outfile = utils.get_temp_filename('out.ct') @@ -307,7 +307,7 @@ def test_load_into_counting_bad_summary_fmt(): def _make_counting(infilename, SIZE=1e7, N=2, K=20, BIGCOUNT=True): - script = 'load-into-counting.py' + script = 'load-into-countgraph.py' args = ['-x', str(SIZE), '-N', str(N), '-k', str(K)] if not BIGCOUNT: @@ -766,7 +766,7 @@ def test_count_median_fq_csv_stdout(): def test_load_graph(): - script = 'load-graph.py' + script = 'load-into-nodegraph.py' args = ['-x', '1e7', '-N', '2', '-k', '20'] outfile = utils.get_temp_filename('out') @@ -792,7 +792,7 @@ def test_load_graph(): # check to make sure we get the expected result for this data set # upon partitioning (all in one partition). This is kind of a - # roundabout way of checking that load-graph worked :) + # roundabout way of checking that load-into-nodegraph worked :) subset = ht.do_subset_partition(0, 0) x = ht.subset_count_partitions(subset) assert x == (1, 0), x @@ -854,7 +854,7 @@ def test_oxli_build_graph_unique_kmers_arg(): # check to make sure we get the expected result for this data set # upon partitioning (all in one partition). 
This is kind of a - # roundabout way of checking that load-graph worked :) + # roundabout way of checking that load-into-nodegraph worked :) subset = ht.do_subset_partition(0, 0) x = ht.subset_count_partitions(subset) assert x == (1, 0), x @@ -868,7 +868,7 @@ def test_oxli_nocommand(): def test_load_graph_no_tags(): - script = 'load-graph.py' + script = 'load-into-nodegraph.py' args = ['-x', '1e7', '-N', '2', '-k', '20', '-n'] outfile = utils.get_temp_filename('out') @@ -914,7 +914,7 @@ def test_oxli_build_graph_no_tags(): def test_load_graph_fail(): - script = 'load-graph.py' + script = 'load-into-nodegraph.py' args = ['-x', '1e3', '-N', '2', '-k', '20'] # use small HT outfile = utils.get_temp_filename('out') @@ -942,7 +942,7 @@ def test_oxli_build_graph_fail(): def test_load_graph_write_fp(): - script = 'load-graph.py' + script = 'load-into-nodegraph.py' args = ['-x', '1e5', '-N', '2', '-k', '20'] # use small HT outfile = utils.get_temp_filename('out') @@ -987,7 +987,7 @@ def test_oxli_build_graph_write_fp(): def test_load_graph_multithread(): - script = 'load-graph.py' + script = 'load-into-nodegraph.py' outfile = utils.get_temp_filename('test') infile = utils.get_test_data('test-reads.fa') @@ -1009,7 +1009,7 @@ def test_oxli_build_graph_multithread(): def test_load_graph_max_memory_usage_parameter(): - script = 'load-graph.py' + script = 'load-into-nodegraph.py' args = ['-M', '2e7', '-k', '20', '-n'] outfile = utils.get_temp_filename('out') @@ -1036,7 +1036,7 @@ def _make_graph(infilename, min_hashsize=1e7, n_hashes=2, ksize=20, do_partition=False, annotate_partitions=False, stop_big_traverse=False): - script = 'load-graph.py' + script = 'load-into-nodegraph.py' args = ['-x', str(min_hashsize), '-N', str(n_hashes), '-k', str(ksize)] outfile = utils.get_temp_filename('out') @@ -1083,7 +1083,7 @@ def _DEBUG_make_graph(infilename, min_hashsize=1e7, n_hashes=2, ksize=20, do_partition=False, annotate_partitions=False, stop_big_traverse=False): - script = 
'load-graph.py' + script = 'load-into-nodegraph.py' args = ['-x', str(min_hashsize), '-N', str(n_hashes), '-k', str(ksize)] outfile = utils.get_temp_filename('out') @@ -1941,19 +1941,19 @@ def test_interleave_reads_2_fa(): def test_make_initial_stoptags(): - # gen input files using load-graph.py -t + # gen input files using load-into-nodegraph.py -t # should keep test_data directory size down # or something like that - # this assumes (obv.) load-graph works properly + # this assumes (obv.) load-into-nodegraph works properly bzinfile = utils.get_temp_filename('test-reads.fq.bz2') shutil.copyfile(utils.get_test_data('test-reads.fq.bz2'), bzinfile) in_dir = os.path.dirname(bzinfile) - genscript = 'load-graph.py' + genscript = 'load-into-nodegraph.py' genscriptargs = ['test-reads', 'test-reads.fq.bz2'] utils.runscript(genscript, genscriptargs, in_dir) - # test input file gen'd by load-graphs + # test input file gen'd by load-into-nodegraphs infile = utils.get_temp_filename('test-reads.pt') infile2 = utils.get_temp_filename('test-reads.tagset', in_dir) @@ -1981,7 +1981,7 @@ def test_make_initial_stoptags_load_stoptags(): shutil.copyfile(utils.get_test_data('test-reads.fq.bz2'), bzinfile) in_dir = os.path.dirname(bzinfile) - genscript = 'load-graph.py' + genscript = 'load-into-nodegraph.py' genscriptargs = ['test-reads', 'test-reads.fq.bz2'] utils.runscript(genscript, genscriptargs, in_dir) @@ -3059,7 +3059,7 @@ def _execute_load_graph_streaming(filename): args = '-x 1e7 -N 2 -k 20 out -' - cmd = 'cat {infile} | {scripts}/load-graph.py {args}'.format( + cmd = 'cat {infile} | {scripts}/load-into-nodegraph.py {args}'.format( infile=infile, scripts=scripts, args=args) (status, out, err) = utils.run_shell_cmd(cmd, in_directory=in_dir) @@ -3082,7 +3082,7 @@ def _execute_load_graph_streaming(filename): # check to make sure we get the expected result for this data set # upon partitioning (all in one partition). 
This is kind of a - # roundabout way of checking that load-graph worked :) + # roundabout way of checking that load-into-nodegraph worked :) subset = ht.do_subset_partition(0, 0) x = ht.subset_count_partitions(subset) assert x == (1, 0), x @@ -3466,7 +3466,7 @@ def test_trim_low_abund_trimtest_after_load(): shutil.copyfile(utils.get_test_data('test-abund-read-2.paired.fq'), infile) args = ["-k", "17", "-x", "1e7", "-N", "2", saved_table, infile] - utils.runscript('load-into-counting.py', args, in_dir) + utils.runscript('load-into-countgraph.py', args, in_dir) args = ["-Z", "2", "-C", "2", "-V", '--loadgraph', saved_table, infile] utils.runscript('trim-low-abund.py', args, in_dir) diff --git a/tests/test_streaming_io.py b/tests/test_streaming_io.py index f942edbb7c..7ca9ac2e26 100644 --- a/tests/test_streaming_io.py +++ b/tests/test_streaming_io.py @@ -309,7 +309,7 @@ def test_load_into_counting_1(): cmd = """ cat {in1} | - {scripts}/load-into-counting.py -x 1e3 -N 2 -k 20 {out1} - \ + {scripts}/load-into-countgraph.py -x 1e3 -N 2 -k 20 {out1} - \ 2> /dev/null """ @@ -327,7 +327,7 @@ def test_load_graph_1(): cmd = """ cat {in1} | - {scripts}/load-graph.py -x 1e3 -N 2 -k 20 {out1} - \ + {scripts}/load-into-nodegraph.py -x 1e3 -N 2 -k 20 {out1} - \ 2> /dev/null """