From 4f1be8ce75335901570ac3fd60c00d1fa79886f7 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Mon, 20 Apr 2015 22:54:32 -0400 Subject: [PATCH 01/20] Squash branch test/py3 --- Makefile | 5 +- khmer/__init__.py | 14 +- khmer/{_khmermodule.cc => _khmer.cc} | 245 +++++++++++-------- khmer/_version.py | 1 + khmer/kfile.py | 64 ++--- khmer/khmer_args.py | 23 +- khmer/thread_utils.py | 56 ++--- khmer/utils.py | 23 +- lib/get_version.py | 3 +- sandbox/abundance-hist-by-position.py | 10 +- sandbox/assembly-diff-2.py | 7 +- sandbox/assembly-diff.py | 6 +- sandbox/assemstats3.py | 22 +- sandbox/bloom-count-intersection.py | 15 +- sandbox/bloom-count.py | 7 +- sandbox/build-sparse-graph.py | 7 +- sandbox/calc-best-assembly.py | 21 +- sandbox/calc-error-profile.py | 34 +-- sandbox/calc-median-distribution.py | 6 +- sandbox/collect-reads.py | 30 +-- sandbox/collect-variants.py | 41 ++-- sandbox/correct-errors.py | 33 +-- sandbox/extract-single-partition.py | 3 +- sandbox/fasta-to-abundance-hist.py | 5 +- sandbox/filter-below-abund.py | 13 +- sandbox/filter-median-and-pct.py | 15 +- sandbox/filter-median.py | 11 +- sandbox/find-high-abund-kmers.py | 39 +-- sandbox/graph-size.py | 23 +- sandbox/hi-lo-abundance-by-position.py | 7 +- sandbox/make-coverage.py | 15 +- sandbox/multi-rename.py | 3 +- sandbox/normalize-by-median-pct.py | 70 +++--- sandbox/print-tagset.py | 3 +- sandbox/renumber-partitions.py | 5 +- sandbox/saturate-by-median.py | 47 ++-- sandbox/shuffle-reverse-rotary.py | 7 +- sandbox/slice-reads-by-coverage.py | 13 +- sandbox/split-fasta.py | 5 +- sandbox/split-sequences-by-length.py | 5 +- sandbox/stoptag-abundance-hist.py | 5 +- sandbox/stoptags-by-position.py | 3 +- sandbox/strip-partition.py | 3 +- sandbox/subset-report.py | 15 +- sandbox/sweep-files.py | 45 ++-- sandbox/sweep-out-reads-with-contigs.py | 5 +- sandbox/sweep-reads.py | 104 ++++---- sandbox/sweep-reads2.py | 31 +-- sandbox/sweep-reads3.py | 31 +-- sandbox/unique-kmers.py | 16 +- 
sandbox/write-trimmomatic.py | 5 +- scripts/abundance-dist-single.py | 53 ++-- scripts/annotate-partitions.py | 11 +- scripts/count-median.py | 8 +- scripts/count-overlap.py | 7 +- scripts/do-partition.py | 78 +++--- scripts/extract-long-sequences.py | 3 +- scripts/extract-paired-reads.py | 19 +- scripts/extract-partitions.py | 68 +++--- scripts/fastq-to-fasta.py | 13 +- scripts/filter-abund-single.py | 25 +- scripts/filter-abund.py | 10 +- scripts/filter-stoptags.py | 7 +- scripts/find-knots.py | 40 +-- scripts/interleave-reads.py | 37 +-- scripts/load-graph.py | 2 + scripts/load-into-counting.py | 36 +-- scripts/make-initial-stoptags.py | 15 +- scripts/merge-partitions.py | 11 +- scripts/normalize-by-median.py | 6 +- scripts/partition-graph.py | 55 +++-- scripts/readstats.py | 17 +- scripts/sample-reads-randomly.py | 51 ++-- scripts/split-paired-reads.py | 22 +- scripts/trim-low-abund.py | 71 +++--- setup.py | 11 +- tests/khmer_tst_utils.py | 28 ++- tests/test_counting_hash.py | 113 ++++----- tests/test_counting_single.py | 20 +- tests/test_filter.py | 10 +- tests/test_functions.py | 12 +- tests/test_graph.py | 14 +- tests/test_hashbits.py | 48 ++-- tests/test_hashbits_obj.py | 28 ++- tests/test_hll.py | 6 +- tests/test_labelhash.py | 46 ++-- tests/test_lump.py | 5 +- tests/test_read_aligner.py | 6 +- tests/test_read_parsers.py | 32 +-- tests/test_sandbox_scripts.py | 41 ++-- tests/test_script_arguments.py | 28 ++- tests/test_scripts.py | 285 +++++++++++++--------- tests/test_subset_graph.py | 44 ++-- tests/test_threaded_sequence_processor.py | 8 +- tests/test_version.py | 15 +- versioneer.py | 1 + 96 files changed, 1459 insertions(+), 1232 deletions(-) rename khmer/{_khmermodule.cc => _khmer.cc} (96%) diff --git a/Makefile b/Makefile index 3a23d18580..e70fcb5c3b 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ CPPSOURCES=$(wildcard lib/*.cc lib/*.hh khmer/_khmermodule.cc) PYSOURCES=$(wildcard khmer/*.py scripts/*.py) SOURCES=$(PYSOURCES) $(CPPSOURCES) 
setup.py DEVPKGS=sphinxcontrib-autoprogram pep8==1.5.7 diff_cover \ -autopep8 pylint coverage gcovr nose screed pep257 +autopep8 pylint coverage gcovr nose pep257 future GCOVRURL=git+https://github.com/nschum/gcovr.git@never-executed-branches VERSION=$(shell git describe --tags --dirty | sed s/v//) @@ -36,7 +36,8 @@ help: Makefile install-dep: install-dependencies install-dependencies: - pip2 install --upgrade $(DEVPKGS) || pip install --upgrade $(DEVPKGS) + pip install --upgrade $(DEVPKGS) + pip install git+https://github.com/ged-lab/screed.git@py3 ## sharedobj : build khmer shared object file sharedobj: khmer/_khmermodule.so diff --git a/khmer/__init__.py b/khmer/__init__.py index d1456439a2..63587326d0 100644 --- a/khmer/__init__.py +++ b/khmer/__init__.py @@ -6,6 +6,8 @@ # """This is khmer; please see http://khmer.readthedocs.org/.""" +from __future__ import print_function + from khmer._khmer import CountingHash from khmer._khmer import LabelHash as _LabelHash from khmer._khmer import Hashbits as _Hashbits @@ -179,12 +181,12 @@ def calc_expected_collisions(hashtable, force=False, max_false_pos=.2): fp_all = fp_one ** n_ht if fp_all > max_false_pos: - print >>sys.stderr, "**" - print >>sys.stderr, "** ERROR: the graph structure is too small for " - print >>sys.stderr, "this data set. Increase k-mer presence table " - print >>sys.stderr, "size/num of tables." - print >>sys.stderr, "** Do not use these results!!" - print >>sys.stderr, "**" + print("**", file=sys.stderr) + print("** ERROR: the graph structure is too small for ", file=sys.stderr) + print("this data set. 
Increase k-mer presence table ", file=sys.stderr) + print("size/num of tables.", file=sys.stderr) + print("** Do not use these results!!", file=sys.stderr) + print("**", file=sys.stderr) if not force: sys.exit(1) diff --git a/khmer/_khmermodule.cc b/khmer/_khmer.cc similarity index 96% rename from khmer/_khmermodule.cc rename to khmer/_khmer.cc index 4464c21fae..61d556ed40 100644 --- a/khmer/_khmermodule.cc +++ b/khmer/_khmer.cc @@ -43,6 +43,27 @@ using namespace read_parsers; #include "bytesobject.h" +// +// Python 2/3 compatibility: Module initialization +// http://python3porting.com/cextensions.html#module-initialization +// + +#if PY_MAJOR_VERSION >= 3 + #define MOD_ERROR_VAL NULL + #define MOD_SUCCESS_VAL(val) val + #define MOD_INIT(name) PyMODINIT_FUNC PyInit_##name(void) + #define MOD_DEF(ob, name, doc, methods) \ + static struct PyModuleDef moduledef = { \ + PyModuleDef_HEAD_INIT, name, doc, -1, methods, }; \ + ob = PyModule_Create(&moduledef); +#else + #define MOD_ERROR_VAL + #define MOD_SUCCESS_VAL(val) + #define MOD_INIT(name) void init##name(void) + #define MOD_DEF(ob, name, doc, methods) \ + ob = Py_InitModule3(name, methods, doc); +#endif + using namespace khmer; // @@ -50,7 +71,7 @@ using namespace khmer; // extern "C" { - void init_khmer(); + MOD_INIT(_khmer); } // Configure module logging. 
@@ -104,14 +125,6 @@ class _khmer_exception }; }; -class _khmer_signal : public _khmer_exception -{ -public: - _khmer_signal(std::string message) : _khmer_exception(message) { }; -}; - -typedef pre_partition_info _pre_partition_info; - /***********************************************************************/ // @@ -145,7 +158,7 @@ static PyObject * Read_get_name(khmer_Read_Object * obj, void * closure ) { - return PyBytes_FromString(obj->read->name.c_str()) ; + return PyUnicode_FromString(obj->read->name.c_str()) ; } @@ -153,7 +166,7 @@ static PyObject * Read_get_sequence(khmer_Read_Object * obj, void * closure) { - return PyBytes_FromString(obj->read->sequence.c_str()) ; + return PyUnicode_FromString(obj->read->sequence.c_str()) ; } @@ -161,7 +174,7 @@ static PyObject * Read_get_quality(khmer_Read_Object * obj, void * closure) { - return PyBytes_FromString(obj->read->quality.c_str()) ; + return PyUnicode_FromString(obj->read->quality.c_str()) ; } @@ -169,7 +182,7 @@ static PyObject * Read_get_annotations(khmer_Read_Object * obj, void * closure) { - return PyBytes_FromString(obj->read->annotations.c_str()) ; + return PyUnicode_FromString(obj->read->annotations.c_str()) ; } @@ -335,7 +348,7 @@ _ReadParser_iternext( PyObject * self ) exc = e.what(); } catch (InvalidRead &e) { exc = e.what(); - } + } } Py_END_ALLOW_THREADS @@ -616,6 +629,44 @@ _PyObject_to_khmer_ReadParser( PyObject * py_object ) return ((python:: khmer_ReadParser_Object *)py_object)->parser; } +typedef struct { + PyObject_HEAD + pre_partition_info * PrePartitionInfo; +} khmer_PrePartitionInfo_Object; + +static +void +khmer_PrePartitionInfo_dealloc(khmer_PrePartitionInfo_Object * obj) +{ + delete obj->PrePartitionInfo; + obj->PrePartitionInfo = NULL; + Py_TYPE(obj)->tp_free((PyObject*)obj); +} + +static PyTypeObject khmer_PrePartitionInfo_Type = { + PyVarObject_HEAD_INIT(NULL, 0) /* init & ob_size */ + "_khmer.PrePartitionInfo", /* tp_name */ + sizeof(khmer_PrePartitionInfo_Object),/* tp_basicsize */ 
+ 0, /* tp_itemsize */ + (destructor)khmer_PrePartitionInfo_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + "Stores a k-kmer and a set of tagged seen k-mers.", /* tp_doc */ +}; + /***********************************************************************/ @@ -625,12 +676,6 @@ void free_subset_partition_info(void * p) delete subset_p; } -void free_pre_partition_info(void * p) -{ - _pre_partition_info * ppi = (_pre_partition_info *) p; - delete ppi; -} - typedef struct { PyObject_HEAD Hashtable * hashtable; @@ -845,11 +890,9 @@ hashtable_consume_fasta_with_reads_parser(khmer_KHashtable_Object * me, read_parsers:: IParser * rparser = _PyObject_to_khmer_ReadParser( rparser_obj ); - char const * exc = ""; // call the C++ function, and trap signals => Python unsigned long long n_consumed = 0; unsigned int total_reads = 0; - bool exc_raised = false; Py_BEGIN_ALLOW_THREADS try { hashtable->consume_fasta(rparser, total_reads, n_consumed); @@ -858,10 +901,6 @@ hashtable_consume_fasta_with_reads_parser(khmer_KHashtable_Object * me, exc_raised = true; } Py_END_ALLOW_THREADS - if (exc_raised) { - PyErr_SetString(PyExc_IOError, exc); - return NULL; - } return Py_BuildValue("IK", total_reads, n_consumed); } @@ -1358,7 +1397,7 @@ hashtable_trim_on_stoptags(khmer_KHashtable_Object * me, PyObject * args) Py_END_ALLOW_THREADS; - PyObject * trim_seq = PyBytes_FromStringAndSize(seq, trim_at); + PyObject * trim_seq = PyUnicode_FromStringAndSize(seq, trim_at); if (trim_seq == NULL) { return NULL; } @@ -1435,7 +1474,11 @@ hashtable_do_subset_partition(khmer_KHashtable_Object * me, PyObject * args) return PyErr_NoMemory(); } - return PyCObject_FromVoidPtr(subset_p, 
free_subset_partition_info); + khmer_KSubsetPartition_Object * subset_obj = (khmer_KSubsetPartition_Object *)\ + PyObject_New(khmer_KSubsetPartition_Object, &khmer_KSubsetPartition_Type); + subset_obj->subset = subset_p; + + return (PyObject *)subset_obj; } static @@ -1460,18 +1503,12 @@ hashtable_merge_subset(khmer_KHashtable_Object * me, PyObject * args) { Hashtable * hashtable = me->hashtable; - PyObject * subset_obj; - if (!PyArg_ParseTuple(args, "O", &subset_obj)) { - return NULL; - } - - if (!PyCObject_Check(subset_obj)) { - PyErr_SetString( PyExc_ValueError, "invalid subset"); + khmer_KSubsetPartition_Object * subset_obj; + if (!PyArg_ParseTuple(args, "O!", &khmer_KSubsetPartition_Type, &subset_obj)) { return NULL; } - SubsetPartition * subset_p; - subset_p = (SubsetPartition *) PyCObject_AsVoidPtr(subset_obj); + subset_p = subset_obj->subset; hashtable->partition->merge(subset_p); @@ -1642,7 +1679,12 @@ hashtable_find_all_tags(khmer_KHashtable_Object * me, PyObject * args) Py_END_ALLOW_THREADS - return PyCObject_FromVoidPtr(ppi, free_pre_partition_info); + khmer_PrePartitionInfo_Object * ppi_obj = (khmer_PrePartitionInfo_Object *) \ + PyObject_New(khmer_PrePartitionInfo_Object, &khmer_PrePartitionInfo_Type); + + ppi_obj->PrePartitionInfo = ppi; + + return (PyObject*)ppi_obj; } static @@ -1661,8 +1703,13 @@ hashtable_assign_partition_id(khmer_KHashtable_Object * me, PyObject * args) return NULL; } - _pre_partition_info * ppi; - ppi = (_pre_partition_info *) PyCObject_AsVoidPtr(ppi_obj); + khmer_PrePartitionInfo_Object * ppi_obj; + if (!PyArg_ParseTuple(args, "O!", &khmer_PrePartitionInfo_Type, &ppi_obj)) { + return NULL; + } + + pre_partition_info * ppi; + ppi = ppi_obj->PrePartitionInfo; PartitionID p; p = hashtable->partition->assign_partition_id(ppi->kmer, @@ -1927,17 +1974,14 @@ static PyObject * hashtable_subset_count_partitions(khmer_KHashtable_Object * me, PyObject * args) { - PyObject * subset_obj = NULL; + khmer_KSubsetPartition_Object * subset_obj 
= NULL; - if (!PyArg_ParseTuple(args, "O", &subset_obj)) { + if (!PyArg_ParseTuple(args, "O!", &khmer_KSubsetPartition_Type, &subset_obj)) { return NULL; } - SubsetPartition * subset_p; - subset_p = (SubsetPartition *) PyCObject_AsVoidPtr(subset_obj); - size_t n_partitions = 0, n_unassigned = 0; - subset_p->count_partitions(n_partitions, n_unassigned); + subset_obj->subset->count_partitions(n_partitions, n_unassigned); return Py_BuildValue("nn", (Py_ssize_t) n_partitions, (Py_ssize_t) n_unassigned); @@ -1948,13 +1992,14 @@ PyObject * hashtable_subset_partition_size_distribution(khmer_KHashtable_Object * me, PyObject * args) { - PyObject * subset_obj = NULL; - if (!PyArg_ParseTuple(args, "O", &subset_obj)) { + khmer_KSubsetPartition_Object * subset_obj = NULL; + if (!PyArg_ParseTuple(args, "O!", &khmer_KSubsetPartition_Type, &subset_obj)) { return NULL; } SubsetPartition * subset_p; - subset_p = (SubsetPartition *) PyCObject_AsVoidPtr(subset_obj); + subset_p = subset_obj->subset; + PyObject * subset_obj = NULL; PartitionCountDistribution d; @@ -2106,7 +2151,10 @@ hashtable_load_subset_partitionmap(khmer_KHashtable_Object * me, delete subset_p; return NULL; } else { - return PyCObject_FromVoidPtr(subset_p, free_subset_partition_info); + khmer_KSubsetPartition_Object * subset_obj = (khmer_KSubsetPartition_Object *)\ + PyObject_New(khmer_KSubsetPartition_Object, &khmer_KSubsetPartition_Type); + subset_obj->subset = subset_p; + return (PyObject*) subset_obj; } } @@ -2146,15 +2194,13 @@ PyObject * hashtable__validate_subset_partitionmap(khmer_KHashtable_Object * me, PyObject * args) { - PyObject * subset_obj = NULL; + khmer_KSubsetPartition_Object * subset_obj = NULL; - if (!PyArg_ParseTuple(args, "O", &subset_obj)) { + if (!PyArg_ParseTuple(args, "O!", &khmer_KSubsetPartition_Type, &subset_obj)) { return NULL; } - SubsetPartition * subset_p; - subset_p = (SubsetPartition *) PyCObject_AsVoidPtr(subset_obj); - subset_p->_validate_pmap(); + 
subset_obj->subset->_validate_pmap(); Py_RETURN_NONE; } @@ -2172,8 +2218,6 @@ hashtable_set_partition_id(khmer_KHashtable_Object * me, PyObject * args) return NULL; } - hashtable->partition->set_partition_id(kmer, p); - Py_RETURN_NONE; } @@ -2314,7 +2358,7 @@ hashtable_extract_unique_paths(khmer_KHashtable_Object * me, PyObject * args) } for (unsigned int i = 0; i < results.size(); i++) { - PyList_SET_ITEM(x, i, PyBytes_FromString(results[i].c_str())); + PyList_SET_ITEM(x, i, PyUnicode_FromString(results[i].c_str())); } return x; @@ -3725,8 +3769,6 @@ labelhash_consume_fasta_and_tag_with_labels(khmer_KLabelHash_Object * me, try { hb->consume_fasta_and_tag_with_labels(filename, total_reads, n_consumed); - } catch (_khmer_signal &e) { - exc = e.get_message().c_str(); } catch (khmer_file_exception &e) { exc = e.what(); } @@ -3761,10 +3803,6 @@ labelhash_consume_partitioned_fasta_and_tag_with_labels( try { labelhash->consume_partitioned_fasta_and_tag_with_labels(filename, total_reads, n_consumed); - } catch (_khmer_signal &e) { - PyErr_SetString(PyExc_IOError, - "error parsing in consume_partitioned_fasta_and_tag_with_labels"); - return NULL; } catch (khmer_file_exception &e) { PyErr_SetString(PyExc_IOError, e.what()); return NULL; @@ -3785,12 +3823,7 @@ labelhash_consume_sequence_and_tag_with_labels(khmer_KLabelHash_Object * me, } unsigned long long n_consumed = 0; Label * the_label = hb->check_and_allocate_label(c); - - try { - hb->consume_sequence_and_tag_with_labels(seq, n_consumed, *the_label); - } catch (_khmer_signal &e) { - return NULL; - } + hb->consume_sequence_and_tag_with_labels(seq, n_consumed, *the_label); return Py_BuildValue("K", n_consumed); } @@ -3835,23 +3868,14 @@ labelhash_sweep_label_neighborhood(khmer_KLabelHash_Object * me, //std::pair ret; LabelPtrSet found_labels; - bool exc_raised = false; //unsigned int num_traversed = 0; //Py_BEGIN_ALLOW_THREADS - try { - hb->sweep_label_neighborhood(seq, found_labels, range, break_on_stop_tags, + 
hb->sweep_label_neighborhood(seq, found_labels, range, break_on_stop_tags, stop_big_traversals); - } catch (_khmer_signal &e) { - exc_raised = true; - } //Py_END_ALLOW_THREADS //printf("...%u kmers traversed\n", num_traversed); - if (exc_raised) { - return NULL; - } - PyObject * x = PyList_New(found_labels.size()); LabelPtrSet::const_iterator si; unsigned long long i = 0; @@ -4413,9 +4437,6 @@ static PyObject * hllcounter_consume_fasta(khmer_KHLLCounter_Object * me, unsigned int total_reads = 0; try { me->hllcounter->consume_fasta(filename, total_reads, n_consumed); - } catch (_khmer_signal &e) { - PyErr_SetString(PyExc_IOError, e.get_message().c_str()); - return NULL; } catch (khmer_file_exception &e) { PyErr_SetString(PyExc_IOError, e.what()); return NULL; @@ -4718,7 +4739,7 @@ static PyObject * reverse_hash(PyObject * self, PyObject * args) return NULL; } - return PyBytes_FromString(_revhash(val, ksize).c_str()); + return PyUnicode_FromString(_revhash(val, ksize).c_str()); } static PyObject * murmur3_forward_hash(PyObject * self, PyObject * args) @@ -4755,7 +4776,7 @@ get_version_cpp( PyObject * self, PyObject * args ) #define xstr(s) str(s) #define str(s) #s std::string dVersion = xstr(VERSION); - return PyBytes_FromString(dVersion.c_str()); + return PyUnicode_FromString(dVersion.c_str()); } @@ -4813,8 +4834,7 @@ static PyMethodDef KhmerMethods[] = { { NULL, NULL, 0, NULL } // sentinel }; -PyMODINIT_FUNC -init_khmer(void) +MOD_INIT(_khmer) { using namespace python; @@ -4824,84 +4844,93 @@ init_khmer(void) khmer_KCountingHash_Type.tp_base = &khmer_KHashtable_Type; if (PyType_Ready(&khmer_KCountingHash_Type) < 0) { - return; + return MOD_ERROR_VAL; + } + + if (PyType_Ready(&khmer_PrePartitionInfo_Type) < 0) { + return MOD_ERROR_VAL; } khmer_KSubsetPartition_Type.tp_methods = khmer_subset_methods; if (PyType_Ready(&khmer_KSubsetPartition_Type) < 0) { - return; + return MOD_ERROR_VAL; } khmer_KHashbits_Type.tp_base = &khmer_KHashtable_Type; + 
khmer_KHashbits_Type.tp_new = khmer_hashbits_new; khmer_KHashbits_Type.tp_methods = khmer_hashbits_methods; if (PyType_Ready(&khmer_KHashbits_Type) < 0) { - return; + return MOD_ERROR_VAL; } - // add LabelHash + khmer_KLabelHash_Type.tp_base = &khmer_KHashbits_Type; khmer_KLabelHash_Type.tp_methods = khmer_labelhash_methods; + khmer_KLabelHash_Type.tp_new = khmer_labelhash_new; if (PyType_Ready(&khmer_KLabelHash_Type) < 0) { - return; - } - - if (PyType_Ready(&khmer_ReadAlignerType) < 0) { - return; + return MOD_ERROR_VAL; } if (PyType_Ready(&khmer_KHLLCounter_Type) < 0) { - return; + return MOD_ERROR_VAL; } if (PyType_Ready(&khmer_ReadAlignerType) < 0) { - return; + return MOD_ERROR_VAL; } _init_ReadParser_Type_constants(); if (PyType_Ready( &khmer_ReadParser_Type ) < 0) { - return; + return MOD_ERROR_VAL; } if (PyType_Ready(&khmer_Read_Type ) < 0) { - return; + return MOD_ERROR_VAL; } if (PyType_Ready(&khmer_ReadPairIterator_Type ) < 0) { - return; + return MOD_ERROR_VAL; } PyObject * m; - m = Py_InitModule3( "_khmer", KhmerMethods, - "interface for the khmer module low-level extensions" ); + + MOD_DEF(m, "_khmer", "interface for the khmer module low-level extensions", + KhmerMethods); + if (m == NULL) { - return; + return MOD_ERROR_VAL; } Py_INCREF(&khmer_ReadParser_Type); if (PyModule_AddObject( m, "ReadParser", (PyObject *)&khmer_ReadParser_Type ) < 0) { - return; + return MOD_ERROR_VAL; } Py_INCREF(&khmer_KCountingHash_Type); if (PyModule_AddObject( m, "CountingHash", (PyObject *)&khmer_KCountingHash_Type ) < 0) { - return; + return MOD_ERROR_VAL; + } + if (PyModule_AddObject( m, "ReadParser", (PyObject *)&khmer_ReadParser_Type ) < 0) { + return MOD_ERROR_VAL; } Py_INCREF(&khmer_KHashbits_Type); if (PyModule_AddObject(m, "Hashbits", (PyObject *)&khmer_KHashbits_Type) < 0) { - return; + return MOD_ERROR_VAL; } Py_INCREF(&khmer_KLabelHash_Type); if (PyModule_AddObject(m, "LabelHash", (PyObject *)&khmer_KLabelHash_Type) < 0) { - return; + return MOD_ERROR_VAL; } 
Py_INCREF(&khmer_KHLLCounter_Type); PyModule_AddObject(m, "HLLCounter", (PyObject *)&khmer_KHLLCounter_Type); Py_INCREF(&khmer_ReadAlignerType); PyModule_AddObject(m, "ReadAligner", (PyObject *)&khmer_ReadAlignerType); + + return MOD_SUCCESS_VAL(m); } // vim: set ft=cpp sts=4 sw=4 tw=79: diff --git a/khmer/_version.py b/khmer/_version.py index b50bd8714d..6c8a1ea4a8 100644 --- a/khmer/_version.py +++ b/khmer/_version.py @@ -1,3 +1,4 @@ +from __future__ import print_function # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag diff --git a/khmer/kfile.py b/khmer/kfile.py index 563d3a7c29..8fec14e7f8 100644 --- a/khmer/kfile.py +++ b/khmer/kfile.py @@ -7,6 +7,8 @@ """File handling/checking utilities for command-line scripts.""" +from __future__ import print_function, unicode_literals + import os import sys import errno @@ -22,16 +24,16 @@ def check_input_files(file_path, force): """ mode = None - if file_path is '-': + if file_path == '-': return try: mode = os.stat(file_path).st_mode except OSError: - print >>sys.stderr, "ERROR: Input file %s does not exist" % \ - file_path + print("ERROR: Input file %s does not exist" % + file_path, file=sys.stderr) if not force: - print >>sys.stderr, "Exiting" + print("Exiting", file=sys.stderr) sys.exit(1) else: return @@ -41,14 +43,14 @@ def check_input_files(file_path, force): return if not os.path.exists(file_path): - print >>sys.stderr, "ERROR: Input file %s does not exist; exiting" % \ - file_path + print("ERROR: Input file %s does not exist; exiting" % + file_path, file=sys.stderr) if not force: sys.exit(1) else: if os.stat(file_path).st_size == 0: - print >>sys.stderr, "ERROR: Input file %s is empty; exiting." % \ - file_path + print("ERROR: Input file %s is empty; exiting." 
% + file_path, file=sys.stderr) if not force: sys.exit(1) @@ -59,11 +61,11 @@ def check_file_writable(file_path): file_obj = open(file_path, "a") except IOError as error: if error.errno == errno.EACCES: - print >>sys.stderr, "ERROR: File %s does not have write " \ - % file_path + "permission; exiting" + print("ERROR: File %s does not have write " + % file_path + "permission; exiting", file=sys.stderr) sys.exit(1) else: - print >>sys.stderr, "ERROR: " + error.strerror + print("ERROR: " + error.strerror, file=sys.stderr) else: file_obj.close() return @@ -97,14 +99,14 @@ def check_space(in_files, force, _testhook_free_space=None): size_diff = total_size - free_space if size_diff > 0: - print >>sys.stderr, "ERROR: Not enough free space on disk " \ - "for output files;\n" \ - " Need at least %.1f GB more." \ - % (float(size_diff) / 1e9) - print >>sys.stderr, " Estimated output size: %.1f GB" \ - % (float(total_size) / 1e9,) - print >>sys.stderr, " Free space: %.1f GB" \ - % (float(free_space) / 1e9,) + print("ERROR: Not enough free space on disk " + "for output files;\n" + " Need at least %.1f GB more." + % (float(size_diff) / 1e9), file=sys.stderr) + print(" Estimated output size: %.1f GB" + % (float(total_size) / 1e9,), file=sys.stderr) + print(" Free space: %.1f GB" + % (float(free_space) / 1e9,), file=sys.stderr) if not force: sys.exit(1) @@ -121,14 +123,14 @@ def check_space_for_hashtable(hash_size, force, _testhook_free_space=None): size_diff = hash_size - free_space if size_diff > 0: - print >>sys.stderr, "ERROR: Not enough free space on disk " \ - "for saved table files;" \ - " Need at least %s GB more." \ - % (float(size_diff) / 1e9,) - print >>sys.stderr, " Table size: %.1f GB" \ - % (float(hash_size) / 1e9,) - print >>sys.stderr, " Free space: %.1f GB" \ - % (float(free_space) / 1e9,) + print("ERROR: Not enough free space on disk " + "for saved table files;" + " Need at least %s GB more." 
+ % (float(size_diff) / 1e9,), file=sys.stderr) + print(" Table size: %.1f GB" + % (float(hash_size) / 1e9,), file=sys.stderr) + print(" Free space: %.1f GB" + % (float(free_space) / 1e9,), file=sys.stderr) if not force: sys.exit(1) @@ -146,8 +148,8 @@ def check_valid_file_exists(in_files): if os.stat(in_file).st_size > 0: return else: - print >>sys.stderr, 'WARNING: Input file %s is empty' % \ - in_file + print('WARNING: Input file %s is empty' % + in_file, file=sys.stderr) else: - print >>sys.stderr, 'WARNING: Input file %s not found' % \ - in_file + print('WARNING: Input file %s not found' % + in_file, file=sys.stderr) diff --git a/khmer/khmer_args.py b/khmer/khmer_args.py index 1e6c8d0a81..4182ad6a7b 100644 --- a/khmer/khmer_args.py +++ b/khmer/khmer_args.py @@ -6,9 +6,12 @@ # Contact: khmer-project@idyll.org # +from __future__ import unicode_literals + import sys import os import argparse +from argparse import _VersionAction from khmer import extract_countinghash_info, extract_hashbits_info from khmer import __version__ import screed @@ -19,6 +22,18 @@ DEFAULT_N_THREADS = 1 +class _VersionStdErrAction(_VersionAction): + + def __call__(self, parser, namespace, values, option_string=None): + version = self.version + if version is None: + version = parser.version + formatter = parser._get_formatter() + formatter.add_text(version) + parser._print_message(formatter.format_help(), sys.stderr) + parser.exit() + + class ComboFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): pass @@ -35,7 +50,7 @@ def build_hash_args(descr=None, epilog=None, parser=None): env_tablesize = os.environ.get('KHMER_MIN_TABLESIZE', DEFAULT_MIN_TABLESIZE) - parser.add_argument('--version', action='version', + parser.add_argument('--version', action=_VersionStdErrAction, version='khmer {v}'.format(v=__version__)) parser.add_argument('-q', '--quiet', dest='quiet', default=False, action='store_true') @@ -197,7 +212,11 @@ def info(scriptname, 
algorithm_list=None): for alg in algorithm_list: sys.stderr.write("|| * ") - sys.stderr.write(_algorithms[alg]) + algstr = _algorithms[alg].encode('utf-8', 'surrogateescape').decode('utf-8', 'replace') + try: + sys.stderr.write(algstr) + except UnicodeEncodeError: + sys.stderr.write(algstr.encode(sys.getfilesystemencoding(), 'replace')) sys.stderr.write("\n") sys.stderr.write("||\n|| Please see http://khmer.readthedocs.org/en/" diff --git a/khmer/thread_utils.py b/khmer/thread_utils.py index 98fdf8cca8..0e6705d86a 100644 --- a/khmer/thread_utils.py +++ b/khmer/thread_utils.py @@ -4,10 +4,13 @@ # the three-clause BSD license; see LICENSE. # Contact: khmer-project@idyll.org # + """Utilities for dealing with multithreaded processing of short reads.""" +from __future__ import print_function, unicode_literals + import threading -import Queue +import queue import sys import screed from khmer import utils @@ -20,7 +23,7 @@ def verbose_loader(filename): screed_iter = screed.open(filename, parse_description=False) for n, record in enumerate(screed_iter): if n % 100000 == 0: - print >>sys.stderr, '... filtering', n + print('... 
filtering', n, file=sys.stderr) yield record verbose_fasta_iter = verbose_loader @@ -49,8 +52,8 @@ def __init__(self, process_fn, n_workers=DEFAULT_WORKER_THREADS, self.n_workers = n_workers self.group_size = group_size - self.inqueue = Queue.Queue(self.QUEUESIZE) - self.outqueue = Queue.Queue(self.QUEUESIZE) + self.inqueue = queue.Queue(self.QUEUESIZE) + self.outqueue = queue.Queue(self.QUEUESIZE) self.worker_count = 0 self.worker_count_lock = threading.Lock() @@ -65,7 +68,7 @@ def __init__(self, process_fn, n_workers=DEFAULT_WORKER_THREADS, def start(self, inputiter, outfp): if self.verbose: - print >>sys.stderr, 'starting threads' + print('starting threads', file=sys.stderr) try: for _ in range(self.n_workers): @@ -74,18 +77,18 @@ def start(self, inputiter, outfp): t.start() if self.verbose: - print >>sys.stderr, 'starting writer' + print('starting writer', file=sys.stderr) w = threading.Thread(target=self.do_write, args=(outfp,)) w.start() if self.verbose: - print >>sys.stderr, 'loading...' 
+ print('loading...', file=sys.stderr) self.push_sequences(inputiter) if self.verbose: - print >>sys.stderr, 'done loading in sequences' + print('done loading in sequences', file=sys.stderr) self.done = True w.join() @@ -129,7 +132,7 @@ def do_process(self): while not self.done or not inq.empty(): try: g = inq.get(True, 1) - except Queue.Empty: + except queue.Empty: continue bp_processed = 0 @@ -157,17 +160,16 @@ def do_process(self): self.bp_written += bp_written if self.verbose and self.n_processed % 500000 == 0: - print >>sys.stderr, \ - "processed %d / wrote %d / removed %d" % \ - (self.n_processed, self.n_written, - self.n_processed - self.n_written) - print >>sys.stderr, \ - "processed %d bp / wrote %d bp / removed %d bp" % \ - (self.bp_processed, self.bp_written, - self.bp_processed - self.bp_written) + print("processed %d / wrote %d / removed %d" % + (self.n_processed, self.n_written, + self.n_processed - self.n_written), file=sys.stderr) + print("processed %d bp / wrote %d bp / removed %d bp" % + (self.bp_processed, self.bp_written, + self.bp_processed - self.bp_written), + file=sys.stderr) discarded = self.bp_processed - self.bp_written f = float(discarded) / float(self.bp_processed) * 100 - print >>sys.stderr, "discarded %.1f%%" % f + print("discarded %.1f%%" % f, file=sys.stderr) # end of thread; exit, decrement worker count. 
with self.worker_count_lock: @@ -178,7 +180,7 @@ def do_write(self, outfp): while self.worker_count > 0 or not outq.empty(): try: g = outq.get(True, 1) - except Queue.Empty: + except queue.Empty: continue for name, seq, quality in g.seqlist: @@ -188,16 +190,14 @@ def do_write(self, outfp): outfp.write('>%s\n%s\n' % (name, seq,)) if self.verbose: - print >>sys.stderr, \ - "DONE writing.\nprocessed %d / wrote %d / removed %d" % \ - (self.n_processed, self.n_written, - self.n_processed - self.n_written) - print >>sys.stderr, \ - "processed %d bp / wrote %d bp / removed %d bp" % \ - (self.bp_processed, self.bp_written, - self.bp_processed - self.bp_written) + print("DONE writing.\nprocessed %d / wrote %d / removed %d" % + (self.n_processed, self.n_written, + self.n_processed - self.n_written), file=sys.stderr) + print("processed %d bp / wrote %d bp / removed %d bp" % + (self.bp_processed, self.bp_written, + self.bp_processed - self.bp_written), file=sys.stderr) discarded = self.bp_processed - self.bp_written f = float(discarded) / float(self.bp_processed) * 100 - print >>sys.stderr, "discarded %.1f%%" % f + print("discarded %.1f%%" % f, file=sys.stderr) # vim: set ft=python ts=4 sts=4 sw=4 et tw=79: diff --git a/khmer/utils.py b/khmer/utils.py index 342f0f5fb6..ea1c93a747 100644 --- a/khmer/utils.py +++ b/khmer/utils.py @@ -1,3 +1,4 @@ +from __future__ import print_function, unicode_literals # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. 
It is licensed under @@ -12,7 +13,7 @@ def print_error(msg): """Print the given message to 'stderr'.""" import sys - print >>sys.stderr, msg + print(msg, file=sys.stderr) def _split_left_right(name): @@ -153,15 +154,19 @@ def broken_paired_reader(screed_iter, min_length=None, def write_record(record, fileobj): """Write sequence record to 'fileobj' in FASTA/FASTQ format.""" if hasattr(record, 'quality'): - fileobj.write( - '@{name}\n{seq}\n' - '+\n{qual}\n'.format(name=record.name, - seq=record.sequence, - qual=record.quality)) + recstr = '@{name}\n{sequence}\n+\n{quality}\n'.format( + name=record.name, + sequence=record.sequence, + quality=record.quality) else: - fileobj.write( - '>{name}\n{seq}\n'.format(name=record.name, - seq=record.sequence)) + recstr = '>{name}\n{sequence}\n'.format( + name=record.name, + sequence=record.sequence) + + try: + fileobj.write(bytes(recstr, 'utf-8')) + except TypeError: + fileobj.write(recstr) def write_record_pair(read1, read2, fileobj): diff --git a/lib/get_version.py b/lib/get_version.py index 6ad89b8b85..5d7fa666ab 100644 --- a/lib/get_version.py +++ b/lib/get_version.py @@ -1,3 +1,4 @@ +from __future__ import print_function import sys sys.path.insert(0, '../') import versioneer @@ -7,4 +8,4 @@ versioneer.tag_prefix = 'v' # tags are like v1.2.0 versioneer.parentdir_prefix = '..' -print versioneer.get_version() +print(versioneer.get_version()) diff --git a/sandbox/abundance-hist-by-position.py b/sandbox/abundance-hist-by-position.py index 71d36edb57..9281eec1e1 100755 --- a/sandbox/abundance-hist-by-position.py +++ b/sandbox/abundance-hist-by-position.py @@ -5,6 +5,8 @@ # the three-clause BSD license; see LICENSE. 
# Contact: khmer-project@idyll.org # +from __future__ import division +from __future__ import print_function import sys def main(): @@ -14,11 +16,11 @@ def main(): freqfile = sys.argv[1] - print >>sys.stderr, 'opening .freq file:', freqfile + print('opening .freq file:', freqfile, file=sys.stderr) fd = open(freqfile) for n, line in enumerate(fd): if n % 100000 == 0: - print >>sys.stderr, '...', n + print('...', n, file=sys.stderr) tok = line.split() @@ -26,7 +28,7 @@ def main(): countSum[i] += int(tok[i]) countN[i] += 1 - print >>sys.stderr, 'summarizing.' + print('summarizing.', file=sys.stderr) y = [0.0] * len(countSum) for i in range(len(countSum)): @@ -34,7 +36,7 @@ def main(): y[i] = float(countSum[i]) / float(countN[i]) for n, i in enumerate(y): - print n, i + print(n, i) if __name__ == '__main__': main() diff --git a/sandbox/assembly-diff-2.py b/sandbox/assembly-diff-2.py index 1a8776b5de..96e59173db 100755 --- a/sandbox/assembly-diff-2.py +++ b/sandbox/assembly-diff-2.py @@ -5,6 +5,7 @@ # the three-clause BSD license; see LICENSE. 
# Contact: khmer-project@idyll.org # +from __future__ import print_function import sys import khmer import screed @@ -26,21 +27,21 @@ def main(): kh = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT) for n, record in enumerate(screed.open(filename1)): if n % 10000 == 0: - print '...', filename1, n + print('...', filename1, n) seq = record.sequence.upper().replace('N', 'A') kh.consume(seq) path_n = 0 for n, record in enumerate(screed.open(filename2)): if n % 10000 == 0: - print '...', filename2, n + print('...', filename2, n) seq = record.sequence.upper().replace('N', 'A') paths = kh.extract_unique_paths(seq, UNIQUE_LEN, UNIQUE_F) kh.consume(seq) for path in paths: path_n += 1 - print >>uniq2, '>%s from:%s\n%s' % (path_n, record.name, path) + print('>%s from:%s\n%s' % (path_n, record.name, path), file=uniq2) if __name__ == '__main__': diff --git a/sandbox/assembly-diff.py b/sandbox/assembly-diff.py index 01479045da..2271c45fcc 100755 --- a/sandbox/assembly-diff.py +++ b/sandbox/assembly-diff.py @@ -5,6 +5,8 @@ # the three-clause BSD license; see LICENSE. 
# Contact: khmer-project@idyll.org # +from __future__ import division +from __future__ import print_function import sys import khmer import screed @@ -59,9 +61,9 @@ def main(): present2 = False if present1 and not present2: - print >>uniq1, '>%s\n%s' % (record.name, record.sequence) + print('>%s\n%s' % (record.name, record.sequence), file=uniq1) elif present2 and not present1: - print >>uniq2, '>%s\n%s' % (record.name, record.sequence) + print('>%s\n%s' % (record.name, record.sequence), file=uniq2) if __name__ == '__main__': diff --git a/sandbox/assemstats3.py b/sandbox/assemstats3.py index be84f4c918..f40152ffdb 100755 --- a/sandbox/assemstats3.py +++ b/sandbox/assemstats3.py @@ -21,6 +21,8 @@ Author: Jason Pell (pelljaso@cse.msu.edu) ''' +from __future__ import division +from __future__ import print_function import screed import sys @@ -89,23 +91,23 @@ def main(): totalSum = 0 if len(sys.argv) < 3: - print "Usage: python assemstats.py [ FASTA files ]" + print("Usage: python assemstats.py [ FASTA files ]") return try: minLen = int(sys.argv[1]) except ValueError: - print "Minimum contig length must be an integer." + print("Minimum contig length must be an integer.") return - print '** cutoff:', minLen - print "N\tsum\tmax\tfilename" + print('** cutoff:', minLen) + print("N\tsum\tmax\tfilename") for filename in sys.argv[2:]: if not os.path.exists(filename): - print >>sys.stderr, "WARNING: file %s does not exist." % filename + print("WARNING: file %s does not exist." 
% filename, file=sys.stderr) continue - + lens = getLens(filename) trimmedLens = trimLens(lens, minLen) @@ -121,12 +123,12 @@ def main(): totalN += statTrimmedN totalSum += statSum - print "%d\t%d\t%d\t%s" % (statTrimmedN, statSum, statMax, filename) + print("%d\t%d\t%d\t%s" % (statTrimmedN, statSum, statMax, filename)) if len(sys.argv) > 3 and totalN: - print '--' - print 'TOTAL: %g in %d contigs (mean size %d)' % ( - totalSum, totalN, totalSum / float(totalN) + .5) + print('--') + print('TOTAL: %g in %d contigs (mean size %d)' % ( + totalSum, totalN, totalSum / totalN + .5)) main() diff --git a/sandbox/bloom-count-intersection.py b/sandbox/bloom-count-intersection.py index c532555cac..28c7a92731 100755 --- a/sandbox/bloom-count-intersection.py +++ b/sandbox/bloom-count-intersection.py @@ -1,3 +1,4 @@ +from __future__ import print_function #! /usr/bin/env python2 # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is @@ -30,9 +31,9 @@ def main(): if (not ht.get(kmer)): n_unique += 1 ht.count(kmer) - print filename, 'has been consumed.' - print '# of unique kmers:', n_unique - print '# of occupied bin:', ht.n_occupied() + print(filename, 'has been consumed.') + print('# of unique kmers:', n_unique) + print('# of occupied bin:', ht.n_occupied()) filename2 = sys.argv[5] ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT) @@ -49,11 +50,11 @@ def main(): n_overlap += 1 ht2.count(kmer) - print filename2, 'has been consumed.' 
- print '# of unique kmers:', n_unique - print '# of occupied bin:', ht2.n_occupied() + print(filename2, 'has been consumed.') + print('# of unique kmers:', n_unique) + print('# of occupied bin:', ht2.n_occupied()) - print n_overlap, 'unique kmers appears in both ', filename, ' and ', filename2 + print(n_overlap, 'unique kmers appears in both ', filename, ' and ', filename2) if __name__ == '__main__': diff --git a/sandbox/bloom-count.py b/sandbox/bloom-count.py index 6210b1520b..dad8db2cea 100755 --- a/sandbox/bloom-count.py +++ b/sandbox/bloom-count.py @@ -1,3 +1,4 @@ +from __future__ import print_function #! /usr/bin/env python2 # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is @@ -31,9 +32,9 @@ def main(): n_unique += 1 ht.count(kmer) - print n_unique - print ht.n_occupied() - print ht.n_unique_kmers() + print(n_unique) + print(ht.n_occupied()) + print(ht.n_unique_kmers()) if __name__ == '__main__': diff --git a/sandbox/build-sparse-graph.py b/sandbox/build-sparse-graph.py index 4d4f752153..b2e06d63d9 100755 --- a/sandbox/build-sparse-graph.py +++ b/sandbox/build-sparse-graph.py @@ -1,3 +1,4 @@ +from __future__ import print_function #! 
/usr/bin/env python2 # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is @@ -27,13 +28,13 @@ def main(): for n, record in enumerate(screed.open(input_fasta)): if n % 1000 == 0: - print >>sys.stderr, '...loaded and tagged {} sequences'.format(n) + print('...loaded and tagged {} sequences'.format(n), file=sys.stderr) name = record.name sequence = record.sequence ht.consume_sequence_and_tag_with_labels(sequence, n) tags = ht.sweep_tag_neighborhood(sequence, 0) - for i in xrange(len(tags) - 1): + for i in range(len(tags) - 1): src = tags[i] dst = tags[i + 1] @@ -58,7 +59,7 @@ def main(): if new: e = sparse_graph.add_edge(srcv, dstv) - print 'Sparse graph has {} nodes, {} edges'.format(sparse_graph.num_vertices(), sparse_graph.num_edges()) + print('Sparse graph has {} nodes, {} edges'.format(sparse_graph.num_vertices(), sparse_graph.num_edges())) comp = gt.label_largest_component(sparse_graph, directed=False) #pos = gt.radial_tree_layout(sparse_graph, sparse_graph.vertex(0)) gt.graph_draw(sparse_graph, output_size=( diff --git a/sandbox/calc-best-assembly.py b/sandbox/calc-best-assembly.py index 8c12565d83..cd3de8faa3 100755 --- a/sandbox/calc-best-assembly.py +++ b/sandbox/calc-best-assembly.py @@ -4,6 +4,7 @@ # Copyright (C) Michigan State University, 2009-2015. It is licensed under # the three-clause BSD license; see LICENSE. 
Contact: ctb@msu.edu # +from __future__ import print_function import screed import argparse import sys @@ -34,31 +35,31 @@ def main(): try: total = calculate_bp_above_cutoff(filename, args.cutoff) except IOError: - print >>sys.stderr, "** WARNING: %s does not exist, skipping" %\ - filename + print("** WARNING: %s does not exist, skipping" %\ + filename, file=sys.stderr) continue stats.append((total, filename)) if not args.quiet: - print >>sys.stderr, "assembly %s has %d bp > %d" % (filename, + print("assembly %s has %d bp > %d" % (filename, total, - args.cutoff) + args.cutoff), file=sys.stderr) stats.sort(reverse=True) best_total, winner_file = stats[0] - print >>sys.stderr, '----' - print >>sys.stderr, "assembly %s wins: %d total bp > %d" % (winner_file, + print('----', file=sys.stderr) + print("assembly %s wins: %d total bp > %d" % (winner_file, best_total, - args.cutoff) + args.cutoff), file=sys.stderr) if args.output_file: for record in screed.open(winner_file, parse_description=False): - print >>args.output_file, '>%s\n%s' % (record.name, - record.sequence) + print('>%s\n%s' % (record.name, + record.sequence), file=args.output_file) - print winner_file + print(winner_file) if __name__ == '__main__': main() diff --git a/sandbox/calc-error-profile.py b/sandbox/calc-error-profile.py index 8fe9be8ddc..d77cf58e50 100755 --- a/sandbox/calc-error-profile.py +++ b/sandbox/calc-error-profile.py @@ -12,6 +12,8 @@ Reads FASTQ and FASTA input. """ +from __future__ import division +from __future__ import print_function import sys import argparse @@ -80,13 +82,13 @@ def main(): # look for errors. 
total = 0 for filename in args.filenames: - print >>sys.stderr, 'opening', filename + print('opening', filename, file=sys.stderr) for n, record in enumerate(screed.open(filename)): total += 1 if total % CHECK_EXIT == 0: - print >>sys.stderr, '...', total, n_consumed, n_checked + print('...', total, n_consumed, n_checked, file=sys.stderr) # two exit conditions: first, have we hit our max reads limit? if total >= MAX_READS: @@ -111,8 +113,8 @@ def main(): lengths.append(len(seq)) if args.errors_per_read: - print >>args.errors_per_read, record.name, \ - ",".join(map(str, posns)) + print(record.name, \ + ",".join(map(str, posns)), file=args.errors_per_read) # track the positions => errors for p in posns: @@ -131,24 +133,24 @@ def main(): # write! output_file.write('position error_count error_fraction\n') for n, i in enumerate(positions[:max_length]): - print >>output_file, n, i, float(i) / float(length_count[n]) + print(n, i, float(i) / float(length_count[n]), file=output_file) output_file.close() - print >>sys.stderr, '' - print >>sys.stderr, 'total sequences:', total - print >>sys.stderr, 'n consumed:', n_consumed - print >>sys.stderr, 'n checked:', n_checked - print >>sys.stderr, 'bp consumed:', bp_consumed, bp_consumed / float(C) - print >>sys.stderr, 'error rate: %.2f%%' % \ - (100.0 * sum(positions) / float(sum(lengths))) + print('', file=sys.stderr) + print('total sequences:', total, file=sys.stderr) + print('n consumed:', n_consumed, file=sys.stderr) + print('n checked:', n_checked, file=sys.stderr) + print('bp consumed:', bp_consumed, bp_consumed / float(C), file=sys.stderr) + print('error rate: %.2f%%' % \ + (100.0 * sum(positions) / float(sum(lengths))), file=sys.stderr) - print >>sys.stderr, 'Error histogram is in %s' % output_filename + print('Error histogram is in %s' % output_filename, file=sys.stderr) if not exit_condition(n_consumed, n_checked): - print >>sys.stderr, "" - print >>sys.stderr, "** WARNING: not enough reads to get a good result" - print 
>>sys.stderr, "** Is this high diversity sample / small subset?" + print("", file=sys.stderr) + print("** WARNING: not enough reads to get a good result", file=sys.stderr) + print("** Is this high diversity sample / small subset?", file=sys.stderr) sys.exit(-1) diff --git a/sandbox/calc-median-distribution.py b/sandbox/calc-median-distribution.py index 51b3540f61..5be50928e8 100755 --- a/sandbox/calc-median-distribution.py +++ b/sandbox/calc-median-distribution.py @@ -5,6 +5,8 @@ # the three-clause BSD license; see LICENSE. # Contact: khmer-project@idyll.org # +from __future__ import division +from __future__ import print_function import sys import khmer import argparse @@ -27,7 +29,7 @@ def main(): outfp = open(histout, 'w') - print 'hashtable from', hashfile + print('hashtable from', hashfile) ht = khmer.load_counting_hash(hashfile) hist = {} @@ -37,7 +39,7 @@ def main(): for n, record in enumerate(screed.open(seqfile)): if n > 0 and n % 100000 == 0: - print '...', n + print('...', n) seq = record.sequence.replace('N', 'A') diff --git a/sandbox/collect-reads.py b/sandbox/collect-reads.py index b8aa498d24..8d68593947 100755 --- a/sandbox/collect-reads.py +++ b/sandbox/collect-reads.py @@ -14,6 +14,8 @@ Use '-h' for parameter help. 
""" +from __future__ import division +from __future__ import print_function import sys import textwrap @@ -76,12 +78,12 @@ def main(): check_space(args.input_sequence_filename, False) check_space_for_hashtable(args.n_tables * args.min_tablesize, False) - print 'Saving k-mer counting table to %s' % base - print 'Loading sequences from %s' % repr(filenames) + print('Saving k-mer counting table to %s' % base) + print('Loading sequences from %s' % repr(filenames)) if args.output: - print 'Outputting sequences to', args.output + print('Outputting sequences to', args.output) - print 'making k-mer counting table' + print('making k-mer counting table') htable = khmer.new_counting_hash(args.ksize, args.min_tablesize) htable.set_use_bigcount(args.bigcount) @@ -103,8 +105,8 @@ def main(): n += 1 if total_coverage / float(n) > args.coverage: - print 'reached target average coverage:', \ - total_coverage / float(n) + print('reached target average coverage:', \ + total_coverage / float(n)) break htable.consume(seq) @@ -112,18 +114,18 @@ def main(): args.output.write(output_single(record)) if n % 100000 == 0: - print '...', index, filename, n, total_coverage / float(n) + print('...', index, filename, n, total_coverage / float(n)) if total_coverage / float(n) > args.coverage: break - print 'Collected %d reads' % (n,) + print('Collected %d reads' % (n,)) if args.report_total_kmers: - print >> sys.stderr, 'Total number of k-mers: {0}'.format( - htable.n_occupied()) + print('Total number of k-mers: {0}'.format( + htable.n_occupied()), file=sys.stderr) - print 'saving', base + print('saving', base) htable.save(base) info_fp = open(base + '.info', 'w') @@ -131,10 +133,10 @@ def main(): # Change 0.2 only if you really grok it. HINT: You don't. 
fp_rate = khmer.calc_expected_collisions(htable, args.force, max_false_pos=.2) - print 'fp rate estimated to be %1.3f' % fp_rate - print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate + print('fp rate estimated to be %1.3f' % fp_rate) + print('fp rate estimated to be %1.3f' % fp_rate, file=info_fp) - print 'DONE.' + print('DONE.') if __name__ == '__main__': main() diff --git a/sandbox/collect-variants.py b/sandbox/collect-variants.py index 1bbec5b015..dd48941f61 100755 --- a/sandbox/collect-variants.py +++ b/sandbox/collect-variants.py @@ -12,6 +12,7 @@ TODO: add to sandbox README """ +from __future__ import print_function import sys import screed @@ -41,16 +42,16 @@ def main(): args = parser.parse_args() if not args.quiet: - print >>sys.stderr, '\nPARAMETERS:' - print >>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize - print >>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_tables - print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \ - args.min_tablesize - print >>sys.stderr, '' - print >>sys.stderr, 'Estimated memory usage is %.2g bytes ' \ + print('\nPARAMETERS:', file=sys.stderr) + print(' - kmer size = %d \t\t(-k)' % args.ksize, file=sys.stderr) + print(' - n hashes = %d \t\t(-N)' % args.n_tables, file=sys.stderr) + print(' - min hashsize = %-5.2g \t(-x)' % \ + args.min_tablesize, file=sys.stderr) + print('', file=sys.stderr) + print('Estimated memory usage is %.2g bytes ' \ '(n_hashes x min_hashsize)' % \ - (args.n_tables * args.min_tablesize) - print >>sys.stderr, '-' * 8 + (args.n_tables * args.min_tablesize), file=sys.stderr) + print('-' * 8, file=sys.stderr) K = args.ksize HT_SIZE = args.min_tablesize @@ -59,10 +60,10 @@ def main(): filenames = args.input_filenames if args.loadhash: - print 'loading hashtable from', args.loadhash + print('loading hashtable from', args.loadhash) ht = khmer.load_counting_hash(args.loadhash) else: - print 'making hashtable' + print('making hashtable') ht = khmer.new_counting_hash(K, HT_SIZE, N_HT) aligner 
= khmer.ReadAligner(ht, args.trusted_cutoff, args.bits_theta) @@ -80,9 +81,9 @@ def main(): for n, record in enumerate(screed.open(input_filename)): if n > 0 and n % 10000 == 0: - print '... kept', total - discarded, 'of', total, ', or', \ - int(100. - discarded / float(total) * 100.), '%' - print '... in file', input_filename + print('... kept', total - discarded, 'of', total, ', or', \ + int(100. - discarded / float(total) * 100.), '%') + print('... in file', input_filename) total += 1 @@ -133,19 +134,19 @@ def main(): discarded += 1 if total: - print 'DONE with', input_filename, \ + print('DONE with', input_filename, \ '; kept', total - discarded, 'of', total, 'or', \ - int(100. - discarded / float(total) * 100.), '%' - print 'output in', output_name + int(100. - discarded / float(total) * 100.), '%') + print('output in', output_name) if args.savehash: - print 'Saving hashfile through', input_filename - print '...saving to', args.savehash + print('Saving hashfile through', input_filename) + print('...saving to', args.savehash) ht.save(args.savehash) # Change 0.2 only if you really grok it. HINT: You don't. fp_rate = khmer.calc_expected_collisions(ht, args.force, max_false_pos=.2) - print 'fp rate estimated to be %1.3f' % fp_rate + print('fp rate estimated to be %1.3f' % fp_rate) if __name__ == '__main__': diff --git a/sandbox/correct-errors.py b/sandbox/correct-errors.py index c67e5fac69..798f6d0d3b 100755 --- a/sandbox/correct-errors.py +++ b/sandbox/correct-errors.py @@ -18,6 +18,7 @@ TODO: add to sandbox/README TODO: change name to correct-reads? 
""" +from __future__ import print_function import sys import screed import os @@ -90,13 +91,13 @@ def main(): NORMALIZE_LIMIT = args.normalize_to - print 'making hashtable' + print('making hashtable') ht = khmer.new_counting_hash(K, HT_SIZE, N_HT) aligner = khmer.ReadAligner(ht, args.trusted_cov, args.bits_theta) tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir) - print 'created temporary directory %s; use -T to change location' % tempdir + print('created temporary directory %s; use -T to change location' % tempdir) ### @@ -120,8 +121,8 @@ def main(): total_reads += 1 if n % 10000 == 0: - print '...', n, filename, n_aligned, n_corrected, save_pass2, \ - total_reads + print('...', n, filename, n_aligned, n_corrected, save_pass2, \ + total_reads) seq = read.sequence.replace('N', 'A') # build the alignment... @@ -165,18 +166,18 @@ def main(): pass2fp.close() corrfp.close() - print '%s: kept aside %d of %d from first pass, in %s' % \ - (filename, save_pass2, n, filename) - print 'aligned %d of %d reads so far' % (n_aligned, total_reads) - print 'changed %d of %d reads so far' % (n_corrected, total_reads) + print('%s: kept aside %d of %d from first pass, in %s' % \ + (filename, save_pass2, n, filename)) + print('aligned %d of %d reads so far' % (n_aligned, total_reads)) + print('changed %d of %d reads so far' % (n_corrected, total_reads)) for orig_filename, pass2filename, corrfilename in pass2list: - print 'second pass: looking at sequences kept aside in %s' % \ - pass2filename + print('second pass: looking at sequences kept aside in %s' % \ + pass2filename) for n, read in enumerate(screed.open(pass2filename)): if n % 10000 == 0: - print '... x 2', n, pass2filename, n_aligned, n_corrected, \ - total_reads + print('... 
x 2', n, pass2filename, n_aligned, n_corrected, \ + total_reads) corrfp = open(corrfilename, 'a') @@ -205,14 +206,14 @@ def main(): corrfp.write(output_single(read, corrected)) - print 'removing %s' % pass2filename + print('removing %s' % pass2filename) os.unlink(pass2filename) - print 'removing temp directory & contents (%s)' % tempdir + print('removing temp directory & contents (%s)' % tempdir) shutil.rmtree(tempdir) - print 'Aligned %d of %d total' % (n_aligned, total_reads) - print 'Changed %d of %d total' % (n_corrected, total_reads) + print('Aligned %d of %d total' % (n_aligned, total_reads)) + print('Changed %d of %d total' % (n_corrected, total_reads)) if __name__ == '__main__': main() diff --git a/sandbox/extract-single-partition.py b/sandbox/extract-single-partition.py index a5a8cee7e2..6f6c9295f1 100755 --- a/sandbox/extract-single-partition.py +++ b/sandbox/extract-single-partition.py @@ -5,6 +5,7 @@ # the three-clause BSD license; see LICENSE. # Contact: khmer-project@idyll.org # +from __future__ import print_function import sys from screed.fasta import fasta_iter @@ -21,7 +22,7 @@ def main(): count = 0 for n, name, pid, seq in read_partition_file(open(sys.argv[1])): if pid == select_pid: - print '>%s\t%d\n%s' % (name, pid, seq) + print('>%s\t%d\n%s' % (name, pid, seq)) count += 1 if n % 10000 == 0: diff --git a/sandbox/fasta-to-abundance-hist.py b/sandbox/fasta-to-abundance-hist.py index 91a358faf8..258c787077 100755 --- a/sandbox/fasta-to-abundance-hist.py +++ b/sandbox/fasta-to-abundance-hist.py @@ -5,6 +5,7 @@ # the three-clause BSD license; see LICENSE. 
# Contact: khmer-project@idyll.org # +from __future__ import print_function import sys import khmer @@ -16,13 +17,13 @@ def main(): n_consumed = len(files) * [0] n_seq_kept = len(files) * [0] - print 'loading ht' + print('loading ht') ht = khmer.new_counting_hash(1, 1, 1) ht.load(sys.argv[1]) for i, infile in enumerate(files): - print 'outputting', infile + '.freq' + print('outputting', infile + '.freq') ht.output_fasta_kmer_pos_freq(infile, infile + ".freq") diff --git a/sandbox/filter-below-abund.py b/sandbox/filter-below-abund.py index 4f1da39b45..207ddf4001 100755 --- a/sandbox/filter-below-abund.py +++ b/sandbox/filter-below-abund.py @@ -5,6 +5,7 @@ # the three-clause BSD license; see LICENSE. # Contact: khmer-project@idyll.org # +from __future__ import print_function import sys import os import khmer @@ -22,17 +23,17 @@ def main(): counting_ht = sys.argv[1] infiles = sys.argv[2:] - print 'file with ht: %s' % counting_ht - print '-- settings:' - print 'N THREADS', WORKER_THREADS - print '--' + print('file with ht: %s' % counting_ht) + print('-- settings:') + print('N THREADS', WORKER_THREADS) + print('--') - print 'making hashtable' + print('making hashtable') ht = khmer.load_counting_hash(counting_ht) K = ht.ksize() for infile in infiles: - print 'filtering', infile + print('filtering', infile) outfile = os.path.basename(infile) + '.below' outfp = open(outfile, 'w') diff --git a/sandbox/filter-median-and-pct.py b/sandbox/filter-median-and-pct.py index b2fa8d37b4..4cd80b35da 100755 --- a/sandbox/filter-median-and-pct.py +++ b/sandbox/filter-median-and-pct.py @@ -13,6 +13,7 @@ Use '-h' for parameter help. 
""" +from __future__ import print_function import sys import screed.fasta import os @@ -37,15 +38,15 @@ def main(): counting_ht = args.input_table infiles = args.input_filenames - print 'file with ht: %s' % counting_ht + print('file with ht: %s' % counting_ht) - print 'loading hashtable' + print('loading hashtable') ht = khmer.load_counting_hash(counting_ht) K = ht.ksize() xxxfp = None - print "K:", K + print("K:", K) # the filtering function. def process_fn(record): @@ -64,20 +65,20 @@ def process_fn(record): # the filtering loop for infile in infiles: - print 'filtering', infile + print('filtering', infile) xxxfp = open(os.path.basename(infile) + '.medpctfilt.stats', 'w') outfile = os.path.basename(infile) + '.medpctfilt' outfp = open(outfile, 'w') for n, record in enumerate(screed.open(infile)): if n % 100000 == 0: - print '...', n + print('...', n) name, seq = process_fn(record) if name and seq: - print >>outfp, '>%s\n%s' % (name, seq) + print('>%s\n%s' % (name, seq), file=outfp) - print 'output in', outfile + print('output in', outfile) if __name__ == '__main__': main() diff --git a/sandbox/filter-median.py b/sandbox/filter-median.py index 1670cec2be..6f9cf707da 100755 --- a/sandbox/filter-median.py +++ b/sandbox/filter-median.py @@ -13,6 +13,7 @@ Use '-h' for parameter help. """ +from __future__ import print_function import sys import screed.fasta import os @@ -37,13 +38,13 @@ def main(): counting_ht = args.input_table infiles = args.input_filenames - print 'file with ht: %s' % counting_ht + print('file with ht: %s' % counting_ht) - print 'loading hashtable' + print('loading hashtable') ht = khmer.load_counting_hash(counting_ht) K = ht.ksize() - print "K:", K + print("K:", K) # the filtering function. 
def process_fn(record): @@ -59,14 +60,14 @@ def process_fn(record): # the filtering loop for infile in infiles: - print 'filtering', infile + print('filtering', infile) outfile = os.path.basename(infile) + '.medfilt' outfp = open(outfile, 'w') tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(infile), outfp) - print 'output in', outfile + print('output in', outfile) if __name__ == '__main__': main() diff --git a/sandbox/find-high-abund-kmers.py b/sandbox/find-high-abund-kmers.py index c7c5432d33..0c7046dd88 100755 --- a/sandbox/find-high-abund-kmers.py +++ b/sandbox/find-high-abund-kmers.py @@ -8,6 +8,7 @@ """ @@ """ +from __future__ import print_function import sys import screed @@ -34,20 +35,20 @@ def main(): if not args.quiet: if args.min_hashsize == DEFAULT_MIN_HASHSIZE: - print >>sys.stderr, "** WARNING: hashsize is default! " \ + print("** WARNING: hashsize is default! " \ "You absodefly want to increase this!\n** " \ - "Please read the docs!" - - print >>sys.stderr, '\nPARAMETERS:' - print >>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize - print >>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes - print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \ - args.min_hashsize - print >>sys.stderr, '' - print >>sys.stderr, 'Estimated memory usage is %.2g bytes " \ + "Please read the docs!", file=sys.stderr) + + print('\nPARAMETERS:', file=sys.stderr) + print(' - kmer size = %d \t\t(-k)' % args.ksize, file=sys.stderr) + print(' - n hashes = %d \t\t(-N)' % args.n_hashes, file=sys.stderr) + print(' - min hashsize = %-5.2g \t(-x)' % \ + args.min_hashsize, file=sys.stderr) + print('', file=sys.stderr) + print('Estimated memory usage is %.2g bytes " \ "(n_hashes x min_hashsize)' % ( - args.n_hashes * args.min_hashsize) - print >>sys.stderr, '-' * 8 + args.n_hashes * args.min_hashsize), file=sys.stderr) + print('-' * 8, file=sys.stderr) K = args.ksize HT_SIZE = args.min_hashsize @@ -56,23 +57,23 @@ def main(): output = 
args.output_filename input = args.input_filename - print 'lower cutoff:', args.lower_cutoff - print 'upper cutoff:', args.upper_cutoff - print 'Saving stoptags to %s' % output - print 'Loading sequences in %s' % input + print('lower cutoff:', args.lower_cutoff) + print('upper cutoff:', args.upper_cutoff) + print('Saving stoptags to %s' % output) + print('Loading sequences in %s' % input) ### - print 'making hashtable' + print('making hashtable') ht = khmer.new_counting_hash(K, HT_SIZE, N_HT) ht.set_use_bigcount(True) - print 'consuming input', input + print('consuming input', input) hb = ht.collect_high_abundance_kmers(input, args.lower_cutoff, args.upper_cutoff) - print 'saving stoptags', output + print('saving stoptags', output) hb.save_stop_tags(output) if __name__ == '__main__': diff --git a/sandbox/graph-size.py b/sandbox/graph-size.py index 74ca1c7933..1bf45d1726 100755 --- a/sandbox/graph-size.py +++ b/sandbox/graph-size.py @@ -5,6 +5,7 @@ # the three-clause BSD license; see LICENSE. 
# Contact: khmer-project@idyll.org # +from __future__ import print_function import khmer import sys import screed @@ -30,19 +31,19 @@ def main(): if len(sys.argv) == 3: outfile = sys.argv[2] - print 'input file to graphsize filter: %s' % infile - print 'filtering to output:', outfile - print '-- settings:' - print 'K', K - print 'HASHTABLE SIZE %g' % HASHTABLE_SIZE - print 'N HASHTABLES %d' % N_HT - print 'THRESHOLD', THRESHOLD - print 'N THREADS', WORKER_THREADS - print '--' + print('input file to graphsize filter: %s' % infile) + print('filtering to output:', outfile) + print('-- settings:') + print('K', K) + print('HASHTABLE SIZE %g' % HASHTABLE_SIZE) + print('N HASHTABLES %d' % N_HT) + print('THRESHOLD', THRESHOLD) + print('N THREADS', WORKER_THREADS) + print('--') - print 'creating ht' + print('creating ht') ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT) - print 'eating fa', infile + print('eating fa', infile) total_reads, n_consumed = ht.consume_fasta(infile) outfp = open(outfile, 'w') diff --git a/sandbox/hi-lo-abundance-by-position.py b/sandbox/hi-lo-abundance-by-position.py index ae94e599a4..53b22b195b 100755 --- a/sandbox/hi-lo-abundance-by-position.py +++ b/sandbox/hi-lo-abundance-by-position.py @@ -5,6 +5,7 @@ # the three-clause BSD license; see LICENSE. 
# Contact: khmer-project@idyll.org # +from __future__ import print_function import sys import os import khmer @@ -20,16 +21,16 @@ def main(): filename = sys.argv[2] outfile = os.path.basename(filename) - print 'loading kh file', hashfile + print('loading kh file', hashfile) ht = khmer.load_counting_hash(hashfile) x = ht.fasta_count_kmers_by_position(filename, 100, 1) write_dist(x, open(outfile + '.pos.abund=1', 'w')) - print 'wrote', outfile + '.pos.abund=1' + print('wrote', outfile + '.pos.abund=1') y = ht.fasta_count_kmers_by_position(filename, 100, 255) write_dist(y, open(outfile + '.pos.abund=255', 'w')) - print 'wrote', outfile + '.pos.abund=255' + print('wrote', outfile + '.pos.abund=255') if __name__ == '__main__': diff --git a/sandbox/make-coverage.py b/sandbox/make-coverage.py index 712703d066..508a5f2f1a 100755 --- a/sandbox/make-coverage.py +++ b/sandbox/make-coverage.py @@ -5,6 +5,9 @@ # the three-clause BSD license; see LICENSE.txt. # Contact: khmer-project@idyll.org # + +from __future__ import print_function + import screed import sys @@ -16,13 +19,13 @@ def main(): lengths = {} for n, record in enumerate(screed.open(dbfile)): if n % 100000 == 0: - print '...', n + print('...', n) lengths[record.name] = len(record.sequence) sums = {} for n, line in enumerate(open(mapfile)): if n % 100000 == 0: - print '... 2x', n + print('... 
2x', n) x = line.split('\t') name = x[2] readlen = len(x[4]) @@ -38,10 +41,12 @@ def main(): outfp = open(dbfile + '.cov', 'w') for n, record in enumerate(screed.open(dbfile)): if n % 100000 == 0: - print '...', n + print('...', n) - print >>outfp, ">%s[cov=%d]\n%s" % ( - record.name, rpkms.get(record.name, 0), record.sequence) + print(">%s[cov=%d]\n%s" % (record.name, + rpkms.get(record.name, 0), + record.sequence), + file=outfp) if __name__ == '__main__': main() diff --git a/sandbox/multi-rename.py b/sandbox/multi-rename.py index 3a1b4b3be1..62917aa226 100755 --- a/sandbox/multi-rename.py +++ b/sandbox/multi-rename.py @@ -5,6 +5,7 @@ # the three-clause BSD license; see LICENSE. # Contact: khmer-project@idyll.org # +from __future__ import print_function import screed import sys @@ -18,7 +19,7 @@ def main(): for record in screed.open(filename): if len(record.sequence) >= CUTOFF: n += 1 - print '>%s.%s %s\n%s' % (prefix, n, record.name, record.sequence) + print('>%s.%s %s\n%s' % (prefix, n, record.name, record.sequence)) if __name__ == '__main__': diff --git a/sandbox/normalize-by-median-pct.py b/sandbox/normalize-by-median-pct.py index e40b9fe5bf..6ffccf8138 100755 --- a/sandbox/normalize-by-median-pct.py +++ b/sandbox/normalize-by-median-pct.py @@ -13,12 +13,14 @@ Use '-h' for parameter help. """ +from __future__ import division +from __future__ import print_function import sys import screed import os import khmer -from itertools import izip + from khmer.khmer_args import build_counting_args, DEFAULT_MIN_TABLESIZE import argparse @@ -30,7 +32,7 @@ def batchwise(t, size): it = iter(t) - return izip(*[it] * size) + return zip(*[it] * size) # Returns true if the pair of records are properly pairs @@ -57,17 +59,17 @@ def main(): if not args.quiet: if args.min_hashsize == DEFAULT_MIN_HASHSIZE and not args.loadhash: - print>>sys.stderr, "** WARNING: hashsize is default! You absodefly want to increase this!\n** Please read the docs!" 
- - print>>sys.stderr, '\nPARAMETERS:' - print>>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize - print>>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes - print>>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize - print>>sys.stderr, ' - paired = %s \t\t(-p)' % args.paired - print>>sys.stderr, '' - print>>sys.stderr, 'Estimated memory usage is %.2g bytes (n_hashes x min_hashsize)' % ( - args.n_hashes * args.min_hashsize) - print>>sys.stderr, '-' * 8 + print("** WARNING: hashsize is default! You absodefly want to increase this!\n** Please read the docs!", file=sys.stderr) + + print('\nPARAMETERS:', file=sys.stderr) + print(' - kmer size = %d \t\t(-k)' % args.ksize, file=sys.stderr) + print(' - n hashes = %d \t\t(-N)' % args.n_hashes, file=sys.stderr) + print(' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize, file=sys.stderr) + print(' - paired = %s \t\t(-p)' % args.paired, file=sys.stderr) + print('', file=sys.stderr) + print('Estimated memory usage is %.2g bytes (n_hashes x min_hashsize)' % ( + args.n_hashes * args.min_hashsize), file=sys.stderr) + print('-' * 8, file=sys.stderr) K = args.ksize HT_SIZE = args.min_hashsize @@ -82,10 +84,10 @@ def main(): batch_size = 2 if args.loadhash: - print 'loading hashtable from', args.loadhash + print('loading hashtable from', args.loadhash) ht = khmer.load_counting_hash(args.loadhash) else: - print 'making hashtable' + print('making hashtable') ht = khmer.new_counting_hash(K, HT_SIZE, N_HT) total = 0 @@ -98,13 +100,13 @@ def main(): n = -1 for n, batch in enumerate(batchwise(screed.open(input_filename), batch_size)): if n > 0 and n % 100000 == 0: - print '... kept', total - discarded, 'of', total, ', or', \ - int(100. - discarded / float(total) * 100.), '%' - print '... in file', input_filename + print('... kept', total - discarded, 'of', total, ', or', \ + int(100. - discarded / float(total) * 100.), '%') + print('... 
in file', input_filename) if report_fp: - print>>report_fp, total, total - discarded, \ - 1. - (discarded / float(total)) + print(total, total - discarded, \ + 1. - (discarded / float(total)), file=report_fp) report_fp.flush() total += batch_size @@ -112,8 +114,8 @@ def main(): # If in paired mode, check that the reads are properly interleaved if args.paired: if not validpair(batch[0], batch[1]): - print >>sys.stderr, 'Error: Improperly interleaved pairs %s %s' % ( - batch[0].name, batch[1].name) + print('Error: Improperly interleaved pairs %s %s' % ( + batch[0].name, batch[1].name), file=sys.stderr) sys.exit(-1) # Emit the batch of reads if any read passes the filter @@ -150,27 +152,27 @@ def main(): discarded += batch_size if -1 < n: - print 'DONE with', input_filename, '; kept', total - discarded, 'of',\ - total, 'or', int(100. - discarded / float(total) * 100.), '%' - print 'output in', output_name + print('DONE with', input_filename, '; kept', total - discarded, 'of',\ + total, 'or', int(100. - discarded / float(total) * 100.), '%') + print('output in', output_name) else: - print 'SKIPPED empty file', input_filename + print('SKIPPED empty file', input_filename) if args.savehash: - print 'Saving hashfile through', input_filename - print '...saving to', args.savehash + print('Saving hashfile through', input_filename) + print('...saving to', args.savehash) ht.save(args.savehash) # Change 0.2 only if you really grok it. HINT: You don't. fp_rate = khmer.calc_expected_collisions(ht) - print 'fp rate estimated to be %1.3f' % fp_rate + print('fp rate estimated to be %1.3f' % fp_rate) if fp_rate > 0.20: - print >>sys.stderr, "**" - print >>sys.stderr, "** ERROR: the counting hash is too small for" - print >>sys.stderr, "** this data set. Increase hashsize/num ht." - print >>sys.stderr, "**" - print >>sys.stderr, "** Do not use these results!!" 
+ print("**", file=sys.stderr) + print("** ERROR: the counting hash is too small for", file=sys.stderr) + print("** this data set. Increase hashsize/num ht.", file=sys.stderr) + print("**", file=sys.stderr) + print("** Do not use these results!!", file=sys.stderr) sys.exit(-1) if __name__ == '__main__': diff --git a/sandbox/print-tagset.py b/sandbox/print-tagset.py index 5acc1144a1..dc2a971e46 100755 --- a/sandbox/print-tagset.py +++ b/sandbox/print-tagset.py @@ -5,6 +5,7 @@ # the three-clause BSD license; see LICENSE. # Contact: khmer-project@idyll.org # +from __future__ import print_function import khmer import sys import os @@ -15,7 +16,7 @@ def main(): ht = khmer.new_hashbits(32, 1, 1) ht.load_tagset(sys.argv[1]) - print 'loaded!' + print('loaded!') ht.print_tagset(os.path.basename(sys.argv[1]) + '.txt') diff --git a/sandbox/renumber-partitions.py b/sandbox/renumber-partitions.py index ad4e638adb..87c020c3df 100755 --- a/sandbox/renumber-partitions.py +++ b/sandbox/renumber-partitions.py @@ -5,6 +5,7 @@ # the three-clause BSD license; see LICENSE. # Contact: khmer-project@idyll.org # +from __future__ import print_function import sys import screed import gzip @@ -21,7 +22,7 @@ def main(): old_to_new = {} for n, record in enumerate(screed.open(filename)): if n > 0 and n % 10000 == 0: - print '...', os.path.basename(filename), n + print('...', os.path.basename(filename), n) partition = record.name.split()[-1] name = record.name.split()[0] @@ -33,7 +34,7 @@ def main(): outfp.write('>%s\t%d\n%s\n' % (name, new_part, record.sequence)) outfp.close() - print 'renumbered %d partitions in %s' % (len(old_to_new), filename) + print('renumbered %d partitions in %s' % (len(old_to_new), filename)) if __name__ == '__main__': diff --git a/sandbox/saturate-by-median.py b/sandbox/saturate-by-median.py index 7dc0170347..18eb619754 100755 --- a/sandbox/saturate-by-median.py +++ b/sandbox/saturate-by-median.py @@ -11,13 +11,15 @@ reads whether or not they have high coverage. 
This is better for assessing saturation of (esp) low-coverage data sets. """ +from __future__ import division +from __future__ import print_function import sys import screed import os import khmer import textwrap -from itertools import izip + from khmer.khmer_args import (build_counting_args, add_loadhash_args, report_on_config, info) import argparse @@ -32,7 +34,7 @@ def batchwise(coll, size): iter_coll = iter(coll) - return izip(*[iter_coll] * size) + return zip(*[iter_coll] * size) # Returns true if the pair of records are properly pairs @@ -61,14 +63,14 @@ def normalize_by_median(input_filename, htable, args, report_fp=None, for index, batch in enumerate(batchwise(screed.open( input_filename), batch_size)): if index > 0 and index % report_frequency == 0: - print '... kept {kept} of {total} or {perc:2}%'.format( + print('... kept {kept} of {total} or {perc:2}%'.format( kept=total - discarded, total=total, - perc=int(100. - discarded / float(total) * 100.)) - print '... in file', input_filename + perc=int(100. - discarded / float(total) * 100.))) + print('... in file', input_filename) if report_fp: - print >> report_fp, total, total - discarded, \ - 1. - (discarded / float(total)) + print(total, total - discarded, \ + 1. 
- (discarded / float(total)), file=report_fp) report_fp.flush() total += batch_size @@ -103,8 +105,8 @@ def normalize_by_median(input_filename, htable, args, report_fp=None, def handle_error(error, input_name): - print >> sys.stderr, '** ERROR:', error - print >> sys.stderr, '** Failed on {name}: '.format(name=input_name) + print('** ERROR:', error, file=sys.stderr) + print('** Failed on {name}: '.format(name=input_name), file=sys.stderr) def get_parser(): epilog = (""" @@ -192,10 +194,10 @@ def main(): # pylint: disable=too-many-branches,too-many-statements corrupt_files = [] if args.loadtable: - print 'loading k-mer counting table from', args.loadtable + print('loading k-mer counting table from', args.loadtable) htable = khmer.load_counting_hash(args.loadtable) else: - print 'making k-mer counting table' + print('making k-mer counting table') htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables) @@ -214,36 +216,37 @@ def main(): # pylint: disable=too-many-branches,too-many-statements except IOError as err: handle_error(err, input_filename) if not args.force: - print >> sys.stderr, '** Exiting!' + print('** Exiting!', file=sys.stderr) sys.exit(1) else: - print >> sys.stderr, '*** Skipping error file, moving on...' + print('*** Skipping error file, moving on...', file=sys.stderr) corrupt_files.append(input_filename) else: if total_acc == 0 and discarded_acc == 0: - print 'SKIPPED empty file', input_filename + print('SKIPPED empty file', input_filename) else: total += total_acc discarded += discarded_acc - print 'DONE with {inp}; kept {kept} of {total} or {perc:2}%'\ + print('DONE with {inp}; kept {kept} of {total} or {perc:2}%'\ .format(inp=input_filename, kept=total - discarded, total=total, - perc=int(100. - discarded / float(total) * 100.)) + perc=int(100. 
- discarded / float(total) * 100.))) if args.savetable: - print 'Saving k-mer counting table through', input_filename - print '...saving to', args.savetable + print('Saving k-mer counting table through', input_filename) + print('...saving to', args.savetable) htable.save(args.savetable) # re: threshold, see Zhang et al., # http://arxiv.org/abs/1309.2975 fp_rate = khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8) - print 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate) + print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)) if args.force and len(corrupt_files) > 0: - print >> sys.stderr, "** WARNING: Finished with errors!" - print >> sys.stderr, "** IOErrors occurred in the following files:" - print >> sys.stderr, "\t", " ".join(corrupt_files) + print("** WARNING: Finished with errors!", file=sys.stderr) + print("** IOErrors occurred in the following files:", file=sys.stderr) + print("\t", " ".join(corrupt_files), file=sys.stderr) + if __name__ == '__main__': main() diff --git a/sandbox/shuffle-reverse-rotary.py b/sandbox/shuffle-reverse-rotary.py index 0cb13c4f79..a63e510c4c 100755 --- a/sandbox/shuffle-reverse-rotary.py +++ b/sandbox/shuffle-reverse-rotary.py @@ -1,3 +1,4 @@ +from __future__ import print_function #! 
/usr/bin/env python2 # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is @@ -30,12 +31,12 @@ def main(): for record in screed.open(filename): total += 1 if total % 10000 == 0: - print '...', total + print('...', total) loc = total % ROTARY_SIZE fp_d[loc].write('>%s\n%s\n' % (record.name, record.sequence)) - print 'reverse-rotary shuffled %d sequences into %d files (%s.NNN)' % \ - (total, ROTARY_SIZE, prefix) + print('reverse-rotary shuffled %d sequences into %d files (%s.NNN)' % \ + (total, ROTARY_SIZE, prefix)) if __name__ == '__main__': diff --git a/sandbox/slice-reads-by-coverage.py b/sandbox/slice-reads-by-coverage.py index a830e8f7f1..d8dc3176fa 100755 --- a/sandbox/slice-reads-by-coverage.py +++ b/sandbox/slice-reads-by-coverage.py @@ -4,6 +4,7 @@ # the three-clause BSD license; see LICENSE. # Contact: khmer-project@idyll.org +from __future__ import print_function import argparse import screed import sys @@ -26,16 +27,16 @@ def main(): parser.add_argument('output_readfile') args = parser.parse_args() - print >>sys.stderr, 'min_coverage: %s' % args.min_coverage - print >>sys.stderr, 'max_coverage: %s' % args.max_coverage + print('min_coverage: %s' % args.min_coverage, file=sys.stderr) + print('max_coverage: %s' % args.max_coverage, file=sys.stderr) if not (args.min_coverage or args.max_coverage): - print >>sys.stderr, "neither min nor max coverage specified!? exiting!" + print("neither min nor max coverage specified!? exiting!", file=sys.stderr) sys.exit(1) if args.min_coverage and args.max_coverage and \ args.max_coverage < args.min_coverage: - print >>sys.stderr, "min_coverage > max_coverage!? exiting!" + print("min_coverage > max_coverage!? 
exiting!", file=sys.stderr) sys.exit(1) htable = khmer.load_counting_hash(args.input_counting_table) @@ -46,7 +47,7 @@ def main(): n = 0 for n, record in enumerate(screed.open(args.input_readfile)): if n % 100000 == 0: - print >>sys.stderr, '...', n, n_kept + print('...', n, n_kept, file=sys.stderr) seq = record.sequence.upper() seq = seq.replace('N', 'A') @@ -68,7 +69,7 @@ def main(): output_fp.write(output_single(record)) - print >>sys.stderr, 'consumed %d reads; kept %d' % (n, n_kept) + print('consumed %d reads; kept %d' % (n, n_kept), file=sys.stderr) if __name__ == '__main__': main() diff --git a/sandbox/split-fasta.py b/sandbox/split-fasta.py index 911e102885..3deb0956fb 100755 --- a/sandbox/split-fasta.py +++ b/sandbox/split-fasta.py @@ -5,6 +5,7 @@ # the three-clause BSD license; see LICENSE. # Contact: khmer-project@idyll.org # +from __future__ import print_function import sys import screed @@ -17,12 +18,12 @@ def main(): division = -1 for n, record in enumerate(screed.open(filename)): if n % 100000 == 0: - print '...', n + print('...', n) if n % size == 0: division += 1 new_name = '%s.%04d.fa' % (prefix, division) - print 'opening', new_name + print('opening', new_name) fp = open(new_name, 'w') fp.write('>%s\n%s\n' % (record['name'], record['sequence'])) diff --git a/sandbox/split-sequences-by-length.py b/sandbox/split-sequences-by-length.py index 98ebd040d2..1be79df129 100755 --- a/sandbox/split-sequences-by-length.py +++ b/sandbox/split-sequences-by-length.py @@ -13,6 +13,7 @@ Use '-h' for parameter help. 
""" +from __future__ import print_function import sys import screed.fasta import os @@ -48,12 +49,12 @@ def main(): n = 0 for filename in filenames: - print 'opening' + print('opening') for record in screed.open(filename): out.save(record.name, record.sequence) n += 1 if n % 10000 == 0: - print '...', n + print('...', n) if __name__ == '__main__': main() diff --git a/sandbox/stoptag-abundance-hist.py b/sandbox/stoptag-abundance-hist.py index 0c863a3bdb..b5fc2dcc2b 100755 --- a/sandbox/stoptag-abundance-hist.py +++ b/sandbox/stoptag-abundance-hist.py @@ -5,6 +5,7 @@ # the three-clause BSD license; see LICENSE. # Contact: khmer-project@idyll.org # +from __future__ import print_function import sys import khmer import os @@ -33,7 +34,7 @@ def main(): d[count] = d.get(count, 0) + 1 if count > 1000: - print >>outabund, sequence, count + print(sequence, count, file=outabund) outfp = open(figure + '.countshist', 'w') sofar = 0 @@ -41,7 +42,7 @@ def main(): for k in sorted(d.keys()): sofar += d[k] sofar_cumu += k * d[k] - print >>outfp, k, d[k], sofar, sofar_cumu + print(k, d[k], sofar, sofar_cumu, file=outfp) hist(counts, normed=True, cumulative=True, bins=100, range=(1, 1000)) savefig(figure) diff --git a/sandbox/stoptags-by-position.py b/sandbox/stoptags-by-position.py index 4838f86bf3..e1130785e4 100755 --- a/sandbox/stoptags-by-position.py +++ b/sandbox/stoptags-by-position.py @@ -1,3 +1,4 @@ +from __future__ import print_function #! /usr/bin/env python2 # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is @@ -38,7 +39,7 @@ def main(): for i, (n, m) in enumerate(zip(x, y)): if m: - print '%d,%d,%d' % (i, n, m) + print('%d,%d,%d' % (i, n, m)) if __name__ == '__main__': diff --git a/sandbox/strip-partition.py b/sandbox/strip-partition.py index 08147b1a16..e98ae79074 100755 --- a/sandbox/strip-partition.py +++ b/sandbox/strip-partition.py @@ -5,6 +5,7 @@ # the three-clause BSD license; see LICENSE. 
# Contact: khmer-project@idyll.org # +from __future__ import print_function import screed import sys @@ -16,7 +17,7 @@ def main(): name = name.split()[0] - print '>%s\n%s' % (name, sequence,) + print('>%s\n%s' % (name, sequence,)) if __name__ == '__main__': diff --git a/sandbox/subset-report.py b/sandbox/subset-report.py index 75e8e323e6..47eab6118d 100755 --- a/sandbox/subset-report.py +++ b/sandbox/subset-report.py @@ -5,6 +5,7 @@ # the three-clause BSD license; see LICENSE. # Contact: khmer-project@idyll.org # +from __future__ import print_function import khmer import sys import gc @@ -18,19 +19,19 @@ def main(): ht = khmer.new_hashbits(K, 1, 1) for filename in subset_filenames: - print '--' - print 'partition map:', filename + print('--') + print('partition map:', filename) subset = ht.load_subset_partitionmap(filename) n_part, n_orphan = ht.subset_count_partitions(subset) - print 'num partitions:', n_part - print 'num orphans:', n_orphan + print('num partitions:', n_part) + print('num orphans:', n_orphan) (dist, n_unassigned) = ht.subset_partition_size_distribution(subset) for (size, count) in dist: - print size, count - print '%d unassigned tags' % n_unassigned + print(size, count) + print('%d unassigned tags' % n_unassigned) - print '--' + print('--') if __name__ == '__main__': diff --git a/sandbox/sweep-files.py b/sandbox/sweep-files.py index cff9145aef..c3da019118 100755 --- a/sandbox/sweep-files.py +++ b/sandbox/sweep-files.py @@ -12,6 +12,7 @@ % sweep-files.py -r --db \ --query """ +from __future__ import print_function EPILOG = """ Output will be a collection of fasta/q files, each corresponding to a database @@ -121,7 +122,7 @@ def main(): # de Bruin graph; open a file and output queue for each file as well. ht = khmer.LabelHash(K, HT_SIZE, N_HT) try: - print >>sys.stderr, 'consuming and labeling input sequences...' 
+ print('consuming and labeling input sequences...', file=sys.stderr) for i, dbfile in enumerate(args.db): @@ -132,18 +133,17 @@ def main(): for n, record in enumerate(screed.open(dbfile)): if n % 50000 == 0: - print >>sys.stderr, \ - '...consumed {n} sequences...'.format(n=n) + print('...consumed {n} sequences...'.format(n=n), file=sys.stderr) ht.consume_sequence_and_tag_with_labels(record.sequence, i) except IOError as e: - print >>sys.stderr, '!! ERROR: !!', e - print >>sys.stderr, '...error setting up outputs. exiting...' + print('!! ERROR: !!', e, file=sys.stderr) + print('...error setting up outputs. exiting...', file=sys.stderr) - print >>sys.stderr, 'done consuming input sequence. \ + print('done consuming input sequence. \ added {t} tags and {l} labels...' \ - .format(t=ht.n_tags(), l=ht.n_labels()) + .format(t=ht.n_tags(), l=ht.n_labels()), file=sys.stderr) n_orphaned = 0 n_labeled = 0 @@ -152,21 +152,20 @@ def main(): # Iterate through all the reads and check for the labels with which they # intersect. Queue to the corresponding label when found. for read_file in args.query: - print >>sys.stderr, '** sweeping {read_file} for labels...'.format( - read_file=read_file) + print('** sweeping {read_file} for labels...'.format( + read_file=read_file), file=sys.stderr) try: read_fp = screed.open(read_file) except IOError as error: - print >>sys.stderr, '!! ERROR: !!', error - print >>sys.stderr, '*** Could not open {fn}, skipping...'.format( - fn=read_file) + print('!! 
ERROR: !!', error, file=sys.stderr) + print('*** Could not open {fn}, skipping...'.format( + fn=read_file), file=sys.stderr) else: for n, record in enumerate(read_fp): if n % 50000 == 0 and n > 0: - print >>sys.stderr, \ - '\tswept {n} reads [{nc} labeled, {no} orphaned]' \ + print('\tswept {n} reads [{nc} labeled, {no} orphaned]' \ .format(n=n, nc=n_labeled, - no=n_orphaned) + no=n_orphaned), file=sys.stderr) seq = record.sequence try: labels = ht.sweep_label_neighborhood(seq, traversal_range) @@ -184,19 +183,19 @@ def main(): else: n_orphaned += 1 - print >>sys.stderr, '** End of file {fn}...'.format(fn=read_file) + print('** End of file {fn}...'.format(fn=read_file), file=sys.stderr) read_fp.close() # gotta output anything left in the buffers at the end! - print >>sys.stderr, '** End of run...' - for q in outputs.values(): + print('** End of run...', file=sys.stderr) + for q in list(outputs.values()): q.clear() - print >>sys.stderr, 'swept {n_reads}...'.format( - n_reads=n_labeled + n_orphaned) - print >>sys.stderr, '...with {nc} labeled and {no} orphaned'.format( - nc=n_labeled, no=n_orphaned) - print >>sys.stderr, '...and {nmc} multilabeled'.format(nmc=n_mlabeled) + print('swept {n_reads}...'.format( + n_reads=n_labeled + n_orphaned), file=sys.stderr) + print('...with {nc} labeled and {no} orphaned'.format( + nc=n_labeled, no=n_orphaned), file=sys.stderr) + print('...and {nmc} multilabeled'.format(nmc=n_mlabeled), file=sys.stderr) if __name__ == '__main__': main() diff --git a/sandbox/sweep-out-reads-with-contigs.py b/sandbox/sweep-out-reads-with-contigs.py index ce96e214d5..7df9aceed8 100755 --- a/sandbox/sweep-out-reads-with-contigs.py +++ b/sandbox/sweep-out-reads-with-contigs.py @@ -5,6 +5,7 @@ # the three-clause BSD license; see LICENSE. 
# Contact: khmer-project@idyll.org # +from __future__ import print_function import sys import khmer import os.path @@ -27,12 +28,12 @@ def main(): ht._set_tag_density(0) # load contigs, connect into N partitions - print 'loading contigs from', contigfile + print('loading contigs from', contigfile) ht.consume_fasta_and_tag(contigfile) subset = ht.do_subset_partition(0, 0) ht.merge_subset(subset) - print 'outputting contig-partitioned reads to', outfile + print('outputting contig-partitioned reads to', outfile) ht.output_partitions(readsfile, outfile, True) diff --git a/sandbox/sweep-reads.py b/sandbox/sweep-reads.py index 849b871844..ffdcc90cac 100755 --- a/sandbox/sweep-reads.py +++ b/sandbox/sweep-reads.py @@ -1,3 +1,4 @@ +from __future__ import print_function, unicode_literals #! /usr/bin/env python2 # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is @@ -6,6 +7,8 @@ # # pylint: disable=invalid-name,missing-docstring,no-member +from io import open + from khmer import utils """ @@ -103,12 +106,12 @@ def __init__(self, max_buffers, max_reads, max_size, output_pref, outdir, self.num_write_errors = 0 self.num_file_errors = 0 - print >>sys.stderr, '''Init new ReadBuffer [ + print('''Init new ReadBuffer [ Max Buffers: {num_bufs} Max Reads: {max_reads} Buffer flush: {buf_flush} ]'''.format(num_bufs=self.max_buffers, max_reads=self.max_reads, - buf_flush=self.buffer_flush) + buf_flush=self.buffer_flush), file=sys.stderr) def flush_buffer(self, buf_id): fn = '{prefix}_{buffer_id}.{ext}'.format(prefix=self.output_pref, @@ -119,9 +122,9 @@ def flush_buffer(self, buf_id): try: outfp = open(fpath, 'a') except IOError as _: - print >>sys.stderr, '!! ERROR: {_} !!'.format(_=_) - print >>sys.stderr, '*** Failed to open {fn} for \ - buffer flush'.format(fn=fpath) + print('!! 
ERROR: {_} !!'.format(_=_), file=sys.stderr) + print('*** Failed to open {fn} for \ + buffer flush'.format(fn=fpath), file=sys.stderr) self.num_file_errors += 1 else: outfp.write(buf.flush()) @@ -142,16 +145,16 @@ def queue(self, seq_str, buf_id): self.cur_reads += 1 if self.cur_reads > self.max_reads: - print >>sys.stderr, '** Reached max num reads...' + print('** Reached max num reads...', file=sys.stderr) self.flush_all() if len(self.buffers) > self.max_buffers: # self.clean_buffers(2) - print >>sys.stderr, '** Reached max num buffers...' + print('** Reached max num buffers...', file=sys.stderr) self.flush_all() def flush_all(self): - print >>sys.stderr, '*** Flushing all to files...' - buf_ids = self.buffers.keys() + print('*** Flushing all to files...', file=sys.stderr) + buf_ids = list(self.buffers.keys()) for buf_id in buf_ids: self.flush_buffer(buf_id) assert self.cur_reads == 0 @@ -239,7 +242,7 @@ def main(): # figure out input file type (FA/FQ) -- based on first file ix = iter(screed.open(args.input_files[0])) - record = ix.next() + record = next(ix) del ix extension = 'fa' @@ -252,21 +255,19 @@ def main(): # consume the partitioned fasta with which to label the graph ht = khmer.LabelHash(K, HT_SIZE, N_HT) try: - print >>sys.stderr, 'consuming input sequences...' 
+ print('consuming input sequences...', file=sys.stderr) if args.label_by_pid: - print >>sys.stderr, '...labeling by partition id (pid)' + print('...labeling by partition id (pid)', file=sys.stderr) ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp) elif args.label_by_seq: - print >>sys.stderr, '...labeling by sequence' + print('...labeling by sequence', file=sys.stderr) for n, record in enumerate(screed.open(input_fastp)): if n % 50000 == 0: - print >>sys.stderr, \ - '...consumed {n} sequences...'.format(n=n) + print('...consumed {n} sequences...'.format(n=n), file=sys.stderr) ht.consume_sequence_and_tag_with_labels(record.sequence, n) else: - print >>sys.stderr, \ - '...labeling to create groups of size {s}'.format( - s=args.group_size) + print('...labeling to create groups of size {s}'.format( + s=args.group_size), file=sys.stderr) label = -1 g = 0 try: @@ -283,23 +284,22 @@ def main(): pref=output_pref, g=g, ext=extension), 'wb') if n % 50000 == 0: - print >>sys.stderr, \ - '...consumed {n} sequences...'.format(n=n) + print('...consumed {n} sequences...'.format(n=n), file=sys.stderr) ht.consume_sequence_and_tag_with_labels(record.sequence, label) write_record(record, outfp) - + except IOError as e: - print >>sys.stderr, '!! ERROR !!', e - print >>sys.stderr, '...error splitting input. exiting...' + print('!! ERROR !!', e, file=sys.stderr) + print('...error splitting input. exiting...', file=sys.stderr) except IOError as e: - print >>sys.stderr, '!! ERROR: !!', e - print >>sys.stderr, '...error consuming \ - {i}. exiting...'.format(i=input_fastp) + print('!! ERROR: !!', e, file=sys.stderr) + print('...error consuming \ + {i}. exiting...'.format(i=input_fastp), file=sys.stderr) - print >>sys.stderr, 'done consuming input sequence. \ + print('done consuming input sequence. 
\ added {t} tags and {l} \ labels...'.format(t=ht.graph.n_tags(), l=ht.n_labels()) @@ -314,27 +314,27 @@ def main(): total_t = time.clock() start_t = time.clock() for read_file in args.input_files: - print >>sys.stderr, '** sweeping {read_file} for labels...'.format( - read_file=read_file) + print('** sweeping {read_file} for labels...'.format( + read_file=read_file), file=sys.stderr) file_t = 0.0 try: read_fp = screed.open(read_file) except IOError as error: - print >>sys.stderr, '!! ERROR: !!', error - print >>sys.stderr, '*** Could not open {fn}, skipping...'.format( - fn=read_file) + print('!! ERROR: !!', error, file=sys.stderr) + print('*** Could not open {fn}, skipping...'.format( + fn=read_file), file=sys.stderr) else: for _, record in enumerate(read_fp): if _ % 50000 == 0: end_t = time.clock() batch_t = end_t - start_t file_t += batch_t - print >>sys.stderr, '\tswept {n} reads [{nc} labeled, \ + print('\tswept {n} reads [{nc} labeled, \ {no} orphaned] \ ** {sec}s ({sect}s total)' \ .format(n=_, nc=n_labeled, no=n_orphaned, - sec=batch_t, sect=file_t) + sec=batch_t, sect=file_t), file=sys.stderr) start_t = time.clock() seq = record.sequence name = record.name @@ -361,37 +361,37 @@ def main(): n_orphaned += 1 output_buffer.queue(seq_str, 'orphaned') label_dict['orphaned'] += 1 - print >>sys.stderr, '** End of file {fn}...'.format(fn=read_file) + print('** End of file {fn}...'.format(fn=read_file), file=sys.stderr) output_buffer.flush_all() read_fp.close() # gotta output anything left in the buffers at the end! - print >>sys.stderr, '** End of run...' + print('** End of run...', file=sys.stderr) output_buffer.flush_all() total_t = time.clock() - total_t if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0: - print >>sys.stderr, '! WARNING: Sweep finished with errors !' 
- print >>sys.stderr, '** {writee} reads not written'.format( - writee=output_buffer.num_write_errors) - print >>sys.stderr, '** {filee} errors opening files'.format( - filee=output_buffer.num_file_errors) - - print >>sys.stderr, 'swept {n_reads} for labels...'.format( - n_reads=n_labeled + n_orphaned) - print >>sys.stderr, '...with {nc} labeled and {no} orphaned'.format( - nc=n_labeled, no=n_orphaned) - print >>sys.stderr, '...and {nmc} multilabeled'.format(nmc=n_mlabeled) - - print >>sys.stderr, '** outputting label number distribution...' + print('! WARNING: Sweep finished with errors !', file=sys.stderr) + print('** {writee} reads not written'.format( + writee=output_buffer.num_write_errors), file=sys.stderr) + print('** {filee} errors opening files'.format( + filee=output_buffer.num_file_errors), file=sys.stderr) + + print('swept {n_reads} for labels...'.format( + n_reads=n_labeled + n_orphaned), file=sys.stderr) + print('...with {nc} labeled and {no} orphaned'.format( + nc=n_labeled, no=n_orphaned), file=sys.stderr) + print('...and {nmc} multilabeled'.format(nmc=n_mlabeled), file=sys.stderr) + + print('** outputting label number distribution...', file=sys.stderr) fn = os.path.join(outdir, '{pref}.dist.txt'.format(pref=output_pref)) - with open(fn, 'wb') as outfp: + with open(fn, 'w', encoding='utf-8') as outfp: for nc in label_number_dist: outfp.write('{nc}\n'.format(nc=nc)) fn = os.path.join(outdir, '{pref}.counts.csv'.format(pref=output_pref)) - print >>sys.stderr, '** outputting label read counts...' - with open(fn, 'wb') as outfp: + print('** outputting label read counts...', file=sys.stderr) + with open(fn, 'w', encoding='utf-8') as outfp: for k in label_dict: outfp.write('{l},{c}\n'.format(l=k, c=label_dict[k])) diff --git a/sandbox/sweep-reads2.py b/sandbox/sweep-reads2.py index eae51e261c..4a749af075 100755 --- a/sandbox/sweep-reads2.py +++ b/sandbox/sweep-reads2.py @@ -14,6 +14,7 @@ Use '-h' for parameter help. 
""" +from __future__ import print_function import sys import khmer @@ -31,20 +32,20 @@ def main(): if not args.quiet: if args.min_hashsize == DEFAULT_MIN_HASHSIZE: - print >>sys.stderr, "** WARNING: hashsize is default! " \ + print("** WARNING: hashsize is default! " \ "You absodefly want to increase this!\n** " \ - "Please read the docs!" - - print >>sys.stderr, '\nPARAMETERS:' - print >>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize - print >>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes - print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \ - args.min_hashsize - print >>sys.stderr, '' - print >>sys.stderr, 'Estimated memory usage is %.2g bytes ' \ + "Please read the docs!", file=sys.stderr) + + print('\nPARAMETERS:', file=sys.stderr) + print(' - kmer size = %d \t\t(-k)' % args.ksize, file=sys.stderr) + print(' - n hashes = %d \t\t(-N)' % args.n_hashes, file=sys.stderr) + print(' - min hashsize = %-5.2g \t(-x)' % \ + args.min_hashsize, file=sys.stderr) + print('', file=sys.stderr) + print('Estimated memory usage is %.2g bytes ' \ '(n_hashes x min_hashsize / 8)' % ( - args.n_hashes * args.min_hashsize / 8.) - print >>sys.stderr, '-' * 8 + args.n_hashes * args.min_hashsize / 8.), file=sys.stderr) + print('-' * 8, file=sys.stderr) K = args.ksize HT_SIZE = args.min_hashsize @@ -60,10 +61,10 @@ def main(): ht = khmer.new_hashbits(K, HT_SIZE, N_HT) # load contigs, connect into N partitions - print 'loading input reads from', inp + print('loading input reads from', inp) ht.consume_fasta(inp) - print 'starting sweep.' + print('starting sweep.') n = 0 m = 0 @@ -72,7 +73,7 @@ def main(): continue if n % 10000 == 0: - print '...', n, m + print('...', n, m) count = ht.get_median_count(record.sequence)[0] if count: diff --git a/sandbox/sweep-reads3.py b/sandbox/sweep-reads3.py index d2e0db5a95..d52ffe4039 100755 --- a/sandbox/sweep-reads3.py +++ b/sandbox/sweep-reads3.py @@ -14,6 +14,7 @@ Use '-h' for parameter help. 
""" +from __future__ import print_function import sys import os.path @@ -38,20 +39,20 @@ def main(): if not args.quiet: if args.min_hashsize == DEFAULT_MIN_HASHSIZE: - print >>sys.stderr, "** WARNING: hashsize is default! " \ + print("** WARNING: hashsize is default! " \ "You absodefly want to increase this!\n** " \ - "Please read the docs!" - - print >>sys.stderr, '\nPARAMETERS:' - print >>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize - print >>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes - print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \ - args.min_hashsize - print >>sys.stderr, '' - print >>sys.stderr, 'Estimated memory usage is %.2g bytes ' \ + "Please read the docs!", file=sys.stderr) + + print('\nPARAMETERS:', file=sys.stderr) + print(' - kmer size = %d \t\t(-k)' % args.ksize, file=sys.stderr) + print(' - n hashes = %d \t\t(-N)' % args.n_hashes, file=sys.stderr) + print(' - min hashsize = %-5.2g \t(-x)' % \ + args.min_hashsize, file=sys.stderr) + print('', file=sys.stderr) + print('Estimated memory usage is %.2g bytes ' \ '(n_hashes x min_hashsize / 8)' % ( - args.n_hashes * args.min_hashsize * len(args.input_filenames) / 8.) - print >>sys.stderr, '-' * 8 + args.n_hashes * args.min_hashsize * len(args.input_filenames) / 8.), file=sys.stderr) + print('-' * 8, file=sys.stderr) K = args.ksize HT_SIZE = args.min_hashsize @@ -73,10 +74,10 @@ def main(): ht = query_list[n][0] # load contigs, connect into N partitions - print 'loading input reads from', inp_name + print('loading input reads from', inp_name) ht.consume_fasta(inp_name) - print 'starting sweep.' 
+ print('starting sweep.') n = 0 m = 0 @@ -85,7 +86,7 @@ def main(): continue if n % 10000 == 0: - print '...', n, m + print('...', n, m) for ht, outfp in query_list: count = ht.get_median_count(record.sequence)[0] diff --git a/sandbox/unique-kmers.py b/sandbox/unique-kmers.py index 6e78c990bd..e14949e64a 100755 --- a/sandbox/unique-kmers.py +++ b/sandbox/unique-kmers.py @@ -13,6 +13,7 @@ Use '-h' for parameter help. """ +from __future__ import print_function import argparse @@ -93,21 +94,22 @@ def main(): write_record(record, sys.stdout) cardinality = hllcpp.estimate_cardinality() - print >> sys.stderr, \ - 'Estimated number of unique {0}-mers in {1}: {2}'.format( - args.ksize, input_filename, cardinality) + print('Estimated number of unique {0}-mers in {1}: {2}'.format( + args.ksize, input_filename, cardinality), + file=sys.stderr) if report_fp: - print >> report_fp, cardinality, args.ksize, '(total)' + print(cardinality, args.ksize, '(total)', file=report_fp) report_fp.flush() total_hll.merge(hllcpp) cardinality = total_hll.estimate_cardinality() - print >> sys.stderr, 'Total estimated number of unique {0}-mers: {1}'.format( - args.ksize, cardinality) + print('Total estimated number of unique {0}-mers: {1}'.format( + args.ksize, cardinality), + file=sys.stderr) if report_fp: - print >> report_fp, cardinality, args.ksize, 'total' + print(cardinality, args.ksize, 'total', file=report_fp) report_fp.flush() if __name__ == "__main__": diff --git a/sandbox/write-trimmomatic.py b/sandbox/write-trimmomatic.py index 93a389a897..4d711ff87f 100755 --- a/sandbox/write-trimmomatic.py +++ b/sandbox/write-trimmomatic.py @@ -5,6 +5,7 @@ # the three-clause BSD license; see LICENSE. 
# Contact: khmer-project@idyll.org # +from __future__ import print_function import glob filelist = glob.glob('*R1*.fastq.gz') @@ -13,7 +14,7 @@ r2 = r1.replace('R1', 'R2') final_pe = r1[:-9] + '.pe.fq.gz' final_se = r1[:-9] + '.se.fq.gz' - print """\ + print("""\ mkdir trim cd trim java -jar /usr/local/bin/trimmomatic-0.30.jar PE ../%s ../%s s1_pe s1_se s2_pe s2_se ILLUMINACLIP:/usr/local/share/adapters/TruSeq3-PE.fa:2:30:10 @@ -24,4 +25,4 @@ rm -r ./trim/ chmod u-w %s %s -""" % (r1, r2, final_pe, final_se, final_pe, final_se) +""" % (r1, r2, final_pe, final_se, final_pe, final_se)) diff --git a/scripts/abundance-dist-single.py b/scripts/abundance-dist-single.py index 0b3452bfae..3625e7faed 100755 --- a/scripts/abundance-dist-single.py +++ b/scripts/abundance-dist-single.py @@ -15,6 +15,7 @@ Use '-h' for parameter help. """ +from __future__ import print_function import os import sys import csv @@ -83,8 +84,8 @@ def main(): # pylint: disable=too-many-locals,too-many-branches if (not args.squash_output and os.path.exists(args.output_histogram_filename)): - print >> sys.stderr, 'ERROR: %s exists; not squashing.' % \ - args.output_histogram_filename + print('ERROR: %s exists; not squashing.' 
% + args.output_histogram_filename, file=sys.stderr) sys.exit(1) else: hist_fp = open(args.output_histogram_filename, 'w') @@ -94,26 +95,26 @@ def main(): # pylint: disable=too-many-locals,too-many-branches hist_fp_csv.writerow(['abundance', 'count', 'cumulative', 'cumulative_fraction']) - print >>sys.stderr, 'making k-mer counting table' + print('making k-mer counting table', file=sys.stderr) counting_hash = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables) counting_hash.set_use_bigcount(args.bigcount) - print >> sys.stderr, 'building k-mer tracking table' + print('building k-mer tracking table', file=sys.stderr) tracking = khmer.new_hashbits(counting_hash.ksize(), args.min_tablesize, args.n_tables) - print >>sys.stderr, 'kmer_size:', counting_hash.ksize() - print >>sys.stderr, 'k-mer counting table sizes:', \ - counting_hash.hashsizes() - print >>sys.stderr, 'outputting to', args.output_histogram_filename + print('kmer_size:', counting_hash.ksize(), file=sys.stderr) + print('k-mer counting table sizes:', + counting_hash.hashsizes(), file=sys.stderr) + print('outputting to', args.output_histogram_filename, file=sys.stderr) # start loading rparser = khmer.ReadParser(args.input_sequence_filename) threads = [] - print >>sys.stderr, 'consuming input, round 1 --', \ - args.input_sequence_filename - for _ in xrange(args.threads): + print('consuming input, round 1 --', + args.input_sequence_filename, file=sys.stderr) + for _ in range(args.threads): thread = \ threading.Thread( target=counting_hash.consume_fasta_with_reads_parser, @@ -126,8 +127,8 @@ def main(): # pylint: disable=too-many-locals,too-many-branches thread.join() if args.report_total_kmers: - print >> sys.stderr, 'Total number of unique k-mers: {0}'.format( - counting_hash.n_unique_kmers()) + print('Total number of unique k-mers: {0}'.format( + counting_hash.n_unique_kmers()), file=sys.stderr) abundance_lists = [] @@ -136,13 +137,13 @@ def __do_abundance_dist__(read_parser): read_parser, 
tracking) abundance_lists.append(abundances) - print >>sys.stderr, 'preparing hist from %s...' % \ - args.input_sequence_filename + print('preparing hist from %s...' % + args.input_sequence_filename, file=sys.stderr) rparser = khmer.ReadParser(args.input_sequence_filename) threads = [] - print >>sys.stderr, 'consuming input, round 2 --', \ - args.input_sequence_filename - for _ in xrange(args.threads): + print('consuming input, round 2 --', + args.input_sequence_filename, file=sys.stderr) + for _ in range(args.threads): thread = \ threading.Thread( target=__do_abundance_dist__, @@ -163,10 +164,10 @@ def __do_abundance_dist__(read_parser): total = sum(abundance.values()) if 0 == total: - print >> sys.stderr, \ - "ERROR: abundance distribution is uniformly zero; " \ - "nothing to report." - print >> sys.stderr, "\tPlease verify that the input files are valid." + print("ERROR: abundance distribution is uniformly zero; " + "nothing to report.", file=sys.stderr) + print( + "\tPlease verify that the input files are valid.", file=sys.stderr) sys.exit(1) sofar = 0 @@ -180,17 +181,17 @@ def __do_abundance_dist__(read_parser): if args.csv: hist_fp_csv.writerow([_, i, sofar, round(frac, 3)]) else: - print >> hist_fp, _, i, sofar, round(frac, 3) + print(_, i, sofar, round(frac, 3), file=hist_fp) if sofar == total: break if args.savetable: - print >>sys.stderr, 'Saving k-mer counting table ', args.savetable - print >>sys.stderr, '...saving to', args.savetable + print('Saving k-mer counting table ', args.savetable, file=sys.stderr) + print('...saving to', args.savetable, file=sys.stderr) counting_hash.save(args.savetable) - print >> sys.stderr, 'wrote to: ' + args.output_histogram_filename + print('wrote to: ' + args.output_histogram_filename, file=sys.stderr) if __name__ == '__main__': main() diff --git a/scripts/annotate-partitions.py b/scripts/annotate-partitions.py index 0b27fed12e..ab1a0db93c 100755 --- a/scripts/annotate-partitions.py +++ b/scripts/annotate-partitions.py 
@@ -15,6 +15,7 @@ Use '-h' for parameter help. """ +from __future__ import print_function import os import argparse @@ -76,16 +77,16 @@ def main(): check_space(filenames, args.force) - print >>sys.stderr, 'loading partition map from:', partitionmap_file + print('loading partition map from:', partitionmap_file, file=sys.stderr) htable.load_partitionmap(partitionmap_file) for infile in filenames: - print >>sys.stderr, 'outputting partitions for', infile + print('outputting partitions for', infile, file=sys.stderr) outfile = os.path.basename(infile) + '.part' part_count = htable.output_partitions(infile, outfile) - print >>sys.stderr, 'output %d partitions for %s' % ( - part_count, infile) - print >>sys.stderr, 'partitions are in', outfile + print('output %d partitions for %s' % ( + part_count, infile), file=sys.stderr) + print('partitions are in', outfile, file=sys.stderr) if __name__ == '__main__': main() diff --git a/scripts/count-median.py b/scripts/count-median.py index 72903b46cf..5d9be9af6f 100755 --- a/scripts/count-median.py +++ b/scripts/count-median.py @@ -21,6 +21,7 @@ NOTE: All 'N's in the input sequences are converted to 'A's. 
""" +from __future__ import print_function import screed import argparse import sys @@ -85,11 +86,11 @@ def main(): check_space(infiles, args.force) - print >>sys.stderr, 'loading k-mer counting table from', htfile + print('loading k-mer counting table from', htfile, file=sys.stderr) htable = khmer.load_counting_hash(htfile) ksize = htable.ksize() - print >>sys.stderr, 'writing to', output_filename + print('writing to', output_filename, file=sys.stderr) output = open(output_filename, 'w') if args.csv: @@ -109,10 +110,11 @@ def main(): if ksize <= len(seq): medn, ave, stdev = htable.get_median_count(seq) + ave, stdev = [round(x, 9) for x in (ave, stdev)] if args.csv: output.writerow([record.name, medn, ave, stdev, len(seq)]) else: - print >> output, record.name, medn, ave, stdev, len(seq) + print(record.name, medn, ave, stdev, len(seq), file=output) if __name__ == '__main__': main() diff --git a/scripts/count-overlap.py b/scripts/count-overlap.py index e703449b61..3c9968c33d 100755 --- a/scripts/count-overlap.py +++ b/scripts/count-overlap.py @@ -18,6 +18,7 @@ Use '-h' for parameter help. 
""" +from __future__ import print_function import sys import csv import khmer @@ -64,7 +65,7 @@ def main(): check_space([args.ptfile, args.fafile], args.force) - print >>sys.stderr, 'loading k-mer presence table from', args.ptfile + print('loading k-mer presence table from', args.ptfile, file=sys.stderr) ht1 = khmer.load_hashbits(args.ptfile) kmer_size = ht1.ksize() @@ -93,9 +94,9 @@ def main(): if args.csv: f_curve_obj_csv.writerow([list_curve[100 + i], list_curve[i]]) else: - print >> f_curve_obj, list_curve[100 + i], list_curve[i] + print(list_curve[100 + i], list_curve[i], file=f_curve_obj) - print >> sys.stderr, 'wrote to: ' + args.report_filename + print('wrote to: ' + args.report_filename, file=sys.stderr) if __name__ == '__main__': main() diff --git a/scripts/do-partition.py b/scripts/do-partition.py index be8091f871..eefb43f4da 100755 --- a/scripts/do-partition.py +++ b/scripts/do-partition.py @@ -13,11 +13,12 @@ Use '-h' for parameter help. """ +from __future__ import print_function import khmer import sys import threading -import Queue +import queue import gc import os.path import os @@ -36,9 +37,9 @@ # Debugging Support if "Linux" == platform.system(): def __debug_vm_usage(msg): - print >>sys.stderr, "===> DEBUG: " + msg + print("===> DEBUG: " + msg, file=sys.stderr) for vmstat in re.findall(r".*Vm.*", file("/proc/self/status").read()): - print vmstat + print(vmstat) else: def __debug_vm_usage(msg): # pylint: disable=unused-argument pass @@ -48,23 +49,23 @@ def worker(queue, basename, stop_big_traversals): while True: try: (htable, index, start, stop) = queue.get(False) - except Queue.Empty: - print >>sys.stderr, 'exiting' + except queue.Empty: + print('exiting', file=sys.stderr) return outfile = basename + '.subset.%d.pmap' % (index,) if os.path.exists(outfile): - print >>sys.stderr, 'SKIPPING', outfile, ' -- already exists' + print('SKIPPING', outfile, ' -- already exists', file=sys.stderr) continue - print >>sys.stderr, 'starting:', basename, index + 
print('starting:', basename, index, file=sys.stderr) # pay attention to stoptags when partitioning; take command line # direction on whether or not to exhaustively traverse. subset = htable.do_subset_partition(start, stop, True, stop_big_traversals) - print >>sys.stderr, 'saving:', basename, index + print('saving:', basename, index, file=sys.stderr) htable.save_subset_partitionmap(subset, outfile) del subset gc.collect() @@ -114,38 +115,39 @@ def main(): # pylint: disable=too-many-locals,too-many-statements check_space(args.input_filenames, args.force) - print >>sys.stderr, 'Saving k-mer presence table to %s' % args.graphbase - print >>sys.stderr, 'Loading kmers from sequences in %s' % \ - repr(args.input_filenames) - print >>sys.stderr, '--' - print >>sys.stderr, 'SUBSET SIZE', args.subset_size - print >>sys.stderr, 'N THREADS', args.threads - print >>sys.stderr, '--' + print('Saving k-mer presence table to %s' % + args.graphbase, file=sys.stderr) + print('Loading kmers from sequences in %s' % + repr(args.input_filenames), file=sys.stderr) + print('--', file=sys.stderr) + print('SUBSET SIZE', args.subset_size, file=sys.stderr) + print('N THREADS', args.threads, file=sys.stderr) + print('--', file=sys.stderr) # load-graph - print >>sys.stderr, 'making k-mer presence table' + print('making k-mer presence table', file=sys.stderr) htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables) for _, filename in enumerate(args.input_filenames): - print >>sys.stderr, 'consuming input', filename + print('consuming input', filename, file=sys.stderr) htable.consume_fasta_and_tag(filename) # 0.18 is ACTUAL MAX. Do not change. fp_rate = \ khmer.calc_expected_collisions(htable, args.force, max_false_pos=.15) - print >>sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate + print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr) # partition-graph # do we want to exhaustively traverse the graph? 
stop_big_traversals = args.no_big_traverse if stop_big_traversals: - print >>sys.stderr, '** This script brakes for lumps: ', \ - 'stop_big_traversals is true.' + print('** This script brakes for lumps: ', + 'stop_big_traversals is true.', file=sys.stderr) else: - print >>sys.stderr, '** Traverse all the things:', \ - ' stop_big_traversals is false.' + print('** Traverse all the things:', + ' stop_big_traversals is false.', file=sys.stderr) # # now, partition! @@ -157,7 +159,7 @@ def main(): # pylint: disable=too-many-locals,too-many-statements divvy.append(0) # build a queue of tasks: - worker_q = Queue.Queue() + worker_q = queue.Queue() # break up the subsets into a list of worker tasks for _ in range(0, n_subsets): @@ -165,7 +167,7 @@ def main(): # pylint: disable=too-many-locals,too-many-statements end = divvy[_ + 1] worker_q.put((htable, _, start, end)) - print >>sys.stderr, 'enqueued %d subset tasks' % n_subsets + print('enqueued %d subset tasks' % n_subsets, file=sys.stderr) open('%s.info' % args.graphbase, 'w').write('%d subsets total\n' % (n_subsets)) @@ -173,8 +175,8 @@ def main(): # pylint: disable=too-many-locals,too-many-statements args.threads = n_subsets # start threads! - print >>sys.stderr, 'starting %d threads' % args.threads - print >>sys.stderr, '---' + print('starting %d threads' % args.threads, file=sys.stderr) + print('---', file=sys.stderr) threads = [] for _ in range(args.threads): @@ -186,43 +188,43 @@ def main(): # pylint: disable=too-many-locals,too-many-statements assert threading.active_count() == args.threads + 1 - print >>sys.stderr, 'done starting threads' + print('done starting threads', file=sys.stderr) # wait for threads for _ in threads: _.join() - print >>sys.stderr, '---' - print >>sys.stderr, 'done making subsets! see %s.subset.*.pmap' % \ - (args.graphbase,) + print('---', file=sys.stderr) + print('done making subsets! 
see %s.subset.*.pmap' % + (args.graphbase,), file=sys.stderr) # merge-partitions pmap_files = glob.glob(args.graphbase + '.subset.*.pmap') - print >>sys.stderr, 'loading %d pmap files (first one: %s)' % \ - (len(pmap_files), pmap_files[0]) + print('loading %d pmap files (first one: %s)' % + (len(pmap_files), pmap_files[0]), file=sys.stderr) htable = khmer.new_hashbits(args.ksize, 1, 1) for pmap_file in pmap_files: - print >>sys.stderr, 'merging', pmap_file + print('merging', pmap_file, file=sys.stderr) htable.merge_subset_from_disk(pmap_file) if args.remove_subsets: - print >>sys.stderr, 'removing pmap files' + print('removing pmap files', file=sys.stderr) for pmap_file in pmap_files: os.unlink(pmap_file) # annotate-partitions for infile in args.input_filenames: - print >>sys.stderr, 'outputting partitions for', infile + print('outputting partitions for', infile, file=sys.stderr) outfile = os.path.basename(infile) + '.part' part_count = htable.output_partitions(infile, outfile) - print >>sys.stderr, 'output %d partitions for %s' % ( - part_count, infile) - print >>sys.stderr, 'partitions are in', outfile + print('output %d partitions for %s' % ( + part_count, infile), file=sys.stderr) + print('partitions are in', outfile, file=sys.stderr) if __name__ == '__main__': main() diff --git a/scripts/extract-long-sequences.py b/scripts/extract-long-sequences.py index 01f5f2f8d8..cf5f01e503 100755 --- a/scripts/extract-long-sequences.py +++ b/scripts/extract-long-sequences.py @@ -18,6 +18,7 @@ Use '-h' for parameter help. 
""" +from __future__ import print_function import argparse import screed import sys @@ -47,7 +48,7 @@ def main(): for record in screed.open(filename, parse_description=False): if len(record['sequence']) >= args.length: write_record(record, outfp) - print >> sys.stderr, 'wrote to: ' + args.output + print('wrote to: ' + args.output, file=sys.stderr) if __name__ == '__main__': main() diff --git a/scripts/extract-paired-reads.py b/scripts/extract-paired-reads.py index b2ca47a471..691a488b24 100755 --- a/scripts/extract-paired-reads.py +++ b/scripts/extract-paired-reads.py @@ -16,6 +16,7 @@ Reads FASTQ and FASTA input, retains format for output. """ +from __future__ import print_function import screed import sys import os.path @@ -70,9 +71,9 @@ def main(): single_fp = open(outfile + '.se', 'w') paired_fp = open(outfile + '.pe', 'w') - print >>sys.stderr, 'reading file "%s"' % args.infile - print >>sys.stderr, 'outputting interleaved pairs to "%s.pe"' % outfile - print >>sys.stderr, 'outputting orphans to "%s.se"' % outfile + print('reading file "%s"' % args.infile, file=sys.stderr) + print('outputting interleaved pairs to "%s.pe"' % outfile, file=sys.stderr) + print('outputting orphans to "%s.se"' % outfile, file=sys.stderr) n_pe = 0 n_se = 0 @@ -80,7 +81,7 @@ def main(): screed_iter = screed.open(args.infile, parse_description=False) for index, is_pair, read1, read2 in broken_paired_reader(screed_iter): if index % 100000 == 0 and index > 0: - print >>sys.stderr, '...', index + print('...', index, file=sys.stderr) if is_pair: write_record_pair(read1, read2, paired_fp) @@ -95,12 +96,12 @@ def main(): if n_pe == 0: raise Exception("no paired reads!? 
check file formats...") - print >>sys.stderr, 'DONE; read %d sequences,' \ - ' %d pairs and %d singletons' % \ - (n_pe * 2 + n_se, n_pe, n_se) + print('DONE; read %d sequences,' + ' %d pairs and %d singletons' % + (n_pe * 2 + n_se, n_pe, n_se), file=sys.stderr) - print >> sys.stderr, 'wrote to: ' + outfile \ - + '.se' + ' and ' + outfile + '.pe' + print('wrote to: ' + outfile + + '.se' + ' and ' + outfile + '.pe', file=sys.stderr) if __name__ == '__main__': diff --git a/scripts/extract-partitions.py b/scripts/extract-partitions.py index 179cd97d47..dad81fda37 100755 --- a/scripts/extract-partitions.py +++ b/scripts/extract-partitions.py @@ -18,6 +18,7 @@ @CTB note that if threshold is != 1, those sequences will not be output by output_unassigned... """ +from __future__ import print_function import sys import screed @@ -94,24 +95,24 @@ def main(): # pylint: disable=too-many-locals,too-many-branches check_space(args.part_filenames, args.force) - print >>sys.stderr, '---' - print >>sys.stderr, 'reading partitioned files:', repr(args.part_filenames) + print('---', file=sys.stderr) + print('reading partitioned files:', repr( + args.part_filenames), file=sys.stderr) if args.output_groups: - print >>sys.stderr, 'outputting to files named "%s.groupN.fa"' % \ - args.prefix - print >>sys.stderr, 'min reads to keep a partition:', \ - args.min_part_size - print >>sys.stderr, 'max size of a group file:', args.max_size + print('outputting to files named "%s.groupN.fa"' % + args.prefix, file=sys.stderr) + print('min reads to keep a partition:', + args.min_part_size, file=sys.stderr) + print('max size of a group file:', args.max_size, file=sys.stderr) else: - print >>sys.stderr, 'NOT outputting groups! Beware!' + print('NOT outputting groups! 
Beware!', file=sys.stderr) if args.output_unassigned: - print >>sys.stderr, \ - 'outputting unassigned reads to "%s.unassigned.fa"' % \ - args.prefix - print >>sys.stderr, 'partition size distribution will go to %s' \ - % distfilename - print >>sys.stderr, '---' + print('outputting unassigned reads to "%s.unassigned.fa"' % + args.prefix, file=sys.stderr) + print('partition size distribution will go to %s' + % distfilename, file=sys.stderr) + print('---', file=sys.stderr) # @@ -142,7 +143,7 @@ def main(): # pylint: disable=too-many-locals,too-many-branches for filename in args.part_filenames: for index, read, pid in read_partition_file(filename): if index % 100000 == 0: - print >>sys.stderr, '...', index + print('...', index, file=sys.stderr) count[pid] = count.get(pid, 0) + 1 @@ -159,7 +160,7 @@ def main(): # pylint: disable=too-many-locals,too-many-branches # develop histogram of partition sizes dist = {} - for pid, size in count.items(): + for pid, size in list(count.items()): dist[size] = dist.get(size, 0) + 1 # output histogram @@ -177,7 +178,7 @@ def main(): # pylint: disable=too-many-locals,too-many-branches sys.exit(0) # sort groups by size - divvy = sorted(count.items(), key=lambda y: y[1]) + divvy = sorted(list(count.items()), key=lambda y: y[1]) divvy = [y for y in divvy if y[1] > args.min_part_size] # divvy up into different groups, based on having max_size sequences @@ -205,9 +206,9 @@ def main(): # pylint: disable=too-many-locals,too-many-branches # print 'group_d', partition_id, group_n group_n += 1 - print >>sys.stderr, '%d groups' % group_n + print('%d groups' % group_n, file=sys.stderr) if group_n == 0: - print >>sys.stderr, 'nothing to output; exiting!' 
+ print('nothing to output; exiting!', file=sys.stderr) return # open a bunch of output files for the different groups @@ -225,7 +226,7 @@ def main(): # pylint: disable=too-many-locals,too-many-branches for index, read, partition_id in read_partition_file(filename): total_seqs += 1 if index % 100000 == 0: - print >>sys.stderr, '...x2', index + print('...x2', index, file=sys.stderr) if partition_id == 0: continue @@ -242,20 +243,19 @@ def main(): # pylint: disable=too-many-locals,too-many-branches write_record(read, outfp) part_seqs += 1 - print >>sys.stderr, '---' - print >>sys.stderr, 'Of %d total seqs,' % total_seqs - print >>sys.stderr, 'extracted %d partitioned seqs into group files,' % \ - part_seqs - print >>sys.stderr, \ - 'discarded %d sequences from small partitions (see -m),' % \ - toosmall_parts - print >>sys.stderr, 'and found %d unpartitioned sequences (see -U).' % \ - n_unassigned - print >>sys.stderr, '' - print >>sys.stderr, 'Created %d group files named %s.groupXXXX.%s' % \ - (len(group_fps), - args.prefix, - suffix) + print('---', file=sys.stderr) + print('Of %d total seqs,' % total_seqs, file=sys.stderr) + print('extracted %d partitioned seqs into group files,' % + part_seqs, file=sys.stderr) + print('discarded %d sequences from small partitions (see -m),' % + toosmall_parts, file=sys.stderr) + print('and found %d unpartitioned sequences (see -U).' % + n_unassigned, file=sys.stderr) + print('', file=sys.stderr) + print('Created %d group files named %s.groupXXXX.%s' % + (len(group_fps), + args.prefix, + suffix), file=sys.stderr) if __name__ == '__main__': main() diff --git a/scripts/fastq-to-fasta.py b/scripts/fastq-to-fasta.py index ad6e8a5e42..8bc13a0986 100755 --- a/scripts/fastq-to-fasta.py +++ b/scripts/fastq-to-fasta.py @@ -14,6 +14,7 @@ Use '-h' for parameter help. 
""" +from __future__ import print_function import sys import argparse import screed @@ -39,13 +40,13 @@ def get_parser(): def main(): args = get_parser().parse_args() - print >> sys.stderr, ('fastq from ', args.input_sequence) + print(('fastq from ', args.input_sequence), file=sys.stderr) n_count = 0 for n, record in enumerate(screed.open(args.input_sequence, parse_description=False)): if n % 10000 == 0: - print>>sys.stderr, '...', n + print('...', n, file=sys.stderr) sequence = record['sequence'] name = record['name'] @@ -58,15 +59,15 @@ def main(): args.output.write('>' + name + '\n') args.output.write(sequence + '\n') - print >> sys.stderr, '\n' + 'lines from ' + args.input_sequence + print('\n' + 'lines from ' + args.input_sequence, file=sys.stderr) if not args.n_keep: - print >> sys.stderr, str(n_count) + ' lines dropped.' + print(str(n_count) + ' lines dropped.', file=sys.stderr) else: - print >> sys.stderr, 'No lines dropped from file.' + print('No lines dropped from file.', file=sys.stderr) - print >> sys.stderr, 'Wrote output to', args.output + print('Wrote output to', args.output, file=sys.stderr) if __name__ == '__main__': main() diff --git a/scripts/filter-abund-single.py b/scripts/filter-abund-single.py index 6be3e4d9a2..1daad7fb20 100755 --- a/scripts/filter-abund-single.py +++ b/scripts/filter-abund-single.py @@ -17,6 +17,7 @@ Use '-h' for parameter help. 
""" +from __future__ import print_function import os import sys import khmer @@ -73,15 +74,15 @@ def main(): args.n_tables * args.min_tablesize, args.force) report_on_config(args) - print >>sys.stderr, 'making k-mer counting table' + print('making k-mer counting table', file=sys.stderr) htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables) # first, load reads into hash table rparser = khmer.ReadParser(args.datafile) threads = [] - print >>sys.stderr, 'consuming input, round 1 --', args.datafile - for _ in xrange(args.threads): + print('consuming input, round 1 --', args.datafile, file=sys.stderr) + for _ in range(args.threads): cur_thread = \ threading.Thread( target=htable.consume_fasta_with_reads_parser, @@ -94,11 +95,11 @@ def main(): _.join() if args.report_total_kmers: - print >> sys.stderr, 'Total number of unique k-mers: {0}'.format( - htable.n_unique_kmers()) + print('Total number of unique k-mers: {0}'.format( + htable.n_unique_kmers()), file=sys.stderr) fp_rate = khmer.calc_expected_collisions(htable, args.force) - print >>sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate + print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr) # now, trim. 
@@ -118,21 +119,21 @@ def process_fn(record): return None, None # the filtering loop - print >>sys.stderr, 'filtering', args.datafile + print('filtering', args.datafile, file=sys.stderr) outfile = os.path.basename(args.datafile) + '.abundfilt' outfp = open(outfile, 'w') tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(args.datafile), outfp) - print >>sys.stderr, 'output in', outfile + print('output in', outfile, file=sys.stderr) if args.savetable: - print >>sys.stderr, 'Saving k-mer counting table filename', \ - args.savetable - print >>sys.stderr, '...saving to', args.savetable + print('Saving k-mer counting table filename', + args.savetable, file=sys.stderr) + print('...saving to', args.savetable, file=sys.stderr) htable.save(args.savetable) - print >>sys.stderr, 'wrote to: ', outfile + print('wrote to: ', outfile, file=sys.stderr) if __name__ == '__main__': main() diff --git a/scripts/filter-abund.py b/scripts/filter-abund.py index 57fbdd26e9..9354bfa7e7 100755 --- a/scripts/filter-abund.py +++ b/scripts/filter-abund.py @@ -16,6 +16,7 @@ Use '-h' for parameter help. """ +from __future__ import print_function import os import khmer import textwrap @@ -85,11 +86,12 @@ def main(): check_space(infiles, args.force) - print >>sys.stderr, 'loading counting table:', args.input_table + print('loading counting table:', args.input_table, + file=sys.stderr) htable = khmer.load_counting_hash(args.input_table) ksize = htable.ksize() - print >>sys.stderr, "K:", ksize + print("K:", ksize, file=sys.stderr) # the filtering function. 
def process_fn(record): @@ -113,7 +115,7 @@ def process_fn(record): # the filtering loop for infile in infiles: - print >>sys.stderr, 'filtering', infile + print('filtering', infile, file=sys.stderr) if args.single_output_filename != '': outfile = args.single_output_filename outfp = open(outfile, 'a') @@ -124,7 +126,7 @@ def process_fn(record): tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads) tsp.start(verbose_loader(infile), outfp) - print >>sys.stderr, 'output in', outfile + print('output in', outfile, file=sys.stderr) if __name__ == '__main__': main() diff --git a/scripts/filter-stoptags.py b/scripts/filter-stoptags.py index 0d1b1923ae..3ef752ec34 100755 --- a/scripts/filter-stoptags.py +++ b/scripts/filter-stoptags.py @@ -16,6 +16,7 @@ Use '-h' for parameter help. """ +from __future__ import print_function import os import khmer @@ -63,7 +64,7 @@ def main(): check_space(infiles, args.force) - print >>sys.stderr, 'loading stop tags, with K', args.ksize + print('loading stop tags, with K', args.ksize, file=sys.stderr) htable = khmer.new_hashbits(args.ksize, 1, 1) htable.load_stop_tags(stoptags) @@ -82,7 +83,7 @@ def process_fn(record): # the filtering loop for infile in infiles: - print >>sys.stderr, 'filtering', infile + print('filtering', infile, file=sys.stderr) outfile = os.path.basename(infile) + '.stopfilt' outfp = open(outfile, 'w') @@ -90,7 +91,7 @@ def process_fn(record): tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(infile), outfp) - print >>sys.stderr, 'output in', outfile + print('output in', outfile, file=sys.stderr) if __name__ == '__main__': main() diff --git a/scripts/find-knots.py b/scripts/find-knots.py index 6b66b28dd1..031c8440b0 100755 --- a/scripts/find-knots.py +++ b/scripts/find-knots.py @@ -13,6 +13,7 @@ % python scripts/find-knots.py """ +from __future__ import print_function import argparse import glob @@ -96,28 +97,30 @@ def main(): check_space(infiles, args.force) - print >>sys.stderr, 
'loading k-mer presence table %s.pt' % graphbase + print('loading k-mer presence table %s.pt' % graphbase, file=sys.stderr) htable = khmer.load_hashbits(graphbase + '.pt') - print >>sys.stderr, 'loading tagset %s.tagset...' % graphbase + print('loading tagset %s.tagset...' % graphbase, file=sys.stderr) htable.load_tagset(graphbase + '.tagset') initial_stoptags = False # @CTB regularize with make-initial if os.path.exists(graphbase + '.stoptags'): - print >>sys.stderr, 'loading stoptags %s.stoptags' % graphbase + print('loading stoptags %s.stoptags' % graphbase, file=sys.stderr) htable.load_stop_tags(graphbase + '.stoptags') initial_stoptags = True pmap_files = glob.glob(args.graphbase + '.subset.*.pmap') - print >>sys.stderr, 'loading %d pmap files (first one: %s)' % \ - (len(pmap_files), pmap_files[0]) - print >>sys.stderr, '---' - print >>sys.stderr, 'output stoptags will be in', graphbase + '.stoptags' + print('loading %d pmap files (first one: %s)' % + (len(pmap_files), pmap_files[0]), file=sys.stderr) + print('---', file=sys.stderr) + print('output stoptags will be in', + graphbase + '.stoptags', file=sys.stderr) if initial_stoptags: - print >>sys.stderr, \ - '(these output stoptags will include the already-loaded set)' - print >>sys.stderr, '---' + print( + '(these output stoptags will include the already-loaded set)', + file=sys.stderr) + print('---', file=sys.stderr) # create counting hash ksize = htable.ksize() @@ -126,31 +129,32 @@ def main(): # load & merge for index, subset_file in enumerate(pmap_files): - print >>sys.stderr, '<-', subset_file + print('<-', subset_file, file=sys.stderr) subset = htable.load_subset_partitionmap(subset_file) - print >>sys.stderr, '** repartitioning subset... %s' % subset_file + print('** repartitioning subset... %s' % subset_file, file=sys.stderr) htable.repartition_largest_partition(subset, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD) - print >>sys.stderr, '** merging subset... 
%s' % subset_file + print('** merging subset... %s' % subset_file, file=sys.stderr) htable.merge_subset(subset) - print >>sys.stderr, '** repartitioning, round 2... %s' % subset_file + print('** repartitioning, round 2... %s' % + subset_file, file=sys.stderr) size = htable.repartition_largest_partition( None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD) - print >>sys.stderr, '** repartitioned size:', size + print('** repartitioned size:', size, file=sys.stderr) - print >>sys.stderr, 'saving stoptags binary' + print('saving stoptags binary', file=sys.stderr) htable.save_stop_tags(graphbase + '.stoptags') os.rename(subset_file, subset_file + '.processed') - print >>sys.stderr, '(%d of %d)\n' % (index, len(pmap_files)) + print('(%d of %d)\n' % (index, len(pmap_files)), file=sys.stderr) - print >>sys.stderr, 'done!' + print('done!', file=sys.stderr) if __name__ == '__main__': main() diff --git a/scripts/interleave-reads.py b/scripts/interleave-reads.py index c6da98a970..4c97ec77e5 100755 --- a/scripts/interleave-reads.py +++ b/scripts/interleave-reads.py @@ -16,13 +16,13 @@ By default, output is sent to stdout; or use -o. Use '-h' for parameter help. """ +from __future__ import print_function # TODO: take fa as well? # support gzip option? import screed import sys -import itertools import os import textwrap import argparse @@ -32,6 +32,11 @@ from khmer.utils import (write_record_pair, check_is_left, check_is_right, check_is_pair) +try: + from itertools import zip_longest +except ImportError: + from itertools import izip_longest as zip_longest + def get_parser(): epilog = """ @@ -77,38 +82,38 @@ def main(): else: s2_file = s1_file.replace('_R1_', '_R2_') if s1_file == s2_file: - print >>sys.stderr, ("ERROR: given only one filename, that " - "doesn't contain _R1_. Exiting.") + print(("ERROR: given only one filename, that " + "doesn't contain _R1_. 
Exiting."), file=sys.stderr) sys.exit(1) - print >> sys.stderr, ("given only one file; " - "guessing that R2 file is %s" % s2_file) + print(("given only one file; " + "guessing that R2 file is %s" % s2_file), file=sys.stderr) fail = False if not os.path.exists(s1_file): - print >> sys.stderr, "Error! R1 file %s does not exist" % s1_file + print("Error! R1 file %s does not exist" % s1_file, file=sys.stderr) fail = True if not os.path.exists(s2_file): - print >> sys.stderr, "Error! R2 file %s does not exist" % s2_file + print("Error! R2 file %s does not exist" % s2_file, file=sys.stderr) fail = True if fail and not args.force: sys.exit(1) - print >> sys.stderr, "Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file) + print("Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file), file=sys.stderr) counter = 0 screed_iter_1 = screed.open(s1_file, parse_description=False) screed_iter_2 = screed.open(s2_file, parse_description=False) - for read1, read2 in itertools.izip_longest(screed_iter_1, screed_iter_2): + for read1, read2 in zip_longest(screed_iter_1, screed_iter_2): if read1 is None or read2 is None: - print >>sys.stderr, ("ERROR: Input files contain different number" - " of records.") + print(("ERROR: Input files contain different number" + " of records."), file=sys.stderr) sys.exit(1) if counter % 100000 == 0: - print >> sys.stderr, '...', counter, 'pairs' + print('...', counter, 'pairs', file=sys.stderr) counter += 1 name1 = read1.name @@ -122,14 +127,14 @@ def main(): read2.name = name2 if not check_is_pair(read1, read2): - print >>sys.stderr, "ERROR: This doesn't look like paired data! " \ - "%s %s" % (read1.name, read2.name) + print("ERROR: This doesn't look like paired data! 
" + "%s %s" % (read1.name, read2.name), file=sys.stderr) sys.exit(1) write_record_pair(read1, read2, args.output) - print >> sys.stderr, 'final: interleaved %d pairs' % counter - print >> sys.stderr, 'output written to', args.output.name + print('final: interleaved %d pairs' % counter, file=sys.stderr) + print('output written to', args.output.name, file=sys.stderr) if __name__ == '__main__': main() diff --git a/scripts/load-graph.py b/scripts/load-graph.py index acb1090e25..9e34103f7f 100755 --- a/scripts/load-graph.py +++ b/scripts/load-graph.py @@ -13,6 +13,7 @@ Use '-h' for parameter help. """ +from __future__ import print_function, unicode_literals import sys import threading @@ -32,6 +33,7 @@ def get_parser(): parser = build_graph.build_parser(parser) return parser + if __name__ == '__main__': parser = get_parser() args = parser.parse_args() diff --git a/scripts/load-into-counting.py b/scripts/load-into-counting.py index acc5c82de1..6c2afc626c 100755 --- a/scripts/load-into-counting.py +++ b/scripts/load-into-counting.py @@ -12,6 +12,7 @@ Use '-h' for parameter help. """ +from __future__ import print_function, unicode_literals import json import os @@ -58,8 +59,8 @@ def get_parser(): action='store_false', help="The default behaviour is " "to count past 255 using bigcount. This flag turns " "bigcount off, limiting counts to 255.") - parser.add_argument('--summary-info', '-s', default=None, metavar="FORMAT", - choices=['json', 'tsv'], + parser.add_argument('--summary-info', '-s', type=str, default=None, + metavar="FORMAT", choices=[str('json'), str('tsv')], help="What format should the machine readable run " "summary be in? 
(json or tsv, disabled by default)") parser.add_argument('--report-total-kmers', '-t', action='store_true', @@ -88,14 +89,15 @@ def main(): check_file_writable(base) check_file_writable(base + ".info") - print >>sys.stderr, 'Saving k-mer counting table to %s' % base - print >>sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames) + print('Saving k-mer counting table to %s' % base, file=sys.stderr) + print('Loading kmers from sequences in %s' % + repr(filenames), file=sys.stderr) # clobber the '.info' file now, as we always open in append mode below if os.path.exists(base + '.info'): os.remove(base + '.info') - print >>sys.stderr, 'making k-mer counting table' + print('making k-mer counting table', file=sys.stderr) htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables) htable.set_use_bigcount(args.bigcount) @@ -108,8 +110,8 @@ def main(): rparser = khmer.ReadParser(filename) threads = [] - print >>sys.stderr, 'consuming input', filename - for _ in xrange(args.threads): + print('consuming input', filename, file=sys.stderr) + for _ in range(args.threads): cur_thrd = \ threading.Thread( target=htable.consume_fasta_with_reads_parser, @@ -124,19 +126,19 @@ def main(): if index > 0 and index % 10 == 0: check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force) - print >>sys.stderr, 'mid-save', base + print('mid-save', base, file=sys.stderr) htable.save(base) with open(base + '.info', 'a') as info_fh: - print >> info_fh, 'through', filename + print('through', filename, file=info_fh) total_num_reads += rparser.num_reads n_kmers = htable.n_unique_kmers() if args.report_total_kmers: - print >> sys.stderr, 'Total number of unique k-mers:', n_kmers + print('Total number of unique k-mers:', n_kmers, file=sys.stderr) with open(base + '.info', 'a') as info_fp: - print >>info_fp, 'Total number of unique k-mers:', n_kmers + print('Total number of unique k-mers:', n_kmers, file=info_fp) - print >>sys.stderr, 'saving', base + 
print('saving', base, file=sys.stderr) htable.save(base) # Change max_false_pos=0.2 only if you really grok it. HINT: You don't @@ -144,12 +146,12 @@ def main(): khmer.calc_expected_collisions(htable, args.force, max_false_pos=.2) with open(base + '.info', 'a') as info_fp: - print >> info_fp, 'fp rate estimated to be %1.3f\n' % fp_rate + print('fp rate estimated to be %1.3f\n' % fp_rate, file=info_fp) if args.summary_info: mr_fmt = args.summary_info.lower() mr_file = base + '.info.' + mr_fmt - print >> sys.stderr, "Writing summmary info to", mr_file + print("Writing summmary info to", mr_file, file=sys.stderr) with open(mr_file, 'w') as mr_fh: if mr_fmt == 'json': mr_data = { @@ -173,10 +175,10 @@ def main(): ] mr_fh.write("\t".join(vals) + "\n") - print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate + print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr) - print >>sys.stderr, 'DONE.' - print >>sys.stderr, 'wrote to:', base + '.info' + print('DONE.', file=sys.stderr) + print('wrote to:', base + '.info', file=sys.stderr) if __name__ == '__main__': main() diff --git a/scripts/make-initial-stoptags.py b/scripts/make-initial-stoptags.py index b1a7a6a7ae..5dd9939c64 100755 --- a/scripts/make-initial-stoptags.py +++ b/scripts/make-initial-stoptags.py @@ -11,6 +11,7 @@ % python scripts/make-initial-stoptags.py """ +from __future__ import print_function import sys import textwrap @@ -83,15 +84,15 @@ def main(): check_space(infiles, args.force) - print >>sys.stderr, 'loading htable %s.pt' % graphbase + print('loading htable %s.pt' % graphbase, file=sys.stderr) htable = khmer.load_hashbits(graphbase + '.pt') # do we want to load stop tags, and do they exist? if args.stoptags: - print >>sys.stderr, 'loading stoptags from', args.stoptags + print('loading stoptags from', args.stoptags, file=sys.stderr) htable.load_stop_tags(args.stoptags) - print >>sys.stderr, 'loading tagset %s.tagset...' % graphbase + print('loading tagset %s.tagset...' 
% graphbase, file=sys.stderr) htable.load_tagset(graphbase + '.tagset') ksize = htable.ksize() @@ -108,19 +109,19 @@ def main(): start, end = divvy[:2] # partition! - print >>sys.stderr, 'doing pre-partitioning from', start, 'to', end + print('doing pre-partitioning from', start, 'to', end, file=sys.stderr) subset = htable.do_subset_partition(start, end) # now, repartition... - print >>sys.stderr, 'repartitioning to find HCKs.' + print('repartitioning to find HCKs.', file=sys.stderr) htable.repartition_largest_partition(subset, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD) - print >>sys.stderr, 'saving stop tags' + print('saving stop tags', file=sys.stderr) htable.save_stop_tags(graphbase + '.stoptags') - print >> sys.stderr, 'wrote to:', graphbase + '.stoptags' + print('wrote to:', graphbase + '.stoptags', file=sys.stderr) if __name__ == '__main__': main() diff --git a/scripts/merge-partitions.py b/scripts/merge-partitions.py index 5b78ff4210..2904e60693 100755 --- a/scripts/merge-partitions.py +++ b/scripts/merge-partitions.py @@ -14,6 +14,7 @@ Load .subset.*.pmap and merge into a single pmap file. Final merged pmap file will be in .pmap.merged. 
""" +from __future__ import print_function import argparse import glob @@ -56,8 +57,8 @@ def main(): output_file = args.graphbase + '.pmap.merged' pmap_files = glob.glob(args.graphbase + '.subset.*.pmap') - print >>sys.stderr, 'loading %d pmap files (first one: %s)' % \ - (len(pmap_files), pmap_files[0]) + print('loading %d pmap files (first one: %s)' % + (len(pmap_files), pmap_files[0]), file=sys.stderr) ksize = args.ksize htable = khmer.new_hashbits(ksize, 1, 1) @@ -68,14 +69,14 @@ def main(): check_space(pmap_files, args.force) for pmap_file in pmap_files: - print >>sys.stderr, 'merging', pmap_file + print('merging', pmap_file, file=sys.stderr) htable.merge_subset_from_disk(pmap_file) - print >>sys.stderr, 'saving merged to', output_file + print('saving merged to', output_file, file=sys.stderr) htable.save_partitionmap(output_file) if args.remove_subsets: - print >>sys.stderr, 'removing pmap files' + print('removing pmap files', file=sys.stderr) for pmap_file in pmap_files: os.unlink(pmap_file) diff --git a/scripts/normalize-by-median.py b/scripts/normalize-by-median.py index 045d6ce9c4..87623221a6 100755 --- a/scripts/normalize-by-median.py +++ b/scripts/normalize-by-median.py @@ -17,6 +17,7 @@ Use '-h' for parameter help. """ +from __future__ import print_function from __future__ import print_function @@ -158,9 +159,9 @@ def get_parser(): Paired end reads will be considered together if :option:`-p` is set. If either read will be kept, then both will be kept. This should result in keeping (or discarding) each sequencing fragment. This helps with retention - of repeats, especially. With :option: `-u`/:option:`--unpaired-reads`, + of repeats, especially. With :option: `-u`/:option:`--unpaired-reads`, unpaired reads from the specified file will be read after the paired data - is read. + is read. 
With :option:`-s`/:option:`--savetable`, the k-mer counting table will be saved to the specified file after all sequences have been @@ -355,6 +356,7 @@ def main(): # pylint: disable=too-many-branches,too-many-statements print("** IOErrors occurred in the following files:", file=sys.stderr) print("\t", " ".join(corrupt_files), file=sys.stderr) + if __name__ == '__main__': main() diff --git a/scripts/partition-graph.py b/scripts/partition-graph.py index 9fe3982e91..30d11e2971 100755 --- a/scripts/partition-graph.py +++ b/scripts/partition-graph.py @@ -15,9 +15,10 @@ Use '-h' for parameter help. """ +from __future__ import print_function import threading -import Queue +import queue import gc import os.path import argparse @@ -31,9 +32,9 @@ import platform if "Linux" == platform.system(): def __debug_vm_usage(msg): - print >>sys.stderr, "===> DEBUG: " + msg + print("===> DEBUG: " + msg, file=sys.stderr) for vmstat in re.findall(r".*Vm.*", file("/proc/self/status").read()): - print >>sys.stderr, vmstat + print(vmstat, file=sys.stderr) else: def __debug_vm_usage(msg): # pylint: disable=unused-argument pass @@ -46,23 +47,23 @@ def worker(queue, basename, stop_big_traversals): while True: try: (htable, index, start, stop) = queue.get(False) - except Queue.Empty: - print >>sys.stderr, 'exiting' + except queue.Empty: + print('exiting', file=sys.stderr) return outfile = basename + '.subset.%d.pmap' % (index,) if os.path.exists(outfile): - print >>sys.stderr, 'SKIPPING', outfile, ' -- already exists' + print('SKIPPING', outfile, ' -- already exists', file=sys.stderr) continue - print >>sys.stderr, 'starting:', basename, index + print('starting:', basename, index, file=sys.stderr) # pay attention to stoptags when partitioning; take command line # direction on whether or not to exhaustively traverse. 
subset = htable.do_subset_partition(start, stop, True, stop_big_traversals) - print >>sys.stderr, 'saving:', basename, index + print('saving:', basename, index, file=sys.stderr) htable.save_subset_partitionmap(subset, outfile) del subset gc.collect() @@ -107,30 +108,30 @@ def main(): check_space(filenames, args.force) - print >>sys.stderr, '--' - print >>sys.stderr, 'SUBSET SIZE', args.subset_size - print >>sys.stderr, 'N THREADS', args.threads + print('--', file=sys.stderr) + print('SUBSET SIZE', args.subset_size, file=sys.stderr) + print('N THREADS', args.threads, file=sys.stderr) if args.stoptags: - print >>sys.stderr, 'stoptag file:', args.stoptags - print >>sys.stderr, '--' + print('stoptag file:', args.stoptags, file=sys.stderr) + print('--', file=sys.stderr) - print >>sys.stderr, 'loading ht %s.pt' % basename + print('loading ht %s.pt' % basename, file=sys.stderr) htable = khmer.load_hashbits(basename + '.pt') htable.load_tagset(basename + '.tagset') # do we want to load stop tags, and do they exist? if args.stoptags: - print >>sys.stderr, 'loading stoptags from', args.stoptags + print('loading stoptags from', args.stoptags, file=sys.stderr) htable.load_stop_tags(args.stoptags) # do we want to exhaustively traverse the graph? stop_big_traversals = args.no_big_traverse if stop_big_traversals: - print >>sys.stderr, '** This script brakes for lumps:', \ - ' stop_big_traversals is true.' + print('** This script brakes for lumps:', + ' stop_big_traversals is true.', file=sys.stderr) else: - print >>sys.stderr, '** Traverse all the things:', \ - ' stop_big_traversals is false.' + print('** Traverse all the things:', + ' stop_big_traversals is false.', file=sys.stderr) # # now, partition! 
@@ -142,7 +143,7 @@ def main(): divvy.append(0) # build a queue of tasks: - worker_q = Queue.Queue() + worker_q = queue.Queue() # break up the subsets into a list of worker tasks for _ in range(0, n_subsets): @@ -150,7 +151,7 @@ def main(): end = divvy[_ + 1] worker_q.put((htable, _, start, end)) - print >>sys.stderr, 'enqueued %d subset tasks' % n_subsets + print('enqueued %d subset tasks' % n_subsets, file=sys.stderr) open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets)) n_threads = args.threads @@ -158,8 +159,8 @@ def main(): n_threads = n_subsets # start threads! - print >>sys.stderr, 'starting %d threads' % n_threads - print >>sys.stderr, '---' + print('starting %d threads' % n_threads, file=sys.stderr) + print('---', file=sys.stderr) threads = [] for _ in range(n_threads): @@ -168,15 +169,15 @@ def main(): threads.append(cur_thrd) cur_thrd.start() - print >>sys.stderr, 'done starting threads' + print('done starting threads', file=sys.stderr) # wait for threads for _ in threads: _.join() - print >>sys.stderr, '---' - print >>sys.stderr, 'done making subsets! see %s.subset.*.pmap' % \ - (basename,) + print('---', file=sys.stderr) + print('done making subsets! see %s.subset.*.pmap' % + (basename,), file=sys.stderr) if __name__ == '__main__': main() diff --git a/scripts/readstats.py b/scripts/readstats.py index bfc68b32c3..74a0065e81 100755 --- a/scripts/readstats.py +++ b/scripts/readstats.py @@ -12,6 +12,7 @@ Use '-h' for parameter help. 
""" +from __future__ import print_function import sys import csv @@ -130,7 +131,7 @@ def analyze_file(filename): input_iter = screed.open(filename, parse_description=False) for record in input_iter: if seqs % 100000 == 0: - print >>sys.stderr, '...', filename, seqs + print('...', filename, seqs, file=sys.stderr) bps += len(record.sequence) seqs += 1 return bps, seqs @@ -150,8 +151,8 @@ def main(): try: bps, seqs = analyze_file(filename) except (IOError, OSError, EOFError) as exc: - print >>sys.stderr, 'ERROR in opening %s:' % filename - print >>sys.stderr, ' ', str(exc) + print('ERROR in opening %s:' % filename, file=sys.stderr) + print(' ', str(exc), file=sys.stderr) continue if seqs: @@ -161,11 +162,9 @@ def main(): seqs, avg, filename) - - print >>sys.stderr, '... found', msg - + print('... found', msg, file=sys.stderr) else: - print >>sys.stderr, 'No sequences found in %s' % filename + print('No sequences found in %s' % filename, file=sys.stderr) if statistics: if args.csv: @@ -176,8 +175,8 @@ def main(): for stat in statistics: out.append(*stat) else: - print >>args.outfp, \ - 'No sequences found in %d files' % len(args.filenames) + print('No sequences found in %d files' % + len(args.filenames), file=args.outfp) if __name__ == '__main__': diff --git a/scripts/sample-reads-randomly.py b/scripts/sample-reads-randomly.py index f2a55d9f54..3f8ce1692b 100755 --- a/scripts/sample-reads-randomly.py +++ b/scripts/sample-reads-randomly.py @@ -17,6 +17,7 @@ Reads FASTQ and FASTA input, retains format for output. """ +from __future__ import print_function import argparse import screed @@ -110,17 +111,18 @@ def main(): output_filename = os.path.basename(filename) + '.subset' if num_samples == 1: - print >>sys.stderr, 'Subsampling %d reads using reservoir sampling.' %\ - args.num_reads - print >>sys.stderr, 'Subsampled reads will be placed in %s' % \ - output_filename - print >>sys.stderr, '' + print('Subsampling %d reads using reservoir sampling.' 
% + args.num_reads, file=sys.stderr) + print('Subsampled reads will be placed in %s' % + output_filename, file=sys.stderr) + print('', file=sys.stderr) else: # > 1 - print >>sys.stderr, 'Subsampling %d reads, %d times,' \ - % (args.num_reads, num_samples), ' using reservoir sampling.' - print >>sys.stderr, 'Subsampled reads will be placed in %s.N' \ - % output_filename - print >>sys.stderr, '' + print('Subsampling %d reads, %d times,' + % (args.num_reads, num_samples), ' using reservoir sampling.', + file=sys.stderr) + print('Subsampled reads will be placed in %s.N' + % output_filename, file=sys.stderr) + print('', file=sys.stderr) reads = [] for n in range(num_samples): @@ -128,17 +130,16 @@ def main(): # read through all the sequences and load/resample the reservoir for filename in args.filenames: - print >>sys.stderr, 'opening', filename, 'for reading' - screed_iter = screed.open(filename, parse_description=False) - - for count, (_, ispair, rcrd1, rcrd2) in enumerate(broken_paired_reader( - screed_iter, - force_single=args.force_single)): - if count % 10000 == 0: - print >>sys.stderr, '...', count, 'reads scanned' - if count >= args.max_reads: - print >>sys.stderr, 'reached upper limit of %d reads' % \ - args.max_reads, '(see -M); exiting' + print('opening', filename, 'for reading', file=sys.stderr) + with screed.open(filename, parse_description=False) as screed_iter: + for count, (_, ispair, rcrd1, rcrd2) in enumerate(broken_paired_reader( + screed_iter, + force_single=args.force_single)): + if count % 10000 == 0: + print('...', count, 'reads scanned', file=sys.stderr) + if count >= args.max_reads: + print('reached upper limit of %d reads' % + args.max_reads, '(see -M); exiting', file=sys.stderr) break # collect first N reads @@ -158,8 +159,8 @@ def main(): # output all the subsampled reads: if len(reads) == 1: - print >>sys.stderr, 'Writing %d sequences to %s' % \ - (len(reads[0]), output_filename) + print('Writing %d sequences to %s' % + (len(reads[0]), 
output_filename), file=sys.stderr) if not output_file: output_file = open(output_filename, 'w') @@ -170,8 +171,8 @@ def main(): else: for n in range(num_samples): n_filename = output_filename + '.%d' % n - print >>sys.stderr, 'Writing %d sequences to %s' % \ - (len(reads[n]), n_filename) + print('Writing %d sequences to %s' % + (len(reads[n]), n_filename), file=sys.stderr) output_file = open(n_filename, 'w') for records in reads[n]: write_record(records[0], output_file) diff --git a/scripts/split-paired-reads.py b/scripts/split-paired-reads.py index 0ee096ad5a..8c5fbfa27d 100755 --- a/scripts/split-paired-reads.py +++ b/scripts/split-paired-reads.py @@ -16,6 +16,7 @@ Reads FASTQ and FASTA input, retains format for output. """ +from __future__ import print_function import screed import sys import os @@ -130,11 +131,12 @@ def main(): paired_iter = broken_paired_reader(screed_iter) for index, is_pair, record1, record2 in paired_iter: if index % 10000 == 0: - print >>sys.stderr, '...', index + print('...', index, file=sys.stderr) + # are we requiring pairs? if args.force_paired and not is_pair: - print >>sys.stderr, 'ERROR, %s is not part of a pair' % \ - record1.name + print('ERROR, %s is not part of a pair' % + record1.name, file=sys.stderr) sys.exit(1) if is_pair: @@ -151,15 +153,15 @@ def main(): write_record(record1, fp_out2) counter2 += 1 else: - print >>sys.stderr, \ - "Unrecognized format for read pair information: %s" % name - print >>sys.stderr, "Exiting." 
+ print("Unrecognized format for read pair information: %s" % + name, file=sys.stderr) + print("Exiting.", file=sys.stderr) sys.exit(1) - print >> sys.stderr, "DONE; split %d sequences (%d left, %d right)" % \ - (counter1 + counter2, counter1, counter2) - print >> sys.stderr, "/1 reads in %s" % out1 - print >> sys.stderr, "/2 reads in %s" % out2 + print("DONE; split %d sequences (%d left, %d right)" % + (counter1 + counter2, counter1, counter2), file=sys.stderr) + print("/1 reads in %s" % out1, file=sys.stderr) + print("/2 reads in %s" % out2, file=sys.stderr) if __name__ == '__main__': main() diff --git a/scripts/trim-low-abund.py b/scripts/trim-low-abund.py index 9e506e8bb7..dddd091321 100755 --- a/scripts/trim-low-abund.py +++ b/scripts/trim-low-abund.py @@ -14,6 +14,7 @@ Use -h for parameter help. """ +from __future__ import print_function import sys import screed import os @@ -113,8 +114,8 @@ def main(): ### if len(set(args.input_filenames)) != len(args.input_filenames): - print >>sys.stderr, \ - "Error: Cannot input the same filename multiple times." 
+ print("Error: Cannot input the same filename multiple times.", + file=sys.stderr) sys.exit(1) ### @@ -132,15 +133,16 @@ def main(): NORMALIZE_LIMIT = args.normalize_to if args.loadtable: - print >>sys.stderr, 'loading k-mer counting table from', args.loadtable + print('loading k-mer counting table from', + args.loadtable, file=sys.stderr) ct = khmer.load_counting_hash(args.loadtable) else: - print >>sys.stderr, 'making k-mer counting table' + print('making k-mer counting table', file=sys.stderr) ct = khmer.new_counting_hash(K, args.min_tablesize, args.n_tables) tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir) - print >>sys.stderr, 'created temporary directory %s; ' \ - 'use -T to change location' % tempdir + print('created temporary directory %s; ' + 'use -T to change location' % tempdir, file=sys.stderr) # ### FIRST PASS ### @@ -173,8 +175,8 @@ def main(): force_single=args.ignore_pairs) for n, is_pair, read1, read2 in paired_iter: if n % 10000 == 0: - print >>sys.stderr, '...', n, filename, save_pass2, \ - n_reads, n_bp, written_reads, written_bp + print('...', n, filename, save_pass2, n_reads, n_bp, + written_reads, written_bp, file=sys.stderr) # we want to track paired reads here, to make sure that pairs # are not split between first pass and second pass. @@ -240,8 +242,9 @@ def main(): pass2fp.close() - print >>sys.stderr, '%s: kept aside %d of %d from first pass, in %s' \ - % (filename, save_pass2, n, filename) + print('%s: kept aside %d of %d from first pass, in %s' % + (filename, save_pass2, n, filename), + file=sys.stderr) save_pass2_total += save_pass2 # ### SECOND PASS. 
### @@ -249,8 +252,9 @@ def main(): skipped_n = 0 skipped_bp = 0 for _, pass2filename, trimfp in pass2list: - print >>sys.stderr, ('second pass: looking at sequences kept aside ' - 'in %s') % pass2filename + print('second pass: looking at sequences kept aside in %s' % + pass2filename, + file=sys.stderr) # note that for this second pass, we don't care about paired # reads - they will be output in the same order they're read in, @@ -260,8 +264,8 @@ def main(): for n, read in enumerate(screed.open(pass2filename, parse_description=False)): if n % 10000 == 0: - print >>sys.stderr, '... x 2', n, pass2filename, \ - written_reads, written_bp + print('... x 2', n, pass2filename, + written_reads, written_bp, file=sys.stderr) seq = read.sequence.replace('N', 'A') med, _, _ = ct.get_median_count(seq) @@ -288,42 +292,45 @@ def main(): if trim_at != len(read.sequence): trimmed_reads += 1 - print >>sys.stderr, 'removing %s' % pass2filename + print('removing %s' % pass2filename, file=sys.stderr) os.unlink(pass2filename) - print >>sys.stderr, 'removing temp directory & contents (%s)' % tempdir + print('removing temp directory & contents (%s)' % tempdir, file=sys.stderr) shutil.rmtree(tempdir) n_passes = 1.0 + (float(save_pass2_total) / n_reads) percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\ n_reads * 100.0 - print >>sys.stderr, 'read %d reads, %d bp' % (n_reads, n_bp,) - print >>sys.stderr, 'wrote %d reads, %d bp' % (written_reads, written_bp,) - print >>sys.stderr, 'looked at %d reads twice (%.2f passes)' % \ - (save_pass2_total, n_passes) - print >>sys.stderr, 'removed %d reads and trimmed %d reads (%.2f%%)' % \ - (n_reads - written_reads, trimmed_reads, percent_reads_trimmed) - print >>sys.stderr, 'trimmed or removed %.2f%% of bases (%d total)' % \ - ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp) + print('read %d reads, %d bp' % (n_reads, n_bp,)) + print('wrote %d reads, %d bp' % (written_reads, written_bp,)) + print('looked at %d 
reads twice (%.2f passes)' % (save_pass2_total, + n_passes)) + print('removed %d reads and trimmed %d reads (%.2f%%)' % + (n_reads - written_reads, trimmed_reads, percent_reads_trimmed)) + print('trimmed or removed %.2f%% of bases (%d total)' % + ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp)) if args.variable_coverage: percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads - print >>sys.stderr, '%d reads were high coverage (%.2f%%);' % \ - (n_reads - skipped_n, percent_reads_hicov) - print >>sys.stderr, ('skipped %d reads/%d bases because of low' - 'coverage') % (skipped_n, skipped_bp) + print('%d reads were high coverage (%.2f%%);' % (n_reads - skipped_n, + percent_reads_hicov), + file=sys.stderr) + print('skipped %d reads/%d bases because of low coverage' % + (skipped_n, skipped_bp), + file=sys.stderr) fp_rate = \ khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 - print >>sys.stderr, \ - 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate) + print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate), + file=sys.stderr) - print >>sys.stderr, 'output in *.abundtrim' + print('output in *.abundtrim', file=sys.stderr) if args.savetable: - print >>sys.stderr, "Saving k-mer counting table to", args.savetable + print("Saving k-mer counting table to", + args.savetable, file=sys.stderr) ct.save(args.savetable) diff --git a/setup.py b/setup.py index 5ff3474e5a..a4a5e32ee2 100755 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under # the three-clause BSD license; see doc/LICENSE.txt. 
@@ -104,7 +104,7 @@ def check_for_openmp(): "khmer", "kmer_hash", "hashtable", "counting", "hashbits", "labelhash", "hllcounter", "khmer_exception", "read_aligner", "subset", "read_parsers"]) -SOURCES = ["khmer/_khmermodule.cc"] +SOURCES = ["khmer/_khmer.cc"] SOURCES.extend(path_join("lib", bn + ".cc") for bn in [ "trace_logger", "perf_metrics", "read_parsers", "kmer_hash", "hashtable", "hashbits", "labelhash", "counting", "subset", "read_aligner", @@ -134,7 +134,7 @@ def check_for_openmp(): "define_macros": [("VERSION", versioneer.get_version()), ], } -EXTENSION_MOD = Extension("khmer._khmermodule", # pylint: disable=W0142 +EXTENSION_MOD = Extension("khmer._khmer", # pylint: disable=W0142 ** EXTENSION_MOD_DICT) SCRIPTS = [] SCRIPTS.extend([path_join("scripts", script) @@ -215,9 +215,8 @@ def run(self): SETUP_METADATA["packages"]) if "z" not in self.libraries: - zcmd = ['bash', '-c', 'cd ' + ZLIBDIR + ' && ( test Makefile -nt' - ' configure || bash ./configure --static ) && make -f ' - 'Makefile.pic PIC'] + zcmd = ['bash', '-c', 'cd ' + ZLIBDIR + ' && bash ./configure ' + '--static && make -f Makefile.pic PIC'] spawn(cmd=zcmd, dry_run=self.dry_run) self.extensions[0].extra_objects.extend( path_join("third-party", "zlib", bn + ".lo") for bn in [ diff --git a/tests/khmer_tst_utils.py b/tests/khmer_tst_utils.py index 85f20558fb..b3b2bafb13 100644 --- a/tests/khmer_tst_utils.py +++ b/tests/khmer_tst_utils.py @@ -1,3 +1,4 @@ +from __future__ import print_function # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. 
It is licensed under @@ -8,11 +9,16 @@ import os import shutil from pkg_resources import Requirement, resource_filename, ResolutionError -from cStringIO import StringIO import nose import sys import traceback import subprocess +from io import open + +try: + from StringIO import StringIO +except ImportError: + from io import StringIO def get_test_data(filename): @@ -100,8 +106,8 @@ def runscript(scriptname, args, in_directory=None, os.chdir(in_directory) try: - print 'running:', scriptname, 'in:', in_directory - print 'arguments', sysargs + print('running:', scriptname, 'in:', in_directory) + print('arguments', sysargs) status = _runscript(scriptname, sandbox=sandbox) except nose.SkipTest: raise @@ -118,8 +124,8 @@ def runscript(scriptname, args, in_directory=None, os.chdir(cwd) if status != 0 and not fail_ok: - print out - print err + print(out) + print(err) assert False, (status, out, err) return status, out, err @@ -148,10 +154,10 @@ def runscriptredirect(scriptname, args, stdinfilename, in_directory=None, os.chdir(in_directory) sysargs = 'cat ' + stdinfilename + ' | python ' + scriptfile + \ " " + args - out = open(os.path.join(in_directory, "out"), 'w+b') - err = open(os.path.join(in_directory, "err"), 'w+b') - print 'running:', scriptname, 'in:', in_directory - print 'arguments', sysargs + out = open(os.path.join(in_directory, "out"), 'w+', encoding='utf-8') + err = open(os.path.join(in_directory, "err"), 'w+', encoding='utf-8') + print('running:', scriptname, 'in:', in_directory) + print('arguments', sysargs) status = subprocess.call(args=sysargs, stdout=out, stderr=err, shell=True) os.chdir(cwd) @@ -160,8 +166,8 @@ def runscriptredirect(scriptname, args, stdinfilename, in_directory=None, out = out.read() err.seek(0) err = err.read() - print out - print err + print(out) + print(err) assert False, (status, out, err) return status, out, err diff --git a/tests/test_counting_hash.py b/tests/test_counting_hash.py index fcd630b054..4b2d269f53 100644 --- 
a/tests/test_counting_hash.py +++ b/tests/test_counting_hash.py @@ -1,3 +1,5 @@ +from __future__ import print_function +from __future__ import absolute_import, unicode_literals # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under @@ -11,7 +13,7 @@ import shutil import khmer -import khmer_tst_utils as utils +from . import khmer_tst_utils as utils from khmer import ReadParser import screed @@ -128,27 +130,18 @@ def test_get_raw_tables(): tables = ht.get_raw_tables() for size, table in zip(ht.hashsizes(), tables): - assert isinstance(table, buffer) + assert isinstance(table, memoryview) assert size == len(table) def test_get_raw_tables_view(): - try: - memoryview - except NameError: - raise nose.SkipTest("This test requires memoryview") ht = khmer.new_counting_hash(20, 1e5, 4) tables = ht.get_raw_tables() for tab in tables: - try: - memv = memoryview(tab) - except TypeError: - raise nose.SkipTest("This test needs a higher version of Python.") - assert sum(memv.tolist()) == 0 + assert sum(tab.tolist()) == 0 ht.consume('AAAATTTTCCCCGGGGAAAA') for tab in tables: - memv = memoryview(tab) - assert sum(memv.tolist()) == 1 + assert sum(tab.tolist()) == 1 @attr('linux') @@ -157,7 +150,7 @@ def test_toobig(): ct = khmer.new_counting_hash(30, 1e13, 1) assert 0, "this should fail" except MemoryError as err: - print str(err) + print(str(err)) def test_3_tables(): @@ -199,35 +192,35 @@ def test_simple_median(): hi.consume("AAAAAA") (median, average, stddev) = hi.get_median_count("AAAAAA") - print median, average, stddev + print(median, average, stddev) assert median == 1 assert average == 1.0 assert stddev == 0.0 hi.consume("AAAAAA") (median, average, stddev) = hi.get_median_count("AAAAAA") - print median, average, stddev + print(median, average, stddev) assert median == 2 assert average == 2.0 assert stddev == 0.0 hi.consume("AAAAAT") (median, average, stddev) = 
hi.get_median_count("AAAAAAT") - print median, average, stddev + print(median, average, stddev) assert median == 2 assert average == 1.5 assert int(stddev * 100) == 50 # .5 hi.consume("AAAAAT") (median, average, stddev) = hi.get_median_count("AAAAAAT") - print median, average, stddev + print(median, average, stddev) assert median == 2 assert average == 2.0 assert stddev == 0.0 hi.consume("AAAAAT") (median, average, stddev) = hi.get_median_count("AAAAAAT") - print median, average, stddev + print(median, average, stddev) assert median == 3 assert average == 2.5 assert int(stddev * 100) == 50 # .5 @@ -713,7 +706,7 @@ def test_find_spectral_error_positions_1(): hi.consume(DNA[:30]) for n in range(len(DNA) - 8 + 1): - print n, hi.get(DNA[n:n + 8]) + print(n, hi.get(DNA[n:n + 8])) posns = hi.find_spectral_error_positions(DNA, 1) assert posns == [30], posns @@ -736,7 +729,7 @@ def test_find_spectral_error_positions_6(): hi.consume(DNA[1:]) for n in range(len(DNA) - 8 + 1): - print n, hi.get(DNA[n:n + 8]) + print(n, hi.get(DNA[n:n + 8])) posns = hi.find_spectral_error_positions(DNA, 1) assert posns == [0], posns @@ -770,7 +763,7 @@ def test_find_spectral_error_positions_6(): hi.consume(DNA[K:]) for n in range(len(DNA) - 8 + 1): - print n, hi.get(DNA[n:n + 8]) + print(n, hi.get(DNA[n:n + 8])) posns = hi.find_spectral_error_positions(DNA, 1) assert posns == [7], posns @@ -901,7 +894,7 @@ def test_bigcount_abund_dist(): kh.consume_fasta(seqpath) dist = kh.abundance_distribution(seqpath, tracking) - print kh.get('GGTTGACGGGGCTCAGGG') + print(kh.get('GGTTGACGGGGCTCAGGG')) pdist = [(i, dist[i]) for i in range(len(dist)) if dist[i]] assert dist[1001] == 1, pdist @@ -919,7 +912,7 @@ def test_bigcount_abund_dist_2(): kh.count('GGTTGACGGGGCTCAGGG') dist = kh.abundance_distribution(seqpath, tracking) - print kh.get('GGTTGACGGGGCTCAGGG') + print(kh.get('GGTTGACGGGGCTCAGGG')) pdist = [(i, dist[i]) for i in range(len(dist)) if dist[i]] assert dist[1001] == 1, pdist @@ -962,7 +955,7 @@ 
def test_load_notexist_should_fail(): hi.load(savepath) assert 0, "load should fail" except IOError as e: - print str(e) + print(str(e)) def test_load_truncated_should_fail(): @@ -986,7 +979,7 @@ def test_load_truncated_should_fail(): hi.load(savepath) assert 0, "load should fail" except IOError as e: - print str(e) + print(str(e)) def test_load_gz_notexist_should_fail(): @@ -997,7 +990,7 @@ def test_load_gz_notexist_should_fail(): hi.load(savepath) assert 0, "load should fail" except IOError as e: - print str(e) + print(str(e)) def test_load_gz_truncated_should_fail(): @@ -1021,7 +1014,7 @@ def test_load_gz_truncated_should_fail(): hi.load(savepath) assert 0, "load should fail" except IOError as e: - print str(e) + print(str(e)) def test_counting_file_version_check(): @@ -1033,7 +1026,7 @@ def test_counting_file_version_check(): ht.load(inpath) assert 0, "this should fail" except IOError as e: - print str(e) + print(str(e)) def test_counting_gz_file_version_check(): @@ -1045,7 +1038,7 @@ def test_counting_gz_file_version_check(): ht.load(inpath) assert 0, "this should fail" except IOError as e: - print str(e) + print(str(e)) def test_counting_file_type_check(): @@ -1057,7 +1050,7 @@ def test_counting_file_type_check(): kh.load(inpath) assert 0, "this should fail" except IOError as e: - print str(e) + print(str(e)) def test_counting_gz_file_type_check(): @@ -1071,7 +1064,7 @@ def test_counting_gz_file_type_check(): kh.load(inpath) assert 0, "this should fail" except IOError as e: - print str(e) + print(str(e)) def test_counting_bad_primes_list(): @@ -1079,7 +1072,7 @@ def test_counting_bad_primes_list(): ht = khmer.CountingHash(12, ["a", "b", "c"], 1) assert 0, "bad list of primes should fail" except TypeError as e: - print str(e) + print(str(e)) def test_bad_use_bigcount(): @@ -1090,7 +1083,7 @@ def test_bad_use_bigcount(): countingtable.get_use_bigcount(True) assert 0, "this should fail" except TypeError as err: - print str(err) + print(str(err)) def 
test_consume_absentfasta(): @@ -1099,7 +1092,7 @@ def test_consume_absentfasta(): countingtable.consume_fasta("absent_file.fa") assert 0, "This should fail" except IOError as err: - print str(err) + print(str(err)) def test_consume_absentfasta_with_reads_parser(): @@ -1108,15 +1101,15 @@ def test_consume_absentfasta_with_reads_parser(): countingtable.consume_fasta_with_reads_parser() assert 0, "this should fail" except TypeError as err: - print str(err) + print(str(err)) try: readparser = ReadParser(utils.get_test_data('empty-file')) countingtable.consume_fasta_with_reads_parser(readparser) assert 0, "this should fail" except IOError as err: - print str(err) + print(str(err)) except ValueError as err: - print str(err) + print(str(err)) def test_badconsume(): @@ -1125,12 +1118,12 @@ def test_badconsume(): countingtable.consume() assert 0, "this should fail" except TypeError as err: - print str(err) + print(str(err)) try: countingtable.consume("AAA") assert 0, "this should fail" except ValueError as err: - print str(err) + print(str(err)) def test_get_badmin_count(): @@ -1139,12 +1132,12 @@ def test_get_badmin_count(): countingtable.get_min_count() assert 0, "this should fail" except TypeError as err: - print str(err) + print(str(err)) try: countingtable.get_min_count("AAA") assert 0, "this should fail" except ValueError as err: - print str(err) + print(str(err)) def test_get_badmax_count(): @@ -1153,12 +1146,12 @@ def test_get_badmax_count(): countingtable.get_max_count() assert 0, "this should fail" except TypeError as err: - print str(err) + print(str(err)) try: countingtable.get_max_count("AAA") assert 0, "this should fail" except ValueError as err: - print str(err) + print(str(err)) def test_get_badmedian_count(): @@ -1167,12 +1160,12 @@ def test_get_badmedian_count(): countingtable.get_median_count() assert 0, "this should fail" except TypeError as err: - print str(err) + print(str(err)) try: countingtable.get_median_count("AAA") assert 0, "this should fail" 
except ValueError as err: - print str(err) + print(str(err)) def test_get_badkadian_count(): @@ -1181,12 +1174,12 @@ def test_get_badkadian_count(): countingtable.get_kadian_count() assert 0, "this should fail" except TypeError as err: - print str(err) + print(str(err)) try: countingtable.get_kadian_count("AAA") assert 0, "this should fail" except ValueError as err: - print str(err) + print(str(err)) def test_badget(): @@ -1195,7 +1188,7 @@ def test_badget(): countingtable.get() assert 0, "this should fail" except TypeError as err: - print str(err) + print(str(err)) def test_badget_2(): @@ -1211,7 +1204,7 @@ def test_badget_2(): countingtable.get("AGCTT") assert 0, "this should fail" except ValueError as err: - print str(err) + print(str(err)) def test_badtrim(): @@ -1222,7 +1215,7 @@ def test_badtrim(): countingtable.trim_on_abundance() assert 0, "this should fail" except TypeError as err: - print str(err) + print(str(err)) countingtable.trim_on_abundance("AAAAAA", 1) @@ -1231,19 +1224,19 @@ def test_badfasta_count_kmers_by_position(): try: countingtable.fasta_count_kmers_by_position() except TypeError as err: - print str(err) + print(str(err)) filename = utils.get_test_data("test-short.fa") try: countingtable.fasta_count_kmers_by_position(filename, -1, 0) assert 0, "this should fail" except ValueError as err: - print str(err) + print(str(err)) try: countingtable.fasta_count_kmers_by_position(filename, 0, -1) assert 0, "this should fail" except ValueError as err: - print str(err) + print(str(err)) def test_badload(): @@ -1252,7 +1245,7 @@ def test_badload(): countingtable.load() assert 0, "this should fail" except TypeError as err: - print str(err) + print(str(err)) def test_badsave(): @@ -1261,7 +1254,7 @@ def test_badsave(): countingtable.save() assert 0, "this should fail" except TypeError as err: - print str(err) + print(str(err)) def test_badksize(): @@ -1270,7 +1263,7 @@ def test_badksize(): countingtable.ksize(True) assert 0, "this should fail" except 
TypeError as err: - print str(err) + print(str(err)) def test_badhashsizes(): @@ -1279,7 +1272,7 @@ def test_badhashsizes(): countingtable.hashsizes(True) assert 0, "this should fail" except TypeError as err: - print str(err) + print(str(err)) def test_badconsume_and_tag(): @@ -1288,7 +1281,7 @@ def test_badconsume_and_tag(): countingtable.consume_and_tag() assert 0, "this should fail" except TypeError as err: - print str(err) + print(str(err)) def test_consume_fasta_and_tag(): @@ -1297,7 +1290,7 @@ def test_consume_fasta_and_tag(): countingtable.consume_fasta_and_tag() assert 0, "this should fail" except TypeError as err: - print str(err) + print(str(err)) countingtable.consume_fasta_and_tag(utils.get_test_data("test-graph2.fa")) diff --git a/tests/test_counting_single.py b/tests/test_counting_single.py index 507596f540..73c4dc77c3 100644 --- a/tests/test_counting_single.py +++ b/tests/test_counting_single.py @@ -1,3 +1,5 @@ +from __future__ import print_function +from __future__ import absolute_import # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under @@ -8,7 +10,7 @@ # pylint: disable=C0111,C0103 import khmer -import khmer_tst_utils as utils +from . 
import khmer_tst_utils as utils from nose.plugins.attrib import attr MAX_COUNT = 255 @@ -30,7 +32,7 @@ def test_toobig(): ct = khmer.new_hashtable(4, 1000000000000) assert 0, "this should fail" except MemoryError as err: - print str(err) + print(str(err)) def test_collision(): @@ -49,12 +51,12 @@ def test_badcount(): countingtable.count() assert 0, "count should require one argument" except TypeError as err: - print str(err) + print(str(err)) try: countingtable.count('ABCDE') assert 0, "count should require k-mer size to be equal" except ValueError as err: - print str(err) + print(str(err)) def test_hashtable_n_entries(): @@ -63,7 +65,7 @@ def test_hashtable_n_entries(): countingtable.n_entries("nope") assert 0, "n_entries should accept no arguments" except TypeError as err: - print str(err) + print(str(err)) def test_complete_no_collision(): @@ -143,7 +145,7 @@ def test_maxcount(): kh.count('AAAA') c = kh.get('AAAA') - print last_count, c + print(last_count, c) if c == last_count: break last_count = c @@ -162,7 +164,7 @@ def test_maxcount_with_bigcount(): kh.count('AAAA') c = kh.get('AAAA') - print last_count, c + print(last_count, c) if c == last_count: break last_count = c @@ -278,7 +280,7 @@ def test_badget(): kh.get("AGCTT") assert 0, "this should fail" except ValueError as err: - print str(err) + print(str(err)) def test_64bitshift(): @@ -328,7 +330,7 @@ def test_n_occupied(self): self.kh.n_occupied("MU", 1, 3) assert 0, "n_occupied shouldn't accept three arguments" except TypeError as err: - print str(err) + print(str(err)) def test_abundance_by_pos(self): kh = self.kh diff --git a/tests/test_filter.py b/tests/test_filter.py index 45e1fc645e..193ac89a24 100644 --- a/tests/test_filter.py +++ b/tests/test_filter.py @@ -1,3 +1,5 @@ +from __future__ import print_function +from __future__ import absolute_import # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. 
It is licensed under @@ -8,7 +10,7 @@ from screed.fasta import fasta_iter from nose.plugins.attrib import attr -import khmer_tst_utils as utils +from . import khmer_tst_utils as utils def teardown(): @@ -35,18 +37,18 @@ def test_abund(self): ht.consume_fasta() assert 0, "should fail" except TypeError as err: - print str(err) + print(str(err)) try: ht.consume_fasta("nonexistent") assert 0, "should fail" except IOError as err: - print str(err) + print(str(err)) ht.output_fasta_kmer_pos_freq(filename, outname) try: ht.output_fasta_kmer_pos_freq() assert 0, "should fail" except TypeError as err: - print str(err) + print(str(err)) fd = open(outname, "r") diff --git a/tests/test_functions.py b/tests/test_functions.py index 31ddc55d27..39214af7c1 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -1,3 +1,5 @@ +from __future__ import print_function +from __future__ import absolute_import # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under @@ -7,7 +9,7 @@ import khmer from nose.plugins.attrib import attr import os -import khmer_tst_utils as utils +from . 
import khmer_tst_utils as utils import collections from khmer.utils import (check_is_pair, broken_paired_reader, check_is_left, check_is_right) @@ -84,7 +86,7 @@ def test_extract_countinghash_info(): info = khmer.extract_countinghash_info(fn) ksize, table_size, n_tables, _, _, _ = info - print ksize, table_size, n_tables + print(ksize, table_size, n_tables) assert(ksize) == 25 assert table_size == size @@ -93,7 +95,7 @@ def test_extract_countinghash_info(): try: os.remove(fn) except OSError as e: - print >>sys.stder, '...failed to remove {fn}'.format(fn) + print('...failed to remove {fn}'.format(fn), file=sys.stder) def test_extract_hashbits_info(): @@ -104,7 +106,7 @@ def test_extract_hashbits_info(): info = khmer.extract_hashbits_info(fn) ksize, table_size, n_tables, _, _ = info - print ksize, table_size, n_tables + print(ksize, table_size, n_tables) assert(ksize) == 25 assert table_size == size @@ -113,7 +115,7 @@ def test_extract_hashbits_info(): try: os.remove(fn) except OSError as e: - print >>sys.stderr, '...failed to remove {fn}'.format(fn) + print('...failed to remove {fn}'.format(fn), file=sys.stderr) def test_check_file_status_kfile(): diff --git a/tests/test_graph.py b/tests/test_graph.py index 51f43a2f5e..350db5d1e1 100644 --- a/tests/test_graph.py +++ b/tests/test_graph.py @@ -1,3 +1,5 @@ +from __future__ import print_function +from __future__ import absolute_import # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under @@ -7,7 +9,7 @@ import khmer import screed -import khmer_tst_utils as utils +from . 
import khmer_tst_utils as utils from nose.plugins.attrib import attr @@ -237,7 +239,7 @@ def test_output_fq(self): output_file = utils.get_temp_filename('parttest') ht.output_partitions(filename, output_file, False) - print open(output_file).read() + print(open(output_file).read()) x = set([r.quality for r in screed.open(output_file)]) assert x, x @@ -314,7 +316,7 @@ def test_find_all_tags_kmersize(self): b = "GAGCACTTTAACCCTGCAGAGTGGCCAAGGCT" c = "GGAGCACTTATCATGGAGATATATCCCGTGCTTAAACATCGCACTTTAACCCTGCAGAGT" - print ht.consume(a) + print(ht.consume(a)) try: ppi = ht.find_all_tags(c[:19]) assert False, "should raise a ValueError for wrong k-mer size" @@ -334,17 +336,17 @@ def test_ordered_connect(self): b = "GAGCACTTTAACCCTGCAGAGTGGCCAAGGCT" c = "GGAGCACTTATCATGGAGATATATCCCGTGCTTAAACATCGCACTTTAACCCTGCAGAGT" - print ht.consume(a) + print(ht.consume(a)) ppi = ht.find_all_tags(a[:20]) pid = ht.assign_partition_id(ppi) assert pid == 0, pid - print ht.consume(b) + print(ht.consume(b)) ppi = ht.find_all_tags(b[:20]) pid = ht.assign_partition_id(ppi) assert pid == 0, pid - print ht.consume(c) + print(ht.consume(c)) ppi = ht.find_all_tags(c[:20]) pid = ht.assign_partition_id(ppi) assert pid == 2, pid diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py index 6186741cd4..f7525eca5e 100644 --- a/tests/test_hashbits.py +++ b/tests/test_hashbits.py @@ -1,3 +1,5 @@ +from __future__ import print_function +from __future__ import absolute_import # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under @@ -11,7 +13,7 @@ from screed.fasta import fasta_iter import screed -import khmer_tst_utils as utils +from . 
import khmer_tst_utils as utils from nose.plugins.attrib import attr @@ -268,7 +270,7 @@ def test_count_within_radius_simple(): inpfile = utils.get_test_data('all-A.fa') ht = khmer.new_hashbits(4, 2, 2) - print ht.consume_fasta(inpfile) + print(ht.consume_fasta(inpfile)) n = ht.count_kmers_within_radius('AAAA', 1) assert n == 1 @@ -459,7 +461,7 @@ def test_extract_unique_paths_1(): kh.consume('AGTGGCGATG') x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) - print x + print(x) assert x == ['ATGGAGAGACACAGATAGACAGGAGTGGCGAT'] # all but the last k-mer @@ -468,7 +470,7 @@ def test_extract_unique_paths_2(): kh.consume('ATGGAGAGAC') x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) - print x + print(x) assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGATG'] # all but the 1st k-mer @@ -478,7 +480,7 @@ def test_extract_unique_paths_3(): kh.consume('ATGGAGAGAC') kh.consume('AGTGGCGATG') x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) - print x + print(x) # all but the 1st/last k-mer assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGAT'] @@ -492,7 +494,7 @@ def test_extract_unique_paths_4(): kh.consume('ATAGACAGGA') x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) - print x + print(x) assert x == ['TGGAGAGACACAGATAGACAGG', 'TAGACAGGAGTGGCGAT'] @@ -566,14 +568,14 @@ def test_simple_median(): hi = khmer.new_hashbits(6, 2, 2) (median, average, stddev) = hi.get_median_count("AAAAAA") - print median, average, stddev + print(median, average, stddev) assert median == 0 assert average == 0.0 assert stddev == 0.0 hi.consume("AAAAAA") (median, average, stddev) = hi.get_median_count("AAAAAA") - print median, average, stddev + print(median, average, stddev) assert median == 1 assert average == 1.0 assert stddev == 0.0 @@ -600,7 +602,7 @@ def test_badget(): hbts.get(u"AGCTT") assert 0, "this should fail" except ValueError as err: - print str(err) + print(str(err)) # @@ -638,7 +640,7 @@ def 
test_load_truncated_should_fail(): hi.load(savepath) assert 0, "load should fail" except IOError as e: - print str(e) + print(str(e)) def test_save_load_tagset_notexist(): @@ -649,7 +651,7 @@ def test_save_load_tagset_notexist(): ht.load_tagset(outfile) assert 0, "this test should fail" except IOError as e: - print str(e) + print(str(e)) def test_save_load_tagset_trunc(): @@ -732,7 +734,7 @@ def test_hashbits_file_version_check(): ht.load(inpath) assert 0, "this should fail" except IOError as e: - print str(e) + print(str(e)) def test_hashbits_file_type_check(): @@ -746,7 +748,7 @@ def test_hashbits_file_type_check(): ht.load(savepath) assert 0, "this should fail" except IOError as e: - print str(e) + print(str(e)) def test_stoptags_file_version_check(): @@ -758,7 +760,7 @@ def test_stoptags_file_version_check(): ht.load_stop_tags(inpath) assert 0, "this should fail" except IOError as e: - print str(e) + print(str(e)) def test_stoptags_ksize_check(): @@ -769,7 +771,7 @@ def test_stoptags_ksize_check(): ht.load_stop_tags(inpath) assert 0, "this should fail" except IOError as e: - print str(e) + print(str(e)) def test_stop_tags_filetype_check(): @@ -780,7 +782,7 @@ def test_stop_tags_filetype_check(): ht.load_stop_tags(inpath) assert 0, "this should fail" except IOError as e: - print str(e) + print(str(e)) def test_tagset_file_version_check(): @@ -792,7 +794,7 @@ def test_tagset_file_version_check(): ht.load_tagset(inpath) assert 0, "this should fail" except IOError as e: - print str(e) + print(str(e)) def test_stop_tags_truncate_check(): @@ -822,7 +824,7 @@ def test_tagset_ksize_check(): ht.load_tagset(inpath) assert 0, "this should fail" except IOError as e: - print str(e) + print(str(e)) def test_tagset_filetype_check(): @@ -833,7 +835,7 @@ def test_tagset_filetype_check(): ht.load_tagset(inpath) assert 0, "this should fail" except IOError as e: - print str(e) + print(str(e)) def test_bad_primes_list(): @@ -841,7 +843,7 @@ def test_bad_primes_list(): coutingtable 
= khmer._Hashbits(31, ["a", "b", "c"], 1) assert 0, "Bad primes list should fail" except TypeError as e: - print str(e) + print(str(e)) def test_consume_absentfasta_with_reads_parser(): @@ -850,12 +852,12 @@ def test_consume_absentfasta_with_reads_parser(): presencetable.consume_fasta_with_reads_parser() assert 0, "this should fail" except TypeError as err: - print str(err) + print(str(err)) try: readparser = ReadParser(utils.get_test_data('empty-file')) presencetable.consume_fasta_with_reads_parser(readparser) assert 0, "this should fail" except IOError as err: - print str(err) + print(str(err)) except ValueError as err: - print str(err) + print(str(err)) diff --git a/tests/test_hashbits_obj.py b/tests/test_hashbits_obj.py index 6e602d7df3..9de8e799ec 100644 --- a/tests/test_hashbits_obj.py +++ b/tests/test_hashbits_obj.py @@ -1,3 +1,5 @@ +from __future__ import print_function +from __future__ import absolute_import # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under @@ -17,7 +19,7 @@ from screed.fasta import fasta_iter import screed -import khmer_tst_utils as utils +from . 
import khmer_tst_utils as utils from nose.plugins.attrib import attr @@ -31,7 +33,7 @@ def test_toobig(): pt = khmer.Hashbits(32, 1e13, 1) assert 0, "This should fail" except MemoryError as err: - print str(err) + print(str(err)) def test__get_set_tag_density(): @@ -213,7 +215,7 @@ def test_count_within_radius_simple(): inpfile = utils.get_test_data('all-A.fa') ht = khmer.Hashbits(4, 1e6, 2) - print ht.consume_fasta(inpfile) + print(ht.consume_fasta(inpfile)) n = ht.count_kmers_within_radius('AAAA', 1) assert n == 1 @@ -404,7 +406,7 @@ def test_extract_unique_paths_1(): kh.consume('AGTGGCGATG') x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) - print x + print(x) assert x == ['ATGGAGAGACACAGATAGACAGGAGTGGCGAT'] # all but the last k-mer @@ -413,7 +415,7 @@ def test_extract_unique_paths_2(): kh.consume('ATGGAGAGAC') x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) - print x + print(x) assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGATG'] # all but the 1st k-mer @@ -423,7 +425,7 @@ def test_extract_unique_paths_3(): kh.consume('ATGGAGAGAC') kh.consume('AGTGGCGATG') x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) - print x + print(x) # all but the 1st/last k-mer assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGAT'] @@ -437,7 +439,7 @@ def test_extract_unique_paths_4(): kh.consume('ATAGACAGGA') x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) - print x + print(x) assert x == ['TGGAGAGACACAGATAGACAGG', 'TAGACAGGAGTGGCGAT'] @@ -511,14 +513,14 @@ def test_simple_median(): hi = khmer.Hashbits(6, 1e6, 2) (median, average, stddev) = hi.get_median_count("AAAAAA") - print median, average, stddev + print(median, average, stddev) assert median == 0 assert average == 0.0 assert stddev == 0.0 hi.consume("AAAAAA") (median, average, stddev) = hi.get_median_count("AAAAAA") - print median, average, stddev + print(median, average, stddev) assert median == 1 assert average == 1.0 assert stddev == 0.0 @@ 
-539,7 +541,7 @@ def test_badget(): hbts.get("AGCTT") assert 0, "this should fail" except ValueError as err: - print str(err) + print(str(err)) def test_bad_primes(): @@ -548,7 +550,7 @@ def test_bad_primes(): khmer._Hashbits, 6, ["a", "b", "c"]) assert 0, "this should fail" except TypeError as e: - print str(e) + print(str(e)) def test_consume_fasta_and_tag_with_badreads_parser(): @@ -558,6 +560,6 @@ def test_consume_fasta_and_tag_with_badreads_parser(): presencetable.consume_fasta_and_tag_with_reads_parser(readsparser) assert 0, "this should fail" except IOError as e: - print str(e) + print(str(e)) except ValueError as e: - print str(e) + print(str(e)) diff --git a/tests/test_hll.py b/tests/test_hll.py index 26cf33f0e6..e9d3b9b7ca 100644 --- a/tests/test_hll.py +++ b/tests/test_hll.py @@ -1,3 +1,5 @@ +from __future__ import division +from __future__ import absolute_import # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2014-2015. It is licensed under @@ -12,7 +14,7 @@ from screed.fasta import fasta_iter -import khmer_tst_utils as utils +from . import khmer_tst_utils as utils from nose.tools import assert_raises @@ -235,7 +237,7 @@ def test_hll_change_ksize(): hllcpp.ksize = 24 assert hllcpp.ksize == 24 - hllcpp.ksize = 12L + hllcpp.ksize = 12 assert hllcpp.ksize == 12 with assert_raises(ValueError): diff --git a/tests/test_labelhash.py b/tests/test_labelhash.py index 0348741556..5791c34d19 100644 --- a/tests/test_labelhash.py +++ b/tests/test_labelhash.py @@ -1,3 +1,5 @@ +from __future__ import print_function +from __future__ import absolute_import # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under @@ -11,7 +13,7 @@ from screed.fasta import fasta_iter import screed -import khmer_tst_utils as utils +from . 
import khmer_tst_utils as utils from nose.plugins.attrib import attr @@ -29,7 +31,7 @@ def test_toobig(): lh = LabelHash(20, 1e13, 1) assert 0, "This should fail." except MemoryError as err: - print str(err) + print(str(err)) def test_error_create(): @@ -46,7 +48,7 @@ def test_n_labels(): filename = utils.get_test_data('test-labels.fa') lh.consume_fasta_and_tag_with_labels(filename) - print lh.n_labels() + print(lh.n_labels()) assert lh.n_labels() == 4 @@ -174,22 +176,22 @@ def test_consume_fasta_and_tag_with_labels(): filename = utils.get_test_data('test-transcript.fa') total_reads, n_consumed = lb.consume_fasta_and_tag_with_labels(filename) - print "doing get" + print("doing get") assert lb.graph.get(read_1[:20]) assert total_reads == 3 - print "doing n_labels" - print lb.n_labels() - print "doing label dict" - print lb.get_label_dict() - print "get tagset" + print("doing n_labels") + print(lb.n_labels()) + print("doing label dict") + print(lb.get_label_dict()) + print("get tagset") for tag in lb.graph.get_tagset(): - print "forward hash" - print tag, khmer.forward_hash(tag, 20) + print("forward hash") + print(tag, khmer.forward_hash(tag, 20)) for record in screed.open(filename): - print "Sweeping tags" - print lb.sweep_tag_neighborhood(record.sequence, 40) - print "Sweeping labels..." 
- print lb.sweep_label_neighborhood(record.sequence, 40) + print("Sweeping tags") + print(lb.sweep_tag_neighborhood(record.sequence, 40)) + print("Sweeping labels...") + print(lb.sweep_label_neighborhood(record.sequence, 40)) assert lb.n_labels() == 3 @@ -260,11 +262,11 @@ def test_label_tag_correctness(): labels = lb.sweep_label_neighborhood( 'ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG' 'CTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT') - print lb.sweep_tag_neighborhood( + print(lb.sweep_tag_neighborhood( 'TTCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG' - 'CTAGGCTAGGTGTGCTCTGCTAGAGCTAGGCTAGGTGT') - print labels - print len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG') - 19 + 'CTAGGCTAGGTGTGCTCTGCTAGAGCTAGGCTAGGTGT')) + print(labels) + print(len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG') - 19) assert len(labels) == 2 assert 0 in labels assert 1 in labels @@ -273,7 +275,7 @@ def test_label_tag_correctness(): labels = lb.sweep_label_neighborhood( 'GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAG' 'ATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA') - print labels + print(labels) assert len(labels) == 3 assert 0 in labels assert 1 in labels @@ -284,7 +286,7 @@ def test_label_tag_correctness(): 'TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAG' 'CTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCA' 'ACAACACATACA') - print labels + print(labels) assert len(labels) == 2 assert 1 in labels assert 2 in labels @@ -292,7 +294,7 @@ def test_label_tag_correctness(): # read D labels = lb.sweep_label_neighborhood( 'TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC') - print labels + print(labels) assert len(labels) == 1 assert 3 in labels diff --git a/tests/test_lump.py b/tests/test_lump.py index 824b8a0a14..f503bd7dad 100644 --- a/tests/test_lump.py +++ b/tests/test_lump.py @@ -1,3 +1,4 @@ +from __future__ import absolute_import # # This file is part of khmer, 
https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under @@ -8,7 +9,7 @@ import khmer import screed -import khmer_tst_utils as utils +from . import khmer_tst_utils as utils from nose.plugins.attrib import attr # Below, 'fakelump.fa' is an artificial data set of 3x1 kb sequences in @@ -138,7 +139,7 @@ def test_fakelump_load_stop_tags_trunc(): EXCURSION_KMER_COUNT_THRESHOLD) ht.save_stop_tags(fakelump_fa_foo) - data = open(fakelump_fa_foo).read() + data = open(fakelump_fa_foo, 'rb').read() fp = open(fakelump_fa_foo, 'wb') fp.write(data[:10]) diff --git a/tests/test_read_aligner.py b/tests/test_read_aligner.py index 7017765a33..34df28d8b6 100644 --- a/tests/test_read_aligner.py +++ b/tests/test_read_aligner.py @@ -1,3 +1,5 @@ +from __future__ import print_function +from builtins import range # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under @@ -214,8 +216,8 @@ def test_readalign_new(): for query in queries: score, graphAlign, readAlign, trunc = aligner.align(query["seq"]) - print graphAlign - print readAlign + print(graphAlign) + print(readAlign) eq_(graphAlign, query["graph_aln"]) eq_(readAlign, query["read_aln"]) eq_(trunc, query["truncated"]) diff --git a/tests/test_read_parsers.py b/tests/test_read_parsers.py index 200c26ad70..c785772d4d 100644 --- a/tests/test_read_parsers.py +++ b/tests/test_read_parsers.py @@ -1,3 +1,5 @@ +from __future__ import print_function +from __future__ import absolute_import # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under @@ -10,7 +12,7 @@ import khmer from khmer import ReadParser -import khmer_tst_utils as utils +from . 
import khmer_tst_utils as utils from nose.plugins.attrib import attr from functools import reduce @@ -68,7 +70,7 @@ def count_reads(rparser): n_threads = 4 threads = [] rparser = ReadParser(utils.get_test_data("100-reads.fq.gz")) - for _ in xrange(n_threads): + for _ in range(n_threads): thr = threading.Thread(target=count_reads, args=[rparser, ]) threads.append(thr) thr.start() @@ -108,7 +110,7 @@ def test_gzip_decompression_truncated(): pass assert 0, "this should fail" except IOError as err: - print str(err) + print(str(err)) def test_gzip_decompression_truncated_pairiter(): @@ -119,7 +121,7 @@ def test_gzip_decompression_truncated_pairiter(): pass assert 0, "this should fail" except IOError as err: - print str(err) + print(str(err)) def test_bzip2_decompression(): @@ -140,7 +142,7 @@ def test_bzip2_decompression_truncated(): pass assert 0, "this should fail" except IOError as err: - print str(err) + print(str(err)) def test_bzip2_decompression_truncated_pairiter(): @@ -151,7 +153,7 @@ def test_bzip2_decompression_truncated_pairiter(): pass assert 0, "this should fail" except IOError as err: - print str(err) + print(str(err)) def test_badbzip2(): @@ -161,9 +163,9 @@ def test_badbzip2(): pass assert 0, "this should fail" except IOError as err: - print str(err) + print(str(err)) except ValueError as err: - print str(err) + print(str(err)) @attr('multithread') @@ -184,7 +186,7 @@ def count_reads(rparser, counters, tnum): threads = [] reads_counts_per_thread = [0] * N_THREADS rparser = ReadParser(utils.get_test_data(testfile)) - for tnum in xrange(N_THREADS): + for tnum in range(N_THREADS): t = \ threading.Thread( target=count_reads, @@ -308,10 +310,10 @@ def test_read_pair_iterator_in_error_mode(): in rparser.iter_read_pairs(ReadParser.PAIR_MODE_ERROR_ON_UNPAIRED): read_pairs_2.append([read_1, read_2]) matches = \ - map( + list(map( lambda rp1, rp2: rp1[0].name == rp2[0].name, read_pairs_1, read_pairs_2 - ) + )) assert all(matches) # Assert ALL the matches. 
:-] @@ -353,12 +355,12 @@ def test_constructor(): assert 0, ("ReadParser's constructor shouldn't accept a character for " "the number of threads") except TypeError as err: - print str(err) + print(str(err)) try: rparser = ReadParser("non-existent-file-name") assert 0, "ReadParser shouldn't accept a non-existant file name" except ValueError as err: - print str(err) + print(str(err)) def test_iternext(): @@ -369,7 +371,7 @@ def test_iternext(): read_pairs.append(read_1, read_2) assert 0, "Shouldn't be able to iterate over non FASTA file" except IOError as err: - print str(err) + print(str(err)) except ValueError as err: - print str(err) + print(str(err)) # vim: set ft=python ts=4 sts=4 sw=4 et tw=79: diff --git a/tests/test_sandbox_scripts.py b/tests/test_sandbox_scripts.py index c7fdb45995..d99925c4ff 100644 --- a/tests/test_sandbox_scripts.py +++ b/tests/test_sandbox_scripts.py @@ -1,3 +1,5 @@ +from __future__ import print_function +from __future__ import absolute_import # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2015. It is licensed under @@ -11,13 +13,14 @@ import os import os.path import shutil -from cStringIO import StringIO +from io import StringIO import traceback import nose +from nose.plugins.attrib import attr import glob import imp -import khmer_tst_utils as utils +from . 
import khmer_tst_utils as utils import khmer import screed @@ -54,7 +57,7 @@ def __call__(self): try: mod = imp.load_source('__zzz', self.filename) except: - print traceback.format_exc() + print(traceback.format_exc()) raise AssertionError("%s cannot be imported" % (self.filename,)) # @@ -73,7 +76,7 @@ def __call__(self): compile(open(self.filename).read(), self.filename, 'exec'), global_dict) except (ImportError, SyntaxError): - print traceback.format_exc() + print(traceback.format_exc()) raise AssertionError("%s cannot be exec'd" % (self.filename,)) except: pass # other failures are expected :) @@ -107,7 +110,7 @@ def test_sweep_reads(): mout = os.path.join(in_dir, 'test_multi.fa') oout = os.path.join(in_dir, 'test_orphaned.fa') - print os.listdir(in_dir) + print(os.listdir(in_dir)) assert os.path.exists(out1) assert os.path.exists(out2) @@ -118,10 +121,10 @@ def test_sweep_reads(): seqsm = set([r.name for r in screed.open(mout)]) seqso = set([r.name for r in screed.open(oout)]) - print seqs1 - print seqs2 - print seqsm - print seqso + print(seqs1) + print(seqs2) + print(seqsm) + print(seqso) assert seqs1 == set(['read1_p0\t0', 'read2_p0\t0']) assert seqs2 == set(['read3_p1\t1']) assert (seqsm == set(['read4_multi\t0\t1']) or @@ -157,19 +160,19 @@ def test_sweep_reads_fq(): assert os.path.exists(out2) assert os.path.exists(mout) assert os.path.exists(oout) - print open(out1).read() + print(open(out1).read()) - print os.listdir(in_dir) + print(os.listdir(in_dir)) seqs1 = set([r.name for r in screed.open(out1)]) seqs2 = set([r.name for r in screed.open(out2)]) seqsm = set([r.name for r in screed.open(mout)]) seqso = set([r.name for r in screed.open(oout)]) - print seqs1 - print seqs2 - print seqsm - print seqso + print(seqs1) + print(seqs2) + print(seqsm) + print(seqso) assert seqs1 == set(['read1_p0\t0', 'read2_p0\t0']) assert seqs2 == set(['read3_p1\t1']) assert (seqsm == set(['read4_multi\t0\t1']) or @@ -194,9 +197,9 @@ def test_sweep_reads_2(): 'test', 
'--label-by-seq', inref, infile] status, out, err = utils.runscript(script, args, wdir, sandbox=True) - for i in xrange(99): + for i in range(99): p = os.path.join(wdir, 'test_{i}.fa'.format(i=i)) - print p, err, out + print(p, err, out) assert os.path.exists(p) os.remove(p) assert os.path.exists(os.path.join(wdir, 'test.counts.csv')) @@ -214,9 +217,9 @@ def test_sweep_reads_3(): 'test', '--label-by-group', '10', infile, infile] status, out, err = utils.runscript(script, args, wdir, sandbox=True) - for i in xrange(10): + for i in range(10): p = os.path.join(wdir, 'test_{i}.fa'.format(i=i)) - print p, err, out + print(p, err, out) assert os.path.exists(p) os.remove(p) diff --git a/tests/test_script_arguments.py b/tests/test_script_arguments.py index 0d274664d2..3369da5555 100644 --- a/tests/test_script_arguments.py +++ b/tests/test_script_arguments.py @@ -7,10 +7,12 @@ """ Tests for various argument-handling code. """ +from __future__ import print_function, unicode_literals +from __future__ import absolute_import import sys -import cStringIO -import khmer_tst_utils as utils +import io +from . 
import khmer_tst_utils as utils import khmer.kfile @@ -18,25 +20,25 @@ def test_check_space(): fakelump_fa = utils.get_test_data('fakelump.fa') - save_stderr, sys.stderr = sys.stderr, cStringIO.StringIO() + save_stderr, sys.stderr = sys.stderr, io.StringIO() try: khmer.kfile.check_space( [fakelump_fa], force=False, _testhook_free_space=0) assert 0, "this should fail" except SystemExit as e: - print str(e) + print(str(e)) finally: sys.stderr = save_stderr def test_check_tablespace(): - save_stderr, sys.stderr = sys.stderr, cStringIO.StringIO() + save_stderr, sys.stderr = sys.stderr, io.StringIO() try: khmer.kfile.check_space_for_hashtable( 1e9, force=False, _testhook_free_space=0) assert 0, "this should fail" except SystemExit as e: - print str(e) + print(str(e)) finally: sys.stderr = save_stderr @@ -44,36 +46,36 @@ def test_check_tablespace(): def test_check_space_force(): fakelump_fa = utils.get_test_data('fakelump.fa') - save_stderr, sys.stderr = sys.stderr, cStringIO.StringIO() + save_stderr, sys.stderr = sys.stderr, io.StringIO() try: khmer.kfile.check_space( [fakelump_fa], force=True, _testhook_free_space=0) assert True, "this should pass" except SystemExit as e: - print str(e) + print(str(e)) finally: sys.stderr = save_stderr def test_check_tablespace_force(): - save_stderr, sys.stderr = sys.stderr, cStringIO.StringIO() + save_stderr, sys.stderr = sys.stderr, io.StringIO() try: khmer.kfile.check_space_for_hashtable( 1e9, force=True, _testhook_free_space=0) assert True, "this should pass" except SystemExit as e: - print str(e) + print(str(e)) finally: sys.stderr = save_stderr def test_invalid_file_warn(): - save_stderr, sys.stderr = sys.stderr, cStringIO.StringIO() + save_stderr, sys.stderr = sys.stderr, io.StringIO() try: khmer.kfile.check_valid_file_exists(["nonexistent", "nonexistent2"]) assert sys.stderr.getvalue().count("\n") == 2, \ "Should produce two warning lines" - except SystemExit, e: - print str(e) + except SystemExit as e: + print(str(e)) 
finally: sys.stderr = save_stderr diff --git a/tests/test_scripts.py b/tests/test_scripts.py index fe941e507e..dd435f107f 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -1,3 +1,6 @@ +from __future__ import print_function +from __future__ import absolute_import +from __future__ import unicode_literals # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under @@ -12,7 +15,7 @@ import os import stat import shutil -from cStringIO import StringIO +from io import StringIO import traceback from nose.plugins.attrib import attr import subprocess @@ -20,7 +23,7 @@ import bz2 import io -import khmer_tst_utils as utils +from . import khmer_tst_utils as utils import khmer import khmer.kfile import screed @@ -99,7 +102,7 @@ def test_load_into_counting_fail(): (status, out, err) = utils.runscript(script, args, fail_ok=True) assert status == 1, status - print err + print(err) assert "** ERROR: the graph structure is too small" in err @@ -804,8 +807,8 @@ def test_normalize_by_median_version(): continue break - print errlines - print err + print(errlines) + print(err) assert err.startswith('khmer ') @@ -863,8 +866,8 @@ def test_normalize_by_median_paired_fq(): script = scriptpath('normalize-by-median.py') args = ['-C', CUTOFF, '-p', '-k', '17', infile] _, out, err = utils.runscript(script, args, in_dir) - print out - print err + print(out) + print(err) outfile = infile + '.keep' assert os.path.exists(outfile), outfile @@ -930,7 +933,7 @@ def test_normalize_by_median_no_bigcount(): (status, out, err) = utils.runscript(script, args, in_dir) assert status == 0, (out, err) - print(out, err) + print((out, err)) assert os.path.exists(hashfile), hashfile kh = khmer.load_counting_hash(hashfile) @@ -1086,8 +1089,6 @@ def test_count_median_fq_csv(): names = set([line.split(',')[0] for line in data]) assert '895:1:37:17593:9954 1::FOO' in names, names -# - def test_load_graph(): script 
= scriptpath('load-graph.py') @@ -1358,14 +1359,14 @@ def _DEBUG_make_graph(infilename, min_hashsize=1e7, n_hashes=2, ksize=20, assert os.path.exists(tagset_file), tagset_file if do_partition: - print ">>>> DEBUG: Partitioning <<<" + print(">>>> DEBUG: Partitioning <<<") script = scriptpath('partition-graph.py') args = [outfile] if stop_big_traverse: args.insert(0, '--no-big-traverse') utils.runscript(script, args) - print ">>>> DEBUG: Merging Partitions <<<" + print(">>>> DEBUG: Merging Partitions <<<") script = scriptpath('merge-partitions.py') args = [outfile, '-k', str(ksize)] utils.runscript(script, args) @@ -1374,7 +1375,7 @@ def _DEBUG_make_graph(infilename, min_hashsize=1e7, n_hashes=2, ksize=20, assert os.path.exists(final_pmap_file) if annotate_partitions: - print ">>>> DEBUG: Annotating Partitions <<<" + print(">>>> DEBUG: Annotating Partitions <<<") script = scriptpath('annotate-partitions.py') args = ["-k", str(ksize), outfile, infilename] @@ -1553,7 +1554,7 @@ def test_annotate_partitions_2(): parts = [r.name.split('\t')[1] for r in screed.open(partfile)] parts = set(parts) - print parts + print(parts) assert len(parts) == 99, len(parts) @@ -1787,23 +1788,23 @@ def test_abundance_dist(): args = ['-z', htfile, infile, outfile] utils.runscript(script, args, in_dir) - fp = iter(open(outfile)) - line = fp.next().strip() - assert line == '1 96 96 0.98', line - line = fp.next().strip() - assert line == '1001 2 98 1.0', line + with open(outfile) as fp: + line = fp.readline().strip() + assert line == '1 96 96 0.98', line + line = fp.readline().strip() + assert line == '1001 2 98 1.0', line os.remove(outfile) args = ['-z', '--csv', htfile, infile, outfile] utils.runscript(script, args, in_dir) - fp = iter(open(outfile)) - line = fp.next().strip() - assert (line == 'abundance,count,cumulative,cumulative_fraction'), line - line = fp.next().strip() - assert line == '1,96,96,0.98', line - line = fp.next().strip() - assert line == '1001,2,98,1.0', line + with 
open(outfile) as fp: + line = fp.readline().strip() + assert (line == 'abundance,count,cumulative,cumulative_fraction'), line + line = fp.readline().strip() + assert line == '1,96,96,0.98', line + line = fp.readline().strip() + assert line == '1001,2,98,1.0', line def test_abundance_dist_nobigcount(): @@ -1819,11 +1820,11 @@ def test_abundance_dist_nobigcount(): args = ['-z', htfile, infile, outfile] utils.runscript(script, args, in_dir) - fp = iter(open(outfile)) - line = fp.next().strip() - assert line == '1 96 96 0.98', line - line = fp.next().strip() - assert line == '255 2 98 1.0', line + with open(outfile) as fp: + line = fp.readline().strip() + assert line == '1 96 96 0.98', line + line = fp.readline().strip() + assert line == '255 2 98 1.0', line def test_abundance_dist_single(): @@ -1840,11 +1841,11 @@ def test_abundance_dist_single(): assert 'Total number of unique k-mers: 98' in err, err - fp = iter(open(outfile)) - line = fp.next().strip() - assert line == '1 96 96 0.98', line - line = fp.next().strip() - assert line == '1001 2 98 1.0', line + with open(outfile) as fp: + line = fp.readline().strip() + assert line == '1 96 96 0.98', line + line = fp.readline().strip() + assert line == '1001 2 98 1.0', line def test_abundance_dist_threaded(): @@ -1861,11 +1862,11 @@ def test_abundance_dist_threaded(): assert 'Total number of unique k-mers: 98' in err, err - fp = iter(open(outfile)) - line = fp.next().strip() - assert line == '1 96 96 0.98', line - line = fp.next().strip() - assert line == '1001 2 98 1.0', line + with open(outfile) as fp: + line = fp.readline().strip() + assert line == '1 96 96 0.98', line + line = fp.readline().strip() + assert line == '1001 2 98 1.0', line def test_abundance_dist_single_csv(): @@ -1880,13 +1881,13 @@ def test_abundance_dist_single_csv(): outfile] (status, out, err) = utils.runscript(script, args, in_dir) - fp = iter(open(outfile)) - line = fp.next().strip() - assert (line == 
'abundance,count,cumulative,cumulative_fraction'), line - line = fp.next().strip() - assert line == '1,96,96,0.98', line - line = fp.next().strip() - assert line == '1001,2,98,1.0', line + with open(outfile) as fp: + line = fp.readline().strip() + assert (line == 'abundance,count,cumulative,cumulative_fraction'), line + line = fp.readline().strip() + assert line == '1,96,96,0.98', line + line = fp.readline().strip() + assert line == '1001,2,98,1.0', line def test_abundance_dist_single_nobigcount(): @@ -1900,11 +1901,11 @@ def test_abundance_dist_single_nobigcount(): args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '-b', infile, outfile] utils.runscript(script, args, in_dir) - fp = iter(open(outfile)) - line = fp.next().strip() - assert line == '1 96 96 0.98', line - line = fp.next().strip() - assert line == '255 2 98 1.0', line + with open(outfile) as fp: + line = fp.readline().strip() + assert line == '1 96 96 0.98', line + line = fp.readline().strip() + assert line == '255 2 98 1.0', line def test_abundance_dist_single_nosquash(): @@ -1918,11 +1919,11 @@ def test_abundance_dist_single_nosquash(): args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '-t', infile, outfile] utils.runscript(script, args, in_dir) - fp = iter(open(outfile)) - line = fp.next().strip() - assert line == '1 96 96 0.98', line - line = fp.next().strip() - assert line == '1001 2 98 1.0', line + with open(outfile) as fp: + line = fp.readline().strip() + assert line == '1 96 96 0.98', line + line = fp.readline().strip() + assert line == '1001 2 98 1.0', line def test_abundance_dist_single_savetable(): @@ -1938,11 +1939,11 @@ def test_abundance_dist_single_savetable(): tabfile, infile, outfile] utils.runscript(script, args, in_dir) - fp = iter(open(outfile)) - line = fp.next().strip() - assert line == '1 96 96 0.98', line - line = fp.next().strip() - assert line == '1001 2 98 1.0', line + with open(outfile) as fp: + line = fp.readline().strip() + assert line == '1 96 96 0.98', line + line = 
fp.readline().strip() + assert line == '1001 2 98 1.0', line def test_do_partition(): @@ -2523,10 +2524,24 @@ def test_sample_reads_randomly(): outfile = infile + '.subset' assert os.path.exists(outfile), outfile + if sys.version_info.major == 2: + answer = set(['850:2:1:2691:14602/1', '850:2:1:1762:5439/1', + '850:2:1:2399:20086/2', '850:2:1:2503:4494/2', + '850:2:1:2084:17145/1', '850:2:1:2273:13309/1', + '850:2:1:2263:11143/2', '850:2:1:1984:7162/2', + '850:2:1:2065:16816/1', '850:2:1:1792:15774/2']) + else: + answer = set(['850:2:1:1946:20852/1', '850:2:1:1251:16575/1', + '850:2:1:1625:9364/2', '850:2:1:3123:15968/1', + '850:2:1:1601:18498/2', '850:2:1:1267:6790/1', + '850:2:1:2562:16360/2', '850:2:1:1199:4197/1', + '850:2:1:1832:14607/1', '850:2:1:2401:4896/1']) + seqs = set([r.name for r in screed.open(outfile)]) - print list(sorted(seqs)) + print(list(sorted(seqs))) - assert seqs == set(['850:2:1:1859:11742/1', '850:2:1:1859:11742/2', + if sys.version_info.major == 2: + answer = set(['850:2:1:1859:11742/1', '850:2:1:1859:11742/2', '850:2:1:2131:17360/1', '850:2:1:2131:17360/2', '850:2:1:2416:7565/1', '850:2:1:2416:7565/2', '850:2:1:2490:13491/1', '850:2:1:2490:13491/2', @@ -2536,6 +2551,10 @@ def test_sample_reads_randomly(): '850:2:1:3206:13876/1', '850:2:1:3206:13876/2', '850:2:1:3631:20919/1', '850:2:1:3631:20919/2', '850:2:1:3655:15581/1', '850:2:1:3655:15581/2']) + else: + answer = set() + + assert seqs == answer def test_sample_reads_randomly_force_single(): @@ -2554,7 +2573,7 @@ def test_sample_reads_randomly_force_single(): assert os.path.exists(outfile), outfile seqs = set([r.name for r in screed.open(outfile)]) - print list(sorted(seqs)) + print(list(sorted(seqs))) assert seqs == set(['850:2:1:2399:20086/2', '850:2:1:2273:13309/1', '850:2:1:2065:16816/1', @@ -2582,20 +2601,29 @@ def test_sample_reads_randomly_fq(): outfile = infile + '.subset' assert os.path.exists(outfile), outfile + if sys.version_info.major == 2: + answer = 
set(['850:2:1:2399:20086/2', + '850:2:1:1762:5439 1::FOO', + '850:2:1:2065:16816/1', + '850:2:1:2263:11143/2', + '850:2:1:1792:15774/2', + '850:2:1:2691:14602/1', + '850:2:1:2503:4494 1::FOO', + '850:2:1:2084:17145/1', + '850:2:1:1984:7162 1::FOO', + '850:2:1:2273:13309 1::FOO']) + else: + answer = set(['850:2:1:1946:20852/1', '850:2:1:2401:4896 1::FOO', + '850:2:1:1251:16575 1::FOO', + '850:2:1:1199:4197 1::FOO', '850:2:1:1625:9364/2', + '850:2:1:1267:6790 1::FOO', + '850:2:1:2562:16360 1::FOO', '850:2:1:1601:18498/2', + '850:2:1:3123:15968 1::FOO', '850:2:1:1832:14607/1']) + seqs = set([r.name for r in screed.open(outfile, parse_description=False)]) - - print list(sorted(seqs)) - assert seqs == set(['850:2:1:2399:20086/2', - '850:2:1:1762:5439 1::FOO', - '850:2:1:2065:16816/1', - '850:2:1:2263:11143/2', - '850:2:1:1792:15774/2', - '850:2:1:2691:14602/1', - '850:2:1:2503:4494 1::FOO', - '850:2:1:2084:17145/1', - '850:2:1:1984:7162 1::FOO', - '850:2:1:2273:13309 1::FOO']) + print(list(sorted(seqs))) + assert seqs == answer def test_fastq_to_fasta(): @@ -2717,37 +2745,59 @@ def test_sample_reads_randomly_S(): assert os.path.exists(outfile), outfile seqs = set([r.name for r in screed.open(outfile, parse_description=True)]) - print list(sorted(seqs)) - - assert seqs == set(['895:1:1:1303:14389', '895:1:1:1347:3237', - '895:1:1:1295:6189', '895:1:1:1308:20421', - '895:1:1:1320:11648', '895:1:1:1352:5369', - '895:1:1:1318:10532', '895:1:1:1363:11839', - '895:1:1:1355:13535', '895:1:1:1349:15165']) + print(list(sorted(seqs))) + + print(seqs) + if sys.version_info.major == 2: + answer = set(['895:1:1:1303:14389', '895:1:1:1347:3237', + '895:1:1:1295:6189', '895:1:1:1308:20421', + '895:1:1:1320:11648', '895:1:1:1352:5369', + '895:1:1:1318:10532', '895:1:1:1363:11839', + '895:1:1:1355:13535', '895:1:1:1349:15165']) + else: + answer = set(['895:1:1:1338:15407', '895:1:1:1362:3983', + '895:1:1:1308:20421', '895:1:1:1276:16426', + '895:1:1:1349:13984', '895:1:1:1378:18986', + 
'895:1:1:1357:19736', '895:1:1:1290:11501', + '895:1:1:1376:16513', '895:1:1:1265:2265']) + assert seqs == answer outfile = infile + '.subset.1' assert os.path.exists(outfile), outfile seqs = set([r.name for r in screed.open(outfile, parse_description=True)]) - print list(sorted(seqs)) - - assert seqs == set(['895:1:1:1303:14389', '895:1:1:1373:4848', - '895:1:1:1357:19736', '895:1:1:1347:3237', - '895:1:1:1338:7557', '895:1:1:1388:11093', - '895:1:1:1296:1784', '895:1:1:1290:11501', - '895:1:1:1355:13535', '895:1:1:1303:6251']) - - outfile = infile + '.subset.2' - assert os.path.exists(outfile), outfile + print(list(sorted(seqs))) + + if sys.version_info.major == 2: + answer = set(['895:1:1:1303:14389', '895:1:1:1373:4848', + '895:1:1:1357:19736', '895:1:1:1347:3237', + '895:1:1:1338:7557', '895:1:1:1388:11093', + '895:1:1:1296:1784', '895:1:1:1290:11501', + '895:1:1:1355:13535', '895:1:1:1303:6251']) + else: + answer = set(['895:1:1:1307:4308', '895:1:1:1338:7557', + '895:1:1:1308:2539', '895:1:1:1383:3089', + '895:1:1:1330:9540', '895:1:1:1386:14753', + '895:1:1:1327:13028', '895:1:1:1340:19387', + '895:1:1:1287:13756', '895:1:1:1327:15301']) + assert seqs == answer seqs = set([r.name for r in screed.open(outfile, parse_description=True)]) - print list(sorted(seqs)) - - assert seqs == set(['895:1:1:1298:13380', '895:1:1:1348:18672', - '895:1:1:1309:4153', '895:1:1:1252:19493', - '895:1:1:1368:4434', '895:1:1:1348:1257', - '895:1:1:1383:3089', '895:1:1:1355:13535', - '895:1:1:1303:6251', '895:1:1:1349:15165']) + print(list(sorted(seqs))) + + if sys.version_info.major == 2: + answer = set(['895:1:1:1298:13380', '895:1:1:1348:18672', + '895:1:1:1309:4153', '895:1:1:1252:19493', + '895:1:1:1368:4434', '895:1:1:1348:1257', + '895:1:1:1383:3089', '895:1:1:1355:13535', + '895:1:1:1303:6251', '895:1:1:1349:15165']) + else: + answer = set(['895:1:1:1381:7062', '895:1:1:1373:13994', + '895:1:1:1351:14718', '895:1:1:1376:16513', + '895:1:1:1344:1968', '895:1:1:1348:1257', 
+ '895:1:1:1362:3983', '895:1:1:1363:9988', + '895:1:1:1273:17782', '895:1:1:1368:4434']) + assert seqs == answer def test_count_overlap_invalid_datafile(): @@ -2760,7 +2810,10 @@ def test_count_overlap_invalid_datafile(): args = ['--ksize', '20', '--n_tables', '2', '--min-tablesize', '10000000', htfile + '.pt', htfile + '.pt', outfile] (status, out, err) = utils.runscript(script, args, in_dir, fail_ok=True) - assert "IOError" in err + if sys.version_info.major == 2: + assert "IOError" in err + else: + assert "OSError" in err def test_count_overlap(): @@ -2869,9 +2922,9 @@ def execute_load_graph_streaming(filename): if status != 0: for line in out: - print out + print(out) for line in err: - print err + print(err) assert status == 0, status err.seek(0) err = err.read() @@ -2894,7 +2947,6 @@ def execute_load_graph_streaming(filename): assert x == (1, 0), x -@attr('known_failing') def test_screed_streaming_ufa(): # uncompressed fa o = execute_streaming_diginorm(utils.get_test_data('test-abund-read-2.fa')) @@ -2905,7 +2957,6 @@ def test_screed_streaming_ufa(): assert seqs[0].startswith('GGTTGACGGGGCTCAGGGGG') -@attr('known_failing') def test_screed_streaming_ufq(): # uncompressed fq o = execute_streaming_diginorm(utils.get_test_data('test-fastq-reads.fq')) @@ -2914,7 +2965,6 @@ def test_screed_streaming_ufq(): assert seqs[0].startswith('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT') -@attr('known_failing') def test_screed_streaming_bzipfq(): # bzip compressed fq o = execute_streaming_diginorm(utils.get_test_data('100-reads.fq.bz2')) @@ -2923,7 +2973,6 @@ def test_screed_streaming_bzipfq(): assert seqs[0].startswith('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'), seqs -@attr('known_failing') def test_screed_streaming_bzipfa(): # bzip compressed fa o = execute_streaming_diginorm( @@ -3246,15 +3295,15 @@ def test_trim_low_abund_trimtest(): for record in screed.open(outfile): if record.name == 'seqtrim/1': - print record.name, record.sequence + print(record.name, record.sequence) assert 
record.sequence == \ 'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCC' elif record.name == 'seqtrim/2': - print record.name, record.sequence + print(record.name, record.sequence) assert record.sequence == \ 'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGC' elif record.name == 'seqtrim2/1': - print record.name, record.sequence + print(record.name, record.sequence) assert record.sequence == \ 'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCA' @@ -3278,15 +3327,15 @@ def test_trim_low_abund_trimtest_after_load(): for record in screed.open(outfile): if record.name == 'seqtrim/1': - print record.name, record.sequence + print(record.name, record.sequence) assert record.sequence == \ 'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCC' elif record.name == 'seqtrim/2': - print record.name, record.sequence + print(record.name, record.sequence) assert record.sequence == \ 'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGC' elif record.name == 'seqtrim2/1': - print record.name, record.sequence + print(record.name, record.sequence) assert record.sequence == \ 'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCA' @@ -3309,15 +3358,15 @@ def test_trim_low_abund_trimtest_savetable(): for record in screed.open(outfile): if record.name == 'seqtrim/1': - print record.name, record.sequence + print(record.name, record.sequence) assert record.sequence == \ 'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCC' elif record.name == 'seqtrim/2': - print record.name, record.sequence + print(record.name, record.sequence) assert record.sequence == \ 'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGC' elif record.name == 'seqtrim2/1': - print record.name, record.sequence + print(record.name, record.sequence) assert record.sequence == \ 'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCA' diff --git a/tests/test_subset_graph.py b/tests/test_subset_graph.py index ffc2008f3d..3d7c312fa2 100644 --- a/tests/test_subset_graph.py +++ b/tests/test_subset_graph.py @@ -1,3 +1,5 @@ +from __future__ import print_function +from __future__ import 
absolute_import # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under @@ -8,8 +10,8 @@ import khmer import screed -import khmer_tst_utils as utils import os +from . import khmer_tst_utils as utils def teardown(): @@ -205,8 +207,7 @@ def test_save_load_merge(self): assert total_reads == 3, total_reads divvy = ht.divide_tags_into_subsets(1) - print divvy - assert len(divvy) is 3 + print(divvy) (a, b, c) = divvy outfile1 = utils.get_temp_filename('x.pmap') @@ -305,7 +306,7 @@ def test_save_load_merge_nexist(self): a = ht.load_subset_partitionmap('this does not exist') assert 0, "this should not succeed" except IOError as e: - print str(e) + print(str(e)) def test_save_merge_from_disk(self): ht = khmer.new_hashbits(20, 4 ** 4 + 1) @@ -315,8 +316,7 @@ def test_save_merge_from_disk(self): assert total_reads == 3, total_reads divvy = ht.divide_tags_into_subsets(1) - print divvy - assert len(divvy) is 3 + print(divvy) (a, b, c) = divvy outfile1 = utils.get_temp_filename('x.pmap') @@ -374,8 +374,7 @@ def test_save_merge_from_disk_file_not_exist(self): assert total_reads == 3, total_reads divvy = ht.divide_tags_into_subsets(1) - print divvy - assert len(divvy) is 3 + print(divvy) (a, b, c) = divvy outfile1 = utils.get_temp_filename('x.pmap') @@ -386,7 +385,7 @@ def test_save_merge_from_disk_file_not_exist(self): ht.merge_subset_from_disk(outfile1) assert 0, "this should fail" except IOError as e: - print str(e) + print(str(e)) def test_merge_from_disk_file_bad_type(self): ht = khmer.new_hashbits(20, 4 ** 4 + 1) @@ -396,7 +395,7 @@ def test_merge_from_disk_file_bad_type(self): ht.merge_subset_from_disk(infile) assert 0, "this should fail" except IOError as e: - print str(e) + print(str(e)) def test_merge_from_disk_file_version(self): ht = khmer.new_hashbits(20, 4 ** 4 + 1) @@ -406,7 +405,7 @@ def test_merge_from_disk_file_version(self): ht.merge_subset_from_disk(infile) assert 0, 
"this should fail" except IOError as e: - print str(e) + print(str(e)) def test_save_merge_from_disk_ksize(self): ht = khmer.new_hashbits(20, 4 ** 4 + 1) @@ -416,8 +415,7 @@ def test_save_merge_from_disk_ksize(self): assert total_reads == 3, total_reads divvy = ht.divide_tags_into_subsets(1) - print divvy - assert len(divvy) is 3 + print(divvy) (a, b, c) = divvy outfile1 = utils.get_temp_filename('x.pmap') @@ -430,7 +428,7 @@ def test_save_merge_from_disk_ksize(self): ht.merge_subset_from_disk(outfile1) assert 0, "this should fail" except IOError as e: - print str(e) + print(str(e)) def test_save_load_merge_on_graph(): @@ -599,14 +597,14 @@ def test_small_real_partitions(): def test_partition_on_abundance_1(): - print(a,) - print(b,) + print((a,)) + print((b,)) kh = khmer.new_counting_hash(20, 1e3, 4) for i in range(10): - print kh.consume_and_tag(a) + print(kh.consume_and_tag(a)) for i in range(10): - print kh.consume_and_tag(b) + print(kh.consume_and_tag(b)) # all paths in 'a' and 'b' p = kh.do_subset_partition_with_abundance(10, 50) @@ -617,10 +615,10 @@ def test_partition_on_abundance_1(): def test_partition_on_abundance_2(): kh = khmer.new_counting_hash(20, 1e3, 4) for i in range(10): - print kh.consume_and_tag(a) + print(kh.consume_and_tag(a)) for i in range(5): - print kh.consume_and_tag(b) + print(kh.consume_and_tag(b)) # all paths in 'a' p = kh.do_subset_partition_with_abundance(10, 50) @@ -631,10 +629,10 @@ def test_partition_on_abundance_2(): def test_partition_on_abundance_3(): kh = khmer.new_counting_hash(20, 1e4, 4) for i in range(10): - print kh.consume_and_tag(a) + print(kh.consume_and_tag(a)) for i in range(5): - print kh.consume_and_tag(b) + print(kh.consume_and_tag(b)) # this will get paths only in 'a' p = kh.do_subset_partition_with_abundance(10, 50) @@ -643,7 +641,7 @@ def test_partition_on_abundance_3(): p = kh.do_subset_partition_with_abundance(5, 10) x = p.count_partitions() - print x + print(x) assert x == (2, 2) # two partitions, two 
ignored tags diff --git a/tests/test_threaded_sequence_processor.py b/tests/test_threaded_sequence_processor.py index 5aac0f4170..fee1264110 100644 --- a/tests/test_threaded_sequence_processor.py +++ b/tests/test_threaded_sequence_processor.py @@ -1,9 +1,9 @@ import sys from khmer.thread_utils import ThreadedSequenceProcessor, SequenceGroup -from cStringIO import StringIO +from io import StringIO from screed.fasta import fasta_iter from screed.fastq import fastq_iter -import Queue +import queue from nose.plugins.attrib import attr @@ -111,7 +111,7 @@ def do_process(self): while not self.done or not inq.empty(): try: g = inq.get(True, 1) - except Queue.Empty: + except queue.Empty: continue assert len(g.seqlist) == 2 @@ -160,7 +160,7 @@ def do_process(self): while not self.done or not inq.empty(): try: g = inq.get(True, 1) - except Queue.Empty: + except queue.Empty: continue if len(g.seqlist) == 2: diff --git a/tests/test_version.py b/tests/test_version.py index bf274e4dcf..f8a8901daf 100644 --- a/tests/test_version.py +++ b/tests/test_version.py @@ -1,3 +1,4 @@ +from __future__ import print_function, unicode_literals # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2014-2015. 
It is licensed under @@ -12,8 +13,8 @@ def test_python_and_c_match(): # checks c++ compiler option version against versioneer version # (respectively) - print 'c++ version {0}:'.format(khmer.__version_cpp__()) - print 'versioneer (python) version: {0}'.format(khmer.__version__) + print('c++ version {0}:'.format(khmer.__version_cpp__())) + print('versioneer (python) version: {0}'.format(khmer.__version__)) assert khmer.__version_cpp__() == khmer.__version__ @@ -22,9 +23,9 @@ def test_python_and_c_match_base(): # it's a hash based on git commits which can get out-of-sync too easily cppver = '-'.join(khmer.__version_cpp__().split('-')[0:2]) pyver = '-'.join(khmer.__version__.split('-')[0:2]) - print 'c++ version {0}'.format(cppver) - print 'python version: {0}'.format(pyver) - print 'if you are seeing this, the version compiled into your cpp' - print 'objects and your versioneer stuff is out-of-sync.' - print 'try doing: make clean; make' + print('c++ version {0}'.format(cppver)) + print('python version: {0}'.format(pyver)) + print('if you are seeing this, the version compiled into your cpp') + print('objects and your versioneer stuff is out-of-sync.') + print('try doing: make clean; make') assert cppver == pyver diff --git a/versioneer.py b/versioneer.py index c00770fe4f..f6243095cc 100644 --- a/versioneer.py +++ b/versioneer.py @@ -280,6 +280,7 @@ domain. 
""" +from __future__ import print_function import errno import os From a7d9279865937cdb1d839fb78be5a798db9c89d6 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Wed, 22 Apr 2015 15:04:29 -0400 Subject: [PATCH 02/20] Use Record instead of _screed_record_dict; remove unused import --- tests/test_read_aligner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_read_aligner.py b/tests/test_read_aligner.py index 34df28d8b6..f53df5e427 100644 --- a/tests/test_read_aligner.py +++ b/tests/test_read_aligner.py @@ -1,10 +1,10 @@ -from __future__ import print_function -from builtins import range # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under # the three-clause BSD license; see LICENSE. Contact: ctb@msu.edu # +from __future__ import print_function + import khmer from nose.tools import assert_almost_equals From 398a3fe4362b70ab95f5ec55b9b0012be7f70d66 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Fri, 24 Apr 2015 10:00:52 -0400 Subject: [PATCH 03/20] PEP8 --- khmer/__init__.py | 6 ++++-- khmer/khmer_args.py | 6 ++++-- scripts/abundance-dist.py | 10 +++++----- tests/khmer_tst_utils.py | 6 ++++-- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/khmer/__init__.py b/khmer/__init__.py index 63587326d0..4ddbe7f2cf 100644 --- a/khmer/__init__.py +++ b/khmer/__init__.py @@ -182,8 +182,10 @@ def calc_expected_collisions(hashtable, force=False, max_false_pos=.2): if fp_all > max_false_pos: print("**", file=sys.stderr) - print("** ERROR: the graph structure is too small for ", file=sys.stderr) - print("this data set. Increase k-mer presence table ", file=sys.stderr) + print( + "** ERROR: the graph structure is too small for ", file=sys.stderr) + print( + "this data set. 
Increase k-mer presence table ", file=sys.stderr) print("size/num of tables.", file=sys.stderr) print("** Do not use these results!!", file=sys.stderr) print("**", file=sys.stderr) diff --git a/khmer/khmer_args.py b/khmer/khmer_args.py index 4182ad6a7b..436abd01f4 100644 --- a/khmer/khmer_args.py +++ b/khmer/khmer_args.py @@ -212,11 +212,13 @@ def info(scriptname, algorithm_list=None): for alg in algorithm_list: sys.stderr.write("|| * ") - algstr = _algorithms[alg].encode('utf-8', 'surrogateescape').decode('utf-8', 'replace') + algstr = _algorithms[alg].encode( + 'utf-8', 'surrogateescape').decode('utf-8', 'replace') try: sys.stderr.write(algstr) except UnicodeEncodeError: - sys.stderr.write(algstr.encode(sys.getfilesystemencoding(), 'replace')) + sys.stderr.write( + algstr.encode(sys.getfilesystemencoding(), 'replace')) sys.stderr.write("\n") sys.stderr.write("||\n|| Please see http://khmer.readthedocs.org/en/" diff --git a/scripts/abundance-dist.py b/scripts/abundance-dist.py index 055355140e..3113553d52 100755 --- a/scripts/abundance-dist.py +++ b/scripts/abundance-dist.py @@ -63,8 +63,8 @@ def main(): for infile in infiles: check_input_files(infile, args.force) - print ('hashtable from', args.input_counting_table_filename, - file=sys.stderr) + print('hashtable from', args.input_counting_table_filename, + file=sys.stderr) counting_hash = khmer.load_counting_hash( args.input_counting_table_filename) @@ -73,9 +73,9 @@ def main(): tracking = khmer._Hashbits( # pylint: disable=protected-access kmer_size, hashsizes) - print ('K:', kmer_size, file=sys.stderr) - print ('HT sizes:', hashsizes, file=sys.stderr) - print ('outputting to', args.output_histogram_filename, file=sys.stderr) + print('K:', kmer_size, file=sys.stderr) + print('HT sizes:', hashsizes, file=sys.stderr) + print('outputting to', args.output_histogram_filename, file=sys.stderr) if os.path.exists(args.output_histogram_filename): if not args.squash_output: diff --git a/tests/khmer_tst_utils.py 
b/tests/khmer_tst_utils.py index b3b2bafb13..0cbe36c271 100644 --- a/tests/khmer_tst_utils.py +++ b/tests/khmer_tst_utils.py @@ -154,8 +154,10 @@ def runscriptredirect(scriptname, args, stdinfilename, in_directory=None, os.chdir(in_directory) sysargs = 'cat ' + stdinfilename + ' | python ' + scriptfile + \ " " + args - out = open(os.path.join(in_directory, "out"), 'w+', encoding='utf-8') - err = open(os.path.join(in_directory, "err"), 'w+', encoding='utf-8') + out = open( + os.path.join(in_directory, "out"), 'w+', encoding='utf-8') + err = open( + os.path.join(in_directory, "err"), 'w+', encoding='utf-8') print('running:', scriptname, 'in:', in_directory) print('arguments', sysargs) status = subprocess.call(args=sysargs, stdout=out, stderr=err, From e30d2fbc80a50fbd791818da4fbbcb28dee2e07b Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Fri, 24 Apr 2015 11:51:38 -0400 Subject: [PATCH 04/20] PEP8 x2 --- scripts/extract-paired-reads.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/extract-paired-reads.py b/scripts/extract-paired-reads.py index 691a488b24..13cbde4f3a 100755 --- a/scripts/extract-paired-reads.py +++ b/scripts/extract-paired-reads.py @@ -100,8 +100,8 @@ def main(): ' %d pairs and %d singletons' % (n_pe * 2 + n_se, n_pe, n_se), file=sys.stderr) - print('wrote to: ' + outfile - + '.se' + ' and ' + outfile + '.pe', file=sys.stderr) + print('wrote to: ' + outfile + '.se' + ' and ' + outfile + '.pe', + file=sys.stderr) if __name__ == '__main__': From 640fe1f3198351eb11cfbbb866cfb1c3a1e3cc30 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Tue, 2 Jun 2015 15:30:54 -0400 Subject: [PATCH 05/20] Fix oversights during rebase --- khmer/_khmer.cc | 109 ++++++++----------------------- sandbox/sweep-reads.py | 2 +- scripts/sample-reads-randomly.py | 19 +++--- scripts/trim-low-abund.py | 2 +- setup.cfg | 2 +- tests/test_counting_hash.py | 4 +- tests/test_labelhash.py | 44 ++++++------- tests/test_scripts.py | 57 +++++++--------- 
8 files changed, 88 insertions(+), 151 deletions(-) diff --git a/khmer/_khmer.cc b/khmer/_khmer.cc index 61d556ed40..9d205632e8 100644 --- a/khmer/_khmer.cc +++ b/khmer/_khmer.cc @@ -670,12 +670,6 @@ static PyTypeObject khmer_PrePartitionInfo_Type = { /***********************************************************************/ -void free_subset_partition_info(void * p) -{ - SubsetPartition * subset_p = (SubsetPartition *) p; - delete subset_p; -} - typedef struct { PyObject_HEAD Hashtable * hashtable; @@ -863,9 +857,6 @@ hashtable_consume_fasta(khmer_KHashtable_Object * me, PyObject * args) unsigned int total_reads = 0; try { hashtable->consume_fasta(filename, total_reads, n_consumed); - } catch (_khmer_signal &e) { - PyErr_SetString(PyExc_IOError, e.get_message().c_str()); - return NULL; } catch (khmer_file_exception &e) { PyErr_SetString(PyExc_IOError, e.what()); return NULL; @@ -894,12 +885,7 @@ hashtable_consume_fasta_with_reads_parser(khmer_KHashtable_Object * me, unsigned long long n_consumed = 0; unsigned int total_reads = 0; Py_BEGIN_ALLOW_THREADS - try { - hashtable->consume_fasta(rparser, total_reads, n_consumed); - } catch (_khmer_signal &e) { - exc = e.get_message().c_str(); - exc_raised = true; - } + hashtable->consume_fasta(rparser, total_reads, n_consumed); Py_END_ALLOW_THREADS return Py_BuildValue("IK", total_reads, n_consumed); @@ -1054,13 +1040,9 @@ hashtable_consume_and_tag(khmer_KHashtable_Object * me, PyObject * args) // call the C++ function, and trap signals => Python unsigned long long n_consumed = 0; - try { - // @CTB needs to normalize - hashtable->consume_sequence_and_tag(seq, n_consumed); - } catch (_khmer_signal &e) { - PyErr_SetString(PyExc_ValueError, e.get_message().c_str()); - return NULL; - } + + // @CTB needs to normalize + hashtable->consume_sequence_and_tag(seq, n_consumed); return Py_BuildValue("K", n_consumed); } @@ -1165,12 +1147,7 @@ hashtable_consume_fasta_and_tag(khmer_KHashtable_Object * me, PyObject * args) unsigned long 
long n_consumed; unsigned int total_reads; - try { - hashtable->consume_fasta_and_tag(filename, total_reads, n_consumed); - } catch (_khmer_signal &e) { - PyErr_SetString(PyExc_IOError, e.get_message().c_str()); - return NULL; - } + hashtable->consume_fasta_and_tag(filename, total_reads, n_consumed); return Py_BuildValue("IK", total_reads, n_consumed); } @@ -1468,8 +1445,6 @@ hashtable_do_subset_partition(khmer_KHashtable_Object * me, PyObject * args) subset_p->do_partition(start_kmer, end_kmer, break_on_stop_tags, stop_big_traversals); Py_END_ALLOW_THREADS - } catch (_khmer_signal &e) { - return NULL; } catch (std::bad_alloc &e) { return PyErr_NoMemory(); } @@ -1561,8 +1536,6 @@ hashtable_consume_fasta_and_tag_with_reads_parser(khmer_KHashtable_Object * me, hashtable->consume_fasta_and_tag( rparser, total_reads, n_consumed ); - } catch (_khmer_signal &e) { - exc = e.get_message().c_str(); } catch (khmer::read_parsers::NoMoreReadsAvailable &e) { exc = e.what(); } @@ -1601,9 +1574,6 @@ hashtable_consume_fasta_and_tag_with_stoptags(khmer_KHashtable_Object * me, try { hashtable->consume_fasta_and_tag_with_stoptags(filename, total_reads, n_consumed); - } catch (_khmer_signal &e) { - PyErr_SetString(PyExc_IOError, e.get_message().c_str()); - return NULL; } catch (khmer_file_exception &e) { PyErr_SetString(PyExc_IOError, e.what()); return NULL; @@ -1632,9 +1602,6 @@ hashtable_consume_partitioned_fasta(khmer_KHashtable_Object * me, try { hashtable->consume_partitioned_fasta(filename, total_reads, n_consumed); - } catch (_khmer_signal &e) { - PyErr_SetString(PyExc_IOError, e.get_message().c_str()); - return NULL; } catch (khmer_file_exception &e) { PyErr_SetString(PyExc_IOError, e.what()); return NULL; @@ -1661,7 +1628,7 @@ hashtable_find_all_tags(khmer_KHashtable_Object * me, PyObject * args) return NULL; } - _pre_partition_info * ppi = NULL; + pre_partition_info * ppi = NULL; Py_BEGIN_ALLOW_THREADS @@ -1669,7 +1636,7 @@ hashtable_find_all_tags(khmer_KHashtable_Object * 
me, PyObject * args) kmer = _hash(kmer_s, hashtable->ksize(), kmer_f, kmer_r); try { - ppi = new _pre_partition_info(kmer); + ppi = new pre_partition_info(kmer); } catch (std::bad_alloc &e) { return PyErr_NoMemory(); } @@ -1693,16 +1660,6 @@ hashtable_assign_partition_id(khmer_KHashtable_Object * me, PyObject * args) { Hashtable * hashtable = me->hashtable; - PyObject * ppi_obj; - if (!PyArg_ParseTuple(args, "O", &ppi_obj)) { - return NULL; - } - - if (!PyCObject_Check(ppi_obj)) { - PyErr_SetString( PyExc_ValueError, "invalid pre_partition_info"); - return NULL; - } - khmer_PrePartitionInfo_Object * ppi_obj; if (!PyArg_ParseTuple(args, "O!", &khmer_PrePartitionInfo_Type, &ppi_obj)) { return NULL; @@ -1828,9 +1785,6 @@ hashtable_output_partitions(khmer_KHashtable_Object * me, PyObject * args) n_partitions = subset_p->output_partitioned_file(filename, output, output_unassigned); - } catch (_khmer_signal &e) { - PyErr_SetString(PyExc_IOError, e.get_message().c_str()); - return NULL; } catch (khmer_file_exception &e) { PyErr_SetString(PyExc_IOError, e.what()); return NULL; @@ -1858,18 +1812,11 @@ hashtable_find_unpart(khmer_KHashtable_Object * me, PyObject * args) bool stop_big_traversals = PyObject_IsTrue(stop_big_traversals_o); unsigned int n_singletons = 0; - try { - SubsetPartition * subset_p = hashtable->partition; - n_singletons = subset_p->find_unpart(filename, traverse, - stop_big_traversals); - } catch (_khmer_signal &e) { - return NULL; - } + SubsetPartition * subset_p = hashtable->partition; + n_singletons = subset_p->find_unpart(filename, traverse, + stop_big_traversals); return PyLong_FromLong(n_singletons); - - // Py_INCREF(Py_None); - // return Py_None; } static @@ -1885,11 +1832,7 @@ hashtable_filter_if_present(khmer_KHashtable_Object * me, PyObject * args) return NULL; } - try { - hashtable->filter_if_present(filename, output); - } catch (_khmer_signal &e) { - return NULL; - } + hashtable->filter_if_present(filename, output); Py_RETURN_NONE; } @@ 
-1999,7 +1942,6 @@ hashtable_subset_partition_size_distribution(khmer_KHashtable_Object * me, SubsetPartition * subset_p; subset_p = subset_obj->subset; - PyObject * subset_obj = NULL; PartitionCountDistribution d; @@ -2089,14 +2031,14 @@ hashtable_save_subset_partitionmap(khmer_KHashtable_Object * me, PyObject * args) { const char * filename = NULL; - PyObject * subset_obj = NULL; + khmer_KSubsetPartition_Object * subset_obj = NULL; - if (!PyArg_ParseTuple(args, "Os", &subset_obj, &filename)) { + if (!PyArg_ParseTuple(args, "O!s", &khmer_KSubsetPartition_Type, &subset_obj, &filename)) { return NULL; } SubsetPartition * subset_p; - subset_p = (SubsetPartition *) PyCObject_AsVoidPtr(subset_obj); + subset_p = subset_obj->subset; Py_BEGIN_ALLOW_THREADS @@ -2218,6 +2160,8 @@ hashtable_set_partition_id(khmer_KHashtable_Object * me, PyObject * args) return NULL; } + hashtable->partition->set_partition_id(kmer, p); + Py_RETURN_NONE; } @@ -2824,8 +2768,13 @@ count_get_raw_tables(khmer_KCountingHash_Object * self, PyObject * args) PyObject * raw_tables = PyList_New(sizes.size()); for (unsigned int i=0; iconsume_fasta_overlap(filename, curve, *ht2, total_reads, n_consumed); - } catch (_khmer_signal &e) { - PyErr_SetString(PyExc_IOError, e.get_message().c_str()); - return NULL; } catch (InvalidStreamHandle &e) { PyErr_SetString(PyExc_IOError, e.what()); return NULL; @@ -4118,19 +4062,18 @@ hashtable_repartition_largest_partition(khmer_KHashtable_Object * me, PyObject * args) { Hashtable * hashtable = me->hashtable; - khmer_KCountingHash_Object * counting_o = NULL; PyObject * subset_o = NULL; + SubsetPartition * subset_p; unsigned int distance, threshold, frequency; if (!PyArg_ParseTuple(args, "OO!III", &subset_o, &khmer_KCountingHash_Type, - &counting_o, &distance, &threshold, &frequency)) { + &counting_o, &distance, &threshold, &frequency)) { return NULL; } - SubsetPartition * subset_p; if (subset_o != Py_None) { - subset_p = (SubsetPartition *) 
PyCObject_AsVoidPtr(subset_o); + subset_p = ((khmer_KSubsetPartition_Object *) subset_o)->subset; } else { subset_p = hashtable->partition; } diff --git a/sandbox/sweep-reads.py b/sandbox/sweep-reads.py index ffdcc90cac..7b8b4386cf 100755 --- a/sandbox/sweep-reads.py +++ b/sandbox/sweep-reads.py @@ -302,7 +302,7 @@ def main(): print('done consuming input sequence. \ added {t} tags and {l} \ labels...'.format(t=ht.graph.n_tags(), - l=ht.n_labels()) + l=ht.n_labels())) label_dict = defaultdict(int) label_number_dist = [] diff --git a/scripts/sample-reads-randomly.py b/scripts/sample-reads-randomly.py index 3f8ce1692b..4eafcee21c 100755 --- a/scripts/sample-reads-randomly.py +++ b/scripts/sample-reads-randomly.py @@ -131,15 +131,16 @@ def main(): # read through all the sequences and load/resample the reservoir for filename in args.filenames: print('opening', filename, 'for reading', file=sys.stderr) - with screed.open(filename, parse_description=False) as screed_iter: - for count, (_, ispair, rcrd1, rcrd2) in enumerate(broken_paired_reader( - screed_iter, - force_single=args.force_single)): - if count % 10000 == 0: - print('...', total, 'reads scanned', file=sys.stderr) - if count >= args.max_reads: - print('reached upper limit of %d reads' % - args.max_reads, '(see -M); exiting', file=sys.stderr) + screed_iter = screed.open(filename, parse_description=False) + + for count, (_, ispair, rcrd1, rcrd2) in enumerate(broken_paired_reader( + screed_iter, + force_single=args.force_single)): + if count % 10000 == 0: + print('...', count, 'reads scanned', file=sys.stderr) + if count >= args.max_reads: + print('reached upper limit of %d reads' % + args.max_reads, '(see -M); exiting', file=sys.stderr) break # collect first N reads diff --git a/scripts/trim-low-abund.py b/scripts/trim-low-abund.py index dddd091321..85304ca483 100755 --- a/scripts/trim-low-abund.py +++ b/scripts/trim-low-abund.py @@ -317,7 +317,7 @@ def main(): percent_reads_hicov), file=sys.stderr) print('skipped 
%d reads/%d bases because of low coverage' % - (skipped_n, skipped_bp) + (skipped_n, skipped_bp), file=sys.stderr) fp_rate = \ diff --git a/setup.cfg b/setup.cfg index d5e5068ebc..5b5b4a2ac7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [nosetests] verbosity = 2 -stop = TRUE +stop = FALSE attr = !known_failing,!jenkins,!linux #processes = -1 # breaks xunit output diff --git a/tests/test_counting_hash.py b/tests/test_counting_hash.py index 4b2d269f53..145d65691a 100644 --- a/tests/test_counting_hash.py +++ b/tests/test_counting_hash.py @@ -1385,7 +1385,7 @@ def test_abund_dist_gz_bigcount(): # check if abundance is > 255 # if ok gzipped bigcount was loaded correctly for _, i in enumerate(abundances): - print _, i + print(_, i) if _ > 255 and i > 0: flag = True break @@ -1396,6 +1396,6 @@ def test_counting_load_bigcount(): count_table = khmer.new_counting_hash(10, 1e5, 4) count_table.set_use_bigcount(True) for i in range(500): - print i, count_table.count('ATATATATAT') + print(i, count_table.count('ATATATATAT')) count = count_table.get('ATATATATAT') assert count == 500 diff --git a/tests/test_labelhash.py b/tests/test_labelhash.py index 5791c34d19..b7e8813527 100644 --- a/tests/test_labelhash.py +++ b/tests/test_labelhash.py @@ -40,7 +40,7 @@ def test_error_create(): lh = _LabelHash(None) assert 0, "This should fail." 
except ValueError as err: - print str(err) + print(str(err)) def test_n_labels(): @@ -107,7 +107,7 @@ def test_get_label_dict_save_load_wrong_ksize(): lb.load_labels_and_tags(savepath) assert 0, "this should not succeed - different ksize" except IOError as err: - print str(err) + print(str(err)) assert "Incorrect k-mer size 19" in str(err) @@ -137,7 +137,7 @@ def test_save_load_corrupted(): lb.load_labels_and_tags(truncated) assert 0, "this should not succeed -- truncated file len %d" % (i,) except IOError as err: - print 'expected failure for', i, ': ', str(err) + print('expected failure for', i, ': ', str(err)) def test_save_fail_readonly(): @@ -156,7 +156,7 @@ def test_save_fail_readonly(): lb_pre.save_labels_and_tags(savepath) assert 0, "this should fail: read-only file" except IOError as err: - print str(err) + print(str(err)) def test_get_tag_labels(): @@ -205,8 +205,8 @@ def test_consume_partitioned_fasta_and_tag_with_labels(): for record in screed.open(filename): seq = record.sequence labels.update(lb.sweep_label_neighborhood(seq, 0, False, False)) - # print lb.n_labels() - # print labels + # print(lb.n_labels()) + # print(labels) assert len(labels) == 1 assert labels.pop() == 2 assert lb.n_labels() == 1 @@ -308,11 +308,11 @@ def test_counting_label_tag_correctness(): labels = lb.sweep_label_neighborhood( 'ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG' 'CTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT') - print lb.sweep_tag_neighborhood( + print(lb.sweep_tag_neighborhood( 'TTCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG' - 'CTAGGCTAGGTGTGCTCTGCTAGAGCTAGGCTAGGTGT') - print labels - print len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG') - 19 + 'CTAGGCTAGGTGTGCTCTGCTAGAGCTAGGCTAGGTGT')) + print(labels) + print(len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG') - 19) assert len(labels) == 2 assert 0 in labels assert 1 in labels @@ -321,7 +321,7 @@ def test_counting_label_tag_correctness(): labels = lb.sweep_label_neighborhood( 
'GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAG' 'ATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA') - print labels + print(labels) assert len(labels) == 3 assert 0 in labels assert 1 in labels @@ -332,7 +332,7 @@ def test_counting_label_tag_correctness(): 'TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAG' 'CTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCA' 'ACAACACATACA') - print labels + print(labels) assert len(labels) == 2 assert 1 in labels assert 2 in labels @@ -340,7 +340,7 @@ def test_counting_label_tag_correctness(): # read D labels = lb.sweep_label_neighborhood( 'TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC') - print labels + print(labels) assert len(labels) == 1 assert 3 in labels @@ -365,11 +365,11 @@ def test_label_tag_correctness_save_load(): labels = lb.sweep_label_neighborhood( 'ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG' 'CTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT') - print lb.sweep_tag_neighborhood( + print(lb.sweep_tag_neighborhood( 'TTCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG' - 'CTAGGCTAGGTGTGCTCTGCTAGAGCTAGGCTAGGTGT') - print labels - print len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG') - 19 + 'CTAGGCTAGGTGTGCTCTGCTAGAGCTAGGCTAGGTGT')) + print(labels) + print(len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG') - 19) assert len(labels) == 2 assert 0 in labels assert 1 in labels @@ -378,7 +378,7 @@ def test_label_tag_correctness_save_load(): labels = lb.sweep_label_neighborhood( 'GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAG' 'ATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA') - print labels + print(labels) assert len(labels) == 3 assert 0 in labels assert 1 in labels @@ -389,7 +389,7 @@ def test_label_tag_correctness_save_load(): 'TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAG' 'CTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCA' 'ACAACACATACA') - print labels + 
print(labels) assert len(labels) == 2 assert 1 in labels assert 2 in labels @@ -397,7 +397,7 @@ def test_label_tag_correctness_save_load(): # read D labels = lb.sweep_label_neighborhood( 'TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC') - print labels + print(labels) assert len(labels) == 1 assert 3 in labels @@ -411,7 +411,7 @@ def test_load_wrong_filetype(): lb.load_labels_and_tags(filename) assert 0, "this should not succeed - bad file type" except IOError as err: - print str(err) + print(str(err)) assert "Incorrect file format type" in str(err) @@ -424,5 +424,5 @@ def test_load_wrong_fileversion(): lb.load_labels_and_tags(filename) assert 0, "this should not succeed - bad file type" except IOError as err: - print str(err) + print(str(err)) assert "Incorrect file format version" in str(err) diff --git a/tests/test_scripts.py b/tests/test_scripts.py index dd435f107f..4f66df23f8 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -2524,33 +2524,20 @@ def test_sample_reads_randomly(): outfile = infile + '.subset' assert os.path.exists(outfile), outfile - if sys.version_info.major == 2: - answer = set(['850:2:1:2691:14602/1', '850:2:1:1762:5439/1', - '850:2:1:2399:20086/2', '850:2:1:2503:4494/2', - '850:2:1:2084:17145/1', '850:2:1:2273:13309/1', - '850:2:1:2263:11143/2', '850:2:1:1984:7162/2', - '850:2:1:2065:16816/1', '850:2:1:1792:15774/2']) - else: - answer = set(['850:2:1:1946:20852/1', '850:2:1:1251:16575/1', - '850:2:1:1625:9364/2', '850:2:1:3123:15968/1', - '850:2:1:1601:18498/2', '850:2:1:1267:6790/1', - '850:2:1:2562:16360/2', '850:2:1:1199:4197/1', - '850:2:1:1832:14607/1', '850:2:1:2401:4896/1']) - seqs = set([r.name for r in screed.open(outfile)]) print(list(sorted(seqs))) if sys.version_info.major == 2: answer = set(['850:2:1:1859:11742/1', '850:2:1:1859:11742/2', - '850:2:1:2131:17360/1', '850:2:1:2131:17360/2', - '850:2:1:2416:7565/1', '850:2:1:2416:7565/2', - '850:2:1:2490:13491/1', '850:2:1:2490:13491/2', - '850:2:1:2962:3999/1', 
'850:2:1:2962:3999/2', - '850:2:1:3096:20321/1', '850:2:1:3096:20321/2', - '850:2:1:3164:6414/1', '850:2:1:3164:6414/2', - '850:2:1:3206:13876/1', '850:2:1:3206:13876/2', - '850:2:1:3631:20919/1', '850:2:1:3631:20919/2', - '850:2:1:3655:15581/1', '850:2:1:3655:15581/2']) + '850:2:1:2131:17360/1', '850:2:1:2131:17360/2', + '850:2:1:2416:7565/1', '850:2:1:2416:7565/2', + '850:2:1:2490:13491/1', '850:2:1:2490:13491/2', + '850:2:1:2962:3999/1', '850:2:1:2962:3999/2', + '850:2:1:3096:20321/1', '850:2:1:3096:20321/2', + '850:2:1:3164:6414/1', '850:2:1:3164:6414/2', + '850:2:1:3206:13876/1', '850:2:1:3206:13876/2', + '850:2:1:3631:20919/1', '850:2:1:3631:20919/2', + '850:2:1:3655:15581/1', '850:2:1:3655:15581/2']) else: answer = set() @@ -2574,16 +2561,22 @@ def test_sample_reads_randomly_force_single(): seqs = set([r.name for r in screed.open(outfile)]) print(list(sorted(seqs))) - assert seqs == set(['850:2:1:2399:20086/2', - '850:2:1:2273:13309/1', - '850:2:1:2065:16816/1', - '850:2:1:1984:7162/2', - '850:2:1:2691:14602/1', - '850:2:1:1762:5439/1', - '850:2:1:2503:4494/2', - '850:2:1:2263:11143/2', - '850:2:1:1792:15774/2', - '850:2:1:2084:17145/1']) + + if sys.version_info.major == 2: + answer = set(['850:2:1:2399:20086/2', + '850:2:1:2273:13309/1', + '850:2:1:2065:16816/1', + '850:2:1:1984:7162/2', + '850:2:1:2691:14602/1', + '850:2:1:1762:5439/1', + '850:2:1:2503:4494/2', + '850:2:1:2263:11143/2', + '850:2:1:1792:15774/2', + '850:2:1:2084:17145/1']) + else: + answer = set() + + assert seqs == answer def test_sample_reads_randomly_fq(): From 28081a4ee629b29422b61a198cfd9b093ad2ea74 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Tue, 2 Jun 2015 17:12:29 -0400 Subject: [PATCH 06/20] Remove __init__ from labelhash and hashbits --- khmer/_khmer.cc | 31 ++----------------------------- 1 file changed, 2 insertions(+), 29 deletions(-) diff --git a/khmer/_khmer.cc b/khmer/_khmer.cc index 9d205632e8..4ec543a9e0 100644 --- a/khmer/_khmer.cc +++ b/khmer/_khmer.cc @@ -714,8 
+714,6 @@ typedef struct { static void khmer_hashbits_dealloc(khmer_KHashbits_Object * obj); static PyObject* khmer_hashbits_new(PyTypeObject * type, PyObject * args, PyObject * kwds); -static int khmer_hashbits_init(khmer_KHashbits_Object * self, PyObject * args, - PyObject * kwds); static PyTypeObject khmer_KHashbits_Type CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KHashbits_Object") @@ -755,7 +753,7 @@ CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KHashbits_Object") 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ - (initproc)khmer_hashbits_init, /* tp_init */ + 0, /* tp_init */ 0, /* tp_alloc */ khmer_hashbits_new, /* tp_new */ }; @@ -3355,16 +3353,6 @@ static PyObject* khmer_hashbits_new(PyTypeObject * type, PyObject * args, return (PyObject *) self; } -// there are no attributes that we need at this time, so we'll just return 0 -static int khmer_hashbits_init(khmer_KHashbits_Object * self, PyObject * args, - PyObject * kwds) -{ - if (khmer_KHashtable_Type.tp_init((PyObject *)self, args, kwds) < 0) { - return -1; - } - return 0; -} - #define is_hashbits_obj(v) (Py_TYPE(v) == &khmer_KHashbits_Type) //////////////////////////////////////////////////////////////////////////// @@ -3596,8 +3584,6 @@ typedef struct { LabelHash * labelhash; } khmer_KLabelHash_Object; -static int khmer_labelhash_init(khmer_KLabelHash_Object * self, PyObject *args, - PyObject *kwds); static PyObject * khmer_labelhash_new(PyTypeObject * type, PyObject *args, PyObject *kwds); @@ -3654,18 +3640,6 @@ static PyObject * khmer_labelhash_new(PyTypeObject *type, PyObject *args, return (PyObject *) self; } -static int khmer_labelhash_init(khmer_KLabelHash_Object * self, PyObject *args, - PyObject *kwds) -{ - if (khmer_KHashbits_Type.tp_init((PyObject *)self, args, kwds) < 0) { - return -1; - } - //std::cout << "testing my pointer ref to hashbits: " << self->khashbits.hashbits->n_tags() << std::endl; - //std::cout << "hashbits: " << self->khashbits.hashbits << std::endl; - //std::cout 
<< "labelhash: " << self->labelhash << std::endl; - return 0; -} - static PyObject * labelhash_get_label_dict(khmer_KLabelHash_Object * me, PyObject * args) @@ -4031,7 +4005,7 @@ static PyTypeObject khmer_KLabelHash_Type = { 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ - (initproc)khmer_labelhash_init, /* tp_init */ + 0, /* tp_init */ 0, /* tp_alloc */ khmer_labelhash_new, /* tp_new */ }; @@ -4800,7 +4774,6 @@ MOD_INIT(_khmer) } khmer_KHashbits_Type.tp_base = &khmer_KHashtable_Type; - khmer_KHashbits_Type.tp_new = khmer_hashbits_new; khmer_KHashbits_Type.tp_methods = khmer_hashbits_methods; if (PyType_Ready(&khmer_KHashbits_Type) < 0) { return MOD_ERROR_VAL; From 0ecd60f09eeaf735303514c2cf0b9c17f8223515 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Tue, 2 Jun 2015 17:12:51 -0400 Subject: [PATCH 07/20] Return unicodes instead of bytes; proper return on khmer_init --- khmer/_khmer.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/khmer/_khmer.cc b/khmer/_khmer.cc index 4ec543a9e0..899ac5a44b 100644 --- a/khmer/_khmer.cc +++ b/khmer/_khmer.cc @@ -2631,7 +2631,7 @@ count_trim_on_abundance(khmer_KCountingHash_Object * me, PyObject * args) Py_END_ALLOW_THREADS; - PyObject * trim_seq = PyBytes_FromStringAndSize(seq, trim_at); + PyObject * trim_seq = PyUnicode_FromStringAndSize(seq, trim_at); if (trim_seq == NULL) { return NULL; } @@ -2663,7 +2663,7 @@ count_trim_below_abundance(khmer_KCountingHash_Object * me, PyObject * args) Py_END_ALLOW_THREADS; - PyObject * trim_seq = PyBytes_FromStringAndSize(seq, trim_at); + PyObject * trim_seq = PyUnicode_FromStringAndSize(seq, trim_at); if (trim_seq == NULL) { return NULL; } @@ -4756,7 +4756,7 @@ MOD_INIT(_khmer) using namespace python; if (PyType_Ready(&khmer_KHashtable_Type) < 0) { - return; + return MOD_ERROR_VAL; } khmer_KCountingHash_Type.tp_base = &khmer_KHashtable_Type; From f03d2ae87e0049f2423a23a52ee17702c2007024 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Tue, 2 Jun 
2015 17:13:15 -0400 Subject: [PATCH 08/20] Fix print, izip and xrange --- oxli/build_graph.py | 32 ++++++++++++++++++-------------- oxli/functions.py | 2 +- scripts/normalize-by-median.py | 3 --- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/oxli/build_graph.py b/oxli/build_graph.py index 372f1a1ebc..3a38c140a4 100644 --- a/oxli/build_graph.py +++ b/oxli/build_graph.py @@ -14,6 +14,8 @@ Use '-h' for parameter help. """ +from __future__ import print_function, absolute_import, unicode_literals + import sys import khmer @@ -52,28 +54,28 @@ def main(args): check_space_for_hashtable( (float(args.n_tables * args.min_tablesize) / 8.), args.force) - print >>sys.stderr, 'Saving k-mer presence table to %s' % base - print >>sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames) + print('Saving k-mer presence table to %s' % base, file=sys.stderr) + print('Loading kmers from sequences in %s' % repr(filenames), file=sys.stderr) if args.no_build_tagset: - print >>sys.stderr, 'We WILL NOT build the tagset.' + print('We WILL NOT build the tagset.', file=sys.stderr) else: - print >>sys.stderr, 'We WILL build the tagset', \ - ' (for partitioning/traversal).' 
+ print('We WILL build the tagset (for partitioning/traversal).', + file=sys.stderr) - print >>sys.stderr, 'making k-mer presence table' + print('making k-mer presence table', file=sys.stderr) htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables) functions.build_graph(filenames, htable, args.threads, not args.no_build_tagset) - print >> sys.stderr, 'Total number of unique k-mers: {0}'.format( - htable.n_unique_kmers()) + print('Total number of unique k-mers: {0}'.format(htable.n_unique_kmers()), + file=sys.stderr) - print >>sys.stderr, 'saving k-mer presence table in', base + '.pt' + print('saving k-mer presence table in', base + '.pt', file=sys.stderr) htable.save(base + '.pt') if not args.no_build_tagset: - print >>sys.stderr, 'saving tagset in', base + '.tagset' + print('saving tagset in', base + '.tagset', file=sys.stderr) htable.save_tagset(base + '.tagset') info_fp = open(base + '.info', 'w') @@ -83,12 +85,14 @@ def main(args): khmer.calc_expected_collisions(htable, args.force, max_false_pos=.15) # 0.18 is ACTUAL MAX. Do not change. 
- print >>sys.stderr, 'false positive rate estimated to be %1.3f' % fp_rate - print >>info_fp, '\nfalse positive rate estimated to be %1.3f' % fp_rate + print('false positive rate estimated to be %1.3f' % fp_rate, + file=sys.stderr) + print('\nfalse positive rate estimated to be %1.3f' % fp_rate, + file=info_fp) - print >> sys.stderr, 'wrote to', base + '.info and', base + '.pt' + print('wrote to', base + '.info and', base + '.pt', file=sys.stderr) if not args.no_build_tagset: - print >> sys.stderr, 'and ' + base + '.tagset' + print('and ' + base + '.tagset', file=sys.stderr) sys.exit(0) diff --git a/oxli/functions.py b/oxli/functions.py index a00d2fed98..9b20d30e56 100644 --- a/oxli/functions.py +++ b/oxli/functions.py @@ -22,7 +22,7 @@ def build_graph(ifilenames, graph, num_threads=1, tags=False): rparser = khmer.ReadParser(ifile) threads = [] - for _ in xrange(num_threads): + for _ in range(num_threads): cur_thread = threading.Thread(target=eat, args=(rparser,)) threads.append(cur_thread) cur_thread.start() diff --git a/scripts/normalize-by-median.py b/scripts/normalize-by-median.py index 87623221a6..e59801bb07 100755 --- a/scripts/normalize-by-median.py +++ b/scripts/normalize-by-median.py @@ -19,14 +19,11 @@ """ from __future__ import print_function -from __future__ import print_function - import sys import screed import os import khmer import textwrap -from itertools import izip from contextlib import contextmanager from khmer.khmer_args import (build_counting_args, add_loadhash_args, From 53c27b16298a5190c34c3400f3fd18162f8a3c19 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Tue, 2 Jun 2015 17:13:31 -0400 Subject: [PATCH 09/20] Fix tests using random numbers --- tests/test_scripts.py | 150 ++++++++++++++++++++++++------------------ 1 file changed, 87 insertions(+), 63 deletions(-) diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 4f66df23f8..f23d9d3e3a 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -2528,18 +2528,27 @@ def 
test_sample_reads_randomly(): print(list(sorted(seqs))) if sys.version_info.major == 2: - answer = set(['850:2:1:1859:11742/1', '850:2:1:1859:11742/2', - '850:2:1:2131:17360/1', '850:2:1:2131:17360/2', - '850:2:1:2416:7565/1', '850:2:1:2416:7565/2', - '850:2:1:2490:13491/1', '850:2:1:2490:13491/2', - '850:2:1:2962:3999/1', '850:2:1:2962:3999/2', - '850:2:1:3096:20321/1', '850:2:1:3096:20321/2', - '850:2:1:3164:6414/1', '850:2:1:3164:6414/2', - '850:2:1:3206:13876/1', '850:2:1:3206:13876/2', - '850:2:1:3631:20919/1', '850:2:1:3631:20919/2', - '850:2:1:3655:15581/1', '850:2:1:3655:15581/2']) + answer = {'850:2:1:1859:11742/1', '850:2:1:1859:11742/2', + '850:2:1:2131:17360/1', '850:2:1:2131:17360/2', + '850:2:1:2416:7565/1', '850:2:1:2416:7565/2', + '850:2:1:2490:13491/1', '850:2:1:2490:13491/2', + '850:2:1:2962:3999/1', '850:2:1:2962:3999/2', + '850:2:1:3096:20321/1', '850:2:1:3096:20321/2', + '850:2:1:3164:6414/1', '850:2:1:3164:6414/2', + '850:2:1:3206:13876/1', '850:2:1:3206:13876/2', + '850:2:1:3631:20919/1', '850:2:1:3631:20919/2', + '850:2:1:3655:15581/1', '850:2:1:3655:15581/2'} else: - answer = set() + answer = {'850:2:1:1257:3404/1', '850:2:1:1257:3404/2', + '850:2:1:1362:19357/1', '850:2:1:1362:19357/2', + '850:2:1:1396:5659/1', '850:2:1:1396:5659/2', + '850:2:1:2063:11124/1', '850:2:1:2063:11124/2', + '850:2:1:2121:12070/1', '850:2:1:2121:12070/2', + '850:2:1:2528:15779/1', '850:2:1:2528:15779/2', + '850:2:1:2581:12886/1', '850:2:1:2581:12886/2', + '850:2:1:2864:8505/1', '850:2:1:2864:8505/2', + '850:2:1:3000:2015/1', '850:2:1:3000:2015/2', + '850:2:1:3302:5025/1', '850:2:1:3302:5025/2'} assert seqs == answer @@ -2563,18 +2572,27 @@ def test_sample_reads_randomly_force_single(): print(list(sorted(seqs))) if sys.version_info.major == 2: - answer = set(['850:2:1:2399:20086/2', - '850:2:1:2273:13309/1', - '850:2:1:2065:16816/1', - '850:2:1:1984:7162/2', - '850:2:1:2691:14602/1', - '850:2:1:1762:5439/1', - '850:2:1:2503:4494/2', - '850:2:1:2263:11143/2', - 
'850:2:1:1792:15774/2', - '850:2:1:2084:17145/1']) + answer = {'850:2:1:2399:20086/2', + '850:2:1:2273:13309/1', + '850:2:1:2065:16816/1', + '850:2:1:1984:7162/2', + '850:2:1:2691:14602/1', + '850:2:1:1762:5439/1', + '850:2:1:2503:4494/2', + '850:2:1:2263:11143/2', + '850:2:1:1792:15774/2', + '850:2:1:2084:17145/1'} else: - answer = set() + answer = {'850:2:1:1199:4197/1', + '850:2:1:1251:16575/2', + '850:2:1:1267:6790/2', + '850:2:1:1601:4443/1', + '850:2:1:1625:19325/1', + '850:2:1:1832:14607/2', + '850:2:1:1946:20852/2', + '850:2:1:2401:4896/2', + '850:2:1:2562:1308/1', + '850:2:1:3123:15968/2'} assert seqs == answer @@ -2595,23 +2613,27 @@ def test_sample_reads_randomly_fq(): assert os.path.exists(outfile), outfile if sys.version_info.major == 2: - answer = set(['850:2:1:2399:20086/2', - '850:2:1:1762:5439 1::FOO', - '850:2:1:2065:16816/1', - '850:2:1:2263:11143/2', - '850:2:1:1792:15774/2', - '850:2:1:2691:14602/1', - '850:2:1:2503:4494 1::FOO', - '850:2:1:2084:17145/1', - '850:2:1:1984:7162 1::FOO', - '850:2:1:2273:13309 1::FOO']) + answer = {'850:2:1:2399:20086/2', + '850:2:1:1762:5439 1::FOO', + '850:2:1:2065:16816/1', + '850:2:1:2263:11143/2', + '850:2:1:1792:15774/2', + '850:2:1:2691:14602/1', + '850:2:1:2503:4494 1::FOO', + '850:2:1:2084:17145/1', + '850:2:1:1984:7162 1::FOO', + '850:2:1:2273:13309 1::FOO'} else: - answer = set(['850:2:1:1946:20852/1', '850:2:1:2401:4896 1::FOO', - '850:2:1:1251:16575 1::FOO', - '850:2:1:1199:4197 1::FOO', '850:2:1:1625:9364/2', - '850:2:1:1267:6790 1::FOO', - '850:2:1:2562:16360 1::FOO', '850:2:1:1601:18498/2', - '850:2:1:3123:15968 1::FOO', '850:2:1:1832:14607/1']) + answer = {'850:2:1:1199:4197 1::FOO', + '850:2:1:1251:16575/2', + '850:2:1:1267:6790/2', + '850:2:1:1601:4443 1::FOO', + '850:2:1:1625:1932 1::FOO1', + '850:2:1:1832:14607 1::FOO', + '850:2:1:1946:20852 1::FOO', + '850:2:1:2401:4896/2', + '850:2:1:2562:1308/1', + '850:2:1:3123:15968/2'} seqs = set([r.name for r in screed.open(outfile, 
parse_description=False)]) @@ -2742,17 +2764,18 @@ def test_sample_reads_randomly_S(): print(seqs) if sys.version_info.major == 2: - answer = set(['895:1:1:1303:14389', '895:1:1:1347:3237', - '895:1:1:1295:6189', '895:1:1:1308:20421', - '895:1:1:1320:11648', '895:1:1:1352:5369', - '895:1:1:1318:10532', '895:1:1:1363:11839', - '895:1:1:1355:13535', '895:1:1:1349:15165']) + answer = {'895:1:1:1303:14389', '895:1:1:1347:3237', + '895:1:1:1295:6189', '895:1:1:1308:20421', + '895:1:1:1320:11648', '895:1:1:1352:5369', + '895:1:1:1318:10532', '895:1:1:1363:11839', + '895:1:1:1355:13535', '895:1:1:1349:15165'} else: - answer = set(['895:1:1:1338:15407', '895:1:1:1362:3983', - '895:1:1:1308:20421', '895:1:1:1276:16426', - '895:1:1:1349:13984', '895:1:1:1378:18986', - '895:1:1:1357:19736', '895:1:1:1290:11501', - '895:1:1:1376:16513', '895:1:1:1265:2265']) + answer = {'895:1:1:1290:11501', '895:1:1:1303:14389', + '895:1:1:1307:4308', '895:1:1:1308:2539', + '895:1:1:1331:1766', '895:1:1:1333:2512', + '895:1:1:1347:3237', '895:1:1:1363:11839', + '895:1:1:1378:18986', '895:1:1:1383:3089'} + assert seqs == answer outfile = infile + '.subset.1' @@ -2768,28 +2791,29 @@ def test_sample_reads_randomly_S(): '895:1:1:1296:1784', '895:1:1:1290:11501', '895:1:1:1355:13535', '895:1:1:1303:6251']) else: - answer = set(['895:1:1:1307:4308', '895:1:1:1338:7557', - '895:1:1:1308:2539', '895:1:1:1383:3089', - '895:1:1:1330:9540', '895:1:1:1386:14753', - '895:1:1:1327:13028', '895:1:1:1340:19387', - '895:1:1:1287:13756', '895:1:1:1327:15301']) + answer = {'895:1:1:1255:18861', '895:1:1:1276:16426', + '895:1:1:1303:6251', '895:1:1:1308:20421', + '895:1:1:1314:10430', '895:1:1:1351:14718', + '895:1:1:1355:13535', '895:1:1:1358:4953', + '895:1:1:1362:3983', '895:1:1:1363:9988'} assert seqs == answer seqs = set([r.name for r in screed.open(outfile, parse_description=True)]) print(list(sorted(seqs))) if sys.version_info.major == 2: - answer = set(['895:1:1:1298:13380', '895:1:1:1348:18672', - 
'895:1:1:1309:4153', '895:1:1:1252:19493', - '895:1:1:1368:4434', '895:1:1:1348:1257', - '895:1:1:1383:3089', '895:1:1:1355:13535', - '895:1:1:1303:6251', '895:1:1:1349:15165']) + answer = {'895:1:1:1298:13380', '895:1:1:1348:18672', + '895:1:1:1309:4153', '895:1:1:1252:19493', + '895:1:1:1368:4434', '895:1:1:1348:1257', + '895:1:1:1383:3089', '895:1:1:1355:13535', + '895:1:1:1303:6251', '895:1:1:1349:15165'} else: - answer = set(['895:1:1:1381:7062', '895:1:1:1373:13994', - '895:1:1:1351:14718', '895:1:1:1376:16513', - '895:1:1:1344:1968', '895:1:1:1348:1257', - '895:1:1:1362:3983', '895:1:1:1363:9988', - '895:1:1:1273:17782', '895:1:1:1368:4434']) + answer = {'895:1:1:1265:2265', '895:1:1:1287:13756', + '895:1:1:1296:1784', '895:1:1:1307:4308', + '895:1:1:1344:1968', '895:1:1:1362:3983', + '895:1:1:1365:12527', '895:1:1:1383:14728', + '895:1:1:1383:3089', '895:1:1:1386:14753'} + assert seqs == answer From 43041c11976b4e825729350db2459a1d99ea1ff7 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Tue, 2 Jun 2015 17:17:38 -0400 Subject: [PATCH 10/20] Add line removed by mistake; use screed master --- Makefile | 2 +- tests/test_subset_graph.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e70fcb5c3b..5bdf61d987 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,7 @@ install-dep: install-dependencies install-dependencies: pip install --upgrade $(DEVPKGS) - pip install git+https://github.com/ged-lab/screed.git@py3 + pip install git+https://github.com/ged-lab/screed.git ## sharedobj : build khmer shared object file sharedobj: khmer/_khmermodule.so diff --git a/tests/test_subset_graph.py b/tests/test_subset_graph.py index 3d7c312fa2..f4c49a62a9 100644 --- a/tests/test_subset_graph.py +++ b/tests/test_subset_graph.py @@ -208,6 +208,7 @@ def test_save_load_merge(self): divvy = ht.divide_tags_into_subsets(1) print(divvy) + assert len(divvy) == 3 (a, b, c) = divvy outfile1 = utils.get_temp_filename('x.pmap') From 
97985ca21b8dbc83cf1915814e143d83f164d5de Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Tue, 2 Jun 2015 17:22:09 -0400 Subject: [PATCH 11/20] PEP8 --- oxli/build_graph.py | 3 ++- scripts/sample-reads-randomly.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/oxli/build_graph.py b/oxli/build_graph.py index 3a38c140a4..92075ada38 100644 --- a/oxli/build_graph.py +++ b/oxli/build_graph.py @@ -55,7 +55,8 @@ def main(args): (float(args.n_tables * args.min_tablesize) / 8.), args.force) print('Saving k-mer presence table to %s' % base, file=sys.stderr) - print('Loading kmers from sequences in %s' % repr(filenames), file=sys.stderr) + print('Loading kmers from sequences in %s' % + repr(filenames), file=sys.stderr) if args.no_build_tagset: print('We WILL NOT build the tagset.', file=sys.stderr) else: diff --git a/scripts/sample-reads-randomly.py b/scripts/sample-reads-randomly.py index 4eafcee21c..35091b5404 100755 --- a/scripts/sample-reads-randomly.py +++ b/scripts/sample-reads-randomly.py @@ -140,7 +140,7 @@ def main(): print('...', count, 'reads scanned', file=sys.stderr) if count >= args.max_reads: print('reached upper limit of %d reads' % - args.max_reads, '(see -M); exiting', file=sys.stderr) + args.max_reads, '(see -M); exiting', file=sys.stderr) break # collect first N reads From e631ad5ef95c213bd5113602fee68b2310fcaeff Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Tue, 2 Jun 2015 17:40:57 -0400 Subject: [PATCH 12/20] Remove duplicate ReadParser; proper init for HLL and ReadAligner --- khmer/_khmer.cc | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/khmer/_khmer.cc b/khmer/_khmer.cc index 899ac5a44b..3cc8e55fed 100644 --- a/khmer/_khmer.cc +++ b/khmer/_khmer.cc @@ -2093,7 +2093,7 @@ hashtable_load_subset_partitionmap(khmer_KHashtable_Object * me, } else { khmer_KSubsetPartition_Object * subset_obj = (khmer_KSubsetPartition_Object *)\ PyObject_New(khmer_KSubsetPartition_Object, 
&khmer_KSubsetPartition_Type); - subset_obj->subset = subset_p; + subset_obj->subset = subset_p; return (PyObject*) subset_obj; } } @@ -4826,9 +4826,6 @@ MOD_INIT(_khmer) (PyObject *)&khmer_KCountingHash_Type ) < 0) { return MOD_ERROR_VAL; } - if (PyModule_AddObject( m, "ReadParser", (PyObject *)&khmer_ReadParser_Type ) < 0) { - return MOD_ERROR_VAL; - } Py_INCREF(&khmer_KHashbits_Type); if (PyModule_AddObject(m, "Hashbits", (PyObject *)&khmer_KHashbits_Type) < 0) { @@ -4842,9 +4839,16 @@ MOD_INIT(_khmer) } Py_INCREF(&khmer_KHLLCounter_Type); - PyModule_AddObject(m, "HLLCounter", (PyObject *)&khmer_KHLLCounter_Type); + if (PyModule_AddObject(m, "HLLCounter", + (PyObject *)&khmer_KHLLCounter_Type) < 0) { + return MOD_ERROR_VAL; + } + Py_INCREF(&khmer_ReadAlignerType); - PyModule_AddObject(m, "ReadAligner", (PyObject *)&khmer_ReadAlignerType); + if (PyModule_AddObject(m, "ReadAligner", + (PyObject *)&khmer_ReadAlignerType) < 0) { + return MOD_ERROR_VAL; + } return MOD_SUCCESS_VAL(m); } From ff110a19ae51d65f13d4871a02a15f11d7afb63e Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Tue, 2 Jun 2015 17:57:17 -0400 Subject: [PATCH 13/20] Replace python2 for python in all executable scripts --- oxli/build_graph.py | 2 +- sandbox/abundance-hist-by-position.py | 2 +- sandbox/assembly-diff-2.py | 2 +- sandbox/assembly-diff.py | 2 +- sandbox/assemstats3.py | 2 +- sandbox/bloom-count-intersection.py | 2 +- sandbox/bloom-count.py | 2 +- sandbox/build-sparse-graph.py | 2 +- sandbox/calc-best-assembly.py | 2 +- sandbox/calc-error-profile.py | 2 +- sandbox/calc-median-distribution.py | 2 +- sandbox/collect-reads.py | 2 +- sandbox/collect-variants.py | 2 +- sandbox/correct-errors.py | 2 +- sandbox/extract-single-partition.py | 2 +- sandbox/fasta-to-abundance-hist.py | 2 +- sandbox/filter-below-abund.py | 2 +- sandbox/filter-median-and-pct.py | 2 +- sandbox/filter-median.py | 2 +- sandbox/find-high-abund-kmers.py | 2 +- sandbox/graph-size.py | 2 +- 
sandbox/hi-lo-abundance-by-position.py | 2 +- sandbox/make-coverage.py | 2 +- sandbox/multi-rename.py | 2 +- sandbox/normalize-by-median-pct.py | 2 +- sandbox/print-stoptags.py | 2 +- sandbox/print-tagset.py | 2 +- sandbox/renumber-partitions.py | 2 +- sandbox/saturate-by-median.py | 2 +- sandbox/shuffle-reverse-rotary.py | 2 +- sandbox/slice-reads-by-coverage.py | 2 +- sandbox/split-fasta.py | 2 +- sandbox/split-sequences-by-length.py | 2 +- sandbox/stoptag-abundance-hist.py | 2 +- sandbox/stoptags-by-position.py | 2 +- sandbox/strip-partition.py | 2 +- sandbox/subset-report.py | 2 +- sandbox/sweep-files.py | 2 +- sandbox/sweep-out-reads-with-contigs.py | 2 +- sandbox/sweep-reads.py | 2 +- sandbox/sweep-reads2.py | 2 +- sandbox/sweep-reads3.py | 2 +- sandbox/unique-kmers.py | 2 +- sandbox/write-trimmomatic.py | 2 +- scripts/abundance-dist-single.py | 2 +- scripts/abundance-dist.py | 2 +- scripts/annotate-partitions.py | 2 +- scripts/count-median.py | 2 +- scripts/count-overlap.py | 2 +- scripts/do-partition.py | 2 +- scripts/extract-long-sequences.py | 2 +- scripts/extract-paired-reads.py | 2 +- scripts/extract-partitions.py | 2 +- scripts/fastq-to-fasta.py | 2 +- scripts/filter-abund-single.py | 2 +- scripts/filter-abund.py | 2 +- scripts/filter-stoptags.py | 2 +- scripts/find-knots.py | 2 +- scripts/interleave-reads.py | 2 +- scripts/load-graph.py | 2 +- scripts/load-into-counting.py | 2 +- scripts/make-initial-stoptags.py | 2 +- scripts/merge-partitions.py | 2 +- scripts/normalize-by-median.py | 2 +- scripts/partition-graph.py | 2 +- scripts/readstats.py | 2 +- scripts/sample-reads-randomly.py | 2 +- scripts/split-paired-reads.py | 2 +- scripts/trim-low-abund.py | 2 +- 69 files changed, 69 insertions(+), 69 deletions(-) diff --git a/oxli/build_graph.py b/oxli/build_graph.py index 92075ada38..012a54f65c 100644 --- a/oxli/build_graph.py +++ b/oxli/build_graph.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! 
/usr/bin/env python # # This file is part of khmer, http://github.com/ged-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/abundance-hist-by-position.py b/sandbox/abundance-hist-by-position.py index 9281eec1e1..5a2125f636 100755 --- a/sandbox/abundance-hist-by-position.py +++ b/sandbox/abundance-hist-by-position.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/assembly-diff-2.py b/sandbox/assembly-diff-2.py index 96e59173db..883675b54e 100755 --- a/sandbox/assembly-diff-2.py +++ b/sandbox/assembly-diff-2.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/assembly-diff.py b/sandbox/assembly-diff.py index 2271c45fcc..003d54af65 100755 --- a/sandbox/assembly-diff.py +++ b/sandbox/assembly-diff.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/assemstats3.py b/sandbox/assemstats3.py index f40152ffdb..3b7ae42d48 100755 --- a/sandbox/assemstats3.py +++ b/sandbox/assemstats3.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/bloom-count-intersection.py b/sandbox/bloom-count-intersection.py index 28c7a92731..c50d823e66 100755 --- a/sandbox/bloom-count-intersection.py +++ b/sandbox/bloom-count-intersection.py @@ -1,5 +1,5 @@ from __future__ import print_function -#! 
/usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/bloom-count.py b/sandbox/bloom-count.py index dad8db2cea..3fed152daf 100755 --- a/sandbox/bloom-count.py +++ b/sandbox/bloom-count.py @@ -1,5 +1,5 @@ from __future__ import print_function -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/build-sparse-graph.py b/sandbox/build-sparse-graph.py index b2e06d63d9..28f995f638 100755 --- a/sandbox/build-sparse-graph.py +++ b/sandbox/build-sparse-graph.py @@ -1,5 +1,5 @@ from __future__ import print_function -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2013-2015. It is licensed under diff --git a/sandbox/calc-best-assembly.py b/sandbox/calc-best-assembly.py index cd3de8faa3..931a24dc6a 100755 --- a/sandbox/calc-best-assembly.py +++ b/sandbox/calc-best-assembly.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/calc-error-profile.py b/sandbox/calc-error-profile.py index d77cf58e50..00cd013fdd 100755 --- a/sandbox/calc-error-profile.py +++ b/sandbox/calc-error-profile.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This script is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. 
It is licensed under diff --git a/sandbox/calc-median-distribution.py b/sandbox/calc-median-distribution.py index 5be50928e8..a67a99c994 100755 --- a/sandbox/calc-median-distribution.py +++ b/sandbox/calc-median-distribution.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/collect-reads.py b/sandbox/collect-reads.py index 8d68593947..512359497e 100755 --- a/sandbox/collect-reads.py +++ b/sandbox/collect-reads.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2014-2015. It is licensed under diff --git a/sandbox/collect-variants.py b/sandbox/collect-variants.py index dd48941f61..bb81dc58e1 100755 --- a/sandbox/collect-variants.py +++ b/sandbox/collect-variants.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2013-2015. It is licensed under diff --git a/sandbox/correct-errors.py b/sandbox/correct-errors.py index 798f6d0d3b..b82e172ac2 100755 --- a/sandbox/correct-errors.py +++ b/sandbox/correct-errors.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/extract-single-partition.py b/sandbox/extract-single-partition.py index 6f6c9295f1..ccc0f28629 100755 --- a/sandbox/extract-single-partition.py +++ b/sandbox/extract-single-partition.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. 
It is licensed under diff --git a/sandbox/fasta-to-abundance-hist.py b/sandbox/fasta-to-abundance-hist.py index 258c787077..132f748522 100755 --- a/sandbox/fasta-to-abundance-hist.py +++ b/sandbox/fasta-to-abundance-hist.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/filter-below-abund.py b/sandbox/filter-below-abund.py index 207ddf4001..e46ddeefd3 100755 --- a/sandbox/filter-below-abund.py +++ b/sandbox/filter-below-abund.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/filter-median-and-pct.py b/sandbox/filter-median-and-pct.py index 4cd80b35da..698aff5a79 100755 --- a/sandbox/filter-median-and-pct.py +++ b/sandbox/filter-median-and-pct.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/filter-median.py b/sandbox/filter-median.py index 6f9cf707da..a417c3deb5 100755 --- a/sandbox/filter-median.py +++ b/sandbox/filter-median.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/find-high-abund-kmers.py b/sandbox/find-high-abund-kmers.py index 0c7046dd88..beb3ac9a4e 100755 --- a/sandbox/find-high-abund-kmers.py +++ b/sandbox/find-high-abund-kmers.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! 
/usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/graph-size.py b/sandbox/graph-size.py index 1bf45d1726..cc0d9fb1b3 100755 --- a/sandbox/graph-size.py +++ b/sandbox/graph-size.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/hi-lo-abundance-by-position.py b/sandbox/hi-lo-abundance-by-position.py index 53b22b195b..5c9e353aa0 100755 --- a/sandbox/hi-lo-abundance-by-position.py +++ b/sandbox/hi-lo-abundance-by-position.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/make-coverage.py b/sandbox/make-coverage.py index 508a5f2f1a..fb4ce399e7 100755 --- a/sandbox/make-coverage.py +++ b/sandbox/make-coverage.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/multi-rename.py b/sandbox/multi-rename.py index 62917aa226..f9c3a3d834 100755 --- a/sandbox/multi-rename.py +++ b/sandbox/multi-rename.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/normalize-by-median-pct.py b/sandbox/normalize-by-median-pct.py index 6ffccf8138..1888047625 100755 --- a/sandbox/normalize-by-median-pct.py +++ b/sandbox/normalize-by-median-pct.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! 
/usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/print-stoptags.py b/sandbox/print-stoptags.py index 2653413398..f7633b0db5 100755 --- a/sandbox/print-stoptags.py +++ b/sandbox/print-stoptags.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/print-tagset.py b/sandbox/print-tagset.py index dc2a971e46..8a093071e4 100755 --- a/sandbox/print-tagset.py +++ b/sandbox/print-tagset.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/renumber-partitions.py b/sandbox/renumber-partitions.py index 87c020c3df..92d3134f18 100755 --- a/sandbox/renumber-partitions.py +++ b/sandbox/renumber-partitions.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/saturate-by-median.py b/sandbox/saturate-by-median.py index 18eb619754..422024db23 100755 --- a/sandbox/saturate-by-median.py +++ b/sandbox/saturate-by-median.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/shuffle-reverse-rotary.py b/sandbox/shuffle-reverse-rotary.py index a63e510c4c..70d4d923a9 100755 --- a/sandbox/shuffle-reverse-rotary.py +++ b/sandbox/shuffle-reverse-rotary.py @@ -1,5 +1,5 @@ from __future__ import print_function -#! 
/usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/slice-reads-by-coverage.py b/sandbox/slice-reads-by-coverage.py index d8dc3176fa..2093aa6708 100755 --- a/sandbox/slice-reads-by-coverage.py +++ b/sandbox/slice-reads-by-coverage.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2014-2015. It is licensed under # the three-clause BSD license; see LICENSE. diff --git a/sandbox/split-fasta.py b/sandbox/split-fasta.py index 3deb0956fb..bf395dab7e 100755 --- a/sandbox/split-fasta.py +++ b/sandbox/split-fasta.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/split-sequences-by-length.py b/sandbox/split-sequences-by-length.py index 1be79df129..8e5aeb3d39 100755 --- a/sandbox/split-sequences-by-length.py +++ b/sandbox/split-sequences-by-length.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/stoptag-abundance-hist.py b/sandbox/stoptag-abundance-hist.py index b5fc2dcc2b..616a9f4d4a 100755 --- a/sandbox/stoptag-abundance-hist.py +++ b/sandbox/stoptag-abundance-hist.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. 
It is licensed under diff --git a/sandbox/stoptags-by-position.py b/sandbox/stoptags-by-position.py index e1130785e4..5431261e2f 100755 --- a/sandbox/stoptags-by-position.py +++ b/sandbox/stoptags-by-position.py @@ -1,5 +1,5 @@ from __future__ import print_function -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/strip-partition.py b/sandbox/strip-partition.py index e98ae79074..ab972ab1b5 100755 --- a/sandbox/strip-partition.py +++ b/sandbox/strip-partition.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/subset-report.py b/sandbox/subset-report.py index 47eab6118d..bea112420c 100755 --- a/sandbox/subset-report.py +++ b/sandbox/subset-report.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/sweep-files.py b/sandbox/sweep-files.py index c3da019118..426bebe52b 100755 --- a/sandbox/sweep-files.py +++ b/sandbox/sweep-files.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/sweep-out-reads-with-contigs.py b/sandbox/sweep-out-reads-with-contigs.py index 7df9aceed8..450057f5bd 100755 --- a/sandbox/sweep-out-reads-with-contigs.py +++ b/sandbox/sweep-out-reads-with-contigs.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! 
/usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/sweep-reads.py b/sandbox/sweep-reads.py index 7b8b4386cf..706fa726a5 100755 --- a/sandbox/sweep-reads.py +++ b/sandbox/sweep-reads.py @@ -1,5 +1,5 @@ from __future__ import print_function, unicode_literals -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/sweep-reads2.py b/sandbox/sweep-reads2.py index 4a749af075..6ad2af592d 100755 --- a/sandbox/sweep-reads2.py +++ b/sandbox/sweep-reads2.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/sweep-reads3.py b/sandbox/sweep-reads3.py index d52ffe4039..276c16a083 100755 --- a/sandbox/sweep-reads3.py +++ b/sandbox/sweep-reads3.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/unique-kmers.py b/sandbox/unique-kmers.py index e14949e64a..12de9b4c93 100755 --- a/sandbox/unique-kmers.py +++ b/sandbox/unique-kmers.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/sandbox/write-trimmomatic.py b/sandbox/write-trimmomatic.py index 4d711ff87f..5ab61e0d5c 100755 --- a/sandbox/write-trimmomatic.py +++ b/sandbox/write-trimmomatic.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! 
/usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/scripts/abundance-dist-single.py b/scripts/abundance-dist-single.py index 3625e7faed..5bc901b56b 100755 --- a/scripts/abundance-dist-single.py +++ b/scripts/abundance-dist-single.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2010-2015. It is licensed under diff --git a/scripts/abundance-dist.py b/scripts/abundance-dist.py index 3113553d52..9178c07d52 100755 --- a/scripts/abundance-dist.py +++ b/scripts/abundance-dist.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2010-2015. It is licensed under diff --git a/scripts/annotate-partitions.py b/scripts/annotate-partitions.py index ab1a0db93c..8a95a273b2 100755 --- a/scripts/annotate-partitions.py +++ b/scripts/annotate-partitions.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/scripts/count-median.py b/scripts/count-median.py index 5d9be9af6f..19e6473c9e 100755 --- a/scripts/count-median.py +++ b/scripts/count-median.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/scripts/count-overlap.py b/scripts/count-overlap.py index 3c9968c33d..9d4cec8dd5 100755 --- a/scripts/count-overlap.py +++ b/scripts/count-overlap.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! 
/usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2012-2015-2015. It is licensed under diff --git a/scripts/do-partition.py b/scripts/do-partition.py index eefb43f4da..e71f95faea 100755 --- a/scripts/do-partition.py +++ b/scripts/do-partition.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/scripts/extract-long-sequences.py b/scripts/extract-long-sequences.py index cf5f01e503..7155070d8e 100755 --- a/scripts/extract-long-sequences.py +++ b/scripts/extract-long-sequences.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/scripts/extract-paired-reads.py b/scripts/extract-paired-reads.py index 13cbde4f3a..61da6a73c5 100755 --- a/scripts/extract-paired-reads.py +++ b/scripts/extract-paired-reads.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This script is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/scripts/extract-partitions.py b/scripts/extract-partitions.py index dad81fda37..777db38991 100755 --- a/scripts/extract-partitions.py +++ b/scripts/extract-partitions.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/scripts/fastq-to-fasta.py b/scripts/fastq-to-fasta.py index 8bc13a0986..ef21cda312 100755 --- a/scripts/fastq-to-fasta.py +++ b/scripts/fastq-to-fasta.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! 
/usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/scripts/filter-abund-single.py b/scripts/filter-abund-single.py index 1daad7fb20..b4d2215e9d 100755 --- a/scripts/filter-abund-single.py +++ b/scripts/filter-abund-single.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/scripts/filter-abund.py b/scripts/filter-abund.py index 9354bfa7e7..e994c60f54 100755 --- a/scripts/filter-abund.py +++ b/scripts/filter-abund.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/scripts/filter-stoptags.py b/scripts/filter-stoptags.py index 3ef752ec34..fe2fb6d001 100755 --- a/scripts/filter-stoptags.py +++ b/scripts/filter-stoptags.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/scripts/find-knots.py b/scripts/find-knots.py index 031c8440b0..b37fd687a6 100755 --- a/scripts/find-knots.py +++ b/scripts/find-knots.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/scripts/interleave-reads.py b/scripts/interleave-reads.py index 4c97ec77e5..94d5776950 100755 --- a/scripts/interleave-reads.py +++ b/scripts/interleave-reads.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! 
/usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/scripts/load-graph.py b/scripts/load-graph.py index 9e34103f7f..999403e097 100755 --- a/scripts/load-graph.py +++ b/scripts/load-graph.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/scripts/load-into-counting.py b/scripts/load-into-counting.py index 6c2afc626c..833901aa28 100755 --- a/scripts/load-into-counting.py +++ b/scripts/load-into-counting.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/scripts/make-initial-stoptags.py b/scripts/make-initial-stoptags.py index 5dd9939c64..2f31617e37 100755 --- a/scripts/make-initial-stoptags.py +++ b/scripts/make-initial-stoptags.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/scripts/merge-partitions.py b/scripts/merge-partitions.py index 2904e60693..5ca4941d6b 100755 --- a/scripts/merge-partitions.py +++ b/scripts/merge-partitions.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/scripts/normalize-by-median.py b/scripts/normalize-by-median.py index e59801bb07..acb47442d7 100755 --- a/scripts/normalize-by-median.py +++ b/scripts/normalize-by-median.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! 
/usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/scripts/partition-graph.py b/scripts/partition-graph.py index 30d11e2971..5bb40be040 100755 --- a/scripts/partition-graph.py +++ b/scripts/partition-graph.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/scripts/readstats.py b/scripts/readstats.py index 74a0065e81..e61fe99861 100755 --- a/scripts/readstats.py +++ b/scripts/readstats.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/scripts/sample-reads-randomly.py b/scripts/sample-reads-randomly.py index 35091b5404..79b4777c33 100755 --- a/scripts/sample-reads-randomly.py +++ b/scripts/sample-reads-randomly.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This script is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/scripts/split-paired-reads.py b/scripts/split-paired-reads.py index 8c5fbfa27d..dd541dab6d 100755 --- a/scripts/split-paired-reads.py +++ b/scripts/split-paired-reads.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python # # This script is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under diff --git a/scripts/trim-low-abund.py b/scripts/trim-low-abund.py index 85304ca483..6bab0a4434 100755 --- a/scripts/trim-low-abund.py +++ b/scripts/trim-low-abund.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python2 +#! 
/usr/bin/env python # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2015. It is licensed under From c46546528dc67dd9764617accc102f2ad1b52037 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Tue, 2 Jun 2015 18:48:16 -0400 Subject: [PATCH 14/20] Add ChangeLog entry --- ChangeLog | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index f933172f84..045ca51819 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,12 @@ +2015-06-12 Luiz Irber + + * *.py: refactor for Python 3 compatibility. Clear separation of Unicode + and Byte strings, use __future__ imports for compatibility (print function, + absolute imports, unicode_literals), fix tests to consider changes to random + number generator between Python versions. + * khmer/_khmer.cc: rename file, methods return Unicode strings instead of + Bytestrings. + 2015-06-12 Luiz Irber * khmer/{khmermodule.cc},tests/test_hashbits.py: Add Unicode support to @@ -83,7 +92,7 @@ * scripts/normalize-by-median.py: major refactoring to use context managers and classes; fixed -R * tests/test_scripts.py: added test for normalize's -R arg - + 2015-06-01 Tamer Mansour * scripts/normalize-by-median.py: changed to count kmers from both PE reads From 606ee94dd66d015cdeb049c7db88c69864851717 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Fri, 12 Jun 2015 14:45:34 -0400 Subject: [PATCH 15/20] Fix prints --- tests/test_counting_hash.py | 24 ++++++++++++------------ tests/test_hashbits.py | 14 +++++++------- tests/test_scripts.py | 2 +- tests/test_subset_graph.py | 10 +++++----- 4 files changed, 25 insertions(+), 25 deletions(-) diff --git a/tests/test_counting_hash.py b/tests/test_counting_hash.py index 145d65691a..2c934c86a8 100644 --- a/tests/test_counting_hash.py +++ b/tests/test_counting_hash.py @@ -59,7 +59,7 @@ def test_failed_get(self): hi.get(float(GGhash)) assert "the previous statement should fail" except ValueError as 
err: - print str(err) + print(str(err)) def test_collision_1(self): @@ -491,33 +491,33 @@ def test_get_kmer_counts(): hi.consume("AAAAAA") counts = hi.get_kmer_counts("AAAAAA") - print counts + print(counts) assert len(counts) == 1 assert counts[0] == 1 hi.consume("AAAAAA") counts = hi.get_kmer_counts("AAAAAA") - print counts + print(counts) assert len(counts) == 1 assert counts[0] == 2 hi.consume("AAAAAT") counts = hi.get_kmer_counts("AAAAAAT") - print counts + print(counts) assert len(counts) == 2 assert counts[0] == 2 assert counts[1] == 1 hi.consume("AAAAAT") counts = hi.get_kmer_counts("AAAAAAT") - print counts + print(counts) assert len(counts) == 2 assert counts[0] == 2 assert counts[1] == 2 hi.consume("AAAAAT") counts = hi.get_kmer_counts("AAAAAAT") - print counts + print(counts) assert len(counts) == 2 assert counts[0] == 2 assert counts[1] == 3 @@ -528,33 +528,33 @@ def test_get_kmer_hashes(): hi.consume("AAAAAA") hashes = hi.get_kmer_hashes("AAAAAA") - print hashes + print(hashes) assert len(hashes) == 1 assert hi.get(hashes[0]) == 1 hi.consume("AAAAAA") hashes = hi.get_kmer_hashes("AAAAAA") - print hashes + print(hashes) assert len(hashes) == 1 assert hi.get(hashes[0]) == 2 hi.consume("AAAAAT") hashes = hi.get_kmer_hashes("AAAAAAT") - print hashes + print(hashes) assert len(hashes) == 2 assert hi.get(hashes[0]) == 2 assert hi.get(hashes[1]) == 1 hi.consume("AAAAAT") hashes = hi.get_kmer_hashes("AAAAAAT") - print hashes + print(hashes) assert len(hashes) == 2 assert hi.get(hashes[0]) == 2 assert hi.get(hashes[1]) == 2 hi.consume("AAAAAT") hashes = hi.get_kmer_hashes("AAAAAAT") - print hashes + print(hashes) assert len(hashes) == 2 assert hi.get(hashes[0]) == 2 assert hi.get(hashes[1]) == 3 @@ -615,7 +615,7 @@ def test_load_truncated(): ht = khmer.load_counting_hash(truncpath) assert 0, "this should not be reached!" 
except IOError as err: - print str(err) + print(str(err)) def test_load_gz(): diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py index f7525eca5e..2e176f5868 100644 --- a/tests/test_hashbits.py +++ b/tests/test_hashbits.py @@ -69,13 +69,13 @@ def test_update_from_diff_ksize_2(): ht.update(ht2) assert 0, "should not be reached" except ValueError as err: - print str(err) + print(str(err)) try: ht2.update(ht) assert 0, "should not be reached" except ValueError as err: - print str(err) + print(str(err)) def test_update_from_diff_tablesize(): @@ -86,7 +86,7 @@ def test_update_from_diff_tablesize(): ht.update(ht2) assert 0, "should not be reached" except ValueError as err: - print str(err) + print(str(err)) def test_update_from_diff_num_tables(): @@ -97,7 +97,7 @@ def test_update_from_diff_num_tables(): ht.update(ht2) assert 0, "should not be reached" except ValueError as err: - print str(err) + print(str(err)) def test_n_occupied_1(): @@ -596,7 +596,7 @@ def test_badget(): hbts.get(b"AGCTT") assert 0, "this should fail" except ValueError as err: - print str(err) + print(str(err)) try: hbts.get(u"AGCTT") @@ -678,7 +678,7 @@ def test_save_load_tagset_trunc(): ht.load_tagset(outfile) assert 0, "this test should fail" except IOError as err: - print str(err), i + print(str(err), i) # to build the test files used below, add 'test' to this function @@ -813,7 +813,7 @@ def test_stop_tags_truncate_check(): ht.load_stop_tags(truncpath) assert 0, "expect failure of previous command" except IOError as e: - print i, str(e) + print(i, str(e)) def test_tagset_ksize_check(): diff --git a/tests/test_scripts.py b/tests/test_scripts.py index f23d9d3e3a..b4fa230718 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -699,7 +699,7 @@ def test_normalize_by_median_known_good(): for rknown, rout in zip(iter_known, iter_out): assert rknown.name == rout.name except Exception as e: - print e + print(e) assert False diff --git a/tests/test_subset_graph.py 
b/tests/test_subset_graph.py index f4c49a62a9..fc3f470d81 100644 --- a/tests/test_subset_graph.py +++ b/tests/test_subset_graph.py @@ -240,7 +240,7 @@ def test_save_load_merge_truncate(self): assert total_reads == 3, total_reads divvy = ht.divide_tags_into_subsets(1) - print divvy + print(divvy) assert len(divvy) is 3 (a, b, c) = divvy @@ -267,7 +267,7 @@ def test_save_load_merge_truncate(self): a = ht.load_subset_partitionmap(outfile3) assert 0, "this should not pass" except IOError as err: - print str(err), i + print(str(err), i) def test_save_load_merge_2(self): ht = khmer.new_hashbits(20, 4 ** 8 + 1) @@ -440,7 +440,7 @@ def test_save_load_merge_on_graph(): assert total_reads == 3, total_reads divvy = ht.divide_tags_into_subsets(1) - print divvy + print(divvy) assert len(divvy) is 3 (a, b, c) = divvy @@ -473,7 +473,7 @@ def test_save_load_on_graph_truncate(): assert total_reads == 3, total_reads divvy = ht.divide_tags_into_subsets(1) - print divvy + print(divvy) assert len(divvy) is 3 (a, b, c) = divvy @@ -500,7 +500,7 @@ def test_save_load_on_graph_truncate(): a = ht.load_partitionmap(outfile3) assert 0, "this should not pass" except IOError as err: - print str(err), i + print(str(err), i) def test_output_partitions(): From 650289ffac865deb44b98e5c70fab14529aca1a4 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Fri, 12 Jun 2015 15:06:16 -0400 Subject: [PATCH 16/20] Fix PyInt and PyBytes for py3 --- khmer/_khmer.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/khmer/_khmer.cc b/khmer/_khmer.cc index 3cc8e55fed..00bd155612 100644 --- a/khmer/_khmer.cc +++ b/khmer/_khmer.cc @@ -34,6 +34,7 @@ using namespace read_parsers; #if (PY_MAJOR_VERSION >= 3) #define PyInt_Check(arg) PyLong_Check(arg) #define PyInt_AsLong(arg) PyLong_AsLong(arg) +#define PyInt_FromLong(arg) PyLong_FromLong(arg) #endif // @@ -2324,7 +2325,7 @@ hashtable_get_kmers(khmer_KHashtable_Object * me, PyObject * args) PyObject * x = PyList_New(kmers.size()); for (unsigned int i = 
0; i < kmers.size(); i++) { - PyObject * obj = PyBytes_FromString(kmers[i].c_str()); + PyObject * obj = PyUnicode_FromString(kmers[i].c_str()); PyList_SET_ITEM(x, i, obj); } From cbbb03f6d613acfedaba735ff9fb4f7e1031a8d3 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Fri, 12 Jun 2015 15:06:30 -0400 Subject: [PATCH 17/20] Use released screed --- Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 5bdf61d987..2ce9b50af3 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ CPPSOURCES=$(wildcard lib/*.cc lib/*.hh khmer/_khmermodule.cc) PYSOURCES=$(wildcard khmer/*.py scripts/*.py) SOURCES=$(PYSOURCES) $(CPPSOURCES) setup.py DEVPKGS=sphinxcontrib-autoprogram pep8==1.5.7 diff_cover \ -autopep8 pylint coverage gcovr nose pep257 future +autopep8 pylint coverage gcovr nose pep257 future screed GCOVRURL=git+https://github.com/nschum/gcovr.git@never-executed-branches VERSION=$(shell git describe --tags --dirty | sed s/v//) @@ -37,7 +37,6 @@ install-dep: install-dependencies install-dependencies: pip install --upgrade $(DEVPKGS) - pip install git+https://github.com/ged-lab/screed.git ## sharedobj : build khmer shared object file sharedobj: khmer/_khmermodule.so From 924d8c627c5863c2d557ce880da9ea7841e92c2b Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Fri, 12 Jun 2015 15:06:56 -0400 Subject: [PATCH 18/20] More print fixes --- tests/test_hll.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_hll.py b/tests/test_hll.py index e9d3b9b7ca..ebde12d56a 100644 --- a/tests/test_hll.py +++ b/tests/test_hll.py @@ -1,4 +1,4 @@ -from __future__ import division +from __future__ import division, print_function, unicode_literals from __future__ import absolute_import # # This file is part of khmer, https://github.com/dib-lab/khmer/, and is @@ -270,7 +270,7 @@ def test_hll_merge_1(): hll.merge(hll2) assert 0, "previous statement should fail with a ValueError" except ValueError as err: - print str(err) + 
print(str(err)) def test_hll_merge_2(): @@ -281,7 +281,7 @@ def test_hll_merge_2(): hll.merge(hll2) assert 0, "previous statement should fail with a ValueError" except ValueError as err: - print str(err) + print(str(err)) def test_hll_merge_3(): From e28a8c4519f15c6dd15851bbcbe9cca1fcb17f6a Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Fri, 12 Jun 2015 15:07:07 -0400 Subject: [PATCH 19/20] Update test output --- tests/test_scripts.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/tests/test_scripts.py b/tests/test_scripts.py index b4fa230718..2b3bf02e9c 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -2802,17 +2802,18 @@ def test_sample_reads_randomly_S(): print(list(sorted(seqs))) if sys.version_info.major == 2: - answer = {'895:1:1:1298:13380', '895:1:1:1348:18672', - '895:1:1:1309:4153', '895:1:1:1252:19493', - '895:1:1:1368:4434', '895:1:1:1348:1257', - '895:1:1:1383:3089', '895:1:1:1355:13535', - '895:1:1:1303:6251', '895:1:1:1349:15165'} + answer = {'895:1:1:1303:14389', '895:1:1:1373:4848', + '895:1:1:1357:19736', '895:1:1:1347:3237', + '895:1:1:1338:7557', '895:1:1:1388:11093', + '895:1:1:1296:1784', '895:1:1:1290:11501', + '895:1:1:1355:13535', '895:1:1:1303:6251'} + else: - answer = {'895:1:1:1265:2265', '895:1:1:1287:13756', - '895:1:1:1296:1784', '895:1:1:1307:4308', - '895:1:1:1344:1968', '895:1:1:1362:3983', - '895:1:1:1365:12527', '895:1:1:1383:14728', - '895:1:1:1383:3089', '895:1:1:1386:14753'} + answer = {'895:1:1:1362:3983', '895:1:1:1363:9988', + '895:1:1:1314:10430', '895:1:1:1255:18861', + '895:1:1:1308:20421', '895:1:1:1358:4953', + '895:1:1:1351:14718', '895:1:1:1303:6251', + '895:1:1:1276:16426', '895:1:1:1355:13535'} assert seqs == answer From 4922a2abf7cdc4fa995ed1d5f9b3abcbde114c2b Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Fri, 12 Jun 2015 15:38:53 -0400 Subject: [PATCH 20/20] Update classifiers --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff 
--git a/setup.py b/setup.py index a4a5e32ee2..ab976cee76 100755 --- a/setup.py +++ b/setup.py @@ -150,8 +150,9 @@ def check_for_openmp(): "Operating System :: POSIX :: Linux", "Operating System :: MacOS :: MacOS X", "Programming Language :: C++", - "Programming Language :: Python :: 2 :: Only", "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 3.3", + "Programming Language :: Python :: 3.4", "Topic :: Scientific/Engineering :: Bio-Informatics", ] if "-rc" in versioneer.get_version():