From 74f8d87a56f6e59994601932e33dda335f475e0f Mon Sep 17 00:00:00 2001 From: alex bishara Date: Sat, 12 Aug 2017 17:18:14 -0700 Subject: [PATCH] maps seed contigs back and retain only those in some connected components as seed hits for subasm --- BUILD | 120 +++++++++++++++--------------- aclocal.m4 | 72 +++++++----------- bin/Makefile.am | 83 +++++++++++---------- config.h | 9 +-- config.h.in | 3 - src/assembly/assembly_utility.cpp | 36 +++++++++ src/assembly/assembly_utility.h | 3 + src/graph/contig_graph.cpp | 33 ++++++++ src/graph/contig_graph.h | 5 ++ src/release/idba_subasm.cpp | 89 +++++++++++++++++++--- src/sequence/short_sequence.h | 2 +- 11 files changed, 287 insertions(+), 168 deletions(-) diff --git a/BUILD b/BUILD index 0648f61..868905e 100644 --- a/BUILD +++ b/BUILD @@ -44,240 +44,240 @@ cc_binary( ) cc_binary( - name = "fq2fa", - srcs = ["src/release/fq2fa.cpp"], + name = "idba_hybrid", + srcs = ["src/release/idba_hybrid.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "idba", - srcs = ["src/release/idba.cpp"], + name = "idba_subasm", + srcs = ["src/release/idba_subasm.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "idba_hybrid", - srcs = ["src/release/idba_hybrid.cpp"], + name = "sim_reads_tran", + srcs = ["src/release/sim_reads_tran.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "idba_tran", - srcs = ["src/release/idba_tran.cpp"], + name = "idba", + srcs = ["src/release/idba.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "idba_subasm", - srcs = ["src/release/idba_subasm.cpp"], + name = "parallel_blat", + srcs = ["src/release/parallel_blat.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "idba_ud", - srcs = ["src/release/idba_ud.cpp"], + name = "idba_tran", + srcs = ["src/release/idba_tran.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "parallel_blat", - srcs = ["src/release/parallel_blat.cpp"], + name = "split_scaffold", + srcs = ["src/release/split_scaffold.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "raw_n50", - srcs = ["src/release/raw_n50.cpp"], + name = "sim_reads", + srcs = ["src/release/sim_reads.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "sim_reads", - srcs = ["src/release/sim_reads.cpp"], + name = "fq2fa", + srcs = ["src/release/fq2fa.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "sim_reads_tran", - srcs = ["src/release/sim_reads_tran.cpp"], + name = "validate_contigs_blat", + srcs = ["src/release/validate_contigs_blat.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "split_scaffold", - srcs = ["src/release/split_scaffold.cpp"], + name = "raw_n50", + srcs = ["src/release/raw_n50.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "validate_contigs_blat", - srcs = ["src/release/validate_contigs_blat.cpp"], + name = "idba_ud", + srcs = ["src/release/idba_ud.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "fa2fq", - srcs = ["src/tools/fa2fq.cpp"], + name = "idba_tran_test", + srcs = ["src/tools/idba_tran_test.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "filter_blat", - srcs = ["src/tools/filter_blat.cpp"], + name = "validate_component", + srcs = ["src/tools/validate_component.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "filter_contigs", - srcs = ["src/tools/filter_contigs.cpp"], + name = "print_graph", + srcs = ["src/tools/print_graph.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "idba_tran_test", - srcs = ["src/tools/idba_tran_test.cpp"], + name = "scaffold", + srcs = ["src/tools/scaffold.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "parallel_rna_blat", - srcs = ["src/tools/parallel_rna_blat.cpp"], + name = "split_fq", + srcs = ["src/tools/split_fq.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "print_graph", - srcs = ["src/tools/print_graph.cpp"], + name = "test", + srcs = ["src/tools/test.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "sample_reads", - srcs = ["src/tools/sample_reads.cpp"], + name = "filter_blat", + srcs = ["src/tools/filter_blat.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "scaffold", - srcs = ["src/tools/scaffold.cpp"], + name = "validate_reads_blat", + srcs = ["src/tools/validate_reads_blat.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "shuffle_reads", - srcs = ["src/tools/shuffle_reads.cpp"], + name = "sample_reads", + srcs = ["src/tools/sample_reads.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "sort_psl", - srcs = ["src/tools/sort_psl.cpp"], + name = "fa2fq", + srcs = ["src/tools/fa2fq.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "sort_reads", - srcs = ["src/tools/sort_reads.cpp"], + name = "validate_rna", + srcs = ["src/tools/validate_rna.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "split_fa", - srcs = ["src/tools/split_fa.cpp"], + name = "sort_psl", + srcs = ["src/tools/sort_psl.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "split_fq", - srcs = ["src/tools/split_fq.cpp"], + name = "validate_contigs_mummer", + srcs = ["src/tools/validate_contigs_mummer.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "test", - srcs = ["src/tools/test.cpp"], + name = "parallel_rna_blat", + srcs = ["src/tools/parallel_rna_blat.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "validate_component", - srcs = ["src/tools/validate_component.cpp"], + name = "filter_contigs", + srcs = ["src/tools/filter_contigs.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "validate_contigs_mummer", - srcs = ["src/tools/validate_contigs_mummer.cpp"], + name = "split_fa", + srcs = ["src/tools/split_fa.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "validate_reads_blat", - srcs = ["src/tools/validate_reads_blat.cpp"], + name = "sort_reads", + srcs = ["src/tools/sort_reads.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], ) cc_binary( - name = "validate_rna", - srcs = ["src/tools/validate_rna.cpp"], + name = "shuffle_reads", + srcs = ["src/tools/shuffle_reads.cpp"], copts = ["-Wall", "-O3", "-Isrc", "-fopenmp"], linkopts = ["-pthread", "-fopenmp", "-lm"], deps = [":assembly"], diff --git a/aclocal.m4 b/aclocal.m4 index da06da5..bcc213b 100644 --- a/aclocal.m4 +++ b/aclocal.m4 @@ -1,8 +1,7 @@ -# generated automatically by aclocal 1.11.3 -*- Autoconf -*- +# generated automatically by aclocal 1.11.1 -*- Autoconf -*- # Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, -# 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, -# Inc. +# 2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc. # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. @@ -14,21 +13,18 @@ m4_ifndef([AC_AUTOCONF_VERSION], [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl -m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.68],, -[m4_warning([this file was generated for autoconf 2.68. +m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.63],, +[m4_warning([this file was generated for autoconf 2.63. You have another version of autoconf. It may work, but is not guaranteed to. If you have problems, you may need to regenerate the build system entirely. To do so, use the procedure documented by the package, typically `autoreconf'.])]) -# Copyright (C) 2002, 2003, 2005, 2006, 2007, 2008, 2011 Free Software -# Foundation, Inc. +# Copyright (C) 2002, 2003, 2005, 2006, 2007, 2008 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. -# serial 1 - # AM_AUTOMAKE_VERSION(VERSION) # ---------------------------- # Automake X.Y traces this macro to ensure aclocal.m4 has been @@ -38,7 +34,7 @@ AC_DEFUN([AM_AUTOMAKE_VERSION], [am__api_version='1.11' dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to dnl require some minimum version. Point them to the right macro. -m4_if([$1], [1.11.3], [], +m4_if([$1], [1.11.1], [], [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl ]) @@ -54,21 +50,19 @@ m4_define([_AM_AUTOCONF_VERSION], []) # Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced. # This function is AC_REQUIREd by AM_INIT_AUTOMAKE. AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION], -[AM_AUTOMAKE_VERSION([1.11.3])dnl +[AM_AUTOMAKE_VERSION([1.11.1])dnl m4_ifndef([AC_AUTOCONF_VERSION], [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))]) # AM_AUX_DIR_EXPAND -*- Autoconf -*- -# Copyright (C) 2001, 2003, 2005, 2011 Free Software Foundation, Inc. +# Copyright (C) 2001, 2003, 2005 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. -# serial 1 - # For projects using AC_CONFIG_AUX_DIR([foo]), Autoconf sets # $ac_aux_dir to `$srcdir/foo'. In other projects, it is set to # `$srcdir', `$srcdir/..', or `$srcdir/../..'. @@ -150,14 +144,14 @@ AC_CONFIG_COMMANDS_PRE( Usually this means the macro was only invoked conditionally.]]) fi])]) -# Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2009, -# 2010, 2011 Free Software Foundation, Inc. +# Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2009 +# Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. -# serial 12 +# serial 10 # There are a few dirty hacks below to avoid letting `AC_PROG_CC' be # written in clear, in which case automake, when reading aclocal.m4, @@ -197,7 +191,6 @@ AC_CACHE_CHECK([dependency style of $depcc], # instance it was reported that on HP-UX the gcc test will end up # making a dummy file named `D' -- because `-MD' means `put the output # in D'. - rm -rf conftest.dir mkdir conftest.dir # Copy depcomp to subdir because otherwise we won't find it if we're # using a relative directory. @@ -262,7 +255,7 @@ AC_CACHE_CHECK([dependency style of $depcc], break fi ;; - msvc7 | msvc7msys | msvisualcpp | msvcmsys) + msvisualcpp | msvcmsys) # This compiler won't grok `-c -o', but also, the minuso test has # not run yet. These depmodes are late enough in the game, and # so weak that their functioning should not be impacted. @@ -327,13 +320,10 @@ AC_DEFUN([AM_DEP_TRACK], if test "x$enable_dependency_tracking" != xno; then am_depcomp="$ac_aux_dir/depcomp" AMDEPBACKSLASH='\' - am__nodep='_no' fi AM_CONDITIONAL([AMDEP], [test "x$enable_dependency_tracking" != xno]) AC_SUBST([AMDEPBACKSLASH])dnl _AM_SUBST_NOTMAKE([AMDEPBACKSLASH])dnl -AC_SUBST([am__nodep])dnl -_AM_SUBST_NOTMAKE([am__nodep])dnl ]) # Generate code to set up dependency tracking. -*- Autoconf -*- @@ -555,15 +545,12 @@ for _am_header in $config_headers :; do done echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count]) -# Copyright (C) 2001, 2003, 2005, 2008, 2011 Free Software Foundation, -# Inc. +# Copyright (C) 2001, 2003, 2005, 2008 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. -# serial 1 - # AM_PROG_INSTALL_SH # ------------------ # Define $install_sh. @@ -695,15 +682,12 @@ else fi ]) -# Copyright (C) 2003, 2004, 2005, 2006, 2011 Free Software Foundation, -# Inc. +# Copyright (C) 2003, 2004, 2005, 2006 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. -# serial 1 - # AM_PROG_MKDIR_P # --------------- # Check for `mkdir -p'. @@ -726,14 +710,13 @@ esac # Helper functions for option handling. -*- Autoconf -*- -# Copyright (C) 2001, 2002, 2003, 2005, 2008, 2010 Free Software -# Foundation, Inc. +# Copyright (C) 2001, 2002, 2003, 2005, 2008 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. -# serial 5 +# serial 4 # _AM_MANGLE_OPTION(NAME) # ----------------------- @@ -741,13 +724,13 @@ AC_DEFUN([_AM_MANGLE_OPTION], [[_AM_OPTION_]m4_bpatsubst($1, [[^a-zA-Z0-9_]], [_])]) # _AM_SET_OPTION(NAME) -# -------------------- +# ------------------------------ # Set option NAME. Presently that only means defining a flag for this option. AC_DEFUN([_AM_SET_OPTION], [m4_define(_AM_MANGLE_OPTION([$1]), 1)]) # _AM_SET_OPTIONS(OPTIONS) -# ------------------------ +# ---------------------------------- # OPTIONS is a space-separated list of Automake options. AC_DEFUN([_AM_SET_OPTIONS], [m4_foreach_w([_AM_Option], [$1], [_AM_SET_OPTION(_AM_Option)])]) @@ -823,14 +806,12 @@ Check your system clock]) fi AC_MSG_RESULT(yes)]) -# Copyright (C) 2001, 2003, 2005, 2011 Free Software Foundation, Inc. +# Copyright (C) 2001, 2003, 2005 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. -# serial 1 - # AM_PROG_INSTALL_STRIP # --------------------- # One issue with vendor `install' (even GNU) is that you can't @@ -853,13 +834,13 @@ fi INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s" AC_SUBST([INSTALL_STRIP_PROGRAM])]) -# Copyright (C) 2006, 2008, 2010 Free Software Foundation, Inc. +# Copyright (C) 2006, 2008 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. -# serial 3 +# serial 2 # _AM_SUBST_NOTMAKE(VARIABLE) # --------------------------- @@ -868,13 +849,13 @@ AC_SUBST([INSTALL_STRIP_PROGRAM])]) AC_DEFUN([_AM_SUBST_NOTMAKE]) # AM_SUBST_NOTMAKE(VARIABLE) -# -------------------------- +# --------------------------- # Public sister of _AM_SUBST_NOTMAKE. AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)]) # Check how to create a tarball. -*- Autoconf -*- -# Copyright (C) 2004, 2005, 2012 Free Software Foundation, Inc. +# Copyright (C) 2004, 2005 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -896,11 +877,10 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)]) # a tarball read from stdin. # $(am__untar) < result.tar AC_DEFUN([_AM_PROG_TAR], -[# Always define AMTAR for backward compatibility. Yes, it's still used -# in the wild :-( We should find a proper way to deprecate it ... -AC_SUBST([AMTAR], ['$${TAR-tar}']) +[# Always define AMTAR for backward compatibility. +AM_MISSING_PROG([AMTAR], [tar]) m4_if([$1], [v7], - [am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -'], + [am__tar='${AMTAR} chof - "$$tardir"'; am__untar='${AMTAR} xf -'], [m4_case([$1], [ustar],, [pax],, [m4_fatal([Unknown tar format])]) AC_MSG_CHECKING([how to create a $1 tar archive]) diff --git a/bin/Makefile.am b/bin/Makefile.am index 887350c..90799f5 100644 --- a/bin/Makefile.am +++ b/bin/Makefile.am @@ -3,69 +3,72 @@ LIBS = $(top_srcdir)/lib/libassembly.a @LIBS@ AM_CXXFLAGS = -Wall -O3 -fopenmp -pthread INCLUDES = -I$(top_srcdir)/src -I$(top_srcdir)/gtest_src -shuffle_reads_SOURCES = $(top_srcdir)/src/tools/shuffle_reads.cpp -scaffold_SOURCES = $(top_srcdir)/src/tools/scaffold.cpp +idba_tran_test_SOURCES = $(top_srcdir)/src/tools/idba_tran_test.cpp validate_component_SOURCES = $(top_srcdir)/src/tools/validate_component.cpp -fa2fq_SOURCES = $(top_srcdir)/src/tools/fa2fq.cpp -validate_rna_SOURCES = $(top_srcdir)/src/tools/validate_rna.cpp -validate_contigs_mummer_SOURCES = $(top_srcdir)/src/tools/validate_contigs_mummer.cpp print_graph_SOURCES = $(top_srcdir)/src/tools/print_graph.cpp -idba_tran_test_SOURCES = $(top_srcdir)/src/tools/idba_tran_test.cpp +scaffold_SOURCES = $(top_srcdir)/src/tools/scaffold.cpp +split_fq_SOURCES = $(top_srcdir)/src/tools/split_fq.cpp +test_SOURCES = $(top_srcdir)/src/tools/test.cpp +filter_blat_SOURCES = $(top_srcdir)/src/tools/filter_blat.cpp +validate_reads_blat_SOURCES = $(top_srcdir)/src/tools/validate_reads_blat.cpp sample_reads_SOURCES = $(top_srcdir)/src/tools/sample_reads.cpp +fa2fq_SOURCES = $(top_srcdir)/src/tools/fa2fq.cpp +validate_rna_SOURCES = $(top_srcdir)/src/tools/validate_rna.cpp sort_psl_SOURCES = $(top_srcdir)/src/tools/sort_psl.cpp -sort_reads_SOURCES = $(top_srcdir)/src/tools/sort_reads.cpp -filter_contigs_SOURCES = $(top_srcdir)/src/tools/filter_contigs.cpp +validate_contigs_mummer_SOURCES = $(top_srcdir)/src/tools/validate_contigs_mummer.cpp parallel_rna_blat_SOURCES = $(top_srcdir)/src/tools/parallel_rna_blat.cpp -filter_blat_SOURCES = $(top_srcdir)/src/tools/filter_blat.cpp +filter_contigs_SOURCES = $(top_srcdir)/src/tools/filter_contigs.cpp split_fa_SOURCES = $(top_srcdir)/src/tools/split_fa.cpp -test_SOURCES = $(top_srcdir)/src/tools/test.cpp -validate_reads_blat_SOURCES = $(top_srcdir)/src/tools/validate_reads_blat.cpp -split_fq_SOURCES = $(top_srcdir)/src/tools/split_fq.cpp +sort_reads_SOURCES = $(top_srcdir)/src/tools/sort_reads.cpp +shuffle_reads_SOURCES = $(top_srcdir)/src/tools/shuffle_reads.cpp -fq2fa_SOURCES = $(top_srcdir)/src/release/fq2fa.cpp -validate_contigs_blat_SOURCES = $(top_srcdir)/src/release/validate_contigs_blat.cpp -idba_ud_SOURCES = $(top_srcdir)/src/release/idba_ud.cpp -raw_n50_SOURCES = $(top_srcdir)/src/release/raw_n50.cpp -idba_tran_SOURCES = $(top_srcdir)/src/release/idba_tran.cpp +filterfa_SOURCES = $(top_srcdir)/src/release/filterfa.cpp idba_hybrid_SOURCES = $(top_srcdir)/src/release/idba_hybrid.cpp -parallel_blat_SOURCES = $(top_srcdir)/src/release/parallel_blat.cpp +idba_subasm_SOURCES = $(top_srcdir)/src/release/idba_subasm.cpp sim_reads_tran_SOURCES = $(top_srcdir)/src/release/sim_reads_tran.cpp -filterfa_SOURCES = $(top_srcdir)/src/release/filterfa.cpp idba_SOURCES = $(top_srcdir)/src/release/idba.cpp +parallel_blat_SOURCES = $(top_srcdir)/src/release/parallel_blat.cpp +idba_tran_SOURCES = $(top_srcdir)/src/release/idba_tran.cpp split_scaffold_SOURCES = $(top_srcdir)/src/release/split_scaffold.cpp -sim_reads_SOURCES = $(top_srcdir)/src/release/sim_reads.cpp +sim_reads_SOURCES = $(top_srcdir)/src/release/sim_reads.cpp +fq2fa_SOURCES = $(top_srcdir)/src/release/fq2fa.cpp +validate_contigs_blat_SOURCES = $(top_srcdir)/src/release/validate_contigs_blat.cpp +raw_n50_SOURCES = $(top_srcdir)/src/release/raw_n50.cpp +idba_ud_SOURCES = $(top_srcdir)/src/release/idba_ud.cpp bin_PROGRAMS = \ idba_hybrid noinst_PROGRAMS = \ - shuffle_reads \ - scaffold \ + idba_tran_test \ validate_component \ - fa2fq \ - validate_rna \ - validate_contigs_mummer \ print_graph \ - idba_tran_test \ + scaffold \ + split_fq \ + test \ + filter_blat \ + validate_reads_blat \ sample_reads \ + fa2fq \ + validate_rna \ sort_psl \ - sort_reads \ - filter_contigs \ + validate_contigs_mummer \ parallel_rna_blat \ - filter_blat \ + filter_contigs \ split_fa \ - test \ - validate_reads_blat \ - split_fq \ - fq2fa \ - validate_contigs_blat \ - idba_ud \ - raw_n50 \ - idba_tran \ - parallel_blat \ - sim_reads_tran \ + sort_reads \ + shuffle_reads \ filterfa \ + idba_subasm \ + sim_reads_tran \ idba \ + parallel_blat \ + idba_tran \ split_scaffold \ - sim_reads + sim_reads \ + fq2fa \ + validate_contigs_blat \ + raw_n50 \ + idba_ud + diff --git a/config.h b/config.h index af51167..9afbecf 100644 --- a/config.h +++ b/config.h @@ -53,22 +53,19 @@ #define PACKAGE_NAME "idba" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "idba 1.1.2" +#define PACKAGE_STRING "idba 1.1.3" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "idba" -/* Define to the home page for this package. */ -#define PACKAGE_URL "" - /* Define to the version of this package. */ -#define PACKAGE_VERSION "1.1.2" +#define PACKAGE_VERSION "1.1.3" /* Define to 1 if you have the ANSI C header files. */ #define STDC_HEADERS 1 /* Version number of package */ -#define VERSION "1.1.2" +#define VERSION "1.1.3" /* Define to empty if `const' does not conform to ANSI C. */ /* #undef const */ diff --git a/config.h.in b/config.h.in index 5bde18a..1841d31 100644 --- a/config.h.in +++ b/config.h.in @@ -57,9 +57,6 @@ /* Define to the one symbol short name of this package. */ #undef PACKAGE_TARNAME -/* Define to the home page for this package. */ -#undef PACKAGE_URL - /* Define to the version of this package. */ #undef PACKAGE_VERSION diff --git a/src/assembly/assembly_utility.cpp b/src/assembly/assembly_utility.cpp index c9a9cef..221d22d 100644 --- a/src/assembly/assembly_utility.cpp +++ b/src/assembly/assembly_utility.cpp @@ -44,6 +44,14 @@ bool CompareHashAlignerRecord(const HashAlignerRecord &x, const HashAlignerRecor return x.match_length > y.match_length; } +void ReadInput(const std::string &read_file, const std::string &long_read_file, const std::string &seed_contig_file, AssemblyInfo &assembly_info) +{ + if (seed_contig_file != "") + ReadSequence(seed_contig_file, assembly_info.seed_contigs); + + ReadInput(read_file, long_read_file, assembly_info); +} + void ReadInput(const std::string &read_file, const std::string &long_read_file, AssemblyInfo &assembly_info) { if (read_file != "") @@ -441,6 +449,34 @@ int64_t AlignReads(AssemblyInfo &assembly_info, HashAligner &hash_aligner, doubl return num_aligned_reads; } +int64_t AlignSeeds( + AssemblyInfo &assembly_info, + HashAligner &hash_aligner, + vector > &seed_records, + double similar) +{ + int64_t num_aligned_reads = 0; + deque &seeds = assembly_info.seed_contigs; + seed_records.resize(seeds.size()); + for (int64_t i = 0; i < (int64_t)seeds.size(); ++i) + { + Sequence seq(seeds[i]); + uint32_t max_records = seq.size(); + + hash_aligner.AlignSequence( + seq, + seed_records[i], + 150, + similar, + max_records); + if (seed_records[i].size() > 0) + { + ++num_aligned_reads; + } + } + return num_aligned_reads; +} + int64_t AlignReadsLocal(AssemblyInfo &assembly_info, HashAligner &hash_aligner, int min_match, int max_mismatch, const std::string &align_file) { deque &reads = assembly_info.reads; diff --git a/src/assembly/assembly_utility.h b/src/assembly/assembly_utility.h index 1150f66..02d4128 100644 --- a/src/assembly/assembly_utility.h +++ b/src/assembly/assembly_utility.h @@ -39,6 +39,7 @@ struct AssemblyInfo std::vector read_flags; std::vector long_read_flags; std::deque ref_contigs; + std::deque seed_contigs; void ClearStatus() { @@ -52,6 +53,7 @@ struct AssemblyInfo }; void ReadInput(const std::string &read_file, const std::string &long_read_file, AssemblyInfo &assembly_info); +void ReadInput(const std::string &read_file, const std::string &long_read_file, const std::string &seed_contig_file, AssemblyInfo &assembly_info); bool ReadHashAlignerRecords(FILE *fp, std::deque &records); bool WriteHashAlignerRecords(FILE *fp, std::deque &records); @@ -75,6 +77,7 @@ void InsertExistKmers(AssemblyInfo &assembly_info, HashGraph &hash_graph); int64_t InsertIterativeKmers(const HashGraph &old_hash_graph, const Sequence &seq, HashGraph &hash_graph, int count = 1); int64_t AlignReads(AssemblyInfo &assembly_info, HashAligner &hash_aligner, double similar, const std::string &align_file, bool is_all = true); +int64_t AlignSeeds(AssemblyInfo &assembly_info, HashAligner &hash_aligner, std::vector > &seed_records, double similar); int64_t AlignReadsLocal(AssemblyInfo &assembly_info, HashAligner &hash_aligner, int min_match, int max_mismatch, const std::string &align_file); int64_t AlignReadsMultiple(AssemblyInfo &assembly_info, HashAligner &hash_aligner, double similar, const std::string &align_file, bool is_all = true); diff --git a/src/graph/contig_graph.cpp b/src/graph/contig_graph.cpp index 754f607..3e200a5 100644 --- a/src/graph/contig_graph.cpp +++ b/src/graph/contig_graph.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include "graph/contig_graph_branch_group.h" #include "sequence/sequence.h" @@ -1180,6 +1181,38 @@ void ContigGraph::GetConsensus(deque &contigs) } } +int ContigGraph::FindSeedComponents( + deque &contigs, + deque &contig_infos, + map &seed_hits) +{ + int num_ccs = 0; + deque > components; + deque component_strings; + + GetComponents(components, component_strings); + for (unsigned i = 0; i < components.size(); ++i) + { + int seed_match_len = 0; + for (unsigned j = 0; j < components[i].size(); ++j) + { + ContigGraphVertexAdaptor contig = components[i][j]; + int ref_id = contig.id(); + seed_match_len += seed_hits[ref_id]; + } + if (seed_match_len >= 100) + { + num_ccs++; + for (unsigned j = 0; j < components[i].size(); ++j) + { + contigs.push_back(components[i][j].contig()); + contig_infos.push_back(components[i][j].contig_info()); + } + } + } + return num_ccs; +} + bool ContigGraph::FindPath(ContigGraphVertexAdaptor from, ContigGraphVertexAdaptor to, ContigGraphPath &path) { path.clear(); diff --git a/src/graph/contig_graph.h b/src/graph/contig_graph.h index 4048aca..bb2c313 100644 --- a/src/graph/contig_graph.h +++ b/src/graph/contig_graph.h @@ -137,6 +137,11 @@ class ContigGraph void GetComponents(std::deque > &components, std::deque &component_strings); void GetConsensus(std::deque &consensus); + int FindSeedComponents( + std::deque &contigs, + std::deque &contig_infos, + std::map &seed_hits); + bool FindPath(ContigGraphVertexAdaptor from, ContigGraphVertexAdaptor to, ContigGraphPath &path); void SortVertices() diff --git a/src/release/idba_subasm.cpp b/src/release/idba_subasm.cpp index c6b6f03..62323f7 100644 --- a/src/release/idba_subasm.cpp +++ b/src/release/idba_subasm.cpp @@ -1,5 +1,5 @@ /** - * @file idba_ud.cpp + * @file idba_subasm.cpp * @brief An iterative de Bruijn graph assembler for sequencing data with highly uneven depth. * @author Yu Peng (ypeng@cs.hku.hk) * @version 1.0.6 @@ -41,6 +41,7 @@ struct IDBAOption string directory; string read_file; string long_read_file; + string seed_contig_file; deque extra_read_files; int mink; int maxk; @@ -130,6 +131,7 @@ int read_length = 0; void BuildHashGraph(int kmer_size); void Assemble(HashGraph &hash_graph); void AlignReads(const string &contig_file, const string &align_file); +int64_t FilterSeedContigs(const string &contig_file, int kmer_size, deque &subasm_contigs, deque &subasm_contig_infos); void CorrectReads(int kmer_size); void LocalAssembly(int kmer_size, int new_kmer_size); void Iterate(int kmer_size, int new_kmer_size); @@ -170,6 +172,9 @@ int main(int argc, char *argv[]) desc.AddOption("no_correct", "", option.is_no_correct, "do not do correction"); desc.AddOption("pre_correction", "", option.is_pre_correction, "perform pre-correction before assembly"); + // subassembly options + desc.AddOption("seed_contig", "", option.seed_contig_file, "fasta seed contig file"); + try { desc.Parse(argc, argv); @@ -177,6 +182,9 @@ int main(int argc, char *argv[]) if (option.read_file == "" && option.long_read_file == "") throw logic_error("not enough parameters"); + if (option.seed_contig_file == "" ) + throw logic_error("seed_contig required"); + if (option.maxk < option.mink) throw invalid_argument("mink is larger than maxk"); @@ -187,7 +195,7 @@ int main(int argc, char *argv[]) { cerr << e.what() << endl; cerr << "IDBA-UD - Iterative de Bruijn Graph Assembler for sequencing data with highly uneven depth." << endl; - cerr << "Usage: idba_ud -r read.fa -o output_dir" << endl; + cerr << "Usage: idba_subasm -r read.fa -o output_dir" << endl; cerr << "Allowed Options: " << endl; cerr << desc << endl; exit(1); @@ -206,7 +214,7 @@ int main(int argc, char *argv[]) omp_set_num_threads(option.num_threads); cout << "number of threads " << option.num_threads << endl; - ReadInput(option.read_file, option.long_read_file, assembly_info); + ReadInput(option.read_file, option.long_read_file, option.seed_contig_file, assembly_info); deque extra_reads; for (unsigned i = 0; i < option.extra_read_files.size(); ++i) { @@ -219,6 +227,7 @@ int main(int argc, char *argv[]) } cout << "reads " << assembly_info.reads.size() << endl; cout << "long reads " << assembly_info.long_reads.size() << endl; + cout << "seed contigs " << assembly_info.seed_contigs.size() << endl; cout << "extra reads " << extra_reads.size() << endl; assembly_info.long_reads.insert(assembly_info.long_reads.end(), extra_reads.begin(), extra_reads.end()); @@ -241,6 +250,8 @@ int main(int argc, char *argv[]) int kmer_size = option.mink; while (true) { + //cout << "SKIPPING" << endl; + //break; cout << "kmer " << kmer_size << endl; if (kmer_size >= (option.mink + option.maxk)/2 || kmer_size == option.maxk) @@ -270,15 +281,22 @@ int main(int argc, char *argv[]) kmer_size = option.maxk; - deque contigs; - deque names; - ReadSequence(option.contig_file(kmer_size), contigs, names); - FastaWriter writer(option.contig_file()); - for (unsigned i = 0; i < contigs.size(); ++i) - { - if ((int)contigs[i].size() >= option.min_contig) - writer.Write(contigs[i], names[i]); - } + // obtain contigs only from connected components with seed contig hits + deque subasm_contigs; + deque subasm_contig_infos; + FilterSeedContigs( + option.contig_file(option.maxk), + option.maxk, + subasm_contigs, + subasm_contig_infos); + + // output final filtered contigs + WriteContig( + option.contig_file(), + subasm_contigs, + subasm_contig_infos, + FormatString("contig-%d", option.maxk), + option.min_contig); Scaffold(option.maxk, option.min_contig); @@ -652,3 +670,50 @@ void AlignReads(const string &contig_file, ShortReadLibrary &library, const stri assembly_info.reads.swap(library.reads()); } +int64_t FilterSeedContigs( + const string &contig_file, + int kmer_size, + deque &subasm_contigs, + deque &subasm_contig_infos) +{ + int num_ccs = 0; + + // align seed contigs + deque contigs; + ReadSequence(contig_file, contigs); + + HashAligner hash_aligner(option.seed_kmer_size, option.min_contig, 2); + hash_aligner.Initialize(contigs); + + cout << "aligning seed contigs" << endl; + vector > seed_records; + //printf("HERE hm2\n"); + int64_t num_aligned_seeds = AlignSeeds(assembly_info, hash_aligner, seed_records, option.similar); + cout << " - " << num_aligned_seeds << " aligned" << endl; + + // compile contig hits from seed alignments + map seed_hits; + int total_seed_length = 0; + for (unsigned i = 0; i < seed_records.size(); ++i) + { + for (unsigned j = 0; j < seed_records[i].size(); ++j) + { + int ref_id = seed_records[i][j].ref_id; + int match_length = seed_records[i][j].match_length; + seed_hits[ref_id] += match_length; + total_seed_length += match_length; + //printf("seed_records i,j = %d, %d\n", i, j); + //printf(" match_length: %d, total_length: %d\n", match_length, total_seed_length); + //fflush(stdout); + } + } + + // build contig graph and find connected components hit by the seeds + ContigGraph contig_graph(kmer_size, contigs); + + num_ccs = contig_graph.FindSeedComponents(subasm_contigs, subasm_contig_infos, seed_hits); + cout << "hit " << num_ccs << " connected components" << endl; + cout << " - retained " << subasm_contigs.size() << " of " << contigs.size() << " contigs" << endl; + return num_ccs; +} + diff --git a/src/sequence/short_sequence.h b/src/sequence/short_sequence.h index 59bf08c..f000add 100644 --- a/src/sequence/short_sequence.h +++ b/src/sequence/short_sequence.h @@ -99,7 +99,7 @@ class ShortSequence return size() < seq.size(); } - static const uint32_t kMaxShortSequence = 128; + static const uint32_t kMaxShortSequence = 162; static const uint32_t kNumBytes = (kMaxShortSequence + 3) / 4 + 2; private: