🐛 🎨 🍏 Fix kingdom-handling and mounting NCBI databases into docker container #229

Merged 12 commits on Jan 30, 2022

16 changes: 14 additions & 2 deletions autometa/binning/recursive_dbscan.py
@@ -878,7 +878,6 @@ def main():

logger.info(f"Selected clustering method: {args.clustering_method}")

# Perform clustering w/o taxonomy
if args.taxonomy:
main_out = taxon_guided_binning(
main=main_df,
@@ -894,6 +893,7 @@ def main():
verbose=args.verbose,
)
else:
# Perform clustering w/o taxonomy
main_out = get_clusters(
main=main_df,
markers_df=markers_df,
@@ -914,4 +914,16 @@ def main():


if __name__ == "__main__":
main()
import sys

# Using an http error status code...
# From: https://kinsta.com/blog/http-status-codes/#200-status-codes
# 204: “No Content.”
# This code means that the server has successfully processed the request
# but is not going to return any content.

try:
main()
except (TableFormatError, BinningError) as err:
logger.warn(err)
sys.exit(204)
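The new `__main__` guard converts expected binning failures (`TableFormatError`, `BinningError`) into exit status 204 ("No Content"), which the Nextflow `BINNING` process further down ignores via its `errorStrategy`, so a sample with no recoverable bins no longer terminates the whole pipeline. Below is a minimal sketch of how a caller could honor that convention, assuming the console script is named `autometa-binning` (flags elided; see the argparse options defined in `main()`):

```python
import subprocess
import sys

def run_binning(cmd: list) -> None:
    """Illustrative wrapper around the exit-code convention above:
    0 -> binning produced results, 204 -> ran fine but nothing to report,
    anything else -> a genuine failure."""
    proc = subprocess.run(cmd)
    if proc.returncode == 0:
        print("binning finished with results")
    elif proc.returncode == 204:
        print("no bins recovered; downstream steps may safely skip this sample")
    else:
        sys.exit(proc.returncode)

# Hypothetical invocation (the real flags are defined in main()'s argparse):
# run_binning(["autometa-binning"])
```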
2 changes: 1 addition & 1 deletion autometa/binning/summary.py
@@ -186,7 +186,7 @@ def get_metabin_stats(

df = bin_df[metabin_stat_cols].fillna(value={cluster_col: "unclustered"}).copy()

clusters = df.join(markers_df, how="outer").groupby("cluster")
clusters = df.join(markers_df, how="outer").groupby(cluster_col)

percent_metagenome_size = clusters.length.sum() / df.length.sum() * 100
percent_metagenome_seqs = clusters.size() / df.shape[0] * 100
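This one-line fix matters whenever the binning column is not literally named `cluster`, e.g. if the summary is run with `binning_column` set to `recruited_cluster` after unclustered recruitment; grouping by the hard-coded name would raise a `KeyError`. A toy pandas sketch (made-up contigs and a single marker column) of the corrected grouping:

```python
import pandas as pd

# Made-up data illustrating why the groupby key must match cluster_col.
cluster_col = "recruited_cluster"  # e.g. summarizing unclustered-recruitment results
bin_df = pd.DataFrame(
    {"length": [4000, 6000, 1000], cluster_col: ["bin_0001", "bin_0001", None]},
    index=pd.Index(["contig_1", "contig_2", "contig_3"], name="contig"),
)
markers_df = pd.DataFrame({"PF00119.20": [1, 0, 1]}, index=bin_df.index)

df = bin_df.fillna(value={cluster_col: "unclustered"}).copy()
clusters = df.join(markers_df, how="outer").groupby(cluster_col)  # groupby("cluster") would KeyError here
percent_metagenome_size = clusters.length.sum() / df.length.sum() * 100
percent_metagenome_seqs = clusters.size() / df.shape[0] * 100
print(percent_metagenome_size)  # bin_0001 ~90.9%, unclustered ~9.1%
```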
43 changes: 29 additions & 14 deletions autometa/binning/unclustered_recruitment.py
@@ -58,7 +58,6 @@
from sklearn.tree import DecisionTreeClassifier

from autometa.common.markers import load as load_markers
from autometa.common.exceptions import BinningError


logger = logging.getLogger(__name__)
@@ -239,7 +238,7 @@ def train_test_split_and_subset(

Returns
-------
(TrainingData, pd.DataFrame)
Tuple(TrainingData, pd.DataFrame)
0th: Features and bin labels split and subset by clustered/unclustered contigs
1st: Unclustered contigs features
"""
@@ -251,7 +250,7 @@
labels = get_labels(clustered)
# Finally retrieve features for the subset of clustered contigs
clustered_features_index = features.index.isin(clustered.index)
clustered_features = features[clustered_features_index]
clustered_features = features.loc[clustered_features_index].copy()
# Store features, targets and target_names in TrainingData for namespace lookup later.
train_data = TrainingData(
features=clustered_features,
@@ -261,7 +260,7 @@
# Now retrieve features for unclustered contigs
unclustered = binning[binning.cluster.isnull()]
unclustered_features_index = features.index.isin(unclustered.index)
unclustered_features = features[unclustered_features_index]
unclustered_features = features.loc[unclustered_features_index].copy()
return train_data, unclustered_features


@@ -518,6 +517,11 @@ def main():
help="Path to write Autometa main table used during/after unclustered recruitment.",
required=False,
)
parser.add_argument(
"--output-features",
help="Path to write Autometa features table used during unclustered recruitment.",
required=False,
)
parser.add_argument("--taxonomy", help="Path to taxonomy table.")
parser.add_argument(
"--taxa-dimensions",
@@ -576,7 +580,7 @@ def main():
prev_num_unclustered = bin_df[bin_df.cluster.isnull()].shape[0]
if not prev_num_unclustered:
logger.warning("No unclustered contigs are available to recruit!")
sys.exit(0)
sys.exit(204)
markers_df = load_markers(fpath=args.markers, format="wide")

logger.debug(
@@ -619,22 +623,33 @@ def main():
f"unclustered {prev_num_unclustered} -> {now_num_unclustered} (recruited {n_recruited} contigs) in {n_runs} runs"
)
# Re-read in the binning dataframe to merge with the newly recruited contigs
prev_bin_df = pd.read_csv(
args.binning, sep="\t", index_col="contig", usecols=["contig", "cluster"]
)
prev_bin_df = pd.read_csv(args.binning, sep="\t", index_col="contig")
bin_df.rename(columns={"cluster": "recruited_cluster"}, inplace=True)
main_df = pd.merge(
prev_bin_df, bin_df[["recruited_cluster"]], left_index=True, right_index=True
binning_df = pd.merge(
prev_bin_df[["cluster"]],
bin_df[["recruited_cluster"]],
left_index=True,
right_index=True,
)
# Write unclustered recruitment results into main bin df
# index = 'contig', cols = [..., 'cluster', 'recruited_cluster', ...]
main_df.to_csv(
# Write unclustered recruitment results into binning df
# index = 'contig', cols = ['cluster', 'recruited_cluster']
binning_df.to_csv(
args.output_binning, sep="\t", index=True, header=True, float_format="%.5f"
)
if args.output_main:
main_df = pd.merge(
prev_bin_df,
bin_df[["recruited_cluster"]],
left_index=True,
right_index=True,
)
main_df.to_csv(
args.output_main, sep="\t", index=True, header=True, float_format="%.5f"
)
if args.output_features:
# Outputs features matrix used as input to recruitment algorithm
features.to_csv(
args.output_main, sep="\t", index=True, header=True, float_format="%.5f"
args.output_features, sep="\t", index=True, header=True, float_format="%.5f"
)


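With these changes, the table written to `--output-binning` is trimmed to the contig index plus the `cluster` and `recruited_cluster` columns; the full merged table is only produced when `--output-main` is requested; and the features matrix now goes to the new `--output-features` path instead of overwriting the `--output-main` file. A toy pandas sketch (made-up contigs) of the merge behind the binning output:

```python
import pandas as pd

# Toy illustration of the table written to --output-binning:
# index = 'contig', cols = ['cluster', 'recruited_cluster']
prev_bin_df = pd.DataFrame(
    {"cluster": ["bin_0001", None, None], "coverage": [12.3, 8.1, 9.9]},
    index=pd.Index(["contig_1", "contig_2", "contig_3"], name="contig"),
)
# bin_df holds the recruitment results ('cluster' already renamed to 'recruited_cluster')
bin_df = pd.DataFrame(
    {"recruited_cluster": ["bin_0001", "bin_0001", None]}, index=prev_bin_df.index
)
binning_df = pd.merge(
    prev_bin_df[["cluster"]],
    bin_df[["recruited_cluster"]],
    left_index=True,
    right_index=True,
)
print(binning_df.columns.tolist())  # ['cluster', 'recruited_cluster']
```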
48 changes: 36 additions & 12 deletions autometa/common/kmers.py
@@ -621,7 +621,6 @@ def main():
datefmt="%m/%d/%Y %I:%M:%S %p",
level=logger.DEBUG,
)
skip_desc = "(will skip if file exists)"
cpus = mp.cpu_count()
parser = argparse.ArgumentParser(
description="Count k-mer frequencies of given `fasta`",
@@ -631,20 +630,18 @@
"--fasta",
help="Metagenomic assembly fasta file",
metavar="filepath",
required=True,
)
parser.add_argument(
"--kmers",
help=f"K-mers frequency tab-delimited table {skip_desc}",
help=f"K-mers frequency tab-delimited table (will skip if file exists)",
metavar="filepath",
required=True,
)
parser.add_argument(
"--size", help="k-mer size in bp", default=5, metavar="int", type=int
)
parser.add_argument(
"--norm-output",
help=f"Path to normalized kmers table {skip_desc}",
help=f"Path to normalized kmers table (will skip if file exists)",
metavar="filepath",
)
parser.add_argument(
@@ -666,7 +663,7 @@
)
parser.add_argument(
"--embedding-output",
help=f"Path to write embedded kmers table {skip_desc}",
help=f"Path to write embedded kmers table (will skip if file exists)",
metavar="filepath",
)
parser.add_argument(
Expand Down Expand Up @@ -704,23 +701,50 @@ def main():
)
args = parser.parse_args()

if os.path.exists(args.kmers) and not args.force:
df = pd.read_csv(args.kmers, sep="\t", index_col="contig")
if not args.fasta and not args.kmers and not args.norm_output:
raise ValueError(
"At least one of --fasta, --kmers or --norm-output are required!"
)

norm_df = pd.DataFrame()

if (
args.norm_output
and not os.path.exists(args.norm_output)
and not args.fasta
and not args.kmers
):
# only normalized kmers were provided
raise FileNotFoundError(args.norm_output)
elif args.kmers and not os.path.exists(args.kmers) and not args.fasta:
# only kmer counts were provided
raise FileNotFoundError(args.kmers)
elif args.norm_output and os.path.exists(args.norm_output) and not args.force:
# We already have the normalized kmers
norm_df = pd.read_csv(args.norm_output, sep="\t", index_col="contig")
elif args.kmers and os.path.exists(args.kmers) and not args.force:
# We already have the kmer counts
kmers_df = pd.read_csv(args.kmers, sep="\t", index_col="contig")
else:
df = count(
# Start with counting kmers
kmers_df = count(
assembly=args.fasta,
size=args.size,
out=args.kmers,
force=args.force,
cpus=args.cpus,
)

if args.norm_output:
df = normalize(
df=df, method=args.norm_method, out=args.norm_output, force=args.force
if args.norm_output and norm_df.empty:
norm_df = normalize(
df=kmers_df,
method=args.norm_method,
out=args.norm_output,
force=args.force,
)

if args.embedding_output:
df = kmers_df if norm_df.empty else norm_df
embedded_df = embed(
kmers=df,
out=args.embedding_output,
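Spelled out, the reworked argument handling lets the k-mer step resume from whichever artifact already exists (normalized table, then raw counts, then the assembly) and fail fast when the only requested starting point is missing; with `--force`, the branching falls through to re-counting from the assembly. A simplified, hypothetical restatement of that precedence as a standalone function (not part of the Autometa API):

```python
import os

def resolve_kmer_start(fasta=None, kmers=None, norm_output=None, force=False):
    """Return which artifact the step above would start from (illustration only)."""
    if not fasta and not kmers and not norm_output:
        raise ValueError("At least one of --fasta, --kmers or --norm-output are required!")
    if norm_output and not os.path.exists(norm_output) and not fasta and not kmers:
        raise FileNotFoundError(norm_output)  # only normalized kmers requested, but missing
    if kmers and not os.path.exists(kmers) and not fasta:
        raise FileNotFoundError(kmers)  # only kmer counts requested, but missing
    if norm_output and os.path.exists(norm_output) and not force:
        return "existing normalized table"  # reuse args.norm_output
    if kmers and os.path.exists(kmers) and not force:
        return "existing kmer counts"  # reuse args.kmers, normalize if requested
    return "count k-mers from the assembly"  # fall through to count(assembly=args.fasta, ...)
```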
23 changes: 15 additions & 8 deletions conf/modules.config
@@ -21,17 +21,17 @@
*/
params {
modules {
'analyze_kmers_options' {
'count_kmers_options' {
publish_by_meta = ['id']
publish_dir = "kmer_analysis"
publish_dir = "count_kmer_analysis"
}
'binning_options' {
'normalize_kmers_options' {
publish_by_meta = ['id']
publish_dir = "binning_results/bins"
publish_dir = "normalize_kmer_analysis"
}
'binning_summary_options' {
'embed_kmers_options' {
publish_by_meta = ['id']
publish_dir = "binning_results/binning_summary"
publish_dir = "embed_kmer_analysis"
}
'diamond_blastp_options' {
args = "--evalue 1e-5 --max-target-seqs 200 -b 6 --outfmt 6"
@@ -112,10 +112,17 @@ params {
'taxon_assignment' {
publish_by_meta = ['id']
}
'binning_options' {
publish_by_meta = ['id']
publish_dir = "binning"
}
'unclustered_recruitment_options' {
publish_by_meta = ['id']
publish_dir = "binning_results/unclustered_recruitment_results"

publish_dir = "unclustered_recruitment"
}
'binning_summary_options' {
publish_by_meta = ['id']
publish_dir = "binning_summary"
}
}
}
6 changes: 0 additions & 6 deletions modules.json
@@ -6,12 +6,6 @@
"bowtie2/align": {
"git_sha": "e937c7950af70930d1f34bb961403d9d2aa81c7d"
},
"fastqc": {
"git_sha": "e937c7950af70930d1f34bb961403d9d2aa81c7d"
},
"multiqc": {
"git_sha": "e937c7950af70930d1f34bb961403d9d2aa81c7d"
},
"prodigal": {
"git_sha": "e937c7950af70930d1f34bb961403d9d2aa81c7d"
}
7 changes: 5 additions & 2 deletions modules/local/bin_contigs.nf → modules/local/binning.nf
@@ -4,8 +4,8 @@ include { initOptions; saveFiles; getSoftwareName } from './functions'
params.options = [:]
options = initOptions(params.options)

process BIN_CONTIGS {
tag "Performing Autometa binning on ${meta.id}"
process BINNING {
tag "sample:${meta.id}, clustering:${params.clustering_method}, completeness:${params.completeness}, purity:${params.purity}, cov.std.dev.:${params.cov_stddev_limit}, gc.std.dev.:${params.gc_stddev_limit}"
label 'process_high'
publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode

@@ -16,6 +16,9 @@ process BIN_CONTIGS {
container "jasonkwan/autometa:${params.autometa_image_tag}"
}

// No markers were annotated for contigs in the table
errorStrategy { task.exitStatus in 204 ? 'ignore' : 'terminate' }

input:
tuple val(meta), path(kmers), path(coverage), path(gc_content), path(markers), path(taxonomy)

5 changes: 2 additions & 3 deletions modules/local/binning_summary.nf
@@ -4,8 +4,6 @@ include { initOptions; saveFiles; getSoftwareName } from './functions'
params.options = [:]
options = initOptions(params.options)

params.taxdump_tar_gz_dir = [:]

process BINNING_SUMMARY {
tag "Gathering binning summary for ${meta.id}"
label 'process_high'
@@ -22,6 +20,7 @@ process BINNING_SUMMARY {
input:
tuple val(meta), path(binning_main), path(markers), path(metagenome)
val(binning_column)
path(ncbi)

output:
tuple val(meta), path("metabin_stats.tsv") , emit: stats
@@ -33,7 +32,7 @@ process BINNING_SUMMARY {
def software = getSoftwareName(task.process)
"""
autometa-binning-summary \\
--ncbi ${params.taxdump_tar_gz_dir} \\
--ncbi $ncbi \\
--binning-main $binning_main \\
--markers $markers \\
--metagenome $metagenome \\
38 changes: 38 additions & 0 deletions modules/local/count_kmers.nf
@@ -0,0 +1,38 @@
// Import generic module functions
include { initOptions; saveFiles; getSoftwareName } from './functions'

params.options = [:]
options = initOptions(params.options)

process COUNT_KMERS {
tag "Counting ${params.kmer_size}-mers for ${meta.id}"
label 'process_medium'
publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode

conda (params.enable_conda ? "autometa" : null)
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "https://depot.galaxyproject.org/singularity/autometa"
} else {
container "jasonkwan/autometa:${params.autometa_image_tag}"
}

input:
tuple val(meta), path(metagenome)

output:
tuple val(meta), path("kmers.tsv") , emit: counts
path '*.version.txt' , emit: version

script:
def software = getSoftwareName(task.process)
"""
autometa-kmers \\
--fasta $metagenome \\
--kmers "kmers.tsv" \\
--size "${params.kmer_size}" \\
--cpus "${task.cpus}" \\
--seed 42

echo "TODO" > autometa.version.txt
"""
}