Create new columns in sample_summary.tsv #25

Merged 7 commits on Jun 10, 2024
README.md (2 changes: 1 addition & 1 deletion)
@@ -138,7 +138,7 @@ CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
```console
nextflow run polkapox/main.nf --input {SAMPLESHEET.csv OR input_directory} --outdir {OUTDIR} --fasta {REF.fa} -profile sge,singularity --kraken_db {PATH/TO/DB} --gff {ANNOTATION.gff} --workflow {WORKFLOW} --filter {true/false}
```

**Note**: If you do not provide `--fasta`, `--gff`, or `--kraken_db`, they default to the reference and GFF in the `assets` folder of this repo and a Kraken database hosted on the SciComp file system, respectively. If you do not specify `--filter`, it defaults to `true`. See `nextflow.config` for details. Add `--file_levels {top (default)/nested}` if passing a directory as input. See [usage](/docs/usage.md) for details.
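
For example, a hypothetical run over a directory of top-level FASTQs that leans on those defaults (the input and output paths and the workflow name below are placeholders, not values from this repo) might look like:

```console
nextflow run polkapox/main.nf --input /path/to/fastq_dir --outdir results -profile sge,singularity --workflow full --file_levels top
```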

## Pipeline configuration
bin/create_samplesheet.py (6 changes: 3 additions & 3 deletions)
@@ -52,7 +52,7 @@ def parse_args():
required=False,
metavar="FILE_LEVELS",
help="Option for creating a sample sheet: 'nested' (default) for only nested files,\n"
"'all' for all files in the directory, 'top' for only top-level files"
"'top' for only top-level files"
)
return parser.parse_args()
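
As a rough illustration of what the two `--file_levels` modes expect (the directory and file names below are invented), `top` looks for FASTQs directly under the input directory, while `nested` looks one level down:

```console
input_dir/sampleA_R1.fastq.gz          # found with --file_levels top
input_dir/sampleA_R2.fastq.gz          # found with --file_levels top
input_dir/sampleB/sampleB_R1.fastq.gz  # found with --file_levels nested
input_dir/sampleB/sampleB_R2.fastq.gz  # found with --file_levels nested
```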

@@ -131,7 +131,7 @@ def list_samples(samples_dir, file_levels, single=False):
s_name = re.sub(r'_1$', '', s_name) # remove a _1 only if it occurs at end of filename
s_name = remove_id(s_name)
seqfiles[s_name] = sample_path
if file_levels == 'nested':
elif file_levels == 'nested':
# Check for fastq files nested one level down
subdir = os.path.join(samples_dir, filename)
if os.path.isdir(subdir):
@@ -199,7 +199,7 @@ def main():
sys.exit(1)
else:
logger.error(f"--file_levels must be 'nested' or 'top' ")

sys.exit(1)

# Get the list of samples
if args.single:
bin/summarize_qc.py (19 changes: 18 additions & 1 deletion)
@@ -25,6 +25,10 @@ def parse_args():
"--samplesheet",
metavar="SAMPLE_SHEET",
help="The path to the sample sheet"),
parser.add_argument(
"--project_outdir",
metavar="PROJECT_OUTDIR",
help="The path to the project output directory"),
parser.add_argument(
"--reference_genome",
metavar="REFERENCE_GENOME",
@@ -425,7 +429,7 @@ def main():
fixed_summary = fixed_summary.merge(contigs[['Sample', 'n_contigs_unicycler']],left_on='sample', right_on='Sample').drop('Sample', axis = 1)
else:
# If the Unicycler run failed, contig_files don't get created
logger.info(f"{contig_file} not found")
logger.info(f"{contig_files} not found")
fixed_summary['n_contigs_unicycler'] = 'NaN'

if args.workflow == 'ref_based' or args.workflow == 'full':
@@ -479,6 +483,19 @@ def main():
if args.filter == 'false':
summary_full = summary_full[['sample','reference_genome','total_raw_reads','opx_read_count_kraken','opx_percent_kraken','human_percent_kraken','unclass_percent_kraken','kraken_db','kraken_tax_ids','filtered_read_count_fastp','percent_reads_passed_fastp','percent_adapter_fastp','gc_content_postfilter_fastp','q30_rate_postfilter_fastp','percent_duplication_fastp','reads_mapped_bwa','percent_mapped_bwa','average_depth_bwa','count_20xdepth_bwa','n_contigs_unicycler','assembly_length_unicycler','n50_unicycler','mapped_reads_denovo','percent_mapped_denovo','orientation_copy_number','sequence_length','itr_length','gfa_status','gfa_notes','total_snps','corrected_snps','corrected_indels','corrected_Ns']]

# get the final paths for the seqtk output R1 and R2, and the final assembly
seqtk_outfile_pattern = f'{args.project_outdir}/seqtk/{sample}_{{}}.f[a,q].gz' # seqtk possible extensions are fq.gz or fa.gz

# Check for the files and assign to summary_full
for sample in summary_full['sample']:
# final assembly
final_assembly = f'{args.project_outdir}/final_assembly/{sample}.final.fa' # extension enforced by IVAR_CONSENSUS_POLISH + PUBLISH_CONTIGS
summary_full['final_assembly'] = final_assembly if final_assembly else None
# opxv reads
for i in [1, 2]:
seqtk_outfile = glob(seqtk_outfile_pattern.format(i))
summary_full[f'opxv_reads_{i}'] = seqtk_outfile[0] if seqtk_outfile else None
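
Note that the f-string pattern above is built once, before the loop, and that assigning to `summary_full['final_assembly']` or `summary_full[f'opxv_reads_{i}']` sets the whole column on each pass. A minimal per-row sketch of the same idea, assuming pandas, `glob.glob`, and the directory layout used above (this is an illustration, not the merged code):

```python
from glob import glob

# Sketch: resolve per-sample output paths and store them row by row.
# Assumes summary_full is a pandas DataFrame with a 'sample' column and
# args.project_outdir is the absolute pipeline output directory.
for sample in summary_full['sample']:
    row = summary_full['sample'] == sample

    # Final assembly path; extension enforced by IVAR_CONSENSUS_POLISH + PUBLISH_CONTIGS
    summary_full.loc[row, 'final_assembly'] = (
        f'{args.project_outdir}/final_assembly/{sample}.final.fa'
    )

    # OPXV reads from seqtk; extension can be fq.gz or fa.gz
    for i in [1, 2]:
        hits = glob(f'{args.project_outdir}/seqtk/{sample}_{i}.f[aq].gz')
        summary_full.loc[row, f'opxv_reads_{i}'] = hits[0] if hits else None
```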

summary_full.to_csv("sample_summary.tsv", sep = "\t", index = False)
logger.info(f"Summary results successfully written to sample_summary.tsv")

modules/local/summarize_qc.nf (3 changes: 3 additions & 0 deletions)
Collaborator (Author): Whoops. Sorry about that... blaming it on Friday. Thanks for fixing it.

@@ -15,11 +15,14 @@ process SUMMARIZE_QC {

script: // This script is bundled with the pipeline, in polkapox/bin/
def args = task.ext.args ?: ''
// Convert relative path to absolute path
def absolute_outdir = file(params.outdir)
"""
summarize_qc.py \\
--analysis_dir . \\
--samplesheet $samplesheet \\
--reference_genome ${params.fasta} \\
--project_outdir ${absolute_outdir} \\
--kraken_db ${params.kraken_db} \\
--kraken_tax_ids ${params.kraken2_tax_ids} \\
--filter ${params.filter} \\