Create new columns in sample_summary.tsv #25

Merged 7 commits on Jun 10, 2024
README.md (2 changes: 1 addition & 1 deletion)
@@ -138,7 +138,7 @@ CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
```console
nextflow run polkapox/main.nf --input {SAMPLESHEET.csv OR input_directory} --outdir {OUTDIR} --fasta {REF.fa} -profile sge,singularity --kraken_db {PATH/TO/DB} --gff {ANNOTATION.gff} --workflow {WORKFLOW} --filter {true/false}
```

**Note**: If you do not provide `--fasta`, `--gff`, or `--kraken_db`, they default to the reference and GFF in the `assets` folder of this repo and a Kraken database hosted on the SciComp file system, respectively. If you do not specify `--filter`, it defaults to `true`. See `nextflow.config` for details. Add `--file_levels {top (default)/nested}` if passing a directory as input. See [usage](/docs/usage.md) for details.
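
For example, a hypothetical run over a directory of top-level FASTQs that leans on those defaults (the input and output paths and the workflow name below are placeholders, not values from this repo) might look like:

```console
nextflow run polkapox/main.nf --input /path/to/fastq_dir --outdir results -profile sge,singularity --workflow full --file_levels top
```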

## Pipeline configuration
bin/create_samplesheet.py (6 changes: 3 additions & 3 deletions)
@@ -52,7 +52,7 @@ def parse_args():
required=False,
metavar="FILE_LEVELS",
help="Option for creating a sample sheet: 'nested' (default) for only nested files,\n"
"'all' for all files in the directory, 'top' for only top-level files"
"'top' for only top-level files"
)
return parser.parse_args()
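
As a rough illustration of what the two `--file_levels` modes expect (the directory and file names below are invented), `top` looks for FASTQs directly under the input directory, while `nested` looks one level down:

```console
input_dir/sampleA_R1.fastq.gz          # found with --file_levels top
input_dir/sampleA_R2.fastq.gz          # found with --file_levels top
input_dir/sampleB/sampleB_R1.fastq.gz  # found with --file_levels nested
input_dir/sampleB/sampleB_R2.fastq.gz  # found with --file_levels nested
```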

@@ -131,7 +131,7 @@ def list_samples(samples_dir, file_levels, single=False):
s_name = re.sub(r'_1$', '', s_name) # remove a _1 only if it occurs at end of filename
s_name = remove_id(s_name)
seqfiles[s_name] = sample_path
if file_levels == 'nested':
elif file_levels == 'nested':
# Check for fastq files nested one level down
subdir = os.path.join(samples_dir, filename)
if os.path.isdir(subdir):
@@ -199,7 +199,7 @@ def main():
sys.exit(1)
else:
logger.error(f"--file_levels must be 'nested' or 'top' ")

sys.exit(1)

# Get the list of samples
if args.single:
bin/summarize_qc.py (19 changes: 18 additions & 1 deletion)
@@ -25,6 +25,10 @@ def parse_args():
"--samplesheet",
metavar="SAMPLE_SHEET",
help="The path to the sample sheet"),
parser.add_argument(
"--project_outdir",
metavar="PROJECT_OUTDIR",
help="The path to the project output directory"),
parser.add_argument(
"--reference_genome",
metavar="REFERENCE_GENOME",
@@ -425,7 +429,7 @@ def main():
fixed_summary = fixed_summary.merge(contigs[['Sample', 'n_contigs_unicycler']],left_on='sample', right_on='Sample').drop('Sample', axis = 1)
else:
# If the Unicycler run failed, contig_files don't get created
logger.info(f"{contig_file} not found")
logger.info(f"{contig_files} not found")
fixed_summary['n_contigs_unicycler'] = 'NaN'

if args.workflow == 'ref_based' or args.workflow == 'full':
@@ -479,6 +483,19 @@ def main():
if args.filter == 'false':
summary_full = summary_full[['sample','reference_genome','total_raw_reads','opx_read_count_kraken','opx_percent_kraken','human_percent_kraken','unclass_percent_kraken','kraken_db','kraken_tax_ids','filtered_read_count_fastp','percent_reads_passed_fastp','percent_adapter_fastp','gc_content_postfilter_fastp','q30_rate_postfilter_fastp','percent_duplication_fastp','reads_mapped_bwa','percent_mapped_bwa','average_depth_bwa','count_20xdepth_bwa','n_contigs_unicycler','assembly_length_unicycler','n50_unicycler','mapped_reads_denovo','percent_mapped_denovo','orientation_copy_number','sequence_length','itr_length','gfa_status','gfa_notes','total_snps','corrected_snps','corrected_indels','corrected_Ns']]

# get the final paths for the seqtk output R1 and R2, and the final assembly
seqtk_outfile_pattern = f'{args.project_outdir}/seqtk/{sample}_{{}}.f[a,q].gz' # seqtk possible extensions are fq.gz or fa.gz

# Check for the files and assign to summary_full
for sample in summary_full['sample']:
# final assembly
final_assembly = f'{args.project_outdir}/final_assembly/{sample}.final.fa' # extension enforced by IVAR_CONSENSUS_POLISH + PUBLISH_CONTIGS
summary_full['final_assembly'] = final_assembly if final_assembly else None
# opxv reads
for i in [1, 2]:
seqtk_outfile = glob(seqtk_outfile_pattern.format(i))
summary_full[f'opxv_reads_{i}'] = seqtk_outfile[0] if seqtk_outfile else None
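
Note that the f-string pattern above is built once, before the loop, and that assigning to `summary_full['final_assembly']` or `summary_full[f'opxv_reads_{i}']` sets the whole column on each pass. A minimal per-row sketch of the same idea, assuming pandas, `glob.glob`, and the directory layout used above (this is an illustration, not the merged code):

```python
from glob import glob

# Sketch: resolve per-sample output paths and store them row by row.
# Assumes summary_full is a pandas DataFrame with a 'sample' column and
# args.project_outdir is the absolute pipeline output directory.
for sample in summary_full['sample']:
    row = summary_full['sample'] == sample

    # Final assembly path; extension enforced by IVAR_CONSENSUS_POLISH + PUBLISH_CONTIGS
    summary_full.loc[row, 'final_assembly'] = (
        f'{args.project_outdir}/final_assembly/{sample}.final.fa'
    )

    # OPXV reads from seqtk; extension can be fq.gz or fa.gz
    for i in [1, 2]:
        hits = glob(f'{args.project_outdir}/seqtk/{sample}_{i}.f[aq].gz')
        summary_full.loc[row, f'opxv_reads_{i}'] = hits[0] if hits else None
```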

summary_full.to_csv("sample_summary.tsv", sep = "\t", index = False)
logger.info(f"Summary results successfully written to sample_summary.tsv")

modules/local/summarize_qc.nf (3 changes: 3 additions & 0 deletions)
Collaborator (Author): Whoops. Sorry about that... blaming it on Friday. Thanks for fixing it.

@@ -15,11 +15,14 @@ process SUMMARIZE_QC {

script: // This script is bundled with the pipeline, in polkapox/bin/
def args = task.ext.args ?: ''
// Convert relative path to absolute path
def absolute_outdir = file(params.outdir)
"""
summarize_qc.py \\
--analysis_dir . \\
--samplesheet $samplesheet \\
--reference_genome ${params.fasta} \\
--project_outdir ${absolute_outdir} \\
--kraken_db ${params.kraken_db} \\
--kraken_tax_ids ${params.kraken2_tax_ids} \\
--filter ${params.filter} \\