From 621df572f259f8f614ba05795d41843f4dec586b Mon Sep 17 00:00:00 2001 From: mtisza1 Date: Tue, 24 May 2022 15:57:39 -0500 Subject: [PATCH] adding --exact-dtrs option --- apc_exact1.pl | 165 +++++++++++++++++++++++++++++++++++++++++++ cenote-taker2.1.5.sh | 10 ++- run_cenote-taker2.py | 3 +- 3 files changed, 175 insertions(+), 3 deletions(-) create mode 100644 apc_exact1.pl diff --git a/apc_exact1.pl b/apc_exact1.pl new file mode 100644 index 0000000..42fcec2 --- /dev/null +++ b/apc_exact1.pl @@ -0,0 +1,165 @@ +#!/bin/perl + +# AUTHOR: Joseph Fass +# +# The Bioinformatics Core at UC Davis Genome Center +# http://bioinformatics.ucdavis.edu +# Copyright (c) 2015 The Regents of The University of California, Davis Campus. +# All rights reserved. + +use Getopt::Std; + +my $usage = "\nusage: $0 [options] \n\n". + "Have you assembled [a] [p]erfect [c]ircle (e.g. plasmid, bacterial chromosome)? ". + "This script tests each sequence in the supplied fasta file for self-overlap. ". + "If overlap is found, the 5' copy of the overlapping sequence is trimmed, ". + "the ends joined, and the join moved (\"permuted\") to the middle of the output sequence. ". + "Join location is appended to the sequence header, to allow confirmation. ". + "If a multi-fasta file is provided, each sequence is tested for circularity ". + "and output separately. ". + "If reads are provided (e.g. PacBio corrected.fastq), ". + "BWA MEM is used to align reads to spliced sequence, for validation.\n". + "The binaries \'lastdb\', \'lastal\', and (with -r) ". + "\'bwa\' and \'samtools\' must be in your PATH.\n". + "\n". + "-b \tbasename for output files (alphanumeric and underscore characters)\n". + "-r \thigh accuracy long reads to use for validation\n". + "-c \tlastdb command\n". + "-d \tlastal command\n". + + "\n"; +our($opt_b,$opt_r,$opt_c,$opt_d); +getopts('r:b:c:d:') or die $usage; +if (!(defined($opt_b)) or !($opt_b=~m/^\w+$/)) {$opt_b="permuted"} +$fname = $opt_b; # works better in interpreted quotes +my $assemblyfile = shift or die $usage; + +# pull in (single-line) assembled molecules +open ASM, "<$assemblyfile"; +while ($line = ) { + if ($line =~ m/^>/) { + push @contigs, $block if !($block eq ''); # push 2-line contig onto list of contigs + $block = $line."\n"; # adds another newline, because it'll be chomped below + } + else { + chomp $block; # remove newline before adding more sequence + chomp $line; # remove newline on current (sequence) line + $block .= $line."\n"; # block always ends in a single newline + } + } +push @contigs, $block; # push last sequence block onto list +close ASM; + +# loop through each contig, perform LAST self-alignment, trim, output +$n = 0; # keeps track of sequence count +while (@contigs) { + #print "contig\t"; # debug + #print $#contigs + 1; # debug + #print "\n"; # debug + $block = shift(@contigs); + $n++; + open SEQ, ">temp.apc.fa"; + print SEQ $block; + close SEQ; + print "formatting LAST db ... "; + $command = "$opt_c temp.apc temp.apc.fa"; # format lastdb for current contig + system($command); + print "running LAST self-alignment ... "; + $command = "$opt_d -s 1 -x 300 -f 0 -T 1 temp.apc temp.apc.fa > temp.apc.lastal"; # self align current contig + system($command); + $command = "$opt_d -s 1 -x 300 -f BlastTab -T 1 temp.apc temp.apc.fa | grep -v \"^#\" > temp.apc.blasttab"; # self align current contig + system($command); + print "done\n"; + # pull in output of current contig's LAST self-alignment + open LAST, ") { + if (!($line =~ m/^#/)) { + push @alignment, $line; + } + } + close LAST; + # process alignment lines; should be three, with first being full self-alignment + # second and third lines should be near identical end-overlaps + if ($#alignment + 1 == 3) { + # check that first is full self alignment + #print "three alignments in LAST output!\n"; # debug + # check that 2nd and 3rd are near identical overlaps + @overlap5prime = split("\t", $alignment[1]); + @overlap3prime = split("\t", $alignment[2]); + #print $overlap5prime[0]."\n"; # debug + #print $overlap3prime[0]."\n"; # debug + if ($overlap5prime[0] == $overlap3prime[0]) { + #$ONEQ = system("sed 's/ \\+/ /g' temp.apc.maf | grep \"^s \" | cut -d \" \" -f7 | head -n3 | tail -n1"); + #$TWOQ = system("sed 's/ \\+/ /g' temp.apc.maf | grep \"^s \" | cut -d \" \" -f7 | head -n4 | tail -n1"); + my $IDENT = `cut -f3 temp.apc.blasttab | head -n2 | tail -n1 `; + #print $IDENT."\n"; + + my $HUND=100.00; + #print $HUND."\n"; + $EQQ = $IDENT eq $HUND; + #print $EQQ."\n"; + if ($IDENT == $HUND) { + $start5prime = $overlap5prime[2] + 1; # 0-based! + $end5prime = $start5prime + $overlap5prime[3] - 1; + $start3prime = $overlap3prime[2] + 1; # 0-based! + $end3prime = $start3prime + $overlap3prime[3] - 1; + print "Overlap detected between "; + print $overlap5prime[1].":".$start5prime."-".$end5prime; + print " and "; + print $overlap3prime[1].":".$start3prime."-".$end3prime."\n"; + + # trim one overlap, then permute halfway + $block =~ m/^(.*\n)(.*)$/; + $header = $1; + chomp $header; + $sequence = $2; + chomp $sequence; + $trimmedSeq = substr($sequence, $overlap5prime[3]); # 0-, 1-based trickery + $mid = int( length($trimmedSeq) / 2 ); # midpoint, to permute around + $newSeq = substr($trimmedSeq, $mid) . substr($trimmedSeq, 0, $mid) . "\n"; + open OUT, ">$fname.$n.fa"; + chomp $block; + print OUT $header."|join_near_$mid\n"; + print OUT $newSeq."\n"; + close OUT; + open OUT2, ">$overlap5prime[1].DTR.tbl"; + print OUT2 ">Feature $overlap5prime[1] Table1"; + print OUT2 "\n".$start5prime."\t".$end5prime."\trepeat_region"; + print OUT2 "\n\t\t\trpt_type\tDTR"; + print OUT2 "\n".$start3prime."\t".$end3prime."\trepeat_region"; + print OUT2 "\n\t\t\trpt_type\tDTR\n"; + close OUT2; + if (defined($opt_r)) { + # use bwa mem to align reads to permuted sequence + $command = "bwa index $fname.$n.fa 2> $fname.$n.aln.log"; + print "indexing permuted sequence ... "; + system($command); + $command = "bwa mem -M $fname.$n.fa $opt_r 2>> $fname.$n.aln.log | gzip > $fname.$n.sam.gz"; + print "aligning reads to permuted sequence ... "; + system($command); + $command = "samtools view -uhS $fname.$n.sam.gz 2>> $fname.$n.aln.log | samtools sort - $fname.$n 2>> $fname.$n.aln.log"; + print "sorting BAM file of aligned reads ... "; + system($command); + $command = "samtools index $fname.$n.bam 2>> $fname.$n.aln.log"; + print "indexing BAM file ... "; + system($command); + print "done\n"; + } + } + + } + else { + system("mv temp.apc.lastal apc_aln_$n.txt"); + print "irregular LAST results ... check local file apc_aln_$n.txt\n"; + } + } + else { + system("mv temp.apc.lastal apc_aln_$n.txt"); + print "irregular LAST results ... check local file apc_aln_$n.txt\n"; + } +} + +# clean up temp files! +system("rm temp.apc*"); + diff --git a/cenote-taker2.1.5.sh b/cenote-taker2.1.5.sh index 92a58aa..0f69b42 100644 --- a/cenote-taker2.1.5.sh +++ b/cenote-taker2.1.5.sh @@ -59,6 +59,7 @@ base_directory=$PWD CENOTE_DBS=${35} WRAP=${36} HALLMARK_TAX=${37} +EXACT=${38} if [ "$ANNOTATION_MODE" == "True" ] ; then LIN_MINIMUM_DOMAINS=0 @@ -102,10 +103,11 @@ echo "Location of Cenote scripts: $CENOTE_SCRIPT_DIR" echo "Location of scratch directory: $SCRATCH_DIR" echo "GB of memory: $MEM" echo "number of CPUs available for run: $CPU" -echo "Annotation only mode? $ANNOTATION_MODE" +echo "Annotation only mode? $ANNOTATION_MODE" echo "Cenote-Taker 2 DBs directory: $CENOTE_DBS" echo "Wrap circular contigs?: $WRAP" echo "Taxonomy for each hallmark?: $HALLMARK_TAX" +echo "Exact match DTRs?: $EXACT" echo "@@@@@@@@@@@@@@@@@@@@@@@@@" #checking validity of run_title @@ -303,7 +305,11 @@ if [ ${original_contigs: -6} == ".fasta" ]; then bioawk -v run_var="$run_title" -v contig_cutoff="$LENGTH_MINIMUM" -c fastx '{ if(length($seq) > contig_cutoff) { print ">"run_var NR" "$name; print $seq }}' $original_contigs > ${original_contigs%.fasta}.over_${LENGTH_MINIMUM}nt.fasta ; cd $run_title echo "cenote_shortcut" > ${run_title}_CONTIG_SUMMARY.tsv - perl ${CENOTE_SCRIPT_DIR}/apc_cenote1.pl -b $run_title -c lastdb -d lastal ../${original_contigs%.fasta}.over_${LENGTH_MINIMUM}nt.fasta >/dev/null 2>&1 + if [ "$EXACT" == "True" ] ; then + perl ${CENOTE_SCRIPT_DIR}/apc_exact1.pl -b $run_title -c lastdb -d lastal ../${original_contigs%.fasta}.over_${LENGTH_MINIMUM}nt.fasta >/dev/null 2>&1 + else + perl ${CENOTE_SCRIPT_DIR}/apc_cenote1.pl -b $run_title -c lastdb -d lastal ../${original_contigs%.fasta}.over_${LENGTH_MINIMUM}nt.fasta >/dev/null 2>&1 + fi find . -type f -name "apc_aln*" -exec rm -f {} \; APC_CIRCS=$( find . -maxdepth 1 -type f -name "${run_title}*.fa" ) if [ -n "$APC_CIRCS" ] ;then diff --git a/run_cenote-taker2.py b/run_cenote-taker2.py index f7bbf98..7cc868f 100644 --- a/run_cenote-taker2.py +++ b/run_cenote-taker2.py @@ -65,6 +65,7 @@ def str2bool(v): optional_args.add_argument("--cenote-dbs", dest="CENOTE_DBS", type=str, default=cenote_script_path, help='Default: cenote_script_path -- If you downloaded and setup the databases in a non-standard location, specify path') optional_args.add_argument("--wrap", dest="WRAP", type=str2bool, default="True", help='Default: True -- Wrap/rotate DTR/circular contigs so the start codon of an ORF is the first nucleotide in the contig/genome') optional_args.add_argument("--hallmark_taxonomy", dest="HALLMARK_TAX", type=str2bool, default="False", help='Default: False -- Get hierarchical taxonomy information for all hallmark genes? This report (*.hallmarks.taxonomy.out) is not considered in the final taxonomy call.') +optional_args.add_argument("--exact_dtrs", dest="EXACT", type=str2bool, default="False", help='Default: False -- Only record DTRs if each ends matches with 100% ID.') @@ -171,5 +172,5 @@ def is_tool(name): else: print ("lastal/lastdb is not found. Exiting. You may have to manually install last using: conda install -y -n cenote-taker2_env -c conda-forge -c bioconda last=1282") quit() -subprocess.call(['bash', str(cenote_script_path) + '/cenote-taker2.1.5.sh', str(args.original_contigs), str(args.F_READS), str(args.R_READS), str(args.run_title), str(args.isolation_source), str(args.collection_date), str(args.metagenome_type), str(args.srr_number), str(args.srx_number), str(args.biosample), str(args.bioproject), str(args.template_file), str(args.circ_length_cutoff), str(args.linear_length_cutoff), str(args.virus_domain_db), str(args.LIN_MINIMUM_DOMAINS), str(args.handle_knowns), str(args.ASSEMBLER), str(args.MOLECULE_TYPE), str(args.HHSUITE_TOOL), str(args.DATA_SOURCE), str(args.BLASTP), str(args.PROPHAGE), str(args.FILTER_PLASMIDS), str(args.BLASTN_DB), str(cenote_script_path), str(args.CIRC_MINIMUM_DOMAINS), str(args.SCRATCH_DIR), str(args.MEM), str(args.CPU), str(args.ENFORCE_START_CODON), str(args.ORF_WITHIN), str(args.ANNOTATION_MODE), str(args.CRISPR_FILE), str(args.CENOTE_DBS), str(args.WRAP), str(args.HALLMARK_TAX)]) +subprocess.call(['bash', str(cenote_script_path) + '/cenote-taker2.1.5.sh', str(args.original_contigs), str(args.F_READS), str(args.R_READS), str(args.run_title), str(args.isolation_source), str(args.collection_date), str(args.metagenome_type), str(args.srr_number), str(args.srx_number), str(args.biosample), str(args.bioproject), str(args.template_file), str(args.circ_length_cutoff), str(args.linear_length_cutoff), str(args.virus_domain_db), str(args.LIN_MINIMUM_DOMAINS), str(args.handle_knowns), str(args.ASSEMBLER), str(args.MOLECULE_TYPE), str(args.HHSUITE_TOOL), str(args.DATA_SOURCE), str(args.BLASTP), str(args.PROPHAGE), str(args.FILTER_PLASMIDS), str(args.BLASTN_DB), str(cenote_script_path), str(args.CIRC_MINIMUM_DOMAINS), str(args.SCRATCH_DIR), str(args.MEM), str(args.CPU), str(args.ENFORCE_START_CODON), str(args.ORF_WITHIN), str(args.ANNOTATION_MODE), str(args.CRISPR_FILE), str(args.CENOTE_DBS), str(args.WRAP), str(args.HALLMARK_TAX), str(args.EXACT)])