Skip to content

Commit

Permalink
adding --exact-dtrs option
Browse files Browse the repository at this point in the history
  • Loading branch information
mtisza1 committed May 24, 2022
1 parent d9e6a82 commit 621df57
Show file tree
Hide file tree
Showing 3 changed files with 175 additions and 3 deletions.
165 changes: 165 additions & 0 deletions apc_exact1.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
#!/bin/perl

# AUTHOR: Joseph Fass
#
# The Bioinformatics Core at UC Davis Genome Center
# http://bioinformatics.ucdavis.edu
# Copyright (c) 2015 The Regents of The University of California, Davis Campus.
# All rights reserved.

use Getopt::Std;

my $usage = "\nusage: $0 [options] <fasta>\n\n".
"Have you assembled [a] [p]erfect [c]ircle (e.g. plasmid, bacterial chromosome)? ".
"This script tests each sequence in the supplied fasta file for self-overlap. ".
"If overlap is found, the 5' copy of the overlapping sequence is trimmed, ".
"the ends joined, and the join moved (\"permuted\") to the middle of the output sequence. ".
"Join location is appended to the sequence header, to allow confirmation. ".
"If a multi-fasta file is provided, each sequence is tested for circularity ".
"and output separately. ".
"If reads are provided (e.g. PacBio corrected.fastq), ".
"BWA MEM is used to align reads to spliced sequence, for validation.\n".
"The binaries \'lastdb\', \'lastal\', and (with -r) ".
"\'bwa\' and \'samtools\' must be in your PATH.\n".
"\n".
"-b <text>\tbasename for output files (alphanumeric and underscore characters)\n".
"-r <fastq>\thigh accuracy long reads to use for validation\n".
"-c <text>\tlastdb command\n".
"-d <text>\tlastal command\n".

"\n";
our($opt_b,$opt_r,$opt_c,$opt_d);
getopts('r:b:c:d:') or die $usage;
if (!(defined($opt_b)) or !($opt_b=~m/^\w+$/)) {$opt_b="permuted"}
$fname = $opt_b; # works better in interpreted quotes
my $assemblyfile = shift or die $usage;

# pull in (single-line) assembled molecules
open ASM, "<$assemblyfile";
while ($line = <ASM>) {
if ($line =~ m/^>/) {
push @contigs, $block if !($block eq ''); # push 2-line contig onto list of contigs
$block = $line."\n"; # adds another newline, because it'll be chomped below
}
else {
chomp $block; # remove newline before adding more sequence
chomp $line; # remove newline on current (sequence) line
$block .= $line."\n"; # block always ends in a single newline
}
}
push @contigs, $block; # push last sequence block onto list
close ASM;

# loop through each contig, perform LAST self-alignment, trim, output
$n = 0; # keeps track of sequence count
while (@contigs) {
#print "contig\t"; # debug
#print $#contigs + 1; # debug
#print "\n"; # debug
$block = shift(@contigs);
$n++;
open SEQ, ">temp.apc.fa";
print SEQ $block;
close SEQ;
print "formatting LAST db ... ";
$command = "$opt_c temp.apc temp.apc.fa"; # format lastdb for current contig
system($command);
print "running LAST self-alignment ... ";
$command = "$opt_d -s 1 -x 300 -f 0 -T 1 temp.apc temp.apc.fa > temp.apc.lastal"; # self align current contig
system($command);
$command = "$opt_d -s 1 -x 300 -f BlastTab -T 1 temp.apc temp.apc.fa | grep -v \"^#\" > temp.apc.blasttab"; # self align current contig
system($command);
print "done\n";
# pull in output of current contig's LAST self-alignment
open LAST, "<temp.apc.lastal";
undef(@alignment);
while ($line = <LAST>) {
if (!($line =~ m/^#/)) {
push @alignment, $line;
}
}
close LAST;
# process alignment lines; should be three, with first being full self-alignment
# second and third lines should be near identical end-overlaps
if ($#alignment + 1 == 3) {
# check that first is full self alignment
#print "three alignments in LAST output!\n"; # debug
# check that 2nd and 3rd are near identical overlaps
@overlap5prime = split("\t", $alignment[1]);
@overlap3prime = split("\t", $alignment[2]);
#print $overlap5prime[0]."\n"; # debug
#print $overlap3prime[0]."\n"; # debug
if ($overlap5prime[0] == $overlap3prime[0]) {
#$ONEQ = system("sed 's/ \\+/ /g' temp.apc.maf | grep \"^s \" | cut -d \" \" -f7 | head -n3 | tail -n1");
#$TWOQ = system("sed 's/ \\+/ /g' temp.apc.maf | grep \"^s \" | cut -d \" \" -f7 | head -n4 | tail -n1");
my $IDENT = `cut -f3 temp.apc.blasttab | head -n2 | tail -n1 `;
#print $IDENT."\n";

my $HUND=100.00;
#print $HUND."\n";
$EQQ = $IDENT eq $HUND;
#print $EQQ."\n";
if ($IDENT == $HUND) {
$start5prime = $overlap5prime[2] + 1; # 0-based!
$end5prime = $start5prime + $overlap5prime[3] - 1;
$start3prime = $overlap3prime[2] + 1; # 0-based!
$end3prime = $start3prime + $overlap3prime[3] - 1;
print "Overlap detected between ";
print $overlap5prime[1].":".$start5prime."-".$end5prime;
print " and ";
print $overlap3prime[1].":".$start3prime."-".$end3prime."\n";

# trim one overlap, then permute halfway
$block =~ m/^(.*\n)(.*)$/;
$header = $1;
chomp $header;
$sequence = $2;
chomp $sequence;
$trimmedSeq = substr($sequence, $overlap5prime[3]); # 0-, 1-based trickery
$mid = int( length($trimmedSeq) / 2 ); # midpoint, to permute around
$newSeq = substr($trimmedSeq, $mid) . substr($trimmedSeq, 0, $mid) . "\n";
open OUT, ">$fname.$n.fa";
chomp $block;
print OUT $header."|join_near_$mid\n";
print OUT $newSeq."\n";
close OUT;
open OUT2, ">$overlap5prime[1].DTR.tbl";
print OUT2 ">Feature $overlap5prime[1] Table1";
print OUT2 "\n".$start5prime."\t".$end5prime."\trepeat_region";
print OUT2 "\n\t\t\trpt_type\tDTR";
print OUT2 "\n".$start3prime."\t".$end3prime."\trepeat_region";
print OUT2 "\n\t\t\trpt_type\tDTR\n";
close OUT2;
if (defined($opt_r)) {
# use bwa mem to align reads to permuted sequence
$command = "bwa index $fname.$n.fa 2> $fname.$n.aln.log";
print "indexing permuted sequence ... ";
system($command);
$command = "bwa mem -M $fname.$n.fa $opt_r 2>> $fname.$n.aln.log | gzip > $fname.$n.sam.gz";
print "aligning reads to permuted sequence ... ";
system($command);
$command = "samtools view -uhS $fname.$n.sam.gz 2>> $fname.$n.aln.log | samtools sort - $fname.$n 2>> $fname.$n.aln.log";
print "sorting BAM file of aligned reads ... ";
system($command);
$command = "samtools index $fname.$n.bam 2>> $fname.$n.aln.log";
print "indexing BAM file ... ";
system($command);
print "done\n";
}
}

}
else {
system("mv temp.apc.lastal apc_aln_$n.txt");
print "irregular LAST results ... check local file apc_aln_$n.txt\n";
}
}
else {
system("mv temp.apc.lastal apc_aln_$n.txt");
print "irregular LAST results ... check local file apc_aln_$n.txt\n";
}
}

# clean up temp files!
system("rm temp.apc*");

10 changes: 8 additions & 2 deletions cenote-taker2.1.5.sh
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ base_directory=$PWD
CENOTE_DBS=${35}
WRAP=${36}
HALLMARK_TAX=${37}
EXACT=${38}

if [ "$ANNOTATION_MODE" == "True" ] ; then
LIN_MINIMUM_DOMAINS=0
Expand Down Expand Up @@ -102,10 +103,11 @@ echo "Location of Cenote scripts: $CENOTE_SCRIPT_DIR"
echo "Location of scratch directory: $SCRATCH_DIR"
echo "GB of memory: $MEM"
echo "number of CPUs available for run: $CPU"
echo "Annotation only mode? $ANNOTATION_MODE"
echo "Annotation only mode? $ANNOTATION_MODE"
echo "Cenote-Taker 2 DBs directory: $CENOTE_DBS"
echo "Wrap circular contigs?: $WRAP"
echo "Taxonomy for each hallmark?: $HALLMARK_TAX"
echo "Exact match DTRs?: $EXACT"
echo "@@@@@@@@@@@@@@@@@@@@@@@@@"

#checking validity of run_title
Expand Down Expand Up @@ -303,7 +305,11 @@ if [ ${original_contigs: -6} == ".fasta" ]; then
bioawk -v run_var="$run_title" -v contig_cutoff="$LENGTH_MINIMUM" -c fastx '{ if(length($seq) > contig_cutoff) { print ">"run_var NR" "$name; print $seq }}' $original_contigs > ${original_contigs%.fasta}.over_${LENGTH_MINIMUM}nt.fasta ;
cd $run_title
echo "cenote_shortcut" > ${run_title}_CONTIG_SUMMARY.tsv
perl ${CENOTE_SCRIPT_DIR}/apc_cenote1.pl -b $run_title -c lastdb -d lastal ../${original_contigs%.fasta}.over_${LENGTH_MINIMUM}nt.fasta >/dev/null 2>&1
if [ "$EXACT" == "True" ] ; then
perl ${CENOTE_SCRIPT_DIR}/apc_exact1.pl -b $run_title -c lastdb -d lastal ../${original_contigs%.fasta}.over_${LENGTH_MINIMUM}nt.fasta >/dev/null 2>&1
else
perl ${CENOTE_SCRIPT_DIR}/apc_cenote1.pl -b $run_title -c lastdb -d lastal ../${original_contigs%.fasta}.over_${LENGTH_MINIMUM}nt.fasta >/dev/null 2>&1
fi
find . -type f -name "apc_aln*" -exec rm -f {} \;
APC_CIRCS=$( find . -maxdepth 1 -type f -name "${run_title}*.fa" )
if [ -n "$APC_CIRCS" ] ;then
Expand Down
3 changes: 2 additions & 1 deletion run_cenote-taker2.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ def str2bool(v):
optional_args.add_argument("--cenote-dbs", dest="CENOTE_DBS", type=str, default=cenote_script_path, help='Default: cenote_script_path -- If you downloaded and setup the databases in a non-standard location, specify path')
optional_args.add_argument("--wrap", dest="WRAP", type=str2bool, default="True", help='Default: True -- Wrap/rotate DTR/circular contigs so the start codon of an ORF is the first nucleotide in the contig/genome')
optional_args.add_argument("--hallmark_taxonomy", dest="HALLMARK_TAX", type=str2bool, default="False", help='Default: False -- Get hierarchical taxonomy information for all hallmark genes? This report (*.hallmarks.taxonomy.out) is not considered in the final taxonomy call.')
optional_args.add_argument("--exact_dtrs", dest="EXACT", type=str2bool, default="False", help='Default: False -- Only record DTRs if each ends matches with 100% ID.')



Expand Down Expand Up @@ -171,5 +172,5 @@ def is_tool(name):
else:
print ("lastal/lastdb is not found. Exiting. You may have to manually install last using: conda install -y -n cenote-taker2_env -c conda-forge -c bioconda last=1282")
quit()
subprocess.call(['bash', str(cenote_script_path) + '/cenote-taker2.1.5.sh', str(args.original_contigs), str(args.F_READS), str(args.R_READS), str(args.run_title), str(args.isolation_source), str(args.collection_date), str(args.metagenome_type), str(args.srr_number), str(args.srx_number), str(args.biosample), str(args.bioproject), str(args.template_file), str(args.circ_length_cutoff), str(args.linear_length_cutoff), str(args.virus_domain_db), str(args.LIN_MINIMUM_DOMAINS), str(args.handle_knowns), str(args.ASSEMBLER), str(args.MOLECULE_TYPE), str(args.HHSUITE_TOOL), str(args.DATA_SOURCE), str(args.BLASTP), str(args.PROPHAGE), str(args.FILTER_PLASMIDS), str(args.BLASTN_DB), str(cenote_script_path), str(args.CIRC_MINIMUM_DOMAINS), str(args.SCRATCH_DIR), str(args.MEM), str(args.CPU), str(args.ENFORCE_START_CODON), str(args.ORF_WITHIN), str(args.ANNOTATION_MODE), str(args.CRISPR_FILE), str(args.CENOTE_DBS), str(args.WRAP), str(args.HALLMARK_TAX)])
subprocess.call(['bash', str(cenote_script_path) + '/cenote-taker2.1.5.sh', str(args.original_contigs), str(args.F_READS), str(args.R_READS), str(args.run_title), str(args.isolation_source), str(args.collection_date), str(args.metagenome_type), str(args.srr_number), str(args.srx_number), str(args.biosample), str(args.bioproject), str(args.template_file), str(args.circ_length_cutoff), str(args.linear_length_cutoff), str(args.virus_domain_db), str(args.LIN_MINIMUM_DOMAINS), str(args.handle_knowns), str(args.ASSEMBLER), str(args.MOLECULE_TYPE), str(args.HHSUITE_TOOL), str(args.DATA_SOURCE), str(args.BLASTP), str(args.PROPHAGE), str(args.FILTER_PLASMIDS), str(args.BLASTN_DB), str(cenote_script_path), str(args.CIRC_MINIMUM_DOMAINS), str(args.SCRATCH_DIR), str(args.MEM), str(args.CPU), str(args.ENFORCE_START_CODON), str(args.ORF_WITHIN), str(args.ANNOTATION_MODE), str(args.CRISPR_FILE), str(args.CENOTE_DBS), str(args.WRAP), str(args.HALLMARK_TAX), str(args.EXACT)])

0 comments on commit 621df57

Please sign in to comment.