Refiner

#!/usr/bin/perl
##---------------------------------------------------------------------------##
##  File:
##      @(#) Refiner
##  Author:
##      Arian Smit <asmit@systemsbiology.org>
##      Robert Hubley <rhubley@systemsbiology.org>
##  Description:
##      Given a set of instances of a particular interspersed
##      repeat develop and refine a consensus model for
##      them.
##
#******************************************************************************
#* Copyright (C) Institute for Systems Biology 2005-2024 Developed by
#* Arian Smit and Robert Hubley.
#*
#* This work is licensed under the Open Source License v2.1.  To view a copy
#* of this license, visit http://www.opensource.org/licenses/osl-2.1.php or
#* see the license.txt file contained in this distribution.
#*
###############################################################################

=head1 NAME

Refiner - Generate and refine a seed alignment given a set of family instances

=head1 SYNOPSIS

  Refiner [-options] <family fasta sequences>

  -threads #     : The maximum number of threads the program may use.

  NOTE: The fasta file should contain only sequence identifiers and sequences.
        The fasta description field is currently reserved for RepeatModeler
        use.

=head1 DESCRIPTION

This tool was developed to overcome the limitations of many multiple sequence
aligners when supplied with highly diverged and fragmented sequences. Given
a set of related instances of a TE family this tool:

  - Generates an all-vs-all pairwise alignment to identify the sequence
    with the minimal distance (best score) to all other sequences.

  - A transitive multiple sequence alignment is generated from the pairwise
    alignments to this minimally distant copy and a new consensus is drawn.
 
  - If the consensus has changed, the new consensus is realigned to the 
    instances and the process is repeated until the consensus stabilizes.

The options are:

=over 4

=item -h(elp)

Detailed help

=back

=head1 SEE ALSO

=over 4

RepeatModeler, RepeatMasker

=back

=head1 COPYRIGHT

 Copyright 2005-2024 Institute for Systems Biology

=head1 AUTHOR

 Robert Hubley <rhubley@systemsbiology.org>
 Arian Smit <asmit@systemsbiology.org>

=cut

#
# Module Dependence
#
use strict;
use FindBin;
use lib $FindBin::RealBin;
use Getopt::Long;
use POSIX qw(:sys_wait_h);
use File::Copy;
use File::Spec;
use File::Path;
use File::Basename;
use Cwd;
use Pod::Text;
use Data::Dumper;

# RepeatModeler Libraries
use RepModelConfig;
use lib $RepModelConfig::configuration->{'REPEATMASKER_DIR'}->{'value'};
use MultAln;
use NeedlemanWunschGotohAlgorithm;

# RepeatMasker Libraries
use SearchResult;
use SearchResultCollection;
use WUBlastSearchEngine;
use NCBIBlastSearchEngine;
use SequenceSimilarityMatrix;
use SeqDBI;
use SimpleBatcher;
use FastaDB;

#
# Class Globals & Constants
#
my $CLASS = "Refiner";

# Debug verbosity: 10 when RepeatModeler's global DEBUGALL switch is set,
# otherwise 0 (off).
my $DEBUG = ( $RepModelConfig::DEBUGALL == 1 ) ? 10 : 0;

# Turn autoflush on
$| = 1;

#
# Version
#
my $version = $RepModelConfig::VERSION;

# A "-v" given as the very first argument reports the version and stops
# before any regular option parsing takes place.
if ( @ARGV && $ARGV[ 0 ] eq '-v' ) {
  print "Refiner version $version\n";
  exit;
}

#
# Option processing
#  e.g.
#   -t: Single letter binary option
#   -t=s: String parameters
#   -t=i: Number parameters
#
# The script's own options come first; the configuration parameters
# published by RepModelConfig are appended as additional command-line
# options.
#
my @opts = (
  '-help',
  '-debug=i',
  '-quiet',
  '-giToID=s',
  '-extTwoBit=s',
  '-name=s',
  '-threads=i',
  RepModelConfig::getCommandLineOptions()
);

#
# Get the supplied command line options, and set flags.  A parse failure
# prints the documentation (usage() does not return).
#
my %options;
GetOptions( \%options, @opts ) or usage();

#
# Provide the POD text from this file and
# from the config file by merging them
# together.  The heading "CONFIGURATION
# OVERRIDES" provides the insertion point
# for the configuration POD.
#
# usage:
#   Print the program name/version followed by this script's POD rendered
#   as text on STDOUT, then exit with status 1.  This sub never returns.
#
#   The script reads its own source ($0).  When a line matching
#   "=head1 CONFIGURATION OVERRIDES" is found, the POD returned by
#   RepModelConfig::getPOD() is inserted directly after that heading; if
#   no configuration POD is returned the heading line itself is left out.
sub usage {
  my $p = Pod::Text->new();
  $p->output_fh( *STDOUT );
  my $pod_str = "";

  # Lexical handle + three-argument open: the path in $0 is taken
  # literally and no global bareword handle is clobbered.
  open( my $self_fh, '<', $0 )
      or die "Could not open self ($0) for generating documentation: $!";
  while ( my $line = <$self_fh> ) {
    if ( $line =~ /^=head1\s+CONFIGURATION OVERRIDES\s*$/ ) {
      my $c_pod = RepModelConfig::getPOD();
      if ( $c_pod ) {
        $pod_str .= $line . $c_pod;
      }
    }
    else {
      $pod_str .= $line;
    }
  }
  close $self_fh;

  print "$0 - $version\n";
  $p->parse_string_document( $pod_str );
  exit( 1 );
}

#
# Resolve configuration settings using the following precedence:
# command line first, then environment, followed by config
# file.
#
RepModelConfig::resolveConfiguration( \%options );
my $config           = $RepModelConfig::configuration;
my $NCBIBLASTDB_PRGM = $config->{'RMBLAST_DIR'}->{'value'} . "/makeblastdb";
my $RMBLASTN_PRGM    = $config->{'RMBLAST_DIR'}->{'value'} . "/rmblastn";

# NOTE(review): $REPEATAFTERME_DIR is declared but never assigned (the
# configuration lookup below is commented out), so any command built from
# it starts with "/RAMExtend".  Confirm whether the lookup should be
# re-enabled before relying on the -extTwoBit option.
my $REPEATAFTERME_DIR;
#my $REPEATAFTERME_DIR = $config->{'REPEATAFTERME_DIR'}->{'value'};

# An explicit request for help prints only the documentation; it is not
# an error about missing input.  usage() does not return.
if ( $options{'help'} ) {
  usage();
}

# Print the internal POD documentation if the input file is missing
if ( $#ARGV == -1 ) {
  print "No query sequence file indicated\n\n";
  usage();
}

# A non-zero -debug value overrides the default debug level.
$DEBUG = $options{'debug'} if ( $options{'debug'} );

#
# If used as part of the RepeatModeler system
# read in the seq IDs from the original database
# so we can place them in the stockholm file.
#
# Each usable line of the -giToID file starts with a sequence identifier
# followed by whitespace and a number; lines that do not match are ignored.
#
my %genomeDBToSeqID = ();
if ( $options{'giToID'} ) {
  # Three-argument open with a lexical handle so the user-supplied path
  # is used verbatim.
  open( my $giToID_fh, '<', $options{'giToID'} )
      or die "Could not open $options{'giToID'} file for reading: $!\n";
  while ( my $line = <$giToID_fh> ) {
    if ( $line =~ /^(\S+)\s+(\d+)/ ) {
      $genomeDBToSeqID{"gi|$2"} = $1;

      # LTR_retriever has an issue with "|" symbols.  For this
      # process the seq identifier will be gi-# instead.
      $genomeDBToSeqID{"gi-$2"} = $1;
    }
  }
  close $giToID_fh;
}

#
# Setup the search engines
#
# Two independent rmblastn engine objects are created: one for the
# all-vs-all bootstrap search and one for the consensus-vs-all refinement
# searches.
#
my $srchEngAllVsAll =
    NCBIBlastSearchEngine->new( pathToEngine => $RMBLASTN_PRGM );
my $srchEngOneVsAll =
    NCBIBlastSearchEngine->new( pathToEngine => $RMBLASTN_PRGM );
die "Refiner Failed: Cannot execute $RMBLASTN_PRGM please make "
  . "sure you have setup RepeatModeler to use RMBlast by "
  . "running the configure script.\n"
    unless ( defined $srchEngAllVsAll );

# Query threading is flagged as available when the engine reports a
# version of 2.13 or later (pattern "major.minor.revision+").
my $rmblast_version         = $srchEngOneVsAll->getVersion();
my $engineHasQueryThreading = 0;
if ( $rmblast_version =~ /(\d+)\.(\d+)\.(\d+)\+/ ) {
  my ( $majorVer, $minorVer, $revision ) = ( $1, $2, $3 );
  $engineHasQueryThreading = 1
      if ( $majorVer > 2 || ( $majorVer == 2 && $minorVer >= 13 ) );
}


#
# Parse filenames
#
# Every input path is vetted before any work starts: whitespace and a set
# of shell-special characters are rejected outright.
#
for my $candidate ( @ARGV ) {
  die "RepeatModeler can not handle filenames with spaces "
      . "like the file \"$candidate\"\n"
      if ( $candidate =~ /\s/ );

  if ( $candidate =~ /([\`\!\$\^\&\*\(\)\{\}\[\]\|\\\;\"\'\<\>\?])/ ) {
    die "RepeatModeler can not handle filenames with the special "
        . "character \"$1\" as in the file \"$candidate\"\n";
  }
}

# Candidate locations handed to createTempDir(): the current working
# directory and the directory portion of the first input file.
my @tmpDirPath = ( cwd(), ( File::Spec->splitpath( $ARGV[ 0 ] ) )[ 1 ] );
my $tmpDir;

# The HTML alignment report is only produced when debugging.
my $alignReportFilename = $DEBUG ? "align-report.html" : undef;

if ( $DEBUG ) {
  print "##\n## Refiner\n##\n",
        "# Version = $version\n",
        "# TempDirPath = " . join( ", ", @tmpDirPath ) . "\n";
}

#
# Main loop
#
# For every input fasta file:
#   1. create a scratch directory,
#   2. short-circuit the single-sequence case,
#   3. bootstrap a consensus from an all-vs-all search,
#   4. refine it until it stabilizes,
#   5. patch low quality blocks and refine again,
#   6. optionally (-extTwoBit) extend the consensus and refine once more,
#   7. write "<input>.refiner.stk" and "<input>.refiner_cons".
#
my @TimeBefore = ();
elapsedTime( 0 );
my $familyName;
my $inputFile;
my $inputFileDir;
my $inputFilePrefix;

# Write an intermediate consensus as a single-record fasta file.
#   $path     - file to create (overwritten if present)
#   $name     - family name used as the sequence identifier
#   $hspCount - HSP count recorded in the description
#   $seq      - ungapped consensus sequence
# Dies if the file cannot be created: the following refinement round
# reads this file back, so a silent failure would only surface later.
my $writeConsFasta = sub {
  my ( $path, $name, $hspCount, $seq ) = @_;
  open( my $consFH, '>', $path )
      or die "Could not open $path for writing: $!\n";
  print $consFH ">$name ( "
          . $hspCount
          . " hsps from initial "
          . "mulitiple alignment)\n";
  print $consFH "$seq\n";
  close $consFH;
};

foreach my $file ( @ARGV ) {

  unless ( -r $file ) {
    print "cannot read file $file\n";
    next;
  }

  print "\nanalyzing file $file\n" if ( $DEBUG );
  $tmpDir = createTempDir( \@tmpDirPath );
  print "Temp Directory = $tmpDir\n" if ( $DEBUG );

  #
  # Handle one sequence case: read only as far as the second header so
  # that we know whether the file holds exactly one record, keeping the
  # first identifier and its (upper-cased) sequence.
  #
  open( my $scanFH, '<', $file )
      or die "Could not open $file for reading: $!\n";
  my $firstSeq;
  my $firstID;
  my $count = 0;
  while ( my $line = <$scanFH> ) {
    if ( $line =~ /^>(\S+)/ ) {
      $count++;
      last if ( $count > 1 );
      $firstID = $1;
      next;
    }
    $line =~ s/[\n\r\s]+//g;
    $firstSeq .= uc( $line );
  }
  close $scanFH;

  # The name of the input file will be used to name intermediate
  # and result files. Break it down so that we have the path to
  # the working directory and the prefix of the filename (sans
  # typical ".fa|.fasta|.FA|.FASTA" suffixes).
  $inputFileDir = dirname( $file );
  $inputFile = basename( $file );
  $inputFilePrefix = $inputFile;
  $inputFilePrefix =~ s/\.(fa|fasta)$//i;

  # The family is named after the input file prefix unless the user
  # specifies a different name on the command line ( -name option ).
  # Note that -name is problematic if multiple files are handed to
  # Refiner, as every family then receives the same name.
  $familyName = $inputFilePrefix;
  $familyName = $options{'name'} if ( $options{'name'} );

  my $db = FastaDB->new( fileName => $file,
                        openMode => SeqDBI::ReadOnly );

  # The convention is that Refiner always produces output files,
  # even if there is only one sequence in the input file. This
  # is legacy behaviour that users of this script expect.  So
  # for consensus sequences we simply output the single sequence
  # that was handed to us.
  if ( $count == 1 ) {
    my $malign = MultAln->new( sequences => [ [ $firstID, $firstSeq ] ] );
    my $cons    = $malign->consensus();
    my $maSize  = 1;
    my $avgKDiv = 0;

    adjustIdentifiers($malign, $db);

    $malign->toSTK( filename => "$file.refiner.stk", id => $familyName,  consRF => 1, idFormat => 2 );

    $cons =~ s/-//g;
    open( my $singleFH, '>', "$file.refiner_cons" )
        or die "Could not open $file.refiner_cons for writing: $!\n";
    print $singleFH ">$familyName ( Final Multiple Alignment Size = " . $maSize
             . " , Avg Kimura = $avgKDiv )\n";
    print $singleFH "$cons\n";
    close $singleFH;

    unless ( $DEBUG ) {
      rmtree( [ $tmpDir ] );
    }

    next;
  }

  my $iteration = 1;
  if ( $DEBUG ) {
    print "========================="
        . $inputFilePrefix
        . "=========================\n";
    print " ------------------- Iteration 1 (bootstrapping) ------------------ \n";
  }

  my ( $refID, $cons, $numHSPs, $numUnAlignSeqs) = bootstrapConsensus( "$FindBin::RealBin/Matrices", $srchEngAllVsAll, $file, $alignReportFilename, $tmpDir, $db );

  if ( $refID eq "" ) {
    warn "Refiner: No consensus could be derived from the input sequences in $file\n";

    # Do not leave the scratch directory behind when giving up on a file.
    unless ( $DEBUG ) {
      rmtree( [ $tmpDir ] );
    }
    next;
  }

  my $round1ConsLen = length( $cons );
  my $round1NumUnAlignSeqs = $numUnAlignSeqs;

  if ( $DEBUG ) {
    print "  Bootstrap Consensus Instance = $refID\n";
    print "  Derived Consensus Length = $round1ConsLen\n";
    print "  HSPs = $numHSPs\n";
    print "  Unaligned Sequences = $numUnAlignSeqs\n";
  }

  # Save the consensus to a fasta file
  $writeConsFasta->( "$tmpDir/family-cons-1.fa", $familyName, $numHSPs, $cons );

  $iteration++;
  my $maxIterations = 10;
  my $malign;
  my $maSize = 0;
  my $gappedCons;
  my $sumScore;
  my $avgKDiv;
  ($gappedCons, $malign, $iteration, $sumScore, $avgKDiv, $maSize, $numUnAlignSeqs) =
         refineUntil( "$FindBin::RealBin/Matrices", $srchEngOneVsAll, $cons,
                        $file, $alignReportFilename,
                        $tmpDir, $db, $iteration, $maxIterations, $DEBUG );

  if ( $DEBUG ) {
    print " ------------------- Resolving low quality blocks ------------------ \n";
  }
  # NOTE: Need gapped cons here -- the block coordinates returned by
  #       resolveLowQualityBlocks() are applied to the gapped consensus.
  $cons = $gappedCons;
  $cons =~ s/\-//g;
  my $newCons = $gappedCons;

  my $newConsBlocks = resolveLowQualityBlocks( multAln => $malign );

  print " Blocks Returned = " . ($#{$newConsBlocks} + 1) . "\n" if ( $DEBUG );

  # Patch up the cons: each block's replacement is padded with "-" up to
  # the width of the block and written over the gapped consensus at the
  # block's start offset.
  for ( my $j = 0 ; $j <= $#{$newConsBlocks} ; $j++ ) {
    print "    -- Fixing block $j\n" if ( $DEBUG );
    my $colWidth =
          $newConsBlocks->[ $j ]->{'end'} -
          $newConsBlocks->[ $j ]->{'start'} + 1;
    my $colSeq = $newConsBlocks->[ $j ]->{'cons'};
    my $seq = $colSeq . "-" x ( $colWidth - length( $colSeq ) );
    substr( $newCons, $newConsBlocks->[ $j ]->{'start'}, length( $seq ) ) = $seq;
  }

  if ( $alignReportFilename ) {
    open( my $MOUT, '>>', "$tmpDir/$alignReportFilename" )
        or die "Could not open $tmpDir/$alignReportFilename for writing!\n";
    print $MOUT "<H2>Resolving Low Quality Blocks</H2>\n";
    writeHTMLMultAlign(
                        multAln         => $malign,
                        destination     => $MOUT,
                        leftFlankingID  => "",
                        rightFlankingID => "",
                        printHistogram  => 1,
                        newConsBlocks   => $newConsBlocks,
                        finalConsensus  => $newCons
            );
    close $MOUT;
  }

  $newCons =~ s/-//g;
  print "  Consensus length: before = " . length($cons) . " after = " . length($newCons) . "\n" if ( $DEBUG );
  $cons = $newCons;

  # Save the consensus to a fasta file
  $writeConsFasta->( "$tmpDir/family-cons-$iteration.fa", $familyName, $numHSPs, $cons );

  $iteration++;
  ($gappedCons, $malign, $iteration, $sumScore, $avgKDiv, $maSize, $numUnAlignSeqs) =
       refineUntil( "$FindBin::RealBin/Matrices", $srchEngOneVsAll, $cons,
                     $file, $alignReportFilename,
                     $tmpDir, $db, $iteration, $maxIterations, $DEBUG );
  $cons = $gappedCons;
  $cons =~ s/\-//g;

  # If the user has supplied a 2bit file we can try to extend the consensus
  my ( $leftExt, $rightExt, $extCons, $newFamilyFile, $warnings );
  if ( $options{'extTwoBit'} ) {
    if ( $DEBUG ) {
      print " ------------------- Extending consensus ------------------ \n";
    }
    ( $leftExt, $rightExt, $extCons, $newFamilyFile, $warnings ) =
        extendAlignment( $malign, $tmpDir, $options{'extTwoBit'}, \%genomeDBToSeqID, $db, $gappedCons, $DEBUG );
    print " Warnings: $warnings\n" if ( $DEBUG && $warnings );
    if ( $extCons ne "" ) {
        $newCons = $extCons;
        $cons = $newCons;
        # Save the consensus to a fasta file
        $writeConsFasta->( "$tmpDir/family-cons-$iteration.fa", $familyName, $numHSPs, $cons );

        # Now that we have a new set of instance sequences we need to
        # reset the FastaDB, and use the new repam-repseq-nodups.fa file
        # for future iterations.
        $db = FastaDB->new( fileName => "$tmpDir/repam-repseq-nodups.fa",
                            openMode => SeqDBI::ReadOnly );
        # Note: we are now pointing to a file in the tmp directory for the
        # instances.  The result files below are still named after the
        # original input ( $inputFileDir / $inputFile ).
        $file = "$tmpDir/repam-repseq-nodups.fa";

        if ( $alignReportFilename ) {
          open( my $MOUT, '>>', "$tmpDir/$alignReportFilename" )
              or die "Could not open $tmpDir/$alignReportFilename for writing!\n";
          print $MOUT "<H2>Extending Consensus</H2>\n";
          close $MOUT;
        }

        $iteration++;
        ($gappedCons, $malign, $iteration, $sumScore, $avgKDiv, $maSize, $numUnAlignSeqs) =
             refineUntil( "$FindBin::RealBin/Matrices", $srchEngOneVsAll, $cons,
                           $file, $alignReportFilename,
                           $tmpDir, $db, $iteration, $maxIterations, $DEBUG );
        $cons = $gappedCons;
        $cons =~ s/\-//g;
     } # if ( $extCons ne "" )
  } # if ( $options{'extTwoBit'} )

  unless ( $options{'quiet'} ) {
    print "  - numRounds = $iteration\n";
    print "  - sumScore = $sumScore\n";
    print "  - Consensus Length = "
        . length( $cons )
        . " ( orig = $round1ConsLen )\n";
    print "  - Extended left=$leftExt right=$rightExt\n" if ( $options{'extTwoBit'} );
    print "  - Avg Kimura Divergence = "
        . sprintf( "%0.2f", $avgKDiv ) . "\n";
    print "  - Unaligned sequences = $numUnAlignSeqs ( orig = $round1NumUnAlignSeqs )\n";
  }

  adjustIdentifiers($malign, $db);

  # TODO: Consider providing program stats/warnings in curation notes of *.stk
  # NOTE(review): unlike the single-sequence branch above, no "id" is
  #               passed to toSTK() here -- confirm whether the family name
  #               should be supplied as well.
  #$malign->toSTK( filename => "$pathToFastaFile.refiner.stk", includeTemplate => 1, nuclRF => 1 );
  $malign->toSTK( filename => "$inputFileDir/$inputFile.refiner.stk",  consRF => 1, idFormat => 2 );

  $cons =~ s/-//g;
  # The output file is the full filename + the suffix ".refiner_cons"
  open( my $finalFH, '>', "$inputFileDir/$inputFile.refiner_cons" )
      or die "Could not open $inputFileDir/$inputFile.refiner_cons for writing: $!\n";

  # Save the consensus to the consensi file.
  if ( $leftExt || $rightExt ) {
    print $finalFH ">$familyName ( Final Multiple Alignment Size = " . $maSize
             . " , Avg Kimura = $avgKDiv , Extended = $leftExt/$rightExt )\n";
  }else {
    print $finalFH ">$familyName ( Final Multiple Alignment Size = " . $maSize
             . " , Avg Kimura = $avgKDiv )\n";
  }
  print $finalFH "$cons\n";
  close $finalFH;

  unless ( $DEBUG ) {
    rmtree( [ $tmpDir ] );
  }

  print "Refiner: " . elapsedTime( 0 ) . "\n"
      unless ( $options{'quiet'} );

}

##########################################################################
##########################################################################
##########################################################################
##########################################################################

# refineUntil:
#   Repeatedly realign the instances to the current consensus with
#   refineConsensus() until the (ungapped) consensus no longer changes or
#   the pass limit is reached.
#
#   Positional parameters:
#     $matrixDir       - directory holding the scoring matrices
#     $srchEngOneVsAll - search engine object handed to refineConsensus()
#     $cons            - current ungapped consensus sequence
#     $file            - fasta file of family instances
#     $htmlFile        - name of the HTML report inside $tmpDir (false = none)
#     $tmpDir          - scratch directory holding family-cons-<N>.fa files
#     $db              - sequence database handed to refineConsensus()
#     $iteration       - number of the next iteration; the consensus file of
#                        the previous iteration (N-1) must already exist
#     $maxIterations   - pass limit; the loop performs at most
#                        $maxIterations + 1 passes
#     $DEBUG           - debug verbosity
#
#   Each pass that changes the consensus writes family-cons-<iteration>.fa
#   (named with the file-scoped $familyName) and increments $iteration.
#
#   Returns: ( gapped consensus, multiple alignment, iteration counter,
#              sum score, avg Kimura divergence, alignment size,
#              number of unaligned sequences ) from the last pass.
sub refineUntil {
  my $matrixDir       = shift;
  my $srchEngOneVsAll = shift;
  my $cons            = shift;
  my $file            = shift;
  my $htmlFile        = shift;
  my $tmpDir          = shift;
  my $db              = shift;
  my $iteration       = shift;
  my $maxIterations   = shift;
  my $DEBUG           = shift;

  my ( $newGappedCons, $avgKDiv, $numHSPs, $numUnAlignSeqs, $finalMultiAlignmentSize, $malign, $sumScore);
  for ( my $i = 0; $i <= $maxIterations ; $i++ ) {
    print " ------------------- Iteration $iteration ------------------ \n" if ( $DEBUG );
    print "* cons file = $tmpDir/family-cons-".($iteration-1).".fa\n" if ( $DEBUG > 5 );
    if ( $htmlFile ) {
      open( my $MOUT, '>>', "$tmpDir/$htmlFile" )
          or die "Could not open $tmpDir/$htmlFile for writing!\n";
      print $MOUT "<H2>Iteration $iteration : Refinement</H2>\n";
      close $MOUT;
    }

    # Honor the caller-supplied matrix directory rather than recomputing it.
    ( $newGappedCons, $avgKDiv, $numHSPs, $numUnAlignSeqs, $finalMultiAlignmentSize, $malign, $sumScore) = refineConsensus( $matrixDir, $srchEngOneVsAll, $cons, "$tmpDir/family-cons-".($iteration-1).".fa", $file, $htmlFile, $tmpDir, $db );

    my $newUngappedCons = $newGappedCons;
    $newUngappedCons =~ s/\-//g;

    # An unchanged consensus ends the refinement; $iteration is left
    # pointing at this (unsaved) round.
    if ( $newUngappedCons eq $cons ) {
      print "  ***Consensus has stabilized***\n" if ( $DEBUG );
      last;
    }

    $cons = $newUngappedCons;

    #
    # Save the consensus to a fasta file.  The next pass reads this file,
    # so failing to create it is fatal.
    #
    open( my $consFH, '>', "$tmpDir/family-cons-$iteration.fa" )
        or die "Could not open $tmpDir/family-cons-$iteration.fa for writing: $!\n";
    print $consFH ">$familyName ( "
            . $numHSPs
            . " hsps from initial "
            . "mulitiple alignment)\n";
    print $consFH "$cons\n";
    close $consFH;

    if ( $DEBUG ) {
      print "  Consensus Length = " . length($cons) . "\n";
      print "  Avg Kimura = $avgKDiv\n";
      print "  HSPs = $numHSPs\n";
      print "  Unaligned Sequences = $numUnAlignSeqs\n";
      print "  Final Multi Alignment Size = $finalMultiAlignmentSize\n";
    }

    $iteration++;

  }

  return( $newGappedCons, $malign, $iteration, $sumScore, $avgKDiv, $finalMultiAlignmentSize, $numUnAlignSeqs );
}


# liftUpOneLevel:
#   Translate a range given in the coordinates of an extracted sequence
#   into the coordinates of the sequence it was extracted from.
#
#   Parameters:
#     $seqID    - identifier of the extracted sequence
#     $start    - range start, always in sequence order
#     $end      - range end, always in sequence order
#     $orient   - "+" or "-"
#     $transStr - translation string "assembly:seq_id:start-end" (one
#                 based, fully closed) where "assembly:" is optional and
#                 start > end signals a reverse-complemented extraction,
#                 e.g. gi|1:300-400, hg38:gi|1:300-400, chr1:300-400,
#                 gi-1:300-400.  Text following the range is ignored.
#
#   Returns ( seq_id, start, end, orientation ) in the parent coordinate
#   system, or the four inputs unchanged when $transStr is not recognized.
sub liftUpOneLevel {
  my ( $seqID, $start, $end, $orient, $transStr ) = @_;

  # Nothing to translate against: hand the range back as it came in.
  return ( $seqID, $start, $end, $orient )
      unless ( $transStr =~ /^\s*(\S+:)?(\S+):(\d+)-(\d+)\s*.*$/ );

  my ( $parentID, $parentStart, $parentEnd ) = ( $2, $3, $4 );

  if ( $parentEnd < $parentStart ) {
    # Reverse-complemented extraction: positions count down from the
    # parent start, and the orientation flips (two wrongs make a right).
    my $flippedOrient = ( $orient eq "-" ) ? "+" : "-";
    return (
             $parentID,
             $parentStart - $end + 1,
             $parentStart - $start + 1,
             $flippedOrient
    );
  }

  # Forward extraction: a plain offset, orientation untouched.
  return (
           $parentID,
           $parentStart + $start - 1,
           $parentStart + $end - 1,
           $orient
  );
}


# extendAlignment:
#  Use the RepeatAfterMe RAMExtend tool to try to extend seed alignments
#  that are likely fragments of a longer TE.  This is often the case with the
#  output of de novo TE discovery programs such as RepeatScout and RECON,
#  the primary algorithms used in RepeatModeler.
#
#  Positional parameters:
#    $mAlign         - multiple alignment of the family instances
#    $tmpDir         - scratch directory for all intermediate files
#    $assemblyTwoBit - 2bit file passed to RAMExtend via -twobit
#    $giToIDHashRef  - accepted for interface compatibility; not used here
#    $db             - sequence database; the description of each aligned
#                      instance supplies its genomic coordinate translation
#    $gappedCons     - gapped consensus for $mAlign
#    $DEBUG          - debug verbosity
#
#  Files written in $tmpDir: linup.tsv (input ranges), repam.log,
#  the RAMExtend outputs (repam-ranges.tsv, repam-repseq.fa, repam-cons.fa)
#  and, when an extension of more than 2bp was made on either side,
#  repam-repseq-nodups.fa (extended instances with overlaps merged).
#
#  Returns: ( left extension in bp, right extension in bp,
#             extended ungapped consensus or "" if not extended/rejected,
#             path of repam-repseq-nodups.fa, warning text or undef )
sub extendAlignment { 
  my $mAlign = shift;
  my $tmpDir = shift;
  my $assemblyTwoBit = shift;
  my $giToIDHashRef = shift;
  my $db = shift;
  my $gappedCons = shift;
  my $DEBUG = shift;

  # Obtain divergence from MultAln and pick the matrix divergence level
  # ( 14, 18, 20 or 25 ) used to name the RAMExtend scoring matrix.
  my ( $null, $tdiv, $avgDiv ) = $mAlign->kimuraDivergence();
  my $div = 14;
  if ( $tdiv >= 22.5 ) {
    $div = 25;
  }elsif ( $tdiv >= 19 ) {
    $div = 20;
  }elsif ( $tdiv >= 16 ) {
    $div = 18;
  }
  print "  MultAln Kimura divergence: $avgDiv (using div=$div)\n" if ($DEBUG);
  
  my $ungappedCons = $gappedCons;
  $ungappedCons =~ s/\-//g;
  print "Gapped len = " . length($gappedCons) . " Ungapped len = " . length($ungappedCons) . "\n" if ($DEBUG > 8);
  
  open( my $tsvFH, '>', "$tmpDir/linup.tsv" )
      or die "Could not open up $tmpDir/linup.tsv for writing: $!\n";

  #
  # In RepeatModeler the input files to Refiner (e.g. round-2/family-1.fa ) already
  # have the genomic sequence coordinates in the sequence description line. 
  # E.g. family-2.fa:>gi|621 gi|1:11485846-11485954
  # This means, that RepeatModeler internally corrected the coordinates obtained
  # from the sampleDB-#.fa file.  These coordinates are one-based, fully closed.
  #
  my @linupIdxToID = ();
  my @linupIdxToElement = ();
  for ( my $i = 0; $i < $mAlign->getNumAlignedSeqs(); $i++ ) {
    # The instance ID
    my $id     = $mAlign->getAlignedName( $i );
    push @linupIdxToID, $id;
    # The instance to genomic coordinate translation
    my $genTrans = $db->getDescription( $id );
    #  e.g.  >gi|1 gi|1:964722-964563 element-167
    if ( $genTrans =~ /(element-\d+)/ ) {
      push @linupIdxToElement, $1;
    }else{
      push @linupIdxToElement, "";
    }

    # Identify how many bp left/right were unaligned to the current consensus
    my $s_unaligned = substr($gappedCons, 0, $mAlign->getAlignedStart( $i ) );
    $s_unaligned =~ s/\-//g;
    my $leftUnaligned = length($s_unaligned);
    $s_unaligned = substr($gappedCons, $mAlign->getAlignedEnd( $i )+1);
    $s_unaligned =~ s/\-//g;
    my $rightUnaligned = length($s_unaligned);

    # Identify where in the instance alignment starts/ends ( and orientation )
    my $instOrient = "+";
    $instOrient = "-" if ( $mAlign->getAlignedOrientation( $i ) eq "C" || $mAlign->getAlignedOrientation( $i ) eq "-" );
    my $instStart = $mAlign->getAlignedSeqStart( $i ); # one based, fully closed
    my $instEnd   = $mAlign->getAlignedSeqEnd( $i );   # one based, fully closed
    
    # The family-#.fa files contain the following sequence identifier format:
    # >instance_seq_id genomic_seq_id:start-end [one based, fully closed]
    # Where sample sequence start/end order is used to indicate reverse
    # complementation of the sequence.
    # e.g.
    # >gi|6910 gi|3:53484767-53491827
    # TODO: NO NO NO not the case anymore..the coordinates are now in Smitten V2
    warn "WARNING: Not implemented for Smitten V2 yet!\n";
    my ( $genSeqID, $genStart, $genEnd, $genOrient ) = liftUpOneLevel( $id, $instStart, $instEnd, $instOrient, $genTrans );
    # Convert the start to zero based
    $genStart--;

    # We will expect a twobit file using the same global "gi" identifiers.
    #
    # Columns 4 and 5 flag whether the left / right edge may be extended:
    # an edge qualifies when the instance reaches to within 10bp of that
    # end of the consensus.  0/0 marks an unextendable sequence.
    my $leftFlag  = ( $leftUnaligned <= 10 )  ? 1 : 0;
    my $rightFlag = ( $rightUnaligned <= 10 ) ? 1 : 0;
    print $tsvFH "$genSeqID\t".$genStart."\t$genEnd\t$leftFlag\t$rightFlag\t$genOrient\n";
  }
  close $tsvFH;
  
  ##
  ## Run RepeatAfterMe ExtendAlign
  ##
  ## NOTE(review): $REPEATAFTERME_DIR is a file-scoped variable; confirm it
  ##               is actually set before this is reached.
  ## NOTE(review): with "2>&1 > file" only stdout lands in repam.log while
  ##               stderr goes to this script's stdout -- confirm that is
  ##               intended ( "> file 2>&1" would capture both ).
  ##
  my $cmd = "$REPEATAFTERME_DIR/RAMExtend -twobit $assemblyTwoBit -bandwidth 20 -matrix $div" . "p43g -ranges $tmpDir/linup.tsv -outtsv $tmpDir/repam-ranges.tsv -outfa $tmpDir/repam-repseq.fa -cons $tmpDir/repam-cons.fa 2>&1 > $tmpDir/repam.log";
  #print "Running: $cmd\n";
  system($cmd);

  my $newCons = "";
  my $warnings;

  # Check for zero length extension
  #    ...
  #    Extended right: 0 bp
  #    Extended left : 0 bp
  #    Program duration is 4.0 sec = 0.1 min = 0.0 hr
  #
  my $t1 = `fgrep "Extended right: 0 bp" $tmpDir/repam.log`;
  my $t2 = `fgrep "Extended left : 0 bp" $tmpDir/repam.log`;
  if ( $t1 && $t2 ) {
    print "  **Could not extend**\n" if ( $DEBUG );
  }elsif ( -e "$tmpDir/repam-cons.fa") {
    # repam-cons.fa holds the flanking extensions as records named
    # "left-extension" and "right-extension"; glue them onto the current
    # ungapped consensus.
    open( my $consFH, '<', "$tmpDir/repam-cons.fa" )
        or die "Could not open $tmpDir/repam-cons.fa for reading: $!\n";
    my $ttid;
    my %seqs = ();
    while ( my $line = <$consFH> ) {
      if ( $line =~ />(\S+)/ )
      {
        $ttid = $1;
        next;
      }
      $line =~ s/[\n\r\s]+//g;
      $seqs{$ttid} .= $line;
    }
    close $consFH;
    $newCons = $seqs{'left-extension'} . $ungappedCons . $seqs{'right-extension'};
  }else {
    print "Something went wrong with extend align:\n" if ( $DEBUG );
    $warnings .= "Something went wrong with RAMExtend!\n";
    print "  $cmd\n" if ( $DEBUG );
  } 

  # Pull the reported extension lengths out of the log
  $t1 = `fgrep "Extended right:" $tmpDir/repam.log`;
  $t2 = `fgrep "Extended left :" $tmpDir/repam.log`;
  my $rightExt = 0;
  if ( $t1 =~ /Extended\s+(left|right)\s*:\s+(\d+)/ ){
    $rightExt = $2;
  }
  my $leftExt = 0;
  if ( $t2 =~ /Extended\s+(left|right)\s*:\s+(\d+)/ ){
    $leftExt = $2;
  }
  print "  Captured Extensions: left $leftExt bp, right $rightExt bp\n" if ( $DEBUG );
  if ( $leftExt > 9990 && $rightExt > 9990 )
  {
    print "Extension hit limits in both directions...probably a segmental duplication...keeping unextended\n" if ( $DEBUG );
    $warnings .= "Extension hit limits in both directions...probably a segmental duplication, keeping unextended\n";
    $newCons = "";
  }elsif ( $leftExt > 2 || $rightExt > 2 ) {
    # Extension from independent proximal seeds can produce duplicate final
    # instances ( different source locations ). Report and remove these cases.
    #
    # NOTE: This is a new feature since extend-stk.pl.  It now checks for *any* overlap and merges
    #
    # Pass 1: identify overlapping sequences
    #    - Obtain ranges from file
    my $repseqs = readFastaFile("$tmpDir/repam-repseq.fa");
    my %seqRanges = ();
    foreach my $rec ( @{$repseqs} ) {
      my $hdr     = $rec->[0];
      my $bases   = $rec->[1];
      # E.g.
      # >gi|1:903922-904165_-  n=10,anchor_range=904165-904087,extended_left=0,extended_right=165,len=244,score=0
      # >gi|1:903264-903298_+  n=11,anchor_range=903264-903298,extended_left=0,extended_right=0,len=35,score=0
      if ( $hdr =~ /(\S+):(\d+)-(\d+)_(\S)\s+n=(\d+)/ ) {
        my $seqID = $1;
        my $start = $2;
        my $end = $3;
        my $orient = $4;
        my $linupIdx = $5;
        push @{$seqRanges{$seqID}}, [ $start, $end, $orient, $linupIdx, $bases ];
      }
    }
    undef $repseqs;
  
    #    - Merge ranges that overlap
    open( my $nodups_out, '>', "$tmpDir/repam-repseq-nodups.fa" )
        or die "Could not open $tmpDir/repam-repseq-nodups.fa for writing: $!\n";
    my $newIdx = 1;
    foreach my $seqID ( sort { $a cmp $b } keys(%seqRanges) ) {
      # Sort by start position
      $seqRanges{$seqID} = [ sort { $a->[0] <=> $b->[0] } @{$seqRanges{$seqID}} ];
  
      # For this sequence ID combine all ranges, merging overlapping ranges
      my @mergedRanges;
      foreach my $range ( @{$seqRanges{$seqID}} ) {
        my $last_merged = $mergedRanges[-1];
        if ( @mergedRanges && $range->[0] <= $last_merged->[1] ) {
          # Overlapping ranges
          if ( $range->[1] > $last_merged->[1] ) {
            # Overlapping + extending.
            #
            # The header coordinates are fully closed ( len = end-start+1
            # in the examples above ), so the first
            # ( merged_end - range_start + 1 ) bases of this range are
            # already present in the merged sequence; append only the
            # remainder.  The overlap must be computed *before* the merged
            # end is moved.
            #
            # NOTE(review): this treats the sequence as running in
            #               ascending coordinate order and keeps the
            #               orientation of the first range; confirm how
            #               "-" strand records should be merged.
            my $overlapLen = $last_merged->[1] - $range->[0] + 1;
            $last_merged->[4] .= substr( $range->[4], $overlapLen );
            # update end position
            $last_merged->[1] = $range->[1];
          }
          # add linupIdx to merged range
          $last_merged->[3] .= ",$range->[3]";
        }else{
          # Singleton range, thus far
          push @mergedRanges, [ @{$range} ];
        }
      }
      # Emit merged ranges
      foreach my $range ( @mergedRanges ) {      
        # src= lists every contributing input range as n<idx>:<id>:<element>
        my $desc = join( ",",
                         map { "n$_:$linupIdxToID[$_]:$linupIdxToElement[$_]" }
                         split( /,/, $range->[3] ) );
        # Reverse strand ranges are written with start/end swapped
        if ( $range->[2] eq "-" ) {
          my $t = $range->[0];
          $range->[0] = $range->[1];
          $range->[1] = $t;
        }
        print $nodups_out ">gi|$newIdx $seqID:$range->[0]-$range->[1] src=$desc\n$range->[4]\n";
        $newIdx++;
      }
    }
    close $nodups_out;
    undef %seqRanges;
  }else {
    # too short of an extension
    $newCons = "";
  }
  return ( $leftExt, $rightExt, $newCons, "$tmpDir/repam-repseq-nodups.fa", $warnings );
}

# readConsensusFromFile:
#   Read a fasta file and return its sequence data as one string with all
#   whitespace removed.  Header lines ( starting with ">" ) are skipped; if
#   the file holds several records their sequences are concatenated.
#
#   Parameter: $file - path of the fasta file
#   Returns:   the sequence string, or undef if the file has no sequence
#              lines
#   Dies if the file cannot be opened.
sub readConsensusFromFile {
   my $file = shift;

   # Lexical handle + three-argument open: the path is taken literally and
   # no global bareword handle is shared with other readers.
   open( my $fh, '<', $file )
       or die "readConsensusFromFile: Could not open $file: $!\n";
   my $seq;
   while ( my $line = <$fh> ) {
     next if ( $line =~ /^>/ );
     $line =~ s/[\n\r\s]+//g;
     $seq .= $line;
   }
   close $fh;
   return $seq;
}

##
## readFastaFile
##
##   Read a multi-record FASTA file into memory.
##
##   Parameters:
##     $file : Path of the FASTA file to read.
##
##   Returns:
##     A reference to an array of [ $id, $sequence ] pairs in file order.
##     $id is everything following ">" on the header line (identifier and
##     description) with trailing whitespace/CR removed.  $sequence has all
##     whitespace removed.  Records with an empty sequence are not
##     returned.
##
##   Dies if the file cannot be opened.
##
sub readFastaFile {
   my $file = shift;

   # Three-argument open with a lexical handle so the file name is taken
   # literally and no global bareword handle is shared.
   open( my $IN, '<', $file )
       or die "readFastaFile: Could not open $file: $!\n";

   my @result;
   my $seq = "";
   my $id;
   while ( my $line = <$IN> ) {
     if ( $line =~ />(\S.*)/ ) {
       my $tID = $1;

       # "." matches "\r", so files with CRLF line endings would otherwise
       # carry a trailing carriage return in every identifier.
       $tID =~ s/\s+$//;

       # Flush the previous record ( if it had any sequence )
       push @result, [ $id, $seq ] if ( $seq ne "" );
       $id  = $tID;
       $seq = "";
       next;
     }
     $line =~ s/\s+//g;
     $seq .= $line;
   }

   # Flush the final record
   push @result, [ $id, $seq ] if ( $seq ne "" );
   close $IN;
   return [ @result ];
}

#
# bootstrapConsensus
#
#   Identify the sequence which is most similar to all other instances
#   and use it as the reference for a first multiple alignment/consensus.
#   This method is greedy: the reference is whatever
#   findHighestScoringAlignmentSet() selects from the all-vs-all search
#   results ( per the original notes, the sequence which aligns the most
#   bp of all other sequences based on the sum of aligned scores ).
#
#   Parameters ( positional ):
#     $matrixDir       : Directory containing the "ncbi/nt" matrices.
#     $srchEngAllVsAll : Search engine object; configured and run here.
#     $pathToFastaFile : FASTA file of instances ( used as both the query
#                        and the subject; a BLAST database is built
#                        in place and removed afterwards unless $DEBUG ).
#     $htmlFile        : If true, name of a file ( relative to the
#                        file-level $tmpDir ) to append the HTML rendering
#                        of the alignment to.
#     $wrkDir          : Working directory for logs/temporary files.
#     $db              : Ignored on input; replaced by a FastaDB opened
#                        on $pathToFastaFile when hits are found.
#
#   Returns:
#     ( $refID, $cons, $numHSPs, $numUnAlignSeqs )
#       - ( "", "", 0, 0 ) if no reference sequence could be identified.
#       - $refID is undef and $cons is "" if the search engine reported
#         an error.
#       - $cons has all "-" characters removed.
#
sub bootstrapConsensus {
  my $matrixDir       = shift;
  my $srchEngAllVsAll = shift();
  my $pathToFastaFile = shift();
  my $htmlFile        = shift();
  my $wrkDir          = shift();
  my $db              = shift();

  my $cons = "";

  # MAKEBLASTDB the database
  system(   "$NCBIBLASTDB_PRGM -blastdb_version 4 -out $pathToFastaFile "
          . "-dbtype nucl -in $pathToFastaFile >> "
          . "$wrkDir/makeblastdb.log 2>&1" );

  ## Setup the ALL vs ALL parameters 
  $srchEngAllVsAll->setMatrix( "$matrixDir/ncbi/nt/comparison.matrix" );
  $srchEngAllVsAll->setMinScore( 150 );
  $srchEngAllVsAll->setGenerateAlignments( 1 );
  $srchEngAllVsAll->setGapInit( -25 );
  $srchEngAllVsAll->setInsGapExt( -5 );
  $srchEngAllVsAll->setDelGapExt( -5 );
  $srchEngAllVsAll->setMinMatch( 7 );
  $srchEngAllVsAll->setCores( $options{'threads'} ? $options{'threads'} : undef );
  #$srchEngAllVsAll->setBandwidth( -50 );
  #$srchEngAllVsAll->setTempDir( dirname( $pathToFastaFile ) );
  $srchEngAllVsAll->setTempDir( $wrkDir );
  $srchEngAllVsAll->setScoreMode( SearchEngineI::complexityAdjustedScoreMode );


  # 1. Search all instances against each other
  $srchEngAllVsAll->setQuery( $pathToFastaFile );
  $srchEngAllVsAll->setSubject( $pathToFastaFile );

  print $CLASS
      . "::bootstrapConsensus(): Running All-vs-All "
      . "on $pathToFastaFile \n"
      if ( $DEBUG > 8 );

  print "Params: " . $srchEngAllVsAll->getParameters() . "\n"
            if ( $DEBUG > 8 );
  my ( $runStat, $familyCollection ) = $srchEngAllVsAll->search();

  print "Search complete\n" if ( $DEBUG > 8 );

  ## Cleanup after run
  # Remove ncbi databases.  unlink() is used rather than shelling out to
  # "rm" so that $? is not overwritten before it is reported in the error
  # branch below, and so that a database file which was never created
  # ( e.g. .njs ) is skipped silently.
  unlink( map { "$pathToFastaFile.$_" } qw( nin nhr nsq njs ) )
      unless ( $DEBUG );

  print $CLASS
      . "::bootstrapConsensus(): Returned "
      . $familyCollection->size()
      . " hits\n"
      if ( $DEBUG > 8);

  my $refID;
  my $numUnAlignSeqs = 0;
  my $numHSPs = 0;

  if ( $runStat ) {
    print STDERR "\nERROR from search engine (", $? >> 8, ") \n";
  }
  else {
    ## Find highest scoring element

    $refID =
        &findHighestScoringAlignmentSet( searchCollection => $familyCollection );


    # 11/21/24: RMH: Could not identify a reference sequence
    if ( !defined $refID || $refID eq "" ) {
      return ( "", "", 0, 0 );
    }

    ##
    ## Prototype
    ##
    #if ( 1 ) {
    #  print "#\n";
    #  print "# PROTOTYPE: running Tensor Sketch to caculate distances...\n";
    #  print "#\n";
    #  system("$FindBin::RealBin/sketch --tuple_length 3 --embed_dim 16 --window_size 30 --stride 3 --alphabet dna5 -i $pathToFastaFile -o $wrkDir/sketch-dist.tsv >& /dev/null");
    #  open TSS,"<$wrkDir/sketch-dist.tsv" or die "Could not open $wrkDir/sketch-dist.tsv\n";
    #  my %tssSumDistsHash = ();
    #  my @tssSumDistsArray = ();
    #  while ( <TSS> ){
    #    if ( /^(\S+)\s+([\d\.]+)/ ) {
    #      push @tssSumDistsArray, [$1,$2];
    #      $tssSumDistsHash{$1} = $2;
    #    }
    #  }
    #  close TSS;
    #  @tssSumDistsArray = sort { $a->[1] <=> $b->[1] } @tssSumDistsArray;
    #  print "# Ten lowest sum distances according to TSS:\n";
    #  my $inTop = 0;
    #  for ( my $l = 0; $l < 5; $l++ ) {
    #    print "#  " . $tssSumDistsArray[$l]->[0] . "    " . $tssSumDistsArray[$l]->[1] . "\n";
    #    $inTop = 1 if ( $tssSumDistsArray[$l]->[0] eq $refID );
    #  }
    #  print "#\n";
    #  print "# Highest scoring sequence according to blast: $refID\n";
    #  print "#\n";
    #  #print "# Overriding refID with TSS result\n";
    #  #$refID = $tssSumDistsArray[0]->[0];
    #}

    $numHSPs = $familyCollection->size();
    if ( $familyCollection->size() > 0 ) {

      $db = FastaDB->new( fileName => $pathToFastaFile,
                          openMode => SeqDBI::ReadOnly );

      # Grab the reference sequence
      my $refSeq = $db->getSequence( $refID );

      # Develop a pseudo multiple alignment based on the
      # remaining common sequence alignments.
      my $malign = MultAln->new(
                              referenceSeq              => $refSeq,
                              searchCollection          => $familyCollection,
                              searchCollectionReference => MultAln::Query,
                              flankingSequenceDatabase  => $db,
                              maxFlankingSequenceLen    => -1
      );

      # 20190605: We do want to include the reference in round-1 because it's
      # a real input sequence.  Also, matrix was unused here.
      #$cons = $malign->consensus( "$matrixDir/linupmatrix" );
      $cons = $malign->consensus( inclRef => 1 );

      # Collect the names of every sequence appearing in a hit so the
      # remaining ( unalignable ) sequences can be reported.
      my %seqNames = ();
      my $qryID    = $familyCollection->get( 0 )->getQueryName();
      $seqNames{$qryID} = 1;
      for ( my $j = 0 ; $j < $familyCollection->size() ; $j++ ) {
        my $sbjID = $familyCollection->get( $j )->getSubjName();
        $seqNames{$sbjID} = 1;
      }
      my $unAlignSeqs    = "";
      foreach my $seqID ( $db->getIDs() ) {
        unless ( exists $seqNames{$seqID} ) {
          $unAlignSeqs .=
              "Unaligned ( $seqID ): " . $db->getSequence( $seqID ) . "\n";
          $numUnAlignSeqs++;
        }
      }

      #$malign->serializeOUT( "$wrkDir/in-cons.malign" );

      print $CLASS
          . "::bootstrapConsensus(): "
          . $familyCollection->size()
          . " elements left after initial multiple alignment\n"
          if ( $DEBUG > 8 );

      ## Build Consensus ( returned ungapped )
      $cons =~ s/-//g;

      if ( $htmlFile ) {
        # Write the multiple alignment to a file
        my (
           $leftMostSequence,  $leftMostSequenceID,
           $rightMostSequence, $rightMostSequenceID
          )
          = &getLongestFlankingExtension( multAln => $malign );
        my $MOUT;
        open( $MOUT, '>>', "$tmpDir/$htmlFile" )
            or die "Could not open $tmpDir/$htmlFile for writing!\n";
        print $MOUT "<H2>Iteration 1 : Bootstrap</H2>\n";
         writeHTMLMultAlign(
                            multAln         => $malign,
                            inclRef         => 1,
                            destination     => $MOUT,
                            leftFlankingID  => $leftMostSequenceID,
                            rightFlankingID => $rightMostSequenceID
        );
        if ( $numUnAlignSeqs > 0 ) {
          print $MOUT "$numUnAlignSeqs sequences could not be aligned to "
            . "this particular reference sequence:\n";
          print $MOUT "<PRE>\n$unAlignSeqs\n</PRE>\n";
        }
        close $MOUT;
      }

      undef $malign;

    }
  }

  return ( $refID, $cons, $numHSPs, $numUnAlignSeqs);
}


##
## refineConsensus
##
##   Align all instances to the current consensus, build a multiple
##   alignment from the hits and call a new consensus from it.
##
##   Parameters ( positional ):
##     $matrixDir       : Directory containing the matrices.
##     $srchEngOneVsAll : Search engine object; configured and run here.
##     $cons            : Current consensus ( MultAln reference sequence ).
##     $consFile        : FASTA file holding the consensus; used as the
##                        search subject.  A BLAST database is built
##                        in place and removed afterwards unless $DEBUG.
##     $instFile        : FASTA file of instances; used as the query.
##     $htmlFile        : If true, name of a file ( relative to the
##                        file-level $tmpDir ) to append the HTML rendering
##                        of the alignment to.
##     $wrkDir          : Working directory for logs/temporary files.
##     $db              : Sequence database of the instances; used for
##                        flanking sequence and to list unaligned
##                        sequences.
##
##   Returns:
##     ( $newCons, $avgKDiv, $numHSPs, $numUnAlignSeqs,
##       $finalMultiAlignmentSize, $malign, $sumScore )
##     $newCons is returned as produced by MultAln::consensus() ( gapped ).
##     $avgKDiv is the CpG-modified Kimura divergence.  If the search
##     engine reports an error $newCons is "", $malign is undef and the
##     numeric values other than $numHSPs are 0.
##
sub refineConsensus {
  my $matrixDir       = shift;
  my $srchEngOneVsAll = shift();
  my $cons            = shift();
  my $consFile        = shift();
  my $instFile        = shift();
  my $htmlFile        = shift();
  my $wrkDir          = shift();
  my $db              = shift();
  
  ## Setup the ONE vs ALL parameters 
  $srchEngOneVsAll->setMatrix( "$matrixDir/ncbi/nt/comparison.matrix" );
  $srchEngOneVsAll->setMinScore( 150 );
  $srchEngOneVsAll->setGenerateAlignments( 1 );
  $srchEngOneVsAll->setGapInit( -25 );
  $srchEngOneVsAll->setInsGapExt( -5 );
  $srchEngOneVsAll->setDelGapExt( -5 );
  $srchEngOneVsAll->setMinMatch( 7 );
  $srchEngOneVsAll->setCores( $options{'threads'} ? $options{'threads'} : undef);
  #$srchEngOneVsAll->setTempDir( dirname( $pathToFastaFile ) );
  $srchEngOneVsAll->setTempDir( $wrkDir );
  $srchEngOneVsAll->setScoreMode( SearchEngineI::complexityAdjustedScoreMode );
  $srchEngOneVsAll->setMaskLevel( 80 );

  # Determine if the version of RMBlast supports query threading
  # ( enabled here for versions 2.13 and later ).
  my $rmblast_version = $srchEngOneVsAll->getVersion();
  if ( $rmblast_version =~ /(\d+)\.(\d+)\.(\d+)\+/ ) {
    my ( $majorVer, $minorVer ) = ( $1, $2 );
    if ( $majorVer > 2 || ($majorVer == 2 && $minorVer >= 13 )) {
      $srchEngOneVsAll->setThreadByQuery(1);
    }
  }

  # MAKEBLASTDB the consensus file
  system(   "$NCBIBLASTDB_PRGM -blastdb_version 4 -out "
                . "$consFile "
                . "-dbtype nucl -in "
                . "$consFile >> "
                . "$wrkDir/makeblastdb.log 2>&1" );

  $srchEngOneVsAll->setSubject( $consFile );
  $srchEngOneVsAll->setQuery( $instFile );

  print $CLASS
            . "::refineConsensus(): Cons-vs-All "
            . "$consFile\n"
            if ( $DEBUG > 8 );

  print "Params: " . $srchEngOneVsAll->getParameters() . "\n"
            if ( $DEBUG > 8 );
  my ( $runStat, $familyCollection ) = $srchEngOneVsAll->search();

  # Remove the ncbi databases.  unlink() is used rather than shelling out
  # to "rm" so that $? is not overwritten before it is reported in the
  # error branch below.
  unlink( glob( "$consFile.n*" ) ) unless ( $DEBUG );

  print $CLASS
            . "::refineConsensus(): RMBlast returned "
            . $familyCollection->size()
            . " hits\n"
            if ( $DEBUG > 8 );

  my $sumScore = 0;
  my $malign;
  my $newCons = "";
  my $avgKDiv = 0;
  my $numHSPs = $familyCollection->size();
  my $numUnAlignSeqs = 0;
  my $finalMultiAlignmentSize = 0;

  if ( $runStat ) {
    print STDERR "\nERROR from search engine (", $? >> 8, ") \n";
  }else {
    # Create multiple alignment of results
    $malign = MultAln->new(
                            referenceSeq     => $cons,
                            searchCollection => $familyCollection,
                            searchCollectionReference => MultAln::Subject,
                            flankingSequenceDatabase  => $db,
                            maxFlankingSequenceLen    => -1

          );

    # Build consensus
    # NOTE(review): bootstrapConsensus() records that the matrix argument
    # to consensus() was unused -- confirm whether it is still needed here.
    $newCons = $malign->consensus( "$matrixDir/linupmatrix" );
    my ( $avgKimura, $avgKimuraMod, $totTransitions, $totTransitionsMod, $totTransversions, $wellCharacterized, $CpGSites ) = $malign->calculateKimuraDivergence( consensus => $newCons );
    #print "ROUND: avgKimura = $avgKimura, avgKimuraMod = $avgKimuraMod, totTransitions = $totTransitions, totTransitionsMod = $totTransitionsMod, totTransversions = $totTransversions, wellCharacterized = $wellCharacterized, CpGSites = $CpGSites\n";
    # This now uses the CpG modified Kimura 
    $avgKDiv = $avgKimuraMod;

    # TODO: FIX
    #$malign->serializeOUT( "$wrkDir/$consName-malign-$round.ser" )
    #          if ( $DEBUG );
    
    # Determine which sequences were not aligned.
    my %seqNames = ();

    # Only look at the first hit when there is one; an empty result set
    # simply means every sequence is reported as unaligned.
    if ( $familyCollection->size() > 0 ) {
      my $sbjID = $familyCollection->get( 0 )->getSubjName();
      $seqNames{$sbjID} = 1;
    }
    for ( my $j = 0 ; $j < $familyCollection->size() ; $j++ ) {
      my $qryID = $familyCollection->get( $j )->getQueryName();
      $sumScore += $familyCollection->get( $j )->getScore();
      $seqNames{$qryID} = 1;
    }
    my $unAlignSeqs = "";
    foreach my $seqID ( $db->getIDs() ) {
      unless ( exists $seqNames{$seqID} ) {
        $unAlignSeqs .=
                  "Unaligned ( $seqID ): " . $db->getSequence( $seqID ) . "\n";
        $numUnAlignSeqs++;
      }
    }
    # Find out how many uniq sequences made it into the
    # final multiple alignment.
    my %uniqSeqIDs = ();
    foreach my $seqNum ( 0 .. $malign->getNumAlignedSeqs() - 1 ) {
      $uniqSeqIDs{ $malign->getAlignedName( $seqNum ) } = {
                               consStart => $malign->getAlignedStart( $seqNum ),
                               consEnd   => $malign->getAlignedEnd( $seqNum )
            };
    }


    if ( $htmlFile ) {
      # Write out the details of the multiple alignment
      my $MOUT;
      open( $MOUT, '>>', "$tmpDir/$htmlFile" )
          or die "Could not open $tmpDir/$htmlFile for writing!\n";
      writeHTMLMultAlign(
                        multAln         => $malign,
                        destination     => $MOUT,
                        #leftFlankingID  => $leftMostSequenceID,
                        #rightFlankingID => $rightMostSequenceID
              );

      if ( $numUnAlignSeqs > 0 ) {
        print $MOUT "$numUnAlignSeqs sequences could not be aligned to "
                  . "this reference sequence:\n";
        print $MOUT "<PRE>\n$unAlignSeqs\n</PRE>\n";
      }
      close $MOUT;
    }

    # TODO: Get alignment position ( ie. consensus pos ) so we can
    #       store this in with the final instances.
    $finalMultiAlignmentSize = keys( %uniqSeqIDs );
  }

  # Note: returning gapped cons
  return ( $newCons, $avgKDiv, $numHSPs, $numUnAlignSeqs, $finalMultiAlignmentSize, $malign, $sumScore);
}


#
# adjustIdentifiers
#
#   Rewrite the names, orientations and sequence coordinates stored in a
#   MultAln so they can be used to generate the final STK file, and drop
#   entries which end up with an identical "id:start-end".
#
#   Parameters ( positional ):
#     $malign : MultAln object; modified in place.
#     $db     : Sequence database queried for each aligned name's
#               description.
#
#   Coordinates are taken from the description, e.g.
#       hg38:chr1:300-400
#       chr1:300-400
#       gi|1:300-400
#       gi-1:300-400
#   or, failing that, from an identifier of the form LTR Retriever
#   prints:
#       gi-1:11224114..11230723_INT#LTR/Gypsy
#
#   Per the notes carried with this routine, the stored ranges are
#   one-based, fully-closed, and written end-before-start when the
#   sequence was reverse complemented.
#
sub adjustIdentifiers {
  my ( $malign, $db ) = @_;

  my %seenFinalIDs;

  # Walk the alignment from the last entry to the first: entries are
  # spliced out as duplicates are found and a forward walk would skip the
  # element following each removal ( 01/31/22: RMH ).
  foreach my $idx ( reverse( 0 .. $malign->getNumAlignedSeqs() - 1 ) ) {
    my $alignedName = $malign->getAlignedName( $idx );
    my $description = $db->getDescription( $alignedName );
    my $orientation = $malign->getAlignedOrientation( $idx );

    # Work out the source identifier and ( if available ) its range
    my ( $seqID, $start, $end );
    if ( $description =~ /^\s*(\S+:)?(\S+):(\d+)-(\d+)\s*.*$/ ) {
      # $1 is the optional "assembly:" prefix which is not used
      ( $seqID, $start, $end ) = ( $2, $3, $4 );
    }
    elsif ( $alignedName =~ /^\s*(\S+):(\d+)\.\.(\d+)/ ) {
      ( $seqID, $start, $end ) = ( $1, $2, $3 );
    }
    else {
      # We don't have a recognizable identifier
      $seqID = $alignedName;
    }

    my $haveRange = ( $seqID ne "" && $start ne "" && $end ne "" );
    if ( $haveRange ) {
      # Optionally translate the identifier through the gi lookup table
      $seqID = $genomeDBToSeqID{$seqID}
          if ( $options{'giToID'} && exists $genomeDBToSeqID{$seqID} );
      $malign->setAlignedName( $idx, $seqID );

      my $rangeIsReversed = ( $end < $start );
      my $alignedIsMinus  = ( $orientation eq "C" || $orientation eq "-" );

      # The final strand is "-" when exactly one of the stored range and
      # the alignment itself is reversed; two reversals cancel out.
      my $finalOrient = ( $rangeIsReversed xor $alignedIsMinus ) ? "-" : "+";
      $malign->setAlignedOrientation( $idx, $finalOrient );

      my ( $adjStart, $adjEnd );
      if ( $rangeIsReversed ) {
        # Offsets are subtracted from the ( higher ) stored start.
        # AlignedStart is always lower than AlignedEnd in the MALIGN obj
        # so the two ends swap roles.
        $adjEnd   = $start - $malign->getAlignedSeqStart( $idx ) + 1;
        $adjStart = $start - $malign->getAlignedSeqEnd( $idx ) + 1;
      }
      else {
        $adjStart = $start + $malign->getAlignedSeqStart( $idx ) - 1;
        $adjEnd   = $start + $malign->getAlignedSeqEnd( $idx ) - 1;
      }
      $malign->setAlignedSeqStart( $idx, $adjStart );
      $malign->setAlignedSeqEnd( $idx, $adjEnd );
    }
    elsif ( $description ) {
      warn " INFO: $seqID descriptive text ignored: $description\n";
    }

    # Keep only the first entry seen ( i.e. the highest index ) for any
    # given "id:start-end".
    my $finalID = join( "-",
                        "$seqID:" . $malign->getAlignedSeqStart( $idx ),
                        $malign->getAlignedSeqEnd( $idx ) );
    if ( exists $seenFinalIDs{$finalID} ) {
      # TODO: make this a function of MultAln
      # NOTE(review): index $idx + 1 presumably skips a reference entry
      # held at alignCol[0] -- confirm against MultAln.
      splice( @{ $malign->{'alignCol'} }, $idx + 1, 1 );
    }
    $seenFinalIDs{$finalID} = 1;
  }
}

##############################################################################
##############################################################################

##-------------------------------------------------------------------------##
##
##  Use: writeHTMLMultAlign( multAln => $multAlignRef,
##                           [destination => $filename|$FH],
##                           [inclRef => 1],
##                           [leftFlankingID => #],
##                           [rightFlankingID => #],
##                           [threshold => #],
##                           [printDupDelCols => 1],
##                           [printHistogram => 1],
##                           [newConsBlocks => \@blocks],
##                           [finalConsensus => $seq] );
##
##  Write an HTML rendering of a multiple alignment: a header/style
##  block followed by a <PRE> section containing the per-column low
##  quality scores, the consensus ( differences from the reference in
##  red ), the reference and one row per aligned sequence with its
##  flanking sequence in lower case.
##
##    multAln         : MultAln object to render ( required ).
##    destination     : An open filehandle, or a file name.  A file name
##                      is opened for writing ( truncating ) unless it
##                      carries a leading ">" or ">>" mode as accepted by
##                      two-argument open; the file is closed before
##                      returning.  Default: STDOUT.
##    inclRef         : If true, include the reference when calling the
##                      consensus.
##    leftFlankingID,
##    rightFlankingID : Values as returned by
##                      getLongestFlankingExtension(); the matching
##                      row's flanking sequence is coloured blue.
##    threshold       : Passed to getLowScoringAlignmentColumns().
##    printDupDelCols : If 1, highlight deletion/duplication candidates
##                      rather than low scoring columns.
##    printHistogram  : If defined, also print the raw sequences of each
##                      low scoring block, plus the newConsBlocks and
##                      finalConsensus rows when those are supplied.
##
##  Returns nothing useful.
##
##-------------------------------------------------------------------------##
sub writeHTMLMultAlign {
  my %parameters = @_;

  my $method = "writeHTMLMultAlign";
  croak $CLASS. "::$method() missing multAln parameter!\n"
      if ( !exists $parameters{'multAln'} );

  my $mAlign = $parameters{'multAln'};

  # Honour the value passed: "inclRef => 0" must not enable the option.
  my $inclRef = $parameters{'inclRef'} ? 1 : 0;

  my $OUT        = *STDOUT;
  my $openedHere = 0;
  if ( defined $parameters{'destination'} ) {
    if ( ref( $parameters{'destination'} ) !~ /GLOB|FileHandle/ ) {
      print $CLASS
          . "::$method() Opening file "
          . $parameters{'destination'} . "\n"
          if ( $DEBUG );

      # Open a private handle for writing.  Re-using $OUT here would
      # reopen STDOUT itself, and a bare file name would be opened
      # read-only.  A leading ">"/">>" is still honoured for callers
      # which relied on two-argument open semantics.
      my ( $mode, $path ) = ( '>', $parameters{'destination'} );
      if ( $path =~ /^\s*(>>|>)\s*(.+)$/s ) {
        ( $mode, $path ) = ( $1, $2 );
      }
      my $fh;
      open( $fh, $mode, $path )
          or die $CLASS
          . "::$method: Unable to open "
          . "results file: $parameters{'destination'} : $!";
      $OUT        = $fh;
      $openedHere = 1;
    }
    else {
      $OUT = $parameters{'destination'};
    }
  }

  print $OUT <<"END";
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
  <title>Alignments</title>
  <style type="text/css">
  font.lowQual {
    background: #CD5C5C;
  }
  font.deletion {
    background: #FF0000;
  }
  font.duplication {
    background: #0000FF;
  }
  font.unknown {
    background: #FFFF00;
  }
  font.dupFlank {
    background: #C0C0C0;
  }
  </style>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
</head>
END

  # Find max padding.
  # ( $maxRightLen is computed for symmetry but is not currently used. )
  my $maxLeftLen  = 0;
  my $maxRightLen = 0;
  my $maxQueryEnd = length( $mAlign->getReferenceSeq() );
  foreach my $seqNum ( 0 .. $mAlign->getNumAlignedSeqs() - 1 ) {
    my $relLeftLen =
        length( $mAlign->getLeftFlankingSequence( $seqNum ) ) -
        $mAlign->getAlignedStart( $seqNum );
    my $relRightLen =
        length( $mAlign->getRightFlankingSequence( $seqNum ) ) -
        ( $maxQueryEnd - $mAlign->getAlignedEnd( $seqNum ) );
    $maxLeftLen  = $relLeftLen  if ( $maxLeftLen < $relLeftLen );
    $maxRightLen = $relRightLen if ( $maxRightLen < $relRightLen );
  }

  ## Calculate Del/Dup candidates
  my @dupDelCols = ();
  if ( $parameters{'printDupDelCols'} == 1 ) {
    my $endStartPairs = $mAlign->_getEndStartPairs();
    foreach my $pair ( @{$endStartPairs} ) {
      if ( abs( $pair->{'refEnd'} - $pair->{'refStart'} ) < 50 ) {
        my $adjStart = 0;
        my $adjEnd   = 0;

        # Order the pair so that absStart <= absEnd
        my $absStart = $pair->{'refEnd'};
        my $absEnd   = $pair->{'refStart'};
        if ( $pair->{'refEnd'} > $pair->{'refStart'} ) {
          $absStart = $pair->{'refStart'};
          $absEnd   = $pair->{'refEnd'};
        }
        $adjStart = $mAlign->getAlignPosFromBPPos( $absStart );
        $adjEnd   = $mAlign->getAlignPosFromBPPos( $absEnd );
        print " adjStart = $adjStart adjEnd=$adjEnd\n" if ( $DEBUG );
        if ( $pair->{'avgGapWidth'} > 30 ) {
          print "Calling a deletion\n";

          # Deletion
          push @dupDelCols,
              {
                'start' => $adjStart,
                'end'   => $adjEnd,
                'type'  => "deletion"
              };

        }
        elsif ( $pair->{'avgGapWidth'} < 0 ) {

          # Duplication: mark the duplicated span and the flanks on
          # either side ( each abs( avgGapWidth ) bp wide ).
          print "Calling a duplication\n";
          my $flnkStart =
              $mAlign->getAlignPosFromBPPos(
                                    $absStart - abs( $pair->{'avgGapWidth'} ) );
          my $flnkEnd = $mAlign->getAlignPosFromBPPos( $absStart - 1 );
          push @dupDelCols,
              {
                'start' => $flnkStart,
                'end'   => $flnkEnd,
                'type'  => 'dupFlank'
              };
          push @dupDelCols,
              {
                'start' => $adjStart,
                'end'   => $adjEnd,
                'type'  => "duplication"
              };
          $flnkStart = $mAlign->getAlignPosFromBPPos( $absEnd + 1 );
          $flnkEnd   =
              $mAlign->getAlignPosFromBPPos(
                                      $absEnd + abs( $pair->{'avgGapWidth'} ) );
          push @dupDelCols,
              {
                'start' => $flnkStart,
                'end'   => $flnkEnd,
                'type'  => 'dupFlank'
              };
        }
        else {

          # Unknown
          print "Calling an Unknown\n";
          push @dupDelCols,
              {
                'start' => $adjStart,
                'end'   => $adjEnd,
                'type'  => "unknown"
              };
        }
      }
    }
  }

  ## Calculate low scoring columns
  my $matrix = SequenceSimilarityMatrix->new();
  $matrix->parseFromFile(
                    "$FindBin::RealBin/Matrices/wublast/nt/comparison.matrix" );
  my $columns;
  my $scoreArray;
  if ( defined $parameters{'threshold'} ) {
    ( $columns, $scoreArray ) = $mAlign->getLowScoringAlignmentColumns(
                                           matrix    => $matrix,
                                           threshold => $parameters{'threshold'}
    );
  }
  else {
    ( $columns, $scoreArray ) =
        $mAlign->getLowScoringAlignmentColumns( matrix => $matrix );
  }

  #print "Low scoring columns:\n";
  #foreach my $col ( @{$columns} ) {
  #  print "Col  start = " . $col->[0] . " end = " . $col->[1] . "\n";
  #}

  print $OUT "<PRE>\n";

  # Print out scoreArray just for fun.  Each score is formatted to one
  # decimal place and written vertically ( one character per output line )
  # above its alignment column; zero scores are left blank.
  #   find the largest/smallest score
  my $max = 0;
  for ( my $j = 0 ; $j <= $#{$scoreArray} ; $j++ ) {
    my $num = sprintf( "%0.1f", $scoreArray->[ $j ] );
    $max = length( $num ) if ( $max < length( $num ) );
    $scoreArray->[ $j ] = $num;
  }
  my $numCols = $max;
  my @lines   = ();
  foreach my $num ( @{$scoreArray} ) {
    my $paddedNum = ' ' x ( $numCols );
    if ( $num != 0 ) {
      $paddedNum = ' ' x ( $numCols - length( $num ) ) . $num;
    }
    for ( my $j = 0 ; $j < $numCols ; $j++ ) {
      $lines[ $j ] .= substr( $paddedNum, $j, 1 );
    }
  }
  my $label = "lowQualScore";
  my $paddedLabel = $label . " " x ( 30 - length( $label ) );
  foreach my $line ( @lines ) {
    print $OUT $paddedLabel . ": " . ' ' x ( $maxLeftLen ) . $line . "\n";
  }

  # First print the consensus sequence
  my $name    = "consensus";
  my $namePad = 30 - length( $name );
  my $seq;

# 20190605: Allow user to specify inclusion of the reference in consensus calling
#    $mAlign->consensus(
#                  "$FindBin::RealBin/Matrices/linupmatrix" );
  if ( $inclRef ) {
    $seq = $mAlign->consensus( inclRef => 1 );
  }
  else {
    $seq = $mAlign->consensus();
  }
  my $rseq     = $mAlign->getReferenceSeq();
  print $OUT "<b><i>$name</i></b>"
      . ' ' x $namePad . ": "
      . ' ' x ( $maxLeftLen )
      . "<font color=\"blue\">";

  # Consensus bases which differ from the reference are shown in red
  for ( my $i = 0 ; $i < length( $seq ) ; $i++ ) {
    my $cbase = substr( $seq, $i, 1 );
    my $rbase = substr( $rseq, $i, 1 );
    if ( $cbase eq $rbase ) {
      print $OUT "$cbase";
    }else {
      print $OUT "</font><font color=\"red\">$cbase</font><font color=\"blue\">";
    }
  }
  print $OUT "</font>\n";

  # Then the reference
  $name    = "Reference ( " . $mAlign->getReferenceName() . " )";
  $namePad = 30 - length( $name );
  $seq     = $rseq;
  print $OUT "<b><i>$name</i></b>"
      . ' ' x $namePad . ": "
      . ' ' x ( $maxLeftLen )
      . "<font color=\"blue\">$seq</font>\n";

  # Now print the instances.
  for ( my $i = 0 ; $i < $mAlign->getNumAlignedSeqs() ; $i++ ) {
    $name = $mAlign->getAlignedName( $i );
    if ( length( $name ) > 30 ) {
      $name = substr( $name, 0, 30 );
    }
    $namePad = 30 - length( $name );

    my $lfSeq = $mAlign->getLeftFlankingSequence( $i );

    # Pad so that the left flank ends where the aligned sequence starts
    my $row =
        ' ' x
        ( $maxLeftLen - ( length( $lfSeq ) - $mAlign->getAlignedStart( $i ) ) );

    # The flanking IDs handed out by getLongestFlankingExtension() are
    # offset by -1 relative to the aligned sequence index.
    if ( $parameters{'leftFlankingID'} eq $i - 1 ) {
      $row .= "<font color=\"blue\">" . lc( $lfSeq ) . "</font>";
    }
    else {
      $row .= lc( $lfSeq );
    }

    # Highlight dup/del candidates or low scoring columns
    if ( $parameters{'printDupDelCols'} == 1 ) {
      $row .= _markupAlignedColumns(
                 $mAlign->getAlignedSeq( $i ),
                 $mAlign->getAlignedStart( $i ),
                 [ map { [ $_->{'start'}, $_->{'end'}, $_->{'type'} ] }
                     @dupDelCols ] );
    }
    elsif ( $#{$columns} >= 0 ) {
      $row .= _markupAlignedColumns(
                 $mAlign->getAlignedSeq( $i ),
                 $mAlign->getAlignedStart( $i ),
                 [ map { [ $_->[ 0 ], $_->[ 1 ], "lowQual" ] } @{$columns} ] );
    }
    else {
      $row .= "<b>" . $mAlign->getAlignedSeq( $i ) . "</b>";
    }

    if ( $parameters{'rightFlankingID'} eq $i - 1 ) {
      $row .=
            "<font color=\"blue\">"
          . lc( $mAlign->getRightFlankingSequence( $i ) )
          . "</font>";
    }
    else {
      $row .= lc( $mAlign->getRightFlankingSequence( $i ) );
    }

    print $OUT "<b>$name</b>" . ' ' x $namePad . ": $row\n";
  }

  if ( defined $parameters{'printHistogram'} ) {
    print $OUT "\n\n";

    # For each low scoring block collect the raw instance sequences,
    # longest first.
    my @columnSeqs = ();
    my $maxRows    = 0;
    foreach my $col ( @{$columns} ) {
      # NOTE: This may return null sequences (e.g. if seq in block is all gaps )
      my ( $cons, $seqsRef ) = $mAlign->getAlignmentBlock(
                                                           start => $col->[ 0 ],
                                                           end   => $col->[ 1 ],
                                                           rawSequences => 1
      );
      push @columnSeqs, [ sort { length( $b ) <=> length( $a ) } @{$seqsRef} ];
      $maxRows = $#{$seqsRef} + 1 if ( $maxRows < ( $#{$seqsRef} + 1 ) );
    }
    $label = "blockSeqs";
    $paddedLabel = $label . " " x ( 30 - length( $label ) );

    # One output row per rank; each block contributes its next longest
    # sequence ( "." for an empty one ) positioned under its columns.
    for ( my $i = 0 ; $i < $maxRows ; $i++ ) {
      my $row = " " x ( $maxLeftLen );
      my $pos = 0;
      for ( my $j = 0 ; $j <= $#{$columns} ; $j++ ) {
        my $col      = $columns->[ $j ];
        my $colWidth = $col->[ 1 ] - $col->[ 0 ] + 1;
        $row .= " " x ( $col->[ 0 ] - $pos );
        my $colSeqArray = $columnSeqs[ $j ];
        my $colSeq      = "";
        if ( $#{$colSeqArray} >= 0 ) {
          $colSeq = shift @{$colSeqArray};
          $colSeq = "." if ( $colSeq eq "" );
        }
        $row .= $colSeq . " " x ( $colWidth - length( $colSeq ) );
        $pos = $col->[ 1 ] + 1;
      }
      print $OUT "$paddedLabel: $row\n";
    }
    print $OUT "\n\n";
    if ( defined $parameters{'newConsBlocks'} ) {
      my $newConsBlocks = $parameters{'newConsBlocks'};
      $label = "blockSeqCons";
      $paddedLabel = $label . " " x ( 30 - length( $label ) );
      my $row = " " x ( $maxLeftLen );
      my $pos = 0;
      for ( my $j = 0 ; $j <= $#{$newConsBlocks} ; $j++ ) {
        my $colWidth =
            $newConsBlocks->[ $j ]->{'end'} -
            $newConsBlocks->[ $j ]->{'start'} + 1;
        $row .= " " x ( $newConsBlocks->[ $j ]->{'start'} - $pos );
        my $colSeq = $newConsBlocks->[ $j ]->{'cons'};
        $row .=
              "<font color=\"blue\">" . $colSeq
            . "</font>"
            . "*" x ( $colWidth - length( $colSeq ) );
        $pos = $newConsBlocks->[ $j ]->{'end'} + 1;
      }
      print $OUT "$paddedLabel: $row\n";
    }
    if ( defined $parameters{'finalConsensus'} ) {
      $label = "originalCons";
      $paddedLabel = $label . " " x ( 30 - length( $label ) );
      my $row =
            " " x ( $maxLeftLen )
          . "<font color=\"red\">"
          . $mAlign->consensus( "$FindBin::RealBin/Matrices/linupmatrix" )
          . "</font>";
      print $OUT "$paddedLabel: $row\n";
      $label = "finalCons";
      $paddedLabel = $label . " " x ( 30 - length( $label ) );
      $row =
            " " x ( $maxLeftLen )
          . "<font color=\"blue\">"
          . $parameters{'finalConsensus'}
          . "</font>";
      print $OUT "$paddedLabel: $row\n";
    }
  }

  print $OUT "</PRE>\n";

  # Only close handles this routine opened itself
  close $OUT if ( $openedHere );

}

##
## _markupAlignedColumns - private helper for writeHTMLMultAlign
##
##   Wrap an aligned sequence in <b>...</b>, enclosing each given column
##   span in <font class="...">.
##
##     $alignedSeq   : The aligned ( gapped ) sequence string.
##     $alignedStart : Alignment column at which $alignedSeq begins.
##     $spans        : Array ref of [ startCol, endCol, cssClass ] in
##                     alignment column coordinates.
##
##   Spans lying entirely before the sequence are skipped; a span starting
##   before the sequence is clipped to its first position.
##
sub _markupAlignedColumns {
  my ( $alignedSeq, $alignedStart, $spans ) = @_;

  my $seqLen = length( $alignedSeq );
  my $html   = "<b>";
  my $seqPos = 0;
  foreach my $span ( @{$spans} ) {
    my $start = $span->[ 0 ] - $alignedStart;
    my $end   = $span->[ 1 ] - $alignedStart;
    next if ( $start < 0 && $end < 0 );
    $start = 0 if ( $start < 0 );
    $end = $seqLen - $start - 1
        if ( $end < 0 );

    # Unmarked stretch preceding the span, then the span itself
    $html .= substr( $alignedSeq, $seqPos, $start - $seqPos );
    $html .= "<font class=\"" . $span->[ 2 ] . "\">";
    $html .= substr( $alignedSeq, $start, $end - $start + 1 );
    $html .= "</font>";
    $seqPos = $end + 1;
  }

  # Everything after the last span, including a single trailing residue
  if ( $seqPos < $seqLen ) {
    $html .= substr( $alignedSeq, $seqPos );
  }
  $html .= "</b>";
  return $html;
}

##
## getLongestFlankingExtension
##
##   Use: my ( $leftSeq, $leftID, $rightSeq, $rightID ) =
##            getLongestFlankingExtension( multAln => $multAlignRef );
##
##   Among the aligned sequences which reach the first position of the
##   reference find the one with the longest left flanking sequence, and
##   among those which reach the end of the reference the one with the
##   longest right flanking sequence.  Ties keep the earliest sequence.
##
##   The returned IDs are the aligned sequence index minus one ( -2 when
##   no sequence qualified ).
##
## TODO: Should we consider the reference
##       when looking for the longest flanking extension?
sub getLongestFlankingExtension {
  my %args = @_;

  my $aln    = $args{'multAln'};
  my $refEnd = length( $aln->getReferenceSeq() );

  my ( $bestLeftSeq,  $bestLeftIdx )  = ( "", -1 );
  my ( $bestRightSeq, $bestRightIdx ) = ( "", -1 );

  my $lastIdx = $aln->getNumAlignedSeqs() - 1;
  for my $idx ( 0 .. $lastIdx ) {

    # Left side: only sequences aligned from the very first position
    if ( $aln->getAlignedStart( $idx ) == 0 ) {
      my $leftFlank = $aln->getLeftFlankingSequence( $idx );
      if ( length( $leftFlank ) > length( $bestLeftSeq ) ) {
        ( $bestLeftSeq, $bestLeftIdx ) = ( $leftFlank, $idx );
      }
    }

    # Right side: only sequences aligned through the end of the reference
    if ( $aln->getAlignedEnd( $idx ) == $refEnd ) {
      my $rightFlank = $aln->getRightFlankingSequence( $idx );
      if ( length( $rightFlank ) > length( $bestRightSeq ) ) {
        ( $bestRightSeq, $bestRightIdx ) = ( $rightFlank, $idx );
      }
    }
  }

  ## TODO: The IDs have been adjusted to reflect the number in the Search
  ##       result collection....should adjust this back after we move
  ##       this routine into an object.
  my $leftID  = $bestLeftIdx - 1;
  my $rightID = $bestRightIdx - 1;

  if ( $DEBUG > 8 ) {
    print $CLASS
        . "::getLongestFlankingExtension: Returned ($bestLeftSeq, "
        . $leftID
        . ", $bestRightSeq, "
        . $rightID . "\n";
  }

  return ( $bestLeftSeq, $leftID, $bestRightSeq, $rightID );
}

##-------------------------------------------------------------------------##
## Use: my $columnCons = resolveLowQualityBlocks( multAln => $mAlign );
##
##  Returns
##
##    A reference to an array of hashes, one for each low scoring block
##    of the multiple alignment for which a new consensus was called:
##
##        {
##          'start' => first column of the block,
##          'end'   => last column of the block,
##          'cons'  => replacement consensus sequence for the block
##        }
##
##    The column ranges come from $mAlign->getLowScoringAlignmentColumns().
##    Only ranges 2 to 50 columns wide that hold at least four instance
##    sequences are examined.  A block is left alone when the most common
##    instance length equals the ungapped length of the current consensus.
##    Otherwise:
##
##      - if more than half of the instances ( and at least 3 ) share the
##        most common length, the new consensus is built directly from
##        those instances;
##
##      - if not, the instances are aligned all-vs-all, the instance with
##        the highest summed score is used as the reference of a new
##        multiple alignment, and its consensus ( gaps removed ) is used.
##
##    Croaks if the multAln parameter is missing.
##-------------------------------------------------------------------------##
sub resolveLowQualityBlocks {
  my %parameters = @_;

  croak $CLASS. "::resolveLowQualityBlocks: multAln parameter is missing!\n"
      if ( !defined $parameters{'multAln'} );
  my $mAlign = $parameters{'multAln'};

  # Widest block ( in alignment columns ) that is re-aligned here.  Both
  # the processing branch and the "too big" warning below use this value
  # so that every skipped oversize block is reported when debugging.
  my $maxBlockWidth = 50;

  my $matrix = SequenceSimilarityMatrix->new();
  $matrix->parseFromFile(
                    "$FindBin::RealBin/Matrices/wublast/nt/comparison.matrix" );
  my $lupMatrix = SequenceSimilarityMatrix->new();
  $lupMatrix->parseFromFile( "$FindBin::RealBin/Matrices/linupmatrix" );

  ## TODO: pass in penalties
  # The call also returns a score array as a second value; it is not
  # needed here.
  my ( $columns ) =
      $mAlign->getLowScoringAlignmentColumns( matrix => $matrix );

  my @columnCons = ();
  foreach my $col ( @{$columns} ) {
    my $blockWidth = $col->[ 1 ] - $col->[ 0 ] + 1;

    if ( $blockWidth > 1 && $blockWidth <= $maxBlockWidth ) {
      my ( $refSeq, $instSeqs ) = $mAlign->getAlignmentBlock(
                                                           start => $col->[ 0 ],
                                                           end   => $col->[ 1 ],
                                                           rawSequences => 1
      );

      # Length of the consensus over this block, not counting gap characters
      my $refSeqLen = length( $refSeq ) - ( $refSeq =~ tr/-/-/ );
      print "Considering column with cons: $refSeq [ bases = $refSeqLen ]\n" if ( $DEBUG > 8 );

      # Only consider blocks with 4 or more sequences ( last index >= 3 ).
      # NOTE(review): this comment used to say "3 or more"; the test has
      #               always required four -- confirm which was intended.
      if ( $#{$instSeqs} >= 3 ) {

        # Determine the most frequent length sequence
        my %seqLengthHisto = ();
        foreach my $seq ( @{$instSeqs} ) {
          print "  S: " . $seq . "\n" if ( $DEBUG > 8 );
          $seqLengthHisto{ length( $seq ) }++;
        }

        # RMH: 9/19/23 This wasn't deterministic.  If two lengths
        #      had equal counts it randomly chose one or the other.
        #      Now it always chooses the longest one first.
        my @keysByLenCountThenLen =
            sort { $seqLengthHisto{$b} <=> $seqLengthHisto{$a}
                   || $b <=> $a }
            keys( %seqLengthHisto );
        my $mostFreqLen   = shift @keysByLenCountThenLen;
        my $mostFreqCount = $seqLengthHisto{$mostFreqLen};

        # Check to see if it's the same length as the reference/consensus.
        # If it is, the block is left untouched.
        print("mostFreqLen = $mostFreqLen, refSeq = $refSeq len=$refSeqLen\n") if ( $DEBUG > 8 );
        if ( $mostFreqLen != $refSeqLen ) {

          print "  - Block mostfreqLen = $mostFreqLen occurs in "
              . "$mostFreqCount out of "
              . ( $#{$instSeqs} + 1 )
              . " sequences.\n"
              if ( $DEBUG > 8 );
          if (    $mostFreqCount > ( .5 * ( $#{$instSeqs} + 1 ) )
               && $mostFreqCount >= 3 )
          {

            # There is a much better choice here: a clear majority of
            # the instances agree on one length, use only those.
            my @newConsSeqs = ();
            foreach my $seq ( @{$instSeqs} ) {
              if ( length( $seq ) == $mostFreqLen ) {
                push @newConsSeqs, $seq;
              }
            }

            # create a new consensus
            # Lineupmatrix is now the hardcoded default.  No need to pass
            # it in here.
            my $newCons =
                MultAln::buildConsensusFromArray( sequences => \@newConsSeqs );
            print "  -- Made a call! newCons = $newCons\n" if ( $DEBUG > 8 );
            push @columnCons,
                {
                  'start' => $col->[ 0 ],
                  'end'   => $col->[ 1 ],
                  'cons'  => $newCons
                };

          }
          else {
            print "  -- Aligning block....\n" if ( $DEBUG > 8 );

            ## Perform an all vs all of the sequences
            my %seqScore = ();
            my @results  = ();
            for ( my $i = 0 ; $i <= $#{$instSeqs} ; $i++ ) {
              for ( my $j = 0 ; $j <= $#{$instSeqs} ; $j++ ) {
                next if ( $j == $i );
                my $searchResult = NeedlemanWunschGotohAlgorithm::search(
                  querySeq       => $instSeqs->[ $i ],
                  subjectSeq     => $instSeqs->[ $j ],
                  matrix         => $lupMatrix,
                  insOpenPenalty => -25,
                  insExtPenalty  => -5,
                  delOpenPenalty => -25,
                  delExtPenalty  => -5
                );

                $seqScore{$i} += $searchResult->getScore();
                push @{ $results[ $i ] }, $searchResult;
              }
            }

            # RMH: 9/19/23 Added secondary sort to ensure deterministic
            #      behaviour.  The tie-break compares the instance indices
            #      as strings.
            my @instSeqSortedByScore =
                sort { $seqScore{$b} <=> $seqScore{$a}
                       || $a cmp $b }
                keys( %seqScore );
            my $maxScoreQueryIdx = shift @instSeqSortedByScore;
            my $maxScore         = $seqScore{$maxScoreQueryIdx};

            # Collect the alignments of the best scoring instance against
            # all the others.
            print "---------------Alignments for col-----------------\n"
                if ( $DEBUG > 8 );
            my $src = SearchResultCollection->new();
            print "Score = $maxScore\n" if ( $DEBUG  > 8);
            foreach my $align ( @{ $results[ $maxScoreQueryIdx ] } ) {
              print ""
                  . $align->toStringFormatted( SearchResult::AlignWithQuerySeq )
                  . "\n"
                  if ( $DEBUG > 8 );
              $src->add( $align );
            }

            my $ma = MultAln->new( searchCollection          => $src,
                                   searchCollectionReference => MultAln::Query
            );

            my $newCons = $ma->consensus();

            print "The new block consensus = $newCons\n" if ( $DEBUG > 8 );
            print "--------------------------------------------------\n"
                if ( $DEBUG > 8 );

            # The block consensus is stored without gap characters
            $newCons =~ s/-//g;

            push @columnCons,
                {
                  'start' => $col->[ 0 ],
                  'end'   => $col->[ 1 ],
                  'cons'  => $newCons
                };
            undef $ma;
            undef $src;

          }
        }
      }
    }
    elsif ( $blockWidth > $maxBlockWidth && $DEBUG > 8 ) {
      warn $CLASS
          . "::resolveLowQualityBlocks(): Skipping low quality block "
          . "because it is too big to align with perl. blockWidth=$blockWidth\n";
    }
  }

  return ( \@columnCons );

}

##-------------------------------------------------------------------------##
## Use: my $bestID = findHighestScoringAlignmentSet(
##                                      useSubjectAsRef  => scalar,
##                                      searchCollection => ref,
##                                    );
##
##  Returns
##
##    Given a set of All-vs-All alignments find the set of hits against
##    one sequence which scores the highest.  In this case score is the
##    sum of all the individual alignment scores.  Prior to summing the
##    scores:
##
##      - alignments of a sequence to itself are dropped, and
##
##      - an alignment is dropped when a higher ranked alignment ( by
##        score, then start, then length ) between the same two sequences
##        on the same strand overlaps it such that less than 20%
##        ( 100 - masklevel of 80 ) of its reference range lies outside
##        the higher ranked alignment.
##
##    The ID of the highest scoring reference sequence is returned.  Ties
##    are resolved by string comparison of the IDs.
##
##    If the collection contains no non-self alignments an empty string
##    is returned and the collection is left as it was passed in.
##
##    useSubjectAsRef: when defined ( whatever its value ) the subject
##    sequences are treated as the reference and the query sequences as
##    the instances.  The default is the other way around.
##
##    Modifies searchCollection: afterwards it only holds the retained
##    alignments of the highest scoring reference sequence.
##
##    Croaks if the searchCollection parameter is missing.
##
##-------------------------------------------------------------------------##
sub findHighestScoringAlignmentSet {
  my %parameters = @_;

  croak $CLASS
      . "::findHighestScoringAlignmentSet: searchCollection parameter "
      . "is missing!\n"
      if ( !defined $parameters{'searchCollection'} );

  # Which sequence (Query/Subject) represents the reference
  # sequence and which one represents the instance sequences.
  # These are method names invoked on each search result.
  my $refStart  = "getQueryStart";
  my $refEnd    = "getQueryEnd";
  my $refName   = "getQueryName";
  my $instStart = "getSubjStart";
  my $instEnd   = "getSubjEnd";
  my $instName  = "getSubjName";
  if ( defined $parameters{'useSubjectAsRef'} ) {
    $refStart  = "getSubjStart";
    $refEnd    = "getSubjEnd";
    $refName   = "getSubjName";
    $instStart = "getQueryStart";
    $instEnd   = "getQueryEnd";
    $instName  = "getQueryName";
  }

  print "findHighestScoringAlignmentSet:\n" if ( $DEBUG > 8 );

  my $searchCollection = $parameters{'searchCollection'};
  my $highestScoringElement;

  # Deal with multiple instances.  Only keep ones which can
  # be seen as part of a global alignment
  #
  #   %elements       : refID -> strand -> instID -> [ hit records ]
  #                     ( reused further down as refID -> summed score )
  #   %dbl_mask_level : refID -> instID -> [ hit records with both the
  #                     reference and the instance ranges ]
  #   %deleteHash     : collection indices scheduled for removal
  my %elements       = ();
  my %deleteHash     = ();
  my %dbl_mask_level = ();

  for ( my $idx = $searchCollection->size() - 1 ; $idx >= 0 ; $idx-- ) {
    my $resultRef = $searchCollection->get( $idx );

    my $refID  = $resultRef->$refName();
    my $instID = $resultRef->$instName();

    # Self alignments are never part of the result
    if ( $refID eq $instID ) {
      $deleteHash{$idx} = 1;
      next;
    }

    push @{ $dbl_mask_level{$refID}->{$instID} },
        {
          'ref_start'  => $resultRef->$refStart(),
          'ref_end'    => $resultRef->$refEnd(),
          'inst_start' => $resultRef->$instStart(),
          'inst_end'   => $resultRef->$instEnd(),
          'score'      => $resultRef->getScore(),
          'index'      => $idx
        };

    # An orientation of "C" marks a reverse strand hit
    my $strand =
        ( $resultRef->getOrientation() eq "C" ) ? 'reverse' : 'forward';
    push @{ $elements{$refID}->{$strand}->{$instID} },
        {
          'start' => $resultRef->$refStart(),
          'end'   => $resultRef->$refEnd(),
          'score' => $resultRef->getScore(),
          'index' => $idx
        };
  }

  # 11/21/24: RMH - No non-self alignments
  if ( !keys %elements ) {
    return ( "" );
  }

  my $maskLevel = 80;

  # The double masklevel filter is experimental and switched off.
  my $useDoubleMaskLevel = 0;

  if ( !$useDoubleMaskLevel ) {

    # Single masklevel ( reference ranges only, per strand )
    foreach my $seqID ( keys( %elements ) ) {

      print "Looking for overlaps in set $seqID\n" if ( $DEBUG > 8 );

      foreach my $strand ( 'forward', 'reverse' ) {
        my $strandRec = $elements{$seqID}->{$strand};
        foreach my $instanceName ( keys( %{$strandRec} ) ) {

          # Only consider elements with more than one match
          next unless ( $#{ $strandRec->{$instanceName} } > 0 );

          # Sort by score
          # RMH: 9/19/23
          #   - Added secondary sorts start, and length to
          #     make sure this deterministic.
          my @sortedResults =
              sort {
                     $b->{'score'} <=> $a->{'score'}
                  || $a->{'start'} <=> $b->{'start'}
                  || ( $b->{'end'} - $b->{'start'} )
                  <=> ( $a->{'end'} - $a->{'start'} )
              } @{ $strandRec->{$instanceName} };

          print "     - $instanceName appears "
              . ( $#sortedResults + 1 )
              . " times.\n"
              if ( $DEBUG > 8 );
          for ( my $i = 0 ; $i <= $#sortedResults ; $i++ ) {
            for ( my $j = $i + 1 ; $j <= $#sortedResults ; $j++ ) {

              # $result1 is ranked above $result2
              my $result1 = $sortedResults[ $i ];
              my $result2 = $sortedResults[ $j ];

              # Get members
              my $begin1 = $result1->{'start'};
              my $begin2 = $result2->{'start'};
              my $end1   = $result1->{'end'};
              my $end2   = $result2->{'end'};

              # Check if they overlap
              next if ( $begin2 > $end1 || $begin1 > $end2 );

              print "OVERLAP: $begin1-$end1 and $begin2-$end2\n"
                  if ( $DEBUG > 8 );

              # Number of positions of $result2 which stick out of
              # $result1 on either side, and what percentage of
              # $result2 that is.
              my $uncovered = 0;
              $uncovered = $begin1 - $begin2 if ( $begin2 < $begin1 );
              $uncovered += $end2 - $end1 if ( $end2 > $end1 );
              my $perc = 0;
              $perc = ( $uncovered / ( $end2 - $begin2 + 1 ) ) * 100
                  if ( $uncovered );

              # Too little of the lower ranked hit is unique: drop it
              if ( $perc < ( 100 - $maskLevel ) ) {
                print "Scheduling deletion for " . $result2->{'index'} . "\n"
                    if ( $DEBUG > 8 );
                $deleteHash{ $result2->{'index'} } = 1;
              }
            }
          }
        }
      }
    }
  }
  else {

    # Experimental double masklevel ( reference and instance ranges,
    # strands are not separated )
    foreach my $seqID ( keys( %dbl_mask_level ) ) {
      print "Looking for overlaps in set $seqID\n" if ( $DEBUG > 8 );
      my $pairRec = $dbl_mask_level{$seqID};
      foreach my $instanceName ( keys( %{$pairRec} ) ) {
        next unless ( $#{ $pairRec->{$instanceName} } > 0 );

        # Sort by score
        # RMH: 9/19/23
        #   - Added secondary sorts start, and length to
        #     make sure this deterministic.
        # The records of this branch carry 'ref_start'/'ref_end' rather
        # than 'start'/'end'.
        my @sortedResults =
            sort {
                   $b->{'score'} <=> $a->{'score'}
                || $a->{'ref_start'} <=> $b->{'ref_start'}
                || ( $b->{'ref_end'} - $b->{'ref_start'} )
                <=> ( $a->{'ref_end'} - $a->{'ref_start'} )
            } @{ $pairRec->{$instanceName} };

        print "    - $instanceName appears "
            . ( $#sortedResults + 1 )
            . " times.\n"
            if ( $DEBUG > 8 );
        for ( my $i = 0 ; $i <= $#sortedResults ; $i++ ) {
          for ( my $j = $i + 1 ; $j <= $#sortedResults ; $j++ ) {
            my $result1 = $sortedResults[ $i ];
            my $result2 = $sortedResults[ $j ];

            # Get members
            my $rbegin1 = $result1->{'ref_start'};
            my $rbegin2 = $result2->{'ref_start'};
            my $rend1   = $result1->{'ref_end'};
            my $rend2   = $result2->{'ref_end'};
            my $ibegin1 = $result1->{'inst_start'};
            my $ibegin2 = $result2->{'inst_start'};
            my $iend1   = $result1->{'inst_end'};
            my $iend2   = $result2->{'inst_end'};

            print "      o  $rbegin1-$rend1 and $rbegin2-$rend2, $ibegin1-$iend1 and $ibegin2-$iend2\n" if ( $DEBUG > 8 );

            # Calc overlap on the reference
            my $roverlap = 0;
            unless ( $rbegin1 > $rend2 || $rend1 < $rbegin2 ) {
              $roverlap = min( $rend1, $rend2 ) - max( $rbegin1, $rbegin2 ) + 1;
            }
            my $rperc = 0;
            $rperc = ( $roverlap / ( $rend2 - $rbegin2 + 1 ) ) * 100
                if ( $roverlap );

            # Calc overlap on the instance
            my $ioverlap = 0;
            unless ( $ibegin1 > $iend2 || $iend1 < $ibegin2 ) {
              $ioverlap = min( $iend1, $iend2 ) - max( $ibegin1, $ibegin2 ) + 1;
            }
            my $iperc = 0;
            $iperc = ( $ioverlap / ( $iend2 - $ibegin2 + 1 ) ) * 100
                if ( $ioverlap );

            print "        rperc = $rperc / $roverlap, iperc = $iperc\n" if ( $DEBUG > 8 );

            if ( $rperc > $maskLevel || $iperc > $maskLevel ) {
              print "        Scheduling deletion for " . $result2->{'index'} . "\n"
                  if ( $DEBUG > 8 );
              $deleteHash{ $result2->{'index'} } = 1;
            }
          }
        }
      }
    }
  }

  # Both bookkeeping structures are done with.  %elements must be empty
  # here whichever filter ran, as it is reused below to sum the scores.
  undef %elements;
  undef %dbl_mask_level;

  # Remove all hits which were filtered above.  Highest index first so
  # that the remaining indices stay valid.
  if ( keys( %deleteHash ) ) {
    foreach my $index ( sort { $b <=> $a } keys( %deleteHash ) ) {
      $searchCollection->remove( $index );
    }
  }

  # Sum the scores of the surviving hits per reference sequence
  my %results = ();
  for ( my $idx = 0 ; $idx < $searchCollection->size() ; $idx++ ) {
    my $resultRef = $searchCollection->get( $idx );

    next if ( $resultRef->$instName() eq $resultRef->$refName() );

    print "Considering a hit: "
        . $resultRef->$refName() . " to "
        . $resultRef->$instName() . "\n"
        if ( $DEBUG > 8 );

    my $refID = $resultRef->$refName();
    $elements{$refID} += $resultRef->getScore();

    # DISABLE
    #if ( $DEBUG ) {
    # push @{ $results{ $resultRef->$refName() } }, $resultRef;
    #}
  }

  if ( keys %elements ) {

    # RMH: 9/19/23 added secondary sort after total score use
    #      the ID itself.
    my @sortedElementKeys =
        sort { $elements{$b} <=> $elements{$a}
               || $a cmp $b }
        keys( %elements );

    if ( $DEBUG > 8 ) {
      foreach my $key ( @sortedElementKeys ) {
        print "Results for $key ( score = " . $elements{$key} . " )\n";

        # %results is only filled by the disabled code above
        foreach my $result ( @{ $results{$key} } ) {
          print "" . $result->toStringFormatted( SearchResult::NoAlign ) . "";
        }
      }
    }

    $highestScoringElement = shift @sortedElementKeys;
    print "Highest scoring element vs family is $highestScoringElement with "
        . "a score of "
        . $elements{$highestScoringElement} . "\n"
        if ( $DEBUG > 8 );

    # Keep only the alignments against the highest scoring
    # element.
    for ( my $idx = $searchCollection->size() - 1 ; $idx >= 0 ; $idx-- ) {
      my $resultRef = $searchCollection->get( $idx );
      my $refID     = $resultRef->$refName();
      my $instID    = $resultRef->$instName();
      if (
           $refID ne $highestScoringElement
           || (    $refID eq $highestScoringElement
                && $instID eq $highestScoringElement )
          )
      {
        $searchCollection->remove( $idx );
      }
    }
  }
  else {
    $searchCollection->clear();
  }

  return ( $highestScoringElement );
}

##-------------------------------------------------------------------------##
## Use:  my ( $tempDir ) = createTempDir( \@tmpPath );
##
##  Returns
##
##    The path of a writable directory named "RM_<pid>.<timestamp>"
##    inside the first directory of @tmpPath where such a directory
##    already exists or can be created, and where a scratch file can be
##    written.  The directories are tried in the order given.  An empty
##    string is returned if none of them is usable.
##
##    The timestamp is the localtime() string with spaces, commas, tabs,
##    newlines and colons removed.
##
##-------------------------------------------------------------------------##
sub createTempDir {
  my $tmpPathRef = shift;

  ## Get date
  my $date = localtime( time() );

  # Windows does not support the use of ":" in a filename.
  $date =~ s/[ ,\t\n:]//g;

  my $runnumber = "$$" . ".$date";
  my $tempDir   = "";
  foreach my $directory ( @{$tmpPathRef} ) {

    # Join with exactly one "/" between the directory and the run name
    my $candidate = $directory;
    $candidate .= "/" unless ( $candidate =~ /\/$/ );
    $candidate .= "RM_$runnumber";

    next unless ( -d $candidate || mkdir( $candidate, 0777 ) );

    # Make sure we can really write in there.  A lexical filehandle and
    # three argument open are used so that no global handle of the
    # caller is disturbed and the path is never parsed for a mode.
    if ( open( my $probeFH, '>', "$candidate/deleteMe" ) ) {
      close( $probeFH );
      unlink( "$candidate/deleteMe" );
      $tempDir = $candidate;
      last;
    }
  }
  return ( $tempDir );
}

##-------------------------------------------------------------------------##
## Use: my $string = elapsedTime( $index );
##
##   Returns
##
##      Great little utility for measuring the elapsed
##      time between one or more lines of perl code.
##
##      The first call for a given $index starts that timer and
##      returns 0.  Every later call returns the time since the
##      previous call for the same $index as the string
##      "<hours>:<minutes>:<seconds> Elapsed Time" ( numbers are not
##      zero padded ) and restarts the timer.
##
##      --- Depends on the global array @TimeBefore, which holds the
##          time of the last call for each $index.
##-------------------------------------------------------------------------##
sub elapsedTime {
  my ( $TimeHistIdx ) = @_;

  # Read the clock once, so that the moment used to close this interval
  # is exactly the moment the next interval starts from.
  my $now      = time;
  my $previous = $TimeBefore[ $TimeHistIdx ];
  $TimeBefore[ $TimeHistIdx ] = $now;

  # First call for this timer: nothing to report yet
  return 0 if ( !defined $previous );

  my $DiffTime = $now - $previous;
  my $Min      = int( $DiffTime / 60 );
  $DiffTime -= $Min * 60;
  my $Hours = int( $Min / 60 );
  $Min -= $Hours * 60;
  my $Sec = $DiffTime;
  return "$Hours:$Min:$Sec Elapsed Time";
}

##-------------------------------------------------------------------------##
## Use: my $larger = max( $first, $second );
##
##   Returns
##
##      The numerically larger of the two arguments.  When both
##      compare equal the first argument is returned.
##-------------------------------------------------------------------------##
sub max {
  my ( $first, $second ) = @_;
  return ( $first >= $second ) ? $first : $second;
}

##-------------------------------------------------------------------------##
## Use: my $smaller = min( $first, $second );
##
##   Returns
##
##      The numerically smaller of the two arguments.  When both
##      compare equal the first argument is returned.
##-------------------------------------------------------------------------##
sub min {
  my ( $first, $second ) = @_;
  return ( $first <= $second ) ? $first : $second;
}


1;