deduplicate_bismark

#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long;
use Cwd;

### This script is supposed to remove alignments to the same position in the genome which can arise by e.g. PCR amplification
### Paired-end alignments are considered a duplicate if both partner reads start and end at the exact same position

my $dedup_version = 'v0.24.2';
my @filenames;

my ($single,$paired,$global_single,$global_paired,$samtools_path,$bam,$rrbs,$multiple,$output_dir,$outfile,$parallel) = process_commandline();

# Number of cores used for samtools view --threads. According to tests,
# additional cores are simply not worth the while  (https://github.com/FelixKrueger/Bismark/issues/364)
$parallel = 1;

my $user_defined_outfile = 0;
if (defined $outfile){
	++$user_defined_outfile;
}

###########################
### START OF PROCESSING ###
###########################

if($rrbs){
    warn "\nIf the input is a multiplexed sample with several alignments to a single position in the genome, only alignments with a unique barcode will be chosen)\n\n";
    sleep (1);
}
else{ # default; random (=first) alignment
    # multiple input files are treated as one single big BAM file when merging the data is required
    if ($multiple){
		warn "Multiple Input files for the same sample selected - All input files are treated as one big single file. The files to be used are:\n";
		warn join ("\n",@filenames),"\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n"; # sleep(1);
    }
    
    warn "\nIf there are several alignments to a single position in the genome the first alignment will be chosen. Since the input files are not in any way sorted this is a near-enough random selection of reads.\n\n";
    # sleep (1);
}

foreach my $file (@filenames){
	
    if ($global_single){
		$paired = 0;
		$single = 1;
    }
    elsif($global_paired){
		$paired = 1;
		$single = 0;
    }

    # Testing if the file appears to be truncated, in which case we bail with a big scary warning message
    if ($file =~ /(\.bam$)/){
		bam_isTruncated($file);
    }
    
    unless($global_single or $global_paired){
	
		warn "Trying to determine the type of mapping from the SAM header line\n"; # sleep(1);
	
		### if the user did not specify whether the alignment file was single-end or paired-end we are trying to get this information from the @PG header line in the SAM/BAM file
		if ($file =~ /\.gz$/){
			open (DETERMINE,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n";
		}
		elsif ($file =~ /\.bam$/){
			open (DETERMINE,"$samtools_path view -h $file |") or die "Unable to read from BAM file $file: $!\n";
		}
		else{
			open (DETERMINE,$file) or die "Unable to read from $file: $!\n";
		}
		while (<DETERMINE>){
			last unless (/^\@/);
			if ($_ =~ /^\@PG/){
				# warn "Found a \@PG line:\n";
				# warn "$_";
				
				if ($_ =~ /ID:Bismark/){
					# warn "This is the Bismark command we are after. Extracting the library type...\n";
				}
				else{
					# warn "This isn't a the Bismark \@PG line. Moving on...\n"; sleep(1);
					next;
				}

				if ($_ =~ /\s+--?1\s+/ and $_ =~ /\s+--?2\s+/){ # allowing -1 and -2 or --1 and --2
					warn "Treating file as paired-end data (extracted from \@PG line)\n"; # sleep(1);
					$paired = 1;
					$single = 0;
				}
				else{
					warn "Treating file as single-end data (extracted from \@PG line)\n"; # sleep(1);
					$paired = 0;
					$single = 1;
				}
			}
		}
		close DETERMINE or warn "$!\n";
	}
    
    if ($file =~ /(\.bam$|\.sam$)/){
		bam_isEmpty($file);
    }
	
	### OPTIONS
    unless ($single or $paired){
		die "Please specify either -s (single-end) or -p (paired-end) for deduplication, or provide a SAM/BAM file that contains the \@PG header line\n\n";
    }

    ### 
	if ($paired){
		test_positional_sorting($file);
	}
    

    ### writing to a report file
    my $report;
	
	if (defined $outfile){ 
		$report = $outfile;	
	}
	else{
		$report = $file;
	}

    # now would be a good time to get rid of the path information
    $report =~ s/.*\///; # removing path information
    # warn "Now: $report\n";
    $report =~ s/\.gz$//;
    $report =~ s/\.sam$//;
    $report =~ s/\.bam$//;
    $report =~ s/\.txt$//;
    $report =~ s/$/.deduplication_report.txt/;

    if ($multiple){
		$report =~ s/deduplication/multiple.deduplication/;
    }
  
    open (REPORT,'>',"${output_dir}$report") or die "Failed to write to report file to ${output_dir}$report: $!\n\n";


    if($rrbs){
		deduplicate_barcoded_rrbs($file);
    }

    ### as the default option we simply write out the first read for a position and discard all others. This is the fastest option
    else{

		my $header_printed = 0;
		my %unique_seqs;
		my %positions;
	
		if ($multiple){
			# we are assuming that all files are either in BAM format
			if ($file =~ /\.bam$/){
				open (IN,"$samtools_path cat -h $file @filenames | $samtools_path view -h |") or die "Unable to read from BAM files in '@filenames': $!\n";
			}
			# or all in SAM format
			else{ # if the reads are in SAM format we simply concatenate them
				open (IN,"cat @filenames |") or die "Unable to read from several filenames '@filenames': $!\n";
			}
			
		}
		else{
			if ($file =~ /\.gz$/){
				open (IN,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n";
			}
			elsif ($file =~ /\.bam$/){
				# warn "Launching $samtools_path view -h with >>$parallel<< cores\n\n"; sleep(1);
				open (IN,"$samtools_path view -h --threads $parallel $file |") or die "Unable to read from BAM file $file: $!\n";
			}
			else{
				open (IN,$file) or die "Unable to read from $file: $!\n";
			}
		}
	
		if ($user_defined_outfile){ # user-defined output filename
			# using that name. $outfile is already given
		}
		else{ # deriving the name from the input file automatically (default)
			$outfile = $file;
		}
		
		# now would be a good time to get rid of the path information
		$outfile =~ s/.*\///; # removing path information

		$outfile =~ s/\.gz$//;
		$outfile =~ s/\.sam$//;
		$outfile =~ s/\.bam$//;
		$outfile =~ s/\.txt$//;

	    if ($bam == 1){
			$outfile =~ s/$/.deduplicated.bam/;
	    }
	    elsif ($bam == 2){
			$outfile =~ s/$/.deduplicated.sam.gz/;
	    }
	    else{
			$outfile =~ s/$/.deduplicated.sam/;
	    }
	
		if ($multiple){
			$outfile =~ s/deduplicated/multiple.deduplicated/;
		}

		warn "Output file is: $outfile\n\n";

		if ($bam == 1){
			# Using $parallel threads. Currently set to 1, as more cores don't seem to help (see above)
			open (OUT,"| $samtools_path view -bS --threads $parallel - > ${output_dir}$outfile") or die "Failed to write to ${output_dir}$outfile: $!\n";
		}
		elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
			open (OUT,"| gzip -c - > ${output_dir}$outfile") or die "Failed to write to ${output_dir}$outfile: $!\n";
		}
		else{
			open (OUT,'>',"${output_dir}$outfile") or die "Unable to write to ${output_dir}$outfile: $!\n";
		}

		my $count = 0;
		my $unique_seqs = 0;
		my $removed = 0;

		while (<IN>){

			if ($count == 0){
				if ($_ =~ /^Bismark version:/){
					warn "The file appears to be in the custom Bismark and not SAM/BAM format, which is no longer supported. Please see option --vanilla in a version of Bismark up to v0.20.1\n\n";
					print_helpfile();
					exit;
				}
			}

			### if this was a SAM file we ignore header lines
			if (/^\@\w{2}\s/){
				print "skipping header line:\t$_";
			
				if ($multiple){ # the header gets set with samtools cat -h in this case
					unless($header_printed){
						print OUT "$_";
					}	    
				}
				else{
					# Printing the header lines again into the de-duplicated file
					print OUT "$_";
				}
				next;
			}
		
			# we can assume that once we reach sequence entries we have passed the header section of the first file
			++$header_printed;
			++$count;
			my $composite; # storing positional data. For single end data we are only using the start coordinate since the end might have been trimmed to different lengths

			my ($strand,$chr,$start,$end,$cigar);
			my $line1;

			($strand,$chr,$start,$cigar) = (split (/\t/))[1,2,3,5]; # we are assigning the FLAG value to $strand

			my $read_conversion;
			my $genome_conversion;

			while ( /(XR|XG):Z:([^\t]+)/g ) {
				my $tag = $1;
				my $value = $2;
				
				if ($tag eq "XR") {
					$read_conversion = $value;
					$read_conversion =~ s/\r//;
					chomp $read_conversion;
				} 
				elsif ($tag eq "XG") {
					$genome_conversion = $value;
					$genome_conversion =~ s/\r//;
					chomp $genome_conversion;
				}
			}
			die "Failed to determine read and genome conversion from line: $line1\n\n" unless ($read_conversion and $genome_conversion);
			
			# Overwriting the FLAG in the strand with more detailed strand identity
			my $index;
			if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
				$index = 0;
				# $strand = '+'; old
				$strand = 'OT';
			} elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
				$index = 1;
				# $strand = '-'; old
				$strand = 'CTOT';
			} elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
				$index = 2;
				# $strand = '+'; old
				$strand = 'CTOB';
			} elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
				$index = 3;
				# $strand = '-';
				$strand = 'OB';
			} else {
				die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
			}
			
			### SAM single-end
			if ($single){
				if ($index == 0 or $index == 2 ){
					### read aligned to the forward strand. No action needed
				}
				else{
					### read is on reverse strand
				
					$start -= 1; # only need to adjust this once
				
					# for InDel free matches we can simply use the M number in the CIGAR string
					if ($cigar =~ /^(\d+)M$/){ # linear match
						$start += $1;
					}
					else{
						# parsing CIGAR string
						my @len = split (/\D+/,$cigar); # storing the length per operation
						my @ops = split (/\d+/,$cigar); # storing the operation
						shift @ops; # remove the empty first element
						die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);

						# 	warn "CIGAR string; $cigar\n";
						### determining end position of a read
						foreach my $index(0..$#len){
							if ($ops[$index] eq 'M'){  # standard matching bases
								$start += $len[$index];
								# warn "Operation is 'M', adding $len[$index] bp\n";
							}
							elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
								# warn "Operation is 'I', next\n";
							}
							elsif($ops[$index] eq 'D'){ # deletions do affect the end position
								#  warn "Operation is 'D',adding $len[$index] bp\n";
								$start += $len[$index];
							}
							elsif($ops[$index] eq 'N'){ # skipped regions (splicing) do affect the end position
								# warn "Operation is 'N',adding $len[$index] bp\n";
								$start += $len[$index];
							}
							elsif($ops[$index] eq 'S'){ # soft-clipped bases do not affect the end position
								# warn "Operation is 'S', next\n";
							}
							else{
								die "SE BAM: Found CIGAR operations other than M, I, D, S or N: '$ops[$index]'. Not allowed at the moment\n";
							}
						}
					}
				}   
				$composite = join (":",$strand,$chr,$start);
				# warn "Adding the following string:\t$composite\n";    
			}
			elsif($paired){

				### storing the current line
				$line1 = $_;
				
				# if the read aligns in forward orientation we can certainly use the start position of read 1, and only need to work out the end position of read 2	
				if ($index == 0 or $index == 2){
				
					### reading in the next line
					$_ = <IN>;
					# the only thing we need is the end position
					($end,my $cigar_2) = (split (/\t/))[3,5];

					$end -= 1; # only need to adjust this once
					
					# for InDel free matches we can simply use the M number in the CIGAR string
					if ($cigar_2 =~ /^(\d+)M$/){ # linear match
						$end += $1;
					}
					else{
						# parsing CIGAR string
						my @len = split (/\D+/,$cigar_2); # storing the length per operation
						my @ops = split (/\d+/,$cigar_2); # storing the operation
						shift @ops; # remove the empty first element
						die "CIGAR string contained a non-matching number of lengths and operations ($cigar_2)\n" unless (scalar @len == scalar @ops);
						
						# warn "CIGAR string: $cigar_2\n";
						### determining end position of the read
						foreach my $index(0..$#len){
							if ($ops[$index] eq 'M'){  # standard matching bases
								$end += $len[$index];
								# warn "Operation is 'M', adding $len[$index] bp\n";
							}
							elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
								# warn "Operation is 'I', next\n";
							}
							elsif($ops[$index] eq 'D'){ # deletions do affect the end position
								#  warn "Operation is 'D',adding $len[$index] bp\n";
								$end += $len[$index];
							}
							elsif($ops[$index] eq 'N'){ # skipped regions (splicing) do affect the end position
								# warn "Operation is 'N',adding $len[$index] bp\n";
								$end += $len[$index];
							}
							elsif($ops[$index] eq 'S'){ # soft-clipped bases do NOT affect the end position
								# warn "Operation is 'S'\n";
							}
							else{
								die "PE: Found CIGAR operations other than M, I, D, S or N: '$ops[$index]'. Not allowed at the moment\n";
							}
						}
					}
				}
				else{
					# else read 1 aligns in reverse orientation and we need to work out the end of the fragment first, and use the start of the next line

					$end = $start - 1; # need to adjust this only once
				
					# for InDel free matches we can simply use the M number in the CIGAR string
					if ($cigar =~ /^(\d+)M$/){ # linear match
						$end += $1;
					}
					else{
						# parsing CIGAR string
						my @len = split (/\D+/,$cigar); # storing the length per operation
						my @ops = split (/\d+/,$cigar); # storing the operation
						shift @ops; # remove the empty first element
						die "CIGAR string contained a non-matching number of lengths and operations ($cigar)\n" unless (scalar @len == scalar @ops);
						
						# warn "CIGAR string: $cigar\n";
						### determining end position of the read
						foreach my $index(0..$#len){
							if ($ops[$index] eq 'M'){  # standard matching bases
								$end += $len[$index];
								# warn "Operation is 'M', adding $len[$index] bp\n";
							}
							elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
								# warn "Operation is 'I', next\n";
							}
							elsif($ops[$index] eq 'D'){ # deletions do affect the end position
								# warn "Operation is 'D',adding $len[$index] bp\n";
								$end += $len[$index];
							}
							elsif($ops[$index] eq 'N'){ # skipped regions (splicing) do affect the end position
								#warn "Operation is 'N',adding $len[$index] bp\n";
								$end += $len[$index];
							}
							elsif($ops[$index] eq 'S'){ # soft-clipped bases do NOT affect the end position
								# warn "Operation is 'S'\n";
							}
							else{
								die "PE: Found CIGAR operations other than M, I, D, S or N: '$ops[$index]'. Not allowed at the moment\n";
							}
						}
					}
					
					### reading in the next line
					$_ = <IN>;
					# the only thing we need is the start position
					($start) = (split (/\t/))[3];
				}
				$composite = join (":",$strand,$chr,$start,$end);
				# warn "Adding the following string:\t$composite\n";
			}
			else{
				die "Input must be single or paired-end\n";
			}
			# warn "Composite is: $composite\n";
			if (exists $unique_seqs{$composite}){
				++$removed;
				unless (exists $positions{$composite}){
					$positions{$composite}++;
				}
			}
			else{
				if ($paired){
					print OUT "$line1"; # printing first paired-end line for SAM output
				}
				print OUT "$_"; # printing single-end SAM alignment or second paired-end line
				$unique_seqs{$composite}++;
			}
		}
	

		my $percentage;
		my $percentage_leftover;
		my $leftover = $count - $removed;

		unless ($count == 0){
			$percentage = sprintf("%.2f",$removed/$count*100);
			$percentage_leftover = sprintf("%.2f",$leftover/$count*100);
		}
		else{
			$percentage = 'N/A';
			$percentage_leftover = 'N/A';
		}

		warn "\nTotal number of alignments analysed in $file:\t$count\n";
		warn "Total number duplicated alignments removed:\t$removed ($percentage%)\n";
		warn "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n";
		warn "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n";

		print REPORT "\nTotal number of alignments analysed in $file:\t$count\n";
		print REPORT "Total number duplicated alignments removed:\t$removed ($percentage%)\n";
		print REPORT "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n";
		print REPORT "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n";


		close OUT or warn "Failed to close output filehandle: $!\n";
		close REPORT or warn "Failed to close report filehandle: $!\n";
		
		if ($multiple){
			warn "Finished deduplication of multiple files\n@filenames\n\n";
			last;
		}
	}
}


sub deduplicate_barcoded_rrbs{

    my $file = shift;

    my %unique_seqs;
    my %positions;

    if ($file =~ /\.gz$/){
		open (IN,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n";
    }
    elsif ($file =~ /\.bam$/){
		open (IN,"$samtools_path view -h $file |") or die "Unable to read from BAM file $file: $!\n";
    }
    else{
		open (IN,$file) or die "Unable to read from $file: $!\n";
    }

	# warn "User defined outfile name is: $user_defined_outfile\n";
    if ($user_defined_outfile){
		# using a user-specified name
	}
	else{
		$outfile = $file; # deriving the output filename from the input file. Default
		# warn "Else: $outfile = \$file";
	}
	$outfile =~ s/.*\///; # removing path information

	$outfile =~ s/\.gz$//;
    $outfile =~ s/\.sam$//;
    $outfile =~ s/\.bam$//;
    $outfile =~ s/\.txt$//;

   	if ($bam == 1){
	    $outfile =~ s/$/.deduplicated.bam/;
	}
	elsif ($bam == 2){
	    $outfile =~ s/$/.deduplicated.sam.gz/;
	}
	else{
	    $outfile =~ s/$/.deduplicated.sam/;
	}

	if ($multiple){
		$outfile =~ s/deduplicated/multiple.deduplicated/;
	}

   
    if ($bam == 1){
		open (OUT,"| $samtools_path view -bSh 2>/dev/null - > ${output_dir}$outfile") or die "Failed to write to ${output_dir}$outfile: $!\n";
    }
    elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
		open (OUT,"| gzip -c - > ${output_dir}$outfile") or die "Failed to write to ${output_dir}$outfile: $!\n";
    }
    else{
		open (OUT,'>',"${output_dir}$outfile") or die "Unable to write to ${output_dir}$outfile: $!\n";
    }

    ### This mode only supports Bismark SAM/BAM output
    my $count = 0;
    my $unique_seqs = 0;
    my $removed = 0;

    while (<IN>){

		if ($count == 0){
			if ($_ =~ /^Bismark version:/){
				warn "The file appears to be in the custom Bismark and not SAM format. Please see option --vanilla!\n";
				sleep(2);
				print_helpfile();
				exit;
			}
		}

		### if this was a SAM file we ignore header lines
		if (/^\@\w{2}\t/){
			warn "skipping SAM header line:\t$_";
			print OUT; # Printing the header lines again into the de-duplicated file
			next;
		}

		++$count;
		my $composite; # storing positional data. For single end data we are only using the start coordinate since the end might have been trimmed to different lengths
		### in this barcoded mode we also store the read barcode as additional means of assisting the deduplication
		### in effect the $composite string looks like this (separated by ':'):

		### strand identity:chromosome:start:barcode

		my $end;
		my $line1;

		# SAM format
		my ($id,$strand,$chr,$start,$cigar) = (split (/\t/))[0,1,2,3,5]; # we are assigning the FLAG value to $strand

		$id =~ /:(\w+)$/;
		my $barcode = $1;
		unless ($barcode){
			die "Failed to extract a barcode from the read ID (last element of each read ID needs to be the barcode sequence, e.g. ':CATG'\n\n";
		}
		
		my $read_conversion;
		my $genome_conversion;
		
		while ( /(XR|XG):Z:([^\t]+)/g ) {
			my $tag = $1;
			my $value = $2;
			
			if ($tag eq "XR") {
				$read_conversion = $value;
				$read_conversion =~ s/\r//;
				chomp $read_conversion;
			}
			elsif ($tag eq "XG") {
				$genome_conversion = $value;
				$genome_conversion =~ s/\r//;
				chomp $genome_conversion;
			}
		}
		die "Failed to determine read and genome conversion from line: $line1\n\n" unless ($read_conversion and $genome_conversion);

		
		my $index;
		if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
			$index = 0;
			# $strand = '+'; # old
			$strand = 'OT';
		} elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
			$index = 1;
			# $strand = '-'; # old
			$strand = 'CTOT';
		} elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
			$index = 2;
			# $strand = '+'; # old
			$strand = 'CTOB';
		} elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
			$index = 3;
			# $strand = '-'; # old
			$strand = 'OB';
		} else {
			die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
		}
		
		
		### SAM single-end
		if ($single){
			
			if ($index == 0 or $index == 2){
				### read aligned to the forward strand. No action needed
			}
			else{
				### read is on reverse strand
				
				$start -= 1; # only need to adjust this once
				
				# for InDel free matches we can simply use the M number in the CIGAR string
				if ($cigar =~ /^(\d+)M$/){ # linear match
					$start += $1;
				}
				else{
					# parsing CIGAR string
					my @len = split (/\D+/,$cigar); # storing the length per operation
					my @ops = split (/\d+/,$cigar); # storing the operation
					shift @ops; # remove the empty first element
					die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);

					# warn "CIGAR string; $cigar\n";
					### determining end position of a read
					foreach my $index(0..$#len){
						if ($ops[$index] eq 'M'){  # standard matching bases
							$start += $len[$index];
							# warn "Operation is 'M', adding $len[$index] bp\n";
						}
						elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
							# warn "Operation is 'I', next\n";
						}
						elsif($ops[$index] eq 'D'){ # deletions do affect the end position
							#  warn "Operation is 'D',adding $len[$index] bp\n";
							$start += $len[$index];
						}
						elsif($ops[$index] eq 'N'){ # skipped regions (splices) do affect the end position
							#  warn "Operation is 'N',adding $len[$index] bp\n";
							$start += $len[$index];
						}
						elsif($ops[$index] eq 'S'){ # soft-clipped bases do NOT affect the end position
							# warn "Operation is 'S', next\n";
						}
						else{
							die "SE barcoded BAM: Found CIGAR operations other than M, I, D, S or N: '$ops[$index]'. Not allowed at the moment\n";
						}
					}
				}
			}

			### Here we take the barcode sequence into consideration
			$composite = join (":",$strand,$chr,$start,$barcode);
			# warn "Adding the following string:\t$composite\n";    
			# sleep(1);
		}
		elsif($paired){

			### storing the current line
			$line1 = $_;

			# if the read aligns in forward orientation we can certainly use the start position of read 1, and only need to work out the end position of read 2	
			if ($index == 0 or $index == 2){
			
				### reading in the next line
				$_ = <IN>;
				# the only thing we need is the end position
				($end,my $cigar_2) = (split (/\t/))[3,5];

				$end -= 1; # only need to adjust this once
				
				# for InDel free matches we can simply use the M number in the CIGAR string
				if ($cigar_2 =~ /^(\d+)M$/){ # linear match
					$end += $1;
				}
				else{
					# parsing CIGAR string
					my @len = split (/\D+/,$cigar_2); # storing the length per operation
					my @ops = split (/\d+/,$cigar_2); # storing the operation
					shift @ops; # remove the empty first element
					die "CIGAR string contained a non-matching number of lengths and operations ($cigar_2)\n" unless (scalar @len == scalar @ops);
					
					# warn "CIGAR string; $cigar_2\n";
					### determining end position of the read
					foreach my $index(0..$#len){
						if ($ops[$index] eq 'M'){  # standard matching bases
							$end += $len[$index];
							# warn "Operation is 'M', adding $len[$index] bp\n";
						}
						elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
							# warn "Operation is 'I', next\n";
						}
						elsif($ops[$index] eq 'D'){ # deletions do affect the end position
							#  warn "Operation is 'D',adding $len[$index] bp\n";
							$end += $len[$index];
						}
						elsif($ops[$index] eq 'N'){ # skipped regions (splices) do affect the end position
							#  warn "Operation is 'N',adding $len[$index] bp\n";
							$end += $len[$index];
						}
						elsif($ops[$index] eq 'S'){ # soft-clipped positions do NOT affect the end position
							# warn "Operation is 'S', next\n";
						}
						else{
							die "Found CIGAR operations other than M, I, D, S or N: '$ops[$index]'. Not allowed at the moment\n";
						}
					}
				}
			}
			else{
				# else read 1 aligns in reverse orientation and we need to work out the end of the fragment first, and use the start of the next line
				
				$end = $start - 1; # need to adjust this only once
				
				# for InDel free matches we can simply use the M number in the CIGAR string
				if ($cigar =~ /^(\d+)M$/){ # linear match
					$end += $1;
				}
				else{
					# parsing CIGAR string
					my @len = split (/\D+/,$cigar); # storing the length per operation
					my @ops = split (/\d+/,$cigar); # storing the operation
					shift @ops; # remove the empty first element
					die "CIGAR string contained a non-matching number of lengths and operations ($cigar)\n" unless (scalar @len == scalar @ops);
					
					# warn "CIGAR string; $cigar\n";
					### determining end position of the read
					foreach my $index(0..$#len){
						if ($ops[$index] eq 'M'){  # standard matching bases
							$end += $len[$index];
							# warn "Operation is 'M', adding $len[$index] bp\n";
						}
						elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
							# warn "Operation is 'I', next\n";
						}
						elsif($ops[$index] eq 'D'){ # deletions do affect the end position
							# warn "Operation is 'D',adding $len[$index] bp\n";
							$end += $len[$index];
						}
						elsif($ops[$index] eq 'N'){ # skipped regions (splices) do affect the end position
							# warn "Operation is 'N',adding $len[$index] bp\n";
							$end += $len[$index];
						}
						elsif($ops[$index] eq 'S'){ # soft-clipped positions do NOT affect the end position
							# warn "Operation is 'S', next\n";
						}
						else{
							die "Found CIGAR operations other than M, I, D, S or N: '$ops[$index]'. Not allowed at the moment\n";
						}
					}
				}
				
				### reading in the next line
				$_ = <IN>;
				# the only thing we need is the start position
				($start) = (split (/\t/))[3];
			}

			### Here we take the barcode sequence into consideration
			$composite = join (":",$strand,$chr,$start,$end,$barcode);
			# warn "Adding the following string:\t$composite\n";   
		}
		else{
			die "Input must be single or paired-end\n";
		}

		if (exists $unique_seqs{$composite}){
			++$removed;
			unless (exists $positions{$composite}){
				$positions{$composite}++;
			}
		}
		else{
			if ($paired){
				print OUT $line1; # printing first paired-end line for SAM output
			}
			print OUT; # printing single-end SAM alignment or second paired-end line
			$unique_seqs{$composite}++;
		}
    }

    my $percentage;
    my $percentage_leftover;
    my $leftover = $count - $removed;

    unless ($count == 0){
		$percentage = sprintf("%.2f",$removed/$count*100);
		$percentage_leftover = sprintf("%.2f",$leftover/$count*100);
    }
    else{
		$percentage = 'N/A';
		$percentage_leftover = 'N/A';
    }


    warn "\nTotal number of alignments analysed in $file (UMI-mode):\t$count\n";
    warn "Total number duplicated alignments removed:\t$removed ($percentage%)\n";
    warn "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n";
    warn "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n";

    print REPORT "\nTotal number of alignments analysed in $file (UMI mode):\t$count\n";
    print REPORT "Total number duplicated alignments removed:\t$removed ($percentage%)\n";
    print REPORT "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n";
    print REPORT "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n";

}

sub bam_isEmpty{
      
    my $file = shift;

    if ($file =~ /\.bam$/){
		open (EMPTY,"$samtools_path view $file |") or die "Unable to read from BAM file $file: $!\n";
    }
    else{
		open (EMPTY,$file) or die "Unable to read from $file: $!\n";
    }
    my $count = 0;
    while (<EMPTY>){
		if ($_){
			$count++;  # file contains data, fine.
		}
		last; # one line is enough
    }

    if ($count == 0){
		die "\n### File appears to be empty, terminating deduplication process. Please make sure the input file has not been truncated. ###\n\n";
    }
    close EMPTY or warn "$!\n";
}

sub bam_isTruncated{
    
    my $file = shift;
    warn "Checking file >>$file<< for signs of file truncation...\n";
    
    open (TRUNC,"$samtools_path view 2>&1 $file |") or die "Unable to read from BAM file $file: $!\n"; # 2>&1 redirects stderr to stdout, so we should be able to capture problems
    
    my $count = 0;
    while (<TRUNC>){
		chomp;
		++$count;
		# errors tend to start with a [], I have seen e.g.: 
		# [W::bam_hdr_read] EOF marker is absent. The input is probably truncated.
		if ($_ =~ /^\[/){ 
			if ($_ =~ /[EOF|truncated]/){ 
				die "Captured error message: '$_'\n\n[ERROR] The file appears to be truncated, please ensure that there were no errors while copying the file!!! Exiting...\n\n";
			}
		}
		last if ($count == 10); 	# this should be enough
    }
    close TRUNC or warn "$!\n";
}


sub print_helpfile{
	print "\n",'='x111,"\n";
	print "\nThis script is supposed to remove alignments to the same position in the genome from the Bismark mapping output\n(both single and paired-end SAM/BAM files), which can arise by e.g. excessive PCR amplification. If sequences align\nto the same genomic position but on different strands they will be scored individually.\n\nNote that deduplication is not recommended for RRBS-type experiments!\n\nIn the default mode, the first alignment to a given position will be used irrespective of its methylation call\n(as the alignments are not ordered in any way this is also near enough random).\n\n";
	print "For single-end alignments only the chromosome, start coordinate and strand of a read will be used for deduplication.\n\n";
	print "For paired-end alignments the chromosome, strand alignments, and start as well as end coordinate of the entire fragment\nwill be used for deduplication. ";
	print ">This script expects the Bismark output to be in SAM format<.\n\n";
	print "*** Please note that for paired-end BAM files the deduplication script expects Read1 and Read2 to\nfollow each other in consecutive lines! If the file has been sorted by position make sure that you resort it\nby read name first (e.g. using samtools sort -n)  ***\n\n";
	
	print '='x111,"\n\n";
	print ">>> USAGE: ./deduplicate_bismark [options] filename(s) <<<\n\n";

	print "-s/--single\t\tdeduplicate single-end BAM/SAM Bismark files. Default: [AUTO-DETECT]\n";
	print "-p/--paired\t\tdeduplicate paired-end BAM/SAM Bismark files. Default: [AUTO-DETECT]\n\n";
	print "-o/--outfile [filename]\tThe basename of a desired output file. This basename is modified to end in '.deduplicated.bam',\n\t\t\tor '.multiple.deduplicated.bam' in '--multiple' mode, for consistency reasons.\n\n";
	print "--output_dir [path]\tOutput directory, either relative or absolute. Output is written to the current directory if not\n\t\t\tspecified explicitly.\n\n";
	
	print "--barcode\t\tIn addition to chromosome, start position and orientation this will also take a potential barcode into\n\t\t\tconsideration while deduplicating. The barcode needs to be the last element of the read ID and separated\n                        by a ':', e.g.: MISEQ:14:000000000-A55D0:1:1101:18024:2858_1:N:0:CTCCT\n\n";
	print "--bam\t\t\tThe output will be written out in BAM format. This script will attempt to use the path to Samtools\n\t\t\tthat was specified with '--samtools_path', or, if it hasn't been specified, attempt to find Samtools\n\t\t\tin the PATH. If no installation of Samtools can be found, a GZIP compressed output is written out\n\t\t\tinstead (yielding a .sam.gz output file). Default: ON.\n\n";
	print "--sam\t\t\tThe output will be written out in SAM format. Default: OFF.\n\n";
	
	print "--multiple\t\tAll specified input files are treated as one sample and concatenated together for deduplication.\n\t\t\tThis uses Unix 'cat' for SAM files and 'samtools cat' for BAM files. Additional notes for BAM files:\n\t\t\tAlthough this works on either BAM or CRAM, all input files must be the same format as each other.\n\t\t\tThe sequence dictionary of each input file must be identical, although this command does not check this.\n\t\t\tBy default the header is taken from the first file to be concatenated.\n\n";
	print "--samtools_path [path]\tThe path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified\n\t\t\texplicitly if Samtools is in the PATH already\n\n";
	print "--version\t\tPrint version information and exit\n";
	
	print '='x111,"\n\n";

	print "This script was last modified on 20 September 2022\n\n";
}


sub test_positional_sorting{

	my $filename = shift;

	print "\nNow testing Bismark result file $filename for positional sorting (which would be bad...)\t";
	# sleep(1);

	if ($filename =~ /\.gz$/) {
		open (TEST,"gunzip -c $filename |") or die "Can't open gzipped file $filename: $!\n";
	}
	elsif ($filename =~ /bam$/ ||  isBam($filename) ){ ### this would allow to read BAM files that do not end in *.bam
		if ($samtools_path){
			open (TEST,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n";
		}
		else{
			die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n";
		}
	}
	else {
		open (TEST,$filename) or die "Can't open file $filename: $!\n";
	}

	my $count = 0;

	while (<TEST>) {
		if (/^\@/) {	     # testing header lines if they contain the @SO flag (for being sorted)
			if (/^\@SO/) {
				die "SAM/BAM header line '$_' indicates that the Bismark aligment file has been sorted by chromosomal positions which is is incompatible with correct methylation extraction. Please use an unsorted file instead (e.g. use samtools sort -n)\n\n";
			}
			next;
		}
		$count++;

		last if ($count > 100000); # else we test the first 100000 sequences if they start with the same read ID

		my ($id_1) = (split (/\t/));

		### reading the next line which should be read 2
		$_ = <TEST>;
		my ($id_2) = (split (/\t/));
		last unless ($id_2);
		++$count;
	
		if ($id_1 eq $id_2){
			### ids are the same
			next;
		}
		else{ ### in previous versions of Bismark we appended /1 and /2 to the read IDs for easier eyeballing which read is which. These tags need to be removed first
			my $id_1_trunc = $id_1;
			$id_1_trunc =~ s/\/1$//;
			my $id_2_trunc = $id_2;
			$id_2_trunc =~ s/\/2$//;

			unless ($id_1_trunc eq $id_2_trunc){
				die "\nThe IDs of Read 1 ($id_1) and Read 2 ($id_2) are not the same. This might be a result of sorting the paired-end SAM/BAM files by chromosomal position which is not compatible with correct methylation extraction. Please use an unsorted file instead (e.g. use samtools sort -n)\n\n";
			}
		}
	}
	#  close TEST or die $!; somehow fails on our cluster...
	### If it hasen't died so far then it seems the file is in the correct Bismark format (read 1 and read 2 of a pair directly following each other)
	warn "...passed!\n";
	
}

sub isBam{

	my $filename = shift;

	# reading the first line of the input file to see if it is a BAM file in disguise (i.e. a BAM file that does not end in *.bam which may be produced by Galaxy)
	open (DISGUISE,"gunzip -c $filename |") or die "Failed to open filehandle DISGUISE for $filename\n\n";

	### when BAM files read through a gunzip -c stream they start with BAM...
	my $bam_in_disguise = <DISGUISE>;
	# warn "BAM in disguise: $bam_in_disguise\n\n";

	if ($bam_in_disguise){
		if ($bam_in_disguise =~ /^BAM/){
			close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n";
			return 1;
		}
		else{
			close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n";
			return 0;
		}
	}
	else{
		close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n";
		return 0;
	}
}

sub process_commandline{

    my $help;
    my $representative;
    my $single;
    my $paired;
    my $global_single;
    my $global_paired;
    my $samtools_path;
    my $bam;
	my $sam;
    my $rrbs;
    my $version;
    my $multiple; # for multiple input files for the same sample
    my $output_dir;
	my $outfile;
	my $parallel;
	
    my $command_line = GetOptions ('help'                => \$help,
				   'representative'      => \$representative,
				   's|single'            => \$global_single,
				   'p|paired'            => \$global_paired,
				   'samtools_path=s'     => \$samtools_path,
				   'bam'                 => \$bam,
				   'sam'                 => \$sam,
				   'barcode'             => \$rrbs,
				   'version'             => \$version,
				   'multiple'            => \$multiple, 
				   'output_dir=s'        => \$output_dir,
				   'o|outfile=s'         => \$outfile,
				   'parallel=i'          => \$parallel,

	);

    die "Please respecify command line options\n\n" unless ($command_line);

    if ($help){
		print_helpfile();
		exit;
    }


    if ($version){
		print << "VERSION";

                  Bismark Deduplication Module

	      Deduplicator Version: $dedup_version
      Copyright 2010-22 Felix Krueger, Altos Labs Bioinformatics
	    https://github.com/FelixKrueger/Bismark
  	  
	  
VERSION
	  exit;
    }

    ### PARENT DIRECTORY
    my $parent_dir = getcwd();
	unless ($parent_dir =~ /\/$/){
		$parent_dir =~ s/$/\//;
    }

    ### OUTPUT DIRECTORY
    if (defined $output_dir){
		unless ($output_dir eq ''){
			unless ($output_dir =~ /\/$/){
				$output_dir =~ s/$/\//;
			}
			
			if (chdir $output_dir){
				$output_dir = getcwd(); #  making the path absolute
				unless ($output_dir =~ /\/$/){
					$output_dir =~ s/$/\//;
				}
			}
			else{
				mkdir $output_dir or die "Unable to create directory $output_dir $!\n";
				warn "Output directory $output_dir doesn't exist, creating it for you...\n\n"; # sleep(1);
				chdir $output_dir or die "Failed to move to $output_dir\n";
				$output_dir = getcwd(); #  making the path absolute
				unless ($output_dir =~ /\/$/){
					$output_dir =~ s/$/\//;
				}
			}
			warn "Output will be written into the directory: $output_dir\n";
		}
    }
    else{
		$output_dir = '';
    }

    # Changing back to parent directory
    chdir $parent_dir or die "Failed to move to $parent_dir\n";

    @filenames = @ARGV;
    unless (@filenames){
		print "Please provide one or more Bismark output files for deduplication\n\n";
		sleep (1);
		print_helpfile();
		exit;
    }

	### Ensuring that there is only a single input file if --outfile
	if (defined $outfile){
		
		if ($multiple){
			# potentially fine (see below). Several output files will get written to a single, pre-specified output file
		}
		else{		
			if (scalar @filenames == 1){
				# fine
			}
			else{
				die "\nYou can only specify an output filename if a single input file was given (you supplied ",scalar @filenames," input files. Please re-specify!\n\n";
			}
		}
		warn "Output filename was given as: $outfile\n";
	}	

    ### representative mode is no longer available
    if($representative){
		die "Deduplication in '--representative' mode is no longer supported. Please stop wanting that.\n";
    }
    # ensuring we have at least 2 files
    if ($multiple){
		unless(scalar @filenames > 1){
			die "Please supply 2 or more files when using '--multiple'\n";	
		}
		if ($rrbs){
			die "The option --multiple is currently only working in default (= random alignment) mode\n\n";
		}
    }

    ### OPTIONS
    unless ($global_single or $global_paired){
		warn "\nNeither -s (single-end) nor -p (paired-end) selected for deduplication. Trying to extract this information for each file separately from the \@PG line of the SAM/BAM file\n";
    }

    if ($global_paired){
		if ($global_single){
			die "Please select either -s for single-end files or -p for paired-end files, but not both at the same time!\n\n";
		}
		warn "Processing paired-end Bismark output file(s) (SAM format):\n";
		warn join ("\t",@filenames),"\n\n";
	}
    else{
	    warn "Processing single-end Bismark output file(s) (SAM format):\n";
	    warn join ("\t",@filenames),"\n\n";
	}
    

    ### PATH TO SAMTOOLS
    if (defined $samtools_path){
		# if Samtools was specified as full command
		if ($samtools_path =~ /samtools$/){
			if (-e $samtools_path){
				# Samtools executable found
			}
			else{
				die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
			}
		}
		else{
			unless ($samtools_path =~ /\/$/){
				$samtools_path =~ s/$/\//;
			}
			$samtools_path .= 'samtools';
			if (-e $samtools_path){
				# Samtools executable found
			}
			else{
				die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
			}
		}
    }
    # Check whether Samtools is in the PATH if no path was supplied by the user
    else{
		if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if Samtools is in the PATH
			$samtools_path = `which samtools`;
			chomp $samtools_path;
		}
    }
	
	### Making --bam mode the default (07 03 2019)
	if ($sam){
		$bam = 0;
	}
	else{
		$bam = 1;
	}
	
    if ($bam){
		if (defined $samtools_path){
			$bam = 1;
		}
		else{
			warn "No Samtools found on your system, writing out a gzipped SAM file instead\n";
			$bam = 2;
		}
    }
    
    return ($single,$paired,$global_single,$global_paired,$samtools_path,$bam,$rrbs,$multiple,$output_dir,$outfile,$parallel);
}