-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmerge_fastq_files
executable file
·79 lines (67 loc) · 2.07 KB
/
merge_fastq_files
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/perl
# This script combines two possibly redundant fastq files
#
# It assumes the two files contain possibly overlapping
# sets of records. It outputs the OR set of both file
# records.
#
# Mark Stenglein July 14, 2011
# print "this script does not support new illumina format fastq files. needs to be updated. Mark 10.18.2012\n";
# exit;
my $usage = "merge_fastq_files <fastq_file_1> <fastq_file_2>\n";
my $r1_filename = shift or print ($usage) and die ($!);
my $r2_filename = shift or print ($usage) and die ($!);
open (my $r1_fh, "<", $r1_filename) or print ($usage) and die ($!);
open (my $r2_fh, "<", $r2_filename) or print ($usage) and die ($!);
open (my $out_fh, ">-") or die ("error, couldn't open stdout\n");
my %headers = ();
# first, store set of all headers in a hash
my $header = undef;
my $seq = undef;
my $qual_header = undef;
my $qual = undef;
my @fhs = ($r1_fh, $r2_fh);
foreach my $fh (@fhs)
{
RECORD: while (($header, $seq, $qual_header, $qual) = read_fastq_record($fh))
{
last if (!defined($header));
# if we've already seen a record with this header...
if (defined $headers{$header})
{
# don't do anything - just increment counter
$headers{$header} += 1;
}
else
{
# note that we've encountered it for first time
$headers{$header} = 1;
print "$header\n$seq\n$qual_header\n$qual\n";
}
}
}
# reads a fastq record from a fh (passed as arg)
# returns the header, seq, qual_header, and $qual
# as the elements of a list returned
#
# This assumes 4-line fastq format
sub read_fastq_record
{
my $fh = $_[0];
my $header = <$fh>;
if (!defined $header) { return undef; } # no more records
# make sure it's a fastq header
if (!($header =~ /^@/))
{
print "error reading fastq file. Was expecting 4-line fastq format. read this:\n$header\n";
exit;
}
chomp $header;
my $seq = <$fh>;
chomp $seq;
my $qual_header = <$fh>;
chomp $qual_header;
my $qual = <$fh>;
chomp $qual;
return ($header, $seq, $qual_header, $qual);
}