-
Notifications
You must be signed in to change notification settings - Fork 0
/
4-find_CGs.pl
69 lines (62 loc) · 1.54 KB
/
4-find_CGs.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/perl
# 4-find_CGs.pl
#
# Jay Moore, John Innes Centre, Norwich
# jonathan.moore@jic.ac.uk
#
# 18/10/2017
#
# Version 1
#
# Change log
#
# Summary of functions
#
# Done:
#
# Accept a FASTA filepath from command line
# Parse the file identifying CG dinucleotides
# Output a list of tab-separated Chromosome IDs and start/end loci of CG dinucelotides
#
# To do:
#
# grab the FASTA filename from command line
my $fasta=$ARGV[0];
# This is the pattern to search for
my $pattern = 'CG';
# This is the adjustment to base position (0 or 1 base)
my $adjustment = 1;
# Does the alignment directory exist?
if (-e $fasta) {
# load the FASTA file into a hash, one entry per sequence, with no return characters
my %fasta;
open FASTA, $fasta;
my $current_sequence;
while (<FASTA>) {
my $line=$_;
chomp $line;
if (substr($line,0,1) eq '>') {
$current_sequence = substr $line, 1;
my @current_sequence = split ' ',$current_sequence;
$fasta{$current_sequence[0]} = '';
$current_sequence = $current_sequence[0];
#print "Loading $current_sequence\n";
} else {
$fasta{$current_sequence} .= $line;
}
}
close FASTA;
# Go through the entries in the hash outputting CG site loci
foreach my $seq(sort keys %fasta) {
#print $seq.' '.length($fasta{$seq})."\n";
for (my $locus=0; $locus<length($fasta{$seq}); $locus++) {
if (uc(substr($fasta{$seq},$locus,2)) eq $pattern) {
my $start = $locus+$adjustment;
my $end = $locus+$adjustment+1;
print $seq."\t".$start."\t".$end."\n";
}
}
}
} else {
print "$fasta not found.\n";
}