-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathrsync_from_ncbi.pl
executable file
·166 lines (147 loc) · 5.27 KB
/
rsync_from_ncbi.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#!/usr/bin/env perl
# Copyright 2013-2019, Derrick Wood <dwood@cs.jhu.edu>
#
# This file is part of the Kraken 2 taxonomic sequence classification system.
# Reads an assembly_summary.txt file, which indicates taxids and FTP paths for
# genome/protein data. Performs the download of the complete genomes from
# that file, decompresses, and explicitly assigns taxonomy as needed.
# Modified by Bui Van-Kien (buikien.dp@sjtu.edu.cn)
# used for CDKAM: a metagenomic classification tool using discriminative k-mers and approximate matching strategy
use strict;
use warnings;
use File::Basename;
use Getopt::Std;
use Net::FTP;
use List::Util qw/max/;
my $PROG = basename $0;
my $SERVER = "ftp.ncbi.nlm.nih.gov";
my $SERVER_PATH = "/genomes";
my $FTP_USER = "anonymous";
my $FTP_PASS = "kraken2download";
my $qm_server = quotemeta $SERVER;
my $qm_server_path = quotemeta $SERVER_PATH;
my $use_ftp = $ENV{"CDKAM_USE_FTP"};
my $suffix = "_genomic.fna.gz";
# Manifest hash maps filenames (keys) to taxids (values)
my %manifest;
while (<>) {
next if /^#/;
chomp;
my @fields = split /\t/;
my ($assembly, $taxid, $asm_level, $ftp_path) = @fields[0, 5, 11, 19];
# Possible TODO - make the list here configurable by user-supplied flags
next unless grep {$asm_level eq $_} ("Complete Genome", "Chromosome");
next if $ftp_path eq "na"; # Skip if no provided path
my $full_path = $ftp_path . "/" . basename($ftp_path) . $suffix;
# strip off server/leading dir name to allow --files-from= to work w/ rsync
# also allows filenames to just start with "all/", which is nice
if (! ($full_path =~ s#^ftp://${qm_server}${qm_server_path}/##)) {
die "$PROG: unexpected FTP path (new server?) for $ftp_path\n";
}
my $fullID = basename($assembly) . "|" . basename($taxid);
$manifest{$full_path} = $fullID;
}
open MANIFEST, ">", "manifest.txt"
or die "$PROG: can't write manifest: $!\n";
print MANIFEST "$_\n" for keys %manifest;
close MANIFEST;
if ($use_ftp) {
print STDERR "Step 1/4: Performing ftp file transfer of requested files\n";
my $ftp = Net::FTP->new($SERVER, Passive => 1)
or die "$PROG: FTP connection error: $@\n";
$ftp->login($FTP_USER, $FTP_PASS)
or die "$PROG: FTP login error: " . $ftp->message() . "\n";
$ftp->binary()
or die "$PROG: FTP binary mode error: " . $ftp->message() . "\n";
$ftp->cwd($SERVER_PATH)
or die "$PROG: FTP CD error: " . $ftp->message() . "\n";
open MANIFEST, "<", "manifest.txt"
or die "$PROG: can't open manifest: $!\n";
mkdir "all" or die "$PROG: can't create 'all' directory: $!\n";
chdir "all" or die "$PROG: can't chdir into 'all' directory: $!\n";
while (<MANIFEST>) {
chomp;
$ftp->get($_)
or do {
my $msg = $ftp->message();
if ($msg !~ /: No such file or directory$/) {
warn "$PROG: unable to download $_: $msg\n";
}
};
}
close MANIFEST;
chdir ".." or die "$PROG: can't return to correct directory: $!\n";
}
else {
print STDERR "Step 1/4: Performing rsync file transfer of requested files\n";
system("rsync --no-motd --files-from=manifest.txt rsync://${SERVER}${SERVER_PATH}/ .") == 0
or die "$PROG: rsync error, exiting: $?\n";
print STDERR "Rsync file transfer complete.\n";
}
print STDERR "Step 2/4: Assigning taxonomic IDs to sequences\n";
my $output_file = "library.fna";
open OUT, ">", $output_file
or die "$PROG: can't write $output_file: $!\n";
my $projects_added = 0;
my $sequences_added = 0;
my $ch_added = 0;
my $ch = "bp";
my $max_out_chars = 0;
#my $taxid_file = "taxid.txt";
#open OUT2, ">", $taxid_file
# or die "$PROG: can't write $taxid_file: $!\n";
#for my $in_filename (keys %manifest) {
# my $taxid = $manifest{$in_filename};
# system("touch ./references/$taxid.txt");
# print OUT2 "$taxid\n";
#}
for my $in_filename (keys %manifest) {
my $taxid = $manifest{$in_filename};
if ($use_ftp) { # FTP downloading doesn't create full path locally
$in_filename = "all/" . basename($in_filename);
}
open IN, "gunzip -c $in_filename |" or die "$PROG: can't read $in_filename: $!\n";
while (<IN>) {
if (/^>/) {
s/^>/>CDKAM|$taxid|/;
$sequences_added++;
}
else {
$ch_added += length($_) - 1;
}
print OUT;
}
close IN;
unlink $in_filename;
$projects_added++;
my $out_line = progress_line($projects_added, scalar keys %manifest, $sequences_added, $ch_added) . "...";
$max_out_chars = max(length($out_line), $max_out_chars);
my $space_line = " " x $max_out_chars;
print STDERR "\r$space_line\r$out_line" if -t STDERR;
}
close OUT;
print STDERR " done.\n" if -t STDERR;
print STDERR "All files processed, cleaning up extra sequence files...";
system("rm -rf all/") == 0
or die "$PROG: can't clean up all/ directory: $?\n";
print STDERR " done, library complete.\n";
sub progress_line {
my ($projs, $total_projs, $seqs, $chs) = @_;
my $line = "Processed ";
$line .= ($projs == $total_projs) ? "$projs" : "$projs/$total_projs";
$line .= " project" . ($total_projs > 1 ? 's' : '') . " ";
$line .= "($seqs sequence" . ($seqs > 1 ? 's' : '') . ", ";
my $prefix;
my @prefixes = qw/k M G T P E/;
while (@prefixes && $chs >= 1000) {
$prefix = shift @prefixes;
$chs /= 1000;
}
if (defined $prefix) {
$line .= sprintf '%.2f %s%s)', $chs, $prefix, $ch;
}
else {
$line .= "$chs $ch)";
}
return $line;
}