forked from bigbio/proteomics-sample-metadata
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvalidate_magetab.pl
355 lines (254 loc) · 9.02 KB
/
validate_magetab.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
#!/usr/bin/env perl
use strict;
use warnings;
# Oracle client environment. These two variables are assigned in a BEGIN
# block, i.e. before the "use" lines further down are processed; per the
# original note, Oracle DB connections needed by the checks (e.g. to the
# Conan DB) fail when they are not set.
BEGIN
{
    @ENV{ qw(ORACLE_HOME ORA_NLS33) } = (
        "/sw/arch/dbtools/oracle/product/9.2.0",
        "/sw/arch/dbtools/oracle/product/9.2.0/ocommon/nls/admin/data",
    );
}
=pod
=head1 NAME
validate_magetab.pl - a script to check that a MAGETAB experiment or array design
is suitable for loading into ArrayExpress
=head1 SYNOPSIS
=over 2
=item B<Experiment mode:>
validate_magetab.pl -i <IDF file>
validate_magetab.pl -m <Merged IDF and SDRF file>
validate_magetab.pl -i <IDF file> -d <data directory> -c
=item B<ADF mode:>
validate_magetab.pl -a <ADF file>
=back
=head1 DESCRIPTION
Script performs basic content validation on the supplied MAGE-TAB file. Note that this script will only attempt to
resolve references within the given file/files. Any reference which has a Term Source REF value of 'ArrayExpress'
is assumed to be available in ArrayExpress and the loader will check this at a later stage.
The script will return 0 if the file is considered safe to load.
Additional curation checks can be run using the -c option. In this case the following logs will be created:
expt_filename_error.log - All errors and warnings
expt_filename_report.log - Experiment description information and file list
expt_filename_data.log - Tabular report describing data files
expt_filename_feature.log - Missing design elements found in data files
expt_ATLAS_filename_error.log - Atlas specific errors and warnings
filename.png - A graph showing links between nodes (materials, assays, data) in the magetab file
=head1 OPTIONS
=over 4
=item B<-i> C<IDF filename>
The MAGE-TAB IDF file to be checked (SDRF file name will be obtained from the IDF)
=item B<-m> C<Merged MAGE-TAB IDF and SDRF filename>
A MAGE-TAB document in which a single IDF and SDRF have been combined (in that order),
with the start of each section marked by [IDF] and [SDRF] respectively. Note that such
documents are not compliant with the MAGE-TAB format specification; this format is used
by ArrayExpress to simplify data submissions.
=item B<-d> C<data directory>
Directory where the data files and SDRF can be found if they are not in the same directory
as the IDF
=item B<-c>
Flag to switch on full curator checking mode, including Atlas checks
=item B<-x>
Flag to indicate that all data file checks should be skipped
=item B<-a> C<ADF filename>
The MAGE-TAB ADF file to be checked.
=item B<-v>
Switch on verbose logging.
=item B<-h>
Prints a short help text.
=back
=head1 TESTS
=head1 AUTHOR
Anna Farne (farne@ebi.ac.uk), ArrayExpress team, EBI, 2012.
Modified by Emma Hastings (emma@ebi.ac.uk) and Amy Tang (amytang@ebi.ac.uk),
ArrayExpress team, EBI, 2014
Many of the experiment checks were implemented by Tim Rayner.
Acknowledgements go to the ArrayExpress curation team for feature
requests, bug reports and other valuable comments.
=cut
use Pod::Usage;
use Getopt::Long qw(:config no_ignore_case);
use File::Spec;
use Data::Dumper;
use EBI::FGPT::Reader::MAGETAB;
use EBI::FGPT::Writer::Report;
use EBI::FGPT::Reader::ADFParser;
use Log::Log4perl::Appender;
use Log::Log4perl::Level;
# Needed for ADF header recognition before using CPAN parser
# use ArrayExpress::Curator::Config qw($CONFIG);
# Script version, extracted from the SVN "Revision" keyword. The pattern only
# matches once the keyword has been expanded (e.g. "$Revision: 1234 $");
# with the unexpanded placeholder $VERSION is left undefined.
my $svn_revision = '$Revision$';
our $VERSION;
if ( $svn_revision =~ /^\$Revision: ([\d\.]*)/ ) {
    $VERSION = $1;
}
# parse_args()
#
# Reads the command-line options from @ARGV with Getopt::Long and checks that
# they form a usable combination.
#
# Returns: a hash reference with the keys merged_filename, idf_filename,
#          verbose, data_dir, curate, skip_data, adf_filename and adf_logdir.
#          A key holds undef when its option was not supplied.
#
# Exits with status 255 (via pod2usage, output on STDOUT) when:
#   - help was requested (-h),
#   - an option is unknown or malformed,
#   - none of -a, -i or -m was given,
#   - both -i and -m were given,
#   - -a was combined with -i or -m.
sub parse_args
{
    my ( %args, $want_help );

    # Shared reporter for usage errors: prints the message plus the synopsis
    # and terminates the script.
    my $usage_error = sub {
        my ($message) = @_;
        pod2usage(
            -message => $message,
            -exitval => 255,
            -output  => \*STDOUT,
            -verbose => 0,
        );
    };

    # GetOptions returns false when it meets an unknown or malformed option
    # (it has already warned about it on STDERR). Stop here instead of
    # carrying on with a partially parsed command line.
    GetOptions(
        "m|merged=s"   => \$args{merged_filename},
        "i|idf=s"      => \$args{idf_filename},
        "h|help"       => \$want_help,
        "v|verbose"    => \$args{verbose},
        "d|data_dir=s" => \$args{data_dir},
        "c|curate"     => \$args{curate},
        "x|skip_data"  => \$args{skip_data},
        "a|adf=s"      => \$args{adf_filename},
        "l|log=s"      => \$args{adf_logdir},
    ) or $usage_error->('Could not parse the command-line options.');

    if ($want_help)
    {
        pod2usage(
            -exitval => 255,
            -output  => \*STDOUT,
            -verbose => 1,
        );
    }

    my $has_experiment_input = $args{merged_filename} || $args{idf_filename};

    # At least one kind of input document is required.
    unless ( $has_experiment_input || $args{adf_filename} )
    {
        $usage_error->('You must provide an ADF, IDF or merged IDF/SDRF file.');
    }

    # The two experiment input styles are mutually exclusive.
    if ( $args{idf_filename} and $args{merged_filename} )
    {
        $usage_error->('You cannot provide an IDF AND merged IDF/SDRF file.');
    }

    # ADF mode and experiment mode are mutually exclusive.
    if ( $args{adf_filename} and $has_experiment_input )
    {
        $usage_error->('You cannot provide an ADF AND an IDF or merged magetab document');
    }

    return ( \%args );
}
# Parse and validate the command-line options (parse_args exits with a usage
# message when they are unusable).
my $args = parse_args();

### ADF CHECKS ###
# No log file locations are specified here because the ADF loader should be
# capturing the STDOUT messages (INFO, WARN and ERROR alike).
# This branch always terminates the script: exit status 1 when the ADF
# checker reports errors, 0 otherwise.
if ( $args->{adf_filename} ) {
    my $adf_checker = EBI::FGPT::Reader::ADFParser->new({
        'adf_path'        => $args->{adf_filename},
        'verbose_logging' => $args->{verbose}
    });
    $adf_checker->check;

    # Check for the presence of a single, valid accession number.
    # NOTE(review): both patterns below are unanchored, so any comment name
    # containing "ArrayExpressAccession" and any value containing something
    # like "A-MEXP-1" is accepted - confirm whether that is intended.
    my $arraydesign      = $adf_checker->get_arraydesign;
    my @accession_number = grep { $_->get_name =~ /ArrayExpressAccession/ }
        @{ $arraydesign->get_comments || [] };

    if ( scalar @accession_number == 0 ) {
        $adf_checker->error("Comment[ArrayExpressAccession] is missing. ADF is not valid for database loading.");
    } elsif ( scalar @accession_number > 1 ) {
        $adf_checker->error("There are multiple accession numbers in Comment[ArrayExpressAccession].");
    } elsif ( $accession_number[0]->get_value !~ /A-[A-Z]{4}-\d+/ ) {
        $adf_checker->error("The accession ".$accession_number[0]->get_value." is not in ArrayExpress format.");
    }

    # The appender name is quoted with plain single quotes in the message:
    # inside a double-quoted string "\a" is the BEL control character, which
    # would garble the reported name.
    my $checker_status_appender = Log::Log4perl->appender_by_name("adf_checker_status")
        or die("Could not find log appender named 'adf_checker_status'.");

    # Summary of the counts recorded by the status appender.
    print "\n";
    print "Number of ADF warnings: "
        . $checker_status_appender->howmany("WARN") . "\n";
    print "Number of ADF errors: "
        . $checker_status_appender->howmany("ERROR") . "\n";

    exit( $adf_checker->has_errors ? 1 : 0 );
}
### EXPERIMENT CHECKS ###
# Basic validation is always performed by the checker; which additional check
# set runs depends on whether curation mode (-c) was requested.
my $check_sets;
my $reader_params;

if ( $args->{curate} )
{
    # Curation mode: check sets are given an empty name because we do not
    # want log files created for them in this mode.
    # Currently disabled check sets:
    # $check_sets->{'EBI::FGPT::CheckSet::AEArchive'} = '';
    # $check_sets->{'EBI::FGPT::CheckSet::Curation'} = '';
    $check_sets->{'EBI::FGPT::CheckSet::AEAtlas'} = '';

    # One report appender for the general report and one for the Atlas log;
    # each is handed to the reader under its own name.
    for my $appender_name ( "report_writer", "atlas_report_writer" )
    {
        my $appender = Log::Log4perl::Appender->new(
            "EBI::FGPT::Writer::Report",
            name       => $appender_name,
            additivity => 1,
        );
        $appender->threshold($INFO);
        $reader_params->{$appender_name} = $appender;
    }

    # A temporary AE accession is supplied in curation mode so that
    # validation does not fail due to a missing accession.
    $reader_params->{accession} = "DUMMY";
}
else
{
    # Standard mode: the ae_validation log file will be created.
    $check_sets->{'EBI::FGPT::CheckSet::AEArchive'} = 'ae_validation';
}

# Reader parameters that follow directly from the script arguments.
$reader_params->{'check_sets'}       = $check_sets;
$reader_params->{'skip_data_checks'} = $args->{'skip_data'};

# The input is either a stand-alone IDF or a merged IDF/SDRF document; the
# reader receives it under a different parameter name in each case.
my ( $input_param, $filename ) =
    $args->{idf_filename}
    ? ( idf      => $args->{idf_filename} )
    : ( mtab_doc => $args->{merged_filename} );
$reader_params->{$input_param} = $filename;

# Data directory: the -d value when given, otherwise the directory part of
# the input file path, falling back to the current directory.
my $data_dir = $args->{data_dir};
unless ($data_dir)
{
    $data_dir = ( File::Spec->splitpath($filename) )[1] || ".";
}
$reader_params->{data_dir} = $data_dir;

$reader_params->{ "verbose_logging" } = 1 if $args->{ "verbose" };

print "\nData dir: " . $reader_params->{data_dir} . "\n\n";

# Run the checks and print the checker's status summary.
my $checker = EBI::FGPT::Reader::MAGETAB->new($reader_params);
$checker->parse();
$checker->print_checker_status();
END
{
    # Clean-up of Config.yml, which is created by the PAR unpacking process.
    # The requirement to have this file in the PAR archive is to be removed
    # during refactoring (it is needed by ArrayExpress::Curator::Config).
    my $par_config = "Config.yml";

    # -M gives the file's age in days relative to script start, so a negative
    # value means the file was written after this script started. Only such a
    # file is deleted, to avoid removing Config files which are not related
    # to the running of this script.
    if ( -r $par_config and ( -M $par_config ) < 0 )
    {
        unlink $par_config;
    }
}
# Exit status for experiment mode: 1 when the checker recorded errors,
# 0 otherwise.
exit( $checker->has_errors ? 1 : 0 );