-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathtripal_eutils.install
375 lines (318 loc) · 10.7 KB
/
tripal_eutils.install
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
<?php
/**
 * @file
 * Install, uninstall, update and schema hooks for the tripal_eutils module.
 */
/**
 * Implements hook_install().
 *
 * Registers the 'NCBI BioSample Attributes' vocabulary, two terms in the
 * 'local' vocabulary, the NCBI database records, the Chado linker tables and
 * finally the BioSample attribute terms.
 */
function tripal_eutils_install() {
  chado_insert_cv('NCBI BioSample Attributes',
    'The ncbi BioSample Attributes CV is downloaded from https://www.ncbi.nlm.nih.gov/biosample/docs/attributes/?format=xml.');

  chado_insert_cvterm([
    'id' => 'local:full_ncbi_xml',
    'name' => 'full_ncbi_xml',
    'cv_name' => 'local',
  ]);
  chado_insert_cvterm([
    'id' => 'local:ncbi_FTP_links',
    'name' => 'NCBI Data Download FTP Link',
    'cv_name' => 'local',
  ]);

  tripal_eutils_add_dbs();
  tripal_eutils_install_chado_1_4_tables();

  // tripal_eutils_convert_terms() ends by calling
  // tripal_eutils_insert_biosample_attribute_terms(), so the attribute terms
  // are not imported separately here. Calling both would run the complete
  // attribute import twice during a single install.
  tripal_eutils_convert_terms();
}
/**
 * Implements hook_uninstall().
 *
 * Drops the tag dictionary table explicitly.
 */
function tripal_eutils_uninstall() {
  // In principle this drop is redundant, because the table is declared
  // through hook_schema(); it is kept as an explicit safety net.
  // NOTE(review): tripal_eutils_schema() declares the table without the
  // 'chado.' prefix used here -- confirm this qualified name is the intended
  // target.
  $tag_dictionary_table = 'chado.tripal_eutils_tag_dictionary';
  db_drop_table($tag_dictionary_table);
}
/**
 * Inserts additional biosample property terms (see @ticket 170).
 *
 * Each term is added to the 'NCBI BioSample Attributes' vocabulary with an
 * accession in the NCBI_BioSample_Attributes database.
 */
function tripal_eutils_insert_extra_biosample_terms() {
  // Accession => [term name, term definition], in insertion order.
  $extra_terms = [
    'samples provided by' => ['Samples provided by', ''],
    'Publication' => ['Publication', ''],
    'note' => ['Note', 'Misc. free text.'],
  ];

  foreach ($extra_terms as $accession => $details) {
    list($term_name, $term_definition) = $details;
    chado_insert_cvterm([
      'id' => 'NCBI_BioSample_Attributes:' . $accession,
      'name' => $term_name,
      'definition' => $term_definition,
      'cv_name' => 'NCBI BioSample Attributes',
    ]);
  }
}
/**
 * Updates the NCBI BioSample Attribute CV terms.
 *
 * Obtains the biomaterial attribute names from BiosamplePropertyLookup and
 * inserts each one as a term of the 'NCBI BioSample Attributes' vocabulary.
 * When the lookup yields nothing that can be iterated, a warning is logged
 * and no terms are inserted.
 */
function tripal_eutils_insert_biosample_attribute_terms() {
  // We use NCBI biosample attributes to fill the 'NCBI BioSample Attributes'
  // CV. These attributes can be accessed at the following url:
  // https://www.ncbi.nlm.nih.gov/biosample/docs/attributes/?format=xml
  $lookup = new \BiosamplePropertyLookup();
  $terms = $lookup->lookupAll();

  // lookupAll() is defined outside this file. Guard against a scalar or NULL
  // result (for example after a failed download -- TODO confirm what the
  // lookup returns on failure) so foreach below never receives a
  // non-iterable value.
  if (!is_array($terms) && !is_object($terms)) {
    watchdog(
      'tripal_eutils',
      'The NCBI BioSample attribute list could not be retrieved; no attribute terms were inserted.',
      [],
      WATCHDOG_WARNING
    );
    return;
  }

  foreach ($terms as $machine_name => $attributes) {
    chado_insert_cvterm(
      [
        'id' => 'NCBI_BioSample_Attributes:' . $machine_name,
        'name' => $attributes['label'],
        'definition' => $attributes['def'],
        'cv_name' => 'NCBI BioSample Attributes',
      ]
    );
  }
}
/**
 * Implements hook_schema().
 *
 * Declares the tripal_eutils_tag_dictionary table.
 *
 * @return array
 *   Schema definition keyed by table name.
 */
function tripal_eutils_schema() {
  // This table is not currently used. It would allow admins to use custom
  // cvterms when loading in properties. However, for now we want to limit
  // this.
  $columns = [];

  $columns['id'] = [
    'type' => 'serial',
    'unsigned' => TRUE,
    'not null' => TRUE,
  ];
  $columns['tag'] = [
    'type' => 'varchar',
    'length' => 255,
    'not null' => TRUE,
    'description' => 'The XML tag name.',
  ];
  $columns['type'] = [
    'description' => 'NCBI database type.',
    'type' => 'varchar',
    'length' => 60,
    'not null' => TRUE,
  ];
  $columns['status'] = [
    'description' => 'assigned, unassigned, ignored',
    'type' => 'varchar',
    'length' => 60,
    'not null' => TRUE,
  ];
  $columns['cvterm_id'] = [
    'description' => 'The cvterm this tag is mapped to.',
    'type' => 'int',
    'not null' => FALSE,
  ];

  return [
    'tripal_eutils_tag_dictionary' => [
      'description' => 'Stores all attribute tags encountered and their mappings',
      'fields' => $columns,
      'primary key' => ['id'],
    ],
  ];
}
/**
 * Describes the custom linker tables.
 *
 * These are not deleted on uninstall since they are chado 1.4 tables.
 *
 * @return array
 *   Table definitions keyed by table name: 'biomaterial_project' and
 *   'organism_analysis'.
 */
function tripal_eutils_chado_1_4_schema() {
  // Both tables share one layout: a serial primary key, two integer foreign
  // key columns, an index on each of those columns and a unique key over the
  // pair. Every name is derived from the two linked table names.
  $linker_table = function ($left, $right) {
    $table = $left . '_' . $right;
    $primary = $table . '_id';
    $left_id = $left . '_id';
    $right_id = $right . '_id';

    return [
      'fields' => [
        $primary => ['type' => 'serial', 'not null' => TRUE],
        $left_id => ['type' => 'int', 'not null' => TRUE],
        $right_id => ['type' => 'int', 'not null' => TRUE],
      ],
      'primary key' => [$primary],
      'foreign keys' => [
        $left => [
          'table' => $left,
          'columns' => [$left_id => $left_id],
        ],
        $right => [
          'table' => $right,
          'columns' => [$right_id => $right_id],
        ],
      ],
      'indexes' => [
        $table . '_idx1' => [$left_id],
        $table . '_idx2' => [$right_id],
      ],
      'unique keys' => [
        $table . '_unique_uq1' => [$left_id, $right_id],
      ],
    ];
  };

  return [
    // Linker table between biomaterials and projects.
    'biomaterial_project' => $linker_table('biomaterial', 'project'),
    // Linker table between organisms and analyses.
    // This table schema is identical to the tripal_manage_analyses module
    // schema.
    'organism_analysis' => $linker_table('organism', 'analysis'),
  ];
}
/**
 * Adds the Chado db records for the NCBI databases this module references.
 *
 * The records are inserted in the order listed, followed by the assembly
 * cross-reference databases.
 */
function tripal_eutils_add_dbs() {
  $databases = [
    [
      'name' => 'NCBI BioSample',
      'description' => 'The BioSample database contains descriptions of biological source materials used in experimental assays.',
      'urlprefix' => 'http://www.ncbi.nlm.nih.gov/biosample/{accession}',
      'url' => 'http://www.ncbi.nlm.nih.gov/biosample/',
    ],
    [
      'name' => 'NCBI SRA',
      'url' => 'https://www.ncbi.nlm.nih.gov/sra/',
      'urlprefix' => 'https://www.ncbi.nlm.nih.gov/sra/{accession}',
    ],
    [
      'name' => 'NCBI_BioSample_Attributes',
      'description' => 'This database provides, in XML format, the listing of attribute names for biosamples housed in NCBI.',
      'url' => 'https://www.ncbi.nlm.nih.gov/biosample/docs/attributes',
    ],
    [
      'name' => 'NCBI WGS',
      'url' => 'https://www.ncbi.nlm.nih.gov/Traces/wgs/',
      'urlprefix' => 'https://www.ncbi.nlm.nih.gov/Traces/wgs/{accession}',
    ],
    [
      'name' => 'NCBI Refseq',
      'description' => 'A comprehensive, integrated, non-redundant, well-annotated set of reference sequences including genomic, transcript, and protein.',
      'url' => 'https://www.ncbi.nlm.nih.gov/refseq/',
      'urlprefix' => 'https://www.ncbi.nlm.nih.gov/refseq/{accession}',
    ],
    [
      'name' => 'NCBI GenBank',
      'description' => 'GenBank ® is the NIH genetic sequence database, an annotated collection of all publicly available DNA sequences (Nucleic Acids Research, 2013 Jan;41(D1):D36-42).',
      'url' => 'https://www.ncbi.nlm.nih.gov/genbank/',
      'urlprefix' => 'https://www.ncbi.nlm.nih.gov/genbank/{accession}',
    ],
    [
      'name' => 'NCBI BioProject',
      'description' => 'A BioProject is a collection of biological data related to a single initiative, originating from a single organization or from a consortium.',
      'urlprefix' => 'https://www.ncbi.nlm.nih.gov/bioproject/{accession}',
      'url' => 'https://www.ncbi.nlm.nih.gov/bioproject/',
    ],
    [
      'name' => 'NCBI Assembly',
      'description' => 'A database providing information on the structure of assembled genomes, assembly names and other meta-data, statistical reports, and links to genomic sequence data..',
      'urlprefix' => 'https://www.ncbi.nlm.nih.gov/assembly/{accession}',
      'url' => 'https://www.ncbi.nlm.nih.gov/assembly/',
    ],
  ];

  foreach ($databases as $db_values) {
    chado_insert_db($db_values);
  }

  tripal_eutils_create_dbs_for_assembly_xrefs();
}
/**
 * Creates the Chado linker tables this module needs.
 *
 * These tables will be in the Chado 1.4 release so let's add them here for
 * now. Their definitions come from tripal_eutils_chado_1_4_schema().
 */
function tripal_eutils_install_chado_1_4_tables() {
  $definitions = tripal_eutils_chado_1_4_schema();

  foreach (['biomaterial_project', 'organism_analysis'] as $table_name) {
    chado_create_custom_table($table_name, $definitions[$table_name], TRUE, NULL, FALSE);
  }
}
/**
 * Creates db entries for keys introduced in the assembly loader.
 *
 * Both records share the same description and URLs; only the name differs.
 */
function tripal_eutils_create_dbs_for_assembly_xrefs() {
  $shared_values = [
    'description' => 'A database providing information on the structure of assembled genomes, assembly names and other meta-data, statistical reports, and links to genomic sequence data..',
    'urlprefix' => 'https://www.ncbi.nlm.nih.gov/assembly/{accession}',
    'url' => 'https://www.ncbi.nlm.nih.gov/assembly/',
  ];

  foreach (['Refseq Assembly', 'Genbank Assembly'] as $db_name) {
    chado_insert_db(['name' => $db_name] + $shared_values);
  }
}
/**
 * Converts terms from ncbi_properties to NCBI_BioSample_Attributes.
 *
 * Inserts the vocabulary and database records, the
 * submitter_provided_accession term, and then the attribute terms.
 */
function tripal_eutils_convert_terms() {
  $cv_name = 'NCBI BioSample Attributes';
  $db_name = 'NCBI_BioSample_Attributes';

  // Only the new cv and db entries are inserted at this point, not the terms
  // from the XML (which is why tripal_eutils_install() cannot simply be
  // called again).
  chado_insert_cv(
    $cv_name,
    'The ncbi BioSample Attributes CV is downloaded from https://www.ncbi.nlm.nih.gov/biosample/docs/attributes/?format=xml.'
  );

  $db_values = [
    'name' => $db_name,
    'description' => 'Attribute and property terms for NCBI.',
    'url' => 'http://www.ncbi.nlm.nih.gov/',
  ];
  chado_insert_db($db_values);

  // This term is inserted with the update_existing option switched off.
  $accession_term = [
    'id' => $db_name . ':submitter_provided_accession',
    'name' => 'submitter_provided_accession',
    'cv_name' => $cv_name,
  ];
  chado_insert_cvterm($accession_term, ['update_existing' => FALSE]);

  // Insert the terms from NCBI's Sample XML file into the
  // 'NCBI BioSample Attributes' CV.
  tripal_eutils_insert_biosample_attribute_terms();
}
/**
 * Add extra biosample property terms.
 */
function tripal_eutils_update_7301() {
  // Delegates to the shared helper that inserts the extra terms.
  tripal_eutils_insert_extra_biosample_terms();
}
/**
 * Add extra databases for the assembly cross-references.
 */
function tripal_eutils_update_7302() {
  // Delegates to the helper that tripal_eutils_add_dbs() also calls.
  tripal_eutils_create_dbs_for_assembly_xrefs();
}
/**
 * Convert terms from ncbi_properties to NCBI_BioSample_Attributes.
 */
function tripal_eutils_update_7303() {
  // Delegates to the conversion routine that hook_install() also runs.
  tripal_eutils_convert_terms();
}