forked from qiita-spots/qiita
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutil.py
2978 lines (2583 loc) · 105 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
r"""
Util functions (:mod: `qiita_db.util`)
======================================
..currentmodule:: qiita_db.util
This module provides different util functions.
Methods
-------
..autosummary::
:toctree: generated/
quote_data_value
scrub_data
exists_table
get_db_files_base_dir
compute_checksum
get_files_from_uploads_folders
filepath_id_to_rel_path
filepath_id_to_object_id
get_mountpoint
insert_filepaths
check_table_cols
check_required_columns
convert_from_id
convert_to_id
get_environmental_packages
get_visibilities
purge_filepaths
move_filepaths_to_upload_folder
move_upload_files_to_trash
add_message
get_pubmed_ids_from_dois
generate_analysis_list
human_merging_scheme
"""
# -----------------------------------------------------------------------------
# Copyright (c) 2014--, The Qiita Development Team.
#
# Distributed under the terms of the BSD 3-clause License.
#
# The full license is in the file LICENSE, distributed with this software.
# -----------------------------------------------------------------------------
from random import SystemRandom
from string import ascii_letters, digits, punctuation
from binascii import crc32
from bcrypt import hashpw, gensalt
from functools import partial
from os.path import join, basename, isdir, exists, getsize
from os import walk, remove, listdir, rename, stat, makedirs
from glob import glob
from shutil import move, rmtree, copy as shutil_copy
from openpyxl import load_workbook
from tempfile import mkstemp
from csv import writer as csv_writer
from datetime import datetime, timedelta
from time import time as now
from itertools import chain
from contextlib import contextmanager
import h5py
from humanize import naturalsize
import hashlib
from smtplib import SMTP, SMTP_SSL, SMTPException
from errno import EEXIST
from qiita_core.exceptions import IncompetentQiitaDeveloperError
from qiita_core.qiita_settings import qiita_config
from subprocess import check_output
import qiita_db as qdb
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from io import StringIO
from json import loads
from scipy.optimize import minimize
# memory constant functions defined for @resource_allocation_plot
# NOTE: these are named ``def``s rather than lambdas assigned to names
# (PEP 8 E731) so tracebacks and reprs carry the model name;
# get_model_name() compares these objects by identity, which is unaffected.
def mem_model1(x, k, a, b):
    """Memory model: k * log(x) + x * a + b."""
    return k * np.log(x) + x * a + b


def mem_model2(x, k, a, b):
    """Memory model: k * log(x) + b * log(x)^2 + a."""
    return k * np.log(x) + b * np.log(x)**2 + a


def mem_model3(x, k, a, b):
    """Memory model: k * log(x) + b * log(x)^2 + a * log(x)^3."""
    return k * np.log(x) + b * np.log(x)**2 + a * np.log(x)**3


def mem_model4(x, k, a, b):
    """Memory model: k * log(x) + b * log(x)^2 + a * log(x)^2.5."""
    return k * np.log(x) + b * np.log(x)**2 + a * np.log(x)**2.5


MODELS_MEM = [mem_model1, mem_model2, mem_model3, mem_model4]

# time constant functions defined for @resource_allocation_plot
def time_model1(x, k, a, b):
    """Time model: a + b + log(x) * k."""
    return a + b + np.log(x) * k


def time_model2(x, k, a, b):
    """Time model: a + b * x + log(x) * k."""
    return a + b * x + np.log(x) * k


def time_model3(x, k, a, b):
    """Time model: a + b * log(x)^2 + log(x) * k."""
    return a + b * np.log(x)**2 + np.log(x) * k


def time_model4(x, k, a, b):
    """Time model: a * log(x)^3 + b * log(x)^2 + log(x) * k."""
    return a * np.log(x)**3 + b * np.log(x)**2 + np.log(x) * k


MODELS_TIME = [time_model1, time_model2, time_model3, time_model4]
def get_model_name(model):
    """Return the human-readable formula of a known mem/time model.

    Parameters
    ----------
    model : callable
        One of the functions in MODELS_MEM or MODELS_TIME.

    Returns
    -------
    str
        The formula string, or "Unknown model" if not recognized.
    """
    formulas = {
        mem_model1: "k * log(x) + x * a + b",
        mem_model2: "k * log(x) + b * log(x)^2 + a",
        mem_model3: "k * log(x) + b * log(x)^2 + a * log(x)^3",
        mem_model4: "k * log(x) + b * log(x)^2 + a * log(x)^2.5",
        time_model1: "a + b + log(x) * k",
        time_model2: "a + b * x + log(x) * k",
        time_model3: "a + b * log(x)^2 + log(x) * k",
        time_model4: "a * log(x)^3 + b * log(x)^2 + log(x) * k",
    }
    return formulas.get(model, "Unknown model")
def scrub_data(s):
    r"""Scrubs data fields of characters not allowed by PostgreSQL

    disallowed characters:
        ' ;

    Parameters
    ----------
    s : str
        The string to clean up

    Returns
    -------
    str
        The scrubbed string
    """
    # drop single quotes first, then semicolons
    return s.replace("'", "").replace(";", "")
def convert_type(obj):
    """Converts a passed item to int, float, or str in that order

    Parameters
    ----------
    obj : object
        object to evaluate

    Returns
    -------
    int, float, or str
        Re-typed information from obj

    Raises
    ------
    IncompetentQiitaDeveloperError
        If the object can't be converted to int, float, or string

    Notes
    -----
    datetime instances are rendered with str(). Otherwise the function first
    tries to convert to an int. If that fails, it tries to convert to a
    float. If that fails it returns the original string.
    """
    if isinstance(obj, datetime):
        return str(obj)
    for fn in (int, float, str):
        try:
            # TypeError is caught too: e.g. int(None) raises TypeError, which
            # previously escaped and bypassed the documented fallback/error.
            return fn(obj)
        except (ValueError, TypeError):
            continue
    raise IncompetentQiitaDeveloperError("Can't convert item of type %s!" %
                                         str(type(obj)))
def get_artifact_types(key_by_id=False):
    """Gets the list of possible artifact types

    Parameters
    ----------
    key_by_id : bool, optional
        Determines the format of the returned dict. Defaults to False.

    Returns
    -------
    dict
        If key_by_id is True, dict is of the form
        {artifact_type_id: artifact_type}
        If key_by_id is False, dict is of the form
        {artifact_type: artifact_type_id}
    """
    with qdb.sql_connection.TRN:
        if key_by_id:
            cols = 'artifact_type_id, artifact_type'
        else:
            cols = 'artifact_type, artifact_type_id'
        sql = "SELECT {} FROM qiita.artifact_type".format(cols)
        qdb.sql_connection.TRN.add(sql)
        return dict(qdb.sql_connection.TRN.execute_fetchindex())
def get_filepath_types(key='filepath_type'):
    """Gets the list of possible filepath types from the filetype table

    Parameters
    ----------
    key : {'filepath_type', 'filepath_type_id'}, optional
        Defaults to "filepath_type". Determines the format of the returned
        dict.

    Returns
    -------
    dict
        - If `key` is "filepath_type", dict is of the form
          {filepath_type: filepath_type_id}
        - If `key` is "filepath_type_id", dict is of the form
          {filepath_type_id: filepath_type}
    """
    # map the requested key to the column order of the SELECT
    col_options = {
        'filepath_type': 'filepath_type, filepath_type_id',
        'filepath_type_id': 'filepath_type_id, filepath_type'}
    with qdb.sql_connection.TRN:
        if key not in col_options:
            raise qdb.exceptions.QiitaDBColumnError(
                "Unknown key. Pass either 'filepath_type' or "
                "'filepath_type_id'.")
        sql = 'SELECT {} FROM qiita.filepath_type'.format(col_options[key])
        qdb.sql_connection.TRN.add(sql)
        return dict(qdb.sql_connection.TRN.execute_fetchindex())
def get_data_types(key='data_type'):
    """Gets the list of possible data types from the data_type table

    Parameters
    ----------
    key : {'data_type', 'data_type_id'}, optional
        Defaults to "data_type". Determines the format of the returned dict.

    Returns
    -------
    dict
        - If `key` is "data_type", dict is of the form
          {data_type: data_type_id}
        - If `key` is "data_type_id", dict is of the form
          {data_type_id: data_type}
    """
    # map the requested key to the column order of the SELECT
    col_options = {
        'data_type': 'data_type, data_type_id',
        'data_type_id': 'data_type_id, data_type'}
    with qdb.sql_connection.TRN:
        if key not in col_options:
            raise qdb.exceptions.QiitaDBColumnError(
                "Unknown key. Pass either 'data_type_id' or 'data_type'.")
        sql = 'SELECT {} FROM qiita.data_type'.format(col_options[key])
        qdb.sql_connection.TRN.add(sql)
        return dict(qdb.sql_connection.TRN.execute_fetchindex())
def create_rand_string(length, punct=True):
    """Returns a string of random ascii characters

    Parameters
    ----------
    length: int
        Length of string to return
    punct: bool, optional
        Include punctuation as well as letters and numbers. Default True.
    """
    # build the candidate pool, then draw from a CSPRNG
    pool = ascii_letters + digits + (punctuation if punct else '')
    rng = SystemRandom()
    return ''.join(rng.choice(pool) for _ in range(length))
def hash_password(password, hashedpw=None):
    """Hashes password

    Parameters
    ----------
    password: str
        Plaintext password
    hashedpw: str, optional
        Previously hashed password for bcrypt to pull salt from. If not
        given, salt generated before hash

    Returns
    -------
    str
        Hashed password

    Notes
    -----
    Relies on bcrypt library to hash passwords, which stores the salt as
    part of the hashed password. Don't need to actually store the salt
    because of this.
    """
    # encode/decode dance is a python 3 workaround for bcrypt
    salt = gensalt() if hashedpw is None else hashedpw.encode('utf-8')
    hashed = hashpw(password.encode('utf-8'), salt)
    if isinstance(hashed, bytes):
        hashed = hashed.decode("utf-8")
    return hashed
def check_required_columns(keys, table):
    """Makes sure all required columns in database table are in keys

    Parameters
    ----------
    keys: iterable
        Holds the keys in the dictionary
    table: str
        name of the table to check required columns

    Raises
    ------
    QiitaDBColumnError
        If keys exist that are not in the table
    RuntimeError
        Unable to get columns from database
    """
    with qdb.sql_connection.TRN:
        sql = """SELECT is_nullable, column_name, column_default
                 FROM information_schema.columns WHERE table_name = %s"""
        qdb.sql_connection.TRN.add(sql, [table])
        cols = qdb.sql_connection.TRN.execute_fetchindex()
        # A user with certain permissions can run the query without error
        # yet still be unable to retrieve the column names
        if not cols:
            raise RuntimeError("Unable to fetch column names for table %s"
                               % table)
        # required = NOT NULL columns without a default value
        required = {name for nullable, name, default in cols
                    if nullable == 'NO' and default is None}
        missing = required.difference(keys)
        if missing:
            raise qdb.exceptions.QiitaDBColumnError(
                "Required keys missing: %s" % missing)
def check_table_cols(keys, table):
    """Makes sure all keys correspond to column headers in a table

    Parameters
    ----------
    keys: iterable
        Holds the keys in the dictionary
    table: str
        name of the table to check column names

    Raises
    ------
    QiitaDBColumnError
        If a key is found that is not in table columns
    RuntimeError
        Unable to get columns from database
    """
    with qdb.sql_connection.TRN:
        sql = """SELECT column_name FROM information_schema.columns
                 WHERE table_name = %s"""
        qdb.sql_connection.TRN.add(sql, [table])
        cols = qdb.sql_connection.TRN.execute_fetchflatten()
        # A user with certain permissions can run the query without error
        # yet still be unable to retrieve the column names
        if not cols:
            raise RuntimeError("Unable to fetch column names for table %s"
                               % table)
        unknown = set(keys).difference(cols)
        if unknown:
            raise qdb.exceptions.QiitaDBColumnError(
                "Non-database keys found: %s" % unknown)
def get_table_cols(table):
    """Returns the column headers of table

    Parameters
    ----------
    table : str
        The table name

    Returns
    -------
    list of str
        The column headers of `table`
    """
    sql = """SELECT column_name FROM information_schema.columns
             WHERE table_name=%s AND table_schema='qiita'"""
    with qdb.sql_connection.TRN:
        qdb.sql_connection.TRN.add(sql, [table])
        return qdb.sql_connection.TRN.execute_fetchflatten()
def exists_table(table):
    r"""Checks if `table` exists on the database

    Parameters
    ----------
    table : str
        The table name to check if exists

    Returns
    -------
    bool
        Whether `table` exists on the database or not
    """
    sql = """SELECT exists(
                SELECT table_name FROM information_schema.tables
                WHERE table_name=%s)"""
    with qdb.sql_connection.TRN:
        qdb.sql_connection.TRN.add(sql, [table])
        return qdb.sql_connection.TRN.execute_fetchlast()
def get_db_files_base_dir():
    r"""Returns the path to the base directory of all db files

    Returns
    -------
    str
        The path to the base directory of all db files, never ending in "/"
    """
    with qdb.sql_connection.TRN:
        qdb.sql_connection.TRN.add("SELECT base_data_dir FROM settings")
        basedir = qdb.sql_connection.TRN.execute_fetchlast()
        # strip a single trailing "/" as most tests expect this
        return basedir[:-1] if basedir.endswith("/") else basedir
def get_work_base_dir():
    r"""Returns the path to the base working directory

    Returns
    -------
    str
        The base working directory (base_work_dir setting)
    """
    sql = "SELECT base_work_dir FROM settings"
    with qdb.sql_connection.TRN:
        qdb.sql_connection.TRN.add(sql)
        return qdb.sql_connection.TRN.execute_fetchlast()
def max_preparation_samples():
    r"""Returns the max number of samples allowed in a single preparation

    Returns
    -------
    int
        The max number of samples allowed in a single preparation
    """
    sql = "SELECT max_preparation_samples FROM settings"
    with qdb.sql_connection.TRN:
        qdb.sql_connection.TRN.add(sql)
        return qdb.sql_connection.TRN.execute_fetchlast()
def max_artifacts_in_workflow():
    r"""Returns the max number of artifacts allowed in a single workflow

    Returns
    -------
    int
        The max number of artifacts allowed in a single workflow
    """
    sql = "SELECT max_artifacts_in_workflow FROM settings"
    with qdb.sql_connection.TRN:
        qdb.sql_connection.TRN.add(sql)
        return qdb.sql_connection.TRN.execute_fetchlast()
def compute_checksum(path):
    r"""Returns the CRC32 checksum of the file (or directory) at path

    Parameters
    ----------
    path : str
        The path to compute the checksum; if it is a directory, the
        checksum is accumulated over every file found under it

    Returns
    -------
    int
        The file checksum
    """
    if isdir(path):
        # walk order determines the accumulated value, so keep os.walk order
        targets = [join(root, fname)
                   for root, _, files in walk(path) for fname in files]
    else:
        targets = [path]

    chunk_size = 65536
    crcvalue = 0
    for target in targets:
        with open(target, 'rb') as fh:
            while True:
                chunk = fh.read(chunk_size)
                if not chunk:
                    break
                crcvalue = crc32(chunk, crcvalue)
    # We need the & 0xFFFFFFFF in order to get the same numeric value across
    # all python versions and platforms
    return crcvalue & 0xFFFFFFFF
def get_files_from_uploads_folders(study_id):
    """Retrieve files in upload folders

    Parameters
    ----------
    study_id : str
        The study id of which to retrieve all upload folders

    Returns
    -------
    list
        List of (folder_id, filename, human-readable size) tuples for the
        files uploaded for that study
    """
    study_id = str(study_id)
    found = []
    for pid, folder in get_mountpoint("uploads", retrieve_all=True):
        study_folder = join(folder, study_id)
        if not exists(study_folder):
            continue
        for fname in listdir(study_folder):
            fullpath = join(study_folder, fname)
            # skip hidden files and sub-directories (e.g. the trash folder)
            if fname.startswith('.') or isdir(fullpath):
                continue
            found.append(
                (pid, fname, naturalsize(getsize(fullpath), gnu=True)))
    return found
def move_upload_files_to_trash(study_id, files_to_move):
    """Move files to a trash folder within the study_id upload folder

    Parameters
    ----------
    study_id : int
        The study id
    files_to_move : list
        List of tuples (folder_id, filename)

    Raises
    ------
    QiitaDBError
        If folder_id or the study folder don't exist and if the filename to
        erase matches the trash_folder, internal variable
    """
    trash_folder = 'trash'
    folders = dict(get_mountpoint("uploads", retrieve_all=True))

    for fid, filename in files_to_move:
        if filename == trash_folder:
            raise qdb.exceptions.QiitaDBError(
                "You can not erase the trash folder: %s" % trash_folder)
        if fid not in folders:
            raise qdb.exceptions.QiitaDBError(
                "The filepath id: %d doesn't exist in the database" % fid)

        study_folder = join(folders[fid], str(study_id))
        if not exists(study_folder):
            raise qdb.exceptions.QiitaDBError(
                "The upload folder for study id: %d doesn't exist" % study_id)

        # make sure the trash folder exists before moving into it
        trashpath = join(study_folder, trash_folder)
        create_nested_path(trashpath)

        src = join(study_folder, filename)
        dst = join(study_folder, trash_folder, filename)
        if exists(src):
            rename(src, dst)
def get_mountpoint(mount_type, retrieve_all=False, retrieve_subdir=False):
    r""" Returns the most recent values from data directory for the given type

    Parameters
    ----------
    mount_type : str
        The data mount type
    retrieve_all : bool, optional
        Retrieve all the available mount points or just the active one.
        Default: False.
    retrieve_subdir : bool, optional
        Retrieve the subdirectory column. Default: False.

    Returns
    -------
    list
        List of tuple, where: [(id_mountpoint, filepath_of_mountpoint)]
    """
    sql_all = """SELECT data_directory_id, mountpoint, subdirectory
                 FROM qiita.data_directory
                 WHERE data_type=%s ORDER BY active DESC"""
    sql_active = """SELECT data_directory_id, mountpoint, subdirectory
                    FROM qiita.data_directory
                    WHERE data_type=%s AND active=true"""
    with qdb.sql_connection.TRN:
        qdb.sql_connection.TRN.add(
            sql_all if retrieve_all else sql_active, [mount_type])
        db_result = qdb.sql_connection.TRN.execute_fetchindex()
        basedir = get_db_files_base_dir()
        if retrieve_subdir:
            return [(d, join(basedir, m), s) for d, m, s in db_result]
        return [(d, join(basedir, m)) for d, m, _ in db_result]
def get_mountpoint_path_by_id(mount_id):
    r""" Returns the mountpoint path for the mountpoint with id = mount_id

    Parameters
    ----------
    mount_id : int
        The mountpoint id

    Returns
    -------
    str
        The mountpoint path
    """
    sql = """SELECT mountpoint FROM qiita.data_directory
             WHERE data_directory_id=%s"""
    with qdb.sql_connection.TRN:
        qdb.sql_connection.TRN.add(sql, [mount_id])
        mp = qdb.sql_connection.TRN.execute_fetchlast()
        return join(get_db_files_base_dir(), mp)
def insert_filepaths(filepaths, obj_id, table, move_files=True, copy=False):
    r"""Inserts `filepaths` in the database.

    Since the files live outside the database, the directory in which the files
    lives is controlled by the database, so it moves the filepaths from
    its original location to the controlled directory.

    Parameters
    ----------
    filepaths : iterable of tuples (str, int)
        The list of paths to the raw files and its filepath type identifier
    obj_id : int
        Id of the object calling the functions. Disregarded if move_files
        is False
    table : str
        Table that holds the file data
    move_files : bool, optional
        Whether or not to move the given filepaths to the db filepaths
        default: True
    copy : bool, optional
        If `move_files` is true, whether to actually move the files or just
        copy them

    Returns
    -------
    list of int
        List of the filepath_id in the database for each added filepath
    """
    with qdb.sql_connection.TRN:
        new_filepaths = filepaths
        # first active mountpoint for `table`; `subdir` decides the on-disk
        # layout below
        dd_id, mp, subdir = get_mountpoint(table, retrieve_subdir=True)[0]
        base_fp = join(get_db_files_base_dir(), mp)

        if move_files or copy:
            db_path = partial(join, base_fp)
            if subdir:
                # Generate the new filepaths, format:
                # mountpoint/obj_id/original_name
                dirname = db_path(str(obj_id))
                create_nested_path(dirname)
                new_filepaths = [
                    (join(dirname, basename(path)), id_)
                    for path, id_ in filepaths]
            else:
                # Generate the new fileapths. format:
                # mountpoint/DataId_OriginalName
                new_filepaths = [
                    (db_path("%s_%s" % (obj_id, basename(path))), id_)
                    for path, id_ in filepaths]
            # Move the original files to the controlled DB directory
            transfer_function = shutil_copy if copy else move
            for old_fp, new_fp in zip(filepaths, new_filepaths):
                transfer_function(old_fp[0], new_fp[0])
                # In case the transaction executes a rollback, we need to
                # make sure the files have not been moved
                qdb.sql_connection.TRN.add_post_rollback_func(
                    move, new_fp[0], old_fp[0])

        def str_to_id(x):
            # filepath types may be given as the id itself or as the name
            return (x if isinstance(x, int)
                    else convert_to_id(x, "filepath_type"))
        # 1 is the checksum algorithm, which we only have one implemented
        # NOTE: checksum/size are computed on the (possibly moved) new paths
        values = [[basename(path), str_to_id(id_), compute_checksum(path),
                   getsize(path), 1, dd_id] for path, id_ in new_filepaths]
        # Insert all the filepaths at once and get the filepath_id back
        sql = """INSERT INTO qiita.filepath
                    (filepath, filepath_type_id, checksum, fp_size,
                     checksum_algorithm_id, data_directory_id)
                 VALUES (%s, %s, %s, %s, %s, %s)
                 RETURNING filepath_id"""
        idx = qdb.sql_connection.TRN.index
        qdb.sql_connection.TRN.add(sql, values, many=True)
        # Since we added the query with many=True, we've added len(values)
        # queries to the transaction, so the ids are in the last idx queries
        return list(chain.from_iterable(
            chain.from_iterable(qdb.sql_connection.TRN.execute()[idx:])))
def _path_builder(db_dir, filepath, mountpoint, subdirectory, obj_id):
"""Builds the path of a DB stored file
Parameters
----------
db_dir : str
The DB base dir
filepath : str
The path stored in the DB
mountpoint : str
The mountpoint of the given file
subdirectory : bool
Whether the file is stored in a subdirectory in the mountpoint or not
obj_id : int
The id of the object to which the file is attached
Returns
-------
str
The full path of the given file
"""
if subdirectory:
return join(db_dir, mountpoint, str(obj_id), filepath)
else:
return join(db_dir, mountpoint, filepath)
def retrieve_filepaths(obj_fp_table, obj_id_column, obj_id, sort=None,
                       fp_type=None):
    """Retrieves the filepaths for the given object id

    Parameters
    ----------
    obj_fp_table : str
        The name of the table that links the object and the filepath
    obj_id_column : str
        The name of the column that represents the object id
    obj_id : int
        The object id
    sort : {'ascending', 'descending'}, optional
        The direction in which the results are sorted, using the filepath id
        as sorting key. Default: None, no sorting is applied
    fp_type: str, optional
        Retrieve only the filepaths of the matching filepath type

    Returns
    -------
    list of dict {fp_id, fp, ft_type, checksum, fp_size}
        The list of dict with the properties of the filepaths

    Raises
    ------
    QiitaDBError
        If `sort` is neither 'ascending', 'descending' nor None
    """
    sql_sort = ""
    if sort == 'ascending':
        sql_sort = " ORDER BY filepath_id"
    elif sort == 'descending':
        sql_sort = " ORDER BY filepath_id DESC"
    elif sort is not None:
        raise qdb.exceptions.QiitaDBError(
            "Unknown sorting direction: %s. Please choose from 'ascending' or "
            "'descending'" % sort)

    sql_args = [obj_id]

    sql_type = ""
    if fp_type:
        sql_type = " AND filepath_type=%s"
        sql_args.append(fp_type)

    with qdb.sql_connection.TRN:
        # NOTE(review): obj_fp_table and obj_id_column are interpolated
        # directly into the SQL, so callers must pass trusted identifiers
        # only; the values (obj_id, fp_type) are parameterized.
        sql = """SELECT filepath_id, filepath, filepath_type, mountpoint,
                        subdirectory, checksum, fp_size
                 FROM qiita.filepath
                    JOIN qiita.filepath_type USING (filepath_type_id)
                    JOIN qiita.data_directory USING (data_directory_id)
                    JOIN qiita.{0} USING (filepath_id)
                 WHERE {1} = %s{2}{3}""".format(obj_fp_table, obj_id_column,
                                                sql_type, sql_sort)
        qdb.sql_connection.TRN.add(sql, sql_args)
        results = qdb.sql_connection.TRN.execute_fetchindex()
        db_dir = get_db_files_base_dir()

        # expand each row into a dict, resolving the on-disk path with
        # _path_builder (subdirectory layouts insert obj_id in the path)
        return [{'fp_id': fpid, 'fp': _path_builder(db_dir, fp, m, s, obj_id),
                 'fp_type': fp_type_, 'checksum': c, 'fp_size': fpsize}
                for fpid, fp, fp_type_, m, s, c, fpsize in results]
def _rm_files(TRN, fp):
# Remove the data
if exists(fp):
if isdir(fp):
func = rmtree
else:
func = remove
TRN.add_post_commit_func(func, fp)
def purge_filepaths(delete_files=True):
    r"""Goes over the filepath table and removes all the filepaths that are not
    used in any place

    Parameters
    ----------
    delete_files : bool
        if True it will actually delete the files, if False print
    """
    with qdb.sql_connection.TRN:
        # accumulates [filepath_id, fullpath] pairs; either element may be
        # None when only the DB row or only the on-disk file must go
        files_to_remove = []
        # qiita can basically download 5 things: references, info files,
        # artifacts, analyses & working_dir.
        # 1. references are not longer used so we can skip

        # 2. info files: here we could remove all old info files (the backup we
        #    keep when a user uploads a new file) and all info files from
        #    studies that no longer exist. We want to keep the old templates
        #    so we can recover them (this has happened before) but let's remove
        #    those from deleted studies. Note that we need to check for sample,
        #    prep and qiime info files
        st_id = qdb.util.convert_to_id('sample_template', "filepath_type")
        pt_id = qdb.util.convert_to_id('prep_template', "filepath_type")
        qt_id = qdb.util.convert_to_id('qiime_map', "filepath_type")
        sql = """SELECT filepath_id, filepath FROM qiita.filepath
                    WHERE filepath_type_id IN %s AND filepath ~ '^[0-9]' AND
                        data_directory_id = %s AND filepath_id NOT IN (
                            SELECT filepath_id
                            FROM qiita.prep_template_filepath
                            UNION
                            SELECT filepath_id
                            FROM qiita.sample_template_filepath)
              """
        for mp_id, mp in get_mountpoint('templates'):
            qdb.sql_connection.TRN.add(
                sql, [tuple([st_id, pt_id, qt_id]), mp_id])
        # caches so each study is checked against the DB at most once
        studies_exits = []
        studies_erased = []
        for fid, fp in qdb.sql_connection.TRN.execute_fetchindex():
            # making sure the studies do _not_ exist, remember info files
            # are prepended by the study id
            study_id = int(fp.split('_')[0])
            if study_id in studies_exits:
                continue
            elif study_id in studies_erased:
                fpath = qdb.util.get_filepath_information(
                    fid)['fullpath']
                files_to_remove.append([fid, fpath])
            else:
                try:
                    qdb.study.Study(study_id)
                except qdb.exceptions.QiitaDBUnknownIDError:
                    fpath = qdb.util.get_filepath_information(
                        fid)['fullpath']
                    files_to_remove.append([fid, fpath])
                    studies_erased.append(study_id)
                else:
                    studies_exits.append(study_id)

        # 3. artifacts: [A] the difficulty of deleting artifacts is that (1)
        #    they live in different mounts, (2) as inidividual folders [the
        #    artifact id], (3) and the artifact id within the database has
        #    been lost. Thus, the easiest is to loop over the different data
        #    directories (mounts), get the folder names (artifact ids), and
        #    check if they exist; if they don't let's delete them. [B] As an
        #    additional and final step, we need to purge these filepaths from
        #    the DB.
        # [A]
        main_sql = """SELECT data_directory_id FROM qiita.artifact_type at
                        LEFT JOIN qiita.data_directory dd ON (
                            dd.data_type = at.artifact_type)
                        WHERE subdirectory = true"""
        qdb.sql_connection.TRN.add(main_sql)
        for mp_id in qdb.sql_connection.TRN.execute_fetchflatten():
            mount = get_mountpoint_path_by_id(mp_id)
            for fpath in listdir(mount):
                full_fpath = join(mount, fpath)
                if isdir(full_fpath):
                    try:
                        # folder names are artifact ids; an unknown id means
                        # the artifact was deleted and the folder is orphaned
                        qdb.artifact.Artifact(int(fpath))
                    except qdb.exceptions.QiitaDBUnknownIDError:
                        files_to_remove.append([None, full_fpath])
                else:
                    continue
        # [B]
        sql = """SELECT filepath_id FROM qiita.filepath
                    WHERE filepath_id not in (
                        SELECT filepath_id FROM qiita.artifact_filepath) AND
                    data_directory_id in (
                        SELECT data_directory_id FROM qiita.artifact_type at
                            LEFT JOIN qiita.data_directory dd ON (
                                dd.data_type = at.artifact_type)
                        WHERE subdirectory = true)
              """
        qdb.sql_connection.TRN.add(sql)
        for fid in qdb.sql_connection.TRN.execute_fetchflatten():
            fpath = qdb.util.get_filepath_information(fid)['fullpath']
            aid = fpath.split('/')[-2]
            # making sure the artifact doesn't exist any more
            if aid == 'None':
                files_to_remove.append([fid, None])

        # 4. analysis: we need to select all the filepaths that are not in
        #    the analysis_filepath, this will return both all filepaths not
        #    from analyses and those that are not being used, thus, we need
        #    to also not select those files that are not part of the artifacts
        #    by ignoring those files paths not stored in a data_directory from
        #    an artifact:
        sql = """SELECT filepath_id FROM qiita.filepath
                    WHERE filepath_id not in (
                        SELECT filepath_id FROM qiita.analysis_filepath) AND
                    data_directory_id in (
                        SELECT data_directory_id FROM qiita.data_directory
                        WHERE data_type = 'analysis')
              """
        qdb.sql_connection.TRN.add(sql)
        for fid in qdb.sql_connection.TRN.execute_fetchflatten():
            fdata = qdb.util.get_filepath_information(fid)
            # analysis files are prefixed with the analysis id
            analysis_id = int(fdata['filepath'].split('_')[0])
            # making sure the Analysis doesn't exist
            if not qdb.analysis.Analysis.exists(analysis_id):
                fpath = fdata['fullpath']
                files_to_remove.append([fid, fpath])

        # 5. working directory: this is done internally in the Qiita system
        #    via a cron job

        # Deleting the files!
        sql = "DELETE FROM qiita.filepath WHERE filepath_id = %s"
        for fid, fpath in files_to_remove:
            if delete_files:
                if fid is not None:
                    qdb.sql_connection.TRN.add(sql, [fid])
                if fpath is not None:
                    # actual removal is deferred until the commit succeeds
                    _rm_files(qdb.sql_connection.TRN, fpath)
            else:
                print('%s: %s' % (fid, fpath))

        if delete_files:
            # there is a chance that we will never enter the above
            # "if fid is not None" statement so we will add an extra SQL
            # command just to make sure that something gets executed
            qdb.sql_connection.TRN.add("SELECT 42")
            qdb.sql_connection.TRN.execute()
def quick_mounts_purge():
r"""This is a quick mount purge as it only slightly relies on the database
Notes
-----
Currently we delete anything older than 30 days that is not linked
to the database. This number is intentionally hardcoded in the code.
At the time of this writing this number seem high but keeping it
this way to be safe. In the future, if needed, it can be changed.
"""
with qdb.sql_connection.TRN:
main_sql = """SELECT data_directory_id FROM qiita.artifact_type at
LEFT JOIN qiita.data_directory dd ON (
dd.data_type = at.artifact_type)
WHERE subdirectory = true"""
qdb.sql_connection.TRN.add(main_sql)
mp_ids = qdb.sql_connection.TRN.execute_fetchflatten()
mounts = [qdb.util.get_mountpoint_path_by_id(x) for x in mp_ids]
folders = [join(x, f) for x in mounts for f in listdir(x)
if f.isnumeric()]
# getting all unlinked folders
to_delete = []
for i, f in enumerate(folders):
vals = f.split('/')
aid = int(vals[-1])
artifact_type = vals[-2]
if artifact_type == 'FeatureData[Taxonomy]':
continue
try:
a = qdb.artifact.Artifact(aid)
except qdb.exceptions.QiitaDBUnknownIDError:
to_delete.append(f)
continue
if not a.artifact_type.startswith(artifact_type):
raise ValueError('Review artifact type: '