-
Notifications
You must be signed in to change notification settings - Fork 80
Automatic jobs & new stats #2057
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
11f3657
148cef5
9e9eee7
3ccce3d
ace1d7a
b6cd23f
640de3a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,6 +25,17 @@ | |
# ----------------------------------------------------------------------------- | ||
from __future__ import division | ||
|
||
from moi import r_client | ||
from os import stat | ||
from time import strftime, localtime | ||
import matplotlib.pyplot as plt | ||
import matplotlib as mpl | ||
from base64 import b64encode | ||
from urllib import quote | ||
from StringIO import StringIO | ||
from future.utils import viewitems | ||
from datetime import datetime | ||
|
||
from qiita_core.qiita_settings import qiita_config | ||
import qiita_db as qdb | ||
|
||
|
@@ -122,6 +133,147 @@ def get_accessible_filepath_ids(user): | |
return filepath_ids | ||
|
||
|
||
def update_redis_stats():
    """Generate the system stats and save them in redis

    Every value is stored under the redis key ``<portal>:stats:<name>``,
    where ``<name>`` is one of: number_studies, number_of_samples,
    num_users, lat_longs, num_studies_ebi, num_samples_ebi,
    number_samples_ebi_prep, img (base64 PNG of the cumulative storage used
    per data type) and time (when the stats were generated).

    Returns
    -------
    list of str
        artifact filepaths that are not present in the file system
    """
    def _count_accessions(accessions):
        # an accession counts only if it is neither None nor empty
        return len([1 for _, v in viewitems(accessions)
                    if v is not None and v != ''])

    STUDY = qdb.study.Study
    # Each label must map to the studies of that same status.
    # NOTE(review): 'sanbox' looks like a typo of 'sandbox'; it is kept
    # as is because it ends up as a redis hash field that readers of these
    # stats may rely on - confirm before renaming.
    studies = {'public': STUDY.get_by_status('public'),
               'private': STUDY.get_by_status('private'),
               'sanbox': STUDY.get_by_status('sandbox')}
    number_studies = {k: len(v) for k, v in viewitems(studies)}

    number_of_samples = {}
    ebi_samples_prep = {}
    num_samples_ebi = 0
    for status, sts in viewitems(studies):
        number_of_samples[status] = 0
        for study in sts:
            st = study.sample_template
            if st is not None:
                number_of_samples[status] += len(list(st.keys()))
                # samples that have an EBI sample accession
                num_samples_ebi += _count_accessions(
                    st.ebi_sample_accessions)

            # sample runs submitted to EBI: a sample can be counted more
            # than once here if it was sequenced more than one time
            ebi_samples_prep[study.id] = sum(
                _count_accessions(pt.ebi_experiment_accessions)
                for pt in study.prep_templates())

    num_users = qdb.util.get_count('qiita.qiita_user')

    lat_longs = get_lat_longs()

    # only studies with at least one submitted run are "in EBI"
    num_studies_ebi = len([sid for sid, count in viewitems(ebi_samples_prep)
                           if count >= 1])
    number_samples_ebi_prep = sum([v for _, v in viewitems(ebi_samples_prep)])

    # generating file size stats
    stats = []
    missing_files = []
    for _, sts in viewitems(studies):
        for study in sts:
            for a in study.artifacts():
                for _, fp, dt in a.filepaths:
                    try:
                        fp_stat = stat(fp)
                    except OSError:
                        # the DB references a file that is not on disk
                        missing_files.append(fp)
                    else:
                        stats.append((
                            dt, fp_stat.st_size,
                            strftime('%Y-%m', localtime(fp_stat.st_ctime))))

    # summary[data type][year-month] -> bytes added during that month
    summary = {}
    for ft, size, ym in stats:
        by_month = summary.setdefault(ft, {})
        by_month[ym] = by_month.get(ym, 0) + size
    all_dates = sorted(set(ym for _, _, ym in stats))

    # sorting summaries: cumulative size over time for the plotted types
    rm_from_data = ['html_summary', 'tgz', 'directory', 'raw_fasta', 'log',
                    'biom', 'raw_sff', 'raw_qual']
    ordered_summary = {}
    for dt in summary:
        if dt in rm_from_data:
            continue
        new_list = []
        current_value = 0
        for ad in all_dates:
            current_value += summary[dt].get(ad, 0)
            new_list.append(current_value)
        ordered_summary[dt] = new_list

    # plot from the smallest to the largest final size
    plot_order = sorted([(k, ordered_summary[k][-1]) for k in ordered_summary],
                        key=lambda x: x[1])

    # helper function to generate y axis, modified from:
    # http://stackoverflow.com/a/1094933
    # `position` is unused but is part of the FuncFormatter callback signature
    def sizeof_fmt(value, position):
        number = None
        for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
            if abs(value) < 1024.0:
                number = "%3.1f%s" % (value, unit)
                break
            value /= 1024.0
        if number is None:
            number = "%.1f%s" % (value, 'Yi')
        return number

    all_dates_axis = range(len(all_dates))
    fig = plt.figure(figsize=(20, 10))
    try:
        for k, _ in plot_order:
            plt.plot(all_dates_axis, ordered_summary[k], linewidth=2, label=k)

        plt.xticks(all_dates_axis, all_dates)
        plt.legend()
        plt.grid()
        ax = plt.gca()
        # configure the axes that are actually drawn; doing this before
        # creating the figure would only affect an implicit, unused figure
        ax.locator_params(axis='y', nbins=10)
        ax.yaxis.set_major_formatter(mpl.ticker.FuncFormatter(sizeof_fmt))
        plt.xlabel('Date')
        plt.ylabel('Storage space per data type')

        # PNG rather than SVG: per the review discussion, making an SVG fit
        # the page would require many more changes
        plot = StringIO()
        plt.savefig(plot, format='png')
    finally:
        # this runs as a recurring job, do not accumulate open figures
        plt.close(fig)
    img = 'data:image/png;base64,' + quote(b64encode(plot.getvalue()))

    timestamp = datetime.now().strftime('%m-%d-%y %H:%M:%S')

    portal = qiita_config.portal
    vals = [
        ('number_studies', number_studies, r_client.hmset),
        ('number_of_samples', number_of_samples, r_client.hmset),
        ('num_users', num_users, r_client.set),
        ('lat_longs', lat_longs, r_client.set),
        ('num_studies_ebi', num_studies_ebi, r_client.set),
        ('num_samples_ebi', num_samples_ebi, r_client.set),
        ('number_samples_ebi_prep', number_samples_ebi_prep, r_client.set),
        ('img', img, r_client.set),
        ('time', timestamp, r_client.set)]
    for k, v, f in vals:
        redis_key = '%s:stats:%s' % (portal, k)
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        f(redis_key, v)

    return missing_files
|
||
|
||
def get_lat_longs(): | ||
"""Retrieve the latitude and longitude of all the samples in the DB | ||
|
||
|
@@ -146,7 +298,9 @@ def get_lat_longs(): | |
sql = [('SELECT CAST(latitude AS FLOAT), ' | ||
' CAST(longitude AS FLOAT) ' | ||
'FROM qiita.%s ' | ||
'WHERE isnumeric(latitude) AND isnumeric(latitude)' % s) | ||
'WHERE isnumeric(latitude) AND isnumeric(longitude) ' | ||
"AND latitude <> 'NaN' " | ||
"AND longitude <> 'NaN' " % s) | ||
for s in qdb.sql_connection.TRN.execute_fetchflatten()] | ||
sql = ' UNION '.join(sql) | ||
qdb.sql_connection.TRN.add(sql) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -714,9 +714,24 @@ def path_builder(db_dir, filepath, mountpoint, subdirectory, obj_id): | |
for fpid, fp, fp_type_, m, s in results] | ||
|
||
|
||
def purge_filepaths(): | ||
def _rm_files(TRN, fp):
    """Register the removal of `fp` as a post-commit action of `TRN`

    Nothing is removed by this function itself: when `fp` exists, the
    matching removal function is handed to ``TRN.add_post_commit_func``.
    Paths that do not exist are ignored.

    Parameters
    ----------
    TRN : object
        The transaction in which the removal is registered; it only needs
        to provide ``add_post_commit_func(func, *args)``
    fp : str
        The path of the file or directory to remove
    """
    if not exists(fp):
        return

    # directories need a recursive removal, plain files a simple one
    remover = rmtree if isdir(fp) else remove
    TRN.add_post_commit_func(remover, fp)
|
||
|
||
def purge_filepaths(delete_files=True): | ||
r"""Goes over the filepath table and remove all the filepaths that are not | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you update the docstring with the new parameter? |
||
used in any place | ||
|
||
Parameters | ||
---------- | ||
delete_files : bool | ||
if True it will actually delete the files, if False print | ||
""" | ||
with qdb.sql_connection.TRN: | ||
# Get all the (table, column) pairs that reference to the filepath | ||
|
@@ -739,30 +754,58 @@ def purge_filepaths(): | |
union_str = " UNION ".join( | ||
["SELECT %s FROM qiita.%s WHERE %s IS NOT NULL" % (col, table, col) | ||
for table, col in qdb.sql_connection.TRN.execute_fetchindex()]) | ||
# Get all the filepaths from the filepath table that are not | ||
# referenced from any place in the database | ||
sql = """SELECT filepath_id, filepath, filepath_type, data_directory_id | ||
FROM qiita.filepath FP JOIN qiita.filepath_type FPT | ||
ON FP.filepath_type_id = FPT.filepath_type_id | ||
WHERE filepath_id NOT IN (%s)""" % union_str | ||
qdb.sql_connection.TRN.add(sql) | ||
if union_str: | ||
# Get all the filepaths from the filepath table that are not | ||
# referenced from any place in the database | ||
sql = """SELECT filepath_id, filepath, filepath_type, data_directory_id | ||
FROM qiita.filepath FP JOIN qiita.filepath_type FPT | ||
ON FP.filepath_type_id = FPT.filepath_type_id | ||
WHERE filepath_id NOT IN (%s)""" % union_str | ||
qdb.sql_connection.TRN.add(sql) | ||
|
||
# We can now go over and remove all the filepaths | ||
sql = "DELETE FROM qiita.filepath WHERE filepath_id=%s" | ||
db_results = qdb.sql_connection.TRN.execute_fetchindex() | ||
for fp_id, fp, fp_type, dd_id in db_results: | ||
qdb.sql_connection.TRN.add(sql, [fp_id]) | ||
if delete_files: | ||
qdb.sql_connection.TRN.add(sql, [fp_id]) | ||
fp = join(get_mountpoint_path_by_id(dd_id), fp) | ||
_rm_files(qdb.sql_connection.TRN, fp) | ||
else: | ||
print fp, fp_type | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this print statement needed? Doesn't seem to be described in the docs. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just added it. |
||
|
||
# Remove the data | ||
fp = join(get_mountpoint_path_by_id(dd_id), fp) | ||
if exists(fp): | ||
if fp_type is 'directory': | ||
func = rmtree | ||
else: | ||
func = remove | ||
qdb.sql_connection.TRN.add_post_commit_func(func, fp) | ||
if delete_files: | ||
qdb.sql_connection.TRN.execute() | ||
|
||
qdb.sql_connection.TRN.execute() | ||
|
||
def empty_trash_upload_folder(delete_files=True):
    r"""Empty the trash folder found inside each of the upload folders

    Walks every mountpoint whose data type is 'uploads' and handles the
    files located directly in any directory whose path ends in '/trash'.

    Parameters
    ----------
    delete_files : bool
        if True the files are handed to `_rm_files` for removal, if False
        nothing is removed and the list of file names of each trash
        directory is printed instead
    """
    gfp = partial(join, get_db_files_base_dir())
    with qdb.sql_connection.TRN:
        sql = """SELECT mountpoint
                 FROM qiita.data_directory
                 WHERE data_type = 'uploads'"""
        qdb.sql_connection.TRN.add(sql)
        mountpoints = qdb.sql_connection.TRN.execute_fetchflatten()

        for mountpoint in mountpoints:
            for current_dir, _, filenames in walk(gfp(mountpoint)):
                if not current_dir.endswith('/trash'):
                    continue

                if not delete_files:
                    # dry run: only report what would be removed
                    print(filenames)
                    continue

                for filename in filenames:
                    _rm_files(qdb.sql_connection.TRN,
                              join(current_dir, filename))

        if delete_files:
            qdb.sql_connection.TRN.execute()
|
||
|
||
def move_filepaths_to_upload_folder(study_id, filepaths): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would suggest using "SampleTemplate.ebi_sample_accessions" to count the number of samples that are in EBI. You can double-count some of the samples here since they can have been sequenced more than one time.
If you still want to show this count, you can show "sample runs submitted to ebi". In that case you will have the number of samples, the number of samples in EBI and the number of runs in EBI.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Excellent point, will add both.