Skip to content

Automatic jobs & new stats #2057

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jan 27, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,10 @@ script:
- qiita-env start_cluster qiita-general
- qiita-env make --no-load-ontologies
- if [ ${TEST_ADD_STUDIES} == "True" ]; then test_data_studies/commands.sh ; fi
- if [ ${TEST_ADD_STUDIES} == "True" ]; then qiita-cron-job ; fi
- if [ ${TEST_ADD_STUDIES} == "False" ]; then qiita-test-install ; fi
- if [ ${TEST_ADD_STUDIES} == "False" ]; then nosetests --with-doctest --with-coverage -v --cover-package=qiita_db,qiita_pet,qiita_core,qiita_ware; fi
- flake8 qiita_* setup.py scripts/qiita scripts/qiita-env scripts/qiita-test-install
- flake8 qiita_* setup.py scripts/*
- ls -R /home/travis/miniconda3/envs/qiita/lib/python2.7/site-packages/qiita_pet/support_files/doc/
- qiita pet webserver
addons:
Expand Down
156 changes: 155 additions & 1 deletion qiita_db/meta_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,17 @@
# -----------------------------------------------------------------------------
from __future__ import division

from moi import r_client
from os import stat
from time import strftime, localtime
import matplotlib.pyplot as plt
import matplotlib as mpl
from base64 import b64encode
from urllib import quote
from StringIO import StringIO
from future.utils import viewitems
from datetime import datetime

from qiita_core.qiita_settings import qiita_config
import qiita_db as qdb

Expand Down Expand Up @@ -122,6 +133,147 @@ def get_accessible_filepath_ids(user):
return filepath_ids


def update_redis_stats():
"""Generate the system stats and save them in redis

Returns
-------
list of str
artifact filepaths that are not present in the file system
"""
STUDY = qdb.study.Study
studies = {'public': STUDY.get_by_status('private'),
'private': STUDY.get_by_status('public'),
'sanbox': STUDY.get_by_status('sandbox')}
number_studies = {k: len(v) for k, v in viewitems(studies)}

number_of_samples = {}
ebi_samples_prep = {}
num_samples_ebi = 0
for k, sts in viewitems(studies):
number_of_samples[k] = 0
for s in sts:
st = s.sample_template
if st is not None:
number_of_samples[k] += len(list(st.keys()))

ebi_samples_prep_count = 0
for pt in s.prep_templates():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would suggest using "SampleTemplate.ebi_sample_accessions" to count the number of samples that are in EBI. Counting per prep template here can double-count some samples, since a sample may have been sequenced more than once.

If you still want to show this count, you can show "sample runs submitted to ebi". In that case you will have the number of samples, the number of samples in EBI and the number of runs in EBI.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Excellent point, will add both.

ebi_samples_prep_count += len([
1 for _, v in viewitems(pt.ebi_experiment_accessions)
if v is not None and v != ''])
ebi_samples_prep[s.id] = ebi_samples_prep_count

if s.sample_template is not None:
num_samples_ebi += len([
1 for _, v in viewitems(
s.sample_template.ebi_sample_accessions)
if v is not None and v != ''])

num_users = qdb.util.get_count('qiita.qiita_user')

lat_longs = get_lat_longs()

num_studies_ebi = len(ebi_samples_prep)
number_samples_ebi_prep = sum([v for _, v in viewitems(ebi_samples_prep)])

# generating file size stats
stats = []
missing_files = []
for k, sts in viewitems(studies):
for s in sts:
for a in s.artifacts():
for _, fp, dt in a.filepaths:
try:
s = stat(fp)
stats.append((dt, s.st_size, strftime('%Y-%m',
localtime(s.st_ctime))))
except OSError:
missing_files.append(fp)

summary = {}
all_dates = []
for ft, size, ym in stats:
if ft not in summary:
summary[ft] = {}
if ym not in summary[ft]:
summary[ft][ym] = 0
all_dates.append(ym)
summary[ft][ym] += size
all_dates = sorted(set(all_dates))

# sorting summaries
rm_from_data = ['html_summary', 'tgz', 'directory', 'raw_fasta', 'log',
'biom', 'raw_sff', 'raw_qual']
ordered_summary = {}
for dt in summary:
if dt in rm_from_data:
continue
new_list = []
current_value = 0
for ad in all_dates:
if ad in summary[dt]:
current_value += summary[dt][ad]
new_list.append(current_value)
ordered_summary[dt] = new_list

plot_order = sorted([(k, ordered_summary[k][-1]) for k in ordered_summary],
key=lambda x: x[1])

# helper function to generate y axis, modified from:
# http://stackoverflow.com/a/1094933
def sizeof_fmt(value, position):
    """Format a byte count as a short human-readable tick label

    The value is divided by 1024 until its magnitude drops below 1024 and
    is then rendered with one decimal place followed by the matching unit
    prefix ('' for plain bytes, then K, M, G, T, P, E, Z and finally Y).

    Parameters
    ----------
    value : int or float
        The number to format
    position : int
        Unused; accepted because the function is handed to a matplotlib
        ``FuncFormatter``, which calls it with the tick value and position

    Returns
    -------
    str
        The scaled number followed by its unit prefix, e.g. '1.5K'
    """
    for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
        if abs(value) < 1024.0:
            return "%3.1f%s" % (value, unit)
        value /= 1024.0
    # Anything that overflows 'Z' is reported in 'Y'. The fallback used to
    # read 'Yi', which did not match the un-suffixed unit names above.
    return "%.1f%s" % (value, 'Y')

all_dates_axis = range(len(all_dates))
plt.locator_params(axis='y', nbins=10)
plt.figure(figsize=(20, 10))
for k, v in plot_order:
plt.plot(all_dates_axis, ordered_summary[k], linewidth=2, label=k)

plt.xticks(all_dates_axis, all_dates)
plt.legend()
plt.grid()
ax = plt.gca()
ax.yaxis.set_major_formatter(mpl.ticker.FuncFormatter(sizeof_fmt))
plt.xlabel('Date')
plt.ylabel('Storage space per data type')

plot = StringIO()
plt.savefig(plot, format='png')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it make more sense to use SVG, that way the browser can scale it as appropriate and it should look as sharp as possible? 📈

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That could work, but the issue is that you would then have to make a lot of changes to get the size of the image to fit in the page; IMO it's not worth it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good then.

plot.seek(0)
img = 'data:image/png;base64,' + quote(b64encode(plot.buf))

time = datetime.now().strftime('%m-%d-%y %H:%M:%S')

portal = qiita_config.portal
vals = [
('number_studies', number_studies, r_client.hmset),
('number_of_samples', number_of_samples, r_client.hmset),
('num_users', num_users, r_client.set),
('lat_longs', lat_longs, r_client.set),
('num_studies_ebi', num_studies_ebi, r_client.set),
('num_samples_ebi', num_samples_ebi, r_client.set),
('number_samples_ebi_prep', number_samples_ebi_prep, r_client.set),
('img', img, r_client.set),
('time', time, r_client.set)]
for k, v, f in vals:
redis_key = '%s:stats:%s' % (portal, k)
# important to "flush" variables to avoid errors
r_client.delete(redis_key)
f(redis_key, v)

return missing_files


def get_lat_longs():
"""Retrieve the latitude and longitude of all the samples in the DB

Expand All @@ -146,7 +298,9 @@ def get_lat_longs():
sql = [('SELECT CAST(latitude AS FLOAT), '
' CAST(longitude AS FLOAT) '
'FROM qiita.%s '
'WHERE isnumeric(latitude) AND isnumeric(latitude)' % s)
'WHERE isnumeric(latitude) AND isnumeric(longitude) '
"AND latitude <> 'NaN' "
"AND longitude <> 'NaN' " % s)
for s in qdb.sql_connection.TRN.execute_fetchflatten()]
sql = ' UNION '.join(sql)
qdb.sql_connection.TRN.add(sql)
Expand Down
38 changes: 38 additions & 0 deletions qiita_db/test/test_meta_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import pandas as pd

from moi import r_client
from qiita_core.qiita_settings import qiita_config
from qiita_core.util import qiita_test_checker

Expand Down Expand Up @@ -180,6 +181,43 @@ def test_get_lat_longs_EMP_portal(self):

self.assertItemsEqual(obs, exp)

def test_update_redis_stats(self):
    """Check the values update_redis_stats stores for the current portal"""
    qdb.meta_util.update_redis_stats()

    key_prefix = qiita_config.portal
    # hash-valued stats are read back with hgetall, scalar ones with get;
    # every expected value is spelled as a string
    read_hash = r_client.hgetall
    read_value = r_client.get
    expected = [
        (read_hash, 'number_studies',
         {'sanbox': '2', 'public': '0', 'private': '1'}),
        (read_hash, 'number_of_samples',
         {'sanbox': '1', 'public': '0', 'private': '27'}),
        (read_value, 'num_users', '4'),
        (read_value, 'lat_longs', EXP_LAT_LONG),
        (read_value, 'num_studies_ebi', '3'),
        (read_value, 'num_samples_ebi', '27'),
        (read_value, 'number_samples_ebi_prep', '54'),
        # the 'img' and 'time' stats are not tested, for simplicity
    ]
    for reader, stat_name, exp in expected:
        obs = reader('%s:stats:%s' % (key_prefix, stat_name))
        self.assertEqual(obs, exp)


# Expected value of the 'lat_longs' stat as read back from redis in
# test_update_redis_stats: the string rendering of a list of two-element
# [latitude, longitude] pairs (presumably one per sample with numeric
# coordinates in the test database -- TODO confirm).
EXP_LAT_LONG = (
'[[0.291867635913, 68.5945325743], [68.0991287718, 34.8360987059],'
' [10.6655599093, 70.784770579], [40.8623799474, 6.66444220187],'
' [13.089194595, 92.5274472082], [84.0030227585, 66.8954849864],'
' [12.7065957714, 84.9722975792], [78.3634273709, 74.423907894],'
' [82.8302905615, 86.3615778099], [53.5050692395, 31.6056761814],'
' [43.9614715197, 82.8516734159], [29.1499460692, 82.1270418227],'
' [23.1218032799, 42.838497795], [12.6245524972, 96.0693176066],'
' [38.2627021402, 3.48274264219], [74.0894932572, 65.3283470202],'
' [35.2374368957, 68.5041623253], [4.59216095574, 63.5115213108],'
' [95.2060749748, 27.3592668624], [68.51099627, 2.35063674718],'
' [85.4121476399, 15.6526750776], [60.1102854322, 74.7123248382],'
' [3.21190859967, 26.8138925876], [57.571893782, 32.5563076447],'
' [44.9725384282, 66.1920014699], [42.42, 41.41]]')

if __name__ == '__main__':
main()
16 changes: 15 additions & 1 deletion qiita_db/test/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from unittest import TestCase, main
from tempfile import mkstemp
from os import close, remove
from os import close, remove, mkdir
from os.path import join, exists, basename
from shutil import rmtree
from datetime import datetime
Expand Down Expand Up @@ -365,6 +365,20 @@ def _common_purge_filpeaths_test(self):
def test_purge_filepaths(self):
self._common_purge_filpeaths_test()

def test_empty_trash_upload_folder(self):
    """A file placed in a study's upload trash folder gets removed"""
    # plant a file in the trash folder of study 1 so there is something
    # for empty_trash_upload_folder to delete
    uploads_dir = qdb.util.get_mountpoint("uploads")[0][1]
    trash_dir = join(uploads_dir, '1', 'trash')
    if not exists(trash_dir):
        mkdir(trash_dir)
    doomed_fp = join(trash_dir, 'my_file_to_delete.txt')
    with open(doomed_fp, 'w'):
        pass
    self.assertTrue(exists(doomed_fp))

    qdb.util.empty_trash_upload_folder()

    self.assertFalse(exists(doomed_fp))

def test_purge_filepaths_null_cols(self):
# For more details about the source of the issue that motivates this
# test: http://www.depesz.com/2008/08/13/nulls-vs-not-in/
Expand Down
79 changes: 61 additions & 18 deletions qiita_db/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -714,9 +714,24 @@ def path_builder(db_dir, filepath, mountpoint, subdirectory, obj_id):
for fpid, fp, fp_type_, m, s in results]


def purge_filepaths():
def _rm_files(TRN, fp):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think you need to pass TRN as a parameter; given that it is a global variable in the system, you can just use it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Turns out that you do, if not:

======================================================================
ERROR: test_purge_filepaths_null_cols (qiita_core.util.DecoratedClass)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "qiita_db/test/test_util.py", line 393, in test_purge_filepaths_null_cols
    self._common_purge_filpeaths_test()
  File "qiita_db/test/test_util.py", line 342, in _common_purge_filpeaths_test
    qdb.util.purge_filepaths()
  File "/Users/antoniog/svn_programs/qiita/qiita_db/util.py", line 768, in purge_filepaths
    _rm_files(fp)
  File "/Users/antoniog/svn_programs/qiita/qiita_db/util.py", line 724, in _rm_files
    TRN.add_post_commit_func(func, fp)
NameError: global name 'TRN' is not defined

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, well, yeah, in that case you need to use it as:
qdb.sql_connection.TRN not just TRN.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not blocking though...

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From looking at the file, I think this is because you need to import the TRN object, right @josenavas?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gonna leave as is ... as @josenavas said, not blocking.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good.

# Remove the data
if exists(fp):
if isdir(fp):
func = rmtree
else:
func = remove
TRN.add_post_commit_func(func, fp)


def purge_filepaths(delete_files=True):
r"""Goes over the filepath table and remove all the filepaths that are not
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you update the docstring with the new parameter?

used in any place

Parameters
----------
delete_files : bool
if True it will actually delete the files, if False print
"""
with qdb.sql_connection.TRN:
# Get all the (table, column) pairs that reference to the filepath
Expand All @@ -739,30 +754,58 @@ def purge_filepaths():
union_str = " UNION ".join(
["SELECT %s FROM qiita.%s WHERE %s IS NOT NULL" % (col, table, col)
for table, col in qdb.sql_connection.TRN.execute_fetchindex()])
# Get all the filepaths from the filepath table that are not
# referenced from any place in the database
sql = """SELECT filepath_id, filepath, filepath_type, data_directory_id
FROM qiita.filepath FP JOIN qiita.filepath_type FPT
ON FP.filepath_type_id = FPT.filepath_type_id
WHERE filepath_id NOT IN (%s)""" % union_str
qdb.sql_connection.TRN.add(sql)
if union_str:
# Get all the filepaths from the filepath table that are not
# referenced from any place in the database
sql = """SELECT filepath_id, filepath, filepath_type, data_directory_id
FROM qiita.filepath FP JOIN qiita.filepath_type FPT
ON FP.filepath_type_id = FPT.filepath_type_id
WHERE filepath_id NOT IN (%s)""" % union_str
qdb.sql_connection.TRN.add(sql)

# We can now go over and remove all the filepaths
sql = "DELETE FROM qiita.filepath WHERE filepath_id=%s"
db_results = qdb.sql_connection.TRN.execute_fetchindex()
for fp_id, fp, fp_type, dd_id in db_results:
qdb.sql_connection.TRN.add(sql, [fp_id])
if delete_files:
qdb.sql_connection.TRN.add(sql, [fp_id])
fp = join(get_mountpoint_path_by_id(dd_id), fp)
_rm_files(qdb.sql_connection.TRN, fp)
else:
print fp, fp_type
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this print statement needed? Doesn't seem to be described in the docs.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just added it.


# Remove the data
fp = join(get_mountpoint_path_by_id(dd_id), fp)
if exists(fp):
if fp_type is 'directory':
func = rmtree
else:
func = remove
qdb.sql_connection.TRN.add_post_commit_func(func, fp)
if delete_files:
qdb.sql_connection.TRN.execute()

qdb.sql_connection.TRN.execute()

def empty_trash_upload_folder(delete_files=True):
r"""Delete all files in the trash folder inside each of the upload
folders

Parameters
----------
delete_files : bool
if True it will actually delete the files, if False print
"""
gfp = partial(join, get_db_files_base_dir())
with qdb.sql_connection.TRN:
sql = """SELECT mountpoint
FROM qiita.data_directory
WHERE data_type = 'uploads'"""
qdb.sql_connection.TRN.add(sql)

for mp in qdb.sql_connection.TRN.execute_fetchflatten():
for path, dirs, files in walk(gfp(mp)):
if path.endswith('/trash'):
if delete_files:
for f in files:
fp = join(path, f)
_rm_files(qdb.sql_connection.TRN, fp)
else:
print files
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as the other print.


if delete_files:
qdb.sql_connection.TRN.execute()


def move_filepaths_to_upload_folder(study_id, filepaths):
Expand Down
Loading