diff --git a/.travis.yml b/.travis.yml
index 8e4c86827..5a191929f 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -40,9 +40,10 @@ script:
 - qiita-env start_cluster qiita-general
 - qiita-env make --no-load-ontologies
 - if [ ${TEST_ADD_STUDIES} == "True" ]; then test_data_studies/commands.sh ; fi
+- if [ ${TEST_ADD_STUDIES} == "True" ]; then qiita-cron-job ; fi
 - if [ ${TEST_ADD_STUDIES} == "False" ]; then qiita-test-install ; fi
 - if [ ${TEST_ADD_STUDIES} == "False" ]; then nosetests --with-doctest --with-coverage -v --cover-package=qiita_db,qiita_pet,qiita_core,qiita_ware; fi
-- flake8 qiita_* setup.py scripts/qiita scripts/qiita-env scripts/qiita-test-install
+- flake8 qiita_* setup.py scripts/*
 - ls -R /home/travis/miniconda3/envs/qiita/lib/python2.7/site-packages/qiita_pet/support_files/doc/
 - qiita pet webserver
 addons:
diff --git a/qiita_db/meta_util.py b/qiita_db/meta_util.py
index cc8f2dc18..efaf429e2 100644
--- a/qiita_db/meta_util.py
+++ b/qiita_db/meta_util.py
@@ -25,6 +25,17 @@
 # -----------------------------------------------------------------------------
 from __future__ import division

+from moi import r_client
+from os import stat
+from time import strftime, localtime
+import matplotlib.pyplot as plt
+import matplotlib as mpl
+from base64 import b64encode
+from urllib import quote
+from StringIO import StringIO
+from future.utils import viewitems
+from datetime import datetime
+
 from qiita_core.qiita_settings import qiita_config
 import qiita_db as qdb

@@ -122,6 +133,147 @@ def get_accessible_filepath_ids(user):
     return filepath_ids


+def update_redis_stats():
+    """Generate the system stats and save them in redis
+
+    Returns
+    -------
+    list of str
+        artifact filepaths that are not present in the file system
+    """
+    STUDY = qdb.study.Study
+    studies = {'public': STUDY.get_by_status('public'),
+               'private': STUDY.get_by_status('private'),
+               'sandbox': STUDY.get_by_status('sandbox')}
+    number_studies = {k: len(v) for k, v in viewitems(studies)}
+
+    number_of_samples = {}
+    ebi_samples_prep = {}
+    num_samples_ebi = 0
+    for k, sts in viewitems(studies):
+        number_of_samples[k] = 0
+        for s in sts:
+            st = s.sample_template
+            if st is not None:
+                number_of_samples[k] += len(list(st.keys()))
+
+            ebi_samples_prep_count = 0
+            for pt in s.prep_templates():
+                ebi_samples_prep_count += len([
+                    1 for _, v in viewitems(pt.ebi_experiment_accessions)
+                    if v is not None and v != ''])
+            ebi_samples_prep[s.id] = ebi_samples_prep_count
+
+            if st is not None:
+                num_samples_ebi += len([
+                    1 for _, v in viewitems(st.ebi_sample_accessions)
+                    if v is not None and v != ''])
+
+    num_users = qdb.util.get_count('qiita.qiita_user')
+
+    lat_longs = get_lat_longs()
+
+    num_studies_ebi = len(ebi_samples_prep)
+    number_samples_ebi_prep = sum([v for _, v in viewitems(ebi_samples_prep)])
+
+    # generating file size stats
+    stats = []
+    missing_files = []
+    for k, sts in viewitems(studies):
+        for s in sts:
+            for a in s.artifacts():
+                for _, fp, dt in a.filepaths:
+                    try:
+                        fs = stat(fp)
+                        stats.append(
+                            (dt, fs.st_size,
+                             strftime('%Y-%m', localtime(fs.st_ctime))))
+                    except OSError:
+                        missing_files.append(fp)
+
+    summary = {}
+    all_dates = []
+    for ft, size, ym in stats:
+        if ft not in summary:
+            summary[ft] = {}
+        if ym not in summary[ft]:
+            summary[ft][ym] = 0
+            all_dates.append(ym)
+        summary[ft][ym] += size
+    all_dates = sorted(set(all_dates))
+
+    # sorting summaries
+    rm_from_data = ['html_summary', 'tgz', 'directory', 'raw_fasta', 'log',
+                    'biom', 'raw_sff', 'raw_qual']
+    ordered_summary = {}
+    for dt in summary:
+        if dt in rm_from_data:
+            continue
+        new_list = []
+        current_value = 0
+        for ad in all_dates:
+            if ad in summary[dt]:
+                current_value += summary[dt][ad]
+            new_list.append(current_value)
+        ordered_summary[dt] = new_list
+
+    plot_order = sorted([(k, ordered_summary[k][-1])
+                         for k in ordered_summary], key=lambda x: x[1])
+
+    # helper function to generate y axis labels, modified from:
+    # http://stackoverflow.com/a/1094933
+    def sizeof_fmt(value, position):
+        number = None
+        for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
+            if abs(value) < 1024.0:
+                number = "%3.1f%s" % (value, unit)
+                break
+            value /= 1024.0
+        if number is None:
+            number = "%.1f%s" % (value, 'Y')
+        return number
+
+    all_dates_axis = range(len(all_dates))
+    plt.locator_params(axis='y', nbins=10)
+    plt.figure(figsize=(20, 10))
+    for k, v in plot_order:
+        plt.plot(all_dates_axis, ordered_summary[k], linewidth=2, label=k)
+
+    plt.xticks(all_dates_axis, all_dates)
+    plt.legend()
+    plt.grid()
+    ax = plt.gca()
+    ax.yaxis.set_major_formatter(mpl.ticker.FuncFormatter(sizeof_fmt))
+    plt.xlabel('Date')
+    plt.ylabel('Storage space per data type')
+
+    plot = StringIO()
+    plt.savefig(plot, format='png')
+    plot.seek(0)
+    img = 'data:image/png;base64,' + quote(b64encode(plot.getvalue()))
+
+    time = datetime.now().strftime('%m-%d-%y %H:%M:%S')
+
+    portal = qiita_config.portal
+    vals = [
+        ('number_studies', number_studies, r_client.hmset),
+        ('number_of_samples', number_of_samples, r_client.hmset),
+        ('num_users', num_users, r_client.set),
+        ('lat_longs', lat_longs, r_client.set),
+        ('num_studies_ebi', num_studies_ebi, r_client.set),
+        ('num_samples_ebi', num_samples_ebi, r_client.set),
+        ('number_samples_ebi_prep', number_samples_ebi_prep, r_client.set),
+        ('img', img, r_client.set),
+        ('time', time, r_client.set)]
+    for k, v, f in vals:
+        redis_key = '%s:stats:%s' % (portal, k)
+        # delete the key first: hmset merges into an existing hash, so
+        # flushing avoids stale fields from a previous run
+        r_client.delete(redis_key)
+        f(redis_key, v)
+
+    return missing_files
+
+
 def get_lat_longs():
     """Retrieve the latitude and longitude of all the samples in the DB

@@ -146,7 +298,9 @@ def get_lat_longs():
     sql = [('SELECT CAST(latitude AS FLOAT), '
             '       CAST(longitude AS FLOAT) '
             'FROM qiita.%s '
-            'WHERE isnumeric(latitude) AND isnumeric(latitude)' % s)
+            'WHERE isnumeric(latitude) AND isnumeric(longitude) '
+            "AND latitude <> 'NaN' "
+            "AND longitude <> 'NaN' " % s)
            for s in qdb.sql_connection.TRN.execute_fetchflatten()]
     sql = ' UNION '.join(sql)
     qdb.sql_connection.TRN.add(sql)
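# Note: a minimal sketch, not part of the patch, of the redis layout that
# update_redis_stats() writes. Every key is namespaced per portal as
# '<portal>:stats:<stat>'; dict-valued stats go through hmset (redis hashes)
# and scalar stats through set (plain strings). This assumes `r_client` is
# the redis-py compatible client that moi exposes:
from moi import r_client

portal = 'QIITA'  # hypothetical; the real value comes from qiita_config.portal
r_client.hmset('%s:stats:number_studies' % portal,
               {'public': 1, 'private': 0, 'sandbox': 2})
r_client.set('%s:stats:num_users' % portal, 4)

# redis hands everything back as strings (or dicts of strings for hashes),
# so consumers must cast, as both the test and the stats handler below do:
r_client.hgetall('%s:stats:number_studies' % portal)
# -> {'public': '1', 'private': '0', 'sandbox': '2'}
r_client.get('%s:stats:num_users' % portal)
# -> '4'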
diff --git a/qiita_db/test/test_meta_util.py b/qiita_db/test/test_meta_util.py
index 653b65237..f9f513467 100644
--- a/qiita_db/test/test_meta_util.py
+++ b/qiita_db/test/test_meta_util.py
@@ -10,6 +10,7 @@

 import pandas as pd

+from moi import r_client
 from qiita_core.qiita_settings import qiita_config
 from qiita_core.util import qiita_test_checker

@@ -180,6 +181,43 @@ def test_get_lat_longs_EMP_portal(self):

         self.assertItemsEqual(obs, exp)

+    def test_update_redis_stats(self):
+        qdb.meta_util.update_redis_stats()
+
+        portal = qiita_config.portal
+        vals = [
+            ('number_studies', {'sandbox': '2', 'public': '0',
+                                'private': '1'}, r_client.hgetall),
+            ('number_of_samples', {'sandbox': '1', 'public': '0',
+                                   'private': '27'}, r_client.hgetall),
+            ('num_users', '4', r_client.get),
+            ('lat_longs', EXP_LAT_LONG, r_client.get),
+            ('num_studies_ebi', '3', r_client.get),
+            ('num_samples_ebi', '27', r_client.get),
+            ('number_samples_ebi_prep', '54', r_client.get)
+            # not testing img/time for simplicity
+            # ('img', r_client.get),
+            # ('time', r_client.get)
+        ]
+        for k, exp, f in vals:
+            redis_key = '%s:stats:%s' % (portal, k)
+            self.assertEqual(f(redis_key), exp)
+
+
+EXP_LAT_LONG = (
+    '[[0.291867635913, 68.5945325743], [68.0991287718, 34.8360987059],'
+    ' [10.6655599093, 70.784770579], [40.8623799474, 6.66444220187],'
+    ' [13.089194595, 92.5274472082], [84.0030227585, 66.8954849864],'
+    ' [12.7065957714, 84.9722975792], [78.3634273709, 74.423907894],'
+    ' [82.8302905615, 86.3615778099], [53.5050692395, 31.6056761814],'
+    ' [43.9614715197, 82.8516734159], [29.1499460692, 82.1270418227],'
+    ' [23.1218032799, 42.838497795], [12.6245524972, 96.0693176066],'
+    ' [38.2627021402, 3.48274264219], [74.0894932572, 65.3283470202],'
+    ' [35.2374368957, 68.5041623253], [4.59216095574, 63.5115213108],'
+    ' [95.2060749748, 27.3592668624], [68.51099627, 2.35063674718],'
+    ' [85.4121476399, 15.6526750776], [60.1102854322, 74.7123248382],'
+    ' [3.21190859967, 26.8138925876], [57.571893782, 32.5563076447],'
+    ' [44.9725384282, 66.1920014699], [42.42, 41.41]]')

 if __name__ == '__main__':
     main()
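# Note, not part of the patch: the new test can be exercised on its own with
# the same runner the Travis config above invokes, e.g.:
#
#   nosetests qiita_db/test/test_meta_util.py -v
#
# update_redis_stats() writes to portal-prefixed keys, so the test has to run
# against the same qiita_config.portal that was used to populate them.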
diff --git a/qiita_db/test/test_util.py b/qiita_db/test/test_util.py
index d2529e788..743138dcd 100644
--- a/qiita_db/test/test_util.py
+++ b/qiita_db/test/test_util.py
@@ -8,7 +8,7 @@

 from unittest import TestCase, main
 from tempfile import mkstemp
-from os import close, remove
+from os import close, remove, mkdir
 from os.path import join, exists, basename
 from shutil import rmtree
 from datetime import datetime
@@ -365,6 +365,20 @@ def _common_purge_filpeaths_test(self):
     def test_purge_filepaths(self):
         self._common_purge_filpeaths_test()

+    def test_empty_trash_upload_folder(self):
+        # create a file in the trash folder so we know deletion actually
+        # happens
+        study_id = '1'
+        uploads_fp = join(qdb.util.get_mountpoint("uploads")[0][1], study_id)
+        trash = join(uploads_fp, 'trash')
+        if not exists(trash):
+            mkdir(trash)
+        fp = join(trash, 'my_file_to_delete.txt')
+        open(fp, 'w').close()
+
+        self.assertTrue(exists(fp))
+        qdb.util.empty_trash_upload_folder()
+        self.assertFalse(exists(fp))
+
     def test_purge_filepaths_null_cols(self):
         # For more details about the source of the issue that motivates this
         # test: http://www.depesz.com/2008/08/13/nulls-vs-not-in/
diff --git a/qiita_db/util.py b/qiita_db/util.py
index ade911f0a..938451450 100644
--- a/qiita_db/util.py
+++ b/qiita_db/util.py
@@ -714,9 +714,24 @@ def path_builder(db_dir, filepath, mountpoint, subdirectory, obj_id):
             for fpid, fp, fp_type_, m, s in results]


-def purge_filepaths():
+def _rm_files(TRN, fp):
+    # Remove the file or directory, deferring the actual deletion until the
+    # enclosing transaction commits
+    if exists(fp):
+        if isdir(fp):
+            func = rmtree
+        else:
+            func = remove
+        TRN.add_post_commit_func(func, fp)
+
+
+def purge_filepaths(delete_files=True):
     r"""Goes over the filepath table and remove all the filepaths that are
     not used in any place
+
+    Parameters
+    ----------
+    delete_files : bool
+        If True, delete the files; if False, only print the filepaths that
+        would be removed
     """
     with qdb.sql_connection.TRN:
         # Get all the (table, column) pairs that reference to the filepath
@@ -739,30 +754,58 @@ def purge_filepaths():
         union_str = " UNION ".join(
            ["SELECT %s FROM qiita.%s WHERE %s IS NOT NULL" % (col, table, col)
             for table, col in qdb.sql_connection.TRN.execute_fetchindex()])
-        # Get all the filepaths from the filepath table that are not
-        # referenced from any place in the database
-        sql = """SELECT filepath_id, filepath, filepath_type, data_directory_id
-                FROM qiita.filepath FP JOIN qiita.filepath_type FPT
-                    ON FP.filepath_type_id = FPT.filepath_type_id
-                WHERE filepath_id NOT IN (%s)""" % union_str
-        qdb.sql_connection.TRN.add(sql)
+        if union_str:
+            # Get all the filepaths from the filepath table that are not
+            # referenced from any place in the database
+            sql = """SELECT filepath_id, filepath, filepath_type,
+                            data_directory_id
+                     FROM qiita.filepath FP JOIN qiita.filepath_type FPT
+                        ON FP.filepath_type_id = FPT.filepath_type_id
+                     WHERE filepath_id NOT IN (%s)""" % union_str
+            qdb.sql_connection.TRN.add(sql)

         # We can now go over and remove all the filepaths
         sql = "DELETE FROM qiita.filepath WHERE filepath_id=%s"
         db_results = qdb.sql_connection.TRN.execute_fetchindex()
         for fp_id, fp, fp_type, dd_id in db_results:
-            qdb.sql_connection.TRN.add(sql, [fp_id])
+            fp = join(get_mountpoint_path_by_id(dd_id), fp)
+            if delete_files:
+                qdb.sql_connection.TRN.add(sql, [fp_id])
+                _rm_files(qdb.sql_connection.TRN, fp)
+            else:
+                print fp, fp_type

-            # Remove the data
-            fp = join(get_mountpoint_path_by_id(dd_id), fp)
-            if exists(fp):
-                if fp_type is 'directory':
-                    func = rmtree
-                else:
-                    func = remove
-                qdb.sql_connection.TRN.add_post_commit_func(func, fp)
+        if delete_files:
+            qdb.sql_connection.TRN.execute()

-        qdb.sql_connection.TRN.execute()
+
+def empty_trash_upload_folder(delete_files=True):
+    r"""Delete all files in the trash folder inside each of the upload
+    folders
+
+    Parameters
+    ----------
+    delete_files : bool
+        If True, delete the files; if False, only print the filepaths that
+        would be removed
+    """
+    gfp = partial(join, get_db_files_base_dir())
+    with qdb.sql_connection.TRN:
+        sql = """SELECT mountpoint
+                 FROM qiita.data_directory
+                 WHERE data_type = 'uploads'"""
+        qdb.sql_connection.TRN.add(sql)
+
+        for mp in qdb.sql_connection.TRN.execute_fetchflatten():
+            for path, dirs, files in walk(gfp(mp)):
+                if path.endswith('/trash'):
+                    if delete_files:
+                        for f in files:
+                            fp = join(path, f)
+                            _rm_files(qdb.sql_connection.TRN, fp)
+                    else:
+                        print files
+
+        if delete_files:
+            qdb.sql_connection.TRN.execute()


 def move_filepaths_to_upload_folder(study_id, filepaths):
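# Note: a minimal usage sketch, not part of the patch, of the two cleanup
# helpers above. Both default to delete_files=True; passing False turns the
# call into a dry run that only prints what would be removed:
from qiita_db.util import purge_filepaths, empty_trash_upload_folder

# preview orphaned filepaths and trashed uploads without touching the disk
purge_filepaths(delete_files=False)
empty_trash_upload_folder(delete_files=False)

# actually remove them, which is what scripts/qiita-cron-job does
purge_filepaths()
empty_trash_upload_folder()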
diff --git a/qiita_pet/handlers/stats.py b/qiita_pet/handlers/stats.py
index 5a82c04a0..4164d0fc9 100644
--- a/qiita_pet/handlers/stats.py
+++ b/qiita_pet/handlers/stats.py
@@ -7,63 +7,36 @@

 from qiita_core.util import execute_as_transaction
 from qiita_core.qiita_settings import qiita_config
-from qiita_db.util import get_count
 from qiita_db.study import Study
-from qiita_db.meta_util import get_lat_longs

 from .base_handlers import BaseHandler


 class StatsHandler(BaseHandler):
     @execute_as_transaction
     def _get_stats(self, callback):
-        # check if the key exists in redis
-        redis_lats_key = '%s:stats:sample_lats' % qiita_config.portal
-        redis_longs_key = '%s:stats:sample_longs' % qiita_config.portal
-        lats = r_client.lrange(redis_lats_key, 0, -1)
-        longs = r_client.lrange(redis_longs_key, 0, -1)
-        if not (lats and longs):
-            # if we don't have them, then fetch from disk and add to the
-            # redis server with a 24-hour expiration
-            lat_longs = get_lat_longs()
-            lats = [float(x[0]) for x in lat_longs]
-            longs = [float(x[1]) for x in lat_longs]
-            with r_client.pipeline() as pipe:
-                for latitude, longitude in lat_longs:
-                    # storing as a simple data structure, hopefully this
-                    # doesn't burn us later
-                    pipe.rpush(redis_lats_key, latitude)
-                    pipe.rpush(redis_longs_key, longitude)
+        stats = {}
+        # checking values from redis
+        portal = qiita_config.portal
+        vals = [
+            ('number_studies', r_client.hgetall),
+            ('number_of_samples', r_client.hgetall),
+            ('num_users', r_client.get),
+            ('lat_longs', r_client.get),
+            ('num_studies_ebi', r_client.get),
+            ('num_samples_ebi', r_client.get),
+            ('number_samples_ebi_prep', r_client.get),
+            ('img', r_client.get),
+            ('time', r_client.get)]
+        for k, f in vals:
+            redis_key = '%s:stats:%s' % (portal, k)
+            stats[k] = f(redis_key)

-                # set the key to expire in 24 hours, so that we limit the
-                # number of times we have to go to the database to a reasonable
-                # amount
-                r_client.expire(redis_lats_key, 86400)
-                r_client.expire(redis_longs_key, 86400)
-
-                pipe.execute()
-        else:
-            # If we do have them, put the redis results into the same structure
-            # that would come back from the database
-            longs = [float(x) for x in longs]
-            lats = [float(x) for x in lats]
-            lat_longs = zip(lats, longs)
-
-        # Get the number of studies
-        num_studies = get_count('qiita.study')
-
-        # Get the number of samples
-        num_samples = len(lats)
-
-        # Get the number of users
-        num_users = get_count('qiita.qiita_user')
-
-        callback([num_studies, num_samples, num_users, lat_longs])
+        callback(stats)

     @coroutine
     @execute_as_transaction
     def get(self):
-        num_studies, num_samples, num_users, lat_longs = \
-            yield Task(self._get_stats)
+        stats = yield Task(self._get_stats)

         # Pull a random public study from the database
         public_studies = Study.get_by_status('public')
@@ -79,8 +52,14 @@ def get(self):
             random_study_id = study.id

         self.render('stats.html',
-                    num_studies=num_studies, num_samples=num_samples,
-                    num_users=num_users, lat_longs=lat_longs,
+                    number_studies=stats['number_studies'],
+                    number_of_samples=stats['number_of_samples'],
+                    num_users=stats['num_users'],
+                    lat_longs=eval(stats['lat_longs']),
+                    num_studies_ebi=stats['num_studies_ebi'],
+                    num_samples_ebi=stats['num_samples_ebi'],
+                    number_samples_ebi_prep=stats['number_samples_ebi_prep'],
+                    img=stats['img'], time=stats['time'],
                     random_study_info=random_study_info,
                     random_study_title=random_study_title,
                     random_study_id=random_study_id)
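# Note: a minimal sketch, not part of the patch. update_redis_stats() stores
# lat_longs as the str() of a list of [lat, long] pairs, which is why the
# handler above eval()s the string it gets back from redis. Since the value
# is a plain literal, ast.literal_eval is a safer equivalent worth
# considering here:
from ast import literal_eval

stored = '[[0.291867635913, 68.5945325743], [42.42, 41.41]]'  # r_client.get()
lat_longs = literal_eval(stored)
# -> [[0.291867635913, 68.5945325743], [42.42, 41.41]]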
diff --git a/qiita_pet/templates/stats.html b/qiita_pet/templates/stats.html
index 10b840f90..89d6a5384 100644
--- a/qiita_pet/templates/stats.html
+++ b/qiita_pet/templates/stats.html
@@ -49,26 +49,47 @@
 {% end %}

 {% block content %}
+  <div class="row">
+    <div class="col-sm-12">
+      Generated on: {{time}}
+    </div>
+  </div>
+  <div class="row">
+    <div class="col-sm-12">
+      <table class="table">
+        <tr>
+          <th>Studies</th>
+          <th>Samples</th>
+          <th>Users</th>
+        </tr>
+        <tr>
+          <td>
+            {% for k in number_studies %}
+              {{k}}: {{ "{:,}".format(int(number_studies[k])) }}<br/>
+            {% end %}
+            {% if num_studies_ebi %}
+              submitted to EBI: {{ "{:,}".format(int(num_studies_ebi)) }}
+            {% end %}
+          </td>
+          <td>
+            {% for k in number_of_samples %}
+              {{k}}: {{ "{:,}".format(int(number_of_samples[k])) }}<br/>
+            {% end %}
+            {% if num_samples_ebi %}
+              submitted to EBI: {{ "{:,}".format(int(num_samples_ebi)) }}<br/>
+            {% end %}
+            {% if number_samples_ebi_prep %}
+              submitted to EBI (prep): {{ "{:,}".format(int(number_samples_ebi_prep)) }}
+            {% end %}
+          </td>
+          {% if num_users %}
+            <td>
+              {{ "{:,}".format(int(num_users)) }}
+            </td>
+          {% end %}
+        </tr>
+      </table>
+    </div>
+  </div>
-  <div class="row">
-    <div class="col-sm-12">
-      <table class="table">
-        <tr>
-          <th>Number of studies</th>
-          <th>Number of samples</th>
-          <th>Number of users</th>
-        </tr>
-        <tr>
-          <td>{{ num_studies }}</td>
-          <td>{{ num_samples }}</td>
-          <td>{{ num_users }}</td>
-        </tr>
-      </table>
-    </div>
-  </div>
-{% if random_study_id is not None %}
+  {% if random_study_id is not None %}
   <div class="row">
     <div class="col-sm-12">
       Check out this random public study from the database!
      <h4>{{ random_study_title }}</h4>
@@ -81,5 +102,14 @@
     <div class="col-sm-12">
       Log in above to see this and other public studies
     </div>
   </div>
-{% end %}
+  {% end %}
+
+  {% if img %}
+    <div class="row">
+      <div class="col-sm-12">
+        <h4>Data usage</h4>
+        <img width="100%" src="{{img}}"/>
+      </div>
+    </div>
+  {% end %}
+
 {% end %}
diff --git a/scripts/qiita-cron-job b/scripts/qiita-cron-job
new file mode 100755
index 000000000..257353523
--- /dev/null
+++ b/scripts/qiita-cron-job
@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+
+# -----------------------------------------------------------------------------
+# Copyright (c) 2014--, The Qiita Development Team.
+#
+# Distributed under the terms of the BSD 3-clause License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# -----------------------------------------------------------------------------
+
+from qiita_db.util import purge_filepaths, empty_trash_upload_folder
+from qiita_db.meta_util import update_redis_stats
+
+
+# This script performs these jobs:
+# 1. purge_filepaths: remove files that are no longer referenced in the
+#    qiita.filepath table but are still present in the filesystem
+# 2. empty_trash_upload_folder: remove files that are present in the trash
+#    folder of each upload directory
+# 3. update_redis_stats: update the redis stats information
+#
+# Note that it is the responsibility of the Qiita system administrator to add
+# this script to a cron job and to decide how often it should run
+
+
+def main():
+    purge_filepaths(True)
+    empty_trash_upload_folder(True)
+    update_redis_stats()
+
+
+if __name__ == "__main__":
+    main()
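# Note: a minimal deployment sketch, not part of the patch. A nightly crontab
# entry for the script above could look like this (hypothetical install path):
#
#   0 3 * * * /home/qiita/miniconda3/envs/qiita/bin/qiita-cron-job
#
# update_redis_stats() returns the artifact filepaths that are missing from
# the filesystem, so a variant of main() could surface them for the admin:
from qiita_db.util import purge_filepaths, empty_trash_upload_folder
from qiita_db.meta_util import update_redis_stats


def main():
    purge_filepaths(True)
    empty_trash_upload_folder(True)
    missing_files = update_redis_stats()
    if missing_files:
        print 'artifact filepaths missing from the file system:'
        for fp in missing_files:
            print '\t%s' % fp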