Skip to content

Load processed data #129

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jun 20, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 49 additions & 1 deletion qiita_db/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
# The full license is in the file LICENSE, distributed with this software.
# -----------------------------------------------------------------------------

from dateutil.parser import parse
import pandas as pd
from functools import partial
try:
Expand All @@ -18,7 +19,7 @@
from .study import Study, StudyPerson
from .user import User
from .util import get_filetypes, get_filepath_types
from .data import RawData
from .data import RawData, PreprocessedData, ProcessedData
from .metadata_template import SampleTemplate


Expand Down Expand Up @@ -108,3 +109,50 @@ def load_raw_data_cmd(filepaths, filepath_types, filetype, study_ids):

return RawData.create(filetype_id, list(zip(filepaths, filepath_types)),
studies)


def load_processed_data_cmd(fps, fp_types, processed_params_table_name,
                            processed_params_id, preprocessed_data_id=None,
                            processed_date=None):
    """Create a new processed data entry from the given files

    Parameters
    ----------
    fps : list of str
        The paths to the processed data files to attach to the new
        ProcessedData object
    fp_types: list of str
        The filepath type of each file in `fps` (must be the same length
        as `fps`)
    processed_params_table_name : str
        The name of the processed_params_ table holding the processing
        parameters
    processed_params_id : int
        The ID of the row within `processed_params_table_name`
    preprocessed_data_id : int, optional
        Defaults to ``None``. If provided, the ID of the row in the
        preprocessed_data table this processed data derives from
    processed_date : str, optional
        Defaults to ``None``. If provided, a string interpretable as a
        datetime to record as the processing date

    Returns
    -------
    qiita_db.ProcessedData
        The newly created `qiita_db.ProcessedData` object

    Raises
    ------
    ValueError
        If `fps` and `fp_types` do not have the same length
    """
    if len(fps) != len(fp_types):
        raise ValueError("Please pass exactly one fp_type for each "
                         "and every fp")

    # Translate the human-readable filepath types into their DB identifiers
    type_ids = get_filepath_types()
    fp_type_ids = [type_ids[fp_type] for fp_type in fp_types]

    preprocessed_data = (PreprocessedData(preprocessed_data_id)
                         if preprocessed_data_id is not None else None)

    parsed_date = (parse(processed_date)
                   if processed_date is not None else None)

    return ProcessedData.create(processed_params_table_name,
                                processed_params_id,
                                list(zip(fps, fp_type_ids)),
                                preprocessed_data, parsed_date)
6 changes: 2 additions & 4 deletions qiita_db/support_files/qiita-db.dbs
Original file line number Diff line number Diff line change
Expand Up @@ -635,10 +635,8 @@ Linked by y being raw_data_id from raw data table.</comment>
<table name="processed_filepath" >
<column name="processed_data_id" type="bigint" jt="-5" mandatory="y" />
<column name="filepath_id" type="bigint" jt="-5" mandatory="y" />
<index name="pk_processed_data_filepath" unique="UNIQUE" >
<index name="idx_processed_filepath" unique="PRIMARY_KEY" >
<column name="processed_data_id" />
</index>
<index name="idx_processed_data_filepath" unique="NORMAL" >
<column name="filepath_id" />
</index>
<fk name="fk_processed_data_filepath" to_schema="qiita" to_table="processed_data" >
Expand Down Expand Up @@ -1262,8 +1260,8 @@ Controlled Vocabulary]]></comment>
<entity schema="qiita" name="raw_data" color="d0def5" x="1230" y="480" />
<entity schema="qiita" name="raw_preprocessed_data" color="b2cdf7" x="1230" y="585" />
<entity schema="qiita" name="preprocessed_filepath" color="c0d4f3" x="990" y="705" />
<entity schema="qiita" name="processed_filepath" color="c0d4f3" x="1005" y="930" />
<entity schema="qiita" name="preprocessed_data" color="c0d4f3" x="1200" y="690" />
<entity schema="qiita" name="processed_filepath" color="c0d4f3" x="1005" y="930" />
<group name="Group_analyses" color="c4e0f9" >
<comment>analysis tables</comment>
<entity schema="qiita" name="analysis" />
Expand Down
3,191 changes: 1,545 additions & 1,646 deletions qiita_db/support_files/qiita-db.html

Large diffs are not rendered by default.

4 changes: 1 addition & 3 deletions qiita_db/support_files/qiita-db.sql
Original file line number Diff line number Diff line change
Expand Up @@ -456,13 +456,11 @@ CREATE INDEX idx_preprocessed_processed_data_1 ON qiita.preprocessed_processed_d
CREATE TABLE qiita.processed_filepath (
processed_data_id bigint NOT NULL,
filepath_id bigint NOT NULL,
CONSTRAINT pk_processed_data_filepath UNIQUE ( processed_data_id ) ,
CONSTRAINT idx_processed_filepath PRIMARY KEY ( processed_data_id, filepath_id ),
CONSTRAINT fk_processed_data_filepath FOREIGN KEY ( processed_data_id ) REFERENCES qiita.processed_data( processed_data_id ) ,
CONSTRAINT fk_processed_data_filepath_0 FOREIGN KEY ( filepath_id ) REFERENCES qiita.filepath( filepath_id )
);

CREATE INDEX idx_processed_data_filepath ON qiita.processed_filepath ( filepath_id );

CREATE TABLE qiita.processed_params_uclust (
processed_params_id bigserial NOT NULL,
reference_id bigint NOT NULL,
Expand Down
61 changes: 60 additions & 1 deletion qiita_db/test/test_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,11 @@
from configparser import NoOptionError

from qiita_db.commands import (make_study_from_cmd, load_raw_data_cmd,
sample_template_adder)
sample_template_adder, load_processed_data_cmd)
from qiita_db.study import Study, StudyPerson
from qiita_db.user import User
from qiita_db.util import get_count, check_count, get_db_files_base_dir
from qiita_db.data import PreprocessedData
from qiita_core.util import qiita_test_checker


Expand Down Expand Up @@ -148,6 +149,64 @@ def test_load_data_from_cmd(self):
study_ids)


@qiita_test_checker()
class TestLoadProcessedDataFromCmd(TestCase):
    def setUp(self):
        # Create two placeholder BIOM table files on disk
        fd, self.otu_table_fp = mkstemp(suffix='_otu_table.biom')
        close(fd)
        fd, self.otu_table_2_fp = mkstemp(suffix='_otu_table2.biom')
        close(fd)

        for fp in (self.otu_table_fp, self.otu_table_2_fp):
            with open(fp, "w") as f:
                f.write("\n")

        # Every path in this list is deleted during tearDown
        self.files_to_remove = [self.otu_table_fp, self.otu_table_2_fp]

        self.db_test_processed_data_dir = join(get_db_files_base_dir(),
                                               'processed_data')

    def tearDown(self):
        # Remove only files that still exist on disk
        for fp in self.files_to_remove:
            if exists(fp):
                remove(fp)

    def test_load_processed_data_from_cmd(self):
        filepaths = [self.otu_table_fp, self.otu_table_2_fp]
        filepath_types = ['biom', 'biom']

        # Snapshot the row counts so we can verify the deltas afterwards
        count_before = {table: get_count(table)
                        for table in ('qiita.processed_data',
                                      'qiita.processed_filepath',
                                      'qiita.filepath')}

        new = load_processed_data_cmd(filepaths, filepath_types,
                                      'processed_params_uclust', 1, 1, None)
        processed_data_id = new.id

        # The input files get copied into the DB directory; schedule those
        # copies for removal too
        for fp in filepaths:
            self.files_to_remove.append(
                join(self.db_test_processed_data_dir,
                     '%d_%s' % (processed_data_id, basename(fp))))

        # One new processed_data row, two new rows in each filepath table
        self.assertTrue(check_count(
            'qiita.processed_data',
            count_before['qiita.processed_data'] + 1))
        self.assertTrue(check_count(
            'qiita.processed_filepath',
            count_before['qiita.processed_filepath'] + 2))
        self.assertTrue(check_count(
            'qiita.filepath',
            count_before['qiita.filepath'] + 2))

        # Ensure that the ValueError is raised when a filepath_type is not
        # provided for each and every filepath
        with self.assertRaises(ValueError):
            load_processed_data_cmd(filepaths, filepath_types[:-1],
                                    'processed_params_uclust', 1, 1, None)


CONFIG_1 = """[required]
timeseries_type_id = 1
metadata_complete = True
Expand Down
6 changes: 5 additions & 1 deletion qiita_db/test/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
compute_checksum, check_table_cols,
check_required_columns, convert_to_id,
get_table_cols, get_filetypes, get_filepath_types,
get_count, check_count)
get_count, check_count, get_processed_params_tables)


@qiita_test_checker()
Expand Down Expand Up @@ -150,6 +150,10 @@ def test_check_count(self):
self.assertTrue(check_count('qiita.study_person', 3))
self.assertFalse(check_count('qiita.study_person', 2))

def test_get_processed_params_tables(self):
    # The test database ships with only the uclust parameters table
    self.assertEqual(get_processed_params_tables(),
                     ['processed_params_uclust'])


class UtilTests(TestCase):
"""Tests for the util functions that do not need to access the DB"""
Expand Down
14 changes: 14 additions & 0 deletions qiita_db/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,3 +467,17 @@ def check_count(table, exp_count):
"""
obs_count = get_count(table)
return obs_count == exp_count


def get_processed_params_tables():
    """Returns a list of all tables starting with "processed_params_"

    Returns
    -------
    list of str
        The names of the processed-parameters tables in the ``qiita``
        schema, sorted alphabetically
    """
    # Select only table_name (rather than '*' with positional indexing),
    # so the result does not depend on the column layout of
    # information_schema.tables, and order the rows so callers always get
    # a deterministic list.
    sql = ("SELECT table_name FROM information_schema.tables WHERE "
           "table_schema = 'qiita' AND "
           "SUBSTR(table_name, 1, 17) = 'processed_params_' "
           "ORDER BY table_name")

    conn = SQLConnectionHandler()
    return [x[0] for x in conn.execute_fetchall(sql)]
34 changes: 32 additions & 2 deletions scripts/qiita_db
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@

import click

from qiita_db.util import get_filetypes, get_filepath_types
from qiita_db.util import (get_filetypes, get_filepath_types,
get_processed_params_tables)
from qiita_db.commands import (sample_template_adder, make_study_from_cmd,
load_raw_data_cmd)
load_raw_data_cmd, load_processed_data_cmd)


@click.group()
Expand All @@ -39,6 +40,35 @@ def load_raw_data(fp, fp_type, filetype, study):
load_raw_data_cmd(fp, fp_type, filetype, study)


@qiita_db.command()
@click.option('--fp', required=True, type=click.Path(resolve_path=True,
              readable=True, exists=True), multiple=True, help='Path to the '
              'processed data. This option can be used multiple times if '
              'there are multiple processed data files.')
@click.option('--fp_type', required=True, multiple=True, help='Describes the '
              'contents of the file. Pass one fp_type per fp.',
              type=click.Choice(get_filepath_types().keys()))
@click.option('--processed_params_table', required=True,
              type=click.Choice(get_processed_params_tables()),
              help='The table containing the processed parameters used to '
              'generate this file')
@click.option('--processed_params_id', required=True, type=int,
              help='The ID of the row in the processed_params table')
@click.option('--preprocessed_data_id', type=int, default=None, help='The '
              'ID of the row in the preprocessed_data table from which '
              'this processed data was created')
@click.option('--processed_date', type=str, default=None,
              help='The date to use as the processed_date. Must be '
              'interpretable as a datetime. If None, then the current date '
              'and time will be used.')
def load_processed_data(fp, fp_type, processed_params_table,
                        processed_params_id, preprocessed_data_id,
                        processed_date):
    """Add processed data to the database"""
    # Thin CLI wrapper: all validation and DB work happens in the command
    # function so it can also be driven programmatically.
    load_processed_data_cmd(fp, fp_type, processed_params_table,
                            processed_params_id, preprocessed_data_id,
                            processed_date)


@qiita_db.command()
@click.option('--owner', help="The email address of the owner of the study")
@click.option('--title', help="The title of the study")
Expand Down