diff --git a/.appveyor.yml b/.appveyor.yml
index 2f6ccfd..74a9397 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -51,7 +51,8 @@ install:
   # pip will build them from source using the MSVC compiler matching the
   # target Python version and architecture
   - "python -m pip install wheel pytest"
-  - "python -m pip install -r requirements/prod.txt"
+  - "python -m pip install -r requirements/win.txt -f https://download.pytorch.org/whl/torch_stable.html"
+  - "python -m pip install -r requirements/dev.txt"
 
 build_script:
   # Build the compiled extension
diff --git a/LICENSE b/LICENSE
index 9fe4561..02d4aca 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,22 +1,15 @@
-MIT License
+phenotrex
+Copyright (C) 2019-2020, Lukas Lüftinger
 
-Copyright (c) 2019, Lukas Lüftinger
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
 
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
 
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <https://www.gnu.org/licenses/>.
diff --git a/phenotrex/bin/prodigal.linux b/phenotrex/bin/prodigal.linux
new file mode 100755
index 0000000..c9812b0
Binary files /dev/null and b/phenotrex/bin/prodigal.linux differ
diff --git a/phenotrex/bin/prodigal.osx.10.9.5 b/phenotrex/bin/prodigal.osx.10.9.5
new file mode 100755
index 0000000..9b51733
Binary files /dev/null and b/phenotrex/bin/prodigal.osx.10.9.5 differ
diff --git a/phenotrex/bin/prodigal b/phenotrex/bin/prodigal.windows.exe
similarity index 86%
rename from phenotrex/bin/prodigal
rename to phenotrex/bin/prodigal.windows.exe
index 2c7dfc7..fcff06f 100755
Binary files a/phenotrex/bin/prodigal and b/phenotrex/bin/prodigal.windows.exe differ
diff --git a/phenotrex/cli/predict.py b/phenotrex/cli/predict.py
index d13d141..7eb4766 100644
--- a/phenotrex/cli/predict.py
+++ b/phenotrex/cli/predict.py
@@ -8,6 +8,9 @@
               required=False, help='Input genotype file.')
 @click.option('--classifier', required=True, type=click.Path(exists=True),
               help='Path of pickled classifier file.')
+@click.option('--min_proba', type=click.FloatRange(0.5, 1.0), default=0.5,
+              help='Class probability threshold for displaying predictions. '
+                   'Predictions below the threshold will be given as "N/A".')
 @click.option('--out_explain_per_sample', type=click.Path(dir_okay=False),
               help='Write SHAP explanations for each predicted sample to file (optional).')
 @click.option('--out_explain_summary', type=click.Path(dir_okay=False),
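Note on the lower bound of click.FloatRange(0.5, 1.0): for a binary classifier the predicted class always carries a probability of at least 0.5, so a smaller threshold could never suppress a prediction. A minimal sketch of using the new option through the Python API (the file names are illustrative, not part of this diff):

    from phenotrex.ml.prediction import predict

    # Hypothetical inputs; any pickled phenotrex classifier plus a matching
    # genotype file will do.
    predict(
        genotype='Sulfate_reducer.genotype',
        classifier='Sulfate_reducer.svm.pkl',
        min_proba=0.75,  # calls with class probability < 0.75 print as "N/A"
    )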
diff --git a/phenotrex/ml/cccv.py b/phenotrex/ml/cccv.py
index 0ffbbd1..ce059f5 100644
--- a/phenotrex/ml/cccv.py
+++ b/phenotrex/ml/cccv.py
@@ -124,7 +124,8 @@ def _completeness_cv(self, param, **kwargs) -> Dict[float, Dict[float, float]]:
         X_train, y_train, tn = get_x_y_tn(training_records)
         classifier.fit(X=X_train, y=y_train, **kwargs)
 
-        # initialize the resampler with the test_records only, so the samples are unknown to the classifier
+        # initialize the resampler with the test_records only,
+        # so the samples are unknown to the classifier
         resampler = TrainingRecordResampler(random_state=self.random_state, verb=False)
         resampler.fit(records=test_records)
         cv_scores = {}
@@ -147,7 +148,8 @@ def run(self, records: List[TrainingRecord]):
         """
         # TODO: run compress_vocabulary before?
 
-        self.logger.info("Begin completeness/contamination matrix crossvalidation on training data.")
+        self.logger.info(
+            "Begin completeness/contamination matrix crossvalidation on training data.")
         t1 = time()
         if self.n_jobs is None:
             cv_scores = map(self._completeness_cv, self._replicates(records, self.cv,
@@ -168,7 +170,8 @@ def run(self, records: List[TrainingRecord]):
         for comple in cv_scores_list[0].keys():
             mba[comple] = {}
             for conta in cv_scores_list[0][comple].keys():
-                single_result = [cv_scores_list[r][comple][conta] for r in range(self.n_replicates * self.cv)]
+                single_result = [cv_scores_list[r][comple][conta] for r in
+                                 range(self.n_replicates * self.cv)]
                 mean_over_fold_and_replicates = np.mean(single_result)
                 std_over_fold_and_replicates = np.std(single_result)
                 mba[comple][conta] = {"score_mean": mean_over_fold_and_replicates,
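For orientation: run() collects one score per replicate and fold, then averages with np.mean/np.std over all n_replicates * cv values per grid point. The resulting nested structure looks roughly like this (values invented for illustration):

    # completeness -> contamination -> aggregated statistics
    cccv_result = {
        0.9: {0.1: {'score_mean': 0.93, 'score_std': 0.02}},
        # one inner entry per (completeness, contamination) grid point
    }
    print(cccv_result[0.9][0.1]['score_mean'])  # -> 0.93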
diff --git a/phenotrex/ml/feature_select.py b/phenotrex/ml/feature_select.py
index 297bb38..56b1090 100644
--- a/phenotrex/ml/feature_select.py
+++ b/phenotrex/ml/feature_select.py
@@ -17,17 +17,14 @@
 
 def compress_vocabulary(records: List[TrainingRecord], pipeline: Pipeline):
     """
-    Method to group features, that store redundant information, to avoid overfitting and speed up process (in some
-    cases). Might be replaced or complemented by a feature selection method in future versions.
-
-    Compressing vocabulary is optional, for the test dataset it took 30 seconds, while the time saved later on is not
-    significant.
+    Method to group features that store redundant information,
+    to avoid overfitting and speed up the process (in some cases).
+    Might be replaced or complemented by a feature selection method in future versions.
 
     :param records: a list of TrainingRecord objects.
    :param pipeline: the targeted pipeline where the vocabulary should be modified
     :return: nothing, sets the vocabulary for CountVectorizer step
     """
-
     X, y, tn = get_x_y_tn(records)  # we actually only need X
     vec = pipeline.named_steps["vec"]
     if not vec.vocabulary:
@@ -58,8 +55,10 @@ def compress_vocabulary(records: List[TrainingRecord], pipeline: Pipeline):
     pipeline.named_steps["vec"].fixed_vocabulary_ = True
 
 
-def recursive_feature_elimination(records: List[TrainingRecord], pipeline: Pipeline, step: float = DEFAULT_STEP_SIZE,
-                                  n_features: int = None, random_state: np.random.RandomState = None):
+def recursive_feature_elimination(records: List[TrainingRecord], pipeline: Pipeline,
+                                  step: float = DEFAULT_STEP_SIZE,
+                                  n_features: int = None,
+                                  random_state: np.random.RandomState = None):
     """
     Function to apply RFE to limit the vocabulary used by the CustomVectorizer, optional step.
 
@@ -101,13 +100,15 @@ def recursive_feature_elimination(records: List[TrainingRecord], pipeline: Pipel
     support = selector.get_support()
     support = support.nonzero()[0]
     new_id = {support[x]: x for x in range(len(support))}
-    vocabulary = {feature: new_id[i] for feature, i in previous_vocabulary.items() if not new_id.get(i) is None}
+    vocabulary = {feature: new_id[i] for feature, i in previous_vocabulary.items() if
+                  not new_id.get(i) is None}
     size_after = selector.n_features_
 
     t2 = time()
-    logger.info(f"{size_after} features were selected of {original_size} using Recursive Feature Eliminiation"
-                f" in {np.round(t2 - t1, 2)} seconds.")
+    logger.info(
+        f"{size_after} features were selected of {original_size} using Recursive Feature Elimination"
+        f" in {np.round(t2 - t1, 2)} seconds.")
 
     # set vocabulary to vectorizer
     pipeline.named_steps["vec"].vocabulary = vocabulary
@@ -115,26 +116,3 @@ def recursive_feature_elimination(records: List[TrainingRecord], pipeline: Pipel
     pipeline.named_steps["vec"].fixed_vocabulary_ = True
 
     return size_after
-
-
-def multiple_step_rfecv(records: List[TrainingRecord], pipeline: Pipeline, n_features: int, step=(0.01, 0.01, 0.01, ),
-                        random_state: np.random.RandomState = None):
-    """
-    Function to apply multiple steps-sizes of RFECV in series, currently not used. Strategy might be problematic,
-    no clear benefit. #TODO rethink or remove
-
-    :param records: Data used
-    :param pipeline: The base estimator used
-    :param n_features: Goal number of features
-    :param step: List of steps that should be applied
-    :param random_state: random state for deterministic results
-    :return:
-    """
-    # step = [0.0025]
-    for s in step:
-        size_after = recursive_feature_elimination(records, pipeline=pipeline, step=s, n_features=n_features,
-                                                   random_state=random_state)
-        if size_after == n_features:
-            break
-
-    return size_after
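The subtle part of recursive_feature_elimination is the vocabulary remapping after RFE: the surviving feature indices are compacted into a gapless 0..n-1 index space. A self-contained sketch of that pattern with scikit-learn, on toy data (phenotrex wires this into its own vectorizer/pipeline instead):

    import numpy as np
    from sklearn.feature_selection import RFE
    from sklearn.svm import LinearSVC

    rng = np.random.RandomState(2)
    X = rng.rand(20, 10)                    # toy feature matrix
    y = rng.randint(0, 2, 20)               # toy binary labels
    vocabulary = {f'feat_{i}': i for i in range(10)}

    selector = RFE(LinearSVC(), n_features_to_select=5, step=0.2)
    selector.fit(X, y)
    support = selector.get_support().nonzero()[0]    # surviving column indices
    new_id = {support[x]: x for x in range(len(support))}
    # old column index -> compacted index; dropped features fall out
    vocabulary = {f: new_id[i] for f, i in vocabulary.items()
                  if new_id.get(i) is not None}

(The sketch uses the more idiomatic `is not None` where the diff keeps `not ... is None`.)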
diff --git a/phenotrex/ml/prediction.py b/phenotrex/ml/prediction.py
index 839978d..8e635d5 100644
--- a/phenotrex/ml/prediction.py
+++ b/phenotrex/ml/prediction.py
@@ -10,7 +10,7 @@
     from phenotrex.util.helpers import fail_missing_dependency as fastas_to_grs
 
 
-def predict(fasta_files=tuple(), genotype=None, classifier=None,
+def predict(fasta_files=tuple(), genotype=None, classifier=None, min_proba=0.5,
             out_explain_per_sample=None, out_explain_summary=None,
             shap_n_samples=None, n_max_explained_features=None, verb=False):
     """
@@ -71,4 +71,8 @@ def predict(fasta_files=tuple(), genotype=None, classifier=None,
     print(f"# Trait: {model.trait_name}")
     print("Identifier\tTrait present\tConfidence")
     for record, result, probability in zip(gr, preds, probas):
-        print(f"{record.identifier}\t{translate_output[result]}\t{str(round(probability[result], 4))}")
+        if probability[result] < min_proba:
+            result_disp = "N/A"
+        else:
+            result_disp = translate_output[result]
+        print(f"{record.identifier}\t{result_disp}\t{str(round(probability[result], 4))}")
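Worth noting: the confidence column is still printed for suppressed calls, so "N/A" rows stay auditable. Illustrative output only — the numbers are invented and the exact present/absent labels come from translate_output, which is outside this hunk:

    # Trait: Sulfate_reducer
    Identifier	Trait present	Confidence
    GCA_000007085.1	YES	0.9823
    GCA_000011905.1	N/A	0.5531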
""" X_agg, shap_agg, _ = self._get_feature_data() @@ -125,12 +125,11 @@ def _get_sorted_by_shap_data(self, sort_by_idx=None) -> Tuple[ absshap = np.apply_along_axis(np.abs, feature_axis, shap_agg) if sort_by_idx is None: # sort features by absolute change in shap over all classes and samples sort_criterion = np.apply_over_axes(np.sum, absshap, nonfeature_axes) - feature_sort_inds = np.squeeze(np.argsort(sort_criterion))[::-1] else: # sort features by absolute change in shap over all classes for given sample idx sort_criterion = np.sum(absshap[sort_by_idx, ...], axis=0) feature_sort_inds = np.squeeze(np.argsort(sort_criterion))[::-1] - return X_agg[:, feature_sort_inds], shap_agg[..., feature_sort_inds], \ - self._used_feature_names[feature_sort_inds] + return (X_agg[:, feature_sort_inds], shap_agg[..., feature_sort_inds], + self._used_feature_names[feature_sort_inds]) def plot_shap_force(self, sample_name: str, **kwargs): """ @@ -196,7 +195,6 @@ def get_shap_force(self, sample_name: str, n_max_features: int = 20) -> pd.DataF :param sample_name: :param n_max_features: - :param nsamples: :return: a dataframe of the n_max_features most influential features, their value in the sample, and the associated SHAP value(s). @@ -215,6 +213,7 @@ def get_shap_force(self, sample_name: str, n_max_features: int = 20) -> pd.DataF shap_vals = [shap_agg_s[i, :n_max_features].round(5), ] sample_names = [sample_name] * len(fns) df_arrs = [sample_names, fns, feature_vals, *shap_vals] + df_arrs = [np.array(x) for x in df_arrs] df_labels = ['sample', 'feature', 'feature_value', *[f'SHAP value ({x})' for x in self._class_names]][:len(df_arrs)] df = pd.DataFrame(df_arrs, index=df_labels).T @@ -223,7 +222,8 @@ def get_shap_force(self, sample_name: str, n_max_features: int = 20) -> pd.DataF def get_shap_summary(self, n_max_features: int = 50): """ - Get summary of features for all predictions, sorted by average impact of feature on shap value. + Get summary of features for all predictions, + sorted by average impact of feature on shap value. :param n_max_features: :return: a DataFrame of most important SHAP values for samples in the given dataset. diff --git a/phenotrex/structure/records.py b/phenotrex/structure/records.py index cc980e4..b83d714 100644 --- a/phenotrex/structure/records.py +++ b/phenotrex/structure/records.py @@ -1,15 +1,15 @@ # # Created by Lukas Lüftinger on 2/5/19. # -from typing import List +from typing import List, Optional from dataclasses import dataclass -"""Data structures containing Genotype and Phenotype information.""" - @dataclass class GenotypeRecord: - """ TODO add docstring """ + """ + Genomic features of a sample referenced by `identifier`. + """ identifier: str features: List[str] @@ -19,7 +19,12 @@ def __repr__(self): @dataclass class PhenotypeRecord: - """ TODO add docstring """ + """ + Ground truth labels of sample `identifier`, + indicating presence/absence of trait `trait_name`: + - 0 if trait is absent + - 1 if trait is present + """ identifier: str trait_name: str trait_sign: int @@ -30,10 +35,16 @@ def __repr__(self): @dataclass class GroupRecord: - """ TODO add docstring """ + """ + Group label of sample `identifier`. + Notes + ----- + Useful for leave-one-group-out cross-validation (LOGO-CV), + for example, to take taxonomy into account. 
+ """ identifier: str - group_name: str - group_id: int + group_name: Optional[str] + group_id: Optional[int] def __repr__(self): return f"{self.identifier} group({self.group_name})={self.group_id}" @@ -41,11 +52,15 @@ def __repr__(self): @dataclass class TrainingRecord(GenotypeRecord, PhenotypeRecord, GroupRecord): - """ TODO add docstring """ - pass - + """ + Sample containing Genotype-, Phenotype- and GroupRecords, + suitable as machine learning input for a single observation. + """ def __repr__(self): gr_repr = GenotypeRecord.__repr__(self).split(' ')[1] pr_repr = PhenotypeRecord.__repr__(self).split(' ')[1] - gro_repr = GroupRecord.__repr__(self).split(' ')[1] + if self.group_name is not None and self.group_id is not None: + gro_repr = GroupRecord.__repr__(self).split(' ')[1] + else: + gro_repr = '' return f"id={self.identifier} {' '.join([gr_repr, pr_repr, gro_repr])}" diff --git a/phenotrex/transforms/annotation.py b/phenotrex/transforms/annotation.py index 5c2ed77..4dbbfa1 100644 --- a/phenotrex/transforms/annotation.py +++ b/phenotrex/transforms/annotation.py @@ -1,4 +1,6 @@ import os +import sys +from collections import namedtuple from typing import List from pathlib import Path from pkg_resources import resource_filename @@ -7,29 +9,51 @@ from concurrent.futures import ProcessPoolExecutor import torch -from Bio.SeqIO import SeqRecord, parse -from deepnog.deepnog import load_nn, predict, set_device, create_df -from deepnog.dataset import ProteinDataset +from deepnog.inference import load_nn, predict +from deepnog.io import create_df, get_weights_path +from deepnog.utils import set_device +from deepnog.dataset import ProteinDataset, ProteinIterator +from Bio.SeqIO import SeqRecord, parse, write from tqdm.auto import tqdm from phenotrex.io.flat import load_fasta_file from phenotrex.structure.records import GenotypeRecord - -PRODIGAL_BIN_PATH = resource_filename('phenotrex', 'bin/prodigal') -DEEPNOG_WEIGHTS_PATH = resource_filename('deepnog', 'parameters/eggNOG5/2/deepencoding.pth') +PRODIGAL_BIN_SUFFIX = {'win32': 'windows.exe', 'darwin': 'osx.10.9.5'}.get(sys.platform, 'linux') +PRODIGAL_BIN_PATH = resource_filename('phenotrex', f'bin/prodigal.{PRODIGAL_BIN_SUFFIX}') DEEPNOG_ARCH = 'deepencoding' -EGGNOGDB_VERS = '5.0' +DEEPNOG_VALID_DBS = { + 'eggNOG5', +} +DEEPNOG_VALID_TAX_LEVELS = { + 1, + 2, +} + + +class PreloadedProteinIterator(ProteinIterator): + """Hack ProteinDataset to load from list directly.""" + def __init__(self, protein_list: List[SeqRecord], aa_vocab, format): + self.iterator = (x for x in protein_list) + self.vocab = aa_vocab + self.format = format + self.start = 0 + self.pos = None + self.step = 0 + self.sequence = namedtuple('sequence', + ['index', 'id', 'string', 'encoded']) class PreloadedProteinDataset(ProteinDataset): """Hack ProteinDataset to load from list directly.""" def __init__(self, protein_list: List[SeqRecord]): - try: - super().__init__(file='') - except ValueError: - pass # >:D - self.iter = (x for x in protein_list) + super().__init__(file=None) + self.protein_list = protein_list + + def __iter__(self): + return PreloadedProteinIterator(protein_list=self.protein_list, + aa_vocab=self.vocab, + format=self.f_format) def fastas_to_grs(fasta_files: List[str], verb: bool = False, @@ -40,7 +64,7 @@ def fastas_to_grs(fasta_files: List[str], verb: bool = False, :param fasta_files: a list of DNA and/or protein FASTA files to be converted into GenotypeRecords. :param verb: Whether to display progress of annotation with tqdm. 
diff --git a/phenotrex/transforms/annotation.py b/phenotrex/transforms/annotation.py
index 5c2ed77..4dbbfa1 100644
--- a/phenotrex/transforms/annotation.py
+++ b/phenotrex/transforms/annotation.py
@@ -1,4 +1,6 @@
 import os
+import sys
+from collections import namedtuple
 from typing import List
 from pathlib import Path
 from pkg_resources import resource_filename
@@ -7,29 +9,51 @@
 from concurrent.futures import ProcessPoolExecutor
 
 import torch
-from Bio.SeqIO import SeqRecord, parse
-from deepnog.deepnog import load_nn, predict, set_device, create_df
-from deepnog.dataset import ProteinDataset
+from deepnog.inference import load_nn, predict
+from deepnog.io import create_df, get_weights_path
+from deepnog.utils import set_device
+from deepnog.dataset import ProteinDataset, ProteinIterator
+from Bio.SeqIO import SeqRecord, parse, write
 from tqdm.auto import tqdm
 
 from phenotrex.io.flat import load_fasta_file
 from phenotrex.structure.records import GenotypeRecord
 
-
-PRODIGAL_BIN_PATH = resource_filename('phenotrex', 'bin/prodigal')
-DEEPNOG_WEIGHTS_PATH = resource_filename('deepnog', 'parameters/eggNOG5/2/deepencoding.pth')
+PRODIGAL_BIN_SUFFIX = {'win32': 'windows.exe', 'darwin': 'osx.10.9.5'}.get(sys.platform, 'linux')
+PRODIGAL_BIN_PATH = resource_filename('phenotrex', f'bin/prodigal.{PRODIGAL_BIN_SUFFIX}')
 DEEPNOG_ARCH = 'deepencoding'
-EGGNOGDB_VERS = '5.0'
+DEEPNOG_VALID_DBS = {
+    'eggNOG5',
+}
+DEEPNOG_VALID_TAX_LEVELS = {
+    1,
+    2,
+}
+
+
+class PreloadedProteinIterator(ProteinIterator):
+    """Hack ProteinDataset to load from list directly."""
+    def __init__(self, protein_list: List[SeqRecord], aa_vocab, format):
+        self.iterator = (x for x in protein_list)
+        self.vocab = aa_vocab
+        self.format = format
+        self.start = 0
+        self.pos = None
+        self.step = 0
+        self.sequence = namedtuple('sequence',
+                                   ['index', 'id', 'string', 'encoded'])
 
 
 class PreloadedProteinDataset(ProteinDataset):
     """Hack ProteinDataset to load from list directly."""
     def __init__(self, protein_list: List[SeqRecord]):
-        try:
-            super().__init__(file='')
-        except ValueError:
-            pass  # >:D
-        self.iter = (x for x in protein_list)
+        super().__init__(file=None)
+        self.protein_list = protein_list
+
+    def __iter__(self):
+        return PreloadedProteinIterator(protein_list=self.protein_list,
+                                        aa_vocab=self.vocab,
+                                        format=self.f_format)
 
 
 def fastas_to_grs(fasta_files: List[str], verb: bool = False,
@@ -40,7 +64,7 @@ def fastas_to_grs(fasta_files: List[str], verb: bool = False,
     :param fasta_files: a list of DNA and/or protein FASTA files to be converted into
                         GenotypeRecords.
     :param verb: Whether to display progress of annotation with tqdm.
-    :param n_threads: Number of threads to run in parallel. Default, use up to all available CPU cores.
+    :param n_threads: Number of parallel threads. By default, use all available CPU cores.
     :returns: A list of GenotypeRecords corresponding with supplied FASTA files.
     """
     n_threads = min(os.cpu_count(), n_threads) if n_threads is not None else os.cpu_count()
@@ -66,43 +90,54 @@ def fasta_to_gr(fasta_file: str, verb: bool = False) -> GenotypeRecord:
     fname = Path(str(fasta_file)).name
     seqtype, seqs = load_fasta_file(fasta_file)
     if seqtype == 'protein':
-        return annotate_with_deepnog(fname, seqs, verb)
+        return annotate_with_deepnog(fname, seqs, verb=verb)
     else:
-        return annotate_with_deepnog(fname, call_proteins(fasta_file), verb)
+        return annotate_with_deepnog(fname, call_proteins(seqs), verb=verb)
 
 
-def call_proteins(fna_file: str) -> List[SeqRecord]:
+def call_proteins(seqs: List[SeqRecord]) -> List[SeqRecord]:
     """
     Perform protein calling with prodigal.
 
-    :param fna_file: A nucleotide fasta file.
-    :returns: a list of SeqRecords suitable for annotation with deepnog.
+    :param seqs: A list of DNA fasta SeqRecords.
+    :returns: a list of protein fasta SeqRecords suitable for annotation with deepnog.
     """
-    with NamedTemporaryFile(mode='w+') as tmp_f:
-        check_call([PRODIGAL_BIN_PATH, '-i', fna_file, '-a', tmp_f.name],
-                   stderr=DEVNULL, stdout=DEVNULL)
-        tmp_f.seek(0)
-        return list(parse(tmp_f, 'fasta'))
+    with NamedTemporaryFile(mode='w', delete=False) as fna_file:
+        write(seqs, fna_file, format='fasta')
+    with NamedTemporaryFile(mode='r', delete=False) as faa_file:
+        check_call([
+            PRODIGAL_BIN_PATH,
+            '-i', fna_file.name,
+            '-a', faa_file.name
+        ], stderr=DEVNULL, stdout=DEVNULL)
+        parsed = list(parse(faa_file, 'fasta'))
+    os.unlink(fna_file.name)
+    os.unlink(faa_file.name)  # cannot re-use .name of open NamedTemporaryFile under Win32
+    return parsed
 
 
 def annotate_with_deepnog(identifier: str, protein_list: List[SeqRecord],
+                          database: str = 'eggNOG5', tax_level: int = 2,
                           verb: bool = True) -> GenotypeRecord:
     """
     Perform calling of EggNOG5 clusters on a list of SeqRecords belonging to a sample,
     using deepnog.
 
     :param identifier: The name associated with the sample.
     :param protein_list: A list of SeqRecords containing protein sequences.
-    :param verb: Whether to use tqdm for progress calculation
+    :param database: Orthologous group/family database to use.
+    :param tax_level: The NCBI taxon ID of the taxonomic level to use from the given database.
+    :param verb: Whether to print verbose progress messages.
    :returns: a GenotypeRecord suitable for use with phenotrex.
     """
     device = set_device('auto')
     torch.set_num_threads(1)
-
-    model_dict = torch.load(DEEPNOG_WEIGHTS_PATH, map_location=device)
+    weights_path = get_weights_path(
+        database=database, level=str(tax_level), architecture=DEEPNOG_ARCH,
+    )
+    model_dict = torch.load(weights_path, map_location=device)
     model = load_nn(DEEPNOG_ARCH, model_dict, device)
     class_labels = model_dict['classes']
     dataset = PreloadedProteinDataset(protein_list)
-
     preds, confs, ids, indices = predict(model, dataset, device,
                                          batch_size=1, num_workers=1,
@@ -111,6 +146,6 @@ def annotate_with_deepnog(identifier: str, protein_list: List[SeqRecord],
     if hasattr(model, 'threshold'):
         threshold = model.threshold
     df = create_df(class_labels, preds, confs, ids, indices,
-                   threshold=threshold, device=device, verbose=0)
+                   threshold=threshold, verbose=0)
     cogs = [x for x in df.prediction.unique() if x]
     return GenotypeRecord(identifier=identifier, features=cogs)
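The delete=False dance in call_proteins is the Windows-compatibility crux: per the Python documentation, a file created by NamedTemporaryFile cannot be opened a second time by name on Windows while it is still held open, so the file is closed first and cleaned up manually with os.unlink. The portable pattern, sketched on toy data:

    import os
    from tempfile import NamedTemporaryFile

    # On Windows, a NamedTemporaryFile that is still open cannot be reopened
    # by name (e.g. by the prodigal child process); closing it first and
    # deleting it manually afterwards works on all platforms.
    with NamedTemporaryFile(mode='w', delete=False, suffix='.fna') as tmp:
        tmp.write('>contig1\nATGAAATAA\n')   # toy DNA record
    # ... hand tmp.name to the external tool here, after the file is closed ...
    os.unlink(tmp.name)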
""" device = set_device('auto') torch.set_num_threads(1) - - model_dict = torch.load(DEEPNOG_WEIGHTS_PATH, map_location=device) + weights_path = get_weights_path( + database=database, level=str(tax_level), architecture=DEEPNOG_ARCH, + ) + model_dict = torch.load(weights_path, map_location=device) model = load_nn(DEEPNOG_ARCH, model_dict, device) class_labels = model_dict['classes'] dataset = PreloadedProteinDataset(protein_list) - preds, confs, ids, indices = predict(model, dataset, device, batch_size=1, num_workers=1, @@ -111,6 +146,6 @@ def annotate_with_deepnog(identifier: str, protein_list: List[SeqRecord], if hasattr(model, 'threshold'): threshold = model.threshold df = create_df(class_labels, preds, confs, ids, indices, - threshold=threshold, device=device, verbose=0) + threshold=threshold, verbose=0) cogs = [x for x in df.prediction.unique() if x] return GenotypeRecord(identifier=identifier, features=cogs) diff --git a/phenotrex/util/plotting.py b/phenotrex/util/plotting.py index 489d0af..e24cf7f 100644 --- a/phenotrex/util/plotting.py +++ b/phenotrex/util/plotting.py @@ -5,7 +5,12 @@ from typing import Dict, List, Union -def compleconta_plot(cccv_results: List[Dict[float, Dict[float, Dict[str, float]]]], +CccvType = Union[ + Dict[float, Dict[float, Dict[str, float]]], + List[Dict[float, Dict[float, Dict[str, float]]]] +] + +def compleconta_plot(cccv_results: CccvType, conditions: List[str] = (), each_n: List[int] = None, title: str = "", fontsize: int = 16, figsize=(10, 7), plot_comple: bool = True, plot_conta: bool = True, diff --git a/phenotrex/util/taxonomy.py b/phenotrex/util/taxonomy.py index 6f5a16e..c648fb2 100644 --- a/phenotrex/util/taxonomy.py +++ b/phenotrex/util/taxonomy.py @@ -10,15 +10,14 @@ def get_taxonomic_group_mapping(group_ids: List[str], selected_rank: str) -> Tuple[Dict, Dict]: """ - Function to create a mapping from NCBI-taxon ids to groups which are used to split the provided training records - into training and validation sets + Function to create a mapping from NCBI-taxon ids to groups which are used to split the provided + training records into training and validation sets :param group_ids: List of identifiers that should be NCBI taxon ids - :param selected_rank: selected standard rank determining on which level the set is split in training- and - validation-set + :param selected_rank: selected standard rank determining on which level the set is split in + training and validation-set :return: Mapping of input taxon_ids as string and groups as integers """ - ncbi = NCBITaxa() standard_ranks = ["superkingdom", "phylum", "class", "order", "family", "genus", "species"] if not selected_rank.lower() in standard_ranks: @@ -53,7 +52,5 @@ def auto_select_rank(group_ids: List[str]) -> str: :param group_ids: :return: selected rank """ - # TODO: implement some auto detection method, if needed - return DEFAULT_AUTO_SELECTED_RANK diff --git a/requirements/dev.txt b/requirements/dev.txt index bdece25..90fbdbb 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -1,4 +1,5 @@ -r prod.txt +-r extras.txt -r test.txt pip>=19.2.3 wheel>=0.33.6 diff --git a/requirements/extras.txt b/requirements/extras.txt index 0c345a9..cdb4f4e 100644 --- a/requirements/extras.txt +++ b/requirements/extras.txt @@ -1,2 +1,2 @@ -deepnog>=1.0.1 +deepnog>=1.1.0 ete3 diff --git a/requirements/prod.txt b/requirements/prod.txt index b65c1e8..0fdaab0 100644 --- a/requirements/prod.txt +++ b/requirements/prod.txt @@ -5,3 +5,4 @@ xgboost>=1.0.1 shap>=0.34 click>=7.0 biopython>=1.74 
diff --git a/requirements/win.txt b/requirements/win.txt
new file mode 100644
index 0000000..8d0b059
--- /dev/null
+++ b/requirements/win.txt
@@ -0,0 +1 @@
+torch>=1.2.0+cpu
diff --git a/setup.py b/setup.py
index 81328fd..7d85fef 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,5 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
 
-"""The setup script."""
 import codecs
 
 from os import path
diff --git a/tests/targets.py b/tests/targets.py
index 50eaa57..1f95d52 100644
--- a/tests/targets.py
+++ b/tests/targets.py
@@ -9,6 +9,10 @@
     "Sulfate_reducer": "GCA_000007085.1",
 }
 
+first_groups_accession = {
+    "Sulfate_reducer": "GCA_000007085.1",
+}
+
 cv_scores_trex = {
     'SVM': {'Sulfate_reducer': {3 : {'accuracy' : (0.9477939477939478, 0.04189200493982592),
                                      'balanced_accuracy': (0.94017094017094, 0.04262807124628204),
diff --git a/tests/test_data/Sulfate_reducer.taxids b/tests/test_data/Sulfate_reducer.taxids
new file mode 100644
index 0000000..0b33974
--- /dev/null
+++ b/tests/test_data/Sulfate_reducer.taxids
@@ -0,0 +1,115 @@
+GCA_900113335.1 857293
+GCA_000621705.1 639282
+GCA_002995805.1 643648
+GCA_000016785.1 655607
+GCA_900167305.1 656914
+GCA_000233715.3 667014
+GCA_000341895.1 690567
+GCA_001586935.1 690879
+GCA_000199675.1 697303
+GCA_000519125.1 708132
+GCA_000007085.1 720554
+GCA_000010985.1 760142
+GCA_001640105.1 767817
+GCA_000241815.1 768704
+GCA_002270055.1 768710
+GCA_900176035.1 632518
+GCA_900142105.1 858215
+GCA_000147695.3 868595
+GCA_000425265.1 871968
+GCA_001681735.1 872965
+GCA_000820845.2 880073
+GCA_000204645.1 891968
+GCA_900104045.1 926569
+GCA_000214435.1 935948
+GCA_900129935.1 994573
+GCA_000425245.1 999894
+GCA_900129975.1 1120989
+GCA_003057965.1 1121270
+GCA_000601455.1 1121301
+GCA_000321415.2 390874
+GCA_001730685.1 119641
+GCA_000217795.1 157592
+GCA_000378005.1 161156
+GCA_003201285.1 206665
+GCA_001306175.1 215580
+GCA_000166335.1 224999
+GCA_000023325.1 243164
+GCA_900167165.1 272564
+GCA_000429925.1 273068
+GCA_000016065.1 281362
+GCA_000231385.3 323850
+GCA_000010665.1 335543
+GCA_900167065.1 345632
+GCA_900130025.1 379347
+GCA_001687335.1 1121392
+GCA_900142275.1 401526
+GCA_000266925.1 404380
+GCA_002005145.1 416591
+GCA_000429345.1 419481
+GCA_900106765.1 454188
+GCA_900142245.1 454194
+GCA_000746255.1 500633
+GCA_002797735.1 520767
+GCA_001748245.1 521045
+GCA_000194135.1 522772
+GCA_900096945.1 573370
+GCA_000430885.1 596154
+GCA_000495435.3 598659
+GCA_000422245.1 1408160
+GCA_000711735.1 1185843
+GCA_000817955.1 1255043
+GCA_900142025.1 1259795
+GCA_000237085.1 1266720
+GCA_000482745.1 1294023
+GCA_900112485.1 1303518
+GCA_001311885.1 1304874
+GCA_000421585.1 1304885
+GCA_001047035.1 1322246
+GCA_000021725.1 1323370
+GCA_001293685.1 1346286
+GCA_000244895.1 1348163
+GCA_000828675.1 1390249
+GCA_001968985.1 1403537
+GCA_001672295.1 1168067
+GCA_000189775.3 1408473
+GCA_002895565.1 1429043
+GCA_002924615.1 1478221
+GCA_001551835.1 1560234
+GCA_900142145.1 1564487
+GCA_000231405.3 1619234
+GCA_900112235.1 1630136
+GCA_000156055.1 1716143
+GCA_001485475.2 1763509
+GCA_002933415.1 1763535
+GCA_000017865.1 1794699
+GCA_000092405.1 1795632
+GCA_000014965.1 1833852
+GCA_000983115.1 1121467
+GCA_000297115.1 1121394
+GCA_002724055.1 1121406
+GCA_000020725.1 1121420
+GCA_001613545.1 1121422
+GCA_900129915.1 1121424
+GCA_001652585.1 1121428
+GCA_003315275.1 1121432
+GCA_000429905.1 1121436
+GCA_000526715.1 1121439
+GCA_000021925.1 1121442
+GCA_000025725.1 1121447
+GCA_900101345.1 1121451
+GCA_000169155.1 1121457
+GCA_000526375.1 1914305
+GCA_000931935.2 1121468
+GCA_900129895.1 1121911
+GCA_001563225.1 1121917
+GCA_000011905.1 1121919
+GCA_001601575.1 1122184
+GCA_001595645.1 1122252
+GCA_001742305.1 1122953
+GCA_000426225.1 1122954
+GCA_000383875.1 1123281
+GCA_001184205.1 1123350
+GCA_000427095.1 1123352
+GCA_000427425.1 1123371
+GCA_001642325.1 1156395
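The new .taxids fixture is the groups file for LOGO-CV: two whitespace-separated columns, genome accession and NCBI taxon ID. It feeds the groups_file parameter of load_training_files, as exercised in tests/test_ml.py below; a minimal loading sketch (paths illustrative):

    from phenotrex.io.flat import load_training_files

    training_records, genotype, phenotype, group = load_training_files(
        genotype_file='Sulfate_reducer.genotype',
        phenotype_file='Sulfate_reducer.phenotype',
        groups_file='Sulfate_reducer.taxids',   # accession, NCBI taxon ID
        verb=True)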
diff --git a/tests/test_ml.py b/tests/test_ml.py
index a392737..941e60a 100644
--- a/tests/test_ml.py
+++ b/tests/test_ml.py
@@ -1,5 +1,3 @@
-from tempfile import TemporaryDirectory
-from pathlib import Path
 import sys
 import json
 
@@ -7,22 +5,25 @@
 import numpy as np
 import matplotlib as mpl
 mpl.use('Agg')
+from pathlib import Path
+from tempfile import TemporaryDirectory
 
-from tests.targets import (first_genotype_accession, first_phenotype_accession, cv_scores_trex,
-                           num_of_features_compressed, num_of_features_uncompressed)
-from phenotrex.io.flat import load_training_files
-from phenotrex.io.serialization import load_classifier
-from phenotrex.ml.shap_handler import ShapHandler
+from tests.targets import (first_genotype_accession, first_phenotype_accession, first_groups_accession,
+                           cv_scores_trex, num_of_features_compressed, num_of_features_uncompressed)
+from phenotrex.io.flat import (load_training_files,
+                               write_weights_file, write_params_file, write_misclassifications_file)
+from phenotrex.io.serialization import save_classifier
 from phenotrex.ml import TrexSVM, TrexXGB
 from phenotrex.util.helpers import get_x_y_tn
 from phenotrex.ml.feature_select import recursive_feature_elimination, compress_vocabulary
 from phenotrex.ml.prediction import predict
-
 from . import DATA_PATH, FROM_FASTA
 
+
 RANDOM_STATE = 2
+
 trait_names = [
     "Sulfate_reducer",
     # "Aerobe",
@@ -71,12 +72,15 @@ def test_load_training_files(self, trait_name):
         """
         full_path_genotype = DATA_PATH / f"{trait_name}.genotype"
         full_path_phenotype = DATA_PATH / f"{trait_name}.phenotype"
+        full_path_groups = DATA_PATH / f"{trait_name}.taxids"
         training_records, genotype, phenotype, group = load_training_files(
             genotype_file=full_path_genotype,
             phenotype_file=full_path_phenotype,
+            groups_file=full_path_groups,
             verb=True)
         assert genotype[0].identifier == first_genotype_accession[trait_name]
         assert phenotype[0].identifier == first_phenotype_accession[trait_name]
+        assert group[0].identifier == first_groups_accession[trait_name]
         return training_records, genotype, phenotype, group
 
@@ -91,28 +95,46 @@ def test_train(self, trait_name, classifier, use_shaps):
         """
         training_records, genotype, phenotype, group = self.test_load_training_files(trait_name)
         clf = classifier(verb=True, random_state=RANDOM_STATE)
-        _ = clf.train(records=training_records, train_explainer=use_shaps)
+        clf.train(records=training_records, train_explainer=use_shaps)
+        with TemporaryDirectory() as tmpdir:
+            clf_path = Path(tmpdir)/'classifier.pkl'
+            weights_path = Path(tmpdir)/'weights.rank'
+            save_classifier(clf, clf_path)
+            weights = clf.get_feature_weights()
+            write_weights_file(weights_file=weights_path, weights=weights)
+            assert clf_path.is_file()
+            assert weights_path.is_file()
 
     @pytest.mark.parametrize("trait_name", trait_names, ids=trait_names)
     @pytest.mark.parametrize("cv", cv_folds, ids=[str(x) for x in cv_folds])
-    @pytest.mark.parametrize("scoring", scoring_methods, ids=scoring_methods)
     @pytest.mark.parametrize("classifier", classifiers, ids=classifier_ids)
-    def test_crossvalidate(self, trait_name, cv, scoring, classifier):
+    @pytest.mark.parametrize("use_groups", [True, False], ids=['logo', 'nologo'])
+    def test_crossvalidate(self, trait_name, cv, classifier, use_groups):
         """
-        Test default crossvalidation of PICASVM class. Using several different traits, cv folds, and scoring methods.
+        Test default crossvalidation of TrexClassifier class.
+        Using several different traits, cv folds, and scoring methods.
         Compares with dictionary cv_scores.
+
         :param trait_name:
         :param cv:
         :param scoring:
         :param classifier:
+        :param use_groups:
         :return:
         """
         training_records, genotype, phenotype, group = self.test_load_training_files(trait_name)
         clf = classifier(verb=True, random_state=RANDOM_STATE)
-        score_pred = clf.crossvalidate(records=training_records, cv=cv, scoring=scoring)[:2]
-        if classifier.identifier in cv_scores_trex:
-            score_target = cv_scores_trex[classifier.identifier][trait_name][cv][scoring]
+        score_pred = clf.crossvalidate(records=training_records,
+                                       cv=cv, scoring=scoring_methods[0],
+                                       groups=use_groups)[:2]
+        if classifier.identifier in cv_scores_trex and not use_groups:
+            score_target = cv_scores_trex[classifier.identifier][trait_name][cv][scoring_methods[0]]
             np.testing.assert_almost_equal(actual=score_pred, desired=score_target, decimal=1)
+        with TemporaryDirectory() as tmpdir:
+            misclass_path = Path(tmpdir)/'misclassifications.tsv'
+            write_misclassifications_file(misclass_path, training_records,
+                                          score_pred, use_groups=use_groups)
+            assert misclass_path.is_file()
 
     @pytest.mark.parametrize("trait_name", trait_names, ids=trait_names)
     @pytest.mark.parametrize("classifier", classifiers, ids=classifier_ids)
@@ -126,10 +148,13 @@ def test_parameter_search(self, trait_name, classifier):
         """
         training_records, genotype, phenotype, group = self.test_load_training_files(trait_name)
         clf = classifier(verb=True, random_state=RANDOM_STATE)
-        clf_opt = clf.parameter_search(records=training_records, n_iter=5, return_optimized=True)
-        assert type(clf_opt) == type(clf)
+        clf_opt = clf.parameter_search(records=training_records, n_iter=5, return_optimized=False)
+        assert isinstance(clf_opt, dict)
+        with TemporaryDirectory() as tmpdir:
+            param_path = Path(tmpdir)/'params.json'
+            write_params_file(param_path, clf_opt)
+            assert param_path.is_file()
 
-    @pytest.mark.skipif(sys.platform != "linux", reason="Stallman was right")
     @pytest.mark.parametrize("trait_name", trait_names, ids=trait_names)
     @pytest.mark.parametrize("classifier", classifiers, ids=classifier_ids)
     def test_compleconta_cv(self, trait_name, classifier):
@@ -248,7 +273,6 @@ def test_recursive_feature_elimination(self, trait_name, n_features):
         for x in non_zero:
             if len(x) == 0:
                 one_is_zero = True
-
         assert not one_is_zero
 
     @pytest.mark.parametrize('trait_name', trait_names, ids=trait_names)
@@ -258,7 +282,6 @@ def test_predict_from_genotype(self, trait_name, classifier_type):
         genotype_file = DATA_PATH/f'{trait_name}.genotype'
         print(predict(classifier=model_path, genotype=genotype_file))
 
-
     @pytest.mark.skipif(not FROM_FASTA, reason='Missing optional dependencies')
     @pytest.mark.parametrize('trait_name', trait_names, ids=trait_names)
     @pytest.mark.parametrize('fasta_files', predict_files, ids=['fna', 'faa', 'fna+faa'])
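The new use_groups parametrization only compares against the recorded target scores in the ungrouped case, which makes sense: leave-one-group-out folds change the split and therefore the score. A sketch of the grouped call (the scoring string is illustrative; training_records as loaded via load_training_files with a groups_file):

    from phenotrex.ml import TrexXGB

    clf = TrexXGB(verb=True, random_state=2)
    # With groups=True, CV folds keep each taxonomic group intact,
    # leave-one-group-out style; requires records loaded with a groups_file.
    mean_score, std_score = clf.crossvalidate(
        records=training_records, cv=5,
        scoring='balanced_accuracy', groups=True)[:2]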
diff --git a/tests/test_transforms.py b/tests/test_transforms.py
index 91c065d..4617326 100644
--- a/tests/test_transforms.py
+++ b/tests/test_transforms.py
@@ -1,6 +1,9 @@
 import pytest
 
+from Bio.SeqRecord import SeqRecord
+
 from phenotrex.transforms.resampling import TrainingRecordResampler
+from phenotrex.transforms.annotation import call_proteins, load_fasta_file
 from phenotrex.io.flat import load_training_files
 from phenotrex.structure.records import GenotypeRecord
 
@@ -31,6 +34,14 @@ def test_resampling(trait_name):
     trr.get_resampled(td[0], comple=.5, conta=.5)
 
 
+@pytest.mark.skipif(not FROM_FASTA, reason='Missing optional dependencies')
+def test_call_proteins():
+    seqtype, seqs = load_fasta_file(predict_files[1])
+    proteins = call_proteins(seqs)
+    assert isinstance(proteins[0], SeqRecord)
+    print(proteins[0])
+
+
 @pytest.mark.skipif(not FROM_FASTA, reason='Missing optional dependencies')
 @pytest.mark.parametrize('infile', predict_files, ids=['fna-gz', 'fna', 'faa-gz', 'faa'])
 def test_compute_genotype(infile):