From 6b9767d7d70a4e8d7803dbb748a2116b7c4777ce Mon Sep 17 00:00:00 2001 From: jonbrenas <51911846+jonbrenas@users.noreply.github.com> Date: Fri, 13 Dec 2024 10:11:26 +0000 Subject: [PATCH] More tests and some trimming of the hierarchy in anoph --- malariagen_data/anoph/dipclust.py | 6 +- malariagen_data/anopheles.py | 9 --- tests/anoph/test_dipclust.py | 102 ++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 11 deletions(-) diff --git a/malariagen_data/anoph/dipclust.py b/malariagen_data/anoph/dipclust.py index fd0a6ac52..ac15eafdd 100644 --- a/malariagen_data/anoph/dipclust.py +++ b/malariagen_data/anoph/dipclust.py @@ -23,14 +23,16 @@ cnv_params, ) from .snp_frq import AnophelesSnpFrequencyAnalysis -from .cnv_data import AnophelesCnvData +from .cnv_frq import AnophelesCnvFrequencyAnalysis AA_CHANGE_QUERY = ( "effect in ['NON_SYNONYMOUS_CODING', 'START_LOST', 'STOP_LOST', 'STOP_GAINED']" ) -class AnophelesDipClustAnalysis(AnophelesSnpFrequencyAnalysis, AnophelesCnvData): +class AnophelesDipClustAnalysis( + AnophelesCnvFrequencyAnalysis, AnophelesSnpFrequencyAnalysis +): def __init__( self, **kwargs, diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index d932b39ca..8dd86da8f 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -12,11 +12,6 @@ import plotly.graph_objects as go # type: ignore from numpydoc_decorator import doc # type: ignore -from malariagen_data.anoph.snp_frq import ( - AnophelesSnpFrequencyAnalysis, -) - -from .anoph.cnv_frq import AnophelesCnvFrequencyAnalysis from .anoph import ( aim_params, @@ -31,7 +26,6 @@ ) from .anoph.aim_data import AnophelesAimData from .anoph.base import AnophelesBase -from .anoph.cnv_data import AnophelesCnvData from .anoph.genome_features import AnophelesGenomeFeaturesData from .anoph.genome_sequence import AnophelesGenomeSequenceData from .anoph.hap_data import AnophelesHapData, hap_params @@ -87,8 +81,6 @@ class AnophelesDataResource( AnophelesH12Analysis, AnophelesG123Analysis, AnophelesFstAnalysis, - AnophelesCnvFrequencyAnalysis, - AnophelesSnpFrequencyAnalysis, AnophelesHapFrequencyAnalysis, AnophelesDistanceAnalysis, AnophelesPca, @@ -97,7 +89,6 @@ class AnophelesDataResource( AnophelesAimData, AnophelesHapData, AnophelesSnpData, - AnophelesCnvData, AnophelesSampleMetadata, AnophelesGenomeFeaturesData, AnophelesGenomeSequenceData, diff --git a/tests/anoph/test_dipclust.py b/tests/anoph/test_dipclust.py index 09a3f11a2..1c1de419e 100644 --- a/tests/anoph/test_dipclust.py +++ b/tests/anoph/test_dipclust.py @@ -7,6 +7,14 @@ from malariagen_data.anoph.dipclust import AnophelesDipClustAnalysis +def random_transcripts_contig(*, api, contig, n): + df_gff = api.genome_features(attributes=["ID", "Parent"]) + df_transcripts = df_gff.query(f"type == 'mRNA' and contig == '{contig}'") + transcript_ids = df_transcripts["ID"].dropna().to_list() + transcripts = random.sample(transcript_ids, n) + return transcripts + + @pytest.fixture def ag3_sim_api(ag3_sim_fixture): return AnophelesDipClustAnalysis( @@ -98,3 +106,97 @@ def test_plot_diplotype_clustering( # Run checks. api.plot_diplotype_clustering(**dipclust_params) + + +@pytest.mark.parametrize("distance_metric", ["cityblock", "euclidean"]) +@parametrize_with_cases("fixture,api", cases=".") +def test_plot_diplotype_clustering_advanced( + fixture, api: AnophelesDipClustAnalysis, distance_metric +): + # Set up test parameters. + all_sample_sets = api.sample_sets()["sample_set"].to_list() + linkage_methods = ( + "single", + "complete", + "average", + "weighted", + "centroid", + "median", + "ward", + ) + sample_queries = (None, "sex_call == 'F'") + dipclust_params = dict( + region=fixture.random_region_str(region_size=5000), + sample_sets=[random.choice(all_sample_sets)], + linkage_method=random.choice(linkage_methods), + distance_metric=distance_metric, + sample_query=random.choice(sample_queries), + show=False, + ) + + # Run checks. + api.plot_diplotype_clustering_advanced(**dipclust_params) + + +@pytest.mark.parametrize("n", [1, 2]) +@parametrize_with_cases("fixture,api", cases=".") +def test_plot_diplotype_clustering_advanced_with_transcript( + fixture, api: AnophelesDipClustAnalysis, n +): + # Set up test parameters. + contig = fixture.random_contig() + transcripts = random_transcripts_contig(api=api, contig=contig, n=n) + all_sample_sets = api.sample_sets()["sample_set"].to_list() + linkage_methods = ( + "single", + "complete", + "average", + "weighted", + "centroid", + "median", + "ward", + ) + sample_queries = (None, "sex_call == 'F'") + dipclust_params = dict( + region=contig, + snp_transcripts=transcripts, + sample_sets=[random.choice(all_sample_sets)], + linkage_method=random.choice(linkage_methods), + distance_metric="cityblock", + sample_query=random.choice(sample_queries), + show=False, + ) + + # Run checks. + api.plot_diplotype_clustering_advanced(**dipclust_params) + + +@parametrize_with_cases("fixture,api", cases=".") +def test_plot_diplotype_clustering_advanced_with_cnv_region( + fixture, api: AnophelesDipClustAnalysis +): + # Set up test parameters. + region = fixture.random_region_str(region_size=5000) + all_sample_sets = api.sample_sets()["sample_set"].to_list() + linkage_methods = ( + "single", + "complete", + "average", + "weighted", + "centroid", + "median", + "ward", + ) + sample_queries = (None, "sex_call == 'F'") + dipclust_params = dict( + region=region, + cnv_region=region, + sample_sets=[random.choice(all_sample_sets)], + linkage_method=random.choice(linkage_methods), + distance_metric="cityblock", + sample_query=random.choice(sample_queries), + show=False, + ) + + # Run checks. + api.plot_diplotype_clustering_advanced(**dipclust_params)