From a031c7e089fb7cc80e9e1fc5ea6a3a9f74dd86a5 Mon Sep 17 00:00:00 2001 From: Adam Date: Thu, 3 Aug 2023 19:21:09 +0200 Subject: [PATCH 1/3] Bump altair and loguru versions --- eds_scikit/biology/viz/plot.py | 7 +++---- eds_scikit/biology/viz/wrapper.py | 6 +++--- eds_scikit/plot/age_pyramid.py | 4 ++-- eds_scikit/plot/event_sequences.py | 4 ++-- pyproject.toml | 4 ++-- tests/test_age_pyramid.py | 4 ++-- tests/test_event_sequences.py | 2 +- 7 files changed, 15 insertions(+), 16 deletions(-) diff --git a/eds_scikit/biology/viz/plot.py b/eds_scikit/biology/viz/plot.py index 9c001d25..971a2e0f 100644 --- a/eds_scikit/biology/viz/plot.py +++ b/eds_scikit/biology/viz/plot.py @@ -4,7 +4,6 @@ import altair as alt import pandas as pd -from altair.vegalite.v4.api import VConcatChart as AltChart from IPython.display import display from loguru import logger from pretty_html_table import build_table @@ -15,7 +14,7 @@ def plot_concepts_set( concepts_set_name: str, source_path: str = "Biology_summary", -) -> Union[AltChart, pd.DataFrame]: +) -> Union[alt.ConcatChart, pd.DataFrame]: """Plot and save a summary table and 2 interactive dashboards. For more details, have a look on the [visualization section][visualization] Parameters @@ -27,7 +26,7 @@ def plot_concepts_set( Returns ------- - List[AltChart, pd.DataFrame] + List[alt.ConcatChart, pd.DataFrame] Altair plots describing the volumetric and the distribution properties of your biological data along with a pandas DataFrame with a statistical summary """ if os.path.isdir("{}/{}".format(source_path, concepts_set_name)): @@ -80,7 +79,7 @@ def plot_concepts_set( def _save_and_display_chart( - chart: AltChart, source_path: str, concepts_set_name: str, chart_name: str + chart: alt.ConcatChart, source_path: str, concepts_set_name: str, chart_name: str ): chart.display() chart.save("{}/{}/{}.html".format(source_path, concepts_set_name, chart_name)) diff --git a/eds_scikit/biology/viz/wrapper.py b/eds_scikit/biology/viz/wrapper.py index f243134c..3e7ea8f6 100644 --- a/eds_scikit/biology/viz/wrapper.py +++ b/eds_scikit/biology/viz/wrapper.py @@ -3,8 +3,8 @@ from shutil import rmtree from typing import List, Tuple, Union +import altair as alt import pandas as pd -from altair.vegalite.v4.api import VConcatChart as AltChart from loguru import logger from eds_scikit.biology.utils.process_concepts import ( @@ -32,7 +32,7 @@ def plot_biology_summary( standard_concept_regex: dict = default_standard_concept_regex, pd_limit_size: int = 100000, stats_only: bool = False, -) -> Union[AltChart, pd.DataFrame]: +) -> Union[alt.ConcatChart, pd.DataFrame]: """It aggregates, plots and saves all the concepts-sets in folders. @@ -65,7 +65,7 @@ def plot_biology_summary( Returns ------- - List[AltChart, pd.DataFrame] + List[alt.ConcatChart, pd.DataFrame] Altair plots describing the volumetric and the distribution properties of your biological data along with a pandas DataFrame with a statistical summary """ diff --git a/eds_scikit/plot/age_pyramid.py b/eds_scikit/plot/age_pyramid.py index 471171f6..cbbbd5ce 100644 --- a/eds_scikit/plot/age_pyramid.py +++ b/eds_scikit/plot/age_pyramid.py @@ -17,7 +17,7 @@ def plot_age_pyramid( person: DataFrame, datetime_ref: datetime = None, return_array: bool = False, -) -> Tuple[alt.Chart, Series]: +) -> Tuple[alt.ConcatChart, Series]: """Plot an age pyramid from a 'person' pandas DataFrame. Parameters @@ -46,7 +46,7 @@ def plot_age_pyramid( Returns ------- - chart : alt.Chart, + chart : alt.ConcatChart, If savefig set to True, returns None. group_gender_age : Series, diff --git a/eds_scikit/plot/event_sequences.py b/eds_scikit/plot/event_sequences.py index 9c2b4730..113e421b 100644 --- a/eds_scikit/plot/event_sequences.py +++ b/eds_scikit/plot/event_sequences.py @@ -24,7 +24,7 @@ def plot_event_sequences( bar_height: Optional[int] = 20, title: Optional[str] = None, seed: Optional[int] = 0, -) -> alt.Chart: +) -> alt.VConcatChart: """ Plots individual sequences from an events DataFrame. Each event must be recorded with a start date, a name and a `person_id`. Events can be both one-time (only start date given) or longitudinal (both start and end dates). @@ -74,7 +74,7 @@ def plot_event_sequences( Returns ------- - chart: alt.Chart + chart: alt.VConcatChart Chart with the plotted individual event sequences. """ rng = np.random.RandomState(seed) diff --git a/pyproject.toml b/pyproject.toml index de861ddb..0bc9fc13 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,8 +37,8 @@ dependencies = [ "pandas>=1.3.0, <2.0.0", "numpy>=1.0.0, <1.20", "koalas>=1.8.1, <2.0.0", - "altair>=4.2.0, <5.0.0", - "loguru>=0.6.0, <0.7.0", + "altair>=5.0.0, <6.0.0", + "loguru==0.7.0", "pypandoc==1.7.5", "pyspark==2.4.3", "pyarrow==0.17.0", #"pyarrow>=0.10, <0.17.0", diff --git a/tests/test_age_pyramid.py b/tests/test_age_pyramid.py index 3bc1e458..bfb77d4b 100644 --- a/tests/test_age_pyramid.py +++ b/tests/test_age_pyramid.py @@ -27,7 +27,7 @@ def test_plot_age_pyramid(datetime_ref): original_person = person_with_inclusion_date.copy() chart = plot_age_pyramid(person_with_inclusion_date, datetime_ref) - assert isinstance(chart, alt.vegalite.v4.api.ConcatChart) + assert isinstance(chart, alt.ConcatChart) # Check that the data is unchanged assert_frame_equal(original_person, person_with_inclusion_date) @@ -36,7 +36,7 @@ def test_plot_age_pyramid(datetime_ref): def test_age_pyramid_output(): chart = plot_age_pyramid(data.person) - assert isinstance(chart, alt.vegalite.v4.api.ConcatChart) + assert isinstance(chart, alt.ConcatChart) group_gender_age = plot_age_pyramid(data.person, return_array=True) assert isinstance(group_gender_age, Series) diff --git a/tests/test_event_sequences.py b/tests/test_event_sequences.py index fe11047a..70ed977e 100644 --- a/tests/test_event_sequences.py +++ b/tests/test_event_sequences.py @@ -49,4 +49,4 @@ def test_event_sequences( same_x_axis_scale=same_x_axis_scale, title=title, ) - assert type(chart) == alt.vegalite.v4.api.VConcatChart + assert type(chart) == alt.VConcatChart From ee832e09dc7f150e9a06cb46b37c23263383328f Mon Sep 17 00:00:00 2001 From: Adam Date: Fri, 4 Aug 2023 17:58:55 +0200 Subject: [PATCH 2/3] Cap mkdocs version --- mkdocs.yml | 2 +- pyproject.toml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/mkdocs.yml b/mkdocs.yml index 1b14cc30..db7dfea3 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -96,7 +96,7 @@ plugins: module_name: docs/macros - bibtex: #bib_file: "docs/references.bib" - bib_dir: "./" + bib_dir: "./eds_scikit" - gen-files: scripts: - docs/generate_reference.py diff --git a/pyproject.toml b/pyproject.toml index 0bc9fc13..0a5310c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,6 +83,7 @@ doc = [ "jinja2==3.0.3", "mike==1.1.2", "nbformat==5.7.0", + "mkdocs<1.5.0", "mkdocs-autorefs==0.3.1", "mkdocs-bibtex==2.8.16", "mkdocs-charts-plugin==0.0.8", From 469449055359ccb33ea3f0954de0f7826b8a6430 Mon Sep 17 00:00:00 2001 From: Adam Date: Mon, 7 Aug 2023 18:29:34 +0200 Subject: [PATCH 3/3] Fix breaking changes altair v5 --- .../Custom_entity/stats_summary.csv | 3 - .../Custom_entity/stats_summary.html | 63 -- .../Protein_Quantitative/stats_summary.csv | 14 - .../Protein_Quantitative/stats_summary.html | 272 ------ docs/functionalities/biology/tutorial.ipynb | 21 +- eds_scikit/biology/viz/plot.py | 825 ++++-------------- eds_scikit/plot/age_pyramid.py | 4 +- 7 files changed, 167 insertions(+), 1035 deletions(-) delete mode 100644 docs/functionalities/biology/Biology_summary/Custom_entity/stats_summary.csv delete mode 100644 docs/functionalities/biology/Biology_summary/Custom_entity/stats_summary.html delete mode 100644 docs/functionalities/biology/Biology_summary/Protein_Quantitative/stats_summary.csv delete mode 100644 docs/functionalities/biology/Biology_summary/Protein_Quantitative/stats_summary.html diff --git a/docs/functionalities/biology/Biology_summary/Custom_entity/stats_summary.csv b/docs/functionalities/biology/Biology_summary/Custom_entity/stats_summary.csv deleted file mode 100644 index 7e357a17..00000000 --- a/docs/functionalities/biology/Biology_summary/Custom_entity/stats_summary.csv +++ /dev/null @@ -1,3 +0,0 @@ -LOINC_concept_code,AnaBio_concept_code,LOINC_concept_name,AnaBio_concept_name,unit_source_value,count,mean,std,min,25%,50%,75%,max,MAD,max_threshold,min_threshold -1751-7,C2102,Albumine [Masse/Volume] Sérum/Plasma - Numérique,Albumine_Sérum_Colorimétrie_g/L,g/l,650,30.971,9.399,10.449,28.504,26.629,36.839,43.274,6.75,81.949,0.0 -1751-7,G6616,Albumine [Masse/Volume] Sérum/Plasma - Numérique,Albumine_Sérum_Turbidimétrie_g/L,g/l,1356,25.287,7.81,12.934,27.492,28.8,32.822,57.656,7.074,84.334,0.0 diff --git a/docs/functionalities/biology/Biology_summary/Custom_entity/stats_summary.html b/docs/functionalities/biology/Biology_summary/Custom_entity/stats_summary.html deleted file mode 100644 index 74fc751b..00000000 --- a/docs/functionalities/biology/Biology_summary/Custom_entity/stats_summary.html +++ /dev/null @@ -1,63 +0,0 @@ -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
LOINC_concept_codeAnaBio_concept_codeLOINC_concept_nameAnaBio_concept_nameunit_source_valuecountmeanstdmin25%50%75%maxMADmax_thresholdmin_threshold
01751-7C2102Albumine [Masse/Volume] Sérum/Plasma - NumériqueAlbumine_Sérum_Colorimétrie_g/Lg/l65030.9719.39910.44928.50426.62936.83943.2746.7581.9490.0
11751-7G6616Albumine [Masse/Volume] Sérum/Plasma - NumériqueAlbumine_Sérum_Turbidimétrie_g/Lg/l135625.2877.8112.93427.49228.832.82257.6567.07484.3340.0

diff --git a/docs/functionalities/biology/Biology_summary/Protein_Quantitative/stats_summary.csv b/docs/functionalities/biology/Biology_summary/Protein_Quantitative/stats_summary.csv deleted file mode 100644 index f777ea8d..00000000 --- a/docs/functionalities/biology/Biology_summary/Protein_Quantitative/stats_summary.csv +++ /dev/null @@ -1,14 +0,0 @@ -LOINC_concept_code,AnaBio_concept_code,LOINC_concept_name,AnaBio_concept_name,unit_source_value,count,mean,std,min,25%,50%,75%,max,MAD,max_threshold,min_threshold -2885-2,A0249,Prot SerPl-mCnc,Protéines_Sérum_g/L,g/l,6021,77.286,8.321,24.819,65.504,61.279,85.818,104.826,8.924,103.919,23.073 -2885-2,A0250,Prot SerPl-mCnc,Protéines_Sérum_Electrophorèse_g/L,g/l,1176,59.705,7.609,24.735,47.535,84.605,90.445,137.543,7.131,91.838,32.455 -2885-2,A7347,Prot SerPl-mCnc,Protéines_Plasma_g/L,g/l,12421,51.113,8.548,22.551,63.876,58.16,77.023,95.262,8.17,86.654,33.378 -2885-2,B9417,Prot SerPl-mCnc,Protéines_Sérum_Colorimétrie_g/L,g/l,601,56.906,12.196,32.205,55.82,56.61,69.69,79.671,7.919,121.822,31.16 -2885-2,C9874,Prot SerPl-mCnc,Protéines_Sérum_Electrophorèse 2_g/L,g/l,169,54.237,6.402,54.82,51.428,76.413,74.323,84.257,8.145,124.186,34.603 -2885-2,D0058,Prot SerPl-mCnc,Protéines Après dialyse_Sérum/Plasma_g/L,g/l,51,64.92,4.699,52.023,71.595,61.444,78.434,76.351,4.502,73.379,39.551 -2885-2,F2624,Prot SerPl-mCnc,Protéines Pédiatrique_Sérum/Plasma_g/L,g/l,3,58.934,11.768,45.364,40.882,54.139,59.366,84.88,11.952,77.996,5.854 -2885-2,F5122,Prot SerPl-mCnc,Protéines Duplication A7347_Plasma_g/L,g/l,213,80.395,6.134,40.129,69.549,66.73,85.024,110.905,8.824,113.764,38.456 -2888-6,A1694,Protéines [Masse/Volume] Urine - Numérique,Protéines_Urines 24h_g/L,g/l,193,2.343,4.262,0.063,0.089,0.257,1.62,52.679,0.162,1.275,0.0 -2888-6,A1695,Protéines [Masse/Volume] Urine - Numérique,Protéines_Urines_g/L,g/l,2300,0.648,1.621,0.0,0.076,0.181,0.428,35.934,0.144,0.76,0.0 -2888-6,C9990,Non Renseigné,Protéines Duplication A1695_Urines_g/L,g/l,13,0.227,0.478,0.058,0.057,0.056,0.109,1.687,0.043,0.211,0.0 -2888-6,D0064,Non Renseigné,Protéines Sonde vésicale_Urines_g/L,g/l,2,0.389,0.368,0.138,0.29,0.378,0.501,0.643,0.272,1.576,0.0 -2888-6,J7268,Protéines [Masse/Volume] Urine - Numérique,Protéines Triplication A1695_Urines_g/L,g/l,115,0.808,0.961,0.038,0.054,0.198,0.505,6.025,0.286,1.482,0.0 diff --git a/docs/functionalities/biology/Biology_summary/Protein_Quantitative/stats_summary.html b/docs/functionalities/biology/Biology_summary/Protein_Quantitative/stats_summary.html deleted file mode 100644 index 33c40e99..00000000 --- a/docs/functionalities/biology/Biology_summary/Protein_Quantitative/stats_summary.html +++ /dev/null @@ -1,272 +0,0 @@ -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
LOINC_concept_codeAnaBio_concept_codeLOINC_concept_nameAnaBio_concept_nameunit_source_valuecountmeanstdmin25%50%75%maxMADmax_thresholdmin_threshold
02885-2A0249Prot SerPl-mCncProtéines_Sérum_g/Lg/l602177.2868.32124.81965.50461.27985.818104.8268.924103.91923.073
12885-2A0250Prot SerPl-mCncProtéines_Sérum_Electrophorèse_g/Lg/l117659.7057.60924.73547.53584.60590.445137.5437.13191.83832.455
22885-2A7347Prot SerPl-mCncProtéines_Plasma_g/Lg/l1242151.1138.54822.55163.87658.1677.02395.2628.1786.65433.378
32885-2B9417Prot SerPl-mCncProtéines_Sérum_Colorimétrie_g/Lg/l60156.90612.19632.20555.8256.6169.6979.6717.919121.82231.16
42885-2C9874Prot SerPl-mCncProtéines_Sérum_Electrophorèse 2_g/Lg/l16954.2376.40254.8251.42876.41374.32384.2578.145124.18634.603
52885-2D0058Prot SerPl-mCncProtéines Après dialyse_Sérum/Plasma_g/Lg/l5164.924.69952.02371.59561.44478.43476.3514.50273.37939.551
62885-2F2624Prot SerPl-mCncProtéines Pédiatrique_Sérum/Plasma_g/Lg/l358.93411.76845.36440.88254.13959.36684.8811.95277.9965.854
72885-2F5122Prot SerPl-mCncProtéines Duplication A7347_Plasma_g/Lg/l21380.3956.13440.12969.54966.7385.024110.9058.824113.76438.456
82888-6A1694Protéines [Masse/Volume] Urine - NumériqueProtéines_Urines 24h_g/Lg/l1932.3434.2620.0630.0890.2571.6252.6790.1621.2750.0
92888-6A1695Protéines [Masse/Volume] Urine - NumériqueProtéines_Urines_g/Lg/l23000.6481.6210.00.0760.1810.42835.9340.1440.760.0
102888-6C9990Non RenseignéProtéines Duplication A1695_Urines_g/Lg/l130.2270.4780.0580.0570.0560.1091.6870.0430.2110.0
112888-6D0064Non RenseignéProtéines Sonde vésicale_Urines_g/Lg/l20.3890.3680.1380.290.3780.5010.6430.2721.5760.0
122888-6J7268Protéines [Masse/Volume] Urine - NumériqueProtéines Triplication A1695_Urines_g/Lg/l1150.8080.9610.0380.0540.1980.5056.0250.2861.4820.0

diff --git a/docs/functionalities/biology/tutorial.ipynb b/docs/functionalities/biology/tutorial.ipynb index bd6a87d5..5e6c0b35 100644 --- a/docs/functionalities/biology/tutorial.ipynb +++ b/docs/functionalities/biology/tutorial.ipynb @@ -23,7 +23,7 @@ "metadata": {}, "outputs": [], "source": [ - "%load_ext autoreload\n", + "%reload_ext autoreload\n", "%autoreload 2" ] }, @@ -618,14 +618,6 @@ "pd.read_csv(\"./Biology_summary/Protein_Quantitative/stats_summary.csv\")" ] }, - { - "cell_type": "markdown", - "id": "e6151283", - "metadata": {}, - "source": [ - "If you prefer, a [HTML table](./Biology_summary/Protein_Quantitative/stats_summary.html) is also generated along with the CSV (same name, but with a `.html` extension" - ] - }, { "cell_type": "markdown", "id": "70ce6f91", @@ -777,9 +769,9 @@ ], "metadata": { "kernelspec": { - "display_name": "scikit", + "display_name": "Python 3", "language": "python", - "name": "scikit" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -791,12 +783,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.8" - }, - "vscode": { - "interpreter": { - "hash": "a251e067adbd8c45f6d9e77d22a52920634565c087038863128b0fd851122a62" - } + "version": "3.8.10" } }, "nbformat": 4, diff --git a/eds_scikit/biology/viz/plot.py b/eds_scikit/biology/viz/plot.py index 971a2e0f..5b42ea24 100644 --- a/eds_scikit/biology/viz/plot.py +++ b/eds_scikit/biology/viz/plot.py @@ -86,7 +86,6 @@ def _save_and_display_chart( def _save_and_display_table(table: DataFrame, source_path: str, concepts_set_name: str): - display(table) html_measurement_stats = build_table( table, @@ -108,7 +107,6 @@ def _save_and_display_table(table: DataFrame, source_path: str, concepts_set_nam def plot_interactive_distribution(measurement: DataFrame): - standard_terminologies = _get_standard_terminologies(measurement) measurement["over_freq"] = measurement["frequency"].where( @@ -123,7 +121,7 @@ def plot_interactive_distribution(measurement: DataFrame): measurement["legend_outlier"] = "Outliers grouped at the thresholds" alt.data_transformers.disable_max_rows() - hospital_selection = alt.selection_multi(fields=["care_site_short_name"]) + hospital_selection = alt.selection_point(fields=["care_site_short_name"]) value_selection = alt.selection_interval(bind="scales", encodings=["x"]) color_hospital = alt.condition( hospital_selection, @@ -135,7 +133,7 @@ def plot_interactive_distribution(measurement: DataFrame): hospital_hist = ( alt.Chart(measurement) - .mark_bar() + .mark_bar(cornerRadiusEnd=10) .encode( y=alt.Y( "sum(frequency):Q", @@ -151,7 +149,7 @@ def plot_interactive_distribution(measurement: DataFrame): tooltip=alt.Tooltip("sum(frequency):Q", format=","), ) .transform_filter(value_selection) - .add_selection(hospital_selection) + .add_params(hospital_selection) ).properties(width=900) # Density Chart @@ -174,7 +172,7 @@ def plot_interactive_distribution(measurement: DataFrame): } ], ) - .mark_bar() + .mark_bar(cornerRadiusEnd=10) .encode( x=alt.X( "binned_value:Q", @@ -186,12 +184,12 @@ def plot_interactive_distribution(measurement: DataFrame): title="Overall frequency", ), color=alt.Color( - "quartile:O", + "min(quartile):O", scale=alt.Scale(scheme="pastel1"), legend=None, ), tooltip=[ - alt.Tooltip("binned_value:Q", title="Value", format=","), + alt.Tooltip("binned_value:Q", title="Value", format=".2f"), alt.Tooltip("sum(inside_freq):Q", title="Frequency", format=","), ], ) @@ -205,10 +203,11 @@ def plot_interactive_distribution(measurement: DataFrame): TotalUnder="sum(under_freq)", MinValue="min(binned_value)", ) - .mark_bar(color="gray") + .mark_bar(color="gray", cornerRadiusEnd=10) .encode( opacity=alt.Opacity( - "legend_outlier", + "min(legend_outlier)", + scale=alt.Scale(rangeMin=1), legend=alt.Legend(orient="bottom", title=None), ) ) @@ -277,7 +276,6 @@ def plot_interactive_distribution(measurement: DataFrame): alt.layer(overall_underlier, overall_overlier).resolve_scale(y="shared"), ) .resolve_scale(y="independent") - .add_selection(value_selection) .transform_filter(hospital_selection) ) @@ -287,149 +285,165 @@ def plot_interactive_distribution(measurement: DataFrame): terminologies_selection = [] overall_densities = [] width = 900 / len(standard_terminologies) - terminology_dist_base = (alt.Chart(measurement)).properties( height=100, width=width ) - terminology_density = ( - terminology_dist_base.transform_window( - sort=[{"field": "binned_value"}], - groupby=[ - "{}_concept_code".format(terminology) - for terminology in standard_terminologies - ], - cumulative_count="sum(inside_freq)", + for terminology in standard_terminologies: + terminology_density = ( + terminology_dist_base.transform_window( + sort=[{"field": "binned_value"}], + groupby=["{}_concept_code".format(terminology)], + cumulative_count="sum(inside_freq)", + ) + .transform_window( + sort=[{"field": "cumulative_count"}], + groupby=["{}_concept_code".format(terminology)], + window=[ + { + "field": "cumulative_count", + "op": "ntile", + "as": "quartile", + "param": 4, + } + ], + ) + .mark_bar(cornerRadiusEnd=10) + .encode( + x=alt.X( + "binned_value:Q", + title="Value", + ), + y=alt.Y( + "sum(inside_freq):Q", + axis=alt.Axis(format="s"), + title="Frequency", + ), + color=alt.Color( + "min(quartile):O", + scale=alt.Scale(domain=[1, 2, 3, 4], scheme="pastel1"), + legend=alt.Legend(orient="bottom", title="Quartile"), + ), + tooltip=[ + alt.Tooltip("binned_value:Q", title="Value", format=".2f"), + alt.Tooltip( + "sum(inside_freq):Q", title="Frequency", format="," + ), + ], + ) ) - .transform_window( - sort=[{"field": "cumulative_count"}], - groupby=[ - "{}_concept_code".format(terminology) - for terminology in standard_terminologies - ], - window=[ - { - "field": "cumulative_count", - "op": "ntile", - "as": "quartile", - "param": 4, - } - ], + + terminology_outlier_base = ( + terminology_dist_base.transform_joinaggregate( + TotalMeasures="sum(frequency)", + TotalOver="sum(over_freq)", + MaxValue="max(binned_value)", + TotalUnder="sum(under_freq)", + MinValue="min(binned_value)", + groupby=["{}_concept_code:N".format(terminology)], + ) + .mark_bar(color="grey", cornerRadiusEnd=10) + .encode( + opacity=alt.Opacity( + "legend_outlier", + scale=alt.Scale(rangeMin=1), + legend=alt.Legend(orient="bottom", title=None), + ) + ) ) - .mark_bar() - .encode( + + terminology_overlier = terminology_outlier_base.transform_calculate( + Percentage="datum.TotalOver / datum.TotalMeasures" + ).encode( x=alt.X( - "binned_value:Q", + "MaxValue:Q", title="Value", ), y=alt.Y( - "sum(inside_freq):Q", + "max(TotalOver):Q", axis=alt.Axis(format="s"), - title="Frequency", - ), - color=alt.Color( - "quartile:O", - scale=alt.Scale(domain=[1, 2, 3, 4], scheme="pastel1"), - legend=alt.Legend(orient="bottom", title="Quartile"), + title="Outliers frequency", ), tooltip=[ - alt.Tooltip("binned_value:Q", title="Value", format=","), - alt.Tooltip("sum(inside_freq):Q", title="Frequency", format=","), - ], - ) - ) - - terminology_outlier_base = ( - terminology_dist_base.transform_joinaggregate( - TotalMeasures="sum(frequency)", - TotalOver="sum(over_freq)", - MaxValue="max(binned_value)", - TotalUnder="sum(under_freq)", - MinValue="min(binned_value)", - groupby=[ - "{}_concept_code:N".format(terminology) - for terminology in standard_terminologies + alt.Tooltip( + "MaxValue:Q", + title="Maximum threshold (computed with MAD formula)", + format=",", + ), + alt.Tooltip( + "max(TotalOver):Q", + title="Frequency over the maximum", + ), + alt.Tooltip( + "max(Percentage):Q", + format=".2%", + ), ], ) - .mark_bar(color="grey") - .encode( - opacity=alt.Opacity( - "legend_outlier", - legend=alt.Legend(orient="bottom", title=None), - ) - ) - ) - - terminology_overlier = terminology_outlier_base.transform_calculate( - Percentage="datum.TotalOver / datum.TotalMeasures" - ).encode( - x=alt.X( - "MaxValue:Q", - title="Value", - ), - y=alt.Y( - "TotalOver:Q", - axis=alt.Axis(format="s"), - title="Outliers frequency", - ), - tooltip=[ - alt.Tooltip( - "MaxValue:Q", - title="Maximum threshold (computed with MAD formula)", - format=",", - ), - alt.Tooltip( - "TotalOver:Q", - title="Frequency over the maximum", - ), - alt.Tooltip( - "Percentage:Q", - format=".2%", - ), - ], - ) - terminology_underlier = terminology_outlier_base.transform_calculate( - Percentage="datum.TotalUnder / datum.TotalMeasures" - ).encode( - x=alt.X( - "MinValue:Q", - title="Value", - ), - y=alt.Y( - "TotalUnder:Q", - axis=alt.Axis(format="s"), - title="Outliers frequency", - ), - tooltip=[ - alt.Tooltip( + terminology_underlier = terminology_outlier_base.transform_calculate( + Percentage="datum.TotalUnder / datum.TotalMeasures" + ).encode( + x=alt.X( "MinValue:Q", - title="Minimum threshold (computed with MAD formula)", - format=",", - ), - alt.Tooltip( - "TotalUnder:Q", - title="Frequency under the minimum", + title="Value", ), - alt.Tooltip( - "Percentage:Q", - format=".2%", + y=alt.Y( + "max(TotalUnder):Q", + axis=alt.Axis(format="s"), + title="Outliers frequency", ), - ], - ) + tooltip=[ + alt.Tooltip( + "MinValue:Q", + title="Minimum threshold (computed with MAD formula)", + format=",", + ), + alt.Tooltip( + "TotalUnder:Q", + title="Frequency under the minimum", + ), + alt.Tooltip( + "Percentage:Q", + format=".2%", + ), + ], + ) - terminology_distribution_base = ( - alt.layer( - terminology_density, - alt.layer(terminology_underlier, terminology_overlier).resolve_scale( - y="shared" - ), + terminology_distribution_base = ( + ( + terminology_density + + (terminology_underlier + terminology_overlier).resolve_scale( + y="shared" + ) + ) + .transform_filter(hospital_selection) + .resolve_scale(y="independent") ) - .transform_filter(hospital_selection) - .add_selection(value_selection) - ).resolve_scale(y="independent") - for terminology in standard_terminologies: - terminology_selection = alt.selection_multi( + terminology_distribution = ( + ( + terminology_distribution_base.facet( + row=alt.Row( + "{}_concept_code:N".format(terminology), + sort={ + "field": "frequency", + "op": "sum", + "order": "descending", + }, + ) + ) + ) + .resolve_scale(y="independent") + .properties( + title=alt.TitleParams( + text="Distribution per {} code".format(terminology), + anchor="middle", + align="center", + ) + ) + ) + terminologies_distribution.append(terminology_distribution) + terminology_selection = alt.selection_point( fields=["{}_concept_code".format(terminology)], ) terminologies_selection.append(terminology_selection) @@ -443,10 +457,9 @@ def plot_interactive_distribution(measurement: DataFrame): ), alt.value("lightgray"), ) - terminology_hist = ( alt.Chart(measurement) - .mark_bar() + .mark_bar(cornerRadiusEnd=10) .encode( y=alt.Y( "sum(frequency):Q", @@ -465,36 +478,11 @@ def plot_interactive_distribution(measurement: DataFrame): color=terminology_color, tooltip=alt.Tooltip("sum(frequency):Q", format=","), ) - .add_selection(terminology_selection) + .add_params(terminology_selection) .transform_filter(value_selection) .transform_filter(hospital_selection) ) - terminologies_hist.append(terminology_hist.properties(width=width)) - - terminology_distribution = ( - ( - terminology_distribution_base.facet( - row=alt.Row( - "{}_concept_code:N".format(terminology), - sort={ - "field": "frequency", - "op": "sum", - "order": "descending", - }, - ) - ) - ) - .resolve_scale(y="independent") - .properties( - title=alt.TitleParams( - text="Distribution per {} code".format(terminology), - anchor="middle", - align="center", - ) - ) - ) - terminologies_distribution.append(terminology_distribution) overall_densities.append(overall_density.properties(width=width)) for terminology_selection in terminologies_selection: @@ -517,13 +505,13 @@ def plot_interactive_distribution(measurement: DataFrame): lambda terminology_distribution_1, terminology_distribution_2: terminology_distribution_1 | terminology_distribution_2, terminologies_distribution, - ) + ).transform_filter(value_selection) overall_densities = reduce( lambda overall_density_1, overall_density_2: alt.hconcat( overall_density_1, overall_density_2, spacing=75 ), overall_densities, - ) + ).add_params(value_selection) else: terminologies_hist = alt.Chart().mark_text() @@ -557,7 +545,7 @@ def plot_interactive_volumetry( standard_terminologies = _get_standard_terminologies(measurement) alt.data_transformers.disable_max_rows() - hospital_selection = alt.selection_multi(fields=["care_site_short_name"]) + hospital_selection = alt.selection_point(fields=["care_site_short_name"]) time_selection = alt.selection_interval(encodings=["x"]) color_hospital = alt.condition( hospital_selection, @@ -567,7 +555,7 @@ def plot_interactive_volumetry( hospital_hist = ( alt.Chart(measurement) - .mark_bar() + .mark_bar(cornerRadiusEnd=10) .encode( y=alt.Y( "sum(# measurements):Q", @@ -582,7 +570,7 @@ def plot_interactive_volumetry( color=color_hospital, tooltip=alt.Tooltip("sum(# measurements):Q", format=","), ) - .add_selection(hospital_selection) + .add_params(hospital_selection) .transform_filter(time_selection) ).properties(width=900, height=300) @@ -602,7 +590,7 @@ def plot_interactive_volumetry( title="Total number of measurements", ), ) - .add_selection(time_selection) + .add_params(time_selection) .transform_filter(hospital_selection) ).properties(width=900, height=50) @@ -616,11 +604,11 @@ def plot_interactive_volumetry( .transform_calculate( Percentage="datum.Missing / (datum.TotalMeasures + datum.Missing)" ) - .mark_bar() + .mark_bar(cornerRadiusEnd=10) .encode( y=alt.Y( - "Percentage:Q", - axis=alt.Axis(format="%"), + "min(Percentage):Q", + axis=alt.Axis(format=".0%"), title="Percentage of missing values per hospital", ), x=alt.X( @@ -643,7 +631,7 @@ def plot_interactive_volumetry( width = 900 / len(standard_terminologies) for terminology in standard_terminologies: - terminology_selection = alt.selection_multi( + terminology_selection = alt.selection_point( fields=["{}_concept_code".format(terminology)] ) terminologies_selection.append(terminology_selection) @@ -656,7 +644,7 @@ def plot_interactive_volumetry( terminology_hist = ( alt.Chart(measurement) - .mark_bar() + .mark_bar(cornerRadiusEnd=10) .encode( y=alt.Y( "sum(# measurements):Q", @@ -675,7 +663,7 @@ def plot_interactive_volumetry( color=terminology_color, tooltip=alt.Tooltip("sum(# measurements):Q", format=","), ) - .add_selection(terminology_selection) + .add_params(terminology_selection) .transform_filter(hospital_selection) .transform_filter(time_selection) ).properties(height=300, width=width) @@ -764,7 +752,6 @@ def plot_interactive_volumetry( def _get_standard_terminologies(measurement): - standard_terminologies = list( set( col_name.split("_concept_code")[0] @@ -791,7 +778,6 @@ def _get_standard_terminologies(measurement): def _filter_zeros(measurement): - count_cols = ["# measurements", "# missing_values"] # Remove rows with all 0 @@ -799,492 +785,3 @@ def _filter_zeros(measurement): measurement = measurement.dropna(how="all", subset=count_cols) return measurement - - -# def plot_interactive_distribution_with_time( -# measurement: DataFrame, -# ): - -# standard_terminologies = _get_standard_terminologies(measurement) - -# measurement["over_freq"] = measurement["frequency"].where( -# measurement["over_outlier"], 0 -# ) -# measurement["under_freq"] = measurement["frequency"].where( -# measurement["under_outlier"], 0 -# ) -# measurement["inside_freq"] = measurement["frequency"].where( -# ~measurement["over_outlier"] & ~measurement["under_outlier"], 0 -# ) -# measurement["legend_outlier"] = "Outliers grouped at the thresholds" - -# delta_time = ( -# measurement["measurement_month"] -# .astype("datetime64") -# .dt.to_period("M") -# .view(dtype="int64") -# .drop_duplicates() -# .sort_values() -# .diff() -# .min() -# ) - -# if delta_time == 1: -# time_axis = alt.Axis(tickCount="month", labelAngle=-90, format="%b %Y") -# elif delta_time == 3: -# time_axis = alt.Axis(tickCount="month", labelAngle=-90, format="%YQ%q") -# else: -# time_axis = alt.Axis(tickCount="year", labelAngle=-90, format="%Y") - -# alt.data_transformers.disable_max_rows() -# hospital_selection = alt.selection_multi(fields=["care_site_short_name"]) -# time_selection = alt.selection_interval(encodings=["x"]) -# value_selection = alt.selection_interval(bind="scales", encodings=["x"]) -# color_hospital = alt.condition( -# hospital_selection, -# alt.Color( -# "care_site_short_name:N", legend=None, scale=alt.Scale(scheme="accent") -# ), -# alt.value("lightgray"), -# ) - -# time_line = ( -# alt.Chart(measurement) -# .mark_line() -# .encode( -# x=alt.X( -# "measurement_month:T", -# title="Time", -# axis=time_axis, -# ), -# y=alt.Y( -# "sum(frequency):Q", -# axis=alt.Axis(format="s"), -# impute=alt.ImputeParams(value=0), -# title="Total number of measurements", -# ), -# ) -# .add_selection(time_selection) -# .transform_filter(value_selection) -# .transform_filter(hospital_selection) -# ).properties(width=900, height=50) - -# hospital_hist = ( -# alt.Chart(measurement) -# .mark_bar() -# .encode( -# y=alt.Y( -# "sum(frequency):Q", -# axis=alt.Axis(format="s"), -# title="Number of measurements per hospital", -# ), -# x=alt.X( -# "care_site_short_name:N", -# title="Hospital", -# sort={"field": "frequency", "op": "sum", "order": "descending"}, -# ), -# color=color_hospital, -# tooltip=alt.Tooltip("sum(frequency):Q", format=","), -# ) -# .add_selection(hospital_selection) -# .transform_filter(value_selection) -# .transform_filter(time_selection) -# ).properties(width=900) - -# # Density Chart -# overall_dist_base = ( -# alt.Chart(measurement, title="Overall distribution") -# ).properties(height=100) - -# overall_density = ( -# overall_dist_base.transform_window( -# sort=[{"field": "binned_value"}], cumulative_count="sum(inside_freq)" -# ) -# .transform_window( -# sort=[{"field": "cumulative_count"}], -# window=[ -# { -# "field": "cumulative_count", -# "op": "ntile", -# "as": "quartile", -# "param": 4, -# } -# ], -# ) -# .mark_bar() -# .encode( -# x=alt.X( -# "binned_value:Q", -# title="Value", -# ), -# y=alt.Y( -# "sum(inside_freq):Q", -# axis=alt.Axis(format="s"), -# title="Overall frequency", -# ), -# color=alt.Color( -# "quartile:O", -# scale=alt.Scale(scheme="pastel1"), -# legend=None, -# ), -# tooltip=[ -# alt.Tooltip("binned_value:Q", title="Value", format=","), -# alt.Tooltip("sum(inside_freq):Q", title="Frequency", format=","), -# ], -# ) -# ) - -# overall_outlier_base = ( -# overall_dist_base.transform_joinaggregate( -# TotalMeasures="sum(frequency)", -# TotalOver="sum(over_freq)", -# MaxValue="max(binned_value)", -# TotalUnder="sum(under_freq)", -# MinValue="min(binned_value)", -# ) -# .mark_bar(color="grey") -# .encode( -# opacity=alt.Opacity( -# "legend_outlier", -# legend=alt.Legend(orient="bottom", title=None), -# ) -# ) -# ) - -# overall_overlier = overall_outlier_base.transform_calculate( -# Percentage="datum.TotalOver / datum.TotalMeasures" -# ).encode( -# x=alt.X( -# "MaxValue:Q", -# title="Value", -# ), -# y=alt.Y( -# "TotalOver:Q", -# axis=alt.Axis(format="s"), -# title="Outliers frequency", -# ), -# tooltip=[ -# alt.Tooltip( -# "MaxValue:Q", -# title="Maximum threshold (computed with MAD formula)", -# format=",", -# ), -# alt.Tooltip( -# "TotalOver:Q", -# title="Frequency over the maximum", -# ), -# alt.Tooltip( -# "Percentage:Q", -# format=".2%", -# ), -# ], -# ) -# overall_underlier = overall_outlier_base.transform_calculate( -# Percentage="datum.TotalUnder / datum.TotalMeasures" -# ).encode( -# x=alt.X( -# "MinValue:Q", -# title="Value", -# ), -# y=alt.Y( -# "TotalUnder:Q", -# axis=alt.Axis(format="s"), -# title="Outliers frequency", -# ), -# tooltip=[ -# alt.Tooltip( -# "MinValue:Q", -# title="Minimum threshold (computed with MAD formula)", -# format=",", -# ), -# alt.Tooltip( -# "TotalUnder:Q", -# title="Frequency under the minimum", -# ), -# alt.Tooltip( -# "Percentage:Q", -# format=".2%", -# ), -# ], -# ) - -# overall_density = ( -# alt.layer( -# overall_density, -# alt.layer(overall_underlier, overall_overlier).resolve_scale(y="shared"), -# ) -# .resolve_scale(y="independent") -# .transform_filter(hospital_selection) -# .add_selection(value_selection) -# ) -# if standard_terminologies: -# terminologies_hist = [] -# terminologies_distribution = [] -# terminologies_selection = [] -# overall_densities = [] -# width = 900 / len(standard_terminologies) - -# terminology_dist_base = (alt.Chart(measurement)).properties( -# height=100, width=width -# ) -# terminology_density = ( -# terminology_dist_base.transform_window( -# sort=[{"field": "binned_value"}], -# groupby=[ -# "{}_concept_code".format(terminology) -# for terminology in standard_terminologies -# ], -# cumulative_count="sum(inside_freq)", -# ) -# .transform_window( -# sort=[{"field": "cumulative_count"}], -# groupby=[ -# "{}_concept_code".format(terminology) -# for terminology in standard_terminologies -# ], -# window=[ -# { -# "field": "cumulative_count", -# "op": "ntile", -# "as": "quartile", -# "param": 4, -# } -# ], -# ) -# .mark_bar() -# .encode( -# x=alt.X( -# "binned_value:Q", -# title="Value", -# ), -# y=alt.Y( -# "sum(inside_freq):Q", -# axis=alt.Axis(format="s"), -# title="Frequency", -# ), -# color=alt.Color( -# "quartile:O", -# scale=alt.Scale(domain=[1, 2, 3, 4], scheme="pastel1"), -# legend=alt.Legend(orient="bottom", title="Quartile"), -# ), -# tooltip=[ -# alt.Tooltip("binned_value:Q", title="Value", format=","), -# alt.Tooltip("sum(inside_freq):Q", title="Frequency", format=","), -# ], -# ) -# ) - -# terminology_outlier_base = ( -# terminology_dist_base.transform_joinaggregate( -# TotalMeasures="sum(frequency)", -# TotalOver="sum(over_freq)", -# MaxValue="max(binned_value)", -# TotalUnder="sum(under_freq)", -# MinValue="min(binned_value)", -# groupby=[ -# "{}_concept_code:N".format(terminology) -# for terminology in standard_terminologies -# ], -# ) -# .mark_bar(color="gray") -# .encode( -# opacity=alt.Opacity( -# "legend_outlier", -# legend=alt.Legend(orient="bottom", title=None), -# ) -# ) -# ) - -# terminology_overlier = terminology_outlier_base.transform_calculate( -# Percentage="datum.TotalOver / datum.TotalMeasures" -# ).encode( -# x=alt.X( -# "MaxValue:Q", -# title="Value", -# ), -# y=alt.Y( -# "TotalOver:Q", -# axis=alt.Axis(format="s"), -# title="Outliers frequency", -# ), -# tooltip=[ -# alt.Tooltip( -# "MaxValue:Q", -# title="Maximum threshold (computed with MAD formula)", -# format=",", -# ), -# alt.Tooltip( -# "TotalOver:Q", -# title="Frequency over the maximum", -# ), -# alt.Tooltip( -# "Percentage:Q", -# format=".2%", -# ), -# ], -# ) - -# terminology_underlier = terminology_outlier_base.transform_calculate( -# Percentage="datum.TotalUnder / datum.TotalMeasures" -# ).encode( -# x=alt.X( -# "MinValue:Q", -# title="Value", -# ), -# y=alt.Y( -# "TotalUnder:Q", -# axis=alt.Axis(format="s"), -# title="Outliers frequency", -# ), -# tooltip=[ -# alt.Tooltip( -# "MinValue:Q", -# title="Minimum threshold (computed with MAD formula)", -# format=",", -# ), -# alt.Tooltip( -# "TotalUnder:Q", -# title="Frequency under the minimum", -# ), -# alt.Tooltip( -# "Percentage:Q", -# format=".2%", -# ), -# ], -# ) -# terminology_distribution_base = ( -# alt.layer( -# terminology_density, -# alt.layer(terminology_underlier, terminology_overlier).resolve_scale( -# y="shared" -# ), -# ) -# .transform_filter(time_selection) -# .transform_filter(hospital_selection) -# .add_selection(value_selection) -# ).resolve_scale(y="independent") - -# for terminology in standard_terminologies: -# terminology_selection = alt.selection_multi( -# fields=["{}_concept_code".format(terminology)], -# ) -# terminologies_selection.append(terminology_selection) - -# terminology_color = alt.condition( -# terminology_selection, -# alt.Color( -# "{}_concept_code:N".format(terminology), -# legend=None, -# scale=alt.Scale(scheme="pastel2"), -# ), -# alt.value("lightgray"), -# ) - -# terminology_hist = ( -# alt.Chart(measurement) -# .mark_bar() -# .encode( -# y=alt.Y( -# "sum(frequency):Q", -# axis=alt.Axis(format="s"), -# title="Number of measurements per {} code".format(terminology), -# ), -# x=alt.X( -# "{}_concept_code:N".format(terminology), -# title="{} code".format(terminology), -# sort={ -# "field": "frequency", -# "op": "sum", -# "order": "descending", -# }, -# ), -# color=terminology_color, -# tooltip=alt.Tooltip("sum(frequency):Q", format=","), -# ) -# .add_selection(terminology_selection) -# .transform_filter(hospital_selection) -# .transform_filter(value_selection) -# .transform_filter(time_selection) -# ) - -# terminology_distribution = ( -# ( -# terminology_distribution_base.facet( -# row=alt.Row( -# "{}_concept_code:N".format(terminology), -# sort={ -# "field": "frequency", -# "op": "sum", -# "order": "descending", -# }, -# ) -# ) -# ) -# .resolve_scale(y="independent") -# .properties( -# title=alt.TitleParams( -# text="Distribution per {} code".format(terminology), -# anchor="middle", -# align="center", -# ) -# ) -# ) - -# terminologies_hist.append(terminology_hist.properties(width=width)) - -# terminologies_distribution.append(terminology_distribution) -# overall_densities.append(overall_density.properties(width=width)) - -# for terminology_selection in terminologies_selection: -# hospital_hist = hospital_hist.transform_filter(terminology_selection) -# time_line = time_line.transform_filter(terminology_selection) -# for idx in range(len(standard_terminologies)): -# if idx != terminologies_selection.index(terminology_selection): -# terminologies_hist[idx] = terminologies_hist[idx].transform_filter( -# terminology_selection -# ) -# terminologies_distribution[idx] = terminologies_distribution[ -# idx -# ].transform_filter(terminology_selection) - -# terminologies_hist = reduce( -# lambda terminology_hist_1, terminology_hist_2: terminology_hist_1 -# | terminology_hist_2, -# terminologies_hist, -# ) -# terminologies_distribution = reduce( -# lambda terminology_distribution_1, terminology_distribution_2: terminology_distribution_1 -# | terminology_distribution_2, -# terminologies_distribution, -# ) -# overall_densities = reduce( -# lambda overall_density_1, overall_density_2: alt.hconcat( -# overall_density_1, overall_density_2, spacing=75 -# ), -# overall_densities, -# ) - -# else: -# terminologies_hist = alt.Chart().mark_text() -# terminologies_distribution = alt.Chart().mark_text() -# overall_densities = ( -# overall_density.encode( -# color=alt.Color( -# "quartile:O", -# scale=alt.Scale(domain=[1, 2, 3, 4], scheme="pastel1"), -# legend=alt.Legend(orient="bottom", title="Quartile"), -# ), -# ) -# .transform_filter(hospital_selection) -# .properties(width=900) -# ) - -# chart = ( -# hospital_hist -# & time_line -# & terminologies_hist -# & overall_densities -# & terminologies_distribution -# ).resolve_scale(color="independent") - -# return chart diff --git a/eds_scikit/plot/age_pyramid.py b/eds_scikit/plot/age_pyramid.py index cbbbd5ce..0c5a8dd0 100644 --- a/eds_scikit/plot/age_pyramid.py +++ b/eds_scikit/plot/age_pyramid.py @@ -137,8 +137,8 @@ def plot_age_pyramid( alt.Chart(male) .mark_text() .encode( - y=alt.Text("age_bins", axis=None, sort=alt.SortOrder("descending")), - text=alt.Y("age_bins"), + y=alt.Y("age_bins", axis=None, sort=alt.SortOrder("descending")), + text=alt.Text("age_bins"), ) )