From 247e49b9fb42dd555dd2d0d87b28b5cb9fab59d0 Mon Sep 17 00:00:00 2001 From: mandresm Date: Tue, 26 Nov 2024 16:58:45 +0100 Subject: [PATCH 01/13] add catch KeyError: 'JUPYTERHUB_SERVICE_PREFIX' for when evaluating the dashboard_link of the cluster and fallback to default dashboard_link --- src/pymorize/cluster.py | 34 ++++++++++++++++++++++++++++++++++ src/pymorize/cmorizer.py | 10 ++++++++-- 2 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 src/pymorize/cluster.py diff --git a/src/pymorize/cluster.py b/src/pymorize/cluster.py new file mode 100644 index 00000000..36012ff6 --- /dev/null +++ b/src/pymorize/cluster.py @@ -0,0 +1,34 @@ +""" +This module contains the functions to manage the Dask cluster. +""" +import dask + +from .logging import logger + +def set_dashboard_link(cluster): + """ + Checks whether the default user configuration for the dashboard link is valid. + If the configuration is invalid it tried to catch the following errors: + + * KeyError: 'JUPYTERHUB_SERVICE_PREFIX' -> The dashboard link is not valid because + the cluster was not launched from JupyterHub. In this case, the default dashboard + link is set to 'http://{host}:8787'. + + Parameters + ---------- + cluster : dask_jobqueue.SLURMCluster + The Dask cluster to set the dashboard link. + """ + try: + _ = cluster.dashboard_link + except KeyError as e: + if "JUPYTERHUB_SERVICE_PREFIX" in str(e): + logger.debug( + "Trying to use JupyterHub prefix for the dashboard link, but the it " + "was not launched from JupyterHub. Falling back to the default " + "dashboard link." + ) + default_dashboard_link = "http://{host}:8787" + dask.config.set({"distributed.dashboard.link": default_dashboard_link}) + else: + raise e diff --git a/src/pymorize/cmorizer.py b/src/pymorize/cmorizer.py index c7ca47f7..efc7749c 100644 --- a/src/pymorize/cmorizer.py +++ b/src/pymorize/cmorizer.py @@ -14,9 +14,14 @@ from prefect.futures import wait from rich.progress import track +from .cluster import set_dashboard_link from .config import PymorizeConfig, PymorizeConfigManager, parse_bool -from .data_request import (DataRequest, DataRequestTable, DataRequestVariable, - IgnoreTableFiles) +from .data_request import ( + DataRequest, + DataRequestTable, + DataRequestVariable, + IgnoreTableFiles, +) from .filecache import fc from .logging import logger from .pipeline import Pipeline @@ -120,6 +125,7 @@ def _post_init_create_dask_cluster(self): # FIXME: In the future, we can support PBS, too. logger.info("Setting up SLURMCluster...") self._cluster = SLURMCluster() + set_dashboard_link(self._cluster) cluster_mode = self._pymorize_cfg.get("cluster_mode", "adapt") if cluster_mode == "adapt": min_jobs = self._pymorize_cfg.get("minimum_jobs", 1) From 231a06f7f6318b6f41cd6fee097f390979563985 Mon Sep 17 00:00:00 2001 From: mandresm Date: Tue, 26 Nov 2024 17:47:16 +0100 Subject: [PATCH 02/13] add missing dependency bokeh --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index bce002db..29837f83 100644 --- a/setup.py +++ b/setup.py @@ -31,6 +31,7 @@ def read(filename): package_dir={"": "src"}, packages=find_packages(where="src", exclude=("tests",)), install_requires=[ + "bokeh", "cerberus", "cf_xarray", "cftime", From da0e6abd7467e79cacb92777c3c78dd3e5b89c63 Mon Sep 17 00:00:00 2001 From: Miguel Andres-Martinez Date: Tue, 26 Nov 2024 18:08:35 +0100 Subject: [PATCH 03/13] make http links mor visible so that users have the idea to rightclick and open link --- src/pymorize/ssh_tunnel.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/pymorize/ssh_tunnel.py b/src/pymorize/ssh_tunnel.py index 08c04caf..58bb9674 100644 --- a/src/pymorize/ssh_tunnel.py +++ b/src/pymorize/ssh_tunnel.py @@ -43,6 +43,9 @@ def ssh_tunnel_cli( """ Create an SSH tunnel to access Prefect and Dask dashboards on a remote compute node. """ + dask_link = click.style(f"http://localhost:{local_dask_port}/status", fg='blue', underline=True) + prefect_link = click.style(f"http://localhost:{local_prefect_port}", fg='blue', underline=True) + ssh_command = f"ssh -nNT -L {local_dask_port}:{compute_node}:{remote_dask_port} -L {local_prefect_port}:{compute_node}:{remote_prefect_port} {username}@{gateway}" click.echo(f"Creating SSH tunnel via: {ssh_command}") @@ -53,10 +56,10 @@ def ssh_tunnel_cli( f"Port forwarding: localhost:{local_prefect_port} -> {gateway}:{remote_prefect_port} -> {compute_node}:{remote_prefect_port}" ) click.echo( - f"Dask Dashboard will be accessible at http://localhost:{local_dask_port}/status" + f"Dask Dashboard will be accessible at {dask_link}" ) click.echo( - f"Prefect Dashboard will be accessible at http://localhost:{local_prefect_port}" + f"Prefect Dashboard will be accessible at {prefect_link}" ) click.echo("Press Ctrl+C to close the tunnel") From b81ea93d9d4072215dcf9f2271e8ea8d74ac4a16 Mon Sep 17 00:00:00 2001 From: mandresm Date: Tue, 26 Nov 2024 18:30:17 +0100 Subject: [PATCH 04/13] add reporting the pymorize ssh-tunnel command to be used from the local computer --- src/pymorize/cmorizer.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/pymorize/cmorizer.py b/src/pymorize/cmorizer.py index efc7749c..52cd081b 100644 --- a/src/pymorize/cmorizer.py +++ b/src/pymorize/cmorizer.py @@ -3,6 +3,7 @@ from pathlib import Path import dask # noqa: F401 +import os import pandas as pd import questionary import xarray as xr # noqa: F401 @@ -142,6 +143,13 @@ def _post_init_create_dask_cluster(self): # FIXME: Client needs to be available here? logger.info(f"SLURMCluster can be found at: {self._cluster=}") logger.info(f"Dashboard {self._cluster.dashboard_link}") + # FIXME: Include the gateway option if possible + logger.info( + "To see the dashboards run the following command in your computer's " + "terminal:\n" + f"\tpymorize ssh-tunnel --username {os.getlogin()} --compute-node " + f"{os.uname().nodename}" + ) dask_extras = 0 logger.info("Importing Dask Extras...") From 1342f6789da7b770f378a243736525a0d76219d8 Mon Sep 17 00:00:00 2001 From: Miguel Andres-Martinez Date: Tue, 26 Nov 2024 18:46:41 +0100 Subject: [PATCH 05/13] add documentation for improvements in dashboards --- examples/README.rst | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/examples/README.rst b/examples/README.rst index e64eb377..9afe0a83 100644 --- a/examples/README.rst +++ b/examples/README.rst @@ -19,7 +19,7 @@ You can run the example via:: sbatch -A pymorize.slurm -The ``sample.yaml`` file shows a configuration for an ``AWI-CM 1`` +The ``sample.yaml`` file shows a configuration for an ``AWI-CM 1`` simulation, and processes one set of files, ``fgco2``, which was called ``CO2f`` in ``FESOM 1``. The default pipeline is used, and nothing special is done. @@ -35,17 +35,21 @@ or:: Monitoring the Dask Progress ============================ -``pymorize`` makes heavy use of ``dask``, and ``dask`` provides a dashboard to view the progress, however, you -need to set up SSH tunnels to properly see it. As a convenient shortcut, ``pymorize`` has tunneling built into -it's command line interface:: +``pymorize`` makes heavy use of ``dask``, and ``dask`` provides a dashboard to view the progress, however, you +need to set up SSH tunnels to properly see it from your local computer. As a convenient shortcut, ``pymorize`` +has tunneling built into it's command line interface:: pymorize ssh-tunnel --gateway= --username= --compute-node= +**Or even more convenient!** Search for ``ssh-tunnel`` in your ``slurm-.out`` (or in the stdout if you +are running ``pymorize process`` directly from the logging node). You should be able to find the precise +command you need to use in your local computer, matching the syntax above. + Note that ``JOB_NODE`` is where your main ``pymorize`` job starts, and **not** one of the dask worker jobs. You can also generate the required SSH tunnels by hand. On your local workstation:: - + ssh -L 8080:localhost:8080 -L 8080::8787 @ On the login node:: From 09255cf2681c2667e3676a20a62ad9e1d1c053dc Mon Sep 17 00:00:00 2001 From: Paul Gierz Date: Wed, 27 Nov 2024 09:03:21 +0100 Subject: [PATCH 06/13] doc: typo in example README --- examples/README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/README.rst b/examples/README.rst index 9afe0a83..753a50f7 100644 --- a/examples/README.rst +++ b/examples/README.rst @@ -42,7 +42,7 @@ has tunneling built into it's command line interface:: pymorize ssh-tunnel --gateway= --username= --compute-node= **Or even more convenient!** Search for ``ssh-tunnel`` in your ``slurm-.out`` (or in the stdout if you -are running ``pymorize process`` directly from the logging node). You should be able to find the precise +are running ``pymorize process`` directly from the login node). You should be able to find the precise command you need to use in your local computer, matching the syntax above. Note that ``JOB_NODE`` is where your main ``pymorize`` job starts, and **not** one of the dask worker From 51308530bc79c88fa383593c7bf2f6a8e99b6da6 Mon Sep 17 00:00:00 2001 From: Paul Gierz Date: Wed, 27 Nov 2024 09:05:12 +0100 Subject: [PATCH 07/13] chore(cmorizer): cleanup imports --- src/pymorize/cmorizer.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/pymorize/cmorizer.py b/src/pymorize/cmorizer.py index 52cd081b..740b5ba7 100644 --- a/src/pymorize/cmorizer.py +++ b/src/pymorize/cmorizer.py @@ -1,9 +1,9 @@ import copy +import os from importlib.resources import files from pathlib import Path import dask # noqa: F401 -import os import pandas as pd import questionary import xarray as xr # noqa: F401 @@ -16,19 +16,14 @@ from rich.progress import track from .cluster import set_dashboard_link -from .config import PymorizeConfig, PymorizeConfigManager, parse_bool -from .data_request import ( - DataRequest, - DataRequestTable, - DataRequestVariable, - IgnoreTableFiles, -) +from .config import PymorizeConfig, PymorizeConfigManager +from .data_request import (DataRequest, DataRequestTable, DataRequestVariable, + IgnoreTableFiles) from .filecache import fc from .logging import logger from .pipeline import Pipeline from .rule import Rule from .timeaverage import _frequency_from_approx_interval -from .units import handle_unit_conversion from .utils import wait_for_workers from .validate import PIPELINES_VALIDATOR, RULES_VALIDATOR @@ -406,7 +401,7 @@ def is_unit_scalar(value): if is_unit_scalar(cmor_units): if not is_unit_scalar(model_units): dimless = rule.get("dimensionless_unit_mappings", {}) - if not cmor_units in dimless.get(cmor_variable, {}): + if cmor_units not in dimless.get(cmor_variable, {}): errors.append( f"Missing mapping for dimensionless variable {cmor_variable}" ) From 1cb2ca097b759220fd04ae8ddeddddd42f65bf8c Mon Sep 17 00:00:00 2001 From: Paul Gierz Date: Wed, 27 Nov 2024 09:06:16 +0100 Subject: [PATCH 08/13] chore(cluster): cleanup isort + black --- src/pymorize/cluster.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/pymorize/cluster.py b/src/pymorize/cluster.py index 36012ff6..b34d9759 100644 --- a/src/pymorize/cluster.py +++ b/src/pymorize/cluster.py @@ -1,16 +1,18 @@ """ This module contains the functions to manage the Dask cluster. """ + import dask from .logging import logger + def set_dashboard_link(cluster): """ Checks whether the default user configuration for the dashboard link is valid. If the configuration is invalid it tried to catch the following errors: - * KeyError: 'JUPYTERHUB_SERVICE_PREFIX' -> The dashboard link is not valid because + * ``KeyError``: 'JUPYTERHUB_SERVICE_PREFIX' -> The dashboard link is not valid because the cluster was not launched from JupyterHub. In this case, the default dashboard link is set to 'http://{host}:8787'. From adc628559bc5878aa109c130657b47a0195fd760 Mon Sep 17 00:00:00 2001 From: Paul Gierz Date: Wed, 27 Nov 2024 09:22:09 +0100 Subject: [PATCH 09/13] fix(cmorizer): workaround for os.uname().nodename not being available in CI context --- src/pymorize/cmorizer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/pymorize/cmorizer.py b/src/pymorize/cmorizer.py index 740b5ba7..be78f41c 100644 --- a/src/pymorize/cmorizer.py +++ b/src/pymorize/cmorizer.py @@ -138,12 +138,14 @@ def _post_init_create_dask_cluster(self): # FIXME: Client needs to be available here? logger.info(f"SLURMCluster can be found at: {self._cluster=}") logger.info(f"Dashboard {self._cluster.dashboard_link}") + # FIXME(PG): In CI context, nodename is not available (???) + nodename = getattr(os.uname(), "nodename", "UNKNOWN") # FIXME: Include the gateway option if possible logger.info( "To see the dashboards run the following command in your computer's " "terminal:\n" f"\tpymorize ssh-tunnel --username {os.getlogin()} --compute-node " - f"{os.uname().nodename}" + f"{nodename}" ) dask_extras = 0 From 3360a37e4c8e7830221dda44988da26259717be9 Mon Sep 17 00:00:00 2001 From: Paul Gierz Date: Wed, 27 Nov 2024 09:30:22 +0100 Subject: [PATCH 10/13] trying out which part of @mandresm's new os part fails in CI --- tests/meta/test_os_login_in_CI.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 tests/meta/test_os_login_in_CI.py diff --git a/tests/meta/test_os_login_in_CI.py b/tests/meta/test_os_login_in_CI.py new file mode 100644 index 00000000..cc9917a9 --- /dev/null +++ b/tests/meta/test_os_login_in_CI.py @@ -0,0 +1,13 @@ +import os + + +def test_os_login(): + assert os.getlogin() + + +def test_os_uname(): + assert os.uname() + + +def test_os_uname_nodename(): + assert os.uname().nodename From 7dd2fa9843ec7efc539bea1c302d345865936d8c Mon Sep 17 00:00:00 2001 From: Paul Gierz Date: Wed, 27 Nov 2024 09:37:18 +0100 Subject: [PATCH 11/13] fix(cmorizer): getpass.getuser is a safer way to get the user name --- src/pymorize/cmorizer.py | 6 ++++-- tests/meta/test_os_login_in_CI.py | 6 +++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/pymorize/cmorizer.py b/src/pymorize/cmorizer.py index be78f41c..0fb66b3f 100644 --- a/src/pymorize/cmorizer.py +++ b/src/pymorize/cmorizer.py @@ -1,4 +1,5 @@ import copy +import getpass import os from importlib.resources import files from pathlib import Path @@ -138,13 +139,14 @@ def _post_init_create_dask_cluster(self): # FIXME: Client needs to be available here? logger.info(f"SLURMCluster can be found at: {self._cluster=}") logger.info(f"Dashboard {self._cluster.dashboard_link}") - # FIXME(PG): In CI context, nodename is not available (???) + # FIXME(PG): In CI context, os.getlogin and nodename may not be available (???) + username = getpass.getuser() nodename = getattr(os.uname(), "nodename", "UNKNOWN") # FIXME: Include the gateway option if possible logger.info( "To see the dashboards run the following command in your computer's " "terminal:\n" - f"\tpymorize ssh-tunnel --username {os.getlogin()} --compute-node " + f"\tpymorize ssh-tunnel --username {username} --compute-node " f"{nodename}" ) diff --git a/tests/meta/test_os_login_in_CI.py b/tests/meta/test_os_login_in_CI.py index cc9917a9..3663a953 100644 --- a/tests/meta/test_os_login_in_CI.py +++ b/tests/meta/test_os_login_in_CI.py @@ -1,8 +1,12 @@ import os +import warnings def test_os_login(): - assert os.getlogin() + try: + assert os.getlogin() + except OSError: + warnings.warning("os.getlogin() failed") def test_os_uname(): From 914f5724709f458af93c01a4c2a50ac4879b3400 Mon Sep 17 00:00:00 2001 From: Paul Gierz Date: Wed, 27 Nov 2024 09:41:52 +0100 Subject: [PATCH 12/13] test: fix for warning on bad os.getlogin --- tests/meta/test_os_login_in_CI.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/meta/test_os_login_in_CI.py b/tests/meta/test_os_login_in_CI.py index 3663a953..12780224 100644 --- a/tests/meta/test_os_login_in_CI.py +++ b/tests/meta/test_os_login_in_CI.py @@ -6,7 +6,7 @@ def test_os_login(): try: assert os.getlogin() except OSError: - warnings.warning("os.getlogin() failed") + warnings.warn("os.getlogin() failed") def test_os_uname(): From fe11f31bddbedab7f2e45bde8209b1468263d7d5 Mon Sep 17 00:00:00 2001 From: Paul Gierz Date: Wed, 27 Nov 2024 09:49:29 +0100 Subject: [PATCH 13/13] Update src/pymorize/cmorizer.py --- src/pymorize/cmorizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pymorize/cmorizer.py b/src/pymorize/cmorizer.py index 0fb66b3f..7c8567a5 100644 --- a/src/pymorize/cmorizer.py +++ b/src/pymorize/cmorizer.py @@ -139,7 +139,7 @@ def _post_init_create_dask_cluster(self): # FIXME: Client needs to be available here? logger.info(f"SLURMCluster can be found at: {self._cluster=}") logger.info(f"Dashboard {self._cluster.dashboard_link}") - # FIXME(PG): In CI context, os.getlogin and nodename may not be available (???) + # NOTE(PG): In CI context, os.getlogin and nodename may not be available (???) username = getpass.getuser() nodename = getattr(os.uname(), "nodename", "UNKNOWN") # FIXME: Include the gateway option if possible