modify cluster status checks tests
Alexandra Belousov authored and Alexandra Belousov committed Aug 26, 2024
1 parent a1e13a8 commit 0bb4d36
Showing 8 changed files with 261 additions and 181 deletions.
4 changes: 4 additions & 0 deletions runhouse/resources/hardware/on_demand_cluster.py
@@ -557,6 +557,7 @@ def teardown(self):
Example:
>>> rh.ondemand_cluster("rh-cpu").teardown()
"""
status_code_den_request = None
# TODO [SB]: remove the den_auth check once we will get status of clusters without den_auth as well.
if self.den_auth:
try:
@@ -577,13 +578,16 @@ def teardown(self):
logger.warning(
"Failed to update Den with terminated cluster status"
)
status_code_den_request = status_resp.status_code
except Exception as e:
logger.warning(e)

# Stream logs
sky.down(self.name)
self.address = None

return status_code_den_request

def teardown_and_delete(self):
"""Teardown cluster and delete it from configs.
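Returning the Den response code makes the status update observable to callers. A minimal sketch of how calling code might use it (the 200-on-success expectation comes from the updated test at the bottom of this commit; this is illustrative, not documented API):

import runhouse as rh

cluster = rh.ondemand_cluster("rh-cpu")
status_code = cluster.teardown()  # now returns the Den status-update response code

# None means den_auth was disabled, so no status request was sent to Den.
if status_code == 200:
    print("Den recorded the cluster as terminated.")
elif status_code is not None:
    print(f"Den status update failed with HTTP {status_code}.")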
161 changes: 102 additions & 59 deletions runhouse/servers/cluster_servlet.py
@@ -2,6 +2,7 @@
import copy
import datetime
import json
import os.path
import threading
from typing import Any, Dict, List, Optional, Set, Tuple, Union

@@ -70,10 +71,10 @@ async def __init__(
)

logger.info("Creating periodic_cluster_checks thread.")
-        cluster_checks_thread = threading.Thread(
+        self.cluster_checks_thread = threading.Thread(
            target=self.periodic_cluster_checks, daemon=True
        )
-        cluster_checks_thread.start()
+        self.cluster_checks_thread.start()

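Promoting the thread to an attribute on self is what makes the cluster_periodic_thread_is_alive helper added at the bottom of this file possible. A standalone sketch of the pattern, independent of Runhouse:

import threading
import time

class Worker:
    def __init__(self):
        # Keep a handle on the daemon thread so its health can be checked later.
        self.checks_thread = threading.Thread(target=self._loop, daemon=True)
        self.checks_thread.start()

    def _loop(self):
        while True:
            time.sleep(1)  # stand-in for the periodic cluster checks

    def thread_is_alive(self) -> bool:
        return self.checks_thread.is_alive()

worker = Worker()
assert worker.thread_is_alive()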
##############################################
# Cluster config state storage methods
@@ -87,6 +88,16 @@ async def aset_cluster_config(self, cluster_config: Dict[str, Any]):

self.cluster_config = cluster_config

new_cluster_name = self.cluster_config.get("name", None)

if self._cluster_name != new_cluster_name:
self._cluster_name = new_cluster_name
self._cluster_uri = (
rns_client.format_rns_address(self._cluster_name)
if self._cluster_name
else None
)

# Propagate the changes to all other process's obj_stores
await asyncio.gather(
*[
@@ -235,7 +246,9 @@ async def asave_status_metrics_to_den(self, status: dict):

status_data = {
"status": ResourceServerStatus.running,
"resource_type": status_copy.get("cluster_config").pop("resource_type"),
"resource_type": status_copy.get("cluster_config").pop(
"resource_type", "cluster"
),
"resource_info": status_copy,
"env_servlet_processes": env_servlet_processes,
}
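The change above gives dict.pop a default, so a cluster config that lacks a resource_type entry no longer raises. A quick illustration:

config = {"name": "my-cluster"}  # no "resource_type" key

print(config.pop("resource_type", "cluster"))  # prints the fallback "cluster"
# config.pop("resource_type")                  # would raise KeyError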
@@ -251,6 +264,66 @@ async def asave_status_metrics_to_den(self, status: dict):
def save_status_metrics_to_den(self, status: dict):
return sync_function(self.asave_status_metrics_to_den)(status)

async def acheck_cluster_status_and_logs(
self, interval_size: int, send_to_den: bool = True
):

logger.debug("Performing cluster checks")
status, den_resp_status = await self.astatus(send_to_den=send_to_den)
logs_resp_status_code, new_start_log_line, new_end_log_line = None, None, None

if not send_to_den:
return (
den_resp_status,
logs_resp_status_code,
new_start_log_line,
new_end_log_line,
status,
)

status_code_cluster_status = den_resp_status.status_code

if status_code_cluster_status == 404:
logger.info(
"Cluster has not yet been saved to Den, cannot update status or logs."
)
elif status_code_cluster_status != 200:
logger.error(
f"Failed to send cluster status to Den: {den_resp_status.json()}"
)
else:
logger.debug("Successfully sent cluster status to Den.")

cluster_config = await self.aget_cluster_config()
prev_end_log_line = cluster_config.get("end_log_line", 0)
(
logs_resp_status_code,
new_start_log_line,
new_end_log_line,
) = await self.send_cluster_logs_to_den(
prev_end_log_line=prev_end_log_line,
)
if not logs_resp_status_code:
logger.debug(
f"No logs were generated in the past {interval_size} minute(s), logs were not sent to Den."
)

elif logs_resp_status_code == 200:
logger.debug("Successfully sent cluster logs to Den.")
await self.aset_cluster_config_value(
key="start_log_line", value=new_start_log_line
)
await self.aset_cluster_config_value(
key="end_log_line", value=new_end_log_line
)
return (
status_code_cluster_status,
logs_resp_status_code,
new_start_log_line,
new_end_log_line,
status,
)

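acheck_cluster_status_and_logs returns the same five-element tuple whether or not it contacts Den, which keeps callers' unpacking uniform. A hedged sketch of a caller, assuming the Den-related entries come back as None when send_to_den=False:

async def run_one_check(servlet, interval_size: int = 1):
    (
        status_code_cluster_status,
        logs_resp_status_code,
        new_start_log_line,
        new_end_log_line,
        status,
    ) = await servlet.acheck_cluster_status_and_logs(
        interval_size=interval_size, send_to_den=False
    )
    # Nothing was sent to Den, so only the collected status is meaningful here.
    assert status_code_cluster_status is None
    return status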
async def aperiodic_cluster_checks(self):
"""Periodically check the status of the cluster, gather metrics about the cluster's utilization & memory,
and save it to Den."""
Expand All @@ -260,71 +333,34 @@ async def aperiodic_cluster_checks(self):
"status_check_interval", DEFAULT_STATUS_CHECK_INTERVAL
)
while True:
-            try:
-                # Only if one of these is true, do we actually need to get the status from each EnvServlet
-                should_send_status_and_logs_to_den: bool = (
-                    configs.token is not None
-                    and interval_size != -1
-                    and self._cluster_uri is not None
-                )
-                should_update_autostop: bool = self.autostop_helper is not None
+            should_send_status_and_logs_to_den: bool = (
+                configs.token is not None
+                and interval_size != -1
+                and self._cluster_uri is not None
+            )
+            should_update_autostop: bool = self.autostop_helper is not None

-                if (
-                    not should_send_status_and_logs_to_den
-                    and not should_update_autostop
-                ):
-                    break
+            if not should_send_status_and_logs_to_den and not should_update_autostop:
+                break

-                logger.debug("Performing cluster checks")
-                status, den_resp = await self.astatus(
-                    send_to_den=should_send_status_and_logs_to_den
-                )
+            try:
+                (
+                    status_code_cluster_status,
+                    logs_resp_status_code,
+                    new_start_log_line,
+                    new_end_log_line,
+                    status,
+                ) = await self.acheck_cluster_status_and_logs(
+                    interval_size=interval_size,
+                    send_to_den=should_send_status_and_logs_to_den,
+                )

                if should_update_autostop:
                    logger.debug("Updating autostop")
                    await self._update_autostop(status)

-                if not should_send_status_and_logs_to_den:
+                elif not should_send_status_and_logs_to_den:
                    break

-                status_code = den_resp.status_code
-
-                if status_code == 404:
-                    logger.info(
-                        "Cluster has not yet been saved to Den, cannot update status or logs."
-                    )
-                elif status_code != 200:
-                    logger.error(
-                        f"Failed to send cluster status to Den: {den_resp.json()}"
-                    )
-                else:
-                    logger.debug("Successfully sent cluster status to Den.")
-
-                    prev_end_log_line = cluster_config.get("end_log_line", 0)
-                    (
-                        logs_resp_status_code,
-                        new_start_log_line,
-                        new_end_log_line,
-                    ) = await self.send_cluster_logs_to_den(
-                        prev_end_log_line=prev_end_log_line,
-                    )
-                    if not logs_resp_status_code:
-                        logger.debug(
-                            f"No logs were generated in the past {interval_size} minute(s), logs were not sent to Den."
-                        )
-
-                    elif logs_resp_status_code == 200:
-                        logger.debug("Successfully sent cluster logs to Den.")
-                        await self.aset_cluster_config_value(
-                            key="start_log_line", value=new_start_log_line
-                        )
-                        await self.aset_cluster_config_value(
-                            key="end_log_line", value=new_end_log_line
-                        )
-                        # since we are setting a new values to the cluster_config, we need to reload it so the next
-                        # cluster check iteration will reference to the updated cluster config.
-                        cluster_config = await self.aget_cluster_config()

except Exception:
self.logger.error(
"Cluster checks have failed.\n"
@@ -525,6 +561,10 @@ def status(self, send_to_den: bool = False):
# Save cluster logs to Den
##############################################
def _get_logs(self):

if not os.path.exists(SERVER_LOGFILE):
return ""

with open(SERVER_LOGFILE) as log_file:
log_lines = log_file.readlines()
cleaned_log_lines = [ColoredFormatter.format_log(line) for line in log_lines]
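Guarding on os.path.exists means a server that has not written a logfile yet reports empty logs instead of raising FileNotFoundError inside the status check. A standalone sketch of the same defensive read (the logfile path here is illustrative):

import os

SERVER_LOGFILE = os.path.expanduser("~/.rh/server.log")  # assumed path, for illustration

def read_logs() -> str:
    # Return an empty string when the logfile does not exist yet,
    # e.g., right after a server restart.
    if not os.path.exists(SERVER_LOGFILE):
        return ""
    with open(SERVER_LOGFILE) as log_file:
        return log_file.read()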
@@ -573,3 +613,6 @@ async def send_cluster_logs_to_den(
)

return resp_status_code, prev_end_log_line, new_end_log_line

def cluster_periodic_thread_is_alive(self):
return self.cluster_checks_thread.is_alive()
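The new helper exposes the liveness of the thread created in __init__, giving tests a cheap probe. A hypothetical assertion a test might make (the servlet handle is assumed):

# Hypothetical probe: the daemon thread started in __init__ should
# still be running during normal operation.
assert servlet.cluster_periodic_thread_is_alive()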
4 changes: 2 additions & 2 deletions runhouse/servers/env_servlet.py
@@ -202,7 +202,7 @@ def _get_env_cpu_usage(self, cluster_config: dict = None):

if not cluster_config.get("resource_subtype") == "Cluster":
stable_internal_external_ips = cluster_config.get(
"stable_internal_external_ips"
"stable_internal_external_ips", []
)
for ips_set in stable_internal_external_ips:
internal_ip, external_ip = ips_set[0], ips_set[1]
@@ -214,7 +214,7 @@ def _get_env_cpu_usage(self, cluster_config: dict = None):
node_name = f"worker_{stable_internal_external_ips.index(ips_set)} ({external_ip})"
else:
# a case it is a BYO cluster, assume that first ip in the ips list is the head.
-            ips = cluster_config.get("ips")
+            ips = cluster_config.get("ips", [])
if len(ips) == 1 or node_ip == ips[0]:
node_name = f"head ({node_ip})"
else:
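Both changes in this file replace dict.get(key) with dict.get(key, []) so that iteration and indexing degrade gracefully when a cluster config omits the key. A quick illustration of the failure mode being avoided:

cluster_config = {}  # e.g., a saved config that omits the "ips" entry

ips = cluster_config.get("ips", [])
print(len(ips))  # 0, so len() and iteration are safe

# With the old code, cluster_config.get("ips") would return None,
# and len(None) would raise a TypeError during the status check.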
105 changes: 8 additions & 97 deletions tests/test_resources/test_clusters/test_on_demand_cluster.py
@@ -2,13 +2,8 @@
import time

import pytest
-import requests

import runhouse as rh
-from runhouse.constants import SERVER_LOGFILE_PATH
-from runhouse.globals import rns_client
-from runhouse.logger import ColoredFormatter
-from runhouse.resources.hardware.utils import ResourceServerStatus

import tests.test_resources.test_clusters.test_cluster
from tests.utils import friend_account
@@ -201,97 +196,13 @@ def test_fn_to_docker_container(self, ondemand_aws_cluster):
# Status tests
####################################################################################################

-    @pytest.mark.level("local")
-    @pytest.mark.clustertest
-    def test_status_scheduler_basic_flow(self, cluster):
-
-        cluster_uri = rh.globals.rns_client.format_rns_address(cluster.rns_address)
-        api_server_url = rh.globals.rns_client.api_server_url
-        headers = rh.globals.rns_client.request_headers()
-
-        get_status_data_resp = requests.get(
-            f"{api_server_url}/resource/{cluster_uri}/cluster/status?limit=1",
-            headers=headers,
-        )
-
-        assert get_status_data_resp.status_code == 200
-        resp_data = get_status_data_resp.json().get("data")[0]
-        assert resp_data.get("status") == ResourceServerStatus.running
-
-        # TODO [sb]: call cluster's /logs endpoint once introduced.
-        cluster_logs = cluster.run([f"cat {SERVER_LOGFILE_PATH}"], stream_logs=False)[
-            0
-        ][1].split(
-            "\n"
-        )  # create list of lines
-        cluster_logs = [
-            ColoredFormatter.format_log(log) for log in cluster_logs
-        ]  # clean log formatting
-        cluster_logs = "\n".join(cluster_logs)  # make logs list into one string
-
-        assert "Cluster checks have failed" not in cluster_logs
-        assert "Failed to send cluster status to Den: " not in cluster_logs
-
    @pytest.mark.level("minimal")
+    @pytest.mark.skip("Test requires terminating the cluster")
-    def test_set_status_after_teardown(self, cluster):
+    def test_set_status_after_teardown(self, cluster, mocker):
+        mock_function = mocker.patch("sky.down")
+        response = cluster.teardown()
+        assert isinstance(response, int)
+        assert (
+            response == 200
+        )  # that means that the call to post status endpoint in den was successful
+        mock_function.assert_called_once()
-        assert cluster.is_up()
-        cluster_config = cluster.config()
-        cluster_uri = rns_client.format_rns_address(cluster.rns_address)
-        api_server_url = cluster_config.get("api_server_url", rns_client.api_server_url)
-        cluster.teardown()
-        get_status_data_resp = requests.get(
-            f"{api_server_url}/resource/{cluster_uri}/cluster/status",
-            headers=rns_client.request_headers(),
-        )
-
-        assert get_status_data_resp.status_code == 200
-        # For UI displaying purposes, the cluster/status endpoint returns cluster status history.
-        # The latest status info is the first element in the list returned by the endpoint.
-        get_status_data = get_status_data_resp.json()["data"][0]
-        assert get_status_data["resource_type"] == cluster_config.get("resource_type")
-        assert get_status_data["status"] == ResourceServerStatus.terminated
-
-    ####################################################################################################
-    # Logs surfacing tests
-    ####################################################################################################
-    @pytest.mark.level("minimal")
-    def test_logs_surfacing_scheduler_basic_flow(self, cluster):
-        cluster_uri = rh.globals.rns_client.format_rns_address(cluster.rns_address)
-        headers = rh.globals.rns_client.request_headers()
-        api_server_url = rh.globals.rns_client.api_server_url
-
-        get_logs_data_resp = requests.get(
-            f"{api_server_url}/resource/{cluster_uri}/logs",
-            headers=headers,
-        )
-
-        # TODO [sb]: call cluster's /logs endpoint once introduced.
-        cluster_logs = cluster.run([f"cat {SERVER_LOGFILE_PATH}"], stream_logs=False)[
-            0
-        ][1].split(
-            "\n"
-        )  # create list of lines
-        cluster_logs = [
-            ColoredFormatter.format_log(log) for log in cluster_logs
-        ]  # clean log formatting
-        cluster_logs = "\n".join(cluster_logs)  # make logs list into one string
-
-        assert "Cluster checks have failed" not in cluster_logs
-        assert "Failed to send cluster logs to Den: " not in cluster_logs
-
-        cluster_status = cluster.status()
-        logs_start_line = cluster_status.get("cluster_config").get(
-            "start_log_line", None
-        )
-        logs_end_line = cluster_status.get("cluster_config").get("end_log_line", None)
-
-        assert logs_start_line
-        assert logs_end_line
-        assert logs_end_line > logs_start_line
-
-        resp_data = get_logs_data_resp.json().get("data")
-        assert get_logs_data_resp.status_code == 200
-        cluster_logs_from_s3 = resp_data["logs_text"][0][1:].replace("\n ", "\n")
-        assert cluster_logs_from_s3 in cluster_logs
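The rewritten test uses pytest-mock's mocker fixture to stub out the real SkyPilot teardown, so the Den status path can be exercised without terminating hardware. A self-contained sketch of the same pattern (the test name and the 200 expectation follow the diff above; the rest is illustrative):

# Requires the pytest-mock plugin, which provides the mocker fixture.
def test_teardown_reports_to_den(cluster, mocker):
    mock_down = mocker.patch("sky.down")  # no real cluster is torn down

    response = cluster.teardown()

    assert response == 200  # Den accepted the terminated-status update
    mock_down.assert_called_once()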