diff --git a/python/fedml/computing/scheduler/master/server_runner_deprecated.py b/python/fedml/computing/scheduler/master/server_runner_deprecated.py deleted file mode 100755 index 238349a3e..000000000 --- a/python/fedml/computing/scheduler/master/server_runner_deprecated.py +++ /dev/null @@ -1,2775 +0,0 @@ -import base64 -import copy -import json -import logging -import platform -import queue -import sys - -import multiprocessing -from multiprocessing import Process, Queue, Value, Array -import os -import shutil -import stat -import subprocess -import threading - -import time -import traceback -import urllib -import uuid -import zipfile -from os import listdir -from urllib.parse import urljoin, urlparse - -import requests - -import fedml -from ..comm_utils.job_cleanup import JobCleanup -from ..scheduler_core.scheduler_matcher import SchedulerMatcher -from ..comm_utils.constants import SchedulerConstants -from ..comm_utils.job_utils import JobRunnerUtils -from ..comm_utils.run_process_utils import RunProcessUtils -from ....core.mlops.mlops_runtime_log import MLOpsRuntimeLog - -from ....core.distributed.communication.mqtt.mqtt_manager import MqttManager -from ..comm_utils.yaml_utils import load_yaml_config -from ..slave.client_constants import ClientConstants -from ..master.server_constants import ServerConstants - -from ....core.mlops.mlops_metrics import MLOpsMetrics - -from ....core.mlops.mlops_configs import MLOpsConfigs -from ....core.mlops.mlops_runtime_log_daemon import MLOpsRuntimeLogDaemon -from ....core.mlops.mlops_status import MLOpsStatus -from ..comm_utils.sys_utils import get_sys_runner_info, get_python_program -from ..comm_utils import sys_utils -from .server_data_interface import FedMLServerDataInterface -from ....core.mlops.mlops_utils import MLOpsUtils -from ..scheduler_entry.constants import Constants -from ..model_scheduler.model_device_server import FedMLModelDeviceServerRunner -from ..model_scheduler.device_model_cards import FedMLModelCards -from ..model_scheduler import device_client_constants -from ..scheduler_core.log_manager import LogsManager -from ..scheduler_core.metrics_manager import MetricsManager -from ..scheduler_core.master_api_daemon import MasterApiDaemon -from fedml.utils.debugging import debug -from ..scheduler_core.message_center import FedMLMessageCenter -import ssl - - -class RunnerError(Exception): - """ Runner stopped. """ - pass - - -class RunnerCompletedError(Exception): - """ Runner completed. 
""" - pass - - -class FedMLServerRunner(FedMLMessageCenter): - FEDML_CLOUD_SERVER_PREFIX = "fedml-server-run-" - debug_cloud_server = False - - def __init__(self, args, run_id=0, request_json=None, agent_config=None, edge_id=0): - super().__init__() - self.master_api_daemon = None - self.run_stop_process = None - self.run_stop_process_map = dict() - self.run_edge_id_status_queue_map = dict() - self.run_metrics_queue_map = dict() - self.run_events_queue_map = dict() - self.run_artifacts_queue_map = dict() - self.run_logs_queue_map = dict() - self.async_check_timeout = 0 - self.enable_async_cluster = False - self.origin_fedml_config_object = None - self.package_type = SchedulerConstants.JOB_PACKAGE_TYPE_DEFAULT - self.local_api_process = None - self.run_process_event = None - self.run_process_event_map = dict() - self.run_process_completed_event = None - self.run_process_completed_event_map = dict() - self.run_process_event_map_for_stop = dict() - self.edge_device_info_queue = None - self.run_edge_device_info_queue_map = dict() - self.run_edge_device_info_queue_map_for_stop = dict() - self.run_edge_device_info_global_queue = None - self.run_edge_device_info_global_queue_for_stop = None - self.run_process = None - self.run_process_map = dict() - self.start_request_json = None - self.server_docker_image = None - self.cloud_server_name = None - self.run_as_cloud_agent = False - self.run_as_cloud_server = False - self.run_as_edge_server_and_agent = False - self.run_as_cloud_server_and_agent = False - self.fedml_packages_base_dir = None - self.fedml_packages_unzip_dir = None - self.mqtt_mgr = None - self.running_request_json = dict() - self.run_id = run_id - self.unique_device_id = None - self.edge_id = edge_id - self.server_agent_id = 0 - if request_json is not None: - self.server_agent_id = request_json.get("server_id", 0) - self.process = None - self.args = args - self.request_json = copy.deepcopy(request_json) - self.version = args.version - self.device_id = args.device_id - self.cur_dir = os.path.split(os.path.realpath(__file__))[0] - if args.current_running_dir is not None: - self.cur_dir = args.current_running_dir - - image_version = self.version - if image_version == "local": - image_version = "dev" - self.server_docker_base_image = "/fedml-device-image:" + image_version - - self.agent_config = agent_config - self.fedml_data_base_package_dir = os.path.join("/", "fedml", "data") - self.fedml_data_local_package_dir = os.path.join("/", "fedml", "fedml-package", "fedml", "data") - self.fedml_data_dir = self.fedml_data_base_package_dir - self.fedml_config_dir = os.path.join("/", "fedml", "conf") - - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES = { - "${FEDSYS.RUN_ID}": "", - "${FEDSYS.PRIVATE_LOCAL_DATA}": "", - "${FEDSYS.CLIENT_ID_LIST}": "", - "${FEDSYS.SYNTHETIC_DATA_URL}": "", - "${FEDSYS.IS_USING_LOCAL_DATA}": "", - "${FEDSYS.CLIENT_NUM}": "", - "${FEDSYS.CLIENT_INDEX}": "", - "${FEDSYS.CLIENT_OBJECT_LIST}": "", - "${FEDSYS.LOG_SERVER_URL}": "", - } - - self.mlops_metrics = None - self.client_agent_active_list = dict() - self.server_active_list = dict() - self.run_status = None - self.ntp_offset = MLOpsUtils.get_ntp_offset() - self.runner_list = dict() - self.enable_simulation_cloud_agent = False - self.use_local_process_as_cloud_server = False - - self.model_device_server = None - self.run_model_device_ids = dict() - self.run_edge_ids = dict() - self.master_api_process = None - - self.subscribed_topics = list() - self.user_name = None - self.message_center = None - - def 
build_dynamic_constrain_variables(self, run_id, run_config): - data_config = run_config.get("data_config", {}) - server_edge_id_list = self.request_json["edgeids"] - is_using_local_data = 0 - private_data_dir = data_config.get("privateLocalData", "") - synthetic_data_url = data_config.get("syntheticDataUrl", "") - edges = self.request_json["edges"] - # if private_data_dir is not None \ - # and len(str(private_data_dir).strip(' ')) > 0: - # is_using_local_data = 1 - if private_data_dir is None or len(str(private_data_dir).strip(" ")) <= 0: - params_config = run_config.get("parameters", None) - private_data_dir = ServerConstants.get_data_dir() - if synthetic_data_url is None or len(str(synthetic_data_url)) <= 0: - synthetic_data_url = private_data_dir - - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.RUN_ID}"] = run_id - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.PRIVATE_LOCAL_DATA}"] = private_data_dir.replace(" ", "") - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.CLIENT_ID_LIST}"] = str(server_edge_id_list).replace(" ", "") - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.SYNTHETIC_DATA_URL}"] = synthetic_data_url.replace(" ", "") - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.IS_USING_LOCAL_DATA}"] = str(is_using_local_data) - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.CLIENT_NUM}"] = len(server_edge_id_list) - client_objects = str(json.dumps(edges)) - client_objects = client_objects.replace(" ", "").replace("\n", "").replace('"', '\\"') - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.CLIENT_OBJECT_LIST}"] = client_objects - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.LOG_SERVER_URL}"] = self.agent_config["ml_ops_config"][ - "LOG_SERVER_URL" - ] - - def unzip_file(self, zip_file, unzip_file_path) -> str: - unzipped_file_name = "" - if zipfile.is_zipfile(zip_file): - with zipfile.ZipFile(zip_file, "r") as zipf: - zipf.extractall(unzip_file_path) - unzipped_file_name = zipf.namelist()[0] - else: - raise Exception("Invalid zip file {}".format(zip_file)) - - return unzipped_file_name - - def package_download_progress(self, count, blksize, filesize): - self.check_runner_stop_event() - - downloaded = count * blksize - downloaded = filesize if downloaded > filesize else downloaded - progress = (downloaded / filesize * 100) if filesize != 0 else 0 - progress_int = int(progress) - downloaded_kb = format(downloaded / 1024, '.2f') - - # since this hook function is stateless, we need a state to avoid printing progress repeatedly - if count == 0: - self.prev_download_progress = 0 - if progress_int != self.prev_download_progress and progress_int % 5 == 0: - self.prev_download_progress = progress_int - logging.info("package downloaded size {} KB, progress {}%".format(downloaded_kb, progress_int)) - - def retrieve_and_unzip_package(self, package_name, package_url): - local_package_path = ServerConstants.get_package_download_dir() - os.makedirs(local_package_path, exist_ok=True) - filename, filename_without_extension, file_extension = ServerConstants.get_filename_and_extension(package_url) - local_package_file = os.path.join(local_package_path, f"fedml_run_{self.run_id}_{filename_without_extension}") - if os.path.exists(local_package_file): - os.remove(local_package_file) - ssl._create_default_https_context = ssl._create_unverified_context - urllib.request.urlretrieve(package_url, local_package_file, - reporthook=self.package_download_progress) - unzip_package_path = os.path.join(ClientConstants.get_package_unzip_dir(), -
f"unzip_fedml_run_{self.run_id}_{filename_without_extension}") - try: - shutil.rmtree(unzip_package_path, ignore_errors=True) - except Exception as e: - pass - - package_dir_name = self.unzip_file(local_package_file, unzip_package_path) # use the unzipped folder name - unzip_package_full_path = os.path.join(unzip_package_path, package_dir_name) - - logging.info("local_package_file {}, unzip_package_path {}, unzip file full path {}".format( - local_package_file, unzip_package_path, unzip_package_full_path)) - - return unzip_package_full_path - - def update_local_fedml_config(self, run_id, run_config): - packages_config = run_config["packages_config"] - - # Copy config file from the client - server_package_name = packages_config.get("server", None) - server_package_url = packages_config.get("serverUrl", None) - unzip_package_path = self.retrieve_and_unzip_package(server_package_name, server_package_url) - self.fedml_packages_unzip_dir = unzip_package_path - fedml_local_config_file = os.path.join(unzip_package_path, "conf", "fedml.yaml") - - # Load the above config into memory - config_from_container = load_yaml_config(fedml_local_config_file) - container_entry_file_config = config_from_container["entry_config"] - container_dynamic_args_config = config_from_container["dynamic_args"] - entry_file = container_entry_file_config["entry_file"] - conf_file = container_entry_file_config["conf_file"] - self.package_type = container_entry_file_config.get("package_type", SchedulerConstants.JOB_PACKAGE_TYPE_DEFAULT) - full_conf_path = os.path.join(unzip_package_path, "fedml", "config", os.path.basename(conf_file)) - - # Dynamically build the constrain variables with realtime parameters from the server - self.build_dynamic_constrain_variables(run_id, run_config) - - # Update entry argument values with the constrain variable values filled in with realtime parameters from the server - # currently we support the following constrain variables: - # ${FEDSYS.RUN_ID}: a run id representing one entire Federated Learning flow - # ${FEDSYS.PRIVATE_LOCAL_DATA}: private local data path in the Federated Learning client - # ${FEDSYS.CLIENT_ID_LIST}: client list in one entire Federated Learning flow - # ${FEDSYS.SYNTHETIC_DATA_URL}: synthetic data url from the server; - # if this value is not null, the client will download data from this URL to use as the - # federated training data set - # ${FEDSYS.IS_USING_LOCAL_DATA}: whether to use private local data as the federated training data set - # container_dynamic_args_config["data_cache_dir"] = "${FEDSYS.PRIVATE_LOCAL_DATA}" - for constrain_variable_key, constrain_variable_value in self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES.items(): - for argument_key, argument_value in container_dynamic_args_config.items(): - if argument_value is not None and str(argument_value).find(constrain_variable_key) == 0: - replaced_argument_value = str(argument_value).replace( - constrain_variable_key, str(constrain_variable_value) - ) - container_dynamic_args_config[argument_key] = replaced_argument_value - - # Merge all new container config sections into a new config dictionary - package_conf_object = dict() - package_conf_object["entry_config"] = container_entry_file_config - package_conf_object["dynamic_args"] = container_dynamic_args_config - package_conf_object["dynamic_args"]["config_version"] = self.args.config_version - container_dynamic_args_config["mqtt_config_path"] = os.path.join( - unzip_package_path, "fedml", "config", os.path.basename(container_dynamic_args_config["mqtt_config_path"]) - ) -
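- # Rebase the packaged s3 config path into the unzipped package directory, just like the mqtt config path above.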
container_dynamic_args_config["s3_config_path"] = os.path.join( - unzip_package_path, "fedml", "config", os.path.basename(container_dynamic_args_config["s3_config_path"]) - ) - log_file_dir = ServerConstants.get_log_file_dir() - os.makedirs(log_file_dir, exist_ok=True) - package_conf_object["dynamic_args"]["log_file_dir"] = log_file_dir - - # Save new config dictionary to local file - fedml_updated_config_file = os.path.join(unzip_package_path, "conf", "fedml.yaml") - ServerConstants.generate_yaml_doc(package_conf_object, fedml_updated_config_file) - - # Build dynamic arguments and set arguments to fedml config object - if not self.build_dynamic_args(run_id, run_config, package_conf_object, unzip_package_path): - return None, None - - return unzip_package_path, package_conf_object - - def build_dynamic_args(self, run_id, run_config, package_conf_object, base_dir): - fedml_conf_file = package_conf_object["entry_config"]["conf_file"] - fedml_conf_file_processed = str(fedml_conf_file).replace('\\', os.sep).replace('/', os.sep) - fedml_conf_path = os.path.join(base_dir, "fedml", "config", - os.path.basename(fedml_conf_file_processed)) - fedml_conf_object = load_yaml_config(fedml_conf_path) - self.origin_fedml_config_object = fedml_conf_object.copy() - run_params = run_config.get("parameters", {}) - job_yaml = run_params.get("job_yaml", {}) - - # Replace local fedml config objects with parameters from MLOps web - parameters_object = run_config.get("parameters", None) - if parameters_object is not None: - for config_k, config_v in fedml_conf_object.items(): - parameter_v = parameters_object.get(config_k, None) - if parameter_v is not None: - fedml_conf_object[config_k] = parameter_v - parameters_object.pop(config_k) - - for config_k, config_v in parameters_object.items(): - fedml_conf_object[config_k] = config_v - - package_dynamic_args = package_conf_object["dynamic_args"] - if fedml_conf_object.get("comm_args", None) is not None: - fedml_conf_object["comm_args"]["mqtt_config_path"] = package_dynamic_args["mqtt_config_path"] - fedml_conf_object["comm_args"]["s3_config_path"] = package_dynamic_args["s3_config_path"] - fedml_conf_object["common_args"]["using_mlops"] = True - if fedml_conf_object.get("train_args", None) is not None: - fedml_conf_object["train_args"]["run_id"] = package_dynamic_args["run_id"] - fedml_conf_object["train_args"]["client_id_list"] = package_dynamic_args["client_id_list"] - fedml_conf_object["train_args"]["client_num_in_total"] = int(package_dynamic_args["client_num_in_total"]) - fedml_conf_object["train_args"]["client_num_per_round"] = int(package_dynamic_args["client_num_in_total"]) - fedml_conf_object["train_args"]["server_id"] = self.edge_id - fedml_conf_object["train_args"]["server_agent_id"] = self.request_json.get("cloud_agent_id", self.edge_id) - fedml_conf_object["train_args"]["group_server_id_list"] = self.request_json.get("group_server_id_list", - list()) - if fedml_conf_object.get("device_args", None) is not None: - fedml_conf_object["device_args"]["worker_num"] = int(package_dynamic_args["client_num_in_total"]) - # fedml_conf_object["data_args"]["data_cache_dir"] = package_dynamic_args["data_cache_dir"] - if fedml_conf_object.get("tracking_args", None) is not None: - fedml_conf_object["tracking_args"]["log_file_dir"] = package_dynamic_args["log_file_dir"] - fedml_conf_object["tracking_args"]["log_server_url"] = package_dynamic_args["log_server_url"] - - bootstrap_script_path = None - env_args = fedml_conf_object.get("environment_args", None) - if env_args 
is not None: - bootstrap_script_file = env_args.get("bootstrap", None) - if bootstrap_script_file is not None: - bootstrap_script_file = str(bootstrap_script_file).replace('\\', os.sep).replace('/', os.sep) - if platform.system() == 'Windows' and bootstrap_script_file.endswith('.sh'): - bootstrap_script_file = bootstrap_script_file[:-len('.sh')] + '.bat' - if bootstrap_script_file is not None: - bootstrap_script_dir = os.path.join(base_dir, "fedml", os.path.dirname(bootstrap_script_file)) - bootstrap_script_path = os.path.join( - bootstrap_script_dir, os.path.basename(bootstrap_script_file) - ) - # try: - # os.makedirs(package_dynamic_args["data_cache_dir"], exist_ok=True) - # except Exception as e: - # pass - fedml_conf_object["dynamic_args"] = package_dynamic_args - - ServerConstants.generate_yaml_doc(fedml_conf_object, fedml_conf_path) - - is_bootstrap_run_ok = True - try: - if bootstrap_script_path is not None: - if os.path.exists(bootstrap_script_path): - bootstrap_stat = os.stat(bootstrap_script_path) - if platform.system() == 'Windows': - os.chmod(bootstrap_script_path, - bootstrap_stat.st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) - bootstrap_scripts = "{}".format(bootstrap_script_path) - else: - os.chmod(bootstrap_script_path, - bootstrap_stat.st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) - bootstrap_scripts = "cd {}; ./{}".format(bootstrap_script_dir, - os.path.basename(bootstrap_script_file)) - bootstrap_scripts = str(bootstrap_scripts).replace('\\', os.sep).replace('/', os.sep) - logging.info("Bootstrap scripts are being executed...") - shell_cmd_list = list() - shell_cmd_list.append(bootstrap_scripts) - process, error_list = ServerConstants.execute_commands_with_live_logs( - shell_cmd_list, callback=self.callback_run_bootstrap) - - ret_code, out, err = process.returncode, None, None - if ret_code is None or ret_code <= 0: - if error_list is not None and len(error_list) > 0: - is_bootstrap_run_ok = False - else: - if out is not None: - out_str = sys_utils.decode_our_err_result(out) - if out_str != "": - logging.info("{}".format(out_str)) - - sys_utils.log_return_info(bootstrap_script_file, 0) - - is_bootstrap_run_ok = True - else: - if err is not None: - err_str = sys_utils.decode_our_err_result(err) - if err_str != "": - logging.error("{}".format(err_str)) - - sys_utils.log_return_info(bootstrap_script_file, ret_code) - - is_bootstrap_run_ok = False - except Exception as e: - logging.error("Bootstrap scripts error: {}".format(traceback.format_exc())) - - is_bootstrap_run_ok = False - - return is_bootstrap_run_ok - - def callback_run_bootstrap(self, job_pid): - ServerConstants.save_bootstrap_process(self.run_id, job_pid) - - @debug - def run( - self, process_event, completed_event, edge_id_status_queue=None, - edge_device_info_queue=None, run_metrics_queue=None, - run_event_queue=None, run_artifacts_queue=None, run_logs_queue=None, - message_center_queue=None, edge_device_info_global_queue=None - ): - print(f"Server runner process id {os.getpid()}, run id {self.run_id}") - - if platform.system() != "Windows": - os.setsid() - - os.environ['PYTHONWARNINGS'] = 'ignore:semaphore_tracker:UserWarning' - os.environ.setdefault('PYTHONWARNINGS', 'ignore:semaphore_tracker:UserWarning') - - self.run_process_event = process_event - self.run_process_completed_event = completed_event - try: - MLOpsUtils.set_ntp_offset(self.ntp_offset) - - self.rebuild_message_center(message_center_queue) - - self.run_impl(edge_id_status_queue, edge_device_info_queue, run_metrics_queue, - run_event_queue,
run_artifacts_queue, run_logs_queue, edge_device_info_global_queue) - except RunnerError: - logging.info("Runner stopped.") - self.mlops_metrics.report_server_id_status( - self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.edge_id) - except RunnerCompletedError: - logging.info("Runner completed.") - except Exception as e: - logging.error("Runner exits with exceptions. {}".format(traceback.format_exc())) - self.mlops_metrics.report_server_id_status( - self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.edge_id) - finally: - logging.info("Release resources.") - self._process_run_metrics_queue(run_metrics_queue) - self._process_run_logs_queue(run_logs_queue) - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(self.run_id, self.edge_id) - if self.mlops_metrics is not None: - self.mlops_metrics.stop_sys_perf() - time.sleep(3) - ServerConstants.cleanup_run_process(self.run_id) - ServerConstants.cleanup_learning_process(self.run_id) - ServerConstants.cleanup_bootstrap_process(self.run_id) - - def check_runner_stop_event(self): - if self.run_process_event is not None and self.run_process_event.is_set(): - logging.info("Received stopping event.") - raise RunnerError("Runner stopped") - - if self.run_process_completed_event is not None and self.run_process_completed_event.is_set(): - logging.info("Received completed event.") - raise RunnerCompletedError("Runner completed") - - def deploy_model(self, serving_devices, request_json, run_id): - run_config = request_json["run_config"] - run_params = run_config.get("parameters", {}) - job_yaml = run_params.get("job_yaml", {}) - job_type = job_yaml.get("job_type", None) - job_type = job_yaml.get("task_type", Constants.JOB_TASK_TYPE_TRAIN) if job_type is None else job_type - if job_type == Constants.JOB_TASK_TYPE_DEPLOY or job_type == Constants.JOB_TASK_TYPE_SERVE: - computing = job_yaml.get("computing", {}) - num_gpus = computing.get("minimum_num_gpus", 1) - serving_args = run_params.get("serving_args", {}) - model_id = serving_args.get("model_id", None) - model_name = serving_args.get("model_name", None) - model_version = serving_args.get("model_version", None) - model_storage_url = serving_args.get("model_storage_url", None) - endpoint_name = serving_args.get("endpoint_name", None) - endpoint_id = serving_args.get("endpoint_id", None) - random = serving_args.get("random", "") - random_out = sys_utils.random2(random, "FEDML@9999GREAT") - random_list = random_out.split("FEDML@") - device_type = device_client_constants.ClientConstants.login_role_list[ - device_client_constants.ClientConstants.LOGIN_MODE_FEDML_CLOUD_INDEX] - FedMLModelCards.get_instance().deploy_model( - model_name, device_type, json.dumps(serving_devices), - "", random_list[1], None, - in_model_id=model_id, in_model_version=model_version, - endpoint_name=endpoint_name, endpoint_id=endpoint_id, run_id=run_id) - - @debug - def run_impl( - self, edge_id_status_queue, edge_device_info_queue, run_metrics_queue, - run_event_queue, run_artifacts_queue, run_logs_queue, edge_device_info_global_queue - ): - run_id = self.request_json["runId"] - run_config = self.request_json["run_config"] - data_config = run_config["data_config"] - edge_ids = self.request_json["edgeids"] - - self.check_runner_stop_event() - - self.run_id = run_id - self.args.run_id = self.run_id - 
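- # Re-initialize the runtime logger under the new run id before reporting the starting status to MLOps.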
MLOpsRuntimeLog.get_instance(self.args).init_logs(log_level=logging.INFO) - - # report server running status - self.mlops_metrics.report_server_id_status( - run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_STARTING, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.edge_id) - - logging.info("Detect all status of Edge ids: " + str(edge_ids)) - - status_ok, active_edge_info_dict, inactivate_edges = self.detect_edges_status( - edge_device_info_queue, edge_device_info_global_queue=edge_device_info_global_queue, - callback_when_edges_ready=self.send_training_request_to_edges) - logging.info(f"Status OK: {status_ok}, Active edge info dict: {active_edge_info_dict}, " - f"inactivate edges: {inactivate_edges}") - if not status_ok: - logging.error(f"Status of edge device is not OK. Active edge info dict: {active_edge_info_dict}, " - f"Inactivate edges: {inactivate_edges}") - return - - if not self.should_continue_run_job(run_id): - if FedMLServerRunner.debug_cloud_server: - while True: - time.sleep(30) - # Check if the run status is normal - self.aggregate_run_status_metrics_logs( - run_id, edge_ids, edge_id_status_queue, edge_device_info_queue, - edge_device_info_global_queue, - run_metrics_queue, run_logs_queue) - return - - # Start the server job - self._start_runner_process(run_id, self.request_json, is_server_job=True) - - # Check if the run status is normal - self.aggregate_run_status_metrics_logs( - run_id, edge_ids, edge_id_status_queue, edge_device_info_queue, - edge_device_info_global_queue, - run_metrics_queue, run_logs_queue) - - def aggregate_run_status_metrics_logs( - self, run_id, edge_id_list, edge_id_status_queue, edge_device_info_queue, - edge_device_info_global_queue, run_metrics_queue, run_logs_queue): - total_sleep_seconds = 0 - sleep_seconds = 3 - allowed_status_check_sleep_seconds = 60 * 25 - server_id = self.edge_id - normal_response_status_list = [ - ClientConstants.MSG_MLOPS_CLIENT_STATUS_OFFLINE, ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE, - ClientConstants.MSG_MLOPS_CLIENT_STATUS_TRAINING, ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED, - ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED, - ClientConstants.MSG_MLOPS_CLIENT_STATUS_EXCEPTION, ClientConstants.MSG_MLOPS_CLIENT_STATUS_RUNNING - ] - edges_id_status_timeout_map = dict() - number_of_failed_edges = 0 - number_of_finished_edges = 0 - number_of_killed_edges = 0 - running_edges_list = list() - inactivate_edge_list = list() - current_edge_id_status_map = dict() - - while True: - self.check_runner_stop_event() - - # Process run metrics - self._process_run_metrics_queue(run_metrics_queue) - - # Process run logs - self._process_run_logs_queue(run_logs_queue) - - # Fetch edge id and status from the edge id status queue - while True: - try: - queue_item = edge_id_status_queue.get(block=False, timeout=3) - if queue_item is not None: - current_edge_id_status_map.update(queue_item) - except queue.Empty as e: # If queue is empty, then break loop - break - - # Calc the total completed device number - server_id = current_edge_id_status_map.get("server", 0) - running_edges_list.clear() - number_of_failed_edges = 0 - number_of_finished_edges = 0 - number_of_killed_edges = 0 - for edge_id_item, status_item in current_edge_id_status_map.items(): - if edge_id_item == "server": - continue - - if status_item is None or status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED or \ - status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_EXCEPTION: - 
number_of_failed_edges += 1 - continue - - if status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED: - number_of_finished_edges += 1 - continue - - if status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED: - number_of_killed_edges += 1 - continue - - if status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE or \ - status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_OFFLINE: - continue - - running_edges_list.append(edge_id_item) - - # Process the non-responsive edges and accumulate their counters. - for edge_id_item in edge_id_list: - status_dict = edges_id_status_timeout_map.get(str(edge_id_item)) - status_item = current_edge_id_status_map.get(str(edge_id_item)) - if status_item is None: - continue - if status_dict is None: - status_dict = {"status": status_item, "count": 0} - else: - if status_item in normal_response_status_list: - status_dict["count"] = 0 - else: - status_dict["count"] += 1 - edges_id_status_timeout_map[str(edge_id_item)] = status_dict - - # If the number of completed devices equals the total device number, then break - if len(running_edges_list) <= 0 and len(current_edge_id_status_map.keys()) == len(edge_id_list) + 1: - break - - # Sleep, then collect the edges whose non-response time has exceeded the allowed timeout. - self.check_runner_stop_event() - time.sleep(sleep_seconds) - total_sleep_seconds += sleep_seconds - no_response_edge_ids = list() - for no_res_edge, no_res_status in edges_id_status_timeout_map.items(): - if no_res_status.get("count") * sleep_seconds > allowed_status_check_sleep_seconds: - no_response_edge_ids.append(no_res_edge) - - # If timed out, report the killed device status - if len(no_response_edge_ids) > 0: - for edge_id_item in no_response_edge_ids: - self.mlops_metrics.report_client_id_status( - edge_id_item, ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED, - server_id=self.edge_id, run_id=self.run_id) - - # Check if we can get the response device info from edge devices - # and set the inactive edges to offline status. - self.check_runner_stop_event() - given_edge_ids = list(set(edge_id_list) - set(inactivate_edge_list)) - status_ok, active_edge_info_dict, inactivate_edges = self.detect_edges_status( - edge_device_info_queue, edge_device_info_global_queue=edge_device_info_global_queue, - need_to_trigger_exception=False, status_timeout=60, - given_edge_ids=given_edge_ids, callback_when_detecting=self.callback_when_detecting_on_aggregation, - args_for_callback_when_detecting=(run_metrics_queue, run_logs_queue) - ) - if not status_ok: - inactivate_edge_list.extend(inactivate_edges) - for edge_id_item in inactivate_edges: - self.mlops_metrics.report_client_id_status( - edge_id_item, ClientConstants.MSG_MLOPS_CLIENT_STATUS_OFFLINE, - server_id=self.edge_id, run_id=self.run_id) - - # Calculate the final run status from the completed device counts and the fault tolerance parameters.
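- # With fault tolerance enabled, the run is only marked failed once the failed-edge ratio reaches fault_tolerance_rate; otherwise any failed edge fails the whole run.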
- enable_fault_tolerance, fault_tolerance_rate = self.parse_fault_tolerance_params(run_id) - running_edges_list = list(set(running_edges_list)) - status_to_report = self.calculate_server_status( - run_id, len(edge_id_list), number_of_failed_edges, number_of_finished_edges, - number_of_killed_edges, running_edges_list, enable_fault_tolerance=enable_fault_tolerance, - fault_tolerance_rate=fault_tolerance_rate) - if status_to_report is not None: - logging.info( - f"Run completed when aggregating status, metrics and logs, will report status {status_to_report}") - self.mlops_metrics.report_server_id_status( - self.run_id, status_to_report, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.edge_id) - - def callback_when_detecting_on_aggregation(self, detecting_args): - # Process run metrics - self._process_run_metrics_queue(detecting_args[0]) - - # Process run logs - self._process_run_logs_queue(detecting_args[1]) - - def _process_run_metrics_queue(self, run_metrics_queue): - # Fetch metrics from the run metrics queue - while True: - try: - metrics_item = run_metrics_queue.get(block=False, timeout=3) - MetricsManager.get_instance().save_metrics(metrics_item) - metric_json = json.loads(metrics_item) - if metric_json.get("is_endpoint", False): - metric_json.pop("is_endpoint") - self.mlops_metrics.report_endpoint_metric({}, payload=json.dumps(metric_json)) - else: - self.mlops_metrics.report_server_training_metric({}, payload=metrics_item) - except queue.Empty as e: # If queue is empty, then break loop - break - - def _process_run_logs_queue(self, run_logs_queue): - # Fetch logs from the run logs queue - while True: - try: - logs_item = run_logs_queue.get(block=False, timeout=3) - LogsManager.save_logs(logs_item) - except queue.Empty as e: # If queue is empty, then break loop - break - - def run_server_job_impl(self, process_event, completed_event, edge_id_status_queue=None, - edge_device_info_queue=None, run_metrics_queue=None, - run_event_queue=None, run_artifacts_queue=None, run_logs_queue=None, - message_center_queue=None, edge_device_info_global_queue=None): - print(f"Server runner process id {os.getpid()}, run id {self.run_id}") - - if platform.system() != "Windows": - os.setsid() - - os.environ['PYTHONWARNINGS'] = 'ignore:semaphore_tracker:UserWarning' - os.environ.setdefault('PYTHONWARNINGS', 'ignore:semaphore_tracker:UserWarning') - - self.run_process_event = process_event - self.run_process_completed_event = completed_event - - MLOpsUtils.set_ntp_offset(self.ntp_offset) - - self.rebuild_message_center(message_center_queue) - - run_id = self.request_json["runId"] - run_config = self.request_json["run_config"] - data_config = run_config["data_config"] - edge_ids = self.request_json["edgeids"] - - self.check_runner_stop_event() - - # get training params - private_local_data_dir = data_config.get("privateLocalData", "") - is_using_local_data = 0 - # if private_local_data_dir is not None and len(str(private_local_data_dir).strip(' ')) > 0: - # is_using_local_data = 1 - - # start a run according to the hyper-parameters - # fedml_local_data_dir = self.cur_dir + "/fedml_data/run_" + run_id_str + "_edge_" + str(edge_id) - fedml_local_data_dir = os.path.join(self.cur_dir, "fedml_data") - fedml_local_config_dir = os.path.join(self.cur_dir, "fedml_config") - if is_using_local_data: - fedml_local_data_dir = private_local_data_dir - self.fedml_data_dir = self.fedml_data_local_package_dir - - self.check_runner_stop_event() - - logging.info("download packages and run the bootstrap
script...") - - # update local config with real time parameters from server and dynamically replace variables value - unzip_package_path, fedml_config_object = self.update_local_fedml_config(run_id, run_config) - if unzip_package_path is None or fedml_config_object is None: - logging.info("failed to update local fedml config.") - self.check_runner_stop_event() - self.cleanup_run_when_starting_failed() - self.send_training_stop_request_to_edges_when_exception(edge_ids, payload=self.start_request_json, - run_id=run_id) - return - - logging.info("cleanup the previous aggregation process and check downloaded packages...") - - entry_file_config = fedml_config_object["entry_config"] - dynamic_args_config = fedml_config_object["dynamic_args"] - entry_file = str(entry_file_config["entry_file"]).replace('\\', os.sep).replace('/', os.sep) - entry_file = os.path.basename(entry_file) - conf_file = entry_file_config["conf_file"] - conf_file = str(conf_file).replace('\\', os.sep).replace('/', os.sep) - ServerConstants.cleanup_learning_process(run_id) - self.check_runner_stop_event() - if not os.path.exists(unzip_package_path): - logging.info("failed to unzip file.") - self.check_runner_stop_event() - self.cleanup_run_when_starting_failed() - self.send_training_stop_request_to_edges_when_exception(edge_ids, payload=self.start_request_json, - run_id=run_id) - return - os.chdir(os.path.join(unzip_package_path, "fedml")) - - self.check_runner_stop_event() - - logging.info("starting the server user process...") - - entry_file_full_path = os.path.join(unzip_package_path, "fedml", entry_file) - conf_file_full_path = os.path.join(unzip_package_path, "fedml", conf_file) - logging.info(" ") - logging.info(" ") - logging.info("====Your Run Logs Begin===") - process, is_launch_task, error_list = self.execute_job_task(entry_file_full_path, conf_file_full_path, run_id) - logging.info("====Your Run Logs End===") - logging.info(" ") - logging.info(" ") - - ret_code, out, err = process.returncode, None, None - is_run_ok = sys_utils.is_runner_finished_normally(process.pid) - if is_launch_task: - is_run_ok = True - if error_list is not None and len(error_list) > 0: - is_run_ok = False - if ret_code is None or ret_code <= 0: - self.check_runner_stop_event() - - if is_run_ok: - if out is not None: - out_str = sys_utils.decode_our_err_result(out) - if out_str != "": - logging.info("{}".format(out_str)) - - self.mlops_metrics.report_server_id_status( - run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.edge_id) - - if is_launch_task: - sys_utils.log_return_info(f"job {run_id}", 0) - else: - sys_utils.log_return_info(entry_file, 0) - else: - is_run_ok = False - - if not is_run_ok: - # If the run status is killed or finished, then return with the normal state. 
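- # (the local job database may already hold a terminal status written when the run was stopped or finished elsewhere).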
- current_job = FedMLServerDataInterface.get_instance().get_job_by_id(run_id) - if current_job is not None and (current_job.status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED or - current_job.status == ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED): - return - - self.check_runner_stop_event() - - logging.error("failed to run the aggregation process...") - - if err is not None: - err_str = sys_utils.decode_our_err_result(err) - if err_str != "": - logging.error("{}".format(err_str)) - - if is_launch_task: - sys_utils.log_return_info(f"job {run_id}", ret_code) - else: - sys_utils.log_return_info(entry_file, ret_code) - - self.send_training_stop_request_to_edges_when_exception(edge_ids, run_id=run_id) - - def init_job_task(self, request_json): - run_id = request_json["runId"] - run_config = request_json["run_config"] - edge_ids = request_json["edgeids"] - run_params = run_config.get("parameters", {}) - job_yaml = run_params.get("job_yaml", None) - server_id = request_json["server_id"] - if self.run_as_cloud_agent: - server_id = self.edge_id - - self.setup_listeners_for_edge_status(run_id, edge_ids, server_id) - self.setup_listener_for_run_metrics(run_id) - self.setup_listener_for_run_logs(run_id) - - def should_continue_run_job(self, run_id): - run_config = self.request_json["run_config"] - run_params = run_config.get("parameters", {}) - job_yaml = run_params.get("job_yaml", {}) - job_yaml_default_none = run_params.get("job_yaml", None) - framework_type = job_yaml.get("framework_type", None) - job_type = job_yaml.get("job_type", None) - job_type = job_yaml.get("task_type", Constants.JOB_TASK_TYPE_TRAIN) if job_type is None else job_type - if job_yaml_default_none is not None: - if job_type == Constants.JOB_TASK_TYPE_FEDERATE: - return True - - if framework_type is None or framework_type != Constants.JOB_FRAMEWORK_TYPE_FEDML: - self.mlops_metrics.report_server_id_status( - run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_RUNNING, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.edge_id) - return False - - return True - - def execute_job_task(self, entry_file_full_path, conf_file_full_path, run_id): - run_config = self.request_json["run_config"] - run_params = run_config.get("parameters", {}) - job_yaml = run_params.get("job_yaml", {}) - job_yaml_default_none = run_params.get("job_yaml", None) - job_api_key = job_yaml.get("run_api_key", None) - job_api_key = job_yaml.get("fedml_run_dynamic_params", None) if job_api_key is None else job_api_key - assigned_gpu_ids = run_params.get("gpu_ids", None) - framework_type = job_yaml.get("framework_type", None) - job_type = job_yaml.get("job_type", None) - job_type = job_yaml.get("task_type", Constants.JOB_TASK_TYPE_TRAIN) if job_type is None else job_type - conf_file_object = load_yaml_config(conf_file_full_path) - entry_args_dict = conf_file_object.get("fedml_entry_args", {}) - entry_args = entry_args_dict.get("arg_items", None) - - executable_interpreter = ClientConstants.CLIENT_SHELL_PS \ - if platform.system() == ClientConstants.PLATFORM_WINDOWS else ClientConstants.CLIENT_SHELL_BASH - - if job_yaml_default_none is None: - # Generate the job executing commands for previous federated learning (Compatibility) - python_program = get_python_program() - logging.info("Run the server: {} {} --cf {} --rank 0 --role server".format( - python_program, entry_file_full_path, conf_file_full_path)) - entry_command = f"{python_program} {entry_file_full_path} --cf " \ - f"{conf_file_full_path} --rank 0 --role server" - shell_cmd_list = 
[entry_command] - - # Run the job executing commands for previous federated learning (Compatibility) - process, error_list = ClientConstants.execute_commands_with_live_logs( - shell_cmd_list, callback=self.callback_start_fl_job, should_write_log_file=False) - is_launch_task = False - else: - self.check_runner_stop_event() - - self.mlops_metrics.report_server_id_status( - run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_RUNNING, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.edge_id) - - # Generate the job executing commands - job_executing_commands = JobRunnerUtils.generate_job_execute_commands( - run_id=self.run_id, edge_id=self.edge_id, version=self.version, package_type=self.package_type, - executable_interpreter=executable_interpreter, entry_file_full_path=entry_file_full_path, - conf_file_object=conf_file_object, entry_args=entry_args, assigned_gpu_ids=assigned_gpu_ids, - job_api_key=job_api_key, client_rank=0) - - # Run the job executing commands - logging.info(f"Run the server job with job id {self.run_id}, device id {self.edge_id}.") - process, error_list = ServerConstants.execute_commands_with_live_logs( - job_executing_commands, callback=self.start_job_perf, error_processor=self.job_error_processor) - is_launch_task = True - - return process, is_launch_task, error_list - - def callback_start_fl_job(self, job_pid): - ServerConstants.save_learning_process(self.run_id, job_pid) - self.mlops_metrics.report_sys_perf( - self.args, self.agent_config["mqtt_config"], job_process_id=job_pid) - - def start_job_perf(self, job_pid): - ServerConstants.save_learning_process(self.run_id, job_pid) - self.mlops_metrics.report_job_perf(self.args, self.agent_config["mqtt_config"], job_pid) - - def job_error_processor(self, error_list): - self.check_runner_stop_event() - - error_str = "\n".join(error_list) - raise Exception(f"Error occurs when running the job... 
{error_str}") - - def process_job_status(self, run_id, edge_id, status): - number_of_failed_edges = 0 - number_of_finished_edges = 0 - number_of_killed_edges = 0 - edge_id_status_dict = self.client_agent_active_list.get(f"{run_id}", {}) - server_id = edge_id_status_dict.get("server", 0) - enable_fault_tolerance, fault_tolerance_rate = self.parse_fault_tolerance_params(run_id) - running_edges_list = list() - for edge_id_item, status_item in edge_id_status_dict.items(): - if edge_id_item == "server": - continue - - if status_item is None or status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED or \ - status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_EXCEPTION: - number_of_failed_edges += 1 - continue - - if status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED: - number_of_finished_edges += 1 - continue - - if status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED: - number_of_killed_edges += 1 - continue - - if status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE or \ - status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_OFFLINE: - continue - - running_edges_list.append(edge_id_item) - - # Report client status - edge_status = ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED if status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_EXCEPTION else status - self.mlops_metrics.report_client_training_status(edge_id, edge_status, run_id=run_id) - self.mlops_metrics.report_client_device_status_to_web_ui(edge_id, edge_status, run_id=run_id) - - # Report server status based on the fault tolerance model and parameters - edge_nums = len(edge_id_status_dict.keys()) - 1 - status_to_report = self.calculate_server_status( - run_id, edge_nums, number_of_failed_edges, number_of_finished_edges, number_of_killed_edges, - running_edges_list, enable_fault_tolerance=enable_fault_tolerance, - fault_tolerance_rate=fault_tolerance_rate) - if status_to_report is not None: - logging.info(f"Run completed when processing edge status, will report status {status_to_report}") - self.report_server_status(run_id, server_id, status_to_report) - - def calculate_server_status( - self, run_id, total_edge_nums, number_of_failed_edges, number_of_finished_edges, - number_of_killed_edges, running_edges_list, enable_fault_tolerance=False, - fault_tolerance_rate=0.8 - ): - # Report server status based on the fault tolerance model and parameters - actual_failed_rate = number_of_failed_edges / total_edge_nums - all_edges_run_completed = True if len(running_edges_list) <= 0 else False - if all_edges_run_completed: - status_to_report = None - if enable_fault_tolerance: - if actual_failed_rate >= fault_tolerance_rate: - status_to_report = ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED - self.send_training_stop_request_to_edges_when_exception( - running_edges_list, run_id=run_id, status=status_to_report) - return status_to_report - else: - if number_of_killed_edges == total_edge_nums: - status_to_report = ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED - else: - status_to_report = ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED - else: - if number_of_failed_edges > 0: - status_to_report = ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED - elif number_of_finished_edges == total_edge_nums: - status_to_report = ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED - elif number_of_killed_edges == total_edge_nums: - status_to_report = ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED - - return status_to_report - - def parse_fault_tolerance_params(self, run_id): - run_json = self.running_request_json.get(str(run_id), 
None) - if run_json is None: - run_json = self.request_json - run_config = run_json.get("run_config", {}) - run_params = run_config.get("parameters", {}) - common_args = run_params.get("common_args", {}) - enable_fault_tolerance = common_args.get("enable_fault_tolerance", False) - fault_tolerance_rate = common_args.get("fault_tolerance_rate", 0) - return enable_fault_tolerance, fault_tolerance_rate - - def report_server_status(self, run_id, server_id, status): - self.mlops_metrics.report_server_id_status(run_id, status, edge_id=self.edge_id, - server_id=server_id, server_agent_id=self.edge_id) - - def stop_run_when_starting_failed(self): - edge_id_list = self.request_json["edgeids"] - run_id = self.request_json.get("run_id", 0) - logging.error("edge ids {}".format(str(edge_id_list))) - - payload = self.running_request_json.get(str(run_id)) - if payload is not None: - self.send_training_stop_request_to_edges(edge_id_list, payload=json.dumps(payload), run_id=run_id) - - # logging.info("Stop run successfully when starting failed.") - - self.mlops_metrics.report_server_id_status( - self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.edge_id) - - def cleanup_run_when_finished(self, should_send_server_id_status=True): - # logging.info("Cleanup run successfully when finished.") - - self.mlops_metrics.report_server_training_status( - self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED, edge_id=self.edge_id - ) - - if should_send_server_id_status: - self.mlops_metrics.report_server_id_status( - self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.edge_id) - - try: - self.mlops_metrics.stop_sys_perf() - except Exception as ex: - pass - - time.sleep(1) - - ServerConstants.cleanup_learning_process(self.run_id) - ServerConstants.cleanup_bootstrap_process(self.run_id) - - try: - local_package_path = ServerConstants.get_package_download_dir() - for package_file in listdir(local_package_path): - if os.path.basename(package_file).startswith("run_" + str(self.run_id)): - shutil.rmtree(os.path.join(local_package_path, package_file), ignore_errors=True) - except Exception as e: - pass - - def cleanup_run_when_starting_failed( - self, status=ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, should_send_server_id_status=True): - # logging.info("Cleanup run successfully when starting failed.") - - self.mlops_metrics.report_server_training_status( - self.run_id, status, edge_id=self.edge_id) - - if should_send_server_id_status: - self.mlops_metrics.report_server_id_status( - self.run_id, status, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.edge_id) - - try: - self.mlops_metrics.stop_sys_perf() - except Exception as ex: - pass - - time.sleep(1) - - ServerConstants.cleanup_learning_process(self.run_id) - ServerConstants.cleanup_bootstrap_process(self.run_id) - - try: - local_package_path = ServerConstants.get_package_download_dir() - for package_file in listdir(local_package_path): - if os.path.basename(package_file).startswith("run_" + str(self.run_id)): - shutil.rmtree(os.path.join(local_package_path, package_file), ignore_errors=True) - except Exception as e: - pass - - def should_process_async_cluster(self): - run_config = self.request_json.get("run_config", {}) - run_params = run_config.get("parameters", {}) - common_args = run_params.get("common_args", {}) - self.enable_async_cluster = 
common_args.get("enable_async_cluster", False) - self.async_check_timeout = common_args.get("async_check_timeout", 0) - if self.enable_async_cluster: - return True, self.async_check_timeout - - return False, self.async_check_timeout - - @debug - def detect_edges_status( - self, edge_device_info_queue, edge_device_info_global_queue=None, callback_when_edges_ready=None, status_timeout=None, - need_to_trigger_exception=True, status_check_context=None, given_edge_ids=None, - callback_when_detecting=None, args_for_callback_when_detecting=None - ): - run_id = self.request_json["runId"] - run_id_str = str(run_id) - edge_id_list = self.request_json["edgeids"] - if given_edge_ids is not None: - edge_id_list = given_edge_ids - - # Init realtime status of all edges - run_edges_realtime_status = dict() - run_edges_realtime_status[run_id_str] = dict() - - edge_info_global_dict = dict() - if edge_device_info_global_queue is not None: - for edge_info_global in edge_device_info_global_queue: - edge_info_id = edge_info_global.get("edge_id") - edge_info_global_dict[edge_info_id] = edge_info_global - - # Send status message to all edges - allowed_cache_edge_status_time = 60 - for edge_id in edge_id_list: - # Check if the edge status was filled allowed_cache_edge_status_time seconds ago, - # if so no more checking message would be sent. - edge_info = edge_info_global_dict.get(edge_id, None) - if edge_info is not None: - timestamp = edge_info.get("timestamp", None) - time_interval = time.time() - timestamp - if time_interval <= allowed_cache_edge_status_time: - continue - - self.send_status_check_msg(run_id, edge_id, self.edge_id, context=status_check_context) - time.sleep(3) - - total_sleep_seconds = 0 - status_check_sleep_seconds = 10 - allowed_status_check_sleep_seconds = 60 * 2 if status_timeout is None else status_timeout - allowed_status_check_sleep_seconds_for_async = 30 - inactivate_edges = list() - active_edge_info_dict = dict() - log_active_edge_info_flag = True - while True: - if callback_when_detecting is not None: - callback_when_detecting(args_for_callback_when_detecting) - - # Fetch edge info from the edge status queue, which will be added to realtime status map - while True: - self.check_runner_stop_event() - - try: - edge_info = edge_device_info_queue.get(block=False, timeout=1) - if edge_info is not None: - edge_id = edge_info.get("edge_id", None) - if edge_id is not None: - run_edges_realtime_status[run_id_str][edge_id] = edge_info - except queue.Empty as e: # If queue is empty, then break loop - break - - self.check_runner_stop_event() - - # Check all edges which don't send response status successfully - # and retry to send the status checking message. - active_edges_count = 0 - inactivate_edges.clear() - active_edge_info_dict.clear() - for edge_id in edge_id_list: - edge_info_dict = run_edges_realtime_status.get(run_id_str, {}) - edge_info = edge_info_dict.get(edge_id, None) - edge_info = edge_info_dict.get(str(edge_id), None) if edge_info is None else edge_info - if edge_info is not None: - active_edges_count += 1 - active_edge_info_dict[str(edge_id)] = edge_info - else: - # Check if the edge status was filled allowed_cache_edge_status_time seconds ago, - # if so no more checking message would be sent. 
- edge_info = edge_info_global_dict.get(edge_id, None) - if edge_info is not None: - timestamp = edge_info.get("timestamp", None) - time_interval = time.time() - timestamp - if time_interval <= allowed_cache_edge_status_time: - active_edges_count += 1 - active_edge_info_dict[str(edge_id)] = edge_info - continue - - inactivate_edges.append(edge_id) - self.send_status_check_msg(run_id, edge_id, self.edge_id, context=status_check_context) - - # If all edges are ready then send the starting job message to them - if active_edges_count == len(edge_id_list): - if log_active_edge_info_flag: - logging.debug(f"All edges are ready. Active edge id list is as follows. {active_edge_info_dict}") - log_active_edge_info_flag = False - if callback_when_edges_ready is not None: - logging.info("All edges are ready. Start to process the callback function.") - callback_when_edges_ready(active_edge_info_dict=active_edge_info_dict) - else: - logging.debug("All edges are ready. No callback function to process.") - break - else: - logging.info(f"All edges are not ready. Active edge id list: {active_edge_info_dict}, " - f"Inactive edge id list: {inactivate_edges}") - log_active_edge_info_flag = True - - # Check if runner needs to stop and sleep specific time - self.check_runner_stop_event() - time.sleep(status_check_sleep_seconds) - total_sleep_seconds += status_check_sleep_seconds - - # Check if the status response message has timed out to receive - if total_sleep_seconds >= allowed_status_check_sleep_seconds: - # If so, send failed message to MLOps and send exception message to all edges. - logging.error(f"There are inactive edge devices. " - f"Inactivate edge id list is as follows. {inactivate_edges}") - if need_to_trigger_exception: - self.mlops_metrics.report_server_id_status( - run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.server_agent_id) - self.send_training_stop_request_to_edges_when_exception(edge_id_list, - payload=json.dumps(self.request_json), - run_id=run_id) - return False, active_edge_info_dict, inactivate_edges - - # If we enable the mode for async cluster, then sleep some time and send messages to all clients. 
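- # Once total_sleep_seconds passes the async threshold, the server optionally waits out async_check_timeout, dispatches the training request, and returns early instead of blocking on the remaining edges.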
- if callback_when_edges_ready is not None: - should_async, async_timeout = self.should_process_async_cluster() - if should_async and total_sleep_seconds >= allowed_status_check_sleep_seconds_for_async: - if async_timeout > allowed_status_check_sleep_seconds_for_async: - time.sleep(async_timeout - allowed_status_check_sleep_seconds_for_async) - self.send_training_request_to_edges() - return True, active_edge_info_dict, inactivate_edges - - return True, active_edge_info_dict, inactivate_edges - - def send_status_check_msg(self, run_id, edge_id, server_id, context=None): - topic_get_model_device_id = "server/client/request_device_info/" + str(edge_id) - payload = {"server_id": server_id, "run_id": run_id} - if context is not None: - payload["context"] = context - self.message_center.send_message(topic_get_model_device_id, json.dumps(payload)) - - @debug - def send_training_request_to_edges(self, active_edge_info_dict=None): - run_id = self.request_json["runId"] - edge_id_list = self.request_json["edgeids"] - run_config = self.request_json.get("run_config", {}) - run_params = run_config.get("parameters", {}) - job_yaml = run_params.get("job_yaml", {}) - job_yaml_default_none = run_params.get("job_yaml", None) - computing = job_yaml.get("computing", {}) - request_num_gpus = computing.get("minimum_num_gpus", None) - job_gpu_id_list = self.request_json.get("job_gpu_id_list", None) - - logging.info("Send training request to Edge ids: " + str(edge_id_list)) - - should_match_gpu = False - if job_yaml_default_none is not None and request_num_gpus is not None and \ - int(request_num_gpus) > 0 and active_edge_info_dict is not None: - should_match_gpu = True - SchedulerMatcher.parse_and_print_gpu_info_for_all_edges(active_edge_info_dict, show_gpu_list=True) - - # Match and assign gpus to each device - assigned_gpu_num_dict, assigned_gpu_ids_dict = SchedulerMatcher.match_and_assign_gpu_resources_to_devices( - request_num_gpus, edge_id_list, active_edge_info_dict, job_gpu_id_list=job_gpu_id_list) - if assigned_gpu_num_dict is None or assigned_gpu_ids_dict is None: - # If no resources available, send failed message to MLOps and send exception message to all edges. - gpu_count, gpu_available_count = SchedulerMatcher.parse_and_print_gpu_info_for_all_edges( - active_edge_info_dict, should_print=True) - err_info = f"No resources available." \ - f"Total available GPU count {gpu_available_count} is less than " \ - f"request GPU count {request_num_gpus}" - logging.error(err_info) - - # Bug fix: This mqtt message needs to be sent so platform can clean up the failed run and change the - # status from running to failed. 
- - # Bug fix: this MQTT message needs to be sent so the platform can clean up the failed run and - # change its status from running to failed. - self.mlops_metrics.report_server_training_status( - run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id - ) - - self.mlops_metrics.report_server_id_status( - run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.server_agent_id) - self.send_training_stop_request_to_edges_when_exception(edge_id_list, - payload=json.dumps(self.request_json), - run_id=run_id) - - serving_args = job_yaml.get("serving_args", {}) - endpoint_id = serving_args.get("endpoint_id", None) - if endpoint_id is not None: - fedml.mlops.log_endpoint_status( - endpoint_id, device_client_constants.ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED) - fedml.mlops.log_run_log_lines( - endpoint_id, 0, [err_info], - log_source=device_client_constants.ClientConstants.FEDML_LOG_SOURCE_TYPE_MODEL_END_POINT - ) - return - - # Generate the master node addr and port - master_node_addr, master_node_port = SchedulerMatcher.get_master_node_info(edge_id_list, - active_edge_info_dict) - - # Generate the new edge id list after matching - edge_id_list = SchedulerMatcher.generate_new_edge_list_for_gpu_matching(assigned_gpu_num_dict) - if len(edge_id_list) <= 0: - gpu_count, gpu_available_count = SchedulerMatcher.parse_and_print_gpu_info_for_all_edges( - active_edge_info_dict, should_print=True) - logging.error(f"Request parameter for GPU num is invalid. " - f"Total available GPU count: {gpu_available_count}. " - f"Requested GPU num: {request_num_gpus}.") - self.mlops_metrics.report_server_id_status( - run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.server_agent_id) - self.send_training_stop_request_to_edges_when_exception(edge_id_list, - payload=json.dumps(self.request_json), - run_id=run_id) - return - - if should_match_gpu: - # Report the matched GPU numbers and related info to MLOps.
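Both failure branches above follow the same pairing: report a FAILED server status to MLOps, then broadcast a stop request so every edge tears down. Condensed for reference, with the status string and the `send` callable standing in for the real `ServerConstants` value and message center:

```python
import json


def fail_run(metrics, run_id, server_id, edge_ids, request_json, send):
    # Flip the run out of "running" on the platform side.
    metrics.report_server_id_status(
        run_id, "FAILED", edge_id=server_id,
        server_id=server_id, server_agent_id=server_id)
    # Then ask every edge to stop; the payload echoes the original request.
    for edge_id in edge_ids:
        send("flserver_agent/" + str(edge_id) + "/stop_train",
             json.dumps(request_json))
```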
- serving_args = job_yaml.get("serving_args", {}) - endpoint_id = serving_args.get("endpoint_id", None) - if endpoint_id is not None: - endpoint_info = list() - for edge_id_item, gpu_num in assigned_gpu_num_dict.items(): - edge_info = active_edge_info_dict.get(str(edge_id_item), {}) - endpoint_info.append({ - "machine_id": edge_id_item, "endpoint_gpu_count": gpu_num, - "master_deploy_id": edge_info.get("master_device_id", 0), - "slave_deploy_id": edge_info.get("slave_device_id", 0)}) - topic_name = f"compute/mlops/endpoint" - endpoint_info_json = {"endpoint_id": endpoint_id, "endpoint_info": endpoint_info} - print(f"endpoint_info_json {endpoint_info_json}") - self.message_center.send_message(topic_name, json.dumps(endpoint_info_json)) - - client_rank = 1 - for edge_id in edge_id_list: - topic_start_train = "flserver_agent/" + str(edge_id) + "/start_train" - logging.info("start_train: send topic " + topic_start_train + " to client...") - request_json = self.request_json - request_json["client_rank"] = client_rank - client_rank += 1 - - if active_edge_info_dict is not None: - edge_info = active_edge_info_dict.get(str(edge_id), {}) - model_master_device_id = edge_info.get("master_device_id", None) - model_slave_device_id = edge_info.get("slave_device_id", None) - model_slave_device_id_list = edge_info.get("slave_device_id_list", None) - - if should_match_gpu: - request_json["scheduler_match_info"] = SchedulerMatcher.generate_match_info_for_scheduler( - edge_id, edge_id_list, master_node_addr, master_node_port, - assigned_gpu_num_dict, assigned_gpu_ids_dict, - model_master_device_id=model_master_device_id, - model_slave_device_id=model_slave_device_id, - model_slave_device_id_list=model_slave_device_id_list - ) - - self.message_center.send_message(topic_start_train, json.dumps(request_json)) - - def setup_listeners_for_edge_status(self, run_id, edge_ids, server_id): - self.client_agent_active_list[f"{run_id}"] = dict() - self.client_agent_active_list[f"{run_id}"][f"server"] = server_id - for edge_id in edge_ids: - self.client_agent_active_list[f"{run_id}"][f"{edge_id}"] = ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE - edge_status_topic = "fl_client/flclient_agent_" + str(edge_id) + "/status" - self.add_message_listener(edge_status_topic, self.callback_edge_status) - self.subscribe_msg(edge_status_topic) - - def remove_listeners_for_edge_status(self, edge_ids=None): - if edge_ids is None: - edge_ids = self.request_json["edgeids"] - - for edge_id in edge_ids: - edge_status_topic = "fl_client/flclient_agent_" + str(edge_id) + "/status" - self.unsubscribe_msg(edge_status_topic) - - def setup_listener_for_run_metrics(self, run_id): - metric_topic = f"fedml_slave/fedml_master/metrics/{run_id}" - self.add_message_listener(metric_topic, self.callback_run_metrics) - self.subscribe_msg(metric_topic) - - def remove_listener_for_run_metrics(self, run_id): - metric_topic = f"fedml_slave/fedml_master/metrics/{run_id}" - self.unsubscribe_msg(metric_topic) - - def setup_listener_for_run_logs(self, run_id): - logs_topic = f"fedml_slave/fedml_master/logs/{run_id}" - self.add_message_listener(logs_topic, self.callback_run_logs) - self.subscribe_msg(logs_topic) - - def remove_listener_for_run_logs(self, run_id): - logs_topic = f"fedml_slave/fedml_master/logs/{run_id}" - self.unsubscribe_msg(logs_topic) - - def callback_run_logs(self, topic, payload): - run_id = str(topic).split('/')[-1] - run_id_str = str(run_id) - if self.run_logs_queue_map.get(run_id_str) is None: - self.run_logs_queue_map[run_id_str] = Queue() 
- self.run_logs_queue_map[run_id_str].put(payload) - - def callback_run_metrics(self, topic, payload): - print(f"callback_run_metrics topic {topic}, payload {payload}") - run_id = str(topic).split('/')[-1] - run_id_str = str(run_id) - if self.run_metrics_queue_map.get(run_id_str) is None: - self.run_metrics_queue_map[run_id_str] = Queue() - self.run_metrics_queue_map[run_id_str].put(payload) - - def callback_edge_status(self, topic, payload): - payload_json = json.loads(payload) - run_id = payload_json.get("run_id", None) - edge_id = payload_json.get("edge_id", None) - status = payload_json.get("status", None) - if run_id is not None and edge_id is not None: - active_item_dict = self.client_agent_active_list.get(f"{run_id}", None) - if active_item_dict is None: - return - self.client_agent_active_list[f"{run_id}"][f"{edge_id}"] = status - - if self.run_edge_id_status_queue_map.get(f"{run_id}") is None: - self.run_edge_id_status_queue_map[f"{run_id}"] = Queue() - self.run_edge_id_status_queue_map[f"{run_id}"].put(self.client_agent_active_list[f"{run_id}"]) - - self.process_job_status(run_id, edge_id, status) - - def ota_upgrade(self, payload, request_json): - run_id = request_json["runId"] - force_ota = False - ota_version = None - - try: - run_config = request_json.get("run_config", None) - parameters = run_config.get("parameters", None) - common_args = parameters.get("common_args", None) - force_ota = common_args.get("force_ota", False) - ota_version = common_args.get("ota_version", None) - except Exception as e: - pass - - if force_ota and ota_version is not None: - should_upgrade = True if ota_version != fedml.__version__ else False - upgrade_version = ota_version - else: - try: - fedml_is_latest_version, local_ver, remote_ver = sys_utils.check_fedml_is_latest_version(self.version) - except Exception as e: - return - - should_upgrade = False if fedml_is_latest_version else True - upgrade_version = remote_ver - - if should_upgrade: - job_obj = FedMLServerDataInterface.get_instance().get_job_by_id(run_id) - if job_obj is None: - FedMLServerDataInterface.get_instance(). 
\ - save_started_job(run_id, self.edge_id, time.time(), - ServerConstants.MSG_MLOPS_SERVER_STATUS_UPGRADING, - ServerConstants.MSG_MLOPS_SERVER_STATUS_UPGRADING, - payload) - - self.mlops_metrics.report_server_id_status( - run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_UPGRADING, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.edge_id) - logging.info(f"Upgrade to version {upgrade_version} ...") - - sys_utils.do_upgrade(self.version, upgrade_version) - - raise Exception("Restarting after upgraded...") - - def callback_start_train(self, topic=None, payload=None): - print("callback_start_train: ") - try: - MLOpsConfigs.fetch_all_configs() - except Exception as e: - pass - - # [NOTES] Example Request JSON: https://fedml-inc.larksuite.com/wiki/ScnIwUif9iupbjkYS0LuBrd6sod#WjbEdhYrvogmlGxKTOGu98C6sSb - request_json = json.loads(payload) - is_retain = request_json.get("is_retain", False) - if is_retain: - return - - # Process the log - run_id = request_json["runId"] - run_id_str = str(run_id) - if self.run_as_edge_server_and_agent or self.enable_simulation_cloud_agent: - # Start log processor for current run - self.args.run_id = run_id - self.args.edge_id = self.edge_id - MLOpsRuntimeLog.get_instance(self.args).init_logs(log_level=logging.INFO) - MLOpsRuntimeLogDaemon.get_instance(self.args).start_log_processor( - run_id, self.edge_id, SchedulerConstants.get_log_source(request_json)) - logging.info("start the log processor.") - elif self.run_as_cloud_agent: - # Start log processor for current run - MLOpsRuntimeLogDaemon.get_instance(self.args).start_log_processor( - run_id, request_json.get("server_id", "0"), SchedulerConstants.get_log_source(request_json) - ) - elif self.run_as_cloud_server: - self.server_agent_id = request_json.get("cloud_agent_id", self.edge_id) - run_id = request_json["runId"] - run_id_str = str(run_id) - - # Start log processor for current run - self.args.run_id = run_id - MLOpsRuntimeLogDaemon.get_instance(self.args).start_log_processor( - run_id, self.edge_id, SchedulerConstants.get_log_source(request_json)) - - logging.info("callback_start_train payload: {}".format(payload)) - logging.info( - f"FedMLDebug - Receive: topic ({topic}), payload ({payload})" - ) - - # if not self.run_as_cloud_agent and not self.run_as_cloud_server: - # self.ota_upgrade(payload, request_json) - - # report server running status - if not self.run_as_cloud_server: - self.mlops_metrics.report_server_id_status( - run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_STARTING, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.edge_id) - - self.start_request_json = payload - self.run_id = run_id - ServerConstants.save_runner_infos(self.args.device_id + "." 
+ self.args.os_name, self.edge_id, run_id=run_id) - - # Start server with multiprocessing mode - self.request_json = request_json - self.running_request_json[run_id_str] = request_json - edge_id_list = request_json.get("edgeids", list()) - self.run_edge_ids[run_id_str] = edge_id_list - - logging.info("subscribe the client exception message.") - - if self.run_as_edge_server_and_agent or self.enable_simulation_cloud_agent: - self.init_job_task(request_json) - - self.args.run_id = run_id - - self._start_runner_process(run_id, request_json) - - ServerConstants.save_run_process(run_id, self.run_process_map[run_id_str].pid) - elif self.run_as_cloud_agent: - self.init_job_task(request_json) - - server_runner = FedMLServerRunner( - self.args, run_id=run_id, request_json=request_json, agent_config=self.agent_config - ) - server_runner.run_as_cloud_agent = self.run_as_cloud_agent - server_runner.start_request_json = json.dumps(request_json) - self.run_process_event_map[run_id_str] = multiprocessing.Event() - self.run_process_event_map[run_id_str].clear() - server_runner.run_process_event = self.run_process_event_map[run_id_str] - - if not self.use_local_process_as_cloud_server: - self.run_process_map[run_id_str] = Process(target=server_runner.start_cloud_server_process_entry) - self.run_process_map[run_id_str].start() - else: - message_bytes = json.dumps(self.request_json).encode("ascii") - base64_bytes = base64.b64encode(message_bytes) - runner_cmd_encoded = base64_bytes.decode("ascii") - logging.info("runner_cmd_encoded: {}".format(runner_cmd_encoded)) - - cloud_device_id = request_json.get("cloudServerDeviceId", "0") - - self.run_process_map[run_id_str] = Process( - target=FedMLServerRunner.start_local_cloud_server, - args=(run_id_str, self.args.user, self.version, cloud_device_id, runner_cmd_encoded)) - self.run_process_map[run_id_str].start() - time.sleep(1) - - ServerConstants.save_run_process(run_id, self.run_process_map[run_id_str].pid) - elif self.run_as_cloud_server: - self.server_agent_id = request_json.get("cloud_agent_id", self.edge_id) - self.start_request_json = json.dumps(request_json) - run_id = request_json["runId"] - run_id_str = str(run_id) - - self.init_job_task(request_json) - - self.args.run_id = run_id - - self._start_runner_process(run_id, request_json) - # ServerConstants.save_run_process(run_id, self.run_process_map[run_id_str].pid) - - @staticmethod - def start_local_cloud_server(run_id, user, version, cloud_device_id, runner_cmd_encoded): - print(f"start cloud server, device id {cloud_device_id}, runner cmd {runner_cmd_encoded}") - if not FedMLServerRunner.debug_cloud_server: - pip_source_dir = os.path.dirname(__file__) - login_cmd = os.path.join(pip_source_dir, "server_login.py") - run_cmd = f"{get_python_program()} -W ignore {login_cmd} -t login -r cloud_server -u {str(user)} " \ - f"-v {version} -id {cloud_device_id} -rc {runner_cmd_encoded}" - os.system(run_cmd) - - def _start_runner_process(self, run_id, request_json, is_server_job=False): - server_runner = FedMLServerRunner( - self.args, run_id=run_id, request_json=request_json, agent_config=self.agent_config - ) - run_id_str = str(run_id) - server_runner.run_as_edge_server_and_agent = self.run_as_edge_server_and_agent - server_runner.edge_id = self.edge_id - server_runner.server_agent_id = self.server_agent_id - server_runner.start_request_json = json.dumps(request_json) - self.run_process_event_map[run_id_str] = multiprocessing.Event() - self.run_process_event_map[run_id_str].clear() - 
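Everything from here on wires per-run primitives: each run gets its own `multiprocessing.Event` and `Queue`, keyed by the stringified run id, and they are handed to the child process as arguments at spawn time, since multiprocessing primitives cannot be looked up after the fork. A stripped-down sketch of that wiring (the `worker` body is illustrative only):

```python
import multiprocessing
import queue
from multiprocessing import Process, Queue


def worker(stop_event, inbox):
    # Loop until the parent sets the stop event, consuming queued messages.
    while not stop_event.is_set():
        try:
            msg = inbox.get(timeout=1)
        except queue.Empty:
            continue
        print("worker got:", msg)


run_events, run_queues, run_procs = {}, {}, {}


def start_run(run_id):
    key = str(run_id)
    run_events[key] = multiprocessing.Event()
    run_queues[key] = Queue()
    proc = Process(target=worker, args=(run_events[key], run_queues[key]))
    proc.start()
    run_procs[key] = proc  # the pid is saved so the run can be killed later
```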
server_runner.run_process_event = self.run_process_event_map[run_id_str] - self.run_process_completed_event_map[run_id_str] = multiprocessing.Event() - self.run_process_completed_event_map[run_id_str].clear() - server_runner.run_process_completed_event = self.run_process_completed_event_map[run_id_str] - if self.run_edge_id_status_queue_map.get(run_id_str, None) is None: - self.run_edge_id_status_queue_map[run_id_str] = Queue() - if self.run_edge_device_info_queue_map.get(run_id_str, None) is None: - self.run_edge_device_info_queue_map[run_id_str] = Queue() - if self.run_metrics_queue_map.get(run_id_str, None) is None: - self.run_metrics_queue_map[run_id_str] = Queue() - if self.run_events_queue_map.get(run_id_str, None) is None: - self.run_events_queue_map[run_id_str] = Queue() - if self.run_artifacts_queue_map.get(run_id_str, None) is None: - self.run_artifacts_queue_map[run_id_str] = Queue() - if self.run_logs_queue_map.get(run_id_str, None) is None: - self.run_logs_queue_map[run_id_str] = Queue() - # if self.run_edge_device_info_global_queue is None: - # self.run_edge_device_info_global_queue = Array('i', list()) - server_runner.edge_id_status_queue = self.run_edge_id_status_queue_map[run_id_str] - server_runner.edge_device_info_queue = self.run_edge_device_info_queue_map[run_id_str] - self.run_process_map[run_id_str] = Process( - target=server_runner.run if not is_server_job else server_runner.run_server_job_impl, args=( - self.run_process_event_map[run_id_str], self.run_process_completed_event_map[run_id_str], - self.run_edge_id_status_queue_map[run_id_str], self.run_edge_device_info_queue_map[run_id_str], - self.run_metrics_queue_map[run_id_str], self.run_events_queue_map[run_id_str], - self.run_artifacts_queue_map[run_id_str], self.run_logs_queue_map[run_id_str], - self.message_center.get_message_queue(), - self.run_edge_device_info_global_queue - ) - ) - self.run_process_map[run_id_str].start() - ServerConstants.save_run_process(run_id, self.run_process_map[run_id_str].pid) - - def start_cloud_server_process_entry(self): - try: - self.start_cloud_server_process() - except Exception as e: - pass - - def start_cloud_server_process(self): - run_config = self.request_json["run_config"] - packages_config = run_config["packages_config"] - self.start_cloud_server(packages_config) - - def start_cloud_server(self, packages_config): - server_id = self.request_json["server_id"] - self.cloud_server_name = FedMLServerRunner.FEDML_CLOUD_SERVER_PREFIX + str(self.run_id) + "-" + str(server_id) - self.server_docker_image = ( - self.agent_config["docker_config"]["registry_server"] - + self.agent_config["docker_config"]["registry_dir"] - + self.server_docker_base_image - ) - - logging.info("docker image {}".format(self.server_docker_image)) - # logging.info("file_sys_driver {}".format(self.agent_config["docker_config"]["file_sys_driver"])) - - registry_secret_cmd = ( - "kubectl create namespace fedml-devops-aggregator-" - + self.version - + ";kubectl -n fedml-devops-aggregator-" - + self.version - + " delete secret secret-" - + self.cloud_server_name - + " ;kubectl create secret docker-registry secret-" - + self.cloud_server_name - + " --docker-server=" - + self.agent_config["docker_config"]["registry_server"] - + " --docker-username=" - + self.agent_config["docker_config"]["user_name"] - + " --docker-password=$(aws ecr-public get-login-password --region " - + self.agent_config["docker_config"]["public_cloud_region"] - + ")" - + " --docker-email=fedml@fedml.ai -n fedml-devops-aggregator-" - + 
self.version - ) - logging.info("Create secret cmd: " + registry_secret_cmd) - os.system(registry_secret_cmd) - - message_bytes = json.dumps(self.request_json).encode("ascii") - base64_bytes = base64.b64encode(message_bytes) - runner_cmd_encoded = base64_bytes.decode("ascii") - logging.info("runner_cmd_encoded: {}".format(runner_cmd_encoded)) - # logging.info("runner_cmd_decoded: {}".format(base64.b64decode(runner_cmd_encoded).decode())) - cur_dir = os.path.dirname(__file__) - run_deployment_cmd = ( - "export FEDML_AGGREGATOR_NAME=" - + self.cloud_server_name - + ";export FEDML_AGGREGATOR_SVC=" - + self.cloud_server_name - + ";export FEDML_AGGREGATOR_VERSION=" - + self.version - + ';export FEDML_AGGREGATOR_IMAGE_PATH="' - + self.server_docker_image - + '"' - + ";export FEDML_CONF_ID=" - + self.cloud_server_name - + ";export FEDML_DATA_PV_ID=" - + self.cloud_server_name - + ";export FEDML_DATA_PVC_ID=" - + self.cloud_server_name - + ";export FEDML_REGISTRY_SECRET_SUFFIX=" - + self.cloud_server_name - + ";export FEDML_ACCOUNT_ID=0" - + ";export FEDML_SERVER_DEVICE_ID=" - + self.request_json.get("cloudServerDeviceId", "0") - + ";export FEDML_VERSION=" - + self.version - + ";export FEDML_PACKAGE_NAME=" - + packages_config.get("server", "") - + ";export FEDML_PACKAGE_URL=" - + packages_config.get("serverUrl", "") - + ";export FEDML_RUNNER_CMD=" - + runner_cmd_encoded - + ";envsubst < " - + os.path.join(cur_dir, "templates", "fedml-server-deployment.yaml") - + " | kubectl apply -f - " - ) - logging.info("FedMLServerRunner.run with k8s: " + run_deployment_cmd) - os.system(run_deployment_cmd) - - def stop_cloud_server(self): - self.cloud_server_name = FedMLServerRunner.FEDML_CLOUD_SERVER_PREFIX + str(self.run_id) \ - + "-" + str(self.edge_id) - self.server_docker_image = ( - self.agent_config["docker_config"]["registry_server"] - + self.agent_config["docker_config"]["registry_dir"] - + self.server_docker_base_image - ) - delete_deployment_cmd = ( - "export FEDML_AGGREGATOR_NAME=" - + self.cloud_server_name - + ";export FEDML_AGGREGATOR_SVC=" - + self.cloud_server_name - + ";export FEDML_AGGREGATOR_VERSION=" - + self.version - + ';export FEDML_AGGREGATOR_IMAGE_PATH="' - + self.server_docker_image - + '"' - + ";export FEDML_CONF_ID=" - + self.cloud_server_name - + ";export FEDML_DATA_PV_ID=" - + self.cloud_server_name - + ";export FEDML_DATA_PVC_ID=" - + self.cloud_server_name - + ";export FEDML_REGISTRY_SECRET_SUFFIX=" - + self.cloud_server_name - + ";kubectl -n fedml-devops-aggregator-" - + self.version - + " delete deployment " - + self.cloud_server_name - + ";kubectl -n fedml-devops-aggregator-" - + self.version - + " delete svc " - + self.cloud_server_name - + ";kubectl -n fedml-devops-aggregator-" - + self.version - + " delete secret secret-" - + self.cloud_server_name - ) - logging.info("FedMLServerRunner.stop_run with k8s: " + delete_deployment_cmd) - os.system(delete_deployment_cmd) - - def setup_message_center(self): - if self.message_center is not None: - return - - self.message_center = FedMLMessageCenter(agent_config=self.agent_config) - self.message_center.start_sender() - - if self.mlops_metrics is None: - self.mlops_metrics = MLOpsMetrics() - self.mlops_metrics.set_messenger(self.message_center) - self.mlops_metrics.run_id = self.run_id - self.mlops_metrics.edge_id = self.edge_id - self.mlops_metrics.server_agent_id = self.server_agent_id - - def rebuild_message_center(self, message_center_queue): - self.message_center = FedMLMessageCenter(message_queue=message_center_queue) - - if 
self.mlops_metrics is None: - self.mlops_metrics = MLOpsMetrics() - self.mlops_metrics.set_messenger(self.message_center) - self.mlops_metrics.run_id = self.run_id - self.mlops_metrics.edge_id = self.edge_id - self.mlops_metrics.server_agent_id = self.server_agent_id - - def release_message_center(self): - try: - if self.message_center is not None: - self.message_center.stop() - self.message_center = None - - except Exception as e: - logging.error( - f"Failed to release client mqtt manager with Exception {e}. Traceback: {traceback.format_exc()}") - pass - - def send_training_stop_request_to_edges( - self, edge_id_list, payload=None, run_id=0): - if payload is None: - payload_obj = {"runId": run_id, "edgeids": edge_id_list} - else: - payload_obj = json.loads(payload) - - for edge_id in edge_id_list: - topic_stop_train = "flserver_agent/" + str(edge_id) + "/stop_train" - logging.info("stop_train: send topic " + topic_stop_train) - self.message_center.send_message(topic_stop_train, json.dumps(payload_obj)) - - def send_training_stop_request_to_specific_edge(self, edge_id, payload): - topic_stop_train = "flserver_agent/" + str(edge_id) + "/stop_train" - logging.info("stop_train: send topic " + topic_stop_train) - self.message_center.send_message(topic_stop_train, payload) - - def send_training_stop_request_to_cloud_server(self, edge_id, payload): - topic_stop_train = "mlops/flserver_agent_" + str(edge_id) + "/stop_train" - logging.info("stop_train: send topic " + topic_stop_train) - self.message_center.send_message(topic_stop_train, payload) - - def send_training_stop_request_to_edges_when_exception( - self, edge_id_list, payload=None, run_id=0, server_id=None, status=None): - if payload is None: - payload_obj = {"runId": run_id, "edgeids": edge_id_list} - if server_id is not None: - payload_obj["serverId"] = server_id - else: - payload_obj = json.loads(payload) - payload_obj["run_status"] = ClientConstants.MSG_MLOPS_CLIENT_STATUS_EXCEPTION if status is None else status - topic_stop_train = "flserver_agent/" + str(self.edge_id) + "/stop_train" - self.callback_stop_train(topic_stop_train, json.dumps(payload_obj), use_payload=payload_obj) - - def callback_stop_train(self, topic, payload, use_payload=None): - # logging.info("callback_stop_train: topic = %s, payload = %s" % (topic, payload)) - logging.info( - f"FedMLDebug - Receive: topic ({topic}), payload ({payload})" - ) - - request_json = json.loads(payload) - is_retain = request_json.get("is_retain", False) - if is_retain: - return - run_id = request_json.get("runId", None) - if run_id is None: - run_id = request_json.get("id", None) - - edge_id_list = request_json["edgeids"] - server_id = request_json.get("serverId", None) - if server_id is None: - server_id = request_json.get("server_id", None) - - if run_id is None or server_id is None: - logging.info("Json format is not correct!") - return - - # logging.info("Stop run with multiprocessing.") - - # Stop server with multiprocessing mode - run_id_str = str(run_id) - stop_request_json = self.running_request_json.get(run_id_str, None) - if stop_request_json is None: - stop_request_json = request_json - if use_payload is not None: - stop_request_json = use_payload - - if self.run_process_event_map.get(run_id_str) is not None: - self.run_process_event_map.get(run_id_str).set() - - if self.run_as_edge_server_and_agent or self.enable_simulation_cloud_agent: - server_runner = FedMLServerRunner( - self.args, run_id=run_id, request_json=stop_request_json, agent_config=self.agent_config, - 
edge_id=self.edge_id - ) - server_runner.run_as_edge_server_and_agent = self.run_as_edge_server_and_agent - self.run_process_event_map_for_stop[run_id_str] = multiprocessing.Event() - if self.run_edge_id_status_queue_map.get(run_id_str, None) is None: - self.run_edge_id_status_queue_map[run_id_str] = Queue() - if self.run_edge_device_info_queue_map_for_stop.get(run_id_str, None) is None: - self.run_edge_device_info_queue_map_for_stop[run_id_str] = Queue() - # if self.run_edge_device_info_global_queue_for_stop is None: - # self.run_edge_device_info_global_queue_for_stop = Array('i', list()) - - self.run_stop_process_map[run_id_str] = Process( - target=server_runner.run_stop, args=( - self.run_process_event_map_for_stop[run_id_str], - self.run_edge_id_status_queue_map[run_id_str], - self.run_edge_device_info_queue_map_for_stop[run_id_str], - self.run_edge_device_info_global_queue_for_stop, - self.message_center.get_message_queue(), - ) - ) - self.run_stop_process_map[run_id_str].start() - elif self.run_as_cloud_agent: - self.send_training_stop_request_to_cloud_server(server_id, payload) - return - elif self.run_as_cloud_server: - # if not self.use_local_process_as_cloud_server: - server_runner = FedMLServerRunner( - self.args, run_id=run_id, request_json=stop_request_json, agent_config=self.agent_config, - edge_id=server_id - ) - server_runner.run_as_cloud_agent = self.run_as_cloud_agent - self.run_process_event_map_for_stop[run_id_str] = multiprocessing.Event() - if self.run_edge_id_status_queue_map.get(run_id_str, None) is None: - self.run_edge_id_status_queue_map[run_id_str] = Queue() - if self.run_edge_device_info_queue_map_for_stop.get(run_id_str, None) is None: - self.run_edge_device_info_queue_map_for_stop[run_id_str] = Queue() - # if self.run_edge_device_info_global_queue_for_stop is None: - # self.run_edge_device_info_global_queue_for_stop = Array('i', list()) - - self.run_stop_process_map[run_id_str] = Process( - target=server_runner.run_stop, args=( - self.run_process_event_map_for_stop[run_id_str], - self.run_edge_id_status_queue_map[run_id_str], - self.run_edge_device_info_queue_map_for_stop[run_id_str], - self.run_edge_device_info_global_queue_for_stop, - self.message_center.get_message_queue(), - ) - ) - self.run_stop_process_map[run_id_str].start() - return - - if self.running_request_json.get(run_id_str, None) is not None: - self.running_request_json.pop(run_id_str) - - if self.run_process_map.get(run_id_str, None) is not None: - self.run_process_map.pop(run_id_str) - - def run_stop(self, process_event, edge_id_status_queue, edge_device_info_queue, - edge_device_info_global_queue, message_center_queue): - if platform.system() != "Windows": - os.setsid() - - os.environ['PYTHONWARNINGS'] = 'ignore:semaphore_tracker:UserWarning' - os.environ.setdefault('PYTHONWARNINGS', 'ignore:semaphore_tracker:UserWarning') - - self.run_process_event = process_event - try: - MLOpsUtils.set_ntp_offset(self.ntp_offset) - - self.rebuild_message_center(message_center_queue) - - self.run_stop_impl(edge_id_status_queue, edge_device_info_queue, edge_device_info_global_queue) - except Exception as e: - logging.error("Stop runner exits with exceptions. 
{}".format(traceback.format_exc())) - finally: - logging.info("Release resources.") - - def run_stop_impl(self, edge_id_status_queue, edge_device_info_queue, edge_device_info_global_queue): - run_id_str = str(self.run_id) - edge_id_list = self.request_json["edgeids"] - - # Detect running status of all edges - status_ok, active_edge_info_dict, inactivate_edges = self.detect_edges_status( - edge_device_info_queue, edge_device_info_global_queue=edge_device_info_global_queue, - status_timeout=120, need_to_trigger_exception=False, - status_check_context=SchedulerConstants.STATUS_CHECK_FRO_RUN_STOP_CONTEXT) - - # Send the training stopping request to running edges. - for edge_id_item, _ in active_edge_info_dict.items(): - self.send_training_stop_request_to_specific_edge(edge_id_item, json.dumps(self.request_json)) - time.sleep(0.2) - time.sleep(3) - - total_sleep_seconds = 0 - allowed_status_check_sleep_seconds = 60 - server_id = self.edge_id - running_edges_list = list() - current_edge_id_status_map = dict() - - while True: - # Fetch edge id and status from the edge id status queue - while True: - try: - queue_item = edge_id_status_queue.get(block=False, timeout=3) - if queue_item is not None: - current_edge_id_status_map.update(queue_item) - except queue.Empty as e: # If queue is empty, then break loop - break - - # Calc the total killed device number - running_edges_list.clear() - number_of_failed_edges = 0 - number_of_finished_edges = 0 - number_of_killed_edges = 0 - for edge_id_item, status_item in current_edge_id_status_map.items(): - if edge_id_item == "server": - continue - - if status_item is None or status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED or \ - status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_EXCEPTION: - number_of_failed_edges += 1 - continue - - if status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED: - number_of_finished_edges += 1 - continue - - if status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED: - number_of_killed_edges += 1 - continue - - if status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE or \ - status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_OFFLINE: - continue - - running_edges_list.append(edge_id_item) - - # If the killed device number is equal total device number, then break - if len(running_edges_list) <= 0 and len(current_edge_id_status_map.keys()) == len(edge_id_list) + 1: - break - - # Calc the timeout value to wait to device killed. 
- time.sleep(3) - total_sleep_seconds += 3 - if total_sleep_seconds < allowed_status_check_sleep_seconds: - continue - - # If timeout, then report killed device status - no_response_edges = list(set(edge_id_list) - set(running_edges_list)) - if len(no_response_edges) <= 0: - break - for edge_id_item in no_response_edges: - self.mlops_metrics.report_client_id_status( - edge_id_item, ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED, - server_id=self.edge_id, run_id=self.run_id) - - if self.run_as_edge_server_and_agent or self.enable_simulation_cloud_agent: - # Stop log processor for current run - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(self.run_id, self.edge_id) - elif self.run_as_cloud_agent: - # Stop log processor for current run - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(self.run_id, server_id) - - self.mlops_metrics.report_server_id_status( - self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.edge_id) - - def set_run_status(self, run_id, status, running_request_json): - server_runner = FedMLServerRunner( - self.args, run_id=run_id, request_json=running_request_json, agent_config=self.agent_config - ) - server_runner.edge_id = self.edge_id - server_runner.run_as_edge_server_and_agent = self.run_as_edge_server_and_agent - server_runner.run_status = status - server_runner.message_center = self.message_center - server_runner.mlops_metrics = self.mlops_metrics - server_runner.cleanup_client_with_status() - - def callback_runner_id_status(self, topic, payload): - # logging.info("callback_runner_id_status: topic = %s, payload = %s" % (topic, payload)) - # logging.info( - # f"FedMLDebug - Receive: topic ({topic}), payload ({payload})" - # ) - - request_json = json.loads(payload) - is_retain = request_json.get("is_retain", False) - if is_retain: - return - run_id = request_json["run_id"] - status = request_json["status"] - edge_id = request_json["edge_id"] - server_id = request_json.get("server_id", None) - run_id_str = str(run_id) - - if ( - status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED - or status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED - or status == ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED - ): - completed_event = self.run_process_completed_event_map.get(run_id_str, None) - if completed_event is not None: - completed_event.set() - - FedMLServerDataInterface.get_instance().save_job_status(run_id, self.edge_id, status, status) - - # Stop server with multiprocessing mode - running_request_json = self.running_request_json.get(run_id_str, None) - if running_request_json is None: - running_request_json = request_json - if self.run_as_edge_server_and_agent or self.enable_simulation_cloud_agent: - self.set_run_status(run_id, status, running_request_json) - - run_process = self.run_process_map.get(run_id_str, None) - if run_process is not None: - if run_process.pid is not None: - RunProcessUtils.kill_process(run_process.pid) - - self.run_process_map.pop(run_id_str) - - # Stop log processor for current run - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id) - elif self.run_as_cloud_agent: - pass - elif self.run_as_cloud_server: - self.set_run_status(run_id, status, running_request_json) - - # Stop log processor for current run - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id) - if self.use_local_process_as_cloud_server: - # RunProcessUtils.kill_process(os.getpid()) - 
cloud_server_process = self.run_process_map.get(run_id_str, None) - if cloud_server_process is not None: - RunProcessUtils.kill_process(cloud_server_process.pid) - else: - self.stop_cloud_server() - - if self.run_process_map.get(run_id_str, None) is not None: - self.run_process_map.pop(run_id_str) - - self.remove_listener_for_run_metrics(self.run_id) - self.remove_listener_for_run_logs(self.run_id) - elif ( - status == ServerConstants.MSG_MLOPS_SERVER_STATUS_EXCEPTION - ): - request_json = self.running_request_json.get(run_id_str, None) - if request_json is not None: - edge_id_list = request_json.get("edgeids", list()) - server_id = request_json.get("serverId", None) - server_id = request_json.get("server_id", None) if server_id is None else server_id - self.send_training_stop_request_to_edges_when_exception( - edge_id_list, run_id=run_id, server_id=server_id, - status=ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED) - - FedMLServerDataInterface.get_instance().save_job_status(run_id, self.edge_id, status, status) - else: - request_json = self.running_request_json.get(run_id_str, None) - if request_json is None: - request_json = self.start_request_json - self.mlops_metrics.report_server_training_status( - run_id, status, edge_id=self.edge_id, running_json=json.dumps(request_json)) - - def cleanup_client_with_status(self): - if self.run_status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED: - # logging.info("received to finished status.") - self.cleanup_run_when_finished(should_send_server_id_status=False) - elif self.run_status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED: - # logging.info("received to failed status.") - self.cleanup_run_when_starting_failed(should_send_server_id_status=False) - elif self.run_status == ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED: - # logging.info("received to failed status.") - self.cleanup_run_when_starting_failed( - status=self.run_status, should_send_server_id_status=False) - - def callback_report_current_status(self, topic, payload): - logging.info( - f"FedMLDebug - Receive: topic ({topic}), payload ({payload})" - ) - - request_json = json.loads(payload) - if self.run_as_edge_server_and_agent: - self.send_agent_active_msg() - elif self.run_as_cloud_agent: - self.send_agent_active_msg() - elif self.run_as_cloud_server: - pass - - @staticmethod - def process_ota_upgrade_msg(): - os.system("pip install -U fedml") - - def callback_server_ota_msg(self, topic, payload): - logging.info( - f"FedMLDebug - Receive: topic ({topic}), payload ({payload})" - ) - - request_json = json.loads(payload) - cmd = request_json["cmd"] - - if cmd == ServerConstants.FEDML_OTA_CMD_UPGRADE: - try: - self.process_ota_upgrade_msg() - # Process(target=FedMLServerRunner.process_ota_upgrade_msg).start() - raise Exception("After upgraded, restart runner...") - except Exception as e: - pass - elif cmd == ServerConstants.FEDML_OTA_CMD_RESTART: - raise Exception("Restart runner...") - - def callback_response_device_info(self, topic, payload): - # Parse payload - payload_json = json.loads(payload) - run_id = payload_json.get("run_id", 0) - context = payload_json.get("context", None) - master_device_id = payload_json.get("master_device_id", 0) - slave_device_id = payload_json.get("slave_device_id", 0) - slave_device_id_list = payload_json.get("slave_device_id_list", 0) - edge_id = payload_json.get("edge_id", 0) - device_info = payload_json.get("edge_info", 0) - device_info["master_device_id"] = master_device_id - device_info["slave_device_id"] = slave_device_id - 
device_info["slave_device_id_list"] = slave_device_id_list - run_id_str = str(run_id) - - # Put device info into a multiprocessing queue so master runner checks if all edges are ready - if context is None: - if self.run_edge_device_info_queue_map.get(run_id_str, None) is None: - self.run_edge_device_info_queue_map[run_id_str] = Queue() - self.run_edge_device_info_queue_map[run_id_str].put(device_info) - - # if self.run_edge_device_info_global_queue is None: - # self.run_edge_device_info_global_queue = Array('i', list()) - # - # self.run_edge_device_info_global_queue[len(self.run_edge_device_info_global_queue)] = \ - # {"timestamp": time.time(), "edge_id": edge_id, "device_info": device_info} - - self.check_model_device_ready_and_deploy(run_id, master_device_id, slave_device_id, - slave_device_id_list=slave_device_id_list) - elif context == SchedulerConstants.STATUS_CHECK_FRO_RUN_STOP_CONTEXT: - if self.run_edge_device_info_queue_map_for_stop.get(run_id_str, None) is None: - self.run_edge_device_info_queue_map_for_stop[run_id_str] = Queue() - self.run_edge_device_info_queue_map_for_stop[run_id_str].put(device_info) - - # if self.run_edge_device_info_global_queue_for_stop is None: - # self.run_edge_device_info_global_queue_for_stop = Array('i', list()) - # - # self.run_edge_device_info_global_queue_for_stop[len(self.run_edge_device_info_global_queue_for_stop)] = \ - # {"timestamp": time.time(), "edge_id": edge_id, "device_info": device_info} - - def check_model_device_ready_and_deploy(self, run_id, master_device_id, slave_device_id, slave_device_id_list=None): - request_json = self.running_request_json.get(str(run_id), None) - if request_json is None: - return - run_config = request_json["run_config"] - run_params = run_config.get("parameters", {}) - job_yaml = run_params.get("job_yaml", {}) - job_type = job_yaml.get("job_type", None) - job_type = job_yaml.get("task_type", Constants.JOB_TASK_TYPE_TRAIN) if job_type is None else job_type - if job_type != Constants.JOB_TASK_TYPE_DEPLOY and job_type != Constants.JOB_TASK_TYPE_SERVE: - return - - # Init model device ids for each run - run_id_str = str(run_id) - if self.run_model_device_ids.get(run_id_str, None) is None: - self.run_model_device_ids[run_id_str] = list() - - # Append master device and slave devices to the model devices map - self.run_model_device_ids[run_id_str].append({"master_device_id": master_device_id, - "slave_device_id": slave_device_id}) - model_device_ids = self.run_model_device_ids.get(run_id_str, None) - if model_device_ids is None: - return - - # Check if all model devices are ready - if len(model_device_ids) != len(self.run_edge_ids.get(run_id_str, list())): - return - - # Generate model master ids and model slave device ids - device_master_ids = list() - device_slave_ids = list() - for device_ids in model_device_ids: - model_master_id = device_ids.get("master_device_id") - model_slave_id = device_ids.get("slave_device_id") - device_master_ids.append(model_master_id) - device_slave_ids.append(model_slave_id) - - if len(device_master_ids) <= 0: - return - - # Generate serving devices for deploying - serving_devices = list() - serving_devices.append(device_master_ids[0]) - serving_devices.extend(device_slave_ids) - - # Start to deploy the model - self.deploy_model(serving_devices, request_json, run_id=run_id) - - def callback_request_device_info_from_mlops(self, topic, payload): - self.response_device_info_to_mlops(topic, payload) - - def response_device_info_to_mlops(self, topic, payload): - response_topic = 
f"deploy/master_agent/mlops/response_device_info" - payload_json = json.loads(payload) - need_gpu_info = payload_json.get("need_gpu_info", False) - if self.mlops_metrics is not None: - if not need_gpu_info: - response_payload = { - "run_id": self.run_id, - "master_agent_device_id": self.edge_id, - "fedml_version": fedml.__version__ - } - else: - total_mem, free_mem, total_disk_size, free_disk_size, cup_utilization, cpu_cores, \ - gpu_cores_total, gpu_cores_available, sent_bytes, recv_bytes, gpu_available_ids = \ - sys_utils.get_sys_realtime_stats() - gpu_available_ids = JobRunnerUtils.get_instance().get_available_gpu_id_list(self.edge_id) - gpu_available_ids = JobRunnerUtils.trim_unavailable_gpu_ids(gpu_available_ids) - gpu_cores_available = len(gpu_available_ids) - response_payload = { - "run_id": self.run_id, - "master_agent_device_id": self.edge_id, - "memoryTotal": round(total_mem * MLOpsUtils.BYTES_TO_GB, 2), - "memoryAvailable": round(free_mem * MLOpsUtils.BYTES_TO_GB, 2), - "diskSpaceTotal": round(total_disk_size * MLOpsUtils.BYTES_TO_GB, 2), - "diskSpaceAvailable": round(free_disk_size * MLOpsUtils.BYTES_TO_GB, 2), - "cpuUtilization": round(cup_utilization, 2), - "cpuCores": cpu_cores, - "gpuCoresTotal": gpu_cores_total, - "gpuCoresAvailable": gpu_cores_available, - "networkTraffic": sent_bytes + recv_bytes, - "timestamp": int(MLOpsUtils.get_ntp_time()), - "fedml_version": fedml.__version__ - } - self.mlops_metrics.report_json_message(response_topic, json.dumps(response_payload)) - - @staticmethod - def get_device_id(): - device_file_path = os.path.join(ServerConstants.get_data_dir(), ServerConstants.LOCAL_RUNNER_INFO_DIR_NAME) - file_for_device_id = os.path.join(device_file_path, "devices.id") - if not os.path.exists(device_file_path): - os.makedirs(device_file_path) - elif os.path.exists(file_for_device_id): - with open(file_for_device_id, 'r', encoding='utf-8') as f: - device_id_from_file = f.readline() - if device_id_from_file is not None and device_id_from_file != "": - return device_id_from_file - - if platform.system() == "Darwin": - cmd_get_serial_num = "system_profiler SPHardwareDataType | grep Serial | awk '{gsub(/ /,\"\")}{print}' " \ - "|awk -F':' '{print $2}' " - device_id = os.popen(cmd_get_serial_num).read() - device_id = device_id.replace('\n', '').replace(' ', '') - if device_id is None or device_id == "": - device_id = hex(uuid.getnode()) - else: - device_id = "0x" + device_id - else: - if "nt" in os.name: - - def get_uuid(): - guid = "" - try: - cmd = "wmic csproduct get uuid" - guid = str(subprocess.check_output(cmd)) - pos1 = guid.find("\\n") + 2 - guid = guid[pos1:-15] - except Exception as ex: - pass - return str(guid) - - device_id = str(get_uuid()) - elif "posix" in os.name: - device_id = sys_utils.get_device_id_in_docker() - if device_id is None: - device_id = hex(uuid.getnode()) - else: - device_id = sys_utils.run_subprocess_open( - "hal-get-property --udi /org/freedesktop/Hal/devices/computer --key system.hardware.uuid".split() - ) - device_id = hex(device_id) - - if device_id is not None and device_id != "": - with open(file_for_device_id, 'w', encoding='utf-8') as f: - f.write(device_id) - else: - device_id = hex(uuid.uuid4()) - with open(file_for_device_id, 'w', encoding='utf-8') as f: - f.write(device_id) - - return device_id - - def bind_account_and_device_id(self, url, account_id, device_id, os_name, api_key="", role=None): - if role is None: - role = "edge_server" - if self.run_as_edge_server_and_agent: - role = "edge_server" - elif 
self.run_as_cloud_agent: - role = "cloud_agent" - elif self.run_as_cloud_server: - role = "cloud_server" - - ip = requests.get('https://checkip.amazonaws.com').text.strip() - fedml_ver, exec_path, os_ver, cpu_info, python_ver, torch_ver, mpi_installed, \ - cpu_usage, available_mem, total_mem, gpu_info, gpu_available_mem, gpu_total_mem, \ - gpu_count, gpu_vendor, cpu_count, gpu_device_name = get_sys_runner_info() - host_name = sys_utils.get_host_name() - json_params = { - "accountid": account_id, - "deviceid": device_id, - "type": os_name, - "state": ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE, - "status": ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE, - "processor": cpu_info, - "core_type": cpu_info, - "network": "", - "role": role, - "os_ver": os_ver, - "memory": total_mem, - "ip": ip, - "api_key": api_key, - "extra_infos": {"fedml_ver": fedml_ver, "exec_path": exec_path, "os_ver": os_ver, - "cpu_info": cpu_info, "python_ver": python_ver, "torch_ver": torch_ver, - "mpi_installed": mpi_installed, "cpu_usage": cpu_usage, - "available_mem": available_mem, "total_mem": total_mem, - "cpu_count": cpu_count, "gpu_count": 0, "host_name": host_name} - } - if gpu_count > 0: - if gpu_total_mem is not None: - json_params["gpu"] = (gpu_info if gpu_info is not None else "") + ", Total GPU Memory: " + gpu_total_mem - else: - json_params["gpu"] = gpu_info if gpu_info is not None else "" - json_params["extra_infos"]["gpu_info"] = gpu_info if gpu_info is not None else "" - if gpu_available_mem is not None: - json_params["extra_infos"]["gpu_available_mem"] = gpu_available_mem - if gpu_total_mem is not None: - json_params["extra_infos"]["gpu_total_mem"] = gpu_total_mem - - json_params["extra_infos"]["gpu_count"] = gpu_count - json_params["extra_infos"]["gpu_vendor"] = gpu_vendor - json_params["extra_infos"]["gpu_device_name"] = gpu_device_name - - gpu_available_id_list = sys_utils.get_available_gpu_id_list(limit=gpu_count) - gpu_available_count = len(gpu_available_id_list) if gpu_available_id_list is not None else 0 - gpu_list = sys_utils.get_gpu_list() - json_params["extra_infos"]["gpu_available_count"] = gpu_available_count - json_params["extra_infos"]["gpu_available_id_list"] = gpu_available_id_list - json_params["extra_infos"]["gpu_list"] = gpu_list - else: - json_params["gpu"] = "None" - json_params["extra_infos"]["gpu_available_count"] = 0 - json_params["extra_infos"]["gpu_available_id_list"] = [] - json_params["extra_infos"]["gpu_list"] = [] - - _, cert_path = MLOpsConfigs.get_request_params() - if cert_path is not None: - try: - response = requests.post( - url, json=json_params, verify=cert_path, - headers={"content-type": "application/json", "Connection": "close"} - ) - except requests.exceptions.SSLError: - MLOpsConfigs.install_root_ca_file() - response = requests.post( - url, json=json_params, verify=True, - headers={"content-type": "application/json", "Connection": "close"} - ) - else: - response = requests.post(url, json=json_params, headers={"Connection": "close"}) - edge_id = -1 - user_name = None - extra_url = None - if response.status_code != 200: - print(f"Binding to MLOps with response.status_code = {response.status_code}, " - f"response.content: {response.content}") - else: - # print("url = {}, response = {}".format(url, response)) - status_code = response.json().get("code") - if status_code == "SUCCESS": - edge_id = response.json().get("data").get("id") - user_name = response.json().get("data").get("userName", None) - extra_url =
response.json().get("data").get("url", None) - if edge_id is None or edge_id <= 0: - print(f"Binding to MLOps with response.status_code = {response.status_code}, " - f"response.content: {response.content}") - else: - if status_code == SchedulerConstants.BINDING_ACCOUNT_NOT_EXIST_ERROR: - raise SystemExit(SchedulerConstants.BINDING_ACCOUNT_NOT_EXIST_ERROR) - print(f"Binding to MLOps with response.status_code = {response.status_code}, " - f"response.content: {response.content}") - return -1, None, None - return edge_id, user_name, extra_url - - def fetch_configs(self): - return MLOpsConfigs.fetch_all_configs() - - def send_agent_active_msg(self): - active_topic = "flserver_agent/active" - status = MLOpsStatus.get_instance().get_server_agent_status(self.edge_id) - if ( - status is not None - and status != ServerConstants.MSG_MLOPS_SERVER_STATUS_OFFLINE - and status != ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE - ): - return - - if self.run_as_cloud_agent: - status = ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE - else: - try: - current_job = FedMLServerDataInterface.get_instance().get_job_by_id(self.run_id) - except Exception as e: - current_job = None - if current_job is None: - if status is not None and status == ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE: - status = ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE - else: - return - else: - status = ServerConstants.get_device_state_from_run_edge_state(current_job.status) - active_msg = {"ID": self.edge_id, "status": status} - MLOpsStatus.get_instance().set_server_agent_status(self.edge_id, status) - if self.mqtt_mgr is not None: - self.mqtt_mgr.send_message_json(active_topic, json.dumps(active_msg)) - else: - self.send_message_json(active_topic, json.dumps(active_msg)) - - def recover_start_train_msg_after_upgrading(self): - try: - current_job = FedMLServerDataInterface.get_instance().get_current_job() - if current_job is not None and \ - current_job.status == ServerConstants.MSG_MLOPS_SERVER_STATUS_UPGRADING: - logging.info("start training after upgrading.") - server_agent_id = self.edge_id - topic_start_train = "mlops/flserver_agent_" + str(server_agent_id) + "/start_train" - self.callback_start_train(topic_start_train, current_job.running_json) - except Exception as e: - logging.info("recover starting train message after upgrading: {}".format(traceback.format_exc())) - - def on_agent_mqtt_connected(self, mqtt_client_object): - # The MQTT message topic format is as follows: // - - # Setup MQTT message listener for starting training - server_agent_id = self.edge_id - topic_start_train = "mlops/flserver_agent_" + str(server_agent_id) + "/start_train" - self.add_message_listener(topic_start_train, self.callback_start_train) - self.mqtt_mgr.add_message_listener(topic_start_train, self.listener_message_dispatch_center) - - # Setup MQTT message listener for stopping training - topic_stop_train = "mlops/flserver_agent_" + str(server_agent_id) + "/stop_train" - self.add_message_listener(topic_stop_train, self.callback_stop_train) - self.mqtt_mgr.add_message_listener(topic_stop_train, self.listener_message_dispatch_center) - - # Setup MQTT message listener for server status switching - topic_server_status = "fl_server/flserver_agent_" + str(server_agent_id) + "/status" - self.add_message_listener(topic_server_status, self.callback_runner_id_status) - self.mqtt_mgr.add_message_listener(topic_server_status, self.listener_message_dispatch_center) - - # Setup MQTT message listener to report current device status. 
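Each registration above and below follows the same three-step pattern: bind a local handler to the topic, route the raw MQTT callback through the shared dispatch center, and subscribe with QoS 2. The dispatch side reduces to a topic-to-handler map; a minimal sketch (not the real `FedMLMessageCenter`):

```python
import json


class MiniDispatcher:
    def __init__(self):
        self._handlers = {}

    def add_message_listener(self, topic, handler):
        self._handlers[topic] = handler

    def dispatch(self, topic, payload):
        # Route an incoming message to the handler bound to its topic.
        handler = self._handlers.get(topic)
        if handler is not None:
            handler(topic, payload)


center = MiniDispatcher()
center.add_message_listener(
    "mlops/report_device_status",
    lambda topic, payload: print("report request:", json.loads(payload)))
center.dispatch("mlops/report_device_status", json.dumps({"from": "mlops"}))
```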
- topic_report_status = "mlops/report_device_status" - self.add_message_listener(topic_report_status, self.callback_report_current_status) - self.mqtt_mgr.add_message_listener(topic_report_status, self.listener_message_dispatch_center) - - # Setup MQTT message listener to OTA messages from the MLOps. - topic_ota_msg = "mlops/flserver_agent_" + str(server_agent_id) + "/ota" - self.add_message_listener(topic_ota_msg, self.callback_server_ota_msg) - self.mqtt_mgr.add_message_listener(topic_ota_msg, self.listener_message_dispatch_center) - - # Setup MQTT message listener to request device info from the client. - topic_response_device_info = "client/server/response_device_info/" + str(self.edge_id) - self.add_message_listener(topic_response_device_info, self.callback_response_device_info) - self.mqtt_mgr.add_message_listener(topic_response_device_info, self.listener_message_dispatch_center) - - # Setup MQTT message listener to request device info from MLOps. - topic_request_device_info_from_mlops = f"deploy/mlops/master_agent/request_device_info/{self.edge_id}" - self.add_message_listener(topic_request_device_info_from_mlops, self.callback_request_device_info_from_mlops) - self.mqtt_mgr.add_message_listener( - topic_request_device_info_from_mlops, self.listener_message_dispatch_center) - - # Subscribe topics for starting train, stopping train and fetching client status. - mqtt_client_object.subscribe(topic_start_train, qos=2) - mqtt_client_object.subscribe(topic_stop_train, qos=2) - mqtt_client_object.subscribe(topic_server_status, qos=2) - mqtt_client_object.subscribe(topic_report_status, qos=2) - mqtt_client_object.subscribe(topic_ota_msg, qos=2) - mqtt_client_object.subscribe(topic_response_device_info, qos=2) - mqtt_client_object.subscribe(topic_request_device_info_from_mlops, qos=2) - - self.subscribed_topics.clear() - self.subscribed_topics.append(topic_start_train) - self.subscribed_topics.append(topic_stop_train) - self.subscribed_topics.append(topic_server_status) - self.subscribed_topics.append(topic_report_status) - self.subscribed_topics.append(topic_ota_msg) - self.subscribed_topics.append(topic_response_device_info) - self.subscribed_topics.append(topic_request_device_info_from_mlops) - - # Broadcast the first active message. 
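One detail worth calling out before the active message goes out: the `runner_cmd` handed to a cloud server is just the start-train request JSON, base64-encoded on the agent side (see `callback_start_train` above) and decoded back in the cloud-server branch just below. The round trip is plain stdlib:

```python
import base64
import json

request_json = {"runId": 42, "edgeids": [101, 102]}  # illustrative payload

# Agent side: JSON -> ASCII bytes -> base64 text safe to pass as a CLI argument.
runner_cmd_encoded = base64.b64encode(
    json.dumps(request_json).encode("ascii")).decode("ascii")

# Cloud-server side: reverse the steps to recover the original request.
decoded = json.loads(base64.b64decode(runner_cmd_encoded).decode("ascii"))
assert decoded == request_json
```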
- self.send_agent_active_msg() - - # Start the message center for listener - self.start_listener(sender_message_queue=self.message_center.get_message_queue(), - agent_config=self.agent_config) - - if self.run_as_cloud_server: - # Start the FedML server - message_bytes = self.args.runner_cmd.encode("ascii") - base64_bytes = base64.b64decode(message_bytes) - payload = base64_bytes.decode("ascii") - self.receive_message_json(topic_start_train, payload) - - # Echo results - MLOpsRuntimeLog.get_instance(self.args).enable_show_log_to_stdout() - print("\nCongratulations, your device is connected to the FedML MLOps platform successfully!") - print( - "Your FedML Edge ID is " + str(self.edge_id) + ", unique device ID is " - + str(self.unique_device_id) - ) - MLOpsRuntimeLog.get_instance(self.args).enable_show_log_to_stdout(enable=True) - - def on_agent_mqtt_disconnected(self, mqtt_client_object): - MLOpsStatus.get_instance().set_server_agent_status( - self.edge_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_OFFLINE - ) - - def setup_agent_mqtt_connection(self, service_config): - # Setup MQTT connection - self.mqtt_mgr = MqttManager( - service_config["mqtt_config"]["BROKER_HOST"], - service_config["mqtt_config"]["BROKER_PORT"], - service_config["mqtt_config"]["MQTT_USER"], - service_config["mqtt_config"]["MQTT_PWD"], - service_config["mqtt_config"]["MQTT_KEEPALIVE"], - f"FedML_ServerAgent_Daemon_@{self.user_name}@_@{self.args.current_device_id}@_@{str(uuid.uuid4())}@", - "flserver_agent/last_will_msg", - json.dumps({"ID": self.edge_id, "status": ServerConstants.MSG_MLOPS_SERVER_STATUS_OFFLINE}) - ) - - # Init local database - FedMLServerDataInterface.get_instance().create_job_table() - - # Start the message center to process edge related messages. - self.setup_message_center() - - server_api_cmd = "fedml.computing.scheduler.master.server_api:api" - server_api_pids = RunProcessUtils.get_pid_from_cmd_line(server_api_cmd) - if server_api_pids is None or len(server_api_pids) <= 0: - # Start local API services - cur_dir = os.path.dirname(__file__) - fedml_base_dir = os.path.dirname(os.path.dirname(os.path.dirname(cur_dir))) - python_program = get_python_program() - self.local_api_process = ServerConstants.exec_console_with_script( - "{} -m uvicorn {} --host 0.0.0.0 --port {} --reload --reload-delay 3 --reload-dir {} " - "--log-level critical".format( - python_program, server_api_cmd, ServerConstants.LOCAL_SERVER_API_PORT, - fedml_base_dir - ), - should_capture_stdout=False, - should_capture_stderr=False - ) - # if self.local_api_process is not None and self.local_api_process.pid is not None: - # print(f"Server local API process id {self.local_api_process.pid}") - - # Setup MQTT connected listener - self.mqtt_mgr.add_connected_listener(self.on_agent_mqtt_connected) - self.mqtt_mgr.add_disconnected_listener(self.on_agent_mqtt_disconnected) - self.mqtt_mgr.connect() - - # Report the IDLE status to MLOps - self.mlops_metrics.report_server_training_status( - self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE, edge_id=self.edge_id) - MLOpsStatus.get_instance().set_server_agent_status( - self.edge_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE - ) - - # MLOpsRuntimeLogDaemon.get_instance(self.args).stop_all_log_processor() - - self.mlops_metrics.stop_device_realtime_perf() - self.mlops_metrics.report_device_realtime_perf(self.args, service_config["mqtt_config"], is_client=False) - - if not self.run_as_cloud_server: - self.recover_start_train_msg_after_upgrading() - - 
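Note the last-will payload in the `MqttManager` construction above: the broker itself publishes the OFFLINE status to `flserver_agent/last_will_msg` if the agent dies without a clean disconnect, so MLOps never sees a stuck "online" server. With plain paho-mqtt the equivalent setup is roughly as follows (1.x-style API; broker host and ids are placeholders):

```python
import json

import paho.mqtt.client as mqtt  # assumes the paho-mqtt 1.x constructor

client = mqtt.Client(client_id="FedML_ServerAgent_Daemon_example")
# The broker delivers this payload on our behalf if we vanish uncleanly.
client.will_set("flserver_agent/last_will_msg",
                json.dumps({"ID": 0, "status": "OFFLINE"}), qos=2)
# client.connect("broker.example.com", 1883, keepalive=180)
# client.loop_forever()
```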
JobCleanup.get_instance().sync_data_on_startup(self.edge_id, is_client=False)
-
-        self.master_api_daemon = MasterApiDaemon()
-        self.master_api_process = Process(target=self.master_api_daemon.run)
-        self.master_api_process.start()
-
-        # if self.model_device_server is None:
-        #     self.model_device_server = FedMLModelDeviceServerRunner(self.args, self.args.current_device_id,
-        #                                                             self.args.os_name, self.args.is_from_docker,
-        #                                                             self.agent_config)
-        #     self.model_device_server.start()
-
-    def start_agent_mqtt_loop(self):
-        # Start MQTT message loop
-        try:
-            self.mqtt_mgr.loop_forever()
-        except Exception as e:
-            if str(e) == "Restarting after upgraded...":
-                logging.info("Restarting after upgraded...")
-            else:
-                logging.info("Server tracing: {}".format(traceback.format_exc()))
-
-        finally:
-            login_exit_file = os.path.join(ServerConstants.get_log_file_dir(), "exited.log")
-            with open(login_exit_file, "w") as f:
-                f.write(f"{os.getpid()}.")
-
-            self.stop_agent()
-
-            time.sleep(5)
-            sys_utils.cleanup_all_fedml_server_login_processes(
-                ServerConstants.SERVER_LOGIN_PROGRAM, clean_process_group=False)
-            sys.exit(1)
-
-    def stop_agent(self):
-        if self.run_process_event is not None:
-            self.run_process_event.set()
-
-        if self.mqtt_mgr is not None:
-            try:
-                for topic in self.subscribed_topics:
-                    self.mqtt_mgr.unsubscribe_msg(topic)
-            except Exception as e:
-                pass
-
-            self.mqtt_mgr.loop_stop()
-            self.mqtt_mgr.disconnect()
-        self.release_message_center()
-
-    def get_runner(self):
-        runner = FedMLServerRunner(
-            self.args, run_id=self.run_id, request_json=self.request_json,
-            agent_config=self.agent_config
-        )
-        runner.run_as_edge_server_and_agent = self.run_as_edge_server_and_agent
-        runner.edge_id = self.edge_id
-        runner.server_agent_id = self.server_agent_id
-        runner.start_request_json = self.start_request_json
-        runner.unique_device_id = self.unique_device_id
-        runner.user_name = self.user_name
-        runner.run_as_cloud_agent = self.run_as_cloud_agent
-        runner.run_as_cloud_server = self.run_as_cloud_server
-        return runner
diff --git a/python/fedml/computing/scheduler/model_scheduler/device_client_runner_deprecated.py b/python/fedml/computing/scheduler/model_scheduler/device_client_runner_deprecated.py
deleted file mode 100755
index 8bb03eebb..000000000
--- a/python/fedml/computing/scheduler/model_scheduler/device_client_runner_deprecated.py
+++ /dev/null
@@ -1,1483 +0,0 @@
-import json
-import logging
-import multiprocessing
-import sys
-
-from multiprocessing import Process
-import os
-import platform
-import shutil
-import subprocess
-import threading
-
-import time
-import traceback
-import urllib
-import uuid
-import zipfile
-from urllib.parse import urlparse, urljoin
-
-import requests
-
-import yaml
-
-import fedml
-from fedml import mlops
-from fedml.computing.scheduler.model_scheduler.device_model_msg_object import FedMLModelMsgObject
-from fedml.computing.scheduler.scheduler_core.compute_cache_manager import ComputeCacheManager
-
-from fedml.computing.scheduler.scheduler_core.compute_utils import ComputeUtils
-from fedml.core.distributed.communication.s3.remote_storage import S3Storage
-from .device_model_cache import FedMLModelCache
-from ..comm_utils import sys_utils, security_utils
-
-from ..comm_utils.container_utils import ContainerUtils
-
-from ....core.mlops.mlops_runtime_log import MLOpsRuntimeLog
-
-from ....core.distributed.communication.mqtt.mqtt_manager import MqttManager
-from ..comm_utils.yaml_utils import
load_yaml_config -from .device_client_constants import ClientConstants - -from ....core.mlops.mlops_metrics import MLOpsMetrics - -from ....core.mlops.mlops_configs import MLOpsConfigs -from ....core.mlops.mlops_runtime_log_daemon import MLOpsRuntimeLogDaemon -from ....core.mlops.mlops_status import MLOpsStatus -from ..comm_utils.sys_utils import get_sys_runner_info, get_python_program -from .device_model_deployment import start_deployment, run_http_inference_with_curl_request -from .device_client_data_interface import FedMLClientDataInterface -from ....core.mlops.mlops_utils import MLOpsUtils -from ..comm_utils.job_utils import JobRunnerUtils -from fedml.computing.scheduler.comm_utils.run_process_utils import RunProcessUtils -from .device_mqtt_inference_protocol import FedMLMqttInference -from .device_model_db import FedMLModelDatabase -from ..comm_utils.constants import SchedulerConstants -from fedml.computing.scheduler.comm_utils.job_monitor import JobMonitor - -from .device_replica_handler import FedMLDeviceReplicaHandler - -from fedml.computing.scheduler.scheduler_core.endpoint_sync_protocol import FedMLEndpointSyncProtocol -import ssl - - -class RunnerError(Exception): - """ Runner failed. """ - pass - - -class RunnerCompletedError(Exception): - """ Runner completed. """ - pass - - -class FedMLClientRunner: - FEDML_BOOTSTRAP_RUN_OK = "[FedML]Bootstrap Finished" - - def __init__(self, args, edge_id=0, request_json=None, agent_config=None, run_id=0): - self.local_api_process = None - self.run_process_event = None - self.run_process_event_map = dict() - self.run_process_completed_event = None - self.run_process_completed_event_map = dict() - self.run_inference_event_map = dict() - self.run_inference_response_map = dict() - self.run_process_map = dict() - self.device_status = None - self.current_training_status = None - self.mqtt_mgr = None - self.client_mqtt_mgr = None - self.client_mqtt_is_connected = False - self.client_mqtt_lock = None - self.edge_id = edge_id - self.run_id = run_id - self.unique_device_id = None - self.args = args - self.request_json = request_json - self.version = args.version - self.device_id = args.device_id - self.cur_dir = os.path.split(os.path.realpath(__file__))[0] - if args.current_running_dir is not None: - self.cur_dir = args.current_running_dir - self.sudo_cmd = "" - self.is_mac = False - if platform.system() == "Darwin": - self.is_mac = True - - self.agent_config = agent_config - self.fedml_data_base_package_dir = os.path.join("/", "fedml", "data") - self.fedml_data_local_package_dir = os.path.join("/", "fedml", "fedml-package", "fedml", "data") - self.fedml_data_dir = self.fedml_data_base_package_dir - self.fedml_config_dir = os.path.join("/", "fedml", "conf") - - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES = {} - - self.mlops_metrics = None - self.client_active_list = dict() - self.infer_host = "127.0.0.1" - self.redis_addr = "local" - self.redis_port = "6379" - self.redis_password = "fedml_default" - - self.model_runner_mapping = dict() - self.ntp_offset = MLOpsUtils.get_ntp_offset() - self.running_request_json = dict() - self.endpoint_inference_runners = dict() - self.mqtt_inference_obj = None - - self.subscribed_topics = list() - self.user_name = None - - self.replica_handler = None - - def unzip_file(self, zip_file, unzip_file_path) -> str: - unziped_file_name = "" - if zipfile.is_zipfile(zip_file): - with zipfile.ZipFile(zip_file, "r") as zipf: - zipf.extractall(unzip_file_path) - unziped_file_name = zipf.namelist()[0] - else: - raise 
Exception("Invalid zip file {}".format(zip_file)) - - return unziped_file_name - - def retrieve_and_unzip_package(self, package_name, package_url): - """ - Download the package from the url and unzip it to the local package directory - ~/.fedml/fedml-model-client/fedml/model_packages/${end_point_id}_${end_point_name}_${model_name}_${model_version} - Under this folder, there should be the zipped file and the unzipped folder. - the zipped file starts with fedml_run_${end_point_id}_${end_point_name}_${model_name}_${model_version} - """ - # Models root directory - local_package_path = ClientConstants.get_model_package_dir() - os.makedirs(local_package_path, exist_ok=True) - - # Specify this model directory using ${end_point_id}_${end_point_name}_${model_name}_${model_version} - run_id = self.request_json["end_point_id"] - end_point_name = self.request_json["end_point_name"] - model_config = self.request_json["model_config"] - model_name = model_config["model_name"] - model_version = model_config["model_version"] - - model_version = model_version.replace(" ", "-") # Avoid using space for folder name - model_version = model_version.replace(":", "-") # Since docker mount will conflict with ":" - - this_run_model_dir = f"{run_id}_{end_point_name}_{model_name}_{model_version}" - this_run_model_full_path = os.path.join(local_package_path, this_run_model_dir) - os.makedirs(this_run_model_full_path, exist_ok=True) - - # Download the zipped package, overwrite it even if it exists - filename, filename_without_extension, file_extension = ClientConstants.get_filename_and_extension(package_url) - local_package_file = os.path.join(this_run_model_full_path, - f"fedml_run_{self.run_id}_{self.edge_id}_{filename_without_extension}") - if os.path.exists(local_package_file): - os.remove(local_package_file) - logging.info("Download from package_url {}".format(package_url)) - ssl._create_default_https_context = ssl._create_unverified_context - urllib.request.urlretrieve(package_url, local_package_file, - reporthook=self.package_download_progress) - - # Unzip the package in the same folder, overwrite the unzipped folder even if it exists - unzip_package_path = os.path.join(this_run_model_full_path, - f"unzip_fedml_run_{self.run_id}_{self.edge_id}_{filename_without_extension}") - try: - shutil.rmtree(unzip_package_path, ignore_errors=True) - except Exception as e: - pass - package_dir_name = self.unzip_file(local_package_file, unzip_package_path) - unzip_package_full_path = os.path.join(unzip_package_path, package_dir_name) - model_bin_file = os.path.join(unzip_package_path, "fedml_model.bin") # Will deprecated - logging.info("local_package_file {}, unzip_package_path {}, unzip file full path {}".format( - local_package_file, unzip_package_path, unzip_package_full_path)) - - return unzip_package_full_path, model_bin_file - - def retrieve_binary_model_file(self, package_name, package_url): - local_package_path = ClientConstants.get_model_package_dir() - if not os.path.exists(local_package_path): - os.makedirs(local_package_path, exist_ok=True) - unzip_package_path = ClientConstants.get_model_dir() - local_package_file = "{}".format(os.path.join(local_package_path, package_name)) - if os.path.exists(local_package_file): - os.remove(local_package_file) - urllib.request.urlretrieve(package_url, local_package_file, - reporthook=self.package_download_progress) - - unzip_package_path = os.path.join(unzip_package_path, package_name) - if not os.path.exists(unzip_package_path): - os.makedirs(unzip_package_path, exist_ok=True) 
- dst_model_file = os.path.join(unzip_package_path, package_name) - if os.path.exists(local_package_file): - shutil.copy(local_package_file, dst_model_file) - - return unzip_package_path, dst_model_file - - def package_download_progress(self, count, blksize, filesize): - self.check_runner_stop_event() - - downloaded = count * blksize - downloaded = filesize if downloaded > filesize else downloaded - progress = (downloaded / filesize * 100) if filesize != 0 else 0 - progress_int = int(progress) - downloaded_kb = format(downloaded / 1024, '.2f') - - # since this hook function is stateless, we need a state to avoid printing progress repeatedly - if count == 0: - self.prev_download_progress = 0 - if progress_int != self.prev_download_progress and progress_int % 5 == 0: - self.prev_download_progress = progress_int - logging.info("package downloaded size {} KB, progress {}%".format(downloaded_kb, progress_int)) - - def build_dynamic_constrain_variables(self, run_id, run_config): - pass - - def update_local_fedml_config(self, run_id, model_config, model_config_parameters): - model_name = model_config["model_name"] - model_storage_url = model_config["model_storage_url"] - - # Retrieve model package or model binary file. - unzip_package_path, model_bin_file = self.retrieve_and_unzip_package(model_name, model_storage_url) - - # Load the config to memory - fedml_local_config_file = os.path.join(unzip_package_path, "fedml_model_config.yaml") - - # Inject the config from UI to pkg yaml - package_conf_object = model_config_parameters - - # Save the config to local - with open(fedml_local_config_file, "w") as f: - yaml.dump(package_conf_object, f) - - logging.info("The package_conf_object is {}".format(package_conf_object)) - - return unzip_package_path, model_bin_file, package_conf_object - - def build_dynamic_args(self, run_config, package_conf_object, base_dir): - pass - - def download_model_package(self, package_name, package_url): - # Copy config file from the client - unzip_package_path = self.retrieve_and_unzip_package( - package_name, package_url - ) - - return unzip_package_path - - def run(self, process_event, completed_event): - # print(f"Model worker runner process id {os.getpid()}, run id {self.run_id}") - - if platform.system() != "Windows": - os.setsid() - - os.environ['PYTHONWARNINGS'] = 'ignore:semaphore_tracker:UserWarning' - os.environ.setdefault('PYTHONWARNINGS', 'ignore:semaphore_tracker:UserWarning') - - self.run_process_event = process_event - self.run_process_completed_event = completed_event - run_id = self.request_json.get("end_point_id") - - try: - FedMLModelDatabase.get_instance().set_database_base_dir(ClientConstants.get_database_dir()) - FedMLModelDatabase.get_instance().create_table() - - MLOpsUtils.set_ntp_offset(self.ntp_offset) - self.setup_client_mqtt_mgr() - - if not self.run_impl(): - logging.info( - f"[endpoint/device][{run_id}/{self.edge_id}] " - f"Failed to run the model deployment. 
run_impl return False.") - - # This if condition only happens when run_impl return False in a controllable way - # Under this condition, the run_impl itself should have handled the cleanup - # So no need to self.release_gpu_ids(run_id) - except RunnerError: - logging.error( - f"[endpoint/device][{run_id}/{self.edge_id}] " - f"Failed due to RunnerError {traceback.format_exc()}") - self.release_gpu_ids(run_id) - - self.reset_devices_status(self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED) - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id) - except RunnerCompletedError: - logging.error( - f"[endpoint/device][{run_id}/{self.edge_id}] " - f"Failed due to RunnerCompletedError {traceback.format_exc()}") - self.release_gpu_ids(run_id) - - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id) - except Exception as e: - logging.error( - f"[endpoint/device][{run_id}/{self.edge_id}] " - f"Failed due to exception {traceback.format_exc()}") - - self.cleanup_run_when_starting_failed() - self.mlops_metrics.client_send_exit_train_msg( - run_id, self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED) - - self.release_gpu_ids(run_id) - - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id) - time.sleep(2) - sys.exit(1) - finally: - logging.info("[Worker] Release resources after deployment.") - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id) - if self.mlops_metrics is not None: - self.mlops_metrics.stop_sys_perf() - time.sleep(3) - self.release_client_mqtt_mgr() - - def release_gpu_ids(self, run_id): - JobRunnerUtils.get_instance().release_gpu_ids(run_id, self.edge_id) - - def check_runner_stop_event(self): - if self.run_process_event.is_set(): - logging.info("Received stopping event.") - raise RunnerError("Runner stopped") - - if self.run_process_completed_event is not None and self.run_process_completed_event.is_set(): - logging.info("Received completed event.") - raise RunnerCompletedError("Runner completed") - - def run_impl(self): - # Get deployment params - run_id = self.request_json["end_point_id"] - end_point_name = self.request_json["end_point_name"] - device_ids = self.request_json["device_ids"] - master_ip = self.request_json["master_node_ip"] - model_config = self.request_json["model_config"] - model_name = model_config["model_name"] - model_id = model_config["model_id"] - model_version = model_config["model_version"] - model_config_parameters = self.request_json["parameters"] - inference_port = model_config_parameters.get("worker_internal_port", - ClientConstants.MODEL_INFERENCE_DEFAULT_PORT) - inference_port_external = model_config_parameters.get("worker_external_port", inference_port) - inference_engine = model_config_parameters.get("inference_engine", - ClientConstants.INFERENCE_ENGINE_TYPE_INT_DEFAULT) - inference_end_point_id = run_id - - MLOpsRuntimeLog.get_instance(self.args).init_logs(log_level=logging.INFO) - - logging.info(f"[Worker] Received model deployment request from master for endpoint {run_id}.") - if self.replica_handler is not None: - logging.info(f"=================Worker replica Handler ======================" - f"Reconcile with num diff {self.replica_handler.replica_num_diff} " - f"and version diff {self.replica_handler.replica_version_diff}." 
- f"=============================================================") - else: - logging.error(f"[Worker] Replica handler is None.") - return False - - self.check_runner_stop_event() - - # Report the deployment status to mlops - self.mlops_metrics.report_client_training_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_INITIALIZING, - is_from_model=True, running_json=json.dumps(self.request_json), run_id=run_id) - self.mlops_metrics.report_client_training_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_RUNNING, - is_from_model=True, run_id=run_id) - - self.check_runner_stop_event() - - # Reconcile the replica number (op: add, remove) - prev_rank, op, op_num = self.replica_handler.reconcile_num_replica() - - # Reconcile the replica version (op: update) - replica_rank_to_update = [] - if not op: - replica_rank_to_update, op = self.replica_handler.reconcile_replica_version() - - if not op: - logging.info("[Worker] No need to reconcile.") - return True - - logging.info( - f"================Worker Reconcile Operations ======================\n" - f" op: {op}; op num: {op_num}.\n" - f"==================================================================\n") - - # If not rollback, download package from MLOps; otherwise, use the backup package - if op != "rollback": - logging.info("Download and unzip model to local...") - unzip_package_path, _, _ = \ - self.update_local_fedml_config(run_id, model_config, model_config_parameters) - if unzip_package_path is None: - logging.info("Failed to update local fedml config.") - self.check_runner_stop_event() - self.cleanup_run_when_starting_failed() - self.mlops_metrics.client_send_exit_train_msg(run_id, self.edge_id, - ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED) - return False - - if not os.path.exists(unzip_package_path): - logging.info("Failed to unzip file.") - self.check_runner_stop_event() - self.cleanup_run_when_starting_failed() - self.mlops_metrics.client_send_exit_train_msg(run_id, self.edge_id, - ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED) - return False - else: - logging.info("Try to use backup package to rollback...") - # Find folder under "~/.fedml/fedml-model-client/fedml/model_packages \ - # /${end_point_id}_${end_point_name}_${model_name}_${model_version}" - backup_folder_full_path = None - models_root_dir = ClientConstants.get_model_package_dir() - - # Find the version (notified by master) to rollback - version_diff_dict = self.request_json["replica_version_diff"][str(self.edge_id)] - version_rollback_to = None - for replica_no, rollback_ops in version_diff_dict.items(): - version_rollback_to = rollback_ops["new_version"] # Note that new_version is the version to rollback - break - if version_rollback_to is None: - logging.error(f"No old version found for run_id: {self.run_id} " - f"edge_id: {self.edge_id}, rollback failed. 
No old version found in request_json.") - return False - model_version = version_rollback_to - - # Format the version to match the folder name - model_version_formatted = version_rollback_to.replace(" ", "-") - model_version_formatted = model_version_formatted.replace(":", "-") - - last_run_folder_sub_fd = f"{run_id}_{end_point_name}_{model_name}_{model_version_formatted}" - for folder in os.listdir(models_root_dir): - if last_run_folder_sub_fd in folder: - backup_folder_full_path = os.path.join(models_root_dir, folder) - break - if backup_folder_full_path is None: - logging.error(f"No backup folder found for run_id: {self.run_id} edge_id: {self.edge_id} " - f"under {models_root_dir} with sub folder {last_run_folder_sub_fd}, rollback failed.") - return False - - # Inside backup folder, find unzipped package with prefix unzip_fedml_run - unzip_package_path_parent = None - for folder in os.listdir(backup_folder_full_path): - if folder.startswith("unzip_fedml_run"): - unzip_package_path_parent = os.path.join(backup_folder_full_path, folder) - break - - # Inside unzip folder, find the unzipped package, should be the only one - unzip_package_path = None - for folder in os.listdir(unzip_package_path_parent): - if os.path.isdir(os.path.join(unzip_package_path_parent, folder)): - unzip_package_path = os.path.join(unzip_package_path_parent, folder) - break - - if unzip_package_path is None: - logging.error(f"No unzipped package found for run_id: {self.run_id} edge_id: {self.edge_id} " - f"under {backup_folder_full_path}, rollback failed.") - return False - - self.check_runner_stop_event() - - running_model_name, inference_output_url, inference_model_version, model_metadata, model_config = \ - "", "", model_version, {}, {} - - if op == "add": - worker_ip = self.get_ip_address(self.request_json) - for rank in range(prev_rank + 1, prev_rank + 1 + op_num): - try: - running_model_name, inference_output_url, inference_model_version, model_metadata, model_config = \ - start_deployment( - end_point_id=inference_end_point_id, end_point_name=end_point_name, model_id=model_id, - model_version=model_version, model_storage_local_path=unzip_package_path, - inference_model_name=model_name, inference_engine=inference_engine, - infer_host=worker_ip, master_ip=master_ip, edge_id=self.edge_id, - master_device_id=device_ids[0], replica_rank=rank, - gpu_per_replica=int(self.replica_handler.gpu_per_replica) - ) - except Exception as e: - inference_output_url = "" - logging.error(f"[Worker] Exception at deployment: {traceback.format_exc()}") - - if inference_output_url == "": - logging.error("[Worker] Failed to deploy the model.") - - # Release the gpu occupancy - FedMLModelCache.get_instance().set_redis_params() - replica_occupied_gpu_ids_str = FedMLModelCache.get_instance().get_replica_gpu_ids( - run_id, end_point_name, model_name, self.edge_id, rank + 1) - logging.info(f"Release gpu ids {replica_occupied_gpu_ids_str} for " - f"failed deployment of replica no {rank + 1}.") - - if replica_occupied_gpu_ids_str is not None: - replica_occupied_gpu_ids = json.loads(replica_occupied_gpu_ids_str) - JobRunnerUtils.get_instance().release_partial_job_gpu(run_id, - self.edge_id, replica_occupied_gpu_ids) - - # Send failed result back to master - result_payload = self.send_deployment_results( - end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED, - model_id, model_name, inference_output_url, inference_model_version, inference_port, - inference_engine, model_metadata, model_config) - - 
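                        # Aside: send_deployment_results() (defined further below) publishes the
-                        # payload built by construct_deployment_results() over MQTT. A minimal
-                        # sketch with hypothetical ids (endpoint 100, worker device 42):
-                        #
-                        #     topic = "model_device/model_device/return_deployment_result/100/42"
-                        #     self.client_mqtt_mgr.send_message_json(topic, json.dumps(result_payload))
-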
self.mlops_metrics.run_id = self.run_id
-                        self.mlops_metrics.broadcast_client_training_status(
-                            self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED,
-                            is_from_model=True, run_id=self.run_id)
-
-                        self.mlops_metrics.client_send_exit_train_msg(
-                            run_id, self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED)
-
-                        return False
-                    else:
-                        # Send successful result back to master
-                        logging.info("Finished deployment, continue to send results to master...")
-                        result_payload = self.send_deployment_results(
-                            end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED,
-                            model_id, model_name, inference_output_url, model_version, inference_port_external,
-                            inference_engine, model_metadata, model_config, replica_no=rank + 1)
-
-                        if inference_port_external != inference_port:
-                            # Save the internal port to the local db
-                            logging.info("inference_port_external {} != inference_port {}".format(
-                                inference_port_external, inference_port))
-                            result_payload = self.construct_deployment_results(
-                                end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED,
-                                model_id, model_name, inference_output_url, model_version, inference_port,
-                                inference_engine, model_metadata, model_config, replica_no=rank + 1)
-
-                        FedMLModelDatabase.get_instance().set_deployment_result(
-                            run_id, end_point_name, model_name, model_version, self.edge_id,
-                            json.dumps(result_payload), replica_no=rank + 1)
-
-                        logging.info(f"Deployed replica {rank + 1} / {prev_rank + 1 + op_num} successfully.")
-                        time.sleep(5)
-
-            time.sleep(1)
-            self.mlops_metrics.run_id = self.run_id
-            self.mlops_metrics.broadcast_client_training_status(
-                self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED,
-                is_from_model=True, run_id=self.run_id)
-            return True
-        elif op == "remove":
-            for rank_to_delete in range(prev_rank, prev_rank - op_num, -1):
-                self.replica_handler.remove_replica(rank_to_delete)
-
-                FedMLModelCache.get_instance().set_redis_params()
-                replica_occupied_gpu_ids_str = FedMLModelCache.get_instance().get_replica_gpu_ids(
-                    run_id, end_point_name, model_name, self.edge_id, rank_to_delete + 1)
-
-                replica_occupied_gpu_ids = json.loads(replica_occupied_gpu_ids_str)
-
-                JobRunnerUtils.get_instance().release_partial_job_gpu(run_id, self.edge_id, replica_occupied_gpu_ids)
-
-                FedMLModelDatabase.get_instance().delete_deployment_result_with_device_id_and_rank(
-                    run_id, end_point_name, model_name, self.edge_id, rank_to_delete)
-
-                # Report the deletion msg to master
-                result_payload = self.send_deployment_results(
-                    end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DELETED,
-                    model_id, model_name, inference_output_url, model_version, inference_port_external,
-                    inference_engine, model_metadata, model_config, replica_no=rank_to_delete + 1)
-
-                time.sleep(1)
-                self.mlops_metrics.run_id = self.run_id
-                self.mlops_metrics.broadcast_client_training_status(
-                    self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED,
-                    is_from_model=True, run_id=self.run_id)
-
-                # TODO: If all replicas have been deleted, also delete the job and its related resources
-                if rank_to_delete == 0:
-                    pass
-            return True
-        elif op == "update" or op == "rollback":
-            # An update is a combination of delete and add
-            worker_ip = self.get_ip_address(self.request_json)
-            for rank in replica_rank_to_update:
-                # Delete a replica (container) if it exists
-                self.replica_handler.remove_replica(rank)
-
-                FedMLModelCache.get_instance().set_redis_params()
-                replica_occupied_gpu_ids_str =
FedMLModelCache.get_instance().get_replica_gpu_ids( - run_id, end_point_name, model_name, self.edge_id, rank + 1) - - replica_occupied_gpu_ids = json.loads(replica_occupied_gpu_ids_str) - logging.info(f"Release gpu ids {replica_occupied_gpu_ids} for update / rollback.") - - # TODO (Raphael) check if this will allow another job to seize the gpu during high concurrency: - try: - JobRunnerUtils.get_instance().release_partial_job_gpu( - run_id, self.edge_id, replica_occupied_gpu_ids) - except Exception as e: - if op == "rollback": - pass - else: - logging.error(f"Failed to release gpu ids {replica_occupied_gpu_ids} for update.") - return False - - # Delete the deployment result from local db - FedMLModelDatabase.get_instance().delete_deployment_result_with_device_id_and_rank( - run_id, end_point_name, model_name, self.edge_id, rank) - - logging.info(f"Delete replica with no {rank + 1} successfully.") - time.sleep(1) - - # Add a replica (container) - # TODO: Reduce the duplicated code - logging.info(f"Start to deploy the model with replica no {rank + 1} ...") - try: - running_model_name, inference_output_url, inference_model_version, model_metadata, model_config = \ - start_deployment( - end_point_id=inference_end_point_id, end_point_name=end_point_name, model_id=model_id, - model_version=model_version, model_storage_local_path=unzip_package_path, - inference_model_name=model_name, inference_engine=inference_engine, - infer_host=worker_ip, master_ip=master_ip, edge_id=self.edge_id, - master_device_id=device_ids[0], replica_rank=rank, - gpu_per_replica=int(self.replica_handler.gpu_per_replica) - ) - except Exception as e: - inference_output_url = "" - logging.error(f"Exception at deployment: {traceback.format_exc()}") - - if inference_output_url == "": - logging.error("Failed to deploy the model...") - - # If update failed, should release this replica's gpu - FedMLModelCache.get_instance().set_redis_params() - replica_occupied_gpu_ids_str = FedMLModelCache.get_instance().get_replica_gpu_ids( - run_id, end_point_name, model_name, self.edge_id, rank + 1) - - replica_occupied_gpu_ids = json.loads(replica_occupied_gpu_ids_str) - - JobRunnerUtils.get_instance().release_partial_job_gpu( - run_id, self.edge_id, replica_occupied_gpu_ids) - - result_payload = self.send_deployment_results( - end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED, - model_id, model_name, inference_output_url, inference_model_version, inference_port, - inference_engine, model_metadata, model_config) - - self.mlops_metrics.run_id = self.run_id - self.mlops_metrics.broadcast_client_training_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED, - is_from_model=True, run_id=self.run_id) - - self.mlops_metrics.client_send_exit_train_msg( - run_id, self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED) - - return False - else: - logging.info("Finished deployment, continue to send results to master...") - result_payload = self.send_deployment_results( - end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED, - model_id, model_name, inference_output_url, model_version, inference_port_external, - inference_engine, model_metadata, model_config, replica_no=rank + 1) - - if inference_port_external != inference_port: # Save internal port to local db - logging.info("inference_port_external {} != inference_port {}".format( - inference_port_external, inference_port)) - result_payload = self.construct_deployment_results( - end_point_name, self.edge_id, 
ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED, - model_id, model_name, inference_output_url, model_version, inference_port, - inference_engine, model_metadata, model_config, replica_no=rank + 1) - - FedMLModelDatabase.get_instance().set_deployment_result( - run_id, end_point_name, model_name, model_version, self.edge_id, - json.dumps(result_payload), replica_no=rank + 1) - - logging.info(f"Update replica with no {rank + 1} successfully. Op num {op_num}") - time.sleep(5) - time.sleep(1) - self.mlops_metrics.run_id = self.run_id - self.mlops_metrics.broadcast_client_training_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED, - is_from_model=True, run_id=self.run_id) - return True - - else: - # The delete op will be handled by callback_delete_deployment - logging.error(f"Unsupported op {op} with op num {op_num}") - return False - - def construct_deployment_results(self, end_point_name, device_id, model_status, - model_id, model_name, model_inference_url, - model_version, inference_port, inference_engine, - model_metadata, model_config, replica_no=1): - deployment_results_payload = {"end_point_id": self.run_id, "end_point_name": end_point_name, - "model_id": model_id, "model_name": model_name, - "model_url": model_inference_url, "model_version": model_version, - "port": inference_port, - "inference_engine": inference_engine, - "model_metadata": model_metadata, - "model_config": model_config, - "model_status": model_status, - "inference_port": inference_port, - "replica_no": replica_no, - } - return deployment_results_payload - - def construct_deployment_status(self, end_point_name, device_id, - model_id, model_name, model_version, - model_inference_url, model_status, - inference_port=ClientConstants.MODEL_INFERENCE_DEFAULT_PORT, - replica_no=1, # start from 1 - ): - deployment_status_payload = {"end_point_id": self.run_id, "end_point_name": end_point_name, - "device_id": device_id, - "model_id": model_id, "model_name": model_name, - "model_version": model_version, - "model_url": model_inference_url, "model_status": model_status, - "inference_port": inference_port, - "replica_no": replica_no, - } - return deployment_status_payload - - def send_deployment_results(self, end_point_name, device_id, model_status, - model_id, model_name, model_inference_url, - model_version, inference_port, inference_engine, - model_metadata, model_config, replica_no=1): - deployment_results_topic = "model_device/model_device/return_deployment_result/{}/{}".format( - self.run_id, device_id) - - deployment_results_payload = self.construct_deployment_results( - end_point_name, device_id, model_status, - model_id, model_name, model_inference_url, - model_version, inference_port, inference_engine, - model_metadata, model_config, replica_no=replica_no) - - logging.info("[client] send_deployment_results: topic {}, payload {}.".format(deployment_results_topic, - deployment_results_payload)) - self.client_mqtt_mgr.send_message_json(deployment_results_topic, json.dumps(deployment_results_payload)) - return deployment_results_payload - - def send_deployment_status(self, end_point_name, device_id, - model_id, model_name, model_version, - model_inference_url, model_status, - inference_port=ClientConstants.MODEL_INFERENCE_DEFAULT_PORT, - replica_no=1, # start from 1 - ): - # Deprecated - pass - - def reset_devices_status(self, edge_id, status): - self.mlops_metrics.run_id = self.run_id - self.mlops_metrics.edge_id = edge_id - self.mlops_metrics.broadcast_client_training_status( - edge_id, status, 
is_from_model=True, run_id=self.run_id)
-
-    def cleanup_run_when_starting_failed(self):
-        logging.info("Cleaning up the run after it failed to start.")
-
-        self.reset_devices_status(self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED)
-
-        time.sleep(2)
-
-        try:
-            self.mlops_metrics.stop_sys_perf()
-        except Exception as ex:
-            pass
-
-        time.sleep(1)
-
-    def cleanup_run_when_finished(self):
-        logging.info("Cleaning up the run after it finished.")
-
-        self.reset_devices_status(self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED)
-
-        time.sleep(2)
-
-        try:
-            self.mlops_metrics.stop_sys_perf()
-        except Exception as ex:
-            pass
-
-        time.sleep(1)
-
-    def on_client_mqtt_disconnected(self, mqtt_client_object):
-        if self.client_mqtt_lock is None:
-            self.client_mqtt_lock = threading.Lock()
-
-        self.client_mqtt_lock.acquire()
-        self.client_mqtt_is_connected = False
-        self.client_mqtt_lock.release()
-
-    def on_client_mqtt_connected(self, mqtt_client_object):
-        if self.mlops_metrics is None:
-            self.mlops_metrics = MLOpsMetrics()
-
-        self.mlops_metrics.set_messenger(self.client_mqtt_mgr)
-        self.mlops_metrics.run_id = self.run_id
-
-        if self.client_mqtt_lock is None:
-            self.client_mqtt_lock = threading.Lock()
-
-        self.client_mqtt_lock.acquire()
-        self.client_mqtt_is_connected = True
-        self.client_mqtt_lock.release()
-
-    def setup_client_mqtt_mgr(self):
-        if self.client_mqtt_mgr is not None:
-            return
-
-        if self.client_mqtt_lock is None:
-            self.client_mqtt_lock = threading.Lock()
-
-        self.client_mqtt_mgr = MqttManager(
-            self.agent_config["mqtt_config"]["BROKER_HOST"],
-            self.agent_config["mqtt_config"]["BROKER_PORT"],
-            self.agent_config["mqtt_config"]["MQTT_USER"],
-            self.agent_config["mqtt_config"]["MQTT_PWD"],
-            self.agent_config["mqtt_config"]["MQTT_KEEPALIVE"],
-            "FedML_ModelClientAgent_Metrics_@{}@_{}_{}_{}".format(self.user_name, self.args.current_device_id,
-                                                                  str(os.getpid()),
-                                                                  str(uuid.uuid4()))
-        )
-
-        self.client_mqtt_mgr.add_connected_listener(self.on_client_mqtt_connected)
-        self.client_mqtt_mgr.add_disconnected_listener(self.on_client_mqtt_disconnected)
-        self.client_mqtt_mgr.connect()
-        self.client_mqtt_mgr.loop_start()
-
-        if self.mlops_metrics is None:
-            self.mlops_metrics = MLOpsMetrics()
-        self.mlops_metrics.set_messenger(self.client_mqtt_mgr)
-        self.mlops_metrics.run_id = self.run_id
-
-    def release_client_mqtt_mgr(self):
-        try:
-            if self.client_mqtt_mgr is not None:
-                self.client_mqtt_mgr.loop_stop()
-                self.client_mqtt_mgr.disconnect()
-
-            self.client_mqtt_lock.acquire()
-            if self.client_mqtt_mgr is not None:
-                self.client_mqtt_is_connected = False
-                self.client_mqtt_mgr = None
-            self.client_mqtt_lock.release()
-        except Exception:
-            pass
-
-    def ota_upgrade(self, payload, request_json):
-        run_id = request_json["end_point_id"]
-        force_ota = False
-        ota_version = None
-
-        try:
-            parameters = request_json.get("parameters", None)
-            common_args = parameters.get("common_args", None)
-            force_ota = common_args.get("force_ota", False)
-            ota_version = common_args.get("ota_version", None)
-        except Exception as e:
-            pass
-
-        if force_ota and ota_version is not None:
-            should_upgrade = ota_version != fedml.__version__
-            upgrade_version = ota_version
-        else:
-            try:
-                fedml_is_latest_version, local_ver, remote_ver = sys_utils.check_fedml_is_latest_version(self.version)
-            except Exception as e:
-                return
-
-            should_upgrade = not fedml_is_latest_version
-            upgrade_version = remote_ver
-
-        if should_upgrade:
-            FedMLClientDataInterface.get_instance().
\ - save_started_job(run_id, self.edge_id, time.time(), - ClientConstants.MSG_MLOPS_CLIENT_STATUS_UPGRADING, - ClientConstants.MSG_MLOPS_CLIENT_STATUS_UPGRADING, - payload) - - logging.info(f"Upgrade to version {upgrade_version} ...") - - sys_utils.do_upgrade(self.version, upgrade_version) - - raise Exception("Restarting after upgraded...") - - def callback_start_deployment(self, topic, payload): - # Get deployment params - request_json = json.loads(payload) - run_id = request_json["end_point_id"] - inference_end_point_id = run_id - - try: - MLOpsConfigs.fetch_all_configs() - except Exception as e: - pass - - # Start log processor for current run - run_id = inference_end_point_id - self.args.run_id = run_id - self.args.edge_id = self.edge_id - MLOpsRuntimeLog(args=self.args).init_logs() - MLOpsRuntimeLogDaemon.get_instance(self.args).set_log_source( - ClientConstants.FEDML_LOG_SOURCE_TYPE_MODEL_END_POINT) - MLOpsRuntimeLogDaemon.get_instance(self.args).start_log_processor(run_id, self.edge_id) - - # self.ota_upgrade(payload, request_json) - - # Start client with multiprocessing mode - request_json["run_id"] = run_id - run_id_str = str(run_id) - self.request_json = request_json - self.running_request_json[run_id_str] = request_json - client_runner = FedMLClientRunner( - self.args, edge_id=self.edge_id, request_json=request_json, agent_config=self.agent_config, run_id=run_id - ) - client_runner.infer_host = self.get_ip_address(request_json) - self.run_process_event_map[run_id_str] = multiprocessing.Event() - self.run_process_event_map[run_id_str].clear() - client_runner.run_process_event = self.run_process_event_map[run_id_str] - self.run_process_completed_event_map[run_id_str] = multiprocessing.Event() - self.run_process_completed_event_map[run_id_str].clear() - client_runner.run_process_completed_event = self.run_process_completed_event_map[run_id_str] - self.model_runner_mapping[run_id_str] = client_runner - - # Replica Handler will be init for every deployment - replica_handler = FedMLDeviceReplicaHandler(self.edge_id, self.request_json) - client_runner.replica_handler = replica_handler - - self.run_id = run_id - self.run_process_map[run_id_str] = Process(target=client_runner.run, args=( - self.run_process_event_map[run_id_str], self.run_process_completed_event_map[run_id_str] - )) - - self.run_process_map[run_id_str].start() - ClientConstants.save_run_process(run_id, self.run_process_map[run_id_str].pid) - ClientConstants.save_runner_infos(self.args.device_id + "." + self.args.os_name, self.edge_id, run_id=run_id) - - def set_runner_stopped_event(self, run_id): - run_id_str = str(run_id) - client_runner = self.model_runner_mapping.get(run_id_str, None) - if client_runner is not None: - if client_runner.run_process_event is not None: - client_runner.run_process_event.set() - self.model_runner_mapping.pop(run_id_str) - - def set_runner_completed_event(self, run_id): - run_id_str = str(run_id) - client_runner = self.model_runner_mapping.get(run_id_str, None) - if client_runner is not None: - if client_runner.run_process_completed_event is not None: - client_runner.run_process_completed_event.set() - self.model_runner_mapping.pop(run_id_str) - - def callback_delete_deployment(self, topic, payload): - logging.info("[Worker] callback_delete_deployment") - - # Parse payload as the model message object. 
- model_msg_object = FedMLModelMsgObject(topic, payload) - - # Delete all replicas on this device - try: - ClientConstants.remove_deployment( - model_msg_object.end_point_name, model_msg_object.model_name, model_msg_object.model_version, - model_msg_object.run_id, model_msg_object.model_id, edge_id=self.edge_id) - except Exception as e: - logging.info(f"Exception when removing deployment {traceback.format_exc()}") - pass - - self.set_runner_stopped_event(model_msg_object.run_id) - - logging.info(f"[endpoint/device][{model_msg_object.run_id}/{self.edge_id}] " - f"Release gpu resource when the worker deployment deleted.") - JobRunnerUtils.get_instance().release_gpu_ids(model_msg_object.run_id, self.edge_id) - - if self.running_request_json.get(str(model_msg_object.run_id)) is not None: - try: - self.running_request_json.pop(str(model_msg_object.run_id)) - except Exception as e: - logging.error(f"Error when removing running_request_json: {traceback.format_exc()}") - pass - - FedMLClientDataInterface.get_instance().delete_job_from_db(model_msg_object.run_id) - FedMLModelDatabase.get_instance().delete_deployment_result_with_device_id( - model_msg_object.run_id, model_msg_object.end_point_name, model_msg_object.model_name, - self.edge_id) - - # Delete FEDML_GLOBAL_ENDPOINT_RUN_ID_MAP_TAG-${run_id} both in redis and local db - ComputeCacheManager.get_instance().gpu_cache.delete_endpoint_run_id_map(str(model_msg_object.run_id)) - - # Delete FEDML_EDGE_ID_MODEL_DEVICE_ID_MAP_TAG-${run_id} both in redis and local db - ComputeCacheManager.get_instance().gpu_cache.delete_edge_model_id_map(str(model_msg_object.run_id)) - - # Delete FEDML_GLOBAL_DEVICE_RUN_GPU_IDS_TAG-${run_id}-${device_id} both in redis and local db - ComputeCacheManager.get_instance().gpu_cache.delete_device_run_gpu_ids(str(self.edge_id), - str(model_msg_object.run_id)) - - # Delete FEDML_GLOBAL_DEVICE_RUN_NUM_GPUS_TAG-${run_id}-${device_id} both in redis and local db - ComputeCacheManager.get_instance().gpu_cache.delete_device_run_num_gpus(str(self.edge_id), - str(model_msg_object.run_id)) - - # Delete FEDML_MODEL_REPLICA_GPU_IDS_TAG-${run_id}-${end_point_name}-${model_name}-${device_id}-* - FedMLModelCache.get_instance().set_redis_params() - FedMLModelCache.get_instance().delete_all_replica_gpu_ids(model_msg_object.run_id, - model_msg_object.end_point_name, - model_msg_object.model_name, self.edge_id) - - def exit_run_with_exception_entry(self): - try: - self.setup_client_mqtt_mgr() - self.exit_run_with_exception() - except Exception as e: - self.release_client_mqtt_mgr() - sys.exit(1) - finally: - self.release_client_mqtt_mgr() - - def exit_run_with_exception(self): - logging.info("Exit run successfully.") - - ClientConstants.cleanup_learning_process(self.run_id) - ClientConstants.cleanup_run_process(self.run_id) - - self.mlops_metrics.report_client_id_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED, - is_from_model=True, run_id=self.run_id) - - time.sleep(1) - - def callback_exit_train_with_exception(self, topic, payload): - request_json = json.loads(payload) - is_retain = request_json.get("is_retain", False) - if is_retain: - return - run_id = request_json.get("runId", None) - if run_id is None: - run_id = request_json.get("run_id", None) - if run_id is None: - run_id = request_json.get("id", None) - - if run_id is None: - return - - # Stop client with multiprocessing mode - self.request_json = request_json - client_runner = FedMLClientRunner( - self.args, edge_id=self.edge_id, request_json=request_json, 
agent_config=self.agent_config, run_id=run_id - ) - try: - Process(target=client_runner.exit_run_with_exception_entry).start() - except Exception as e: - pass - - def cleanup_client_with_status(self): - self.setup_client_mqtt_mgr() - - if self.device_status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED: - self.cleanup_run_when_finished() - elif self.device_status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED: - self.cleanup_run_when_starting_failed() - - self.release_client_mqtt_mgr() - - def callback_runner_id_status(self, topic, payload): - # logging.info("callback_runner_id_status: topic = %s, payload = %s" % (topic, payload)) - - request_json = json.loads(payload) - run_id = request_json["run_id"] - edge_id = request_json["edge_id"] - status = request_json["status"] - - self.save_training_status(edge_id, status) - - if status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED or \ - status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED: - # Stop client with multiprocessing mode - self.request_json = request_json - client_runner = FedMLClientRunner( - self.args, - edge_id=self.edge_id, - request_json=request_json, - agent_config=self.agent_config, - run_id=run_id, - ) - client_runner.device_status = status - status_process = Process(target=client_runner.cleanup_client_with_status) - status_process.start() - status_process.join(15) - - # Stop log processor for current run - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, edge_id) - - def callback_report_current_status(self, topic, payload): - self.send_agent_active_msg() - - @staticmethod - def process_ota_upgrade_msg(): - os.system("pip install -U fedml") - - def callback_client_ota_msg(self, topic, payload): - request_json = json.loads(payload) - cmd = request_json["cmd"] - - if cmd == ClientConstants.FEDML_OTA_CMD_UPGRADE: - FedMLClientRunner.process_ota_upgrade_msg() - # Process(target=FedMLClientRunner.process_ota_upgrade_msg).start() - raise Exception("After upgraded, restart runner...") - elif cmd == ClientConstants.FEDML_OTA_CMD_RESTART: - raise Exception("Restart runner...") - - def save_training_status(self, edge_id, training_status): - self.current_training_status = training_status - ClientConstants.save_training_infos(edge_id, training_status) - - @staticmethod - def get_device_id(): - device_file_path = os.path.join(ClientConstants.get_data_dir(), - ClientConstants.LOCAL_RUNNER_INFO_DIR_NAME) - file_for_device_id = os.path.join(device_file_path, "devices.id") - if not os.path.exists(device_file_path): - os.makedirs(device_file_path) - elif os.path.exists(file_for_device_id): - with open(file_for_device_id, 'r', encoding='utf-8') as f: - device_id_from_file = f.readline() - if device_id_from_file is not None and device_id_from_file != "": - return device_id_from_file - - if platform.system() == "Darwin": - cmd_get_serial_num = "system_profiler SPHardwareDataType | grep Serial | awk '{gsub(/ /,\"\")}{print}' " \ - "|awk -F':' '{print $2}' " - device_id = os.popen(cmd_get_serial_num).read() - device_id = device_id.replace('\n', '').replace(' ', '') - if device_id is None or device_id == "": - device_id = hex(uuid.getnode()) - else: - device_id = "0x" + device_id - else: - if "nt" in os.name: - - def get_uuid(): - guid = "" - try: - cmd = "wmic csproduct get uuid" - guid = str(subprocess.check_output(cmd)) - pos1 = guid.find("\\n") + 2 - guid = guid[pos1:-15] - except Exception as ex: - pass - return str(guid) - - device_id = str(get_uuid()) - logging.info(device_id) - elif "posix" in 
os.name:
-                device_id = sys_utils.get_device_id_in_docker()
-                if device_id is None:
-                    device_id = hex(uuid.getnode())
-            else:
-                device_id = sys_utils.run_subprocess_open(
-                    "hal-get-property --udi /org/freedesktop/Hal/devices/computer --key system.hardware.uuid".split()
-                )
-                device_id = str(device_id)  # str(), not hex(): hex() on a non-integer UUID would raise a TypeError
-
-        if device_id is not None and device_id != "":
-            with open(file_for_device_id, 'w', encoding='utf-8') as f:
-                f.write(device_id)
-        else:
-            device_id = str(uuid.uuid4())  # str(), not hex(): uuid.UUID is not an integer
-            with open(file_for_device_id, 'w', encoding='utf-8') as f:
-                f.write(device_id)
-
-        return device_id
-
-    def get_ip_address(self, request_json):
-        # OPTION 1: Use local ip
-        ip = ClientConstants.get_local_ip()
-
-        # OPTION 2: Auto detect public ip
-        if "parameters" in request_json and \
-                ClientConstants.AUTO_DETECT_PUBLIC_IP in request_json["parameters"] and \
-                request_json["parameters"][ClientConstants.AUTO_DETECT_PUBLIC_IP]:
-            ip = ClientConstants.get_public_ip()
-            logging.info("Auto detect public ip for worker: " + ip)
-
-        # OPTION 3: Use user indicated ip
-        if self.infer_host is not None and self.infer_host != "127.0.0.1" and self.infer_host != "localhost":
-            ip = self.infer_host
-
-        return ip
-
-    def bind_account_and_device_id(self, url, account_id, device_id, os_name, role="md.on_premise_device"):
-        ip = requests.get('https://checkip.amazonaws.com').text.strip()
-        fedml_ver, exec_path, os_ver, cpu_info, python_ver, torch_ver, mpi_installed, \
-            cpu_usage, available_mem, total_mem, gpu_info, gpu_available_mem, gpu_total_mem, \
-            gpu_count, gpu_vendor, cpu_count, gpu_device_name = get_sys_runner_info()
-        host_name = sys_utils.get_host_name()
-        json_params = {
-            "accountid": account_id,
-            "deviceid": device_id,
-            "state": ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE,
-            "status": ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE,
-            "type": os_name,
-            "processor": cpu_info,
-            "core_type": cpu_info,
-            "network": "",
-            "role": role,
-            "os_ver": os_ver,
-            "memory": total_mem,
-            "ip": ip,
-            "extra_infos": {"fedml_ver": fedml_ver, "exec_path": exec_path, "os_ver": os_ver,
-                            "cpu_info": cpu_info, "python_ver": python_ver, "torch_ver": torch_ver,
-                            "mpi_installed": mpi_installed, "cpu_usage": cpu_usage,
-                            "available_mem": available_mem, "total_mem": total_mem,
-                            "cpu_count": cpu_count, "gpu_count": 0, "host_name": host_name}
-        }
-        if gpu_count > 0:
-            if gpu_total_mem is not None:
-                # Parentheses fix the operator precedence: the ternary must bind before concatenation
-                json_params["gpu"] = (gpu_info if gpu_info is not None else "") + ", Total GPU Memory: " + gpu_total_mem
-            else:
-                json_params["gpu"] = gpu_info if gpu_info is not None else ""
-            json_params["extra_infos"]["gpu_info"] = gpu_info if gpu_info is not None else ""
-            if gpu_available_mem is not None:
-                json_params["extra_infos"]["gpu_available_mem"] = gpu_available_mem
-            if gpu_total_mem is not None:
-                json_params["extra_infos"]["gpu_total_mem"] = gpu_total_mem
-
-            json_params["extra_infos"]["gpu_count"] = gpu_count
-            json_params["extra_infos"]["gpu_vendor"] = gpu_vendor
-            json_params["extra_infos"]["gpu_device_name"] = gpu_device_name
-
-            gpu_available_id_list = sys_utils.get_available_gpu_id_list(limit=gpu_count)
-            gpu_available_count = len(gpu_available_id_list) if gpu_available_id_list is not None else 0
-            gpu_list = sys_utils.get_gpu_list()
-            json_params["extra_infos"]["gpu_available_count"] = gpu_available_count
-            json_params["extra_infos"]["gpu_available_id_list"] = gpu_available_id_list
-            json_params["extra_infos"]["gpu_list"] = gpu_list
-        else:
-            json_params["gpu"] = "None"
-            json_params["extra_infos"]["gpu_available_count"] = 0
-
json_params["extra_infos"]["gpu_available_id_list"] = [] - json_params["extra_infos"]["gpu_list"] = [] - - _, cert_path = MLOpsConfigs.get_request_params() - if cert_path is not None: - try: - requests.session().verify = cert_path - response = requests.post( - url, json=json_params, verify=True, - headers={"content-type": "application/json", "Connection": "close"} - ) - except requests.exceptions.SSLError as err: - MLOpsConfigs.install_root_ca_file() - response = requests.post( - url, json=json_params, verify=True, - headers={"content-type": "application/json", "Connection": "close"} - ) - else: - response = requests.post(url, json=json_params, headers={"Connection": "close"}) - edge_id = -1 - user_name = None - extra_url = None - if response.status_code != 200: - print(f"Binding to MLOps with response.status_code = {response.status_code}, " - f"response.content: {response.content}") - pass - else: - # print("url = {}, response = {}".format(url, response)) - status_code = response.json().get("code") - if status_code == "SUCCESS": - edge_id = response.json().get("data").get("id") - user_name = response.json().get("data").get("userName", None) - extra_url = response.json().get("data").get("url", None) - if edge_id is None or edge_id <= 0: - print(f"Binding to MLOps with response.status_code = {response.status_code}, " - f"response.content: {response.content}") - else: - if status_code == SchedulerConstants.BINDING_ACCOUNT_NOT_EXIST_ERROR: - raise SystemExit(SchedulerConstants.BINDING_ACCOUNT_NOT_EXIST_ERROR) - print(f"Binding to MLOps with response.status_code = {response.status_code}, " - f"response.content: {response.content}") - return -1, None, None - return edge_id, user_name, extra_url - - def fetch_configs(self): - return MLOpsConfigs.fetch_all_configs() - - def send_agent_active_msg(self): - active_topic = "flclient_agent/active" - status = MLOpsStatus.get_instance().get_client_agent_status(self.edge_id) - if ( - status is not None - and status != ClientConstants.MSG_MLOPS_CLIENT_STATUS_OFFLINE - and status != ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE - ): - return - - try: - current_job = FedMLClientDataInterface.get_instance().get_job_by_id(self.run_id) - except Exception as e: - current_job = None - if current_job is None: - if status is not None and status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_OFFLINE: - status = ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE - else: - return - else: - status = ClientConstants.get_device_state_from_run_edge_state(current_job.status) - active_msg = {"ID": self.edge_id, "status": status} - MLOpsStatus.get_instance().set_client_agent_status(self.edge_id, status) - self.mqtt_mgr.send_message_json(active_topic, json.dumps(active_msg)) - - def recover_start_deployment_msg_after_upgrading(self): - try: - current_job = FedMLClientDataInterface.get_instance().get_current_job() - if current_job is not None and \ - current_job.status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_UPGRADING: - logging.info("start deployment after upgrading.") - topic_start_deployment = "model_ops/model_device/start_deployment/{}".format(str(self.edge_id)) - self.callback_start_deployment(topic_start_deployment, current_job.running_json) - except Exception as e: - logging.info("recover starting deployment message after upgrading: {}".format(traceback.format_exc())) - - def on_agent_mqtt_connected(self, mqtt_client_object): - # The MQTT message topic format is as follows: // - - # Setup MQTT message listener for starting deployment - topic_start_deployment = 
"model_ops/model_device/start_deployment/{}".format(str(self.edge_id)) - self.mqtt_mgr.add_message_listener(topic_start_deployment, self.callback_start_deployment) - - # Setup MQTT message listener for delete deployment - topic_delete_deployment = "model_ops/model_device/delete_deployment/{}".format(str(self.edge_id)) - self.mqtt_mgr.add_message_listener(topic_delete_deployment, self.callback_delete_deployment) - - # Setup MQTT message listener for running failed - topic_exit_train_with_exception = "flserver_agent/" + str(self.edge_id) + "/exit_train_with_exception" - self.mqtt_mgr.add_message_listener(topic_exit_train_with_exception, self.callback_exit_train_with_exception) - - # Setup MQTT message listener for client status switching - topic_client_status = "fl_client/flclient_agent_" + str(self.edge_id) + "/status" - self.mqtt_mgr.add_message_listener(topic_client_status, self.callback_runner_id_status) - - # Setup MQTT message listener to report current device status. - topic_report_status = "mlops/report_device_status" - self.mqtt_mgr.add_message_listener(topic_report_status, self.callback_report_current_status) - - # Setup MQTT message listener to OTA messages from the MLOps. - topic_ota_msg = "mlops/flclient_agent_" + str(self.edge_id) + "/ota" - self.mqtt_mgr.add_message_listener(topic_ota_msg, self.callback_client_ota_msg) - - if self.mqtt_inference_obj is None: - self.mqtt_inference_obj = FedMLMqttInference(agent_config=self.agent_config, mqtt_mgr=self.mqtt_mgr) - self.mqtt_inference_obj.setup_listener_for_endpoint_inference_request(self.edge_id) - - # Subscribe topics for starting deployment, stopping deployment and fetching client status. - mqtt_client_object.subscribe(topic_start_deployment, qos=2) - mqtt_client_object.subscribe(topic_delete_deployment, qos=2) - mqtt_client_object.subscribe(topic_client_status, qos=2) - mqtt_client_object.subscribe(topic_report_status, qos=2) - mqtt_client_object.subscribe(topic_exit_train_with_exception, qos=2) - mqtt_client_object.subscribe(topic_ota_msg, qos=2) - - self.subscribed_topics.clear() - self.subscribed_topics.append(topic_start_deployment) - self.subscribed_topics.append(topic_delete_deployment) - self.subscribed_topics.append(topic_client_status) - self.subscribed_topics.append(topic_report_status) - self.subscribed_topics.append(topic_exit_train_with_exception) - self.subscribed_topics.append(topic_ota_msg) - - # Broadcast the first active message. 
- self.send_agent_active_msg() - - # Echo results - # print("\n\nCongratulations, your device is connected to the FedML MLOps platform successfully!") - # print( - # "Your FedML Edge ID is " + str(self.edge_id) + ", unique device ID is " - # + str(self.unique_device_id) - # + "\n" - # ) - - MLOpsRuntimeLog.get_instance(self.args).init_logs(log_level=logging.INFO) - - def on_agent_mqtt_disconnected(self, mqtt_client_object): - MLOpsStatus.get_instance().set_client_agent_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_OFFLINE - ) - - try: - if self.mqtt_inference_obj is not None: - self.mqtt_inference_obj.remove_listener_for_endpoint_inference_request(self.edge_id) - except Exception as e: - pass - - def setup_agent_mqtt_connection(self, service_config): - # Setup MQTT connection - self.mqtt_mgr = MqttManager( - service_config["mqtt_config"]["BROKER_HOST"], - service_config["mqtt_config"]["BROKER_PORT"], - service_config["mqtt_config"]["MQTT_USER"], - service_config["mqtt_config"]["MQTT_PWD"], - service_config["mqtt_config"]["MQTT_KEEPALIVE"], - "FedML_ModelClientAgent_Daemon_@" + self.user_name + "@_" + self.args.current_device_id + str(uuid.uuid4()), - "flclient_agent/last_will_msg", - json.dumps({"ID": self.edge_id, "status": ClientConstants.MSG_MLOPS_CLIENT_STATUS_OFFLINE}) - ) - self.agent_config = service_config - - # Init local database - FedMLClientDataInterface.get_instance().create_job_table() - try: - FedMLModelDatabase.get_instance().set_database_base_dir(ClientConstants.get_database_dir()) - FedMLModelDatabase.get_instance().create_table() - except Exception as e: - pass - - client_api_cmd = "fedml.computing.scheduler.model_scheduler.device_client_api:api" - client_api_pids = RunProcessUtils.get_pid_from_cmd_line(client_api_cmd) - if client_api_pids is None or len(client_api_pids) <= 0: - # Start local API services - cur_dir = os.path.dirname(__file__) - fedml_base_dir = os.path.dirname(os.path.dirname(os.path.dirname(cur_dir))) - python_program = get_python_program() - self.local_api_process = ClientConstants.exec_console_with_script( - "{} -m uvicorn {} --host 0.0.0.0 --port {} --reload --reload-delay 3 --reload-dir {} " - "--log-level critical".format( - python_program, client_api_cmd, - ClientConstants.LOCAL_CLIENT_API_PORT, fedml_base_dir - ), - should_capture_stdout=False, - should_capture_stderr=False - ) - # if self.local_api_process is not None and self.local_api_process.pid is not None: - # print(f"Model worker local API process id {self.local_api_process.pid}") - - # MLOpsRuntimeLogDaemon.get_instance(self.args).stop_all_log_processor() - - # Setup MQTT connected listener - self.mqtt_mgr.add_connected_listener(self.on_agent_mqtt_connected) - self.mqtt_mgr.add_disconnected_listener(self.on_agent_mqtt_disconnected) - self.mqtt_mgr.connect() - - self.setup_client_mqtt_mgr() - self.mlops_metrics.report_client_training_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE, is_from_model=True) - MLOpsStatus.get_instance().set_client_agent_status(self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE) - - self.recover_start_deployment_msg_after_upgrading() - - def stop_agent(self): - if self.run_process_event is not None: - self.run_process_event.set() - - if self.mqtt_mgr is not None: - try: - for topic in self.subscribed_topics: - self.mqtt_mgr.unsubscribe_msg(topic) - except Exception as e: - pass - - self.mqtt_mgr.loop_stop() - self.mqtt_mgr.disconnect() - - self.release_client_mqtt_mgr() - - def start_agent_mqtt_loop(self, 
should_exit_sys=False): - # Start MQTT message loop - try: - self.mqtt_mgr.loop_forever() - except Exception as e: - if str(e) == "Restarting after upgraded...": - logging.info("Restarting after upgraded...") - else: - logging.info("Client tracing: {}".format(traceback.format_exc())) - finally: - self.stop_agent() - - if should_exit_sys: - time.sleep(5) - sys.exit(1) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_server_runner_deprecated.py b/python/fedml/computing/scheduler/model_scheduler/device_server_runner_deprecated.py deleted file mode 100755 index 4bcac6d2d..000000000 --- a/python/fedml/computing/scheduler/model_scheduler/device_server_runner_deprecated.py +++ /dev/null @@ -1,2022 +0,0 @@ -import copy -import json -import logging -import multiprocessing -import platform -import sys - -from multiprocessing import Process -import os -import shutil -import subprocess -import threading - -import time -import traceback -import urllib -import uuid -import zipfile -from os import listdir - -import requests -import torch - -import fedml -from fedml.computing.scheduler.comm_utils.run_process_utils import RunProcessUtils -from fedml.core.mlops.mlops_runtime_log import MLOpsFormatter - -from ..comm_utils import sys_utils -from .device_server_data_interface import FedMLServerDataInterface -from ..scheduler_core.endpoint_sync_protocol import FedMLEndpointSyncProtocol -from ....core.mlops.mlops_runtime_log import MLOpsRuntimeLog - -from ....core.distributed.communication.mqtt.mqtt_manager import MqttManager -from ..comm_utils.yaml_utils import load_yaml_config -from .device_client_constants import ClientConstants -from .device_server_constants import ServerConstants - -from ....core.mlops.mlops_metrics import MLOpsMetrics - -from ....core.mlops.mlops_configs import MLOpsConfigs -from ....core.mlops.mlops_runtime_log_daemon import MLOpsRuntimeLogDaemon -from ....core.mlops.mlops_status import MLOpsStatus -from ..comm_utils.sys_utils import get_sys_runner_info, get_python_program -from .device_model_cache import FedMLModelCache -from .device_model_msg_object import FedMLModelMsgObject -from ....core.mlops.mlops_utils import MLOpsUtils -from ..comm_utils.constants import SchedulerConstants -from .device_model_db import FedMLModelDatabase -from .device_replica_controller import FedMLDeviceReplicaController - - -class RunnerError(BaseException): - """ Runner failed. """ - pass - - -class RunnerCompletedError(Exception): - """ Runner completed. 
""" - pass - - -class FedMLServerRunner: - FEDML_CLOUD_SERVER_PREFIX = "fedml-server-run-" - - def __init__(self, args, run_id=0, request_json=None, agent_config=None, edge_id=0): - self.inference_gateway_process = None - self.local_api_process = None - self.run_process_event = None - self.run_process_event_map = dict() - self.run_process_completed_event = None - self.run_process_completed_event_map = dict() - self.run_as_cloud_agent = False - self.run_as_cloud_server = False - self.run_as_edge_server_and_agent = False - self.run_as_cloud_server_and_agent = False - self.fedml_packages_base_dir = None - self.fedml_packages_unzip_dir = None - self.mqtt_mgr = None - self.running_request_json = dict() - self.run_id = run_id - self.client_mqtt_mgr = None - self.client_mqtt_is_connected = False - self.client_mqtt_lock = None - self.unique_device_id = None - self.edge_id = edge_id - self.server_agent_id = 0 - if request_json is not None: - self.server_agent_id = request_json.get("server_id", 0) - self.process = None - self.args = args - self.request_json = copy.deepcopy(request_json) - self.version = args.version - self.device_id = args.device_id - self.cur_dir = os.path.split(os.path.realpath(__file__))[0] - if args.current_running_dir is not None: - self.cur_dir = args.current_running_dir - - self.agent_config = agent_config - self.fedml_data_base_package_dir = os.path.join("/", "fedml", "data") - self.fedml_data_local_package_dir = os.path.join("/", "fedml", "fedml-package", "fedml", "data") - self.fedml_data_dir = self.fedml_data_base_package_dir - self.fedml_config_dir = os.path.join("/", "fedml", "conf") - - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES = {} - - self.mlops_metrics = None - self.run_status = None - self.infer_host = "127.0.0.1" - self.redis_addr = "local" - self.redis_port = "6379" - self.redis_password = "fedml_default" - - self.slave_deployment_statuses_mapping = dict() - self.slave_deployment_results_mapping = dict() - self.slave_update_result_mapping = dict() - - self.model_runner_mapping = dict() - self.ntp_offset = MLOpsUtils.get_ntp_offset() - - self.subscribed_topics = list() - self.user_name = None - - self.replica_controller = None - self.deployed_replica_payload = None - - self.autoscaler_launcher = None - - def build_dynamic_constrain_variables(self, run_id, run_config): - pass - - def unzip_file(self, zip_file, unzip_file_path): - unziped_file_name = "" - if zipfile.is_zipfile(zip_file): - with zipfile.ZipFile(zip_file, "r") as zipf: - zipf.extractall(unzip_file_path) - unziped_file_name = zipf.namelist()[0] - - return unziped_file_name - - def package_download_progress(self, count, blksize, filesize): - self.check_runner_stop_event() - - downloaded = count * blksize - downloaded = filesize if downloaded > filesize else downloaded - progress = (downloaded / filesize * 100) if filesize != 0 else 0 - progress_int = int(progress) - downloaded_kb = format(downloaded / 1024, '.2f') - - # since this hook function is stateless, we need a state to avoid printing progress repeatedly - if count == 0: - self.prev_download_progress = 0 - if progress_int != self.prev_download_progress and progress_int % 5 == 0: - self.prev_download_progress = progress_int - logging.info("package downloaded size {} KB, progress {}%".format(downloaded_kb, progress_int)) - - def retrieve_and_unzip_package(self, package_name, package_url): - local_package_path = ServerConstants.get_model_package_dir() - if not os.path.exists(local_package_path): - os.makedirs(local_package_path, exist_ok=True) - 
local_package_file = "{}.zip".format(os.path.join(local_package_path, package_name)) - if os.path.exists(local_package_file): - os.remove(local_package_file) - - # Download without renaming - urllib.request.urlretrieve(package_url, filename=None, reporthook=self.package_download_progress) - - unzip_package_path = ServerConstants.get_model_dir() - self.fedml_packages_base_dir = unzip_package_path - try: - shutil.rmtree( - os.path.join(unzip_package_path, package_name), ignore_errors=True - ) - except Exception as e: - pass - logging.info("local_package_file {}, unzip_package_path {}".format( - local_package_file, unzip_package_path)) - package_name = self.unzip_file(local_package_file, unzip_package_path) - unzip_package_path = os.path.join(unzip_package_path, package_name) - return unzip_package_path - - def update_local_fedml_config(self, run_id, run_config): - model_config = run_config - model_name = model_config["model_name"] - model_storage_url = model_config["model_storage_url"] - scale_min = model_config.get("instance_scale_min", 0) - scale_max = model_config.get("instance_scale_max", 0) - inference_engine = model_config.get("inference_engine", 0) - inference_end_point_id = run_id - - # Copy config file from the client - unzip_package_path = self.retrieve_and_unzip_package( - model_name, model_storage_url - ) - fedml_local_config_file = os.path.join(unzip_package_path, "fedml_model_config.yaml") - - # Load the above config to memory - package_conf_object = {} - if os.path.exists(fedml_local_config_file): - package_conf_object = load_yaml_config(fedml_local_config_file) - - return unzip_package_path, package_conf_object - - def get_usr_indicated_token(self, request_json) -> str: - usr_indicated_token = "" - if "parameters" in request_json and "authentication_token" in request_json["parameters"]: - usr_indicated_token = request_json["parameters"]["authentication_token"] - return usr_indicated_token - - def build_dynamic_args(self, run_config, package_conf_object, base_dir): - pass - - def run(self, process_event, completed_event): - # print(f"Model master runner process id {os.getpid()}, run id {self.run_id}") - - if platform.system() != "Windows": - os.setsid() - - os.environ['PYTHONWARNINGS'] = 'ignore:semaphore_tracker:UserWarning' - os.environ.setdefault('PYTHONWARNINGS', 'ignore:semaphore_tracker:UserWarning') - - self.run_process_event = process_event - self.run_process_completed_event = completed_event - run_id = self.request_json.get("end_point_id") - - try: - MLOpsUtils.set_ntp_offset(self.ntp_offset) - - self.setup_client_mqtt_mgr() - - self.run_impl() - except RunnerError: - logging.info("Runner stopped.") - self.mlops_metrics.report_server_training_status( - self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED, - is_from_model=True, edge_id=self.edge_id) - except RunnerCompletedError: - logging.info("Runner completed.") - except Exception as e: - logging.error("Runner exits with exceptions.") - logging.error(traceback.format_exc()) - logging.error(e) - self.mlops_metrics.report_server_training_status( - self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, - is_from_model=True, edge_id=self.edge_id) - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id) - if self.mlops_metrics is not None: - self.mlops_metrics.stop_sys_perf() - time.sleep(3) - sys.exit(1) - finally: - logging.info("[Master] Deployment finished, release resources.") - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id) - if 
self.mlops_metrics is not None: - self.mlops_metrics.stop_sys_perf() - time.sleep(3) - if not self.run_as_cloud_server: - self.release_client_mqtt_mgr() - - def parse_model_run_params(self, running_json): - run_id = running_json["end_point_id"] - end_point_name = running_json["end_point_name"] - token = running_json["token"] - user_id = running_json["user_id"] - user_name = running_json["user_name"] - device_ids = running_json["device_ids"] - device_objs = running_json["device_objs"] - - model_config = running_json["model_config"] - model_name = model_config["model_name"] - model_id = model_config["model_id"] - model_storage_url = model_config["model_storage_url"] - scale_min = model_config.get("instance_scale_min", 0) - scale_max = model_config.get("instance_scale_max", 0) - inference_engine = model_config.get("inference_engine", 0) - model_is_from_open = model_config["is_from_open"] - inference_end_point_id = run_id - use_gpu = "gpu" # TODO: Get GPU from device infos - memory_size = "256m" # TODO: Get Memory size for each instance - model_version = model_config["model_version"] - model_config_parameters = running_json.get("parameters", {}) - - inference_port = model_config_parameters.get("server_internal_port", # Internal port is for the gateway - ServerConstants.MODEL_INFERENCE_DEFAULT_PORT) - inference_port_external = model_config_parameters.get("server_external_port", inference_port) - - return run_id, end_point_name, token, user_id, user_name, device_ids, device_objs, model_config, model_name, \ - model_id, model_storage_url, scale_min, scale_max, inference_engine, model_is_from_open, \ - inference_end_point_id, use_gpu, memory_size, model_version, inference_port - - def inference_run(self): - # run_id, end_point_name, token, user_id, user_name, device_ids, device_objs, model_config, model_name, \ - # model_id, model_storage_url, scale_min, scale_max, inference_engine, model_is_from_open, \ - # inference_end_point_id, use_gpu, memory_size, model_version, inference_port = - # self.parse_model_run_params(self.request_json) - # - # inference_server = FedMLModelServingServer(self.args, - # end_point_name, - # model_name, - # model_version, - # inference_request=self.request_json) - # inference_server.run() - pass - - def run_impl(self): - run_id, end_point_name, token, user_id, user_name, device_ids, device_objs, model_config, model_name, \ - model_id, model_storage_url, scale_min, scale_max, inference_engine, model_is_from_open, \ - inference_end_point_id, use_gpu, memory_size, model_version, inference_port = self.parse_model_run_params( - self.request_json) - - # TODO(Raphael): This measurement is for the host machine. 
Change to container's metrics - self.mlops_metrics.report_sys_perf(self.args, self.agent_config["mqtt_config"], run_id=run_id) - - self.check_runner_stop_event() - - # Send stage: MODEL_DEPLOYMENT_STAGE4 = "ForwardRequest2Slave" - self.send_deployment_stages(self.run_id, model_name, model_id, - "", - ServerConstants.MODEL_DEPLOYMENT_STAGE4["index"], - ServerConstants.MODEL_DEPLOYMENT_STAGE4["text"], - ServerConstants.MODEL_DEPLOYMENT_STAGE4["text"]) - - self.args.run_id = self.run_id - MLOpsRuntimeLog.get_instance(self.args).init_logs(log_level=logging.INFO) - - # Report server running status - self.check_runner_stop_event() - self.mlops_metrics.report_server_training_status( - run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_STARTING, - is_from_model=True, running_json=json.dumps(self.request_json), edge_id=self.edge_id) - self.send_deployment_status(self.run_id, end_point_name, - model_name, "", - ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYING) - - # Start unified inference gateway if it has not started - self.start_device_inference_gateway( - run_id, end_point_name, model_id, model_name, model_version, inference_port=inference_port) - - # (re)Start inference monitor server - self.stop_device_inference_monitor(run_id, end_point_name, model_id, model_name, model_version) - self.start_device_inference_monitor(run_id, end_point_name, model_id, model_name, model_version) - - # Changed the master's status to "IDLE" - self.mlops_metrics.broadcast_server_training_status( - run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED, - is_from_model=True, edge_id=self.edge_id) - - # Forward deployment request to slave devices - self.check_runner_stop_event() - - # Handle "op:add" && "op:remove" - devices_sent_add_or_remove_msg = self.send_deployment_start_request_to_edges() - - # Handle "op:update" - try: - devices_sent_update_remove_msg = self.send_first_scroll_update_msg() - - if len(devices_sent_add_or_remove_msg) == 0 and len(devices_sent_update_remove_msg) == 0: - # No device is added, updated or removed - logging.info("No device is added, updated or removed. No action needed for reconciliation.") - ip = self.get_ip_address(self.request_json) - master_port = os.getenv("FEDML_MASTER_PORT", None) - if master_port is not None: - inference_port = int(master_port) - model_inference_port = inference_port - if ip.startswith("http://") or ip.startswith("https://"): - model_inference_url = "{}/api/v1/predict".format(ip) - else: - model_inference_url = "http://{}:{}/api/v1/predict".format(ip, model_inference_port) - - self.set_runner_completed_event(run_id) - - self.send_deployment_status(run_id, end_point_name, - model_name, - model_inference_url, - ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED) - - # Set setting to "DEPLOYED" for autoscaling service reference - FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) - FedMLModelCache.get_instance(self.redis_addr, self.redis_port). 
\ - update_user_setting_replica_num(end_point_id=run_id, state="DEPLOYED") - - return - except Exception as e: - logging.error(f"Failed to send first scroll update message due to {e}.") - logging.error(f"Exception traceback {traceback.format_exc()}.") - - logging.info("Start waiting for result callback from workers ...") - - while True: - # Wait for all devices to finish the add / delete / update operation - self.check_runner_stop_event() - time.sleep(3) - - def check_runner_stop_event(self): - if self.run_process_event is not None and self.run_process_event.is_set(): - logging.info("Received stopping event.") - raise RunnerError("Runner stopped") - - if self.run_process_completed_event is not None and self.run_process_completed_event.is_set(): - logging.info("Received completed event.") - raise RunnerCompletedError("Runner completed") - - def start_device_inference_gateway( - self, run_id, end_point_name, model_id, - model_name, model_version, inference_port=ServerConstants.MODEL_INFERENCE_DEFAULT_PORT): - # start unified inference server - running_model_name = ServerConstants.get_running_model_name(end_point_name, - model_name, model_version, run_id, model_id) - python_program = get_python_program() - master_port = os.getenv("FEDML_MASTER_PORT", None) - if master_port is not None: - inference_port = int(master_port) - if not ServerConstants.is_running_on_k8s(): - logging.info(f"start the model inference gateway, end point {run_id}, " - f"model name {model_name} at port {inference_port}...") - self.check_runner_stop_event() - - use_mqtt_inference = os.getenv("FEDML_USE_MQTT_INFERENCE", "False") - use_mqtt_inference = True if use_mqtt_inference.lower() == 'true' else False - use_worker_gateway = os.getenv("FEDML_USE_WORKER_GATEWAY", "False") - use_worker_gateway = True if use_worker_gateway.lower() == 'true' else False - inference_gw_cmd = "fedml.computing.scheduler.model_scheduler.device_model_inference:api" - inference_gateway_pids = RunProcessUtils.get_pid_from_cmd_line(inference_gw_cmd) - if inference_gateway_pids is None or len(inference_gateway_pids) <= 0: - cur_dir = os.path.dirname(__file__) - fedml_base_dir = os.path.dirname(os.path.dirname(os.path.dirname(cur_dir))) - connect_str = "@FEDML@" - ext_info = sys_utils.random1( - self.agent_config["mqtt_config"]["BROKER_HOST"] + connect_str + - str(self.agent_config["mqtt_config"]["BROKER_PORT"]) + connect_str + - self.agent_config["mqtt_config"]["MQTT_USER"] + connect_str + - self.agent_config["mqtt_config"]["MQTT_PWD"] + connect_str + - str(self.agent_config["mqtt_config"]["MQTT_KEEPALIVE"]), "FEDML@9999GREAT") - self.inference_gateway_process = ServerConstants.exec_console_with_script( - "REDIS_ADDR=\"{}\" REDIS_PORT=\"{}\" REDIS_PASSWORD=\"{}\" " - "END_POINT_NAME=\"{}\" " - "MODEL_NAME=\"{}\" MODEL_VERSION=\"{}\" MODEL_INFER_URL=\"{}\" VERSION=\"{}\" " - "USE_MQTT_INFERENCE={} USE_WORKER_GATEWAY={} EXT_INFO={} " - "{} -m uvicorn {} --host 0.0.0.0 --port {} --reload --reload-delay 3 --reload-dir {} " - "--log-level critical".format( - self.redis_addr, self.redis_port, self.redis_password, - end_point_name, - model_name, model_version, "", self.args.version, - use_mqtt_inference, use_worker_gateway, ext_info, - python_program, inference_gw_cmd, str(inference_port), fedml_base_dir - ), - should_capture_stdout=False, - should_capture_stderr=False - ) - - def start_device_inference_monitor(self, run_id, end_point_name, - model_id, model_name, model_version, check_stopped_event=True): - # start inference monitor server - # Will report 
the qps related metrics to the MLOps - logging.info(f"start the model inference monitor, end point {run_id}, model name {model_name}...") - if check_stopped_event: - self.check_runner_stop_event() - run_id_str = str(run_id) - pip_source_dir = os.path.dirname(__file__) - monitor_file = os.path.join(pip_source_dir, "device_model_monitor.py") - python_program = get_python_program() - running_model_name = ServerConstants.get_running_model_name(end_point_name, - model_name, model_version, run_id, model_id) - self.monitor_process = ServerConstants.exec_console_with_shell_script_list( - [ - python_program, - monitor_file, - "-v", - self.args.version, - "-ep", - run_id_str, - "-epn", - str(end_point_name), - "-mi", - str(model_id), - "-mn", - model_name, - "-mv", - model_version, - "-iu", - "infer_url", - "-ra", - self.redis_addr, - "-rp", - self.redis_port, - "-rpw", - self.redis_password - ], - should_capture_stdout=False, - should_capture_stderr=False - ) - - def stop_device_inference_monitor(self, run_id, end_point_name, model_id, model_name, model_version): - # stop inference monitor server - logging.info(f"stop the model inference monitor, end point {run_id}, model name {model_name}...") - sys_utils.cleanup_model_monitor_processes(run_id, end_point_name, - model_id, model_name, model_version) - - def cleanup_run_when_finished(self): - logging.info("Cleanup run successfully when finished.") - - self.mlops_metrics.broadcast_server_training_status( - self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED, - is_from_model=True, edge_id=self.edge_id - ) - - try: - self.mlops_metrics.stop_sys_perf() - except Exception as ex: - pass - - time.sleep(1) - - try: - local_package_path = ServerConstants.get_package_download_dir() - for package_file in listdir(local_package_path): - if os.path.basename(package_file).startswith("run_" + str(self.run_id)): - shutil.rmtree(os.path.join(local_package_path, package_file), ignore_errors=True) - except Exception as e: - pass - - def cleanup_run_when_starting_failed(self): - logging.info("Cleanup run successfully when starting failed.") - - self.mlops_metrics.broadcast_server_training_status( - self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, - is_from_model=True, edge_id=self.edge_id) - - try: - self.mlops_metrics.stop_sys_perf() - except Exception as ex: - pass - - time.sleep(1) - - try: - local_package_path = ServerConstants.get_package_download_dir() - for package_file in listdir(local_package_path): - if os.path.basename(package_file).startswith("run_" + str(self.run_id)): - shutil.rmtree(os.path.join(local_package_path, package_file), ignore_errors=True) - except Exception as e: - pass - - def cleanup_run_when_deploy_failed(self): - topic = f"model_ops/model_device/delete_deployment/{self.edge_id}" - self.callback_delete_deployment(topic, payload=json.dumps(self.request_json)) - - def callback_deployment_result_message(self, topic=None, payload=None): - """ - This method is called when a deployment result is received from a worker device. 
- """ - # Save deployment result to local cache - topic_splits = str(topic).split('/') - device_id = topic_splits[-1] - payload_json = json.loads(payload) - end_point_id = payload_json["end_point_id"] - end_point_name = payload_json["end_point_name"] - model_id = payload_json["model_id"] - model_name = payload_json["model_name"] - model_version = payload_json["model_version"] - model_status = payload_json["model_status"] - replica_no = payload_json.get("replica_no", None) # "no" Idx start from 1 - run_id_str = str(end_point_id) - - # HotFix(Raphael): logging service cross talk - # Change the handler since each handler need to write to different log files - try: - # Remove the existing file handler - root_logger = logging.getLogger() - for handler in root_logger.handlers: - if isinstance(handler, logging.FileHandler): - root_logger.removeHandler(handler) - - # Correct log path: ~/.fedml/fedml-model-server/fedml/logs/fedml-run-$rid-edge-$eid.log - log_file = os.path.join(ServerConstants.get_log_file_dir(), - f"fedml-run-{run_id_str}-edge-{self.edge_id}.log") - - filehandler = logging.FileHandler(log_file, "a") - - program_prefix = "FedML-Server @device-id-{}".format(self.edge_id) - formatter = MLOpsFormatter(fmt="[" + program_prefix + "] [%(asctime)s] [%(levelname)s] " - "[%(filename)s:%(lineno)d:%(funcName)s] %(" - "message)s") - - filehandler.setFormatter(formatter) - root_logger.addHandler(filehandler) - except Exception as e: - logging.warning(f"Failed to change the logging handler due to {e}.") - - assert run_id_str in self.model_runner_mapping, (f"Run id {run_id_str} is not in the model runner mapping." - f"Current mapping {self.model_runner_mapping}.") - - logging.info("========== callback_deployment_result_message ==========\n") - # Identify the operation for this run (add, remove, update) - if run_id_str not in self.running_request_json: - logging.error(f"Run id {run_id_str} is not in the running request json.") - return - - # The rolling update and scale out / in operation should not happen at the same time - assert not ("replica_num_diff" in self.running_request_json[run_id_str] and - len(self.running_request_json[run_id_str]["replica_num_diff"]) > 0 and - "replica_version_diff" in self.running_request_json[run_id_str]) - - if "replica_version_diff" in self.running_request_json[run_id_str]: - run_operation = "UPDATE" - elif "replica_num_diff" in self.running_request_json[run_id_str] and \ - len(self.running_request_json[run_id_str]["replica_num_diff"]) > 0: - run_operation = "ADD_OR_REMOVE" - else: - logging.error(f"Unsupported operation for run id {run_id_str}. 
and request json " - f"{self.running_request_json[run_id_str]}") - return - - logging.info(f"End point {end_point_id}; Device {device_id}; replica {replica_no}; " - f"run_operation {run_operation} model status {model_status}.") - - # OPTIONAL DEBUG PARAMS - # this_run_controller = self.model_runner_mapping[run_id_str].replica_controller - # logging.info(f"The current replica controller state is " - # f"Total version diff num {this_run_controller.total_replica_version_diff_num}") - # logging.info(f"self.request_json now {self.request_json}") # request_json will be deprecated - # this_run_request_json = self.running_request_json.get(run_id_str, None) - # logging.info(f"self.running_request_json now {this_run_request_json}") - - # Set redis + sqlite deployment result - FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) - - # Deal with different model status - if model_status == ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DELETED: - # remove - FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ - delete_deployment_result_with_device_id_and_replica_no( - end_point_id, end_point_name, model_name, device_id, replica_no) - elif model_status == ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED: - # add or update or update-failed-rollback - FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ - set_deployment_result(end_point_id, end_point_name, - model_name, model_version, - device_id, payload, replica_no) - - # Note: To display the result in the UI, we need to save successful deployment result to the database - self.model_runner_mapping[run_id_str].deployed_replica_payload = copy.deepcopy(payload_json) - else: - if model_status != ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED: - logging.error(f"Unsupported model status {model_status}.") - - # Avoid endless loop, if the rollback also failed, we should report the failure to the MLOps - if self.model_runner_mapping[run_id_str].replica_controller.under_rollback: - self.send_deployment_status( - end_point_id, end_point_name, payload_json["model_name"], "", - ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED) - return - - # Failure handler, send the rollback message to the worker devices only if it has not been rollback - if run_operation == "ADD_OR_REMOVE": - # During Scale out / in, - # the worker that already been scaled out / in should be sent the rollback message - rollback_dict = self.model_runner_mapping[run_id_str].replica_controller.rollback_add_or_remove_replica( - device_id=device_id, replica_no=replica_no, op_type=run_operation - ) - self.model_runner_mapping[run_id_str].replica_controller.under_rollback = True - - if rollback_dict is not None and len(rollback_dict) > 0: - self.send_deployment_status( - end_point_id, end_point_name, payload_json["model_name"], "", - ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_ABORTING) - self.send_rollback_add_remove_op(run_id_str, rollback_dict) - return - else: - # This is the last worker that failed, so we should continue to "ABORTED" status - model_config_parameters = self.running_request_json[run_id_str]["parameters"] - inference_port = model_config_parameters.get("server_internal_port", - ServerConstants.MODEL_INFERENCE_DEFAULT_PORT) - inference_port_external = model_config_parameters.get("server_external_port", inference_port) - ip = self.get_ip_address(self.running_request_json[run_id_str]) - if ip.startswith("http://") or ip.startswith("https://"): - model_inference_url = 
"{}/inference/{}".format(ip, end_point_id) - else: - model_inference_url = "http://{}:{}/inference/{}".format(ip, inference_port_external, - end_point_id) - - self.send_deployment_status(end_point_id, end_point_name, - payload_json["model_name"], - model_inference_url, - ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_ABORTED) - - # For auto-scaling, should update the state to "DEPLOYED" - FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ - update_user_setting_replica_num(end_point_id=end_point_id, state="DEPLOYED") - - self.model_runner_mapping[run_id_str].replica_controller.under_rollback = False - - return - elif run_operation == "UPDATE": - # Overwrite the json with the rollback version diff - rollback_version_diff = \ - self.model_runner_mapping[run_id_str].replica_controller.rollback_get_replica_version_diff( - device_id_trigger=device_id, replica_no_trigger=replica_no) - - # Change the target version to the start version - self.model_runner_mapping[run_id_str].replica_controller.rollback_setback_target_replica_version() - - self.running_request_json[run_id_str]["replica_version_diff"] = copy.deepcopy(rollback_version_diff) - - # Send the rollback message to the worker devices - self.send_rollback_msg(run_id_str) - - # Set the deployment status to ABORTING - self.send_deployment_status( - end_point_id, end_point_name, payload_json["model_name"], "", - ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_ABORTING) - - # TODO(Raphael): Check if resource left not cleaned up - return - else: - logging.error(f"Unsupported operation {run_operation}.") - return - - # Move to the next state (rolling update, finish the deployment, etc.) - # Notify the replica number controller - (self.model_runner_mapping[run_id_str]. - replica_controller.callback_update_curr_replica_num_state(device_id, replica_no, model_status)) - - # Notify the replica version controller, which might trigger the next rolling update - self.send_next_scroll_update_msg(run_id_str, device_id, replica_no) - - # Update the global deployment result mapping - if run_id_str not in self.slave_deployment_results_mapping: - self.slave_deployment_results_mapping[run_id_str] = dict() - if str(device_id) not in self.slave_deployment_results_mapping[run_id_str]: - self.slave_deployment_results_mapping[run_id_str][str(device_id)] = dict() - self.slave_deployment_results_mapping[run_id_str][str(device_id)][str(replica_no)] = model_status - - logging.info("callback_deployment_result_message: topic {}, payload {}, result mapping {}.".format( - topic, payload, self.slave_deployment_results_mapping[run_id_str])) - - request_json = self.running_request_json.get(run_id_str, None) - if request_json is None: - logging.error(f"The endpoint {end_point_id} is no longer running.") - self.send_deployment_status( - end_point_id, end_point_name, payload_json["model_name"], "", - ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED) - return - - # Wait for all replica-level's result, not device-level - if (self.model_runner_mapping[run_id_str].replica_controller.is_all_replica_num_reconciled() and - self.model_runner_mapping[run_id_str].replica_controller.is_all_replica_version_reconciled()): - """ - When all the devices have finished the add / delete / update operation - """ - # Generate one unified inference api - # Note that here we use the gateway port instead of the inference port that is used by the slave device - model_config_parameters = request_json["parameters"] - inference_port = model_config_parameters.get("server_internal_port", - 
ServerConstants.MODEL_INFERENCE_DEFAULT_PORT) - inference_port_external = model_config_parameters.get("server_external_port", inference_port) - ip = self.get_ip_address(request_json) - - if ip.startswith("http://") or ip.startswith("https://"): - model_inference_url = "{}/inference/{}".format(ip, end_point_id) - else: - model_inference_url = "http://{}:{}/inference/{}".format(ip, inference_port_external, end_point_id) - - # Send stage: MODEL_DEPLOYMENT_STAGE5 = "StartInferenceIngress" - self.send_deployment_stages(end_point_id, model_name, model_id, - model_inference_url, - ServerConstants.MODEL_DEPLOYMENT_STAGE5["index"], - ServerConstants.MODEL_DEPLOYMENT_STAGE5["text"], - "inference url: {}".format(model_inference_url)) - - # Send the result to MLOps - if self.model_runner_mapping[run_id_str].deployed_replica_payload is not None: - payload_json = self.model_runner_mapping[run_id_str].deployed_replica_payload - model_slave_url = payload_json["model_url"] - payload_json["model_url"] = model_inference_url - payload_json["port"] = inference_port_external - token = FedMLModelCache.get_instance(self.redis_addr, self.redis_port).get_end_point_token( - end_point_id, end_point_name, model_name) - - model_metadata = payload_json["model_metadata"] - model_inputs = model_metadata["inputs"] - ret_inputs = list() - if "type" in model_metadata and model_metadata["type"] == "default": - payload_json["input_json"] = {"end_point_name": end_point_name, - "model_name": model_name, - "token": str(token), - "inputs": model_inputs, - "outputs": []} - payload_json["output_json"] = model_metadata["outputs"] - else: - raise Exception(f"Unsupported model metadata type {model_metadata['type']}") - - self.send_deployment_results_with_payload( - end_point_id, end_point_name, payload_json, - self.model_runner_mapping[run_id_str].replica_controller.target_replica_ids) - - payload_json_saved = payload_json - payload_json_saved["model_slave_url"] = model_slave_url - FedMLServerDataInterface.get_instance().save_job_result(end_point_id, self.edge_id, - json.dumps(payload_json_saved)) - else: - # Arrive here because only contains remove ops, so we do not need to update the model metadata - pass - - # For auto-scaling, should update the state to "DEPLOYED" - FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ - update_user_setting_replica_num(end_point_id=end_point_id, state="DEPLOYED") - - if self.model_runner_mapping[run_id_str].replica_controller.under_rollback: - # If first time failed (Still might need rollback), then send failed message to the MLOps - if not (FedMLModelCache.get_instance(self.redis_addr, self.redis_port). - get_end_point_activation(end_point_id)): - self.send_deployment_status( - end_point_id, end_point_name, payload_json["model_name"], "", - ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED) - else: - self.send_deployment_status(end_point_id, end_point_name, - payload_json["model_name"], - model_inference_url, - ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_ABORTED) - self.model_runner_mapping[run_id_str].replica_controller.under_rollback = False - else: - # Set the end point activation status to True, for scaling out / in and rolling update - FedMLModelCache.get_instance(self.redis_addr, self.redis_port). 
\ - set_end_point_activation(end_point_id, end_point_name, True) - - self.send_deployment_status(end_point_id, end_point_name, - payload_json["model_name"], - model_inference_url, - ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED) - - self.slave_deployment_results_mapping[run_id_str] = dict() - - time.sleep(3) - self.set_runner_completed_event(end_point_id) - - def callback_deployment_status_message(self, topic=None, payload=None): - # [Deprecated] Merge the logic into callback_deployment_result_message - logging.info("[Deprecated] callback_deployment_status_message: topic {}, payload {}.".format( - topic, payload)) - pass - - def send_deployment_start_request_to_edges(self, in_request_json=None): - if in_request_json is not None: - self.request_json = in_request_json - - # Iterate through replica_num_diff, both add and replace should be sent to the edge devices - if "replica_num_diff" not in self.request_json or self.request_json["replica_num_diff"] is None: - return [] - - edge_id_list = [] - for device_id in self.request_json["replica_num_diff"].keys(): - edge_id_list.append(device_id) - - self.request_json["master_node_ip"] = self.get_ip_address(self.request_json) - should_added_devices = [] - for edge_id in edge_id_list: - if edge_id == self.edge_id: - continue - should_added_devices.append(edge_id) - # send start deployment request to each device - self.send_deployment_start_request_to_edge(edge_id, self.request_json) - return should_added_devices - - def send_deployment_start_request_to_edge(self, edge_id, res_json): - topic_start_deployment = "model_ops/model_device/start_deployment/{}".format(str(edge_id)) - logging.info("start_deployment: send topic " + topic_start_deployment + f" to client {edge_id}...") - self.client_mqtt_mgr.send_message_json(topic_start_deployment, json.dumps(res_json)) - - def get_ip_address(self, request_json): - # OPTION 1: Use local ip - ip = ServerConstants.get_local_ip() - - # OPTION 2: Auto detect public ip - if "parameters" in request_json and \ - ServerConstants.AUTO_DETECT_PUBLIC_IP in request_json["parameters"] and \ - request_json["parameters"][ServerConstants.AUTO_DETECT_PUBLIC_IP]: - ip = ServerConstants.get_public_ip() - - # OPTION 3: Use user indicated ip - if self.infer_host is not None and self.infer_host != "127.0.0.1" and self.infer_host != "localhost": - ip = self.infer_host - - return ip - - def send_deployment_delete_request_to_edges(self, payload, model_msg_object): - edge_id_list_to_delete = model_msg_object.device_ids - - # Remove the model master node id from the list using index 0 - edge_id_list_to_delete = edge_id_list_to_delete[1:] - - logging.info("Device ids to be deleted: " + str(edge_id_list_to_delete)) - - for edge_id in edge_id_list_to_delete: - if edge_id == self.edge_id: - continue - # send delete deployment request to each model device - topic_delete_deployment = "model_ops/model_device/delete_deployment/{}".format(str(edge_id)) - logging.info("delete_deployment: send topic " + topic_delete_deployment + " to client...") - self.client_mqtt_mgr.send_message_json(topic_delete_deployment, payload) - - def ota_upgrade(self, payload, request_json): - run_id = request_json["end_point_id"] - force_ota = False - ota_version = None - - try: - parameters = request_json.get("parameters", None) - common_args = parameters.get("common_args", None) - force_ota = common_args.get("force_ota", False) - ota_version = common_args.get("ota_version", None) - except Exception as e: - pass - - if force_ota and ota_version is not None: - 
should_upgrade = True if ota_version != fedml.__version__ else False - upgrade_version = ota_version - else: - try: - fedml_is_latest_version, local_ver, remote_ver = sys_utils.check_fedml_is_latest_version(self.version) - except Exception as e: - return - - should_upgrade = False if fedml_is_latest_version else True - upgrade_version = remote_ver - - if should_upgrade: - job_obj = FedMLServerDataInterface.get_instance().get_job_by_id(run_id) - if job_obj is None: - FedMLServerDataInterface.get_instance(). \ - save_started_job(run_id, self.edge_id, time.time(), - ServerConstants.MSG_MLOPS_SERVER_STATUS_UPGRADING, - ServerConstants.MSG_MLOPS_SERVER_STATUS_UPGRADING, - payload) - - logging.info(f"Upgrade to version {upgrade_version} ...") - - sys_utils.do_upgrade(self.version, upgrade_version) - - raise Exception("Restarting after upgraded...") - - def callback_start_deployment(self, topic, payload): - try: - MLOpsConfigs.fetch_all_configs() - except Exception as e: - pass - - # Get deployment params - request_json = json.loads(payload) - run_id = request_json["end_point_id"] - end_point_name = request_json["end_point_name"] - token = request_json["token"] - user_id = request_json["user_id"] - user_name = request_json["user_name"] - device_ids = request_json["device_ids"] - device_objs = request_json["device_objs"] - - model_config = request_json["model_config"] - model_name = model_config["model_name"] - model_version = model_config["model_version"] - model_id = model_config["model_id"] - model_storage_url = model_config["model_storage_url"] - scale_min = model_config.get("instance_scale_min", 0) - scale_max = model_config.get("instance_scale_max", 0) - inference_engine = model_config.get("inference_engine", 0) - enable_auto_scaling = request_json.get("enable_auto_scaling", False) - desired_replica_num = request_json.get("desired_replica_num", 1) - - target_queries_per_replica = request_json.get("target_queries_per_replica", 10) - aggregation_window_size_seconds = request_json.get("aggregation_window_size_seconds", 60) - scale_down_delay_seconds = request_json.get("scale_down_delay_seconds", 120) - - inference_end_point_id = run_id - - logging.info("[Master] received start deployment request for end point {}.".format(run_id)) - - # Set redis config - FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) - - # Save the user setting (about replica number) of this run to Redis, if existed, update it - FedMLModelCache.get_instance(self.redis_addr, self.redis_port).set_user_setting_replica_num( - end_point_id=run_id, end_point_name=end_point_name, model_name=model_name, model_version=model_version, - replica_num=desired_replica_num, enable_auto_scaling=enable_auto_scaling, - scale_min=scale_min, scale_max=scale_max, state="DEPLOYING", - aggregation_window_size_seconds=aggregation_window_size_seconds, - target_queries_per_replica=target_queries_per_replica, - scale_down_delay_seconds=int(scale_down_delay_seconds) - ) - - # Start log processor for current run - self.args.run_id = run_id - self.args.edge_id = self.edge_id - MLOpsRuntimeLog(args=self.args).init_logs() - MLOpsRuntimeLogDaemon.get_instance(self.args).set_log_source( - ServerConstants.FEDML_LOG_SOURCE_TYPE_MODEL_END_POINT) - MLOpsRuntimeLogDaemon.get_instance(self.args).start_log_processor(run_id, self.edge_id) - - # # Deprecated - # self.ota_upgrade(payload, request_json) - - # Add additional parameters to the request_json - run_id = inference_end_point_id - self.args.run_id = run_id - 
self.run_id = run_id - request_json["run_id"] = run_id - self.request_json = request_json - run_id_str = str(run_id) - self.running_request_json[run_id_str] = request_json - self.request_json["master_node_ip"] = self.get_ip_address(self.request_json) - - # Set the target status of the devices to redis - FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ - set_end_point_device_info(request_json["end_point_id"], end_point_name, json.dumps(device_objs)) - - # Setup Token - usr_indicated_token = self.get_usr_indicated_token(request_json) - if usr_indicated_token != "": - logging.info(f"Change Token from{token} to {usr_indicated_token}") - token = usr_indicated_token - FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ - set_end_point_token(run_id, end_point_name, model_name, token) - - self.subscribe_slave_devices_message(request_json) - - # Report stage to mlops: MODEL_DEPLOYMENT_STAGE1 = "Received" - self.send_deployment_stages(self.run_id, model_name, model_id, - "", - ServerConstants.MODEL_DEPLOYMENT_STAGE1["index"], - ServerConstants.MODEL_DEPLOYMENT_STAGE1["text"], - "Received request for endpoint {}".format(run_id)) - - # Report stage to mlops: MODEL_DEPLOYMENT_STAGE2 = "Initializing" - self.send_deployment_stages(self.run_id, model_name, model_id, - "", - ServerConstants.MODEL_DEPLOYMENT_STAGE2["index"], - ServerConstants.MODEL_DEPLOYMENT_STAGE2["text"], - ServerConstants.MODEL_DEPLOYMENT_STAGE2["text"]) - - ServerConstants.save_runner_infos(self.args.device_id + "." + self.args.os_name, self.edge_id, run_id=run_id) - - if self.run_as_edge_server_and_agent: - # Replica Controller is per deployment - replica_controller = FedMLDeviceReplicaController(self.edge_id, self.request_json) - - # Prepare num diff - new_request_with_num_diff = replica_controller.generate_diff_to_request_json() - self.running_request_json[run_id_str] = new_request_with_num_diff - request_json = new_request_with_num_diff - - # Listen to extra worker topics, especially when worker's replica remove to zero, - # In this case, currently Java will NOT send those worker ids to the master, but still need to listen to it. 
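The loop just below walks replica_num_diff, whose entries are keyed by device id and carry the operation plus current and target replica counts, e.g. {"op": "remove", "curr_num": 1, "target_num": 0}. A rough sketch of how such a diff can be derived from a current and a desired replica map; the function name and inputs here are illustrative, and the production logic lives in FedMLDeviceReplicaController.

def replica_num_diff(curr, target):
    """Return {device_id: {"op", "curr_num", "target_num"}} entries for every
    device whose replica count must change; unchanged devices are omitted."""
    diff = {}
    for device_id in sorted(set(curr) | set(target)):
        curr_num = curr.get(device_id, 0)
        target_num = target.get(device_id, 0)
        if curr_num == target_num:
            continue
        diff[device_id] = {
            "op": "add" if target_num > curr_num else "remove",
            "curr_num": curr_num,
            "target_num": target_num,
        }
    return diff

# A device scaled down to zero replicas still appears in the diff with
# op == "remove" and target_num == 0, which is exactly the case the
# subscription loop below has to handle.
print(replica_num_diff({"96684": 1}, {"96684": 0, "96685": 2}))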
- if "replica_num_diff" in request_json and len(request_json["replica_num_diff"]) > 0: - for device_id in request_json["replica_num_diff"].keys(): - # {"op": "remove", "curr_num": 1, "target_num": 0} - if request_json["replica_num_diff"][device_id]["op"] == "remove" and \ - request_json["replica_num_diff"][device_id]["target_num"] == 0: - self.subscribe_spec_device_message(run_id, device_id) - - # Prepare version diff - new_request_with_version_diff = replica_controller.init_first_update_device_replica_mapping() - self.running_request_json[run_id_str] = new_request_with_version_diff - request_json = new_request_with_version_diff - - # Init the model runner - server_runner = FedMLServerRunner( - self.args, run_id=run_id, request_json=request_json, agent_config=self.agent_config - ) - server_runner.run_as_edge_server_and_agent = self.run_as_edge_server_and_agent - server_runner.edge_id = self.edge_id - server_runner.infer_host = self.infer_host - server_runner.redis_addr = self.redis_addr - server_runner.redis_port = self.redis_port - server_runner.redis_password = self.redis_password - server_runner.replica_controller = replica_controller - - logging.info(f"[Master] new request for id {run_id_str}") - logging.info(f"[Master] model runner mapping before: {self.model_runner_mapping.items()}") - - self.run_process_event_map[run_id_str] = multiprocessing.Event() - self.run_process_event_map[run_id_str].clear() - server_runner.run_process_event = self.run_process_event_map[run_id_str] - self.run_process_completed_event_map[run_id_str] = multiprocessing.Event() - self.run_process_completed_event_map[run_id_str].clear() - server_runner.run_process_completed_event = self.run_process_completed_event_map[run_id_str] - self.model_runner_mapping[run_id_str] = server_runner - - logging.info(f"[Master] model runner mapping after: {self.model_runner_mapping.items()}") - - # This subprocess will copy the server_runner and run it, but they are not the same object - server_process = Process(target=server_runner.run, args=( - self.run_process_event_map[run_id_str], self.run_process_completed_event_map[run_id_str] - )) - server_process.start() - ServerConstants.save_run_process(run_id, server_process.pid) - - # Send stage: MODEL_DEPLOYMENT_STAGE3 = "StartRunner" - self.send_deployment_stages(self.run_id, model_name, model_id, - "", - ServerConstants.MODEL_DEPLOYMENT_STAGE3["index"], - ServerConstants.MODEL_DEPLOYMENT_STAGE3["text"], - ServerConstants.MODEL_DEPLOYMENT_STAGE3["text"]) - - def send_first_scroll_update_msg(self): - """ - Replica-level rolling update. 
- Delete the record of the replaced device and send the deployment msg to the devices - """ - if "replica_version_diff" not in self.request_json or self.request_json["replica_version_diff"] is None: - return [] - - first_chunk_dict = self.request_json["replica_version_diff"] - - # Delete the record of the replaced device - self.delete_device_replica_info_on_master( - self.request_json["end_point_id"], self.request_json["end_point_name"], - self.request_json["model_config"]["model_name"], first_chunk_dict) - - logging.info(f"Send the first scroll update msg to the device {first_chunk_dict} ") - - # Send the deployment msg to the devices, (we reuse the start_deployment msg) - for edge_id in first_chunk_dict.keys(): - if edge_id == self.edge_id: - continue - # send start deployment request to each device - self.send_deployment_start_request_to_edge(edge_id, self.request_json) - return list(first_chunk_dict.keys()) - - def send_rollback_msg(self, run_id_str): - # Avoid using the old request_json - self.delete_device_replica_info_on_master( - self.running_request_json[run_id_str]["end_point_id"], - self.running_request_json[run_id_str]["end_point_name"], - self.running_request_json[run_id_str]["model_config"]["model_name"], - self.running_request_json[run_id_str]["replica_version_diff"]) - - # Send the deployment msg to the devices, (we reuse the start_deployment msg) - for edge_id in self.running_request_json[run_id_str]["replica_version_diff"].keys(): - if edge_id == self.edge_id: - continue - # send start deployment request to each device - self.send_deployment_start_request_to_edge(edge_id, self.running_request_json[run_id_str]) - - def delete_device_replica_info_on_master(self, endpoint_id, endpoint_name, model_name, edge_id_replica_no_dict): - FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) - # Remove the record of the replaced device - # [Deprecated] deployment status & device info - # Delete the result in deployment result list in Redis / SQLite - device_result_list = FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ - get_deployment_result_list(endpoint_id, endpoint_name, model_name) - - delete_device_result_list = [] - for device_result in device_result_list: - device_result_dict = json.loads(device_result) - if (str(device_result_dict["cache_device_id"]) in edge_id_replica_no_dict.keys() and - str(device_result_dict["cache_replica_no"]) in - edge_id_replica_no_dict[str(device_result_dict["cache_device_id"])]): - delete_device_result_list.append(device_result) - - for delete_item in delete_device_result_list: - FedMLModelCache.get_instance(self.redis_addr, self.redis_port).delete_deployment_result( - delete_item, endpoint_id, endpoint_name, model_name - ) - - logging.info(f"Deleted the replica record on master: {edge_id_replica_no_dict}") - - def send_next_scroll_update_msg(self, run_id_str, device_id, replica_no): - """ - Send the next scroll update msg to the devices if needed. - If there is no need for the next scroll update, directly return. 
- """ - if replica_no is None: - return - - replica_controller = self.model_runner_mapping[run_id_str].replica_controller - - if replica_controller.total_replica_version_diff_num == 0: - return - - if replica_controller.under_rollback: - replica_controller.intermediate_replica_version[device_id][replica_no] = replica_controller.start_version - return - - logging.info(f"Curr updating window: {replica_controller.curr_replica_updating_window} " - f"Curr version diff num: {replica_controller.total_replica_version_diff_num}") - - replica_controller.callback_update_updating_window(device_id, replica_no) - - # Decide whether to send the next scroll update - next_chunk_dict = replica_controller.get_next_chunk_devices_replica() - - if next_chunk_dict: - logging.info(f"The next scroll update for end point {run_id_str} is {next_chunk_dict}") - # Update curr updating window - replica_controller.curr_replica_updating_window = copy.deepcopy(next_chunk_dict) - - # Use global deployment result mapping to decide whether to send the next scroll update - self.running_request_json[run_id_str]["replica_version_diff"] = next_chunk_dict - - # Avoid using the old request_json - self.delete_device_replica_info_on_master( - self.running_request_json[run_id_str]["end_point_id"], - self.running_request_json[run_id_str]["end_point_name"], - self.running_request_json[run_id_str]["model_config"]["model_name"], - next_chunk_dict) - - # Send the deployment msg to the devices, (we reuse the start_deployment msg) - for edge_id in next_chunk_dict.keys(): - if edge_id == self.edge_id: - continue - # send start deployment request to each device - self.send_deployment_start_request_to_edge(edge_id, self.running_request_json[run_id_str]) - return - - def send_rollback_add_remove_op(self, run_id, rollback_replica_dict): - """ - This method is used when the original add op failed, we need to rollback by delete the existed replicas - Input example: - rollback_replica_dict = {'96684': {'curr_num': 2, 'op': 'remove', 'target_num': 1}} - """ - existed_request_json = self.running_request_json[str(run_id)] - updated_request_json = copy.deepcopy(existed_request_json) - - # Reverse the replica_num_diff - updated_request_json["replica_num_diff"] = rollback_replica_dict - - self.send_deployment_start_request_to_edges(in_request_json=updated_request_json) - - def callback_activate_deployment(self, topic, payload): - logging.info("callback_activate_deployment: topic = %s, payload = %s" % (topic, payload)) - - # Parse payload as the model message object. - model_msg_object = FedMLModelMsgObject(topic, payload) - - # Get the previous deployment status. - FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) - endpoint_status = FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ - get_end_point_status(model_msg_object.inference_end_point_id) - if endpoint_status != ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED: - return - - # Set end point as activated status - FedMLModelCache.get_instance(self.redis_addr, self.redis_port).set_end_point_activation( - model_msg_object.inference_end_point_id, model_msg_object.end_point_name, True) - - def callback_deactivate_deployment(self, topic, payload): - logging.info("callback_deactivate_deployment: topic = %s, payload = %s" % (topic, payload)) - - # Parse payload as the model message object. 
- model_msg_object = FedMLModelMsgObject(topic, payload) - - # Get the endpoint status - FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) - endpoint_status = FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ - get_end_point_status(model_msg_object.inference_end_point_id) - if endpoint_status != ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED: - return - - # Set end point as deactivated status - FedMLModelCache.get_instance(self.redis_addr, self.redis_port).set_end_point_activation( - model_msg_object.inference_end_point_id, model_msg_object.model_name, False) - - def set_runner_stopped_event(self, run_id): - run_id_str = str(run_id) - server_runner = self.model_runner_mapping.get(run_id_str, None) - if server_runner is not None: - if server_runner.run_process_event is not None: - server_runner.run_process_event.set() - self.model_runner_mapping.pop(run_id_str) - - def set_runner_completed_event(self, run_id): - run_id_str = str(run_id) - server_runner = self.model_runner_mapping.get(run_id_str, None) - if server_runner is not None: - if server_runner.run_process_completed_event is not None: - server_runner.run_process_completed_event.set() - self.model_runner_mapping.pop(run_id_str) - - def callback_delete_deployment(self, topic, payload): - logging.info("[Master] callback_delete_deployment") - # Parse payload as the model message object. - model_msg_object = FedMLModelMsgObject(topic, payload) - - # Delete SQLite records - FedMLServerDataInterface.get_instance().delete_job_from_db(model_msg_object.run_id) - FedMLModelDatabase.get_instance().delete_deployment_result( - model_msg_object.run_id, model_msg_object.end_point_name, model_msg_object.model_name, - model_version=model_msg_object.model_version) - FedMLModelDatabase.get_instance().delete_deployment_run_info( - end_point_id=model_msg_object.inference_end_point_id) - - # Delete Redis Records - FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) - FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ - set_end_point_activation(model_msg_object.inference_end_point_id, - model_msg_object.end_point_name, False) - FedMLModelCache.get_instance(self.redis_addr, self.redis_port). 
\ - delete_end_point(model_msg_object.inference_end_point_id, model_msg_object.end_point_name, - model_msg_object.model_name, model_msg_object.model_version) - - # Send delete deployment request to the edge devices - self.send_deployment_delete_request_to_edges(payload, model_msg_object) - - # Stop processes on master - self.set_runner_stopped_event(model_msg_object.run_id) - self.stop_device_inference_monitor(model_msg_object.run_id, model_msg_object.end_point_name, - model_msg_object.model_id, model_msg_object.model_name, - model_msg_object.model_version) - - def send_deployment_results_with_payload(self, end_point_id, end_point_name, payload, replica_id_list=None): - self.send_deployment_results(end_point_id, end_point_name, - payload["model_name"], payload["model_url"], - payload["model_version"], payload["port"], - payload["inference_engine"], - payload["model_metadata"], - payload["model_config"], - payload["input_json"], - payload["output_json"], - replica_id_list=replica_id_list) - - def send_deployment_results(self, end_point_id, end_point_name, - model_name, model_inference_url, - model_version, inference_port, inference_engine, - model_metadata, model_config, input_json, output_json, replica_id_list=None): - deployment_results_topic_prefix = "model_ops/model_device/return_deployment_result" - deployment_results_topic = "{}/{}".format(deployment_results_topic_prefix, end_point_id) - deployment_results_payload = {"end_point_id": end_point_id, "end_point_name": end_point_name, - "model_name": model_name, "model_url": model_inference_url, - "version": model_version, "port": inference_port, - "inference_engine": inference_engine, - "model_metadata": model_metadata, - "model_config": model_config, - "input_json": input_json, - "output_json": output_json, - "timestamp": int(format(time.time_ns() / 1000.0, '.0f')), - "replica_ids": replica_id_list} - logging.info(f"[Master] deployment_results_payload is sent to mlops: {deployment_results_payload}") - - self.client_mqtt_mgr.send_message_json(deployment_results_topic, json.dumps(deployment_results_payload)) - self.client_mqtt_mgr.send_message_json(deployment_results_topic_prefix, json.dumps(deployment_results_payload)) - - def send_deployment_status(self, end_point_id, end_point_name, model_name, model_inference_url, model_status): - deployment_status_topic_prefix = "model_ops/model_device/return_deployment_status" - deployment_status_topic = "{}/{}".format(deployment_status_topic_prefix, end_point_id) - deployment_status_payload = {"end_point_id": end_point_id, "end_point_name": end_point_name, - "model_name": model_name, - "model_url": model_inference_url, - "model_status": model_status, - "timestamp": int(format(time.time_ns() / 1000.0, '.0f'))} - logging.info(f"[Master] deployment_status_payload is sent to mlops: {deployment_status_payload}") - - self.client_mqtt_mgr.send_message_json(deployment_status_topic, json.dumps(deployment_status_payload)) - self.client_mqtt_mgr.send_message_json(deployment_status_topic_prefix, json.dumps(deployment_status_payload)) - - def send_deployment_stages(self, end_point_id, model_name, model_id, model_inference_url, - model_stages_index, model_stages_title, model_stage_detail): - deployment_stages_topic_prefix = "model_ops/model_device/return_deployment_stages" - deployment_stages_topic = "{}/{}".format(deployment_stages_topic_prefix, end_point_id) - deployment_stages_payload = {"model_name": model_name, - "model_id": model_id, - "model_url": model_inference_url, - "end_point_id": end_point_id, - 
"model_stage_index": model_stages_index, - "model_stage_title": model_stages_title, - "model_stage_detail": model_stage_detail, - "timestamp": int(format(time.time_ns() / 1000.0, '.0f'))} - - self.client_mqtt_mgr.send_message_json(deployment_stages_topic, json.dumps(deployment_stages_payload)) - self.client_mqtt_mgr.send_message_json(deployment_stages_topic_prefix, json.dumps(deployment_stages_payload)) - - logging.info(f"-------- Stages has been sent to mlops with stage {model_stages_index} and " - f"payload {deployment_stages_payload}") - time.sleep(2) - - def on_client_mqtt_disconnected(self, mqtt_client_object): - if self.client_mqtt_lock is None: - self.client_mqtt_lock = threading.Lock() - - self.client_mqtt_lock.acquire() - self.client_mqtt_is_connected = False - self.client_mqtt_lock.release() - - logging.info("on_client_mqtt_disconnected: {}.".format(self.client_mqtt_is_connected)) - - def on_client_mqtt_connected(self, mqtt_client_object): - if self.mlops_metrics is None: - self.mlops_metrics = MLOpsMetrics() - - self.mlops_metrics.set_messenger(self.client_mqtt_mgr) - self.mlops_metrics.run_id = self.run_id - self.mlops_metrics.edge_id = self.edge_id - self.mlops_metrics.server_agent_id = self.server_agent_id - - if self.client_mqtt_lock is None: - self.client_mqtt_lock = threading.Lock() - - self.client_mqtt_lock.acquire() - self.client_mqtt_is_connected = True - self.client_mqtt_lock.release() - - # logging.info("on_client_mqtt_connected: {}.".format(self.client_mqtt_is_connected)) - - def setup_client_mqtt_mgr(self): - if self.client_mqtt_mgr is not None: - return - - if self.client_mqtt_lock is None: - self.client_mqtt_lock = threading.Lock() - - # logging.info( - # "server agent config: {},{}".format( - # self.agent_config["mqtt_config"]["BROKER_HOST"], self.agent_config["mqtt_config"]["BROKER_PORT"] - # ) - # ) - - self.client_mqtt_mgr = MqttManager( - self.agent_config["mqtt_config"]["BROKER_HOST"], - self.agent_config["mqtt_config"]["BROKER_PORT"], - self.agent_config["mqtt_config"]["MQTT_USER"], - self.agent_config["mqtt_config"]["MQTT_PWD"], - self.agent_config["mqtt_config"]["MQTT_KEEPALIVE"], - "FedML_ModelServerAgent_Metrics_@{}@_{}_{}_{}".format(self.user_name, self.args.current_device_id, - str(os.getpid()), - str(uuid.uuid4())) - ) - self.client_mqtt_mgr.add_connected_listener(self.on_client_mqtt_connected) - self.client_mqtt_mgr.add_disconnected_listener(self.on_client_mqtt_disconnected) - self.client_mqtt_mgr.connect() - self.client_mqtt_mgr.loop_start() - - if self.mlops_metrics is None: - self.mlops_metrics = MLOpsMetrics() - self.mlops_metrics.set_messenger(self.client_mqtt_mgr) - self.mlops_metrics.run_id = self.run_id - self.mlops_metrics.edge_id = self.edge_id - self.mlops_metrics.server_agent_id = self.server_agent_id - - def release_client_mqtt_mgr(self): - try: - if self.client_mqtt_mgr is not None: - self.client_mqtt_mgr.loop_stop() - self.client_mqtt_mgr.disconnect() - - self.client_mqtt_lock.acquire() - if self.client_mqtt_mgr is not None: - self.client_mqtt_is_connected = False - self.client_mqtt_mgr = None - self.client_mqtt_lock.release() - except Exception: - pass - - def send_deployment_stop_request_to_edges(self, edge_id_list, payload): - for edge_id in edge_id_list: - topic_stop_deployment = "model_ops/model_device/stop_deployment/{}".format(str(self.edge_id)) - logging.info("stop_deployment: send topic " + topic_stop_deployment) - self.client_mqtt_mgr.send_message_json(topic_stop_deployment, payload) - - def 
send_exit_train_with_exception_request_to_edges(self, edge_id_list, payload): - for edge_id in edge_id_list: - topic_exit_train = "flserver_agent/" + str(edge_id) + "/exit_train_with_exception" - logging.info("exit_train_with_exception: send topic " + topic_exit_train) - self.client_mqtt_mgr.send_message_json(topic_exit_train, payload) - - def exit_run_with_exception_entry(self): - try: - self.setup_client_mqtt_mgr() - self.exit_run_with_exception() - except Exception as e: - self.release_client_mqtt_mgr() - sys_utils.cleanup_all_fedml_server_login_processes( - ServerConstants.SERVER_LOGIN_PROGRAM, clean_process_group=False) - sys.exit(1) - finally: - self.release_client_mqtt_mgr() - - def exit_run_with_exception(self): - logging.info("Exit run successfully.") - - ServerConstants.cleanup_learning_process(self.run_id) - ServerConstants.cleanup_run_process(self.run_id) - - self.mlops_metrics.report_server_id_status( - self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id) - - time.sleep(1) - - def callback_exit_train_with_exception(self, topic, payload): - # logging.info("callback_exit_train_with_exception: topic = %s, payload = %s" % (topic, payload)) - - request_json = json.loads(payload) - is_retain = request_json.get("is_retain", False) - if is_retain: - return - run_id = request_json.get("runId", None) - if run_id is None: - run_id = request_json.get("run_id", None) - if run_id is None: - run_id = request_json.get("id", None) - - if run_id is None: - return - - edge_ids = request_json.get("edgeids", None) - - self.send_exit_train_with_exception_request_to_edges(edge_ids, payload) - - # Stop server with multiprocessing mode - self.request_json = request_json - server_runner = FedMLServerRunner( - self.args, edge_id=self.edge_id, request_json=request_json, agent_config=self.agent_config, run_id=run_id - ) - try: - Process(target=server_runner.exit_run_with_exception_entry).start() - except Exception as e: - pass - - def callback_client_exit_train_with_exception(self, topic, payload): - # logging.info("callback_client_exit_train_with_exception: topic = %s, payload = %s" % (topic, payload)) - - request_json = json.loads(payload) - run_id = request_json.get("run_id", None) - edge_id = request_json.get("edge_id", None) - if run_id is None: - logging.info("callback_client_exit_train_with_exception run id is none") - return - - job = FedMLServerDataInterface.get_instance().get_job_by_id(run_id) - if job is not None and job.running_json is not None and job.running_json != "": - job_json_obj = json.loads(job.running_json) - edge_ids = job_json_obj.get("edgeids", None) - - self.mlops_metrics.broadcast_server_training_status( - run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, - is_from_model=True, edge_id=edge_id) - - self.send_exit_train_with_exception_request_to_edges(edge_ids, job.running_json) - - self.exit_run_with_exception() - - def callback_runner_id_status(self, topic, payload): - logging.info("callback_runner_id_status: topic = %s, payload = %s" % (topic, payload)) - - request_json = json.loads(payload) - is_retain = request_json.get("is_retain", False) - if is_retain: - return - run_id = request_json["run_id"] - status = request_json["status"] - edge_id = request_json["edge_id"] - run_id_str = str(run_id) - - if ( - status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED - or status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED - ): - # Stop server with multiprocessing mode - stop_request_json = self.running_request_json.get(run_id_str, None) - 
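-            # A sketch of the status payload handled by this callback (field values are illustrative):
-            #     {"run_id": 1234, "edge_id": 257, "status": "FINISHED", "is_retain": false}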
if stop_request_json is None:
-                stop_request_json = request_json
-            if self.run_as_edge_server_and_agent:
-                server_runner = FedMLServerRunner(
-                    self.args, run_id=run_id, request_json=stop_request_json, agent_config=self.agent_config
-                )
-                server_runner.edge_id = self.edge_id
-                server_runner.run_as_edge_server_and_agent = self.run_as_edge_server_and_agent
-                server_runner.run_status = status
-                status_process = Process(target=server_runner.cleanup_client_with_status)
-                status_process.start()
-                status_process.join(10)
-
-                # Stop log processor for current run
-                MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id)
-
-    def cleanup_client_with_status(self):
-        if self.run_status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED:
-            logging.info("received finished status.")
-            self.cleanup_run_when_finished()
-        elif self.run_status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED:
-            logging.info("received failed status.")
-            self.cleanup_run_when_starting_failed()
-
-    def callback_report_current_status(self, topic, payload):
-        request_json = json.loads(payload)
-        if self.run_as_edge_server_and_agent:
-            self.send_agent_active_msg()
-        elif self.run_as_cloud_agent:
-            self.send_agent_active_msg()
-        elif self.run_as_cloud_server:
-            pass
-
-    @staticmethod
-    def process_ota_upgrade_msg():
-        os.system("pip install -U fedml")
-
-    def callback_server_ota_msg(self, topic, payload):
-        request_json = json.loads(payload)
-        cmd = request_json["cmd"]
-
-        if cmd == ServerConstants.FEDML_OTA_CMD_UPGRADE:
-            try:
-                self.process_ota_upgrade_msg()
-                # Process(target=FedMLServerRunner.process_ota_upgrade_msg).start()
-                raise Exception("After upgraded, restart runner...")
-            except Exception as e:
-                pass
-        elif cmd == ServerConstants.FEDML_OTA_CMD_RESTART:
-            raise Exception("Restart runner...")
-
-    @staticmethod
-    def get_device_id():
-        device_file_path = os.path.join(ServerConstants.get_data_dir(), ServerConstants.LOCAL_RUNNER_INFO_DIR_NAME)
-        file_for_device_id = os.path.join(device_file_path, "devices.id")
-        if not os.path.exists(device_file_path):
-            os.makedirs(device_file_path)
-        elif os.path.exists(file_for_device_id):
-            with open(file_for_device_id, 'r', encoding='utf-8') as f:
-                device_id_from_file = f.readline()
-                if device_id_from_file is not None and device_id_from_file != "":
-                    return device_id_from_file
-
-        if platform.system() == "Darwin":
-            cmd_get_serial_num = "system_profiler SPHardwareDataType | grep Serial | awk '{gsub(/ /,\"\")}{print}' " \
-                                 "|awk -F':' '{print $2}' "
-            device_id = os.popen(cmd_get_serial_num).read()
-            device_id = device_id.replace('\n', '').replace(' ', '')
-            if device_id is None or device_id == "":
-                device_id = hex(uuid.getnode())
-            else:
-                device_id = "0x" + device_id
-        else:
-            if "nt" in os.name:
-
-                def get_uuid():
-                    guid = ""
-                    try:
-                        cmd = "wmic csproduct get uuid"
-                        guid = str(subprocess.check_output(cmd))
-                        pos1 = guid.find("\\n") + 2
-                        guid = guid[pos1:-15]
-                    except Exception as ex:
-                        pass
-                    return str(guid)
-
-                device_id = str(get_uuid())
-            elif "posix" in os.name:
-                device_id = sys_utils.get_device_id_in_docker()
-                if device_id is None:
-                    device_id = hex(uuid.getnode())
-            else:
-                device_id = sys_utils.run_subprocess_open(
-                    "hal-get-property --udi /org/freedesktop/Hal/devices/computer --key system.hardware.uuid".split()
-                )
-                device_id = hex(device_id)
-
-        if device_id is not None and device_id != "":
-            with open(file_for_device_id, 'w', encoding='utf-8') as f:
-                f.write(device_id)
-        else:
-            # uuid.uuid4() returns a UUID object, not an int, so hex() would raise; use str() instead.
-            device_id = str(uuid.uuid4())
-            with 
open(file_for_device_id, 'w', encoding='utf-8') as f:
-                f.write(device_id)
-
-        return device_id
-
-    def bind_account_and_device_id(self, url, account_id, device_id, os_name):
-        role = ServerConstants.login_role_list[ServerConstants.LOGIN_MODE_ON_PREMISE_MASTER_INDEX]
-        if self.run_as_edge_server_and_agent:
-            role = ServerConstants.login_role_list[ServerConstants.LOGIN_MODE_ON_PREMISE_MASTER_INDEX]
-        elif self.run_as_cloud_agent:
-            role = ServerConstants.login_role_list[ServerConstants.LOGIN_MODE_FEDML_CLOUD_MASTER_INDEX]
-        elif self.run_as_cloud_server:
-            role = ServerConstants.login_role_list[ServerConstants.LOGIN_MODE_INFERENCE_INSTANCE_INDEX]
-
-        ip = requests.get('https://checkip.amazonaws.com').text.strip()
-        fedml_ver, exec_path, os_ver, cpu_info, python_ver, torch_ver, mpi_installed, \
-            cpu_usage, available_mem, total_mem, gpu_info, gpu_available_mem, gpu_total_mem, \
-            gpu_count, gpu_vendor, cpu_count, gpu_device_name = get_sys_runner_info()
-        host_name = sys_utils.get_host_name()
-        json_params = {
-            "accountid": account_id,
-            "deviceid": device_id,
-            "type": os_name,
-            "state": ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE,
-            "status": ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE,
-            "processor": cpu_info,
-            "core_type": cpu_info,
-            "network": "",
-            "role": role,
-            "os_ver": os_ver,
-            "memory": total_mem,
-            "ip": ip,
-            "extra_infos": {"fedml_ver": fedml_ver, "exec_path": exec_path, "os_ver": os_ver,
-                            "cpu_info": cpu_info, "python_ver": python_ver, "torch_ver": torch_ver,
-                            "mpi_installed": mpi_installed, "cpu_usage": cpu_usage,
-                            "available_mem": available_mem, "total_mem": total_mem,
-                            "cpu_count": cpu_count, "gpu_count": 0, "host_name": host_name}
-        }
-        if gpu_count > 0:
-            if gpu_total_mem is not None:
-                # Parenthesize the fallback so the memory suffix is appended in both cases.
-                json_params["gpu"] = (gpu_info if gpu_info is not None else "") + ", Total GPU Memory: " + gpu_total_mem
-            else:
-                json_params["gpu"] = gpu_info if gpu_info is not None else ""
-            json_params["extra_infos"]["gpu_info"] = gpu_info if gpu_info is not None else ""
-            if gpu_available_mem is not None:
-                json_params["extra_infos"]["gpu_available_mem"] = gpu_available_mem
-            if gpu_total_mem is not None:
-                json_params["extra_infos"]["gpu_total_mem"] = gpu_total_mem
-
-            json_params["extra_infos"]["gpu_count"] = gpu_count
-            json_params["extra_infos"]["gpu_vendor"] = gpu_vendor
-            json_params["extra_infos"]["gpu_device_name"] = gpu_device_name
-
-            gpu_available_id_list = sys_utils.get_available_gpu_id_list(limit=gpu_count)
-            gpu_available_count = len(gpu_available_id_list) if gpu_available_id_list is not None else 0
-            gpu_list = sys_utils.get_gpu_list()
-            json_params["extra_infos"]["gpu_available_count"] = gpu_available_count
-            json_params["extra_infos"]["gpu_available_id_list"] = gpu_available_id_list
-            json_params["extra_infos"]["gpu_list"] = gpu_list
-        else:
-            json_params["gpu"] = "None"
-            json_params["extra_infos"]["gpu_available_count"] = 0
-            json_params["extra_infos"]["gpu_available_id_list"] = []
-            json_params["extra_infos"]["gpu_list"] = []
-
-        _, cert_path = MLOpsConfigs.get_request_params()
-        if cert_path is not None:
-            try:
-                requests.session().verify = cert_path
-                response = requests.post(
-                    url, json=json_params, verify=True,
-                    headers={"content-type": "application/json", "Connection": "close"}
-                )
-            except requests.exceptions.SSLError as err:
-                MLOpsConfigs.install_root_ca_file()
-                response = requests.post(
-                    url, json=json_params, verify=True,
-                    headers={"content-type": "application/json", "Connection": "close"}
-                )
-        else:
-            response = requests.post(url, json=json_params, 
headers={"Connection": "close"})
-        edge_id = -1
-        user_name = None
-        extra_url = None
-        if response.status_code != 200:
-            print(f"Binding to MLOps with response.status_code = {response.status_code}, "
-                  f"response.content: {response.content}")
-            pass
-        else:
-            # print("url = {}, response = {}".format(url, response))
-            status_code = response.json().get("code")
-            if status_code == "SUCCESS":
-                edge_id = response.json().get("data").get("id")
-                user_name = response.json().get("data").get("userName", None)
-                extra_url = response.json().get("data").get("url", None)
-                if edge_id is None or edge_id <= 0:
-                    print(f"Binding to MLOps with response.status_code = {response.status_code}, "
-                          f"response.content: {response.content}")
-            else:
-                if status_code == SchedulerConstants.BINDING_ACCOUNT_NOT_EXIST_ERROR:
-                    raise SystemExit(SchedulerConstants.BINDING_ACCOUNT_NOT_EXIST_ERROR)
-                print(f"Binding to MLOps with response.status_code = {response.status_code}, "
-                      f"response.content: {response.content}")
-                return -1, None, None
-        return edge_id, user_name, extra_url
-
-    def fetch_configs(self):
-        return MLOpsConfigs.fetch_all_configs()
-
-    def send_agent_active_msg(self):
-        active_topic = "flserver_agent/active"
-        status = MLOpsStatus.get_instance().get_server_agent_status(self.edge_id)
-        if (
-                status is not None
-                and status != ServerConstants.MSG_MLOPS_SERVER_STATUS_OFFLINE
-                and status != ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE
-        ):
-            return
-
-        status = ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE
-        active_msg = {"ID": self.edge_id, "status": status}
-        MLOpsStatus.get_instance().set_server_agent_status(self.edge_id, status)
-        self.mqtt_mgr.send_message_json(active_topic, json.dumps(active_msg))
-
-    def subscribe_slave_devices_message(self, request_json):
-        if request_json is None:
-            return
-        run_id = request_json["run_id"]
-        edge_id_list = request_json["device_ids"]
-        for edge_id in edge_id_list:
-            if str(edge_id) == str(self.edge_id):
-                continue
-
-            # Subscribe to the deployment result message for each model device
-            deployment_results_topic = "model_device/model_device/return_deployment_result/{}/{}".format(
-                run_id, edge_id)
-
-            self.mqtt_mgr.add_message_listener(deployment_results_topic, self.callback_deployment_result_message)
-            self.mqtt_mgr.subscribe_msg(deployment_results_topic)
-
-    def subscribe_spec_device_message(self, run_id, device_id):
-        if device_id == self.edge_id:
-            return
-
-        # Subscribe to the deployment result message for the given model device
-        deployment_results_topic = "model_device/model_device/return_deployment_result/{}/{}".format(
-            run_id, device_id)
-
-        self.mqtt_mgr.add_message_listener(deployment_results_topic, self.callback_deployment_result_message)
-        self.mqtt_mgr.subscribe_msg(deployment_results_topic)
-
-    def on_agent_mqtt_connected(self, mqtt_client_object):
-        # The MQTT message topic format is as follows: <sender>/<receiver>/<action>
-
-        # Setup MQTT message listener for starting deployment
-        server_agent_id = self.edge_id
-        topic_start_deployment = "model_ops/model_device/start_deployment/{}".format(str(self.edge_id))
-        self.mqtt_mgr.add_message_listener(topic_start_deployment, self.callback_start_deployment)
-
-        # Setup MQTT message listener for activating deployment
-        topic_activate_deployment = "model_ops/model_device/activate_deployment/{}".format(str(self.edge_id))
-        self.mqtt_mgr.add_message_listener(topic_activate_deployment, self.callback_activate_deployment)
-
-        # Setup MQTT message listener for deactivating deployment
-        topic_deactivate_deployment = 
"model_ops/model_device/deactivate_deployment/{}".format(str(self.edge_id)) - self.mqtt_mgr.add_message_listener(topic_deactivate_deployment, self.callback_deactivate_deployment) - - # Setup MQTT message listener for delete deployment - topic_delete_deployment = "model_ops/model_device/delete_deployment/{}".format(str(self.edge_id)) - self.mqtt_mgr.add_message_listener(topic_delete_deployment, self.callback_delete_deployment) - - # Setup MQTT message listener for server status switching - topic_server_status = "fl_server/flserver_agent_" + str(server_agent_id) + "/status" - self.mqtt_mgr.add_message_listener(topic_server_status, self.callback_runner_id_status) - - # Setup MQTT message listener to report current device status. - topic_report_status = "mlops/report_device_status" - self.mqtt_mgr.add_message_listener(topic_report_status, self.callback_report_current_status) - - # Setup MQTT message listener to OTA messages from the MLOps. - topic_ota_msg = "mlops/flserver_agent_" + str(server_agent_id) + "/ota" - self.mqtt_mgr.add_message_listener(topic_ota_msg, self.callback_server_ota_msg) - - # Subscribe topics for starting train, stopping train and fetching client status. - mqtt_client_object.subscribe(topic_start_deployment, qos=2) - mqtt_client_object.subscribe(topic_activate_deployment, qos=2) - mqtt_client_object.subscribe(topic_deactivate_deployment, qos=2) - mqtt_client_object.subscribe(topic_delete_deployment, qos=2) - mqtt_client_object.subscribe(topic_server_status, qos=2) - mqtt_client_object.subscribe(topic_report_status, qos=2) - mqtt_client_object.subscribe(topic_ota_msg, qos=2) - - self.subscribed_topics.clear() - self.subscribed_topics.append(topic_start_deployment) - self.subscribed_topics.append(topic_activate_deployment) - self.subscribed_topics.append(topic_deactivate_deployment) - self.subscribed_topics.append(topic_delete_deployment) - self.subscribed_topics.append(topic_server_status) - self.subscribed_topics.append(topic_report_status) - self.subscribed_topics.append(topic_ota_msg) - - self.endpoint_sync_protocol = FedMLEndpointSyncProtocol(agent_config=self.agent_config, mqtt_mgr=self.mqtt_mgr) - self.endpoint_sync_protocol.setup_listener_for_sync_device_info(self.edge_id) - - # Broadcast the first active message. 
- self.send_agent_active_msg() - - # Echo results - # print("\n\nCongratulations, your device is connected to the FedML MLOps platform successfully!") - # print( - # "Your FedML Edge ID is " + str(self.edge_id) + ", unique device ID is " - # + str(self.unique_device_id) - # + "\n" - # ) - - MLOpsRuntimeLog.get_instance(self.args).init_logs(log_level=logging.INFO) - - def on_agent_mqtt_disconnected(self, mqtt_client_object): - MLOpsStatus.get_instance().set_server_agent_status( - self.edge_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_OFFLINE - ) - - def recover_inference_and_monitor(self): - try: - history_jobs = FedMLServerDataInterface.get_instance().get_history_jobs() - for job in history_jobs.job_list: - if job.running_json is None: - continue - - if job.deployment_result == "": - continue - - run_id, end_point_name, token, user_id, user_name, device_ids, device_objs, model_config, model_name, \ - model_id, model_storage_url, scale_min, scale_max, inference_engine, model_is_from_open, \ - inference_end_point_id, use_gpu, memory_size, model_version, inference_port = \ - self.parse_model_run_params(json.loads(job.running_json)) - - FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) - is_activated = FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ - get_end_point_activation(run_id) - if not is_activated: - continue - - self.start_device_inference_gateway(run_id, end_point_name, model_id, model_name, model_version, - inference_port=inference_port) - - self.stop_device_inference_monitor(run_id, end_point_name, model_id, model_name, model_version) - self.start_device_inference_monitor(run_id, end_point_name, model_id, model_name, model_version) - except Exception as e: - logging.info("recover inference and monitor: {}".format(traceback.format_exc())) - - def recover_start_deployment_msg_after_upgrading(self): - try: - current_job = FedMLServerDataInterface.get_instance().get_current_job() - if current_job is not None and \ - current_job.status == ServerConstants.MSG_MLOPS_SERVER_STATUS_UPGRADING: - FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) - is_activated = FedMLModelCache.get_instance(self.redis_addr, self.redis_port). 
\ - get_end_point_activation(current_job.job_id) - if not is_activated: - return - logging.info("start deployment after upgrading.") - topic_start_deployment = "model_ops/model_device/start_deployment/{}".format(str(self.edge_id)) - self.callback_start_deployment(topic_start_deployment, current_job.running_json) - except Exception as e: - logging.info("recover starting deployment message after upgrading: {}".format(traceback.format_exc())) - - def setup_agent_mqtt_connection(self, service_config): - # Setup MQTT connection - self.mqtt_mgr = MqttManager( - service_config["mqtt_config"]["BROKER_HOST"], - service_config["mqtt_config"]["BROKER_PORT"], - service_config["mqtt_config"]["MQTT_USER"], - service_config["mqtt_config"]["MQTT_PWD"], - service_config["mqtt_config"]["MQTT_KEEPALIVE"], - "FedML_ModelServerAgent_Daemon_@" + self.user_name + "@_" + self.args.current_device_id + str(uuid.uuid4()), - "flserver_agent/last_will_msg", - json.dumps({"ID": self.edge_id, "status": ServerConstants.MSG_MLOPS_SERVER_STATUS_OFFLINE}) - ) - self.agent_config = service_config - - # Init local database - FedMLServerDataInterface.get_instance().create_job_table() - try: - FedMLModelDatabase.get_instance().set_database_base_dir(ServerConstants.get_database_dir()) - FedMLModelDatabase.get_instance().create_table() - except Exception as e: - pass - - server_api_cmd = "fedml.computing.scheduler.model_scheduler.device_server_api:api" - server_api_pids = RunProcessUtils.get_pid_from_cmd_line(server_api_cmd) - if server_api_pids is None or len(server_api_pids) <= 0: - # Start local API services - cur_dir = os.path.dirname(__file__) - fedml_base_dir = os.path.dirname(os.path.dirname(os.path.dirname(cur_dir))) - python_program = get_python_program() - self.local_api_process = ServerConstants.exec_console_with_script( - "{} -m uvicorn {} --host 0.0.0.0 --port {} --reload --reload-delay 3 --reload-dir {} " - "--log-level critical".format( - python_program, server_api_cmd, ServerConstants.LOCAL_SERVER_API_PORT, - fedml_base_dir - ), - should_capture_stdout=False, - should_capture_stderr=False - ) - # if self.local_api_process is not None and self.local_api_process.pid is not None: - # print(f"Model master local API process id {self.local_api_process.pid}") - - self.recover_inference_and_monitor() - - # MLOpsRuntimeLogDaemon.get_instance(self.args).stop_all_log_processor() - - # Setup MQTT connected listener - self.mqtt_mgr.add_connected_listener(self.on_agent_mqtt_connected) - self.mqtt_mgr.add_disconnected_listener(self.on_agent_mqtt_disconnected) - self.mqtt_mgr.connect() - - self.setup_client_mqtt_mgr() - self.mlops_metrics.report_server_training_status( - self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE, - is_from_model=True, edge_id=self.edge_id) - MLOpsStatus.get_instance().set_server_agent_status( - self.edge_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE - ) - - self.recover_start_deployment_msg_after_upgrading() - - def stop_agent(self): - if self.run_process_event is not None: - self.run_process_event.set() - - if self.mqtt_mgr is not None: - try: - for topic in self.subscribed_topics: - self.mqtt_mgr.unsubscribe_msg(topic) - except Exception as e: - pass - - self.mqtt_mgr.loop_stop() - self.mqtt_mgr.disconnect() - - self.release_client_mqtt_mgr() - - def start_agent_mqtt_loop(self, should_exit_sys=True): - # Start MQTT message loop - try: - self.mqtt_mgr.loop_forever() - except Exception as e: - if str(e) == "Restarting after upgraded...": - logging.info("Restarting after upgraded...") - else: - 
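-                # Any exception other than the upgrade-restart signal handled above is unexpected;
-                # print the full traceback before stop_agent() runs in the finally block.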
print("Server tracing: {}".format(traceback.format_exc())) - finally: - self.stop_agent() - if should_exit_sys: - pass - """ - # Deprecated, will kill the process by the parent process. - time.sleep(5) - sys_utils.cleanup_all_fedml_server_login_processes( - ServerConstants.SERVER_LOGIN_PROGRAM, clean_process_group=False) - sys.exit(1) - """ - diff --git a/python/fedml/computing/scheduler/slave/client_runner_deprecated.py b/python/fedml/computing/scheduler/slave/client_runner_deprecated.py deleted file mode 100755 index 79b569772..000000000 --- a/python/fedml/computing/scheduler/slave/client_runner_deprecated.py +++ /dev/null @@ -1,1872 +0,0 @@ -import json -import logging -import multiprocessing -import sys - -from multiprocessing import Process -import os -import platform -import shutil -import subprocess -import threading - -import time -import traceback -import urllib -import uuid -import zipfile -from urllib.parse import urljoin, urlparse - -import requests - -import fedml -from ..comm_utils.constants import SchedulerConstants -from ..comm_utils.job_cleanup import JobCleanup -from ..comm_utils.job_utils import JobRunnerUtils, DockerArgs -from ..comm_utils.run_process_utils import RunProcessUtils -from ..scheduler_entry.constants import Constants -from ....core.mlops.mlops_device_perfs import MLOpsDevicePerfStats -from ....core.mlops.mlops_runtime_log import MLOpsRuntimeLog - -from ....core.distributed.communication.mqtt.mqtt_manager import MqttManager -from ..comm_utils.yaml_utils import load_yaml_config -from .client_constants import ClientConstants - -from ....core.mlops.mlops_metrics import MLOpsMetrics - -from ....core.mlops.mlops_configs import MLOpsConfigs -from ....core.mlops.mlops_runtime_log_daemon import MLOpsRuntimeLogDaemon -from ....core.mlops.mlops_status import MLOpsStatus -from ..comm_utils.sys_utils import get_sys_runner_info, get_python_program -from .client_data_interface import FedMLClientDataInterface -from ..comm_utils import sys_utils -from ....core.mlops.mlops_utils import MLOpsUtils -from ..model_scheduler.model_device_client import FedMLModelDeviceClientRunner -from ..model_scheduler.model_device_server import FedMLModelDeviceServerRunner -from ..comm_utils import security_utils -from ..scheduler_core.compute_cache_manager import ComputeCacheManager -from ..scheduler_core.message_center import FedMLMessageCenter -import ssl - - -class RunnerError(Exception): - """ Runner stopped. """ - pass - - -class RunnerCompletedError(Exception): - """ Runner completed. 
""" - pass - - -class FedMLClientRunner(FedMLMessageCenter): - - def __init__(self, args, edge_id=0, request_json=None, agent_config=None, run_id=0, - cuda_visible_gpu_ids_str=None): - super().__init__() - self.model_device_server_id = None - self.model_device_client_edge_id_list = None - self.disable_client_login = False - self.model_device_server = None - self.model_device_client_list = None - self.run_process_event = None - self.run_process_event_map = dict() - self.run_process_completed_event = None - self.run_process_completed_event_map = dict() - self.run_process = None - self.run_process_map = dict() - self.running_request_json = dict() - self.local_api_process = None - self.start_request_json = None - self.device_status = None - self.current_training_status = None - self.mqtt_mgr = None - self.edge_id = edge_id - self.edge_user_name = None - self.edge_extra_url = None - self.run_id = run_id - self.unique_device_id = None - self.args = args - self.request_json = request_json - self.version = args.version - self.device_id = args.device_id - self.cur_dir = os.path.split(os.path.realpath(__file__))[0] - if args.current_running_dir is not None: - self.cur_dir = args.current_running_dir - self.sudo_cmd = "" - self.is_mac = False - if platform.system() == "Darwin": - self.is_mac = True - - self.agent_config = agent_config - self.fedml_data_base_package_dir = os.path.join("/", "fedml", "data") - self.fedml_data_local_package_dir = os.path.join("/", "fedml", "fedml-package", "fedml", "data") - self.fedml_data_dir = self.fedml_data_base_package_dir - self.fedml_config_dir = os.path.join("/", "fedml", "conf") - - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES = { - "${FEDSYS.RUN_ID}": "", - "${FEDSYS.PRIVATE_LOCAL_DATA}": "", - "${FEDSYS.CLIENT_ID_LIST}": "", - "${FEDSYS.SYNTHETIC_DATA_URL}": "", - "${FEDSYS.IS_USING_LOCAL_DATA}": "", - "${FEDSYS.CLIENT_NUM}": "", - "${FEDSYS.CLIENT_INDEX}": "", - "${FEDSYS.CLIENT_OBJECT_LIST}": "", - "${FEDSYS.LOG_SERVER_URL}": "", - } - - self.mlops_metrics = None - self.client_active_list = dict() - self.ntp_offset = MLOpsUtils.get_ntp_offset() - self.server_id = None - self.computing_started_time = 0 - self.fedml_config_object = None - self.package_type = SchedulerConstants.JOB_PACKAGE_TYPE_DEFAULT - self.cuda_visible_gpu_ids_str = cuda_visible_gpu_ids_str - # logging.info("Current directory of client agent: " + self.cur_dir) - self.subscribed_topics = list() - self.user_name = None - self.general_edge_id = None - self.message_center = None - - def __repr__(self): - return "<{klass} @{id:x} {attrs}>".format( - klass=self.__class__.__name__, - id=id(self) & 0xFFFFFF, - attrs=" ".join("{}={!r}".format(k, v) for k, v in self.__dict__.items()), - ) - - def copy_runner(self): - copy_runner = FedMLClientRunner(self.args) - copy_runner.disable_client_login = self.disable_client_login - copy_runner.model_device_server = self.model_device_server - copy_runner.model_device_client_list = self.model_device_client_list - copy_runner.run_process_event = self.run_process_event - copy_runner.run_process_event_map = self.run_process_event_map - copy_runner.run_process_completed_event = self.run_process_completed_event - copy_runner.run_process_completed_event_map = self.run_process_completed_event_map - copy_runner.run_process = self.run_process - copy_runner.run_process_map = self.run_process_map - copy_runner.running_request_json = self.running_request_json - copy_runner.local_api_process = self.local_api_process - copy_runner.start_request_json = self.start_request_json - 
copy_runner.device_status = self.device_status
-        copy_runner.current_training_status = self.current_training_status
-        copy_runner.mqtt_mgr = self.mqtt_mgr
-        copy_runner.edge_id = self.edge_id
-        copy_runner.edge_user_name = self.edge_user_name
-        copy_runner.edge_extra_url = self.edge_extra_url
-        copy_runner.run_id = self.run_id
-        copy_runner.unique_device_id = self.unique_device_id
-        copy_runner.args = self.args
-        copy_runner.request_json = self.request_json
-        copy_runner.version = self.version
-        copy_runner.device_id = self.device_id
-        copy_runner.cur_dir = self.cur_dir
-        copy_runner.sudo_cmd = self.sudo_cmd
-        copy_runner.is_mac = self.is_mac
-
-        copy_runner.agent_config = self.agent_config
-        copy_runner.fedml_data_base_package_dir = self.fedml_data_base_package_dir
-        copy_runner.fedml_data_local_package_dir = self.fedml_data_local_package_dir
-        copy_runner.fedml_data_dir = self.fedml_data_dir
-        copy_runner.fedml_config_dir = self.fedml_config_dir
-
-        copy_runner.FEDML_DYNAMIC_CONSTRAIN_VARIABLES = self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES
-
-        copy_runner.mlops_metrics = self.mlops_metrics
-        copy_runner.client_active_list = self.client_active_list
-        copy_runner.ntp_offset = self.ntp_offset
-        copy_runner.server_id = self.server_id
-        copy_runner.computing_started_time = self.computing_started_time
-        copy_runner.fedml_config_object = self.fedml_config_object
-        copy_runner.package_type = self.package_type
-        copy_runner.cuda_visible_gpu_ids_str = self.cuda_visible_gpu_ids_str
-        copy_runner.subscribed_topics = self.subscribed_topics
-        copy_runner.user_name = self.user_name
-        copy_runner.general_edge_id = self.general_edge_id
-        copy_runner.message_center = self.message_center
-
-        return copy_runner
-
-    def build_dynamic_constrain_variables(self, run_id, run_config):
-        data_config = run_config.get("data_config", {})
-        server_edge_id_list = self.request_json["edgeids"]
-        local_edge_id_list = list()
-        local_edge_id_list.append(int(self.edge_id))
-        is_using_local_data = 0
-        private_data_dir = data_config.get("privateLocalData", "")
-        synthetic_data_url = data_config.get("syntheticDataUrl", "")
-        edges = self.request_json["edges"]
-        # if private_data_dir is not None \
-        #         and len(str(private_data_dir).strip(' ')) > 0:
-        #     is_using_local_data = 1
-        if private_data_dir is None or len(str(private_data_dir).strip(" ")) <= 0:
-            params_config = run_config.get("parameters", None)
-            private_data_dir = ClientConstants.get_data_dir()
-        if synthetic_data_url is None or len(str(synthetic_data_url)) <= 0:
-            synthetic_data_url = private_data_dir
-
-        self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.RUN_ID}"] = run_id
-        self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.PRIVATE_LOCAL_DATA}"] = private_data_dir.replace(" ", "")
-        self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.CLIENT_ID_LIST}"] = str(local_edge_id_list).replace(" ", "")
-        self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.SYNTHETIC_DATA_URL}"] = synthetic_data_url.replace(" ", "")
-        self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.IS_USING_LOCAL_DATA}"] = str(is_using_local_data)
-        self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.CLIENT_NUM}"] = len(server_edge_id_list)
-        self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.CLIENT_INDEX}"] = 1
-        for cur_index, id_value in enumerate(server_edge_id_list):
-            if str(id_value) == str(self.edge_id):
-                self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.CLIENT_INDEX}"] = cur_index + 1
-                break
-        client_objects = str(json.dumps(edges))
-        client_objects = client_objects.replace(" ", 
"").replace("\n", "").replace('"', '\\"') - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.CLIENT_OBJECT_LIST}"] = client_objects - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.LOG_SERVER_URL}"] = self.agent_config["ml_ops_config"][ - "LOG_SERVER_URL" - ] - - def unzip_file(self, zip_file, unzip_file_path) -> str: - if zipfile.is_zipfile(zip_file): - with zipfile.ZipFile(zip_file, "r") as zipf: - zipf.extractall(unzip_file_path) - unzipped_file_name = zipf.namelist()[0] - else: - raise Exception("Invalid zip file {}".format(zip_file)) - - return unzipped_file_name - - def package_download_progress(self, count, blksize, filesize): - self.check_runner_stop_event() - - downloaded = count * blksize - downloaded = filesize if downloaded > filesize else downloaded - progress = (downloaded / filesize * 100) if filesize != 0 else 0 - progress_int = int(progress) - downloaded_kb = format(downloaded / 1024, '.2f') - - # since this hook funtion is stateless, we need a state to avoid print progress repeatly - if count == 0: - self.prev_download_progress = 0 - if progress_int != self.prev_download_progress and progress_int % 5 == 0: - self.prev_download_progress = progress_int - logging.info("package downloaded size {} KB, progress {}%".format(downloaded_kb, progress_int)) - - def retrieve_and_unzip_package(self, package_name, package_url): - local_package_path = ClientConstants.get_package_download_dir() - os.makedirs(local_package_path, exist_ok=True) - filename, filename_without_extension, file_extension = ClientConstants.get_filename_and_extension(package_url) - local_package_file = os.path.join(local_package_path, f"fedml_run_{self.run_id}_{filename_without_extension}") - if os.path.exists(local_package_file): - os.remove(local_package_file) - ssl._create_default_https_context = ssl._create_unverified_context - urllib.request.urlretrieve(package_url, local_package_file, - reporthook=self.package_download_progress) - unzip_package_path = os.path.join(ClientConstants.get_package_unzip_dir(), - f"unzip_fedml_run_{self.run_id}_{filename_without_extension}") - try: - shutil.rmtree(unzip_package_path, ignore_errors=True) - except Exception as e: - logging.error( - f"Failed to remove directory {unzip_package_path}, Exception: {e}, Traceback: {traceback.format_exc()}") - pass - - package_dir_name = self.unzip_file(local_package_file, unzip_package_path) # Using unziped folder name - unzip_package_full_path = os.path.join(unzip_package_path, package_dir_name) - - logging.info("local_package_file {}, unzip_package_path {}, unzip file full path {}".format( - local_package_file, unzip_package_path, unzip_package_full_path)) - - return unzip_package_full_path - - def update_local_fedml_config(self, run_id, run_config): - packages_config = run_config["packages_config"] - - # Copy config file from the client - unzip_package_path = self.retrieve_and_unzip_package( - packages_config["linuxClient"], packages_config["linuxClientUrl"] - ) - fedml_local_config_file = os.path.join(unzip_package_path, "conf", "fedml.yaml") - - # Load the above config to memory - config_from_container = load_yaml_config(fedml_local_config_file) - container_entry_file_config = config_from_container["entry_config"] - container_dynamic_args_config = config_from_container["dynamic_args"] - entry_file = container_entry_file_config["entry_file"] - conf_file = container_entry_file_config["conf_file"] - self.package_type = container_entry_file_config.get("package_type", SchedulerConstants.JOB_PACKAGE_TYPE_DEFAULT) - full_conf_path = 
os.path.join(unzip_package_path, "fedml", "config", os.path.basename(conf_file)) - - # Dynamically build constrain variable with realtime parameters from server - self.build_dynamic_constrain_variables(run_id, run_config) - - # Update entry arguments value with constrain variable values with realtime parameters from server - # currently we support the following constrain variables: - # ${FEDSYS_RUN_ID}: a run id represented one entire Federated Learning flow - # ${FEDSYS_PRIVATE_LOCAL_DATA}: private local data path in the Federated Learning client - # ${FEDSYS_CLIENT_ID_LIST}: client list in one entire Federated Learning flow - # ${FEDSYS_SYNTHETIC_DATA_URL}: synthetic data url from server, - # if this value is not null, the client will download data from this URL to use it as - # federated training data set - # ${FEDSYS_IS_USING_LOCAL_DATA}: whether use private local data as federated training data set - # container_dynamic_args_config["data_cache_dir"] = "${FEDSYS.PRIVATE_LOCAL_DATA}" - for constrain_variable_key, constrain_variable_value in self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES.items(): - for argument_key, argument_value in container_dynamic_args_config.items(): - if argument_value is not None and str(argument_value).find(constrain_variable_key) == 0: - replaced_argument_value = str(argument_value).replace( - constrain_variable_key, str(constrain_variable_value) - ) - container_dynamic_args_config[argument_key] = replaced_argument_value - - # Merge all container new config sections as new config dictionary - package_conf_object = dict() - package_conf_object["entry_config"] = container_entry_file_config - package_conf_object["dynamic_args"] = container_dynamic_args_config - package_conf_object["dynamic_args"]["config_version"] = self.args.config_version - container_dynamic_args_config["mqtt_config_path"] = os.path.join( - unzip_package_path, "fedml", "config", os.path.basename(container_dynamic_args_config["mqtt_config_path"]) - ) - container_dynamic_args_config["s3_config_path"] = os.path.join( - unzip_package_path, "fedml", "config", os.path.basename(container_dynamic_args_config["s3_config_path"]) - ) - log_file_dir = ClientConstants.get_log_file_dir() - os.makedirs(log_file_dir, exist_ok=True) - package_conf_object["dynamic_args"]["log_file_dir"] = log_file_dir - - # Save new config dictionary to local file - fedml_updated_config_file = os.path.join(unzip_package_path, "conf", "fedml.yaml") - ClientConstants.generate_yaml_doc(package_conf_object, fedml_updated_config_file) - - # Build dynamic arguments and set arguments to fedml config object - self.build_dynamic_args(run_id, run_config, package_conf_object, unzip_package_path) - return unzip_package_path, package_conf_object - - def build_dynamic_args(self, run_id, run_config, package_conf_object, base_dir): - fedml_conf_file = package_conf_object["entry_config"]["conf_file"] - fedml_conf_file_processed = str(fedml_conf_file).replace('\\', os.sep).replace('/', os.sep) - fedml_conf_path = os.path.join(base_dir, "fedml", "config", - os.path.basename(fedml_conf_file_processed)) - fedml_conf_object = load_yaml_config(fedml_conf_path) - run_params = run_config.get("parameters", {}) - job_yaml = run_params.get("job_yaml", {}) - - # Replace local fedml config objects with parameters from MLOps web - parameters_object = run_config.get("parameters", None) - if parameters_object is not None: - for config_k, config_v in fedml_conf_object.items(): - parameter_v = parameters_object.get(config_k, None) - if parameter_v is not None: - 
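-                    # MLOps-side parameters take precedence: the web value replaces the packaged
-                    # default, and the key is popped so the remaining keys can be appended below.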
fedml_conf_object[config_k] = parameter_v - parameters_object.pop(config_k) - - for config_k, config_v in parameters_object.items(): - fedml_conf_object[config_k] = config_v - - package_dynamic_args = package_conf_object["dynamic_args"] - if fedml_conf_object.get("comm_args", None) is not None: - fedml_conf_object["comm_args"]["mqtt_config_path"] = package_dynamic_args["mqtt_config_path"] - fedml_conf_object["comm_args"]["s3_config_path"] = package_dynamic_args["s3_config_path"] - fedml_conf_object["common_args"]["using_mlops"] = True - if fedml_conf_object.get("train_args", None) is not None: - fedml_conf_object["train_args"]["run_id"] = package_dynamic_args["run_id"] - fedml_conf_object["train_args"]["client_id_list"] = package_dynamic_args["client_id_list"] - fedml_conf_object["train_args"]["client_num_in_total"] = int(package_dynamic_args["client_num_in_total"]) - fedml_conf_object["train_args"]["client_num_per_round"] = int(package_dynamic_args["client_num_in_total"]) - fedml_conf_object["train_args"]["client_id"] = self.edge_id - fedml_conf_object["train_args"]["server_id"] = self.request_json.get("server_id", "0") - if fedml_conf_object.get("device_args", None) is not None: - fedml_conf_object["device_args"]["worker_num"] = int(package_dynamic_args["client_num_in_total"]) - # fedml_conf_object["data_args"]["data_cache_dir"] = package_dynamic_args["data_cache_dir"] - data_args = fedml_conf_object.get("data_args") - if data_args is not None: - data_cache_dir = fedml_conf_object["data_args"].get("data_cache_dir") - if data_cache_dir is not None: - data_cache_dir = os.path.join(data_cache_dir, str(self.edge_id)) - fedml_conf_object["data_args"]["data_cache_dir"] = data_cache_dir - if fedml_conf_object.get("tracking_args", None) is not None: - fedml_conf_object["tracking_args"]["log_file_dir"] = package_dynamic_args["log_file_dir"] - fedml_conf_object["tracking_args"]["log_server_url"] = package_dynamic_args["log_server_url"] - - fedml_conf_object["dynamic_args"] = package_dynamic_args - self.fedml_config_object = fedml_conf_object.copy() - ClientConstants.generate_yaml_doc(fedml_conf_object, fedml_conf_path) - - def run_bootstrap_script(self, bootstrap_cmd_list, bootstrap_script_file): - try: - logging.info("Bootstrap commands are being executed...") - process, error_list = ClientConstants.execute_commands_with_live_logs(bootstrap_cmd_list, - callback=self.callback_run_bootstrap) - - ret_code, out, err = process.returncode, None, None - if ret_code is None or ret_code <= 0: - if error_list is not None and len(error_list) > 0: - is_bootstrap_run_ok = False - else: - if out is not None: - out_str = sys_utils.decode_our_err_result(out) - if out_str != "": - logging.info("{}".format(out_str)) - - sys_utils.log_return_info(bootstrap_script_file, 0) - - is_bootstrap_run_ok = True - else: - if err is not None: - err_str = sys_utils.decode_our_err_result(err) - if err_str != "": - logging.error("{}".format(err_str)) - - sys_utils.log_return_info(bootstrap_script_file, ret_code) - - is_bootstrap_run_ok = False - except Exception as e: - logging.error(f"Bootstrap script error: Exception: {e}, Traceback: {traceback.format_exc()}") - is_bootstrap_run_ok = False - return is_bootstrap_run_ok - - def callback_run_bootstrap(self, job_pid): - ClientConstants.save_bootstrap_process(self.run_id, job_pid) - - def run(self, process_event, completed_event, message_center_queue): - print(f"Client runner process id {os.getpid()}, run id {self.run_id}") - - if platform.system() != "Windows": - os.setsid() - - 
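-        # On POSIX systems, detach into a new session so the runner and its children
-        # can later be signalled (and cleaned up) as one process group.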
os.environ['PYTHONWARNINGS'] = 'ignore:semaphore_tracker:UserWarning'
-
-        self.run_process_event = process_event
-        self.run_process_completed_event = completed_event
-        try:
-            MLOpsUtils.set_ntp_offset(self.ntp_offset)
-            self.rebuild_message_center(message_center_queue)
-            self.run_impl()
-        except RunnerError:
-            logging.info("Runner stopped.")
-            self.reset_devices_status(self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED)
-        except RunnerCompletedError:
-            logging.info("Runner completed.")
-        except Exception as e:
-            logging.error(f"Runner exited with errors. Exception: {e}, Traceback {traceback.format_exc()}")
-            self.mlops_metrics.report_client_id_status(
-                self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED,
-                server_id=self.server_id, run_id=self.run_id)
-        finally:
-            if self.mlops_metrics is not None:
-                computing_ended_time = MLOpsUtils.get_ntp_time()
-                self.mlops_metrics.report_edge_job_computing_cost(self.run_id, self.edge_id,
-                                                                  self.computing_started_time, computing_ended_time,
-                                                                  self.args.user, self.args.api_key)
-            logging.info("Release resources.")
-            self.cleanup_containers_and_release_gpus(self.run_id, self.edge_id)
-            MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(self.run_id, self.edge_id)
-            if self.mlops_metrics is not None:
-                self.mlops_metrics.stop_sys_perf()
-            time.sleep(3)
-            ClientConstants.cleanup_learning_process(self.run_id)
-            ClientConstants.cleanup_run_process(self.run_id)
-
-    def check_runner_stop_event(self):
-        if self.run_process_event.is_set():
-            logging.info("Received stopping event.")
-            raise RunnerError("Runner stopped")
-
-        if self.run_process_completed_event.is_set():
-            logging.info("Received completed event.")
-            raise RunnerCompletedError("Runner completed")
-
-    def run_impl(self):
-        run_id = self.request_json["runId"]
-        run_config = self.request_json["run_config"]
-        data_config = run_config.get("data_config", {})
-        packages_config = run_config["packages_config"]
-
-        self.computing_started_time = MLOpsUtils.get_ntp_time()
-        self.mlops_metrics.report_edge_job_computing_cost(run_id, self.edge_id,
-                                                          self.computing_started_time, 0,
-                                                          self.args.user, self.args.api_key)
-
-        self.check_runner_stop_event()
-
-        MLOpsRuntimeLog.get_instance(self.args).init_logs(log_level=logging.INFO)
-
-        self.mlops_metrics.report_client_id_status(
-            self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_INITIALIZING,
-            running_json=self.start_request_json, run_id=run_id)
-
-        # get training params
-        private_local_data_dir = data_config.get("privateLocalData", "")
-        is_using_local_data = 0
-        # if private_local_data_dir is not None and len(str(private_local_data_dir).strip(' ')) > 0:
-        #     is_using_local_data = 1
-
-        # start a run according to the hyper-parameters
-        # fedml_local_data_dir = self.cur_dir + "/fedml_data/run_" + run_id_str + "_edge_" + str(edge_id)
-        fedml_local_data_dir = os.path.join(self.cur_dir, "fedml_data")
-        fedml_local_config_dir = os.path.join(self.cur_dir, "fedml_config")
-        if is_using_local_data:
-            fedml_local_data_dir = private_local_data_dir
-        self.fedml_data_dir = self.fedml_data_local_package_dir
-
-        self.check_runner_stop_event()
-
-        logging.info("Download packages")
-
-        # Update the local config with real-time parameters from the server and dynamically replace variable values
-        unzip_package_path, fedml_config_object = self.update_local_fedml_config(run_id, run_config)
-        # if unzip_package_path is None or fedml_config_object is None:
-        #     
logging.info("failed to update local fedml config.") - # self.check_runner_stop_event() - # # Send failed msg when exceptions. - # self.cleanup_run_when_starting_failed(status=ClientConstants.MSG_MLOPS_CLIENT_STATUS_EXCEPTION) - # return - - logging.info("Check downloaded packages...") - - entry_file_config = fedml_config_object["entry_config"] - dynamic_args_config = fedml_config_object["dynamic_args"] - entry_file = str(entry_file_config["entry_file"]).replace('\\', os.sep).replace('/', os.sep) - entry_file = os.path.basename(entry_file) - conf_file = entry_file_config["conf_file"] - conf_file = str(conf_file).replace('\\', os.sep).replace('/', os.sep) - ##### - # ClientConstants.cleanup_learning_process(run_id) - # ClientConstants.cleanup_bootstrap_process(run_id) - ##### - - if not os.path.exists(unzip_package_path): - logging.info("failed to unzip file.") - self.check_runner_stop_event() - # Send failed msg when exceptions. - self.cleanup_run_when_starting_failed(status=ClientConstants.MSG_MLOPS_CLIENT_STATUS_EXCEPTION) - return - os.chdir(os.path.join(unzip_package_path, "fedml")) - - self.check_runner_stop_event() - - logging.info("starting the user process...") - - entry_file_full_path = os.path.join(unzip_package_path, "fedml", entry_file) - conf_file_full_path = os.path.join(unzip_package_path, "fedml", conf_file) - logging.info("waiting the user process to finish...") - logging.info(" ") - logging.info(" ") - logging.info("====Your Run Logs Begin===") - - process, is_launch_task, error_list = self.execute_job_task(unzip_package_path=unzip_package_path, - entry_file_full_path=entry_file_full_path, - conf_file_full_path=conf_file_full_path, - dynamic_args_config=dynamic_args_config, - fedml_config_object=self.fedml_config_object) - - logging.info("====Your Run Logs End===") - logging.info(" ") - logging.info(" ") - - ret_code, out, err = process.returncode if process else None, None, None - is_run_ok = sys_utils.is_runner_finished_normally(process.pid) - if is_launch_task: - is_run_ok = True - if error_list is not None and len(error_list) > 0: - is_run_ok = False - if ret_code is None or ret_code <= 0: - self.check_runner_stop_event() - - if is_run_ok: - if out is not None: - out_str = sys_utils.decode_our_err_result(out) - if out_str != "": - logging.info("{}".format(out_str)) - - self.mlops_metrics.report_client_id_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED, - server_id=self.server_id, run_id=run_id) - - if is_launch_task: - sys_utils.log_return_info(f"job {run_id}", ret_code) - else: - sys_utils.log_return_info(entry_file, ret_code) - else: - is_run_ok = False - - if not is_run_ok: - # If the run status is killed or finished, then return with the normal state. - current_job = FedMLClientDataInterface.get_instance().get_job_by_id(run_id) - if current_job is not None and (current_job.status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED or - current_job.status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED): - return - - self.check_runner_stop_event() - - logging.error("failed to run the learning process...") - - if err is not None: - err_str = sys_utils.decode_our_err_result(err) - if err_str != "": - logging.error("{}".format(err_str)) - - if is_launch_task: - sys_utils.log_return_info(f"job {run_id}", ret_code) - else: - sys_utils.log_return_info(entry_file, ret_code) - - # Send failed msg when exceptions. 
-        self.mlops_metrics.report_client_id_status(
-            self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED,
-            server_id=self.server_id, run_id=run_id)
-
-    def execute_job_task(self, unzip_package_path, entry_file_full_path, conf_file_full_path, dynamic_args_config,
-                         fedml_config_object):
-        run_config = self.request_json["run_config"]
-        run_params = run_config.get("parameters", {})
-        client_rank = self.request_json.get("client_rank", 1)
-        job_yaml = run_params.get("job_yaml", {})
-        job_yaml_default_none = run_params.get("job_yaml", None)
-        job_api_key = job_yaml.get("run_api_key", None)
-        job_api_key = job_yaml.get("fedml_run_dynamic_params", None) if job_api_key is None else job_api_key
-        assigned_gpu_ids = run_params.get("gpu_ids", None)
-        job_type = job_yaml.get("job_type", None)
-        containerize = fedml_config_object.get("containerize", None)
-        image_pull_policy = fedml_config_object.get("image_pull_policy", Constants.IMAGE_PULL_POLICY_ALWAYS)
-        # TODO: Can we remove task_type?
-        job_type = job_yaml.get("task_type", Constants.JOB_TASK_TYPE_TRAIN) if job_type is None else job_type
-        conf_file_object = load_yaml_config(conf_file_full_path)
-        entry_args_dict = conf_file_object.get("fedml_entry_args", {})
-        entry_args = entry_args_dict.get("arg_items", None)
-        scheduler_match_info = self.request_json.get("scheduler_match_info", {})
-        if job_type == Constants.JOB_TASK_TYPE_TRAIN:
-            containerize = True if containerize is None else containerize
-
-        # Bootstrap Info
-        bootstrap_script_path, bootstrap_script_dir, bootstrap_script_file = [None] * 3
-        env_args = fedml_config_object.get("environment_args", None)
-
-        if env_args is not None:
-            bootstrap_script_file = env_args.get("bootstrap", None)
-            if bootstrap_script_file is not None:
-                bootstrap_script_file = str(bootstrap_script_file).replace('\\', os.sep).replace('/', os.sep)
-                if platform.system() == 'Windows':
-                    # str.rstrip('.sh') strips characters, not the suffix; replace the extension explicitly.
-                    if bootstrap_script_file.endswith('.sh'):
-                        bootstrap_script_file = bootstrap_script_file[:-len('.sh')] + '.bat'
-            if bootstrap_script_file is not None:
-                bootstrap_script_dir = os.path.join(unzip_package_path, "fedml",
-                                                    os.path.dirname(bootstrap_script_file))
-                bootstrap_script_path = os.path.join(
-                    bootstrap_script_dir, os.path.basename(bootstrap_script_file)
-                )
-
-        bootstrap_cmd_list = list()
-        if bootstrap_script_path:
-            logging.info("Bootstrap commands are being generated...")
-            bootstrap_cmd_list = JobRunnerUtils.generate_bootstrap_commands(bootstrap_script_path=bootstrap_script_path,
-                                                                            bootstrap_script_dir=bootstrap_script_dir,
-                                                                            bootstrap_script_file=bootstrap_script_file)
-            logging.info(f"Generated following Bootstrap commands: {bootstrap_cmd_list}")
-
-        if not containerize:
-            if len(bootstrap_cmd_list) and not (job_type == Constants.JOB_TASK_TYPE_DEPLOY or
-                                                job_type == Constants.JOB_TASK_TYPE_SERVE):
-                bootstrapping_successful = self.run_bootstrap_script(bootstrap_cmd_list=bootstrap_cmd_list,
-                                                                     bootstrap_script_file=bootstrap_script_file)
-
-                if not bootstrapping_successful:
-                    logging.info("failed to run the bootstrap commands.")
-                    self.check_runner_stop_event()
-                    # Send failed msg when exceptions. 
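-                    # A bootstrap failure aborts the run before any job command executes:
-                    # report the EXCEPTION status below, then raise so the caller logs the failed commands.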
- self.cleanup_run_when_starting_failed(status=ClientConstants.MSG_MLOPS_CLIENT_STATUS_EXCEPTION) - raise Exception(f"Failed to execute following bootstrap commands: {bootstrap_cmd_list}") - - logging.info("cleanup the previous learning process and bootstrap process...") - ClientConstants.cleanup_learning_process(self.request_json["runId"]) - ClientConstants.cleanup_bootstrap_process(self.request_json["runId"]) - - executable_interpreter = ClientConstants.CLIENT_SHELL_PS \ - if platform.system() == ClientConstants.PLATFORM_WINDOWS else ClientConstants.CLIENT_SHELL_BASH - - if job_yaml_default_none is None: - # Generate the job executing commands for previous federated learning (Compatibility) - python_program = get_python_program() - logging.info("Run the client: {} {} --cf {} --rank {} --role client".format( - python_program, entry_file_full_path, conf_file_full_path, str(dynamic_args_config.get("rank", 1)))) - rank = str(dynamic_args_config.get("rank", 1)) - entry_command = f"{python_program} {entry_file_full_path} --cf " \ - f"{conf_file_full_path} --rank {rank} --role client" - shell_cmd_list = [entry_command] - - # Run the job executing commands for previous federated learning (Compatibility) - process, error_list = ClientConstants.execute_commands_with_live_logs( - shell_cmd_list, callback=self.callback_start_fl_job, should_write_log_file=False) - is_launch_task = False - else: - self.check_runner_stop_event() - - self.mlops_metrics.report_client_id_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_RUNNING, run_id=self.run_id) - - # Generate the job executing commands - job_executing_commands = JobRunnerUtils.generate_job_execute_commands( - self.run_id, self.edge_id, self.version, - self.package_type, executable_interpreter, entry_file_full_path, - conf_file_object, entry_args, assigned_gpu_ids, - job_api_key, client_rank, scheduler_match_info=scheduler_match_info, - cuda_visible_gpu_ids_str=self.cuda_visible_gpu_ids_str) - - if containerize is not None and containerize is True: - docker_args = fedml_config_object.get("docker", {}) - docker_args = JobRunnerUtils.create_instance_from_dict(DockerArgs, docker_args) - try: - job_executing_commands = JobRunnerUtils.generate_launch_docker_command(docker_args=docker_args, - run_id=self.run_id, - edge_id=self.edge_id, - unzip_package_path=unzip_package_path, - executable_interpreter=executable_interpreter, - entry_file_full_path=entry_file_full_path, - bootstrap_cmd_list=bootstrap_cmd_list, - cuda_visible_gpu_ids_str=self.cuda_visible_gpu_ids_str, - image_pull_policy=image_pull_policy) - except Exception as e: - logging.error(f"Error occurred while generating containerized launch commands. 
" - f"Exception: {e}, Traceback: {traceback.format_exc()}") - return None, None, None - - if not job_executing_commands: - raise Exception("Failed to generate docker execution command") - - # Run the job executing commands - logging.info(f"Run the client job with job id {self.run_id}, device id {self.edge_id}.") - process, error_list = ClientConstants.execute_commands_with_live_logs( - job_executing_commands, callback=self.start_job_perf, error_processor=self.job_error_processor, - should_write_log_file=False if job_type == Constants.JOB_TASK_TYPE_FEDERATE else True) - is_launch_task = False if job_type == Constants.JOB_TASK_TYPE_FEDERATE else True - - return process, is_launch_task, error_list - - def callback_start_fl_job(self, job_pid): - ClientConstants.save_learning_process(self.run_id, job_pid) - self.mlops_metrics.report_sys_perf( - self.args, self.agent_config["mqtt_config"], job_process_id=job_pid) - - def start_job_perf(self, job_pid): - ClientConstants.save_learning_process(self.run_id, job_pid) - self.mlops_metrics.report_job_perf(self.args, self.agent_config["mqtt_config"], job_pid) - - def job_error_processor(self, error_list): - self.check_runner_stop_event() - - error_str = "\n".join(error_list) - error_message = f"Error occurred when running the job... {error_str}" - logging.error(error_message) - raise Exception(error_message) - - def reset_devices_status(self, edge_id, status, should_send_client_id_status=True): - self.mlops_metrics.run_id = self.run_id - self.mlops_metrics.edge_id = edge_id - - if should_send_client_id_status: - if status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED or \ - status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED or \ - status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_EXCEPTION: - self.mlops_metrics.report_client_id_status( - edge_id, status, server_id=self.server_id, run_id=self.run_id) - - def sync_run_stop_status(self, run_status=ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED): - try: - if self.run_process_event is not None: - self.run_process_event.set() - - self.mlops_metrics.report_client_id_status( - self.edge_id, run_status, server_id=self.server_id, run_id=self.run_id) - except Exception as e: - logging.error(f"Failed to sync run stop status with Exception {e}. Traceback: {traceback.format_exc()}") - pass - - def cleanup_run_when_starting_failed( - self, status=ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED, should_send_client_id_status=True): - # logging.error("Cleanup run successfully when starting failed.") - - self.reset_devices_status( - self.edge_id, status, should_send_client_id_status=should_send_client_id_status) - - time.sleep(2) - - try: - self.mlops_metrics.stop_sys_perf() - except Exception as ex: - logging.error(f"Failed to stop sys perf with Exception {ex}. Traceback: {traceback.format_exc()}") - pass - - time.sleep(1) - - try: - ClientConstants.cleanup_learning_process(self.run_id) - ClientConstants.cleanup_bootstrap_process(self.run_id) - ClientConstants.cleanup_run_process(self.run_id) - except Exception as e: - logging.error( - f"Failed to cleanup run when starting failed with Exception {e}. 
Traceback: {traceback.format_exc()}") - pass - - def cleanup_run_when_finished(self): - # logging.info("Cleanup run successfully when finished.") - - self.reset_devices_status(self.edge_id, - ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED, - should_send_client_id_status=False) - - time.sleep(2) - - try: - self.mlops_metrics.stop_sys_perf() - except Exception as ex: - logging.error(f"Failed to stop sys perf with Exception {ex}. Traceback: {traceback.format_exc()}") - pass - - time.sleep(1) - - try: - ClientConstants.cleanup_learning_process(self.run_id) - ClientConstants.cleanup_bootstrap_process(self.run_id) - ClientConstants.cleanup_run_process(self.run_id) - except Exception as e: - logging.error( - f"Failed to cleanup run when finished with Exception {e}. Traceback: {traceback.format_exc()}") - pass - - def setup_message_center(self): - if self.message_center is not None: - return - - self.message_center = FedMLMessageCenter(agent_config=self.agent_config) - self.message_center.start_sender() - - if self.mlops_metrics is None: - self.mlops_metrics = MLOpsMetrics() - self.mlops_metrics.set_messenger(self.message_center) - self.mlops_metrics.run_id = self.run_id - - def rebuild_message_center(self, message_center_queue): - self.message_center = FedMLMessageCenter(message_queue=message_center_queue) - - if self.mlops_metrics is None: - self.mlops_metrics = MLOpsMetrics() - self.mlops_metrics.set_messenger(self.message_center) - self.mlops_metrics.run_id = self.run_id - - def release_message_center(self): - try: - if self.message_center is not None: - self.message_center.stop() - self.message_center = None - - except Exception as e: - logging.error( - f"Failed to release client mqtt manager with Exception {e}. Traceback: {traceback.format_exc()}") - pass - - def ota_upgrade(self, payload, request_json): - run_id = request_json["runId"] - force_ota = False - ota_version = None - - try: - run_config = request_json.get("run_config", None) - parameters = run_config.get("parameters", None) - common_args = parameters.get("common_args", None) - force_ota = common_args.get("force_ota", False) if common_args is not None else False - ota_version = common_args.get("ota_version", None) if common_args is not None else None - except Exception as e: - logging.error( - f"Failed to get ota upgrade parameters with Exception {e}. Traceback: {traceback.format_exc()}") - pass - - if force_ota and ota_version is not None: - should_upgrade = True if ota_version != fedml.__version__ else False - upgrade_version = ota_version - else: - try: - fedml_is_latest_version, local_ver, remote_ver = sys_utils.check_fedml_is_latest_version(self.version) - except Exception as e: - logging.error(f"Failed to check fedml version with Exception {e}. Traceback: {traceback.format_exc()}") - return - - should_upgrade = False if fedml_is_latest_version else True - upgrade_version = remote_ver - - if should_upgrade: - FedMLClientDataInterface.get_instance(). 
\ - save_started_job(run_id, self.edge_id, time.time(), - ClientConstants.MSG_MLOPS_CLIENT_STATUS_UPGRADING, - ClientConstants.MSG_MLOPS_CLIENT_STATUS_UPGRADING, - payload) - self.mlops_metrics.report_client_id_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_UPGRADING, run_id=run_id) - - logging.info(f"Upgrade to version {upgrade_version} ...") - - sys_utils.do_upgrade(self.version, upgrade_version) - raise Exception("Restarting after upgraded...") - - def callback_start_train(self, topic, payload): - # Get training params - - request_json = json.loads(payload) - is_retain = request_json.get("is_retain", False) - if is_retain: - return - run_id = request_json["runId"] - - # Start log processor for current run - train_edge_id = str(topic).split("/")[-2] - self.args.run_id = run_id - self.args.edge_id = train_edge_id - MLOpsRuntimeLog.get_instance(self.args).init_logs(log_level=logging.INFO) - MLOpsRuntimeLogDaemon.get_instance(self.args).start_log_processor( - run_id, train_edge_id, log_source=SchedulerConstants.get_log_source(request_json)) - logging.info("start the log processor") - - try: - MLOpsConfigs.fetch_all_configs() - except Exception as e: - logging.error(f"Failed to fetch all configs with Exception {e}. Traceback: {traceback.format_exc()}") - pass - - if not FedMLClientDataInterface.get_instance().get_agent_status(): - request_json = json.loads(payload) - run_id = request_json["runId"] - logging.error( - "FedMLDebug - Receive: topic ({}), payload ({}), but the client agent is disabled. {}".format( - topic, payload, traceback.format_exc() - ) - ) - # Send failed msg when exceptions. - self.mlops_metrics.report_client_id_status( - train_edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_EXCEPTION, run_id=run_id, - msg=f"the client agent {train_edge_id} is disabled") - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, train_edge_id) - return - - logging.info( - f"FedMLDebug - Receive: topic ({topic}), payload ({payload})" - ) - - # Terminate previous process about starting or stopping run command - logging.info("cleanup and save runner information") - server_agent_id = request_json["cloud_agent_id"] - ClientConstants.save_runner_infos(self.args.device_id + "." 
+ self.args.os_name, train_edge_id, run_id=run_id) - - # OTA upgrade - # self.ota_upgrade(payload, request_json) - - # Occupy GPUs - scheduler_match_info = request_json.get("scheduler_match_info", {}) - matched_gpu_num = scheduler_match_info.get("matched_gpu_num", 0) - model_master_device_id = scheduler_match_info.get("model_master_device_id", None) - model_slave_device_id = scheduler_match_info.get("model_slave_device_id", None) - model_slave_device_id_list = scheduler_match_info.get("model_slave_device_id_list", None) - run_config = request_json.get("run_config", {}) - run_params = run_config.get("parameters", {}) - serving_args = run_params.get("serving_args", {}) - endpoint_id = serving_args.get("endpoint_id", None) - job_yaml = run_params.get("job_yaml", {}) - job_type = job_yaml.get("job_type", SchedulerConstants.JOB_TASK_TYPE_TRAIN) - cuda_visible_gpu_ids_str = None - if not (job_type == SchedulerConstants.JOB_TASK_TYPE_SERVE or - job_type == SchedulerConstants.JOB_TASK_TYPE_DEPLOY): - cuda_visible_gpu_ids_str = JobRunnerUtils.get_instance().occupy_gpu_ids( - run_id, matched_gpu_num, train_edge_id, inner_id=endpoint_id, - model_master_device_id=model_master_device_id, - model_slave_device_id=model_slave_device_id) - logging.info( - f"Run started, available gpu ids: {JobRunnerUtils.get_instance().get_available_gpu_id_list(train_edge_id)}") - - # Start server with multiprocessing mode - self.request_json = request_json - run_id_str = str(run_id) - self.running_request_json[run_id_str] = request_json - client_runner = FedMLClientRunner( - self.args, edge_id=train_edge_id, request_json=request_json, agent_config=self.agent_config, run_id=run_id, - cuda_visible_gpu_ids_str=cuda_visible_gpu_ids_str - ) - client_runner.start_request_json = payload - self.run_process_event_map[run_id_str] = multiprocessing.Event() - self.run_process_event_map[run_id_str].clear() - client_runner.run_process_event = self.run_process_event_map[run_id_str] - self.run_process_completed_event_map[run_id_str] = multiprocessing.Event() - self.run_process_completed_event_map[run_id_str].clear() - client_runner.run_process_completed_event = self.run_process_completed_event_map[run_id_str] - client_runner.server_id = request_json.get("server_id", "0") - logging.info("start the runner process.") - self.run_process_map[run_id_str] = Process(target=client_runner.run, args=( - self.run_process_event_map[run_id_str], self.run_process_completed_event_map[run_id_str], - self.message_center.get_message_queue())) - self.run_process_map[run_id_str].start() - ClientConstants.save_run_process(run_id, self.run_process_map[run_id_str].pid) - - def callback_stop_train(self, topic, payload): - # logging.info("callback_stop_train: topic = %s, payload = %s" % (topic, payload)) - # logging.info( - # f"FedMLDebug - Receive: topic ({topic}), payload ({payload})" - # ) - - train_edge_id = str(topic).split("/")[-2] - request_json = json.loads(payload) - is_retain = request_json.get("is_retain", False) - if is_retain: - return - run_id = request_json.get("runId", None) - if run_id is None: - run_id = request_json.get("id", None) - run_status = request_json.get("run_status", ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED) - - # logging.info("Stop run with multiprocessing...") - - # Stop client with multiprocessing mode - run_id_str = str(run_id) - client_runner = FedMLClientRunner( - self.args, edge_id=train_edge_id, request_json=request_json, agent_config=self.agent_config, run_id=run_id - ) - self.cleanup_containers_and_release_gpus(run_id, 
train_edge_id) - client_runner.run_process_event = self.run_process_event_map.get(run_id_str, None) - client_runner.run_process = self.run_process_map.get(run_id_str, None) - client_runner.message_center = self.message_center - client_runner.mlops_metrics = self.mlops_metrics - client_runner.sync_run_stop_status(run_status=run_status) - - def cleanup_containers_and_release_gpus(self, run_id, edge_id): - job_type = JobRunnerUtils.get_job_type_from_run_id(run_id) - - if not job_type: - logging.info(f"Failed to get job type from run id {run_id}. This is not an error as it would usually " - f"happen when the job is not found in the database because job is already finished and " - f"cleaned up. Exiting cleanup_containers_and_release_gpus.") - return - - # Check if the job type is not "serve" or "deploy" - if not (job_type == SchedulerConstants.JOB_TASK_TYPE_SERVE or - job_type == SchedulerConstants.JOB_TASK_TYPE_DEPLOY): - - # Terminate the run docker container if exists - container_name = JobRunnerUtils.get_run_container_name(run_id) - docker_client = JobRunnerUtils.get_docker_client(DockerArgs()) - logging.info(f"Terminating the run docker container {container_name} if exists...") - try: - JobRunnerUtils.remove_run_container_if_exists(container_name, docker_client) - except Exception as e: - logging.error(f"Exception {e} occurred when terminating docker container. " - f"Traceback: {traceback.format_exc()}") - - # Release the GPU ids and update the GPU availability in the persistent store - JobRunnerUtils.get_instance().release_gpu_ids(run_id, edge_id) - - # Send mqtt message reporting the new gpu availability to the backend - MLOpsDevicePerfStats.report_gpu_device_info(self.edge_id, mqtt_mgr=self.mqtt_mgr) - - def cleanup_client_with_status(self): - if self.device_status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED: - # logging.info("received to finished status.") - self.cleanup_run_when_finished() - elif self.device_status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED: - # logging.error("received to failed status from the server agent") - self.cleanup_run_when_starting_failed(should_send_client_id_status=False) - elif self.device_status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED: - # logging.error("received to failed status from the server agent") - self.cleanup_run_when_starting_failed(status=self.device_status, should_send_client_id_status=False) - - def callback_runner_id_status(self, topic, payload): - # logging.info("callback_runner_id_status: topic = %s, payload = %s" % (topic, payload)) - # logging.info(f"FedMLDebug - Receive: topic ({topic}), payload ({payload})") - request_json = json.loads(payload) - is_retain = request_json.get("is_retain", False) - if is_retain: - return - run_id = request_json["run_id"] - edge_id = str(topic).split("/")[-2].split('_')[-1] - status = request_json["status"] - run_id_str = str(run_id) - - self.save_training_status( - edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED - if status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_EXCEPTION else status) - - if status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED or \ - status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED or \ - status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED: - completed_event = self.run_process_completed_event_map.get(run_id_str, None) - if completed_event is not None: - completed_event.set() - - # Stop client with multiprocessing mode - client_runner = FedMLClientRunner( - self.args, - edge_id=edge_id, - request_json=request_json, - 
agent_config=self.agent_config,
- run_id=run_id,
- )
- client_runner.device_status = status
- client_runner.message_center = self.message_center
- client_runner.mlops_metrics = self.mlops_metrics
- client_runner.cleanup_client_with_status()
-
- running_json = self.running_request_json.get(run_id_str)
- if running_json is None:
- try:
- current_job = FedMLClientDataInterface.get_instance().get_job_by_id(run_id)
- running_json = json.loads(current_job.running_json)
- except Exception as e:
- logging.error(f"Failed to get running json with Exception {e}. Traceback: {traceback.format_exc()}")
-
- if running_json is not None:
- job_type = JobRunnerUtils.parse_job_type(running_json)
- if not SchedulerConstants.is_deploy_job(job_type):
- logging.info(f"[run/device][{run_id}/{edge_id}] Release gpu resource when run ended.")
- self.cleanup_containers_and_release_gpus(run_id, edge_id)
-
- run_process = self.run_process_map.get(run_id_str, None)
- if run_process is not None:
- if run_process.pid is not None:
- RunProcessUtils.kill_process(run_process.pid)
-
- # Terminate the run docker container if exists
- try:
- container_name = JobRunnerUtils.get_run_container_name(run_id)
- docker_client = JobRunnerUtils.get_docker_client(DockerArgs())
- logging.info(f"Terminating the run docker container {container_name} if exists...")
- JobRunnerUtils.remove_run_container_if_exists(container_name, docker_client)
- except Exception as e:
- logging.error(f"Error occurred when terminating docker container. "
- f"Exception: {e}, Traceback: {traceback.format_exc()}.")
-
- self.run_process_map.pop(run_id_str)
-
- # Stop log processor for current run
- MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, edge_id)
-
- def callback_report_current_status(self, topic, payload):
- logging.info(
- f"FedMLDebug - Receive: topic ({topic}), payload ({payload})"
- )
-
- self.send_agent_active_msg(self.edge_id)
- if self.general_edge_id is not None:
- self.send_agent_active_msg(self.general_edge_id)
-
- @staticmethod
- def process_ota_upgrade_msg():
- os.system("pip install -U fedml")
-
- @staticmethod
- def callback_client_ota_msg(topic, payload):
- logging.info(
- f"FedMLDebug - Receive: topic ({topic}), payload ({payload})"
- )
-
- request_json = json.loads(payload)
- cmd = request_json["cmd"]
-
- if cmd == ClientConstants.FEDML_OTA_CMD_UPGRADE:
- FedMLClientRunner.process_ota_upgrade_msg()
- # Process(target=FedMLClientRunner.process_ota_upgrade_msg).start()
- raise Exception("After upgraded, restart runner...")
- elif cmd == ClientConstants.FEDML_OTA_CMD_RESTART:
- raise Exception("Restart runner...")
-
- def get_all_run_process_list_map(self):
- run_process_dict = dict()
- for run_id_str, process in self.run_process_map.items():
- cur_run_process_list = ClientConstants.get_learning_process_list(run_id_str)
- run_process_dict[run_id_str] = cur_run_process_list
-
- return run_process_dict
-
- def response_device_info_to_mlops(self, topic, payload):
- payload_json = json.loads(payload)
- server_id = payload_json.get("server_id", 0)
- run_id = payload_json.get("run_id", 0)
- listen_edge_id = str(topic).split("/")[-1]
- context = payload_json.get("context", None)
- need_gpu_info = payload_json.get("need_gpu_info", False)
- need_running_process_list = payload_json.get("need_running_process_list", False)
- response_topic = "deploy/slave_agent/mlops/response_device_info"
- if self.mlops_metrics is not None and self.model_device_client_edge_id_list is not None and \
- self.model_device_server_id is not None:
- if not need_gpu_info:
- device_info_json = {
- "edge_id": listen_edge_id,
- "fedml_version": fedml.__version__,
- "user_id": self.args.user
- }
- else:
- total_mem, free_mem, total_disk_size, free_disk_size, cpu_utilization, cpu_cores, gpu_cores_total, \
- gpu_cores_available, sent_bytes, recv_bytes, gpu_available_ids = sys_utils.get_sys_realtime_stats()
- host_ip = sys_utils.get_host_ip()
- host_port = sys_utils.get_available_port()
- gpu_available_ids = JobRunnerUtils.get_available_gpu_id_list(self.edge_id)
- gpu_available_ids = JobRunnerUtils.trim_unavailable_gpu_ids(gpu_available_ids)
- gpu_cores_available = len(gpu_available_ids)
- gpu_list = sys_utils.get_gpu_list()
- device_info_json = {
- "edge_id": listen_edge_id,
- "memoryTotal": round(total_mem * MLOpsUtils.BYTES_TO_GB, 2),
- "memoryAvailable": round(free_mem * MLOpsUtils.BYTES_TO_GB, 2),
- "diskSpaceTotal": round(total_disk_size * MLOpsUtils.BYTES_TO_GB, 2),
- "diskSpaceAvailable": round(free_disk_size * MLOpsUtils.BYTES_TO_GB, 2),
- "cpuUtilization": round(cpu_utilization, 2),
- "cpuCores": cpu_cores,
- "gpuCoresTotal": gpu_cores_total,
- "gpuCoresAvailable": gpu_cores_available,
- "gpu_available_ids": gpu_available_ids,
- "gpu_list": gpu_list,
- "node_ip": host_ip,
- "node_port": host_port,
- "networkTraffic": sent_bytes + recv_bytes,
- "updateTime": int(MLOpsUtils.get_ntp_time()),
- "fedml_version": fedml.__version__,
- "user_id": self.args.user
- }
- if need_running_process_list:
- device_info_json["run_process_list_map"] = self.get_all_run_process_list_map()
- slave_device_ids = list()
- for model_client_edge_id in self.model_device_client_edge_id_list:
- slave_device_ids.append(model_client_edge_id)
- response_payload = {"slave_device_id": self.model_device_client_edge_id_list[0],
- "slave_device_id_list": slave_device_ids,
- "master_device_id": self.model_device_server_id,
- "run_id": run_id, "edge_id": listen_edge_id,
- "edge_info": device_info_json}
- if context is not None:
- response_payload["context"] = context
- self.message_center.send_message(response_topic, json.dumps(response_payload), run_id=run_id)
-
- def callback_report_device_info(self, topic, payload):
- payload_json = json.loads(payload)
- server_id = payload_json.get("server_id", 0)
- run_id = payload_json.get("run_id", 0)
- listen_edge_id = str(topic).split("/")[-1]
- context = payload_json.get("context", None)
- need_gpu_info = payload_json.get("need_gpu_info", False)
- need_running_process_list = payload_json.get("need_running_process_list", False)
- response_topic = f"client/server/response_device_info/{server_id}"
- if self.mlops_metrics is not None and self.model_device_client_edge_id_list is not None and \
- self.model_device_server_id is not None:
- if not need_gpu_info:
- device_info_json = {
- "edge_id": listen_edge_id,
- "fedml_version": fedml.__version__,
- "user_id": self.args.user
- }
- else:
- total_mem, free_mem, total_disk_size, free_disk_size, cpu_utilization, cpu_cores, gpu_cores_total, \
- gpu_cores_available, sent_bytes, recv_bytes, gpu_available_ids = sys_utils.get_sys_realtime_stats()
- host_ip = sys_utils.get_host_ip()
- host_port = sys_utils.get_available_port()
- gpu_available_ids = JobRunnerUtils.get_available_gpu_id_list(self.edge_id)
- gpu_available_ids = JobRunnerUtils.trim_unavailable_gpu_ids(gpu_available_ids)
- gpu_cores_available = len(gpu_available_ids)
- gpu_list = sys_utils.get_gpu_list()
- device_info_json = {
- "edge_id": listen_edge_id,
- "memoryTotal": round(total_mem * MLOpsUtils.BYTES_TO_GB, 2),
- "memoryAvailable": round(free_mem * MLOpsUtils.BYTES_TO_GB, 2),
- "diskSpaceTotal": round(total_disk_size * MLOpsUtils.BYTES_TO_GB, 2),
- "diskSpaceAvailable": round(free_disk_size * MLOpsUtils.BYTES_TO_GB, 2),
- "cpuUtilization": round(cpu_utilization, 2),
- "cpuCores": cpu_cores,
- "gpuCoresTotal": gpu_cores_total,
- "gpuCoresAvailable": gpu_cores_available,
- "gpu_available_ids": gpu_available_ids,
- "gpu_list": gpu_list,
- "node_ip": host_ip,
- "node_port": host_port,
- "networkTraffic": sent_bytes + recv_bytes,
- "updateTime": int(MLOpsUtils.get_ntp_time()),
- "fedml_version": fedml.__version__,
- "user_id": self.args.user
- }
- if need_running_process_list:
- device_info_json["run_process_list_map"] = self.get_all_run_process_list_map()
- slave_device_ids = list()
- for model_client_edge_id in self.model_device_client_edge_id_list:
- slave_device_ids.append(model_client_edge_id)
- response_payload = {"slave_device_id": self.model_device_client_edge_id_list[0],
- "slave_device_id_list": slave_device_ids,
- "master_device_id": self.model_device_server_id,
- "run_id": run_id, "edge_id": listen_edge_id,
- "edge_info": device_info_json}
- if context is not None:
- response_payload["context"] = context
- self.message_center.send_message(response_topic, json.dumps(response_payload), run_id=run_id)
-
- def callback_client_logout(self, topic, payload):
- payload_json = json.loads(payload)
- secret = payload_json.get("auth", None)
- if secret is None or str(secret) != "246b1be6-0eeb-4b17-b118-7d74de1975d4":
- return
- logging.info("Received the logout request.")
- if self.run_process_event is not None:
- self.run_process_event.set()
- if self.run_process_completed_event is not None:
- self.run_process_completed_event.set()
- self.disable_client_login = True
- time.sleep(3)
- os.system("fedml logout")
-
- def save_training_status(self, edge_id, training_status):
- self.current_training_status = training_status
- ClientConstants.save_training_infos(edge_id, training_status)
-
- @staticmethod
- def get_gpu_machine_id():
- gpu_list = sys_utils.get_gpu_list()
- gpu_uuids = ""
- if len(gpu_list) > 0:
- for gpu in gpu_list:
- gpu_uuids += gpu.get("uuid", "")
- else:
- gpu_uuids = str(uuid.uuid4())
- device_id_combination = \
- f"{FedMLClientRunner.get_machine_id()}-{hex(uuid.getnode())}-{gpu_uuids}"
- device_id = security_utils.get_content_hash(device_id_combination)
- return device_id
-
- @staticmethod
- def get_device_id(use_machine_id=False):
- device_file_path = os.path.join(ClientConstants.get_data_dir(),
- ClientConstants.LOCAL_RUNNER_INFO_DIR_NAME)
- file_for_device_id = os.path.join(device_file_path, "devices.id")
- if not os.path.exists(device_file_path):
- os.makedirs(device_file_path, exist_ok=True)
- elif os.path.exists(file_for_device_id):
- with open(file_for_device_id, 'r', encoding='utf-8') as f:
- device_id_from_file = f.readline()
- if device_id_from_file is not None and device_id_from_file != "":
- return device_id_from_file
-
- if platform.system() == "Darwin":
- cmd_get_serial_num = "system_profiler SPHardwareDataType | grep Serial | awk '{gsub(/ /,\"\")}{print}' " \
- "|awk -F':' '{print $2}' "
- device_id = os.popen(cmd_get_serial_num).read()
- device_id = device_id.replace('\n', '').replace(' ', '')
- if device_id is None or device_id == "":
- if not use_machine_id:
- device_id = hex(uuid.getnode())
- else:
- device_id = FedMLClientRunner.get_gpu_machine_id()
- else:
- device_id = "0x" + device_id
- else:
- if "nt" in os.name:
-
- def get_uuid():
- guid = ""
- try:
- cmd = "wmic csproduct get uuid"
- guid = str(subprocess.check_output(cmd))
- pos1 = guid.find("\\n") + 2
- guid = guid[pos1:-15]
- except Exception as ex:
- logging.error(f"Failed to get uuid with Exception {ex}. Traceback: {traceback.format_exc()}")
- pass
- return str(guid)
-
- device_id = str(get_uuid())
- logging.info(device_id)
- elif "posix" in os.name:
- device_id = sys_utils.get_device_id_in_docker()
- if device_id is None:
- if not use_machine_id:
- device_id = hex(uuid.getnode())
- else:
- device_id = FedMLClientRunner.get_gpu_machine_id()
- else:
- device_id = sys_utils.run_subprocess_open(
- "hal-get-property --udi /org/freedesktop/Hal/devices/computer --key system.hardware.uuid".split()
- )
- device_id = hex(device_id)
-
- if device_id is not None and device_id != "":
- with open(file_for_device_id, 'w', encoding='utf-8') as f:
- f.write(device_id)
- else:
- # hex() does not accept a UUID object; use the string form instead.
- device_id = str(uuid.uuid4())
- with open(file_for_device_id, 'w', encoding='utf-8') as f:
- f.write(device_id)
-
- return device_id
-
- @staticmethod
- def get_machine_id():
- try:
- import machineid
- return machineid.id().replace('\n', '').replace('\r\n', '').strip()
- except Exception as e:
- logging.error(f"Failed to get machine id with Exception {e}. Traceback: {traceback.format_exc()}")
- return hex(uuid.getnode())
-
- @staticmethod
- def bind_account_and_device_id(url, account_id, device_id, os_name, api_key="", role="client"):
- ip = requests.get('https://checkip.amazonaws.com').text.strip()
- fedml_ver, exec_path, os_ver, cpu_info, python_ver, torch_ver, mpi_installed, \
- cpu_usage, available_mem, total_mem, gpu_info, gpu_available_mem, gpu_total_mem, \
- gpu_count, gpu_vendor, cpu_count, gpu_device_name = get_sys_runner_info()
- host_name = sys_utils.get_host_name()
- json_params = {
- "accountid": account_id,
- "deviceid": device_id,
- "type": os_name,
- "state": ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE,
- "status": ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE,
- "processor": cpu_info,
- "core_type": cpu_info,
- "network": "",
- "role": role,
- "os_ver": os_ver,
- "memory": total_mem,
- "ip": ip,
- "api_key": api_key,
- "extra_infos": {"fedml_ver": fedml_ver, "exec_path": exec_path, "os_ver": os_ver,
- "cpu_info": cpu_info, "python_ver": python_ver, "torch_ver": torch_ver,
- "mpi_installed": mpi_installed, "cpu_usage": cpu_usage,
- "available_mem": available_mem, "total_mem": total_mem,
- "cpu_count": cpu_count, "gpu_count": 0, "host_name": host_name}
- }
- if gpu_count > 0:
- if gpu_total_mem is not None:
- json_params["gpu"] = (gpu_info if gpu_info is not None else "") + ", Total GPU Memory: " + gpu_total_mem
- else:
- json_params["gpu"] = gpu_info if gpu_info is not None else ""
- json_params["extra_infos"]["gpu_info"] = gpu_info if gpu_info is not None else ""
- if gpu_available_mem is not None:
- json_params["extra_infos"]["gpu_available_mem"] = gpu_available_mem
- if gpu_total_mem is not None:
- json_params["extra_infos"]["gpu_total_mem"] = gpu_total_mem
-
- json_params["extra_infos"]["gpu_count"] = gpu_count
- json_params["extra_infos"]["gpu_vendor"] = gpu_vendor
- json_params["extra_infos"]["gpu_device_name"] = gpu_device_name
-
- gpu_available_id_list = sys_utils.get_available_gpu_id_list(limit=gpu_count)
- gpu_available_count = len(gpu_available_id_list) if gpu_available_id_list is not None else 0
- gpu_list = sys_utils.get_gpu_list()
- json_params["extra_infos"]["gpu_available_count"] = gpu_available_count
- json_params["extra_infos"]["gpu_available_id_list"] = gpu_available_id_list
- 
json_params["extra_infos"]["gpu_list"] = gpu_list - else: - json_params["gpu"] = "None" - json_params["extra_infos"]["gpu_available_count"] = 0 - json_params["extra_infos"]["gpu_available_id_list"] = [] - json_params["extra_infos"]["gpu_list"] = [] - - _, cert_path = MLOpsConfigs.get_request_params() - if cert_path is not None: - try: - requests.session().verify = cert_path - response = requests.post( - url, json=json_params, verify=True, - headers={"content-type": "application/json", "Connection": "close"} - ) - except requests.exceptions.SSLError as err: - logging.error( - f"Failed to bind account and device id with error: {err}, traceback: {traceback.format_exc()}") - MLOpsConfigs.install_root_ca_file() - response = requests.post( - url, json=json_params, verify=True, - headers={"content-type": "application/json", "Connection": "close"} - ) - else: - response = requests.post(url, json=json_params, headers={"Connection": "close"}) - edge_id, user_name, extra_url, general_edge_id = -1, None, None, None - if response.status_code != 200: - print(f"Binding to MLOps with response.status_code = {response.status_code}, " - f"response.content: {response.content}") - pass - else: - # print("url = {}, response = {}".format(url, response)) - status_code = response.json().get("code") - if status_code == "SUCCESS": - edge_id = response.json().get("data").get("id") - user_name = response.json().get("data").get("userName", None) - extra_url = response.json().get("data").get("url", None) - general_edge_id = response.json().get("data").get("general_edge_id", None) - if edge_id is None or edge_id <= 0: - print(f"Binding to MLOps with response.status_code = {response.status_code}, " - f"response.content: {response.content}") - else: - if status_code == SchedulerConstants.BINDING_ACCOUNT_NOT_EXIST_ERROR: - raise SystemExit(SchedulerConstants.BINDING_ACCOUNT_NOT_EXIST_ERROR) - print(f"Binding to MLOps with response.status_code = {response.status_code}, " - f"response.content: {response.content}") - return -1, None, None, None - return edge_id, user_name, extra_url, general_edge_id - - def fetch_configs(self): - return MLOpsConfigs.fetch_all_configs() - - def send_agent_active_msg(self, edge_id): - active_topic = "flclient_agent/active" - status = MLOpsStatus.get_instance().get_client_agent_status(edge_id) - if ( - status is not None - and status != ClientConstants.MSG_MLOPS_CLIENT_STATUS_OFFLINE - and status != ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE - ): - return - - try: - current_job = FedMLClientDataInterface.get_instance().get_job_by_id(self.run_id) - except Exception as e: - logging.error(f"Failed to get current job with Exception {e}. 
Traceback: {traceback.format_exc()}")
- current_job = None
- if current_job is None:
- if status is not None and status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_OFFLINE:
- status = ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE
- else:
- return
- else:
- status = ClientConstants.get_device_state_from_run_edge_state(current_job.status)
- active_msg = {"ID": edge_id, "status": status}
- MLOpsStatus.get_instance().set_client_agent_status(edge_id, status)
- self.mqtt_mgr.send_message_json(active_topic, json.dumps(active_msg))
- logging.info(f"Send agent active msg {active_msg}")
-
- def recover_start_train_msg_after_upgrading(self):
- try:
- current_job = FedMLClientDataInterface.get_instance().get_current_job()
- if current_job is not None and \
- current_job.status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_UPGRADING:
- logging.info("start training after upgrading.")
- topic_start_train = "flserver_agent/" + str(self.edge_id) + "/start_train"
- self.callback_start_train(topic_start_train, current_job.running_json)
- except Exception as e:
- logging.error(f"recover starting train message after upgrading failed with exception {e}, "
- f"Traceback {traceback.format_exc()}")
-
- def on_agent_mqtt_connected(self, mqtt_client_object):
- # The MQTT message topic format is as follows: <sender>/<receiver>/<action>
-
- # Setup MQTT message listener for starting training
- topic_start_train = "flserver_agent/" + str(self.edge_id) + "/start_train"
- self.add_message_listener(topic_start_train, self.callback_start_train)
- self.mqtt_mgr.add_message_listener(topic_start_train, self.listener_message_dispatch_center)
-
- # Setup MQTT message listener for stopping training
- topic_stop_train = "flserver_agent/" + str(self.edge_id) + "/stop_train"
- self.add_message_listener(topic_stop_train, self.callback_stop_train)
- self.mqtt_mgr.add_message_listener(topic_stop_train, self.listener_message_dispatch_center)
-
- # Setup MQTT message listener for client status switching
- topic_client_status = "fl_client/flclient_agent_" + str(self.edge_id) + "/status"
- self.add_message_listener(topic_client_status, self.callback_runner_id_status)
- self.mqtt_mgr.add_message_listener(topic_client_status, self.listener_message_dispatch_center)
-
- # Setup MQTT message listener to report current device status.
- topic_report_status = "mlops/report_device_status"
- self.add_message_listener(topic_report_status, self.callback_report_current_status)
- self.mqtt_mgr.add_message_listener(topic_report_status, self.listener_message_dispatch_center)
-
- # Setup MQTT message listener for OTA messages from the MLOps.
- topic_ota_msg = "mlops/flclient_agent_" + str(self.edge_id) + "/ota"
- self.add_message_listener(topic_ota_msg, self.callback_client_ota_msg)
- self.mqtt_mgr.add_message_listener(topic_ota_msg, self.listener_message_dispatch_center)
-
- # Setup MQTT message listeners to respond to device info requests.
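- # The topics registered in this handler follow a small set of
- # "<prefix>/<edge_id>/<action>" patterns. A hypothetical helper that makes the
- # scheme explicit (not part of the original file):
- #
- #     def _edge_topic(prefix, edge_id, action=""):
- #         return f"{prefix}/{edge_id}/{action}" if action else f"{prefix}/{edge_id}"
- #
- #     # e.g. _edge_topic("flserver_agent", edge_id, "start_train")
- #     #      _edge_topic("server/client/request_device_info", edge_id)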
- topic_request_device_info = "server/client/request_device_info/" + str(self.edge_id) - self.add_message_listener(topic_request_device_info, self.callback_report_device_info) - self.mqtt_mgr.add_message_listener(topic_request_device_info, self.listener_message_dispatch_center) - - topic_request_edge_device_info_from_mlops = f"deploy/mlops/slave_agent/request_device_info/{self.edge_id}" - self.add_message_listener(topic_request_edge_device_info_from_mlops, self.response_device_info_to_mlops) - self.mqtt_mgr.add_message_listener(topic_request_edge_device_info_from_mlops, self.listener_message_dispatch_center) - - topic_request_deploy_master_device_info_from_mlops = None - if self.model_device_server_id is not None: - topic_request_deploy_master_device_info_from_mlops = f"deploy/mlops/master_agent/request_device_info/{self.model_device_server_id}" - self.add_message_listener(topic_request_deploy_master_device_info_from_mlops, self.response_device_info_to_mlops) - self.mqtt_mgr.add_message_listener(topic_request_deploy_master_device_info_from_mlops, self.listener_message_dispatch_center) - - topic_request_deploy_slave_device_info_from_mlops = None - if self.model_device_client_edge_id_list is not None and len(self.model_device_client_edge_id_list) > 0: - topic_request_deploy_slave_device_info_from_mlops = f"deploy/mlops/slave_agent/request_device_info/{self.model_device_client_edge_id_list[0]}" - self.add_message_listener(topic_request_deploy_slave_device_info_from_mlops, self.response_device_info_to_mlops) - self.mqtt_mgr.add_message_listener(topic_request_deploy_slave_device_info_from_mlops, self.listener_message_dispatch_center) - - # Setup MQTT message listener to logout from MLOps. - topic_client_logout = "mlops/client/logout/" + str(self.edge_id) - self.add_message_listener(topic_client_logout, self.callback_client_logout) - self.mqtt_mgr.add_message_listener(topic_client_logout, self.listener_message_dispatch_center) - - # Subscribe topics for starting train, stopping train and fetching client status. 
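- # Every subscription below requests qos=2 (exactly-once delivery), which
- # MqttManager forwards to the underlying paho-mqtt client. A minimal standalone
- # equivalent, with a hypothetical broker host:
- #
- #     import paho.mqtt.client as mqtt
- #
- #     client = mqtt.Client()
- #     client.connect("broker.example.com", 1883, keepalive=60)
- #     client.subscribe("flserver_agent/1234/start_train", qos=2)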
- mqtt_client_object.subscribe(topic_start_train, qos=2)
- mqtt_client_object.subscribe(topic_stop_train, qos=2)
- mqtt_client_object.subscribe(topic_client_status, qos=2)
- mqtt_client_object.subscribe(topic_report_status, qos=2)
- mqtt_client_object.subscribe(topic_ota_msg, qos=2)
- mqtt_client_object.subscribe(topic_request_device_info, qos=2)
- mqtt_client_object.subscribe(topic_request_edge_device_info_from_mlops, qos=2)
- if topic_request_deploy_master_device_info_from_mlops is not None:
- mqtt_client_object.subscribe(topic_request_deploy_master_device_info_from_mlops, qos=2)
- if topic_request_deploy_slave_device_info_from_mlops is not None:
- mqtt_client_object.subscribe(topic_request_deploy_slave_device_info_from_mlops, qos=2)
- mqtt_client_object.subscribe(topic_client_logout, qos=2)
-
- self.subscribed_topics.clear()
- self.subscribed_topics.append(topic_start_train)
- self.subscribed_topics.append(topic_stop_train)
- self.subscribed_topics.append(topic_client_status)
- self.subscribed_topics.append(topic_report_status)
- self.subscribed_topics.append(topic_ota_msg)
- self.subscribed_topics.append(topic_request_device_info)
- self.subscribed_topics.append(topic_request_edge_device_info_from_mlops)
- if topic_request_deploy_master_device_info_from_mlops is not None:
- self.subscribed_topics.append(topic_request_deploy_master_device_info_from_mlops)
- if topic_request_deploy_slave_device_info_from_mlops is not None:
- self.subscribed_topics.append(topic_request_deploy_slave_device_info_from_mlops)
- self.subscribed_topics.append(topic_client_logout)
-
- # Subscribe the messages for federated learning.
- self.subscribe_fl_msgs()
-
- # Broadcast the first active message.
- self.send_agent_active_msg(self.edge_id)
- if self.general_edge_id is not None:
- self.send_agent_active_msg(self.general_edge_id)
-
- # Echo results
- MLOpsRuntimeLog.get_instance(self.args).enable_show_log_to_stdout()
- worker_deploy_id_list = [model_device_client.edge_id for model_device_client in
- self.model_device_client_list]
- print("\nCongratulations, your device is connected to the FedML MLOps platform successfully!")
- print(f"Your FedML Edge ID is {str(self.edge_id)}, unique device ID is {str(self.unique_device_id)}, "
- f"master deploy ID is {str(self.model_device_server.edge_id)}, "
- f"worker deploy ID is {worker_deploy_id_list}"
- )
- if self.edge_extra_url is not None and self.edge_extra_url != "":
- print(f"You may visit the following url to fill in more information about your device.\n"
- f"{self.edge_extra_url}")
- MLOpsRuntimeLog.get_instance(self.args).enable_show_log_to_stdout(enable=False)
-
- from fedml.core.mlops import sync_deploy_id
- sync_deploy_id(
- self.edge_id, self.model_device_server.edge_id, worker_deploy_id_list)
-
- # Start the message center for the listener
- self.start_listener(sender_message_queue=self.message_center.get_message_queue(),
- agent_config=self.agent_config)
-
- def subscribe_fl_msgs(self):
- if self.general_edge_id is None:
- return
-
- # Setup MQTT message listener for starting training
- topic_start_train = "flserver_agent/" + str(self.general_edge_id) + "/start_train"
- self.add_message_listener(topic_start_train, self.callback_start_train)
- self.mqtt_mgr.add_message_listener(topic_start_train, self.listener_message_dispatch_center)
-
- # Setup MQTT message listener for stopping training
- topic_stop_train = "flserver_agent/" + str(self.general_edge_id) + "/stop_train"
- self.add_message_listener(topic_stop_train, self.callback_stop_train)
- 
self.mqtt_mgr.add_message_listener(topic_stop_train, self.listener_message_dispatch_center)
-
- # Setup MQTT message listener for client status switching
- topic_client_status = "fl_client/flclient_agent_" + str(self.general_edge_id) + "/status"
- self.add_message_listener(topic_client_status, self.callback_runner_id_status)
- self.mqtt_mgr.add_message_listener(topic_client_status, self.listener_message_dispatch_center)
-
- # Setup MQTT message listener to report device info to the server.
- topic_request_device_info = "server/client/request_device_info/" + str(self.general_edge_id)
- self.add_message_listener(topic_request_device_info, self.callback_report_device_info)
- self.mqtt_mgr.add_message_listener(topic_request_device_info, self.listener_message_dispatch_center)
-
- topic_request_device_info_from_mlops = f"deploy/mlops/client_agent/request_device_info/{self.general_edge_id}"
- self.add_message_listener(topic_request_device_info_from_mlops, self.response_device_info_to_mlops)
- self.mqtt_mgr.add_message_listener(topic_request_device_info_from_mlops, self.listener_message_dispatch_center)
-
- # Subscribe topics for starting train, stopping train and fetching client status.
- self.mqtt_mgr.subscribe_msg(topic_start_train)
- self.mqtt_mgr.subscribe_msg(topic_stop_train)
- self.mqtt_mgr.subscribe_msg(topic_client_status)
- self.mqtt_mgr.subscribe_msg(topic_request_device_info)
- self.mqtt_mgr.subscribe_msg(topic_request_device_info_from_mlops)
-
- self.subscribed_topics.append(topic_start_train)
- self.subscribed_topics.append(topic_stop_train)
- self.subscribed_topics.append(topic_client_status)
- self.subscribed_topics.append(topic_request_device_info)
- self.subscribed_topics.append(topic_request_device_info_from_mlops)
-
- def on_agent_mqtt_disconnected(self, mqtt_client_object):
- MLOpsStatus.get_instance().set_client_agent_status(
- self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_OFFLINE
- )
- pass
-
- def setup_agent_mqtt_connection(self, service_config):
- # Setup MQTT connection
- self.mqtt_mgr = MqttManager(
- service_config["mqtt_config"]["BROKER_HOST"],
- service_config["mqtt_config"]["BROKER_PORT"],
- service_config["mqtt_config"]["MQTT_USER"],
- service_config["mqtt_config"]["MQTT_PWD"],
- service_config["mqtt_config"]["MQTT_KEEPALIVE"],
- f"FedML_ClientAgent_Daemon_@{self.user_name}@_@{self.args.current_device_id}@_@{str(uuid.uuid4())}@",
- "flclient_agent/last_will_msg",
- json.dumps({"ID": self.edge_id, "status": ClientConstants.MSG_MLOPS_CLIENT_STATUS_OFFLINE})
- )
- self.agent_config = service_config
-
- # Init local database
- FedMLClientDataInterface.get_instance().create_job_table()
-
- # Start the message center to process edge related messages.
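- # setup_message_center() below wraps FedMLMessageCenter, essentially a sender
- # process draining a message queue. A minimal sketch of that pattern, with
- # hypothetical names, assuming (topic, payload) messages:
- #
- #     import multiprocessing
- #
- #     def _sender_loop(queue, publish):
- #         while True:
- #             topic, payload = queue.get()  # blocks until a message is queued
- #             publish(topic, payload)
- #
- #     msg_queue = multiprocessing.Queue()
- #     multiprocessing.Process(target=_sender_loop, args=(msg_queue, print), daemon=True).start()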
- self.setup_message_center() - - # Start local API services - client_api_cmd = "fedml.computing.scheduler.slave.client_api:api" - client_api_pids = RunProcessUtils.get_pid_from_cmd_line(client_api_cmd) - if client_api_pids is None or len(client_api_pids) <= 0: - python_program = get_python_program() - cur_dir = os.path.dirname(__file__) - fedml_base_dir = os.path.dirname(os.path.dirname(os.path.dirname(cur_dir))) - self.local_api_process = ClientConstants.exec_console_with_script( - "{} -m uvicorn {} --host 0.0.0.0 --port {} " - "--reload --reload-delay 3 --reload-dir {} --log-level critical".format( - python_program, client_api_cmd, ClientConstants.LOCAL_CLIENT_API_PORT, fedml_base_dir), - should_capture_stdout=False, - should_capture_stderr=False - ) - # if self.local_api_process is not None and self.local_api_process.pid is not None: - # print(f"Client local API process id {self.local_api_process.pid}") - - # Setup MQTT connected listener - self.mqtt_mgr.add_connected_listener(self.on_agent_mqtt_connected) - self.mqtt_mgr.add_disconnected_listener(self.on_agent_mqtt_disconnected) - self.mqtt_mgr.connect() - - # Report the IDLE status to MLOps - self.mlops_metrics.report_client_training_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE) - MLOpsStatus.get_instance().set_client_agent_status(self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE) - - # MLOpsRuntimeLogDaemon.get_instance(self.args).stop_all_log_processor() - self.recover_start_train_msg_after_upgrading() - - infer_host = os.getenv("FEDML_INFER_HOST", None) - infer_redis_addr = os.getenv("FEDML_INFER_REDIS_ADDR", None) - infer_redis_port = os.getenv("FEDML_INFER_REDIS_PORT", None) - infer_redis_password = os.getenv("FEDML_INFER_REDIS_PASSWORD", None) - model_client_num = os.getenv("FEDML_MODEL_WORKER_NUM", None) - os.environ["FEDML_CURRENT_EDGE_ID"] = str(self.edge_id) - - if not ComputeCacheManager.get_instance().set_redis_params(): - os.environ["FEDML_DISABLE_REDIS_CONNECTION"] = "1" - - if self.model_device_client_edge_id_list is None: - self.model_device_client_edge_id_list = list() - if self.model_device_client_list is None: - model_client_num = 1 if model_client_num is None else int(model_client_num) - self.model_device_client_list = list() - for client_index in range(model_client_num): - model_device_client = FedMLModelDeviceClientRunner( - self.args, f"{self.args.current_device_id}_{client_index + 1}", self.args.os_name, - self.args.is_from_docker, self.agent_config) - if infer_host is not None: - model_device_client.infer_host = infer_host - if infer_redis_addr is not None: - model_device_client.redis_addr = infer_redis_addr - if infer_redis_port is not None: - model_device_client.redis_port = infer_redis_port - if infer_redis_password is not None: - model_device_client.redis_password = infer_redis_password - model_device_client.start() - self.model_device_client_list.append(model_device_client) - self.model_device_client_edge_id_list.append(model_device_client.get_edge_id()) - - if self.model_device_server is None: - self.model_device_server = FedMLModelDeviceServerRunner(self.args, self.args.current_device_id, - self.args.os_name, self.args.is_from_docker, - self.agent_config) - if infer_host is not None: - self.model_device_server.infer_host = infer_host - if infer_redis_addr is not None: - self.model_device_server.redis_addr = infer_redis_addr - if infer_redis_port is not None: - self.model_device_server.redis_port = infer_redis_port - if infer_redis_password is not None: - 
self.model_device_server.redis_password = infer_redis_password - - self.model_device_server.start() - self.model_device_server_id = self.model_device_server.get_edge_id() - - JobCleanup.get_instance().sync_data_on_startup(self.edge_id) - - os.environ["FEDML_DEPLOY_MASTER_ID"] = str(self.model_device_server.get_edge_id()) - os.environ["FEDML_DEPLOY_WORKER_IDS"] = str([client.get_edge_id() for client in self.model_device_client_list]) - self.mlops_metrics.stop_device_realtime_perf() - self.mlops_metrics.report_device_realtime_perf(self.args, service_config["mqtt_config"]) - - def start_agent_mqtt_loop(self): - # Start MQTT message loop - try: - self.mqtt_mgr.loop_forever() - except Exception as e: - logging.error(f"Errors in the MQTT loop: Exception {e}, Traceback: {traceback.format_exc()}") - if str(e) == "Restarting after upgraded...": - logging.info("Restarting after upgraded...") - else: - logging.info("Client tracing: {}".format(traceback.format_exc())) - finally: - print("finally") - login_exit_file = os.path.join(ClientConstants.get_log_file_dir(), "exited.log") - with open(login_exit_file, "w") as f: - f.writelines(f"{os.getpid()}.") - - self.stop_agent() - - time.sleep(5) - sys_utils.cleanup_all_fedml_client_login_processes( - ClientConstants.CLIENT_LOGIN_PROGRAM, clean_process_group=False) - sys.exit(1) - - def stop_agent(self): - if self.run_process_event is not None: - self.run_process_event.set() - - if self.model_device_server is not None: - self.model_device_server.stop() - self.model_device_server = None - - if self.model_device_client_list is not None: - for model_client in self.model_device_client_list: - model_client.stop() - self.model_device_client_list.clear() - self.model_device_client_list = None - - if self.mqtt_mgr is not None: - try: - for topic in self.subscribed_topics: - self.mqtt_mgr.unsubscribe_msg(topic) - except Exception as e: - logging.error(f"Unsubscribe topics error: {e}, Traceback: {traceback.format_exc()}") - pass - - self.mqtt_mgr.loop_stop() - self.mqtt_mgr.disconnect() - - self.release_message_center() - - def get_runner(self): - runner = FedMLClientRunner( - self.args, edge_id=self.edge_id, request_json=self.request_json, - agent_config=self.agent_config, run_id=self.run_id, - cuda_visible_gpu_ids_str=self.cuda_visible_gpu_ids_str - ) - runner.edge_user_name = self.user_name - runner.edge_extra_url = self.edge_extra_url - runner.unique_device_id = self.unique_device_id - runner.user_name = self.user_name - runner.general_edge_id = self.general_edge_id - runner.model_device_client_edge_id_list = self.model_device_client_edge_id_list - runner.model_device_server_id = self.model_device_server_id - return runner
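- # Reconstructed from the methods above, the typical startup sequence for this
- # (deprecated) client agent was roughly as follows; service_config is assumed to
- # carry the "mqtt_config" block consumed by setup_agent_mqtt_connection():
- #
- #     runner = FedMLClientRunner(args, edge_id=edge_id, agent_config=service_config)
- #     runner.setup_agent_mqtt_connection(service_config)
- #     runner.start_agent_mqtt_loop()  # blocks in mqtt_mgr.loop_forever()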