diff --git a/app/main.py b/app/main.py
index c2acdd9a..a9a4733f 100755
--- a/app/main.py
+++ b/app/main.py
@@ -76,6 +76,14 @@
     help="Config directory path",
 )
 
+argparser.add_argument(
+    "-d",
+    "--database",
+    dest="databases",
+    default="/opt/nebula",
+    help="Nebula databases path",
+)
+
 argparser.add_argument(
     "-l",
     "--logs",
diff --git a/nebula/addons/waf/Dockerfile-waf b/nebula/addons/waf/Dockerfile-waf
index 0af60bdc..e0b0fdf3 100755
--- a/nebula/addons/waf/Dockerfile-waf
+++ b/nebula/addons/waf/Dockerfile-waf
@@ -36,7 +36,7 @@ RUN cp nginx-modules/nginx-1.24.0/objs/ngx_http_geoip2_module.so /usr/lib/nginx/modules
 RUN cp nginx-modules/nginx-1.24.0/objs/ngx_stream_geoip2_module.so /usr/lib/nginx/modules
 
 # geoip2 database downloaded
-RUN wget https://git.io/GeoLite2-Country.mmdb -P /usr/share/GeoIP/
+RUN wget https://github.com/P3TERX/GeoLite.mmdb/raw/download/GeoLite2-Country.mmdb -P /usr/share/GeoIP/
 
 # nginx configuration files
 COPY default.conf /etc/nginx/templates/conf.d/default.conf.template
diff --git a/nebula/addons/waf/default.conf b/nebula/addons/waf/default.conf
index e2ee5892..0930d99c 100755
--- a/nebula/addons/waf/default.conf
+++ b/nebula/addons/waf/default.conf
@@ -12,7 +12,7 @@ server {
     listen 80 default_server;
     server_name localhost;
 
-    set $upstream http://nebula-nebula-frontend; # Change this
+    set $upstream http://nebula_nebula-frontend; # Change this
    set $always_redirect off;
    modsecurity on;
    location /nebula {
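Note: the new -d/--database option above is what feeds the frontend's database volume mount in controller.py below. A minimal sketch of how the flag parses, using only the parser fragment shown in this diff (the rest of the parser setup is assumed):

```python
# Minimal sketch of the new option, mirroring the app/main.py hunk above.
import argparse

argparser = argparse.ArgumentParser()
argparser.add_argument(
    "-d",
    "--database",
    dest="databases",
    default="/opt/nebula",
    help="Nebula databases path",
)

args = argparser.parse_args(["--database", "/data/nebula"])
print(args.databases)  # -> /data/nebula; when the flag is omitted, /opt/nebula
```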
diff --git a/nebula/controller.py b/nebula/controller.py
index 96b7e8c6..7054d2b4 100755
--- a/nebula/controller.py
+++ b/nebula/controller.py
@@ -339,6 +339,7 @@ def __init__(self, args):
         self.statistics_port = args.statsport if hasattr(args, "statsport") else 8080
         self.simulation = args.simulation
         self.config_dir = args.config
+        self.db_dir = args.databases if hasattr(args, "databases") else "/opt/nebula"
         self.test = args.test if hasattr(args, "test") else False
         self.log_dir = args.logs
         self.cert_dir = args.certs
@@ -453,6 +454,7 @@ def start(self):
         else:
             self.run_frontend()
             logging.info(f"NEBULA Frontend is running at http://localhost:{self.frontend_port}")
+            logging.info(f"NEBULA Databases created in {self.db_dir}")
 
         # Watchdog for running additional scripts in the host machine (i.e. during the execution of a federation)
         event_handler = NebulaEventHandler()
@@ -516,7 +518,7 @@ def run_controller_api(self):
         )
 
     def run_waf(self):
-        network_name = f"{os.environ['USER']}-nebula-net-base"
+        network_name = f"{os.environ['USER']}_nebula-net-base"
         base = DockerUtils.create_docker_network(network_name)
         client = docker.from_env()
@@ -537,7 +539,7 @@
         container_id_waf = client.api.create_container(
             image="nebula-waf",
-            name=f"{os.environ['USER']}-nebula-waf",
+            name=f"{os.environ['USER']}_nebula-waf",
             detach=True,
             volumes=volumes_waf,
             host_config=host_config_waf,
@@ -571,7 +573,7 @@
         container_id = client.api.create_container(
             image="nebula-waf-grafana",
-            name=f"{os.environ['USER']}-nebula-waf-grafana",
+            name=f"{os.environ['USER']}_nebula-waf-grafana",
             detach=True,
             environment=environment,
             host_config=host_config,
@@ -595,7 +597,7 @@
         container_id_loki = client.api.create_container(
             image="nebula-waf-loki",
-            name=f"{os.environ['USER']}-nebula-waf-loki",
+            name=f"{os.environ['USER']}_nebula-waf-loki",
             detach=True,
             command=command,
             host_config=host_config_loki,
@@ -619,7 +621,7 @@
         container_id_promtail = client.api.create_container(
             image="nebula-waf-promtail",
-            name=f"{os.environ['USER']}-nebula-waf-promtail",
+            name=f"{os.environ['USER']}_nebula-waf-promtail",
             detach=True,
             volumes=volumes_promtail,
             host_config=host_config_promtail,
@@ -646,7 +648,7 @@ def run_frontend(self):
         except Exception:
             logging.info("No GPU available for the frontend, nodes will be deploy in CPU mode")
 
-        network_name = f"{os.environ['USER']}-nebula-net-base"
+        network_name = f"{os.environ['USER']}_nebula-net-base"
 
         # Create the Docker network
         base = DockerUtils.create_docker_network(network_name)
@@ -681,6 +683,7 @@
                 f"{self.root_path}:/nebula",
                 "/var/run/docker.sock:/var/run/docker.sock",
                 f"{self.root_path}/nebula/frontend/config/nebula:/etc/nginx/sites-available/default",
+                f"{self.db_dir}/databases:/nebula/nebula/frontend/databases"
             ],
             extra_hosts={"host.docker.internal": "host-gateway"},
             port_bindings={80: self.frontend_port, 8080: self.statistics_port},
@@ -692,7 +695,7 @@
         container_id = client.api.create_container(
             image="nebula-frontend",
-            name=f"{os.environ['USER']}-nebula-frontend",
+            name=f"{os.environ['USER']}_nebula-frontend",
             detach=True,
             environment=environment,
             volumes=volumes,
@@ -708,16 +711,17 @@ def run_test(self):
     @staticmethod
     def stop_waf():
-        DockerUtils.remove_containers_by_prefix(f"{os.environ['USER']}-nebula-waf")
+        DockerUtils.remove_containers_by_prefix(f"{os.environ['USER']}_nebula-waf")
 
     @staticmethod
     def stop():
         logging.info("Closing NEBULA (exiting from components)... Please wait")
-        DockerUtils.remove_containers_by_prefix(f"{os.environ['USER']}")
+        DockerUtils.remove_containers_by_prefix(f"{os.environ['USER']}_nebula")
+        DockerUtils.remove_containers_by_prefix(f"nebula_")
         ScenarioManagement.stop_blockchain()
         ScenarioManagement.stop_participants()
         Controller.stop_waf()
-        DockerUtils.remove_docker_networks_by_prefix(f"{os.environ['USER']}")
+        DockerUtils.remove_docker_networks_by_prefix(f"{os.environ['USER']}_")
         controller_pid_file = os.path.join(os.path.dirname(__file__), "controller.pid")
         try:
             with open(controller_pid_file) as f:
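Note: the hyphen-to-underscore rename matters for the teardown above, because stop() removes containers by name prefix. A standalone sketch of the assumed remove_containers_by_prefix semantics, using docker-py; the project's own DockerUtils implementation may differ:

```python
# Illustrative sketch of prefix-based removal, assuming docker-py.
# DockerUtils.remove_containers_by_prefix is the project's helper; this
# standalone version only demonstrates the matching semantics.
import docker

def remove_containers_by_prefix(prefix: str) -> None:
    client = docker.from_env()
    for container in client.containers.list(all=True):
        if container.name.startswith(prefix):
            container.remove(force=True)

# For user "alice", the old prefix f"{os.environ['USER']}" ("alice") would
# also match containers belonging to user "alice2"; the new "alice_nebula"
# prefix only matches alice's own NEBULA containers.
```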
diff --git a/nebula/core/training/lightning.py b/nebula/core/training/lightning.py
index 98bc841f..59552275 100755
--- a/nebula/core/training/lightning.py
+++ b/nebula/core/training/lightning.py
@@ -185,7 +185,7 @@ def create_trainer(self):
             self._trainer = Trainer(
                 callbacks=[ModelSummary(max_depth=1), NebulaProgressBar()],
                 max_epochs=self.epochs,
-                accelerator=self.config.participant["device_args"]["accelerator"],
+                accelerator="gpu",
                 devices=gpu_index,
                 logger=self._logger,
                 enable_checkpointing=False,
@@ -197,7 +197,7 @@
             self._trainer = Trainer(
                 callbacks=[ModelSummary(max_depth=1), NebulaProgressBar()],
                 max_epochs=self.epochs,
-                accelerator=self.config.participant["device_args"]["accelerator"],
+                accelerator="cpu",
                 devices="auto",
                 logger=self._logger,
                 enable_checkpointing=False,
diff --git a/nebula/frontend/app.py b/nebula/frontend/app.py
index 3635b66e..2f3e09e2 100755
--- a/nebula/frontend/app.py
+++ b/nebula/frontend/app.py
@@ -472,15 +472,15 @@ async def check_enough_resources():
     resources = await get_host_resources()
 
     mem_percent = resources.get("memory_percent")
-    gpu_memory_percent = resources.get("gpu_memory_percent", [])
+    # gpu_memory_percent = resources.get("gpu_memory_percent", [])
     # if cpu_percent >= settings.resources_threshold or mem_percent >= settings.resources_threshold:
     if mem_percent >= settings.resources_threshold:
         return False
-    elif len(gpu_memory_percent) > 0:
-        for gpu_mem in gpu_memory_percent:
-            if gpu_mem >= settings.resources_threshold:
-                return False
+    # elif len(gpu_memory_percent) > 0:
+    #     for gpu_mem in gpu_memory_percent:
+    #         if gpu_mem >= settings.resources_threshold:
+    #             return False
 
     return True
@@ -493,17 +493,17 @@ async def monitor_resources(user):
         if not enough_resources:
             running_scenario = get_running_scenario(user)
             if running_scenario:
-                # Wich card has big memory consumption
-                gpu = await get_least_memory_gpu()
-                # Stop scenario if is using the high memory gpu
+                # # Wich card has big memory consumption
+                # gpu = await get_least_memory_gpu()
+                # # Stop scenario if is using the high memory gpu
                 running_scenario_as_dict = dict(running_scenario)
-                if running_scenario_as_dict["gpu_id"] == gpu.get("available_gpu_index"):
-                    scenario_name = running_scenario_as_dict["name"]
-                    stop_scenario(scenario_name, user)
-                    user_data.scenarios_list_length -= 1
-                    user_data.finish_scenario_event.set()
+                scenario_name = running_scenario_as_dict["name"]
+                # if running_scenario_as_dict["gpu_id"] == gpu.get("available_gpu_index"):
+                stop_scenario(scenario_name, user)
+                user_data.scenarios_list_length -= 1
+                user_data.finish_scenario_event.set()
 
-        await asyncio.sleep(5)
+        await asyncio.sleep(20)
@@ -876,9 +876,9 @@ def stop_scenario(scenario_name, user):
     from nebula.scenarios import ScenarioManagement
 
     ScenarioManagement.stop_participants(scenario_name)
-    DockerUtils.remove_containers_by_prefix(f"{os.environ.get('NEBULA_CONTROLLER_NAME')}-{user}-participant")
+    DockerUtils.remove_containers_by_prefix(f"{os.environ.get('NEBULA_CONTROLLER_NAME')}_{user}-participant")
     DockerUtils.remove_docker_network(
-        f"{(os.environ.get('NEBULA_CONTROLLER_NAME'))}-{str(user).lower()}-nebula-net-scenario"
+        f"{(os.environ.get('NEBULA_CONTROLLER_NAME'))}_{str(user).lower()}-nebula-net-scenario"
     )
     ScenarioManagement.stop_blockchain()
     scenario_set_status_to_finished(scenario_name)
@@ -1227,19 +1227,42 @@
-async def assign_available_gpu(scenario_data, role):
+async def assign_available_gpu(scenario_data, role):
+    available_gpus = []
+
     if scenario_data["accelerator"] == "cpu":
         scenario_data["gpu_id"] = []
     else:
         response = await get_available_gpus()
-        available_gpus = response.get("available_gpus")
-        if len(available_gpus) > 0:
-            if role == "user":
-                scenario_data["gpu_id"] = [available_gpus.pop()]
-            elif role == "admin":
-                scenario_data["gpu_id"] = available_gpus
-            else:
-                scenario_data["gpu_id"] = []
+        # Obtain available system_gpus
+        available_system_gpus = response.get("available_gpus")
+        running_scenarios = get_running_scenario(get_all=True)
+        # Obtain currently used gpus
+        if running_scenarios:
+            running_gpus = []
+            for scenario in running_scenarios:
+                scenario_gpus = json.loads(scenario["gpu_id"])
+                for gpu in scenario_gpus:
+                    if gpu not in running_gpus:
+                        running_gpus.append(gpu)
+
+            # Obtain gpus that are not in use
+            for gpu in available_system_gpus:
+                if gpu not in running_gpus:
+                    available_gpus.append(gpu)
+        else:
+            available_gpus = available_system_gpus
+
+        # Assign gpus based on user role
+        if len(available_gpus) > 0:
+            if role == "user":
+                scenario_data["gpu_id"] = [available_gpus.pop()]
+            elif role == "admin":
+                scenario_data["gpu_id"] = available_gpus
+            else:
+                scenario_data["gpu_id"] = []
+        else:
+            scenario_data["gpu_id"] = []
 
     return scenario_data
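Note: the rewritten assign_available_gpu subtracts GPUs held by running scenarios from the system's available GPUs before assigning by role (a user pops one GPU, an admin takes all remaining, anyone else gets none). The filtering reduces to a pure function; this sketch assumes only the JSON-encoded gpu_id column shown in the diff, and the helper name is illustrative:

```python
# Pure-function sketch of the GPU filtering in assign_available_gpu above.
import json

def free_gpus(available_system_gpus: list, running_scenarios: list) -> list:
    running = set()
    for scenario in running_scenarios:
        # gpu_id is stored as a JSON-encoded list, per the diff
        running.update(json.loads(scenario["gpu_id"]))
    return [gpu for gpu in available_system_gpus if gpu not in running]

# The system reports GPUs 0-3, but running scenarios hold 0 and 2:
print(free_gpus([0, 1, 2, 3], [{"gpu_id": "[0]"}, {"gpu_id": "[2]"}]))  # [1, 3]
```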
diff --git a/nebula/frontend/database.py b/nebula/frontend/database.py
index dbeff58e..c855b00b 100755
--- a/nebula/frontend/database.py
+++ b/nebula/frontend/database.py
@@ -1,5 +1,6 @@
 import asyncio
 import datetime
+import logging
 import sqlite3
 
 import aiosqlite
@@ -23,10 +24,15 @@
 
 
 async def setup_database(db_file_location):
-    async with aiosqlite.connect(db_file_location) as db:
-        for pragma in PRAGMA_SETTINGS:
-            await db.execute(pragma)
-        await db.commit()
+    try:
+        async with aiosqlite.connect(db_file_location) as db:
+            for pragma in PRAGMA_SETTINGS:
+                await db.execute(pragma)
+            await db.commit()
+    except PermissionError:
+        logging.info("No permission to create the databases. Change the default databases directory")
+    except Exception as e:
+        logging.exception(f"An error has occurred during setup_database: {e}")
 
 
 async def ensure_columns(conn, table_name, desired_columns):
@@ -545,7 +551,7 @@
             print(f"Database error: {e}")
 
 
-def get_running_scenario(username=None):
+def get_running_scenario(username=None, get_all=False):
     with sqlite3.connect(scenario_db_file_location) as conn:
         conn.row_factory = sqlite3.Row
         c = conn.cursor()
@@ -556,11 +562,15 @@
             WHERE (status = ? OR status = ?) AND username = ?;
             """
             c.execute(command, ("running", "completed", username))
+
+            result = c.fetchone()
         else:
             command = "SELECT * FROM scenarios WHERE status = ? OR status = ?;"
             c.execute(command, ("running", "completed"))
-
-        result = c.fetchone()
+            if get_all:
+                result = c.fetchall()
+            else:
+                result = c.fetchone()
 
         return result
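Note: get_running_scenario now has asymmetric return shapes, which callers need to know about. An illustrative usage sketch (argument values are examples):

```python
# Return shapes of get_running_scenario after this change (illustrative):
row = get_running_scenario(username="alice")  # one sqlite3.Row or None (fetchone)
rows = get_running_scenario(get_all=True)     # list of sqlite3.Row (fetchall)
one = get_running_scenario()                  # one sqlite3.Row or None (fetchone)
# get_all only takes effect in the no-username branch; callers that iterate
# the result, such as assign_available_gpu above, must pass get_all=True.
```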
diff --git a/nebula/frontend/databases/__init__.py b/nebula/frontend/databases/__init__.py
deleted file mode 100755
index e69de29b..00000000
diff --git a/nebula/scenarios.py b/nebula/scenarios.py
index ea9dffc1..e93ad3ad 100644
--- a/nebula/scenarios.py
+++ b/nebula/scenarios.py
@@ -253,7 +253,7 @@ def __init__(self, scenario, user=None):
         # Assign the controller endpoint
         if self.scenario.deployment == "docker":
-            self.controller = f"{os.environ.get('NEBULA_CONTROLLER_NAME')}-nebula-frontend"
+            self.controller = f"{os.environ.get('NEBULA_CONTROLLER_NAME')}_nebula-frontend"
         else:
             self.controller = f"127.0.0.1:{os.environ.get('NEBULA_FRONTEND_PORT')}"
@@ -646,7 +646,7 @@ def start_nodes_docker(self):
         logging.info("Starting nodes using Docker Compose...")
         logging.info(f"env path: {self.env_path}")
 
-        network_name = f"{os.environ.get('NEBULA_CONTROLLER_NAME')}-{str(self.user).lower()}-nebula-net-scenario"
+        network_name = f"{os.environ.get('NEBULA_CONTROLLER_NAME')}_{str(self.user).lower()}-nebula-net-scenario"
 
         # Create the Docker network
         base = DockerUtils.create_docker_network(network_name)
@@ -658,7 +658,7 @@
         container_ids = []
         for idx, node in enumerate(self.config.participants):
             image = "nebula-core"
-            name = f"{os.environ.get('NEBULA_CONTROLLER_NAME')}-{self.user}-participant{node['device_args']['idx']}"
+            name = f"{os.environ.get('NEBULA_CONTROLLER_NAME')}_{self.user}-participant{node['device_args']['idx']}"
 
             if node["device_args"]["accelerator"] == "gpu":
                 environment = {"NVIDIA_DISABLE_REQUIRE": True}
@@ -691,7 +691,7 @@
                     f"{network_name}": client.api.create_endpoint_config(
                         ipv4_address=f"{base}.{i}",
                     ),
-                    f"{os.environ.get('NEBULA_CONTROLLER_NAME')}-nebula-net-base": client.api.create_endpoint_config(),
+                    f"{os.environ.get('NEBULA_CONTROLLER_NAME')}_nebula-net-base": client.api.create_endpoint_config(),
                     "chainnet": client.api.create_endpoint_config(),
                 })
             else:
@@ -699,7 +699,7 @@
                 networking_config = client.api.create_networking_config({
                     f"{network_name}": client.api.create_endpoint_config(
                         ipv4_address=f"{base}.{i}",
                    ),
-                    f"{os.environ.get('NEBULA_CONTROLLER_NAME')}-nebula-net-base": client.api.create_endpoint_config(),
+                    f"{os.environ.get('NEBULA_CONTROLLER_NAME')}_nebula-net-base": client.api.create_endpoint_config(),
                 })
 
             node["tracking_args"]["log_dir"] = "/nebula/app/logs"
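Note: taken together, the renames settle on `<user>_nebula-<component>` for controller-level resources and `<controller>_<user>-<component>` for per-scenario ones. An illustrative sketch of the names produced by the start_nodes_docker hunks above; the environment value and user are example assumptions:

```python
# Example of the identifiers built in start_nodes_docker after this change.
import os

os.environ.setdefault("NEBULA_CONTROLLER_NAME", "nebula")  # example value only
user = "alice"

network_name = f"{os.environ.get('NEBULA_CONTROLLER_NAME')}_{str(user).lower()}-nebula-net-scenario"
participant_name = f"{os.environ.get('NEBULA_CONTROLLER_NAME')}_{user}-participant0"

print(network_name)      # nebula_alice-nebula-net-scenario
print(participant_name)  # nebula_alice-participant0
```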