Skip to content

Commit

Permalink
fix: bug in get_available_cores within container (#546)
Browse files Browse the repository at this point in the history
Instead of counting all devices under /sys/class/neuron_device/, we now count the devices under /dev that correspond to the neuron major.
  • Loading branch information
oOraph authored Apr 2, 2024
1 parent e5238d7 commit bb66802
Showing 1 changed file with 37 additions and 1 deletion.
38 changes: 37 additions & 1 deletion optimum/neuron/modeling_decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@
"""Base class for text-generation model architectures on neuron devices."""

import copy
import functools
import logging
import os
import re
import shutil
from pathlib import Path
from tempfile import TemporaryDirectory
Expand All @@ -34,6 +36,10 @@
from .utils.version_utils import check_compiler_compatibility, get_neuronxcc_version


# Matches a device file name made of "neuron" followed by one or more digits
# (e.g. "neuron0"), case-insensitively.
NEURON_DEV_PATTERN = re.compile(r"^neuron\d+$", re.IGNORECASE)
# File that is scanned to find the major number of the neuron device.
MAJORS_FILE = "/proc/devices"
# Matches a line of the form "<number> neuron" (surrounding whitespace allowed)
# and captures the number as group 1.
NEURON_MAJOR_LINE = re.compile(r"^\s*(\d+)\s+neuron\s*$")

if is_transformers_neuronx_available():
from transformers_neuronx.config import ContinuousBatchingConfig, NeuronConfig
from transformers_neuronx.module import save_split
Expand All @@ -50,14 +56,44 @@ def get_exporter(config, task):
return TasksManager.get_exporter_config_constructor(model_type=config.model_type, exporter="neuron", task=task)()


# Note: with python 3.9, functools.cache would be more suited
@functools.lru_cache()
def get_neuron_major() -> int:
    """Return the major device number registered for the neuron device.

    The number is looked up in ``MAJORS_FILE`` by matching each line against
    ``NEURON_MAJOR_LINE``; the first matching line wins. Returned values are
    cached by ``functools.lru_cache``.

    Returns:
        The neuron major number, or -1 if it cannot be determined (no
        matching line, or ``MAJORS_FILE`` cannot be read).
    """
    try:
        with open(MAJORS_FILE, "r") as f:
            # Iterate lazily rather than loading every line with readlines().
            for line in f:
                m = NEURON_MAJOR_LINE.match(line)
                if m:
                    return int(m.group(1))
    except OSError as e:
        # A missing or unreadable devices file is reported like a missing
        # entry (-1) so that callers can handle it instead of crashing.
        logger.error("Could not read %s: %s", MAJORS_FILE, e)
        return -1
    logger.error("No major for neuron device could be found in /proc/devices!")
    return -1


@requires_transformers_neuronx
def get_available_cores() -> int:
"""A helper to get the number of available cores.
This number depends first on the actual number of cores, then on the
content of the NEURON_RT_NUM_CORES and NEURON_RT_VISIBLE_CORES variables.
"""
max_cores = len(os.listdir("/sys/class/neuron_device/")) * 2
device_count = 0
neuron_major = get_neuron_major()
root, _, files = next(os.walk("/dev"))
# Just look for devices in dev, non recursively
for f in files:
if neuron_major > 0:
try:
dev_major = os.major(os.stat("{}/{}".format(root, f)).st_rdev)
if dev_major == neuron_major:
device_count += 1
except FileNotFoundError:
# Just to avoid race conditions where some devices would be deleted while running this
pass
else:
# We were not able to get the neuron major properly, so we fall back on counting neuron devices based on the
# device name
if NEURON_DEV_PATTERN.match(f):
device_count += 1
max_cores = device_count * 2
num_cores = os.environ.get("NEURON_RT_NUM_CORES", max_cores)
if num_cores != max_cores:
num_cores = int(num_cores)
Expand Down

0 comments on commit bb66802

Please sign in to comment.