You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
2024-11-09 03:23:10,955 (73b4ed6f99d5:runner): WARNING/MainProcess - resources.py - Lock dir "/var/lib/terra/lock/73b4ed6f99d5/7/gpu" is not empty. Deleting it now for soft-lock support.
Exception in runner on 73b4ed6f99d5
Traceback (most recent call last):
File "/usr/local/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/local/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/src/dsm/service_runners/crop_and_calibrate.py", line 18, in <module>
from dsm.tasks.crop_and_calibrate import crop_and_calibrate_task
File "/src/dsm/tasks/__init__.py", line 16, in <module>
ResourceManager.register_resource('gpu', GPU_INDICES, WORKERS_PER_GPU)
File "/src/external/terra/terra/executor/resources.py", line 438, in register_resource
resource = Resource(name, *args, **kwargs)
File "/src/external/terra/terra/executor/resources.py", line 169, in __init__
rmtree(self.lock_dir)
File "/usr/local/lib/python3.10/shutil.py", line 725, in rmtree
_rmtree_safe_fd(fd, path, onerror)
File "/usr/local/lib/python3.10/shutil.py", line 681, in _rmtree_safe_fd
onerror(os.unlink, fullname, sys.exc_info())
File "/usr/local/lib/python3.10/shutil.py", line 679, in _rmtree_safe_fd
os.unlink(entry.name, dir_fd=topfd)
FileNotFoundError: [Errno 2] No such file or directory: 'tmp5dj5n3rv'
Exception ignored in: <function Resource.__del__ at 0x7fc1877f1b40>
Traceback (most recent call last):
File "/src/external/terra/terra/executor/resources.py", line 333, in __del__
if self.is_locked:
File "/src/external/terra/terra/executor/resources.py", line 203, in is_locked
if self._local.lock is None:
AttributeError: 'Resource' object has no attribute '_local'
The text was updated successfully, but these errors were encountered:
Exception in runner on 587d4f32f127
concurrent.futures.process._RemoteTraceback:
"""
Traceback (most recent call last):
File "/usr/local/lib/python3.10/concurrent/futures/process.py", line 246, in _process_worker
r = call_item.fn(*call_item.args, **call_item.kwargs)
File "/src/external/terra/terra/task.py", line 135, in __call__
return_value = self.run(*args, **kwargs)
...
with ResourceManager.get_resource('gpu') as gpu_index:
File "/src/external/terra/terra/executor/resources.py", line 326, in __enter__
return self.acquire()
File "/src/external/terra/terra/executor/resources.py", line 255, in acquire
return self._acquire(lock_file, resource_index, repeat)
File "/src/external/terra/terra/executor/resources.py", line 209, in _acquire
lock.acquire()
File "/venv/src-NVTF7jWz/lib/python3.10/site-packages/filelock/_api.py", line 182, in acquire
self._acquire()
File "/venv/src-NVTF7jWz/lib/python3.10/site-packages/filelock/_unix.py", line 35, in _acquire
fd = os.open(self._lock_file, open_flags)
FileNotFoundError: [Errno 2] No such file or directory: '/var/lib/terra/lock/587d4f32f127/7/gpu/0.0.lock'
We have a problem (usually with a low number of workers) where one worker deletes the folder because it happens to be empty at the moment when all the workers are switching over simultaneously. It is guaranteed to happen with 1 worker, common with 2, and rarer as the number goes up.
The workaround has been to temporarily comment out the cleanup code to prevent this; a better solution is needed.
2024-11-09 03:23:10,955 (73b4ed6f99d5:runner): WARNING/MainProcess - resources.py - Lock dir "/var/lib/terra/lock/73b4ed6f99d5/7/gpu" is not empty. Deleting it now for soft-lock support.
Exception in runner on 73b4ed6f99d5
Traceback (most recent call last):
File "/usr/local/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/local/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/src/dsm/service_runners/crop_and_calibrate.py", line 18, in <module>
from dsm.tasks.crop_and_calibrate import crop_and_calibrate_task
File "/src/dsm/tasks/__init__.py", line 16, in <module>
ResourceManager.register_resource('gpu', GPU_INDICES, WORKERS_PER_GPU)
File "/src/external/terra/terra/executor/resources.py", line 438, in register_resource
resource = Resource(name, *args, **kwargs)
File "/src/external/terra/terra/executor/resources.py", line 169, in __init__
rmtree(self.lock_dir)
File "/usr/local/lib/python3.10/shutil.py", line 725, in rmtree
_rmtree_safe_fd(fd, path, onerror)
File "/usr/local/lib/python3.10/shutil.py", line 681, in _rmtree_safe_fd
onerror(os.unlink, fullname, sys.exc_info())
File "/usr/local/lib/python3.10/shutil.py", line 679, in _rmtree_safe_fd
os.unlink(entry.name, dir_fd=topfd)
FileNotFoundError: [Errno 2] No such file or directory: 'tmp5dj5n3rv'
Exception ignored in: <function Resource.__del__ at 0x7fc1877f1b40>
Traceback (most recent call last):
File "/src/external/terra/terra/executor/resources.py", line 333, in __del__
if self.is_locked:
File "/src/external/terra/terra/executor/resources.py", line 203, in is_locked
if self._local.lock is None:
AttributeError: 'Resource' object has no attribute '_local'
The text was updated successfully, but these errors were encountered: