You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
torchrun --nproc_per_node=1 train.py --train_args_file train_args/Glm.yaml
Traceback (most recent call last):
File "/home/yierde/anaconda3/envs/tn/bin/torchrun", line 8, in <module>
sys.exit(main())
^^^^^^
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/run.py", line 762, in main
run(args)
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 237, in launch_agent
result = agent.run()
^^^^^^^^^^^
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
result = f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run
result = self._invoke_run(role)
^^^^^^^^^^^^^^^^^^^^^^
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 844, in _invoke_run
self._initialize_workers(self._worker_group)
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
result = f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 681, in _initialize_workers
worker_ids = self._start_workers(worker_group)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
result = f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py", line 271, in _start_workers
self._pcontext = start_processes(
^^^^^^^^^^^^^^^^
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/__init__.py", line 207, in start_processes
redirs = to_map(redirects, nprocs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 162, in to_map
map[i] = val_or_map.get(i, Std.NONE)
^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'get'
怎么解决它？我没办法跑起来……
The text was updated successfully, but these errors were encountered:
torchrun --nproc_per_node=1 train.py --train_args_file train_args/Glm.yaml
Traceback (most recent call last):
File "/home/yierde/anaconda3/envs/tn/bin/torchrun", line 8, in <module>
sys.exit(main())
^^^^^^
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/run.py", line 762, in main
run(args)
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 237, in launch_agent
result = agent.run()
^^^^^^^^^^^
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
result = f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run
result = self._invoke_run(role)
^^^^^^^^^^^^^^^^^^^^^^
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 844, in _invoke_run
self._initialize_workers(self._worker_group)
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
result = f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 681, in _initialize_workers
worker_ids = self._start_workers(worker_group)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
result = f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py", line 271, in _start_workers
self._pcontext = start_processes(
^^^^^^^^^^^^^^^^
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/__init__.py", line 207, in start_processes
redirs = to_map(redirects, nprocs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/yierde/anaconda3/envs/tn/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 162, in to_map
map[i] = val_or_map.get(i, Std.NONE)
^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'get'
怎么解决它？我没办法跑起来……
The text was updated successfully, but these errors were encountered: