Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

segfault - Error toggling CUDA -- NCCL #5

Open
d4l3k opened this issue Apr 30, 2024 · 0 comments
Open

segfault - Error toggling CUDA -- NCCL #5

d4l3k opened this issue Apr 30, 2024 · 0 comments

Comments

@d4l3k
Copy link

d4l3k commented Apr 30, 2024

I'm running into errors when restoring a process that's using NCCL.

$ sudo ../criu/criu/criu restore --shell-job --images-dir images/1 --tcp-established --restore-detached
$ sudo cuda-checkpoint --toggle --pid 219321
Error toggling CUDA in process ID 219321: "OS call failed or operation not supported on this OS"

From dmesg log:

[18740.219311] cuda0000240000a[219353]: segfault at 7b5860c00000 ip 00007b58659dd24c sp 00007b5839fffa50 error 4 in libcuda.so.550.76[7b58656db000+498000] likely on CPU 13 (core 5, socket 0)
[18740.219327] Code: 00 85 c0 0f 85 55 01 00 00 48 8b 83 80 84 00 00 48 85 c0 0f 84 45 01 00 00 48 8b 40 10 48 85 c0 0f 84 38 01 00 00 48 8b 40 10 <8b> 10 89 55 c0 8b 50 04 89 55 c4 8b 50 08 89 55 c8 0f b7 50 0c 66

Repro script:

import time
import os
import os.path
import multiprocessing as mp
import subprocess
from dataclasses import dataclass
from pathlib import Path
import tempfile

# Number of NCCL ranks / worker processes to spawn.
WORLD_SIZE = 2
# Rendezvous endpoint for torch.distributed; read by init_process_group()
# in each spawned worker (see train_main).
os.environ["MASTER_ADDR"] = "127.0.0.1"
os.environ["MASTER_PORT"] = "25901"

def signal_file(pid: int, signal: str) -> str:
    """Return the path of the marker file for *signal* of process *pid*.

    The original annotation said ``-> None`` but the function returns the
    joined path string; callers pass the result to ``touch``/``wait_for_file``.
    """
    return os.path.join(tempfile.gettempdir(), f"signal-{pid}-{signal}")

def wait_for_file(path: str) -> None:
    """Block until a regular file exists at *path*, polling every 2 seconds."""
    while not os.path.isfile(path):
        time.sleep(2.0)


def touch(path: str) -> None:
    """Create the file at *path* if absent; update its mtime otherwise."""
    marker = Path(path)
    marker.touch()

def check_pid(pid: int) -> bool:
    """Return True if a process with *pid* currently exists.

    Sends signal 0 via ``os.kill``, which performs error checking only.
    The original caught all ``OSError``s as "not running", but ``EPERM``
    (``PermissionError``) means the process exists and merely belongs to
    another user — report that as alive.
    """
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False
    except PermissionError:
        # Process exists but we lack permission to signal it.
        return True
    except OSError:
        # Other failures (e.g. invalid pid value): treat as not running.
        return False
    else:
        return True

def train_main(rank: int) -> None:
    """Worker entry point: pin one GPU, all-reduce over NCCL, then park.

    Touches the "init" signal file once NCCL setup is done and blocks on
    the "resume" signal file so the parent can checkpoint/restore it.
    """
    # Restrict this worker to a single device before torch initializes CUDA.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(rank)
    import torch
    import torch.distributed as dist

    device = torch.device("cuda")

    tensor = torch.tensor(rank, device=device)
    print(rank, tensor, os.getpid())

    dist.init_process_group(backend="nccl", rank=rank, world_size=WORLD_SIZE)

    dist.all_reduce(tensor)
    print(rank, tensor)

    pid = os.getpid()
    touch(signal_file(pid, "init"))
    wait_for_file(signal_file(pid, "resume"))

    print("resumed")

def run(args) -> None:
    """Run *args* as a subprocess and fail loudly on a non-zero exit.

    Uses ``check=True`` so failure raises ``subprocess.CalledProcessError``;
    the original ``assert p.returncode == 0`` would be silently stripped
    under ``python -O``, letting failed CRIU/cuda-checkpoint calls pass.
    """
    subprocess.run(args, check=True)

@dataclass
class Process:
    """Thin wrapper around a process handle (appears unused in this script)."""

    # Underlying process object; type left loose — TODO confirm intended use.
    process: object

if __name__ == '__main__':
    # 'spawn' gives each worker a fresh interpreter — required so CUDA is
    # initialized only inside the child after CUDA_VISIBLE_DEVICES is set.
    ctx = mp.get_context('spawn')

    processes = []
    for i in range(WORLD_SIZE):
        p = ctx.Process(target=train_main, args=(i,))
        p.start()

        processes.append(p)

    # Block until every worker has finished NCCL init and signalled readiness.
    print("waiting")
    for p in processes:
        wait_for_file(signal_file(p.pid, "init"))
    print("checkpointing")

    run(["nvidia-smi"])

    pids = []

    # Checkpoint phase: suspend each process's CUDA state with
    # cuda-checkpoint, then dump the process tree with CRIU.
    for rank, p in enumerate(processes):
        pid = p.pid
        pids.append(pid)

        out_dir = f"images/{rank}"
        os.makedirs(out_dir, exist_ok=True)
        print("dump", rank, out_dir, pid)
        run([
            "sudo", "cuda-checkpoint",
            "--toggle", "--pid", str(pid),
        ])
        run([
            "sudo", "../criu/criu/criu",
            "dump",
            "--shell-job",
            "--tcp-established",
            "--images-dir", out_dir,
            "--ghost-limit", "10000000",
            "--tree", str(pid),
        ])
        # NOTE(review): criu dump without --leave-running stops the dumped
        # tree, so the multiprocessing handle is released here — confirm.
        p.close()

    time.sleep(1)

    # Restore phase: bring each process back with CRIU (detached), then
    # toggle CUDA back on — this second toggle is where the reported
    # segfault occurs (see dmesg excerpt above).
    for rank, p in enumerate(processes):
        pid = pids[rank]

        print(f"resuming {rank} {pid}")
        out_dir = f"images/{rank}"
        p = subprocess.Popen([
            "sudo", "../criu/criu/criu",
            "restore",
            "--shell-job",
            "--tcp-established",
            "--restore-detached",
            "--images-dir", out_dir,
        ])
        #p.start()

        time.sleep(2.0)
        # The restored process must come back under its original PID.
        assert check_pid(pid), pid

        run([
            "sudo", "cuda-checkpoint",
            "--toggle", "--pid", str(pid),
        ])

    time.sleep(1)

    #run(["nvidia-smi"])

    # Release each restored worker from its wait_for_file() loop.
    for pid in pids:
        touch(signal_file(pid, "resume"))
@d4l3k d4l3k changed the title segfault - Error toggling CUDA segfault - Error toggling CUDA -- NCCL Apr 30, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant