Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

segfault - Error toggling CUDA -- NCCL #5

Open
d4l3k opened this issue Apr 30, 2024 · 0 comments
Open

segfault - Error toggling CUDA -- NCCL #5

d4l3k opened this issue Apr 30, 2024 · 0 comments

Comments

@d4l3k
Copy link

d4l3k commented Apr 30, 2024

I'm running into errors when restoring a process that's using NCCL.

$ sudo ../criu/criu/criu restore --shell-job --images-dir images/1 --tcp-established --restore-detached
$ sudo cuda-checkpoint --toggle --pid 219321
Error toggling CUDA in process ID 219321: "OS call failed or operation not supported on this OS"

From dmesg log:

[18740.219311] cuda0000240000a[219353]: segfault at 7b5860c00000 ip 00007b58659dd24c sp 00007b5839fffa50 error 4 in libcuda.so.550.76[7b58656db000+498000] likely on CPU 13 (core 5, socket 0)
[18740.219327] Code: 00 85 c0 0f 85 55 01 00 00 48 8b 83 80 84 00 00 48 85 c0 0f 84 45 01 00 00 48 8b 40 10 48 85 c0 0f 84 38 01 00 00 48 8b 40 10 <8b> 10 89 55 c0 8b 50 04 89 55 c4 8b 50 08 89 55 c8 0f b7 50 0c 66

Repro script:

import time
import os
import os.path
import multiprocessing as mp
import subprocess
from dataclasses import dataclass
from pathlib import Path
import tempfile

# Number of NCCL ranks / worker processes to spawn.
WORLD_SIZE = 2
# Rendezvous endpoint for torch.distributed; read by init_process_group()
# in each spawned worker (see train_main).
os.environ["MASTER_ADDR"] = "127.0.0.1"
os.environ["MASTER_PORT"] = "25901"

def signal_file(pid: int, signal: str) -> str:
    """Return the path of the marker file for *signal* of process *pid*.

    The original annotation said ``-> None`` but the function returns the
    joined path string; callers pass the result to ``touch``/``wait_for_file``.
    """
    return os.path.join(tempfile.gettempdir(), f"signal-{pid}-{signal}")

def wait_for_file(path: str) -> None:
    """Block until a regular file exists at *path*, polling every 2 seconds."""
    while not os.path.isfile(path):
        time.sleep(2.0)


def touch(path: str) -> None:
    """Create the file at *path* if absent; update its mtime otherwise."""
    marker = Path(path)
    marker.touch()

def check_pid(pid: int) -> bool:
    """Return True if a process with *pid* currently exists.

    Sends signal 0 via ``os.kill``, which performs error checking only.
    The original caught all ``OSError``s as "not running", but ``EPERM``
    (``PermissionError``) means the process exists and merely belongs to
    another user — report that as alive.
    """
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False
    except PermissionError:
        # Process exists but we lack permission to signal it.
        return True
    except OSError:
        # Other failures (e.g. invalid pid value): treat as not running.
        return False
    else:
        return True

def train_main(rank: int) -> None:
    """Worker entry point: pin one GPU, all-reduce over NCCL, then park.

    Touches the "init" signal file once NCCL setup is done and blocks on
    the "resume" signal file so the parent can checkpoint/restore it.
    """
    # Restrict this worker to a single device before torch initializes CUDA.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(rank)
    import torch
    import torch.distributed as dist

    device = torch.device("cuda")

    tensor = torch.tensor(rank, device=device)
    print(rank, tensor, os.getpid())

    dist.init_process_group(backend="nccl", rank=rank, world_size=WORLD_SIZE)

    dist.all_reduce(tensor)
    print(rank, tensor)

    pid = os.getpid()
    touch(signal_file(pid, "init"))
    wait_for_file(signal_file(pid, "resume"))

    print("resumed")

def run(args) -> None:
    """Run *args* as a subprocess and fail loudly on a non-zero exit.

    Uses ``check=True`` so failure raises ``subprocess.CalledProcessError``;
    the original ``assert p.returncode == 0`` would be silently stripped
    under ``python -O``, letting failed CRIU/cuda-checkpoint calls pass.
    """
    subprocess.run(args, check=True)

@dataclass
class Process:
    """Thin wrapper around a process handle (appears unused in this script)."""

    # Underlying process object; type left loose — TODO confirm intended use.
    process: object

if __name__ == '__main__':
    # 'spawn' gives each worker a fresh interpreter — required so CUDA is
    # initialized only inside the child after CUDA_VISIBLE_DEVICES is set.
    ctx = mp.get_context('spawn')

    processes = []
    for i in range(WORLD_SIZE):
        p = ctx.Process(target=train_main, args=(i,))
        p.start()

        processes.append(p)

    # Block until every worker has finished NCCL init and signalled readiness.
    print("waiting")
    for p in processes:
        wait_for_file(signal_file(p.pid, "init"))
    print("checkpointing")

    run(["nvidia-smi"])

    pids = []

    # Checkpoint phase: suspend each process's CUDA state with
    # cuda-checkpoint, then dump the process tree with CRIU.
    for rank, p in enumerate(processes):
        pid = p.pid
        pids.append(pid)

        out_dir = f"images/{rank}"
        os.makedirs(out_dir, exist_ok=True)
        print("dump", rank, out_dir, pid)
        run([
            "sudo", "cuda-checkpoint",
            "--toggle", "--pid", str(pid),
        ])
        run([
            "sudo", "../criu/criu/criu",
            "dump",
            "--shell-job",
            "--tcp-established",
            "--images-dir", out_dir,
            "--ghost-limit", "10000000",
            "--tree", str(pid),
        ])
        # NOTE(review): criu dump without --leave-running stops the dumped
        # tree, so the multiprocessing handle is released here — confirm.
        p.close()

    time.sleep(1)

    # Restore phase: bring each process back with CRIU (detached), then
    # toggle CUDA back on — this second toggle is where the reported
    # segfault occurs (see dmesg excerpt above).
    for rank, p in enumerate(processes):
        pid = pids[rank]

        print(f"resuming {rank} {pid}")
        out_dir = f"images/{rank}"
        p = subprocess.Popen([
            "sudo", "../criu/criu/criu",
            "restore",
            "--shell-job",
            "--tcp-established",
            "--restore-detached",
            "--images-dir", out_dir,
        ])
        #p.start()

        time.sleep(2.0)
        # The restored process must come back under its original PID.
        assert check_pid(pid), pid

        run([
            "sudo", "cuda-checkpoint",
            "--toggle", "--pid", str(pid),
        ])

    time.sleep(1)

    #run(["nvidia-smi"])

    # Release each restored worker from its wait_for_file() loop.
    for pid in pids:
        touch(signal_file(pid, "resume"))
@d4l3k d4l3k changed the title segfault - Error toggling CUDA segfault - Error toggling CUDA -- NCCL Apr 30, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant