diff --git a/.circleci/real_config.yml b/.circleci/real_config.yml index ba5a51bdea9..9f718ea62dc 100644 --- a/.circleci/real_config.yml +++ b/.circleci/real_config.yml @@ -2784,64 +2784,6 @@ jobs: sleep 5 done - # For when a user wants to use an agent instead of launcher. - - when: - condition: - equal: ["-A", <>] - steps: - - wait-for-master: - scheme: <> - host: <> - port: <> - - run: - name: Transfer and Allocate agent resources on VM - command: | - ZONE=$(terraform -chdir=tools/slurm/terraform output --raw zone) - INSTANCE_NAME=$(terraform -chdir=tools/slurm/terraform output --raw instance_name) - PROJECT=$(terraform -chdir=tools/slurm/terraform output --raw project) - gcloud compute scp agent/build/determined-agent "$INSTANCE_NAME":~ --zone $ZONE - gcloud compute ssh --zone "$ZONE" "$INSTANCE_NAME" --project "$PROJECT" -- \ - srun determined-agent --master-host=<> --master-port=<> --resource-pool=default --container-runtime=<> - background: true - - run: - name: Query the job to ensure Determined Agent is running - command: | - # 60 tries gives 30 minutes of tries until the query times out - tries=60 - # squeue command gives name and state information. There must be an escape character for the gcloud command. - ZONE=$(terraform -chdir=tools/slurm/terraform output --raw zone) - INSTANCE_NAME=$(terraform -chdir=tools/slurm/terraform output --raw instance_name) - PROJECT=$(terraform -chdir=tools/slurm/terraform output --raw project) - gcloud compute ssh --zone "$ZONE" "$INSTANCE_NAME" --project "$PROJECT" -- squeue -o "%j\ %T" - # Queries until the jobname is shown in a running state. The agent job must be running to run e2e tests - until [[ -n $(gcloud compute ssh --zone "$ZONE" "$INSTANCE_NAME" --project "$PROJECT" -- squeue -o "%j\ %T" | grep "determined-agent" | grep "RUNNING") ]] ; do - if [[ $((--tries)) -eq 0 ]]; then - echo "The job determined-agent did not start. Please check if there are other jobs in the queue preventing this job from starting" - exit 1 - fi - echo "Waiting 30s to query for the job name again..." - sleep 30 - echo "Retrying job query..." - gcloud compute ssh --zone "$ZONE" "$INSTANCE_NAME" --project "$PROJECT" -- squeue -o "%j\ %T" - done - - run: - name: Query the slot count to ensure slots are allocated - command: | - tries=20 - export DET_USER=determined - export DET_PASS=${INITIAL_USER_PASSWORD} - det slot list - until [[ $(det slot list | wc -l) -gt 2 ]] ; do - if [[ $((--tries)) -eq 0 ]]; then - echo "ERROR: determined-agent failed to register at least 2 slots. Check the 'Transfer and Allocate agent resources on VM' for any failures." - exit 1 - fi - echo "Waiting 5s to query slots again..." - sleep 5 - echo "Retrying slot query..." - det slot list - done - - run-e2e-tests: mark: <> wait-for-master: true @@ -3124,62 +3066,7 @@ jobs: } } EOF - - - when: - condition: - equal: ["-A", <>] - steps: - - run: - name: Allocate Resources on Cluster - # Sed command reduces the slots to 8 to match '-N2 --gpus-per-node=4' usage below. 
- # Affects test e2e_tests/tests/cluster/test_slurm.py:test_mnist_pytorch_distributed - command: | - DET_MASTER_HOST=<> - # determined_master_host is localhost, so use actual hostname to pass to the agent - MASTER_HOST=$(hostname) - MASTER_PORT=$(echo $DET_MASTER_HOST | cut -d: -f2) - sed -i 's/slots_per_trial: 16/slots_per_trial: 8/' examples/tutorials/mnist_pytorch/distributed.yaml - sudo cp agent/dist/determined-agent_linux_amd64_v1/determined-agent /scratch/launcher/.launcher.$HOSTNAME - # Include 40 minute time limit, name the job (determined-agent-$HOSTNAME) so we can selectively kill it - sudo srun --uid launcher --export=ALL -N2 --gpus-per-node=4 --time=40 -Jdetermined-agent-$HOSTNAME \ - --chdir=/scratch/launcher/.launcher.$HOSTNAME /scratch/launcher/.launcher.$HOSTNAME/determined-agent \ - --master-host=$MASTER_HOST --master-port=$MASTER_PORT --resource-pool=default --container-runtime=singularity --slot-type=cuda \ - --image-root=/lustre/hdd/foundation_engineering/images - background: true - - run: - name: Query the job to ensure Determined Agent is running - command: | - # 60 tries gives 30 minutes of tries until the query times out - tries=60 - squeue -u launcher -o "%j %T" - # Queries until the jobname is shown in a running state. The agent job must be running to run e2e tests - until [[ -n $(squeue -u launcher -o "%j %T" | grep "determined-agent-$HOSTNAME" | grep "RUNNING") ]] ; do - if [[ $((--tries)) -eq 0 ]]; then - echo "The job determined-agent-$HOSTNAME did not start. Please check if there are jobs in the queue preventing this job from starting" - exit 1 - fi - echo "Waiting 30s to query for the job name again..." - sleep 30 - echo "Retrying job query..." - squeue -u launcher -o "%j %T" - done - - run: - name: Query the slot count to ensure slots are allocated - command: | - tries=20 - export DET_PASS=${INITIAL_USER_PASSWORD} - det slot list - until [[ $(det slot list | wc -l) -gt 2 ]] ; do - if [[ $((--tries)) -eq 0 ]]; then - echo "ERROR: determined-agent failed to register at least 2 slots. Check the 'Transfer and Allocate agent resources on VM' for any failures." - exit 1 - fi - echo "Waiting 5s to query slots again..." - sleep 5 - echo "Retrying slot query..." - det slot list - done - + - run-e2e-tests: mark: <> master-host: localhost @@ -3190,15 +3077,6 @@ jobs: - store_test_results: path: /tmp/test-results/ - - when: - condition: - equal: ["-A", <>] - steps: - - run: - name: Deallocate Agent Resources on cluster - command: | - scancel -u $USER --jobname=determined-agent-$HOSTNAME - test-e2e: parameters: tf2: @@ -4511,28 +4389,6 @@ workflows: only: - main - # Podman over SLURM test using Agent on GCP - - test-e2e-hpc-gcp: - context: - # Provides the GITHUB_USERNAME and GITHUB_TOKEN enviroment variable - # that's required by the "gh" command for authentication. 
- - github-read - - gcp - - gcp-ci-cluster-default-user-credentials - matrix: - parameters: - name: [test-e2e-slurm-agent-podman-gcp] - agent-use: ["-A"] - container-run-type: ["podman"] - mark: ["e2e_slurm and not parallel and not gpu_required"] - extra-pytest-flags: ["-k 'not test_slurm_verify_home'"] - requires: - - build-go-ee - filters: - branches: - only: - - main - - test-e2e: name: test-e2e-rbac context: @@ -5413,26 +5269,6 @@ workflows: - build-go - request-hpc-tests - # Podman over SLURM test using Agent on GCP - - test-e2e-hpc-gcp: - filters: *upstream-feature-branch - context: - # Provides the GITHUB_USERNAME and GITHUB_TOKEN enviroment variable - # that's required by the "gh" command for authentication. - - github-read - - gcp - - gcp-ci-cluster-default-user-credentials - matrix: - parameters: - name: [test-e2e-slurm-agent-podman-gcp] - agent-use: ["-A"] - container-run-type: ["podman"] - mark: ["e2e_slurm and not parallel and not gpu_required"] - extra-pytest-flags: ["-k 'not test_slurm_verify_home'"] - requires: - - build-go - - request-hpc-tests - nightly: when: << pipeline.parameters.do_nightly_tests >> jobs: diff --git a/agent/cmd/determined-agent/init.go b/agent/cmd/determined-agent/init.go index 7c5523a55c9..15e7ab2e715 100644 --- a/agent/cmd/determined-agent/init.go +++ b/agent/cmd/determined-agent/init.go @@ -118,8 +118,6 @@ func registerAgentConfig() { registerBool(flags, name("debug"), defaults.Debug, "Enable verbose script output") registerInt(flags, name("artificial-slots"), defaults.ArtificialSlots, "") flags.Lookup("artificial-slots").Hidden = true - registerString(flags, name("image-root"), defaults.ImageRoot, - "Path to local container image cache") // Endpoint TLS flags. registerBool(flags, name("tls"), defaults.TLS, "Use TLS for the API server") diff --git a/agent/internal/agent.go b/agent/internal/agent.go index da767b840c9..80693407d32 100644 --- a/agent/internal/agent.go +++ b/agent/internal/agent.go @@ -21,8 +21,6 @@ import ( "github.com/determined-ai/determined/agent/internal/options" "github.com/determined-ai/determined/agent/pkg/docker" "github.com/determined-ai/determined/agent/pkg/events" - "github.com/determined-ai/determined/agent/pkg/podman" - "github.com/determined-ai/determined/agent/pkg/singularity" "github.com/determined-ai/determined/master/pkg/aproto" "github.com/determined-ai/determined/master/pkg/cproto" "github.com/determined-ai/determined/master/pkg/device" @@ -121,46 +119,23 @@ func (a *Agent) run(ctx context.Context) error { } a.log.Tracef("setting up %s runtime", a.opts.ContainerRuntime) - var cruntime container.ContainerRuntime - switch a.opts.ContainerRuntime { - case options.PodmanContainerRuntime: - acl, sErr := podman.New(a.opts) - if sErr != nil { - return fmt.Errorf("failed to build podman client: %w", sErr) - } - defer func() { - if cErr := acl.Close(); cErr != nil { - a.log.WithError(cErr).Error("failed to close podman client") - } - }() - cruntime = acl - case options.ApptainerContainerRuntime: - fallthrough - case options.SingularityContainerRuntime: - acl, sErr := singularity.New(a.opts) - if sErr != nil { - return fmt.Errorf("failed to build singularity client: %w", sErr) - } - defer func() { - if cErr := acl.Close(); cErr != nil { - a.log.WithError(cErr).Error("failed to close singularity client") - } - }() - cruntime = acl - case options.DockerContainerRuntime: - dcl, dErr := dclient.NewClientWithOpts(dclient.WithAPIVersionNegotiation(), dclient.FromEnv) - if dErr != nil { - return fmt.Errorf("failed to build docker 
client: %w", dErr) - } - defer func() { - a.log.Trace("cleaning up docker client") - if cErr := dcl.Close(); cErr != nil { - a.log.WithError(cErr).Error("failed to close docker client") - } - }() - cl := docker.NewClient(dcl) - cruntime = cl + if a.opts.ContainerRuntime != options.DockerContainerRuntime { + a.log.Error(a.opts.ContainerRuntime, + " container runtime is not supported, please update runtime config to use docker instead.") + return fmt.Errorf("container runtime not available: %s", a.opts.ContainerRuntime) + } + + dcl, dErr := dclient.NewClientWithOpts(dclient.WithAPIVersionNegotiation(), dclient.FromEnv) + if dErr != nil { + return fmt.Errorf("failed to build docker client: %w", dErr) } + defer func() { + a.log.Trace("cleaning up docker client") + if cErr := dcl.Close(); cErr != nil { + a.log.WithError(cErr).Error("failed to close docker client") + } + }() + cruntime := docker.NewClient(dcl) a.log.Trace("setting up container manager") outbox := make(chan *aproto.MasterMessage, eventChanSize) // covers many from socket lifetimes diff --git a/agent/internal/container/container_runtime.go b/agent/internal/container/container_runtime.go index f2b5b652285..0daadf1e49d 100644 --- a/agent/internal/container/container_runtime.go +++ b/agent/internal/container/container_runtime.go @@ -13,7 +13,7 @@ import ( "github.com/docker/docker/api/types/filters" ) -// ContainerRuntime is our interface for interacting with runtimes like Docker or Singularity. +// ContainerRuntime is our interface for interacting with runtimes like Docker. type ContainerRuntime interface { ReattachContainer( ctx context.Context, diff --git a/agent/internal/options/options.go b/agent/internal/options/options.go index ad091560bc7..74d72b9e426 100644 --- a/agent/internal/options/options.go +++ b/agent/internal/options/options.go @@ -58,9 +58,8 @@ type Options struct { Security SecurityOptions `json:"security"` - Debug bool `json:"debug"` - ArtificialSlots int `json:"artificial_slots"` - ImageRoot string `json:"image_root"` + Debug bool `json:"debug"` + ArtificialSlots int `json:"artificial_slots"` TLS bool `json:"tls"` TLSCertFile string `json:"tls_cert"` @@ -80,9 +79,7 @@ type Options struct { // master config. AgentReconnectBackoff int `json:"agent_reconnect_backoff"` - ContainerRuntime string `json:"container_runtime"` - SingularityOptions SingularityOptions `json:"singularity_options"` - PodmanOptions PodmanOptions `json:"podman_options"` + ContainerRuntime string `json:"container_runtime"` ContainerAutoRemoveDisabled bool `json:"container_auto_remove_disabled"` @@ -214,25 +211,9 @@ type ContainerRuntime string // Available container runtimes. const ( - ApptainerContainerRuntime = "apptainer" - SingularityContainerRuntime = "singularity" - DockerContainerRuntime = "docker" - PodmanContainerRuntime = "podman" + DockerContainerRuntime = "docker" ) -// SingularityOptions configures how we interact with Singularity. -type SingularityOptions struct { - // AllowNetworkCreation allows the agent to use `singularity run`'s `--net` option, which sets - // up and launches containers into a new network namespace. Disabled by default since this - // requires root or a suid installation with /etc/subuid --fakeroot. - AllowNetworkCreation bool `json:"allow_network_creation"` -} - -// PodmanOptions configures how we interact with podman. 
-type PodmanOptions struct { - AllowNetworkCreation bool `json:"allow_network_creation"` // review -} - // VisibleGPUsFromEnvironment returns GPU visibility information from the environment // if any, else "". func VisibleGPUsFromEnvironment() (visDevices string) { diff --git a/agent/internal/options/options_test.go b/agent/internal/options/options_test.go index 6a47020ca24..3c114a2f513 100644 --- a/agent/internal/options/options_test.go +++ b/agent/internal/options/options_test.go @@ -97,7 +97,6 @@ security: master_cert_name: master_certificate debug: true artificial_slots: 12 -image_root: docker_image_root tls: true tls_cert: tls_certificate_file tls_key: tls_key_file @@ -136,7 +135,6 @@ container_runtime: docker_runtime_env }, Debug: true, ArtificialSlots: 12, - ImageRoot: "docker_image_root", TLS: true, TLSCertFile: "tls_certificate_file", TLSKeyFile: "tls_key_file", diff --git a/agent/pkg/podman/podman.go b/agent/pkg/podman/podman.go deleted file mode 100644 index ccb011685d6..00000000000 --- a/agent/pkg/podman/podman.go +++ /dev/null @@ -1,477 +0,0 @@ -package podman - -import ( - "context" - "fmt" - "io" - "os" - "os/exec" - "os/user" - "path" - "path/filepath" - "strconv" - "strings" - "sync" - "sync/atomic" - "syscall" - "time" - - "github.com/docker/docker/api/types" - dcontainer "github.com/docker/docker/api/types/container" - "github.com/docker/docker/api/types/filters" - "github.com/docker/docker/api/types/mount" - "github.com/docker/docker/pkg/stdcopy" - "github.com/sirupsen/logrus" - - "github.com/determined-ai/determined/agent/internal/container" - "github.com/determined-ai/determined/agent/internal/options" - "github.com/determined-ai/determined/agent/pkg/cruntimes" - "github.com/determined-ai/determined/agent/pkg/docker" - "github.com/determined-ai/determined/agent/pkg/events" - "github.com/determined-ai/determined/master/pkg/aproto" - "github.com/determined-ai/determined/master/pkg/cproto" - "github.com/determined-ai/determined/master/pkg/model" - "github.com/determined-ai/determined/master/pkg/syncx/waitgroupx" - "github.com/determined-ai/determined/master/pkg/tasks" -) - -var podmanWrapperEntrypoint = path.Join(tasks.RunDir, tasks.SingularityEntrypointWrapperScript) - -const ( - hostNetworking = "host" - bridgeNetworking = "bridge" - envFileName = "envfile" - archivesName = "archives" -) - -// PodmanContainer captures the state of a container. -type PodmanContainer struct { - PID int `json:"pid"` - Req cproto.RunSpec `json:"req"` - TmpDir string `json:"tmp_dir"` - Proc *os.Process `json:"-"` - Started atomic.Bool `json:"started"` -} - -// PodmanClient implements ContainerRuntime. -type PodmanClient struct { - log *logrus.Entry - opts options.PodmanOptions - mu sync.Mutex - wg waitgroupx.Group - containers map[cproto.ID]*PodmanContainer - agentTmp string - debug bool -} - -// New returns a new podman client, which launches and tracks containers. 
-func New(opts options.Options) (*PodmanClient, error) { - agentTmp, err := cruntimes.BaseTempDirName(opts.AgentID) - if err != nil { - return nil, fmt.Errorf("unable to compose agentTmp directory path: %w", err) - } - - if err := os.RemoveAll(agentTmp); err != nil { - return nil, fmt.Errorf("removing agent tmp from previous runs: %w", err) - } - - if err := os.MkdirAll(agentTmp, 0o700); err != nil { - return nil, fmt.Errorf("preparing agent tmp: %w", err) - } - - return &PodmanClient{ - log: logrus.WithField("component", "podman"), - opts: opts.PodmanOptions, - wg: waitgroupx.WithContext(context.Background()), - containers: make(map[cproto.ID]*PodmanContainer), - agentTmp: agentTmp, - debug: opts.Debug, - }, nil -} - -// Close the client, killing all running containers and removing our scratch space. -func (s *PodmanClient) Close() error { - s.mu.Lock() - defer s.mu.Unlock() - - // Since we launch procs with exec.CommandContext under s.wg's context, this cleans them up. - s.wg.Close() - - if err := os.RemoveAll(s.agentTmp); err != nil { - return fmt.Errorf("cleaning up agent tmp: %w", err) - } - return nil -} - -func getPullCommand(req docker.PullImage, image string) (string, []string) { - // C.f. singularity where if req.ForcePull is set then a 'pull --force' is done. - // podman does not have this option, though it does have '--pull always' on the - // run command. - return "podman", []string{"pull", image} -} - -// PullImage implements container.ContainerRuntime. -func (s *PodmanClient) PullImage( - ctx context.Context, - req docker.PullImage, - p events.Publisher[docker.Event], -) (err error) { - return cruntimes.PullImage(ctx, req, p, &s.wg, s.log, getPullCommand) -} - -// CreateContainer implements container.ContainerRuntime. -func (s *PodmanClient) CreateContainer( - ctx context.Context, - id cproto.ID, - req cproto.RunSpec, - p events.Publisher[docker.Event], -) (string, error) { - s.mu.Lock() - defer s.mu.Unlock() - - s.containers[id] = &PodmanContainer{Req: req} - return id.String(), nil -} - -// RunContainer implements container.ContainerRuntime. -// nolint: golint,maintidx // Both contexts can't both be first / TODO refactor. -func (s *PodmanClient) RunContainer( - ctx context.Context, - waitCtx context.Context, - id string, - p events.Publisher[docker.Event], -) (*docker.Container, error) { - s.mu.Lock() - var cont *PodmanContainer - for cID, rcont := range s.containers { - if cproto.ID(id) != cID { - continue - } - cont = rcont - break - } - s.mu.Unlock() - - if cont == nil { - return nil, container.ErrMissing - } - req := cont.Req - - u, err := user.Current() - if err != nil { - return nil, fmt.Errorf("checking user: %w", err) - } - - uidgid := fmt.Sprintf("%s:%s", u.Uid, u.Gid) - if req.ContainerConfig.User != uidgid { - return nil, fmt.Errorf( - "agent running as %s cannot launch as user %s", - uidgid, req.ContainerConfig.User, - ) - } - - tmpdir, err := os.MkdirTemp(s.agentTmp, fmt.Sprintf("*-%s", id)) - if err != nil { - return nil, fmt.Errorf("making tmp dir for archives: %w", err) - } - - var args []string - args = append(args, "run") - args = append(args, "--rm") - args = append(args, "--workdir", req.ContainerConfig.WorkingDir) - - // Env. variables. c.f. 
launcher PodmanOverSlurm.java - args = append(args, "--env", "SLURM_*") - args = append(args, "--env", "CUDA_VISIBLE_DEVICES") - args = append(args, "--env", "NVIDIA_VISIBLE_DEVICES") - args = append(args, "--env", "ROCR_VISIBLE_DEVICES") - args = append(args, "--env", "HIP_VISIBLE_DEVICES") - if s.debug { - args = append(args, "--env", "DET_DEBUG=1") - } - envFilePath := path.Join(tmpdir, envFileName) - envFile, err := os.OpenFile( - envFilePath, - os.O_APPEND|os.O_CREATE|os.O_WRONLY, - 0o600, - ) // #nosec G304 // We made this filepath, and it is randomized. - if err != nil { - return nil, fmt.Errorf("creating envfile %s: %w", envFilePath, err) - } - args = append(args, "--env-file", envFilePath) - - for _, env := range req.ContainerConfig.Env { - _, err = envFile.WriteString(env + "\n") - if err != nil { - return nil, fmt.Errorf("writing to envfile: %w", err) - } - } - if err = envFile.Close(); err != nil { - return nil, fmt.Errorf("closing envfile: %w", err) - } - - switch { - case req.HostConfig.NetworkMode == bridgeNetworking && s.opts.AllowNetworkCreation: - // ?? publish ports only for bridgeNetworking - for port := range req.ContainerConfig.ExposedPorts { - p := port.Int() - args = append(args, "-p", fmt.Sprintf("%d:%d/tcp", p, p)) - } - args = append(args, "--network=bridge") - case req.HostConfig.NetworkMode == bridgeNetworking: - if err = p.Publish(ctx, docker.NewLogEvent( - model.LogLevelDebug, - "container requested network virtualization, but network creation isn't allowed; "+ - "overriding to host networking", - )); err != nil { - return nil, err - } - req.HostConfig.NetworkMode = hostNetworking - fallthrough - case req.HostConfig.NetworkMode == hostNetworking: - args = append(args, "--network=host") - default: - return nil, fmt.Errorf("unsupported network mode %s", req.HostConfig.NetworkMode) - } - - archivesPath := filepath.Join(tmpdir, archivesName) - mountPoints, wErr := cruntimes.ArchiveMountPoints(ctx, req, p, archivesPath, s.log) - if wErr != nil { - return nil, fmt.Errorf("determining mount points: %w", err) - } - for _, m := range mountPoints { - args = append(args, "--volume", fmt.Sprintf("%s:%s", path.Join(archivesPath, m), m)) - } - - for _, m := range req.HostConfig.Mounts { - args = hostMountsToPodmanArgs(m, args) - } - - // from master task_container_defaults.shm_size_bytes - if shmsize := req.HostConfig.ShmSize; shmsize != 4294967296 { // 4294967296 is the default. - args = append(args, "--shm-size", strconv.Itoa(int(shmsize))) - } - - args = capabilitiesToPodmanArgs(req, args) - - image := cruntimes.CanonicalizeImage(req.ContainerConfig.Image) - args = append(args, image) - args = append(args, podmanWrapperEntrypoint) - args = append(args, req.ContainerConfig.Cmd...) - - if err = s.pprintPodmanCommand(ctx, args, p); err != nil { - return nil, err - } - - // #nosec G204 // We launch arbitrary user code as a service. - cmd := exec.CommandContext(waitCtx, "podman", args...) 
- stdout, err := cmd.StdoutPipe() - if err != nil { - return nil, fmt.Errorf("creating stdout pipe: %w", err) - } - stderr, err := cmd.StderrPipe() - if err != nil { - return nil, fmt.Errorf("creating stderr pipe: %w", err) - } - s.wg.Go(func(ctx context.Context) { s.shipPodmanCmdLogs(ctx, stdout, stdcopy.Stdout, p) }) - s.wg.Go(func(ctx context.Context) { s.shipPodmanCmdLogs(ctx, stderr, stdcopy.Stderr, p) }) - - if err := cmd.Start(); err != nil { - return nil, fmt.Errorf("starting podman container: %w", err) - } - - cont.PID = cmd.Process.Pid - cont.Proc = cmd.Process - cont.TmpDir = tmpdir - cont.Started.Store(true) - at := time.Now().String() - s.log.Infof("started container %s with pid %d", id, cont.PID) - - return &docker.Container{ - ContainerInfo: types.ContainerJSON{ - ContainerJSONBase: &types.ContainerJSONBase{ - ID: strconv.Itoa(cont.Proc.Pid), - Created: at, - Path: podmanWrapperEntrypoint, - Args: req.ContainerConfig.Cmd, - State: &types.ContainerState{ - Status: "running", - Running: true, - Pid: cont.Proc.Pid, - StartedAt: at, - }, - Image: image, - HostConfig: &dcontainer.HostConfig{ - NetworkMode: req.HostConfig.NetworkMode, - }, - }, - Config: &dcontainer.Config{ - ExposedPorts: req.ContainerConfig.ExposedPorts, - }, - }, - ContainerWaiter: s.waitOnContainer(cproto.ID(id), cont, p), - }, nil -} - -func capabilitiesToPodmanArgs(req cproto.RunSpec, args []string) []string { - for _, cap := range req.HostConfig.CapAdd { - args = append(args, "--cap-add", cap) - } - for _, cap := range req.HostConfig.CapDrop { - args = append(args, "--cap-drop", cap) - } - return args -} - -func hostMountsToPodmanArgs(m mount.Mount, args []string) []string { - var mountOptions []string - if m.ReadOnly { - mountOptions = append(mountOptions, "ro") - } - if m.BindOptions != nil && string(m.BindOptions.Propagation) != "" { - mountOptions = append(mountOptions, string(m.BindOptions.Propagation)) - } - var options string - if len(mountOptions) > 0 { - options = fmt.Sprintf(":%s", strings.Join(mountOptions, ",")) - } - return append(args, "--volume", fmt.Sprintf("%s:%s%s", m.Source, m.Target, options)) -} - -// ReattachContainer implements container.ContainerRuntime. -// TODO(DET-9082): Ensure orphaned processes are cleaned up on reattach. -func (s *PodmanClient) ReattachContainer( - ctx context.Context, - reattachID cproto.ID, -) (*docker.Container, *aproto.ExitCode, error) { - return nil, nil, container.ErrMissing -} - -// RemoveContainer implements container.ContainerRuntime. -func (s *PodmanClient) RemoveContainer(ctx context.Context, id string, force bool) error { - s.mu.Lock() - defer s.mu.Unlock() - - cont, ok := s.containers[cproto.ID(id)] - if !ok { - return container.ErrMissing - } - - if cont.Started.Load() { - return cont.Proc.Kill() - } - return fmt.Errorf("cannot kill container %s that is not started", id) -} - -// SignalContainer implements container.ContainerRuntime. -func (s *PodmanClient) SignalContainer( - ctx context.Context, - id string, - sig syscall.Signal, -) error { - s.mu.Lock() - defer s.mu.Unlock() - - cont, ok := s.containers[cproto.ID(id)] - if !ok { - return container.ErrMissing - } - - if cont.Started.Load() { - return cont.Proc.Signal(sig) - } - return fmt.Errorf("cannot signal container %s with %s that is not started", id, sig) -} - -// ListRunningContainers implements container.ContainerRuntime. 
-func (s *PodmanClient) ListRunningContainers( - ctx context.Context, - fs filters.Args, -) (map[cproto.ID]types.Container, error) { - resp := make(map[cproto.ID]types.Container) - - s.mu.Lock() - defer s.mu.Unlock() - for id, cont := range s.containers { - resp[id] = types.Container{ - ID: string(id), - Labels: cont.Req.ContainerConfig.Labels, - } - } - return resp, nil -} - -func (s *PodmanClient) waitOnContainer( - id cproto.ID, - cont *PodmanContainer, - p events.Publisher[docker.Event], -) docker.ContainerWaiter { - wchan := make(chan dcontainer.WaitResponse, 1) - errchan := make(chan error) - s.wg.Go(func(ctx context.Context) { - defer close(wchan) - defer close(errchan) - - var body dcontainer.WaitResponse - switch state, err := cont.Proc.Wait(); { - case ctx.Err() != nil && err == nil && state.ExitCode() == -1: - s.log.Trace("detached from container process") - return - case err != nil: - s.log.Tracef("proc %d for container %s exited: %s", cont.PID, id, err) - body.Error = &dcontainer.WaitExitError{Message: err.Error()} - default: - s.log.Tracef("proc %d for container %s exited with %d", cont.PID, id, state.ExitCode()) - body.StatusCode = int64(state.ExitCode()) - } - - select { - case wchan <- body: - case <-ctx.Done(): - return - } - - s.mu.Lock() - defer s.mu.Unlock() - s.log.Tracef("forgetting completed container: %s", id) - delete(s.containers, id) - - // Defer file cleanup until restart if debug logging is enabled. - if s.log.Logger.Level <= logrus.DebugLevel { - if err := p.Publish(ctx, docker.NewLogEvent( - model.LogLevelDebug, - fmt.Sprintf("leaving tmpdir %s for inspection", cont.TmpDir), - )); err != nil { - return - } - } else { - if err := os.RemoveAll(cont.TmpDir); err != nil { - if err = p.Publish(ctx, docker.NewLogEvent( - model.LogLevelWarning, - fmt.Sprintf("failed to cleanup tmpdir (ephemeral mounts, etc): %s", err), - )); err != nil { - logrus.WithError(err).Error("publishing cleanup failure warning") - return - } - } - } - }) - return docker.ContainerWaiter{Waiter: wchan, Errs: errchan} -} - -func (s *PodmanClient) shipPodmanCmdLogs( - ctx context.Context, - r io.ReadCloser, - stdtype stdcopy.StdType, - p events.Publisher[docker.Event], -) { - cruntimes.ShipContainerCommandLogs(ctx, r, stdtype, p) -} - -func (s *PodmanClient) pprintPodmanCommand( - ctx context.Context, - args []string, - p events.Publisher[docker.Event], -) error { - return cruntimes.PprintCommand(ctx, "podman", args, p, s.log) -} diff --git a/agent/pkg/podman/podman_test.go b/agent/pkg/podman/podman_test.go deleted file mode 100644 index bf90c3a6fe5..00000000000 --- a/agent/pkg/podman/podman_test.go +++ /dev/null @@ -1,107 +0,0 @@ -package podman - -import ( - "reflect" - "testing" - - "github.com/docker/docker/api/types/container" - "github.com/docker/docker/api/types/mount" - - "github.com/determined-ai/determined/master/pkg/cproto" -) - -func Test_addHostMounts(t *testing.T) { - type args struct { - m mount.Mount - args []string - } - tests := []struct { - name string - args args - want []string - }{ - { - name: "Simple case", - args: args{ - m: mount.Mount{ - Source: "/host", - Target: "/container", - BindOptions: &mount.BindOptions{}, - }, - args: []string{}, - }, - want: []string{"--volume", "/host:/container"}, - }, - { - name: "Read-only", - args: args{ - m: mount.Mount{ - Source: "/host", - Target: "/container", - ReadOnly: true, - }, - args: []string{}, - }, - want: []string{"--volume", "/host:/container:ro"}, - }, - { - name: "Read-only with propagation", - args: args{ - m: 
mount.Mount{ - Type: "", - Source: "/host", - Target: "/container", - ReadOnly: true, - BindOptions: &mount.BindOptions{ - Propagation: "rprivate", - }, - }, - args: []string{}, - }, - want: []string{"--volume", "/host:/container:ro,rprivate"}, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := hostMountsToPodmanArgs(tt.args.m, tt.args.args); !reflect.DeepEqual(got, tt.want) { - t.Errorf("addHostMounts() = %v, want %v", got, tt.want) - } - }) - } -} - -func Test_processCapabilities(t *testing.T) { - type args struct { - req cproto.RunSpec - args []string - } - tests := []struct { - name string - args args - want []string - }{ - { - name: "Add/drop caps test", - args: args{ - req: cproto.RunSpec{ - HostConfig: container.HostConfig{ - CapAdd: []string{"add-one", "add-two"}, - CapDrop: []string{"drop-two", "drop-one"}, - }, - }, - args: []string{}, - }, - want: []string{ - "--cap-add", "add-one", "--cap-add", "add-two", - "--cap-drop", "drop-two", "--cap-drop", "drop-one", - }, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := capabilitiesToPodmanArgs(tt.args.req, tt.args.args); !reflect.DeepEqual(got, tt.want) { - t.Errorf("processCapabilities() = %v, want %v", got, tt.want) - } - }) - } -} diff --git a/agent/pkg/singularity/singularity.go b/agent/pkg/singularity/singularity.go deleted file mode 100644 index 0e931446103..00000000000 --- a/agent/pkg/singularity/singularity.go +++ /dev/null @@ -1,612 +0,0 @@ -package singularity - -import ( - "context" - "encoding/base64" - "fmt" - "io" - "os" - "os/exec" - "os/user" - "path" - "path/filepath" - "strconv" - "strings" - "sync" - "sync/atomic" - "syscall" - "time" - - "github.com/docker/docker/api/types" - dcontainer "github.com/docker/docker/api/types/container" - "github.com/docker/docker/api/types/filters" - "github.com/docker/docker/pkg/stdcopy" - "github.com/sirupsen/logrus" - - "github.com/determined-ai/determined/agent/internal/container" - "github.com/determined-ai/determined/agent/internal/options" - "github.com/determined-ai/determined/agent/pkg/cruntimes" - "github.com/determined-ai/determined/agent/pkg/docker" - "github.com/determined-ai/determined/agent/pkg/events" - "github.com/determined-ai/determined/master/pkg/aproto" - "github.com/determined-ai/determined/master/pkg/cproto" - "github.com/determined-ai/determined/master/pkg/device" - "github.com/determined-ai/determined/master/pkg/model" - "github.com/determined-ai/determined/master/pkg/syncx/waitgroupx" - "github.com/determined-ai/determined/master/pkg/tasks" -) - -var singularityWrapperEntrypoint = path.Join(tasks.RunDir, tasks.SingularityEntrypointWrapperScript) - -const ( - hostNetworking = "host" - bridgeNetworking = "bridge" - envFileName = "envfile" - archivesName = "archives" -) - -// SingularityContainer captures the state of a container. -type SingularityContainer struct { - PID int `json:"pid"` - Req cproto.RunSpec `json:"req"` - TmpDir string `json:"tmp_dir"` - Proc *os.Process `json:"-"` - Started atomic.Bool `json:"started"` -} - -// SingularityClient implements ContainerRuntime. -type SingularityClient struct { - log *logrus.Entry - opts options.SingularityOptions - mu sync.Mutex - wg waitgroupx.Group - containers map[cproto.ID]*SingularityContainer - agentTmp string - debug bool - isApptainer bool - imageRoot string // optional path name to root of image cache -} - -// New returns a new singularity client, which launches and tracks containers. 
-func New(opts options.Options) (*SingularityClient, error) { - agentTmp, err := cruntimes.BaseTempDirName(opts.AgentID) - if err != nil { - return nil, fmt.Errorf("unable to compose agentTmp directory path: %w", err) - } - - if err := os.RemoveAll(agentTmp); err != nil { - return nil, fmt.Errorf("removing agent tmp from previous runs: %w", err) - } - - if err := os.MkdirAll(agentTmp, 0o700); err != nil { - return nil, fmt.Errorf("preparing agent tmp: %w", err) - } - - // Validate image root (if provided) - if opts.ImageRoot != "" { - if _, err := os.ReadDir(opts.ImageRoot); err != nil { - return nil, fmt.Errorf("reading image root directory: %w", err) - } - } - - return &SingularityClient{ - log: logrus.WithField("component", "singularity"), - opts: opts.SingularityOptions, - wg: waitgroupx.WithContext(context.Background()), - containers: make(map[cproto.ID]*SingularityContainer), - agentTmp: agentTmp, - debug: opts.Debug, - isApptainer: opts.ContainerRuntime == options.ApptainerContainerRuntime, - imageRoot: opts.ImageRoot, - }, nil -} - -// Close the client, killing all running containers and removing our scratch space. -func (s *SingularityClient) Close() error { - s.mu.Lock() - defer s.mu.Unlock() - - // Since we launch procs with exec.CommandContext under s.wg's context, this cleans them up. - s.wg.Close() - - if err := os.RemoveAll(s.agentTmp); err != nil { - return fmt.Errorf("cleaning up agent tmp: %w", err) - } - return nil -} - -// PullImage implements container.ContainerRuntime. -func (s *SingularityClient) PullImage( - ctx context.Context, - req docker.PullImage, - p events.Publisher[docker.Event], -) (err error) { - // Singularity pull outputs a file, so skip it - return nil -} - -// CreateContainer implements container.ContainerRuntime. -func (s *SingularityClient) CreateContainer( - ctx context.Context, - id cproto.ID, - req cproto.RunSpec, - p events.Publisher[docker.Event], -) (string, error) { - s.mu.Lock() - defer s.mu.Unlock() - - s.containers[id] = &SingularityContainer{Req: req} - return id.String(), nil -} - -// RunContainer implements container.ContainerRuntime. -// nolint: golint,maintidx // Both contexts can't both be first / TODO refactor. -func (s *SingularityClient) RunContainer( - ctx context.Context, - waitCtx context.Context, - id string, - p events.Publisher[docker.Event], -) (*docker.Container, error) { - s.mu.Lock() - var cont *SingularityContainer - for cID, rcont := range s.containers { - if cproto.ID(id) != cID { - continue - } - cont = rcont - break - } - s.mu.Unlock() - - if cont == nil { - return nil, container.ErrMissing - } - req := cont.Req - - u, err := user.Current() - if err != nil { - return nil, fmt.Errorf("checking user: %w", err) - } - - uidgid := fmt.Sprintf("%s:%s", u.Uid, u.Gid) - if req.ContainerConfig.User != uidgid { - return nil, fmt.Errorf( - "agent running as %s cannot launch as user %s", - uidgid, req.ContainerConfig.User, - ) - } - - tmpdir, err := os.MkdirTemp(s.agentTmp, fmt.Sprintf("*-%s", id)) - if err != nil { - return nil, fmt.Errorf("making tmp dir for archives: %w", err) - } - - var args []string - args = append(args, "run") - args = append(args, "--writable-tmpfs") - args = append(args, "--pwd", req.ContainerConfig.WorkingDir) - - envFilePath := path.Join(tmpdir, envFileName) - envFile, err := os.OpenFile( - envFilePath, - os.O_APPEND|os.O_CREATE|os.O_WRONLY, - 0o600, - ) // #nosec G304 // We made this filepath, and it is randomized. 
- if err != nil { - return nil, fmt.Errorf("creating envfile %s: %w", envFilePath, err) - } - args = append(args, "--env-file", envFilePath) - - var b64EnvVars []string - for _, env := range req.ContainerConfig.Env { - parts := strings.SplitN(env, "=", 2) - - var formattedEnv string - switch len(parts) { - case 0: - continue // Must be empty envvar. - case 1: - formattedEnv = fmt.Sprintf("%s=", env) - case 2: - // Don't even attempt to escape quotes, strconv.Quote doesn't work - singularity seems - // to unescape it multiple times. - if strings.Contains(parts[1], "\"") { - b64EnvVars = append(b64EnvVars, parts[0]) - formattedEnv = fmt.Sprintf( - "%s=\"%s\"", - parts[0], base64.StdEncoding.EncodeToString([]byte(parts[1])), - ) - } else { - formattedEnv = fmt.Sprintf("%s=%s", parts[0], strconv.Quote(parts[1])) - } - } - - _, err = envFile.WriteString(formattedEnv + "\n") - if err != nil { - return nil, fmt.Errorf("writing to envfile: %w", err) - } - } - - _, err = envFile.WriteString(fmt.Sprintf( - "DET_B64_ENCODED_ENVVARS=%s\n", - strings.Join(b64EnvVars, ","), - )) - if err != nil { - return nil, fmt.Errorf("writing to envfile: %w", err) - } - detDebug := 0 - if s.debug { - detDebug = 1 - } - if _, err = envFile.WriteString(fmt.Sprintf("DET_DEBUG=%d\n", detDebug)); err != nil { - return nil, fmt.Errorf("writing to envfile: %w", err) - } - if err = envFile.Close(); err != nil { - return nil, fmt.Errorf("closing envfile: %w", err) - } - switch { - case req.HostConfig.NetworkMode == bridgeNetworking && s.opts.AllowNetworkCreation: - // --net sets up a bridge network by default - // (see https://apptainer.org/user-docs/3.0/networking.html#net) - args = append(args, "--net") - // Do the equivalent of Docker's PublishAllPorts = true - for port := range req.ContainerConfig.ExposedPorts { - p := port.Int() - args = append(args, "--network-args", fmt.Sprintf("portmap=%d:%d/tcp", p, p)) - } - case req.HostConfig.NetworkMode == bridgeNetworking: - if err = p.Publish(ctx, docker.NewLogEvent( - model.LogLevelDebug, - "container requested network virtualization, but network creation isn't allowed; "+ - "overriding to host networking", - )); err != nil { - return nil, err - } - req.HostConfig.NetworkMode = hostNetworking - fallthrough - case req.HostConfig.NetworkMode == hostNetworking: - default: - return nil, fmt.Errorf("unsupported network mode %s", req.HostConfig.NetworkMode) - } - - archivesPath := filepath.Join(tmpdir, archivesName) - mountPoints, wErr := cruntimes.ArchiveMountPoints(ctx, req, p, archivesPath, s.log) - if wErr != nil { - return nil, fmt.Errorf("determining mount points: %w", err) - } - for _, m := range mountPoints { - args = append(args, "--bind", fmt.Sprintf("%s:%s", path.Join(archivesPath, m), m)) - } - - for _, m := range req.HostConfig.Mounts { - if m.BindOptions != nil && m.BindOptions.Propagation != "rprivate" { // rprivate is default. - if err = p.Publish(ctx, docker.NewLogEvent(model.LogLevelWarning, fmt.Sprintf( - "mount %s:%s had propagation settings but singularity does not support this; "+ - "will bind mount anyway, without them", - m.Source, m.Target, - ))); err != nil { - return nil, err - } - } - bindMountTemplate := "%s:%s" - if m.ReadOnly { - bindMountTemplate += ":ro" - } - args = append(args, "--bind", fmt.Sprintf(bindMountTemplate, m.Source, m.Target)) - } - - if shmsize := req.HostConfig.ShmSize; shmsize != 4294967296 { // 4294967296 is the default. 
- if err = p.Publish(ctx, docker.NewLogEvent(model.LogLevelWarning, fmt.Sprintf( - "shmsize was requested as %d but singularity does not support this; "+ - "we do not launch with `--contain`, so we inherit the configuration of the host", - shmsize, - ))); err != nil { - return nil, err - } - } - - s.log.Tracef("Device type is %s", req.DeviceType) - if req.DeviceType == device.ROCM { - args = append(args, "--rocm") - } - - // Visible devices are set later by modifying the exec.Command's env. - var cudaVisibleDevices []string - for _, d := range cont.Req.HostConfig.DeviceRequests { - if d.Driver == "nvidia" { - cudaVisibleDevices = append(cudaVisibleDevices, d.DeviceIDs...) - } - } - if len(cudaVisibleDevices) > 0 { - args = append(args, "--nv", "--nvccli", "--contain") - if s.isApptainer { - // Using --userns to avoid this error when using --nvccli: - // FATAL: nvidia-container-cli not allowed in setuid mode - // (e.g.: casablanca, apptainer version 1.2.2-1) - args = append(args, "--userns") - } - } - - args = capabilitiesToSingularityArgs(req, args) - image := s.computeImageReference(req.ContainerConfig.Image) - args = append(args, image) - args = append(args, singularityWrapperEntrypoint) - args = append(args, req.ContainerConfig.Cmd...) - - if err = s.pprintSingularityCommand(ctx, args, p); err != nil { - return nil, err - } - - // #nosec G204 // We launch arbitrary user code as a service. - cmd := exec.CommandContext(waitCtx, "singularity", args...) - stdout, err := cmd.StdoutPipe() - if err != nil { - return nil, fmt.Errorf("creating stdout pipe: %w", err) - } - stderr, err := cmd.StderrPipe() - if err != nil { - return nil, fmt.Errorf("creating stderr pipe: %w", err) - } - s.wg.Go(func(ctx context.Context) { s.shipSingularityCmdLogs(ctx, stdout, stdcopy.Stdout, p) }) - s.wg.Go(func(ctx context.Context) { s.shipSingularityCmdLogs(ctx, stderr, stdcopy.Stderr, p) }) - - s.setCommandEnvironment(req, cudaVisibleDevices, cmd) - - if err := cmd.Start(); err != nil { - return nil, fmt.Errorf("starting singularity container: %w", err) - } - - cont.PID = cmd.Process.Pid - cont.Proc = cmd.Process - cont.TmpDir = tmpdir - cont.Started.Store(true) - at := time.Now().String() - s.log.Infof("started container %s with pid %d", id, cont.PID) - - return &docker.Container{ - ContainerInfo: types.ContainerJSON{ - ContainerJSONBase: &types.ContainerJSONBase{ - ID: strconv.Itoa(cont.Proc.Pid), - Created: at, - Path: singularityWrapperEntrypoint, - Args: req.ContainerConfig.Cmd, - State: &types.ContainerState{ - Status: "running", - Running: true, - Pid: cont.Proc.Pid, - StartedAt: at, - }, - Image: image, - HostConfig: &dcontainer.HostConfig{ - NetworkMode: req.HostConfig.NetworkMode, - }, - }, - Config: &dcontainer.Config{ - ExposedPorts: req.ContainerConfig.ExposedPorts, - }, - }, - ContainerWaiter: s.waitOnContainer(cproto.ID(id), cont, p), - }, nil -} - -// computeImageReference computes a reference to the requested image. If an image cache -// directory is defined and it contains the required image, a path name to the cached image -// will be returned, else a docker reference so that the image can be downloaded. 
-func (s *SingularityClient) computeImageReference(requestedImage string) string { - s.log.Tracef("requested image: %s", requestedImage) - if s.imageRoot != "" { - cachePathName := s.imageRoot + "/" + requestedImage - _, err := os.Stat(cachePathName) - if err == nil { - return cachePathName - } - s.log.Tracef("image is not in cache: %s", err.Error()) - } - return cruntimes.CanonicalizeImage(requestedImage) -} - -// Sets the environment of the process that will run the Singularity/apptainer command. -func (s *SingularityClient) setCommandEnvironment( - req cproto.RunSpec, cudaVisibleDevices []string, cmd *exec.Cmd, -) { - // Per https://pkg.go.dev/os/exec#Cmd.Env, if cmd.Env is nil, the new process uses the current - // process's environment. If this is not the case, for example because we specify something to - // control Singularity operation, then we need to explicitly specify any value needed from the - // current environment. - if req.DeviceType == device.CUDA { - cudaVisibleDevicesVar := strings.Join(cudaVisibleDevices, ",") - - // NVIDIA_VISIBLE_DEVICES together with options on the singularity/apptainer run - // command control the visibility of the GPUs in the container. - visibleDevices := fmt.Sprintf("NVIDIA_VISIBLE_DEVICES=%s", cudaVisibleDevicesVar) - s.log.Trace(visibleDevices) - cmd.Env = append(cmd.Env, visibleDevices) - } - if req.DeviceType == device.ROCM { - // Avoid this problem: https://github.com/determined-ai/determined-ee/pull/922 - // by not setting both ROCR_VISIBLE_DEVICES & CUDA_VISIBLE_DEVICES - s.addToTargetCommandEnvironmentIfSet(cmd, "ROCR_VISIBLE_DEVICES") - } - - s.addOptionalRegistryAuthCredentials(req, cmd) - - // HACK(singularity): without this, --nv doesn't work right. If the singularity run command - // cannot find nvidia-smi, the --nv fails to make it available inside the container, e.g., - // env -i /usr/bin/singularity run --nv \\ - // docker://determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-24586f0 nvidia-smi - cmd.Env = append(cmd.Env, fmt.Sprintf("PATH=%s", os.Getenv("PATH"))) - - s.addToTargetCommandEnvironmentIfSet(cmd, "http_proxy") - s.addToTargetCommandEnvironmentIfSet(cmd, "https_proxy") - s.addToTargetCommandEnvironmentIfSet(cmd, "no_proxy") - s.addToTargetCommandEnvironmentIfSet(cmd, "APPTAINER_CACHEDIR") - s.addToTargetCommandEnvironmentIfSet(cmd, "SINGULARITY_CACHEDIR") -} - -// addToTargetCommandEnvironmentIfSet adds to the target command environment the value of -// the specified environment variable from the current process, if set. 
-func (s *SingularityClient) addToTargetCommandEnvironmentIfSet(cmd *exec.Cmd, variable string) { - if value, present := os.LookupEnv(variable); present { - s.log.Debugf("Forwarding %s=%s", variable, value) - cmd.Env = append(cmd.Env, fmt.Sprintf("%s=%s", variable, value)) - } -} - -func (s *SingularityClient) addOptionalRegistryAuthCredentials(req cproto.RunSpec, cmd *exec.Cmd) { - s.log.Trace("Checking for supplied credentials") - if req.Registry != nil { - cmd.Env = append(cmd.Env, - fmt.Sprintf("SINGULARITY_DOCKER_USERNAME=%s", req.Registry.Username), - fmt.Sprintf("SINGULARITY_DOCKER_PASSWORD=%s", req.Registry.Password), - fmt.Sprintf("APPTAINER_DOCKER_USERNAME=%s", req.Registry.Username), - fmt.Sprintf("APPTAINER_DOCKER_PASSWORD=%s", req.Registry.Password)) - } -} - -func capabilitiesToSingularityArgs(req cproto.RunSpec, args []string) []string { - if len(req.HostConfig.CapAdd) > 0 { - args = append(args, "--add-caps", strings.Join(req.HostConfig.CapAdd, ",")) - } - if len(req.HostConfig.CapDrop) > 0 { - args = append(args, "--drop-caps", strings.Join(req.HostConfig.CapDrop, ",")) - } - return args -} - -// ReattachContainer implements container.ContainerRuntime. -// TODO(DET-9082): Ensure orphaned processes are cleaned up on reattach. -func (s *SingularityClient) ReattachContainer( - ctx context.Context, - reattachID cproto.ID, -) (*docker.Container, *aproto.ExitCode, error) { - return nil, nil, container.ErrMissing -} - -// RemoveContainer implements container.ContainerRuntime. -func (s *SingularityClient) RemoveContainer(ctx context.Context, id string, force bool) error { - s.mu.Lock() - defer s.mu.Unlock() - - cont, ok := s.containers[cproto.ID(id)] - if !ok { - return container.ErrMissing - } - - if cont.Started.Load() { - return cont.Proc.Kill() - } - return fmt.Errorf("cannot kill container %s that is not started", id) -} - -// SignalContainer implements container.ContainerRuntime. -func (s *SingularityClient) SignalContainer( - ctx context.Context, - id string, - sig syscall.Signal, -) error { - s.mu.Lock() - defer s.mu.Unlock() - - cont, ok := s.containers[cproto.ID(id)] - if !ok { - return container.ErrMissing - } - - if cont.Started.Load() { - return cont.Proc.Signal(sig) - } - return fmt.Errorf("cannot signal container %s with %s that is not started", id, sig) -} - -// ListRunningContainers implements container.ContainerRuntime. 
-func (s *SingularityClient) ListRunningContainers( - ctx context.Context, - fs filters.Args, -) (map[cproto.ID]types.Container, error) { - resp := make(map[cproto.ID]types.Container) - - s.mu.Lock() - defer s.mu.Unlock() - for id, cont := range s.containers { - resp[id] = types.Container{ - ID: string(id), - Labels: cont.Req.ContainerConfig.Labels, - } - } - return resp, nil -} - -func (s *SingularityClient) waitOnContainer( - id cproto.ID, - cont *SingularityContainer, - p events.Publisher[docker.Event], -) docker.ContainerWaiter { - wchan := make(chan dcontainer.WaitResponse, 1) - errchan := make(chan error) - s.wg.Go(func(ctx context.Context) { - defer close(wchan) - defer close(errchan) - - var body dcontainer.WaitResponse - switch state, err := cont.Proc.Wait(); { - case ctx.Err() != nil && err == nil && state.ExitCode() == -1: - s.log.Trace("detached from container process") - return - case err != nil: - s.log.Tracef("proc %d for container %s exited: %s", cont.PID, id, err) - body.Error = &dcontainer.WaitExitError{Message: err.Error()} - default: - s.log.Tracef("proc %d for container %s exited with %d", cont.PID, id, state.ExitCode()) - body.StatusCode = int64(state.ExitCode()) - } - - select { - case wchan <- body: - case <-ctx.Done(): - return - } - - s.mu.Lock() - defer s.mu.Unlock() - s.log.Tracef("forgetting completed container: %s", id) - delete(s.containers, id) - - // Defer file cleanup until restart if debug logging is enabled. - if s.log.Logger.Level <= logrus.DebugLevel { - if err := p.Publish(ctx, docker.NewLogEvent( - model.LogLevelDebug, - fmt.Sprintf("leaving tmpdir %s for inspection", cont.TmpDir), - )); err != nil { - return - } - } else { - if err := os.RemoveAll(cont.TmpDir); err != nil { - if err = p.Publish(ctx, docker.NewLogEvent( - model.LogLevelWarning, - fmt.Sprintf("failed to cleanup tmpdir (ephemeral mounts, etc): %s", err), - )); err != nil { - logrus.WithError(err).Error("publishing cleanup failure warning") - return - } - } - } - }) - return docker.ContainerWaiter{Waiter: wchan, Errs: errchan} -} - -func (s *SingularityClient) shipSingularityCmdLogs( - ctx context.Context, - r io.ReadCloser, - stdtype stdcopy.StdType, - p events.Publisher[docker.Event], -) { - cruntimes.ShipContainerCommandLogs(ctx, r, stdtype, p) -} - -func (s *SingularityClient) pprintSingularityCommand( - ctx context.Context, - args []string, - p events.Publisher[docker.Event], -) error { - return cruntimes.PprintCommand(ctx, "singularity", args, p, s.log) -} diff --git a/agent/pkg/singularity/singularity_api_test.go b/agent/pkg/singularity/singularity_api_test.go deleted file mode 100644 index 98b95b9d277..00000000000 --- a/agent/pkg/singularity/singularity_api_test.go +++ /dev/null @@ -1,80 +0,0 @@ -//go:build integration && singularity - -package singularity_test - -import ( - "context" - "fmt" - "os/user" - "testing" - - "github.com/docker/docker/api/types/container" - "github.com/docker/docker/api/types/network" - "github.com/docker/docker/api/types/registry" - "github.com/docker/docker/api/types/strslice" - "github.com/sirupsen/logrus" - "github.com/stretchr/testify/require" - - "github.com/determined-ai/determined/agent/internal/options" - "github.com/determined-ai/determined/agent/pkg/docker" - "github.com/determined-ai/determined/agent/pkg/events" - "github.com/determined-ai/determined/agent/pkg/singularity" - "github.com/determined-ai/determined/master/pkg/cproto" - "github.com/determined-ai/determined/master/pkg/schemas/expconf" -) - -// TODO(DET-9077): Get coverage to 
70-80%.
-func TestSingularity(t *testing.T) {
-	logrus.SetLevel(logrus.TraceLevel)
-	ctx, cancel := context.WithCancel(context.Background())
-	defer cancel()
-
-	t.Log("creating client")
-	cl, err := singularity.New(options.SingularityOptions{})
-	require.NoError(t, err)
-
-	t.Log("pulling container image")
-	image := fmt.Sprintf("docker://%s", expconf.CPUImage)
-	cprotoID := cproto.NewID()
-	evs := make(chan docker.Event, 1024)
-	pub := events.ChannelPublisher(evs)
-	err = cl.PullImage(ctx, docker.PullImage{
-		Name:     image,
-		Registry: &registry.AuthConfig{},
-	}, pub)
-	require.NoError(t, err)
-
-	t.Log("creating container")
-	u, err := user.Current()
-	require.NoError(t, err)
-
-	id, err := cl.CreateContainer(
-		ctx,
-		cprotoID,
-		cproto.RunSpec{
-			ContainerConfig: container.Config{
-				Image: image,
-				Cmd:   strslice.StrSlice{"/run/determined/train/entrypoint.sh"},
-				Env:   []string{},
-				User:  fmt.Sprintf("%s:%s", u.Uid, u.Gid),
-			},
-			HostConfig: container.HostConfig{
-				NetworkMode: "host",
-			},
-			NetworkingConfig: network.NetworkingConfig{},
-			Archives:         []cproto.RunArchive{},
-		},
-		pub,
-	)
-	require.NoError(t, err)
-
-	t.Log("running container")
-	waiter, err := cl.RunContainer(ctx, ctx, id, pub)
-	require.NoError(t, err)
-
-	select {
-	case res := <-waiter.ContainerWaiter.Waiter:
-		require.Nil(t, res.Error)
-	case <-ctx.Done():
-	}
-}
diff --git a/docs/.redirects/redirects.json b/docs/.redirects/redirects.json
index 51390a604a8..7f6023f8b3a 100644
--- a/docs/.redirects/redirects.json
+++ b/docs/.redirects/redirects.json
@@ -13,7 +13,8 @@
     "architecture/system-architecture": "../get-started/architecture/system-architecture.html",
     "architecture/introduction": "../get-started/architecture/introduction.html",
     "setup-cluster/deploy-cluster/slurm/install-on-slurm": "../../slurm/install-on-slurm.html",
-    "setup-cluster/deploy-cluster/slurm/hpc-with-agent": "../../slurm/hpc-with-agent.html",
+    "setup-cluster/deploy-cluster/slurm/hpc-with-agent": "../../slurm/_index.html",
+    "setup-cluster/slurm/hpc-with-agent": "../slurm/_index.html",
     "setup-cluster/deploy-cluster/slurm/hpc-launching-architecture": "../../slurm/hpc-launching-architecture.html",
     "setup-cluster/deploy-cluster/slurm/slurm-requirements": "../../slurm/slurm-requirements.html",
     "setup-cluster/deploy-cluster/slurm/upgrade-on-hpc": "../../slurm/upgrade-on-hpc.html",
diff --git a/docs/release-notes/deprecate-containerRuntime.rst b/docs/release-notes/deprecate-containerRuntime.rst
new file mode 100644
index 00000000000..4ce0d854ecd
--- /dev/null
+++ b/docs/release-notes/deprecate-containerRuntime.rst
@@ -0,0 +1,15 @@
+:orphan:
+
+**Deprecations**
+
+- AgentRM: Support for Singularity, Podman, and Apptainer was deprecated in 0.33.0 and is now
+  removed. Docker is the only container runtime supported by the agent resource manager (AgentRM).
+  It is still possible to use Podman with AgentRM through Podman's Docker emulation layer. For
+  detailed instructions, visit `Emulating Docker CLI with Podman
+  `. You
+  might also need to configure ``checkpoint_storage`` in the experiment or master configuration:
+  `Master Config Reference
+  <https://docs.determined.ai/latest/reference/deploy/master-config-reference.html#checkpoint-storage>`_.
+
+In the enterprise edition, the Slurm resource manager still supports Singularity, Podman, and
+Apptainer. For detailed instructions, visit :ref:`deploy-on-slurm-pbs`.
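+
+As a rough sketch of the emulation-layer route (assuming a rootless Podman installation that
+provides the ``podman.socket`` systemd unit; socket paths may differ on your system), the agent's
+Docker client can be pointed at Podman's Docker-compatible API socket before the agent starts:
+
+.. code:: bash
+
+   # Expose Podman's Docker-compatible API socket for the current user.
+   systemctl --user enable --now podman.socket
+
+   # Point Docker API clients, including determined-agent, at the Podman socket.
+   export DOCKER_HOST=unix:///run/user/$(id -u)/podman/podman.sock
+
+   # Start the agent as usual; it talks to Podman through the Docker Engine API.
+   determined-agent --config-file agent.yaml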
diff --git a/docs/setup-cluster/slurm/_index.rst b/docs/setup-cluster/slurm/_index.rst index 375d953c5f7..8c0e08418c7 100644 --- a/docs/setup-cluster/slurm/_index.rst +++ b/docs/setup-cluster/slurm/_index.rst @@ -58,4 +58,3 @@ follow the steps in the :ref:`install-on-slurm` document. upgrade-on-hpc singularity slurm-known-issues - hpc-with-agent diff --git a/docs/setup-cluster/slurm/hpc-with-agent.rst b/docs/setup-cluster/slurm/hpc-with-agent.rst deleted file mode 100644 index add00194406..00000000000 --- a/docs/setup-cluster/slurm/hpc-with-agent.rst +++ /dev/null @@ -1,152 +0,0 @@ -.. _hpc-with-agent: - -#################### - Agent on Slurm/PBS -#################### - -As an alternative to using the HPC Launcher, you may instead utilize the Determined agent. In this -usage model, the system administrator creates a custom resource pool for each Determined user. You -then start a Determined agent on one or more compute nodes of the cluster using Slurm or PBS -commands to provide the resources for your resource pool. As work is submitted to this resource -pool, it is distributed to the set of available agents. If your Slurm/PBS job is terminated (for -example due to a time limit) before your Determined work is completed, your Determined work remains -in your resource pool until additional agents are started. You may add additional resources to your -resource pool by starting additional agents on your cluster. If your Determined work is complete -before any time limits are hit on the Slurm/PBS job providing resources, you terminate the agent -jobs manually using Slurm/PBS commands. - -The primary advantages of this model are: - -#. You have dedicated access to the compute resources provided by the agents you start for the - duration of your HPC job. This can provide more predictable throughput as it avoids contention in - a highly utilized cluster. - -#. Your Determined experiments are seen by the workload manager as a single large job, rather than - many smaller jobs. In some HPC environments, larger jobs are given preference in workload manager - scheduling. - -#. If you have jobs of different sizes sharing the same set of resources, you reduce the potential - for fragmentation where larger jobs may be delayed in running because the free resources are - distributed across many nodes. - -#. It eliminates the need for user impersonation, which the HPC Launcher uses to submit jobs to the - Slurm or PBS workload manager on your behalf, using a sudo configuration. - -There are several disadvantages to this model as well: - -#. You must interact with Slurm or PBS directly to submit and terminate jobs. Using the HPC launcher - provides a more seamless user experience that focuses solely on interacting with Determined - commands and interfaces. - -#. Overall system utilization will likely be less. Direct human control over resource allocation and - release introduces inefficiency. If you fail to keep sufficient work queued up in your resource - pool or fail to terminate the Determined agents when you are through, you prevent other users - from accessing those resources. - -***************************************** - Install the Determined Master and Agent -***************************************** - -Before users can make use of Determined agents, a system administrator must provide the following: - -#. 
The system administrator installs the on-premise Determined master component, as described in - :ref:`install-using-linux-packages`, and the Determined agent on all nodes of the cluster, but - does not enable or start the ``determined-agent.service``. - -#. The system administrator creates a custom resource pool in the :ref:`cluster-resource-pools` - configuration for each Determined user in the ``master.yaml``. A fragment for creating custom - resource pools for ``user`` and ``user2`` using the default settings is as follows: - - .. code:: yaml - - resource_pools: - - pool_name: user1 - - pool_name: user2 - - It is recommended that :ref:`rbac` be used to limit access to the intended user of each of these - resource pools. - -*************************************** - Create a per-user Agent Configuration -*************************************** - -This step may be completed either by the system administrator or the intended user. In a -cluster-wide shared directory (examples in this section use ``$HOME``), create an ``agent.yaml`` -file. Below is a minimal example using a resource pool named for the user (``$USER``) and -``singularity`` as the container runtime platform. If configured using variables such as ``$HOME``, -a single ``agent.yaml`` could be shared by all users. - -.. code:: yaml - - master_host: master.mycluster.com - master_port: 8090 - resource_pool: $USER - container_runtime: singularity - -There are several other settings commonly configured in the `agent.yaml` which are listed in the -table below. For the full list of options, see :ref:`agent-config-reference`. - -+----------------------------+----------------------------------------------------------------+ -| Option | Description | -+============================+================================================================+ -| ``image_root`` | To avoid multiple image downloads, configure an image cache as | -| | per :ref:`singularity-image-cache` | -+----------------------------+----------------------------------------------------------------+ -| ``container_runtime`` | Instead of ``singularity``, you could specify ``podman`` as | -| | the container runtime. | -+----------------------------+----------------------------------------------------------------+ -| ``security`` | Secure the communications between the master and agent using | -| | TLS. Configure the sections of the ``security`` block as per | -| | :ref:`tls`. | -+----------------------------+----------------------------------------------------------------+ - -**************************************************** - Start Per-User Agents to Provide Compute Resources -**************************************************** - -The user may then start one or more agents to provide resources to their resource pool using the -agent.yaml configured above. - -In the command examples below, it is assumed that the agent.yaml for a given user is provided in -`$HOME``. Paths may need to be updated depending on your local configuration. - -On Slurm, you can allocate resources with the ``srun`` or ``sbatch`` commands with the desired -resource configuration options. - -.. code:: bash - - srun --gpus=8 /usr/bin/determined-agent --config-file $HOME/agent.yaml - -or - -.. code:: bash - - sbatch -N4 --gpus-per-node=tesla:4 --wrap="srun /usr/bin/determined-agent --config-file $HOME/agent.yaml" - -On PBS, you can launch the agent on multiple nodes with the qsub command. - -.. 
code:: bash - - qsub -l select=2:ngpus=4 -- /opt/pbs/bin/pbsdsh -- /usr/bin/determined-agent --config-file $HOME/agent.yaml - -You can add incremental resources to your resource pool, by submitting an additional job and -starting additional agents. - -************************************************** - Launch Jobs and Experiments on the Resource Pool -************************************************** - -You can then submit experiments or other tasks to the agents you have started by selecting the -proper resource pool. The resource pool to be used can be specified on the command line or via the -experiment config using the ``resources.resource_pool`` setting. - -.. code:: bash - - det command run --config resources.resource_pool=$USER hostname - -******************************* - Release the Cluster Resources -******************************* - -When your jobs and experiments have been completed, be sure to release the resources by canceling -your Slurm/PBS job. diff --git a/tools/slurm/README.md b/tools/slurm/README.md index 5fe34b25698..e88e4d7a39d 100644 --- a/tools/slurm/README.md +++ b/tools/slurm/README.md @@ -5,7 +5,7 @@ 1. Install Terraform following [these instructions](https://developer.hashicorp.com/terraform/downloads). 2. Download the [GCP CLI](https://cloud.google.com/sdk/docs/install-sdk) and run `gcloud auth application-default login` to get credentials. 3. Run `make slurmcluster` from the root of the repo and wait (up to 10 minutes) for it to start. - - To specify which container runtime environment to use, pass in `FLAGS="-c {container_run_type}"` to `make slurmcluster`. Choose from either `singularity` (default), `podman`, or `enroot`. + - To specify which container runtime environment to use, pass in `FLAGS="-c {container_run_type}"` to `make slurmcluster`. You can choose from ``singularity`` (default), ``podman``, or ``enroot``. We no longer support Slurmcluster with Determined Agents. - To specify which workload manager to use, pass in `FLAGS="-w {workload_manager}"` to `make slurmcluster`. Choose from either `slurm` (default) or `pbs`. Note: in specifying the workload manager, `make slurmcluster` will automatically load the appropriate boot disk image (found in `terraform/images.conf`). - The default configuration yields a Slurm cluster with a single compute node and 8 CPUs (`n1-standard-8`). You can control the machine_type, and gpus of the compute node using `FLAGS="-m {machine_type} -g {gpu_type}:{count}"`. See below. - By default, all VMs created with `make slurmcluster` will be destroyed after 7200 seconds (2 hours). To specify a different amount of time, pass in `FLAGS="-t {0-9}[d|h|m|s]"` to `make slurmcluster`. @@ -102,27 +102,8 @@ One can load a developer launcher on their dev box created by `make slurmcluster 3. In the [hpc-ard-capsules-core repository]([d.com](https://github.hpe.com/hpe/hpc-ard-capsules-core)), run `./loadDevLauncher.sh -g [$USER]@[EXTERNAL_IP]` which will spin up a developer launcher on port 18080 on the specified VM. 4. From the root of this repository, run `make slurmcluster FLAGS="-d"` which will start a devcluster pointing at port 18080 on the instance. -## Using Slurmcluster with Determined Agents - -`make slurmcluster` supports using Determined agents to run jobs. To do this with `make slurmcluster` do the following steps from the `determined-ee/` directory: - -1. `make -C agent build package` -2. `make slurmcluster FLAGS="-A"` -3. 
`gcloud compute scp agent/dist/determined-agent_linux_amd64_v1/determined-agent $USER-dev-box:/home/$USER --zone us-west1-b` - -The `FLAGS="-A"` in `make slurmcluster` removes the resource_manager section in the slurmcluster.yaml that would otherwise be used. This then defaults to the agent rm and the master waits for agents to connect and provide resources. The scp command brings the determined-agent to the dev-box. `$USER` will be replaced with your username when initiating GCP. - -Then, connect to your dev-box. This can be done with `make -C tools/slurm/terraform connect` or `gcloud compute ssh $USER-dev-box --project=determined-ai --zone=us-west1-b`. Input the following command on the devbox in order to allocate resources on slurm. - -`srun $HOME/determined-agent --master-host=localhost --master-port=8080 --resource-pool=default --container-runtime=singularity` - -You can also use podman by changing the value for `container-runtime` to `podman`. - -This command allocates the 8-core CPU that is on the GCP machine. Unfortunately, there are currently no gpus on the VM so we can not allocate any. - -Now, you can launch jobs like normal using the Determined CLI. You can check the status of the allocated resources using `det slot list`. - -If you encounter an issue with jobs failing due to `ModuleNotFoundError: No module named 'determined'` run `make clean all` to rebuild determined. +### Note: Slurmcluster with Determined Agents is no longer supported. + # Running pytest Suites ## In Development diff --git a/tools/slurm/scripts/slurmcluster.sh b/tools/slurm/scripts/slurmcluster.sh index 1a2c15a1e2e..fc7ea602de9 100755 --- a/tools/slurm/scripts/slurmcluster.sh +++ b/tools/slurm/scripts/slurmcluster.sh @@ -12,7 +12,6 @@ VALID_WORKLOAD_MANAGERS="slurm pbs" export OPT_CONTAINER_RUN_TYPE="singularity" export OPT_WORKLOAD_MANAGER="slurm" export OPT_LAUNCHER_PORT=8081 -DETERMINED_AGENT= MACHINE_TYPE= GPUS= @@ -34,10 +33,6 @@ while [[ $# -gt 0 ]]; do fi shift 2 ;; - -A) - DETERMINED_AGENT=1 - shift - ;; -m) # This is processed already by generate-tfvars.sh MACHINE_TYPE=$2 @@ -73,9 +68,6 @@ while [[ $# -gt 0 ]]; do echo "as always." echo "" echo "FLAGS:" - echo ' -A ' - echo " Description: Invokes a slurmcluster that uses agents instead of the launcher." - echo " Example: $0 -A" echo ' -c {enroot|podman|singularity}' echo " Description: Invokes a slurmcluster using the specified container run type." echo " Options are 'enroot', 'podman', or 'singularity'. Default is 'singularity'." @@ -100,7 +92,7 @@ while [[ $# -gt 0 ]]; do echo "You can also combine the flags. When invoked via 'make slurmcluster' flags are passed" echo 'via the FLAGS="options" argument.' echo "" - echo ' Example: FLAGS="-A -c enroot -w pbs -g nvidia-tesla-t4:2"' + echo ' Example: FLAGS="-c enroot -w pbs -g nvidia-tesla-t4:2"' echo "" exit 0 ;; @@ -148,15 +140,13 @@ trap 'kill $TUNNEL_PID' EXIT echo "Started bidirectional tunnels to $INSTANCE_NAME" # Grab launcher token. -if [[ -z $DETERMINED_AGENT ]]; then - REMOTE_TOKEN_SOURCE=/opt/launcher/jetty/base/etc/.launcher.token - LOCAL_TOKEN_DEST=$TEMPDIR/.launcher.token - gcloud compute scp --quiet --zone "us-west1-b" --project "determined-ai" root@$INSTANCE_NAME:$REMOTE_TOKEN_SOURCE $LOCAL_TOKEN_DEST - echo "Copied launcher token to $LOCAL_TOKEN_DEST" - # The launcher service verifies communication with the launcher, so just have to be sure it is started. - # Also show the status for extra confirmaiton in the logs of the state. 
-    gcloud compute ssh --zone "$ZONE" "$INSTANCE_NAME" --project "$PROJECT" -- "sudo systemctl start launcher.service ; systemctl status launcher.service --no-pager"
-fi
+REMOTE_TOKEN_SOURCE=/opt/launcher/jetty/base/etc/.launcher.token
+LOCAL_TOKEN_DEST=$TEMPDIR/.launcher.token
+gcloud compute scp --quiet --zone "us-west1-b" --project "determined-ai" root@$INSTANCE_NAME:$REMOTE_TOKEN_SOURCE $LOCAL_TOKEN_DEST
+echo "Copied launcher token to $LOCAL_TOKEN_DEST"
+# The launcher service verifies communication with the launcher, so just have to be sure it is started.
+# Also show the status for extra confirmation in the logs of the state.
+gcloud compute ssh --zone "$ZONE" "$INSTANCE_NAME" --project "$PROJECT" -- "sudo systemctl start launcher.service ; systemctl status launcher.service --no-pager"
 
 # Build devcluster.yaml.
 
@@ -216,12 +206,6 @@ fi
 TEMPYAML=$TEMPDIR/slurmcluster.yaml
 envsubst <$PARENT_PATH/slurmcluster.yaml >$TEMPYAML
 
-if [[ -n $DETERMINED_AGENT ]]; then
-    # When deploying with the determined agent, remove the resource_manager section
-    # that would otherwise be used. This then defaults to the agent rm and
-    # the master waits for agents to connect and provide resources.
-    sed -i -e '/resource_manager/,/resource_manager_end/d' $TEMPYAML
-fi
 echo "Generated devcluster file: $TEMPYAML"
 
 # We connect to the Slurm VM using an external IP address, but although it's a