From d5d0c06a7d2b58f57768fb9776478f0261d32efa Mon Sep 17 00:00:00 2001 From: Joe McCormick <31295332+iamjoemccormick@users.noreply.github.com> Date: Mon, 5 May 2025 23:20:52 +0000 Subject: [PATCH 01/13] wip: initial agent skeleton --- agent/cmd/beegfs-agent/main.go | 107 ++++++++++++++++++++++++++++++++ agent/internal/config/config.go | 29 +++++++++ agent/internal/server/server.go | 105 +++++++++++++++++++++++++++++++ agent/pkg/agent/reconciler.go | 79 +++++++++++++++++++++++ 4 files changed, 320 insertions(+) create mode 100644 agent/cmd/beegfs-agent/main.go create mode 100644 agent/internal/config/config.go create mode 100644 agent/internal/server/server.go create mode 100644 agent/pkg/agent/reconciler.go diff --git a/agent/cmd/beegfs-agent/main.go b/agent/cmd/beegfs-agent/main.go new file mode 100644 index 00000000..ccc2846f --- /dev/null +++ b/agent/cmd/beegfs-agent/main.go @@ -0,0 +1,107 @@ +package main + +import ( + "fmt" + "log" + "os" + "os/signal" + "syscall" + + "github.com/spf13/pflag" + "github.com/thinkparq/beegfs-go/agent/internal/config" + "github.com/thinkparq/beegfs-go/agent/internal/server" + "github.com/thinkparq/beegfs-go/agent/pkg/agent" + "github.com/thinkparq/beegfs-go/common/configmgr" + "github.com/thinkparq/beegfs-go/common/logger" + "go.uber.org/zap" +) + +const ( + envVarPrefix = "BEEAGENT_" +) + +// Set by the build process using ldflags. +var ( + binaryName = "unknown" + version = "unknown" + commit = "unknown" + buildTime = "unknown" +) + +func main() { + pflag.Bool("version", false, "Print the version then exit.") + pflag.String("cfg-file", "", "The path to the a configuration file (can be omitted to set all configuration using flags and/or environment variables). 
When Remote Storage Targets are configured using a file, they can be updated without restarting the application.") + pflag.String("log.type", "stderr", "Where log messages should be sent ('stderr', 'stdout', 'syslog', 'logfile').") + pflag.String("log.file", "/var/log/beegfs/beegfs-remote.log", "The path to the desired log file when logType is 'log.file' (if needed the directory and all parent directories will be created).") + pflag.Int8("log.level", 3, "Adjust the logging level (0=Fatal, 1=Error, 2=Warn, 3=Info, 4+5=Debug).") + pflag.Int("log.max-size", 1000, "When log.type is 'logfile' the maximum size of the log.file in megabytes before it is rotated.") + pflag.Int("log.num-rotated-files", 5, "When log.type is 'logfile' the maximum number old log.file(s) to keep when log.max-size is reached and the log is rotated.") + pflag.Bool("log.developer", false, "Enable developer logging including stack traces and setting the equivalent of log.level=5 and log.type=stdout (all other log settings are ignored).") + pflag.String("server.address", "0.0.0.0:9008", "The hostname:port where this Agent should listen for requests from the BeeGFS CTL tool.") + pflag.String("server.tls-cert-file", "/etc/beegfs/cert.pem", "Path to a certificate file that provides the identify of this Agent's gRPC server.") + pflag.String("server.tls-key-file", "/etc/beegfs/key.pem", "Path to the key file belonging to the certificate for this Agent's gRPC server.") + pflag.Bool("server.tls-disable", false, "Disable TLS entirely for gRPC communication to this Agent's gRPC server.") + pflag.CommandLine.SortFlags = false + pflag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage of %s:\n", os.Args[0]) + pflag.PrintDefaults() + helpText := ` +Further info: + Configuration may be set using a mix of flags, environment variables, and values from a TOML configuration file. 
+ Configuration will be merged using the following precedence order (highest->lowest): (1) flags (2) environment variables (3) configuration file (4) defaults. +Using environment variables: + To specify configuration using environment variables specify %sKEY=VALUE where KEY is the flag name you want to specify in all capitals replacing dots (.) with a double underscore (__) and hyphens (-) with an underscore (_). + Examples: + export %sLOG__DEBUG=true +` + fmt.Fprintf(os.Stderr, helpText, envVarPrefix, envVarPrefix) + os.Exit(0) + } + pflag.Parse() + + if printVersion, _ := pflag.CommandLine.GetBool("version"); printVersion { + fmt.Printf("%s %s (commit: %s, built: %s)\n", binaryName, version, commit, buildTime) + os.Exit(0) + } + + cfgMgr, err := configmgr.New(pflag.CommandLine, envVarPrefix, &config.AppConfig{}) + if err != nil { + log.Fatalf("unable to get initial configuration: %s", err) + } + c := cfgMgr.Get() + initialCfg, ok := c.(*config.AppConfig) + if !ok { + log.Fatalf("configuration manager returned invalid configuration (expected Agent application configuration)") + } + if initialCfg.Developer.DumpConfig { + fmt.Printf("Dumping AppConfig and exiting...\n\n") + fmt.Printf("%+v\n", initialCfg) + os.Exit(0) + } + + logger, err := logger.New(initialCfg.Log) + if err != nil { + log.Fatalf("unable to initialize logger: %s", err) + } + defer logger.Sync() + + agentServer, err := server.New(logger.Logger, initialCfg.Server, agent.New(logger.Logger, initialCfg.Agent)) + if err != nil { + logger.Fatal("unable to initialize gRPC server", zap.Error(err)) + } + + sigs := make(chan os.Signal, 1) + signal.Notify(sigs, os.Interrupt, syscall.SIGTERM, syscall.SIGINT) + + errChan := make(chan error, 2) + agentServer.ListenAndServe(errChan) + + select { + case err := <-errChan: + logger.Error("component terminated unexpectedly", zap.Error(err)) + case <-sigs: + logger.Info("shutdown signal received") + } + agentServer.Stop() + logger.Info("shutdown all components, 
exiting") +} diff --git a/agent/internal/config/config.go b/agent/internal/config/config.go new file mode 100644 index 00000000..ef7bad0b --- /dev/null +++ b/agent/internal/config/config.go @@ -0,0 +1,29 @@ +package config + +import ( + "github.com/thinkparq/beegfs-go/agent/internal/server" + "github.com/thinkparq/beegfs-go/agent/pkg/agent" + "github.com/thinkparq/beegfs-go/common/configmgr" + "github.com/thinkparq/beegfs-go/common/logger" +) + +type AppConfig struct { + Log logger.Config `mapstructure:"log"` + Agent agent.Config `mapstructure:"agent"` + Server server.Config `mapstructure:"server"` + Developer struct { + DumpConfig bool `mapstructure:"dump-config"` + } +} + +func (c *AppConfig) NewEmptyInstance() configmgr.Configurable { + return new(AppConfig) +} + +func (c *AppConfig) UpdateAllowed(newConfig configmgr.Configurable) error { + return nil +} + +func (c *AppConfig) ValidateConfig() error { + return nil +} diff --git a/agent/internal/server/server.go b/agent/internal/server/server.go new file mode 100644 index 00000000..e421fa87 --- /dev/null +++ b/agent/internal/server/server.go @@ -0,0 +1,105 @@ +package server + +import ( + "context" + "fmt" + "net" + "path" + "reflect" + "sync" + + "github.com/thinkparq/beegfs-go/agent/pkg/agent" + "github.com/thinkparq/protobuf/go/beegfs" + "go.uber.org/zap" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials" +) + +type Config struct { + Address string `mapstructure:"address"` + TlsCertFile string `mapstructure:"tls-cert-file"` + TlsKeyFile string `mapstructure:"tls-key-file"` + TlsDisable bool `mapstructure:"tls-disable"` +} + +type AgentServer struct { + beegfs.UnimplementedAgentServer + log *zap.Logger + wg *sync.WaitGroup + Config + grpcServer *grpc.Server + reconciler agent.Reconciler +} + +func New(log *zap.Logger, config Config, reconciler agent.Reconciler) (*AgentServer, error) { + log = log.With(zap.String("component", path.Base(reflect.TypeOf(AgentServer{}).PkgPath()))) + + s := 
AgentServer{ + log: log, + Config: config, + wg: new(sync.WaitGroup), + reconciler: reconciler, + } + var grpcServerOpts []grpc.ServerOption + if !s.TlsDisable && s.TlsCertFile != "" && s.TlsKeyFile != "" { + creds, err := credentials.NewServerTLSFromFile(s.TlsCertFile, s.TlsKeyFile) + if err != nil { + return nil, err + } + grpcServerOpts = append(grpcServerOpts, grpc.Creds(creds)) + } else { + s.log.Warn("not using TLS because it was explicitly disabled or a certificate and/or key were not specified") + } + s.grpcServer = grpc.NewServer(grpcServerOpts...) + beegfs.RegisterAgentServer(s.grpcServer, &s) + return &s, nil +} + +func (s *AgentServer) ListenAndServe(errChan chan<- error) { + go func() { + s.log.Info("listening on local network address", zap.Any("address", s.Address)) + lis, err := net.Listen("tcp", s.Address) + if err != nil { + errChan <- fmt.Errorf("remote server: error listening on the specified address %s: %w", s.Address, err) + return + } + s.log.Info("serving gRPC requests") + err = s.grpcServer.Serve(lis) + if err != nil { + errChan <- fmt.Errorf("remote server: error serving gRPC requests: %w", err) + } + }() +} + +func (s *AgentServer) Stop() { + s.log.Info("attempting to stop gRPC server") + s.grpcServer.Stop() + s.wg.Wait() +} + +func (s *AgentServer) Apply(ctx context.Context, request *beegfs.AgentApplyRequest) (*beegfs.AgentResponse, error) { + s.wg.Add(1) + defer s.wg.Done() + result, err := s.reconciler.Apply(ctx, request.Config) + return &beegfs.AgentResponse{ + Status: result.Status, + }, err +} + +func (s *AgentServer) Destroy(ctx context.Context, request *beegfs.AgentDestroyRequest) (*beegfs.AgentResponse, error) { + s.wg.Add(1) + defer s.wg.Done() + result, err := s.reconciler.Destroy(ctx, request.Config) + return &beegfs.AgentResponse{ + Status: result.Status, + }, err +} + +func (s *AgentServer) Status(ctx context.Context, request *beegfs.AgentStatusRequest) (*beegfs.AgentResponse, error) { + s.wg.Add(1) + defer s.wg.Done() + 
result, err := s.reconciler.Status(ctx) + return &beegfs.AgentResponse{ + Status: result.Status, + }, err +} diff --git a/agent/pkg/agent/reconciler.go b/agent/pkg/agent/reconciler.go new file mode 100644 index 00000000..f1d35553 --- /dev/null +++ b/agent/pkg/agent/reconciler.go @@ -0,0 +1,79 @@ +package agent + +import ( + "context" + "path" + "reflect" + "sync" + "time" + + "github.com/thinkparq/protobuf/go/beegfs" + "go.uber.org/zap" + "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/types/known/timestamppb" +) + +type Config struct { +} + +type Reconciler interface { + Apply(ctx context.Context, host *beegfs.Host) (ReconcileResult, error) + Destroy(ctx context.Context, host *beegfs.Host) (ReconcileResult, error) + Status(ctx context.Context) (ReconcileResult, error) +} + +type ReconcileResult struct { + Status *beegfs.AgentStatus +} + +type defaultReconciler struct { + log *zap.Logger + mu sync.RWMutex + config Config + currentState beegfs.AgentStatus + historicalState map[time.Time]beegfs.AgentStatus +} + +// TODO (current): Wrap the zap.Logger with an intermediate handler that pushes a structured message +// into the currentState.Messages then also logs out the message. By default only info and above +// should be added to the messages. +// +// Then follow the standard that when we enter Apply/Destroy we push the currentState to +// historicalState and start a new currentState that is used to collect the events that happen +// during the current reconcilation loop. 
+ +func New(log *zap.Logger, config Config) Reconciler { + log = log.With(zap.String("component", path.Base(reflect.TypeOf(defaultReconciler{}).PkgPath()))) + return &defaultReconciler{ + log: log, + config: config, + currentState: beegfs.AgentStatus{ + State: beegfs.AgentStatus_IDLE, + Messages: []string{"[AGENT] Startup"}, + Updated: timestamppb.Now(), + }, + historicalState: make(map[time.Time]beegfs.AgentStatus), + } +} + +func (r *defaultReconciler) Apply(ctx context.Context, host *beegfs.Host) (ReconcileResult, error) { + r.mu.Lock() + defer r.mu.Unlock() + // TODO + return ReconcileResult{}, nil +} + +func (r *defaultReconciler) Destroy(ctx context.Context, host *beegfs.Host) (ReconcileResult, error) { + r.mu.Lock() + defer r.mu.Unlock() + // TODO + return ReconcileResult{}, nil +} + +func (r *defaultReconciler) Status(ctx context.Context) (ReconcileResult, error) { + r.mu.RLock() + defer r.mu.RUnlock() + return ReconcileResult{ + Status: proto.Clone(&r.currentState).(*beegfs.AgentStatus), + }, nil +} From 1634305310463b745dc4e222788a52adab0d97e9 Mon Sep 17 00:00:00 2001 From: Joe McCormick <31295332+iamjoemccormick@users.noreply.github.com> Date: Wed, 7 May 2025 21:14:15 +0000 Subject: [PATCH 02/13] wip: agent config/manifest handling --- agent/build/agent.toml | 14 ++ agent/cmd/beegfs-agent/main.go | 21 ++- agent/internal/config/config.go | 16 +- agent/internal/server/server.go | 70 +++++--- agent/pkg/agent/reconciler.go | 79 -------- agent/pkg/manifest/filesystem.go | 248 ++++++++++++++++++++++++++ agent/pkg/manifest/filesystem_test.go | 48 +++++ agent/pkg/manifest/manifest.yaml | 67 +++++++ agent/pkg/reconciler/errors.go | 9 + agent/pkg/reconciler/reconciler.go | 125 +++++++++++++ agent/pkg/reconciler/state.go | 102 +++++++++++ common/beegfs/nodetype.go | 48 ++++- common/beegfs/nodetype_test.go | 4 + go.mod | 4 +- go.sum | 2 - 15 files changed, 743 insertions(+), 114 deletions(-) create mode 100644 agent/build/agent.toml delete mode 100644 
agent/pkg/agent/reconciler.go create mode 100644 agent/pkg/manifest/filesystem.go create mode 100644 agent/pkg/manifest/filesystem_test.go create mode 100644 agent/pkg/manifest/manifest.yaml create mode 100644 agent/pkg/reconciler/errors.go create mode 100644 agent/pkg/reconciler/reconciler.go create mode 100644 agent/pkg/reconciler/state.go diff --git a/agent/build/agent.toml b/agent/build/agent.toml new file mode 100644 index 00000000..3c24a214 --- /dev/null +++ b/agent/build/agent.toml @@ -0,0 +1,14 @@ +agent-id = "agent1" + +[log] +level = 5 +type = "stdout" + +[server] +address = "0.0.0.0:9008" +tls-cert-file = "/etc/beegfs/cert.pem" +tls-key-file = "/etc/beegfs/key.pem" + +[reconciler] +manifest-path = "/etc/beegfs/manifest.yaml" +active-manifest-path = "/etc/beegfs/.active.manifest.yaml" diff --git a/agent/cmd/beegfs-agent/main.go b/agent/cmd/beegfs-agent/main.go index ccc2846f..37e0605d 100644 --- a/agent/cmd/beegfs-agent/main.go +++ b/agent/cmd/beegfs-agent/main.go @@ -1,6 +1,7 @@ package main import ( + "context" "fmt" "log" "os" @@ -10,7 +11,7 @@ import ( "github.com/spf13/pflag" "github.com/thinkparq/beegfs-go/agent/internal/config" "github.com/thinkparq/beegfs-go/agent/internal/server" - "github.com/thinkparq/beegfs-go/agent/pkg/agent" + "github.com/thinkparq/beegfs-go/agent/pkg/reconciler" "github.com/thinkparq/beegfs-go/common/configmgr" "github.com/thinkparq/beegfs-go/common/logger" "go.uber.org/zap" @@ -30,7 +31,8 @@ var ( func main() { pflag.Bool("version", false, "Print the version then exit.") - pflag.String("cfg-file", "", "The path to the a configuration file (can be omitted to set all configuration using flags and/or environment variables). When Remote Storage Targets are configured using a file, they can be updated without restarting the application.") + pflag.String("cfg-file", "/etc/beegfs/agent.toml", "The path to the a configuration file (can be omitted to set all configuration using flags and/or environment variables). 
When Remote Storage Targets are configured using a file, they can be updated without restarting the application.") + pflag.String("agent-id", "0", "A unique ID used to identify what nodes from the manifest this agent is responsible for. Should not change after initially starting the agent.") pflag.String("log.type", "stderr", "Where log messages should be sent ('stderr', 'stdout', 'syslog', 'logfile').") pflag.String("log.file", "/var/log/beegfs/beegfs-remote.log", "The path to the desired log file when logType is 'log.file' (if needed the directory and all parent directories will be created).") pflag.Int8("log.level", 3, "Adjust the logging level (0=Fatal, 1=Error, 2=Warn, 3=Info, 4+5=Debug).") @@ -41,6 +43,10 @@ func main() { pflag.String("server.tls-cert-file", "/etc/beegfs/cert.pem", "Path to a certificate file that provides the identify of this Agent's gRPC server.") pflag.String("server.tls-key-file", "/etc/beegfs/key.pem", "Path to the key file belonging to the certificate for this Agent's gRPC server.") pflag.Bool("server.tls-disable", false, "Disable TLS entirely for gRPC communication to this Agent's gRPC server.") + pflag.String("reconciler.manifest-path", "/etc/beegfs/manifest.yaml", "The path to the BeeGFS manifest this agent should apply. 
The manifest will be identical to the active manifest if applied successfully.") + pflag.String("reconciler.active-manifest-path", "/etc/beegfs/.active.manifest.yaml", "The past to the last BeeGFS manifest successfully applied by this agent.") + pflag.Bool("developer.dump-config", false, "Dump the full configuration and immediately exit.") + pflag.CommandLine.MarkHidden("developer.dump-config") pflag.CommandLine.SortFlags = false pflag.Usage = func() { fmt.Fprintf(os.Stderr, "Usage of %s:\n", os.Args[0]) @@ -85,23 +91,26 @@ Using environment variables: } defer logger.Sync() - agentServer, err := server.New(logger.Logger, initialCfg.Server, agent.New(logger.Logger, initialCfg.Agent)) + reconciler := reconciler.New(initialCfg.AgentID, logger.Logger, initialCfg.Reconciler) + cfgMgr.AddListener(reconciler) + agentServer, err := server.New(logger.Logger, initialCfg.Server, reconciler) if err != nil { logger.Fatal("unable to initialize gRPC server", zap.Error(err)) } - sigs := make(chan os.Signal, 1) - signal.Notify(sigs, os.Interrupt, syscall.SIGTERM, syscall.SIGINT) + ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM, syscall.SIGINT) errChan := make(chan error, 2) agentServer.ListenAndServe(errChan) + go cfgMgr.Manage(ctx, logger.Logger) select { case err := <-errChan: logger.Error("component terminated unexpectedly", zap.Error(err)) - case <-sigs: + case <-ctx.Done(): logger.Info("shutdown signal received") } + cancel() agentServer.Stop() logger.Info("shutdown all components, exiting") } diff --git a/agent/internal/config/config.go b/agent/internal/config/config.go index ef7bad0b..c912b513 100644 --- a/agent/internal/config/config.go +++ b/agent/internal/config/config.go @@ -2,16 +2,17 @@ package config import ( "github.com/thinkparq/beegfs-go/agent/internal/server" - "github.com/thinkparq/beegfs-go/agent/pkg/agent" + "github.com/thinkparq/beegfs-go/agent/pkg/reconciler" "github.com/thinkparq/beegfs-go/common/configmgr" 
"github.com/thinkparq/beegfs-go/common/logger" ) type AppConfig struct { - Log logger.Config `mapstructure:"log"` - Agent agent.Config `mapstructure:"agent"` - Server server.Config `mapstructure:"server"` - Developer struct { + AgentID string `mapstructure:"agent-id"` + Log logger.Config `mapstructure:"log"` + Reconciler reconciler.Config `mapstructure:"reconciler"` + Server server.Config `mapstructure:"server"` + Developer struct { DumpConfig bool `mapstructure:"dump-config"` } } @@ -27,3 +28,8 @@ func (c *AppConfig) UpdateAllowed(newConfig configmgr.Configurable) error { func (c *AppConfig) ValidateConfig() error { return nil } + +// GetReconcilerConfig returns only the part of an AppConfig expected by the reconciler. +func (c *AppConfig) GetReconcilerConfig() reconciler.Config { + return c.Reconciler +} diff --git a/agent/internal/server/server.go b/agent/internal/server/server.go index e421fa87..9f63c13c 100644 --- a/agent/internal/server/server.go +++ b/agent/internal/server/server.go @@ -2,17 +2,21 @@ package server import ( "context" + "errors" "fmt" "net" "path" "reflect" "sync" - "github.com/thinkparq/beegfs-go/agent/pkg/agent" + "github.com/thinkparq/beegfs-go/agent/pkg/manifest" + "github.com/thinkparq/beegfs-go/agent/pkg/reconciler" "github.com/thinkparq/protobuf/go/beegfs" "go.uber.org/zap" "google.golang.org/grpc" + "google.golang.org/grpc/codes" "google.golang.org/grpc/credentials" + "google.golang.org/grpc/status" ) type Config struct { @@ -23,15 +27,15 @@ type Config struct { } type AgentServer struct { - beegfs.UnimplementedAgentServer + beegfs.UnimplementedBeeAgentServer log *zap.Logger wg *sync.WaitGroup Config grpcServer *grpc.Server - reconciler agent.Reconciler + reconciler reconciler.Reconciler } -func New(log *zap.Logger, config Config, reconciler agent.Reconciler) (*AgentServer, error) { +func New(log *zap.Logger, config Config, reconciler reconciler.Reconciler) (*AgentServer, error) { log = log.With(zap.String("component", 
path.Base(reflect.TypeOf(AgentServer{}).PkgPath()))) s := AgentServer{ @@ -51,7 +55,7 @@ func New(log *zap.Logger, config Config, reconciler agent.Reconciler) (*AgentSer s.log.Warn("not using TLS because it was explicitly disabled or a certificate and/or key were not specified") } s.grpcServer = grpc.NewServer(grpcServerOpts...) - beegfs.RegisterAgentServer(s.grpcServer, &s) + beegfs.RegisterBeeAgentServer(s.grpcServer, &s) return &s, nil } @@ -77,29 +81,55 @@ func (s *AgentServer) Stop() { s.wg.Wait() } -func (s *AgentServer) Apply(ctx context.Context, request *beegfs.AgentApplyRequest) (*beegfs.AgentResponse, error) { +func (s *AgentServer) Update(ctx context.Context, request *beegfs.AgentUpdateRequest) (*beegfs.AgentUpdateResponse, error) { s.wg.Add(1) defer s.wg.Done() - result, err := s.reconciler.Apply(ctx, request.Config) - return &beegfs.AgentResponse{ - Status: result.Status, - }, err + if err := s.reconciler.UpdateConfiguration(manifest.FromProto(request.GetConfig())); err != nil { + return nil, grpcStatusFrom(err) + } + return &beegfs.AgentUpdateResponse{ + FsUuid: s.reconciler.GetFsUUID(), + AgentId: s.reconciler.GetAgentID(), + }, nil } -func (s *AgentServer) Destroy(ctx context.Context, request *beegfs.AgentDestroyRequest) (*beegfs.AgentResponse, error) { +func (s *AgentServer) Status(ctx context.Context, request *beegfs.AgentStatusRequest) (*beegfs.AgentStatusResponse, error) { s.wg.Add(1) defer s.wg.Done() - result, err := s.reconciler.Destroy(ctx, request.Config) - return &beegfs.AgentResponse{ - Status: result.Status, - }, err + if result, err := s.reconciler.Status(); err != nil { + return nil, grpcStatusFrom(err) + } else { + return &beegfs.AgentStatusResponse{ + Status: result.Status, + FsUuid: s.reconciler.GetFsUUID(), + AgentId: s.reconciler.GetAgentID(), + }, nil + } } -func (s *AgentServer) Status(ctx context.Context, request *beegfs.AgentStatusRequest) (*beegfs.AgentResponse, error) { +func (s *AgentServer) Cancel(ctx context.Context, 
request *beegfs.AgentCancelRequest) (*beegfs.AgentCancelResponse, error) { s.wg.Add(1) defer s.wg.Done() - result, err := s.reconciler.Status(ctx) - return &beegfs.AgentResponse{ - Status: result.Status, - }, err + if result, err := s.reconciler.Cancel(request.GetReason()); err != nil { + return nil, grpcStatusFrom(err) + } else { + return &beegfs.AgentCancelResponse{ + Status: result.Status, + FsUuid: s.reconciler.GetFsUUID(), + AgentId: s.reconciler.GetAgentID(), + }, nil + } +} + +func grpcStatusFrom(err error) error { + var grpcErr error + switch { + case errors.Is(err, reconciler.ErrSavingManifest): + grpcErr = status.Error(codes.FailedPrecondition, err.Error()) + case errors.Is(err, reconciler.ErrBadManifest): + grpcErr = status.Error(codes.InvalidArgument, err.Error()) + default: + grpcErr = status.Error(codes.Unknown, err.Error()) + } + return grpcErr } diff --git a/agent/pkg/agent/reconciler.go b/agent/pkg/agent/reconciler.go deleted file mode 100644 index f1d35553..00000000 --- a/agent/pkg/agent/reconciler.go +++ /dev/null @@ -1,79 +0,0 @@ -package agent - -import ( - "context" - "path" - "reflect" - "sync" - "time" - - "github.com/thinkparq/protobuf/go/beegfs" - "go.uber.org/zap" - "google.golang.org/protobuf/proto" - "google.golang.org/protobuf/types/known/timestamppb" -) - -type Config struct { -} - -type Reconciler interface { - Apply(ctx context.Context, host *beegfs.Host) (ReconcileResult, error) - Destroy(ctx context.Context, host *beegfs.Host) (ReconcileResult, error) - Status(ctx context.Context) (ReconcileResult, error) -} - -type ReconcileResult struct { - Status *beegfs.AgentStatus -} - -type defaultReconciler struct { - log *zap.Logger - mu sync.RWMutex - config Config - currentState beegfs.AgentStatus - historicalState map[time.Time]beegfs.AgentStatus -} - -// TODO (current): Wrap the zap.Logger with an intermediate handler that pushes a structured message -// into the currentState.Messages then also logs out the message. 
By default only info and above -// should be added to the messages. -// -// Then follow the standard that when we enter Apply/Destroy we push the currentState to -// historicalState and start a new currentState that is used to collect the events that happen -// during the current reconcilation loop. - -func New(log *zap.Logger, config Config) Reconciler { - log = log.With(zap.String("component", path.Base(reflect.TypeOf(defaultReconciler{}).PkgPath()))) - return &defaultReconciler{ - log: log, - config: config, - currentState: beegfs.AgentStatus{ - State: beegfs.AgentStatus_IDLE, - Messages: []string{"[AGENT] Startup"}, - Updated: timestamppb.Now(), - }, - historicalState: make(map[time.Time]beegfs.AgentStatus), - } -} - -func (r *defaultReconciler) Apply(ctx context.Context, host *beegfs.Host) (ReconcileResult, error) { - r.mu.Lock() - defer r.mu.Unlock() - // TODO - return ReconcileResult{}, nil -} - -func (r *defaultReconciler) Destroy(ctx context.Context, host *beegfs.Host) (ReconcileResult, error) { - r.mu.Lock() - defer r.mu.Unlock() - // TODO - return ReconcileResult{}, nil -} - -func (r *defaultReconciler) Status(ctx context.Context) (ReconcileResult, error) { - r.mu.RLock() - defer r.mu.RUnlock() - return ReconcileResult{ - Status: proto.Clone(&r.currentState).(*beegfs.AgentStatus), - }, nil -} diff --git a/agent/pkg/manifest/filesystem.go b/agent/pkg/manifest/filesystem.go new file mode 100644 index 00000000..c566d44d --- /dev/null +++ b/agent/pkg/manifest/filesystem.go @@ -0,0 +1,248 @@ +package manifest + +import ( + "fmt" + "os" + "strings" + + "github.com/thinkparq/beegfs-go/common/beegfs" + pb "github.com/thinkparq/protobuf/go/beegfs" + "gopkg.in/yaml.v3" +) + +func New() Filesystem { + return Filesystem{ + Agents: make(map[string]Agent), + Common: Common{}, + MetaConfig: make(map[string]string), + StorageConfig: make(map[string]string), + ClientConfig: make(map[string]string), + } +} + +type Filesystem struct { + Agents map[string]Agent 
`yaml:"agents"` + Common Common `yaml:"common"` + MetaConfig map[string]string `yaml:"meta_config"` + StorageConfig map[string]string `yaml:"storage_config"` + ClientConfig map[string]string `yaml:"client_config"` +} + +type Common struct { + Auth string `yaml:"auth"` +} + +type Agent struct { + Nodes []Node `yaml:"nodes"` +} + +type Node struct { + ID beegfs.NumId `yaml:"id"` + Type beegfs.NodeType `yaml:"type"` + Config map[string]string `yaml:"config"` + Interfaces []Nic `yaml:"interfaces"` + Targets []Target `yaml:"targets"` +} + +type Nic struct { + Name string `yaml:"name"` + Addr string `yaml:"address"` +} + +type Target struct { + ID beegfs.NumId `yaml:"id"` + RootDir string `yaml:"root_dir"` + ULFS *UnderlyingFS `yaml:"ulfs"` +} + +type UnderlyingFS struct { + Device string `yaml:"device"` + Type UnderlyingFSType `yaml:"type"` + FormatFlags string `yaml:"format_flags"` + MountFlags string `yaml:"mount_flags"` +} + +type UnderlyingFSType int + +const ( + UnknownUnderlyingFS UnderlyingFSType = iota + EXT4UnderlyingFS +) + +func (t UnderlyingFSType) String() string { + switch t { + case EXT4UnderlyingFS: + return "ext4" + default: + return "unknown" + } +} + +func (t *UnderlyingFSType) UnmarshalYAML(unmarshal func(interface{}) error) error { + var s string + if err := unmarshal(&s); err != nil { + return err + } + + switch strings.ToLower(s) { + case "ext4": + *t = EXT4UnderlyingFS + default: + return fmt.Errorf("invalid underlying fs type: %s", s) + } + return nil +} + +func (t UnderlyingFSType) MarshalYAML() (interface{}, error) { + switch t { + case EXT4UnderlyingFS: + return "ext4", nil + default: + return nil, fmt.Errorf("unknown fs type: %d", t) + } +} + +func fsTypeFromProto(fs pb.Target_UnderlyingFSOpts_FsType) UnderlyingFSType { + switch fs { + case pb.Target_UnderlyingFSOpts_EXT4: + return EXT4UnderlyingFS + default: + return UnknownUnderlyingFS + } +} + +func fsTypeToProto(fs UnderlyingFSType) pb.Target_UnderlyingFSOpts_FsType { + switch fs { + 
case EXT4UnderlyingFS: + return pb.Target_UnderlyingFSOpts_EXT4 + default: + return pb.Target_UnderlyingFSOpts_UNSPECIFIED + } +} + +func FromProto(protoFS *pb.Filesystem) Filesystem { + fs := New() + if protoFS == nil { + return fs + } + + fs.Common = Common{ + Auth: protoFS.GetCommon().GetAuth(), + } + fs.MetaConfig = protoFS.GetMetaConfig() + fs.StorageConfig = protoFS.GetStorageConfig() + fs.ClientConfig = protoFS.GetClientConfig() + + for id, a := range protoFS.GetAgent() { + agent := Agent{ + Nodes: make([]Node, 0), + } + for _, n := range a.GetNodes() { + node := Node{ + ID: beegfs.NumId(n.GetNumId()), + Type: beegfs.NodeTypeFromProto(n.NodeType), + Config: n.GetConfig(), + Interfaces: make([]Nic, 0), + Targets: make([]Target, 0), + } + + for _, i := range n.GetInterfaces() { + node.Interfaces = append(node.Interfaces, Nic{ + Name: i.Name, + Addr: i.Addr, + }) + } + + for _, t := range n.GetTargets() { + target := Target{ + ID: beegfs.NumId(t.GetNumId()), + RootDir: t.GetRootDir(), + } + if t.GetUlfs() != nil { + target.ULFS = &UnderlyingFS{ + Device: t.GetUlfs().GetDevice(), + Type: fsTypeFromProto(t.GetUlfs().GetType()), + FormatFlags: t.GetUlfs().GetFormatFlags(), + MountFlags: t.GetUlfs().GetMountFlags(), + } + + } + node.Targets = append(node.Targets, target) + } + agent.Nodes = append(agent.Nodes, node) + } + fs.Agents[id] = agent + } + return fs +} + +func ToProto(fs *Filesystem) *pb.Filesystem { + pbFS := &pb.Filesystem{ + Common: &pb.Filesystem_Common{ + Auth: fs.Common.Auth, + }, + MetaConfig: fs.MetaConfig, + StorageConfig: fs.StorageConfig, + ClientConfig: fs.ClientConfig, + Agent: make(map[string]*pb.Agent), + } + + for agentID, agent := range fs.Agents { + pbAgent := &pb.Agent{ + Nodes: make([]*pb.Node, 0, len(agent.Nodes)), + } + for _, node := range agent.Nodes { + pbNode := &pb.Node{ + NumId: uint32(node.ID), + NodeType: *node.Type.ToProto(), + Config: node.Config, + Interfaces: make([]*pb.Nic, 0, len(node.Interfaces)), + Targets: 
make([]*pb.Target, 0, len(node.Targets)), + } + for _, nic := range node.Interfaces { + pbNode.Interfaces = append(pbNode.Interfaces, &pb.Nic{ + Name: nic.Name, + Addr: nic.Addr, + }) + } + for _, tgt := range node.Targets { + pbTarget := &pb.Target{ + NumId: uint32(tgt.ID), + RootDir: tgt.RootDir, + } + if tgt.ULFS != nil { + pbTarget.Ulfs = &pb.Target_UnderlyingFSOpts{ + Device: tgt.ULFS.Device, + Type: fsTypeToProto(tgt.ULFS.Type), + FormatFlags: tgt.ULFS.FormatFlags, + MountFlags: tgt.ULFS.MountFlags, + } + } + pbNode.Targets = append(pbNode.Targets, pbTarget) + } + pbAgent.Nodes = append(pbAgent.Nodes, pbNode) + } + pbFS.Agent[agentID] = pbAgent + } + return pbFS +} + +func FromDisk(path string) (Filesystem, error) { + data, err := os.ReadFile(path) + if err != nil { + return Filesystem{}, err + } + var fs Filesystem + if err := yaml.Unmarshal(data, &fs); err != nil { + return fs, err + } + return fs, nil +} + +func ToDisk(fs Filesystem, path string) error { + data, err := yaml.Marshal(&fs) + if err != nil { + return err + } + return os.WriteFile(path, data, 0644) +} diff --git a/agent/pkg/manifest/filesystem_test.go b/agent/pkg/manifest/filesystem_test.go new file mode 100644 index 00000000..aa73060b --- /dev/null +++ b/agent/pkg/manifest/filesystem_test.go @@ -0,0 +1,48 @@ +package manifest + +import ( + "testing" + + "github.com/stretchr/testify/assert" + pb "github.com/thinkparq/protobuf/go/beegfs" +) + +func TestFromToProto_RoundTrip(t *testing.T) { + original := &pb.Filesystem{ + Common: &pb.Filesystem_Common{Auth: "secret"}, + MetaConfig: map[string]string{"key1": "val1"}, + StorageConfig: map[string]string{"key2": "val2"}, + ClientConfig: map[string]string{"key3": "val3"}, + Agent: map[string]*pb.Agent{ + "agent1": { + Nodes: []*pb.Node{ + { + NumId: 1, + NodeType: pb.NodeType_META, + Config: map[string]string{"nkey": "nval"}, + Interfaces: []*pb.Nic{ + {Name: "ib0", Addr: "10.0.0.1/16"}, + }, + Targets: []*pb.Target{ + { + NumId: 101, + RootDir: 
"/mnt", + Ulfs: &pb.Target_UnderlyingFSOpts{ + Device: "/dev/sda1", + Type: pb.Target_UnderlyingFSOpts_EXT4, + FormatFlags: "force", + MountFlags: "ro", + }, + }, + }, + }, + }, + }, + }, + } + + goStruct := FromProto(original) + roundTripped := ToProto(&goStruct) + + assert.Equal(t, original, roundTripped, "round-trip protobuf -> go -> protobuf did not match original") +} diff --git a/agent/pkg/manifest/manifest.yaml b/agent/pkg/manifest/manifest.yaml new file mode 100644 index 00000000..a2404eb0 --- /dev/null +++ b/agent/pkg/manifest/manifest.yaml @@ -0,0 +1,67 @@ +common: + auth: secret + +meta_config: + quotaEnableEnforcement: true + storeClientXAttrs: true + storeClientACLs: true + +storage_config: + quotaEnableEnforcement: true + +client_config: + quotaEnabled: true + +agents: + agent1: # agent-id + nodes: + - type: meta + id: 1 + interfaces: + - name: ib0 + address: 10.0.0.101/16 + targets: + - id: 101 + ulfs: + device: /dev/sda1 + type: ext4 + format_flags: foo + mount_flags: baz + agent2: # agent-id + nodes: + - type: storage + id: 1 + interfaces: + - name: ib0 + address: 10.0.0.102/16 + targets: + - id: 101 + ulfs: + device: /dev/sda1 + type: ext4 + format_flags: foo + mount_flags: baz + - id: 102 + ulfs: + device: /dev/sda1 + type: ext4 + format_flags: foo + mount_flags: baz + - type: storage + id: 2 + interfaces: + - name: ib1 + address: 10.0.0.102/16 + targets: + - id: 201 + ulfs: + device: /dev/sda2 + type: ext4 + format_flags: foo + mount_flags: baz + - id: 202 + ulfs: + device: /dev/sda2 + type: ext4 + format_flags: foo + mount_flags: baz diff --git a/agent/pkg/reconciler/errors.go b/agent/pkg/reconciler/errors.go new file mode 100644 index 00000000..f80e43c4 --- /dev/null +++ b/agent/pkg/reconciler/errors.go @@ -0,0 +1,9 @@ +package reconciler + +import "errors" + +var ( + ErrLoadingManifest = errors.New("unable to load manifest from disk") + ErrSavingManifest = errors.New("unable to save manifest to disk") + ErrBadManifest = errors.New("manifest 
failed verification") +) diff --git a/agent/pkg/reconciler/reconciler.go b/agent/pkg/reconciler/reconciler.go new file mode 100644 index 00000000..20a0fead --- /dev/null +++ b/agent/pkg/reconciler/reconciler.go @@ -0,0 +1,125 @@ +package reconciler + +import ( + "fmt" + "path" + "reflect" + "sync" + + "github.com/thinkparq/beegfs-go/agent/pkg/manifest" + "github.com/thinkparq/protobuf/go/beegfs" + "go.uber.org/zap" +) + +type Config struct { + ManifestPath string `mapstructure:"manifest-path"` + ActiveManifestPath string `mapstructure:"active-manifest-path"` +} + +type Configurer interface { + GetReconcilerConfig() Config +} + +type Reconciler interface { + GetAgentID() string + GetFsUUID() string + Status() (ReconcileResult, error) + Cancel(string) (ReconcileResult, error) + UpdateConfiguration(any) error +} + +type ReconcileResult struct { + Status *beegfs.AgentStatus +} + +type defaultReconciler struct { + agentID string + log *zap.Logger + mu sync.Mutex + fs manifest.Filesystem + state state + config Config +} + +func New(agentID string, log *zap.Logger, config Config) Reconciler { + log = log.With(zap.String("component", path.Base(reflect.TypeOf(defaultReconciler{}).PkgPath()))) + // Setting the initial config and file system manifest will be triggered later by ConfigMgr. + return &defaultReconciler{ + agentID: agentID, + log: log, + state: newAgentState(), + mu: sync.Mutex{}, + } +} + +func (r *defaultReconciler) GetAgentID() string { + return r.agentID +} + +func (r *defaultReconciler) GetFsUUID() string { + return "TODO" +} + +func (r *defaultReconciler) Status() (ReconcileResult, error) { + return ReconcileResult{ + Status: r.state.get(), + }, nil +} + +func (r *defaultReconciler) Cancel(reason string) (ReconcileResult, error) { + r.state.cancel(reason) + return r.Status() +} + +// UpdateConfiguration handles: +// +// - Local config updates from ConfigMgr where the new manifest is loaded from disk. 
+// - Remote config updates from the gRPC server where a new manifest is saved to disk. +// +// In both cases it will verify the new manifest and attempt to reconcile it if possible. +func (r *defaultReconciler) UpdateConfiguration(config any) error { + if configurer, ok := config.(Configurer); ok { + r.mu.Lock() + r.config = configurer.GetReconcilerConfig() + r.log.Info("loading file system manifest", zap.String("path", r.config.ManifestPath)) + newFS, err := manifest.FromDisk(r.config.ManifestPath) + r.mu.Unlock() + if err != nil { + return fmt.Errorf("%w: %w", ErrLoadingManifest, err) + } + return r.verify(newFS) + } else if newFS, ok := config.(manifest.Filesystem); ok { + r.mu.Lock() + r.log.Info("saving file system manifest", zap.String("path", r.config.ActiveManifestPath)) + err := manifest.ToDisk(newFS, r.config.ManifestPath) + r.mu.Unlock() + if err != nil { + return fmt.Errorf("%w: %w", ErrBadManifest, err) + } + return r.verify(newFS) + } + return fmt.Errorf("received unexpected reconciler configuration (most likely this indicates a bug and a report should be filed)") +} + +func (r *defaultReconciler) verify(newFS manifest.Filesystem) error { + // TODO: + // * Avoid necessary reconciliations by seeing if the manifest changed. + // * Validate we can migrate from currentFS to newFS. 
+ r.log.Info("verifying manifest") + go r.reconcile(newFS) + return nil +} + +func (r *defaultReconciler) reconcile(fs manifest.Filesystem) { + r.mu.Lock() + defer r.mu.Unlock() + r.log.Info("starting reconciliation") + r.log.Debug("reconciling", zap.Any("filesystem", fs)) + ctx := r.state.start() + ctx.Err() + // TODO: Reconcile + r.fs = fs + manifest.ToDisk(r.fs, r.config.ActiveManifestPath) + r.state.complete(beegfs.AgentStatus_SUCCESS) + r.log.Info("completed reconciliation") +} diff --git a/agent/pkg/reconciler/state.go b/agent/pkg/reconciler/state.go new file mode 100644 index 00000000..215555a5 --- /dev/null +++ b/agent/pkg/reconciler/state.go @@ -0,0 +1,102 @@ +package reconciler + +import ( + "context" + "fmt" + "sync" + "time" + + "github.com/thinkparq/protobuf/go/beegfs" + "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/types/known/timestamppb" +) + +type state struct { + current beegfs.AgentStatus + historical map[time.Time]*beegfs.AgentStatus + mu sync.Mutex + ctx context.Context + ctxCancel context.CancelFunc +} + +type op string + +const ( + unknown op = "UNKNOWN" + agent = "AGENT" + mount = "MOUNT" +) + +func newAgentState() state { + return state{ + current: beegfs.AgentStatus{ + State: beegfs.AgentStatus_IDLE, + Messages: []string{}, + Updated: timestamppb.Now(), + }, + historical: make(map[time.Time]*beegfs.AgentStatus), + mu: sync.Mutex{}, + } +} + +// start() marks the beginning of a reconciliation. It returns a context that will be cancelled if +// the reconciliation is cancelled early. 
+func (s *state) start() context.Context { + s.mu.Lock() + defer s.mu.Unlock() + s.historical[time.Now()] = proto.Clone(&s.current).(*beegfs.AgentStatus) + s.current = beegfs.AgentStatus{ + State: beegfs.AgentStatus_APPLYING, + Messages: []string{}, + Updated: timestamppb.Now(), + } + ctx, cancel := context.WithCancel(context.Background()) + s.ctx = ctx + s.ctxCancel = cancel + s.logUnlocked(agent, "began reconciliation") + return s.ctx +} + +func (s *state) get() *beegfs.AgentStatus { + s.mu.Lock() + defer s.mu.Unlock() + return proto.Clone(&s.current).(*beegfs.AgentStatus) +} + +func (s *state) logUnlocked(cat op, message string) { + s.current.Updated = timestamppb.Now() + s.current.Messages = append(s.current.Messages, fmt.Sprintf("%s [%s]: %s", s.current.Updated.String(), cat, message)) +} + +func (s *state) log(cat op, message string) { + s.mu.Lock() + defer s.mu.Unlock() + s.current.Updated = timestamppb.Now() + s.current.Messages = append(s.current.Messages, fmt.Sprintf("%s [%s]: %s", s.current.Updated.String(), cat, message)) +} + +func (s *state) fail(message string) *beegfs.AgentStatus { + s.mu.Lock() + defer s.mu.Unlock() + s.current.State = beegfs.AgentStatus_FAILED + s.logUnlocked(agent, "failed reconciliation") + s.ctxCancel() + return proto.Clone(&s.current).(*beegfs.AgentStatus) +} + +func (s *state) cancel(message string) *beegfs.AgentStatus { + s.mu.Lock() + defer s.mu.Unlock() + s.current.State = beegfs.AgentStatus_CANCELLED + s.logUnlocked(agent, "cancelled reconciliation") + s.ctxCancel() + return proto.Clone(&s.current).(*beegfs.AgentStatus) +} + +func (s *state) complete(finalState beegfs.AgentStatus_State) { + s.mu.Lock() + defer s.mu.Unlock() + s.current.State = finalState + s.logUnlocked(agent, "finished reconciliation") + s.ctxCancel() +} diff --git a/common/beegfs/nodetype.go b/common/beegfs/nodetype.go index fec834be..d141d64b 100644 --- a/common/beegfs/nodetype.go +++ b/common/beegfs/nodetype.go @@ -1,6 +1,7 @@ package beegfs import ( 
+ "fmt" "strings" pb "github.com/thinkparq/protobuf/go/beegfs" @@ -10,12 +11,15 @@ import ( // (which is technically correct, a meta target can only be on a meta server after all). type NodeType int +const InvalidNodeTypeString = "" const ( InvalidNodeType NodeType = iota Client Meta Storage Management + Remote + Sync ) // Create a NodeType from a string. Providing a non-ambiguous prefix is sufficient, e.g. for client, @@ -34,12 +38,19 @@ func NodeTypeFromString(input string) NodeType { return Management } + // To avoid ambiguity with storage, specifying sync requires at least 2 characters. + if len(input) >= 2 && (strings.HasPrefix("sync", input)) { + return Sync + } + if strings.HasPrefix("client", input) { return Client } else if strings.HasPrefix("storage", input) { return Storage } else if strings.HasPrefix("metadata", input) { return Meta + } else if strings.HasPrefix("remote", input) { + return Remote } return InvalidNodeType @@ -55,6 +66,10 @@ func NodeTypeFromProto(input pb.NodeType) NodeType { return Storage case pb.NodeType_MANAGEMENT: return Management + case pb.NodeType_REMOTE: + return Remote + case pb.NodeType_SYNC: + return Sync } return InvalidNodeType @@ -72,6 +87,10 @@ func (n NodeType) ToProto() *pb.NodeType { nt = pb.NodeType_STORAGE case Management: nt = pb.NodeType_MANAGEMENT + case Remote: + nt = pb.NodeType_REMOTE + case Sync: + nt = pb.NodeType_SYNC } return &nt @@ -88,7 +107,34 @@ func (n NodeType) String() string { return "storage" case Management: return "management" + case Remote: + return "remote" + case Sync: + return "sync" default: - return "" + return InvalidNodeTypeString + } +} + +func (n *NodeType) UnmarshalYAML(unmarshal func(any) error) error { + var s string + if err := unmarshal(&s); err != nil { + return err + } + + nodeType := NodeTypeFromString(s) + if nodeType == InvalidNodeType { + return fmt.Errorf("invalid node type: %q", s) + } + + *n = nodeType + return nil +} + +func (n NodeType) MarshalYAML() (any, error) { + str 
:= n.String() + if str == InvalidNodeTypeString { + return nil, fmt.Errorf("cannot marshal invalid NodeType: %d", n) } + return str, nil } diff --git a/common/beegfs/nodetype_test.go b/common/beegfs/nodetype_test.go index bfb897af..a6131ffa 100644 --- a/common/beegfs/nodetype_test.go +++ b/common/beegfs/nodetype_test.go @@ -15,6 +15,10 @@ func TestFromString(t *testing.T) { assert.Equal(t, Client, NodeTypeFromString("c")) assert.Equal(t, Management, NodeTypeFromString(" management ")) assert.Equal(t, Management, NodeTypeFromString("ma")) + assert.Equal(t, Remote, NodeTypeFromString(" remote ")) + assert.Equal(t, Remote, NodeTypeFromString("r")) + assert.Equal(t, Sync, NodeTypeFromString(" sync ")) + assert.Equal(t, Sync, NodeTypeFromString("sy")) assert.Equal(t, InvalidNodeType, NodeTypeFromString("")) assert.Equal(t, InvalidNodeType, NodeTypeFromString("abc")) diff --git a/go.mod b/go.mod index 0ddf0d93..7b30a3e8 100644 --- a/go.mod +++ b/go.mod @@ -2,6 +2,8 @@ module github.com/thinkparq/beegfs-go go 1.23.8 +replace github.com/thinkparq/protobuf => ../protobuf + require ( github.com/aws/aws-sdk-go-v2 v1.25.2 github.com/aws/aws-sdk-go-v2/config v1.27.6 @@ -28,6 +30,7 @@ require ( google.golang.org/grpc v1.71.1 google.golang.org/protobuf v1.36.6 gopkg.in/natefinch/lumberjack.v2 v2.2.1 + gopkg.in/yaml.v3 v3.0.1 ) require ( @@ -77,5 +80,4 @@ require ( google.golang.org/genproto/googleapis/rpc v0.0.0-20250407143221-ac9807e6c755 // indirect gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f // indirect gopkg.in/ini.v1 v1.67.0 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index a8648034..59d66ee2 100644 --- a/go.sum +++ b/go.sum @@ -169,8 +169,6 @@ github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsT github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= github.com/subosito/gotenv 
v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= -github.com/thinkparq/protobuf v0.8.1-0.20250602183119-1bcce2b457a2 h1:j5myww+83y4lGdENQzcrpR0uEAaKMiHbfQYAeazSJx4= -github.com/thinkparq/protobuf v0.8.1-0.20250602183119-1bcce2b457a2/go.mod h1:AaUUy9mWaja/EggLSfzbKydAe+We+440z/6FdmPz5yI= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= From ca0c90e459888efddeb76c43d5d9bfad3cf3b7dc Mon Sep 17 00:00:00 2001 From: Joe McCormick <31295332+iamjoemccormick@users.noreply.github.com> Date: Fri, 9 May 2025 00:37:28 +0000 Subject: [PATCH 03/13] wip: agent deployer --- agent/build/agent.toml | 1 + agent/cmd/beegfs-agent/main.go | 6 +- agent/pkg/deploy/deploy.go | 30 ++++ agent/pkg/deploy/mount.go | 29 ++++ agent/pkg/deploy/network.go | 41 +++++ agent/pkg/deploy/service.go | 30 ++++ agent/pkg/deploy/source.go | 43 +++++ agent/pkg/manifest/filesystem.go | 237 +++++++++++++++++++++++--- agent/pkg/manifest/filesystem_test.go | 174 ++++++++++++++++++- agent/pkg/manifest/manifest.yaml | 113 ++++++------ agent/pkg/reconciler/reconciler.go | 82 ++++++--- agent/pkg/reconciler/state.go | 13 +- 12 files changed, 694 insertions(+), 105 deletions(-) create mode 100644 agent/pkg/deploy/deploy.go create mode 100644 agent/pkg/deploy/mount.go create mode 100644 agent/pkg/deploy/network.go create mode 100644 agent/pkg/deploy/service.go create mode 100644 agent/pkg/deploy/source.go diff --git a/agent/build/agent.toml b/agent/build/agent.toml index 3c24a214..9271bc6d 100644 --- a/agent/build/agent.toml +++ b/agent/build/agent.toml @@ -10,5 +10,6 @@ tls-cert-file = "/etc/beegfs/cert.pem" tls-key-file = "/etc/beegfs/key.pem" [reconciler] +deployment-strategy = "default" manifest-path = "/etc/beegfs/manifest.yaml" active-manifest-path = "/etc/beegfs/.active.manifest.yaml" diff --git 
a/agent/cmd/beegfs-agent/main.go b/agent/cmd/beegfs-agent/main.go index 37e0605d..3732ec59 100644 --- a/agent/cmd/beegfs-agent/main.go +++ b/agent/cmd/beegfs-agent/main.go @@ -45,6 +45,7 @@ func main() { pflag.Bool("server.tls-disable", false, "Disable TLS entirely for gRPC communication to this Agent's gRPC server.") pflag.String("reconciler.manifest-path", "/etc/beegfs/manifest.yaml", "The path to the BeeGFS manifest this agent should apply. The manifest will be identical to the active manifest if applied successfully.") pflag.String("reconciler.active-manifest-path", "/etc/beegfs/.active.manifest.yaml", "The past to the last BeeGFS manifest successfully applied by this agent.") + pflag.String("reconciler.deployment-strategy", "default", "The deployment strategy used by the reconciler.") pflag.Bool("developer.dump-config", false, "Dump the full configuration and immediately exit.") pflag.CommandLine.MarkHidden("developer.dump-config") pflag.CommandLine.SortFlags = false @@ -91,7 +92,10 @@ Using environment variables: } defer logger.Sync() - reconciler := reconciler.New(initialCfg.AgentID, logger.Logger, initialCfg.Reconciler) + reconciler, err := reconciler.New(initialCfg.AgentID, logger.Logger, initialCfg.Reconciler) + if err != nil { + logger.Fatal("unable to initialize reconciler", zap.Error(err)) + } cfgMgr.AddListener(reconciler) agentServer, err := server.New(logger.Logger, initialCfg.Server, reconciler) if err != nil { diff --git a/agent/pkg/deploy/deploy.go b/agent/pkg/deploy/deploy.go new file mode 100644 index 00000000..3d21cfd7 --- /dev/null +++ b/agent/pkg/deploy/deploy.go @@ -0,0 +1,30 @@ +package deploy + +// Deployer is responsible for carrying out the steps needed to manage a BeeGFS "node" and handles +// starting/modifying/stopping various system resources. 
+type Deployer interface { + Sourcerer + Networker + Mounter + Servicer +} + +func NewDefaultStrategy() (Deployer, error) { + pm, err := DetectPackageManager() + if err != nil { + return nil, err + } + return &defaultStrategy{ + Systemd: Systemd{}, + Mount: Mount{}, + IP: IP{}, + Package: Package{PackageManager: pm}, + }, nil +} + +type defaultStrategy struct { + Package // implements Sourcerer + Mount // implements Mounter + IP // implements Networker + Systemd // implements Servicer +} diff --git a/agent/pkg/deploy/mount.go b/agent/pkg/deploy/mount.go new file mode 100644 index 00000000..14a33591 --- /dev/null +++ b/agent/pkg/deploy/mount.go @@ -0,0 +1,29 @@ +package deploy + +import ( + "context" + "errors" + + "github.com/thinkparq/beegfs-go/agent/pkg/manifest" +) + +type Mounter interface { + AddTargets(ctx context.Context, add []manifest.Target) error + ModifyTargets(ctx context.Context, old []manifest.Target, new []manifest.Target) error + DestroyTargets(ctx context.Context, remove []manifest.Target) error +} + +type Mount struct { +} + +func (m *Mount) AddTargets(ctx context.Context, add []manifest.Target) error { + return errors.New("not implemented") +} + +func (m *Mount) ModifyTargets(ctx context.Context, old []manifest.Target, new []manifest.Target) error { + return errors.New("not implemented") +} + +func (m *Mount) DestroyTargets(ctx context.Context, remove []manifest.Target) error { + return errors.New("not implemented") +} diff --git a/agent/pkg/deploy/network.go b/agent/pkg/deploy/network.go new file mode 100644 index 00000000..eca92841 --- /dev/null +++ b/agent/pkg/deploy/network.go @@ -0,0 +1,41 @@ +package deploy + +import ( + "context" + "errors" + "fmt" + "os/exec" + "strings" + + "github.com/thinkparq/beegfs-go/agent/pkg/manifest" +) + +type Networker interface { + AddInterfaces(ctx context.Context, add []manifest.Nic) error + ModifyInterfaces(ctx context.Context, old []manifest.Nic, new []manifest.Nic) error + DestroyInterfaces(ctx 
context.Context, remove []manifest.Nic) error +} + +type IP struct { +} + +func (i *IP) AddInterfaces(ctx context.Context, add []manifest.Nic) error { + for _, nic := range add { + output, err := exec.CommandContext(ctx, "ip", "addr", "show", "dev", nic.Name).Output() + if err != nil { + return fmt.Errorf("unable to query interface %s: %w", nic.Name, err) + } + if !strings.Contains(string(output), nic.Addr) { + return fmt.Errorf("interface %s does not have expected address %s", nic.Name, nic.Addr) + } + } + return nil +} + +func (i *IP) ModifyInterfaces(ctx context.Context, old []manifest.Nic, new []manifest.Nic) error { + return errors.New("not implemented") +} + +func (i *IP) DestroyInterfaces(ctx context.Context, remove []manifest.Nic) error { + return errors.New("not implemented") +} diff --git a/agent/pkg/deploy/service.go b/agent/pkg/deploy/service.go new file mode 100644 index 00000000..f3237e7a --- /dev/null +++ b/agent/pkg/deploy/service.go @@ -0,0 +1,30 @@ +package deploy + +import ( + "context" + "errors" + + "github.com/thinkparq/beegfs-go/agent/pkg/manifest" +) + +type Servicer interface { + Apply(ctx context.Context, add manifest.Node) error + Modify(ctx context.Context, old manifest.Node, new manifest.Node) error + Destroy(ctx context.Context, remove manifest.Node) error +} + +// Systemd provides a method to deploy BeeGFS nodes using systemd. 
+type Systemd struct { +} + +func (d *Systemd) Apply(ctx context.Context, add manifest.Node) error { + return errors.New("not implemented") +} + +func (d *Systemd) Modify(ctx context.Context, old manifest.Node, new manifest.Node) error { + return errors.New("not implemented") +} + +func (d *Systemd) Destroy(ctx context.Context, remove manifest.Node) error { + return errors.New("not implemented") +} diff --git a/agent/pkg/deploy/source.go b/agent/pkg/deploy/source.go new file mode 100644 index 00000000..49c18b8c --- /dev/null +++ b/agent/pkg/deploy/source.go @@ -0,0 +1,43 @@ +package deploy + +import ( + "context" + "errors" + "fmt" + + "github.com/thinkparq/beegfs-go/agent/pkg/manifest" +) + +type Sourcerer interface { + AddSource(ctx context.Context, add manifest.Source) error + UpdateSource(ctx context.Context, old manifest.Source, new manifest.Source) error + DeleteSource(ctx context.Context, remove manifest.Source) error +} + +// Package provides the ability to install BeeGFS using the package manager. 
+type Package struct { + PackageManager +} + +type PackageManager interface { + InstallRepo(ctx context.Context, repo string) error + RemoveRepo(ctx context.Context, repo string) error + InstallPackage(ctx context.Context, pkg string) error + RemovePackage(ctx context.Context, pkg string) error +} + +func DetectPackageManager() (PackageManager, error) { + return nil, fmt.Errorf("detecting package manager: unsupported or undetectable package manager") +} + +func (p *Package) AddSource(ctx context.Context, add manifest.Source) error { + return errors.New("not implemented") +} + +func (p *Package) UpdateSource(ctx context.Context, old manifest.Source, new manifest.Source) error { + return errors.New("not implemented") +} + +func (p *Package) DeleteSource(ctx context.Context, remove manifest.Source) error { + return errors.New("not implemented") +} diff --git a/agent/pkg/manifest/filesystem.go b/agent/pkg/manifest/filesystem.go index c566d44d..2f054a5f 100644 --- a/agent/pkg/manifest/filesystem.go +++ b/agent/pkg/manifest/filesystem.go @@ -1,3 +1,7 @@ +// Package manifest defines Go-native structs for defining a BeeGFS instance. This includes +// functions for converting to/from protobuf messages and loading/unloading from YAML files. +// Protobuf structs are not used directly (as is done in other BeeGFS Go projects) to provide a more +// user-friendly YAML manifest than what protobuf generated structs allow. 
package manifest import ( @@ -12,28 +16,158 @@ import ( func New() Filesystem { return Filesystem{ - Agents: make(map[string]Agent), - Common: Common{}, - MetaConfig: make(map[string]string), - StorageConfig: make(map[string]string), - ClientConfig: make(map[string]string), + Agents: make(map[string]Agent), + Common: Common{ + MetaConfig: make(map[string]string), + StorageConfig: make(map[string]string), + ClientConfig: make(map[string]string), + Source: Source{}, + }, } } type Filesystem struct { - Agents map[string]Agent `yaml:"agents"` - Common Common `yaml:"common"` - MetaConfig map[string]string `yaml:"meta_config"` - StorageConfig map[string]string `yaml:"storage_config"` - ClientConfig map[string]string `yaml:"client_config"` + Agents map[string]Agent `yaml:"agents"` + Common Common `yaml:"common"` +} + +type SourceType int + +const ( + UnknownSource SourceType = iota + LocalSource + PackageSource +) + +func (s SourceType) ToProto() pb.SourceType { + switch s { + case LocalSource: + return pb.SourceType_LOCAL + case PackageSource: + return pb.SourceType_PACKAGE + default: + return pb.SourceType_UNKNOWN + } +} + +func SourceTypeFromProto(st pb.SourceType) SourceType { + switch st { + case pb.SourceType_LOCAL: + return LocalSource + case pb.SourceType_PACKAGE: + return PackageSource + default: + return UnknownSource + } +} + +func (s *SourceType) UnmarshalYAML(unmarshal func(any) error) error { + var str string + if err := unmarshal(&str); err != nil { + return err + } + switch str { + case "local": + *s = LocalSource + case "package": + *s = PackageSource + default: + *s = UnknownSource + } + return nil +} + +func (s SourceType) MarshalYAML() (any, error) { + switch s { + case LocalSource: + return "local", nil + case PackageSource: + return "package", nil + default: + return "unknown", nil + } +} + +type Source struct { + Type SourceType `yaml:"type"` + Repo string `yaml:"repo"` + Management string `yaml:"management"` + Meta string `yaml:"meta"` + Storage 
// inheritMapDefaults copies every entry of defaults whose key is missing from target into target
// and returns the result. Keys already present in target keep their value. A nil target is
// replaced by a newly allocated map (sized for defaults); a non-nil target is modified in place.
func inheritMapDefaults(defaults, target map[string]string) map[string]string {
	if target == nil {
		target = make(map[string]string, len(defaults))
	}
	for key, value := range defaults {
		if _, exists := target[key]; !exists {
			target[key] = value
		}
	}
	return target
}
+ Interfaces []Nic `yaml:"interfaces"` } type Node struct { @@ -42,6 +176,12 @@ type Node struct { Config map[string]string `yaml:"config"` Interfaces []Nic `yaml:"interfaces"` Targets []Target `yaml:"targets"` + Source *NodeSource `yaml:"source,omitempty"` +} + +type NodeSource struct { + Type SourceType `yaml:"type"` + Ref string `yaml:"ref"` } type Nic struct { @@ -78,7 +218,7 @@ func (t UnderlyingFSType) String() string { } } -func (t *UnderlyingFSType) UnmarshalYAML(unmarshal func(interface{}) error) error { +func (t *UnderlyingFSType) UnmarshalYAML(unmarshal func(any) error) error { var s string if err := unmarshal(&s); err != nil { return err @@ -93,7 +233,7 @@ func (t *UnderlyingFSType) UnmarshalYAML(unmarshal func(interface{}) error) erro return nil } -func (t UnderlyingFSType) MarshalYAML() (interface{}, error) { +func (t UnderlyingFSType) MarshalYAML() (any, error) { switch t { case EXT4UnderlyingFS: return "ext4", nil @@ -126,16 +266,33 @@ func FromProto(protoFS *pb.Filesystem) Filesystem { return fs } + pSrc := protoFS.GetCommon().GetSource() fs.Common = Common{ - Auth: protoFS.GetCommon().GetAuth(), + Auth: protoFS.GetCommon().GetAuth(), + MetaConfig: protoFS.GetCommon().GetMetaConfig(), + StorageConfig: protoFS.GetCommon().GetStorageConfig(), + ClientConfig: protoFS.GetCommon().GetClientConfig(), + Source: Source{ + Type: SourceTypeFromProto(pSrc.Type), + Repo: pSrc.Repo, + Management: pSrc.Management, + Meta: pSrc.Meta, + Storage: pSrc.Storage, + Remote: pSrc.Remote, + Sync: pSrc.Sync, + }, } - fs.MetaConfig = protoFS.GetMetaConfig() - fs.StorageConfig = protoFS.GetStorageConfig() - fs.ClientConfig = protoFS.GetClientConfig() for id, a := range protoFS.GetAgent() { agent := Agent{ - Nodes: make([]Node, 0), + Nodes: make([]Node, 0), + Interfaces: make([]Nic, 0), + } + for _, i := range a.GetInterfaces() { + agent.Interfaces = append(agent.Interfaces, Nic{ + Name: i.Name, + Addr: i.Addr, + }) } for _, n := range a.GetNodes() { node := Node{ @@ -146,6 
+303,13 @@ func FromProto(protoFS *pb.Filesystem) Filesystem { Targets: make([]Target, 0), } + if n.Source != nil { + node.Source = &NodeSource{ + Type: SourceTypeFromProto(n.GetSource().GetType()), + Ref: n.GetSource().GetRef(), + } + } + for _, i := range n.GetInterfaces() { node.Interfaces = append(node.Interfaces, Nic{ Name: i.Name, @@ -179,17 +343,34 @@ func FromProto(protoFS *pb.Filesystem) Filesystem { func ToProto(fs *Filesystem) *pb.Filesystem { pbFS := &pb.Filesystem{ Common: &pb.Filesystem_Common{ - Auth: fs.Common.Auth, + Auth: fs.Common.Auth, + MetaConfig: fs.Common.MetaConfig, + StorageConfig: fs.Common.StorageConfig, + ClientConfig: fs.Common.ClientConfig, + Source: &pb.Filesystem_Common_Source{ + Type: fs.Common.Source.Type.ToProto(), + Repo: fs.Common.Source.Repo, + Management: fs.Common.Source.Management, + Meta: fs.Common.Source.Meta, + Storage: fs.Common.Source.Storage, + Remote: fs.Common.Source.Remote, + Sync: fs.Common.Source.Sync, + }, }, - MetaConfig: fs.MetaConfig, - StorageConfig: fs.StorageConfig, - ClientConfig: fs.ClientConfig, - Agent: make(map[string]*pb.Agent), + + Agent: make(map[string]*pb.Agent), } for agentID, agent := range fs.Agents { pbAgent := &pb.Agent{ - Nodes: make([]*pb.Node, 0, len(agent.Nodes)), + Nodes: make([]*pb.Node, 0, len(agent.Nodes)), + Interfaces: make([]*pb.Nic, 0, len(agent.Interfaces)), + } + for _, i := range agent.Interfaces { + pbAgent.Interfaces = append(pbAgent.Interfaces, &pb.Nic{ + Name: i.Name, + Addr: i.Addr, + }) } for _, node := range agent.Nodes { pbNode := &pb.Node{ @@ -199,6 +380,14 @@ func ToProto(fs *Filesystem) *pb.Filesystem { Interfaces: make([]*pb.Nic, 0, len(node.Interfaces)), Targets: make([]*pb.Target, 0, len(node.Targets)), } + + if node.Source != nil { + pbNode.Source = &pb.Node_Source{ + Type: node.Source.Type.ToProto(), + Ref: node.Source.Ref, + } + } + for _, nic := range node.Interfaces { pbNode.Interfaces = append(pbNode.Interfaces, &pb.Nic{ Name: nic.Name, diff --git 
a/agent/pkg/manifest/filesystem_test.go b/agent/pkg/manifest/filesystem_test.go index aa73060b..a4ca7fe6 100644 --- a/agent/pkg/manifest/filesystem_test.go +++ b/agent/pkg/manifest/filesystem_test.go @@ -4,17 +4,27 @@ import ( "testing" "github.com/stretchr/testify/assert" + "github.com/thinkparq/beegfs-go/common/beegfs" pb "github.com/thinkparq/protobuf/go/beegfs" ) func TestFromToProto_RoundTrip(t *testing.T) { original := &pb.Filesystem{ - Common: &pb.Filesystem_Common{Auth: "secret"}, - MetaConfig: map[string]string{"key1": "val1"}, - StorageConfig: map[string]string{"key2": "val2"}, - ClientConfig: map[string]string{"key3": "val3"}, + Common: &pb.Filesystem_Common{ + Auth: "secret", + MetaConfig: map[string]string{"key1": "val1"}, + StorageConfig: map[string]string{"key2": "val2"}, + ClientConfig: map[string]string{"key3": "val3"}, + Source: &pb.Filesystem_Common_Source{ + Type: pb.SourceType_PACKAGE, + }, + }, + Agent: map[string]*pb.Agent{ "agent1": { + Interfaces: []*pb.Nic{ + {Name: "eth0", Addr: "11.0.0.1/16"}, + }, Nodes: []*pb.Node{ { NumId: 1, @@ -23,6 +33,10 @@ func TestFromToProto_RoundTrip(t *testing.T) { Interfaces: []*pb.Nic{ {Name: "ib0", Addr: "10.0.0.1/16"}, }, + Source: &pb.Node_Source{ + Type: pb.SourceType_LOCAL, + Ref: "12345", + }, Targets: []*pb.Target{ { NumId: 101, @@ -46,3 +60,155 @@ func TestFromToProto_RoundTrip(t *testing.T) { assert.Equal(t, original, roundTripped, "round-trip protobuf -> go -> protobuf did not match original") } + +func TestInheritGlobalConfig(t *testing.T) { + tests := []struct { + name string + input Filesystem + expectedNIC string // Expected NIC name in node if inherited + expectedCfg map[string]string + expectedSrc NodeSource + }{ + { + name: "inherit source, NIC and meta config", + input: Filesystem{ + Common: Common{ + MetaConfig: map[string]string{ + "foo": "bar", + "baz": "global", + }, + Source: Source{ + Type: PackageSource, + Meta: "beegfs-meta=8.0.1", + }, + }, + Agents: map[string]Agent{ + "agent1": 
{ + Interfaces: []Nic{ + {Name: "ib0", Addr: "10.0.0.1/16"}, + }, + Nodes: []Node{ + { + Type: beegfs.Meta, + ID: 1, + Config: map[string]string{"baz": "node-specific"}, + }, + }, + }, + }, + }, + expectedNIC: "ib0", + expectedCfg: map[string]string{ + "foo": "bar", // inherited + "baz": "node-specific", // overridden + }, + expectedSrc: NodeSource{ + Type: PackageSource, + Ref: "beegfs-meta=8.0.1", + }, + }, + { + name: "no inheritance if NICs or source are present", + input: Filesystem{ + Common: Common{ + MetaConfig: map[string]string{ + "quota": "enabled", + }, + Source: Source{ + Type: PackageSource, + Meta: "beegfs-meta=8.0.1", + }, + }, + Agents: map[string]Agent{ + "agent1": { + Interfaces: []Nic{ + {Name: "ib0", Addr: "10.0.0.1/16"}, + }, + Nodes: []Node{ + { + Type: beegfs.Meta, + ID: 2, + Interfaces: []Nic{ + {Name: "eth0", Addr: "192.168.0.1/24"}, + }, + Config: map[string]string{"quota": "override"}, + Source: &NodeSource{ + Type: LocalSource, + Ref: "/home/tux/beegfs-meta", + }, + }, + }, + }, + }, + }, + expectedNIC: "eth0", + expectedCfg: map[string]string{ + "quota": "override", + }, + expectedSrc: NodeSource{ + Type: LocalSource, + Ref: "/home/tux/beegfs-meta", + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + fs := tt.input + fs.InheritGlobalConfig() + agent := fs.Agents["agent1"] + node := agent.Nodes[0] + assert.Equal(t, tt.expectedNIC, node.Interfaces[0].Name) + assert.Equal(t, tt.expectedCfg, node.Config) + }) + } +} + +func TestInheritMapDefaults(t *testing.T) { + tests := []struct { + name string + defaults map[string]string + target map[string]string + expected map[string]string + }{ + { + name: "adds missing keys", + defaults: map[string]string{ + "a": "1", + "b": "2", + }, + target: map[string]string{ + "a": "1-overridden", + }, + expected: map[string]string{ + "a": "1-overridden", // should NOT be overridden + "b": "2", // should be added + }, + }, + { + name: "target already has all keys", + defaults: 
map[string]string{"a": "1"}, + target: map[string]string{"a": "custom"}, + expected: map[string]string{"a": "custom"}, + }, + { + name: "empty defaults", + defaults: map[string]string{}, + target: map[string]string{"a": "existing"}, + expected: map[string]string{"a": "existing"}, + }, + { + name: "empty target", + defaults: map[string]string{"a": "1"}, + target: map[string]string{}, + expected: map[string]string{"a": "1"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := inheritMapDefaults(tt.defaults, tt.target) + assert.Equal(t, tt.expected, result) + }) + } +} diff --git a/agent/pkg/manifest/manifest.yaml b/agent/pkg/manifest/manifest.yaml index a2404eb0..b515fe8f 100644 --- a/agent/pkg/manifest/manifest.yaml +++ b/agent/pkg/manifest/manifest.yaml @@ -1,39 +1,38 @@ common: auth: secret + meta_config: + quotaEnableEnforcement: true + storeClientXAttrs: true + storeClientACLs: true + storage_config: + quotaEnableEnforcement: true + client_config: + quotaEnabled: true + source: + type: package + repo: https://www.beegfs.io/release/beegfs_8.0/ + meta: beegfs-meta=8.0.1 + storage: beegfs-storage=8.0.1 -meta_config: - quotaEnableEnforcement: true - storeClientXAttrs: true - storeClientACLs: true - -storage_config: - quotaEnableEnforcement: true - -client_config: - quotaEnabled: true +# source: +# type: container +# repo: ghcr.io/thinkparq +# meta: beegfs-meta:8.0.1 +# storage: beegfs-storage:8.0.1 agents: agent1: # agent-id + interfaces: + - name: enp0s1 + address: "192.168.64.5/24" nodes: - type: meta id: 1 + source: + type: local + ref: /home/joe/development/beegfs/meta/build/beegfs-meta interfaces: - - name: ib0 - address: 10.0.0.101/16 - targets: - - id: 101 - ulfs: - device: /dev/sda1 - type: ext4 - format_flags: foo - mount_flags: baz - agent2: # agent-id - nodes: - - type: storage - id: 1 - interfaces: - - name: ib0 - address: 10.0.0.102/16 + - name: enp0s1 # IP configuration handled globally targets: - id: 101 ulfs: @@ 
-41,27 +40,41 @@ agents: type: ext4 format_flags: foo mount_flags: baz - - id: 102 - ulfs: - device: /dev/sda1 - type: ext4 - format_flags: foo - mount_flags: baz - - type: storage - id: 2 - interfaces: - - name: ib1 - address: 10.0.0.102/16 - targets: - - id: 201 - ulfs: - device: /dev/sda2 - type: ext4 - format_flags: foo - mount_flags: baz - - id: 202 - ulfs: - device: /dev/sda2 - type: ext4 - format_flags: foo - mount_flags: baz + # agent2: # agent-id + # nodes: + # - type: storage + # id: 1 + # interfaces: + # - name: ib0 + # address: 10.0.0.102/16 + # targets: + # - id: 101 + # ulfs: + # device: /dev/sda1 + # type: ext4 + # format_flags: foo + # mount_flags: baz + # - id: 102 + # ulfs: + # device: /dev/sda1 + # type: ext4 + # format_flags: foo + # mount_flags: baz + # - type: storage + # id: 2 + # interfaces: + # - name: ib1 + # address: 10.0.0.102/16 + # targets: + # - id: 201 + # ulfs: + # device: /dev/sda2 + # type: ext4 + # format_flags: foo + # mount_flags: baz + # - id: 202 + # ulfs: + # device: /dev/sda2 + # type: ext4 + # format_flags: foo + # mount_flags: baz diff --git a/agent/pkg/reconciler/reconciler.go b/agent/pkg/reconciler/reconciler.go index 20a0fead..0d704eb1 100644 --- a/agent/pkg/reconciler/reconciler.go +++ b/agent/pkg/reconciler/reconciler.go @@ -6,20 +6,30 @@ import ( "reflect" "sync" + "github.com/thinkparq/beegfs-go/agent/pkg/deploy" "github.com/thinkparq/beegfs-go/agent/pkg/manifest" "github.com/thinkparq/protobuf/go/beegfs" "go.uber.org/zap" ) type Config struct { - ManifestPath string `mapstructure:"manifest-path"` - ActiveManifestPath string `mapstructure:"active-manifest-path"` + ManifestPath string `mapstructure:"manifest-path"` + ActiveManifestPath string `mapstructure:"active-manifest-path"` + DeploymentStrategy Strategy `mapstructure:"deployment-strategy"` } +type Strategy string + +const ( + DefaultStrategy Strategy = "default" +) + type Configurer interface { GetReconcilerConfig() Config } +// Reconciler is responsible for 
comparing the current/active Filesystem with the new desired +// Filesystem and decides what needs to be created, updated, or destroyed. type Reconciler interface { GetAgentID() string GetFsUUID() string @@ -33,23 +43,35 @@ type ReconcileResult struct { } type defaultReconciler struct { - agentID string - log *zap.Logger - mu sync.Mutex - fs manifest.Filesystem - state state - config Config + agentID string + log *zap.Logger + mu sync.Mutex + currentFS manifest.Filesystem + state state + config Config + strategy deploy.Deployer } -func New(agentID string, log *zap.Logger, config Config) Reconciler { +func New(agentID string, log *zap.Logger, config Config) (Reconciler, error) { log = log.With(zap.String("component", path.Base(reflect.TypeOf(defaultReconciler{}).PkgPath()))) + var deploymentStrategy deploy.Deployer + var err error + switch config.DeploymentStrategy { + case DefaultStrategy: + if deploymentStrategy, err = deploy.NewDefaultStrategy(); err != nil { + return nil, fmt.Errorf("unable to configure deployment strategy: %w", err) + } + default: + return nil, fmt.Errorf("unknown deployment strategy: %v", config.DeploymentStrategy) + } // Setting the initial config and file system manifest will be triggered later by ConfigMgr. return &defaultReconciler{ - agentID: agentID, - log: log, - state: newAgentState(), - mu: sync.Mutex{}, - } + agentID: agentID, + log: log, + state: newAgentState(log), + mu: sync.Mutex{}, + strategy: deploymentStrategy, + }, nil } func (r *defaultReconciler) GetAgentID() string { @@ -101,25 +123,39 @@ func (r *defaultReconciler) UpdateConfiguration(config any) error { return fmt.Errorf("received unexpected reconciler configuration (most likely this indicates a bug and a report should be filed)") } +// Verify performs any checks that can be done without actually reconciling the manifest. This +// allows a response to be returned quickly while the reconciliation happens in the background. 
func (r *defaultReconciler) verify(newFS manifest.Filesystem) error { + r.log.Info("verifying manifest") + newFS.InheritGlobalConfig() // TODO: // * Avoid necessary reconciliations by seeing if the manifest changed. // * Validate we can migrate from currentFS to newFS. - r.log.Info("verifying manifest") go r.reconcile(newFS) return nil } -func (r *defaultReconciler) reconcile(fs manifest.Filesystem) { +// Reconcile attempts to move the local state from the currentFS to the newFS. +func (r *defaultReconciler) reconcile(newFS manifest.Filesystem) { r.mu.Lock() defer r.mu.Unlock() - r.log.Info("starting reconciliation") - r.log.Debug("reconciling", zap.Any("filesystem", fs)) + r.log.Debug("reconciling", zap.Any("filesystem", newFS)) ctx := r.state.start() - ctx.Err() - // TODO: Reconcile - r.fs = fs - manifest.ToDisk(r.fs, r.config.ActiveManifestPath) + + agent, ok := newFS.Agents[r.agentID] + if !ok { + r.state.cancel("no configuration for this agent found in the provided manifest") + return + } + + if err := r.strategy.AddInterfaces(ctx, agent.Interfaces); err != nil { + r.state.fail(err.Error()) + return + } + + // TODO + + r.currentFS = newFS + manifest.ToDisk(r.currentFS, r.config.ActiveManifestPath) r.state.complete(beegfs.AgentStatus_SUCCESS) - r.log.Info("completed reconciliation") } diff --git a/agent/pkg/reconciler/state.go b/agent/pkg/reconciler/state.go index 215555a5..3ae5b10a 100644 --- a/agent/pkg/reconciler/state.go +++ b/agent/pkg/reconciler/state.go @@ -7,11 +7,13 @@ import ( "time" "github.com/thinkparq/protobuf/go/beegfs" + "go.uber.org/zap" "google.golang.org/protobuf/proto" "google.golang.org/protobuf/types/known/timestamppb" ) type state struct { + logger *zap.Logger current beegfs.AgentStatus historical map[time.Time]*beegfs.AgentStatus mu sync.Mutex @@ -27,7 +29,7 @@ const ( mount = "MOUNT" ) -func newAgentState() state { +func newAgentState(l *zap.Logger) state { return state{ current: beegfs.AgentStatus{ State: beegfs.AgentStatus_IDLE, @@ 
-36,6 +38,7 @@ func newAgentState() state { }, historical: make(map[time.Time]*beegfs.AgentStatus), mu: sync.Mutex{}, + logger: l, } } @@ -44,6 +47,7 @@ func newAgentState() state { func (s *state) start() context.Context { s.mu.Lock() defer s.mu.Unlock() + s.logger.Info("state update", zap.String("oldState", s.current.State.String()), zap.String("newState", beegfs.AgentStatus_APPLYING.String())) s.historical[time.Now()] = proto.Clone(&s.current).(*beegfs.AgentStatus) s.current = beegfs.AgentStatus{ State: beegfs.AgentStatus_APPLYING, @@ -78,8 +82,9 @@ func (s *state) log(cat op, message string) { func (s *state) fail(message string) *beegfs.AgentStatus { s.mu.Lock() defer s.mu.Unlock() + s.logger.Info("state update", zap.String("oldState", s.current.State.String()), zap.String("newState", beegfs.AgentStatus_FAILED.String())) s.current.State = beegfs.AgentStatus_FAILED - s.logUnlocked(agent, "failed reconciliation") + s.logUnlocked(agent, message) s.ctxCancel() return proto.Clone(&s.current).(*beegfs.AgentStatus) } @@ -87,8 +92,9 @@ func (s *state) fail(message string) *beegfs.AgentStatus { func (s *state) cancel(message string) *beegfs.AgentStatus { s.mu.Lock() defer s.mu.Unlock() + s.logger.Info("state update", zap.String("oldState", s.current.State.String()), zap.String("newState", beegfs.AgentStatus_CANCELLED.String())) s.current.State = beegfs.AgentStatus_CANCELLED - s.logUnlocked(agent, "cancelled reconciliation") + s.logUnlocked(agent, message) s.ctxCancel() return proto.Clone(&s.current).(*beegfs.AgentStatus) } @@ -96,6 +102,7 @@ func (s *state) cancel(message string) *beegfs.AgentStatus { func (s *state) complete(finalState beegfs.AgentStatus_State) { s.mu.Lock() defer s.mu.Unlock() + s.logger.Info("state update", zap.String("oldState", s.current.State.String()), zap.String("newState", finalState.String())) s.current.State = finalState s.logUnlocked(agent, "finished reconciliation") s.ctxCancel() From 82ac444230c2e955a4fbe93961af2ba183d432bd Mon Sep 17 
00:00:00 2001 From: Joe McCormick <31295332+iamjoemccormick@users.noreply.github.com> Date: Mon, 12 May 2025 23:09:45 +0000 Subject: [PATCH 04/13] wip: support multiple fs, simplify deployer+reconciler state --- agent/cmd/beegfs-agent/main.go | 8 +- agent/internal/server/server.go | 14 ++- agent/pkg/deploy/deploy.go | 26 ++++- agent/pkg/deploy/mount.go | 21 ++-- agent/pkg/deploy/network.go | 16 ++- agent/pkg/deploy/service.go | 28 +++-- agent/pkg/deploy/source.go | 74 +++++++++--- agent/pkg/manifest/filesystem.go | 49 ++++++-- agent/pkg/manifest/filesystem_test.go | 13 ++- agent/pkg/manifest/manifest.yaml | 159 +++++++++++++------------- agent/pkg/reconciler/reconciler.go | 113 +++++++++++++----- agent/pkg/reconciler/state.go | 71 ++++++------ go.mod | 2 + go.sum | 4 + 14 files changed, 389 insertions(+), 209 deletions(-) diff --git a/agent/cmd/beegfs-agent/main.go b/agent/cmd/beegfs-agent/main.go index 3732ec59..3810ee99 100644 --- a/agent/cmd/beegfs-agent/main.go +++ b/agent/cmd/beegfs-agent/main.go @@ -91,8 +91,9 @@ Using environment variables: log.Fatalf("unable to initialize logger: %s", err) } defer logger.Sync() + ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM, syscall.SIGINT) - reconciler, err := reconciler.New(initialCfg.AgentID, logger.Logger, initialCfg.Reconciler) + reconciler, err := reconciler.New(ctx, initialCfg.AgentID, logger.Logger, initialCfg.Reconciler) if err != nil { logger.Fatal("unable to initialize reconciler", zap.Error(err)) } @@ -102,8 +103,6 @@ Using environment variables: logger.Fatal("unable to initialize gRPC server", zap.Error(err)) } - ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM, syscall.SIGINT) - errChan := make(chan error, 2) agentServer.ListenAndServe(errChan) go cfgMgr.Manage(ctx, logger.Logger) @@ -116,5 +115,8 @@ Using environment variables: } cancel() agentServer.Stop() + if err := reconciler.Stop(); err != nil { + logger.Error("error 
stopping reconciler", zap.Error(err)) + } logger.Info("shutdown all components, exiting") } diff --git a/agent/internal/server/server.go b/agent/internal/server/server.go index 9f63c13c..3c53e282 100644 --- a/agent/internal/server/server.go +++ b/agent/internal/server/server.go @@ -84,11 +84,19 @@ func (s *AgentServer) Stop() { func (s *AgentServer) Update(ctx context.Context, request *beegfs.AgentUpdateRequest) (*beegfs.AgentUpdateResponse, error) { s.wg.Add(1) defer s.wg.Done() - if err := s.reconciler.UpdateConfiguration(manifest.FromProto(request.GetConfig())); err != nil { + + filesystems := make(map[string]manifest.Filesystem, len(request.GetConfig())) + for fsUUID, protoFS := range request.GetConfig() { + if protoFS == nil { + return nil, status.Error(codes.InvalidArgument, "file system configuration was unexpectedly nil for fsUUID "+fsUUID) + } + filesystems[fsUUID] = manifest.FromProto(protoFS) + } + + if err := s.reconciler.UpdateConfiguration(filesystems); err != nil { return nil, grpcStatusFrom(err) } return &beegfs.AgentUpdateResponse{ - FsUuid: s.reconciler.GetFsUUID(), AgentId: s.reconciler.GetAgentID(), }, nil } @@ -101,7 +109,6 @@ func (s *AgentServer) Status(ctx context.Context, request *beegfs.AgentStatusReq } else { return &beegfs.AgentStatusResponse{ Status: result.Status, - FsUuid: s.reconciler.GetFsUUID(), AgentId: s.reconciler.GetAgentID(), }, nil } @@ -115,7 +122,6 @@ func (s *AgentServer) Cancel(ctx context.Context, request *beegfs.AgentCancelReq } else { return &beegfs.AgentCancelResponse{ Status: result.Status, - FsUuid: s.reconciler.GetFsUUID(), AgentId: s.reconciler.GetAgentID(), }, nil } diff --git a/agent/pkg/deploy/deploy.go b/agent/pkg/deploy/deploy.go index 3d21cfd7..2c2d0fb1 100644 --- a/agent/pkg/deploy/deploy.go +++ b/agent/pkg/deploy/deploy.go @@ -1,5 +1,7 @@ package deploy +import "context" + // Deployer is responsible for carrying out the steps needed to manage a BeeGFS "node" and handles // starting/modifying/stopping 
various system resources. type Deployer interface { @@ -7,24 +9,36 @@ type Deployer interface { Networker Mounter Servicer + // Cleanup should be called once the deployer is no longer needed to clean up any long-lived + // resources created by setting up a particular deployment strategy. + Cleanup() error } -func NewDefaultStrategy() (Deployer, error) { - pm, err := DetectPackageManager() +func NewDefaultStrategy(ctx context.Context) (Deployer, error) { + packageManager, err := DetectPackageManager() + if err != nil { + return nil, err + } + + systemd, err := NewSystemd(ctx) if err != nil { return nil, err } return &defaultStrategy{ - Systemd: Systemd{}, - Mount: Mount{}, + Package: packageManager, IP: IP{}, - Package: Package{PackageManager: pm}, + Mount: Mount{}, + Systemd: systemd, }, nil } type defaultStrategy struct { Package // implements Sourcerer - Mount // implements Mounter IP // implements Networker + Mount // implements Mounter Systemd // implements Servicer } + +func (s *defaultStrategy) Cleanup() error { + return s.Systemd.Cleanup() +} diff --git a/agent/pkg/deploy/mount.go b/agent/pkg/deploy/mount.go index 14a33591..e6997aaa 100644 --- a/agent/pkg/deploy/mount.go +++ b/agent/pkg/deploy/mount.go @@ -3,25 +3,30 @@ package deploy import ( "context" "errors" + "fmt" + "os" "github.com/thinkparq/beegfs-go/agent/pkg/manifest" ) type Mounter interface { - AddTargets(ctx context.Context, add []manifest.Target) error - ModifyTargets(ctx context.Context, old []manifest.Target, new []manifest.Target) error + ApplyTargets(ctx context.Context, add []manifest.Target) error DestroyTargets(ctx context.Context, remove []manifest.Target) error } type Mount struct { } -func (m *Mount) AddTargets(ctx context.Context, add []manifest.Target) error { - return errors.New("not implemented") -} - -func (m *Mount) ModifyTargets(ctx context.Context, old []manifest.Target, new []manifest.Target) error { - return errors.New("not implemented") +func (m *Mount) ApplyTargets(ctx
context.Context, add []manifest.Target) error { + for _, target := range add { + if target.ULFS != nil { + return fmt.Errorf("unable to apply target %d: formatting and/or mounting an underlying file system is not implemented yet", target.ID) + } + if err := os.MkdirAll(target.GetPath(), 0700); err != nil { + return fmt.Errorf("unable to apply target %d: unable to create root directory %s: %w", target.ID, target.RootDir, err) + } + } + return nil } func (m *Mount) DestroyTargets(ctx context.Context, remove []manifest.Target) error { diff --git a/agent/pkg/deploy/network.go b/agent/pkg/deploy/network.go index eca92841..9a1fc677 100644 --- a/agent/pkg/deploy/network.go +++ b/agent/pkg/deploy/network.go @@ -11,31 +11,29 @@ import ( ) type Networker interface { - AddInterfaces(ctx context.Context, add []manifest.Nic) error - ModifyInterfaces(ctx context.Context, old []manifest.Nic, new []manifest.Nic) error + ApplyInterfaces(ctx context.Context, add []manifest.Nic) error DestroyInterfaces(ctx context.Context, remove []manifest.Nic) error } type IP struct { } -func (i *IP) AddInterfaces(ctx context.Context, add []manifest.Nic) error { +func (i *IP) ApplyInterfaces(ctx context.Context, add []manifest.Nic) error { for _, nic := range add { + if nic.Addr == "" { + continue // no-op + } output, err := exec.CommandContext(ctx, "ip", "addr", "show", "dev", nic.Name).Output() if err != nil { - return fmt.Errorf("unable to query interface %s: %w", nic.Name, err) + return fmt.Errorf("unable to verify IP %s is configured for interface %s: %w", nic.Addr, nic.Name, err) } if !strings.Contains(string(output), nic.Addr) { - return fmt.Errorf("interface %s does not have expected address %s", nic.Name, nic.Addr) + return fmt.Errorf("unable to apply IP %s to interface %s: configuring IPs is not supported yet", nic.Addr, nic.Name) } } return nil } -func (i *IP) ModifyInterfaces(ctx context.Context, old []manifest.Nic, new []manifest.Nic) error { - return errors.New("not implemented") -} - 
func (i *IP) DestroyInterfaces(ctx context.Context, remove []manifest.Nic) error { return errors.New("not implemented") } diff --git a/agent/pkg/deploy/service.go b/agent/pkg/deploy/service.go index f3237e7a..5801cd95 100644 --- a/agent/pkg/deploy/service.go +++ b/agent/pkg/deploy/service.go @@ -3,28 +3,42 @@ package deploy import ( "context" "errors" + "fmt" + "github.com/coreos/go-systemd/v22/dbus" "github.com/thinkparq/beegfs-go/agent/pkg/manifest" ) type Servicer interface { - Apply(ctx context.Context, add manifest.Node) error - Modify(ctx context.Context, old manifest.Node, new manifest.Node) error - Destroy(ctx context.Context, remove manifest.Node) error + ApplyService(ctx context.Context, add manifest.Node) error + DestroyService(ctx context.Context, remove manifest.Node) error +} + +func NewSystemd(ctx context.Context) (Systemd, error) { + conn, err := dbus.NewSystemConnectionContext(ctx) + if err != nil { + return Systemd{}, fmt.Errorf("unable to connect to the system bus: %w", err) + } + return Systemd{ + conn: conn, + }, nil + } // Systemd provides a method to deploy BeeGFS nodes using systemd. 
type Systemd struct { + conn *dbus.Conn } -func (d *Systemd) Apply(ctx context.Context, add manifest.Node) error { - return errors.New("not implemented") +func (d *Systemd) Cleanup() error { + d.conn.Close() + return nil } -func (d *Systemd) Modify(ctx context.Context, old manifest.Node, new manifest.Node) error { +func (d *Systemd) ApplyService(ctx context.Context, add manifest.Node) error { return errors.New("not implemented") } -func (d *Systemd) Destroy(ctx context.Context, remove manifest.Node) error { +func (d *Systemd) DestroyService(ctx context.Context, remove manifest.Node) error { return errors.New("not implemented") } diff --git a/agent/pkg/deploy/source.go b/agent/pkg/deploy/source.go index 49c18b8c..89c93a68 100644 --- a/agent/pkg/deploy/source.go +++ b/agent/pkg/deploy/source.go @@ -4,40 +4,86 @@ import ( "context" "errors" "fmt" + "os/exec" "github.com/thinkparq/beegfs-go/agent/pkg/manifest" ) type Sourcerer interface { - AddSource(ctx context.Context, add manifest.Source) error - UpdateSource(ctx context.Context, old manifest.Source, new manifest.Source) error + ApplySource(ctx context.Context, add manifest.Source) error DeleteSource(ctx context.Context, remove manifest.Source) error + ApplySourceInstall(ctx context.Context, source manifest.NodeSource) error + DeleteSourceInstall(ctx context.Context, source manifest.NodeSource) error } -// Package provides the ability to install BeeGFS using the package manager. +func DetectPackageManager() (Package, error) { + isExecutableInPath := func(name string) bool { + _, err := exec.LookPath(name) + return err == nil + } + switch { + case isExecutableInPath("apt"): + return Package{ + manager: &AptPackage{}, + }, nil + } + return Package{}, fmt.Errorf("detecting package manager: unsupported or undetectable package manager") +} + +// Package provides the ability to install BeeGFS using the package manager. 
It implements any +// general functionality and defers to the actual manager based on the specific distribution. type Package struct { - PackageManager + manager Sourcerer + // isLocal is set if the manifest specifies the source type is local. This indicates all package + // manager operations should be a no-op for this FS in the manifest. This allows the manifest to + // fully control the installation source independent of the deployment strategy for each agent. + isLocal bool +} + +func (p *Package) ApplySource(ctx context.Context, add manifest.Source) error { + if add.Type == manifest.LocalSource { + p.isLocal = true + return nil + } + return p.manager.ApplySource(ctx, add) +} + +func (p *Package) DeleteSource(ctx context.Context, remove manifest.Source) error { + if remove.Type == manifest.LocalSource { + p.isLocal = false + return nil + } + return p.manager.DeleteSource(ctx, remove) } -type PackageManager interface { - InstallRepo(ctx context.Context, repo string) error - RemoveRepo(ctx context.Context, repo string) error - InstallPackage(ctx context.Context, pkg string) error - RemovePackage(ctx context.Context, pkg string) error +func (p *Package) ApplySourceInstall(ctx context.Context, source manifest.NodeSource) error { + if p.isLocal || source.Type == manifest.LocalSource { + return nil + } + return p.manager.ApplySourceInstall(ctx, source) } -func DetectPackageManager() (PackageManager, error) { - return nil, fmt.Errorf("detecting package manager: unsupported or undetectable package manager") +func (p *Package) DeleteSourceInstall(ctx context.Context, source manifest.NodeSource) error { + if p.isLocal || source.Type == manifest.LocalSource { + return nil + } + return p.manager.DeleteSourceInstall(ctx, source) } -func (p *Package) AddSource(ctx context.Context, add manifest.Source) error { +type AptPackage struct{} + +func (p *AptPackage) ApplySource(ctx context.Context, add manifest.Source) error { return errors.New("not implemented") } -func (p 
*Package) UpdateSource(ctx context.Context, old manifest.Source, new manifest.Source) error { +func (p *AptPackage) DeleteSource(ctx context.Context, remove manifest.Source) error { return errors.New("not implemented") } -func (p *Package) DeleteSource(ctx context.Context, remove manifest.Source) error { +func (p *AptPackage) ApplySourceInstall(ctx context.Context, source manifest.NodeSource) error { + return errors.New("not implemented") +} + +func (p *AptPackage) DeleteSourceInstall(ctx context.Context, source manifest.NodeSource) error { return errors.New("not implemented") } diff --git a/agent/pkg/manifest/filesystem.go b/agent/pkg/manifest/filesystem.go index 2f054a5f..f08fbaf4 100644 --- a/agent/pkg/manifest/filesystem.go +++ b/agent/pkg/manifest/filesystem.go @@ -7,6 +7,7 @@ package manifest import ( "fmt" "os" + "path" "strings" "github.com/thinkparq/beegfs-go/common/beegfs" @@ -26,6 +27,9 @@ func New() Filesystem { } } +// Filesystems is a map of FsUUIDs to file systems. +type Filesystems map[string]Filesystem + type Filesystem struct { Agents map[string]Agent `yaml:"agents"` Common Common `yaml:"common"` @@ -115,10 +119,11 @@ func (s Source) refForNodeType(t beegfs.NodeType) string { } } -func (f *Filesystem) InheritGlobalConfig() { +func (f *Filesystem) InheritGlobalConfig(fsUUID string) { for agentID, agent := range f.Agents { for i := range agent.Nodes { node := &agent.Nodes[i] + node.fsUUID = fsUUID // Inherit global interface configuration if there are no node specific interfaces. 
if len(node.Interfaces) == 0 { node.Interfaces = agent.Interfaces @@ -139,6 +144,11 @@ func (f *Filesystem) InheritGlobalConfig() { Ref: f.Common.Source.refForNodeType(node.Type), } } + // Inherit target configuration from the FS and node: + for t := range node.Targets { + agent.Nodes[i].Targets[t].fsUUID = fsUUID + agent.Nodes[i].Targets[t].nodeType = node.Type + } } f.Agents[agentID] = agent } @@ -171,6 +181,9 @@ type Agent struct { } type Node struct { + // fsUUID is set by InheritGlobalConfig and used internally to generate globally unique names + // and identifiers in case resources for multiple file systems exist on the same machine. + fsUUID string ID beegfs.NumId `yaml:"id"` Type beegfs.NodeType `yaml:"type"` Config map[string]string `yaml:"config"` @@ -179,6 +192,10 @@ type Node struct { Source *NodeSource `yaml:"source,omitempty"` } +func (n Node) GetSystemdUnit() string { + return fmt.Sprintf("beegfs-%s-%s-%d.service", n.fsUUID, n.Type, n.ID) +} + type NodeSource struct { Type SourceType `yaml:"type"` Ref string `yaml:"ref"` @@ -190,9 +207,17 @@ type Nic struct { } type Target struct { - ID beegfs.NumId `yaml:"id"` - RootDir string `yaml:"root_dir"` - ULFS *UnderlyingFS `yaml:"ulfs"` + // fsUUID is set by InheritGlobalConfig and used internally to generate globally unique names + // and identifiers in case resources for multiple file systems exist on the same machine. 
+ fsUUID string + nodeType beegfs.NodeType + ID beegfs.NumId `yaml:"id"` + RootDir string `yaml:"root_dir"` + ULFS *UnderlyingFS `yaml:"ulfs"` +} + +func (t Target) GetPath() string { + return path.Join(t.RootDir, t.fsUUID, fmt.Sprintf("%s_%d", t.nodeType, t.ID)) } type UnderlyingFS struct { @@ -416,20 +441,20 @@ func ToProto(fs *Filesystem) *pb.Filesystem { return pbFS } -func FromDisk(path string) (Filesystem, error) { +func FromDisk(path string) (Filesystems, error) { data, err := os.ReadFile(path) if err != nil { - return Filesystem{}, err + return nil, err } - var fs Filesystem - if err := yaml.Unmarshal(data, &fs); err != nil { - return fs, err + var filesystems Filesystems + if err := yaml.Unmarshal(data, &filesystems); err != nil { + return nil, err } - return fs, nil + return filesystems, nil } -func ToDisk(fs Filesystem, path string) error { - data, err := yaml.Marshal(&fs) +func ToDisk(filesystems Filesystems, path string) error { + data, err := yaml.Marshal(&filesystems) if err != nil { return err } diff --git a/agent/pkg/manifest/filesystem_test.go b/agent/pkg/manifest/filesystem_test.go index a4ca7fe6..d3e2a7de 100644 --- a/agent/pkg/manifest/filesystem_test.go +++ b/agent/pkg/manifest/filesystem_test.go @@ -92,6 +92,12 @@ func TestInheritGlobalConfig(t *testing.T) { Type: beegfs.Meta, ID: 1, Config: map[string]string{"baz": "node-specific"}, + Targets: []Target{ + { + ID: beegfs.NumId(1), + RootDir: "/beegfs/", + }, + }, }, }, }, @@ -155,11 +161,16 @@ func TestInheritGlobalConfig(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { fs := tt.input - fs.InheritGlobalConfig() + fs.InheritGlobalConfig("testFS") agent := fs.Agents["agent1"] node := agent.Nodes[0] assert.Equal(t, tt.expectedNIC, node.Interfaces[0].Name) assert.Equal(t, tt.expectedCfg, node.Config) + assert.Equal(t, "testFS", node.fsUUID) + for _, target := range node.Targets { + assert.Equal(t, "/beegfs/testFS/meta_1", target.GetPath(), "generated target path did 
not match") + } + }) } } diff --git a/agent/pkg/manifest/manifest.yaml b/agent/pkg/manifest/manifest.yaml index b515fe8f..574879a4 100644 --- a/agent/pkg/manifest/manifest.yaml +++ b/agent/pkg/manifest/manifest.yaml @@ -1,80 +1,83 @@ -common: - auth: secret - meta_config: - quotaEnableEnforcement: true - storeClientXAttrs: true - storeClientACLs: true - storage_config: - quotaEnableEnforcement: true - client_config: - quotaEnabled: true - source: - type: package - repo: https://www.beegfs.io/release/beegfs_8.0/ - meta: beegfs-meta=8.0.1 - storage: beegfs-storage=8.0.1 +beegfs01: #fsUUID + common: + auth: secret + meta_config: + quotaEnableEnforcement: true + storeClientXAttrs: true + storeClientACLs: true + storage_config: + quotaEnableEnforcement: true + client_config: + quotaEnabled: true + source: + # type: package + type: local + repo: https://www.beegfs.io/release/beegfs_8.0/ + meta: beegfs-meta=8.0.1 + storage: beegfs-storage=8.0.1 -# source: -# type: container -# repo: ghcr.io/thinkparq -# meta: beegfs-meta:8.0.1 -# storage: beegfs-storage:8.0.1 + # source: + # type: container + # repo: ghcr.io/thinkparq + # meta: beegfs-meta:8.0.1 + # storage: beegfs-storage:8.0.1 -agents: - agent1: # agent-id - interfaces: - - name: enp0s1 - address: "192.168.64.5/24" - nodes: - - type: meta - id: 1 - source: - type: local - ref: /home/joe/development/beegfs/meta/build/beegfs-meta - interfaces: - - name: enp0s1 # IP configuration handled globally - targets: - - id: 101 - ulfs: - device: /dev/sda1 - type: ext4 - format_flags: foo - mount_flags: baz - # agent2: # agent-id - # nodes: - # - type: storage - # id: 1 - # interfaces: - # - name: ib0 - # address: 10.0.0.102/16 - # targets: - # - id: 101 - # ulfs: - # device: /dev/sda1 - # type: ext4 - # format_flags: foo - # mount_flags: baz - # - id: 102 - # ulfs: - # device: /dev/sda1 - # type: ext4 - # format_flags: foo - # mount_flags: baz - # - type: storage - # id: 2 - # interfaces: - # - name: ib1 - # address: 10.0.0.102/16 
- # targets: - # - id: 201 - # ulfs: - # device: /dev/sda2 - # type: ext4 - # format_flags: foo - # mount_flags: baz - # - id: 202 - # ulfs: - # device: /dev/sda2 - # type: ext4 - # format_flags: foo - # mount_flags: baz + agents: + agent1: # agentID + interfaces: + - name: enp0s1 + address: "10.0.0.100/24" + nodes: + - type: meta + id: 1 + source: + type: local + ref: /development/beegfs/meta/build/beegfs-meta + interfaces: + - name: enp0s1 # IP configuration handled globally + targets: + - id: 101 + root_dir: /beegfs/ + # ulfs: + # device: /dev/sda1 + # type: ext4 + # format_flags: foo + # mount_flags: baz + # agent2: # agent-id + # nodes: + # - type: storage + # id: 1 + # interfaces: + # - name: ib0 + # address: 10.0.0.102/16 + # targets: + # - id: 101 + # ulfs: + # device: /dev/sda1 + # type: ext4 + # format_flags: foo + # mount_flags: baz + # - id: 102 + # ulfs: + # device: /dev/sda1 + # type: ext4 + # format_flags: foo + # mount_flags: baz + # - type: storage + # id: 2 + # interfaces: + # - name: ib1 + # address: 10.0.0.102/16 + # targets: + # - id: 201 + # ulfs: + # device: /dev/sda2 + # type: ext4 + # format_flags: foo + # mount_flags: baz + # - id: 202 + # ulfs: + # device: /dev/sda2 + # type: ext4 + # format_flags: foo + # mount_flags: baz diff --git a/agent/pkg/reconciler/reconciler.go b/agent/pkg/reconciler/reconciler.go index 0d704eb1..f91c079d 100644 --- a/agent/pkg/reconciler/reconciler.go +++ b/agent/pkg/reconciler/reconciler.go @@ -1,6 +1,8 @@ package reconciler import ( + "context" + "errors" "fmt" "path" "reflect" @@ -36,6 +38,7 @@ type Reconciler interface { Status() (ReconcileResult, error) Cancel(string) (ReconcileResult, error) UpdateConfiguration(any) error + Stop() error } type ReconcileResult struct { @@ -43,22 +46,22 @@ type ReconcileResult struct { } type defaultReconciler struct { - agentID string - log *zap.Logger - mu sync.Mutex - currentFS manifest.Filesystem - state state - config Config - strategy deploy.Deployer + agentID string + 
log *zap.Logger + mu sync.Mutex + activeManifest manifest.Filesystems + state state + config Config + strategy deploy.Deployer } -func New(agentID string, log *zap.Logger, config Config) (Reconciler, error) { +func New(ctx context.Context, agentID string, log *zap.Logger, config Config) (Reconciler, error) { log = log.With(zap.String("component", path.Base(reflect.TypeOf(defaultReconciler{}).PkgPath()))) var deploymentStrategy deploy.Deployer var err error switch config.DeploymentStrategy { case DefaultStrategy: - if deploymentStrategy, err = deploy.NewDefaultStrategy(); err != nil { + if deploymentStrategy, err = deploy.NewDefaultStrategy(ctx); err != nil { return nil, fmt.Errorf("unable to configure deployment strategy: %w", err) } default: @@ -74,6 +77,14 @@ func New(agentID string, log *zap.Logger, config Config) (Reconciler, error) { }, nil } +func (r *defaultReconciler) Stop() error { + r.log.Info("attempting to stop reconciler") + r.state.cancel("agent is shutting down") + r.mu.Lock() + defer r.mu.Unlock() + return r.strategy.Cleanup() +} + func (r *defaultReconciler) GetAgentID() string { return r.agentID } @@ -110,7 +121,7 @@ func (r *defaultReconciler) UpdateConfiguration(config any) error { return fmt.Errorf("%w: %w", ErrLoadingManifest, err) } return r.verify(newFS) - } else if newFS, ok := config.(manifest.Filesystem); ok { + } else if newFS, ok := config.(manifest.Filesystems); ok { r.mu.Lock() r.log.Info("saving file system manifest", zap.String("path", r.config.ActiveManifestPath)) err := manifest.ToDisk(newFS, r.config.ManifestPath) @@ -125,37 +136,79 @@ func (r *defaultReconciler) UpdateConfiguration(config any) error { // Verify performs any checks that can be done without actually reconciling the manifest. This // allows a response to be returned quickly while the reconciliation happens in the background. 
-func (r *defaultReconciler) verify(newFS manifest.Filesystem) error { +func (r *defaultReconciler) verify(newManifest manifest.Filesystems) error { r.log.Info("verifying manifest") - newFS.InheritGlobalConfig() - // TODO: - // * Avoid necessary reconciliations by seeing if the manifest changed. - // * Validate we can migrate from currentFS to newFS. - go r.reconcile(newFS) + if len(newManifest) == 0 { + return errors.New("manifest does not contain any file systems") + } + for fsUUID, fs := range newManifest { + // TODO: + // * Avoid necessary reconciliations by seeing if the manifest changed. + // * Validate we can migrate from currentFS to newFS. + // * Validate the FS config: + // * All nodes have IPs + targets. + // * Nodes have the correct number of targets (i.e., 1 for mgmtd meta, remote, sync). + // Note these should be implemented as methods on manifest.Filesystem. + fs.InheritGlobalConfig(fsUUID) + } + go r.reconcile(newManifest) return nil } // Reconcile attempts to move the local state from the currentFS to the newFS. -func (r *defaultReconciler) reconcile(newFS manifest.Filesystem) { +func (r *defaultReconciler) reconcile(newManifest manifest.Filesystems) { r.mu.Lock() defer r.mu.Unlock() - r.log.Debug("reconciling", zap.Any("filesystem", newFS)) + r.log.Debug("reconciling", zap.Any("filesystem", newManifest)) ctx := r.state.start() - agent, ok := newFS.Agents[r.agentID] - if !ok { - r.state.cancel("no configuration for this agent found in the provided manifest") - return - } + for fsUUID, fs := range newManifest { + agent, ok := fs.Agents[r.agentID] + if !ok { + // Not all file systems in this manifest may have configuration for this agent. It is + // also valid that this manifest has no nodes managed by this agent. 
+ r.log.Debug("file system has no nodes assigned to this agent", zap.String("fsUUID", fsUUID)) + continue + } - if err := r.strategy.AddInterfaces(ctx, agent.Interfaces); err != nil { - r.state.fail(err.Error()) - return - } + // Don't apply any common configuration if the agent doesn't have any nodes for this file system. + if err := r.strategy.ApplySource(ctx, fs.Common.Source); err != nil { + r.state.fail(fmt.Sprintf("unable to apply source configuration for %s: %s", fsUUID, err.Error())) + return + } - // TODO + if err := r.strategy.ApplyInterfaces(ctx, agent.Interfaces); err != nil { + r.state.fail(fmt.Sprintf("unable to apply global interface configuration for %s: %s", fsUUID, err.Error())) + return + } - r.currentFS = newFS - manifest.ToDisk(r.currentFS, r.config.ActiveManifestPath) + for _, node := range agent.Nodes { + if err := r.strategy.ApplyInterfaces(ctx, node.Interfaces); err != nil { + r.state.fail(fmt.Sprintf("unable to apply interface configuration for %s: %s", getFsNodeID(fsUUID, node.Type, node.ID), err.Error())) + return + } + if err := r.strategy.ApplyTargets(ctx, node.Targets); err != nil { + r.state.fail(fmt.Sprintf("unable to apply target configuration for %s: %s", getFsNodeID(fsUUID, node.Type, node.ID), err.Error())) + return + } + + // Currently the source for the node should always be set by the user or inherited + // automatically from the global configuration. This might change so avoid a panic. 
+ if node.Source != nil { + if err := r.strategy.ApplySourceInstall(ctx, *node.Source); err != nil { + r.state.fail(fmt.Sprintf("unable to apply source installation for %s: %s", getFsNodeID(fsUUID, node.Type, node.ID), err.Error())) + } + } else { + r.log.Warn("node source was unexpectedly nil (ignoring)", zap.String("fsUUID", fsUUID), zap.String("nodeType", node.Type.String()), zap.Any("nodeID", node.ID)) + } + + if err := r.strategy.ApplyService(ctx, node); err != nil { + r.state.fail(fmt.Sprintf("unable to apply service configuration for %s: %s", getFsNodeID(fsUUID, node.Type, node.ID), err.Error())) + return + } + } + } + r.activeManifest = newManifest + manifest.ToDisk(r.activeManifest, r.config.ActiveManifestPath) r.state.complete(beegfs.AgentStatus_SUCCESS) } diff --git a/agent/pkg/reconciler/state.go b/agent/pkg/reconciler/state.go index 3ae5b10a..e76ac2cd 100644 --- a/agent/pkg/reconciler/state.go +++ b/agent/pkg/reconciler/state.go @@ -6,7 +6,8 @@ import ( "sync" "time" - "github.com/thinkparq/protobuf/go/beegfs" + "github.com/thinkparq/beegfs-go/common/beegfs" + pb "github.com/thinkparq/protobuf/go/beegfs" "go.uber.org/zap" "google.golang.org/protobuf/proto" "google.golang.org/protobuf/types/known/timestamppb" @@ -14,96 +15,92 @@ import ( type state struct { logger *zap.Logger - current beegfs.AgentStatus - historical map[time.Time]*beegfs.AgentStatus + current pb.AgentStatus + historical map[time.Time]*pb.AgentStatus mu sync.Mutex ctx context.Context ctxCancel context.CancelFunc } -type op string - -const ( - unknown op = "UNKNOWN" - agent = "AGENT" - mount = "MOUNT" -) - func newAgentState(l *zap.Logger) state { return state{ - current: beegfs.AgentStatus{ - State: beegfs.AgentStatus_IDLE, + current: pb.AgentStatus{ + State: pb.AgentStatus_IDLE, Messages: []string{}, Updated: timestamppb.Now(), }, - historical: make(map[time.Time]*beegfs.AgentStatus), + historical: make(map[time.Time]*pb.AgentStatus), mu: sync.Mutex{}, logger: l, } } +func 
getFsNodeID(fsUUID string, nt beegfs.NodeType, id beegfs.NumId) string { + return fmt.Sprintf("%s:%s:%d", fsUUID, nt, id) +} + // start() marks the beginning of a reconciliation. It returns a context that will be cancelled if // the reconciliation is cancelled early. func (s *state) start() context.Context { s.mu.Lock() defer s.mu.Unlock() - s.logger.Info("state update", zap.String("oldState", s.current.State.String()), zap.String("newState", beegfs.AgentStatus_APPLYING.String())) - s.historical[time.Now()] = proto.Clone(&s.current).(*beegfs.AgentStatus) - s.current = beegfs.AgentStatus{ - State: beegfs.AgentStatus_APPLYING, + s.logger.Info("state update", zap.String("oldState", s.current.State.String()), zap.String("newState", pb.AgentStatus_APPLYING.String())) + s.historical[time.Now()] = proto.Clone(&s.current).(*pb.AgentStatus) + s.current = pb.AgentStatus{ + State: pb.AgentStatus_APPLYING, Messages: []string{}, Updated: timestamppb.Now(), } ctx, cancel := context.WithCancel(context.Background()) s.ctx = ctx s.ctxCancel = cancel - s.logUnlocked(agent, "began reconciliation") + s.logUnlocked("began reconciliation") return s.ctx } -func (s *state) get() *beegfs.AgentStatus { +func (s *state) get() *pb.AgentStatus { s.mu.Lock() defer s.mu.Unlock() - return proto.Clone(&s.current).(*beegfs.AgentStatus) + return proto.Clone(&s.current).(*pb.AgentStatus) } -func (s *state) logUnlocked(cat op, message string) { +func (s *state) logUnlocked(message string) { s.current.Updated = timestamppb.Now() - s.current.Messages = append(s.current.Messages, fmt.Sprintf("%s [%s]: %s", s.current.Updated.String(), cat, message)) + s.current.Messages = append(s.current.Messages, fmt.Sprintf("%s: %s", s.current.Updated.String(), message)) } -func (s *state) log(cat op, message string) { +func (s *state) log(message string) { s.mu.Lock() defer s.mu.Unlock() s.current.Updated = timestamppb.Now() - s.current.Messages = append(s.current.Messages, fmt.Sprintf("%s [%s]: %s", 
s.current.Updated.String(), cat, message)) + s.current.Messages = append(s.current.Messages, fmt.Sprintf("%s: %s", s.current.Updated.String(), message)) } -func (s *state) fail(message string) *beegfs.AgentStatus { +func (s *state) fail(message string) *pb.AgentStatus { s.mu.Lock() defer s.mu.Unlock() - s.logger.Info("state update", zap.String("oldState", s.current.State.String()), zap.String("newState", beegfs.AgentStatus_FAILED.String())) - s.current.State = beegfs.AgentStatus_FAILED - s.logUnlocked(agent, message) + s.logger.Info("state update", zap.String("oldState", s.current.State.String()), zap.String("newState", pb.AgentStatus_FAILED.String()), zap.Any("message", message)) + s.current.State = pb.AgentStatus_FAILED + s.logUnlocked(message) s.ctxCancel() - return proto.Clone(&s.current).(*beegfs.AgentStatus) + return proto.Clone(&s.current).(*pb.AgentStatus) } -func (s *state) cancel(message string) *beegfs.AgentStatus { +func (s *state) cancel(message string) *pb.AgentStatus { s.mu.Lock() defer s.mu.Unlock() - s.logger.Info("state update", zap.String("oldState", s.current.State.String()), zap.String("newState", beegfs.AgentStatus_CANCELLED.String())) - s.current.State = beegfs.AgentStatus_CANCELLED - s.logUnlocked(agent, message) + s.logger.Info("state update", zap.String("oldState", s.current.State.String()), zap.String("newState", pb.AgentStatus_CANCELLED.String()), zap.Any("message", message)) + s.current.State = pb.AgentStatus_CANCELLED + s.logUnlocked(message) s.ctxCancel() - return proto.Clone(&s.current).(*beegfs.AgentStatus) + return proto.Clone(&s.current).(*pb.AgentStatus) } -func (s *state) complete(finalState beegfs.AgentStatus_State) { +func (s *state) complete(finalState pb.AgentStatus_State) { s.mu.Lock() defer s.mu.Unlock() s.logger.Info("state update", zap.String("oldState", s.current.State.String()), zap.String("newState", finalState.String())) s.current.State = finalState - s.logUnlocked(agent, "finished reconciliation") + 
s.logUnlocked("finished reconciliation") s.ctxCancel() } diff --git a/go.mod b/go.mod index 7b30a3e8..74f3a397 100644 --- a/go.mod +++ b/go.mod @@ -11,6 +11,7 @@ require ( github.com/aws/aws-sdk-go-v2/service/s3 v1.51.3 github.com/aws/smithy-go v1.20.4 github.com/bmatcuk/doublestar/v4 v4.8.1 + github.com/coreos/go-systemd/v22 v22.5.0 github.com/dgraph-io/badger/v4 v4.3.0 github.com/dsnet/golib/unitconv v1.0.2 github.com/google/uuid v1.6.0 @@ -52,6 +53,7 @@ require ( github.com/dgraph-io/ristretto v0.1.2-0.20240116140435-c67e07994f91 // indirect github.com/dustin/go-humanize v1.0.1 // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect + github.com/godbus/dbus/v5 v5.0.4 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/golang/protobuf v1.5.4 // indirect diff --git a/go.sum b/go.sum index 59d66ee2..882186f7 100644 --- a/go.sum +++ b/go.sum @@ -43,6 +43,8 @@ github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UF github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= +github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs= +github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -70,6 +72,8 @@ github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= 
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/godbus/dbus/v5 v5.0.4 h1:9349emZab16e7zQvpmsbtjc18ykshndd8y2PG3sgJbA= +github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= From ded65b96f0585fde2b6f52ee8314682bbf1a4159 Mon Sep 17 00:00:00 2001 From: Joe McCormick <31295332+iamjoemccormick@users.noreply.github.com> Date: Tue, 13 May 2025 19:38:06 +0000 Subject: [PATCH 05/13] wip: move agent protos to a new package --- agent/internal/server/server.go | 18 ++++++------ agent/pkg/manifest/filesystem.go | 2 +- agent/pkg/manifest/filesystem_test.go | 5 ++-- agent/pkg/reconciler/reconciler.go | 6 ++-- agent/pkg/reconciler/state.go | 42 +++++++++++++-------------- 5 files changed, 37 insertions(+), 36 deletions(-) diff --git a/agent/internal/server/server.go b/agent/internal/server/server.go index 3c53e282..18f6f6ba 100644 --- a/agent/internal/server/server.go +++ b/agent/internal/server/server.go @@ -11,7 +11,7 @@ import ( "github.com/thinkparq/beegfs-go/agent/pkg/manifest" "github.com/thinkparq/beegfs-go/agent/pkg/reconciler" - "github.com/thinkparq/protobuf/go/beegfs" + pb "github.com/thinkparq/protobuf/go/agent" "go.uber.org/zap" "google.golang.org/grpc" "google.golang.org/grpc/codes" @@ -27,7 +27,7 @@ type Config struct { } type AgentServer struct { - beegfs.UnimplementedBeeAgentServer + pb.UnimplementedBeeAgentServer log *zap.Logger wg *sync.WaitGroup Config @@ -55,7 +55,7 @@ func New(log *zap.Logger, config Config, reconciler reconciler.Reconciler) (*Age s.log.Warn("not using TLS because it was explicitly disabled or a certificate 
and/or key were not specified") } s.grpcServer = grpc.NewServer(grpcServerOpts...) - beegfs.RegisterBeeAgentServer(s.grpcServer, &s) + pb.RegisterBeeAgentServer(s.grpcServer, &s) return &s, nil } @@ -81,7 +81,7 @@ func (s *AgentServer) Stop() { s.wg.Wait() } -func (s *AgentServer) Update(ctx context.Context, request *beegfs.AgentUpdateRequest) (*beegfs.AgentUpdateResponse, error) { +func (s *AgentServer) Update(ctx context.Context, request *pb.UpdateRequest) (*pb.UpdateResponse, error) { s.wg.Add(1) defer s.wg.Done() @@ -96,31 +96,31 @@ func (s *AgentServer) Update(ctx context.Context, request *beegfs.AgentUpdateReq if err := s.reconciler.UpdateConfiguration(filesystems); err != nil { return nil, grpcStatusFrom(err) } - return &beegfs.AgentUpdateResponse{ + return &pb.UpdateResponse{ AgentId: s.reconciler.GetAgentID(), }, nil } -func (s *AgentServer) Status(ctx context.Context, request *beegfs.AgentStatusRequest) (*beegfs.AgentStatusResponse, error) { +func (s *AgentServer) Status(ctx context.Context, request *pb.StatusRequest) (*pb.StatusResponse, error) { s.wg.Add(1) defer s.wg.Done() if result, err := s.reconciler.Status(); err != nil { return nil, grpcStatusFrom(err) } else { - return &beegfs.AgentStatusResponse{ + return &pb.StatusResponse{ Status: result.Status, AgentId: s.reconciler.GetAgentID(), }, nil } } -func (s *AgentServer) Cancel(ctx context.Context, request *beegfs.AgentCancelRequest) (*beegfs.AgentCancelResponse, error) { +func (s *AgentServer) Cancel(ctx context.Context, request *pb.CancelRequest) (*pb.CancelResponse, error) { s.wg.Add(1) defer s.wg.Done() if result, err := s.reconciler.Cancel(request.GetReason()); err != nil { return nil, grpcStatusFrom(err) } else { - return &beegfs.AgentCancelResponse{ + return &pb.CancelResponse{ Status: result.Status, AgentId: s.reconciler.GetAgentID(), }, nil diff --git a/agent/pkg/manifest/filesystem.go b/agent/pkg/manifest/filesystem.go index f08fbaf4..8c4fb9c6 100644 --- a/agent/pkg/manifest/filesystem.go 
+++ b/agent/pkg/manifest/filesystem.go @@ -11,7 +11,7 @@ import ( "strings" "github.com/thinkparq/beegfs-go/common/beegfs" - pb "github.com/thinkparq/protobuf/go/beegfs" + pb "github.com/thinkparq/protobuf/go/agent" "gopkg.in/yaml.v3" ) diff --git a/agent/pkg/manifest/filesystem_test.go b/agent/pkg/manifest/filesystem_test.go index d3e2a7de..0daaea88 100644 --- a/agent/pkg/manifest/filesystem_test.go +++ b/agent/pkg/manifest/filesystem_test.go @@ -5,7 +5,8 @@ import ( "github.com/stretchr/testify/assert" "github.com/thinkparq/beegfs-go/common/beegfs" - pb "github.com/thinkparq/protobuf/go/beegfs" + pb "github.com/thinkparq/protobuf/go/agent" + pbb "github.com/thinkparq/protobuf/go/beegfs" ) func TestFromToProto_RoundTrip(t *testing.T) { @@ -28,7 +29,7 @@ func TestFromToProto_RoundTrip(t *testing.T) { Nodes: []*pb.Node{ { NumId: 1, - NodeType: pb.NodeType_META, + NodeType: pbb.NodeType_META, Config: map[string]string{"nkey": "nval"}, Interfaces: []*pb.Nic{ {Name: "ib0", Addr: "10.0.0.1/16"}, diff --git a/agent/pkg/reconciler/reconciler.go b/agent/pkg/reconciler/reconciler.go index f91c079d..83a17c18 100644 --- a/agent/pkg/reconciler/reconciler.go +++ b/agent/pkg/reconciler/reconciler.go @@ -10,7 +10,7 @@ import ( "github.com/thinkparq/beegfs-go/agent/pkg/deploy" "github.com/thinkparq/beegfs-go/agent/pkg/manifest" - "github.com/thinkparq/protobuf/go/beegfs" + pb "github.com/thinkparq/protobuf/go/agent" "go.uber.org/zap" ) @@ -42,7 +42,7 @@ type Reconciler interface { } type ReconcileResult struct { - Status *beegfs.AgentStatus + Status *pb.Status } type defaultReconciler struct { @@ -210,5 +210,5 @@ func (r *defaultReconciler) reconcile(newManifest manifest.Filesystems) { } r.activeManifest = newManifest manifest.ToDisk(r.activeManifest, r.config.ActiveManifestPath) - r.state.complete(beegfs.AgentStatus_SUCCESS) + r.state.complete(pb.Status_SUCCESS) } diff --git a/agent/pkg/reconciler/state.go b/agent/pkg/reconciler/state.go index e76ac2cd..476c7f5d 100644 --- 
a/agent/pkg/reconciler/state.go +++ b/agent/pkg/reconciler/state.go @@ -7,7 +7,7 @@ import ( "time" "github.com/thinkparq/beegfs-go/common/beegfs" - pb "github.com/thinkparq/protobuf/go/beegfs" + pb "github.com/thinkparq/protobuf/go/agent" "go.uber.org/zap" "google.golang.org/protobuf/proto" "google.golang.org/protobuf/types/known/timestamppb" @@ -15,8 +15,8 @@ import ( type state struct { logger *zap.Logger - current pb.AgentStatus - historical map[time.Time]*pb.AgentStatus + current pb.Status + historical map[time.Time]*pb.Status mu sync.Mutex ctx context.Context ctxCancel context.CancelFunc @@ -24,12 +24,12 @@ type state struct { func newAgentState(l *zap.Logger) state { return state{ - current: pb.AgentStatus{ - State: pb.AgentStatus_IDLE, + current: pb.Status{ + State: pb.Status_IDLE, Messages: []string{}, Updated: timestamppb.Now(), }, - historical: make(map[time.Time]*pb.AgentStatus), + historical: make(map[time.Time]*pb.Status), mu: sync.Mutex{}, logger: l, } @@ -44,10 +44,10 @@ func getFsNodeID(fsUUID string, nt beegfs.NodeType, id beegfs.NumId) string { func (s *state) start() context.Context { s.mu.Lock() defer s.mu.Unlock() - s.logger.Info("state update", zap.String("oldState", s.current.State.String()), zap.String("newState", pb.AgentStatus_APPLYING.String())) - s.historical[time.Now()] = proto.Clone(&s.current).(*pb.AgentStatus) - s.current = pb.AgentStatus{ - State: pb.AgentStatus_APPLYING, + s.logger.Info("state update", zap.String("oldState", s.current.State.String()), zap.String("newState", pb.Status_APPLYING.String())) + s.historical[time.Now()] = proto.Clone(&s.current).(*pb.Status) + s.current = pb.Status{ + State: pb.Status_APPLYING, Messages: []string{}, Updated: timestamppb.Now(), } @@ -58,10 +58,10 @@ func (s *state) start() context.Context { return s.ctx } -func (s *state) get() *pb.AgentStatus { +func (s *state) get() *pb.Status { s.mu.Lock() defer s.mu.Unlock() - return proto.Clone(&s.current).(*pb.AgentStatus) + return 
proto.Clone(&s.current).(*pb.Status) } func (s *state) logUnlocked(message string) { @@ -76,27 +76,27 @@ func (s *state) log(message string) { s.current.Messages = append(s.current.Messages, fmt.Sprintf("%s: %s", s.current.Updated.String(), message)) } -func (s *state) fail(message string) *pb.AgentStatus { +func (s *state) fail(message string) *pb.Status { s.mu.Lock() defer s.mu.Unlock() - s.logger.Info("state update", zap.String("oldState", s.current.State.String()), zap.String("newState", pb.AgentStatus_FAILED.String()), zap.Any("message", message)) - s.current.State = pb.AgentStatus_FAILED + s.logger.Info("state update", zap.String("oldState", s.current.State.String()), zap.String("newState", pb.Status_FAILED.String()), zap.Any("message", message)) + s.current.State = pb.Status_FAILED s.logUnlocked(message) s.ctxCancel() - return proto.Clone(&s.current).(*pb.AgentStatus) + return proto.Clone(&s.current).(*pb.Status) } -func (s *state) cancel(message string) *pb.AgentStatus { +func (s *state) cancel(message string) *pb.Status { s.mu.Lock() defer s.mu.Unlock() - s.logger.Info("state update", zap.String("oldState", s.current.State.String()), zap.String("newState", pb.AgentStatus_CANCELLED.String()), zap.Any("message", message)) - s.current.State = pb.AgentStatus_CANCELLED + s.logger.Info("state update", zap.String("oldState", s.current.State.String()), zap.String("newState", pb.Status_CANCELLED.String()), zap.Any("message", message)) + s.current.State = pb.Status_CANCELLED s.logUnlocked(message) s.ctxCancel() - return proto.Clone(&s.current).(*pb.AgentStatus) + return proto.Clone(&s.current).(*pb.Status) } -func (s *state) complete(finalState pb.AgentStatus_State) { +func (s *state) complete(finalState pb.Status_State) { s.mu.Lock() defer s.mu.Unlock() s.logger.Info("state update", zap.String("oldState", s.current.State.String()), zap.String("newState", finalState.String())) From 907bdc85107f98e440449948abf79f47d9f1212e Mon Sep 17 00:00:00 2001 From: Joe McCormick 
<31295332+iamjoemccormick@users.noreply.github.com> Date: Mon, 19 May 2025 17:19:18 +0000 Subject: [PATCH 06/13] wip: expand/improve common config - add auth and TLS structs - reduce boiler plate on common service and source config - split out filesystem.go into multiple files --- agent/pkg/manifest/common.go | 175 +++++++++++++++ agent/pkg/manifest/filesystem.go | 302 +++++--------------------- agent/pkg/manifest/filesystem_test.go | 45 ++-- agent/pkg/manifest/manifest.yaml | 96 +++----- agent/pkg/manifest/node.go | 28 +++ agent/pkg/manifest/target.go | 89 ++++++++ agent/pkg/reconciler/state.go | 5 + 7 files changed, 417 insertions(+), 323 deletions(-) create mode 100644 agent/pkg/manifest/common.go create mode 100644 agent/pkg/manifest/node.go create mode 100644 agent/pkg/manifest/target.go diff --git a/agent/pkg/manifest/common.go b/agent/pkg/manifest/common.go new file mode 100644 index 00000000..fb2b8713 --- /dev/null +++ b/agent/pkg/manifest/common.go @@ -0,0 +1,175 @@ +package manifest + +import ( + "fmt" + + "github.com/thinkparq/beegfs-go/common/beegfs" + pb "github.com/thinkparq/protobuf/go/agent" +) + +type Common struct { + Auth *Auth `yaml:"auth"` + TLS *TLS `yaml:"tls"` + GlobalConfig NodeConfigs `yaml:"config"` + Source Source `yaml:"source"` +} + +type Auth struct { + Secret string `yaml:"secret"` +} + +type TLS struct { + Key string `yaml:"key"` + Cert string `yaml:"cert"` +} + +type NodeConfigs map[beegfs.NodeType]map[string]string + +func (s *NodeConfigs) UnmarshalYAML(unmarshal func(any) error) error { + // We cannot directly apply validation to map[beegfs.NodeType]... during unmarshal because the + // YAML input uses string keys and as a result things blow up (spectacularly). 
+ intermediate := map[string]map[string]string{} + if err := unmarshal(&intermediate); err != nil { + return err + } + + result := make(NodeConfigs, len(intermediate)) + for key, val := range intermediate { + nodeType := beegfs.NodeTypeFromString(key) + if nodeType == beegfs.InvalidNodeType { + return fmt.Errorf("invalid node type '%s' in config", key) + } + result[nodeType] = val + } + + *s = result + return nil +} + +func (c NodeConfigs) toProto() []*pb.NodeConfig { + pbNodeConfigs := make([]*pb.NodeConfig, 0, len(c)) + for nodeType, nodeMap := range c { + pbNodeConfigs = append(pbNodeConfigs, &pb.NodeConfig{ + NodeType: *nodeType.ToProto(), + StringMap: nodeMap, + }) + } + return pbNodeConfigs +} + +func nodeConfigsFromProto(m []*pb.NodeConfig) NodeConfigs { + nsm := make(NodeConfigs, len(m)) + for _, node := range m { + if node != nil && node.GetStringMap() != nil { + nsm[beegfs.NodeTypeFromProto(node.NodeType)] = node.GetStringMap() + } + } + return nsm +} + +type Source struct { + Type SourceType `yaml:"type"` + Repo string `yaml:"repo"` + Refs SourceRefs `yaml:"refs"` +} + +type SourceRefs map[beegfs.NodeType]string + +func (s *SourceRefs) UnmarshalYAML(unmarshal func(any) error) error { + // We cannot directly apply validation to map[beegfs.NodeType]... during unmarshal because the + // YAML input uses string keys and as a result things blow up (spectacularly). 
+ intermediate := map[string]string{} + if err := unmarshal(&intermediate); err != nil { + return err + } + + result := make(SourceRefs, len(intermediate)) + for key, val := range intermediate { + nodeType := beegfs.NodeTypeFromString(key) + if nodeType == beegfs.InvalidNodeType { + return fmt.Errorf("invalid node type '%s' in source refs", key) + } + result[nodeType] = val + } + + *s = result + return nil +} + +func (c SourceRefs) toProto() []*pb.SourceRef { + pbSourceRefs := make([]*pb.SourceRef, 0, len(c)) + for nodeType, ref := range c { + pbSourceRefs = append(pbSourceRefs, &pb.SourceRef{ + NodeType: *nodeType.ToProto(), + Ref: ref, + }) + } + return pbSourceRefs +} + +func sourceRefsFromProto(r []*pb.SourceRef) SourceRefs { + srs := make(SourceRefs, len(r)) + for _, ref := range r { + if ref != nil { + srs[beegfs.NodeTypeFromProto(ref.NodeType)] = ref.GetRef() + } + } + return srs +} + +type SourceType int + +const ( + UnknownSource SourceType = iota + LocalSource + PackageSource +) + +func (s SourceType) ToProto() pb.SourceType { + switch s { + case LocalSource: + return pb.SourceType_LOCAL + case PackageSource: + return pb.SourceType_PACKAGE + default: + return pb.SourceType_UNKNOWN + } +} + +func sourceTypeFromProto(st pb.SourceType) SourceType { + switch st { + case pb.SourceType_LOCAL: + return LocalSource + case pb.SourceType_PACKAGE: + return PackageSource + default: + return UnknownSource + } +} + +func (s *SourceType) UnmarshalYAML(unmarshal func(any) error) error { + var str string + if err := unmarshal(&str); err != nil { + return err + } + switch str { + case "local": + *s = LocalSource + case "package": + *s = PackageSource + default: + *s = UnknownSource + } + return nil +} + +func (s SourceType) MarshalYAML() (any, error) { + switch s { + case LocalSource: + return "local", nil + case PackageSource: + return "package", nil + default: + return "unknown", nil + } +} diff --git a/agent/pkg/manifest/filesystem.go b/agent/pkg/manifest/filesystem.go 
index 8c4fb9c6..8acf8662 100644 --- a/agent/pkg/manifest/filesystem.go +++ b/agent/pkg/manifest/filesystem.go @@ -5,28 +5,13 @@ package manifest import ( - "fmt" "os" - "path" - "strings" "github.com/thinkparq/beegfs-go/common/beegfs" pb "github.com/thinkparq/protobuf/go/agent" "gopkg.in/yaml.v3" ) -func New() Filesystem { - return Filesystem{ - Agents: make(map[string]Agent), - Common: Common{ - MetaConfig: make(map[string]string), - StorageConfig: make(map[string]string), - ClientConfig: make(map[string]string), - Source: Source{}, - }, - } -} - // Filesystems is a map of FsUUIDs to file systems. type Filesystems map[string]Filesystem @@ -35,88 +20,15 @@ type Filesystem struct { Common Common `yaml:"common"` } -type SourceType int - -const ( - UnknownSource SourceType = iota - LocalSource - PackageSource -) - -func (s SourceType) ToProto() pb.SourceType { - switch s { - case LocalSource: - return pb.SourceType_LOCAL - case PackageSource: - return pb.SourceType_PACKAGE - default: - return pb.SourceType_UNKNOWN - } -} - -func SourceTypeFromProto(st pb.SourceType) SourceType { - switch st { - case pb.SourceType_LOCAL: - return LocalSource - case pb.SourceType_PACKAGE: - return PackageSource - default: - return UnknownSource - } -} - -func (s *SourceType) UnmarshalYAML(unmarshal func(any) error) error { - var str string - if err := unmarshal(&str); err != nil { - return err - } - switch str { - case "local": - *s = LocalSource - case "package": - *s = PackageSource - default: - *s = UnknownSource - } - return nil -} - -func (s SourceType) MarshalYAML() (any, error) { - switch s { - case LocalSource: - return "local", nil - case PackageSource: - return "package", nil - default: - return "unknown", nil - } -} - -type Source struct { - Type SourceType `yaml:"type"` - Repo string `yaml:"repo"` - Management string `yaml:"management"` - Meta string `yaml:"meta"` - Storage string `yaml:"storage"` - Remote string `yaml:"remote"` - Sync string `yaml:"sync"` +type Agent struct 
{ + Nodes []Node `yaml:"nodes"` + // Global agent interfaces potentially reused by multiple nodes. + Interfaces []Nic `yaml:"interfaces"` } -func (s Source) refForNodeType(t beegfs.NodeType) string { - switch t { - case beegfs.Meta: - return s.Meta - case beegfs.Storage: - return s.Storage - case beegfs.Management: - return s.Management - case beegfs.Remote: - return s.Remote - case beegfs.Sync: - return s.Sync - default: - return "" - } +type Nic struct { + Name string `yaml:"name"` + Addr string `yaml:"address"` } func (f *Filesystem) InheritGlobalConfig(fsUUID string) { @@ -129,19 +41,16 @@ func (f *Filesystem) InheritGlobalConfig(fsUUID string) { node.Interfaces = agent.Interfaces } // Inherit global node configuration based on the node type. - switch agent.Nodes[i].Type { - case beegfs.Meta: - node.Config = inheritMapDefaults(f.Common.MetaConfig, node.Config) - case beegfs.Storage: - node.Config = inheritMapDefaults(f.Common.StorageConfig, node.Config) - case beegfs.Client: - node.Config = inheritMapDefaults(f.Common.ClientConfig, node.Config) + if commonNodeConfig, ok := f.Common.GlobalConfig[agent.Nodes[i].Type]; ok { + node.Config = inheritMapDefaults(commonNodeConfig, node.Config) } // Inherit global source configuration based on the node type. 
if node.Source == nil || node.Source.Ref == "" { node.Source = &NodeSource{ Type: f.Common.Source.Type, - Ref: f.Common.Source.refForNodeType(node.Type), + } + if ref, ok := f.Common.Source.Refs[node.Type]; ok { + node.Source.Ref = ref } } // Inherit target configuration from the FS and node: @@ -166,148 +75,36 @@ func inheritMapDefaults(defaults, target map[string]string) map[string]string { return target } -type Common struct { - Auth string `yaml:"auth"` - MetaConfig map[string]string `yaml:"meta_config"` - StorageConfig map[string]string `yaml:"storage_config"` - ClientConfig map[string]string `yaml:"client_config"` - Source Source `yaml:"source"` -} - -type Agent struct { - Nodes []Node `yaml:"nodes"` - // Global interfaces potentially reused by multiple nodes. - Interfaces []Nic `yaml:"interfaces"` -} - -type Node struct { - // fsUUID is set by InheritGlobalConfig and used internally to generate globally unique names - // and identifiers in case resources for multiple file systems exist on the same machine. - fsUUID string - ID beegfs.NumId `yaml:"id"` - Type beegfs.NodeType `yaml:"type"` - Config map[string]string `yaml:"config"` - Interfaces []Nic `yaml:"interfaces"` - Targets []Target `yaml:"targets"` - Source *NodeSource `yaml:"source,omitempty"` -} - -func (n Node) GetSystemdUnit() string { - return fmt.Sprintf("beegfs-%s-%s-%d.service", n.fsUUID, n.Type, n.ID) -} - -type NodeSource struct { - Type SourceType `yaml:"type"` - Ref string `yaml:"ref"` -} - -type Nic struct { - Name string `yaml:"name"` - Addr string `yaml:"address"` -} - -type Target struct { - // fsUUID is set by InheritGlobalConfig and used internally to generate globally unique names - // and identifiers in case resources for multiple file systems exist on the same machine. 
- fsUUID string - nodeType beegfs.NodeType - ID beegfs.NumId `yaml:"id"` - RootDir string `yaml:"root_dir"` - ULFS *UnderlyingFS `yaml:"ulfs"` -} - -func (t Target) GetPath() string { - return path.Join(t.RootDir, t.fsUUID, fmt.Sprintf("%s_%d", t.nodeType, t.ID)) -} - -type UnderlyingFS struct { - Device string `yaml:"device"` - Type UnderlyingFSType `yaml:"type"` - FormatFlags string `yaml:"format_flags"` - MountFlags string `yaml:"mount_flags"` -} - -type UnderlyingFSType int - -const ( - UnknownUnderlyingFS UnderlyingFSType = iota - EXT4UnderlyingFS -) - -func (t UnderlyingFSType) String() string { - switch t { - case EXT4UnderlyingFS: - return "ext4" - default: - return "unknown" - } -} - -func (t *UnderlyingFSType) UnmarshalYAML(unmarshal func(any) error) error { - var s string - if err := unmarshal(&s); err != nil { - return err - } - - switch strings.ToLower(s) { - case "ext4": - *t = EXT4UnderlyingFS - default: - return fmt.Errorf("invalid underlying fs type: %s", s) - } - return nil -} - -func (t UnderlyingFSType) MarshalYAML() (any, error) { - switch t { - case EXT4UnderlyingFS: - return "ext4", nil - default: - return nil, fmt.Errorf("unknown fs type: %d", t) - } -} - -func fsTypeFromProto(fs pb.Target_UnderlyingFSOpts_FsType) UnderlyingFSType { - switch fs { - case pb.Target_UnderlyingFSOpts_EXT4: - return EXT4UnderlyingFS - default: - return UnknownUnderlyingFS - } -} - -func fsTypeToProto(fs UnderlyingFSType) pb.Target_UnderlyingFSOpts_FsType { - switch fs { - case EXT4UnderlyingFS: - return pb.Target_UnderlyingFSOpts_EXT4 - default: - return pb.Target_UnderlyingFSOpts_UNSPECIFIED - } -} - func FromProto(protoFS *pb.Filesystem) Filesystem { - fs := New() + var fs Filesystem if protoFS == nil { return fs } pSrc := protoFS.GetCommon().GetSource() fs.Common = Common{ - Auth: protoFS.GetCommon().GetAuth(), - MetaConfig: protoFS.GetCommon().GetMetaConfig(), - StorageConfig: protoFS.GetCommon().GetStorageConfig(), - ClientConfig: 
protoFS.GetCommon().GetClientConfig(), + GlobalConfig: nodeConfigsFromProto(protoFS.Common.GetGlobalConfig()), Source: Source{ - Type: SourceTypeFromProto(pSrc.Type), - Repo: pSrc.Repo, - Management: pSrc.Management, - Meta: pSrc.Meta, - Storage: pSrc.Storage, - Remote: pSrc.Remote, - Sync: pSrc.Sync, + Type: sourceTypeFromProto(pSrc.Type), + Repo: pSrc.Repo, + Refs: sourceRefsFromProto(pSrc.Refs), }, } + if protoFS.GetCommon().GetAuth() != nil { + fs.Common.Auth = &Auth{ + Secret: protoFS.GetCommon().GetAuth().GetSecret(), + } + } + + if protoFS.GetCommon().GetTls() != nil { + fs.Common.TLS = &TLS{ + Key: protoFS.GetCommon().GetTls().GetKey(), + Cert: protoFS.GetCommon().GetTls().GetCert(), + } + } + + fs.Agents = make(map[string]Agent, len(protoFS.GetAgent())) for id, a := range protoFS.GetAgent() { agent := Agent{ Nodes: make([]Node, 0), @@ -330,7 +127,7 @@ func FromProto(protoFS *pb.Filesystem) Filesystem { if n.Source != nil { node.Source = &NodeSource{ - Type: SourceTypeFromProto(n.GetSource().GetType()), + Type: sourceTypeFromProto(n.GetSource().GetType()), Ref: n.GetSource().GetRef(), } } @@ -350,7 +147,7 @@ func FromProto(protoFS *pb.Filesystem) Filesystem { if t.GetUlfs() != nil { target.ULFS = &UnderlyingFS{ Device: t.GetUlfs().GetDevice(), - Type: fsTypeFromProto(t.GetUlfs().GetType()), + Type: ulfsTypeFromProto(t.GetUlfs().GetType()), FormatFlags: t.GetUlfs().GetFormatFlags(), MountFlags: t.GetUlfs().GetMountFlags(), } @@ -368,24 +165,29 @@ func FromProto(protoFS *pb.Filesystem) Filesystem { func ToProto(fs *Filesystem) *pb.Filesystem { pbFS := &pb.Filesystem{ Common: &pb.Filesystem_Common{ - Auth: fs.Common.Auth, - MetaConfig: fs.Common.MetaConfig, - StorageConfig: fs.Common.StorageConfig, - ClientConfig: fs.Common.ClientConfig, - Source: &pb.Filesystem_Common_Source{ - Type: fs.Common.Source.Type.ToProto(), - Repo: fs.Common.Source.Repo, - Management: fs.Common.Source.Management, - Meta: fs.Common.Source.Meta, - Storage: fs.Common.Source.Storage, - 
Remote: fs.Common.Source.Remote, - Sync: fs.Common.Source.Sync, + GlobalConfig: fs.Common.GlobalConfig.toProto(), + Source: &pb.Source{ + Type: fs.Common.Source.Type.ToProto(), + Repo: fs.Common.Source.Repo, + Refs: fs.Common.Source.Refs.toProto(), }, }, - Agent: make(map[string]*pb.Agent), } + if fs.Common.Auth != nil { + pbFS.Common.Auth = &pb.Auth{ + Secret: fs.Common.Auth.Secret, + } + } + + if fs.Common.TLS != nil { + pbFS.Common.Tls = &pb.TLS{ + Key: fs.Common.TLS.Key, + Cert: fs.Common.TLS.Cert, + } + } + for agentID, agent := range fs.Agents { pbAgent := &pb.Agent{ Nodes: make([]*pb.Node, 0, len(agent.Nodes)), @@ -427,7 +229,7 @@ func ToProto(fs *Filesystem) *pb.Filesystem { if tgt.ULFS != nil { pbTarget.Ulfs = &pb.Target_UnderlyingFSOpts{ Device: tgt.ULFS.Device, - Type: fsTypeToProto(tgt.ULFS.Type), + Type: tgt.ULFS.Type.toProto(), FormatFlags: tgt.ULFS.FormatFlags, MountFlags: tgt.ULFS.MountFlags, } diff --git a/agent/pkg/manifest/filesystem_test.go b/agent/pkg/manifest/filesystem_test.go index 0daaea88..5448a6a9 100644 --- a/agent/pkg/manifest/filesystem_test.go +++ b/agent/pkg/manifest/filesystem_test.go @@ -12,12 +12,27 @@ import ( func TestFromToProto_RoundTrip(t *testing.T) { original := &pb.Filesystem{ Common: &pb.Filesystem_Common{ - Auth: "secret", - MetaConfig: map[string]string{"key1": "val1"}, - StorageConfig: map[string]string{"key2": "val2"}, - ClientConfig: map[string]string{"key3": "val3"}, - Source: &pb.Filesystem_Common_Source{ + Auth: &pb.Auth{ + Secret: "secret", + }, + Tls: &pb.TLS{ + Key: "tlsKey", + Cert: "tlsCert", + }, + GlobalConfig: []*pb.NodeConfig{ + { + NodeType: pbb.NodeType_META, + StringMap: map[string]string{"key": "val"}, + }, + }, + Source: &pb.Source{ Type: pb.SourceType_PACKAGE, + Refs: []*pb.SourceRef{ + { + NodeType: pbb.NodeType_META, + Ref: "ref", + }, + }, }, }, @@ -74,13 +89,14 @@ func TestInheritGlobalConfig(t *testing.T) { name: "inherit source, NIC and meta config", input: Filesystem{ Common: Common{ - 
MetaConfig: map[string]string{ - "foo": "bar", - "baz": "global", - }, + GlobalConfig: NodeConfigs{beegfs.Meta: map[string]string{ + "foo": "bar", // inherited + "baz": "node-specific", // overridden + }}, Source: Source{ + Refs: SourceRefs{beegfs.Meta: "beegfs-meta=8.0.1"}, Type: PackageSource, - Meta: "beegfs-meta=8.0.1", + Repo: "repoURL", }, }, Agents: map[string]Agent{ @@ -118,12 +134,15 @@ func TestInheritGlobalConfig(t *testing.T) { name: "no inheritance if NICs or source are present", input: Filesystem{ Common: Common{ - MetaConfig: map[string]string{ - "quota": "enabled", + GlobalConfig: NodeConfigs{ + beegfs.Meta: map[string]string{ + "quota": "enabled", + }, }, Source: Source{ Type: PackageSource, - Meta: "beegfs-meta=8.0.1", + Refs: SourceRefs{beegfs.Meta: "beegfs-meta=8.0.1"}, + Repo: "repoURL", }, }, Agents: map[string]Agent{ diff --git a/agent/pkg/manifest/manifest.yaml b/agent/pkg/manifest/manifest.yaml index 574879a4..2ae190bc 100644 --- a/agent/pkg/manifest/manifest.yaml +++ b/agent/pkg/manifest/manifest.yaml @@ -1,38 +1,52 @@ beegfs01: #fsUUID common: - auth: secret - meta_config: - quotaEnableEnforcement: true - storeClientXAttrs: true - storeClientACLs: true - storage_config: - quotaEnableEnforcement: true - client_config: - quotaEnabled: true + auth: + secret: "sharedSecret" + tls: + key: | + tlsKey + cert: | + tlsCert + config: + meta: + quotaEnableEnforcement: true + storeClientXAttrs: true + storeClientACLs: true + storage: + quotaEnableEnforcement: true + client: + quotaEnabled: true source: - # type: package - type: local + type: package repo: https://www.beegfs.io/release/beegfs_8.0/ - meta: beegfs-meta=8.0.1 - storage: beegfs-storage=8.0.1 - - # source: - # type: container - # repo: ghcr.io/thinkparq - # meta: beegfs-meta:8.0.1 - # storage: beegfs-storage:8.0.1 - + refs: + mgmtd: beegfs-mgmtd=8.0.1 + meta: beegfs-meta=8.0.1 + storage: beegfs-storage=8.0.1 + client: beegfs-client=8.0.1 + remote: beegfs-remote=8.0.1 + sync: 
beegfs-sync=8.0.1 + # source: + # type: container + # repo: ghcr.io/thinkparq + # refs: + # mgmtd: beegfs-mgmtd:8.0.1 + # meta: beegfs-meta:8.0.1 + # storage: beegfs-storage:8.0.1 + # client: beegfs-client:8.0.1 + # remote: beegfs-remote:8.0.1 + # sync: beegfs-sync:8.0.1 agents: agent1: # agentID interfaces: - name: enp0s1 - address: "10.0.0.100/24" + address: "10.10.10.1/16" nodes: - type: meta id: 1 source: type: local - ref: /development/beegfs/meta/build/beegfs-meta + ref: /home/tux/development/beegfs/meta/build/beegfs-meta interfaces: - name: enp0s1 # IP configuration handled globally targets: @@ -43,41 +57,3 @@ beegfs01: #fsUUID # type: ext4 # format_flags: foo # mount_flags: baz - # agent2: # agent-id - # nodes: - # - type: storage - # id: 1 - # interfaces: - # - name: ib0 - # address: 10.0.0.102/16 - # targets: - # - id: 101 - # ulfs: - # device: /dev/sda1 - # type: ext4 - # format_flags: foo - # mount_flags: baz - # - id: 102 - # ulfs: - # device: /dev/sda1 - # type: ext4 - # format_flags: foo - # mount_flags: baz - # - type: storage - # id: 2 - # interfaces: - # - name: ib1 - # address: 10.0.0.102/16 - # targets: - # - id: 201 - # ulfs: - # device: /dev/sda2 - # type: ext4 - # format_flags: foo - # mount_flags: baz - # - id: 202 - # ulfs: - # device: /dev/sda2 - # type: ext4 - # format_flags: foo - # mount_flags: baz diff --git a/agent/pkg/manifest/node.go b/agent/pkg/manifest/node.go new file mode 100644 index 00000000..7724b336 --- /dev/null +++ b/agent/pkg/manifest/node.go @@ -0,0 +1,28 @@ +package manifest + +import ( + "fmt" + + "github.com/thinkparq/beegfs-go/common/beegfs" +) + +type Node struct { + // fsUUID is set by InheritGlobalConfig and used internally to generate globally unique names + // and identifiers in case resources for multiple file systems exist on the same machine. 
+ fsUUID string + ID beegfs.NumId `yaml:"id"` + Type beegfs.NodeType `yaml:"type"` + Config map[string]string `yaml:"config"` + Interfaces []Nic `yaml:"interfaces"` + Targets []Target `yaml:"targets"` + Source *NodeSource `yaml:"source,omitempty"` +} + +func (n Node) GetSystemdUnit() string { + return fmt.Sprintf("beegfs-%s-%s-%d.service", n.fsUUID, n.Type, n.ID) +} + +type NodeSource struct { + Type SourceType `yaml:"type"` + Ref string `yaml:"ref"` +} diff --git a/agent/pkg/manifest/target.go b/agent/pkg/manifest/target.go new file mode 100644 index 00000000..b41a7238 --- /dev/null +++ b/agent/pkg/manifest/target.go @@ -0,0 +1,89 @@ +package manifest + +import ( + "fmt" + "path" + "strings" + + "github.com/thinkparq/beegfs-go/common/beegfs" + pb "github.com/thinkparq/protobuf/go/agent" +) + +type Target struct { + // fsUUID is set by InheritGlobalConfig and used internally to generate globally unique names + // and identifiers in case resources for multiple file systems exist on the same machine. 
+ fsUUID string + nodeType beegfs.NodeType + ID beegfs.NumId `yaml:"id"` + RootDir string `yaml:"root_dir"` + ULFS *UnderlyingFS `yaml:"ulfs"` +} + +func (t Target) GetPath() string { + return path.Join(t.RootDir, t.fsUUID, fmt.Sprintf("%s_%d", t.nodeType, t.ID)) +} + +type UnderlyingFS struct { + Device string `yaml:"device"` + Type UnderlyingFSType `yaml:"type"` + FormatFlags string `yaml:"format_flags"` + MountFlags string `yaml:"mount_flags"` +} + +type UnderlyingFSType int + +const ( + UnknownUnderlyingFS UnderlyingFSType = iota + EXT4UnderlyingFS +) + +func (t UnderlyingFSType) String() string { + switch t { + case EXT4UnderlyingFS: + return "ext4" + default: + return "unknown" + } +} + +func (t *UnderlyingFSType) UnmarshalYAML(unmarshal func(any) error) error { + var s string + if err := unmarshal(&s); err != nil { + return err + } + + switch strings.ToLower(s) { + case "ext4": + *t = EXT4UnderlyingFS + default: + return fmt.Errorf("invalid underlying fs type: %s", s) + } + return nil +} + +func (t UnderlyingFSType) MarshalYAML() (any, error) { + switch t { + case EXT4UnderlyingFS: + return "ext4", nil + default: + return nil, fmt.Errorf("unknown fs type: %d", t) + } +} + +func ulfsTypeFromProto(fs pb.Target_UnderlyingFSOpts_FsType) UnderlyingFSType { + switch fs { + case pb.Target_UnderlyingFSOpts_EXT4: + return EXT4UnderlyingFS + default: + return UnknownUnderlyingFS + } +} + +func (fs UnderlyingFSType) toProto() pb.Target_UnderlyingFSOpts_FsType { + switch fs { + case EXT4UnderlyingFS: + return pb.Target_UnderlyingFSOpts_EXT4 + default: + return pb.Target_UnderlyingFSOpts_UNSPECIFIED + } +} diff --git a/agent/pkg/reconciler/state.go b/agent/pkg/reconciler/state.go index 476c7f5d..1f1970b8 100644 --- a/agent/pkg/reconciler/state.go +++ b/agent/pkg/reconciler/state.go @@ -23,6 +23,9 @@ type state struct { } func newAgentState(l *zap.Logger) state { + // Always initialize with a valid context even though it is always replaced when starting a + // 
reconciliation. Otherwise stopping the reconciler would SIGSEGV when calling cancel. + ctx, cancel := context.WithCancel(context.Background()) return state{ current: pb.Status{ State: pb.Status_IDLE, @@ -32,6 +35,8 @@ func newAgentState(l *zap.Logger) state { historical: make(map[time.Time]*pb.Status), mu: sync.Mutex{}, logger: l, + ctx: ctx, + ctxCancel: cancel, } } From e155b15840e3deedd134701a483503273602b12f Mon Sep 17 00:00:00 2001 From: Joe McCormick <31295332+iamjoemccormick@users.noreply.github.com> Date: Mon, 19 May 2025 17:38:06 +0000 Subject: [PATCH 07/13] wip: rename source to install-source --- agent/pkg/deploy/deploy.go | 2 +- agent/pkg/deploy/source.go | 44 +++++++++---------- agent/pkg/manifest/common.go | 62 +++++++++++++-------------- agent/pkg/manifest/filesystem.go | 38 ++++++++-------- agent/pkg/manifest/filesystem_test.go | 30 ++++++------- agent/pkg/manifest/manifest.yaml | 25 +++++++++-- agent/pkg/manifest/node.go | 20 ++++----- agent/pkg/reconciler/reconciler.go | 6 +-- 8 files changed, 123 insertions(+), 104 deletions(-) diff --git a/agent/pkg/deploy/deploy.go b/agent/pkg/deploy/deploy.go index 2c2d0fb1..c5282a7a 100644 --- a/agent/pkg/deploy/deploy.go +++ b/agent/pkg/deploy/deploy.go @@ -5,7 +5,7 @@ import "context" // Deployer is responsible for carrying out the steps needed to manage a BeeGFS "node" and handles // starting/modifying/stopping various system resources. 
type Deployer interface { - Sourcerer + Installer Networker Mounter Servicer diff --git a/agent/pkg/deploy/source.go b/agent/pkg/deploy/source.go index 89c93a68..0a9adbef 100644 --- a/agent/pkg/deploy/source.go +++ b/agent/pkg/deploy/source.go @@ -9,11 +9,11 @@ import ( "github.com/thinkparq/beegfs-go/agent/pkg/manifest" ) -type Sourcerer interface { - ApplySource(ctx context.Context, add manifest.Source) error - DeleteSource(ctx context.Context, remove manifest.Source) error - ApplySourceInstall(ctx context.Context, source manifest.NodeSource) error - DeleteSourceInstall(ctx context.Context, source manifest.NodeSource) error +type Installer interface { + ApplySourceRepo(ctx context.Context, add manifest.InstallSource) error + DeleteSourceRepo(ctx context.Context, remove manifest.InstallSource) error + ApplyInstall(ctx context.Context, source manifest.NodeInstallSource) error + DeleteInstall(ctx context.Context, source manifest.NodeInstallSource) error } func DetectPackageManager() (Package, error) { @@ -33,57 +33,57 @@ func DetectPackageManager() (Package, error) { // Package provides the ability to install BeeGFS using the package manager. It implements any // general functionality and defers to the actual manager based on the specific distribution. type Package struct { - manager Sourcerer + manager Installer // isLocal is set if the manifest specifies the source type is local. This indicates all package // manager operations should be a no-op for this FS in the manifest. This allows the manifest to // fully control the installation source independent of the deployment strategy for each agent. 
isLocal bool } -func (p *Package) ApplySource(ctx context.Context, add manifest.Source) error { - if add.Type == manifest.LocalSource { +func (p *Package) ApplySourceRepo(ctx context.Context, add manifest.InstallSource) error { + if add.Type == manifest.LocalInstall { p.isLocal = true return nil } - return p.manager.ApplySource(ctx, add) + return p.manager.ApplySourceRepo(ctx, add) } -func (p *Package) DeleteSource(ctx context.Context, remove manifest.Source) error { - if remove.Type == manifest.LocalSource { +func (p *Package) DeleteSourceRepo(ctx context.Context, remove manifest.InstallSource) error { + if remove.Type == manifest.LocalInstall { p.isLocal = false return nil } - return p.manager.DeleteSource(ctx, remove) + return p.manager.DeleteSourceRepo(ctx, remove) } -func (p *Package) ApplySourceInstall(ctx context.Context, source manifest.NodeSource) error { - if p.isLocal || source.Type == manifest.LocalSource { +func (p *Package) ApplyInstall(ctx context.Context, source manifest.NodeInstallSource) error { + if p.isLocal || source.Type == manifest.LocalInstall { return nil } - return p.manager.ApplySourceInstall(ctx, source) + return p.manager.ApplyInstall(ctx, source) } -func (p *Package) DeleteSourceInstall(ctx context.Context, source manifest.NodeSource) error { - if p.isLocal || source.Type == manifest.LocalSource { +func (p *Package) DeleteInstall(ctx context.Context, source manifest.NodeInstallSource) error { + if p.isLocal || source.Type == manifest.LocalInstall { return nil } - return p.manager.DeleteSourceInstall(ctx, source) + return p.manager.DeleteInstall(ctx, source) } type AptPackage struct{} -func (p *AptPackage) ApplySource(ctx context.Context, add manifest.Source) error { +func (p *AptPackage) ApplySourceRepo(ctx context.Context, add manifest.InstallSource) error { return errors.New("not implemented") } -func (p *AptPackage) DeleteSource(ctx context.Context, remove manifest.Source) error { +func (p *AptPackage) DeleteSourceRepo(ctx 
context.Context, remove manifest.InstallSource) error { return errors.New("not implemented") } -func (p *AptPackage) ApplySourceInstall(ctx context.Context, source manifest.NodeSource) error { +func (p *AptPackage) ApplyInstall(ctx context.Context, source manifest.NodeInstallSource) error { return errors.New("not implemented") } -func (p *AptPackage) DeleteSourceInstall(ctx context.Context, source manifest.NodeSource) error { +func (p *AptPackage) DeleteInstall(ctx context.Context, source manifest.NodeInstallSource) error { return errors.New("not implemented") } diff --git a/agent/pkg/manifest/common.go b/agent/pkg/manifest/common.go index fb2b8713..f85a0fce 100644 --- a/agent/pkg/manifest/common.go +++ b/agent/pkg/manifest/common.go @@ -8,10 +8,10 @@ import ( ) type Common struct { - Auth *Auth `yaml:"auth"` - TLS *TLS `yaml:"tls"` - GlobalConfig NodeConfigs `yaml:"config"` - Source Source `yaml:"source"` + Auth *Auth `yaml:"auth"` + TLS *TLS `yaml:"tls"` + GlobalConfig NodeConfigs `yaml:"config"` + InstallSource InstallSource `yaml:"install-source"` } type Auth struct { @@ -67,10 +67,10 @@ func nodeConfigsFromProto(m []*pb.NodeConfig) NodeConfigs { return nsm } -type Source struct { - Type SourceType `yaml:"type"` - Repo string `yaml:"repo"` - Refs SourceRefs `yaml:"refs"` +type InstallSource struct { + Type InstallType `yaml:"type"` + Repo string `yaml:"repo"` + Refs SourceRefs `yaml:"refs"` } type SourceRefs map[beegfs.NodeType]string @@ -117,57 +117,57 @@ func sourceRefsFromProto(r []*pb.SourceRef) SourceRefs { return srs } -type SourceType int +type InstallType int const ( - UnknownSource SourceType = iota - LocalSource - PackageSource + UnknownInstall InstallType = iota + LocalInstall + PackageInstall ) -func (s SourceType) ToProto() pb.SourceType { +func (s InstallType) ToProto() pb.InstallType { switch s { - case LocalSource: - return pb.SourceType_LOCAL - case PackageSource: - return pb.SourceType_PACKAGE + case LocalInstall: + return pb.InstallType_LOCAL 
+ case PackageInstall: + return pb.InstallType_PACKAGE default: - return pb.SourceType_UNKNOWN + return pb.InstallType_UNKNOWN } } -func sourceTypeFromProto(st pb.SourceType) SourceType { +func sourceTypeFromProto(st pb.InstallType) InstallType { switch st { - case pb.SourceType_LOCAL: - return LocalSource - case pb.SourceType_PACKAGE: - return PackageSource + case pb.InstallType_LOCAL: + return LocalInstall + case pb.InstallType_PACKAGE: + return PackageInstall default: - return UnknownSource + return UnknownInstall } } -func (s *SourceType) UnmarshalYAML(unmarshal func(any) error) error { +func (s *InstallType) UnmarshalYAML(unmarshal func(any) error) error { var str string if err := unmarshal(&str); err != nil { return err } switch str { case "local": - *s = LocalSource + *s = LocalInstall case "package": - *s = PackageSource + *s = PackageInstall default: - *s = UnknownSource + *s = UnknownInstall } return nil } -func (s SourceType) MarshalYAML() (any, error) { +func (s InstallType) MarshalYAML() (any, error) { switch s { - case LocalSource: + case LocalInstall: return "local", nil - case PackageSource: + case PackageInstall: return "package", nil default: return "unknown", nil diff --git a/agent/pkg/manifest/filesystem.go b/agent/pkg/manifest/filesystem.go index 8acf8662..3f083e1e 100644 --- a/agent/pkg/manifest/filesystem.go +++ b/agent/pkg/manifest/filesystem.go @@ -45,12 +45,12 @@ func (f *Filesystem) InheritGlobalConfig(fsUUID string) { node.Config = inheritMapDefaults(commonNodeConfig, node.Config) } // Inherit global source configuration based on the node type. 
- if node.Source == nil || node.Source.Ref == "" { - node.Source = &NodeSource{ - Type: f.Common.Source.Type, + if node.InstallSource == nil || node.InstallSource.Ref == "" { + node.InstallSource = &NodeInstallSource{ + Type: f.Common.InstallSource.Type, } - if ref, ok := f.Common.Source.Refs[node.Type]; ok { - node.Source.Ref = ref + if ref, ok := f.Common.InstallSource.Refs[node.Type]; ok { + node.InstallSource.Ref = ref } } // Inherit target configuration from the FS and node: @@ -81,10 +81,10 @@ func FromProto(protoFS *pb.Filesystem) Filesystem { return fs } - pSrc := protoFS.GetCommon().GetSource() + pSrc := protoFS.GetCommon().GetInstallSource() fs.Common = Common{ GlobalConfig: nodeConfigsFromProto(protoFS.Common.GetGlobalConfig()), - Source: Source{ + InstallSource: InstallSource{ Type: sourceTypeFromProto(pSrc.Type), Repo: pSrc.Repo, Refs: sourceRefsFromProto(pSrc.Refs), @@ -125,10 +125,10 @@ func FromProto(protoFS *pb.Filesystem) Filesystem { Targets: make([]Target, 0), } - if n.Source != nil { - node.Source = &NodeSource{ - Type: sourceTypeFromProto(n.GetSource().GetType()), - Ref: n.GetSource().GetRef(), + if n.InstallSource != nil { + node.InstallSource = &NodeInstallSource{ + Type: sourceTypeFromProto(n.GetInstallSource().GetType()), + Ref: n.GetInstallSource().GetRef(), } } @@ -166,10 +166,10 @@ func ToProto(fs *Filesystem) *pb.Filesystem { pbFS := &pb.Filesystem{ Common: &pb.Filesystem_Common{ GlobalConfig: fs.Common.GlobalConfig.toProto(), - Source: &pb.Source{ - Type: fs.Common.Source.Type.ToProto(), - Repo: fs.Common.Source.Repo, - Refs: fs.Common.Source.Refs.toProto(), + InstallSource: &pb.InstallSource{ + Type: fs.Common.InstallSource.Type.ToProto(), + Repo: fs.Common.InstallSource.Repo, + Refs: fs.Common.InstallSource.Refs.toProto(), }, }, Agent: make(map[string]*pb.Agent), @@ -208,10 +208,10 @@ func ToProto(fs *Filesystem) *pb.Filesystem { Targets: make([]*pb.Target, 0, len(node.Targets)), } - if node.Source != nil { - pbNode.Source = 
&pb.Node_Source{ - Type: node.Source.Type.ToProto(), - Ref: node.Source.Ref, + if node.InstallSource != nil { + pbNode.InstallSource = &pb.Node_InstallSource{ + Type: node.InstallSource.Type.ToProto(), + Ref: node.InstallSource.Ref, } } diff --git a/agent/pkg/manifest/filesystem_test.go b/agent/pkg/manifest/filesystem_test.go index 5448a6a9..fe30d181 100644 --- a/agent/pkg/manifest/filesystem_test.go +++ b/agent/pkg/manifest/filesystem_test.go @@ -25,8 +25,8 @@ func TestFromToProto_RoundTrip(t *testing.T) { StringMap: map[string]string{"key": "val"}, }, }, - Source: &pb.Source{ - Type: pb.SourceType_PACKAGE, + InstallSource: &pb.InstallSource{ + Type: pb.InstallType_PACKAGE, Refs: []*pb.SourceRef{ { NodeType: pbb.NodeType_META, @@ -49,8 +49,8 @@ func TestFromToProto_RoundTrip(t *testing.T) { Interfaces: []*pb.Nic{ {Name: "ib0", Addr: "10.0.0.1/16"}, }, - Source: &pb.Node_Source{ - Type: pb.SourceType_LOCAL, + InstallSource: &pb.Node_InstallSource{ + Type: pb.InstallType_LOCAL, Ref: "12345", }, Targets: []*pb.Target{ @@ -83,7 +83,7 @@ func TestInheritGlobalConfig(t *testing.T) { input Filesystem expectedNIC string // Expected NIC name in node if inherited expectedCfg map[string]string - expectedSrc NodeSource + expectedSrc NodeInstallSource }{ { name: "inherit source, NIC and meta config", @@ -93,9 +93,9 @@ func TestInheritGlobalConfig(t *testing.T) { "foo": "bar", // inherited "baz": "node-specific", // overridden }}, - Source: Source{ + InstallSource: InstallSource{ Refs: SourceRefs{beegfs.Meta: "beegfs-meta=8.0.1"}, - Type: PackageSource, + Type: PackageInstall, Repo: "repoURL", }, }, @@ -125,8 +125,8 @@ func TestInheritGlobalConfig(t *testing.T) { "foo": "bar", // inherited "baz": "node-specific", // overridden }, - expectedSrc: NodeSource{ - Type: PackageSource, + expectedSrc: NodeInstallSource{ + Type: PackageInstall, Ref: "beegfs-meta=8.0.1", }, }, @@ -139,8 +139,8 @@ func TestInheritGlobalConfig(t *testing.T) { "quota": "enabled", }, }, - Source: Source{ - 
Type: PackageSource, + InstallSource: InstallSource{ + Type: PackageInstall, Refs: SourceRefs{beegfs.Meta: "beegfs-meta=8.0.1"}, Repo: "repoURL", }, @@ -158,8 +158,8 @@ func TestInheritGlobalConfig(t *testing.T) { {Name: "eth0", Addr: "192.168.0.1/24"}, }, Config: map[string]string{"quota": "override"}, - Source: &NodeSource{ - Type: LocalSource, + InstallSource: &NodeInstallSource{ + Type: LocalInstall, Ref: "/home/tux/beegfs-meta", }, }, @@ -171,8 +171,8 @@ func TestInheritGlobalConfig(t *testing.T) { expectedCfg: map[string]string{ "quota": "override", }, - expectedSrc: NodeSource{ - Type: LocalSource, + expectedSrc: NodeInstallSource{ + Type: LocalInstall, Ref: "/home/tux/beegfs-meta", }, }, diff --git a/agent/pkg/manifest/manifest.yaml b/agent/pkg/manifest/manifest.yaml index 2ae190bc..e7aa5479 100644 --- a/agent/pkg/manifest/manifest.yaml +++ b/agent/pkg/manifest/manifest.yaml @@ -8,15 +8,20 @@ beegfs01: #fsUUID cert: | tlsCert config: + mgmtd: + beemsg-port: 10000 meta: + connMetaPort: 0 quotaEnableEnforcement: true storeClientXAttrs: true storeClientACLs: true storage: + connStoragePort: 0 quotaEnableEnforcement: true client: + connClientPort: 0 quotaEnabled: true - source: + install-source: type: package repo: https://www.beegfs.io/release/beegfs_8.0/ refs: @@ -38,13 +43,18 @@ beegfs01: #fsUUID # sync: beegfs-sync:8.0.1 agents: agent1: # agentID + address: "127.0.0.1:9010" interfaces: - name: enp0s1 - address: "10.10.10.1/16" + address: "127.0.0.1/24" nodes: + - type: mgmtd + targets: + - id: 101 + root_dir: /beegfs/ - type: meta id: 1 - source: + install-source: type: local ref: /home/tux/development/beegfs/meta/build/beegfs-meta interfaces: @@ -57,3 +67,12 @@ beegfs01: #fsUUID # type: ext4 # format_flags: foo # mount_flags: baz + agent2: # agentID + nodes: + - type: storage + id: 1 + config: + tuneNumWorkers: 28 + agent3: # agentID + nodes: + - type: client diff --git a/agent/pkg/manifest/node.go b/agent/pkg/manifest/node.go index 7724b336..1f686129 
100644 --- a/agent/pkg/manifest/node.go +++ b/agent/pkg/manifest/node.go @@ -9,20 +9,20 @@ import ( type Node struct { // fsUUID is set by InheritGlobalConfig and used internally to generate globally unique names // and identifiers in case resources for multiple file systems exist on the same machine. - fsUUID string - ID beegfs.NumId `yaml:"id"` - Type beegfs.NodeType `yaml:"type"` - Config map[string]string `yaml:"config"` - Interfaces []Nic `yaml:"interfaces"` - Targets []Target `yaml:"targets"` - Source *NodeSource `yaml:"source,omitempty"` + fsUUID string + ID beegfs.NumId `yaml:"id"` + Type beegfs.NodeType `yaml:"type"` + Config map[string]string `yaml:"config"` + Interfaces []Nic `yaml:"interfaces"` + Targets []Target `yaml:"targets"` + InstallSource *NodeInstallSource `yaml:"install-source,omitempty"` } func (n Node) GetSystemdUnit() string { return fmt.Sprintf("beegfs-%s-%s-%d.service", n.fsUUID, n.Type, n.ID) } -type NodeSource struct { - Type SourceType `yaml:"type"` - Ref string `yaml:"ref"` +type NodeInstallSource struct { + Type InstallType `yaml:"type"` + Ref string `yaml:"ref"` } diff --git a/agent/pkg/reconciler/reconciler.go b/agent/pkg/reconciler/reconciler.go index 83a17c18..aa271815 100644 --- a/agent/pkg/reconciler/reconciler.go +++ b/agent/pkg/reconciler/reconciler.go @@ -172,7 +172,7 @@ func (r *defaultReconciler) reconcile(newManifest manifest.Filesystems) { } // Don't apply any common configuration if the agent doesn't have any nodes for this file system. - if err := r.strategy.ApplySource(ctx, fs.Common.Source); err != nil { + if err := r.strategy.ApplySourceRepo(ctx, fs.Common.InstallSource); err != nil { r.state.fail(fmt.Sprintf("unable to apply source configuration for %s: %s", fsUUID, err.Error())) return } @@ -194,8 +194,8 @@ func (r *defaultReconciler) reconcile(newManifest manifest.Filesystems) { // Currently the source for the node should always be set by the user or inherited // automatically from the global configuration. 
This might change so avoid a panic. - if node.Source != nil { - if err := r.strategy.ApplySourceInstall(ctx, *node.Source); err != nil { + if node.InstallSource != nil { + if err := r.strategy.ApplyInstall(ctx, *node.InstallSource); err != nil { r.state.fail(fmt.Sprintf("unable to apply source installation for %s: %s", getFsNodeID(fsUUID, node.Type, node.ID), err.Error())) } } else { From 25cda3a286ddcc3f760955ee150bbe4c7a05e817 Mon Sep 17 00:00:00 2001 From: Joe McCormick <31295332+iamjoemccormick@users.noreply.github.com> Date: Mon, 19 May 2025 17:53:56 +0000 Subject: [PATCH 08/13] wip: rename target root_dir to path --- agent/pkg/deploy/mount.go | 2 +- agent/pkg/manifest/filesystem.go | 8 ++++---- agent/pkg/manifest/filesystem_test.go | 8 ++++---- agent/pkg/manifest/manifest.yaml | 4 ++-- agent/pkg/manifest/target.go | 4 ++-- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/agent/pkg/deploy/mount.go b/agent/pkg/deploy/mount.go index e6997aaa..1aeef6ca 100644 --- a/agent/pkg/deploy/mount.go +++ b/agent/pkg/deploy/mount.go @@ -23,7 +23,7 @@ func (m *Mount) ApplyTargets(ctx context.Context, add []manifest.Target) error { return fmt.Errorf("unable to apply target %d: formatting and/or mounting an underlying file system is not implemented yet", target.ID) } if err := os.MkdirAll(target.GetPath(), 0700); err != nil { - return fmt.Errorf("unable to apply target %d: unable to create root directory %s: %w", target.ID, target.RootDir, err) + return fmt.Errorf("unable to apply target %d: unable to create root directory %s: %w", target.ID, target.Path, err) } } return nil diff --git a/agent/pkg/manifest/filesystem.go b/agent/pkg/manifest/filesystem.go index 3f083e1e..e79b3974 100644 --- a/agent/pkg/manifest/filesystem.go +++ b/agent/pkg/manifest/filesystem.go @@ -141,8 +141,8 @@ func FromProto(protoFS *pb.Filesystem) Filesystem { for _, t := range n.GetTargets() { target := Target{ - ID: beegfs.NumId(t.GetNumId()), - RootDir: t.GetRootDir(), + ID: 
beegfs.NumId(t.GetNumId()), + Path: t.GetPath(), } if t.GetUlfs() != nil { target.ULFS = &UnderlyingFS{ @@ -223,8 +223,8 @@ func ToProto(fs *Filesystem) *pb.Filesystem { } for _, tgt := range node.Targets { pbTarget := &pb.Target{ - NumId: uint32(tgt.ID), - RootDir: tgt.RootDir, + NumId: uint32(tgt.ID), + Path: tgt.Path, } if tgt.ULFS != nil { pbTarget.Ulfs = &pb.Target_UnderlyingFSOpts{ diff --git a/agent/pkg/manifest/filesystem_test.go b/agent/pkg/manifest/filesystem_test.go index fe30d181..4d17b33b 100644 --- a/agent/pkg/manifest/filesystem_test.go +++ b/agent/pkg/manifest/filesystem_test.go @@ -55,8 +55,8 @@ func TestFromToProto_RoundTrip(t *testing.T) { }, Targets: []*pb.Target{ { - NumId: 101, - RootDir: "/mnt", + NumId: 101, + Path: "/mnt", Ulfs: &pb.Target_UnderlyingFSOpts{ Device: "/dev/sda1", Type: pb.Target_UnderlyingFSOpts_EXT4, @@ -111,8 +111,8 @@ func TestInheritGlobalConfig(t *testing.T) { Config: map[string]string{"baz": "node-specific"}, Targets: []Target{ { - ID: beegfs.NumId(1), - RootDir: "/beegfs/", + ID: beegfs.NumId(1), + Path: "/beegfs/", }, }, }, diff --git a/agent/pkg/manifest/manifest.yaml b/agent/pkg/manifest/manifest.yaml index e7aa5479..f88489f0 100644 --- a/agent/pkg/manifest/manifest.yaml +++ b/agent/pkg/manifest/manifest.yaml @@ -51,7 +51,7 @@ beegfs01: #fsUUID - type: mgmtd targets: - id: 101 - root_dir: /beegfs/ + path: /beegfs/ - type: meta id: 1 install-source: @@ -61,7 +61,7 @@ beegfs01: #fsUUID - name: enp0s1 # IP configuration handled globally targets: - id: 101 - root_dir: /beegfs/ + path: /beegfs/ # ulfs: # device: /dev/sda1 # type: ext4 diff --git a/agent/pkg/manifest/target.go b/agent/pkg/manifest/target.go index b41a7238..3003021d 100644 --- a/agent/pkg/manifest/target.go +++ b/agent/pkg/manifest/target.go @@ -15,12 +15,12 @@ type Target struct { fsUUID string nodeType beegfs.NodeType ID beegfs.NumId `yaml:"id"` - RootDir string `yaml:"root_dir"` + Path string `yaml:"path"` ULFS *UnderlyingFS `yaml:"ulfs"` } func (t 
Target) GetPath() string { - return path.Join(t.RootDir, t.fsUUID, fmt.Sprintf("%s_%d", t.nodeType, t.ID)) + return path.Join(t.Path, t.fsUUID, fmt.Sprintf("%s_%d", t.nodeType, t.ID)) } type UnderlyingFS struct { From 3b67e6eab6218fd1f901042f4db4e1c99d1eea37 Mon Sep 17 00:00:00 2001 From: Joe McCormick <31295332+iamjoemccormick@users.noreply.github.com> Date: Mon, 19 May 2025 18:28:19 +0000 Subject: [PATCH 09/13] wip: rename nodes to services --- agent/cmd/beegfs-agent/main.go | 2 +- agent/pkg/deploy/deploy.go | 4 +- agent/pkg/deploy/service.go | 10 +-- agent/pkg/deploy/source.go | 12 +-- agent/pkg/manifest/common.go | 44 +++++------ agent/pkg/manifest/filesystem.go | 106 +++++++++++++------------- agent/pkg/manifest/filesystem_test.go | 58 +++++++------- agent/pkg/manifest/manifest.yaml | 8 +- agent/pkg/manifest/node.go | 28 ------- agent/pkg/manifest/service.go | 28 +++++++ agent/pkg/reconciler/reconciler.go | 34 ++++----- agent/pkg/reconciler/state.go | 2 +- 12 files changed, 168 insertions(+), 168 deletions(-) delete mode 100644 agent/pkg/manifest/node.go create mode 100644 agent/pkg/manifest/service.go diff --git a/agent/cmd/beegfs-agent/main.go b/agent/cmd/beegfs-agent/main.go index 3810ee99..ae037584 100644 --- a/agent/cmd/beegfs-agent/main.go +++ b/agent/cmd/beegfs-agent/main.go @@ -32,7 +32,7 @@ var ( func main() { pflag.Bool("version", false, "Print the version then exit.") pflag.String("cfg-file", "/etc/beegfs/agent.toml", "The path to the a configuration file (can be omitted to set all configuration using flags and/or environment variables). When Remote Storage Targets are configured using a file, they can be updated without restarting the application.") - pflag.String("agent-id", "0", "A unique ID used to identify what nodes from the manifest this agent is responsible for. Should not change after initially starting the agent.") + pflag.String("agent-id", "0", "A unique ID used to identify what services from the manifest this agent is responsible for. 
Should not change after initially starting the agent.") pflag.String("log.type", "stderr", "Where log messages should be sent ('stderr', 'stdout', 'syslog', 'logfile').") pflag.String("log.file", "/var/log/beegfs/beegfs-remote.log", "The path to the desired log file when logType is 'log.file' (if needed the directory and all parent directories will be created).") pflag.Int8("log.level", 3, "Adjust the logging level (0=Fatal, 1=Error, 2=Warn, 3=Info, 4+5=Debug).") diff --git a/agent/pkg/deploy/deploy.go b/agent/pkg/deploy/deploy.go index c5282a7a..3cebf10c 100644 --- a/agent/pkg/deploy/deploy.go +++ b/agent/pkg/deploy/deploy.go @@ -2,8 +2,8 @@ package deploy import "context" -// Deployer is responsible for carrying out the steps needed to manage a BeeGFS "node" and handles -// starting/modifying/stopping various system resources. +// Deployer is responsible for carrying out the steps needed to manage a BeeGFS "service" and +// handles starting/modifying/stopping various system resources. type Deployer interface { Installer Networker diff --git a/agent/pkg/deploy/service.go b/agent/pkg/deploy/service.go index 5801cd95..c4390d84 100644 --- a/agent/pkg/deploy/service.go +++ b/agent/pkg/deploy/service.go @@ -10,8 +10,8 @@ import ( ) type Servicer interface { - ApplyService(ctx context.Context, add manifest.Node) error - DestroyService(ctx context.Context, remove manifest.Node) error + ApplyService(ctx context.Context, add manifest.Service) error + DestroyService(ctx context.Context, remove manifest.Service) error } func NewSystemd(ctx context.Context) (Systemd, error) { @@ -25,7 +25,7 @@ func NewSystemd(ctx context.Context) (Systemd, error) { } -// Systemd provides a method to deploy BeeGFS nodes using systemd. +// Systemd provides a method to deploy BeeGFS services using systemd. 
type Systemd struct { conn *dbus.Conn } @@ -35,10 +35,10 @@ func (d *Systemd) Cleanup() error { return nil } -func (d *Systemd) ApplyService(ctx context.Context, add manifest.Node) error { +func (d *Systemd) ApplyService(ctx context.Context, add manifest.Service) error { return errors.New("not implemented") } -func (d *Systemd) DestroyService(ctx context.Context, remove manifest.Node) error { +func (d *Systemd) DestroyService(ctx context.Context, remove manifest.Service) error { return errors.New("not implemented") } diff --git a/agent/pkg/deploy/source.go b/agent/pkg/deploy/source.go index 0a9adbef..0e0f69ff 100644 --- a/agent/pkg/deploy/source.go +++ b/agent/pkg/deploy/source.go @@ -12,8 +12,8 @@ import ( type Installer interface { ApplySourceRepo(ctx context.Context, add manifest.InstallSource) error DeleteSourceRepo(ctx context.Context, remove manifest.InstallSource) error - ApplyInstall(ctx context.Context, source manifest.NodeInstallSource) error - DeleteInstall(ctx context.Context, source manifest.NodeInstallSource) error + ApplyInstall(ctx context.Context, source manifest.ServiceInstallSource) error + DeleteInstall(ctx context.Context, source manifest.ServiceInstallSource) error } func DetectPackageManager() (Package, error) { @@ -56,14 +56,14 @@ func (p *Package) DeleteSourceRepo(ctx context.Context, remove manifest.InstallS return p.manager.DeleteSourceRepo(ctx, remove) } -func (p *Package) ApplyInstall(ctx context.Context, source manifest.NodeInstallSource) error { +func (p *Package) ApplyInstall(ctx context.Context, source manifest.ServiceInstallSource) error { if p.isLocal || source.Type == manifest.LocalInstall { return nil } return p.manager.ApplyInstall(ctx, source) } -func (p *Package) DeleteInstall(ctx context.Context, source manifest.NodeInstallSource) error { +func (p *Package) DeleteInstall(ctx context.Context, source manifest.ServiceInstallSource) error { if p.isLocal || source.Type == manifest.LocalInstall { return nil } @@ -80,10 +80,10 @@ 
func (p *AptPackage) DeleteSourceRepo(ctx context.Context, remove manifest.Insta return errors.New("not implemented") } -func (p *AptPackage) ApplyInstall(ctx context.Context, source manifest.NodeInstallSource) error { +func (p *AptPackage) ApplyInstall(ctx context.Context, source manifest.ServiceInstallSource) error { return errors.New("not implemented") } -func (p *AptPackage) DeleteInstall(ctx context.Context, source manifest.NodeInstallSource) error { +func (p *AptPackage) DeleteInstall(ctx context.Context, source manifest.ServiceInstallSource) error { return errors.New("not implemented") } diff --git a/agent/pkg/manifest/common.go b/agent/pkg/manifest/common.go index f85a0fce..3665032f 100644 --- a/agent/pkg/manifest/common.go +++ b/agent/pkg/manifest/common.go @@ -8,10 +8,10 @@ import ( ) type Common struct { - Auth *Auth `yaml:"auth"` - TLS *TLS `yaml:"tls"` - GlobalConfig NodeConfigs `yaml:"config"` - InstallSource InstallSource `yaml:"install-source"` + Auth *Auth `yaml:"auth"` + TLS *TLS `yaml:"tls"` + GlobalConfig ServiceConfigs `yaml:"config"` + InstallSource InstallSource `yaml:"install-source"` } type Auth struct { @@ -23,9 +23,9 @@ type TLS struct { Cert string `yaml:"cert"` } -type NodeConfigs map[beegfs.NodeType]map[string]string +type ServiceConfigs map[beegfs.NodeType]map[string]string -func (s *NodeConfigs) UnmarshalYAML(unmarshal func(any) error) error { +func (s *ServiceConfigs) UnmarshalYAML(unmarshal func(any) error) error { // We cannot directly apply validation to map[beegfs.NodeType]... during unmarshal because the // YAML input uses string keys and as a result things blow up (spectacularly). 
intermediate := map[string]map[string]string{} @@ -33,7 +33,7 @@ func (s *NodeConfigs) UnmarshalYAML(unmarshal func(any) error) error { return err } - result := make(NodeConfigs, len(intermediate)) + result := make(ServiceConfigs, len(intermediate)) for key, val := range intermediate { nodeType := beegfs.NodeTypeFromString(key) if nodeType == beegfs.InvalidNodeType { @@ -46,22 +46,22 @@ func (s *NodeConfigs) UnmarshalYAML(unmarshal func(any) error) error { return nil } -func (c NodeConfigs) toProto() []*pb.NodeConfig { - pbNodeConfigs := make([]*pb.NodeConfig, 0, len(c)) - for nodeType, nodeMap := range c { - pbNodeConfigs = append(pbNodeConfigs, &pb.NodeConfig{ - NodeType: *nodeType.ToProto(), - StringMap: nodeMap, +func (c ServiceConfigs) toProto() []*pb.ServiceConfig { + pbServiceConfigs := make([]*pb.ServiceConfig, 0, len(c)) + for nodeType, serviceMap := range c { + pbServiceConfigs = append(pbServiceConfigs, &pb.ServiceConfig{ + ServiceType: *nodeType.ToProto(), + StringMap: serviceMap, }) } - return pbNodeConfigs + return pbServiceConfigs } -func nodeConfigsFromProto(m []*pb.NodeConfig) NodeConfigs { - nsm := make(NodeConfigs, len(m)) - for _, node := range m { - if node != nil && node.GetStringMap() != nil { - nsm[beegfs.NodeTypeFromProto(node.NodeType)] = node.GetStringMap() +func serviceConfigsFromProto(m []*pb.ServiceConfig) ServiceConfigs { + nsm := make(ServiceConfigs, len(m)) + for _, service := range m { + if service != nil && service.GetStringMap() != nil { + nsm[beegfs.NodeTypeFromProto(service.ServiceType)] = service.GetStringMap() } } return nsm @@ -100,8 +100,8 @@ func (c SourceRefs) toProto() []*pb.SourceRef { pbSourceRefs := make([]*pb.SourceRef, 0, len(c)) for nodeType, ref := range c { pbSourceRefs = append(pbSourceRefs, &pb.SourceRef{ - NodeType: *nodeType.ToProto(), - Ref: ref, + ServiceType: *nodeType.ToProto(), + Ref: ref, }) } return pbSourceRefs @@ -111,7 +111,7 @@ func sourceRefsFromProto(r []*pb.SourceRef) SourceRefs { srs := 
make(SourceRefs, len(r)) for _, ref := range r { if ref != nil { - srs[beegfs.NodeTypeFromProto(ref.NodeType)] = ref.GetRef() + srs[beegfs.NodeTypeFromProto(ref.ServiceType)] = ref.GetRef() } } return srs diff --git a/agent/pkg/manifest/filesystem.go b/agent/pkg/manifest/filesystem.go index e79b3974..e08b3491 100644 --- a/agent/pkg/manifest/filesystem.go +++ b/agent/pkg/manifest/filesystem.go @@ -21,8 +21,8 @@ type Filesystem struct { } type Agent struct { - Nodes []Node `yaml:"nodes"` - // Global agent interfaces potentially reused by multiple nodes. + Services []Service `yaml:"services"` + // Global agent interfaces potentially reused by multiple services. Interfaces []Nic `yaml:"interfaces"` } @@ -33,30 +33,30 @@ type Nic struct { func (f *Filesystem) InheritGlobalConfig(fsUUID string) { for agentID, agent := range f.Agents { - for i := range agent.Nodes { - node := &agent.Nodes[i] - node.fsUUID = fsUUID - // Inherit global interface configuration if there are no node specific interfaces. - if len(node.Interfaces) == 0 { - node.Interfaces = agent.Interfaces + for i := range agent.Services { + service := &agent.Services[i] + service.fsUUID = fsUUID + // Inherit global interface configuration if there are no service specific interfaces. + if len(service.Interfaces) == 0 { + service.Interfaces = agent.Interfaces } - // Inherit global node configuration based on the node type. - if commonNodeConfig, ok := f.Common.GlobalConfig[agent.Nodes[i].Type]; ok { - node.Config = inheritMapDefaults(commonNodeConfig, node.Config) + // Inherit global service configuration based on the service type. + if commonServiceConfig, ok := f.Common.GlobalConfig[agent.Services[i].Type]; ok { + service.Config = inheritMapDefaults(commonServiceConfig, service.Config) } - // Inherit global source configuration based on the node type. 
- if node.InstallSource == nil || node.InstallSource.Ref == "" { - node.InstallSource = &NodeInstallSource{ + // Inherit global source configuration based on the service type. + if service.InstallSource == nil || service.InstallSource.Ref == "" { + service.InstallSource = &ServiceInstallSource{ Type: f.Common.InstallSource.Type, } - if ref, ok := f.Common.InstallSource.Refs[node.Type]; ok { - node.InstallSource.Ref = ref + if ref, ok := f.Common.InstallSource.Refs[service.Type]; ok { + service.InstallSource.Ref = ref } } - // Inherit target configuration from the FS and node: - for t := range node.Targets { - agent.Nodes[i].Targets[t].fsUUID = fsUUID - agent.Nodes[i].Targets[t].nodeType = node.Type + // Inherit target configuration from the FS and service: + for t := range service.Targets { + agent.Services[i].Targets[t].fsUUID = fsUUID + agent.Services[i].Targets[t].nodeType = service.Type } } f.Agents[agentID] = agent @@ -83,7 +83,7 @@ func FromProto(protoFS *pb.Filesystem) Filesystem { pSrc := protoFS.GetCommon().GetInstallSource() fs.Common = Common{ - GlobalConfig: nodeConfigsFromProto(protoFS.Common.GetGlobalConfig()), + GlobalConfig: serviceConfigsFromProto(protoFS.Common.GetGlobalConfig()), InstallSource: InstallSource{ Type: sourceTypeFromProto(pSrc.Type), Repo: pSrc.Repo, @@ -107,7 +107,7 @@ func FromProto(protoFS *pb.Filesystem) Filesystem { fs.Agents = make(map[string]Agent, len(protoFS.GetAgent())) for id, a := range protoFS.GetAgent() { agent := Agent{ - Nodes: make([]Node, 0), + Services: make([]Service, 0), Interfaces: make([]Nic, 0), } for _, i := range a.GetInterfaces() { @@ -116,30 +116,30 @@ func FromProto(protoFS *pb.Filesystem) Filesystem { Addr: i.Addr, }) } - for _, n := range a.GetNodes() { - node := Node{ - ID: beegfs.NumId(n.GetNumId()), - Type: beegfs.NodeTypeFromProto(n.NodeType), - Config: n.GetConfig(), + for _, s := range a.GetServices() { + service := Service{ + ID: beegfs.NumId(s.GetNumId()), + Type: 
beegfs.NodeTypeFromProto(s.ServiceType), + Config: s.GetConfig(), Interfaces: make([]Nic, 0), Targets: make([]Target, 0), } - if n.InstallSource != nil { - node.InstallSource = &NodeInstallSource{ - Type: sourceTypeFromProto(n.GetInstallSource().GetType()), - Ref: n.GetInstallSource().GetRef(), + if s.InstallSource != nil { + service.InstallSource = &ServiceInstallSource{ + Type: sourceTypeFromProto(s.GetInstallSource().GetType()), + Ref: s.GetInstallSource().GetRef(), } } - for _, i := range n.GetInterfaces() { - node.Interfaces = append(node.Interfaces, Nic{ + for _, i := range s.GetInterfaces() { + service.Interfaces = append(service.Interfaces, Nic{ Name: i.Name, Addr: i.Addr, }) } - for _, t := range n.GetTargets() { + for _, t := range s.GetTargets() { target := Target{ ID: beegfs.NumId(t.GetNumId()), Path: t.GetPath(), @@ -153,9 +153,9 @@ func FromProto(protoFS *pb.Filesystem) Filesystem { } } - node.Targets = append(node.Targets, target) + service.Targets = append(service.Targets, target) } - agent.Nodes = append(agent.Nodes, node) + agent.Services = append(agent.Services, service) } fs.Agents[id] = agent } @@ -190,7 +190,7 @@ func ToProto(fs *Filesystem) *pb.Filesystem { for agentID, agent := range fs.Agents { pbAgent := &pb.Agent{ - Nodes: make([]*pb.Node, 0, len(agent.Nodes)), + Services: make([]*pb.Service, 0, len(agent.Services)), Interfaces: make([]*pb.Nic, 0, len(agent.Interfaces)), } for _, i := range agent.Interfaces { @@ -199,29 +199,29 @@ func ToProto(fs *Filesystem) *pb.Filesystem { Addr: i.Addr, }) } - for _, node := range agent.Nodes { - pbNode := &pb.Node{ - NumId: uint32(node.ID), - NodeType: *node.Type.ToProto(), - Config: node.Config, - Interfaces: make([]*pb.Nic, 0, len(node.Interfaces)), - Targets: make([]*pb.Target, 0, len(node.Targets)), + for _, service := range agent.Services { + pbService := &pb.Service{ + NumId: uint32(service.ID), + ServiceType: *service.Type.ToProto(), + Config: service.Config, + Interfaces: make([]*pb.Nic, 0, 
len(service.Interfaces)), + Targets: make([]*pb.Target, 0, len(service.Targets)), } - if node.InstallSource != nil { - pbNode.InstallSource = &pb.Node_InstallSource{ - Type: node.InstallSource.Type.ToProto(), - Ref: node.InstallSource.Ref, + if service.InstallSource != nil { + pbService.InstallSource = &pb.Service_InstallSource{ + Type: service.InstallSource.Type.ToProto(), + Ref: service.InstallSource.Ref, } } - for _, nic := range node.Interfaces { - pbNode.Interfaces = append(pbNode.Interfaces, &pb.Nic{ + for _, nic := range service.Interfaces { + pbService.Interfaces = append(pbService.Interfaces, &pb.Nic{ Name: nic.Name, Addr: nic.Addr, }) } - for _, tgt := range node.Targets { + for _, tgt := range service.Targets { pbTarget := &pb.Target{ NumId: uint32(tgt.ID), Path: tgt.Path, @@ -234,9 +234,9 @@ func ToProto(fs *Filesystem) *pb.Filesystem { MountFlags: tgt.ULFS.MountFlags, } } - pbNode.Targets = append(pbNode.Targets, pbTarget) + pbService.Targets = append(pbService.Targets, pbTarget) } - pbAgent.Nodes = append(pbAgent.Nodes, pbNode) + pbAgent.Services = append(pbAgent.Services, pbService) } pbFS.Agent[agentID] = pbAgent } diff --git a/agent/pkg/manifest/filesystem_test.go b/agent/pkg/manifest/filesystem_test.go index 4d17b33b..22e2c91c 100644 --- a/agent/pkg/manifest/filesystem_test.go +++ b/agent/pkg/manifest/filesystem_test.go @@ -19,18 +19,18 @@ func TestFromToProto_RoundTrip(t *testing.T) { Key: "tlsKey", Cert: "tlsCert", }, - GlobalConfig: []*pb.NodeConfig{ + GlobalConfig: []*pb.ServiceConfig{ { - NodeType: pbb.NodeType_META, - StringMap: map[string]string{"key": "val"}, + ServiceType: pbb.NodeType_META, + StringMap: map[string]string{"key": "val"}, }, }, InstallSource: &pb.InstallSource{ Type: pb.InstallType_PACKAGE, Refs: []*pb.SourceRef{ { - NodeType: pbb.NodeType_META, - Ref: "ref", + ServiceType: pbb.NodeType_META, + Ref: "ref", }, }, }, @@ -41,15 +41,15 @@ func TestFromToProto_RoundTrip(t *testing.T) { Interfaces: []*pb.Nic{ {Name: "eth0", Addr: 
"11.0.0.1/16"}, }, - Nodes: []*pb.Node{ + Services: []*pb.Service{ { - NumId: 1, - NodeType: pbb.NodeType_META, - Config: map[string]string{"nkey": "nval"}, + NumId: 1, + ServiceType: pbb.NodeType_META, + Config: map[string]string{"nkey": "nval"}, Interfaces: []*pb.Nic{ {Name: "ib0", Addr: "10.0.0.1/16"}, }, - InstallSource: &pb.Node_InstallSource{ + InstallSource: &pb.Service_InstallSource{ Type: pb.InstallType_LOCAL, Ref: "12345", }, @@ -81,17 +81,17 @@ func TestInheritGlobalConfig(t *testing.T) { tests := []struct { name string input Filesystem - expectedNIC string // Expected NIC name in node if inherited + expectedNIC string // Expected NIC name in service if inherited expectedCfg map[string]string - expectedSrc NodeInstallSource + expectedSrc ServiceInstallSource }{ { name: "inherit source, NIC and meta config", input: Filesystem{ Common: Common{ - GlobalConfig: NodeConfigs{beegfs.Meta: map[string]string{ - "foo": "bar", // inherited - "baz": "node-specific", // overridden + GlobalConfig: ServiceConfigs{beegfs.Meta: map[string]string{ + "foo": "bar", // inherited + "baz": "service-specific", // overridden }}, InstallSource: InstallSource{ Refs: SourceRefs{beegfs.Meta: "beegfs-meta=8.0.1"}, @@ -104,11 +104,11 @@ func TestInheritGlobalConfig(t *testing.T) { Interfaces: []Nic{ {Name: "ib0", Addr: "10.0.0.1/16"}, }, - Nodes: []Node{ + Services: []Service{ { Type: beegfs.Meta, ID: 1, - Config: map[string]string{"baz": "node-specific"}, + Config: map[string]string{"baz": "service-specific"}, Targets: []Target{ { ID: beegfs.NumId(1), @@ -122,10 +122,10 @@ func TestInheritGlobalConfig(t *testing.T) { }, expectedNIC: "ib0", expectedCfg: map[string]string{ - "foo": "bar", // inherited - "baz": "node-specific", // overridden + "foo": "bar", // inherited + "baz": "service-specific", // overridden }, - expectedSrc: NodeInstallSource{ + expectedSrc: ServiceInstallSource{ Type: PackageInstall, Ref: "beegfs-meta=8.0.1", }, @@ -134,7 +134,7 @@ func TestInheritGlobalConfig(t 
*testing.T) { name: "no inheritance if NICs or source are present", input: Filesystem{ Common: Common{ - GlobalConfig: NodeConfigs{ + GlobalConfig: ServiceConfigs{ beegfs.Meta: map[string]string{ "quota": "enabled", }, @@ -150,7 +150,7 @@ func TestInheritGlobalConfig(t *testing.T) { Interfaces: []Nic{ {Name: "ib0", Addr: "10.0.0.1/16"}, }, - Nodes: []Node{ + Services: []Service{ { Type: beegfs.Meta, ID: 2, @@ -158,7 +158,7 @@ func TestInheritGlobalConfig(t *testing.T) { {Name: "eth0", Addr: "192.168.0.1/24"}, }, Config: map[string]string{"quota": "override"}, - InstallSource: &NodeInstallSource{ + InstallSource: &ServiceInstallSource{ Type: LocalInstall, Ref: "/home/tux/beegfs-meta", }, @@ -171,7 +171,7 @@ func TestInheritGlobalConfig(t *testing.T) { expectedCfg: map[string]string{ "quota": "override", }, - expectedSrc: NodeInstallSource{ + expectedSrc: ServiceInstallSource{ Type: LocalInstall, Ref: "/home/tux/beegfs-meta", }, @@ -183,11 +183,11 @@ func TestInheritGlobalConfig(t *testing.T) { fs := tt.input fs.InheritGlobalConfig("testFS") agent := fs.Agents["agent1"] - node := agent.Nodes[0] - assert.Equal(t, tt.expectedNIC, node.Interfaces[0].Name) - assert.Equal(t, tt.expectedCfg, node.Config) - assert.Equal(t, "testFS", node.fsUUID) - for _, target := range node.Targets { + service := agent.Services[0] + assert.Equal(t, tt.expectedNIC, service.Interfaces[0].Name) + assert.Equal(t, tt.expectedCfg, service.Config) + assert.Equal(t, "testFS", service.fsUUID) + for _, target := range service.Targets { assert.Equal(t, "/beegfs/testFS/meta_1", target.GetPath(), "generated target path did not match") } diff --git a/agent/pkg/manifest/manifest.yaml b/agent/pkg/manifest/manifest.yaml index f88489f0..180ec0c7 100644 --- a/agent/pkg/manifest/manifest.yaml +++ b/agent/pkg/manifest/manifest.yaml @@ -1,4 +1,4 @@ -beegfs01: #fsUUID +3b6f972b-64c7-4378-9f8e-172cf88c7d93: #fsUUID common: auth: secret: "sharedSecret" @@ -47,7 +47,7 @@ beegfs01: #fsUUID interfaces: - name: enp0s1 
address: "127.0.0.1/24" - nodes: + services: - type: mgmtd targets: - id: 101 @@ -68,11 +68,11 @@ beegfs01: #fsUUID # format_flags: foo # mount_flags: baz agent2: # agentID - nodes: + services: - type: storage id: 1 config: tuneNumWorkers: 28 agent3: # agentID - nodes: + services: - type: client diff --git a/agent/pkg/manifest/node.go b/agent/pkg/manifest/node.go deleted file mode 100644 index 1f686129..00000000 --- a/agent/pkg/manifest/node.go +++ /dev/null @@ -1,28 +0,0 @@ -package manifest - -import ( - "fmt" - - "github.com/thinkparq/beegfs-go/common/beegfs" -) - -type Node struct { - // fsUUID is set by InheritGlobalConfig and used internally to generate globally unique names - // and identifiers in case resources for multiple file systems exist on the same machine. - fsUUID string - ID beegfs.NumId `yaml:"id"` - Type beegfs.NodeType `yaml:"type"` - Config map[string]string `yaml:"config"` - Interfaces []Nic `yaml:"interfaces"` - Targets []Target `yaml:"targets"` - InstallSource *NodeInstallSource `yaml:"install-source,omitempty"` -} - -func (n Node) GetSystemdUnit() string { - return fmt.Sprintf("beegfs-%s-%s-%d.service", n.fsUUID, n.Type, n.ID) -} - -type NodeInstallSource struct { - Type InstallType `yaml:"type"` - Ref string `yaml:"ref"` -} diff --git a/agent/pkg/manifest/service.go b/agent/pkg/manifest/service.go new file mode 100644 index 00000000..accc6537 --- /dev/null +++ b/agent/pkg/manifest/service.go @@ -0,0 +1,28 @@ +package manifest + +import ( + "fmt" + + "github.com/thinkparq/beegfs-go/common/beegfs" +) + +type Service struct { + // fsUUID is set by InheritGlobalConfig and used internally to generate globally unique names + // and identifiers in case resources for multiple file systems exist on the same machine. 
+ fsUUID string + ID beegfs.NumId `yaml:"id"` + Type beegfs.NodeType `yaml:"type"` + Config map[string]string `yaml:"config"` + Interfaces []Nic `yaml:"interfaces"` + Targets []Target `yaml:"targets"` + InstallSource *ServiceInstallSource `yaml:"install-source,omitempty"` +} + +func (s Service) GetSystemdUnit() string { + return fmt.Sprintf("beegfs-%s-%s-%d.service", s.fsUUID, s.Type, s.ID) +} + +type ServiceInstallSource struct { + Type InstallType `yaml:"type"` + Ref string `yaml:"ref"` +} diff --git a/agent/pkg/reconciler/reconciler.go b/agent/pkg/reconciler/reconciler.go index aa271815..3d5f8a8f 100644 --- a/agent/pkg/reconciler/reconciler.go +++ b/agent/pkg/reconciler/reconciler.go @@ -146,8 +146,8 @@ func (r *defaultReconciler) verify(newManifest manifest.Filesystems) error { // * Avoid necessary reconciliations by seeing if the manifest changed. // * Validate we can migrate from currentFS to newFS. // * Validate the FS config: - // * All nodes have IPs + targets. - // * Nodes have the correct number of targets (i.e., 1 for mgmtd meta, remote, sync). + // * All services have IPs + targets. + // * Services have the correct number of targets (i.e., 1 for mgmtd meta, remote, sync). // Note these should be implemented as methods on manifest.Filesystem. fs.InheritGlobalConfig(fsUUID) } @@ -166,12 +166,12 @@ func (r *defaultReconciler) reconcile(newManifest manifest.Filesystems) { agent, ok := fs.Agents[r.agentID] if !ok { // Not all file systems in this manifest may have configuration for this agent. It is - // also valid that this manifest has no nodes managed by this agent. - r.log.Debug("file system has no nodes assigned to this agent", zap.String("fsUUID", fsUUID)) + // also valid that this manifest has no services managed by this agent. + r.log.Debug("file system has no services assigned to this agent", zap.String("fsUUID", fsUUID)) continue } - // Don't apply any common configuration if the agent doesn't have any nodes for this file system. 
+ // Don't apply any common configuration if the agent doesn't have any services for this file system. if err := r.strategy.ApplySourceRepo(ctx, fs.Common.InstallSource); err != nil { r.state.fail(fmt.Sprintf("unable to apply source configuration for %s: %s", fsUUID, err.Error())) return @@ -182,28 +182,28 @@ func (r *defaultReconciler) reconcile(newManifest manifest.Filesystems) { return } - for _, node := range agent.Nodes { - if err := r.strategy.ApplyInterfaces(ctx, node.Interfaces); err != nil { - r.state.fail(fmt.Sprintf("unable to apply interface configuration for %s: %s", getFsNodeID(fsUUID, node.Type, node.ID), err.Error())) + for _, service := range agent.Services { + if err := r.strategy.ApplyInterfaces(ctx, service.Interfaces); err != nil { + r.state.fail(fmt.Sprintf("unable to apply interface configuration for %s: %s", getFsServiceID(fsUUID, service.Type, service.ID), err.Error())) return } - if err := r.strategy.ApplyTargets(ctx, node.Targets); err != nil { - r.state.fail(fmt.Sprintf("unable to apply target configuration for %s: %s", getFsNodeID(fsUUID, node.Type, node.ID), err.Error())) + if err := r.strategy.ApplyTargets(ctx, service.Targets); err != nil { + r.state.fail(fmt.Sprintf("unable to apply target configuration for %s: %s", getFsServiceID(fsUUID, service.Type, service.ID), err.Error())) return } - // Currently the source for the node should always be set by the user or inherited + // Currently the source for the services should always be set by the user or inherited // automatically from the global configuration. This might change so avoid a panic. 
- if node.InstallSource != nil { - if err := r.strategy.ApplyInstall(ctx, *node.InstallSource); err != nil { - r.state.fail(fmt.Sprintf("unable to apply source installation for %s: %s", getFsNodeID(fsUUID, node.Type, node.ID), err.Error())) + if service.InstallSource != nil { + if err := r.strategy.ApplyInstall(ctx, *service.InstallSource); err != nil { + r.state.fail(fmt.Sprintf("unable to apply source installation for %s: %s", getFsServiceID(fsUUID, service.Type, service.ID), err.Error())) } } else { - r.log.Warn("node source was unexpectedly nil (ignoring)", zap.String("fsUUID", fsUUID), zap.String("nodeType", node.Type.String()), zap.Any("nodeID", node.ID)) + r.log.Warn("service install source was unexpectedly nil (ignoring)", zap.String("fsUUID", fsUUID), zap.String("type", service.Type.String()), zap.Any("id", service.ID)) } - if err := r.strategy.ApplyService(ctx, node); err != nil { - r.state.fail(fmt.Sprintf("unable to apply service configuration for %s: %s", getFsNodeID(fsUUID, node.Type, node.ID), err.Error())) + if err := r.strategy.ApplyService(ctx, service); err != nil { + r.state.fail(fmt.Sprintf("unable to apply service configuration for %s: %s", getFsServiceID(fsUUID, service.Type, service.ID), err.Error())) return } } diff --git a/agent/pkg/reconciler/state.go b/agent/pkg/reconciler/state.go index 1f1970b8..a99a2413 100644 --- a/agent/pkg/reconciler/state.go +++ b/agent/pkg/reconciler/state.go @@ -40,7 +40,7 @@ func newAgentState(l *zap.Logger) state { } } -func getFsNodeID(fsUUID string, nt beegfs.NodeType, id beegfs.NumId) string { +func getFsServiceID(fsUUID string, nt beegfs.NodeType, id beegfs.NumId) string { return fmt.Sprintf("%s:%s:%d", fsUUID, nt, id) } From d569e9b1270b4d1599ea3ff56cd3d561eb6cf850 Mon Sep 17 00:00:00 2001 From: Joe McCormick <31295332+iamjoemccormick@users.noreply.github.com> Date: Mon, 19 May 2025 19:12:14 +0000 Subject: [PATCH 10/13] wip: add generated metadata to manifests --- agent/internal/server/server.go | 16 +-- 
agent/pkg/manifest/filesystem.go | 19 ++-- agent/pkg/manifest/manifest.go | 19 ++++ agent/pkg/manifest/manifest.yaml | 157 +++++++++++++++-------------- agent/pkg/reconciler/reconciler.go | 52 +++++----- 5 files changed, 144 insertions(+), 119 deletions(-) create mode 100644 agent/pkg/manifest/manifest.go diff --git a/agent/internal/server/server.go b/agent/internal/server/server.go index 18f6f6ba..e1ca73c7 100644 --- a/agent/internal/server/server.go +++ b/agent/internal/server/server.go @@ -81,7 +81,7 @@ func (s *AgentServer) Stop() { s.wg.Wait() } -func (s *AgentServer) Update(ctx context.Context, request *pb.UpdateRequest) (*pb.UpdateResponse, error) { +func (s *AgentServer) UpdateManifest(ctx context.Context, request *pb.UpdateManifestRequest) (*pb.UpdateManifestResponse, error) { s.wg.Add(1) defer s.wg.Done() @@ -93,34 +93,36 @@ func (s *AgentServer) Update(ctx context.Context, request *pb.UpdateRequest) (*p filesystems[fsUUID] = manifest.FromProto(protoFS) } - if err := s.reconciler.UpdateConfiguration(filesystems); err != nil { + if err := s.reconciler.UpdateConfiguration(manifest.Manifest{ + Filesystems: filesystems, + }); err != nil { return nil, grpcStatusFrom(err) } - return &pb.UpdateResponse{ + return &pb.UpdateManifestResponse{ AgentId: s.reconciler.GetAgentID(), }, nil } -func (s *AgentServer) Status(ctx context.Context, request *pb.StatusRequest) (*pb.StatusResponse, error) { +func (s *AgentServer) ReconciliationStatus(ctx context.Context, request *pb.ReconciliationStatusRequest) (*pb.ReconciliationStatusResponse, error) { s.wg.Add(1) defer s.wg.Done() if result, err := s.reconciler.Status(); err != nil { return nil, grpcStatusFrom(err) } else { - return &pb.StatusResponse{ + return &pb.ReconciliationStatusResponse{ Status: result.Status, AgentId: s.reconciler.GetAgentID(), }, nil } } -func (s *AgentServer) Cancel(ctx context.Context, request *pb.CancelRequest) (*pb.CancelResponse, error) { +func (s *AgentServer) CancelReconciliation(ctx 
context.Context, request *pb.CancelReconciliationRequest) (*pb.CancelReconciliationResponse, error) { s.wg.Add(1) defer s.wg.Done() if result, err := s.reconciler.Cancel(request.GetReason()); err != nil { return nil, grpcStatusFrom(err) } else { - return &pb.CancelResponse{ + return &pb.CancelReconciliationResponse{ Status: result.Status, AgentId: s.reconciler.GetAgentID(), }, nil diff --git a/agent/pkg/manifest/filesystem.go b/agent/pkg/manifest/filesystem.go index e08b3491..09aed9a8 100644 --- a/agent/pkg/manifest/filesystem.go +++ b/agent/pkg/manifest/filesystem.go @@ -12,9 +12,6 @@ import ( "gopkg.in/yaml.v3" ) -// Filesystems is a map of FsUUIDs to file systems. -type Filesystems map[string]Filesystem - type Filesystem struct { Agents map[string]Agent `yaml:"agents"` Common Common `yaml:"common"` @@ -243,20 +240,20 @@ func ToProto(fs *Filesystem) *pb.Filesystem { return pbFS } -func FromDisk(path string) (Filesystems, error) { +func FromDisk(path string) (Manifest, error) { data, err := os.ReadFile(path) if err != nil { - return nil, err + return Manifest{}, err } - var filesystems Filesystems - if err := yaml.Unmarshal(data, &filesystems); err != nil { - return nil, err + var manifest Manifest + if err := yaml.Unmarshal(data, &manifest); err != nil { + return Manifest{}, err } - return filesystems, nil + return manifest, nil } -func ToDisk(filesystems Filesystems, path string) error { - data, err := yaml.Marshal(&filesystems) +func ToDisk(manifest Manifest, path string) error { + data, err := yaml.Marshal(&manifest) if err != nil { return err } diff --git a/agent/pkg/manifest/manifest.go b/agent/pkg/manifest/manifest.go new file mode 100644 index 00000000..e259734e --- /dev/null +++ b/agent/pkg/manifest/manifest.go @@ -0,0 +1,19 @@ +package manifest + +import "time" + +// Manifest includes both user-defined file systems and system generated metadata. 
It is intended to +// help future proof the manifest definition by encapsulating user-defined filesystems so we can add +// system generated or other field as needed in a backwards compatible manner (e.g., versioning). +type Manifest struct { + Metadata Metadata `yaml:"metadata"` + Filesystems Filesystems `yaml:"filesystems"` +} + +// Metadata contains auto-generated fields that are appended to the active manifest. +type Metadata struct { + Updated time.Time `yaml:"updated"` +} + +// Filesystems is a map of FsUUIDs to file systems. +type Filesystems map[string]Filesystem diff --git a/agent/pkg/manifest/manifest.yaml b/agent/pkg/manifest/manifest.yaml index 180ec0c7..39f3f14c 100644 --- a/agent/pkg/manifest/manifest.yaml +++ b/agent/pkg/manifest/manifest.yaml @@ -1,78 +1,79 @@ -3b6f972b-64c7-4378-9f8e-172cf88c7d93: #fsUUID - common: - auth: - secret: "sharedSecret" - tls: - key: | - tlsKey - cert: | - tlsCert - config: - mgmtd: - beemsg-port: 10000 - meta: - connMetaPort: 0 - quotaEnableEnforcement: true - storeClientXAttrs: true - storeClientACLs: true - storage: - connStoragePort: 0 - quotaEnableEnforcement: true - client: - connClientPort: 0 - quotaEnabled: true - install-source: - type: package - repo: https://www.beegfs.io/release/beegfs_8.0/ - refs: - mgmtd: beegfs-mgmtd=8.0.1 - meta: beegfs-meta=8.0.1 - storage: beegfs-storage=8.0.1 - client: beegfs-client=8.0.1 - remote: beegfs-remote=8.0.1 - sync: beegfs-sync=8.0.1 - # source: - # type: container - # repo: ghcr.io/thinkparq - # refs: - # mgmtd: beegfs-mgmtd:8.0.1 - # meta: beegfs-meta:8.0.1 - # storage: beegfs-storage:8.0.1 - # client: beegfs-client:8.0.1 - # remote: beegfs-remote:8.0.1 - # sync: beegfs-sync:8.0.1 - agents: - agent1: # agentID - address: "127.0.0.1:9010" - interfaces: - - name: enp0s1 - address: "127.0.0.1/24" - services: - - type: mgmtd - targets: - - id: 101 - path: /beegfs/ - - type: meta - id: 1 - install-source: - type: local - ref: /home/tux/development/beegfs/meta/build/beegfs-meta - 
interfaces: - - name: enp0s1 # IP configuration handled globally - targets: - - id: 101 - path: /beegfs/ - # ulfs: - # device: /dev/sda1 - # type: ext4 - # format_flags: foo - # mount_flags: baz - agent2: # agentID - services: - - type: storage - id: 1 - config: - tuneNumWorkers: 28 - agent3: # agentID - services: - - type: client +filesystems: + 3b6f972b-64c7-4378-9f8e-172cf88c7d93: #fsUUID + common: + auth: + secret: "sharedSecret" + tls: + key: | + tlsKey + cert: | + tlsCert + config: + mgmtd: + beemsg-port: 10000 + meta: + connMetaPort: 0 + quotaEnableEnforcement: true + storeClientXAttrs: true + storeClientACLs: true + storage: + connStoragePort: 0 + quotaEnableEnforcement: true + client: + connClientPort: 0 + quotaEnabled: true + install-source: + type: package + repo: https://www.beegfs.io/release/beegfs_8.0/ + refs: + mgmtd: beegfs-mgmtd=8.0.1 + meta: beegfs-meta=8.0.1 + storage: beegfs-storage=8.0.1 + client: beegfs-client=8.0.1 + remote: beegfs-remote=8.0.1 + sync: beegfs-sync=8.0.1 + # source: + # type: container + # repo: ghcr.io/thinkparq + # refs: + # mgmtd: beegfs-mgmtd:8.0.1 + # meta: beegfs-meta:8.0.1 + # storage: beegfs-storage:8.0.1 + # client: beegfs-client:8.0.1 + # remote: beegfs-remote:8.0.1 + # sync: beegfs-sync:8.0.1 + agents: + agent1: # agentID + address: "127.0.0.1:9010" + interfaces: + - name: enp0s1 + address: "127.0.0.1/24" + services: + - type: mgmtd + targets: + - id: 101 + path: /beegfs/ + - type: meta + id: 1 + install-source: + type: local + ref: /home/tux/development/beegfs/meta/build/beegfs-meta + interfaces: + - name: enp0s1 # IP configuration handled globally + targets: + - id: 101 + path: /beegfs/ + # ulfs: + # device: /dev/sda1 + # type: ext4 + # format_flags: foo + # mount_flags: baz + agent2: # agentID + services: + - type: storage + id: 1 + config: + tuneNumWorkers: 28 + agent3: # agentID + services: + - type: client diff --git a/agent/pkg/reconciler/reconciler.go b/agent/pkg/reconciler/reconciler.go index 
3d5f8a8f..0f4d7a4e 100644 --- a/agent/pkg/reconciler/reconciler.go +++ b/agent/pkg/reconciler/reconciler.go @@ -7,6 +7,7 @@ import ( "path" "reflect" "sync" + "time" "github.com/thinkparq/beegfs-go/agent/pkg/deploy" "github.com/thinkparq/beegfs-go/agent/pkg/manifest" @@ -46,13 +47,13 @@ type ReconcileResult struct { } type defaultReconciler struct { - agentID string - log *zap.Logger - mu sync.Mutex - activeManifest manifest.Filesystems - state state - config Config - strategy deploy.Deployer + agentID string + log *zap.Logger + mu sync.Mutex + active manifest.Manifest + state state + config Config + strategy deploy.Deployer } func New(ctx context.Context, agentID string, log *zap.Logger, config Config) (Reconciler, error) { @@ -115,33 +116,33 @@ func (r *defaultReconciler) UpdateConfiguration(config any) error { r.mu.Lock() r.config = configurer.GetReconcilerConfig() r.log.Info("loading file system manifest", zap.String("path", r.config.ManifestPath)) - newFS, err := manifest.FromDisk(r.config.ManifestPath) + newManifest, err := manifest.FromDisk(r.config.ManifestPath) r.mu.Unlock() if err != nil { return fmt.Errorf("%w: %w", ErrLoadingManifest, err) } - return r.verify(newFS) - } else if newFS, ok := config.(manifest.Filesystems); ok { + return r.verify(newManifest.Filesystems) + } else if newManifest, ok := config.(manifest.Manifest); ok { r.mu.Lock() r.log.Info("saving file system manifest", zap.String("path", r.config.ActiveManifestPath)) - err := manifest.ToDisk(newFS, r.config.ManifestPath) + err := manifest.ToDisk(newManifest, r.config.ManifestPath) r.mu.Unlock() if err != nil { - return fmt.Errorf("%w: %w", ErrBadManifest, err) + return fmt.Errorf("%w: %w", ErrSavingManifest, err) } - return r.verify(newFS) + return r.verify(newManifest.Filesystems) } - return fmt.Errorf("received unexpected reconciler configuration (most likely this indicates a bug and a report should be filed)") + return fmt.Errorf("%w: received unexpected manifest (most likely this 
indicates a bug and a report should be filed)", ErrBadManifest) } // Verify performs any checks that can be done without actually reconciling the manifest. This // allows a response to be returned quickly while the reconciliation happens in the background. -func (r *defaultReconciler) verify(newManifest manifest.Filesystems) error { +func (r *defaultReconciler) verify(newFilesystems manifest.Filesystems) error { r.log.Info("verifying manifest") - if len(newManifest) == 0 { + if len(newFilesystems) == 0 { return errors.New("manifest does not contain any file systems") } - for fsUUID, fs := range newManifest { + for fsUUID, fs := range newFilesystems { // TODO: // * Avoid necessary reconciliations by seeing if the manifest changed. // * Validate we can migrate from currentFS to newFS. @@ -151,18 +152,18 @@ func (r *defaultReconciler) verify(newManifest manifest.Filesystems) error { // Note these should be implemented as methods on manifest.Filesystem. fs.InheritGlobalConfig(fsUUID) } - go r.reconcile(newManifest) + go r.reconcile(newFilesystems) return nil } // Reconcile attempts to move the local state from the currentFS to the newFS. -func (r *defaultReconciler) reconcile(newManifest manifest.Filesystems) { +func (r *defaultReconciler) reconcile(newFilesystems manifest.Filesystems) { r.mu.Lock() defer r.mu.Unlock() - r.log.Debug("reconciling", zap.Any("filesystem", newManifest)) + r.log.Debug("reconciling", zap.Any("filesystem", newFilesystems)) ctx := r.state.start() - for fsUUID, fs := range newManifest { + for fsUUID, fs := range newFilesystems { agent, ok := fs.Agents[r.agentID] if !ok { // Not all file systems in this manifest may have configuration for this agent. 
It is @@ -208,7 +209,12 @@ func (r *defaultReconciler) reconcile(newManifest manifest.Filesystems) { } } } - r.activeManifest = newManifest - manifest.ToDisk(r.activeManifest, r.config.ActiveManifestPath) + r.active = manifest.Manifest{ + Metadata: manifest.Metadata{ + Updated: time.Now(), + }, + Filesystems: newFilesystems, + } + manifest.ToDisk(r.active, r.config.ActiveManifestPath) r.state.complete(pb.Status_SUCCESS) } From 0b449a0124212582b6e41c2e6c0769f090f5f37f Mon Sep 17 00:00:00 2001 From: Joe McCormick <31295332+iamjoemccormick@users.noreply.github.com> Date: Mon, 19 May 2025 20:09:10 +0000 Subject: [PATCH 11/13] wip: enforce v4 UUIDs and derive a unique short UUID --- agent/pkg/manifest/filesystem.go | 17 ++++++++++++++--- agent/pkg/manifest/filesystem_test.go | 6 +++--- agent/pkg/manifest/service.go | 6 +++--- agent/pkg/manifest/target.go | 14 +++++++------- agent/pkg/reconciler/reconciler.go | 24 +++++++++++++++++++++++- 5 files changed, 50 insertions(+), 17 deletions(-) diff --git a/agent/pkg/manifest/filesystem.go b/agent/pkg/manifest/filesystem.go index 09aed9a8..663dd68a 100644 --- a/agent/pkg/manifest/filesystem.go +++ b/agent/pkg/manifest/filesystem.go @@ -7,11 +7,18 @@ package manifest import ( "os" + "github.com/google/uuid" "github.com/thinkparq/beegfs-go/common/beegfs" pb "github.com/thinkparq/protobuf/go/agent" "gopkg.in/yaml.v3" ) +const ShortUUIDLen = 8 + +func ShortUUID(u uuid.UUID) string { + return u.String()[:ShortUUIDLen] +} + type Filesystem struct { Agents map[string]Agent `yaml:"agents"` Common Common `yaml:"common"` @@ -28,11 +35,15 @@ type Nic struct { Addr string `yaml:"address"` } -func (f *Filesystem) InheritGlobalConfig(fsUUID string) { +// InheritGlobalConfig accepts a shortUUID used internally to generate globally unique names and +// identifiers in case resources for multiple file systems exist on the same machine. Derived by +// taking the first ShortUUIDLen hex digits of the full 128-bit v4 UUID. 
The caller is responsible +// for validating the shortUUID including verifying no collisions are possible in this manifest. +func (f *Filesystem) InheritGlobalConfig(shortUUID string) { for agentID, agent := range f.Agents { for i := range agent.Services { service := &agent.Services[i] - service.fsUUID = fsUUID + service.shortUUID = shortUUID // Inherit global interface configuration if there are no service specific interfaces. if len(service.Interfaces) == 0 { service.Interfaces = agent.Interfaces @@ -52,7 +63,7 @@ func (f *Filesystem) InheritGlobalConfig(fsUUID string) { } // Inherit target configuration from the FS and service: for t := range service.Targets { - agent.Services[i].Targets[t].fsUUID = fsUUID + agent.Services[i].Targets[t].shortUUID = shortUUID agent.Services[i].Targets[t].nodeType = service.Type } } diff --git a/agent/pkg/manifest/filesystem_test.go b/agent/pkg/manifest/filesystem_test.go index 22e2c91c..26310c95 100644 --- a/agent/pkg/manifest/filesystem_test.go +++ b/agent/pkg/manifest/filesystem_test.go @@ -181,14 +181,14 @@ func TestInheritGlobalConfig(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { fs := tt.input - fs.InheritGlobalConfig("testFS") + fs.InheritGlobalConfig("3b6f972b-64c7-4378-9f8e-172cf88c7d93") agent := fs.Agents["agent1"] service := agent.Services[0] assert.Equal(t, tt.expectedNIC, service.Interfaces[0].Name) assert.Equal(t, tt.expectedCfg, service.Config) - assert.Equal(t, "testFS", service.fsUUID) + assert.Equal(t, "3b6f972b", service.shortUUID) for _, target := range service.Targets { - assert.Equal(t, "/beegfs/testFS/meta_1", target.GetPath(), "generated target path did not match") + assert.Equal(t, "/beegfs/3b6f972b/meta_1", target.GetPath(), "generated target path did not match") } }) diff --git a/agent/pkg/manifest/service.go b/agent/pkg/manifest/service.go index accc6537..5c74fc1d 100644 --- a/agent/pkg/manifest/service.go +++ b/agent/pkg/manifest/service.go @@ -7,9 +7,9 @@ import ( ) 
type Service struct { - // fsUUID is set by InheritGlobalConfig and used internally to generate globally unique names + // shortUUID is set by InheritGlobalConfig and used internally to generate globally unique names // and identifiers in case resources for multiple file systems exist on the same machine. - fsUUID string + shortUUID string ID beegfs.NumId `yaml:"id"` Type beegfs.NodeType `yaml:"type"` Config map[string]string `yaml:"config"` @@ -19,7 +19,7 @@ type Service struct { } func (s Service) GetSystemdUnit() string { - return fmt.Sprintf("beegfs-%s-%s-%d.service", s.fsUUID, s.Type, s.ID) + return fmt.Sprintf("beegfs-%s-%s-%d.service", s.shortUUID, s.Type, s.ID) } type ServiceInstallSource struct { diff --git a/agent/pkg/manifest/target.go b/agent/pkg/manifest/target.go index 3003021d..9fffb71b 100644 --- a/agent/pkg/manifest/target.go +++ b/agent/pkg/manifest/target.go @@ -10,17 +10,17 @@ import ( ) type Target struct { - // fsUUID is set by InheritGlobalConfig and used internally to generate globally unique names + // shortUUID is set by InheritGlobalConfig and used internally to generate globally unique names // and identifiers in case resources for multiple file systems exist on the same machine. 
- fsUUID string - nodeType beegfs.NodeType - ID beegfs.NumId `yaml:"id"` - Path string `yaml:"path"` - ULFS *UnderlyingFS `yaml:"ulfs"` + shortUUID string + nodeType beegfs.NodeType + ID beegfs.NumId `yaml:"id"` + Path string `yaml:"path"` + ULFS *UnderlyingFS `yaml:"ulfs"` } func (t Target) GetPath() string { - return path.Join(t.Path, t.fsUUID, fmt.Sprintf("%s_%d", t.nodeType, t.ID)) + return path.Join(t.Path, t.shortUUID, fmt.Sprintf("%s_%d", t.nodeType, t.ID)) } type UnderlyingFS struct { diff --git a/agent/pkg/reconciler/reconciler.go b/agent/pkg/reconciler/reconciler.go index 0f4d7a4e..47f0eab2 100644 --- a/agent/pkg/reconciler/reconciler.go +++ b/agent/pkg/reconciler/reconciler.go @@ -6,9 +6,11 @@ import ( "fmt" "path" "reflect" + "strings" "sync" "time" + "github.com/google/uuid" "github.com/thinkparq/beegfs-go/agent/pkg/deploy" "github.com/thinkparq/beegfs-go/agent/pkg/manifest" pb "github.com/thinkparq/protobuf/go/agent" @@ -142,7 +144,27 @@ func (r *defaultReconciler) verify(newFilesystems manifest.Filesystems) error { if len(newFilesystems) == 0 { return errors.New("manifest does not contain any file systems") } + + // shortToFullUUIDs is used to ensure the short UUIDs derived from the full v4 FS UUID do not + // have any collisions. While true collisions are HIGHLY unlikely from properly generated v4 + // UUIDs, this might happen if there are user generated UUIDs, typos, or copy/paste errors. 
+ shortToFullUUIDs := map[string]string{} for fsUUID, fs := range newFilesystems { + fsUUID = strings.ToLower(fsUUID) + u, err := uuid.Parse(fsUUID) + if err != nil { + return fmt.Errorf("error parsing file system UUID: %w (is it a valid v4 UUID?)", err) + } else if u.Version() != 4 { + return fmt.Errorf("unsupported file system UUID version: %d (must be v4)", u.Version()) + } + shortUUID := manifest.ShortUUID(u) + if conflictingUUID, ok := shortToFullUUIDs[shortUUID]; ok { + return fmt.Errorf( + "short UUID collision: %q derived from %q and %q (first %d characters are identical)", + shortUUID, fsUUID, conflictingUUID, manifest.ShortUUIDLen, + ) + } + shortToFullUUIDs[shortUUID] = fsUUID // TODO: // * Avoid necessary reconciliations by seeing if the manifest changed. // * Validate we can migrate from currentFS to newFS. @@ -150,7 +172,7 @@ func (r *defaultReconciler) verify(newFilesystems manifest.Filesystems) error { // * All services have IPs + targets. // * Services have the correct number of targets (i.e., 1 for mgmtd meta, remote, sync). // Note these should be implemented as methods on manifest.Filesystem. 
- fs.InheritGlobalConfig(fsUUID) + fs.InheritGlobalConfig(shortUUID) } go r.reconcile(newFilesystems) return nil From 9e02fa15da4fdec4ca8008cca91583069dfd44c6 Mon Sep 17 00:00:00 2001 From: Joe McCormick <31295332+iamjoemccormick@users.noreply.github.com> Date: Mon, 19 May 2025 21:38:02 +0000 Subject: [PATCH 12/13] wip: switch per service install source to a simple executable path --- agent/pkg/deploy/{source.go => install.go} | 20 ++++++------- agent/pkg/manifest/common.go | 30 +++++++++++++++++++ agent/pkg/manifest/filesystem.go | 32 ++++++++------------ agent/pkg/manifest/filesystem_test.go | 35 ++++++++-------------- agent/pkg/manifest/manifest.yaml | 4 +-- agent/pkg/manifest/service.go | 14 ++++----- agent/pkg/reconciler/reconciler.go | 27 ++++++++++------- 7 files changed, 89 insertions(+), 73 deletions(-) rename agent/pkg/deploy/{source.go => install.go} (72%) diff --git a/agent/pkg/deploy/source.go b/agent/pkg/deploy/install.go similarity index 72% rename from agent/pkg/deploy/source.go rename to agent/pkg/deploy/install.go index 0e0f69ff..31af6e53 100644 --- a/agent/pkg/deploy/source.go +++ b/agent/pkg/deploy/install.go @@ -12,8 +12,8 @@ import ( type Installer interface { ApplySourceRepo(ctx context.Context, add manifest.InstallSource) error DeleteSourceRepo(ctx context.Context, remove manifest.InstallSource) error - ApplyInstall(ctx context.Context, source manifest.ServiceInstallSource) error - DeleteInstall(ctx context.Context, source manifest.ServiceInstallSource) error + ApplyInstall(ctx context.Context, ref string) error + DeleteInstall(ctx context.Context, ref string) error } func DetectPackageManager() (Package, error) { @@ -56,18 +56,18 @@ func (p *Package) DeleteSourceRepo(ctx context.Context, remove manifest.InstallS return p.manager.DeleteSourceRepo(ctx, remove) } -func (p *Package) ApplyInstall(ctx context.Context, source manifest.ServiceInstallSource) error { - if p.isLocal || source.Type == manifest.LocalInstall { +func (p *Package) 
ApplyInstall(ctx context.Context, ref string) error { + if p.isLocal { return nil } - return p.manager.ApplyInstall(ctx, source) + return p.manager.ApplyInstall(ctx, ref) } -func (p *Package) DeleteInstall(ctx context.Context, source manifest.ServiceInstallSource) error { - if p.isLocal || source.Type == manifest.LocalInstall { +func (p *Package) DeleteInstall(ctx context.Context, ref string) error { + if p.isLocal { return nil } - return p.manager.DeleteInstall(ctx, source) + return p.manager.DeleteInstall(ctx, ref) } type AptPackage struct{} @@ -80,10 +80,10 @@ func (p *AptPackage) DeleteSourceRepo(ctx context.Context, remove manifest.Insta return errors.New("not implemented") } -func (p *AptPackage) ApplyInstall(ctx context.Context, source manifest.ServiceInstallSource) error { +func (p *AptPackage) ApplyInstall(ctx context.Context, ref string) error { return errors.New("not implemented") } -func (p *AptPackage) DeleteInstall(ctx context.Context, source manifest.ServiceInstallSource) error { +func (p *AptPackage) DeleteInstall(ctx context.Context, ref string) error { return errors.New("not implemented") } diff --git a/agent/pkg/manifest/common.go b/agent/pkg/manifest/common.go index 3665032f..7125d430 100644 --- a/agent/pkg/manifest/common.go +++ b/agent/pkg/manifest/common.go @@ -2,11 +2,17 @@ package manifest import ( "fmt" + "path/filepath" + "strings" "github.com/thinkparq/beegfs-go/common/beegfs" pb "github.com/thinkparq/protobuf/go/agent" ) +const ( + DefaultExecutablePath = "/opt/beegfs/sbin/" +) + type Common struct { Auth *Auth `yaml:"auth"` TLS *TLS `yaml:"tls"` @@ -73,6 +79,30 @@ type InstallSource struct { Refs SourceRefs `yaml:"refs"` } +// nodeTypeToExecutablePath takes a node type and returns the default path to that node's binary. 
+func (s InstallSource) nodeTypeToExecutablePath(nodeType beegfs.NodeType) string { + switch nodeType { + case beegfs.Management: + return filepath.Clean(DefaultExecutablePath + "beegfs-mgmtd") + default: + return filepath.Clean(DefaultExecutablePath + nodeType.String()) + } +} + +// refToExecutablePath takes a reference to a package, container image, etc. and generates a default +// executable path based on the install source type and reference string format. +func (s InstallSource) refToExecutablePath(ref string) string { + switch s.Type { + case PackageInstall, LocalInstall: + if r := strings.Split(ref, "="); len(r) == 2 { + return filepath.Clean(DefaultExecutablePath + r[0]) + } + return filepath.Clean(DefaultExecutablePath + ref) + default: + return ref + } +} + type SourceRefs map[beegfs.NodeType]string func (s *SourceRefs) UnmarshalYAML(unmarshal func(any) error) error { diff --git a/agent/pkg/manifest/filesystem.go b/agent/pkg/manifest/filesystem.go index 663dd68a..6e5a5c8f 100644 --- a/agent/pkg/manifest/filesystem.go +++ b/agent/pkg/manifest/filesystem.go @@ -53,12 +53,14 @@ func (f *Filesystem) InheritGlobalConfig(shortUUID string) { service.Config = inheritMapDefaults(commonServiceConfig, service.Config) } // Inherit global source configuration based on the service type. - if service.InstallSource == nil || service.InstallSource.Ref == "" { - service.InstallSource = &ServiceInstallSource{ - Type: f.Common.InstallSource.Type, - } + if service.Executable == "" { if ref, ok := f.Common.InstallSource.Refs[service.Type]; ok { - service.InstallSource.Ref = ref + // If there is a global install reference for this service type use this to + // derive the executable path. + service.Executable = f.Common.InstallSource.refToExecutablePath(ref) + } else { + // Otherwise get the default executable path for this service type. 
+ service.Executable = f.Common.InstallSource.nodeTypeToExecutablePath(service.Type) } } // Inherit target configuration from the FS and service: @@ -66,6 +68,10 @@ func (f *Filesystem) InheritGlobalConfig(shortUUID string) { agent.Services[i].Targets[t].shortUUID = shortUUID agent.Services[i].Targets[t].nodeType = service.Type } + + // TODO: Inherit global conn.auth and TLS config based on service type. Return an error + // if users try to manually specify this in the global or per-service config somehow. + // Maybe this is implemented as methods on the Auth and TLS structs? } f.Agents[agentID] = agent } @@ -131,13 +137,7 @@ func FromProto(protoFS *pb.Filesystem) Filesystem { Config: s.GetConfig(), Interfaces: make([]Nic, 0), Targets: make([]Target, 0), - } - - if s.InstallSource != nil { - service.InstallSource = &ServiceInstallSource{ - Type: sourceTypeFromProto(s.GetInstallSource().GetType()), - Ref: s.GetInstallSource().GetRef(), - } + Executable: s.GetExecutable(), } for _, i := range s.GetInterfaces() { @@ -214,13 +214,7 @@ func ToProto(fs *Filesystem) *pb.Filesystem { Config: service.Config, Interfaces: make([]*pb.Nic, 0, len(service.Interfaces)), Targets: make([]*pb.Target, 0, len(service.Targets)), - } - - if service.InstallSource != nil { - pbService.InstallSource = &pb.Service_InstallSource{ - Type: service.InstallSource.Type.ToProto(), - Ref: service.InstallSource.Ref, - } + Executable: service.Executable, } for _, nic := range service.Interfaces { diff --git a/agent/pkg/manifest/filesystem_test.go b/agent/pkg/manifest/filesystem_test.go index 26310c95..c3fc87d9 100644 --- a/agent/pkg/manifest/filesystem_test.go +++ b/agent/pkg/manifest/filesystem_test.go @@ -49,10 +49,7 @@ func TestFromToProto_RoundTrip(t *testing.T) { Interfaces: []*pb.Nic{ {Name: "ib0", Addr: "10.0.0.1/16"}, }, - InstallSource: &pb.Service_InstallSource{ - Type: pb.InstallType_LOCAL, - Ref: "12345", - }, + Executable: "/opt/beegfs/beegfs-meta", Targets: []*pb.Target{ { NumId: 101, 
@@ -79,11 +76,11 @@ func TestFromToProto_RoundTrip(t *testing.T) { func TestInheritGlobalConfig(t *testing.T) { tests := []struct { - name string - input Filesystem - expectedNIC string // Expected NIC name in service if inherited - expectedCfg map[string]string - expectedSrc ServiceInstallSource + name string + input Filesystem + expectedNIC string // Expected NIC name in service if inherited + expectedCfg map[string]string + expectedExec string }{ { name: "inherit source, NIC and meta config", @@ -125,10 +122,7 @@ func TestInheritGlobalConfig(t *testing.T) { "foo": "bar", // inherited "baz": "service-specific", // overridden }, - expectedSrc: ServiceInstallSource{ - Type: PackageInstall, - Ref: "beegfs-meta=8.0.1", - }, + expectedExec: "/opt/beegfs/beegfs-meta", }, { name: "no inheritance if NICs or source are present", @@ -157,11 +151,8 @@ func TestInheritGlobalConfig(t *testing.T) { Interfaces: []Nic{ {Name: "eth0", Addr: "192.168.0.1/24"}, }, - Config: map[string]string{"quota": "override"}, - InstallSource: &ServiceInstallSource{ - Type: LocalInstall, - Ref: "/home/tux/beegfs-meta", - }, + Config: map[string]string{"quota": "override"}, + Executable: "/tmp/beegfs-meta", }, }, }, @@ -171,17 +162,15 @@ func TestInheritGlobalConfig(t *testing.T) { expectedCfg: map[string]string{ "quota": "override", }, - expectedSrc: ServiceInstallSource{ - Type: LocalInstall, - Ref: "/home/tux/beegfs-meta", - }, + expectedExec: "/tmp/beegfs-meta", }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { fs := tt.input - fs.InheritGlobalConfig("3b6f972b-64c7-4378-9f8e-172cf88c7d93") + // 3b6f972b-64c7-4378-9f8e-172cf88c7d93 + fs.InheritGlobalConfig("3b6f972b") agent := fs.Agents["agent1"] service := agent.Services[0] assert.Equal(t, tt.expectedNIC, service.Interfaces[0].Name) diff --git a/agent/pkg/manifest/manifest.yaml b/agent/pkg/manifest/manifest.yaml index 39f3f14c..b93e912b 100644 --- a/agent/pkg/manifest/manifest.yaml +++ b/agent/pkg/manifest/manifest.yaml @@ 
-55,9 +55,7 @@ filesystems: path: /beegfs/ - type: meta id: 1 - install-source: - type: local - ref: /home/tux/development/beegfs/meta/build/beegfs-meta + executable: /home/tux/development/beegfs/meta/build/beegfs-meta interfaces: - name: enp0s1 # IP configuration handled globally targets: diff --git a/agent/pkg/manifest/service.go b/agent/pkg/manifest/service.go index 5c74fc1d..91654fbd 100644 --- a/agent/pkg/manifest/service.go +++ b/agent/pkg/manifest/service.go @@ -9,13 +9,13 @@ import ( type Service struct { // shortUUID is set by InheritGlobalConfig and used internally to generate globally unique names // and identifiers in case resources for multiple file systems exist on the same machine. - shortUUID string - ID beegfs.NumId `yaml:"id"` - Type beegfs.NodeType `yaml:"type"` - Config map[string]string `yaml:"config"` - Interfaces []Nic `yaml:"interfaces"` - Targets []Target `yaml:"targets"` - InstallSource *ServiceInstallSource `yaml:"install-source,omitempty"` + shortUUID string + ID beegfs.NumId `yaml:"id"` + Type beegfs.NodeType `yaml:"type"` + Config map[string]string `yaml:"config"` + Interfaces []Nic `yaml:"interfaces"` + Targets []Target `yaml:"targets"` + Executable string `yaml:"executable"` } func (s Service) GetSystemdUnit() string { diff --git a/agent/pkg/reconciler/reconciler.go b/agent/pkg/reconciler/reconciler.go index 47f0eab2..a0ed2587 100644 --- a/agent/pkg/reconciler/reconciler.go +++ b/agent/pkg/reconciler/reconciler.go @@ -13,6 +13,7 @@ import ( "github.com/google/uuid" "github.com/thinkparq/beegfs-go/agent/pkg/deploy" "github.com/thinkparq/beegfs-go/agent/pkg/manifest" + "github.com/thinkparq/beegfs-go/common/beegfs" pb "github.com/thinkparq/protobuf/go/agent" "go.uber.org/zap" ) @@ -205,6 +206,21 @@ func (r *defaultReconciler) reconcile(newFilesystems manifest.Filesystems) { return } + // Install packages for all services types managed by this agent: + installPackages := map[beegfs.NodeType]struct{}{} + for _, service := range 
agent.Services { + if _, ok := installPackages[service.Type]; ok { + continue + } + installPackages[service.Type] = struct{}{} + if ref, ok := fs.Common.InstallSource.Refs[service.Type]; ok { + if err := r.strategy.ApplyInstall(ctx, ref); err != nil { + r.state.fail(fmt.Sprintf("unable to apply service installation for %s: %s", getFsServiceID(fsUUID, service.Type, service.ID), err.Error())) + } + } + } + + // Roll out services: for _, service := range agent.Services { if err := r.strategy.ApplyInterfaces(ctx, service.Interfaces); err != nil { r.state.fail(fmt.Sprintf("unable to apply interface configuration for %s: %s", getFsServiceID(fsUUID, service.Type, service.ID), err.Error())) @@ -214,17 +230,6 @@ func (r *defaultReconciler) reconcile(newFilesystems manifest.Filesystems) { r.state.fail(fmt.Sprintf("unable to apply target configuration for %s: %s", getFsServiceID(fsUUID, service.Type, service.ID), err.Error())) return } - - // Currently the source for the services should always be set by the user or inherited - // automatically from the global configuration. This might change so avoid a panic. 
- if service.InstallSource != nil { - if err := r.strategy.ApplyInstall(ctx, *service.InstallSource); err != nil { - r.state.fail(fmt.Sprintf("unable to apply source installation for %s: %s", getFsServiceID(fsUUID, service.Type, service.ID), err.Error())) - } - } else { - r.log.Warn("service install source was unexpectedly nil (ignoring)", zap.String("fsUUID", fsUUID), zap.String("type", service.Type.String()), zap.Any("id", service.ID)) - } - if err := r.strategy.ApplyService(ctx, service); err != nil { r.state.fail(fmt.Sprintf("unable to apply service configuration for %s: %s", getFsServiceID(fsUUID, service.Type, service.ID), err.Error())) return From 594d532465dfbb7f2da41745d32ddb1f5d11b321 Mon Sep 17 00:00:00 2001 From: Joe McCormick <31295332+iamjoemccormick@users.noreply.github.com> Date: Wed, 21 May 2025 19:57:58 +0000 Subject: [PATCH 13/13] todo: unstage + self-review --- agent/pkg/deploy/mount.go | 9 ++++++ agent/pkg/deploy/service.go | 45 +++++++++++++++++++++++++-- agent/pkg/manifest/filesystem.go | 19 ++++++++++- agent/pkg/manifest/filesystem_test.go | 2 +- agent/pkg/manifest/service.go | 38 +++++++++++++++++++--- agent/pkg/manifest/target.go | 15 +++++++++ agent/pkg/reconciler/reconciler.go | 4 ++- 7 files changed, 122 insertions(+), 10 deletions(-) diff --git a/agent/pkg/deploy/mount.go b/agent/pkg/deploy/mount.go index 1aeef6ca..171f51a1 100644 --- a/agent/pkg/deploy/mount.go +++ b/agent/pkg/deploy/mount.go @@ -5,6 +5,8 @@ import ( "errors" "fmt" "os" + "os/exec" + "strings" "github.com/thinkparq/beegfs-go/agent/pkg/manifest" ) @@ -25,6 +27,13 @@ func (m *Mount) ApplyTargets(ctx context.Context, add []manifest.Target) error { if err := os.MkdirAll(target.GetPath(), 0700); err != nil { return fmt.Errorf("unable to apply target %d: unable to create root directory %s: %w", target.ID, target.Path, err) } + name, args := target.GetInitCmd() + output, err := exec.CommandContext(ctx, name, args...).CombinedOutput() + if err != nil { + if 
!strings.Contains(string(output), "already exists") { + return fmt.Errorf("unable to initialize target %d: %s (%w)", target.ID, output, err) + } + } } return nil } diff --git a/agent/pkg/deploy/service.go b/agent/pkg/deploy/service.go index c4390d84..d41ffb1c 100644 --- a/agent/pkg/deploy/service.go +++ b/agent/pkg/deploy/service.go @@ -2,7 +2,6 @@ package deploy import ( "context" - "errors" "fmt" "github.com/coreos/go-systemd/v22/dbus" @@ -36,9 +35,49 @@ func (d *Systemd) Cleanup() error { } func (d *Systemd) ApplyService(ctx context.Context, add manifest.Service) error { - return errors.New("not implemented") + + if err := d.DestroyService(ctx, add); err != nil { + return fmt.Errorf("error destroying existing service to apply updates: %w", err) + } + + cmd := append([]string{add.Executable}, add.GetConfig()...) + properties := []dbus.Property{ + dbus.PropExecStart(cmd, false), + dbus.PropDescription(add.GetDescription()), + dbus.PropRemainAfterExit(false), + dbus.PropType("simple"), + } + _, err := d.conn.StartTransientUnitContext(ctx, add.GetSystemdUnit(), "replace", properties, nil) + if err != nil { + return fmt.Errorf("failed to start transient unit %s: %w", add.GetSystemdUnit(), err) + } + return nil } func (d *Systemd) DestroyService(ctx context.Context, remove manifest.Service) error { - return errors.New("not implemented") + units, err := d.conn.ListUnitsByNamesContext(ctx, []string{remove.GetSystemdUnit()}) + if err != nil { + return fmt.Errorf("error querying systemd units: %w", err) + } + + if len(units) == 0 || units[0].LoadState == "not-found" { + return nil + } + + ch := make(chan string) + _, err = d.conn.StopUnitContext(ctx, remove.GetSystemdUnit(), "replace", ch) + if err != nil { + return err + } + select { + case <-ch: + if units[0].SubState == "failed" { + if err := d.conn.ResetFailedUnitContext(ctx, remove.GetSystemdUnit()); err != nil { + return fmt.Errorf("error resetting failed unit context: %w", err) + } + } + return nil + case 
<-ctx.Done(): + return ctx.Err() + } } diff --git a/agent/pkg/manifest/filesystem.go b/agent/pkg/manifest/filesystem.go index 6e5a5c8f..d5b9dbd2 100644 --- a/agent/pkg/manifest/filesystem.go +++ b/agent/pkg/manifest/filesystem.go @@ -5,6 +5,7 @@ package manifest import ( + "fmt" "os" "github.com/google/uuid" @@ -39,11 +40,12 @@ type Nic struct { // identifiers in case resources for multiple file systems exist on the same machine. Derived by // taking the first ShortUUIDLen hex digits of the full 128-bit v4 UUID. The caller is responsible // for validating the shortUUID including verifying no collisions are possible in this manifest. -func (f *Filesystem) InheritGlobalConfig(shortUUID string) { +func (f *Filesystem) InheritGlobalConfig(shortUUID string, longUUID string) error { for agentID, agent := range f.Agents { for i := range agent.Services { service := &agent.Services[i] service.shortUUID = shortUUID + service.longUUID = longUUID // Inherit global interface configuration if there are no service specific interfaces. if len(service.Interfaces) == 0 { service.Interfaces = agent.Interfaces @@ -65,8 +67,22 @@ func (f *Filesystem) InheritGlobalConfig(shortUUID string) { } // Inherit target configuration from the FS and service: for t := range service.Targets { + agent.Services[i].Targets[t].longUUID = longUUID agent.Services[i].Targets[t].shortUUID = shortUUID agent.Services[i].Targets[t].nodeType = service.Type + // TODO: May be different for each service type. + agent.Services[i].Targets[t].initCmd = service.Executable + } + + if targetConfig, err := service.GetTargetsConfig(); err != nil { + return err + } else { + for k, v := range targetConfig { + if user, ok := service.Config[k]; ok { + return fmt.Errorf("auto-generated target config for %s=%s would overwrite user config %s (user config must be removed)", k, v, user) + } + service.Config[k] = v + } } // TODO: Inherit global conn.auth and TLS config based on service type. 
Return an error @@ -75,6 +91,7 @@ func (f *Filesystem) InheritGlobalConfig(shortUUID string) { } f.Agents[agentID] = agent } + return nil } func inheritMapDefaults(defaults, target map[string]string) map[string]string { diff --git a/agent/pkg/manifest/filesystem_test.go b/agent/pkg/manifest/filesystem_test.go index c3fc87d9..bf997690 100644 --- a/agent/pkg/manifest/filesystem_test.go +++ b/agent/pkg/manifest/filesystem_test.go @@ -170,7 +170,7 @@ func TestInheritGlobalConfig(t *testing.T) { t.Run(tt.name, func(t *testing.T) { fs := tt.input // 3b6f972b-64c7-4378-9f8e-172cf88c7d93 - fs.InheritGlobalConfig("3b6f972b") + fs.InheritGlobalConfig("3b6f972b", "3b6f972b-64c7-4378-9f8e-172cf88c7d93") agent := fs.Agents["agent1"] service := agent.Services[0] assert.Equal(t, tt.expectedNIC, service.Interfaces[0].Name) diff --git a/agent/pkg/manifest/service.go b/agent/pkg/manifest/service.go index 91654fbd..622ed692 100644 --- a/agent/pkg/manifest/service.go +++ b/agent/pkg/manifest/service.go @@ -2,6 +2,7 @@ package manifest import ( "fmt" + "path/filepath" "github.com/thinkparq/beegfs-go/common/beegfs" ) @@ -10,6 +11,7 @@ type Service struct { // shortUUID is set by InheritGlobalConfig and used internally to generate globally unique names // and identifiers in case resources for multiple file systems exist on the same machine. 
shortUUID string + longUUID string ID beegfs.NumId `yaml:"id"` Type beegfs.NodeType `yaml:"type"` Config map[string]string `yaml:"config"` @@ -18,11 +20,39 @@ type Service struct { Executable string `yaml:"executable"` } +// GetTargetsConfig returns the string used to initialize +func (s Service) GetTargetsConfig() (map[string]string, error) { + switch s.Type { + case beegfs.Management: + if len(s.Targets) != 1 { + return nil, fmt.Errorf("invalid number of targets for node type %s: %d", s.Type.String(), len(s.Targets)) + } + path := filepath.Clean(s.Targets[0].GetPath() + "/mgmtd.sqlite") + return map[string]string{"db-file": path}, nil + default: + return nil, nil + } + + // TODO: Implement remaining node types. + // return "", nil, fmt.Errorf("unsupported node type: %v", s.Type) +} + +func (s Service) GetDescription() string { + return fmt.Sprintf("BeeGFS %s-%s-%d (managed by BeeOND)", s.shortUUID, s.Type, s.ID) +} + func (s Service) GetSystemdUnit() string { - return fmt.Sprintf("beegfs-%s-%s-%d.service", s.shortUUID, s.Type, s.ID) + return fmt.Sprintf("%s-beegfs-%s-%d.service", s.shortUUID, s.Type, s.ID) } -type ServiceInstallSource struct { - Type InstallType `yaml:"type"` - Ref string `yaml:"ref"` +func (s Service) GetConfig() []string { + config := make([]string, 0, len(s.Config)) + for k, v := range s.Config { + if s.Type == beegfs.Management { + config = append(config, fmt.Sprintf("--%s=%v", k, v)) + } else { + config = append(config, fmt.Sprintf("%s=%v", k, v)) + } + } + return config } diff --git a/agent/pkg/manifest/target.go b/agent/pkg/manifest/target.go index 9fffb71b..73008681 100644 --- a/agent/pkg/manifest/target.go +++ b/agent/pkg/manifest/target.go @@ -3,6 +3,7 @@ package manifest import ( "fmt" "path" + "path/filepath" "strings" "github.com/thinkparq/beegfs-go/common/beegfs" @@ -13,12 +14,26 @@ type Target struct { // shortUUID is set by InheritGlobalConfig and used internally to generate globally unique names // and identifiers in case 
resources for multiple file systems exist on the same machine. shortUUID string + longUUID string nodeType beegfs.NodeType + initCmd string ID beegfs.NumId `yaml:"id"` Path string `yaml:"path"` ULFS *UnderlyingFS `yaml:"ulfs"` } +func (t Target) GetInitCmd() (string, []string) { + if t.nodeType == beegfs.Management { + return t.initCmd, []string{ + fmt.Sprintf("--fs-uuid=%s", t.longUUID), + fmt.Sprintf("--init"), + fmt.Sprintf("--db-file=%s", filepath.Clean(t.GetPath()+"/mgmtd.sqlite")), + } + } + // TODO: Setup init commands for other node types. + return t.initCmd, []string{} +} + func (t Target) GetPath() string { return path.Join(t.Path, t.shortUUID, fmt.Sprintf("%s_%d", t.nodeType, t.ID)) } diff --git a/agent/pkg/reconciler/reconciler.go b/agent/pkg/reconciler/reconciler.go index a0ed2587..6361854b 100644 --- a/agent/pkg/reconciler/reconciler.go +++ b/agent/pkg/reconciler/reconciler.go @@ -173,7 +173,9 @@ func (r *defaultReconciler) verify(newFilesystems manifest.Filesystems) error { // * All services have IPs + targets. // * Services have the correct number of targets (i.e., 1 for mgmtd meta, remote, sync). // Note these should be implemented as methods on manifest.Filesystem. - fs.InheritGlobalConfig(shortUUID) + if err := fs.InheritGlobalConfig(shortUUID, fsUUID); err != nil { + return fmt.Errorf("error propagating global configuration: %w", err) + } } go r.reconcile(newFilesystems) return nil