Skip to content

Commit

Permalink
Introduce Cluster Health Monitoring
Browse files Browse the repository at this point in the history
This PR introduces cluster health monitoring for tenants. Monitoring runs periodically, every 3 minutes by default; the interval is configurable, in minutes, via the MONITORING_INTERVAL environment variable.

Signed-off-by: Daniel Valdivia <18384552+dvaldivia@users.noreply.github.com>
  • Loading branch information
dvaldivia committed May 20, 2021
1 parent cf7df61 commit 138cdec
Show file tree
Hide file tree
Showing 8 changed files with 377 additions and 11 deletions.
28 changes: 28 additions & 0 deletions helm/minio-operator/crds/minio.min.io_tenants.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3440,6 +3440,17 @@ spec:
type: object
currentState:
type: string
drivesHealing:
format: int32
type: integer
drivesOffline:
format: int32
type: integer
drivesOnline:
format: int32
type: integer
healthStatus:
type: string
pools:
items:
properties:
Expand All @@ -3458,6 +3469,9 @@ spec:
type: integer
syncVersion:
type: string
writeQuorum:
format: int32
type: integer
required:
- availableReplicas
- certificates
Expand Down Expand Up @@ -6915,6 +6929,17 @@ spec:
type: object
currentState:
type: string
drivesHealing:
format: int32
type: integer
drivesOffline:
format: int32
type: integer
drivesOnline:
format: int32
type: integer
healthStatus:
type: string
pools:
items:
properties:
Expand All @@ -6933,6 +6958,9 @@ spec:
type: integer
syncVersion:
type: string
writeQuorum:
format: int32
type: integer
required:
- availableReplicas
- certificates
Expand Down
11 changes: 0 additions & 11 deletions pkg/apis/minio.min.io/v1/globals.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,9 @@

package v1

import "os"

// ClusterDomain is used to store the Kubernetes cluster domain
var ClusterDomain string

// KESIdentity is the public identity generated for MinIO Server.
// Used only during KES Deployments.
// NOTE(review): the previous comment ended with a dangling "based on"; what
// the identity is derived from is not stated here — confirm and document.
var KESIdentity string

// InitGlobals populates the package-level globals when the Operator starts.
//
// ClusterDomain is taken from the CLUSTER_DOMAIN environment variable and
// falls back to "cluster.local" when that variable is not set. The tenant
// argument is not read by this function.
func InitGlobals(t *Tenant) {
	if domain, found := os.LookupEnv("CLUSTER_DOMAIN"); found {
		ClusterDomain = domain
		return
	}
	ClusterDomain = "cluster.local"
}
5 changes: 5 additions & 0 deletions pkg/apis/minio.min.io/v2/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -320,3 +320,8 @@ const tenantMinIOImageEnv = "TENANT_MINIO_IMAGE"
const tenantConsoleImageEnv = "TENANT_CONSOLE_IMAGE"

const tenantKesImageEnv = "TENANT_KES_IMAGE"

// monitoringIntervalEnv is the name of the environment variable that
// overrides how often tenants are monitored
const monitoringIntervalEnv = "MONITORING_INTERVAL"

// DefaultMonitoringInterval is how often, in minutes, we run monitoring on
// tenants when no override is configured
const DefaultMonitoringInterval = 3
31 changes: 31 additions & 0 deletions pkg/apis/minio.min.io/v2/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,12 @@ var (
tenantMinIOImageOnce sync.Once
tenantConsoleImageOnce sync.Once
tenantKesImageOnce sync.Once
monitoringIntervalOnce sync.Once
k8sClusterDomain string
tenantMinIOImage string
tenantConsoleImage string
tenantKesImage string
monitoringInterval int
)

// GetPodCAFromFile assumes the operator is running inside a k8s pod and extract the
Expand Down Expand Up @@ -921,3 +923,32 @@ func GetTenantKesImage() string {
})
return tenantKesImage
}

// GetMonitoringInterval returns how often, in minutes, we should query
// tenants for cluster/health.
//
// The value is resolved only once, guarded by monitoringIntervalOnce, from
// the variable named by monitoringIntervalEnv. DefaultMonitoringInterval is
// returned when the variable is empty, is not a valid integer, or is not
// strictly positive.
func GetMonitoringInterval() int {
	monitoringIntervalOnce.Do(func() {
		// Start from the default so every rejected input falls back to it.
		monitoringInterval = DefaultMonitoringInterval

		raw := envGet(monitoringIntervalEnv, "")
		if raw == "" {
			return
		}
		// A zero or negative interval cannot drive a periodic check (for
		// example, time.NewTicker panics on a non-positive duration), so
		// such values are ignored just like unparsable ones.
		if val, err := strconv.Atoi(raw); err == nil && val > 0 {
			monitoringInterval = val
		}
	})
	return monitoringInterval
}

// GetTenantServiceURL returns the URL of the tenant's MinIO service, choosing
// the scheme and port according to whether TLS is in use.
//
// When the tenant uses an auto-generated or an external certificate the URL
// is https on MinIOTLSPortLoadBalancerSVC; otherwise it is http on
// MinIOPortLoadBalancerSVC.
//
// NOTE(review): the host is always built with the "cluster.local" suffix;
// confirm this is intended for clusters configured with a custom domain.
func (t *Tenant) GetTenantServiceURL() (svcURL string) {
	scheme, port := "http", MinIOPortLoadBalancerSVC
	if useTLS := t.AutoCert() || t.ExternalCert(); useTLS {
		scheme, port = "https", MinIOTLSPortLoadBalancerSVC
	}
	host := fmt.Sprintf("%s.%s.svc.cluster.local", t.MinIOCIServiceName(), t.Namespace)
	hostPort := net.JoinHostPort(host, strconv.Itoa(port))
	return fmt.Sprintf("%s://%s", scheme, hostPort)
}
32 changes: 32 additions & 0 deletions pkg/apis/minio.min.io/v2/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,18 @@ type PoolStatus struct {
State PoolState `json:"state"`
}

// HealthStatus represents whether the tenant is healthy, running with
// decreased service, or offline
type HealthStatus string

const (
// HealthStatusGreen indicates a healthy tenant: all drives online
HealthStatusGreen HealthStatus = "green"
// HealthStatusYellow indicates a tenant with decreased resilience: some drives offline
HealthStatusYellow HealthStatus = "yellow"
// HealthStatusRed indicates the tenant is offline, or has lost write quorum
HealthStatusRed HealthStatus = "red"
)

// TenantStatus is the status for a Tenant resource
type TenantStatus struct {
CurrentState string `json:"currentState"`
Expand All @@ -373,6 +385,26 @@ type TenantStatus struct {
// All the pools get an individual status
// +nullable
Pools []PoolStatus `json:"pools"`
// *Optional* +
//
// Minimum number of disks that need to be online
WriteQuorum int32 `json:"writeQuorum,omitempty"`
// *Optional* +
//
// Total number of drives online for the tenant
DrivesOnline int32 `json:"drivesOnline,omitempty"`
// *Optional* +
//
// Total number of drives offline
DrivesOffline int32 `json:"drivesOffline,omitempty"`
// *Optional* +
//
// Drives with healing going on
DrivesHealing int32 `json:"drivesHealing,omitempty"`
// *Optional* +
//
// Health State of the tenant
HealthStatus HealthStatus `json:"healthStatus,omitempty"`
}

// CertificateConfig (`certConfig`) defines controlling attributes associated to any TLS certificate automatically generated by the Operator as part of tenant creation. These fields have no effect if `spec.autoCert: false`.
Expand Down
3 changes: 3 additions & 0 deletions pkg/controller/cluster/main-controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -674,6 +674,9 @@ func (c *Controller) Start(threadiness int, stopCh <-chan struct{}) error {
go wait.Until(c.runWorker, time.Second, stopCh)
}

// Launch a goroutine to monitor all Tenants
go c.recurrentTenantStatusMonitor(stopCh)

return nil
}

Expand Down
Loading

0 comments on commit 138cdec

Please sign in to comment.