Skip to content

Commit

Permalink
Remove fs-error label (#2382)
Browse files Browse the repository at this point in the history
fs-error has a high cardinality and will not be supported by GCM.
So, either we can drop it in gcsfuse or gcs-fuse-csi-driver will have to make
changes to drop the label. fs-error is likely not needed since we
already are classifying the most frequent errors into a new label called
fs-err-category and the other errors are being classified into
appropriate buckets. Moreover, we'd continue to have logs if we need to
get more details about an (infrequent) error. So, we can remove fs-error
label.
  • Loading branch information
kislaykishore authored Aug 23, 2024
1 parent 8288d45 commit f235e75
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 39 deletions.
23 changes: 7 additions & 16 deletions internal/fs/wrappers/monitoring.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ func init() {
Measure: opsErrorCount,
Description: "The cumulative number of errors generated by file system operations",
Aggregation: view.Sum(),
TagKeys: []tag.Key{tags.FSOp, tags.FSError, tags.FSErrCategory},
TagKeys: []tag.Key{tags.FSOp, tags.FSErrCategory},
},
&view.View{
Name: "fs/ops_latency",
Expand All @@ -76,24 +76,17 @@ func init() {
}
}

// errStrAndCategory maps an error to an error string and an error category.
// Uncommon errors are bucketed into categories to reduce the cardinality of the
// error so that the metric is not rejected by Cloud Monarch.
func errStrAndCategory(err error) (str string, category string) {
// categorize maps an error to an error-category.
// This helps reduce the cardinality of the labels to less than 30.
// This lower number of errors allows the various errors to get piped to Cloud metrics without getting dropped.
func categorize(err error) string {
if err == nil {
return "", ""
return ""
}
var errno syscall.Errno
if !errors.As(err, &errno) {
errno = DefaultFSError
}
return errno.Error(), errCategory(errno)
}

// errCategory maps an error to an error-category.
// This helps reduce the cardinality of the labels to less than 30.
// This lower number of errors allows the various errors to get piped to Cloud metrics without getting dropped.
func errCategory(errno syscall.Errno) string {
switch errno {
case syscall.ELNRNG,
syscall.ENODEV,
Expand Down Expand Up @@ -262,7 +255,6 @@ func errCategory(errno syscall.Errno) string {

// Records file system operation count, failed operation count and the operation latency.
func recordOp(ctx context.Context, method string, start time.Time, fsErr error) {

// Recording opCount.
if err := stats.RecordWithTags(
ctx,
Expand All @@ -277,12 +269,11 @@ func recordOp(ctx context.Context, method string, start time.Time, fsErr error)

// Recording opErrorCount.
if fsErr != nil {
errStr, errCategory := errStrAndCategory(fsErr)
errCategory := categorize(fsErr)
if err := stats.RecordWithTags(
ctx,
[]tag.Mutator{
tag.Upsert(tags.FSOp, method),
tag.Upsert(tags.FSError, errStr),
tag.Upsert(tags.FSErrCategory, errCategory),
},
opsErrorCount.M(1),
Expand Down
21 changes: 1 addition & 20 deletions internal/fs/wrappers/monitoring_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,82 +33,66 @@ func TestFsErrStrAndCategory(t *testing.T) {
t.Parallel()
tests := []struct {
fsErr error
expectedStr string
expectedCategory string
}{
{
fsErr: fmt.Errorf("some random error"),
expectedStr: "input/output error",
expectedCategory: "input/output error",
},
{
fsErr: syscall.ENOTEMPTY,
expectedStr: "directory not empty",
expectedCategory: "directory not empty",
},
{
fsErr: syscall.EEXIST,
expectedStr: "file exists",
expectedCategory: "file exists",
},
{
fsErr: syscall.EINVAL,
expectedStr: "invalid argument",
expectedCategory: "invalid argument",
},
{
fsErr: syscall.EINTR,
expectedStr: "interrupted system call",
expectedCategory: "interrupt errors",
},
{
fsErr: syscall.ENOSYS,
expectedStr: "function not implemented",
expectedCategory: "function not implemented",
},
{
fsErr: syscall.ENOSPC,
expectedStr: "no space left on device",
expectedCategory: "process/resource management errors",
},
{
fsErr: syscall.E2BIG,
expectedStr: "argument list too long",
expectedCategory: "invalid operation",
},
{
fsErr: syscall.EHOSTDOWN,
expectedStr: "host is down",
expectedCategory: "network errors",
},
{
fsErr: syscall.ENODATA,
expectedStr: "no data available",
expectedCategory: "miscellaneous errors",
},
{
fsErr: syscall.ENODEV,
expectedStr: "no such device",
expectedCategory: "device errors",
},
{
fsErr: syscall.EISDIR,
expectedStr: "is a directory",
expectedCategory: "file/directory errors",
},
{
fsErr: syscall.ENOSYS,
expectedStr: "function not implemented",
expectedCategory: "function not implemented",
},
{
fsErr: syscall.ENFILE,
expectedStr: "too many open files in system",
expectedCategory: "too many open files",
},
{
fsErr: syscall.EPERM,
expectedStr: "operation not permitted",
expectedCategory: "permission errors",
},
}
Expand All @@ -117,10 +101,7 @@ func TestFsErrStrAndCategory(t *testing.T) {
t.Run(fmt.Sprintf("fsErrStrAndCategor_case_%d", idx), func(t *testing.T) {
t.Parallel()

actualErrStr, actualErrGrp := errStrAndCategory(tc.fsErr)

assert.Equal(t, tc.expectedStr, actualErrStr)
assert.Equal(t, tc.expectedCategory, actualErrGrp)
assert.Equal(t, tc.expectedCategory, categorize(tc.fsErr))
})
}
}
Expand Down
3 changes: 0 additions & 3 deletions internal/monitor/tags/tags.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,6 @@ var (
// FSOp annotates the file system op processed.
FSOp = tag.MustNewKey("fs_op")

// FSError annotates the file system failed operations with the error type
FSError = tag.MustNewKey("fs_error")

// FSErrCategory reduces the cardinality of FSError by grouping errors together.
FSErrCategory = tag.MustNewKey("fs_error_category")

Expand Down

0 comments on commit f235e75

Please sign in to comment.