diff --git a/pkg/pattern/tokenization/replacer.go b/pkg/pattern/tokenization/replacer.go new file mode 100644 index 0000000000000..35d9259236be2 --- /dev/null +++ b/pkg/pattern/tokenization/replacer.go @@ -0,0 +1,859 @@ +package tokenization + +import ( + "bytes" + "slices" +) + +var ( + placeholderNumber = []byte("") + placeholderHex = []byte("") + placeholderUUID = []byte("") + placeholderTimestamp = []byte("") + placeholderDuration = []byte("") + placeholderBytesize = []byte("") + placeholderIP = []byte("") +) + +// Integer numbers after these words won't be replaced by ``. +var numPreservingKeys = [][]byte{[]byte("status"), []byte("status_code"), []byte("httpStatus")} + +const ( + maxYear = 2100 // anything above that is unlikely to be a timestamp... + minHexLen = 12 // 48 bits min for free-standing hex strings (e.g "0123456789ab") or 42 bits for "0xABCDEF1234" strings +) + +var boundaryChars = [256]bool{} + +func init() { + for i := 0; i < 128; i++ { // for now, we don't consider non-ASCII chars as boundary + if i < '0' || (i > '9' && i < 'A') || (i > 'Z' && i < 'a') || i > 'z' { + boundaryChars[i] = true + } + } + + // We need these keys sorted in an ascending length to efficiently compare them. + slices.SortStableFunc(numPreservingKeys, func(a, b []byte) int { + return len(a) - len(b) + }) +} + +type replacer struct { + source, dest []byte + + // This is the last position that was copied to the destination buffer. + tail int + // This is the current position we are working with + cur int + // This is essentially used for lookahed operations + head int + + // Here's some ASCII art to visualize that (the long string of dashes + // visualizes the log line: + // + // 0 `tail` `cur` `head` `len(source)` + // |---------------------------------------------|------------------------------------------|---------------------|------------------------------------------| + + // A somewhat hacky solution that allows us to preserve specific numeric + // values we care about, like status and status_code. + preserveNextNumbers bool +} + +// commit advances the current position marker to the lookahead position, i.e. +// we are committing to consume everything we've looked ahead so far. +func (r *replacer) commit() { + r.cur = r.head +} + +func (r *replacer) resetHead() { + r.head = r.cur +} + +func (r *replacer) resetHeadExpr() bool { + r.head = r.cur + return true // useful when included in boolean expressions with advance() +} + +func (r *replacer) backtrack() { + r.head-- +} + +func (r *replacer) consumeUpToCurrent() { + r.tail = r.cur +} + +// advanceUintRet returns the actual value of the number we read, as well as its +// length. The value might overflow for large numbers, so it's also important to +// check the length! +func (r *replacer) advanceUintRet() (val uint, length uint) { + var c byte + for ; r.head < len(r.source); r.head++ { + c = r.source[r.head] + if c < '0' || '9' < c { + break + } + length++ + val = val*10 + uint(c-'0') // TODO: use bitwise trick? + } + + return val, length +} + +func (r *replacer) advanceUintOrRangeRet(lc, hc byte) (length uint) { + var c byte + for ; r.head < len(r.source); r.head++ { + c = r.source[r.head] + if !(('0' <= c && c <= '9') || (lc <= c && c <= hc)) { + break + } + length++ + } + + return length +} + +func (r *replacer) advanceUint() bool { + _, l := r.advanceUintRet() + return l > 0 +} + +func (r *replacer) advanceUintUpTo(val uint, length uint) bool { + foundVal, foundLength := r.advanceUintRet() + return foundLength > 0 && foundVal <= val && foundLength <= length +} + +func (r *replacer) advanceYear() bool { + return r.advanceUintUpTo(maxYear, 4) +} + +func (r *replacer) advanceChar(c byte) bool { + if r.head < len(r.source) && r.source[r.head] == c { + r.head++ + return true + } + return false +} + +func (r *replacer) peekNextIsBoundary() bool { + if r.head >= len(r.source) { + return true // we are at the end of the line + } + return boundaryChars[r.source[r.head]] +} + +func (r *replacer) peekFirstNonInt() (c byte) { + for i := r.head; i < len(r.source); i++ { + c = r.source[i] + if c < '0' || '9' < c { + return c + } + } + return 0 // we can return the 0 value here! +} + +func (r *replacer) peekNext4() (result [4]byte) { + overhead := len(r.source) - r.head + switch { + case overhead > 3: + result[3] = r.source[r.head+3] + fallthrough + case overhead > 2: + result[2] = r.source[r.head+2] + fallthrough + case overhead > 1: + result[1] = r.source[r.head+1] + fallthrough + case overhead > 0: + result[0] = r.source[r.head+0] + } + return result +} + +func (r *replacer) advanceTimeZoneLetters() bool { + UCLetters := 0 + for { + nc, ok := r.advance() + if !ok { + break + } + if nc < 'A' || nc > 'Z' { + r.backtrack() + break + } + UCLetters++ + } + + return UCLetters >= 2 && UCLetters <= 5 +} + +func (r *replacer) advanceNumericTimeZone() bool { + // See https://en.wikipedia.org/wiki/List_of_tz_database_time_zones + return r.advanceOneOf('+', '-') && r.advanceUintUpTo(14000, 5) +} + +// helper for handleWeirdTimestamp() +func (r *replacer) advanceStringOrNumericTimeZone(isOptional bool) bool { + curHead := r.head + if r.advanceChar(' ') && r.advanceNumericTimeZone() { + return true + } + r.head = curHead + if r.advanceChar(' ') && r.advanceTimeZoneLetters() { + return true + } + r.head = curHead + return isOptional +} + +func (r *replacer) advanceOneOf(chars ...byte) bool { + if r.head >= len(r.source) { + return false + } + c := r.source[r.head] + + for _, ec := range chars { + if c == ec { + r.head++ + return true + } + } + return false +} + +func (r *replacer) advanceTime(secondsOptional bool) bool { + return r.advanceUintUpTo(24, 2) && r.advanceChar(':') && r.advanceUintUpTo(60, 2) && (secondsOptional || (r.advanceChar(':') && r.advanceUintUpTo(60, 2))) +} + +// Mon|Tue|Wed|Thu|Fri|Sat|Sun +func (r *replacer) advanceDayOfTheWeek() bool { + c1, ok1 := r.advance() + c2, ok2 := r.advance() + c3, ok3 := r.advance() + if !ok1 || !ok2 || !ok3 { + return false + } + return (c1 == 'S' && ((c2 == 'a' && c3 == 't') || (c2 == 'u' && c3 == 'n'))) || // Sat, Sun + (c1 == 'M' && c2 == 'o' && c3 == 'n') || + (c1 == 'T' && c2 == 'u' && c3 == 'e') || + (c1 == 'W' && c2 == 'e' && c3 == 'd') || + (c1 == 'T' && c2 == 'h' && c3 == 'u') || + (c1 == 'F' && c2 == 'r' && c3 == 'i') +} + +// Jan|Feb|Mar|Apr|May|Jul|Jun|Aug|Sep|Oct|Nov|Dec +func (r *replacer) advanceMonthName() bool { + c1, ok1 := r.advance() + c2, ok2 := r.advance() + c3, ok3 := r.advance() + if !ok1 || !ok2 || !ok3 { + return false + } + + return (c1 == 'J' && ((c2 == 'u' && (c3 == 'n' || c3 == 'l')) || // Jun, Jul + (c2 == 'a' && c3 == 'n'))) || // Jan + (c1 == 'M' && c2 == 'a' && (c3 == 'r' || c3 == 'y')) || // Mar, May + (c2 == 'e' && ((c1 == 'F' && c3 == 'b') || (c1 == 'S' && c3 == 'p') || (c1 == 'D' && c3 == 'c'))) || // Feb, Sep, Dec + (c1 == 'A' && ((c2 == 'p' && c3 == 'r') || // Apr + (c2 == 'u' && c3 == 'g'))) || // Aug + (c1 == 'O' && c2 == 'c' && c3 == 't') || // Oct + (c1 == 'N' && c2 == 'o' && c3 == 'v') // Nov +} + +// Check if we in the middle of an UUID, exactly after the first 8 characters +// and the dash after them, e.g after "abcd0123-": +func (r *replacer) advanceUUIDAfterFirstDash(lc, hc byte) (endsWithBoundary bool) { + return (r.advanceUintOrRangeRet(lc, hc) == 4) && r.advanceChar('-') && + (r.advanceUintOrRangeRet(lc, hc) == 4) && r.advanceChar('-') && + (r.advanceUintOrRangeRet(lc, hc) == 4) && r.advanceChar('-') && + (r.advanceUintOrRangeRet(lc, hc) == 12) && r.peekNextIsBoundary() +} + +// Only moves the head forward if it successfully matches a duration +func (r *replacer) advanceDuration() (matched bool) { + curHead := r.head + + var secsLen int + n := r.peekNext4() + if n[0] == 'h' { + r.head++ + if boundaryChars[n[1]] { + return true // e.g. "1h", "123h" + } + if !r.advanceUintUpTo(60, 2) { + goto restore + } + n = r.peekNext4() + } + if n[0] == 'm' && (boundaryChars[n[1]] || n[1] != 's') { // we don't want to match 'ms' here + r.head++ + if boundaryChars[n[1]] { + return true // e.g. "2h21m", "121m" + } + if !(r.advanceUintUpTo(60, 2) && ((r.advanceChar('.') && r.advanceUint()) || true)) { + goto restore + } + n = r.peekNext4() + } + + if n[0] == 's' && boundaryChars[n[1]] { + secsLen = 1 + } else if n[1] == 's' && (n[0] == 'm' || n[0] == 'n' || n[0] == 'u') && boundaryChars[n[2]] { + secsLen = 2 + } else if n[2] == 's' && ((n[0] == 0xC2 && n[1] == 0xB5) || (n[0] == 0xCE && n[1] == 0xBC)) && boundaryChars[n[3]] { + // This checks for the unicode "µs" (U+00B5 = micro symbol) and "μs" (U+03BC = Greek letter mu) + secsLen = 3 + } else { + goto restore + } + + r.head += secsLen + return true + +restore: // should be faster than a defer + r.head = curHead + return false +} + +var byteSizes = [256]bool{'k': true, 'K': true, 'm': true, 'M': true, 'g': true, 'G': true, 't': true, 'T': true, 'p': true, 'P': true} + +// Only moves the head forward if it successfully matches a duration +func (r *replacer) advanceBytesize(c1 byte) (matched bool) { + if !byteSizes[c1] { + return false + } + n := r.peekNext4() + + var unitLen int // not counting the first character c1, which is already advanced to + if (n[0] == 'b' || n[0] == 'B') && boundaryChars[n[1]] { + unitLen = 1 + } else if n[0] == 'i' && (n[1] == 'b' || n[1] == 'B') && boundaryChars[n[2]] { + unitLen = 2 + } else if ((n[0] == 'b' && n[1] == 'p' && n[2] == 's') || (n[0] == 'b' && n[1] == 'i' && n[2] == 't')) && boundaryChars[n[3]] { + unitLen = 3 + } + + if unitLen > 0 { + r.head += unitLen + return true + } + return false +} + +func (r *replacer) advance() (c byte, advanced bool) { + if r.head >= len(r.source) { + return 0, false + } + + c = r.source[r.head] + r.head++ + return c, true +} + +func (r *replacer) emitNumber() { + r.commit() + r.dest = append(r.dest, placeholderNumber...) + r.consumeUpToCurrent() +} + +func (r *replacer) emitNumberOrCopyText(hasMinusPrefix bool) { + r.commit() + if !r.preserveNextNumbers { + r.dest = append(r.dest, placeholderNumber...) + r.consumeUpToCurrent() + } else { + r.maybeEmitDash(hasMinusPrefix) + r.copyUpToCurrent() + } +} + +func (r *replacer) emit(hasMinusPrefix bool, placeholder []byte) { + r.commit() + r.maybeEmitDash(hasMinusPrefix) + r.dest = append(r.dest, placeholder...) + r.consumeUpToCurrent() +} + +func (r *replacer) maybeEmitDash(hasMinusPrefix bool) { + // This minus was actually a dash, so we just copy it to the result + if hasMinusPrefix { + r.dest = append(r.dest, '-') + } +} + +func (r *replacer) copyUpToCurrent() { + r.dest = append(r.dest, r.source[r.tail:r.cur]...) + r.consumeUpToCurrent() +} + +func (r *replacer) handleHexOrUnit(hasMinusPrefix bool, n1, l1 uint, c1 byte) (endsWithBoundary bool) { + // Special case that is likely a hex string of the format "0x", but we don't + // know whether the rest is upper case or lower case yet. + zeroHex := false + if n1 == 0 && l1 == 1 && c1 == 'x' { + zeroHex = true // these cannot be the start of an UUID + c1 = r.peekFirstNonInt() + } + + // Maybe we are at the start of a hex string, either something like + // "[0-9]+[a-f]", "[0-9]+[A-F]", or "0x". We support both lower and upper + // case letters, but to avoid false positives, we want hex replacements to + // happen only if the strings are fully lower case or fully upper case. + if 'a' <= c1 && c1 <= 'f' { + return r.handleHex(hasMinusPrefix, l1+1, 'a', 'f', !zeroHex) + } else if 'A' <= c1 && c1 <= 'F' { + return r.handleHex(hasMinusPrefix, l1+1, 'A', 'F', !zeroHex) + } else if zeroHex { + // Well, it probably wasn't a zero-hex after all, or it contained only + // digits, so try to handle that or emit what we absorbed + _, l2 := r.advanceUintRet() + if l2+2 >= minHexLen { // We consider "0x" to be part of the hex string when comparing with minHexLen + r.emit(hasMinusPrefix, placeholderHex) + } else { + r.maybeEmitDash(hasMinusPrefix) + r.commit() + r.copyUpToCurrent() + } + return r.peekNextIsBoundary() + } + + return r.handlePotentialUnitWithDecimal(hasMinusPrefix, c1) +} + +func (r *replacer) handleHex(hasMinusPrefix bool, l1 uint, lc, hc byte, canBeUUID bool) (endsWithBoundary bool) { + totalLen := l1 + r.advanceUintOrRangeRet(lc, hc) + r.commit() + + if totalLen == 8 && canBeUUID { + // We might be at the first part of a UUID, right before the first dash + if r.advanceChar('-') && r.advanceUUIDAfterFirstDash(lc, hc) { + r.emit(hasMinusPrefix, placeholderUUID) + return true + } + r.resetHead() + } + + if totalLen >= minHexLen && r.peekNextIsBoundary() { + r.emit(hasMinusPrefix, placeholderHex) + return true + } + + r.copyUpToCurrent() + return r.peekNextIsBoundary() +} + +func (r *replacer) handlePotentialUnitWithDecimal(hasMinusPrefix bool, c1 byte) (endsWithBoundary bool) { + if r.advanceBytesize(c1) { + // We do not subsume a minus sign - byte sizes are unlikely to be + // negative, it's more likely this is a dash as a part of a range + r.emit(hasMinusPrefix, placeholderBytesize) + return true + } + + r.backtrack() + if r.advanceDuration() { + // We subsume hasMinusPrefix, since durations can be negative + r.emit(false, placeholderDuration) + return true + } + + // We couldn't match anything, so just copy what existed. + r.maybeEmitDash(hasMinusPrefix) + r.copyUpToCurrent() + return false +} + +func (r *replacer) handleNumberWithDecimal(hasMinusPrefix bool, n1 uint, l1 uint) (endsWithBoundary bool) { + n2, l2 := r.advanceUintRet() + if l2 == 0 { + // The dot wasn't followed by another number, so emit everything before it. + r.backtrack() + r.emitNumberOrCopyText(hasMinusPrefix) + return false + } + + // See if the number after the decimal is also followed by a boundary + b2, hasNext := r.advance() + + // We are at the end of the string, which is a boundary, replace evertything + // up to now with a number + if !hasNext { + r.emitNumber() // this also subsumes any minus sign we had + return true + } + + // The decimal number isn't followed by a boundary char (which include + // things like '.', ':', '/', etc.), so the part after the decimal is either + // not a real number, or it's some sort of a unit that can support decimals + // like size (e.g. 12KB, 3MiB) or durations (e.g. 3.5124s), etc. + if !boundaryChars[b2] { + return r.handlePotentialUnitWithDecimal(hasMinusPrefix, b2) + } + + // We have a decimal number followed by a non-dot boundary, so this is not + // an IP or a version number or anything like that. + if b2 != '.' { + r.backtrack() + r.emitNumber() + return true + } + + n3, l3 := r.advanceUintRet() + if l3 == 0 || !r.peekNextIsBoundary() { + // The second dot wasn't followed by another number and a boundary, so + // emit just the first number. + r.resetHead() + r.emitNumber() + return true + } + + // We have ".." at this point, so we either have something + // like a version number, or an IP address, but certainly not a simple + // decimal number we can just emit. + r.commit() + + // Check if this is an IP address... + if n1 <= 255 && l1 <= 3 && n2 <= 255 && l2 <= 3 && n3 <= 255 && l3 <= 3 && r.advanceChar('.') && r.advanceUintUpTo(255, 3) && r.peekNextIsBoundary() { + r.emit(hasMinusPrefix, placeholderIP) + return true + } + + // This wasn't an IP after all, so just emit "..". We don't + // want to assume this is a simple decimal number because of the second dot, + // e.g. this might be something like a version number. + r.resetHead() + r.maybeEmitDash(hasMinusPrefix) // we preserve the dashes here, since it's likely not a minus + r.dest = append(r.dest, placeholderNumber...) + r.dest = append(r.dest, '.') + r.dest = append(r.dest, placeholderNumber...) + r.dest = append(r.dest, '.') + r.dest = append(r.dest, placeholderNumber...) + r.consumeUpToCurrent() + return true +} + +func (r *replacer) handleSaneTimestamp(hasMinusPrefix bool, n1 uint, delim byte) (endsWithBoundary bool) { + if r.advanceUintUpTo(12, 2) && r.advanceChar(delim) && r.advanceUintUpTo(31, 2) && r.advanceOneOf('T', ' ') && r.advanceTime(false) { + r.commit() + // continue down to parsing sub-second and timezone values + } else if r.resetHeadExpr() && n1 <= 31 && r.advanceChar(delim) && r.advanceMonthName() && r.advanceChar(delim) && r.advanceYear() && r.advanceChar(':') && r.advanceTime(false) && r.advanceChar(' ') && r.advanceNumericTimeZone() && r.peekNextIsBoundary() { + // We might not have a sane timestamp, but apparently a not-so-sane + // timestamp format first, e.g. "27/Mar/2024:14:34:37 +0000"... + r.emit(hasMinusPrefix, placeholderTimestamp) + return true + } else { + // Apparently that wasn't it either, we probably just have a dash or + // forward slash after a number, so we backtrack and emit the number. + r.resetHead() + r.emitNumberOrCopyText(hasMinusPrefix) + return true + } + + // We have a date that looks like `YY[YY]-MM-DD hh:mm:ss` or + // `YY[YY]/MM/DDZhh:mm:ss` + r.commit() // we want to keep this + + // Now we need to also check for sub-second precision and time zones: + c, canAdvance := r.advance() + if !canAdvance { + // End of the string + r.emit(hasMinusPrefix, placeholderTimestamp) + return true + } + if c == '.' { + _, intl := r.advanceUintRet() + if intl == 0 { + // No sub-second precision, the dot was not part of the timestamp + r.backtrack() + r.emit(hasMinusPrefix, placeholderTimestamp) + return true + } + + // We had sub-second precision, capture that too + r.commit() + + // Get the next char to see if we have time zone + c, canAdvance = r.advance() + if !canAdvance { + // We are at the end of the sting after we captured the + // sub-second value. + r.emit(hasMinusPrefix, placeholderTimestamp) + return true + } + } + + if c == 'Z' || c == 'z' { + //UTC string, nothing to do after that + r.emit(hasMinusPrefix, placeholderTimestamp) + return true + } + // See https://en.wikipedia.org/wiki/List_of_tz_database_time_zones + if (c == '+' || c == '-') && r.advanceUintUpTo(14, 2) && r.advanceChar(':') && r.advanceUintUpTo(60, 2) { + // e.g. "2020-09-30T00:00:59.9999+03:00" + r.commit() + } else if r.resetHeadExpr() && r.advanceChar(' ') && r.advanceNumericTimeZone() && r.advanceChar(' ') && r.advanceTimeZoneLetters() && r.peekNextIsBoundary() { + // e.g. "2023-09-05 23:20:28.030285153 +0000 UTC" + r.commit() + } else { + r.resetHead() + } + r.emit(hasMinusPrefix, placeholderTimestamp) + return true +} + +func (r *replacer) handleNumberStart(hasMinusPrefix bool) (endsWithBoundary bool) { + r.resetHead() // keep the head pos in sync with the current pos + // We know we were at a digit due to how handleNumber() is called + n1, l1 := r.advanceUintRet() + r.commit() // we will consume this one way or another for sure + + // See if the number is followed by a boundary + b1, hasNext := r.advance() + + switch { + // We are at the end of the string, which is a boundary, replace evertything + // up to now with a number + case !hasNext: + r.emitNumberOrCopyText(hasMinusPrefix) // this also may subsume any minus sign we had + return true + + // The number isn't followed by a boundary char (which include things like + // '.', ' ', '/', etc.), so it's either not a real number or date, or it's + // some sort of a unit like a duration (e.g. 3h5m2s), size, (e.g. 12KB, + // 3MiB), etc. + case !boundaryChars[b1]: + return r.handleHexOrUnit(hasMinusPrefix, n1, l1, b1) + + // We have a decimal point, so this can either be a plain number, a unit + // that can have a float value, or an IP address. + case b1 == '.': + return r.handleNumberWithDecimal(hasMinusPrefix, n1, l1) + + // This might be a timestamp that looks like '2024-04-01...' or + // `2024/03/27...`; timestamps in the far future are not supported :) + case n1 <= maxYear && l1 <= 4 && (b1 == '-' || b1 == '/'): + return r.handleSaneTimestamp(hasMinusPrefix, n1, b1) + + // Weird RFC822 dates like "02 Jan 06 15:04 MST" + case n1 <= 31 && l1 <= 2 && b1 == ' ': + if r.advanceMonthName() && r.advanceChar(' ') && r.advanceYear() && r.advanceChar(' ') && r.advanceTime(true) && r.advanceStringOrNumericTimeZone(false) { + r.commit() + r.emit(hasMinusPrefix, placeholderTimestamp) + return true + } + // if not, go to default handler after switch statement + + // It could be a UUID that starts with 8 digits + case l1 == 8 && b1 == '-': + if r.advanceUUIDAfterFirstDash('a', 'f') || (r.resetHeadExpr() && r.advanceChar('-') && r.advanceUUIDAfterFirstDash('A', 'F')) { + r.emit(hasMinusPrefix, placeholderUUID) + return true + } + // if not, go to default handler after switch statement + } + + // Number with an unknown boundary - emit the number and leave the boundary + // for the following passes. + r.resetHead() + r.emitNumberOrCopyText(hasMinusPrefix) + return true +} + +var longDayNames = [...][]byte{ + []byte("Sunday"), + []byte("Monday"), + []byte("Tuesday"), + []byte("Wednesday"), + []byte("Thursday"), + []byte("Friday"), + []byte("Saturday"), +} + +func (r *replacer) handleWeirdTimestamp() (endsWithBoundary bool) { + r.resetHead() + + if r.advanceDayOfTheWeek() { + r.commit() // we will always consume this + + // RFC1123 and RFC1123Z, e.g.: + // - "Mon, 02 Jan 2006 15:04:05 MST" + // - "Mon, 02 Jan 2006 15:04:05 -0700" + if r.advanceChar(',') && r.advanceChar(' ') && r.advanceUintUpTo(31, 2) && r.advanceChar(' ') && r.advanceMonthName() && r.advanceChar(' ') && r.advanceYear() && r.advanceChar(' ') && r.advanceTime(false) && r.advanceStringOrNumericTimeZone(false) { + r.emit(false, placeholderTimestamp) + return true + } + r.resetHead() + + // ANSIC, UnixDatem, RubyDate e.g + // - "Mon Jan 2 15:04:05 2006" + // - "Mon Jan 2 15:04:05 MST 2006" + // - "Mon Jan 02 15:04:05 -0700 2006" + if r.advanceChar(' ') && r.advanceMonthName() && r.advanceChar(' ') && (r.advanceChar(' ') || true) && r.advanceUintUpTo(31, 2) && r.advanceChar(' ') && r.advanceTime(false) && r.advanceStringOrNumericTimeZone(true) && r.advanceChar(' ') && r.advanceYear() { + r.emit(false, placeholderTimestamp) + return true + } + r.resetHead() + + // Linux, e.g. + // - "Mon 2 Jan 15:04:05 MST 2006" + // - "Tue 23 Jan 15:04:05 -0700 2023" + if r.advanceChar(' ') && (r.advanceChar(' ') || true) && r.advanceUintUpTo(31, 2) && r.advanceChar(' ') && r.advanceMonthName() && r.advanceChar(' ') && r.advanceTime(false) && r.advanceStringOrNumericTimeZone(false) && r.advanceChar(' ') && r.advanceYear() { + r.emit(false, placeholderTimestamp) + return true + } + r.resetHead() + + // RFC850, e.g. + // - "Monday, 02-Jan-06 15:04:05 MST" + backtrackedSlice := r.source[r.head-3:] + var matchedDay []byte + for _, dw := range longDayNames { + if bytes.HasPrefix(backtrackedSlice, dw) { + matchedDay = dw + break + } + } + if matchedDay != nil { + r.head += len(matchedDay) - 3 + if r.advanceChar(',') && r.advanceChar(' ') && r.advanceUintUpTo(31, 2) && r.advanceChar('-') && r.advanceMonthName() && r.advanceChar('-') && r.advanceUintUpTo(99, 2) && r.advanceChar(' ') && r.advanceTime(false) && r.advanceStringOrNumericTimeZone(true) { + r.emit(false, placeholderTimestamp) + return true + } + } + + r.cur -= 3 // unconsume + r.resetHead() + return false + } + r.resetHead() + + if r.advanceMonthName() { + r.commit() // provisionally consume this + + // Linux journald logs and others similar like this: + // - Feb 29 23:00:14 + // - Apr-10 23:43:46.807 + // - Jul 1 00:21:28 + if (r.advanceChar('-') || (r.advanceChar(' ') && (r.advanceChar(' ') || true))) && r.advanceUintUpTo(31, 2) && r.advanceChar(' ') && r.advanceTime(false) { + r.commit() + // This is already a timestamp, but let's try to match subseconds as well + if r.advanceChar('.') && r.advanceUint() { + r.commit() + } else { + r.resetHead() + } + r.emit(false, placeholderTimestamp) + return true + } + r.cur -= 3 // unconsume + r.resetHead() + return false + } + r.resetHead() + + return false +} + +func (r *replacer) wasNumPreservingKeyword() bool { + for _, key := range numPreservingKeys { + pPos := r.cur - 1 - len(key) + if pPos < -1 { + return false // all subsequent keys are longer + } + if pPos != -1 && !boundaryChars[r.source[pPos]] { + continue + } + if bytes.HasPrefix(r.source[pPos+1:], key) { + return true + } + } + return false +} + +func (r *replacer) replaceWithPlaceholders() { + lineLen := len(r.source) + + var c byte + onBoundary := true + for r.cur = 0; r.cur < lineLen; r.cur++ { + c = r.source[r.cur] + switch { + // If we're currently not at a boundary, the only thing we need to check + // is whether the current char would create a boundary in the next iteration. + case !onBoundary: + onBoundary = boundaryChars[c] + + // We weren't at a boundary and now we are, so check if we match one + // of the special keywords that will preserve numbers + if onBoundary { + r.preserveNextNumbers = r.wasNumPreservingKeyword() + } + + // If we've reached this far, it means we're currently at a boundary! + + // A lot of very complex logic if we encounter a number at a boundary, + // so we move that to a helper function. + case '0' <= c && c <= '9': + r.copyUpToCurrent() + onBoundary = r.handleNumberStart(false) + + // Handle negative numbers, potentially + case c == '-': + next := r.cur + 1 + // This might be a number, a date, an IP address, etc. So we don't + // know if this is a minus sign to replace or a dash to copy yet. + if next < lineLen && '0' <= r.source[next] && r.source[next] <= '9' { + // Copy everything before the dash, but mark it as consumed. + r.copyUpToCurrent() + r.cur++ + r.consumeUpToCurrent() + onBoundary = r.handleNumberStart(true) + } else { + onBoundary = true + } + + // Try to match weird timestamps. They require a lot of remaining + // length, generally start with a capitalized day of the week or month + // name (1 upper case letter followed by 2 lower case letters). + // + // We are basically looking for something that may match this here: + // Mon|Tue|Wed|Thu|Fri|Sat|Sun|Jan|Feb|Mar|Apr|May|Jul|Jun|Aug|Sep|Oct|Nov|Dec + // + // The detailed check would be performed by the actual handler: + case 'A' <= c && c <= 'W' && lineLen-r.cur >= 14 && + 'a' <= r.source[r.cur+1] && r.source[r.cur+1] <= 'u' && + 'b' <= r.source[r.cur+2] && r.source[r.cur+2] <= 'y': + r.copyUpToCurrent() + onBoundary = r.handleWeirdTimestamp() + + // This could be the start of an lower case hex string: + case 'a' <= c && c <= 'f': + r.copyUpToCurrent() + r.resetHead() + onBoundary = r.handleHex(false, 0, 'a', 'f', true) + + // This could be the start of an upper case hex string: + case 'A' <= c && c <= 'F': + r.copyUpToCurrent() + r.resetHead() + onBoundary = r.handleHex(false, 0, 'A', 'F', true) + + // If we haven't actually matched anything, update whether we're still + // on a boundary character and continue onto the next one. + default: + onBoundary = boundaryChars[c] + } + } + + if r.cur > r.tail { + r.dest = append(r.dest, r.source[r.tail:]...) + r.consumeUpToCurrent() + } +} + +func Preprocess(content []byte) []byte { + // ~floor(120%), to allow for some expansion from replacements, hopefully + // without needing to allocate more memory + r := replacer{source: content, dest: make([]byte, 0, len(content)*120/100)} + r.replaceWithPlaceholders() + return r.dest +} diff --git a/pkg/pattern/tokenization/tokenization.go b/pkg/pattern/tokenization/tokenization.go index feb58040d4b19..d4918ccb8e048 100644 --- a/pkg/pattern/tokenization/tokenization.go +++ b/pkg/pattern/tokenization/tokenization.go @@ -2,855 +2,15 @@ package tokenization import ( "bytes" - "slices" "unsafe" ) -var ( - placeholderNumber = []byte("") - placeholderHex = []byte("") - placeholderUUID = []byte("") - placeholderTimestamp = []byte("") - placeholderDuration = []byte("") - placeholderBytesize = []byte("") - placeholderIP = []byte("") -) - -// Integer numbers after these words won't be replaced by ``. -var numPreservingKeys = [][]byte{[]byte("status"), []byte("status_code"), []byte("httpStatus")} - -const ( - placeholderEndOfLine = "<...>" - maxYear = 2100 // anything above that is unlikely to be a timestamp... - minHexLen = 12 // 48 bits min for free-standing hex strings (e.g "0123456789ab") or 42 bits for "0xABCDEF1234" strings -) - -var boundaryChars = [256]bool{} - -func init() { - for i := 0; i < 128; i++ { // for now, we don't consider non-ASCII chars as boundary - if i < '0' || (i > '9' && i < 'A') || (i > 'Z' && i < 'a') || i > 'z' { - boundaryChars[i] = true - } - } - - // We need these keys sorted in an ascending length to efficiently compare them. - slices.SortStableFunc(numPreservingKeys, func(a, b []byte) int { - return len(a) - len(b) - }) -} - -type replacer struct { - source, dest []byte - - // This is the last position that was copied to the destination buffer. - tail int - // This is the current position we are working with - cur int - // This is essentially used for lookahed operations - head int - - // Here's some ASCII art to visualize that (the long string of dashes - // visualizes the log line: - // - // 0 `tail` `cur` `head` `len(source)` - // |---------------------------------------------|------------------------------------------|---------------------|------------------------------------------| - - // A somewhat hacky solution that allows us to preserve specific numeric - // values we care about, like status and status_code. - preserveNextNumbers bool -} - -// commit advances the current position marker to the lookahead position, i.e. -// we are committing to consume everything we've looked ahead so far. -func (r *replacer) commit() { - r.cur = r.head -} - -func (r *replacer) resetHead() { - r.head = r.cur -} - -func (r *replacer) resetHeadExpr() bool { - r.head = r.cur - return true // useful when included in boolean expressions with advance() -} - -func (r *replacer) backtrack() { - r.head-- -} - -func (r *replacer) consumeUpToCurrent() { - r.tail = r.cur -} - -// advanceUintRet returns the actual value of the number we read, as well as its -// length. The value might overflow for large numbers, so it's also important to -// check the length! -func (r *replacer) advanceUintRet() (val uint, length uint) { - var c byte - for ; r.head < len(r.source); r.head++ { - c = r.source[r.head] - if c < '0' || '9' < c { - break - } - length++ - val = val*10 + uint(c-'0') // TODO: use bitwise trick? - } - - return val, length -} - -func (r *replacer) advanceUintOrRangeRet(lc, hc byte) (length uint) { - var c byte - for ; r.head < len(r.source); r.head++ { - c = r.source[r.head] - if !(('0' <= c && c <= '9') || (lc <= c && c <= hc)) { - break - } - length++ - } - - return length -} - -func (r *replacer) advanceUint() bool { - _, l := r.advanceUintRet() - return l > 0 -} - -func (r *replacer) advanceUintUpTo(val uint, length uint) bool { - foundVal, foundLength := r.advanceUintRet() - return foundLength > 0 && foundVal <= val && foundLength <= length -} - -func (r *replacer) advanceYear() bool { - return r.advanceUintUpTo(maxYear, 4) -} - -func (r *replacer) advanceChar(c byte) bool { - if r.head < len(r.source) && r.source[r.head] == c { - r.head++ - return true - } - return false -} - -func (r *replacer) peekNextIsBoundary() bool { - if r.head >= len(r.source) { - return true // we are at the end of the line - } - return boundaryChars[r.source[r.head]] -} - -func (r *replacer) peekFirstNonInt() (c byte) { - for i := r.head; i < len(r.source); i++ { - c = r.source[i] - if c < '0' || '9' < c { - return c - } - } - return 0 // we can return the 0 value here! -} - -func (r *replacer) peekNext4() (result [4]byte) { - overhead := len(r.source) - r.head - switch { - case overhead > 3: - result[3] = r.source[r.head+3] - fallthrough - case overhead > 2: - result[2] = r.source[r.head+2] - fallthrough - case overhead > 1: - result[1] = r.source[r.head+1] - fallthrough - case overhead > 0: - result[0] = r.source[r.head+0] - } - return result -} - -func (r *replacer) advanceTimeZoneLetters() bool { - UCLetters := 0 - for { - nc, ok := r.advance() - if !ok { - break - } - if nc < 'A' || nc > 'Z' { - r.backtrack() - break - } - UCLetters++ - } - - return UCLetters >= 2 && UCLetters <= 5 -} - -func (r *replacer) advanceNumericTimeZone() bool { - // See https://en.wikipedia.org/wiki/List_of_tz_database_time_zones - return r.advanceOneOf('+', '-') && r.advanceUintUpTo(14000, 5) -} +const placeholderEndOfLine = "<...>" -// helper for handleWeirdTimestamp() -func (r *replacer) advanceStringOrNumericTimeZone(isOptional bool) bool { - curHead := r.head - if r.advanceChar(' ') && r.advanceNumericTimeZone() { - return true - } - r.head = curHead - if r.advanceChar(' ') && r.advanceTimeZoneLetters() { - return true - } - r.head = curHead - return isOptional -} - -func (r *replacer) advanceOneOf(chars ...byte) bool { - if r.head >= len(r.source) { - return false - } - c := r.source[r.head] - - for _, ec := range chars { - if c == ec { - r.head++ - return true - } - } - return false -} - -func (r *replacer) advanceTime(secondsOptional bool) bool { - return r.advanceUintUpTo(24, 2) && r.advanceChar(':') && r.advanceUintUpTo(60, 2) && (secondsOptional || (r.advanceChar(':') && r.advanceUintUpTo(60, 2))) -} - -// Mon|Tue|Wed|Thu|Fri|Sat|Sun -func (r *replacer) advanceDayOfTheWeek() bool { - c1, ok1 := r.advance() - c2, ok2 := r.advance() - c3, ok3 := r.advance() - if !ok1 || !ok2 || !ok3 { - return false - } - return (c1 == 'S' && ((c2 == 'a' && c3 == 't') || (c2 == 'u' && c3 == 'n'))) || // Sat, Sun - (c1 == 'M' && c2 == 'o' && c3 == 'n') || - (c1 == 'T' && c2 == 'u' && c3 == 'e') || - (c1 == 'W' && c2 == 'e' && c3 == 'd') || - (c1 == 'T' && c2 == 'h' && c3 == 'u') || - (c1 == 'F' && c2 == 'r' && c3 == 'i') -} - -// Jan|Feb|Mar|Apr|May|Jul|Jun|Aug|Sep|Oct|Nov|Dec -func (r *replacer) advanceMonthName() bool { - c1, ok1 := r.advance() - c2, ok2 := r.advance() - c3, ok3 := r.advance() - if !ok1 || !ok2 || !ok3 { - return false - } - - return (c1 == 'J' && ((c2 == 'u' && (c3 == 'n' || c3 == 'l')) || // Jun, Jul - (c2 == 'a' && c3 == 'n'))) || // Jan - (c1 == 'M' && c2 == 'a' && (c3 == 'r' || c3 == 'y')) || // Mar, May - (c2 == 'e' && ((c1 == 'F' && c3 == 'b') || (c1 == 'S' && c3 == 'p') || (c1 == 'D' && c3 == 'c'))) || // Feb, Sep, Dec - (c1 == 'A' && ((c2 == 'p' && c3 == 'r') || // Apr - (c2 == 'u' && c3 == 'g'))) || // Aug - (c1 == 'O' && c2 == 'c' && c3 == 't') || // Oct - (c1 == 'N' && c2 == 'o' && c3 == 'v') // Nov -} - -// Check if we in the middle of an UUID, exactly after the first 8 characters -// and the dash after them, e.g after "abcd0123-": -func (r *replacer) advanceUUIDAfterFirstDash(lc, hc byte) (endsWithBoundary bool) { - return (r.advanceUintOrRangeRet(lc, hc) == 4) && r.advanceChar('-') && - (r.advanceUintOrRangeRet(lc, hc) == 4) && r.advanceChar('-') && - (r.advanceUintOrRangeRet(lc, hc) == 4) && r.advanceChar('-') && - (r.advanceUintOrRangeRet(lc, hc) == 12) && r.peekNextIsBoundary() -} - -// Only moves the head forward if it successfully matches a duration -func (r *replacer) advanceDuration() (matched bool) { - curHead := r.head - - var secsLen int - n := r.peekNext4() - if n[0] == 'h' { - r.head++ - if boundaryChars[n[1]] { - return true // e.g. "1h", "123h" - } - if !r.advanceUintUpTo(60, 2) { - goto restore - } - n = r.peekNext4() - } - if n[0] == 'm' && (boundaryChars[n[1]] || n[1] != 's') { // we don't want to match 'ms' here - r.head++ - if boundaryChars[n[1]] { - return true // e.g. "2h21m", "121m" - } - if !(r.advanceUintUpTo(60, 2) && ((r.advanceChar('.') && r.advanceUint()) || true)) { - goto restore - } - n = r.peekNext4() - } - - if n[0] == 's' && boundaryChars[n[1]] { - secsLen = 1 - } else if n[1] == 's' && (n[0] == 'm' || n[0] == 'n' || n[0] == 'u') && boundaryChars[n[2]] { - secsLen = 2 - } else if n[2] == 's' && ((n[0] == 0xC2 && n[1] == 0xB5) || (n[0] == 0xCE && n[1] == 0xBC)) && boundaryChars[n[3]] { - // This checks for the unicode "µs" (U+00B5 = micro symbol) and "μs" (U+03BC = Greek letter mu) - secsLen = 3 - } else { - goto restore - } - - r.head += secsLen - return true - -restore: // should be faster than a defer - r.head = curHead - return false -} - -var byteSizes = [256]bool{'k': true, 'K': true, 'm': true, 'M': true, 'g': true, 'G': true, 't': true, 'T': true, 'p': true, 'P': true} - -// Only moves the head forward if it successfully matches a duration -func (r *replacer) advanceBytesize(c1 byte) (matched bool) { - if !byteSizes[c1] { - return false - } - n := r.peekNext4() - - var unitLen int // not counting the first character c1, which is already advanced to - if (n[0] == 'b' || n[0] == 'B') && boundaryChars[n[1]] { - unitLen = 1 - } else if n[0] == 'i' && (n[1] == 'b' || n[1] == 'B') && boundaryChars[n[2]] { - unitLen = 2 - } else if ((n[0] == 'b' && n[1] == 'p' && n[2] == 's') || (n[0] == 'b' && n[1] == 'i' && n[2] == 't')) && boundaryChars[n[3]] { - unitLen = 3 - } - - if unitLen > 0 { - r.head += unitLen - return true - } - return false -} - -func (r *replacer) advance() (c byte, advanced bool) { - if r.head >= len(r.source) { - return 0, false - } - - c = r.source[r.head] - r.head++ - return c, true -} - -func (r *replacer) emitNumber() { - r.commit() - r.dest = append(r.dest, placeholderNumber...) - r.consumeUpToCurrent() -} - -func (r *replacer) emitNumberOrCopyText(hasMinusPrefix bool) { - r.commit() - if !r.preserveNextNumbers { - r.dest = append(r.dest, placeholderNumber...) - r.consumeUpToCurrent() - } else { - r.maybeEmitDash(hasMinusPrefix) - r.copyUpToCurrent() - } -} - -func (r *replacer) emit(hasMinusPrefix bool, placeholder []byte) { - r.commit() - r.maybeEmitDash(hasMinusPrefix) - r.dest = append(r.dest, placeholder...) - r.consumeUpToCurrent() -} - -func (r *replacer) maybeEmitDash(hasMinusPrefix bool) { - // This minus was actually a dash, so we just copy it to the result - if hasMinusPrefix { - r.dest = append(r.dest, '-') - } -} - -func (r *replacer) copyUpToCurrent() { - r.dest = append(r.dest, r.source[r.tail:r.cur]...) - r.consumeUpToCurrent() -} - -func (r *replacer) handleHexOrUnit(hasMinusPrefix bool, n1, l1 uint, c1 byte) (endsWithBoundary bool) { - // Special case that is likely a hex string of the format "0x", but we don't - // know whether the rest is upper case or lower case yet. - zeroHex := false - if n1 == 0 && l1 == 1 && c1 == 'x' { - zeroHex = true // these cannot be the start of an UUID - c1 = r.peekFirstNonInt() - } - - // Maybe we are at the start of a hex string, either something like - // "[0-9]+[a-f]", "[0-9]+[A-F]", or "0x". We support both lower and upper - // case letters, but to avoid false positives, we want hex replacements to - // happen only if the strings are fully lower case or fully upper case. - if 'a' <= c1 && c1 <= 'f' { - return r.handleHex(hasMinusPrefix, l1+1, 'a', 'f', !zeroHex) - } else if 'A' <= c1 && c1 <= 'F' { - return r.handleHex(hasMinusPrefix, l1+1, 'A', 'F', !zeroHex) - } else if zeroHex { - // Well, it probably wasn't a zero-hex after all, or it contained only - // digits, so try to handle that or emit what we absorbed - _, l2 := r.advanceUintRet() - if l2+2 >= minHexLen { // We consider "0x" to be part of the hex string when comparing with minHexLen - r.emit(hasMinusPrefix, placeholderHex) - } else { - r.maybeEmitDash(hasMinusPrefix) - r.commit() - r.copyUpToCurrent() - } - return r.peekNextIsBoundary() - } - - return r.handlePotentialUnitWithDecimal(hasMinusPrefix, c1) -} - -func (r *replacer) handleHex(hasMinusPrefix bool, l1 uint, lc, hc byte, canBeUUID bool) (endsWithBoundary bool) { - totalLen := l1 + r.advanceUintOrRangeRet(lc, hc) - r.commit() - - if totalLen == 8 && canBeUUID { - // We might be at the first part of a UUID, right before the first dash - if r.advanceChar('-') && r.advanceUUIDAfterFirstDash(lc, hc) { - r.emit(hasMinusPrefix, placeholderUUID) - return true - } - r.resetHead() - } - - if totalLen >= minHexLen && r.peekNextIsBoundary() { - r.emit(hasMinusPrefix, placeholderHex) - return true - } - - r.copyUpToCurrent() - return r.peekNextIsBoundary() -} - -func (r *replacer) handlePotentialUnitWithDecimal(hasMinusPrefix bool, c1 byte) (endsWithBoundary bool) { - if r.advanceBytesize(c1) { - // We do not subsume a minus sign - byte sizes are unlikely to be - // negative, it's more likely this is a dash as a part of a range - r.emit(hasMinusPrefix, placeholderBytesize) - return true - } - - r.backtrack() - if r.advanceDuration() { - // We subsume hasMinusPrefix, since durations can be negative - r.emit(false, placeholderDuration) - return true - } - - // We couldn't match anything, so just copy what existed. - r.maybeEmitDash(hasMinusPrefix) - r.copyUpToCurrent() - return false -} - -func (r *replacer) handleNumberWithDecimal(hasMinusPrefix bool, n1 uint, l1 uint) (endsWithBoundary bool) { - n2, l2 := r.advanceUintRet() - if l2 == 0 { - // The dot wasn't followed by another number, so emit everything before it. - r.backtrack() - r.emitNumberOrCopyText(hasMinusPrefix) - return false - } - - // See if the number after the decimal is also followed by a boundary - b2, hasNext := r.advance() - - // We are at the end of the string, which is a boundary, replace evertything - // up to now with a number - if !hasNext { - r.emitNumber() // this also subsumes any minus sign we had - return true - } - - // The decimal number isn't followed by a boundary char (which include - // things like '.', ':', '/', etc.), so the part after the decimal is either - // not a real number, or it's some sort of a unit that can support decimals - // like size (e.g. 12KB, 3MiB) or durations (e.g. 3.5124s), etc. - if !boundaryChars[b2] { - return r.handlePotentialUnitWithDecimal(hasMinusPrefix, b2) - } - - // We have a decimal number followed by a non-dot boundary, so this is not - // an IP or a version number or anything like that. - if b2 != '.' { - r.backtrack() - r.emitNumber() - return true - } - - n3, l3 := r.advanceUintRet() - if l3 == 0 || !r.peekNextIsBoundary() { - // The second dot wasn't followed by another number and a boundary, so - // emit just the first number. - r.resetHead() - r.emitNumber() - return true - } - - // We have ".." at this point, so we either have something - // like a version number, or an IP address, but certainly not a simple - // decimal number we can just emit. - r.commit() - - // Check if this is an IP address... - if n1 <= 255 && l1 <= 3 && n2 <= 255 && l2 <= 3 && n3 <= 255 && l3 <= 3 && r.advanceChar('.') && r.advanceUintUpTo(255, 3) && r.peekNextIsBoundary() { - r.emit(hasMinusPrefix, placeholderIP) - return true - } - - // This wasn't an IP after all, so just emit "..". We don't - // want to assume this is a simple decimal number because of the second dot, - // e.g. this might be something like a version number. - r.resetHead() - r.maybeEmitDash(hasMinusPrefix) // we preserve the dashes here, since it's likely not a minus - r.dest = append(r.dest, placeholderNumber...) - r.dest = append(r.dest, '.') - r.dest = append(r.dest, placeholderNumber...) - r.dest = append(r.dest, '.') - r.dest = append(r.dest, placeholderNumber...) - r.consumeUpToCurrent() - return true -} - -func (r *replacer) handleSaneTimestamp(hasMinusPrefix bool, n1 uint, delim byte) (endsWithBoundary bool) { - if r.advanceUintUpTo(12, 2) && r.advanceChar(delim) && r.advanceUintUpTo(31, 2) && r.advanceOneOf('T', ' ') && r.advanceTime(false) { - r.commit() - // continue down to parsing sub-second and timezone values - } else if r.resetHeadExpr() && n1 <= 31 && r.advanceChar(delim) && r.advanceMonthName() && r.advanceChar(delim) && r.advanceYear() && r.advanceChar(':') && r.advanceTime(false) && r.advanceChar(' ') && r.advanceNumericTimeZone() && r.peekNextIsBoundary() { - // We might not have a sane timestamp, but apparently a not-so-sane - // timestamp format first, e.g. "27/Mar/2024:14:34:37 +0000"... - r.emit(hasMinusPrefix, placeholderTimestamp) - return true - } else { - // Apparently that wasn't it either, we probably just have a dash or - // forward slash after a number, so we backtrack and emit the number. - r.resetHead() - r.emitNumberOrCopyText(hasMinusPrefix) - return true - } - - // We have a date that looks like `YY[YY]-MM-DD hh:mm:ss` or - // `YY[YY]/MM/DDZhh:mm:ss` - r.commit() // we want to keep this - - // Now we need to also check for sub-second precision and time zones: - c, canAdvance := r.advance() - if !canAdvance { - // End of the string - r.emit(hasMinusPrefix, placeholderTimestamp) - return true - } - if c == '.' { - _, intl := r.advanceUintRet() - if intl == 0 { - // No sub-second precision, the dot was not part of the timestamp - r.backtrack() - r.emit(hasMinusPrefix, placeholderTimestamp) - return true - } - - // We had sub-second precision, capture that too - r.commit() - - // Get the next char to see if we have time zone - c, canAdvance = r.advance() - if !canAdvance { - // We are at the end of the sting after we captured the - // sub-second value. - r.emit(hasMinusPrefix, placeholderTimestamp) - return true - } - } - - if c == 'Z' || c == 'z' { - //UTC string, nothing to do after that - r.emit(hasMinusPrefix, placeholderTimestamp) - return true - } - // See https://en.wikipedia.org/wiki/List_of_tz_database_time_zones - if (c == '+' || c == '-') && r.advanceUintUpTo(14, 2) && r.advanceChar(':') && r.advanceUintUpTo(60, 2) { - // e.g. "2020-09-30T00:00:59.9999+03:00" - r.commit() - } else if r.resetHeadExpr() && r.advanceChar(' ') && r.advanceNumericTimeZone() && r.advanceChar(' ') && r.advanceTimeZoneLetters() && r.peekNextIsBoundary() { - // e.g. "2023-09-05 23:20:28.030285153 +0000 UTC" - r.commit() - } else { - r.resetHead() - } - r.emit(hasMinusPrefix, placeholderTimestamp) - return true -} - -func (r *replacer) handleNumberStart(hasMinusPrefix bool) (endsWithBoundary bool) { - r.resetHead() // keep the head pos in sync with the current pos - // We know we were at a digit due to how handleNumber() is called - n1, l1 := r.advanceUintRet() - r.commit() // we will consume this one way or another for sure - - // See if the number is followed by a boundary - b1, hasNext := r.advance() - - switch { - // We are at the end of the string, which is a boundary, replace evertything - // up to now with a number - case !hasNext: - r.emitNumberOrCopyText(hasMinusPrefix) // this also may subsume any minus sign we had - return true - - // The number isn't followed by a boundary char (which include things like - // '.', ' ', '/', etc.), so it's either not a real number or date, or it's - // some sort of a unit like a duration (e.g. 3h5m2s), size, (e.g. 12KB, - // 3MiB), etc. - case !boundaryChars[b1]: - return r.handleHexOrUnit(hasMinusPrefix, n1, l1, b1) - - // We have a decimal point, so this can either be a plain number, a unit - // that can have a float value, or an IP address. - case b1 == '.': - return r.handleNumberWithDecimal(hasMinusPrefix, n1, l1) - - // This might be a timestamp that looks like '2024-04-01...' or - // `2024/03/27...`; timestamps in the far future are not supported :) - case n1 <= maxYear && l1 <= 4 && (b1 == '-' || b1 == '/'): - return r.handleSaneTimestamp(hasMinusPrefix, n1, b1) - - // Weird RFC822 dates like "02 Jan 06 15:04 MST" - case n1 <= 31 && l1 <= 2 && b1 == ' ': - if r.advanceMonthName() && r.advanceChar(' ') && r.advanceYear() && r.advanceChar(' ') && r.advanceTime(true) && r.advanceStringOrNumericTimeZone(false) { - r.commit() - r.emit(hasMinusPrefix, placeholderTimestamp) - return true - } - // if not, go to default handler after switch statement - - // It could be a UUID that starts with 8 digits - case l1 == 8 && b1 == '-': - if r.advanceUUIDAfterFirstDash('a', 'f') || (r.resetHeadExpr() && r.advanceChar('-') && r.advanceUUIDAfterFirstDash('A', 'F')) { - r.emit(hasMinusPrefix, placeholderUUID) - return true - } - // if not, go to default handler after switch statement - } - - // Number with an unknown boundary - emit the number and leave the boundary - // for the following passes. - r.resetHead() - r.emitNumberOrCopyText(hasMinusPrefix) - return true -} - -var longDayNames = [...][]byte{ - []byte("Sunday"), - []byte("Monday"), - []byte("Tuesday"), - []byte("Wednesday"), - []byte("Thursday"), - []byte("Friday"), - []byte("Saturday"), -} - -func (r *replacer) handleWeirdTimestamp() (endsWithBoundary bool) { - r.resetHead() - - if r.advanceDayOfTheWeek() { - r.commit() // we will always consume this - - // RFC1123 and RFC1123Z, e.g.: - // - "Mon, 02 Jan 2006 15:04:05 MST" - // - "Mon, 02 Jan 2006 15:04:05 -0700" - if r.advanceChar(',') && r.advanceChar(' ') && r.advanceUintUpTo(31, 2) && r.advanceChar(' ') && r.advanceMonthName() && r.advanceChar(' ') && r.advanceYear() && r.advanceChar(' ') && r.advanceTime(false) && r.advanceStringOrNumericTimeZone(false) { - r.emit(false, placeholderTimestamp) - return true - } - r.resetHead() - - // ANSIC, UnixDatem, RubyDate e.g - // - "Mon Jan 2 15:04:05 2006" - // - "Mon Jan 2 15:04:05 MST 2006" - // - "Mon Jan 02 15:04:05 -0700 2006" - if r.advanceChar(' ') && r.advanceMonthName() && r.advanceChar(' ') && (r.advanceChar(' ') || true) && r.advanceUintUpTo(31, 2) && r.advanceChar(' ') && r.advanceTime(false) && r.advanceStringOrNumericTimeZone(true) && r.advanceChar(' ') && r.advanceYear() { - r.emit(false, placeholderTimestamp) - return true - } - r.resetHead() - - // Linux, e.g. - // - "Mon 2 Jan 15:04:05 MST 2006" - // - "Tue 23 Jan 15:04:05 -0700 2023" - if r.advanceChar(' ') && (r.advanceChar(' ') || true) && r.advanceUintUpTo(31, 2) && r.advanceChar(' ') && r.advanceMonthName() && r.advanceChar(' ') && r.advanceTime(false) && r.advanceStringOrNumericTimeZone(false) && r.advanceChar(' ') && r.advanceYear() { - r.emit(false, placeholderTimestamp) - return true - } - r.resetHead() - - // RFC850, e.g. - // - "Monday, 02-Jan-06 15:04:05 MST" - backtrackedSlice := r.source[r.head-3:] - var matchedDay []byte - for _, dw := range longDayNames { - if bytes.HasPrefix(backtrackedSlice, dw) { - matchedDay = dw - break - } - } - if matchedDay != nil { - r.head += len(matchedDay) - 3 - if r.advanceChar(',') && r.advanceChar(' ') && r.advanceUintUpTo(31, 2) && r.advanceChar('-') && r.advanceMonthName() && r.advanceChar('-') && r.advanceUintUpTo(99, 2) && r.advanceChar(' ') && r.advanceTime(false) && r.advanceStringOrNumericTimeZone(true) { - r.emit(false, placeholderTimestamp) - return true - } - } - - r.cur -= 3 // unconsume - r.resetHead() - return false - } - r.resetHead() - - if r.advanceMonthName() { - r.commit() // provisionally consume this - - // Linux journald logs and others similar like this: - // - Feb 29 23:00:14 - // - Apr-10 23:43:46.807 - // - Jul 1 00:21:28 - if (r.advanceChar('-') || (r.advanceChar(' ') && (r.advanceChar(' ') || true))) && r.advanceUintUpTo(31, 2) && r.advanceChar(' ') && r.advanceTime(false) { - r.commit() - // This is already a timestamp, but let's try to match subseconds as well - if r.advanceChar('.') && r.advanceUint() { - r.commit() - } else { - r.resetHead() - } - r.emit(false, placeholderTimestamp) - return true - } - r.cur -= 3 // unconsume - r.resetHead() - return false - } - r.resetHead() - - return false -} - -func (r *replacer) wasNumPreservingKeyword() bool { - for _, key := range numPreservingKeys { - pPos := r.cur - 1 - len(key) - if pPos < -1 { - return false // all subsequent keys are longer - } - if pPos != -1 && !boundaryChars[r.source[pPos]] { - continue - } - if bytes.HasPrefix(r.source[pPos+1:], key) { - return true - } - } - return false -} - -func (r *replacer) replaceWithPlaceholders() { - lineLen := len(r.source) - - var c byte - onBoundary := true - for r.cur = 0; r.cur < lineLen; r.cur++ { - c = r.source[r.cur] - switch { - // If we're currently not at a boundary, the only thing we need to check - // is whether the current char would create a boundary in the next iteration. - case !onBoundary: - onBoundary = boundaryChars[c] - - // We weren't at a boundary and now we are, so check if we match one - // of the special keywords that will preserve numbers - if onBoundary { - r.preserveNextNumbers = r.wasNumPreservingKeyword() - } - - // If we've reached this far, it means we're currently at a boundary! - - // A lot of very complex logic if we encounter a number at a boundary, - // so we move that to a helper function. - case '0' <= c && c <= '9': - r.copyUpToCurrent() - onBoundary = r.handleNumberStart(false) - - // Handle negative numbers, potentially - case c == '-': - next := r.cur + 1 - // This might be a number, a date, an IP address, etc. So we don't - // know if this is a minus sign to replace or a dash to copy yet. - if next < lineLen && '0' <= r.source[next] && r.source[next] <= '9' { - // Copy everything before the dash, but mark it as consumed. - r.copyUpToCurrent() - r.cur++ - r.consumeUpToCurrent() - onBoundary = r.handleNumberStart(true) - } else { - onBoundary = true - } - - // Try to match weird timestamps. They require a lot of remaining - // length, generally start with a capitalized day of the week or month - // name (1 upper case letter followed by 2 lower case letters). - // - // We are basically looking for something that may match this here: - // Mon|Tue|Wed|Thu|Fri|Sat|Sun|Jan|Feb|Mar|Apr|May|Jul|Jun|Aug|Sep|Oct|Nov|Dec - // - // The detailed check would be performed by the actual handler: - case 'A' <= c && c <= 'W' && lineLen-r.cur >= 14 && - 'a' <= r.source[r.cur+1] && r.source[r.cur+1] <= 'u' && - 'b' <= r.source[r.cur+2] && r.source[r.cur+2] <= 'y': - r.copyUpToCurrent() - onBoundary = r.handleWeirdTimestamp() - - // This could be the start of an lower case hex string: - case 'a' <= c && c <= 'f': - r.copyUpToCurrent() - r.resetHead() - onBoundary = r.handleHex(false, 0, 'a', 'f', true) - - // This could be the start of an upper case hex string: - case 'A' <= c && c <= 'F': - r.copyUpToCurrent() - r.resetHead() - onBoundary = r.handleHex(false, 0, 'A', 'F', true) - - // If we haven't actually matched anything, update whether we're still - // on a boundary character and continue onto the next one. - default: - onBoundary = boundaryChars[c] - } - } - - if r.cur > r.tail { - r.dest = append(r.dest, r.source[r.tail:]...) - r.consumeUpToCurrent() - } -} +// Outside of quoted strings, these are the delimiters between tokens. However, +// they are not going to be a part of the tokens themselves and will be replaced +// by spaces in the actual log template. +var delimiters = [256]bool{0: true, '\t': true, '\n': true, '\v': true, '\f': true, '\r': true, ' ': true} type tokenizer struct { // Input @@ -869,11 +29,6 @@ type tokenizer struct { tokens []string } -// Outside of quoted strings, these are the delimiters between tokens. However, -// they are not going to be a part of the tokens themselves and will be replaced -// by spaces in the actual log template. -var delimiters = [256]bool{0: true, '\t': true, '\n': true, '\v': true, '\f': true, '\r': true, ' ': true} - func (t *tokenizer) countOrSaveToken(endTokenPos, skip int) { if t.tokens != nil { // Intentionally written like this and not with append(), so this can @@ -1037,14 +192,6 @@ func (t *tokenizer) tokenize() []string { return t.tokens } -func Preprocess(content []byte) []byte { - // ~floor(120%), to allow for some expansion from replacements, hopefully - // without needing to allocate more memory - r := replacer{source: content, dest: make([]byte, 0, len(content)*120/100)} - r.replaceWithPlaceholders() - return r.dest -} - func PreprocessAndTokenize(content []byte) []string { content = bytes.TrimSpace(content)