Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add patterns and grok command #813

Merged
merged 28 commits into from
Sep 30, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
5f27053
Add punct
joshuali925 Aug 26, 2022
4b08343
Add grok
joshuali925 Aug 26, 2022
7ac9391
Refactor and fix tests
joshuali925 Aug 29, 2022
25f13a7
Add grok and punct expression tests
joshuali925 Aug 29, 2022
ac99785
Fix checkstyle
joshuali925 Aug 30, 2022
8b76ebe
Fix non-included derived fields showing up
joshuali925 Aug 30, 2022
d200791
Update punct regex
joshuali925 Aug 30, 2022
697c4b7
Add punct and grok to parse docs
joshuali925 Aug 30, 2022
858ff61
Remove unused class
joshuali925 Aug 30, 2022
6da106a
Update punct regex and tests
joshuali925 Aug 30, 2022
b65ee40
Initial refactor to split grok and patterns from parse command
joshuali925 Sep 7, 2022
e4a7128
Update tests for refactoring
joshuali925 Sep 7, 2022
1e98b4e
Add docs for patterns and grok
joshuali925 Sep 7, 2022
6dd7a7a
Minor refactors
joshuali925 Sep 8, 2022
f92a48c
Merge branch 'main' into punct
joshuali925 Sep 12, 2022
29e7e00
Address comments
joshuali925 Sep 15, 2022
93091ef
Remove unused class
joshuali925 Sep 15, 2022
9c3b952
Move grok library into common module
joshuali925 Sep 16, 2022
a52f544
Remove unused grok discovery
joshuali925 Sep 19, 2022
3a0cc6b
Improve default patterns performance
joshuali925 Sep 19, 2022
e530734
Add thekrakken/java-grok to NOTICE file
joshuali925 Sep 19, 2022
bc0b49c
Remove punct reference
joshuali925 Sep 26, 2022
f331e09
Merge branch '2.x' into punct
joshuali925 Sep 26, 2022
585ff9e
Sanitize raw string in log
joshuali925 Sep 28, 2022
da52648
Remove stringbuilder
joshuali925 Sep 28, 2022
53c6867
Ignore .pid.lock file
joshuali925 Sep 28, 2022
65bfa78
Add more details for patterns doc
joshuali925 Sep 28, 2022
9ec7825
Add apache logs data to doctest
joshuali925 Sep 28, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,4 @@ gen
.DS_Store

/artifacts/
/.pid.lock
3 changes: 3 additions & 0 deletions NOTICE
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ Foundation (http://www.apache.org/).
This product includes software developed by
Joda.org (http://www.joda.org/).

This product includes software developed by
Kraken (https://github.com/thekrakken/java-grok).

This project is based on the Apache 2.0-licensed elasticsearch-sql project (https://github.com/NLPchina/elasticsearch-sql):

Copyright 2014 omershelef
Expand Down
4 changes: 4 additions & 0 deletions common/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ dependencies {
api "org.antlr:antlr4-runtime:4.7.1"
api group: 'com.google.guava', name: 'guava', version: '31.0.1-jre'
api group: 'org.apache.logging.log4j', name: 'log4j-core', version:'2.17.1'
api group: 'org.apache.commons', name: 'commons-lang3', version: '3.10'

testImplementation group: 'junit', name: 'junit', version: '4.13.2'
testImplementation group: 'org.assertj', name: 'assertj-core', version: '3.9.1'
testImplementation group: 'com.google.guava', name: 'guava', version: '31.0.1-jre'
testImplementation group: 'org.hamcrest', name: 'hamcrest-library', version: '2.1'
}
168 changes: 168 additions & 0 deletions common/src/main/java/org/opensearch/sql/common/grok/Converter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/

package org.opensearch.sql.common.grok;

import java.time.Instant;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.OffsetDateTime;
import java.time.ZoneId;
import java.time.ZoneOffset;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.time.temporal.TemporalAccessor;
import java.util.AbstractMap;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.function.Function;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Resolves the optional type suffix of a grok group name and supplies the converter that turns
 * the captured string into a value of that type.
 *
 * <p>A group name is split on {@code ':'} or {@code ';'} into at most three segments:
 * {@code name}, {@code type} and an optional converter argument (for example
 * {@code status:int} or {@code ts:date:yyyy-MM-dd}). Names without a delimiter carry no type
 * information and are ignored by the lookup methods.
 */
public class Converter {

  /**
   * Data types a captured group can be converted to. Each constant is addressed by its
   * lower-cased name or by one of its aliases.
   */
  public enum Type {
    BYTE(Byte::valueOf),
    BOOLEAN(Boolean::valueOf),
    SHORT(Short::valueOf),
    INT(Integer::valueOf, "integer"),
    LONG(Long::valueOf),
    FLOAT(Float::valueOf),
    DOUBLE(Double::valueOf),
    DATETIME(new DateConverter(), "date"),
    STRING(v -> v, "text");

    public final IConverter<? extends Object> converter;
    public final List<String> aliases;

    Type(IConverter<? extends Object> converter, String... aliases) {
      this.converter = converter;
      this.aliases = Arrays.asList(aliases);
    }
  }

  private static final Pattern SPLITTER = Pattern.compile("[:;]");

  // Keys are lower-cased with Locale.ROOT so the lookup does not depend on the JVM default
  // locale (a Turkish default would otherwise map "INT" to "ınt" and reject ":int").
  private static final Map<String, Type> TYPES =
      Arrays.stream(Type.values())
          .collect(Collectors.toMap(t -> t.name().toLowerCase(Locale.ROOT), t -> t));

  private static final Map<String, Type> TYPE_ALIASES =
      Arrays.stream(Type.values())
          .flatMap(type -> type.aliases.stream()
              .map(alias -> new AbstractMap.SimpleEntry<>(alias, type)))
          .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));

  /**
   * Looks up a type by name or alias, ignoring case.
   *
   * @param key type name or alias as written in the group name
   * @return the matching type
   * @throws IllegalArgumentException if no type or alias matches
   */
  private static Type getType(String key) {
    key = key.toLowerCase(Locale.ROOT);
    Type type = TYPES.getOrDefault(key, TYPE_ALIASES.get(key));
    if (type == null) {
      throw new IllegalArgumentException("Invalid data type :" + key);
    }
    return type;
  }

  /**
   * Builds the converter for every typed group name.
   *
   * <p>The returned map is keyed by the full group name, type suffix included. When a group
   * name has a third segment, that segment and {@code params} are handed to
   * {@link IConverter#newConverter(String, Object...)} to obtain a specialised converter.
   * A group name that occurs more than once yields a single entry.
   *
   * @param groupNames group names, with or without a type suffix
   * @param params     extra arguments forwarded to {@code newConverter}
   * @return converters keyed by full group name; untyped names are omitted
   * @throws IllegalArgumentException if a type is unknown or a converter rejects its arguments
   */
  public static Map<String, IConverter<? extends Object>>
      getConverters(Collection<String> groupNames, Object... params) {
    return groupNames.stream()
        .filter(Converter::containsDelimiter)
        .collect(Collectors.toMap(
            Function.identity(),
            key -> buildConverter(key, params),
            // The same full group name always resolves to an equivalent converter.
            (first, second) -> first));
  }

  /**
   * Resolves the converter described by one typed group name.
   */
  private static IConverter<? extends Object> buildConverter(String key, Object... params) {
    String[] segments = splitGrokPattern(key);
    IConverter<? extends Object> converter = getType(segments[1]).converter;
    if (segments.length == 3) {
      converter = converter.newConverter(segments[2], params);
    }
    return converter;
  }

  /**
   * Maps the bare name of every typed group to its declared type.
   *
   * <p>A name that is declared several times with the same type yields a single entry.
   *
   * @param groupNames group names, with or without a type suffix
   * @return types keyed by the name segment; untyped names are omitted
   * @throws IllegalArgumentException if a type is unknown
   * @throws IllegalStateException    if one name is declared with two different types
   */
  public static Map<String, Type> getGroupTypes(Collection<String> groupNames) {
    return groupNames.stream()
        .filter(Converter::containsDelimiter)
        .map(Converter::splitGrokPattern)
        .collect(Collectors.toMap(
            l -> l[0],
            l -> getType(l[1]),
            (first, second) -> {
              if (first != second) {
                throw new IllegalStateException(
                    "Conflicting data types " + first + " and " + second
                        + " declared for the same group name");
              }
              return first;
            }));
  }

  /**
   * Returns the name segment of a group name, i.e. everything before the first delimiter.
   *
   * @param key group name, with or without a type suffix
   * @return the name segment
   */
  public static String extractKey(String key) {
    return splitGrokPattern(key)[0];
  }

  private static boolean containsDelimiter(String string) {
    return string.indexOf(':') >= 0 || string.indexOf(';') >= 0;
  }

  private static String[] splitGrokPattern(String string) {
    // Limit 3 keeps any further delimiters inside the converter argument (e.g. "HH:mm:ss").
    return SPLITTER.split(string, 3);
  }

  /**
   * Converts a captured string into a typed value.
   *
   * @param <T> type of the converted value
   */
  interface IConverter<T> {

    T convert(String value);

    /**
     * Returns a converter configured by {@code param}; the default ignores the arguments and
     * returns this converter.
     */
    default IConverter<T> newConverter(String param, Object... params) {
      return this;
    }
  }


  /**
   * Parses date/time text into an {@link Instant}. The default instance uses
   * {@link DateTimeFormatter#ISO_DATE_TIME} and interprets zone-less values as UTC.
   */
  static class DateConverter implements IConverter<Instant> {

    private final DateTimeFormatter formatter;
    private final ZoneId timeZone;

    public DateConverter() {
      this.formatter = DateTimeFormatter.ISO_DATE_TIME;
      this.timeZone = ZoneOffset.UTC;
    }

    private DateConverter(DateTimeFormatter formatter, ZoneId timeZone) {
      this.formatter = formatter;
      this.timeZone = timeZone;
    }

    /**
     * Parses the trimmed value with the configured formatter. Values without zone information
     * are placed in the configured time zone; a date without a time maps to start of day.
     */
    @Override
    public Instant convert(String value) {
      TemporalAccessor dt = formatter
          .parseBest(value.trim(), ZonedDateTime::from, LocalDateTime::from, OffsetDateTime::from,
              Instant::from,
              LocalDate::from);
      if (dt instanceof ZonedDateTime) {
        return ((ZonedDateTime) dt).toInstant();
      } else if (dt instanceof LocalDateTime) {
        return ((LocalDateTime) dt).atZone(timeZone).toInstant();
      } else if (dt instanceof OffsetDateTime) {
        return ((OffsetDateTime) dt).atZoneSameInstant(timeZone).toInstant();
      } else if (dt instanceof Instant) {
        return ((Instant) dt);
      } else if (dt instanceof LocalDate) {
        return ((LocalDate) dt).atStartOfDay(timeZone).toInstant();
      } else {
        return null;
      }
    }

    /**
     * Creates a converter for a custom date pattern.
     *
     * @param param  pattern understood by {@link DateTimeFormatter#ofPattern(String)}
     * @param params exactly one {@link ZoneId}, used for values without zone information
     * @throws IllegalArgumentException if {@code params} is not a single {@code ZoneId}
     */
    @Override
    public DateConverter newConverter(String param, Object... params) {
      if (!(params.length == 1 && params[0] instanceof ZoneId)) {
        throw new IllegalArgumentException("Invalid parameters");
      }
      return new DateConverter(DateTimeFormatter.ofPattern(param), (ZoneId) params[0]);
    }
  }
}
182 changes: 182 additions & 0 deletions common/src/main/java/org/opensearch/sql/common/grok/Grok.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/

package org.opensearch.sql.common.grok;

import java.io.Serializable;
import java.time.ZoneId;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.opensearch.sql.common.grok.Converter.IConverter;

/**
 * A compiled {@code Grok} pattern that extracts structured data from a single line of text.
 * <br>
 * An instance holds the named regular expression a grok pattern was expanded to, matches text
 * against it and exposes the result as a {@link Match} or as a map of captured values.
 *
 * @since 0.0.1
 */
public class Grok implements Serializable {
  /**
   * The grok pattern as originally written (for example {@code %{IP}}).
   */
  private final String originalGrokPattern;
  /**
   * The named regular expression derived from {@link #originalGrokPattern}.
   */
  private final String namedRegex;
  /**
   * Compiled form of {@link #namedRegex}.
   */
  private final Pattern compiledNamedRegex;
  /**
   * Named regex entries of the original pattern, keyed by named-regex id.
   */
  private final Map<String, String> namedRegexCollection;

  /**
   * The {@code Grok} pattern definitions (name, regular expression).
   */
  private final Map<String, String> grokPatternDefinition;

  public final Set<String> namedGroups;

  public final Map<String, Converter.Type> groupTypes;

  public final Map<String, IConverter<? extends Object>> converters;

  /**
   * Only used in grok discovery.
   */
  private String savedPattern = "";

  /**
   * Creates a grok instance and compiles its named regex.
   *
   * @param pattern              original grok pattern
   * @param namedRegex           named regular expression to compile
   * @param namedRegexCollection named regex entries keyed by id; the values are used to derive
   *                             group types and converters
   * @param patternDefinitions   grok pattern definitions
   * @param defaultTimeZone      time zone handed to the converters
   */
  public Grok(String pattern,
              String namedRegex,
              Map<String, String> namedRegexCollection,
              Map<String, String> patternDefinitions,
              ZoneId defaultTimeZone) {
    // Plain assignments first ...
    this.originalGrokPattern = pattern;
    this.namedRegex = namedRegex;
    this.namedRegexCollection = namedRegexCollection;
    this.grokPatternDefinition = patternDefinitions;

    // ... then the derived state, in the same order as before.
    this.compiledNamedRegex = Pattern.compile(namedRegex);
    this.namedGroups = GrokUtils.getNameGroups(namedRegex);

    final java.util.Collection<String> groupNames = namedRegexCollection.values();
    this.groupTypes = Converter.getGroupTypes(groupNames);
    this.converters = Converter.getConverters(groupNames, defaultTimeZone);
  }

  public String getSaved_pattern() {
    return this.savedPattern;
  }

  public void setSaved_pattern(String savedpattern) {
    savedPattern = savedpattern;
  }

  /**
   * Returns the {@code Grok} pattern definitions this instance was created with.
   *
   * @return patterns (name, regular expression)
   */
  public Map<String, String> getPatterns() {
    return this.grokPatternDefinition;
  }

  /**
   * Returns the named regex derived from the {@code Grok} pattern.
   *
   * @return named regex
   */
  public String getNamedRegex() {
    return this.namedRegex;
  }

  /**
   * Returns the grok pattern that was compiled to the named regex.
   *
   * @return original grok pattern
   */
  public String getOriginalGrokPattern() {
    return this.originalGrokPattern;
  }

  /**
   * Looks up a single named regex entry.
   *
   * @param id named regex id
   * @return the named regex stored under {@code id}
   */
  public String getNamedRegexCollectionById(String id) {
    return this.namedRegexCollection.get(id);
  }

  /**
   * Returns every named regex entry.
   *
   * @return named regex collection
   */
  public Map<String, String> getNamedRegexCollection() {
    return this.namedRegexCollection;
  }

  /**
   * Matches {@code log} against the named regex and returns the captured elements.
   *
   * @param log log line to match
   * @return map containing the matches
   */
  public Map<String, Object> capture(String log) {
    return match(log).capture();
  }

  /**
   * Matches every entry of {@code logs} against the named regex and returns the captured
   * elements of each, in input order.
   *
   * @param logs log lines to match
   * @return list of maps containing the matches
   */
  public ArrayList<Map<String, Object>> capture(List<String> logs) {
    final ArrayList<Map<String, Object>> results = new ArrayList<>();
    logs.forEach(line -> results.add(capture(line)));
    return results;
  }

  /**
   * Matches {@code text} against the named regex and wraps the first hit in a {@link Match}.
   *
   * @param text single line of log
   * @return the grok match, or {@link Match#EMPTY} when {@code text} is {@code null} or
   *     nothing matches
   */
  public Match match(CharSequence text) {
    if (text == null || compiledNamedRegex == null) {
      return Match.EMPTY;
    }

    final Matcher candidate = compiledNamedRegex.matcher(text);
    return candidate.find()
        ? new Match(text, this, candidate, candidate.start(0), candidate.end(0))
        : Match.EMPTY;
  }
}
Loading