From 0b321cc0f61adc785368d24b66b783b66d1d9fa5 Mon Sep 17 00:00:00 2001 From: Benjamin Trent Date: Wed, 29 Apr 2020 10:24:24 -0400 Subject: [PATCH] [ML] Allow a certain number of ill-formatted rows when delimited format is specified (#55735) While it is good to not be lenient when attempting to guess the file format, it is frustrating to users when they KNOW it is CSV but there are a few ill-formatted rows in the file (via some entry error, etc.). This commit allows for up to 10% of sample rows to be considered "bad" when the delimiter is specified, and up to 5% when only other delimited formatting options are specified. These rows are effectively ignored while guessing the format. This allowance for "bad rows" is only applied when the user has specified delimited formatting options, as the structure finder needs some guidance on what a "bad row" actually means. Related to https://github.com/elastic/elasticsearch/issues/38890 --- .../DelimitedFileStructureFinder.java | 37 ++++-- .../DelimitedFileStructureFinderFactory.java | 11 +- .../FileStructureFinderFactory.java | 4 +- .../FileStructureFinderManager.java | 8 +- .../NdJsonFileStructureFinderFactory.java | 2 +- .../TextLogFileStructureFinderFactory.java | 2 +- .../XmlFileStructureFinderFactory.java | 2 +- ...imitedFileStructureFinderFactoryTests.java | 28 ++--- .../DelimitedFileStructureFinderTests.java | 115 ++++++++++++++++-- .../NdJsonFileStructureFinderTests.java | 2 +- ...NdJsonFileStructureFinderFactoryTests.java | 14 +-- ...extLogFileStructureFinderFactoryTests.java | 2 +- .../TextLogFileStructureFinderTests.java | 12 +- .../XmlFileStructureFinderFactoryTests.java | 12 +- .../XmlFileStructureFinderTests.java | 2 +- 15 files changed, 191 insertions(+), 62 deletions(-) diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java index 3bec88deb6a80..6adbeadb53120 100644 --- 
a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java @@ -5,6 +5,7 @@ */ package org.elasticsearch.xpack.ml.filestructurefinder; +import org.apache.logging.log4j.message.ParameterizedMessage; import org.elasticsearch.common.collect.Tuple; import org.elasticsearch.xpack.core.ml.filestructurefinder.FieldStats; import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; @@ -34,7 +35,6 @@ public class DelimitedFileStructureFinder implements FileStructureFinder { private static final String REGEX_NEEDS_ESCAPE_PATTERN = "([\\\\|()\\[\\]{}^$.+*?])"; private static final int MAX_LEVENSHTEIN_COMPARISONS = 100; private static final int LONG_FIELD_THRESHOLD = 100; - private final List sampleMessages; private final FileStructure structure; @@ -80,6 +80,11 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List row = rows.get(index); int lineNumber = lineNumbers.get(index); + // Indicates an illformatted row. We allow a certain number of these + if (row.size() != columnNames.length) { + prevMessageEndLineNumber = lineNumber; + continue; + } Map sampleRecord = new LinkedHashMap<>(); Util.filterListToMap(sampleRecord, columnNames, trimFields ? row.stream().map(field -> (field == null) ? 
null : field.trim()).collect(Collectors.toList()) : row); @@ -488,7 +493,7 @@ static boolean lineHasUnescapedQuote(String line, CsvPreference csvPreference) { } static boolean canCreateFromSample(List explanation, String sample, int minFieldsPerRow, CsvPreference csvPreference, - String formatName) { + String formatName, double allowedFractionOfBadLines) { // Logstash's CSV parser won't tolerate fields where just part of the // value is quoted, whereas SuperCSV will, hence this extra check @@ -501,11 +506,13 @@ static boolean canCreateFromSample(List explanation, String sample, int } } + int numberOfLinesInSample = sampleLines.length; try (CsvListReader csvReader = new CsvListReader(new StringReader(sample), csvPreference)) { int fieldsInFirstRow = -1; int fieldsInLastRow = -1; + List illFormattedRows = new ArrayList<>(); int numberOfRows = 0; try { List row; @@ -529,11 +536,27 @@ static boolean canCreateFromSample(List explanation, String sample, int --fieldsInThisRow; } - if (fieldsInLastRow != fieldsInFirstRow) { - explanation.add("Not " + formatName + " because row [" + (numberOfRows - 1) + - "] has a different number of fields to the first row: [" + fieldsInFirstRow + "] and [" + - fieldsInLastRow + "]"); - return false; + // TODO: might be good one day to gather a distribution of the most common field counts + // But, this would require iterating (or at least sampling) all the lines. + if (fieldsInThisRow != fieldsInFirstRow) { + illFormattedRows.add(numberOfRows - 1); + // This calculation is complicated by the possibility of multi-lined CSV columns + // `getLineNumber` is a current count of lines, regardless of row count, so + // this formula is just an approximation, but gets more accurate the further + // through the sample you are. 
+ double totalNumberOfRows = (numberOfRows + numberOfLinesInSample - csvReader.getLineNumber()); + // We should only allow a certain percentage of ill-formatted rows + // as they may have downstream effects + if (illFormattedRows.size() > Math.ceil(allowedFractionOfBadLines * totalNumberOfRows)) { + explanation.add(new ParameterizedMessage( + "Not {} because {} or more rows did not have the same number of fields as the first row ({}). Bad rows {}", + formatName, + illFormattedRows.size(), + fieldsInFirstRow, + illFormattedRows).getFormattedMessage()); + return false; + } + continue; } fieldsInLastRow = fieldsInThisRow; diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java index cc15491f2e6ca..e413bde60c326 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java @@ -14,6 +14,8 @@ public class DelimitedFileStructureFinderFactory implements FileStructureFinderFactory { + static final double DELIMITER_OVERRIDDEN_ALLOWED_FRACTION_OF_BAD_LINES = 0.10d; + static final double FORMAT_OVERRIDDEN_ALLOWED_FRACTION_OF_BAD_LINES = 0.05d; private final CsvPreference csvPreference; private final int minFieldsPerRow; private final boolean trimFields; @@ -44,7 +46,7 @@ public boolean canFindFormat(FileStructure.Format format) { * it could have been truncated when the file was sampled. 
*/ @Override - public boolean canCreateFromSample(List explanation, String sample) { + public boolean canCreateFromSample(List explanation, String sample, double allowedFractionOfBadLines) { String formatName; switch ((char) csvPreference.getDelimiterChar()) { case ',': @@ -57,7 +59,12 @@ public boolean canCreateFromSample(List explanation, String sample) { formatName = Character.getName(csvPreference.getDelimiterChar()).toLowerCase(Locale.ROOT) + " delimited values"; break; } - return DelimitedFileStructureFinder.canCreateFromSample(explanation, sample, minFieldsPerRow, csvPreference, formatName); + return DelimitedFileStructureFinder.canCreateFromSample(explanation, + sample, + minFieldsPerRow, + csvPreference, + formatName, + allowedFractionOfBadLines); } @Override diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java index 45edf96ce564f..1fc79e146e7e2 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java @@ -25,10 +25,12 @@ public interface FileStructureFinderFactory { * @param explanation List of reasons for making decisions. May contain items when passed and new reasons * can be appended by this method. * @param sample A sample from the file to be ingested. + * @param allowedFractionOfBadLines How many lines of the passed sample are allowed to be considered "bad". + * Provided as a fraction from interval [0, 1] * @return true if this factory can create an appropriate * file structure given the sample; otherwise false. 
*/ - boolean canCreateFromSample(List explanation, String sample); + boolean canCreateFromSample(List explanation, String sample, double allowedFractionOfBadLines); /** * Create an object representing the structure of a file. diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java index 2fa8d1bb6d6f3..436ccd1c84e58 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java @@ -11,6 +11,7 @@ import org.elasticsearch.ElasticsearchTimeoutException; import org.elasticsearch.common.collect.Tuple; import org.elasticsearch.common.unit.TimeValue; +import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; import java.io.BufferedInputStream; import java.io.BufferedReader; @@ -276,14 +277,17 @@ FileStructureFinder makeBestStructureFinder(List explanation, String sam Character quote = overrides.getQuote(); Boolean shouldTrimFields = overrides.getShouldTrimFields(); List factories; + double allowedFractionOfBadLines = 0.0; if (delimiter != null) { + allowedFractionOfBadLines = DelimitedFileStructureFinderFactory.DELIMITER_OVERRIDDEN_ALLOWED_FRACTION_OF_BAD_LINES; // If a precise delimiter is specified, we only need one structure finder // factory, and we'll tolerate as little as one column in the input factories = Collections.singletonList(new DelimitedFileStructureFinderFactory(delimiter, (quote == null) ? '"' : quote, 1, (shouldTrimFields == null) ? 
(delimiter == '|') : shouldTrimFields)); - } else if (quote != null || shouldTrimFields != null) { + } else if (quote != null || shouldTrimFields != null || FileStructure.Format.DELIMITED.equals(overrides.getFormat())) { + allowedFractionOfBadLines = DelimitedFileStructureFinderFactory.FORMAT_OVERRIDDEN_ALLOWED_FRACTION_OF_BAD_LINES; // The delimiter is not specified, but some other aspect of delimited files is, // so clone our default delimited factories altering the overridden values @@ -301,7 +305,7 @@ FileStructureFinder makeBestStructureFinder(List explanation, String sam for (FileStructureFinderFactory factory : factories) { timeoutChecker.check("high level format detection"); - if (factory.canCreateFromSample(explanation, sample)) { + if (factory.canCreateFromSample(explanation, sample, allowedFractionOfBadLines)) { return factory.createFromSample(explanation, sample, charsetName, hasByteOrderMarker, lineMergeSizeLimit, overrides, timeoutChecker); } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderFactory.java index 6970af01bb79d..b20ddac89228a 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderFactory.java @@ -30,7 +30,7 @@ public boolean canFindFormat(FileStructure.Format format) { * documents must be non-empty, to prevent lines containing "{}" from matching. 
*/ @Override - public boolean canCreateFromSample(List explanation, String sample) { + public boolean canCreateFromSample(List explanation, String sample, double allowedFractionOfBadLines) { int completeDocCount = 0; diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java index 2980d5d0678ca..54752d2dc0012 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java @@ -25,7 +25,7 @@ public boolean canFindFormat(FileStructure.Format format) { * non-blank lines. */ @Override - public boolean canCreateFromSample(List explanation, String sample) { + public boolean canCreateFromSample(List explanation, String sample, double allowedFractionOfBadLines) { if (sample.indexOf('\n') < 0) { explanation.add("Not text because sample contains no newlines"); return false; diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java index 382f2e7502719..aff9b29819e7d 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java @@ -43,7 +43,7 @@ public boolean canFindFormat(FileStructure.Format format) { * necessarily have to be complete (as the sample could have truncated it). 
*/ @Override - public boolean canCreateFromSample(List explanation, String sample) { + public boolean canCreateFromSample(List explanation, String sample, double allowedFractionOfBadLines) { int completeDocCount = 0; String commonRootElementName = null; diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactoryTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactoryTests.java index 0ed3f54112617..9d761ead28ee3 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactoryTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactoryTests.java @@ -16,66 +16,66 @@ public class DelimitedFileStructureFinderFactoryTests extends FileStructureTestC public void testCanCreateCsvFromSampleGivenCsv() { - assertTrue(csvFactory.canCreateFromSample(explanation, CSV_SAMPLE)); + assertTrue(csvFactory.canCreateFromSample(explanation, CSV_SAMPLE, 0.0)); } public void testCanCreateCsvFromSampleGivenTsv() { - assertFalse(csvFactory.canCreateFromSample(explanation, TSV_SAMPLE)); + assertFalse(csvFactory.canCreateFromSample(explanation, TSV_SAMPLE, 0.0)); } public void testCanCreateCsvFromSampleGivenSemiColonDelimited() { - assertFalse(csvFactory.canCreateFromSample(explanation, SEMI_COLON_DELIMITED_SAMPLE)); + assertFalse(csvFactory.canCreateFromSample(explanation, SEMI_COLON_DELIMITED_SAMPLE, 0.0)); } public void testCanCreateCsvFromSampleGivenPipeDelimited() { - assertFalse(csvFactory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE)); + assertFalse(csvFactory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE, 0.0)); } public void testCanCreateCsvFromSampleGivenText() { - assertFalse(csvFactory.canCreateFromSample(explanation, TEXT_SAMPLE)); + assertFalse(csvFactory.canCreateFromSample(explanation, 
TEXT_SAMPLE, 0.0)); } // TSV - no need to check NDJSON, XML or CSV because they come earlier in the order we check formats public void testCanCreateTsvFromSampleGivenTsv() { - assertTrue(tsvFactory.canCreateFromSample(explanation, TSV_SAMPLE)); + assertTrue(tsvFactory.canCreateFromSample(explanation, TSV_SAMPLE, 0.0)); } public void testCanCreateTsvFromSampleGivenSemiColonDelimited() { - assertFalse(tsvFactory.canCreateFromSample(explanation, SEMI_COLON_DELIMITED_SAMPLE)); + assertFalse(tsvFactory.canCreateFromSample(explanation, SEMI_COLON_DELIMITED_SAMPLE, 0.0)); } public void testCanCreateTsvFromSampleGivenPipeDelimited() { - assertFalse(tsvFactory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE)); + assertFalse(tsvFactory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE, 0.0)); } public void testCanCreateTsvFromSampleGivenText() { - assertFalse(tsvFactory.canCreateFromSample(explanation, TEXT_SAMPLE)); + assertFalse(tsvFactory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0)); } // Semi-colon delimited - no need to check NDJSON, XML, CSV or TSV because they come earlier in the order we check formats public void testCanCreateSemiColonDelimitedFromSampleGivenSemiColonDelimited() { - assertTrue(semiColonDelimitedfactory.canCreateFromSample(explanation, SEMI_COLON_DELIMITED_SAMPLE)); + assertTrue(semiColonDelimitedfactory.canCreateFromSample(explanation, SEMI_COLON_DELIMITED_SAMPLE, 0.0)); } public void testCanCreateSemiColonDelimitedFromSampleGivenPipeDelimited() { - assertFalse(semiColonDelimitedfactory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE)); + assertFalse(semiColonDelimitedfactory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE, 0.0)); } public void testCanCreateSemiColonDelimitedFromSampleGivenText() { - assertFalse(semiColonDelimitedfactory.canCreateFromSample(explanation, TEXT_SAMPLE)); + assertFalse(semiColonDelimitedfactory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0)); } // Pipe delimited - no need to check 
NDJSON, XML, CSV, TSV or semi-colon delimited @@ -83,11 +83,11 @@ public void testCanCreateSemiColonDelimitedFromSampleGivenText() { public void testCanCreatePipeDelimitedFromSampleGivenPipeDelimited() { - assertTrue(pipeDelimitedFactory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE)); + assertTrue(pipeDelimitedFactory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE, 0.0)); } public void testCanCreatePipeDelimitedFromSampleGivenText() { - assertFalse(pipeDelimitedFactory.canCreateFromSample(explanation, TEXT_SAMPLE)); + assertFalse(pipeDelimitedFactory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0)); } } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java index 629d55abf1731..3f08c2c3e6f93 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java @@ -37,7 +37,7 @@ public void testCreateConfigsGivenCompleteCsv() throws Exception { String sample = "time,message\n" + "2018-05-17T13:41:23,hello\n" + "2018-05-17T13:41:32,hello again\n"; - assertTrue(csvFactory.canCreateFromSample(explanation, sample)); + assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); @@ -65,6 +65,99 @@ public void testCreateConfigsGivenCompleteCsv() throws Exception { assertEquals(Collections.singletonList("ISO8601"), structure.getJodaTimestampFormats()); } + public void testCreateConfigsGivenIncompleteCsv() throws Exception { + String sample = "time,message\n" + + "2018-05-17T13:41:23,hello\n" + + "badrow\n" + // REALLY bad row + "2018-05-17T13:41:25,hello\n" + + 
"2018-05-17T13:41:26,hello\n" + + "2018-05-17T13:41:27,hello\n" + + "2018-05-17T13:41:28,hello\n" + + "2018-05-17T13:41:29,hello\n" + + "2018-05-17T13:41:30,hello\n" + + "2018-05-17T13:41:31,hello\n" + + "2018-05-17T13:41:32,hello\n" + + "2018-05-17T13:41:35\n" + // Just missing the column + "2018-05-17T13:41:33,hello again\n"; + assertFalse(csvFactory.canCreateFromSample(explanation, sample, 0.05)); + assertTrue("assertion failed. Explanation " + explanation, + csvFactory.canCreateFromSample(explanation, sample, 0.10)); + + String charset = randomFrom(POSSIBLE_CHARSETS); + Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + + + FileStructure structure = structureFinder.getStructure(); + + assertEquals(FileStructure.Format.DELIMITED, structure.getFormat()); + assertEquals(charset, structure.getCharset()); + if (hasByteOrderMarker == null) { + assertNull(structure.getHasByteOrderMarker()); + } else { + assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); + } + assertEquals("^\"?time\"?,\"?message\"?", structure.getExcludeLinesPattern()); + assertEquals("time", structure.getTimestampField()); + assertEquals(Collections.singletonList("ISO8601"), structure.getJodaTimestampFormats()); + assertEquals(Arrays.asList("time", "message"), structure.getColumnNames()); + assertEquals(Character.valueOf(','), structure.getDelimiter()); + assertEquals(Character.valueOf('"'), structure.getQuote()); + assertEquals(structure.getNumMessagesAnalyzed(), 10); + assertTrue(structure.getHasHeaderRow()); + assertNull(structure.getMultilineStartPattern()); + assertNull(structure.getShouldTrimFields()); + assertNull(structure.getGrokPattern()); + } + + public void testCreateConfigsGivenIncompleteCsvWithMultiLinedRows() throws 
Exception { + String sample = "time,message\n" + + "2018-05-17T13:41:23,\"hello\nnew line\"\n" + + "\"badrow\n\n\n\n\"\n" + // REALLY bad row + "2018-05-17T13:41:25,\"hello\nnew line\"\n" + + "2018-05-17T13:41:26,\"hello\nnew line\"\n" + + "2018-05-17T13:41:27,\"hello\nnew line\"\n" + + "2018-05-17T13:41:28,\"hello\nnew line\"\n" + + "2018-05-17T13:41:29,\"hello\nnew line\"\n" + + "2018-05-17T13:41:30,\"hello\nnew line\"\n" + + "2018-05-17T13:41:31,\"hello\nnew line\"\n" + + "2018-05-17T13:41:32,\"hello\nnew line\"\n" + + "2018-05-17T13:41:35\n" + // Just missing the column + "2018-05-17T13:41:33,\"hello again\nnew line\"\n"; + assertFalse(csvFactory.canCreateFromSample(explanation, sample, 0.05)); + assertTrue("assertion failed. Explanation " + explanation, + csvFactory.canCreateFromSample(explanation, sample, 0.10)); + + String charset = randomFrom(POSSIBLE_CHARSETS); + Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, + FileStructureOverrides.builder().setQuote('"').build(), + NOOP_TIMEOUT_CHECKER); + + FileStructure structure = structureFinder.getStructure(); + + assertEquals(FileStructure.Format.DELIMITED, structure.getFormat()); + assertEquals(charset, structure.getCharset()); + if (hasByteOrderMarker == null) { + assertNull(structure.getHasByteOrderMarker()); + } else { + assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); + } + assertEquals("^\"?time\"?,\"?message\"?", structure.getExcludeLinesPattern()); + assertEquals("time", structure.getTimestampField()); + assertEquals(Collections.singletonList("ISO8601"), structure.getJodaTimestampFormats()); + assertEquals(Arrays.asList("time", "message"), structure.getColumnNames()); + assertEquals(Character.valueOf(','), structure.getDelimiter()); + assertEquals(Character.valueOf('"'), structure.getQuote()); + 
assertEquals(structure.getNumMessagesAnalyzed(), 10); + assertTrue(structure.getHasHeaderRow()); + assertEquals("^\"?\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", structure.getMultilineStartPattern()); + assertNull(structure.getShouldTrimFields()); + assertNull(structure.getGrokPattern()); + } + public void testCreateConfigsGivenCompleteCsvAndColumnNamesOverride() throws Exception { FileStructureOverrides overrides = FileStructureOverrides.builder().setColumnNames(Arrays.asList("my_time", "my_message")).build(); @@ -72,7 +165,7 @@ public void testCreateConfigsGivenCompleteCsvAndColumnNamesOverride() throws Exc String sample = "time,message\n" + "2018-05-17T13:41:23,hello\n" + "2018-05-17T13:41:32,hello again\n"; - assertTrue(csvFactory.canCreateFromSample(explanation, sample)); + assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); @@ -109,7 +202,7 @@ public void testCreateConfigsGivenCompleteCsvAndHasHeaderRowOverride() throws Ex String sample = "time,message\n" + "2018-05-17T13:41:23,hello\n" + "2018-05-17T13:41:32,hello again\n"; - assertTrue(csvFactory.canCreateFromSample(explanation, sample)); + assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); @@ -142,7 +235,7 @@ public void testCreateConfigsGivenCsvWithIncompleteLastRecord() throws Exception "2018-05-17T13:41:23,\"hello\n" + "world\",1\n" + "2019-01-18T14:46:57,\"hello again\n"; // note that this last record is truncated - assertTrue(csvFactory.canCreateFromSample(explanation, sample)); + assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); @@ -177,7 +270,7 @@ public void testCreateConfigsGivenCsvWithTrailingNulls() throws 
Exception { "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" + "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" + "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n"; - assertTrue(csvFactory.canCreateFromSample(explanation, sample)); + assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); @@ -222,7 +315,7 @@ public void testCreateConfigsGivenCsvWithTrailingNullsAndOverriddenTimeField() t "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" + "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" + "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n"; - assertTrue(csvFactory.canCreateFromSample(explanation, sample)); + assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); @@ -262,7 +355,7 @@ public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeader() throws Exce "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" + "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" + "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n"; - assertTrue(csvFactory.canCreateFromSample(explanation, sample)); + assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); @@ -309,7 +402,7 @@ public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeaderAndColumnNames "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" 
+ "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" + "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n"; - assertTrue(csvFactory.canCreateFromSample(explanation, sample)); + assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); @@ -347,7 +440,7 @@ public void testCreateConfigsGivenCsvWithTimeLastColumn() throws Exception { String sample = "\"pos_id\",\"trip_id\",\"latitude\",\"longitude\",\"altitude\",\"timestamp\"\n" + "\"1\",\"3\",\"4703.7815\",\"1527.4713\",\"359.9\",\"2017-01-19 16:19:04.742113\"\n" + "\"2\",\"3\",\"4703.7815\",\"1527.4714\",\"359.9\",\"2017-01-19 16:19:05.741890\"\n"; - assertTrue(csvFactory.canCreateFromSample(explanation, sample)); + assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); @@ -387,7 +480,7 @@ public void testCreateConfigsGivenTsvWithSyslogLikeTimestamp() throws Exception "25.76615\t18.436565\t\"25.7661500000,18.4365650000\"\tJul 1 2019 12:06:08\n" + "25.76896\t18.43586\t\"25.7689600000,18.4358600000\"\tJul 1 2019 12:13:50\n" + "25.76423\t18.43705\t\"25.7642300000,18.4370500000\"\tJul 1 2019 12:39:10\n"; - assertTrue(tsvFactory.canCreateFromSample(explanation, sample)); + assertTrue(tsvFactory.canCreateFromSample(explanation, sample, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); @@ -421,7 +514,7 @@ public void testCreateConfigsGivenDotInFieldName() throws Exception { String sample = "time.iso8601,message\n" + "2018-05-17T13:41:23,hello\n" + "2018-05-17T13:41:32,hello again\n"; - assertTrue(csvFactory.canCreateFromSample(explanation, sample)); + assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0)); String 
charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderTests.java index 048d2708e7740..b8bbe2e1491f9 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderTests.java @@ -14,7 +14,7 @@ public class NdJsonFileStructureFinderTests extends FileStructureTestCase { private FileStructureFinderFactory factory = new NdJsonFileStructureFinderFactory(); public void testCreateConfigsGivenGoodJson() throws Exception { - assertTrue(factory.canCreateFromSample(explanation, NDJSON_SAMPLE)); + assertTrue(factory.canCreateFromSample(explanation, NDJSON_SAMPLE, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/NdNdJsonFileStructureFinderFactoryTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/NdNdJsonFileStructureFinderFactoryTests.java index 63353201669b4..5736b0815ffe7 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/NdNdJsonFileStructureFinderFactoryTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/NdNdJsonFileStructureFinderFactoryTests.java @@ -11,36 +11,36 @@ public class NdNdJsonFileStructureFinderFactoryTests extends FileStructureTestCa public void testCanCreateFromSampleGivenNdJson() { - assertTrue(factory.canCreateFromSample(explanation, NDJSON_SAMPLE)); + assertTrue(factory.canCreateFromSample(explanation, NDJSON_SAMPLE, 0.0)); } 
public void testCanCreateFromSampleGivenXml() { - assertFalse(factory.canCreateFromSample(explanation, XML_SAMPLE)); + assertFalse(factory.canCreateFromSample(explanation, XML_SAMPLE, 0.0)); } public void testCanCreateFromSampleGivenCsv() { - assertFalse(factory.canCreateFromSample(explanation, CSV_SAMPLE)); + assertFalse(factory.canCreateFromSample(explanation, CSV_SAMPLE, 0.0)); } public void testCanCreateFromSampleGivenTsv() { - assertFalse(factory.canCreateFromSample(explanation, TSV_SAMPLE)); + assertFalse(factory.canCreateFromSample(explanation, TSV_SAMPLE, 0.0)); } public void testCanCreateFromSampleGivenSemiColonDelimited() { - assertFalse(factory.canCreateFromSample(explanation, SEMI_COLON_DELIMITED_SAMPLE)); + assertFalse(factory.canCreateFromSample(explanation, SEMI_COLON_DELIMITED_SAMPLE, 0.0)); } public void testCanCreateFromSampleGivenPipeDelimited() { - assertFalse(factory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE)); + assertFalse(factory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE, 0.0)); } public void testCanCreateFromSampleGivenText() { - assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE)); + assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0)); } } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactoryTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactoryTests.java index 0148de7fd6ae0..33fa71e4e92ec 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactoryTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactoryTests.java @@ -14,6 +14,6 @@ public class TextLogFileStructureFinderFactoryTests extends FileStructureTestCas public void testCanCreateFromSampleGivenText() { - 
assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE)); + assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0)); } } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java index 4c921c8a9f9ba..90212c4cc81b1 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java @@ -30,7 +30,7 @@ public void testCreateConfigsGivenLowLineMergeSizeLimit() { "continuation line 2.4\n" + "2019-05-16 16:56:14 line 3 abcdefghijklmnopqrstuvwxyz\n"; - assertTrue(factory.canCreateFromSample(explanation, sample)); + assertTrue(factory.canCreateFromSample(explanation, sample, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); @@ -44,7 +44,7 @@ public void testCreateConfigsGivenLowLineMergeSizeLimit() { } public void testCreateConfigsGivenElasticsearchLog() throws Exception { - assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE)); + assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); @@ -85,7 +85,7 @@ public void testCreateConfigsGivenElasticsearchLogAndTimestampFormatOverride() t FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("M/d/yyyy h:mma").build(); - assertTrue(factory.canCreateFromSample(explanation, sample)); + assertTrue(factory.canCreateFromSample(explanation, sample, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); @@ -121,7 +121,7 @@ public void 
testCreateConfigsGivenElasticsearchLogAndTimestampFieldOverride() th FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampField("my_time").build(); - assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE)); + assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); @@ -158,7 +158,7 @@ public void testCreateConfigsGivenElasticsearchLogAndGrokPatternOverride() throw FileStructureOverrides overrides = FileStructureOverrides.builder().setGrokPattern("\\[%{TIMESTAMP_ISO8601:timestamp}\\]" + "\\[%{LOGLEVEL:loglevel} *\\]\\[%{JAVACLASS:class} *\\] \\[%{HOSTNAME:node}\\] %{JAVALOGMESSAGE:message}").build(); - assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE)); + assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); @@ -199,7 +199,7 @@ public void testCreateConfigsGivenElasticsearchLogAndImpossibleGrokPatternOverri FileStructureOverrides overrides = FileStructureOverrides.builder().setGrokPattern("\\[%{LOGLEVEL:loglevel} *\\]" + "\\[%{HOSTNAME:node}\\]\\[%{TIMESTAMP_ISO8601:timestamp}\\] \\[%{JAVACLASS:class} *\\] %{JAVALOGMESSAGE:message}").build(); - assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE)); + assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactoryTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactoryTests.java index 28368ebf73450..aabcde85cc0dc 100644 --- 
a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactoryTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactoryTests.java @@ -13,31 +13,31 @@ public class XmlFileStructureFinderFactoryTests extends FileStructureTestCase { public void testCanCreateFromSampleGivenXml() { - assertTrue(factory.canCreateFromSample(explanation, XML_SAMPLE)); + assertTrue(factory.canCreateFromSample(explanation, XML_SAMPLE, 0.0)); } public void testCanCreateFromSampleGivenCsv() { - assertFalse(factory.canCreateFromSample(explanation, CSV_SAMPLE)); + assertFalse(factory.canCreateFromSample(explanation, CSV_SAMPLE, 0.0)); } public void testCanCreateFromSampleGivenTsv() { - assertFalse(factory.canCreateFromSample(explanation, TSV_SAMPLE)); + assertFalse(factory.canCreateFromSample(explanation, TSV_SAMPLE, 0.0)); } public void testCanCreateFromSampleGivenSemiColonDelimited() { - assertFalse(factory.canCreateFromSample(explanation, SEMI_COLON_DELIMITED_SAMPLE)); + assertFalse(factory.canCreateFromSample(explanation, SEMI_COLON_DELIMITED_SAMPLE, 0.0)); } public void testCanCreateFromSampleGivenPipeDelimited() { - assertFalse(factory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE)); + assertFalse(factory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE, 0.0)); } public void testCanCreateFromSampleGivenText() { - assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE)); + assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0)); } } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java index 9ad07f6142782..2fa9f4a4bde10 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java +++ 
b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java @@ -14,7 +14,7 @@ public class XmlFileStructureFinderTests extends FileStructureTestCase { private FileStructureFinderFactory factory = new XmlFileStructureFinderFactory(); public void testCreateConfigsGivenGoodXml() throws Exception { - assertTrue(factory.canCreateFromSample(explanation, XML_SAMPLE)); + assertTrue(factory.canCreateFromSample(explanation, XML_SAMPLE, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);