elastic · benwtrent · Apr 29, 2020 · Apr 24, 2020 · Apr 27, 2020 · Apr 27, 2020
diff --git a/...ain/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java b/...ain/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java
@@ -5,6 +5,7 @@
  */
 package org.elasticsearch.xpack.ml.filestructurefinder;
 
+import org.apache.logging.log4j.message.ParameterizedMessage;
 import org.elasticsearch.common.collect.Tuple;
 import org.elasticsearch.xpack.core.ml.filestructurefinder.FieldStats;
 import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure;
@@ -80,6 +81,11 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List<String
         for (int index = isHeaderInFile ? 1 : 0; index < rows.size(); ++index) {
             List<String> row = rows.get(index);
             int lineNumber = lineNumbers.get(index);
+            // Indicates an illformatted row. We allow a certain number of these
+            if (row.size() != columnNames.length) {
+                prevMessageEndLineNumber = lineNumber;
+                continue;
+            }
             Map<String, String> sampleRecord = new LinkedHashMap<>();
             Util.filterListToMap(sampleRecord, columnNames,
                 trimFields ? row.stream().map(field -> (field == null) ? null : field.trim()).collect(Collectors.toList()) : row);
@@ -488,7 +494,7 @@ static boolean lineHasUnescapedQuote(String line, CsvPreference csvPreference) {
     }
 
     static boolean canCreateFromSample(List<String> explanation, String sample, int minFieldsPerRow, CsvPreference csvPreference,
-                                       String formatName) {
+                                       String formatName, double allowedFractionOfBadLines) {
 
         // Logstash's CSV parser won't tolerate fields where just part of the
         // value is quoted, whereas SuperCSV will, hence this extra check
@@ -506,6 +512,7 @@ static boolean canCreateFromSample(List<String> explanation, String sample, int
             int fieldsInFirstRow = -1;
             int fieldsInLastRow = -1;
 
+            List<Integer> illFormattedRows = new ArrayList<>();
             int numberOfRows = 0;
             try {
                 List<String> row;
@@ -529,16 +536,25 @@ static boolean canCreateFromSample(List<String> explanation, String sample, int
                         --fieldsInThisRow;
                     }
 
-                    if (fieldsInLastRow != fieldsInFirstRow) {
-                        explanation.add("Not " + formatName + " because row [" + (numberOfRows - 1) +
-                            "] has a different number of fields to the first row: [" + fieldsInFirstRow + "] and [" +
-                            fieldsInLastRow + "]");
-                        return false;
+                    if (fieldsInThisRow != fieldsInFirstRow) {
+                        illFormattedRows.add(fieldsInFirstRow);
+                        continue;
                     }
 
                     fieldsInLastRow = fieldsInThisRow;
                 }
 
+                // We should only allow a certain percentage of ill formatted rows
+                // as it may effects our sample sizing and down stream processing
+                if (illFormattedRows.size() > (allowedFractionOfBadLines * numberOfRows) ) {
+                    explanation.add(new ParameterizedMessage(
+                        "Not {} because more than {} rows did not have the same number of fields as the first row({}). Bad rows {}",
+                        formatName,
+                        illFormattedRows.size(),
+                        fieldsInFirstRow,
+                        illFormattedRows).getFormattedMessage());
+                    return false;
+                }
                 if (fieldsInLastRow > fieldsInFirstRow) {
                     explanation.add("Not " + formatName + " because last row has more fields than first row: [" + fieldsInFirstRow +
                         "] and [" + fieldsInLastRow + "]");

diff --git a/...a/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java b/...a/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java
@@ -14,6 +14,7 @@
 
 public class DelimitedFileStructureFinderFactory implements FileStructureFinderFactory {
 
+    static final double DEFAULT_BAD_ROWS_PERCENTAGE = 0.10d;
     private final CsvPreference csvPreference;
     private final int minFieldsPerRow;
     private final boolean trimFields;
@@ -44,7 +45,7 @@ public boolean canFindFormat(FileStructure.Format format) {
      * it could have been truncated when the file was sampled.
      */
     @Override
-    public boolean canCreateFromSample(List<String> explanation, String sample) {
+    public boolean canCreateFromSample(List<String> explanation, String sample, double allowedFractionOfBadLines) {
         String formatName;
         switch ((char) csvPreference.getDelimiterChar()) {
             case ',':
@@ -57,7 +58,12 @@ public boolean canCreateFromSample(List<String> explanation, String sample) {
                 formatName = Character.getName(csvPreference.getDelimiterChar()).toLowerCase(Locale.ROOT) + " delimited values";
                 break;
         }
-        return DelimitedFileStructureFinder.canCreateFromSample(explanation, sample, minFieldsPerRow, csvPreference, formatName);
+        return DelimitedFileStructureFinder.canCreateFromSample(explanation,
+            sample,
+            minFieldsPerRow,
+            csvPreference,
+            formatName,
+            allowedFractionOfBadLines);
     }
 
     @Override

diff --git a/.../main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java b/.../main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java
@@ -25,10 +25,11 @@ public interface FileStructureFinderFactory {
      * @param explanation List of reasons for making decisions.  May contain items when passed and new reasons
      *                    can be appended by this method.
      * @param sample A sample from the file to be ingested.
+     * @param allowedFractionOfBadLines How many lines of the passed sample are allowed to be considered "bad".
      * @return <code>true</code> if this factory can create an appropriate
      *         file structure given the sample; otherwise <code>false</code>.
      */
-    boolean canCreateFromSample(List<String> explanation, String sample);
+    boolean canCreateFromSample(List<String> explanation, String sample, double allowedFractionOfBadLines);
 
     /**
      * Create an object representing the structure of a file.

diff --git a/.../main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java b/.../main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java
@@ -474,14 +474,17 @@ FileStructureFinder makeBestStructureFinder(List<String> explanation, String sam
         Character quote = overrides.getQuote();
         Boolean shouldTrimFields = overrides.getShouldTrimFields();
         List<FileStructureFinderFactory> factories;
+        double allowedFractionOfBadLines = 0.0;
         if (delimiter != null) {
+            allowedFractionOfBadLines = DelimitedFileStructureFinderFactory.DEFAULT_BAD_ROWS_PERCENTAGE;
 
             // If a precise delimiter is specified, we only need one structure finder
             // factory, and we'll tolerate as little as one column in the input
             factories = Collections.singletonList(new DelimitedFileStructureFinderFactory(delimiter, (quote == null) ? '"' : quote, 1,
                 (shouldTrimFields == null) ? (delimiter == '|') : shouldTrimFields));
 
         } else if (quote != null || shouldTrimFields != null) {
+            allowedFractionOfBadLines = DelimitedFileStructureFinderFactory.DEFAULT_BAD_ROWS_PERCENTAGE;
 
             // The delimiter is not specified, but some other aspect of delimited files is,
             // so clone our default delimited factories altering the overridden values
@@ -499,7 +502,7 @@ FileStructureFinder makeBestStructureFinder(List<String> explanation, String sam
 
         for (FileStructureFinderFactory factory : factories) {
             timeoutChecker.check("high level format detection");
-            if (factory.canCreateFromSample(explanation, sample)) {
+            if (factory.canCreateFromSample(explanation, sample, allowedFractionOfBadLines)) {
                 return factory.createFromSample(explanation, sample, charsetName, hasByteOrderMarker, lineMergeSizeLimit, overrides,
                     timeoutChecker);
             }

diff --git a/...java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderFactory.java b/...java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderFactory.java
@@ -30,7 +30,7 @@ public boolean canFindFormat(FileStructure.Format format) {
      * documents must be non-empty, to prevent lines containing "{}" from matching.
      */
     @Override
-    public boolean canCreateFromSample(List<String> explanation, String sample) {
+    public boolean canCreateFromSample(List<String> explanation, String sample, double allowedFractionOfBadLines) {
 
         int completeDocCount = 0;
 

diff --git a/...ava/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java b/...ava/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java
@@ -25,7 +25,7 @@ public boolean canFindFormat(FileStructure.Format format) {
      * non-blank lines.
      */
     @Override
-    public boolean canCreateFromSample(List<String> explanation, String sample) {
+    public boolean canCreateFromSample(List<String> explanation, String sample, double allowedFractionOfBadLines) {
         if (sample.indexOf('\n') < 0) {
             explanation.add("Not text because sample contains no newlines");
             return false;

diff --git a/...in/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java b/...in/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java
@@ -43,7 +43,7 @@ public boolean canFindFormat(FileStructure.Format format) {
      * necessarily have to be complete (as the sample could have truncated it).
      */
     @Override
-    public boolean canCreateFromSample(List<String> explanation, String sample) {
+    public boolean canCreateFromSample(List<String> explanation, String sample, double allowedFractionOfBadLines) {
 
         int completeDocCount = 0;
         String commonRootElementName = null;

diff --git a/.../elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactoryTests.java b/.../elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactoryTests.java
@@ -16,78 +16,78 @@ public class DelimitedFileStructureFinderFactoryTests extends FileStructureTestC
 
     public void testCanCreateCsvFromSampleGivenCsv() {
 
-        assertTrue(csvFactory.canCreateFromSample(explanation, CSV_SAMPLE));
+        assertTrue(csvFactory.canCreateFromSample(explanation, CSV_SAMPLE, 0.0));
     }
 
     public void testCanCreateCsvFromSampleGivenTsv() {
 
-        assertFalse(csvFactory.canCreateFromSample(explanation, TSV_SAMPLE));
+        assertFalse(csvFactory.canCreateFromSample(explanation, TSV_SAMPLE, 0.0));
     }
 
     public void testCanCreateCsvFromSampleGivenSemiColonDelimited() {
 
-        assertFalse(csvFactory.canCreateFromSample(explanation, SEMI_COLON_DELIMITED_SAMPLE));
+        assertFalse(csvFactory.canCreateFromSample(explanation, SEMI_COLON_DELIMITED_SAMPLE, 0.0));
     }
 
     public void testCanCreateCsvFromSampleGivenPipeDelimited() {
 
-        assertFalse(csvFactory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE));
+        assertFalse(csvFactory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE, 0.0));
     }
 
     public void testCanCreateCsvFromSampleGivenText() {
 
-        assertFalse(csvFactory.canCreateFromSample(explanation, TEXT_SAMPLE));
+        assertFalse(csvFactory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0));
     }
 
     // TSV - no need to check NDJSON, XML or CSV because they come earlier in the order we check formats
 
     public void testCanCreateTsvFromSampleGivenTsv() {
 
-        assertTrue(tsvFactory.canCreateFromSample(explanation, TSV_SAMPLE));
+        assertTrue(tsvFactory.canCreateFromSample(explanation, TSV_SAMPLE, 0.0));
     }
 
     public void testCanCreateTsvFromSampleGivenSemiColonDelimited() {
 
-        assertFalse(tsvFactory.canCreateFromSample(explanation, SEMI_COLON_DELIMITED_SAMPLE));
+        assertFalse(tsvFactory.canCreateFromSample(explanation, SEMI_COLON_DELIMITED_SAMPLE, 0.0));
     }
 
     public void testCanCreateTsvFromSampleGivenPipeDelimited() {
 
-        assertFalse(tsvFactory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE));
+        assertFalse(tsvFactory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE, 0.0));
     }
 
     public void testCanCreateTsvFromSampleGivenText() {
 
-        assertFalse(tsvFactory.canCreateFromSample(explanation, TEXT_SAMPLE));
+        assertFalse(tsvFactory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0));
     }
 
     // Semi-colon delimited - no need to check NDJSON, XML, CSV or TSV because they come earlier in the order we check formats
 
     public void testCanCreateSemiColonDelimitedFromSampleGivenSemiColonDelimited() {
 
-        assertTrue(semiColonDelimitedfactory.canCreateFromSample(explanation, SEMI_COLON_DELIMITED_SAMPLE));
+        assertTrue(semiColonDelimitedfactory.canCreateFromSample(explanation, SEMI_COLON_DELIMITED_SAMPLE, 0.0));
     }
 
     public void testCanCreateSemiColonDelimitedFromSampleGivenPipeDelimited() {
 
-        assertFalse(semiColonDelimitedfactory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE));
+        assertFalse(semiColonDelimitedfactory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE, 0.0));
     }
 
     public void testCanCreateSemiColonDelimitedFromSampleGivenText() {
 
-        assertFalse(semiColonDelimitedfactory.canCreateFromSample(explanation, TEXT_SAMPLE));
+        assertFalse(semiColonDelimitedfactory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0));
     }
 
     // Pipe delimited - no need to check NDJSON, XML, CSV, TSV or semi-colon delimited
     // values because they come earlier in the order we check formats
 
     public void testCanCreatePipeDelimitedFromSampleGivenPipeDelimited() {
 
-        assertTrue(pipeDelimitedFactory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE));
+        assertTrue(pipeDelimitedFactory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE, 0.0));
     }
 
     public void testCanCreatePipeDelimitedFromSampleGivenText() {
 
-        assertFalse(pipeDelimitedFactory.canCreateFromSample(explanation, TEXT_SAMPLE));
+        assertFalse(pipeDelimitedFactory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0));
     }
 }