diff --git a/lib/src/main/java/de/edux/data/handler/AverageFillIncompleteRecordsHandler.java b/lib/src/main/java/de/edux/data/handler/AverageFillIncompleteRecordsHandler.java index e84ed47..8234e4d 100644 --- a/lib/src/main/java/de/edux/data/handler/AverageFillIncompleteRecordsHandler.java +++ b/lib/src/main/java/de/edux/data/handler/AverageFillIncompleteRecordsHandler.java @@ -28,12 +28,13 @@ private List averageFillRecordsWithIncompleteNumericalFeature( } } - if (validFeatureCount == 0) { - continue; + if (validFeatureCount < dataset.size() * 0.5) { + throw new RuntimeException( + "Less than 50% of the records will be used to calculate the fill values. " + + "Consider using another IncompleteRecordsHandlerStrategy or handle this exception."); } average = sum / validFeatureCount; - for (String[] record : dataset) { if (!isCompleteFeature(record[columnIndex])) { record[columnIndex] = String.valueOf(average); @@ -41,20 +42,31 @@ private List averageFillRecordsWithIncompleteNumericalFeature( } } } + return dataset; } private List dropRecordsWithIncompleteCategoricalFeature( List dataset, List typeOfFeatures) { + List cleanedDataset = dataset; for (int columnIndex = 0; columnIndex < typeOfFeatures.size(); columnIndex++) { if (typeOfFeatures.get(columnIndex).equals("categorical")) { int columnIndexFin = columnIndex; - dataset = - dataset.stream().filter(record -> isCompleteFeature(record[columnIndexFin])).toList(); + cleanedDataset = + cleanedDataset.stream() + .filter(record -> isCompleteFeature(record[columnIndexFin])) + .toList(); } } - return dataset; + + if (cleanedDataset.size() < dataset.size() * 0.5) { + throw new RuntimeException( + "More than 50% of the records will be dropped with this IncompleteRecordsHandlerStrategy. " + + "Consider using another IncompleteRecordsHandlerStrategy or handle this exception."); + } + + return cleanedDataset; } private List getFeatureTypes(List dataset) { @@ -72,6 +84,10 @@ private List getFeatureTypes(List dataset) { } break; } + + if (featureTypes.isEmpty()) { + throw new RuntimeException("At least one full record needed with valid features"); + } return featureTypes; } diff --git a/lib/src/main/java/de/edux/data/handler/DropIncompleteRecordsHandler.java b/lib/src/main/java/de/edux/data/handler/DropIncompleteRecordsHandler.java index 49eb229..0ec1d09 100644 --- a/lib/src/main/java/de/edux/data/handler/DropIncompleteRecordsHandler.java +++ b/lib/src/main/java/de/edux/data/handler/DropIncompleteRecordsHandler.java @@ -5,8 +5,16 @@ public class DropIncompleteRecordsHandler implements IIncompleteRecordsHandler { @Override public List getCleanedDataset(List dataset) { + List cleanedDataset = + dataset.stream().filter(this::containsOnlyCompletedFeatures).toList(); - return dataset.stream().filter(this::containsOnlyCompletedFeatures).toList(); + if (cleanedDataset.size() < dataset.size() * 0.5) { + throw new RuntimeException( + "More than 50% of the records will be dropped with this IncompleteRecordsHandlerStrategy. " + + "Consider using another IncompleteRecordsHandlerStrategy or handle this exception."); + } + + return cleanedDataset; } private boolean containsOnlyCompletedFeatures(String[] record) { diff --git a/lib/src/test/java/de/edux/data/handler/AverageFillIncompleteRecordHandlerTest.java b/lib/src/test/java/de/edux/data/handler/AverageFillIncompleteRecordHandlerTest.java index 5ae25d5..eb9681d 100644 --- a/lib/src/test/java/de/edux/data/handler/AverageFillIncompleteRecordHandlerTest.java +++ b/lib/src/test/java/de/edux/data/handler/AverageFillIncompleteRecordHandlerTest.java @@ -26,41 +26,51 @@ void initializeList() { void dropRecordsWithIncompleteCategoricalFeature() { this.dataset.add(new String[] {"A", "1", "A"}); - this.dataset.add(new String[] {"", "1", ""}); + this.dataset.add(new String[] {"", "2", ""}); this.dataset.add(new String[] {"C", "", "C"}); - this.dataset.add(new String[] {"D", "1", ""}); - this.dataset.add(new String[] {"E", "1", "E"}); - for (String[] data : dataset) { - System.out.println(Arrays.toString(data)); - } - - List cleanedDataset = incompleteRecordHandler.getCleanedDataset(dataset); - System.out.println("----------------------------------------------------"); - for (String[] data : cleanedDataset) { - System.out.println(Arrays.toString(data)); - } - - assertEquals(3, cleanedDataset.size()); + this.dataset.add(new String[] {"D", "3", ""}); + this.dataset.add(new String[] {"E", "4", "E"}); + + assertAll( + () -> assertEquals(3, incompleteRecordHandler.getCleanedDataset(dataset).size()), + () -> + assertEquals( + 2.5, Double.valueOf(incompleteRecordHandler.getCleanedDataset(dataset).get(1)[1]))); } @Test - void fillWithAverageValues() { + void testThrowRuntimeExceptionForDroppingMoreThanHalfOfOriginalDataset() { - this.dataset.add(new String[] {"A", "1", "A"}); - this.dataset.add(new String[] {"", "1", ""}); - this.dataset.add(new String[] {"C", "", "C"}); - this.dataset.add(new String[] {"D", "1", ""}); - this.dataset.add(new String[] {"E", "1", "E"}); - for (String[] data : dataset) { - System.out.println(Arrays.toString(data)); - } - - List cleanedDataset = incompleteRecordHandler.getCleanedDataset(dataset); - System.out.println("----------------------------------------------------"); - for (String[] data : cleanedDataset) { - System.out.println(Arrays.toString(data)); - } - - assertEquals(1, Integer.valueOf(cleanedDataset.get(2)[1])); + this.dataset.add(new String[] {"", "1", "A"}); + this.dataset.add(new String[] {"B", "2", "B"}); + this.dataset.add(new String[] {"C", "3", "C"}); + this.dataset.add(new String[] {"D", "4", ""}); + this.dataset.add(new String[] {"", "5", "E"}); + + assertThrows(RuntimeException.class, () -> incompleteRecordHandler.getCleanedDataset(dataset)); + } + + @Test + void testThrowRuntimeExceptionForZeroValidNumericalFeatures() { + + this.dataset.add(new String[] {"A", "", "A"}); + this.dataset.add(new String[] {"B", "", "B"}); + this.dataset.add(new String[] {"C", "1", "C"}); + this.dataset.add(new String[] {"D", "", "D"}); + this.dataset.add(new String[] {"E", "", "E"}); + + assertThrows(RuntimeException.class, () -> incompleteRecordHandler.getCleanedDataset(dataset)); + } + + @Test + void testThrowRuntimeExceptionForAtLeastOneFullValidRecord() { + + this.dataset.add(new String[] {"", "1", "A"}); + this.dataset.add(new String[] {"B", "2", ""}); + this.dataset.add(new String[] {"", "", "C"}); + this.dataset.add(new String[] {"D", "3", ""}); + this.dataset.add(new String[] {"", "4", "E"}); + + assertThrows(RuntimeException.class, () -> incompleteRecordHandler.getCleanedDataset(dataset)); } } diff --git a/lib/src/test/java/de/edux/data/handler/DropIncompleteRecordHandlerTest.java b/lib/src/test/java/de/edux/data/handler/DropIncompleteRecordHandlerTest.java index f8868e5..92e0072 100644 --- a/lib/src/test/java/de/edux/data/handler/DropIncompleteRecordHandlerTest.java +++ b/lib/src/test/java/de/edux/data/handler/DropIncompleteRecordHandlerTest.java @@ -7,6 +7,7 @@ import java.util.List; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; class DropIncompleteRecordHandlerTest { private List dataset; @@ -28,8 +29,7 @@ void testDropZeroIncompleteResults() { this.dataset.add(new String[] {"A", "B", "C"}); this.dataset.add(new String[] {"A", "B", "C"}); - List cleanedDataset = incompleteRecordHandler.getCleanedDataset(dataset); - assertEquals(5, cleanedDataset.size()); + assertEquals(5, incompleteRecordHandler.getCleanedDataset(dataset).size()); } @Test @@ -41,33 +41,30 @@ void testDropOneIncompleteResult() { this.dataset.add(new String[] {"A", "B", "C"}); this.dataset.add(new String[] {"A", "B", "C"}); - List cleanedDataset = incompleteRecordHandler.getCleanedDataset(dataset); - assertEquals(4, cleanedDataset.size()); + assertEquals(4, incompleteRecordHandler.getCleanedDataset(dataset).size()); } @Test - void testDropThreeIncompleteResults() { + void testDropTwoIncompleteResult() { this.dataset.add(new String[] {"A", "B", "C"}); - this.dataset.add(new String[] {"", "B", "C"}); this.dataset.add(new String[] {"A", "", "C"}); - this.dataset.add(new String[] {"A", "B", ""}); + this.dataset.add(new String[] {"A", "", "C"}); + this.dataset.add(new String[] {"A", "B", "C"}); this.dataset.add(new String[] {"A", "B", "C"}); - List cleanedDataset = incompleteRecordHandler.getCleanedDataset(dataset); - assertEquals(2, cleanedDataset.size()); + assertEquals(3, incompleteRecordHandler.getCleanedDataset(dataset).size()); } @Test - void testDropAllIncompleteResults() { + void testThrowRuntimeExceptionForDroppingMoreThanHalfOfOriginalDataset() { - this.dataset.add(new String[] {"A", "", "C"}); + this.dataset.add(new String[] {"A", "B", "C"}); this.dataset.add(new String[] {"", "B", "C"}); this.dataset.add(new String[] {"A", "", "C"}); this.dataset.add(new String[] {"A", "B", ""}); - this.dataset.add(new String[] {"A", "", "C"}); + this.dataset.add(new String[] {"A", "B", "C"}); - List cleanedDataset = incompleteRecordHandler.getCleanedDataset(dataset); - assertEquals(0, cleanedDataset.size()); + assertThrows(RuntimeException.class, () -> incompleteRecordHandler.getCleanedDataset(dataset)); } }