Skip to content

Commit

Permalink
feat(Samyssmile#23): Prepare
Browse files Browse the repository at this point in the history
  • Loading branch information
Samuel Abramov authored and acsolle66 committed Oct 30, 2023
1 parent c498e8f commit 6f86149
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 3 deletions.
Original file line number Diff line number Diff line change
@@ -1,19 +1,24 @@
package de.edux.data.handler;

import java.util.ArrayList;
import java.util.List;

public class DropIncompleteRecordsHandler implements IIncompleteRecordsHandler {
@Override
public List<String[]> getCleanedDataset(List<String[]> dataset) {
List<String[]> cleanedDataset =
List<String[]> filteredList =
dataset.stream().filter(this::containsOnlyCompletedFeatures).toList();

if (cleanedDataset.size() < dataset.size() * 0.5) {
if (filteredList.size() < dataset.size() * 0.5) {
throw new RuntimeException(
"More than 50% of the records will be dropped with this IncompleteRecordsHandlerStrategy. "
+ "Consider using another IncompleteRecordsHandlerStrategy or handle this exception.");
}

List<String[]> cleanedDataset = new ArrayList<>();
for (String[] item : filteredList) {
cleanedDataset.add(item);
}
return cleanedDataset;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import de.edux.functions.imputation.ImputationStrategy;

import java.util.List;
import java.util.Optional;

public interface DataPostProcessor {
DataPostProcessor normalize();
Expand All @@ -18,4 +19,10 @@ public interface DataPostProcessor {
DataProcessor split(double splitRatio);


/**
 * Looks up the position of the column with the given name.
 *
 * <p>NOTE(review): presumably the result is empty when no column carries that name and the
 * index is zero-based; confirm against the implementations.
 *
 * @param columnName the name of the column to look up
 * @return the index of the column, wrapped in an {@link Optional}
 */
Optional<Integer> getIndexOfColumn(String columnName);

/**
 * Returns the values stored in the column with the given name.
 *
 * <p>NOTE(review): the behaviour for an unknown column name is not visible from this
 * interface; confirm against the implementations.
 *
 * @param columnName the name of the column whose values are requested
 * @return the values of that column
 */
String[] getColumnDataOf(String columnName);

/**
 * Returns the names of the columns known to this processor.
 *
 * @return the column names
 */
String[] getColumnNames();

}
2 changes: 1 addition & 1 deletion lib/src/main/java/de/edux/data/reader/CSVIDataReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

public class CSVIDataReader implements IDataReader {

public List<String[]> readFile(File file, char separator) {
public List<String[]> readFile(File file, char separator ) {
CSVParser customCSVParser = new CSVParserBuilder().withSeparator(separator).build();
List<String[]> result;
try(CSVReader reader = new CSVReaderBuilder(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package de.edux.data.handler;

import de.edux.data.provider.SeabornDataProcessor;
import de.edux.data.provider.SeabornProvider;
import org.junit.jupiter.api.Test;

import java.io.File;
import java.net.URL;
import java.util.Optional;

class DropIncompleteRecordsHandlerTest {
private static final boolean SHUFFLE = true;
private static final boolean SKIP_HEADLINE = true;
private static final EIncompleteRecordsHandlerStrategy INCOMPLETE_RECORD_HANDLER_STRATEGY = EIncompleteRecordsHandlerStrategy.DROP_RECORDS;
private static final double TRAIN_TEST_SPLIT_RATIO = 0.7;
private static final String CSV_FILE_PATH = "testdatasets/seaborn-penguins/penguins.csv";
private SeabornProvider seabornProvider;

@Test
void shouldReturnColumnData() {
URL url = DropIncompleteRecordsHandlerTest.class.getClassLoader().getResource(CSV_FILE_PATH);
if (url == null) {
throw new IllegalStateException("Cannot find file: " + CSV_FILE_PATH);
}
File csvFile = new File(url.getPath());
var seabornDataProcessor = new SeabornDataProcessor();
var dataset = seabornDataProcessor.loadDataSetFromCSV(csvFile, ',', true, true, INCOMPLETE_RECORD_HANDLER_STRATEGY);
seabornDataProcessor.normalize(dataset);
Optional<Integer> indexOfSpecies = seabornDataProcessor.getIndexOfColumn("species");
String[] speciesData = seabornDataProcessor.getColumnDataOf("species");

assert indexOfSpecies.isPresent();
assert speciesData.length > 0;
}
}

0 comments on commit 6f86149

Please sign in to comment.