-
Notifications
You must be signed in to change notification settings - Fork 16
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(#23): Replace filterIncompleteRecords boolean with Imputation Enum for Enhanced Data Handling #61
feat(#23): Replace filterIncompleteRecords boolean with Imputation Enum for Enhanced Data Handling #61
Changes from all commits
85eebf8
b72612f
af01c5a
8c969fd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
package de.edux.data.handler; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
public class AverageFillIncompleteRecordsHandler implements IIncompleteRecordsHandler { | ||
@Override | ||
public List<String[]> getCleanedDataset(List<String[]> dataset) { | ||
List<String> typeOfFeatures = getFeatureTypes(dataset); | ||
List<String[]> cleanedDataset = | ||
dropRecordsWithIncompleteCategoricalFeature(dataset, typeOfFeatures); | ||
|
||
return averageFillRecordsWithIncompleteNumericalFeature(cleanedDataset, typeOfFeatures); | ||
} | ||
|
||
private List<String[]> averageFillRecordsWithIncompleteNumericalFeature( | ||
List<String[]> dataset, List<String> typeOfFeatures) { | ||
for (int columnIndex = 0; columnIndex < typeOfFeatures.size(); columnIndex++) { | ||
int validFeatureCount = 0; | ||
double sum = 0; | ||
double average; | ||
|
||
if (typeOfFeatures.get(columnIndex).equals("numerical")) { | ||
for (String[] record : dataset) { | ||
if (isCompleteFeature(record[columnIndex])) { | ||
validFeatureCount++; | ||
sum += Double.parseDouble(record[columnIndex]); | ||
} | ||
} | ||
|
||
if (validFeatureCount < dataset.size() * 0.5) { | ||
throw new RuntimeException( | ||
"Less than 50% of the records will be used to calculate the fill values. " | ||
+ "Consider using another IncompleteRecordsHandlerStrategy or handle this exception."); | ||
} | ||
|
||
average = sum / validFeatureCount; | ||
for (String[] record : dataset) { | ||
if (!isCompleteFeature(record[columnIndex])) { | ||
record[columnIndex] = String.valueOf(average); | ||
} | ||
} | ||
} | ||
} | ||
|
||
return dataset; | ||
} | ||
|
||
private List<String[]> dropRecordsWithIncompleteCategoricalFeature( | ||
List<String[]> dataset, List<String> typeOfFeatures) { | ||
List<String[]> cleanedDataset = dataset; | ||
|
||
for (int columnIndex = 0; columnIndex < typeOfFeatures.size(); columnIndex++) { | ||
if (typeOfFeatures.get(columnIndex).equals("categorical")) { | ||
int columnIndexFin = columnIndex; | ||
cleanedDataset = | ||
cleanedDataset.stream() | ||
.filter(record -> isCompleteFeature(record[columnIndexFin])) | ||
.toList(); | ||
} | ||
} | ||
|
||
if (cleanedDataset.size() < dataset.size() * 0.5) { | ||
throw new RuntimeException( | ||
"More than 50% of the records will be dropped with this IncompleteRecordsHandlerStrategy. " | ||
+ "Consider using another IncompleteRecordsHandlerStrategy or handle this exception."); | ||
} | ||
|
||
return cleanedDataset; | ||
} | ||
|
||
private List<String> getFeatureTypes(List<String[]> dataset) { | ||
List<String> featureTypes = new ArrayList<>(); | ||
for (String[] record : dataset) { | ||
if (containsIncompleteFeature(record)) { | ||
continue; | ||
} | ||
for (String feature : record) { | ||
if (isNumeric(feature)) { | ||
featureTypes.add("numerical"); | ||
} else { | ||
featureTypes.add("categorical"); | ||
} | ||
} | ||
break; | ||
} | ||
|
||
if (featureTypes.isEmpty()) { | ||
throw new RuntimeException("At least one full record needed with valid features"); | ||
} | ||
return featureTypes; | ||
} | ||
|
||
private boolean isNumeric(String feature) { | ||
return feature.matches("-?\\d+(\\.\\d+)?"); | ||
} | ||
|
||
private boolean isCompleteFeature(String feature) { | ||
return !feature.isBlank(); | ||
} | ||
|
||
private boolean containsIncompleteFeature(String[] record) { | ||
for (String feature : record) { | ||
if (feature.isBlank()) { | ||
return true; | ||
} | ||
} | ||
return false; | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
package de.edux.data.handler; | ||
|
||
import java.util.List; | ||
|
||
public class DoNotHandleIncompleteRecords implements IIncompleteRecordsHandler { | ||
@Override | ||
public List<String[]> getCleanedDataset(List<String[]> dataset) { | ||
return dataset; | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
package de.edux.data.handler; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
public class DropIncompleteRecordsHandler implements IIncompleteRecordsHandler { | ||
@Override | ||
public List<String[]> getCleanedDataset(List<String[]> dataset) { | ||
List<String[]> filteredList = | ||
dataset.stream().filter(this::containsOnlyCompletedFeatures).toList(); | ||
|
||
if (filteredList.size() < dataset.size() * 0.5) { | ||
throw new RuntimeException( | ||
"More than 50% of the records will be dropped with this IncompleteRecordsHandlerStrategy. " | ||
+ "Consider using another IncompleteRecordsHandlerStrategy or handle this exception."); | ||
} | ||
|
||
List<String[]> cleanedDataset = new ArrayList<>(); | ||
for (String[] item : filteredList) { | ||
cleanedDataset.add(item); | ||
} | ||
return cleanedDataset; | ||
} | ||
|
||
private boolean containsOnlyCompletedFeatures(String[] record) { | ||
for (String feature : record) { | ||
if (feature.isBlank()) { | ||
return false; | ||
} | ||
} | ||
return true; | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
package de.edux.data.handler; | ||
|
||
public enum EIncompleteRecordsHandlerStrategy { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In Java World wie never prefix enums with 'E'. As in isssue#23 described you need name it "Imputation" here. Imputation .DROP_RECORDS.... |
||
DO_NOT_HANDLE(new DoNotHandleIncompleteRecords()), | ||
DROP_RECORDS(new DropIncompleteRecordsHandler()), | ||
FILL_RECORDS_WITH_AVERAGE(new AverageFillIncompleteRecordsHandler()); | ||
|
||
private final IIncompleteRecordsHandler incompleteRecordHandler; | ||
|
||
EIncompleteRecordsHandlerStrategy(IIncompleteRecordsHandler incompleteRecordHandler) { | ||
this.incompleteRecordHandler = incompleteRecordHandler; | ||
} | ||
|
||
public IIncompleteRecordsHandler getHandler() { | ||
return this.incompleteRecordHandler; | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
package de.edux.data.handler; | ||
|
||
import java.util.List; | ||
|
||
/**
 * Contract for strategies that deal with records containing missing features.
 *
 * <p>NOTE(review): a reviewer suggested the name {@code IImputationHandler} for this interface;
 * the current name is kept here because renaming it changes the public API.
 */
public interface IIncompleteRecordsHandler {

  /**
   * Produces the dataset to use in place of {@code dataset}.
   *
   * @param dataset records, one {@code String[]} of features per record
   * @return the records after the strategy has been applied
   */
  List<String[]> getCleanedDataset(List<String[]> dataset);
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why?