Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: For NPE when sorting dictionary-encoded null string columns #5758

Merged
merged 5 commits into from
Jul 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,11 @@ private static SortMapping doSymbolTableMapping(SortingOrder order, ColumnSource
final ColumnSource<Long> reinterpreted = columnSource.reinterpret(long.class);
final Table symbolTable = ((SymbolTableSource<?>) columnSource).getStaticSymbolTable(rowSet, true);

if (symbolTable.isEmpty()) {
// All nulls, so we can just return the row set as the sort mapping
return new IndexedSortMapping(rowSet.size(), new long[] {rowSet.size()}, new RowSet[] {rowSet});
}

if (symbolTable.size() >= sortSize) {
// the very first thing we will do is sort the symbol table, using a regular sort; if it is larger than the
// actual table we care to sort, then it is wasteful to use the symbol table sorting
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,10 @@ public boolean gatherDictionaryValuesRowSet(
@NotNull final RowSequence.Iterator knownKeys,
@NotNull final RowSetBuilderSequential sequentialBuilder) {
final long dictSize = getDictionaryChunk().size();

if (dictSize == 0) {
advanceToNextPage(knownKeys);
return advanceToNextPage(keysToVisit);
}
final long pageFirstKey = firstRow(keysToVisit.currentValue());
final long pageLastKey = pageFirstKey + dictSize - 1;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@
import java.math.BigDecimal;
import java.math.BigInteger;
import java.net.URI;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.time.Instant;
import java.time.LocalDate;
import java.time.LocalTime;
Expand Down Expand Up @@ -1565,6 +1567,127 @@ public void stringDictionaryTest() {
}
}

/**
* Test if we can read/sort a parquet file with a null string column encoded as a dictionary.
*/
@Test
public void nullStringDictEncodingTest() {
final int NUM_ROWS = 2048;
{
// The following file has a null string column encoded as a dictionary. We should be able to read/sort
// it without any exceptions.
final String path =
ParquetTableReadWriteTest.class.getResource("/ReferenceNullStringDictEncoded1.parquet").getFile();
final Table expected = TableTools.emptyTable(NUM_ROWS).update("someLong = i",
"someString = (String)null");
nullStringDictEncodingTestHelper(path, expected);
}

{
// The following file has a string column with all values null except for the last one encoded as a
// dictionary. We should be able to read/sort it without any exceptions.
final String path =
ParquetTableReadWriteTest.class.getResource("/ReferenceNullStringDictEncoded2.parquet").getFile();
final Table expected = TableTools.emptyTable(NUM_ROWS).update("someLong = i",
"someString = i == 2047 ? `Hello` : (String)null");
nullStringDictEncodingTestHelper(path, expected);
}
}

private static void nullStringDictEncodingTestHelper(final String path, final Table expected) {
// Verify that the column uses dictionary encoding.
final ParquetMetadata metadata =
new ParquetTableLocationKey(new File(path).toURI(), 0, null, ParquetInstructions.EMPTY)
.getMetadata();
final String strColumnMetadata = metadata.getBlocks().get(0).getColumns().get(1).toString();
assertTrue(strColumnMetadata.contains("someString") && strColumnMetadata.contains("RLE_DICTIONARY"));

// Sort and compare
assertTableEquals(expected, readTable(path).sort("someString"));
assertTableEquals(expected.sortDescending("someString"),
readTable(path).sortDescending("someString"));
}

/**
* Extension of {@link #nullStringDictEncodingTest()} test, for testing if we can read/sort a table with regions
* having null string columns encoded as a dictionary.
*/
@Test
public void nullRegionDictionaryEncodingTest() throws IOException {
// The following file has a null string column encoded as a dictionary.
final File allNullsFile = new File(
ParquetTableReadWriteTest.class.getResource("/ReferenceNullStringDictEncoded1.parquet").getFile());
final Table allNullsTable = readTable(allNullsFile.toString()).select();
final File allButOneNullFile = new File(
ParquetTableReadWriteTest.class.getResource("/ReferenceNullStringDictEncoded2.parquet").getFile());
final Table allButOneNullTable = readTable(allButOneNullFile.toString()).select();
final File testDir = new File(rootFile, "nullRegionDictionaryEncodingTest");
final File firstRegion = new File(testDir, "Region1.parquet");
final File secondRegion = new File(testDir, "Region2.parquet");
final File thirdRegion = new File(testDir, "Region3.parquet");
final File fourthRegion = new File(testDir, "Region4.parquet");

// The first test is for a table where all three regions have null values for the string column.
nullRegionDictionaryEncodingTestHelper(
allNullsFile, allNullsFile, allNullsFile,
allNullsTable, allNullsTable, allNullsTable,
firstRegion, secondRegion, thirdRegion);

// The next test is for a table where the first-region has non-null values and the second and third have null
// values for the string column.
nullRegionDictionaryEncodingTestHelper(
allButOneNullFile, allNullsFile, allNullsFile,
allButOneNullTable, allNullsTable, allNullsTable,
firstRegion, secondRegion, thirdRegion);

// The next test is for a table where the second-region has non-null values, and the first and third have null
// values for the string column.
nullRegionDictionaryEncodingTestHelper(
allNullsFile, allButOneNullFile, allNullsFile,
allNullsTable, allButOneNullTable, allNullsTable,
firstRegion, secondRegion, thirdRegion);

// The next test is for a table where the third-region has non-null values, and the first two have null values
// for the string column.
nullRegionDictionaryEncodingTestHelper(
allNullsFile, allNullsFile, allButOneNullFile,
allNullsTable, allNullsTable, allButOneNullTable,
firstRegion, secondRegion, thirdRegion);

{
// The next test is for a table where the first and region have null values and the second and fourth two
// have null values for the string column.
testDir.mkdir();
Files.copy(allNullsFile.toPath(), firstRegion.toPath(), StandardCopyOption.REPLACE_EXISTING);
Files.copy(allButOneNullFile.toPath(), secondRegion.toPath(), StandardCopyOption.REPLACE_EXISTING);
Files.copy(allNullsFile.toPath(), thirdRegion.toPath(), StandardCopyOption.REPLACE_EXISTING);
Files.copy(allButOneNullFile.toPath(), fourthRegion.toPath(), StandardCopyOption.REPLACE_EXISTING);
final Table expected = merge(allNullsTable, allButOneNullTable, allNullsTable, allButOneNullTable);
assertTableEquals(expected.sort("someString"),
readTable(testDir.toString()).sort("someString"));
assertTableEquals(expected.sortDescending("someString"),
readTable(testDir.toString()).sortDescending("someString"));
FileUtils.deleteRecursively(testDir);
}
}

private static void nullRegionDictionaryEncodingTestHelper(
final File firstSource, final File secondSource, final File thirdSource,
final Table firstTable, final Table secondTable, final Table thirdTable,
final File firstRegion, final File secondRegion, final File thirdRegion) throws IOException {
final File testDir = new File(rootFile, "nullRegionDictionaryEncodingTest");
testDir.mkdir();
Files.copy(firstSource.toPath(), firstRegion.toPath(), StandardCopyOption.REPLACE_EXISTING);
Files.copy(secondSource.toPath(), secondRegion.toPath(), StandardCopyOption.REPLACE_EXISTING);
Files.copy(thirdSource.toPath(), thirdRegion.toPath(), StandardCopyOption.REPLACE_EXISTING);
final Table expected = merge(firstTable, secondTable, thirdTable);
assertTableEquals(expected.sort("someString"),
readTable(testDir.toString()).sort("someString"));
assertTableEquals(expected.sortDescending("someString"),
readTable(testDir.toString()).sortDescending("someString"));
FileUtils.deleteRecursively(testDir);
}

/**
* Encoding bigDecimal is tricky -- the writer will try to pick the precision and scale automatically. Because of
* that tableTools.assertTableEquals will fail because, even though the numbers are identical, the representation
Expand Down
Git LFS file not shown
Git LFS file not shown
Loading