Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change Hive CSV escape character to backslash when set to double quote #18952

Merged
merged 1 commit into from
Sep 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,15 @@
import java.util.List;
import java.util.Map;

import static com.google.common.base.Preconditions.checkArgument;
import static io.trino.hive.formats.line.csv.CsvConstants.DEFAULT_QUOTE;
import static io.trino.hive.formats.line.csv.CsvConstants.DEFAULT_SEPARATOR;
import static io.trino.hive.formats.line.csv.CsvConstants.DESERIALIZER_DEFAULT_ESCAPE;
import static io.trino.hive.formats.line.csv.CsvConstants.ESCAPE_KEY;
import static io.trino.hive.formats.line.csv.CsvConstants.HIVE_SERDE_CLASS_NAMES;
import static io.trino.hive.formats.line.csv.CsvConstants.QUOTE_KEY;
import static io.trino.hive.formats.line.csv.CsvConstants.SEPARATOR_KEY;
import static io.trino.hive.formats.line.csv.CsvConstants.SERIALIZER_DEFAULT_ESCAPE;
import static io.trino.hive.formats.line.csv.CsvConstants.getCharProperty;

public class CsvDeserializerFactory
Expand All @@ -44,6 +46,15 @@ public LineDeserializer create(List<Column> columns, Map<String, String> serdePr
char separatorChar = getCharProperty(serdeProperties, SEPARATOR_KEY, DEFAULT_SEPARATOR);
char quoteChar = getCharProperty(serdeProperties, QUOTE_KEY, DEFAULT_QUOTE);
char escapeChar = getCharProperty(serdeProperties, ESCAPE_KEY, DESERIALIZER_DEFAULT_ESCAPE);
// Hive has a bug where when the escape character is explicitly set to double quote (char 34),
// it changes the escape character to backslash (char 92) when deserializing.
if (escapeChar == SERIALIZER_DEFAULT_ESCAPE) {
// Add an explicit checks for separator or quote being backslash, so a more helpful error message can be provided
// as this Hive behavior is not obvious
checkArgument(separatorChar != DESERIALIZER_DEFAULT_ESCAPE, "Separator character cannot be '\\' when escape character is '\"'");
checkArgument(quoteChar != DESERIALIZER_DEFAULT_ESCAPE, "Quote character cannot be '\\' when escape character is '\"'");
escapeChar = DESERIALIZER_DEFAULT_ESCAPE;
}
return new CsvDeserializer(columns, separatorChar, quoteChar, escapeChar);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMN_TYPES;
import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;

public class TestCsvFormat
{
Expand Down Expand Up @@ -129,22 +130,32 @@ public void testCsv()
// For serialization the pipe character is escaped with a quote char, but for deserialization escape character is the backslash character
assertTrinoHiveByteForByte(false, Arrays.asList("|", "a", "b"), Optional.empty(), Optional.of('|'), Optional.empty());
assertTrinoHiveByteForByte(false, Arrays.asList("|", "a", "|"), Optional.empty(), Optional.of('|'), Optional.empty());

// Hive has strange special handling of the escape character. If the escape character is double quote (char 34)
// Hive will change the escape character to backslash (char 92).
assertLine("*a*|*b\\|b*|*c*", Arrays.asList("a", "b|b", "c"), Optional.of('|'), Optional.of('*'), Optional.of('"'));

// Hive does not allow the separator, quote, or escape characters to be the same, but this is checked after the escape character is changed
assertInvalidConfig(Optional.of('\\'), Optional.of('*'), Optional.of('"'));
assertInvalidConfig(Optional.of('*'), Optional.of('\\'), Optional.of('"'));

// Since the escape character is swapped, the quote or separator character can be the same as the original escape character
assertLine("\"a\"|\"b\\\"b\"|\"c\"", Arrays.asList("a", "b\"b", "c"), Optional.of('|'), Optional.of('"'), Optional.of('"'));
assertLine("*a*\"*b\\\"b*\"*c*", Arrays.asList("a", "b\"b", "c"), Optional.of('"'), Optional.of('*'), Optional.of('"'));
}

private static void assertLine(boolean shouldRoundTrip, String csvLine, List<String> expectedValues)
throws Exception
{
assertHiveLine(csvLine, expectedValues, Optional.empty(), Optional.empty(), Optional.empty());
assertTrinoLine(csvLine, expectedValues, Optional.empty(), Optional.empty(), Optional.empty());
assertLine(csvLine, expectedValues, Optional.empty(), Optional.empty(), Optional.empty());
assertTrinoHiveByteForByte(shouldRoundTrip, expectedValues, Optional.empty(), Optional.empty(), Optional.empty());

csvLine = rewriteSpecialChars(csvLine, '_', '|', '~');
expectedValues = expectedValues.stream()
.map(value -> value == null ? null : rewriteSpecialChars(value, '_', '|', '~'))
.collect(Collectors.toList());

assertHiveLine(csvLine, expectedValues, Optional.of('_'), Optional.of('|'), Optional.of('~'));
assertTrinoLine(csvLine, expectedValues, Optional.of('_'), Optional.of('|'), Optional.of('~'));
assertLine(csvLine, expectedValues, Optional.of('_'), Optional.of('|'), Optional.of('~'));
// after switching the special characters the values will round trip
assertTrinoHiveByteForByte(true, expectedValues, Optional.of('_'), Optional.of('|'), Optional.of('~'));
}
Expand Down Expand Up @@ -222,6 +233,13 @@ private static List<Column> createReadColumns(int columnCount)
.toList();
}

private static void assertLine(String csvLine, List<String> expectedValues, Optional<Character> separatorChar, Optional<Character> quoteChar, Optional<Character> escapeChar)
throws SerDeException, IOException
{
assertHiveLine(csvLine, expectedValues, separatorChar, quoteChar, escapeChar);
assertTrinoLine(csvLine, expectedValues, separatorChar, quoteChar, escapeChar);
}

private static void assertHiveLine(String csvLine, List<String> expectedValues, Optional<Character> separatorChar, Optional<Character> quoteChar, Optional<Character> escapeChar)
throws SerDeException
{
Expand All @@ -246,6 +264,16 @@ private static String writeHiveLine(List<String> expectedValues, Optional<Charac
return serializer.serialize(expectedValues, javaObjectInspector).toString();
}

private static void assertInvalidConfig(Optional<Character> separatorChar, Optional<Character> quoteChar, Optional<Character> escapeChar)
{
assertThatThrownBy(() -> createHiveSerDe(3, separatorChar, quoteChar, escapeChar).deserialize(new Text("")))
.isInstanceOf(SerDeException.class)
.hasMessage("java.lang.UnsupportedOperationException: The separator, quote, and escape characters must be different!");
assertThatThrownBy(() -> new CsvDeserializerFactory().create(createReadColumns(3), createCsvProperties(separatorChar, quoteChar, escapeChar)))
.isInstanceOf(IllegalArgumentException.class)
.hasMessageMatching("(Quote|Separator) character cannot be '\\\\' when escape character is '\"'");
}

private static OpenCSVSerde createHiveSerDe(int columnCount, Optional<Character> separatorChar, Optional<Character> quoteChar, Optional<Character> escapeChar)
throws SerDeException
{
Expand Down