Skip to content

Commit

Permalink
Fix Metacat handling of decimal column declarations with spaces (#535)
Browse files Browse the repository at this point in the history
  • Loading branch information
swaranga-netflix authored Feb 22, 2023
1 parent 1142f21 commit 34c9831
Show file tree
Hide file tree
Showing 2 changed files with 197 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
Expand All @@ -62,6 +64,20 @@
@Slf4j
public class HiveTypeConverter implements ConnectorTypeConverter {

// matches decimal declarations with only scale, ex: decimal(38)
// matches declarations with spaces around '(', the scale and ')'
private static final String DECIMAL_WITH_SCALE
= "decimal\\s*\\(\\s*[0-9]+\\s*\\)";

// matches decimal declarations with scale and precision, ex: decimal(38,9)
// matches declarations with spaces around '(', the scale, the precision, the comma and ')'
private static final String DECIMAL_WITH_SCALE_AND_PRECISION
= "decimal\\s*\\(\\s*[0-9]+\\s*,\\s*[0-9]*\\s*\\)";

// combined compiled pattern to match both
private static final Pattern DECIMAL_TYPE
= Pattern.compile(DECIMAL_WITH_SCALE + "|" + DECIMAL_WITH_SCALE_AND_PRECISION, Pattern.CASE_INSENSITIVE);

private static Type getPrimitiveType(final ObjectInspector fieldInspector) {
final PrimitiveCategory primitiveCategory = ((PrimitiveObjectInspector) fieldInspector)
.getPrimitiveCategory();
Expand Down Expand Up @@ -89,8 +105,7 @@ private static Type getPrimitiveType(final ObjectInspector fieldInspector) {
@Override
public Type toMetacatType(final String type) {
// Hack to fix presto "varchar" type coming in with no length which is required by Hive.
final TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(
"varchar".equals(type.toLowerCase()) ? serdeConstants.STRING_TYPE_NAME : type);
final TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(sanitizeType(type));
ObjectInspector oi = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(typeInfo);
// The standard struct object inspector forces field names to lower case, however in Metacat we need to preserve
// the original case of the struct fields so we wrap it with our wrapper to force the fieldNames to keep
Expand Down Expand Up @@ -220,6 +235,73 @@ private String rowFieldToString(final RowType.RowField rowField) {
return prefix + fromMetacatType(rowField.getType());
}

/**
* Sanitize the type to handle Hive type conversion edge cases.
*
* @param type the type to sanitize
* @return the sanitized type
*/
public static String sanitizeType(final String type) {
if ("varchar".equalsIgnoreCase(type)) {
return serdeConstants.STRING_TYPE_NAME;
} else {
// the current version of Hive (1.2.1) cannot handle spaces in column definitions
// this was fixed in 1.3.0. See: https://issues.apache.org/jira/browse/HIVE-11476
// this bug caused an error in loading the table information in Metacat
// see: https://netflix.slack.com/archives/G0SUNC804/p1676930065306799
// Here the offending column definition was decimal(38, 9)
// which had a space between the command and the digit 9

// instead of upgrading the Hive version, we are making a targeted "fix"
// to handle this space in a decimal column declaration

// the regex we use tries to match various decimal declarations
// and handles decimal types inside other type declarations like array and struct
// see the unit test for those method for all the cases handled
final Matcher matcher = DECIMAL_TYPE.matcher(type);
final StringBuilder replacedType = new StringBuilder();

// keep track of the start of the substring that we haven't matched yet
// more explanation on how this is used is below
int prevStart = 0;

// we cannot simply use matcher.matches() and matcher.replaceAll()
// because that will replace the decimal declaration itself
// instead we use the region APIs to find the substring that matched
// and then apply the replace function to remove spaces in the decimal declaration
// we do this for all the matches in the type declaration and hence the usage of the while loop
while (matcher.find()) {
// this index represents the start index (inclusive) of our current match
final int currMatchStart = matcher.regionStart();

// this represents the end index (exclusive) of our current match
final int currMatchEnd = matcher.regionEnd();

replacedType
// first append any part of the string that did not match
// this is represented by the prevStart (inclusive) till the start of the current match (exclusive)
// this append should not need any replacement and can be added verbatim
.append(type, prevStart, currMatchStart)

// Then append the matching part which should be a decimal declaration
// The matching part is start (inclusive) and end (exclusive)
// This part should go through a replacement to remove spaces
.append(type.substring(currMatchStart, currMatchEnd).replaceAll("\\s", ""));

// update the prevStart marker so that for the next match
// we know where to start to add the non-matching part
prevStart = currMatchEnd;
}

// append any remaining part of the input type to the final answer
// again, no replacement necessary for this part since it should not contain and decimal declarations
// phew!
replacedType.append(type.substring(prevStart));

return replacedType.toString();
}
}

/**
* Returns the canonical type.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,122 @@ class HiveTypeConverterSpec extends Specification {
'struct<field1:bigint,field2:string,field3:string>',
'struct<field1:string,field2:bigint,field3:bigint>',
'struct<field1:string,field2:string,field3:bigint>',
'struct<field1:string,field2:string,field3:string>'
'struct<field1:string,field2:string,field3:string>',

"decimal",
"decimal( 38)",
"decimal( 38, 9)",
"decimal(38, 9)",
"decimal(38, 9)",
"decimal(38,9)",
"decimal(38,9 )",
"decimal (38,9 )",
"decimal (38)",
"decimal(38 )",
"array<decimal>",
"array<decimal(38, 9)>",
"array<decimal(38, 9)>",
"array<decimal(38,9)>",
"array<decimal(38,9 )>",
"array<decimal (38,9 )>",
"array<decimal (38)>",
"array<decimal(38 )>",
"struct<field1:string,field2:decimal,field3:bigint>",
"struct<field1:string,field2:decimal(38, 9),field3:bigint>",
"struct<field1:string,field2:decimal( 38, 9),field3:bigint>",
"struct<field1:string,field2:decimal(38, 9),field3:bigint>",
"struct<field1:string,field2:decimal(38,9),field3:bigint>",
"struct<field1:string,field2:decimal(38,9 ),field3:bigint>",
"struct<field1:string,field2:decimal (38,9 ),field3:bigint>",
"struct<field1:string,field2:decimal (38),field3:bigint>",
"struct<field1:string,field2:decimal(38 ),field3:bigint>",
]
}

@Unroll
def "handles sanitization of decimal declarations with spaces"() {
when:
def sanitizedType = HiveTypeConverter.sanitizeType(input)

then:
sanitizedType == expected

where:
input || expected
"tinyint" || "tinyint"
"smallint" || "smallint"
"int" || "int"
"bigint" || "bigint"
"float" || "float"
"double" || "double"
"decimal" || "decimal"
"decimal(4,2)" || "decimal(4,2)"
"array<decimal(4,2)>" || "array<decimal(4,2)>"
"timestamp" || "timestamp"
"date" || "date"
"string" || "string"
"varchar(10)" || "varchar(10)"
"char(10)" || "char(10)"
"boolean" || "boolean"
"binary" || "binary"
"array<bigint>" || "array<bigint>"
"array<boolean>" || "array<boolean>"
"array<double>" || "array<double>"
"array<bigint>" || "array<bigint>"
"array<string>" || "array<string>"
"array<map<bigint,bigint>>" || "array<map<bigint,bigint>>"
"array<map<bigint,string>>" || "array<map<bigint,string>>"
"array<map<string,bigint>>" || "array<map<string,bigint>>"
"array<map<string,string>>" || "array<map<string,string>>"
"array<struct<field1:bigint,field2:bigint>>" || "array<struct<field1:bigint,field2:bigint>>"
"array<struct<field1:bigint,field2:string>>" || "array<struct<field1:bigint,field2:string>>"
"array<struct<field1:string,field2:bigint>>" || "array<struct<field1:string,field2:bigint>>"
"array<struct<field1:string,field2:string>>" || "array<struct<field1:string,field2:string>>"
"map<boolean,boolean>" || "map<boolean,boolean>"
"map<boolean,string>" || "map<boolean,string>"
"map<bigint,bigint>" || "map<bigint,bigint>"
"map<string,double>" || "map<string,double>"
"map<string,bigint>" || "map<string,bigint>"
"map<string,string>" || "map<string,string>"
"map<string,struct<field1:array<bigint>>>" || "map<string,struct<field1:array<bigint>>>"
"struct<field1:bigint,field2:bigint,field3:bigint>" || "struct<field1:bigint,field2:bigint,field3:bigint>"
"struct<field1:bigint,field2:string,field3:double>" || "struct<field1:bigint,field2:string,field3:double>"
"struct<field1:bigint,field2:string,field3:string>" || "struct<field1:bigint,field2:string,field3:string>"
"struct<field1:string,field2:bigint,field3:bigint>" || "struct<field1:string,field2:bigint,field3:bigint>"
"struct<field1:string,field2:string,field3:bigint>" || "struct<field1:string,field2:string,field3:bigint>"
"struct<field1:string,field2:string,field3:string>" || "struct<field1:string,field2:string,field3:string>"

// problem types starts here; we expect the spaces in the decimal declarations to be removed
"decimal" || "decimal"
"decimal( 38)" || "decimal(38)"
"decimal( 38, 9)" || "decimal(38,9)"
"decimal(38, 9)" || "decimal(38,9)"
"decimal(38, 9)" || "decimal(38,9)"
"decimal(38,9)" || "decimal(38,9)"
"decimal(38,9 )" || "decimal(38,9)"
"decimal (38,9 )" || "decimal(38,9)"
"decimal (38)" || "decimal(38)"
"decimal(38 )" || "decimal(38)"
"array<decimal>" || "array<decimal>"
"array<decimal(38, 9)>" || "array<decimal(38,9)>"
"array<decimal(38, 9)>" || "array<decimal(38,9)>"
"array<decimal(38,9)>" || "array<decimal(38,9)>"
"array<decimal(38,9 )>" || "array<decimal(38,9)>"
"array<decimal (38,9 )>" || "array<decimal(38,9)>"
"array<decimal (38)>" || "array<decimal(38)>"
"array<decimal(38 )>" || "array<decimal(38)>"
"struct<field1:string,field2:decimal,field3:bigint>" || "struct<field1:string,field2:decimal,field3:bigint>"
"struct<field1:string,field2:decimal(38, 9),field3:bigint>" || "struct<field1:string,field2:decimal(38,9),field3:bigint>"
"struct<field1:string,field2:decimal( 38, 9),field3:bigint>" || "struct<field1:string,field2:decimal(38,9),field3:bigint>"
"struct<field1:string,field2:decimal( 38 , 9),field3:bigint>" || "struct<field1:string,field2:decimal(38,9),field3:bigint>"
"struct<field1:string,field2:decimal(38, 9),field3:bigint>" || "struct<field1:string,field2:decimal(38,9),field3:bigint>"
"struct<field1:string,field2:decimal(38,9),field3:bigint>" || "struct<field1:string,field2:decimal(38,9),field3:bigint>"
"struct<field1:string,field2:decimal(38,9 ),field3:bigint>" || "struct<field1:string,field2:decimal(38,9),field3:bigint>"
"struct<field1:string,field2:decimal (38,9 ),field3:bigint>" || "struct<field1:string,field2:decimal(38,9),field3:bigint>"
"struct<field1:string,field2:decimal (38),field3:bigint>" || "struct<field1:string,field2:decimal(38),field3:bigint>"
"struct<field1:string,field2:decimal(38 ),field3:bigint>" || "struct<field1:string,field2:decimal(38),field3:bigint>"
}

@Unroll
def 'fail to convert "#typeString" to a presto type and back'(String typeString) {
when:
Expand Down

0 comments on commit 34c9831

Please sign in to comment.