Skip to content

Commit

Permalink
AVRO-3704: name validator interface (apache#2053)
Browse files Browse the repository at this point in the history
* AVRO-3704: name validator interface
  • Loading branch information
clesaec authored and Ranbir Kumar committed May 13, 2024
1 parent c825234 commit 230fd9f
Show file tree
Hide file tree
Showing 8 changed files with 199 additions and 50 deletions.
138 changes: 104 additions & 34 deletions lang/java/avro/src/main/java/org/apache/avro/Schema.java
Original file line number Diff line number Diff line change
Expand Up @@ -990,8 +990,8 @@ public int hashCode() {
}
}

private static final ThreadLocal<Set> SEEN_EQUALS = ThreadLocalWithInitial.of(HashSet::new);
private static final ThreadLocal<Map> SEEN_HASHCODE = ThreadLocalWithInitial.of(IdentityHashMap::new);
private static final ThreadLocal<Set<SeenPair>> SEEN_EQUALS = ThreadLocalWithInitial.of(HashSet::new);
private static final ThreadLocal<Map<Schema, Schema>> SEEN_HASHCODE = ThreadLocalWithInitial.of(IdentityHashMap::new);

@SuppressWarnings(value = "unchecked")
private static class RecordSchema extends NamedSchema {
Expand Down Expand Up @@ -1087,7 +1087,7 @@ public boolean equals(Object o) {

@Override
int computeHash() {
Map seen = SEEN_HASHCODE.get();
Map<Schema, Schema> seen = SEEN_HASHCODE.get();
if (seen.containsKey(this))
return 0; // prevent stack overflow
boolean first = seen.isEmpty();
Expand All @@ -1109,8 +1109,8 @@ void toJson(Names names, JsonGenerator gen) throws IOException {
gen.writeStringField("type", isError ? "error" : "record");
writeName(names, gen);
names.space = name.space; // set default namespace
if (getDoc() != null)
gen.writeStringField("doc", getDoc());
if (this.getDoc() != null)
gen.writeStringField("doc", this.getDoc());

if (fields != null) {
gen.writeFieldName("fields");
Expand Down Expand Up @@ -1480,9 +1480,17 @@ public NullSchema() {
*/
public static class Parser {
private Names names = new Names();
private boolean validate = true;
private final Schema.NameValidator validate;
private boolean validateDefaults = true;

public Parser() {
this(NameValidator.UTF_VALIDATOR);
}

public Parser(final NameValidator validate) {
this.validate = validate;
}

/**
* Adds the provided types to the set of defined, named types known to this
* parser. deprecated: use addTypes(Iterable<Schema> types)
Expand Down Expand Up @@ -1510,17 +1518,6 @@ public Map<String, Schema> getTypes() {
return result;
}

/** Enable or disable name validation. */
public Parser setValidate(boolean validate) {
this.validate = validate;
return this;
}

/** True iff names are validated. True by default. */
public boolean getValidate() {
return this.validate;
}

/** Enable or disable default value validation. */
public Parser setValidateDefaults(boolean validateDefaults) {
this.validateDefaults = validateDefaults;
Expand Down Expand Up @@ -1587,7 +1584,7 @@ private static interface ParseFunction {
}

private Schema runParser(JsonParser parser, ParseFunction f) throws IOException {
boolean saved = validateNames.get();
NameValidator saved = validateNames.get();
boolean savedValidateDefaults = VALIDATE_DEFAULTS.get();
try {
validateNames.set(validate);
Expand Down Expand Up @@ -1683,7 +1680,8 @@ public static Schema parse(String jsonSchema) {
*/
@Deprecated
public static Schema parse(String jsonSchema, boolean validate) {
return new Parser().setValidate(validate).parse(jsonSchema);
final NameValidator validator = validate ? NameValidator.UTF_VALIDATOR : NameValidator.NO_VALIDATION;
return new Parser(validator).parse(jsonSchema);
}

static final Map<String, Type> PRIMITIVES = new HashMap<>();
Expand Down Expand Up @@ -1752,27 +1750,21 @@ public Schema put(Name name, Schema schema) {
}
}

private static ThreadLocal<Boolean> validateNames = ThreadLocalWithInitial.of(() -> true);
private static ThreadLocal<Schema.NameValidator> validateNames = ThreadLocalWithInitial
.of(() -> NameValidator.UTF_VALIDATOR);

private static String validateName(String name) {
if (!validateNames.get())
return name; // not validating names
if (name == null)
throw new SchemaParseException("Null name");
int length = name.length();
if (length == 0)
throw new SchemaParseException("Empty name");
char first = name.charAt(0);
if (!(Character.isLetter(first) || first == '_'))
throw new SchemaParseException("Illegal initial character: " + name);
for (int i = 1; i < length; i++) {
char c = name.charAt(i);
if (!(Character.isLetterOrDigit(c) || c == '_'))
throw new SchemaParseException("Illegal character in: " + name);
NameValidator.Result result = validateNames.get().validate(name);
if (!result.isOK()) {
throw new SchemaParseException(result.errors);
}
return name;
}

public static void setNameValidator(final Schema.NameValidator validator) {
Schema.validateNames.set(validator);
}

private static final ThreadLocal<Boolean> VALIDATE_DEFAULTS = ThreadLocalWithInitial.of(() -> true);

private static JsonNode validateDefault(String fieldName, Schema schema, JsonNode defaultValue) {
Expand Down Expand Up @@ -2299,6 +2291,84 @@ private static String getFieldAlias(Name record, String field, Map<Name, Map<Str
return alias;
}

public interface NameValidator {

class Result {
private final String errors;

public Result(final String errors) {
this.errors = errors;
}

public boolean isOK() {
return this == NameValidator.OK;
}

public String getErrors() {
return errors;
}
}

Result OK = new Result(null);

default Result validate(String name) {
return OK;
}

NameValidator NO_VALIDATION = new NameValidator() {
};

NameValidator UTF_VALIDATOR = new NameValidator() {
@Override
public Result validate(final String name) {
if (name == null)
return new Result("Null name");
int length = name.length();
if (length == 0)
return new Result("Empty name");
char first = name.charAt(0);
if (!(Character.isLetter(first) || first == '_'))
return new Result("Illegal initial character: " + name);
for (int i = 1; i < length; i++) {
char c = name.charAt(i);
if (!(Character.isLetterOrDigit(c) || c == '_'))
return new Result("Illegal character in: " + name);
}
return OK;
}
};

NameValidator STRICT_VALIDATOR = new NameValidator() {
@Override
public Result validate(final String name) {
if (name == null)
return new Result("Null name");
int length = name.length();
if (length == 0)
return new Result("Empty name");
char first = name.charAt(0);
if (!(isLetter(first) || first == '_'))
return new Result("Illegal initial character: " + name);
for (int i = 1; i < length; i++) {
char c = name.charAt(i);
if (!(isLetter(c) || isDigit(c) || c == '_'))
return new Result("Illegal character in: " + name);
}
return OK;
}

private boolean isLetter(char c) {
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}

private boolean isDigit(char c) {
return c >= '0' && c <= '9';
}

};

}

/**
* No change is permitted on LockableArrayList once lock() has been called on
* it.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ void initialize(InputStream in, byte[] magic) throws IOException {

// finalize the header
header.metaKeyList = Collections.unmodifiableList(header.metaKeyList);
header.schema = new Schema.Parser().setValidate(false).setValidateDefaults(false)
header.schema = new Schema.Parser(Schema.NameValidator.NO_VALIDATION).setValidateDefaults(false)
.parse(getMetaString(DataFileConstants.SCHEMA));
this.codec = resolveCodec();
reader.setSchema(header.schema);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.avro;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;

import java.util.stream.Stream;

class SchemaNameValidatorTest {

@ParameterizedTest
@MethodSource("data")
void validator(Schema.NameValidator validator, String input, boolean expectedResult) {
Schema.NameValidator.Result result = validator.validate(input);
Assertions.assertEquals(expectedResult, result.isOK(), result.getErrors());
}

static Stream<Arguments> data() {
return Stream.of(Arguments.of(Schema.NameValidator.UTF_VALIDATOR, null, false), // null not accepted
Arguments.of(Schema.NameValidator.STRICT_VALIDATOR, null, false), // null not accepted
Arguments.of(Schema.NameValidator.UTF_VALIDATOR, "", false), // empty not accepted
Arguments.of(Schema.NameValidator.STRICT_VALIDATOR, "", false), // empty not accepted
Arguments.of(Schema.NameValidator.UTF_VALIDATOR, "Hello world", false), // space not accepted
Arguments.of(Schema.NameValidator.STRICT_VALIDATOR, "Hello world", false), // space not accepted
Arguments.of(Schema.NameValidator.UTF_VALIDATOR, "H&", false), // non letter or digit not accepted
Arguments.of(Schema.NameValidator.STRICT_VALIDATOR, "H&", false), // non letter or digit not accepted
Arguments.of(Schema.NameValidator.UTF_VALIDATOR, "H=", false), // non letter or digit not accepted
Arguments.of(Schema.NameValidator.STRICT_VALIDATOR, "H=", false), // non letter or digit not accepted
Arguments.of(Schema.NameValidator.UTF_VALIDATOR, "H]", false), // non letter or digit not accepted
Arguments.of(Schema.NameValidator.STRICT_VALIDATOR, "H]", false), // non letter or digit not accepted
Arguments.of(Schema.NameValidator.UTF_VALIDATOR, "Hello_world", true),
Arguments.of(Schema.NameValidator.STRICT_VALIDATOR, "Hello_world", true),
Arguments.of(Schema.NameValidator.UTF_VALIDATOR, "éàçô", true), // Accept accent
Arguments.of(Schema.NameValidator.STRICT_VALIDATOR, "éàçô", false), // Not Accept accent
Arguments.of(Schema.NameValidator.UTF_VALIDATOR, "5éàçô", false), // can't start with number
Arguments.of(Schema.NameValidator.STRICT_VALIDATOR, "5éàçô", false), // can't start with number
Arguments.of(Schema.NameValidator.UTF_VALIDATOR, "_Hello_world", true),
Arguments.of(Schema.NameValidator.STRICT_VALIDATOR, "_Hello_world", true));
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ void throttledInputStream() throws IOException {
// magic header check. This happens with throttled input stream,
// where we read into buffer less bytes than requested.

Schema legacySchema = new Schema.Parser().setValidate(false).setValidateDefaults(false)
Schema legacySchema = new Schema.Parser(Schema.NameValidator.NO_VALIDATION).setValidateDefaults(false)
.parse("{\"type\": \"record\", \"name\": \"TestSchema\", \"fields\": "
+ "[ {\"name\": \"id\", \"type\": [\"long\", \"null\"], \"default\": null}]}");
File f = Files.createTempFile("testThrottledInputStream", ".avro").toFile();
Expand Down Expand Up @@ -146,7 +146,7 @@ void inputStreamEOF() throws IOException {
// AVRO-2944 describes hanging/failure in reading Avro file with performing
// magic header check. This potentially happens with a defective input stream
// where a -1 value is unexpectedly returned from a read.
Schema legacySchema = new Schema.Parser().setValidate(false).setValidateDefaults(false)
Schema legacySchema = new Schema.Parser(Schema.NameValidator.NO_VALIDATION).setValidateDefaults(false)
.parse("{\"type\": \"record\", \"name\": \"TestSchema\", \"fields\": "
+ "[ {\"name\": \"id\", \"type\": [\"long\", \"null\"], \"default\": null}]}");
File f = Files.createTempFile("testInputStreamEOF", ".avro").toFile();
Expand Down Expand Up @@ -195,7 +195,7 @@ void ignoreSchemaValidationOnRead() throws IOException {
// This schema has an accent in the name and the default for the field doesn't
// match the first type in the union. A Java SDK in the past could create a file
// containing this schema.
Schema legacySchema = new Schema.Parser().setValidate(false).setValidateDefaults(false)
Schema legacySchema = new Schema.Parser(Schema.NameValidator.NO_VALIDATION).setValidateDefaults(false)
.parse("{\"type\": \"record\", \"name\": \"InvalidAccëntWithInvalidNull\", \"fields\": "
+ "[ {\"name\": \"id\", \"type\": [\"long\", \"null\"], \"default\": null}]}");

Expand Down
11 changes: 4 additions & 7 deletions lang/java/avro/src/test/java/org/apache/avro/TestSchema.java
Original file line number Diff line number Diff line change
Expand Up @@ -500,17 +500,15 @@ public void parseAliases() throws JsonProcessingException {
}

@Test
void testContentAfterAvsc() throws Exception {
Schema.Parser parser = new Schema.Parser();
parser.setValidate(true);
void testContentAfterAvsc() {
Schema.Parser parser = new Schema.Parser(Schema.NameValidator.UTF_VALIDATOR);
parser.setValidateDefaults(true);
assertThrows(SchemaParseException.class, () -> parser.parse("{\"type\": \"string\"}; DROP TABLE STUDENTS"));
}

@Test
void testContentAfterAvscInInputStream() throws Exception {
Schema.Parser parser = new Schema.Parser();
parser.setValidate(true);
Schema.Parser parser = new Schema.Parser(Schema.NameValidator.UTF_VALIDATOR);
parser.setValidateDefaults(true);
String avsc = "{\"type\": \"string\"}; DROP TABLE STUDENTS";
ByteArrayInputStream is = new ByteArrayInputStream(avsc.getBytes(StandardCharsets.UTF_8));
Expand All @@ -526,8 +524,7 @@ void testContentAfterAvscInFile() throws Exception {
writer.flush();
}

Schema.Parser parser = new Schema.Parser();
parser.setValidate(true);
Schema.Parser parser = new Schema.Parser(Schema.NameValidator.UTF_VALIDATOR);
parser.setValidateDefaults(true);
assertThrows(SchemaParseException.class, () -> parser.parse(avscFile));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecordBuilder;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;

Expand Down Expand Up @@ -874,4 +876,25 @@ void namespaceDefaulting() {

assertEquals(a2, a1);
}

@Test
void namesAcceptAll() throws InterruptedException {
// Ensure that Schema.setNameValidator won't interfere with others unit tests.
Runnable r = () -> {
Schema.setNameValidator(Schema.NameValidator.NO_VALIDATION);
final Schema schema = SchemaBuilder.record("7name").fields().name("123").type(Schema.create(Schema.Type.INT))
.noDefault().endRecord();
Assertions.assertNotNull(schema);
Assertions.assertEquals("7name", schema.getName());
final Schema.Field field = schema.getField("123");
Assertions.assertEquals("123", field.name());
};

final Throwable[] exception = new Throwable[] { null };
Thread t = new Thread(r);
t.setUncaughtExceptionHandler((Thread th, Throwable e) -> exception[0] = e);
t.start();
t.join();
Assertions.assertNull(exception[0], () -> exception[0].getMessage());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -1232,7 +1232,7 @@ private static class Z {
@Test
void dollarTerminatedNamespaceCompatibility() {
ReflectData data = ReflectData.get();
Schema s = new Schema.Parser().setValidate(false).parse(
Schema s = new Schema.Parser(Schema.NameValidator.NO_VALIDATION).parse(
"{\"type\":\"record\",\"name\":\"Z\",\"namespace\":\"org.apache.avro.reflect.TestReflect$\",\"fields\":[]}");
assertEquals(data.getSchema(data.getClass(s)).toString(),
"{\"type\":\"record\",\"name\":\"Z\",\"namespace\":\"org.apache.avro.reflect.TestReflect\",\"fields\":[]}");
Expand All @@ -1242,7 +1242,7 @@ void dollarTerminatedNamespaceCompatibility() {
void dollarTerminatedNestedStaticClassNamespaceCompatibility() {
ReflectData data = ReflectData.get();
// Older versions of Avro generated this namespace on nested records.
Schema s = new Schema.Parser().setValidate(false).parse(
Schema s = new Schema.Parser(Schema.NameValidator.NO_VALIDATION).parse(
"{\"type\":\"record\",\"name\":\"AnotherSampleRecord\",\"namespace\":\"org.apache.avro.reflect.TestReflect$SampleRecord\",\"fields\":[]}");
assertThat(data.getSchema(data.getClass(s)).getFullName(),
is("org.apache.avro.reflect.TestReflect.SampleRecord.AnotherSampleRecord"));
Expand Down
Loading

0 comments on commit 230fd9f

Please sign in to comment.