Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -396,18 +396,14 @@ public class JsonNodeConvertingCodecProvider implements ConvertingCodecProvider
return new JsonNodeToDateRangeCodec(nullStrings);
case DefaultVectorType.VECTOR_CLASS_NAME:
VectorType vectorType = (VectorType) cqlType;
// Step 1: create a JSON codec which will take the input JSON nodes and generate
// something matching the expected data type
ConvertingCodec<JsonNode, ?> jsonCodec =
// Parser for JSON leaf nodes, each of which represents a value of the vector subtype
ConvertingCodec<JsonNode, ?> leafCodec =
createJsonNodeConvertingCodec(vectorType.getElementType(), codecFactory, false);
// Step 2: create a conventional codec which will take instances of the Java type
// generated by the JSON codec above and perform standard serde on them.
ConvertingCodec<?, ?> standardCodec =
codecFactory.createConvertingCodec(
vectorType.getElementType(), jsonCodec.getInternalJavaType(), false);
return new JsonNodeToVectorCodec(
new VectorCodec(vectorType, standardCodec),
jsonCodec,
new VectorCodec(
vectorType,
codecFactory.getCodecRegistry().codecFor(vectorType.getElementType())),
leafCodec,
context.getAttribute(OBJECT_MAPPER),
nullStrings);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,16 @@
public class JsonNodeToVectorCodec<SubtypeT extends Number>
extends JsonNodeConvertingCodec<CqlVector<SubtypeT>> {

private final ConvertingCodec<JsonNode, SubtypeT> subtypeCodec;
private final ConvertingCodec<JsonNode, SubtypeT> leafCodec;
private final ObjectMapper objectMapper;

public JsonNodeToVectorCodec(
VectorCodec<SubtypeT> targetCodec,
ConvertingCodec<JsonNode, SubtypeT> subtypeCodec,
ConvertingCodec<JsonNode, SubtypeT> leafCodec,
ObjectMapper objectMapper,
List<String> nullStrings) {
super(targetCodec, nullStrings);
this.subtypeCodec = subtypeCodec;
this.leafCodec = leafCodec;
this.objectMapper = objectMapper;
}

Expand All @@ -47,7 +47,7 @@ public CqlVector<SubtypeT> externalToInternal(JsonNode jsonNode) {
if (jsonNode == null || !jsonNode.isArray()) return null;
List<SubtypeT> elems =
Streams.stream(jsonNode.elements())
.map(e -> subtypeCodec.externalToInternal(e))
.map(e -> leafCodec.externalToInternal(e))
.collect(Collectors.toCollection(ArrayList::new));
return CqlVector.newInstance(elems);
}
Expand All @@ -57,7 +57,7 @@ public JsonNode internalToExternal(CqlVector<SubtypeT> value) {
if (value == null) return null;
ArrayNode root = objectMapper.createArrayNode();
for (SubtypeT element : value) {
root.add(subtypeCodec.internalToExternal(element));
root.add(leafCodec.internalToExternal(element));
}
return root;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -335,11 +335,15 @@ public class StringConvertingCodecProvider implements ConvertingCodecProvider {
return new StringToDateRangeCodec(nullStrings);
case DefaultVectorType.VECTOR_CLASS_NAME:
VectorType vectorType = (VectorType) cqlType;
return new StringToVectorCodec(
new VectorCodec(
VectorCodec<Number> vectorCodec =
new VectorCodec<>(
vectorType,
codecFactory.getCodecRegistry().codecFor(vectorType.getElementType())),
nullStrings);
codecFactory.getCodecRegistry().codecFor(vectorType.getElementType()));
ConvertingCodec<JsonNode, List<Number>> jsonCodec =
codecFactory.createConvertingCodec(
DataTypes.listOf(vectorType.getElementType()), JSON_NODE_TYPE, false);
return new StringToVectorCodec<>(
vectorCodec, jsonCodec, context.getAttribute(OBJECT_MAPPER), nullStrings);
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See StringToVectorCodec changes below. jsonCodec is here to convert raw string values into Lists; StringToVectorCodec builds CqlVectors out of them.

}
}
// fall through
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,22 +17,57 @@

import com.datastax.oss.driver.api.core.data.CqlVector;
import com.datastax.oss.driver.internal.core.type.codec.VectorCodec;
import com.datastax.oss.dsbulk.codecs.api.ConvertingCodec;
import com.datastax.oss.dsbulk.codecs.text.utils.StringUtils;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.IOException;
import java.util.List;
import java.util.stream.Collectors;

public class StringToVectorCodec<SubtypeT extends Number>
extends StringConvertingCodec<CqlVector<SubtypeT>> {

public StringToVectorCodec(VectorCodec<SubtypeT> targetCodec, List<String> nullStrings) {
private final ConvertingCodec<JsonNode, List<SubtypeT>> jsonCodec;
private final ObjectMapper objectMapper;

public StringToVectorCodec(
VectorCodec<SubtypeT> targetCodec,
ConvertingCodec<JsonNode, List<SubtypeT>> jsonCodec,
ObjectMapper objectMapper,
List<String> nullStrings) {
super(targetCodec, nullStrings);
this.jsonCodec = jsonCodec;
this.objectMapper = objectMapper;
}

@Override
public CqlVector<SubtypeT> externalToInternal(String s) {
return this.internalCodec.parse(s);
if (isNullOrEmpty(s)) {
return null;
}
try {
JsonNode node = objectMapper.readTree(StringUtils.ensureBrackets(s));
List<SubtypeT> vals = jsonCodec.externalToInternal(node);
return CqlVector.newInstance(vals);
Copy link
Collaborator Author

@absurdfarce absurdfarce Jul 10, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use the JSON codecs to parse input strings as JSON, build a list from the result, and then build a CqlVector from that list. This makes the behaviour of the vector codec consistent with the codecs for the collection types by enforcing a common policy around string representations of these types (i.e. they have to be JSON-friendly).

Idea (and implementation) provided by @adutra

} catch (IOException e) {
throw new IllegalArgumentException(String.format("Could not parse '%s' as Json", s), e);
}
}

@Override
public String internalToExternal(CqlVector<SubtypeT> cqlVector) {
return this.internalCodec.format(cqlVector);
if (cqlVector == null) {
return nullString();
}
try {
List<SubtypeT> vals = cqlVector.stream().collect(Collectors.toList());
JsonNode node = jsonCodec.internalToExternal(vals);
return objectMapper.writeValueAsString(node);
} catch (JsonProcessingException e) {
throw new IllegalArgumentException(
String.format("Could not format '%s' to Json", cqlVector), e);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ public class JsonNodeToVectorCodecTest {

private final ConvertingCodecFactory factory = new ConvertingCodecFactory();
private final JsonNodeConvertingCodecProvider provider = new JsonNodeConvertingCodecProvider();
private final JsonNodeToVectorCodec dsbulkCodec =
private final JsonNodeToVectorCodec<Float> dsbulkCodec =
new JsonNodeToVectorCodec(
vectorCodec,
provider
Expand Down Expand Up @@ -79,8 +79,21 @@ void should_convert_from_valid_internal() {
}

@Test
void should_not_convert_from_invalid_internal() {
assertThat(dsbulkCodec).cannotConvertFromInternal("not a valid vector");
void should_not_convert_from_invalid_external() {

/* Current impl only supports vectors of floats so this should fail. We'll have
* to revise this test once #512 is addressed.
*/
ArrayNode invalidTypeNode = JSON_NODE_FACTORY.arrayNode();
invalidTypeNode.add(JSON_NODE_FACTORY.textNode("not a valid vector"));
assertThat(dsbulkCodec).cannotConvertFromExternal(invalidTypeNode);

/* Issue 484: now that we're using the dsbulk string-to-subtype converters we should get
* enforcement of existing dsbulk policies. For our purposes that means the failure on
* arithmetic overflow */
ArrayNode tooPreciseNode = JSON_NODE_FACTORY.arrayNode();
tooPreciseNode.add(JSON_NODE_FACTORY.numberNode(6.646329843));
assertThat(dsbulkCodec).cannotConvertFromExternal(tooPreciseNode);
}

// To keep usage consistent with VectorCodec we confirm that we support encoding when too many
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,26 +22,45 @@
import com.datastax.oss.driver.api.core.data.CqlVector;
import com.datastax.oss.driver.api.core.type.DataTypes;
import com.datastax.oss.driver.api.core.type.codec.TypeCodecs;
import com.datastax.oss.driver.api.core.type.reflect.GenericType;
import com.datastax.oss.driver.internal.core.type.DefaultVectorType;
import com.datastax.oss.driver.internal.core.type.codec.VectorCodec;
import com.datastax.oss.driver.shaded.guava.common.collect.Lists;
import com.datastax.oss.dsbulk.codecs.api.ConversionContext;
import com.datastax.oss.dsbulk.codecs.api.ConvertingCodecFactory;
import com.datastax.oss.dsbulk.codecs.text.TextConversionContext;
import java.util.ArrayList;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

public class StringToVectorCodecTest {

private final ArrayList<Float> values = Lists.newArrayList(1.1f, 2.2f, 3.3f, 4.4f, 5.5f);
private final CqlVector vector = CqlVector.newInstance(values);
private final VectorCodec vectorCodec =
new VectorCodec(new DefaultVectorType(DataTypes.FLOAT, 5), TypeCodecs.FLOAT);
private final CqlVector<Float> vector = CqlVector.newInstance(values);
private final VectorCodec<Float> vectorCodec =
new VectorCodec<>(new DefaultVectorType(DataTypes.FLOAT, 5), TypeCodecs.FLOAT);

private final StringToVectorCodec dsbulkCodec =
new StringToVectorCodec(vectorCodec, Lists.newArrayList("NULL"));
private StringToVectorCodec<Float> codec;

@BeforeEach
void setUp() {
ConversionContext context = new TextConversionContext().setNullStrings("NULL");
ConvertingCodecFactory codecFactory = new ConvertingCodecFactory(context);
codec =
(StringToVectorCodec<Float>)
codecFactory.<String, CqlVector<Float>>createConvertingCodec(
DataTypes.vectorOf(DataTypes.FLOAT, 5), GenericType.STRING, true);
}
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a much cleaner way to get to a workable ConversionContext! 👍


@Test
void should_convert_from_valid_external() {
assertThat(dsbulkCodec)
.convertsFromExternal(vectorCodec.format(vector)) // standard pattern
assertThat(codec)
.convertsFromExternal(
vectorCodec.format(vector)) // CQL representation is parsable as a json array
.toInternal(vector)
.convertsFromExternal("[1.1,2.2,3.3,4.4,5.5]")
.toInternal(vector)
.convertsFromExternal("[1.1000,2.2000,3.3000,4.4000,5.5000]")
.toInternal(vector)
.convertsFromExternal("")
.toInternal(null)
Expand All @@ -53,39 +72,44 @@ void should_convert_from_valid_external() {

@Test
void should_convert_from_valid_internal() {
assertThat(dsbulkCodec)
assertThat(codec)
.convertsFromInternal(vector)
.toExternal(vectorCodec.format(vector))
.toExternal(
"[1.1,2.2,3.3,4.4,5.5]") // this is NOT 100% identical to vector CQL representation
.convertsFromInternal(null)
.toExternal("NULL");

// We should encode
}

@Test
void should_not_convert_from_invalid_internal() {
assertThat(dsbulkCodec).cannotConvertFromInternal("not a valid vector");
void should_not_convert_from_invalid_external() {

// Issue 484: now that we're using the dsbulk string-to-subtype converters we should get
// enforcement of existing dsbulk policies. For our purposes that means the failure on
// arithmetic overflow.
assertThat(codec).cannotConvertFromExternal("6.646329843");

// dsbulk should effectively treat this as "6.646329843" so we should also fail in this case
assertThat(codec).cannotConvertFromExternal("[6.646329843]");
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This effectively winds up duplicating should_not_convert_too_much_precision() in a way that isn't very clear. The original intent of this method was to do something similar to JsonNodeToVectorCodecTest.should_not_convert_from_invalid_internal(): specifically, to confirm that, given something that isn't a CqlVector, the conversion fails completely. We could certainly add a few more cases, but I'd argue it's worthwhile to preserve the symmetry.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This has now been resolved by more recent changes.

}

// To keep usage consistent with VectorCodec we confirm that we support encoding when too many
// elements are
// available but not when too few are. Note that it's actually VectorCodec that enforces this
// constraint so we
// have to go through encode() rather than the internal/external methods.
// elements are available but not when too few are. Note that it's actually VectorCodec that
// enforces this constraint so we have to go through encode() rather than the internal/external
// methods.
@Test
void should_encode_too_many_but_not_too_few() {

ArrayList<Float> tooMany = Lists.newArrayList(values);
tooMany.add(6.6f);
CqlVector<Float> tooManyVector = CqlVector.newInstance(tooMany);
String tooManyString = dsbulkCodec.internalToExternal(tooManyVector);
String tooManyString = codec.internalToExternal(tooManyVector);
ArrayList<Float> tooFew = Lists.newArrayList(values);
tooFew.remove(0);
CqlVector<Float> tooFewVector = CqlVector.newInstance(tooFew);
String tooFewString = dsbulkCodec.internalToExternal(tooFewVector);
String tooFewString = codec.internalToExternal(tooFewVector);

assertThat(dsbulkCodec.encode(tooManyString, ProtocolVersion.DEFAULT)).isNotNull();
assertThatThrownBy(() -> dsbulkCodec.encode(tooFewString, ProtocolVersion.DEFAULT))
assertThat(codec.encode(tooManyString, ProtocolVersion.DEFAULT)).isNotNull();
assertThatThrownBy(() -> codec.encode(tooFewString, ProtocolVersion.DEFAULT))
.isInstanceOf(IllegalArgumentException.class);
}
}