-
Notifications
You must be signed in to change notification settings - Fork 32
Leverage dsbulk string codecs to convert from strings to Java types (and back again) #496
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: 1.x
Are you sure you want to change the base?
Changes from all commits
ddc0f45
c35bcfd
e0c84b4
d436693
7e1d16d
54a896d
a0dd808
5262bda
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,22 +17,57 @@ | |
|
|
||
| import com.datastax.oss.driver.api.core.data.CqlVector; | ||
| import com.datastax.oss.driver.internal.core.type.codec.VectorCodec; | ||
| import com.datastax.oss.dsbulk.codecs.api.ConvertingCodec; | ||
| import com.datastax.oss.dsbulk.codecs.text.utils.StringUtils; | ||
| import com.fasterxml.jackson.core.JsonProcessingException; | ||
| import com.fasterxml.jackson.databind.JsonNode; | ||
| import com.fasterxml.jackson.databind.ObjectMapper; | ||
| import java.io.IOException; | ||
| import java.util.List; | ||
| import java.util.stream.Collectors; | ||
|
|
||
| public class StringToVectorCodec<SubtypeT extends Number> | ||
| extends StringConvertingCodec<CqlVector<SubtypeT>> { | ||
|
|
||
| public StringToVectorCodec(VectorCodec<SubtypeT> targetCodec, List<String> nullStrings) { | ||
| private final ConvertingCodec<JsonNode, List<SubtypeT>> jsonCodec; | ||
| private final ObjectMapper objectMapper; | ||
|
|
||
| public StringToVectorCodec( | ||
| VectorCodec<SubtypeT> targetCodec, | ||
| ConvertingCodec<JsonNode, List<SubtypeT>> jsonCodec, | ||
| ObjectMapper objectMapper, | ||
| List<String> nullStrings) { | ||
| super(targetCodec, nullStrings); | ||
| this.jsonCodec = jsonCodec; | ||
| this.objectMapper = objectMapper; | ||
| } | ||
|
|
||
| @Override | ||
| public CqlVector<SubtypeT> externalToInternal(String s) { | ||
| return this.internalCodec.parse(s); | ||
| if (isNullOrEmpty(s)) { | ||
| return null; | ||
| } | ||
| try { | ||
| JsonNode node = objectMapper.readTree(StringUtils.ensureBrackets(s)); | ||
| List<SubtypeT> vals = jsonCodec.externalToInternal(node); | ||
| return CqlVector.newInstance(vals); | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use JSON codecs to eval input strings as JSON, build a list from that and then build a CqlVector from that list. This makes behaviour of the vector codec consistent with codecs for the collection types by enforcing a common policy around string representations of these types (i.e. they have to be JSON-friendly). Idea (and implementation) provided by @adutra |
||
| } catch (IOException e) { | ||
| throw new IllegalArgumentException(String.format("Could not parse '%s' as Json", s), e); | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| public String internalToExternal(CqlVector<SubtypeT> cqlVector) { | ||
| return this.internalCodec.format(cqlVector); | ||
| if (cqlVector == null) { | ||
| return nullString(); | ||
| } | ||
| try { | ||
| List<SubtypeT> vals = cqlVector.stream().collect(Collectors.toList()); | ||
| JsonNode node = jsonCodec.internalToExternal(vals); | ||
| return objectMapper.writeValueAsString(node); | ||
| } catch (JsonProcessingException e) { | ||
| throw new IllegalArgumentException( | ||
| String.format("Could not format '%s' to Json", cqlVector), e); | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,26 +22,45 @@ | |
| import com.datastax.oss.driver.api.core.data.CqlVector; | ||
| import com.datastax.oss.driver.api.core.type.DataTypes; | ||
| import com.datastax.oss.driver.api.core.type.codec.TypeCodecs; | ||
| import com.datastax.oss.driver.api.core.type.reflect.GenericType; | ||
| import com.datastax.oss.driver.internal.core.type.DefaultVectorType; | ||
| import com.datastax.oss.driver.internal.core.type.codec.VectorCodec; | ||
| import com.datastax.oss.driver.shaded.guava.common.collect.Lists; | ||
| import com.datastax.oss.dsbulk.codecs.api.ConversionContext; | ||
| import com.datastax.oss.dsbulk.codecs.api.ConvertingCodecFactory; | ||
| import com.datastax.oss.dsbulk.codecs.text.TextConversionContext; | ||
| import java.util.ArrayList; | ||
| import org.junit.jupiter.api.BeforeEach; | ||
| import org.junit.jupiter.api.Test; | ||
|
|
||
| public class StringToVectorCodecTest { | ||
|
|
||
| private final ArrayList<Float> values = Lists.newArrayList(1.1f, 2.2f, 3.3f, 4.4f, 5.5f); | ||
| private final CqlVector vector = CqlVector.newInstance(values); | ||
| private final VectorCodec vectorCodec = | ||
| new VectorCodec(new DefaultVectorType(DataTypes.FLOAT, 5), TypeCodecs.FLOAT); | ||
| private final CqlVector<Float> vector = CqlVector.newInstance(values); | ||
| private final VectorCodec<Float> vectorCodec = | ||
| new VectorCodec<>(new DefaultVectorType(DataTypes.FLOAT, 5), TypeCodecs.FLOAT); | ||
|
|
||
| private final StringToVectorCodec dsbulkCodec = | ||
| new StringToVectorCodec(vectorCodec, Lists.newArrayList("NULL")); | ||
| private StringToVectorCodec<Float> codec; | ||
|
|
||
| @BeforeEach | ||
| void setUp() { | ||
| ConversionContext context = new TextConversionContext().setNullStrings("NULL"); | ||
| ConvertingCodecFactory codecFactory = new ConvertingCodecFactory(context); | ||
| codec = | ||
| (StringToVectorCodec<Float>) | ||
| codecFactory.<String, CqlVector<Float>>createConvertingCodec( | ||
| DataTypes.vectorOf(DataTypes.FLOAT, 5), GenericType.STRING, true); | ||
| } | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a much cleaner way to get to a workable ConversionContext! 👍 |
||
|
|
||
| @Test | ||
| void should_convert_from_valid_external() { | ||
| assertThat(dsbulkCodec) | ||
| .convertsFromExternal(vectorCodec.format(vector)) // standard pattern | ||
| assertThat(codec) | ||
| .convertsFromExternal( | ||
| vectorCodec.format(vector)) // CQL representation is parsable as a json array | ||
| .toInternal(vector) | ||
| .convertsFromExternal("[1.1,2.2,3.3,4.4,5.5]") | ||
| .toInternal(vector) | ||
| .convertsFromExternal("[1.1000,2.2000,3.3000,4.4000,5.5000]") | ||
| .toInternal(vector) | ||
| .convertsFromExternal("") | ||
| .toInternal(null) | ||
|
|
@@ -53,39 +72,44 @@ void should_convert_from_valid_external() { | |
|
|
||
| @Test | ||
| void should_convert_from_valid_internal() { | ||
| assertThat(dsbulkCodec) | ||
| assertThat(codec) | ||
| .convertsFromInternal(vector) | ||
| .toExternal(vectorCodec.format(vector)) | ||
| .toExternal( | ||
| "[1.1,2.2,3.3,4.4,5.5]") // this is NOT 100% identical to vector CQL representation | ||
| .convertsFromInternal(null) | ||
| .toExternal("NULL"); | ||
|
|
||
| // We should encode | ||
| } | ||
|
|
||
| @Test | ||
| void should_not_convert_from_invalid_internal() { | ||
| assertThat(dsbulkCodec).cannotConvertFromInternal("not a valid vector"); | ||
| void should_not_convert_from_invalid_external() { | ||
|
|
||
| // Issue 484: now that we're using the dsbulk string-to-subtype converters we should get | ||
| // enforcement of existing dsbulk policies. For our purposes that means the failure on | ||
| // arithmetic overflow. | ||
| assertThat(codec).cannotConvertFromExternal("6.646329843"); | ||
|
|
||
| // dsbulk should effectively treat this as "6.646329843" so we should also fail in this case | ||
| assertThat(codec).cannotConvertFromExternal("[6.646329843]"); | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This effectively winds up duplicating should_not_convert_too_much_precision() in a way that isn't very clear. The original intent of this method was to perform something similar to JsonNodeToVectorCodecTest.should_not_convert_from_invalid_internal(), specifically given something that isn't a CqlVector this method fails completely. We could certainly add a few more cases but I'd argue it's worthwhile to preserve the symmetry.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This has now been resolved by more recent changes |
||
| } | ||
|
|
||
| // To keep usage consistent with VectorCodec we confirm that we support encoding when too many | ||
| // elements are | ||
| // available but not when too few are. Note that it's actually VectorCodec that enforces this | ||
| // constraint so we | ||
| // have to go through encode() rather than the internal/external methods. | ||
| // elements are available but not when too few are. Note that it's actually VectorCodec that | ||
| // enforces this constraint so we have to go through encode() rather than the internal/external | ||
| // methods. | ||
| @Test | ||
| void should_encode_too_many_but_not_too_few() { | ||
|
|
||
| ArrayList<Float> tooMany = Lists.newArrayList(values); | ||
| tooMany.add(6.6f); | ||
| CqlVector<Float> tooManyVector = CqlVector.newInstance(tooMany); | ||
| String tooManyString = dsbulkCodec.internalToExternal(tooManyVector); | ||
| String tooManyString = codec.internalToExternal(tooManyVector); | ||
| ArrayList<Float> tooFew = Lists.newArrayList(values); | ||
| tooFew.remove(0); | ||
| CqlVector<Float> tooFewVector = CqlVector.newInstance(tooFew); | ||
| String tooFewString = dsbulkCodec.internalToExternal(tooFewVector); | ||
| String tooFewString = codec.internalToExternal(tooFewVector); | ||
|
|
||
| assertThat(dsbulkCodec.encode(tooManyString, ProtocolVersion.DEFAULT)).isNotNull(); | ||
| assertThatThrownBy(() -> dsbulkCodec.encode(tooFewString, ProtocolVersion.DEFAULT)) | ||
| assertThat(codec.encode(tooManyString, ProtocolVersion.DEFAULT)).isNotNull(); | ||
| assertThatThrownBy(() -> codec.encode(tooFewString, ProtocolVersion.DEFAULT)) | ||
| .isInstanceOf(IllegalArgumentException.class); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
See StringToVectorCodec changes below. jsonCodec is here to convert raw string values into Lists; StringToVectorCodec builds CqlVectors out of them.