Skip to content

Commit

Permalink
fix(go/java): Enhance ASCII check in meta string encoding (#1631)
Browse files Browse the repository at this point in the history
<!--
**Thanks for contributing to Fury.**

**If this is your first time opening a PR on fury, you can refer to
[CONTRIBUTING.md](https://github.com/apache/incubator-fury/blob/main/CONTRIBUTING.md).**

Contribution Checklist

- The **Apache Fury (incubating)** community has restrictions on the
naming of pr titles. You can also find instructions in
[CONTRIBUTING.md](https://github.com/apache/incubator-fury/blob/main/CONTRIBUTING.md).

- Fury has a strong focus on performance. If the PR you submit will have
an impact on performance, please benchmark it first and provide the
benchmark result here.
-->

## What does this PR do?

<!-- Describe the purpose of this PR. -->
This PR enhances the current ASCII check (before meta string encoding) I
implemented in #1620 to return a UTF-8 encoded `MetaString` early if the
input is non-ASCII. This improves efficiency and saves time on
`computeEncoding` and `encode`. Unit tests are also added.




## Related issues
#1619

## Does this PR introduce any user-facing change?

<!--
If any user-facing interface changes, please [open an
issue](https://github.com/apache/incubator-fury/issues/new/choose)
describing the need to do so and update the document if necessary.
-->

- [ ] Does this PR introduce any public API change?
- [ ] Does this PR introduce any binary protocol compatibility change?


## Benchmark

<!--
When the PR has an impact on performance (if you don't know whether the
PR will have an impact on performance, you can submit the PR first, and
if it will have impact on performance, the code reviewer will explain
it), be sure to attach a benchmark data here.
-->

---------

Signed-off-by: Jason Mok <jjasonmok1@gmail.com>
  • Loading branch information
jasonmokk authored May 14, 2024
1 parent f744640 commit b904639
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 7 deletions.
21 changes: 15 additions & 6 deletions go/fury/meta/meta_string_encoder.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,23 @@ func NewEncoder(specialCh1 byte, specialCh2 byte) *Encoder {

// Encode the input string to MetaString using adaptive encoding
func (e *Encoder) Encode(input string) (MetaString, error) {
if !isASCII(input) {
return MetaString{
inputString: input,
encoding: UTF_8,
specialChar1: e.specialChar1,
specialChar2: e.specialChar2,
encodedBytes: []byte(input),
}, nil
}
encoding := e.ComputeEncoding(input)
return e.EncodeWithEncoding(input, encoding)
}

// EncodeWithEncoding Encodes the input string to MetaString using specified encoding.
func (e *Encoder) EncodeWithEncoding(input string, encoding Encoding) (MetaString, error) {
if encoding != UTF_8 && !isASCII(input) {
return MetaString{}, errors.New("non-ASCII characters in meta string are not allowed")
return MetaString{}, errors.New("non-ASCII characters in meta string are not allowed")
}
if len(input) > 32767 {
return MetaString{}, errors.New("long meta string than 32767 is not allowed")
Expand Down Expand Up @@ -171,12 +180,12 @@ func (e *Encoder) ComputeEncoding(input string) Encoding {
}

func isASCII(input string) bool {
for _, r := range input {
if r > 127 {
for _, r := range input {
if r > 127 {
return false
}
}
return true
}
}
return true
}

type stringStatistics struct {
Expand Down
27 changes: 26 additions & 1 deletion go/fury/meta/meta_string_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@
package meta

import (
"github.com/stretchr/testify/require"
"testing"

"github.com/stretchr/testify/require"
)

func TestEncodeAndDecodeMetaString(t *testing.T) {
Expand Down Expand Up @@ -80,3 +81,27 @@ func calcTotalBytes(src string, bitsPerChar int, encoding Encoding) int {
}
return (ret + 7) / 8
}

func TestAsciiEncoding(t *testing.T) {
encoder := NewEncoder('.', '_')

data, err := encoder.Encode("asciiOnly")
require.NoError(t, err)
require.NotEqual(t, UTF_8, data.GetEncoding(), "Encoding should not be UTF-8 for ASCII strings")
}

func TestNonAsciiEncoding(t *testing.T) {
encoder := NewEncoder('.', '_')

data, err := encoder.Encode("こんにちは") // Non-ASCII String
require.NoError(t, err)
require.Equal(t, UTF_8, data.GetEncoding(), "Encoding should be UTF-8 for non-ASCII strings")
}

func TestEncodeWithEncodingNonAscii(t *testing.T) {
encoder := NewEncoder('.', '_')

_, err := encoder.EncodeWithEncoding("こんにちは", LOWER_SPECIAL)
require.Error(t, err, "Expected error for non-ASCII characters in non-UTF-8 encoding")
require.Equal(t, "non-ASCII characters in meta string are not allowed", err.Error())
}
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,14 @@ public MetaString encode(String input, Encoding[] encodings) {
if (input.isEmpty()) {
return new MetaString(input, Encoding.UTF_8, specialChar1, specialChar2, new byte[0]);
}
if (!StringSerializer.isLatin(input.toCharArray())) {
return new MetaString(
input,
Encoding.UTF_8,
specialChar1,
specialChar2,
input.getBytes(StandardCharsets.UTF_8));
}
Encoding encoding = computeEncoding(input, encodings);
return encode(input, encoding);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -214,4 +214,33 @@ public void testEmptyString() {
String decoded = decoder.decode(metaString.getBytes(), metaString.getEncoding());
assertEquals(decoded, "");
}

@Test
public void testAsciiEncoding() {
MetaStringEncoder encoder = new MetaStringEncoder('_', '$');
String testString = "asciiOnly";
MetaString encodedMetaString = encoder.encode(testString);
assertNotSame(encodedMetaString.getEncoding(), MetaString.Encoding.UTF_8);
assertEquals(encodedMetaString.getEncoding(), MetaString.Encoding.ALL_TO_LOWER_SPECIAL);
}

@Test
public void testNonAsciiEncoding() {
MetaStringEncoder encoder = new MetaStringEncoder('_', '$');
String testString = "こんにちは"; // Non-ASCII string
MetaString encodedMetaString = encoder.encode(testString);
assertEquals(encodedMetaString.getEncoding(), MetaString.Encoding.UTF_8);
}

@Test
public void testNonAsciiEncodingAndNonUTF8() {
MetaStringEncoder encoder = new MetaStringEncoder('_', '$');
String nonAsciiString = "こんにちは"; // Non-ASCII string

try {
encoder.encode(nonAsciiString, MetaString.Encoding.LOWER_SPECIAL);
} catch (IllegalArgumentException e) {
assertEquals(e.getMessage(), "Non-ASCII characters in meta string are not allowed");
}
}
}

0 comments on commit b904639

Please sign in to comment.