diff --git a/go/fury/meta/meta_string_encoder.go b/go/fury/meta/meta_string_encoder.go
index 6d3b89bf0c..419e41e2bf 100644
--- a/go/fury/meta/meta_string_encoder.go
+++ b/go/fury/meta/meta_string_encoder.go
@@ -36,6 +36,10 @@ func NewEncoder(specialCh1 byte, specialCh2 byte) *Encoder {
 
 // Encode the input string to MetaString using adaptive encoding
 func (e *Encoder) Encode(input string) (MetaString, error) {
+	if !isASCII(input) {
+		// Non-ASCII input is always UTF-8; go through EncodeWithEncoding so the length limit still applies.
+		return e.EncodeWithEncoding(input, UTF_8)
+	}
 	encoding := e.ComputeEncoding(input)
 	return e.EncodeWithEncoding(input, encoding)
 }
@@ -43,7 +47,7 @@ func (e *Encoder) Encode(input string) (MetaString, error) {
 // EncodeWithEncoding Encodes the input string to MetaString using specified encoding.
 func (e *Encoder) EncodeWithEncoding(input string, encoding Encoding) (MetaString, error) {
 	if encoding != UTF_8 && !isASCII(input) {
-        return MetaString{}, errors.New("non-ASCII characters in meta string are not allowed")
+		return MetaString{}, errors.New("non-ASCII characters in meta string are not allowed")
 	}
 	if len(input) > 32767 {
 		return MetaString{}, errors.New("long meta string than 32767 is not allowed")
@@ -171,12 +175,12 @@ func (e *Encoder) ComputeEncoding(input string) Encoding {
 }
 
 func isASCII(input string) bool {
-    for _, r := range input {
-        if r > 127 {
+	for _, r := range input {
+		if r > 127 {
 			return false
-        }
-    }
-    return true
+		}
+	}
+	return true
 }
 
 type stringStatistics struct {
diff --git a/go/fury/meta/meta_string_test.go b/go/fury/meta/meta_string_test.go
index 0fb89f6dbb..6560578d33 100644
--- a/go/fury/meta/meta_string_test.go
+++ b/go/fury/meta/meta_string_test.go
@@ -18,8 +18,9 @@
 package meta
 
 import (
-	"github.com/stretchr/testify/require"
 	"testing"
+
+	"github.com/stretchr/testify/require"
 )
 
 func TestEncodeAndDecodeMetaString(t *testing.T) {
@@ -80,3 +81,39 @@ func calcTotalBytes(src string, bitsPerChar int, encoding Encoding) int {
 	}
 	return (ret + 7) / 8
 }
+
+func TestAsciiEncoding(t *testing.T) {
+	encoder := NewEncoder('.', '_')
+
+	data, err := encoder.Encode("asciiOnly")
+	require.NoError(t, err)
+	require.NotEqual(t, UTF_8, data.GetEncoding(), "Encoding should not be UTF-8 for ASCII strings")
+}
+
+func TestNonAsciiEncoding(t *testing.T) {
+	encoder := NewEncoder('.', '_')
+
+	data, err := encoder.Encode("こんにちは") // Non-ASCII String
+	require.NoError(t, err)
+	require.Equal(t, UTF_8, data.GetEncoding(), "Encoding should be UTF-8 for non-ASCII strings")
+}
+
+func TestEncodeWithEncodingNonAscii(t *testing.T) {
+	encoder := NewEncoder('.', '_')
+
+	_, err := encoder.EncodeWithEncoding("こんにちは", LOWER_SPECIAL)
+	require.Error(t, err, "Expected error for non-ASCII characters in non-UTF-8 encoding")
+	require.Equal(t, "non-ASCII characters in meta string are not allowed", err.Error())
+}
+
+func TestNonAsciiEncodingTooLong(t *testing.T) {
+	encoder := NewEncoder('.', '_')
+
+	// 32768 multi-byte runes exceed the 32767-byte meta string limit.
+	runes := make([]rune, 32768)
+	for i := range runes {
+		runes[i] = 'こ'
+	}
+	_, err := encoder.Encode(string(runes))
+	require.Error(t, err, "Expected error for over-long non-ASCII meta string")
+}
diff --git a/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringEncoder.java b/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringEncoder.java
index 619a441c63..b6a0a58b44 100644
--- a/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringEncoder.java
+++ b/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringEncoder.java
@@ -57,6 +57,14 @@ public MetaString encode(String input, Encoding[] encodings) {
     if (input.isEmpty()) {
       return new MetaString(input, Encoding.UTF_8, specialChar1, specialChar2, new byte[0]);
     }
+    if (!StringSerializer.isLatin(input.toCharArray())) {
+      return new MetaString(
+          input,
+          Encoding.UTF_8,
+          specialChar1,
+          specialChar2,
+          input.getBytes(StandardCharsets.UTF_8));
+    }
     Encoding encoding = computeEncoding(input, encodings);
     return encode(input, encoding);
   }
diff --git a/java/fury-core/src/test/java/org/apache/fury/meta/MetaStringTest.java b/java/fury-core/src/test/java/org/apache/fury/meta/MetaStringTest.java
index f85d5e1507..10bfe37a09 100644
--- a/java/fury-core/src/test/java/org/apache/fury/meta/MetaStringTest.java
+++ b/java/fury-core/src/test/java/org/apache/fury/meta/MetaStringTest.java
@@ -214,4 +214,36 @@ public void testEmptyString() {
     String decoded = decoder.decode(metaString.getBytes(), metaString.getEncoding());
     assertEquals(decoded, "");
   }
+
+  @Test
+  public void testAsciiEncoding() {
+    MetaStringEncoder encoder = new MetaStringEncoder('_', '$');
+    String testString = "asciiOnly";
+    MetaString encodedMetaString = encoder.encode(testString);
+    assertNotSame(encodedMetaString.getEncoding(), MetaString.Encoding.UTF_8);
+    assertEquals(encodedMetaString.getEncoding(), MetaString.Encoding.ALL_TO_LOWER_SPECIAL);
+  }
+
+  @Test
+  public void testNonAsciiEncoding() {
+    MetaStringEncoder encoder = new MetaStringEncoder('_', '$');
+    String testString = "こんにちは"; // Non-ASCII string
+    MetaString encodedMetaString = encoder.encode(testString);
+    assertEquals(encodedMetaString.getEncoding(), MetaString.Encoding.UTF_8);
+  }
+
+  @Test
+  public void testNonAsciiEncodingAndNonUTF8() {
+    MetaStringEncoder encoder = new MetaStringEncoder('_', '$');
+    String nonAsciiString = "こんにちは"; // Non-ASCII string
+
+    try {
+      encoder.encode(nonAsciiString, MetaString.Encoding.LOWER_SPECIAL);
+    } catch (IllegalArgumentException e) {
+      assertEquals(e.getMessage(), "Non-ASCII characters in meta string are not allowed");
+      return;
+    }
+    // Reaching this point means no exception was thrown, which must fail the test.
+    throw new AssertionError("Expected IllegalArgumentException for non-ASCII input");
+  }
 }