Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use utf conversions from minipal #89036

Merged
merged 9 commits into from
Jul 21, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions src/coreclr/nativeaot/Runtime/eventpipe/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,21 @@ set(AOT_EVENTPIPE_SHIM_DIR "${CMAKE_CURRENT_SOURCE_DIR}")

set (CONTAINER_SOURCES "")
set (CONTAINER_HEADERS "")
set (MINIPAL_SOURCES "")
set (EVENTPIPE_SOURCES "")
set (EVENTPIPE_HEADERS "")
set (GEN_EVENTPIPE_SOURCES "")

set (SHARED_CONTAINERS_SOURCE_PATH "${CLR_SRC_NATIVE_DIR}/containers")
set (SHARED_EVENTPIPE_SOURCE_PATH "${CLR_SRC_NATIVE_DIR}/eventpipe")
set (SHARED_MINIPAL_SOURCE_PATH "${CLR_SRC_NATIVE_DIR}/minipal")
include (${SHARED_EVENTPIPE_SOURCE_PATH}/eventpipe.cmake)
include (${SHARED_CONTAINERS_SOURCE_PATH}/containers.cmake)

list(APPEND MINIPAL_SOURCES
utf8.c
)

if(CLR_CMAKE_HOST_WIN32)
list(APPEND SHARED_DIAGNOSTIC_SERVER_SOURCES
ds-ipc-pal-namedpipe.c
Expand Down Expand Up @@ -50,6 +56,7 @@ list(APPEND EVENTPIPE_HEADERS

addprefix(CONTAINER_SOURCES ${SHARED_CONTAINERS_SOURCE_PATH} "${SHARED_CONTAINER_SOURCES}")
addprefix(CONTAINER_HEADERS ${SHARED_CONTAINERS_SOURCE_PATH} "${SHARED_CONTAINER_HEADERS}")
addprefix(MINIPAL_SOURCES ${SHARED_MINIPAL_SOURCE_PATH} "${MINIPAL_SOURCES}")

addprefix(EVENTPIPE_SOURCES ${SHARED_EVENTPIPE_SOURCE_PATH} "${EVENTPIPE_SOURCES}")
addprefix(EVENTPIPE_HEADERS ${SHARED_EVENTPIPE_SOURCE_PATH} "${EVENTPIPE_HEADERS}")
Expand Down Expand Up @@ -125,6 +132,7 @@ list(APPEND EVENTPIPE_SOURCES
${GEN_EVENTPIPE_SOURCES}
${CONTAINER_SOURCES}
${CONTAINER_HEADERS}
${MINIPAL_SOURCES}
)

list(APPEND AOT_EVENTPIPE_DISABLED_SOURCES
Expand Down
61 changes: 33 additions & 28 deletions src/coreclr/nativeaot/Runtime/eventpipe/ep-rt-aot.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#include <pthread.h>
#endif

#include <minipal/utf8.h>

#include <eventpipe/ep-rt-config.h>
#ifdef ENABLE_PERFTRACING
#include <eventpipe/ep-thread.h>
Expand Down Expand Up @@ -1375,6 +1377,8 @@ ep_rt_utf8_string_replace (
return false;
}

#define MINIPAL_MB_NO_REPLACE_INVALID_CHARS 0x00000008
LakshanF marked this conversation as resolved.
Show resolved Hide resolved

static
ep_char16_t *
ep_rt_utf8_to_utf16le_string (
Expand All @@ -1386,22 +1390,23 @@ ep_rt_utf8_to_utf16le_string (
if (!str)
return NULL;

// Shipping criteria: no EVENTPIPE-NATIVEAOT-TODO left in the codebase
// Implementation would just use strlen and malloc to make a new buffer, and would then copy the string chars one by one.
// Assumes that only ASCII is used for ep_char8_t
size_t len_utf8 = strlen(str);
ep_char16_t *str_utf16 = reinterpret_cast<ep_char16_t *>(malloc ((len_utf8 + 1) * sizeof (ep_char16_t)));
if (!str_utf16)
int32_t flags = MINIPAL_MB_NO_REPLACE_INVALID_CHARS;
LakshanF marked this conversation as resolved.
Show resolved Hide resolved

ep_char16_t* lpDestStr = NULL;
LakshanF marked this conversation as resolved.
Show resolved Hide resolved

if (static_cast<int>(len) < 0)
LakshanF marked this conversation as resolved.
Show resolved Hide resolved
len = strlen(str) + 1;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need the +1 here? Adding the null terminator is handled explicitly below?

Copy link
Member

@am11 am11 Jul 18, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is an existing pattern:

if (len < 0)
len = (glong)strlen(str) + 1;
glong ret = (glong)minipal_get_length_utf8_to_utf16 (str, len, flags);

Reason is, minipal get_length API needs the source length with null-terminator accounted for, so it returns the correct destination count. Handling it within minipal was requiring some additional conditions which we decided to leave out for the caller to handle.

Copy link
Member

@elinor-fung elinor-fung Jul 18, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the length in ep_rt_utf16_to_utf8_string need +1 too, then?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is an existing pattern:

It has the same issue of allocating one extra byte than necessary, and it is further complicated by returning items_written.

We do not need to replicate the issue here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jkotas, not sure which issue you are referring to. The easiest way to understand what goes wrong without +1 is to remove it from coreclr version (it's on the callsite of PAL API) and run PAL tests. fb613a5 did the same thing.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jkotas, not sure which issue you are referring to.

Allocating buffer that is one character longer than necessary.

For example, if ep_rt_utf8_to_utf16le_string is called with a string that is 1 ascii character + zero terminator, the current code allocates 3 * sizeof(CHAR16_T). It is one CHAR16_T more than what is actually needed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems like the pattern with the extra +1 that @am11 mentions is needed to preserve the current behavior. For example, this test fails without +1.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you saying that the event pipe code expects the strings returned by this method to have double zero terminators? It is very atypical contract. If it is really the case, these should be a long comment explaining it.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Allocating buffer that is one character longer than necessary.

PAL, COM and reflection tests are very sensitive about "fill the exact slot or fail" cases. So if that was the case, they all will fail. IOW, that +1 effect gets cancelled out how the code in utf8.c works.


size_t ret = minipal_get_length_utf8_to_utf16 (str, len, flags);

if (ret <= 0)
return NULL;

for (size_t i = 0; i < len_utf8; i++)
{
EP_ASSERT(isascii(str[i]));
str_utf16[i] = str[i];
}
lpDestStr = reinterpret_cast<ep_char16_t *>(malloc((ret + 1) * sizeof(ep_char16_t)));
ret = minipal_convert_utf8_to_utf16 (str, len, reinterpret_cast<CHAR16_T*>(lpDestStr), ret, flags);
lpDestStr[ret] = '\0';

str_utf16[len_utf8] = 0;
return str_utf16;
return lpDestStr;
}

static
Expand Down Expand Up @@ -1450,27 +1455,27 @@ ep_rt_utf16_to_utf8_string (
size_t len)
{
STATIC_CONTRACT_NOTHROW;

if (!str)
return NULL;

// shipping criteria: no EVENTPIPE-NATIVEAOT-TODO left in the codebase
// Simple implementation to create a utf8 string from a utf16 one
size_t len_utf16 = len;
if(len_utf16 == (size_t)-1)
len_utf16 = ep_rt_utf16_string_len (str);

ep_char8_t *str_utf8 = reinterpret_cast<ep_char8_t *>(malloc ((len_utf16 + 1) * sizeof (ep_char8_t)));
if (!str_utf8)
return NULL;
ep_char8_t* lpDestStr = NULL;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: rename to str_utf8 and move the declaration down to where it is first assigned.

if (static_cast<int>(len) < 0) {
len = 0;
while (str[len])
len++;
LakshanF marked this conversation as resolved.
Show resolved Hide resolved

for (size_t i = 0; i < len_utf16; i++)
{
str_utf8[i] = (char)str[i];
}

str_utf8[len_utf16] = 0;
return str_utf8;
size_t ret = minipal_get_length_utf16_to_utf8 (reinterpret_cast<const CHAR16_T *>(str), len, 0);

if (ret <= 0)
return NULL;

lpDestStr = reinterpret_cast<ep_char8_t *>(malloc((ret + 1) * sizeof(ep_char8_t)));
ret = minipal_convert_utf16_to_utf8 (reinterpret_cast<const CHAR16_T*>(str), len, lpDestStr, ret, 0);
lpDestStr[ret] = '\0';

return lpDestStr;
}

static
Expand Down
21 changes: 11 additions & 10 deletions src/native/minipal/utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,8 @@ static size_t GetCharCount(UTF8Encoding* self, unsigned char* bytes, size_t coun
// Initialize stuff
unsigned char *pSrc = bytes;
unsigned char *pEnd = pSrc + count;
int availableBytes, chc;
size_t availableBytes;
int chc;

// Start by assuming we have as many as count, charCount always includes the adjustment
// for the character being decoded
Expand Down Expand Up @@ -532,7 +533,7 @@ static size_t GetCharCount(UTF8Encoding* self, unsigned char* bytes, size_t coun

EncodeChar:

availableBytes = pEnd - pSrc;
availableBytes = (size_t)(pEnd - pSrc);

// don't fall into the fast decoding loop if we don't have enough bytes
if (availableBytes <= 13)
Expand Down Expand Up @@ -749,7 +750,7 @@ static size_t GetCharCount(UTF8Encoding* self, unsigned char* bytes, size_t coun
return 0; \
}

static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, CHAR16_T* chars, size_t charCount)
static size_t GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, CHAR16_T* chars, size_t charCount)
{
assert(chars != NULL);
assert(byteCount >= 0);
Expand Down Expand Up @@ -982,8 +983,8 @@ static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount,
*pTarget = (CHAR16_T)ch;
ENSURE_BUFFER_INC

int availableChars = pAllocatedBufferEnd - pTarget;
int availableBytes = pEnd - pSrc;
size_t availableChars = (size_t)(pAllocatedBufferEnd - pTarget);
size_t availableBytes = (size_t)(pEnd - pSrc);

// don't fall into the fast decoding loop if we don't have enough bytes
// Test for availableChars is done because pStop would be <= pTarget.
Expand Down Expand Up @@ -1289,7 +1290,7 @@ static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount,
return 0;
}

return pTarget - chars;
return (size_t)(pTarget - chars);
}

static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, unsigned char* bytes, size_t byteCount)
Expand Down Expand Up @@ -1510,8 +1511,8 @@ static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, un
if (fallbackUsed && (ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder)) != 0)
goto ProcessChar;

int availableChars = pEnd - pSrc;
int availableBytes = pAllocatedBufferEnd - pTarget;
size_t availableChars = (size_t)(pEnd - pSrc);
size_t availableBytes = (size_t)(pAllocatedBufferEnd - pTarget);

// don't fall into the fast decoding loop if we don't have enough characters
// Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop.
Expand Down Expand Up @@ -1709,7 +1710,7 @@ static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, un
return 0;
}

return (int)(pTarget - bytes);
return (size_t)(pTarget - bytes);
}

static size_t GetByteCount(UTF8Encoding* self, CHAR16_T *chars, size_t count)
Expand Down Expand Up @@ -1889,7 +1890,7 @@ static size_t GetByteCount(UTF8Encoding* self, CHAR16_T *chars, size_t count)
goto ProcessChar;
}

int availableChars = pEnd - pSrc;
size_t availableChars = (size_t)(pEnd - pSrc);

// don't fall into the fast decoding loop if we don't have enough characters
if (availableChars <= 13)
Expand Down