From 7a4cb37c0ce1c7ae7e9c2c33f8694d7f06c5f686 Mon Sep 17 00:00:00 2001 From: Thomas Koutcher Date: Tue, 23 Feb 2021 23:13:25 +0100 Subject: [PATCH] Fix loop when wrapping line with ISO-8859-1 character Changes utf8_char_length(), utf8_to_unicode() and utf8_length() implementation to rely on utf8proc. Fixes #1087 --- src/string.c | 89 ++++++++++------------------------------------------ 1 file changed, 17 insertions(+), 72 deletions(-) diff --git a/src/string.c b/src/string.c index 2638d9254..69ec5b89f 100644 --- a/src/string.c +++ b/src/string.c @@ -214,72 +214,24 @@ unicode_width(unsigned long c, int tab_size) /* Number of bytes used for encoding a UTF-8 character indexed by first byte. * Illegal bytes are set one. */ -static const unsigned char utf8_bytes[256] = { - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1, -}; - unsigned char utf8_char_length(const char *string) { - int c = *(unsigned char *) string; + size_t c = *(unsigned char *) string; - return utf8_bytes[c]; + return utf8proc_utf8class[c] ? utf8proc_utf8class[c] : 1; } /* Decode UTF-8 multi-byte representation into a Unicode character. */ unsigned long utf8_to_unicode(const char *string, size_t length) { - unsigned long unicode; - - switch (length) { - case 1: - unicode = string[0]; - break; - case 2: - unicode = (string[0] & 0x1f) << 6; - unicode += (string[1] & 0x3f); - break; - case 3: - unicode = (string[0] & 0x0f) << 12; - unicode += ((string[1] & 0x3f) << 6); - unicode += (string[2] & 0x3f); - break; - case 4: - unicode = (string[0] & 0x0f) << 18; - unicode += ((string[1] & 0x3f) << 12); - unicode += ((string[2] & 0x3f) << 6); - unicode += (string[3] & 0x3f); - break; - case 5: - unicode = (string[0] & 0x0f) << 24; - unicode += ((string[1] & 0x3f) << 18); - unicode += ((string[2] & 0x3f) << 12); - unicode += ((string[3] & 0x3f) << 6); - unicode += (string[4] & 0x3f); - break; - case 6: - unicode = (string[0] & 0x01) << 30; - unicode += ((string[1] & 0x3f) << 24); - unicode += ((string[2] & 0x3f) << 18); - unicode += ((string[3] & 0x3f) << 12); - unicode += ((string[4] & 0x3f) << 6); - unicode += (string[5] & 0x3f); - break; - default: - return 0; - } + utf8proc_int32_t unicode; + utf8proc_ssize_t slen = utf8proc_iterate((const utf8proc_uint8_t *) string, length, &unicode); /* Invalid characters could return the special 0xfffd value but NUL * should be just as good. */ - return unicode > 0x10FFFF ? 0 : unicode; + return slen <= 0 || !utf8proc_codepoint_valid(unicode) ? 0 : unicode; } /* Calculates how much of string can be shown within the given maximum width @@ -293,30 +245,23 @@ utf8_length(const char **start, int max_chars, size_t skip, int *width, size_t m { const char *string = *start; const char *end = max_chars < 0 ? strchr(string, '\0') : string + max_chars; - unsigned char last_bytes = 0; - size_t last_ucwidth = 0; + utf8proc_ssize_t last_bytes = 0; + int last_ucwidth = 0; *width = 0; *trimmed = 0; while (string < end) { - unsigned char bytes = utf8_char_length(string); - size_t ucwidth; - unsigned long unicode; - - if (string + bytes > end) - break; - - /* Change representation to figure out whether - * it is a single- or double-width character. */ - - unicode = utf8_to_unicode(string, bytes); - /* FIXME: Graceful handling of invalid Unicode character. */ - if (!unicode) - break; - - ucwidth = unicode == '\t' ? tab_size - (*width % tab_size) : - utf8proc_charwidth((utf8proc_int32_t) unicode); + utf8proc_int32_t unicode; + utf8proc_ssize_t bytes = utf8proc_iterate((const utf8proc_uint8_t *) string, end - string, &unicode); + int ucwidth; + + /* Assume a width of 1 for invalid UTF-8 encoding (could be ISO-8859-1). */ + if (bytes <= 0 || !utf8proc_codepoint_valid(unicode)) + ucwidth = bytes = 1; + else + ucwidth = unicode == '\t' ? tab_size - (*width % tab_size) : + utf8proc_charwidth(unicode); if (skip > 0) { skip -= ucwidth <= skip ? ucwidth : skip; *start += bytes;