diff --git a/CHANGELOG.md b/CHANGELOG.md index 417df3b..d24a8b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +## v1.2.0 (2015-01-12) + +- add u::strwidth() to get the width of a string when printed on a terminal +- add more mbstring shims +- add a note about https://bugs.php.net/65358 +- fail properly when COM is not loaded +- fall back on stat() when lstat() fails + ## v1.2.0-beta (2014-08-05) - add best-fit mappings for UTF-8 to Code Page approximations diff --git a/README.md b/README.md index a30bfbc..e194a17 100644 --- a/README.md +++ b/README.md @@ -39,10 +39,11 @@ server even if one or more of those extensions are not enabled: - *utf8_encode, utf8_decode*, - `mbstring`: *mb_check_encoding, mb_convert_case, mb_convert_encoding, mb_decode_mimeheader, mb_detect_encoding, mb_detect_order, - mb_encode_mimeheader, mb_encoding_aliases, mb_internal_encoding, mb_language, - mb_list_encodings, mb_strlen, mb_strpos, mb_strrpos, mb_strtolower, + mb_encode_mimeheader, mb_encoding_aliases, mb_get_info, mb_http_input, + mb_http_output, mb_internal_encoding, mb_language, mb_list_encodings, + mb_output_handler, mb_strlen, mb_strpos, mb_strrpos, mb_strtolower, mb_strtoupper, mb_stripos, mb_stristr, mb_strrchr, mb_strrichr, mb_strripos, - mb_strstr, mb_substitute_character, mb_substr*, + mb_strstr, mb_strwidth, mb_substitute_character, mb_substr, mb_substr_count*, - `iconv`: *iconv, iconv_mime_decode, iconv_mime_decode_headers, iconv_get_encoding, iconv_set_encoding, iconv_mime_encode, ob_iconv_handler, iconv_strlen, iconv_strpos, iconv_strrpos, iconv_substr*, @@ -66,7 +67,8 @@ Some more functions are also provided to help handling UTF-8 strings: - *isUtf8()*: checks if a string contains well formed UTF-8 data, - *toAscii()*: generic UTF-8 to ASCII transliteration, - *strtocasefold()*: unicode transformation for caseless matching, -- *strtonatfold()*: generic case sensitive 
transformation for collation matching +- *strtonatfold()*: generic case sensitive transformation for collation matching, +- *strwidth()*: computes the width of a string when printed on a terminal, - *wrapPath()*: unicode filesystem access under Windows and other OSes. Mirrored string functions are: diff --git a/class/Patchwork/PHP/Shim/Mbstring.php b/class/Patchwork/PHP/Shim/Mbstring.php index 040df1f..d299c4d 100644 --- a/class/Patchwork/PHP/Shim/Mbstring.php +++ b/class/Patchwork/PHP/Shim/Mbstring.php @@ -18,8 +18,12 @@ * - mb_decode_mimeheader - Decode string in MIME header field * - mb_encode_mimeheader - Encode string for MIME header XXX NATIVE IMPLEMENTATION IS REALLY BUGGED * - mb_convert_case - Perform case folding on a string + * - mb_get_info - Get internal settings of mbstring + * - mb_http_input - Detect HTTP input character encoding + * - mb_http_output - Set/Get HTTP output character encoding * - mb_internal_encoding - Set/Get internal character encoding * - mb_list_encodings - Returns an array of all supported encodings + * - mb_output_handler - Callback function converts character encoding in output buffer * - mb_strlen - Get string length * - mb_strpos - Find position of first occurrence of string in a string * - mb_strrpos - Find position of last occurrence of a string in a string @@ -33,28 +37,23 @@ * - mb_strrichr - Finds the last occurrence of a character in a string within another, case insensitive * - mb_strripos - Finds position of last occurrence of a string within another, case insensitive * - mb_strstr - Finds first occurrence of a string within anothers + * - mb_strwidth - Return width of string + * - mb_substr_count - Count the number of substring occurrences * * Not implemented: - * - mb_convert_kana - Convert "kana" one from another ("zen-kaku", "han-kaku" and more) - * - mb_convert_variables - Convert character code in variable(s) - * - mb_decode_numericentity - Decode HTML numeric string reference to character - * - mb_encode_numericentity - Encode character to HTML numeric string reference - * - mb_ereg* - 
Regular expression with multibyte support - * - mb_get_info - Get internal settings of mbstring - * - mb_http_input - Detect HTTP input character encoding - * - mb_http_output - Set/Get HTTP output character encoding - * - mb_list_mime_names - Returns an array or string of all supported mime names - * - mb_output_handler - Callback function converts character encoding in output buffer - * - mb_parse_str - Parse GET/POST/COOKIE data and set global variable - * - mb_preferred_mime_name - Get MIME charset string - * - mb_regex_encoding - Returns current encoding for multibyte regex as string - * - mb_regex_set_options - Set/Get the default options for mbregex functions - * - mb_send_mail - Send encoded mail - * - mb_split - Split multibyte string using regular expression - * - mb_strcut - Get part of string - * - mb_strimwidth - Get truncated string with specified width - * - mb_strwidth - Return width of string - * - mb_substr_count - Count the number of substring occurrences + * - mb_convert_kana - Convert "kana" one from another ("zen-kaku", "han-kaku" and more) + * - mb_convert_variables - Convert character code in variable(s) + * - mb_decode_numericentity - Decode HTML numeric string reference to character + * - mb_encode_numericentity - Encode character to HTML numeric string reference + * - mb_ereg_* - Regular expression with multibyte support + * - mb_parse_str - Parse GET/POST/COOKIE data and set global variable + * - mb_preferred_mime_name - Get MIME charset string + * - mb_regex_encoding - Returns current encoding for multibyte regex as string + * - mb_regex_set_options - Set/Get the default options for mbregex functions + * - mb_send_mail - Send encoded mail + * - mb_split - Split multibyte string using regular expression + * - mb_strcut - Get part of string + * - mb_strimwidth - Get truncated string with specified width */ class Mbstring { @@ -408,6 +407,68 @@ static function mb_strstr($haystack, $needle, $part = false, $encoding = INF) else return 
substr($haystack, $pos); } + static function mb_get_info($type = 'all') + { + $info = array( + 'internal_encoding' => self::$internal_encoding, + 'http_output' => 'pass', + 'http_output_conv_mimetypes' => '^(text/|application/xhtml\+xml)', + 'func_overload' => 0, + 'func_overload_list' => 'no overload', + 'mail_charset' => 'UTF-8', + 'mail_header_encoding' => 'BASE64', + 'mail_body_encoding' => 'BASE64', + 'illegal_chars' => 0, + 'encoding_translation' => 'Off', + 'language' => self::$language, + 'detect_order' => self::$encoding_list, + 'substitute_character' => 'none', + 'strict_detection' => 'Off', + ); + + if ('all' === $type) { + return $info; + } elseif (isset($info[$type])) { + return $info[$type]; + } else { + return false; + } + } + + static function mb_http_input($type = '') + { + return false; + } + + static function mb_http_output($encoding = INF) + { + return INF !== $encoding ? 'pass' === $encoding : 'pass'; + } + + static function mb_strwidth($s, $encoding = INF) + { + $encoding = INF === $encoding ? 
self::$internal_encoding : strtoupper($encoding); + + if ('UTF-8' !== $encoding && 'UTF8' !== $encoding) { + $s = iconv($encoding, 'UTF-8//IGNORE', $s); + } + + $s = preg_replace('/[\x00-\x19]/', '', $s); + + preg_replace('/[\x{0020}-\x{1FFF}\x{FF61}-\x{FF9F}]/u', '', $s, -1, $narrow); + + return (iconv_strlen($s, 'UTF-8') << 1) - $narrow; + } + + static function mb_substr_count($haystack, $needle, $encoding = INF) + { + return substr_count($haystack, $needle); + } + + static function mb_output_handler($contents, $status) + { + return $contents; + } protected static function getSubpart($pos, $part, $haystack, $encoding) { diff --git a/class/Patchwork/Utf8/Bootup/mbstring.php b/class/Patchwork/Utf8/Bootup/mbstring.php index 0af741b..6b6be91 100644 --- a/class/Patchwork/Utf8/Bootup/mbstring.php +++ b/class/Patchwork/Utf8/Bootup/mbstring.php @@ -43,3 +43,9 @@ function mb_strrichr($s, $needle, $part = false, $enc = INF) {return s\Mbstring: function mb_strripos($s, $needle, $offset = 0, $enc = INF) {return s\Mbstring::mb_strripos($s, $needle, $offset, $enc);}; function mb_strrpos($s, $needle, $offset = 0, $enc = INF) {return s\Mbstring::mb_strrpos($s, $needle, $offset, $enc);}; function mb_strstr($s, $needle, $part = false, $enc = INF) {return s\Mbstring::mb_strstr($s, $needle, $part, $enc);}; +function mb_get_info($type = 'all') {return s\Mbstring::mb_get_info($type);} +function mb_http_output($enc = INF) {return s\Mbstring::mb_http_output($enc);} +function mb_strwidth($s, $enc = INF) {return s\Mbstring::mb_strwidth($s, $enc);} +function mb_substr_count($haystack, $needle, $enc = INF) {return s\Mbstring::mb_substr_count($haystack, $needle, $enc);} +function mb_output_handler($contents, $status) {return s\Mbstring::mb_output_handler($contents, $status);} +function mb_http_input($type = '') {return s\Mbstring::mb_http_input($type);} diff --git a/composer.json b/composer.json index dc5c3c1..e06e85c 100644 --- a/composer.json +++ b/composer.json @@ -16,6 +16,7 @@ 
"lib-pcre": ">=7.3" }, "suggest": { + "ext-wfio": "Use WFIO for UTF-8 filesystem access on Windows", "ext-intl": "Use Intl for best performance", "ext-iconv": "Use iconv for best performance", "ext-mbstring": "Use Mbstring for best performance" diff --git a/tests/Patchwork/Tests/PHP/Shim/MbstringTest.php b/tests/Patchwork/Tests/PHP/Shim/MbstringTest.php index 8f18ac1..fce58ad 100644 --- a/tests/Patchwork/Tests/PHP/Shim/MbstringTest.php +++ b/tests/Patchwork/Tests/PHP/Shim/MbstringTest.php @@ -247,4 +247,14 @@ function testmb_encoding_aliases() $this->assertSame(array('utf8'), p::mb_encoding_aliases('UTF-8')); $this->assertFalse(p::mb_encoding_aliases('ASCII')); } + + /** + * @covers Patchwork\PHP\Shim\Mbstring::mb_strwidth + */ + function testmb_strwidth() + { + $this->assertSame( 2, p::mb_strwidth("\0実") ); + $this->assertSame( 4, p::mb_strwidth('déjà') ); + $this->assertSame( 4, p::mb_strwidth(utf8_decode('déjà'), 'CP1252') ); + } }