From df2adeb3169fe8e813117da7aad2e6030ecb5da6 Mon Sep 17 00:00:00 2001 From: "D. Bohdan" Date: Mon, 20 May 2024 12:11:27 +0000 Subject: [PATCH] feat!: throw an error on UTF-16 or UTF-32 BOM BREAKING CHANGE: UTF-16 and UTF-32 files that have the byte-order mark are no longer passed through with `-p`/`--pass-through`. v0.16.0 --- README.md | 7 +++++-- VERSION | 2 +- initool.sml | 42 ++++++++++++++++++++++++++++++++++++------ 3 files changed, 42 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index abced9f..a046426 100644 --- a/README.md +++ b/README.md @@ -220,8 +220,11 @@ Commands act on all of them at the same time. ### Text encodings Initool is encoding-naive and assumes one character is one byte. -It correctly processes UTF-8-encoded files when given UTF-8 command-line arguments but can't open files in UTF-16 or UTF-32. -On Windows, it will receive the command-line arguments in the encoding for your system's language for non-Unicode programs (e.g., [Windows-1252](https://en.wikipedia.org/wiki/Windows-1252)), +It correctly processes UTF-8-encoded files when given UTF-8 command-line arguments. +It exits with an encoding error if it detects the UTF-16 or UTF-32 [BOM](https://en.wikipedia.org/wiki/Byte_order_mark). +Trying to open a UTF-16 or UTF-32 file without the BOM results in an "invalid line" error because initool fails to parse it. + +On Windows, initool will receive the command-line arguments in the encoding for your system's language for non-Unicode programs (e.g., [Windows-1252](https://en.wikipedia.org/wiki/Windows-1252)), which limits what you can do with UTF-8-encoded files. 
diff --git a/VERSION b/VERSION index a551051..04a373e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.15.0 +0.16.0 diff --git a/initool.sml b/initool.sml index 8639939..5489cb7 100644 --- a/initool.sml +++ b/initool.sml @@ -3,9 +3,29 @@ * License: MIT *) -type options = {ignoreCase: bool, passThrough: bool} +exception Encoding of string -fun idOptions (opts: options) : Id.options = {ignoreCase = #ignoreCase opts} +val unsupportedEncoding = "unsupported encoding: " + +fun checkWrongEncoding (lines: string list) = (* raises Encoding if the input starts with a UTF-16/UTF-32 BOM; otherwise returns lines unchanged *) + let + val _ = + case lines of + first :: _ => (* the BOM is at the very start of the file, so check line one of any non-empty input, not only one-line files *) + (case map Char.ord (String.explode first) of + 0x00 :: 0x00 :: 0xFE :: 0xFF :: _ => + raise Encoding (unsupportedEncoding ^ "UTF-32 BE") + | 0xFF :: 0xFE :: 0x00 :: 0x00 :: _ => + raise Encoding (unsupportedEncoding ^ "UTF-32 LE") + | 0xFE :: 0xFF :: _ => + raise Encoding (unsupportedEncoding ^ "UTF-16 BE") + | 0xFF :: 0xFE :: _ => + raise Encoding (unsupportedEncoding ^ "UTF-16 LE") + | _ => ()) + | _ => () + in + lines + end fun readLines (filename: string) : string list = let @@ -44,7 +64,8 @@ datatype result = Output of string | FailureOutput of string | Error of string fun processFileCustom quiet passThrough successFn filterFn filename = let - val parsed = Ini.parse passThrough (readLines filename) + val parsed = + ((Ini.parse passThrough) o checkWrongEncoding o readLines) filename val filtered = filterFn parsed val success = successFn (parsed, filtered) val output = if quiet then "" else Ini.stringify filtered @@ -100,13 +121,17 @@ fun helpCommand [] = Output allUsage Error (invalidUsage ^ (formatArgs (cmd :: rest)) ^ "\n" ^ usage ^ cmd) fun versionCommand [] = - let val version = "0.15.0" + let val version = "0.16.0" in Output (version ^ "\n") end | versionCommand [_] = versionCommand [] | versionCommand (cmd :: rest) = Error (invalidUsage ^ (formatArgs (cmd :: rest)) ^ "\n" ^ usage ^ cmd) +type options = {ignoreCase: bool, passThrough: bool} + +fun idOptions (opts: options) : Id.options = {ignoreCase 
= #ignoreCase opts} + fun getCommand (opts: options) [_, filename] = processFile (#passThrough opts) (fn _ => true) (fn x => x) filename | getCommand opts [_, filename, section] = @@ -139,7 +164,7 @@ fun getCommand (opts: options) [_, filename] = val q = Ini.SelectProperty {section = section, key = key} val parsed = ((Ini.select (idOptions opts) q) o (Ini.parse (#passThrough opts)) - o readLines) filename + o checkWrongEncoding o readLines) filename val allItems = List.concat (List.map (fn {name = _, contents = xs} => xs) parsed) val values = @@ -275,11 +300,16 @@ fun processArgs (opts: options) [] = helpCommand [] | processArgs opts (cmd :: _) = Error (unknownCommand ^ (formatArgs [cmd]) ^ "\n" ^ availableCommands) +fun handleException (message: string) = + exitWithError "" ("Error: " ^ message) + val args = CommandLine.arguments () val result = processArgs {ignoreCase = false, passThrough = false} args - handle Ini.Tokenization (message) => exitWithError "" ("Error: " ^ message) + handle + Encoding message => handleException message + | Ini.Tokenization message => handleException message val _ = case result of Output s => printFlush TextIO.stdOut s