Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix non-ascii string/binary literals; Add multi-line binary literals #22

Merged
merged 19 commits into from
Jul 7, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
5cf5941
Allow using string blocks to create binary literals
michallepicki Jun 9, 2021
8354950
multi line binary, not multi line string
michallepicki Jun 9, 2021
40e809e
mark binary literals as utf8 in Erlang; do not escape binary or strin…
michallepicki Jun 13, 2021
5db9f40
output string blocks as raw bytes in erlang source to avoid escaping
michallepicki Jun 13, 2021
aac70de
no need to escape single quotes
michallepicki Jun 13, 2021
9de662d
if double quote in string literal is already escaped, don't try to ex…
michallepicki Jun 13, 2021
97acd17
Move comment about multi-codepoint characters in strings to correct p…
michallepicki Jun 14, 2021
6766774
acknowledge new_line when lexing string block with ignored starting b…
michallepicki Jun 14, 2021
33bc920
Trim lexeme from whitespace instead of subtracting 1 to avoid potenti…
michallepicki Jun 14, 2021
5aa9907
add test_binary.sest
michallepicki Jun 14, 2021
3fdf5aa
show in test what binary literals compile to
michallepicki Jun 14, 2021
7ab034d
simplify test
michallepicki Jun 14, 2021
429ad78
Fix bug where escaped backslash pairs were not grouped so next charac…
michallepicki Jun 15, 2021
b54d983
Reduce down the test to planned escape sequences only
michallepicki Jun 16, 2021
4bf9261
Only allow selected escape sequences; Parse literal into intended run…
michallepicki Jun 19, 2021
8fbf9c2
WIP escape_string in outputErlangCode
michallepicki Jun 19, 2021
5b6e955
fix mistake: don't switch to parsing binary literal from within strin…
michallepicki Jun 19, 2021
bd6acb5
throw a buffer at it
michallepicki Jun 29, 2021
01c8918
Correctly escape carriage return and tab escape sequences
michallepicki Jul 6, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/errors.ml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ type lexer_error =
| BlockClosedWithTooManyBackQuotes of Range.t
| SeeBreakInStringLiteral of Range.t
| NotASingleCodePoint of Range.t
| UnknownEscapeSequence of Range.t

type syntax_error =
| LexerError of lexer_error
Expand Down
28 changes: 25 additions & 3 deletions src/lexer.mll
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,14 @@
Buffer.clear strbuf;
FormatConst(s)


(* [escape_sequence c rngL] decodes the character [c] that follows a
   backslash in a literal and returns the character it stands for.
   Only n, r, t, backslash, double quote and single quote are accepted.
   Any other character is reported through [raise_error] as
   [UnknownEscapeSequence], carrying the range [rngL] given by the caller. *)
let escape_sequence c rngL =
  (* Escape letter paired with the character it denotes. *)
  let known_escapes =
    [
      ('n', '\n');
      ('r', '\r');
      ('t', '\t');
      ('\\', '\\');
      ('"', '"');
      ('\'', '\'');
    ]
  in
  match List.assoc_opt c known_escapes with
  | Some(decoded) -> decoded
  | None -> raise_error (UnknownEscapeSequence(rngL))

}

let space = [' ' '\t']
Expand Down Expand Up @@ -207,7 +215,17 @@ rule token = parse
STRING(rng, s)
}

| ("`" +) {
| ("`"+ break) {
(* When first character in a string block is a line break,
ignore this line break *)
Lexing.new_line lexbuf;
let posL = Range.from_lexbuf lexbuf in
let num_start = String.length (String.trim (Lexing.lexeme lexbuf)) in
let strbuf = Buffer.create 128 in
string_block num_start posL strbuf lexbuf
}

| ("`"+) {
let posL = Range.from_lexbuf lexbuf in
let num_start = String.length (Lexing.lexeme lexbuf) in
let strbuf = Buffer.create 128 in
Expand All @@ -219,15 +237,19 @@ rule token = parse
(* Lexes the body of a double-quoted literal into [strbuf].
   [posL] is the range handed in by the caller; it is attached to every
   error raised here and united with the range of the closing quote to
   form the range of the result.
   Returns the pair (range, accumulated contents).

   A line break or the end of input before the closing quote is an error.
   A backslash followed by any character is decoded by [escape_sequence],
   which rejects unknown escapes. That clause already covers an escaped
   double quote, so no dedicated clause for it is needed: ocamllex picks
   the first clause among matches of equal length, which made the former
   dedicated clause unreachable. *)
and binary_literal posL strbuf = parse
  | break { raise_error (SeeBreakInStringLiteral(posL)) }
  | eof { raise_error (SeeEndOfFileInStringLiteral(posL)) }
  | ("\\" (_ as c)) {
      Buffer.add_char strbuf (escape_sequence c posL); binary_literal posL strbuf lexbuf
    }
  | "\"" { let posR = Range.from_lexbuf lexbuf in (Range.unite posL posR, Buffer.contents strbuf) }
  | _ as c { Buffer.add_char strbuf c; binary_literal posL strbuf lexbuf }

(* Lexes the body of a single-quoted literal into [strbuf].
   [posL] is the range handed in by the caller; it is attached to every
   error raised here and united with the range of the closing quote to
   form the range of the result.
   Returns the pair (range, accumulated contents).

   A line break or the end of input before the closing quote is an error.
   A backslash followed by any character is decoded by [escape_sequence],
   which rejects unknown escapes. That clause already covers an escaped
   single quote, so no dedicated clause for it is needed: ocamllex picks
   the first clause among matches of equal length, which made the former
   dedicated clause unreachable. *)
and string_literal posL strbuf = parse
  | break { raise_error (SeeBreakInStringLiteral(posL)) }
  | eof { raise_error (SeeEndOfFileInStringLiteral(posL)) }
  | ("\\" (_ as c)) {
      Buffer.add_char strbuf (escape_sequence c posL); string_literal posL strbuf lexbuf
    }
  | "\'" { let posR = Range.from_lexbuf lexbuf in (Range.unite posL posR, Buffer.contents strbuf) }
  | _ as c { Buffer.add_char strbuf c; string_literal posL strbuf lexbuf }

and format_literal posL strbuf acc = parse
Expand Down
4 changes: 4 additions & 0 deletions src/logging.ml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,10 @@ let report_lexer_error (e : lexer_error) : unit =
Format.printf "%a: not a single code point\n"
Range.pp rng

| UnknownEscapeSequence(rngL) ->
Format.printf "%a: unknown escape sequence \n"
Range.pp rngL


let report_config_error (e : config_error) : unit =
Format.printf "! [Build error] ";
Expand Down
18 changes: 16 additions & 2 deletions src/outputErlangCode.ml
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,20 @@ let stringify_format_element = function
in
(1, Printf.sprintf "~%s%s" s ch)

(* [escape_character c] returns the code points to emit for [c] inside a
   quoted literal of the generated code.
   Line feed, carriage return, tab, backslash, double quote and single
   quote become a backslash followed by their escape letter or by
   themselves; every other code point is returned unchanged as a
   one-element list. *)
let escape_character c =
  let backslash = Uchar.of_char '\\' in
  let escaped_as ch = [ backslash; Uchar.of_char ch ] in
  match Uchar.to_int c with
  | 0x09 -> escaped_as 't'   (* horizontal tab *)
  | 0x0A -> escaped_as 'n'   (* line feed *)
  | 0x0D -> escaped_as 'r'   (* carriage return *)
  | 0x22 -> escaped_as '"'   (* double quote *)
  | 0x27 -> escaped_as '\''  (* single quote *)
  | 0x5C -> escaped_as '\\'  (* backslash *)
  | _ -> [ c ]

(* [escape_string s] returns [s] with every code point passed through
   [escape_character] and the result re-encoded as UTF-8.
   [s] is decoded with [MyUtil.Utf.uchar_of_utf8].

   The escaped code points are written straight into the buffer instead
   of being collected into intermediate lists first, and the buffer
   starts at the input length because the output is never shorter than
   the input when [s] is well-formed UTF-8. *)
let escape_string s =
  let buffer = Buffer.create (String.length s) in
  s
  |> MyUtil.Utf.uchar_of_utf8
  |> List.iter (fun uchar ->
       escape_character uchar |> List.iter (Buffer.add_utf_8_uchar buffer));
  Buffer.contents buffer

let stringify_base_constant (bc : base_constant) =
match bc with
Expand All @@ -167,9 +181,9 @@ let stringify_base_constant (bc : base_constant) =
else
assert false

| BinaryByString(s) -> Printf.sprintf "<<\"%s\">>" (String.escaped s)
| BinaryByString(s) -> Printf.sprintf "<<\"%s\"/utf8>>" (escape_string s)
| BinaryByInts(ns) -> Printf.sprintf "<<%s>>" (ns |> List.map string_of_int |> String.concat ", ")
| String(s) -> Printf.sprintf "\"%s\"" (String.escaped s)
michallepicki marked this conversation as resolved.
Show resolved Hide resolved
| String(s) -> Printf.sprintf "\"%s\"" (escape_string s)
| Char(uchar) -> Printf.sprintf "%d" (Uchar.to_int uchar)

| FormatString(fmtelems) ->
Expand Down
8 changes: 6 additions & 2 deletions src/parser.mly
Original file line number Diff line number Diff line change
Expand Up @@ -677,10 +677,14 @@ exprbot:
let rng = make_range (Token(tokL)) (Token(tokR)) in
(rng, BinaryByList(ns))
}
| strlit=BINARY {
let (rng, s) = strlit in
| binlit=BINARY {
let (rng, s) = binlit in
(rng, BaseConst(BinaryByString(s)))
}
| strblock=STRING_BLOCK {
let (rng, s) = strblock in
(rng, BaseConst(BinaryByInts(s |> String.to_seq |> List.of_seq |> List.map Char.code)))
}
| strlit=STRING {
let (rng, s) = strlit in
(rng, BaseConst(String(s)))
Expand Down
30 changes: 30 additions & 0 deletions test/pass/test_binary.sest
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
module TestBinary = struct
michallepicki marked this conversation as resolved.
Show resolved Hide resolved

val check : fun({binary, binary, binary, binary, binary}) -> {binary, binary, binary, binary, binary} = external 1 ```
check({A, B, C, D, E}) ->
<<240,159,145,169,226,128,141,240,159,148,172>> = A,
<<"👩‍🔬"/utf8>> = A,
<<10,13,9,34,39,92>> = B,
<<"\n\r\t\"\'\\"/utf8>> = B,
<<39>> = C,
<<"\'"/utf8>> = C,
<<33,34,39,96,92,92>> = D,
<<"!\"\'`\\\\"/utf8>> = D,
<<111,110,101,10,116,119,111>> = E,
<<"one\ntwo"/utf8>> = E,
{A, B, C, D, E}.
```

val main(args) =
let woman_scientist = "👩‍🔬" in
let escape_sequences = "\n\r\t\"\'\\" in
let single_quote = "'" in
let raw = ``!"'`\\`` in
let multiline = ```
one
two```
in
let examples = {woman_scientist, escape_sequences, single_quote, raw, multiline} in
print_debug(check(examples))

end