Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(compiler): Bytes literals #1662

Merged
merged 1 commit into from
Feb 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions compiler/src/parsing/ast_helper.rei
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ type str = loc(string);
type loc = Location.t;

module Constant: {
let bytes: string => constant;
let string: string => constant;
let char: string => constant;
let number: number_type => constant;
Expand Down
83 changes: 64 additions & 19 deletions compiler/src/parsing/lexer.re
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ type error =
| UnclosedChar(int)
| UnclosedBlockComment(int)
| UnclosedDocComment(int)
| IllegalUnicodeCodePoint(string);
| IllegalUnicodeCodePoint(string)
| IllegalByteStringUnicodeChar(string)
| IllegalByteStringUnicodeEscape(string);

exception Error(Location.t, error);

Expand All @@ -32,6 +34,18 @@ let report_error = (ppf, err) =>
Format.fprintf(ppf, "Unclosed doc comment, opened on line %d", line)
| IllegalUnicodeCodePoint(cp) =>
Format.fprintf(ppf, "Illegal unicode code point: %S", cp)
| IllegalByteStringUnicodeChar(cp) =>
Format.fprintf(
ppf,
"Byte strings may not contain non-ascii unicode characters: %S",
cp,
)
| IllegalByteStringUnicodeEscape(cp) =>
Format.fprintf(
ppf,
"Byte strings may not contain unicode escapes: %S",
cp,
)
};

let () =
Expand All @@ -42,13 +56,15 @@ let () =
| _ => None,
);

let add_code_point = (buf, str, loc) => {
let add_code_point = (buf, str, unicode, loc) => {
let (esc, numstr) = (
String.sub(str, 1, 1),
String.sub(str, 2, String.length(str) - 2),
);
let code_point =
switch (esc) {
| "u" when !unicode =>
raise(Error(loc, IllegalByteStringUnicodeEscape(str)))
| "u" when numstr.[0] == '{' =>
Scanf.sscanf(String.sub(numstr, 1, String.length(numstr) - 1), "%x", x =>
x
Expand Down Expand Up @@ -332,9 +348,12 @@ let rec token = lexbuf => {
positioned(INFIX_50(Sedlexing.Utf8.lexeme(lexbuf)))
| "!" => positioned(PREFIX_150(Sedlexing.Utf8.lexeme(lexbuf)))
| "@" => positioned(AT)
| "b\"" =>
let (start_p, _) = Sedlexing.lexing_positions(lexbuf);
read_str(start_p, Buffer.create(16), false, lexbuf);
| '"' =>
let (start_p, _) = Sedlexing.lexing_positions(lexbuf);
read_str(start_p, Buffer.create(16), lexbuf);
read_str(start_p, Buffer.create(16), true, lexbuf);
| "'" =>
let (start_p, _) = Sedlexing.lexing_positions(lexbuf);
read_char(start_p, Buffer.create(4), lexbuf);
Expand All @@ -345,42 +364,63 @@ let rec token = lexbuf => {
| _ => raise(Error(lexbuf_loc(lexbuf), UnrecognizedToken))
};
}
and read_str = (start_p, buf, lexbuf) => {
and read_str = (start_p, buf, unicode, lexbuf) => {
switch%sedlex (lexbuf) {
| ('\\', newline_char) => read_str(start_p, buf, lexbuf)
| ('\\', newline_char) => read_str(start_p, buf, unicode, lexbuf)
| "\\b" =>
Buffer.add_char(buf, '\b');
read_str(start_p, buf, lexbuf);
read_str(start_p, buf, unicode, lexbuf);
| "\\f" =>
Buffer.add_char(buf, '\012');
read_str(start_p, buf, lexbuf);
read_str(start_p, buf, unicode, lexbuf);
| "\\n" =>
Buffer.add_char(buf, '\n');
read_str(start_p, buf, lexbuf);
read_str(start_p, buf, unicode, lexbuf);
| "\\r" =>
Buffer.add_char(buf, '\r');
read_str(start_p, buf, lexbuf);
read_str(start_p, buf, unicode, lexbuf);
| "\\t" =>
Buffer.add_char(buf, '\t');
read_str(start_p, buf, lexbuf);
read_str(start_p, buf, unicode, lexbuf);
| "\\v" =>
Buffer.add_char(buf, '\011');
read_str(start_p, buf, lexbuf);
read_str(start_p, buf, unicode, lexbuf);
| "\\\"" =>
Buffer.add_char(buf, '"');
read_str(start_p, buf, lexbuf);
read_str(start_p, buf, unicode, lexbuf);
| "\\\\" =>
Buffer.add_char(buf, '\\');
read_str(start_p, buf, lexbuf);
read_str(start_p, buf, unicode, lexbuf);
| num_esc =>
add_code_point(buf, Sedlexing.Utf8.lexeme(lexbuf), lexbuf_loc(lexbuf));
read_str(start_p, buf, lexbuf);
add_code_point(
buf,
Sedlexing.Utf8.lexeme(lexbuf),
unicode,
lexbuf_loc(lexbuf),
);
read_str(start_p, buf, unicode, lexbuf);
| '"' =>
let (_, end_p) = Sedlexing.lexing_positions(lexbuf);
(STRING(Buffer.contents(buf)), start_p, end_p);
| any =>
if (unicode) {
(STRING(Buffer.contents(buf)), start_p, end_p);
} else {
(BYTES(Buffer.contents(buf)), start_p, end_p);
};
| 0 .. 127 =>
Buffer.add_string(buf, Sedlexing.Utf8.lexeme(lexbuf));
read_str(start_p, buf, lexbuf);
read_str(start_p, buf, unicode, lexbuf);
| any =>
if (unicode) {
Buffer.add_string(buf, Sedlexing.Utf8.lexeme(lexbuf));
read_str(start_p, buf, unicode, lexbuf);
} else {
raise(
Error(
lexbuf_loc(lexbuf),
IllegalByteStringUnicodeChar(Sedlexing.Utf8.lexeme(lexbuf)),
),
);
}
| _ =>
let (_, end_p) = Sedlexing.lexing_positions(lexbuf);
raise(
Expand Down Expand Up @@ -418,7 +458,12 @@ and read_char = (start_p, buf, lexbuf) => {
Buffer.add_char(buf, '\\');
read_char(start_p, buf, lexbuf);
| num_esc =>
add_code_point(buf, Sedlexing.Utf8.lexeme(lexbuf), lexbuf_loc(lexbuf));
add_code_point(
buf,
Sedlexing.Utf8.lexeme(lexbuf),
true,
lexbuf_loc(lexbuf),
);
read_char(start_p, buf, lexbuf);
| "'" =>
let (_, end_p) = Sedlexing.lexing_positions(lexbuf);
Expand Down
3 changes: 2 additions & 1 deletion compiler/src/parsing/parser.mly
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ module Grain_parsing = struct end
%token <string> INT32 INT64 FLOAT32 FLOAT64 BIGINT
%token <string> WASMI32 WASMI64 WASMF32 WASMF64
%token <string> LIDENT UIDENT
%token <string> STRING CHAR
%token <string> STRING BYTES CHAR
%token LBRACK LBRACKRCARET RBRACK LPAREN RPAREN LBRACE RBRACE LCARET RCARET
%token COMMA SEMI AS
%token THICKARROW ARROW
Expand Down Expand Up @@ -210,6 +210,7 @@ const:
| FALSE { Constant.bool false, $loc }
| VOID { Constant.void, $loc }
| STRING { Constant.string $1, $loc }
| BYTES { Constant.bytes $1, $loc }
| CHAR { Constant.char $1, $loc }

expr:
Expand Down
18 changes: 18 additions & 0 deletions compiler/test/suites/strings.re
Original file line number Diff line number Diff line change
Expand Up @@ -291,4 +291,22 @@ bar", 1))|},
{|include "float64"; from Float64 use *; print(div(-1.0d, 0.0d))|},
"-Infinity\n",
);

// Bytes literals
// A b"..." literal with plain ASCII content is accepted; printing b"abc"
// shows the three bytes in hex (0x61 'a', 0x62 'b', 0x63 'c').
assertRun("bytes_literal", {|print(b"abc")|}, "<bytes: 61 62 63 >\n");
// A four-digit \uXXXX escape inside a byte string is rejected with the
// "unicode escapes" error.
assertCompileError(
"bytes_literal_err1",
{|print(b"abc\u1234")|},
"Byte strings may not contain unicode escapes",
);
// The braced \u{...} escape form is rejected with the same error.
assertCompileError(
"bytes_literal_err2",
{|print(b"abc\u{1234}")|},
"Byte strings may not contain unicode escapes",
);
// A raw non-ASCII character (here an emoji) written directly in a byte
// string is rejected with the separate "non-ascii unicode characters" error.
assertCompileError(
"bytes_literal_err3",
{|print(b"abc😂")|},
"Byte strings may not contain non-ascii unicode characters",
);
});