Skip to content

Commit

Permalink
Merge #698 #701
Browse files Browse the repository at this point in the history
698: Implement Byte Strings r=philberty a=philberty

Byte strings are not strs; they are arrays of type [u8; capacity]. Typing
them as such preserves their type guarantees as byte strings.

This patch merges work from Mark to implement the correct typing. The
missing piece was that each implicit type needed its own implicit id;
otherwise there is a loop when looking up the covariant types.

Fixes #697

Co-authored-by: Mark Wielaard <mark@klomp.org>


701: Fix lexer to not produce bad unicode escape values r=philberty a=CohenArthur

There were a couple of issues in the lexer unicode escape code.
Unicode escape sequences must always start with an opening curly
bracket (and end with a closing one). Underscores are not allowed as
the starting character. And the produced values must be unicode scalar
values, which excludes surrogate values (D800 to DFFF) or values
larger than 10FFFF.

Also try to recover more gracefully from errors by trying to skip past
any bad characters to the end of the escape sequence.

Test all of the above in a new testcase unicode_escape.rs.

Patch: https://git.sr.ht/~mjw/gccrs/commit/unicode_escape
Mail: https://gcc.gnu.org/pipermail/gcc-rust/2021-October/000231.html

Co-authored-by: Philip Herron <philip.herron@embecosm.com>
Co-authored-by: Mark Wielaard <mark@klomp.org>
  • Loading branch information
3 people authored Oct 4, 2021
3 parents e0b9673 + 65d06d5 + 23475a8 commit 99c2830
Show file tree
Hide file tree
Showing 5 changed files with 216 additions and 23 deletions.
43 changes: 41 additions & 2 deletions gcc/rust/backend/rust-compile-expr.h
Original file line number Diff line number Diff line change
Expand Up @@ -304,15 +304,54 @@ class CompileExpr : public HIRCompileBase
}
return;

case HIR::Literal::STRING:
case HIR::Literal::BYTE_STRING: {
case HIR::Literal::STRING: {
auto base = ctx->get_backend ()->string_constant_expression (
literal_value->as_string ());
translated
= ctx->get_backend ()->address_expression (base, expr.get_locus ());
}
return;

case HIR::Literal::BYTE_STRING: {
TyTy::BaseType *tyty = nullptr;
if (!ctx->get_tyctx ()->lookup_type (
expr.get_mappings ().get_hirid (), &tyty))
{
rust_fatal_error (expr.get_locus (),
"did not resolve type for this array expr");
return;
}

// the type here is &[ty; capacity]
rust_assert (tyty->get_kind () == TyTy::TypeKind::REF);
auto ref_tyty = static_cast<TyTy::ReferenceType *> (tyty);
auto base_tyty = ref_tyty->get_base ();
rust_assert (base_tyty->get_kind () == TyTy::TypeKind::ARRAY);
auto array_tyty = static_cast<TyTy::ArrayType *> (base_tyty);

std::string value_str = expr.get_literal ()->as_string ();
std::vector<Bexpression *> vals;
std::vector<unsigned long> indexes;
for (size_t i = 0; i < value_str.size (); i++)
{
char b = value_str.at (i);
Bexpression *bb
= ctx->get_backend ()->char_constant_expression (b);
vals.push_back (bb);
indexes.push_back (i);
}

Btype *array_type = TyTyResolveCompile::compile (ctx, array_tyty);
Bexpression *constructed
= ctx->get_backend ()->array_constructor_expression (
array_type, indexes, vals, expr.get_locus ());

translated
= ctx->get_backend ()->address_expression (constructed,
expr.get_locus ());
}
return;

default:
rust_fatal_error (expr.get_locus (), "unknown literal");
return;
Expand Down
88 changes: 72 additions & 16 deletions gcc/rust/lex/rust-lex.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1273,6 +1273,8 @@ Lexer::parse_escape (char opening_char)
rust_error_at (get_current_location (),
"cannot have a unicode escape \\u in a byte %s",
opening_char == '\'' ? "character" : "string");
// Try to parse it anyway, just to skip it
parse_partial_unicode_escape ();
return std::make_tuple (output_char, additional_length_offset, false);
case '\r':
case '\n':
Expand Down Expand Up @@ -1461,16 +1463,34 @@ Lexer::parse_partial_unicode_escape ()
{
skip_input ();
current_char = peek_input ();
int additional_length_offset = 1;
int additional_length_offset = 0;

bool need_close_brace = false;
if (current_char == '{')
if (current_char != '{')
{
need_close_brace = true;
rust_error_at (get_current_location (),
"unicode escape should start with %<{%>");
/* Skip what should probably have been between brackets. */
while (is_x_digit (current_char) || current_char == '_')
{
skip_input ();
current_char = peek_input ();
additional_length_offset++;
}
return std::make_pair (Codepoint (0), additional_length_offset);
}

skip_input ();
current_char = peek_input ();
additional_length_offset++;

if (current_char == '_')
{
rust_error_at (get_current_location (),
"unicode escape cannot start with %<_%>");
skip_input ();
current_char = peek_input ();
additional_length_offset++;
// fallthrough and try to parse the rest anyway
}

// parse unicode escape - 1-6 hex digits
Expand Down Expand Up @@ -1500,21 +1520,45 @@ Lexer::parse_partial_unicode_escape ()
current_char = peek_input ();
}

// ensure closing brace if required
if (need_close_brace)
if (current_char == '}')
{
if (current_char == '}')
skip_input ();
current_char = peek_input ();
additional_length_offset++;
}
else
{
// actually an error, but allow propagation anyway. Assume that a
// wrong bracket, whitespace or single/double quotes are wrong
// termination; otherwise it is a wrong character, so skip to the actual
// terminator.
if (current_char == '{' || is_whitespace (current_char)
|| current_char == '\'' || current_char == '"')
{
skip_input ();
current_char = peek_input ();
additional_length_offset++;
rust_error_at (get_current_location (),
"expected terminating %<}%> in unicode escape");
return std::make_pair (Codepoint (0), additional_length_offset);
}
else
{
// actually an error, but allow propagation anyway
rust_error_at (get_current_location (),
"expected terminating %<}%> in unicode escape");
// return false;
"invalid character %<%c%> in unicode escape",
current_char);
while (current_char != '}' && current_char != '{'
&& !is_whitespace (current_char) && current_char != '\''
&& current_char != '"')
{
skip_input ();
current_char = peek_input ();
additional_length_offset++;
}
// Consume the actual closing bracket if found
if (current_char == '}')
{
skip_input ();
current_char = peek_input ();
additional_length_offset++;
}
return std::make_pair (Codepoint (0), additional_length_offset);
}
}
Expand All @@ -1530,10 +1574,22 @@ Lexer::parse_partial_unicode_escape ()
return std::make_pair (Codepoint (0), additional_length_offset);
}

long hex_num = std::strtol (num_str.c_str (), nullptr, 16);
unsigned long hex_num = std::strtoul (num_str.c_str (), nullptr, 16);

// assert fits a uint32_t
gcc_assert (hex_num < 4294967296);
if (hex_num > 0xd7ff && hex_num < 0xe000)
{
rust_error_at (
get_current_location (),
"unicode escape cannot be a surrogate value (D800 to DFFF)");
return std::make_pair (Codepoint (0), additional_length_offset);
}

if (hex_num > 0x10ffff)
{
rust_error_at (get_current_location (),
"unicode escape cannot be larger than 10FFFF");
return std::make_pair (Codepoint (0), additional_length_offset);
}

// return true;
return std::make_pair (Codepoint (static_cast<uint32_t> (hex_num)),
Expand Down
44 changes: 39 additions & 5 deletions gcc/rust/typecheck/rust-hir-type-check-expr.h
Original file line number Diff line number Diff line change
Expand Up @@ -609,15 +609,49 @@ class TypeCheckExpr : public TypeCheckBase
break;

case HIR::Literal::LitType::BYTE_STRING: {
/* We just treat this as a string, but it really is an arraytype of
u8. It isn't in UTF-8, but really just a byte array. */
TyTy::BaseType *base = nullptr;
auto ok = context->lookup_builtin ("str", &base);
/* This is an arraytype of u8 reference (&[u8;size]). It isn't in
UTF-8, but really just a byte array. Code to construct the array
reference copied from ArrayElemsValues and ArrayType. */
TyTy::BaseType *u8;
auto ok = context->lookup_builtin ("u8", &u8);
rust_assert (ok);

auto crate_num = mappings->get_current_crate ();
Analysis::NodeMapping capacity_mapping (crate_num, UNKNOWN_NODEID,
mappings->get_next_hir_id (
crate_num),
UNKNOWN_LOCAL_DEFID);

/* Capacity is the size of the string (number of chars).
It is a constant, but we fold it to get a Bexpression. */
std::string capacity_str
= std::to_string (expr.get_literal ()->as_string ().size ());
HIR::LiteralExpr literal_capacity (capacity_mapping, capacity_str,
HIR::Literal::LitType::INT,
PrimitiveCoreType::CORETYPE_USIZE,
expr.get_locus ());

// mark the type for this implicit node
context->insert_type (capacity_mapping,
new TyTy::USizeType (
capacity_mapping.get_hirid ()));

Bexpression *capacity
= ConstFold::ConstFoldExpr::fold (&literal_capacity);

Analysis::NodeMapping array_mapping (crate_num, UNKNOWN_NODEID,
mappings->get_next_hir_id (
crate_num),
UNKNOWN_LOCAL_DEFID);

TyTy::ArrayType *array
= new TyTy::ArrayType (array_mapping.get_hirid (), capacity,
TyTy::TyVar (u8->get_ref ()));
context->insert_type (array_mapping, array);

infered
= new TyTy::ReferenceType (expr.get_mappings ().get_hirid (),
TyTy::TyVar (base->get_ref ()), false);
TyTy::TyVar (array->get_ref ()), false);
}
break;

Expand Down
4 changes: 4 additions & 0 deletions gcc/testsuite/rust/compile/torture/byte_str.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
// Compile test for byte string typing: the binding is declared with the
// explicit type &[u8; 4] and only afterwards assigned the four-byte literal
// b"test", so the literal has to type-check as a reference to a u8 array.
pub fn main() {
let a: &[u8; 4];
a = b"test";
}
60 changes: 60 additions & 0 deletions gcc/testsuite/rust/compile/unicode_escape.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// Lexer test for \u unicode escapes in char, string and byte literals.
// Each case is written once as a char literal (_c...) and once as a string
// literal (_s...). Lines carrying a dg-error directive are expected to
// produce a diagnostic containing "unicode escape"; all other lines are
// expected to be accepted.
fn main ()
{
// Braces are required
let _cbl = '\u013'; // { dg-error "unicode escape" }
let _sbl = "\u013"; //{ dg-error "unicode escape" }

// One to six hex digits
let _c0 = '\u{}'; // { dg-error "unicode escape" }
let _c1 = '\u{0}';
let _c2 = '\u{00}';
let _c3 = '\u{000}';
let _c4 = '\u{0000}';
let _c5 = '\u{00000}';
let _c6 = '\u{000000}';
let _c7 = '\u{0000000}'; // { dg-error "unicode escape" }

let _s0 = "\u{}"; // { dg-error "unicode escape" }
let _s1 = "\u{0}";
let _s2 = "\u{00}";
let _s3 = "\u{000}";
let _s4 = "\u{0000}";
let _s5 = "\u{00000}";
let _s6 = "\u{000000}";
let _s7 = "\u{0000000}"; // { dg-error "unicode escape" }

// Underscores OK except for start
let _c_ = '\u{00___01__0_1_}';
let _s_ = "\u{00___01__0_1_}";
let _c__ = '\u{_00__01__0_}'; // { dg-error "unicode escape" }
let _s__ = "\u{_00__01__0_}"; // { dg-error "unicode escape" }

// Must be hex chars
// NOTE(review): _shex below uses a char literal, same as _chex; every other
// _c/_s pair uses a string literal for the _s variant -- presumably
// "\u{hex}" was intended here. Confirm before changing.
let _chex = '\u{hex}'; // { dg-error "unicode escape" }
let _shex = '\u{hex}'; // { dg-error "unicode escape" }

// Only valid from 0x0 to 0xD7FF and from 0xE000 to 0x10FFFF
let _cd7ff = '\u{D7FF}';
let _sd7ff = "\u{D7FF}";
let _cd800 = '\u{D800}'; // { dg-error "unicode escape" }
let _sd800 = "\u{D800}"; // { dg-error "unicode escape" }

let _cdfff = '\u{DFFF}'; // { dg-error "unicode escape" }
let _sdfff = "\u{DFFF}"; // { dg-error "unicode escape" }
let _ce000 = '\u{E000}';
let _se000 = "\u{E000}";

// 0x10FFFF is the largest accepted value; 0x110000 and above are rejected
let _clast = '\u{10FFFF}';
let _slast = "\u{10FFFF}";
let _clast1 = '\u{110000}'; // { dg-error "unicode escape" }
let _slast1 = "\u{110000}"; // { dg-error "unicode escape" }

let _cffffff = '\u{FFFFFF}'; // { dg-error "unicode escape" }
let _sffffff = "\u{FFFFFF}"; // { dg-error "unicode escape" }

// unicode escapes cannot be used in bytes or byte strings.
// Except in raw byte strings (where they aren't escapes).
let _bc = b'\u{000A}'; // { dg-error "unicode escape" }
let _bs = b"\u{000A}"; // { dg-error "unicode escape" }
let _rbs = br"\u{000A}";
}

0 comments on commit 99c2830

Please sign in to comment.