rust-lang · SimonSapin · Jun 6, 2014 · Jun 7, 2014 · Jun 13, 2014 · Jun 13, 2014
diff --git a/src/doc/rust.md b/src/doc/rust.md
@@ -234,7 +234,7 @@ rule. A literal is a form of constant expression, so is evaluated (primarily)
 at compile time.
 
 ~~~~ {.ebnf .gram}
-literal : string_lit | char_lit | num_lit ;
+literal : string_lit | char_lit | byte_string_lit | byte_lit | num_lit ;
 ~~~~
 
 #### Character and string literals
@@ -244,17 +244,17 @@ char_lit : '\x27' char_body '\x27' ;
 string_lit : '"' string_body * '"' | 'r' raw_string ;
 
 char_body : non_single_quote
-          | '\x5c' [ '\x27' | common_escape ] ;
+          | '\x5c' [ '\x27' | common_escape | unicode_escape ] ;
 
 string_body : non_double_quote
-            | '\x5c' [ '\x22' | common_escape ] ;
+            | '\x5c' [ '\x22' | common_escape | unicode_escape ] ;
 raw_string : '"' raw_string_body '"' | '#' raw_string '#' ;
 
 common_escape : '\x5c'
               | 'n' | 'r' | 't' | '0'
               | 'x' hex_digit 2
-              | 'u' hex_digit 4
-              | 'U' hex_digit 8 ;
+unicode_escape : 'u' hex_digit 4
+               | 'U' hex_digit 8 ;
 
 hex_digit : 'a' | 'b' | 'c' | 'd' | 'e' | 'f'
           | 'A' | 'B' | 'C' | 'D' | 'E' | 'F'
@@ -294,7 +294,7 @@ the following forms:
     escaped in order to denote *itself*.
 
 Raw string literals do not process any escapes. They start with the character
-`U+0072` (`r`), followed zero or more of the character `U+0023` (`#`) and a
+`U+0072` (`r`), followed by zero or more of the character `U+0023` (`#`) and a
 `U+0022` (double-quote) character. The _raw string body_ is not defined in the
 EBNF grammar above: it can contain any sequence of Unicode characters and is
 terminated only by another `U+0022` (double-quote) character, followed by the
@@ -319,6 +319,65 @@ r##"foo #"# bar"##;                // foo #"# bar
 "\\x52"; r"\x52";                  // \x52
 ~~~~
 
+#### Byte and byte string literals
+
+~~~~ {.ebnf .gram}
+byte_lit : 'b' '\x27' byte_body '\x27' ;
+byte_string_lit : 'b' '"' byte_string_body * '"' | 'b' 'r' raw_byte_string ;
+
+byte_body : ascii_non_single_quote
+          | '\x5c' [ '\x27' | common_escape ] ;
+
+byte_string_body : ascii_non_double_quote
+            | '\x5c' [ '\x22' | common_escape ] ;
+raw_byte_string : '"' raw_byte_string_body '"' | '#' raw_byte_string '#' ;
+
+~~~~
+
+A _byte literal_ is a single ASCII character (in the `U+0000` to `U+007F` range)
+enclosed within two `U+0027` (single-quote) characters,
+with the exception of `U+0027` itself,
+which must be _escaped_ by a preceding `U+005C` character (`\`),
+or a single _escape_.
+It is equivalent to a `u8` unsigned 8-bit integer _number literal_.
+
+A _byte string literal_ is a sequence of ASCII characters and _escapes_
+enclosed within two `U+0022` (double-quote) characters,
+with the exception of `U+0022` itself,
+which must be _escaped_ by a preceding `U+005C` character (`\`),
+or a _raw byte string literal_.
+It is equivalent to a `&'static [u8]` borrowed vector of unsigned 8-bit integers.
+
+Some additional _escapes_ are available in either byte or non-raw byte string
+literals. An escape starts with a `U+005C` (`\`) and continues with one of
+the following forms:
+
+  * A _byte escape_ starts with `U+0078` (`x`) and is
+    followed by exactly two _hex digits_. It denotes the byte
+    equal to the provided hex value.
+  * A _whitespace escape_ is one of the characters `U+006E` (`n`), `U+0072`
+    (`r`), or `U+0074` (`t`), denoting the byte values `0x0A` (ASCII LF),
+    `0x0D` (ASCII CR) or `0x09` (ASCII HT) respectively.
+  * The _backslash escape_ is the character `U+005C` (`\`) which must be
+    escaped in order to denote its ASCII encoding `0x5C`.
+
+Raw byte string literals do not process any escapes.
+They start with the character `U+0062` (`b`),
+followed by `U+0072` (`r`),
+followed by zero or more of the character `U+0023` (`#`),
+and a `U+0022` (double-quote) character.
+The _raw byte string body_ is not defined in the EBNF grammar above:
+it can contain any sequence of ASCII characters and is
+terminated only by another `U+0022` (double-quote) character, followed by the
+same number of `U+0023` (`#`) characters that preceded the opening `U+0022`
+(double-quote) character.
+A raw byte string literal cannot contain any non-ASCII byte.
+
+All characters contained in the raw byte string body represent their ASCII
+encoding; the characters `U+0022` (double-quote) (except when followed by at
+least as many `U+0023` (`#`) characters as were used to start the raw byte
+string literal) or `U+005C` (`\`) do not have any special meaning.
+
 #### Number literals
 
 ~~~~ {.ebnf .gram}

diff --git a/src/libcore/str.rs b/src/libcore/str.rs
@@ -560,6 +560,8 @@ Section: Comparing strings
 
 // share the implementation of the lang-item vs. non-lang-item
 // eq_slice.
+/// NOTE: This function is (ab)used in rustc::middle::trans::_match
+/// to compare &[u8] byte slices that are not necessarily valid UTF-8.
 #[inline]
 fn eq_slice_(a: &str, b: &str) -> bool {
     #[allow(ctypes)]
@@ -572,6 +574,8 @@ fn eq_slice_(a: &str, b: &str) -> bool {
 }
 
 /// Bytewise slice equality
+/// NOTE: This function is (ab)used in rustc::middle::trans::_match
+/// to compare &[u8] byte slices that are not necessarily valid UTF-8.
 #[cfg(not(test))]
 #[lang="str_eq"]
 #[inline]

diff --git a/src/libregex_macros/lib.rs b/src/libregex_macros/lib.rs
@@ -182,7 +182,7 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
         #[allow(unused_variable)]
         fn run(&mut self, start: uint, end: uint) -> Vec<Option<uint>> {
             let mut matched = false;
-            let prefix_bytes: &[u8] = &$prefix_bytes;
+            let prefix_bytes: &[u8] = $prefix_bytes;
             let mut clist = &mut Threads::new(self.which);
             let mut nlist = &mut Threads::new(self.which);
 

diff --git a/src/librustc/middle/const_eval.rs b/src/librustc/middle/const_eval.rs
@@ -506,6 +506,7 @@ pub fn lit_to_const(lit: &Lit) -> const_val {
         LitBinary(ref data) => {
             const_binary(Rc::new(data.iter().map(|x| *x).collect()))
         }
+        LitByte(n) => const_uint(n as u64),
         LitChar(n) => const_uint(n as u64),
         LitInt(n, _) => const_int(n),
         LitUint(n, _) => const_uint(n),
@@ -528,6 +529,7 @@ pub fn compare_const_vals(a: &const_val, b: &const_val) -> Option<int> {
         (&const_float(a), &const_float(b)) => compare_vals(a, b),
         (&const_str(ref a), &const_str(ref b)) => compare_vals(a, b),
         (&const_bool(a), &const_bool(b)) => compare_vals(a, b),
+        (&const_binary(ref a), &const_binary(ref b)) => compare_vals(a, b),
         _ => None
     }
 }

diff --git a/src/librustc/middle/lint.rs b/src/librustc/middle/lint.rs
@@ -805,6 +805,7 @@ fn check_type_limits(cx: &Context, e: &ast::Expr) {
                     } else { t };
                     let (min, max) = uint_ty_range(uint_type);
                     let lit_val: u64 = match lit.node {
+                        ast::LitByte(_v) => return,  // _v is u8, within range by definition
                         ast::LitInt(v, _) => v as u64,
                         ast::LitUint(v, _) => v,
                         ast::LitIntUnsuffixed(v) => v as u64,

diff --git a/src/librustc/middle/trans/_match.rs b/src/librustc/middle/trans/_match.rs
@@ -1273,13 +1273,24 @@ fn compare_values<'a>(
                     val: bool_to_i1(result.bcx, result.val)
                 }
             }
-            _ => cx.sess().bug("only scalars and strings supported in compare_values"),
+            _ => cx.sess().bug("only strings supported in compare_values"),
         },
         ty::ty_rptr(_, mt) => match ty::get(mt.ty).sty {
             ty::ty_str => compare_str(cx, lhs, rhs, rhs_t),
-            _ => cx.sess().bug("only scalars and strings supported in compare_values"),
+            ty::ty_vec(mt, _) => match ty::get(mt.ty).sty {
+                ty::ty_uint(ast::TyU8) => {
+                    // NOTE: cast &[u8] to &str and abuse the str_eq lang item,
+                    // which calls memcmp().
+                    let t = ty::mk_str_slice(cx.tcx(), ty::ReStatic, ast::MutImmutable);
+                    let lhs = BitCast(cx, lhs, type_of::type_of(cx.ccx(), t).ptr_to());
+                    let rhs = BitCast(cx, rhs, type_of::type_of(cx.ccx(), t).ptr_to());
+                    compare_str(cx, lhs, rhs, rhs_t)
+                },
+                _ => cx.sess().bug("only byte strings supported in compare_values"),
+            },
+            _ => cx.sess().bug("on string and byte strings supported in compare_values"),
         },
-        _ => cx.sess().bug("only scalars and strings supported in compare_values"),
+        _ => cx.sess().bug("only scalars, byte strings, and strings supported in compare_values"),
     }
 }
 

diff --git a/src/librustc/middle/trans/consts.rs b/src/librustc/middle/trans/consts.rs
@@ -43,6 +43,7 @@ pub fn const_lit(cx: &CrateContext, e: &ast::Expr, lit: ast::Lit)
     -> ValueRef {
     let _icx = push_ctxt("trans_lit");
     match lit.node {
+        ast::LitByte(b) => C_integral(Type::uint_from_ty(cx, ast::TyU8), b as u64, false),
         ast::LitChar(i) => C_integral(Type::char(cx), i as u64, false),
         ast::LitInt(i, t) => C_integral(Type::int_from_ty(cx, t), i as u64, true),
         ast::LitUint(u, t) => C_integral(Type::uint_from_ty(cx, t), u, false),

diff --git a/src/librustc/middle/typeck/check/mod.rs b/src/librustc/middle/typeck/check/mod.rs
@@ -1715,6 +1715,7 @@ pub fn check_lit(fcx: &FnCtxt, lit: &ast::Lit) -> ty::t {
         ast::LitBinary(..) => {
             ty::mk_slice(tcx, ty::ReStatic, ty::mt{ ty: ty::mk_u8(), mutbl: ast::MutImmutable })
         }
+        ast::LitByte(_) => ty::mk_u8(),
         ast::LitChar(_) => ty::mk_char(),
         ast::LitInt(_, t) => ty::mk_mach_int(t),
         ast::LitUint(_, t) => ty::mk_mach_uint(t),

diff --git a/src/librustdoc/clean/mod.rs b/src/librustdoc/clean/mod.rs
@@ -1924,6 +1924,14 @@ fn lit_to_str(lit: &ast::Lit) -> String {
     match lit.node {
         ast::LitStr(ref st, _) => st.get().to_string(),
         ast::LitBinary(ref data) => format!("{:?}", data.as_slice()),
+        ast::LitByte(b) => {
+            let mut res = String::from_str("b'");
+            (b as char).escape_default(|c| {
+                res.push_char(c);
+            });
+            res.push_char('\'');
+            res
+        },
         ast::LitChar(c) => format!("'{}'", c),
         ast::LitInt(i, _t) => i.to_str(),
         ast::LitUint(u, _t) => u.to_str(),

diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs
@@ -140,7 +140,8 @@ fn doit(sess: &parse::ParseSess, mut lexer: lexer::StringReader,
             }
 
             // text literals
-            t::LIT_CHAR(..) | t::LIT_STR(..) | t::LIT_STR_RAW(..) => "string",
+            t::LIT_BYTE(..) | t::LIT_BINARY(..) | t::LIT_BINARY_RAW(..) |
+                t::LIT_CHAR(..) | t::LIT_STR(..) | t::LIT_STR_RAW(..) => "string",
 
             // number literals
             t::LIT_INT(..) | t::LIT_UINT(..) | t::LIT_INT_UNSUFFIXED(..) |

diff --git a/src/libsyntax/ast.rs b/src/libsyntax/ast.rs
@@ -616,6 +616,7 @@ pub type Lit = Spanned<Lit_>;
 pub enum Lit_ {
     LitStr(InternedString, StrStyle),
     LitBinary(Rc<Vec<u8> >),
+    LitByte(u8),
     LitChar(char),
     LitInt(i64, IntTy),
     LitUint(u64, UintTy),

diff --git a/src/libsyntax/ext/concat.rs b/src/libsyntax/ext/concat.rs
@@ -47,6 +47,7 @@ pub fn expand_syntax_ext(cx: &mut base::ExtCtxt,
                     ast::LitBool(b) => {
                         accumulator.push_str(format!("{}", b).as_slice());
                     }
+                    ast::LitByte(..) |
                     ast::LitBinary(..) => {
                         cx.span_err(e.span, "cannot concatenate a binary literal");
                     }

diff --git a/src/libsyntax/ext/quote.rs b/src/libsyntax/ext/quote.rs
@@ -436,6 +436,12 @@ fn mk_token(cx: &ExtCtxt, sp: Span, tok: &token::Token) -> Gc<ast::Expr> {
                                 vec!(mk_binop(cx, sp, binop)));
         }
 
+        LIT_BYTE(i) => {
+            let e_byte = cx.expr_lit(sp, ast::LitByte(i));
+
+            return cx.expr_call(sp, mk_token_path(cx, sp, "LIT_BYTE"), vec!(e_byte));
+        }
+
         LIT_CHAR(i) => {
             let e_char = cx.expr_lit(sp, ast::LitChar(i));