elixir-lang · josevalim · Nov 15, 2021 · Nov 15, 2021 · Nov 15, 2021 · ericmj
diff --git a/lib/elixir/src/elixir_interpolation.erl b/lib/elixir/src/elixir_interpolation.erl
@@ -3,9 +3,7 @@
 -export([extract/6, unescape_string/1, unescape_string/2,
 unescape_tokens/1, unescape_map/1]).
 -include("elixir.hrl").
--define(is_hex(S), ((S >= $0 andalso S =< $9) orelse
-                    (S >= $A andalso S =< $F) orelse
-                    (S >= $a andalso S =< $f))).
+-include("elixir_tokenizer.hrl").
 
 %% Extract string interpolations
 
@@ -60,8 +58,8 @@ extract([$#, ${ | Rest], Buffer, Output, Line, Column, Scope, true, Last) ->
       {error, {string, Line, Column, "missing interpolation terminator: \"}\"", []}}
   end;
 
-extract([$\\, Char | Rest], Buffer, Output, Line, Column, Scope, Interpol, Last) ->
-  extract(Rest, [Char, $\\ | Buffer], Output, Line, Column + 2, Scope, Interpol, Last);
+extract([$\\ | Rest], Buffer, Output, Line, Column, Scope, Interpol, Last) ->
+  extract_char(Rest, [$\\ | Buffer], Output, Line, Column + 1, Scope, Interpol, Last);
 
 %% Catch all clause
 
@@ -70,8 +68,21 @@ extract([Char1, Char2 | Rest], Buffer, Output, Line, Column, Scope, Interpol, La
   extract([Char2 | Rest], [Char1 | Buffer], Output, Line, Column + 1, Scope, Interpol, Last);
 
 extract(Rest, Buffer, Output, Line, Column, Scope, Interpol, Last) ->
+  extract_char(Rest, Buffer, Output, Line, Column, Scope, Interpol, Last).
+
+%% Consumes one grapheme cluster from Rest: raw bidi controls are rejected, anything else is buffered.
+extract_char(Rest, Buffer, Output, Line, Column, Scope, Interpol, Last) ->
-  [Char | NewRest] = unicode_util:gc(Rest),
-  extract(NewRest, [Char | Buffer], Output, Line, Column + 1, Scope, Interpol, Last).
+  case unicode_util:gc(Rest) of
+    [Char | _] when ?bidi(Char) -> % error points at the offending character's own Line/Column
+      Token = io_lib:format("\\u~4.16.0B", [Char]),
+      Pre = "invalid bidirectional formatting character in string: ",
+      Pos = io_lib:format(". If you want to use such character, use it in its escaped ~ts form instead", [Token]),
+      {error, {Line, Column, {Pre, Pos}, Token}};
+    [Char | NewRest] ->
+      extract(NewRest, [Char | Buffer], Output, Line, Column + 1, Scope, Interpol, Last);
+    [] -> % input ended (e.g. right after a backslash): no badmatch; assumes extract/8 has an end-of-input clause - confirm
+      extract([], Buffer, Output, Line, Column, Scope, Interpol, Last)
+  end.
 
 %% Handle newlines. Heredocs require special attention
 

diff --git a/lib/elixir/src/elixir_tokenizer.erl b/lib/elixir/src/elixir_tokenizer.erl
@@ -1,28 +1,8 @@
 -module(elixir_tokenizer).
 -include("elixir.hrl").
+-include("elixir_tokenizer.hrl").
 -export([tokenize/1, tokenize/3, tokenize/4, invalid_do_error/1]).
 
-%% Numbers
--define(is_hex(S), (?is_digit(S) orelse (S >= $A andalso S =< $F) orelse (S >= $a andalso S =< $f))).
--define(is_bin(S), (S >= $0 andalso S =< $1)).
--define(is_octal(S), (S >= $0 andalso S =< $7)).
-
-%% Digits and letters
--define(is_digit(S), (S >= $0 andalso S =< $9)).
--define(is_upcase(S), (S >= $A andalso S =< $Z)).
--define(is_downcase(S), (S >= $a andalso S =< $z)).
-
-%% Others
--define(is_quote(S), (S =:= $" orelse S =:= $')).
--define(is_sigil(S), (S =:= $/ orelse S =:= $< orelse S =:= $" orelse S =:= $' orelse
-                      S =:= $[ orelse S =:= $( orelse S =:= ${ orelse S =:= $|)).
-
-%% Spaces
--define(is_horizontal_space(S), (S =:= $\s orelse S =:= $\t)).
--define(is_vertical_space(S), (S =:= $\r orelse S =:= $\n)).
--define(is_space(S), (?is_horizontal_space(S) orelse ?is_vertical_space(S))).
-
-%% Operators
 -define(at_op(T),
   T =:= $@).
 
@@ -198,9 +178,13 @@ tokenize([$0, $o, H | T], Line, Column, Scope, Tokens) when ?is_octal(H) ->
 % Comments
 
 tokenize([$# | String], Line, Column, Scope, Tokens) ->
-  {Rest, Comment} = tokenize_comment(String, [$#]),
-  preserve_comments(Line, Column, Tokens, Comment, Rest, Scope),
-  tokenize(Rest, Line, Column, Scope, reset_eol(Tokens));
+  case tokenize_comment(String, [$#]) of
+    {error, Char} ->
+      error_comment(Char, [$# | String], Line, Column, Scope, Tokens);
+    {Rest, Comment} ->
+      preserve_comments(Line, Column, Tokens, Comment, Rest, Scope),
+      tokenize(Rest, Line, Column, Scope, reset_eol(Tokens))
+  end;
 
 % Sigils
 
@@ -645,9 +629,7 @@ tokenize([$% | T], Line, Column, Scope, Tokens) ->
   tokenize(T, Line, Column + 1, Scope, [{'%', {Line, Column, nil}} | Tokens]);
 
 tokenize([$. | T], Line, Column, Scope, Tokens) ->
-  DotInfo = {Line, Column, nil},
-  {Rest, EndLine, EndColumn} = strip_dot_space(T, Line, Column + 1, [{'.', DotInfo}| Tokens], Scope),
-  handle_dot([$. | Rest], EndLine, EndColumn, DotInfo, Scope, Tokens);
+  tokenize_dot(T, Line, Column + 1, {Line, Column, nil}, Scope, Tokens);
 
 % Identifiers
 
@@ -719,18 +701,23 @@ strip_horizontal_space([H | T], Counter) when ?is_horizontal_space(H) ->
 strip_horizontal_space(T, Counter) ->
   {T, Counter}.
 
-strip_dot_space(T, Line, Column, Tokens, Scope) ->
+tokenize_dot(T, Line, Column, DotInfo, Scope, Tokens) -> % skips spaces, comments and newlines after a dot, then calls handle_dot/6
   case strip_horizontal_space(T, 0) of
-    {"#" ++ R, _} ->
-      {Rest, Comment} = tokenize_comment(R, [$#]),
-      preserve_comments(Line, Column, Tokens, Comment, Rest, Scope),
-      strip_dot_space(Rest, Line, 1, reset_eol(Tokens), Scope);
+    {[$# | R], _} -> % a comment follows the dot
+      case tokenize_comment(R, [$#]) of
+        {error, Char} -> % tokenize_comment/2 stopped at a bidi control character
+          error_comment(Char, [$# | R], Line, Column, Scope, Tokens);
+
+        {Rest, Comment} ->
+          preserve_comments(Line, Column, Tokens, Comment, Rest, Scope),
+          tokenize_dot(Rest, Line, 1, DotInfo, Scope, Tokens) % keep scanning whatever follows the comment
+      end;
     {"\r\n" ++ Rest, _} ->
-      strip_dot_space(Rest, Line + 1, 1, eol(Line, Column, Tokens), Scope);
+      tokenize_dot(Rest, Line + 1, 1, DotInfo, Scope, Tokens); % next line; Tokens is passed on unchanged (no eol token)
     {"\n" ++ Rest, _} ->
-      strip_dot_space(Rest, Line + 1, 1, eol(Line, Column, Tokens), Scope);
+      tokenize_dot(Rest, Line + 1, 1, DotInfo, Scope, Tokens); % same handling for a bare line feed
     {Rest, Length} ->
-      {Rest, Line, Column + Length}
+      handle_dot([$. | Rest], Line, Column + Length, DotInfo, Scope, Tokens) % put the dot back in front for handle_dot/6
   end.
 
 handle_char(0)   -> {"\\0", "null byte"};
@@ -1171,11 +1158,18 @@ tokenize_comment("\r\n" ++ _ = Rest, Acc) ->
   {Rest, lists:reverse(Acc)};
 tokenize_comment("\n" ++ _ = Rest, Acc) ->
   {Rest, lists:reverse(Acc)};
+tokenize_comment([H | _Rest], _) when ?bidi(H) ->
+  {error, H};
 tokenize_comment([H | Rest], Acc) ->
   tokenize_comment(Rest, [H | Acc]);
 tokenize_comment([], Acc) ->
   {[], lists:reverse(Acc)}.
 
+%% Turns bidi control character H, found in a comment, into a tokenizer error located at Line/Column.
+error_comment(H, Comment, Line, Column, Scope, Tokens) ->
+  Escaped = io_lib:format("\\u~4.16.0B", [H]),
+  error({Line, Column, "invalid bidirectional formatting character in comment: ", Escaped}, Comment, Scope, Tokens).
+
 preserve_comments(Line, Column, Tokens, Comment, Rest, Scope) ->
   case Scope#elixir_tokenizer.preserve_comments of
     Fun when is_function(Fun) ->

diff --git a/lib/elixir/src/elixir_tokenizer.hrl b/lib/elixir/src/elixir_tokenizer.hrl
@@ -0,0 +1,31 @@
+%% Numbers
+-define(is_hex(S), (?is_digit(S) orelse (S >= $A andalso S =< $F) orelse (S >= $a andalso S =< $f))).
+-define(is_bin(S), (S >= $0 andalso S =< $1)).
+-define(is_octal(S), (S >= $0 andalso S =< $7)).
+
+%% Digits and letters
+-define(is_digit(S), (S >= $0 andalso S =< $9)).
+-define(is_upcase(S), (S >= $A andalso S =< $Z)).
+-define(is_downcase(S), (S >= $a andalso S =< $z)).
+
+%% Others
+-define(is_quote(S), (S =:= $" orelse S =:= $')).
+-define(is_sigil(S), (S =:= $/ orelse S =:= $< orelse S =:= $" orelse S =:= $' orelse
+                      S =:= $[ orelse S =:= $( orelse S =:= ${ orelse S =:= $|)).
+
+%% Spaces
+-define(is_horizontal_space(S), (S =:= $\s orelse S =:= $\t)).
+-define(is_vertical_space(S), (S =:= $\r orelse S =:= $\n)).
+-define(is_space(S), (?is_horizontal_space(S) orelse ?is_vertical_space(S))).
+
+%% Bidirectional control characters, as listed in https://trojansource.codes/trojan-source.pdf
+%% Expands to one parenthesized boolean, so it can be combined with other guard tests.
+-define(bidi(C), (C =:= 16#202A orelse   % LEFT-TO-RIGHT EMBEDDING
+                  C =:= 16#202B orelse   % RIGHT-TO-LEFT EMBEDDING
+                  C =:= 16#202D orelse   % LEFT-TO-RIGHT OVERRIDE
+                  C =:= 16#202E orelse   % RIGHT-TO-LEFT OVERRIDE
+                  C =:= 16#2066 orelse   % LEFT-TO-RIGHT ISOLATE
+                  C =:= 16#2067 orelse   % RIGHT-TO-LEFT ISOLATE
+                  C =:= 16#2068 orelse   % FIRST STRONG ISOLATE
+                  C =:= 16#202C orelse   % POP DIRECTIONAL FORMATTING
+                  C =:= 16#2069)).       % POP DIRECTIONAL ISOLATE
diff --git a/lib/elixir/test/elixir/kernel/errors_test.exs b/lib/elixir/test/elixir/kernel/errors_test.exs
@@ -816,6 +816,24 @@ defmodule Kernel.ErrorsTest do
                       'x = 8; <<a, b::size(^x)>> = <<?a, ?b>>'
   end
 
+  test "invalid bidi in source" do # every source below embeds U+202A via the \u202A escape and must raise SyntaxError
+    assert_eval_raise SyntaxError, # bidi control inside a comment that starts the line
+                      ~r"nofile:1:1: invalid bidirectional formatting character in comment: \\u202A",
+                      '# This is a \u202A'
+
+    assert_eval_raise SyntaxError, # bidi control inside a comment that follows a dot
+                      ~r"nofile:1:5: invalid bidirectional formatting character in comment: \\u202A",
+                      'foo. # This is a \u202A'
+
+    assert_eval_raise SyntaxError, # bidi control inside a double-quoted string
+                      ~r"nofile:1:12: invalid bidirectional formatting character in string: \\u202A. If you want to use such character, use it in its escaped \\u202A form instead",
+                      '"this is a \u202A"'
+
+    assert_eval_raise SyntaxError, # bidi control right after a backslash inside a string
+                      ~r"nofile:1:13: invalid bidirectional formatting character in string: \\u202A. If you want to use such character, use it in its escaped \\u202A form instead",
+                      '"this is a \\\u202A"'
+  end
+
   test "function head with guard" do
     assert_eval_raise CompileError, "nofile:2: missing :do option in \"def\"", '''
     defmodule Kernel.ErrorsTest.BodyessFunctionWithGuard do