Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Warn on confusable non-ascii identifiers (UTS 39, C2) #11582

Merged
merged 13 commits into from
Jan 21, 2022
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ unicode: $(UNICODE)
$(UNICODE): lib/elixir/unicode/*
@ echo "==> unicode (compile)";
$(Q) $(ELIXIRC) lib/elixir/unicode/unicode.ex -o lib/elixir/ebin;
$(Q) $(ELIXIRC) lib/elixir/unicode/security.ex -o lib/elixir/ebin;
$(Q) $(ELIXIRC) lib/elixir/unicode/tokenizer.ex -o lib/elixir/ebin;

$(eval $(call APP_TEMPLATE,ex_unit,ExUnit))
Expand Down
16 changes: 13 additions & 3 deletions lib/elixir/pages/unicode-security.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,22 @@ Elixir will not allow tokenization of identifiers with codepoints in `\p{Identif
For instance, the 'HANGUL FILLER' (``) character, which is often invisible, is an uncommon codepoint and will trigger this warning.

## C2, C3 (planned)
## C2. Confusable detection

Elixir may implement Confusable Detection, and Mixed-Script Confusable detection, in the future, and will likely emit warnings in those cases; there is a reference implementation.
Elixir will warn on identifiers that look the same, but aren't. Examples: in `а = a = 1`, the two 'a' characters are Cyrillic and Latin, and could be confused for each other; in `力 = カ = 1`, both are Japanese, but different codepoints, in different scripts of that writing system. Confusable identifiers can lead to hard-to-catch bugs (say, due to copy-pasted code) and can be unsafe, so we will warn about identifiers within a single file that could be confused with each other.

We use the means described in Section 4, 'Confusable Detection', with one noted modification:

> Alternatively, it shall declare that it uses a modification, and provide a precise list of character mappings that are added to or removed from the provided ones.
Elixir will not warn on confusability for identifiers made up exclusively of characters in a-z, A-Z, 0-9, and _. This is because ASCII identifiers have existed for so long that the programming community has developed its own means of dealing with confusability between identifiers like `l,1` or `O,0` (for instance, fonts designed for programming usually make it easy to differentiate between those characters).

## C3. (not yet implemented)

C3 has to do with detecting mixed-script-confusable characters -- like, say, a file of mostly Latin identifiers in which several Cyrillic 'a' characters also appear. Conformance with this clause is not yet claimed.

## C4, C5 (inapplicable)

'C4 - Restriction Level detection' conformance is not claimed and is inapplicable. (It applies to classifying the level of safety of a given arbitrary string into one of 5 restriction levels).
'C4 - Restriction Level detection' conformance is not claimed and does not apply to identifiers in code; rather, it applies to classifying the level of safety of a given arbitrary string into one of 5 restriction levels.

'C5 - Mixed number detection' conformance is inapplicable as Elixir does not support Unicode numbers.
1 change: 1 addition & 0 deletions lib/elixir/src/elixir.hrl
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
static_atoms_encoder=nil,
preserve_comments=nil,
identifier_tokenizer=elixir_tokenizer,
ascii_identifiers_only=true,
indentation=0,
mismatch_hints=[],
warn_on_unnecessary_quotes=true,
Expand Down
47 changes: 30 additions & 17 deletions lib/elixir/src/elixir_tokenizer.erl
Original file line number Diff line number Diff line change
Expand Up @@ -134,14 +134,15 @@ tokenize(String, Line, Column, Opts) ->
tokenize(String, Line, Opts) ->
tokenize(String, Line, 1, Opts).

tokenize([], Line, Column, #elixir_tokenizer{cursor_completion=Cursor} = Scope, Tokens) when Cursor /= false ->
#elixir_tokenizer{terminators=Terminators, warnings=Warnings} = Scope,
tokenize([], Line, Column, #elixir_tokenizer{ascii_identifiers_only=Ascii, cursor_completion=Cursor} = Scope, Tokens) when Cursor /= false ->
#elixir_tokenizer{file=File, terminators=Terminators, warnings=Warnings} = Scope,

{CursorColumn, CursorTerminators, CursorTokens} =
add_cursor(Line, Column, Cursor, Terminators, Tokens),

UnicodeWarnings = maybe_unicode_lint_warnings(Ascii, Tokens, File),
josevalim marked this conversation as resolved.
Show resolved Hide resolved
AccTokens = cursor_complete(Line, CursorColumn, CursorTerminators, CursorTokens),
{ok, Line, Column, Warnings, AccTokens};
{ok, Line, Column, Warnings ++ UnicodeWarnings, AccTokens};
josevalim marked this conversation as resolved.
Show resolved Hide resolved

tokenize([], EndLine, Column, #elixir_tokenizer{terminators=[{Start, StartLine, _} | _]} = Scope, Tokens) ->
End = terminator(Start),
Expand All @@ -150,8 +151,9 @@ tokenize([], EndLine, Column, #elixir_tokenizer{terminators=[{Start, StartLine,
Formatted = io_lib:format(Message, [End, Start, StartLine]),
error({EndLine, Column, [Formatted, Hint], []}, [], Scope, Tokens);

tokenize([], Line, Column, #elixir_tokenizer{warnings=Warnings}, Tokens) ->
{ok, Line, Column, Warnings, lists:reverse(Tokens)};
tokenize([], Line, Column, #elixir_tokenizer{ascii_identifiers_only=Ascii, file=File, warnings=Warnings}, Tokens) ->
UnicodeWarnings = maybe_unicode_lint_warnings(Ascii, Tokens, File),
{ok, Line, Column, Warnings ++ UnicodeWarnings, lists:reverse(Tokens)};
josevalim marked this conversation as resolved.
Show resolved Hide resolved

% VC merge conflict

Expand Down Expand Up @@ -541,10 +543,11 @@ tokenize([$:, H | T] = Original, Line, Column, Scope, Tokens) when ?is_quote(H)

tokenize([$: | String] = Original, Line, Column, Scope, Tokens) ->
case tokenize_identifier(String, Line, Column, Scope, false) of
{_Kind, Unencoded, Atom, Rest, Length, _Ascii, _Special} ->
{_Kind, Unencoded, Atom, Rest, Length, Ascii, _Special} ->
NewScope = maybe_warn_for_ambiguous_bang_before_equals(atom, Unencoded, Rest, Line, Column, Scope),
TrackedScope = track_ascii(Ascii, NewScope),
Token = {atom, {Line, Column, nil}, Atom},
tokenize(Rest, Line, Column + 1 + Length, NewScope, [Token | Tokens]);
tokenize(Rest, Line, Column + 1 + Length, TrackedScope, [Token | Tokens]);
empty when Scope#elixir_tokenizer.cursor_completion == false ->
unexpected_token(Original, Line, Column, Scope, Tokens);
empty ->
Expand Down Expand Up @@ -643,10 +646,11 @@ tokenize([$. | T], Line, Column, Scope, Tokens) ->

% Identifiers

tokenize(String, Line, Column, Scope, Tokens) ->
case tokenize_identifier(String, Line, Column, Scope, not previous_was_dot(Tokens)) of
tokenize(String, Line, Column, OriginalScope, Tokens) ->
case tokenize_identifier(String, Line, Column, OriginalScope, not previous_was_dot(Tokens)) of
{Kind, Unencoded, Atom, Rest, Length, Ascii, Special} ->
HasAt = lists:member($@, Special),
Scope = track_ascii(Ascii, OriginalScope),

case Rest of
[$: | T] when ?is_space(hd(T)) ->
Expand Down Expand Up @@ -678,20 +682,20 @@ tokenize(String, Line, Column, Scope, Tokens) ->
end;

{keyword, Atom, Type, Rest, Length} ->
tokenize_keyword(Type, Rest, Line, Column, Atom, Length, Scope, Tokens);
tokenize_keyword(Type, Rest, Line, Column, Atom, Length, OriginalScope, Tokens);

empty when Scope#elixir_tokenizer.cursor_completion == false ->
unexpected_token(String, Line, Column, Scope, Tokens);
empty when OriginalScope#elixir_tokenizer.cursor_completion == false ->
unexpected_token(String, Line, Column, OriginalScope, Tokens);

empty ->
case String of
[$~, L] when ?is_upcase(L); ?is_downcase(L) -> tokenize([], Line, Column, Scope, Tokens);
[$~] -> tokenize([], Line, Column, Scope, Tokens);
_ -> unexpected_token(String, Line, Column, Scope, Tokens)
[$~, L] when ?is_upcase(L); ?is_downcase(L) -> tokenize([], Line, Column, OriginalScope, Tokens);
[$~] -> tokenize([], Line, Column, OriginalScope, Tokens);
_ -> unexpected_token(String, Line, Column, OriginalScope, Tokens)
end;

{error, Reason} ->
error(Reason, String, Scope, Tokens)
error(Reason, String, OriginalScope, Tokens)
end.

previous_was_dot([{'.', _} | _]) -> true;
Expand Down Expand Up @@ -1578,6 +1582,15 @@ maybe_warn_for_ambiguous_bang_before_equals(_Kind, _Atom, _Rest, _Line, _Column,
%% Pushes one tokenizer warning onto the scope's accumulator.
%% Warnings are stored most-recent-first as {{Line, Column}, File, Msg}
%% tuples and unwound when tokenization finishes.
prepend_warning(Line, Column, File, Msg, Scope) ->
  #elixir_tokenizer{warnings=Existing} = Scope,
  Warning = {{Line, Column}, File, Msg},
  Scope#elixir_tokenizer{warnings=[Warning | Existing]}.

%% Records on the scope whether a non-ASCII identifier has been seen.
%% The flag is sticky: once ascii_identifiers_only is set to false it is
%% never set back to true, so the end-of-input clauses can use it to
%% decide whether the unicode lint pass may be skipped.
track_ascii(false, Scope) -> Scope#elixir_tokenizer{ascii_identifiers_only=false};
track_ascii(true, Scope) -> Scope.

%% Runs the Elixir-side confusable-identifier lint only when at least one
%% non-ASCII identifier was tokenized; an all-ASCII file cannot produce
%% confusable warnings, so the pass is skipped entirely.
maybe_unicode_lint_warnings(_Ascii=true, _Tokens, _File) ->
  [];
maybe_unicode_lint_warnings(_Ascii=false, Tokens, File) ->
  %% Tokens accumulate in reverse order; restore source order before linting.
  'Elixir.String.Tokenizer.Security':unicode_lint_warnings(lists:reverse(Tokens), File).

josevalim marked this conversation as resolved.
Show resolved Hide resolved

%% Builds the tokenizer's error return, carrying the warnings collected so
%% far alongside the failure reason, the unconsumed input, and the tokens
%% produced before the failure.
error(Reason, Rest, Scope, Tokens) ->
  #elixir_tokenizer{warnings=Warnings} = Scope,
  {error, Reason, Rest, Warnings, Tokens}.

Expand Down Expand Up @@ -1667,4 +1680,4 @@ prune_tokens([], [], Terminators) ->

%% Discards leading tokens up to and including the first element tagged
%% with Token; returns [] when no such element exists.
drop_including(Tokens, Token) ->
  case Tokens of
    [{Token, _} | Rest] -> Rest;
    [_ | Rest] -> drop_including(Rest, Token);
    [] -> []
  end.
drop_including([], _Token) -> [].
14 changes: 14 additions & 0 deletions lib/elixir/test/elixir/kernel/warning_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,20 @@ defmodule Kernel.WarningTest do
assert {:error, _} = Code.string_to_quoted(~s[:"foobar" do])
end

describe "unicode identifier security" do
  test "warns on confusables" do
    # Cyrillic 'а' (first identifier) vs Latin 'a' (second) — visually
    # identical but distinct codepoints, so the tokenizer should warn.
    assert capture_err(fn -> Code.eval_string("аdmin=1; admin=1") end) =~
             "confusable identifier: 'admin' looks like 'аdmin' on line 1"

    # Both identifiers are Japanese but different codepoints in different
    # scripts of the writing system (CJK ideograph vs katakana).
    assert capture_err(fn -> Code.eval_string("力=1; カ=1") end) =~
             "confusable identifier: 'カ' looks like '力' on line 1"

    # by convention, doesn't warn on ascii-only confusables
    assert capture_err(fn -> Code.eval_string("x0 = xO = 1") end) == ""
    assert capture_err(fn -> Code.eval_string("l1 = ll = 1") end) == ""
  end
end

test "operators formed by many of the same character followed by that character" do
output =
capture_err(fn ->
Expand Down
Loading