Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Warn on confusable non-ascii identifiers (UTS 39, C2) #11582

Merged
merged 13 commits into from
Jan 21, 2022
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ unicode: $(UNICODE)
$(UNICODE): lib/elixir/unicode/*
@ echo "==> unicode (compile)";
$(Q) $(ELIXIRC) lib/elixir/unicode/unicode.ex -o lib/elixir/ebin;
$(Q) $(ELIXIRC) lib/elixir/unicode/security.ex -o lib/elixir/ebin;
$(Q) $(ELIXIRC) lib/elixir/unicode/tokenizer.ex -o lib/elixir/ebin;

$(eval $(call APP_TEMPLATE,ex_unit,ExUnit))
Expand Down
16 changes: 13 additions & 3 deletions lib/elixir/pages/unicode-security.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,22 @@ Elixir will not allow tokenization of identifiers with codepoints in `\p{Identif
For instance, the 'HANGUL FILLER' (``) character, which is often invisible, is an uncommon codepoint and will trigger this warning.

## C2, C3 (planned)
## C2. Confusable detection

Elixir may implement Confusable Detection, and Mixed-Script Confusable detection, in the future, and will likely emit warnings in those cases; there is a reference implementation.
Elixir will warn on identifiers that look the same, but aren't. Examples: in `а = a = 1`, the two 'a' characters are Cyrillic and Latin, and could be confused for each other; in `力 = カ = 1`, both are Japanese, but different codepoints, in different scripts of that writing system. Confusable identifiers can lead to hard-to-catch bugs (say, due to copy-pasted code) and can be unsafe, so we will warn about identifiers within a single file that could be confused with each other.

We use the means described in Section 4, 'Confusable Detection', with one noted modification:

> Alternatively, it shall declare that it uses a modification, and provide a precise list of character mappings that are added to or removed from the provided ones.
Elixir will not warn on confusability for identifiers made up exclusively of characters in a-z, A-Z, 0-9, and _. This is because ASCII identifiers have existed for so long that the programming community has developed its own means of dealing with confusability between identifiers like `l,1` or `O,0` (for instance, fonts designed for programming usually make it easy to differentiate between those characters).

## C3. (not yet implemented)

C3 has to do with detecting mixed-script-confusable characters -- like, say, a file of mostly Latin identifiers in which several Cyrillic 'a' characters also appear. Conformance with this clause is not yet claimed.

## C4, C5 (inapplicable)

'C4 - Restriction Level detection' conformance is not claimed and is inapplicable. (It applies to classifying the level of safety of a given arbitrary string into one of 5 restriction levels).
'C4 - Restriction Level detection' conformance is not claimed and does not apply to identifiers in code; rather, it applies to classifying the level of safety of a given arbitrary string into one of 5 restriction levels.

'C5 - Mixed number detection' conformance is inapplicable as Elixir does not support Unicode numbers.
1 change: 1 addition & 0 deletions lib/elixir/src/elixir.hrl
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
static_atoms_encoder=nil,
preserve_comments=nil,
identifier_tokenizer=elixir_tokenizer,
ascii_identifiers_only=true,
indentation=0,
mismatch_hints=[],
warn_on_unnecessary_quotes=true,
Expand Down
47 changes: 30 additions & 17 deletions lib/elixir/src/elixir_tokenizer.erl
Original file line number Diff line number Diff line change
Expand Up @@ -134,14 +134,15 @@ tokenize(String, Line, Column, Opts) ->
tokenize(String, Line, Opts) ->
tokenize(String, Line, 1, Opts).

tokenize([], Line, Column, #elixir_tokenizer{cursor_completion=Cursor} = Scope, Tokens) when Cursor /= false ->
#elixir_tokenizer{terminators=Terminators, warnings=Warnings} = Scope,
tokenize([], Line, Column, #elixir_tokenizer{ascii_identifiers_only=Ascii, cursor_completion=Cursor} = Scope, Tokens) when Cursor /= false ->
#elixir_tokenizer{file=File, terminators=Terminators, warnings=Warnings} = Scope,

{CursorColumn, CursorTerminators, CursorTokens} =
add_cursor(Line, Column, Cursor, Terminators, Tokens),

UnicodeWarnings = maybe_unicode_lint_warnings(Ascii, Tokens, File),
josevalim marked this conversation as resolved.
Show resolved Hide resolved
AccTokens = cursor_complete(Line, CursorColumn, CursorTerminators, CursorTokens),
{ok, Line, Column, Warnings, AccTokens};
{ok, Line, Column, Warnings ++ UnicodeWarnings, AccTokens};
josevalim marked this conversation as resolved.
Show resolved Hide resolved

tokenize([], EndLine, Column, #elixir_tokenizer{terminators=[{Start, StartLine, _} | _]} = Scope, Tokens) ->
End = terminator(Start),
Expand All @@ -150,8 +151,9 @@ tokenize([], EndLine, Column, #elixir_tokenizer{terminators=[{Start, StartLine,
Formatted = io_lib:format(Message, [End, Start, StartLine]),
error({EndLine, Column, [Formatted, Hint], []}, [], Scope, Tokens);

tokenize([], Line, Column, #elixir_tokenizer{warnings=Warnings}, Tokens) ->
{ok, Line, Column, Warnings, lists:reverse(Tokens)};
tokenize([], Line, Column, #elixir_tokenizer{ascii_identifiers_only=Ascii, file=File, warnings=Warnings}, Tokens) ->
UnicodeWarnings = maybe_unicode_lint_warnings(Ascii, Tokens, File),
{ok, Line, Column, Warnings ++ UnicodeWarnings, lists:reverse(Tokens)};
josevalim marked this conversation as resolved.
Show resolved Hide resolved

% VC merge conflict

Expand Down Expand Up @@ -541,10 +543,11 @@ tokenize([$:, H | T] = Original, Line, Column, Scope, Tokens) when ?is_quote(H)

tokenize([$: | String] = Original, Line, Column, Scope, Tokens) ->
case tokenize_identifier(String, Line, Column, Scope, false) of
{_Kind, Unencoded, Atom, Rest, Length, _Ascii, _Special} ->
{_Kind, Unencoded, Atom, Rest, Length, Ascii, _Special} ->
NewScope = maybe_warn_for_ambiguous_bang_before_equals(atom, Unencoded, Rest, Line, Column, Scope),
TrackedScope = track_ascii(Ascii, NewScope),
Token = {atom, {Line, Column, nil}, Atom},
tokenize(Rest, Line, Column + 1 + Length, NewScope, [Token | Tokens]);
tokenize(Rest, Line, Column + 1 + Length, TrackedScope, [Token | Tokens]);
empty when Scope#elixir_tokenizer.cursor_completion == false ->
unexpected_token(Original, Line, Column, Scope, Tokens);
empty ->
Expand Down Expand Up @@ -643,10 +646,11 @@ tokenize([$. | T], Line, Column, Scope, Tokens) ->

% Identifiers

tokenize(String, Line, Column, Scope, Tokens) ->
case tokenize_identifier(String, Line, Column, Scope, not previous_was_dot(Tokens)) of
tokenize(String, Line, Column, OriginalScope, Tokens) ->
case tokenize_identifier(String, Line, Column, OriginalScope, not previous_was_dot(Tokens)) of
{Kind, Unencoded, Atom, Rest, Length, Ascii, Special} ->
HasAt = lists:member($@, Special),
Scope = track_ascii(Ascii, OriginalScope),

case Rest of
[$: | T] when ?is_space(hd(T)) ->
Expand Down Expand Up @@ -678,20 +682,20 @@ tokenize(String, Line, Column, Scope, Tokens) ->
end;

{keyword, Atom, Type, Rest, Length} ->
tokenize_keyword(Type, Rest, Line, Column, Atom, Length, Scope, Tokens);
tokenize_keyword(Type, Rest, Line, Column, Atom, Length, OriginalScope, Tokens);

empty when Scope#elixir_tokenizer.cursor_completion == false ->
unexpected_token(String, Line, Column, Scope, Tokens);
empty when OriginalScope#elixir_tokenizer.cursor_completion == false ->
unexpected_token(String, Line, Column, OriginalScope, Tokens);

empty ->
case String of
[$~, L] when ?is_upcase(L); ?is_downcase(L) -> tokenize([], Line, Column, Scope, Tokens);
[$~] -> tokenize([], Line, Column, Scope, Tokens);
_ -> unexpected_token(String, Line, Column, Scope, Tokens)
[$~, L] when ?is_upcase(L); ?is_downcase(L) -> tokenize([], Line, Column, OriginalScope, Tokens);
[$~] -> tokenize([], Line, Column, OriginalScope, Tokens);
_ -> unexpected_token(String, Line, Column, OriginalScope, Tokens)
end;

{error, Reason} ->
error(Reason, String, Scope, Tokens)
error(Reason, String, OriginalScope, Tokens)
end.

previous_was_dot([{'.', _} | _]) -> true;
Expand Down Expand Up @@ -1578,6 +1582,15 @@ maybe_warn_for_ambiguous_bang_before_equals(_Kind, _Atom, _Rest, _Line, _Column,
%% Pushes one tokenizer warning onto the scope's accumulator.
%% Warnings are stored most-recent-first as {{Line, Column}, File, Msg}
%% tuples and unwound when tokenization finishes.
prepend_warning(Line, Column, File, Msg, Scope) ->
  #elixir_tokenizer{warnings=Existing} = Scope,
  Warning = {{Line, Column}, File, Msg},
  Scope#elixir_tokenizer{warnings=[Warning | Existing]}.

%% Records on the scope whether a non-ASCII identifier has been seen.
%% The flag is sticky: once ascii_identifiers_only is set to false it is
%% never set back to true, so the end-of-input clauses can use it to
%% decide whether the unicode lint pass may be skipped.
track_ascii(false, Scope) -> Scope#elixir_tokenizer{ascii_identifiers_only=false};
track_ascii(true, Scope) -> Scope.

%% Runs the Elixir-side confusable-identifier lint only when at least one
%% non-ASCII identifier was tokenized; an all-ASCII file cannot produce
%% confusable warnings, so the pass is skipped entirely.
maybe_unicode_lint_warnings(_Ascii=true, _Tokens, _File) ->
  [];
maybe_unicode_lint_warnings(_Ascii=false, Tokens, File) ->
  %% Tokens accumulate in reverse order; restore source order before linting.
  'Elixir.String.Tokenizer.Security':unicode_lint_warnings(lists:reverse(Tokens), File).

josevalim marked this conversation as resolved.
Show resolved Hide resolved

%% Builds the tokenizer's error return, carrying the warnings collected so
%% far alongside the failure reason, the unconsumed input, and the tokens
%% produced before the failure.
error(Reason, Rest, Scope, Tokens) ->
  #elixir_tokenizer{warnings=Warnings} = Scope,
  {error, Reason, Rest, Warnings, Tokens}.

Expand Down Expand Up @@ -1667,4 +1680,4 @@ prune_tokens([], [], Terminators) ->

%% Discards leading tokens up to and including the first element tagged
%% with Token; returns [] when no such element exists.
drop_including(Tokens, Token) ->
  case Tokens of
    [{Token, _} | Rest] -> Rest;
    [_ | Rest] -> drop_including(Rest, Token);
    [] -> []
  end.
drop_including([], _Token) -> [].
14 changes: 14 additions & 0 deletions lib/elixir/test/elixir/kernel/warning_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,20 @@ defmodule Kernel.WarningTest do
assert {:error, _} = Code.string_to_quoted(~s[:"foobar" do])
end

describe "unicode identifier security" do
  test "warns on confusables" do
    # Cyrillic 'а' (first identifier) vs Latin 'a' (second) — visually
    # identical but distinct codepoints, so the tokenizer should warn.
    assert capture_err(fn -> Code.eval_string("аdmin=1; admin=1") end) =~
             "confusable identifier: 'admin' looks like 'аdmin' on line 1"

    # Both identifiers are Japanese but different codepoints in different
    # scripts of the writing system (CJK ideograph vs katakana).
    assert capture_err(fn -> Code.eval_string("力=1; カ=1") end) =~
             "confusable identifier: 'カ' looks like '力' on line 1"

    # by convention, doesn't warn on ascii-only confusables
    assert capture_err(fn -> Code.eval_string("x0 = xO = 1") end) == ""
    assert capture_err(fn -> Code.eval_string("l1 = ll = 1") end) == ""
  end
end

test "operators formed by many of the same character followed by that character" do
output =
capture_err(fn ->
Expand Down
Loading