Skip to content

Commit

Permalink
Allow most kinds of characters in URL query (fixes mastodon#8408) (ma…
Browse files Browse the repository at this point in the history
…stodon#8447)

* Allow unicode characters in URL query strings

Fixes mastodon#8408

* Alternative approach to unicode support in urls

Adds PoC/idea to approach this problem.
  • Loading branch information
JMendyk authored and hiyuki2578 committed Oct 2, 2019
1 parent b7382b0 commit 8dcdb22
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 4 deletions.
39 changes: 38 additions & 1 deletion app/lib/formatter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def encode(html)
end

def encode_and_link_urls(html, accounts = nil, options = {})
entities = Extractor.extract_entities_with_indices(html, extract_url_without_protocol: false)
entities = utf8_friendly_extractor(html, extract_url_without_protocol: false)

if accounts.is_a?(Hash)
options = accounts
Expand Down Expand Up @@ -204,6 +204,43 @@ def rewrite(text, entities)
result.flatten.join
end

# Extracts entities (URLs, hashtags, mentions, cashtags) from +text+ while
# tolerating non-ASCII characters inside them.
#
# Every character above U+00FF is percent-encoded before a first extraction
# pass; entities found there are translated back to offsets and values of the
# original +text+. A second pass runs on the untouched text, and both result
# lists are merged through Extractor.remove_overlapping_entities.
#
# @param text [String] raw text to scan
# @param options [Hash] forwarded unchanged to
#   Extractor.extract_entities_with_indices
# @return [Array<Hash>] whatever Extractor.remove_overlapping_entities
#   returns for the combined entity list
def utf8_friendly_extractor(text, options = {})
  # Maps an offset in the escaped string to the matching offset in +text+.
  # Only character boundaries of +text+ get an entry, so an offset that lands
  # inside a percent-escape looks up as nil.
  original_offset_at = { 0 => 0 }
  escaped_length = 0

  escaped = text.each_char.with_index(1).map do |char, next_offset|
    piece = char.ord > 0xFF ? CGI.escape(char) : char
    escaped_length += piece.length
    original_offset_at[escaped_length] = next_offset
    piece
  end.join

  prefixed_keys = %i[hashtag screen_name cashtag]

  special = Extractor.extract_entities_with_indices(escaped, options).map do |entity|
    # exactly one of :url, :hashtag, :screen_name, :cashtag keys is present
    key = (entity.keys & [:url, *prefixed_keys]).first

    escaped_start = entity[:indices].first
    escaped_stop = entity[:indices].last
    # A list mention (@user/list-name) also carries :list_slug; the value of
    # `key` must stop before it. Presumably the slug is the tail of the
    # entity span - confirm against the extractor in use.
    escaped_value_stop = escaped_stop - entity[:list_slug].to_s.length

    start = original_offset_at[escaped_start]
    stop = original_offset_at[escaped_stop]
    value_stop = original_offset_at[escaped_value_stop]

    # A boundary inside a percent-escape has no counterpart in +text+; drop
    # the entity instead of crashing on nil arithmetic. The pass over the
    # unescaped text below can still report it.
    next unless start && stop && value_stop

    value_start = prefixed_keys.include?(key) ? start + 1 : start # account for #, @ or $

    entity.merge(indices: [start, stop], key => text[value_start...value_stop])
  end.compact

  standard = Extractor.extract_entities_with_indices(text, options)

  Extractor.remove_overlapping_entities(special + standard)
end

def link_to_url(entity, options = {})
url = Addressable::URI.parse(entity[:url])
html_attrs = { target: '_blank', rel: 'nofollow noopener' }
Expand Down
32 changes: 29 additions & 3 deletions spec/lib/formatter_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,36 @@
end

context 'given a URL with a query string' do
let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink' }
context 'with escaped unicode character' do
let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink' }

it 'matches the full URL' do
is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&amp;q=autolink"'
it 'matches the full URL' do
is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&amp;q=autolink"'
end
end

context 'with unicode character' do
let(:text) { 'https://www.ruby-toolbox.com/search?utf8=✓&q=autolink' }

it 'matches the full URL' do
is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=✓&amp;q=autolink"'
end
end

context 'with unicode character at the end' do
let(:text) { 'https://www.ruby-toolbox.com/search?utf8=✓' }

it 'matches the full URL' do
is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=✓"'
end
end

context 'with escaped and not escaped unicode characters' do
let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&utf81=✓&q=autolink' }

it 'preserves escaped unicode characters' do
is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&amp;utf81=✓&amp;q=autolink"'
end
end
end

Expand Down

0 comments on commit 8dcdb22

Please sign in to comment.