From 80a1dbf46ec7e0ea499dca192b3c94dc9a434b0a Mon Sep 17 00:00:00 2001 From: Eugen Rochko Date: Mon, 7 Feb 2022 18:16:31 +0100 Subject: [PATCH] Fix structured data parsing from links choking on bad data (#17403) * Fix structured data parsing from links choking on bad data - Fix og:url meta tag being prioritized over canonical link tag - Fix structured data parsing choking on commented-out CDATA declarations - Fix HTML entities in title, description, provider_name, author_name - Change structured data parsing to attempt every JSON-LD script tag * Remove unnecessary slash escapes from CDATA regex pattern --- app/lib/link_details_extractor.rb | 13 +++ spec/lib/link_details_extractor_spec.rb | 122 ++++++++++++++++++++++++ 2 files changed, 135 insertions(+) diff --git a/app/lib/link_details_extractor.rb b/app/lib/link_details_extractor.rb index abb38fb9ca7f54..cf273e4967d59e 100644 --- a/app/lib/link_details_extractor.rb +++ b/app/lib/link_details_extractor.rb @@ -17,6 +17,19 @@ class LinkDetailsExtractor (//[\s]*\]\]>) # Single-line comment style closing )[\s]*$}x + # Some publications wrap their JSON-LD data in their + + + HTML + + describe '#title' do + it 'returns the title from structured data' do + expect(subject.title).to eq 'Foo' + end + end + + describe '#description' do + it 'returns the description from structured data' do + expect(subject.description).to eq 'Bar' + end + end + + describe '#provider_name' do + it 'returns the provider name from structured data' do + expect(subject.provider_name).to eq 'Baz' + end + end + + describe '#author_name' do + it 'returns the author name from structured data' do + expect(subject.author_name).to eq 'Hoge' + end + end + end + + context 'but the first tag is invalid JSON' do + let(:html) { <<-HTML } + + + + + + + + HTML + + describe '#title' do + it 'returns the title from structured data' do + expect(subject.title).to eq 'Foo' + end + end + + describe '#description' do + it 'returns the description from structured data' do + expect(subject.description).to eq 'Bar' + end + end + + describe '#provider_name' do + it 'returns the provider name from structured data' do + expect(subject.provider_name).to eq 'Baz' + end + end + + describe '#author_name' do + it 'returns the author name from structured data' do + expect(subject.author_name).to eq 'Hoge' + end + end + end + end end