Skip to content

Commit

Permalink
Added referrer parsing and tests
Browse files Browse the repository at this point in the history
  • Loading branch information
aalsuwaidi committed Apr 21, 2024
1 parent 8056a30 commit 809e238
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 1 deletion.
33 changes: 33 additions & 0 deletions test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@
xpath_element,
xpath_text,
xpath_with_ns,
get_referrer_url
)
from yt_dlp.utils.networking import (
HTTPHeaderDict,
Expand Down Expand Up @@ -336,6 +337,38 @@ def test_unescape_html(self):
# HTML5 entities
self.assertEqual(unescapeHTML('.''), '.\'')

def test_get_referrer_url(self):
    # Table-driven check of get_referrer_url() for every supported policy.
    # Each case is (referrer source, request URL, policy, expected Referer).
    page = 'https://example.com/page'
    page_insecure = 'http://example.com/page'
    page_with_query = 'https://example.com/page?q=123'
    other_page = 'https://example.com/otherpage'
    other_page_insecure = 'http://example.com/otherpage'
    origin = 'https://example.com'
    origin_insecure = 'http://example.com'
    org_insecure = 'http://example.org'
    mozilla = 'https://mozilla.org'

    cases = (
        (page, mozilla, 'no-referrer', None),
        (page, origin_insecure, 'origin', origin),
        (page_with_query, page_with_query, 'unsafe-url', page_with_query),
        (page_with_query, mozilla, 'unsafe-url', page_with_query),
        (page, mozilla, 'strict-origin', origin),
        (page, org_insecure, 'strict-origin', None),
        (page, origin_insecure, 'strict-origin', None),
        (page, other_page, 'strict-origin-when-cross-origin', page),
        (page, mozilla, 'strict-origin-when-cross-origin', origin),
        (page, other_page_insecure, 'strict-origin-when-cross-origin', None),
        (page, other_page, 'same-origin', page),
        (page, mozilla, 'same-origin', None),
        (page, page, 'same-origin', page),
        (page, other_page, 'origin-when-cross-origin', page),
        (page, mozilla, 'origin-when-cross-origin', origin),
        (page, page_insecure, 'origin-when-cross-origin', origin),
        (page, other_page, 'no-referrer-when-downgrade', page),
        (page, mozilla, 'no-referrer-when-downgrade', page),
        (page, origin_insecure, 'no-referrer-when-downgrade', None),
    )
    for source, target, policy, expected in cases:
        # subTest reports every failing combination instead of stopping at the first one
        with self.subTest(source=source, target=target, policy=policy):
            self.assertEqual(get_referrer_url(source, target, policy), expected)

def test_date_from_str(self):
self.assertEqual(date_from_str('yesterday'), date_from_str('now-1day'))
self.assertEqual(date_from_str('now+7day'), date_from_str('now+1week'))
Expand Down
6 changes: 5 additions & 1 deletion yt_dlp/extractor/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@
xpath_element,
xpath_text,
xpath_with_ns,
get_referrer_url
)


Expand Down Expand Up @@ -3173,6 +3174,7 @@ def _media_formats(src, cur_media_type, type_info=None):
return is_plain_url, formats

entries = []
referrer_policy = self._html_search_meta('referrer', webpage) or 'strict-origin-when-cross-origin'
# amp-video and amp-audio are very similar to their HTML5 counterparts
# so we will include them right here (see
# https://www.ampproject.org/docs/reference/components/amp-video)
Expand Down Expand Up @@ -3255,7 +3257,9 @@ def _media_formats(src, cur_media_type, type_info=None):
'url': absolute_url(src),
})
for f in media_info['formats']:
f.setdefault('http_headers', {})['Referer'] = base_url
referrer = get_referrer_url(base_url, f["url"], referrer_policy)
if referrer:
f.setdefault('http_headers', {})['Referer'] = referrer
if media_info['formats'] or media_info['subtitles']:
entries.append(media_info)
return entries
Expand Down
65 changes: 65 additions & 0 deletions yt_dlp/utils/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5478,3 +5478,68 @@ def stdout(self, message):
def stderr(self, message):
if self._ydl:
self._ydl.to_stderr(message)


def get_referrer_url(referrer_source, request_url, policy):
    """Return the Referer value to send for a request, or None to send none.

    Implements the "determine request's referrer" algorithm for the eight
    standard referrer policies.

    Resources used:
        https://w3c.github.io/webappsec-referrer-policy/#determine-requests-referrer
        https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Referrer-Policy#examples
        https://github.com/scrapy/scrapy/blob/master/scrapy/spidermiddlewares/referer.py

    @param referrer_source  URL of the document that triggers the request.
                            None or '' yields None (nothing to send).
    @param request_url      URL that is about to be requested.
    @param policy           Referrer policy token. Matched case-insensitively;
                            the legacy <meta name="referrer"> keywords are
                            mapped to their modern equivalents, and an empty
                            or unrecognised value falls back to the default
                            policy 'strict-origin-when-cross-origin'.
    @returns                The stripped referrer URL, its origin
                            (scheme://host[:port], without a trailing slash),
                            or None when no Referer must be sent.
    """
    DEFAULT_POLICY = 'strict-origin-when-cross-origin'
    KNOWN_POLICIES = frozenset((
        'no-referrer',
        'no-referrer-when-downgrade',
        'origin',
        'origin-when-cross-origin',
        'same-origin',
        'strict-origin',
        'strict-origin-when-cross-origin',
        'unsafe-url',
    ))
    # Legacy keywords still accepted in <meta name="referrer" content="...">
    LEGACY_POLICIES = {
        'never': 'no-referrer',
        'default': DEFAULT_POLICY,
        'always': 'unsafe-url',
        'origin-when-crossorigin': 'origin-when-cross-origin',
    }

    # Strip URL as per https://w3c.github.io/webappsec-referrer-policy/#strip-url
    def strip_url(url, origin_only=False):
        parsed = urllib.parse.urlparse(url or '')
        # username/password are read-only properties derived from netloc, not
        # namedtuple fields, so the credentials have to be cut out of netloc.
        parsed = parsed._replace(netloc=parsed.netloc.rpartition('@')[2], fragment='')
        if origin_only:
            parsed = parsed._replace(path='', params='', query='')
        return parsed.geturl()

    # https://w3c.github.io/webappsec-secure-contexts/#is-origin-trustworthy
    # The full algorithm has more checks; the URL scheme is the one applied here.
    def is_tls(url):
        return urllib.parse.urlparse(url or '').scheme in ('https', 'ftps')

    if not referrer_source:
        return None

    policy = (policy or '').strip().lower()
    policy = LEGACY_POLICIES.get(policy, policy)
    if policy not in KNOWN_POLICIES:
        policy = DEFAULT_POLICY

    referrer_url = strip_url(referrer_source)
    referrer_origin = strip_url(referrer_source, origin_only=True)
    # Scheme and host are case-insensitive, and an origin-only URL has no
    # path/query left, so lowercasing the whole string is safe here.
    is_same_origin = referrer_origin.lower() == strip_url(request_url, origin_only=True).lower()
    is_downgrade = is_tls(referrer_url) and not is_tls(request_url)

    # Over-long referrers are reduced to their origin
    if len(referrer_url) > 4096:
        referrer_url = referrer_origin

    if policy == 'no-referrer':
        return None
    if policy == 'unsafe-url':
        return referrer_url
    if policy == 'origin':
        return referrer_origin
    if policy == 'same-origin':
        return referrer_url if is_same_origin else None
    if policy == 'origin-when-cross-origin':
        return referrer_url if is_same_origin else referrer_origin

    # Remaining policies never leak anything on a TLS -> non-TLS downgrade:
    # strict-origin, strict-origin-when-cross-origin, no-referrer-when-downgrade
    if policy == 'strict-origin-when-cross-origin' and is_same_origin:
        return referrer_url
    if is_downgrade:
        return None
    if policy == 'no-referrer-when-downgrade':
        return referrer_url
    return referrer_origin

0 comments on commit 809e238

Please sign in to comment.