Skip to content

Commit

Permalink
fixes URL normalization for feed parsing
Browse files: browse the repository at this point in the history
  • Loading branch information
mrmegatelo committed Dec 4, 2024
1 parent 0b20321 commit afb86f2
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 6 deletions.
6 changes: 4 additions & 2 deletions feed/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,9 +250,11 @@ def parse_feeds_by_url(url):
@shared_task()
def parse_feed_info(url):
result = []
feed_list = get_feed_by_url(url)
normalized_url = normalize_url(url)
feed_list = get_feed_by_url(normalized_url)

if len(feed_list) == 0:
return parse_feeds_by_url(url)
return parse_feeds_by_url(normalized_url)

for feed in feed_list:
result.append(feed["id"])
Expand Down
8 changes: 5 additions & 3 deletions feed/utils/feed_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,17 +38,19 @@ def parse(self, url: str) -> Generator[FeedMeta]:
"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 5_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9B179 Safari/7534.48.3",
"Accept": "text/html, text/xml, application/xml, application/rss+xml, application/atom+xml",
}
parsed = feedparser.parse(url, request_headers=headers)
normalized_rss_url = normalize_url(url)
parsed = feedparser.parse(normalized_rss_url, request_headers=headers)
parsed_icon = self.parse_icon(parsed)

normalized_icon_url = normalize_url(parsed_icon, parsed.feed.link)
normalized_url = normalize_url(parsed.feed.link)

feed_meta = FeedMeta(
title=parsed.feed.title,
description=self.parse_description(parsed),
icon_url=normalized_icon_url,
rss_url=url,
url=parsed.feed.link,
rss_url=normalized_rss_url,
url=normalized_url,
entries=parsed.entries,
)

Expand Down
2 changes: 1 addition & 1 deletion feed/utils/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def normalize_url(url, base_path=None):
if parsed_url.scheme == "":
parsed_url = parsed_url._replace(scheme="https")
if parsed_url.path == "":
parsed_url = parsed_url._replace(path="")
parsed_url = parsed_url._replace(path="/")
if parsed_url.netloc == "" and base_path is not None:
base_path = urlparse(base_path)
parsed_url = base_path._replace(path=parsed_url.path)
Expand Down

0 comments on commit afb86f2

Please sign in to comment.