Merge pull request #151 from openzim/fix_topics
Fix fetching topics: moved to an internal search API
benoit74 authored Dec 18, 2023
2 parents 3779bf3 + 35da0a1 commit 9eae1dc
Showing 5 changed files with 93 additions and 68 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project are documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 2.0.11).

+## Unreleased
+
+### Changed
+- fixed search by topic to use the new search API instead of broken web page scraping (#149)
+- download_link is renamed request_url and can also perform POST requests (in addition to the previous GET requests)
+
 ## [2.0.13] - 2023-04-24

 ### Changed
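For reference, a minimal usage sketch of the renamed helper (not part of the diff; request_url is defined in ted2zim/utils.py below, and passing a json_data payload switches it from GET to POST — the search payload here is trimmed for illustration):

import json

from ted2zim.utils import request_url

# GET, exactly what download_link used to do
topics = json.loads(request_url("https://www.ted.com/topics/combo?models=Talks").text)

# POST: any json_data argument is sent as the JSON body of a POST request
response = request_url(
    "https://zenith-prod-alt.ted.com/api/search",
    [{"indexName": "relevance", "params": {"query": "", "page": 0}}],
)
print(response.json())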
2 changes: 2 additions & 0 deletions ted2zim/constants.py
@@ -17,6 +17,8 @@

 BASE_URL = "https://ted.com/"

+SEARCH_URL = "https://zenith-prod-alt.ted.com/api/search"
+
 MATCHING = "matching"
 ALL = "all"
 NONE = "none"
6 changes: 3 additions & 3 deletions ted2zim/multi/scraper.py
@@ -17,7 +17,7 @@
 from kiwixstorage import KiwixStorage

 from ..constants import NAME, getLogger
-from ..utils import download_link, get_temp_fpath
+from ..utils import request_url, get_temp_fpath

 logger = getLogger()

@@ -83,7 +83,7 @@ def download_playlists_list_from_site(topics_list):
     for topic in topics_list:
         logger.debug(f"Getting playlists related to {topic}")
         playlist_sub_list = json.loads(
-            download_link(
+            request_url(
                 f"https://www.ted.com/playlists/browse.json?topics={topic['value'].replace(' ', '+')}"
             ).text
         )
@@ -136,7 +136,7 @@ def get_list_of_all(self, mode):
         """returns a list of topics or playlists"""
         # get all topics
         topics_list = json.loads(
-            download_link("https://www.ted.com/topics/combo?models=Talks").text
+            request_url("https://www.ted.com/topics/combo?models=Talks").text
         )
         if mode == "topic":
             return topics_list
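The multi scraper keeps its json.loads(request_url(...).text) pattern unchanged apart from the rename; as a standalone sketch (the topic value is a made-up example):

import json

from ted2zim.utils import request_url

# fetch the playlists advertised for one topic, exactly as download_playlists_list_from_site does
topic = {"value": "climate change"}
playlist_sub_list = json.loads(
    request_url(
        f"https://www.ted.com/playlists/browse.json?topics={topic['value'].replace(' ', '+')}"
    ).text
)
print(playlist_sub_list)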
120 changes: 61 additions & 59 deletions ted2zim/scraper.py
@@ -7,6 +7,7 @@
 import json
 import locale
 import pathlib
+import requests
 import shutil
 import tempfile
 import time
@@ -29,6 +30,7 @@
 from .constants import (
     ALL,
     BASE_URL,
+    SEARCH_URL,
     MATCHING,
     NONE,
     ROOT_DIR,
@@ -37,7 +39,7 @@
     getLogger,
 )
 from .processing import post_process_video
-from .utils import WebVTT, download_link, update_subtitles_list, get_main_title
+from .utils import WebVTT, request_url, update_subtitles_list, get_main_title

 logger = getLogger()

@@ -97,7 +99,7 @@ def __init__(
         self.topics = (
             []
             if not topics
-            else [c.strip().replace(" ", "+") for c in topics.split(",")]
+            else topics.split(",")
         )
         self.autoplay = autoplay
         self.playlist = playlist
@@ -168,7 +170,7 @@ def ted_topics_json(self):

     @property
     def talks_base_url(self):
-        return BASE_URL + "talks"
+        return BASE_URL + "talks/"

     @property
     def playlists_base_url(self):
@@ -230,7 +232,7 @@ def extract_videos_from_playlist(self, playlist):

         playlist_url = f"{self.playlists_base_url}/{playlist}"
         logger.debug(f"extract_videos_from_playlist: {playlist_url}")
-        soup = BeautifulSoup(download_link(playlist_url).text, features="html.parser")
+        soup = BeautifulSoup(request_url(playlist_url).text, features="html.parser")
         video_elements = soup.find_all("a", attrs={"class": "group"})
         self.playlist_title = soup.find("h1").string
         self.playlist_description = soup.find("p", attrs={"class": "text-base"}).string
@@ -246,47 +248,55 @@ def extract_videos_from_playlist(self, playlist):
                 )
                 for lang_url in other_lang_urls:
                     self.extract_info_from_video_page(lang_url)
-            self.already_visited.append(urllib.parse.urlparse(url)[2])
+            self.already_visited.append(urllib.parse.urlparse(url).path)
             logger.debug(f"Seen {relative_path}")
         logger.debug(f"Total videos found on playlist: {len(video_elements)}")
         if not video_elements:
             raise ValueError("Wrong playlist ID supplied. No videos found")

-    def generate_search_result_and_scrape(self, topic_url, total_videos_scraped):
-        """generates a search result and returns the total number of videos scraped"""
+    def generate_search_results(self, topic):
+        """generates search results and returns the total number of videos scraped"""

-        page = 1
+        total_videos_scraped = 0
+        page = 0
         while True:
-            logger.debug(f"generate_search_result_and_scrape: {topic_url}&page={page}")
-            html = download_link(f"{topic_url}&page={page}").text
-            nb_videos_extracted, nb_videos_on_page = self.extract_videos_on_topic_page(
-                html
-            )
+            result = self.query_search_engine(topic, page)
+            result_json = result.json()
+            nb_videos_extracted, nb_videos_on_page = self.extract_videos_in_search_results(
+                result_json
+            )
             if nb_videos_on_page == 0:
                 break
             total_videos_scraped += nb_videos_extracted
             page += 1
         return total_videos_scraped

+    def query_search_engine(self, topic, page):
+        logger.debug(f"Fetching page {page} of topic {topic}")
+        data = [
+            {
+                "indexName": "relevance",
+                "params": {
+                    "attributeForDistinct": "objectID",
+                    "distinct": 1,
+                    "facetFilters": [[f"tags:{topic}"]],
+                    "facets": ["subtitle_languages", "tags"],
+                    "highlightPostTag": "__/ais-highlight__",
+                    "highlightPreTag": "__ais-highlight__",
+                    "hitsPerPage": 24,
+                    "maxValuesPerFacet": 500,
+                    "page": page,
+                    "query": "",
+                    "tagFilters": "",
+                },
+            },
+        ]
+        return request_url(SEARCH_URL, data)
+
     def extract_videos_from_topics(self, topic):
         """extracts metadata for required number of videos on different topics"""

         logger.debug(f"Fetching video links for topic: {topic}")
-        topic_url = f"{self.talks_base_url}?topics%5B%5D={topic}"
-        total_videos_scraped = 0
-
-        if self.source_languages:
-            for lang in self.source_languages:
-                topic_url = topic_url + f"&language={lang}"
-                total_videos_scraped = self.generate_search_result_and_scrape(
-                    topic_url, total_videos_scraped
-                )
-
-        else:
-            total_videos_scraped = self.generate_search_result_and_scrape(
-                topic_url, total_videos_scraped
-            )
-
+        total_videos_scraped = self.generate_search_results(topic)
         logger.info(f"Total video links found in {topic}: {total_videos_scraped}")
         if total_videos_scraped == 0:
             return False
@@ -402,31 +412,16 @@ def generate_urls_for_other_languages(self, url):
             urls.append(urllib.parse.urlunparse(url_parts))
         return urls

-    def extract_videos_on_topic_page(self, page_html):
-
-        # all videos are embedded in a <div> with the class name 'row'.
-        # we are searching for the div inside this div, that has an <a>-tag
-        # with the class name 'media__image', because this is the relative
-        # link to the representative TED talk. It turns this relative link to
-        # an absolute link and calls extract_video_info for them
-        soup = BeautifulSoup(page_html, features="html.parser")
-        video_links = soup.select("div.row div.media__image a")
+    def extract_videos_in_search_results(self, result_json):
+        hits = result_json["results"][0]["hits"]
         nb_extracted = 0
-        nb_listed = len(video_links)
+        nb_listed = len(hits)
         logger.debug(f"{nb_listed} video(s) found on current page")
-        for video_link in video_links:
-            url = urllib.parse.urljoin(self.talks_base_url, video_link["href"])
+        for hit in hits:
+            url = urllib.parse.urljoin(self.talks_base_url, hit["slug"])
             if self.extract_info_from_video_page(url):
                 nb_extracted += 1
-            if self.source_languages and len(self.source_languages) > 1:
-                other_lang_urls = self.generate_urls_for_other_languages(url)
-                logger.debug(
-                    f"Searching info for video in other {len(other_lang_urls)} language(s)"
-                )
-                for lang_url in other_lang_urls:
-                    self.extract_info_from_video_page(lang_url)
             self.already_visited.append(urllib.parse.urlparse(url)[2])
-            logger.debug(f"Seen {video_link['href']}")
+            logger.debug(f"Seen {hit['slug']}")
         return nb_extracted, nb_listed

     def get_lang_code_from_url(self, url, with_full_query=False):
@@ -537,15 +532,22 @@ def extract_video_info_from_json(self, json_data):
         except Exception as exc:
             logger.warning(f"player data has no entry for {lang_code}: {exc}")
             lang_name = lang_code
-        # talk_info = json_data["talks"][0]
+        if self.topics:
+            # we need to filter videos since this has not been done before for
+            # topics with the "new" search page (2023)
+            if lang_code not in self.source_languages:
+                # video language is not among the selected ones, we have to
+                # check whether subtitles are enough
+                if not self.subtitles_enough:
+                    logger.debug(f"Ignoring video in non-selected language {lang_code}")
+                    return False
+                else:
+                    matching_languages = [
+                        lang
+                        for lang in player_data["languages"]
+                        if lang["languageCode"] in self.source_languages
+                    ]
+                    if len(matching_languages) == 0:
+                        logger.debug(
+                            "Ignoring video without a selected language in audio or subtitles"
+                        )
+                        return False

         native_talk_language = player_data["nativeLanguage"]
-        if (
-            not self.subtitles_enough
-            and self.source_languages
-            and native_talk_language != lang_code
-            and self.topics
-        ):
-            return False

         # Extract the speaker of the TED talk
         if len(json_data["speakers"]):
@@ -622,7 +624,7 @@ def extract_info_from_video_page(self, url, retry_count=0):
         # returns True if successfully scraped new video

         # don't scrape if URL already visited
-        if urllib.parse.urlparse(url)[2] in self.already_visited:
+        if urllib.parse.urlparse(url).path in self.already_visited:
            return False

        # don't scrape if maximum retry count is reached
@@ -631,7 +633,7 @@ def extract_info_from_video_page(self, url, retry_count=0):
             return False

         logger.debug(f"extract_info_from_video_page: {url}")
-        soup = BeautifulSoup(download_link(url).text, features="html.parser")
+        soup = BeautifulSoup(request_url(url).text, features="html.parser")

         json_data = json.loads(
             soup.find("script", attrs={"id": "__NEXT_DATA__"}).string
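Taken together, query_search_engine and extract_videos_in_search_results reduce to the following standalone sketch of the new flow (same endpoint and payload as in the diff; the results[0].hits[].slug response shape is what the new code consumes, and an empty hits list marks the last page):

import requests

SEARCH_URL = "https://zenith-prod-alt.ted.com/api/search"


def fetch_topic_slugs(topic):
    """Yield talk slugs for a topic, paging until the search API returns no hits."""
    page = 0
    while True:
        payload = [
            {
                "indexName": "relevance",
                "params": {
                    "attributeForDistinct": "objectID",
                    "distinct": 1,
                    "facetFilters": [[f"tags:{topic}"]],
                    "hitsPerPage": 24,
                    "page": page,
                    "query": "",
                },
            }
        ]
        response = requests.post(
            SEARCH_URL, headers={"User-Agent": "Mozilla/5.0"}, json=payload
        )
        response.raise_for_status()
        hits = response.json()["results"][0]["hits"]
        if not hits:
            return
        for hit in hits:
            yield hit["slug"]  # joined onto https://ted.com/talks/ by the scraper
        page += 1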
27 changes: 21 additions & 6 deletions ted2zim/utils.py
@@ -29,12 +29,22 @@ def update_subtitles_list(video_id, language_list):
     return language_list


-def download_link(url):
+def request_url(url, json_data=None):
+    """performs an HTTP request and returns the response, either GET or POST
+    - json_data is used as POST body when passed, otherwise a GET request is done
+    - request is retried 5 times, with a 30*attempt_no secs pause between retries
+    - a pause of 1 sec is done before every request (including the first one)
+    """
+
     if url == f"{BASE_URL}playlists/57":
         url = f"{BASE_URL}playlists/57/björk_6_talks_that_are_music"
     for attempt in range(1, 6):
         time.sleep(1)  # delay requests
-        req = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
+        if json_data:
+            req = requests.post(url, headers={"User-Agent": "Mozilla/5.0"}, json=json_data)
+        else:
+            req = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
         try:
             req.raise_for_status()
         except Exception as exc:
@@ -43,9 +53,14 @@ def download_link(url):
             time.sleep(30 * attempt)  # wait upon failure
             continue
         return req
-    raise ConnectionRefusedError(
-        f"Failed to download {url} after {attempt} attempts (HTTP {req.status_code})"
-    )
+    if json_data:
+        raise ConnectionRefusedError(
+            f"Failed to query {url} after {attempt} attempts (HTTP {req.status_code}); sent data was: {json.dumps(json_data)}"
+        )
+    else:
+        raise ConnectionRefusedError(
+            f"Failed to download {url} after {attempt} attempts (HTTP {req.status_code})"
+        )


class WebVTT:
Expand All @@ -57,7 +72,7 @@ def __init__(self, url):

def convert(self):
"""download and convert its URL to WebVTT text"""
req = download_link(self.url)
req = request_url(self.url)

if req.status_code == 404:
return None
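A worked note on the retry schedule above: every attempt is preceded by a 1 s pause and every failure is followed by a 30 * attempt back-off (including, as written, the fifth and final one), so a URL that never succeeds ties up the scraper for about 455 seconds before ConnectionRefusedError is raised:

# worst-case wall time of request_url when every attempt fails
total_delay = sum(1 + 30 * attempt for attempt in range(1, 6))
print(total_delay)  # 455 s: five 1 s pauses plus 30+60+90+120+150 s of back-off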
