Merge pull request #151 from openzim/fix_topics
Fix fetching topics: moved to an internal search API
benoit74 authored Dec 18, 2023
2 parents 3779bf3 + 35da0a1 commit 9eae1dc
Showing 5 changed files with 93 additions and 68 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project are documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 2.0.11).

+## Unreleased
+
+### Changed
+- fixed search by topic to use the new search API instead of broken web page scraping (#149)
+- download_link is renamed request_url and can also perform POST requests (in addition to the previous GET requests)
+
 ## [2.0.13] - 2023-04-24

 ### Changed
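For reference, a minimal usage sketch of the renamed helper (not part of the diff; request_url is defined in ted2zim/utils.py below, and passing a json_data payload switches it from GET to POST — the search payload here is trimmed for illustration):

import json

from ted2zim.utils import request_url

# GET, exactly what download_link used to do
topics = json.loads(request_url("https://www.ted.com/topics/combo?models=Talks").text)

# POST: any json_data argument is sent as the JSON body of a POST request
response = request_url(
    "https://zenith-prod-alt.ted.com/api/search",
    [{"indexName": "relevance", "params": {"query": "", "page": 0}}],
)
print(response.json())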
2 changes: 2 additions & 0 deletions ted2zim/constants.py
@@ -17,6 +17,8 @@

 BASE_URL = "https://ted.com/"

+SEARCH_URL = "https://zenith-prod-alt.ted.com/api/search"
+
 MATCHING = "matching"
 ALL = "all"
 NONE = "none"
6 changes: 3 additions & 3 deletions ted2zim/multi/scraper.py
@@ -17,7 +17,7 @@
 from kiwixstorage import KiwixStorage

 from ..constants import NAME, getLogger
-from ..utils import download_link, get_temp_fpath
+from ..utils import request_url, get_temp_fpath

 logger = getLogger()

@@ -83,7 +83,7 @@ def download_playlists_list_from_site(topics_list):
     for topic in topics_list:
         logger.debug(f"Getting playlists related to {topic}")
         playlist_sub_list = json.loads(
-            download_link(
+            request_url(
                 f"https://www.ted.com/playlists/browse.json?topics={topic['value'].replace(' ', '+')}"
             ).text
         )
@@ -136,7 +136,7 @@ def get_list_of_all(self, mode):
         """returns a list of topics or playlists"""
         # get all topics
         topics_list = json.loads(
-            download_link("https://www.ted.com/topics/combo?models=Talks").text
+            request_url("https://www.ted.com/topics/combo?models=Talks").text
         )
         if mode == "topic":
             return topics_list
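The multi scraper keeps its json.loads(request_url(...).text) pattern unchanged apart from the rename; as a standalone sketch (the topic value is a made-up example):

import json

from ted2zim.utils import request_url

# fetch the playlists advertised for one topic, exactly as download_playlists_list_from_site does
topic = {"value": "climate change"}
playlist_sub_list = json.loads(
    request_url(
        f"https://www.ted.com/playlists/browse.json?topics={topic['value'].replace(' ', '+')}"
    ).text
)
print(playlist_sub_list)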
120 changes: 61 additions & 59 deletions ted2zim/scraper.py
@@ -7,6 +7,7 @@
 import json
 import locale
 import pathlib
+import requests
 import shutil
 import tempfile
 import time
@@ -29,6 +30,7 @@
 from .constants import (
     ALL,
     BASE_URL,
+    SEARCH_URL,
     MATCHING,
     NONE,
     ROOT_DIR,
@@ -37,7 +39,7 @@
     getLogger,
 )
 from .processing import post_process_video
-from .utils import WebVTT, download_link, update_subtitles_list, get_main_title
+from .utils import WebVTT, request_url, update_subtitles_list, get_main_title

 logger = getLogger()

@@ -97,7 +99,7 @@ def __init__(
         self.topics = (
             []
             if not topics
-            else [c.strip().replace(" ", "+") for c in topics.split(",")]
+            else topics.split(",")
         )
         self.autoplay = autoplay
         self.playlist = playlist
@@ -168,7 +170,7 @@ def ted_topics_json(self):

     @property
     def talks_base_url(self):
-        return BASE_URL + "talks"
+        return BASE_URL + "talks/"

     @property
     def playlists_base_url(self):
@@ -230,7 +232,7 @@ def extract_videos_from_playlist(self, playlist):

         playlist_url = f"{self.playlists_base_url}/{playlist}"
         logger.debug(f"extract_videos_from_playlist: {playlist_url}")
-        soup = BeautifulSoup(download_link(playlist_url).text, features="html.parser")
+        soup = BeautifulSoup(request_url(playlist_url).text, features="html.parser")
         video_elements = soup.find_all("a", attrs={"class": "group"})
         self.playlist_title = soup.find("h1").string
         self.playlist_description = soup.find("p", attrs={"class": "text-base"}).string
@@ -246,47 +248,55 @@ def extract_videos_from_playlist(self, playlist):
                 )
                 for lang_url in other_lang_urls:
                     self.extract_info_from_video_page(lang_url)
-            self.already_visited.append(urllib.parse.urlparse(url)[2])
+            self.already_visited.append(urllib.parse.urlparse(url).path)
             logger.debug(f"Seen {relative_path}")
         logger.debug(f"Total videos found on playlist: {len(video_elements)}")
         if not video_elements:
             raise ValueError("Wrong playlist ID supplied. No videos found")

-    def generate_search_result_and_scrape(self, topic_url, total_videos_scraped):
-        """generates a search result and returns the total number of videos scraped"""
+    def generate_search_results(self, topic):
+        """generates search results and returns the total number of videos scraped"""

-        page = 1
+        total_videos_scraped = 0
+        page = 0
         while True:
-            logger.debug(f"generate_search_result_and_scrape: {topic_url}&page={page}")
-            html = download_link(f"{topic_url}&page={page}").text
-            nb_videos_extracted, nb_videos_on_page = self.extract_videos_on_topic_page(
-                html
-            )
+            result = self.query_search_engine(topic, page)
+            result_json = result.json()
+            nb_videos_extracted, nb_videos_on_page = self.extract_videos_in_search_results(
+                result_json
+            )
             if nb_videos_on_page == 0:
                 break
             total_videos_scraped += nb_videos_extracted
             page += 1
         return total_videos_scraped

+    def query_search_engine(self, topic, page):
+        logger.debug(f"Fetching page {page} of topic {topic}")
+        data = [
+            {
+                "indexName": "relevance",
+                "params": {
+                    "attributeForDistinct": "objectID",
+                    "distinct": 1,
+                    "facetFilters": [[f"tags:{topic}"]],
+                    "facets": ["subtitle_languages", "tags"],
+                    "highlightPostTag": "__/ais-highlight__",
+                    "highlightPreTag": "__ais-highlight__",
+                    "hitsPerPage": 24,
+                    "maxValuesPerFacet": 500,
+                    "page": page,
+                    "query": "",
+                    "tagFilters": "",
+                },
+            },
+        ]
+        return request_url(SEARCH_URL, data)
+
     def extract_videos_from_topics(self, topic):
         """extracts metadata for required number of videos on different topics"""

         logger.debug(f"Fetching video links for topic: {topic}")
-        topic_url = f"{self.talks_base_url}?topics%5B%5D={topic}"
-        total_videos_scraped = 0
-
-        if self.source_languages:
-            for lang in self.source_languages:
-                topic_url = topic_url + f"&language={lang}"
-                total_videos_scraped = self.generate_search_result_and_scrape(
-                    topic_url, total_videos_scraped
-                )
-
-        else:
-            total_videos_scraped = self.generate_search_result_and_scrape(
-                topic_url, total_videos_scraped
-            )
-
+        total_videos_scraped = self.generate_search_results(topic)
         logger.info(f"Total video links found in {topic}: {total_videos_scraped}")
         if total_videos_scraped == 0:
             return False
@@ -402,31 +412,16 @@ def generate_urls_for_other_languages(self, url):
             urls.append(urllib.parse.urlunparse(url_parts))
         return urls

-    def extract_videos_on_topic_page(self, page_html):
-
-        # all videos are embedded in a <div> with the class name 'row'.
-        # we are searching for the div inside this div, that has an <a>-tag
-        # with the class name 'media__image', because this is the relative
-        # link to the representative TED talk. It turns this relative link to
-        # an absolute link and calls extract_video_info for them
-        soup = BeautifulSoup(page_html, features="html.parser")
-        video_links = soup.select("div.row div.media__image a")
+    def extract_videos_in_search_results(self, result_json):
+        hits = result_json["results"][0]["hits"]
         nb_extracted = 0
-        nb_listed = len(video_links)
+        nb_listed = len(hits)
         logger.debug(f"{nb_listed} video(s) found on current page")
-        for video_link in video_links:
-            url = urllib.parse.urljoin(self.talks_base_url, video_link["href"])
+        for hit in hits:
+            url = urllib.parse.urljoin(self.talks_base_url, hit["slug"])
             if self.extract_info_from_video_page(url):
                 nb_extracted += 1
-            if self.source_languages and len(self.source_languages) > 1:
-                other_lang_urls = self.generate_urls_for_other_languages(url)
-                logger.debug(
-                    f"Searching info for video in other {len(other_lang_urls)} language(s)"
-                )
-                for lang_url in other_lang_urls:
-                    self.extract_info_from_video_page(lang_url)
             self.already_visited.append(urllib.parse.urlparse(url)[2])
-            logger.debug(f"Seen {video_link['href']}")
+            logger.debug(f"Seen {hit['slug']}")
         return nb_extracted, nb_listed

     def get_lang_code_from_url(self, url, with_full_query=False):
@@ -537,15 +532,22 @@ def extract_video_info_from_json(self, json_data):
         except Exception as exc:
             logger.warning(f"player data has no entry for {lang_code}: {exc}")
             lang_name = lang_code
-        # talk_info = json_data["talks"][0]
+        if self.topics:
+            # we need to filter videos since this has not been done before for
+            # topics with the "new" search page (2023)
+            if lang_code not in self.source_languages:
+                # video language is not among the selected ones, we have to
+                # check whether subtitles are enough
+                if not self.subtitles_enough:
+                    logger.debug(f"Ignoring video in non-selected language {lang_code}")
+                    return False
+                else:
+                    matching_languages = [
+                        lang
+                        for lang in player_data["languages"]
+                        if lang["languageCode"] in self.source_languages
+                    ]
+                    if len(matching_languages) == 0:
+                        logger.debug(
+                            "Ignoring video without a selected language in audio or subtitles"
+                        )
+                        return False

         native_talk_language = player_data["nativeLanguage"]
-        if (
-            not self.subtitles_enough
-            and self.source_languages
-            and native_talk_language != lang_code
-            and self.topics
-        ):
-            return False

         # Extract the speaker of the TED talk
         if len(json_data["speakers"]):
@@ -622,7 +624,7 @@ def extract_info_from_video_page(self, url, retry_count=0):
         # returns True if successfully scraped new video

         # don't scrape if URL already visited
-        if urllib.parse.urlparse(url)[2] in self.already_visited:
+        if urllib.parse.urlparse(url).path in self.already_visited:
            return False

        # don't scrape if maximum retry count is reached
@@ -631,7 +633,7 @@ def extract_info_from_video_page(self, url, retry_count=0):
             return False

         logger.debug(f"extract_info_from_video_page: {url}")
-        soup = BeautifulSoup(download_link(url).text, features="html.parser")
+        soup = BeautifulSoup(request_url(url).text, features="html.parser")

         json_data = json.loads(
             soup.find("script", attrs={"id": "__NEXT_DATA__"}).string
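Taken together, query_search_engine and extract_videos_in_search_results reduce to the following standalone sketch of the new flow (same endpoint and payload as in the diff; the results[0].hits[].slug response shape is what the new code consumes, and an empty hits list marks the last page):

import requests

SEARCH_URL = "https://zenith-prod-alt.ted.com/api/search"


def fetch_topic_slugs(topic):
    """Yield talk slugs for a topic, paging until the search API returns no hits."""
    page = 0
    while True:
        payload = [
            {
                "indexName": "relevance",
                "params": {
                    "attributeForDistinct": "objectID",
                    "distinct": 1,
                    "facetFilters": [[f"tags:{topic}"]],
                    "hitsPerPage": 24,
                    "page": page,
                    "query": "",
                },
            }
        ]
        response = requests.post(
            SEARCH_URL, headers={"User-Agent": "Mozilla/5.0"}, json=payload
        )
        response.raise_for_status()
        hits = response.json()["results"][0]["hits"]
        if not hits:
            return
        for hit in hits:
            yield hit["slug"]  # joined onto https://ted.com/talks/ by the scraper
        page += 1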
27 changes: 21 additions & 6 deletions ted2zim/utils.py
@@ -29,12 +29,22 @@ def update_subtitles_list(video_id, language_list):
     return language_list


-def download_link(url):
+def request_url(url, json_data=None):
+    """performs an HTTP request and returns the response, either GET or POST
+    - json_data is used as POST body when passed, otherwise a GET request is done
+    - request is retried 5 times, with a 30*attempt_no secs pause between retries
+    - a pause of 1 sec is done before every request (including the first one)
+    """
+
     if url == f"{BASE_URL}playlists/57":
         url = f"{BASE_URL}playlists/57/björk_6_talks_that_are_music"
     for attempt in range(1, 6):
         time.sleep(1)  # delay requests
-        req = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
+        if json_data:
+            req = requests.post(url, headers={"User-Agent": "Mozilla/5.0"}, json=json_data)
+        else:
+            req = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
         try:
             req.raise_for_status()
         except Exception as exc:
@@ -43,9 +53,14 @@ def download_link(url):
             time.sleep(30 * attempt)  # wait upon failure
             continue
         return req
-    raise ConnectionRefusedError(
-        f"Failed to download {url} after {attempt} attempts (HTTP {req.status_code})"
-    )
+    if json_data:
+        raise ConnectionRefusedError(
+            f"Failed to query {url} after {attempt} attempts (HTTP {req.status_code}); sent data was: {json.dumps(json_data)}"
+        )
+    else:
+        raise ConnectionRefusedError(
+            f"Failed to download {url} after {attempt} attempts (HTTP {req.status_code})"
+        )


class WebVTT:
Expand All @@ -57,7 +72,7 @@ def __init__(self, url):

def convert(self):
"""download and convert its URL to WebVTT text"""
req = download_link(self.url)
req = request_url(self.url)

if req.status_code == 404:
return None
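A worked note on the retry schedule above: every attempt is preceded by a 1 s pause and every failure is followed by a 30 * attempt back-off (including, as written, the fifth and final one), so a URL that never succeeds ties up the scraper for about 455 seconds before ConnectionRefusedError is raised:

# worst-case wall time of request_url when every attempt fails
total_delay = sum(1 + 30 * attempt for attempt in range(1, 6))
print(total_delay)  # 455 s: five 1 s pauses plus 30+60+90+120+150 s of back-off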
