"urllib" to "requests" library change #182

Closed
wants to merge 7 commits
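In short, the PR swaps the standard-library urllib calls for the third-party requests package throughout the scraper. A minimal sketch of the mapping being applied (URLs and variable names below are illustrative only, not the scraper's own API):

import requests

# urllib.request.urlopen(req).read().decode("utf-8") becomes:
response = requests.get("https://example.org/page", headers={"User-Agent": "Mozilla/5.0"})
html = response.text  # requests decodes the body itself

# urllib.parse.urlencode({...}).encode("utf-8") + POST becomes a plain dict:
response = requests.post("https://example.org/login", data={"email": "user@example.org"})

# urllib.parse.urlparse stays available through a requests re-export:
parts = requests.utils.urlparse("https://example.org/a/b?c=d")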
6 changes: 3 additions & 3 deletions openedx2zim/annex.py
@@ -3,7 +3,7 @@
import uuid
import json
import pathlib
import collections
from collections import OrderedDict, defaultdict

from bs4 import BeautifulSoup

@@ -17,7 +17,7 @@ class MoocForum:
def __init__(self, scraper):
self.scraper = scraper
self.threads = []
self.categories = collections.OrderedDict()
self.categories = OrderedDict()
self.staff_user = []
self.output_path = self.scraper.build_dir.joinpath("forum")
self.output_path.mkdir(parents=True, exist_ok=True)
@@ -219,7 +219,7 @@ def annex_forum(self):
self.update_thread_children(thread)

def render_forum(self):
thread_by_category = collections.defaultdict(list)
thread_by_category = defaultdict(list)
for thread in self.threads:
thread_by_category[thread["commentable_id"]].append(thread)
jinja(
58 changes: 33 additions & 25 deletions openedx2zim/html_processor.py
@@ -1,6 +1,6 @@
import pathlib
import re
import urllib
import requests

import xxhash
import lxml.html
@@ -26,9 +26,10 @@ def download_and_get_filename(
"""downloads a file from src and return the name of the downloaded file

with_ext: ensure that downloaded file has the given extension
filter_ext: download only if the file to download has an extension in this list"""
filter_ext: download only if the file to download has an extension in this list
"""

server_path = pathlib.Path(urllib.parse.urlparse(src).path)
server_path = pathlib.Path(requests.utils.urlparse(src).path)
ext = with_ext if with_ext else server_path.suffix

if server_path.suffix:
@@ -57,7 +58,8 @@ def download_dependencies_from_css(

- css_org_url: URL to the CSS file on the internet
- css_path: path of CSS on the filesystem (Path)
- output_path_from_css: string representing path of the output directory relative to css_path"""
- output_path_from_css: string representing path of the output directory relative to css_path
"""

def encapsulate(url):
return f"url({url})"
@@ -71,7 +73,7 @@ def remove_quotes(url):

# ensure the original CSS url has netloc
css_org_url = prepare_url(css_org_url, netloc, path_on_server)
css_org_url = urllib.parse.urlparse(css_org_url)
css_org_url = requests.utils.urlparse(css_org_url)

with open(css_path, "r") as fp:
content = fp.read()
@@ -93,7 +95,7 @@ def remove_quotes(url):
continue

# add netloc if not present
parsed_url = urllib.parse.urlparse(css_url)
parsed_url = requests.utils.urlparse(css_url)
if parsed_url.netloc == "":
if parsed_url.path.startswith("/"):
css_url = (
@@ -120,7 +122,7 @@ def remove_quotes(url):
path_on_server=path_on_server,
with_ext=".css",
)
parsed_css_url = urllib.parse.urlparse(css_url)
parsed_css_url = requests.utils.urlparse(css_url)
self.download_dependencies_from_css(
css_org_url=css_url,
css_path=output_path.joinpath(filename),
@@ -147,7 +149,7 @@ def remove_quotes(url):
def download_images_from_html(
self, html_body, output_path, path_from_html, netloc, path_on_server
):
""" download images from <img> tag and fix path """
"""download images from <img> tag and fix path"""

imgs = html_body.xpath("//img")
for img in imgs:
@@ -171,7 +173,7 @@ def download_images_from_html(
return bool(imgs)

def get_root_from_asset(self, path_from_html, root_from_html):
""" get path to root from the downloaded/generated asset """
"""get path to root from the downloaded/generated asset"""

# return original root if path_from_html is empty
if path_from_html == "":
@@ -200,7 +202,7 @@ def download_documents_from_html(
netloc,
path_on_server,
):
""" download documents from <a> tag and fix path """
"""download documents from <a> tag and fix path"""

anchors = html_body.xpath("//a")
for anchor in anchors:
@@ -242,7 +244,7 @@ def get_path_and_netloc_to_send(self, netloc, path_on_server, downloaded_asset_u
"""get the path and netloc to send recursively after downloading asset from downloaded_asset_url
path_on_server is the current path on server and netloc is the current netloc"""

parsed_src = urllib.parse.urlparse(downloaded_asset_url)
parsed_src = requests.utils.urlparse(downloaded_asset_url)
path_recursive = path_on_server
if parsed_src.path:
asset_path_on_server = pathlib.Path(parsed_src.path)
@@ -262,7 +264,7 @@ def get_path_and_netloc_to_send(self, netloc, path_on_server, downloaded_asset_u
def download_css_from_html(
self, html_body, output_path, path_from_html, netloc, path_on_server
):
""" download css files from <link> tag and fix path """
"""download css files from <link> tag and fix path"""

css_files = html_body.xpath("//link")
for css in css_files:
@@ -298,7 +300,7 @@ def download_css_from_html(
def download_js_from_html(
self, html_body, output_path, path_from_html, netloc, path_on_server
):
""" download javascript from <script> tag and fix path """
"""download javascript from <script> tag and fix path"""

js_files = html_body.xpath("//script")
for js in js_files:
@@ -320,7 +322,7 @@ def download_sources_from_html(
def download_sources_from_html(
self, html_body, output_path, path_from_html, netloc, path_on_server
):
""" downloads content from <source> tags """
"""downloads content from <source> tags"""

sources = html_body.xpath("//source")
for source in sources:
@@ -348,7 +350,7 @@ def download_iframes_from_html(
netloc,
path_on_server,
):
""" download youtube videos and pdf files from iframes in html content """
"""download youtube videos and pdf files from iframes in html content"""

iframes = html_body.xpath("//iframe")
for iframe in iframes:
@@ -395,7 +397,9 @@ def download_iframes_from_html(
# handle iframe recursively
iframe_url = prepare_url(src, netloc)
try:
src_content = self.scraper.instance_connection.get_page(iframe_url)
src_content = self.scraper.instance_connection.get_page(
iframe_url
)
except Exception:
continue
path_recursive, netloc_recursive = self.get_path_and_netloc_to_send(
@@ -425,7 +429,7 @@ def download_iframes_from_html(
return bool(iframes)

def handle_jump_to_paths(self, target_path):
""" return a fixed path in zim for a inter-xblock path containing jump_to """
"""return a fixed path in zim for a inter-xblock path containing jump_to"""

def check_descendants_and_return_path(xblock_extractor):
if xblock_extractor.xblock_json["type"] in ["vertical", "course"]:
@@ -436,15 +440,17 @@ def check_descendants_and_return_path(xblock_extractor):

for xblock_extractor in self.scraper.xblock_extractor_objects:
if (xblock_extractor.xblock_json["block_id"] == target_path.parts[-1]) or (
urllib.parse.urlparse(xblock_extractor.xblock_json["lms_web_url"]).path
requests.utils.urlparse(
xblock_extractor.xblock_json["lms_web_url"]
).path
== str(target_path)
):
# we have a path match, we now check xblock type to redirect properly
# Only vertical and course xblocks have HTMLs
return check_descendants_and_return_path(xblock_extractor)

def rewrite_internal_links(self, html_body, root_from_html, netloc):
""" rewrites internal links and ensures no root-relative links are left behind """
"""rewrites internal links and ensures no root-relative links are left behind"""

def update_root_relative_path(anchor, fixed_path, root_from_html, netloc):
"""updates a root-relative path to the fixed path in zim
@@ -456,12 +462,12 @@ def update_root_relative_path(anchor, fixed_path, root_from_html, netloc):
anchor.attrib["href"] = netloc + anchor.attrib["href"]

anchors = html_body.xpath("//a")
path_prefix = f"{self.scraper.instance_config['course_prefix']}{urllib.parse.unquote_plus(self.scraper.course_id)}"
path_prefix = f"{self.scraper.instance_config['course_prefix']}{requests.utils.unquote(self.scraper.course_id)}"
has_changed = False
for anchor in anchors:
if "href" not in anchor.attrib:
continue
src = urllib.parse.urlparse(anchor.attrib["href"])
src = requests.utils.urlparse(anchor.attrib["href"])

# ignore external links
if src.netloc and src.netloc != self.scraper.instance_url:
@@ -507,7 +513,7 @@ def dl_dependencies_and_fix_links(
netloc=None,
path_on_server="",
):
""" downloads all static dependencies from an HTML content, and fixes links """
"""downloads all static dependencies from an HTML content, and fixes links"""

if not netloc:
netloc = self.scraper.instance_url
@@ -551,7 +557,7 @@ def dl_dependencies_and_fix_links(
return content

def defer_scripts(self, content, output_path, path_from_html):
""" defer all scripts in content. For inline scripts, they're placed in a *.js file and deferred """
"""defer all scripts in content. For inline scripts, they're placed in a *.js file and deferred"""

soup = BeautifulSoup(content, "lxml")
script_tags = soup.find_all("script")
@@ -590,7 +596,8 @@ def extract_head_css_js(self, soup, output_path, path_from_html, root_from_html)

output_path: a Path object to store the downloaded CSS/JS to
path_from_html: a string representing the path to output_path from the resultant HTML
root_from_html: a string representing the path to the root from the resultant HTML"""
root_from_html: a string representing the path to the root from the resultant HTML
"""

html_headers = soup.find("head")
head_css_js = (
@@ -622,7 +629,8 @@ def extract_body_end_scripts(

output_path: a Path object to store the downloaded CSS/JS to
path_from_html: a string representing the path to output_path from the resultant HTML
root_from_html: a string representing the path to the root from the resultant HTML"""
root_from_html: a string representing the path to the root from the resultant HTML
"""

html_body = soup.find("body")
body_scripts = html_body.find_all("script", recursive=False)
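A note on the swaps in this file: requests.utils simply re-exports several urllib.parse helpers, so urlparse behaves identically; unquote, however, is not a drop-in for the unquote_plus it replaces above, because it leaves "+" characters intact. A quick sketch of both points, assuming nothing beyond the two libraries:

import urllib.parse
import requests.utils

# identical: requests.utils.urlparse is urllib.parse.urlparse re-exported
assert requests.utils.urlparse("https://host/a?b=c") == urllib.parse.urlparse("https://host/a?b=c")

# not identical: unquote_plus also turns "+" into a space, unquote does not
assert urllib.parse.unquote_plus("a+b%2Bc") == "a b+c"
assert requests.utils.unquote("a+b%2Bc") == "a+b+c"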
24 changes: 10 additions & 14 deletions openedx2zim/instance_connection.py
@@ -3,7 +3,7 @@
import http
import json
import sys
import urllib
import requests

from .constants import getLogger, LANGUAGE_COOKIES, OPENEDX_LANG_MAP

@@ -25,17 +25,17 @@ def __init__(self, email, password, instance_config, locale, build_dir, debug):
self.debug = debug

def get_response(self, url, post_data, headers, max_attempts=5):
req = urllib.request.Request(url, post_data, headers)
for attempt in range(max_attempts):
try:
return urllib.request.urlopen(req).read().decode("utf-8")
except urllib.error.HTTPError as exc:
# post inside the loop so each retry re-sends the request,
# via the cookie-carrying session set up at login
response = self.session.post(url, data=post_data, headers=headers)
response.raise_for_status()
return response.text
except requests.exceptions.HTTPError as exc:
logger.debug(f"HTTP Error (won't retry this kind of error) while opening {url}: {exc}")
if self.debug:
responseData = exc.response.text if exc.response is not None else ""
print(responseData, file=sys.stderr)
raise exc
except urllib.error.URLError as exc:
except requests.exceptions.RequestException as exc:
if attempt < max_attempts - 1:
logger.debug(f"Error opening {url}: {exc}\nRetrying ...")
continue
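For reference, the retry shape above written out as a standalone requests pattern; a sketch with hypothetical names, not the scraper's exact code:

import requests

def fetch_with_retries(session: requests.Session, url: str, data: dict, max_attempts: int = 5) -> str:
    for attempt in range(max_attempts):
        try:
            response = session.post(url, data=data)
            response.raise_for_status()  # turn 4xx/5xx responses into HTTPError
            return response.text
        except requests.exceptions.HTTPError:
            raise  # HTTP errors are not transient; do not retry
        except requests.exceptions.RequestException:
            if attempt == max_attempts - 1:
                raise  # connection-level errors: retry, then give up
    return ""  # unreachable

Since HTTPError subclasses RequestException, the more specific handler has to come first.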
@@ -56,11 +56,9 @@ def update_csrf_token_in_headers(self):
self.headers.update({"X-CSRFToken": csrf_token})

def generate_connection_headers(self):
opener = urllib.request.build_opener(
urllib.request.HTTPCookieProcessor(self.cookie_jar)
)
opener.addheaders = [("User-Agent", "Mozilla/5.0")]
urllib.request.install_opener(opener)
opener.open(self.instance_config["instance_url"] + "/login")
# a cookie-aware requests.Session replaces the installed urllib opener
self.session = requests.Session()
self.session.cookies = self.cookie_jar
self.session.headers.update({"User-Agent": "Mozilla/5.0"})
self.session.get(self.instance_config["instance_url"] + "/login")
self.headers = {
"User-Agent": "Mozilla/5.0",
@@ -74,9 +72,7 @@ def generate_connection_headers(self):

def establish_connection(self):
self.generate_connection_headers()
post_data = urllib.parse.urlencode(
{"email": self.email, "password": self.password, "remember": False}
).encode("utf-8")
post_data = {"email": self.email, "password": self.password, "remember": False}
# API login can also be used : /user_api/v1/account/login_session/
self.instance_connection = self.get_api_json(
self.instance_config["login_page"], post_data, max_attempts=1
@@ -122,5 +118,5 @@ def get_page(self, url):

def get_redirection(self, url):
self.update_csrf_token_in_headers()
req = urllib.request.Request(url, None, self.headers)
return urllib.request.urlopen(req).geturl()
# follow redirects, as urlopen().geturl() did, and return the final URL
response = self.session.get(url, headers=self.headers)
return response.url
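Worth noting on this last hunk: urlopen(req).geturl() follows every redirect and returns the final absolute URL, whereas a single Location header is only one hop deep and may be relative. Letting requests follow redirects and reading response.url preserves the old behaviour, e.g. against httpbin.org:

import requests

# one hop, possibly relative:
hop = requests.get("https://httpbin.org/redirect/2", allow_redirects=False)
print(hop.headers.get("Location"))  # "/relative-redirect/1"

# full chain, absolute, like urlopen().geturl():
final = requests.get("https://httpbin.org/redirect/2")
print(final.url)  # "https://httpbin.org/get"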