"urllib" to "requests" library change #182

Closed
wants to merge 7 commits
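In short, the PR swaps the standard-library urllib calls for the third-party requests package throughout the scraper. A minimal sketch of the mapping being applied (URLs and variable names below are illustrative only, not the scraper's own API):

import requests

# urllib.request.urlopen(req).read().decode("utf-8") becomes:
response = requests.get("https://example.org/page", headers={"User-Agent": "Mozilla/5.0"})
html = response.text  # requests decodes the body itself

# urllib.parse.urlencode({...}).encode("utf-8") + POST becomes a plain dict:
response = requests.post("https://example.org/login", data={"email": "user@example.org"})

# urllib.parse.urlparse stays available through a requests re-export:
parts = requests.utils.urlparse("https://example.org/a/b?c=d")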
6 changes: 3 additions & 3 deletions openedx2zim/annex.py
@@ -3,7 +3,7 @@
import uuid
import json
import pathlib
import collections
from collections import OrderedDict, defaultdict

from bs4 import BeautifulSoup

@@ -17,7 +17,7 @@ class MoocForum:
def __init__(self, scraper):
self.scraper = scraper
self.threads = []
self.categories = collections.OrderedDict()
self.categories = OrderedDict()
self.staff_user = []
self.output_path = self.scraper.build_dir.joinpath("forum")
self.output_path.mkdir(parents=True, exist_ok=True)
@@ -219,7 +219,7 @@ def annex_forum(self):
self.update_thread_children(thread)

def render_forum(self):
thread_by_category = collections.defaultdict(list)
thread_by_category = defaultdict(list)
for thread in self.threads:
thread_by_category[thread["commentable_id"]].append(thread)
jinja(
58 changes: 33 additions & 25 deletions openedx2zim/html_processor.py
@@ -1,6 +1,6 @@
import pathlib
import re
import urllib
import requests

import xxhash
import lxml.html
@@ -26,9 +26,10 @@ def download_and_get_filename(
"""downloads a file from src and return the name of the downloaded file

with_ext: ensure that downloaded file has the given extension
filter_ext: download only if the file to download has an extension in this list"""
filter_ext: download only if the file to download has an extension in this list
"""

server_path = pathlib.Path(urllib.parse.urlparse(src).path)
server_path = pathlib.Path(requests.utils.urlparse(src).path)
ext = with_ext if with_ext else server_path.suffix

if server_path.suffix:
@@ -57,7 +58,8 @@ def download_dependencies_from_css(

- css_org_url: URL to the CSS file on the internet
- css_path: path of CSS on the filesystem (Path)
- output_path_from_css: string representing path of the output directory relative to css_path"""
- output_path_from_css: string representing path of the output directory relative to css_path
"""

def encapsulate(url):
return f"url({url})"
@@ -71,7 +73,7 @@ def remove_quotes(url):

# ensure the original CSS url has netloc
css_org_url = prepare_url(css_org_url, netloc, path_on_server)
css_org_url = urllib.parse.urlparse(css_org_url)
css_org_url = requests.utils.urlparse(css_org_url)

with open(css_path, "r") as fp:
content = fp.read()
@@ -93,7 +95,7 @@ def remove_quotes(url):
continue

# add netloc if not present
parsed_url = urllib.parse.urlparse(css_url)
parsed_url = requests.utils.urlparse(css_url)
if parsed_url.netloc == "":
if parsed_url.path.startswith("/"):
css_url = (
@@ -120,7 +122,7 @@ def remove_quotes(url):
path_on_server=path_on_server,
with_ext=".css",
)
parsed_css_url = urllib.parse.urlparse(css_url)
parsed_css_url = requests.utils.urlparse(css_url)
self.download_dependencies_from_css(
css_org_url=css_url,
css_path=output_path.joinpath(filename),
@@ -147,7 +149,7 @@ def remove_quotes(url):
def download_images_from_html(
self, html_body, output_path, path_from_html, netloc, path_on_server
):
""" download images from <img> tag and fix path """
"""download images from <img> tag and fix path"""

imgs = html_body.xpath("//img")
for img in imgs:
@@ -171,7 +173,7 @@ def download_images_from_html(
return bool(imgs)

def get_root_from_asset(self, path_from_html, root_from_html):
""" get path to root from the downloaded/generated asset """
"""get path to root from the downloaded/generated asset"""

# return original root if path_from_html is empty
if path_from_html == "":
@@ -200,7 +202,7 @@ def download_documents_from_html(
netloc,
path_on_server,
):
""" download documents from <a> tag and fix path """
"""download documents from <a> tag and fix path"""

anchors = html_body.xpath("//a")
for anchor in anchors:
@@ -242,7 +244,7 @@ def get_path_and_netloc_to_send(self, netloc, path_on_server, downloaded_asset_u
"""get the path and netloc to send recursively after downloading asset from downloaded_asset_url
path_on_server is the current path on server and netloc is the current netloc"""

parsed_src = urllib.parse.urlparse(downloaded_asset_url)
parsed_src = requests.utils.urlparse(downloaded_asset_url)
path_recursive = path_on_server
if parsed_src.path:
asset_path_on_server = pathlib.Path(parsed_src.path)
@@ -262,7 +264,7 @@ def get_path_and_netloc_to_send(self, netloc, path_on_server, downloaded_asset_u
def download_css_from_html(
self, html_body, output_path, path_from_html, netloc, path_on_server
):
""" download css files from <link> tag and fix path """
"""download css files from <link> tag and fix path"""

css_files = html_body.xpath("//link")
for css in css_files:
@@ -298,7 +300,7 @@ def download_css_from_html(
def download_js_from_html(
self, html_body, output_path, path_from_html, netloc, path_on_server
):
""" download javascript from <script> tag and fix path """
"""download javascript from <script> tag and fix path"""

js_files = html_body.xpath("//script")
for js in js_files:
@@ -320,7 +322,7 @@ def download_sources_from_html(
def download_sources_from_html(
self, html_body, output_path, path_from_html, netloc, path_on_server
):
""" downloads content from <source> tags """
"""downloads content from <source> tags"""

sources = html_body.xpath("//source")
for source in sources:
@@ -348,7 +350,7 @@ def download_iframes_from_html(
netloc,
path_on_server,
):
""" download youtube videos and pdf files from iframes in html content """
"""download youtube videos and pdf files from iframes in html content"""

iframes = html_body.xpath("//iframe")
for iframe in iframes:
@@ -395,7 +397,9 @@ def download_iframes_from_html(
# handle iframe recursively
iframe_url = prepare_url(src, netloc)
try:
src_content = self.scraper.instance_connection.get_page(iframe_url)
src_content = self.scraper.instance_connection.get_page(
iframe_url
)
except Exception:
continue
path_recursive, netloc_recursive = self.get_path_and_netloc_to_send(
@@ -425,7 +429,7 @@ def download_iframes_from_html(
return bool(iframes)

def handle_jump_to_paths(self, target_path):
""" return a fixed path in zim for a inter-xblock path containing jump_to """
"""return a fixed path in zim for a inter-xblock path containing jump_to"""

def check_descendants_and_return_path(xblock_extractor):
if xblock_extractor.xblock_json["type"] in ["vertical", "course"]:
@@ -436,15 +440,17 @@ def check_descendants_and_return_path(xblock_extractor):

for xblock_extractor in self.scraper.xblock_extractor_objects:
if (xblock_extractor.xblock_json["block_id"] == target_path.parts[-1]) or (
urllib.parse.urlparse(xblock_extractor.xblock_json["lms_web_url"]).path
requests.utils.urlparse(
xblock_extractor.xblock_json["lms_web_url"]
).path
== str(target_path)
):
# we have a path match, we now check xblock type to redirect properly
# Only vertical and course xblocks have HTMLs
return check_descendants_and_return_path(xblock_extractor)

def rewrite_internal_links(self, html_body, root_from_html, netloc):
""" rewrites internal links and ensures no root-relative links are left behind """
"""rewrites internal links and ensures no root-relative links are left behind"""

def update_root_relative_path(anchor, fixed_path, root_from_html, netloc):
"""updates a root-relative path to the fixed path in zim
@@ -456,12 +462,12 @@ def update_root_relative_path(anchor, fixed_path, root_from_html, netloc):
anchor.attrib["href"] = netloc + anchor.attrib["href"]

anchors = html_body.xpath("//a")
path_prefix = f"{self.scraper.instance_config['course_prefix']}{urllib.parse.unquote_plus(self.scraper.course_id)}"
path_prefix = f"{self.scraper.instance_config['course_prefix']}{requests.utils.unquote(self.scraper.course_id)}"
has_changed = False
for anchor in anchors:
if "href" not in anchor.attrib:
continue
src = urllib.parse.urlparse(anchor.attrib["href"])
src = requests.utils.urlparse(anchor.attrib["href"])

# ignore external links
if src.netloc and src.netloc != self.scraper.instance_url:
@@ -507,7 +513,7 @@ def dl_dependencies_and_fix_links(
netloc=None,
path_on_server="",
):
""" downloads all static dependencies from an HTML content, and fixes links """
"""downloads all static dependencies from an HTML content, and fixes links"""

if not netloc:
netloc = self.scraper.instance_url
@@ -551,7 +557,7 @@ def dl_dependencies_and_fix_links(
return content

def defer_scripts(self, content, output_path, path_from_html):
""" defer all scripts in content. For inline scripts, they're placed in a *.js file and deferred """
"""defer all scripts in content. For inline scripts, they're placed in a *.js file and deferred"""

soup = BeautifulSoup(content, "lxml")
script_tags = soup.find_all("script")
@@ -590,7 +596,8 @@ def extract_head_css_js(self, soup, output_path, path_from_html, root_from_html)

output_path: a Path object to store the downloaded CSS/JS to
path_from_html: a string representing the path to output_path from the resultant HTML
root_from_html: a string representing the path to the root from the resultant HTML"""
root_from_html: a string representing the path to the root from the resultant HTML
"""

html_headers = soup.find("head")
head_css_js = (
@@ -622,7 +629,8 @@ def extract_body_end_scripts(

output_path: a Path object to store the downloaded CSS/JS to
path_from_html: a string representing the path to output_path from the resultant HTML
root_from_html: a string representing the path to the root from the resultant HTML"""
root_from_html: a string representing the path to the root from the resultant HTML
"""

html_body = soup.find("body")
body_scripts = html_body.find_all("script", recursive=False)
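A note on the swaps in this file: requests.utils simply re-exports several urllib.parse helpers, so urlparse behaves identically; unquote, however, is not a drop-in for the unquote_plus it replaces above, because it leaves "+" characters intact. A quick sketch of both points, assuming nothing beyond the two libraries:

import urllib.parse
import requests.utils

# identical: requests.utils.urlparse is urllib.parse.urlparse re-exported
assert requests.utils.urlparse("https://host/a?b=c") == urllib.parse.urlparse("https://host/a?b=c")

# not identical: unquote_plus also turns "+" into a space, unquote does not
assert urllib.parse.unquote_plus("a+b%2Bc") == "a b+c"
assert requests.utils.unquote("a+b%2Bc") == "a+b+c"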
24 changes: 10 additions & 14 deletions openedx2zim/instance_connection.py
@@ -3,7 +3,7 @@
import http
import json
import sys
import urllib
import requests

from .constants import getLogger, LANGUAGE_COOKIES, OPENEDX_LANG_MAP

@@ -25,17 +25,17 @@ def __init__(self, email, password, instance_config, locale, build_dir, debug):
self.debug = debug

def get_response(self, url, post_data, headers, max_attempts=5):
req = urllib.request.Request(url, post_data, headers)
for attempt in range(max_attempts):
try:
return urllib.request.urlopen(req).read().decode("utf-8")
except urllib.error.HTTPError as exc:
# post inside the loop so each retry re-sends the request,
# via the cookie-carrying session set up at login
response = self.session.post(url, data=post_data, headers=headers)
response.raise_for_status()
return response.text
except requests.exceptions.HTTPError as exc:
logger.debug(f"HTTP Error (won't retry this kind of error) while opening {url}: {exc}")
if self.debug:
responseData = exc.response.text if exc.response is not None else ""
print(responseData, file=sys.stderr)
raise exc
except urllib.error.URLError as exc:
except requests.exceptions.RequestException as exc:
if attempt < max_attempts - 1:
logger.debug(f"Error opening {url}: {exc}\nRetrying ...")
continue
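For reference, the retry shape above written out as a standalone requests pattern; a sketch with hypothetical names, not the scraper's exact code:

import requests

def fetch_with_retries(session: requests.Session, url: str, data: dict, max_attempts: int = 5) -> str:
    for attempt in range(max_attempts):
        try:
            response = session.post(url, data=data)
            response.raise_for_status()  # turn 4xx/5xx responses into HTTPError
            return response.text
        except requests.exceptions.HTTPError:
            raise  # HTTP errors are not transient; do not retry
        except requests.exceptions.RequestException:
            if attempt == max_attempts - 1:
                raise  # connection-level errors: retry, then give up
    return ""  # unreachable

Since HTTPError subclasses RequestException, the more specific handler has to come first.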
@@ -56,11 +56,9 @@ def update_csrf_token_in_headers(self):
self.headers.update({"X-CSRFToken": csrf_token})

def generate_connection_headers(self):
opener = urllib.request.build_opener(
urllib.request.HTTPCookieProcessor(self.cookie_jar)
)
opener.addheaders = [("User-Agent", "Mozilla/5.0")]
urllib.request.install_opener(opener)
opener.open(self.instance_config["instance_url"] + "/login")
# a cookie-aware requests.Session replaces the installed urllib opener
self.session = requests.Session()
self.session.cookies = self.cookie_jar
self.session.headers.update({"User-Agent": "Mozilla/5.0"})
self.session.get(self.instance_config["instance_url"] + "/login")
self.headers = {
"User-Agent": "Mozilla/5.0",
@@ -74,9 +72,7 @@ def generate_connection_headers(self):

def establish_connection(self):
self.generate_connection_headers()
post_data = urllib.parse.urlencode(
{"email": self.email, "password": self.password, "remember": False}
).encode("utf-8")
post_data = {"email": self.email, "password": self.password, "remember": False}
# API login can also be used : /user_api/v1/account/login_session/
self.instance_connection = self.get_api_json(
self.instance_config["login_page"], post_data, max_attempts=1
@@ -122,5 +118,5 @@ def get_page(self, url):

def get_redirection(self, url):
self.update_csrf_token_in_headers()
req = urllib.request.Request(url, None, self.headers)
return urllib.request.urlopen(req).geturl()
# follow redirects, as urlopen().geturl() did, and return the final URL
response = self.session.get(url, headers=self.headers)
return response.url
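Worth noting on this last hunk: urlopen(req).geturl() follows every redirect and returns the final absolute URL, whereas a single Location header is only one hop deep and may be relative. Letting requests follow redirects and reading response.url preserves the old behaviour, e.g. against httpbin.org:

import requests

# one hop, possibly relative:
hop = requests.get("https://httpbin.org/redirect/2", allow_redirects=False)
print(hop.headers.get("Location"))  # "/relative-redirect/1"

# full chain, absolute, like urlopen().geturl():
final = requests.get("https://httpbin.org/redirect/2")
print(final.url)  # "https://httpbin.org/get"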