Skip to content

Commit

Permalink
Merge pull request #1 from sean-morris/main
Browse files Browse the repository at this point in the history
Downloader-Plugins are now in one repo
  • Loading branch information
sean-morris authored Dec 21, 2021
2 parents 0c7808c + 8686668 commit bb2a30a
Show file tree
Hide file tree
Showing 14 changed files with 643 additions and 0 deletions.
21 changes: 21 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,23 @@
# nbgitpuller-downloader-plugins
jupyterhub/nbgitpuller allows for content-provider plugins; this Python package provides downloader plugins.

There are three downloader-plugins included in this repository:
- nbgitpuller-downloader-generic-web
- nbgitpuller-downloader-googledrive
- nbgitpuller-downloader-dropbox

When this package is installed next to nbgitpuller in a jupyterhub instance, you can use nbgitpuller links
to compressed archives (zip and tar) in Google Drive, Dropbox, or any publicly exposed URL to download your notebooks (or any files)
into a jupyterhub.

This plugin expects URLs included in the nbgitpuller link to be in the following format:
- Generic Web: https://www.example.com/materials-sp20-external.zip
- Dropbox: https://www.dropbox.com/s/<dropbox-id>/materials-sp20-external.zip?dl=0
- Google Drive: https://drive.google.com/file/d/<google-file-id>/view?usp=sharing

In all of these cases, the archive must be publicly available or "shared" with everyone in the case of Google Drive.

## Installation

```shell
python3 -m pip install nbgitpuller-downloader-plugins
4 changes: 4 additions & 0 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
aioresponses
pytest
pytest-asyncio
.
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"
28 changes: 28 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
[metadata]
name = nbgitpuller-downloader-plugins
version = 0.0.0
author = Sean Morris
author_email = sean.smorris@berkeley.edu
description = Downloads compressed archives from Google Drive, dropbox, or any generic web server
long_description = file: README.md
long_description_content_type = text/markdown
url = https://github.com/jupyterhub/nbgitpuller-downloader-plugins
classifiers =
Programming Language :: Python :: 3
License :: OSI Approved :: MIT License
Operating System :: OS Independent

[options]
package_dir =
= src
packages = find:
python_requires = >=3.8

[options.packages.find]
where = src

[options.entry_points]
nbgitpuller =
dropbox=nbgitpuller_downloader_dropbox.dropbox_downloader
generic_web=nbgitpuller_downloader_generic_web.generic_web_downloader
googledrive=nbgitpuller_downloader_googledrive.googledrive_downloader
Empty file.
20 changes: 20 additions & 0 deletions src/nbgitpuller_downloader_dropbox/dropbox_downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from nbgitpuller_downloader_plugins_util.plugin_helper import HandleFilesHelper
from nbgitpuller.plugin_hook_specs import hookimpl
import asyncio


@hookimpl
def handle_files(helper_args, query_line_args):
    """
    Download a compressed archive from Dropbox on behalf of nbgitpuller.

    :param dict helper_args: carries the progress-message function
        (helper_args["progress_func"]), the queue it drains
        (helper_args["download_q"]), and the coroutine factory
        helper_args["wait_for_sync_progress_queue"]
    :param dict query_line_args: all the arguments included on the nbgitpuller URL
    :return two parameter json unzip_dir and origin_repo_path
    :rtype json object
    """
    # Dropbox share links end in dl=0 (preview page); dl=1 forces a direct download.
    query_line_args["repo"] = query_line_args["repo"].replace("dl=0", "dl=1")
    event_loop = asyncio.get_event_loop()
    file_helper = HandleFilesHelper(helper_args, query_line_args)
    gathered = asyncio.gather(
        file_helper.handle_files_helper(),
        helper_args["wait_for_sync_progress_queue"](),
    )
    download_result, _ = event_loop.run_until_complete(gathered)
    return download_result
Empty file.
19 changes: 19 additions & 0 deletions src/nbgitpuller_downloader_generic_web/generic_web_downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from nbgitpuller_downloader_plugins_util.plugin_helper import HandleFilesHelper
from nbgitpuller.plugin_hook_specs import hookimpl
import asyncio

@hookimpl
def handle_files(helper_args, query_line_args):
    """
    Download a compressed archive from a generic, publicly reachable web URL
    on behalf of nbgitpuller.

    :param dict helper_args: carries the progress-message function
        (helper_args["progress_func"]), the queue it drains
        (helper_args["download_q"]), and the coroutine factory
        helper_args["wait_for_sync_progress_queue"]
    :param dict query_line_args: all the arguments included on the nbgitpuller URL
    :return two parameter json unzip_dir and origin_repo_path
    :rtype json object
    """
    event_loop = asyncio.get_event_loop()
    file_helper = HandleFilesHelper(helper_args, query_line_args)
    gathered = asyncio.gather(
        file_helper.handle_files_helper(),
        helper_args["wait_for_sync_progress_queue"](),
    )
    download_result, _ = event_loop.run_until_complete(gathered)
    return download_result
Empty file.
145 changes: 145 additions & 0 deletions src/nbgitpuller_downloader_googledrive/googledrive_downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
from nbgitpuller.plugin_hook_specs import hookimpl
from nbgitpuller_downloader_plugins_util.plugin_helper import HandleFilesHelper
import re
import asyncio
import aiohttp

DOWNLOAD_URL = "https://docs.google.com/uc?export=download"


@hookimpl
def handle_files(helper_args, query_line_args):
    """
    Hand off a Google Drive download to nbgitpuller's handle_files_helper.

    Google Drive URLs do not encode the file name, so the archive type
    (zip, tar.gz, ...) cannot be read from the link itself. We first issue a
    request to Drive, read the extension out of the response headers, then
    install a Drive-specific download function before delegating to
    nbgitpuller.

    :param dict helper_args: carries the progress-message function
        (helper_args["progress_func"]), the queue it drains
        (helper_args["download_q"]), and the coroutine factory
        helper_args["wait_for_sync_progress_queue"]
    :param dict query_line_args: all the arguments included on the nbgitpuller URL
    :return two parameter json output_dir and origin_repo_path
    :rtype json object
    """
    event_loop = asyncio.get_event_loop()
    repo = query_line_args["repo"]
    helper_args["download_q"].put_nowait("Determining type of archive...\n")
    drive_response = event_loop.run_until_complete(
        get_response_from_drive(DOWNLOAD_URL, get_id(repo))
    )
    extension = determine_file_extension_from_response(drive_response)
    helper_args["download_q"].put_nowait(f"Archive is: {extension}\n")
    helper_args["extension"] = extension
    helper_args["download_func"] = download_archive_for_google

    file_helper = HandleFilesHelper(helper_args, query_line_args)
    gathered = asyncio.gather(
        file_helper.handle_files_helper(),
        helper_args["wait_for_sync_progress_queue"](),
    )
    download_result, _ = event_loop.run_until_complete(gathered)
    return download_result


def get_id(repo):
    """
    Extract the Google Drive file id from a sharing URL.

    Expected form: https://drive.google.com/file/d/<file-id>/view?usp=sharing
    Links that omit the trailing "/view..." segment are tolerated: the id then
    runs to the end of the URL (the original raised ValueError for those).

    :param str repo: the url to the compressed file containing the google id
    :return the google drive id of the file to be downloaded
    :rtype str
    :raises ValueError: if the URL contains no "d/" marker
    """
    start_id_index = repo.index("d/") + 2
    # Search only after the id start so a "/view" earlier in the URL
    # cannot truncate the id.
    end_id_index = repo.find("/view", start_id_index)
    if end_id_index == -1:
        end_id_index = len(repo)
    return repo[start_id_index:end_id_index]


def get_confirm_token(session, url):
    """
    Look for Google's large-file confirmation cookie.

    When a file is too big to virus-scan, Google answers with a confirmation
    dialog and sets a 'download_warning...' cookie whose value must be sent
    back to complete the download.

    :param aiohttp.ClientSession session: used to the get the cookies from the response
    :param str url: filters the session's cookie jar to the relevant cookies
    :return the cookie value if found, otherwise None
    :rtype str
    """
    jar = session.cookie_jar.filter_cookies(url)
    for name, cookie in jar.items():
        if name.startswith('download_warning'):
            return cookie
    return None


async def download_archive_for_google(repo=None, temp_download_file=None):
    """
    Request the file from the repo (url) given and save it to disk, yielding
    progress messages. This is executed in plugin_helper.py; the parameters
    match the standard download_archive signature used there, and
    plugin_helper passes temp_download_file in.

    Fixes over the original:
    - the confirm re-request now goes to DOWNLOAD_URL (consistent with
      get_response_from_drive); the original re-requested `repo`, which is the
      HTML viewer page, not the file
    - the re-requested response is closed instead of leaked
    - removed `except Exception as e: raise e` (a no-op that truncated the
      traceback) and the shadowing of the `id` builtin
    - progress is reported every 1024 chunks of 1 KiB, i.e. per real MiB

    :param str repo: the Google Drive sharing URL of the archive
    :param str temp_download_file: the path to save the requested file to
    """
    yield "Downloading archive ...\n"
    file_id = get_id(repo)
    chunk_size = 1024
    async with aiohttp.ClientSession() as session:
        response = await session.get(DOWNLOAD_URL, params={'id': file_id})
        try:
            token = get_confirm_token(session, repo)
            if token:
                # Large file: repeat the request with the confirm token.
                response.close()
                response = await session.get(
                    DOWNLOAD_URL, params={'id': file_id, 'confirm': token}
                )
            with open(temp_download_file, 'ab') as fd:
                count_chunks = 0
                while True:
                    chunk = await response.content.read(chunk_size)
                    if not chunk:
                        break
                    fd.write(chunk)
                    count_chunks += 1
                    if count_chunks % 1024 == 0:
                        display = count_chunks // 1024
                        yield f"Downloading Progress ... {display}MB\n"
        finally:
            response.close()
    yield "Archive Downloaded....\n"


async def get_response_from_drive(url, id):
    """
    Request *url* for the given Google Drive file id, handling the extra
    confirmation round-trip Google requires for files too large to
    virus-scan (larger than roughly 100MB): if a 'download_warning' cookie is
    present, the request is repeated with 'confirm=<token>' as a query
    parameter.

    NOTE(review): the response is returned after the ``async with
    ClientSession`` block exits, so the connection is already closed by the
    time the caller sees it — only the headers (e.g. content-disposition) are
    safely readable, which is all the caller uses. Confirm before reading the
    body from this response.

    The unreachable duplicate ``return response`` after the ``async with``
    block in the original has been removed; the parameter name ``id``
    (which shadows the builtin) is kept for interface compatibility.

    :param str url: the google download URL
    :param str id: the google id of the file to download
    :return response object (headers usable; body not readable)
    :rtype aiohttp.ClientResponse
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url, params={'id': id}) as response:
            token = get_confirm_token(session, url)
            if token:
                params = {'id': id, 'confirm': token}
                response = await session.get(url, params=params)
            return response


def determine_file_extension_from_response(response):
    """
    Retrieve the file extension from the download response.

    Google Drive URLs do not carry the file name, so the extension is read
    from the content-disposition header (e.g. filename="notebooks.zip").

    Fixes over the original: when the header is missing, the regex does not
    match, or the filename has no dot, the original raised NameError or
    IndexError instead of the intended Exception; all three paths now raise
    the descriptive Exception.

    :param response: the response object from the download
    :return the extension indicating the file compression(e.g. zip, tgz)
    :rtype str
    :raises Exception: if no extension can be determined from the header
    """
    content_disposition = response.headers.get('content-disposition')
    ext = None
    if content_disposition:
        # Matches both `filename=` and the RFC 5987 `filename*=` form.
        matches = re.findall("filename\\*?=([^;]+)", content_disposition)
        if matches:
            fname = matches[0].strip().strip('"')
            parts = fname.split(".")
            if len(parts) > 1:
                # First segment after the base name, preserving the original
                # behavior (e.g. "tar" for "foo.tar.gz").
                ext = parts[1]
    if ext is None:
        m = f"Could not determine compression type of: {content_disposition}"
        raise Exception(m)
    return ext
Empty file.
Loading

0 comments on commit bb2a30a

Please sign in to comment.