Skip to content

Commit

Permalink
Merge pull request #98 from kbaseapps/dev-build_test_fix
Browse files Browse the repository at this point in the history
Fix build and tests
  • Loading branch information
MrCreosote authored Mar 13, 2024
2 parents ee76705 + 3886f57 commit 2fce573
Show file tree
Hide file tree
Showing 8 changed files with 103 additions and 62 deletions.
8 changes: 5 additions & 3 deletions .github/workflows/kb_sdk_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,10 @@ jobs:
- name: Run tests
if: "!contains(github.event.head_commit.message, 'skip ci')"
shell: bash
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
run: |
sh $GITHUB_WORKSPACE/kb_sdk_actions/bin/kb-sdk test
bash <(curl -s https://codecov.io/bash)
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
with:
fail_ci_if_error: true
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,6 @@ test_local
sdk.cfg
/run_local/
*.py.bak-*
/.settings/
/.project
/.pydevproject
30 changes: 17 additions & 13 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,24 +1,28 @@
FROM kbase/sdkbase2:python
FROM kbase/sdkpython:3.8.10
MAINTAINER KBase Developer
# -----------------------------------------

# Insert apt-get instructions here to install
# any required dependencies for your module.

# RUN apt-get update
RUN apt update

RUN pip install semver \
&& ( [ $(pip show filemagic|grep -c filemagic) -eq 0 ] || pip uninstall -y filemagic ) \
&& pip install python-magic \
&& pip install ftputil \
&& pip install ipython==5.3.0 \
&& pip install pyftpdlib==1.5.6 \
&& sudo apt-get install nano
# -----------------------------------------
RUN apt install -y nano pigz wget libmagic-dev

RUN pip uninstall -y filemagic

RUN sudo apt-get update
RUN sudo apt-get install pigz wget
RUN pip install bz2file
# bz2file hasn't been updated in 10 years
RUN pip install \
requests==2.31.0 \
requests_toolbelt==1.0.0 \
semver==3.0.2 \
python-magic==0.4.27 \
ftputil==5.1.0 \
ipython==5.3.0 \
bz2file==0.98 \
pyftpdlib==1.5.6

# -----------------------------------------

COPY ./ /kb/module
RUN mkdir -p /kb/module/work
Expand Down
3 changes: 1 addition & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,7 @@ build-test-script:
echo 'export KB_AUTH_TOKEN=`cat /kb/module/work/token`' >> $(TEST_DIR)/$(TEST_SCRIPT_NAME)
echo 'export PYTHONPATH=$$script_dir/../$(LIB_DIR):$$PATH:$$PYTHONPATH' >> $(TEST_DIR)/$(TEST_SCRIPT_NAME)
echo 'cd $$script_dir/../$(TEST_DIR)' >> $(TEST_DIR)/$(TEST_SCRIPT_NAME)
echo 'python -u -m unittest discover -p "*_test.py"' >> $(TEST_DIR)/$(TEST_SCRIPT_NAME)
echo 'python -m nose --with-coverage --cover-package=$(SERVICE_CAPS) --cover-html --cover-html-dir=/kb/module/work/test_coverage --cover-xml --cover-xml-file=/kb/module/work/test_coverage/coverage.xml --nocapture --nologcapture .' >> $(TEST_DIR)/$(TEST_SCRIPT_NAME)
echo 'python -m nose --with-coverage --cover-package=$(SERVICE_CAPS) --cover-html --cover-html-dir=/kb/module/work/test_coverage --cover-xml --cover-xml-file=/kb/module/work/test_coverage/coverage.xml .' >> $(TEST_DIR)/$(TEST_SCRIPT_NAME)
chmod +x $(TEST_DIR)/$(TEST_SCRIPT_NAME)

test:
Expand Down
59 changes: 28 additions & 31 deletions lib/DataFileUtil/DataFileUtilImpl.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import subprocess
import tarfile
import tempfile
import urllib.parse as urlparse
import uuid
import zipfile

Expand Down Expand Up @@ -460,16 +461,6 @@ def _download_dropbox_link(self, file_url):

return copy_file_path

def _get_google_confirm_token(self, response):
"""
_get_google_confirm_token: get Google drive confirm token for large file
"""
for key, value in response.cookies.items():
if key.startswith('download_warning'):
return value

return None

def _download_google_drive_to_file(self, file_url, cookies=None):
"""
_download_google_drive_to_file: download url content to file
Expand Down Expand Up @@ -504,36 +495,42 @@ def _download_google_drive_link(self, file_url):
file_url: Google Drive download link
"""
# TODO all this should be rewritten with the google drive libs, super fragile as is
if not file_url.startswith('https://drive.google.com/'):
raise ValueError('Invalid Google Drive Link: {}'.format(file_url))
raise ValueError(f'Invalid Google Drive Link: {file_url}')
log(f'Connecting Google Drive link: {file_url}')

log('Connecting Google Drive link: {}'.format(file_url))
# translate Google Drive URL for direct download
force_download_link_prefix = 'https://docs.google.com/uc?export=download'

if file_url.find('drive.google.com/file/d/') != -1:
force_download_link_prefix = 'https://drive.usercontent.google.com/download?'
parsedurl = urlparse.urlparse(file_url)
query = {}
if parsedurl.query:
# should we catch an error here?
query = urlparse.parse_qs(parsedurl.query, strict_parsing=True)

if query.get("id"):
file_id = query["id"][0] # assume only 1 id
elif file_url.find('drive.google.com/file/d/') != -1:
file_id = file_url.partition('/d/')[-1].partition('/')[0]
elif file_url.find('drive.google.com/open?id=') != -1:
file_id = file_url.partition('id=')[-1]
else:
error_msg = 'Unexpected Google Drive share link.\n'
error_msg += 'URL: {}'.format(file_url)
raise ValueError(error_msg)

# This deliberately drops other query parameters... which may or may not be a good idea.
# The URLs returned from Google have other parameters that aren't needed here
# https://gist.github.com/tanaikech/f0f2d122e05bf5f971611258c22c110f#updated-at-january-21-2024
new_query = {"id": file_id, "export": "download", "confirm": "x"}
if query.get("resourcekey"):
new_query["resourcekey"] = query["resourcekey"][0] # assume only 1 key
querystr = "&".join([f"{k}={v}" for k, v in new_query.items()])
force_download_link = force_download_link_prefix + querystr

log('Generating Google Drive direct download link\n' +
' from: {}\n to: {}'.format(file_url, force_download_link))

force_download_link = force_download_link_prefix + '&id={}'.format(file_id)

with requests.Session() as session:
response = session.get(force_download_link)
confirm_token = self._get_google_confirm_token(response)

if confirm_token:
force_download_link = force_download_link_prefix + '&confirm={}&id={}'.format(confirm_token, file_id)

log('Generating Google Drive direct download link\n' +
' from: {}\n to: {}'.format(file_url, force_download_link))

copy_file_path = self._download_google_drive_to_file(force_download_link, response.cookies)
copy_file_path = self._unpack(copy_file_path, True)
copy_file_path = self._download_google_drive_to_file(force_download_link)
copy_file_path = self._unpack(copy_file_path, True)

return copy_file_path

Expand Down
20 changes: 18 additions & 2 deletions lib/DataFileUtil/utils/retrieve_filename.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
"""
Fetch the file name from a remote URL.
"""
from email.message import Message
from requests.cookies import RequestsCookieJar
from pathlib import Path
from typing import Optional
from urllib.parse import urlparse
from uuid import uuid4
Expand All @@ -12,6 +14,9 @@
from DataFileUtil.implementation import log


_CONTENT_DISPOSITION = 'content-disposition'


def retrieve_filename(file_url: str, cookies: Optional[RequestsCookieJar] = None) -> str:
"""
Fetch the file name from a URL using the Content-Disposition header,
Expand All @@ -26,15 +31,17 @@ def retrieve_filename(file_url: str, cookies: Optional[RequestsCookieJar] = None
"""
try:
# Fetch the headers
            # Not sure how expensive this is for us and the remote host, but head doesn't
# get the content dispo header
with requests.get(file_url, cookies=cookies, stream=True) as response:
try:
content_disposition = response.headers['content-disposition']
content_disposition = response.headers[_CONTENT_DISPOSITION]
except KeyError:
log('Parsing file name directly from URL')
url = urlparse(file_url)
file_name = os.path.basename(url.path)
else:
file_name = content_disposition.split('filename="')[-1].split('";')[0]
file_name = _get_filename_from_header(response)
except Exception as error:
error_msg = 'Cannot connect to URL: {}\n'.format(file_url)
error_msg += 'Exception: {}'.format(error)
Expand All @@ -51,3 +58,12 @@ def retrieve_filename(file_url: str, cookies: Optional[RequestsCookieJar] = None
file_name = str(uuid4())
log(f'Retrieved file name: {file_name}')
return file_name


def _get_filename_from_header(response):
# https://stackoverflow.com/a/78073510/643675
# https://peps.python.org/pep-0594/#cgi
m = Message()
m[_CONTENT_DISPOSITION] = response.headers[_CONTENT_DISPOSITION]
return Path(m.get_filename()).name

22 changes: 11 additions & 11 deletions test/DataFileUtil_server_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,7 +512,7 @@ def test_upload_tgz_with_no_dir(self):
os.chdir(wd)
self.assertEqual(ret1['node_file_name'], 'target.tar.gz')
self.assertGreater(ret1['size'], 220)
self.assertLess(ret1['size'], 240)
self.assertLess(ret1['size'], 260)
shock_id = ret1['shock_id']
file_path2 = os.path.join(tmp_dir, 'output.tgz')
ret2 = self.impl.shock_to_file(
Expand All @@ -523,7 +523,7 @@ def test_upload_tgz_with_no_dir(self):
self.assertIsNone(ret2['attributes'])
self.assertEqual(ret2['file_path'], file_path2)
self.assertGreater(ret2['size'], 220)
self.assertLess(ret2['size'], 240)
self.assertLess(ret2['size'], 260)
with tarfile.open(file_path2) as t:
self.assertEqual(set(t.getnames()),
set(['.', './intar1.txt', './intar2.txt']))
Expand Down Expand Up @@ -569,7 +569,7 @@ def test_upload_tgz_with_no_file_name(self):
'pack': 'targz'})[0]
self.assertEqual(ret1['node_file_name'], 'tartest2.tar.gz')
self.assertGreater(ret1['size'], 220)
self.assertLess(ret1['size'], 240)
self.assertLess(ret1['size'], 260)
shock_id = ret1['shock_id']
file_path2 = os.path.join(tmp_dir, 'output.tgz')
ret2 = self.impl.shock_to_file(
Expand All @@ -580,7 +580,7 @@ def test_upload_tgz_with_no_file_name(self):
self.assertIsNone(ret2['attributes'])
self.assertEqual(ret2['file_path'], file_path2)
self.assertGreater(ret1['size'], 220)
self.assertLess(ret1['size'], 240)
self.assertLess(ret1['size'], 260)
with tarfile.open(file_path2) as t:
self.assertEqual(set(t.getnames()),
set(['.', './intar1.txt', './intar2.txt']))
Expand Down Expand Up @@ -1671,8 +1671,8 @@ def test_download_dropbox_link_archive_file(self):

def test_download_google_drive_link_uncompress_file(self):
# google drive link of 'file1.txt'
file_url = 'https://drive.google.com/open?'
file_url += 'id=0B0exSa7ebQ0qX01mZ3FaRzhuMDQ'
file_url = ("https://drive.google.com/open?id=0B0exSa7ebQ0qX01mZ3FaRzhuMDQ"
+ "&resourcekey=0-eeP2JBzYnbKaFVncybSA0Q")
params = {
'download_type': 'Google Drive',
'file_url': file_url
Expand All @@ -1687,8 +1687,8 @@ def test_download_google_drive_link_uncompress_file(self):

def test_download_google_drive_link_compress_file(self):
# google drive link of 'file1.txt.gzip'
file_url = 'https://drive.google.com/file/d/'
file_url += '0B0exSa7ebQ0qU1U5YmxMRktkbmc/view?usp=sharing'
file_url = ("https://drive.google.com/file/d/0B0exSa7ebQ0qU1U5YmxMRktkbmc"
+ "/view?usp=drive_link&resourcekey=0-YXZfeTbInLOV0hkGVx92AA")
params = {
'download_type': 'Google Drive',
'file_url': file_url
Expand All @@ -1702,7 +1702,7 @@ def test_download_google_drive_link_compress_file(self):
os.stat(ret1['copy_file_path']).st_size)

def test_download_google_drive_link_large_uncompress_file(self):
file_url = 'https://drive.google.com/open?id=1WBni9AcU7AHWY72amXBg7Dxb0d2RUGch'
file_url = "https://drive.google.com/open?id=1WBni9AcU7AHWY72amXBg7Dxb0d2RUGch"
params = {
'download_type': 'Google Drive',
'file_url': file_url
Expand All @@ -1717,8 +1717,8 @@ def test_download_google_drive_link_large_uncompress_file(self):

def test_download_google_drive_link_archive_file(self):
# google drive link of 'zip1.zip'
file_url = 'https://drive.google.com/open?'
file_url += 'id=0B0exSa7ebQ0qcEhRaDJoVDJSTkk'
file_url = ("https://drive.google.com/file/d/0B0exSa7ebQ0qcEhRaDJoVDJSTkk"
+ "/view?usp=drive_link&resourcekey=0-wTnouG2x1Yb0bUm08rG1KA")
params = {
'download_type': 'Google Drive',
'file_url': file_url
Expand Down
20 changes: 20 additions & 0 deletions test/test_retrieve_filename.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,23 @@ def test_filename_truncation_no_ext(self):
expected_fn = given_fn[0:255]
fn = retrieve_filename(url)
self.assertEqual(fn, expected_fn)

def test_filename_google(self):
"""
Test a google downloaded file, which has a different content disposition header
than box.com and broke an older implementation of the header parser.
"""
url = "https://docs.google.com/uc?export=download&id=1iPE1Mw6m91ONfm-Z4WpxDGTXX_BKADH8"
expected_filename = "Sample1.fastq.gz"
fn = retrieve_filename(url)
self.assertEqual(fn, expected_filename)

@unittest.skip("Eventually redirects to a 200 auth page. Need to do something very different")
def test_filename_google_inaccessible(self):
"""
Test a non-public google downloaded file
"""
url = "https://docs.google.com/uc?export=download&id=197Bp6PuiEiuCvOUulvo8jQiiv2bhQj9o"
expected_filename = "Sample1.fastq.gz"
fn = retrieve_filename(url)
self.assertEqual(fn, expected_filename)

0 comments on commit 2fce573

Please sign in to comment.