Skip to content

Commit

Permalink
Merge pull request #98 from kbaseapps/dev-build_test_fix
Browse files Browse the repository at this point in the history
Fix build and tests
  • Loading branch information
MrCreosote authored Mar 13, 2024
2 parents ee76705 + 3886f57 commit 2fce573
Show file tree
Hide file tree
Showing 8 changed files with 103 additions and 62 deletions.
8 changes: 5 additions & 3 deletions .github/workflows/kb_sdk_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,10 @@ jobs:
- name: Run tests
if: "!contains(github.event.head_commit.message, 'skip ci')"
shell: bash
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
run: |
sh $GITHUB_WORKSPACE/kb_sdk_actions/bin/kb-sdk test
bash <(curl -s https://codecov.io/bash)
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
with:
fail_ci_if_error: true
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,6 @@ test_local
sdk.cfg
/run_local/
*.py.bak-*
/.settings/
/.project
/.pydevproject
30 changes: 17 additions & 13 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,24 +1,28 @@
FROM kbase/sdkbase2:python
FROM kbase/sdkpython:3.8.10
MAINTAINER KBase Developer
# -----------------------------------------

# Insert apt-get instructions here to install
# any required dependencies for your module.

# RUN apt-get update
RUN apt update

RUN pip install semver \
&& ( [ $(pip show filemagic|grep -c filemagic) -eq 0 ] || pip uninstall -y filemagic ) \
&& pip install python-magic \
&& pip install ftputil \
&& pip install ipython==5.3.0 \
&& pip install pyftpdlib==1.5.6 \
&& sudo apt-get install nano
# -----------------------------------------
RUN apt install -y nano pigz wget libmagic-dev

RUN pip uninstall -y filemagic

RUN sudo apt-get update
RUN sudo apt-get install pigz wget
RUN pip install bz2file
# bz2file hasn't been updated in 10 years
RUN pip install \
requests==2.31.0 \
requests_toolbelt==1.0.0 \
semver==3.0.2 \
python-magic==0.4.27 \
ftputil==5.1.0 \
ipython==5.3.0 \
bz2file==0.98 \
pyftpdlib==1.5.6

# -----------------------------------------

COPY ./ /kb/module
RUN mkdir -p /kb/module/work
Expand Down
3 changes: 1 addition & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,7 @@ build-test-script:
echo 'export KB_AUTH_TOKEN=`cat /kb/module/work/token`' >> $(TEST_DIR)/$(TEST_SCRIPT_NAME)
echo 'export PYTHONPATH=$$script_dir/../$(LIB_DIR):$$PATH:$$PYTHONPATH' >> $(TEST_DIR)/$(TEST_SCRIPT_NAME)
echo 'cd $$script_dir/../$(TEST_DIR)' >> $(TEST_DIR)/$(TEST_SCRIPT_NAME)
echo 'python -u -m unittest discover -p "*_test.py"' >> $(TEST_DIR)/$(TEST_SCRIPT_NAME)
echo 'python -m nose --with-coverage --cover-package=$(SERVICE_CAPS) --cover-html --cover-html-dir=/kb/module/work/test_coverage --cover-xml --cover-xml-file=/kb/module/work/test_coverage/coverage.xml --nocapture --nologcapture .' >> $(TEST_DIR)/$(TEST_SCRIPT_NAME)
echo 'python -m nose --with-coverage --cover-package=$(SERVICE_CAPS) --cover-html --cover-html-dir=/kb/module/work/test_coverage --cover-xml --cover-xml-file=/kb/module/work/test_coverage/coverage.xml .' >> $(TEST_DIR)/$(TEST_SCRIPT_NAME)
chmod +x $(TEST_DIR)/$(TEST_SCRIPT_NAME)

test:
Expand Down
59 changes: 28 additions & 31 deletions lib/DataFileUtil/DataFileUtilImpl.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import subprocess
import tarfile
import tempfile
import urllib.parse as urlparse
import uuid
import zipfile

Expand Down Expand Up @@ -460,16 +461,6 @@ def _download_dropbox_link(self, file_url):

return copy_file_path

def _get_google_confirm_token(self, response):
"""
_get_google_confirm_token: get Google drive confirm token for large file
"""
for key, value in response.cookies.items():
if key.startswith('download_warning'):
return value

return None

def _download_google_drive_to_file(self, file_url, cookies=None):
"""
_download_google_drive_to_file: download url content to file
Expand Down Expand Up @@ -504,36 +495,42 @@ def _download_google_drive_link(self, file_url):
file_url: Google Drive download link
"""
# TODO all this should be rewritten with the google drive libs, super fragile as is
if not file_url.startswith('https://drive.google.com/'):
raise ValueError('Invalid Google Drive Link: {}'.format(file_url))
raise ValueError(f'Invalid Google Drive Link: {file_url}')
log(f'Connecting Google Drive link: {file_url}')

log('Connecting Google Drive link: {}'.format(file_url))
# translate Google Drive URL for direct download
force_download_link_prefix = 'https://docs.google.com/uc?export=download'

if file_url.find('drive.google.com/file/d/') != -1:
force_download_link_prefix = 'https://drive.usercontent.google.com/download?'
parsedurl = urlparse.urlparse(file_url)
query = {}
if parsedurl.query:
# should we catch an error here?
query = urlparse.parse_qs(parsedurl.query, strict_parsing=True)

if query.get("id"):
file_id = query["id"][0] # assume only 1 id
elif file_url.find('drive.google.com/file/d/') != -1:
file_id = file_url.partition('/d/')[-1].partition('/')[0]
elif file_url.find('drive.google.com/open?id=') != -1:
file_id = file_url.partition('id=')[-1]
else:
error_msg = 'Unexpected Google Drive share link.\n'
error_msg += 'URL: {}'.format(file_url)
raise ValueError(error_msg)

# This deliberately drops other query parameters... which may or may not be a good idea.
# The URLs returned from Google have other parameters that aren't needed here
# https://gist.github.com/tanaikech/f0f2d122e05bf5f971611258c22c110f#updated-at-january-21-2024
new_query = {"id": file_id, "export": "download", "confirm": "x"}
if query.get("resourcekey"):
new_query["resourcekey"] = query["resourcekey"][0] # assume only 1 key
querystr = "&".join([f"{k}={v}" for k, v in new_query.items()])
force_download_link = force_download_link_prefix + querystr

log('Generating Google Drive direct download link\n' +
' from: {}\n to: {}'.format(file_url, force_download_link))

force_download_link = force_download_link_prefix + '&id={}'.format(file_id)

with requests.Session() as session:
response = session.get(force_download_link)
confirm_token = self._get_google_confirm_token(response)

if confirm_token:
force_download_link = force_download_link_prefix + '&confirm={}&id={}'.format(confirm_token, file_id)

log('Generating Google Drive direct download link\n' +
' from: {}\n to: {}'.format(file_url, force_download_link))

copy_file_path = self._download_google_drive_to_file(force_download_link, response.cookies)
copy_file_path = self._unpack(copy_file_path, True)
copy_file_path = self._download_google_drive_to_file(force_download_link)
copy_file_path = self._unpack(copy_file_path, True)

return copy_file_path

Expand Down
20 changes: 18 additions & 2 deletions lib/DataFileUtil/utils/retrieve_filename.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
"""
Fetch the file name from a remote URL.
"""
from email.message import Message
from requests.cookies import RequestsCookieJar
from pathlib import Path
from typing import Optional
from urllib.parse import urlparse
from uuid import uuid4
Expand All @@ -12,6 +14,9 @@
from DataFileUtil.implementation import log


_CONTENT_DISPOSITION = 'content-disposition'


def retrieve_filename(file_url: str, cookies: Optional[RequestsCookieJar] = None) -> str:
"""
Fetch the file name from a URL using the Content-Disposition header,
Expand All @@ -26,15 +31,17 @@ def retrieve_filename(file_url: str, cookies: Optional[RequestsCookieJar] = None
"""
try:
# Fetch the headers
            # Not sure how expensive this is for us and the remote host, but head doesn't
# get the content dispo header
with requests.get(file_url, cookies=cookies, stream=True) as response:
try:
content_disposition = response.headers['content-disposition']
content_disposition = response.headers[_CONTENT_DISPOSITION]
except KeyError:
log('Parsing file name directly from URL')
url = urlparse(file_url)
file_name = os.path.basename(url.path)
else:
file_name = content_disposition.split('filename="')[-1].split('";')[0]
file_name = _get_filename_from_header(response)
except Exception as error:
error_msg = 'Cannot connect to URL: {}\n'.format(file_url)
error_msg += 'Exception: {}'.format(error)
Expand All @@ -51,3 +58,12 @@ def retrieve_filename(file_url: str, cookies: Optional[RequestsCookieJar] = None
file_name = str(uuid4())
log(f'Retrieved file name: {file_name}')
return file_name


def _get_filename_from_header(response):
# https://stackoverflow.com/a/78073510/643675
# https://peps.python.org/pep-0594/#cgi
m = Message()
m[_CONTENT_DISPOSITION] = response.headers[_CONTENT_DISPOSITION]
return Path(m.get_filename()).name

22 changes: 11 additions & 11 deletions test/DataFileUtil_server_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,7 +512,7 @@ def test_upload_tgz_with_no_dir(self):
os.chdir(wd)
self.assertEqual(ret1['node_file_name'], 'target.tar.gz')
self.assertGreater(ret1['size'], 220)
self.assertLess(ret1['size'], 240)
self.assertLess(ret1['size'], 260)
shock_id = ret1['shock_id']
file_path2 = os.path.join(tmp_dir, 'output.tgz')
ret2 = self.impl.shock_to_file(
Expand All @@ -523,7 +523,7 @@ def test_upload_tgz_with_no_dir(self):
self.assertIsNone(ret2['attributes'])
self.assertEqual(ret2['file_path'], file_path2)
self.assertGreater(ret2['size'], 220)
self.assertLess(ret2['size'], 240)
self.assertLess(ret2['size'], 260)
with tarfile.open(file_path2) as t:
self.assertEqual(set(t.getnames()),
set(['.', './intar1.txt', './intar2.txt']))
Expand Down Expand Up @@ -569,7 +569,7 @@ def test_upload_tgz_with_no_file_name(self):
'pack': 'targz'})[0]
self.assertEqual(ret1['node_file_name'], 'tartest2.tar.gz')
self.assertGreater(ret1['size'], 220)
self.assertLess(ret1['size'], 240)
self.assertLess(ret1['size'], 260)
shock_id = ret1['shock_id']
file_path2 = os.path.join(tmp_dir, 'output.tgz')
ret2 = self.impl.shock_to_file(
Expand All @@ -580,7 +580,7 @@ def test_upload_tgz_with_no_file_name(self):
self.assertIsNone(ret2['attributes'])
self.assertEqual(ret2['file_path'], file_path2)
self.assertGreater(ret1['size'], 220)
self.assertLess(ret1['size'], 240)
self.assertLess(ret1['size'], 260)
with tarfile.open(file_path2) as t:
self.assertEqual(set(t.getnames()),
set(['.', './intar1.txt', './intar2.txt']))
Expand Down Expand Up @@ -1671,8 +1671,8 @@ def test_download_dropbox_link_archive_file(self):

def test_download_google_drive_link_uncompress_file(self):
# google drive link of 'file1.txt'
file_url = 'https://drive.google.com/open?'
file_url += 'id=0B0exSa7ebQ0qX01mZ3FaRzhuMDQ'
file_url = ("https://drive.google.com/open?id=0B0exSa7ebQ0qX01mZ3FaRzhuMDQ"
+ "&resourcekey=0-eeP2JBzYnbKaFVncybSA0Q")
params = {
'download_type': 'Google Drive',
'file_url': file_url
Expand All @@ -1687,8 +1687,8 @@ def test_download_google_drive_link_uncompress_file(self):

def test_download_google_drive_link_compress_file(self):
# google drive link of 'file1.txt.gzip'
file_url = 'https://drive.google.com/file/d/'
file_url += '0B0exSa7ebQ0qU1U5YmxMRktkbmc/view?usp=sharing'
file_url = ("https://drive.google.com/file/d/0B0exSa7ebQ0qU1U5YmxMRktkbmc"
+ "/view?usp=drive_link&resourcekey=0-YXZfeTbInLOV0hkGVx92AA")
params = {
'download_type': 'Google Drive',
'file_url': file_url
Expand All @@ -1702,7 +1702,7 @@ def test_download_google_drive_link_compress_file(self):
os.stat(ret1['copy_file_path']).st_size)

def test_download_google_drive_link_large_uncompress_file(self):
file_url = 'https://drive.google.com/open?id=1WBni9AcU7AHWY72amXBg7Dxb0d2RUGch'
file_url = "https://drive.google.com/open?id=1WBni9AcU7AHWY72amXBg7Dxb0d2RUGch"
params = {
'download_type': 'Google Drive',
'file_url': file_url
Expand All @@ -1717,8 +1717,8 @@ def test_download_google_drive_link_large_uncompress_file(self):

def test_download_google_drive_link_archive_file(self):
# google drive link of 'zip1.zip'
file_url = 'https://drive.google.com/open?'
file_url += 'id=0B0exSa7ebQ0qcEhRaDJoVDJSTkk'
file_url = ("https://drive.google.com/file/d/0B0exSa7ebQ0qcEhRaDJoVDJSTkk"
+ "/view?usp=drive_link&resourcekey=0-wTnouG2x1Yb0bUm08rG1KA")
params = {
'download_type': 'Google Drive',
'file_url': file_url
Expand Down
20 changes: 20 additions & 0 deletions test/test_retrieve_filename.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,23 @@ def test_filename_truncation_no_ext(self):
expected_fn = given_fn[0:255]
fn = retrieve_filename(url)
self.assertEqual(fn, expected_fn)

def test_filename_google(self):
"""
Test a google downloaded file, which has a different content disposition header
than box.com and broke an older implementation of the header parser.
"""
url = "https://docs.google.com/uc?export=download&id=1iPE1Mw6m91ONfm-Z4WpxDGTXX_BKADH8"
expected_filename = "Sample1.fastq.gz"
fn = retrieve_filename(url)
self.assertEqual(fn, expected_filename)

@unittest.skip("Eventually redirects to a 200 auth page. Need to do something very different")
def test_filename_google_inaccessible(self):
"""
Test a non-public google downloaded file
"""
url = "https://docs.google.com/uc?export=download&id=197Bp6PuiEiuCvOUulvo8jQiiv2bhQj9o"
expected_filename = "Sample1.fastq.gz"
fn = retrieve_filename(url)
self.assertEqual(fn, expected_filename)

0 comments on commit 2fce573

Please sign in to comment.