Skip to content

Fix build and tests #98

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Mar 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions .github/workflows/kb_sdk_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,10 @@ jobs:
- name: Run tests
if: "!contains(github.event.head_commit.message, 'skip ci')"
shell: bash
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
run: |
sh $GITHUB_WORKSPACE/kb_sdk_actions/bin/kb-sdk test
bash <(curl -s https://codecov.io/bash)

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
with:
fail_ci_if_error: true
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,6 @@ test_local
sdk.cfg
/run_local/
*.py.bak-*
/.settings/
/.project
/.pydevproject
30 changes: 17 additions & 13 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,24 +1,28 @@
FROM kbase/sdkbase2:python
FROM kbase/sdkpython:3.8.10
MAINTAINER KBase Developer
# -----------------------------------------

# Insert apt-get instructions here to install
# any required dependencies for your module.

# RUN apt-get update
RUN apt update

RUN pip install semver \
&& ( [ $(pip show filemagic|grep -c filemagic) -eq 0 ] || pip uninstall -y filemagic ) \
&& pip install python-magic \
&& pip install ftputil \
&& pip install ipython==5.3.0 \
&& pip install pyftpdlib==1.5.6 \
&& sudo apt-get install nano
# -----------------------------------------
RUN apt install -y nano pigz wget libmagic-dev

RUN pip uninstall -y filemagic

RUN sudo apt-get update
RUN sudo apt-get install pigz wget
RUN pip install bz2file
# bz2file hasn't been updated in 10 years
RUN pip install \
requests==2.31.0 \
requests_toolbelt==1.0.0 \
semver==3.0.2 \
python-magic==0.4.27 \
ftputil==5.1.0 \
ipython==5.3.0 \
bz2file==0.98 \
pyftpdlib==1.5.6

# -----------------------------------------

COPY ./ /kb/module
RUN mkdir -p /kb/module/work
Expand Down
3 changes: 1 addition & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,7 @@ build-test-script:
echo 'export KB_AUTH_TOKEN=`cat /kb/module/work/token`' >> $(TEST_DIR)/$(TEST_SCRIPT_NAME)
echo 'export PYTHONPATH=$$script_dir/../$(LIB_DIR):$$PATH:$$PYTHONPATH' >> $(TEST_DIR)/$(TEST_SCRIPT_NAME)
echo 'cd $$script_dir/../$(TEST_DIR)' >> $(TEST_DIR)/$(TEST_SCRIPT_NAME)
echo 'python -u -m unittest discover -p "*_test.py"' >> $(TEST_DIR)/$(TEST_SCRIPT_NAME)
echo 'python -m nose --with-coverage --cover-package=$(SERVICE_CAPS) --cover-html --cover-html-dir=/kb/module/work/test_coverage --cover-xml --cover-xml-file=/kb/module/work/test_coverage/coverage.xml --nocapture --nologcapture .' >> $(TEST_DIR)/$(TEST_SCRIPT_NAME)
echo 'python -m nose --with-coverage --cover-package=$(SERVICE_CAPS) --cover-html --cover-html-dir=/kb/module/work/test_coverage --cover-xml --cover-xml-file=/kb/module/work/test_coverage/coverage.xml .' >> $(TEST_DIR)/$(TEST_SCRIPT_NAME)
chmod +x $(TEST_DIR)/$(TEST_SCRIPT_NAME)

test:
Expand Down
59 changes: 28 additions & 31 deletions lib/DataFileUtil/DataFileUtilImpl.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import subprocess
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to bump version as current release version is also 0.2.0

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, coming up next

import tarfile
import tempfile
import urllib.parse as urlparse
import uuid
import zipfile

Expand Down Expand Up @@ -460,16 +461,6 @@ def _download_dropbox_link(self, file_url):

return copy_file_path

def _get_google_confirm_token(self, response):
"""
_get_google_confirm_token: get Google drive confirm token for large file
"""
for key, value in response.cookies.items():
if key.startswith('download_warning'):
return value

return None

def _download_google_drive_to_file(self, file_url, cookies=None):
"""
_download_google_drive_to_file: download url content to file
Expand Down Expand Up @@ -504,36 +495,42 @@ def _download_google_drive_link(self, file_url):
file_url: Google Drive download link

"""
# TODO all this should be rewritten with the google drive libs, super fragile as is
if not file_url.startswith('https://drive.google.com/'):
raise ValueError('Invalid Google Drive Link: {}'.format(file_url))
raise ValueError(f'Invalid Google Drive Link: {file_url}')
log(f'Connecting Google Drive link: {file_url}')

log('Connecting Google Drive link: {}'.format(file_url))
# translate Google Drive URL for direct download
force_download_link_prefix = 'https://docs.google.com/uc?export=download'

if file_url.find('drive.google.com/file/d/') != -1:
force_download_link_prefix = 'https://drive.usercontent.google.com/download?'
parsedurl = urlparse.urlparse(file_url)
query = {}
if parsedurl.query:
# should we catch an error here?
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't have a strong opinion. Probably fine as is.

query = urlparse.parse_qs(parsedurl.query, strict_parsing=True)

if query.get("id"):
file_id = query["id"][0] # assume only 1 id
elif file_url.find('drive.google.com/file/d/') != -1:
file_id = file_url.partition('/d/')[-1].partition('/')[0]
elif file_url.find('drive.google.com/open?id=') != -1:
file_id = file_url.partition('id=')[-1]
else:
error_msg = 'Unexpected Google Drive share link.\n'
error_msg += 'URL: {}'.format(file_url)
raise ValueError(error_msg)

# This deliberately drops other query parameters... which may or may not be a good idea.
# The URLs returned from Google have other parameters that aren't needed here
# https://gist.github.com/tanaikech/f0f2d122e05bf5f971611258c22c110f#updated-at-january-21-2024
new_query = {"id": file_id, "export": "download", "confirm": "x"}
if query.get("resourcekey"):
new_query["resourcekey"] = query["resourcekey"][0] # assume only 1 key
querystr = "&".join([f"{k}={v}" for k, v in new_query.items()])
force_download_link = force_download_link_prefix + querystr

log('Generating Google Drive direct download link\n' +
' from: {}\n to: {}'.format(file_url, force_download_link))

force_download_link = force_download_link_prefix + '&id={}'.format(file_id)

with requests.Session() as session:
response = session.get(force_download_link)
confirm_token = self._get_google_confirm_token(response)

if confirm_token:
force_download_link = force_download_link_prefix + '&confirm={}&id={}'.format(confirm_token, file_id)

log('Generating Google Drive direct download link\n' +
' from: {}\n to: {}'.format(file_url, force_download_link))

copy_file_path = self._download_google_drive_to_file(force_download_link, response.cookies)
copy_file_path = self._unpack(copy_file_path, True)
copy_file_path = self._download_google_drive_to_file(force_download_link)
copy_file_path = self._unpack(copy_file_path, True)

return copy_file_path

Expand Down
20 changes: 18 additions & 2 deletions lib/DataFileUtil/utils/retrieve_filename.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
"""
Fetch the file name from a remote URL.
"""
from email.message import Message
from requests.cookies import RequestsCookieJar
from pathlib import Path
from typing import Optional
from urllib.parse import urlparse
from uuid import uuid4
Expand All @@ -12,6 +14,9 @@
from DataFileUtil.implementation import log


_CONTENT_DISPOSITION = 'content-disposition'


def retrieve_filename(file_url: str, cookies: Optional[RequestsCookieJar] = None) -> str:
"""
Fetch the file name from a URL using the Content-Disposition header,
Expand All @@ -26,15 +31,17 @@ def retrieve_filename(file_url: str, cookies: Optional[RequestsCookieJar] = None
"""
try:
# Fetch the headers
# Not sure how expensive this is for us and the remote host, but HEAD doesn't
# get the content dispo header
with requests.get(file_url, cookies=cookies, stream=True) as response:
try:
content_disposition = response.headers['content-disposition']
content_disposition = response.headers[_CONTENT_DISPOSITION]
except KeyError:
log('Parsing file name directly from URL')
url = urlparse(file_url)
file_name = os.path.basename(url.path)
else:
file_name = content_disposition.split('filename="')[-1].split('";')[0]
file_name = _get_filename_from_header(response)
except Exception as error:
error_msg = 'Cannot connect to URL: {}\n'.format(file_url)
error_msg += 'Exception: {}'.format(error)
Expand All @@ -51,3 +58,12 @@ def retrieve_filename(file_url: str, cookies: Optional[RequestsCookieJar] = None
file_name = str(uuid4())
log(f'Retrieved file name: {file_name}')
return file_name


def _get_filename_from_header(response):
    """Pull the file name out of a response's Content-Disposition header.

    Parses the header with email.message.Message, the recommended
    replacement for the removed cgi module:
    https://stackoverflow.com/a/78073510/643675
    https://peps.python.org/pep-0594/#cgi

    Assumes the header is present (callers check before calling) and that
    it carries a filename parameter.
    """
    header_value = response.headers[_CONTENT_DISPOSITION]
    msg = Message()
    msg[_CONTENT_DISPOSITION] = header_value
    # Path(...).name keeps only the final path component of the parsed name.
    return Path(msg.get_filename()).name

22 changes: 11 additions & 11 deletions test/DataFileUtil_server_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,7 +512,7 @@ def test_upload_tgz_with_no_dir(self):
os.chdir(wd)
self.assertEqual(ret1['node_file_name'], 'target.tar.gz')
self.assertGreater(ret1['size'], 220)
self.assertLess(ret1['size'], 240)
self.assertLess(ret1['size'], 260)
shock_id = ret1['shock_id']
file_path2 = os.path.join(tmp_dir, 'output.tgz')
ret2 = self.impl.shock_to_file(
Expand All @@ -523,7 +523,7 @@ def test_upload_tgz_with_no_dir(self):
self.assertIsNone(ret2['attributes'])
self.assertEqual(ret2['file_path'], file_path2)
self.assertGreater(ret2['size'], 220)
self.assertLess(ret2['size'], 240)
self.assertLess(ret2['size'], 260)
with tarfile.open(file_path2) as t:
self.assertEqual(set(t.getnames()),
set(['.', './intar1.txt', './intar2.txt']))
Expand Down Expand Up @@ -569,7 +569,7 @@ def test_upload_tgz_with_no_file_name(self):
'pack': 'targz'})[0]
self.assertEqual(ret1['node_file_name'], 'tartest2.tar.gz')
self.assertGreater(ret1['size'], 220)
self.assertLess(ret1['size'], 240)
self.assertLess(ret1['size'], 260)
shock_id = ret1['shock_id']
file_path2 = os.path.join(tmp_dir, 'output.tgz')
ret2 = self.impl.shock_to_file(
Expand All @@ -580,7 +580,7 @@ def test_upload_tgz_with_no_file_name(self):
self.assertIsNone(ret2['attributes'])
self.assertEqual(ret2['file_path'], file_path2)
self.assertGreater(ret1['size'], 220)
self.assertLess(ret1['size'], 240)
self.assertLess(ret1['size'], 260)
with tarfile.open(file_path2) as t:
self.assertEqual(set(t.getnames()),
set(['.', './intar1.txt', './intar2.txt']))
Expand Down Expand Up @@ -1671,8 +1671,8 @@ def test_download_dropbox_link_archive_file(self):

def test_download_google_drive_link_uncompress_file(self):
# google drive link of 'file1.txt'
file_url = 'https://drive.google.com/open?'
file_url += 'id=0B0exSa7ebQ0qX01mZ3FaRzhuMDQ'
file_url = ("https://drive.google.com/open?id=0B0exSa7ebQ0qX01mZ3FaRzhuMDQ"
+ "&resourcekey=0-eeP2JBzYnbKaFVncybSA0Q")
params = {
'download_type': 'Google Drive',
'file_url': file_url
Expand All @@ -1687,8 +1687,8 @@ def test_download_google_drive_link_uncompress_file(self):

def test_download_google_drive_link_compress_file(self):
# google drive link of 'file1.txt.gzip'
file_url = 'https://drive.google.com/file/d/'
file_url += '0B0exSa7ebQ0qU1U5YmxMRktkbmc/view?usp=sharing'
file_url = ("https://drive.google.com/file/d/0B0exSa7ebQ0qU1U5YmxMRktkbmc"
+ "/view?usp=drive_link&resourcekey=0-YXZfeTbInLOV0hkGVx92AA")
params = {
'download_type': 'Google Drive',
'file_url': file_url
Expand All @@ -1702,7 +1702,7 @@ def test_download_google_drive_link_compress_file(self):
os.stat(ret1['copy_file_path']).st_size)

def test_download_google_drive_link_large_uncompress_file(self):
file_url = 'https://drive.google.com/open?id=1WBni9AcU7AHWY72amXBg7Dxb0d2RUGch'
file_url = "https://drive.google.com/open?id=1WBni9AcU7AHWY72amXBg7Dxb0d2RUGch"
params = {
'download_type': 'Google Drive',
'file_url': file_url
Expand All @@ -1717,8 +1717,8 @@ def test_download_google_drive_link_large_uncompress_file(self):

def test_download_google_drive_link_archive_file(self):
# google drive link of 'zip1.zip'
file_url = 'https://drive.google.com/open?'
file_url += 'id=0B0exSa7ebQ0qcEhRaDJoVDJSTkk'
file_url = ("https://drive.google.com/file/d/0B0exSa7ebQ0qcEhRaDJoVDJSTkk"
+ "/view?usp=drive_link&resourcekey=0-wTnouG2x1Yb0bUm08rG1KA")
params = {
'download_type': 'Google Drive',
'file_url': file_url
Expand Down
20 changes: 20 additions & 0 deletions test/test_retrieve_filename.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,23 @@ def test_filename_truncation_no_ext(self):
expected_fn = given_fn[0:255]
fn = retrieve_filename(url)
self.assertEqual(fn, expected_fn)

def test_filename_google(self):
    """
    Verify that a Google-served download URL yields the correct file name.

    Google's Content-Disposition header differs from box.com's and broke an
    older implementation of the header parser, so this guards the new parser.
    """
    # Public fixture file hosted on Google Docs/Drive.
    url = "https://docs.google.com/uc?export=download&id=1iPE1Mw6m91ONfm-Z4WpxDGTXX_BKADH8"
    self.assertEqual(retrieve_filename(url), "Sample1.fastq.gz")

@unittest.skip("Eventually redirects to a 200 auth page. Need to do something very different")
def test_filename_google_inaccessible(self):
    """
    Verify file-name retrieval for a non-public Google-hosted download.

    Skipped: the URL eventually redirects to an auth page that returns
    HTTP 200, so the current approach cannot distinguish it from a real
    download.
    """
    url = "https://docs.google.com/uc?export=download&id=197Bp6PuiEiuCvOUulvo8jQiiv2bhQj9o"
    self.assertEqual(retrieve_filename(url), "Sample1.fastq.gz")