diff --git a/tests/unit/forklift/test_legacy.py b/tests/unit/forklift/test_legacy.py
index 6129068c5ed3..62aea04bdd07 100644
--- a/tests/unit/forklift/test_legacy.py
+++ b/tests/unit/forklift/test_legacy.py
@@ -72,19 +72,26 @@ def _get_tar_testdata(compression_type=""):
     return temp_f.getvalue()
 
 
+def _get_whl_testdata(name="fake_package", version="1.0"):
+    temp_f = io.BytesIO()
+    with zipfile.ZipFile(file=temp_f, mode="w") as zfp:
+        zfp.writestr(f"{name}-{version}.dist-info/METADATA", "Fake metadata")
+    return temp_f.getvalue()
+
+
+def _storage_hash(data):
+    return hashlib.blake2b(data, digest_size=256 // 8).hexdigest()
+
+
 _TAR_GZ_PKG_TESTDATA = _get_tar_testdata("gz")
 _TAR_GZ_PKG_MD5 = hashlib.md5(_TAR_GZ_PKG_TESTDATA).hexdigest()
 _TAR_GZ_PKG_SHA256 = hashlib.sha256(_TAR_GZ_PKG_TESTDATA).hexdigest()
-_TAR_GZ_PKG_STORAGE_HASH = hashlib.blake2b(
-    _TAR_GZ_PKG_TESTDATA, digest_size=256 // 8
-).hexdigest()
+_TAR_GZ_PKG_STORAGE_HASH = _storage_hash(_TAR_GZ_PKG_TESTDATA)
 
 _TAR_BZ2_PKG_TESTDATA = _get_tar_testdata("bz2")
 _TAR_BZ2_PKG_MD5 = hashlib.md5(_TAR_BZ2_PKG_TESTDATA).hexdigest()
 _TAR_BZ2_PKG_SHA256 = hashlib.sha256(_TAR_BZ2_PKG_TESTDATA).hexdigest()
-_TAR_BZ2_PKG_STORAGE_HASH = hashlib.blake2b(
-    _TAR_BZ2_PKG_TESTDATA, digest_size=256 // 8
-).hexdigest()
+_TAR_BZ2_PKG_STORAGE_HASH = _storage_hash(_TAR_BZ2_PKG_TESTDATA)
 
 
 class TestExcWithMessage:
@@ -2761,6 +2768,8 @@ def test_upload_succeeds_with_wheel(
         RoleFactory.create(user=user, project=project)
 
         filename = f"{project.name}-{release.version}-cp34-none-{plat}.whl"
+        filebody = _get_whl_testdata(project.name)
+        file_storage_hash = _storage_hash(filebody)
 
         pyramid_config.testing_securitypolicy(identity=user)
         db_request.user = user
@@ -2772,11 +2781,11 @@ def test_upload_succeeds_with_wheel(
                 "version": release.version,
                 "filetype": "bdist_wheel",
                 "pyversion": "cp34",
-                "md5_digest": _TAR_GZ_PKG_MD5,
+                "md5_digest": hashlib.md5(filebody).hexdigest(),
                 "content": pretend.stub(
                     filename=filename,
-                    file=io.BytesIO(_TAR_GZ_PKG_TESTDATA),
-                    type="application/tar",
+                    file=io.BytesIO(filebody),
+                    type="application/octet-stream",
                 ),
             }
         )
@@ -2784,7 +2793,10 @@
 
         @pretend.call_recorder
         def storage_service_store(path, file_path, *, meta):
             with open(file_path, "rb") as fp:
-                assert fp.read() == _TAR_GZ_PKG_TESTDATA
+                if file_path.endswith(".metadata"):
+                    assert fp.read() == b"Fake metadata"
+                else:
+                    assert fp.read() == filebody
 
         storage_service = pretend.stub(store=storage_service_store)
@@ -2808,9 +2820,9 @@ def storage_service_store(path, file_path, *, meta):
             pretend.call(
                 "/".join(
                     [
-                        _TAR_GZ_PKG_STORAGE_HASH[:2],
-                        _TAR_GZ_PKG_STORAGE_HASH[2:4],
-                        _TAR_GZ_PKG_STORAGE_HASH[4:],
+                        file_storage_hash[:2],
+                        file_storage_hash[2:4],
+                        file_storage_hash[4:],
                         filename,
                     ]
                 ),
@@ -2821,7 +2833,24 @@ def storage_service_store(path, file_path, *, meta):
                     "package-type": "bdist_wheel",
                     "python-version": "cp34",
                 },
-            )
+            ),
+            pretend.call(
+                "/".join(
+                    [
+                        file_storage_hash[:2],
+                        file_storage_hash[2:4],
+                        file_storage_hash[4:],
+                        filename + ".metadata",
+                    ]
+                ),
+                mock.ANY,
+                meta={
+                    "project": project.normalized_name,
+                    "version": release.version,
+                    "package-type": "bdist_wheel",
+                    "python-version": "cp34",
+                },
+            ),
         ]
 
         # Ensure that a File object has been created.
@@ -2874,6 +2903,8 @@ def test_upload_succeeds_with_wheel_after_sdist(
         RoleFactory.create(user=user, project=project)
 
         filename = f"{project.name}-{release.version}-cp34-none-any.whl"
+        filebody = _get_whl_testdata(project.name)
+        file_storage_hash = _storage_hash(filebody)
 
         pyramid_config.testing_securitypolicy(identity=user)
         db_request.user = user
@@ -2885,11 +2916,11 @@ def test_upload_succeeds_with_wheel_after_sdist(
                 "version": release.version,
                 "filetype": "bdist_wheel",
                 "pyversion": "cp34",
-                "md5_digest": "335c476dc930b959dda9ec82bd65ef19",
+                "md5_digest": hashlib.md5(filebody).hexdigest(),
                 "content": pretend.stub(
                     filename=filename,
-                    file=io.BytesIO(b"A fake file."),
-                    type="application/tar",
+                    file=io.BytesIO(filebody),
+                    type="application/zip",
                 ),
             }
         )
@@ -2897,7 +2928,10 @@
         @pretend.call_recorder
         def storage_service_store(path, file_path, *, meta):
             with open(file_path, "rb") as fp:
-                assert fp.read() == b"A fake file."
+                if file_path.endswith(".metadata"):
+                    assert fp.read() == b"Fake metadata"
+                else:
+                    assert fp.read() == filebody
 
         storage_service = pretend.stub(store=storage_service_store)
         db_request.find_service = pretend.call_recorder(
@@ -2920,9 +2954,9 @@ def storage_service_store(path, file_path, *, meta):
             pretend.call(
                 "/".join(
                     [
-                        "4e",
-                        "6e",
-                        "fa4c0ee2bbad071b4f5b5ea68f1aea89fa716e7754eb13e2314d45a5916e",
+                        file_storage_hash[:2],
+                        file_storage_hash[2:4],
+                        file_storage_hash[4:],
                         filename,
                     ]
                 ),
@@ -2933,7 +2967,24 @@ def storage_service_store(path, file_path, *, meta):
                     "package-type": "bdist_wheel",
                     "python-version": "cp34",
                 },
-            )
+            ),
+            pretend.call(
+                "/".join(
+                    [
+                        file_storage_hash[:2],
+                        file_storage_hash[2:4],
+                        file_storage_hash[4:],
+                        filename + ".metadata",
+                    ]
+                ),
+                mock.ANY,
+                meta={
+                    "project": project.normalized_name,
+                    "version": release.version,
+                    "package-type": "bdist_wheel",
+                    "python-version": "cp34",
+                },
+            ),
         ]
 
         # Ensure that a File object has been created.
diff --git a/warehouse/forklift/legacy.py b/warehouse/forklift/legacy.py
index b0d27483c6b4..b5ec696094b1 100644
--- a/warehouse/forklift/legacy.py
+++ b/warehouse/forklift/legacy.py
@@ -10,6 +10,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import base64
 import email
 import hashlib
 import hmac
@@ -790,6 +791,20 @@ def _is_duplicate_file(db_session, filename, hashes):
     return None
 
 
+def extract_wheel_metadata(path):
+    """
+    Extract the METADATA file from a wheel and return its contents. The name
+    of the .whl file is used to find the corresponding .dist-info directory.
+
+    See https://www.python.org/dev/peps/pep-0658/#specification
+    """
+    filename = os.path.basename(path)
+    namever = _wheel_file_re.match(filename).group("namever")
+    metafile = namever + ".dist-info/METADATA"
+    with zipfile.ZipFile(path) as zfp:
+        return zfp.read(metafile)
+
+
 @view_config(
     route_name="forklift.legacy.file_upload",
     uses_session=True,
@@ -1317,11 +1332,19 @@
                         "Binary wheel '{filename}' has an unsupported "
                         "platform tag '{plat}'.".format(filename=filename, plat=plat),
                     )
+            wheel_metadata = extract_wheel_metadata(temporary_filename)
+            with open(temporary_filename + ".metadata", "wb") as fp:
+                fp.write(wheel_metadata)
+            metadata_hash = base64.b64encode(
+                hashlib.blake2s(wheel_metadata, digest_size=128 // 8).digest()
+            ).decode("utf-8")
+        else:
+            metadata_hash = None
 
         # Also buffer the entire signature file to disk.
if "gpg_signature" in request.POST: has_signature = True - with open(os.path.join(tmpdir, filename + ".asc"), "wb") as fp: + with open(temporary_filename + ".asc", "wb") as fp: signature_size = 0 for chunk in iter( lambda: request.POST["gpg_signature"].file.read(8096), b"" @@ -1332,7 +1355,7 @@ def file_upload(request): fp.write(chunk) # Check whether signature is ASCII armored - with open(os.path.join(tmpdir, filename + ".asc"), "rb") as fp: + with open(temporary_filename + ".asc", "rb") as fp: if not fp.read().startswith(b"-----BEGIN PGP SIGNATURE-----"): raise _exc_with_message( HTTPBadRequest, "PGP signature isn't ASCII armored." @@ -1357,6 +1380,7 @@ def file_upload(request): md5_digest=file_hashes["md5"], sha256_digest=file_hashes["sha256"], blake2_256_digest=file_hashes["blake2_256"], + metadata_hash=metadata_hash, # Figure out what our filepath is going to be, we're going to use a # directory structure based on the hash of the file contents. This # will ensure that the contents of the file cannot change without @@ -1412,7 +1436,7 @@ def file_upload(request): storage = request.find_service(IFileStorage, name="primary") storage.store( file_.path, - os.path.join(tmpdir, filename), + temporary_filename, meta={ "project": file_.release.project.normalized_name, "version": file_.release.version, @@ -1420,10 +1444,21 @@ def file_upload(request): "python-version": file_.python_version, }, ) + if metadata_hash is not None: + storage.store( + file_.path + ".metadata", + temporary_filename + ".metadata", + meta={ + "project": file_.release.project.normalized_name, + "version": file_.release.version, + "package-type": file_.packagetype, + "python-version": file_.python_version, + }, + ) if has_signature: storage.store( file_.pgp_path, - os.path.join(tmpdir, filename + ".asc"), + temporary_filename + ".asc", meta={ "project": file_.release.project.normalized_name, "version": file_.release.version, diff --git a/warehouse/migrations/versions/9b9778779fe2_add_a_metadata_hash_column_to_file.py b/warehouse/migrations/versions/9b9778779fe2_add_a_metadata_hash_column_to_file.py new file mode 100644 index 000000000000..0367955e38ad --- /dev/null +++ b/warehouse/migrations/versions/9b9778779fe2_add_a_metadata_hash_column_to_file.py @@ -0,0 +1,33 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +Add a metadata_hash column to File + +Revision ID: 9b9778779fe2 +Revises: d582fb87b94c +Create Date: 2021-09-18 07:34:31.828437 +""" + +import sqlalchemy as sa + +from alembic import op + +revision = "9b9778779fe2" +down_revision = "d582fb87b94c" + + +def upgrade(): + op.add_column("release_files", sa.Column("metadata_hash", sa.Text(), nullable=True)) + + +def downgrade(): + op.drop_column("release_files", "metadata_hash") diff --git a/warehouse/packaging/models.py b/warehouse/packaging/models.py index a3a64593a3a4..10503d315fb5 100644 --- a/warehouse/packaging/models.py +++ b/warehouse/packaging/models.py @@ -666,6 +666,7 @@ def __table_args__(cls): # noqa md5_digest = Column(Text, unique=True, nullable=False) sha256_digest = Column(CIText, unique=True, nullable=False) blake2_256_digest = Column(CIText, unique=True, nullable=False) + metadata_hash = Column(Text, unique=False, nullable=True) upload_time = Column(DateTime(timezone=False), server_default=func.now()) uploaded_via = Column(Text)