Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extract and host wheel METADATA on upload #9972

Closed
wants to merge 11 commits into from
95 changes: 73 additions & 22 deletions tests/unit/forklift/test_legacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,19 +72,26 @@ def _get_tar_testdata(compression_type=""):
return temp_f.getvalue()


def _get_whl_testdata(name="fake_package", version="1.0", metadata="Fake metadata"):
    """
    Build a minimal in-memory wheel (zip) archive and return its bytes.

    The archive holds a single ``{name}-{version}.dist-info/METADATA``
    member whose content is ``metadata``.

    The member is written with a fixed timestamp instead of the current
    time, so repeated calls with the same arguments produce identical bytes
    and therefore identical digests.
    """
    member = zipfile.ZipInfo(
        filename=f"{name}-{version}.dist-info/METADATA",
        # Passing a plain name to writestr() would stamp the member with
        # "now", making the archive bytes differ from call to call.
        date_time=(1980, 1, 1, 0, 0, 0),
    )
    temp_f = io.BytesIO()
    with zipfile.ZipFile(file=temp_f, mode="w") as zfp:
        zfp.writestr(member, metadata)
    return temp_f.getvalue()


def _storage_hash(data):
    """Return the hex-encoded 256-bit BLAKE2b digest of ``data``."""
    hasher = hashlib.blake2b(digest_size=32)
    hasher.update(data)
    return hasher.hexdigest()


# Gzip-compressed tarball fixture and its MD5 / SHA-256 hex digests.
_TAR_GZ_PKG_TESTDATA = _get_tar_testdata("gz")
_TAR_GZ_PKG_MD5 = hashlib.md5(_TAR_GZ_PKG_TESTDATA).hexdigest()
_TAR_GZ_PKG_SHA256 = hashlib.sha256(_TAR_GZ_PKG_TESTDATA).hexdigest()
# NOTE(review): the storage hash is assigned twice in this view, presumably
# because the diff's removed and added lines are shown without markers. Both
# forms compute the same 256-bit BLAKE2b hex digest.
_TAR_GZ_PKG_STORAGE_HASH = hashlib.blake2b(
_TAR_GZ_PKG_TESTDATA, digest_size=256 // 8
).hexdigest()
_TAR_GZ_PKG_STORAGE_HASH = _storage_hash(_TAR_GZ_PKG_TESTDATA)

# Bzip2-compressed tarball fixture and its MD5 / SHA-256 hex digests.
_TAR_BZ2_PKG_TESTDATA = _get_tar_testdata("bz2")
_TAR_BZ2_PKG_MD5 = hashlib.md5(_TAR_BZ2_PKG_TESTDATA).hexdigest()
_TAR_BZ2_PKG_SHA256 = hashlib.sha256(_TAR_BZ2_PKG_TESTDATA).hexdigest()
# Same duplicated assignment as for the gzip fixture above: the inline
# expression and the _storage_hash() call yield the same digest.
_TAR_BZ2_PKG_STORAGE_HASH = hashlib.blake2b(
_TAR_BZ2_PKG_TESTDATA, digest_size=256 // 8
).hexdigest()
_TAR_BZ2_PKG_STORAGE_HASH = _storage_hash(_TAR_BZ2_PKG_TESTDATA)


class TestExcWithMessage:
Expand Down Expand Up @@ -2761,6 +2768,8 @@ def test_upload_succeeds_with_wheel(
RoleFactory.create(user=user, project=project)

filename = f"{project.name}-{release.version}-cp34-none-{plat}.whl"
filebody = _get_whl_testdata(project.name)
file_storage_hash = _storage_hash(filebody)

pyramid_config.testing_securitypolicy(identity=user)
db_request.user = user
Expand All @@ -2772,19 +2781,22 @@ def test_upload_succeeds_with_wheel(
"version": release.version,
"filetype": "bdist_wheel",
"pyversion": "cp34",
"md5_digest": _TAR_GZ_PKG_MD5,
"md5_digest": hashlib.md5(filebody).hexdigest(),
"content": pretend.stub(
filename=filename,
file=io.BytesIO(_TAR_GZ_PKG_TESTDATA),
type="application/tar",
file=io.BytesIO(filebody),
type="application/octet-stream",
),
}
)

@pretend.call_recorder
def storage_service_store(path, file_path, *, meta):
with open(file_path, "rb") as fp:
assert fp.read() == _TAR_GZ_PKG_TESTDATA
if file_path.endswith(".metadata"):
assert fp.read() == b"Fake metadata"
else:
assert fp.read() == filebody

storage_service = pretend.stub(store=storage_service_store)

Expand All @@ -2808,9 +2820,9 @@ def storage_service_store(path, file_path, *, meta):
pretend.call(
"/".join(
[
_TAR_GZ_PKG_STORAGE_HASH[:2],
_TAR_GZ_PKG_STORAGE_HASH[2:4],
_TAR_GZ_PKG_STORAGE_HASH[4:],
file_storage_hash[:2],
file_storage_hash[2:4],
file_storage_hash[4:],
filename,
]
),
Expand All @@ -2821,7 +2833,24 @@ def storage_service_store(path, file_path, *, meta):
"package-type": "bdist_wheel",
"python-version": "cp34",
},
)
),
pretend.call(
"/".join(
[
file_storage_hash[:2],
file_storage_hash[2:4],
file_storage_hash[4:],
filename + ".metadata",
]
),
mock.ANY,
meta={
"project": project.normalized_name,
"version": release.version,
"package-type": "bdist_wheel",
"python-version": "cp34",
},
),
]

# Ensure that a File object has been created.
Expand Down Expand Up @@ -2874,6 +2903,8 @@ def test_upload_succeeds_with_wheel_after_sdist(
RoleFactory.create(user=user, project=project)

filename = f"{project.name}-{release.version}-cp34-none-any.whl"
filebody = _get_whl_testdata(project.name)
file_storage_hash = _storage_hash(filebody)

pyramid_config.testing_securitypolicy(identity=user)
db_request.user = user
Expand All @@ -2885,19 +2916,22 @@ def test_upload_succeeds_with_wheel_after_sdist(
"version": release.version,
"filetype": "bdist_wheel",
"pyversion": "cp34",
"md5_digest": "335c476dc930b959dda9ec82bd65ef19",
"md5_digest": hashlib.md5(filebody).hexdigest(),
"content": pretend.stub(
filename=filename,
file=io.BytesIO(b"A fake file."),
type="application/tar",
file=io.BytesIO(filebody),
type="application/zip",
),
}
)

@pretend.call_recorder
def storage_service_store(path, file_path, *, meta):
with open(file_path, "rb") as fp:
assert fp.read() == b"A fake file."
if file_path.endswith(".metadata"):
assert fp.read() == b"Fake metadata"
else:
assert fp.read() == filebody

storage_service = pretend.stub(store=storage_service_store)
db_request.find_service = pretend.call_recorder(
Expand All @@ -2920,9 +2954,9 @@ def storage_service_store(path, file_path, *, meta):
pretend.call(
"/".join(
[
"4e",
"6e",
"fa4c0ee2bbad071b4f5b5ea68f1aea89fa716e7754eb13e2314d45a5916e",
file_storage_hash[:2],
file_storage_hash[2:4],
file_storage_hash[4:],
filename,
]
),
Expand All @@ -2933,7 +2967,24 @@ def storage_service_store(path, file_path, *, meta):
"package-type": "bdist_wheel",
"python-version": "cp34",
},
)
),
pretend.call(
"/".join(
[
file_storage_hash[:2],
file_storage_hash[2:4],
file_storage_hash[4:],
filename + ".metadata",
]
),
mock.ANY,
meta={
"project": project.normalized_name,
"version": release.version,
"package-type": "bdist_wheel",
"python-version": "cp34",
},
),
]

# Ensure that a File object has been created.
Expand Down
43 changes: 39 additions & 4 deletions warehouse/forklift/legacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import base64
import email
import hashlib
import hmac
Expand Down Expand Up @@ -790,6 +791,20 @@ def _is_duplicate_file(db_session, filename, hashes):
return None


def extract_wheel_metadata(path):
    """
    Extract the METADATA file from a wheel and return its content as bytes.

    The name of the .whl file is used to find the corresponding .dist-info
    directory inside the archive.

    Raises ValueError if the basename of ``path`` cannot be parsed as a wheel
    filename, and KeyError (raised by ``zipfile``) if the archive has no
    METADATA member at the expected location.

    See https://www.python.org/dev/peps/pep-0658/#specification
    """
    filename = os.path.basename(path)
    wheel_info = _wheel_file_re.match(filename)
    # match() returns None for a basename that is not a wheel filename, and a
    # named group is None when it did not take part in the match. Fail with a
    # clear error instead of an AttributeError/TypeError further down.
    namever = wheel_info.group("namever") if wheel_info is not None else None
    if namever is None:
        raise ValueError(f"{filename!r} is not a valid wheel filename")
    metafile = namever + ".dist-info/METADATA"
    with zipfile.ZipFile(path) as zfp:
        return zfp.read(metafile)


@view_config(
route_name="forklift.legacy.file_upload",
uses_session=True,
Expand Down Expand Up @@ -1317,11 +1332,19 @@ def file_upload(request):
"Binary wheel '{filename}' has an unsupported "
"platform tag '{plat}'.".format(filename=filename, plat=plat),
)
wheel_metadata = extract_wheel_metadata(temporary_filename)
with open(temporary_filename + ".metadata", "wb") as fp:
fp.write(wheel_metadata)
metadata_hash = base64.b64encode(
hashlib.blake2s(wheel_metadata, digest_size=128 // 8).digest()
).decode("utf-8")
else:
metadata_hash = None

# Also buffer the entire signature file to disk.
if "gpg_signature" in request.POST:
has_signature = True
with open(os.path.join(tmpdir, filename + ".asc"), "wb") as fp:
with open(temporary_filename + ".asc", "wb") as fp:
signature_size = 0
for chunk in iter(
lambda: request.POST["gpg_signature"].file.read(8096), b""
Expand All @@ -1332,7 +1355,7 @@ def file_upload(request):
fp.write(chunk)

# Check whether signature is ASCII armored
with open(os.path.join(tmpdir, filename + ".asc"), "rb") as fp:
with open(temporary_filename + ".asc", "rb") as fp:
if not fp.read().startswith(b"-----BEGIN PGP SIGNATURE-----"):
raise _exc_with_message(
HTTPBadRequest, "PGP signature isn't ASCII armored."
Expand All @@ -1357,6 +1380,7 @@ def file_upload(request):
md5_digest=file_hashes["md5"],
sha256_digest=file_hashes["sha256"],
blake2_256_digest=file_hashes["blake2_256"],
metadata_hash=metadata_hash,
# Figure out what our filepath is going to be, we're going to use a
# directory structure based on the hash of the file contents. This
# will ensure that the contents of the file cannot change without
Expand Down Expand Up @@ -1412,18 +1436,29 @@ def file_upload(request):
storage = request.find_service(IFileStorage, name="primary")
storage.store(
file_.path,
os.path.join(tmpdir, filename),
temporary_filename,
meta={
"project": file_.release.project.normalized_name,
"version": file_.release.version,
"package-type": file_.packagetype,
"python-version": file_.python_version,
},
)
if metadata_hash is not None:
storage.store(
file_.path + ".metadata",
temporary_filename + ".metadata",
meta={
"project": file_.release.project.normalized_name,
"version": file_.release.version,
"package-type": file_.packagetype,
"python-version": file_.python_version,
},
)
if has_signature:
storage.store(
file_.pgp_path,
os.path.join(tmpdir, filename + ".asc"),
temporary_filename + ".asc",
meta={
"project": file_.release.project.normalized_name,
"version": file_.release.version,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Add a metadata_hash column to File

Revision ID: 9b9778779fe2
Revises: d582fb87b94c
Create Date: 2021-09-18 07:34:31.828437
"""

import sqlalchemy as sa

from alembic import op

revision = "9b9778779fe2"
down_revision = "d582fb87b94c"


def upgrade():
    """Add the nullable ``metadata_hash`` text column to ``release_files``."""
    metadata_hash = sa.Column("metadata_hash", sa.Text(), nullable=True)
    op.add_column("release_files", metadata_hash)


def downgrade():
    """Drop the ``metadata_hash`` column from ``release_files``."""
    table, column = "release_files", "metadata_hash"
    op.drop_column(table, column)
1 change: 1 addition & 0 deletions warehouse/packaging/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -666,6 +666,7 @@ def __table_args__(cls): # noqa
md5_digest = Column(Text, unique=True, nullable=False)
sha256_digest = Column(CIText, unique=True, nullable=False)
blake2_256_digest = Column(CIText, unique=True, nullable=False)
metadata_hash = Column(Text, unique=False, nullable=True)
upload_time = Column(DateTime(timezone=False), server_default=func.now())
uploaded_via = Column(Text)

Expand Down