diff --git a/.github/workflows/build_python_3.yml b/.github/workflows/build_python_3.yml index c1b2e70e882..b90c1f657e2 100644 --- a/.github/workflows/build_python_3.yml +++ b/.github/workflows/build_python_3.yml @@ -69,20 +69,16 @@ jobs: # See: https://stackoverflow.com/a/65402241 CIBW_ENVIRONMENT_MACOS: CMAKE_BUILD_PARALLEL_LEVEL=24 SYSTEM_VERSION_COMPAT=0 CMAKE_ARGS="-DNATIVE_TESTING=OFF" CIBW_REPAIR_WHEEL_COMMAND_LINUX: | + python scripts/zip_filter.py {wheel} \*.c \*.cpp \*.cc \*.h \*.hpp \*.pyx \*.md && mkdir ./tempwheelhouse && unzip -l {wheel} | grep '\.so' && auditwheel repair -w ./tempwheelhouse {wheel} && - for w in ./tempwheelhouse/*.whl; do - python scripts/zip_filter.py $w \*.c \*.cpp \*.cc \*.h \*.hpp \*.pyx \*.md - mv $w {dest_dir} - done && + mv ./tempwheelhouse/*.whl {dest_dir} && rm -rf ./tempwheelhouse CIBW_REPAIR_WHEEL_COMMAND_MACOS: | - zip -d {wheel} \*.c \*.cpp \*.cc \*.h \*.hpp \*.pyx \*.md && + python scripts/zip_filter.py {wheel} \*.c \*.cpp \*.cc \*.h \*.hpp \*.pyx \*.md && MACOSX_DEPLOYMENT_TARGET=12.7 delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel} - CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: choco install -y 7zip && - 7z d -r "{wheel}" *.c *.cpp *.cc *.h *.hpp *.pyx *.md && - move "{wheel}" "{dest_dir}" + CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: python scripts/zip_filter.py "{wheel}" "*.c" "*.cpp" "*.cc" "*.h" "*.hpp" "*.pyx" "*.md" && mv "{wheel}" "{dest_dir}" CIBW_TEST_COMMAND: "python {project}/tests/smoke_test.py" steps: @@ -107,6 +103,16 @@ jobs: with: only: ${{ matrix.only }} + - name: Validate wheel RECORD files + shell: bash + run: | + for wheel in ./wheelhouse/*.whl; do + if [ -f "$wheel" ]; then + echo "Validating $(basename $wheel)..." 
#!/usr/bin/env python3
"""
Validate that a wheel's contents match its RECORD file.

This script checks:
1. All files in the wheel are listed in RECORD (the detached signature
   files RECORD.jws / RECORD.p7s are exempt: they are never listed)
2. All files in RECORD exist in the wheel
3. File hashes match (sha256, sha384 or sha512)
4. File sizes match
"""

import argparse
import base64
import csv
import hashlib
import io
from pathlib import Path
import sys
import zipfile

# The wheel format requires "sha256 or better"; md5 and sha1 are not permitted.
_ALLOWED_HASH_ALGORITHMS = ("sha256", "sha384", "sha512")

# Signature files are added after RECORD is generated, so RECORD never lists them.
_SIGNATURE_FILE_NAMES = ("RECORD.jws", "RECORD.p7s")


def compute_hash(data, algorithm="sha256"):
    """Return the urlsafe-base64 digest of ``data`` as used in RECORD.

    Args:
        data: Bytes to hash.
        algorithm: Any algorithm name accepted by ``hashlib.new``.
            Defaults to ``"sha256"``.

    Returns:
        The digest encoded with the URL-safe base64 alphabet, with the
        trailing ``=`` padding stripped.
    """
    digest = hashlib.new(algorithm, data).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")


def _find_record_path(names):
    """Return the archive name of the top-level ``*.dist-info/RECORD``, or None.

    Only a ``.dist-info`` directory at the archive root is considered, so a
    vendored distribution's RECORD nested deeper in the tree is never
    mistaken for the wheel's own RECORD.
    """
    for name in names:
        directory, _, base = name.rpartition("/")
        if base == "RECORD" and "/" not in directory and directory.endswith(".dist-info"):
            return name
    return None


def _parse_record(record_content):
    """Parse RECORD text into ``({path: {"hash", "size"}}, [error, ...])``.

    Malformed rows and non-integer sizes are reported as errors instead of
    being dropped silently or raising ``ValueError``.
    """
    entries = {}
    errors = []
    for row in csv.reader(io.StringIO(record_content)):
        # Blank (or whitespace-only) lines carry no entry.
        if not any(field.strip() for field in row):
            continue
        if len(row) < 3:
            errors.append(f"Malformed RECORD row (expected path,hash,size): {','.join(row)}")
            continue

        file_path, hash_str, size_str = row[0], row[1], row[2]
        size = None
        if size_str:
            try:
                size = int(size_str)
            except ValueError:
                errors.append(f"Invalid size for {file_path}: {size_str!r}")
        entries[file_path] = {"hash": hash_str, "size": size}
    return entries, errors


def _check_hash(file_path, recorded_hash, file_data):
    """Return an error message if ``file_data`` does not match ``recorded_hash``, else None."""
    if not recorded_hash:
        return f"No hash recorded for {file_path}"

    # RECORD hashes are written as "<algorithm>=<urlsafe-base64 digest>".
    algo, separator, expected_hash = recorded_hash.partition("=")
    if not separator:
        return f"Invalid hash format for {file_path}: {recorded_hash}"
    if algo not in _ALLOWED_HASH_ALGORITHMS:
        allowed = ", ".join(_ALLOWED_HASH_ALGORITHMS)
        return f"Unknown hash algorithm {algo} for {file_path} (expected one of {allowed})"

    actual_hash = compute_hash(file_data, algo)
    if actual_hash != expected_hash:
        return f"Hash mismatch for {file_path}: RECORD says {expected_hash}, actual is {actual_hash}"
    return None


def validate_wheel(wheel_path):
    """Validate that a wheel's contents match its RECORD file.

    Args:
        wheel_path: Path (or binary file-like object) of the wheel archive.

    Returns:
        A list of human-readable error strings; empty when the wheel is
        consistent with its RECORD.

    Raises:
        zipfile.BadZipFile: If ``wheel_path`` is not a valid zip archive.
    """
    with zipfile.ZipFile(wheel_path, "r") as wheel:
        names = wheel.namelist()

        record_path = _find_record_path(names)
        if record_path is None:
            return ["No RECORD file found in wheel"]

        record_entries, errors = _parse_record(wheel.read(record_path).decode("utf-8"))

        # Directory entries end with "/" and are not listed in RECORD.
        wheel_files = {name for name in names if not name.endswith("/")}
        record_files = set(record_entries)

        dist_info_dir = record_path.rpartition("/")[0]
        signature_files = {f"{dist_info_dir}/{name}" for name in _SIGNATURE_FILE_NAMES}

        for f in sorted(wheel_files - record_files - signature_files):
            errors.append(f"File in wheel but not in RECORD: {f}")

        for f in sorted(record_files - wheel_files):
            errors.append(f"File in RECORD but not in wheel: {f}")

        # Sorted so the report is stable from run to run.
        for file_path in sorted(record_files & wheel_files):
            # RECORD cannot contain its own hash, so it is exempt.
            if file_path == record_path:
                continue

            record_entry = record_entries[file_path]
            file_data = wheel.read(file_path)

            expected_size = record_entry["size"]
            if expected_size is not None and len(file_data) != expected_size:
                errors.append(
                    f"Size mismatch for {file_path}: RECORD says {expected_size}, actual is {len(file_data)}"
                )

            hash_error = _check_hash(file_path, record_entry["hash"], file_data)
            if hash_error is not None:
                errors.append(hash_error)

    return errors


def main():
    """Command-line entry point: validate one wheel and exit non-zero on any error."""
    parser = argparse.ArgumentParser(description="Validate wheel RECORD file matches contents")
    parser.add_argument("wheel", help="Path to wheel file to validate")

    args = parser.parse_args()

    wheel_path = Path(args.wheel)
    # is_file() also rejects directories, which would otherwise fail inside zipfile.
    if not wheel_path.is_file():
        print(f"Error: Wheel file not found: {wheel_path}", file=sys.stderr)
        sys.exit(1)

    print(f"Validating {wheel_path.name}...")
    try:
        errors = validate_wheel(wheel_path)
    except zipfile.BadZipFile as exc:
        print(f"Error: Not a valid wheel archive: {wheel_path} ({exc})", file=sys.stderr)
        sys.exit(1)

    if errors:
        print(f"\n[ERROR] Found {len(errors)} error(s):", file=sys.stderr)
        for error in errors:
            print(f"  - {error}", file=sys.stderr)
        sys.exit(1)

    print("[SUCCESS] Wheel validation passed!")
    return 0


if __name__ == "__main__":
    sys.exit(main())
import argparse
import csv
import fnmatch
import io
import os
import zipfile


def _matches_any(name, patterns):
    """Return True when ``name`` matches at least one fnmatch-style pattern."""
    return any(fnmatch.fnmatch(name, pattern) for pattern in patterns)


def _find_record_name(names):
    """Return the archive name of the top-level ``*.dist-info/RECORD``, or None.

    Only a ``.dist-info`` directory at the archive root qualifies, so RECORD
    files of vendored distributions nested deeper in the tree are left alone.
    """
    for name in names:
        directory, _, base = name.rpartition("/")
        if base == "RECORD" and "/" not in directory and directory.endswith(".dist-info"):
            return name
    return None


def update_record(record_content, patterns):
    """Return RECORD text with the rows for removed files dropped.

    Args:
        record_content: Decoded text of the wheel's RECORD file.
        patterns: fnmatch-style patterns; a row whose path (first column)
            matches any of them is removed.

    Returns:
        The rebuilt RECORD text, one CSV row per line terminated by ``\\n``.
        Blank lines in the input are not carried over.
    """
    # Materialize once: a one-shot iterable would be exhausted after the first row.
    patterns = tuple(patterns)

    output = io.StringIO()
    writer = csv.writer(output, lineterminator="\n")
    writer.writerows(
        row
        for row in csv.reader(io.StringIO(record_content))
        if row and not _matches_any(row[0], patterns)
    )
    return output.getvalue()


def remove_from_zip(zip_filename, patterns):
    """Rewrite ``zip_filename`` in place without the members matching ``patterns``.

    The wheel's top-level ``*.dist-info/RECORD`` (if any) is rewritten so it no
    longer lists the removed members; every other member, including RECORD
    files of vendored distributions, is copied unchanged.

    Args:
        zip_filename: Path of the zip/wheel archive to filter.
        patterns: fnmatch-style patterns of member names to remove.

    Raises:
        Whatever ``zipfile`` / ``os`` raise on failure. In that case the
        original archive is left untouched and the temporary file is removed.
    """
    patterns = tuple(patterns)
    temp_zip_filename = f"{zip_filename}.tmp"

    try:
        with zipfile.ZipFile(zip_filename, "r") as source_zip, zipfile.ZipFile(
            temp_zip_filename, "w", zipfile.ZIP_DEFLATED
        ) as temp_zip:
            record_name = _find_record_name(source_zip.namelist())

            for file in source_zip.infolist():
                if _matches_any(file.filename, patterns):
                    continue

                data = source_zip.read(file.filename)
                if file.filename == record_name:
                    # Drop the rows of the members that are being removed.
                    data = update_record(data.decode("utf-8"), patterns).encode("utf-8")
                temp_zip.writestr(file, data)

        os.replace(temp_zip_filename, zip_filename)
    finally:
        # After a successful os.replace the temp path is gone; this only
        # fires when the rewrite failed part-way through.
        if os.path.exists(temp_zip_filename):
            os.remove(temp_zip_filename)