From 227d972eb6b744dcdbf005496fcecef046db715e Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Wed, 1 Oct 2025 02:06:52 +0530 Subject: [PATCH 1/4] Add support for mining cpan packageURLs Reference: https://github.com/aboutcode-org/purldb/issues/685 Signed-off-by: Ayan Sinha Mahapatra --- minecode_pipelines/miners/cpan.py | 124 ++++++++++++++++++ minecode_pipelines/pipelines/mine_cpan.py | 64 ++++++++++ minecode_pipelines/pipes/cpan.py | 146 ++++++++++++++++++++++ pyproject-minecode_pipelines.toml | 4 +- 4 files changed, 337 insertions(+), 1 deletion(-) create mode 100644 minecode_pipelines/miners/cpan.py create mode 100644 minecode_pipelines/pipelines/mine_cpan.py create mode 100644 minecode_pipelines/pipes/cpan.py diff --git a/minecode_pipelines/miners/cpan.py b/minecode_pipelines/miners/cpan.py new file mode 100644 index 00000000..dca0048c --- /dev/null +++ b/minecode_pipelines/miners/cpan.py @@ -0,0 +1,124 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import gzip +import requests + +from bs4 import BeautifulSoup + + +from packageurl import PackageURL + +from minecode_pipelines.utils import get_temp_file +from minecode_pipelines.pipes import write_data_to_json_file + +""" +Visitors for cpan and cpan-like perl package repositories. 
+""" + + +CPAN_REPO = "https://www.cpan.org/" +CPAN_TYPE = "cpan" + + +def get_cpan_packages(cpan_repo=CPAN_REPO, logger=None): + cpan_packages_url = cpan_repo + "modules/02packages.details.txt.gz" + local_filename = "cpan_packages.gz" + + response = requests.get(cpan_packages_url, stream=True) + if not response.ok: + return + + with open(local_filename, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + + with gzip.open("cpan_packages.gz", "rb") as f_in: + with open("cpan_packages.txt", "wb") as f_out: + f_out.writelines(f_in) + + with open("cpan_packages.txt", encoding="utf-8") as file: + packages_content = file.read() + + package_path_by_name = {} + + modules = packages_content.split("\n")[9:-1] + for module in modules: + info = [section for section in module.split(" ") if section] + package_path = info[-1] + path_segments = package_path.split("/") + filename = path_segments.pop() + path_prefix = "/".join(path_segments) + + name_version = filename.replace(".tar.gz", "").split("-") + _version = name_version.pop() + name = "-".join(name_version) + + package_path_by_name[name] = path_prefix + + return package_path_by_name + + +def write_packages_json(packages, name): + temp_file = get_temp_file(name) + write_data_to_json_file(path=temp_file, data=packages) + return temp_file + + +def get_cpan_packageurls(name, path_prefix, logger=None): + packageurls = [] + + # file extensions found in cpan index + ignorable_extensions = [".meta", ".readme", ".tar.gz"] + + cpan_authors_path = "/authors/id/" + cpan_authors_url = CPAN_REPO + cpan_authors_path + + cpan_author_page_url = cpan_authors_url + path_prefix + + response = requests.get(cpan_author_page_url) + if not response.ok: + return packageurls + + if logger: + logger(f"Getting package versions for {name} from {cpan_author_page_url}") + + soup = BeautifulSoup(response.text, "html.parser") + package_list_elements = soup.find("ul").text.split("\n") + + package_elements = [ + 
element.replace(" ", "") + for element in package_list_elements + if element and element not in {" Parent Directory", " CHECKSUMS"} + ] + + versions = [] + for package_file in package_elements: + for extension in ignorable_extensions: + if extension in package_file: + package_file = package_file.replace(extension, "") + + name_version = package_file.split("-") + version = name_version.pop() + package_name = "-".join(name_version) + if package_name != name: + continue + + versions.append(version) + + unique_versions = list(set(versions)) + for version in unique_versions: + purl = PackageURL( + type=CPAN_TYPE, + name=name, + version=version, + ) + packageurls.append(purl.to_string()) + + return packageurls diff --git a/minecode_pipelines/pipelines/mine_cpan.py b/minecode_pipelines/pipelines/mine_cpan.py new file mode 100644 index 00000000..ded51ecb --- /dev/null +++ b/minecode_pipelines/pipelines/mine_cpan.py @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. 
+# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. + +from scanpipe.pipelines import Pipeline +from scanpipe.pipes import federatedcode + +from minecode_pipelines import pipes +from minecode_pipelines.pipes import cpan + + +class MineCpan(Pipeline): + """ + Mine all packageURLs from a cpan index and publish them to + a FederatedCode repo. + """ + + @classmethod + def steps(cls): + return ( + cls.check_federatedcode_eligibility, + cls.mine_cpan_packages, + cls.mine_and_publish_cpan_packageurls, + cls.delete_cloned_repos, + ) + + def check_federatedcode_eligibility(self): + """ + Check if the project fulfills the following criteria for + pushing the project result to FederatedCode. + """ + federatedcode.check_federatedcode_configured_and_available(logger=self.log) + + def mine_cpan_packages(self): + """Mine cpan package names from cpan indexes or checkpoint.""" + self.cpan_packages_path_by_name = cpan.mine_cpan_packages(logger=self.log) + + def mine_and_publish_cpan_packageurls(self): + """Get cpan packageURLs for all mined cpan package names.""" + self.repos = cpan.mine_and_publish_cpan_packageurls( + package_path_by_name=self.cpan_packages_path_by_name, + logger=self.log, + ) + + def delete_cloned_repos(self): + pipes.delete_cloned_repos(repos=self.repos, logger=self.log) diff --git a/minecode_pipelines/pipes/cpan.py b/minecode_pipelines/pipes/cpan.py new file mode 100644 index 00000000..4c99018f --- /dev/null +++ b/minecode_pipelines/pipes/cpan.py @@ -0,0 +1,146 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. 
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. + +from minecode_pipelines import VERSION +from minecode_pipelines.pipes import write_packageurls_to_file + +from minecode_pipelines.miners.cpan import get_cpan_packages +from minecode_pipelines.miners.cpan import get_cpan_packageurls +from minecode_pipelines.miners.cpan import CPAN_REPO + +from minecode_pipelines.miners.cpan import CPAN_TYPE +from minecode_pipelines.utils import grouper + +from aboutcode.hashid import get_package_base_dir +from packageurl import PackageURL +from scanpipe.pipes.federatedcode import clone_repository + +from scanpipe.pipes.federatedcode import commit_changes +from scanpipe.pipes.federatedcode import push_changes + + +# If True, show full details on fetching packageURL for +# a package name present in the index +LOG_PACKAGEURL_DETAILS = False + +PACKAGE_BATCH_SIZE = 500 + + +# We are testing and storing mined packageURLs in one single repo per ecosystem for now +MINECODE_DATA_CPAN_REPO = "https://github.com/aboutcode-data/minecode-data-cpan-test" + + +def mine_cpan_packages(logger=None): + if logger: + logger("Getting packages from cpan index") + + package_path_by_name = get_cpan_packages(cpan_repo=CPAN_REPO, 
logger=logger) + + if logger: + packages_count = len(package_path_by_name.keys()) + logger(f"Mined {packages_count} packages from cpan index") + + return package_path_by_name + + +def mine_and_publish_cpan_packageurls(package_path_by_name, logger=None): + if not package_path_by_name: + return + + # clone repo + cloned_data_repo = clone_repository(repo_url=MINECODE_DATA_CPAN_REPO) + if logger: + logger(f"{MINECODE_DATA_CPAN_REPO} repo cloned at: {cloned_data_repo.working_dir}") + + for package_batch in grouper(n=PACKAGE_BATCH_SIZE, iterable=package_path_by_name.keys()): + packages_mined = [] + purls = [] + purl_files = [] + + if logger and LOG_PACKAGEURL_DETAILS: + logger("Starting package mining for a batch of packages") + + for package_name in package_batch: + if not package_name: + continue + + # fetch packageURLs for package + if logger and LOG_PACKAGEURL_DETAILS: + logger(f"getting packageURLs for package: {package_name}") + + path_prefix = package_path_by_name.get(package_name) + if not path_prefix: + continue + + packageurls = get_cpan_packageurls( + name=package_name, + path_prefix=path_prefix, + logger=logger, + ) + if not packageurls: + if logger and LOG_PACKAGEURL_DETAILS: + logger(f"Package versions not present for package: {package_name}") + + # We don't want to try fetching versions for these again + packages_mined.append(package_name) + continue + + # get repo and path for package + base_purl = PackageURL(type=CPAN_TYPE, name=package_name).to_string() + package_base_dir = get_package_base_dir(purl=base_purl) + + if logger and LOG_PACKAGEURL_DETAILS: + logger(f"writing packageURLs for package: {base_purl} at: {package_base_dir}") + purls_string = " ".join(packageurls) + logger(f"packageURLs: {purls_string}") + + # write packageURLs to file + purl_file = write_packageurls_to_file( + repo=cloned_data_repo, + base_dir=package_base_dir, + packageurls=packageurls, + ) + purl_files.append(purl_file) + purls.append(base_purl) + + 
packages_mined.append(package_name) + + if logger: + purls_string = " ".join(purls) + logger("Committing and pushing changes for a batch of packages: ") + logger(f"{purls_string}") + + # commit changes + commit_changes( + repo=cloned_data_repo, + files_to_commit=purl_files, + purls=purls, + mine_type="packageURL", + tool_name="pkg:cpan/minecode-pipelines", + tool_version=VERSION, + ) + + # Push changes to remote repository + push_changes(repo=cloned_data_repo) + + repos_to_clean = [cloned_data_repo] + return repos_to_clean diff --git a/pyproject-minecode_pipelines.toml b/pyproject-minecode_pipelines.toml index d8f1f586..b8411de9 100644 --- a/pyproject-minecode_pipelines.toml +++ b/pyproject-minecode_pipelines.toml @@ -42,7 +42,8 @@ dependencies = [ "scancodeio >= 35.3.0", "ftputil >= 5.1.0", "jawa >= 2.2.0", - "arrow >= 1.3.0" + "arrow >= 1.3.0", + "beautifulsoup4 >= 4.13.4" ] urls = { Homepage = "https://github.com/aboutcode-org/purldb" } @@ -54,6 +55,7 @@ mine_cargo = "minecode_pipelines.pipelines.mine_cargo:MineCargo" mine_debian = "minecode_pipelines.pipelines.mine_debian:MineDebian" mine_alpine = "minecode_pipelines.pipelines.mine_alpine:MineAlpine" mine_conan = "minecode_pipelines.pipelines.mine_conan:MineConan" +mine_cpan = "minecode_pipelines.pipelines.mine_cpan:MineCpan" [tool.bumpversion] current_version = "0.0.1b13" From 8230a2259bbe299989b0de8cb5b5249e49c4afcd Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Wed, 1 Oct 2025 04:38:10 +0530 Subject: [PATCH 2/4] Fix flaky purldb git tag tests Signed-off-by: Ayan Sinha Mahapatra --- minecode/tests/collectors/test_github.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/minecode/tests/collectors/test_github.py b/minecode/tests/collectors/test_github.py index 02a113e5..3aef499c 100644 --- a/minecode/tests/collectors/test_github.py +++ b/minecode/tests/collectors/test_github.py @@ -40,6 +40,13 @@ def test_github_get_all_versions(self): "minecode-pipelines/v0.0.1b6", 
"minecode-pipelines/v0.0.1b7", "minecode-pipelines/v0.0.1b8", + "minecode-pipelines/v0.0.1b9", + "minecode-pipelines/v0.0.1b10", + "minecode-pipelines/v0.0.1b11", + "minecode-pipelines/v0.0.1b12", + "minecode-pipelines/v0.0.1b13", + "minecode-pipelines/v0.0.1b14", + "minecode-pipelines/v0.0.1b15", ] for item in versions: self.assertIn(item, expected) From 2ee926ac3bc55aa6c86ea03ce6a87b55090693ab Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Wed, 1 Oct 2025 04:39:31 +0530 Subject: [PATCH 3/4] Bump version to v0.0.1b16 Signed-off-by: Ayan Sinha Mahapatra --- minecode_pipelines/__init__.py | 2 +- pyproject-minecode_pipelines.toml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/minecode_pipelines/__init__.py b/minecode_pipelines/__init__.py index 9f2d0162..ee4403f9 100644 --- a/minecode_pipelines/__init__.py +++ b/minecode_pipelines/__init__.py @@ -7,4 +7,4 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -VERSION = "0.0.1b13" +VERSION = "0.0.1b16" diff --git a/pyproject-minecode_pipelines.toml b/pyproject-minecode_pipelines.toml index b8411de9..8e450b8d 100644 --- a/pyproject-minecode_pipelines.toml +++ b/pyproject-minecode_pipelines.toml @@ -4,7 +4,7 @@ build-backend = "flot.buildapi" [project] name = "minecode_pipelines" -version = "0.0.1b13" +version = "0.0.1b16" description = "A library for mining packageURLs and package metadata from ecosystem repositories." 
readme = "minecode_pipelines/README.rst" license = { text = "Apache-2.0" } @@ -58,7 +58,7 @@ mine_conan = "minecode_pipelines.pipelines.mine_conan:MineConan" mine_cpan = "minecode_pipelines.pipelines.mine_cpan:MineCpan" [tool.bumpversion] -current_version = "0.0.1b13" +current_version = "0.0.1b16" allow_dirty = true files = [ From 583c4ef84e719595a162f965ef0e95951c366c93 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Wed, 1 Oct 2025 20:22:59 +0530 Subject: [PATCH 4/4] Address review feedback Signed-off-by: Ayan Sinha Mahapatra --- minecode_pipelines/miners/cpan.py | 70 ++++++++++++++++++++----------- 1 file changed, 46 insertions(+), 24 deletions(-) diff --git a/minecode_pipelines/miners/cpan.py b/minecode_pipelines/miners/cpan.py index dca0048c..a7512ebd 100644 --- a/minecode_pipelines/miners/cpan.py +++ b/minecode_pipelines/miners/cpan.py @@ -11,12 +11,10 @@ import requests from bs4 import BeautifulSoup - - from packageurl import PackageURL -from minecode_pipelines.utils import get_temp_file -from minecode_pipelines.pipes import write_data_to_json_file +from scanpipe.pipes.fetch import fetch_http + """ Visitors for cpan and cpan-like perl package repositories. @@ -28,29 +26,44 @@ def get_cpan_packages(cpan_repo=CPAN_REPO, logger=None): + """ + Get cpan package names parsed from the `02packages.details.txt` + which contains a list of all modules and their respective + package archive paths. We parse the package names and their respective + path_prefixes with author page path from this list. 
+ """ cpan_packages_url = cpan_repo + "modules/02packages.details.txt.gz" - local_filename = "cpan_packages.gz" + cpan_packages_gz_download = fetch_http(cpan_packages_url) + with gzip.open(cpan_packages_gz_download, "rb") as file_content: + packages_content = file_content.read() - response = requests.get(cpan_packages_url, stream=True) - if not response.ok: - return - - with open(local_filename, "wb") as f: - for chunk in response.iter_content(chunk_size=8192): - f.write(chunk) + package_path_by_name = {} - with gzip.open("cpan_packages.gz", "rb") as f_in: - with open("cpan_packages.txt", "wb") as f_out: - f_out.writelines(f_in) + # The ``modules/02packages.details.txt`` file has the following section + # at the beginning of the file: + # + # File: 02packages.details.txt + # URL: http://www.cpan.org/modules/02packages.details.txt + # Description: Package names found in directory $CPAN/authors/id/ + # Columns: package name, version, path + # Intended-For: Automated fetch routines, namespace documentation. 
+ # Written-By: PAUSE version 1.005 + # Line-Count: 268940 + # Last-Updated: Mon, 29 Sep 2025 22:29:02 GMT + # + # This information is there in first 9 lines, and the last line is an + # empty line, both of which we are ignoring below - with open("cpan_packages.txt", encoding="utf-8") as file: - packages_content = file.read() + modules = packages_content.split("\n")[9:-1] - package_path_by_name = {} + # A sample line from this module list looks like this: + # + # Crypt::Passphrase::SHA1::Base64 0.021 L/LE/LEONT/Crypt-Passphrase-0.021.tar.gz - modules = packages_content.split("\n")[9:-1] for module in modules: info = [section for section in module.split(" ") if section] + + # This is like: L/LE/LEONT/Crypt-Passphrase-0.021.tar.gz package_path = info[-1] path_segments = package_path.split("/") filename = path_segments.pop() @@ -60,18 +73,24 @@ def get_cpan_packages(cpan_repo=CPAN_REPO, logger=None): _version = name_version.pop() name = "-".join(name_version) + # for the above example: name: Crypt-Passphrase, path_prefix: L/LE/LEONT package_path_by_name[name] = path_prefix return package_path_by_name -def write_packages_json(packages, name): - temp_file = get_temp_file(name) - write_data_to_json_file(path=temp_file, data=packages) - return temp_file +def get_cpan_packageurls(name, path_prefix, logger=None): + """ + Given a package name and its path_prefix (author page path) + return a list of packageURLs for that package. + An author page (like https://www.cpan.org/authors/id/P/PT/PTC/) lists + all versions of all packages released by the author, so we can scrape + all the packageURLs from this author packages index. 
+ """ + + author_name = path_prefix.split("/")[-1] -def get_cpan_packageurls(name, path_prefix, logger=None): packageurls = [] # file extensions found in cpan index @@ -90,6 +109,8 @@ def get_cpan_packageurls(name, path_prefix, logger=None): logger(f"Getting package versions for {name} from {cpan_author_page_url}") soup = BeautifulSoup(response.text, "html.parser") + + # We get all the listed packages in the author page index package_list_elements = soup.find("ul").text.split("\n") package_elements = [ @@ -116,6 +137,7 @@ def get_cpan_packageurls(name, path_prefix, logger=None): for version in unique_versions: purl = PackageURL( type=CPAN_TYPE, + namespace=author_name, name=name, version=version, )