From 227d972eb6b744dcdbf005496fcecef046db715e Mon Sep 17 00:00:00 2001
From: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
Date: Wed, 1 Oct 2025 02:06:52 +0530
Subject: [PATCH 1/4] Add support for mining cpan packageURLs

Reference: https://github.com/aboutcode-org/purldb/issues/685
Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
---
 minecode_pipelines/miners/cpan.py         | 124 ++++++++++++++++++
 minecode_pipelines/pipelines/mine_cpan.py |  64 ++++++++++
 minecode_pipelines/pipes/cpan.py          | 146 ++++++++++++++++++++++
 pyproject-minecode_pipelines.toml         |   4 +-
 4 files changed, 337 insertions(+), 1 deletion(-)
 create mode 100644 minecode_pipelines/miners/cpan.py
 create mode 100644 minecode_pipelines/pipelines/mine_cpan.py
 create mode 100644 minecode_pipelines/pipes/cpan.py

diff --git a/minecode_pipelines/miners/cpan.py b/minecode_pipelines/miners/cpan.py
new file mode 100644
index 00000000..dca0048c
--- /dev/null
+++ b/minecode_pipelines/miners/cpan.py
@@ -0,0 +1,124 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/aboutcode-org/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import gzip
+import requests
+
+from bs4 import BeautifulSoup
+
+
+from packageurl import PackageURL
+
+from minecode_pipelines.utils import get_temp_file
+from minecode_pipelines.pipes import write_data_to_json_file
+
+"""
+Visitors for cpan and cpan-like perl package repositories.
+"""
+
+
+CPAN_REPO = "https://www.cpan.org/"
+CPAN_TYPE = "cpan"
+
+
+def get_cpan_packages(cpan_repo=CPAN_REPO, logger=None):
+    cpan_packages_url = cpan_repo + "modules/02packages.details.txt.gz"
+    local_filename = "cpan_packages.gz"
+
+    response = requests.get(cpan_packages_url, stream=True)
+    if not response.ok:
+        return
+
+    with open(local_filename, "wb") as f:
+        for chunk in response.iter_content(chunk_size=8192):
+            f.write(chunk)
+
+    with gzip.open("cpan_packages.gz", "rb") as f_in:
+        with open("cpan_packages.txt", "wb") as f_out:
+            f_out.writelines(f_in)
+
+    with open("cpan_packages.txt", encoding="utf-8") as file:
+        packages_content = file.read()
+
+    package_path_by_name = {}
+
+    modules = packages_content.split("\n")[9:-1]
+    for module in modules:
+        info = [section for section in module.split(" ") if section]
+        package_path = info[-1]
+        path_segments = package_path.split("/")
+        filename = path_segments.pop()
+        path_prefix = "/".join(path_segments)
+
+        name_version = filename.replace(".tar.gz", "").split("-")
+        _version = name_version.pop()
+        name = "-".join(name_version)
+
+        package_path_by_name[name] = path_prefix
+
+    return package_path_by_name
+
+
+def write_packages_json(packages, name):
+    temp_file = get_temp_file(name)
+    write_data_to_json_file(path=temp_file, data=packages)
+    return temp_file
+
+
+def get_cpan_packageurls(name, path_prefix, logger=None):
+    packageurls = []
+
+    # file extensions found in cpan index
+    ignorable_extensions = [".meta", ".readme", ".tar.gz"]
+
+    cpan_authors_path = "/authors/id/"
+    cpan_authors_url = CPAN_REPO + cpan_authors_path
+
+    cpan_author_page_url = cpan_authors_url + path_prefix
+
+    response = requests.get(cpan_author_page_url)
+    if not response.ok:
+        return packageurls
+
+    if logger:
+        logger(f"Getting package versions for {name} from {cpan_author_page_url}")
+
+    soup = BeautifulSoup(response.text, "html.parser")
+    package_list_elements = soup.find("ul").text.split("\n")
+
+    package_elements = [
+        element.replace(" ", "")
+        for element in package_list_elements
+        if element and element not in {" Parent Directory", " CHECKSUMS"}
+    ]
+
+    versions = []
+    for package_file in package_elements:
+        for extension in ignorable_extensions:
+            if extension in package_file:
+                package_file = package_file.replace(extension, "")
+
+        name_version = package_file.split("-")
+        version = name_version.pop()
+        package_name = "-".join(name_version)
+        if package_name != name:
+            continue
+
+        versions.append(version)
+
+    unique_versions = list(set(versions))
+    for version in unique_versions:
+        purl = PackageURL(
+            type=CPAN_TYPE,
+            name=name,
+            version=version,
+        )
+        packageurls.append(purl.to_string())
+
+    return packageurls
diff --git a/minecode_pipelines/pipelines/mine_cpan.py b/minecode_pipelines/pipelines/mine_cpan.py
new file mode 100644
index 00000000..ded51ecb
--- /dev/null
+++ b/minecode_pipelines/pipelines/mine_cpan.py
@@ -0,0 +1,64 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# http://nexb.com and https://github.com/aboutcode-org/scancode.io
+# The ScanCode.io software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode.io is provided as-is without warranties.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode.io should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+#
+# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/aboutcode-org/scancode.io for support and download.
+
+from scanpipe.pipelines import Pipeline
+from scanpipe.pipes import federatedcode
+
+from minecode_pipelines import pipes
+from minecode_pipelines.pipes import cpan
+
+
+class MineCpan(Pipeline):
+    """
+    Mine all packageURLs from a cpan index and publish them to
+    a FederatedCode repo.
+    """
+
+    @classmethod
+    def steps(cls):
+        return (
+            cls.check_federatedcode_eligibility,
+            cls.mine_cpan_packages,
+            cls.mine_and_publish_cpan_packageurls,
+            cls.delete_cloned_repos,
+        )
+
+    def check_federatedcode_eligibility(self):
+        """
+        Check if the project fulfills the following criteria for
+        pushing the project result to FederatedCode.
+        """
+        federatedcode.check_federatedcode_configured_and_available(logger=self.log)
+
+    def mine_cpan_packages(self):
+        """Mine cpan package names from cpan indexes or checkpoint."""
+        self.cpan_packages_path_by_name = cpan.mine_cpan_packages(logger=self.log)
+
+    def mine_and_publish_cpan_packageurls(self):
+        """Get cpan packageURLs for all mined cpan package names."""
+        self.repos = cpan.mine_and_publish_cpan_packageurls(
+            package_path_by_name=self.cpan_packages_path_by_name,
+            logger=self.log,
+        )
+
+    def delete_cloned_repos(self):
+        pipes.delete_cloned_repos(repos=self.repos, logger=self.log)
diff --git a/minecode_pipelines/pipes/cpan.py b/minecode_pipelines/pipes/cpan.py
new file mode 100644
index 00000000..4c99018f
--- /dev/null
+++ b/minecode_pipelines/pipes/cpan.py
@@ -0,0 +1,146 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# http://nexb.com and https://github.com/aboutcode-org/scancode.io
+# The ScanCode.io software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode.io is provided as-is without warranties.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode.io should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+#
+# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/aboutcode-org/scancode.io for support and download.
+
+from minecode_pipelines import VERSION
+from minecode_pipelines.pipes import write_packageurls_to_file
+
+from minecode_pipelines.miners.cpan import get_cpan_packages
+from minecode_pipelines.miners.cpan import get_cpan_packageurls
+from minecode_pipelines.miners.cpan import CPAN_REPO
+
+from minecode_pipelines.miners.cpan import CPAN_TYPE
+from minecode_pipelines.utils import grouper
+
+from aboutcode.hashid import get_package_base_dir
+from packageurl import PackageURL
+from scanpipe.pipes.federatedcode import clone_repository
+
+from scanpipe.pipes.federatedcode import commit_changes
+from scanpipe.pipes.federatedcode import push_changes
+
+
+# If True, show full details on fetching packageURL for
+# a package name present in the index
+LOG_PACKAGEURL_DETAILS = False
+
+PACKAGE_BATCH_SIZE = 500
+
+
+# We are testing and storing mined packageURLs in one single repo per ecosystem for now
+MINECODE_DATA_CPAN_REPO = "https://github.com/aboutcode-data/minecode-data-cpan-test"
+
+
+def mine_cpan_packages(logger=None):
+    if logger:
+        logger("Getting packages from cpan index")
+
+    package_path_by_name = get_cpan_packages(cpan_repo=CPAN_REPO, logger=logger)
+
+    if logger:
+        packages_count = len(package_path_by_name.keys())
+        logger(f"Mined {packages_count} packages from cpan index")
+
+    return package_path_by_name
+
+
+def mine_and_publish_cpan_packageurls(package_path_by_name, logger=None):
+    if not package_path_by_name:
+        return
+
+    # clone repo
+    cloned_data_repo = clone_repository(repo_url=MINECODE_DATA_CPAN_REPO)
+    if logger:
+        logger(f"{MINECODE_DATA_CPAN_REPO} repo cloned at: {cloned_data_repo.working_dir}")
+
+    for package_batch in grouper(n=PACKAGE_BATCH_SIZE, iterable=package_path_by_name.keys()):
+        packages_mined = []
+        purls = []
+        purl_files = []
+
+        if logger and LOG_PACKAGEURL_DETAILS:
+            logger("Starting package mining for a batch of packages")
+
+        for package_name in package_batch:
+            if not package_name:
+                continue
+
+            # fetch packageURLs for package
+            if logger and LOG_PACKAGEURL_DETAILS:
+                logger(f"getting packageURLs for package: {package_name}")
+
+            path_prefix = package_path_by_name.get(package_name)
+            if not path_prefix:
+                continue
+
+            packageurls = get_cpan_packageurls(
+                name=package_name,
+                path_prefix=path_prefix,
+                logger=logger,
+            )
+            if not packageurls:
+                if logger and LOG_PACKAGEURL_DETAILS:
+                    logger(f"Package versions not present for package: {package_name}")
+
+                # We don't want to try fetching versions for these again
+                packages_mined.append(package_name)
+                continue
+
+            # get repo and path for package
+            base_purl = PackageURL(type=CPAN_TYPE, name=package_name).to_string()
+            package_base_dir = get_package_base_dir(purl=base_purl)
+
+            if logger and LOG_PACKAGEURL_DETAILS:
+                logger(f"writing packageURLs for package: {base_purl} at: {package_base_dir}")
+                purls_string = " ".join(packageurls)
+                logger(f"packageURLs: {purls_string}")
+
+            # write packageURLs to file
+            purl_file = write_packageurls_to_file(
+                repo=cloned_data_repo,
+                base_dir=package_base_dir,
+                packageurls=packageurls,
+            )
+            purl_files.append(purl_file)
+            purls.append(base_purl)
+
+            packages_mined.append(package_name)
+
+        if logger:
+            purls_string = " ".join(purls)
+            logger("Committing and pushing changes for a batch of packages: ")
+            logger(f"{purls_string}")
+
+        # commit changes
+        commit_changes(
+            repo=cloned_data_repo,
+            files_to_commit=purl_files,
+            purls=purls,
+            mine_type="packageURL",
+            tool_name="pkg:cpan/minecode-pipelines",
+            tool_version=VERSION,
+        )
+
+        # Push changes to remote repository
+        push_changes(repo=cloned_data_repo)
+
+    repos_to_clean = [cloned_data_repo]
+    return repos_to_clean
diff --git a/pyproject-minecode_pipelines.toml b/pyproject-minecode_pipelines.toml
index d8f1f586..b8411de9 100644
--- a/pyproject-minecode_pipelines.toml
+++ b/pyproject-minecode_pipelines.toml
@@ -42,7 +42,8 @@ dependencies = [
     "scancodeio >= 35.3.0",
     "ftputil >= 5.1.0",
     "jawa >= 2.2.0",
-    "arrow >= 1.3.0"
+    "arrow >= 1.3.0",
+    "beautifulsoup4 >= 4.13.4"
 ]
 
 urls = { Homepage = "https://github.com/aboutcode-org/purldb" }
@@ -54,6 +55,7 @@ mine_cargo = "minecode_pipelines.pipelines.mine_cargo:MineCargo"
 mine_debian = "minecode_pipelines.pipelines.mine_debian:MineDebian"
 mine_alpine = "minecode_pipelines.pipelines.mine_alpine:MineAlpine"
 mine_conan = "minecode_pipelines.pipelines.mine_conan:MineConan"
+mine_cpan = "minecode_pipelines.pipelines.mine_cpan:MineCpan"
 
 [tool.bumpversion]
 current_version = "0.0.1b13"

From 8230a2259bbe299989b0de8cb5b5249e49c4afcd Mon Sep 17 00:00:00 2001
From: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
Date: Wed, 1 Oct 2025 04:38:10 +0530
Subject: [PATCH 2/4] Fix flaky purldb git tag tests

Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
---
 minecode/tests/collectors/test_github.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/minecode/tests/collectors/test_github.py b/minecode/tests/collectors/test_github.py
index 02a113e5..3aef499c 100644
--- a/minecode/tests/collectors/test_github.py
+++ b/minecode/tests/collectors/test_github.py
@@ -40,6 +40,13 @@ def test_github_get_all_versions(self):
             "minecode-pipelines/v0.0.1b6",
             "minecode-pipelines/v0.0.1b7",
             "minecode-pipelines/v0.0.1b8",
+            "minecode-pipelines/v0.0.1b9",
+            "minecode-pipelines/v0.0.1b10",
+            "minecode-pipelines/v0.0.1b11",
+            "minecode-pipelines/v0.0.1b12",
+            "minecode-pipelines/v0.0.1b13",
+            "minecode-pipelines/v0.0.1b14",
+            "minecode-pipelines/v0.0.1b15",
         ]
         for item in versions:
             self.assertIn(item, expected)

From 2ee926ac3bc55aa6c86ea03ce6a87b55090693ab Mon Sep 17 00:00:00 2001
From: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
Date: Wed, 1 Oct 2025 04:39:31 +0530
Subject: [PATCH 3/4] Bump version to v0.0.1b16

Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
---
 minecode_pipelines/__init__.py    | 2 +-
 pyproject-minecode_pipelines.toml | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/minecode_pipelines/__init__.py b/minecode_pipelines/__init__.py
index 9f2d0162..ee4403f9 100644
--- a/minecode_pipelines/__init__.py
+++ b/minecode_pipelines/__init__.py
@@ -7,4 +7,4 @@
 # See https://aboutcode.org for more information about nexB OSS projects.
 #
 
-VERSION = "0.0.1b13"
+VERSION = "0.0.1b16"
diff --git a/pyproject-minecode_pipelines.toml b/pyproject-minecode_pipelines.toml
index b8411de9..8e450b8d 100644
--- a/pyproject-minecode_pipelines.toml
+++ b/pyproject-minecode_pipelines.toml
@@ -4,7 +4,7 @@ build-backend = "flot.buildapi"
 
 [project]
 name = "minecode_pipelines"
-version = "0.0.1b13"
+version = "0.0.1b16"
 description = "A library for mining packageURLs and package metadata from ecosystem repositories."
 readme = "minecode_pipelines/README.rst"
 license = { text = "Apache-2.0" }
@@ -58,7 +58,7 @@ mine_conan = "minecode_pipelines.pipelines.mine_conan:MineConan"
 mine_cpan = "minecode_pipelines.pipelines.mine_cpan:MineCpan"
 
 [tool.bumpversion]
-current_version = "0.0.1b13"
+current_version = "0.0.1b16"
 allow_dirty = true
 
 files = [

From 583c4ef84e719595a162f965ef0e95951c366c93 Mon Sep 17 00:00:00 2001
From: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
Date: Wed, 1 Oct 2025 20:22:59 +0530
Subject: [PATCH 4/4] Address review feedback

Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
---
 minecode_pipelines/miners/cpan.py | 70 ++++++++++++++++++++-----------
 1 file changed, 46 insertions(+), 24 deletions(-)

diff --git a/minecode_pipelines/miners/cpan.py b/minecode_pipelines/miners/cpan.py
index dca0048c..a7512ebd 100644
--- a/minecode_pipelines/miners/cpan.py
+++ b/minecode_pipelines/miners/cpan.py
@@ -11,12 +11,10 @@
 import requests
 
 from bs4 import BeautifulSoup
-
-
 from packageurl import PackageURL
 
-from minecode_pipelines.utils import get_temp_file
-from minecode_pipelines.pipes import write_data_to_json_file
+from scanpipe.pipes.fetch import fetch_http
+
 
 """
 Visitors for cpan and cpan-like perl package repositories.
@@ -28,29 +26,44 @@
 
 
 def get_cpan_packages(cpan_repo=CPAN_REPO, logger=None):
+    """
+    Return a mapping of cpan package name to its author path prefix, parsed
+    from the `02packages.details.txt` index of `cpan_repo`, which contains a
+    list of all modules and their respective package archive paths.
+    The gzipped index is fetched with `fetch_http` and decoded as UTF-8 text.
+    """
     cpan_packages_url = cpan_repo + "modules/02packages.details.txt.gz"
-    local_filename = "cpan_packages.gz"
+    download = fetch_http(cpan_packages_url)  # may be a Download record; use its path if any
+    with gzip.open(getattr(download, "path", download), "rt", encoding="utf-8") as file_content:
+        packages_content = file_content.read()  # str: the "\n" split below fails on bytes
 
-    response = requests.get(cpan_packages_url, stream=True)
-    if not response.ok:
-        return
-
-    with open(local_filename, "wb") as f:
-        for chunk in response.iter_content(chunk_size=8192):
-            f.write(chunk)
+    package_path_by_name = {}
 
-    with gzip.open("cpan_packages.gz", "rb") as f_in:
-        with open("cpan_packages.txt", "wb") as f_out:
-            f_out.writelines(f_in)
+    # The ``modules/02packages.details.txt`` file has the following section
+    # at the beginning of the file:
+    #
+    # File:         02packages.details.txt
+    # URL:          http://www.cpan.org/modules/02packages.details.txt
+    # Description:  Package names found in directory $CPAN/authors/id/
+    # Columns:      package name, version, path
+    # Intended-For: Automated fetch routines, namespace documentation.
+    # Written-By:   PAUSE version 1.005
+    # Line-Count:   268940
+    # Last-Updated: Mon, 29 Sep 2025 22:29:02 GMT
+    #
+    # This header takes up the first 9 lines (the 8 fields above plus the blank
+    # line that follows them) and the last line is empty; we skip both below
 
-    with open("cpan_packages.txt", encoding="utf-8") as file:
-        packages_content = file.read()
+    modules = packages_content.split("\n")[9:-1]
 
-    package_path_by_name = {}
+    # A sample line from this module list looks like this:
+    #
+    # Crypt::Passphrase::SHA1::Base64   0.021  L/LE/LEONT/Crypt-Passphrase-0.021.tar.gz
 
-    modules = packages_content.split("\n")[9:-1]
     for module in modules:
         info = [section for section in module.split(" ") if section]
+
+        # This is like: L/LE/LEONT/Crypt-Passphrase-0.021.tar.gz
         package_path = info[-1]
         path_segments = package_path.split("/")
         filename = path_segments.pop()
@@ -60,18 +73,24 @@ def get_cpan_packages(cpan_repo=CPAN_REPO, logger=None):
         _version = name_version.pop()
         name = "-".join(name_version)
 
+        # for the above example: name: Crypt-Passphrase, path_prefix: L/LE/LEONT
         package_path_by_name[name] = path_prefix
 
     return package_path_by_name
 
 
-def write_packages_json(packages, name):
-    temp_file = get_temp_file(name)
-    write_data_to_json_file(path=temp_file, data=packages)
-    return temp_file
+def get_cpan_packageurls(name, path_prefix, logger=None):
+    """
+    Given a package name and its path_prefix (author page path)
+    return a list of packageURLs for that package.
 
+    An author page (like https://www.cpan.org/authors/id/P/PT/PTC/) lists
+    all versions of all packages released by the author, so we can scrape
+    all the packageURLs from this author packages index.
+    """
+
+    author_name = path_prefix.split("/")[-1]
 
-def get_cpan_packageurls(name, path_prefix, logger=None):
     packageurls = []
 
     # file extensions found in cpan index
@@ -90,6 +109,8 @@ def get_cpan_packageurls(name, path_prefix, logger=None):
         logger(f"Getting package versions for {name} from {cpan_author_page_url}")
 
     soup = BeautifulSoup(response.text, "html.parser")
+
+    # We get all the listed packages in the author page index
     package_list_elements = soup.find("ul").text.split("\n")
 
     package_elements = [
@@ -116,6 +137,7 @@ def get_cpan_packageurls(name, path_prefix, logger=None):
     for version in unique_versions:
         purl = PackageURL(
             type=CPAN_TYPE,
+            namespace=author_name,
             name=name,
             version=version,
         )