Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

purl2sym: Add metadata support for linux, mtd-utils, barebox, e2fsprogs and erofs-utils #113

Merged
merged 8 commits into from
Mar 28, 2024
164 changes: 133 additions & 31 deletions src/fetchcode/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,22 @@ def get_github_data_for_miniupnp(purl):
)


@router.route("pkg:generic/erofs-utils.*")
def get_github_data_for_erofs_utils(purl):
    """
    Return the erofs-utils package information obtained from GitHub.

    The generic ``purl`` string is parsed only for its version; that version
    is then looked up against the ``erofs/erofs-utils`` GitHub repository
    through ``GitHubSource.get_package_info``.
    """
    requested_version = PackageURL.from_string(purl).version

    return GitHubSource.get_package_info(
        PackageURL(
            type="github",
            namespace="erofs",
            name="erofs-utils",
            version=requested_version,
        )
    )


@router.route("pkg:bitbucket/.*")
def get_bitbucket_data_from_purl(purl):
"""
Expand Down Expand Up @@ -317,9 +333,12 @@ def get_gnu_data_from_purl(purl):
"""Generate `Package` object from the `purl` string of gnu type"""
purl = PackageURL.from_string(purl)
source_archive_url = f"https://ftp.gnu.org/pub/gnu/{purl.name}/"
regex = r"^({}-)([\w.-]*)(.tar.gz)$".format(purl.name)
version_regex_template = r"^({}-)(?P<version>[\w.-]*)(.tar.gz)$"
version_regex = re.compile(version_regex_template.format(re.escape(purl.name)))

yield from extract_packages_from_listing(purl, source_archive_url, regex, [])
yield from extract_packages_from_listing(
purl, source_archive_url, version_regex, []
)


@dataclasses.dataclass
Expand All @@ -333,8 +352,8 @@ class DirectoryListedSource:
"description": "Flag indicating whether the archives are nested within another directory"
},
)
source_archive_regex: str = dataclasses.field(
default="",
source_archive_regex: re.Pattern = dataclasses.field(
default=None,
metadata={
"description": "Regular expression pattern to match files in the directory listing."
},
Expand Down Expand Up @@ -368,7 +387,9 @@ def get_package_info(cls, package_url):
class IpkgDirectoryListedSource(DirectoryListedSource):
source_url = "https://web.archive.org/web/20090326020239/http://handhelds.org/download/packages/ipkg/"
is_nested = False
source_archive_regex = r"^(ipkg[-_])([\w.-]*)(_arm.ipk|.tar.gz)$"
source_archive_regex = re.compile(
r"^(ipkg[-_])(?P<version>[\w.-]*)(_arm.ipk|.tar.gz)$"
)
ignored_files_and_dir = []

@classmethod
Expand Down Expand Up @@ -398,132 +419,163 @@ def get_package_info(cls, package_url):
class UtilLinuxDirectoryListedSource(DirectoryListedSource):
source_url = "https://mirrors.edge.kernel.org/pub/linux/utils/util-linux/"
is_nested = True
source_archive_regex = r"^(util-linux-)([\w.-]*)(.tar.gz)$"
# Source archive ex: util-linux-1.2.3.tar.gz
source_archive_regex = re.compile(r"^(util-linux-)(?P<version>[\w.-]*)(.tar.gz)$")
ignored_files_and_dir = []


class BusyBoxDirectoryListedSource(DirectoryListedSource):
source_url = "https://www.busybox.net/downloads/"
source_archive_regex = r"^(busybox-)([\w.-]*)(.tar.bz2)$"
# Source archive ex: busybox-1.2.3.tar.bz2
source_archive_regex = re.compile(r"^(busybox-)(?P<version>[\w.-]*)(.tar.bz2)$")
is_nested = False
ignored_files_and_dir = []


class UclibcDirectoryListedSource(DirectoryListedSource):
source_url = "https://www.uclibc.org/downloads/"
source_archive_regex = r"^(uClibc-)([\w.-]*)(.tar.gz)$"
# Source archive ex: uClibc-1.2.3.tar.gz
source_archive_regex = re.compile(r"^(uClibc-)(?P<version>[\w.-]*)(.tar.gz)$")
is_nested = False
ignored_files_and_dir = []


class UclibcNGDirectoryListedSource(DirectoryListedSource):
source_url = "https://downloads.uclibc-ng.org/releases/"
source_archive_regex = r"^(uClibc-ng-)([\w.-]*)(.tar.gz)$"
# Source archive ex: uClibc-ng-1.2.3.tar.gz
source_archive_regex = re.compile(r"^(uClibc-ng-)(?P<version>[\w.-]*)(.tar.gz)$")
is_nested = True
ignored_files_and_dir = []


class Bzip2DirectoryListedSource(DirectoryListedSource):
source_url = "https://sourceware.org/pub/bzip2/"
source_archive_regex = r"^(bzip2-)([\w.-]*)(.tar.gz)$"
# Source archive ex: bzip2-1.2.3.tar.gz
source_archive_regex = re.compile(r"^(bzip2-)(?P<version>[\w.-]*)(.tar.gz)$")
is_nested = False
ignored_files_and_dir = []


class OpenSSHDirectoryListedSource(DirectoryListedSource):
source_url = "https://cdn.openbsd.org/pub/OpenBSD/OpenSSH/"
source_archive_regex = r"^(openssh-)([\w.-]*)(.tgz|.tar.gz)$"
# Source archive ex: openssh-1.2.3.tar.gz
source_archive_regex = re.compile(r"^(openssh-)(?P<version>[\w.-]*)(.tgz|.tar.gz)$")
is_nested = False
ignored_files_and_dir = []


class DnsmasqDirectoryListedSource(DirectoryListedSource):
source_url = "https://thekelleys.org.uk/dnsmasq/"
source_archive_regex = r"^(dnsmasq-)([\w.-]*)(.tar.xz|.tar.gz)$"
# Source archive ex: dnsmasq-1.2.3.tar.gz
source_archive_regex = re.compile(
r"^(dnsmasq-)(?P<version>[\w.-]*)(.tar.xz|.tar.gz)$"
)
is_nested = False
ignored_files_and_dir = []


class EbtablesDirectoryListedSource(DirectoryListedSource):
source_url = "https://www.netfilter.org/pub/ebtables/"
source_archive_regex = r"^(ebtables-)([\w.-]*)(.tar.gz)$"
# Source archive ex: ebtables-1.2.3.tar.gz
source_archive_regex = re.compile(r"^(ebtables-)(?P<version>[\w.-]*)(.tar.gz)$")
is_nested = False
ignored_files_and_dir = []


class HostapdDirectoryListedSource(DirectoryListedSource):
source_url = "https://w1.fi/releases/"
source_archive_regex = r"^(hostapd-)([\w.-]*)(.tar.gz)$"
# Source archive ex: hostapd-1.2.3.tar.gz
source_archive_regex = re.compile(r"^(hostapd-)(?P<version>[\w.-]*)(.tar.gz)$")
is_nested = False
ignored_files_and_dir = []


class Iproute2DirectoryListedSource(DirectoryListedSource):
source_url = "https://mirrors.edge.kernel.org/pub/linux/utils/net/iproute2/"
source_archive_regex = r"^(iproute2-)([\w.-]*)(.tar.xz|.tar.gz)$"
source_archive_regex = re.compile(
# Source archive ex: iproute2-1.2.3.tar.gz
r"^(iproute2-)(?P<version>[\w.-]*)(.tar.xz|.tar.gz)$"
)
is_nested = False
ignored_files_and_dir = []


class IptablesDirectoryListedSource(DirectoryListedSource):
source_url = "https://www.netfilter.org/pub/iptables/"
source_archive_regex = r"^(iptables-)([\w.-]*)(.tar.bz2)$"
# Source archive ex: iptables-1.2.3.tar.bz2
source_archive_regex = re.compile(r"^(iptables-)(?P<version>[\w.-]*)(.tar.bz2)$")
is_nested = False
ignored_files_and_dir = []


class LibnlDirectoryListedSource(DirectoryListedSource):
source_url = "https://www.infradead.org/~tgr/libnl/files/"
source_archive_regex = r"^(libnl-)([\w.-]*)(.tar.gz)$"
# Source archive ex: libnl-1.2.3.tar.gz
source_archive_regex = re.compile(r"^(libnl-)(?P<version>[\w.-]*)(.tar.gz)$")
is_nested = False
ignored_files_and_dir = []


class LighttpdDirectoryListedSource(DirectoryListedSource):
source_url = "https://download.lighttpd.net/lighttpd/releases-1.4.x/"
source_archive_regex = r"^(lighttpd-)([\w.-]*)(.tar.gz)$"
# Source archive ex: lighttpd-1.2.3.tar.gz
source_archive_regex = re.compile(r"^(lighttpd-)(?P<version>[\w.-]*)(.tar.gz)$")
is_nested = False
ignored_files_and_dir = []


class NftablesDirectoryListedSource(DirectoryListedSource):
source_url = "https://www.netfilter.org/pub/nftables/"
source_archive_regex = r"^(nftables-)([\w.-]*)(.tar.xz|.tar.bz2)$"
# Source archive ex: nftables-1.2.3.tar.bz2
source_archive_regex = re.compile(
r"^(nftables-)(?P<version>[\w.-]*)(.tar.xz|.tar.bz2)$"
)
is_nested = False
ignored_files_and_dir = []


class WpaSupplicantDirectoryListedSource(DirectoryListedSource):
source_url = "https://w1.fi/releases/"
source_archive_regex = r"^(wpa_supplicant-)([\w.-]*)(.tar.gz)$"
# Source archive ex: wpa_supplicant-1.2.3.tar.gz
source_archive_regex = re.compile(
r"^(wpa_supplicant-)(?P<version>[\w.-]*)(.tar.gz)$"
)
is_nested = False
ignored_files_and_dir = []


class SyslinuxDirectoryListedSource(DirectoryListedSource):
source_url = "https://mirrors.edge.kernel.org/pub/linux/utils/boot/syslinux/"
source_archive_regex = r"^(syslinux-)([\w.-]*)(.tar.gz)$"
# Source archive ex: syslinux-1.2.3.tar.gz
source_archive_regex = re.compile(r"^(syslinux-)(?P<version>[\w.-]*)(.tar.gz)$")
is_nested = False
ignored_files_and_dir = []


class SyslinuxDirectoryListedSource(DirectoryListedSource):
source_url = "https://mirrors.edge.kernel.org/pub/linux/utils/boot/syslinux/"
source_archive_regex = r"^(syslinux-)([\w.-]*)(.tar.gz)$"
# Source archive ex: syslinux-1.2.3.tar.gz
source_archive_regex = re.compile(r"^(syslinux-)(?P<version>[\w.-]*)(.tar.gz)$")
is_nested = False
ignored_files_and_dir = []


class ToyboxDirectoryListedSource(DirectoryListedSource):
source_url = "http://www.landley.net/toybox/downloads/"
source_archive_regex = r"^(toybox-)([\w.-]*)(.tar.gz|.tar.bz2)$"
# Source archive ex: toybox-1.2.3.tar.gz
source_archive_regex = re.compile(
r"^(toybox-)(?P<version>[\w.-]*)(.tar.gz|.tar.bz2)$"
)
is_nested = False
ignored_files_and_dir = []


class DropbearDirectoryListedSource(DirectoryListedSource):
source_url = "https://matt.ucc.asn.au/dropbear/releases/"
source_archive_regex = r"^(dropbear-)([\w.-]*)(.tar.bz2|_i386.deb)$"
# Source archive ex: dropbear-1.2.3.tar.bz2
source_archive_regex = re.compile(
r"^(dropbear-)(?P<version>[\w.-]*)(.tar.bz2|_i386.deb)$"
)
is_nested = False
ignored_files_and_dir = [
"dropbear-0.44test1.tar.bz2",
Expand All @@ -539,10 +591,53 @@ class DropbearDirectoryListedSource(DirectoryListedSource):

class SambaDirectoryListedSource(DirectoryListedSource):
source_url = "https://download.samba.org/pub/samba/stable/"
source_archive_regex = r"^(samba-)([\w.-]*)(.tar.gz)$"
# Source archive ex: samba-1.2.3.tar.gz
source_archive_regex = re.compile(r"^(samba-)(?P<version>[\w.-]*)(.tar.gz)$")
is_nested = False
ignored_files_and_dir = []


class MtdUtilsDirectoryListedSource(DirectoryListedSource):
    """Directory-listed source for mtd-utils release archives."""

    # Directory listing that holds the release archives.
    source_url = "https://infraroot.at/pub/mtd/"
    # Source archive ex: mtd-utils-1.2.3.tar.bz2
    # The dots of the suffix are escaped so that only a literal ".tar.bz2"
    # matches; a bare "." would accept any character in those positions.
    source_archive_regex = re.compile(
        r"^(mtd-utils-)(?P<version>[\w.-]*)(\.tar\.bz2)$"
    )
    # Archives sit directly in the listing, not inside sub-directories.
    is_nested = False
    # No listing entries are excluded by name.
    ignored_files_and_dir = []


class BareboxDirectoryListedSource(DirectoryListedSource):
    """Directory-listed source for barebox release archives."""

    # Directory listing that holds the release archives.
    source_url = "https://www.barebox.org/download/"
    # Source archive ex: barebox-1.2.3.tar.bz2
    # The dots of the suffix are escaped so that only a literal ".tar.bz2"
    # matches; a bare "." would accept any character in those positions.
    source_archive_regex = re.compile(
        r"^(barebox-)(?P<version>[\w.-]*)(\.tar\.bz2)$"
    )
    # Archives sit directly in the listing, not inside sub-directories.
    is_nested = False
    # No listing entries are excluded by name.
    ignored_files_and_dir = []

class LinuxDirectoryListedSource(DirectoryListedSource):
    """Directory-listed source for Linux kernel release archives."""

    # Top-level directory listing; archives live in its sub-directories.
    source_url = "https://cdn.kernel.org/pub/linux/kernel/"
    # Source archive ex: linux-1.2.3.tar.gz
    # The dots of the suffix are escaped so that only a literal ".tar.gz"
    # matches; a bare "." would accept any character in those positions.
    source_archive_regex = re.compile(
        r"^(linux-)(?P<version>[\w.-]*)(\.tar\.gz)$"
    )
    # Archives are nested within another directory of the listing.
    is_nested = True
    # Entries of the listing that are skipped by name.
    ignored_files_and_dir = [
        "Historic/",
        "SillySounds/",
        "crypto/",
        "firmware/",
        "next/",
        "people/",
        "ports/",
        "projects/",
        "testing/",
        "tools/",
        "uemacs/",
    ]

class E2fsprogsDirectoryListedSource(DirectoryListedSource):
    """Directory-listed source for e2fsprogs release archives."""

    # Top-level directory listing; archives live in its sub-directories.
    source_url = (
        "https://mirrors.edge.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/"
    )
    # Source archive ex: e2fsprogs-1.2.3.tar.gz
    # The dots of the suffix are escaped so that only a literal ".tar.gz"
    # matches; a bare "." would accept any character in those positions.
    source_archive_regex = re.compile(
        r"^(e2fsprogs-)(?P<version>[\w.-]*)(\.tar\.gz)$"
    )
    # Archives are nested within another directory of the listing.
    is_nested = True
    # Entries of the listing that are skipped by name.
    ignored_files_and_dir = ["testing/"]


DIR_SUPPORTED_PURLS = [
"pkg:generic/busybox.*",
Expand All @@ -566,6 +661,10 @@ class SambaDirectoryListedSource(DirectoryListedSource):
"pkg:generic/util-linux.*",
"pkg:generic/wpa_supplicant.*",
"pkg:generic/ipkg.*",
"pkg:generic/mtd-utils.*",
"pkg:generic/barebox.*",
"pkg:generic/linux.*",
"pkg:generic/e2fsprogs.*",
]

DIR_LISTED_SOURCE_BY_PACKAGE_NAME = {
Expand All @@ -589,6 +688,10 @@ class SambaDirectoryListedSource(DirectoryListedSource):
"util-linux": UtilLinuxDirectoryListedSource,
"wpa_supplicant": WpaSupplicantDirectoryListedSource,
"ipkg": IpkgDirectoryListedSource,
"mtd-utils": MtdUtilsDirectoryListedSource,
"barebox": BareboxDirectoryListedSource,
"linux": LinuxDirectoryListedSource,
"e2fsprogs": E2fsprogsDirectoryListedSource,
}


Expand All @@ -605,18 +708,17 @@ def get_packages_from_listing(purl, source_archive_url, regex, ignored_files_and
"""
Return list of package data from a directory listing based on the specified regex.
"""
pattern = re.compile(regex)
_, listing = htmllistparse.fetch_listing(source_archive_url)

packages = []
for file in listing:
if not pattern.match(file.name) or file.name in ignored_files_and_dir:
match = regex.match(file.name)
if not match or file.name in ignored_files_and_dir:
continue

match = re.search(regex, file.name)
version = match.group(2)
version = match.group("version")
version = version.strip("v").strip()
if not version:
if not version or not version[0].isdigit():
continue

modified_time = file.modified
Expand Down Expand Up @@ -668,7 +770,7 @@ def extract_package_from_nested_listing(purl, source_url, regex, ignored_files_a
"""
_, listing = htmllistparse.fetch_listing(source_url)
for directory in listing:
if not directory.name.endswith("/"):
if not directory.name.endswith("/") or directory.name in ignored_files_and_dir:
continue

directory_url = urljoin(source_url, directory.name)
Expand Down
2 changes: 1 addition & 1 deletion src/fetchcode/package_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def _get_github_packages(purl, version_regex, ignored_tag_regex, default_package
version = tag

version = version.strip("Vv").strip()
if not version:
if not version or not version[0].isdigit():
continue

download_url = archive_download_url.format(
Expand Down
Loading
Loading