Skip to content

Commit

Permalink
Merge pull request #1188 from TG1999/clean_import_data
Browse files Browse the repository at this point in the history
Clean imported data after import process
  • Loading branch information
TG1999 authored May 15, 2023
2 parents 2646d7e + dcbf076 commit 87b4ebc
Show file tree
Hide file tree
Showing 13 changed files with 125 additions and 107 deletions.
44 changes: 10 additions & 34 deletions vulnerabilities/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

import pytz
from dateutil import parser as dateparser
from fetchcode.vcs import VCSResponse
from fetchcode.vcs import fetch_via_vcs
from license_expression import Licensing
from packageurl import PackageURL
Expand Down Expand Up @@ -288,6 +289,10 @@ class InvalidSPDXLicense(Exception):
pass


class ForkError(Exception):
pass


class Importer:
"""
An Importer collects data from various upstreams and returns corresponding AdvisoryData objects
Expand All @@ -297,7 +302,7 @@ class Importer:
spdx_license_expression = ""
license_url = ""
notice = ""
vcs_response = None
vcs_response: VCSResponse = None

def __init__(self):
if not self.spdx_license_expression:
Expand All @@ -324,47 +329,18 @@ def advisory_data(self) -> Iterable[AdvisoryData]:
raise NotImplementedError

def clone(self, repo_url):
"""
Clone the repo at repo_url and return the VCSResponse object
"""
try:
self.vcs_response = fetch_via_vcs(repo_url)
return self.vcs_response
except Exception as e:
msg = f"Failed to fetch {repo_url} via vcs: {e}"
logger.error(msg)
raise ForkError(msg) from e


class ForkError(Exception):
pass


class GitImporter(Importer):
def __init__(self, repo_url):
super().__init__()
self.repo_url = repo_url
self.vcs_response = None

def __enter__(self):
super().__enter__()
self.clone()
return self

def __exit__(self):
self.vcs_response.delete()

def clone(self):
try:
self.vcs_response = fetch_via_vcs(self.repo_url)
except Exception as e:
msg = f"Failed to fetch {self.repo_url} via vcs: {e}"
logger.error(msg)
raise ForkError(msg) from e

def advisory_data(self) -> Iterable[AdvisoryData]:
"""
Return AdvisoryData objects corresponding to the data being imported
"""
raise NotImplementedError


# TODO: Needs rewrite
class OvalImporter(Importer):
"""
Expand Down
2 changes: 1 addition & 1 deletion vulnerabilities/importers/elixir_security.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class ElixirSecurityImporter(Importer):

def advisory_data(self) -> Set[AdvisoryData]:
try:
self.clone(self.repo_url)
self.clone(repo_url=self.repo_url)
path = Path(self.vcs_response.dest_dir)
vuln = path / "packages"
for file in vuln.glob("**/*.yml"):
Expand Down
36 changes: 19 additions & 17 deletions vulnerabilities/importers/fireeye.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,15 @@
from typing import List

from vulnerabilities.importer import AdvisoryData
from vulnerabilities.importer import GitImporter
from vulnerabilities.importer import Importer
from vulnerabilities.importer import Reference
from vulnerabilities.utils import build_description
from vulnerabilities.utils import dedupe

logger = logging.getLogger(__name__)


class FireyeImporter(GitImporter):
class FireyeImporter(Importer):
spdx_license_expression = "CC-BY-SA-4.0 AND MIT"
license_url = "https://github.com/mandiant/Vulnerability-Disclosures/blob/master/README.md"
notice = """
Expand All @@ -30,23 +30,25 @@ class FireyeImporter(GitImporter):
1. CC BY-SA 4.0 - For CVE related information not including source code (such as PoCs)
2. MIT - For source code contained within provided CVE information
"""

def __init__(self):
super().__init__(repo_url="git+https://github.com/mandiant/Vulnerability-Disclosures")
repo_url = "git+https://github.com/mandiant/Vulnerability-Disclosures"

def advisory_data(self) -> Iterable[AdvisoryData]:
self.clone()
files = filter(
lambda p: p.suffix in [".md", ".MD"], Path(self.vcs_response.dest_dir).glob("**/*")
)
for file in files:
if Path(file).stem == "README":
continue
try:
with open(file) as f:
yield parse_advisory_data(f.read())
except UnicodeError:
logger.error(f"Invalid file {file}")
try:
self.clone(repo_url=self.repo_url)
files = filter(
lambda p: p.suffix in [".md", ".MD"], Path(self.vcs_response.dest_dir).glob("**/*")
)
for file in files:
if Path(file).stem == "README":
continue
try:
with open(file) as f:
yield parse_advisory_data(f.read())
except UnicodeError:
logger.error(f"Invalid file {file}")
finally:
if self.vcs_response:
self.vcs_response.delete()


def parse_advisory_data(raw_data) -> AdvisoryData:
Expand Down
12 changes: 5 additions & 7 deletions vulnerabilities/importers/gitlab.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

from vulnerabilities.importer import AdvisoryData
from vulnerabilities.importer import AffectedPackage
from vulnerabilities.importer import GitImporter
from vulnerabilities.importer import Importer
from vulnerabilities.importer import Reference
from vulnerabilities.utils import build_description

Expand All @@ -48,16 +48,14 @@
GITLAB_SCHEME_BY_PURL_TYPE = {v: k for k, v in PURL_TYPE_BY_GITLAB_SCHEME.items()}


class GitLabAPIImporter(GitImporter):
class GitLabAPIImporter(Importer):
spdx_license_expression = "MIT"
license_url = "https://gitlab.com/gitlab-org/advisories-community/-/blob/main/LICENSE"
repo_url = "git+https://gitlab.com/gitlab-org/advisories-community/"

def __init__(self):
super().__init__(repo_url="git+https://gitlab.com/gitlab-org/advisories-community/")

def advisory_data(self, _keep_clone=True) -> Iterable[AdvisoryData]:
def advisory_data(self, _keep_clone=False) -> Iterable[AdvisoryData]:
try:
self.clone()
self.clone(repo_url=self.repo_url)
base_path = Path(self.vcs_response.dest_dir)

for file_path in base_path.glob("**/*.yml"):
Expand Down
26 changes: 15 additions & 11 deletions vulnerabilities/importers/istio.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,17 +53,21 @@ class IstioImporter(Importer):
repo_url = "git+https://github.com/istio/istio.io/"

def advisory_data(self) -> Set[AdvisoryData]:
self.clone(self.repo_url)
path = Path(self.vcs_response.dest_dir)
vuln = path / "content/en/news/security/"
for file in vuln.glob("**/*.md"):
# Istio website has files with name starting with underscore, these contain metadata
# required for rendering the website. We're not interested in these.
# See also https://github.com/nexB/vulnerablecode/issues/563
file = str(file)
if file.endswith("_index.md"):
continue
yield from self.process_file(file)
try:
self.clone(repo_url=self.repo_url)
path = Path(self.vcs_response.dest_dir)
vuln = path / "content/en/news/security/"
for file in vuln.glob("**/*.md"):
# Istio website has files with name starting with underscore, these contain metadata
# required for rendering the website. We're not interested in these.
# See also https://github.com/nexB/vulnerablecode/issues/563
file = str(file)
if file.endswith("_index.md"):
continue
yield from self.process_file(file)
finally:
if self.vcs_response:
self.vcs_response.delete()

def process_file(self, path):

Expand Down
4 changes: 2 additions & 2 deletions vulnerabilities/importers/kaybee.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@
from packageurl import PackageURL

from vulnerabilities.importer import AdvisoryData
from vulnerabilities.importer import GitImporter
from vulnerabilities.importer import Importer
from vulnerabilities.importer import Reference
from vulnerabilities.utils import load_yaml
from vulnerabilities.utils import nearest_patched_package


class KaybeeImporter(GitImporter):
class KaybeeImporter(Importer):
def __enter__(self):
super(KaybeeImporter, self).__enter__()
self._added_files, self._updated_files = self.file_changes(
Expand Down
2 changes: 1 addition & 1 deletion vulnerabilities/importers/mozilla.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ class MozillaImporter(Importer):

def advisory_data(self) -> Iterable[AdvisoryData]:
try:
self.clone(self.repo_url)
self.clone(repo_url=self.repo_url)
path = Path(self.vcs_response.dest_dir)

vuln = path / "announce"
Expand Down
2 changes: 1 addition & 1 deletion vulnerabilities/importers/npm.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class NpmImporter(Importer):

def advisory_data(self) -> Iterable[AdvisoryData]:
try:
self.clone(self.repo_url)
self.clone(repo_url=self.repo_url)
path = Path(self.vcs_response.dest_dir)

vuln = path / "vuln"
Expand Down
34 changes: 15 additions & 19 deletions vulnerabilities/importers/pypa.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#
import logging
import os
from pathlib import Path
from typing import Iterable

import saneyaml
Expand All @@ -23,33 +24,28 @@
class PyPaImporter(Importer):
license_url = "https://github.com/pypa/advisory-database/blob/main/LICENSE"
spdx_license_expression = "CC-BY-4.0"
url = "git+https://github.com/pypa/advisory-database"
repo_url = "git+https://github.com/pypa/advisory-database"

def advisory_data(self) -> Iterable[AdvisoryData]:
for raw_data in fork_and_get_files(self.url):
yield parse_advisory_data(raw_data=raw_data, supported_ecosystem="pypi")
try:
self.clone(repo_url=self.repo_url)
path = Path(self.vcs_response.dest_dir)
for raw_data in fork_and_get_files(path=path):
yield parse_advisory_data(raw_data=raw_data, supported_ecosystem="pypi")
finally:
if self.vcs_response:
self.vcs_response.delete()


class ForkError(Exception):
pass


def fork_and_get_files(url) -> dict:
def fork_and_get_files(path) -> dict:
"""
Yield advisory data mappings from the PyPA GitHub repository at ``url``.
"""
try:
fork_directory = fetch_via_git(url=url)
except Exception as e:
logger.error(f"Failed to clone url {url}: {e}")
raise ForkError(url) from e

advisory_dirs = os.path.join(fork_directory.dest_dir, "vulns")
for root, _, files in os.walk(advisory_dirs):
for file in files:
path = os.path.join(root, file)
if not file.endswith(".yaml"):
logger.warning(f"Unsupported non-YAML PyPA advisory file: {path}")
continue
with open(path) as f:
yield saneyaml.load(f.read())
advisory_dirs = path / "vulns"
for file in advisory_dirs.glob("**/*.yaml"):
with open(file) as f:
yield saneyaml.load(f.read())
2 changes: 1 addition & 1 deletion vulnerabilities/importers/retiredotnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class RetireDotnetImporter(Importer):

def advisory_data(self) -> Iterable[AdvisoryData]:
try:
self.clone(self.repo_url)
self.clone(repo_url=self.repo_url)
path = Path(self.vcs_response.dest_dir)

vuln = path / "Content"
Expand Down
4 changes: 2 additions & 2 deletions vulnerabilities/importers/ruby.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,14 @@
from univers.versions import SemverVersion

from vulnerabilities.importer import AdvisoryData
from vulnerabilities.importer import GitImporter
from vulnerabilities.importer import Importer
from vulnerabilities.importer import Reference
from vulnerabilities.package_managers import RubyVersionAPI
from vulnerabilities.utils import load_yaml
from vulnerabilities.utils import nearest_patched_package


class RubyImporter(GitImporter):
class RubyImporter(Importer):
def __enter__(self):
super(RubyImporter, self).__enter__()

Expand Down
4 changes: 2 additions & 2 deletions vulnerabilities/importers/rust.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@
from univers.versions import SemverVersion

from vulnerabilities.importer import AdvisoryData
from vulnerabilities.importer import GitImporter
from vulnerabilities.importer import Importer
from vulnerabilities.importer import Reference
from vulnerabilities.package_managers import CratesVersionAPI
from vulnerabilities.utils import nearest_patched_package


class RustImporter(GitImporter):
class RustImporter(Importer):
def __enter__(self):
super(RustImporter, self).__enter__()

Expand Down
Loading

0 comments on commit 87b4ebc

Please sign in to comment.