From 3cceb284b8322c17caba22f4d8e3d268eb3632e0 Mon Sep 17 00:00:00 2001 From: Michael Kedar Date: Tue, 23 Sep 2025 15:15:04 +1000 Subject: [PATCH 1/2] feat: begin populating new database in prod --- .../gke-workers/base/kustomization.yaml | 1 + .../gke-workers/base/record-checker.yaml | 25 +++++ .../oss-vdb-test/kustomization.yaml | 2 +- .../oss-vdb-test/record-checker.yaml | 14 --- .../oss-vdb/alias-computation.yaml | 2 + .../oss-vdb/importer-deleter.yaml | 2 + .../environments/oss-vdb/importer.yaml | 2 + .../environments/oss-vdb/kustomization.yaml | 1 + .../environments/oss-vdb/record-checker.yaml | 16 ++++ .../environments/oss-vdb/recoverer.yaml | 3 +- .../environments/oss-vdb/workers.yaml | 2 + gcp/workers/alias/alias_computation.py | 7 -- gcp/workers/alias/upstream_computation.py | 7 -- osv/models.py | 8 -- tools/datafix/reput_all.py | 96 +++++++++++++++++++ 15 files changed, 150 insertions(+), 38 deletions(-) create mode 100644 deployment/clouddeploy/gke-workers/base/record-checker.yaml create mode 100644 deployment/clouddeploy/gke-workers/environments/oss-vdb/record-checker.yaml create mode 100644 tools/datafix/reput_all.py diff --git a/deployment/clouddeploy/gke-workers/base/kustomization.yaml b/deployment/clouddeploy/gke-workers/base/kustomization.yaml index 17317d396e4..51fc8ae0879 100644 --- a/deployment/clouddeploy/gke-workers/base/kustomization.yaml +++ b/deployment/clouddeploy/gke-workers/base/kustomization.yaml @@ -25,3 +25,4 @@ resources: - ksm_service.yaml - ksm_stateful_set.yaml - recoverer.yaml +- record-checker.yaml diff --git a/deployment/clouddeploy/gke-workers/base/record-checker.yaml b/deployment/clouddeploy/gke-workers/base/record-checker.yaml new file mode 100644 index 00000000000..12ed6175b89 --- /dev/null +++ b/deployment/clouddeploy/gke-workers/base/record-checker.yaml @@ -0,0 +1,25 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: record-checker + labels: + cronLastSuccessfulTimeMins: "90" +spec: + schedule: "10/15 * * * *" + concurrencyPolicy: Forbid + jobTemplate: + spec: + template: + spec: + containers: + - name: record-checker + image: record-checker + imagePullPolicy: Always + resources: + requests: + cpu: "1" + memory: "1G" + limits: + cpu: "1" + memory: "2G" + restartPolicy: Never diff --git a/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/kustomization.yaml b/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/kustomization.yaml index 53000a8477e..1a862d6c9a9 100644 --- a/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/kustomization.yaml +++ b/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/kustomization.yaml @@ -3,7 +3,6 @@ resources: - staging-api-test.yaml - osv-linter.yaml - cve5-to-osv.yaml -- record-checker.yaml patches: - path: workers.yaml - path: scaler.yaml @@ -23,3 +22,4 @@ patches: - path: backup.yaml - path: generate-sitemap.yaml - path: recoverer.yaml +- path: record-checker.yaml diff --git a/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/record-checker.yaml b/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/record-checker.yaml index 1e996d51f9d..e2b066108aa 100644 --- a/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/record-checker.yaml +++ b/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/record-checker.yaml @@ -2,29 +2,15 @@ apiVersion: batch/v1 kind: CronJob metadata: name: record-checker - labels: - cronLastSuccessfulTimeMins: "90" spec: - schedule: "10/15 * * * *" - concurrencyPolicy: Forbid jobTemplate: spec: template: spec: containers: - name: record-checker - image: record-checker env: - name: GOOGLE_CLOUD_PROJECT value: oss-vdb-test - name: OSV_VULNERABILITIES_BUCKET value: osv-test-vulnerabilities - imagePullPolicy: Always - resources: - requests: - cpu: "1" - memory: "1G" - limits: - cpu: "1" - memory: "2G" - restartPolicy: Never diff --git a/deployment/clouddeploy/gke-workers/environments/oss-vdb/alias-computation.yaml b/deployment/clouddeploy/gke-workers/environments/oss-vdb/alias-computation.yaml index 2ef56372397..d3769eb1128 100644 --- a/deployment/clouddeploy/gke-workers/environments/oss-vdb/alias-computation.yaml +++ b/deployment/clouddeploy/gke-workers/environments/oss-vdb/alias-computation.yaml @@ -12,3 +12,5 @@ spec: env: - name: GOOGLE_CLOUD_PROJECT value: oss-vdb + - name: OSV_VULNERABILITIES_BUCKET + value: osv-vulnerabilities diff --git a/deployment/clouddeploy/gke-workers/environments/oss-vdb/importer-deleter.yaml b/deployment/clouddeploy/gke-workers/environments/oss-vdb/importer-deleter.yaml index 6086116966d..f1ee65ece66 100644 --- a/deployment/clouddeploy/gke-workers/environments/oss-vdb/importer-deleter.yaml +++ b/deployment/clouddeploy/gke-workers/environments/oss-vdb/importer-deleter.yaml @@ -12,6 +12,8 @@ spec: env: - name: GOOGLE_CLOUD_PROJECT value: oss-vdb + - name: OSV_VULNERABILITIES_BUCKET + value: osv-vulnerabilities image: importer args: - --delete diff --git a/deployment/clouddeploy/gke-workers/environments/oss-vdb/importer.yaml b/deployment/clouddeploy/gke-workers/environments/oss-vdb/importer.yaml index 95d6537e526..6708cfa33fd 100644 --- a/deployment/clouddeploy/gke-workers/environments/oss-vdb/importer.yaml +++ b/deployment/clouddeploy/gke-workers/environments/oss-vdb/importer.yaml @@ -12,6 +12,8 @@ spec: env: - name: GOOGLE_CLOUD_PROJECT value: oss-vdb + - name: OSV_VULNERABILITIES_BUCKET + value: osv-vulnerabilities args: - "--ssh_key_public=/secrets/ssh.pub" - "--ssh_key_private=/secrets/ssh" diff --git a/deployment/clouddeploy/gke-workers/environments/oss-vdb/kustomization.yaml b/deployment/clouddeploy/gke-workers/environments/oss-vdb/kustomization.yaml index 6dff72a1a69..c3fc7dd30f9 100644 --- a/deployment/clouddeploy/gke-workers/environments/oss-vdb/kustomization.yaml +++ b/deployment/clouddeploy/gke-workers/environments/oss-vdb/kustomization.yaml @@ -20,3 +20,4 @@ patches: - path: backup.yaml - path: generate-sitemap.yaml - path: recoverer.yaml +- path: record-checker.yaml diff --git a/deployment/clouddeploy/gke-workers/environments/oss-vdb/record-checker.yaml b/deployment/clouddeploy/gke-workers/environments/oss-vdb/record-checker.yaml new file mode 100644 index 00000000000..b3226d78dcf --- /dev/null +++ b/deployment/clouddeploy/gke-workers/environments/oss-vdb/record-checker.yaml @@ -0,0 +1,16 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: record-checker +spec: + jobTemplate: + spec: + template: + spec: + containers: + - name: record-checker + env: + - name: GOOGLE_CLOUD_PROJECT + value: oss-vdb + - name: OSV_VULNERABILITIES_BUCKET + value: osv-vulnerabilities diff --git a/deployment/clouddeploy/gke-workers/environments/oss-vdb/recoverer.yaml b/deployment/clouddeploy/gke-workers/environments/oss-vdb/recoverer.yaml index c5a876f930e..ab78b549ce3 100644 --- a/deployment/clouddeploy/gke-workers/environments/oss-vdb/recoverer.yaml +++ b/deployment/clouddeploy/gke-workers/environments/oss-vdb/recoverer.yaml @@ -10,4 +10,5 @@ spec: env: - name: GOOGLE_CLOUD_PROJECT value: oss-vdb - + - name: OSV_VULNERABILITIES_BUCKET + value: osv-vulnerabilities diff --git a/deployment/clouddeploy/gke-workers/environments/oss-vdb/workers.yaml b/deployment/clouddeploy/gke-workers/environments/oss-vdb/workers.yaml index 771735af27f..e6420f75697 100644 --- a/deployment/clouddeploy/gke-workers/environments/oss-vdb/workers.yaml +++ b/deployment/clouddeploy/gke-workers/environments/oss-vdb/workers.yaml @@ -10,6 +10,8 @@ spec: env: - name: GOOGLE_CLOUD_PROJECT value: oss-vdb + - name: OSV_VULNERABILITIES_BUCKET + value: osv-vulnerabilities args: - "--ssh_key_public=/secrets/ssh.pub" - "--ssh_key_private=/secrets/ssh" diff --git a/gcp/workers/alias/alias_computation.py b/gcp/workers/alias/alias_computation.py index 14eeb4e5c31..afbbbc1eb05 100755 --- a/gcp/workers/alias/alias_computation.py +++ b/gcp/workers/alias/alias_computation.py @@ -99,13 +99,6 @@ def _update_vuln_with_group(vuln_id: str, alias_group: osv.AliasGroup | None): """Updates the Vulnerability in Datastore & GCS with the new alias group. If `alias_group` is None, assumes a preexisting AliasGroup was just deleted. """ - # TODO(michaelkedar): Currently, only want to run this on the test instance - # (or when running tests). Remove this check when we're ready for prod. - project = osv.utils.get_google_cloud_project() - if not project: - logging.error('failed to get GCP project') - if project not in ('oss-vdb-test', 'test-osv'): - return # Get the existing vulnerability first, so we can recalculate search_indices result = gcs.get_by_id_with_generation(vuln_id) if result is None: diff --git a/gcp/workers/alias/upstream_computation.py b/gcp/workers/alias/upstream_computation.py index e580922a0d8..950878c3bbc 100644 --- a/gcp/workers/alias/upstream_computation.py +++ b/gcp/workers/alias/upstream_computation.py @@ -92,13 +92,6 @@ def _update_vuln_with_group(vuln_id: str, upstream: osv.UpstreamGroup | None): """Updates the Vulnerability in Datastore & GCS with the new upstream group. If `upstream` is None, assumes a preexisting UpstreamGroup was just deleted. """ - # TODO(michaelkedar): Currently, only want to run this on the test instance - # (or when running tests). Remove this check when we're ready for prod. - project = osv.utils.get_google_cloud_project() - if not project: - logging.error('failed to get GCP project') - if project not in ('oss-vdb-test', 'test-osv'): - return # Get the existing vulnerability first, so we can recalculate search_indices result = gcs.get_by_id_with_generation(vuln_id) if result is None: diff --git a/osv/models.py b/osv/models.py index 9128031cea0..31558d592bb 100644 --- a/osv/models.py +++ b/osv/models.py @@ -34,7 +34,6 @@ from . import purl_helpers from . import semver_index from . import sources -from . import utils from . import vulnerability_pb2 SCHEMA_VERSION = '1.7.3' @@ -876,13 +875,6 @@ def to_vulnerability_async(self, def _post_put_hook(self: Self, future: ndb.Future): # pylint: disable=arguments-differ """Post-put hook for writing new entities for database migration.""" - # TODO(michaelkedar): Currently, only want to run this on the test instance - # (or when running tests). Remove this check when we're ready for prod. - project = utils.get_google_cloud_project() - if not project: - logging.error('failed to get GCP project') - if project not in ('oss-vdb-test', 'test-osv'): - return if future.exception(): logging.error("Not writing new entities for %s since Bug.put() failed", self.db_id) diff --git a/tools/datafix/reput_all.py b/tools/datafix/reput_all.py new file mode 100644 index 00000000000..28894ede0d7 --- /dev/null +++ b/tools/datafix/reput_all.py @@ -0,0 +1,96 @@ +"""Reputs all Bug entities in Datastore. + +This is useful for applying changes to all existing entities. +""" + +import logging +from multiprocessing import Process +import sys + +from google.cloud import ndb + +from osv import Bug, gcs + +# IDs that divide the OSV database into very roughly equal groups. +# Determined experimentally by print_vuln_ranges +ID_BOUNDS = [ + None, + 'C', + 'CV', + 'CVE-202', + 'CVE-2023', + 'D', + 'DF', + 'G', + 'GHSA-m', + 'H', + 'MAL-2025', + 'MAL-2025-3', + 'MB', + 'Q', + 'S', + 'U', + 'UBUNTU-CVE-202', + 'US', + None, +] + + +def iter_bounds(): + """Yields the start and end bounds for each shard.""" + a = ID_BOUNDS[0] + for b in ID_BOUNDS[1:]: + yield a, b + a = b + + +def print_vuln_ranges(): + """Prints the number of vulnerabilities in each shard. + + Useful for re-calculating ID_BOUNDS. + """ + with ndb.Client().context(cache_policy=False): + for start, stop in iter_bounds(): + q = Bug.query() + if start is not None: + q = q.filter(Bug.key >= ndb.Key('Bug', start)) + if stop is not None: + q = q.filter(Bug.key < ndb.Key('Bug', stop)) + print(f'[{start}, {stop}): {q.count()}') + + +def do_reput(start: str | None = None, stop: str | None = None): + """Re-puts all Bug entities within a given key range.""" + with ndb.Client().context(cache_policy=False): + q = Bug.query() + if start: + q = q.filter(Bug.key >= ndb.Key('Bug', start)) + if stop: + q = q.filter(Bug.key < ndb.Key('Bug', stop)) + + count = 0 + for b in q: + count += 1 + if count % 500 == 0: + logging.info('Processed %d entities in shard [%s, %s)', + count, start, stop) + try: + b.put() + except Exception as e: + logging.error('Failed to put %s: %s', b.key.id(), e) + + +def main(): + """Reputs all bugs in parallel.""" + for a, b in iter_bounds(): + Process(target=do_reput, args=(a, b)).start() + + +if __name__ == '__main__': + logging.getLogger().setLevel(logging.INFO) + try: + # Make sure the OSV_VULNERABILITIES_BUCKET env is set. + gcs.get_osv_bucket() + except: + sys.exit(1) + main() From 24df92e6cc679f9754fa61d678815021da6716bb Mon Sep 17 00:00:00 2001 From: Michael Kedar Date: Tue, 23 Sep 2025 15:28:45 +1000 Subject: [PATCH 2/2] if the linter has one hater I am him --- tools/datafix/reput_all.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/datafix/reput_all.py b/tools/datafix/reput_all.py index 28894ede0d7..bedc78d7753 100644 --- a/tools/datafix/reput_all.py +++ b/tools/datafix/reput_all.py @@ -72,8 +72,8 @@ def do_reput(start: str | None = None, stop: str | None = None): for b in q: count += 1 if count % 500 == 0: - logging.info('Processed %d entities in shard [%s, %s)', - count, start, stop) + logging.info('Processed %d entities in shard [%s, %s)', count, start, + stop) try: b.put() except Exception as e: