From 87f02b529850f15c5fee3482a3efd58d9ee86909 Mon Sep 17 00:00:00 2001 From: Chris Cummins Date: Tue, 27 Apr 2021 11:20:24 +0100 Subject: [PATCH] [datasets] Tighten the requirements of benchmark name regex. Do not permit that the benchmark name can be missing for a benchmark URI to be considered well formed, as we no longer support dataset-only URIs. Issue #45. --- compiler_gym/datasets/benchmark.py | 10 +++--- tests/datasets/benchmark_test.py | 56 ++++++++++++++++++------------ 2 files changed, 37 insertions(+), 29 deletions(-) diff --git a/compiler_gym/datasets/benchmark.py b/compiler_gym/datasets/benchmark.py index 97c4bb9b15..4f564f4750 100644 --- a/compiler_gym/datasets/benchmark.py +++ b/compiler_gym/datasets/benchmark.py @@ -22,9 +22,9 @@ # Regular expression that matches the full two-part URI prefix of a dataset: # {{protocol}}://{{dataset}} # -# A trailing slash is permitted. +# An optional trailing slash is permitted. # -# Example matches: "benchmark://foo-v0", "benchmark://foo-v0/". +# Example matches: "benchmark://foo-v0", "generator://bar-v0/". DATASET_NAME_RE = re.compile( r"(?P(?P[a-zA-z0-9-_]+)://(?P[a-zA-z0-9-_]+-v(?P[0-9]+)))/?" ) @@ -32,11 +32,9 @@ # Regular expression that matches the full three-part format of a benchmark URI: # {{protocol}}://{{dataset}}/{{id}} # -# The {{id}} is optional. -# -# Example matches: "benchmark://foo-v0/" or "benchmark://foo-v0/program". +# Example matches: "benchmark://foo-v0/foo" or "generator://bar-v1/foo/bar.txt". BENCHMARK_URI_RE = re.compile( - r"(?P(?P[a-zA-z0-9-_]+)://(?P[a-zA-z0-9-_]+-v(?P[0-9]+)))(/(?P[^\s]*))?$" + r"(?P(?P[a-zA-z0-9-_]+)://(?P[a-zA-z0-9-_]+-v(?P[0-9]+)))/(?P[^\s]+)$" ) diff --git a/tests/datasets/benchmark_test.py b/tests/datasets/benchmark_test.py index cbc40bc8be..542999efc8 100644 --- a/tests/datasets/benchmark_test.py +++ b/tests/datasets/benchmark_test.py @@ -28,54 +28,58 @@ def _rgx_match(regex, groupname, string) -> str: return match.group(groupname) -@pytest.mark.parametrize("regex", (DATASET_NAME_RE, BENCHMARK_URI_RE)) -def test_benchmark_uri_protocol(regex): - assert not regex.match("B?://cbench-v1/") # Invalid characters - assert not regex.match("cbench-v1/") # Missing protocol - +def test_benchmark_uri_protocol(): assert ( - _rgx_match(regex, "dataset_protocol", "benchmark://cbench-v1/") == "benchmark" + _rgx_match(DATASET_NAME_RE, "dataset_protocol", "benchmark://cbench-v1/") + == "benchmark" ) assert ( - _rgx_match(regex, "dataset_protocol", "Generator13://gen-v11/") == "Generator13" + _rgx_match(DATASET_NAME_RE, "dataset_protocol", "Generator13://gen-v11/") + == "Generator13" ) -def test_benchmark_uri_dataset(): - assert not BENCHMARK_URI_RE.match("benchmark://cBench?v0/") # Invalid character - assert not BENCHMARK_URI_RE.match("benchmark://cBench/") # Missing version suffix +def test_invalid_benchmark_uris(): + # Invalid protocol + assert not DATASET_NAME_RE.match("B?://cbench-v1/") # Invalid characters + assert not DATASET_NAME_RE.match("cbench-v1/") # Missing protocol + + # Invalid dataset name + assert not BENCHMARK_URI_RE.match("benchmark://cbench?v0/foo") # Invalid character + assert not BENCHMARK_URI_RE.match( + "benchmark://cbench/foo" + ) # Missing version suffix + assert not BENCHMARK_URI_RE.match("benchmark://cbench-v0") # Missing benchmark ID + assert not BENCHMARK_URI_RE.match("benchmark://cbench-v0/") # Missing benchmark ID + + # Invalid benchmark ID + assert not BENCHMARK_URI_RE.match("benchmark://cbench-v1/ whitespace") # Whitespace + assert not BENCHMARK_URI_RE.match("benchmark://cbench-v1/\t") # Whitespace + +def test_benchmark_uri_dataset(): assert ( - _rgx_match(BENCHMARK_URI_RE, "dataset_name", "benchmark://cbench-v1/") + _rgx_match(BENCHMARK_URI_RE, "dataset_name", "benchmark://cbench-v1/foo") == "cbench-v1" ) assert ( - _rgx_match(BENCHMARK_URI_RE, "dataset_name", "Generator13://gen-v11/") + _rgx_match(BENCHMARK_URI_RE, "dataset_name", "Generator13://gen-v11/foo") == "gen-v11" ) def test_benchmark_dataset_name(): assert ( - _rgx_match(BENCHMARK_URI_RE, "dataset", "benchmark://cbench-v1/") + _rgx_match(BENCHMARK_URI_RE, "dataset", "benchmark://cbench-v1/foo") == "benchmark://cbench-v1" ) assert ( - _rgx_match(BENCHMARK_URI_RE, "dataset", "Generator13://gen-v11/") + _rgx_match(BENCHMARK_URI_RE, "dataset", "Generator13://gen-v11/foo") == "Generator13://gen-v11" ) def test_benchmark_uri_id(): - assert not BENCHMARK_URI_RE.match("benchmark://cbench-v1/ whitespace") # Whitespace - assert not BENCHMARK_URI_RE.match("benchmark://cbench-v1/\t") # Whitespace - - assert ( - _rgx_match(BENCHMARK_URI_RE, "benchmark_name", "benchmark://cbench-v1") is None - ) - assert ( - _rgx_match(BENCHMARK_URI_RE, "benchmark_name", "benchmark://cbench-v1/") == "" - ) assert ( _rgx_match(BENCHMARK_URI_RE, "benchmark_name", "benchmark://cbench-v1/foo") == "foo" @@ -84,6 +88,12 @@ def test_benchmark_uri_id(): _rgx_match(BENCHMARK_URI_RE, "benchmark_name", "benchmark://cbench-v1/foo/123") == "foo/123" ) + assert ( + _rgx_match( + BENCHMARK_URI_RE, "benchmark_name", "benchmark://cbench-v1/foo/123.txt" + ) + == "foo/123.txt" + ) assert ( _rgx_match( BENCHMARK_URI_RE,