Skip to content

Commit

Permalink
WIP: extend LLVM tar
Browse files Browse the repository at this point in the history
  • Loading branch information
ChrisCummins committed Mar 24, 2021
1 parent b430c6e commit 05c1804
Show file tree
Hide file tree
Showing 10 changed files with 292 additions and 222 deletions.
9 changes: 7 additions & 2 deletions compiler_gym/bin/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,13 +107,18 @@ def summarize_datasets(datasets: Iterable[Dataset]) -> str:
dataset.name,
truncate(dataset.description, max_line_len=60),
dataset.n,
dataset.site_data_size_in_bytes,
dataset.site_data_size_in_bytes if dataset.installed else 0,
)
)
rows.append(("Total", "", sum(r[2] for r in rows), sum(r[3] for r in rows)))
return tabulate(
[
(n, l, humanize.intcomma(f) if f else "∞", humanize.naturalsize(s))
(
n,
l,
humanize.intcomma(f) if f else "∞",
humanize.naturalsize(s) if s else "-",
)
for n, l, f, s in rows
],
headers=("Dataset", "Description", "#. Benchmarks", "Size on disk"),
Expand Down
10 changes: 10 additions & 0 deletions compiler_gym/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def __init__(
site_data_base: Path,
long_description_url: Optional[str] = None,
random: Optional[np.random.Generator] = None,
hidden: bool = False,
):
self._name = name
components = DATASET_NAME_RE.match(name)
Expand All @@ -53,6 +54,7 @@ def __init__(
self._protocol = components.group("dataset_protocol")
self._version = int(components.group("dataset_version"))
self._long_description_url = long_description_url
self._hidden = hidden

self.random = random or np.random.default_rng()
self.logger = logging.getLogger("compiler_gym.datasets")
Expand Down Expand Up @@ -92,6 +94,10 @@ def version(self) -> int:
def long_description_url(self) -> str:
return self._long_description_url

@property
def hidden(self) -> bool:
    """Whether this dataset is excluded from user-facing listings.

    Hidden datasets are still resolvable by name (lookup goes through the
    full dataset table) but are omitted from default enumeration.
    The value is fixed at construction time via the ``hidden`` argument.

    Bug fix: the return annotation was ``-> str`` (copy-paste from the
    ``long_description_url`` property) although the getter returns the
    boolean ``self._hidden``; corrected to ``-> bool``.
    """
    return self._hidden

@property
def site_data_path(self) -> Path:
return self._site_data_path
Expand All @@ -114,6 +120,10 @@ def n(self) -> int:
"""
return 0

@property
def installed(self) -> bool:
    """Whether the dataset's data is available locally.

    Base implementation always reports ``True``, i.e. a dataset is assumed
    to need no installation step unless a subclass says otherwise.
    Subclasses with a download/extract step override this (e.g. the
    tar-based datasets check for an extraction marker file on disk).
    """
    return True

def install(self) -> None:
"""
Implementing this method is optional.
Expand Down
31 changes: 19 additions & 12 deletions compiler_gym/datasets/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@ def __init__(
random: Optional[np.random.Generator] = None,
):
# A look-up table mapping dataset names to Dataset instances.
self._datasets: Dict[str, Dataset] = {d.name: d for d in datasets}
self._all_datasets: Dict[str, Dataset] = {d.name: d for d in datasets}
self._visible_datasets: Dict[str, Dataset] = {
k: v for k, v in self._all_datasets.items() if not v.hidden
}
self._site_data_path = Path(site_data_path)
self._site_data_path.mkdir(exist_ok=True, parents=True)
self.random = random or np.random.default_rng()
Expand All @@ -38,7 +41,7 @@ def datasets(self) -> Iterable[Dataset]:
"""
Iteration order is consistent between runs.
"""
yield from sorted(self._datasets.values(), key=lambda d: d.name)
yield from sorted(self._visible_datasets.values(), key=lambda d: d.name)

def __getitem__(self, name: str) -> Dataset:
return self.dataset(name)
Expand All @@ -49,20 +52,24 @@ def __iter__(self) -> Iterable[Dataset]:
def dataset(self, name: str) -> Dataset:
if "://" not in name:
name = f"benchmark://{name}"
if name not in self._datasets:
if name not in self._all_datasets:
raise LookupError(f"Dataset not found: '{name}'")
return self._datasets[name]
return self._all_datasets[name]

def add(self, dataset: Dataset) -> None:
if dataset.name in self._datasets:
if dataset.name in self._all_datasets:
warnings.warn(f"Overwriting existing dataset '{dataset.name}'")
self._datasets[dataset.name] = dataset
self._all_datasets[dataset.name] = dataset
if not dataset.hidden:
self._visible_datasets[dataset.name] = dataset

def remove(self, dataset: Union[str, Dataset]) -> bool:
dataset_name: str = dataset.name if isinstance(dataset, Dataset) else dataset

if dataset_name in self._datasets:
del self._datasets[dataset_name]
if dataset_name in self._all_datasets:
del self._all_datasets[dataset_name]
if dataset_name in self._visible_datasets:
del self._visible_datasets[dataset_name]
return True

return False
Expand All @@ -76,11 +83,11 @@ def benchmark_uris(self) -> Iterable[str]:
yield from dataset.benchmark_uris()

def benchmark(self, uri: Optional[str] = None) -> Benchmark:
if not self._datasets:
if not self._all_datasets:
raise ValueError("No datasets available")

if uri is None:
dataset = self.random.choice(list(self._datasets.values()))
dataset = self.random.choice(list(self._visible_datasets.values()))
return dataset.benchmark()

# Prepend the default benchmark:// protocol on URIs
Expand All @@ -92,7 +99,7 @@ def benchmark(self, uri: Optional[str] = None) -> Benchmark:
raise ValueError(f"Invalid benchmark URI: '{uri}'")

dataset_name = match.group("dataset")
if dataset_name not in self._datasets:
if dataset_name not in self._all_datasets:
raise LookupError(f"Dataset not found: '{dataset_name}'")

return self._datasets[dataset_name].benchmark(uri)
return self._all_datasets[dataset_name].benchmark(uri)
37 changes: 22 additions & 15 deletions compiler_gym/datasets/tar_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from compiler_gym.util.download import download


class FilesystemDirectoryDataset(Dataset):
class BenchmarkFilesDataset(Dataset):
def __init__(
self,
dataset_root: Path,
Expand Down Expand Up @@ -100,7 +100,7 @@ def _get_benchmark_by_index(self, n: int) -> Benchmark:

class TarDataset(Dataset):

# TODO: Subclass FilesystemDirectoryDataset
# TODO: Subclass BenchmarkFilesDataset

def __init__(
self,
Expand All @@ -116,16 +116,24 @@ def __init__(
self.benchmark_file_suffix = benchmark_file_suffix
self.strip_prefix = strip_prefix

self._installed = False
self._tar_extracted_marker = self.site_data_path / ".extracted"
self._tar_lock = Lock()
self._tar_lockfile = self.site_data_path / "LOCK"
self._tar_data = self.site_data_path / "contents" / self.strip_prefix

@property
def installed(self) -> bool:
    """Whether the tar archive has been extracted into the site data dir.

    A positive answer is cached on the instance, so after the first
    ``True`` result repeated queries never touch the filesystem; until
    then each query re-checks for the extraction marker file.
    """
    # Guard clause: once we have seen the marker, trust the cached flag.
    if self._installed:
        return True
    self._installed = self._tar_extracted_marker.is_file()
    return self._installed

def install(self) -> None:
"""
Implementing this method is optional.
"""
if self._tar_extracted_marker.is_file():
if self.installed:
return

self.logger.info("Downloading %s dataset", self.name)
Expand Down Expand Up @@ -227,9 +235,13 @@ def __init__(self, manifest_url: str, manifest_sha256: str, **dataet_args):
self.manifest_sha256 = manifest_sha256
self._manifest_path = self.site_data_path / "manifest.txt"

def _install_manifest(self) -> List[str]:
@memoized_property
def _benchmark_uris(self) -> List[str]:
if self._manifest_path.is_file():
return
with open(self._manifest_path) as f:
uris = f.readlines()
self.logger.debug("Read %s manifest, %d entries", self.name, len(uris))
return uris
with self._tar_lock:
with fasteners.InterProcessLock(self._tar_lockfile):
self.logger.debug("Downloading %s manifest", self.name)
Expand All @@ -240,16 +252,11 @@ def _install_manifest(self) -> List[str]:
manifest = gzipf.read().decode("utf-8").strip()
with open(self._manifest_path, "w") as f:
f.write(manifest)
return manifest.split("\n")

@memoized_property
def _benchmark_uris(self) -> List[str]:
uris = self._install_manifest()
if not uris:
with open(self._manifest_path) as f:
uris = f.readlines()
self.logger.debug("Read %s manifest, %d entries", self.name, len(uris))
return uris
uris = manifest.split("\n")
self.logger.debug(
"Downloaded %s manifest, %d entries", self.name, len(uris)
)
return uris

@memoized_property
def n(self) -> int:
Expand Down
7 changes: 5 additions & 2 deletions compiler_gym/envs/llvm/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from compiler_gym.datasets import Dataset, TarDatasetWithManifest
from compiler_gym.envs.llvm.datasets.cBench import CBenchDataset, CBenchLegacyDataset
from compiler_gym.envs.llvm.datasets.llvm_stress import LlvmStressDataset


class BlasDataset(TarDatasetWithManifest):
Expand All @@ -33,7 +34,7 @@ def __init__(self, site_data_base: Path):
tar_url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-github-v0.tar.bz2",
tar_sha256="880269dd7a5c2508ea222a2e54c318c38c8090eb105c0a87c595e9dd31720764",
manifest_url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-github-v0-manifest.gz",
manifest_sha256="d7b42ef68c9b452233baa13303d5140d6b9bf15da2ba9d4e7b0ef73524611a42",
manifest_sha256="6d0ed47f8c70868db62ae9e3d2f33dbad9fda5ef1cfe99a9855eef4618ddef1b",
license="CC BY 4.0",
long_description_url="https://github.com/ctuning/ctuning-programs",
strip_prefix="github-v0",
Expand All @@ -50,7 +51,7 @@ def __init__(self, site_data_base: Path):
tar_url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-linux-v0.tar.bz2",
tar_sha256="a1ae5c376af30ab042c9e54dc432f89ce75f9ebaee953bc19c08aff070f12566",
manifest_url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-linux-v0-manifest.gz",
manifest_sha256="dbdf82046cb5779fc48f47c4899deea51d396daddee1dd448ccf411010788b96",
manifest_sha256="6b45716ca142950e42958634366626d06f02e73d37ddce225b3ef55468011aa8",
long_description_url="https://github.com/spcl/ncc/tree/master/data",
license="GPL-2.0",
strip_prefix="linux-v0",
Expand Down Expand Up @@ -156,6 +157,7 @@ def get_llvm_datasets(site_data_base: Path) -> Iterable[Dataset]:
yield OpenCVDataset(site_data_base=site_data_base)
yield POJ104Dataset(site_data_base=site_data_base)
yield TensorflowDataset(site_data_base=site_data_base)
yield LlvmStressDataset(site_data_base=site_data_base)


__all__ = [
Expand All @@ -169,4 +171,5 @@ def get_llvm_datasets(site_data_base: Path) -> Iterable[Dataset]:
"OpenCVDataset",
"POJ104Dataset",
"TensorflowDataset",
"LlvmStressDataset",
]
7 changes: 6 additions & 1 deletion compiler_gym/envs/llvm/datasets/cBench.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from deprecated.sphinx import deprecated

from compiler_gym.datasets import TarDatasetWithManifest
from compiler_gym.datasets import Benchmark, TarDatasetWithManifest

_CBENCH_TARS = {
"darwin": (
Expand All @@ -21,6 +21,10 @@
}


class CBenchBenchmark(Benchmark):
    """Benchmark subclass for cBench programs.

    Currently adds no behavior over ``Benchmark``; it appears to be a
    placeholder extension point for cBench-specific benchmark handling
    introduced in this work-in-progress change — TODO confirm intent.
    """

    pass


class CBenchDataset(TarDatasetWithManifest):
def __init__(self, site_data_base: Path):
platform = {"darwin": "macos"}.get(sys.platform, sys.platform)
Expand Down Expand Up @@ -70,4 +74,5 @@ def __init__(self, site_data_base: Path):
strip_prefix="cBench-v0",
benchmark_file_suffix=".bc",
site_data_base=site_data_base,
hidden=True,
)
9 changes: 3 additions & 6 deletions compiler_gym/envs/llvm/datasets/llvm_stress.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,7 @@
from typing import Iterable, Optional

from compiler_gym.datasets import Benchmark, Dataset
from compiler_gym.util.runfiles_path import runfiles_path

LLVM_AS = runfiles_path("compiler_gym/third_party/llvm/bin/llvm-as")
LLVM_STRESS = runfiles_path("compiler_gym/third_party/llvm/bin/llvm-stress")
from compiler_gym.third_party import llvm

# The maximum value for the --seed argument to llvm-stress.
UINT_MAX = (2 << 32) - 1
Expand Down Expand Up @@ -38,12 +35,12 @@ def benchmark(self, uri: Optional[str] = None):
# Run llvm-stress with the given seed and pipe the output to llvm-as to
# assemble a bitcode.
llvm_stress = subprocess.Popen(
[str(LLVM_STRESS), f"--seed={seed}"],
[str(llvm.llvm_stress_path()), f"--seed={seed}"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
llvm_as = subprocess.Popen(
[str(LLVM_AS), "-"],
[str(llvm.llvm_as_path()), "-"],
stdin=llvm_stress.stdout,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
Expand Down
4 changes: 0 additions & 4 deletions compiler_gym/envs/llvm/service/BenchmarkFactory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,6 @@ using grpc::StatusCode;

namespace compiler_gym::llvm_service {

static const std::string kExpectedExtension = ".bc";

static const fs::path kSiteBenchmarksDir = util::getSiteDataPath("llvm/10.0.0/bitcode_benchmarks");

BenchmarkFactory::BenchmarkFactory(const boost::filesystem::path& workingDirectory,
std::optional<std::mt19937_64> rand,
size_t maxLoadedBenchmarkSize)
Expand Down
Loading

0 comments on commit 05c1804

Please sign in to comment.