allow mixture components to override cache_dir #754

Merged: 4 commits, Oct 5, 2024
Changes from 2 commits
1 change: 1 addition & 0 deletions config/gpt2_nano_mixture.yaml
@@ -5,6 +5,7 @@ data:
       id: dlwh/wikitext_103_detokenized
     w2:
       id: dlwh/wikitext_103_detokenized
+      cache_dir: wikitext2_cache
   train_weights:
     wikitext: 1.0
     w2: 1.0
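
For reference, the same override can be written when constructing the mixture config directly in Python. A minimal sketch, assuming the classes added/changed in src/levanter/data/text.py below and that the hf dataset id is passed via the source config's id field:

from levanter.data.text import LMDatasetMixtureComponentConfig, LMMixtureDatasetConfig

config = LMMixtureDatasetConfig(
    cache_dir="cache/",  # mixture-level default cache dir
    configs={
        # uses the mixture-level cache dir (resolved to "cache/wikitext")
        "wikitext": LMDatasetMixtureComponentConfig(id="dlwh/wikitext_103_detokenized"),
        # overrides the cache dir for this component only, as in the YAML above
        "w2": LMDatasetMixtureComponentConfig(
            id="dlwh/wikitext_103_detokenized",
            cache_dir="wikitext2_cache",
        ),
    },
    train_weights={"wikitext": 1.0, "w2": 1.0},
)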
35 changes: 28 additions & 7 deletions src/levanter/data/text.py
@@ -524,13 +524,18 @@ def fsspec_expand_glob(url):
     return urls


+@dataclass
+class LMDatasetMixtureComponentConfig(LMDatasetSourceConfig):

[Reviewer] The only thing different between the base and derived class is cache_dir? Seems weird since this on the surface doesn't seem to relate to mixtures...

[Member Author] yeah i can just put it in the base i guess

+    cache_dir: Optional[str] = None  # Optionally override the cache dir for this component


 @dataclass
 class LMTaskConfig(abc.ABC):
     tokenizer: str = "gpt2"
     vocab_size: Optional[int] = None  # if using the passthrough tokenizer, this is required

     # config related to caching
-    cache_dir: str = "cache/"
+    cache_dir: Optional[str] = "cache/"
     cache_options: CacheOptions = field(default_factory=CacheOptions)
     enforce_eos: bool = True  # whether to append eos even if the tokenizer doesn't
@@ -560,7 +565,7 @@ def validation_sets(

     @property
     @abc.abstractmethod
-    def sources(self) -> dict[str, LMDatasetSourceConfig]:
+    def sources(self) -> Mapping[str, LMDatasetSourceConfig]:
         pass

     def tagged_eval_sets(
@@ -605,7 +610,7 @@ def validation_sets(
         return {}

     @property
-    def sources(self) -> dict[str, LMDatasetSourceConfig]:
+    def sources(self) -> Mapping[str, LMDatasetSourceConfig]:
         return {"": self}

     @cached_property
@@ -634,6 +639,9 @@ def token_seq_dataset(
     def build_or_load_cache(
         self, split: str, monitors: Union[bool, List[MetricsMonitor]] = True, logger_name: Optional[str] = None
     ) -> Optional[TreeCache[BatchEncoding]]:
+        if self.cache_dir is None:
+            raise ValueError("cache_dir cannot be None")
+
         split_cache_dir = os.path.join(self.cache_dir, split)
         name = logger_name or os.path.basename(self.cache_dir)

@@ -702,7 +710,7 @@ class LMMixtureDatasetConfig(LMTaskConfig):
     """This class represents a mixture of datasets with their associated weights."""

     # data source configs and weights
-    configs: Dict[str, LMDatasetSourceConfig] = field(default_factory=dict)
+    configs: Dict[str, LMDatasetMixtureComponentConfig] = field(default_factory=dict)
     """ configuration of each dataset source (urls, hf dataset id, etc.) """
     train_weights: Dict[str, float] = field(default_factory=dict)
     """ weights for each dataset source. They will be normalized to sum to 1. """
@@ -788,10 +796,23 @@ def build_caches(
             if weight == 0 and split == "train":
                 continue

-            source_config_dict = source_config.__dict__
+            source_config_dict = dict(**source_config.__dict__)

[Reviewer] Add comment on why we need to delete it - to make the LMDatasetMixtureComponentConfig act like a LMDatasetSourceConfig?

[Member Author] more or less. i also cleaned this part up a bit

if "cache_dir" in source_config_dict:
del source_config_dict["cache_dir"]

if source_config.cache_dir is not None:
cache_dir = source_config.cache_dir
else:
if self.cache_dir is None:
raise ValueError(
"If the 'main' cache_dir is None, then all component cache_dirs must be non-None, but"
f"{name}'s cache_dir is None."
)

cache_dir = os.path.join(self.cache_dir, name)

dataset = LMDatasetConfig(
cache_dir=os.path.join(self.cache_dir, name),
cache_dir=cache_dir,
**source_config_dict,
**task_config_dict,
)
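
The precedence introduced in this hunk, pulled out as a standalone helper for illustration only (resolve_cache_dir is not part of the PR): a component-level cache_dir wins; otherwise the mixture-level cache_dir is joined with the component name; if both are None it is an error.

import os
from typing import Optional

def resolve_cache_dir(component_cache_dir: Optional[str], mixture_cache_dir: Optional[str], name: str) -> str:
    # a component-level override takes priority
    if component_cache_dir is not None:
        return component_cache_dir
    # otherwise fall back to <mixture cache_dir>/<component name>
    if mixture_cache_dir is None:
        raise ValueError(f"cache_dir is None for both the mixture and component {name!r}")
    return os.path.join(mixture_cache_dir, name)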
@@ -813,5 +834,5 @@ def build_caches(
         return caches

     @property
-    def sources(self) -> dict[str, LMDatasetSourceConfig]:
+    def sources(self) -> Mapping[str, LMDatasetSourceConfig]:
         return self.configs
2 changes: 1 addition & 1 deletion src/levanter/main/cache_dataset.py
@@ -31,7 +31,7 @@ def main(args: RayCachedLMDatasetConfig):
         print(f"Caching {split} to {args.cache_dir}.")
         # connect or start the actor
         batch_tokenizer = BatchTokenizer(tokenizer, enforce_eos=args.enforce_eos)
-        split_cache_dir = os.path.join(args.cache_dir, split)
+        split_cache_dir = os.path.join(args.cache_dir, split)  # type: ignore
         source = args.get_shard_source(split)

         if source is None: