File IO command line options revision #372

Merged: 25 commits, Sep 16, 2024
Changes from 19 commits
Commits (25)
1868980
file io console options
Lilferrit Aug 26, 2024
8a346e0
output console io options
Lilferrit Aug 26, 2024
6e043d1
file io options tests
Lilferrit Aug 26, 2024
ee88344
changelog entry
Lilferrit Aug 26, 2024
4da1357
revised changelog
Lilferrit Aug 27, 2024
653deed
file io console options
Lilferrit Aug 26, 2024
837b769
output console io options
Lilferrit Aug 26, 2024
2483d67
file io options tests
Lilferrit Aug 26, 2024
bf14f2b
changelog entry
Lilferrit Aug 26, 2024
1903cbc
revised changelog
Lilferrit Aug 27, 2024
90ef08b
Generate new screengrabs with rich-codex
github-actions[bot] Aug 27, 2024
970adb6
requested changes
Lilferrit Aug 29, 2024
77c6756
merge conflicts
Lilferrit Aug 29, 2024
e68858b
updated integration test
Lilferrit Aug 30, 2024
3d91f81
requested changes
Lilferrit Aug 30, 2024
645a33f
Generate new screengrabs with rich-codex
github-actions[bot] Aug 30, 2024
2d6dd00
requested changes, output setup refactor
Lilferrit Sep 3, 2024
66de213
Merge branch 'console-file-io' of github.com:Noble-Lab/casanovo into …
Lilferrit Sep 3, 2024
1ee28be
ModelRunner documentation
Lilferrit Sep 3, 2024
4cb18e1
requested changes, _setup_output unit test
Lilferrit Sep 4, 2024
503fb86
ModelRunner output root bug fix, setup_model documentation, sequence …
Lilferrit Sep 12, 2024
71dc50c
Generate new screengrabs with rich-codex
github-actions[bot] Sep 12, 2024
6ba9bb3
logging format character
Lilferrit Sep 16, 2024
257a681
Merge branch 'console-file-io' of github.com:Noble-Lab/casanovo into …
Lilferrit Sep 16, 2024
e405add
Generate new screengrabs with rich-codex
github-actions[bot] Sep 16, 2024
5 changes: 4 additions & 1 deletion CHANGELOG.md
@@ -14,14 +14,17 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
### Changed

- Removed the `evaluate` sub-command, and all model evaluation functionality has been moved to the `sequence` command using the new `--evaluate` flag.
- The `--output` option has been split into two options, `--output_dir` and `--output_root`.
- The `--validation_peak_path` is now optional when training; if `--validation_peak_path` is not set then the `train_peak_path` will also be used for validation.

### Fixed

- Precursor charges are exported as integers instead of floats in the mzTab output file, in compliance with the mzTab specification.

### Removed

- Removed the `save_top_k` option from the Casanovo config; the model with the lowest validation loss during training will now be saved to a fixed filename `<output_root>.best.ckpt`.
- The `model_save_folder_path` config option has been eliminated; model checkpoints will now be saved to `--output_dir` during training.

## [4.2.1] - 2024-06-25

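As a rough illustration of the changelog entries above, the following sketch (not the PR's actual code) shows how `--output_dir` and `--output_root` combine into the output file locations, assuming the defaults introduced in `_setup_output` below (current working directory and a timestamped root name):

```python
from __future__ import annotations

import datetime
from pathlib import Path


def resolve_outputs(output_dir: str | None, output_root: str | None) -> dict[str, Path]:
    # Default root name and directory mirror _setup_output in casanovo/casanovo.py.
    root = output_root or f"casanovo_{datetime.datetime.now():%Y%m%d%H%M%S}"
    out_dir = Path(output_dir) if output_dir is not None else Path.cwd()
    return {
        "log": out_dir / f"{root}.log",
        "mztab": out_dir / f"{root}.mztab",  # written by the sequence command
        "best_checkpoint": out_dir / f"{root}.best.ckpt",  # written during training
    }
```

Roughly, `casanovo sequence -d results -o run1 ...` would then write `results/run1.log` and `results/run1.mztab`.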
136 changes: 102 additions & 34 deletions casanovo/casanovo.py
@@ -12,7 +12,7 @@
import urllib.parse
import warnings
from pathlib import Path
from typing import Optional, Tuple
from typing import Optional, Tuple, List

warnings.formatwarning = lambda message, category, *args, **kwargs: (
f"{category.__name__}: {message}"
@@ -67,8 +67,13 @@
""",
),
click.Option(
("-o", "--output"),
help="The mzTab file to which results will be written.",
("-d", "--output_dir"),
help="The destination directory for output files",
type=click.Path(dir_okay=True),
),
click.Option(
("-o", "--output_root"),
help="The root name for all output files",
type=click.Path(dir_okay=False),
),
click.Option(
@@ -90,6 +95,13 @@
),
default="info",
),
click.Option(
("-f", "--force_overwrite"),
help="Whether to overwrite output files.",
is_flag=True,
show_default=True,
default=False,
),
]


@@ -144,8 +156,10 @@
peak_path: Tuple[str],
model: Optional[str],
config: Optional[str],
output: Optional[str],
output_dir: Optional[str],
output_root: Optional[str],
verbosity: str,
force_overwrite: bool,
evaluate: bool,
) -> None:
"""De novo sequence peptides from tandem mass spectra.
@@ -154,18 +168,25 @@
to sequence peptides. If evaluate is set to True, PEAK_PATH must be
one or more annotated MGF files.
"""
output = setup_logging(output, verbosity)
config, model = setup_model(model, config, output, False)
output_path, output_root = _setup_output(
output_dir, output_root, force_overwrite, verbosity
)
utils.check_dir_file_exists(output_path, f"{output_root}.mztab")
config, model = setup_model(model, config, output_dir, output_root, False)
bittremieux marked this conversation as resolved.
start_time = time.time()
with ModelRunner(config, model) as runner:
with ModelRunner(config, model, output_path, output_root, False) as runner:
logger.info(
"Sequencing %speptides from:",
"and evaluating " if evaluate else "",
)
for peak_file in peak_path:
logger.info(" %s", peak_file)

runner.predict(peak_path, output, evaluate=evaluate)
runner.predict(
peak_path,
str((output_path / output_root).with_suffix(".mztab")),
evaluate=evaluate,
)
psms = runner.writer.psms
utils.log_sequencing_report(
psms, start_time=start_time, end_time=time.time()
@@ -186,31 +207,40 @@
An annotated MGF file for validation, like from MassIVE-KB. Use this
option multiple times to specify multiple files.
""",
required=True,
required=False,
multiple=True,
type=click.Path(exists=True, dir_okay=False),
)
def train(
train_peak_path: Tuple[str],
validation_peak_path: Tuple[str],
validation_peak_path: Optional[Tuple[str]],
model: Optional[str],
config: Optional[str],
output: Optional[str],
output_dir: Optional[str],
output_root: Optional[str],
verbosity: str,
force_overwrite: bool,
) -> None:
"""Train a Casanovo model on your own data.

TRAIN_PEAK_PATH must be one or more annotated MGF files, such as those
provided by MassIVE-KB, from which to train a new Casanovo model.
"""
output = setup_logging(output, verbosity)
config, model = setup_model(model, config, output, True)
output_path, output_root = _setup_output(
output_dir, output_root, force_overwrite, verbosity
)
config, model = setup_model(model, config, output_path, output_root, True)
bittremieux marked this conversation as resolved.
start_time = time.time()
with ModelRunner(config, model) as runner:
with ModelRunner(
config, model, output_path, output_root, not force_overwrite
) as runner:
logger.info("Training a model from:")
for peak_file in train_peak_path:
logger.info(" %s", peak_file)

if len(validation_peak_path) == 0:
validation_peak_path = train_peak_path

Codecov warning on casanovo/casanovo.py#L242: added line not covered by tests.

logger.info("Using the following validation files:")
for peak_file in validation_peak_path:
logger.info(" %s", peak_file)
@@ -250,7 +280,7 @@


def setup_logging(
output: Optional[str],
log_file_path: Path,
verbosity: str,
) -> Path:
"""Set up the logger.
@@ -259,21 +289,11 @@

Parameters
----------
output : Optional[str]
The provided output file name.
log_file_path : Path
The log file path.
verbosity : str
The logging level to use in the console.

Return
------
output : Path
The output file path.
"""
if output is None:
output = f"casanovo_{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}"

output = Path(output).expanduser().resolve()
bittremieux marked this conversation as resolved.

logging_levels = {
"debug": logging.DEBUG,
"info": logging.INFO,
@@ -300,9 +320,7 @@
console_handler.setFormatter(console_formatter)
root_logger.addHandler(console_handler)
warnings_logger.addHandler(console_handler)
file_handler = logging.FileHandler(
output.with_suffix(".log"), encoding="utf8"
)
file_handler = logging.FileHandler(log_file_path, encoding="utf8")
file_handler.setFormatter(log_formatter)
root_logger.addHandler(file_handler)
warnings_logger.addHandler(file_handler)
@@ -319,13 +337,12 @@
logging.getLogger("torch").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)

return output


def setup_model(
model: Optional[str],
config: Optional[str],
output: Optional[Path],
output_dir: Optional[Path | str],
output_root_name: Optional[str],
is_train: bool,
) -> Config:
"""Setup Casanovo for most commands.
@@ -385,7 +402,8 @@
logger.info("Casanovo version %s", str(__version__))
logger.debug("model = %s", model)
logger.debug("config = %s", config.file)
logger.debug("output = %s", output)
logger.debug("output directory = %s", output_dir)
logger.debug("output root name = %s", output_root_name)
for key, value in config.items():
logger.debug("%s = %s", str(key), str(value))

@@ -489,6 +507,56 @@
)


def _setup_output(
output_dir: str | None,
output_root: str | None,
overwrite: bool,
verbosity: str,
) -> Tuple[Path, str]:
"""
Set up the output directory, output file root name, and logging.

Parameters
----------
output_dir : str | None
The path to the output directory. If `None`, the output directory will
be resolved to the current working directory.
output_root : str | None
The base name for the output files. If `None`, the output root name will
be resolved to casanovo_<current date and time>.
bittremieux marked this conversation as resolved.
Show resolved Hide resolved
overwrite : bool
Whether to overwrite log file if it already exists in the output
directory.
verbosity : str
The verbosity level for logging.

Returns
-------
Tuple[Path, str]
A tuple containing the resolved output directory and root name for
output files.
"""
if output_root is None:
output_root = (

Codecov warning on casanovo/casanovo.py#L540: added line not covered by tests.
f"casanovo_{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}"
)

if output_dir is None:
output_path = Path.cwd()

Codecov warning on casanovo/casanovo.py#L545: added line not covered by tests.
else:
output_path = Path(output_dir)
if not output_path.is_dir():
raise FileNotFoundError(

Codecov warning on casanovo/casanovo.py#L549: added line not covered by tests.
f"Target output directory {output_dir} does not exists."
)

if not overwrite:
utils.check_dir_file_exists(output_path, f"{output_root}.log")

setup_logging((output_path / output_root).with_suffix(".log"), verbosity)
return output_path, output_root

bittremieux marked this conversation as resolved.

def _get_weights_from_url(
file_url: str,
cache_dir: Path,
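The commit history mentions a `_setup_output` unit test; the sketch below shows how such a test might look, assuming pytest's `tmp_path` fixture and the module layout `casanovo/casanovo.py` shown above. It is illustrative only, not the test added by this PR.

```python
import pytest

from casanovo import casanovo


def test_setup_output_resolves_dir_and_root(tmp_path):
    # An explicit directory and root name are returned as given, and the
    # log file is created inside the requested directory.
    out_dir, root = casanovo._setup_output(str(tmp_path), "myrun", True, "info")
    assert out_dir == tmp_path
    assert root == "myrun"
    assert (tmp_path / "myrun.log").exists()


def test_setup_output_missing_dir(tmp_path):
    # A nonexistent output directory should raise FileNotFoundError.
    with pytest.raises(FileNotFoundError):
        casanovo._setup_output(str(tmp_path / "missing"), "myrun", True, "info")
```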
2 changes: 1 addition & 1 deletion casanovo/config.py
@@ -19,6 +19,7 @@
every_n_train_steps="val_check_interval",
max_iters="cosine_schedule_period_iters",
save_top_k=None,
model_save_folder_path=None,
)


@@ -75,7 +76,6 @@ class Config:
top_match=int,
max_epochs=int,
num_sanity_val_steps=int,
model_save_folder_path=str,
val_check_interval=int,
calculate_precision=bool,
accelerator=str,
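For context, the entry added above sits in what appears to be a mapping of deprecated config options to their replacements, with `None` meaning the option was removed. The sketch below shows one common way such a mapping is applied; Casanovo's actual handling may differ.

```python
import warnings

# Hypothetical mirror of the deprecation mapping in casanovo/config.py.
_DEPRECATED = dict(
    every_n_train_steps="val_check_interval",
    max_iters="cosine_schedule_period_iters",
    save_top_k=None,              # removed outright
    model_save_folder_path=None,  # removed in this PR; checkpoints go to --output_dir
)


def apply_deprecations(config: dict) -> dict:
    """Warn about and remap or drop deprecated config options."""
    config = dict(config)
    for old, new in _DEPRECATED.items():
        if old not in config:
            continue
        value = config.pop(old)
        if new is None:
            warnings.warn(f"Config option '{old}' has been removed and is ignored.")
        else:
            warnings.warn(f"Config option '{old}' has been renamed to '{new}'.")
            config.setdefault(new, value)
    return config
```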
2 changes: 0 additions & 2 deletions casanovo/config.yaml
@@ -42,8 +42,6 @@ random_seed: 454
n_log: 1
# Tensorboard directory to use for keeping track of training metrics.
tb_summarywriter:
# Path to saved checkpoints.
model_save_folder_path: ""
# Model validation and checkpointing frequency in training steps.
val_check_interval: 50_000
