File IO command line options revision #372

Merged: 25 commits, Sep 16, 2024
Changes from 19 commits
Commits (25)
1868980
file io console options
Lilferrit Aug 26, 2024
8a346e0
output console io options
Lilferrit Aug 26, 2024
6e043d1
file io options tests
Lilferrit Aug 26, 2024
ee88344
changelog entry
Lilferrit Aug 26, 2024
4da1357
revised changelog
Lilferrit Aug 27, 2024
653deed
file io console options
Lilferrit Aug 26, 2024
837b769
output console io options
Lilferrit Aug 26, 2024
2483d67
file io options tests
Lilferrit Aug 26, 2024
bf14f2b
changelog entry
Lilferrit Aug 26, 2024
1903cbc
revised changelog
Lilferrit Aug 27, 2024
90ef08b
Generate new screengrabs with rich-codex
github-actions[bot] Aug 27, 2024
970adb6
requested changes
Lilferrit Aug 29, 2024
77c6756
merge conflicts
Lilferrit Aug 29, 2024
e68858b
updated integration test
Lilferrit Aug 30, 2024
3d91f81
requested changes
Lilferrit Aug 30, 2024
645a33f
Generate new screengrabs with rich-codex
github-actions[bot] Aug 30, 2024
2d6dd00
requested changes, output setup refactor
Lilferrit Sep 3, 2024
66de213
Merge branch 'console-file-io' of github.com:Noble-Lab/casanovo into …
Lilferrit Sep 3, 2024
1ee28be
ModelRunner documentation
Lilferrit Sep 3, 2024
4cb18e1
requested changes, _setup_output unit test
Lilferrit Sep 4, 2024
503fb86
ModelRunner output root bug fix, setup_model documentation, sequence …
Lilferrit Sep 12, 2024
71dc50c
Generate new screengrabs with rich-codex
github-actions[bot] Sep 12, 2024
6ba9bb3
logging format character
Lilferrit Sep 16, 2024
257a681
Merge branch 'console-file-io' of github.com:Noble-Lab/casanovo into …
Lilferrit Sep 16, 2024
e405add
Generate new screengrabs with rich-codex
github-actions[bot] Sep 16, 2024
5 changes: 4 additions & 1 deletion CHANGELOG.md
@@ -14,14 +14,17 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
### Changed

- Removed the `evaluate` sub-command, and all model evaluation functionality has been moved to the `sequence` command using the new `--evaluate` flag.
- The `--output` option has been split into two options, `--output_dir` and `--output_root`.
- The `--validation_peak_path` is now optional when training; if `--validation_peak_path` is not set then the `train_peak_path` will also be used for validation.

### Fixed

- Precursor charges are exported as integers instead of floats in the mzTab output file, in compliance with the mzTab specification.

### Removed

- Removed the `save_top_k` option from the Casanovo config; the model with the lowest validation loss during training will now be saved to a fixed filename `<output_root>.best.ckpt`.
- The `model_save_folder_path` config option has been eliminated; model checkpoints will now be saved to `--output_dir` during training.

## [4.2.1] - 2024-06-25

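As a rough illustration of the changelog entries above, the following sketch (not the PR's actual code) shows how `--output_dir` and `--output_root` combine into the output file locations, assuming the defaults introduced in `_setup_output` below (current working directory and a timestamped root name):

```python
from __future__ import annotations

import datetime
from pathlib import Path


def resolve_outputs(output_dir: str | None, output_root: str | None) -> dict[str, Path]:
    # Default root name and directory mirror _setup_output in casanovo/casanovo.py.
    root = output_root or f"casanovo_{datetime.datetime.now():%Y%m%d%H%M%S}"
    out_dir = Path(output_dir) if output_dir is not None else Path.cwd()
    return {
        "log": out_dir / f"{root}.log",
        "mztab": out_dir / f"{root}.mztab",  # written by the sequence command
        "best_checkpoint": out_dir / f"{root}.best.ckpt",  # written during training
    }
```

Roughly, `casanovo sequence -d results -o run1 ...` would then write `results/run1.log` and `results/run1.mztab`.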
136 changes: 102 additions & 34 deletions casanovo/casanovo.py
@@ -12,7 +12,7 @@
import urllib.parse
import warnings
from pathlib import Path
from typing import Optional, Tuple
from typing import Optional, Tuple, List

warnings.formatwarning = lambda message, category, *args, **kwargs: (
f"{category.__name__}: {message}"
@@ -67,8 +67,13 @@
""",
),
click.Option(
("-o", "--output"),
help="The mzTab file to which results will be written.",
("-d", "--output_dir"),
help="The destination directory for output files",
type=click.Path(dir_okay=True),
),
click.Option(
("-o", "--output_root"),
help="The root name for all output files",
type=click.Path(dir_okay=False),
),
click.Option(
@@ -90,6 +95,13 @@
),
default="info",
),
click.Option(
("-f", "--force_overwrite"),
help="Whether to overwrite output files.",
is_flag=True,
show_default=True,
default=False,
),
]


@@ -144,8 +156,10 @@
peak_path: Tuple[str],
model: Optional[str],
config: Optional[str],
output: Optional[str],
output_dir: Optional[str],
output_root: Optional[str],
verbosity: str,
force_overwrite: bool,
evaluate: bool,
) -> None:
"""De novo sequence peptides from tandem mass spectra.
@@ -154,18 +168,25 @@
to sequence peptides. If evaluate is set to True, PEAK_PATH must be
one or more annotated MGF files.
"""
output = setup_logging(output, verbosity)
config, model = setup_model(model, config, output, False)
output_path, output_root = _setup_output(
output_dir, output_root, force_overwrite, verbosity
)
utils.check_dir_file_exists(output_path, f"{output_root}.mztab")
config, model = setup_model(model, config, output_dir, output_root, False)
bittremieux marked this conversation as resolved.
start_time = time.time()
with ModelRunner(config, model) as runner:
with ModelRunner(config, model, output_path, output_root, False) as runner:
logger.info(
"Sequencing %speptides from:",
"and evaluating " if evaluate else "",
)
for peak_file in peak_path:
logger.info(" %s", peak_file)

runner.predict(peak_path, output, evaluate=evaluate)
runner.predict(
peak_path,
str((output_path / output_root).with_suffix(".mztab")),
evaluate=evaluate,
)
psms = runner.writer.psms
utils.log_sequencing_report(
psms, start_time=start_time, end_time=time.time()
@@ -186,31 +207,40 @@
An annotated MGF file for validation, like from MassIVE-KB. Use this
option multiple times to specify multiple files.
""",
required=True,
required=False,
multiple=True,
type=click.Path(exists=True, dir_okay=False),
)
def train(
train_peak_path: Tuple[str],
validation_peak_path: Tuple[str],
validation_peak_path: Optional[Tuple[str]],
model: Optional[str],
config: Optional[str],
output: Optional[str],
output_dir: Optional[str],
output_root: Optional[str],
verbosity: str,
force_overwrite: bool,
) -> None:
"""Train a Casanovo model on your own data.

TRAIN_PEAK_PATH must be one or more annotated MGF files, such as those
provided by MassIVE-KB, from which to train a new Casanovo model.
"""
output = setup_logging(output, verbosity)
config, model = setup_model(model, config, output, True)
output_path, output_root = _setup_output(
output_dir, output_root, force_overwrite, verbosity
)
config, model = setup_model(model, config, output_path, output_root, True)
bittremieux marked this conversation as resolved.
start_time = time.time()
with ModelRunner(config, model) as runner:
with ModelRunner(
config, model, output_path, output_root, not force_overwrite
) as runner:
logger.info("Training a model from:")
for peak_file in train_peak_path:
logger.info(" %s", peak_file)

if len(validation_peak_path) == 0:
validation_peak_path = train_peak_path

Codecov warning on casanovo/casanovo.py#L242: added line not covered by tests.

logger.info("Using the following validation files:")
for peak_file in validation_peak_path:
logger.info(" %s", peak_file)
@@ -250,7 +280,7 @@


def setup_logging(
output: Optional[str],
log_file_path: Path,
verbosity: str,
) -> Path:
"""Set up the logger.
@@ -259,21 +289,11 @@

Parameters
----------
output : Optional[str]
The provided output file name.
log_file_path : Path
The log file path.
verbosity : str
The logging level to use in the console.

Return
------
output : Path
The output file path.
"""
if output is None:
output = f"casanovo_{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}"

output = Path(output).expanduser().resolve()
bittremieux marked this conversation as resolved.

logging_levels = {
"debug": logging.DEBUG,
"info": logging.INFO,
@@ -300,9 +320,7 @@
console_handler.setFormatter(console_formatter)
root_logger.addHandler(console_handler)
warnings_logger.addHandler(console_handler)
file_handler = logging.FileHandler(
output.with_suffix(".log"), encoding="utf8"
)
file_handler = logging.FileHandler(log_file_path, encoding="utf8")
file_handler.setFormatter(log_formatter)
root_logger.addHandler(file_handler)
warnings_logger.addHandler(file_handler)
@@ -319,13 +337,12 @@
logging.getLogger("torch").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)

return output


def setup_model(
model: Optional[str],
config: Optional[str],
output: Optional[Path],
output_dir: Optional[Path | str],
output_root_name: Optional[str],
is_train: bool,
) -> Config:
"""Setup Casanovo for most commands.
@@ -385,7 +402,8 @@
logger.info("Casanovo version %s", str(__version__))
logger.debug("model = %s", model)
logger.debug("config = %s", config.file)
logger.debug("output = %s", output)
logger.debug("output directory = %s", output_dir)
logger.debug("output root name = %s", output_root_name)
for key, value in config.items():
logger.debug("%s = %s", str(key), str(value))

@@ -489,6 +507,56 @@
)


def _setup_output(
output_dir: str | None,
output_root: str | None,
overwrite: bool,
verbosity: str,
) -> Tuple[Path, str]:
"""
Set up the output directory, output file root name, and logging.

Parameters
----------
output_dir : str | None
The path to the output directory. If `None`, the output directory will
be resolved to the current working directory.
output_root : str | None
The base name for the output files. If `None`, the output root name will
be resolved to casanovo_<current date and time>.
bittremieux marked this conversation as resolved.
Show resolved Hide resolved
overwrite : bool
Whether to overwrite log file if it already exists in the output
directory.
verbosity : str
The verbosity level for logging.

Returns
-------
Tuple[Path, str]
A tuple containing the resolved output directory and root name for
output files.
"""
if output_root is None:
output_root = (

Codecov warning on casanovo/casanovo.py#L540: added line not covered by tests.
f"casanovo_{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}"
)

if output_dir is None:
output_path = Path.cwd()

Codecov warning on casanovo/casanovo.py#L545: added line not covered by tests.
else:
output_path = Path(output_dir)
if not output_path.is_dir():
raise FileNotFoundError(

Codecov warning on casanovo/casanovo.py#L549: added line not covered by tests.
f"Target output directory {output_dir} does not exists."
)

if not overwrite:
utils.check_dir_file_exists(output_path, f"{output_root}.log")

setup_logging((output_path / output_root).with_suffix(".log"), verbosity)
return output_path, output_root

bittremieux marked this conversation as resolved.

def _get_weights_from_url(
file_url: str,
cache_dir: Path,
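The commit history mentions a `_setup_output` unit test; the sketch below shows how such a test might look, assuming pytest's `tmp_path` fixture and the module layout `casanovo/casanovo.py` shown above. It is illustrative only, not the test added by this PR.

```python
import pytest

from casanovo import casanovo


def test_setup_output_resolves_dir_and_root(tmp_path):
    # An explicit directory and root name are returned as given, and the
    # log file is created inside the requested directory.
    out_dir, root = casanovo._setup_output(str(tmp_path), "myrun", True, "info")
    assert out_dir == tmp_path
    assert root == "myrun"
    assert (tmp_path / "myrun.log").exists()


def test_setup_output_missing_dir(tmp_path):
    # A nonexistent output directory should raise FileNotFoundError.
    with pytest.raises(FileNotFoundError):
        casanovo._setup_output(str(tmp_path / "missing"), "myrun", True, "info")
```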
2 changes: 1 addition & 1 deletion casanovo/config.py
@@ -19,6 +19,7 @@
every_n_train_steps="val_check_interval",
max_iters="cosine_schedule_period_iters",
save_top_k=None,
model_save_folder_path=None,
)


@@ -75,7 +76,6 @@ class Config:
top_match=int,
max_epochs=int,
num_sanity_val_steps=int,
model_save_folder_path=str,
val_check_interval=int,
calculate_precision=bool,
accelerator=str,
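For context, the entry added above sits in what appears to be a mapping of deprecated config options to their replacements, with `None` meaning the option was removed. The sketch below shows one common way such a mapping is applied; Casanovo's actual handling may differ.

```python
import warnings

# Hypothetical mirror of the deprecation mapping in casanovo/config.py.
_DEPRECATED = dict(
    every_n_train_steps="val_check_interval",
    max_iters="cosine_schedule_period_iters",
    save_top_k=None,              # removed outright
    model_save_folder_path=None,  # removed in this PR; checkpoints go to --output_dir
)


def apply_deprecations(config: dict) -> dict:
    """Warn about and remap or drop deprecated config options."""
    config = dict(config)
    for old, new in _DEPRECATED.items():
        if old not in config:
            continue
        value = config.pop(old)
        if new is None:
            warnings.warn(f"Config option '{old}' has been removed and is ignored.")
        else:
            warnings.warn(f"Config option '{old}' has been renamed to '{new}'.")
            config.setdefault(new, value)
    return config
```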
2 changes: 0 additions & 2 deletions casanovo/config.yaml
@@ -42,8 +42,6 @@ random_seed: 454
n_log: 1
# Tensorboard directory to use for keeping track of training metrics.
tb_summarywriter:
# Path to saved checkpoints.
model_save_folder_path: ""
# Model validation and checkpointing frequency in training steps.
val_check_interval: 50_000
