Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add apollo and update some packages #157

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion machine/corpora/zip_paratext_project_terms_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,5 @@ def _exists(self, file_name: StrPath) -> bool:

def _open(self, file_name: StrPath) -> Optional[BinaryIO]:
    """Open an entry of the wrapped zip archive as an in-memory stream.

    ``ZipFile.namelist()`` returns ``str`` entries, but ``file_name`` may be
    a ``pathlib.Path``, so normalize it to ``str`` once and use that for
    BOTH the membership check and the read — converting only at the read
    (as before) leaves the membership check always-false for ``Path`` input.

    Returns a ``BytesIO`` over the entry's bytes, or ``None`` when the
    entry is not present in the archive.
    """
    name = str(file_name)
    if name in self._archive.namelist():
        return BytesIO(self._archive.read(name))
    return None
2 changes: 1 addition & 1 deletion machine/corpora/zip_paratext_project_text_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,5 @@ def _exists(self, file_name: StrPath) -> bool:

def _open(self, file_name: StrPath) -> Optional[BinaryIO]:
    """Open an entry of the wrapped zip archive as an in-memory stream.

    ``ZipFile.namelist()`` returns ``str`` entries, but ``file_name`` may be
    a ``pathlib.Path``, so normalize it to ``str`` once and use that for
    BOTH the membership check and the read — converting only at the read
    (as before) leaves the membership check always-false for ``Path`` input.

    Returns a ``BytesIO`` over the entry's bytes, or ``None`` when the
    entry is not present in the archive.
    """
    name = str(file_name)
    if name in self._archive.namelist():
        return BytesIO(self._archive.read(name))
    return None
3 changes: 2 additions & 1 deletion machine/jobs/settings.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ default:
parent_model_name: facebook/nllb-200-distilled-1.3B
train_params:
do_train: true
optim: adamw_torch
optim: apollo_adamw
optim_target_modules: [".*.attn.*", ".*.mlp.*"]
warmup_steps: 1000
per_device_train_batch_size: 16
gradient_accumulation_steps: 4
Expand Down
20 changes: 12 additions & 8 deletions machine/translation/huggingface/hugging_face_nmt_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,17 +80,21 @@ def __init__(
else:
src_lang_token = src_lang
tgt_lang_token = tgt_lang
if (
src_lang is not None
and src_lang_token not in self._tokenizer.added_tokens_encoder
and src_lang_token not in additional_special_tokens
if src_lang is not None and (
src_lang_token is None
or (
src_lang_token not in self._tokenizer.added_tokens_encoder
and src_lang_token not in additional_special_tokens # type: ignore - we already check for None
)
):
raise ValueError(f"The specified model does not support the language code '{src_lang}'")

if (
tgt_lang is not None
and tgt_lang_token not in self._tokenizer.added_tokens_encoder
and tgt_lang_token not in additional_special_tokens
if tgt_lang is not None and (
tgt_lang_token is None
or (
tgt_lang_token not in self._tokenizer.added_tokens_encoder
and tgt_lang_token not in additional_special_tokens # type: ignore - we already check for None
)
):
raise ValueError(f"The specified model does not support the language code '{tgt_lang}'")

Expand Down
2 changes: 2 additions & 0 deletions machine/translation/huggingface/hugging_face_nmt_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ def __init__(self, model: HuggingFaceNmtModel, corpus: Union[ParallelTextCorpus,

def save(self) -> None:
super().save()
if self._model.training_args.output_dir is None:
raise ValueError("Output directory must not be None.")
output_dir = Path(self._model.training_args.output_dir)
if output_dir != self._model._model_path:
shutil.copytree(output_dir, self._model._model_path)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,8 @@ def train(
check_canceled: Optional[Callable[[], None]] = None,
) -> None:
last_checkpoint = None
if self._training_args.output_dir is None:
raise ValueError("Output directory is not set")
if os.path.isdir(self._training_args.output_dir) and not self._training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(self._training_args.output_dir)
if last_checkpoint is None and any(os.path.isfile(p) for p in os.listdir(self._training_args.output_dir)):
Expand Down Expand Up @@ -176,6 +178,8 @@ def find_missing_characters(tokenizer: Any, train_dataset: Dataset, lang_codes:
return missing_characters

def add_tokens(tokenizer: Any, missing_tokens: List[str]) -> Any:
if self._training_args.output_dir is None:
raise ValueError("Output directory is not set")
tokenizer_dir = Path(self._training_args.output_dir)
tokenizer.save_pretrained(str(tokenizer_dir))
with open(tokenizer_dir / "tokenizer.json", "r+", encoding="utf-8") as file:
Expand Down Expand Up @@ -317,7 +321,7 @@ def preprocess_function(examples):
model=model,
args=self._training_args,
train_dataset=cast(Any, train_dataset),
tokenizer=tokenizer,
processing_class=tokenizer,
data_collator=data_collator,
callbacks=[
_ProgressCallback(
Expand Down
200 changes: 76 additions & 124 deletions poetry.lock

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ urllib3 = "<2"
sentencepiece = "^0.2.0"
sil-thot = "^3.4.6"

transformers = ">=4.38.0,<4.46"
transformers = "^4.49.0"
datasets = "^2.4.0"
sacremoses = "^0.0.53"

Expand All @@ -83,12 +83,13 @@ pytest-cov = "^4.1.0"
ipykernel = "^6.7.0"
jupyter = "^1.0.0"
pandas = "^2.0.3"
pyright = { extras = ["nodejs"], version = "^1.1.362" }
pyright = { extras = ["nodejs"], version = "^1.1.394" }
decoy = "^2.1.0"
pep8-naming = "^0.14.1"

[tool.poetry.group.gpu.dependencies]
torch = { version = "2.4.0", markers = "sys_platform == 'win32' or sys_platform == 'linux'" }
apollo-torch = { version = "^1.0.3", markers = "sys_platform == 'win32' or sys_platform == 'linux'" }
accelerate = { version = "^0.26.1", markers = "sys_platform == 'win32' or sys_platform == 'linux'" }

[tool.poetry.extras]
Expand Down
Loading