Incremental training #147

Merged (5 commits) on Nov 25, 2022
45 changes: 37 additions & 8 deletions delft/applications/grobidTagger.py
@@ -134,8 +134,9 @@ def configure(model, architecture, output_path=None, max_sequence_length=-1, bat


# train a GROBID model with all available data
def train(model, embeddings_name=None, architecture=None, transformer=None, input_path=None, output_path=None,
features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1, use_ELMo=False):
def train(model, embeddings_name=None, architecture=None, transformer=None, input_path=None,
output_path=None, features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1,
use_ELMo=False, incremental=False, input_model_path=None):

print('Loading data...')
if input_path == None:
@@ -174,8 +175,16 @@ def train(model, embeddings_name=None, architecture=None, transformer=None, inpu
multiprocessing=multiprocessing,
early_stop=early_stop)

if incremental:
if input_model_path != None:
model.load(input_model_path)
elif output_path != None:
model.load(output_path)
else:
model.load()

start_time = time.time()
model.train(x_train, y_train, f_train, x_valid, y_valid, f_valid)
model.train(x_train, y_train, f_train, x_valid, y_valid, f_valid, incremental=incremental)
runtime = round(time.time() - start_time, 3)
print("training runtime: %s seconds " % (runtime))

@@ -189,7 +198,8 @@ def train(model, embeddings_name=None, architecture=None, transformer=None, inpu
# split data, train a GROBID model and evaluate it
def train_eval(model, embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
input_path=None, output_path=None, fold_count=1,
features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1, use_ELMo=False):
features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1,
use_ELMo=False, incremental=False, input_model_path=None):
print('Loading data...')
if input_path is None:
x_all, y_all, f_all = load_data_and_labels_crf_file('data/sequenceLabelling/grobid/'+model+'/'+model+'-060518.train')
@@ -229,12 +239,20 @@ def train_eval(model, embeddings_name=None, architecture='BidLSTM_CRF', transfor
multiprocessing=multiprocessing,
early_stop=early_stop)

if incremental:
if input_model_path != None:
model.load(input_model_path)
elif output_path != None:
model.load(output_path)
else:
model.load()

start_time = time.time()

if fold_count == 1:
model.train(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid)
model.train(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid, incremental=incremental)
else:
model.train_nfold(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid)
model.train_nfold(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid, incremental=incremental)

runtime = round(time.time() - start_time, 3)
print("training runtime: %s seconds " % runtime)
@@ -362,21 +380,28 @@ class Tasks:
parser.add_argument("--output", help="Directory where to save a trained model.")
parser.add_argument("--input", help="Grobid data file to be used for training (train action), for training and " +
"evaluation (train_eval action) or just for evaluation (eval action).")
parser.add_argument("--incremental", action="store_true", help="training is incremental, starting from existing model if present")
parser.add_argument("--input-model", help="In case of incremental training, path to an existing model to be used " +
"to start the training, instead of the default one.")
parser.add_argument("--max-sequence-length", type=int, default=-1, help="max-sequence-length parameter to be used.")
parser.add_argument("--batch-size", type=int, default=-1, help="batch-size parameter to be used.")



args = parser.parse_args()

model = args.model
action = args.action
architecture = args.architecture
output = args.output
input_path = args.input
input_model_path = args.input_model
embeddings_name = args.embedding
max_sequence_length = args.max_sequence_length
batch_size = args.batch_size
transformer = args.transformer
use_ELMo = args.use_ELMo
incremental = args.incremental

if transformer is None and embeddings_name is None:
# default word embeddings
@@ -391,7 +416,9 @@ class Tasks:
output_path=output,
max_sequence_length=max_sequence_length,
batch_size=batch_size,
use_ELMo=use_ELMo)
use_ELMo=use_ELMo,
incremental=incremental,
input_model_path=input_model_path)

if action == Tasks.EVAL:
if args.fold_count is not None and args.fold_count > 1:
@@ -413,7 +440,9 @@ class Tasks:
fold_count=args.fold_count,
max_sequence_length=max_sequence_length,
batch_size=batch_size,
use_ELMo=use_ELMo)
use_ELMo=use_ELMo,
incremental=incremental,
input_model_path=input_model_path)

if action == Tasks.TAG:
someTexts = []
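The application-level change above is mostly plumbing: two new options, --incremental and --input-model, are parsed and forwarded to train/train_eval, which load an existing model before training continues. Below is a usage sketch of the updated train() entry point; the model name, embeddings name and paths are placeholder assumptions, not values from this PR.

# Usage sketch (not part of the diff): resume training of an existing GROBID model.
# Rough CLI equivalent:
#   python3 delft/applications/grobidTagger.py date train --architecture BidLSTM_CRF \
#       --incremental --input-model <path-to-existing-model>
from delft.applications.grobidTagger import train

train("date",                                   # placeholder model name
      embeddings_name="glove-840B",
      architecture="BidLSTM_CRF",
      input_path="data/sequenceLabelling/grobid/date/date-new-batch.train",   # placeholder path
      incremental=True,
      input_model_path="data/models/sequenceLabelling/grobid-date-BidLSTM_CRF")  # placeholder path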
36 changes: 36 additions & 0 deletions delft/sequenceLabelling/preprocess.py
@@ -587,6 +587,42 @@ def fit(self, X, y):

return self

def extend(self, X, y):
chars = self.vocab_char
tags = self.vocab_tag

temp_chars = {
c
for w in set(itertools.chain(*X))
for c in w
if c not in chars
}

sorted_chars = sorted(temp_chars)
sorted_chars_dict = {
c: idx + len(chars)
for idx, c in enumerate(sorted_chars)
}
chars = {**chars, **sorted_chars_dict}

temp_tags = set(itertools.chain(*y))
# filter known tags
temp_tags = { the_tag for the_tag in temp_tags if the_tag not in tags }
sorted_tags = sorted(temp_tags)
sorted_tags_dict = {
tag: idx + len(tags)
for idx, tag in enumerate(sorted_tags)
}
tags = {**tags, **sorted_tags_dict}

self.vocab_char = chars
self.vocab_tag = tags

# refresh tag indices
self.indice_tag = {i: t for t, i in self.vocab_tag.items()}

return self

def transform(self, X, y=None, extend=False, label_indices=False):
"""
transforms input into sequence
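The point of extend() is that indices already assigned by the loaded model are never reassigned: unseen characters and tags are sorted and appended after the current vocabulary, and indice_tag is rebuilt to match. A toy illustration of the same merge logic using plain dicts (the labels here are made up, not real GROBID labels):

import itertools

# Existing tag vocabulary from the loaded model (index 0 usually reserved for padding).
vocab_tag = {"<PAD>": 0, "O": 1, "B-title": 2}

# Tags seen in the new training data.
y_new = [["O", "B-title", "B-note"], ["O", "I-note"]]

new_tags = {t for t in set(itertools.chain(*y_new)) if t not in vocab_tag}
appended = {t: idx + len(vocab_tag) for idx, t in enumerate(sorted(new_tags))}
vocab_tag = {**vocab_tag, **appended}

print(vocab_tag)
# {'<PAD>': 0, 'O': 1, 'B-title': 2, 'B-note': 3, 'I-note': 4}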
37 changes: 27 additions & 10 deletions delft/sequenceLabelling/wrapper.py
@@ -137,7 +137,7 @@ def __init__(self,
early_stop, patience,
max_checkpoints_to_keep, multiprocessing)

def train(self, x_train, y_train, f_train=None, x_valid=None, y_valid=None, f_valid=None, callbacks=None):
def train(self, x_train, y_train, f_train=None, x_valid=None, y_valid=None, f_valid=None, incremental=False, callbacks=None):
# TBD if valid is None, segment train to get one if early_stop is True

# we concatenate all the training+validation data to create the model vocabulary
@@ -153,12 +153,22 @@ def train(self, x_train, y_train, f_train=None, x_valid=None, y_valid=None, f_va

features_all = concatenate_or_none((f_train, f_valid), axis=0)

self.p = prepare_preprocessor(x_all, y_all, features=features_all, model_config=self.model_config)
if incremental:
if self.model == None and self.models == None:
print("error: you must load a model first for an incremental training")
return
print("Incremental training from loaded model", self.model_config.model_name)
# update the preprocessor for the new chars and labels
self.p.extend(x_all, y_all)
else:
# init a new "fresh" model
self.p = prepare_preprocessor(x_all, y_all, features=features_all, model_config=self.model_config)

self.model_config.char_vocab_size = len(self.p.vocab_char)
self.model_config.case_vocab_size = len(self.p.vocab_case)
self.model_config.char_vocab_size = len(self.p.vocab_char)
self.model_config.case_vocab_size = len(self.p.vocab_case)

self.model = get_model(self.model_config, self.p, len(self.p.vocab_tag), load_pretrained_weights=True)
self.model = get_model(self.model_config, self.p, len(self.p.vocab_tag), load_pretrained_weights=True)

print_parameters(self.model_config, self.training_config)
self.model.print_summary()

@@ -179,17 +189,24 @@ def train(self, x_train, y_train, f_train=None, x_valid=None, y_valid=None, f_va
if self.embeddings and self.embeddings.use_ELMo:
self.embeddings.clean_ELMo_cache()

def train_nfold(self, x_train, y_train, x_valid=None, y_valid=None, f_train=None, f_valid=None, callbacks=None):
def train_nfold(self, x_train, y_train, x_valid=None, y_valid=None, f_train=None, f_valid=None, incremental=False, callbacks=None):
x_all = np.concatenate((x_train, x_valid), axis=0) if x_valid is not None else x_train
y_all = np.concatenate((y_train, y_valid), axis=0) if y_valid is not None else y_train
features_all = concatenate_or_none((f_train, f_valid), axis=0)

self.p = prepare_preprocessor(x_all, y_all, features=features_all, model_config=self.model_config)
if incremental:
if self.model == None and self.models == None:
print("error: you must load a model first for an incremental training")
return

self.model_config.char_vocab_size = len(self.p.vocab_char)
self.model_config.case_vocab_size = len(self.p.vocab_case)
print("Incremental training from loaded model", self.model_config.model_name)
self.model.print_summary()
else:
self.p = prepare_preprocessor(x_all, y_all, features=features_all, model_config=self.model_config)
self.model_config.char_vocab_size = len(self.p.vocab_char)
self.model_config.case_vocab_size = len(self.p.vocab_case)
self.models = []

self.models = []
trainer = Trainer(self.model,
self.models,
self.embeddings,
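At the wrapper level, incremental training is therefore a load-then-train sequence. A minimal sketch, assuming the wrapper class is delft.sequenceLabelling.Sequence, that a previously trained model exists at the given (placeholder) path, and with toy training data:

from delft.sequenceLabelling import Sequence

# Assumed: a sequence-labelling model previously trained and saved under this name.
model = Sequence("grobid-date-BidLSTM_CRF")                                    # placeholder name
model.load("data/models/sequenceLabelling/grobid-date-BidLSTM_CRF")            # placeholder path

# New annotated data, same tokenized format as a fresh training run (toy values).
x_train = [["12", "August", "2008"], ["2009"]]
y_train = [["B-day", "B-month", "B-year"], ["B-year"]]

# With incremental=True the loaded weights are kept and the preprocessor is only
# extended with unseen characters/labels (see preprocess.extend above).
model.train(x_train, y_train, incremental=True)
model.save()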
24 changes: 19 additions & 5 deletions delft/textClassification/models.py
@@ -261,15 +261,20 @@ def load(self, filepath):
self.model.load_weights(filepath=filepath)


def train_folds(X, y, model_config, training_config, embeddings, callbacks=None):
def train_folds(X, y, model_config, training_config, embeddings, models=None, callbacks=None):
fold_count = model_config.fold_number
max_epoch = training_config.max_epoch
architecture = model_config.architecture
use_roc_auc = training_config.use_roc_auc
class_weights = training_config.class_weights

fold_size = len(X) // fold_count
models = []

if models == None:
models = []
incremental = False
else:
incremental = True
scores = []

bert_data = False
@@ -289,7 +294,10 @@ def train_folds(X, y, model_config, training_config, embeddings, callbacks=None)
val_x = X[fold_start:fold_end]
val_y = y[fold_start:fold_end]

foldModel = getModel(model_config, training_config)
if incremental:
foldModel = models[fold_id]
else:
foldModel = getModel(model_config, training_config)

if fold_id == 0:
print_parameters(model_config, training_config)
@@ -312,15 +320,21 @@ def train_folds(X, y, model_config, training_config, embeddings, callbacks=None)
patience=training_config.patience, callbacks=callbacks)

if model_config.transformer_name is None:
models.append(foldModel)
if incremental:
models[fold_id] = foldModel
else:
models.append(foldModel)
else:
# if we are using a transformer layer in the architecture, we need to save the fold model on the disk
directory = os.path.join("data/models/textClassification/", model_config.model_name)
if not os.path.exists(directory):
os.makedirs(directory)

if fold_id == 0:
models.append(foldModel)
if incremental:
models[0] = foldModel
else:
models.append(foldModel)
# save transformer config and tokenizer
if foldModel.transformer_config is not None:
foldModel.transformer_config.to_json_file(os.path.join(directory, TRANSFORMER_CONFIG_FILE_NAME))
22 changes: 18 additions & 4 deletions delft/textClassification/wrapper.py
@@ -130,9 +130,16 @@ def __init__(self,
class_weights=class_weights,
multiprocessing=multiprocessing)

def train(self, x_train, y_train, vocab_init=None, callbacks=None):
self.model = getModel(self.model_config, self.training_config)
def train(self, x_train, y_train, vocab_init=None, incremental=False, callbacks=None):

if incremental:
if self.model == None and self.models == None:
print("error: you must load a model first for an incremental training")
return
print("Incremental training from loaded model", self.model_config.model_name)
else:
self.model = getModel(self.model_config, self.training_config)

print_parameters(self.model_config, self.training_config)
self.model.print_summary()

@@ -175,8 +182,15 @@ def train(self, x_train, y_train, vocab_init=None, callbacks=None):
callbacks=callbacks)


def train_nfold(self, x_train, y_train, vocab_init=None, callbacks=None):
self.models = train_folds(x_train, y_train, self.model_config, self.training_config, self.embeddings, callbacks=callbacks)
def train_nfold(self, x_train, y_train, vocab_init=None, incremental=False, callbacks=None):
if incremental:
if self.models == None:
print("error: you must load a model first for an incremental training")
return
print("Incremental n-fold training from loaded models", self.model_config.model_name)
self.models = train_folds(x_train, y_train, self.model_config, self.training_config, self.embeddings, self.models, callbacks=callbacks)
else:
self.models = train_folds(x_train, y_train, self.model_config, self.training_config, self.embeddings, None, callbacks=callbacks)


def predict(self, texts, output_format='json', use_main_thread_only=False, batch_size=None):
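The text-classification wrapper follows the same pattern: load an existing classifier, then call train() or train_nfold() with incremental=True, which internally hands the previously trained fold models back to train_folds(). A minimal sketch, assuming the class is delft.textClassification.Classifier, a previously saved model of that (placeholder) name, and toy data:

import numpy as np
from delft.textClassification import Classifier

# Assumed: a text classifier previously trained and saved under this name.
classifier = Classifier("citations")    # placeholder name
classifier.load()

# New labelled examples (toy values); y encoded as in a normal training run.
x_new = ["This approach outperforms the baseline [12].",
         "We disagree with the findings of [3]."]
y_new = np.array([[1, 0], [0, 1]])

classifier.train(x_new, y_new, incremental=True)
classifier.save()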
4 changes: 2 additions & 2 deletions tests/conftest.py
@@ -3,7 +3,7 @@
from unittest.mock import MagicMock

import pytest
from py._path.local import LocalPath
#from py._path.local import LocalPath

import tensorflow as tf

@@ -33,7 +33,7 @@ def patch_magicmock():


@pytest.fixture
def temp_dir(tmpdir: LocalPath):
def temp_dir(tmpdir):
# convert to standard Path
return Path(str(tmpdir))
