Incremental training #147

Merged (5 commits) on Nov 25, 2022
45 changes: 37 additions & 8 deletions delft/applications/grobidTagger.py
@@ -134,8 +134,9 @@ def configure(model, architecture, output_path=None, max_sequence_length=-1, bat


# train a GROBID model with all available data
def train(model, embeddings_name=None, architecture=None, transformer=None, input_path=None, output_path=None,
features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1, use_ELMo=False):
def train(model, embeddings_name=None, architecture=None, transformer=None, input_path=None,
output_path=None, features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1,
use_ELMo=False, incremental=False, input_model_path=None):

print('Loading data...')
if input_path == None:
@@ -174,8 +175,16 @@ def train(model, embeddings_name=None, architecture=None, transformer=None, inpu
multiprocessing=multiprocessing,
early_stop=early_stop)

if incremental:
if input_model_path != None:
model.load(input_model_path)
elif output_path != None:
model.load(output_path)
else:
model.load()

start_time = time.time()
model.train(x_train, y_train, f_train, x_valid, y_valid, f_valid)
model.train(x_train, y_train, f_train, x_valid, y_valid, f_valid, incremental=incremental)
runtime = round(time.time() - start_time, 3)
print("training runtime: %s seconds " % (runtime))

@@ -189,7 +198,8 @@ def train(model, embeddings_name=None, architecture=None, transformer=None, inpu
# split data, train a GROBID model and evaluate it
def train_eval(model, embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
input_path=None, output_path=None, fold_count=1,
features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1, use_ELMo=False):
features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1,
use_ELMo=False, incremental=False, input_model_path=None):
print('Loading data...')
if input_path is None:
x_all, y_all, f_all = load_data_and_labels_crf_file('data/sequenceLabelling/grobid/'+model+'/'+model+'-060518.train')
@@ -229,12 +239,20 @@ def train_eval(model, embeddings_name=None, architecture='BidLSTM_CRF', transfor
multiprocessing=multiprocessing,
early_stop=early_stop)

if incremental:
if input_model_path != None:
model.load(input_model_path)
elif output_path != None:
model.load(output_path)
else:
model.load()

start_time = time.time()

if fold_count == 1:
model.train(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid)
model.train(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid, incremental=incremental)
else:
model.train_nfold(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid)
model.train_nfold(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid, incremental=incremental)

runtime = round(time.time() - start_time, 3)
print("training runtime: %s seconds " % runtime)
@@ -362,21 +380,28 @@ class Tasks:
parser.add_argument("--output", help="Directory where to save a trained model.")
parser.add_argument("--input", help="Grobid data file to be used for training (train action), for training and " +
"evaluation (train_eval action) or just for evaluation (eval action).")
parser.add_argument("--incremental", action="store_true", help="training is incremental, starting from existing model if present")
parser.add_argument("--input-model", help="In case of incremental training, path to an existing model to be used " +
"to start the training, instead of the default one.")
parser.add_argument("--max-sequence-length", type=int, default=-1, help="max-sequence-length parameter to be used.")
parser.add_argument("--batch-size", type=int, default=-1, help="batch-size parameter to be used.")



args = parser.parse_args()

model = args.model
action = args.action
architecture = args.architecture
output = args.output
input_path = args.input
input_model_path = args.input_model
embeddings_name = args.embedding
max_sequence_length = args.max_sequence_length
batch_size = args.batch_size
transformer = args.transformer
use_ELMo = args.use_ELMo
incremental = args.incremental

if transformer is None and embeddings_name is None:
# default word embeddings
@@ -391,7 +416,9 @@ class Tasks:
output_path=output,
max_sequence_length=max_sequence_length,
batch_size=batch_size,
use_ELMo=use_ELMo)
use_ELMo=use_ELMo,
incremental=incremental,
input_model_path=input_model_path)

if action == Tasks.EVAL:
if args.fold_count is not None and args.fold_count > 1:
@@ -413,7 +440,9 @@ class Tasks:
fold_count=args.fold_count,
max_sequence_length=max_sequence_length,
batch_size=batch_size,
use_ELMo=use_ELMo)
use_ELMo=use_ELMo,
incremental=incremental,
input_model_path=input_model_path)

if action == Tasks.TAG:
someTexts = []
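The application-level change above is mostly plumbing: two new options, --incremental and --input-model, are parsed and forwarded to train/train_eval, which load an existing model before training continues. Below is a usage sketch of the updated train() entry point; the model name, embeddings name and paths are placeholder assumptions, not values from this PR.

# Usage sketch (not part of the diff): resume training of an existing GROBID model.
# Rough CLI equivalent:
#   python3 delft/applications/grobidTagger.py date train --architecture BidLSTM_CRF \
#       --incremental --input-model <path-to-existing-model>
from delft.applications.grobidTagger import train

train("date",                                   # placeholder model name
      embeddings_name="glove-840B",
      architecture="BidLSTM_CRF",
      input_path="data/sequenceLabelling/grobid/date/date-new-batch.train",   # placeholder path
      incremental=True,
      input_model_path="data/models/sequenceLabelling/grobid-date-BidLSTM_CRF")  # placeholder path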
36 changes: 36 additions & 0 deletions delft/sequenceLabelling/preprocess.py
@@ -587,6 +587,42 @@ def fit(self, X, y):

return self

def extend(self, X, y):
chars = self.vocab_char
tags = self.vocab_tag

temp_chars = {
c
for w in set(itertools.chain(*X))
for c in w
if c not in chars
}

sorted_chars = sorted(temp_chars)
sorted_chars_dict = {
c: idx + len(chars)
for idx, c in enumerate(sorted_chars)
}
chars = {**chars, **sorted_chars_dict}

temp_tags = set(itertools.chain(*y))
# filter known tags
temp_tags = { the_tag for the_tag in temp_tags if the_tag not in tags }
sorted_tags = sorted(temp_tags)
sorted_tags_dict = {
tag: idx + len(tags)
for idx, tag in enumerate(sorted_tags)
}
tags = {**tags, **sorted_tags_dict}

self.vocab_char = chars
self.vocab_tag = tags

# refresh tag indices
self.indice_tag = {i: t for t, i in self.vocab_tag.items()}

return self

def transform(self, X, y=None, extend=False, label_indices=False):
"""
transforms input into sequence
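The point of extend() is that indices already assigned by the loaded model are never reassigned: unseen characters and tags are sorted and appended after the current vocabulary, and indice_tag is rebuilt to match. A toy illustration of the same merge logic using plain dicts (the labels here are made up, not real GROBID labels):

import itertools

# Existing tag vocabulary from the loaded model (index 0 usually reserved for padding).
vocab_tag = {"<PAD>": 0, "O": 1, "B-title": 2}

# Tags seen in the new training data.
y_new = [["O", "B-title", "B-note"], ["O", "I-note"]]

new_tags = {t for t in set(itertools.chain(*y_new)) if t not in vocab_tag}
appended = {t: idx + len(vocab_tag) for idx, t in enumerate(sorted(new_tags))}
vocab_tag = {**vocab_tag, **appended}

print(vocab_tag)
# {'<PAD>': 0, 'O': 1, 'B-title': 2, 'B-note': 3, 'I-note': 4}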
37 changes: 27 additions & 10 deletions delft/sequenceLabelling/wrapper.py
@@ -137,7 +137,7 @@ def __init__(self,
early_stop, patience,
max_checkpoints_to_keep, multiprocessing)

def train(self, x_train, y_train, f_train=None, x_valid=None, y_valid=None, f_valid=None, callbacks=None):
def train(self, x_train, y_train, f_train=None, x_valid=None, y_valid=None, f_valid=None, incremental=False, callbacks=None):
# TBD if valid is None, segment train to get one if early_stop is True

# we concatenate all the training+validation data to create the model vocabulary
@@ -153,12 +153,22 @@ def train(self, x_train, y_train, f_train=None, x_valid=None, y_valid=None, f_va

features_all = concatenate_or_none((f_train, f_valid), axis=0)

self.p = prepare_preprocessor(x_all, y_all, features=features_all, model_config=self.model_config)
if incremental:
if self.model == None and self.models == None:
print("error: you must load a model first for an incremental training")
return
print("Incremental training from loaded model", self.model_config.model_name)
# update the preprocessor for the new chars and labels
self.p.extend(x_all, y_all)
else:
# init a new "fresh" model
self.p = prepare_preprocessor(x_all, y_all, features=features_all, model_config=self.model_config)

self.model_config.char_vocab_size = len(self.p.vocab_char)
self.model_config.case_vocab_size = len(self.p.vocab_case)
self.model_config.char_vocab_size = len(self.p.vocab_char)
self.model_config.case_vocab_size = len(self.p.vocab_case)

self.model = get_model(self.model_config, self.p, len(self.p.vocab_tag), load_pretrained_weights=True)
self.model = get_model(self.model_config, self.p, len(self.p.vocab_tag), load_pretrained_weights=True)

print_parameters(self.model_config, self.training_config)
self.model.print_summary()

@@ -179,17 +189,24 @@ def train(self, x_train, y_train, f_train=None, x_valid=None, y_valid=None, f_va
if self.embeddings and self.embeddings.use_ELMo:
self.embeddings.clean_ELMo_cache()

def train_nfold(self, x_train, y_train, x_valid=None, y_valid=None, f_train=None, f_valid=None, callbacks=None):
def train_nfold(self, x_train, y_train, x_valid=None, y_valid=None, f_train=None, f_valid=None, incremental=False, callbacks=None):
x_all = np.concatenate((x_train, x_valid), axis=0) if x_valid is not None else x_train
y_all = np.concatenate((y_train, y_valid), axis=0) if y_valid is not None else y_train
features_all = concatenate_or_none((f_train, f_valid), axis=0)

self.p = prepare_preprocessor(x_all, y_all, features=features_all, model_config=self.model_config)
if incremental:
if self.model == None and self.models == None:
print("error: you must load a model first for an incremental training")
return

self.model_config.char_vocab_size = len(self.p.vocab_char)
self.model_config.case_vocab_size = len(self.p.vocab_case)
print("Incremental training from loaded model", self.model_config.model_name)
self.model.print_summary()
else:
self.p = prepare_preprocessor(x_all, y_all, features=features_all, model_config=self.model_config)
self.model_config.char_vocab_size = len(self.p.vocab_char)
self.model_config.case_vocab_size = len(self.p.vocab_case)
self.models = []

self.models = []
trainer = Trainer(self.model,
self.models,
self.embeddings,
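At the wrapper level, incremental training is therefore a load-then-train sequence. A minimal sketch, assuming the wrapper class is delft.sequenceLabelling.Sequence, that a previously trained model exists at the given (placeholder) path, and with toy training data:

from delft.sequenceLabelling import Sequence

# Assumed: a sequence-labelling model previously trained and saved under this name.
model = Sequence("grobid-date-BidLSTM_CRF")                                    # placeholder name
model.load("data/models/sequenceLabelling/grobid-date-BidLSTM_CRF")            # placeholder path

# New annotated data, same tokenized format as a fresh training run (toy values).
x_train = [["12", "August", "2008"], ["2009"]]
y_train = [["B-day", "B-month", "B-year"], ["B-year"]]

# With incremental=True the loaded weights are kept and the preprocessor is only
# extended with unseen characters/labels (see preprocess.extend above).
model.train(x_train, y_train, incremental=True)
model.save()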
24 changes: 19 additions & 5 deletions delft/textClassification/models.py
@@ -261,15 +261,20 @@ def load(self, filepath):
self.model.load_weights(filepath=filepath)


def train_folds(X, y, model_config, training_config, embeddings, callbacks=None):
def train_folds(X, y, model_config, training_config, embeddings, models=None, callbacks=None):
fold_count = model_config.fold_number
max_epoch = training_config.max_epoch
architecture = model_config.architecture
use_roc_auc = training_config.use_roc_auc
class_weights = training_config.class_weights

fold_size = len(X) // fold_count
models = []

if models == None:
models = []
incremental = False
else:
incremental = True
scores = []

bert_data = False
@@ -289,7 +294,10 @@ def train_folds(X, y, model_config, training_config, embeddings, callbacks=None)
val_x = X[fold_start:fold_end]
val_y = y[fold_start:fold_end]

foldModel = getModel(model_config, training_config)
if incremental:
foldModel = models[fold_id]
else:
foldModel = getModel(model_config, training_config)

if fold_id == 0:
print_parameters(model_config, training_config)
@@ -312,15 +320,21 @@ def train_folds(X, y, model_config, training_config, embeddings, callbacks=None)
patience=training_config.patience, callbacks=callbacks)

if model_config.transformer_name is None:
models.append(foldModel)
if incremental:
models[fold_id] = foldModel
else:
models.append(foldModel)
else:
# if we are using a transformer layer in the architecture, we need to save the fold model on the disk
directory = os.path.join("data/models/textClassification/", model_config.model_name)
if not os.path.exists(directory):
os.makedirs(directory)

if fold_id == 0:
models.append(foldModel)
if incremental:
models[0] = foldModel
else:
models.append(foldModel)
# save transformer config and tokenizer
if foldModel.transformer_config is not None:
foldModel.transformer_config.to_json_file(os.path.join(directory, TRANSFORMER_CONFIG_FILE_NAME))
22 changes: 18 additions & 4 deletions delft/textClassification/wrapper.py
@@ -130,9 +130,16 @@ def __init__(self,
class_weights=class_weights,
multiprocessing=multiprocessing)

def train(self, x_train, y_train, vocab_init=None, callbacks=None):
self.model = getModel(self.model_config, self.training_config)
def train(self, x_train, y_train, vocab_init=None, incremental=False, callbacks=None):

if incremental:
if self.model == None and self.models == None:
print("error: you must load a model first for an incremental training")
return
print("Incremental training from loaded model", self.model_config.model_name)
else:
self.model = getModel(self.model_config, self.training_config)

print_parameters(self.model_config, self.training_config)
self.model.print_summary()

@@ -175,8 +182,15 @@ def train(self, x_train, y_train, vocab_init=None, callbacks=None):
callbacks=callbacks)


def train_nfold(self, x_train, y_train, vocab_init=None, callbacks=None):
self.models = train_folds(x_train, y_train, self.model_config, self.training_config, self.embeddings, callbacks=callbacks)
def train_nfold(self, x_train, y_train, vocab_init=None, incremental=False, callbacks=None):
if incremental:
if self.models == None:
print("error: you must load a model first for an incremental training")
return
print("Incremental n-fold training from loaded models", self.model_config.model_name)
self.models = train_folds(x_train, y_train, self.model_config, self.training_config, self.embeddings, self.models, callbacks=callbacks)
else:
self.models = train_folds(x_train, y_train, self.model_config, self.training_config, self.embeddings, None, callbacks=callbacks)


def predict(self, texts, output_format='json', use_main_thread_only=False, batch_size=None):
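The text-classification wrapper follows the same pattern: load an existing classifier, then call train() or train_nfold() with incremental=True, which internally hands the previously trained fold models back to train_folds(). A minimal sketch, assuming the class is delft.textClassification.Classifier, a previously saved model of that (placeholder) name, and toy data:

import numpy as np
from delft.textClassification import Classifier

# Assumed: a text classifier previously trained and saved under this name.
classifier = Classifier("citations")    # placeholder name
classifier.load()

# New labelled examples (toy values); y encoded as in a normal training run.
x_new = ["This approach outperforms the baseline [12].",
         "We disagree with the findings of [3]."]
y_new = np.array([[1, 0], [0, 1]])

classifier.train(x_new, y_new, incremental=True)
classifier.save()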
4 changes: 2 additions & 2 deletions tests/conftest.py
@@ -3,7 +3,7 @@
from unittest.mock import MagicMock

import pytest
from py._path.local import LocalPath
#from py._path.local import LocalPath

import tensorflow as tf

@@ -33,7 +33,7 @@ def patch_magicmock():


@pytest.fixture
def temp_dir(tmpdir: LocalPath):
def temp_dir(tmpdir):
# convert to standard Path
return Path(str(tmpdir))
