diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 79507f6ee4..2d9f842545 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -9,3 +9,12 @@ repos:
         stages: [pre-commit]
         fail_fast: true
         verbose: true
+      - id: pylint-check
+        name: pylint-check
+        entry: pylint --rcfile=.pylintrc -rn -sn
+        language: system
+        types: [python]
+        stages: [pre-commit]
+        fail_fast: true
+        require_serial: true
+        verbose: true
diff --git a/.pylintrc b/.pylintrc
index ca5736a5f2..c17e50e122 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -114,7 +114,11 @@ disable=too-few-public-methods,
         consider-using-enumerate,
         too-many-statements,
         assignment-from-none,
-        eval-used
+        eval-used,
+        duplicate-code,
+        redefined-outer-name,
+        consider-using-f-string,
+        fixme,
         
 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option
diff --git a/examples/inferences/main.py b/examples/inferences/main.py
index 78880551fb..393ab20fd5 100644
--- a/examples/inferences/main.py
+++ b/examples/inferences/main.py
@@ -15,8 +15,10 @@
 import os
 
 import tensorflow as tf
+import keras
 
 from tensorflow_asr import schemas, tokenizers
+from tensorflow_asr.models import base_model
 from tensorflow_asr.configs import Config
 from tensorflow_asr.utils import cli_util, data_util, env_util, file_util
 
@@ -35,7 +37,7 @@ def main(
     config = Config(config_path, training=False, repodir=repodir)
     tokenizer = tokenizers.get(config)
 
-    model: tf.keras.Model = tf.keras.models.model_from_config(config.model_config)
+    model: base_model.BaseModel = keras.models.model_from_config(config.model_config)
     model.make(batch_size=1)
     model.load_weights(h5, by_name=file_util.is_hdf5_filepath(h5), skip_mismatch=False)
     model.summary()
@@ -44,7 +46,15 @@ def main(
     signal = tf.reshape(signal, [1, -1])
     signal_length = tf.reshape(tf.shape(signal)[1], [1])
 
-    outputs = model.recognize(schemas.PredictInput(signal, signal_length))
+    outputs = model.recognize(
+        schemas.PredictInput(
+            inputs=signal,
+            inputs_length=signal_length,
+            previous_tokens=model.get_initial_tokens(),
+            previous_encoder_states=model.get_initial_encoder_states(),
+            previous_decoder_states=model.get_initial_decoder_states(),
+        )
+    )
     print(outputs.tokens)
     transcript = tokenizer.detokenize(outputs.tokens)[0].numpy().decode("utf-8")
 
diff --git a/examples/inferences/rnn_transducer.py b/examples/inferences/rnn_transducer.py
index 443e9a77df..b6d471a6d4 100644
--- a/examples/inferences/rnn_transducer.py
+++ b/examples/inferences/rnn_transducer.py
@@ -1,89 +1,89 @@
-# Copyright 2020 Huy Le Nguyen (@nglehuy)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-
-from tensorflow_asr.utils import data_util, env_util, math_util
-
-logger = env_util.setup_environment()
-import tensorflow as tf
-
-parser = argparse.ArgumentParser(prog="Rnn Transducer non streaming")
-
-parser.add_argument("filename", metavar="FILENAME", help="audio file to be played back")
-
-parser.add_argument("--config", type=str, default=None, help="Path to rnnt config yaml")
-
-parser.add_argument("--saved", type=str, default=None, help="Path to rnnt saved h5 weights")
-
-parser.add_argument("--beam_width", type=int, default=0, help="Beam width")
-
-parser.add_argument("--timestamp", default=False, action="store_true", help="Return with timestamp")
-
-parser.add_argument("--device", type=int, default=0, help="Device's id to run test on")
-
-parser.add_argument("--cpu", default=False, action="store_true", help="Whether to only use cpu")
-
-parser.add_argument("--subwords", default=False, action="store_true", help="Path to file that stores generated subwords")
-
-parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model")
-
-args = parser.parse_args()
-
-env_util.setup_devices([args.device], cpu=args.cpu)
-
-from tensorflow_asr.configs import Config
-from tensorflow_asr.features.speech_featurizers import SpeechFeaturizer, read_raw_audio
-from tensorflow_asr.models.transducer.rnnt import RnnTransducer
-from tensorflow_asr.tokenizers import CharTokenizer, SentencePieceTokenizer, SubwordFeaturizer
-
-config = Config(args.config)
-speech_featurizer = SpeechFeaturizer(config.speech_config)
-if args.sentence_piece:
-    logger.info("Loading SentencePiece model ...")
-    text_featurizer = SentencePieceTokenizer(config.decoder_config)
-elif args.subwords:
-    logger.info("Loading subwords ...")
-    text_featurizer = SubwordFeaturizer(config.decoder_config)
-else:
-    text_featurizer = CharTokenizer(config.decoder_config)
-text_featurizer.decoder_config.beam_width = args.beam_width
-
-# build model
-rnnt = RnnTransducer(**config.model_config, vocab_size=text_featurizer.num_classes)
-rnnt.make(speech_featurizer.shape)
-rnnt.load_weights(args.saved, by_name=True, skip_mismatch=True)
-rnnt.summary()
-rnnt.add_featurizers(speech_featurizer, text_featurizer)
-
-signal = read_raw_audio(args.filename)
-features = speech_featurizer.tf_extract(signal)
-input_length = math_util.get_reduced_length(tf.shape(features)[0], rnnt.time_reduction_factor)
-
-if args.beam_width:
-    transcript = rnnt.recognize_beam(data_util.create_inputs(inputs=features[None, ...], inputs_length=input_length[None, ...]))
-    logger.info("Transcript:", transcript[0].numpy().decode("UTF-8"))
-elif args.timestamp:
-    transcript, stime, etime, _, _, _ = rnnt.recognize_tflite_with_timestamp(
-        signal=signal,
-        predicted=tf.constant(text_featurizer.blank, dtype=tf.int32),
-        encoder_states=rnnt.encoder.get_initial_state(),
-        prediction_states=rnnt.predict_net.get_initial_state(),
-    )
-    logger.info("Transcript:", transcript)
-    logger.info("Start time:", stime)
-    logger.info("End time:", etime)
-else:
-    transcript = rnnt.recognize(data_util.create_inputs(inputs=features[None, ...], inputs_length=input_length[None, ...]))
-    logger.info("Transcript:", transcript[0].numpy().decode("UTF-8"))
+# # Copyright 2020 Huy Le Nguyen (@nglehuy)
+# #
+# # Licensed under the Apache License, Version 2.0 (the "License");
+# # you may not use this file except in compliance with the License.
+# # You may obtain a copy of the License at
+# #
+# #     http://www.apache.org/licenses/LICENSE-2.0
+# #
+# # Unless required by applicable law or agreed to in writing, software
+# # distributed under the License is distributed on an "AS IS" BASIS,
+# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# # See the License for the specific language governing permissions and
+# # limitations under the License.
+
+# import argparse
+
+# from tensorflow_asr.utils import data_util, env_util, math_util
+
+# logger = env_util.setup_environment()
+# import tensorflow as tf
+
+# parser = argparse.ArgumentParser(prog="Rnn Transducer non streaming")
+
+# parser.add_argument("filename", metavar="FILENAME", help="audio file to be played back")
+
+# parser.add_argument("--config", type=str, default=None, help="Path to rnnt config yaml")
+
+# parser.add_argument("--saved", type=str, default=None, help="Path to rnnt saved h5 weights")
+
+# parser.add_argument("--beam_width", type=int, default=0, help="Beam width")
+
+# parser.add_argument("--timestamp", default=False, action="store_true", help="Return with timestamp")
+
+# parser.add_argument("--device", type=int, default=0, help="Device's id to run test on")
+
+# parser.add_argument("--cpu", default=False, action="store_true", help="Whether to only use cpu")
+
+# parser.add_argument("--subwords", default=False, action="store_true", help="Path to file that stores generated subwords")
+
+# parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model")
+
+# args = parser.parse_args()
+
+# env_util.setup_devices([args.device], cpu=args.cpu)
+
+# from tensorflow_asr.configs import Config
+# from tensorflow_asr.features.speech_featurizers import SpeechFeaturizer, read_raw_audio
+# from tensorflow_asr.models.transducer.rnnt import RnnTransducer
+# from tensorflow_asr.tokenizers import CharTokenizer, SentencePieceTokenizer, SubwordFeaturizer
+
+# config = Config(args.config)
+# speech_featurizer = SpeechFeaturizer(config.speech_config)
+# if args.sentence_piece:
+#     logger.info("Loading SentencePiece model ...")
+#     text_featurizer = SentencePieceTokenizer(config.decoder_config)
+# elif args.subwords:
+#     logger.info("Loading subwords ...")
+#     text_featurizer = SubwordFeaturizer(config.decoder_config)
+# else:
+#     text_featurizer = CharTokenizer(config.decoder_config)
+# text_featurizer.decoder_config.beam_width = args.beam_width
+
+# # build model
+# rnnt = RnnTransducer(**config.model_config, vocab_size=text_featurizer.num_classes)
+# rnnt.make(speech_featurizer.shape)
+# rnnt.load_weights(args.saved, by_name=True, skip_mismatch=True)
+# rnnt.summary()
+# rnnt.add_featurizers(speech_featurizer, text_featurizer)
+
+# signal = read_raw_audio(args.filename)
+# features = speech_featurizer.tf_extract(signal)
+# input_length = math_util.get_reduced_length(tf.shape(features)[0], rnnt.time_reduction_factor)
+
+# if args.beam_width:
+#     transcript = rnnt.recognize_beam(data_util.create_inputs(inputs=features[None, ...], inputs_length=input_length[None, ...]))
+#     logger.info("Transcript:", transcript[0].numpy().decode("UTF-8"))
+# elif args.timestamp:
+#     transcript, stime, etime, _, _, _ = rnnt.recognize_tflite_with_timestamp(
+#         signal=signal,
+#         predicted=tf.constant(text_featurizer.blank, dtype=tf.int32),
+#         encoder_states=rnnt.encoder.get_initial_state(),
+#         prediction_states=rnnt.predict_net.get_initial_state(),
+#     )
+#     logger.info("Transcript:", transcript)
+#     logger.info("Start time:", stime)
+#     logger.info("End time:", etime)
+# else:
+#     transcript = rnnt.recognize(data_util.create_inputs(inputs=features[None, ...], inputs_length=input_length[None, ...]))
+#     logger.info("Transcript:", transcript[0].numpy().decode("UTF-8"))
diff --git a/examples/inferences/streaming_tflite_conformer.py b/examples/inferences/streaming_tflite_conformer.py
index 321f2a9c5f..46c0523a58 100644
--- a/examples/inferences/streaming_tflite_conformer.py
+++ b/examples/inferences/streaming_tflite_conformer.py
@@ -1,172 +1,172 @@
-# Copyright 2020 Huy Le Nguyen (@nglehuy)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# # Copyright 2020 Huy Le Nguyen (@nglehuy)
+# #
+# # Licensed under the Apache License, Version 2.0 (the "License");
+# # you may not use this file except in compliance with the License.
+# # You may obtain a copy of the License at
+# #
+# #     http://www.apache.org/licenses/LICENSE-2.0
+# #
+# # Unless required by applicable law or agreed to in writing, software
+# # distributed under the License is distributed on an "AS IS" BASIS,
+# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# # See the License for the specific language governing permissions and
+# # limitations under the License.
 
-import argparse
-import queue
-import sys
-from multiprocessing import Event, Manager, Process
+# import argparse
+# import queue
+# import sys
+# from multiprocessing import Event, Manager, Process
 
-import numpy as np
-import sounddevice as sd
-import soundfile as sf
-import tensorflow as tf
+# import numpy as np
+# import sounddevice as sd
+# import soundfile as sf
+# import tensorflow as tf
 
 
-def int_or_str(text):
-    """Helper function for argument parsing."""
-    try:
-        return int(text)
-    except ValueError:
-        return text
-
-
-parser = argparse.ArgumentParser(prog="Conformer audio file streaming")
+# def int_or_str(text):
+#     """Helper function for argument parsing."""
+#     try:
+#         return int(text)
+#     except ValueError:
+#         return text
+
+
+# parser = argparse.ArgumentParser(prog="Conformer audio file streaming")
 
-parser.add_argument("-l", "--list-devices", action="store_true", help="show list of audio devices and exit")
-
-args, remaining = parser.parse_known_args()
-
-if args.list_devices:
-    print(sd.query_devices())
-    parser.exit(0)
-
-parser.add_argument("filename", metavar="FILENAME", help="audio file to be played back")
-
-parser.add_argument("-d", "--device", type=int_or_str, help="output device (numeric ID or substring)")
-
-parser.add_argument("-b", "--blocksize", type=int, default=4096, help="block size (default: %(default)s)")
-
-parser.add_argument("-q", "--buffersize", type=int, default=20, help="number of blocks used for buffering (default: %(default)s)")
-
-parser.add_argument("--tflite", type=str, default=None, help="Path to conformer tflite")
-
-parser.add_argument("--blank", type=int, default=0, help="Path to conformer tflite")
-
-parser.add_argument("--num_rnns", type=int, default=1, help="Number of RNN layers in prediction network")
-
-parser.add_argument("--nstates", type=int, default=2, help="Number of RNN states in prediction network (1 for GRU and 2 for LSTM)")
-
-parser.add_argument("--statesize", type=int, default=320, help="Size of RNN state in prediction network")
-
-args = parser.parse_args(remaining)
-
-if args.blocksize == 0:
-    parser.error("blocksize must not be zero")
-if args.buffersize < 1:
-    parser.error("buffersize must be at least 1")
-
-q = queue.Queue(maxsize=args.buffersize)
-m = Manager()
-Q = m.Queue()
-E = Event()
-
-
-def recognizer(Q):
-    tflitemodel = tf.lite.Interpreter(model_path=args.tflite)
-
-    input_details = tflitemodel.get_input_details()
-    output_details = tflitemodel.get_output_details()
-
-    tflitemodel.resize_tensor_input(input_details[0]["index"], [args.blocksize])
-    tflitemodel.allocate_tensors()
-
-    def recognize(signal, lastid, states):
-        if signal.shape[0] < args.blocksize:
-            signal = tf.pad(signal, [[0, args.blocksize - signal.shape[0]]])
-        tflitemodel.set_tensor(input_details[0]["index"], signal)
-        tflitemodel.set_tensor(input_details[1]["index"], lastid)
-        tflitemodel.set_tensor(input_details[2]["index"], states)
-        tflitemodel.invoke()
-        upoints = tflitemodel.get_tensor(output_details[0]["index"])
-        lastid = tflitemodel.get_tensor(output_details[1]["index"])
-        states = tflitemodel.get_tensor(output_details[2]["index"])
-        text = "".join([chr(u) for u in upoints])
-        return text, lastid, states
-
-    lastid = args.blank * tf.ones(shape=[], dtype=tf.int32)
-    states = tf.zeros(shape=[args.num_rnns, args.nstates, 1, args.statesize], dtype=tf.float32)
-    transcript = ""
-
-    while True:
-        try:
-            data = Q.get()
-            text, lastid, states = recognize(data, lastid, states)
-            transcript += text
-            print(transcript, flush=True)
-        except queue.Empty:
-            pass
-
-
-tflite_process = Process(target=recognizer, args=[Q])
-tflite_process.start()
-
-
-def send(q, Q, E):
-    def callback(outdata, frames, time, status):
-        assert frames == args.blocksize
-        if status.output_underflow:
-            print("Output underflow: increase blocksize?", file=sys.stderr)
-            raise sd.CallbackAbort
-        assert not status
-        try:
-            data = q.get_nowait()
-            Q.put(np.frombuffer(data, dtype=np.float32))
-        except queue.Empty as e:
-            print("Buffer is empty: increase buffersize?", file=sys.stderr)
-            raise sd.CallbackAbort from e
-        if len(data) < len(outdata):
-            outdata[: len(data)] = data
-            outdata[len(data) :] = b"\x00" * (len(outdata) - len(data))
-            raise sd.CallbackStop
-        else:
-            outdata[:] = data
-
-    try:
-        with sf.SoundFile(args.filename) as f:
-            for _ in range(args.buffersize):
-                data = f.buffer_read(args.blocksize, dtype="float32")
-                if not data:
-                    break
-                q.put_nowait(data)  # Pre-fill queue
-            stream = sd.RawOutputStream(
-                samplerate=f.samplerate,
-                blocksize=args.blocksize,
-                device=args.device,
-                channels=f.channels,
-                dtype="float32",
-                callback=callback,
-                finished_callback=E.set,
-            )
-            with stream:
-                timeout = args.blocksize * args.buffersize / f.samplerate
-                while data:
-                    data = f.buffer_read(args.blocksize, dtype="float32")
-                    q.put(data, timeout=timeout)
-                E.wait()
-
-    except KeyboardInterrupt:
-        parser.exit("\nInterrupted by user")
-    except queue.Full:
-        # A timeout occurred, i.e. there was an error in the callback
-        parser.exit(1)
-    except Exception as e:
-        parser.exit(type(e).__name__ + ": " + str(e))
-
-
-send_process = Process(target=send, args=[q, Q, E])
-send_process.start()
-send_process.join()
-send_process.close()
-
-tflite_process.terminate()
+# parser.add_argument("-l", "--list-devices", action="store_true", help="show list of audio devices and exit")
+
+# args, remaining = parser.parse_known_args()
+
+# if args.list_devices:
+#     print(sd.query_devices())
+#     parser.exit(0)
+
+# parser.add_argument("filename", metavar="FILENAME", help="audio file to be played back")
+
+# parser.add_argument("-d", "--device", type=int_or_str, help="output device (numeric ID or substring)")
+
+# parser.add_argument("-b", "--blocksize", type=int, default=4096, help="block size (default: %(default)s)")
+
+# parser.add_argument("-q", "--buffersize", type=int, default=20, help="number of blocks used for buffering (default: %(default)s)")
+
+# parser.add_argument("--tflite", type=str, default=None, help="Path to conformer tflite")
+
+# parser.add_argument("--blank", type=int, default=0, help="Path to conformer tflite")
+
+# parser.add_argument("--num_rnns", type=int, default=1, help="Number of RNN layers in prediction network")
+
+# parser.add_argument("--nstates", type=int, default=2, help="Number of RNN states in prediction network (1 for GRU and 2 for LSTM)")
+
+# parser.add_argument("--statesize", type=int, default=320, help="Size of RNN state in prediction network")
+
+# args = parser.parse_args(remaining)
+
+# if args.blocksize == 0:
+#     parser.error("blocksize must not be zero")
+# if args.buffersize < 1:
+#     parser.error("buffersize must be at least 1")
+
+# q = queue.Queue(maxsize=args.buffersize)
+# m = Manager()
+# Q = m.Queue()
+# E = Event()
+
+
+# def recognizer(Q):
+#     tflitemodel = tf.lite.Interpreter(model_path=args.tflite)
+
+#     input_details = tflitemodel.get_input_details()
+#     output_details = tflitemodel.get_output_details()
+
+#     tflitemodel.resize_tensor_input(input_details[0]["index"], [args.blocksize])
+#     tflitemodel.allocate_tensors()
+
+#     def recognize(signal, lastid, states):
+#         if signal.shape[0] < args.blocksize:
+#             signal = tf.pad(signal, [[0, args.blocksize - signal.shape[0]]])
+#         tflitemodel.set_tensor(input_details[0]["index"], signal)
+#         tflitemodel.set_tensor(input_details[1]["index"], lastid)
+#         tflitemodel.set_tensor(input_details[2]["index"], states)
+#         tflitemodel.invoke()
+#         upoints = tflitemodel.get_tensor(output_details[0]["index"])
+#         lastid = tflitemodel.get_tensor(output_details[1]["index"])
+#         states = tflitemodel.get_tensor(output_details[2]["index"])
+#         text = "".join([chr(u) for u in upoints])
+#         return text, lastid, states
+
+#     lastid = args.blank * tf.ones(shape=[], dtype=tf.int32)
+#     states = tf.zeros(shape=[args.num_rnns, args.nstates, 1, args.statesize], dtype=tf.float32)
+#     transcript = ""
+
+#     while True:
+#         try:
+#             data = Q.get()
+#             text, lastid, states = recognize(data, lastid, states)
+#             transcript += text
+#             print(transcript, flush=True)
+#         except queue.Empty:
+#             pass
+
+
+# tflite_process = Process(target=recognizer, args=[Q])
+# tflite_process.start()
+
+
+# def send(q, Q, E):
+#     def callback(outdata, frames, time, status):
+#         assert frames == args.blocksize
+#         if status.output_underflow:
+#             print("Output underflow: increase blocksize?", file=sys.stderr)
+#             raise sd.CallbackAbort
+#         assert not status
+#         try:
+#             data = q.get_nowait()
+#             Q.put(np.frombuffer(data, dtype=np.float32))
+#         except queue.Empty as e:
+#             print("Buffer is empty: increase buffersize?", file=sys.stderr)
+#             raise sd.CallbackAbort from e
+#         if len(data) < len(outdata):
+#             outdata[: len(data)] = data
+#             outdata[len(data) :] = b"\x00" * (len(outdata) - len(data))
+#             raise sd.CallbackStop
+#         else:
+#             outdata[:] = data
+
+#     try:
+#         with sf.SoundFile(args.filename) as f:
+#             for _ in range(args.buffersize):
+#                 data = f.buffer_read(args.blocksize, dtype="float32")
+#                 if not data:
+#                     break
+#                 q.put_nowait(data)  # Pre-fill queue
+#             stream = sd.RawOutputStream(
+#                 samplerate=f.samplerate,
+#                 blocksize=args.blocksize,
+#                 device=args.device,
+#                 channels=f.channels,
+#                 dtype="float32",
+#                 callback=callback,
+#                 finished_callback=E.set,
+#             )
+#             with stream:
+#                 timeout = args.blocksize * args.buffersize / f.samplerate
+#                 while data:
+#                     data = f.buffer_read(args.blocksize, dtype="float32")
+#                     q.put(data, timeout=timeout)
+#                 E.wait()
+
+#     except KeyboardInterrupt:
+#         parser.exit("\nInterrupted by user")
+#     except queue.Full:
+#         # A timeout occurred, i.e. there was an error in the callback
+#         parser.exit(1)
+#     except Exception as e:
+#         parser.exit(type(e).__name__ + ": " + str(e))
+
+
+# send_process = Process(target=send, args=[q, Q, E])
+# send_process.start()
+# send_process.join()
+# send_process.close()
+
+# tflite_process.terminate()
diff --git a/examples/models/transducer/conformer/inference/gen_saved_model.py b/examples/models/transducer/conformer/inference/gen_saved_model.py
index 0048351bd0..c9cc875950 100644
--- a/examples/models/transducer/conformer/inference/gen_saved_model.py
+++ b/examples/models/transducer/conformer/inference/gen_saved_model.py
@@ -1,56 +1,56 @@
-# pylint: disable=no-member
-# Copyright 2020 Huy Le Nguyen (@nglehuy)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-import fire
-import tensorflow as tf
-
-from tensorflow_asr.configs import Config
-from tensorflow_asr.helpers import featurizer_helpers
-from tensorflow_asr.models.transducer.conformer import Conformer
-from tensorflow_asr.utils import env_util
-
-logger = env_util.setup_environment()
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config_wp.j2")
-
-
-def main(
-    config_path: str = DEFAULT_YAML,
-    saved: str = None,
-    output_dir: str = None,
-):
-    assert saved and output_dir
-    tf.random.set_seed(0)
-    tf.keras.backend.clear_session()
-
-    logger.info("Load config and featurizers ...")
-    config = Config(config_path)
-    speech_featurizer, text_featurizer = featurizer_helpers.prepare_featurizers(config=config)
-
-    logger.info("Build and load model ...")
-    conformer = Conformer(**config.model_config, vocab_size=text_featurizer.num_classes)
-    conformer.make(speech_featurizer.shape)
-    conformer.add_featurizers(speech_featurizer, text_featurizer)
-    conformer.load_weights(saved, by_name=True)
-    conformer.summary()
-
-    logger.info("Save model ...")
-    tf.saved_model.save(conformer, export_dir=output_dir, signatures=conformer.recognize_from_signal.get_concrete_function())
-
-
-if __name__ == "__main__":
-    fire.Fire(main)
+# # pylint: disable=no-member
+# # Copyright 2020 Huy Le Nguyen (@nglehuy)
+# #
+# # Licensed under the Apache License, Version 2.0 (the "License");
+# # you may not use this file except in compliance with the License.
+# # You may obtain a copy of the License at
+# #
+# #     http://www.apache.org/licenses/LICENSE-2.0
+# #
+# # Unless required by applicable law or agreed to in writing, software
+# # distributed under the License is distributed on an "AS IS" BASIS,
+# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# # See the License for the specific language governing permissions and
+# # limitations under the License.
+
+# import os
+
+# import fire
+# from tensorflow_asr import tf, keras
+
+# from tensorflow_asr.configs import Config
+# from tensorflow_asr.helpers import featurizer_helpers
+# from tensorflow_asr.models.transducer.conformer import Conformer
+# from tensorflow_asr.utils import env_util
+
+# logger = env_util.setup_environment()
+
+# DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config_wp.j2")
+
+
+# def main(
+#     config_path: str = DEFAULT_YAML,
+#     saved: str = None,
+#     output_dir: str = None,
+# ):
+#     assert saved and output_dir
+#     tf.random.set_seed(0)
+#     keras.backend.clear_session()
+
+#     logger.info("Load config and featurizers ...")
+#     config = Config(config_path)
+#     speech_featurizer, text_featurizer = featurizer_helpers.prepare_featurizers(config=config)
+
+#     logger.info("Build and load model ...")
+#     conformer = Conformer(**config.model_config, vocab_size=text_featurizer.num_classes)
+#     conformer.make(speech_featurizer.shape)
+#     conformer.add_featurizers(speech_featurizer, text_featurizer)
+#     conformer.load_weights(saved, by_name=True)
+#     conformer.summary()
+
+#     logger.info("Save model ...")
+#     tf.saved_model.save(conformer, export_dir=output_dir, signatures=conformer.recognize_from_signal.get_concrete_function())
+
+
+# if __name__ == "__main__":
+#     fire.Fire(main)
diff --git a/examples/models/transducer/conformer/inference/run_saved_model.py b/examples/models/transducer/conformer/inference/run_saved_model.py
index eb00912d9d..56da5da980 100644
--- a/examples/models/transducer/conformer/inference/run_saved_model.py
+++ b/examples/models/transducer/conformer/inference/run_saved_model.py
@@ -1,43 +1,43 @@
-# Copyright 2020 Huy Le Nguyen (@nglehuy)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# # Copyright 2020 Huy Le Nguyen (@nglehuy)
+# #
+# # Licensed under the Apache License, Version 2.0 (the "License");
+# # you may not use this file except in compliance with the License.
+# # You may obtain a copy of the License at
+# #
+# #     http://www.apache.org/licenses/LICENSE-2.0
+# #
+# # Unless required by applicable law or agreed to in writing, software
+# # distributed under the License is distributed on an "AS IS" BASIS,
+# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# # See the License for the specific language governing permissions and
+# # limitations under the License.
 
-import os
+# import os
 
-import fire
-import tensorflow as tf
+# import fire
+# from tensorflow_asr import tf, keras
 
-from tensorflow_asr.features.speech_featurizers import read_raw_audio
-from tensorflow_asr.utils import env_util
+# from tensorflow_asr.features.speech_featurizers import read_raw_audio
+# from tensorflow_asr.utils import env_util
 
-logger = env_util.setup_environment()
+# logger = env_util.setup_environment()
 
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config_wp.j2")
+# DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config_wp.j2")
 
 
-def main(
-    saved_model: str = None,
-    filename: str = None,
-):
-    tf.keras.backend.clear_session()
+# def main(
+#     saved_model: str = None,
+#     filename: str = None,
+# ):
+#     keras.backend.clear_session()
 
-    module = tf.saved_model.load(export_dir=saved_model)
+#     module = tf.saved_model.load(export_dir=saved_model)
 
-    signal = read_raw_audio(filename)
-    transcript = module.pred(signal)
+#     signal = read_raw_audio(filename)
+#     transcript = module.pred(signal)
 
-    print("Transcript: ", "".join([chr(u) for u in transcript]))
+#     print("Transcript: ", "".join([chr(u) for u in transcript]))
 
 
-if __name__ == "__main__":
-    fire.Fire(main)
+# if __name__ == "__main__":
+#     fire.Fire(main)
diff --git a/examples/models/transducer/rnnt/results/sentencepiece/README.md b/examples/models/transducer/rnnt/results/sentencepiece/README.md
new file mode 100644
index 0000000000..03b137f092
--- /dev/null
+++ b/examples/models/transducer/rnnt/results/sentencepiece/README.md
@@ -0,0 +1,57 @@
+- [SentencePiece 256 + Tiny + LibriSpeech](#sentencepiece-256--tiny--librispeech)
+    - [Training Loss](#training-loss)
+      - [1. Epoch Loss](#1-epoch-loss)
+      - [2. Batch Loss](#2-batch-loss)
+    - [Results](#results)
+
+
+# SentencePiece 256 + Tiny + LibriSpeech
+
+| Category          | Description                      |
+| :---------------- | :------------------------------- |
+| Config            | [tiny.yml.j2](../../tiny.yml.j2) |
+| TensorFlow        | **2.15.x**                       |
+| Device            | NVIDIA GeForce GTX 1650          |
+| Global Batch Size | 3                                |
+| Max Epochs        | 300                              |
+
+
+### Training Loss
+
+#### 1. Epoch Loss
+
+![Epoch Loss](./figs/rnnt-tiny-sp256-epoch-loss.svg)
+
+#### 2. Batch Loss
+
+![Batch Loss](./figs/rnnt-tiny-sp256-batch-loss.svg)
+
+
+### Results
+
+Pretrained model available here: [link](https://drive.google.com/drive/folders/1h0BrCzZo8JTz_MUU5bJPJ3UBqroBnsuv?usp=sharing)
+
+```json
+[
+  {
+    "epoch": 136,
+    "test-clean": {
+      "greedy": {
+        "wer": 0.15853241022519782,
+        "cer": 0.07179696657549817,
+        "mer": 0.15537908021549876,
+        "wil": 0.2587056704145151,
+        "wip": 0.7412943295854849
+      }
+    },
+    "test-other": {
+      "greedy": {
+        "wer": 0.3457577899623636,
+        "cer": 0.18733822655980759,
+        "mer": 0.33391759995571874,
+        "wil": 0.5185365485613327,
+        "wip": 0.48146345143866726
+      }
+    }
+  }
+]
\ No newline at end of file
diff --git a/examples/models/transducer/rnnt/results/sentencepiece/figs/rnnt-tiny-sp256-batch-loss.svg b/examples/models/transducer/rnnt/results/sentencepiece/figs/rnnt-tiny-sp256-batch-loss.svg
new file mode 100644
index 0000000000..c90c689ff3
--- /dev/null
+++ b/examples/models/transducer/rnnt/results/sentencepiece/figs/rnnt-tiny-sp256-batch-loss.svg
@@ -0,0 +1 @@
+<svg viewBox="0 0 330 200" xmlns="http://www.w3.org/2000/svg"><g><g><g><g><g><line x1="30" y1="177" x2="25" y2="177" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="30" y1="154.875" x2="25" y2="154.875" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="30" y1="132.75" x2="25" y2="132.75" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="30" y1="110.625" x2="25" y2="110.625" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="30" y1="88.5" x2="25" y2="88.5" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="30" y1="66.375" x2="25" y2="66.375" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="30" y1="44.25" x2="25" y2="44.25" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="30" y1="22.125" x2="25" y2="22.125" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="30" y1="0" x2="25" y2="0" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line></g><g transform="translate(20, 0)"><text x="0" y="177" dx="0em" dy="0.3em" style="text-anchor: end; visibility: hidden; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">30</text><text x="0" y="154.875" dx="0em" dy="0.3em" style="text-anchor: end; visibility: inherit; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">35</text><text x="0" y="132.75" dx="0em" dy="0.3em" style="text-anchor: end; visibility: hidden; font-family: Roboto, 
sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">40</text><text x="0" y="110.625" dx="0em" dy="0.3em" style="text-anchor: end; visibility: inherit; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">45</text><text x="0" y="88.5" dx="0em" dy="0.3em" style="text-anchor: end; visibility: hidden; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">50</text><text x="0" y="66.375" dx="0em" dy="0.3em" style="text-anchor: end; visibility: inherit; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">55</text><text x="0" y="44.25" dx="0em" dy="0.3em" style="text-anchor: end; visibility: hidden; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">60</text><text x="0" y="22.125" dx="0em" dy="0.3em" style="text-anchor: end; visibility: inherit; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">65</text><text x="0" y="0" dx="0em" dy="0.3em" style="text-anchor: end; visibility: hidden; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">70</text></g><line x1="30" y1="0" x2="30" y2="177" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line></g></g><g transform="translate(30, 0)" clip-path="url(#clip_0)"><clipPath id="clip_0"><rect width="300" height="177"></rect></clipPath><g><g><g><line x1="0" y1="0" x2="0" y2="177" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="33.33333333333333" y1="0" x2="33.33333333333333" y2="177" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line 
x1="66.66666666666666" y1="0" x2="66.66666666666666" y2="177" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="100" y1="0" x2="100" y2="177" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="133.33333333333331" y1="0" x2="133.33333333333331" y2="177" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="166.66666666666669" y1="0" x2="166.66666666666669" y2="177" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="200" y1="0" x2="200" y2="177" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="233.33333333333334" y1="0" x2="233.33333333333334" y2="177" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="266.66666666666663" y1="0" x2="266.66666666666663" y2="177" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="300" y1="0" x2="300" y2="177" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line></g><g><line x1="0" y1="177" x2="300" y2="177" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="0" y1="154.875" x2="300" y2="154.875" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="0" y1="132.75" x2="300" y2="132.75" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="0" y1="110.625" x2="300" y2="110.625" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="0" y1="88.5" x2="300" y2="88.5" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="0" y1="66.375" x2="300" y2="66.375" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="0" y1="44.25" x2="300" y2="44.25" fill="rgb(0, 
0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="0" y1="22.125" x2="300" y2="22.125" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="0" y1="0" x2="300" y2="0" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line></g></g></g><g><g><line x1="0" y1="309.75" x2="300" y2="309.75" fill="rgb(0, 0, 0)" stroke="rgb(153, 153, 153)" stroke-width="1.5px"></line></g></g><g><g><line x1="33.33333333333333" y1="0" x2="33.33333333333333" y2="177" fill="rgb(0, 0, 0)" stroke="rgb(153, 153, 153)" stroke-width="1.5px"></line></g></g><g><g><g><g><g><path stroke="rgb(255, 112, 67)" stroke-width="2px" d="M39.11600000000001,-19.669663810729972L39.39366666666667,7.728929328918461L39.65466666666667,10.803492736816402L39.686,10.855280685424809L39.931666666666665,12.823020172119136L40.096000000000004,14.292258453369136L40.26233333333334,15.636010551452628L40.511,17.896618652343754L40.70033333333333,19.298539352416988L40.71233333333333,19.358092117309575L40.73033333333333,19.48097877502442L40.74266666666667,19.582866668701172L40.959,21.031004905700684L41.435,36.327237224578866L41.660333333333334,37.49985952377319L41.67033333333333,37.56174173355102L42.038666666666664,39.422107315063485L42.076,39.67807617187499L42.35733333333333,40.979121780395516L42.397333333333336,41.16144304275513L42.684666666666665,42.6507516860962L43.20766666666666,52.7675983428955L43.60666666666666,53.6879991531372L43.997,55.0874722480774L44.020999999999994,55.2216685295105L44.114999999999995,55.58337392807006L44.184,55.73568248748779L44.201,55.82328987121582L44.43866666666667,56.77164402008056L44.574666666666666,57.20139284133911L44.89533333333334,58.157883167266846L44.92166666666667,58.26316394805907L45.36533333333333,64.79239540100097L45.440666666666665,65.15553560256957L45.510999999999996,65.528196144104L45.87466666666667,66.13315973281861L46.39033333333333,67.4592553138733L46.495333333333335,67.72769851684
57L46.52633333333333,67.8094654083252L46.71666666666667,68.276282787323L47.086,74.04427528381348L47.147999999999996,73.70358552932738L47.404666666666664,74.0753514289856L48.291000000000004,75.52489070892334L48.39066666666667,75.77177810668945L49.032666666666664,80.82753438949585L49.510333333333335,80.64890985488891L49.51233333333333,80.64938249588012L49.609,80.76970338821411L50.16366666666667,81.75245904922485L50.67433333333334,82.60463075637817L50.95766666666667,86.80460300445556L51.33166666666667,86.13114023208618L51.713,86.48503017425537L51.791,86.546439743042L51.93066666666667,86.7928882598877L52.51233333333334,87.72652301788331L52.690333333333335,87.8563473701477L52.977333333333334,89.60361671447754L53.00266666666667,90.25364999771118L53.26,90.7123649597168L53.419333333333334,90.52305536270141L53.43633333333334,90.6695065498352L53.45466666666666,90.53240690231324L53.69166666666667,90.8248872756958L53.787,90.90602960586547L54.85666666666666,95.07183666229248L55.086,94.05738029479981L55.519999999999996,94.50323266983033L55.757333333333335,94.91569633483887L55.785666666666664,94.88971796035766L56.24733333333333,95.55946712493896L56.32166666666667,95.59662008285522L56.449333333333335,95.71331176757813L57.651,97.71862621307373L57.81366666666667,97.95774879455566L58.122,98.39067106246948L58.37733333333333,98.7025465965271L58.40566666666667,98.72999353408814L58.453333333333326,98.77136650085448L58.58166666666667,98.83735055923462L58.82966666666666,100.48651313781738L59.00866666666667,100.64498291015626L59.233,100.61851501464844L59.23833333333334,100.6387541770935L59.44166666666667,100.80725069046021L59.52966666666667,101.0098448753357L59.992000000000004,101.3169095993042L59.99966666666667,101.33616971969604L60.157333333333334,101.56018466949463L60.607666666666674,101.87273540496825L60.751000000000005,103.90070285797118L61.13633333333334,103.35309762954712L61.243,103.2189182281494L61.27733333333334,103.1747938156128L61.550999999999995,103.49387712478638L62.275333333333
33,104.24026165008546L62.745333333333335,105.0677547454834L63.07,105.57076292037964L63.229,105.41897764205932L63.425000000000004,105.4737364768982L63.43166666666667,105.5039011001587L63.754333333333335,105.84886150360107L63.78433333333333,105.89450511932374L64.09966666666666,106.22261924743653L64.22866666666667,106.31772136688232L64.416,106.39488000869751L64.42533333333333,106.39287128448485L64.70866666666667,106.32216081619264L64.97266666666667,107.11699104309082L65.39633333333333,107.4660701751709L65.646,107.60337238311767L66.19200000000001,108.1792685508728L66.55833333333334,109.46540908813476L66.68333333333334,109.02102527618409L67.08466666666666,109.24471950531006L67.816,109.98880834579468L67.95666666666668,110.06039657592773L68.35433333333333,110.2838376045227L68.37833333333333,110.28652153015138L68.46633333333334,110.41582260131837L68.66233333333334,110.92229118347167L68.84733333333334,111.08533544540406L69.848,111.70452890396119L69.84833333333333,111.70859699249267L69.89466666666667,111.74152994155884L69.91833333333334,111.7663773536682L70.06233333333333,111.87781934738159L70.18566666666666,111.93063697814942L70.32866666666666,111.99089870452882L71.214,113.04630603790282L72.80766666666666,114.36495752334595L73.344,114.3798457145691L73.40933333333334,114.48075456619263L74.24266666666666,114.95329427719116L74.247,114.95363187789917L74.27533333333334,115.04365310668946L74.40966666666667,116.48166332244874L74.65566666666666,115.47706489562988L75.00833333333334,115.64978141784668L75.01,115.6459834098816L75.13566666666668,115.82481050491333L75.91833333333334,116.4202031135559L75.93366666666667,116.40553436279298L76.10900000000001,116.43129329681396L76.13566666666667,116.43647546768189L76.36733333333333,117.23969507217407L76.447,116.98767614364624L76.52466666666666,117.08389234542847L77.08133333333333,116.9384539604187L77.526,117.28589572906493L77.54566666666668,117.31356210708618L78.22133333333333,117.70779533386231L78.34933333333332,118.0625461578369L78.614333333
33333,118.14561281204224L78.98533333333333,118.28728694915772L79.00833333333333,118.230215549469L79.70333333333333,118.75542097091675L79.73833333333333,118.72814283370973L79.798,118.79848194122314L79.876,118.89299325942993L80.37266666666666,119.01204814910888L81.18033333333334,119.49166059494019L81.668,119.79253034591675L81.82733333333333,119.88935422897339L81.92733333333334,119.92689542770385L81.99166666666667,119.87564764022828L82.71033333333334,120.42622375488281L82.86800000000001,120.54332056045531L83.17966666666666,120.64360485076905L83.23866666666666,120.57674303054809L83.434,120.75540132522583L83.645,120.80273294448853L83.71366666666667,120.86756916046143L83.88266666666667,120.95065269470216L84.32466666666667,120.67322931289674L84.605,121.26171798706055L84.64966666666666,121.21128044128419L84.71733333333333,121.24804515838623L85.19533333333332,121.5617606163025L85.24933333333334,121.66467819213868L85.33200000000001,121.73251905441283L85.463,121.87375431060791L85.89166666666667,121.95955553054809L86.10499999999999,123.87982835769652L86.18,122.44202070236206L86.65733333333333,122.25281238555908L86.67033333333333,122.22830257415772L86.67366666666668,122.21690855026246L87.108,122.3661955833435L87.14833333333333,122.34371137619019L87.30266666666667,122.51166772842407L87.66633333333333,122.82659854888915L88.70966666666666,123.087428855896L88.848,123.31936054229736L89.00433333333334,123.4377908706665L89.01766666666667,123.45662899017334L89.18266666666668,123.6192006111145L89.55566666666667,123.7052212715149L89.80033333333334,123.78136711120605L89.82,123.7745644569397L90.14633333333335,123.68640003204347L90.432,123.92329444885253L90.43366666666667,123.87532138824463L90.541,123.84873533248901L90.64999999999999,123.96551141738892L91.13666666666667,124.20495471954347L91.52799999999999,124.48934955596924L91.81400000000001,124.59657154083253L91.83066666666667,124.60801620483399L92.09833333333334,124.50388326644897L92.558,124.7641396522522L92.89666666666668,124.95135612487
793L93.52233333333334,125.43868274688721L94.54966666666665,125.47080545425415L94.584,125.50789089202881L94.657,125.54406480789184L95.026,125.85092697143556L95.51766666666667,126.06903390884399L95.73433333333332,126.13165884017944L95.73566666666667,126.12455234527589L95.87966666666667,128.00610237121583L96.18766666666667,126.16168842315675L96.58433333333333,126.43435163497925L96.75733333333334,126.62784748077392L98.15400000000001,127.05211029052735L98.46733333333333,127.18887233734131L98.65366666666667,127.31527004241944L99.03333333333335,127.55027389526367L99.09533333333333,127.65390043258667L99.44666666666667,127.69034442901612L99.51433333333333,127.67930488586425L100.48366666666666,127.75074119567871L100.73966666666666,128.0019161224365L101.22733333333335,128.24130878448486L101.435,128.31061820983888L101.45833333333334,128.3245442390442L101.468,128.316357421875L102.07300000000001,128.33080673217773L102.19066666666667,128.18764715194703L102.622,128.57068891525267L102.73766666666666,128.67193536758424L102.87233333333333,128.722474193573L103.06299999999999,128.8308777809143L103.20733333333334,128.8481291770935L103.963,128.70209999084472L104.399,129.03451852798463L104.86633333333333,129.27104158401488L104.899,129.26908349990845L104.903,129.2711428642273L105.25433333333332,129.51917810440062L105.50766666666667,129.47581329345704L105.64500000000001,130.4983383178711L105.646,130.4383129119873L105.88066666666667,129.29099378585815L105.97500000000001,129.36806802749635L106.45833333333334,129.4656852722168L106.804,129.72737646102905L106.842,129.6997944831848L106.848,129.72300453186034L107.328,129.96108055114746L107.387,129.91265172958373L107.60333333333332,130.88099184036255L107.75633333333333,129.99497566223144L107.80366666666666,130.22457790374756L107.85566666666666,130.08295440673828L108.79966666666667,130.4203863143921L108.98899999999999,130.4594467163086L109.22999999999999,130.5556966781616L109.482,130.62313241958617L109.961,130.73116464614867L109.99366666666667,130.60
483446121216L110.13866666666667,130.51721019744872L110.23100000000001,130.53403959274291L110.64099999999999,130.8479407310486L111.084,131.14492807388305L111.35533333333333,131.17490701675416L111.46733333333333,145.08996419906617L111.76366666666667,130.92491369247438L111.953,130.9134690284729L112.43733333333334,131.23547258377076L112.676,131.4408182144165L112.792,131.57092952728271L113.07366666666667,131.64634952545165L113.22366666666666,131.6360189437866L113.38266666666667,131.69521722793579L113.47266666666665,132.819309425354L113.93033333333332,131.7817274093628L114.258,131.902098941803L114.31766666666667,131.96936588287355L114.78233333333333,132.12199516296386L115.099,132.16147756576538L115.47133333333333,132.08507852554322L115.51966666666667,131.93964014053344L115.56733333333332,131.68174695968628L115.82100000000001,132.0847578048706L115.82466666666666,132.0413592338562L116.435,132.01316957473756L116.95466666666667,132.34526739120483L116.98333333333333,132.34595947265626L117.06833333333334,132.34113178253173L117.43299999999999,132.54943141937255L117.52133333333335,132.43755054473877L117.77066666666667,132.5988899230957L117.87466666666667,132.5637119293213L117.98433333333334,132.61961860656737L118.01533333333333,132.56529865264892L118.74966666666666,133.03581275939942L119.013,133.13867969512938L119.208,133.16658239364625L119.56433333333334,133.3715566635132L119.64833333333334,133.41650819778442L119.73766666666667,133.45182123184205L119.80133333333333,133.3425061225891L120.14166666666667,133.52376394271852L120.232,133.50943279266357L120.36766666666666,133.56103506088257L120.42999999999999,133.63662385940552L121.25133333333332,135.07146062850953L121.29266666666666,133.9739882469177L121.93033333333332,133.72335348129272L122.36633333333334,133.79051914215088L122.41033333333333,133.8095767021179L122.46666666666667,133.81482639312745L122.61099999999999,133.96306686401368L122.61766666666666,133.94574794769287L122.74733333333333,133.9132201194763L122.935,134.0202395439147
8L122.99,133.97625017166138L123.08533333333334,134.03040132522582L123.20966666666666,135.66081018447875L123.35133333333333,134.391836643219L123.66733333333333,134.0636212348938L124.01933333333332,134.1246425628662L124.21833333333333,134.228134059906L124.41266666666667,134.3142728805542L124.62966666666667,134.4694679260254L125.165,134.72579126358033L125.70400000000001,134.30409421920777L125.868,134.3782819747925L125.90533333333335,134.37765741348267L126.38933333333333,134.4908211708069L127.123,136.32220373153686L127.39266666666666,134.8977819442749L127.451,135.00020999908446L127.92,135.02850093841553L127.975,134.98294172286987L128.411,135.22913703918456L128.901,135.2816170692444L129.17333333333335,136.249737739563L129.279,135.91011142730713L129.68866666666668,135.5411476135254L129.72833333333335,135.54864234924315L130.06033333333335,135.67772397994995L130.261,135.65264024734498L130.282,135.64916296005248L130.47266666666667,135.74095659255983L130.79166666666666,135.7621916770935L131.21866666666665,135.54356145858765L132.25433333333334,135.88531465530394L132.40233333333333,135.93625860214235L132.45366666666666,135.98578462600707L132.69966666666667,136.0948634147644L132.756,136.04871339797972L132.76533333333333,136.06662311553956L132.882,136.12241163253785L133.33033333333333,135.81617403030396L133.70233333333334,135.88175296783447L134.02933333333334,136.14476079940795L134.21433333333334,136.276931476593L134.507,136.39850149154663L135.45266666666666,136.4058443069458L135.587,136.30875034332274L135.72666666666666,136.38584146499633L135.821,136.49343481063843L135.929,136.61048097610472L136.09766666666667,136.70004644393921L136.21133333333333,136.73321571350098L136.40333333333334,136.76805610656737L136.43566666666666,136.77936573028563L136.57833333333335,136.85760469436644L136.91333333333333,137.51817111968995L137.13366666666667,136.79801816940306L137.552,136.73380651474L137.68033333333332,136.73932628631593L137.80466666666666,136.8027108192444L138.34266666666667,137.043200
68359374L138.96533333333335,136.42346706390381L139.13400000000001,136.9366201400757L139.46633333333332,136.79756240844728L139.561,136.8465651512146L139.77366666666666,136.94519519805908L139.99666666666667,137.14628705978393L140.10033333333334,137.20877695083618L140.10933333333332,137.2054515838623L140.15966666666665,137.2178921699524L140.53633333333335,137.42345724105834L140.93333333333334,137.24110221862793L140.943,137.1356863975525L141.58133333333333,137.46619749069214L141.692,137.47919511795044L142.35333333333332,137.84236907958984L142.36266666666666,137.8452049255371L142.414,137.88267860412597L143.371,137.5940131187439L143.38633333333334,137.58864526748658L143.39066666666668,137.54102668762206L143.80566666666667,137.81326789855956L143.83366666666666,137.86326656341552L144.285,138.16364679336547L144.295,138.1904354095459L144.35766666666666,138.18927068710326L144.54366666666667,138.09379720687866L145.15433333333334,137.91478443145752L145.512,138.07521228790284L145.631,137.9360701560974L145.902,138.10276050567626L146.99333333333334,138.28050727844237L146.995,138.254478263855L147.18566666666666,138.35796976089478L148.81166666666667,138.4756236076355L148.838,138.28023719787598L148.84933333333333,138.37501859664917L149.053,138.42091541290284L149.24466666666666,138.30567541122437L149.38066666666668,138.40243177413942L149.60033333333334,138.54498367309571L149.7086666666667,138.62085943222047L149.82433333333333,138.69857511520385L150.09333333333333,138.86268281936646L150.89533333333333,138.99238901138307L151.04066666666665,138.92785663604735L151.04999999999998,138.9207839012146L151.74,139.28115577697753L152.22266666666667,139.38393831253052L152.35766666666666,139.34982376098634L152.78433333333334,139.04750232696534L153.06300000000002,139.4689124107361L153.15900000000002,139.44625940322877L153.35966666666667,139.45108709335327L154.24633333333335,139.63585596084596L154.40233333333333,139.7380139350891L154.46766666666664,140.07694816589355L154.63366666666667,139.37381029129
028L155.10566666666668,139.63232803344727L155.21133333333333,139.6339485168457L155.24099999999999,139.70835571289064L156.02866666666665,139.9310877799988L156.06,139.9236943244934L156.24433333333334,139.86598148345948L156.333,139.93400802612305L156.66799999999998,140.01624755859376L157.08766666666668,139.93475074768065L157.15733333333333,139.92151679992676L157.953,140.17198276519775L158.03533333333334,140.17321500778198L158.157,140.1564024925232L158.20766666666668,140.14730415344238L158.666,140.1610276222229L158.68699999999998,140.04124689102173L158.778,140.19419689178466L158.816,140.05486907958985L159.06333333333333,139.9569986343384L159.17533333333333,140.09097547531127L159.32233333333332,140.02114276885987L159.85766666666666,140.31615514755248L160.19833333333332,140.3853801727295L160.58633333333333,140.23837194442748L161.08833333333334,140.27937355041504L161.31933333333336,140.38838481903076L161.737,140.5705035209656L161.954,140.70767068862915L162.04066666666668,140.6363862991333L162.09566666666666,140.64307079315185L162.18266666666668,140.70476732254028L162.54466666666667,140.25866174697876L162.85,140.53605136871337L163.82066666666668,141.1544852256775L163.92,141.16466388702392L164.18433333333334,141.16172676086427L164.70366666666666,140.83752880096435L164.733,140.84915914535523L165.28966666666668,141.05359325408935L165.43366666666665,141.0984266281128L165.63400000000001,141.20490589141846L166.107,141.2192032814026L166.53633333333332,141.04819164276122L166.80933333333334,140.9470633506775L167.16566666666668,141.20473709106446L167.32066666666665,141.20387620925902L167.78933333333333,141.548650932312L167.82933333333332,141.55857639312745L167.99599999999998,141.5910029411316L168.15233333333336,142.602623462677L168.50466666666665,141.86432447433472L168.63866666666667,141.58555068969727L168.71733333333333,141.5867322921753L168.72,141.5619017601013L168.88766666666666,141.46129674911498L169.17966666666666,141.60572233200074L169.40666666666667,141.80104122161865L169.6253
3333333334,141.79813785552977L169.94966666666667,141.71903800964355L169.961,141.73247451782225L170.286,141.5394681930542L170.60633333333334,141.452063369751L170.70133333333334,141.46490907669067L171.12766666666664,141.62044172286988L171.12933333333334,141.60751161575317L171.32333333333335,141.76782131195068L171.64233333333334,141.88861484527587L171.80266666666668,141.85353813171386L171.946,141.91793546676635L172.17466666666667,142.14647426605225L172.20366666666666,141.87411489486695L172.30866666666665,141.90542736053467L172.82766666666666,142.05262126922608L173.02766666666668,142.13774728775024L173.47466666666668,142.25084352493286L173.54066666666668,142.2527003288269L174.42066666666668,142.3212163925171L174.62466666666666,142.21480464935303L174.67433333333335,142.1095069885254L174.85399999999998,142.1278387069702L175.55333333333334,142.3556516647339L175.72566666666665,142.36118831634522L176.477,142.1624090194702L176.86733333333333,142.1582734107971L176.88033333333334,142.07701292037964L176.996,142.213369846344L177.13533333333334,142.21966609954833L177.397,142.39626502990723L177.74233333333333,142.50107316970826L177.901,143.16136951446532L178.61833333333334,142.22164106369019L178.91533333333334,142.40924577713014L178.99733333333333,142.4776774406433L179.11633333333333,142.48351793289186L179.191,142.5628203392029L179.26100000000002,142.61155300140382L179.59466666666668,142.68174018859864L179.60566666666668,142.6634759902954L180.09266666666667,142.16031589508057L180.20133333333334,142.43135862350465L180.29999999999998,142.42340812683105L180.79033333333334,142.50130949020385L180.91666666666666,142.5902335166931L181.34233333333333,142.82447776794433L182.267,142.95431900024414L182.77766666666668,143.02207546234132L182.906,143.07453861236573L183.11866666666668,143.17976875305175L183.22433333333333,143.17544746398926L183.33833333333334,143.17821578979493L183.895,142.85455799102783L184.29166666666666,143.09464273452758L184.87400000000002,143.16969137191774L185.1376666666667
,143.31820192337037L185.64366666666666,143.4747136116028L185.65066666666667,143.47408905029297L185.79066666666665,142.84952774047852L186.095,143.41178483963012L186.762,143.44220266342163L186.83733333333333,143.5054015159607L186.948,143.5279026031494L187,143.54655504226685L187.0353333333333,143.5579490661621L187.94433333333333,143.3549835205078L188.22566666666665,143.30456285476686L188.26366666666667,143.26977310180663L188.33833333333334,143.39390888214112L188.41633333333334,143.4881163597107L188.49133333333333,143.62855825424194L188.49466666666666,143.64305820465088L188.94666666666666,143.8297682762146L189.02966666666666,143.81827297210694L189.1303333333333,143.8081787109375L189.17133333333334,143.8339038848877L189.38233333333335,143.83489980697632L189.586,143.94740524291993L189.83233333333334,143.3246332168579L189.88633333333334,143.37272443771363L191.09733333333332,144.14215021133424L191.10700000000003,144.1495943069458L191.14366666666666,144.17852668762208L191.62599999999998,144.40173139572144L192.26166666666666,143.7039613723755L192.39666666666668,143.85368728637695L192.93833333333333,144.10346117019654L192.95566666666667,144.08629417419434L193.60433333333333,143.6950824737549L194.18533333333332,143.65924615859984L195.67233333333334,144.089146900177L195.74466666666666,144.13106002807618L195.82333333333335,144.31451225280762L196.34366666666668,144.39354457855225L196.58,144.57370519638062L196.66866666666667,144.5822464942932L197.09,144.71446781158448L197.29766666666666,144.67599821090698L197.49033333333333,144.71738805770875L198.79966666666664,144.62667474746704L199.02800000000002,144.66154890060426L199.37300000000002,143.29580211639404L199.37333333333333,144.04792585372925L199.61533333333333,144.7065341949463L199.7396666666667,144.7239881515503L199.84699999999998,144.5813349723816L200.05333333333334,144.7074457168579L200.40466666666669,144.68717279434205L200.93733333333333,144.9365921974182L201.293,144.94999494552613L201.345,145.22689504623412L201.65866666666668,
144.44837093353271L201.76333333333332,144.74513883590697L202.25333333333333,144.71318492889404L202.39733333333334,144.84699296951294L202.41066666666666,144.86009187698363L202.453,144.89846019744874L203.03766666666667,145.10557823181153L203.96833333333333,145.0180214881897L204.0956666666667,145.03058023452758L204.5186666666667,145.0488444328308L204.675,145.14598903656005L205.45566666666667,145.36070308685302L205.739,145.34961290359496L205.78233333333333,145.52565479278564L205.88233333333332,145.47022075653075L205.92166666666668,145.48107461929322L205.97166666666666,145.46041345596313L206.25266666666664,145.56562671661376L206.29433333333333,145.5499282836914L206.72533333333334,145.55377693176268L207.00633333333332,145.54253482818604L207.08633333333333,145.61749906539916L207.18,145.65269393920897L207.261,144.7199538230896L207.53533333333334,145.14998960494995L207.682,145.0358974456787L208.19433333333336,145.28752813339233L208.33066666666667,145.43472204208373L208.43800000000002,145.53968210220336L208.5713333333333,145.5796709060669L208.594,145.60477151870728L209.10433333333336,145.7310341835022L209.429,145.22456560134887L209.99966666666666,145.42746362686157L210.291,145.5758222579956L210.71,145.72862033843995L211.16366666666667,146.64529066085817L211.40566666666666,146.00246515274048L211.86599999999999,145.9387261390686L211.962,145.92834491729735L212.26466666666667,146.1418098449707L212.605,146.14363288879395L212.69333333333336,146.1596014022827L213.011,146.20813150405883L213.886,145.73800563812256L214.34166666666664,145.91828441619873L214.965,146.06279439926146L215.09466666666665,145.84835042953492L215.12833333333333,145.97024116516113L215.27833333333334,145.80285873413087L215.53333333333333,146.0240209579468L215.55466666666666,146.06946201324462L215.757,145.91595497131348L215.834,145.90724487304686L215.86866666666666,145.8999020576477L215.91033333333334,145.9127984046936L216.09933333333333,146.04334859848024L216.95666666666668,146.71478576660155L217.29066666666665,14
6.02248487472534L217.63566666666668,146.05408430099487L217.76333333333332,146.09052829742433L217.821,146.11431226730346L217.89933333333335,146.0985463142395L218.10466666666665,146.32608919143678L218.294,146.43489789962769L218.30466666666666,146.48052463531494L218.65933333333334,146.48081159591675L218.66633333333334,146.46374588012696L218.707,146.47589950561525L219.09366666666668,146.0510796546936L219.182,146.28706254959107L219.52966666666666,146.13588495254515L219.63266666666667,146.17514791488648L220.1783333333333,146.46735820770263L220.48799999999997,146.5224208831787L220.79933333333332,146.56625833511353L221.17733333333334,146.00489587783812L221.936,146.4917329788208L222.0593333333333,146.53097906112671L222.70566666666664,146.8117109298706L223.22933333333333,146.50395412445067L223.39333333333335,146.31793613433837L223.749,146.41876058578492L223.82633333333334,146.53654947280884L223.989,146.58803358078003L224.351,146.74071350097657L224.51733333333334,146.76778907775878L224.65633333333335,146.820792388916L224.70200000000003,146.79925346374512L224.784,147.65512189865112L225.045,146.6393320083618L225.253,146.75837001800537L225.62866666666665,146.67348031997682L225.85033333333334,146.78039846420288L225.862,146.7762797355652L225.88733333333332,146.76859931945802L226.0353333333333,146.8896966934204L226.27700000000002,146.92705221176146L226.285,146.93496894836426L226.36533333333333,146.97283086776733L227.22233333333332,146.81412477493285L227.31766666666667,146.85203733444214L228.22266666666667,147.16232614517213L228.24666666666664,147.1877981185913L228.64366666666666,147.26391019821168L228.7193333333333,147.55314960479737L229.26333333333332,147.1486026763916L229.38433333333336,147.08224725723267L229.39033333333333,147.05994873046876L229.55633333333333,147.174462890625L229.735,147.20964088439942L229.922,147.35111246109008L230.609,147.51471376419067L230.82399999999998,147.00252285003663L230.84400000000002,147.01432199478148L231.71666666666667,147.3646671295166L232.37166666
666667,147.54082717895508L232.496,147.5937292098999L232.736,146.82746000289916L232.677,148.65580415725708L232.776,147.3349920272827L233.338,147.321724319458L234.16233333333335,147.771813583374L234.60933333333332,146.97872200012208L234.99866666666665,147.53826141357422L236.26999999999998,147.68920269012452L236.887,147.7271321296692L236.99333333333334,147.52332258224487L237.04600000000002,147.6406219482422L237.349,147.71253089904786L237.38933333333333,147.75581130981445L237.6723333333333,147.98034954071045L237.832,148.01539249420165L237.924,148.05289993286132L238.11766666666668,148.13105449676513L238.61366666666666,147.72433004379272L238.72400000000002,147.8275852203369L238.938,147.82211608886718L239.5066666666667,147.9125593185425L239.63133333333332,147.9609375L240.08766666666665,148.16091527938843L240.22933333333333,148.08409423828124L241.1136666666667,147.9303846359253L241.58,148.09228105545043L241.73533333333333,148.16648569107056L242.39133333333334,149.39047393798828L242.40433333333334,149.35310153961183L242.44599999999997,148.5913561820984L242.47233333333332,148.32726802825928L243.03599999999997,148.20880393981935L243.13233333333332,148.16221504211427L243.50333333333336,148.34014749526978L243.55333333333334,148.38707399368286L244.23000000000002,148.54689416885375L244.34333333333333,148.9116379737854L244.6,148.15656023025514L244.608,148.14504804611207L244.97166666666666,148.18304500579833L245.113,148.20509033203126L245.54333333333332,148.31408472061156L246.06466666666668,148.36253042221068L246.512,147.96357078552245L246.87466666666666,148.16521968841553L247.04399999999998,148.2719521522522L247.08566666666667,148.28300857543945L247.1076666666667,148.31445608139038L247.173,148.34784479141234L247.47266666666667,148.4583583831787L247.55833333333334,148.53788022994996L247.84833333333333,148.55638074874878L248.06166666666667,148.51645946502686L248.47400000000002,148.58158264160156L248.65833333333336,148.76722927093505L248.72433333333333,148.56823053359986L248.914,148.5
2808980941774L249.22866666666664,148.57786903381347L250.02233333333334,148.72037029266357L250.27233333333334,148.03190116882325L250.35,148.10911045074462L250.40933333333334,148.3713924407959L250.88933333333335,148.4493106842041L250.95333333333332,148.4659881591797L251.14033333333333,148.59198074340821L251.58666666666667,148.86712532043458L251.63,148.8379059791565L251.66966666666667,148.85659217834473L251.6903333333333,148.8554274559021L251.72466666666665,148.9027590751648L251.87966666666668,148.92097263336183L252.001,148.89551753997802L252.26733333333334,148.78883571624755L253.338,149.02486925125123L253.477,149.1105185508728L254.111,148.65420055389404L254.65633333333335,148.67857532501222L254.67033333333333,148.66682682037353L254.98766666666668,148.79418668746948L255.42766666666665,148.9804916381836L255.94,149.15839033126832L256.32733333333334,148.62714185714722L256.4893333333333,148.7128755569458L256.5,148.6776131629944L256.62333333333333,148.7736436843872L256.66166666666663,148.89919738769532L256.9993333333333,149.00792169570923L257.148,149.18149909973144L257.236,149.1970962524414L257.3283333333333,149.28060178756715L257.429,149.26306343078613L257.527,149.28269491195678L257.625,149.2619662284851L257.729,149.28364019393922L257.952,149.32472620010375L258.26433333333335,149.24697675704957L258.423,149.48223381042482L258.5876666666667,149.18946647644043L258.76866666666666,149.11308431625366L258.86266666666666,149.1660876274109L259.41666666666663,149.41096630096436L259.9123333333333,149.44400053024293L259.92966666666666,146.7643117904663L259.95166666666665,149.24750003814697L259.9526666666667,148.83296012878418L260.33233333333334,149.05493259429932L260.42466666666667,148.89739122390748L260.53933333333333,149.0538860321045L260.703,148.94676532745362L260.76666666666665,148.96442184448242L260.82233333333335,149.0021318435669L260.839,149.0057104110718L261.44,149.14679374694825L261.8276666666667,149.29783630371094L261.91866666666664,149.39766483306886L261.94399999999996,149.
44254884719848L262.3426666666667,149.32877740859985L262.3933333333333,149.27405233383178L262.59166666666664,149.2130816459656L262.64366666666666,149.16811323165894L262.71500000000003,149.21909093856812L263.832,148.6729205131531L264.00866666666667,148.80019598007203L264.0266666666667,148.67320747375487L264.5333333333333,149.3656265258789L264.936,149.49607543945314L265.0613333333333,149.52406253814698L265.70599999999996,149.70312595367432L265.75866666666667,149.70131978988647L265.815,150.18013887405397L265.907,149.37943439483644L266.473,149.46680545806885L266.9013333333333,149.67442989349365L267.17400000000004,149.76433296203612L267.203,149.76404600143434L267.4583333333333,149.74458332061766L267.67833333333334,149.83004693984986L267.85966666666667,149.2662368774414L268.119,149.5728796005249L268.3953333333333,149.53960905075073L268.6073333333333,149.5390182495117L268.845,149.6639811515808L268.87733333333335,149.71305141448974L268.9866666666667,149.67881870269775L269.1813333333333,149.75550470352172L270.25766666666664,149.80489568710328L270.34366666666665,149.67372093200683L270.5593333333333,149.68150262832643L270.83233333333334,149.8971619606018L271.3113333333333,150.10250759124756L271.5146666666667,150.07129640579222L271.813,149.98642358779907L271.83733333333333,149.66614179611207L272.05466666666666,150.01147356033326L272.2316666666667,149.9954712867737L272.237,149.94678926467896L272.265,149.91706352233888L272.5536666666667,150.0001132965088L272.92133333333334,150.1225273132324L273.00266666666664,150.11445865631103L273.2006666666667,150.1868402481079L273.70966666666664,149.8315830230713L273.83433333333335,149.98559646606446L273.86533333333335,149.77081489562988L274.4463333333333,150.10490455627442L274.48266666666666,150.0361859321594L274.83133333333336,150.1306128501892L274.8736666666667,150.19286642074584L275.51266666666663,150.3555055618286L276.0966666666667,149.94960823059083L276.497,150.10151166915892L276.5453333333333,150.19720458984375L277.0676666666667,150.3494
4562911988L277.3396666666667,150.28692197799683L277.42466666666667,150.35317611694336L277.533,152.0280301094055L277.69233333333335,149.8404788017273L278.74133333333333,150.51351957321168L279.214,150.60266304016113L279.21833333333336,150.61135625839233L279.24966666666666,150.5238332748413L279.3916666666667,150.58333539962769L279.557,151.0118688583374L279.5686666666667,150.79531488418579L279.7283333333333,150.3152297973633L279.9533333333333,150.4114291191101L280.214,150.48614015579224L280.2656666666667,150.5585217475891L280.27933333333334,150.54687452316284L280.28933333333333,150.52248287200928L280.94933333333336,150.65021409988404L281.25233333333335,150.63826303482057L281.32300000000004,150.69808588027954L281.42633333333333,151.14344873428345L281.4993333333333,151.07898387908935L281.5203333333333,150.656324672699L281.746,150.5661008834839L281.87533333333334,150.70596885681152L282.34766666666667,150.58666076660157L282.592,150.72475633621215L282.9943333333333,150.78361701965332L283.44,150.59020557403565L283.46633333333335,150.13189573287963L283.663,150.03019351959227L283.69733333333335,150.1729817390442L284.0756666666667,150.30650281906128L284.38866666666667,150.46176538467407L284.7776666666667,150.6102252960205L284.9746666666667,150.7265456199646L285.2753333333333,150.7487597465515L285.86899999999997,150.6444748878479L285.89300000000003,150.71120166778564L286.041,150.75794248580934L286.295,150.73377027511597L286.707,150.8821457862854L286.88266666666664,150.91084184646607L287.1513333333333,150.91359329223633L287.28000000000003,150.7485909461975L288.486,150.74302053451538L288.5266666666667,150.7872631072998L288.56800000000004,150.82303190231323L288.84700000000004,150.9218307495117L288.9193333333333,150.93037204742433L289.35133333333334,151.01782751083374L289.387,151.05545310974122L289.47633333333334,150.69072618484498L289.752,150.76864442825317L291.2803333333333,150.87534313201905L291.65666666666664,150.88552179336548L292.3396666666667,150.93206005096437L292.373,150.961
05995178223L292.5826666666667,151.09922304153443L292.624,151.11027946472169" style="fill: none;" fill="none"></path></g></g></g></g></g></g><g transform="translate(30, 177)" clip-path="url(#clip_1)"><clipPath id="clip_1"><rect width="300" height="23"></rect></clipPath><g><g><line x1="0" y1="0" x2="0" y2="5" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="33.33333333333333" y1="0" x2="33.33333333333333" y2="5" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="66.66666666666666" y1="0" x2="66.66666666666666" y2="5" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="100" y1="0" x2="100" y2="5" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="133.33333333333331" y1="0" x2="133.33333333333331" y2="5" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="166.66666666666669" y1="0" x2="166.66666666666669" y2="5" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="200" y1="0" x2="200" y2="5" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="233.33333333333334" y1="0" x2="233.33333333333334" y2="5" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="266.66666666666663" y1="0" x2="266.66666666666663" y2="5" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="300" y1="0" x2="300" y2="5" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line></g><g transform="translate(0, 8)"><text x="0" y="0" dx="0em" dy="0.95em" style="text-anchor: middle; visibility: hidden; font-family: Roboto, 
sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">-100k</text><text x="33.33333333333333" y="0" dx="0em" dy="0.95em" style="text-anchor: middle; visibility: inherit; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">0</text><text x="66.66666666666666" y="0" dx="0em" dy="0.95em" style="text-anchor: middle; visibility: inherit; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">100k</text><text x="100" y="0" dx="0em" dy="0.95em" style="text-anchor: middle; visibility: inherit; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">200k</text><text x="133.33333333333331" y="0" dx="0em" dy="0.95em" style="text-anchor: middle; visibility: inherit; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">300k</text><text x="166.66666666666669" y="0" dx="0em" dy="0.95em" style="text-anchor: middle; visibility: inherit; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">400k</text><text x="200" y="0" dx="0em" dy="0.95em" style="text-anchor: middle; visibility: inherit; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">500k</text><text x="233.33333333333334" y="0" dx="0em" dy="0.95em" style="text-anchor: middle; visibility: inherit; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">600k</text><text x="266.66666666666663" y="0" dx="0em" dy="0.95em" style="text-anchor: middle; visibility: inherit; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" 
stroke-width="1px">700k</text><text x="300" y="0" dx="0em" dy="0.95em" style="text-anchor: middle; visibility: hidden; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">800k</text></g><line x1="0" y1="0" x2="300" y2="0" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line></g></g></g></g></svg>
\ No newline at end of file
diff --git a/examples/models/transducer/rnnt/results/sentencepiece/figs/rnnt-tiny-sp256-epoch-loss.svg b/examples/models/transducer/rnnt/results/sentencepiece/figs/rnnt-tiny-sp256-epoch-loss.svg
new file mode 100644
index 0000000000..21b438c108
--- /dev/null
+++ b/examples/models/transducer/rnnt/results/sentencepiece/figs/rnnt-tiny-sp256-epoch-loss.svg
@@ -0,0 +1 @@
+<svg viewBox="0 0 330 200" xmlns="http://www.w3.org/2000/svg"><g><g><g><g><g><line x1="30" y1="177" x2="25" y2="177" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="30" y1="163.3846153846154" x2="25" y2="163.3846153846154" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="30" y1="149.76923076923077" x2="25" y2="149.76923076923077" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="30" y1="136.15384615384613" x2="25" y2="136.15384615384613" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="30" y1="122.53846153846153" x2="25" y2="122.53846153846153" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="30" y1="108.92307692307693" x2="25" y2="108.92307692307693" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="30" y1="95.3076923076923" x2="25" y2="95.3076923076923" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="30" y1="81.6923076923077" x2="25" y2="81.6923076923077" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="30" y1="68.07692307692307" x2="25" y2="68.07692307692307" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="30" y1="54.46153846153847" x2="25" y2="54.46153846153847" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="30" y1="40.84615384615384" x2="25" y2="40.84615384615384" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="30" y1="27.230769230769234" x2="25" 
y2="27.230769230769234" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="30" y1="13.615384615384606" x2="25" y2="13.615384615384606" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="30" y1="0" x2="25" y2="0" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line></g><g transform="translate(20, 0)"><text x="0" y="177" dx="0em" dy="0.3em" style="text-anchor: end; visibility: hidden; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">34</text><text x="0" y="163.3846153846154" dx="0em" dy="0.3em" style="text-anchor: end; visibility: inherit; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">36</text><text x="0" y="149.76923076923077" dx="0em" dy="0.3em" style="text-anchor: end; visibility: hidden; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">38</text><text x="0" y="136.15384615384613" dx="0em" dy="0.3em" style="text-anchor: end; visibility: inherit; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">40</text><text x="0" y="122.53846153846153" dx="0em" dy="0.3em" style="text-anchor: end; visibility: hidden; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">42</text><text x="0" y="108.92307692307693" dx="0em" dy="0.3em" style="text-anchor: end; visibility: inherit; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">44</text><text x="0" y="95.3076923076923" dx="0em" dy="0.3em" style="text-anchor: end; visibility: hidden; font-family: Roboto, sans-serif; 
font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">46</text><text x="0" y="81.6923076923077" dx="0em" dy="0.3em" style="text-anchor: end; visibility: inherit; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">48</text><text x="0" y="68.07692307692307" dx="0em" dy="0.3em" style="text-anchor: end; visibility: hidden; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">50</text><text x="0" y="54.46153846153847" dx="0em" dy="0.3em" style="text-anchor: end; visibility: inherit; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">52</text><text x="0" y="40.84615384615384" dx="0em" dy="0.3em" style="text-anchor: end; visibility: hidden; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">54</text><text x="0" y="27.230769230769234" dx="0em" dy="0.3em" style="text-anchor: end; visibility: inherit; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">56</text><text x="0" y="13.615384615384606" dx="0em" dy="0.3em" style="text-anchor: end; visibility: hidden; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">58</text><text x="0" y="0" dx="0em" dy="0.3em" style="text-anchor: end; visibility: hidden; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">60</text></g><line x1="30" y1="0" x2="30" y2="177" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line></g></g><g transform="translate(30, 0)" clip-path="url(#clip_0)"><clipPath id="clip_0"><rect width="300" height="177"></rect></clipPath><g><g><g><line x1="0" 
y1="0" x2="0" y2="177" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="37.5" y1="0" x2="37.5" y2="177" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="75" y1="0" x2="75" y2="177" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="112.5" y1="0" x2="112.5" y2="177" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="150" y1="0" x2="150" y2="177" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="187.5" y1="0" x2="187.5" y2="177" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="225" y1="0" x2="225" y2="177" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="262.5" y1="0" x2="262.5" y2="177" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="300" y1="0" x2="300" y2="177" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line></g><g><line x1="0" y1="177" x2="300" y2="177" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="0" y1="163.3846153846154" x2="300" y2="163.3846153846154" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="0" y1="149.76923076923077" x2="300" y2="149.76923076923077" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="0" y1="136.15384615384613" x2="300" y2="136.15384615384613" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="0" y1="122.53846153846153" x2="300" y2="122.53846153846153" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="0" y1="108.92307692307693" x2="300" y2="108.92307692307693" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" 
stroke-width="1px" opacity="0.25"></line><line x1="0" y1="95.3076923076923" x2="300" y2="95.3076923076923" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="0" y1="81.6923076923077" x2="300" y2="81.6923076923077" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="0" y1="68.07692307692307" x2="300" y2="68.07692307692307" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="0" y1="54.46153846153847" x2="300" y2="54.46153846153847" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="0" y1="40.84615384615384" x2="300" y2="40.84615384615384" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="0" y1="27.230769230769234" x2="300" y2="27.230769230769234" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="0" y1="13.615384615384606" x2="300" y2="13.615384615384606" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line><line x1="0" y1="0" x2="300" y2="0" fill="rgb(0, 0, 0)" stroke="rgb(66, 66, 66)" stroke-width="1px" opacity="0.25"></line></g></g></g><g><g><line x1="0" y1="408.46153846153845" x2="300" y2="408.46153846153845" fill="rgb(0, 0, 0)" stroke="rgb(153, 153, 153)" stroke-width="1.5px"></line></g></g><g><g><line x1="37.5" y1="0" x2="37.5" y2="177" fill="rgb(0, 0, 0)" stroke="rgb(153, 153, 153)" stroke-width="1.5px"></line></g></g><g><g><g><g><g><path stroke="rgb(255, 112, 67)" stroke-width="2px" 
d="M43.125,-33.502299675574655L45,0.4411662175105229L46.875,22.33226086543156L48.75,37.96813377967247L50.625,50.09028449425331L52.5,59.539001171405495L54.375,67.61820162259616L56.25,74.08795562157265L58.125,79.63574805626503L60,84.30933644221379L61.875,88.80599608788124L63.75,92.65552711486816L65.625,95.93786298311674L67.5,98.82946498577411L69.375,101.91170751131499L71.25,104.4531656411978L73.125,106.80832011883075L75,108.9757166642409L76.875,111.28243842491737L78.75,113.0889657827524L80.625,115.0619302896353L82.5,116.6288650219257L84.375,118.10963366581844L86.25,119.77236909132736L88.125,121.08833665114182L90,122.49369049072266L91.875,123.78885665306679L93.75,125.1064602778508L95.625,126.12598844674918L97.5,127.39484772315392L99.375,128.48277898935171L101.25,129.4658203125L103.125,130.28185316232535L105,131.1790140592135L106.875,131.9441211407001L108.75,132.89558381300705L110.625,133.84302124610315L112.5,134.59311808072601L114.375,135.36269187927246L116.25,135.65456067598785L118.125,136.85034238375152L120,137.80583029526932L121.875,138.2242993574876L123.75,139.00472831726074L125.625,139.3416278545673L127.5,140.14607840317947L129.375,140.96553919865536L131.25,141.39766810490536L133.125,141.9660838200496L135,142.5232288654034L136.875,142.80886503366324L138.75,143.49144172668457L140.625,144.16737028268668L142.5,144.54054891146149L144.375,144.73285146859976L146.25,145.5430671985333L148.125,145.75295096177322L150,146.44713592529297L151.875,146.94704466599686L153.75,147.24325033334586L155.625,147.70955481896033L157.5,147.98656918452335L159.375,148.45882063645584L161.25,149.11355825570914L163.125,149.1965560913086L165,149.8646159538856L166.875,150.06572209871734L168.75,150.39270136906552L170.625,150.8850011091966L172.5,151.102234180157L174.375,151.34634546133188L176.25,151.58269192622257L178.125,151.8526426462027L180,152.46471272982086L181.875,152.72302921001727L183.75,153.10267419081467L185.625,153.3902840247521L187.5,153.78158921461838L189.375,153.83321615365836L191.25,
154.02289581298828L193.125,154.59424605736365L195,154.63792639512283L196.875,154.9759166424091L198.75,155.29248222937952L200.625,155.55313594524677L202.5,156.00414452185998L204.375,156.14985818129318L206.25,156.24056889460635L208.125,156.87432333139273L210,156.67500906724197L211.875,157.09049166165866L213.75,157.45027013925406L215.625,157.4657218639667L217.5,157.8508983025184L219.375,157.80773735046387L221.25,158.07644154475284L223.125,158.50179246755746L225,158.88948792677658L226.875,159.05698981651892L228.75,159.12853519733136L230.625,159.26264058626614L232.5,159.79168686499963L234.375,159.89740782517654L236.25,160.18696535550632L238.125,160.52864324129544L240,160.35213000957782L241.875,160.58777530376727L243.75,160.89997805081882L245.625,161.0444451845609L247.5,161.47943071218637L249.375,161.45805799044095L251.25,161.68232873769907L253.125,161.8547907609206L255,161.6683572622446L256.875,162.10469319270206L258.75,162.2847642164964L260.625,162.50524344811072L262.5,162.51402106651892L264.375,162.94783797630896L266.25,162.9245175581712L268.125,163.24866617642917L270,163.21937282268817L271.875,163.63932213416467L273.75,163.78807419996994L275.625,164.01143602224496L277.5,163.89478199298566L279.375,164.18371626046988L281.25,164.18984501178448L283.125,164.36472217853253" style="fill: none;" fill="none"></path></g></g></g></g></g></g><g transform="translate(30, 177)" clip-path="url(#clip_1)"><clipPath id="clip_1"><rect width="300" height="23"></rect></clipPath><g><g><line x1="0" y1="0" x2="0" y2="5" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="37.5" y1="0" x2="37.5" y2="5" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="75" y1="0" x2="75" y2="5" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="112.5" y1="0" x2="112.5" y2="5" style="visibility: inherit;" fill="rgb(0, 0, 0)" 
stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="150" y1="0" x2="150" y2="5" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="187.5" y1="0" x2="187.5" y2="5" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="225" y1="0" x2="225" y2="5" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="262.5" y1="0" x2="262.5" y2="5" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line><line x1="300" y1="0" x2="300" y2="5" style="visibility: inherit;" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line></g><g transform="translate(0, 8)"><text x="0" y="0" dx="0em" dy="0.95em" style="text-anchor: middle; visibility: hidden; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">-20</text><text x="37.5" y="0" dx="0em" dy="0.95em" style="text-anchor: middle; visibility: inherit; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">0</text><text x="75" y="0" dx="0em" dy="0.95em" style="text-anchor: middle; visibility: inherit; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">20</text><text x="112.5" y="0" dx="0em" dy="0.95em" style="text-anchor: middle; visibility: inherit; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">40</text><text x="150" y="0" dx="0em" dy="0.95em" style="text-anchor: middle; visibility: inherit; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">60</text><text x="187.5" y="0" dx="0em" dy="0.95em" style="text-anchor: middle; 
visibility: inherit; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">80</text><text x="225" y="0" dx="0em" dy="0.95em" style="text-anchor: middle; visibility: inherit; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">100</text><text x="262.5" y="0" dx="0em" dy="0.95em" style="text-anchor: middle; visibility: inherit; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">120</text><text x="300" y="0" dx="0em" dy="0.95em" style="text-anchor: middle; visibility: hidden; font-family: Roboto, sans-serif; font-size: 12px; font-weight: 200;" fill="rgb(33, 33, 33)" stroke="none" stroke-width="1px">140</text></g><line x1="0" y1="0" x2="300" y2="0" fill="rgb(0, 0, 0)" stroke="rgb(204, 204, 204)" stroke-width="1px"></line></g></g></g></g></svg>
\ No newline at end of file
diff --git a/examples/models/transducer/rnnt/small.yml.j2 b/examples/models/transducer/rnnt/small.yml.j2
index 8053bb560c..1c336564e2 100644
--- a/examples/models/transducer/rnnt/small.yml.j2
+++ b/examples/models/transducer/rnnt/small.yml.j2
@@ -22,6 +22,7 @@ model_config:
             prob: 1.0
             num_masks: 1
             mask_factor: 27
+    encoder_reduction_positions: [ post, post, post, post ]
     encoder_reduction_factors: [ 3, 0, 2, 0 ] # downsampled to 30ms and add 2 reduction after second layer
     encoder_dmodel: 320
     encoder_rnn_type: lstm
diff --git a/examples/models/transducer/rnnt/tiny.yml.j2 b/examples/models/transducer/rnnt/tiny.yml.j2
index 86d2c57057..1853790a46 100644
--- a/examples/models/transducer/rnnt/tiny.yml.j2
+++ b/examples/models/transducer/rnnt/tiny.yml.j2
@@ -22,6 +22,7 @@ model_config:
             prob: 1.0
             num_masks: 1
             mask_factor: 27
+    encoder_reduction_positions: [ pre, pre, pre, pre ]
     encoder_reduction_factors: [ 3, 0, 2, 0 ] # downsampled to 30ms and add 2 reduction after second layer
     encoder_dmodel: 128
     encoder_rnn_type: lstm
diff --git a/examples/save.py b/examples/save.py
new file mode 100644
index 0000000000..a7ab0d5512
--- /dev/null
+++ b/examples/save.py
@@ -0,0 +1,51 @@
+# Copyright 2024 Huy Le Nguyen (@nglehuy)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from tensorflow_asr import tf, keras
+from tensorflow_asr import tokenizers
+from tensorflow_asr.configs import Config
+from tensorflow_asr.models.base_model import BaseModel
+from tensorflow_asr.utils import cli_util, env_util, file_util
+
+
+def main(  # script entry point: build the configured model, optionally load weights, then save it
+    config_path: str,  # passed to Config together with repodir
+    output: str,  # target of model.save(); also the stem of the "<output>.png" diagram
+    h5: str = None,  # optional weights file, see the existence check below
+    bs: int = 2,  # forwarded to model.make(batch_size=...)
+    repodir: str = os.path.realpath(os.path.join(os.path.dirname(__file__), "..")),
+):
+    assert output  # output is required; note that assert is skipped under python -O
+    keras.backend.clear_session()
+    env_util.setup_seed()
+
+    config = Config(config_path, training=False, repodir=repodir)
+    tokenizer = tokenizers.get(config)
+
+    model: BaseModel = keras.models.model_from_config(config.model_config)
+    model.tokenizer = tokenizer
+    model.make(batch_size=bs)
+    if h5 and tf.io.gfile.exists(h5):  # weights are optional: an unset or missing h5 loads nothing
+        model.load_weights(h5, by_name=file_util.is_hdf5_filepath(h5))
+    model.summary()
+
+    model.save(output)
+    print(model.to_json())
+    keras.utils.plot_model(model, to_file=f"{output}.png", show_shapes=True, show_dtype=True, expand_nested=True, show_layer_activations=True)
+
+
+if __name__ == "__main__":
+    cli_util.run(main)
diff --git a/examples/test.py b/examples/test.py
index b427ba1e3b..30c87ac930 100644
--- a/examples/test.py
+++ b/examples/test.py
@@ -16,7 +16,7 @@
 import json
 import os
 
-from tensorflow_asr import datasets, tf, tokenizers  # import to aid logging messages
+from tensorflow_asr import datasets, tf, keras, tokenizers  # import to aid logging messages
 from tensorflow_asr.callbacks import PredictLogger
 from tensorflow_asr.configs import Config
 from tensorflow_asr.models.base_model import BaseModel
@@ -50,7 +50,7 @@ def main(
 
     tokenizer = tokenizers.get(config)
 
-    model: BaseModel = tf.keras.models.model_from_config(config.model_config)
+    model: BaseModel = keras.models.model_from_config(config.model_config)
     model.tokenizer = tokenizer
     model.make(batch_size=batch_size)
     model.load_weights(h5, by_name=file_util.is_hdf5_filepath(h5), skip_mismatch=False)
diff --git a/examples/tflite.py b/examples/tflite.py
index f2d46c56ff..d8b3b9cfd5 100644
--- a/examples/tflite.py
+++ b/examples/tflite.py
@@ -14,7 +14,7 @@
 
 import os
 
-from tensorflow_asr import tf  # import to aid logging messages
+from tensorflow_asr import tf, keras  # import to aid logging messages
 from tensorflow_asr import tokenizers
 from tensorflow_asr.configs import Config
 from tensorflow_asr.models.base_model import BaseModel
@@ -30,13 +30,13 @@ def main(
     repodir: str = os.path.realpath(os.path.join(os.path.dirname(__file__), "..")),
 ):
     assert output
-    tf.keras.backend.clear_session()
+    keras.backend.clear_session()
     env_util.setup_seed()
 
     config = Config(config_path, training=False, repodir=repodir)
     tokenizer = tokenizers.get(config)
 
-    model: BaseModel = tf.keras.models.model_from_config(config.model_config)
+    model: BaseModel = keras.models.model_from_config(config.model_config)
     model.tokenizer = tokenizer
     model.make(batch_size=bs)
     if h5 and tf.io.gfile.exists(h5):
diff --git a/examples/train.py b/examples/train.py
index 62b88413c9..f20c704a83 100644
--- a/examples/train.py
+++ b/examples/train.py
@@ -15,7 +15,7 @@
 import json
 import os
 
-from tensorflow_asr import callbacks, datasets, tf, tokenizers  # import to aid logging messages
+from tensorflow_asr import callbacks, datasets, tf, keras, tokenizers  # import to aid logging messages
 from tensorflow_asr.configs import Config
 from tensorflow_asr.models.base_model import BaseModel
 from tensorflow_asr.utils import cli_util, env_util, file_util
@@ -36,7 +36,7 @@ def main(
     ga_steps: int = None,
     repodir: str = os.path.realpath(os.path.join(os.path.dirname(__file__), "..")),
 ):
-    tf.keras.backend.clear_session()
+    keras.backend.clear_session()
     env_util.setup_seed()
     strategy = env_util.setup_strategy(devices)
     env_util.setup_mxp(mxp=mxp)
@@ -73,7 +73,7 @@ def main(
         logger.info(f"eval_data_loader.element_spec = {json.dumps(eval_data_loader.element_spec, indent=2, default=str)}")
 
     with strategy.scope():
-        model: BaseModel = tf.keras.models.model_from_config(config.model_config)
+        model: BaseModel = keras.models.model_from_config(config.model_config)
         model.tokenizer = tokenizer
         output_shapes = model.make(**shapes)
         if config.learning_config.pretrained:
@@ -83,7 +83,7 @@ def main(
                 skip_mismatch=True,
             )
         model.compile(
-            optimizer=tf.keras.optimizers.get(config.learning_config.optimizer_config),
+            optimizer=keras.optimizers.get(config.learning_config.optimizer_config),
             output_shapes=output_shapes,
             steps_per_execution=spx,
             jit_compile=jit_compile,
diff --git a/requirements.txt b/requirements.txt
index eb9076cf47..91b58751cd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,9 +14,11 @@ pytest~=7.4.1
 black~=24.3.0
 pylint~=3.2.1
 matplotlib~=3.7.2
-pydot~=1.4.2
+pydot-ng~=1.4.2
 graphviz~=0.20.1
 pre-commit~=3.7.0
+tf2onnx~=1.16.1
+netron~=7.6.8
 
 # extra=tf2-12
 tensorflow~=2.12.0
diff --git a/scripts/create_mls_trans.py b/scripts/create_mls_trans.py
index 3e1a82f3ac..b118820f44 100644
--- a/scripts/create_mls_trans.py
+++ b/scripts/create_mls_trans.py
@@ -16,29 +16,16 @@
 import os
 
 import librosa
-import tensorflow as tf
+import keras
 import tqdm
 
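-# example usage: python create_mls_trans.py -dataset-home /mnt/datasets/mls --language polish --opus
+# example usage: python create_mls_trans.py --dataset-home /mnt/datasets/mls --language polish --opus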
 
 base_url = "https://dl.fbaipublicfiles.com/mls/"
 
-langs = [
-    "dutch",
-    "english",
-    "german",
-    "french",
-    "italian",
-    "portuguese",
-    "polish",
-    "spanish"
-]
-
-splits = [
-    "dev",
-    "test",
-    "train"
-]
+langs = ["dutch", "english", "german", "french", "italian", "portuguese", "polish", "spanish"]
+
+splits = ["dev", "test", "train"]
 
 chars = set()
 
@@ -46,17 +33,17 @@
 def prepare_split(dataset_dir, split, opus=False):
     # Setup necessary paths
     split_home = os.path.join(dataset_dir, split)
-    transcripts_infile = os.path.join(split_home, 'transcripts.txt')
-    transcripts_outfile = os.path.join(split_home, 'transcripts_tfasr.tsv')
+    transcripts_infile = os.path.join(split_home, "transcripts.txt")
+    transcripts_outfile = os.path.join(split_home, "transcripts_tfasr.tsv")
     audio_home = os.path.join(split_home, "audio")
     extension = ".opus" if opus else ".flac"
     transcripts = []
 
     # Make paths absolute, get durations and read chars to form alphabet later on
-    with open(transcripts_infile, 'r', encoding='utf8') as infile:
+    with open(transcripts_infile, "r", encoding="utf8") as infile:
         for line in tqdm.tqdm(infile.readlines(), desc=f"Reading from {transcripts_infile}..."):
-            file_id, transcript = line.strip().split('\t')
-            speaker_id, book_id, _ = file_id.split('_')
+            file_id, transcript = line.strip().split("\t")
+            speaker_id, book_id, _ = file_id.split("_")
             audio_path = os.path.join(audio_home, speaker_id, book_id, f"{file_id}{extension}")
             y, sr = librosa.load(audio_path, sr=None)
             duration = librosa.get_duration(y, sr)
@@ -65,7 +52,7 @@ def prepare_split(dataset_dir, split, opus=False):
                 chars.add(char)
 
     # Write transcripts to file
-    with open(transcripts_outfile, 'w', encoding='utf8') as outfile:
+    with open(transcripts_outfile, "w", encoding="utf8") as outfile:
         outfile.write("PATH\tDURATION\tTRANSCRIPT\n")
         for t in tqdm.tqdm(transcripts, desc=f"Writing to {transcripts_outfile}"):
             outfile.write(t)
@@ -73,7 +60,7 @@ def prepare_split(dataset_dir, split, opus=False):
 
 def make_alphabet_file(filepath, chars_list, lang):
     print(f"Writing alphabet to {filepath}...")
-    with open(filepath, 'w', encoding='utf8') as outfile:
+    with open(filepath, "w", encoding="utf8") as outfile:
         outfile.write(f"# Alphabet file for language {lang}\n")
         outfile.write("Automatically generated. Do not edit\n#\n")
         for char in sorted(list(chars_list)):
@@ -84,10 +71,10 @@ def make_alphabet_file(filepath, chars_list, lang):
 
 if __name__ == "__main__":
     ap = argparse.ArgumentParser(description="Download and prepare MLS dataset in a given language")
-    ap.add_argument("--dataset-home", "-d", default=None, required=False,
-                    help="Path to home directory to download and prepare dataset. Default to ~/.keras")
-    ap.add_argument("--language", "-l", type=str, choices=langs, default=None, required=True,
-                    help="Any name of language included in MLS")
+    ap.add_argument(
+        "--dataset-home", "-d", default=None, required=False, help="Path to home directory to download and prepare dataset. Default to ~/.keras"
+    )
+    ap.add_argument("--language", "-l", type=str, choices=langs, default=None, required=True, help="Any name of language included in MLS")
     ap.add_argument("--opus", default=False, action="store_true", help="Whether to use dataset in opus format or not")
 
     args = ap.parse_args()
@@ -97,12 +84,7 @@ def make_alphabet_file(filepath, chars_list, lang):
     dataset_dir = os.path.join(dataset_home, subdir)
     full_url = base_url + fname
 
-    downloaded_file = tf.keras.utils.get_file(
-        fname,
-        full_url,
-        cache_subdir=dataset_home,
-        extract=True
-    )
+    downloaded_file = keras.utils.get_file(fname, full_url, cache_subdir=dataset_home, extract=True)
 
     print(f"Dataset extracted to {dataset_dir}. Preparing...")
 
diff --git a/tensorflow_asr/callbacks.py b/tensorflow_asr/callbacks.py
index e7dd439f05..012dd73869 100644
--- a/tensorflow_asr/callbacks.py
+++ b/tensorflow_asr/callbacks.py
@@ -16,6 +16,7 @@
 
 import numpy as np
 import tensorflow as tf
+import keras
 
 from tensorflow_asr.datasets import ASRDataset
 from tensorflow_asr.utils import file_util
@@ -24,8 +25,8 @@
 serialization_lib = importlib.import_module(f"{KERAS_SRC}.saving.serialization_lib")
 
 
-@tf.keras.utils.register_keras_serializable("tensorflow_asr.callbacks")
-class TestLogger(tf.keras.callbacks.Callback):
+@keras.utils.register_keras_serializable("tensorflow_asr.callbacks")
+class TestLogger(keras.callbacks.Callback):
     def __init__(self):
         super().__init__()
         self.wer = {"numer": 0, "denom": 0}
@@ -80,8 +81,8 @@ def from_config(cls, config):
         return cls(**config)
 
 
-@tf.keras.utils.register_keras_serializable("tensorflow_asr.callbacks")
-class PredictLogger(tf.keras.callbacks.Callback):
+@keras.utils.register_keras_serializable("tensorflow_asr.callbacks")
+class PredictLogger(keras.callbacks.Callback):
     def __init__(self, test_dataset: ASRDataset, output_file_path: str):
         super().__init__()
         self.test_dataset = test_dataset
@@ -123,8 +124,8 @@ def from_config(cls, config):
         return cls(**config)
 
 
-@tf.keras.utils.register_keras_serializable("tensorflow_asr.callbacks")
-class TensorBoard(tf.keras.callbacks.TensorBoard):
+@keras.utils.register_keras_serializable("tensorflow_asr.callbacks")
+class TensorBoard(keras.callbacks.TensorBoard):
     def __init__(
         self,
         log_dir="logs",
@@ -165,8 +166,8 @@ def from_config(cls, config):
         return cls(**config)
 
 
-@tf.keras.utils.register_keras_serializable("tensorflow_asr.callbacks")
-class TerminateOnNaN(tf.keras.callbacks.TerminateOnNaN):
+@keras.utils.register_keras_serializable("tensorflow_asr.callbacks")
+class TerminateOnNaN(keras.callbacks.TerminateOnNaN):
     def get_config(self):
         return {}
 
@@ -175,8 +176,8 @@ def from_config(cls, config):
         return cls(**config)
 
 
-@tf.keras.utils.register_keras_serializable("tensorflow_asr.callbacks")
-class ModelCheckpoint(tf.keras.callbacks.ModelCheckpoint):
+@keras.utils.register_keras_serializable("tensorflow_asr.callbacks")
+class ModelCheckpoint(keras.callbacks.ModelCheckpoint):
     def __init__(
         self,
         filepath,
@@ -203,8 +204,8 @@ def from_config(cls, config):
         return cls(**config)
 
 
-@tf.keras.utils.register_keras_serializable("tensorflow_asr.callbacks")
-class BackupAndRestore(tf.keras.callbacks.BackupAndRestore):
+@keras.utils.register_keras_serializable("tensorflow_asr.callbacks")
+class BackupAndRestore(keras.callbacks.BackupAndRestore):
     def __init__(
         self,
         backup_dir,
@@ -223,8 +224,8 @@ def from_config(cls, config):
         return cls(**config)
 
 
-@tf.keras.utils.register_keras_serializable("tensorflow_asr.callbacks")
-class EarlyStopping(tf.keras.callbacks.EarlyStopping):
+@keras.utils.register_keras_serializable("tensorflow_asr.callbacks")
+class EarlyStopping(keras.callbacks.EarlyStopping):
     def get_config(self):
         return {}
 
diff --git a/tensorflow_asr/losses/ctc_loss.py b/tensorflow_asr/losses/ctc_loss.py
index 5e8e52bf26..f46519547a 100644
--- a/tensorflow_asr/losses/ctc_loss.py
+++ b/tensorflow_asr/losses/ctc_loss.py
@@ -13,14 +13,15 @@
 # limitations under the License.
 
 import tensorflow as tf
+import keras
 
 from tensorflow_asr.utils import env_util, math_util
 
 logger = tf.get_logger()
 
 
-class CtcLoss(tf.keras.losses.Loss):
-    def __init__(self, blank=0, reduction=tf.keras.losses.Reduction.AUTO, name=None):
+class CtcLoss(keras.losses.Loss):
+    def __init__(self, blank=0, reduction=keras.losses.Reduction.AUTO, name=None):
         super().__init__(reduction=reduction, name=name)
         self.blank = blank
         self.use_tpu = env_util.has_devices("TPU")
diff --git a/tensorflow_asr/losses/rnnt_loss.py b/tensorflow_asr/losses/rnnt_loss.py
index 11000c3b7c..86f5febb28 100644
--- a/tensorflow_asr/losses/rnnt_loss.py
+++ b/tensorflow_asr/losses/rnnt_loss.py
@@ -19,6 +19,7 @@
 
 import numpy as np
 import tensorflow as tf
+import keras
 
 from tensorflow_asr.utils import env_util, math_util, shape_util
 
@@ -29,11 +30,11 @@
 logger = tf.get_logger()
 
 
-class RnntLoss(tf.keras.losses.Loss):
+class RnntLoss(keras.losses.Loss):
     def __init__(
         self,
         blank,
-        reduction=tf.keras.losses.Reduction.AUTO,
+        reduction=keras.losses.Reduction.AUTO,
         output_shapes=None,
         name=None,
     ):
diff --git a/tensorflow_asr/metrics/error_rates.py b/tensorflow_asr/metrics/error_rates.py
index a3bc72b62c..0ad18001cf 100644
--- a/tensorflow_asr/metrics/error_rates.py
+++ b/tensorflow_asr/metrics/error_rates.py
@@ -13,9 +13,10 @@
 # limitations under the License.
 
 import tensorflow as tf
+import keras
 
 
-class ErrorRate(tf.keras.metrics.Metric):
+class ErrorRate(keras.metrics.Metric):
     """Metric for WER or CER"""
 
     def __init__(self, name="error_rate", **kwargs):
diff --git a/tensorflow_asr/models/activations/glu.py b/tensorflow_asr/models/activations/glu.py
index 4bf049ce7b..a2a7ea26a8 100644
--- a/tensorflow_asr/models/activations/glu.py
+++ b/tensorflow_asr/models/activations/glu.py
@@ -30,3 +30,8 @@ def call(self, inputs):
     def compute_output_shape(self, input_shape):
         B, T, V = input_shape
         return (B, T, V // 2)
+
+    def get_config(self):
+        """Return the parent layer config extended with this layer's `axis`."""
+        base_config = super().get_config()
+        return {**base_config, "axis": self.axis}
diff --git a/tensorflow_asr/models/base_layer.py b/tensorflow_asr/models/base_layer.py
index 54c331aaac..c59183936d 100644
--- a/tensorflow_asr/models/base_layer.py
+++ b/tensorflow_asr/models/base_layer.py
@@ -12,12 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import tensorflow as tf
+import keras
 
 from tensorflow_asr.utils import math_util
 
 
-class Layer(tf.keras.layers.Layer):
+class Layer(keras.layers.Layer):
     def __init__(
         self,
         trainable=True,
diff --git a/tensorflow_asr/models/base_model.py b/tensorflow_asr/models/base_model.py
index 731fc68d02..25c595c6a4 100644
--- a/tensorflow_asr/models/base_model.py
+++ b/tensorflow_asr/models/base_model.py
@@ -17,6 +17,7 @@
 import importlib
 
 import tensorflow as tf
+import keras
 from keras import callbacks as callbacks_module
 from keras.optimizers import Optimizer
 from tensorflow.python.eager import context  # pylint: disable=no-name-in-module
@@ -42,7 +43,7 @@
 logger = tf.get_logger()
 
 
-class BaseModel(tf.keras.Model):
+class BaseModel(keras.Model):
     def __init__(self, speech_config: dict, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.feature_extraction = FeatureExtraction(**speech_config, dtype=self.dtype)
@@ -68,22 +69,11 @@ def save(
         self,
         filepath,
         overwrite=True,
-        include_optimizer=True,
         save_format=None,
-        signatures=None,
-        options=None,
-        save_traces=True,
+        **kwargs,
     ):
         with file_util.save_file(filepath) as path:
-            super().save(
-                filepath=path,
-                overwrite=overwrite,
-                include_optimizer=include_optimizer,
-                save_format=save_format,
-                signatures=signatures,
-                options=options,
-                save_traces=save_traces,
-            )
+            super().save(filepath=path, overwrite=overwrite, save_format=save_format, **kwargs)
 
     def save_weights(
         self,
@@ -105,7 +95,7 @@ def load_weights(
         with file_util.read_file(filepath) as path:
             super().load_weights(filepath=path, by_name=by_name, skip_mismatch=skip_mismatch, options=options)
 
-    def add_custom_metric(self, metric: tf.keras.metrics.Metric):
+    def add_custom_metric(self, metric: keras.metrics.Metric):
         if not hasattr(self, "_tfasr_metrics"):
             self._tfasr_metrics = {}
         self._tfasr_metrics[metric.name] = metric
@@ -124,10 +114,10 @@ def make(self, input_shape=[None], prediction_shape=[None], batch_size=None, cac
             Batch size, by default None
         """
         assert batch_size is not None and batch_size > 0
-        signals = tf.keras.Input(shape=input_shape, batch_size=batch_size, dtype=tf.float32)
-        signals_length = tf.keras.Input(shape=[], batch_size=batch_size, dtype=tf.int32)
-        predictions = tf.keras.Input(shape=prediction_shape, batch_size=batch_size, dtype=tf.int32)
-        predictions_length = tf.keras.Input(shape=[], batch_size=batch_size, dtype=tf.int32)
+        signals = keras.Input(shape=input_shape, batch_size=batch_size, dtype=tf.float32)
+        signals_length = keras.Input(shape=[], batch_size=batch_size, dtype=tf.int32)
+        predictions = keras.Input(shape=prediction_shape, batch_size=batch_size, dtype=tf.int32)
+        predictions_length = keras.Input(shape=[], batch_size=batch_size, dtype=tf.int32)
         self._per_replica_batch_size = int(batch_size / self.distribute_strategy.num_replicas_in_sync)
         self._batch_size = batch_size
         outputs = self(
@@ -156,13 +146,13 @@ def compile(
         gradn_config=None,
         **kwargs,
     ):
-        optimizer = tf.keras.optimizers.get(optimizer)
+        optimizer = keras.optimizers.get(optimizer)
         if env_util.has_devices("TPU"):
             self.use_loss_scale = False
         else:
             self.use_loss_scale = mxp != "none"
             if self.use_loss_scale:
-                optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)
+                optimizer = keras.mixed_precision.LossScaleOptimizer(optimizer)
                 logger.info("Using loss scale")
         if isinstance(ga_steps, int) and ga_steps > 1:
             self.use_ga = True
@@ -171,7 +161,7 @@ def compile(
         else:
             self.use_ga = False
         self.gwn_config = gwn_config
-        self.gradn = tf.keras.regularizers.get(gradn_config) if gradn_config else None
+        self.gradn = keras.regularizers.get(gradn_config) if gradn_config else None
         self.distribute_reduction_method = "sum"
         super().compile(optimizer=optimizer, loss=loss, run_eagerly=run_eagerly, **kwargs)
 
@@ -278,12 +268,13 @@ def test_step(self, data):
 
     def predict_step(self, data):
         x, y_true = data
+        batch_size, *_ = shape_util.shape_list(x["inputs"])
         inputs = schemas.PredictInput(
             inputs=x["inputs"],
             inputs_length=x["inputs_length"],
-            previous_tokens=self.get_initial_tokens(),
-            previous_encoder_states=self.get_initial_encoder_states(),
-            previous_decoder_states=self.get_initial_decoder_states(),
+            previous_tokens=self.get_initial_tokens(batch_size=batch_size),
+            previous_encoder_states=self.get_initial_encoder_states(batch_size=batch_size),
+            previous_decoder_states=self.get_initial_decoder_states(batch_size=batch_size),
         )
         _tokens = self.recognize(inputs=inputs).tokens
         _beam_tokens = self.recognize_beam(inputs=inputs).tokens
@@ -508,7 +499,7 @@ def fit(
                 steps_per_execution=self._steps_per_execution,
             )
 
-            # Container that configures and calls `tf.keras.Callback`s.
+            # Container that configures and calls `keras.callbacks.Callback`s.
             if not isinstance(callbacks, callbacks_module.CallbackList):
                 callbacks = callbacks_module.CallbackList(
                     callbacks,
@@ -570,45 +561,46 @@ def fit(
                         "`Model.compile(..., run_eagerly=True)`, or "
                         "`tf.config.run_functions_eagerly(True)` for more "
                         "information of where went wrong, or file a "
-                        "issue/bug to `tf.keras`."
+                        "issue/bug to `keras`."
                     )
                 # Override with model metrics instead of last step logs
                 logs = self._validate_and_get_metrics_result(logs)
                 epoch_logs = copy.copy(logs)
 
                 # Run validation.
-                if validation_data and self._should_eval(epoch, validation_freq):
-                    # Create data_handler for evaluation and cache it.
-                    if getattr(self, "_eval_data_handler", None) is None:
-                        self._eval_data_handler = data_adapter.get_data_handler(
+                if validation_data:
+                    if self._should_eval(epoch, validation_freq):
+                        # Create data_handler for evaluation and cache it.
+                        if getattr(self, "_eval_data_handler", None) is None:
+                            self._eval_data_handler = data_adapter.get_data_handler(
+                                x=val_x,
+                                y=val_y,
+                                sample_weight=val_sample_weight,
+                                batch_size=validation_batch_size or batch_size,
+                                steps_per_epoch=validation_steps,
+                                initial_epoch=0,
+                                epochs=1,
+                                max_queue_size=max_queue_size,
+                                workers=workers,
+                                use_multiprocessing=use_multiprocessing,
+                                model=self,
+                                steps_per_execution=self._steps_per_execution,
+                            )
+                        val_logs = self.evaluate(
                             x=val_x,
                             y=val_y,
                             sample_weight=val_sample_weight,
                             batch_size=validation_batch_size or batch_size,
-                            steps_per_epoch=validation_steps,
-                            initial_epoch=0,
-                            epochs=1,
+                            steps=validation_steps,
+                            callbacks=callbacks,
                             max_queue_size=max_queue_size,
                             workers=workers,
                             use_multiprocessing=use_multiprocessing,
-                            model=self,
-                            steps_per_execution=self._steps_per_execution,
+                            return_dict=True,
+                            _use_cached_eval_dataset=True,
                         )
-                    val_logs = self.evaluate(
-                        x=val_x,
-                        y=val_y,
-                        sample_weight=val_sample_weight,
-                        batch_size=validation_batch_size or batch_size,
-                        steps=validation_steps,
-                        callbacks=callbacks,
-                        max_queue_size=max_queue_size,
-                        workers=workers,
-                        use_multiprocessing=use_multiprocessing,
-                        return_dict=True,
-                        _use_cached_eval_dataset=True,
-                    )
-                    val_logs = {"val_" + name: val for name, val in val_logs.items()}
-                    epoch_logs.update(val_logs)
+                        val_logs = {"val_" + name: val for name, val in val_logs.items()}
+                        epoch_logs.update(val_logs)
 
                 callbacks.on_epoch_end(epoch, epoch_logs)
                 training_logs = epoch_logs
diff --git a/tensorflow_asr/models/ctc/base_ctc.py b/tensorflow_asr/models/ctc/base_ctc.py
index cd48572d83..be59b9b6bc 100644
--- a/tensorflow_asr/models/ctc/base_ctc.py
+++ b/tensorflow_asr/models/ctc/base_ctc.py
@@ -14,6 +14,7 @@
 
 
 import tensorflow as tf
+import keras
 
 from tensorflow_asr import schemas
 from tensorflow_asr.losses.ctc_loss import CtcLoss
@@ -26,8 +27,8 @@ def __init__(
         self,
         blank: int,
         speech_config: dict,
-        encoder: tf.keras.layers.Layer,
-        decoder: tf.keras.layers.Layer,
+        encoder: keras.layers.Layer,
+        decoder: keras.layers.Layer,
         **kwargs,
     ):
         super().__init__(speech_config=speech_config, **kwargs)
@@ -94,9 +95,6 @@ def call_next(
         outputs, outputs_length, next_decoder_states = self.decoder.call_next(outputs, outputs_length, previous_decoder_states)
         return outputs, outputs_length, next_encoder_states, next_decoder_states
 
-    def get_initial_tokens(self, batch_size=1):
-        return super().get_initial_tokens(batch_size)
-
     def get_initial_encoder_states(self, batch_size=1):
         return tf.zeros([], dtype=self.dtype)
 
diff --git a/tensorflow_asr/models/ctc/conformer.py b/tensorflow_asr/models/ctc/conformer.py
index cd1a5f2834..057e37aff8 100644
--- a/tensorflow_asr/models/ctc/conformer.py
+++ b/tensorflow_asr/models/ctc/conformer.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import tensorflow as tf
+import keras
 
 from tensorflow_asr.models.base_layer import Layer
 from tensorflow_asr.models.ctc.base_ctc import CtcModel
@@ -29,7 +30,7 @@ def __init__(
     ):
         super().__init__(**kwargs)
         self._vocab_size = vocab_size
-        self.vocab = tf.keras.layers.Dense(
+        self.vocab = keras.layers.Dense(
             units=vocab_size,
             kernel_regularizer=kernel_regularizer,
             bias_regularizer=bias_regularizer,
@@ -51,8 +52,19 @@ def compute_output_shape(self, input_shape):
         outputs_shape = logits_shape[:-1] + (self._vocab_size,)
         return tuple(outputs_shape), tuple(logits_length_shape)
 
+    def get_config(self):
+        """Return the base layer config extended with the vocabulary size and the regularizers.
+
+        Regularizers are emitted in serialized form, as built-in Keras layers do, rather than as live objects.
+        """
+        config = super().get_config()
+        config["vocab_size"] = self._vocab_size
+        config["kernel_regularizer"] = keras.regularizers.serialize(self.vocab.kernel_regularizer)
+        config["bias_regularizer"] = keras.regularizers.serialize(self.vocab.bias_regularizer)
+        return config
+
 
-@tf.keras.utils.register_keras_serializable("tensorflow_asr.models.ctc")
+@keras.utils.register_keras_serializable("tensorflow_asr.models.ctc")
 class Conformer(CtcModel):
     def __init__(
         self,
@@ -141,7 +153,7 @@ def make(self, input_shape=[None], prediction_shape=[None], batch_size=None, **k
             None
             if self.encoder._memory_length is None
             else [
-                tf.keras.Input(shape=[self.encoder._memory_length, self.encoder._dmodel], batch_size=batch_size, dtype=tf.float32)
+                keras.Input(shape=[self.encoder._memory_length, self.encoder._dmodel], batch_size=batch_size, dtype=tf.float32)
                 for _ in range(self.encoder._num_blocks)
             ]
         )
diff --git a/tensorflow_asr/models/ctc/deepspeech2.py b/tensorflow_asr/models/ctc/deepspeech2.py
index 7fbf62afd8..0660439fb4 100644
--- a/tensorflow_asr/models/ctc/deepspeech2.py
+++ b/tensorflow_asr/models/ctc/deepspeech2.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import tensorflow as tf
+import keras
 
 from tensorflow_asr.models.base_layer import Layer
 from tensorflow_asr.models.ctc.base_ctc import CtcModel
@@ -29,7 +30,7 @@ def __init__(
         **kwargs,
     ):
         super().__init__(**kwargs)
-        self.vocab = tf.keras.layers.Dense(
+        self.vocab = keras.layers.Dense(
             vocab_size,
             name="logits",
             kernel_regularizer=kernel_regularizer,
@@ -53,8 +54,20 @@ def compute_output_shape(self, input_shape):
         output_shape = self.vocab.compute_output_shape(output_shape)
         return output_shape, output_length_shape
 
+    def get_config(self):
+        """Return the base layer config extended with the vocabulary size, regularizers and initializer.
+
+        Values are read back from the `vocab` Dense layer; regularizers and initializer are emitted in serialized form.
+        """
+        config = super().get_config()
+        config["vocab_size"] = self.vocab.units
+        config["kernel_regularizer"] = keras.regularizers.serialize(self.vocab.kernel_regularizer)
+        config["bias_regularizer"] = keras.regularizers.serialize(self.vocab.bias_regularizer)
+        config["initializer"] = keras.initializers.serialize(self.vocab.kernel_initializer)
+        return config
+
 
-@tf.keras.utils.register_keras_serializable("tensorflow_asr.models.ctc")
+@keras.utils.register_keras_serializable("tensorflow_asr.models.ctc")
 class DeepSpeech2(CtcModel):
     def __init__(
         self,
diff --git a/tensorflow_asr/models/ctc/jasper.py b/tensorflow_asr/models/ctc/jasper.py
index db5b2da088..54f9b33874 100644
--- a/tensorflow_asr/models/ctc/jasper.py
+++ b/tensorflow_asr/models/ctc/jasper.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import tensorflow as tf
+import keras
 
 from tensorflow_asr.models.base_layer import Layer
 from tensorflow_asr.models.ctc.base_ctc import CtcModel
@@ -57,7 +57,7 @@ def compute_output_shape(self, input_shape):
         return tuple(outputs_shape), tuple(logits_length_shape)
 
 
-@tf.keras.utils.register_keras_serializable("tensorflow_asr.models.ctc")
+@keras.utils.register_keras_serializable("tensorflow_asr.models.ctc")
 class Jasper(CtcModel):
     def __init__(
         self,
diff --git a/tensorflow_asr/models/ctc/transformer.py b/tensorflow_asr/models/ctc/transformer.py
index a37ef36f85..ca0214d922 100644
--- a/tensorflow_asr/models/ctc/transformer.py
+++ b/tensorflow_asr/models/ctc/transformer.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import tensorflow as tf
+import keras
 
 from tensorflow_asr.models.base_layer import Layer
 from tensorflow_asr.models.ctc.base_ctc import CtcModel
@@ -29,7 +30,7 @@ def __init__(
     ):
         super().__init__(**kwargs)
         self._vocab_size = vocab_size
-        self.vocab = tf.keras.layers.Dense(
+        self.vocab = keras.layers.Dense(
             vocab_size,
             kernel_regularizer=kernel_regularizer,
             bias_regularizer=bias_regularizer,
@@ -52,7 +53,7 @@ def compute_output_shape(self, input_shape):
         return tuple(outputs_shape), tuple(logits_length_shape)
 
 
-@tf.keras.utils.register_keras_serializable("tensorflow_asr.models.ctc")
+@keras.utils.register_keras_serializable("tensorflow_asr.models.ctc")
 class Transformer(CtcModel):
     def __init__(
         self,
@@ -127,7 +128,7 @@ def make(self, input_shape=[None], prediction_shape=[None], batch_size=None, **k
             None
             if self.encoder._memory_length is None
             else [
-                tf.keras.Input(shape=[self.encoder._memory_length, self.encoder._dmodel], batch_size=batch_size, dtype=tf.float32)
+                keras.Input(shape=[self.encoder._memory_length, self.encoder._dmodel], batch_size=batch_size, dtype=tf.float32)
                 for _ in range(self.encoder._num_blocks)
             ]
         )
diff --git a/tensorflow_asr/models/encoders/conformer.py b/tensorflow_asr/models/encoders/conformer.py
index 516b5e3d8d..c91b33921e 100644
--- a/tensorflow_asr/models/encoders/conformer.py
+++ b/tensorflow_asr/models/encoders/conformer.py
@@ -15,6 +15,7 @@
 """ http://arxiv.org/abs/2005.08100 """
 
 import tensorflow as tf
+import keras
 
 from tensorflow_asr.models.activations.glu import GLU
 from tensorflow_asr.models.base_layer import Identity, Layer
@@ -26,7 +27,7 @@
 from tensorflow_asr.models.layers.residual import Residual
 from tensorflow_asr.models.layers.subsampling import Conv1dSubsampling, Conv2dSubsampling, VggSubsampling
 
-L2 = tf.keras.regularizers.l2(1e-6)
+L2 = keras.regularizers.l2(1e-6)
 
 
 class FFModule(Layer):
@@ -61,12 +62,21 @@ def __init__(
     ):
         super().__init__(name=name, **kwargs)
         assert norm_position in ("pre", "post", "none")
+        self._config = {
+            "input_dim": input_dim,
+            "dropout": dropout,
+            "scale_factor": scale_factor,
+            "residual_factor": residual_factor,
+            "norm_position": norm_position,
+            "kernel_regularizer": kernel_regularizer,
+            "bias_regularizer": bias_regularizer,
+        }
         self.pre_norm = (
-            tf.keras.layers.LayerNormalization(name="ln", gamma_regularizer=kernel_regularizer, beta_regularizer=kernel_regularizer, dtype=self.dtype)
+            keras.layers.LayerNormalization(name="ln", gamma_regularizer=kernel_regularizer, beta_regularizer=kernel_regularizer, dtype=self.dtype)
             if norm_position == "pre"
             else Identity(name="preiden" if norm_position == "none" else "iden", dtype=self.dtype)
         )
-        self.ffn1 = tf.keras.layers.Dense(
+        self.ffn1 = keras.layers.Dense(
             units=scale_factor * input_dim,
             name="dense_1",
             kernel_regularizer=kernel_regularizer,
@@ -74,17 +84,17 @@ def __init__(
             activation="swish",
             dtype=self.dtype,
         )
-        self.do1 = tf.keras.layers.Dropout(rate=dropout, name="dropout_1", dtype=self.dtype)
-        self.ffn2 = tf.keras.layers.Dense(
+        self.do1 = keras.layers.Dropout(rate=dropout, name="dropout_1", dtype=self.dtype)
+        self.ffn2 = keras.layers.Dense(
             units=input_dim,
             name="dense_2",
             kernel_regularizer=kernel_regularizer,
             bias_regularizer=bias_regularizer,
             dtype=self.dtype,
         )
-        self.do2 = tf.keras.layers.Dropout(rate=dropout, name="dropout_2", dtype=self.dtype)
+        self.do2 = keras.layers.Dropout(rate=dropout, name="dropout_2", dtype=self.dtype)
         self.post_norm = (
-            tf.keras.layers.LayerNormalization(name="ln", gamma_regularizer=kernel_regularizer, beta_regularizer=kernel_regularizer, dtype=self.dtype)
+            keras.layers.LayerNormalization(name="ln", gamma_regularizer=kernel_regularizer, beta_regularizer=kernel_regularizer, dtype=self.dtype)
             if norm_position == "post"
             else Identity(name="postiden" if norm_position == "none" else "iden", dtype=self.dtype)
         )
@@ -103,6 +113,11 @@ def call(self, inputs, training=False):
     def compute_output_shape(self, input_shape):
         return input_shape
 
+    def get_config(self):
+        """Return the parent layer's config overlaid with the values stored in ``self._config``."""
+        merged = super().get_config()
+        return {**merged, **self._config}
+
 
 class MHSAModule(Layer):
     r"""
@@ -138,8 +153,22 @@ def __init__(
         super().__init__(name=name, **kwargs)
         assert norm_position in ("pre", "post", "none")
         assert mha_type in ("relmha", "mha")
+        self._config = {
+            "dmodel": dmodel,
+            "head_size": head_size,
+            "num_heads": num_heads,
+            "residual_factor": residual_factor,
+            "dropout": dropout,
+            "mha_type": mha_type,
+            "relmha_causal": relmha_causal,
+            "norm_position": norm_position,
+            "memory_length": memory_length,
+            "use_attention_bias": use_attention_bias,
+            "kernel_regularizer": kernel_regularizer,
+            "bias_regularizer": bias_regularizer,
+        }
         self.pre_norm = (
-            tf.keras.layers.LayerNormalization(name="ln", gamma_regularizer=kernel_regularizer, beta_regularizer=kernel_regularizer, dtype=self.dtype)
+            keras.layers.LayerNormalization(name="ln", gamma_regularizer=kernel_regularizer, beta_regularizer=kernel_regularizer, dtype=self.dtype)
             if norm_position == "pre"
             else Identity(name="preiden" if norm_position == "none" else "iden", dtype=self.dtype)
         )
@@ -167,9 +196,9 @@ def __init__(
                 name="mhsa",
                 dtype=self.dtype,
             )
-        self.do = tf.keras.layers.Dropout(dropout, name="dropout", dtype=self.dtype)
+        self.do = keras.layers.Dropout(dropout, name="dropout", dtype=self.dtype)
         self.post_norm = (
-            tf.keras.layers.LayerNormalization(name="ln", gamma_regularizer=kernel_regularizer, beta_regularizer=kernel_regularizer, dtype=self.dtype)
+            keras.layers.LayerNormalization(name="ln", gamma_regularizer=kernel_regularizer, beta_regularizer=kernel_regularizer, dtype=self.dtype)
             if norm_position == "post"
             else Identity(name="postiden" if norm_position == "none" else "iden", dtype=self.dtype)
         )
@@ -201,6 +230,11 @@ def compute_output_shape(self, input_shape):
         output_shape, caching_shape, *_ = input_shape
         return output_shape, caching_shape
 
+    def get_config(self):
+        """Return the parent layer's config overlaid with the values stored in ``self._config``."""
+        merged = super().get_config()
+        return {**merged, **self._config}
+
 
 class ConvModule(Layer):
     r"""
@@ -240,8 +274,20 @@ def __init__(
     ):
         super().__init__(name=name, **kwargs)
         assert norm_position in ("pre", "post", "none")
+        self._config = {
+            "input_dim": input_dim,
+            "kernel_size": kernel_size,
+            "dropout": dropout,
+            "padding": padding,
+            "scale_factor": scale_factor,
+            "residual_factor": residual_factor,
+            "norm_position": norm_position,
+            "use_group_conv": use_group_conv,
+            "kernel_regularizer": kernel_regularizer,
+            "bias_regularizer": bias_regularizer,
+        }
         self.pre_norm = (
-            tf.keras.layers.LayerNormalization(name="ln", gamma_regularizer=kernel_regularizer, beta_regularizer=kernel_regularizer, dtype=self.dtype)
+            keras.layers.LayerNormalization(name="ln", gamma_regularizer=kernel_regularizer, beta_regularizer=kernel_regularizer, dtype=self.dtype)
             if norm_position == "pre"
             else Identity(name="preiden" if norm_position == "none" else "iden", dtype=self.dtype)
         )
@@ -278,10 +324,10 @@ def __init__(
                 bias_regularizer=bias_regularizer,
                 dtype=self.dtype,
             )
-        self.bn = tf.keras.layers.BatchNormalization(
+        self.bn = keras.layers.BatchNormalization(
             name="bn", gamma_regularizer=kernel_regularizer, beta_regularizer=bias_regularizer, dtype=self.dtype
         )
-        self.swish = tf.keras.layers.Activation(tf.nn.swish, name="swish", dtype=self.dtype)
+        self.swish = keras.layers.Activation(tf.nn.swish, name="swish", dtype=self.dtype)
         self.pw_conv_2 = Conv1D(
             filters=input_dim,
             kernel_size=1,
@@ -292,9 +338,9 @@ def __init__(
             bias_regularizer=bias_regularizer,
             dtype=self.dtype,
         )
-        self.do = tf.keras.layers.Dropout(rate=dropout, name="dropout", dtype=self.dtype)
+        self.do = keras.layers.Dropout(rate=dropout, name="dropout", dtype=self.dtype)
         self.post_norm = (
-            tf.keras.layers.LayerNormalization(name="ln", gamma_regularizer=kernel_regularizer, beta_regularizer=kernel_regularizer, dtype=self.dtype)
+            keras.layers.LayerNormalization(name="ln", gamma_regularizer=kernel_regularizer, beta_regularizer=kernel_regularizer, dtype=self.dtype)
             if norm_position == "post"
             else Identity(name="postiden" if norm_position == "none" else "iden", dtype=self.dtype)
         )
@@ -316,6 +362,11 @@ def call(self, inputs, training=False):
     def compute_output_shape(self, input_shape):
         return input_shape
 
+    def get_config(self):
+        """Return the parent layer's config overlaid with the values stored in ``self._config``."""
+        merged = super().get_config()
+        return {**merged, **self._config}
+
 
 class ConformerBlock(Layer):
     r"""
@@ -354,8 +405,30 @@ def __init__(
     ):
         super().__init__(name=name, **kwargs)
         assert block_norm_position in ("pre", "post", "none")
+        self._config = {
+            "input_dim": input_dim,
+            "dropout": dropout,
+            "ffm_scale_factor": ffm_scale_factor,
+            "ffm_residual_factor": ffm_residual_factor,
+            "head_size": head_size,
+            "num_heads": num_heads,
+            "mha_type": mha_type,
+            "mhsam_residual_factor": mhsam_residual_factor,
+            "mhsam_use_attention_bias": mhsam_use_attention_bias,
+            "mhsam_causal": mhsam_causal,
+            "kernel_size": kernel_size,
+            "padding": padding,
+            "convm_scale_factor": convm_scale_factor,
+            "convm_residual_factor": convm_residual_factor,
+            "convm_use_group_conv": convm_use_group_conv,
+            "module_norm_position": module_norm_position,
+            "block_norm_position": block_norm_position,
+            "memory_length": memory_length,
+            "kernel_regularizer": kernel_regularizer,
+            "bias_regularizer": bias_regularizer,
+        }
         self.pre_norm = (
-            tf.keras.layers.LayerNormalization(name="ln", gamma_regularizer=kernel_regularizer, beta_regularizer=kernel_regularizer, dtype=self.dtype)
+            keras.layers.LayerNormalization(name="ln", gamma_regularizer=kernel_regularizer, beta_regularizer=kernel_regularizer, dtype=self.dtype)
             if block_norm_position == "pre"
             else Identity(name="preiden" if block_norm_position == "none" else "iden", dtype=self.dtype)
         )
@@ -412,7 +485,7 @@ def __init__(
             dtype=self.dtype,
         )
         self.post_norm = (
-            tf.keras.layers.LayerNormalization(name="ln", gamma_regularizer=kernel_regularizer, beta_regularizer=kernel_regularizer, dtype=self.dtype)
+            keras.layers.LayerNormalization(name="ln", gamma_regularizer=kernel_regularizer, beta_regularizer=kernel_regularizer, dtype=self.dtype)
             if block_norm_position == "post"
             else Identity(name="postiden" if block_norm_position == "none" else "iden", dtype=self.dtype)
         )
@@ -444,6 +517,11 @@ def compute_output_shape(self, input_shape):
         output_shape, caching_shape, *_ = input_shape
         return output_shape, caching_shape
 
+    def get_config(self):
+        """Return the parent layer's config overlaid with the values stored in ``self._config``."""
+        merged = super().get_config()
+        return {**merged, **self._config}
+
 
 class ConformerEncoder(Layer):
     def __init__(
@@ -478,6 +556,33 @@ def __init__(
     ):
         super().__init__(name=name, **kwargs)
         assert mha_type in ("relmha", "mha")
+        self._config = {
+            "subsampling": subsampling,
+            "dmodel": dmodel,
+            "num_blocks": num_blocks,
+            "mha_type": mha_type,
+            "head_size": head_size,
+            "num_heads": num_heads,
+            "kernel_size": kernel_size,
+            "padding": padding,
+            "interleave_relpe": interleave_relpe,
+            "use_attention_causal_mask": use_attention_causal_mask,
+            "use_attention_auto_mask": use_attention_auto_mask,
+            "ffm_scale_factor": ffm_scale_factor,
+            "ffm_residual_factor": ffm_residual_factor,
+            "mhsam_residual_factor": mhsam_residual_factor,
+            "mhsam_use_attention_bias": mhsam_use_attention_bias,
+            "mhsam_causal": mhsam_causal,
+            "convm_scale_factor": convm_scale_factor,
+            "convm_residual_factor": convm_residual_factor,
+            "convm_use_group_conv": convm_use_group_conv,
+            "dropout": dropout,
+            "module_norm_position": module_norm_position,
+            "block_norm_position": block_norm_position,
+            "memory_length": memory_length,
+            "kernel_regularizer": kernel_regularizer,
+            "bias_regularizer": bias_regularizer,
+        }
         self._dmodel = dmodel
         self._kernel_regularizer = kernel_regularizer
         self._bias_regularizer = bias_regularizer
@@ -502,10 +607,10 @@ def __init__(
         )
         self.time_reduction_factor = self.conv_subsampling.time_reduction_factor
 
-        self.linear = tf.keras.layers.Dense(
+        self.linear = keras.layers.Dense(
             dmodel, name="linear", kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer, dtype=self.dtype
         )
-        self.do = tf.keras.layers.Dropout(dropout, name="dropout", dtype=self.dtype)
+        self.do = keras.layers.Dropout(dropout, name="dropout", dtype=self.dtype)
 
         self._mha_type = mha_type
         self._num_heads = num_heads
@@ -638,3 +743,8 @@ def compute_output_shape(self, input_shape):
         for cblock in self.conformer_blocks:
             output_shape, caching_shape = cblock.compute_output_shape((output_shape, caching_shape, relative_position_encoding_shape, None, None))
         return output_shape, output_length_shape, caching_shape
+
+    def get_config(self):
+        """Return the parent layer's config overlaid with the values stored in ``self._config``."""
+        merged = super().get_config()
+        return {**merged, **self._config}
diff --git a/tensorflow_asr/models/encoders/contextnet.py b/tensorflow_asr/models/encoders/contextnet.py
index 5804b17255..660d2e0e39 100644
--- a/tensorflow_asr/models/encoders/contextnet.py
+++ b/tensorflow_asr/models/encoders/contextnet.py
@@ -16,11 +16,12 @@
 from typing import List
 
 import tensorflow as tf
+import keras
 
 from tensorflow_asr.models.base_layer import Layer, Reshape
 from tensorflow_asr.utils import math_util
 
-L2 = tf.keras.regularizers.l2(1e-6)
+L2 = keras.regularizers.l2(1e-6)
 
 
 def get_activation(
@@ -32,7 +33,7 @@ def get_activation(
     if activation == "relu":
         return tf.nn.relu
     if activation == "linear":
-        return tf.keras.activations.linear
+        return keras.activations.linear
     raise ValueError("activation must be either 'silu', 'swish', 'relu' or 'linear'")
 
 
@@ -50,7 +51,7 @@ def __init__(
     ):
         super().__init__(**kwargs)
         self.strides = strides
-        self.conv = tf.keras.layers.SeparableConv1D(
+        self.conv = keras.layers.SeparableConv1D(
             filters=filters,
             kernel_size=kernel_size,
             strides=strides,
@@ -61,7 +62,7 @@ def __init__(
             name="conv",
             dtype=self.dtype,
         )
-        self.bn = tf.keras.layers.BatchNormalization(
+        self.bn = keras.layers.BatchNormalization(
             name="bn", gamma_regularizer=kernel_regularizer, beta_regularizer=bias_regularizer, dtype=self.dtype
         )
         self.activation = get_activation(activation)
@@ -116,16 +117,16 @@ def __init__(
             name="conv_module",
             dtype=self.dtype,
         )
-        self.global_avg_pool = tf.keras.layers.GlobalAveragePooling1D(keepdims=True, name="global_avg_pool", dtype=self.dtype)
+        self.global_avg_pool = keras.layers.GlobalAveragePooling1D(keepdims=True, name="global_avg_pool", dtype=self.dtype)
         self.activation = get_activation(activation)
-        self.fc1 = tf.keras.layers.Dense(
+        self.fc1 = keras.layers.Dense(
             filters // 8,
             kernel_regularizer=kernel_regularizer,
             bias_regularizer=bias_regularizer,
             name="fc1",
             dtype=self.dtype,
         )
-        self.fc2 = tf.keras.layers.Dense(
+        self.fc2 = keras.layers.Dense(
             filters,
             kernel_regularizer=kernel_regularizer,
             bias_regularizer=bias_regularizer,
diff --git a/tensorflow_asr/models/encoders/deepspeech2.py b/tensorflow_asr/models/encoders/deepspeech2.py
index f3f81f14c7..b67e261236 100644
--- a/tensorflow_asr/models/encoders/deepspeech2.py
+++ b/tensorflow_asr/models/encoders/deepspeech2.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import tensorflow as tf
+import keras
 
 from tensorflow_asr.models.base_layer import Identity, Layer, Reshape
 from tensorflow_asr.models.layers.convolution import DepthwiseConv1D
@@ -44,13 +45,13 @@ def __init__(
             name="conv",
             dtype=self.dtype,
         )
-        self.bn = tf.keras.layers.BatchNormalization(
+        self.bn = keras.layers.BatchNormalization(
             name="bn",
             gamma_regularizer=regularizer,
             beta_regularizer=regularizer,
             dtype=self.dtype,
         )
-        self.activation = tf.keras.activations.get(activation)
+        self.activation = keras.activations.get(activation)
 
     def call(self, inputs, training=False):
         outputs = self.conv(inputs, training=training)
@@ -92,11 +93,11 @@ def __init__(
             bias_initializer=initializer,
             dtype=self.dtype,
         )
-        self.bn = tf.keras.layers.BatchNormalization(
+        self.bn = keras.layers.BatchNormalization(
             name="bn", gamma_regularizer=kernel_regularizer, beta_regularizer=bias_regularizer, dtype=self.dtype
         )
-        self.act = tf.keras.layers.Activation(activation=activation, dtype=self.dtype)
-        self.do = tf.keras.layers.Dropout(dropout, name="dropout", dtype=self.dtype)
+        self.act = keras.layers.Activation(activation=activation, dtype=self.dtype)
+        self.do = keras.layers.Dropout(dropout, name="dropout", dtype=self.dtype)
         self.time_reduction_factor = self.conv.strides[0]
 
     def call(self, inputs, training=False):
@@ -236,8 +237,8 @@ def __init__(
         )
         self._bidirectional = bidirectional
         if bidirectional:
-            self.rnn = tf.keras.layers.Bidirectional(self.rnn, name=f"b{rnn_type}", dtype=self.dtype)
-        self.bn = tf.keras.layers.BatchNormalization(
+            self.rnn = keras.layers.Bidirectional(self.rnn, name=f"b{rnn_type}", dtype=self.dtype)
+        self.bn = keras.layers.BatchNormalization(
             name="bn", gamma_regularizer=kernel_regularizer, beta_regularizer=bias_regularizer, dtype=self.dtype
         )
         self.rowconv = None
@@ -371,7 +372,7 @@ def __init__(
         **kwargs,
     ):
         super().__init__(**kwargs)
-        self.fc = tf.keras.layers.Dense(
+        self.fc = keras.layers.Dense(
             units,
             kernel_regularizer=kernel_regularizer,
             kernel_initializer=initializer,
@@ -380,11 +381,11 @@ def __init__(
             name="fc",
             dtype=self.dtype,
         )
-        self.bn = tf.keras.layers.BatchNormalization(
+        self.bn = keras.layers.BatchNormalization(
             name="bn", gamma_regularizer=kernel_regularizer, beta_regularizer=bias_regularizer, dtype=self.dtype
         )
-        self.act = tf.keras.layers.Activation(activation=activation, dtype=self.dtype)
-        self.do = tf.keras.layers.Dropout(dropout, name="dropout", dtype=self.dtype)
+        self.act = keras.layers.Activation(activation=activation, dtype=self.dtype)
+        self.do = keras.layers.Dropout(dropout, name="dropout", dtype=self.dtype)
 
     def call(self, inputs, training=False):
         outputs, outputs_length = inputs
diff --git a/tensorflow_asr/models/encoders/jasper.py b/tensorflow_asr/models/encoders/jasper.py
index 723c02525e..7597af24ba 100644
--- a/tensorflow_asr/models/encoders/jasper.py
+++ b/tensorflow_asr/models/encoders/jasper.py
@@ -13,13 +13,14 @@
 # limitations under the License.
 
 import tensorflow as tf
+import keras
 
 from tensorflow_asr.models.base_layer import Layer, Reshape
 from tensorflow_asr.models.layers.convolution import Conv1D
 from tensorflow_asr.utils import math_util
 
 
-class JasperSubBlock(tf.keras.layers.Layer):
+class JasperSubBlock(keras.layers.Layer):
     def __init__(
         self,
         channels: int = 256,
@@ -44,11 +45,11 @@ def __init__(
             name="conv1d",
             dtype=self.dtype,
         )
-        self.bn = tf.keras.layers.BatchNormalization(
+        self.bn = keras.layers.BatchNormalization(
             name="bn", gamma_regularizer=kernel_regularizer, beta_regularizer=bias_regularizer, dtype=self.dtype
         )
-        self.relu = tf.keras.layers.ReLU(name="relu", dtype=self.dtype)
-        self.do = tf.keras.layers.Dropout(dropout, name="dropout", dtype=self.dtype)
+        self.relu = keras.layers.ReLU(name="relu", dtype=self.dtype)
+        self.do = keras.layers.Dropout(dropout, name="dropout", dtype=self.dtype)
         self.reduction_factor = strides
 
     def call(self, inputs, training=False):
@@ -60,7 +61,7 @@ def call(self, inputs, training=False):
         return outputs
 
 
-class JasperResidual(tf.keras.layers.Layer):
+class JasperResidual(keras.layers.Layer):
     def __init__(
         self,
         channels: int = 256,
@@ -80,7 +81,7 @@ def __init__(
             name="pointwise_conv1d",
             dtype=self.dtype,
         )
-        self.bn = tf.keras.layers.BatchNormalization(
+        self.bn = keras.layers.BatchNormalization(
             name="bn", gamma_regularizer=kernel_regularizer, beta_regularizer=bias_regularizer, dtype=self.dtype
         )
 
@@ -128,7 +129,7 @@ def __init__(
             for i in range(nresiduals)
         ]
 
-        self.add = tf.keras.layers.Add(name="add")
+        self.add = keras.layers.Add(name="add")
 
     def call(self, inputs, training=False):
         outputs, residuals = inputs
@@ -142,7 +143,7 @@ def call(self, inputs, training=False):
         return outputs
 
 
-class JasperBlock(tf.keras.layers.Layer):
+class JasperBlock(keras.layers.Layer):
     def __init__(
         self,
         nsubblocks: int = 3,
diff --git a/tensorflow_asr/models/encoders/rnnt.py b/tensorflow_asr/models/encoders/rnnt.py
index bdea97ba7e..7dcd24880b 100644
--- a/tensorflow_asr/models/encoders/rnnt.py
+++ b/tensorflow_asr/models/encoders/rnnt.py
@@ -14,6 +14,7 @@
 """ http://arxiv.org/abs/1811.06621 """
 
 import tensorflow as tf
+import keras
 
 from tensorflow_asr.models.base_layer import Layer, Reshape
 from tensorflow_asr.models.layers.subsampling import TimeReduction
@@ -23,6 +24,7 @@
 class RnnTransducerBlock(Layer):
     def __init__(
         self,
+        reduction_position: str = "pre",
         reduction_factor: int = 0,
         dmodel: int = 640,
         rnn_type: str = "lstm",
@@ -34,6 +36,8 @@ def __init__(
         **kwargs,
     ):
         super().__init__(**kwargs)
+        assert reduction_position in ["post", "pre"], "reduction_position must be 'post' or 'pre'"
+        self._reduction_position = reduction_position
         self.reduction = TimeReduction(reduction_factor, name="reduction", dtype=self.dtype) if reduction_factor > 0 else None
         self.rnn = layer_util.get_rnn(rnn_type)(
             units=rnn_units,
@@ -47,11 +51,11 @@ def __init__(
             dtype=self.dtype,
         )
         self.ln = (
-            tf.keras.layers.LayerNormalization(name="ln", gamma_regularizer=kernel_regularizer, beta_regularizer=bias_regularizer, dtype=self.dtype)
+            keras.layers.LayerNormalization(name="ln", gamma_regularizer=kernel_regularizer, beta_regularizer=bias_regularizer, dtype=self.dtype)
             if layer_norm
             else None
         )
-        self.projection = tf.keras.layers.Dense(
+        self.projection = keras.layers.Dense(
             dmodel,
             name="projection",
             kernel_regularizer=kernel_regularizer,
@@ -61,12 +65,16 @@ def __init__(
 
     def call(self, inputs, training=False):
         outputs, outputs_length = inputs
-        if self.reduction is not None:
-            outputs, outputs_length = self.reduction((outputs, outputs_length))
+        if self._reduction_position == "pre":
+            if self.reduction is not None:
+                outputs, outputs_length = self.reduction((outputs, outputs_length))
         outputs, *_ = self.rnn(outputs, training=training)
         if self.ln is not None:
             outputs = self.ln(outputs, training=training)
         outputs = self.projection(outputs, training=training)
+        if self._reduction_position == "post":
+            if self.reduction is not None:
+                outputs, outputs_length = self.reduction((outputs, outputs_length))
         return outputs, outputs_length
 
     def compute_mask(self, inputs, mask=None):
@@ -89,6 +97,9 @@ def call_next(self, inputs, inputs_length, previous_encoder_states):
         """
         with tf.name_scope(f"{self.name}_call_next"):
             outputs, outputs_length = inputs, inputs_length
+            if self._reduction_position == "pre":
+                if self.reduction is not None:
+                    outputs, outputs_length = self.reduction([outputs, outputs_length])
             outputs, *_states = self.rnn(
                 outputs,
                 training=False,
@@ -98,9 +109,10 @@ def call_next(self, inputs, inputs_length, previous_encoder_states):
             new_states = tf.stack(_states, axis=0)
             if self.ln is not None:
                 outputs = self.ln(outputs, training=False)
-            if self.reduction is not None:
-                outputs, outputs_length = self.reduction([outputs, outputs_length])
             outputs = self.projection(outputs, training=False)
+            if self._reduction_position == "post":
+                if self.reduction is not None:
+                    outputs, outputs_length = self.reduction([outputs, outputs_length])
             return outputs, outputs_length, new_states
 
     def compute_output_shape(self, input_shape):
@@ -114,6 +126,7 @@ def compute_output_shape(self, input_shape):
 class RnnTransducerEncoder(Layer):
     def __init__(
         self,
+        reduction_positions: list = ["pre", "pre", "pre", "pre", "pre", "pre", "pre", "pre"],
         reduction_factors: list = [6, 0, 0, 0, 0, 0, 0, 0],
         dmodel: int = 640,
         nlayers: int = 8,
@@ -126,6 +139,7 @@ def __init__(
         **kwargs,
     ):
         super().__init__(**kwargs)
+        assert len(reduction_positions) == nlayers, "reduction_positions length must be equal to nlayers"
         assert len(reduction_factors) == nlayers, "reduction_factors length must be equal to nlayers"
         self.reshape = Reshape(name="reshape", dtype=self.dtype)
 
@@ -133,6 +147,7 @@ def __init__(
         self.blocks = []
         for i in range(nlayers):
             block = RnnTransducerBlock(
+                reduction_position=reduction_positions[i],
                 reduction_factor=reduction_factors[i],
                 dmodel=dmodel,
                 rnn_type=rnn_type,
@@ -151,12 +166,13 @@ def get_initial_state(self, batch_size=1):
         """Get zeros states
 
         Returns:
-            tf.Tensor: states having shape [num_rnns, 1 or 2, 1, P]
+            tf.Tensor: zero-initialized states having shape
+                [B, num_rnns, nstates, state_size]
         """
         states = []
         for block in self.blocks:
             states.append(tf.stack(block.rnn.get_initial_state(tf.zeros([batch_size, 1, 1], dtype=self.dtype)), axis=0))
-        return tf.stack(states, axis=0)
+        return tf.transpose(tf.stack(states, axis=0), perm=[2, 0, 1, 3])
 
     def call(self, inputs, training=False):
         outputs, outputs_length, caching = inputs
diff --git a/tensorflow_asr/models/encoders/transformer.py b/tensorflow_asr/models/encoders/transformer.py
index 25d961ee2a..e4e2b0c38e 100644
--- a/tensorflow_asr/models/encoders/transformer.py
+++ b/tensorflow_asr/models/encoders/transformer.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 import tensorflow as tf
+import keras
 
 from tensorflow_asr.models.base_layer import Layer
 from tensorflow_asr.models.layers.multihead_attention import MultiHeadAttention, MultiHeadRelativeAttention
@@ -33,7 +34,7 @@ def __init__(
         **kwargs,
     ):
         super().__init__(**kwargs)
-        self.ffn1 = tf.keras.layers.Dense(
+        self.ffn1 = keras.layers.Dense(
             units=dff,
             activation=activation,
             kernel_regularizer=kernel_regularizer,
@@ -41,7 +42,7 @@ def __init__(
             name="ffn_1",
             dtype=self.dtype,
         )
-        self.ffn2 = tf.keras.layers.Dense(
+        self.ffn2 = keras.layers.Dense(
             units=dmodel,
             kernel_regularizer=kernel_regularizer,
             bias_regularizer=bias_regularizer,
@@ -82,7 +83,7 @@ def __init__(
         self.norm1 = (
             None
             if self._norm_position == "none"
-            else tf.keras.layers.LayerNormalization(
+            else keras.layers.LayerNormalization(
                 beta_regularizer=kernel_regularizer, gamma_regularizer=bias_regularizer, name="ln_1", dtype=self.dtype
             )
         )
@@ -111,12 +112,12 @@ def __init__(
                 dtype=self.dtype,
             )
         )
-        self.do1 = tf.keras.layers.Dropout(dropout, name="do_1", dtype=self.dtype)
+        self.do1 = keras.layers.Dropout(dropout, name="do_1", dtype=self.dtype)
         self.residual1 = Residual(factor=residual_factor, regularizer=bias_regularizer, name="residual_1", dtype=self.dtype)
         self.norm2 = (
             None
             if self._norm_position == "none"
-            else tf.keras.layers.LayerNormalization(
+            else keras.layers.LayerNormalization(
                 beta_regularizer=kernel_regularizer, gamma_regularizer=bias_regularizer, name="ln_2", dtype=self.dtype
             )
         )
@@ -129,7 +130,7 @@ def __init__(
             name="pwffn",
             dtype=self.dtype,
         )
-        self.do2 = tf.keras.layers.Dropout(dropout, name="do_2", dtype=self.dtype)
+        self.do2 = keras.layers.Dropout(dropout, name="do_2", dtype=self.dtype)
         self.residual2 = Residual(factor=residual_factor, regularizer=bias_regularizer, name="residual_2", dtype=self.dtype)
 
     def call(
@@ -213,14 +214,14 @@ def __init__(
             dtype=self.dtype,
         )
         self.time_reduction_factor = self.subsampling.time_reduction_factor
-        self.linear = tf.keras.layers.Dense(
+        self.linear = keras.layers.Dense(
             units=dmodel,
             kernel_regularizer=kernel_regularizer,
             bias_regularizer=bias_regularizer,
             name="linear",
             dtype=self.dtype,
         )
-        self.do = tf.keras.layers.Dropout(dropout, name="dropout", dtype=self.dtype)
+        self.do = keras.layers.Dropout(dropout, name="dropout", dtype=self.dtype)
 
         if mha_type == "relmha":
             self.relpe = RelativeSinusoidalPositionalEncoding(
diff --git a/tensorflow_asr/models/layers/embedding.py b/tensorflow_asr/models/layers/embedding.py
index cba7362aa2..2bc27c7e0f 100644
--- a/tensorflow_asr/models/layers/embedding.py
+++ b/tensorflow_asr/models/layers/embedding.py
@@ -13,11 +13,12 @@
 # limitations under the License.
 
 import tensorflow as tf
+import keras
 
 from tensorflow_asr.models.base_layer import Layer
 
 
-class Embedding(tf.keras.layers.Embedding):
+class Embedding(keras.layers.Embedding):
     def __init__(
         self,
         vocab_size,
diff --git a/tensorflow_asr/models/layers/feature_extraction.py b/tensorflow_asr/models/layers/feature_extraction.py
index f651bfe78e..74b4173acf 100644
--- a/tensorflow_asr/models/layers/feature_extraction.py
+++ b/tensorflow_asr/models/layers/feature_extraction.py
@@ -131,7 +131,8 @@ def __init__(
         self.padding = padding
         self.nfft = self.frame_length if nfft is None else nfft
 
-        self.augmentations = Augmentation(augmentation_config)
+        self._augmentation_config = augmentation_config
+        self.augmentations = Augmentation(self._augmentation_config)
 
     # ---------------------------------- signals --------------------------------- #
 
@@ -293,3 +294,29 @@ def compute_output_shape(self, input_shape):
         else:
             output_shape = [B, self.get_nframes(nsamples + self.padding), self.num_feature_bins, 1]
         return tf.TensorShape(output_shape), tf.TensorShape(signal_length_shape)
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "sample_rate": self.sample_rate,
+                "frame_ms": self.frame_ms,
+                "stride_ms": self.stride_ms,
+                "num_feature_bins": self.num_feature_bins,
+                "feature_type": self.feature_type,
+                "preemphasis": self.preemphasis,
+                "pad_end": self.pad_end,
+                "use_librosa_like_stft": self.use_librosa_like_stft,
+                "output_floor": self.output_floor,
+                "lower_edge_hertz": self.lower_edge_hertz,
+                "upper_edge_hertz": self.upper_edge_hertz,
+                "log_base": self.log_base,
+                "nfft": self.nfft,
+                "normalize_signal": self._normalize_signal,
+                "normalize_zscore": self._normalize_zscore,
+                "normalize_min_max": self._normalize_min_max,
+                "padding": self.padding,
+                "augmentation_config": self._augmentation_config,  # raw constructor argument saved in __init__
+            }
+        )
+        return config
diff --git a/tensorflow_asr/models/layers/memory.py b/tensorflow_asr/models/layers/memory.py
index f473213b73..dc624fbaac 100644
--- a/tensorflow_asr/models/layers/memory.py
+++ b/tensorflow_asr/models/layers/memory.py
@@ -121,7 +121,7 @@ def call(self, inputs, memories=None):
     #         memory = tf.zeros(shape=(self.batch_size, self.memory_length, self.dmodel), dtype=self.dtype)
     #     if memory_mask is None:
     #         memory_mask = tf.zeros(shape=(self.batch_size, self.memory_length), dtype=tf.bool)
-    #     self.add_update([tf.keras.backend.update(self.memory, memory), tf.keras.backend.update(self.memory_mask, memory_mask)])
+    #     self.add_update([keras.backend.update(self.memory, memory), keras.backend.update(self.memory_mask, memory_mask)])
 
     # def call(self, inputs):
     #     inputs, inputs_mask = self._get_inputs(inputs)
@@ -147,7 +147,7 @@ def call(self, inputs, memories=None):
     #         begin=[0, tf.shape(new_memory_mask)[1] - self.memory_length],
     #         size=[-1, self.memory_length],
     #     )
-    #     self.add_update([tf.keras.backend.update(self.memory, new_memory), tf.keras.backend.update(self.memory_mask, new_memory_mask)])
+    #     self.add_update([keras.backend.update(self.memory, new_memory), keras.backend.update(self.memory_mask, new_memory_mask)])
     #     new_memory._keras_mask = new_memory_mask  # pylint: disable=protected-access
     #     return new_memory
 
diff --git a/tensorflow_asr/models/layers/multihead_attention.py b/tensorflow_asr/models/layers/multihead_attention.py
index 70e3d54388..7720a7ae65 100644
--- a/tensorflow_asr/models/layers/multihead_attention.py
+++ b/tensorflow_asr/models/layers/multihead_attention.py
@@ -17,6 +17,7 @@
 import math
 
 import tensorflow as tf
+import keras
 from keras.layers import EinsumDense
 from keras.layers import MultiHeadAttention as KerasMultiHeadAttention
 
@@ -244,8 +245,8 @@ def _build_attention(self, rank):
             attn_scores_rank,
         ) = mha_module._build_attention_equation(rank, attn_axes=self._attention_axes)
         norm_axes = tuple(range(attn_scores_rank - len(self._attention_axes), attn_scores_rank))
-        self._softmax = tf.keras.layers.Softmax(axis=norm_axes, dtype=self.dtype)  # stable training
-        self._dropout_layer = tf.keras.layers.Dropout(rate=self._dropout, dtype=self.dtype)
+        self._softmax = keras.layers.Softmax(axis=norm_axes, dtype=self.dtype)  # stable training
+        self._dropout_layer = keras.layers.Dropout(rate=self._dropout, dtype=self.dtype)
 
     def _masked_softmax(self, attention_scores, attention_mask=None):
         # Normalize the attention scores to probabilities.
diff --git a/tensorflow_asr/models/layers/positional_encoding.py b/tensorflow_asr/models/layers/positional_encoding.py
index 53f0b05543..db97c19481 100755
--- a/tensorflow_asr/models/layers/positional_encoding.py
+++ b/tensorflow_asr/models/layers/positional_encoding.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 import tensorflow as tf
+import keras
 
 from tensorflow_asr.models.base_layer import Layer
 from tensorflow_asr.utils import shape_util
@@ -61,7 +62,7 @@ def __init__(
         **kwargs,
     ):
         super().__init__(trainable=False, **kwargs)
-        self.do = tf.keras.layers.Dropout(dropout, dtype=self.dtype, name="dropout")
+        self.do = keras.layers.Dropout(dropout, dtype=self.dtype, name="dropout")
         self._scale = scale
         self._interleave = interleave
 
diff --git a/tensorflow_asr/models/layers/residual.py b/tensorflow_asr/models/layers/residual.py
index 63da2e7eda..5de0b4fc18 100644
--- a/tensorflow_asr/models/layers/residual.py
+++ b/tensorflow_asr/models/layers/residual.py
@@ -15,6 +15,7 @@
 from typing import Optional
 
 import tensorflow as tf
+import keras
 
 from tensorflow_asr.models.base_layer import Layer
 
@@ -30,8 +31,8 @@ class Residual(Layer):
     def __init__(
         self,
         factor="rezero",
-        initializer: tf.keras.initializers.Initializer = "zeros",
-        regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+        initializer: keras.initializers.Initializer = "zeros",
+        regularizer: Optional[keras.regularizers.Regularizer] = None,
         name="residual",
         **kwargs,
     ):
diff --git a/tensorflow_asr/models/layers/sequence_wise_bn.py b/tensorflow_asr/models/layers/sequence_wise_bn.py
index 96c6469324..e03f8e382a 100644
--- a/tensorflow_asr/models/layers/sequence_wise_bn.py
+++ b/tensorflow_asr/models/layers/sequence_wise_bn.py
@@ -12,16 +12,18 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import tensorflow as tf
+import keras
 
 
 # https://arxiv.org/abs/1510.01378
-class SequenceBatchNorm(tf.keras.layers.Layer):
+class SequenceBatchNorm(keras.layers.Layer):
     def __init__(self, name, time_major=False, gamma_regularizer=None, beta_regularizer=None, **kwargs):
         super().__init__(name=name, **kwargs)
         self.time_major = time_major
-        self.gamma_regularizer = tf.keras.regularizers.get(gamma_regularizer)
-        self.beta_regularizer = tf.keras.regularizers.get(beta_regularizer)
+        self.gamma_regularizer = keras.regularizers.get(gamma_regularizer)
+        self.beta_regularizer = keras.regularizers.get(beta_regularizer)
 
     def build(
         self,
@@ -53,12 +55,12 @@ def call(
     ):
         mean, variance = tf.nn.moments(inputs, axes=[0, 1], keepdims=False)
         if self.time_major:
-            total_padded_frames = tf.cast(tf.shape(inputs)[0], tf.keras.backend.dtype(mean))
-            batch_size = tf.cast(tf.shape(inputs)[1], tf.keras.backend.dtype(mean))
+            total_padded_frames = tf.cast(tf.shape(inputs)[0], keras.backend.dtype(mean))
+            batch_size = tf.cast(tf.shape(inputs)[1], keras.backend.dtype(mean))
         else:
-            total_padded_frames = tf.cast(tf.shape(inputs)[1], tf.keras.backend.dtype(mean))
-            batch_size = tf.cast(tf.shape(inputs)[0], tf.keras.backend.dtype(mean))
-        total_unpadded_frames_batch = tf.math.count_nonzero(inputs, axis=[0, 1], keepdims=False, dtype=tf.keras.backend.dtype(mean))
+            total_padded_frames = tf.cast(tf.shape(inputs)[1], keras.backend.dtype(mean))
+            batch_size = tf.cast(tf.shape(inputs)[0], keras.backend.dtype(mean))
+        total_unpadded_frames_batch = tf.math.count_nonzero(inputs, axis=[0, 1], keepdims=False, dtype=keras.backend.dtype(mean))
         mean = (mean * total_padded_frames * batch_size) / total_unpadded_frames_batch
         variance = (variance * total_padded_frames * batch_size) / total_unpadded_frames_batch
         return tf.nn.batch_normalization(
@@ -67,5 +69,5 @@ def call(
             variance=variance,
             offset=self.beta,
             scale=self.gamma,
-            variance_epsilon=tf.keras.backend.epsilon(),
+            variance_epsilon=keras.backend.epsilon(),
         )
diff --git a/tensorflow_asr/models/layers/subsampling.py b/tensorflow_asr/models/layers/subsampling.py
index 80d976d039..3328e4f0d9 100644
--- a/tensorflow_asr/models/layers/subsampling.py
+++ b/tensorflow_asr/models/layers/subsampling.py
@@ -12,7 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import typing
 import tensorflow as tf
+import keras
 
 from tensorflow_asr.models.base_layer import Layer
 from tensorflow_asr.models.layers.convolution import Conv1D, Conv2D
@@ -53,10 +55,10 @@ def compute_output_shape(self, input_shape):
 class VggSubsampling(Layer):
     def __init__(
         self,
-        filters: tuple or list = (32, 64),
-        kernel_size: int or list or tuple = 3,
-        pool_size: int or list or tuple = 2,
-        strides: int or list or tuple = 2,
+        filters: typing.Union[tuple, list] = (32, 64),
+        kernel_size: typing.Union[int, list, tuple] = 3,
+        pool_size: typing.Union[int, list, tuple] = 2,
+        strides: typing.Union[int, list, tuple] = 2,
         padding: str = "same",
         activation: str = "relu",
         kernel_regularizer=None,
@@ -87,7 +89,7 @@ def __init__(
             activation=activation,
             dtype=self.dtype,
         )
-        self.maxpool1 = tf.keras.layers.MaxPool2D(pool_size=pool_size, strides=strides, padding=padding, dtype=self.dtype, name="maxpool_1")
+        self.maxpool1 = keras.layers.MaxPool2D(pool_size=pool_size, strides=strides, padding=padding, dtype=self.dtype, name="maxpool_1")
         self.conv3 = Conv2D(
             filters=filters[1],
             kernel_size=kernel_size,
@@ -110,7 +112,7 @@ def __init__(
             activation=activation,
             dtype=self.dtype,
         )
-        self.maxpool2 = tf.keras.layers.MaxPool2D(pool_size=pool_size, strides=strides, padding=padding, dtype=self.dtype, name="maxpool_2")
+        self.maxpool2 = keras.layers.MaxPool2D(pool_size=pool_size, strides=strides, padding=padding, dtype=self.dtype, name="maxpool_2")
         self.time_reduction_factor = self.maxpool1.pool_size[0] * self.maxpool2.pool_size[0]
 
     def call(self, inputs, training=False):
@@ -169,7 +171,7 @@ def __init__(
         self.convs = []
         self.time_reduction_factor = 1
         for i in range(len(filters)):
-            subblock = tf.keras.Sequential(name=f"block_{i}")
+            subblock = keras.Sequential(name=f"block_{i}")
             subblock.add(
                 Conv2D(
                     filters=filters[i],
@@ -184,7 +186,7 @@ def __init__(
             )
             if norms[i] == "batch":
                 subblock.add(
-                    tf.keras.layers.BatchNormalization(
+                    keras.layers.BatchNormalization(
                         name=f"bn_{i}",
                         gamma_regularizer=kernel_regularizer,
                         beta_regularizer=bias_regularizer,
@@ -193,14 +195,14 @@ def __init__(
                 )
             elif norms[i] == "layer":
                 subblock.add(
-                    tf.keras.layers.LayerNormalization(
+                    keras.layers.LayerNormalization(
                         name=f"ln_{i}",
                         gamma_regularizer=kernel_regularizer,
                         beta_regularizer=bias_regularizer,
                         dtype=self.dtype,
                     )
                 )
-            subblock.add(tf.keras.layers.Activation(activations[i], name=f"{activations[i]}_{i}", dtype=self.dtype))
+            subblock.add(keras.layers.Activation(activations[i], name=f"{activations[i]}_{i}", dtype=self.dtype))
             self.convs.append(subblock)
             self.time_reduction_factor *= subblock.layers[0].strides[0]
 
@@ -257,7 +259,7 @@ def __init__(
         self.convs = []
         self.time_reduction_factor = 1
         for i in range(len(filters)):
-            subblock = tf.keras.Sequential(name=f"block_{i}")
+            subblock = keras.Sequential(name=f"block_{i}")
             subblock.add(
                 Conv1D(
                     filters=filters[i],
@@ -272,7 +274,7 @@ def __init__(
             )
             if norms[i] == "batch":
                 subblock.add(
-                    tf.keras.layers.BatchNormalization(
+                    keras.layers.BatchNormalization(
                         name=f"bn_{i}",
                         gamma_regularizer=kernel_regularizer,
                         beta_regularizer=bias_regularizer,
@@ -281,14 +283,14 @@ def __init__(
                 )
             elif norms[i] == "layer":
                 subblock.add(
-                    tf.keras.layers.LayerNormalization(
+                    keras.layers.LayerNormalization(
                         name=f"ln_{i}",
                         gamma_regularizer=kernel_regularizer,
                         beta_regularizer=bias_regularizer,
                         dtype=self.dtype,
                     )
                 )
-            subblock.add(tf.keras.layers.Activation(activations[i], name=f"{activations[i]}_{i}", dtype=self.dtype))
+            subblock.add(keras.layers.Activation(activations[i], name=f"{activations[i]}_{i}", dtype=self.dtype))
             self.convs.append(subblock)
             self.time_reduction_factor *= subblock.layers[0].strides[0]
 
diff --git a/tensorflow_asr/models/transducer/base_transducer.py b/tensorflow_asr/models/transducer/base_transducer.py
index b1eb075e9e..11084ee762 100644
--- a/tensorflow_asr/models/transducer/base_transducer.py
+++ b/tensorflow_asr/models/transducer/base_transducer.py
@@ -17,6 +17,7 @@
 import collections
 
 import tensorflow as tf
+import keras
 
 from tensorflow_asr import schemas
 from tensorflow_asr.losses.rnnt_loss import RnntLoss
@@ -53,6 +54,21 @@ def __init__(
     ):
         super().__init__(name=name, **kwargs)
         assert label_encoder_mode in ("one_hot", "embedding"), "label_encode_mode must be either 'one_hot' or 'embedding'"
+        self._config = {
+            "blank": blank,
+            "vocab_size": vocab_size,
+            "label_encoder_mode": label_encoder_mode,
+            "embed_dim": embed_dim,
+            "num_rnns": num_rnns,
+            "rnn_units": rnn_units,
+            "rnn_type": rnn_type,
+            "rnn_implementation": rnn_implementation,
+            "rnn_unroll": rnn_unroll,
+            "layer_norm": layer_norm,
+            "projection_units": projection_units,
+            "kernel_regularizer": kernel_regularizer,
+            "bias_regularizer": bias_regularizer,
+        }
         self.label_encoder = (
             Embedding(vocab_size, embed_dim, regularizer=kernel_regularizer, name=label_encoder_mode, dtype=self.dtype)
             if label_encoder_mode == "embedding"
@@ -77,14 +93,14 @@ def __init__(
                 dtype=self.dtype,
             )
             ln = (
-                tf.keras.layers.LayerNormalization(
+                keras.layers.LayerNormalization(
                     name=f"ln_{i}", gamma_regularizer=kernel_regularizer, beta_regularizer=bias_regularizer, dtype=self.dtype
                 )
                 if layer_norm
                 else None
             )
             projection = (
-                tf.keras.layers.Dense(
+                keras.layers.Dense(
                     projection_units,
                     name=f"projection_{i}",
                     kernel_regularizer=kernel_regularizer,
@@ -164,6 +180,11 @@ def compute_output_shape(self, input_shape):
             )
         return tuple(output_shape), tuple(output_length_shape)
 
+    def get_config(self):
+        config = super().get_config()
+        config.update(self._config)
+        return config
+
 
 class TransducerJointMerge(Layer):
     def __init__(self, joint_mode: str = "add", name="transducer_joint_merge", **kwargs):
@@ -201,6 +222,11 @@ def compute_output_shape(self, input_shape):
         enc_shape, pred_shape = input_shape
         return enc_shape[0], enc_shape[1], pred_shape[1], enc_shape[-1]
 
+    def get_config(self):
+        config = super().get_config()
+        config.update({"joint_mode": self.joint_mode})
+        return config
+
 
 class TransducerJoint(Layer):
     def __init__(
@@ -219,12 +245,24 @@ def __init__(
     ):
         super().__init__(name=name, **kwargs)
 
+        self._config = {
+            "vocab_size": vocab_size,
+            "joint_dim": joint_dim,
+            "activation": activation,
+            "prejoint_encoder_linear": prejoint_encoder_linear,
+            "prejoint_prediction_linear": prejoint_prediction_linear,
+            "postjoint_linear": postjoint_linear,
+            "joint_mode": joint_mode,
+            "kernel_regularizer": kernel_regularizer,
+            "bias_regularizer": bias_regularizer,
+        }
+
         self.prejoint_encoder_linear = prejoint_encoder_linear
         self.prejoint_prediction_linear = prejoint_prediction_linear
         self.postjoint_linear = postjoint_linear
 
         if self.prejoint_encoder_linear:
-            self.ffn_enc = tf.keras.layers.Dense(
+            self.ffn_enc = keras.layers.Dense(
                 joint_dim,
                 name="enc",
                 kernel_regularizer=kernel_regularizer,
@@ -232,7 +270,7 @@ def __init__(
                 dtype=self.dtype,
             )
         if self.prejoint_prediction_linear:
-            self.ffn_pred = tf.keras.layers.Dense(
+            self.ffn_pred = keras.layers.Dense(
                 joint_dim,
                 use_bias=False,
                 name="pred",
@@ -243,10 +281,10 @@ def __init__(
         self.joint = TransducerJointMerge(joint_mode=joint_mode, name="merge", dtype=self.dtype)
 
         activation = activation.lower()
-        self.activation = tf.keras.layers.Activation(activation, name=activation, dtype=self.dtype)
+        self.activation = keras.layers.Activation(activation, name=activation, dtype=self.dtype)
 
         if self.postjoint_linear:
-            self.ffn = tf.keras.layers.Dense(
+            self.ffn = keras.layers.Dense(
                 joint_dim,
                 name="ffn",
                 kernel_regularizer=kernel_regularizer,
@@ -254,7 +292,7 @@ def __init__(
                 dtype=self.dtype,
             )
 
-        self.ffn_out = tf.keras.layers.Dense(
+        self.ffn_out = keras.layers.Dense(
             vocab_size,
             name="vocab",
             kernel_regularizer=kernel_regularizer,
@@ -286,6 +324,11 @@ def compute_output_shape(self, input_shape):
         encoder_time_shape, prediction_time_shape = encoder_shape[1], prediction_shape[1]
         return batch_shape, encoder_time_shape, prediction_time_shape, self.ffn_out.units
 
+    def get_config(self):
+        config = super().get_config()
+        config.update(self._config)
+        return config
+
 
 class Transducer(BaseModel):
     """Transducer Model Warper"""
@@ -295,7 +338,7 @@ def __init__(
         blank: int,
         vocab_size: int,
         speech_config: dict,
-        encoder: tf.keras.layers.Layer,
+        encoder: keras.layers.Layer,
         prediction_label_encoder_mode: str = "embedding",
         prediction_embed_dim: int = 512,
         prediction_num_rnns: int = 1,
@@ -444,9 +487,6 @@ def call_next(
             ytu = tf.nn.log_softmax(ytu)
             return ytu, new_states
 
-    def get_initial_tokens(self, batch_size=1):
-        return super().get_initial_tokens(batch_size)
-
     def get_initial_encoder_states(self, batch_size=1):
         return tf.zeros([], dtype=self.dtype)
 
diff --git a/tensorflow_asr/models/transducer/conformer.py b/tensorflow_asr/models/transducer/conformer.py
index 1b90e02099..3c87a50137 100644
--- a/tensorflow_asr/models/transducer/conformer.py
+++ b/tensorflow_asr/models/transducer/conformer.py
@@ -14,12 +14,13 @@
 
 
 import tensorflow as tf
+import keras
 
 from tensorflow_asr.models.encoders.conformer import L2, ConformerEncoder
 from tensorflow_asr.models.transducer.base_transducer import Transducer
 
 
-@tf.keras.utils.register_keras_serializable("tensorflow_asr.models.transducer")
+@keras.utils.register_keras_serializable("tensorflow_asr.models.transducer")
 class Conformer(Transducer):
     def __init__(
         self,
@@ -71,6 +72,53 @@ def __init__(
         name: str = "conformer",
         **kwargs,
     ):
+        self._config = {
+            "blank": blank,
+            "vocab_size": vocab_size,
+            "speech_config": speech_config,
+            "encoder_subsampling": encoder_subsampling,
+            "encoder_dmodel": encoder_dmodel,
+            "encoder_num_blocks": encoder_num_blocks,
+            "encoder_head_size": encoder_head_size,
+            "encoder_num_heads": encoder_num_heads,
+            "encoder_mha_type": encoder_mha_type,
+            "encoder_interleave_relpe": encoder_interleave_relpe,
+            "encoder_use_attention_causal_mask": encoder_use_attention_causal_mask,
+            "encoder_use_attention_auto_mask": encoder_use_attention_auto_mask,
+            "encoder_kernel_size": encoder_kernel_size,
+            "encoder_padding": encoder_padding,
+            "encoder_ffm_scale_factor": encoder_ffm_scale_factor,
+            "encoder_ffm_residual_factor": encoder_ffm_residual_factor,
+            "encoder_mhsam_residual_factor": encoder_mhsam_residual_factor,
+            "encoder_mhsam_use_attention_bias": encoder_mhsam_use_attention_bias,
+            "encoder_convm_scale_factor": encoder_convm_scale_factor,
+            "encoder_convm_residual_factor": encoder_convm_residual_factor,
+            "encoder_convm_use_group_conv": encoder_convm_use_group_conv,
+            "encoder_dropout": encoder_dropout,
+            "encoder_module_norm_position": encoder_module_norm_position,
+            "encoder_block_norm_position": encoder_block_norm_position,
+            "encoder_memory_length": encoder_memory_length,
+            "encoder_trainable": encoder_trainable,
+            "prediction_label_encode_mode": prediction_label_encode_mode,
+            "prediction_embed_dim": prediction_embed_dim,
+            "prediction_num_rnns": prediction_num_rnns,
+            "prediction_rnn_units": prediction_rnn_units,
+            "prediction_rnn_type": prediction_rnn_type,
+            "prediction_rnn_implementation": prediction_rnn_implementation,
+            "prediction_rnn_unroll": prediction_rnn_unroll,
+            "prediction_layer_norm": prediction_layer_norm,
+            "prediction_projection_units": prediction_projection_units,
+            "prediction_trainable": prediction_trainable,
+            "joint_dim": joint_dim,
+            "joint_activation": joint_activation,
+            "prejoint_encoder_linear": prejoint_encoder_linear,
+            "prejoint_prediction_linear": prejoint_prediction_linear,
+            "postjoint_linear": postjoint_linear,
+            "joint_mode": joint_mode,
+            "joint_trainable": joint_trainable,
+            "kernel_regularizer": kernel_regularizer,
+            "bias_regularizer": bias_regularizer,
+        }
         super().__init__(
             speech_config=speech_config,
             encoder=ConformerEncoder(
@@ -136,8 +184,13 @@ def make(self, input_shape=[None], prediction_shape=[None], batch_size=None, **k
             None
             if self.encoder._memory_length is None
             else [
-                tf.keras.Input(shape=[self.encoder._memory_length, self.encoder._dmodel], batch_size=batch_size, dtype=tf.float32)
+                keras.Input(shape=[self.encoder._memory_length, self.encoder._dmodel], batch_size=batch_size, dtype=tf.float32)
                 for _ in range(self.encoder._num_blocks)
             ]
         )
         return super().make(input_shape, prediction_shape, batch_size, caching, **kwargs)
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(self._config)
+        return config
diff --git a/tensorflow_asr/models/transducer/contextnet.py b/tensorflow_asr/models/transducer/contextnet.py
index 1dc1dad7ab..57cc9a923c 100644
--- a/tensorflow_asr/models/transducer/contextnet.py
+++ b/tensorflow_asr/models/transducer/contextnet.py
@@ -14,13 +14,13 @@
 
 from typing import List
 
-import tensorflow as tf
+import keras
 
 from tensorflow_asr.models.encoders.contextnet import L2, ContextNetEncoder
 from tensorflow_asr.models.transducer.base_transducer import Transducer
 
 
-@tf.keras.utils.register_keras_serializable("tensorflow_asr.models.transducer")
+@keras.utils.register_keras_serializable("tensorflow_asr.models.transducer")
 class ContextNet(Transducer):
     def __init__(
         self,
diff --git a/tensorflow_asr/models/transducer/rnnt.py b/tensorflow_asr/models/transducer/rnnt.py
index 248d83a537..9b147a717d 100644
--- a/tensorflow_asr/models/transducer/rnnt.py
+++ b/tensorflow_asr/models/transducer/rnnt.py
@@ -13,19 +13,20 @@
 # limitations under the License.
 """ http://arxiv.org/abs/1811.06621 """
 
-import tensorflow as tf
+import keras
 
 from tensorflow_asr.models.encoders.rnnt import RnnTransducerEncoder
 from tensorflow_asr.models.transducer.base_transducer import Transducer
 
 
-@tf.keras.utils.register_keras_serializable("tensorflow_asr.models.transducer")
+@keras.utils.register_keras_serializable("tensorflow_asr.models.transducer")
 class RnnTransducer(Transducer):
     def __init__(
         self,
         blank: int,
         vocab_size: int,
         speech_config: dict,
+        encoder_reduction_positions: list = ["pre", "pre", "pre", "pre", "pre", "pre", "pre", "pre"],
         encoder_reduction_factors: list = [6, 0, 0, 0, 0, 0, 0, 0],
         encoder_dmodel: int = 640,
         encoder_nlayers: int = 8,
@@ -59,6 +60,7 @@ def __init__(
         super().__init__(
             speech_config=speech_config,
             encoder=RnnTransducerEncoder(
+                reduction_positions=encoder_reduction_positions,
                 reduction_factors=encoder_reduction_factors,
                 dmodel=encoder_dmodel,
                 nlayers=encoder_nlayers,
diff --git a/tensorflow_asr/models/transducer/transformer.py b/tensorflow_asr/models/transducer/transformer.py
index 0f0bda8c65..dfdfb9ebd6 100644
--- a/tensorflow_asr/models/transducer/transformer.py
+++ b/tensorflow_asr/models/transducer/transformer.py
@@ -13,12 +13,13 @@
 # limitations under the License.
 
 import tensorflow as tf
+import keras
 
 from tensorflow_asr.models.encoders.transformer import TransformerEncoder
 from tensorflow_asr.models.transducer.base_transducer import Transducer
 
 
-@tf.keras.utils.register_keras_serializable("tensorflow_asr.models.transducer")
+@keras.utils.register_keras_serializable("tensorflow_asr.models.transducer")
 class Transformer(Transducer):
     def __init__(
         self,
@@ -121,7 +122,7 @@ def make(self, input_shape=[None], prediction_shape=[None], batch_size=None, **k
             None
             if self.encoder._memory_length is None
             else [
-                tf.keras.Input(shape=[self.encoder._memory_length, self.encoder._dmodel], batch_size=batch_size, dtype=tf.float32)
+                keras.Input(shape=[self.encoder._memory_length, self.encoder._dmodel], batch_size=batch_size, dtype=tf.float32)
                 for _ in range(self.encoder._num_blocks)
             ]
         )
diff --git a/tensorflow_asr/optimizers/accumulation.py b/tensorflow_asr/optimizers/accumulation.py
index b7287cf56c..ed94e2236e 100644
--- a/tensorflow_asr/optimizers/accumulation.py
+++ b/tensorflow_asr/optimizers/accumulation.py
@@ -4,6 +4,7 @@
 """
 
 import tensorflow as tf
+import keras
 
 
 class GradientAccumulator:
@@ -14,7 +15,7 @@ class GradientAccumulator:
     def __init__(
         self,
         ga_steps,
-        model: tf.keras.Model,
+        model: keras.Model,
         name="ga",
     ):
         self.name = name
diff --git a/tensorflow_asr/optimizers/regularizers.py b/tensorflow_asr/optimizers/regularizers.py
index efab9d00f6..2397dcb879 100644
--- a/tensorflow_asr/optimizers/regularizers.py
+++ b/tensorflow_asr/optimizers/regularizers.py
@@ -1,10 +1,11 @@
 from typing import List
 
 import tensorflow as tf
+import keras
 
 
-@tf.keras.utils.register_keras_serializable("tensorflow_asr.optimizers.regularizers")
-class TimeDependentGaussianGradientNoise(tf.keras.regularizers.Regularizer):
+@keras.utils.register_keras_serializable("tensorflow_asr.optimizers.regularizers")
+class TimeDependentGaussianGradientNoise(keras.regularizers.Regularizer):
     """
     Reference: https://openreview.net/pdf/ZY9xxQDMMu5Pk8ELfEz4.pdf
     """
diff --git a/tensorflow_asr/optimizers/schedules.py b/tensorflow_asr/optimizers/schedules.py
index cb90d1d40d..a33ef66bec 100755
--- a/tensorflow_asr/optimizers/schedules.py
+++ b/tensorflow_asr/optimizers/schedules.py
@@ -13,10 +13,11 @@
 # limitations under the License.
 
 import tensorflow as tf
+import keras
 
 
-@tf.keras.utils.register_keras_serializable("tensorflow_asr.optimizers.schedules")
-class TransformerSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
+@keras.utils.register_keras_serializable("tensorflow_asr.optimizers.schedules")
+class TransformerSchedule(keras.optimizers.schedules.LearningRateSchedule):
     def __init__(self, dmodel, scale=1.0, warmup_steps=4000, max_lr=None, min_lr=None):
         super().__init__()
         self.dmodel = tf.convert_to_tensor(dmodel, dtype=tf.float32)
@@ -46,8 +47,8 @@ def get_config(self):
         }
 
 
-@tf.keras.utils.register_keras_serializable("tensorflow_asr.optimizers.schedules")
-class CyclicTransformerSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
+@keras.utils.register_keras_serializable("tensorflow_asr.optimizers.schedules")
+class CyclicTransformerSchedule(keras.optimizers.schedules.LearningRateSchedule):
     """This callback implements a cyclical learning rate policy (CLR) to the square
     root decay generally used to train transformers.
     The method cycles the learning rate around the square root decay LR with an amplitude
diff --git a/tensorflow_asr/utils/env_util.py b/tensorflow_asr/utils/env_util.py
index 0c235e3526..3fcb8cedf6 100644
--- a/tensorflow_asr/utils/env_util.py
+++ b/tensorflow_asr/utils/env_util.py
@@ -18,6 +18,7 @@
 
 import numpy as np
 import tensorflow as tf
+import keras
 from packaging import version
 
 logger = tf.get_logger()
@@ -126,11 +127,11 @@ def setup_mxp(
         raise ValueError(f"mxp must be in {options}")
     if mxp == "strict":
         policy = "mixed_bfloat16" if has_devices("TPU") else "mixed_float16"
-        tf.keras.mixed_precision.set_global_policy(policy)
+        keras.mixed_precision.set_global_policy(policy)
         logger.info(f"USING mixed precision policy {policy}")
     elif mxp == "strict_auto":
         policy = "mixed_bfloat16" if has_devices("TPU") else "mixed_float16"
-        tf.keras.mixed_precision.set_global_policy(policy)
+        keras.mixed_precision.set_global_policy(policy)
         tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
         logger.info(f"USING auto mixed precision policy {policy}")
     elif mxp == "auto":
@@ -156,5 +157,5 @@ def setup_seed(
     random.seed(seed)
     np.random.seed(seed)
     tf.random.set_seed(seed)
-    tf.keras.backend.experimental.enable_tf_random_generator()
-    tf.keras.utils.set_random_seed(seed)
+    keras.backend.experimental.enable_tf_random_generator()
+    keras.utils.set_random_seed(seed)
diff --git a/tensorflow_asr/utils/layer_util.py b/tensorflow_asr/utils/layer_util.py
index 8e050c02a4..33b9bdd5a0 100644
--- a/tensorflow_asr/utils/layer_util.py
+++ b/tensorflow_asr/utils/layer_util.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import tensorflow as tf
+import keras
 
 from tensorflow_asr.models.layers import convolution
 
@@ -22,10 +23,10 @@ def get_rnn(
 ):
     assert rnn_type in ["lstm", "gru", "rnn"]
     if rnn_type == "lstm":
-        return tf.keras.layers.LSTM
+        return keras.layers.LSTM
     if rnn_type == "gru":
-        return tf.keras.layers.GRU
-    return tf.keras.layers.SimpleRNN
+        return keras.layers.GRU
+    return keras.layers.SimpleRNN
 
 
 def get_conv(