From d4711c4a2270245403bf8802e83f4b64ae142aae Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 3 May 2023 21:41:02 -0700 Subject: [PATCH 01/62] Add FastConformer Hybrid ASR models for EN, ES, IT, DE, PL, HR, UA, BY (#6549) (#6553) * Added fastconfomer hybrid asr models for en, es, it, de, pl, hr, ua, by * updated ASR docs with the fastconformer hybrid checkpoints * added the fastconformer RNNT and CTC models --------- Signed-off-by: KunalDhawan Co-authored-by: Kunal Dhawan --- docs/source/asr/data/benchmark_by.csv | 2 + docs/source/asr/data/benchmark_de.csv | 1 + docs/source/asr/data/benchmark_en.csv | 5 +- docs/source/asr/data/benchmark_es.csv | 3 +- docs/source/asr/data/benchmark_hr.csv | 1 + docs/source/asr/data/benchmark_it.csv | 2 +- docs/source/asr/data/benchmark_pl.csv | 1 + docs/source/asr/data/benchmark_ua.csv | 2 + docs/source/asr/results.rst | 19 ++++++ nemo/collections/asr/models/ctc_bpe_models.py | 7 --- .../asr/models/hybrid_rnnt_ctc_bpe_models.py | 61 ++++++++++++++++++- .../collections/asr/models/rnnt_bpe_models.py | 7 --- 12 files changed, 92 insertions(+), 19 deletions(-) create mode 100644 docs/source/asr/data/benchmark_by.csv create mode 100644 docs/source/asr/data/benchmark_ua.csv diff --git a/docs/source/asr/data/benchmark_by.csv b/docs/source/asr/data/benchmark_by.csv new file mode 100644 index 000000000000..750dfd82ff94 --- /dev/null +++ b/docs/source/asr/data/benchmark_by.csv @@ -0,0 +1,2 @@ +Model,Model Base Class,Model Card +stt_by_fastconformer_hybrid_large_pc,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_by_fastconformer_hybrid_large_pc" \ No newline at end of file diff --git a/docs/source/asr/data/benchmark_de.csv b/docs/source/asr/data/benchmark_de.csv index 99e221a6b835..6084e95c37c0 100644 --- a/docs/source/asr/data/benchmark_de.csv +++ b/docs/source/asr/data/benchmark_de.csv @@ -4,3 +4,4 @@ stt_de_citrinet_1024,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidi stt_de_contextnet_1024,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_contextnet_1024" stt_de_conformer_ctc_large,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_conformer_ctc_large" stt_de_conformer_transducer_large,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_conformer_transducer_large" +stt_de_fastconformer_hybrid_large_pc,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_fastconformer_hybrid_large_pc" diff --git a/docs/source/asr/data/benchmark_en.csv b/docs/source/asr/data/benchmark_en.csv index 0f03452d034d..5f68e9ca22ce 100644 --- a/docs/source/asr/data/benchmark_en.csv +++ b/docs/source/asr/data/benchmark_en.csv @@ -25,4 +25,7 @@ stt_en_conformer_transducer_small,EncDecRNNTBPEModel,"https://ngc.nvidia.com/cat stt_en_conformer_transducer_medium,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_medium" stt_en_conformer_transducer_large,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_large" stt_en_conformer_transducer_xlarge,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_xlarge" -stt_en_conformer_transducer_xxlarge,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_xxlarge" \ No newline at end of file 
+stt_en_conformer_transducer_xxlarge,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_xxlarge" +stt_en_fastconformer_transducer_large,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_transducer_large" +stt_en_fastconformer_ctc_large,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_ctc_large" +stt_en_fastconformer_hybrid_large_pc,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_hybrid_large_pc" \ No newline at end of file diff --git a/docs/source/asr/data/benchmark_es.csv b/docs/source/asr/data/benchmark_es.csv index 1e1ade3a739c..0fa8b0ecedf1 100644 --- a/docs/source/asr/data/benchmark_es.csv +++ b/docs/source/asr/data/benchmark_es.csv @@ -4,4 +4,5 @@ stt_es_citrinet_512,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvi stt_es_citrinet_1024_gamma_0_25,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_citrinet_1024_gamma_0_25" stt_es_conformer_ctc_large,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_conformer_ctc_large" stt_es_conformer_transducer_large,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_conformer_transducer_large" -stt_es_contextnet_1024,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_contextnet_1024" \ No newline at end of file +stt_es_contextnet_1024,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_contextnet_1024" +stt_es_fastconformer_hybrid_large_pc,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_fastconformer_hybrid_large_pc" \ No newline at end of file diff --git a/docs/source/asr/data/benchmark_hr.csv b/docs/source/asr/data/benchmark_hr.csv index ea506eed3432..35a5b5f04f39 100644 --- a/docs/source/asr/data/benchmark_hr.csv +++ b/docs/source/asr/data/benchmark_hr.csv @@ -1,3 +1,4 @@ Model,Model Base Class,Model Card stt_hr_conformer_ctc_large,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_hr_conformer_ctc_large" stt_hr_conformer_transducer_large,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_hr_conformer_transducer_large" +stt_hr_fastconformer_hybrid_large_pc,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_hr_fastconformer_hybrid_large_pc" \ No newline at end of file diff --git a/docs/source/asr/data/benchmark_it.csv b/docs/source/asr/data/benchmark_it.csv index d605b68809eb..230194966573 100644 --- a/docs/source/asr/data/benchmark_it.csv +++ b/docs/source/asr/data/benchmark_it.csv @@ -1,3 +1,3 @@ Model,Model Base Class,Model Card stt_it_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_it_quartznet15x5" - +stt_it_fastconformer_hybrid_large_pc,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_it_fastconformer_hybrid_large_pc" diff --git a/docs/source/asr/data/benchmark_pl.csv b/docs/source/asr/data/benchmark_pl.csv index bf646e107306..e3ad9bdb50b7 100644 --- a/docs/source/asr/data/benchmark_pl.csv +++ b/docs/source/asr/data/benchmark_pl.csv @@ -1,2 +1,3 @@ Model,Model Base Class,Model Card stt_pl_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_pl_quartznet15x5" +stt_pl_fastconformer_hybrid_large_pc,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_pl_fastconformer_hybrid_large_pc" \ No 
newline at end of file diff --git a/docs/source/asr/data/benchmark_ua.csv b/docs/source/asr/data/benchmark_ua.csv new file mode 100644 index 000000000000..df1b6c383d3b --- /dev/null +++ b/docs/source/asr/data/benchmark_ua.csv @@ -0,0 +1,2 @@ +Model,Model Base Class,Model Card +stt_ua_fastconformer_hybrid_large_pc,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ua_fastconformer_hybrid_large_pc" \ No newline at end of file diff --git a/docs/source/asr/results.rst b/docs/source/asr/results.rst index e6ffe3deac2a..a1c96c7e1727 100644 --- a/docs/source/asr/results.rst +++ b/docs/source/asr/results.rst @@ -268,3 +268,22 @@ Kinyarwanda :widths: 40, 10, 50 :header-rows: 1 +----------------------------- + +Belarusian +^^^^^^^^^^^ +.. csv-table:: + :file: data/benchmark_by.csv + :align: left + :widths: 40, 10, 50 + :header-rows: 1 + +----------------------------- + +Ukrainian +^^^^^^^^^^^ +.. csv-table:: + :file: data/benchmark_ua.csv + :align: left + :widths: 40, 10, 50 + :header-rows: 1 \ No newline at end of file diff --git a/nemo/collections/asr/models/ctc_bpe_models.py b/nemo/collections/asr/models/ctc_bpe_models.py index a82f218d1d69..b97bf769132c 100644 --- a/nemo/collections/asr/models/ctc_bpe_models.py +++ b/nemo/collections/asr/models/ctc_bpe_models.py @@ -604,11 +604,4 @@ def list_available_models(cls) -> List[PretrainedModelInfo]: ) results.append(model) - model = PretrainedModelInfo( - pretrained_model_name="stt_en_fastconformer_ctc_large_ls", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_ctc_large_ls", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_ctc_large_ls/versions/1.0.0/files/stt_en_fastconformer_ctc_large_ls.nemo", - ) - results.append(model) - return results diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py index 104b2eb95524..d10d3364ea29 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py @@ -14,7 +14,7 @@ import copy import os -from typing import Dict, Optional, Union +from typing import Dict, List, Optional, Union import torch from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict @@ -454,7 +454,7 @@ def change_decoding_strategy(self, decoding_cfg: DictConfig = None, decoder_type raise ValueError(f"decoder_type={decoder_type} is not supported. Supported values: [ctc,rnnt]") @classmethod - def list_available_models(cls) -> Optional[PretrainedModelInfo]: + def list_available_models(cls) -> List[PretrainedModelInfo]: """ This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. @@ -462,4 +462,61 @@ def list_available_models(cls) -> Optional[PretrainedModelInfo]: List of available pre-trained models. 
""" results = [] + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_fastconformer_hybrid_large_pc", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_hybrid_large_pc", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_hybrid_large_pc/versions/1.18.0/files/stt_en_fastconformer_hybrid_large_pc.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_de_fastconformer_hybrid_large_pc", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_fastconformer_hybrid_large_pc", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_de_fastconformer_hybrid_large_pc/versions/1.18.0/files/stt_de_fastconformer_hybrid_large_pc.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_it_fastconformer_hybrid_large_pc", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_it_fastconformer_hybrid_large_pc", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_it_fastconformer_hybrid_large_pc/versions/1.18/files/stt_it_fastconformer_hybrid_large_pc.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_es_fastconformer_hybrid_large_pc", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_fastconformer_hybrid_large_pc", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_es_fastconformer_hybrid_large_pc/versions/1.18.0/files/stt_es_fastconformer_hybrid_large_pc.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_hr_fastconformer_hybrid_large_pc", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_hr_fastconformer_hybrid_large_pc", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_hr_fastconformer_hybrid_large_pc/versions/1.18.0/files/FastConformer-Hybrid-Transducer-CTC-BPE-v256-averaged.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_ua_fastconformer_hybrid_large_pc", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ua_fastconformer_hybrid_large_pc", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_ua_fastconformer_hybrid_large_pc/versions/1.18.0/files/stt_ua_fastconformer_hybrid_large_pc.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_pl_fastconformer_hybrid_large_pc", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_pl_fastconformer_hybrid_large_pc", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_pl_fastconformer_hybrid_large_pc/versions/1.18.0/files/stt_pl_fastconformer_hybrid_large_pc.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_by_fastconformer_hybrid_large_pc", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_by_fastconformer_hybrid_large_pc", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_by_fastconformer_hybrid_large_pc/versions/1.18.0/files/stt_by_fastconformer_hybrid_large_pc.nemo", + ) + results.append(model) + return results diff --git 
a/nemo/collections/asr/models/rnnt_bpe_models.py b/nemo/collections/asr/models/rnnt_bpe_models.py index b162f2411450..5ee5824b9d27 100644 --- a/nemo/collections/asr/models/rnnt_bpe_models.py +++ b/nemo/collections/asr/models/rnnt_bpe_models.py @@ -253,13 +253,6 @@ def list_available_models(cls) -> List[PretrainedModelInfo]: ) results.append(model) - model = PretrainedModelInfo( - pretrained_model_name="stt_en_fastconformer_transducer_large_ls", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_transducer_large_ls", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_transducer_large_ls/versions/1.0.0/files/stt_en_fastconformer_transducer_large_ls.nemo", - ) - results.append(model) - return results def __init__(self, cfg: DictConfig, trainer: Trainer = None): From 46bc35729dcf9deb8d4a99be73dff9dcab34b0d9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 3 May 2023 21:41:55 -0700 Subject: [PATCH 02/62] Add scores for FastConformer models (#6557) (#6558) Signed-off-by: smajumdar Co-authored-by: Somshubra Majumdar --- .../asr/data/scores/by/fastconformer_by.csv | 2 + .../source/asr/data/scores/de/citrinet_de.csv | 4 +- .../asr/data/scores/de/conformer_de.csv | 6 +- .../asr/data/scores/de/contextnet_de.csv | 4 +- .../asr/data/scores/de/fastconformer_de.csv | 2 + .../asr/data/scores/de/quartznet15x5_de.csv | 4 +- .../source/asr/data/scores/en/citrinet_en.csv | 14 +- .../asr/data/scores/en/conformer_en.csv | 28 ++-- .../asr/data/scores/en/contextnet_en.csv | 14 +- .../asr/data/scores/en/fastconformer_en.csv | 4 + .../asr/data/scores/en/jasper10x5dr_en.csv | 4 +- .../asr/data/scores/en/quartznet15x5_en.csv | 4 +- .../asr/data/scores/en/squeezeformer_en.csv | 14 +- .../source/asr/data/scores/es/citrinet_es.csv | 6 +- .../asr/data/scores/es/conformer_es.csv | 6 +- .../asr/data/scores/es/contextnet_es.csv | 4 +- .../asr/data/scores/es/fastconformer_es.csv | 2 + .../asr/data/scores/es/quartznet15x5_es.csv | 4 +- .../asr/data/scores/hr/conformer_hr.csv | 6 +- .../asr/data/scores/hr/fastconformer_hr.csv | 2 + .../asr/data/scores/it/conformer_it.csv | 6 +- .../asr/data/scores/it/fastconformer_it.csv | 2 + .../asr/data/scores/it/quartznet15x5_it.csv | 4 +- .../asr/data/scores/pl/fastconformer_pl.csv | 2 + .../asr/data/scores/pl/quartznet15x5_pl.csv | 4 +- .../asr/data/scores/ua/fastconformer_ua.csv | 2 + .../data/scores_pc/by/fastconformer_by.csv | 2 + .../data/scores_pc/de/fastconformer_de.csv | 2 + .../data/scores_pc/en/fastconformer_en.csv | 2 + .../data/scores_pc/es/fastconformer_es.csv | 2 + .../data/scores_pc/hr/fastconformer_hr.csv | 2 + .../data/scores_pc/it/fastconformer_it.csv | 2 + .../data/scores_pc/pl/fastconformer_pl.csv | 2 + .../data/scores_pc/ua/fastconformer_ua.csv | 2 + docs/source/asr/scores.rst | 147 ++++++++++++++++++ 35 files changed, 249 insertions(+), 68 deletions(-) create mode 100644 docs/source/asr/data/scores/by/fastconformer_by.csv create mode 100644 docs/source/asr/data/scores/de/fastconformer_de.csv create mode 100644 docs/source/asr/data/scores/en/fastconformer_en.csv create mode 100644 docs/source/asr/data/scores/es/fastconformer_es.csv create mode 100644 docs/source/asr/data/scores/hr/fastconformer_hr.csv create mode 100644 docs/source/asr/data/scores/it/fastconformer_it.csv create mode 100644 docs/source/asr/data/scores/pl/fastconformer_pl.csv create mode 100644 
docs/source/asr/data/scores/ua/fastconformer_ua.csv create mode 100644 docs/source/asr/data/scores_pc/by/fastconformer_by.csv create mode 100644 docs/source/asr/data/scores_pc/de/fastconformer_de.csv create mode 100644 docs/source/asr/data/scores_pc/en/fastconformer_en.csv create mode 100644 docs/source/asr/data/scores_pc/es/fastconformer_es.csv create mode 100644 docs/source/asr/data/scores_pc/hr/fastconformer_hr.csv create mode 100644 docs/source/asr/data/scores_pc/it/fastconformer_it.csv create mode 100644 docs/source/asr/data/scores_pc/pl/fastconformer_pl.csv create mode 100644 docs/source/asr/data/scores_pc/ua/fastconformer_ua.csv diff --git a/docs/source/asr/data/scores/by/fastconformer_by.csv b/docs/source/asr/data/scores/by/fastconformer_by.csv new file mode 100644 index 000000000000..c03cc945d99d --- /dev/null +++ b/docs/source/asr/data/scores/by/fastconformer_by.csv @@ -0,0 +1,2 @@ +Model Name,Language,MCV Dev-Set v12.0 (be),MCV Test-Set v12.0 (be) +stt_by_fastconformer_hybrid_large_pc,by,2.7 %,2.7 % diff --git a/docs/source/asr/data/scores/de/citrinet_de.csv b/docs/source/asr/data/scores/de/citrinet_de.csv index 1b3e7db093a2..1768373077b9 100644 --- a/docs/source/asr/data/scores/de/citrinet_de.csv +++ b/docs/source/asr/data/scores/de/citrinet_de.csv @@ -1,2 +1,2 @@ -Model Name,Language,MCV Dev-Set (v??) (de),MCV Dev-Set v7.0 (de),MCV Test-Set v7.0 (de),MLS Dev (en),MLS Test (en),VoxPopuli Dev (de),VoxPopuli Test (de) -stt_de_citrinet_1024,de,,6.63,7.59,4.06,5.07,12.33,10.02 +Model Name,Language,MCV Dev-Set (v??) (de),MCV Dev-Set v12.0 (de),MCV Dev-Set v7.0 (de),MCV Test-Set v12.0 (de),MCV Test-Set v7.0 (de),MLS Dev (en),MLS Test (en),VoxPopuli Dev (de),VoxPopuli Test (de) +stt_de_citrinet_1024,de,,,6.63,,7.59,4.06,5.07,12.33,10.02 diff --git a/docs/source/asr/data/scores/de/conformer_de.csv b/docs/source/asr/data/scores/de/conformer_de.csv index 3d0a9e18d452..1bd1443de00e 100644 --- a/docs/source/asr/data/scores/de/conformer_de.csv +++ b/docs/source/asr/data/scores/de/conformer_de.csv @@ -1,3 +1,3 @@ -Model Name,Language,MCV Dev-Set (v??) (de),MCV Dev-Set v7.0 (de),MCV Test-Set v7.0 (de),MLS Dev (en),MLS Test (en),VoxPopuli Dev (de),VoxPopuli Test (de) -stt_de_conformer_ctc_large,de,,5.84,6.68,3.85,4.63,12.56,10.51 -stt_de_conformer_transducer_large,de,,4.75,5.36,3.46,4.19,11.21,9.14 +Model Name,Language,MCV Dev-Set (v??) (de),MCV Dev-Set v12.0 (de),MCV Dev-Set v7.0 (de),MCV Test-Set v12.0 (de),MCV Test-Set v7.0 (de),MLS Dev (en),MLS Test (en),VoxPopuli Dev (de),VoxPopuli Test (de) +stt_de_conformer_ctc_large,de,,,5.84,,6.68,3.85,4.63,12.56,10.51 +stt_de_conformer_transducer_large,de,,,4.75,,5.36,3.46,4.19,11.21,9.14 diff --git a/docs/source/asr/data/scores/de/contextnet_de.csv b/docs/source/asr/data/scores/de/contextnet_de.csv index b7d52d649e73..40be2181a77f 100644 --- a/docs/source/asr/data/scores/de/contextnet_de.csv +++ b/docs/source/asr/data/scores/de/contextnet_de.csv @@ -1,2 +1,2 @@ -Model Name,Language,MCV Dev-Set (v??) (de),MCV Dev-Set v7.0 (de),MCV Test-Set v7.0 (de),MLS Dev (en),MLS Test (en),VoxPopuli Dev (de),VoxPopuli Test (de) -stt_de_contextnet_1024,de,,4.76,5.5,3.53,4.2,11.32,9.4 +Model Name,Language,MCV Dev-Set (v??) 
(de),MCV Dev-Set v12.0 (de),MCV Dev-Set v7.0 (de),MCV Test-Set v12.0 (de),MCV Test-Set v7.0 (de),MLS Dev (en),MLS Test (en),VoxPopuli Dev (de),VoxPopuli Test (de) +stt_de_contextnet_1024,de,,,4.76,,5.5,3.53,4.2,11.32,9.4 diff --git a/docs/source/asr/data/scores/de/fastconformer_de.csv b/docs/source/asr/data/scores/de/fastconformer_de.csv new file mode 100644 index 000000000000..fe6e6491f443 --- /dev/null +++ b/docs/source/asr/data/scores/de/fastconformer_de.csv @@ -0,0 +1,2 @@ +Model Name,Language,MCV Dev-Set (v??) (de),MCV Dev-Set v12.0 (de),MCV Dev-Set v7.0 (de),MCV Test-Set v12.0 (de),MCV Test-Set v7.0 (de),MLS Dev (en),MLS Test (en),VoxPopuli Dev (de),VoxPopuli Test (de) +stt_de_fastconformer_hybrid_large_pc,de,,4.2 %,,4.9 %,,3.3 %,3.8 %,10.8 %,8.7 % diff --git a/docs/source/asr/data/scores/de/quartznet15x5_de.csv b/docs/source/asr/data/scores/de/quartznet15x5_de.csv index 17540903f41e..22da250a97f3 100644 --- a/docs/source/asr/data/scores/de/quartznet15x5_de.csv +++ b/docs/source/asr/data/scores/de/quartznet15x5_de.csv @@ -1,2 +1,2 @@ -Model Name,Language,MCV Dev-Set (v??) (de),MCV Dev-Set v7.0 (de),MCV Test-Set v7.0 (de),MLS Dev (en),MLS Test (en),VoxPopuli Dev (de),VoxPopuli Test (de) -stt_de_quartznet15x5,de,11.78,,,,,, +Model Name,Language,MCV Dev-Set (v??) (de),MCV Dev-Set v12.0 (de),MCV Dev-Set v7.0 (de),MCV Test-Set v12.0 (de),MCV Test-Set v7.0 (de),MLS Dev (en),MLS Test (en),VoxPopuli Dev (de),VoxPopuli Test (de) +stt_de_quartznet15x5,de,11.78,,,,,,,, diff --git a/docs/source/asr/data/scores/en/citrinet_en.csv b/docs/source/asr/data/scores/en/citrinet_en.csv index 42d8cff2cb9b..47f180e7298e 100644 --- a/docs/source/asr/data/scores/en/citrinet_en.csv +++ b/docs/source/asr/data/scores/en/citrinet_en.csv @@ -1,7 +1,7 @@ -Model Name,Language,Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v8.0 (en),MLS Dev (en),MLS Test (en),NSC Part1,NSC Part6,Peoples Speech Test v1,SLR 83 Test,WSJ Dev 93,WSJ Eval 92 -stt_en_citrinet_256,en,4.2 % WER,10.7 % WER,4.4 % WER,10.7 % WER,,,,,,,,, -stt_en_citrinet_512,en,3.7 % WER,8.9 % WER,3.7 % WER,8.9 % WER,,,,,,,,, -stt_en_citrinet_1024,en,3.7 % WER,8.3 % WER,3.6 % WER,7.9 % WER,,,,,,,,, -stt_en_citrinet_256_gamma_0_25,en,4.7 %,10.6 %,4.8 %,10.7 %,,,,8.3 %,,,,5.8 %,3.6 % -stt_en_citrinet_512_gamma_0_25,en,4.0 %,9.0 %,3.9 %,9.0 %,,,,6.9 %,,,,4.4 %,3.6 % -stt_en_citrinet_1024_gamma_0_25,en,3.4 %,7.7 %,3.4 %,7.6 %,,,,6.2 %,,,,4.0 %,2.5 % +Model Name,Language,EuroParl Test Set (en),Fisher Test Set (en),Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v11.0 (en),MCV Test-Set v8.0 (en),MLS Dev (en),MLS Test (en),NSC Part1,NSC Part6,Peoples Speech Test v1,SLR 83 Test,SPGI Test,VoxPopuli Test (en),WSJ Dev 93,WSJ Eval 92 +stt_en_citrinet_256,en,,,4.2 % WER,10.7 % WER,4.4 % WER,10.7 % WER,,,,,,,,,,,, +stt_en_citrinet_512,en,,,3.7 % WER,8.9 % WER,3.7 % WER,8.9 % WER,,,,,,,,,,,, +stt_en_citrinet_1024,en,,,3.7 % WER,8.3 % WER,3.6 % WER,7.9 % WER,,,,,,,,,,,, +stt_en_citrinet_256_gamma_0_25,en,,,4.7 %,10.6 %,4.8 %,10.7 %,,,,,8.3 %,,,,,,5.8 %,3.6 % +stt_en_citrinet_512_gamma_0_25,en,,,4.0 %,9.0 %,3.9 %,9.0 %,,,,,6.9 %,,,,,,4.4 %,3.6 % +stt_en_citrinet_1024_gamma_0_25,en,,,3.4 %,7.7 %,3.4 %,7.6 %,,,,,6.2 %,,,,,,4.0 %,2.5 % diff --git a/docs/source/asr/data/scores/en/conformer_en.csv b/docs/source/asr/data/scores/en/conformer_en.csv index 23ec44382578..905bdf2ebedc 100644 --- a/docs/source/asr/data/scores/en/conformer_en.csv +++ 
b/docs/source/asr/data/scores/en/conformer_en.csv @@ -1,14 +1,14 @@ -Model Name,Language,Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v8.0 (en),MLS Dev (en),MLS Test (en),NSC Part1,NSC Part6,Peoples Speech Test v1,SLR 83 Test,WSJ Dev 93,WSJ Eval 92 -stt_en_conformer_ctc_small,en,3.6,8.1,3.7,8.1,,,,,,,,, -stt_en_conformer_ctc_medium,en,2.5,5.8,2.6,5.9,,,,,,,,, -stt_en_conformer_ctc_large,en,1.9,4.4,2.1,4.5,,,,,,,,, -stt_en_conformer_ctc_xlarge,en,1.77 %,3.79 %,2.00 %,3.74 %,7.88 %,,5.99 %,,6.44 %,22.90 %,5.50 %,2.36 %, -stt_en_conformer_ctc_small_ls,en,3.3,8.8,3.4,8.8,,,,,,,,, -stt_en_conformer_ctc_medium_ls,en,2.7,7.4,3.0,7.3,,,,,,,,, -stt_en_conformer_ctc_large_ls,en,2.4,6.2,2.7,6.0,,,,,,,,, -stt_en_conformer_transducer_small,en,2.8,6.6,2.5,6.6,,,,,,,,, -stt_en_conformer_transducer_medium,en,2.0,4.6,2.1,4.7,,,,,,,,, -stt_en_conformer_transducer_large,en,1.6,3.5,1.7,3.7,,,,,,,,, -stt_en_conformer_transducer_large_ls,en,2.1,5.0,2.3,5.1,,,,,,,,, -stt_en_conformer_transducer_xlarge,en,1.48 %,2.95 %,1.62 %,3.01 %,6.46 %,4.59 %,5.32 %,5.70 %,6.47 %,21.32 %,,2.05 %,1.17 % -stt_en_conformer_transducer_xxlarge,en,1.52 %,3.09 %,1.72 %,3.14 %,,5.29 %,5.85 %,6.64 %,,,,2.42 %,1.49 % +Model Name,Language,EuroParl Test Set (en),Fisher Test Set (en),Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v11.0 (en),MCV Test-Set v8.0 (en),MLS Dev (en),MLS Test (en),NSC Part1,NSC Part6,Peoples Speech Test v1,SLR 83 Test,SPGI Test,VoxPopuli Test (en),WSJ Dev 93,WSJ Eval 92 +stt_en_conformer_ctc_small,en,,,3.6,8.1,3.7,8.1,,,,,,,,,,,, +stt_en_conformer_ctc_medium,en,,,2.5,5.8,2.6,5.9,,,,,,,,,,,, +stt_en_conformer_ctc_large,en,,,1.9,4.4,2.1,4.5,,,,,,,,,,,, +stt_en_conformer_ctc_xlarge,en,,,1.77 %,3.79 %,2.00 %,3.74 %,,7.88 %,,5.99 %,,6.44 %,22.90 %,5.50 %,,,2.36 %, +stt_en_conformer_ctc_small_ls,en,,,3.3,8.8,3.4,8.8,,,,,,,,,,,, +stt_en_conformer_ctc_medium_ls,en,,,2.7,7.4,3.0,7.3,,,,,,,,,,,, +stt_en_conformer_ctc_large_ls,en,,,2.4,6.2,2.7,6.0,,,,,,,,,,,, +stt_en_conformer_transducer_small,en,,,2.8,6.6,2.5,6.6,,,,,,,,,,,, +stt_en_conformer_transducer_medium,en,,,2.0,4.6,2.1,4.7,,,,,,,,,,,, +stt_en_conformer_transducer_large,en,,,1.6,3.5,1.7,3.7,,,,,,,,,,,, +stt_en_conformer_transducer_large_ls,en,,,2.1,5.0,2.3,5.1,,,,,,,,,,,, +stt_en_conformer_transducer_xlarge,en,,,1.48 %,2.95 %,1.62 %,3.01 %,,6.46 %,4.59 %,5.32 %,5.70 %,6.47 %,21.32 %,,,,2.05 %,1.17 % +stt_en_conformer_transducer_xxlarge,en,,,1.52 %,3.09 %,1.72 %,3.14 %,,,5.29 %,5.85 %,6.64 %,,,,,,2.42 %,1.49 % diff --git a/docs/source/asr/data/scores/en/contextnet_en.csv b/docs/source/asr/data/scores/en/contextnet_en.csv index 4a065dd299f8..6f986e28039a 100644 --- a/docs/source/asr/data/scores/en/contextnet_en.csv +++ b/docs/source/asr/data/scores/en/contextnet_en.csv @@ -1,7 +1,7 @@ -Model Name,Language,Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v8.0 (en),MLS Dev (en),MLS Test (en),NSC Part1,NSC Part6,Peoples Speech Test v1,SLR 83 Test,WSJ Dev 93,WSJ Eval 92 -stt_en_contextnet_256,en,3.3 %,7.9 %,3.3 %,8.0 %,,9.7 %,11.0 %,7.1 %,,,,4.6 %,3.2 % -stt_en_contextnet_512,en,2.0 %,4.8 %,2.2 %,5.0 %,,6.6 %,7.3 %,5.9 %,,,,2.8 %,1.4 % -stt_en_contextnet_1024,en,1.7 %,3.8 %,1.9 %,4.0 %,7.9 %,,5.9 %,5.2 %,6.5 %,21.7 %,4.7 %,2.3 %,1.3 % -stt_en_contextnet_256_mls,en,,9.0 %,,9.2 %,,9.4 %,10.9 %,,,,,, -stt_en_contextnet_512_mls,en,,5.2 %,,5.2 %,,5.6 %,6.6 %,,,,,, -stt_en_contextnet_1024_mls,en,,4.1 %,,4.2 
%,,4.6 %,5.6 %,,,,,, +Model Name,Language,EuroParl Test Set (en),Fisher Test Set (en),Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v11.0 (en),MCV Test-Set v8.0 (en),MLS Dev (en),MLS Test (en),NSC Part1,NSC Part6,Peoples Speech Test v1,SLR 83 Test,SPGI Test,VoxPopuli Test (en),WSJ Dev 93,WSJ Eval 92 +stt_en_contextnet_256,en,,,3.3 %,7.9 %,3.3 %,8.0 %,,,9.7 %,11.0 %,7.1 %,,,,,,4.6 %,3.2 % +stt_en_contextnet_512,en,,,2.0 %,4.8 %,2.2 %,5.0 %,,,6.6 %,7.3 %,5.9 %,,,,,,2.8 %,1.4 % +stt_en_contextnet_1024,en,,,1.7 %,3.8 %,1.9 %,4.0 %,,7.9 %,,5.9 %,5.2 %,6.5 %,21.7 %,4.7 %,,,2.3 %,1.3 % +stt_en_contextnet_256_mls,en,,,,9.0 %,,9.2 %,,,9.4 %,10.9 %,,,,,,,, +stt_en_contextnet_512_mls,en,,,,5.2 %,,5.2 %,,,5.6 %,6.6 %,,,,,,,, +stt_en_contextnet_1024_mls,en,,,,4.1 %,,4.2 %,,,4.6 %,5.6 %,,,,,,,, diff --git a/docs/source/asr/data/scores/en/fastconformer_en.csv b/docs/source/asr/data/scores/en/fastconformer_en.csv new file mode 100644 index 000000000000..e993273dfbf4 --- /dev/null +++ b/docs/source/asr/data/scores/en/fastconformer_en.csv @@ -0,0 +1,4 @@ +Model Name,Language,EuroParl Test Set (en),Fisher Test Set (en),Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v11.0 (en),MCV Test-Set v8.0 (en),MLS Dev (en),MLS Test (en),NSC Part1,NSC Part6,Peoples Speech Test v1,SLR 83 Test,SPGI Test,VoxPopuli Test (en),WSJ Dev 93,WSJ Eval 92 +stt_en_fastconformer_ctc_large,en,,,1.9,4.2,2.1,4.2,,,,,,,,,,,, +stt_en_fastconformer_transducer_large,en,,,2.0,3.8,1.8,3.8,,,,,,,,,,,, +stt_en_fastconformer_hybrid_large_pc,en,8.0 %,10.3 %,,,2.0 %,4.1 %,8.2 %,,,4.5 %,4.6 %,,,,2.3 %,4.5 %,, diff --git a/docs/source/asr/data/scores/en/jasper10x5dr_en.csv b/docs/source/asr/data/scores/en/jasper10x5dr_en.csv index ac9b260c5bb3..a812337ac0eb 100644 --- a/docs/source/asr/data/scores/en/jasper10x5dr_en.csv +++ b/docs/source/asr/data/scores/en/jasper10x5dr_en.csv @@ -1,2 +1,2 @@ -Model Name,Language,Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v8.0 (en),MLS Dev (en),MLS Test (en),NSC Part1,NSC Part6,Peoples Speech Test v1,SLR 83 Test,WSJ Dev 93,WSJ Eval 92 -stt_en_jasper10x5dr,en,3.74,10.21,,,,,,,,,,, +Model Name,Language,EuroParl Test Set (en),Fisher Test Set (en),Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v11.0 (en),MCV Test-Set v8.0 (en),MLS Dev (en),MLS Test (en),NSC Part1,NSC Part6,Peoples Speech Test v1,SLR 83 Test,SPGI Test,VoxPopuli Test (en),WSJ Dev 93,WSJ Eval 92 +stt_en_jasper10x5dr,en,,,3.74,10.21,,,,,,,,,,,,,, diff --git a/docs/source/asr/data/scores/en/quartznet15x5_en.csv b/docs/source/asr/data/scores/en/quartznet15x5_en.csv index 04aef4aa49dd..67b52bc9a0da 100644 --- a/docs/source/asr/data/scores/en/quartznet15x5_en.csv +++ b/docs/source/asr/data/scores/en/quartznet15x5_en.csv @@ -1,2 +1,2 @@ -Model Name,Language,Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v8.0 (en),MLS Dev (en),MLS Test (en),NSC Part1,NSC Part6,Peoples Speech Test v1,SLR 83 Test,WSJ Dev 93,WSJ Eval 92 -stt_en_quartznet15x5,en,4.38,11.3,,,,,,,,,,, +Model Name,Language,EuroParl Test Set (en),Fisher Test Set (en),Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v11.0 (en),MCV Test-Set v8.0 (en),MLS Dev (en),MLS Test (en),NSC Part1,NSC Part6,Peoples Speech Test v1,SLR 83 Test,SPGI Test,VoxPopuli Test (en),WSJ Dev 93,WSJ 
Eval 92 +stt_en_quartznet15x5,en,,,4.38,11.3,,,,,,,,,,,,,, diff --git a/docs/source/asr/data/scores/en/squeezeformer_en.csv b/docs/source/asr/data/scores/en/squeezeformer_en.csv index fdbd9bd99665..ecd18cc40b97 100644 --- a/docs/source/asr/data/scores/en/squeezeformer_en.csv +++ b/docs/source/asr/data/scores/en/squeezeformer_en.csv @@ -1,7 +1,7 @@ -Model Name,Language,Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v8.0 (en),MLS Dev (en),MLS Test (en),NSC Part1,NSC Part6,Peoples Speech Test v1,SLR 83 Test,WSJ Dev 93,WSJ Eval 92 -stt_en_squeezeformer_ctc_xsmall_ls,en,3.6 %,9.7 %,3.8 %,9.4 %,,,,,,,,, -stt_en_squeezeformer_ctc_small_ls,en,2.9 %,7.4 %,3.1 %,7.4 %,,,,,,,,, -stt_en_squeezeformer_ctc_small_medium_ls,en,2.7 %,7.0 %,2.8 %,7.1 %,,,,,,,,, -stt_en_squeezeformer_ctc_medium_ls,en,2.4 %,6.2 %,2.6 %,6.3 %,,,,,,,,, -stt_en_squeezeformer_ctc_medium_large_ls,en,2.3 %,6.0 %,2.5 %,5.9 %,,,,,,,,, -stt_en_squeezeformer_ctc_large_ls,en,2.3 %,5.7 %,2.4 %,5.7 %,,,,,,,,, +Model Name,Language,EuroParl Test Set (en),Fisher Test Set (en),Librispeech Dev-Clean,Librispeech Dev-Other,Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v11.0 (en),MCV Test-Set v8.0 (en),MLS Dev (en),MLS Test (en),NSC Part1,NSC Part6,Peoples Speech Test v1,SLR 83 Test,SPGI Test,VoxPopuli Test (en),WSJ Dev 93,WSJ Eval 92 +stt_en_squeezeformer_ctc_xsmall_ls,en,,,3.6 %,9.7 %,3.8 %,9.4 %,,,,,,,,,,,, +stt_en_squeezeformer_ctc_small_ls,en,,,2.9 %,7.4 %,3.1 %,7.4 %,,,,,,,,,,,, +stt_en_squeezeformer_ctc_small_medium_ls,en,,,2.7 %,7.0 %,2.8 %,7.1 %,,,,,,,,,,,, +stt_en_squeezeformer_ctc_medium_ls,en,,,2.4 %,6.2 %,2.6 %,6.3 %,,,,,,,,,,,, +stt_en_squeezeformer_ctc_medium_large_ls,en,,,2.3 %,6.0 %,2.5 %,5.9 %,,,,,,,,,,,, +stt_en_squeezeformer_ctc_large_ls,en,,,2.3 %,5.7 %,2.4 %,5.7 %,,,,,,,,,,,, diff --git a/docs/source/asr/data/scores/es/citrinet_es.csv b/docs/source/asr/data/scores/es/citrinet_es.csv index 9311fb2b04fd..9471293dd227 100644 --- a/docs/source/asr/data/scores/es/citrinet_es.csv +++ b/docs/source/asr/data/scores/es/citrinet_es.csv @@ -1,3 +1,3 @@ -Model Name,Language,Call Home Dev Test (es),Call Home Eval Test (es),Call Home Train (es),Fisher Dev Set (es),Fisher Test Set (es),MCV Dev-Set (v??) (es),MCV Dev-Set v7.0 (es),MCV Test-Set (v??) (es),MCV Test-Set v7.0 (es),MLS Dev (en),MLS Test (en),VoxPopuli Dev (es),VoxPopuli Test (es) -stt_es_citrinet_512,es,,,,,,9.1 % WER,,10.3 % WER,,4.9 % WER,5.2 % WER,, -stt_es_citrinet_1024_gamma_0_25,es,19.9 %,21.3 %,19.1 %,15.8 %,15.9 %,,6.1 %,,6.8 %,3.5 %,4.1 %,5.6 %,7.0 % +Model Name,Language,Call Home Dev Test (es),Call Home Eval Test (es),Call Home Train (es),Fisher Dev Set (es),Fisher Test Set (es),MCV Dev-Set (v??) (es),MCV Dev-Set v12.0 (es),MCV Dev-Set v7.0 (es),MCV Test-Set (v??) (es),MCV Test-Set v12.0 (es),MCV Test-Set v7.0 (es),MLS Dev (en),MLS Test (en),VoxPopuli Dev (es),VoxPopuli Test (es) +stt_es_citrinet_512,es,,,,,,9.1 % WER,,,10.3 % WER,,,4.9 % WER,5.2 % WER,, +stt_es_citrinet_1024_gamma_0_25,es,19.9 %,21.3 %,19.1 %,15.8 %,15.9 %,,,6.1 %,,,6.8 %,3.5 %,4.1 %,5.6 %,7.0 % diff --git a/docs/source/asr/data/scores/es/conformer_es.csv b/docs/source/asr/data/scores/es/conformer_es.csv index 10b28dc49f4e..e7e47cbdc068 100644 --- a/docs/source/asr/data/scores/es/conformer_es.csv +++ b/docs/source/asr/data/scores/es/conformer_es.csv @@ -1,3 +1,3 @@ -Model Name,Language,Call Home Dev Test (es),Call Home Eval Test (es),Call Home Train (es),Fisher Dev Set (es),Fisher Test Set (es),MCV Dev-Set (v??) 
(es),MCV Dev-Set v7.0 (es),MCV Test-Set (v??) (es),MCV Test-Set v7.0 (es),MLS Dev (en),MLS Test (en),VoxPopuli Dev (es),VoxPopuli Test (es) -stt_es_conformer_ctc_large,es,23.7 %,25.3 %,22.4 %,18.3 %,18.5 %,,6.3 %,,6.9 %,4.3 %,4.2 %,6.1 %,7.5 % -stt_es_conformer_transducer_large,es,18.0 %,19.4 %,17.2 %,14.7 %,14.8 %,,4.6 %,,5.2 %,2.7 %,3.2 %,4.7 %,6.0 % +Model Name,Language,Call Home Dev Test (es),Call Home Eval Test (es),Call Home Train (es),Fisher Dev Set (es),Fisher Test Set (es),MCV Dev-Set (v??) (es),MCV Dev-Set v12.0 (es),MCV Dev-Set v7.0 (es),MCV Test-Set (v??) (es),MCV Test-Set v12.0 (es),MCV Test-Set v7.0 (es),MLS Dev (en),MLS Test (en),VoxPopuli Dev (es),VoxPopuli Test (es) +stt_es_conformer_ctc_large,es,23.7 %,25.3 %,22.4 %,18.3 %,18.5 %,,,6.3 %,,,6.9 %,4.3 %,4.2 %,6.1 %,7.5 % +stt_es_conformer_transducer_large,es,18.0 %,19.4 %,17.2 %,14.7 %,14.8 %,,,4.6 %,,,5.2 %,2.7 %,3.2 %,4.7 %,6.0 % diff --git a/docs/source/asr/data/scores/es/contextnet_es.csv b/docs/source/asr/data/scores/es/contextnet_es.csv index ec20b5708d93..9f75e2a70bce 100644 --- a/docs/source/asr/data/scores/es/contextnet_es.csv +++ b/docs/source/asr/data/scores/es/contextnet_es.csv @@ -1,2 +1,2 @@ -Model Name,Language,Call Home Dev Test (es),Call Home Eval Test (es),Call Home Train (es),Fisher Dev Set (es),Fisher Test Set (es),MCV Dev-Set (v??) (es),MCV Dev-Set v7.0 (es),MCV Test-Set (v??) (es),MCV Test-Set v7.0 (es),MLS Dev (en),MLS Test (en),VoxPopuli Dev (es),VoxPopuli Test (es) -stt_es_contextnet_1024,es,19.1 %,20.7 %,18.2 %,15.3 %,15.1 %,,4.8 %,,5.2 %,3.1 %,3.5 %,5.1 %,6.2 % +Model Name,Language,Call Home Dev Test (es),Call Home Eval Test (es),Call Home Train (es),Fisher Dev Set (es),Fisher Test Set (es),MCV Dev-Set (v??) (es),MCV Dev-Set v12.0 (es),MCV Dev-Set v7.0 (es),MCV Test-Set (v??) (es),MCV Test-Set v12.0 (es),MCV Test-Set v7.0 (es),MLS Dev (en),MLS Test (en),VoxPopuli Dev (es),VoxPopuli Test (es) +stt_es_contextnet_1024,es,19.1 %,20.7 %,18.2 %,15.3 %,15.1 %,,,4.8 %,,,5.2 %,3.1 %,3.5 %,5.1 %,6.2 % diff --git a/docs/source/asr/data/scores/es/fastconformer_es.csv b/docs/source/asr/data/scores/es/fastconformer_es.csv new file mode 100644 index 000000000000..a6c12afe95e1 --- /dev/null +++ b/docs/source/asr/data/scores/es/fastconformer_es.csv @@ -0,0 +1,2 @@ +Model Name,Language,Call Home Dev Test (es),Call Home Eval Test (es),Call Home Train (es),Fisher Dev Set (es),Fisher Test Set (es),MCV Dev-Set (v??) (es),MCV Dev-Set v12.0 (es),MCV Dev-Set v7.0 (es),MCV Test-Set (v??) (es),MCV Test-Set v12.0 (es),MCV Test-Set v7.0 (es),MLS Dev (en),MLS Test (en),VoxPopuli Dev (es),VoxPopuli Test (es) +stt_es_fastconformer_hybrid_large_pc,es,,,,29.4 %,28.9 %,,7.1 %,,,7.5 %,,10.6 %,11.8 %,8.6 %,9.8 % diff --git a/docs/source/asr/data/scores/es/quartznet15x5_es.csv b/docs/source/asr/data/scores/es/quartznet15x5_es.csv index 79de5ce952d8..54de5e94025b 100644 --- a/docs/source/asr/data/scores/es/quartznet15x5_es.csv +++ b/docs/source/asr/data/scores/es/quartznet15x5_es.csv @@ -1,2 +1,2 @@ -Model Name,Language,Call Home Dev Test (es),Call Home Eval Test (es),Call Home Train (es),Fisher Dev Set (es),Fisher Test Set (es),MCV Dev-Set (v??) (es),MCV Dev-Set v7.0 (es),MCV Test-Set (v??) (es),MCV Test-Set v7.0 (es),MLS Dev (en),MLS Test (en),VoxPopuli Dev (es),VoxPopuli Test (es) -stt_es_quartznet15x5,es,,,,,,12.97,,,,,,, +Model Name,Language,Call Home Dev Test (es),Call Home Eval Test (es),Call Home Train (es),Fisher Dev Set (es),Fisher Test Set (es),MCV Dev-Set (v??) 
(es),MCV Dev-Set v12.0 (es),MCV Dev-Set v7.0 (es),MCV Test-Set (v??) (es),MCV Test-Set v12.0 (es),MCV Test-Set v7.0 (es),MLS Dev (en),MLS Test (en),VoxPopuli Dev (es),VoxPopuli Test (es) +stt_es_quartznet15x5,es,,,,,,12.97,,,,,,,,, diff --git a/docs/source/asr/data/scores/hr/conformer_hr.csv b/docs/source/asr/data/scores/hr/conformer_hr.csv index 04383a14e888..4cfd3f79a89f 100644 --- a/docs/source/asr/data/scores/hr/conformer_hr.csv +++ b/docs/source/asr/data/scores/hr/conformer_hr.csv @@ -1,3 +1,3 @@ -Model Name,Language,ParlaSpeech Dev-Set v1.0 (hr),ParlaSpeech Test-Set v1.0 (hr) -stt_hr_conformer_ctc_large,hr,4.43,4.70 -stt_hr_conformer_transducer_large,hr,4.56,4.69 +Model Name,Language,ParlaSpeech Dev-Set v1.0 (hr),ParlaSpeech Test-Set v1.0 (hr),Parlaspeech Dev-Set (v??) (hr),Parlaspeech Test-Set (v??) (hr) +stt_hr_conformer_ctc_large,hr,4.43,4.70,, +stt_hr_conformer_transducer_large,hr,4.56,4.69,, diff --git a/docs/source/asr/data/scores/hr/fastconformer_hr.csv b/docs/source/asr/data/scores/hr/fastconformer_hr.csv new file mode 100644 index 000000000000..ee54e981e7aa --- /dev/null +++ b/docs/source/asr/data/scores/hr/fastconformer_hr.csv @@ -0,0 +1,2 @@ +Model Name,Language,ParlaSpeech Dev-Set v1.0 (hr),ParlaSpeech Test-Set v1.0 (hr),Parlaspeech Dev-Set (v??) (hr),Parlaspeech Test-Set (v??) (hr) +stt_hr_fastconformer_hybrid_large_pc,hr,,,4.5 %,4.2 % diff --git a/docs/source/asr/data/scores/it/conformer_it.csv b/docs/source/asr/data/scores/it/conformer_it.csv index 3e3854eb862a..c86a906e982c 100644 --- a/docs/source/asr/data/scores/it/conformer_it.csv +++ b/docs/source/asr/data/scores/it/conformer_it.csv @@ -1,3 +1,3 @@ -Model Name,Language,MCV Dev-Set (v??) (it),MCV Dev-Set v11.0 (it),MCV Test-Set v11.0 (it),MLS Dev (en),MLS Test (en),VoxPopuli Dev (it),VoxPopuli Test (it) -stt_it_conformer_ctc_large,it,,5.38,5.92,13.16,10.62,13.43,16.75 -stt_it_conformer_transducer_large,it,,4.80,5.24,14.62,12.18,12.00,15.15 +Model Name,Language,MCV Dev-Set (v??) (it),MCV Dev-Set v11.0 (it),MCV Dev-Set v12.0 (it),MCV Test-Set v11.0 (it),MCV Test-Set v12.0 (it),MLS Dev (en),MLS Test (en),VoxPopuli Dev (it),VoxPopuli Test (it) +stt_it_conformer_ctc_large,it,,5.38,,5.92,,13.16,10.62,13.43,16.75 +stt_it_conformer_transducer_large,it,,4.80,,5.24,,14.62,12.18,12.00,15.15 diff --git a/docs/source/asr/data/scores/it/fastconformer_it.csv b/docs/source/asr/data/scores/it/fastconformer_it.csv new file mode 100644 index 000000000000..3a684662295e --- /dev/null +++ b/docs/source/asr/data/scores/it/fastconformer_it.csv @@ -0,0 +1,2 @@ +Model Name,Language,MCV Dev-Set (v??) (it),MCV Dev-Set v11.0 (it),MCV Dev-Set v12.0 (it),MCV Test-Set v11.0 (it),MCV Test-Set v12.0 (it),MLS Dev (en),MLS Test (en),VoxPopuli Dev (it),VoxPopuli Test (it) +stt_it_fastconformer_hybrid_large_pc,it,,,5.2 %,,5.8 %,13.6 %,11.5 %,12.7 %,15.6 % diff --git a/docs/source/asr/data/scores/it/quartznet15x5_it.csv b/docs/source/asr/data/scores/it/quartznet15x5_it.csv index 475058e38bc0..f22cfda089dc 100644 --- a/docs/source/asr/data/scores/it/quartznet15x5_it.csv +++ b/docs/source/asr/data/scores/it/quartznet15x5_it.csv @@ -1,2 +1,2 @@ -Model Name,Language,MCV Dev-Set (v??) (it),MCV Dev-Set v11.0 (it),MCV Test-Set v11.0 (it),MLS Dev (en),MLS Test (en),VoxPopuli Dev (it),VoxPopuli Test (it) -stt_it_quartznet15x5,it,15.22,,,,,, +Model Name,Language,MCV Dev-Set (v??) 
(it),MCV Dev-Set v11.0 (it),MCV Dev-Set v12.0 (it),MCV Test-Set v11.0 (it),MCV Test-Set v12.0 (it),MLS Dev (en),MLS Test (en),VoxPopuli Dev (it),VoxPopuli Test (it) +stt_it_quartznet15x5,it,15.22,,,,,,,, diff --git a/docs/source/asr/data/scores/pl/fastconformer_pl.csv b/docs/source/asr/data/scores/pl/fastconformer_pl.csv new file mode 100644 index 000000000000..8cf9a506b704 --- /dev/null +++ b/docs/source/asr/data/scores/pl/fastconformer_pl.csv @@ -0,0 +1,2 @@ +Model Name,Language,MCV Dev-Set (v??) (pl),MCV Dev-Set v12.0 (pl),MCV Test-Set v12.0 (pl),MLS Dev (en),MLS Test (en),VoxPopuli Dev (pl),VoxPopuli Test (pl) +stt_pl_fastconformer_hybrid_large_pc,pl,,6.0 %,8.7 %,7.1 %,5.8 %,11.3 %,8.5 % diff --git a/docs/source/asr/data/scores/pl/quartznet15x5_pl.csv b/docs/source/asr/data/scores/pl/quartznet15x5_pl.csv index 5692e36037ac..98c80fdd5401 100644 --- a/docs/source/asr/data/scores/pl/quartznet15x5_pl.csv +++ b/docs/source/asr/data/scores/pl/quartznet15x5_pl.csv @@ -1,2 +1,2 @@ -Model Name,Language,MCV Dev-Set (v??) (pl) -stt_pl_quartznet15x5,pl,14 +Model Name,Language,MCV Dev-Set (v??) (pl),MCV Dev-Set v12.0 (pl),MCV Test-Set v12.0 (pl),MLS Dev (en),MLS Test (en),VoxPopuli Dev (pl),VoxPopuli Test (pl) +stt_pl_quartznet15x5,pl,14,,,,,, diff --git a/docs/source/asr/data/scores/ua/fastconformer_ua.csv b/docs/source/asr/data/scores/ua/fastconformer_ua.csv new file mode 100644 index 000000000000..c325a73c5f53 --- /dev/null +++ b/docs/source/asr/data/scores/ua/fastconformer_ua.csv @@ -0,0 +1,2 @@ +Model Name,Language,MCV Test-Set v12.0 (ua) +stt_ua_fastconformer_hybrid_large_pc,ua,5.2 % diff --git a/docs/source/asr/data/scores_pc/by/fastconformer_by.csv b/docs/source/asr/data/scores_pc/by/fastconformer_by.csv new file mode 100644 index 000000000000..88f5e320f088 --- /dev/null +++ b/docs/source/asr/data/scores_pc/by/fastconformer_by.csv @@ -0,0 +1,2 @@ +Model Name,Language,MCV Dev-Set v12.0 (be),MCV Test-Set v12.0 (be) +stt_by_fastconformer_hybrid_large_pc,by,3.8 %,3.9 % diff --git a/docs/source/asr/data/scores_pc/de/fastconformer_de.csv b/docs/source/asr/data/scores_pc/de/fastconformer_de.csv new file mode 100644 index 000000000000..f86228918460 --- /dev/null +++ b/docs/source/asr/data/scores_pc/de/fastconformer_de.csv @@ -0,0 +1,2 @@ +Model Name,Language,MCV Dev-Set v12.0 (de),MCV Test-Set v12.0 (de),MLS Dev (en),MLS Test (en),VoxPopuli Dev (de),VoxPopuli Test (de) +stt_de_fastconformer_hybrid_large_pc,de,4.7 %,5.4 %,10.1 %,11.1 %,12.6 %,10.4 % diff --git a/docs/source/asr/data/scores_pc/en/fastconformer_en.csv b/docs/source/asr/data/scores_pc/en/fastconformer_en.csv new file mode 100644 index 000000000000..9495643af30d --- /dev/null +++ b/docs/source/asr/data/scores_pc/en/fastconformer_en.csv @@ -0,0 +1,2 @@ +Model Name,Language,EuroParl Test Set (en),Fisher Test Set (en),Librispeech Test-Clean,Librispeech Test-Other,MCV Test-Set v11.0 (en),MLS Test (en),NSC Part1,SPGI Test,VoxPopuli Test (en) +stt_en_fastconformer_hybrid_large_pc,en,12.5 %,19.0 %,7.3 %,9.2 %,10.1 %,12.7 %,7.2 %,5.1 %,6.7 % diff --git a/docs/source/asr/data/scores_pc/es/fastconformer_es.csv b/docs/source/asr/data/scores_pc/es/fastconformer_es.csv new file mode 100644 index 000000000000..501771865ed8 --- /dev/null +++ b/docs/source/asr/data/scores_pc/es/fastconformer_es.csv @@ -0,0 +1,2 @@ +Model Name,Language,Fisher Dev Set (es),Fisher Test Set (es),MCV Dev-Set v12.0 (es),MCV Test-Set v12.0 (es),MLS Dev (en),MLS Test (en),VoxPopuli Dev (es),VoxPopuli Test (es) +stt_es_fastconformer_hybrid_large_pc,es,14.7 %,14.6 %,4.5 %,5.0 
%,3.1 %,3.9 %,4.4 %,5.6 % diff --git a/docs/source/asr/data/scores_pc/hr/fastconformer_hr.csv b/docs/source/asr/data/scores_pc/hr/fastconformer_hr.csv new file mode 100644 index 000000000000..3c024c09f329 --- /dev/null +++ b/docs/source/asr/data/scores_pc/hr/fastconformer_hr.csv @@ -0,0 +1,2 @@ +Model Name,Language,Parlaspeech Dev-Set (v??) (hr),Parlaspeech Test-Set (v??) (hr) +stt_hr_fastconformer_hybrid_large_pc,hr,10.4 %,8.7 % diff --git a/docs/source/asr/data/scores_pc/it/fastconformer_it.csv b/docs/source/asr/data/scores_pc/it/fastconformer_it.csv new file mode 100644 index 000000000000..6bcf2c0b4400 --- /dev/null +++ b/docs/source/asr/data/scores_pc/it/fastconformer_it.csv @@ -0,0 +1,2 @@ +Model Name,Language,MCV Dev-Set v12.0 (it),MCV Test-Set v12.0 (it),MLS Dev (en),MLS Test (en),VoxPopuli Dev (it),VoxPopuli Test (it) +stt_it_fastconformer_hybrid_large_pc,it,7.8 %,8.2 %,26.4 %,22.5 %,16.8 %,19.6 % diff --git a/docs/source/asr/data/scores_pc/pl/fastconformer_pl.csv b/docs/source/asr/data/scores_pc/pl/fastconformer_pl.csv new file mode 100644 index 000000000000..5cbadae40b59 --- /dev/null +++ b/docs/source/asr/data/scores_pc/pl/fastconformer_pl.csv @@ -0,0 +1,2 @@ +Model Name,Language,MCV Dev-Set v12.0 (pl),MCV Test-Set v12.0 (pl),MLS Dev (en),MLS Test (en),VoxPopuli Dev (pl),VoxPopuli Test (pl) +stt_pl_fastconformer_hybrid_large_pc,pl,8.9 %,11.0 %,16.0 %,11.0 %,14.0 %,11.4 % diff --git a/docs/source/asr/data/scores_pc/ua/fastconformer_ua.csv b/docs/source/asr/data/scores_pc/ua/fastconformer_ua.csv new file mode 100644 index 000000000000..b486fa23aeb3 --- /dev/null +++ b/docs/source/asr/data/scores_pc/ua/fastconformer_ua.csv @@ -0,0 +1,2 @@ +Model Name,Language,MCV Test-Set v12.0 (ua) +stt_ua_fastconformer_hybrid_large_pc,ua,7.3 % diff --git a/docs/source/asr/scores.rst b/docs/source/asr/scores.rst index bcb083bd917e..d008a26700ec 100644 --- a/docs/source/asr/scores.rst +++ b/docs/source/asr/scores.rst @@ -28,6 +28,13 @@ EN -------------------- +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/en/fastconformer_en.csv + +-------------------- + .. csv-table:: :header-rows: 1 :align: left @@ -59,6 +66,16 @@ BE -------------------- +BY +^^ + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/by/fastconformer_by.csv + +-------------------- + CA ^^ @@ -100,6 +117,13 @@ DE -------------------- +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/de/fastconformer_de.csv + +-------------------- + .. csv-table:: :header-rows: 1 :align: left @@ -158,6 +182,13 @@ ES -------------------- +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/es/fastconformer_es.csv + +-------------------- + .. csv-table:: :header-rows: 1 :align: left @@ -206,6 +237,13 @@ HR -------------------- +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/hr/fastconformer_hr.csv + +-------------------- + IT ^^ @@ -216,6 +254,13 @@ IT -------------------- +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/it/fastconformer_it.csv + +-------------------- + .. csv-table:: :header-rows: 1 :align: left @@ -236,6 +281,13 @@ KAB PL ^^ +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/pl/fastconformer_pl.csv + +-------------------- + .. csv-table:: :header-rows: 1 :align: left @@ -270,6 +322,16 @@ RW -------------------- +UA +^^ + +.. 
csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/ua/fastconformer_ua.csv + +-------------------- + ZH ^^ @@ -287,3 +349,88 @@ ZH -------------------- + + +Scores with Punctuation and Capitalization +------------------------------------------ + +EN with P&C +^^^^^^^^^^^ + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores_pc/en/fastconformer_en.csv + +-------------------- + +BY with P&C +^^^^^^^^^^^ + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores_pc/by/fastconformer_by.csv + +-------------------- + +DE with P&C +^^^^^^^^^^^ + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores_pc/de/fastconformer_de.csv + +-------------------- + +ES with P&C +^^^^^^^^^^^ + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores_pc/es/fastconformer_es.csv + +-------------------- + +HR with P&C +^^^^^^^^^^^ + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores_pc/hr/fastconformer_hr.csv + +-------------------- + +IT with P&C +^^^^^^^^^^^ + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores_pc/it/fastconformer_it.csv + +-------------------- + +PL with P&C +^^^^^^^^^^^ + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores_pc/pl/fastconformer_pl.csv + +-------------------- + +UA with P&C +^^^^^^^^^^^ + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores_pc/ua/fastconformer_ua.csv + +-------------------- + From f4958870f59c423b5b0c008da50430d2bacb9956 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 3 May 2023 21:42:28 -0700 Subject: [PATCH 03/62] Fix fp16 (#6543) (#6544) Signed-off-by: MaximumEntropy Co-authored-by: Sandeep Subramanian --- examples/nlp/language_modeling/megatron_gpt_eval.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/nlp/language_modeling/megatron_gpt_eval.py b/examples/nlp/language_modeling/megatron_gpt_eval.py index d797937850e0..00b53a9f6f8f 100644 --- a/examples/nlp/language_modeling/megatron_gpt_eval.py +++ b/examples/nlp/language_modeling/megatron_gpt_eval.py @@ -196,6 +196,8 @@ def main(cfg) -> None: pretrained_cfg.activations_checkpoint_granularity = None pretrained_cfg.activations_checkpoint_method = None pretrained_cfg.precision = trainer.precision + if trainer.precision == "16": + pretrained_cfg.megatron_amp_O2 = False model = MegatronGPTModel.restore_from( restore_path=cfg.gpt_model_file, trainer=trainer, From fa2de0a0cc2bb5f89bc61fd8e173987f9e7ab3c5 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 4 May 2023 02:36:27 -0700 Subject: [PATCH 04/62] Patch transcribe and support offline transcribe for hybrid model (#6550) (#6559) Signed-off-by: fayejf Co-authored-by: fayejf <36722593+fayejf@users.noreply.github.com> --- examples/asr/transcribe_speech.py | 17 ++++++++++++++--- .../asr/parts/utils/transcribe_utils.py | 4 ++-- tools/asr_evaluator/conf/eval.yaml | 2 +- tools/asr_evaluator/utils.py | 3 ++- 4 files changed, 19 insertions(+), 7 deletions(-) diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index 3493fb28d81d..30700153e340 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -19,11 +19,11 @@ import pytorch_lightning as pl import torch -from omegaconf import OmegaConf +from omegaconf import OmegaConf, open_dict from nemo.collections.asr.metrics.rnnt_wer import RNNTDecodingConfig from 
nemo.collections.asr.metrics.wer import CTCDecodingConfig -from nemo.collections.asr.models.ctc_models import EncDecCTCModel +from nemo.collections.asr.models import EncDecCTCModel, EncDecHybridRNNTCTCModel from nemo.collections.asr.modules.conformer_encoder import ConformerChangeConfig from nemo.collections.asr.parts.utils.transcribe_utils import ( compute_output_filename, @@ -154,6 +154,9 @@ class TranscriptionConfig: def main(cfg: TranscriptionConfig) -> TranscriptionConfig: logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') + for key in cfg: + cfg[key] = None if cfg[key] == 'None' else cfg[key] + if is_dataclass(cfg): cfg = OmegaConf.structured(cfg) @@ -223,7 +226,6 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig: decoding_cfg.preserve_alignments = cfg.compute_timestamps if 'compute_langs' in decoding_cfg: decoding_cfg.compute_langs = cfg.compute_langs - asr_model.change_decoding_strategy(decoding_cfg, decoder_type=cfg.decoder_type) # Check if ctc or rnnt model @@ -243,6 +245,15 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig: asr_model.change_decoding_strategy(cfg.ctc_decoding) + # Setup decoding config based on model type and decoder_type + with open_dict(cfg): + if isinstance(asr_model, EncDecCTCModel) or ( + isinstance(asr_model, EncDecHybridRNNTCTCModel) and cfg.decoder_type == "ctc" + ): + cfg.decoding = cfg.ctc_decoding + else: + cfg.decoding = cfg.rnnt_decoding + # prepare audio filepaths and decide wether it's partical audio filepaths, partial_audio = prepare_audio_data(cfg) diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py b/nemo/collections/asr/parts/utils/transcribe_utils.py index d59d453ba972..8cfe58523751 100644 --- a/nemo/collections/asr/parts/utils/transcribe_utils.py +++ b/nemo/collections/asr/parts/utils/transcribe_utils.py @@ -289,14 +289,14 @@ def write_transcription( if isinstance(transcriptions[0], rnnt_utils.Hypothesis): # List[rnnt_utils.Hypothesis] best_hyps = transcriptions - assert cfg.ctc_decoding.beam.return_best_hypothesis, "Works only with return_best_hypothesis=true" + assert cfg.decoding.beam.return_best_hypothesis, "Works only with return_best_hypothesis=true" elif isinstance(transcriptions[0], list) and isinstance( transcriptions[0][0], rnnt_utils.Hypothesis ): # List[List[rnnt_utils.Hypothesis]] NBestHypothesis best_hyps, beams = [], [] for hyps in transcriptions: best_hyps.append(hyps[0]) - if not cfg.ctc_decoding.beam.return_best_hypothesis: + if not cfg.decoding.beam.return_best_hypothesis: beam = [] for hyp in hyps: beam.append((hyp.text, hyp.score)) diff --git a/tools/asr_evaluator/conf/eval.yaml b/tools/asr_evaluator/conf/eval.yaml index 95e7c94b5b43..9129eddc49f1 100644 --- a/tools/asr_evaluator/conf/eval.yaml +++ b/tools/asr_evaluator/conf/eval.yaml @@ -13,7 +13,7 @@ engine: chunk_len_in_secs: 1.6 #null # Need to specify if use buffered inference (default for offline_by_chunked is 20) total_buffer_in_secs: 4 #null # Need to specify if use buffered inference (default for offline_by_chunked is 22) model_stride: 4 # Model downsampling factor, 8 for Citrinet models and 4 for Conformer models - + decoder_type: null # Used for hybrid CTC RNNT model only. Specify decoder_type *ctc* or *rnnt* for hybrid CTC RNNT model. 
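  # The value set here is passed through to transcribe_speech.py as decoder_type (see the
  # utils.py change below); with the branch added in transcribe_speech.py above, "ctc" selects
  # cfg.ctc_decoding for a hybrid checkpoint, while null falls through to cfg.rnnt_decoding.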
test_ds: manifest_filepath: null sample_rate: 16000 diff --git a/tools/asr_evaluator/utils.py b/tools/asr_evaluator/utils.py index ad69b249f5db..c233376eb13a 100644 --- a/tools/asr_evaluator/utils.py +++ b/tools/asr_evaluator/utils.py @@ -154,7 +154,8 @@ def run_offline_inference(cfg: DictConfig) -> DictConfig: f"output_filename={cfg.output_filename} " f"batch_size={cfg.test_ds.batch_size} " f"random_seed={cfg.random_seed} " - f"eval_config_yaml={f.name} ", + f"eval_config_yaml={f.name} " + f"decoder_type={cfg.inference.decoder_type} ", shell=True, check=True, ) From 8d1901ba436f364c19f5cd77be9d5df465751387 Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Thu, 4 May 2023 09:20:49 -0700 Subject: [PATCH 05/62] Fix notebook bad json (#6561) Signed-off-by: smajumdar --- tutorials/asr/Offline_ASR.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tutorials/asr/Offline_ASR.ipynb b/tutorials/asr/Offline_ASR.ipynb index 678bf3f543fa..fc8af2e76416 100644 --- a/tutorials/asr/Offline_ASR.ipynb +++ b/tutorials/asr/Offline_ASR.ipynb @@ -30,8 +30,8 @@ "* use beam search decoder with N-gram language model re-scoring\n", "\n", "You may find more info on how to train and use language models for ASR models here:\n", - "https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/asr_language_modeling.html\n" - "\n\nNOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n", + "https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/asr_language_modeling.html\n", + "\n\nNOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n" ] }, { From bbeabcaea350a8bb4f26cb774d8382e00f3ea81e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 4 May 2023 09:26:29 -0700 Subject: [PATCH 06/62] Change Megatron Enc Dec model to use persistent_workers (#6548) (#6552) * persistent workers * fix --------- Signed-off-by: Abhinav Khattar Co-authored-by: Abhinav Khattar Co-authored-by: Eric Harper --- .../language_modeling/megatron_lm_encoder_decoder_model.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py index 4f4bc0d709a8..b3ecc1b150ac 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py @@ -822,7 +822,11 @@ def build_pretraining_data_loader(self, dataset, consumed_samples, num_workers): # Torch dataloader. 
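        # persistent_workers keeps DataLoader worker processes alive across epochs rather than
        # re-spawning them (and re-building the dataset) every epoch; PyTorch only permits the
        # flag when num_workers > 0, which is why it is gated on num_workers below.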
return torch.utils.data.DataLoader( - dataset, batch_sampler=batch_sampler, num_workers=num_workers, pin_memory=True, + dataset, + batch_sampler=batch_sampler, + num_workers=num_workers, + pin_memory=True, + persistent_workers=True if num_workers > 0 else False, ) def setup(self, stage=None): From 76ee4c3f6e85efaf5b12c37042bd9c4d86a88180 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 4 May 2023 20:57:47 +0400 Subject: [PATCH 07/62] Make KenLM with PC for AggregateTokenizer and merge it (#6081) * do_lowercase, rm_punctuation Signed-off-by: Nikolay Karpov * support beam_strategy = beam Signed-off-by: Nikolay Karpov * black Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix config and^Cunctuation capitalization Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * rm math Signed-off-by: Nikolay Karpov * update kenlm Signed-off-by: Nikolay Karpov * black Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add opengrm Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * mv install_beamsearch_decoders Signed-off-by: Nikolay Karpov * punctuation_to_preserve Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Only tikenizer opion Signed-off-by: Nikolay Karpov * Black Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * DEFAULT_TOKEN_OFFSET Signed-off-by: Nikolay Karpov * aggregate_tokenizer Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * install kenlm with more than 5gram Signed-off-by: Nikolay Karpov * install_beamsearch_decoders Signed-off-by: Nikolay Karpov * ngram_bin_path kenlm_bin_path Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * black Signed-off-by: Nikolay Karpov * fix greedy PC bug Signed-off-by: Nikolay Karpov * move global params Signed-off-by: Nikolay Karpov * fix description and perplexity Signed-off-by: Nikolay Karpov * fix description Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * NEMO_PATH Signed-off-by: Nikolay Karpov * nemo:23.01 Signed-off-by: Nikolay Karpov * License Signed-off-by: Nikolay Karpov * description Signed-off-by: Nikolay Karpov * isinstance Signed-off-by: Nikolay Karpov * refactor kenlm stdin Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * black Signed-off-by: Nikolay Karpov * add cmd arg Signed-off-by: Nikolay Karpov * use new iter_files Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * EncDecHybridRNNTCTCModel Signed-off-by: Nikolay Karpov * punctuation Signed-off-by: Nikolay Karpov * train_kenlm args Signed-off-by: Nikolay Karpov * add docstrings Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add ngram_merge docs Signed-off-by: Nikolay Karpov * 
ngram_prune Signed-off-by: Nikolay Karpov * rename to ngram_merge Signed-off-by: Nikolay Karpov * rename to ngram Signed-off-by: Nikolay Karpov * add comments Signed-off-by: Nikolay Karpov * Ngram Signed-off-by: Nikolay Karpov * nemo_model_file Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * install_opengrm_ngram Signed-off-by: Nikolay Karpov * install opengrm Signed-off-by: Nikolay Karpov * rename to install_opengrm.sh Signed-off-by: Nikolay Karpov * rm extra import Signed-off-by: Nikolay Karpov * train_paths Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * text_processing Signed-off-by: Nikolay Karpov * fix ngram_bin_path Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * DECODERS_PATH Signed-off-by: Nikolay Karpov * farcompile Signed-off-by: Nikolay Karpov * rm text processing Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * text_processing Signed-off-by: Nikolay Karpov * AggregateTokenizer.DummyTokenizer Signed-off-by: Nikolay Karpov * comments Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * TextProcessingConfig Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * typo Signed-off-by: Nikolay Karpov * doc Signed-off-by: Nikolay Karpov * types Signed-off-by: Nikolay Karpov * nemo_model_file Signed-off-by: Nikolay Karpov * rm assert Signed-off-by: Nikolay Karpov * import kenlm_utils Signed-off-by: Nikolay Karpov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * return None Signed-off-by: Nikolay Karpov * Copyright Signed-off-by: Nikolay Karpov * 2022 Signed-off-by: Nikolay Karpov * 2023 Signed-off-by: Nikolay Karpov --------- Signed-off-by: Nikolay Karpov Signed-off-by: Nikolay Karpov Co-authored-by: Nikolay Karpov Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/source/asr/asr_language_modeling.rst | 118 ++++- examples/asr/speech_to_text_eval.py | 18 +- nemo/collections/asr/metrics/wer_bpe.py | 6 +- .../asr/parts/utils/transcribe_utils.py | 49 +- .../ngram_lm/eval_beamsearch_ngram.py | 42 +- .../ngram_lm/install_beamsearch_decoders.sh | 47 +- .../ngram_lm/kenlm_utils.py | 189 ++++++-- .../ngram_lm/ngram_merge.py | 448 ++++++++++++++++++ .../ngram_lm/train_kenlm.py | 201 ++++---- scripts/installers/Dockerfile.ngramtools | 30 ++ scripts/installers/install_opengrm.sh | 32 ++ scripts/installers/setup_os2s_decoders.py | 138 ++++++ 12 files changed, 1132 insertions(+), 186 deletions(-) create mode 100644 scripts/asr_language_modeling/ngram_lm/ngram_merge.py create mode 100644 scripts/installers/Dockerfile.ngramtools create mode 100755 scripts/installers/install_opengrm.sh create mode 100644 scripts/installers/setup_os2s_decoders.py diff --git a/docs/source/asr/asr_language_modeling.rst b/docs/source/asr/asr_language_modeling.rst index a0e578092f50..a0d46ca795b1 100644 --- a/docs/source/asr/asr_language_modeling.rst +++ b/docs/source/asr/asr_language_modeling.rst @@ -21,7 +21,9 @@ best candidates. 
The beam search decoders in NeMo support language models trained with KenLM library (`https://github.com/kpu/kenlm <https://github.com/kpu/kenlm>`__). The beam search decoders and KenLM library are not installed by default in NeMo, and you need to install them to be able to use beam search decoding and N-gram LM. -Please refer to `scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh` on how to install them. +Please refer to `scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh `__ +on how to install them. Alternatively, you can build the Docker image +`scripts/installers/Dockerfile.ngramtools `__ with all the necessary dependencies. NeMo supports both character-based and BPE-based models for N-gram LMs. An N-gram LM can be used with beam search decoders on top of the ASR models to produce more accurate candidates. The beam search decoder would incorporate @@ -45,7 +47,7 @@ The script to train an N-gram language model with KenLM can be found at `scripts/asr_language_modeling/ngram_lm/train_kenlm.py `__. This script would train an N-gram language model with the KenLM library which can be used with the beam search decoders -on top of the ASR models. This script supports both character level and BPE level encodings and models which is +on top of the ASR models. This script supports both character level and BPE level encodings and models which are detected automatically from the type of the model. @@ -53,15 +55,15 @@ You may train the N-gram model as the following: .. code-block:: - python train_kenlm.py --nemo_model_file \ - --train_file \ - --kenlm_model_file \ - --ngram_length \ - --preserve_arpa + python train_kenlm.py nemo_model_file= \ + train_paths= \ + kenlm_bin_path= \ + kenlm_model_file= \ + ngram_length= \ + preserve_arpa=true -The train file specified by `--train_file` can be a text file or JSON manifest. If the file's extension is anything -other than `.json`, it assumes that data format is plain text. For plain text format, each line should contain one +The `train_paths` parameter allows for various input types, such as a list of text files, JSON manifests, or directories, to be used as the training data. +If the file's extension is anything other than `.json`, it assumes that data format is plain text. For plain text format, each line should contain one sample. For a JSON manifest file, each line needs to contain one JSON-formatted sample like this: .. code-block:: @@ -69,16 +71,16 @@ sample. For JSON manifest file, the file need to contain json formatted samples {"audio_filepath": "/data_path/file1.wav", "text": "The transcript of the audio file."} It just extracts the `text` field from each line to create the training text file. After the N-gram model is trained, -it is stored at the path specified by `--kenlm_model_file`. +it is stored at the path specified by `kenlm_model_file`. The following is the list of the arguments for the training script: +------------------+----------+-------------+-------------------------------------------------------------------------------------------------+ | **Argument** | **Type** | **Default** | **Description** | +------------------+----------+-------------+-------------------------------------------------------------------------------------------------+ -| nemo_model_file | str | Required | The path of the `.nemo` file of the ASR model. It is needed to extract the tokenizer. | +| nemo_model_file | str | Required | The path to `.nemo` file of the ASR model, or name of a pretrained NeMo model to extract a tokenizer.
| +------------------+----------+-------------+-------------------------------------------------------------------------------------------------+ -| train_file | str | Required | Path to the training file, it can be a text file or JSON manifest. | +| train_paths | List[str] | Required | List of training files or folders. Files can be a plain text file or ".json" manifest or ".json.gz". | +------------------+----------+-------------+-------------------------------------------------------------------------------------------------+ | kenlm_model_file | str | Required | The path to store the KenLM binary model file. | +------------------+----------+-------------+-------------------------------------------------------------------------------------------------+ @@ -86,10 +88,14 @@ The following is the list of the arguments for the training script: +------------------+----------+-------------+-------------------------------------------------------------------------------------------------+ | ngram_length** | int | Required | Specifies the order of the N-gram LM. | +------------------+----------+-------------+-------------------------------------------------------------------------------------------------+ -| do_lower_case | bool | ``False`` | Whether to make the training text all lower case. | +| ngram_prune | List[int] | [0] | List of thresholds to prune N-grams. Example: [0,0,1]. See the Pruning section at https://kheafield.com/code/kenlm/estimation | ++------------------+----------+-------------+-------------------------------------------------------------------------------------------------+ +| cache_path | str | "" | Cache path to save tokenized files. | +------------------+----------+-------------+-------------------------------------------------------------------------------------------------+ | preserve_arpa | bool | ``False`` | Whether to preserve the intermediate ARPA file after construction of the BIN file. | +------------------+----------+-------------+-------------------------------------------------------------------------------------------------+ +| verbose | int | 1 | Verbose level. | ++------------------+----------+-------------+-------------------------------------------------------------------------------------------------+ ** Note: It is recommended to use 6 as the order of the N-gram model for BPE-based models. Higher orders may require re-compiling KenLM to support them. @@ -175,6 +181,14 @@ The following is the list of the important arguments for the evaluation script: +---------------------+----------+------------------+-------------------------------------------------------------------------+ | decoding | Dict | BeamCTC | Subdict of beam search configs. Values found via | | | Config | InferConfig | python eval_beamsearch_ngram.py --help | +---------------------+----------+------------------+-------------------------------------------------------------------------+ +| text_processing.do_lowercase | bool | ``False`` | Whether to make the text all lower case. | ++---------------------+----------+------------------+-------------------------------------------------------------------------+ +| text_processing.punctuation_marks | str | "" | String with punctuation marks to process. Example: ".\,?" | ++---------------------+----------+------------------+-------------------------------------------------------------------------+ +| text_processing.rm_punctuation | bool | ``False`` | Whether to remove punctuation marks from text.
| ++---------------------+----------+------------------+-------------------------------------------------------------------------+ +| text_processing.separate_punctuation | bool | ``True`` | Whether to separate punctuation from the previous word with a space. | ++---------------------+----------+------------------+-------------------------------------------------------------------------+ Width of the beam search (`--beam_width`) specifies the number of top candidates/predictions the beam search decoder would search for. Larger beams result in more accurate but slower predictions. @@ -334,7 +348,7 @@ Given a trained TransformerLMModel `.nemo` file or a pretrained HF model, the script `scripts/asr_language_modeling/neural_rescorer/eval_neural_rescorer.py `__ can be used to re-score beams obtained with an ASR model. You need the `.tsv` file containing the candidates produced by the acoustic model and the beam search decoding to use this script. The candidates can be the result of just the beam -search decoding or the result of fusion with an N-gram LM. You may generate this file by specifying `--preds_output_folder' for +search decoding or the result of fusion with an N-gram LM. You may generate this file by specifying `--preds_output_folder` for `scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram.py `__. The neural rescorer would rescore the beams/candidates using the two parameters `rescorer_alpha` and `rescorer_beta` as follows: @@ -457,3 +471,77 @@ You can then pass this file to your flashlight config object during decoding: decoding.beam.flashlight_cfg.boost_path='/path/to/my_boost_file.boost' \ decoding.beam.flashlight_cfg.beam_size_token = 32 \ decoding.beam.flashlight_cfg.beam_threshold = 25.0 + +Combine N-gram Language Models +============================== + +Before combining N-gram LMs, install the required OpenGrm NGram library using `scripts/installers/install_opengrm.sh `__. +Alternatively, you can use the Docker image `scripts/installers/Dockerfile.ngramtools `__ with all the necessary dependencies. + +To combine two N-gram language models, you can use the script ngram_merge.py located at +`scripts/asr_language_modeling/ngram_lm/ngram_merge.py `__. + +This script interpolates two ARPA N-gram language models and creates a KenLM binary file that can be used with the beam search decoders on top of ASR models. +You can specify weights (`--alpha` and `--beta`) for each of the models (`--arpa_a` and `--arpa_b`) respectively: `alpha` * `ngram_a` + `beta` * `ngram_b`. +This script supports both character level and BPE level encodings and models which are detected automatically from the type of the model. + +To combine two N-gram models, you can use the following command: + +.. code-block:: + + python ngram_merge.py --kenlm_bin_path \ + --ngram_bin_path \ + --arpa_a \ + --alpha \ + --arpa_b \ + --beta \ + --out_path + + + +If you provide `--test_file` and `--nemo_model_file`, the script will calculate the perplexity of the resulting N-gram model on the test set. +Note that the result of each step is cached in a temporary file under `--out_path` to speed up subsequent runs. +You can use the `--force` flag to discard the cache and recalculate everything from scratch. + +..
code-block:: + + python ngram_merge.py --kenlm_bin_path \ + --ngram_bin_path \ + --arpa_a \ + --alpha \ + --arpa_b \ + --beta \ + --out_path + --nemo_model_file \ + --test_file \ + --symbols \ + --force + + +The following is the list of the arguments for the opengrm script: + ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| **Argument** |**Type**| **Default** | **Description** | ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| kenlm_bin_path | str | Required | The path to the bin folder of KenLM library. It is a folder named `bin` under where KenLM is installed. | ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| ngram_bin_path | str | Required | The path to the bin folder of OpenGrm Ngram. It is a folder named `bin` under where OpenGrm Ngram is installed. | ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| arpa_a | str | Required | Path to the ARPA N-gram model file A | ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| alpha | float | Required | Weight of N-gram model A | ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| arpa_b | int | Required | Path to the ARPA N-gram model file B | ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| beta | float | Required | Weight of N-gram model B | ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| out_path | str | Required | Path for writing temporary and resulting files. | ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| test_file | str | None | Path to test file to count perplexity if provided. | ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| symbols | str | None | Path to symbols (.syms) file. Could be calculated if it is not provided.| ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| nemo_model_file | str | None | The path to '.nemo' file of the ASR model, or name of a pretrained NeMo model. 
| ++----------------------+--------+------------------+-------------------------------------------------------------------------+ +| force | bool | ``False`` | Whether to recompile and rewrite all files | ++----------------------+--------+------------------+-------------------------------------------------------------------------+ diff --git a/examples/asr/speech_to_text_eval.py b/examples/asr/speech_to_text_eval.py index d846157b6513..f8dcbcf81bbd 100644 --- a/examples/asr/speech_to_text_eval.py +++ b/examples/asr/speech_to_text_eval.py @@ -66,7 +66,7 @@ from omegaconf import MISSING, OmegaConf, open_dict from nemo.collections.asr.metrics.wer import word_error_rate -from nemo.collections.asr.parts.utils.transcribe_utils import PunctuationCapitalization +from nemo.collections.asr.parts.utils.transcribe_utils import PunctuationCapitalization, TextProcessingConfig from nemo.core.config import hydra_runner from nemo.utils import logging @@ -81,9 +81,9 @@ class EvaluationConfig(transcribe_speech.TranscriptionConfig): only_score_manifest: bool = False - separate_punctuation: bool = True - do_lowercase: bool = False - rm_punctuation: bool = False + text_processing: Optional[TextProcessingConfig] = TextProcessingConfig( + punctuation_marks=".,?", separate_punctuation=True, do_lowercase=False, rm_punctuation=False, + ) @hydra_runner(config_name="EvaluationConfig", schema=EvaluationConfig) @@ -131,13 +131,13 @@ def main(cfg: EvaluationConfig): predicted_text.append(data['pred_text']) - pc = PunctuationCapitalization('.,?') - if cfg.separate_punctuation: + pc = PunctuationCapitalization(cfg.text_processing.punctuation_marks) + if cfg.text_processing.separate_punctuation: ground_truth_text = pc.separate_punctuation(ground_truth_text) - if cfg.do_lowercase: + if cfg.text_processing.do_lowercase: ground_truth_text = pc.do_lowercase(ground_truth_text) predicted_text = pc.do_lowercase(predicted_text) - if cfg.rm_punctuation: + if cfg.text_processing.rm_punctuation: ground_truth_text = pc.rm_punctuation(ground_truth_text) predicted_text = pc.rm_punctuation(predicted_text) @@ -164,8 +164,6 @@ def main(cfg: EvaluationConfig): raise ValueError(f"Got {metric_name} of {metric_value}, which was higher than tolerance={cfg.tolerance}") logging.info(f'Got {metric_name} of {metric_value}. 
Tolerance was {cfg.tolerance}') - else: - logging.info(f'Got {metric_name} of {metric_value}') logging.info(f'Dataset WER/CER ' + str(round(100 * wer, 2)) + "%/" + str(round(100 * cer, 2)) + "%") diff --git a/nemo/collections/asr/metrics/wer_bpe.py b/nemo/collections/asr/metrics/wer_bpe.py index 3dbecbb39628..762acf172a16 100644 --- a/nemo/collections/asr/metrics/wer_bpe.py +++ b/nemo/collections/asr/metrics/wer_bpe.py @@ -22,6 +22,7 @@ from nemo.collections.asr.metrics.wer import AbstractCTCDecoding, CTCDecodingConfig from nemo.collections.asr.parts.submodules import ctc_beam_decoding from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis +from nemo.collections.common.tokenizers.aggregate_tokenizer import DummyTokenizer from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec from nemo.utils import logging @@ -147,7 +148,10 @@ def __init__(self, decoding_cfg, tokenizer: TokenizerSpec): if isinstance(self.decoding, ctc_beam_decoding.AbstractBeamCTCInfer): if hasattr(self.tokenizer.tokenizer, 'get_vocab'): vocab_dict = self.tokenizer.tokenizer.get_vocab() - vocab = list(vocab_dict.keys()) + if isinstance(self.tokenizer.tokenizer, DummyTokenizer): # AggregateTokenizer.DummyTokenizer + vocab = vocab_dict + else: + vocab = list(vocab_dict.keys()) self.decoding.set_vocabulary(vocab) self.decoding.set_tokenizer(tokenizer) else: diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py b/nemo/collections/asr/parts/utils/transcribe_utils.py index 8cfe58523751..0e72ed8fa16d 100644 --- a/nemo/collections/asr/parts/utils/transcribe_utils.py +++ b/nemo/collections/asr/parts/utils/transcribe_utils.py @@ -15,6 +15,7 @@ import json import os import re +from dataclasses import dataclass from typing import List, Optional, Tuple, Union import torch @@ -442,14 +443,48 @@ def transcribe_partial_audio( class PunctuationCapitalization: - def __init__(self, punctuation_marks='.,?'): - self.regex_punctuation = re.compile(fr"([{''.join(punctuation_marks)}])") + def __init__(self, punctuation_marks: str): + """ + Class for text processing with punctuation and capitalization. Can be used with class TextProcessingConfig. + + Args: + punctuation_marks (str): String with punctuation marks to process. + Example: punctuation_marks = '.,?' + """ + if punctuation_marks: + self.regex_punctuation = re.compile(fr"([{''.join(punctuation_marks)}])") + self.regex_extra_space = re.compile('\s{2,}') + else: + self.regex_punctuation = None - def separate_punctuation(self, lines): - return [self.regex_punctuation.sub(r' \1 ', line) for line in lines] + def separate_punctuation(self, lines: List[str]) -> List[str]: + if self.regex_punctuation is not None: + return [ + self.regex_extra_space.sub('', self.regex_punctuation.sub(r' \1 ', line)).strip() for line in lines + ] + else: + return lines - def do_lowercase(self, lines): + def do_lowercase(self, lines: List[str]) -> List[str]: return [line.lower() for line in lines] - def rm_punctuation(self, lines): - return [self.regex_punctuation.sub(' ', line).strip() for line in lines] + def rm_punctuation(self, lines: List[str]) -> List[str]: + if self.regex_punctuation is not None: + return [self.regex_extra_space.sub('', self.regex_punctuation.sub(' ', line)).strip() for line in lines] + else: + return lines + + +@dataclass +class TextProcessingConfig: + # Punctuation marks to process. Example: ".,?" + punctuation_marks: str = "" + + # Whether to apply lower case conversion on the training text. 
+ do_lowercase: bool = False + + # Whether to remove punctuation marks from text. + rm_punctuation: bool = False + + # Whether to separate punctuation from the previous word with a space. + separate_punctuation: bool = True diff --git a/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram.py b/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram.py index e994a29426cc..1f62da6bb168 100644 --- a/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram.py +++ b/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram.py @@ -71,7 +71,7 @@ import nemo.collections.asr as nemo_asr from nemo.collections.asr.parts.submodules import ctc_beam_decoding -from nemo.collections.asr.parts.utils.transcribe_utils import PunctuationCapitalization +from nemo.collections.asr.parts.utils.transcribe_utils import PunctuationCapitalization, TextProcessingConfig from nemo.core.config import hydra_runner from nemo.utils import logging @@ -111,10 +111,12 @@ class EvalBeamSearchNGramConfig: decoding_strategy: str = "beam" decoding: ctc_beam_decoding.BeamCTCInferConfig = ctc_beam_decoding.BeamCTCInferConfig(beam_size=128) - separate_punctuation: bool = True - do_lowercase: bool = False - rm_punctuation: bool = False - + text_processing: Optional[TextProcessingConfig] = TextProcessingConfig( + punctuation_marks = ".,?", + separate_punctuation = True, + do_lowercase = False, + rm_punctuation = False, + ) # fmt: on @@ -130,6 +132,7 @@ def beam_search_eval( beam_width: int = 128, beam_batch_size: int = 128, progress_bar: bool = True, + punctuation_capitalization: PunctuationCapitalization = None, ): level = logging.getEffectiveLevel() logging.setLevel(logging.CRITICAL) @@ -182,15 +185,9 @@ def beam_search_eval( _, beams_batch = model.decoding.ctc_decoder_predictions_tensor( packed_batch, decoder_lengths=probs_lens, return_hypotheses=True, ) - pc = PunctuationCapitalization(',.?') + for beams_idx, beams in enumerate(beams_batch): target = target_transcripts[sample_idx + beams_idx] - if cfg.separate_punctuation: - target = pc.separate_punctuation([target])[0] - if cfg.do_lowercase: - target = pc.do_lowercase([target])[0] - if cfg.rm_punctuation: - target = pc.rm_punctuation([target])[0] target_split_w = target.split() target_split_c = list(target) words_count += len(target_split_w) @@ -198,10 +195,10 @@ def beam_search_eval( wer_dist_min = cer_dist_min = 10000 for candidate_idx, candidate in enumerate(beams): # type: (int, ctc_beam_decoding.rnnt_utils.Hypothesis) pred_text = candidate.text - if cfg.do_lowercase: - pred_text = pc.do_lowercase([pred_text])[0] - if cfg.rm_punctuation: - pred_text = pc.rm_punctuation([pred_text])[0] + if cfg.text_processing.do_lowercase: + pred_text = punctuation_capitalization.do_lowercase([pred_text])[0] + if cfg.text_processing.rm_punctuation: + pred_text = punctuation_capitalization.rm_punctuation([pred_text])[0] pred_split_w = pred_text.split() wer_dist = editdistance.eval(target_split_w, pred_split_w) pred_split_c = list(pred_text) @@ -281,6 +278,14 @@ def main(cfg: EvalBeamSearchNGramConfig): target_transcripts.append(data['text']) audio_file_paths.append(str(audio_file.absolute())) + punctuation_capitalization = PunctuationCapitalization(cfg.text_processing.punctuation_marks) + if cfg.text_processing.separate_punctuation: + target_transcripts = punctuation_capitalization.separate_punctuation(target_transcripts) + if cfg.text_processing.do_lowercase: + target_transcripts = punctuation_capitalization.do_lowercase(target_transcripts) + if
cfg.text_processing.rm_punctuation: + target_transcripts = punctuation_capitalization.rm_punctuation(target_transcripts) + if cfg.probs_cache_file and os.path.exists(cfg.probs_cache_file): logging.info(f"Found a pickle file of probabilities at '{cfg.probs_cache_file}'.") logging.info(f"Loading the cached pickle file of probabilities from '{cfg.probs_cache_file}' ...") @@ -327,6 +332,10 @@ def default_autocast(): preds = np.argmax(probs, axis=1) preds_tensor = torch.tensor(preds, device='cpu').unsqueeze(0) pred_text = asr_model._wer.decoding.ctc_decoder_predictions_tensor(preds_tensor)[0][0] + if cfg.text_processing.do_lowercase: + pred_text = punctuation_capitalization.do_lowercase([pred_text])[0] + if cfg.text_processing.rm_punctuation: + pred_text = punctuation_capitalization.rm_punctuation([pred_text])[0] pred_split_w = pred_text.split() target_split_w = target_transcripts[batch_idx].split() @@ -393,6 +402,7 @@ def default_autocast(): beam_beta=hp["beam_beta"], beam_batch_size=cfg.beam_batch_size, progress_bar=True, + punctuation_capitalization=punctuation_capitalization, ) if candidate_cer < best_cer: diff --git a/scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh b/scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh index e0fd1a2fdd0b..c1a94df53a41 100644 --- a/scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh +++ b/scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh @@ -1,7 +1,29 @@ #!/usr/bin/env bash -# install Boost package -sudo apt-get update -sudo apt-get install swig build-essential libboost-all-dev cmake zlib1g-dev libbz2-dev liblzma-dev +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
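# A minimal usage sketch (the argument below is a placeholder path, not a fixed value):
#   bash scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh /path/to/NeMo
# If the argument is omitted, NEMO_PATH falls back to the /workspace/nemo default set below.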
+ +# Use this script to install KenLM, OpenSeq2Seq decoder, Flashlight decoder +NEMO_PATH=/workspace/nemo # Path to NeMo folder: /workspace/nemo if you use NeMo/Dockerfile +if [ "$#" -eq 1 ] +then + NEMO_PATH=$1 +fi +KENLM_MAX_ORDER=10 # Maximum order of KenLM model, also specified in the setup_os2s_decoders.py + +cd $NEMO_PATH +apt-get update && apt-get upgrade -y && apt-get install -y liblzma-dev && rm -rf /var/lib/apt/lists/* # needed for flashlight decoder + git clone https://github.com/NVIDIA/OpenSeq2Seq cd OpenSeq2Seq git checkout ctc-decoders @@ -11,5 +33,24 @@ rm -rf OpenSeq2Seq cd decoders # patch setup code to support the recent distutils sed -i 's/, distutils/, distutils\nimport distutils.ccompiler/g' setup.py + +cp $NEMO_PATH/scripts/installers/setup_os2s_decoders.py ./setup.py ./setup.sh + +# install Boost package for KenLM +wget https://boostorg.jfrog.io/artifactory/main/release/1.80.0/source/boost_1_80_0.tar.bz2 --no-check-certificate && tar --bzip2 -xf $NEMO_PATH/decoders/boost_1_80_0.tar.bz2 && cd boost_1_80_0 && ./bootstrap.sh && ./b2 --layout=tagged link=static,shared threading=multi,single install -j4 || echo FAILURE +export BOOST_ROOT=$NEMO_PATH/decoders/boost_1_80_0 + +# install KenLM +cd $NEMO_PATH/decoders/kenlm/build && cmake -DKENLM_MAX_ORDER=$KENLM_MAX_ORDER .. && make -j2 +cd $NEMO_PATH/decoders/kenlm +python setup.py install --max_order=$KENLM_MAX_ORDER +export KENLM_LIB=$NEMO_PATH/decoders/kenlm/build/bin +export KENLM_ROOT=$NEMO_PATH/decoders/kenlm +cd .. + +# install Flashlight +git clone https://github.com/flashlight/text && cd text +python setup.py bdist_wheel +pip install dist/*.whl cd .. diff --git a/scripts/asr_language_modeling/ngram_lm/kenlm_utils.py b/scripts/asr_language_modeling/ngram_lm/kenlm_utils.py index 27bfa7c25c09..9e255ddc50ca 100644 --- a/scripts/asr_language_modeling/ngram_lm/kenlm_utils.py +++ b/scripts/asr_language_modeling/ngram_lm/kenlm_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,21 +15,36 @@ """ Utility methods to be used for training N-gram LM with KenLM in train_kenlm.py + +The BPE sub-words are encoded using the Unicode table. +This encoding scheme reduces the required memory significantly, and the LM and its binary blob format require less storage space. +The value DEFAULT_TOKEN_OFFSET from nemo.collections.asr.parts.submodules.ctc_beam_decoding is utilized as the offset value. 
""" +CHUNK_SIZE = 8192 +CHUNK_BUFFER_SIZE = 512 + +import gzip import json import os import numpy as np +import torch from joblib import Parallel, delayed from tqdm.auto import tqdm +import nemo.collections.asr as nemo_asr +from nemo.collections.asr.parts.submodules.ctc_beam_decoding import DEFAULT_TOKEN_OFFSET +from nemo.utils import logging + # List of the supported models to be used with N-gram LM and beam search decoding SUPPORTED_MODELS = { 'EncDecCTCModelBPE': 'subword', 'EncDecCTCModel': 'char', 'EncDecRNNTBPEModel': 'subword', 'EncDecRNNTModel': 'char', + 'EncDecHybridRNNTCTCBPEModel': 'subword', + 'EncDecHybridRNNTCTCModel': 'char', } @@ -38,77 +53,169 @@ def softmax(x): return e / e.sum(axis=-1).reshape([x.shape[0], 1]) -def read_train_file(path, lowercase: bool = False): +def get_train_list(args_train_path): + + train_path = [] + for train_item in args_train_path: + if os.path.isdir(train_item): + file_list = os.listdir(train_item) + train_path.extend([os.path.join(train_item, file) for file in file_list]) + + elif os.path.isfile(train_item): + train_path.append(train_item) + return sorted(train_path) + + +def setup_tokenizer(nemo_model_file): + """ TOKENIZER SETUP + nemo_model_file (str): The path to the NeMo model file (.nemo). + """ + logging.info(f"Loading nemo model '{nemo_model_file}' ...") + if nemo_model_file.endswith('.nemo'): + model = nemo_asr.models.ASRModel.restore_from(nemo_model_file, map_location=torch.device('cpu')) + else: + logging.warning( + "tokenizer_model_file does not end with .model or .nemo, therefore trying to load a pretrained model with this name." + ) + model = nemo_asr.models.ASRModel.from_pretrained(nemo_model_file, map_location=torch.device('cpu')) + + if type(model.tokenizer).__name__ == 'AggregateTokenizer': + is_aggregate_tokenizer = True + else: + is_aggregate_tokenizer = False + + encoding_level = SUPPORTED_MODELS.get(type(model).__name__, None) + if not encoding_level: + logging.warning( + f"Model type '{type(model).__name__}' may not be supported. Would try to train a char-level LM." 
+ ) + encoding_level = 'char' + + tokenizer_nemo = model.tokenizer + del model + + return tokenizer_nemo, encoding_level, is_aggregate_tokenizer + + +def iter_files(source_path, dest_path, tokenizer, encoding_level, is_aggregate_tokenizer, verbose): + if isinstance(dest_path, list): + paths = zip(dest_path, source_path) + else: # dest_path is stdin of KenLM + paths = [(dest_path, path) for path in source_path] + + for dest_path, input_path in paths: + dataset = read_train_file(input_path, is_aggregate_tokenizer=is_aggregate_tokenizer, verbose=verbose) + if encoding_level == "subword": + tokenize_text( + data=dataset, + tokenizer=tokenizer, + path=dest_path, + chunk_size=CHUNK_SIZE, + buffer_size=CHUNK_BUFFER_SIZE, + ) + else: # encoding_level == "char" + if isinstance(dest_path, str): + with open(dest_path, 'w', encoding='utf-8') as f: + for line in dataset: + f.write(line + "\n") + else: # write to stdin of KenLM + for line in dataset: + dest_path.write((line + '\n').encode()) + + +def read_train_file( + path, is_aggregate_tokenizer: bool = False, verbose: int = 0, +): lines_read = 0 - text_dataset = [] - - with open(path, 'r', encoding='utf-8') as f: - reader = tqdm(iter(lambda: f.readline(), ''), desc="Read 0 lines", unit=' lines') - for i, line in enumerate(reader): - if path.endswith('.json'): - line = json.loads(line)['text'] - - line = line.replace("\n", "").strip() - if lowercase: - line = line.lower() - + text_dataset, lang_dataset = [], [] + if path[-8:] == '.json.gz': # for Common Crawl dataset + fin = gzip.open(path, 'r') + else: + fin = open(path, 'r', encoding='utf-8') + + if verbose > 0: + reader = tqdm(iter(lambda: fin.readline(), ''), desc="Read 0 lines", unit=' lines') + else: + reader = fin + + for line in reader: + lang = None + if line: + if path[-8:] == '.json.gz': # for Common Crawl dataset + line = json.loads(line.decode('utf-8'))['text'] + elif path.endswith('.json'): + jline = json.loads(line) + line = jline['text'] + if is_aggregate_tokenizer: + lang = jline['lang'] + + line_list = line.split("\n") + + line = " ".join(line_list) if line: text_dataset.append(line) - + if lang: + lang_dataset.append(lang) lines_read += 1 - if lines_read % 100000 == 0: + if verbose > 0 and lines_read % 100000 == 0: reader.set_description(f"Read {lines_read} lines") - - return text_dataset - - -def tokenize_str(texts, tokenizer, offset): + else: + break + fin.close() + if is_aggregate_tokenizer: + assert len(text_dataset) == len( + lang_dataset + ), f"text_dataset length {len(text_dataset)} and lang_dataset length {len(lang_dataset)} must be the same!" 
+ return list(zip(text_dataset, lang_dataset)) + else: + return [[text] for text in text_dataset] + + +def tokenize_str(texts, tokenizer): tokenized_text = [] for text in texts: - tok_text = tokenizer.text_to_ids(text) - tok_text = [chr(token + offset) for token in tok_text] + tok_text = tokenizer.text_to_ids(*text) + tok_text = [chr(token + DEFAULT_TOKEN_OFFSET) for token in tok_text] tokenized_text.append(tok_text) return tokenized_text -def tokenize_text(data, tokenizer, path, chunk_size=8192, buffer_size=32, token_offset=100): +def tokenize_text(data, tokenizer, path, chunk_size=8192, buffer_size=32): dataset_len = len(data) - print( - f"Chunking {dataset_len} rows into {dataset_len / float(chunk_size):0.4f} tasks (each chunk contains {chunk_size} elements)" - ) - current_step = 0 - if os.path.exists(path): - print(f"Deleting previous file : {path}") + if isinstance(path, str) and os.path.exists(path): os.remove(path) - with Parallel(n_jobs=-2, verbose=10) as parallel: + with Parallel(n_jobs=-2, verbose=0) as parallel: while True: start = current_step * chunk_size end = min((current_step + buffer_size) * chunk_size, dataset_len) tokenized_data = parallel( - delayed(tokenize_str)(data[start : start + chunk_size], tokenizer, token_offset) + delayed(tokenize_str)(data[start : start + chunk_size], tokenizer) for start in range(start, end, chunk_size) ) # Write dataset write_dataset(tokenized_data, path) current_step += len(tokenized_data) - print(f"Finished writing {len(tokenized_data)} chunks to {path}. Current chunk index = {current_step}") + logging.info( + f"Finished writing {len(tokenized_data)} chunks to {path}. Current chunk index = {current_step}" + ) del tokenized_data if end >= dataset_len: break def write_dataset(chunks, path): - basedir = os.path.dirname(path) - - if not os.path.exists(basedir): - os.makedirs(basedir, exist_ok=True) - - with open(path, 'a+', encoding='utf-8') as f: - for chunk_idx in tqdm(range(len(chunks)), desc='Chunk ', total=len(chunks), unit=' chunks'): + if isinstance(path, str): + with open(path, 'a+', encoding='utf-8') as f: + for chunk_idx in tqdm(range(len(chunks)), desc='Chunk ', total=len(chunks), unit=' chunks'): + for text in chunks[chunk_idx]: + line = ' '.join(text) + f.write(f"{line}\n") + else: # write to stdin of KenLM + for chunk_idx in range(len(chunks)): for text in chunks[chunk_idx]: line = ' '.join(text) - f.write(f"{line}\n") + path.write((line + '\n').encode()) diff --git a/scripts/asr_language_modeling/ngram_lm/ngram_merge.py b/scripts/asr_language_modeling/ngram_lm/ngram_merge.py new file mode 100644 index 000000000000..abffc6372518 --- /dev/null +++ b/scripts/asr_language_modeling/ngram_lm/ngram_merge.py @@ -0,0 +1,448 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This script interpolates two ARPA N-gram language models (LMs), +calculates the perplexity of the resulting LM, and makes a binary KenLM from it.
+ +Minimum usage example to interpolate two N-gram language models with weights: +alpha * ngram_a + beta * ngram_b = 2 * ngram_a + 1 * ngram_b + +python3 ngram_merge.py --kenlm_bin_path /workspace/nemo/decoders/kenlm/build/bin \ + --arpa_a /path/ngram_a.kenlm.tmp.arpa \ + --alpha 2 \ + --arpa_b /path/ngram_b.kenlm.tmp.arpa \ + --beta 1 \ + --out_path /path/out + + +Merge two N-gram language models and calculate the perplexity of the result on test_file: +python3 ngram_merge.py --kenlm_bin_path /workspace/nemo/decoders/kenlm/build/bin \ + --ngram_bin_path /workspace/nemo/decoders/ngram-1.3.14/src/bin \ + --arpa_a /path/ngram_a.kenlm.tmp.arpa \ + --alpha 0.5 \ + --arpa_b /path/ngram_b.kenlm.tmp.arpa \ + --beta 0.5 \ + --out_path /path/out \ + --nemo_model_file /path/to/model_tokenizer.nemo \ + --test_file /path/to/test_manifest.json \ + --force +""" + +import argparse +import os +import subprocess +import sys +from typing import Tuple + +import kenlm_utils +import torch + +import nemo.collections.asr as nemo_asr +from nemo.collections.asr.parts.submodules.ctc_beam_decoding import DEFAULT_TOKEN_OFFSET +from nemo.utils import logging + + +class NgramMerge: + def __init__(self, ngram_bin_path): + self.ngram_bin_path = ngram_bin_path + + def ngrammerge(self, arpa_a: str, alpha: float, arpa_b: str, beta: float, arpa_c: str, force: bool) -> str: + """ + Merge two ARPA n-gram language models using the ngrammerge command-line tool. + + Args: + arpa_a (str): Path to the first input ARPA file. + alpha (float): Interpolation weight for the first model. + arpa_b (str): Path to the second input ARPA file. + beta (float): Interpolation weight for the second model. + arpa_c (str): Path to the output ARPA file. + force (bool): Whether to overwrite existing output files. + + Returns: + str: Path to the merged model file in OpenGrm binary (.mod) format. + """ + mod_a = arpa_a + ".mod" + mod_b = arpa_b + ".mod" + mod_c = arpa_c + ".mod" + if os.path.isfile(mod_c) and not force: + logging.info("File " + mod_c + " exists. Skipping.") + else: + sh_args = [ + os.path.join(self.ngram_bin_path, "ngrammerge"), + "--alpha=" + str(alpha), + "--beta=" + str(beta), + "--normalize", + # "--use_smoothing", + mod_a, + mod_b, + mod_c, + ] + logging.info( + "\n" + + str(subprocess.run(sh_args, capture_output=False, text=True, stdout=sys.stdout, stderr=sys.stderr,)) + + "\n", + ) + return mod_c + + def arpa2mod(self, arpa_path: str, force: bool): + """ + This function reads an ARPA n-gram model and converts it to a binary format. The binary model is saved to the same directory as the ARPA model with a ".mod" extension. If the binary model file already exists and the force argument is False, the function skips conversion and returns a message. Otherwise, it executes the command to create a binary model using the subprocess.run method. + + Parameters: + arpa_path (string): The file path to the ARPA n-gram model. + force (bool): If True, the function will convert the ARPA model to binary even if the binary file already exists. If False and the binary file exists, the function will skip the conversion. + Returns: + If the binary model file already exists and the force argument is False, returns a message indicating that the file exists and the conversion is skipped. + Otherwise, returns a subprocess.CompletedProcess object, which contains information about the executed command. The subprocess's output and error streams are redirected to stdout and stderr, respectively.
+ """ + mod_path = arpa_path + ".mod" + if os.path.isfile(mod_path) and not force: + return "File " + mod_path + " exists. Skipping." + else: + sh_args = [ + os.path.join(self.ngram_bin_path, "ngramread"), + "--ARPA", + arpa_path, + mod_path, + ] + return subprocess.run(sh_args, capture_output=False, text=True, stdout=sys.stdout, stderr=sys.stderr,) + + def merge( + self, arpa_a: str, alpha: float, arpa_b: str, beta: float, out_path: str, force: bool + ) -> Tuple[str, str]: + """ + Merges two ARPA language models using the ngrammerge tool. + + Args: + arpa_a (str): Path to the first ARPA language model file. + alpha (float): Interpolation weight for the first model. + arpa_b (str): Path to the second ARPA language model file. + beta (float): Interpolation weight for the second model. + out_path (str): Path to the output directory for the merged ARPA model. + force (bool): Whether to force overwrite of existing files. + + Returns: + Tuple[str, str]: A tuple containing the path to the merged binary language model file and the path to the + merged ARPA language model file. + """ + logging.info("\n" + str(self.arpa2mod(arpa_a, force)) + "\n") + + logging.info("\n" + str(self.arpa2mod(arpa_b, force)) + "\n") + arpa_c = os.path.join(out_path, f"{os.path.split(arpa_a)[1]}-{alpha}-{os.path.split(arpa_b)[1]}-{beta}.arpa",) + mod_c = self.ngrammerge(arpa_a, alpha, arpa_b, beta, arpa_c, force) + return mod_c, arpa_c + + def perplexity(self, ngram_mod: str, test_far: str) -> str: + """ + Calculates perplexity of a given ngram model on a test file. + + Args: + ngram_mod (str): The path to the ngram model file. + test_far (str): The path to the test file. + + Returns: + str: A string representation of the perplexity calculated. + + Raises: + AssertionError: If the subprocess to calculate perplexity returns a non-zero exit code. + + Example: + >>> perplexity("/path/to/ngram_model", "/path/to/test_file") + 'Perplexity: 123.45' + """ + sh_args = [ + os.path.join(self.ngram_bin_path, "ngramperplexity"), + "--v=1", + ngram_mod, + test_far, + ] + ps = subprocess.Popen(sh_args, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = ps.communicate() + exit_code = ps.wait() + command = " ".join(sh_args) + assert ( + exit_code == 0 + ), f"Exit_code must be 0.\n bash command: {command} \n stdout: {stdout} \n stderr: {stderr}" + perplexity_out = "\n".join(stdout.split("\n")[-6:-1]) + return perplexity_out + + def make_arpa(self, ngram_mod: str, ngram_arpa: str, force: bool): + """ + Converts an ngram model in binary format to ARPA format. + + Args: + - ngram_mod (str): The path to the ngram model in binary format. + - ngram_arpa (str): The desired path for the ARPA format output file. + - force (bool): If True, the ARPA format file will be generated even if it already exists. + + Returns: + - Tuple[bytes, bytes] + + Raises: + - AssertionError: If the shell command execution returns a non-zero exit code. + - FileNotFoundError: If the binary ngram model file does not exist. + """ + if os.path.isfile(ngram_arpa) and not force: + logging.info("File " + ngram_arpa + " exists. 
Skipping.") + return None + else: + sh_args = [ + os.path.join(self.ngram_bin_path, "ngramprint"), + "--ARPA", + ngram_mod, + ngram_arpa, + ] + return subprocess.run(sh_args, capture_output=False, text=True, stdout=sys.stdout, stderr=sys.stderr,) + + def test_perplexity( + self, mod_c: str, symbols: str, test_txt: str, nemo_model_file: str, tmp_path: str, force: bool + ) -> str: + """ + Tests the perplexity of a given ngram model on a test file. + + Args: + mod_c (str): The path to the ngram model file. + symbols (str): The path to the symbol table file. + test_txt (str): The path to the test text file. + nemo_model_file (str): The path to the NeMo model file. + tmp_path (str): The path to the temporary directory where the test far file will be created. + force (bool): If True, overwrites any existing far file. + + Returns: + str: A string representation of the perplexity calculated. + + Example: + >>> test_perplexity("/path/to/ngram_model", "/path/to/symbol_table", "/path/to/test_file", "/path/to/tokenizer_model", "/path/to/tmp_dir", True) + 'Perplexity: 123.45' + """ + + test_far = farcompile(symbols, test_txt, tmp_path, nemo_model_file, force) + res_p = self.perplexity(mod_c, test_far) + return res_p + + +def farcompile(symbols: str, text_file: str, tmp_path: str, nemo_model_file: str, force: bool,) -> str: + """ + Compiles a text file into a FAR file using the given symbol table or tokenizer. + + Args: + symbols (str): The path to the symbol table file. + text_file (str): The path to the text file to compile. + tmp_path (str): The path to the temporary directory where the test far file will be created. + nemo_model_file (str): The path to the NeMo model file (.nemo). + force (bool): If True, overwrites any existing FAR file. + + Returns: + test_far (str): The path to the resulting FAR file. + + Example: + >>> farcompile("/path/to/symbol_table", "/path/to/text_file", "/path/to/far_file", "/path/to/tokenizer_model", "/path/to/nemo_model", True) + """ + test_far = os.path.join(tmp_path, os.path.split(text_file)[1] + ".far") + + if os.path.isfile(test_far) and not force: + logging.info("File " + test_far + " exists. Skipping.") + return None + else: + sh_args = [ + "farcompilestrings", + "--generate_keys=10", + "--fst_type=compact", + "--symbols=" + symbols, + "--keep_symbols", + ">", + test_far, + ] + + tokenizer, encoding_level, is_aggregate_tokenizer = kenlm_utils.setup_tokenizer(nemo_model_file) + + ps = subprocess.Popen( + " ".join(sh_args), shell=True, stdin=subprocess.PIPE, stdout=sys.stdout, stderr=sys.stderr, + ) + + kenlm_utils.iter_files( + source_path=[text_file], + dest_path=ps.stdin, + tokenizer=tokenizer, + encoding_level=encoding_level, + is_aggregate_tokenizer=is_aggregate_tokenizer, + verbose=1, + ) + stdout, stderr = ps.communicate() + + exit_code = ps.returncode + + command = " ".join(sh_args) + assert ( + exit_code == 0 + ), f"Exit_code must be 0.\n bash command: {command} \n stdout: {stdout} \n stderr: {stderr}" + return test_far + + +def make_kenlm(kenlm_bin_path: str, ngram_arpa: str, force: bool): + """ + Builds a language model from an ARPA format file using the KenLM toolkit. + + Args: + - kenlm_bin_path (str): The path to the KenLM toolkit binary. + - ngram_arpa (str): The path to the ARPA format file. + - force (bool): If True, the KenLM language model will be generated even if it already exists. + + Raises: + - AssertionError: If the shell command execution returns a non-zero exit code. 
+ - FileNotFoundError: If the KenLM binary or ARPA format file does not exist. + """ + ngram_kenlm = ngram_arpa + ".kenlm" + if os.path.isfile(ngram_kenlm) and not force: + logging.info("File " + ngram_kenlm + " exists. Skipping.") + return None + else: + sh_args = [kenlm_bin_path, "trie", "-i", ngram_arpa, ngram_kenlm] + return subprocess.run(sh_args, capture_output=False, text=True, stdout=sys.stdout, stderr=sys.stderr,) + + +def make_symbol_list(nemo_model_file, symbols, force): + """ + Function: make_symbol_list + + Create a symbol table for the input tokenizer model file. + + Args: + nemo_model_file (str): Path to the NeMo model file. + symbols (str): Path to the file where symbol list will be saved. + force (bool): Flag to force creation of symbol list even if it already exists. + + Returns: + None + + Raises: + None + """ + if os.path.isfile(symbols) and not force: + logging.info("File " + symbols + " exists. Skipping.") + else: + if nemo_model_file.endswith('.nemo'): + asr_model = nemo_asr.models.ASRModel.restore_from(nemo_model_file, map_location=torch.device('cpu')) + vocab_size = len(asr_model.decoder.vocabulary) + else: + logging.warning( + "nemo_model_file does not end with .nemo, therefore trying to load a pretrained model with this name." + ) + asr_model = nemo_asr.models.ASRModel.from_pretrained(nemo_model_file, map_location=torch.device('cpu')) + vocab_size = len(asr_model.decoder.vocabulary) + + vocab = [chr(idx + DEFAULT_TOKEN_OFFSET) for idx in range(vocab_size)] + with open(symbols, "w", encoding="utf-8") as f: + for i, v in enumerate(vocab): + f.write(v + " " + str(i) + "\n") + + +def main( + kenlm_bin_path: str, + ngram_bin_path: str, + arpa_a: str, + alpha: float, + arpa_b: str, + beta: float, + out_path: str, + test_file: str, + symbols: str, + nemo_model_file: str, + force: bool, +) -> None: + """ + Entry point function for merging ARPA format language models, testing perplexity, creating symbol list, + and making ARPA and Kenlm models. + + Args: + - kenlm_bin_path (str): The path to the Kenlm binary. + - arpa_a (str): The path to the first ARPA format language model. + - alpha (float): The weight given to the first language model during merging. + - arpa_b (str): The path to the second ARPA format language model. + - beta (float): The weight given to the second language model during merging. + - out_path (str): The path where the output files will be saved. + - test_file (str): The path to the file on which perplexity needs to be calculated. + - symbols (str): The path to the file where symbol list for the tokenizer model will be saved. + - nemo_model_file (str): The path to the NeMo model file. + - force (bool): If True, overwrite existing files, otherwise skip the operations. 
+ + Returns: + - None + """ + nm = NgramMerge(ngram_bin_path) + mod_c, arpa_c = nm.merge(arpa_a, alpha, arpa_b, beta, out_path, force) + + if test_file and nemo_model_file: + if not symbols: + symbols = os.path.join(out_path, os.path.split(nemo_model_file)[1] + ".syms") + make_symbol_list(nemo_model_file, symbols, force) + test_p = nm.test_perplexity(mod_c, symbols, test_file, nemo_model_file, out_path, force) + logging.info("Perplexity summary:" + test_p) + + logging.info("Making ARPA and Kenlm model " + arpa_c) + out = nm.make_arpa(mod_c, arpa_c, force) + if out: + logging.info("\n" + str(out) + "\n") + + out = make_kenlm(kenlm_bin_path, arpa_c, force) + if out: + logging.info("\n" + str(out) + "\n") + + +def _parse_args(): + parser = argparse.ArgumentParser( + description="Interpolate ARPA N-gram language models and make KenLM binary model to be used with beam search decoder of ASR models." + ) + parser.add_argument( + "--kenlm_bin_path", required=True, type=str, help="The path to the bin folder of KenLM library.", + ) # Use /workspace/nemo/decoders/kenlm/build/bin if installed it with scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh + parser.add_argument( + "--ngram_bin_path", required=True, type=str, help="The path to the bin folder of OpenGrm Ngram library.", + ) # Use /workspace/nemo/decoders/ngram-1.3.14/src/bin if installed it with scripts/installers/install_opengrm.sh + parser.add_argument("--arpa_a", required=True, type=str, help="Path to the arpa_a") + parser.add_argument("--alpha", required=True, type=float, help="Weight of arpa_a") + parser.add_argument("--arpa_b", required=True, type=str, help="Path to the arpa_b") + parser.add_argument("--beta", required=True, type=float, help="Weight of arpa_b") + parser.add_argument( + "--out_path", required=True, type=str, help="Path to write tmp and resulted files.", + ) + parser.add_argument( + "--test_file", + required=False, + type=str, + default=None, + help="Path to test file to count perplexity if provided.", + ) + parser.add_argument( + "--symbols", + required=False, + type=str, + default=None, + help="Path to symbols (.syms) file . Could be calculated if it is not provided. Use as: --symbols /path/to/earnest.syms", + ) + parser.add_argument( + "--nemo_model_file", + required=False, + type=str, + default=None, + help="The path to '.nemo' file of the ASR model, or name of a pretrained NeMo model", + ) + parser.add_argument("--force", "-f", action="store_true", help="Whether to recompile and rewrite all files") + return parser.parse_args() + + +if __name__ == "__main__": + main(**vars(_parse_args())) diff --git a/scripts/asr_language_modeling/ngram_lm/train_kenlm.py b/scripts/asr_language_modeling/ngram_lm/train_kenlm.py index 6536a7f5eadd..d23141722653 100644 --- a/scripts/asr_language_modeling/ngram_lm/train_kenlm.py +++ b/scripts/asr_language_modeling/ngram_lm/train_kenlm.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -23,144 +23,159 @@ # You need to install the KenLM library and also the beam search decoders to use this feature. Please refer # to 'scripts/ngram_lm/install_beamsearch_decoders.sh' on how to install them. 
diff --git a/scripts/asr_language_modeling/ngram_lm/train_kenlm.py b/scripts/asr_language_modeling/ngram_lm/train_kenlm.py
index 6536a7f5eadd..d23141722653 100644
--- a/scripts/asr_language_modeling/ngram_lm/train_kenlm.py
+++ b/scripts/asr_language_modeling/ngram_lm/train_kenlm.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -23,144 +23,159 @@
 # You need to install the KenLM library and also the beam search decoders to use this feature. Please refer
 # to 'scripts/ngram_lm/install_beamsearch_decoders.sh' on how to install them.
 #
-# USAGE: python train_kenlm.py --nemo_model_file <path to the .nemo file of the model> \
-#                              --train_file <path to the training text or JSON manifest file> \
-#                              --kenlm_model_file <path to store the binary KenLM model> \
-#                              --ngram_length <order of N-gram model> \
-#                              --preserve_arpa
+# USAGE: python train_kenlm.py nemo_model_file=<path to the .nemo file of the model> \
+#                              train_paths=<list of paths to the training text or JSON manifest files> \
+#                              kenlm_bin_path=<path to the bin folder of KenLM library> \
+#                              kenlm_model_file=<path to store the binary KenLM model> \
+#                              ngram_length=<order of N-gram model> \
 #
 # After training is done, the binary LM model is stored at the path specified by '--kenlm_model_file'.
 # You may find more info on how to use this script at:
 # https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/asr_language_modeling.html
 
-import argparse
 import logging
 import os
 import subprocess
 import sys
+from dataclasses import dataclass, field
+from glob import glob
+from typing import List
 
-import kenlm_utils
-import torch
+from omegaconf import MISSING
+from scripts.asr_language_modeling.ngram_lm import kenlm_utils
 
-import nemo.collections.asr as nemo_asr
-from nemo.collections.asr.parts.submodules.ctc_beam_decoding import DEFAULT_TOKEN_OFFSET
+from nemo.core.config import hydra_runner
 from nemo.utils import logging
 
 """
 NeMo's beam search decoders only support char-level encodings. In order to make it work with BPE-level encodings, we
 use a trick to encode the sub-word tokens of the training data as unicode characters and train a char-level KenLM.
-DEFAULT_TOKEN_OFFSET is the offset in the unicode table to be used to encode the BPE sub-words. This encoding scheme reduces
-the required memory significantly, and the LM and its binary blob format require less storage space.
 """
 
-CHUNK_SIZE = 8192
-CHUNK_BUFFER_SIZE = 512
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description='Train an N-gram language model with KenLM to be used with beam search decoder of ASR models.'
-    )
-    parser.add_argument(
-        "--train_file",
-        required=True,
-        type=str,
-        help="Path to the training file, it can be a text file or JSON manifest",
-    )
-    parser.add_argument(
-        "--nemo_model_file",
-        required=True,
-        type=str,
-        help="The path of the '.nemo' file of the ASR model or name of a pretrained model",
-    )
-    parser.add_argument(
-        "--kenlm_model_file", required=True, type=str, help="The path to store the KenLM binary model file"
-    )
-    parser.add_argument("--ngram_length", required=True, type=int, help="The order of N-gram LM")
-    parser.add_argument("--kenlm_bin_path", required=True, type=str, help="The path to the bin folder of KenLM")
-    parser.add_argument(
-        "--do_lowercase", action='store_true', help="Whether to apply lower case conversion on the training text"
-    )
-    parser.add_argument(
-        '--preserve_arpa', required=False, action='store_true', help='Whether to preserve the intermediate ARPA file.'
-    )
-    args = parser.parse_args()
-
-    """ TOKENIZER SETUP """
-    logging.info(f"Loading nemo model '{args.nemo_model_file}' ...")
-
-    if args.nemo_model_file.endswith('.nemo'):
-        model = nemo_asr.models.ASRModel.restore_from(args.nemo_model_file, map_location=torch.device('cpu'))
-    else:
-        logging.warning(
-            "nemo_model_file does not end with .nemo, therefore trying to load a pretrained model with this name."
-        )
-        model = nemo_asr.models.ASRModel.from_pretrained(args.nemo_model_file, map_location=torch.device('cpu'))
 
-    encoding_level = kenlm_utils.SUPPORTED_MODELS.get(type(model).__name__, None)
-    if not encoding_level:
-        logging.warning(
-            f"Model type '{type(model).__name__}' may not be supported. Would try to train a char-level LM."
-        )
-        encoding_level = 'char'
+@dataclass
+class TrainKenlmConfig:
+    """
+    Train an N-gram language model with KenLM to be used with beam search decoder of ASR models.
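+
+    A hypothetical invocation showing the new pruning and caching fields (all paths
+    are placeholders; the field names mirror the dataclass attributes defined below):
+        python train_kenlm.py nemo_model_file=/path/to/model.nemo \
+            train_paths=[/path/to/train_manifest.json] \
+            kenlm_model_file=/path/to/model.kenlm ngram_length=4 \
+            kenlm_bin_path=/workspace/nemo/decoders/kenlm/build/bin \
+            ngram_prune=[0,0,1] cache_path=/path/to/cache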
+ """ + + train_paths: List[ + str + ] = MISSING # List of training files or folders. Files can be a plain text file or ".json" manifest or ".json.gz". Example: [/path/to/manifest/file,/path/to/folder] + + nemo_model_file: str = MISSING # The path to '.nemo' file of the ASR model, or name of a pretrained NeMo model + kenlm_model_file: str = MISSING # The path to store the KenLM binary model file + ngram_length: int = MISSING # The order of N-gram LM + kenlm_bin_path: str = MISSING # The path to the bin folder of KenLM. + + preserve_arpa: bool = False # Whether to preserve the intermediate ARPA file. + ngram_prune: List[int] = field( + default_factory=lambda: [0] + ) # List of digits to prune Ngram. Example: [0,0,1]. See Pruning section on the https://kheafield.com/code/kenlm/estimation + cache_path: str = "" # Cache path to save tokenized files. + verbose: int = 1 # Verbose level, default is 1. + + +@hydra_runner(config_path=None, config_name='TrainKenlmConfig', schema=TrainKenlmConfig) +def main(args: TrainKenlmConfig): + train_paths = kenlm_utils.get_train_list(args.train_paths) + + if isinstance(args.ngram_prune, str): + args.ngram_prune = [args.ngram_prune] + + tokenizer, encoding_level, is_aggregate_tokenizer = kenlm_utils.setup_tokenizer(args.nemo_model_file) - """ DATASET SETUP """ - logging.info(f"Encoding the train file '{args.train_file}' ...") - dataset = kenlm_utils.read_train_file(args.train_file, lowercase=args.do_lowercase) - encoded_train_file = f"{args.kenlm_model_file}.tmp.txt" if encoding_level == "subword": - kenlm_utils.tokenize_text( - dataset, - model.tokenizer, - path=encoded_train_file, - chunk_size=CHUNK_SIZE, - buffer_size=CHUNK_BUFFER_SIZE, - token_offset=DEFAULT_TOKEN_OFFSET, - ) - # --discount_fallback is needed for training KenLM for BPE-based models - discount_arg = "--discount_fallback" + discount_arg = "--discount_fallback" # --discount_fallback is needed for training KenLM for BPE-based models else: - with open(encoded_train_file, 'w', encoding='utf-8') as f: - for line in dataset: - f.write(f"{line}\n") - discount_arg = "" - del model - arpa_file = f"{args.kenlm_model_file}.tmp.arpa" """ LMPLZ ARGUMENT SETUP """ kenlm_args = [ os.path.join(args.kenlm_bin_path, 'lmplz'), "-o", - f"{args.ngram_length}", - "--text", - encoded_train_file, + str(args.ngram_length), "--arpa", arpa_file, discount_arg, - ] + "--prune", + ] + [str(n) for n in args.ngram_prune] + + if args.cache_path: + if not os.path.exists(args.cache_path): + os.makedirs(args.cache_path, exist_ok=True) + + """ DATASET SETUP """ + encoded_train_files = [] + for file_num, train_file in enumerate(train_paths): + logging.info(f"Encoding the train file '{train_file}' number {file_num+1} out of {len(train_paths)} ...") + + cached_files = glob(os.path.join(args.cache_path, os.path.split(train_file)[1]) + "*") + encoded_train_file = os.path.join(args.cache_path, os.path.split(train_file)[1] + f"_{file_num}.tmp.txt") + if ( + cached_files and cached_files[0] != encoded_train_file + ): # cached_files exists but has another file name: f"_{file_num}.tmp.txt" + os.rename(cached_files[0], encoded_train_file) + logging.info("Rename", cached_files[0], "to", encoded_train_file) + + encoded_train_files.append(encoded_train_file) + + kenlm_utils.iter_files( + source_path=train_paths, + dest_path=encoded_train_files, + tokenizer=tokenizer, + encoding_level=encoding_level, + is_aggregate_tokenizer=is_aggregate_tokenizer, + verbose=args.verbose, + ) - ret = subprocess.run(kenlm_args, capture_output=False, text=True, 
-    if ret.returncode != 0:
+        first_process_args = ["cat"] + encoded_train_files
+        first_process = subprocess.Popen(first_process_args, stdout=subprocess.PIPE, stderr=sys.stderr)
+
+        logging.info(f"Running lmplz command \n\n{' '.join(kenlm_args)}\n\n")
+        kenlm_p = subprocess.run(
+            kenlm_args,
+            stdin=first_process.stdout,
+            capture_output=False,
+            text=True,
+            stdout=sys.stdout,
+            stderr=sys.stderr,
+        )
+        first_process.wait()
+
+    else:
+        logging.info(f"Running lmplz command \n\n{' '.join(kenlm_args)}\n\n")
+        kenlm_p = subprocess.Popen(kenlm_args, stdout=sys.stdout, stdin=subprocess.PIPE, stderr=sys.stderr)
+
+        kenlm_utils.iter_files(
+            source_path=train_paths,
+            dest_path=kenlm_p.stdin,
+            tokenizer=tokenizer,
+            encoding_level=encoding_level,
+            is_aggregate_tokenizer=is_aggregate_tokenizer,
+            verbose=args.verbose,
+        )
+
+        kenlm_p.communicate()
+
+    if kenlm_p.returncode != 0:
         raise RuntimeError("Training KenLM was not successful!")
+
     """ BINARY BUILD """
-    logging.info(f"Running binary_build command \n\n{' '.join(kenlm_args)}\n\n")
+
     kenlm_args = [
         os.path.join(args.kenlm_bin_path, "build_binary"),
         "trie",
         arpa_file,
         args.kenlm_model_file,
     ]
+    logging.info(f"Running binary_build command \n\n{' '.join(kenlm_args)}\n\n")
     ret = subprocess.run(kenlm_args, capture_output=False, text=True, stdout=sys.stdout, stderr=sys.stderr)
 
     if ret.returncode != 0:
         raise RuntimeError("Training KenLM was not successful!")
 
-    os.remove(encoded_train_file)
-    logging.info(f"Deleted the temporary encoded training file '{encoded_train_file}'.")
-
     if not args.preserve_arpa:
         os.remove(arpa_file)
         logging.info(f"Deleted the arpa file '{arpa_file}'.")
diff --git a/scripts/installers/Dockerfile.ngramtools b/scripts/installers/Dockerfile.ngramtools
new file mode 100644
index 000000000000..49d3c12b3529
--- /dev/null
+++ b/scripts/installers/Dockerfile.ngramtools
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Use this Dockerfile to install KenLM, the OpenSeq2Seq decoder, the Flashlight decoder, and the OpenGRM Ngram tool into the container
+
+# How to use? Build it from the NeMo root folder:
+# 1. git clone https://github.com/NVIDIA/NeMo.git && cd NeMo
+# 2. DOCKER_BUILDKIT=1 docker build -t nemo:23.01.1 -f ./scripts/installers/Dockerfile.ngramtools .
+
+from nvcr.io/nvidia/nemo:23.01
+
+WORKDIR /workspace/nemo
+
+COPY scripts/. /workspace/nemo/scripts/
+
+RUN /bin/bash scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh
+
+RUN /bin/bash scripts/installers/install_opengrm.sh
diff --git a/scripts/installers/install_opengrm.sh b/scripts/installers/install_opengrm.sh
new file mode 100755
index 000000000000..e3e11e8d1db9
--- /dev/null
+++ b/scripts/installers/install_opengrm.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script installs OpenFst and the NGram tools from the OpenGRM library.
+# Optionally, you can specify a path to install them to as the first positional argument: scripts/installers/install_opengrm.sh /path/to/install/openfst
+# Alternatively, on Debian Linux you can use: sudo apt install libngram-tools
+
+DECODERS_PATH=/workspace/nemo/decoders  # Path to decoders folder: /workspace/nemo/decoders if you use NeMo/Dockerfile
+if [ "$#" -eq 1 ]
+then
+  DECODERS_PATH=$1
+fi
+cd $DECODERS_PATH
+
+# Install OpenGrm OpenFst
+wget https://www.openfst.org/twiki/pub/FST/FstDownload/openfst-1.8.2.tar.gz --no-check-certificate && tar xvzf openfst-1.8.2.tar.gz && cd openfst-1.8.2 && ./configure --enable-grm && make -j4 && make -j4 install && cd ..
+
+# Install OpenGrm Ngram
+OPENFSTPREFIX=$DECODERS_PATH/openfst-1.8.2/src && wget https://www.opengrm.org/twiki/pub/GRM/NGramDownload/ngram-1.3.14.tar.gz --no-check-certificate && tar xvzf ngram-1.3.14.tar.gz && cd ngram-1.3.14 && LDFLAGS="-L${OPENFSTPREFIX}/lib" CXXFLAGS="-I${OPENFSTPREFIX}/include" ./configure --prefix ${OPENFSTPREFIX} && make -j4 && make -j4 install && cd ..
diff --git a/scripts/installers/setup_os2s_decoders.py b/scripts/installers/setup_os2s_decoders.py
new file mode 100644
index 000000000000..6dfe1bef54e8
--- /dev/null
+++ b/scripts/installers/setup_os2s_decoders.py
@@ -0,0 +1,138 @@
+# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Script to build and install the decoder package.
+
+It is used by scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh to install
+KenLM and the OpenSeq2Seq decoder.
+
+You can set the maximum order of the KenLM model by changing the -DKENLM_MAX_ORDER=10 argument.
+"""
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import glob
+import multiprocessing.pool
+import os
+import platform
+import sys
+
+from setuptools import Extension, distutils, setup
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--num_processes", default=1, type=int, help="Number of cpu processes to build package. (default: %(default)d)"
+)
+args = parser.parse_known_args()
+
+# reconstruct sys.argv to pass to setup below
+sys.argv = [sys.argv[0]] + args[1]
+
+
+# monkey-patch for parallel compilation
+# See: https://stackoverflow.com/a/13176803
+def parallelCCompile(
+    self,
+    sources,
+    output_dir=None,
+    macros=None,
+    include_dirs=None,
+    debug=0,
+    extra_preargs=None,
+    extra_postargs=None,
+    depends=None,
+):
+    # those lines are copied from distutils.ccompiler.CCompiler directly
+    macros, objects, extra_postargs, pp_opts, build = self._setup_compile(
+        output_dir, macros, include_dirs, sources, depends, extra_postargs
+    )
+    cc_args = self._get_cc_args(pp_opts, debug, extra_preargs)
+
+    # parallel code
+    def _single_compile(obj):
+        try:
+            src, ext = build[obj]
+        except KeyError:
+            return
+        self._compile(obj, src, ext, cc_args, extra_postargs, pp_opts)
+
+    # convert to list, imap is evaluated on-demand
+    thread_pool = multiprocessing.pool.ThreadPool(args[0].num_processes)
+    list(thread_pool.imap(_single_compile, objects))
+    return objects
+
+
+def compile_test(header, library):
+    dummy_path = os.path.join(os.path.dirname(__file__), "dummy")
+    command = (
+        "bash -c \"g++ -include "
+        + header
+        + " -l"
+        + library
+        + " -x c++ - <<<'int main() {}' -o "
+        + dummy_path
+        + " >/dev/null 2>/dev/null && rm "
+        + dummy_path
+        + " 2>/dev/null\""
+    )
+    return os.system(command) == 0
+
+
+# hack compile to support parallel compiling
+distutils.ccompiler.CCompiler.compile = parallelCCompile
+
+FILES = glob.glob('kenlm/util/*.cc') + glob.glob('kenlm/lm/*.cc') + glob.glob('kenlm/util/double-conversion/*.cc')
+
+FILES += glob.glob('openfst-1.6.3/src/lib/*.cc')
+
+FILES = [fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith('unittest.cc'))]
+
+LIBS = ['stdc++']
+if platform.system() != 'Darwin':
+    LIBS.append('rt')
+
+ARGS = ['-O3', '-DNDEBUG', '-DKENLM_MAX_ORDER=10', '-std=c++11']
+
+if compile_test('zlib.h', 'z'):
+    ARGS.append('-DHAVE_ZLIB')
+    LIBS.append('z')
+
+if compile_test('bzlib.h', 'bz2'):
+    ARGS.append('-DHAVE_BZLIB')
+    LIBS.append('bz2')
+
+if compile_test('lzma.h', 'lzma'):
+    ARGS.append('-DHAVE_XZLIB')
+    LIBS.append('lzma')
+
+os.system('swig -python -c++ ./decoders.i')
+
+decoders_module = [
+    Extension(
+        name='_swig_decoders',
+        sources=FILES + glob.glob('*.cxx') + glob.glob('*.cpp'),
+        language='c++',
+        include_dirs=['.', 'kenlm', 'openfst-1.6.3/src/include', 'ThreadPool',],
+        libraries=LIBS,
+        extra_compile_args=ARGS,
+    )
+]
+
+setup(
+    name='ctc_decoders',
+    version='1.1',
+    description="""CTC decoders""",
+    ext_modules=decoders_module,
+    py_modules=['ctc_decoders', 'swig_decoders'],
+)
From 6593652c5ad7d93ed8666d261e798acbc3d4b2b2 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Thu, 4 May 2023 14:54:22 -0700
Subject: [PATCH 08/62] temp rtd fix (#6568) (#6569)

Signed-off-by: Abhinav Khattar

Co-authored-by: Abhinav Khattar
---
 requirements/requirements_docs.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements/requirements_docs.txt b/requirements/requirements_docs.txt
index b7bfd6956905..34406bd2a366 100644
--- a/requirements/requirements_docs.txt
+++ b/requirements/requirements_docs.txt
@@ -9,4 +9,5 @@ sphinx-book-theme
 sphinx-copybutton
 sphinxcontrib-bibtex
 sphinxext-opengraph
+urllib3<2.0.0
 wrapt
From 7e0ab3f197ec1543ee56751c053fea9d3c85d3a1 Mon Sep 17 00:00:00 2001
From: Ryan Langman
Date: Fri, 5 May 2023 08:32:27 -0700
Subject: [PATCH 09/62] [TTS] Add script for mapping speaker names to indices (#6509)

Signed-off-by: Ryan
---
 .../tts/create_speaker_map.py | 95 +++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 scripts/dataset_processing/tts/create_speaker_map.py

diff --git a/scripts/dataset_processing/tts/create_speaker_map.py b/scripts/dataset_processing/tts/create_speaker_map.py
new file mode 100644
index 000000000000..027a5c6e3e35
--- /dev/null
+++ b/scripts/dataset_processing/tts/create_speaker_map.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script takes a list of TTS manifests and creates a JSON mapping the input speaker names to
+unique indices for multi-speaker TTS training.
+
+To ensure that speaker names are unique across datasets, it is recommended that you prepend the speaker
+names in your manifest with the name of the dataset.
+
+$ python <nemo_root_path>/scripts/dataset_processing/tts/create_speaker_map.py \
+    --manifest_path=manifest1.json \
+    --manifest_path=manifest2.json \
+    --speaker_map_path=speakers.json
+
+Example output:
+
+{
+    "vctk_p225": 0,
+    "vctk_p226": 1,
+    "vctk_p227": 2,
+    ...
+}
+
+"""
+
+import argparse
+import json
+from pathlib import Path
+
+from nemo.collections.asr.parts.utils.manifest_utils import read_manifest
+
+
+def get_args():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description="Create mapping from speaker names to numerical speaker indices.",
+    )
+    parser.add_argument(
+        "--manifest_path", required=True, type=Path, action="append", help="Path to training manifest(s).",
+    )
+    parser.add_argument(
+        "--speaker_map_path", required=True, type=Path, help="Path for output speaker index JSON",
+    )
+    parser.add_argument(
+        "--overwrite", action="store_true", help="Whether to overwrite the output speaker file if it exists.",
+    )
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = get_args()
+    manifest_paths = args.manifest_path
+    speaker_map_path = args.speaker_map_path
+    overwrite = args.overwrite
+
+    for manifest_path in manifest_paths:
+        if not manifest_path.exists():
+            raise ValueError(f"Manifest {manifest_path} does not exist.")
+
+    if speaker_map_path.exists():
+        if overwrite:
+            print(f"Will overwrite existing speaker path: {speaker_map_path}")
+        else:
+            raise ValueError(f"Speaker path already exists: {speaker_map_path}")
+
+    speaker_set = set()
+    for manifest_path in manifest_paths:
+        entries = read_manifest(manifest_path)
+        for entry in entries:
+            speaker = str(entry["speaker"])
+            speaker_set.add(speaker)
+
+    speaker_list = list(speaker_set)
+    speaker_list.sort()
+    speaker_index_map = {speaker_list[i]: i for i in range(len(speaker_list))}
+
+    with open(speaker_map_path, 'w', encoding="utf-8") as stats_f:
+        json.dump(speaker_index_map, stats_f, indent=4)
+
+
+if __name__ == "__main__":
+    main()
From 0084c0436f7fd3f3f1dffa58fdbda36600d35f02 Mon Sep 17 00:00:00 2001
From: Nikolay Karpov
Date: Fri, 5 May 2023 22:17:02 +0400
Subject: [PATCH 10/62] whitespace (#6574)

Signed-off-by: Nikolay Karpov
---
 nemo/collections/asr/parts/utils/transcribe_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py b/nemo/collections/asr/parts/utils/transcribe_utils.py
index 0e72ed8fa16d..8101bee96723 100644
--- a/nemo/collections/asr/parts/utils/transcribe_utils.py
+++ b/nemo/collections/asr/parts/utils/transcribe_utils.py
@@ -460,7 +460,7 @@ def __init__(self, punctuation_marks: str):
     def separate_punctuation(self, lines: List[str]) -> List[str]:
         if self.regex_punctuation is not None:
             return [
-                self.regex_extra_space.sub('', self.regex_punctuation.sub(r' \1 ', line)).strip() for line in lines
+                self.regex_extra_space.sub(' ', self.regex_punctuation.sub(r' \1 ', line)).strip() for line in lines
             ]
         else:
             return lines
@@ -470,7 +470,7 @@ def do_lowercase(self, lines: List[str]) -> List[str]:
 
     def rm_punctuation(self, lines: List[str]) -> List[str]:
         if self.regex_punctuation is not None:
-            return [self.regex_extra_space.sub('', self.regex_punctuation.sub(' ', line)).strip() for line in lines]
+            return [self.regex_extra_space.sub(' ', self.regex_punctuation.sub(' ', line)).strip() for line in lines]
         else:
             return lines
From fd6c75e506de57e06ee4ba636680ad7945868e6e Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Fri, 5 May 2023 11:21:53 -0700
Subject: [PATCH 11/62] Update manifest.py for speedup (#6565) (#6573)

* Update manifest.py

Re-order the checks for faster processing of audio filepaths that are already absolute paths

* Update manifest.py

---------

Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com>
Co-authored-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com>
Co-authored-by: Vahid Noroozi
---
 nemo/collections/common/parts/preprocessing/manifest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nemo/collections/common/parts/preprocessing/manifest.py b/nemo/collections/common/parts/preprocessing/manifest.py
index c1da97c63bcb..9fd69801ec0d 100644
--- a/nemo/collections/common/parts/preprocessing/manifest.py
+++ b/nemo/collections/common/parts/preprocessing/manifest.py
@@ -198,7 +198,7 @@ def get_full_path(
 
     # If input is a string, get the corresponding full path
     audio_file = Path(audio_file)
-    if (len(str(audio_file)) < audio_file_len_limit) and not audio_file.is_file() and not audio_file.is_absolute():
+    if (len(str(audio_file)) < audio_file_len_limit) and not audio_file.is_absolute() and not audio_file.is_file():
         # If audio_file is not available and the path is not absolute, the full path is assumed
         # to be relative to the manifest file parent directory or data directory.
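         # Note on the reordering above: Path.is_absolute() is a pure string check, while
         # Path.is_file() has to stat the filesystem, so short-circuiting on is_absolute()
         # first skips the filesystem call for paths that are already absolute.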
         if manifest_file is None and data_dir is None:
From f1a3e7530a1d077dd9cf5ab5654914636f0410f1 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Fri, 5 May 2023 13:00:41 -0700
Subject: [PATCH 12/62] More streaming conformer export fixes (#6567) (#6578)

Signed-off-by: Greg Clark

Co-authored-by: Greg Clark
Co-authored-by: Vahid Noroozi
---
 .../asr/modules/conformer_encoder.py | 29 +++++++++++++++++++
 nemo/core/classes/exportable.py | 16 +++++++---
 scripts/export.py | 1 +
 3 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/nemo/collections/asr/modules/conformer_encoder.py b/nemo/collections/asr/modules/conformer_encoder.py
index 0fc0912a8921..9955e35444f4 100644
--- a/nemo/collections/asr/modules/conformer_encoder.py
+++ b/nemo/collections/asr/modules/conformer_encoder.py
@@ -183,6 +183,19 @@ def input_types(self):
             }
         )
 
+    @property
+    def input_types_for_export(self):
+        """Returns definitions of module input ports."""
+        return OrderedDict(
+            {
+                "audio_signal": NeuralType(('B', 'D', 'T'), SpectrogramType()),
+                "length": NeuralType(tuple('B'), LengthsType()),
+                "cache_last_channel": NeuralType(('B', 'D', 'T', 'D'), ChannelType(), optional=True),
+                "cache_last_time": NeuralType(('B', 'D', 'D', 'T'), ChannelType(), optional=True),
+                "cache_last_channel_len": NeuralType(tuple('B'), LengthsType(), optional=True),
+            }
+        )
+
     @property
     def output_types(self):
         """Returns definitions of module output ports."""
@@ -196,6 +209,19 @@ def output_types(self):
             }
         )
 
+    @property
+    def output_types_for_export(self):
+        """Returns definitions of module output ports."""
+        return OrderedDict(
+            {
+                "outputs": NeuralType(('B', 'D', 'T'), AcousticEncodedRepresentation()),
+                "encoded_lengths": NeuralType(tuple('B'), LengthsType()),
+                "cache_last_channel_next": NeuralType(('B', 'D', 'T', 'D'), ChannelType(), optional=True),
+                "cache_last_time_next": NeuralType(('B', 'D', 'D', 'T'), ChannelType(), optional=True),
+                "cache_last_channel_next_len": NeuralType(tuple('B'), LengthsType(), optional=True),
+            }
+        )
+
     @property
     def disabled_deployment_input_names(self):
         if not self.export_cache_support:
@@ -489,6 +515,8 @@ def forward_for_export(
             rets = self.streaming_post_process(rets, keep_all_outputs=False)
         if len(rets) == 2:
             return rets
+        elif rets[2] is None and rets[3] is None and rets[4] is None:
+            return (rets[0], rets[1])
         else:
             return (
                 rets[0],
@@ -549,6 +577,7 @@ def forward_internal(
             audio_signal = self.pre_encode(audio_signal)
         else:
             audio_signal, length = self.pre_encode(x=audio_signal, lengths=length)
+            length = length.to(torch.int64)
             # self.streaming_cfg is set by setup_streaming_cfg(), called in the init
             if self.streaming_cfg.drop_extra_pre_encoded > 0 and cache_last_channel is not None:
                 audio_signal = audio_signal[:, self.streaming_cfg.drop_extra_pre_encoded :, :]
diff --git a/nemo/core/classes/exportable.py b/nemo/core/classes/exportable.py
index eb399b1c1d1d..38b8e1c1e31b 100644
--- a/nemo/core/classes/exportable.py
+++ b/nemo/core/classes/exportable.py
@@ -215,8 +215,8 @@ def _export(
         elif format == ExportFormat.ONNX:
             # dynamic axis is a mapping from input/output_name => list of "dynamic" indices
             if dynamic_axes is None:
-                dynamic_axes = get_dynamic_axes(self.input_module.input_types, input_names)
-                dynamic_axes.update(get_dynamic_axes(self.output_module.output_types, output_names))
+                dynamic_axes = get_dynamic_axes(self.input_module.input_types_for_export, input_names)
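+                # input_types_for_export defaults to input_types (see the base-class property
+                # added below) and is overridden by modules such as ConformerEncoder above,
+                # whose export-time signature differs from the training-time one.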
+                dynamic_axes.update(get_dynamic_axes(self.output_module.output_types_for_export, output_names))
             torch.onnx.export(
                 jitted_model,
                 input_example,
@@ -273,11 +273,19 @@ def _export_teardown(self):
 
     @property
     def input_names(self):
-        return get_io_names(self.input_module.input_types, self.disabled_deployment_input_names)
+        return get_io_names(self.input_module.input_types_for_export, self.disabled_deployment_input_names)
 
     @property
     def output_names(self):
-        return get_io_names(self.output_module.output_types, self.disabled_deployment_output_names)
+        return get_io_names(self.output_module.output_types_for_export, self.disabled_deployment_output_names)
+
+    @property
+    def input_types_for_export(self):
+        return self.input_types
+
+    @property
+    def output_types_for_export(self):
+        return self.output_types
 
     def get_export_subnet(self, subnet=None):
         """
diff --git a/scripts/export.py b/scripts/export.py
index efb257d00447..80cbcf3dc666 100644
--- a/scripts/export.py
+++ b/scripts/export.py
@@ -32,6 +32,7 @@
 import torch
 from pytorch_lightning import Trainer
 
+import nemo
 from nemo.core import ModelPT
 from nemo.core.classes import Exportable
 from nemo.core.config.pytorch_lightning import TrainerConfig
From 84cdcb62a2fbcd6fe86f3a7030cd8310eeeda9b5 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Fri, 5 May 2023 15:43:42 -0600
Subject: [PATCH 13/62] user-selected max_seq_len should be less than model's max_seq_len (#6333) (#6386)

* user selection should not break model max limit

* eval max seq length

---------

Signed-off-by: arendu
Signed-off-by: Adi Renduchintala <108822655+arendu@users.noreply.github.com>
Co-authored-by: Adi Renduchintala <108822655+arendu@users.noreply.github.com>
Co-authored-by: Sandeep Subramanian
Co-authored-by: Eric Harper
---
 .../conf/megatron_gpt_prompt_learning_inference.yaml | 1 +
 .../language_modeling/megatron_gpt_prompt_learning_eval.py | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_inference.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_inference.yaml
index 05099917e912..33ca3f06ddfe 100644
--- a/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_inference.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_inference.yaml
@@ -24,6 +24,7 @@ pipeline_model_parallel_size: -1
 gpt_model_file: null # GPT nemo file path
 virtual_prompt_model_file: ??? # path to a MegatronGPTPromptLearningModel model if you want to use soft prompts
 pred_file_path: ??? # Path where model predictions will be written
+max_seq_length: 8192 # this will filter out inputs whose length is longer than the set value from the generation process.
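+# As a worked example (hypothetical numbers): with encoder_seq_length=2048 and
+# length_params max_length=200, the effective limit is min(2048 - 200, 8192) = 1848
+# tokens, per the min() applied in megatron_gpt_prompt_learning_eval.py below.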
data_paths: # paths to .jsonl files you want to perform inference on num_workers: 8 \ No newline at end of file diff --git a/examples/nlp/language_modeling/megatron_gpt_prompt_learning_eval.py b/examples/nlp/language_modeling/megatron_gpt_prompt_learning_eval.py index d66bac0bfecc..3a490b3532f1 100644 --- a/examples/nlp/language_modeling/megatron_gpt_prompt_learning_eval.py +++ b/examples/nlp/language_modeling/megatron_gpt_prompt_learning_eval.py @@ -151,12 +151,13 @@ def placeholder(): "compute_logprob": cfg.inference.compute_logprob, } - max_input_length = model.frozen_model.cfg.encoder_seq_length - length_params["max_length"] + max_seq_length = model.frozen_model.cfg.encoder_seq_length - length_params["max_length"] + max_seq_length = min(max_seq_length, cfg.get("max_seq_length", 8192)) _, dataloader = model.build_virtual_prompt_dataset( data=cfg.data_paths, batch_size=cfg.inference.get('batch_size', 1), - max_seq_length=max_input_length, + max_seq_length=max_seq_length, min_seq_length=model.cfg.data.get('min_seq_length', 1), add_bos=sampling_params["add_BOS"], add_eos=False, From 3161c6150e886704c144770a28264700a5c999e7 Mon Sep 17 00:00:00 2001 From: Adi Renduchintala <108822655+arendu@users.noreply.github.com> Date: Fri, 5 May 2023 16:26:03 -0700 Subject: [PATCH 14/62] Framework for PEFT via mixins (#6391) * init commit ptuning via mixin Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updates Signed-off-by: arendu * gpt ptuning places virtual tokens on the left only Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * encoder input modified when pre_process is true Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * optimizer group and state dict updates Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * adapter ptuning working for pp>1 Signed-off-by: arendu * adapter defaults Signed-off-by: arendu * adapter ptuining config defaults Signed-off-by: arendu * training works Signed-off-by: arendu * loading and saving adapter only params during training Signed-off-by: arendu * added checks and comments Signed-off-by: arendu * clean up Signed-off-by: arendu * checks for grad is None before calling all_reduce Signed-off-by: arendu * load adapter .nemo file working Signed-off-by: arendu * resume training for adapters Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * peft tuning Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * minor Signed-off-by: arendu * file not needed Signed-off-by: arendu * undo prompt learning dataset changes Signed-off-by: arendu * undo updates to gpt prompt learning model Signed-off-by: arendu * naming updates Signed-off-by: arendu * decoding Signed-off-by: arendu * predict_step in gpt_sft_model Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updates Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * removed inference from tuning config Signed-off-by: arendu * no test in peft training Signed-off-by: arendu * answer only loss and correct defaults for val_loss 
Signed-off-by: arendu * hybrid adapters and ptuning Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * eval working.. Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * prepending tokens for ptuning Signed-off-by: arendu * cleaned up eval config Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * clean up Signed-off-by: arendu * update Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * default prompt template Signed-off-by: arendu * Lora added Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Support synamic length with GPT SFT Signed-off-by: Abhinav Khattar * make branch functional Signed-off-by: Abhinav Khattar * defaults to max_pad_length=False in GPT SFT dataset Signed-off-by: arendu * adapter parallel_adapters to support Lora Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * added early stopping by default Signed-off-by: arendu * eval script for peft and eval config. bug fixes in predict step and added out_features to t5 adapter config Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updates Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * updates Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * docs Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * better defaults Signed-off-by: arendu * updates Signed-off-by: arendu * update Signed-off-by: arendu * docs Signed-off-by: arendu --------- Signed-off-by: arendu Signed-off-by: Abhinav Khattar Signed-off-by: Adi Renduchintala <108822655+arendu@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Abhinav Khattar --- .../conf/megatron_gpt_peft_eval_config.yaml | 130 +++++++ .../conf/megatron_gpt_peft_tuning_config.yaml | 207 ++++++++++ .../tuning/megatron_gpt_peft_eval.py | 157 ++++++++ .../tuning/megatron_gpt_peft_tuning.py | 244 ++++++++++++ .../megatron/gpt_sft_dataset.py | 22 +- .../language_modeling/megatron_base_model.py | 3 +- .../megatron_gpt_adapter_model.py | 1 + .../language_modeling/megatron_gpt_model.py | 3 +- .../megatron_gpt_peft_models.py | 361 ++++++++++++++++++ .../megatron_gpt_sft_model.py | 35 +- .../megatron_t5_adapter_model.py | 1 + .../megatron/adapters/parallel_adapters.py | 167 ++++++-- .../nlp/modules/common/megatron/attention.py | 13 +- .../nlp/modules/common/megatron/clip_grads.py | 25 +- .../modules/common/megatron/language_model.py | 23 +- nemo/collections/nlp/parts/nlp_overrides.py | 95 +++++ nemo/core/optim/optimizer_with_main_params.py | 3 +- 17 files changed, 1443 insertions(+), 47 deletions(-) create mode 100755 examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_eval_config.yaml create mode 100755 examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml create mode 100644 examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py create 
mode 100644 examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py create mode 100644 nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_eval_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_eval_config.yaml new file mode 100755 index 000000000000..d7ebd69f31be --- /dev/null +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_eval_config.yaml @@ -0,0 +1,130 @@ +name: megatron_gpt_peft_${model.peft.peft_scheme}_tuning + +trainer: + devices: 1 + accelerator: gpu + num_nodes: 1 + precision: 16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + replace_sampler_ddp: False + max_epochs: 9999 + max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 # frequency with which training steps are logged + val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch + gradient_clip_val: 1.0 + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: ${name} + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: validation_${model.data.validation_ds.metric.name} + save_top_k: 1 + mode: max + save_nemo_on_train_end: True + filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' + model_parallel_size: ${model.tensor_model_parallel_size} + always_save_nemo: True + save_best_model: False + +model: + seed: 1234 + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + + global_batch_size: 1 + micro_batch_size: 1 + restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training. + sync_batch_comm: False + megatron_amp_O2: False + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. 
+ sequence_parallel: False + + ## Activation Checkpoint + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null # not used with 'selective' + answer_only_loss: False # not used right now + gradient_as_bucket_view: False + + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + + peft: + peft_scheme: "adapter" # can be either adapter,ia3, or ptuning + restore_from_path: null + + # Used for adapter peft training + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 32 + adapter_dropout: 0.0 + norm_position: 'pre' # This can be set to 'pre' or 'post', 'pre' is normally what is used. + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + + # Used for p-tuning peft training + p_tuning: + virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence + bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck + embedding_dim: 1024 # the size of the prompt encoder embeddings + init_std: 0.023 + + data: + test_ds: + file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: ??? # Names of the corresponding datasets used to log metrics. + global_batch_size: ??? + micro_batch_size: ??? + shuffle: False + num_workers: 0 + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + context_key: ${data.train_ds.context_key} + label_key: ${data.train_ds.label_key} + add_eos: ${data.train_ds.add_eos} + add_sep: ${data.train_ds.add_sep} + add_bos: ${data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: ${data.train_ds.truncation_field} # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${data.train_ds.prompt_template} + tokens_to_generate: ??? + + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + +inference: + greedy: True # Whether or not to use sampling ; use greedy decoding otherwise + top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + temperature: 1.0 # sampling temperature + all_probs: False # whether return the log prob for all the tokens in vocab + repetition_penalty: 1.2 # The parameter for repetition penalty. 
1.0 means no penalty. + min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. + compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False + outfile_path: /home/adithyare/exp/foo.txt \ No newline at end of file diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml new file mode 100755 index 000000000000..799d105aae7c --- /dev/null +++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml @@ -0,0 +1,207 @@ +name: megatron_gpt_peft_tuning + +trainer: + devices: 1 + accelerator: gpu + num_nodes: 1 + precision: 16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + replace_sampler_ddp: False + max_epochs: 9999 + max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 # frequency with which training steps are logged + val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch + gradient_clip_val: 1.0 + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: ${name} + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: validation_${model.data.validation_ds.metric.name} + save_top_k: 1 + mode: min + save_nemo_on_train_end: True + filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' + model_parallel_size: ${model.tensor_model_parallel_size} + always_save_nemo: False + save_best_model: True + create_early_stopping_callback: True + early_stopping_callback_params: + monitor: "val_loss" + mode: "min" + min_delta: 0.001 + patience: 10 + verbose: True + strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + +model: + seed: 1234 + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + + global_batch_size: 128 + micro_batch_size: 4 + restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. + sync_batch_comm: False + megatron_amp_O2: False + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. 
+ sequence_parallel: False + + ## Activation Checkpoint + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null # not used with 'selective' + answer_only_loss: True + gradient_as_bucket_view: False + + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + + peft: + peft_scheme: "adapter" # can be either adapter,ia3, or ptuning + restore_from_path: null + + # Used for adapter peft training + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 32 + adapter_dropout: 0.0 + norm_position: 'pre' # This can be set to 'pre' or 'post', 'pre' is normally what is used. + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + + lora_tuning: + adapter_dim: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + + # Used for p-tuning peft training + p_tuning: + virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence + bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck + embedding_dim: 1024 # the size of the prompt encoder embeddings + init_std: 0.023 + + data: + train_ds: + # Example of how to specify paths to multiple datasets + # file_names: + # - /path/to/squad.jsonl + # - /path/to/mnli.jsonl + # - /path/to/boolq.jsonl + # Example of how each dataset is formatted + # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} + file_names: ??? # Path to a list of JSONL files corresponding to the source data. + global_batch_size: ${model.global_batch_size} + micro_batch_size: ${model.micro_batch_size} + shuffle: True + num_workers: 0 + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: True + # Example of how to specify concat_sampling_probabilities + # concat_sampling_probabilities: + # - 0.5 + # - 0.25 + # - 0.25 + concat_sampling_probabilities: null # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' + context_key: 'input' + label_key: 'output' + add_eos: True + add_sep: False + add_bos: False + separate_prompt_and_response_with_newline: False + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + + validation_ds: + file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. 
+ global_batch_size: ${model.global_batch_size} + micro_batch_size: ${model.micro_batch_size} + shuffle: False + num_workers: 0 + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + context_key: 'input' + label_key: 'output' + add_eos: ${model.data.train_ds.add_eos} + add_sep: ${model.data.train_ds.add_sep} + add_bos: ${model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + test_ds: + file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. + global_batch_size: ${model.global_batch_size} + micro_batch_size: ${model.micro_batch_size} + shuffle: False + num_workers: 4 + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + context_key: 'input' + label_key: 'output' + add_eos: ${model.data.train_ds.add_eos} + add_sep: ${model.data.train_ds.add_sep} + add_bos: ${model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${model.data.train_ds.prompt_template} + + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + + optim: + name: fused_adam + lr: 1e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 50 + min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1 + constant_steps: 0 # Constant steps should also be 0 when min_lr=0 + monitor: val_loss + reduce_on_plateau: false \ No newline at end of file diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py b/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py new file mode 100644 index 000000000000..a9f6a110c210 --- /dev/null +++ b/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py @@ -0,0 +1,157 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+
+import torch.multiprocessing as mp
+from omegaconf.omegaconf import OmegaConf, open_dict
+from pytorch_lightning import Trainer
+from pytorch_lightning.plugins.environments import TorchElasticEnvironment
+from torch.utils.data import DataLoader
+
+from nemo.collections.nlp.models.language_modeling.megatron_gpt_peft_models import (
+    MegatronGPTAdapterModel,
+    MegatronGPTAdapterPTuningModel,
+    MegatronGPTIA3Model,
+    MegatronGPTLoRAModel,
+    MegatronGPTPEFTModel,
+    MegatronGPTPTuningModel,
+)
+from nemo.collections.nlp.models.nlp_model import NLPModel
+from nemo.collections.nlp.parts.nlp_overrides import (
+    GradScaler,
+    MegatronHalfPrecisionPlugin,
+    NLPDDPStrategy,
+    PEFTSaveRestoreConnector,
+    PipelineMixedPrecisionPlugin,
+)
+from nemo.core.config import hydra_runner
+from nemo.utils import logging
+
+mp.set_start_method("spawn", force=True)
+
+"""
+This is the script to run inference with a PEFT model (adapter, IA3, p-tuning, or LoRA)
+on test datasets. Both a base GPT model .nemo file and a trained PEFT .nemo file are
+required: the PEFT weights are restored on top of the frozen base model, and the
+predictions on model.data.test_ds are written to cfg.inference.outfile_path.
+
+Usage:
+    Assuming the base model is a 125m GPT Model, with TP=1, PP=1:
+    python megatron_gpt_peft_eval.py \
+        model.restore_from_path="PATH TO BASE GPT MODEL .nemo FILE" \
+        model.peft.restore_from_path="PATH TO TRAINED PEFT .nemo FILE" \
+        model.data.test_ds.file_names=[PATH TO TEST JSONL FILE] \
+        model.data.test_ds.names=[NAME OF TEST DATASET]
+"""
+
+
+@hydra_runner(config_path="conf", config_name="megatron_gpt_peft_eval_config")
+def main(cfg) -> None:
+    logging.info("\n\n************** Experiment configuration ***********")
+    logging.info(f"\n{OmegaConf.to_yaml(cfg)}")
+    assert cfg.model.restore_from_path is not None
+    assert cfg.model.peft.restore_from_path is not None
+    megatron_amp_o2 = cfg.model.get("megatron_amp_O2", False)
+    with_distributed_adam = False
+
+    plugins = []
+    strategy = NLPDDPStrategy(
+        no_ddp_communication_hook=True,  # we don't use DDP for async grad allreduce
+        gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
+        find_unused_parameters=False,
+    )
+    if cfg.trainer.precision in [16, "bf16"]:
+        scaler = None
+        if cfg.trainer.precision == 16:
+            scaler = GradScaler(
+                init_scale=cfg.model.get("native_amp_init_scale", 2 ** 32),
+                growth_interval=cfg.model.get("native_amp_growth_interval", 1000),
+                hysteresis=cfg.model.get("hysteresis", 2),
+                enabled=False
+                if cfg.model.pipeline_model_parallel_size > 1
+                else True,  # turn off the grad scale for pipeline parallel LM model
+            )
+        if megatron_amp_o2 and not with_distributed_adam:
+            plugins.append(MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision, device="cuda", scaler=scaler))
+        else:
+            plugins.append(PipelineMixedPrecisionPlugin(precision=cfg.trainer.precision, device="cuda", scaler=scaler))
+
+    if cfg.get("cluster_type", None) == "BCP":
+        plugins.append(TorchElasticEnvironment())
+
+    trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer)
+    peft_model_cfg = MegatronGPTPEFTModel.restore_from(
+        restore_path=cfg.model.peft.restore_from_path, trainer=trainer, return_config=True,
+    )
+
+    # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
+    with open_dict(peft_model_cfg):
+        # update the model config of the trained model with params we want to set at inference time.
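+        # (the precision stored in the PEFT checkpoint may differ from the precision
+        # requested for this run, and test_ds only exists in the evaluation config)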
+        peft_model_cfg.precision = cfg.trainer.precision
+        peft_model_cfg.data.test_ds = cfg.model.data.test_ds
+
+    with open_dict(cfg):
+        # update the config with the trained model config
+        # required for hydra interpolation to work inside cfg.inference
+        cfg.inference.add_BOS = peft_model_cfg.data.test_ds.add_bos
+        cfg.inference.tokens_to_generate = peft_model_cfg.data.test_ds.tokens_to_generate
+
+    save_restore_connector = PEFTSaveRestoreConnector(
+        peft_model_nemo_path=cfg.model.peft.restore_from_path, peft_model_ckpt_path=None,
+    )
+    if os.path.isdir(cfg.model.restore_from_path):
+        save_restore_connector.model_extracted_dir = cfg.model.restore_from_path
+    model = NLPModel.restore_from(
+        restore_path=cfg.model.restore_from_path,
+        trainer=trainer,
+        override_config_path=peft_model_cfg,
+        save_restore_connector=save_restore_connector,
+    )
+
+    model.freeze()
+    _test_ds = model._build_dataset(peft_model_cfg.data.test_ds, is_train=False)
+    request_dl = DataLoader(
+        dataset=_test_ds[0],
+        batch_size=peft_model_cfg.data.test_ds.global_batch_size,
+        collate_fn=_test_ds[0].collate_fn,
+    )
+    config = OmegaConf.to_container(cfg.inference, resolve=True)
+    model.set_inference_config(config)
+    response = trainer.predict(model, request_dl)
+    if model.global_rank == 0:
+        print("***************************")
+        if cfg.inference.outfile_path is not None:
+            with open(cfg.inference.outfile_path, "w", encoding="utf-8") as f:
+                for batch in response:
+                    for sentence in batch["sentences"]:
+                        s = " ".join(sentence.split("\n"))
+                        f.write(s + "\n")
+            print("predictions saved to {}".format(cfg.inference.outfile_path))
+        else:
+            print(response)
+        print("***************************")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py b/examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py
new file mode 100644
index 000000000000..d0f95b371a13
--- /dev/null
+++ b/examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py
@@ -0,0 +1,244 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+import tempfile
+
+import torch.multiprocessing as mp
+from omegaconf.omegaconf import OmegaConf, open_dict
+from pytorch_lightning import Trainer
+from pytorch_lightning.plugins.environments import TorchElasticEnvironment
+from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector
+from torch.utils.data import DataLoader, Dataset
+
+from nemo.collections.nlp.models.language_modeling.megatron_gpt_peft_models import (
+    MegatronGPTAdapterModel,
+    MegatronGPTAdapterPTuningModel,
+    MegatronGPTIA3Model,
+    MegatronGPTLoRAModel,
+    MegatronGPTPTuningModel,
+)
+from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTModel
+from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel
+from nemo.collections.nlp.parts.nlp_overrides import (
+    GradScaler,
+    MegatronHalfPrecisionPlugin,
+    NLPDDPStrategy,
+    NLPSaveRestoreConnector,
+    PEFTSaveRestoreConnector,
+    PipelineMixedPrecisionPlugin,
+)
+from nemo.core.config import hydra_runner
+from nemo.utils import AppState, logging
+from nemo.utils.exp_manager import exp_manager
+from nemo.utils.model_utils import inject_model_parallel_rank
+
+mp.set_start_method("spawn", force=True)
+
+"""
+This is the script to train a GPT model with PEFT (adapters, IA3, p-tuning, LoRA, or
+adapters + p-tuning) for text generation. A base GPT model is required as a starting
+point. The script inserts the selected PEFT modules into the model and trains/updates
+only those parameters; the base GPT model weights remain frozen.
+
+During training this script saves only the newly trained PEFT weights in checkpoints.
+At the end of training, a .nemo file containing just the PEFT weights is written.
+
+Usage:
+    Assuming the base model is a 125m GPT Model, with TP=1, PP=1:
+    a. run a training run with a base GPT .nemo file:
+        python megatron_gpt_peft_tuning.py \
+            "model.data.train_ds=[PATH TO TRAINING JSONL FILE]" \
+            "model.data.validation_ds=[PATH TO VALIDATION JSONL FILE]" \
+            model.restore_from_path="PATH TO BASE GPT MODEL .nemo FILE" \
+            name="NAME OF TRAINING RUN" \
+            exp_manager.exp_dir="DIR TO SAVE CHECKPOINTS and .nemo FILE" \
+            trainer.max_epochs=2
+"""
+
+
+def _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False):
+    """
+    This function modifies the original gpt pre-training config (gpt_cfg) with attributes from the finetuning config (cfg).
+    The `add_cfg_to_tree` arg adds `cfg` to the top of the yaml tree, which is needed for all `hparams.yaml` files when passed as an arg to `load_from_checkpoint()`.
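+    Only fine-tuning-relevant fields (batch sizes, dropout, optim, data, peft, precision, paths)
+    are copied over; the architecture fields of gpt_cfg are left untouched.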
+ """ + OmegaConf.set_struct(gpt_cfg, True) + OmegaConf.resolve(cfg) + with open_dict(gpt_cfg): + gpt_cfg.megatron_amp_O2 = cfg.model.get('megatron_amp_O2', False) + gpt_cfg.micro_batch_size = cfg.model.data.train_ds.micro_batch_size + gpt_cfg.global_batch_size = cfg.model.data.train_ds.global_batch_size + gpt_cfg.sequence_parallel = cfg.model.get("sequence_parallel", False) + gpt_cfg.activations_checkpoint_granularity = cfg.model.get("activations_checkpoint_granularity", None) + gpt_cfg.activations_checkpoint_num_layers = cfg.model.get("activations_checkpoint_num_layers", None) + gpt_cfg.activations_checkpoint_method = cfg.model.get("activations_checkpoint_method", None) + gpt_cfg.data = cfg.model.data + gpt_cfg.optim = cfg.model.optim + gpt_cfg.precision = cfg.trainer.precision + gpt_cfg.answer_only_loss = cfg.model.answer_only_loss + gpt_cfg.restore_from_path = cfg.model.restore_from_path + gpt_cfg.resume_from_checkpoint = cfg.model.resume_from_checkpoint + gpt_cfg.save_nemo_on_validation_end = cfg.model.save_nemo_on_validation_end + gpt_cfg.gradient_as_bucket_view = cfg.model.gradient_as_bucket_view + gpt_cfg.hidden_dropout = cfg.model.get('hidden_dropout', 0.0) + gpt_cfg.attention_dropout = cfg.model.get('attention_dropout', 0.0) + gpt_cfg.ffn_dropout = cfg.model.ffn_dropout + gpt_cfg.peft = cfg.model.peft + + # This is needed when modifying a hparam file directly to load `.ckpt` files. + # This is not needed to modify the cfg in `.nemo` files. + if add_cfg_to_tree: + OmegaConf.resolve(gpt_cfg) + gpt_cfg.cfg = gpt_cfg + + return gpt_cfg + + +def _get_peft_scheme(cfg): + if cfg.peft.peft_scheme == "adapter": + peft_cls = MegatronGPTAdapterModel + elif cfg.peft.peft_scheme == "ia3": + peft_cls = MegatronGPTIA3Model + elif cfg.peft.peft_scheme == "ptuning": + peft_cls = MegatronGPTPTuningModel + elif cfg.peft.peft_scheme == "adapter_and_ptuning": + peft_cls = MegatronGPTAdapterPTuningModel + elif cfg.peft.peft_scheme == "lora": + peft_cls = MegatronGPTLoRAModel + else: + raise RuntimeError("Invalid Peft scheme") + return peft_cls + + +def load_from_checkpoint_dir(cls, cfg, trainer, modify_confg_fn): + app_state = AppState() + if cfg.model.tensor_model_parallel_size > 1 or cfg.model.pipeline_model_parallel_size > 1: + app_state.model_parallel_size = cfg.model.tensor_model_parallel_size * cfg.model.pipeline_model_parallel_size + app_state.tensor_model_parallel_size = cfg.model.tensor_model_parallel_size + app_state.pipeline_model_parallel_size = cfg.model.pipeline_model_parallel_size + ( + app_state.tensor_model_parallel_rank, + app_state.pipeline_model_parallel_rank, + app_state.model_parallel_size, + app_state.data_parallel_size, + app_state.pipeline_model_parallel_split_rank, + app_state.virtual_pipeline_model_parallel_rank, + ) = fake_initialize_model_parallel( + world_size=app_state.model_parallel_size, + rank=trainer.global_rank, + tensor_model_parallel_size_=cfg.model.tensor_model_parallel_size, + pipeline_model_parallel_size_=cfg.model.pipeline_model_parallel_size, + pipeline_model_parallel_split_rank_=cfg.model.pipeline_model_parallel_split_rank, + ) + checkpoint_path = inject_model_parallel_rank( + os.path.join(cfg.model.pretrained_checkpoint.checkpoint_dir, cfg.model.pretrained_checkpoint.checkpoint_name) + ) + hparams_file = OmegaConf.load(cfg.model.pretrained_checkpoint.hparams_file) + gpt_cfg = modify_confg_fn(hparams_file.cfg, cfg, add_cfg_to_tree=True) + with tempfile.NamedTemporaryFile(suffix='.yaml') as f: + OmegaConf.save(config=gpt_cfg, f=f.name) + model = 
cls.load_from_checkpoint(checkpoint_path=checkpoint_path, trainer=trainer, hparams_file=f.name,) + return model + + +def validate_checkpoint_loading_args(cfg): + if cfg.checkpoint_dir is None or not os.path.isdir(cfg.checkpoint_dir): + raise ValueError(f'Checkpoint directory {cfg.checkpoint_dir} does not exist or is not a directory.') + if cfg.checkpoint_name is None: + raise ValueError(f'Checkpoint name {cfg.checkpoint_name} is not valid.') + if cfg.hparams_file is None or not os.path.isfile(cfg.hparams_file): + raise ValueError(f'Hparams file {cfg.hparams_file} does not exist or is not a file.') + + +@hydra_runner(config_path="conf", config_name="megatron_gpt_peft_tuning_config") +def main(cfg) -> None: + logging.info("\n\n************** Experiment configuration ***********") + logging.info(f'\n{OmegaConf.to_yaml(cfg)}') + + megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) + with_distributed_adam = cfg.model.optim.get('name') == 'distributed_fused_adam' + + plugins = [] + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True, # we don't use DDP for async grad allreduce + gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) + if cfg.trainer.precision in [16, 'bf16']: + scaler = None + if cfg.trainer.precision == 16: + scaler = GradScaler( + init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), + growth_interval=cfg.model.get('native_amp_growth_interval', 1000), + hysteresis=cfg.model.get('hysteresis', 2), + enabled=False + if cfg.model.pipeline_model_parallel_size > 1 + else True, # turn off the grad scale for pipeline parallel LM model + ) + if megatron_amp_o2 and not with_distributed_adam: + plugins.append(MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler)) + else: + plugins.append(PipelineMixedPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler)) + + if cfg.get('cluster_type', None) == 'BCP': + plugins.append(TorchElasticEnvironment()) + + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) + exp_manager(trainer, cfg.exp_manager) + # update resume from checkpoint found by exp_manager + if cfg.model.resume_from_checkpoint is not None: + resume_from_checkpoint = cfg.model.resume_from_checkpoint + else: + resume_from_checkpoint = trainer._checkpoint_connector.resume_from_checkpoint_fit_path + logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}') + + trainer._checkpoint_connector = CheckpointConnector(trainer, resume_from_checkpoint=resume_from_checkpoint) + + # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams + with open_dict(cfg): + cfg.model.precision = cfg.trainer.precision + + if cfg.model.restore_from_path: + base_model_save_restore_connector = NLPSaveRestoreConnector() + if os.path.isdir(cfg.model.restore_from_path): + base_model_save_restore_connector.model_extracted_dir = cfg.model.restore_from_path + base_model_cfg = MegatronGPTModel.restore_from( + restore_path=cfg.model.restore_from_path, + trainer=trainer, + return_config=True, + save_restore_connector=base_model_save_restore_connector, + ) + base_model_cfg = _modify_config(base_model_cfg, cfg, add_cfg_to_tree=False) + save_restore_connector = PEFTSaveRestoreConnector( + peft_model_nemo_path=cfg.model.peft.restore_from_path, peft_model_ckpt_path=resume_from_checkpoint + ) + if os.path.isdir(cfg.model.restore_from_path): + save_restore_connector.model_extracted_dir = cfg.model.restore_from_path + peft_cls = 
_get_peft_scheme(cfg.model) + model = peft_cls.restore_from( + restore_path=cfg.model.restore_from_path, + trainer=trainer, + override_config_path=base_model_cfg, + save_restore_connector=save_restore_connector, + ) + else: + raise RuntimeError("PEFT training needs a trained base model present.") + + trainer.fit(model) + + +if __name__ == '__main__': + main() diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py index 4df6a1bb577b..24b7fe8d3d6d 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py @@ -41,9 +41,11 @@ def __init__( separate_prompt_and_response_with_newline: bool = False, answer_only_loss: bool = True, truncation_field: str = "answer", - pad_to_max_length: bool = True, + pad_to_max_length: bool = False, # (@adithyare) allows for much faster training especially in PEFT settings. index_mapping_dir: str = None, prompt_template: str = None, + virtual_tokens: int = 0, + tokens_to_generate: int = 0, ): """ file_path: Path to a JSONL GPT supervised fine-tuning dataset. Data is formatted as multiple JSON lines with each line formatted as follows. {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} @@ -84,6 +86,8 @@ def __init__( self.pad_to_max_length = pad_to_max_length self.index_mapping_dir = index_mapping_dir self.prompt_template = prompt_template + self.virtual_tokens = virtual_tokens + self.tokens_to_generate = tokens_to_generate if self.prompt_template is not None: # When providing things like newlines in the prompt template via the CLI, they are escaped. This line unescapes them. 
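            # e.g. a template passed on the command line as "{input}\\n{output}" arrives with a
            # literal backslash-n; decoding with 'unicode_escape' turns it back into a real newline.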
self.prompt_template = self.prompt_template.encode('utf-8').decode('unicode_escape') @@ -156,8 +160,14 @@ def _process_example(self, example): elif not self.separate_prompt_and_response_with_newline and self.prompt_template is None: text = context + ' ' + output - tokenized_text = self.tokenizer.text_to_ids(text) - context_ids = self.tokenizer.text_to_ids(context) + if self.virtual_tokens: + # (@adithyare) we are going to insert "pad/eos" tokens in the beginning of the text and context + # these pad/eos tokens are placeholders for virtual tokens + pre_pad = [self.tokenizer.eos_id] * self.virtual_tokens + else: + pre_pad = [] + tokenized_text = pre_pad + self.tokenizer.text_to_ids(text) + context_ids = pre_pad + self.tokenizer.text_to_ids(context) answer_ids = tokenized_text[len(context_ids) :] total_ids = len(context_ids) + len(answer_ids) if self.add_bos: @@ -212,7 +222,7 @@ def _maybe_cast_to_list(self, x): return [item.tolist() for item in x] return x - def _round_to_nearest(self, n, m): + def _ceil_to_nearest(self, n, m): return (n + m - 1) // m * m def _collate_item(self, item, max_length, pad_id): @@ -252,12 +262,12 @@ def collate_fn(self, batch): context_lengths = torch.LongTensor([item['context_length'] for item in batch]) loss_mask = [self._build_loss_mask(item)[1:] for item in batch] - max_length = max([len(x) for x in input_ids]) + max_length = max([len(x) for x in input_ids]) + self.tokens_to_generate # increase max length to nearest multiple of 4 or 8 if self.pad_to_max_length: max_length = self.max_seq_length else: - max_length = min(self.max_seq_length, self._round_to_nearest(max_length, 8)) + max_length = min(self.max_seq_length, self._ceil_to_nearest(max_length, 8)) assert max_length <= self.max_seq_length attention_mask = [self._create_attention_mask(max_length) for _ in batch] diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 5e5c177737fa..3899c75675db 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -239,7 +239,8 @@ def _get_parameters(self): params = [] for param_group in self._optimizer_param_groups: for param in param_group['params']: - params.append(param) + if param.requires_grad: # (@adithyare) adapter training with pp>1 can result in params with no grads + params.append(param) return params def configure_gradient_clipping(self, *args, **kwargs): diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_adapter_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_adapter_model.py index aa7cd4652b0a..cb38ad863a52 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_adapter_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_adapter_model.py @@ -272,6 +272,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): if cfg.adapter_tuning.type == "parallel_adapter": adapter_cfg = ParallelLinearAdapterConfig( in_features=self.frozen_model_cfg.hidden_size, + out_features=self.frozen_model_cfg.hidden_size, dim=cfg.adapter_tuning.adapter_dim, norm_position=cfg.adapter_tuning.get('norm_position', 'pre'), norm_type=cfg.adapter_tuning.get('norm_type', 'mixedfusedlayernorm'), diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 8defb94fd3c1..7159190fdec7 100644 --- 
a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -440,7 +440,8 @@ def training_step(self, dataloader_iter, batch_idx): # we can avoid this broadcast by updating the PTL log function to accept specific ranks torch.distributed.broadcast(loss_mean, get_last_rank()) - if self.cfg.precision == 16: + # (@adithyare) we need to check for the _scaler attribute to enable pp>1 for adapter training + if self.cfg.precision == 16 and hasattr(self.trainer.precision_plugin.scaler, "_scale"): loss_scale = self.trainer.precision_plugin.scaler._scale if loss_scale is not None: self.log('loss_scale', loss_scale, batch_size=1) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py new file mode 100644 index 000000000000..930bfbc8cf25 --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_peft_models.py @@ -0,0 +1,361 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from omegaconf.dictconfig import DictConfig +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTSFTModel +from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ( + AdapterName, + InfusedAdapterConfig, + LoraKQVAdapterConfig, + MLPInfusedAdapterConfig, + ParallelLinearAdapterConfig, + PromptEncoderAdapterConfig, +) +from nemo.core.classes.mixins import adapter_mixins +from nemo.utils import logging, model_utils + + +class MegatronGPTPEFTModel(MegatronGPTSFTModel): + """ + base class for all mixin based adapter models + """ + + def __init__(self, cfg: DictConfig, trainer: Trainer): + super().__init__(cfg, trainer) + self.setup_complete = False + self.base_keys = self.get_all_keys() + self.init_peft_modules() + self.adapter_keys = self.get_all_keys() - self.base_keys + + def first_stage_of_pipeline(self): + if hasattr(self, "model") and hasattr(self.model, "pre_process"): + return self.model.pre_process + logging.warning("no attribute named model or no model.pre_process found. Can not detect stage of pipeline...") + return False + + def init_peft_modules(self): + """ + Randomly initialize the peft params and add them to the appropriate modules. 
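+        Subclasses are expected to populate `self.peft_name_keys` and `self.name_key_to_cfg`
+        before calling `super().__init__`, which is what triggers this method.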
+ """ + assert len(self.peft_name_keys) > 0, "peft_name_keys have not been set no PEFT modules will be added" + assert len(self.name_key_to_cfg) > 0, "name_key_to_cfg has not been set no PEFT modules will be added" + logging.info(f"Before adding PEFT params:\n{self.summarize()}") + for _, module in self.named_modules(): + if isinstance(module, adapter_mixins.AdapterModuleMixin): + for peft_key in self.peft_name_keys: + peft_cfg = self.name_key_to_cfg[peft_key] + if model_utils.import_class_by_path(peft_cfg._target_) in module.get_accepted_adapter_types(): + module.add_adapter( + name=peft_key, cfg=peft_cfg, + ) + logging.info(f"After adding PEFT params:\n{self.summarize()}") + return True + + def setup(self, stage=None): + super().setup(stage) + self.setup_complete = True + + def get_all_keys(self,): + """ + Returns all the keys in the model + """ + k = [n for n, p in self.named_parameters()] + return set(k) + + def get_peft_state_dict(self,): + """ + Gets the keys associated with the adapters only. + """ + state_dict = self.model.state_dict(prefix="model.") + peft_state_dict = {} + for k in self.adapter_keys: + peft_state_dict[k] = state_dict[k] + return peft_state_dict + + def state_dict(self, destination=None, prefix=None, keep_vars=False): + if self.setup_complete: + # Once setup is complete we no longer need to track the frozen part of the model. Only there adapter state dict keeps changing so state_dict only track these. + return self.get_peft_state_dict() + else: + # we want all the params with the same keys as calling self.state_dict() + # but we can't call self.state_dict() here as it would be a recursive call. + # so we call self.model.state_dict(prefix="model.") which will return all the keys and params same as calling self.state_dict() + return self.model.state_dict(prefix="model.") + + def load_state_dict(self, state_dict, strict: bool = True): + if self.setup_complete: + # at this stage only PEFT params will appear in the state_dict arg + # so we only update those while the rest of the model is frozen. + # setting strict=False will ignore the missing keys (which are not being updated anyway) + # explicitly check if state_dict.keys matches all the expected self.adapter_keys since we don't have the + # safety in strict=True anymore. + assert set(state_dict.keys()) == self.adapter_keys + super().load_state_dict(state_dict, strict=False) + else: + super().load_state_dict(state_dict, strict=True) + + def setup_optimizer_param_groups(self): + """ + ModelPT override. Optimizer will get self._optimizer_param_groups. + Makes two optimizer param groups, one for the frozen model params + and one for the prompt-table/prompt-encoder params. The learning + rate for the frozen model's params will always be zero effectively + freezing the model's params but still allowing for the needed gradients + to be passed around in pipeline parallel models. The prompt-encoder + and/or prompt table will use the learning rate set by the user. + """ + self.freeze() # Freeze the entire model + opt_params = [] + for _, module in self.named_modules(): + if isinstance(module, adapter_mixins.AdapterModuleMixin) and module.is_adapter_available(): + module.set_enabled_adapters(enabled=True) + module.unfreeze_enabled_adapters() # selectively unfreeze the adapter modules. 
+                opt_params += [p for p in module.parameters()]
+
+        self._optimizer_param_groups = ({"params": opt_params},)
+        logging.info(f"Optimizer groups set:\n{self.summarize()}")
+
+
+class MegatronGPTAdapterModel(MegatronGPTPEFTModel):
+    """
+    MegatronGPTAdapterModel is a model that combines a base model (GPTSFTModel) with adapters.
+    This class only supports the canonical adapter training described in Houlsby et al. (https://arxiv.org/pdf/1902.00751.pdf)
+
+    Two adapters are inserted into each Transformer layer in the base GPT Model.
+
+    It is assumed that this set of adapters will then be trained for a specific task.
+    Once trained, the adapter weights are saved and can be re-loaded
+    and infused into the same GPT Model for inference.
+    """
+
+    def __init__(
+        self, cfg: DictConfig, trainer: Trainer,
+    ):
+        self.peft_name_keys = [
+            AdapterName.PRE_ATTN_ADAPTER,
+            AdapterName.POST_ATTN_ADAPTER,
+        ]
+        adapter_tuning_cfg = cfg.peft.adapter_tuning
+
+        adapter_cfg = ParallelLinearAdapterConfig(
+            in_features=cfg.hidden_size,
+            out_features=cfg.hidden_size,
+            dim=adapter_tuning_cfg.adapter_dim,
+            norm_position=adapter_tuning_cfg.get("norm_position", "pre"),
+            norm_type=adapter_tuning_cfg.get("norm_type", "mixedfusedlayernorm"),
+            column_init_method=adapter_tuning_cfg.get("column_init_method", "xavier"),
+            row_init_method=adapter_tuning_cfg.get("row_init_method", "zero"),
+            dropout=adapter_tuning_cfg.adapter_dropout,
+        )
+
+        self.name_key_to_cfg = {}
+        for k in self.peft_name_keys:
+            self.name_key_to_cfg[k] = adapter_cfg
+
+        super().__init__(cfg, trainer)
+
+
+class MegatronGPTIA3Model(MegatronGPTPEFTModel):
+    """
+    MegatronGPTIA3Model is a model that combines a base model (GPTSFTModel) with "Infused Adapters by Inhibiting and Amplifying Inner Activations" (IA3).
+    This class supports the addition of IA3 into a transformer-based LM as described in Liu et al. (https://arxiv.org/pdf/2205.05638.pdf)
+
+    Three adapters are inserted into each Transformer layer in the base GPT Model. Each adapter is essentially a vector that scales the key, value or ffn hidden representations.
+
+    It is assumed that this set of adapters will then be trained for a specific task.
+    Once trained, the adapter weights are saved and can be re-loaded
+    and infused into the same GPT Model for inference.
+    """
+
+    def __init__(self, cfg: DictConfig, trainer: Trainer):
+        self.peft_name_keys = [AdapterName.KEY_INFUSED, AdapterName.VALUE_INFUSED, AdapterName.MLP_INFUSED]
+
+        mlp_infused_adapter_cfg = MLPInfusedAdapterConfig(
+            in_features=cfg.ffn_hidden_size // cfg.tensor_model_parallel_size
+        )
+        infused_adapter_cfg = InfusedAdapterConfig(in_features=cfg.hidden_size // cfg.tensor_model_parallel_size)
+
+        self.name_key_to_cfg = {}
+        for k in self.peft_name_keys:
+            if k == AdapterName.MLP_INFUSED:
+                self.name_key_to_cfg[k] = mlp_infused_adapter_cfg
+            elif k in [
+                AdapterName.KEY_INFUSED,
+                AdapterName.VALUE_INFUSED,
+            ]:
+                self.name_key_to_cfg[k] = infused_adapter_cfg
+            else:
+                raise ValueError(f"PEFT Key {k} is unknown.")
+        super().__init__(cfg, trainer)
+
+
+class MegatronGPTPTuningModel(MegatronGPTPEFTModel):
+    """
+    MegatronGPTPTuningModel is a model that combines a base model (GPTSFTModel) with a p-tuning prefix in the
+    input word embedding representations, using a prompt encoder as described in Liu et al. (https://arxiv.org/pdf/2103.10385.pdf).
+
+    The mixin framework adds the output of the prompt encoder (i.e. the virtual embeddings) inside
+    nemo/collections/nlp/modules/common/megatron/language_model.py
+    """
+
+    def __init__(self, cfg: DictConfig, trainer: Trainer):
+        self.peft_name_keys = [AdapterName.PTUNING_ADAPTER]
+
+        adapter_cfg = PromptEncoderAdapterConfig(
+            cfg.peft.p_tuning.virtual_tokens,
+            cfg.peft.p_tuning.bottleneck_dim,
+            cfg.peft.p_tuning.embedding_dim,
+            cfg.peft.p_tuning.init_std,
+            cfg.hidden_size,
+        )
+        self.name_key_to_cfg = {AdapterName.PTUNING_ADAPTER: adapter_cfg}
+        super().__init__(cfg, trainer)
+        self.virtual_tokens = cfg.peft.p_tuning.virtual_tokens
+
+    def init_peft_modules(self,):
+        """
+        Initialize the p-tuning prompt encoder in the mixin.
+        This should only happen on the first stage of the pipeline, unlike other PEFT methods such as LoRA or adapters,
+        because p-tuning only adds params at the input to the encoder layer.
+        """
+        if not self.first_stage_of_pipeline():
+            # There are no params to add if we are not on the first stage of the pipeline
+            return True
+        super().init_peft_modules()
+        return True
+
+    def state_dict(self, destination=None, prefix=None, keep_vars=False):
+        """
+        Reimplement state_dict for p-tuning because we also need to check the stage of the pipeline.
+        The check is required to make pp>1 work.
+        """
+        if self.setup_complete:
+            if self.first_stage_of_pipeline():
+                return self.get_peft_state_dict()
+            # if we are not on the first stage of the pipeline after setup is done
+            # there should be no params in the state_dict
+            return {}
+        else:
+            return self.model.state_dict(prefix="model.")
+
+    def load_state_dict(self, state_dict, strict: bool = True):
+        """
+        Reimplement load_state_dict for p-tuning because we also need to check the stage of the pipeline.
+        The check is required to make pp>1 work.
+        """
+        if self.setup_complete:
+            if self.first_stage_of_pipeline():
+                # only the first stage of the pipeline holds the prompt-encoder params,
+                # so only it has anything to load
+                assert set(state_dict.keys()) == self.adapter_keys
+                super().load_state_dict(state_dict, strict=False)
+        else:
+            super().load_state_dict(state_dict, strict=True)
+
+    def setup_optimizer_param_groups(self):
+        if self.first_stage_of_pipeline():
+            super().setup_optimizer_param_groups()
+        else:
+            self.freeze()  # Freeze the entire model
+            self._optimizer_param_groups = ({"params": []},)
+        logging.info(f"Optimizer groups set:\n{self.summarize()}")
+
+
+class MegatronGPTAdapterPTuningModel(MegatronGPTPEFTModel):
+    """
+    Want to combine adapters and p-tuning? Why not? They are orthogonal methods.
+    This class includes both sets of params.
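+    Both the PromptEncoder config and the parallel adapter configs are registered below, so the
+    virtual-token embeddings and the per-layer adapters are trained jointly.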
+ """ + + def __init__(self, cfg: DictConfig, trainer: Trainer): + self.peft_name_keys = [ + AdapterName.PRE_ATTN_ADAPTER, + AdapterName.POST_ATTN_ADAPTER, + AdapterName.PTUNING_ADAPTER, + ] + ptuning_cfg = PromptEncoderAdapterConfig( + cfg.peft.p_tuning.virtual_tokens, + cfg.peft.p_tuning.bottleneck_dim, + cfg.peft.p_tuning.embedding_dim, + cfg.peft.p_tuning.init_std, + cfg.hidden_size, + ) + adapter_tuning_cfg = cfg.peft.adapter_tuning + adapter_cfg = ParallelLinearAdapterConfig( + in_features=cfg.hidden_size, + out_features=cfg.hidden_size, + dim=adapter_tuning_cfg.adapter_dim, + norm_position=adapter_tuning_cfg.get("norm_position", "pre"), + norm_type=adapter_tuning_cfg.get("norm_type", "mixedfusedlayernorm"), + column_init_method=adapter_tuning_cfg.get("column_init_method", "xavier"), + row_init_method=adapter_tuning_cfg.get("row_init_method", "zero"), + dropout=adapter_tuning_cfg.adapter_dropout, + ) + + self.name_key_to_cfg = { + AdapterName.PRE_ATTN_ADAPTER: adapter_cfg, + AdapterName.POST_ATTN_ADAPTER: adapter_cfg, + AdapterName.PTUNING_ADAPTER: ptuning_cfg, + } + super().__init__(cfg, trainer) + self.virtual_tokens = cfg.peft.p_tuning.virtual_tokens + + +class MegatronGPTLoRAModel(MegatronGPTPEFTModel): + """ + MegatronGPTLoRAModel is a model that combines a base model (GPTSFTModel) with a low-rank adapters. + The lora adapters will be added in `nemo/collections/nlp/modules/common/megatron/attention.py` + The implementation is based on Hu et al. nemo/collections/nlp/modules/common/megatron/attention.py + + A single low-rank feedfowrad layer is used in parallel with the KQV projection layer. + TODO: Add support to also include an option to adda low-rank adapter in the output projection layer. + """ + + def __init__( + self, cfg: DictConfig, trainer: Trainer, + ): + self.peft_name_keys = [ + AdapterName.LORA_KQV_ADAPTER, + ] + lora_cfg = cfg.peft.lora_tuning + if cfg.kv_channels is None: + assert ( + cfg.hidden_size % cfg.num_attention_heads == 0 + ), 'hidden_size must be divisible by num_attention_heads if kv_channels is None' + kv_channels = cfg.hidden_size // cfg.num_attention_heads + else: + kv_channels = cfg.kv_channels + projection_size = kv_channels * cfg.num_attention_heads + + adapter_cfg = LoraKQVAdapterConfig( + in_features=cfg.hidden_size, + out_features=3 * projection_size, + dim=lora_cfg.adapter_dim, + norm_position="none", + norm_type="none", + activation="identity", + column_init_method=lora_cfg.get("column_init_method", "normal"), + row_init_method=lora_cfg.get("row_init_method", "zero"), + gather_output=False, + dropout=lora_cfg.adapter_dropout, + ) + + self.name_key_to_cfg = {} + for k in self.peft_name_keys: + self.name_key_to_cfg[k] = adapter_cfg + + super().__init__(cfg, trainer) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index 56a4496b800b..a28b8216c207 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -13,6 +13,7 @@ # limitations under the License. 
import json +from typing import Any, Optional import torch from omegaconf import DictConfig, ListConfig @@ -29,7 +30,12 @@ ) from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.modules.common.megatron.utils import get_iterator_k_split -from nemo.collections.nlp.modules.common.text_generation_utils import LengthParam, SamplingParam, megatron_gpt_generate +from nemo.collections.nlp.modules.common.text_generation_utils import ( + LengthParam, + SamplingParam, + generate, + megatron_gpt_generate, +) from nemo.utils import AppState, logging try: @@ -83,6 +89,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.original_checkpointing_granularity = base_module.language_model.encoder.activations_checkpoint_granularity self.original_checkpointing_num_layers = base_module.language_model.encoder.activations_checkpoint_num_layers self.original_checkpointing_method = base_module.language_model.encoder.activations_checkpoint_method + self.virtual_tokens = 0 def setup_metric(self, data_cfg): metric_name = "exact_string_match" @@ -248,6 +255,10 @@ def _build_dataset(self, data_cfg, is_train=True): pad_to_max_length=False, index_mapping_dir=data_cfg.get('index_mapping_dir', None), prompt_template=data_cfg.get('prompt_template', None), + virtual_tokens=self.virtual_tokens, + tokens_to_generate=data_cfg.get( + 'tokens_to_generate', 0 + ), # used at inference time to allocate tensor positions for tokens that will be generated by inf procedure. ) datasets.append(dataset) @@ -515,6 +526,28 @@ def inference_epoch_end(self, outputs, mode, data_cfg): return averaged_loss, averaged_metric + def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] = None) -> Any: + inference_config = self.get_inference_config() + if inference_config is None: + return None + # need to overwrite some configuration, make it immutable + inference_config = inference_config.copy() + compute_logprob = inference_config['compute_logprob'] + if compute_logprob: + del inference_config['compute_logprob'] + inference_config['inputs'] = batch + inference_config['tokens_to_generate'] = 1 + inference_config['all_probs'] = True + inference_config["add_BOS"] = False + inference_config['greedy'] = True + response = generate(self, **inference_config) + compute_prob_response = get_computeprob_response(self.tokenizer, response, batch) + return compute_prob_response + else: + del inference_config['compute_logprob'] + inference_config['inputs'] = (batch['contexts'].cuda(), batch['context_lengths'].cuda()) + return generate(self, **inference_config) + def write_predictions_to_file(self, outputs, output_file_path_prefix): with open(output_file_path_prefix + "_inputs_preds_labels.jsonl", "w") as f_json: assert len(outputs['inputs']) == len(outputs['preds']) == len(outputs['labels']) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_t5_adapter_model.py b/nemo/collections/nlp/models/language_modeling/megatron_t5_adapter_model.py index 71b3d5537efd..32345e829be8 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_t5_adapter_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_t5_adapter_model.py @@ -397,6 +397,7 @@ def _get_adapter_cfg(self, component_cfg): if component_cfg.adapter_tuning.type == "parallel_adapter": adapter_cfg = ParallelLinearAdapterConfig( in_features=component_cfg.hidden_size, + out_features=component_cfg.hidden_size, dim=component_cfg.adapter_tuning.adapter_dim, 
norm_position=component_cfg.adapter_tuning.get('norm_position', 'pre'), norm_type=component_cfg.adapter_tuning.get('norm_type', 'mixedfusedlayernorm'), diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index e6480362bc85..b26b971a38ba 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -21,9 +21,11 @@ import torch import torch.nn as nn +import torch.nn.init as init from nemo.collections.common.parts.adapter_modules import AdapterModuleUtil from nemo.collections.common.parts.utils import activation_registry +from nemo.collections.nlp.modules.common.megatron.fused_bias_gelu import fused_bias_gelu from nemo.collections.nlp.modules.common.megatron.utils import init_method_const, init_method_normal from nemo.core.classes.mixins import adapter_mixin_strategies @@ -56,6 +58,10 @@ class AdapterName(str, enum.Enum): VALUE_INFUSED = "value_infused_adapter" PRE_ATTN_ADAPTER = 'adapter_1' POST_ATTN_ADAPTER = 'adapter_2' + PTUNING_ADAPTER = "ptuning_adapter" + LORA_KQV_ADAPTER = "lora_kqv_adapter" + LORA_KV_ADAPTER = "lora_kv_adapter" + LORA_Q_ADAPTER = "lora_q_adapter" class InfusedAdapter(nn.Module, AdapterModuleUtil): @@ -97,12 +103,14 @@ class ParallelLinearAdapter(nn.Module, AdapterModuleUtil): def __init__( self, in_features: int, + out_features: int, dim: int, activation: str = 'swish', norm_position: str = 'post', norm_type: str = 'mixedfusedlayernorm', - column_init_method: str = 'xavier', - row_init_method: str = 'zero', + column_init_method: str = 'xavier', # TODO: (@adithyare) should rename this to input_init_method to be more precise. + row_init_method: str = 'zero', # TODO: (@adithyare) should rename this to output_init_method to be more precise. 
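+        # gather_output=False keeps linear_out column-parallel (outputs stay sharded across TP
+        # ranks), mirroring the column-parallel layer that LoRA wraps; the default True routes
+        # through a RowParallelLinear that returns full-size outputs.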
+        gather_output: bool = True,
         dropout: float = 0.0,
         adapter_strategy: adapter_mixin_strategies.ResidualAddAdapterStrategyConfig = None,
     ):
@@ -116,30 +124,28 @@ def __init__(
         self.activation = activation_registry[activation]()
         self.norm_position = norm_position
 
-        if column_init_method == 'xavier':
-            self.linear_in = ColumnParallelLinear(in_features, dim, bias=False)
-        elif column_init_method == 'normal':
-            self.linear_in = ColumnParallelLinear(in_features, dim, bias=False, init_method=init_method_normal(0.2))
-        elif column_init_method == 'zero':
-            self.linear_in = ColumnParallelLinear(in_features, dim, bias=False, init_method=init_method_const(0.0))
+        self.linear_in = ColumnParallelLinear(
+            in_features, dim, bias=False, gather_output=True, init_method=self._get_init_fn(column_init_method)
+        )
+        if gather_output:
+            self.linear_out = RowParallelLinear(
+                dim, out_features, bias=False, init_method=self._get_init_fn(row_init_method)
+            )
         else:
-            raise NotImplementedError("column_init_method should be zero, normal or xavier")
-
-        if row_init_method == 'xavier':
-            self.linear_out = RowParallelLinear(dim, in_features, bias=False)
-        elif row_init_method == 'normal':
-            self.linear_out = RowParallelLinear(dim, in_features, bias=False, init_method=init_method_normal(0.2))
-        elif row_init_method == 'zero':
-            self.linear_out = RowParallelLinear(dim, in_features, bias=False, init_method=init_method_const(0.0))
-        else:
-            raise NotImplementedError("row_init_method should be zero, normal or xavier")
-
-        if norm_type == 'mixedfusedlayernorm':
-            self.layer_norm = MixedFusedLayerNorm(in_features, 1e-5, sequence_parallel_enbaled=False)
-        elif norm_type == 'layernorm':
-            self.layer_norm = nn.LayerNorm(in_features)
-        else:
-            raise NotImplementedError("norm_type should be either mixedfusedlayernorm or layernorm")
+            # (@adithyare) we use this option to mirror the behavior of a column parallel layer with two low-rank column parallel layers
+            # if the original column parallel layer uses gather_output=False, then we will use the self.linear_out layer defined below.
+            self.linear_out = ColumnParallelLinear(
+                dim, out_features, bias=False, gather_output=False, init_method=self._get_init_fn(row_init_method)
+            )
+
+        if self.norm_position in ["pre", "post"]:
+            ln_features = in_features if self.norm_position == "pre" else out_features
+            if norm_type == 'mixedfusedlayernorm':
+                self.layer_norm = MixedFusedLayerNorm(ln_features, 1e-5, sequence_parallel_enbaled=False)
+            elif norm_type == 'layernorm':
+                self.layer_norm = nn.LayerNorm(ln_features)
+            else:
+                raise NotImplementedError("norm_type should be either mixedfusedlayernorm or layernorm")
 
         if dropout > 0.0:
             self.dropout = nn.Dropout(dropout)
@@ -149,6 +155,17 @@ def __init__(
         # Setup adapter strategy
         self.setup_adapter_strategy(adapter_strategy)
 
+    def _get_init_fn(self, init_method: str):
+        if init_method == 'xavier':
+            init_fn = init.xavier_normal_
+        elif init_method == 'normal':
+            init_fn = init_method_normal(0.2)
+        elif init_method == "zero":
+            init_fn = init_method_const(0.0)
+        else:
+            raise NotImplementedError("init_method should be zero, normal or xavier")
+        return init_fn
+
     def forward(self, x):
 
         if self.norm_position == 'pre':
@@ -157,7 +174,6 @@ def forward(self, x):
 
         x, _ = self.linear_in(x)  # (@adithyare) ColumnLinear returns output and bias, we are ignoring the bias term.
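        # with row_init_method='zero', linear_out starts as all zeros, so the adapter initially
        # contributes nothing and the residual-add strategy leaves the base activations unchanged.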
         x = self.activation(x)
         x, _ = self.linear_out(x)
-
         if self.norm_position == 'post':
             x = self.layer_norm(x)
@@ -171,12 +187,111 @@ def forward(self, x):
 @dataclass
 class ParallelLinearAdapterConfig:
     in_features: int
+    out_features: int
     dim: int
     activation: str = 'swish'
     norm_position: str = 'post'
     norm_type: str = 'mixedfusedlayernorm'
     column_init_method: str = 'xavier'
     row_init_method: str = 'zero'
+    gather_output: bool = True
     dropout: float = 0.0
     adapter_strategy: Optional[Any] = adapter_mixin_strategies.ResidualAddAdapterStrategyConfig()
     _target_: str = "{0}.{1}".format(ParallelLinearAdapter.__module__, ParallelLinearAdapter.__name__)
+
+
+class LoraKQVAdapter(ParallelLinearAdapter):
+    """
+    LoRA adapters share the same architecture as regular adapters but can have different input
+    and output feature sizes, and they do not use a bottleneck activation function.
+    """
+
+    pass
+
+
+@dataclass
+class LoraKQVAdapterConfig(ParallelLinearAdapterConfig):
+    _target_: str = "{0}.{1}".format(LoraKQVAdapter.__module__, LoraKQVAdapter.__name__)
+
+
+class PromptEncoderAdapter(nn.Module, AdapterModuleUtil):
+    """
+    The tensor-parallel MLP prompt encoder network that is used to generate the virtual
+    token embeddings for p-tuning. It only has two layers.
+    TODO: (@adithyare) Need to add all the functionality from the PromptEncoder class
+    """
+
+    def __init__(
+        self,
+        virtual_tokens: int,
+        bottleneck_dim: int,
+        embedding_dim: int,
+        init_std: float,
+        output_dim: int,
+        adapter_strategy: adapter_mixin_strategies.ResidualAddAdapterStrategyConfig = None,
+    ):
+        """
+        Initializes the tensor model-parallel MLP PromptEncoderMLP module.
+        Args:
+            virtual_tokens: the number of virtual tokens
+            bottleneck_dim: the MLP bottleneck dimension
+            embedding_dim: the virtual token embedding dimension
+            init_std: the MLP init std value
+            output_dim: the output dimension
+        """
+        super().__init__()
+        self.bottleneck_dim = bottleneck_dim
+        self.embedding_dim = embedding_dim
+        self.output_dim = output_dim
+        self.virtual_tokens = virtual_tokens
+        self.activation = "gelu"
+
+        sequence_parallel = False
+        gradient_accumulation_fusion = False
+        # (@adithyare) persistent=False keeps the indices buffer out of this module's state_dict.
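+        # indices is simply [0, 1, ..., virtual_tokens - 1]; the forward pass embeds all of them
+        # at once and broadcasts the result across the batch.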
+ self.register_buffer("indices", torch.LongTensor(list(range(self.virtual_tokens))), persistent=False) + self.embedding = torch.nn.Embedding(self.virtual_tokens, self.embedding_dim) + self.first = ColumnParallelLinear( + self.embedding_dim, + self.bottleneck_dim, + gather_output=False, + init_method=init_method_normal(init_std), + skip_bias_add=True, + use_cpu_initialization=False, + bias=True, + sequence_parallel_enabled=sequence_parallel, + gradient_accumulation_fusion=gradient_accumulation_fusion, + ) + self.second = RowParallelLinear( + self.bottleneck_dim, + self.output_dim, + input_is_parallel=True, + init_method=init_method_normal(init_std), + skip_bias_add=True, + use_cpu_initialization=False, + bias=True, + sequence_parallel_enabled=sequence_parallel, + gradient_accumulation_fusion=gradient_accumulation_fusion, + ) + # Setup adapter strategy + self.setup_adapter_strategy(adapter_strategy) + + def forward(self, batch_size): + input_embeds = self.embedding(self.indices).unsqueeze(0) + intermediate_parallel, bias_parallel = self.first(input_embeds) + intermediate_parallel = fused_bias_gelu(intermediate_parallel, bias_parallel) + output_embeds, bias_parallel = self.second(intermediate_parallel) + output_embeds = output_embeds + bias_parallel + output_embeds = output_embeds.transpose(0, 1) + output_embeds = output_embeds.expand(self.virtual_tokens, batch_size, self.output_dim) + return output_embeds + + +@dataclass +class PromptEncoderAdapterConfig: + virtual_tokens: int + bottleneck_dim: int + embedding_dim: int + init_std: float + output_dim: int + adapter_strategy: Optional[Any] = adapter_mixin_strategies.ResidualAddAdapterStrategyConfig() + _target_: str = "{0}.{1}".format(PromptEncoderAdapter.__module__, PromptEncoderAdapter.__name__) diff --git a/nemo/collections/nlp/modules/common/megatron/attention.py b/nemo/collections/nlp/modules/common/megatron/attention.py index 5c2267a25e44..852a3e3c4f88 100644 --- a/nemo/collections/nlp/modules/common/megatron/attention.py +++ b/nemo/collections/nlp/modules/common/megatron/attention.py @@ -18,7 +18,11 @@ import torch.nn.functional as F from einops import rearrange, repeat -from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import AdapterName, InfusedAdapterConfig +from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ( + AdapterName, + InfusedAdapterConfig, + LoraKQVAdapterConfig, +) from nemo.collections.nlp.modules.common.megatron.fused_softmax import MatchedScaleMaskSoftmax from nemo.collections.nlp.modules.common.megatron.module import MegatronModule from nemo.collections.nlp.modules.common.megatron.rotary_pos_embedding import apply_rotary_pos_emb @@ -108,7 +112,7 @@ def __init__( self.megatron_legacy = megatron_legacy - self.set_accepted_adapter_types([InfusedAdapterConfig._target_]) + self.set_accepted_adapter_types([InfusedAdapterConfig._target_, LoraKQVAdapterConfig._target_]) if kv_channels is None: assert ( @@ -360,6 +364,11 @@ def forward( if self.attention_type == AttnType.self_attn: # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] mixed_x_layer, _ = self.query_key_value(hidden_states) + if self.is_adapter_available(): + lora_kqv_adapter = self.get_adapter_module(AdapterName.LORA_KQV_ADAPTER) + if lora_kqv_adapter: + lora_mixed_x_layer = lora_kqv_adapter(hidden_states) + mixed_x_layer = mixed_x_layer + lora_mixed_x_layer # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn] new_tensor_shape = mixed_x_layer.size()[:-1] + ( diff --git 
a/nemo/collections/nlp/modules/common/megatron/clip_grads.py b/nemo/collections/nlp/modules/common/megatron/clip_grads.py index 68a97485edf6..a1620931a695 100644 --- a/nemo/collections/nlp/modules/common/megatron/clip_grads.py +++ b/nemo/collections/nlp/modules/common/megatron/clip_grads.py @@ -20,6 +20,7 @@ from torch import inf from nemo.collections.nlp.modules.common.megatron.module import param_is_not_shared +from nemo.utils import logging try: import amp_C @@ -91,7 +92,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): grads_for_norm.append(grad) if not grads_for_norm: - raise ValueError("No grads found, please disable gradient clipping") + logging.warning("No grads found, consider disabling gradient clipping") # Norm parameters. max_norm = float(max_norm) @@ -100,7 +101,8 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): # Calculate norm. if norm_type == inf: - total_norm = max(grad.abs().max() for grad in grads_for_norm) + if grads_for_norm: # (@adithyare) grads_for_norm can be empty for adapter training with pp>1 + total_norm = max(grad.abs().max() for grad in grads_for_norm) total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) # Take max across all model-parallel GPUs. torch.distributed.all_reduce( @@ -114,9 +116,12 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): # Use apex's multi-tensor applier for efficiency reasons. # Multi-tensor applier takes a function and a list of list # and performs the operation on that list all in one kernel. - grad_norm, _ = multi_tensor_applier( - amp_C.multi_tensor_l2norm, dummy_overflow_buf, [grads_for_norm], False # no per-parameter norm - ) + if grads_for_norm: # (@adithyare) grads_for_norm can be empty for adapter training with pp>1 + grad_norm, _ = multi_tensor_applier( + amp_C.multi_tensor_l2norm, dummy_overflow_buf, [grads_for_norm], False # no per-parameter norm + ) + else: + grad_norm = 0.0 # Since we will be summing across data parallel groups, # we need the pow(norm-type). total_norm = grad_norm ** norm_type @@ -127,14 +132,18 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): total_norm += grad_norm ** norm_type # Sum across all model-parallel GPUs. + total_norm_cuda = torch.cuda.FloatTensor( + [float(total_norm)] + ) # (@adithyare) total_norm can be a float at this point so we convert it to cuda.FloatTensor torch.distributed.all_reduce( - total_norm, op=torch.distributed.ReduceOp.SUM, group=parallel_state.get_model_parallel_group() + total_norm_cuda, op=torch.distributed.ReduceOp.SUM, group=parallel_state.get_model_parallel_group() ) - total_norm = total_norm.item() ** (1.0 / norm_type) + total_norm = total_norm_cuda[0].item() + total_norm = total_norm ** (1.0 / norm_type) # Scale. clip_coeff = max_norm / (total_norm + 1.0e-6) - if clip_coeff < 1.0: + if clip_coeff < 1.0 and grads: # (@adithyare) grads can be empty for adapter training. 
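+        # scale every grad in-place by clip_coeff = max_norm / total_norm so that the global
+        # norm of the clipped gradients becomes max_norm.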
dummy_overflow_buf = torch.cuda.IntTensor([0]) multi_tensor_applier(amp_C.multi_tensor_scale, dummy_overflow_buf, [grads, grads], clip_coeff) diff --git a/nemo/collections/nlp/modules/common/megatron/language_model.py b/nemo/collections/nlp/modules/common/megatron/language_model.py index 0ab2ae79bed1..c946038fb7a9 100755 --- a/nemo/collections/nlp/modules/common/megatron/language_model.py +++ b/nemo/collections/nlp/modules/common/megatron/language_model.py @@ -15,6 +15,10 @@ """Transformer based language model.""" import torch +from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ( + AdapterName, + PromptEncoderAdapterConfig, +) from nemo.collections.nlp.modules.common.megatron.layer_type import LayerType from nemo.collections.nlp.modules.common.megatron.module import MegatronModule from nemo.collections.nlp.modules.common.megatron.rotary_pos_embedding import RotaryEmbedding @@ -25,6 +29,7 @@ init_method_normal, scaled_init_method_normal, ) +from nemo.core import adapter_mixins try: from apex.transformer.enums import AttnMaskType @@ -410,7 +415,7 @@ def load_state_dict(self, state_dict, strict=True): ) -class TransformerLanguageModel(MegatronModule): +class TransformerLanguageModel(MegatronModule, adapter_mixins.AdapterModuleMixin): """Transformer language model. Arguments: @@ -639,6 +644,7 @@ def __init__( init_method=self.init_method, ) self._output_layer_key = 'output_layer' + self.set_accepted_adapter_types([PromptEncoderAdapterConfig._target_]) def set_input_tensor(self, input_tensor): """ See megatron.model.transformer.set_input_tensor()""" @@ -671,7 +677,21 @@ def forward( ): # Embeddings. if self.pre_process and encoder_input is None: + encoder_input = self.embedding(enc_input_ids, enc_position_ids, token_type_ids=token_type_ids) + if self.is_adapter_available(): + _sq, _bs, _hs = encoder_input.size() + ptuning_adapter = self.get_adapter_module(AdapterName.PTUNING_ADAPTER) + v = ptuning_adapter.virtual_tokens + if ptuning_adapter and _sq >= v: # The sequence should be longer the v to insert virtual embeddings. + strategy = ptuning_adapter.adapter_strategy + virtual_embeddings = self.forward_single_enabled_adapter_( + _bs, ptuning_adapter, adapter_name=AdapterName.PTUNING_ADAPTER, adapter_strategy=strategy, + ) + encoder_input = encoder_input[ + v:, :, : + ] # the first v tokens are pads so that they can be swapped out with virtual embeddings. + encoder_input = torch.concat([virtual_embeddings, encoder_input], dim=0) else: pass @@ -696,6 +716,7 @@ def forward( rotary_pos_emb = self.rotary_pos_emb(encoder_input.size(0)) else: rotary_pos_emb = None + # encoder. if enc_hidden_states is None: encoder_output = self.encoder( diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 3b11eb838a2f..0a27c135e588 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -397,6 +397,101 @@ def restore_from( return instance +class PEFTSaveRestoreConnector(NLPSaveRestoreConnector): + """ + PEFT models require the ability to load/save a small subset of the full model (once PEFT params have been infused into the base model.) + The PEFTSaveRestoreConnector is used to allow loading and saving only the PEFT params while not saving the entire model. + + Args: + peft_model_nemo_path: Used to provide the .nemo file corresponding to a PEFT model (which will only contain a small set of params) + peft_model_ckpt_path: Used to provide the path to .ckpt files of a PEFt model. 
+        This is required when no .nemo is available (yet), such as during resumed training.
+        If both are provided, peft_model_ckpt_path takes precedence.
+        If neither is provided, PEFT params are initialized at random (not loaded from any external source).
+    """
+
+    def __init__(self, peft_model_nemo_path: Optional[str] = None, peft_model_ckpt_path: Optional[str] = None) -> None:
+        super().__init__()
+        self.peft_model_ckpt_name = "model_weights.ckpt"
+        if peft_model_ckpt_path:
+            # First we try to load an adapter ckpt path;
+            # this is given priority over loading from a nemo path to make resumption of training possible
+            ckpt_name = os.path.basename(peft_model_ckpt_path)
+            if not ckpt_name.strip() == '':
+                # update the weights file name inside the ckpt path rank folders
+                self.peft_model_ckpt_name = ckpt_name
+            self.peft_model_ckpt_dir = os.path.dirname(peft_model_ckpt_path)
+            assert os.path.isdir(self.peft_model_ckpt_dir)
+            self.peft_model_nemo_path = None
+        elif peft_model_nemo_path:
+            # If resumption is not possible we try to load an adapter nemo path
+            self.peft_model_nemo_path = peft_model_nemo_path
+            assert os.path.exists(self.peft_model_nemo_path)
+            self.peft_model_ckpt_dir = None
+        else:
+            # We are not resuming training from a nemo file or a ckpt;
+            # we are training the adapter from random initialization
+            self.peft_model_nemo_path = None
+            self.peft_model_ckpt_dir = None
+
+    def _load_state_dict_from_disk(self, model_weights, map_location=None):
+        """
+        Infuse the state_dict of the base model with PEFT params from either a peft_model_nemo_path or peft_model_ckpt_path
+        """
+        # first load the base model weights
+        base_model_state_dict = super()._load_state_dict_from_disk(model_weights, map_location)
+        # next, we want to load the PEFT model's weights
+        if self.peft_model_nemo_path:
+            # if the PEFT weights are provided in a .nemo file
+            # we need to untar the .nemo if it's still tarred
+            with tempfile.TemporaryDirectory() as tmpdir:
+                self._unpack_nemo_file(self.peft_model_nemo_path, tmpdir)
+                model_weights_path = self._inject_model_parallel_rank_for_ckpt(tmpdir, self.peft_model_ckpt_name)
+                peft_state_dict = torch.load(model_weights_path, map_location)
+        elif self.peft_model_ckpt_dir:
+            # if the PEFT weights are provided as a ckpt path
+            # we don't need to untar
+            model_weights_path = self._inject_model_parallel_rank_for_ckpt(
+                self.peft_model_ckpt_dir, self.peft_model_ckpt_name
+            )
+            peft_state_dict = torch.load(model_weights_path, map_location)['state_dict']
+        else:
+            peft_state_dict = {}
+        base_model_state_dict.update(peft_state_dict)  # add the PEFT state_dict into the base model's state_dict
+        return base_model_state_dict
+
+    def restore_from(
+        self,
+        calling_cls,
+        restore_path: str,
+        override_config_path: Optional[Union[OmegaConf, str]] = None,
+        map_location: Optional[torch.device] = None,
+        strict: bool = True,
+        return_config: bool = False,
+        trainer: Trainer = None,
+    ):
+        """
+        Extends the restore_from method of the `NLPSaveRestoreConnector` so that PEFT params are inserted into the state_dict; this is required when training a PEFT model from scratch.
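+        When neither a PEFT .nemo file nor a ckpt dir was given, the freshly initialized PEFT
+        params of the instance are merged into the base state_dict so that loading does not fail
+        on missing adapter keys.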
+ """ + # Get path where the command is executed - the artifacts will be "retrieved" there + # (original .nemo behavior) + loaded_params = super().load_config_and_state_dict( + calling_cls, restore_path, override_config_path, map_location, strict, return_config, trainer, + ) + if not isinstance(loaded_params, tuple) or return_config is True: + return loaded_params + conf, instance, state_dict = loaded_params + state_dict = self.modify_state_dict(conf, state_dict) + + if ( + self.peft_model_nemo_path is None and self.peft_model_ckpt_dir is None + ): # we have this check only for training PEFT from scratch + peft_state_dict = instance.get_peft_state_dict() + state_dict.update(peft_state_dict) + self.load_instance_with_state_dict(instance, state_dict, strict) + logging.info(f'Model {instance.__class__.__name__} was successfully restored from {restore_path}.') + return instance + + class PipelineMixedPrecisionPlugin(NativeMixedPrecisionPlugin): """ Overrides PTL autocasting to not wrap training/val/test_step. We do this because we have the megatron-core fwd/bwd functions in training_step. diff --git a/nemo/core/optim/optimizer_with_main_params.py b/nemo/core/optim/optimizer_with_main_params.py index cab5e84fda2f..c9790ee2a139 100644 --- a/nemo/core/optim/optimizer_with_main_params.py +++ b/nemo/core/optim/optimizer_with_main_params.py @@ -492,7 +492,8 @@ def get_parameters(self): params = [] for param_group in self.optimizer.param_groups: for param in param_group['params']: - params.append(param) + if param.requires_grad: # (@adithyare) added to enable pp>1 training for adapters + params.append(param) return params # Promote state so it can be retrieved or set via From da6bbec7f4f631704ddd8834caf1a31f681e043e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sun, 7 May 2023 16:38:13 -0600 Subject: [PATCH 15/62] cache and reuse inputs (#6422) (#6452) Co-authored-by: Sangkug Lym Co-authored-by: Eric Harper --- .../language_modeling/megatron/gpt_dataset.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_dataset.py index a17ba6be3cb4..d2aa5182b716 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_dataset.py @@ -321,6 +321,8 @@ def __init__( self.reset_position_ids = cfg.data.get('reset_position_ids', False) self.reset_attention_mask = cfg.data.get('reset_attention_mask', False) self.eod_mask_loss = cfg.data.get('eod_mask_loss', False) + self.create_inputs = any([self.reset_position_ids, self.reset_attention_mask, self.eod_mask_loss]) + self.cached_inputs = False self.eos_id = tokenizer.eos_id self.no_seqlen_plus_one_input_tokens = cfg.data.get('no_seqlen_plus_one_input_tokens', False) self.add_extra_token = 1 @@ -406,9 +408,19 @@ def __getitem__(self, idx): tokens = text labels = torch.roll(text, shifts=-1, dims=0) labels[-1] = -1 - attention_mask, loss_mask, position_ids = _create_ltor_masks_and_position_ids( - tokens, self.eos_id, self.reset_position_ids, self.reset_attention_mask, self.eod_mask_loss, - ) + if self.create_inputs or not self.cached_inputs: + attention_mask, loss_mask, position_ids = _create_ltor_masks_and_position_ids( + tokens, self.eos_id, self.reset_position_ids, self.reset_attention_mask, self.eod_mask_loss, + ) + if not self.create_inputs: + 
self.cached_attention_mask = attention_mask
+                self.cached_loss_mask = loss_mask
+                self.cached_position_ids = position_ids
+                self.cached_inputs = True
+        else:
+            attention_mask = self.cached_attention_mask
+            loss_mask = self.cached_loss_mask
+            position_ids = self.cached_position_ids
         loss_mask[labels == -1] = 0.0
         tokens[tokens == -1] = 0
         labels[labels == -1] = 0

From 1d813a372ab51688e3af6395d905a4c0366ffd23 Mon Sep 17 00:00:00 2001
From: Somshubra Majumdar
Date: Mon, 8 May 2023 09:19:37 -0700
Subject: [PATCH 16/62] Add patches for Virtual Parallel conversion (#6589)

* Add patches for Virtual Parallel conversion

Signed-off-by: smajumdar

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: smajumdar
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .../nlp/language_modeling/megatron_change_num_partitions.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/examples/nlp/language_modeling/megatron_change_num_partitions.py b/examples/nlp/language_modeling/megatron_change_num_partitions.py
index a4b28fa4d761..558986e3da36 100644
--- a/examples/nlp/language_modeling/megatron_change_num_partitions.py
+++ b/examples/nlp/language_modeling/megatron_change_num_partitions.py
@@ -125,7 +125,7 @@ def set_virtual_parallel_rank_safely(rank: int):
         parallel_state.set_virtual_pipeline_model_parallel_rank(rank)

         if rank is None:
-            parallel_state.set_virtual_pipeline_model_parallel_world_size(0)
+            parallel_state.set_virtual_pipeline_model_parallel_world_size(None)

 except (ImportError, ModuleNotFoundError):
     logging.warning("`megatron-core` not installed, cannot set virtual parallel rank !")
@@ -861,6 +861,10 @@ def main():
     convert_vp = vp_size > 1

     if convert_vp:
+        from megatron.core import parallel_state
+
+        parallel_state.set_virtual_pipeline_model_parallel_world_size(vp_size)
+
         hparams_filepath = args.hparams_file
         if hparams_filepath is None:
             logging.warning(

From 0d17944004949003f12a5a735703327d93ec1129 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Mon, 8 May 2023 11:56:27 -0700
Subject: [PATCH 17/62] Pass `.scale` instead of scaler object to core (#6551)

* pass .scale instead of scaler object to core (#6545)

Signed-off-by: Abhinav Khattar
Co-authored-by: Eric Harper

* Update megatron_gpt_model.py

Signed-off-by: Abhinav Khattar

* scale changes for main

Signed-off-by: Abhinav Khattar

---------

Signed-off-by: Abhinav Khattar
Co-authored-by: Abhinav Khattar
Co-authored-by: Eric Harper
---
 .../nlp/models/language_modeling/megatron_bert_model.py      | 2 +-
 .../nlp/models/language_modeling/megatron_finetune_model.py  | 2 +-
 .../nlp/models/language_modeling/megatron_gpt_model.py       | 2 +-
 .../language_modeling/megatron_gpt_prompt_learning_model.py  | 2 +-
 .../nlp/models/language_modeling/megatron_gpt_sft_model.py   | 2 +-
 .../language_modeling/megatron_lm_encoder_decoder_model.py   | 2 +-
 .../language_modeling/megatron_t5_prompt_learning_model.py   | 2 +-
 .../nlp/models/machine_translation/megatron_nmt_model.py     | 2 +-
 nemo/collections/nlp/parts/nlp_overrides.py                  | 3 ---
 9 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py
index a7a22bb18150..cd50f8414470 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py
+++ 
b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py @@ -309,7 +309,7 @@ def training_step(self, dataloader_iter, batch_idx): forward_only=False, tensor_shape=tensor_shape, dtype=self.autocast_dtype, - grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None, + grad_scaler=self.trainer.precision_plugin.scaler.scale if self.cfg.precision == 16 else None, sequence_parallel=self.cfg.get('sequence_parallel', False), enable_autocast=True, ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index fb58ec6a843b..b7a9fb476409 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -298,7 +298,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): tensor_shape=tensor_shape, decoder_seq_length=dec_seq_length, dtype=self.autocast_dtype, - grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None, + grad_scaler=self.trainer.precision_plugin.scaler.scale if self.cfg.precision == 16 else None, sequence_parallel=self.cfg.get('sequence_parallel', False), enable_autocast=True, ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 7159190fdec7..5cab67a71441 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -346,7 +346,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): forward_only=forward_only, tensor_shape=tensor_shape, dtype=self.autocast_dtype, - grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None, + grad_scaler=self.trainer.precision_plugin.scaler.scale if self.cfg.precision == 16 else None, sequence_parallel=self.cfg.get('sequence_parallel', False), enable_autocast=True, ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py index 49cb078cd462..dd0d9168c16a 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py @@ -307,7 +307,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): forward_only=forward_only, tensor_shape=tensor_shape, dtype=self.autocast_dtype, - grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None, + grad_scaler=self.trainer.precision_plugin.scaler.scale if self.cfg.precision == 16 else None, sequence_parallel=self.cfg.get('sequence_parallel', False), enable_autocast=True, ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index a28b8216c207..7c3bddc9a08c 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -300,7 +300,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): forward_only=forward_only, tensor_shape=tensor_shape, dtype=self.autocast_dtype, - grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None, + 
grad_scaler=self.trainer.precision_plugin.scaler.scale if self.cfg.precision == 16 else None, sequence_parallel=self.cfg.get('sequence_parallel', False), enable_autocast=True, ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py index b3ecc1b150ac..365b1870a2d5 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py @@ -327,7 +327,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): tensor_shape=tensor_shape, decoder_seq_length=self.max_decoder_seq_length, dtype=self.autocast_dtype, - grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None, + grad_scaler=self.trainer.precision_plugin.scaler.scale if self.cfg.precision == 16 else None, enable_autocast=True, ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py index 4fce103ebc3b..410bf338394b 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py @@ -195,7 +195,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): tensor_shape=tensor_shape, decoder_seq_length=dec_seq_length, dtype=self.autocast_dtype, - grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None, + grad_scaler=self.trainer.precision_plugin.scaler.scale if self.cfg.precision == 16 else None, sequence_parallel=self.cfg.get('sequence_parallel', False), enable_autocast=True, ) diff --git a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py index ff1888c1c9ea..248a3c8e2ec0 100644 --- a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py +++ b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py @@ -314,7 +314,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): tensor_shape=tensor_shape, decoder_seq_length=decoder_seq_length, dtype=self.autocast_dtype, - grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None, + grad_scaler=self.trainer.precision_plugin.scaler.scale if self.cfg.precision == 16 else None, sequence_parallel=self.cfg.get('sequence_parallel', False), enable_autocast=True, ) diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 0a27c135e588..a43e06669489 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -546,9 +546,6 @@ def __init__( self.hysteresis = hysteresis self._hysteresis_tracker = self.hysteresis - def __call__(self, outputs): - return self.scale(outputs) - def _unscale_grads_(self, optimizer, *args): if getattr(optimizer, "_custom_amp_unscale_grads", False): return optimizer.unscale_grads(*args) From 292e10060e33029a4599e8f02491a4f4150b140b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 8 May 2023 13:19:23 -0700 Subject: [PATCH 18/62] Documentation for ASR-TTS models (#6594) (#6595) * Add docs about hybrid ASR-TTS models * Add docs about text-only datasets * Add docs about ASR-TTS checkpoints * Add docs 
about ASR-TTS configs and training

* Clean up

* ASR-TTS docs: add to api, fix imports

* Clean up

* Wrap optional import

* Revert general ASR import

---------

Signed-off-by: Vladimir Bataev
Co-authored-by: Vladimir Bataev
---
 docs/source/asr/api.rst                       |  18 +++
 docs/source/asr/configs.rst                   | 107 ++++++++++++++++++
 docs/source/asr/datasets.rst                  |  22 +++-
 .../asr/images/hybrid_asr_tts_model.png       | Bin 0 -> 112870 bytes
 docs/source/asr/models.rst                    |  27 +++++
 docs/source/asr/results.rst                   |  11 ++
 docs/source/tts/models.rst                    |   2 +
 nemo/collections/asr/data/text_to_text.py     |  13 ++-
 .../asr/models/hybrid_asr_tts_models.py       |   4 +-
 9 files changed, 198 insertions(+), 6 deletions(-)
 create mode 100644 docs/source/asr/images/hybrid_asr_tts_model.png

diff --git a/docs/source/asr/api.rst b/docs/source/asr/api.rst
index 5735990dc82a..1e2073798d64 100644
--- a/docs/source/asr/api.rst
+++ b/docs/source/asr/api.rst
@@ -35,6 +35,11 @@ Model Classes
     :members: setup_training_data, setup_optimization, setup_validation_data, setup_test_data, register_artifact


+.. autoclass:: nemo.collections.asr.models.hybrid_asr_tts_models.ASRWithTTSModel
+    :show-inheritance:
+    :members: from_asr_config, from_pretrained_models, save_asr_model_to, setup_training_data
+
+
 Modules
 -------

@@ -131,6 +136,19 @@ Character Encoding Datasets
     :show-inheritance:
     :members:

+
+Text-to-Text Datasets for Hybrid ASR-TTS models
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: nemo.collections.asr.data.text_to_text.TextToTextDataset
+    :show-inheritance:
+    :members:
+
+.. autoclass:: nemo.collections.asr.data.text_to_text.TextToTextIterableDataset
+    :show-inheritance:
+    :members:
+
+
 Subword Encoding Datasets
 ~~~~~~~~~~~~~~~~~~~~~~~~~

diff --git a/docs/source/asr/configs.rst b/docs/source/asr/configs.rst
index bd42ac45f9f2..fc48bc06b3ca 100644
--- a/docs/source/asr/configs.rst
+++ b/docs/source/asr/configs.rst
@@ -878,6 +878,113 @@ FastEmit Regularization is supported for the default Numba based WarpRNNT loss.

 Refer to the above paper for results and recommendations of ``fastemit_lambda``.

+.. _Hybrid-ASR-TTS_model__Config:
+
+Hybrid ASR-TTS Model Configuration
+----------------------------------
+
+The :ref:`Hybrid ASR-TTS model ` consists of three parts:
+
+* ASR model (``EncDecCTCModelBPE`` or ``EncDecRNNTBPEModel``)
+* TTS Mel Spectrogram Generator (currently, only the :ref:`FastPitch ` model is supported)
+* Enhancer model (optional)
+
+The config also allows specifying a :ref:`text-only dataset `.
+
+Main parts of the config:
+
+* ASR model
+    * ``asr_model_path``: path to the ASR model checkpoint (`.nemo`) file; loaded only once, then the config of the ASR model is stored in the ``asr_model`` field
+    * ``asr_model_type``: needed only when training from scratch; ``rnnt_bpe`` corresponds to ``EncDecRNNTBPEModel``, ``ctc_bpe`` to ``EncDecCTCModelBPE``
+    * ``asr_model_fuse_bn``: fuses BatchNorm in the pretrained ASR model; can improve quality in the finetuning scenario
+* TTS model
+    * ``tts_model_path``: path to the pretrained TTS model checkpoint (`.nemo`) file; loaded only once, then the config of the model is stored in the ``tts_model`` field
+* Enhancer model
+    * ``enhancer_model_path``: optional path to the enhancer model; loaded only once, the config is stored in the ``enhancer_model`` field
+* ``train_ds``
+    * ``text_data``: properties related to text-only data
+        * ``manifest_filepath``: path (or paths) to :ref:`text-only dataset ` manifests
+        * ``speakers_filepath``: path (or paths) to the text file containing speaker ids for the multi-speaker TTS model (speakers are sampled randomly during training)
+        * ``min_words`` and ``max_words``: parameters to filter text-only manifests by the number of words
+        * ``tokenizer_workers``: number of workers for initial tokenization (when loading the data); ``num_CPUs / num_GPUs`` is a recommended value
+    * ``asr_tts_sampling_technique``, ``asr_tts_sampling_temperature``, ``asr_tts_sampling_probabilities``: sampling parameters for text-only and audio-text data (if both are specified); see the parameters of ``nemo.collections.common.data.ConcatDataset``
+    * all other components are similar to those of conventional ASR models
+* ``validation_ds`` and ``test_ds`` correspond to the underlying ASR model
+
+
+.. code-block:: yaml
+
+    model:
+      sample_rate: 16000
+
+      # asr model
+      asr_model_path: ???
+      asr_model: null
+      asr_model_type: null  # rnnt_bpe or ctc_bpe, needed only if instantiating from config, otherwise type is auto inferred
+      asr_model_fuse_bn: false  # only ConformerEncoder supported now, use false for other models
+
+      # tts model
+      tts_model_path: ???
+      tts_model: null
+
+      # enhancer model
+      enhancer_model_path: null
+      enhancer_model: null
+
+      train_ds:
+        text_data:
+          manifest_filepath: ???
+          speakers_filepath: ???
+          min_words: 1
+          max_words: 45  # 45 - recommended value, ~16.7 sec for LibriSpeech
+          tokenizer_workers: 1
+        asr_tts_sampling_technique: round-robin  # random, round-robin, temperature
+        asr_tts_sampling_temperature: null
+        asr_tts_sampling_probabilities: null  # [0.5,0.5] – ASR,TTS
+        manifest_filepath: ???
+        batch_size: 16  # you may increase batch_size if your memory allows
+        # other params
+
+Finetuning
+~~~~~~~~~~~
+
+To finetune an existing ASR model using text-only data, use the ``/examples/asr/asr_with_tts/speech_to_text_bpe_with_text_finetune.py`` script with the corresponding config ``/examples/asr/conf/asr_tts/hybrid_asr_tts.yaml``.
+
+Please specify the paths to all the required models (ASR, TTS, and Enhancer checkpoints), along with ``train_ds.text_data.manifest_filepath`` and ``train_ds.text_data.speakers_filepath``.
+
+.. code-block:: shell
+
+    python speech_to_text_bpe_with_text_finetune.py \
+        model.asr_model_path= \
+        model.tts_model_path= \
+        model.enhancer_model_path= \
+        model.asr_model_fuse_bn= \
+        model.train_ds.manifest_filepath= \
+        model.train_ds.text_data.manifest_filepath= \
+        model.train_ds.text_data.speakers_filepath= \
+        model.train_ds.text_data.tokenizer_workers=4 \
+        model.validation_ds.manifest_filepath= \
+        model.train_ds.batch_size=
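+The hybrid model can also be assembled programmatically via ``ASRWithTTSModel`` (see the API section). The sketch below assumes ``from_pretrained_models`` accepts checkpoint paths as keyword arguments; the method names come from the API reference, while the exact signature and all paths should be treated as placeholders.
+
+.. code-block:: python
+
+    from nemo.collections.asr.models.hybrid_asr_tts_models import ASRWithTTSModel
+
+    # Assumed signature: method names are from the API reference above,
+    # the keyword arguments and paths are placeholders.
+    model = ASRWithTTSModel.from_pretrained_models(
+        asr_model_path="asr_model.nemo",
+        tts_model_path="tts_model.nemo",
+        enhancer_model_path="enhancer.nemo",  # optional
+    )
+    # Export only the ASR part for standard ASR inference.
+    model.save_asr_model_to("finetuned_asr_model.nemo")
+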
+Training from Scratch
+~~~~~~~~~~~~~~~~~~~~~
+
+To train an ASR model from scratch using text-only data, use the ``/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py`` script with a conventional ASR model config, e.g. ``/examples/asr/conf/conformer/conformer_ctc_bpe.yaml`` or ``/examples/asr/conf/conformer/conformer_transducer_bpe.yaml``.
+
+Please specify the ASR model type, the path to the TTS model, and (optionally) the enhancer, along with the text-only data-related fields.
+
+.. code-block:: shell
+
+    python speech_to_text_bpe_with_text.py \
+        ++asr_model_type= \
+        ++tts_model_path= \
+        ++enhancer_model_path= \
+        ++model.train_ds.text_data.manifest_filepath= \
+        ++model.train_ds.text_data.speakers_filepath= \
+        ++model.train_ds.text_data.min_words=1 \
+        ++model.train_ds.text_data.max_words=45 \
+        ++model.train_ds.text_data.tokenizer_workers=4
+
 Fine-tuning Configurations
 --------------------------

diff --git a/docs/source/asr/datasets.rst b/docs/source/asr/datasets.rst
index b55e49ad1c8f..5f74510bd054 100644
--- a/docs/source/asr/datasets.rst
+++ b/docs/source/asr/datasets.rst
@@ -481,4 +481,24 @@ An example using an AIS cluster at ``hostname:port`` with a tarred dataset for t
     model.train_ds.tarred_audio_filepaths=ais://train_bucket/audio__OP_0..511_CL_.tar \
     ++model.train_ds.defer_setup=true \
     mode.validation_ds.manifest_filepath=ais://validation_bucket/validation_manifest.json \
-    ++model.validation_ds.defer_setup=true
\ No newline at end of file
+    ++model.validation_ds.defer_setup=true
+
+
+.. _Hybrid-ASR-TTS_model__Text-Only-Data:
+
+Preparing Text-Only Data for Hybrid ASR-TTS Models
+--------------------------------------------------
+
+:ref:`Hybrid ASR-TTS models ` require a text-only dataset for training the ASR model.
+Each record in the dataset (in a ``.json`` file) should contain the following fields:
+
+* ``text``: text to use as a target for the ASR model
+* ``tts_text`` and/or ``tts_text_normalized``: text to use as a source for the TTS model. ``tts_text_normalized`` should contain normalized text for the TTS model. If there is no such field, ``tts_text`` will be used after normalization using the normalizer from the TTS model. It is highly recommended to normalize the text and create the ``tts_text_normalized`` field manually, since current normalizers are unsuitable for processing a large amount of text on the fly.
+
+**Example record:**
+
+.. code-block:: json
+
+    {"text": "target for one hundred billion parameters asr model",
+    "tts_text": "Target for 100B parameters ASR model.",
+    "tts_text_normalized": "Target for one hundred billion parameters ASR model."}
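+For large corpora it can be convenient to generate such records with a small script. A minimal sketch follows (standard library only; the file name and record contents are purely illustrative):
+
+.. code-block:: python
+
+    import json
+
+    # One JSON object per line, with the fields described above.
+    records = [
+        {
+            "text": "target for one hundred billion parameters asr model",
+            "tts_text": "Target for 100B parameters ASR model.",
+            "tts_text_normalized": "Target for one hundred billion parameters ASR model.",
+        },
+    ]
+    with open("text_only_manifest.json", "w", encoding="utf-8") as f:
+        for record in records:
+            f.write(json.dumps(record) + "\n")
+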
diff --git a/docs/source/asr/images/hybrid_asr_tts_model.png b/docs/source/asr/images/hybrid_asr_tts_model.png
new file mode 100644
index 0000000000000000000000000000000000000000..458c47b8b7723aeb1244511d2949a7c8110c131a
GIT binary patch
literal 112870
[base85-encoded PNG payload of docs/source/asr/images/hybrid_asr_tts_model.png (hybrid ASR-TTS model diagram, 112870 bytes) omitted]
zbm|P)G#;P!XZJ;KYvu~xw#Zx<#Re5YLYzq9W zG!A!Yex_8~lv`7^c$YnE^8RjCgC*N*lPvRBL4kR$_np@Duid~DPyFZshM`C6fM&rH zU254Fx*Tyo7@fMrPVG1zFTs>fV%>c0tmIczIL^@KvpKmRvX*;49|@4`UQ-Cm`rLik zEJPsn0YL*d;cle8r8Gkh`sv(_jt3$ZW(n;a+-Kl@M9|E&6N8UNUhZYZ+o+yp3*Lxj zJ__4&SkRus!Zh{0Y|(}8RLA)q`&ts5LXqECPDsCg%NV$Jpa0@eU9-9pw`3`$>8rqQ zD?Lw39c?bMf67_wJQT>I=|S*DNs>A~V|R8_e?0!uG*JY$PG|5&=jXC<-_45vNM^E!C)>+}XXkF}#h<7&~uQmSb?Q<>5I`;V>{?rl3gMcr$Q z%Qkvul)l!~I|k)x=d9SPSW81erg%aJ_^%C%*agDo$NaMb2Gl95SQ?8rVI)Ndi(F0K z)dxXL)l%Y;;*u5GCvOaupI$XNw%5==!SZlh)ITYd@Y)Mxr~^SRI>d{1a9&WI5HR51 zxoI<>2t9u0r+ffQexg3pLLApg)?WfewOGCL?LC}gKg8AC({wgjY2=)RL0oE5(o*em znjkLBu1K6pyyI=Qi)j4xBzS#m zK1C0eM+7yk(~-WPgUFA%e-$R}g{+R@>fh<%*Wk8G*8?uKmiG1EymU&BB9-3k^0h*y z@kOJoCdlmK4lc&j*ZI7uwMQLd$o{K|YLpv8%2YR{cMmC_$sm%NX*OX_sLry!0u;JE z&=bxl!ki+qklm9vEZ>au_~&;+;uvPuk-ZX0x*VXvZ$T3rnWRqPXCc;e71p6cvoZcs&a~ zDJ@|rV4^|3vGB1Cm`HJus|C;d?9;BT@VN}SaWS%F@30zJo;wzni0u=Y@j!}jC*vxFO^sGFEY4T67X7Lk-JU{GFn3$?$f&6=x~$l9<>$of zU9ff#Yqzmfm4$Fc*6!EhfV5VuJRAITdY8`3Of;y-$)Vj}5$_=LXj6QeW;vuZrU#nF z$ghTWSJPzjzd<6ELUz6U-HF>g`1F!%FU}$TFN&sF(cr!Z^H<4?U3H5fOTuk~LxLb-|yYe<@vCvpN zt=M_lbj+?>&a!Ei5UZnN%|1Bw0VI6G$UD`M`2E&9Jy$NRZjum3?{;p^e7JLSh2Zy&} zhhxQvvPMAP{D55bQ1>RHW>W?^Z&;`?CB@`c9(gE?ZH>PfJJp$C&UwYK!z%p~>2aga zM21zmaNx7L_T+xIvjN+y$#LhS?EPDeod?6II7G7<6;{J@kvo?DQ&OL8WbTV5Bp~x9 zg%kud?1kf22CcEhmP6s^cjdpnuh$iG0Q9@hbW9Ocd+f(P{{%o6!ZNLbU=vl2B1k2_ z2ozN)r$4sAE~8?lVg0SX0z)XVdmT?9L6^?k@TlkkKi1fw8CveWacNQa@IB-7`}S$n zMh)fX)?2byd9 z3I%d4!SA3)OX#8c>qI^-c}oR|i=%iZ#Esn{Ex40+{nI&zDahTq&(Mu8uuO}g!}_ER z$N=_OicBIwS*3nuxF?#*cO3B0E)z*)`myN@6k;o;@EkS_%W#x@n}zqtB$8AvB&4Vz zVOAFE-L!hb!P?T(yJ6pMtX{6Mvhz)~o79xvvZ9})<@19@1-#zHwCXFY&;)m6!90)=p?(!8l@H^w=(k$DpN17tiLu`yNr`?;j zph|W&wrik)ZI;GNmSomrggZ|lgr<&Zo+Q*rp4`PDnq1@v(Dq}t%t}a3XxLE{9JqZF zb7Ex#7ptS#c<$sy4fjNe*T zeGr{0Rn_?%ydE+l?6duNp_t)DeUw4M=c9!$SgsJhwZk-M>3sctI<|5%{rtgr(`S?X zLTUHm-H&B)AsZ&g&HZ;AV5Ks&$#O{G| zY8B$XX)SqthzaDPmKLl6RJ8&b6B+Xp@m*BHsAwlOSuF~3y6Tlj=5@H^+G?D+Q7RP!MGPM zUyHMmLpB76WMQ5^(278#r4`XSE@u@M#nHWQcSaG!o&wb&jxna@R~UU9B32b)wzQgts?-xN%83;=U#!AqItW zt<)|=PU?0V?bz#MQDtuP-1SVE3dncKJ-lx_+qi-KFi|wdwSPj(={CrL zK5n5e&~VaPXCK|$SM^S#f#UMyoy62Aa&6NwQ^ztzH&qg8llW?FXk}!qD&#Fb6&phF339(_v9;PV@8Vy zlZtO^ILRriXlwJTwtZ0{>m<9!p>y*CsfXN2JGnB~G#y6yuLKLNfyo3_=b{Q@IZ%e) zN#pne<8E8voaKUARGo`W=w+M|;^&r)WkKb3oAWCOcXf~xrQv}bqaH2anoHvi;=&8K z4^(8`eXiPNOiIwH0;<4ZN(j-vuYr|yKefSND2ZVHEW4mt7WDZJ`*m>;}FW?2; za3vkej`22DY1|_B+fs~nr)RlxuKK-u?@lXt!xJN$a1+HLi!8BA_dn|Y)�BvlIRD z69)3NC0ksMggkLtIH-|~mNqW5Fu{Y48&aMr%BzQ7%8l7?)>M!O3B10k5@L73yy8Q; zBZKT0#J)Z-pZD$ii#LHTNgiM5snS2f_;>l1vycCTAV2*hzde1Qj6lbP(HWpEjCF2W z2}kzg@vlumrMkP`O~wz78cq$bJYKA$)?PWre`x009s1u0=HT`#;uqHkB>IFi1NYdS zIcXhenU8_iCw0J=VfK^vbCbl3o|fbt13RXih~-xkyu5nl5ByQ13Y%oMdKfI_;2W`v zD2{y*Kg9T^n=~eW^)kJ;)2DClJ!KLDAMP6a?S!#>YdDSRqXm*0exW_*|C#!8NEo2@ zm}sMmGcUC4n=3jp3?-*mx|=BN40^KRpYu{ZP9N@2wBLSmRt3iWb$93go)`gU@qwdX za9%)jQj9|+Az{iMPg;`VytdZ(NvFjpHxd_OW;;CA%Ice7lE=#hs^%RcbgoievTmF_ z{adAhAmX(^KXx|WcCO&tYa@Q`xcpESni~J5e%;M7sXe*}Ol(0u*I)btMt*F2$7=vr zoi6ua`)8aVj$%Lv0JwY0(Z>aTg|APTPl#B3625D3GCWiMKe2-tZTCWKQqqD|N#WvJR3B5J7b z^JR1XLu^7Ey{>0}J!hKfgzzon!zXmhBp%&C*=Wh%;OEETYmWmHjJuyBJ$3 za3&vqLN?@k>LHp>&b?l!-!Y?X*yD(}?AsCceY@2~=4)cGAEU++XwpVYJS8mZ20IP2 z)TkYlr#XQeEnj*ee%%-=M8n|>p0RRRsgtR2CrCIz=?|notbzS&rEB5pWJ71|owjjT zDEg5VFyENUKIPy0@|9lOROyy>lJh-lP2#(f9ioBOG)!@I6#a8YmuLcfU6PHNDZa7^ zg}zTuh#F6r$F$QwruuHSGAZ?I*XLiTLF4_kzkM4-yNx<4m&pBGQ$rrpS< zsX0V*=u@$plXhg0Wt~|PX{-LUima%bVR@G!%$!goQ|{hS*4PaCbA)OS=#NJw#qWvZ z;&V2i-EivSmG|5!xRO(q-cX_)IN-z?)kpLyC=Vo2~DH5z}08%s2F)Pv$?f0Xv4VAjyB#KQ5eJw@+_ 
zc}#RgY$QD$J(dX>`Aaml95*hJ3K)<4cL$n#!|)h4{!7+mM%>)7LxpPJKDM2IdxLuj zUC_iGpL5Nul%rTk=nz}6jOK5I%kM;gJg)0TN|%tP>#p#l{X?r0)dtg}JGSe{n$DZG!wr6}k5Q(n(#|r#Vr*#g%Hi>@xlf(Ea(^NEidu{@2VM+yn*;P}A9>U*h*m(|~NrdtjID!SB_I z|FPP??hFKi?48K6SAQ=3rD)NI+35av5r4M#|C=XqVYNQY2T%!>FcOiV7}11Ivdn&X zmptIBW~Rpfn7+zwYB>R$5rBPtuPsu3rMxOOJ$-dDZkn()3dR*37{Aw|v8kyiYa`y< z0`^lNj7C#;P+(|qFrDk!lP7dqT3Ve0O!9}Q={9V#)WT@|LT;5j|0O3L=WiC=79Oi~ zHnp0Pl8aK4W&ZX#e z-P2D2-*P^yIO5rXY!&0(~5 zt6$3v>%5m-=xJAzDYI=+-DP)M=gCA|jEArr>mQ>JST0F+{zI`TEJUE*H|!vslgFeB z>>2XuUTh|yRnMQ-I4S2Q<^c6Ec`|vd?T3&5^($;2cw#KyvK4#S=|hX%2xwq9^|mi* z4TV5_1(UbXD1G}$+laipU0;0MqLi%5!Pw0PKR6+;$>n#-^%B>e9sYHmRCWBz)~{bL=BVXN?i zD^zZ?C13k%*a2Syq(>g2Z%WhAFwsq%4CSV#VXENyyOCi#Cx8RU-0HK|tW~>MO#S8< zf0t;=>_0GtjR9E9T{U#Vq4@|klVa)z^s!u(%`*Sk+vgI%6$8fL1WsyAe%J^VoaW&h zvw!!TI+;qHii;uY(6uzRtb@;=SEXh*{DYn5UjqBv;ISZb=!r%%*qO?L`bk7;{IAgB zV7+YY2QcZ~$=Vl>IM-c|ssXj7!6$1V|H!K(7jUe#tO3WKKca>ljBqW`=a$+(7i<`3 zJ#4a3J`Bo;^H&>j2MeC60wwbV$aP{#6X($V)LM$^CnF$NU*FfrYF!iRGLQ<1>#NQb z#05)TG;xa}KED!ng`QdG-nVnYx8Ugs;_;o6Fe;ifNU_KEi3>Mx zI+?nh)pLdlw;Rjf@`gV7bJDS@!15~|f*b8#_gL)YyNor;My0P_=MSORS1Al8bA}cQ zu&K>-yl$=V+*BO_r_3hKM}OwYQLp5r_JsYJm>PdNFbvHi?+vXapz)=Uj_iKY8O{8A z9NB^SCk+eCG=%L{S_Dr>oTX>x?23&!3LKDH1Jh}hbisT_#`C)~Ze!2}<0_nukRrj= z^FLqggcz8jFb*$DJoJhytE`u8rV@$Ul<+yBpfy4Ip8osyJiUb*d6Y`pX+ z-@H8k1V+_@^r4&yxcCOw1=pq>Q*j8~r;8jOV7oS=QPJg{r(jD%mulj^Yxqajvci82n7CE!Dc_=4 zB&P9hY6WjamEZ~OIQA85{=}{?zUX_$evB+c6Ty65JKmx=M|;eFqkV(hvzd;Xg-u?4>QEOYU0V%6X&)cE zU|CU+K9F4~^D=8;G>jHO-f|tQ3JALRES~Z2#&QAkJCZ?WT!e#BuW6rcnuNmUh&%tfWT{QNTF=V7g1pZ%399Y9nW)=Lwm z>{?%;Idl5JLE7Ou#kVM4j@{pW=x55Ib15Ds7P2m&h{J+rwZ5xU=MZ8kNu9_3Nyc;( zt4%Uev+$0CyvW2!50{&3w}p+~Rlxda4s58}4xM*ZVs0hUS7Lpl!rjhb!}nK|{$oX9 z{f9pVvCWhwszx%>5{6#TWm`C@;o*A+#to%rQ{Ioew^!ZD| zshq1Nl7bi^=BZjtZP`w>dvD{>x<5WwXHEbaTWfQQyQ4>2sFPjyr55jZ{>ImUEowM; zu89k|{dIvwN!Vgm=+F7?ANyd^^w#SeM;hnl?*H*Ll>ex!OkE0h! z%3rhkLc%#X2>hJ$kEhpiKA+$ZeZ^Q7gk)eW>nT^yPe1x|wE29s7iaBq9=^S{lG@1# z=?2tg6TfG59zvlVEWhI0kHyok-5cc z#oA=`n9V%b?>p51nfEwFclqIku`{~T;WTkStosI7jnT1kNPQHMx8}P4*Ba03^Sgyp z<(Dkw0luLA+4J`Y0(Xc3Z=%BExmpHZX~@i}yV z7m|q|w9|gJnx1~)z1roK{-*K!B!M+I`Ow!HM@6M z1-tetPo;0}Nt_cr-^J}v8J7V;35r`*u1~F*Q}#BO1~J=2<%!4U!Zz<}5-tH>C&HfE zVUDRxDlvt;zQ#WPFZhTxqOPNzzx`_NouL|d*`Kf|AO{Yr-`|};-ST5Sp z7DdT%zo?TY=la?`F9bKnx!U!4>r_raHSb23SF7Yvs>NMHmfVgQR9}L#A|&}4*6+5V zAq#y&bdr>iBWgOR!fnkz<@;9kMh%m%_k$sG@~-n;ef_pe|D2r}V8tG>&0fDVpi1dF zkMV}ve3)Ixy}y4QvG0=l078Y5#x{O$N~^G<%vB?@?@x52ljRY6y@gY^4s+rurMj*! z)|;(K^XuPQgJX^k18FzYPS;UF+|d=9>YIm#D=)MD5fn9-!}CRrE-n@iO)p851x>IZ zK+%_iCoTmA#H*yP63F&Q06KoO%Cm7r5J|3qb(A|pKo3!v4tTE=8=T|aSP*r)(A+qZ zLsW3w_|DMNUP*@$CBM1h*A-4}PC^tok|H!`X^`OT*3dP^`9QtC+RDeeTCLs=>{;P* zvDm~!3@ps|a@S1Ob#irn_&ZaE->M(CivI`L%O9!AUn=;RW1V6y`t^P1fpMOXZ;qoR zb!8&*AWJpH5LN>%mEe%a4$81ZKUHKhf^0TpD{P&c5V(POF^DT?%np7(S40(x{SvlU zF#JIz89c0&+q-WiQHQ6&{Ac~h`IDkaRx@k>sa>ofQ@pucOCmwNH<&!N+M}{fGbhk# znTRB}Q7f!`x7<)uypQupKvnraVYAvr;GiDgIt%RQwe;qBpr=CD?JPCw+O0m`cc4+< zK@ubDX}GBriL_Y_^fl%?{Scf~d`OyqeXjzCVAn^wuU!;~ zt|!W^=p>QHw20>@jt{dR<~=T>UAlU+Z{@9ubkMpjT%#*4fGR;BIe0 zWrnrTn@#$7?KcC&ttir>+~><<$@|s1kCmt8FGSIk999~b`x0}alI7)tx>r~4`~^!? 
zVY&d6w|&gsTfcxL_2N$j_odD$$;b?Qi7NP{Hf1Y)pQKu$pN0)p4r0^0ZQCjmN0Njx zPhGIqi=TJk3$cLoiaDx8@8vyl<7V^m_2-`CWU5DVsNB^bx=BOUZUyT{7E( z7*kwrN=lt;vs6r^EQ&=l64;#;1$-qb_;XA}(1fqU<`^c^P8_m4KEOn?_%x=6Dy)T$ z+FL{gV&P;nFH^&zt3Y`tD~UQHXjbRq$o~9-D2Hew>q40eMoVXt?3KO@%UEOg{A=0O ztPYYi{gr8cYD1`zzEX$qlfoZD6C5(Sf?f8L%!OaQN4`Pee$cZYLK}b;+uX7 zX$VR7jmq#^Ai(A;s;uq4T7tSA9pLyh_wNXu`6Oq5`$K2Hrc}F536`?*2zU2z5!BGt zn@=|(b5Hd)`ln*dXq)NlUQNTcwGaiZKK3O$mN|r=#w#O|{BbLnjxRqPtihL1S*Rxa zHhe0`9rtn8+BDLSQ`Hsa&mzB0k%6LIfgkrj59!yGBLNYSn3R(qJJD+>!KPTkb;$DSC(idd>XzUAt&49os3=%yen6`GSW1X5EnpL-Ab1# zHJuJv2`W^KB-Xu5ll3gEi2mp3Zys2idy%hnRVgT|0mrV@Y)}DhM({OzG8x)H`j)oQ zSMyrZ{hJu~gVa)IBVWar!CukBAe0Sqm9P_^+W7>e8du9~-<7NyD6&a`ipQwz#$wpk z6ie;C>s@7F=J=)G?~6msqXxs8s~xoXZhLq@UX^xI;4TBni~=ZWgjkxn=6@O>%duR}*Ck?x>~X11&s#C(C z>2}FK?v08jg((}sWScT=J0 zHq+m|09aC5XftJcGg1F-rdIMUhTx-i+cO(++*D=PI*)q}txaiQB=}?k;a-cZsQn8L z>BC*-qf0%xZ!S>*hhlA%8w8M^CS`Kw2rfuXhi;3BUY)i7%RHHIA*26H@i{t(X};5I^3o(*kU3+p3v>Up4a6g2cGN1)W0J_Quq^n^E_B1^LODk z!B9f|meAEtU)=8w9nH#tUjoe~+xWiVW^5*&CT2`PNJodzcH@|#AUVqwpicOmIBQkv z$RWk5wT{@lN8j&_xKMNZmi=pw`46a9IzwsO-EV1`*D!k3@qMBty%gH>Km}BNRUa0B zMIeRQ5c_-p*kw?KD&k&NPtDjQx>^n~x(@N2Jn>NI=(QaNs#Vf>gJqfnMsu&NhYQCW zH52I!Lb6dkOx8)ua9%m2C=RZ>vasVwe;0>bmiu96ZZc|H)dnWEJ|efdDp^~t@&NvP z%myCZ{t`N4&I0-ktd;4He4{M1S(PqMGl+nVJ+Uag%+G%aGmKZ7%;R zkg6~~fgP^(iZ8%uaQr z1zQ}m*YV$R169}?c7W7~zMW58`7k{ggCd_pq%3r&oJVmb880w9J4>xY5w|;!j3*nj zLtLL0>y^snqzLw6=QD<^7&*io$dWoQQ)S@{Jb2fn;`YUkU0CqGJggNOq`-f#<#9`U z-2zgNm1Z&Yv2lTkZy(2lx1HaoOK7T`^}Dz`xBlx=IAWVJ=DRjZ+4_PP#B+!wCBtap zz5@z4PYogGTqa_=g7u7WUS*jnY=4+qw9+n=ytMkC!qJ&Y^G(}V-0U-%2u^>C=*3Q1 z_wloGHhzV=6Ospx4=+D~g}Ox(Bs%p=f;Xt|y1AE&KE>oRonzIrJ96PA>M)DpWnr-|vspoZW#!&=L6h|KQjR{H7tT@JHq-=GpC!f!4q6mwxYRYm+xuCDLu z7*qw|*bNN@<%b}~(ndRIyTNj*$)Q7>;wEArx}B3lCG7_DY`(OhJyQXbvsn+A9O60J z!lp&Wz0KYt_0-Nesf<=R@WmB{j5ZlLoAW~yoLBAgo?FIM!9ZzqLn9*~&axIZu1jFL zjfYwA)x@Oie;1M*r#tpo5cR)p@Yft5@a(Uq0?l zT?@?7PL>U)cZ0o}2r24TGZA8v1)E@)d=X7O$&xTm7x{<%3g_hI?#t&Zy+H2rH8eP3se<2;x zBEKyd{7*W&qY8+AW|9@Arq@OZ6?^y1MubbY{_aTTAq{ zrDWxcJw;+4*;Qhczr^gjrMoxjE)BBZ(~!B@<&yhYt(^|m=sPt&0(VU{MI3gPuSk96 z+5)CL(zos$etD)I9X;=V#D~YfU($RacPmqWzbAffAw$l7Z@)&Wx_b>5w3asU8aXXFXw%;T&v{AesF;0)EeQVG?oIJ(9O_f6InR!mah3Klhk^DTA zdZr#DsQO1q)bMcS)F>*rZ;XUgNBt!TWBEJcsPt zF?UvNf}G~$ksYIwAXNSP&;A?F?zK(PSs{~szbb}@xfKSa6PQ2$C22*>j&UUTMPYZ_ZSW-wg0k zE!rB7nyQlFue(Z?R_!TxSe{>n`}YIH{TXnIE)uraw^^kzb@9Sn%&s66bFA_z#@uHPKkU7pe+UL(}(`N}_Xp;DTHXg!NM_a>h>iRtXC*p5NW zVJFhB!e0=Kc}kOU4*N4ML-axUb+d!c%66N!9c_`gBaFD{K;$xWNy~FNH7k9 z3d!^0;KAck`=)w7Y)q>xS&)-6{4{u%6#F|1nH%@;J;Tg*&W+v={_-~1jsgIm)Ayl` zZ>`&-pj&si>TIUk;(cbHO&{CG)nI&uElZ-~HEHQB5L}f_xA(UEZchggX$#Xj0CQL5 zfPa69ue&v9uluOA`}UEJ6k8g->aC_Wd4mg_ktf8@hTA;q@e)}xlv*5c_#X%UUs!)o z$>9a3d8R@&qW+rgK9OBdmEt$z1JB>PA?2{khXYNlX^3aoeHo;Zy0~8f&1u_QVXnUzG%b; zw;)CuW}NHZwpU8_ANgc2j`A$lJuNrpF<8#TdkOp${2x9}TKR2$_GzUDl`J>5Q_wT3 zvc>4!3HP$T*J+d&^9OTAh_&O2htYVc(aaJr2@@BdW7a za3?3<;$?y425D4p^bK!D`Gt|vDgI2|H%=rwpWesx%w7F*;5ZOuVC!@O&o90XD)E^^z;R|rZ!Q>x1Oh=vMNrEGap?&w-V#CcsQikd(d$hx6N zIF9Aj&bI`@eB3KdFxwiBVihjIT(z&N=@5~Ds?OT0Hu^u3QwM`{)Q|8#-KFk+jSsSR@y@f5qub z5te`rOhXcT=f}-@qV3)C)Y_fmSIXHnyTfVsB4ry1dJ4tgI7WRFM^QWrju~rdpu80% zVlUciq%wSQ#x)hcQk-inYHEqtrX)y#EL*QPij_9(zT*}v({D}=;30)%Gc_Cu_KST0 zTf3Qaqolo|pp~QBBvM&9qi*>TGbTM#r4sC09K?7iUD@l^>ZAw!)bpSc_EbKQQSQrR z2e6+u+>O2B%{}0Is~83`Z|mn*QZjYYn2|68_9UNX}2W zYScs0<4&iFgEH5+u9J?Z#>E{^g9NUYjJR5!jdx(J9O+6dk|;h2iR>f}f2V@KXxt7! 
z)jMSReJizR;4ZwIs%q-BtcY+^Lf?rrE=d$MG1n{GqAyZ^zte`k|S`bkk8@h{FmMKh~QL|=hyZx|b(jj}~e(=cMi*B{OFn4f8k#x&* zy;E{22yqirHl9=Jv}TpMFiL8r)BN`F(>Q^3Oa*sEFmXWrD#+0TCS5c=D5U99Jd$su zX;Q2(e$=TqRmSlB^;H!k=h2cxpdQr!d)x6fz&c#^p7oDy4gb659QkZL9A~unz z6E0DoEqZsw`gBnFFqbna3Z4x@wP$|m+3Q}ZaMX#2*H<~S*zFX2)Nm0ToPDFXeZQ!+ z;lneaT0jfNo@No^=b>Ek4^N<0Lk|=Wx!vviKge|er>Q+T;0|IVv;O;wFMs~^#1eqg zH4*ZEo$4>mJ9ZYJIU-2lPJs+uK z_&WP$x`-*}av3G`98RaI@7CqFWeFnIz`wdz!U;k+F49m!>>K{`I*C2T$3E`5>~gx` zPu;g2?Xxy4_tdBbEPqqI`9HOA&Byr6b2O_ZQ303 z+x2ecNdznta_gff`2f3xe%3QZ7SlV&6bGdTI_`Xs%>OTjSzt(7jB zsZw}kgX5gNUvR(9!0o`U$Uh^+Y669T9Rd#|*ugZo81hZ>&X&PS?iPA=qfkMSO58wI z!zOUe++hm!kv&F9oVd+P5-GlaLP@0tfpF=oTF6VE7sFt#9Fq#Z3UrDqKARwHDySND zJb4s5MPNyh#~6WLtk&$$n^@u z5&gAV9xuIga~df)vzKh{zx{#>%CB}zY6hI=#v|3udxWS6! z_hLcEW7x?!Tcdm7U2l0%yuKy@g9|bD^AVAE%?P#%_cw%p9k8Eh+NCYV{om;fc~fOA zX`_5QxwYv^OR^>Wnn~KN-Ck?0D5EqTsDC9a;G|GU^-uIfT645_&xOwbYEE7sY8%2tj|G5 z+}Z{cD+0%~*?Yb^ewMr-Vq9pN$ezZRiN?RcuB_~?N`A%o;vlNXx{%hoqZ{>0SKdwm zU4NX#j-HZZgH~Ls7nVb_dW%E%9=3p)?Y_TjGi-&=Uj9DMo~=7J7*>IC3UJFrix`V` zcP^mlx5bXW*DDN7Dz=y}&^`S(fy}0OU_2WnCZJv^MHfN$Q!|2>m} zO~B#O&h;eKobH4;B5&;|_i0@)JkxJges?y$v`1+x-&c0U6r&pQV20$0_OXUGKNbpi zw%bdVns3VkCk>0n=)cE|+r7L#z4}B@awbI+eQxt!g7F}A_jI67!s4iBP`PK`21l3W z(Bziiq6A?mxW}~6w)FzTU1w)MJ>M}7u*rL)}tBjz1zg_!r z$UEdyMMo0!7hRNNh>kqyDst607{%@!^(q)Z?majPsCCo0P+!ma^E1&>=H4&SZv&II zYGL72D1FNG9^?kVbnHy?{!;|+aX}3ddZ7Rh1RR|&O()NSgSUNv)+5uOOQ7{=Evc-@ z)f6vV()iKA-H)R4@k~ZibX3x%`h2qMEkd?mbp*VJ-N$S0#(U#Nu40=iOM&-nd9y*& zb0738V45vt?#d&a^O zcP9KEk8(4!`xDTzjlCb?JS4A9ZYqZg@bpXtRm%Vlwt zl?D4gnKN(t3Rm;U`_8B9{Vx;3)=mSbd;YE~p_c>|? z^Dp)iX+%0?K2pbPND1|Bt+6{oYm9(MA?Gnu`eJu={X{5(I4@4_MibewK(W5^bl*lj z&fHR6+G+4+!5cTZC{8iY3}f+62~3$O&0Q?qTP-WxU0gj+A1ZZ^2VJ#Xco3>Tw3@j< zXLMrYFMRtH2H>*`MYg0EVsMP=gFRBlxroCY6-|36k2&jH)%}o>sRvP1q@nEI>P){S zt$Z6;YGnyXui9#2$WgKc&Vkgh~yZJN*dqMq05v-)q%-sO{U^fIY=9dWUDok4TE# zC@_xj5*8OyY8A36lcBPRLu|fwbRs|AL^?{!Nqb~QVUx|kr< zx1kS7=0cZRS0^=GT99#$W~x`>Yf|ji)DNfE)`^{hds=ZUpPMuEx27eYtx&I?y`@qc>xYS2bn_YxGnj_H^_Ow8@5M) zC<`6{6z>cnN>(;EdSZnRY}#hT+IXtN>RqJre*HQa>921B2REu%`ckPOZ@sf+s$JT~ znuG)pg~a#z0Us!nHA6mvGx*tuIIqk2-O)+&3M&=@tf-iJ(h=yx zgEjuA%y{-T5Tqm+2WF`c>MOp_S?r)-@$2lYyFqV9K&9KscX|=_#{Nd`)|56p!k5sA z_!y|{?!-W0PJsJ_W(?_fCzr4hn9Fh9X-(;-A0FX#;6c%(LfYNRl}F``sozW)OJIx{ zL(r+7W={v`sKQm1fN<;(mzm-czoWw8pv!}RahQ{LmB~!Li0o6c;C}G>o_5E%IAI+fK zsK;06WP1!<*MGf0+K)<+zaIh17`{@IX@|n#aH_(E~(o@BD|!qj^N> zXTDtEJO)^NaLy=}CM@n1n|mIqleng_LI5g;{6}!B0ndQG1sX&hl&ekBq*h)sY^aCN zek^LlonSeHxkOmKTy?-?k!39>;N~^IiLfz9tTbe7pEtJkr9#u z@xZQ5G;nLqVFObEwjIe-z41&UnW3jpk%doxla9&IXvi|a*8ff@bZ^p@vw1)*QQzP} zUP1!CgtJb35P_>N~uVWR;k3SX%HYb+bTFV3NMX(KQ}2H z#dLbooFri?d7X(VEYm^x2JXzRS*(b-OfSywqcQpE%NUhs!61K^`{CE8-DsRRT@Djq zs}skYNr4QO*EjxBk)JXSjBtU`nhu|f2;Zk}aF$oGbz|R{@!Z$Y$9+$!nD++L?$DeV z$Jq=WHibQaNMY~L9Ny!w25Xb_PE`+L&O7&fAkZdUSYJpKZ3uhPEDx*J)GV)#$C;W1 z1lW+5Lw#EWgIY$_w}YX?wBc-PV&OAIkvn5YwC(l%21US3OqGqkB^rHDE;l2erP zizD9OuIKs20r|0k^VFlSaUi&1PHSm}S@K^_>ABL@#X(}A_CEY^GPn0s_5Ah;?pLGc z)T>m_u|*GE@T4Q$n-9Yo>?^b_u#C9iYfa=pe-rZ@fT6B%2V>i*%RmWG%$Uk&f?PTp zbk82zA(AML;c-O;f9qSmJeA1>Onhina_tgKBKB{Xc<_pa22xEQybAM*@_%WV+t!kF zUp;hwe#T}G-(6l`>jAXYO7Bh*u^vyc=!5K23&kU^WCe~Z-)*gXsgG48ZhI$%TR)gQ z7j&`uPE4l=z$$0auJU{hY3@or{V^F`RKoKh!~_}TTm5JT`%HOAd>k`4@}ELw)~iSq z024-bjsq#dbG<8>f_?+q%@UgSh;I^zS7EWn72!6%O(!c0CZf#A3vIB~V3we~pq`3Q z+dmO5+jA`-uX4$}xVAoDVarJojQ8&uzvNWAU(h#LWB&dK zX{(YWRFP^XL;Y-x=u=tKnr0^TN_ z;qKtSS?}iRkhU!_U86$ku7|Zc+-(6*FSSpCPGh;=H|A=2nuO6eW{@uJ?g6>wp_-X- z1@^a$kInQTpf9KkuOF*x&tas-KR;I5kg^oEr?Ip3RzTUW&&?|d7EIP~O}DfDZ+F}> z!v~cJ(R{fzHD5GZY}#%<5+P>;)D)&-dhfoUpUz;tL2#??pwFw0ufE>!KJ99Th)Kyz 
zy4OmQYa9yP1^q~8w&OGH-j@L+wzaFB>npJ z-QBB0@}eoXImwM3Hi?pUI%)Eg20kMbHh~#3U$Yw3k<~k>^VuSXP2V@D2P+Rk9{v>344MKQ;_#>;Z)IN*n;p98T=`d*? zojGn<@w%k_S&L)C?jm9&gsH-6EwK+CLz~ji_4HoV3La~ldbCR>?k+PSc*Y=(P9l22 z^?m_yMgjq_IQ@72fd-qhchsV-blffuhd4n2fR_n*gf|_DK`?N-UK51bT;gHLzj#)f zpxBjknpb+y_r}WPdE_de=m7m@+4ALRADPBJI4Drd{R=_=?uKThBd90=PWf}_O z$sJxhw1&%a#1ow)eut2z2_ok!h_Y_Is(L;pW_?R1D&fXT)X`)nWPe4#>N1NolHz_M z5oYL-v_{~rOotb`t{WW)T565!*woWb$icey@@rK6VtIC;pTpbSKTLdv70)nvexboM zI2$~iAAGD~LDp%*l{jA{a5`?g3_^uHrQI06*;X$Qvd4ee zA%L?D9K~uS841^3C?Tixmz4**DfrHgZ9MiXt+~qLHNpkz zrDA4-*&&WZJ+q7j+BGU>!}+G#l6OrP|&@=76iS|XAi~7&_v)up-VPQON4TrI}uEk!V$U{5NEl;&D1kgf$qa_L- zM{#Z+{CH{p0kbac#zc9<;~oKihZljsv;OT*qxphY2S2pIMXp8SNjls8a-H|J)lIUU zw`=T*w?nbfqP(!ixlJc&hnMXg0)$xj2n(gHb^dr7bnwtpbA))`ea-LbVRFF}E{7j6 zs)4rK88Bv)HI3oSxhOg=3k2M5M<4BP1z~P%evrl);uM|v1B`{4<+g}@~mt6_z?)(smQ*Dh|2_a|;RgxhhMm~d znppnq<^{^*NGR_W#U;z1%KTNq_muF=CMrT%n_T+!Eehip6l7ug;OJ=bL;il+@T@s4 zv?GG5*~E#n7eN&Prl$0=A$T{4tRt_EOaNrk!-zgstqEr`sbu7L64oX1#&+%vAa9)iqh$3u$JFzCUNEXod`Bp*I4KtH@_9aw%TOqr-}=IdCbfSm|K%sCWMizhjp*zO|H|YfN!hxzLl$ zNH=Q!CPXGJ_Gf~8NBO~nmMlX%V!mqf0Jc?=9xa3f&UpZ>cBV}h8l_%c;3l%rDGI=6 z!als*Y}@{m+<=w=nY>h0Nn3d!ikjSSN4K8Lq`}V^ULXJSj1=-4-WjVNA?Nh2`C2^g ztG@Q#Z9mW){=`J*l*l(;BY=$5P}`t}rftoJbwTx-ecj&&{cNFIjFoFdTAfW*Lsl~r zVh9@huDrPpo|qV3=8%!upFbQExbZ4Nuyt75Jl>x0Cms=INmfLKEYI%Hl8Rm}xj9uY>nOkSa2#eh8vMk(7|~| zQ+g3YuIX50Us3#XA_lvACxG1Mcu~bU7<;Lxrj{wg)ASEo9t(Aeo|{IOo%jBPqp|ZL zGtVA>ldcaH?Pw<`*o<0kXrRsGxYVCIm_DDFCmOIQm)UMt7}!%U(5gtJijKVYE8npb z#qOk`i&v#J_Q4IqiA4LvPUWr!>4^jz|8P2uZ>0$YeY^8j&0|I zSQlYE)YSW45$22a&=T8b8ugDqT!7PKgp_{1Y z4wm6d3Vf7yptXw7UVz2bYv*DRFfnCuRHlaYCH$ilg-nB2qCG2kt;k;Fn#h*z zQgT*e@ae47rQ)&I5Dqt`3Z(I@<-xCE~E&2+J#r- zpvUk-xg<{YKq9hp9mg>2^OMuAJ#R*Xcf@mMmBcLM&H3gHs<`0nl!wookh$%amXyp@ zwe2JDnL7o?kClnGJ+v)XinP=B40Tb}3luU7@u7NymrkC{Ri3{M$UVeqd>eTpNp#T+ zFxiK7v9K1gk%gIG8pa8fxE{V(t+l#&qU1-!Lb_xA^6yi7;l|{>2Vy4Lho9%P{Mp8S zenNxGiREPs{$yPL`U3e0h^dNrlGvdB>7rkskm0Tj9S!RrUgsCh4$gu>MoewMs=oI3 zjQE}P24e01*JE8gfY;xr`)(5P?=AALfU*#LSp9r|CF)+OfJg(_{+5cKZ2Y|;{?#0C zbPO62aWwo8Cgh@UGEAm8zaSQa3DL^8?8nc@Ds z2vAx!7U;uV{df~L@K%MfcRlf0vS+gU+T&-S~m{u{;vneo@e3R0QO3W2)+=M<6D9_6haM z)cW=DB{H~=^Pltb*P0-v1${VsP3G{*H2UjfAh_pD{1xNXVg31pd=?aD;O4bV`kzOQ zV2Xa1c<|mFP3VvsE>7}}*GhXjfz9cJPYjQL1*=|&%}XJL))7FygCSvkY~0OcF1-+ME1M?xtNVX<7F_^nR=-n7O(gQg z#i`Zqx4Tx+w24kle3xgtaKnTmEk*Hhe^@h)#9{PvC&z>Z$}YQTld7(}p-pwXy4!Yh z+>dR{i_ULLu;@qnojBt6o(|XFv+ETy`rhbmSWp@t0gYj8{&0`G{~j(JpWhRwu=~dwK$(Mh5m! zo6FP9hPb#FpW4>SM|2jZ9ATEB9Bgw>?v{Fzc^o|-t10PZ?Fg^ywo#%hU@s)OeD)?>$wonMxJCQ3{?$mVX}!HTKaH~=(H@yv z(AKQ5P!I8-`W8gCNq}I+CgoL|av$l>97;w4#wgoMs8tvbSP@IwO`ntgOz7Rx&)4sc z)`2Udx8K?0XSW>yBeCOWBD$aI&&;}*;Haac@kA^hY=<%tqP4z#H66*HS`Qz==qle0*rSPqrZC_MP)<+Sr6W@*-2|DiaQ zn%d@0 zfdYy4-1`1`p^FpN8TUpm`XNQ9(U)_?Y2I_xo3dz6hW5BoZ{ z(*u(RAGzsz92@G`vLTKXhn=ohCWWvvKm&OOxovrKpKW~! 
za8uz5A?PUW5ip;-ai3n{6Bo+xLY&o$K<;@0O`oE^ zU6V}m1xj`R?z8bS(*n(8ECB<_U+rqRRRhe?!{(%}<2;8y+dZmuPrMc9TP^+eM6VqNHUtUsDt%D@-c`z&UnI7_i(oLS7fCF{7?YNc>iJhwd0N8i>jG3Y zj&3G-|6@~(eUC7K7>njcNXoeAu3;(bN26Lkb{#u%Ae|ms*5evu7`ZLk)|P|IwygQg z8^ta0VQ?hO`3!Cjn({@z%usO^$N~XF1N*X~E8Azx!eh#aD@eL4V^1Aud>ix z(Z_y0som_ZDBbhRFGnYDzX+JK78TP|#RjjwdHqrt;2_7EW=aG4RUDGX$4hBJtp4l6CS1h_01O;GmdBoT91<>GQmsqN<{DB>++5T|!_OSTLcVS|x z;wk94Gy$fTb}H9)9wei~zzd^X?xh@jI9%%cB4BiJyjZCk6O%QRBLW>|7qZJ68>dWm zpAF(ah<>noabMhKC+*|q#VKg_17n!dbM2GGJA1uPZdE^ft(_&CU`9gw3Dty>1%}}x zB0wyB1A#u44P7p>kLnDxx#ILFK@gebNpvQ1aFdt{GW>i;_xyv4AQIkThT<}qC%}#M z+#35)%0jWPTVAfkhqe(>{lm||plo#T!npIn%-oOfdzPj$E}^hn zS;R~Ail=>R12#ZS5a`!OKo!6_8g<<^&SH&yhupB?qGj$GEGwCNN>^_lF*vT5oTSw<D$v z0*%6lDx4my$9>?Cq=snRRm!e+S0pjDCt69`;f$nGCtS%J-<6qjrBte4qC#8DE|zzV z(z20EqB0KMX$ht&IZ(4KA9CvlUYunm+M5=gc2r1oi*X=Q#@>A<@!NkkW&|Xt5CFYU zA^!V4GKxqhJ$5;iVe7q}6A>dKL+f=aS~l})*5j|qPHxJ_ht18&Y_qP8w0qE*9%Dcy z26l1_*%WOLKgfs7{0NP_g*n8@&r;CkbFw%-a(g7*CRHtt6m0EkVFs~+E_+LAT02H3 zd&|Yyz@`ts@$-K8v>tKVX%>?#0}nCYqI^ijsL=~?gJUyxPP3Jjx`-U-Dr^Pd9$*&) z7edmU^9{kvGg}?i$?4kpsVPIzZFzL$lg&w&O3!OC}H_82|q5 zE$Ict*xK$al55$s*Zd`8?|%*?;r-^vCIZ{@QaB+pm%j#E$`;(VQ|OGD2EcN*WN0~!z9zAKSS>g-^)81d!SiS zmU#N)oym&qI90V8C%jLh71^UIrK6a|JY_?>N3`5*3}MX;ryDgnQhw<<4&!s07=dLE zuE%y}(F=r5+|asfH-wI2s!0NZh;yavNtQMkLu`Yorr3P_M=f*UPAkI>u~VUnY?u_ndWOdj zU)oNwyOv7&w+w%{Z`K`VE({2)$=eOP#^T5F3G5FlnvOvgOoJ;Fm>-|9Exv8@&2#mV z8N1TW)Ahs*^YMynQH7=J10oAR9*}F5 z+Yo+HeZcmuJiKl9)eYv`<_c8-}i=C(m-;xt$dL z@)c=jMnNv)`FxbFEc!Gql10Jf+iMMO#LZv&wKTR$G7L0jE=D{^o^287EeE4fyTV~t~vyM-$ z+J@kcnI_nqarH2mx9~pptt-B`_cu7>RxDw>#U0yf)*nt2`<}#QDPi`QMHnVew(;f7aCuU)@KkzQlC4{g~w4FeY$)r5M=OL6Q1R04Rv>m6|d#aNXJ=^KCZm%yprs5$5 za{WLv*DhJf&HH5H%K8~tt%Xf($m~6a#~AcMS|0!K;IomDj*zq8Jk9>WanI?y_X6;E zSYxSZMP>JJgJI{}o9j>L~ceoGAZTuXoWF^)^tRNTf7n=^Q9xl^#i zQ}Aps`>Dge6Yw0_6lAK0Q>OPF+pz*N=ive zZ*&WW^*?w#8X7RbzNpFLWUx>6=C;7QiJ1)&6zGwBW_h|$fshZl2NZw&SUR&IjZ;t| z!Ur02+$iDK8iiTShndm@_vLV7-QO98ic_0We+teoX=i|MD$Z?~eVqL1;TQCtB|$UD zn#=Q{>A2)M@sh6Hvb^gC=X|YFPbj6sUf&WJP5eQg;djRBwm+m=oVr6~-Oast`@CK_ zMAZZloB2t?5k)Ll7Rzw ztc(B&f(r4lD&Xj+%C=OP>bZ>>M#TU$PO&^vl(Y=+Vu0Now+S;aZP$(_K%9zbnS(?tOSSH z<>3O&zQiSoI^YIQwtjV4>P^YP`)Gio(Kga<1}NZANioydU~@w?qv;tB+ooa|iE*_Z>DS$F#sc4W}fXF-TO!hHu@urH()) zN6CyDrHq*h_+?hG z>5Ycrua$+Y);~m6&*U1^V^+K+c1HviJkoz>Cz~_lNSz8OzNoJu2~N zWIj6YUwS^FgP9Eli?ya-${7gV5BX6Mj5i`?f)VtWYr%E{>4ambvP5L}-l4mnoR`s~ zvjCW6!N!W5Yg)&HllOLHq4T+#jIh;$N*yzMQx$~v=*bRd(XSvRVRWg$m@8`VWDgqj zV7NhpE$>H3C;Nz&n5H0u&|Xju6Sm*+v+Oqx{n!8SGl*#Kfk=dy2E;f>z2`_B41;bA16*Xm;np_&61EsLt*X8a;y9EcQpqJDpFidgZ`@z$XVL1mSin1v!#U zOh_g&-JA)w*}qC-hykTZk&=uL&mqrcuL5Jyt3UOqA4MQsEhJL3AW;8 zG?_#`?)@3bEU%oH_iz5;aFWYEN9H0)Zo2+vL5c7u3l7ISMNiuvFYU7at3*gL5(sBx zHwvXlMuT13WTA z^kw>2lK=(Bgl!VzBRg1m3y)To?a4c=0xS0Lckn*+cEODnaChN1Pv-i=k3t@Sf+%@P zjk;LFXrK(=eA)intRXyLSBGvpuYyizrEzhCnQha1F!iDwbSQukjT-w8CD592?*ir{Sl=keFKGaUb0o*dxu#s3>2 zArH2Mj}m|V9@c-#`wphx7TteZ38n)SjCn;%)bxML;{y|q6y-k?(hE`n3Tr($>gWAW zd0rqc(wKIYL;Pz(w{n2O#z=Kt{u^Z30{!WP|DVnTfhvFs$R}o{H~(8+A_%hlZ*cxM zIR9HY|64i#Gi(2U>^Wtf4u?zH8MsHsi%DH@E-6zcM$hAPopYet*7$TJS2<70l6C!G zJGT5j@<-(C6N>yqtTXVtg|~NKJOpvv9CecvpH2+M{AV1a&#sk%$W=6;IHo)K4<|Fh z$w>=8{8=~vi7CEsmrq=_2ajfdd5ifNgbSaTSpl`6a%n52|7ZiweX;a~HjTM__0mpU zX{MRe^QIdt1_l0mR|GiZui&u4=ZCP}*c{&l5M|VZ z(*y$rnw;=3^ujzv>Na}q(GP#Iurs)S3Xg$1V@7jW#ySJ^-3gdL{mb<4rdT`EuA4DH zestmLA$<%XkJ0k5H6w*qpj+U;*fz?m#wVE)*Md86UdBdlY7?e&P7e!$>$lhwEQD4_@I6L;wy zV_(-^oUDR73c`btkpJmy<2B+Y_lsxyO=R)+x7@S+u<1=BnU(003wM2ihh>4Mh(Mi;X&nEG1&?6cYZv!pWm|%deZHPR&{ClYF zb0NVijz;b~fa6w(^K#8{?zMu*nygqiLPEU#u=)b{>>qPpQ?}*OMjaImc$;x?{*$FN 
z0h#1&G`GDzaifLKm<#W&=1!Pnl>|iVYYV!?Kh+x%vfnQ)iW%!N%8GK)v^2}AUkUA@hryMYM{Idza9z(sqO4&HV7Wa>0#t_Ke3Cu02+Wecm#=*mkvc3 zF0=s?-DDWJ#N%E4Toc??OCW5&dEt#p`Y_4;ASOmXkbBL;ikn@g@TlF` zdnK5uw3F2+49bC?*K?!ecm_Cevk~`gB1bE&U}^#}Bj%#ok@!g-Cpr7AMdu4EVHNUA zN__T z%(0BfQZ!q;J%Wpra?2Nodqtk-jECDTWKl{zKow%s_w-dYSj~?e*NR&ffTajk6^PAW z0ItAn(yT|r#vMz+R#DP*+rC^&9d)AOKC!#UJC)Ne=FB^9lKg3Q1-gea@>cHh*kv=C z74;olPvRUWB9oS`sXiDtclQ_~PKd?sAS6kN)&z1sJ?AElGP5BDN5Fb5P>KrWYdI;x z1Rhk+g`y-CRjh${yEB&$pz&PAgFlobbkA)sgK}VKzp*l=G-TK1a3g^5 z{`r@=y$%d`rPl7dd(UM6v1@3rT)ilgM1ECiq+QSXh^}^V6@s@$+#DcQcJLE(c zOWKBgQZJZo^cHu?><6E6Lk+auO;yi>@EO>|KHQ$!tu!C$R!(qf6UAEp2;_F`w%D>}Qixj~oTlid# ze_!bZ;7JcG9o+k^T715k+vSSNUUJYFg2?>gapkyKo&oD2b~ZcQ&lMB>Tf-qmB`<;Q zCFix|Pe#w;ZDP->^4zi#@hb*FbsvX!uljeJjt9nFi{N-kxV!^eQ#q=4L-z|jUemFO zJ7>2A6g!TUG#c}qtRtI1Ia#m?rb=bzkLRa`xtE=fdyZIBOzUN44G(lAx&<%Y9^0=d z#W4-oPVsVsxQhOv^n>znz>m3hg2`%-35_$VR1v|Gi<%N4L|FiqWBZU<9*l9-_g4)B$04_)Pn$Y7&IAWF;12&6!uzgf#`&3U~Ck zNA-io^cgMRE(ro!r^?=7d1ItkNbLcBw`>Wg_ZUw?DjlOzDZ_Otbgp-L1przRP$HH=s~hH0p{a$dkLY zVk-WME3j$ZLb5mzp_(~cSS!;tV6ZdSR!p$8^j;)s9qcr%ZJn?o9B8Iu*NXD9^Zk*- zW6H~G^;kr7;z>#)aOyfK&A}a%xq)-Rz`Wb}IH-r{m}kp0fVamn0(WaRDv^vjeRvwXb6hS zvns%UbIA^cHUx%Dv^Qe2CmG?hpWR?16CDnMC9$ojUO`w75E(4vJP`QOA?N+qOp+ zKdtWia!YW{pVay*P7dw$s+%DZUi(KI8)*bOL}eo;g(Qv2`Kf4CSX}4QMvQF?OUb0YvyjWec>sv8|7iL7+uN%UYBSaM!wHig<1! z_;jjCO)hqN!grz`2Fa}2yLhMLwn(n~Gcm6n1huMq`^H1YD>Bc;{V**_UZxfhr@WCu zI47hO#DC8E{HjB*#D*A%mcKNUQ~%BM(uJ-K-8NdqOig@&+xOl{Osi{64YW=y3pd(_ zNM}@pG{^D;C@bCSTf?W8)P!ikMZBCPOcHp|9}zG~E*#bPa0GxTlAyARNcY_L`YyPt zBUCinfc8S5kO7bTX}GPdXVJ%lQzE zDMdm%QMDv^qP_IKfh9E!asG>t2SutP!Us30B~HqDv9g4(hMWmJ_B+o}gy+R6IEs-b zud-apui&g$^5<4z#Zbm?8r~10-Y_29gmP}y?h+Q99Kt*~`}$r9$juqT-Wf_25j>{_ z!nc4M3)){KlV~RUa9$PjH8UZmp8r_zrA^#AW~o2Y<3+c}0a35~8{>HunBkNPZ@K7I zlTsCL%4Bx}0K-0cUvDH>Iyy_C#q{iGn}(;qj}Hhb#N6`(BxXwX`lVY+-nQQgM|i|` zwVJ=#$DzTmKzyP(U|so%IxxSpH3ezTh=l(;QBc75%{628}$B(HwwZNTpiQYFncAsbs zpFl(TgjpshNDB21SBCPkm4%`?>^K}uFzx}br=))xC)1$Pssx$X0 zvkjV$0*}y^l)PhQW;2e0U(J9heL1xuX0q?xa#qrYc|SS=J3TX@*2~YM;NFETLBwOk z-h=D&Agp^F+V-eue}7rga`cV!AgM7c3(Hxzu8_{P8zO9bTy5KWcKILx2?8z_2c!D8 zJjt%7msvMxH`7KSJZ~^WR9M1)_ffX#w`qmX{A?2rl4h-|SAv*|g9>RR}PCKKviYxR$aLq9VCY+D-;)(|WC)MSVlt#S_Qn<%a z$qJQpUw%fRF^x@H%L+#Uj+<$xfu`)vq=!X}S;yCD$^N{0{}E-KX#fEn$*32dQ^Y-D zN;)j2ByCa`f7oNvy8Fn?MpyxL=t#3)Al})g=jG~)fI#G%&PQ+PdP|MvJY7Ky{I)t@ zdK-x3n1it2y(B8zgI@Y))hFrqNxwY6@adfTLa-gIOC6|KRYG?0mao$jsdCary3<8O zDr?IC;oJxwnbl=ls|@;=j0rHI$xW!)cjB_zcIdO>f5b84Z39oCWy3kD#)8$^1(VKs*(XYL>`D|2^8&6*CuzdskFr;VLr%0aUv&can|u>H zW(%KB>xI<=)#kbN!eXolP)AG`eyUvsLCv|@xw02Pn(4f-2h_T(tA#a%Qz|K)JOmWZ zevr}blfAa%hc5##s;jxw$y=9G!{+>)G6EF2IP_wSL|NUAUh>MOJQ1-Z>#O&q9Vf(P zFbzF-IR5uC|Bja&DxT#x0g@nr-#AozIPF?Nq#-&)dJIIgBPI$owg*6b*&I7h8)QBK z#-$772852d(xbqAJ1gavgHOd{vXVT;_cmd$)|wiXEZ|+AjP`lftd;mbbx*<9XvW&D zdKq0gn$8k$@~PyL)KiKqZyy5?&@}=gERGVSV3URu-pyM5({%LgKq6rDh}^CBcLB`y zR~MxNCgG8xD63V8+(4GxVUGfYkbnnANI!!k7YbZpF$;f@9cO@6?pl_Gz=gk-4FGmO zw=4NdsA=*1lpTOT*ExS7>nW6$pN+)=Pi(-FDNuIr zGjnFv80*SowCP>!5MTVZm{IfI#Q#-K)`6IKOLcm!d3tN>>LPIIlA>51P}-zN`QsB8 z8)t0$svOz&ZTp%k&$SQH?mw9gcJr94X%tk|pX*NxBX@JVon2-<4y4)caoQ7MBdxEf zDuL^M>=r@|8O_j2pWHI-TfR}u_JUkq73n0Q8m3ofpHVoZB+9+tv(J8fsj6(}CH9w1 z&;8)Fu)i^Nx4RgF<*8UT4%*;5ulTRp4r7K6lcJI{}z94npFad4uOy z7;kP>C-dnLIDLaB8Wk3%26{yi+i(>p+3CVb%I>=To20NgsVe^wc0K}#>r^?Vj)XyK zk?lf)NrqEvFvUj2lCmW!w@teJt7r$mSAj%>2WZE4f?2%M0TNhwmro?@dCf&<^e|Yu zAEU9np#PgBA!UgOPKSK_t!wJf;K~^za0&K7+m0g6i(5xqUv9Z;6vF~~ME*o!fI%dX zNcIUQHjC;kXD1G6rBJcitbl$O#Lh-VJRh9n);;YmzpMj#s9`ou0T@Z^Z zSy;}MYrzv^9J9|Uq--}ae{(`*>|o%WH}*sB7!X0~Qud$WQ*4LQ5%b9&oP(;f_R-VsY41@8SN{E_!X#-K+8;xpeg 
z(YULo^jGKw6CUro2FZ}RitT=)A_z6e$pXI;{@<@aybSb)iMnq<7P11cf1 z1t3S^067!Y>R>zSqV#9d9aMD_4F;*tH3Ac}0c7381EZ~CnNANAH?gXflJY#^J?p^~7QYJ_b*w4+=@+oE2 zfrmj8cpW)BFOIWBf98t-E~Zo~I{O#5$SV9DD010B20N3q)kfW+jx8{bk>&LuJ%x9; z0z|f_Z{FqT1eq1zJnq89s>ORCJEz0q({iR!5C0(>u-x`c1BXn_r7 zi3Y$WQ)em|H#vs@VWknyHl4BpN%}~h9Kgt_Kr-=UP1Rk7`?7)`rD_Q5`7c8F=P8WDGjb1n3uFOw= z@e3C3O=?J55W+1BI3@i(i22#&k#$2 z@-4DlprBFOfV&DHg&4qFa}TRL2xTfCR2Yuk0)6!uAg{oT4vZIIhg$&nkOTbBm8qbD zbLO9T4fmWu?d55bd!V#rZfU3;aJ6d2TMa4G&_8=BlE{++fq?@2=`lQu4`J< zDj2!N(~m%T6*nNu+XWlp_}D9aV~E52_547`J^(qp`y-3Z)*#*JEanX#ZuA>o7qPrzM-D<_>uUZv#*wbyC6 z_0P0AZz{xsX`)cTZJ$yVR26+%$-StTPoCUg=H{Gy`PwsH=G4%v;VZ865f@RgC&hyb zlYLL%IP7{@7uN1b@jkjmaJ^6lIsz~k<4K8=1fKg{Nud&ukD%^}a8geU%zRhwI(h{8 zOeMHxnDB0%lnIL^$TmghqWp-jF=r`{G~@r|R&F}FRSwB5-Fv_ndDwphwwLb@?Ftno z@doohM5zc1?{-V+X+^_LTfAGS5;sHaiDoSW?4TasvZ= zkjghBSYeYsQs;W;A}F_YU7Bo_(bX_FEd^coq;&h+-8jIAjS*tfw%R{6;sY+lFftTtRXBv>60To_ zi$TYLq3)X85hbbXEx(+PvgxhVlluPHF?mQhKNp(fc$AaIXMQ0Pmm*M z_GTx@o*T2o-l^LI*`K7YziFem6ckRVk-fPX-(Kyb9h%cl@`3E^RxN1kx zDq!HdIh5(eYnm67D|FtTp9$H181VG58kyH8CE)EM|aGb7Mv@w)b5Aa-brRZj(1iFsCW@_DO zHP{x|4*|n4m+JFDYYxBY8*AH5V5iGH3vUlDR2K%qU6d&3>7fwJw(kHVspVV~S0^nj z#Re$OtYl##rnnw>xrMD{ug^DbtY$;i=CRR~ZXkB8PHpno+*yeWpoWN`n7SbKC%#)S z&oBw{FBi}%*ci+SgGZdSahR3Fuv`KKPE2eS?L?poWlhYs<$B_`6Rp1yi)lDISJ^A9 zm_hb-{J2xC_uq_c-E`&N_SHFZHY^*F6(F2zl%pIFDDdbw-Hgae)9_*B3>Gt}asSeK_vWL$A@nNY(=v=uUKzSP8LR|>fEc(+T0mD5 zV}IRtNLe+;1o1Q1qIcW3WrKJV%ES-M%K{dqDg%Z-YIj3+>%hTFC;5`P7(8Ouc?A`I z?wa(aC%9cEZq+721Ekj3CzOU_o#?5oKCm|$ZJQI7R@3O{Jl>Ld7e47=qm3t5Lc`it z7A2n=&dmJ;(JzH0r^u9iAqthLGr%P^^;;Lw6<;)IItOfVjA4}auzq-srvlq{6neEH z!cz17o@&++Rg&>L-{Uv5Y}%;nt2#I5Z)f4dAxZI=Mdy!*k}u$^}Pq`!tH@t)qz zx)CYg6h4^eW!*mQL*VW`2HC9>xqBspk*FlDiex!^+o5S`@`%Z41m(@bI)3Gso?xG- zoFaQ{^npKZte7rpr){X^SNd=)s`txR#Om!suBY1WD`C|xOKM{zby;Ah`Pu3A#rx_pV6aT>QHglIU&d!mXlQdr!W{tj{4o-{Fk81TMFf}H*vZU!O zah4BS2K3m{BTzP+Aj@mLyOis$I`!jXA}orRGD8G)bi00Hr(&fG&Ui+UbTgALHZ+_9}3;;xX!nzkdWw+4GwZ)i5H_- z!AS4vfTGa5b%mK|9eQ}lvI#c$eKULRm4SoiLhYIpj%rV1yP?Wn5S~u*&p%li>i+n$ z%q4N)FvL}M&g7v?_l>-uJ`p~m2X05K40oYl^_}l$xHFx9S}pX?sJZKYU-Sl>z$OZj z*7wje_G9R6b-}qrw;jnO((9DcLb@XWjozlgc?(g-L2K|G!@@pid=1o!@~-)jZ8-s& z_C=P`_JdG5o?LM(ganiQ3y}7B(0!^^Td3jgNJ;xRh0|00<|Od)n8ym!H$gz;Vj3m& zd~N|S7OX(fXa98HhLy-HAJkq$=oWVA)@^cr$ARNt@;M0NUIP1;_bEgL034NWE4g$QL_YDQ;`Zk}4Q&Idv-rfpmtvE#!E?GAFQyMCP zCzL2&eU2h?sBJH-ZziCa*`G)22R;gsyLSprJ2cw4n;U+?SX3cn#+Zo5y?G;sPUldg zdgL%>=Uo;Vz4maJs+P6SD-}44zYC!1Xhlx^y?Y%oy0J`v#Sw!&?`CB+^$9+sQ6f1J z=vDD$LvQ%}5m)BEh*mjXBQ2yn=$Y6!*@eF52s8C%NaUCb9qVgdV5h&(kwvOk;W(~2@8oizUjXE2o~3O1UQ3va{4ei43S6FgEBrFkaHwaNLfIHs_rMnVY7R?>rnjv_Qz+-@epD-%=Fl038ld%acSXqWhsqYnN5n?teW@_M$?j!AhfqkYBIv+ z4m120F}%%ajx5d z$wF{VKvrka&{JpGy?dNE4%TgXXCz#f&eTC8AfeT?Kw{8?-ePY?QKo5Iij`R<@GoVk z*M02zqWul@hWlEH3Wf89aw>>BVfSxT`7?iN)%fg_w-vj!a{2`8Xl14Mp@psGY3})v zUIzLxNb{a=@a%72ZH2h!nZ7Dic0cA1#eJfZc9I!_OIhX+94Z+8(cR@mnGc&xpih|d zYCDw*JtubdL*OkiSU_U+ZJRtThCH3iF-gV;BP6pUBEu5(?B80VL7Q3vNzCoKYw@^e z7-?n>7trdR9f2T!$J7g^h>}qPQ=&>HZPm>52CO?}F-mR1WtR=u4Pl`8RKPwZ#5wkCyzK+E{E=iNZ-4IyX z_mq=eQtjLPh{LONUo8#(ncY0Y+EC5Mq35Cdl2y8o*7R9f4*y#FY)Gt6Zo!K7mFY&x z1?qb0FHX$UnvtDL9btH;mXTys(0oU5@=Pg6jdC@IBqZ;GGXT`A&Q0ernsedIp6D?CEq-LaJafNwe^ z1KcO}ARV!BQdx!!l0aFMY{wop=Ok7`zasIxAs;HYdYP}nr7R?5vm&mmO$2sCWIEum zA8pUO^;~Q-`EARAnx?8K&*RFnrbq;r7@~Y$;jO3TNv6xAt=hTH|DW2vGOEgL4R?Vc zCDJ0GfKpPTQWC;KP-zh9RA~v3Mp^*{rAv`Ux{;Jdk?xcbK}w`k`o5pwKKqO_#{G4# zKOBy+_gd?VIp-Tsyar%Uc!IZxb6!SWfhkZ;PP;C@J3~wkKQ`)Y=;=QxZCfC^`-1N8 zsw@KkkNpFgYFy6d`LV1l^@zO2KX8EL3rn=sGhcb`WhX!!eujjj#@}mD^r?{~3cgn# zy#JgK1SJ~!bF)>nH=e4SVY+e&x=ZcQKGBj1ts)E=^9r7Ezj0Azy^`FLg!7J`5}gDi 
zR@56{u`tu3Xdgdh@7i#65{w-ukeW6g-_E}rSnd<5SxGz<)9=e2+EZy$H`5(5fLL>Q z>27_ZYyhm!WL-@QZ^{ z3J-N!>14)l>;7pyHPU)6)7-4gTyoCO*)S#D13*>1X59HB^8_SZuCjQuk$|G=3x*JDWF!wS*Xgv^Mo;6=E^{B@> zP^q^_{Ygb1e`DQxdE&kH;h+x}<9Qlx=)siTnbi3dDZaX$pid=NK_K@gnLdc`M%a!E zuGfM>CX03%Q8q>0OnKfApFtzBgPBGnhN8~6d2QW{yxWub2fTL7l~Gwm9JnO??{k)WsURdmCzc!J#)AGUl{MwMac39@?;6h#C2U{Vyv&-*1<=>UyvLr09sn`UCtI$J*qkX|0D+~y29G3zp1G*5Iv-U8!%c_w?C!b+yvvV&2Pf=l{^Vj5#QFWz zIBuf}ht9>yp9SxCt1r4yMXG*~+3!px;V({AW7Z!`pQE4@O7Xl#7yX`En>+YRwl z8gkO}&estDENj7a95aAa+BA;*lDq+v`n>wh$Ut_ij9^%8kIogHDiUz{nywW!@+d@g zE>71G6sL=Rl|h9ix*gUb(H_!x^f-d@h`#1cE=K(r*q|7JitqBPO6Fg_O#N02ABq*b zg$J9{{e`3zouIj3wFY>%cYT(=SPa2AH{3=oOGi3(#aHj7>2ia15nCA3)R;5B!pWJ3 z^xqhqi+Dxx{aDYW=z(lE*Dzw2dQGV0M&` zS3r(}Mv$BCw}&Kxu?h|A0_j{Q)B=l*1rh#|)Q6{tI{zztOg|uOiu~?DOmx_)T-vqq zEwQ1k$jnH%`52T4Musa^X2pJjy{gU=-Cr^bs~-=dU+m@<~}Na>T!G#a%# z`IJh(U7SL~KfF~Wcj%U4apEe8l9;P!eEhLn*34U{3KgO>|F)^3Y1hixtJ@yBhe@F) zfhR_30}$3=1pHQv@D*!;NO;UaymV+A1efJ?Ppf zv(08vagvA@V(O0o%`ghDEdEJZ$tOg@%jnnwiBxtkI-2?&1?nT7Q4NXYVkKAc?DuB! z^8Qp24LWGj#B2`gmRzxho*vbH(~N%=VIgt+(%FUGfyo=>l^qD#@Jfd2_~ZoIwaUE? zr#a!nTJrnsH$y||*u<_xKV5Si%ER`hq~u3J$Gmv`@BStPI5SY=mS2|8q>xxDD?M@J zDNjjpBc^WB=|uUj)s10aZrOOFpOd_lEwbtI>OEW>S<1aUFneo!nCh+1wqVi^;Ctl@ zh;UL}2kEBEx1xSc_P6FsDk9gKwzj*6O5pgINu>tm4d|pT1P3OKAUVn!)1d`k zH6FvS_QHnG6G;D3PwjUrxS=pd1kyld?P(QD%!=#!0my!`=ZU_jVDJAm>M7vv=q<6f71YaDzD<3ow5=27f%Sb8C8f^E64( zBN9PX4xYsa8pqhPCnIvwvv-2HM$(A&XVtllrlr6={9BWuuPGILsr{HvlR3p!x zs_oCFOb(d==vWxo&he}3GGC-uyoIGHc|Lx#*|t%jI_RE1Qt0}96uWn!^Ey_d%2T0) zh)>k##qRZJtZ$fxyHGT~mOFerH+WNSmN%G_adnw}X@BTlDrL3q7q(DkZF07_BFFt6 zW?I4GyIL0t2lCHWXKlnCDC1xVg%mz68N{QtON+e- zPy#;2HWgT1AGVvYqX8z%T0&VoCvc@rCeMCzDaMwbPX1}?uSpC6*~_T~7qXKkrd-a? zj%7c@4=-H(EEcbXYjU+mpt(P~u8jbo@Mheu(|_Wpi9Pf`dCD)Oemc&TZZiNuj=03a zvT%Bt{(sON6c#n2I{L!gzV^SMkQ|uRp~*aF^8X9$_#@5wf@pE`ebM4-@kV;s13I>y$t)o5Ycp;9n5x(LYVhk^cL;>mC0qi{c>xHlQ3< zY{S1vCPz@-|A*T$`6|&dl~0_8i#g`Dn@t7%j%@&ZS`nOQsO0hEL$<^xJpU!Lhj;uI z?)Vb7VusmSOoRbM1(MFJRfL*r`?H(rj9O3IAJ72^5o@sVq5viDZ~^eb=~je}y~YP< zcfYHBjx&%(>uk|N(@If7kHE0nwsAk_R&ztKdK3{>K;(;1mL*N|%aD@5K9jlIe+ye9kY z&lf$Rf!K#MH>cs}t|^FYzX!Fmc<@MlXe?k`6?_PQpJFr9UQfWR6bM^gI9amXeu1=% z`cM_0z5x-~TLzt!yujnMYM|#ZTx{X`=D##~1rqUv$#onX(N)TXoPs1Uf z0KehZvM5q9->B@Bem^4Fe2308>^0j{cZj7?zz4b^svKfG)HT10X+P1i`qO6Jg|5=d;`8FJu_51 zCP4^@mHo9swR_BI=KLDaRfjDw&9d@+I#_xfHHgyAOWoDHE{1Tp+JH2wzKn>Exe*d~ z8Qify^Pv1UC`#5KdVL%Y$$6?9Whl-n?Ex=vbX7{k5x@+Y+q^rKj=t|gt<1Hx1#-r( z=KuC4>BSn;Qeq8aV482=8)ksnue}h zZKxk$8{F3QYhJp|zE=@PrR;0zzpXl?bP;Mb11;(K;IpfoDZQOG>nwVN z))}c}c(xOs%yWV(^vdy!aa7*`FZnSIdzZgpu2cFDQ3mjVf5!*lVGWx?q-II6^+vNrzQF@l1*k_iv zg7rT=n+l8PlW3-h6OH27T?XY%uqNLSb?`8>rCda|9^ZEKFBYDReL9!kyLwqkpVXrA zl6uDj|zrJ7Z&?eaUF%1;Q2ccV-R_m8(hqzH91;KWv#@c)!~(^3T&JK7$tP z?0~7Qc1#Xw_BEtz=Q}6YE$b+&ZLx=m0PYfW(Zv-9Nj)b)ox+V>Wtv9;(SDV7t*% z@>s7tQnWQGd}jIcvxk|z4+OlY46Fct%b)!?Zx<(Mm*Fdt;d-M$8<+FkO)`f#R$PQq z%e>Q2^CfM6VxG~0SG$U0{WT)7G5nymEr#2+ZWwdH-dkQJ2)kDBSJsbgj)&p%xmbV7@Dua~Sd>CXIUX0Yq?E8`v~nc=5aR3bCbtqzJ=59mykV zi3X4!=n0bET!3}f`Nu+)d4fXL<- zBS}9-AjniJRUo>0K0fKxRBEj+^Up`mqZuCQRYtP_fq#!rgUpsy(01bHB`Em)dA9~s zE*=s<=OXb&jM(Y3MZEL{H}g!vEX%iYPIO2&i07v$+^w8}*!WuG&j}EU=!bLkq{ty) zJRh_pDix|h>5<)Yhi*lIe#9DXQ|g8D-`5Rvk1P`C4)@)Rbbx?_O13UHvYYyUGi;6@@;~fmnzEdo*RqOfduoxVt-nt zQkIwM#jZ0LW<`0mCuz>ce2Kp*fLIGQ6R|SXU6o=J-8=E0#{=0NJv&w+kv{uxEGx!R zJ|<9as5ztd^d^%dF_x4i%;2*WUH|8UWaewVj=BUVJCG+?FP*ZrVM&Ssf$0rFH41zFpGKev+?MC(p zvp}wsbAzvav^+Dwjo?b%O~6Fb(d)nF!%XR18o{a96R(sWtm#gd*ThMv(b6o6X) zq5LV=I#JKOHb4Af+hL0mr)010i+wp=Zif#OcY06yl4G@Mis84HdviE7Tzr~bnN}nNbR}oN@FRyDUGsI|5iICHN0SWT5nL}#7%d$S2aL5 
zB^$n%+kXSd6`NdNyWf8h*v1}&TqgAEql$ryiuRy^xL76}y9Du@teAyelyh0o{? zh?sC35gXJu^*%+ZI%lvgSDhg-c?a$5TzLyQRoa~U>}?B%kB5u1H(uwuZVl}o0t0Ah zf1^2R835mucnX1jMAnzBgvLL?l5=@9^YuI7Uv9aTo88H262k;8AJ@9Ml6}{(<0S;B zIG7VY+}>cdbv%3i>sfZTj>BY0+t-O5wAK=M-}8x0BrsRyCK!biKGhrkq~25*qy9&< zLU;$gM$z_*XZ*9H5=BKXju4-9HaY^qn{Ehe_d8zr_O19L6xcyXBmUh z{E+l=!$*RoPe4H-;%?m{;ds-tL%GF+Cm!6AK9nVAS*eovrTyerYPkcX7WX1^AYmO{B;VO_d9zsdy~MS%XIum+4;Jc@Bm>fE3f8@ATW?xtO}!aOHW zTO^%*tzM$xTQ*b*h~01D*GY0k=U-F5^DbZPp|y~*3W;WX;f67SUlF}*x>R8RJCGN^ zew%uI=L6^US(1B>Uh?ZMo)W69uzUJ9Q@U4SX6!b8$QA z*-CRII~@ZrNlv4Nc7EDL{r@RS!)AgHW{gHMrN&X_IlUn!Z*e) zk-42JnQIApUJt=Tu0|9r2#wZfyNU%_H4y!vUJ>(Tf)aya>x2T)o`C6&L zrH6heBjebyn&s%%L+^&#$(F^9DsOo2pV}Q$UKlSs?*C+w;*Nb7-&Nc9XT_-doTsQ- z+nJlf6pv?DdZ=41x}cTo@8$eGT$D#^pi(umu5Xv-aYdI``%)N-95fOxGl-((pKi<< zY8LN_K7R|vhEqtWk=kXazAmo^2>)}yi+QPOuYNJqLQ?*|xC}~e#IE^SD3B*PzT9$N z`h@2+csm?V;bCVr-XGM#@Ph4-!*VT-b0+5bYRYD4pU{5W$t!-?_~?@_`IV*~@~4~g zKSSEiJB*B%T-k1P$DGLI6A0$MIYXYuY=*Dq12n1-^zT^0v1-EY)|4EfdE>#u6idpE zKYl?NxKMADrqNZo_jSAh5NI&Rw2#XyXjb6|S*P2gmEmUCJ^7W;!YECKwFO1+_)BiW ziz9P2Z!Y+rGTT_)Nfj+KAfb+rd;a{{sLVL13jkK==fMaxlpjHC_VV9=YE;l(Z}?pv zVZk>k`Me^cF@ZzJ)g~7I1IAU~AmswD15_AGx?<33om_Od8&tu=3nchL!E~tX(id5U zeHSX!XvDeSi=P(w9QCtY(6(>z6gAJK*M6+>+I5I3w@Q|fBZoXuK7?+0!!-ChEw8~va`%LZw(N71)`MK+OKuv~q-5G{ex9!r+ zKXl0Xg$k)_!a;!V|9xc+aRS1xQu5@W^Tyg#Y?4~ z+cga=lW50)Ce&jZU0@(Dthg7P?R>lsZ>e}!*$0{PnYa0xWT~i5+9`Q{LkUw=xl~PKNAc6Eg)X`U^ru@#)AbF(` zfd3^87yUv5HML`}t{scy&|koKx-5SKo| zAC4}RRU871aVsLU%tqO|L(hrk{8iS&6LaNrPf{5!>EH)L5357u&Q?fX<(xi<+z;I` z553V0q*kXTHzX~zOiz<;a?iWl%nG?4J9Y6eds>OmwXgO0ydm|7w9!8;v*y;{b%)NP zBj14>ey1WN=Ao&K$309&mWm8D5sXP;MUj)y_e8dY1zz>&MVTn2sr;DyW;;cwHr~UW zX1Y;!U2ZcO7s^H&^Dq}GNfN6X44hI9UfroFo7Oj1Mf4IV%q{S6-o9)97p=>JLHrRx0{sbEz(}Hh@oJ=0MRYI4+9cNto2a0c40<_?_@*VqfHqU%UHr~xOiyU| zu|!;Z`1$3as~o?z7iClxrILh~8-}v6BV)X?Pw@>e7IyLsnQa&p;VdLT6i8&dF`L1O zE4ZnsyWc~OYEGhX%I<8LV6YvUZPt7GGa2dC#OAM(ukLY)5(=&=YH=9U29XP(!;0!k zpZs|3g2Bi%@1XPLss#R)t+^9_F=WXtIEDKR4l#UoSEh2vL_z#<~ZL2+=VOM-qz zFtM)${3C|77EYW#?^7nMeL=Ae%a(B*O8JE|C>`FeURo6k&LAh}t4w0NTjkLfk=$Z2 zv-I9C4=Ow7s>4uCzOMPk&G`1dp{J%aaNY=Th zv*yTRD(xZM;L*&%g1NZ2F+Gt#C^m5{*n$KIa*C^hA@64 z(17-j#*zkG6Ly4xrK(J0dg=(11IPr`z&b9J}9BmekeJnjmh7ny&Uk*_!m0{;`T9qplS3^L9XMSy-Q^Q*J+vXrDPX3$FmVt*BCgkGqc)jSYQ!yLW07h-gwK;s@>VU3-$^ z2}9>Z7b6dv>3lNS9zjwo2?sLoiAgB`4teq1A#sFEAR7~1l$HO5IvpRCoXmz1mIuk+ zuZL4P<%`y0|M(pA2BUwAi~=)sPF;r2&wHP!%pl3(ud=BV3}LEvEe|`bM3xv1@!nhF zC=P+N#Zu5=(fmV`^9tgF1BAsYcfTipHy0QT4B_Vips&><9Nx3yXxgwDrew@J;|gV$ zztCkxEn*gh*wzS-N3@DVNAw#x>HFZ#VgybCo2g4CB4g?IQhWI*3bE85r6;!{hA9L; z&&DB6KdNZPzql7KnIm|*RAQpT&+L5=*SbiZ%IMRHtxN z5IhP>Wz0)YkjkyT88j?Q&;FTqbqO8;?I=vZ$odpZn%SsqFTg>H!*YR+@|bE1?WQ3T zB<8q$v7%jKc#f`JKs9sge>sH=L~!!2gI?2pRrLqn@z?)pK0mq`&uOQ*aR94xe$pKZ zrxXyi@lv$7>;MSy=%f9@BL#*i*a&S5qWohK1o0elRw@0_?6wx3Ol~j<@Y1RtnUoh&T(OMrip? 
zFCnD-F8o)Tm}v&!N?Y+9t@1je>XHKK@eLp5RtjNkmhBu~KPZ-nnD;T@GHp~ZVE0Zu z``+YbwPx|t zf9Dt~F<&ukCKpA;dw3l3q2exE|UjZz-Vk?TTW^Rpaw=XkK!+H$NR4KTwLLySMeP+^{>r0)H>-t=Od8TqZm#P!XeJuKV&*8O9^ly+ZG?xpZFGF|T@YdHmD9-2Q*AlNA26#s3<}rTc(7m|D8| z0b`P!(KYG9oxg)l8P9Z*dcaBWJ9I8JvpTjrYyvJY51>ocqDLwY3&4*33M`^~PwBqB z`1AD9Be=*As?)T>v(1?n&Pvp2ES4Sz3Gvn;ks*7NlE^CN53^-;# z*p|HEd|XPv2PeZX5LJ|$=V+Ii!!QYZNTgrv@@W(OnFH#XMm|_*8rqkwHPFzmQRPy` z3yVv|PzqR(IvV^YqaTaPeV@^?cCT!&&D^uQ_wDu3tz_Zb{_1ZdLcwsX23mj{)Hxr0 z43p8dzSrGMESp<~pnPH??R0Sr?k7 zJ6doP8tB@2c)8QK7M=*Z1`&qedtFudzHq;PO6aFg)aDx@2TKP%!=t2UM+;8`((8cCoN7n65qH?lTr&8^Ggu^G&+$Fn%S--i>}05PkoODpQ5#KP0oyfa&W#y5Ds$wd+l-G>Dw?j zehu(Dos?={Wj=a~#9WwjdYN(O8SAkXMfc!IN1pKdRtOnKvDNrZhYYkQX0g+nQSllc z_QY9aMbR^3pQaNr=Y7JZGWMT8}vlVUigF%kXTI zu0*P6GRxyF@&(^SO1mwJcj5QSkHs&@j0%`Ot!=|xcMaESe(-8JV9Dk%7U;4#{wyeB zsarx>I(UtP2#yE4`eM#Svz@MK4rkZd^do1T5KK*$4)^;!f7>p0b7^epq!z@BiRP}f zy9|_c^n+sb83cExNzlj!lDtiS;X*AnkI8q36PMI-qAk7_)Z?Gwc+*;b&tWbq;sjQS z(rs`VZ9mgAF1W65>A|G+z0BV zT->`SDo0EFng&|zcVFD;;uFfz=Qc_{T}8zw8kSeI)tHT6(qEaVA3cM;Hr-|uIb38G zY3c*5M`7ZC78}B&4_~}}|M&8^Z~u%xfNO}t;AN8@htYeN?Nxi=DeR(NYmx?W8_So2 zdiY^}Y1q+tNE80w2&|Dasd~LMNR8#L1qjXVcS9H_i0a z@oD-{3{B$XyfHWRs=LTcjRqw6?(Jebcxg9QV6l`MQhsdlqhOcqVdy)cKVBjPn1IR^ zz-u0*Gz6Arf2soAq3}7J2flb#hPPolhyk}z>vAfL`!X3Vci8F>`18oo^wijpY3W1c zE+L@2Tsl7n7;&*d6DcN!(S1XBka^o&2Kr6$=LJ*6gAwOcg(vIv4xM$9Es!_ zXj~fTe6-Dn{Hbj3*Q=8jL~lTIofpZ%;fayk8s0zK7AHDp@iS>anV7-^4l%W=@R{Qe zS9m;ILF|6qQ1RAN&705(xrPdo;!X9W+|Hy8jfI$!0bk~2%nj#$!;+ic_?NHi zjMsT5UD&<)?#?!sR&lE`opc6WU+CPcsV`1GH_Y-msyXlW z+y;hDwVP|)A&mAL2EFDi!B4&Ke;yvn6PFC3XxM?zegi5^IXwjOt>!|8)2wUQ%0wAN zeXl|u@}^ZOKR#Yc7d6l^ZX2)qMBzwnn-fjlQu<`;i$_d;(NR`XG7q-@@aj~{^Lv*) z0&o8+8YMc)eLN*e6^#pwJ7W2O;}e%n&HJd7b@rAY`O3`om#2^+?WNhFg%r_3l z&2cWPz7%ZAjd4GYA5ANHr)r6x{)Z<&ZpZl}VC({B&|STt)L+&py^Izdw$Ayl9~J;Z zSWX4;+5H_Oru7rU)s4qfwy8McABRwEh}xpiJuMVo0T>>qa$YNclix^ss^p3-)u9rX zz>W26z)|985hr2yE~y;y!~eanzY)s)2!%Fm&l5w9%4%T;Q1dp+<@VZ(NW;yRo9;db z+RbqNE6P!Uzl$XaoBVAFNL-5@mUJD=T#fc;L=Uf8d?j-+GPI2?)tkdoR>s)u~sL^B9 zZ!mDE1j*ekXS(wEn=qtF(uo)zP^t|>!tolzV_m7^ z&996D<-m*%dQSDcn!b{`nB!gYlpswn^T1*jRRgSmA3L>gc~SA#jECqwEOoQguWIE# z^sWRSQn!p`%w1sm)x+UoUhc5eKYW~Wda0nroDiEy%(ZjiEw7_>Zv|oTL0VY?r-1i{ zn>L3HmU7^){%o5KYS|)XS9bc)IX!or;|)~D8Sj?0^I>iv!#(r;&%?Q`M-9Ro&$+n` zhpau=2`VA-@>?u{1bdx}_4+eRCZy3qRiN^$`s&e&zRGH~Ij`XiY#9q^sM)VhO6h$; z0xx`GUL+5a-G|a>J;6&R-#t&@xRezVt-%=T1t`KkoD;DAy>{*>tOlL&^zy<&;y@*E z3oRWgzEe}Mt0Ii9()&fVWvwo)y>C$o*tr@XQjm=1I%#IU!JkwrQ`<{|w_ zsII$uG}4{*OeguWbU2#^oG$w)kE-*F-zv*pL0EbU^b~}ZDj6&5LnBPDXXay7qtJY& z;`ioS-&-gdv&^m9$uOm>|9(Pv-7erL(w8cshd=-PAU@pf!|a+;0WD3h-Jqoau-o(w!jr6Sgq8;)$k!WB2yi;s~f| zeNUnwPk9Iji*3^CDU@SDsi0yd(&=7YLc0@*b)$n1bk`k{s_BgAPhJzgh*WW2_-S+* z2WdOWb8$YWQE7*QPzVmh`G3Yhg)>|<#`MC3s&)t&uL|Q$R>UZt4akJE86xvGL0hWn ziHZHJ=jeMrr`JGtXDa1~KaApo95h_=f?AF?5>IKaTg6Wy1(}(!uO0@rIW?d88yaH8 zcMLqHn#S<$+8my6e!i&s{O66L&*6jP#)t*4klUW)FzBr=Via+b-&b#G?VWx(Nxt$N zg%P{C?Fv4{=AdkF<1}853M!I7?+Tl>h(+LI9o^N+g*-SlonaM>rn#8Zh=%Km8I{}p znts*_v07LJeaSEb8?kn#Y&8XgO862h5^Q34+m&XQrEZ)5nAA)muFQ5^*1XKf+z zdxyE_uJjvAK$A8bS_`Uq_iNLLeWS!Y&})zf^MMa1|HDII7@bVr*WX^98Vx%o*YfiH zcLI^;N2}iJ7Fz6^QAex=u(i_nfsZGZqmia~fl}{w@ED?JV9d3_^qj=%E$%!`s@agv z*AOlxhSm3_Q(>w(JlEyzQP977G0(8&%vk#%m^6earCTIW`QTrt;fy(qC%(&%OKhM$ zENpuI3j{H9;2PEbln5N6D}}|;Uub&ItT!!qs_h+r*Dv8eA^s&T#;qZp%BMw~Iv;zH z1V+eVQI>CQ?#)kejp5p>-a;A|Y9naNvN%)FC)Crs+g|42jcv{7|XVvoc@!W zuVRCqMM0~~I@!$l(&bhdITYGv3n4rUqOGNmdZ8s3P0@2y0&n6d77w8^##tZ=26jg7 z z!=iS_BvPGr}6F5{c{$Mi4JUAL<7F9z9t2QcthC?BZZwGY)= zrHDFVw+EApf2J6+Czkgggc3_CV6@%d2AZ~cd$a&j=Io|oc+wL^v1Rc2xLKA>MMx~1 
z%D#{sLn68I64{Y@=!H}D;RF`O^WQ3mi@4XB3aZRYHI*9vfHlPy!e1+jw8O1wI5uER|h7)*`qvSKvX$5Si3n5W>@y$p7Ps}C9ojo+* z{LvkVIcJhxJ_#Qo-0~Ow+YAEJF{}s_eBMa za4>SR^!6`c-Mg`#w(nU~f8wZ+xQ0#MkPN*w%cpi%>OZRIKMa1WVNJ1x`Qr3>Ydw)P z|AaU=```9QhWyE3FWy6IwL)iqlu0?~X;>g^U#j zO(A!+Kbo~X)h5E!_Lo0_in^Idg{67otSpz*LT_K@=<@+2!*L|(wEcMh$j z!$s?UUT9%_6Te4SbW|2z-!D70@Y-?DEWQ<&-YOeY3?3#{U%MiWxCJ30R!|8372Kd+ zFYcpk^eA>BC-&#}C~=0X8rkGWmCI2a`15YxW1hlfyeOLct%YTk;Y)H}60DKglQqtN;K2 literal 0 HcmV?d00001 diff --git a/docs/source/asr/models.rst b/docs/source/asr/models.rst index b08dd0cf0a8a..2323e1636fcc 100644 --- a/docs/source/asr/models.rst +++ b/docs/source/asr/models.rst @@ -316,6 +316,33 @@ By default, the decoding for HAT model works in the same way as for Conformer-Tr In the case of external ngram LM fusion you can use ``/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py``. To enable HAT internal LM subtraction set ``hat_subtract_ilm=True`` and find more appropriate couple of ``beam_alpha`` and ``hat_ilm_weight`` values in terms of the best recognition accuracy. + +.. _Hybrid-ASR-TTS_model: + +Hybrid ASR-TTS Model +-------------------- + +Hybrid ASR-TTS Model (``ASRWithTTSModel``) is a transparent wrapper for the ASR model with a frozen pretrained text-to-spectrogram model. The approach is described in the paper +`Text-only domain adaptation for end-to-end ASR using integrated text-to-mel-spectrogram generator `_. +This allows using text-only data for training and finetuning, mixing it with audio-text pairs if necessary. + +The model consists of three models: + +* ASR model (``EncDecCTCModelBPE`` or ``EncDecRNNTBPEModel``) +* Frozen TTS Mel Spectrogram Generator (currently, only :ref:`FastPitch ` model is supported) +* Optional frozen Enhancer model trained to mitigate mismatch between real and generated mel spectrogram + + .. image:: images/hybrid_asr_tts_model.png + :align: center + :alt: Hybrid ASR-TTS Model + :scale: 50% + +For the detailed information see: + +* :ref:`Text-only dataset ` preparation +* :ref:`Configs and training ` + + References ---------- diff --git a/docs/source/asr/results.rst b/docs/source/asr/results.rst index a1c96c7e1727..358114e75a40 100644 --- a/docs/source/asr/results.rst +++ b/docs/source/asr/results.rst @@ -26,6 +26,17 @@ If there is a local ``.nemo`` checkpoint that you'd like to load, use the :code: Where the model base class is the ASR model class of the original checkpoint, or the general ``ASRModel`` class. + +Hybrid ASR-TTS Models Checkpoints +--------------------------------- + +:ref:`Hybrid ASR-TTS model ` is a transparent wrapper for the ASR model, text-to-mel-spectrogram generator, and optional enhancer. +The model is saved as a solid ``.nemo`` checkpoint containing all these parts. +Due to transparency, the ASR model can be extracted after training/finetuning separately by using the ``asr_model`` attribute (NeMo submodel) +:code:`hybrid_model.asr_model.save_to(.nemo)` or by using a wrapper +made for convenience purpose :code:`hybrid_model.save_asr_model_to(.nemo)` + + NGC Pretrained Checkpoints -------------------------- diff --git a/docs/source/tts/models.rst b/docs/source/tts/models.rst index a9c7cdf50042..8b283529a706 100644 --- a/docs/source/tts/models.rst +++ b/docs/source/tts/models.rst @@ -12,6 +12,8 @@ This section provides a brief overview of TTS models that NeMo's TTS collection Mel-Spectrogram Generators -------------------------- +.. 
diff --git a/docs/source/tts/models.rst b/docs/source/tts/models.rst
index a9c7cdf50042..8b283529a706 100644
--- a/docs/source/tts/models.rst
+++ b/docs/source/tts/models.rst
@@ -12,6 +12,8 @@ This section provides a brief overview of TTS models that NeMo's TTS collection
 Mel-Spectrogram Generators
 --------------------------
 
+.. _FastPitch_model:
+
 FastPitch
 ~~~~~~~~~
 FastPitch is a fully-parallel text-to-speech synthesis model based on FastSpeech, conditioned on fundamental frequency contours. The model predicts pitch contours during inference. By altering these predictions, the generated speech can be more expressive, better match the semantics of the utterance, and in the end be more engaging to the listener. Uniformly increasing or decreasing the pitch with FastPitch generates speech that resembles the voluntary modulation of voice. Conditioning on frequency contours improves the overall quality of synthesized speech, making it comparable to the state of the art. It does not introduce overhead, and FastPitch retains the favorable, fully-parallel Transformer architecture, with over 900x real-time factor for mel-spectrogram synthesis of a typical utterance. The architecture of FastPitch is shown below. It is based on FastSpeech and consists of two feed-forward Transformer (FFTr) stacks. The first FFTr operates in the resolution of input tokens, and the other one in the resolution of the output frames. Please refer to :cite:`tts-models-lancucki2021fastpitch` for details.

diff --git a/nemo/collections/asr/data/text_to_text.py b/nemo/collections/asr/data/text_to_text.py
index 6ed06aa10dde..23ccd3d7a2ef 100644
--- a/nemo/collections/asr/data/text_to_text.py
+++ b/nemo/collections/asr/data/text_to_text.py
@@ -26,7 +26,6 @@
 import numpy as np
 import torch
 import torch.utils.data
-from nemo_text_processing.text_normalization.normalize import Normalizer
 from torch.nn.utils.rnn import pad_sequence
 from tqdm.auto import tqdm
 
@@ -35,6 +34,12 @@
 from nemo.core.classes import Dataset, IterableDataset
 from nemo.utils import logging
 
+try:
+    from nemo_text_processing.text_normalization.normalize import Normalizer
+except Exception as e:
+    logging.warning(e)
+    logging.warning("nemo_text_processing is not installed")
+
 AnyPath = Union[Path, str]
 
@@ -176,7 +181,7 @@ def __init__(
         asr_use_start_end_token: bool,
         tts_parser: Callable,
         tts_text_pad_id: int,
-        tts_text_normalizer: Normalizer,
+        tts_text_normalizer: "Normalizer",
         tts_text_normalizer_call_kwargs: Dict,
         min_words: int = 1,
         max_words: int = 1_000_000,
@@ -379,7 +384,7 @@ def __init__(
         asr_use_start_end_token: bool,
         tts_parser: Callable,
         tts_text_pad_id: int,
-        tts_text_normalizer: Normalizer,
+        tts_text_normalizer: "Normalizer",
         tts_text_normalizer_call_kwargs: Dict,
         min_words: int = 1,
         max_words: int = 1_000_000,
@@ -426,7 +431,7 @@ def __init__(
         asr_use_start_end_token: bool,
         tts_parser: Callable,
         tts_text_pad_id: int,
-        tts_text_normalizer: Normalizer,
+        tts_text_normalizer: "Normalizer",
         tts_text_normalizer_call_kwargs: Dict,
         min_words: int = 1,
         max_words: int = 1_000_000,

diff --git a/nemo/collections/asr/models/hybrid_asr_tts_models.py b/nemo/collections/asr/models/hybrid_asr_tts_models.py
index 23a98d13c404..1f15e49e0b0d 100644
--- a/nemo/collections/asr/models/hybrid_asr_tts_models.py
+++ b/nemo/collections/asr/models/hybrid_asr_tts_models.py
@@ -31,7 +31,9 @@
     TextToTextDataset,
     TextToTextIterableDataset,
 )
-from nemo.collections.asr.models import ASRModel, EncDecCTCModelBPE, EncDecRNNTBPEModel
+from nemo.collections.asr.models.asr_model import ASRModel
+from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE
+from nemo.collections.asr.models.rnnt_bpe_models import EncDecRNNTBPEModel
 from nemo.collections.asr.modules.conformer_encoder import ConformerEncoder
 from nemo.collections.asr.parts.preprocessing.features import clean_spectrogram_batch, normalize_batch
 from nemo.collections.asr.parts.submodules.batchnorm import replace_bn_with_fused_bn_all

From 588dbe1cff71f7fb63ab21910076663b2cbbcab2 Mon Sep 17 00:00:00 2001
From: Cheng-Ping Hsieh <37269846+hsiehjackson@users.noreply.github.com>
Date: Mon, 8 May 2023 14:03:27 -0700
Subject: [PATCH 19/62] [TTS] Fix aligner nan loss in fp32 (#6435)

* Fix nan loss in fp32

Signed-off-by: hsiehjackson

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: hsiehjackson
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 nemo/collections/tts/losses/aligner_loss.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/nemo/collections/tts/losses/aligner_loss.py b/nemo/collections/tts/losses/aligner_loss.py
index 05d8f28e70fe..1a666d750521 100644
--- a/nemo/collections/tts/losses/aligner_loss.py
+++ b/nemo/collections/tts/losses/aligner_loss.py
@@ -58,9 +58,7 @@ def forward(self, attn_logprob, in_lens, out_lens):
         # Convert to log probabilities
         # Note: Mask out probs beyond key_len
         key_inds = torch.arange(max_key_len + 1, device=attn_logprob.device, dtype=torch.long)
-        attn_logprob.masked_fill_(
-            key_inds.view(1, 1, -1) > key_lens.view(1, -1, 1), -float("inf")  # key_inds >= key_lens+1
-        )
+        attn_logprob.masked_fill_(key_inds.view(1, 1, -1) > key_lens.view(1, -1, 1), -1e15)  # key_inds >= key_lens+1
         attn_logprob = self.log_softmax(attn_logprob)
 
         # Target sequences

From e781b4d8a573af9247579fc13a173d791eb18290 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Mon, 8 May 2023 16:52:59 -0600
Subject: [PATCH 20/62] Update SDP docs (#6485) (#6596)

* add info about SDP, e.g. processor classes, in docs
* add link to SDP docs in README
* address code review comments and add SDP overview diagram
* Fix spelling typo

---------

Signed-off-by: Elena Rastorgueva
Co-authored-by: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com>
---
 README.rst                                  |   1 +
 docs/source/tools/speech_data_processor.rst | 162 +++++++++++++++++++-
 2 files changed, 161 insertions(+), 2 deletions(-)

diff --git a/README.rst b/README.rst
index 614445bc4951..700b4edfdf16 100644
--- a/README.rst
+++ b/README.rst
@@ -124,6 +124,7 @@ Key Features
   * `Text Processing (text normalization and inverse text normalization) `_
   * `CTC-Segmentation tool `_
   * `Speech Data Explorer `_: a dash-based tool for interactive exploration of ASR/TTS datasets
+  * `Speech Data Processor `_
 
 Built for speed, NeMo can utilize NVIDIA's Tensor Cores and scale out training to multiple GPUs and multiple nodes.
 
diff --git a/docs/source/tools/speech_data_processor.rst b/docs/source/tools/speech_data_processor.rst
index 49c3d7a81117..29bc4abb82bd 100644
--- a/docs/source/tools/speech_data_processor.rst
+++ b/docs/source/tools/speech_data_processor.rst
@@ -3,6 +3,164 @@ Speech Data Processor
 
 Speech Data Processor (SDP) is a toolkit to make it easy to:
   1. write code to process a new dataset, minimizing the amount of boilerplate code required.
-  2. share the steps for processing a speech dataset. Sharing processing steps can be as easy as sharing a YAML file.
+  2. share the steps for processing a speech dataset.
 
-SDP is hosted here: https://github.com/NVIDIA/NeMo-speech-data-processor.
\ No newline at end of file
+SDP is hosted here: https://github.com/NVIDIA/NeMo-speech-data-processor.
+
+SDP's philosophy is to represent processing operations as 'processor' classes, which take in a path to a NeMo-style data manifest as input (or a path to the raw data directory if you do not have a NeMo-style manifest to start with), apply some processing to it, and then save the output manifest file.
+
+You specify which processors you want to run using a YAML config file. Many common processing operations are provided, and it is easy to add your own. If you do not need to add your own processors, then all that is needed to process a new dataset is to write a single YAML file containing the parameters needed to process your dataset.
+
+.. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.17.0/sdp_overview_diagram.png
+    :alt: Overview diagram of Speech Data Processor
+
+Overview of how SDP processes a dataset
+---------------------------------------
+
+1. You call the ``main.py`` script, passing in a YAML config file, possibly with some overrides.
+2. The ``main.py`` script calls ``run_processors.py``, passing in your config.
+3. ``run_processors.py`` does the following:
+
+   a. picks out the processors that you specified to be run (you can specify a subset of the processors in the config override, e.g. to avoid re-running time-consuming steps).
+   b. if some of the processors do not have "output_manifest_file" or "input_manifest_file" entries specified, SDP will automatically create temporary files for those.
+   c. instantiates the processor classes using ``hydra.utils.instantiate``.
+   d. runs the run-time processor tests by calling the ``processor.test()`` method (more details about testing :ref:`here`).
+   e. runs the processing method (``processor.process()``) of each processor in order.
+
+
+Layout of config YAML files
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The YAML config file for processing a dataset must contain a key ``processors``, the value of which is a list. Each item in that list is expected to be a dictionary specifying a processor class, i.e. it must have a key ``_target_``, the value of which is a path to a "processor" class, and the remaining keys must be the kwargs necessary to instantiate that class with ``hydra.utils.instantiate()`` (c.f. https://hydra.cc/docs/advanced/instantiate_objects/overview/).
+
+SDP will run the processors specified in the ``processors`` list in the config file. It will also check for a ``processors_to_run`` key in the config file, which can be either the string ``"all"`` or any Python "slice" object like ``3:4``, ``2:`` etc. (if there is no ``processors_to_run`` key, then all of the processors will be run).
+
+.. note::
+    SDP will run the processors in the order in which they are listed in the config YAML file. Make sure to list the processors in an order which makes sense, e.g. create an initial manifest first, and make sure to run ASR inference before doing any processing which looks at ``pred_text`` fields in the manifest.
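To make the layout concrete, a config sketch is shown below. The processor paths, file paths, and parameter values are illustrative assumptions (``CreateInitialManifestMLS`` and ``SubRegex`` are described later in this document, but their exact module paths and kwargs should be checked against the SDP repository):

.. code-block:: yaml

    processors_to_run: "all"      # or a Python-style slice such as "2:"

    processors:
      # Hypothetical first step: create the initial NeMo-style manifest.
      - _target_: sdp.processors.CreateInitialManifestMLS
        language: "spanish"
        raw_data_dir: /data/mls_raw
        output_manifest_file: /data/manifest_initial.json

      # Hypothetical cleaning step; all keys besides _target_ are kwargs
      # passed to the class constructor via hydra.utils.instantiate().
      - _target_: sdp.processors.SubRegex
        input_manifest_file: /data/manifest_initial.json
        output_manifest_file: /data/manifest_clean.json
        regex_params_list:
          - {"pattern": "!", "repl": "."}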
+It has a simple ``process()`` method which saves a copy of the input manifest containing only the fields specified in ``fields_to_save``.
+
+**BaseParallelProcessor**
+~~~~~~~~~~~~~~~~~~~~~~~~~
+``BaseParallelProcessor`` inherits from the ``BaseProcessor`` class. Within the ``BaseParallelProcessor.process()`` method, it calls other methods and functions which allow it to do more complex processing.
+Most importantly, it calls its ``BaseParallelProcessor.process_dataset_entry(data_entry)`` method on every utterance in the manifest, and it does this in parallel, allowing for more efficient processing.
+
+What is a **DataEntry**?
+~~~~~~~~~~~~~~~~~~~~~~~~
+As mentioned above, ``BaseParallelProcessor.process_dataset_entry(data_entry)`` is called on a variable called ``data_entry`` which represents an utterance in our dataset.
+Most often, ``data_entry`` will be a dictionary containing items which represent the JSON manifest entry.
+Sometimes, such as in ``CreateInitialManifestMLS``, it will be a string containing a line for that utterance from the original raw MLS transcript.
+
+``BaseParallelProcessor.process_dataset_entry`` will process ``data_entry`` and output a ``DataEntry`` object.
+
+The ``DataEntry`` class is a dataclass which contains 2 attributes:
+
+1. ``data`` is an Optional dictionary containing items which represent the JSON manifest entry. ``data`` can also be ``None``. If a ``.process_dataset_entry(data_entry)`` method returns a ``DataEntry`` object where ``data is None``, then that utterance will be dropped from the output manifest.
+2. ``metrics``, which can be of any type and is ``None`` by default. This attribute is used by some processors to record summary statistics about the changes made to the dataset; these metrics are aggregated and can be displayed once every utterance has been processed by the processor.
+
+What happens in **BaseParallelProcessor.process()**?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We outline the ``BaseParallelProcessor.process()`` method below:
+
+.. raw:: html
+
+
+
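+For illustration, here is a minimal sketch of what a ``BaseParallelProcessor`` subclass can look like. The class name ``DropEmptyText``, the import path, and the exact method signatures are assumptions made for this example rather than SDP's actual API::
+
+    from sdp.processors.base_processor import BaseParallelProcessor, DataEntry  # hypothetical import path
+
+    class DropEmptyText(BaseParallelProcessor):
+        """Hypothetical processor that drops utterances whose transcript is empty."""
+
+        def process_dataset_entry(self, data_entry: dict) -> DataEntry:
+            # data_entry is one manifest entry, e.g.
+            # {"audio_filepath": "...", "duration": 1.2, "text": "hello"}
+            if data_entry["text"].strip() == "":
+                # Returning data=None drops this utterance from the output manifest;
+                # metrics=1 lets us count how many utterances were dropped.
+                return DataEntry(data=None, metrics=1)
+            return DataEntry(data=data_entry, metrics=0)
+
+In practice, when only text fields need to change, you would normally subclass ``ModifyManifestTextProcessor`` instead, as described next.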
+
+
+**ModifyManifestTextProcessor**
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+``ModifyManifestTextProcessor`` inherits from the ``BaseParallelProcessor`` class.
+
+The ``ModifyManifestTextProcessor`` constructor takes in the following arguments:
+
+* ``text_key`` (string) and ``pred_text_key`` (string): these parameters specify which keys in ``data_entry.data`` will be used for processing (default: ``text_key="text"``, ``pred_text_key="pred_text"``, i.e. by default the processor will refer to and modify the ``"text"`` and/or ``"pred_text"`` attributes of the input manifest).
+* ``test_cases`` (optional, list of dicts): test cases for checking that the processor makes the changes that we are expecting.
+
+``ModifyManifestTextProcessor`` has the following methods:
+
+* ``ModifyManifestTextProcessor.test()``: this method makes sure that the output from the processor matches the expected output specified in the ``test_cases`` parameter.
+* ``ModifyManifestTextProcessor.process_dataset_entry(data_entry)``: this method applies processing to a ``data_entry``. First, spaces are added to the start and end of the 'text' and 'pred_text' entries (if they exist), then the abstract method ``ModifyManifestTextProcessor._process_dataset_entry(data_entry)`` is called. Then, any extra spaces (e.g. two spaces next to each other '  ') are removed from 'text' and 'pred_text' entries.
+* ``ModifyManifestTextProcessor._process_dataset_entry(data_entry)``: this is an abstract method which will be overridden by children of ``ModifyManifestTextProcessor``.
+
+How to make your own processor classes
+--------------------------------------
+
+We will describe how to make your own processor classes by referring to SDP's existing classes; a short code sketch is also given after the class diagram below.
+
+Creating an initial manifest
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+One of the child classes of ``BaseParallelProcessor`` provided in SDP is ``CreateInitialManifestMLS``. It downloads raw MLS data for a specified language, and creates an initial manifest (in the format expected by NeMo) which can be cleaned by subsequent processors.
+
+The ``CreateInitialManifestMLS.prepare()`` method downloads and extracts the raw data.
+
+The ``CreateInitialManifestMLS.read_manifest()`` method reads the lines in the raw MLS transcript file.
+
+The ``CreateInitialManifestMLS.process_dataset_entry()`` method takes in the lines from the raw MLS transcript file, and outputs ``DataEntry`` objects containing entries that will be saved into the manifest (i.e. ``"audio_filepath"``, ``"duration"``, ``"text"``) for each utterance.
+
+
+A **ModifyManifestTextProcessor** subclass that cleans the reference text
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+One of the classes provided in SDP is ``SubRegex``. At initialization, it takes in ``regex_params_list``, a list of dictionaries which must contain the keys ``"pattern"``, ``"repl"``, and, optionally, ``"count"``. These parameters are fed into ``re.sub`` to apply the regex substitutions. The substitutions will be applied to the data at ``text_key`` (i.e. ``data_entry.data[self.text_key]``). By default, ``text_key="text"``, i.e. the substitutions will be applied to the ``"text"`` attribute of the manifest.
+
+In its ``_process_dataset_entry(data_entry)`` method, the ``SubRegex`` processor applies the string-to-string conversion to the input ``data_entry``. Its output is a ``data_entry`` with the changes applied to ``data``, and the metrics of which regex patterns caused a substitution to be made.
+These metrics will be aggregated over all utterances by the ``BaseParallelProcessor`` class. ``SubRegex`` also has a ``finalize(metrics)`` method which will log information about the aggregated metrics after all of the utterances in the manifest have been processed.
+
+A **ModifyManifestTextProcessor** subclass that drops incorrectly transcribed utterances
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+One of the classes provided in SDP is ``DropHighLowCharrate``. At initialization, it takes in ``high_charrate_threshold`` and ``low_charrate_threshold``; an utterance will be dropped if its character rate is above the former or below the latter. This is helpful for automatically filtering out incorrectly transcribed utterances.
+
+In its ``_process_dataset_entry(data_entry)`` method it evaluates the character rate of the utterance (by dividing the length of ``data_entry.data[self.text_key]`` by the value of ``data_entry.data["duration"]``). If the character rate is within bounds, it will return the same ``data_entry`` that was input. If the character rate is out of bounds, it will return a ``data_entry`` with ``data=None`` and ``metrics`` which reflect the applied changes.
+Similar to the ``SubRegex`` class, it has a ``finalize(metrics)`` method which will log information about the aggregated metrics after all of the utterances in the manifest have been processed.
+
+Class diagram
+-------------
+A diagram of the classes mentioned above is included here. Arrows represent inheritance.
+
+We omit the details of the ``CreateInitialManifestMLS`` class in the diagram in order to save space.
+
+
+.. raw:: html
+
+
+
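+Here is the code sketch mentioned above: a minimal custom ``ModifyManifestTextProcessor`` subclass. The class name, import paths, and exact return convention of ``_process_dataset_entry`` are assumptions made for illustration; consult the SDP source for the real API::
+
+    from sdp.processors.base_processor import DataEntry  # hypothetical import path
+    from sdp.processors.modify_manifest.modify_manifest import ModifyManifestTextProcessor  # hypothetical import path
+
+    class MakeLowercase(ModifyManifestTextProcessor):
+        """Hypothetical processor that lowercases the data at ``text_key``."""
+
+        def _process_dataset_entry(self, data_entry: dict) -> DataEntry:
+            # Only the abstract method needs to be implemented; the base class
+            # handles padding/stripping spaces and running any ``test_cases``.
+            data_entry[self.text_key] = data_entry[self.text_key].lower()
+            return DataEntry(data=data_entry)
+
+Such a processor could then be listed in the config YAML via its ``_target_`` path, together with ``test_cases`` like ``{input: {text: "Hello There"}, output: {text: "hello there"}}``.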
+
+SDP Tests
+---------
+It is important to make sure that your data processing code has the effect you intend, so SDP has a few different types of tests:
+
+1. Runtime tests
+
+* Before running the specified processors, SDP runs ``processor.test()`` on all specified processors.
+* Currently, the only provided processor classes with a test method are subclasses of ``ModifyManifestTextProcessor``.
+
+  * ``ModifyManifestTextProcessor.test()`` runs any ``test_cases`` that were provided in the object constructor.
+  * This means you can provide test cases in the YAML config file, and the dataset will only be processed if the test cases pass.
+  * This is helpful to (a) make sure that the rules you wrote have the effect you desired, and (b) demonstrate why you wrote those rules.
+  * An example of test cases we could include in the YAML config file::
+
+      - _target_: sdp.processors.DropIfRegexMatch
+        regex_patterns:
+          - "(\\D ){5,20}" # looks for between 4 and 19 characters surrounded by spaces
+        test_cases:
+          - {input: {text: "some s p a c e d out letters"}, output: null}
+          - {input: {text: "normal words only"}, output: {text: "normal words only"}}
+
+2. ``pytest`` tests which can be run locally with ``python -m pytest tests/`` and will be run during the GitHub CI process. There are 2 sub-types:
+
+   a. "End to end" tests (link) which run SDP on a mini version of the raw initial dataset, and make sure the final manifest matches the reference final manifest.
+   b. "Unit tests" for processors and utils (link).

From c3deeac8d58f48e9283d13fb7c32fd259605705f Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Tue, 9 May 2023 12:04:46 -0700 Subject: [PATCH 21/62] Bug/typo fixes (#6599) Signed-off-by: Igor Gitman --- examples/asr/transcribe_speech.py | 10 +++++----- nemo/collections/asr/models/hybrid_rnnt_ctc_models.py | 1 + nemo/collections/asr/models/rnnt_models.py | 2 +- nemo/collections/asr/parts/utils/transcribe_utils.py | 2 +- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index 30700153e340..0ab50dba016b 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -47,7 +47,7 @@ compute_timestamps: Bool to request greedy time stamp information (if the model supports it) compute_langs: Bool to request language ID information (if the model supports it) - + (Optionally: You can limit the type of timestamp computations using below overrides) ctc_decoding.ctc_timestamp_type="all" # (default all, can be [all, char, word]) rnnt_decoding.rnnt_timestamp_type="all" # (default all, can be [all, char, word]) @@ -60,12 +60,12 @@ batch_size: batch size during inference cuda: Optional int to enable or disable execution of model on certain CUDA device. - allow_mps: Bool to allow using MPS (Apple Silicon M-series GPU) device if available + allow_mps: Bool to allow using MPS (Apple Silicon M-series GPU) device if available amp: Bool to decide if Automatic Mixed Precision should be used during inference audio_type: Str filetype of the audio. Supported = wav, flac, mp3 overwrite_transcripts: Bool which when set allows repeated transcriptions to overwrite previous results. - + ctc_decoding: Decoding sub-config for CTC. Refer to documentation for specific values. rnnt_decoding: Decoding sub-config for RNNT. Refer to documentation for specific values.
@@ -209,7 +209,7 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig: # collect additional transcription information return_hypotheses = True - # we will adjust this flag is the model does not support it + # we will adjust this flag if the model does not support it compute_timestamps = cfg.compute_timestamps compute_langs = cfg.compute_langs @@ -254,7 +254,7 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig: else: cfg.decoding = cfg.rnnt_decoding - # prepare audio filepaths and decide wether it's partical audio + # prepare audio filepaths and decide whether it's partial audio filepaths, partial_audio = prepare_audio_data(cfg) # setup AMP (optional) diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py index a413eaeed6fa..9ba5533dbe64 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py @@ -138,6 +138,7 @@ def transcribe( num_workers=num_workers, channel_selector=channel_selector, augmentor=augmentor, + verbose=verbose, ) if paths2audio_files is None or len(paths2audio_files) == 0: diff --git a/nemo/collections/asr/models/rnnt_models.py b/nemo/collections/asr/models/rnnt_models.py index f4e227f510af..7c91aed99cda 100644 --- a/nemo/collections/asr/models/rnnt_models.py +++ b/nemo/collections/asr/models/rnnt_models.py @@ -286,7 +286,7 @@ def transcribe( config['augmentor'] = augmentor temporary_datalayer = self._setup_transcribe_dataloader(config) - for test_batch in tqdm(temporary_datalayer, desc="Transcribing", disable=True): + for test_batch in tqdm(temporary_datalayer, desc="Transcribing", disable=(not verbose)): encoded, encoded_len = self.forward( input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) ) diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py b/nemo/collections/asr/parts/utils/transcribe_utils.py index 8101bee96723..69abf09e8cab 100644 --- a/nemo/collections/asr/parts/utils/transcribe_utils.py +++ b/nemo/collections/asr/parts/utils/transcribe_utils.py @@ -325,7 +325,7 @@ def write_transcription( item['beams'] = beams[idx] f.write(json.dumps(item) + "\n") else: - with open(cfg.dataset_manifest, 'r', encoding='utf_8') as fr: + with open(cfg.dataset_manifest, 'r', encoding='utf-8') as fr: for idx, line in enumerate(fr): item = json.loads(line) item[pred_text_attr_name] = best_hyps[idx].text From 06b0d34c5b3b5faec1f5b2e8ca23daa56a32bb0b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 9 May 2023 16:07:33 -0600 Subject: [PATCH 22/62] Manual garbage collection with an interval (#6469) (#6482) * Manual garbage collection with an interval * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * use trainer.global_step for tracking the interval of GC --------- Signed-off-by: Sangkug Lym Co-authored-by: Sangkug Lym Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper --- .../language_modeling/conf/megatron_gpt_config.yaml | 4 ++++ .../models/language_modeling/megatron_base_model.py | 11 +++++++++++ 2 files changed, 15 insertions(+) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 09b30c08dd47..67999548e8da 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++
b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -216,3 +216,7 @@ model: warmup_steps: 500 constant_steps: 50000 min_lr: 2e-5 + + gc_interval: 0 + # Interval of the host memory garbage collection. When it is zero, collection relies on the automatic garbage collector. + # If an integer value larger than zero is set, collection is done manually by the batch step interval of `gc_interval`. diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 3899c75675db..1237491fa39c 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import gc import os import re from typing import Any, Dict, Optional, Union @@ -148,6 +149,13 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True): "default_on_epoch": False, } + self.gc_interval = cfg.get('gc_interval', 0) + assert self.gc_interval >= 0, "gc_interval should be an integer value larger than or equal to 0." + # If gc_interval > 0, memory garbage collection is manually controlled. + # The automatic garbage collector should be disabled before training starts. + if self.gc_interval > 0: + gc.disable() + def _enable_nvidia_optimizations(self): "These optimizations are present in NVIDIA NGC PyTorch Containers" @@ -351,6 +359,9 @@ def on_train_batch_end(self, outputs, dataloader_iter: Any, batch_idx: int, unus # accumulated gradient updates. grad_scaler.optimizer_update_skipped = None + if self.gc_interval > 0 and (self.trainer.global_step % self.gc_interval == 0): + gc.collect() + def setup_optimization( self, optim_config: Optional[Union[DictConfig, Dict]] = None, optim_kwargs: Optional[Dict[str, Any]] = None, ): From 24c7b4bdcb794536a52a38bbd1737422f095681f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 9 May 2023 15:21:06 -0700 Subject: [PATCH 23/62] Make tensor split contiguous (#6580) (#6593) Signed-off-by: Abhinav Khattar Co-authored-by: Abhinav Khattar --- nemo/collections/nlp/modules/common/megatron/attention.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/modules/common/megatron/attention.py b/nemo/collections/nlp/modules/common/megatron/attention.py index 852a3e3c4f88..64ab50e59118 100644 --- a/nemo/collections/nlp/modules/common/megatron/attention.py +++ b/nemo/collections/nlp/modules/common/megatron/attention.py @@ -380,7 +380,9 @@ def forward( mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] - (query_layer, key_layer, value_layer) = tensor_parallel.split_tensor_along_last_dim(mixed_x_layer, 3) + (query_layer, key_layer, value_layer) = tensor_parallel.split_tensor_along_last_dim( + mixed_x_layer, 3, contiguous_split_chunks=True + ) else: # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)] mixed_kv_layer, _ = self.key_value(encoder_output) @@ -395,7 +397,9 @@ def forward( mixed_kv_layer = mixed_kv_layer.view(*new_tensor_shape) # [sk, b, np, 2 * hn] --> 2 [sk, b, np, hn] - (key_layer, value_layer) = tensor_parallel.split_tensor_along_last_dim(mixed_kv_layer, 2) + (key_layer, value_layer) = tensor_parallel.split_tensor_along_last_dim( + mixed_kv_layer, 2, contiguous_split_chunks=True + ) # Attention head [sq, b, h] -->
[sq, b, hp] query_layer, _ = self.query(hidden_states) From 49203339df8a895a57ad6e849cc48aea5c032d20 Mon Sep 17 00:00:00 2001 From: Samuel Kriman Date: Tue, 9 May 2023 17:25:02 -0700 Subject: [PATCH 24/62] [ASR] Fix for old models in change_attention_model (#6608) * fixes Signed-off-by: sam1373 * done already Signed-off-by: sam1373 --------- Signed-off-by: sam1373 --- nemo/collections/asr/modules/conformer_encoder.py | 11 +++++++---- nemo/collections/asr/parts/mixins/mixins.py | 8 ++++++-- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/nemo/collections/asr/modules/conformer_encoder.py b/nemo/collections/asr/modules/conformer_encoder.py index 9955e35444f4..df5b8f5c69ed 100644 --- a/nemo/collections/asr/modules/conformer_encoder.py +++ b/nemo/collections/asr/modules/conformer_encoder.py @@ -20,7 +20,7 @@ import torch import torch.distributed import torch.nn as nn -from omegaconf import DictConfig, ListConfig +from omegaconf import DictConfig, ListConfig, open_dict from nemo.collections.asr.models.configs import CacheAwareStreamingConfig from nemo.collections.asr.parts.mixins.streaming import StreamingEncoder @@ -884,8 +884,10 @@ def change_attention_model( if att_context_size: att_context_size = list(att_context_size) - else: + elif hasattr(self._cfg, "att_context_size"): att_context_size = self._cfg.att_context_size + else: + att_context_size = self.att_context_size if self_attention_model is None: self_attention_model = self._cfg.self_attention_model @@ -971,8 +973,9 @@ def change_attention_model( m.self_attention_model = self_attention_model if update_config: - self._cfg.self_attention_model = self_attention_model - self._cfg.att_context_size = att_context_size + with open_dict(self._cfg): + self._cfg.self_attention_model = self_attention_model + self._cfg.att_context_size = att_context_size class ConformerEncoderAdapter(ConformerEncoder, adapter_mixins.AdapterModuleMixin): diff --git a/nemo/collections/asr/parts/mixins/mixins.py b/nemo/collections/asr/parts/mixins/mixins.py index a963850341f9..eba896d0478d 100644 --- a/nemo/collections/asr/parts/mixins/mixins.py +++ b/nemo/collections/asr/parts/mixins/mixins.py @@ -412,6 +412,9 @@ def change_attention_model( update_config (bool): Whether to update the config or not with the new attention model. Defaults to True. 
""" + if self_attention_model is None and att_context_size is None: + return + if not hasattr(self, 'encoder'): logging.info( "Could not change the self_attention_model in encoder " @@ -425,8 +428,9 @@ def change_attention_model( self.encoder.change_attention_model(self_attention_model, att_context_size, update_config, self.device) if update_config: - self.cfg.encoder.self_attention_model = self_attention_model - self.cfg.encoder.att_context_size = att_context_size + with open_dict(self.cfg): + self.cfg.encoder.self_attention_model = self_attention_model + self.cfg.encoder.att_context_size = att_context_size def conformer_stream_step( self, From fa89ba525c6fdaf9a778ba88b5d9e225fd4b8155 Mon Sep 17 00:00:00 2001 From: "He Huang (Steve)" <105218074+stevehuang52@users.noreply.github.com> Date: Wed, 10 May 2023 10:09:12 -0400 Subject: [PATCH 25/62] Update manifest.py to use os.path for get_full_path (#6598) * Update manifest.py to use os.path for get_full_path Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update manifest.py to get rid of pathlib Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update manifest.py Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * Update manifest.py Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Vahid Noroozi --- .../common/parts/preprocessing/manifest.py | 23 ++++++++----------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/nemo/collections/common/parts/preprocessing/manifest.py b/nemo/collections/common/parts/preprocessing/manifest.py index 9fd69801ec0d..98194505c589 100644 --- a/nemo/collections/common/parts/preprocessing/manifest.py +++ b/nemo/collections/common/parts/preprocessing/manifest.py @@ -15,7 +15,6 @@ import json import os from os.path import expanduser -from pathlib import Path from typing import Any, Callable, Dict, Iterator, List, Optional, Union from nemo.utils import logging @@ -196,9 +195,11 @@ def get_full_path( ] elif isinstance(audio_file, str): # If input is a string, get the corresponding full path - audio_file = Path(audio_file) - - if (len(str(audio_file)) < audio_file_len_limit) and not audio_file.is_absolute() and not audio_file.is_file(): + if ( + (len(audio_file) < audio_file_len_limit) + and not os.path.isabs(audio_file) + and not os.path.isfile(audio_file) + ): # If audio_file is not available and the path is not absolute, the full path is assumed # to be relative to the manifest file parent directory or data directory. 
if manifest_file is None and data_dir is None: @@ -210,23 +211,17 @@ def get_full_path( # resolve the data directory if data_dir is None: - if is_datastore_path(manifest_file): - # WORKAROUND: pathlib does not support URIs, so use os.path - data_dir = os.path.dirname(manifest_file) - else: - data_dir = Path(manifest_file).parent.as_posix() + data_dir = os.path.dirname(manifest_file) # assume audio_file path is relative to data_dir - audio_file_path = os.path.join(data_dir, audio_file.as_posix()) + audio_file_path = os.path.join(data_dir, audio_file) if is_datastore_path(audio_file_path): # If audio was originally on an object store, use locally-cached path audio_file_path = datastore_path_to_local_path(audio_file_path) - audio_file_path = Path(audio_file_path) - - if audio_file_path.is_file(): - audio_file = str(audio_file_path.absolute()) + if os.path.isfile(audio_file_path): + audio_file = os.path.abspath(audio_file_path) else: audio_file = expanduser(audio_file) else: From f7989f7e5b4633e6676fa453623a489a3a30430f Mon Sep 17 00:00:00 2001 From: fayejf <36722593+fayejf@users.noreply.github.com> Date: Wed, 10 May 2023 08:14:38 -0700 Subject: [PATCH 26/62] Cherry pick commits in #6601 to main (#6611) * fix write Signed-off-by: fayejf * decoding ctc Signed-off-by: fayejf * temp set rnnt decoding return_best_hypothesis to true Signed-off-by: fayejf * add wer cal back to transcribe_speech as requested Signed-off-by: fayejf * add wer cal back to speech_to_text_buffered_infer_rnnt as requested Signed-off-by: fayejf * add wer cal back to speech_to_text_buffered_infer_ctc as requested Signed-off-by: fayejf * style fix Signed-off-by: fayejf * reflect change in asr_evaluator Signed-off-by: fayejf * reflect som and vahid comment Signed-off-by: fayejf * remove return_best_hy=true in transcribe_speech Signed-off-by: fayejf * no text skip Signed-off-by: fayejf * revert partial Signed-off-by: fayejf --------- Signed-off-by: fayejf --- .../ctc/speech_to_text_buffered_infer_ctc.py | 30 +++- .../speech_to_text_buffered_infer_rnnt.py | 32 +++- examples/asr/transcribe_speech.py | 29 +++- .../collections/asr/parts/utils/eval_utils.py | 153 ++++++++++++++++++ .../asr/parts/utils/transcribe_utils.py | 8 +- tools/asr_evaluator/asr_evaluator.py | 43 +++-- tools/asr_evaluator/conf/eval.yaml | 3 +- tools/asr_evaluator/utils.py | 138 +--------------- 8 files changed, 276 insertions(+), 160 deletions(-) create mode 100644 nemo/collections/asr/parts/utils/eval_utils.py diff --git a/examples/asr/asr_chunked_inference/ctc/speech_to_text_buffered_infer_ctc.py b/examples/asr/asr_chunked_inference/ctc/speech_to_text_buffered_infer_ctc.py index 5755297d1600..69ea139d2ed6 100644 --- a/examples/asr/asr_chunked_inference/ctc/speech_to_text_buffered_infer_ctc.py +++ b/examples/asr/asr_chunked_inference/ctc/speech_to_text_buffered_infer_ctc.py @@ -27,7 +27,9 @@ total_buffer_in_secs=4.0 \ chunk_len_in_secs=1.6 \ model_stride=4 \ - batch_size=32 + batch_size=32 \ + clean_groundtruth_text=True \ + langid='en' # NOTE: You can use `DEBUG=1 python speech_to_text_buffered_infer_ctc.py ...` to print out the @@ -45,6 +47,8 @@ import torch from omegaconf import OmegaConf +from nemo.collections.asr.metrics.wer import CTCDecodingConfig +from nemo.collections.asr.parts.utils.eval_utils import cal_write_wer from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchASR from nemo.collections.asr.parts.utils.transcribe_utils import ( compute_output_filename, @@ -79,6 +83,9 @@ class TranscriptionConfig: total_buffer_in_secs: float = 
4.0 # Length of buffer (chunk + left and right padding) in seconds model_stride: int = 8 # Model downsampling factor, 8 for Citrinet models and 4 for Conformer models", + # Decoding strategy for CTC models + decoding: CTCDecodingConfig = CTCDecodingConfig() + # Set `cuda` to int to define CUDA device. If 'None', will look for CUDA # device anyway, and do inference on CPU only if CUDA device is not found. # If `cuda` is a negative number, inference will be on CPU only. @@ -89,6 +96,12 @@ class TranscriptionConfig: # Recompute model transcription, even if the output folder exists with scores. overwrite_transcripts: bool = True + # Config for word / character error rate calculation + calculate_wer: bool = True + clean_groundtruth_text: bool = False + langid: str = "en" # specify this for convert_num_to_words step in groundtruth cleaning + use_cer: bool = False + @hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig) def main(cfg: TranscriptionConfig) -> TranscriptionConfig: @@ -188,11 +201,24 @@ def autocast(): manifest, filepaths, ) - output_filename = write_transcription( + output_filename, pred_text_attr_name = write_transcription( hyps, cfg, model_name, filepaths=filepaths, compute_langs=False, compute_timestamps=False ) logging.info(f"Finished writing predictions to {output_filename}!") + if cfg.calculate_wer: + output_manifest_w_wer, total_res, _ = cal_write_wer( + pred_manifest=output_filename, + pred_text_attr_name=pred_text_attr_name, + clean_groundtruth_text=cfg.clean_groundtruth_text, + langid=cfg.langid, + use_cer=cfg.use_cer, + output_filename=None, + ) + if output_manifest_w_wer: + logging.info(f"Writing prediction and error rate of each sample to {output_manifest_w_wer}!") + logging.info(f"{total_res}") + return cfg diff --git a/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py b/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py index f2b6d143bdb2..385a29b8f417 100644 --- a/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py +++ b/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py @@ -34,7 +34,9 @@ total_buffer_in_secs=4.0 \ chunk_len_in_secs=1.6 \ model_stride=4 \ - batch_size=32 + batch_size=32 \ + clean_groundtruth_text=True \ + langid='en' # Longer Common Subsequence (LCS) Merge algorithm @@ -66,6 +68,7 @@ import torch from omegaconf import OmegaConf, open_dict +from nemo.collections.asr.parts.utils.eval_utils import cal_write_wer from nemo.collections.asr.parts.utils.streaming_utils import ( BatchedFrameASRRNNT, LongestCommonSubsequenceBatchedFrameASRRNNT, @@ -101,7 +104,7 @@ class TranscriptionConfig: # Chunked configs chunk_len_in_secs: float = 1.6 # Chunk length in seconds total_buffer_in_secs: float = 4.0 # Length of buffer (chunk + left and right padding) in seconds - model_stride: int = 8 # Model downsampling factor, 8 for Citrinet models and 4 for Conformer models", + model_stride: int = 8 # Model downsampling factor, 8 for Citrinet models and 4 for Conformer models # Set `cuda` to int to define CUDA device. If 'None', will look for CUDA # device anyway, and do inference on CPU only if CUDA device is not found. @@ -120,6 +123,12 @@ class TranscriptionConfig: merge_algo: Optional[str] = 'middle' # choices=['middle', 'lcs'], choice of algorithm to apply during inference. 
lcs_alignment_dir: Optional[str] = None # Path to a directory to store LCS algo alignments + # Config for word / character error rate calculation + calculate_wer: bool = True + clean_groundtruth_text: bool = False + langid: str = "en" # specify this for convert_num_to_words step in groundtruth cleaning + use_cer: bool = False + @hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig) def main(cfg: TranscriptionConfig) -> TranscriptionConfig: @@ -194,9 +203,13 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig: decoding_cfg.strategy = "greedy_batch" decoding_cfg.preserve_alignments = True # required to compute the middle token for transducers. decoding_cfg.fused_batch_size = -1 # temporarily stop fused batch during inference. + decoding_cfg.beam.return_best_hypothesis = True asr_model.change_decoding_strategy(decoding_cfg) + with open_dict(cfg): + cfg.decoding = decoding_cfg + feature_stride = model_cfg.preprocessor['window_stride'] model_stride_in_secs = feature_stride * cfg.model_stride total_buffer = cfg.total_buffer_in_secs @@ -242,11 +255,24 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig: filepaths=filepaths, ) - output_filename = write_transcription( + output_filename, pred_text_attr_name = write_transcription( hyps, cfg, model_name, filepaths=filepaths, compute_langs=False, compute_timestamps=False ) logging.info(f"Finished writing predictions to {output_filename}!") + if cfg.calculate_wer: + output_manifest_w_wer, total_res, _ = cal_write_wer( + pred_manifest=output_filename, + pred_text_attr_name=pred_text_attr_name, + clean_groundtruth_text=cfg.clean_groundtruth_text, + langid=cfg.langid, + use_cer=cfg.use_cer, + output_filename=None, + ) + if output_manifest_w_wer: + logging.info(f"Writing prediction and error rate of each sample to {output_manifest_w_wer}!") + logging.info(f"{total_res}") + return cfg diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index 0ab50dba016b..531b5c56aa4e 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -25,6 +25,7 @@ from nemo.collections.asr.metrics.wer import CTCDecodingConfig from nemo.collections.asr.models import EncDecCTCModel, EncDecHybridRNNTCTCModel from nemo.collections.asr.modules.conformer_encoder import ConformerChangeConfig +from nemo.collections.asr.parts.utils.eval_utils import cal_write_wer from nemo.collections.asr.parts.utils.transcribe_utils import ( compute_output_filename, prepare_audio_data, @@ -69,6 +70,11 @@ ctc_decoding: Decoding sub-config for CTC. Refer to documentation for specific values. rnnt_decoding: Decoding sub-config for RNNT. Refer to documentation for specific values. + calculate_wer: Bool to decide whether to calculate wer/cer at end of this script + clean_groundtruth_text: Bool to clean groundtruth text + langid: Str used for convert_num_to_words during groundtruth cleaning + use_cer: Bool to use Character Error Rate (CER) or Word Error Rate (WER) + # Usage ASR model can be specified by either "model_path" or "pretrained_name". Data for transcription can be defined with either "audio_dir" or "dataset_manifest". 
@@ -82,6 +88,8 @@ audio_dir="" \ dataset_manifest="" \ output_filename="" \ + clean_groundtruth_text=True \ + langid='en' \ batch_size=32 \ compute_timestamps=False \ compute_langs=False \ @@ -149,6 +157,12 @@ class TranscriptionConfig: # Use this for model-specific changes before transcription model_change: ModelChangeConfig = ModelChangeConfig() + # Config for word / character error rate calculation + calculate_wer: bool = True + clean_groundtruth_text: bool = False + langid: str = "en" # specify this for convert_num_to_words step in groundtruth cleaning + use_cer: bool = False + @hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig) def main(cfg: TranscriptionConfig) -> TranscriptionConfig: @@ -322,7 +336,7 @@ def autocast(): transcriptions = transcriptions[0] # write audio transcriptions - output_filename = write_transcription( + output_filename, pred_text_attr_name = write_transcription( transcriptions, cfg, model_name, @@ -332,6 +346,19 @@ def autocast(): ) logging.info(f"Finished writing predictions to {output_filename}!") + if cfg.calculate_wer: + output_manifest_w_wer, total_res, _ = cal_write_wer( + pred_manifest=output_filename, + pred_text_attr_name=pred_text_attr_name, + clean_groundtruth_text=cfg.clean_groundtruth_text, + langid=cfg.langid, + use_cer=cfg.use_cer, + output_filename=None, + ) + if output_manifest_w_wer: + logging.info(f"Writing prediction and error rate of each sample to {output_manifest_w_wer}!") + logging.info(f"{total_res}") + return cfg diff --git a/nemo/collections/asr/parts/utils/eval_utils.py b/nemo/collections/asr/parts/utils/eval_utils.py new file mode 100644 index 000000000000..5838f3b4035d --- /dev/null +++ b/nemo/collections/asr/parts/utils/eval_utils.py @@ -0,0 +1,153 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from typing import Tuple + +from nemo.collections.asr.metrics.wer import word_error_rate_detail +from nemo.utils import logging + + +def clean_label(_str: str, num_to_words: bool = True, langid="en") -> str: + """ + Remove unauthorized characters in a string, lower it and remove unneeded spaces + """ + replace_with_space = [char for char in '/?*\",.:=?_{|}~¨«·»¡¿„…‧‹›≪≫!:;ː→'] + replace_with_blank = [char for char in '`¨´‘’“”`ʻ‘’“"‘”'] + replace_with_apos = [char for char in '‘’ʻ‘’‘'] + _str = _str.strip() + _str = _str.lower() + for i in replace_with_blank: + _str = _str.replace(i, "") + for i in replace_with_space: + _str = _str.replace(i, " ") + for i in replace_with_apos: + _str = _str.replace(i, "'") + if num_to_words: + if langid == "en": + _str = convert_num_to_words(_str, langid="en") + else: + logging.info( + "Currently support basic num_to_words in English only. Please use Text Normalization to convert other languages! Skipping!" + ) + + ret = " ".join(_str.split()) + return ret + + +def convert_num_to_words(_str: str, langid: str = "en") -> str: + """ + Convert digits to corresponding words. 
Note this is a naive approach and could be replaced with text normalization. + """ + if langid == "en": + num_to_words = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"] + _str = _str.strip() + words = _str.split() + out_str = "" + num_word = [] + for word in words: + if word.isdigit(): + num = int(word) + while num: + digit = num % 10 + digit_word = num_to_words[digit] + num_word.append(digit_word) + num = int(num / 10) + if not (num): + num_str = "" + num_word = num_word[::-1] + for ele in num_word: + num_str += ele + " " + out_str += num_str + " " + num_word.clear() + else: + out_str += word + " " + out_str = out_str.strip() + else: + raise ValueError( + "Currently support basic num_to_words in English only. Please use Text Normalization to convert other languages!" + ) + return out_str + + +def cal_write_wer( + pred_manifest: str = None, + pred_text_attr_name: str = "pred_text", + clean_groundtruth_text: bool = False, + langid: str = 'en', + use_cer: bool = False, + output_filename: str = None, +) -> Tuple[str, dict, str]: + """ + Calculate wer, insertion, deletion and substitution rate based on groundtruth text and pred_text_attr_name (pred_text) + We use WER in function name as a convention, but Error Rate (ER) currently supports Word Error Rate (WER) and Character Error Rate (CER) + """ + samples = [] + hyps = [] + refs = [] + eval_metric = "cer" if use_cer else "wer" + + with open(pred_manifest, 'r') as fp: + for line in fp: + sample = json.loads(line) + + if 'text' not in sample: + logging.info( + "ground-truth text is not present in manifest! Cannot calculate Word Error Rate. Returning!" + ) + return None, None, eval_metric + + hyp = sample[pred_text_attr_name] + ref = sample['text'] + + if clean_groundtruth_text: + ref = clean_label(ref, langid=langid) + + wer, tokens, ins_rate, del_rate, sub_rate = word_error_rate_detail( + hypotheses=[hyp], references=[ref], use_cer=use_cer + ) + sample[eval_metric] = wer # evaluation metric, could be word error rate or character error rate + sample['tokens'] = tokens # number of words/characters/tokens + sample['ins_rate'] = ins_rate # insertion error rate + sample['del_rate'] = del_rate # deletion error rate + sample['sub_rate'] = sub_rate # substitution error rate + + samples.append(sample) + hyps.append(hyp) + refs.append(ref) + + total_wer, total_tokens, total_ins_rate, total_del_rate, total_sub_rate = word_error_rate_detail( + hypotheses=hyps, references=refs, use_cer=use_cer + ) + + if not output_filename: + output_manifest_w_wer = pred_manifest + else: + output_manifest_w_wer = output_filename + + with open(output_manifest_w_wer, 'w') as fout: + for sample in samples: + json.dump(sample, fout) + fout.write('\n') + fout.flush() + + total_res = { + "samples": len(samples), + "tokens": total_tokens, + eval_metric: total_wer, + "ins_rate": total_ins_rate, + "del_rate": total_del_rate, + "sub_rate": total_sub_rate, + } + return output_manifest_w_wer, total_res, eval_metric diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py b/nemo/collections/asr/parts/utils/transcribe_utils.py index 69abf09e8cab..d7946aa2842b 100644 --- a/nemo/collections/asr/parts/utils/transcribe_utils.py +++ b/nemo/collections/asr/parts/utils/transcribe_utils.py @@ -276,7 +276,7 @@ def write_transcription( filepaths: List[str] = None, compute_langs: bool = False, compute_timestamps: bool = False, -) -> str: +) -> Tuple[str, str]: """ Write generated transcription to output file.
""" if cfg.append_pred: logging.info(f'Transcripts will be written in "{cfg.output_filename}" file') @@ -321,7 +321,7 @@ def write_transcription( if compute_langs: item['pred_lang'] = transcription.langs item['pred_lang_chars'] = transcription.langs_chars - if not cfg.ctc_decoding.beam.return_best_hypothesis: + if not cfg.decoding.beam.return_best_hypothesis: item['beams'] = beams[idx] f.write(json.dumps(item) + "\n") else: @@ -344,11 +344,11 @@ def write_transcription( item['pred_lang'] = best_hyps[idx].langs item['pred_lang_chars'] = best_hyps[idx].langs_chars - if not cfg.ctc_decoding.beam.return_best_hypothesis: + if not cfg.decoding.beam.return_best_hypothesis: item['beams'] = beams[idx] f.write(json.dumps(item) + "\n") - return cfg.output_filename + return cfg.output_filename, pred_text_attr_name def transcribe_partial_audio( diff --git a/tools/asr_evaluator/asr_evaluator.py b/tools/asr_evaluator/asr_evaluator.py index da81f33bc8b5..9540d3429138 100644 --- a/tools/asr_evaluator/asr_evaluator.py +++ b/tools/asr_evaluator/asr_evaluator.py @@ -12,19 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. import json - import git -from omegaconf import OmegaConf -from utils import cal_target_metadata_wer, cal_write_wer, run_asr_inference - +from omegaconf import OmegaConf, open_dict +from utils import cal_target_metadata_wer, run_asr_inference +from nemo.collections.asr.parts.utils.eval_utils import cal_write_wer from nemo.core.config import hydra_runner from nemo.utils import logging - """ This script serves as evaluator of ASR models Usage: - python python asr_evaluator.py \ +python asr_evaluator.py \ engine.pretrained_name="stt_en_conformer_transducer_large" \ engine.inference.mode="offline" \ engine.test_ds.augmentor.noise.manifest_path= \ @@ -45,15 +43,34 @@ def main(cfg): report['git_hash'] = repo.head.object.hexsha ## Engine - # Could skip next line to use generated manifest - - # If need to change more parameters for ASR inference, change it in - # 1) shell script in eval_utils.py in nemo/collections/asr/parts/utils or - # 2) TranscriptionConfig on top of the executed scripts such as transcribe_speech.py in examples/asr - cfg.engine = run_asr_inference(cfg=cfg.engine) + # Could skip run_asr_inference and use the generated manifest by + # specifying analyst.metric_calculator.exist_pred_manifest + if cfg.analyst.metric_calculator.exist_pred_manifest is None: + # If need to change more parameters for ASR inference, change it in + # 1) shell script in utils.py + # 2) TranscriptionConfig on top of the executed scripts such as transcribe_speech.py in examples/asr + # Note we SKIP calculating wer during asr_inference stage with calculate_wer=False and calculate wer for each sample below + # for more flexibility and reducing possible redundant inference cost. 
+ cfg.engine = run_asr_inference(cfg=cfg.engine) + + else: + logging.info( + f"Use generated prediction manifest {cfg.analyst.metric_calculator.exist_pred_manifest} and skip engine" + ) + with open_dict(cfg): + cfg.engine.output_filename = cfg.analyst.metric_calculator.exist_pred_manifest ## Analyst - cfg, total_res, eval_metric = cal_write_wer(cfg) + output_manifest_w_wer, total_res, eval_metric = cal_write_wer( + pred_manifest=cfg.engine.output_filename, + clean_groundtruth_text=cfg.analyst.metric_calculator.clean_groundtruth_text, + langid=cfg.analyst.metric_calculator.langid, + use_cer=cfg.analyst.metric_calculator.use_cer, + output_filename=cfg.analyst.metric_calculator.output_filename, + ) + with open_dict(cfg): + cfg.analyst.metric_calculator.output_filename = output_manifest_w_wer + report.update({"res": total_res}) for target in cfg.analyst.metadata: diff --git a/tools/asr_evaluator/conf/eval.yaml b/tools/asr_evaluator/conf/eval.yaml index 9129eddc49f1..176392b9c070 100644 --- a/tools/asr_evaluator/conf/eval.yaml +++ b/tools/asr_evaluator/conf/eval.yaml @@ -38,9 +38,10 @@ engine: analyst: metric_calculator: + exist_pred_manifest: null # specifying a previously generated manifest will skip the engine clean_groundtruth_text: True langid: "en" # specify language to clean text. Note: use text normalization in NeMo for better performance - output_filename: null # specify it if wanna skip engine and use previously generated manifest + output_filename: null use_cer: False metadata: diff --git a/tools/asr_evaluator/utils.py b/tools/asr_evaluator/utils.py index c233376eb13a..84f4bdb62364 100644 --- a/tools/asr_evaluator/utils.py +++ b/tools/asr_evaluator/utils.py @@ -18,8 +18,6 @@ from typing import Tuple from omegaconf import DictConfig, OmegaConf, open_dict - -from nemo.collections.asr.metrics.wer import word_error_rate_detail from nemo.utils import logging @@ -110,6 +108,7 @@ def run_chunked_inference(cfg: DictConfig) -> DictConfig: subprocess.run( f"python {script_path} " + f"calculate_wer=False " f"model_path={cfg.model_path} " f"pretrained_name={cfg.pretrained_name} " f"dataset_manifest={cfg.test_ds.manifest_filepath} " @@ -148,6 +147,7 @@ def run_offline_inference(cfg: DictConfig) -> DictConfig: # 2) add command as "rnnt_decoding.strategy=greedy_batch " to below script subprocess.run( f"python {script_path} " + f"calculate_wer=False " f"model_path={cfg.model_path} " f"pretrained_name={cfg.pretrained_name} " f"dataset_manifest={cfg.test_ds.manifest_filepath} " @@ -163,139 +163,6 @@ def run_offline_inference(cfg: DictConfig) -> DictConfig: return cfg -def clean_label(_str: str, num_to_words: bool = True, langid="en") -> str: - """ - Remove unauthorized characters in a string, lower it and remove unneeded spaces - """ - replace_with_space = [char for char in '/?*\",.:=?_{|}~¨«·»¡¿„…‧‹›≪≫!:;ː→'] - replace_with_blank = [char for char in '`¨´‘’“”`ʻ‘’“"‘”'] - replace_with_apos = [char for char in '‘’ʻ‘’‘'] - _str = _str.strip() - _str = _str.lower() - for i in replace_with_blank: - _str = _str.replace(i, "") - for i in replace_with_space: - _str = _str.replace(i, " ") - for i in replace_with_apos: - _str = _str.replace(i, "'") - if num_to_words: - if langid == "en": - _str = convert_num_to_words(_str, langid="en") - else: - logging.info( - "Currently support basic num_to_words in English only. Please use Text Normalization to convert other languages! Skipping!"
- ) - - ret = " ".join(_str.split()) - return ret - - -def convert_num_to_words(_str: str, langid: str = "en") -> str: - """ - Convert digits to corresponding words. Note this is a naive approach and could be replaced with text normalization. - """ - if langid == "en": - num_to_words = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"] - _str = _str.strip() - words = _str.split() - out_str = "" - num_word = [] - for word in words: - if word.isdigit(): - num = int(word) - while num: - digit = num % 10 - digit_word = num_to_words[digit] - num_word.append(digit_word) - num = int(num / 10) - if not (num): - num_str = "" - num_word = num_word[::-1] - for ele in num_word: - num_str += ele + " " - out_str += num_str + " " - num_word.clear() - else: - out_str += word + " " - out_str = out_str.strip() - else: - raise ValueError( - "Currently support basic num_to_words in English only. Please use Text Normalization to convert other languages!" - ) - return out_str - - -def cal_write_wer(cfg: DictConfig, pred_text_attr_name: str = None) -> Tuple[DictConfig, dict]: - """ - Calculate wer, inserion, deletion and substitution rate based on groundtruth text and pred_text_attr_name (pred_text) - We use WER in function name as a convention, but it currently Error Rate (ER) support Word Error Rate (WER) and Character Error Rate (CER) - """ - samples = [] - hyps = [] - refs = [] - - with open(cfg.engine.output_filename, 'r') as fp: - for line in fp: - sample = json.loads(line) - - if 'text' not in sample: - raise ValueError( - "ground-truth text does not present in manifest! Cannot calculate Word Error Rate. Exiting!" - ) - - if not pred_text_attr_name: - pred_text_attr_name = "pred_text" - - hyp = sample[pred_text_attr_name] - ref = sample['text'] - - if cfg.analyst.metric_calculator.clean_groundtruth_text: - ref = clean_label(ref, langid=cfg.analyst.metric_calculator.langid) - - wer, tokens, ins_rate, del_rate, sub_rate = word_error_rate_detail( - hypotheses=[hyp], references=[ref], use_cer=cfg.analyst.metric_calculator.use_cer - ) - eval_metric = "wer" - if cfg.analyst.metric_calculator.use_cer: - eval_metric = "cer" - - sample[eval_metric] = wer # evaluatin metric, could be word error rate of character error rate - sample['tokens'] = tokens # number of word/characters/tokens - sample['ins_rate'] = ins_rate # insertion error rate - sample['del_rate'] = del_rate # deletion error rate - sample['sub_rate'] = sub_rate # substitution error rate - - samples.append(sample) - hyps.append(hyp) - refs.append(ref) - - total_wer, total_tokens, total_ins_rate, total_del_rate, total_sub_rate = word_error_rate_detail( - hypotheses=hyps, references=refs, use_cer=cfg.analyst.metric_calculator.use_cer - ) - - if "output_filename" not in cfg.analyst.metric_calculator or not cfg.analyst.metric_calculator.output_filename: - # overwrite the current generated manifest - OmegaConf.set_struct(cfg, True) - with open_dict(cfg): - cfg.analyst.metric_calculator.output_filename = cfg.engine.output_filename - - with open(cfg.analyst.metric_calculator.output_filename, 'w') as fout: - for sample in samples: - json.dump(sample, fout) - fout.write('\n') - fout.flush() - - total_res = { - "samples": len(samples), - "tokens": total_tokens, - eval_metric: total_wer, - "ins_rate": total_ins_rate, - "del_rate": total_del_rate, - "sub_rate": total_sub_rate, - } - return cfg, total_res, eval_metric - - def cal_target_metadata_wer(manifest: str, target: str, meta_cfg: DictConfig, eval_metric: str = "wer",) -> dict: """ 
Calculating number of samples (samples), number of words/characters/tokens (tokens), @@ -314,7 +181,6 @@ def cal_target_metadata_wer(manifest: str, target: str, meta_cfg: DictConfig, ev Return: ret (dict): Generated dictionary containing all results regarding the target metadata. """ - if eval_metric not in ['wer', 'cer']: raise ValueError( "Currently support wer and cer as eval_metric. Please implement it in cal_target_metadata_wer if using different eval_metric" From 2f8c1f07a30f519e4b6f1e168e8a3442d5ce172c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 10 May 2023 08:45:34 -0700 Subject: [PATCH 27/62] Create dummy iters to satisy len checks (#6600) (#6603) Signed-off-by: Abhinav Khattar Co-authored-by: Abhinav Khattar Co-authored-by: Eric Harper --- Dockerfile | 2 +- Jenkinsfile | 2 +- .../nlp/models/language_modeling/megatron_gpt_adapter_model.py | 3 ++- .../nlp/models/language_modeling/megatron_gpt_model.py | 3 ++- .../language_modeling/megatron_gpt_prompt_learning_model.py | 3 ++- .../nlp/models/language_modeling/megatron_retrieval_model.py | 3 ++- .../nlp/models/language_modeling/megatron_t5_adapter_model.py | 3 ++- .../collections/nlp/modules/common/text_generation_strategy.py | 2 +- 8 files changed, 13 insertions(+), 8 deletions(-) diff --git a/Dockerfile b/Dockerfile index e8402189a474..4cbbf14314c9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -46,7 +46,7 @@ WORKDIR /workspace/ # Install Megatron-core RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout 3db2063b1ff992a971ba18f7101eecc9c4e90f03 && \ + git checkout 9f8bdeb4814ed61fbc9c7d5b39c7710e77b99754 && \ pip install -e . WORKDIR /tmp/ diff --git a/Jenkinsfile b/Jenkinsfile index 955bea6c5ebf..5edfa05b8d46 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -62,7 +62,7 @@ pipeline { steps { sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout 3db2063b1ff992a971ba18f7101eecc9c4e90f03 && \ + git checkout 9f8bdeb4814ed61fbc9c7d5b39c7710e77b99754 && \ pip install -e .' } } diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_adapter_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_adapter_model.py index cb38ad863a52..2985ab4df3bb 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_adapter_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_adapter_model.py @@ -114,7 +114,8 @@ def get_forward_output_only_func(self): Used for generate method only for now.
""" - def fwd_output_only_func(batch, model): + def fwd_output_only_func(dataloader_iter, model): + batch = next(dataloader_iter) extra_arg = {} ( tokens, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 5cab67a71441..967f6a6cf85f 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -629,7 +629,8 @@ def loss_func(output_tensor): return fwd_output_and_loss_func def get_forward_output_only_func(self): - def fwd_output_only_func(batch, model): + def fwd_output_only_func(dataloader_iter, model): + batch = next(dataloader_iter) extra_arg = {} if len(batch) == 3: batch = [x.cuda() for x in batch] diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py index dd0d9168c16a..cca46b54e8a8 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py @@ -646,7 +646,8 @@ def get_forward_output_only_func(self): Used for generate method only for now. """ - def fwd_output_only_func(batch, model): + def fwd_output_only_func(dataloader_iter, model): + batch = next(dataloader_iter) extra_arg = {} ( tokens, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py b/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py index a9c659e48696..1cce8852a37f 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py @@ -509,7 +509,8 @@ def get_forward_output_only_func(self): Used for generate method only. """ - def fwd_output_only_func(batch, model): + def fwd_output_only_func(dataloader_iter, model): + batch = next(dataloader_iter) extra_arg = {} ( tokens, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_t5_adapter_model.py b/nemo/collections/nlp/models/language_modeling/megatron_t5_adapter_model.py index 32345e829be8..31c147022486 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_t5_adapter_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_t5_adapter_model.py @@ -215,7 +215,8 @@ def get_forward_output_only_func(self): Used for generate method only for now. 
""" - def fwd_output_only_func(batch, model): + def fwd_output_only_func(dataloader_iter, model): + batch = next(dataloader_iter) extra_arg = {} ( tokens, diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index 07607d3840d8..b23f77645d3a 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py @@ -56,7 +56,7 @@ def forward_step(self, batch, tensor_shape): output_tensor = fwd_bwd_function( forward_step_func=self.model.get_forward_output_only_func(), - data_iterator=batch, + data_iterator=iter([batch,]), model=[self.forward_model], num_microbatches=get_num_microbatches(), forward_only=True, From 2eb0d750843c5cd69456a57013b207b76a0da1ce Mon Sep 17 00:00:00 2001 From: Abhinav Khattar Date: Wed, 10 May 2023 08:46:11 -0700 Subject: [PATCH 28/62] add GPT eval mode fix for interleaved to main (#6610) Signed-off-by: Abhinav Khattar --- .../models/language_modeling/megatron_gpt_model.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 967f6a6cf85f..0222eedd54ce 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -666,8 +666,17 @@ def validation_step(self, dataloader_iter, batch_idx): from the dataloader to produce a list of microbatches. The list of microbatches is then piped through the pipeline using megatron-core fwd/bwd functions. """ + if isinstance(self.model, list): + for model_module in self.model: + model_module.eval() + + loss = self.fwd_bwd_step(dataloader_iter, batch_idx, True) - return self.fwd_bwd_step(dataloader_iter, batch_idx, True) + if isinstance(self.model, list): + for model_module in self.model: + model_module.train() + + return loss def validation_epoch_end(self, outputs): if parallel_state.is_pipeline_last_stage(): From c21f29918e70d9e811f6777c557180ccf2fc6673 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 10 May 2023 08:48:01 -0700 Subject: [PATCH 29/62] Fix batch size reconf for T5 FT for multi-validation (#6582) (#6588) Signed-off-by: Abhinav Khattar Co-authored-by: Abhinav Khattar Co-authored-by: Eric Harper --- .../megatron_finetune_model.py | 36 ++++++++++++++----- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index b7a9fb476409..8e59b1e4ce62 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -30,6 +30,7 @@ try: from apex.transformer.pipeline_parallel.utils import ( _reconfigure_microbatch_calculator, + get_current_global_batch_size, get_micro_batch_size, get_num_microbatches, ) @@ -260,16 +261,33 @@ def cast_for_metric(self, pred, label, metric_name, class_labels=None, labels_ar def _reconfigure_and_process_inference_batch(self, batch, ds_config): global_batch_size_per_gpu = batch['text_enc'].size(0) # This should happen only on the last batch of the dataset. 
- if global_batch_size_per_gpu != ds_config.global_batch_size // parallel_state.get_data_parallel_world_size(): + if ( + global_batch_size_per_gpu + != get_current_global_batch_size() // parallel_state.get_data_parallel_world_size() + ): # NOTE: This is reconfiguring to make sure there is no grad-acc for validation batches. - app_state = AppState() - _reconfigure_microbatch_calculator( - rank=app_state.global_rank, - rampup_batch_size=None, - global_batch_size=global_batch_size_per_gpu * parallel_state.get_data_parallel_world_size(), - micro_batch_size=global_batch_size_per_gpu, - data_parallel_size=parallel_state.get_data_parallel_world_size(), - ) + if ( + global_batch_size_per_gpu + != ds_config.global_batch_size // parallel_state.get_data_parallel_world_size() + ): + app_state = AppState() + _reconfigure_microbatch_calculator( + rank=app_state.global_rank, + rampup_batch_size=None, + global_batch_size=global_batch_size_per_gpu * parallel_state.get_data_parallel_world_size(), + micro_batch_size=global_batch_size_per_gpu, + data_parallel_size=parallel_state.get_data_parallel_world_size(), + ) + # NOTE: need to explicitly handle resetting for multi-validation + else: + app_state = AppState() + _reconfigure_microbatch_calculator( + rank=app_state.global_rank, + rampup_batch_size=None, + global_batch_size=ds_config.global_batch_size, + micro_batch_size=ds_config.micro_batch_size, + data_parallel_size=parallel_state.get_data_parallel_world_size(), + ) def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): """ From e6ee3312fb3f35b9d95b54b64c2abed4574708b9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 10 May 2023 10:02:52 -0600 Subject: [PATCH 30/62] Not doing CastToFloat by default (#6524) (#6563) * Not doing CastToFloat by default * Added docustring * Dummy commit --------- Signed-off-by: Boris Fomitchev Co-authored-by: Boris Fomitchev Co-authored-by: Eric Harper --- nemo/utils/cast_utils.py | 18 ++++++++++++------ nemo/utils/export_utils.py | 24 +++++++++++++++++------- 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/nemo/utils/cast_utils.py b/nemo/utils/cast_utils.py index eeb48f35ffa7..21e977ec494d 100644 --- a/nemo/utils/cast_utils.py +++ b/nemo/utils/cast_utils.py @@ -70,8 +70,11 @@ def __init__(self, mod): self.mod = mod def forward(self, x): - with torch.cuda.amp.autocast(enabled=False): - ret = self.mod.forward(x.to(torch.float32)).to(x.dtype) + if torch.is_autocast_enabled() and x.dtype != torch.float32: + with torch.cuda.amp.autocast(enabled=False): + ret = self.mod.forward(x.to(torch.float32)).to(x.dtype) + else: + ret = self.mod.forward(x) return ret @@ -81,7 +84,10 @@ def __init__(self, mod): self.mod = mod def forward(self, *args): - from_dtype = args[0].dtype - with torch.cuda.amp.autocast(enabled=False): - ret = self.mod.forward(*cast_all(args, from_dtype=from_dtype, to_dtype=torch.float32)) - return cast_all(ret, from_dtype=torch.float32, to_dtype=from_dtype) + if torch.is_autocast_enabled(): + from_dtype = args[0].dtype + with torch.cuda.amp.autocast(enabled=False): + ret = self.mod.forward(*cast_all(args, from_dtype=from_dtype, to_dtype=torch.float32)) + return cast_all(ret, from_dtype=torch.float32, to_dtype=from_dtype) + else: + return self.mod.forward(*args) diff --git a/nemo/utils/export_utils.py b/nemo/utils/export_utils.py index cc0ce744a9a6..9fa2bc239eb8 100644 --- a/nemo/utils/export_utils.py +++ b/nemo/utils/export_utils.py @@ -440,22 +440,16 @@ def 
script_module(m: nn.Module): def replace_for_export(model: nn.Module) -> nn.Module: """ - Top-level function to replace default set of modules in model + Top-level function to replace 'default set' of modules in model, called from _prepare_for_export. NOTE: This occurs in place, if you want to preserve model then make sure to copy it first. Args: model : top level module - replace_1D_2D : include 1D -> 2D replacements Returns: model, possibly modified in-place """ from nemo.collections.tts.modules.submodules import MaskedInstanceNorm1d default_replacements = { - "BatchNorm1d": wrap_module(nn.BatchNorm1d, CastToFloat), - "BatchNorm2d": wrap_module(nn.BatchNorm2d, CastToFloat), - "LayerNorm": wrap_module(nn.LayerNorm, CastToFloat), - "InstanceNorm1d": wrap_module(nn.InstanceNorm1d, CastToFloat), - "MaskedInstanceNorm1d": wrap_module(MaskedInstanceNorm1d, CastToFloatAll), "MatchedScaleMaskSoftmax": wrap_module(None, replace_MatchedScaleMaskSoftmax), } @@ -463,3 +457,19 @@ def replace_for_export(model: nn.Module) -> nn.Module: replace_modules(model, default_replacements) # This one has to be the last replace_modules(model, script_replacements) + + +def add_casts_around_norms(model: nn.Module): + """ + Function to put additional to/from float32 casts around operations known to require full precision. + It was used with an extra post-parse script to have TRT preserve extra precision when --fp16 needed. + Should not be needed with TRT 8.6.1 or later. + """ + default_cast_replacements = { + "BatchNorm1d": wrap_module(nn.BatchNorm1d, CastToFloat), + "BatchNorm2d": wrap_module(nn.BatchNorm2d, CastToFloat), + "LayerNorm": wrap_module(nn.LayerNorm, CastToFloat), + "InstanceNorm1d": wrap_module(nn.InstanceNorm1d, CastToFloat), + "MaskedInstanceNorm1d": wrap_module(MaskedInstanceNorm1d, CastToFloatAll), + } + replace_modules(model, default_cast_replacements) From 13e7ddb835b77640253102e1a3454e6bd0d404a7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 10 May 2023 14:58:30 -0700 Subject: [PATCH 31/62] Turn autocast off when precision is fp32 (#6576) * Turn autocast off when precision is fp32 (#6554) * Turn autocast off when precision is fp32 Signed-off-by: Abhinav Khattar * address review Signed-off-by: Abhinav Khattar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixes Signed-off-by: Abhinav Khattar * merge Signed-off-by: Abhinav Khattar --------- Signed-off-by: Abhinav Khattar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper * correct auto-merge Signed-off-by: Abhinav Khattar * correct auto-merge Signed-off-by: Abhinav Khattar * add to GPT SFT Signed-off-by: Abhinav Khattar --------- Signed-off-by: Abhinav Khattar Co-authored-by: Abhinav Khattar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper --- .../megatron_base_prompt_learning_model.py | 4 ++++ .../models/language_modeling/megatron_bert_model.py | 8 ++++++-- .../language_modeling/megatron_finetune_model.py | 2 +- .../nlp/models/language_modeling/megatron_gpt_model.py | 6 +++++- .../megatron_gpt_prompt_learning_model.py | 6 +++++- .../models/language_modeling/megatron_gpt_sft_model.py | 2 +- .../megatron_lm_encoder_decoder_model.py | 10 +++++++--- .../language_modeling/megatron_retrieval_model.py | 4 ++++ .../megatron_t5_prompt_learning_model.py | 2 +- 
.../models/machine_translation/megatron_nmt_model.py | 2 +- .../nlp/modules/common/text_generation_strategy.py | 2 +- 11 files changed, 36 insertions(+), 12 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py index 9e79cb4a41e7..88da586832df 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py @@ -146,6 +146,10 @@ def init_model(self, cfg: DictConfig, trainer: Trainer): self.lowest_val_loss = None self.prompt_encoder = None + self.enable_autocast = ( + True if (not self.megatron_amp_o2) and (self.autocast_dtype in [torch.float16, torch.bfloat16]) else False + ) + # define validation metric if self.cfg.get('report_validation_metric', False): validation_metric = self.cfg.get('validation_metric', 'accuracy') diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py index cd50f8414470..bda1a595655a 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py @@ -90,6 +90,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): else: raise ValueError('precision must be in [32, 16, "bf16"]') + self.enable_autocast = ( + True if (not self.megatron_amp_o2) and (self.autocast_dtype in [torch.float16, torch.bfloat16]) else False + ) + # used in NVIDIA NGC PyTorch containers # buffer used during train_step for logging average loss over gradient accumulation steps self._reduced_lm_loss_buffer = [] @@ -311,7 +315,7 @@ def training_step(self, dataloader_iter, batch_idx): dtype=self.autocast_dtype, grad_scaler=self.trainer.precision_plugin.scaler.scale if self.cfg.precision == 16 else None, sequence_parallel=self.cfg.get('sequence_parallel', False), - enable_autocast=True, + enable_autocast=self.enable_autocast, ) if losses_reduced_per_micro_batch: @@ -412,7 +416,7 @@ def validation_step(self, dataloader_iter, batch_idx): tensor_shape=tensor_shape, dtype=self.autocast_dtype, sequence_parallel=self.cfg.get('sequence_parallel', False), - enable_autocast=True, + enable_autocast=self.enable_autocast, ) if losses_reduced_per_micro_batch: diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index 8e59b1e4ce62..4ed71756e60e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -318,7 +318,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): dtype=self.autocast_dtype, grad_scaler=self.trainer.precision_plugin.scaler.scale if self.cfg.precision == 16 else None, sequence_parallel=self.cfg.get('sequence_parallel', False), - enable_autocast=True, + enable_autocast=self.enable_autocast, ) # only the last stages of the pipeline return losses diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 0222eedd54ce..9cb4efca57fc 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -156,6 +156,10 @@ def __init__(self, 
cfg: DictConfig, trainer: Trainer): else: raise ValueError('precision must be in [32, 16, "bf16"]') + self.enable_autocast = ( + True if (not self.megatron_amp_o2) and (self.autocast_dtype in [torch.float16, torch.bfloat16]) else False + ) + self.transformer_engine = cfg.get('transformer_engine', False) # configuration used for inference @@ -348,7 +352,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): dtype=self.autocast_dtype, grad_scaler=self.trainer.precision_plugin.scaler.scale if self.cfg.precision == 16 else None, sequence_parallel=self.cfg.get('sequence_parallel', False), - enable_autocast=True, + enable_autocast=self.enable_autocast, ) # only the last stages of the pipeline return losses diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py index cca46b54e8a8..95448e67bd11 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py @@ -150,6 +150,10 @@ def init_model(self, cfg: DictConfig, trainer: Trainer): self.virtual_prompt_style = VirtualPromptStyle(cfg.virtual_prompt_style) self.model_type = ModelType.encoder_or_decoder + self.enable_autocast = ( + True if (not self.megatron_amp_o2) and (self.autocast_dtype in [torch.float16, torch.bfloat16]) else False + ) + if self.pipeline_parallel: assert ( self.cfg.optim.sched.get("min_lr", 0.0) == 0.0 @@ -309,7 +313,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): dtype=self.autocast_dtype, grad_scaler=self.trainer.precision_plugin.scaler.scale if self.cfg.precision == 16 else None, sequence_parallel=self.cfg.get('sequence_parallel', False), - enable_autocast=True, + enable_autocast=self.enable_autocast, ) # only the last stages of the pipeline return losses diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index 7c3bddc9a08c..a52a7d22e219 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -302,7 +302,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): dtype=self.autocast_dtype, grad_scaler=self.trainer.precision_plugin.scaler.scale if self.cfg.precision == 16 else None, sequence_parallel=self.cfg.get('sequence_parallel', False), - enable_autocast=True, + enable_autocast=self.enable_autocast, ) # only the last stages of the pipeline return losses diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py index 365b1870a2d5..80d980858f1c 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py @@ -135,6 +135,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): else: raise ValueError('precision must be in [32, 16, "bf16"]') + self.enable_autocast = ( + True if (not self.megatron_amp_o2) and (self.autocast_dtype in [torch.float16, torch.bfloat16]) else False + ) + self.enc_dec_model.model_type = ModelType.encoder_and_decoder def setup_optimizer_param_groups(self): @@ -328,7 +332,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): 
decoder_seq_length=self.max_decoder_seq_length, dtype=self.autocast_dtype, grad_scaler=self.trainer.precision_plugin.scaler.scale if self.cfg.precision == 16 else None, - enable_autocast=True, + enable_autocast=self.enable_autocast, ) # only the last stages of the pipeline return losses @@ -996,7 +1000,7 @@ def dummy(): num_microbatches=1, decoder_seq_length=encoder_seq_length, dtype=self.autocast_dtype, - enable_autocast=True, + enable_autocast=self.enable_autocast, ) if output_tensor: @@ -1160,7 +1164,7 @@ def dummy(): num_microbatches=1, decoder_seq_length=encoder_seq_length, dtype=self.autocast_dtype, - enable_autocast=True, + enable_autocast=self.enable_autocast, ) # get output tensor if parallel_state.is_pipeline_last_stage(): diff --git a/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py b/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py index 1cce8852a37f..afd8ad54d150 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py @@ -105,6 +105,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): raise ValueError('precision must be in [32, 16, "bf16"]') self.model.model_type = ModelType.encoder_and_decoder + self.enable_autocast = ( + True if (not self.megatron_amp_o2) and (self.autocast_dtype in [torch.float16, torch.bfloat16]) else False + ) + if hasattr(self.cfg, "shape_file"): set_base_shapes(self, self.register_artifact("shape_file", self.cfg.shape_file), rescale_params=False) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py index 410bf338394b..ae09ad27387b 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py @@ -197,7 +197,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): dtype=self.autocast_dtype, grad_scaler=self.trainer.precision_plugin.scaler.scale if self.cfg.precision == 16 else None, sequence_parallel=self.cfg.get('sequence_parallel', False), - enable_autocast=True, + enable_autocast=self.enable_autocast, ) # only the last stages of the pipeline return losses diff --git a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py index 248a3c8e2ec0..05fb492828aa 100644 --- a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py +++ b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py @@ -316,7 +316,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): dtype=self.autocast_dtype, grad_scaler=self.trainer.precision_plugin.scaler.scale if self.cfg.precision == 16 else None, sequence_parallel=self.cfg.get('sequence_parallel', False), - enable_autocast=True, + enable_autocast=self.enable_autocast, ) # only the last stages of the pipeline return losses diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index b23f77645d3a..16935be1cc2d 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py @@ -62,7 +62,7 @@ def forward_step(self, batch, tensor_shape): forward_only=True, tensor_shape=tensor_shape, 
dtype=self.model.autocast_dtype, - enable_autocast=True, + enable_autocast=self.model.enable_autocast, ) return output_tensor From 5e6705fa3f45c01956d20c7fa80a33aff3bfc253 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 10 May 2023 16:13:58 -0600 Subject: [PATCH 32/62] update core commit hash in readme (#6622) (#6623) Signed-off-by: Abhinav Khattar Co-authored-by: Abhinav Khattar --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 700b4edfdf16..929cc7f86abc 100644 --- a/README.rst +++ b/README.rst @@ -254,7 +254,7 @@ To install Megatron-core, run git clone https://github.com/NVIDIA/Megatron-LM.git cd Megatron-LM - git checkout 3db2063b1ff992a971ba18f7101eecc9c4e90f03 + git checkout 9f8bdeb4814ed61fbc9c7d5b39c7710e77b99754 pip install -e . It is highly recommended to use the NVIDIA PyTorch or NeMo container if having issues installing Apex or any other dependencies. From ca9cbbbed8cbb827660d2f5e536763488a3cfe25 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 10 May 2023 17:27:00 -0700 Subject: [PATCH 33/62] add hat image to docs (#6619) (#6621) Signed-off-by: andrusenkoau Co-authored-by: Andrei Andrusenko <52885736+andrusenkoau@users.noreply.github.com> --- docs/source/asr/images/hat.png | Bin 0 -> 110874 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 docs/source/asr/images/hat.png diff --git a/docs/source/asr/images/hat.png b/docs/source/asr/images/hat.png new file mode 100644 index 0000000000000000000000000000000000000000..4631fe89211d20a9afe5cd038bb2c8618b3f1d4b GIT binary literal 110874 [110874 bytes of base85-encoded PNG payload omitted; the patch only adds the docs image docs/source/asr/images/hat.png]
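Condensing the conventions from patches 27 and 28 above: megatron-core's forward-step functions now pull their batch from an iterator rather than receiving it directly, so callers that hold a single in-memory batch wrap it as a one-element iterator; and validation on interleaved pipeline schedules, where the model is a list of stage modules, is bracketed by eval()/train() switches. A minimal sketch under those assumptions, with a toy module and loss standing in for the Megatron stack (these placeholders are not NeMo APIs):

    from typing import Iterator, List, Union
    import torch
    import torch.nn as nn

    def get_forward_output_only_func():
        # New convention: the wrapped function receives an iterator and
        # pulls one micro-batch per call instead of taking the batch.
        def fwd_output_only_func(dataloader_iter: Iterator, model: nn.Module):
            batch = next(dataloader_iter)
            output_tensor = model(batch)

            def id_func(output_tensor):
                return output_tensor, {'logits': output_tensor}

            return output_tensor, id_func

        return fwd_output_only_func

    # Single-batch callers (e.g. text generation) adapt by wrapping the
    # batch in a one-element iterator, as text_generation_strategy.py does:
    batch = torch.randn(2, 8)
    data_iterator = iter([batch])

    def validation_step(model: Union[nn.Module, List[nn.Module]], dataloader_iter, fwd_bwd_step):
        # Interleaved pipeline parallelism keeps one module per virtual
        # stage in a list; switch them all to eval for the forward pass.
        if isinstance(model, list):
            for stage in model:
                stage.eval()
        loss = fwd_bwd_step(dataloader_iter, forward_only=True)
        if isinstance(model, list):
            for stage in model:
                stage.train()
        return loss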
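Patch 30's change to cast_utils.py reduces to a small wrapper pattern: only pay for the fp32 round-trip when autocast is actually live. A sketch following the diff (the `super().__init__()` call is added here for completeness; the multi-argument `CastToFloatAll` variant works the same way through the `cast_all` helper):

    import torch
    import torch.nn as nn

    class CastToFloat(nn.Module):
        """Run the wrapped module in float32 only when autocast is active."""

        def __init__(self, mod: nn.Module):
            super().__init__()
            self.mod = mod

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            if torch.is_autocast_enabled() and x.dtype != torch.float32:
                # Suspend autocast, compute in fp32, then cast back to the
                # incoming dtype so downstream layers are unaffected.
                with torch.cuda.amp.autocast(enabled=False):
                    return self.mod(x.to(torch.float32)).to(x.dtype)
            return self.mod(x)  # fp32 / autocast-off path: no extra casts

    # Usage mirrors add_casts_around_norms: norm = CastToFloat(nn.LayerNorm(512))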
zW43ReigfX-@3cyOzvU8i0E5SlJ5w)on?aF23*It|CsbKHoAb^CYGBIYWW4#}K{(Ix z@GPDSj`wN#FXmNa`qkPUXH@C-ysDfKyk-b9kBd&>!!2@U-!basl15=j$9Q!Pq?MKNqfU2!8pr z41n_k{xFbWpYxx*s#gqRKx5=2z}|$L?$6ufeV}c(QNNbvu!W7*PA=$Hx8mMK#BKyR z0V-*;Q!SqS`1Ff^DKtPa4K9H4$CURAkKH>y?^^+2XQbn8B;SG9X9VN`MBhk+JnE!; z$(B9=4j)PbJ_LTqAb2SsPqPY9bGjdL#t2GStRy964jJcmX&pU&lTHjyFzF?>)4vm3 z2lfkBjB2?`^f{iaL-|t;?i4D;=@!rAp&VJM@lyTq=8#8=g=*g%U1xCsV1b$-9acv7 zgwrfi+O0WF$TNxEv~Sya`E~R&c{9-R(Mv}={sr(2YQ7$z7-;rwKBp5@-g4v;*(G!^ zxq;g(q5@qe3wgz2cQn6zr=o*r_s2ztn1Pp-ftM-jcWB^mVQwdW1NGRpJX-DD2%tD7 zr&^-ZBU4Nhcty-^6j&1n>idm5E6c_WDHJN_o8LeY*SS#{T*>p1m#JQ|&#y_dC_zrs^p#xQ4fl6xsRQJx^pgc>KoADOI1Zecu9E%g}xl|-j;x#P^b_qL7_{tE9ofc;=V`y27nXLn8{RTfe!8LoZ@tfN7?+I%*fRaM0q#)|$@1R6+zBn~~fYlMHeoUu6g z4*Zjm5eK2WnN)$2hH02Y*+fXb;7v$etp_S_>e`~8cx+|`{MSm_2=NLOr~Lw}`h_Vb z@=$ZzfPyq_ZQ{07?TjR7729LQFI|E@NC6P@?&1Mj@|i2$@r(im5X6rn*Hn}r^Y7+f z^*kD+9d;iYj1Ji0r`K{wGdL+Vm=;EEJK!cne9myEdBl^*Y z5;_Oz(H4s8OA$0wtQes&&R3gp-*L+%s3Gh6DtuwXoi8p!#h@~vFB`dS8&pr3G-I@! zLP-gMFLRfQ&9sk9R8K7LfjP<-1wCiuT9eL(6^7a#= z4I~tV)q9qJvNb9%Pq@RZub(?*- z1aJ-wt4Z&tEh}%^9%4Z~UZ&y|>K9fj=C6R9XBf!j#jZ*(A>L_H-S4XXLtpl#$FEmA znU01&*_bI%-%UPq9-bGThcEU1seOV(fT72@MO)rNHauMXOP21hXlyw`O@#?Fiokv4 zeT>2xuc7DKLH<{fX@ZGksNk7=D}bF5Gc~&pdee9CWCp5)%`Vm=!8bdJ$g? zpI;kh11Ut!r6bW#BwUvH2mVx$c2Yz|!4o{V>V1nmS`wXE{Sq9QDqOTojE9-lTctsC3Bi*9_YJm&;W>8`mYGT1rLRv8xbO}q)n-x z-roy0r0_H$i8+c{DPr32_vZN@PdYBrLWVca6X$wzKV*+(UVYOob!1d4W4oyo#rI#+ zkU&yujf}BdMDzt6Ra}q3X{2o=&oz1gds;WCg{}{ zfZ8pm^R_0p&gMsvX)2#bSFswG(esyaI@VO;4>9muM~KP(AR;D9LecGd!c)*Zu1!8+ z_i(>dK)`cirNZ+V2*ziWdyAK>%$oyh48s7n7B&3#OXeNWoD|GjhD&GSl8MdmTwHFI z!r^78UO)i$#Dmq4_dbcs^8TX9%}tl;qfEGYv01fdVOS5=m9iz9*XGxm8_t*191| zrmvs!8seNRw8Pay#Cid@X&dQvO0C)N@F$LUH)pul=nkSl<}(MhWLJ#>vp3w=2dEfy z8#qBSt=}amd^%ys2}p+gHF9L|fv;`+!T9>C&&Jxe5vc7)N`{fF7!BfUym={ncU}48 z(_`VhxoQWIHHXb*WX2Fm5_BqFR>)IU#5sy7sU6`Y(PXS!91iHL*{DC?c+=q7RuJVT ze>S^;HT45#bOKKW(GdwIYtiQ+pH6Xpgq~S;_0OUL`YcTx2OB1R6N%LcY8qjn{1g+AaMWXpgTJ&s82qQ)fw`gh) zoSgXqBBOvWFISUol6LXPLZy9edy`(Mmg)&cpY?F`CurN{AZW}x zkkH|WHd;MB0r1I$h~Nv#*~7;1x{^?a)R7iXcNs;+4;5AHoauVcs`mH%ayj`)1l>5lUw6xc<_9g( z925r$2LT~`%-n)R-{U}n&vsr!Qw4M71J0O_H}_{Y3zY1*G2Fa12h{k!IhON2hp0E> z-y-7omqz<~Wo2cvFrV-Z=zf@45u$OzLISa9g+glAR*l#-OXK_^ZUMgpL-sj^M|;%D`ri+ z^Uh4h_Y7JYx%k%y4St(L$yHw1eflCK#t+zWyzcHx6K z{VAeykW&P2U*i#;zDRZGmiqM=*)qAado$VQD#WPUKp)4dMPqF^JZ|)C^g&hoWl^Q# z5|yucjPUQRTboywlR+9_3hn$$xV(--q%kIVEO)U=oWnezQ8>&$i9NqAuVnz##l~9< z2M$UxiZm>5LK&XWp4kGY(m?n!jSN%Ep= zdAkAWF{pL~kwEYL(JJkGLxZ~))}6)YYu+rdx7{}gR*~ozhT-QX8?JU?F!sEw^_^|@ zbJc&#^k4fS!iFC{cvLLaRszhQuWfMqtqs5=)2>0)FYt7G-dmx(3*&c={u%9vv0jZk zj`e;&ck8t86@PHl`lpxQN^`yS&y)h#KB9B~#8R0-`kh8i6k(;G&NV4dZzh_K|@$_4;g| zj+T8?u(!vo?eA)~+gzl*wXI;l8d-S1^l}YM6n+)Yto<<_%4@#G2B}LPEoy5$*PUFX zTiIE61{yjz>qSctw5k6UT&ZC}dS+GmN}rFeY|Fvs{v{tw*iQ?N7#zvAf9k~=gekiW z6w)Nu>OTV&9x)WDP4wK4d*sNGPxq!jeeV90>rtgi@mXgi>GTR%4Ps}FRKZxY;gmwr zz#&|xWWYRskwUDq=3ouux4<9!Fb3q22t&=}tr!JA#Mx?4M+%~_v^wIdh%mP9?77Zr z6xIPWb~y`gg*m#U7IG5-%do)KGmOcp4#k#gmVxWjtydLH5E>o*H0 zQxK;FcD={pfrsEP52{;`FVMk z`(uYq|H`;1=xJWV!k0T?uoJPr_ofq^;p^PQ+ftsYFj_;L)D4N_azZ!nh z0s7;8tQZ0;o)3Y6vzV^`dyXp4d)HdZX}-gPpKY(x_Dd1be>=>!zxn63JIU}4r{eWt zi>lQ>0{A)Ih`k^C?xiknq?8TzA;j%@)N9=waHDfy4i^{RiKw>|PT>hz8#e~q0FIXV#MmIJBPL5s7(Hfx1eZvhM}?WIe1$94=m zZG@HSnfiS0nh)#sDwW@%_r2Fx)Q#80tl7bVGI(i3cv>b(ViXLJnba?mSdvQ*o~dk? zEWQP7>;n|O-XSnDs%LuM=%fN_Hos~C&Bmqf3!5`vOu?T^KTP4HJ+!5A^HGoy6e{0J zv&s3mDZgkm4Uc&ylL0jj3`-_3L4W=R*K#_?YC2m6 zlbGrG5B!V!H8-EHbN7FH@?aNrMedFyW{h6CKAW#5#w-_L7HKJP*cukQ1?(I_(?ba? 
z$<{X76%@E&pN`*b4@PgbPh#$`jTc^Xq+XGe4AP7oeY5#6$wod_b7{zxlo!b%qS~g7 zHQ^T07`d(A^^frUSzNV9s(}UB1aW*QEddBt?A1TQxa%m`lYqC6+4lamEq?7O)Je*I#9Kjz~UcYEpfue6?sWOhdMbFFda`#BKKbHEJgsQAz&NI(hly;ZlJ%+pqO-rg?`5YcuO+xgCoz=DXpgEm1rX2!41# zb};3h0>w4{G|=*LuN!dKt6oL^13agW2WOei1a1u)U-Ocv_P=?M=gJc}m6eya+$^2|D6ZjA~$@N%l zn?XD%1J@u-QUQ2OFrplzVVfYYb?oV$l?(u9v|+5hXUd< z*wAS~W6fpPu2SbiMnd0}6IaQrT z``mFPUzV-sw9j8=9V!3(XaVY!0KgCif~7<>#VtNfLk}L0%LOOJc$LK#7cP)TmtV>3 zjP?w62(xtm03@6wwij2l(r!5i+Li4n{h$ua1P+JTq2PFHgPi$>2 zsz@{=fIz7u;7Sz|Ju(;G)h7YOcss1GkKOEUpsKXPXKN z!CZwAQ4## zWY>t1x|b3TgbI9i-c%;0S}mecuR&PZY1rb-VHbjs{EUmyT#>&ca!A6*@4-CJU^Ge` zjSvI)v_TGhjcoi*2a2xnI9siBtVe?_Rjh3YYS13|ExjIi52**|u$# zyEOoYkp|h{59U^J4sHi{6tQKWDJ%R{8t7@@&iDT&@X>!4QolH)d9G$W?T5#xS!VqS zS7(1_DT$BFTOH#zD4gn!6h#VBheRMYDJ~x2E4m7BF>~BDu0@kD23fMPADt`&x5Tgp z{y+P|5R&hSTVM$;>$jf^ za628$S8NJ`RPOB}zoMw5a^UQ054!1=JHgszsG2N5-=7PRqiEpbQdv+SceB7@m;8Dx zm*&n=h8&pux{~nr639bxSZaGj95_kZ0JM*{Dey|K5xHa${86Nl8P_(x6P6ejY`IWM zk)UW=J!jbh;x7Zx%$LJVvY>Qk%r`+R6*2=4E9`HN35)RNCs#m*mZIfMMW&4>6Ok1i zr^*Em?3H?FOs7r!09LOJ3&T9&YB*m)l|`!katACUAYE9*>t=}hMCCUE{@?2Y0^iz3 zk8(c@g!)h<45A;?7?b#7anfNgu(r83B&i3Kt5NWA0N0%It`r*^xX?3H6hz98$M{cB zXg8k}j4}${+aFQ;d-1ILasaiRnlm;r!8&wJ7Ek+nVk`YW#K%YRw)2|{d)D&Fu$$!C zPs7++WUTWzMScmG(3u^Qb9^{+Y3V?r7kBd&{jd{%B~SlW+lr&21jF2IvKWr<|E*`b zgn5+xaq!WPDJ@8K;>V$OgY-N&mk;kS9v%VU%Ijst{_c4k1RYhJ-d{8nQb=TEC%_zP zLY9q%{+u`nFnBY6|9(4hXt=nBx(;xGkG@dt-Ng(Z_m6=fyO55pbT36vtck++dQVNl zC}fh=0D}lW`Vx-JeQ4olkb+CsE?u}oAPg9nBDyt{ zoR99oH+j_JTw6+YGqO&SN;FqFtE4&rCP@+WQ_h;lB{AC0CTw{pEbHh8-lQRu-rKChT8`GO3B z%nG;E=8=7Myf!hB|E_c~3&087Uv|}sEl3(cVF`H3@k3`IH>>0WhiIKgocQ@*o_`>L zjl9+CQFzqS>t=^;_fIJ?#tU}u&Tp9H=AOpL0zROUa6E%#++Gjv*A7=l9~7!l@tNNM+nx_ZQwj9L;Neuftz(h?nJo3f~$~mXM(ld;*eO&#~#b z*qK^gAH`z_Qn=}c%$w!>SL{+m>5-NDGBf>eu|fQa2yl%-%V%E?=4CxSJvAoyP5=`N zO5BImZh8x%VAEs(v_-i9;72)xq(9;E0YXlMHSlp_dOG1sh6ZgP%$`>T^>G{TKsnk* z@z}no2P{g7L8}gahP!&Rpvws|ED?hW%@^!DiuXEjz`r8MIs<8%N<%tiqi-CK*?@05DRlGEn~gZ zmI~69g@y4_!=N^f9#9RFoDuXSyv+n%HoqG}j@}lmUiGix6*e8;+Pz-PfS+H*?=0SP4kdX z;j6rZk50fijg;myLOycxK`hw7ZY1Hi2glE2sb=&kPzO{rAUO9IRykgRJV27h@e*Ac z?S@Iw;yQD^jrHHUp^C(ho?-*l!-8BzzvWJ=wn3H^rCZ4LEb4JH2-z9z!8ZWLzu*`Q zN)89pc@z{g09<~8$?$~>EI|QCqcYfm*Y%0iqS?*ewL>%+wKTdbpr$b+NEx%cgy6Ht z0nKVx^wog(cUCRn2gQOy&ivh@3-Ojao!MI#knr{Kn`LwpaS2-%?I1=zLdC()jW3pT zW^1c*oSu06b2O;n`7^&H5F`&?fb!3-qm@1~<*w)~amZ(ASHG@ba4dk>LQiFU#0}Tt zRSxP!vUFS8Pr=(jbJQ+V$pHR5U|+taeFHL2F+IUS$kLd^q(nf_qYfba_QtENh87cf z7%D=GYR(`Pck<#C+q`x#iKiHn0iZB7x}TVE1`^94qxCWV42XP^Ctz)AF)x5;m-TW8 z2M#=%d{DIzT=w}9#E>kV(k@`SJ9OkkhYR4n4OdsIk8gcGZh9bQ*%I=kQ>qNs;yXboo5u>$y@iX$9EX}kRgaW2mB$q9=m=-Z%th8rEAC}yHl zOYjOv2%;VGC_pMCgT?`?78Ml~eBo5*T!eK7!+ly_X`i+J5o9Jci?}5(3tD(xiDv|p zy<6TZT{}C!B^Vp|wUVtuv~z~Zw}sN67bLKx7K;G8hJ3`q+*1B+q0Qs=7nTv{QoC2e z<(8oHk?5k|^?E-}2!v8Ril~n(7A)kY!O0+%8GyQY|E&p0fG3s(j&qQ!u|@9(X{cP< z!ujzrWm@W5HWJ|J(H;`lY7joYH&#Z=r|r0H*79c6Yf$-ZL6#z* zCq#jv7DVIJ-nZY7eYnhqM;BY&HZ~fp3x16mX*%2D9%31ffmRFW6z4$3F;3oY|C=t~ zaD~Ik+D$&@=&MxUqS!F-~#MfuzgXh)oqB^8?YR}`D=awAc=c8#1D ztW;ITB{f}208MF-a2piy1h=LO4x&TP2_4q+xR)!pB}-!e>I$a^VBkL1rhQCWCtD+I zm@7JX3$w5EctO7ul>OhB<9ho(b5~v@~MDlFGWIb zIb!pbu1~%a4|d?z9WQ}Vow4$ALjSUu!Q&?$!hCecPrqjY^d6_1;oESJLTl|#%yJ$c z9#ZaaT$`K@S=0ZP)c8E&pPt+=e+8`LoCwm_1NFgpO!4lDH6N3`AJYOrEZUIleFLQc zc6D~|m{A0e)qR2`c=7r(ZnXOq$}6}dJi z0`giSc-xOJumY^xd}s&AjhiSJku>Jk2m{*o$DL1s=!CJ`?;CZ7-|RL9M~lZE_Eo0? 
z-asC34lO>7u+1=+Vp(y5e;oY01e1*{$9=@ATSC3bD2pH;e)enyMG*1G2{%av*E=bs z3k!YO9u*V*EToe2CaL6rRZ`m#Si9M%@2l8!ip&?DItMQ5C(D5Dxh#xug_r`2-uUPX z;v?nwgQ9w(hIfZ&^=H6cJWeKO3yf*;Cx=Bi0^E(uok6JRkb4^h^7UMYs zoSKdPL|>oG-o3G(!e?+0i(=k}atyw)muQ+taW~~;TYz$vUy?nM zqJRdc1{&Oc&&~{hGhnI@>dK1HduCCW$1Tt{M@n$Sx?+VAKjWuOhU7dh891YQstQoJ zpY+O>tW>X2W#SX>UN-YW&n^na|A_G$2zsoNBTM|U>Fu7_q~R}EIdkQIX#20g!rIb6 zV!%!fjuJBl60=)=hkF$iE&wN$9#D#?82Lu9QeQijF^kU1<<7-{Cbsw)g2hnN1Ow-a zT#>77vUAYru*EF}oHtrYhOm3{QSck^!%DeqPpM<#EbMxA(zdNttSg(s`w|AO#Yl3J`24DA8WW@OAV`l*yhv_lqLq@u3`2ZTLF^bMM_ z!RCOtk{$ES)t&-=kRVt8)=FWjvbf_P!o=~bE@RNE$)(1Rw0nnKYs_iIQ zByAU(q`;1{-=*Ir3>Ck=1__7$z7~2rF{~`WpN{FFJG@_afW_;p#vWK40eC;t*i&J( zXayFu-~*Bi%DzVDp`KP1J+wabKXEvuQ8In6;d=fMuN@1d0fD$>LtjmtIzu@@zYc0g*n{nGeuXKug z2cqCe&%;9{2lAQL2aiePS5_gH7228( ze7K1m$|tEZrS5QBkGsz?Y;(sN3MJ&q`$5|?$nAqj201c+S}5E97V1n3jZQ;a(*eGN zg$_biw)n+Ps@x7ZsLw%+wY-lULa_k)_~dN>))-zEH*n;WxS`}$L1zSj$Fi(xe1q1jb1zIq-p_>0z;jl zp@g6x^lR1twfpI7fR_o9(jUPbOn=6R(to_s5xyV(Z}Q z#dwhRIH>iv8h^ThLq@YqZ|tld&>5=eO96uTpCnxZa=hi8vls|2ad~{bWw72`)w^%fp^bv#l;E~w zPHvMO8&H2MDJTei3d=)aklWvv@duud5BeEFICK$OP03tliddM?V|0V6I?ogk2Y_%% ztb-c)$R(Xw`1v%S^uNV*69N4ua&}~Z&u0Rw)8cHWAPChe;O7E`M8s1sXdIC2f`7M^ zr-6{weAHVmmY#AIC^HB`eSf&cp})I0e*^;OvCK19J;SeUTwY z)^MOXRIHf@pn(?en1e$&yT-t=O=2>;0;kRjj54nS8hQEqzw6K+UhwJ(y*RJPh= zFz>%(tA!rh7RLVsERXtVwFj;8^7L$HBIyE(tm9{F48%=zQ$HOV#j+0EUA;~sEeLlD zHa9`G0zZozYNwxM-Zvd8tYQDfy<#2QPbwCn;O2!7&;vmI24mB5D!7e7puz~_Lyn*o z#F!O}ShNc47acmu_4PHwlM+MVf*u%bvtAJE;;Ya_Pj_lp?AMlhMKSel$X)Po6cBDW z+r<_v;5Dc4WfqnIDCcC4?oh<{KN|37OgQi)t4u=kenLZ?>Ky0704_E5~cy&j`r@~#xo~EAt$Js?r24>P^OlOt*WH1E!L?m z0$4@>^^+hrK!=<ac&nwuvnBl>%E_sf?gCcqXdP?BQJK34mAU&1(u!5*@_v3N{ zDe^Yu(f;38m1kc;H@GA!fGtHMnx_hU;XZzP(BKrjX??&#L$$|2q+t~U+T>7<#8xT5 z{ujmYfOFQw>AMG|T~Hz|3hd3vhI};Fr}>7eE9>zb6wq8xtQ#s4Pr~VnQUcT-J_w*$ zr*MH)2?k1?OWN;==l>pz79|S%tjvSFO$()#YI`35KU*~iU^i7T8y-SW78`uadjucA zW)0RJ2XEA>h#8;?kX$G|?Ip&OJH;LYZY-KxnE;tj6iTuEc(?^5NV5O`?_NMRHimEk z;snVZK*pCm2T5yuqzC|9(m!bz7}#tOuM$aNpL=^N!U2YJ6yi3}kleI0zlGUaZ2Arv;oc$q#=`3 zpf4Pb8$|K{SDT0eJFHP|z!EwRjJ$ZT?FMe z1G|blv;}%TZnT0vhutH>e2^m=IH67M3G*MZ#}mrH`4TJ)Wjzc9y$GK0Zuv*;tVv-) zi%id=!5D&p04_W{6mpp+DB zW0pve1i9h04<~%}c=08hubVyI6P1F zV0dtM6af^qa7fCZ6ktYF>dAz;$q?ovMBj*jeyrh-1J0=s6W9UFcRd@w{xP{ZtRLs5 zj6Zvhu!nVZb_^eYM*hhn5p>E`4K|gDa1;7%V08C7@le#O4hi5(ux9@(X35|cbY$NxKK<+nV@ZXNCh6PZ@83}G-}XB5-@OYy#uy?%Sv$j#`e&Qyjc~s3nxG5JE};5m z_IkJ|;eZS9gtO!zijfaJtAD`{ZeH}BIRJU8N(>p8Egpo)kCRUyTkk%V5l^koEi)vF@IU*>mBHQqkYpp-S?lTb%7!xz8>hT3iGj-Vd;<|y_$NX?(@npfCLBJv!>yWvYPXM_m9WxO$7 zRK76TDgu{MaJ&Q$_H0e)wA)hJJ0TGcRq@LRuqGOlQ@6S)=8PfU-dgd4@peu&Kb})T zfB7C2ZlI)~`04O5n`O|HO&G8)0*M}b#Gs!Y#!&*(9F~h{rH1SAXb-DA zy&&uauu?(x#d^n2`@Z|1`vUa$ zzYl&0{QrIsxPu|cgl|QGmxQ6CA|;+I0E>nUs4hi<^G4r0+}nP1UipLV*d{* z7`5|nMPf~?V-3W+O_vY}@V~3fbMUcGa$xZa&xQxVOz~Zlrb;Fh7*J;l+l9n9`tHVb?G6fD314G9}X(diF9puk6O zjtw>hz!XNMhTLc3lPupb!hvV-e}K}(F;8UW!xv^}*uc1e!)T6}kNgVOwsS=~U!5r2 z;+0=~4(b`QMn7~a%~Y(=TQ`7?KN@-=hu?otNrz+=x6E=0H8>@+rU|hoB{J%p$|VVg zBfBMUoo>CWvF&pm+ke5arWDcA2EuKAC{k@iGKzYOAgCYoY2MX)vGw*ndFNyPNj0#v zqqrbng39Ir?Yk$7-Iq@vO})!#d-1mZU_Rf7jy1&8v_Cd!JWr9ab8++fY=^6Kp+VE} zwc_3jHK6&h0{ck7c855g57&W?jN0~;dSxMoIv-&-OVCIKxHkqgV+S;?-gxE~DK#8n zrIa#u50>f*e+xXbq@5ef>|?haZ7D`}OWNA&?}}x&vk$?eOPeYa^1Askfd7VS+R}qws(r=QNR*wE zp5&s96$Kb=Z>2B%>nVGMJVwh@r;^M{z1=f7ebQmT*91)Gad|@_^zB&kk$ggjChUGI zcc=vMT91XRCyIHvl#3&CF&Y<|55I1^U;=YLo@eIm%q8RO z2ic!!q^x|QPU2r6-su69>A~x-i<9=YI9Wmhud?0}s?3tC9xk=x9FpE09pfxC-bQ`h zf3285tcMci^=7GVve|6{=GZji2#Hc>E5V@qqnaHg3l-(ERXnKaaK>(sQgXrZjyYhv 
z+lyBz&9{eicW80{Ux96X@7Zyen{osOQR%#0@M`xdmYwNki`QkK^K8*-mQ70A&t$kn_^^Y5a*fur2T>e@a|J>sQo`Z8q9xRWrQX{TTP0dvY~E_u=G zihiI{ESuO&LsdMzfoga!?Un5k4x6sJUdfNKpaf#K7zfFqlj^hniG-}%4_yzE99c4W z|5s9%t(idkQ(A(~jW9^oe1L>oy~@^R9Cz!^)!)xWOC_N%pZi47E(bv{X5&{6T!CuZ zr44YW1Qqsb4L3gy6jwlm((E$uhSyQMJD!}G3i}Y$anT^`_P}Aj4$ifAF8fOQo7acOqVZh z-wZXMRML-S6M7l&KP^T%TQW-$Wli9W83ToKK=9LI6CImF1i`Dk)G;UF;PhPUjmz0n z%~&cI@sJmkEvIpf+Wk?uS&82_a|mrT8woiac!15-TgzI6@vCBrWGvq2J1gPYXi1X~$QsMXb@RynG%=c3z+w&F$ zRjjOt%oFEkD=DV)iE6T?=6aRu=ZwUN(S4nd;NGcLnM2nuH0|QW0P~GY{7d;5?U<_7vCJiEpHURnk+E!ijHTHkR$rjo{IeUw8Wr28K3p3KD&J=%}NE3FGRrq@&(&e>3N85Mf$bWq-CWq!Zu(apv&b9~8S zvuhknZ6&~{PendetlgWP4g!&rispS0fDZ{cN~4b15XY;evp#&&07WEueooxRpB7rg zYX^<-?2PhQ6S`&fESGXfJe(gjx|!0Dv8cxKx}0*lTJIk_9Oir$lFjcb4yt}BpUgG& z1`#HE67xmh$!XTcNkusITC$G3eZDsqt4OwBR{u#cjPw=kIXV9zrIFIBGJ^~Q(tlVE zXumnK1r!971|vLpRv*&~sm*~AZ zjx&(twm3~*a4;hmx{74PJFXYzH@?$jRLTiE6VJx6F=+lZOzKIKJM<&9S5bFGBd{87 zKb8?RC_qJO$pd-DA}@1%wk5eQ$rsp&TDU?jZR-BB4%fRZa_Pl^LAuq>DKywdO8s*C z+xxG*x_smU-(XRXW+&?gms{3<81&p7cSxbJuNkrYLe@i0_Z2-!;K8ZLV79HmvDfk- z^^r{OV=4Lmtd~k->QM1JUp{WPIhb!@VyW%UewMs93Y(_YtB>#AOQ-&w%Jpg0vFd&t z9rIF6|EUVc#G3Er?5>1gPx5(_L>WIV)svXIm4uII`~J!~U3ei`x!f>$PRQxvz$=no zHt-~?P09;1+ddafMR(^#5~4i3q4FJx3PZv5KCIUejBC^Ncg^ zck!I5Qy=5SW7{$R*|KoTR!!D??g|!7)Mc`|;6qlfj)cth0qb_|h3Sf! zB;M6~b%?@Zz>R5RSd1qcFYb6O(p50s(SI_oVC7_~SMJNa7+VAQZ>oOy9zjy&*=3)? z$GJ`1-&NJ>{ZXbA@amVgT(ZG!btOGAscAK{==<}mD|Z( z7&F?~B@bs(w(gV<*eI{VH9G1*=MLjld=6=pF_5lM`fiE1Gq+KlwNy)7pI_G67sT$y z9AG}cow(gQbxMUvo`|hFr+b}V(-)KApqSVkV86*kHU90=My5EIw#c&Y%cS*a5~uO7 zY`>~m#_hOgcj%VlXkgqacklOxw~2D&y2Big1)*r|z@?H+oX^b zvXK<32FmQF_k1v+Jhs`KTEO9;Z^@cp)cPO#^ zqEY`Kzf)Pdv}aqRq-(Ka*E($xmH5eL(tVxDta*MerCvt{V`aC!bt#v|k~wGWghIn& z=Am>REq%Q!Qmib&HO(q}UFCQoSsc0yPa$2U0}?`d{Uqa)u+_@%1VIUf!}>L1-<`Z` zY_i)e{{_BOMm@q_nwKKBlq5Yw*_iEubE#IkD6!bs7ZfxSu)ibGsr{6Y7b0uqs?;1` zyUw>*?z%B&GdtpPNh(NtqDVOXJ%%QNt+NGP$0c5XlX_tAGcosJu}l03r?G)fDw~t( zKr{;}AY8@&=<6+gX8tE-YBg<7Hjb*28P}z-GPfExOEyQSLl9>u;7$oB))9|eRet~QZ z7^)fMT_5aQxkP@L5*P}Ag&IU>FtXYNT66D3B6j-@CFvebJKs30A5sPXt-Kyhh0!D7(8$pY681QW zD#CjQC{v!0%aK?XOpMeDQEGdsto4;@vuCm*+|J}ht6D#*Me7~o#noiD?n^zsjYMJS zGT05A&wiJgtTCkwe{rtHGz`faC|I*J1^)YubRRee`BFXhq#}*83AOkCodYp%1}Wa@ z5DVLc`~4;{wq*OSoug!O+3-){xQH< ziy=)Qd2dU5sl_uq_|kzfB4BHrpW66mUUy-+(L;*N!tEVpRs(6|Cu_8h9xNj%*MM!m zZ|@jP{tS#W7LfD4Y`|g`PWVlz$V>znP$3FBm=ufOiumw1nMYYtb0O;EE?)nZRBr|Y zOJn&qZjQ#d5;JweT#N9`kd;j)7f!)Yv9vX55Mx`@Y#%aEujf|nK2uAB>oH&53x_Sx z!pUl>w|kh&=9R+N?BudJXWIPpJ58hDPt;As55i@#c&t;?h+Nrhw@`BfxTg0D96OYy zwKN5qL9D5-0(zexFzyb<7gH#twkx#UFa8$j$C|^wC=a(2~K#a+oVuTFb-p} zRJb~Gap+!Qa0}J5L9#CT;=>Uz2+s*v3@X*BFXXN^GHtB#5a_KHf=(8DC!P_^;cDxlh{#vt z7ZfK7*{HNr-E-A?dAn&zhb6Hmc>cT0589;7FneC*)0{3z;&z5Q7Ce!l)OV0AHQR|; zHE|F=0q+rCrq}$v=+!pmz*Lz*h_gjiu2GkG%b}WGOYGI3fG<^3+O)%Swf3oHHDA{Z zqPxGG3Qe-|`J3Mw%8acOKKzYvCfvKGmWMx5ulKvR&R{gGd`Tq$%Dz<^X<^%*{qZ-i zDTv+~=#SF~=VHJ+v9?oOLh4k)?&Fe4YrLR&M!CC~jY7Gbw+297mqcn^lG+Vtlmndp zT}It@4|>vSds!X0V*Cuf2*e(7i$s%E;|CYcx^THrX7@-F z-I2d3b~uEzn7XQ^2AtXRrnGHtk0Lp_#ig+s?o-r~E=c7?+_#sTQ)rFoBi&WbazFSQ z^H_C#Hfuvi8&zc_Pehn5twVP!thVOb6Y=*uTQbAuUU0DIWbpjBI_($(^hw;&{xmOVH< zR=WEmv-#Az$5&33O*cKE+VRaW z;w@zoT~&qDR@BNKgB}I4TjE-qOpld9nV=JNgOJ~GI{?YCR~N(?%wm$N@|Tev*E^)D3oHU&Q#b|svtuAQK!xI+naLjTL1sEcY#mNn!Dp=Kz*pU(;f;X zn<9rcYoIlJpz$vtex6j+56NPDaj;&CnU=&Cp%ZKlNLe%_uVnUY~S8 z16zpE2Tev>@l`J6+2gj+FK*?7nzWInOpif{Y@|M2S$q~D+_8KXMehxhtor>TixQh} zDzxBCi%aDM5A{n`d6TbTdv8Ui_uBGk1~sk39TygfPY8>W!ZhF{a>>c85{B>@)=W#X ze5D3MD(IZStNnm0)Q-9MQqoT#>yG(el`}Sb?_EvFND@S-&Wv?qAnhk-@yp*DhO^rd ztIQZD^v88&Ep(Mq-zcdXl^0I9&2B`oOe4uCEgN4={`p;=_fg&3?70h)vYoYB&}|kU 
z{@2Rycyz)zaioW6t!^&a?&>EqO()fkXXK~G%D>rguB5y-Vob6F3TQEfi#FHW5S!9- zTby_#4w6>22jpFB4bKcIdpv4iN*)@_(Whrxh6Z3SwcNkIb|#>0@tj3qG;E5(J_`a) zq8ldlllhGn=l*o(Kj|<^(=2jUiZZs}M?P0$T>RcfyDxG$MKA}!V(%3=hrBd{}HFQ7)}=@}Dzrtl18(X0;gcP3@91ExoR< zjefg7@~OdKd;AR>7B-5aYgigqt-F{~yM@{A%*11WPnxO5g}emj;X-tEmHNs;emd4+ zrg};)WP^6cIWQGv)LZ)dfsiSY#bE5-Tnil6 z+51deg`l>E=D|E#-Hn3)Ii76zZzJ?#I`xi%>hx>vQsOi-NApb%=`m(SG@!tBK=PZ@ zF#d;&ZLBO+GnVNTzX7f|L8HRyV{7`6afY&1V!cUTBc7%BfMhQ zDGbB$v*Ov1o4%Nf8U^08*Zi4SHYQ{m2ME!6$sM!D?Pv01GMyf6RAmE|`{RBw{gLsq z_Q>qx?*xqM+6(o}*b9^JKpDbcJ`RD=q7%nxMG%EpngGFM?UJ|Zj4D+z3#Sx>(YcUt zGHQGqetR}0S3D0@%lJPh<4ds?oJcx~{T)c&s3svVBk)%|5}3WG6eA(ExV&w&j4JlT zeGvu!H0}f^8jTN(Y2RF%sI1#?!$;>yiiV)hF0;@n?Lw$Ij=_AhLya$hPMDpuFTN3Q;g z{n>;iXcbopn{i}!g2q(Sau|bH@${}c^X#vwcq2)br%SJv+8^I|De9IW6gnQ?)woPa zVi{2F*P2uo@ENQQ$m@sOOQVeivfN;i8)ft1PcfH~+Iep1jvaW=Nk&gR!&mw9`$4yq zIe?zcugy-w%FYnC?nn!D!^A;GC7iK`#lF!AB0f3P_ihg6QAG1=<`Xix;bAxUKV1J~{_EZsK#1yONH zv!W2CF23c|rRr@lzP!xkV>4|QFKJ6Q$JcC`ps!Zy;TMM|URpr|DQ8+CjQE9XfsQz} z$BWvF5>~NXIzeA83=*FoEKnuAGMz(Z)GKL0{&$QXUvgV&$f0Rm%Uxy6AX&Y|D%mov z8tDg_l0b10rMN;VNYAY+DMj*P@;!VmDF3TEAcUB|fO;b~VyFG(AYm zG{uJ%{j0*way7f7>SZdZ9Jsl8Fi8GywW`Ktl8=iQhyS|%>GJe`Izt`{scjm)8Jbg< z-F4&Jt;bboR(XAXBqAJfIebEPSj zCwgl9S4=MCoaKj~b9p}5Gs>cGF%N+x`NdoEmF-;p z&G$LAH?;M0-zt@7cKK(*6D$Vha&SGopdo*iJy|?W9~mQ3Q4Cfqm73*vvHrnMcaeNR zW{7WW&{sV0iZd8#l^8GyF4I$c6L4c z;h1^h-^L~@fy2YxTI5|dpY&YBdq&NF2*nX_WL3yY#*XJ?4>8igbI8X zrp+nmswMR2YI`187IXNM>76{=JCZVG$rtlkN)2{x1^vR`RB|x%hy^jylo(cWPYV!q zCxwEPWaZjQ7-R##0yf=j#IQen#f;$)t47e1wjw^S-ypnf;i|Gc{j^G%kdls91*99KL8QAIzS;P`=bZ05zdzx-F7(pPe%7;M*37I~Gxxm_tY^qgwY*5C zT*DcG2db2vX#dk)Q~Q81iFpoQM4GWGM@qsy-N~ zdTrwVp*@)&=D`BD{!Ax#4rGeoRcM$jRTTFU;)6w@*FBNOzbzS9vmY}lqLw!z2A2># z90fv>`zI{rq=Kf{cQ=`s0z~|8+)?F6-o}yl=|Nm5m=t07&glGiL(h!vf|zb$in?sA zpi0rKp|Nk}H2`|+PWyQxdHrt>oMm15)8@{{c!C6O>CFyZRj!wy?UkrA%nU&Fas)T4 z0Vht3s<{*2A&EVg1#Sk_p$F1x#?Lh!j*H|+D}O&n#+m7Ez`gN{Jt2RMrr#V$pORI1 z)UP@aRCzkRxkjBl+hp&@itOiPibvk9Vn-ObR5C;ADFKgMU`Kzv{l4enQhFsse&pAh z*P>sA6**XFr^jbL33Mh=_iKB1Ugs|LWa>QF6_1)~Jv|dNlExF$(R~M#!K#Ki^EJP! zhH=ZLPIV{w(&vP*7`kVWS&@YGrW1%_2Yzk6RrTjE83~eEqET({=;PovJZvPqZ#o~jId+O@%&j?aJYy7QqsV44fhY0ND zFRw|PihExFlWrfAW&&ZtfjT8BJA92wGuA=NA;}-=yrIeuG`4D8F1+NDszi`cAzuM2 zR6ZFeK{F6Oy~nLhkWVga?h8tAMGB_a|Gb>=g|caUUr!kWWQ2Y4+Eq3 zG-6QeLRYYlrbDch_qbt>H6ZbLW(ix>rqk%Ne~UHTYVNwUIKO$J{-Yo&X2B0T+w%E` zOogPc6X*)Fs{CSZtG!fWMOu8a!Fh+oF4O8Q`kGyBX^Ki8gBuf<(v8}Fxmz=ASTU*B zqhsGI;2-JUOqLa>_{f)PJSx#B$rDp;b38tf3T&<=*A1QtdN!GrP;xEik+~c zKM7t8k$uCaVA%Y-yS$%9;zCkT-9ILUgu!>W)ZQE|lPZ_?TK*mj@1bSZ7Oabh9DK&# zt4-AA zHIq!E$!8a_gIdgXU-N9w=Cu{x>By87=`>`%kP5`aZTE}&MIJe-&dem>p+~R(EJY+q z{P6z2i3u~5@2q>)gU?ICSdqr~gk583{Tw!q0g9D}&Un`={CuK73*7K2@JJ`kkXgS*0&&94d7(32T|! 
z=QV>ol3B&HzY_QU17gJ8kXf86;yz~t$CTx~&&TRNp5P=?t7;XcD&<bUEw|NJ}U6 zUA&`ifn#T4k@B2cjgUF|>vfJSCsnU?(gl4V2O1WFtI-|BCgIun1rkLTR4;QVI`avw-n)F7pCg-OUv9s`o9 z&@+oeqEwJ~>7*2f{6(cAQKr21Mg--STB^z&EtNM(c0wm?7+F#Tr5ehMWN|i58t#^t zyDAqU<^%>}#U8VytB|nVou`t#6X+&gYdx2ltxZiOHh#USxM@;_m-+VnnKXO8^pQ{2 zI@MNtwf+{6+re&pme6@z^pab$L7C-2Or>>F;84mZJ^KC+Jil1&->PZwSO)l)mdP{X zJqU#$?6-I(yqzbSOn=R@_hsQdxK=UvCP-Oh6jOgu zD*i^tAfIcJwr#C^@1Z_!Tf48r8h_!P)E^&yx0;9GtC-TFiYSwpECU#SK$AHn{!AF; zlwsS&@~27XGlgQko>6O1Cvak+-T0tP(k)=|f<5J&q*uwGRW6W}R)8t&hurSH;;7HR z^VIyUQIehh5I!xR)z9QVz7YNpk)5EDqA31);KA&B02rRq?1LK@F?akfrFo0g z)hetHTuoOg^1{BjL)Jvswc>eXCC;iN8l*7FJPgH}G3+ZMC6;#KCT??~j2_xvolShV zvNSw-+m~y$3~fude?&Bp9!`kTs#b%HVpUhV`L=InjBb_mwa&VyE7RlUr!O_muA#+;URPIZc2$V|CW==P&y1}3jpHC&S`~9S1 zK>dmPk7nnG+GUH6KO7kPMHpN?8t=P)V4-P{`tl?=J05!!&0kG2h`2BKP)k9k(n_S zQQ}fdd28%4w$b<{D4!rXtFKe@V6m-y3?(@eQV=o+1Dh3RP)DLEFGsUha}cqWE_XQ^ z$bO>Jj48XLftLX31DTo*>oA@VaKCgCQ;zy0pNePi9#slsUWl$>;F8(L{+@O<%cXCb z#Jk7A7aX{9MH;=KsvYwTe*BxT|@^KaljA*t^GYm~zIQ zf4sjr$4KyEa~P4QRvM%=1SQDhL58X7LZveg_?g{nUz9CwyhdhmCX7&snWu%{CKp@V zdF;aAn=>N$Q{rS?!Jk2!+hzNaEYTjXcAc%N?xuv)0wY!;LwUU7R>|Opf`L(+5J*mB zPTnIC{LGsq=;jM2)#2hy|wpQmBg_Gto+hb7UgNcj7sqDKj6qpYVXZSz1v{JdS`F-S)wUUa=PHihs zP;6Qb_BYI11alyg@`gLwxs7r1y_S4sF?EY9=|GW>CK%md^5p8Q6SpH+_HS=DwD z)L)ot?z1-zU5(6U`D~z{+KC#4O?++MalhoOi0c39>{MOLz}!J`nadPr@M{`5C9Tl` zsX42oOkwKZA5C0ti42gDE?pBF(6aUe8fFE+%3d>+$Xi0DSohL1X2RuS&Z{O;Kh-Jn3K!rW(ROz9TD8-j_o?b2t(CQO)PrkB52Ph# zzDdd(+O=Kmwu)A}A0@ita18x$)AJFlJsnhNX%F0ZN_iZuXXgL0X-A4Og@4(Ie}9`5 zha=<1v%{bsv_5`38YwyYe!UX4o5r^wx!Udueu>{bl;;F#6zD^Z#5<+C%?%f=&2ElT z%o{Gx;vdwT<-CtwS+?uCp&^SO3lU{#YEQ-%q1fWF{dGeLc zuj8cjjbIJ`arshA!y)nGzRvk_&g@*%aN$&DcK6G+H(|lFTO;WbJoBk#vE3MRTd!z7 zvYHfJ5Ia4$%*PTLsFNgtzgln#Vllq-rr@59V&Q8$2gfkY1OVtw<9}J!iht9t9nH{) z&x|LW;ufb4am_G?FG${oU%|nM^M^y%2lVeBvY^-*1zWsB{MBSfF`IGOMODI8i8ERK z{OK9vLIn}ihmeA)fhYVc2u}N*Tnzv8XDQsCZyPd?9$o0VCx*T9iz6cY&-c4%B2bWJ z^t2D>xQ;^1rwhUE`^c+R_qb=AP*;qxxm!c@nyXCUBR=w15^&3FKytUN6lVXmDQMm9&1n z#uJVUC|JVK9tFnziXmA@5#?m4=#CpMdQ8f1CA>TPE4eC)(FOf~Y!9%Yt3!Xi!bv%( zay@6``Qj33q#hJkZ2a9zv&4%O7ZYYE3_6H;{hI8JzYbaGhhL=^4~qXu?&&MF-44nu zgT$m>PL5HVi)DjW6SsS~*WsCPc!p3?+#>-}W=cimA`wRYU+yIFl?-uMkI6gtMl5XMF8)Ok?6)su<| z%&JhQ|F+2`2gkO*S+#R+y?q0H3BFZ;(RKI%?J6Hs2t=3~|6A{qEIJJdp}U)tqygYx zihnRzMHxsI#pHj?==C=y^gRS8{ry!G*4y&QnP(oBQdcu@5*HAT2j^EO>UTWxeuTQ4TJqtQVPK+KKE zC*S>BXU>YO)%Z?_R)9^$BnHL&fAqx>X~HKFpV*)*q8ROtk0uh#9Jm|rX>yxfMe)I; zLzW)-{FBB5^%dl}@;5E2mem6Yd&#k52fslRgD#q1o)0V^QsF_JOcUC`iKzD8rmYo$ z6)z6eC-;~)DLL=4nX9itRfkZW;D8FUvDt=)@E~FAHVr{36i7MA1G}N;ELzlvJo8cj zKT}8rf?9~29A!>DK)u(~3;|`G)6S@EW^_{{e;;odnSGaYEXy2(Pk3=n1X&%gf( zk^Z#-fo%OzvZscx^;)5Zgr4hj`Sgn0mk-ySjd`B?c$#A7h(oXct-%^I_04fs5BJ9P zR;KU*0zsj-h*#gmSZT2->KVH3zPDgJ_;wd9i00Lr`VC#P~$h==T9VBu>~> zMe8Xa6vy2?Zq}}o@MrirU}?i05`1s&=PV=b!s$hR1c4f|n)+>2BjlkhW$eDr#{X2P zBl5?n22dJ1zDgCh;fQgX1q2pn&xo((X5OX zQkEj242e}dm`s}7e6(zLzLU5b3QgbdBPRakQ1qIn4ij*5c0Sss=6BpAceL~!6>fY5 z3K0Ejbe_*;?bNOis)WI@TmR@S)|!U}0(TaC&8yCps} zfGQO4P7#b{#3AJ=JADYqY$luIH_N^MS6c+XMc9I$OT_mZM{e;$wcIO&7t}#3LTMy> zd1oSbBmlY;en7$`{QYU<@dB7Pg#wLowhu_wEg*I``VAv5#fxh&WNvjL{l z6xlFhlMJSQl67b?_cj_5q{V8sxm;r#0QQiuQu)UIm$YrjFCGd{Lj?a002$_U0m_p^ zu3~0Myx&v!RX4J#d}KCo@>+N!k{$`*qg71_d2Hzb>aE>`h!R;IcKPFuh~slA>9i4j zf0Iz8&gB3dh}KY8phj%@olgk!3avcg6#_AY=O8Ko!lVMM*$Y|G2N=krG5Bf3K?MLA zNGBD@=K|8LXMhh;{{MZyS0mI#(*f_Gy(t&S)?9bWlm*!XWQ{BQcvj$5I&M+wcf|Et z92~u~1R%`Sd0|TU#R8PG?bSXO54}m*Q(^h?H?`=#77*_mee=cKFCoYNxA(O%$l_Wg z=CS=rx8|$!!&#gfr!c6WoOS}i;kSI&OXY(BRK$M2{Jr@J1l?Vop^Cch;nTnH2T*HA z9mcH90}QYM6Xz{~1fgUCu11_VcrZJBp&-Bu!Sv30SG)j>2QMLdc=lfjABw!|^q*{) 
zVysTqC7hm~+U~`MO64nN1{Bu*1a8A=Lgf2(NqVt-Fw0B^0}el#^A`dHT^V3v@T-Ua zMNPPZR_iAOWW`3KWyaOVn%Q!s8UT&)e>Wf&N|pp@FPZs60OrvTWVUoB13A^(R8ZV{ zIgO52z2YA0wslMLyJ6pDh)U#yW}Hi?ka!{2!IbTJ3X2g0QGWqoh2QFOk|f>9UyXXd zg;V~-y4-wMP|0Lo0ep^fjlKE%V@|hYTa%HJOi}$XDBf&zLslMc{7RksN9+@zAO1KI zAhoJZK>`_s+!4Oik4SlafeIDZ*|7O}1{z@d^#Vc}34>1uZ0&WV&*Ro|iwOrE70Od+ zzD_O#>0;G@pjv0qwf!qbo-GK+?LU}J!NkM4=THeBt(YyhrmTJk3E02_TVb7D$H@HA z?Kmzg$}J~@E-nF^GJuEdx+6}A+yuxD+(Sho{{ajd*I)`@sVSuhxFs?KijX4`(YpS1 zqAL6^ov2p2|G_P}(zRA51-MLJVGsV3cO}`MiQKG7fajiDLI(?asi=}gJ^_$Vna72D zc6B7Z5)F^TdZLoP0tT9Qq|jwi(FwtC{C_WwKRn|5Tii(~?MUYZ047Q>#|8WAfcp2N z-F^g^z~8%{9~tH!#jw1T1Ja0!zZ>UrIJT$&1o;VY)zsIrCJpI{d61~XgFFaKR0`xx zmgDxn`uwTqfkze)ZLg02nJ_(2T9EycvBms%Zvd1D0KYb!cc)8+Dgr>F&xSz`A=*t) z3?`4{b9uln0QL(Ga;JOw1TguH9&{DRQ7v0%A}V7IO*5%K#UrN#bf`< zy%w_kG_%r*)d$3b=%#WWoD-3tsaN5@&eNa%)5gqDR-pj{nQzAkxUVau%T&6{#v@kfI16vIdX9A@}i;hSC@$ppG*+{1=Br`QLFjX2QnP z4j|@a{m%l4X!(yW_pMW?(T{h}m{X4h6|p_U;dk2WF{qyOJ$#JUmzVs0*MQ(?(!*J% z=|mqf^T}~LlSz6uq06o^Akg8tQmIrl3A15=P|~-~T%~vJLlLXZ=6uD8=qzg{oS~CH zk_RQ_=lJ%JO=Ou7$?e!bzY9$`M<)qIpXRlza9O*`gmQNpO*NSdSC!M%R9z4T;6L`m zSlJsO()L{!I8(EFoS5aC z@YZRsE6SXd_OBU+V`ekSvF^_{m5Jr3rp~0PmA!Z`WJ-vK--;MlpHD|j@HyjA`38&e zUguSNKBGgwxKdH%Y@y5f{8&tVY2#MNPVK=e$KbuW!$Wn`XCsM&xi6Vo8q}^cuYG!H z$a~KJb;yrH-$gM!&AfnmBv|u{A(8ATSBC{z-3Kb-Zpx{fL*3nFYArqb4SmP6GHd$1 zNnbNgETr*x->siYH{(a<_Z;!kMt|xb?nxEt_rk3(i>aU$liBZnYAIM>tgdP)0_mVe z!ozFMrGc)`r}(l0M6Ex(mFH`E5}TfioyD!yw6$_Q=X*!>t=Ll4PsSg*2GTNS8@|Wf zPn6Xi<~pzG)*wb7Oc0|kwdwZ@uPAI}m-WY4-&h(%-oCYjU*bzV)(g zGFoLRC1rM5YG56(eItIk%oT63mS3b6l|QJlN{}ne5SHN?>rCg|I#{O4RU~VfLd3t8 z*sb91G~Q?`aIP`ooEkz|pp*7Bvfp_j^O?VzZoM>9RU&(iWr3A*iOF`{8wuMD+xW8AIa%6W=n|Yq2{$rXX5Mb4w%7C@>&hTE zS!3V23!Rl&i`u8MM8)W?34zxI((h+)1=s4927NRXOQ~}bx$F0*Y{Xs?FTgNYNObwR zRfd0Jtl_jHO&7-SXE6d3osz)sDj#1<*sdORx=hG%f%jk0v!AF|>(+rT|O-DAC8 zz+*W1`>lB3erJZm{V9!t{_vl4xM_-OBAq1kl-oq**BPx=^^|#BR1}JIa5loVZEV9Q zSM7K7<>Gy|v*_P&hL(#ZoBy(L4=jK1e)fhTos{uy1MAB1V$+kQjsjhzxRdL4j4P85 zhn^{itm01C8!%0{yMEV*zSYA1aI`tst-t-&sc6w;gYBVv%r#2iV4=N0F3xnrfr}4l zkMR_awAh&v@Qn_e#ti+|IHgkPu}P)&_*s;s zG`P9C3kR1ZhJ5Bbv>JTb`#lXun*lMa7n<79{pV}~gg-fTjo-KLNCy{W4{>RAX`m!0 zb;m!fKuSIzmgApgof@ZInwU1#4-r44aye^$w&WWWPS%ivb{ly3>j%9i>PiSxmL$u4 zw7DVc5l|qwSmc^rlRu`67HYOw3 zpELI2ft^=FA~xJL9&af?TDHB-)UCf>1`V>D-&nG|0)lpuD6PJeWCMZ@|X!7fj8VRT1eE_V;_>HckvsCEzQ#>U!gU{4ZMf_t4n` z(VKNrB!Pwf>MvoSN6LT#uz+1ahYBw$V%b}tNU)yop#QDCwwztfm2;y z{CKz#%E3fOk^SrZeUX|5wn%00o~xbr@;do|W+ltx;w%=R)Zef^e@g>!AGF#7*r@Bs zj+`~$WV(d{1E9D87q4MTz8YRlSRC(0K09v4*m6pq4^{M28Kl z;3CorDT7b`kHr!N$HoW)=ZXc+-?Ou4Bb@o^3_9X=kM~SvHGqW17!Yvdxmb~l4PeVx zG2)Q;icZa_OplR0xQ%lw9v%6pj`TEOCq_|-qRVciiwDvWKM263@}rr&314CjLLI$= zIuB4JCdtJ@GN=?^fS@{nfQ1Z+7#5b;Oe^{q&G7R4ffn2dcv`t|yor}a;`0KP+u)Cn z;QX1ygbVQsHU0~;p)+;k>u$@N(1T*J5E69($Ar#y$q#tq2H!2+uC52wWx?ZL7wS~EHKtjbZQ_WGVodTjQ zJ&4Wj@*rcj3vg#@|P>z_-a$Dmlg$sUCaP(dUv9^TU{rf?q#vkW4QO~0+X?#E?Pa z-XJ}59n@N)r&d5rM9wjvZEvPt58}K`2NXKD;9#$Cn*a~xVJ9}`awRQBTr5%&TbgGz z?C09<=1UeuF{*_)#ze-B=aCZABKJ|9(~bnGETA7oU=yg%59>YQx;(EaU!vcVBlJ+s zIzodFaF zSklJf9HUREO{q;wETz;-1|s#CwgnruQmVS2EQ~LVAAGdwPCV*Znd4?1?$BC@Kd@^^ zUUD&*C-TnGEmIDvKhT0K0BFk^`{V z#V4It3c}J0bn2=fYJPo~@zC)#NLRB>`ak+Sg2sIhj6oSV7VEa7DYO{n-a|n{2$2V( zqV+!fR!oeFQ7Pg{z8qk&DM^KX^B#Gqz_HQKf9Bg`pYN|vFt(7nm_4nk`s7s#w|?_! 
zdjQlXoeQ{*<*U0lg$ImZFH2s5)}AVRlE>%V8>8~{OhCg(vseyhvy>?_6x(Ak2K+tq z(0T~yZFli*_>xT9|jf(17qT!@v&bWt9TCB zqPUN56voH`cco*4(IcOb<7`@Ww>6o~j3P&fL`l63pWT6BeKgOL) zl5qRBFdOZ_k=*5|U%+C%;Q1^HP}K0~#H6V4uX+d;N`a~q5y|oEgnjryhv^KMHZ$c@ zMXZO^4GAxptN!54veMFHq;cnVJ5V9J@Cy?mwx&!6%@koE#OnH#jQhF(!BPrP4Mjt; zAo8@mUb7($G9c16Q3fKcWZ8p{d_T%va{p}yXh8>`+~2wCfbZ8nu&Mk+QaAQ(L$I^$ z_H#Z8K&p-b2GySs_MMA;=^hRgM^b*@}Uj?Wd2z)`FAZ{XBto3V?fqydQRudYo>RKp3u9!B&YKS4@ez^WbtJ zelQ9lg4F`n5JHW?sAvkD`_e=e#=!8F*6q7oP%-imVr~>j^huzf(aVmnJfQY(mTmI8 zd?P&`xqlW$e39@K6iE>1kS~hH54`}>y=|vkkmWYbvoP={+JQ90)T}VI1h4}Y^OWi7 z6*K%vcx<0?Sd1I-ahj-2)jI72$(vR%kLaWI>IEv^zuX`8*Gh=RVjrqgLVPO%JR5@w zdaJbt0BX)v&W$Ku1F+^yFuPW3EfSzn1_4gy517I*;JzU&GV5Is`koDe zVi}0R>qEShA|EIUsDRrIi{1t%eL3pk9wkx^GBjQRey#Q1pju#kzoy6G;LckH8gC

5%02J^iy`Kh6gO1|vE*6O zbinDoIU;Ad2AnLqPd_SQp($kR0 zN7H(va`u+-RDIn5;LqQy6ZVI!pQVd;g6G!3cQPZMmfb#SgxS!W= z)j~=OmkTBFg&V769Aw=&r20%4|7wtqIf_Sc8h1A z*2(5*qv4`t@Fx;#7sw5RdhN=AneQvt2qAJN8zoVl$#hB$g3U(DTDeTla9?!i&kvLt zD>==)Bj{+#aK{siY?IQ*aMS^Xdc?fjg+jvTC=V({|K_7IFklbawcfj>aNA_v6)DH_ zHj1Ge!MtoX8utI3!M9E^o3C2H0B%IgN@0y0 zpuFDuCWgSCd4+;}_=;B?3?Yv1vlwVY*v3j|ox@@`PCdxd(i9npAtjb*;9D?VZkpv6tqNWh{sT-^Z(e|pii(sD{0tLUH9@_0E3gfA#!UP z{L*G#;ZBE;`n@5|KG1fEIG~bv3b+HGpG;DjC_M5(8kK2QuW9nUjhP{Zwe;ZcVb6$k zrXrrnE{imtfU4OzppZl$dKB#J8mW?aY!}HvSd$+YU6k4^$QeUvRJl$&Mp|B@?`1=h z$lE9>5WLT#DS)FoP@p-iF?8T|-(|IU$hyP;Bds+egAA=t6s!u_+2Toh<%}P(-gu=| zj8F2bu`**S=aM7=w`#bx7}LmRMe9Y}!oRd&t`ELISesrsC+rlIR)0ZTH#KXW37%Z7s*>Vd^hsdz z`B6$dcKHVKX%iIEMN7B_EP=BQ_tuZw!B^tJEtVX8hOqOEmd4#gN>r(~`AQSiPINk?bF1SCu6<3u(ys;EM?zShzex3}v(0zU%HlEwS2nhqSeNQTQtK^H^J1E`oBK|QuHoj;W zu(HXH_ND1tqJz0;(V2@w{H%st7)?2?4j&uVg!pAE z!hogC>#>jie9|9Ye^mV2p8XuVFJ+tB7 zO=akZQf{XM#ihwghnJGNMC*!>uRymxj}>Q!f&~^>HoJ8f*?uD~yp`&+a)75HQaILl zh?1pYzzpR_%*e&@vPI5?Zjtpf3_We=mL2c@ZaB=hWKw79rAg$X1b^rMU;N9(XI-lP z$Z8mHp1^JzQ=ZApO5y3*0Lb~qZ%E$GHU}46z5hhdpS}AgDux)rv=-5XA3MgOl&4K@ ziZkzyF15cO;YSJ^IEcOJdT;)X5=842=Ax9!cly);%Gc;9E2phhQG)1b2!BaW8)H7{ zr3!3eww&jCf)DmG2n7ZOD#BHiYjm1xRenPIhZ3*0PDh%HnUfuMSq~epg2r4Jefkv54O7X>@>A1`G~ z)_9;9qg46A1bP5NF!mPN+tQnur@F{Zh=Z8afT zOaWXWa$eLKFXit!$xlBbo=SRc+T=Q2i^@+~`ZWf-CigZ5PDY}Pl3_}l<+eToyTQ#r z|5NU1TD z$ymC#+I-^P8*VwE3~NT^iBsriX8KI^XILxH`y(3yI@}eg?-r)fJ$A`ob7;$mCxcm9rys>H656baLJ$=ZjZt_C z*`s+E1z9t9hGf+2@5v&t-DBPvicIWDJ~c%=bwK%MRDJNpBZTEptBp1Fe90l?4hkm* zLSmu^tJU1c(mThA7?*;r%0pl_oGY-nhG_5` z?%4Stk0%Z6Yd%JsA0qQpTTlB_LAyLvCO^d2c5jtc4qdZRgc)cX>`7e3tS3f6^S11T z>0fKO{_>hap`Y_fX!H zNCay$AH+Kq&yMVmd6oK(F7)-S1Q>xJQwCGn6><}Sei4N1T*@7!`TU59HkI67%Jk8= zSRnfvtcEaKwu|?M1dE`TP^==bpvjH@K2VNM=3-uqV zi-23m>?W?G1%62uksgs03fKz|xAi6}8h_{FVIx-5bC?6?nj*}!$#p@EP)M&p z+WoRw*C8|SoNk`P@8XTRU{gcwUc9iuOnXiK5NT*~5nzWzK5qOh1bbHFy$X-Xft^bl zu&cN5&e>^py$Tu=0r8_?^3=w6@Hm0}4O!3~GM?ln3Xax(Am1*f)9~1?9y|8 zjn@A-J%w8ut(F1orb#!cIj8Wz*qP)Cu25q-^7ZzK-SM)BJ@xlCl^BlCV10^jVVN=u zv)%L+N2;y6NFz_TLplPTH&6>ZgU6(x>B#D?VOfOTWSk|^x1a=5i~HbCr93?TO)gam zsnii-OgEl)fvYfL&PIfSRtrwao-@vI`Wc#n$1bbn2TwIH#5!wL>1!}wH5W$ws2GkA zM_TR;*-9$=-J>v@(?;LKGRC*okLp_ECG?KVdU#Kggweq z$d^Oi=jm`a^I!q<`5nq8&=f2W?iWC4D-`MKj9LkpatIF?l6KAfCua{oSE2PjLg0gA z<)sXc2^>Bh^wgkZ+QR$r7%em{x6JbrX4*-&9(kbghh>!rlb;rWUSRRQqXCmi4Tn}b zK5kLOo?b<-5y%g?O#>_Zi=q3s#m;)WNvKhEkn&Q#5b`yFe0l9WeGgVu0iKuS?K147 z@jw3jom&fyj|Qgo#rt>cvE)SPm*zC=2GXyIItp2Z&X+R&Jcq|JplK=Xg%uiq9?K3+ zq?d~evxk9qLkqk%yVu!056}=_U=8u#(Heqxw8}vg48a`m!<_4`pY^(X=LjBuOc!g+ z2YD&lAMdrlEmk+-QAUP>#)lLtqn4x3BWMaKc3kfRJzJm|vXw7i=JCY73%x-4{)!;2 zvhSWug=<5{?5~a7heH*3TRgcdt}xRJ!AD^2g~(x9iG)SGuwd$#SMS8b87BrhowoT0 z;*60K66qj!;Mge?9effTeU8|Z6ts!LaD@a7g zw-(;wZpJ|~x}^9+THZ$YFanXn!2n&%m7nJ=0Z+2Cko*K5i-e|W?UIV%soLWF`qyJI zFZG-Hu342Y{n8DM=5yMSnshy)_Bi)eOj9Q0t}@!H+}BSZz*wECkgQMGd$*_GBpnq~ z+9fcoVbEL$Z2{gdQ#-7=!3*DRQi0QyXQz73j#i2>?Xis13yE$9ipyyczfc7)N>uQf zb&Snz?K>G1i@zBUJz-sk5L$JO#}iXh2sos&X$m^^X>L#RPn5VSA7PQO7!I4$5xg8y zAbM&0aq)>#7pB>0rB0!l!i0%XzvmbSeAC$Y#pXnE1d116i@;WUlzCQ>Pu0m+K6U|1Qii1CyfZUn9XEIQ()Os z)zidG3#ZXxpzwFjt%&Fu+9cPgUR~;28#*yJ6*!%G-D`+ASeu}#xj0rD`g};S>ALi- zaeE4PtiobgoXpYonHcxAF4zT0dv+!ENS!s11Zu#@R;q^lcvtr<L*!yZas_{Z(Lk5TaGLTel*>#RTo+$u^=-eVWh z@i?*qFm4};lZm(O{C%YxJz-Dq6Jgi#tgPs*Atfr8R~>yR3&IjfMh{`TSBTvQtMpI=&4 zpO-;YJjS*INgT8+=rwAiR6I%yfa{U6!9t5xJTN*6_1g2Ns^!Ur@5z^!)^}uQG1Gc$ zMhk>@At^d~>gxsbsDRu3d8-!vM98d>bu}$pf5~Zi`|M)ZT3=dfeY}-8<8A-^+V)1O zsA9cYZbph^h53c3{rP!z)I+O)L~HBRFdtb}1i$U#m(NpQq@?c%s^3V8I^7xfx_a@G zCBdE56)sgtt#7*PAdUM0(zrMHYNo7`Ln^gS)t}Rn=1&p^x)#-Y0#!%*hm#&zGkV)U 
zD)iR!?(0u1XO`HUsHVre9$`IgVi?S{9*pn%V$5jV^9Qf=*jHe0Xr`McB(hu3U6QTL z_yt>bWFS7L1y%k0@uanI>5zeyR?5oo!$=2I3fJ$W(Ir;jiA=B1x)m)iMR&7#JDy1p z$fP&csFps-ZA#FUXsflX$HQKjj}r6{uKF@}w=dI0kp5+Hp4BsTx6noJDptL&;S|Ds zXfU?HVdF`Cmi$a4_JK6YiA+%?$6LO^j-t>b?hFsLk zn-5OwZ4!zLwcasUcLuv;XuTJMMzH2`37)L; z5OJFL-(j|WGu@RTx_vyB(w1;L>nEp8>^)%xO(dIfu0lb?K%t$V)=^n{wGFBqn?L#Zy^uXIIEA`cK3>c1CAr@EW7PLY}bGHo?@-p9@nlM_6hW+ z%f(`AOJ1Wvmfgw9pTb678TQ2{)+Nk01O1{`U$0AXyB^AC)hOqys~zLGOLXOsojf=j zF1)z5OiCF!>85hd>&n&doS@yObByDp$~NA_O3L@;C0T7n&9j7U;f~I|ytGMFw`Jow z>R$4^$>qZHkFrE}T=s}*)TVlP`(Ng;R;4=U_l{#_KY$3(}?up|n9aU*N~g z$4)W0>`R=b21 zJ!H|sJ$3eK;G{v&VXPoTL$&q=F5%9&P}ORL1abg^%$kg`WgAot<|q8~_Vd!%ov{@r z4~l|Lem;!o@Lts1kP9C>=4bX;o77#??GiiLAmDE>sdRIFkC|iQX|g}coucHg_SKy>kh}2L`R>PoDM6lB%)L3&wuOm%GVQQ zKJLiD|ItmyTRpmn6 z@K_47KX=+@(!`jAuT34h>UGaNofR!^5=VQ3B^;aJ{LJ3MxaJ#aqtDeiiVP;a^Mm^} zKUr30HYD2U^fwmt45C&r-;YpfCuqmnSwIVUt@o4Bc{y?Tg^uei)=JzrLvNp)$*zr3GN&gy7fz6Pp&oy3BD>gm<25qTDM-OOXDxG!ODrI0KF*cTFOwH zw5T~^$W^F|uxelZc3-t^R~>6^ww1MDOvjEFJDfPi?TuM_`%L6yvqr(XO7o8YJP)>F zh5ZJ3Y?%?corwy&1^eIv#h(4Ex;oVqQg+%p@1ymVP_fgV$|rAEOupk!h!vH^M}Ez8 z2+hw_kF2)t8Zndx!f3r-ocy3MZArczgP%<0$9&77Q*~4-`B9t>I7b7TE{rK|?-qAH z%|`**a#h6!cHd_!?OYrCX{(q{K2f4{6L2}| zN#(oF8Cg^p=LDgZ;z02E>C}d`?g@I@)x9{SP+oMIVV$cT;ot=SN_Lio;@+0WIT^8~A^RFgRWp~E_#^+A}S*wyNZ-Sx_a zxh2Vs|EIk(kB9ne<9-pMvLuA;Nm;WcvZQPgvW{J4CrifIhomeaBxKK)7>s>vgOYuj zMwYS-*@=zWYA@WkawHvjCUEYXm| zDdew3C7bh;MJr(K@sT15ZyY5T)x-6=-lgoWf$7-gvOTIuuaxbTWKL_2vVN0lQf9P~ z_R}akNMQKH-AUAJ@_@q$xb?hq76uZc?Yn0mQH2V5(Q-2AmK8g@zthf#*0r?l*JodJ z{T{5%%DX^4>a+gdZ1k5uLeP751k@&-F~yzNJxUd^5KXrNCt4qAeS|C8VePK<+@qlO zwfoiFSN+Jj{OUAxAUOjZ?!FyHZUl97S7e)dg{Zl-?}K@7H}gABoSnXpz}Fymrx)R8 zxN6&4Prd&Qz3qlYq)h5na25p94LG_*6&l53(~Hf86Z1$>JQ;5OIdy{wMf6-+bXY)! z7TMZd+(3l<+i&VjQVGFFe6nH5OA__AJ3)srE<-nnLd%H74 zOO<|{<@n1|v0@3&skk7*6leVuhdt}B<-iF~9*0I?-mr~`xorBZRCeeh70Y9mKQnDG zV^~L|-bxB2UPNrDPn&XJP;5$h=H&&g22$nJ=*15Z++$5A5{M|Wj(h49B6y+x)~2dc z;({Ju=QH`VLK`;YtuaiZ68h!Ps2e_gtaUwQ78++FZ~eu|{a%Vww`L&*VVcHv-ozE{ zCX7(OWJ zJzNG0)&GdSa{26%bC)l|tuWtU;>I|pkqUiRz=Rg=(qGp%w?bzKW9nD9^-!Gu8{Hva zfSCEPoH-fXZ zo5HnGkSApPk#puJ*EK2DyIN*4i!N;ml10uHV;S9MteecH(B^}^t@fEHfl%3U7NM|3 z@p^(FQkwTXt);(vOeE`Obq!*pW{z%(s@!1pNPJSUzc~8y_V0Bu#eNJLpJv4ee1HJ( z0q`qN54)eXE|+0V`CEmGU#d${j@Ed1#K@t4`mS!*aLsX#St4I7GUz~Sue)h14P`V~ z3#%OTk37~$9lx`+tnlzjHY#@UwmQpAn4^D3kJ#SG`y-Wk&s@{M<*( znddm+79LB(`CQI{y8|>)0@1bxJ<0uX4jJ|K$CvY1vh0_%Mw`=)Z4TjdCkt;3I=9HM z{9g0H-+Sqv7xIhMS+VhMV^``d#=p5be#mT4yN*-l@y@b4Bt!U9iYNz2j-4a@{hgbb z0ZJH~m0?h-vm8?bE8=m-^fB>gQkG*9)|}~BdRw)gVP*(-QCfEQJoMu%;=V+ors=9_ zdjeAaC?+z%^PRyUdi(YQ;|>!h2X(mUTwx=i&xq#sRiA(-$}AX+M)Y2zyU21h?=0!d z`+(vH))YFGv;h-kGfCw&m2$1PF?6sA&z{}b`kpuQLlrN{G}D^C-6`ZnU7BFk^Ma$? 
zob~`@5&EiC9xXAg)bFl!ZM58qoHMisV?xk{U(_*<>_@D@RJVKqdP} z4KqUfP2CXs{7(DdhC1hSCM#AQG>Z`NgiG@wxfvJn8xq@5oB7BO`MPy!^Y+!VIhg9j zJ(Zq?GQ`ST%!gi4rWr)E<=cql!ScVGelGBa{?Z^m%!`WVnD}!mbt(o35O1YO}7bWb3(Z4Ol9wSQki@e$nv%6 zPi;tT1niS`vlZA_OWFLM@objt`eL5@I9X58OksETkxT7H1J@KICmh{X9iEB$ZEc7T z-u9q}N#*5P?kah#UWQU}MW?y1i|T(9U4oI{-z9sl95va$Qc?r!93k*A97 zO)7y$H%7L6jG;#p+VEoi3=v7bx|p3l0ge~-Q|m!ZJ_n!nTt&S)HpKTk5%S;mM(LSR zOO9loiW?YB@tm*s_^eGDGo=1KMS<6+MOQrX*@dgNGIR#fM8D}Ot{->IkQecJwETFm zjQTOOWkAZf>vy+5$LUOhY@RflF#anBQZv6?V%r!t(n;+6sZ7?K=+S7+tO7VfIIk$wF zE>360U-?!W99&Z4T8N3pu{xPY4RX-fiX4dl^vPM}B=k@ zXwycG8~l?)i-ekQpv$YS$c8OGltwmOzgpbMQdND=;~^Jxcz3-b#tXU#eaFjhyRTM& zyz%4kTjK1j@uB}_RCMl$b93Rr2&(3Rytnw2p{bK{WKw2yWCB-E;OquS2u^RM4_%D# z9VxVUDw!9jC2Qg7obDiOOOQT1$Pa4M6kAC@b_fW03MVaRVdqP*4GU<>1O~nbPDK~d zE_vlKk(VErUY-C0O-pxo=fcWAeg&$FqL!z&Do^R4wjaD1nNtE@^+j^OthC!mk0PQ{ zyM80c?5p5Zk`_EbKZ8eeR$}^hsr$VVEyH-G0U_o!-}oejLptw7V?`&pq3vgiQ+9WQ%zW=PSl`Csyj;o)_8*TkUen*Pc9I^9q?XU&VRYh{ zw`Njmc0GtPu&iH_PpY(dl_)A|boB}Nxq2r>+4Nr^hW9fr0B6fA*{qd`2=akC1yK*nY4A``SnQ|K$>Qr>Uzow~;$IY;Yeb{4%84CeOgPk{z1uA1Ai~u{Y1|cZ z=v)+G_dzpGz07&(ij^Kn1b?3x7yF$i`dgo2`-TX5)8Grj>3Q({Z@$#amLkLKUhsg? z7|z8ui|uhy5ZP`gu|A()rSR=I@Cv1_qAzMmBEv3}Gpdz$`I^*Oju*uG%Rb7fOdUwt zv8xH+s`zZo z93A_e3|lZ~=3rJDiT(1zaA8mHXm@Ha=49e$u%#?|MZX4fgBV6O|H(Sym1yBdMTyhl z6ygD6{a(RZyIYAdRd8{uR~t*%UVZ`dEf49XUy*b!MGTbP5-y&`?YD%2kztE!&!GuN z4BO9Su?;u&zi%955z?_rA4KHSID-je510FD4`6GzzY3`cEyllT^gO_rC_p>#ZyK9^ z54zimCrTCDqnEx3hb`V#^EK#C^?3GC*G9}`PO^)BxBhUpF4w=MfUAyH>F!N2bFcxT z{)JqaxcP;D;c+Hw5~8%ZdAU5#_DHpRjx>9Gi<$*25GSZJACZcaxOz-RN=~9M!(u>5>pI;yrEB$NuBH;gBqldTzd87DDI zIoh}L2!q<6tFJ)Pes(qi4NB1$!Y@=dhjR|^wQATLqL_nm-{n+yms(-1^QE(o2Bndu z^*gOhvyXM zdBS6R9nM|~*1{}2eT(fCvv1qW<9#(;niu)PE$9Y;iwM|QL9DftqLK$RWbuZTH?$sx za3-HHe5m)_{;|cAG;!jlz{&!n()P-Th|GMnw6fpO#~S%@j7USefDp31I!+CDvH(w_ zY3)1RF*Mzf$&B<{3T%U|eGi65@&_DQC0-B;j5RRN!Wb@I{UqaFEd7HFt7VQ)^pbDi zaZNDJh)z&BC#Di7wcXRprzqj%3woud#zy4%M^j&J@o{QxGgfECaTR4Gg%Eq1ZW7h_JbRy`i$@SQbPdHe9w~=P)EV z)oIxyvuU$&>st{%%ueO*`w&H7c=6eNHA`%>S)-l@yk!qp&N}Vpi*npMXca1&F%loC z+UIvpHmtf)3IglH2k_{R(hF);!>S#ihCV#}yb}HT<0WF{fQpwkinBn~q`+Nc*Leir zstCj=rqsXog{KS5Jsu3JN5cFq{jXy%5k+I=NeE-OGf0hnOP#db-yvMyL!Q ztkRbC2;EqM7nK$wBRl*$cY}rs+1LaW&uqIKU0K0Kv_%~Wu8yyBshATKJQejR)ca1x zbzk!?xm+$-mjU{V^89}cBAy?L9=9{J#Ndvt9hq*ideh~u0aa+;txm=Pn{Ta~gRvov z+TTp*Y-|sbZCWWyr}J%cZ-1iau3|ap4i3VSa#VG4czZ-vxD2(Jyq3&HzQhhS&*vZ$ z7A~BAg(k?^n>o}^6y6yWPn8LaVb5@($LAm+*rCFAdOJ(TDsplXZV_0XS;r~Id<)4a zyS@_$#lh5H&+T{GYkAevu5e(?>{iYX%|%Bv#BuX6#r+I|uTOV4)zK*SC-6D~Ndx~m z0W+}I7B$Yia8U{Nm7T4=Ej7U6Fnm59>wN|NI3FSmlbPHhS=&N1&y;DfkOe$D)Lvsk zxRNQK%QqeLXTO;^{W{OYDg~NiA+pX?R8n=cfzHc;bwwgjOlG5+4M}xhOcY#OZxVCf zm6x1mgC<2fR@=k_1gpTDN0D24_2g!u23v0}4A{b9T0g$2ldBjAJ05&zt5;BzFycA% zd2YXN=ra0-bFtOpX^AG-Ymzzfn)(I3?B>c*iP=bzt6Quj5=o1NgunYHDYX@Qi~Nh! 
zi@ig*Uyf#6=V19;0wIxUd`EU#l21>;N0l50%Qi+|UhBoI&RvCF=VH@W%X1Pg68ePd z-ST^wV>XbRxG7g^yOCgnhW$wOpv}$4xl&O=ogfES&4;aJMGz2cJ~*1cn9dGgQ<>ep z&u=v?SI1d^8!Oa$`eS|D(pLPkzrK~z%bxBsj@f&t*HKZDGAt_&HUgbH9TD80Zz}ea zi>S~kN{~|{%s{n#)Hnm^^}20#VPd{hW?o69`dq+kU5Y6Xnn2t^o++}nAj2=d(5UiL z!S)Y2FAY39GX`cN@Wi$uSybDkqQkp>`U^c(=j(Hp_r~j)6;3N0{PeC_ACDazC!C!N zb+tLfF;P(O^W+UZu`=tC^a6d(!o4Y1IP2m6iKNUIOYyX^fNy8;=V(oI-0Ni%rP1{J zw30CTy~|-&W@oC3D6?Fa=t4*0A~|ba@wl~Njp&Y@Qj8IVs`5u(JRl(XTU8)lDhQ_w zfYnmJ%x5d;%o`~Mf(5@J0C`5-5O?VhYl0@w7$LNgW}2;mRqZB&NrBX$@Lb3#Y`oY} zOB=mb+VAR7T4TSryS-wr@_n=hjwqG4JS17OnPA}eS)Y+gXr^H0cUKopx_J84?|POi z`;PWqUC7u>Z6$fD;RYYcANyU}hI^lJl8!~Ela0&W!_5L)cC%2tNyLN=&(Y%PCa_HD zZMX#z6s15WBVI-*ER=>-SLKaL&R>IK370|f@Y5bzrK!h{voC?|brt+G4=}=3wTsAy zE1ZyQE>7q*tt`ScED7& zF=@9ZGQjrZeks&XO+cGJ2XZw}Jy+cE#oJJkvkN!_`6K&!gF-HkFB*7OjIGJbN8(Vs zz-wM(ee#z##<#y}^nJM7I-m_rbkLKJ*~8x?jD{t|3t1YgyiUI?qwZSaFp)My+z==? zMA?t&ghXAk6I!|-t<_`Q@!7RV4BqR$I)x0gn5;gKP-r=T?vqNK1L z=Vo8`Kzu~5?gz8nFi$!q9<|3{i`{Ciq>q>A6WL;4E9ysAL2#-wxe)mhYBNiotWVlV zl3`8X5r-l|n)N|QHj?g}vx*!vW7OZORc+UVF<``XWKc75#B`G?Z_H!<>!TXK3Gu8) z*wojzgcJSu28>2VT$)GB*9$w>I_h|G?=PI6UE#`LNf$w{dG1aHGJBm_Ajg6|s}`0y zOXso2g(I*1l`NH^n*DS3H0=AT!pm-<{&LU;7nl3P>F2uS>Cldsz@AXBw+ z0NOVvw@Ee8k_bCn!)Ac;*^Xi&!T$!wWfKU6ta8IL44BcHdqp@wL5;mpkgk!X!|Sy2%4f1_$Gplvbb?KakY8CR4-P{uZXA+tP*qAFRLwB?%`M?_)-ZsEASXaZz%y3L5v9&p&D(iHSDf>;uvvW+^E)svDgu)Fp{L6rl2Wlz!z zH?F+F063MztzejxUF7HV?(MK-TsCBdf1vh`=Ry3_bop#|X|PEX47LXh?%qixFErhg zyW%u>@OVWyQ35HFfj;v10*xVVFTq&Moy?sF*tA3}W!yps9!fV_L^XXsuZmIhWjC)O z%r%|HH!I`lSY%sd#>WEj3~*c?b+WZq%PU687h}W?K_A1_ZzE4F1NZc54$}FiK{ zRHb}2na49){Gt^KrE|+IS-XjnbZ`5=mF(#FO7ed8D1*RHk9B%X7! zUYd={`qT?tS$bldb~$k9BlXg=_k5mCzmK_IFynj1HD26_JRQ;jT-!dUFT!=Fzutht z=ek?`z@;cDf3&dES3EtaJ*AtNd^&@N;<3&^N(1Xg^O=1Y)4j=@OXXm*jqf zt*F{>?V&4gyxQbkmtA}3g=&54pp)FnQL)bh4}8)jKbO|@C>Irwy3~MJypp{)LWZ3;{`KG@wPRE#BeaO&W{4H(Bj}iRiZw_@_ZSS$LA5s$M_d{@O{mkk!pNPzD1c-H zt~1K?ywccr0qUx2#$e|$f7xwW$o=;W#1yi0-l1xon=pYMQ&zQ#@{(CEgid4U_`)nI zyPpvDkP0)UT&U1dcUC73wtHLYnHU?w@LRAw8j~56`1ELPeb{Kgcu8V^amnN-+o&4F zh1xm@1r}0l4cuCdNTZyl%M;vfiRz`d#M*!8=VJR?(-dxp={n~-~+w& z+Zkq7*(kFnxl0o%vr6GcOVq?%e1O^<2&sz{*N0bVE@q=uAniuqChC)eVwWiH#jD$h zQ#sFWBzy;Jce6wop0b@qUAXQ~>rMCy^9sw(GkX%0O%;h(WF^!)1Sf;jm4)2BywjZVIfpmNG5ASVnp%q>5K2h_2S^|UJ-cNYT zIUNnFR`M*PVtpGeO9X#P>p@x3=SL8J6g3D;WfqFEE%|g-sYe!)CrkzP`!f)rGx>H< zIQM_@2smeE(?6n%az2ox_1BKq(-siPBTsLZ&HZ?An>4&+e-E$FTAl(5f@oy(YI`nD zqQFAF6`DS~o)+G3=Yj}Y$*UAl#yu@qmtk@4(6z}`uQU-wvM2Ag5&Gw>fpLAT(|IrfCFJEqe)HZk1+dh_dS=U}fbM%jQs$=) zGOO1fZGB0c7k{Ubh!j6O@SBg(FVeLrFccPHQQ8dcD%3JTTD0`n!Ww=#@a8;8dZSx6 z%S?%O>R;KDTsPPVXY__>_TY4q<;N&G?5_TYZ z^(}1voU8%%&mPf%0(P~Rpum@*(_hsS6j=XYunq)b4xry#%qd=n(JQ4%&9V|zbfJgGlIlIEKi3={yjj`4A`Dmv;uiId181$8xp>M2POZyoY$ef+z!4D zG*c-z6F;074MM7oa26Q0U4lj`ZuO;Ds)LdV=_b{9Vo~@{l23{oK*JjjX<|Q}^Poxg zSm=~&_JHptt{Xzs7-ETnnYq4sEf)(=uMEY1y#cyKBS;i;E)ha< z`-#+rxTkO*T=yC4sSsH)lMwR5g)cSz=j7gav-Zh^0Zps^Mpf3o$-|n?fsT^eAaM?Z zfLnvGnZfGTKfT@?5DHRv;|U1VoEWhsF0|K!KgzQfU@EV3P1v3k2QIb7*H4%pNJA#b z!GGO_VRv1vi_ST)DFss)ze8=^9(?hgD6>>6dnWyOW#Ut?O?wR2z2daRWA#wYM*GUk zj-u3I;wu!SddD*h6jt2t&v>qD(%Lj}GwoMmfxd-y-SMOD>isi;v=(&NGw8tw$c>B3 zN}Oo2ZbYW5=)t4v_ zQ|K(8T2C}9e&su<{pJRDOaqD8IZa%{W^-)PoNn0eyuqtP}4H`8e`I2wEn_uvJO zJ9@ln1pIGvT7#bx^jA>}9}c5myD9;M@968LP>Zc?QSuCp8qFbG;F(KuBwCbiw~F=3 zRz40TD(Wu;Yrn7U9?&s(2wdeas`>KwR_PqQ;+a2QS_f*MegVEFvl#A@9ecXk;&uiZ z8)4o)oe~Duwm}9poXhHD3;=Dj#Ycyu=~l~cd!H~kL$0^FZuuhcwcNg$`o&Fc95mF@5|p~@kmeiJ&5fN?PXdP0 zl4N!Xvhk6rCqpP=6L-}3hS%u1rAx5nyX}$ox__=W_k)seAkerFq=xTFJP{{Kflg}N4y<$@tuCEE3cL>FXVcjt?}%EP zBueml?_WI_q*t=q$Ujt_DKsvR9PzSx(#j8vZ{{(UZRW^ePT}6k4c^G_Qd?I 
z9x`ANn2f+cx>MLwduAEIkATeUh{`Q%!xaae?x3GAOP1Aeywemx0>sw9ul9WbyHTx8j~M=^hgk!DtwSnTFw@bS4O~>S&$( z>oo?@?JT2k(<82p@j#h!W!e(Z8*%~hfl{>46s2WYY{p!w4LCQs`uVzaGZ%ncz6A!@ z$~=Y1BuUWL@<DMC|g;;cZ zN&ejM{P)Jd#?+^+%xeJe6mrQE;EQHbTpL6`JG8&Ldd6>iB_8nq+C$E<-l+n(x90%M zbstb@Xpb*Y4MawZ=?(9Br3K&fm5O6z^{ClpFZR7TBV2vk7L(4VoABfFJT^O5HJY6+ zCF-K+ot4_9;xKorzXtefO4BWTJphEjUfujPWZ|dj0?-~|?sd-rDXtxgkz*=T4^&FO z;cSwCx2MXFBm1V%!fO51?LxSqnsv5Fp^Fyd7cq^`l&s->~rN2+Ebl z7*1HoV?|NVbj0!Hq=GLKNBurO;HZ)A0{YV(>PA3jEVJ!l0f>+XYCb!kQ>CFffHei6 zO0h8pV|89k2a4}N@HGkwA7EZbUu!R%0gMq-en{~pI_}CR0BiRKKz;6;ZZwh7w69J! z^HgbE?pFDA|IdqbxOzSC&|ex~zs~)6+8H&*_%v61XLWPX0)9$`!;3OqJyFr_X;;w#%|T$wDtiYM%{c}UM#)!DiLXu1Wq?_ zqg(%DZD?t$LC;S2Na4N%pe24m1O3-ko(u5*&_~` zC3=&$%_IOGmH}v!stJO|=lkC1X-O*&A}vK^CLUDW%K$l97qj+7t7thxa7Ms|O>34i z(KCPDa_9c-z%NCB4nN;?_mf^lP{eF+ngD3oKot=Iv6NpXpN*qzOwl<8+QVO05Y=U zy<#x8>}LvLBC|ggzV)NbcXKhud9y|r{<5osFXPf*0}*Ie7sneiZKMSsZp$Qsr8^7h z)$eGE3P9JQL*W+v)HZkH0n%}96}t-Lr$i%M4Y|$XfJI~hTs@V8^-hyAmYl&KaB6Y@ z8I5bN4E)I-1*5$=HIxCxs8T zG$?7}Gyj!q%4Qg_gs2=~V00;WZSeCGLnh*-t%cuXUM`Bi{}e}U26%A`faxU4?Xp_? z`Q;f_)O5mC9q#IPps!b-fxae9Q=*4z5nBQ#?&WgZ9x;}!wJs42fIQ@3**)CDcf~_2 znE$ajr@x;8Pp1J0W<2ELeJr0dxGItHy5PM55J`U&`c8@&FYPi+m|*b$E?fav_+{Ya zg_s+bZlB1+0G<)(1pjy|PUy+Hp_=c@Fo4JE0wZeKPzTTn*nyMseIIB4xR=<68h~(< z3wUW_;LwX0^2MS$!lj4?l8|G}5}2Lm>BF{U5+_VNt~UWHltz-6&GQIir_m6_)BVO0 zl5?lB(7i3i$^>B@v_aV4Un{$^IuT6M304EyEBl|%h^PTtTvo9i;3HK77ruBXpY>9h zt_0lO``c@ULvQ0c&!V@0n&{8a_4j+rd)CmK3(#V8(GBw}=U60Sq@lB6<3V3QTsltF zE~D`FKCj5Yy?Lvdn?q?5_wbDh$AkOWld3&<*ztz9l}DK1eSaqhHTBOT#+x`Q#~QL{ zq&dN;WGZL&eGCo*sGJ5TMhO0-|Kt11IKNguoq891I0n#%Yk*Z`muc$>xN9g7!i@FN zS2WX;0GTFAzd-+9l`D=NYXPU*9)MV~{p$sO5qXbPs9P*RJYIQ^nps?~@l7Evp}h6L z-FpJE2c)mQ$DbYW3MJR}S(}axrl28b5(O6L_~9u*@S6!RM)8&byS(4zFgK^pVokjP zCog58>IrcUfCY3dQrkYZi)_R5J(H!sXimKEd0DrK+iJ{zOaW)iyQY#y&DOxO(%oVZ zGx~G;7_pdcOXFZ0}Dd@CbLlQBM|gZd!4X6zIRLlNs5$f0>42)djov$&TAy# z`L(WRfx!{4aJa)`Z4MA(HsEwNBIEDn+j3%4K&V&S7;T@~1+16w5^eG^hbo4V1UQiM zY_44XVmM}*fPL?U(HiKEW4xI66@#2sKcO99b{*6nXCu^Eth!yMYa3Le`T43$4% zAXbsm3;-iIi^&iHPNnea69D$B@dplst_={$wABH4lc)0T#~=cN;UBMsA@mW3nx4zUHt`Gh!C4^Ifb*pb4vLC# z2h*nMZ-F&5A567ew!7uWQ%UyPb)8Wzi#fSa&Tfq8Ng>=11Hs}NrOQ!ArBg}M|i*o}X z_UZe^0fH4AKmnZwR29l=G?rhze3zu9{T%_=&VTh2#bk>hP_O^}W~Ty(SKJF@JPvuQ zEt_0UnZe1rfC0}1xV9wrXVmUFc*ZnIC&MpQ^n6-xi6-{1r-}woB?nj3kOi2dO*K{Q z{o|(^qMBb%Bc$5xc(k#3wd-liF<4xJ86yiE?Z+Ae1KV%#Y9vUwV1RAvyzS=SPyUw zL{AoeH4m7yQ6eE#RpBeuAEyR}5CxgX!)zLb{Ra}OH(bX@Ke=W(9#euiS)$MX0e2Bi zzPI5+KpI>7RgUQd6C@Hdj`s80GP1u(vH%`UGckSo<{!fg?(&4JSK-K?XxpP;ENYZl zRrfD&$LNCTre3ov_RpPz^uZcMPpEgBE2aj7{ZO7NK$8jrp{6-FvBRamh?KJuP{w}x zZJ67G^O(dsIeez+@*zlzFE>D-d2BD9p`X3(-2)_n4&ZofwpYelRlpf32>`lmdaE9c zF=B+`l*A13Gg06MlD+R#@4rjN3K=lm>mD@%`KxXToHB(SEnEy{jOX>cyV8)C)J5$lh z6#T;g+qPMZ1%pH*mRq?{Mqy`yRC>KzyvW8DBse6&`ltWqy3pUb#$(G(q-mwo9pkXt z0i{90)c;rHKh&_l`t3ZZ;<}R6Qgu29AYYPM>>9}aP7WTAiPQh^wEnMNASgk|v=dn4 zM{m8B1~mcvlD@15;5Y@8BN*cWExPOKoLm-{tl*!p1a%t(^)|quwr&9icU%xDMJrl_ zmHXK(@1r#{IHVbKuKt0FwHR07Teo7X-F_`FealATTQQ1zZg> z@xLDX^5w4$aKEIRPqRw3LCB(n$)DXI3gt`N*qwyeKyKq7%pj-P#O9D?BCBgXmVfZN zZ;#|qJ2=@XQXto~>PhL~0>Dwe2>uhD|75%fczE;i51<8xMadN|VM#gr_^DQO)l~c#g?;40pKAcEG-sVhhBH}Xv@qKMFl;aiiO@PW40Xni8tGq(8gALKO9yd~lEMRY0>PN|=;AucA2AFN(T#dxn zwL${GJxBv91g+{nhA#j8_W(!(DzPdRGf)3#>fpbB{4Z&;##)fTOgxiS|Bya91 pDeC`sQ2ovz^eCOfw{{h*7Ey(}? 
literal 0 HcmV?d00001 From 1d80c62025c188c529ea5ce98f6873958d1d1534 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 10 May 2023 18:58:24 -0600 Subject: [PATCH 34/62] Allow indices exchange via distributed (#6618) (#6624) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Mikołaj Błaż Co-authored-by: mikolajblaz --- .../conf/megatron_gpt_config.yaml | 1 + .../language_modeling/megatron/gpt_dataset.py | 37 +++++++++++++------ 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 67999548e8da..d502f255bd8e 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -195,6 +195,7 @@ model: no_seqlen_plus_one_input_tokens: False # Set to True to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size shuffle_documents: True # Set to False to disable documents shuffling. Sample index will still be shuffled + exchange_indices_distributed: False # Set to True to exchange indices via torch.distributed instead of filesystem # Nsys profiling options nsys_profile: diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_dataset.py index d2aa5182b716..cf1de245d0e7 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_dataset.py @@ -329,6 +329,7 @@ def __init__( if self.no_seqlen_plus_one_input_tokens: self.add_extra_token = 0 self.shuffle_documents = cfg.data.get('shuffle_documents', True) + self.exchange_indices_distributed = cfg.data.get('exchange_indices_distributed', False) # save index mappings to a configurable dir self.index_mapping_dir = cfg.data.get('index_mapping_dir', None) @@ -353,6 +354,7 @@ def __init__( drop_last=drop_last, add_extra_token=self.add_extra_token, shuffle_documents=self.shuffle_documents, + exchange_indices_distributed=self.exchange_indices_distributed, ) deallocate_indexed_dataset_memory(self.indexed_dataset) @@ -544,6 +546,7 @@ def _build_index_mappings( drop_last: bool = True, add_extra_token: int = 1, shuffle_documents: bool = True, + exchange_indices_distributed: bool = False, ): """Build doc-idx, sample-idx, and shuffle-idx. doc-idx: is an array (ordered) of documents to be used in training. @@ -572,12 +575,13 @@ def _build_index_mappings( # Build the indexed mapping if not exist. if torch.distributed.get_rank() == 0: + using_cached_indices = True if ( (not os.path.isfile(doc_idx_filename)) or (not os.path.isfile(sample_idx_filename)) or (not os.path.isfile(shuffle_idx_filename)) ): - + using_cached_indices = False logging.info(' > WARNING: could not find index map files, building ' 'the indices on rank 0 ...') # For the last epoch, decide whether include the entire epoch @@ -677,17 +681,26 @@ def _build_index_mappings( // torch.distributed.get_world_size(group=parallel_state.get_tensor_model_parallel_group()) ) - # Load mappings. 
- start_time = time.time() - logging.info(' > loading doc-idx mapping from {}'.format(doc_idx_filename)) - doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode='r') - logging.info(' > loading sample-idx mapping from {}'.format(sample_idx_filename)) - sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode='r') - logging.info(' > loading shuffle-idx mapping from {}'.format(shuffle_idx_filename)) - shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode='r') - logging.info(' loaded indexed file in {:3.3f} seconds'.format(time.time() - start_time)) - logging.info(' total number of samples: {}'.format(sample_idx.shape[0])) - logging.info(' total number of epochs: {}'.format(num_epochs)) + if not exchange_indices_distributed or (torch.distributed.get_rank() == 0 and using_cached_indices): + # Load mappings. + start_time = time.time() + logging.info(' > loading doc-idx mapping from {}'.format(doc_idx_filename)) + doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode='r') + logging.info(' > loading sample-idx mapping from {}'.format(sample_idx_filename)) + sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode='r') + logging.info(' > loading shuffle-idx mapping from {}'.format(shuffle_idx_filename)) + shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode='r') + logging.info(' loaded indexed file in {:3.3f} seconds'.format(time.time() - start_time)) + logging.info(' total number of samples: {}'.format(sample_idx.shape[0])) + logging.info(' total number of epochs: {}'.format(num_epochs)) + + if exchange_indices_distributed: + if torch.distributed.get_rank() == 0: + indices = [(doc_idx, sample_idx, shuffle_idx)] + else: + indices = [None] + torch.distributed.broadcast_object_list(indices) + doc_idx, sample_idx, shuffle_idx = indices[0] return doc_idx, sample_idx, shuffle_idx From 3fb6b874a32450d24dd87403fde78199a8b46fa8 Mon Sep 17 00:00:00 2001 From: fayejf <36722593+fayejf@users.noreply.github.com> Date: Wed, 10 May 2023 18:31:03 -0700 Subject: [PATCH 35/62] Offline and streaming inference support for hybrid model (#6570) * streaming buffered for hybrid + ctc Signed-off-by: fayejf * change default model_stride in eval.yaml Signed-off-by: fayejf * add fc model_stride Signed-off-by: fayejf * small fix Signed-off-by: fayejf * check whether model and decoding match Signed-off-by: fayejf * small fix Signed-off-by: fayejf * streaming buffered for hybrid + rnnt Signed-off-by: fayejf * style fix Signed-off-by: fayejf * fix yaml Signed-off-by: fayejf * reflect comment wip Signed-off-by: fayejf * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix Signed-off-by: fayejf * refactor and verified Signed-off-by: fayejf * add get_full_path to buffered Signed-off-by: fayejf * small fix Signed-off-by: fayejf * add RNNTDecodingConfig Signed-off-by: fayejf * model name & instruction of changing decoding Signed-off-by: fayejf --------- Signed-off-by: fayejf Signed-off-by: fayejf <36722593+fayejf@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../ctc/speech_to_text_buffered_infer_ctc.py | 29 ++++++- .../speech_to_text_buffered_infer_rnnt.py | 46 ++++++++---- examples/asr/transcribe_speech.py | 16 +++- .../asr/parts/utils/streaming_utils.py | 12 ++- .../asr/parts/utils/transcribe_utils.py | 6 +- tools/asr_evaluator/conf/eval.yaml | 4 +- tools/asr_evaluator/utils.py | 75 +++++++++++++++---- 7 files changed, 
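151 insertions(+), 37 deletions(-)

A usage sketch (not part of the patch) of the hybrid decoder switch this change enables. The checkpoint name and audio path are illustrative assumptions, and passing decoding_cfg=None is assumed to fall back to the model's internal decoding config:

import nemo.collections.asr as nemo_asr

# Assumed example checkpoint from the FastConformer hybrid family.
asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(
    model_name="stt_en_fastconformer_hybrid_large_pc"
)

# Hybrid checkpoints carry both an RNNT and a CTC head; decoder_type selects
# which head the decoding strategy (and transcription) uses.
asr_model.change_decoding_strategy(decoding_cfg=None, decoder_type="ctc")
print(asr_model.transcribe(["sample.wav"]))  # hypothetical audio file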
diff --git a/examples/asr/asr_chunked_inference/ctc/speech_to_text_buffered_infer_ctc.py b/examples/asr/asr_chunked_inference/ctc/speech_to_text_buffered_infer_ctc.py
index 69ea139d2ed6..dd801ddb37f2 100644
--- a/examples/asr/asr_chunked_inference/ctc/speech_to_text_buffered_infer_ctc.py
+++ b/examples/asr/asr_chunked_inference/ctc/speech_to_text_buffered_infer_ctc.py
@@ -48,6 +48,7 @@
 from omegaconf import OmegaConf

 from nemo.collections.asr.metrics.wer import CTCDecodingConfig
+from nemo.collections.asr.models import EncDecCTCModel, EncDecHybridRNNTCTCModel
 from nemo.collections.asr.parts.utils.eval_utils import cal_write_wer
 from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchASR
 from nemo.collections.asr.parts.utils.transcribe_utils import (
@@ -78,10 +79,16 @@ class TranscriptionConfig:
     pred_name_postfix: Optional[str] = None  # If you need to use another model name, rather than standard one.
     random_seed: Optional[int] = None  # seed number going to be used in seed_everything()

+    # Set to True to output greedy timestamp information (only supported models)
+    compute_timestamps: bool = False
+
+    # Set to True to output language ID information
+    compute_langs: bool = False
+
     # Chunked configs
     chunk_len_in_secs: float = 1.6  # Chunk length in seconds
     total_buffer_in_secs: float = 4.0  # Length of buffer (chunk + left and right padding) in seconds
-    model_stride: int = 8  # Model downsampling factor, 8 for Citrinet models and 4 for Conformer models",
+    model_stride: int = 8  # Model downsampling factor, 8 for Citrinet and FastConformer models and 4 for Conformer models.

     # Decoding strategy for CTC models
     decoding: CTCDecodingConfig = CTCDecodingConfig()
@@ -108,6 +115,9 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
     logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
     torch.set_grad_enabled(False)

+    for key in cfg:
+        cfg[key] = None if cfg[key] == 'None' else cfg[key]
+
     if is_dataclass(cfg):
         cfg = OmegaConf.structured(cfg)

@@ -174,6 +184,23 @@ def autocast():
         )
         return cfg

+    # Setup decoding strategy
+    if hasattr(asr_model, 'change_decoding_strategy'):
+        if not isinstance(asr_model, EncDecCTCModel) and not isinstance(asr_model, EncDecHybridRNNTCTCModel):
+            raise ValueError("The script supports CTC models and hybrid models with CTC decoding!")
+
+        else:
+            if cfg.compute_langs:
+                raise ValueError("CTC models do not support `compute_langs` at the moment.")
+
+            if hasattr(
+                asr_model, 'cur_decoder'
+            ):  # hybrid model with ctc decoding or potential other models containing decoding switch feature
+                asr_model.change_decoding_strategy(cfg.decoding, decoder_type='ctc')
+
+            else:  # ctc model
+                asr_model.change_decoding_strategy(cfg.decoding)
+
     asr_model.eval()
     asr_model = asr_model.to(asr_model.device)

diff --git a/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py b/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py
index 385a29b8f417..07f7effb85f8 100644
--- a/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py
+++ b/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py
@@ -67,7 +67,8 @@
 import pytorch_lightning as pl
 import torch
 from omegaconf import OmegaConf, open_dict
-
+from nemo.collections.asr.metrics.rnnt_wer import RNNTDecodingConfig
+from nemo.collections.asr.models import EncDecHybridRNNTCTCModel, EncDecRNNTModel
 from nemo.collections.asr.parts.utils.eval_utils import cal_write_wer
 from
nemo.collections.asr.parts.utils.streaming_utils import ( BatchedFrameASRRNNT, @@ -101,10 +102,16 @@ class TranscriptionConfig: pred_name_postfix: Optional[str] = None # If you need to use another model name, rather than standard one. random_seed: Optional[int] = None # seed number going to be used in seed_everything() + # Set to True to output greedy timestamp information (only supported models) + compute_timestamps: bool = False + + # Set to True to output language ID information + compute_langs: bool = False + # Chunked configs chunk_len_in_secs: float = 1.6 # Chunk length in seconds total_buffer_in_secs: float = 4.0 # Length of buffer (chunk + left and right padding) in seconds - model_stride: int = 8 # Model downsampling factor, 8 for Citrinet models and 4 for Conformer models + model_stride: int = 8 # Model downsampling factor, 8 for Citrinet and FastConformer models and 4 for Conformer models. # Set `cuda` to int to define CUDA device. If 'None', will look for CUDA # device anyway, and do inference on CPU only if CUDA device is not found. @@ -115,6 +122,9 @@ class TranscriptionConfig: # Recompute model transcription, even if the output folder exists with scores. overwrite_transcripts: bool = True + # Decoding strategy for RNNT models + decoding: RNNTDecodingConfig = RNNTDecodingConfig() + # Decoding configs max_steps_per_timestep: int = 5 #'Maximum number of tokens decoded per acoustic timestep' stateful_decoding: bool = False # Whether to perform stateful decoding @@ -135,6 +145,9 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig: logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') torch.set_grad_enabled(False) + for key in cfg: + cfg[key] = None if cfg[key] == 'None' else cfg[key] + if is_dataclass(cfg): cfg = OmegaConf.structured(cfg) @@ -195,20 +208,27 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig: asr_model = asr_model.to(asr_model.device) # Change Decoding Config - decoding_cfg = asr_model.cfg.decoding - with open_dict(decoding_cfg): + with open_dict(cfg.decoding): if cfg.stateful_decoding: - decoding_cfg.strategy = "greedy" + cfg.decoding.strategy = "greedy" else: - decoding_cfg.strategy = "greedy_batch" - decoding_cfg.preserve_alignments = True # required to compute the middle token for transducers. - decoding_cfg.fused_batch_size = -1 # temporarily stop fused batch during inference. - decoding_cfg.beam.return_best_hypothesis = True - - asr_model.change_decoding_strategy(decoding_cfg) + cfg.decoding.strategy = "greedy_batch" + cfg.decoding.preserve_alignments = True # required to compute the middle token for transducers. + cfg.decoding.fused_batch_size = -1 # temporarily stop fused batch during inference. 
+ cfg.decoding.beam.return_best_hypothesis = True # return and write the best hypothesis only + + # Setup decoding strategy + if hasattr(asr_model, 'change_decoding_strategy'): + if not isinstance(asr_model, EncDecRNNTModel) and not isinstance(asr_model, EncDecHybridRNNTCTCModel): + raise ValueError("The script supports rnnt models and hybrid models with rnnt decoding!") + else: + # rnnt model + if isinstance(asr_model, EncDecRNNTModel): + asr_model.change_decoding_strategy(cfg.decoding) - with open_dict(cfg): - cfg.decoding = decoding_cfg + # hybrid ctc rnnt model with decoder_type = rnnt + if hasattr(asr_model, 'cur_decoder'): + asr_model.change_decoding_strategy(cfg.decoding, decoder_type='rnnt') feature_stride = model_cfg.preprocessor['window_stride'] model_stride_in_secs = feature_stride * cfg.model_stride diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index 531b5c56aa4e..8c8d11132183 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -227,6 +227,17 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig: compute_timestamps = cfg.compute_timestamps compute_langs = cfg.compute_langs + # Check whether model and decoder type match + if isinstance(asr_model, EncDecCTCModel): + if cfg.decoder_type and cfg.decoder_type != 'ctc': + raise ValueError('CTC models only support ctc decoding!') + elif isinstance(asr_model, EncDecHybridRNNTCTCModel): + if cfg.decoder_type and cfg.decoder_type not in ['ctc', 'rnnt']: + raise ValueError('Hybrid models only support ctc or rnnt decoding!') + else: # rnnt model; other models may need to be addressed here. + if cfg.decoder_type and cfg.decoder_type != 'rnnt': + raise ValueError('RNNT models only support rnnt decoding!') + # Setup decoding strategy if hasattr(asr_model, 'change_decoding_strategy'): if cfg.decoder_type is not None: @@ -240,7 +251,10 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig: decoding_cfg.preserve_alignments = cfg.compute_timestamps if 'compute_langs' in decoding_cfg: decoding_cfg.compute_langs = cfg.compute_langs - asr_model.change_decoding_strategy(decoding_cfg, decoder_type=cfg.decoder_type) + if hasattr(asr_model, 'cur_decoder'): + asr_model.change_decoding_strategy(decoding_cfg, decoder_type=cfg.decoder_type) + else: + asr_model.change_decoding_strategy(decoding_cfg) # Check if ctc or rnnt model elif hasattr(asr_model, 'joint'): # RNNT model diff --git a/nemo/collections/asr/parts/utils/streaming_utils.py b/nemo/collections/asr/parts/utils/streaming_utils.py index b824bc18e770..9efb675b6175 100644 --- a/nemo/collections/asr/parts/utils/streaming_utils.py +++ b/nemo/collections/asr/parts/utils/streaming_utils.py @@ -769,9 +769,15 @@ def _get_batch_preds(self, keep_logits=False): feat_signal, feat_signal_len = batch feat_signal, feat_signal_len = feat_signal.to(device), feat_signal_len.to(device) - log_probs, encoded_len, predictions = self.asr_model( - processed_signal=feat_signal, processed_signal_length=feat_signal_len - ) + forward_outs = self.asr_model(processed_signal=feat_signal, processed_signal_length=feat_signal_len) + + if len(forward_outs) == 2: # hybrid ctc rnnt model + encoded, encoded_len = forward_outs + log_probs = self.asr_model.ctc_decoder(encoder_output=encoded) + predictions = log_probs.argmax(dim=-1, keepdim=False) + else: + log_probs, encoded_len, predictions = forward_outs + preds = torch.unbind(predictions) for pred in preds: self.all_preds.append(pred.cpu().numpy()) diff --git
a/nemo/collections/asr/parts/utils/transcribe_utils.py b/nemo/collections/asr/parts/utils/transcribe_utils.py index d7946aa2842b..990e3b96b0fc 100644 --- a/nemo/collections/asr/parts/utils/transcribe_utils.py +++ b/nemo/collections/asr/parts/utils/transcribe_utils.py @@ -58,7 +58,8 @@ def get_buffered_pred_feat_rnnt( print("Parsing manifest files...") for l in mfst_f: row = json.loads(l.strip()) - filepaths.append(row['audio_filepath']) + audio_file = get_full_path(audio_file=row['audio_filepath'], manifest_file=manifest) + filepaths.append(audio_file) if 'text' in row: refs.append(row['text']) @@ -149,8 +150,9 @@ def get_buffered_pred_feat( row = json.loads(l.strip()) if 'text' in row: refs.append(row['text']) + audio_file = get_full_path(audio_file=row['audio_filepath'], manifest_file=manifest) # do not support partial audio - asr.read_audio_file(row['audio_filepath'], delay, model_stride_in_secs) + asr.read_audio_file(audio_file, delay, model_stride_in_secs) hyp = asr.transcribe(tokens_per_chunk, delay) hyps.append(hyp) diff --git a/tools/asr_evaluator/conf/eval.yaml b/tools/asr_evaluator/conf/eval.yaml index 176392b9c070..fe2d74507903 100644 --- a/tools/asr_evaluator/conf/eval.yaml +++ b/tools/asr_evaluator/conf/eval.yaml @@ -12,12 +12,14 @@ engine: mode: offline # choose from offline, chunked or offline_by_chunked chunk_len_in_secs: 1.6 #null # Need to specify if use buffered inference (default for offline_by_chunked is 20) total_buffer_in_secs: 4 #null # Need to specify if use buffered inference (default for offline_by_chunked is 22) - model_stride: 4 # Model downsampling factor, 8 for Citrinet models and 4 for Conformer models + model_stride: 8 # Model downsampling factor, 8 for Citrinet and FastConformer models, and 4 for Conformer models decoder_type: null # Used for hybrid CTC RNNT model only. Specify decoder_type *ctc* or *rnnt* for hybrid CTC RNNT model. 
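# A back-of-the-envelope sketch of how chunk_len_in_secs, total_buffer_in_secs
# and model_stride interact; the values below are assumed, and the formulas
# mirror the arithmetic the buffered-inference scripts derive from the
# preprocessor's window_stride.
import math

window_stride = 0.01    # feature stride in seconds (assumed)
model_stride = 8        # Citrinet / FastConformer downsampling factor
chunk_len_in_secs = 1.6
total_buffer_in_secs = 4.0

model_stride_in_secs = window_stride * model_stride  # 0.08 s of audio per encoder frame
tokens_per_chunk = math.ceil(chunk_len_in_secs / model_stride_in_secs)  # 20 frames
mid_delay = math.ceil(
    (chunk_len_in_secs + (total_buffer_in_secs - chunk_len_in_secs) / 2) / model_stride_in_secs
)  # 35 frames: where the chunk ends inside the centered buffer
print(tokens_per_chunk, mid_delay)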
+ test_ds: manifest_filepath: null sample_rate: 16000 batch_size: 32 + num_workers: 4 augmentor: silence: diff --git a/tools/asr_evaluator/utils.py b/tools/asr_evaluator/utils.py index 84f4bdb62364..8fd2ebb224c3 100644 --- a/tools/asr_evaluator/utils.py +++ b/tools/asr_evaluator/utils.py @@ -28,6 +28,9 @@ def run_asr_inference(cfg: DictConfig) -> DictConfig: if (cfg.model_path and cfg.pretrained_name) or (not cfg.model_path and not cfg.pretrained_name): raise ValueError("Please specify either cfg.model_path or cfg.pretrained_name!") + if cfg.inference.decoder_type not in [None, 'ctc', 'rnnt']: + raise ValueError("decoder_type can only be null, ctc or rnnt") + if cfg.inference.mode == "offline": cfg = run_offline_inference(cfg) @@ -67,6 +70,7 @@ def run_asr_inference(cfg: DictConfig) -> DictConfig: def run_chunked_inference(cfg: DictConfig) -> DictConfig: + if "output_filename" not in cfg or not cfg.output_filename: if cfg.model_path: model_name = Path(cfg.model_path).stem @@ -93,10 +97,43 @@ def run_chunked_inference(cfg: DictConfig) -> DictConfig: / "ctc" / "speech_to_text_buffered_infer_ctc.py" ) + use_rnnt_script = False + # hybrid model + if (cfg.pretrained_name and 'hybrid' in cfg.pretrained_name.lower()) or ( + cfg.model_path and 'hybrid' in cfg.model_path.lower() + ): + if cfg.inference.decoder_type != 'ctc': + use_rnnt_script = True + # rnnt model + elif ( + (cfg.pretrained_name and 'rnnt' in cfg.pretrained_name.lower()) + or (cfg.pretrained_name and 'transducer' in cfg.pretrained_name.lower()) + or (cfg.model_path and 'rnnt' in cfg.model_path.lower()) + or (cfg.model_path and 'transducer' in cfg.model_path.lower()) + ): + if cfg.inference.decoder_type and cfg.inference.decoder_type != 'rnnt': + raise ValueError( + f"rnnt models only support rnnt decoding! Current decoder_type: {cfg.inference.decoder_type}! Change it to null or rnnt for rnnt models" + ) + use_rnnt_script = True - if (cfg.pretrained_name and 'transducer' in cfg.pretrained_name) or ( - cfg.model_path and 'transducer' in cfg.model_path + # ctc model + elif (cfg.pretrained_name and 'ctc' in cfg.pretrained_name.lower()) or ( + cfg.pretrained_name and 'ctc' in cfg.pretrained_name.lower() + ): + if cfg.inference.decoder_type and cfg.inference.decoder_type != 'ctc': + raise ValueError( + f"ctc models only support ctc decoding! Current decoder_type: {cfg.inference.decoder_type}! Change it to null or ctc for ctc models" + ) + else: + raise ValueError( + "Please make sure your pretrained_name or model_path contains \n\ + 'hybrid' for EncDecHybridRNNTCTCModel, \n\ + 'transducer/rnnt' for EncDecRNNTModel or \n\ + 'ctc' for EncDecCTCModel."
+ ) + + if use_rnnt_script: script_path = ( Path(__file__).parents[2] / "examples" @@ -106,20 +143,25 @@ def run_chunked_inference(cfg: DictConfig) -> DictConfig: / "speech_to_text_buffered_infer_rnnt.py" ) + # If you need to change other config such as decoding strategy, you can either: + # 1) change TranscriptionConfig on top of the executed scripts such as speech_to_text_buffered_infer_rnnt.py, or + # 2) add command as "decoding.strategy=greedy_batch " to below script + + base_cmd = f"python {script_path} \ + calculate_wer=False \ + model_path={cfg.model_path} \ + pretrained_name={cfg.pretrained_name} \ + dataset_manifest={cfg.test_ds.manifest_filepath} \ + output_filename={cfg.output_filename} \ + random_seed={cfg.random_seed} \ + batch_size={cfg.test_ds.batch_size} \ + num_workers={cfg.test_ds.num_workers} \ + chunk_len_in_secs={cfg.inference.chunk_len_in_secs} \ + total_buffer_in_secs={cfg.inference.total_buffer_in_secs} \ + model_stride={cfg.inference.model_stride} " + subprocess.run( - f"python {script_path} " - f"calculate_wer=False " - f"model_path={cfg.model_path} " - f"pretrained_name={cfg.pretrained_name} " - f"dataset_manifest={cfg.test_ds.manifest_filepath} " - f"output_filename={cfg.output_filename} " - f"random_seed={cfg.random_seed} " - f"batch_size={cfg.test_ds.batch_size} " - f"chunk_len_in_secs={cfg.inference.chunk_len_in_secs} " - f"total_buffer_in_secs={cfg.inference.total_buffer_in_secs} " - f"model_stride={cfg.inference.model_stride} ", - shell=True, - check=True, + base_cmd, shell=True, check=True, ) return cfg @@ -142,7 +184,7 @@ def run_offline_inference(cfg: DictConfig) -> DictConfig: f.seek(0) # reset file pointer script_path = Path(__file__).parents[2] / "examples" / "asr" / "transcribe_speech.py" - # If need to move other config such as decoding strategy, could either: + # If you need to change other config such as decoding strategy, you can either: # 1) change TranscriptionConfig on top of the executed scripts such as transcribe_speech.py in examples/asr, or # 2) add command as "rnnt_decoding.strategy=greedy_batch " to below script subprocess.run( @@ -153,6 +195,7 @@ def run_offline_inference(cfg: DictConfig) -> DictConfig: f"dataset_manifest={cfg.test_ds.manifest_filepath} " f"output_filename={cfg.output_filename} " f"batch_size={cfg.test_ds.batch_size} " + f"num_workers={cfg.test_ds.num_workers} " f"random_seed={cfg.random_seed} " f"eval_config_yaml={f.name} " f"decoder_type={cfg.inference.decoder_type} ", From 0ccb9440532a0ccac239e11d150b13462fc6e2ed Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 10 May 2023 20:23:25 -0700 Subject: [PATCH 36/62] Patch decoding for PC models (#6630) (#6631) * Patch decoding logic for PC models * Patch decoding logic for PC models --------- Signed-off-by: smajumdar Co-authored-by: Somshubra Majumdar --- nemo/collections/asr/metrics/rnnt_wer.py | 5 +++++ nemo/collections/asr/metrics/wer.py | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/nemo/collections/asr/metrics/rnnt_wer.py b/nemo/collections/asr/metrics/rnnt_wer.py index 00cacbf863d4..0634a45f6a23 100644 --- a/nemo/collections/asr/metrics/rnnt_wer.py +++ b/nemo/collections/asr/metrics/rnnt_wer.py @@ -13,6 +13,7 @@ # limitations under the License.
import copy +import re from abc import abstractmethod from dataclasses import dataclass, is_dataclass from typing import Callable, Dict, List, Optional, Tuple, Union @@ -499,6 +500,10 @@ def decode_hypothesis(self, hypotheses_list: List[Hypothesis]) -> List[Union[Hyp else: hypothesis = self.decode_tokens_to_str(prediction) + # TODO: remove + # collapse leading spaces before . , ? for PC models + hypothesis = re.sub(r'(\s+)([\.\,\?])', r'\2', hypothesis) + if self.compute_hypothesis_token_set: hypotheses_list[ind].tokens = self.decode_ids_to_tokens(prediction) diff --git a/nemo/collections/asr/metrics/wer.py b/nemo/collections/asr/metrics/wer.py index faeef5d3d477..4bbeba7624ae 100644 --- a/nemo/collections/asr/metrics/wer.py +++ b/nemo/collections/asr/metrics/wer.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import re from abc import abstractmethod from dataclasses import dataclass, is_dataclass from typing import Callable, Dict, List, Optional, Tuple, Union @@ -540,6 +541,10 @@ def decode_hypothesis( else: hypothesis = self.decode_tokens_to_str(decoded_prediction) + # TODO: remove + # collapse leading spaces before . , ? for PC models + hypothesis = re.sub(r'(\s+)([\.\,\?])', r'\2', hypothesis) + # Preserve this wrapped hypothesis or decoded text tokens. hypotheses_list[ind].text = hypothesis From 8cd5ed6b7c8c17df873b1ff2bdb379cccebbbc50 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 11 May 2023 09:02:07 -0700 Subject: [PATCH 37/62] Fix wer.py where 'errors' variable was not set (#6633) (#6634) Fix wer.py where 'errors' variable was not set when both reference and hypothesis are empty strings Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> --- nemo/collections/asr/metrics/wer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nemo/collections/asr/metrics/wer.py b/nemo/collections/asr/metrics/wer.py index 4bbeba7624ae..d9b745cbc940 100644 --- a/nemo/collections/asr/metrics/wer.py +++ b/nemo/collections/asr/metrics/wer.py @@ -114,6 +114,8 @@ def word_error_rate_detail( if len(h_list) != 0: errors = len(h_list) ops_count['insertions'] += errors + else: + errors = 0 else: if use_cer: measures = jiwer.cer(r, h, return_dict=True) From 4d8651d53880092f00967691bcc5a18647d89e52 Mon Sep 17 00:00:00 2001 From: Tim Moon <4406448+timmoon10@users.noreply.github.com> Date: Thu, 11 May 2023 10:00:28 -0700 Subject: [PATCH 38/62] Restore GPT support for interleaved pipeline parallelism (#6528) (#6613) * Restore logic for data-parallel communication with pipeline parallelism in GPT * Support dynamic attention masks in GPT * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Debug typos * Debug data iterator caching with interleaved pipeline parallelism Each model chunk accesses the data iterator multiple times, so we need to cache multiple samples. 
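A toy, self-contained illustration of the caching idea described in the bullet above (names here are invented for the sketch; the actual implementation is the CachingIterator added to megatron_gpt_model.py later in this patch):

import queue

class Proxy:
    # Replays values cached by the driving iterator; assumed never to run ahead of it.
    def __init__(self):
        self.cache = queue.Queue()

    def __next__(self):
        return self.cache.get_nowait()

source = iter(range(3))       # stands in for the real data iterator
proxies = [Proxy(), Proxy()]  # one per additional model chunk

def pull():
    # The first model chunk drives the real iterator and fans each sample
    # out to every proxy, so every chunk sees every sample exactly once.
    val = next(source)
    for proxy in proxies:
        proxy.cache.put(val)
    return val

assert pull() == 0
assert next(proxies[0]) == 0 and next(proxies[1]) == 0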
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update Megatron-LM commit * Distinguish between list of data iterators and data iterator that is a list * Create dummy iters to satisfy len checks * Kludge while waiting for Megatron-LM update * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * set transformers offline to avoid rate limiting --------- Signed-off-by: Tim Moon Signed-off-by: Eric Harper Signed-off-by: Abhinav Khattar Signed-off-by: ericharper Signed-off-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper Co-authored-by: Abhinav Khattar --- Jenkinsfile | 42 ++--- .../language_modeling/megatron_gpt_model.py | 151 ++++++++++++------ .../modules/common/megatron/build_model.py | 49 +++--- 3 files changed, 150 insertions(+), 92 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 5edfa05b8d46..dce112c33598 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -2,7 +2,7 @@ pipeline { agent { docker { image 'pytorch_23.03:apex_57057e2fcf1c084c0fcc818f55c0ff6ea1b24ae2' - args '--device=/dev/nvidia0 --gpus all --user 0:128 -v /home/TestData:/home/TestData -v $HOME/.cache:/root/.cache --shm-size=8g' + args '--device=/dev/nvidia0 --gpus all --user 0:128 -v /home/TestData:/home/TestData -v $HOME/.cache:/root/.cache --shm-size=8g --env TRANSFORMERS_OFFLINE=1' } } options { @@ -1014,7 +1014,7 @@ pipeline { // TODO: pleasefixme @redoctopus // stage('ByT5G2P training, evaluation and inference') { // steps { - // sh 'TRANSFORMERS_OFFLINE=0 && cd examples/tts/g2p && \ + // sh 'TRANSFORMERS_OFFLINE=1 && cd examples/tts/g2p && \ // TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_T5=output_byt5_${TIME} && \ // python g2p_train_and_evaluate.py \ // train_manifest=/home/TestData/g2p/g2p.json \ @@ -1158,7 +1158,7 @@ pipeline { parallel { stage('Dialogue: Intent and slot classification using GPT') { steps { - sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue && \ + sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \ python dialogue.py \ model.dataset.data_dir=/home/TestData/nlp/sgd_small \ model.language_model.lm_checkpoint=/home/TestData/nlp/gpt2/pytorch_model.bin\ @@ -1185,7 +1185,7 @@ pipeline { } stage('Intent and slot classification using SGDQA') { steps { - sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue && \ + sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \ python dialogue.py \ model.dataset.data_dir=/home/TestData/nlp/sgd_small \ model.dataset.dialogues_example_dir=sgd_gen_bert_outputs \ @@ -1208,7 +1208,7 @@ pipeline { } stage('Intent and slot classification using IntentSlotClassificationModel') { steps { - sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue && \ + sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \ python dialogue.py \ model.dataset.data_dir=/home/TestData/nlp/processed_assistant \ model.dataset.dialogues_example_dir=sgd_gen_bert_intent_classification_outputs \ @@ -1230,7 +1230,7 @@ pipeline { } stage('Intent classification using ZeroShotIntentModel') { steps { - sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue && \ + sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \ python dialogue.py \ do_training=False \ model.dataset.data_dir=/home/TestData/nlp/drive_thru_revised \ @@ -1255,7 +1255,7 @@ pipeline { } stage('Design Intent classification using ZeroShotIntentModel') { steps { - sh
'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue && \ + sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \ python dialogue.py \ do_training=False \ model.dataset.data_dir=/home/TestData/nlp/design_dataset \ @@ -1281,7 +1281,7 @@ pipeline { } stage('Design Intent classification using ZeroShotIntentModel BART Classifier') { steps { - sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue && \ + sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \ python dialogue.py \ do_training=False \ model.dataset.data_dir=/home/TestData/nlp/design_dataset \ @@ -1300,7 +1300,7 @@ pipeline { } stage('Design Intent classification using DialogueNearestNeighbourModel') { steps { - sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue && \ + sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \ python dialogue.py \ do_training=False \ model.dataset.data_dir=/home/TestData/nlp/design_dataset \ @@ -1329,7 +1329,7 @@ pipeline { parallel { stage('Dialogue: Answer Extender using DialogueS2SGenerationModel') { steps { - sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue && \ + sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \ python dialogue.py \ do_training=False \ model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ @@ -1354,7 +1354,7 @@ pipeline { } stage('Dialogue: SGD Based Answer Extender using DialogueS2SGenerationModel') { steps { - sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue && \ + sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \ python dialogue.py \ do_training=False \ model.dataset.data_dir=/home/TestData/nlp/sgd_small \ @@ -1395,7 +1395,7 @@ pipeline { // parallel { // stage('Dialogue: Answer Extender using DialogueGPTGenerationModel') { // steps { -// sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue && \ +// sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \ // python dialogue.py \ // do_training=False \ // model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ @@ -1425,7 +1425,7 @@ pipeline { parallel { stage('Dialogue: Answer Extender using DialogueGPTGenerationModel') { steps { - sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue && \ + sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/dialogue && \ python dialogue.py \ do_training=False \ model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ @@ -1549,7 +1549,7 @@ pipeline { stage('BERT SQUAD 1.1') { // Cannot do fast_dev_run because squad needs whole dev dataset steps { - sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/question_answering && \ + sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/question_answering && \ python question_answering.py \ model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ model.dataset.use_cache=false \ @@ -1574,7 +1574,7 @@ pipeline { stage('BERT SQUAD 2.0') { // Cannot do fast_dev_run because squad needs whole dev dataset steps { - sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/question_answering && \ + sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/question_answering && \ python question_answering.py \ model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ model.dataset.use_cache=false \ @@ -1608,7 +1608,7 @@ pipeline { stage('BART SQUAD 1.1') { // Cannot do fast_dev_run because squad needs whole dev dataset steps { - sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/question_answering && \ + sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/question_answering && \ python question_answering.py \ model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ model.dataset.use_cache=false \ @@ -1634,7 
+1634,7 @@ pipeline { stage('BART SQUAD 2.0') { // Cannot do fast_dev_run because squad needs whole dev dataset steps { - sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/question_answering && \ + sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/question_answering && \ python question_answering.py \ model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ model.dataset.use_cache=false \ @@ -1669,7 +1669,7 @@ pipeline { stage('GPT2 SQUAD 1.1') { // Cannot do fast_dev_run because squad needs whole dev dataset steps { - sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/question_answering && \ + sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/question_answering && \ python question_answering.py \ model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ model.dataset.use_cache=false \ @@ -1695,7 +1695,7 @@ pipeline { stage('GPT2 SQUAD 2.0') { // Cannot do fast_dev_run because squad needs whole dev dataset steps { - sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/question_answering && \ + sh 'TRANSFORMERS_OFFLINE=1 && cd examples/nlp/question_answering && \ python question_answering.py \ model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ model.dataset.use_cache=false \ @@ -4016,7 +4016,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' sh "rm -rf examples/nlp/language_modeling/t5_index_mappings" } } - + stage('L2: Megatron T5 Prompt Learning TP1 PP1') { when { anyOf { @@ -4101,7 +4101,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' } } } - + // TODO: add when https://github.com/NVIDIA/apex/pull/1596 is merged // stage('L2: Megatron T5 Prompt Learning TP1 PP2') { // when { diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 9cb4efca57fc..b5f8b2b18f69 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -13,7 +13,9 @@ # limitations under the License. 
import itertools -from typing import Any, List, Optional, Union +import queue +from functools import partial +from typing import Any, Iterator, List, Optional, Union import numpy as np import torch @@ -68,6 +70,9 @@ from megatron.core import parallel_state from megatron.core.pipeline_parallel.schedules import get_forward_backward_func + # TODO @tmoon: Use once available in Megatron-LM + # from megatron.core.pipeline_parallel.schedules import DataIteratorList + HAVE_MEGATRON_CORE = True except (ImportError, ModuleNotFoundError): @@ -337,15 +342,24 @@ def forward(self, tokens, text_position_ids, attention_mask, labels): def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): tensor_shape = [self.cfg.encoder_seq_length, self.cfg.micro_batch_size, self.cfg.hidden_size] + # handle asynchronous grad reduction + no_sync_func = None + grad_sync_func = None + param_sync_func = None + if not forward_only and self.with_distributed_adam: + no_sync_func = partial(self._optimizer.no_sync, greedy_grad_copy=self.megatron_amp_o2,) + grad_sync_func = self.reduce_overlap_gradients + param_sync_func = self.sync_overlap_parameters + # run forward and backwards passes for an entire global batch # we do this inside training_step to support pipeline parallelism fwd_bwd_function = get_forward_backward_func() - # TODO @akhattar: remove sync related stuff from config, add num_micro_batches_with_partial_activation_checkpoints when ready + # TODO @akhattar: add num_micro_batches_with_partial_activation_checkpoints when ready losses_reduced_per_micro_batch = fwd_bwd_function( forward_step_func=self.get_forward_output_and_loss_func(), - data_iterator=dataloader_iter, - model=[self.model], + data_iterator=self._make_data_iterator_list(dataloader_iter), + model=self.model, num_microbatches=get_num_microbatches(), forward_only=forward_only, tensor_shape=tensor_shape, @@ -353,6 +367,9 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): grad_scaler=self.trainer.precision_plugin.scaler.scale if self.cfg.precision == 16 else None, sequence_parallel=self.cfg.get('sequence_parallel', False), enable_autocast=self.enable_autocast, + no_sync_func=no_sync_func, + grad_sync_func=grad_sync_func, + param_sync_func=param_sync_func, ) # only the last stages of the pipeline return losses @@ -556,44 +573,88 @@ def allreduce_first_last_embeddings(self): grad = word_embeddings_weight.grad torch.distributed.all_reduce(grad, group=parallel_state.get_embedding_group()) + def _make_data_iterator_list(self, data_iterator: Iterator) -> List[Iterator]: + """ Convert data iterator into form expected by Megatron + + With interleaved pipeline parallelism, Megatron expects a + list of one data iterator per model chunk. Each model + chunk independently gets data from its data iterator, so + we need to interact with the data iterator multiple times + for each microbatch step. Instead of incorporating this + logic into the data loader, we cache the iterator's output + to the first model chunk and reuse it in the other model + chunks. + """ + + if not isinstance(self.model, list) or len(self.model) == 1: + return data_iterator # TODO @tmoon: Remove + # TODO @tmoon: Use once available in Megatron-LM + # return DataIteratorList([data_iterator]) + + class CachingIterator: + """Iterator wrapper that caches values""" + + class Proxy: + """Returns values from caching iterator wrapper + + Assumed to never advance past the caching iterator. 
+ """ + + def __init__(self): + self.cache = queue.Queue() + + def __iter__(self): + return self + + def __next__(self): + return self.cache.get_nowait() + + def __init__(self, iterator: Iterator): + self.iterator = iterator + self.proxies = [] + + def make_proxy(self): + self.proxies.append(CachingIterator.Proxy()) + return self.proxies[-1] + + def __iter__(self): + return self + + def __next__(self): + val = next(self.iterator) + for proxy in self.proxies: + proxy.cache.put(val) + return val + + # Make list of iterator wrappers + iters = [CachingIterator(data_iterator)] + while len(iters) < len(self.model): + iters.append(iters[0].make_proxy()) + return iters # TODO @tmoon: Remove + # TODO @tmoon: Use once available in Megatron-LM + # return DataIteratorList(iters) + def get_forward_output_and_loss_func(self, validation_step=False): def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_layers=None): + + # Get data batch + batch = next(dataloader_iter) + + # Transfer needed data to GPU + required_keys = set() if parallel_state.get_pipeline_model_parallel_world_size() == 1: - batch = next(dataloader_iter) - for k in batch.keys(): - if self.get_attention_mask_from_fusion: - batch[k] = batch[k].cuda(non_blocking=True) if k not in ['attention_mask'] else None - else: - batch[k] = batch[k].cuda(non_blocking=True) + required_keys.update(batch.keys()) else: + required_keys.add('attention_mask') if parallel_state.is_pipeline_first_stage(): - batch = next(dataloader_iter) - # First pipeline stage needs tokens, position_ids, and attention_mask - for k in batch.keys(): - if self.get_attention_mask_from_fusion: - batch[k] = batch[k].cuda(non_blocking=True) if k in ['tokens', 'position_ids'] else None - else: - batch[k] = ( - batch[k].cuda(non_blocking=True) - if k in ['tokens', 'position_ids', 'attention_mask'] - else None - ) - elif parallel_state.is_pipeline_last_stage(): - batch = next(dataloader_iter) - # Last pipeline stage needs the labels, loss_mask, and attention_mask - for k in batch.keys(): - if self.get_attention_mask_from_fusion: - batch[k] = batch[k].cuda(non_blocking=True) if k in ['labels', 'loss_mask'] else None - else: - batch[k] = ( - batch[k].cuda(non_blocking=True) - if k in ['labels', 'loss_mask', 'attention_mask'] - else None - ) - else: - # Intermediate pipeline stage doesn't need any inputs - batch = {k: None for k in ['tokens', 'position_ids', 'attention_mask', 'labels']} - + required_keys.update(('tokens', 'position_ids')) + if parallel_state.is_pipeline_last_stage(): + required_keys.update(('labels', 'loss_mask')) + if self.get_attention_mask_from_fusion: + required_keys.remove('attention_mask') + batch = {key: val.cuda(non_blocking=True) if key in required_keys else None for key, val in batch.items()} + + # Model forward pass output_tensor = model( batch['tokens'], batch['position_ids'], @@ -1052,8 +1113,8 @@ def parameters(self): return self.model.parameters() def _reset_activation_checkpointing_args(self): - """ Disables activation checkpointing completely and saves the values so that - _restore_activation_checkpointing_args can restore them later. This function must always be + """ Disables activation checkpointing completely and saves the values so that + _restore_activation_checkpointing_args can restore them later. This function must always be called before _restore_activation_checkpointing_args. """ # Store values to restore them later. 
@@ -1076,8 +1137,8 @@ def _reset_activation_checkpointing_args(self): module.language_model.encoder.activations_checkpoint_layers_per_pipeline = None def _restore_activation_checkpointing_args(self): - """ Restores the activation checkpointing parameters using the values saved by - _reset_activation_checkpointing_args. This function must never be called before + """ Restores the activation checkpointing parameters using the values saved by + _reset_activation_checkpointing_args. This function must never be called before _reset_activation_checkpointing_args. """ # Restore config values. @@ -1096,8 +1157,8 @@ def _restore_activation_checkpointing_args(self): ) def _reset_sequence_parallelism_args(self): - """ Disables sequence parallelism completely and saves the values so that - _restore_sequence_parallelism_args can restore them later. This function must always be + """ Disables sequence parallelism completely and saves the values so that + _restore_sequence_parallelism_args can restore them later. This function must always be called before _restore_sequence_parallelism_args. """ # Store values to restore them later. @@ -1112,8 +1173,8 @@ def _reset_sequence_parallelism_args(self): module.language_model.encoder.sequence_parallel = None def _restore_sequence_parallelism_args(self): - """ Restores the sequence parallelism parameters using the values saved by - _reset_sequence_parallelism_args. This function must never be called before + """ Restores the sequence parallelism parameters using the values saved by + _reset_sequence_parallelism_args. This function must never be called before _reset_sequence_parallelism_args. """ # Restore config values. diff --git a/nemo/collections/nlp/modules/common/megatron/build_model.py b/nemo/collections/nlp/modules/common/megatron/build_model.py index 4c7790773d5b..929093405fce 100644 --- a/nemo/collections/nlp/modules/common/megatron/build_model.py +++ b/nemo/collections/nlp/modules/common/megatron/build_model.py @@ -74,28 +74,25 @@ def build_model( and virtual_pipeline_model_parallel_size is not None ): model = [] + parallel_state.set_virtual_pipeline_model_parallel_world_size(virtual_pipeline_model_parallel_size) for i in range(virtual_pipeline_model_parallel_size): - cur_args = args - cur_kwargs = kwargs parallel_state.set_virtual_pipeline_model_parallel_rank(i) - # Set pre_process and post_process only after virtual rank is set. 
- pre_process = parallel_state.is_pipeline_first_stage() - post_process = parallel_state.is_pipeline_last_stage() - cur_kwargs.update( - {"pre_process": pre_process, "post_process": post_process,} + model.append( + model_provider_func( + *args, + **kwargs, + pre_process=parallel_state.is_pipeline_first_stage(), + post_process=parallel_state.is_pipeline_last_stage(), + ) ) - this_model = model_provider_func(*cur_args, **cur_kwargs) - model.append(this_model) else: - cur_args = args - cur_kwargs = kwargs if model_type == ModelType.encoder_or_decoder: - pre_process = parallel_state.is_pipeline_first_stage() - post_process = parallel_state.is_pipeline_last_stage() - cur_kwargs.update( - {"pre_process": pre_process, "post_process": post_process,} + model = model_provider_func( + *args, + **kwargs, + pre_process=parallel_state.is_pipeline_first_stage(), + post_process=parallel_state.is_pipeline_last_stage(), ) - model = model_provider_func(*cur_args, **cur_kwargs) elif model_type == ModelType.encoder_and_decoder: pre_process = parallel_state.is_pipeline_first_stage() post_process = parallel_state.is_pipeline_last_stage() @@ -111,23 +108,23 @@ def build_model( post_process = rank == (split_rank - 1) or rank == (world_size - 1) add_encoder = parallel_state.is_pipeline_stage_before_split() add_decoder = parallel_state.is_pipeline_stage_after_split() - cur_kwargs.update( - { - "pre_process": pre_process, - "post_process": post_process, - "add_encoder": add_encoder, - "add_decoder": add_decoder, - } + model = model_provider_func( + *args, + **kwargs, + pre_process=pre_process, + post_process=post_process, + add_encoder=add_encoder, + add_decoder=add_decoder, ) - model = model_provider_func(*cur_args, **cur_kwargs) else: raise ValueError(f"Unrecognized ModelType '{model_type}'") - model.model_type = model_type - if not isinstance(model, list): model = [model] + for model_module in model: + model_module.model_type = model_type + # Set tensor model parallel attributes if not set. # Only parameters that are already tensor model parallel have these # attributes set for them. 
We should make sure the default attributes From dfb61ebb16ea174c97569ba5ad9fa74c25c82506 Mon Sep 17 00:00:00 2001 From: fayejf <36722593+fayejf@users.noreply.github.com> Date: Thu, 11 May 2023 14:30:19 -0700 Subject: [PATCH 39/62] bugfix (#6636) Signed-off-by: fayejf --- tools/asr_evaluator/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/asr_evaluator/utils.py b/tools/asr_evaluator/utils.py index 8fd2ebb224c3..1702dc3caf53 100644 --- a/tools/asr_evaluator/utils.py +++ b/tools/asr_evaluator/utils.py @@ -119,7 +119,7 @@ def run_chunked_inference(cfg: DictConfig) -> DictConfig: # ctc model elif (cfg.pretrained_name and 'ctc' in cfg.pretrained_name.lower()) or ( - cfg.pretrained_name and 'ctc' in cfg.pretrained_name.lower() + cfg.model_path and 'ctc' in cfg.model_path.lower() ): if cfg.inference.decoder_type and cfg.inference.decoder_type != 'ctc': raise ValueError( From 149f02f94b99c161ee9a8679f75aa3a44ece32f4 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Thu, 11 May 2023 14:38:14 -0700 Subject: [PATCH 40/62] Disable interctc tests (#6638) Signed-off-by: Igor Gitman --- tests/collections/asr/test_asr_interctc_models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/collections/asr/test_asr_interctc_models.py b/tests/collections/asr/test_asr_interctc_models.py index c4022cdb49c7..ebed951d1d92 100644 --- a/tests/collections/asr/test_asr_interctc_models.py +++ b/tests/collections/asr/test_asr_interctc_models.py @@ -66,6 +66,7 @@ def squeezeformer_encoder_config() -> Dict: class TestInterCTCLoss: + @pytest.mark.pleasefixme @pytest.mark.unit @pytest.mark.parametrize( "model_class", [EncDecCTCModel, EncDecHybridRNNTCTCModel], From a7b4de28ebb22be394009a247d531be0195c38f2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 11 May 2023 17:15:31 -0600 Subject: [PATCH 41/62] Add megatron_core to requirements (#6639) (#6640) * add megatron_core to requirements * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: ericharper Co-authored-by: Eric Harper Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- Dockerfile | 5 ----- README.rst | 13 ++----------- requirements/requirements_nlp.txt | 1 + 3 files changed, 3 insertions(+), 16 deletions(-) diff --git a/Dockerfile b/Dockerfile index 4cbbf14314c9..d27ed857a88a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -43,11 +43,6 @@ RUN apt-get update && \ rm -rf /var/lib/apt/lists/* WORKDIR /workspace/ -# Install Megatron-core -RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \ - cd Megatron-LM && \ - git checkout 9f8bdeb4814ed61fbc9c7d5b39c7710e77b99754 && \ - pip install -e . WORKDIR /tmp/ # TODO: Remove once this Apex commit (2/24/23) is included in PyTorch diff --git a/README.rst b/README.rst index 929cc7f86abc..da24655d008f 100644 --- a/README.rst +++ b/README.rst @@ -236,8 +236,8 @@ Note that RNNT requires numba to be installed from conda. NeMo Megatron ~~~~~~~~~~~~~ -NeMo Megatron training requires NVIDIA Apex and Megatron-core to be installed. -Install them manually if not using the NVIDIA PyTorch container. +NeMo Megatron training requires NVIDIA Apex to be installed. +Install it manually if not using the NVIDIA PyTorch container. 
To install Apex, run @@ -248,15 +248,6 @@ To install Apex, run git checkout 57057e2fcf1c084c0fcc818f55c0ff6ea1b24ae2 pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" --global-option="--distributed_adam" --global-option="--deprecated_fused_adam" ./ -To install Megatron-core, run - -.. code-block:: bash - - git clone https://github.com/NVIDIA/Megatron-LM.git - cd Megatron-LM - git checkout 9f8bdeb4814ed61fbc9c7d5b39c7710e77b99754 - pip install -e . - It is highly recommended to use the NVIDIA PyTorch or NeMo container if having issues installing Apex or any other dependencies. While installing Apex, it may raise an error if the CUDA version on your system does not match the CUDA version torch was compiled with. diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 7283c3092000..0c3c42ba583f 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -11,6 +11,7 @@ ijson inflect jieba matplotlib>=3.3.2 +megatron_core==0.1.0 nltk>=3.6.5 numpy opencc From 32bea29857a8a8b1910bd7a3198bc90d76d1fcab Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 11 May 2023 17:19:52 -0600 Subject: [PATCH 42/62] Remove from jenkins (#6642) * Remove from jenkins (#6641) * add megatron_core to requirements Signed-off-by: ericharper * remove from jenkins Signed-off-by: ericharper --------- Signed-off-by: ericharper * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove dup Signed-off-by: ericharper --------- Signed-off-by: ericharper Co-authored-by: Eric Harper Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- Jenkinsfile | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index dce112c33598..b1774732007b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -57,16 +57,6 @@ pipeline { } } - // TODO: remove when pip package is available - stage('Megatron Core installation') { - steps { - sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \ - cd Megatron-LM && \ - git checkout 9f8bdeb4814ed61fbc9c7d5b39c7710e77b99754 && \ - pip install -e .' 
- } - } - stage('PyTorch Lightning version') { steps { sh 'python -c "import pytorch_lightning; print(pytorch_lightning.__version__)"' From f8e46a9e27535b10f9d9a2c0574a37e60c9727dd Mon Sep 17 00:00:00 2001 From: Adi Renduchintala Date: Thu, 11 May 2023 18:18:10 -0700 Subject: [PATCH 43/62] sft model can use this script for eval (#6637) * sft model can use this script for eval Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * please fix me Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * minor Signed-off-by: arendu --------- Signed-off-by: arendu Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../tuning/megatron_gpt_peft_eval.py | 23 +++++++++++++------ .../asr/test_asr_interctc_models.py | 1 + 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py b/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py index a9f6a110c210..8cccaa024396 100644 --- a/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py +++ b/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py @@ -34,6 +34,7 @@ GradScaler, MegatronHalfPrecisionPlugin, NLPDDPStrategy, + NLPSaveRestoreConnector, PEFTSaveRestoreConnector, PipelineMixedPrecisionPlugin, ) @@ -70,7 +71,6 @@ def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f"\n{OmegaConf.to_yaml(cfg)}") assert cfg.model.restore_from_path is not None - assert cfg.model.peft.restore_from_path is not None megatron_amp_o2 = cfg.model.get("megatron_amp_O2", False) with_distributed_adam = False @@ -100,9 +100,14 @@ def main(cfg) -> None: plugins.append(TorchElasticEnvironment()) trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) - peft_model_cfg = MegatronGPTPEFTModel.restore_from( - restore_path=cfg.model.peft.restore_from_path, trainer=trainer, return_config=True, - ) + if cfg.model.peft.restore_from_path: + peft_model_cfg = MegatronGPTPEFTModel.restore_from( + restore_path=cfg.model.peft.restore_from_path, trainer=trainer, return_config=True, + ) + else: + peft_model_cfg = MegatronGPTPEFTModel.restore_from( + restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True, + ) # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams with open_dict(peft_model_cfg): @@ -116,9 +121,13 @@ def main(cfg) -> None: cfg.inference.add_BOS = peft_model_cfg.data.test_ds.add_bos cfg.inference.tokens_to_generate = peft_model_cfg.data.test_ds.tokens_to_generate - save_restore_connector = PEFTSaveRestoreConnector( - peft_model_nemo_path=cfg.model.peft.restore_from_path, peft_model_ckpt_path=None, - ) + if cfg.model.peft.restore_from_path: + save_restore_connector = PEFTSaveRestoreConnector( + peft_model_nemo_path=cfg.model.peft.restore_from_path, peft_model_ckpt_path=None, + ) + else: + save_restore_connector = NLPSaveRestoreConnector() + if os.path.isdir(peft_model_cfg.restore_from_path): save_restore_connector.model_extracted_dir = cfg.model.restore_from_path # peft_cls = _get_peft_scheme(peft_model_cfg) diff --git a/tests/collections/asr/test_asr_interctc_models.py b/tests/collections/asr/test_asr_interctc_models.py index ebed951d1d92..6225eecf9660 100644 --- a/tests/collections/asr/test_asr_interctc_models.py +++ b/tests/collections/asr/test_asr_interctc_models.py @@ 
-87,6 +87,7 @@ class TestInterCTCLoss: ([], [0.3]), ], ) + @pytest.mark.pleasefixme def test_forward(self, model_class, encoder_config, apply_at_layers, loss_weights): preprocessor_config = {'_target_': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor'} vocabulary = [ From 232f9de409d04e0ea49ef330ea6415cb1a5766e6 Mon Sep 17 00:00:00 2001 From: Ryan Langman Date: Thu, 11 May 2023 21:10:57 -0700 Subject: [PATCH 44/62] [TTS] Fix TTS audio preprocessing bugs (#6628) Signed-off-by: Ryan --- .../tts/parts/preprocessing/features.py | 4 + .../tts/parts/utils/tts_dataset_utils.py | 3 + .../tts/audio_processing/preprocess_audio.py | 4 +- tests/collections/tts/data/test_data_utils.py | 76 ------------------- .../tts/parts/utils/test_tts_dataset_utils.py | 68 ++++++++++++++++- 5 files changed, 76 insertions(+), 79 deletions(-) delete mode 100644 tests/collections/tts/data/test_data_utils.py diff --git a/nemo/collections/tts/parts/preprocessing/features.py b/nemo/collections/tts/parts/preprocessing/features.py index 675d61adeebe..7d7150a7050f 100644 --- a/nemo/collections/tts/parts/preprocessing/features.py +++ b/nemo/collections/tts/parts/preprocessing/features.py @@ -131,10 +131,14 @@ def __init__( n_fft=win_length, lowfreq=lowfreq, highfreq=highfreq, + mag_power=1.0, log=log, log_zero_guard_type=log_zero_guard_type, log_zero_guard_value=log_zero_guard_value, mel_norm=mel_norm, + normalize=None, + preemph=None, + dither=0.0, ) def compute_mel_spec(self, manifest_entry: dict, audio_dir: Path) -> Tensor: diff --git a/nemo/collections/tts/parts/utils/tts_dataset_utils.py b/nemo/collections/tts/parts/utils/tts_dataset_utils.py index f07b2a9a5b74..06befcb6ec02 100644 --- a/nemo/collections/tts/parts/utils/tts_dataset_utils.py +++ b/nemo/collections/tts/parts/utils/tts_dataset_utils.py @@ -67,6 +67,9 @@ def normalize_volume(audio: np.array, volume_level: float) -> np.array: if not (0.0 <= volume_level <= 1.0): raise ValueError(f"Volume must be in range [0.0, 1.0], received {volume_level}") + if audio.size == 0: + return audio + max_sample = np.max(np.abs(audio)) if max_sample == 0: return audio diff --git a/scripts/dataset_processing/tts/audio_processing/preprocess_audio.py b/scripts/dataset_processing/tts/audio_processing/preprocess_audio.py index b0a4be54da33..c1121dae7f71 100644 --- a/scripts/dataset_processing/tts/audio_processing/preprocess_audio.py +++ b/scripts/dataset_processing/tts/audio_processing/preprocess_audio.py @@ -128,7 +128,7 @@ def _process_entry( if audio_trimmer is not None: audio, start_i, end_i = audio_trimmer.trim_audio(audio=audio, sample_rate=sample_rate, audio_id=audio_path) - if output_sample_rate is not None: + if output_sample_rate: audio = librosa.resample(y=audio, orig_sr=sample_rate, target_sr=output_sample_rate) sample_rate = output_sample_rate @@ -140,7 +140,7 @@ def _process_entry( original_duration = librosa.get_duration(filename=audio_path) output_duration = librosa.get_duration(filename=output_path) - entry["duration"] = output_duration + entry["duration"] = round(output_duration, 2) if os.path.isabs(audio_filepath): entry["audio_filepath"] = output_path diff --git a/tests/collections/tts/data/test_data_utils.py b/tests/collections/tts/data/test_data_utils.py deleted file mode 100644 index 0ce77a35945f..000000000000 --- a/tests/collections/tts/data/test_data_utils.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import pytest - -from nemo.collections.tts.parts.utils.tts_dataset_utils import normalize_volume - - -class TestDataUtils: - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_normalize_volume(self): - input_audio = np.array([0.0, 0.1, 0.3, 0.5]) - expected_output = np.array([0.0, 0.18, 0.54, 0.9]) - - output_audio = normalize_volume(audio=input_audio, volume_level=0.9) - - np.testing.assert_array_almost_equal(output_audio, expected_output) - - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_normalize_volume_negative_peak(self): - input_audio = np.array([0.0, 0.1, -0.3, -1.0, 0.5]) - expected_output = np.array([0.0, 0.05, -0.15, -0.5, 0.25]) - - output_audio = normalize_volume(audio=input_audio, volume_level=0.5) - - np.testing.assert_array_almost_equal(output_audio, expected_output) - - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_normalize_volume_zero(self): - input_audio = np.array([0.0, 0.1, 0.3, 0.5]) - expected_output = np.array([0.0, 0.0, 0.0, 0.0]) - - output_audio = normalize_volume(audio=input_audio, volume_level=0.0) - - np.testing.assert_array_almost_equal(output_audio, expected_output) - - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_normalize_volume_max(self): - input_audio = np.array([0.0, 0.1, 0.3, 0.5]) - expected_output = np.array([0.0, 0.2, 0.6, 1.0]) - - output_audio = normalize_volume(audio=input_audio, volume_level=1.0) - - np.testing.assert_array_almost_equal(output_audio, expected_output) - - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_normalize_volume_zeros(self): - input_audio = np.array([0.0, 0.0, 0.0]) - - output_audio = normalize_volume(audio=input_audio, volume_level=0.5) - - np.testing.assert_array_almost_equal(input_audio, input_audio) - - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_normalize_volume_out_of_range(self): - input_audio = np.array([0.0, 0.1, 0.3, 0.5]) - with pytest.raises(ValueError, match="Volume must be in range"): - normalize_volume(audio=input_audio, volume_level=2.0) diff --git a/tests/collections/tts/parts/utils/test_tts_dataset_utils.py b/tests/collections/tts/parts/utils/test_tts_dataset_utils.py index 180c3ca9f7fc..dadb1844eca6 100644 --- a/tests/collections/tts/parts/utils/test_tts_dataset_utils.py +++ b/tests/collections/tts/parts/utils/test_tts_dataset_utils.py @@ -14,9 +14,10 @@ from pathlib import Path +import numpy as np import pytest -from nemo.collections.tts.parts.utils.tts_dataset_utils import get_abs_rel_paths, get_audio_filepaths +from nemo.collections.tts.parts.utils.tts_dataset_utils import get_abs_rel_paths, get_audio_filepaths, normalize_volume class TestTTSDatasetUtils: @@ -53,3 +54,68 @@ def test_get_audio_paths(self): assert abs_path == Path("/home/audio/examples/example.wav") assert rel_path == audio_rel_path + + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_normalize_volume(self): + input_audio = np.array([0.0, 0.1, 0.3, 0.5]) + 
expected_output = np.array([0.0, 0.18, 0.54, 0.9]) + + output_audio = normalize_volume(audio=input_audio, volume_level=0.9) + + np.testing.assert_array_almost_equal(output_audio, expected_output) + + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_normalize_volume_negative_peak(self): + input_audio = np.array([0.0, 0.1, -0.3, -1.0, 0.5]) + expected_output = np.array([0.0, 0.05, -0.15, -0.5, 0.25]) + + output_audio = normalize_volume(audio=input_audio, volume_level=0.5) + + np.testing.assert_array_almost_equal(output_audio, expected_output) + + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_normalize_volume_zero(self): + input_audio = np.array([0.0, 0.1, 0.3, 0.5]) + expected_output = np.array([0.0, 0.0, 0.0, 0.0]) + + output_audio = normalize_volume(audio=input_audio, volume_level=0.0) + + np.testing.assert_array_almost_equal(output_audio, expected_output) + + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_normalize_volume_max(self): + input_audio = np.array([0.0, 0.1, 0.3, 0.5]) + expected_output = np.array([0.0, 0.2, 0.6, 1.0]) + + output_audio = normalize_volume(audio=input_audio, volume_level=1.0) + + np.testing.assert_array_almost_equal(output_audio, expected_output) + + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_normalize_volume_zeros(self): + input_audio = np.array([0.0, 0.0, 0.0]) + + output_audio = normalize_volume(audio=input_audio, volume_level=0.5) + + np.testing.assert_array_almost_equal(output_audio, input_audio) + + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_normalize_volume_empty(self): + input_audio = np.array([]) + + output_audio = normalize_volume(audio=input_audio, volume_level=1.0) + + np.testing.assert_array_almost_equal(output_audio, input_audio) + + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_normalize_volume_out_of_range(self): + input_audio = np.array([0.0, 0.1, 0.3, 0.5]) + with pytest.raises(ValueError, match="Volume must be in range"): + normalize_volume(audio=input_audio, volume_level=2.0) From 2b4e9463362e69f16f28ee26b16b706e20a70b11 Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Fri, 12 May 2023 23:21:55 +0400 Subject: [PATCH 45/62] Move black parameters to pyproject.toml (#6647) Signed-off-by: Vladimir Bataev --- .pre-commit-config.yaml | 1 - pyproject.toml | 6 ++++++ setup.py | 5 ++--- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fd89d3983cc5..75d1a6c51a1e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -43,5 +43,4 @@ repos: hooks: - id: black name: Format code - args: [--skip-string-normalization, --line-length=119] additional_dependencies: ['click==8.0.2'] diff --git a/pyproject.toml b/pyproject.toml index 32490f886e1a..f2e74dab4eb9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,12 @@ default_section = "THIRDPARTY" extend_skip = ["setup.py", "docs/source/conf.py"] +[tool.black] +line_length = 119 +skip_string_normalization = true +required_version = "19.10b0" # recognized by future versions, disallows reformatting code with incompatible versions + + [tool.pytest.ini_options] # durations=0 will display all tests execution time, sorted in ascending order starting from from the slowest one.
# -vv will also display tests with durration = 0.00s diff --git a/setup.py b/setup.py index 315fed8b0caf..c58fa794de03 100644 --- a/setup.py +++ b/setup.py @@ -111,9 +111,8 @@ def req_file(filename, folder="requirements"): class StyleCommand(distutils_cmd.Command): - __LINE_WIDTH = 119 - __ISORT_BASE = 'isort ' - __BLACK_BASE = f'black --skip-string-normalization --line-length={__LINE_WIDTH}' + __ISORT_BASE = 'isort' + __BLACK_BASE = 'black' description = 'Checks overall project code style.' user_options = [ ('scope=', None, 'Folder of file to operate within.'), From 58b21606ad7a3bca7cc732c43443740c88e0245c Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Fri, 12 May 2023 23:52:33 +0400 Subject: [PATCH 46/62] ASR-TTS Models: Support hybrid RNNT-CTC, improve docs. (#6620) * ASR-TTS: support hybrid RNNT-CTC models * Do not warn on optional import * Explain adding options to config * Fix import guard docs * Add docs for ConcatDataset * Add explanation for sampling parameters * Initial docs for the enhancer model * Fix use_start_end_token parameter usage --------- Signed-off-by: Vladimir Bataev --- docs/source/asr/configs.rst | 11 ++++++----- docs/source/asr/models.rst | 2 +- docs/source/common/data.rst | 13 +++++++++++++ docs/source/common/intro.rst | 1 + docs/source/tts/api.rst | 5 +++++ docs/source/tts/models.rst | 13 ++++++++++++- .../asr_with_tts/speech_to_text_bpe_with_text.py | 2 +- examples/asr/conf/asr_tts/hybrid_asr_tts.yaml | 2 +- nemo/collections/asr/data/text_to_text.py | 3 +-- .../collections/asr/models/hybrid_asr_tts_models.py | 12 +++++++++--- 10 files changed, 50 insertions(+), 14 deletions(-) create mode 100644 docs/source/common/data.rst diff --git a/docs/source/asr/configs.rst b/docs/source/asr/configs.rst index fc48bc06b3ca..120969ee9dfa 100644 --- a/docs/source/asr/configs.rst +++ b/docs/source/asr/configs.rst @@ -885,9 +885,9 @@ Hybrid ASR-TTS Model Configuration :ref:`Hybrid ASR-TTS model ` consists of three parts: -* ASR model (``EncDecCTCModelBPE`` or ``EncDecRNNTBPEModel``) +* ASR model (``EncDecCTCModelBPE``, ``EncDecRNNTBPEModel`` or ``EncDecHybridRNNTCTCBPEModel``) * TTS Mel Spectrogram Generator (currently, only :ref:`FastPitch ` model is supported) -* Enhancer model (optional) +* :ref:`Enhancer model ` (optional) Also, the config allows to specify :ref:`text-only dataset `. @@ -895,7 +895,7 @@ Main parts of the config: * ASR model * ``asr_model_path``: path to the ASR model checkpoint (`.nemo`) file, loaded only once, then the config of the ASR model is stored in the ``asr_model`` field - * ``asr_model_type``: needed only when training from scratch, ``rnnt_bpe`` corresponds to ``EncDecRNNTBPEModel``, ``ctc_bpe`` to ``EncDecCTCModelBPE`` + * ``asr_model_type``: needed only when training from scratch. 
``rnnt_bpe`` corresponds to ``EncDecRNNTBPEModel``, ``ctc_bpe`` to ``EncDecCTCModelBPE``, ``hybrid_rnnt_ctc_bpe`` to ``EncDecHybridRNNTCTCBPEModel``
   * ``asr_model_fuse_bn``: fuses BatchNorm in the pretrained ASR model; can improve quality in the fine-tuning scenario
 * TTS model
   * ``tts_model_path``: path to the pretrained TTS model checkpoint (`.nemo`) file, loaded only once, then the config of the model is stored in the ``tts_model`` field
@@ -907,7 +907,7 @@ Main parts of the config:
     * ``speakers_filepath``: path (or paths) to the text file containing speaker ids for the multi-speaker TTS model (speakers are sampled randomly during training)
     * ``min_words`` and ``max_words``: parameters to filter text-only manifests by the number of words
     * ``tokenizer_workers``: number of workers for initial tokenization (when loading the data). ``num_CPUs / num_GPUs`` is a recommended value.
-    * ``asr_tts_sampling_technique``, ``asr_tts_sampling_temperature``, ``asr_tts_sampling_probabilities``: sampling parameters for text-only and audio-text data (if both specified). See parameters for ``nemo.collections.common.data.ConcatDataset``
+    * ``asr_tts_sampling_technique``, ``asr_tts_sampling_temperature``, ``asr_tts_sampling_probabilities``: sampling parameters for text-only and audio-text data (if both specified). These correspond to the ``sampling_technique``, ``sampling_temperature``, and ``sampling_probabilities`` parameters of the :mod:`ConcatDataset `.
 * all other components are similar to conventional ASR models
   * ``validation_ds`` and ``test_ds`` correspond to the underlying ASR model
@@ -920,7 +920,7 @@ Main parts of the config:

       # asr model
       asr_model_path: ???
       asr_model: null
-      asr_model_type: null  # rnnt_bpe or ctc_bpe, needed only if instantiating from config, otherwise type is auto inferred
+      asr_model_type: null  # rnnt_bpe, ctc_bpe or hybrid_rnnt_ctc_bpe; needed only if instantiating from config, otherwise the type is auto inferred
       asr_model_fuse_bn: false  # only ConformerEncoder supported now, use false for other models

       # tts model
@@ -972,6 +972,7 @@ Training from Scratch

 To train an ASR model from scratch using text-only data, use the ``/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py`` script with a conventional ASR model config, e.g. ``/examples/asr/conf/conformer/conformer_ctc_bpe.yaml`` or ``/examples/asr/conf/conformer/conformer_transducer_bpe.yaml``

 Please specify the ASR model type, paths to the TTS model, and (optional) enhancer, along with text-only data-related fields.
+Use ``++`` or ``+`` markers for these options, since the options are not present in the original ASR model config.

 .. code-block:: shell

diff --git a/docs/source/asr/models.rst b/docs/source/asr/models.rst
index 2323e1636fcc..80a0fd90f0fb 100644
--- a/docs/source/asr/models.rst
+++ b/docs/source/asr/models.rst
@@ -330,7 +330,7 @@ The model consists of three models:

 * ASR model (``EncDecCTCModelBPE`` or ``EncDecRNNTBPEModel``)
 * Frozen TTS Mel Spectrogram Generator (currently, only :ref:`FastPitch ` model is supported)
-* Optional frozen Enhancer model trained to mitigate mismatch between real and generated mel spectrogram
+* Optional frozen :ref:`Spectrogram Enhancer ` model trained to mitigate the mismatch between real and generated mel spectrograms

 .. image:: images/hybrid_asr_tts_model.png
     :align: center

diff --git a/docs/source/common/data.rst b/docs/source/common/data.rst
new file mode 100644
index 000000000000..4c2f38cbba83
--- /dev/null
+++ b/docs/source/common/data.rst
@@ -0,0 +1,13 @@
+Data
+----
+
+.. 
autoclass:: nemo.collections.common.data.dataset.ConcatDataset + :show-inheritance: + :members: + :undoc-members: + + +.. autoclass:: nemo.collections.common.data.dataset.ConcatMapDataset + :show-inheritance: + :members: + :undoc-members: diff --git a/docs/source/common/intro.rst b/docs/source/common/intro.rst index dbe8d5d17930..fadbd9528485 100644 --- a/docs/source/common/intro.rst +++ b/docs/source/common/intro.rst @@ -10,3 +10,4 @@ The common collection contains things that could be used across all collections. losses metrics tokenizers + data diff --git a/docs/source/tts/api.rst b/docs/source/tts/api.rst index 2b706132fc0d..e291a995d3cb 100644 --- a/docs/source/tts/api.rst +++ b/docs/source/tts/api.rst @@ -25,6 +25,11 @@ Mel-Spectrogram Generators :members: :exclude-members: setup_training_data, setup_validation_data, training_step, validation_epoch_end, validation_step, setup_test_data, on_train_epoch_start +.. autoclass:: nemo.collections.tts.models.SpectrogramEnhancerModel + :show-inheritance: + :members: + :exclude-members: setup_training_data, setup_validation_data, training_step, validation_epoch_end, validation_step, setup_test_data, on_train_epoch_start + Speech-to-Text Aligner Models ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/tts/models.rst b/docs/source/tts/models.rst index 8b283529a706..fedfd157c307 100644 --- a/docs/source/tts/models.rst +++ b/docs/source/tts/models.rst @@ -112,7 +112,7 @@ Speech-to-text alignment is a critical component of neural TTS models. Autoregre End2End Models --------- +-------------- VITS ~~~~~~~~~~~~~~~ @@ -123,6 +123,17 @@ VITS is an end-to-end speech synthesis model, which generates raw waveform audio :alt: vits model :scale: 25% + +Enhancers +--------- + +.. _SpectrogramEnhancer_model: + +Spectrogram Enhancer +~~~~~~~~~~~~~~~~~~~~ +GAN-based model to add details to blurry spectrograms from TTS models like Tacotron or FastPitch. + + References ---------- diff --git a/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py b/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py index 386a567cf2dc..946202364c53 100644 --- a/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py +++ b/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py @@ -19,7 +19,7 @@ ```shell python speech_to_text_bpe_with_text.py \ # (Optional: --config-path= --config-name=) \ - ++asr_model_type= \ + ++asr_model_type= \ ++tts_model_path= \ ++enhancer_model_path= \ model.tokenizer.dir= \ diff --git a/examples/asr/conf/asr_tts/hybrid_asr_tts.yaml b/examples/asr/conf/asr_tts/hybrid_asr_tts.yaml index e933fc59b40f..bdd483215632 100644 --- a/examples/asr/conf/asr_tts/hybrid_asr_tts.yaml +++ b/examples/asr/conf/asr_tts/hybrid_asr_tts.yaml @@ -8,7 +8,7 @@ model: # asr model asr_model_path: ??? 
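   # note: ??? is OmegaConf's mandatory-value marker; config loading fails unless the user supplies this field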
asr_model: null - asr_model_type: null # rnnt_bpe or ctc_bpe, needed only if instantiating from config, otherwise type is auto inferred + asr_model_type: null # rnnt_bpe, ctc_bpe or hybrid_rnnt_ctc_bpe; needed only if instantiating from config, otherwise type is auto inferred asr_model_fuse_bn: false # only ConformerEncoder supported now, use false for other models # tts model diff --git a/nemo/collections/asr/data/text_to_text.py b/nemo/collections/asr/data/text_to_text.py index 23ccd3d7a2ef..88b417ea21bc 100644 --- a/nemo/collections/asr/data/text_to_text.py +++ b/nemo/collections/asr/data/text_to_text.py @@ -37,8 +37,7 @@ try: from nemo_text_processing.text_normalization.normalize import Normalizer except Exception as e: - logging.warning(e) - logging.warning("nemo_text_processing is not installed") + pass # Normalizer imported only for annotation purposes, error can be ignored AnyPath = Union[Path, str] diff --git a/nemo/collections/asr/models/hybrid_asr_tts_models.py b/nemo/collections/asr/models/hybrid_asr_tts_models.py index 1f15e49e0b0d..8486f956c3b7 100644 --- a/nemo/collections/asr/models/hybrid_asr_tts_models.py +++ b/nemo/collections/asr/models/hybrid_asr_tts_models.py @@ -33,6 +33,7 @@ ) from nemo.collections.asr.models.asr_model import ASRModel from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE +from nemo.collections.asr.models.hybrid_rnnt_ctc_bpe_models import EncDecHybridRNNTCTCBPEModel from nemo.collections.asr.models.rnnt_bpe_models import EncDecRNNTBPEModel from nemo.collections.asr.modules.conformer_encoder import ConformerEncoder from nemo.collections.asr.parts.preprocessing.features import clean_spectrogram_batch, normalize_batch @@ -89,7 +90,7 @@ class ASRWithTTSModel(ASRModel): Text-only data can be mixed with audio-text pairs """ - asr_model: Union[EncDecRNNTBPEModel, EncDecCTCModelBPE] + asr_model: Union[EncDecRNNTBPEModel, EncDecCTCModelBPE, EncDecHybridRNNTCTCBPEModel] tts_model: FastPitchModel enhancer_model: Optional[SpectrogramEnhancerModel] @@ -100,6 +101,7 @@ class ASRModelTypes(PrettyStrEnum): RNNT_BPE = "rnnt_bpe" CTC_BPE = "ctc_bpe" + HYBRID_RNNT_CTC_BPE = "hybrid_rnnt_ctc_bpe" @classmethod def from_asr_model(cls, model: Any): @@ -107,6 +109,8 @@ def from_asr_model(cls, model: Any): return cls.RNNT_BPE if isinstance(model, EncDecCTCModelBPE): return cls.CTC_BPE + if isinstance(model, EncDecHybridRNNTCTCBPEModel): + return cls.HYBRID_RNNT_CTC_BPE raise ValueError(f"Unsupported model type: {type(model)}") def get_asr_cls(self): @@ -114,6 +118,8 @@ def get_asr_cls(self): return EncDecRNNTBPEModel if self == self.CTC_BPE: return EncDecCTCModelBPE + if self == self.HYBRID_RNNT_CTC_BPE: + return EncDecHybridRNNTCTCBPEModel raise NotImplementedError(f"Not implemented for value {self.value}") @classmethod @@ -540,7 +546,7 @@ def _setup_text_dataset_from_config( manifest_filepath=text_data_config.manifest_filepath, speakers_filepath=text_data_config.speakers_filepath, asr_tokenizer=self.asr_model.tokenizer, - asr_use_start_end_token=train_data_config.use_start_end_token, + asr_use_start_end_token=train_data_config.get("use_start_end_token", False), tts_parser=self.tts_model.parser, tts_text_pad_id=self.tts_model.vocab.pad, tts_text_normalizer=self.tts_model.normalizer, @@ -556,7 +562,7 @@ def _setup_text_dataset_from_config( manifest_filepath=text_data_config.manifest_filepath, speakers_filepath=text_data_config.speakers_filepath, asr_tokenizer=self.asr_model.tokenizer, - asr_use_start_end_token=train_data_config.use_start_end_token, + 
asr_use_start_end_token=train_data_config.get("use_start_end_token", False), tts_parser=self.tts_model.parser, tts_text_pad_id=self.tts_model.vocab.pad, tts_text_normalizer=self.tts_model.normalizer, From 88816f0b87364ee539731a0f59f298e958ab68f9 Mon Sep 17 00:00:00 2001 From: Adi Renduchintala Date: Fri, 12 May 2023 19:50:57 -0700 Subject: [PATCH 47/62] fix conversion and eval (#6648) * fix conversion and eval Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: arendu Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../megatron_ckpt_to_nemo.py | 17 +++- .../tuning/megatron_gpt_peft_eval.py | 84 +++++++++++-------- 2 files changed, 66 insertions(+), 35 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py index 5ec767c34a10..e2fd1d4bbcd1 100644 --- a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py +++ b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py @@ -29,12 +29,14 @@ import torch from megatron.core import parallel_state +from omegaconf import open_dict from pytorch_lightning.plugins.environments import TorchElasticEnvironment from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTSFTModel from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model from nemo.collections.nlp.models.machine_translation.megatron_nmt_model import MegatronNMTModel @@ -80,7 +82,11 @@ def get_args(): help="If pipeline parallel size > 1, this is the rank at which the encoder ends and the decoder begins.", ) parser.add_argument( - "--model_type", type=str, required=True, default="gpt", choices=["gpt", "t5", "bert", "nmt", "bart", "retro"] + "--model_type", + type=str, + required=True, + default="gpt", + choices=["gpt", "sft", "t5", "bert", "nmt", "bart", "retro"], ) parser.add_argument("--local_rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1)) parser.add_argument("--bcp", action="store_true", help="Whether on BCP platform") @@ -138,6 +144,15 @@ def convert(local_rank, rank, world_size, args): if args.model_type == 'gpt': model = MegatronGPTModel.load_from_checkpoint(checkpoint_path, hparams_file=args.hparams_file, trainer=trainer) + elif args.model_type == 'sft': + model = MegatronGPTSFTModel.load_from_checkpoint( + checkpoint_path, hparams_file=args.hparams_file, trainer=trainer + ) + # we force the target for the loaded model to have the correct target + # because the hparams.yaml sometimes contains MegatronGPTModel as the target. 
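+        # (omegaconf's open_dict context manager temporarily disables struct mode,
+        # which allows the existing `target` key to be overwritten below)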
+ with open_dict(model.cfg): + model.cfg.target = f"{MegatronGPTSFTModel.__module__}.{MegatronGPTSFTModel.__name__}" + elif args.model_type == 'bert': model = MegatronBertModel.load_from_checkpoint( checkpoint_path, hparams_file=args.hparams_file, trainer=trainer diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py b/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py index 8cccaa024396..338b66a80cfa 100644 --- a/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py +++ b/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py @@ -13,6 +13,7 @@ # limitations under the License. +import json import os import torch.multiprocessing as mp @@ -21,14 +22,9 @@ from pytorch_lightning.plugins.environments import TorchElasticEnvironment from torch.utils.data import DataLoader -from nemo.collections.nlp.models.language_modeling.megatron_gpt_peft_models import ( - MegatronGPTAdapterModel, - MegatronGPTAdapterPTuningModel, - MegatronGPTIA3Model, - MegatronGPTLoRAModel, - MegatronGPTPEFTModel, - MegatronGPTPTuningModel, -) +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.models.language_modeling.megatron_gpt_peft_models import MegatronGPTPEFTModel +from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTSFTModel from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.collections.nlp.parts.nlp_overrides import ( GradScaler, @@ -42,27 +38,35 @@ from nemo.utils import logging mp.set_start_method("spawn", force=True) - """ -This is the script to train an Adapter infused GPT Model for text generation. -A base GPT Model is required as a starting point. This script will then insert -Adapters into each Transformer layer and will train/update only these adapters -during training. The base GPT Model weights will remain frozen. - -During training this script will only save the newly trained Adapter weights -in checkpoints. At the end of training a .nemo file of Adapter weights will -be saved. - -Usage: - Assuming the base model is a 125m GPT Model, with TP=1, PP=1: - a. run a training run for a base gpt nemo file: - python megatron_gpt_adapter_tuning.py \ - "model.data.train_ds=[PATH TO TRAINING JSONL FILE]", - "model.data.validation_ds=[PATH TO VALIDATION JSONL FILE]", - model.language_model_path="PATH TO BASE GPT MODEL .nemo FILE" - name="NAME OF TRAINING RUN" - exp_manager.exp_dir="DIR TO SAVE CHECKPOINTS and .nemo FILE", - trainer.max_epochs=2 +This is the script to run inference with a PEFT model or an SFT Model. 
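+(PEFT: parameter-efficient fine-tuning, e.g. adapters, IA3, LoRA, p-tuning; SFT: supervised fine-tuning of the full model.)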
+ +If you want to evaluate an SFT .nemo file: + +python examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py \ + model.restore_from_path= \ + model.peft.restore_from_path=null \ + trainer.devices=1 model.data.test_ds.file_names=\[, ] \ + model.data.test_ds.names=\['name_for_test_file1', 'name_for_test_file2'] \ # this is not the filename just some identifier + model.data.test_ds.global_batch_size=4 \ # or some other value + model.data.test_ds.micro_batch_size=4 \ + model.data.test_ds.tokens_to_generate=30 \ + inference.greedy=True \ + inference.outfile_path=\'' + +If you want to evaluate a PEFT Model, you should provide a base GPT model and a PEFT model .nemo file + +python examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py \ + model.restore_from_path= \ + model.peft.restore_from_path= \ # this will be created if you use `megatron_gpt_peft_tuning.py` + trainer.devices=1 model.data.test_ds.file_names=\[, ] \ + model.data.test_ds.names=\['name_for_test_file1', 'name_for_test_file2'] \ # this is not the filename just some identifier + model.data.test_ds.global_batch_size=4 \ # or some other value + model.data.test_ds.micro_batch_size=4 \ + model.data.test_ds.tokens_to_generate=30 \ + inference.greedy=True \ + inference.outfile_path=\'' + """ @@ -105,7 +109,7 @@ def main(cfg) -> None: restore_path=cfg.model.peft.restore_from_path, trainer=trainer, return_config=True, ) else: - peft_model_cfg = MegatronGPTPEFTModel.restore_from( + peft_model_cfg = MegatronGPTSFTModel.restore_from( restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True, ) @@ -114,6 +118,8 @@ def main(cfg) -> None: # update the model config of the trained model with params we want to set at inference time. peft_model_cfg.precision = cfg.trainer.precision peft_model_cfg.data.test_ds = cfg.model.data.test_ds + peft_model_cfg.activations_checkpoint_granularity = None + peft_model_cfg.activations_checkpoint_method = None with open_dict(cfg): # update the config with the trained model config @@ -128,9 +134,8 @@ def main(cfg) -> None: else: save_restore_connector = NLPSaveRestoreConnector() - if os.path.isdir(peft_model_cfg.restore_from_path): + if os.path.isdir(cfg.model.restore_from_path): save_restore_connector.model_extracted_dir = cfg.model.restore_from_path - # peft_cls = _get_peft_scheme(peft_model_cfg) model = NLPModel.restore_from( restore_path=cfg.model.restore_from_path, trainer=trainer, @@ -148,14 +153,25 @@ def main(cfg) -> None: config = OmegaConf.to_container(cfg.inference, resolve=True) model.set_inference_config(config) response = trainer.predict(model, request_dl) + if model.global_rank == 0: print("***************************") if cfg.inference.outfile_path is not None: with open(cfg.inference.outfile_path, "w", encoding="utf-8") as f: for batch in response: - for sentence in batch["sentences"]: - s = " ".join(sentence.split("\n")) - f.write(s + "\n") + batch_sentences = [s for s in batch['sentences']] + batch_tokens = [s for s in batch['tokens']] + batch_logprob = [s.tolist() for s in batch['logprob']] + for s, t, l in zip(batch_sentences, batch_tokens, batch_logprob): + if cfg.inference.get("verbose", False): + d = { + 'sentence': s, + 'tokens_with_logprobs': ', '.join([f"{_t} {_l:.4f}" for _t, _l in zip(t, l)]), + } + f.write(json.dumps(d, sort_keys=True, indent=2) + '\n') + else: + d = {'sentence': s} + f.write(json.dumps(d) + '\n') print("predictions saved to {}".format(cfg.inference.outfile_path)) else: print(response) From 9b9ed6eebdd86112ccfb77783453a9fa1e7aa717 
Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Mon, 15 May 2023 13:05:16 -0700 Subject: [PATCH 48/62] Confidence ensembles implementation (#6614) * Working version to train conf model + save ensemble class Signed-off-by: Igor Gitman * Working version Signed-off-by: Igor Gitman * Remove copy of transcribe_speech.py Signed-off-by: Igor Gitman * Move models parameter to config Signed-off-by: Igor Gitman * Add explicit parameters to transcribe Signed-off-by: Igor Gitman * Small cleanups Signed-off-by: Igor Gitman * Add temperature and integration tests Signed-off-by: Igor Gitman * Add more tests Signed-off-by: Igor Gitman * Add pc removal config Signed-off-by: Igor Gitman * Cleanup Signed-off-by: Igor Gitman * Fix typo Signed-off-by: Igor Gitman * Address review comments Signed-off-by: Igor Gitman --------- Signed-off-by: Igor Gitman --- examples/asr/transcribe_speech.py | 14 +- nemo/collections/asr/metrics/rnnt_wer.py | 3 + nemo/collections/asr/metrics/rnnt_wer_bpe.py | 2 +- nemo/collections/asr/metrics/wer.py | 11 +- .../asr/models/confidence_ensemble.py | 203 ++++++++++++++ nemo/collections/asr/models/ctc_bpe_models.py | 2 + nemo/collections/asr/models/ctc_models.py | 2 + .../asr/models/hybrid_rnnt_ctc_bpe_models.py | 4 + .../asr/models/hybrid_rnnt_ctc_models.py | 2 + .../collections/asr/models/rnnt_bpe_models.py | 2 + nemo/collections/asr/models/rnnt_models.py | 2 + nemo/collections/asr/modules/conv_asr.py | 7 + nemo/collections/asr/modules/rnnt.py | 13 +- .../asr/parts/utils/asr_confidence_utils.py | 4 +- nemo/core/classes/modelPT.py | 1 - .../confidence_ensembles/build_ensemble.py | 251 ++++++++++++++++++ .../confidence_ensembles/ensemble_config.yaml | 23 ++ .../test_confidence_ensembles.py | 100 +++++++ 18 files changed, 633 insertions(+), 13 deletions(-) create mode 100644 nemo/collections/asr/models/confidence_ensemble.py create mode 100644 scripts/confidence_ensembles/build_ensemble.py create mode 100644 scripts/confidence_ensembles/ensemble_config.yaml create mode 100644 scripts/confidence_ensembles/test_confidence_ensembles.py diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index 8c8d11132183..1c1d5c08199c 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -15,7 +15,7 @@ import contextlib import os from dataclasses import dataclass, is_dataclass -from typing import Optional, Union +from typing import List, Optional, Union import pytorch_lightning as pl import torch @@ -163,9 +163,14 @@ class TranscriptionConfig: langid: str = "en" # specify this for convert_num_to_words step in groundtruth cleaning use_cer: bool = False + # can be set to True to return list of transcriptions instead of the config + # if True, will also skip writing anything to the output file + return_transcriptions: bool = False + @hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig) -def main(cfg: TranscriptionConfig) -> TranscriptionConfig: +# just specifying List in the return type as otherwise it's too many things +def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List]: logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') for key in cfg: @@ -299,7 +304,7 @@ def autocast(): cfg = compute_output_filename(cfg, model_name) # if transcripts should not be overwritten, and already exists, skip re-transcription step and return - if not cfg.overwrite_transcripts and os.path.exists(cfg.output_filename): + if not cfg.return_transcriptions and not cfg.overwrite_transcripts and 
os.path.exists(cfg.output_filename): logging.info( f"Previous transcripts found at {cfg.output_filename}, and flag `overwrite_transcripts`" f"is {cfg.overwrite_transcripts}. Returning without re-transcribing text." @@ -349,6 +354,9 @@ def autocast(): if type(transcriptions) == tuple and len(transcriptions) == 2: transcriptions = transcriptions[0] + if cfg.return_transcriptions: + return transcriptions + # write audio transcriptions output_filename, pred_text_attr_name = write_transcription( transcriptions, diff --git a/nemo/collections/asr/metrics/rnnt_wer.py b/nemo/collections/asr/metrics/rnnt_wer.py index 0634a45f6a23..1ccc2d0ac6fc 100644 --- a/nemo/collections/asr/metrics/rnnt_wer.py +++ b/nemo/collections/asr/metrics/rnnt_wer.py @@ -1268,3 +1268,6 @@ class RNNTDecodingConfig: # beam decoding config beam: beam_decode.BeamRNNTInferConfig = beam_decode.BeamRNNTInferConfig(beam_size=4) + + # can be used to change temperature for decoding + temperature: float = 1.0 diff --git a/nemo/collections/asr/metrics/rnnt_wer_bpe.py b/nemo/collections/asr/metrics/rnnt_wer_bpe.py index c59b65552842..99c71daebaa9 100644 --- a/nemo/collections/asr/metrics/rnnt_wer_bpe.py +++ b/nemo/collections/asr/metrics/rnnt_wer_bpe.py @@ -62,7 +62,7 @@ class RNNTBPEDecoding(AbstractRNNTDecoding): The timestamps will be available in the returned Hypothesis.timestep as a dictionary. compute_langs: a bool flag, which allows to compute language id (LID) information per token, - word, and the entire sample (most likely language id). The LIDS will be available + word, and the entire sample (most likely language id). The LIDS will be available in the returned Hypothesis object as a dictionary rnnt_timestamp_type: A str value, which represents the types of timestamps that should be calculated. diff --git a/nemo/collections/asr/metrics/wer.py b/nemo/collections/asr/metrics/wer.py index d9b745cbc940..7f7f853d307d 100644 --- a/nemo/collections/asr/metrics/wer.py +++ b/nemo/collections/asr/metrics/wer.py @@ -75,8 +75,8 @@ def word_error_rate_detail( ) -> Tuple[float, int, float, float, float]: """ Computes Average Word Error Rate with details (insertion rate, deletion rate, substitution rate) - between two texts represented as corresponding lists of string. - + between two texts represented as corresponding lists of string. + Hypotheses and references must have same length. Args: hypotheses (list): list of hypotheses @@ -88,7 +88,7 @@ def word_error_rate_detail( ins_rate (float): average insertion error rate del_rate (float): average deletion error rate sub_rate (float): average substitution error rate - + """ scores = 0 words = 0 @@ -1222,5 +1222,8 @@ class CTCDecodingConfig: # beam decoding config beam: ctc_beam_decoding.BeamCTCInferConfig = ctc_beam_decoding.BeamCTCInferConfig(beam_size=4) - # confidence config + # confidence config confidence_cfg: ConfidenceConfig = ConfidenceConfig() + + # can be used to change temperature for decoding + temperature: float = 1.0 diff --git a/nemo/collections/asr/models/confidence_ensemble.py b/nemo/collections/asr/models/confidence_ensemble.py new file mode 100644 index 000000000000..34fe037e30b5 --- /dev/null +++ b/nemo/collections/asr/models/confidence_ensemble.py @@ -0,0 +1,203 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Optional, Union
+
+import joblib
+import numpy as np
+import torch
+from omegaconf import DictConfig, OmegaConf, open_dict
+from pytorch_lightning import Trainer
+
+from nemo.collections.asr.models.asr_model import ASRModel
+from nemo.collections.asr.models.hybrid_rnnt_ctc_models import EncDecHybridRNNTCTCModel
+from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceConfig, get_confidence_aggregation_bank
+from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType
+from nemo.core.classes import ModelPT
+from nemo.utils import model_utils
+
+__all__ = ['ConfidenceEnsembleModel']
+
+
+class ConfidenceEnsembleModel(ModelPT):
+    def __init__(
+        self, cfg: DictConfig, trainer: 'Trainer' = None,
+    ):
+        super().__init__(cfg=cfg, trainer=trainer)
+
+        # either we load all models from the ``load_models`` cfg parameter,
+        # or all of them are specified in the config as modelX alongside the num_models key
+        #
+        # ideally, we'd like to directly store all models in a list, but that
+        # is not currently supported by the submodule logic
+        # so to access all the models, we do something like
+        #
+        # for model_idx in range(self.num_models):
+        #     model = getattr(self, f"model{model_idx}")
+
+        if 'num_models' in self.cfg:
+            self.num_models = self.cfg.num_models
+            for idx in range(self.num_models):
+                cfg_field = f"model{idx}"
+                model_cfg = self.cfg[cfg_field]
+                model_class = model_utils.import_class_by_path(model_cfg['target'])
+                self.register_nemo_submodule(
+                    name=cfg_field, config_field=cfg_field, model=model_class(model_cfg, trainer=trainer),
+                )
+        else:
+            self.num_models = len(cfg.load_models)
+            with open_dict(self.cfg):
+                self.cfg.num_models = self.num_models
+            for idx, model in enumerate(cfg.load_models):
+                cfg_field = f"model{idx}"
+                if model.endswith(".nemo"):
+                    self.register_nemo_submodule(
+                        name=cfg_field,
+                        config_field=cfg_field,
+                        model=ASRModel.restore_from(model, trainer=trainer, map_location="cpu"),
+                    )
+                else:
+                    self.register_nemo_submodule(
+                        cfg_field, config_field=cfg_field, model=ASRModel.from_pretrained(model, map_location="cpu"),
+                    )
+
+        # registering model selection block - this is expected to be a joblib-saved
+        # pretrained sklearn pipeline containing standardization + logistic regression
+        # trained to predict the "most-confident" model index from the confidence scores of all models
+        model_selection_block_path = self.register_artifact("model_selection_block", cfg.model_selection_block)
+        self.model_selection_block = joblib.load(model_selection_block_path)
+        self.confidence_cfg = ConfidenceConfig(**self.cfg.confidence)
+
+        # making sure each model has correct confidence settings in the decoder strategy
+        for model_idx in range(self.num_models):
+            model = getattr(self, f"model{model_idx}")
+            # for now we assume users are directly responsible for matching
+            # the decoder type when building the ensemble with the inference type
+            # TODO: add automatic checks for errors
+            if isinstance(model, EncDecHybridRNNTCTCModel):
+                self.update_decoding_parameters(model.cfg.decoding)
+                model.change_decoding_strategy(model.cfg.decoding, 
decoder_type="rnnt") + self.update_decoding_parameters(model.cfg.aux_ctc.decoding) + model.change_decoding_strategy(model.cfg.aux_ctc.decoding, decoder_type="ctc") + else: + self.update_decoding_parameters(model.cfg.decoding) + model.change_decoding_strategy(model.cfg.decoding) + + def update_decoding_parameters(self, decoding_cfg): + """Updating confidence/temperature parameters of the config.""" + with open_dict(decoding_cfg): + decoding_cfg.confidence_cfg = self.confidence_cfg + decoding_cfg.temperature = self.cfg.temperature + + def setup_training_data(self, train_data_config: Union[DictConfig, Dict]): + """Pass-through to the ensemble models. + + Note that training is not actually supported for this class! + """ + for model_idx in range(self.num_models): + getattr(self, f"model{model_idx}").setup_training_data(train_data_config) + + def setup_validation_data(self, val_data_config: Union[DictConfig, Dict]): + """Pass-through to the ensemble models.""" + for model_idx in range(self.num_models): + getattr(self, f"model{model_idx}").setup_validation_data(val_data_config) + + def change_attention_model( + self, self_attention_model: str = None, att_context_size: List[int] = None, update_config: bool = True + ): + """Pass-through to the ensemble models.""" + for model_idx in range(self.num_models): + getattr(self, f"model{model_idx}").change_attention_model( + self_attention_model, att_context_size, update_config + ) + + def change_decoding_strategy(self, decoding_cfg: DictConfig = None, decoder_type: str = None): + """Pass-through to the ensemble models. + + The only change here is that we always require frame-confidence to + be returned. + """ + decoding_cfg.confidence_cfg = self.confidence_cfg + for model_idx in range(self.num_models): + model = getattr(self, f"model{model_idx}") + if isinstance(model, EncDecHybridRNNTCTCModel): + model.change_decoding_strategy(decoding_cfg, decoder_type=decoder_type) + else: + model.change_decoding_strategy(decoding_cfg) + + @torch.no_grad() + def transcribe( + self, + paths2audio_files: List[str], + batch_size: int = 4, + return_hypotheses: bool = False, + num_workers: int = 0, + channel_selector: Optional[ChannelSelectorType] = None, + augmentor: DictConfig = None, + verbose: bool = True, + **kwargs, # any other model specific parameters are passed directly + ) -> List[str]: + """Confidence-ensemble transcribe method. + + Consists of the following steps: + + 1. Run all models (TODO: in parallel) + 2. Compute confidence for each model + 3. Use logistic regression to pick the "most confident" model + 4. 
Return the output of that model
+        """
+        # TODO: lots of duplicate code with the ensemble-building script
+        aggr_func = get_confidence_aggregation_bank()[self.confidence_cfg.aggregation]
+        confidences = []
+        all_transcriptions = []
+        # always require hypotheses to be returned
+        # TODO: make sure to return text only if it was False originally
+        return_hypotheses = True
+        for model_idx in range(self.num_models):
+            model = getattr(self, f"model{model_idx}")
+            transcriptions = model.transcribe(
+                paths2audio_files=paths2audio_files,
+                batch_size=batch_size,
+                return_hypotheses=return_hypotheses,
+                num_workers=num_workers,
+                channel_selector=channel_selector,
+                augmentor=augmentor,
+                verbose=verbose,
+                **kwargs,
+            )
+            if isinstance(transcriptions, tuple):  # transducers return a tuple
+                transcriptions = transcriptions[0]
+
+            model_confidences = []
+            for transcription in transcriptions:
+                if isinstance(transcription.frame_confidence[0], list):
+                    # NeMo Transducer API returns list of lists for confidences
+                    conf_values = [conf_value for confs in transcription.frame_confidence for conf_value in confs]
+                else:
+                    conf_values = transcription.frame_confidence
+                model_confidences.append(aggr_func(conf_values))
+            confidences.append(model_confidences)
+            all_transcriptions.append(transcriptions)
+
+        # transposing with zip(*list)
+        features = np.array(list(zip(*confidences)))
+        model_indices = self.model_selection_block.predict(features)
+        final_transcriptions = []
+        for transcription_idx in range(len(all_transcriptions[0])):
+            final_transcriptions.append(all_transcriptions[model_indices[transcription_idx]][transcription_idx])
+
+        return final_transcriptions
+
+    def list_available_models(self):
+        return []

diff --git a/nemo/collections/asr/models/ctc_bpe_models.py b/nemo/collections/asr/models/ctc_bpe_models.py
index b97bf769132c..a74c7f3de5c2 100644
--- a/nemo/collections/asr/models/ctc_bpe_models.py
+++ b/nemo/collections/asr/models/ctc_bpe_models.py
@@ -305,6 +305,8 @@ def change_decoding_strategy(self, decoding_cfg: DictConfig):
             dist_sync_on_step=True,
         )

+        self.decoder.temperature = decoding_cfg.get('temperature', 1.0)
+
         # Update config
         with open_dict(self.cfg.decoding):
             self.cfg.decoding = decoding_cfg

diff --git a/nemo/collections/asr/models/ctc_models.py b/nemo/collections/asr/models/ctc_models.py
index b7816ec5040d..1446e1ce871f 100644
--- a/nemo/collections/asr/models/ctc_models.py
+++ b/nemo/collections/asr/models/ctc_models.py
@@ -337,6 +337,8 @@ def change_decoding_strategy(self, decoding_cfg: DictConfig):
             dist_sync_on_step=True,
         )

+        self.decoder.temperature = decoding_cfg.get('temperature', 1.0)
+
         # Update config
         with open_dict(self.cfg.decoding):
             self.cfg.decoding = decoding_cfg

diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py
index d10d3364ea29..b88669a1fbc0 100644
--- a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py
+++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py
@@ -415,6 +415,8 @@ def change_decoding_strategy(self, decoding_cfg: DictConfig = None, decoder_type
             self.joint.set_loss(self.loss)
             self.joint.set_wer(self.wer)

+            self.joint.temperature = decoding_cfg.get('temperature', 1.0)
+
             # Update config
             with open_dict(self.cfg.decoding):
                 self.cfg.decoding = decoding_cfg
@@ -442,6 +444,8 @@ def change_decoding_strategy(self, decoding_cfg: DictConfig = None, decoder_type
                 dist_sync_on_step=True,
             )

+            self.ctc_decoder.temperature = decoding_cfg.get('temperature', 1.0)
+
             # Update config
             with 
open_dict(self.cfg.aux_ctc.decoding): self.cfg.aux_ctc.decoding = decoding_cfg diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py index 9ba5533dbe64..447caa3f5de6 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py @@ -347,6 +347,8 @@ def change_decoding_strategy(self, decoding_cfg: DictConfig = None, decoder_type dist_sync_on_step=True, ) + self.ctc_decoder.temperature = decoding_cfg.get('temperature', 1.0) + # Update config with open_dict(self.cfg.aux_ctc): self.cfg.aux_ctc.decoding = decoding_cfg diff --git a/nemo/collections/asr/models/rnnt_bpe_models.py b/nemo/collections/asr/models/rnnt_bpe_models.py index 5ee5824b9d27..6fed8be9d410 100644 --- a/nemo/collections/asr/models/rnnt_bpe_models.py +++ b/nemo/collections/asr/models/rnnt_bpe_models.py @@ -454,6 +454,8 @@ def change_decoding_strategy(self, decoding_cfg: DictConfig): self.joint.set_loss(self.loss) self.joint.set_wer(self.wer) + self.joint.temperature = decoding_cfg.get('temperature', 1.0) + # Update config with open_dict(self.cfg.decoding): self.cfg.decoding = decoding_cfg diff --git a/nemo/collections/asr/models/rnnt_models.py b/nemo/collections/asr/models/rnnt_models.py index 7c91aed99cda..84e08635834d 100644 --- a/nemo/collections/asr/models/rnnt_models.py +++ b/nemo/collections/asr/models/rnnt_models.py @@ -442,6 +442,8 @@ def change_decoding_strategy(self, decoding_cfg: DictConfig): self.joint.set_loss(self.loss) self.joint.set_wer(self.wer) + self.joint.temperature = decoding_cfg.get('temperature', 1.0) + # Update config with open_dict(self.cfg.decoding): self.cfg.decoding = decoding_cfg diff --git a/nemo/collections/asr/modules/conv_asr.py b/nemo/collections/asr/modules/conv_asr.py index a45ee47d0de2..a05ee894f050 100644 --- a/nemo/collections/asr/modules/conv_asr.py +++ b/nemo/collections/asr/modules/conv_asr.py @@ -445,6 +445,9 @@ def __init__(self, feat_in, num_classes, init_mode="xavier_uniform", vocabulary= accepted_adapters = [adapter_utils.LINEAR_ADAPTER_CLASSPATH] self.set_accepted_adapter_types(accepted_adapters) + # to change, requires running ``model.temperature = T`` explicitly + self.temperature = 1.0 + @typecheck() def forward(self, encoder_output): # Adapter module forward step @@ -453,6 +456,10 @@ def forward(self, encoder_output): encoder_output = self.forward_enabled_adapters(encoder_output) encoder_output = encoder_output.transpose(1, 2) # [B, C, T] + if self.temperature != 1.0: + return torch.nn.functional.log_softmax( + self.decoder_layers(encoder_output).transpose(1, 2) / self.temperature, dim=-1 + ) return torch.nn.functional.log_softmax(self.decoder_layers(encoder_output).transpose(1, 2), dim=-1) def input_example(self, max_batch=1, max_dim=256): diff --git a/nemo/collections/asr/modules/rnnt.py b/nemo/collections/asr/modules/rnnt.py index a07b03731aee..04bdd25ac351 100644 --- a/nemo/collections/asr/modules/rnnt.py +++ b/nemo/collections/asr/modules/rnnt.py @@ -1235,6 +1235,9 @@ def __init__( # Flag needed for RNNT export support self._rnnt_export = False + # to change, requires running ``model.temperature = T`` explicitly + self.temperature = 1.0 + @typecheck() def forward( self, @@ -1430,10 +1433,16 @@ def joint(self, f: torch.Tensor, g: torch.Tensor) -> torch.Tensor: # If log_softmax is automatic if self.log_softmax is None: if not res.is_cuda: # Use log softmax only if on CPU - res = res.log_softmax(dim=-1) + if self.temperature != 1.0: + res = 
(res / self.temperature).log_softmax(dim=-1) + else: + res = res.log_softmax(dim=-1) else: if self.log_softmax: - res = res.log_softmax(dim=-1) + if self.temperature != 1.0: + res = (res / self.temperature).log_softmax(dim=-1) + else: + res = res.log_softmax(dim=-1) return res diff --git a/nemo/collections/asr/parts/utils/asr_confidence_utils.py b/nemo/collections/asr/parts/utils/asr_confidence_utils.py index 0891ea7312d0..a15428ee52df 100644 --- a/nemo/collections/asr/parts/utils/asr_confidence_utils.py +++ b/nemo/collections/asr/parts/utils/asr_confidence_utils.py @@ -156,7 +156,7 @@ def get_confidence_aggregation_bank(): class ConfidenceMeasureMixin(ABC): """Confidence Measure Mixin class. - + It initializes per-frame confidence measure. """ @@ -193,7 +193,7 @@ def _init_confidence_measure(self, confidence_method_cfg: Optional[DictConfig] = class ConfidenceMixin(ABC): """Confidence Mixin class. - + It initializes per-frame confidence measure. """ diff --git a/nemo/core/classes/modelPT.py b/nemo/core/classes/modelPT.py index c7221dfef0f1..01cf1611f7a4 100644 --- a/nemo/core/classes/modelPT.py +++ b/nemo/core/classes/modelPT.py @@ -220,7 +220,6 @@ def register_artifact( src (str): Path to artifact. verify_src_exists (bool): If set to False, then the artifact is optional and register_artifact will return None even if src is not found. Defaults to True. - save_restore_connector (SaveRestoreConnector): Can be overridden to add custom save and restore logic. Returns: str: If src is not None or empty it always returns absolute path which is guaranteed to exist during model instance life diff --git a/scripts/confidence_ensembles/build_ensemble.py b/scripts/confidence_ensembles/build_ensemble.py new file mode 100644 index 000000000000..9620b73aac87 --- /dev/null +++ b/scripts/confidence_ensembles/build_ensemble.py @@ -0,0 +1,251 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Run ``python build_ensemble.py --help`` for usage examples. + +import atexit + +# using default logging to be able to silence unnecessary messages from nemo +import logging +import os +import random +import sys +import tempfile +from dataclasses import dataclass, is_dataclass +from pathlib import Path +from typing import List + +import joblib +import numpy as np +import pytorch_lightning as pl +from omegaconf import DictConfig, OmegaConf +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import confusion_matrix +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler + +from nemo.collections.asr.models.confidence_ensemble import ConfidenceEnsembleModel +from nemo.collections.asr.parts.utils.asr_confidence_utils import ( + ConfidenceConfig, + ConfidenceMethodConfig, + get_confidence_aggregation_bank, +) +from nemo.core.config import hydra_runner + +LOG = logging.getLogger(__file__) + +# adding Python path. 
If it is not found, the user is asked to download the file
+try:
+    sys.path.append(str(Path(__file__).parents[2] / "examples" / "asr"))
+    import transcribe_speech
+except ImportError:
+    # if users run the script normally from the nemo repo, this shouldn't be triggered as
+    # we modify the path above. But if they downloaded build_ensemble.py as
+    # an isolated script, we'd ask them to also download the corresponding version
+    # of transcribe_speech.py
+    print(
+        "Current script depends on 'examples/asr/transcribe_speech.py', but can't find it. "
+        "If it's not present, download it from the NeMo github manually and put inside this folder."
+    )
+
+
+@dataclass
+class EnsembleConfig:
+    # .nemo path or pretrained name
+    model: str
+    # path to the training data manifest (non-tarred)
+    training_manifest: str
+    # specify to limit the number of training samples
+    # 100 is most likely enough, but setting a higher default just in case
+    max_training_samples: int = 1000
+    # specify to provide dev data manifest for HP tuning
+    # dev_manifest: Optional[str] = None
+
+
+@dataclass
+class BuildEnsembleConfig:
+    # where to save the resulting ensemble model
+    output_path: str
+
+    # each model specification
+    ensemble: List[EnsembleConfig]
+
+    random_seed: int = 0  # for reproducibility
+
+    # default confidence, can override
+    confidence: ConfidenceConfig = ConfidenceConfig(
+        # we keep frame confidences and apply aggregation manually to get full-utterance confidence
+        preserve_frame_confidence=True,
+        exclude_blank=True,
+        aggregation="mean",
+        method_cfg=ConfidenceMethodConfig(
+            name="entropy",
+            entropy_type="renui",
+            temperature=0.25,  # this is not really temperature, but alpha, see https://arxiv.org/abs/2212.08703
+            entropy_norm="lin",
+        ),
+    )
+    temperature: float = 1.0  # this is a real temperature that will be applied to logits
+
+    # this is optional, but can be used to change any aspect of the transcription
+    # config, such as batch size or amp usage. Note that model, data and confidence
+    # will be overridden by this script
+    transcription: transcribe_speech.TranscriptionConfig = transcribe_speech.TranscriptionConfig()
+
+
+def calculate_score(features, labels, pipe):
+    """Score is always calculated as the mean of the per-class scores.
+
+    This is done to account for possible class imbalances.
+    """
+    predictions = pipe.predict(features)
+    conf_m = confusion_matrix(labels, predictions)
+    score = np.diag(conf_m).sum() / conf_m.sum()
+    return score, conf_m
+
+
+def train_model_selection(
+    training_features,
+    training_labels,
+    multi_class="multinomial",
+    C=10000.0,  # disabling regularization by default as overfitting is likely not an issue
+    class_weight="balanced",  # in case training data is imbalanced
+    max_iter=1000,
+):
+    pipe = make_pipeline(
+        StandardScaler(),
+        LogisticRegression(multi_class=multi_class, C=C, max_iter=max_iter, class_weight=class_weight),
+    )
+    pipe.fit(training_features, training_labels)
+
+    accuracy, confusion = calculate_score(training_features, training_labels, pipe)
+
+    LOG.info("Training fit accuracy: %.4f", accuracy * 100.0)
+    LOG.info("Training confusion matrix:\n%s", str(confusion))
+    return pipe
+
+
+def subsample_manifest(manifest_file, max_samples):
+    """Will save a subsampled version of the manifest to the same folder.
+
+    Have to save to the same folder to support relative paths.
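+    (Assumption: manifest entries may reference audio files via paths relative to the
+    manifest's directory, so a copy saved elsewhere would break those references.)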
+ """ + with open(manifest_file, "rt", encoding="utf-8") as fin: + lines = fin.readlines() + if max_samples < len(lines): + lines = random.sample(lines, max_samples) + output_file = manifest_file + "-subsampled" + with open(output_file, "wt", encoding="utf-8") as fout: + fout.write("".join(lines)) + return output_file + + +def cleanup_subsampled_manifests(subsampled_manifests): + for manifest in subsampled_manifests: + os.remove(manifest) + + +@hydra_runner(schema=BuildEnsembleConfig) +def main(cfg: BuildEnsembleConfig): + # silencing all messages from nemo/ptl to avoid dumping tons of configs to the stdout + logging.getLogger('pytorch_lightning').setLevel(logging.CRITICAL) + logging.getLogger('nemo_logger').setLevel(logging.CRITICAL) + LOG.info(f'Build ensemble config:\n{OmegaConf.to_yaml(cfg)}') + + if is_dataclass(cfg): + cfg = OmegaConf.structured(cfg) + + # no matter what's in the config, frame confidence is required + cfg.confidence.preserve_frame_confidence = True + + pl.seed_everything(cfg.random_seed) + cfg.transcription.random_seed = None # seed is already applied + cfg.transcription.return_transcriptions = True + cfg.transcription.ctc_decoding.confidence_cfg = cfg.confidence + cfg.transcription.rnnt_decoding.confidence_cfg = cfg.confidence + cfg.transcription.ctc_decoding.temperature = cfg.temperature + cfg.transcription.rnnt_decoding.temperature = cfg.temperature + + aggregations = get_confidence_aggregation_bank() + aggr_func = aggregations[cfg.confidence.aggregation] + + confidences = [] + labels = [] + + # registering clean-up function that will hold on to this list and + # should clean up even if there is partial error in some of the transcribe + # calls + subsampled_manifests = [] + atexit.register(cleanup_subsampled_manifests, subsampled_manifests) + + # note that we loop over the same config. 
+ # This is intentional, as we need to run all models on all datasets + for model_idx, model_cfg in enumerate(cfg.ensemble): + model_confidences = [] + for data_idx, data_cfg in enumerate(cfg.ensemble): + if model_idx == 0: # generating subsampled manifests only one time + subsampled_manifests.append( + subsample_manifest(data_cfg.training_manifest, data_cfg.max_training_samples) + ) + subsampled_manifest = subsampled_manifests[data_idx] + + if model_cfg.model.endswith(".nemo"): + cfg.transcription.model_path = model_cfg.model + else: # assuming pretrained model + cfg.transcription.pretrained_name = model_cfg.model + + cfg.transcription.dataset_manifest = subsampled_manifest + + with tempfile.NamedTemporaryFile() as output_file: + cfg.transcription.output_filename = output_file.name + LOG.info("Transcribing dataset %d with model %d", data_idx, model_idx) + transcriptions = transcribe_speech.main(cfg.transcription.copy()) + + for transcription in transcriptions: + if isinstance(transcription.frame_confidence[0], list): + # NeMo Transducer API returns list of lists for confidences + conf_values = [conf_value for confs in transcription.frame_confidence for conf_value in confs] + else: + conf_values = transcription.frame_confidence + model_confidences.append(aggr_func(conf_values)) + if model_idx == 0: # labels are the same for all models + labels.append(data_idx) + + confidences.append(model_confidences) + + # transposing with zip(*list) + training_features = np.array(list(zip(*confidences))) + training_labels = np.array(labels) + model_selection_block = train_model_selection(training_features, training_labels) + with tempfile.TemporaryDirectory() as tmpdir: + model_selection_block_path = os.path.join(tmpdir, 'model_selection_block.pkl') + joblib.dump(model_selection_block, model_selection_block_path) + + # creating ensemble checkpoint + ensemble_model = ConfidenceEnsembleModel( + cfg=DictConfig( + { + 'model_selection_block': model_selection_block_path, + 'confidence': cfg.confidence, + 'temperature': cfg.temperature, + 'load_models': [model_cfg.model for model_cfg in cfg.ensemble], + } + ), + trainer=None, + ) + ensemble_model.save_to(cfg.output_path) + + +if __name__ == '__main__': + main() diff --git a/scripts/confidence_ensembles/ensemble_config.yaml b/scripts/confidence_ensembles/ensemble_config.yaml new file mode 100644 index 000000000000..954876a0c3cc --- /dev/null +++ b/scripts/confidence_ensembles/ensemble_config.yaml @@ -0,0 +1,23 @@ +# an example of it-es ctc model ensemble +# see test_confidence_ensembles.py for expected data structure +# and additional usage examples +ensemble: + - model: stt_es_conformer_ctc_large + training_manifest: ${oc.env:TEST_DATA_PATH}/es/train_manifest.json + - model: stt_it_conformer_ctc_large + training_manifest: ${oc.env:TEST_DATA_PATH}/it/train_manifest.json + +output_path: confidence-ensemble.nemo + +# this is default +temperature: 1.0 + +# this is default +confidence: + exclude_blank: True + aggregation: mean + method_cfg: + name: entropy + entropy_type: renui + temperature: 0.25 # this is not really temperature, but alpha, see https://arxiv.org/abs/2212.08703 + entropy_norm: lin diff --git a/scripts/confidence_ensembles/test_confidence_ensembles.py b/scripts/confidence_ensembles/test_confidence_ensembles.py new file mode 100644 index 000000000000..3e225384de92 --- /dev/null +++ b/scripts/confidence_ensembles/test_confidence_ensembles.py @@ -0,0 +1,100 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# these tests are not included in CI, since they take moderate amount of time +# they are supposed to be run in the nightly pipeline instead + +import os +import subprocess +import sys +from pathlib import Path + +import pytest + +from nemo.collections.asr.parts.utils.transcribe_utils import TextProcessingConfig + +sys.path.append(str(Path(__file__).parents[2] / 'examples' / 'asr')) +import speech_to_text_eval + + +@pytest.mark.parametrize( + 'build_args', + [ + "ensemble.0.model=stt_es_conformer_ctc_large ensemble.1.model=stt_it_conformer_ctc_large", + "ensemble.0.model=stt_es_conformer_transducer_large ensemble.1.model=stt_it_conformer_transducer_large", + "ensemble.0.model=stt_es_fastconformer_hybrid_large_pc ensemble.1.model=stt_it_fastconformer_hybrid_large_pc", + ( + "ensemble.0.model=stt_es_fastconformer_hybrid_large_pc " + "ensemble.1.model=stt_it_fastconformer_hybrid_large_pc " + "transcription.decoder_type=ctc" + ), + "ensemble.0.model=stt_es_conformer_ctc_large ensemble.1.model=stt_it_conformer_transducer_large", + ], + ids=( + [ + "CTC models", + "Transducer models", + "Hybrid models (Transducer mode)", + "Hybrid models (CTC mode)", + "CTC + Transducer", + ] + ), +) +def test_confidence_ensemble(tmp_path, build_args): + """Integration tests for confidence-ensembles. + + Tests building ensemble and running inference with the model. + To use, make sure to define TEST_DATA_PATH env variable with path to + the test data. 
The following structure is assumed: + + $TEST_DATA_PATH + ├── es + │ ├── dev + │ ├── dev_manifest.json + │ ├── test + │ ├── train + │ └── train_manifest.json + ├── it + │ ├── dev + │ ├── dev_manifest.json + │ ├── test + │ ├── test_manifest.json + │ ├── train + │ └── train_manifest.json + └── test_manifest.json + + """ + # checking for test data and failing right away if not defined + if not os.getenv("TEST_DATA_PATH"): + raise ValueError("TEST_DATA_PATH env variable has to be defined!") + + test_data_path = Path(os.environ['TEST_DATA_PATH']) + + build_ensemble_cmd = f""" + python {Path(__file__).parent / 'build_ensemble.py'} \ + --config-name=ensemble_config.yaml \ + output_path={tmp_path / 'ensemble.nemo'} \ + {build_args} + """ + subprocess.run(build_ensemble_cmd, check=True, shell=True) + + eval_cfg = speech_to_text_eval.EvaluationConfig( + dataset_manifest=str(test_data_path / 'test_manifest.json'), + output_filename=str(tmp_path / 'output.json'), + model_path=str(tmp_path / 'ensemble.nemo'), + text_processing=TextProcessingConfig(punctuation_marks=".,?", do_lowercase=True, rm_punctuation=True), + ) + + results = speech_to_text_eval.main(eval_cfg) + assert results.metric_value < 0.15 # relaxed check for better than 15% WER From 90156b190db239698596b20be25a24a1bfdc889e Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Mon, 15 May 2023 14:59:46 -0700 Subject: [PATCH 49/62] Patch memory used for NeMo Megatron models (#6615) * Patch memory used for NeMo Megatron models Signed-off-by: smajumdar * Cleanup the dtype of embeddings Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Refactor util function for parsing precision Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Refactor util function for parsing precision Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Try patch for Megatron O2 Signed-off-by: smajumdar * Refactor to incorporate megatron amp 02 state Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Refactor to incorporate megatron amp 02 state Signed-off-by: smajumdar * Correct indent Signed-off-by: smajumdar * Correct utils import Signed-off-by: smajumdar --------- Signed-off-by: smajumdar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../language_modeling/megatron/bert_model.py | 2 ++ .../language_modeling/megatron/gpt_model.py | 9 ++++- .../language_modeling/megatron_bert_model.py | 1 + .../language_modeling/megatron_gpt_model.py | 3 +- .../megatron_lm_encoder_decoder_model.py | 1 + .../nlp/modules/common/megatron/attention.py | 11 +++++++ .../modules/common/megatron/language_model.py | 23 +++++++++++-- .../common/megatron/megatron_decoders.py | 3 ++ .../common/megatron/megatron_encoders.py | 4 +++ .../megatron/megatron_perceiver_encoders.py | 4 +++ .../megatron/megatron_transformer_decoder.py | 2 ++ .../megatron/megatron_transformer_encoder.py | 2 ++ .../nlp/modules/common/megatron/mlp.py | 8 +++++ .../nlp/modules/common/megatron/module.py | 4 +-- .../retrieval_token_level_encoder_decoder.py | 8 +++++ .../common/megatron/retrieval_transformer.py | 4 +++ .../megatron/token_level_encoder_decoder.py | 9 +++++ .../modules/common/megatron/transformer.py | 33 ++++++++++--------- 
From 90156b190db239698596b20be25a24a1bfdc889e Mon Sep 17 00:00:00 2001
From: Somshubra Majumdar
Date: Mon, 15 May 2023 14:59:46 -0700
Subject: [PATCH 49/62] Patch memory used for NeMo Megatron models (#6615)

* Patch memory used for NeMo Megatron models

Signed-off-by: smajumdar

* Cleanup the dtype of embeddings

Signed-off-by: smajumdar

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Refactor util function for parsing precision

Signed-off-by: smajumdar

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Refactor util function for parsing precision

Signed-off-by: smajumdar

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Try patch for Megatron O2

Signed-off-by: smajumdar

* Refactor to incorporate megatron amp 02 state

Signed-off-by: smajumdar

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Refactor to incorporate megatron amp 02 state

Signed-off-by: smajumdar

* Correct indent

Signed-off-by: smajumdar

* Correct utils import

Signed-off-by: smajumdar

---------

Signed-off-by: smajumdar
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .../language_modeling/megatron/bert_model.py  |  2 ++
 .../language_modeling/megatron/gpt_model.py   |  9 ++++-
 .../language_modeling/megatron_bert_model.py  |  1 +
 .../language_modeling/megatron_gpt_model.py   |  3 +-
 .../megatron_lm_encoder_decoder_model.py      |  1 +
 .../nlp/modules/common/megatron/attention.py  | 11 +++++++
 .../modules/common/megatron/language_model.py | 23 +++++++++++--
 .../common/megatron/megatron_decoders.py      |  3 ++
 .../common/megatron/megatron_encoders.py      |  4 +++
 .../megatron/megatron_perceiver_encoders.py   |  4 +++
 .../megatron/megatron_transformer_decoder.py  |  2 ++
 .../megatron/megatron_transformer_encoder.py  |  2 ++
 .../nlp/modules/common/megatron/mlp.py        |  8 +++++
 .../nlp/modules/common/megatron/module.py     |  4 +--
 .../retrieval_token_level_encoder_decoder.py  |  8 +++++
 .../common/megatron/retrieval_transformer.py  |  4 +++
 .../megatron/token_level_encoder_decoder.py   |  9 +++++
 .../modules/common/megatron/transformer.py    | 33 ++++++++++---------
 nemo/collections/nlp/parts/utils_funcs.py     | 16 ++++++++-
 .../core/connectors/save_restore_connector.py |  2 +-
 20 files changed, 124 insertions(+), 25 deletions(-)

diff --git a/nemo/collections/nlp/models/language_modeling/megatron/bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron/bert_model.py
index 464d69c72043..132f900298a6 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron/bert_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron/bert_model.py
@@ -172,6 +172,7 @@ def __init__(
         init_method_std=0.02,
         fp16_lm_cross_entropy=False,
         use_cpu_initialization=False,
+        megatron_amp_O2=False,
         hidden_dropout=0.1,
         precision=16,
         fp32_residual_connection=False,
@@ -219,6 +220,7 @@ def __init__(
             post_process=self.post_process,
             init_method_std=init_method_std,
             use_cpu_initialization=use_cpu_initialization,
+            megatron_amp_O2=megatron_amp_O2,
             precision=precision,
             fp32_residual_connection=fp32_residual_connection,
             activations_checkpoint_granularity=activations_checkpoint_granularity,
diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py
index d6af1960eae9..e890e6ae4807 100755
--- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py
@@ -24,6 +24,7 @@
     parallel_lm_logits,
     scaled_init_method_normal,
 )
+from nemo.collections.nlp.parts import utils_funcs
 
 try:
     from apex.transformer.enums import AttnMaskType
@@ -123,6 +124,7 @@ def __init__(
         use_scaled_init_method=True,
         fp16_lm_cross_entropy=False,
         use_cpu_initialization=False,
+        megatron_amp_O2=False,
         hidden_dropout=0.1,
         attention_dropout=0.1,
         ffn_dropout=0.0,
@@ -171,6 +173,7 @@ def __init__(
         self.sequence_parallel = sequence_parallel
         self.gradient_accumulation_fusion = gradient_accumulation_fusion
         self.share_embeddings_and_output_weights = share_embeddings_and_output_weights
+        self.dtype = utils_funcs.dtype_from_precision(precision, megatron_amp_O2)
 
         if kv_channels is None:
             assert (
@@ -204,6 +207,7 @@ def __init__(
             post_process=self.post_process,
             init_method_std=init_method_std,
             use_cpu_initialization=use_cpu_initialization,
+            megatron_amp_O2=megatron_amp_O2,
             precision=precision,
             fp32_residual_connection=fp32_residual_connection,
             activations_checkpoint_granularity=activations_checkpoint_granularity,
@@ -243,7 +247,10 @@ def __init__(
 
         if self.share_embeddings_and_output_weights:
             self.initialize_word_embeddings(
-                init_method=init_method_normal(init_method_std), vocab_size=vocab_size, hidden_size=hidden_size
+                init_method=init_method_normal(init_method_std),
+                vocab_size=vocab_size,
+                hidden_size=hidden_size,
+                param_dtype=self.dtype,
             )
 
     def set_input_tensor(self, input_tensor):
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py
index bda1a595655a..64430a669269 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py
@@ -156,6 +156,7 @@ def model_provider_func(self, pre_process, post_process):
             init_method_std=cfg.get('init_method_std', 0.02),
             fp16_lm_cross_entropy=cfg.get('fp16_lm_cross_entropy', False),
             use_cpu_initialization=cfg.get('use_cpu_initialization', False),
+            megatron_amp_O2=self.cfg.get('megatron_amp_O2', False),
             hidden_dropout=cfg.get('hidden_dropout', 0.1),
             precision=cfg.get('precision', 16),
             fp32_residual_connection=cfg.get('fp32_residual_connection', False),
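
Each model's provider function forwards the new flag with a safe default, so configs written before this change keep their old fp32 behavior. A tiny OmegaConf sketch (toy configs, not from the repo) shows why the .get default matters:

from omegaconf import OmegaConf

old_cfg = OmegaConf.create({'precision': 16})  # hypothetical config predating the flag
new_cfg = OmegaConf.create({'precision': 16, 'megatron_amp_O2': True})

# .get with a default means old configs silently resolve to False (fp32 master weights)
print(old_cfg.get('megatron_amp_O2', False))  # False
print(new_cfg.get('megatron_amp_O2', False))  # True
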
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index b5f8b2b18f69..e9545361b88d 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -148,7 +148,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer):
                 converted_model = []
                 for module in self.model:
                     converted_model.append(Float16Module(module=module, precision=cfg.precision))
-                    self.model = converted_model
+                self.model = converted_model
             else:
                 self.model = Float16Module(module=self.model, precision=cfg.precision)
 
@@ -213,6 +213,7 @@ def model_provider_func(self, pre_process, post_process):
             use_scaled_init_method=self.cfg.get('use_scaled_init_method', True),
             fp16_lm_cross_entropy=self.cfg.get('fp16_lm_cross_entropy', False),
             use_cpu_initialization=self.cfg.get('use_cpu_initialization', False),
+            megatron_amp_O2=self.cfg.get('megatron_amp_O2', False),
            hidden_dropout=self.cfg.get('hidden_dropout', 0.1),
             attention_dropout=self.cfg.get('attention_dropout', 0.1),
             ffn_dropout=self.cfg.get('ffn_dropout', 0.0),
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py
index 80d980858f1c..217b707f5014 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py
@@ -272,6 +272,7 @@ def model_provider_func(self, pre_process, post_process, add_encoder, add_decode
             post_process=post_process,
             fp16_cross_entropy=self.cfg.get('fp16_lm_cross_entropy', False),
             use_cpu_initialization=self.cfg.get('use_cpu_initialization', False),
+            megatron_amp_O2=self.cfg.get('megatron_amp_O2', False),
             precision=self.cfg.get('precision', 16),
             embedding_init_method_std=embedding_init_method_std,
             embedding_dropout=embedding_dropout,
diff --git a/nemo/collections/nlp/modules/common/megatron/attention.py b/nemo/collections/nlp/modules/common/megatron/attention.py
index 64ab50e59118..c025c1fc32ba 100644
--- a/nemo/collections/nlp/modules/common/megatron/attention.py
+++ b/nemo/collections/nlp/modules/common/megatron/attention.py
@@ -27,6 +27,7 @@
 from nemo.collections.nlp.modules.common.megatron.module import MegatronModule
 from nemo.collections.nlp.modules.common.megatron.rotary_pos_embedding import apply_rotary_pos_emb
 from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults, attention_mask_func
+from nemo.collections.nlp.parts import utils_funcs
 from nemo.core import adapter_mixins
 
 try:
@@ -88,6 +89,7 @@ def __init__(
         apply_query_key_layer_scaling=True,
         kv_channels=None,
         use_cpu_initialization=False,
+        megatron_amp_O2=False,
         masked_softmax_fusion=True,
         attention_dropout=0.1,
         layer_type=None,
@@ -111,6 +113,7 @@ def __init__(
 
         self.multi_query_attention = multi_query_attention
         self.megatron_legacy = megatron_legacy
+        self.dtype = utils_funcs.dtype_from_precision(precision, megatron_amp_O2)
 
         self.set_accepted_adapter_types([InfusedAdapterConfig._target_, LoraKQVAdapterConfig._target_])
 
@@ -141,6 +144,7 @@ def __init__(
                 gather_output=False,
                 init_method=init_method,
                 use_cpu_initialization=use_cpu_initialization,
+                params_dtype=self.dtype,
                 bias=bias,
                 sequence_parallel_enabled=sequence_parallel,
                 async_tensor_model_parallel_allreduce=async_tensor_model_parallel_allreduce,
@@ -153,6 +157,8 @@ def __init__(
                 projection_size,
                 gather_output=False,
                 init_method=init_method,
+                use_cpu_initialization=use_cpu_initialization,
+                params_dtype=self.dtype,
                 bias=bias,
                 sequence_parallel_enabled=sequence_parallel,
                 async_tensor_model_parallel_allreduce=async_tensor_model_parallel_allreduce,
@@ -164,6 +170,8 @@ def __init__(
                 2 * projection_size,
                 gather_output=False,
                 init_method=init_method,
+                use_cpu_initialization=use_cpu_initialization,
+                params_dtype=self.dtype,
                 bias=bias,
                 sequence_parallel_enabled=sequence_parallel,
                 async_tensor_model_parallel_allreduce=async_tensor_model_parallel_allreduce,
@@ -194,6 +202,7 @@ def __init__(
             init_method=output_layer_init_method,
             skip_bias_add=True,
             use_cpu_initialization=use_cpu_initialization,
+            params_dtype=self.dtype,
             bias=bias,
             sequence_parallel_enabled=sequence_parallel,
             gradient_accumulation_fusion=gradient_accumulation_fusion,
@@ -515,6 +524,7 @@ def __init__(
         apply_query_key_layer_scaling=True,
         kv_channels=None,
         use_cpu_initialization=False,
+        megatron_amp_O2=False,
         masked_softmax_fusion=True,
         attention_dropout=0.1,
         megatron_legacy=False,
@@ -537,6 +547,7 @@ def __init__(
             apply_query_key_layer_scaling=apply_query_key_layer_scaling,
             kv_channels=kv_channels,
             use_cpu_initialization=use_cpu_initialization,
+            megatron_amp_O2=megatron_amp_O2,
             masked_softmax_fusion=masked_softmax_fusion,
             attention_dropout=attention_dropout,
             megatron_legacy=megatron_legacy,
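
The point of threading a params_dtype into every parallel linear layer is parameter memory: allocating weights directly in half precision halves their footprint instead of keeping fp32 master copies everywhere. A torch-only illustration, with arbitrary toy sizes standing in for the model dimensions:

import torch

fp32_layer = torch.nn.Linear(4096, 4096, dtype=torch.float32)
fp16_layer = torch.nn.Linear(4096, 4096, dtype=torch.float16)

def param_bytes(module):
    # total bytes held by the module's parameters
    return sum(p.nelement() * p.element_size() for p in module.parameters())

print(param_bytes(fp32_layer))  # ~67 MB
print(param_bytes(fp16_layer))  # ~33.5 MB, half the footprint
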
diff --git a/nemo/collections/nlp/modules/common/megatron/language_model.py b/nemo/collections/nlp/modules/common/megatron/language_model.py
index c946038fb7a9..b8b12cf0caec 100755
--- a/nemo/collections/nlp/modules/common/megatron/language_model.py
+++ b/nemo/collections/nlp/modules/common/megatron/language_model.py
@@ -29,6 +29,7 @@
     init_method_normal,
     scaled_init_method_normal,
 )
+from nemo.collections.nlp.parts import utils_funcs
 from nemo.core import adapter_mixins
 
 try:
@@ -74,6 +75,7 @@ def get_language_model(
     post_process=True,
     init_method_std=0.02,
     use_cpu_initialization=False,
+    megatron_amp_O2=False,
     hidden_dropout=0.1,
     attention_dropout=0.1,
     ffn_dropout=0.0,
@@ -149,6 +151,7 @@ def get_language_model(
         pre_process=pre_process,
         post_process=post_process,
         use_cpu_initialization=use_cpu_initialization,
+        megatron_amp_O2=megatron_amp_O2,
         hidden_dropout=hidden_dropout,
         attention_dropout=attention_dropout,
         ffn_dropout=ffn_dropout,
@@ -252,6 +255,8 @@ def __init__(
         init_method,
         num_tokentypes=0,
         use_cpu_initialization=False,
+        megatron_amp_O2=False,
+        dtype=torch.float32,
         fp32_residual_connection=False,
         sequence_parallel=False,
         position_embedding_type='learned_absolute',
@@ -267,13 +272,17 @@ def __init__(
 
         # Word embeddings (parallel).
         self.word_embeddings = tensor_parallel.VocabParallelEmbedding(
-            vocab_size, self.hidden_size, init_method=self.init_method, use_cpu_initialization=use_cpu_initialization,
+            vocab_size,
+            self.hidden_size,
+            init_method=self.init_method,
+            use_cpu_initialization=use_cpu_initialization,
+            params_dtype=dtype,
         )
         self._word_embeddings_key = 'word_embeddings'
 
         if self.position_embedding_type == 'learned_absolute':
             # Position embedding (serial).
-            self.position_embeddings = torch.nn.Embedding(max_sequence_length, self.hidden_size)
+            self.position_embeddings = torch.nn.Embedding(max_sequence_length, self.hidden_size, dtype=dtype)
             self._position_embeddings_key = 'position_embeddings'
 
             # Initialize the position embeddings.
             self.init_method(self.position_embeddings.weight)
@@ -284,7 +293,7 @@ def __init__(
         # token types and add them as needed.
         self._tokentype_embeddings_key = 'tokentype_embeddings'
         if self.num_tokentypes > 0:
-            self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes, self.hidden_size)
+            self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes, self.hidden_size, dtype=dtype)
             # Initialize the token-type embeddings.
             self.init_method(self.tokentype_embeddings.weight)
         else:
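
Before this hunk the position and token-type embeddings were created with torch's default fp32 dtype; passing dtype at construction allocates them directly in the target precision and never materializes an fp32 copy. A minimal torch sketch with toy sizes:

import torch

emb = torch.nn.Embedding(2048, 1024, dtype=torch.bfloat16)
print(emb.weight.dtype)  # torch.bfloat16 -- allocated directly in the target dtype
print(emb.weight.nelement() * emb.weight.element_size())  # 4194304 bytes, vs 8388608 for fp32
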
@@ -448,6 +457,7 @@ def __init__(
         pre_process=True,
         post_process=True,
         use_cpu_initialization=False,
+        megatron_amp_O2=False,
         hidden_dropout=0.1,
         attention_dropout=0.1,
         ffn_dropout=0.0,
@@ -507,6 +517,7 @@ def __init__(
         self.position_embedding_type = position_embedding_type
         self.share_embeddings_and_output_weights = share_embeddings_and_output_weights
         self.sequence_parallel = sequence_parallel
+        self.dtype = utils_funcs.dtype_from_precision(precision, megatron_amp_O2)
 
         if kv_channels is None:
 
@@ -524,10 +535,12 @@ def __init__(
                 init_method=self.init_method,
                 num_tokentypes=self.num_tokentypes,
                 use_cpu_initialization=use_cpu_initialization,
+                megatron_amp_O2=megatron_amp_O2,
                 embedding_dropout_prob=self.hidden_dropout,
                 sequence_parallel=sequence_parallel,
                 position_embedding_type=position_embedding_type,
                 fp32_residual_connection=fp32_residual_connection,
+                dtype=self.dtype,
             )
             self._embedding_key = 'embedding'
 
@@ -561,6 +574,7 @@ def __init__(
                 attention_dropout=attention_dropout,
                 ffn_dropout=ffn_dropout,
                 use_cpu_initialization=use_cpu_initialization,
+                megatron_amp_O2=megatron_amp_O2,
                 persist_layer_norm=persist_layer_norm,
                 openai_gelu=openai_gelu,
                 onnx_safe=onnx_safe,
@@ -615,6 +629,7 @@ def __init__(
                 hidden_dropout=hidden_dropout,
                 attention_dropout=attention_dropout,
                 use_cpu_initialization=use_cpu_initialization,
+                megatron_amp_O2=megatron_amp_O2,
                 bias_activation_fusion=bias_activation_fusion,
                 bias_dropout_add_fusion=bias_dropout_add_fusion,
                 masked_softmax_fusion=masked_softmax_fusion,
@@ -640,6 +655,8 @@ def __init__(
                 self.output_layer = tensor_parallel.ColumnParallelLinear(
                     self.hidden_size,
                     self.vocab_size,
+                    use_cpu_initialization=use_cpu_initialization,
+                    params_dtype=self.dtype,
                     bias=False,  # Setting bias to False always to keep it consistent with embedding tying that also does not have a bias.
                     init_method=self.init_method,
                 )
diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_decoders.py b/nemo/collections/nlp/modules/common/megatron/megatron_decoders.py
index 3e82537b6b71..28eb39e630fc 100644
--- a/nemo/collections/nlp/modules/common/megatron/megatron_decoders.py
+++ b/nemo/collections/nlp/modules/common/megatron/megatron_decoders.py
@@ -53,6 +53,7 @@ def get_decoder_model(
     post_process=True,
     init_method_std=0.02,
     use_cpu_initialization=False,
+    megatron_amp_O2=False,
    hidden_dropout=0.1,
     attention_dropout=0.1,
     ffn_dropout=0.0,
@@ -117,6 +118,7 @@ def get_decoder_model(
         pre_process=pre_process,
         post_process=post_process,
         use_cpu_initialization=use_cpu_initialization,
+        megatron_amp_O2=megatron_amp_O2,
         hidden_dropout=hidden_dropout,
         attention_dropout=attention_dropout,
         ffn_dropout=ffn_dropout,
@@ -158,6 +160,7 @@ def get_decoder_model(
         pre_process=pre_process,
         post_process=post_process,
         use_cpu_initialization=use_cpu_initialization,
+        megatron_amp_O2=megatron_amp_O2,
         hidden_dropout=hidden_dropout,
         attention_dropout=attention_dropout,
         precision=precision,
diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_encoders.py b/nemo/collections/nlp/modules/common/megatron/megatron_encoders.py
index 998b16240347..4005ffbd879e 100644
--- a/nemo/collections/nlp/modules/common/megatron/megatron_encoders.py
+++ b/nemo/collections/nlp/modules/common/megatron/megatron_encoders.py
@@ -54,6 +54,7 @@ def get_encoder_model(
     post_process=True,
     init_method_std=0.02,
     use_cpu_initialization=False,
+    megatron_amp_O2=False,
     hidden_dropout=0.1,
     attention_dropout=0.1,
     ffn_dropout=0.0,
@@ -119,6 +120,7 @@ def get_encoder_model(
         pre_process=pre_process,
         post_process=post_process,
         use_cpu_initialization=use_cpu_initialization,
+        megatron_amp_O2=megatron_amp_O2,
         hidden_dropout=hidden_dropout,
         attention_dropout=attention_dropout,
         ffn_dropout=ffn_dropout,
@@ -160,6 +162,7 @@ def get_encoder_model(
         pre_process=pre_process,
         post_process=post_process,
         use_cpu_initialization=use_cpu_initialization,
+        megatron_amp_O2=megatron_amp_O2,
         hidden_dropout=hidden_dropout,
         attention_dropout=attention_dropout,
         precision=precision,
@@ -202,6 +205,7 @@ def get_encoder_model(
         pre_process=pre_process,
         post_process=post_process,
         use_cpu_initialization=use_cpu_initialization,
+        megatron_amp_O2=megatron_amp_O2,
         hidden_dropout=hidden_dropout,
         attention_dropout=attention_dropout,
         ffn_dropout=ffn_dropout,
diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_perceiver_encoders.py b/nemo/collections/nlp/modules/common/megatron/megatron_perceiver_encoders.py
index 73774573596d..150c6466bcde 100644
--- a/nemo/collections/nlp/modules/common/megatron/megatron_perceiver_encoders.py
+++ b/nemo/collections/nlp/modules/common/megatron/megatron_perceiver_encoders.py
@@ -56,6 +56,7 @@ def __init__(
         pre_process=True,
         post_process=True,
         use_cpu_initialization=False,
+        megatron_amp_O2=False,
         encoder_attn_mask_type=AttnMaskType.padding,
         hidden_dropout=0.1,
         attention_dropout=0.1,
@@ -124,6 +125,7 @@ def __init__(
         self.ffn_dropout = ffn_dropout
         self.normalize_attention_scores = normalize_attention_scores
         self.megatron_legacy = megatron_legacy
+        self.megatron_amp_O2 = megatron_amp_O2
 
         assert self.num_self_attention_per_cross_attention >= 1
         assert self.hidden_steps >= 1
@@ -165,6 +167,7 @@ def _build_cross_attn_layer(self):
             attention_dropout=self.attention_dropout,
             ffn_dropout=self.ffn_dropout,
             use_cpu_initialization=self.use_cpu_initialization,
+            megatron_amp_O2=self.megatron_amp_O2,
             bias_activation_fusion=self.bias_activation_fusion,
             bias_dropout_add_fusion=self.bias_dropout_add_fusion,
             masked_softmax_fusion=self.masked_softmax_fusion,
@@ -204,6 +207,7 @@ def _build_self_attn_layer(self):
             attention_dropout=self.attention_dropout,
             ffn_dropout=self.ffn_dropout,
             use_cpu_initialization=self.use_cpu_initialization,
+            megatron_amp_O2=self.megatron_amp_O2,
             bias_activation_fusion=self.bias_activation_fusion,
             bias_dropout_add_fusion=self.bias_dropout_add_fusion,
             masked_softmax_fusion=self.masked_softmax_fusion,
diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_transformer_decoder.py b/nemo/collections/nlp/modules/common/megatron/megatron_transformer_decoder.py
index 77f8e2c3fa25..c3cb1fd05c3b 100644
--- a/nemo/collections/nlp/modules/common/megatron/megatron_transformer_decoder.py
+++ b/nemo/collections/nlp/modules/common/megatron/megatron_transformer_decoder.py
@@ -57,6 +57,7 @@ def __init__(
         pre_process=True,
         post_process=True,
         use_cpu_initialization=False,
+        megatron_amp_O2=False,
         decoder_attn_mask_type=AttnMaskType.causal,
         hidden_dropout=0.1,
         attention_dropout=0.1,
@@ -129,6 +130,7 @@ def __init__(
             attention_dropout=attention_dropout,
             ffn_dropout=ffn_dropout,
             use_cpu_initialization=use_cpu_initialization,
+            megatron_amp_O2=megatron_amp_O2,
             bias_activation_fusion=bias_activation_fusion,
             bias_dropout_add_fusion=bias_dropout_add_fusion,
             masked_softmax_fusion=masked_softmax_fusion,
diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_transformer_encoder.py b/nemo/collections/nlp/modules/common/megatron/megatron_transformer_encoder.py
index 667d000f7a9f..2eacf8aad672 100644
--- a/nemo/collections/nlp/modules/common/megatron/megatron_transformer_encoder.py
+++ b/nemo/collections/nlp/modules/common/megatron/megatron_transformer_encoder.py
@@ -54,6 +54,7 @@ def __init__(
         pre_process=True,
         post_process=True,
         use_cpu_initialization=False,
+        megatron_amp_O2=False,
         encoder_attn_mask_type=AttnMaskType.padding,
         hidden_dropout=0.1,
         attention_dropout=0.1,
@@ -127,6 +128,7 @@ def __init__(
             attention_dropout=attention_dropout,
             ffn_dropout=ffn_dropout,
             use_cpu_initialization=use_cpu_initialization,
+            megatron_amp_O2=megatron_amp_O2,
             bias_activation_fusion=bias_activation_fusion,
             bias_dropout_add_fusion=bias_dropout_add_fusion,
             masked_softmax_fusion=masked_softmax_fusion,
diff --git a/nemo/collections/nlp/modules/common/megatron/mlp.py b/nemo/collections/nlp/modules/common/megatron/mlp.py
index 43e30784c63a..499608fdabb9 100644
--- a/nemo/collections/nlp/modules/common/megatron/mlp.py
+++ b/nemo/collections/nlp/modules/common/megatron/mlp.py
@@ -69,6 +69,7 @@ def __init__(
         hidden_size,
         ffn_hidden_size,
         use_cpu_initialization=False,
+        dtype=torch.float32,
         bias_activation_fusion=True,
         openai_gelu=False,
         onnx_safe=False,
@@ -91,6 +92,7 @@ def __init__(
         self.persist_layer_norm = persist_layer_norm
         self.activation = activation
         self.dropout = dropout
+        self.dtype = dtype
         self.set_accepted_adapter_types([MLPInfusedAdapterConfig._target_])
 
         supported_activations = [
@@ -123,6 +125,7 @@ def __init__(
             init_method=init_method,
             skip_bias_add=True,
             use_cpu_initialization=use_cpu_initialization,
+            params_dtype=dtype,
             bias=bias,
             sequence_parallel_enabled=sequence_parallel,
             async_tensor_model_parallel_allreduce=async_tensor_model_parallel_allreduce,
@@ -139,6 +142,7 @@ def __init__(
             init_method=init_method,
             skip_bias_add=True,
             use_cpu_initialization=use_cpu_initialization,
+            params_dtype=dtype,
             bias=bias,
             sequence_parallel_enabled=sequence_parallel,
             async_tensor_model_parallel_allreduce=async_tensor_model_parallel_allreduce,
@@ -195,6 +199,7 @@ def __init__(
             init_method=output_layer_init_method,
             skip_bias_add=True,
             use_cpu_initialization=use_cpu_initialization,
+            params_dtype=dtype,
             bias=bias,
             sequence_parallel_enabled=sequence_parallel,
             gradient_accumulation_fusion=gradient_accumulation_fusion,
@@ -280,6 +285,7 @@ def __init__(
         hidden_size,
         ffn_hidden_size,
         use_cpu_initialization=False,
+        dtype=torch.float32,
         bias_activation_fusion=True,
         openai_gelu=False,
         onnx_safe=False,
@@ -304,6 +310,7 @@ def __init__(
             init_method=init_method,
             skip_bias_add=False,
             use_cpu_initialization=use_cpu_initialization,
+            params_dtype=dtype,
             bias=bias,
             sequence_parallel_enabled=sequence_parallel,
             gradient_accumulation_fusion=gradient_accumulation_fusion,
@@ -315,6 +322,7 @@ def __init__(
             'hidden_size': hidden_size,
             'ffn_hidden_size': ffn_hidden_size,
             'use_cpu_initialization': use_cpu_initialization,
+            'dtype': dtype,
             'bias_activation_fusion': bias_activation_fusion,
             'openai_gelu': openai_gelu,
             'onnx_safe': onnx_safe,
diff --git a/nemo/collections/nlp/modules/common/megatron/module.py b/nemo/collections/nlp/modules/common/megatron/module.py
index 52464b819c2f..58ce7a7bae18 100644
--- a/nemo/collections/nlp/modules/common/megatron/module.py
+++ b/nemo/collections/nlp/modules/common/megatron/module.py
@@ -111,7 +111,7 @@ def decoder_cross_attention_relative_position_embeddings_weight(self):
                 f"No decoder_cross_attention_relative_position_embedding found on this rank. Looking for decoder_cross_attention_relative_position_embedding.relative_position_embedding.weight"
             )
 
-    def initialize_word_embeddings(self, init_method, vocab_size, hidden_size):
+    def initialize_word_embeddings(self, init_method, vocab_size, hidden_size, param_dtype=torch.float32):
         if not self.share_token_embeddings:
             raise Exception('initialize_word_embeddings() was called but ' 'share_token_embeddings is false')
 
@@ -140,7 +140,7 @@ def initialize_word_embeddings(self, init_method, vocab_size, hidden_size):
             # set word_embeddings weights to 0 here, then copy first
             # stage's weights using all_reduce below.
             self.word_embeddings = tensor_parallel.VocabParallelEmbedding(
-                vocab_size, hidden_size, init_method=init_method
+                vocab_size, hidden_size, init_method=init_method, params_dtype=param_dtype
             )
             self.word_embeddings.weight.data.fill_(0)
             self.word_embeddings.weight.shared = True
diff --git a/nemo/collections/nlp/modules/common/megatron/retrieval_token_level_encoder_decoder.py b/nemo/collections/nlp/modules/common/megatron/retrieval_token_level_encoder_decoder.py
index 0b164a80e0e4..cbec4c754840 100644
--- a/nemo/collections/nlp/modules/common/megatron/retrieval_token_level_encoder_decoder.py
+++ b/nemo/collections/nlp/modules/common/megatron/retrieval_token_level_encoder_decoder.py
@@ -26,6 +26,7 @@
     init_method_normal,
     scaled_init_method_normal,
 )
+from nemo.collections.nlp.parts import utils_funcs
 
 try:
     from apex.transformer.enums import ModelType
@@ -68,6 +69,7 @@ def __init__(
         init_method_std=0.02,
         fp16_cross_entropy=False,
         use_cpu_initialization=False,
+        megatron_amp_O2=False,
         hidden_dropout=0.1,
         attention_dropout=0.1,
         precision=16,
@@ -121,6 +123,8 @@ def __init__(
         self.num_chunked_cross_attention = len(dec_cross_attention)
         self.megatron_lm_compatible = megatron_lm_compatible
 
+        self.dtype = utils_funcs.dtype_from_precision(precision, megatron_amp_O2)
+
         if kv_channels is None:
             assert (
                 hidden_size % num_attention_heads == 0
@@ -138,6 +142,7 @@ def __init__(
                 embedding_dropout_prob=hidden_dropout,
                 position_embedding_type='learned_absolute' if add_position_embedding else '',
                 transpose_batch_sequence=False,
+                dtype=self.dtype,
             )
             self._embedding_key = "embedding"
 
@@ -172,6 +177,7 @@ def __init__(
                 else post_process,  # megatron lm model has no final layer_norm
                 init_method_std=init_method_std,
                 use_cpu_initialization=use_cpu_initialization,
+                megatron_amp_O2=megatron_amp_O2,
                 hidden_dropout=hidden_dropout,
                 attention_dropout=attention_dropout,
                 precision=precision,
@@ -236,6 +242,7 @@ def __init__(
                 post_process=False,  # no need for post process
                 init_method_std=init_method_std,
                 use_cpu_initialization=use_cpu_initialization,
+                megatron_amp_O2=megatron_amp_O2,
                 hidden_dropout=hidden_dropout,
                 attention_dropout=attention_dropout,
                 precision=precision,
@@ -280,6 +287,7 @@ def __init__(
                 post_process=post_process,
                 init_method_std=init_method_std,
                 use_cpu_initialization=use_cpu_initialization,
+                megatron_amp_O2=megatron_amp_O2,
                 hidden_dropout=hidden_dropout,
                 attention_dropout=attention_dropout,
                 precision=precision,
diff --git a/nemo/collections/nlp/modules/common/megatron/retrieval_transformer.py b/nemo/collections/nlp/modules/common/megatron/retrieval_transformer.py
index 76d171eb55da..73c41cee6c6f 100644
--- a/nemo/collections/nlp/modules/common/megatron/retrieval_transformer.py
+++ b/nemo/collections/nlp/modules/common/megatron/retrieval_transformer.py
@@ -54,6 +54,7 @@ def __init__(
         pre_process=True,
         post_process=True,
         use_cpu_initialization=False,
+        megatron_amp_O2=False,
         hidden_dropout=0.1,
         attention_dropout=0.1,
         precision=16,
@@ -126,6 +127,7 @@ def __init__(
             hidden_dropout=hidden_dropout,
             attention_dropout=attention_dropout,
             use_cpu_initialization=use_cpu_initialization,
+            megatron_amp_O2=megatron_amp_O2,
             bias_activation_fusion=bias_activation_fusion,
             bias_dropout_add_fusion=bias_dropout_add_fusion,
             masked_softmax_fusion=masked_softmax_fusion,
@@ -337,6 +339,7 @@ def __init__(
         pre_process=True,
         post_process=True,
         use_cpu_initialization=False,
+        megatron_amp_O2=False,
         hidden_dropout=0.1,
         attention_dropout=0.1,
         precision=16,
@@ -408,6 +411,7 @@ def __init__(
             hidden_dropout=hidden_dropout,
             attention_dropout=attention_dropout,
             use_cpu_initialization=use_cpu_initialization,
+            megatron_amp_O2=megatron_amp_O2,
             bias_activation_fusion=bias_activation_fusion,
             bias_dropout_add_fusion=bias_dropout_add_fusion,
             masked_softmax_fusion=masked_softmax_fusion,
diff --git a/nemo/collections/nlp/modules/common/megatron/token_level_encoder_decoder.py b/nemo/collections/nlp/modules/common/megatron/token_level_encoder_decoder.py
index dcf41a696b6e..229a9af48048 100644
--- a/nemo/collections/nlp/modules/common/megatron/token_level_encoder_decoder.py
+++ b/nemo/collections/nlp/modules/common/megatron/token_level_encoder_decoder.py
@@ -38,6 +38,7 @@
     scaled_init_method_normal,
 )
 from nemo.collections.nlp.modules.common.megatron.vocab_parallel_cross_entropy import vocab_parallel_cross_entropy
+from nemo.collections.nlp.parts import utils_funcs
 
 try:
     from apex.transformer.enums import AttnMaskType, ModelType
@@ -115,6 +116,7 @@ def __init__(
         post_process=True,
         fp16_cross_entropy=False,
         use_cpu_initialization=False,
+        megatron_amp_O2=False,
         precision=16,
         embedding_init_method_std=0.02,
         embedding_dropout=0.1,
@@ -143,6 +145,8 @@ def __init__(
 
         encoder_kv_channels, decoder_kv_channels = self._validate_config()
 
+        self.dtype = utils_funcs.dtype_from_precision(precision, megatron_amp_O2)
+
         encoder, decoder = None, None
         if add_encoder:
             if pre_process:
@@ -153,6 +157,7 @@ def __init__(
                     init_method=init_method_normal(embedding_init_method_std),
                     num_tokentypes=num_tokentypes,
                     use_cpu_initialization=use_cpu_initialization,
+                    dtype=self.dtype,
                     embedding_dropout_prob=embedding_dropout,
                     position_embedding_type=encoder_cfg.get('position_embedding_type', 'learned_absolute'),
                 )
@@ -209,6 +214,7 @@ def __init__(
                 post_process=post_process,
                 init_method_std=encoder_cfg.get('init_method_std', 0.02),
                 use_cpu_initialization=use_cpu_initialization,
+                megatron_amp_O2=megatron_amp_O2,
                 hidden_dropout=encoder_cfg.get('hidden_dropout', 0.1),
                 attention_dropout=encoder_cfg.get('attention_dropout', 0.1),
                 ffn_dropout=encoder_cfg.get('ffn_dropout', 0.0),
@@ -254,6 +260,7 @@ def __init__(
                     init_method=init_method_normal(embedding_init_method_std),
                     num_tokentypes=num_tokentypes,
                     use_cpu_initialization=use_cpu_initialization,
+                    dtype=self.dtype,
                     embedding_dropout_prob=embedding_dropout,
                     position_embedding_type=decoder_cfg.get('position_embedding_type', 'learned_absolute'),
                 )
@@ -338,6 +345,7 @@ def __init__(
                 post_process=post_process,
                 init_method_std=decoder_cfg.get('init_method_std', 0.02),
                 use_cpu_initialization=use_cpu_initialization,
+                megatron_amp_O2=megatron_amp_O2,
                 hidden_dropout=decoder_cfg.get('hidden_dropout', 0.1),
                 attention_dropout=decoder_cfg.get('attention_dropout', 0.1),
                 ffn_dropout=decoder_cfg.get('ffn_dropout', 0.0),
@@ -393,6 +401,7 @@ def __init__(
                 gather_output=not self.parallel_output,
                 init_method=init_method_normal(decoder_cfg.init_method_std),
                 use_cpu_initialization=use_cpu_initialization,
+                params_dtype=self.dtype,
             )
             self._tokens_head_key = 'tokens_head'
diff --git a/nemo/collections/nlp/modules/common/megatron/transformer.py b/nemo/collections/nlp/modules/common/megatron/transformer.py
index 85d055f70e37..0f6112e08036 100644
--- a/nemo/collections/nlp/modules/common/megatron/transformer.py
+++ b/nemo/collections/nlp/modules/common/megatron/transformer.py
@@ -38,6 +38,7 @@
 from nemo.collections.nlp.modules.common.megatron.mlp import ParallelMLP, SwitchMLP
 from nemo.collections.nlp.modules.common.megatron.module import MegatronModule
 from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults
+from nemo.collections.nlp.parts import utils_funcs
 from nemo.core import adapter_mixins
 from nemo.utils import logging
 
@@ -139,6 +140,7 @@ def __init__(
         hidden_dropout=0.1,
         persist_layer_norm=False,
         use_cpu_initialization=False,
+        megatron_amp_O2=False,
         bias_activation_fusion=True,
         bias_dropout_add_fusion=True,
         masked_softmax_fusion=True,
@@ -176,6 +178,8 @@ def __init__(
         self.bias = bias
         self.transformer_block_type = transformer_block_type
         self.position_embedding_type = position_embedding_type
+        self.param_dtype = utils_funcs.dtype_from_precision(precision, megatron_amp_O2)
+
         self.set_accepted_adapter_types([LinearAdapterConfig._target_, ParallelLinearAdapterConfig._target_])
 
         if not bias and bias_dropout_add_fusion:
@@ -223,6 +227,7 @@ def __init__(
                 apply_query_key_layer_scaling=apply_query_key_layer_scaling,
                 kv_channels=kv_channels,
                 use_cpu_initialization=use_cpu_initialization,
+                megatron_amp_O2=megatron_amp_O2,
                 masked_softmax_fusion=masked_softmax_fusion,
                 attention_dropout=attention_dropout,
                 multi_query_attention=multi_query_attention,
@@ -292,6 +297,7 @@ def __init__(
                 kv_channels=kv_channels,
                 multi_query_attention=multi_query_attention,
                 use_cpu_initialization=use_cpu_initialization,
+                megatron_amp_O2=megatron_amp_O2,
                 masked_softmax_fusion=masked_softmax_fusion,
                 attention_dropout=attention_dropout,
                 megatron_legacy=megatron_legacy,
@@ -339,6 +345,7 @@ def __init__(
                 apply_query_key_layer_scaling=apply_query_key_layer_scaling,
                 kv_channels=kv_channels,
                 use_cpu_initialization=use_cpu_initialization,
+                megatron_amp_O2=megatron_amp_O2,
                 masked_softmax_fusion=masked_softmax_fusion,
                 attention_dropout=attention_dropout,
                 megatron_legacy=megatron_legacy,
@@ -381,6 +388,7 @@ def __init__(
                 hidden_size=hidden_size,
                 ffn_hidden_size=ffn_hidden_size,
                 use_cpu_initialization=use_cpu_initialization,
+                dtype=self.param_dtype,
                 bias_activation_fusion=bias_activation_fusion,
                 openai_gelu=openai_gelu,
                 onnx_safe=onnx_safe,
@@ -401,6 +409,7 @@ def __init__(
                 hidden_size=hidden_size,
                 ffn_hidden_size=ffn_hidden_size,
                 use_cpu_initialization=use_cpu_initialization,
+                dtype=self.param_dtype,
                 bias_activation_fusion=bias_activation_fusion,
                 openai_gelu=openai_gelu,
                 onnx_safe=onnx_safe,
@@ -637,6 +646,7 @@ def __init__(
         bias_dropout_add_fusion=True,
         persist_layer_norm=False,
         use_cpu_initialization=False,
+        megatron_amp_O2=False,
         bias_activation_fusion=True,
         openai_gelu=False,
         onnx_safe=False,
@@ -678,6 +688,7 @@ def __init__(
             bias_dropout_add_fusion=bias_dropout_add_fusion,
             persist_layer_norm=persist_layer_norm,
             use_cpu_initialization=use_cpu_initialization,
+            megatron_amp_O2=megatron_amp_O2,
             bias_activation_fusion=bias_activation_fusion,
             openai_gelu=openai_gelu,
             onnx_safe=onnx_safe,
@@ -702,14 +713,8 @@ def __init__(
             moe_dropout=moe_dropout,
         )
 
-        if precision == 'bf16':
-            self.dtype = torch.bfloat16
-        elif int(precision) == 16:
-            self.dtype = torch.float16
-        elif int(precision) == 32:
-            self.dtype = torch.float32
-        else:
-            raise ValueError
+        # Dtype for forward pass - ignore amp O2
+        self.dtype = utils_funcs.dtype_from_precision(precision, megatron_amp_O2=None)
 
     def forward(
         self,
@@ -822,14 +827,8 @@ def __init__(
         )
         # use_emha=use_emha,
 
-        if autocast_dtype == 32:
-            self.dtype = torch.float32
-        elif autocast_dtype == 16:
-            self.dtype = torch.float16
-        elif autocast_dtype == 'bf16':
-            self.dtype = torch.bfloat16
-        else:
-            raise ValueError
+        # Dtype for forward pass - ignore amp O2
+        self.dtype = utils_funcs.dtype_from_precision(autocast_dtype, megatron_amp_O2=None)
 
     def forward(
         self,
@@ -889,6 +888,7 @@ def __init__(
         attention_dropout=0.1,
         ffn_dropout=0.0,
         use_cpu_initialization=False,
+        megatron_amp_O2=False,
         bias_activation_fusion=True,
         bias_dropout_add_fusion=True,
         masked_softmax_fusion=True,
@@ -1079,6 +1079,7 @@ def build_layer(layer_number):
                 attention_dropout=attention_dropout,
                 ffn_dropout=ffn_dropout,
                 use_cpu_initialization=use_cpu_initialization,
+                megatron_amp_O2=megatron_amp_O2,
                 bias_activation_fusion=bias_activation_fusion,
                 bias_dropout_add_fusion=bias_dropout_add_fusion,
                 masked_softmax_fusion=masked_softmax_fusion,
diff --git a/nemo/collections/nlp/parts/utils_funcs.py b/nemo/collections/nlp/parts/utils_funcs.py
index 58872c6b4670..cd76840c8db8 100644
--- a/nemo/collections/nlp/parts/utils_funcs.py
+++ b/nemo/collections/nlp/parts/utils_funcs.py
@@ -16,7 +16,7 @@
 
 import os
 import time
-from typing import Dict, List, Union
+from typing import Dict, List, Optional, Union
 
 import numpy as np
 import torch
@@ -27,6 +27,20 @@
 from nemo.utils import logging
 
 
+def dtype_from_precision(precision: Union[int, str], megatron_amp_O2: Optional[bool]) -> torch.dtype:
+    if megatron_amp_O2 is not None and megatron_amp_O2 is False:
+        return torch.float32
+
+    if precision == 'bf16':
+        return torch.bfloat16
+    elif int(precision) == 16:
+        return torch.float16
+    elif int(precision) == 32:
+        return torch.float32
+    else:
+        raise ValueError(f"Could not parse the precision of `{precision}` to a valid torch.dtype")
+
+
 def list2str(l: List[int]) -> str:
     """ Converts list to a string"""
     return ' '.join([str(x) for x in l])
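
The mapping this new helper centralizes is easiest to see with a few concrete calls; the sketch below just exercises the function exactly as defined above:

import torch
from nemo.collections.nlp.parts.utils_funcs import dtype_from_precision

# Master weights stay fp32 when O2-style optimization is explicitly off...
assert dtype_from_precision(16, megatron_amp_O2=False) == torch.float32
# ...but parameters are allocated in reduced precision when it is on.
assert dtype_from_precision(16, megatron_amp_O2=True) == torch.float16
assert dtype_from_precision('bf16', megatron_amp_O2=True) == torch.bfloat16
# Passing None skips the O2 check entirely, which the forward-pass dtype callers rely on.
assert dtype_from_precision(16, megatron_amp_O2=None) == torch.float16
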
diff --git a/nemo/core/connectors/save_restore_connector.py b/nemo/core/connectors/save_restore_connector.py
index e5eb4930e224..998de3e91059 100644
--- a/nemo/core/connectors/save_restore_connector.py
+++ b/nemo/core/connectors/save_restore_connector.py
@@ -562,7 +562,7 @@ def _save_state_dict_to_disk(state_dict, filepath):
 
     @staticmethod
     def _load_state_dict_from_disk(model_weights, map_location=None):
-        return torch.load(model_weights, map_location=map_location)
+        return torch.load(model_weights, map_location='cpu')
 
     @property
     def model_config_yaml(self) -> str:

From cfff834f440072771c07dcfc1ed9d4e195f7c593 Mon Sep 17 00:00:00 2001
From: Adi Renduchintala
Date: Mon, 15 May 2023 18:44:48 -0700
Subject: [PATCH 50/62] handle artifacts when path is dir (#6658)

Signed-off-by: arendu
---
 nemo/core/connectors/save_restore_connector.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/nemo/core/connectors/save_restore_connector.py b/nemo/core/connectors/save_restore_connector.py
index 998de3e91059..473118744594 100644
--- a/nemo/core/connectors/save_restore_connector.py
+++ b/nemo/core/connectors/save_restore_connector.py
@@ -470,7 +470,10 @@ def _handle_artifacts(self, model, nemo_file_folder):
             # unpack all restorations paths (nemo checkpoints)
             # in nemo checkpoints all resources contain hash in name, so there should be no collisions
             for path in restoration_paths:
-                self._unpack_nemo_file(path2file=path, out_folder=archive_dir)
+                if self.model_extracted_dir:
+                    shutil.copytree(src=path, dst=archive_dir, dirs_exist_ok=True)
+                else:
+                    self._unpack_nemo_file(path2file=path, out_folder=archive_dir)
             os.chdir(archive_dir)
             for conf_path, artiitem in tarfile_artifacts:
                 # Get basename and copy it to nemo_file_folder

From 609c7b7bebd112ca7825cc4d8d1f7321562e37b8 Mon Sep 17 00:00:00 2001
From: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
Date: Mon, 15 May 2023 21:19:07 -0700
Subject: [PATCH 51/62] remove upgrading setuptools in reinstall.sh (#6659)

Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
Co-authored-by: fayejf <36722593+fayejf@users.noreply.github.com>
---
 reinstall.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/reinstall.sh b/reinstall.sh
index 06f9f0f284c0..7a533ff146d9 100755
--- a/reinstall.sh
+++ b/reinstall.sh
@@ -5,6 +5,8 @@ INSTALL_OPTION=${1:-"dev"}
 
 PIP=pip
 
+${PIP} install -U ${PIP}
+
 echo 'Uninstalling stuff'
 ${PIP} uninstall -y nemo_toolkit
 ${PIP} uninstall -y sacrebleu
@@ -14,8 +16,6 @@ ${PIP} uninstall -y nemo_asr
 ${PIP} uninstall -y nemo_nlp
 ${PIP} uninstall -y nemo_tts
 
-${PIP} install -U setuptools
-
 if [ -n "${NVIDIA_PYTORCH_VERSION}" ]; then
   echo 'Installing NeMo in NVIDIA PyTorch container:' "${NVIDIA_PYTORCH_VERSION}" 'so will not install numba'
 else

From 623c37e08a8347e7aef9ef97bdc1b00f92f02429 Mon Sep 17 00:00:00 2001
From: Adi Renduchintala
Date: Mon, 15 May 2023 22:25:05 -0700
Subject: [PATCH 52/62] merge lora weights into base model (#6597)

* merge lora weights into base model

Signed-off-by: arendu

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* typo fix

Signed-off-by: arendu

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* minor update

Signed-off-by: arendu

* update copyright

Signed-off-by: arendu

* eval needs to know the PEFT class

Signed-off-by: arendu

* add target class in training script so that we can use it in eval

Signed-off-by: arendu

* update

Signed-off-by: arendu

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update to work for tp1

Signed-off-by: arendu

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* set restore model path

Signed-off-by: arendu

* peft can be none

Signed-off-by: arendu

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* updated merge script so that eval works easily

Signed-off-by: arendu

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* eval with peft or sft model

Signed-off-by: arendu

* keep sentences in jsonl format

Signed-off-by: arendu

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* convert sft using correct classpath

Signed-off-by: arendu

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* updated to force sft yaml to have the correct target

Signed-off-by: arendu

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* updated docs

Signed-off-by: arendu

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix conversion and eval

Signed-off-by: arendu

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: arendu
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .../tuning/megatron_gpt_peft_eval.py          |   1 -
 .../tuning/megatron_gpt_peft_tuning.py        |   2 +
 .../conf/merge_lora_weights.yaml              |  16 ++
 .../merge_lora_weights/merge.py               | 223 ++++++++++++++++++
 4 files changed, 241 insertions(+), 1 deletion(-)
 create mode 100644 scripts/nlp_language_modeling/merge_lora_weights/conf/merge_lora_weights.yaml
 create mode 100644 scripts/nlp_language_modeling/merge_lora_weights/merge.py
diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py b/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py
index 338b66a80cfa..b45f5da69e89 100644
--- a/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py
+++ b/examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py
@@ -22,7 +22,6 @@
 from pytorch_lightning.plugins.environments import TorchElasticEnvironment
 from torch.utils.data import DataLoader
 
-from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
 from nemo.collections.nlp.models.language_modeling.megatron_gpt_peft_models import MegatronGPTPEFTModel
 from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTSFTModel
 from nemo.collections.nlp.models.nlp_model import NLPModel
diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py b/examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py
index d0f95b371a13..bf2705aa99e1 100644
--- a/examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py
+++ b/examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py
@@ -97,6 +97,8 @@ def _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False):
         gpt_cfg.attention_dropout = cfg.model.get('attention_dropout', 0.0)
         gpt_cfg.ffn_dropout = cfg.model.ffn_dropout
         gpt_cfg.peft = cfg.model.peft
+        peft_cls = _get_peft_scheme(cfg.model)
+        gpt_cfg.target = f"{peft_cls.__module__}.{peft_cls.__name__}"
 
         # This is needed when modifying a hparam file directly to load `.ckpt` files.
         # This is not needed to modify the cfg in `.nemo` files.
diff --git a/scripts/nlp_language_modeling/merge_lora_weights/conf/merge_lora_weights.yaml b/scripts/nlp_language_modeling/merge_lora_weights/conf/merge_lora_weights.yaml
new file mode 100644
index 000000000000..891509c15996
--- /dev/null
+++ b/scripts/nlp_language_modeling/merge_lora_weights/conf/merge_lora_weights.yaml
@@ -0,0 +1,16 @@
+trainer:
+  devices: 1
+  num_nodes: 1
+  accelerator: gpu
+  logger: False # logger provided by exp_manager
+  precision: 16 # 16, 32, or bf16
+
+tensor_model_parallel_size: -1
+pipeline_model_parallel_size: -1
+pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others)
+gpt_model_file: null # GPT nemo file path
+checkpoint_dir: null # checkpoint file dir. This is used to load the PTL checkpoint generated during the GPT training
+checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading
+hparams_file: null # model configuration file, only used for PTL checkpoint loading
+lora_model_path: ???
+merged_model_path: ???
\ No newline at end of file
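
The two ??? entries are mandatory hydra values that must be supplied at launch time. A minimal sketch of filling them programmatically via OmegaConf, which the hydra CLI does equivalently with overrides like lora_model_path=...; all paths below are hypothetical:

from omegaconf import OmegaConf

cfg = OmegaConf.load('merge_lora_weights.yaml')   # assumes the file above is available locally
cfg.gpt_model_file = '/models/base_gpt.nemo'      # hypothetical paths
cfg.lora_model_path = '/models/lora_adapter.nemo'
cfg.merged_model_path = '/models/merged_gpt.nemo'
print(OmegaConf.to_yaml(cfg))
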
diff --git a/scripts/nlp_language_modeling/merge_lora_weights/merge.py b/scripts/nlp_language_modeling/merge_lora_weights/merge.py
new file mode 100644
index 000000000000..9989574cbf5b
--- /dev/null
+++ b/scripts/nlp_language_modeling/merge_lora_weights/merge.py
@@ -0,0 +1,223 @@
+#!/usr/bin/env
+# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Merge lora weights into a base GPT LM. Only PP=1 is supported so far.
+"""
+
+
+import os
+import tempfile
+from typing import Any, Dict
+
+import torch
+from omegaconf import OmegaConf, open_dict
+from pytorch_lightning.trainer.trainer import Trainer
+from torch.utils.data import DataLoader, Dataset
+
+from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
+from nemo.collections.nlp.models.language_modeling.megatron_gpt_peft_models import MegatronGPTLoRAModel
+from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTSFTModel
+from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel
+from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector
+from nemo.core.config import hydra_runner
+from nemo.utils import logging
+from nemo.utils.app_state import AppState
+from nemo.utils.model_utils import inject_model_parallel_rank
+
+try:
+    from megatron.core import parallel_state
+
+    HAVE_MEGATRON_CORE = True
+
+except (ImportError, ModuleNotFoundError):
+
+    HAVE_MEGATRON_CORE = False
+
+
+class RequestDataSet(Dataset):
+    def __init__(self, sentences):
+        super().__init__()
+        self.sentences = sentences
+
+    def __len__(self,):
+        return len(self.sentences)
+
+    def __getitem__(self, idx):
+        return self.sentences[idx]
+
+
+def load_lora(lora_nemo, tp):
+    lora_state_dict = {}
+    with tempfile.TemporaryDirectory() as tmpdir:
+        NLPSaveRestoreConnector._unpack_nemo_file(lora_nemo, tmpdir)
+        # assert os.path.isdir(lora_extracted_dir), "requires the untarred lora .nemo file"
+        for i in range(tp):
+            if tp == 1:
+                ckpt_file = f"{tmpdir}/model_weights.ckpt"
+            else:
+                ckpt_file = f"{tmpdir}/mp_rank_0{i}/model_weights.ckpt"
+
+            l = torch.load(ckpt_file, map_location=torch.device('cpu'))
+            lora_state_dict[i] = l
+    return lora_state_dict
+
+
+def merge(
+    base_model_state_dict: Dict[str, Any], lora_state_dict: Dict[int, Any], tp: int, num_layers: int, curr_rank: int
+):
+    """
+    Iterates through the self_attention.query_key_value projection weights in all the layers.
+    Collects the corresponding lora weights for each layer and across tp ranks.
+    Computes the "full rank" weight from the two low-rank weights and adds it to the self_attention.query_key_value weight.
+    Args:
+        base_model_state_dict: A state_dict for the base model for the current rank.
+        lora_state_dict: A complete set of weights for the lora model across all tp ranks. The key for this dict is the int tp rank.
+        tp: the tensor_model_parallel_size for the base_model (and the lora model)
+        num_layers: the number of layers in the base_model to iterate over.
+        curr_rank: current tp rank of the base model which is being merged with Lora.
+    """
+
+    for nl in range(num_layers):
+        key_self_attn_kqv = f'model.language_model.encoder.layers.{nl}.self_attention.query_key_value.weight'
+        key_lora_in = (
+            f'model.language_model.encoder.layers.{nl}.self_attention.adapter_layer.lora_kqv_adapter.linear_in.weight'
+        )
+        key_lora_out = (
+            f'model.language_model.encoder.layers.{nl}.self_attention.adapter_layer.lora_kqv_adapter.linear_out.weight'
+        )
+        wt_lora_in = torch.cat([lora_state_dict[_tp][key_lora_in] for _tp in range(tp)], dim=0)
+        wt_lora_out = lora_state_dict[curr_rank][key_lora_out]
+        wt_self_attn = base_model_state_dict[key_self_attn_kqv]
+        wt_lora = wt_lora_out @ wt_lora_in
+        base_model_state_dict[key_self_attn_kqv] = wt_self_attn + wt_lora.type_as(wt_self_attn)
+    return base_model_state_dict
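
The update computed inside this loop is the standard low-rank LoRA reconstruction, W_merged = W + B A. A self-contained toy (all sizes arbitrary) makes the shape logic and the equivalence explicit:

import torch

hidden, proj, rank = 8, 24, 4  # toy sizes; proj plays the role of the fused QKV output dim
wt_self_attn = torch.randn(proj, hidden)   # base query_key_value.weight
wt_lora_in = torch.randn(rank, hidden)     # linear_in.weight (A), maps hidden -> rank
wt_lora_out = torch.randn(proj, rank)      # linear_out.weight (B), maps rank -> proj

wt_lora = wt_lora_out @ wt_lora_in         # (proj, rank) @ (rank, hidden) -> (proj, hidden)
merged = wt_self_attn + wt_lora.type_as(wt_self_attn)
assert merged.shape == wt_self_attn.shape

# After merging, the single matmul equals base projection plus adapter path B(Ax).
x = torch.randn(hidden)
assert torch.allclose(merged @ x, wt_self_attn @ x + wt_lora_out @ (wt_lora_in @ x), atol=1e-4)
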
+ """ + + for nl in range(num_layers): + key_self_attn_kqv = f'model.language_model.encoder.layers.{nl}.self_attention.query_key_value.weight' + key_lora_in = ( + f'model.language_model.encoder.layers.{nl}.self_attention.adapter_layer.lora_kqv_adapter.linear_in.weight' + ) + key_lora_out = ( + f'model.language_model.encoder.layers.{nl}.self_attention.adapter_layer.lora_kqv_adapter.linear_out.weight' + ) + wt_lora_in = torch.cat([lora_state_dict[_tp][key_lora_in] for _tp in range(tp)], dim=0) + wt_lora_out = lora_state_dict[curr_rank][key_lora_out] + wt_self_attn = base_model_state_dict[key_self_attn_kqv] + wt_lora = wt_lora_out @ wt_lora_in + base_model_state_dict[key_self_attn_kqv] = wt_self_attn + wt_lora.type_as(wt_self_attn) + return base_model_state_dict + + +@hydra_runner(config_path="conf", config_name="merge_lora_weights") +def main(cfg) -> None: + + # trainer required for restoring model parallel models + trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) + + if ( + cfg.tensor_model_parallel_size < 0 + or cfg.pipeline_model_parallel_size < 0 + or cfg.get('pipeline_model_parallel_split_rank', -1) < 0 + ): + model_config = MegatronGPTModel.restore_from( + restore_path=cfg.gpt_model_file, trainer=trainer, return_config=True, + ) + + with open_dict(cfg): + cfg.tensor_model_parallel_size = model_config.get('tensor_model_parallel_size', 1) + cfg.pipeline_model_parallel_size = model_config.get('pipeline_model_parallel_size', 1) + cfg.pipeline_model_parallel_split_rank = model_config.get('pipeline_model_parallel_split_rank', 0) + + assert ( + cfg.trainer.devices * cfg.trainer.num_nodes + == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size + ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size" + + if cfg.gpt_model_file: + save_restore_connector = NLPSaveRestoreConnector() + if os.path.isdir(cfg.gpt_model_file): + save_restore_connector.model_extracted_dir = cfg.gpt_model_file + + pretrained_cfg = MegatronGPTModel.restore_from( + restore_path=cfg.gpt_model_file, + trainer=trainer, + return_config=True, + save_restore_connector=save_restore_connector, + ) + OmegaConf.set_struct(pretrained_cfg, True) + with open_dict(pretrained_cfg): + pretrained_cfg.sequence_parallel = False + pretrained_cfg.activations_checkpoint_granularity = None + pretrained_cfg.activations_checkpoint_method = None + pretrained_cfg.precision = trainer.precision + if trainer.precision == "16": + pretrained_cfg.megatron_amp_O2 = False + model = MegatronGPTModel.restore_from( + restore_path=cfg.gpt_model_file, + trainer=trainer, + override_config_path=pretrained_cfg, + map_location=torch.device("cpu"), + save_restore_connector=save_restore_connector, + ) + elif cfg.checkpoint_dir: + app_state = AppState() + if cfg.tensor_model_parallel_size > 1 or cfg.pipeline_model_parallel_size > 1: + app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size + app_state.tensor_model_parallel_size = cfg.tensor_model_parallel_size + app_state.pipeline_model_parallel_size = cfg.pipeline_model_parallel_size + ( + app_state.tensor_model_parallel_rank, + app_state.pipeline_model_parallel_rank, + app_state.model_parallel_size, + app_state.data_parallel_size, + app_state.pipeline_model_parallel_split_rank, + app_state.virtual_pipeline_model_parallel_rank, + ) = fake_initialize_model_parallel( + world_size=app_state.model_parallel_size, + rank=trainer.global_rank, + tensor_model_parallel_size_=cfg.tensor_model_parallel_size, + 
pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size, + pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank, + ) + checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name)) + model = MegatronGPTModel.load_from_checkpoint(checkpoint_path, hparams_file=cfg.hparams_file, trainer=trainer) + else: + raise ValueError("need at least a nemo file or checkpoint dir") + + lora_model_cfg = MegatronGPTLoRAModel.restore_from( + restore_path=cfg.lora_model_path, trainer=trainer, return_config=True, + ) + + # load the lora weights on cpu for all ranks of the lora model + lora_weights = load_lora(cfg.lora_model_path, model.cfg.tensor_model_parallel_size) + + # merge the lora weights with the base model, for this current rank. + merged_weights = merge( + model.state_dict(), + lora_weights, + tp=model.cfg.tensor_model_parallel_size, + num_layers=model.cfg.num_layers, + curr_rank=model.global_rank, + ) + + # load the merged_weights back into the base model, for this current rank. + model.load_state_dict(merged_weights) + + # Going to go through the motions of inference to force PTL to run subprocess for loading all base model's ranks. + input = "Context: In 2004, philosopher and psychologist Michel ter Hark (Groningen, The Netherlands) published a book, called Popper, Otto Selz and the rise of evolutionary epistemology, in which he claimed that Popper took some of his ideas from his tutor, the German psychologist Otto Selz. Selz never published his ideas, partly because of the rise of Nazism, which forced him to quit his work in 1933, and the prohibition of referring to Selz' work. Popper, the historian of ideas and his scholarship, is criticised in some academic quarters for his rejection of Plato, Hegel and Marx. Question: Who claimed Otto Selz deserved credit for ideas published by Popper? Answer:" + ds = RequestDataSet([input]) + request_dl = DataLoader(dataset=ds, batch_size=1) + config = {'greedy': True, 'compute_logprob': False, 'tokens_to_generate': 5, 'add_BOS': False} + model.set_inference_config(config) + response = trainer.predict(model, request_dl) + print(response) + + with open_dict(model.cfg): + model.cfg.restore_from_path = cfg.merged_model_path + model.cfg.data = lora_model_cfg.data + model.cfg.target = f"{MegatronGPTSFTModel.__module__}.{MegatronGPTSFTModel.__name__}" + + model.save_to(cfg.merged_model_path) + logging.info(f"saved merged model to {cfg.merged_model_path}") + + +if __name__ == '__main__': + main() # noqa pylint: disable=no-value-for-parameter From d810e1ba2cb77bf6d1a9d32f8f6681e129c5ae48 Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Tue, 16 May 2023 10:42:12 -0600 Subject: [PATCH 53/62] upgrade to 23.04 (#6660) Signed-off-by: ericharper --- Dockerfile | 2 +- Jenkinsfile | 2 +- README.rst | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index d27ed857a88a..a2b3eacf664c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.03-py3 +ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.04-py3 # build an image that includes only the nemo dependencies, ensures that dependencies # are included first for optimal caching, and useful for building a development diff --git a/Jenkinsfile b/Jenkinsfile index b1774732007b..3ef2ade5b729 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,7 +1,7 @@ pipeline { agent { docker { - image 'pytorch_23.03:apex_57057e2fcf1c084c0fcc818f55c0ff6ea1b24ae2' + image 'nvcr.io/nvidia/pytorch:23.04-py3' args '--device=/dev/nvidia0 --gpus all --user 0:128 -v /home/TestData:/home/TestData -v $HOME/.cache:/root/.cache --shm-size=8g --env TRANSFORMERS_OFFLINE=1' } } diff --git a/README.rst b/README.rst index da24655d008f..1335620ead25 100644 --- a/README.rst +++ b/README.rst @@ -301,13 +301,13 @@ To build a nemo container with Dockerfile from a branch, please run DOCKER_BUILDKIT=1 docker build -f Dockerfile -t nemo:latest . -If you chose to work with main branch, we recommend using NVIDIA's PyTorch container version 23.03-py3 and then installing from GitHub. +If you chose to work with main branch, we recommend using NVIDIA's PyTorch container version 23.04-py3 and then installing from GitHub. .. code-block:: bash docker run --gpus all -it --rm -v :/NeMo --shm-size=8g \ -p 8888:8888 -p 6006:6006 --ulimit memlock=-1 --ulimit \ - stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:23.03-py3 + stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:23.04-py3 Examples -------- From fa3dca7352ab7bb0d922486ed83eb4807d9eed5e Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Tue, 16 May 2023 10:48:01 -0600 Subject: [PATCH 54/62] Merge r1.18.0 bugfixes and doc updates to main (#6655) * update branch Signed-off-by: ericharper * Remove from jenkins (#6641) * add megatron_core to requirements Signed-off-by: ericharper * remove from jenkins Signed-off-by: ericharper --------- Signed-off-by: ericharper * remove dup Signed-off-by: ericharper * update branch Signed-off-by: ericharper * [TTS] reformat NeMo versions in the tts logging messages to avoid batch process them when upgrading NeMo versions. Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> --------- Signed-off-by: ericharper Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> --- Dockerfile | 2 +- Jenkinsfile | 4 ++-- nemo/collections/tts/g2p/modules.py | 6 +++--- nemo/collections/tts/models/aligner.py | 2 +- nemo/collections/tts/models/fastpitch.py | 2 +- nemo/collections/tts/models/mixer_tts.py | 2 +- nemo/collections/tts/models/radtts.py | 2 +- nemo/collections/tts/models/tacotron2.py | 2 +- nemo/collections/tts/models/vits.py | 2 +- nemo/collections/tts/torch/g2ps.py | 6 +++--- nemo/package_info.py | 2 +- tutorials/VoiceSwapSample.ipynb | 2 +- tutorials/nlp/ITN_with_Thutmose_Tagger.ipynb | 2 +- 13 files changed, 18 insertions(+), 18 deletions(-) diff --git a/Dockerfile b/Dockerfile index a2b3eacf664c..82d16a561886 100644 --- a/Dockerfile +++ b/Dockerfile @@ -89,7 +89,7 @@ COPY . . # start building the final container FROM nemo-deps as nemo -ARG NEMO_VERSION=1.18.0 +ARG NEMO_VERSION=1.19.0 # Check that NEMO_VERSION is set. Build will fail without this. 
Expose NEMO and base container # version information as runtime environment variable for introspection purposes diff --git a/Jenkinsfile b/Jenkinsfile index 3ef2ade5b729..f43b301afdc0 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -3799,8 +3799,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron T5 with KERPLE Pretraining and Resume Training TP=2') { when { anyOf { - branch 'r1.18.0' - changeRequest target: 'r1.18.0' + branch 'main' + changeRequest target: 'main' } } failFast true diff --git a/nemo/collections/tts/g2p/modules.py b/nemo/collections/tts/g2p/modules.py index b8124489f3b1..cff81345a52d 100644 --- a/nemo/collections/tts/g2p/modules.py +++ b/nemo/collections/tts/g2p/modules.py @@ -15,7 +15,7 @@ from nemo.collections.tts.g2p.models.en_us_arpabet import EnglishG2p from nemo.collections.tts.g2p.models.i18n_ipa import IpaG2p as IPAG2P -# TODO @xueyang: This file is kept for backward-compatibility purposes since all older NGC models (<= r1.16.0) used this -# import path. We will remove this file soon; `IPAG2P` will be also renamed as `IpaG2p`. Please start using new import -# path and the new `IpaG2p` name from r1.16.0. +# TODO @xueyang: This file is kept for backward-compatibility purposes since all older NGC models that were trained on +# and before NeMo 1.16.0 used this import path. We will remove this file soon; `IPAG2P` will be also renamed as +# `IpaG2p`. Please start using new import path and the new `IpaG2p` name from NeMo 1.16.0. from nemo.collections.tts.g2p.models.zh_cn_pinyin import ChineseG2p diff --git a/nemo/collections/tts/models/aligner.py b/nemo/collections/tts/models/aligner.py index 05c32cf09b36..49301afc1591 100644 --- a/nemo/collections/tts/models/aligner.py +++ b/nemo/collections/tts/models/aligner.py @@ -103,7 +103,7 @@ def _setup_tokenizer(self, cfg): cfg.text_tokenizer.g2p['_target_'] = cfg.text_tokenizer.g2p['_target_'].replace( "nemo_text_processing.g2p", "nemo.collections.tts.g2p" ) - logging.warning("This checkpoint support will be dropped after r1.18.0.") + logging.warning("This checkpoint support will be dropped after NeMo 1.18.0.") g2p_kwargs = {} diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py index 28185c8f8622..d44de8ce0075 100644 --- a/nemo/collections/tts/models/fastpitch.py +++ b/nemo/collections/tts/models/fastpitch.py @@ -222,7 +222,7 @@ def _setup_tokenizer(self, cfg): cfg.text_tokenizer.g2p['_target_'] = cfg.text_tokenizer.g2p['_target_'].replace( "nemo_text_processing.g2p", "nemo.collections.tts.g2p" ) - logging.warning("This checkpoint support will be dropped after r1.18.0.") + logging.warning("This checkpoint support will be dropped after NeMo 1.18.0.") g2p_kwargs = {} diff --git a/nemo/collections/tts/models/mixer_tts.py b/nemo/collections/tts/models/mixer_tts.py index 9623de698f8e..38efd5a147a0 100644 --- a/nemo/collections/tts/models/mixer_tts.py +++ b/nemo/collections/tts/models/mixer_tts.py @@ -153,7 +153,7 @@ def _setup_tokenizer(self, cfg): cfg.text_tokenizer.g2p['_target_'] = cfg.text_tokenizer.g2p['_target_'].replace( "nemo_text_processing.g2p", "nemo.collections.tts.g2p" ) - logging.warning("This checkpoint support will be dropped after r1.18.0.") + logging.warning("This checkpoint support will be dropped after NeMo 1.18.0.") g2p_kwargs = {} diff --git a/nemo/collections/tts/models/radtts.py b/nemo/collections/tts/models/radtts.py index cf2ca3c73590..98bfbb4c2a18 100644 --- a/nemo/collections/tts/models/radtts.py +++ 
b/nemo/collections/tts/models/radtts.py @@ -337,7 +337,7 @@ def _setup_tokenizer(self, cfg): cfg.text_tokenizer.g2p['_target_'] = cfg.text_tokenizer.g2p['_target_'].replace( "nemo_text_processing.g2p", "nemo.collections.tts.g2p" ) - logging.warning("This checkpoint support will be dropped after r1.18.0.") + logging.warning("This checkpoint support will be dropped after NeMo 1.18.0.") g2p_kwargs = {} diff --git a/nemo/collections/tts/models/tacotron2.py b/nemo/collections/tts/models/tacotron2.py index 27462f97149d..37880a0eae6f 100644 --- a/nemo/collections/tts/models/tacotron2.py +++ b/nemo/collections/tts/models/tacotron2.py @@ -337,7 +337,7 @@ def _setup_tokenizer(self, cfg): cfg.text_tokenizer.g2p['_target_'] = cfg.text_tokenizer.g2p['_target_'].replace( "nemo_text_processing.g2p", "nemo.collections.tts.g2p" ) - logging.warning("This checkpoint support will be dropped after r1.18.0.") + logging.warning("This checkpoint support will be dropped after NeMo 1.18.0.") g2p_kwargs = {} diff --git a/nemo/collections/tts/models/vits.py b/nemo/collections/tts/models/vits.py index 8f1dd96a56a0..78614fa6264b 100644 --- a/nemo/collections/tts/models/vits.py +++ b/nemo/collections/tts/models/vits.py @@ -117,7 +117,7 @@ def _setup_tokenizer(self, cfg): cfg.text_tokenizer.g2p['_target_'] = cfg.text_tokenizer.g2p['_target_'].replace( "nemo_text_processing.g2p", "nemo.collections.tts.g2p" ) - logging.warning("This checkpoint support will be dropped after r1.18.0.") + logging.warning("This checkpoint support will be dropped after NeMo 1.18.0.") g2p_kwargs = {} diff --git a/nemo/collections/tts/torch/g2ps.py b/nemo/collections/tts/torch/g2ps.py index 90c2798c8baa..084a4c9d7699 100644 --- a/nemo/collections/tts/torch/g2ps.py +++ b/nemo/collections/tts/torch/g2ps.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -# TODO @xueyang: This file is kept for backward-compatibility purposes since all older NGC models (<= r1.16.0) used this -# import path. We will remove this file soon; `IPAG2P` will be also renamed as `IpaG2p`. Please start using new import -# path and the new `IpaG2p` name from r1.16.0. +# TODO @xueyang: This file is kept for backward-compatibility purposes since all older NGC models that were trained on +# and before NeMo 1.16.0 used this import path. We will remove this file soon; `IPAG2P` will be also renamed as +# `IpaG2p`. Please start using new import path and the new `IpaG2p` name from NeMo 1.16.0. 
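+#
+# For example, a downstream script would migrate like this (an illustrative
+# sketch, not part of this change):
+#
+#     # deprecated path that this file keeps working:
+#     # from nemo.collections.tts.torch.g2ps import IPAG2P
+#     # preferred path and name from NeMo 1.16.0 onward:
+#     from nemo.collections.tts.g2p.models.i18n_ipa import IpaG2p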
from nemo.collections.tts.g2p.models.en_us_arpabet import EnglishG2p from nemo.collections.tts.g2p.models.i18n_ipa import IpaG2p as IPAG2P from nemo.collections.tts.g2p.models.zh_cn_pinyin import ChineseG2p diff --git a/nemo/package_info.py b/nemo/package_info.py index 1655a7d860af..709159dd575a 100644 --- a/nemo/package_info.py +++ b/nemo/package_info.py @@ -14,7 +14,7 @@ MAJOR = 1 -MINOR = 18 +MINOR = 19 PATCH = 0 PRE_RELEASE = 'rc0' diff --git a/tutorials/VoiceSwapSample.ipynb b/tutorials/VoiceSwapSample.ipynb index 934071faa768..addf19f3b236 100644 --- a/tutorials/VoiceSwapSample.ipynb +++ b/tutorials/VoiceSwapSample.ipynb @@ -39,7 +39,7 @@ }, "outputs": [], "source": [ - "BRANCH = 'r1.18.0'\n", + "BRANCH = 'main'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n" ] }, diff --git a/tutorials/nlp/ITN_with_Thutmose_Tagger.ipynb b/tutorials/nlp/ITN_with_Thutmose_Tagger.ipynb index 8b2474597819..6204bf2516bb 100644 --- a/tutorials/nlp/ITN_with_Thutmose_Tagger.ipynb +++ b/tutorials/nlp/ITN_with_Thutmose_Tagger.ipynb @@ -21,7 +21,7 @@ "import os\n", "\n", "# install NeMo\n", - "BRANCH = 'r1.18.0'\n", + "BRANCH = 'main'\n", "\n", "GITHUB_ACCOUNT = 'NVIDIA' # change this if using a fork\n", "\n", From e1995497c9c5572da580b18eed8124a6a155c593 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Tue, 16 May 2023 09:50:01 -0700 Subject: [PATCH 55/62] Confidence ensembles: fix issues and add tuning functionality (#6657) * Implement compute confidence to properly handle blanks Signed-off-by: Igor Gitman * Implement proper confidence for transducers Signed-off-by: Igor Gitman * Implement tuning logic Signed-off-by: Igor Gitman * Add tests for confidence tuning Signed-off-by: Igor Gitman * Remove unused imports Signed-off-by: Igor Gitman * Add types/docs Signed-off-by: Igor Gitman * Add comment about the main conf compute loop Signed-off-by: Igor Gitman --------- Signed-off-by: Igor Gitman --- examples/asr/transcribe_speech.py | 5 +- .../asr/models/confidence_ensemble.py | 149 +++++- .../asr/parts/utils/asr_confidence_utils.py | 2 +- .../confidence_ensembles/build_ensemble.py | 475 +++++++++++++++--- .../test_confidence_ensembles.py | 18 +- 5 files changed, 567 insertions(+), 82 deletions(-) diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index 1c1d5c08199c..4a93e630876c 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -26,6 +26,7 @@ from nemo.collections.asr.models import EncDecCTCModel, EncDecHybridRNNTCTCModel from nemo.collections.asr.modules.conformer_encoder import ConformerChangeConfig from nemo.collections.asr.parts.utils.eval_utils import cal_write_wer +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.asr.parts.utils.transcribe_utils import ( compute_output_filename, prepare_audio_data, @@ -33,7 +34,6 @@ transcribe_partial_audio, write_transcription, ) -from nemo.collections.common.tokenizers.aggregate_tokenizer import AggregateTokenizer from nemo.core.config import hydra_runner from nemo.utils import logging @@ -169,8 +169,7 @@ class TranscriptionConfig: @hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig) -# just specifying List in the return type as otherwise it's too many things -def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List]: +def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis]]: logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') for key in cfg: 
diff --git a/nemo/collections/asr/models/confidence_ensemble.py b/nemo/collections/asr/models/confidence_ensemble.py index 34fe037e30b5..0a5441a1cd52 100644 --- a/nemo/collections/asr/models/confidence_ensemble.py +++ b/nemo/collections/asr/models/confidence_ensemble.py
@@ -12,25 +12,139 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from dataclasses import dataclass
 from typing import Dict, List, Optional, Union
 
 import joblib
 import numpy as np
 import torch
-from omegaconf import DictConfig, OmegaConf, open_dict
+from omegaconf import DictConfig, open_dict
 from pytorch_lightning import Trainer
 
 from nemo.collections.asr.models.asr_model import ASRModel
 from nemo.collections.asr.models.hybrid_rnnt_ctc_models import EncDecHybridRNNTCTCModel
-from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceConfig, get_confidence_aggregation_bank
+from nemo.collections.asr.parts.utils.asr_confidence_utils import (
+    ConfidenceConfig,
+    ConfidenceMethodConfig,
+    get_confidence_aggregation_bank,
+    get_confidence_measure_bank,
+)
 from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType
+from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis
 from nemo.core.classes import ModelPT
 from nemo.utils import model_utils
 
-__all__ = ['ConfidenceEnsembleModel']
+
+# frozen is required to allow hashing of this class and using it
+# as a dictionary key when running confidence tuning
+@dataclass(frozen=True)
+class ConfidenceSpec:
+    exclude_blank: bool
+    aggregation: str
+    confidence_type: str
+    alpha: float
+
+    def to_confidence_config(self) -> ConfidenceConfig:
+        """Converts confidence spec to the confidence config.
+
+        Internally, the tuning procedure uses these "spec" objects as they
+        are more aligned with how things are implemented. But when it's time
+        to save the models or call transcribe, we need to use the proper
+        object of type ``ConfidenceConfig``.
+        """
+        if self.confidence_type == 'max_prob':
+            name = 'max_prob'
+            entropy_type = 'tsallis'  # can be any
+            entropy_norm = 'lin'  # can be any
+        else:
+            name, entropy_type, entropy_norm = self.confidence_type.split("_")
+        return ConfidenceConfig(
+            exclude_blank=self.exclude_blank,
+            aggregation=self.aggregation,
+            method_cfg=ConfidenceMethodConfig(
+                name=name, entropy_type=entropy_type, temperature=self.alpha, entropy_norm=entropy_norm,
+            ),
+        )
+
+
+def get_filtered_logprobs(hypothesis: Hypothesis, exclude_blank: bool) -> torch.Tensor:
+    """Returns logprobs from the hypothesis object with optional blanks filter.
+
+    This function supports both CTC and Transducer hypotheses. Will place the
+    logprobs on GPU if it's available.
+
+    Args:
+        hypothesis: generated hypothesis as returned from the transcribe
+            method of the ASR model.
+        exclude_blank: whether to filter out all ``<blank>`` tokens.
+
+    Returns:
+        torch.Tensor: of shape [S, V], where S is (filtered) sequence length and
+            V is the vocabulary size.
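+
+    Example (an illustrative sketch; ``hyp`` is assumed to be a Hypothesis
+    returned by ``transcribe`` with alignments preserved)::
+
+        filtered_logprobs = get_filtered_logprobs(hyp, exclude_blank=True)
+        seq_len, vocab_size = filtered_logprobs.shape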
+ """ + if isinstance(hypothesis.alignments, list): # Transducer + filtered_logprobs = [] + for alignment in hypothesis.alignments: + for align_elem in alignment: + if exclude_blank and align_elem[1].item() != align_elem[0].shape[-1] - 1: + filtered_logprobs.append(align_elem[0]) + filtered_logprobs.append(align_elem[0]) + if not filtered_logprobs: # for the edge-case of all blanks + filtered_logprobs.append(align_elem[0]) + filtered_logprobs = torch.stack(filtered_logprobs) + if torch.cuda.is_available(): # by default logprobs are placed on cpu in nemo + filtered_logprobs = filtered_logprobs.cuda() + else: # CTC + logprobs = hypothesis.y_sequence + if torch.cuda.is_available(): # by default logprobs are placed on cpu in nemo + logprobs = logprobs.cuda() + if exclude_blank: # filtering blanks + labels = logprobs.argmax(dim=-1) + filtered_logprobs = logprobs[labels != logprobs.shape[1] - 1] + else: + filtered_logprobs = logprobs + return filtered_logprobs + + +def compute_confidence(hypothesis: Hypothesis, confidence_cfg: ConfidenceConfig) -> float: + """Computes confidence score of the full utterance from a given hypothesis. + + This is essentially a re-implementation of the built-in confidence + computation in NeMo. The difference is that we aggregate full-utterance + scores, while core functionality only supports word and token level + aggregations. + + Args: + hypothesis: generated hypothesis as returned from the transcribe + method of the ASR model. + confidence_cfg: confidence config specifying what kind of + measure/aggregation should be used. + + Returns: + float: confidence score. + + """ + filtered_logprobs = get_filtered_logprobs(hypothesis, confidence_cfg.exclude_blank) + vocab_size = filtered_logprobs.shape[1] + aggr_func = get_confidence_aggregation_bank()[confidence_cfg.aggregation] + if confidence_cfg.method_cfg.name == "max_prob": + conf_type = "max_prob" + alpha = 1.0 + else: + conf_type = f"entropy_{confidence_cfg.method_cfg.entropy_type}_{confidence_cfg.method_cfg.entropy_norm}" + alpha = confidence_cfg.method_cfg.temperature + conf_func = get_confidence_measure_bank()[conf_type] + + conf_value = aggr_func(conf_func(filtered_logprobs, v=vocab_size, t=alpha)).cpu().item() + return conf_value class ConfidenceEnsembleModel(ModelPT): + """Implementation of the confidence ensemble model. + + See for details. 
+ """ + def __init__( self, cfg: DictConfig, trainer: 'Trainer' = None, ): @@ -79,7 +193,7 @@ def __init__( self.model_selection_block = joblib.load(model_selection_block_path) self.confidence_cfg = ConfidenceConfig(**self.cfg.confidence) - # making sure each model has correct confidence settings in the decoder strategy + # making sure each model has correct temperature setting in the decoder strategy for model_idx in range(self.num_models): model = getattr(self, f"model{model_idx}") # for now we assume users are direclty responsible for matching @@ -94,11 +208,15 @@ def __init__( self.update_decoding_parameters(model.cfg.decoding) model.change_decoding_strategy(model.cfg.decoding) - def update_decoding_parameters(self, decoding_cfg): - """Updating confidence/temperature parameters of the config.""" + def update_decoding_parameters(self, decoding_cfg: DictConfig): + """Updating temperature/preserve_alignment/preserve_frame_confidence parameters of the config.""" with open_dict(decoding_cfg): - decoding_cfg.confidence_cfg = self.confidence_cfg decoding_cfg.temperature = self.cfg.temperature + decoding_cfg.preserve_alignments = True + if 'confidence_cfg' in decoding_cfg: + decoding_cfg.confidence_cfg.preserve_frame_confidence = True + else: + decoding_cfg.confidence_cfg = ConfidenceConfig(preserve_frame_confidence=True) def setup_training_data(self, train_data_config: Union[DictConfig, Dict]): """Pass-through to the ensemble models. @@ -122,13 +240,13 @@ def change_attention_model( self_attention_model, att_context_size, update_config ) - def change_decoding_strategy(self, decoding_cfg: DictConfig = None, decoder_type: str = None): + def change_decoding_strategy(self, decoding_cfg: Optional[DictConfig] = None, decoder_type: str = None): """Pass-through to the ensemble models. - The only change here is that we always require frame-confidence to - be returned. + The only change here is that we always require expected temperature + to be set as well as ``decoding_cfg.preserve_alignments = True`` """ - decoding_cfg.confidence_cfg = self.confidence_cfg + self.update_decoding_parameters(decoding_cfg) for model_idx in range(self.num_models): model = getattr(self, f"model{model_idx}") if isinstance(model, EncDecHybridRNNTCTCModel): @@ -157,8 +275,6 @@ def transcribe( 3. Use logistic regression to pick the "most confident" model 4. 
Return the output of that model """ - # TODO: lots of duplicate code with building ensemble script - aggr_func = get_confidence_aggregation_bank()[self.confidence_cfg.aggregation] confidences = [] all_transcriptions = [] # always requiring to return hypothesis @@ -181,12 +297,7 @@ def transcribe( model_confidences = [] for transcription in transcriptions: - if isinstance(transcription.frame_confidence[0], list): - # NeMo Transducer API returns list of lists for confidences - conf_values = [conf_value for confs in transcription.frame_confidence for conf_value in confs] - else: - conf_values = transcription.frame_confidence - model_confidences.append(aggr_func(conf_values)) + model_confidences.append(compute_confidence(transcription, self.confidence_cfg)) confidences.append(model_confidences) all_transcriptions.append(transcriptions) diff --git a/nemo/collections/asr/parts/utils/asr_confidence_utils.py b/nemo/collections/asr/parts/utils/asr_confidence_utils.py index a15428ee52df..1387f6940b38 100644 --- a/nemo/collections/asr/parts/utils/asr_confidence_utils.py +++ b/nemo/collections/asr/parts/utils/asr_confidence_utils.py @@ -312,7 +312,7 @@ def _aggregate_token_confidence_subwords_sentencepiece( raise RuntimeError( f"""Something went wrong with word-level confidence aggregation.\n Please check these values for debugging:\n - len(words): {len(hypothesis.words)},\n + len(words): {len(words)},\n len(word_confidence): {len(word_confidence)},\n recognized text: `{' '.join(words)}`""" ) diff --git a/scripts/confidence_ensembles/build_ensemble.py b/scripts/confidence_ensembles/build_ensemble.py index 9620b73aac87..07ceccb8b3d5 100644 --- a/scripts/confidence_ensembles/build_ensemble.py +++ b/scripts/confidence_ensembles/build_ensemble.py @@ -13,6 +13,7 @@ # limitations under the License. # # Run ``python build_ensemble.py --help`` for usage examples. +# TODO: write usage. 
Mention that neither train nor dev requires transcriptions import atexit @@ -22,9 +23,10 @@ import random import sys import tempfile -from dataclasses import dataclass, is_dataclass +from copy import deepcopy +from dataclasses import dataclass from pathlib import Path -from typing import List +from typing import Dict, List, Optional, Tuple import joblib import numpy as np @@ -32,15 +34,23 @@ from omegaconf import DictConfig, OmegaConf from sklearn.linear_model import LogisticRegression from sklearn.metrics import confusion_matrix -from sklearn.pipeline import make_pipeline +from sklearn.pipeline import Pipeline, make_pipeline from sklearn.preprocessing import StandardScaler +from tqdm import tqdm -from nemo.collections.asr.models.confidence_ensemble import ConfidenceEnsembleModel +from nemo.collections.asr.models.confidence_ensemble import ( + ConfidenceEnsembleModel, + ConfidenceSpec, + compute_confidence, + get_filtered_logprobs, +) from nemo.collections.asr.parts.utils.asr_confidence_utils import ( ConfidenceConfig, ConfidenceMethodConfig, get_confidence_aggregation_bank, + get_confidence_measure_bank, ) +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.core.config import hydra_runner LOG = logging.getLogger(__file__) @@ -70,7 +80,71 @@ class EnsembleConfig: # 100 is most likely enough, but setting higher default just in case max_training_samples: int = 1000 # specify to provide dev data manifest for HP tuning - # dev_manifest: Optional[str] = None + dev_manifest: Optional[str] = None + + +@dataclass +class TuneConfidenceConfig: + # important parameter, so should always be tuned + exclude_blank: Tuple[bool] = (True, False) + # prod is pretty much always worse, so not including by default + aggregation: Tuple[str] = ("mean", "min", "max") + # not including max prob, as there is always an entropy-based metric + # that's better but otherwise including everything + confidence_type: Tuple[str] = ( + "entropy_renui_exp", + "entropy_renui_lin", + "entropy_tsallis_exp", + "entropy_tsallis_lin", + "entropy_gibbs_lin", + "entropy_gibbs_exp", + ) + + # TODO: currently it's not possible to efficiently tune temperature, as we always + # apply log-softmax in the decoder, so to try different values it will be required + # to rerun the decoding, which is very slow. To support this for one-off experiments + # it's possible to modify the code of CTC decoder / Transducer joint to + # remove log-softmax and then apply it directly in this script with the temperature + # + # Alternatively, one can run this script multiple times with different values of + # temperature and pick the best performing ensemble. Note that this will increase + # tuning time by the number of temperature values tried. On the other hand, + # the above approach is a lot more efficient and will only slightly increase + # the total tuning runtime. 
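+    #
+    # A sketch of that multi-run alternative (``output_path`` and the exact
+    # temperature values are illustrative assumptions, not part of this config):
+    #
+    #     import subprocess
+    #     for t in (0.5, 1.0, 2.0):
+    #         subprocess.run(
+    #             ["python", "build_ensemble.py", f"temperature={t}", f"output_path=ensemble_t{t}.nemo"],
+    #             check=True,
+    #         )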
+
+    # very important to tune for max prob, but for entropy metrics 1.0 is almost always best
+    # temperature: Tuple[float] = (1.0,)
+
+    # not that important, but can sometimes make a small difference
+    alpha: Tuple[float] = (0.25, 0.33, 0.5, 1.0)
+
+    def get_grid_size(self) -> int:
+        """Returns the total number of points in the search space."""
+        if "max_prob" in self.confidence_type:
+            return (
+                len(self.exclude_blank)
+                * len(self.aggregation)
+                * ((len(self.confidence_type) - 1) * len(self.alpha) + 1)
+            )
+        return len(self.exclude_blank) * len(self.aggregation) * len(self.confidence_type) * len(self.alpha)
+
+
+@dataclass
+class TuneLogisticRegressionConfig:
+    # will have log-uniform grid over this range with that many points
+    # note that a value of 10000.0 (no regularization) is always added
+    C_num_points: int = 10
+    C_min: float = 0.0001
+    C_max: float = 10.0
+
+    # not too important
+    multi_class: Tuple[str] = ("ovr", "multinomial")
+
+    # should try to include weights directly if the data is too imbalanced
+    class_weight: Tuple = (None, "balanced")
+
+    # increase if getting many warnings that algorithm didn't converge
+    max_iter: int = 1000
 
 
 @dataclass
@@ -103,11 +177,60 @@ class BuildEnsembleConfig:
     # will be overridden by this script
     transcription: transcribe_speech.TranscriptionConfig = transcribe_speech.TranscriptionConfig()
 
-
-def calculate_score(features, labels, pipe):
+    # set to True to tune the confidence.
+    # requires dev manifests to be specified for each model
+    tune_confidence: bool = False
+    # used to specify what to tune over. By default runs tuning over some
+    # reasonable grid, so that it does not take forever.
+    # Can be changed as needed
+    tune_confidence_config: TuneConfidenceConfig = TuneConfidenceConfig()
+
+    # very fast to tune and can be important in case of imbalanced datasets
+    # will automatically set to False if dev data is not available
+    tune_logistic_regression: bool = True
+    tune_logistic_regression_config: TuneLogisticRegressionConfig = TuneLogisticRegressionConfig()
+
+    def __post_init__(self):
+        """Checking that if any dev data is provided, all are provided.
+
+        Will also auto-set tune_logistic_regression to False if no dev data
+        is available.
+
+        If tune_confidence is set to True (user choice) and no dev data is
+        provided, will raise an error.
+        """
+        num_dev_data = 0
+        for ensemble_cfg in self.ensemble:
+            num_dev_data += ensemble_cfg.dev_manifest is not None
+        if num_dev_data == 0:
+            if self.tune_confidence:
+                raise ValueError("tune_confidence is set to True, but no dev data is provided")
+            LOG.info("Setting tune_logistic_regression = False since no dev data is provided")
+            self.tune_logistic_regression = False
+            return
+
+        if num_dev_data < len(self.ensemble):
+            raise ValueError(
+                "Some ensemble configs specify dev data, but some don't. Either all have to specify it or none!"
+            )
+
+
+def calculate_score(features: np.ndarray, labels: np.ndarray, pipe: Pipeline) -> Tuple[float, np.ndarray]:
     """Score is always calculated as mean of the per-class scores.
 
     This is done to account for possible class imbalances.
+
+    Args:
+        features: numpy array of features of shape [N x D], where N is the
+            number of objects (typically a total number of utterances in
+            all datasets) and D is the total number of confidence scores
+            used to train the model (typically = number of models).
+        labels: numpy array of shape [N] containing ground-truth model indices.
+        pipe: classification pipeline (currently, standardization + logistic
+            regression).
+
+    Returns:
+        tuple: score value in [0, 1] and full classification confusion matrix.
     """
     predictions = pipe.predict(features)
     conf_m = confusion_matrix(labels, predictions)
@@ -116,30 +239,105 @@ def train_model_selection(
-    training_features,
-    training_labels,
-    multi_class="multinomial",
-    C=10000.0,  # disabling regularization by default as overfitting is likely not an issue
-    class_weight="balanced",  # in case training data is imbalanced
-    max_iter=1000,
-):
-    pipe = make_pipeline(
-        StandardScaler(),
-        LogisticRegression(multi_class=multi_class, C=C, max_iter=max_iter, class_weight=class_weight),
-    )
-    pipe.fit(training_features, training_labels)
-
-    accuracy, confusion = calculate_score(training_features, training_labels, pipe)
-
-    LOG.info("Training fit accuracy: %.4f", accuracy * 100.0)
-    LOG.info("Training confusion matrix:\n%s", str(confusion))
-    return pipe
-
+    training_features: np.ndarray,
+    training_labels: np.ndarray,
+    dev_features: Optional[np.ndarray] = None,
+    dev_labels: Optional[np.ndarray] = None,
+    tune_lr: bool = False,
+    tune_lr_cfg: Optional[TuneLogisticRegressionConfig] = None,
+    verbose: bool = False,
+) -> Tuple[Pipeline, float]:
+    """Trains model selection block with an (optional) tuning of the parameters.
+
+    Returns a pipeline consisting of feature standardization and logistic
+    regression. If tune_lr is set to True, dev features/labels will be used
+    to tune the hyperparameters of the logistic regression with the grid
+    search that's defined via ``tune_lr_cfg``.
+
+    If no tuning is requested, uses the following parameters::
+
+        best_pipe = make_pipeline(
+            StandardScaler(),
+            LogisticRegression(
+                multi_class="multinomial",
+                C=10000.0,
+                max_iter=1000,
+                class_weight="balanced",
+            ),
+        )
-def subsample_manifest(manifest_file, max_samples):
+
+    Args:
+        training_features: numpy array of features of shape [N x D], where N is
+            the number of objects (typically a total number of utterances in
+            all training datasets) and D is the total number of confidence
+            scores used to train the model (typically = number of models).
+        training_labels: numpy array of shape [N] containing ground-truth
+            model indices.
+        dev_features: same as training, but for the validation subset.
+        dev_labels: same as training, but for the validation subset.
+        tune_lr: controls whether tuning of LR hyperparameters is performed.
+            If set to True, it's required to also provide dev features/labels.
+        tune_lr_cfg: specifies what values of LR hyperparameters to try.
+        verbose: if True, will output final training/dev scores.
+
+    Returns:
+        tuple: trained model selection pipeline, best score (or -1 if no tuning
+            was done).
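+
+    Example (an illustrative sketch with random features for a 2-model setup)::
+
+        features = np.random.rand(100, 2)
+        labels = np.random.randint(0, 2, size=100)
+        pipe, score = train_model_selection(features, labels)  # no tuning: score == -1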
+ """ + if not tune_lr: + # default parameters: C=10000.0 disables regularization + best_pipe = make_pipeline( + StandardScaler(), + LogisticRegression(multi_class="multinomial", C=10000.0, max_iter=1000, class_weight="balanced"), + ) + max_score = -1 + else: + C_pms = np.append( + np.exp(np.linspace(np.log(tune_lr_cfg.C_min), np.log(tune_lr_cfg.C_max), tune_lr_cfg.C_num_points)), + 10000.0, + ) + max_score = 0 + best_pipe = None + for class_weight in tune_lr_cfg.class_weight: + for multi_class in tune_lr_cfg.multi_class: + for C in C_pms: + pipe = make_pipeline( + StandardScaler(), + LogisticRegression( + multi_class=multi_class, C=C, max_iter=tune_lr_cfg.max_iter, class_weight=class_weight + ), + ) + pipe.fit(training_features, training_labels) + score, confusion = calculate_score(dev_features, dev_labels, pipe) + if score > max_score: + max_score = score + best_pipe = pipe + + best_pipe.fit(training_features, training_labels) + if verbose: + accuracy, confusion = calculate_score(training_features, training_labels, best_pipe) + LOG.info("Training fit accuracy: %.4f", accuracy * 100.0) + LOG.info("Training confusion matrix:\n%s", str(confusion)) + if dev_features is not None and verbose: + accuracy, confusion = calculate_score(dev_features, dev_labels, best_pipe) + LOG.info("Dev fit accuracy: %.4f", accuracy * 100.0) + LOG.info("Dev confusion matrix:\n%s", str(confusion)) + + return best_pipe, max_score + + +def subsample_manifest(manifest_file: str, max_samples: int) -> str: """Will save a subsampled version of the manifest to the same folder. Have to save to the same folder to support relative paths. + + Args: + manifest_file: path to the manifest file that needs subsampling. + max_samples: how many samples to retain. Will randomly select that + many lines from the manifest. + + Returns: + str: the path to the subsampled manifest file. """ with open(manifest_file, "rt", encoding="utf-8") as fin: lines = fin.readlines() @@ -151,11 +349,115 @@ def subsample_manifest(manifest_file, max_samples): return output_file -def cleanup_subsampled_manifests(subsampled_manifests): +def cleanup_subsampled_manifests(subsampled_manifests: List[str]): + """Removes all generated subsamples manifests.""" for manifest in subsampled_manifests: os.remove(manifest) +def compute_all_confidences( + hypothesis: Hypothesis, tune_confidence_cfg: TuneConfidenceConfig +) -> Dict[ConfidenceSpec, float]: + """Computes a set of confidence scores from a given hypothesis. + + Works with the output of both CTC and Transducer decoding. + + Args: + hypothesis: generated hypothesis as returned from the transcribe + method of the ASR model. + tune_confidence_cfg: config specifying what confidence scores to + compute. + + Returns: + dict: dictionary with confidenct spec -> confidence score mapping. 
+ """ + conf_values = {} + + for exclude_blank in tune_confidence_cfg.exclude_blank: + filtered_logprobs = get_filtered_logprobs(hypothesis, exclude_blank) + vocab_size = filtered_logprobs.shape[1] + for aggregation in tune_confidence_cfg.aggregation: + aggr_func = get_confidence_aggregation_bank()[aggregation] + for conf_type in tune_confidence_cfg.confidence_type: + conf_func = get_confidence_measure_bank()[conf_type] + if conf_type == "max_prob": # skipping alpha in this case + conf_value = aggr_func(conf_func(filtered_logprobs, v=vocab_size, t=1.0)).cpu().item() + conf_values[ConfidenceSpec(exclude_blank, aggregation, conf_type, 1.0)] = conf_value + else: + for alpha in tune_confidence_cfg.alpha: + conf_value = aggr_func(conf_func(filtered_logprobs, v=vocab_size, t=alpha)).cpu().item() + conf_values[ConfidenceSpec(exclude_blank, aggregation, conf_type, alpha)] = conf_value + + return conf_values + + +def find_best_confidence( + train_confidences: List[List[Dict[ConfidenceSpec, float]]], + train_labels: List[int], + dev_confidences: List[List[Dict[ConfidenceSpec, float]]], + dev_labels: List[int], + tune_lr: bool, + tune_lr_config: TuneConfidenceConfig, +) -> Tuple[ConfidenceConfig, Pipeline]: + """Finds the best confidence configuration for model selection. + + Will loop over all values in the confidence dictionary and fit the LR + model (optionally tuning its HPs). The best performing confidence (on the + dev set) will be used for the final LR model. + + Args: + train_confidences: this is an object of type + ``List[List[Dict[ConfidenceSpec, float]]]``. The shape of this + object is [M, N, S], where + M: number of models + N: number of utterances in all training sets + S: number of confidence scores to try + + This argument will be used to construct np.array objects for each + of the confidence scores with the shape [M, N] + + train_labels: ground-truth labels of the correct model for each data + points. This is a list of size [N] + dev_confidences: same as training, but for the validation subset. + dev_labels: same as training, but for the validation subset. + tune_lr: controls whether tuning of LR hyperparameters is performed. + tune_lr_cfg: specifies what values of LR hyperparameters to try. + + Returns: + tuple: best confidence config, best model selection pipeline + """ + max_score = 0 + best_pipe = None + best_conf_spec = None + LOG.info("Evaluation all confidences. Total grid size: %d", len(train_confidences[0][0].keys())) + for conf_spec in tqdm(train_confidences[0][0].keys()): + cur_train_confidences = [] + for model_confs in train_confidences: + cur_train_confidences.append([]) + for model_conf in model_confs: + cur_train_confidences[-1].append(model_conf[conf_spec]) + cur_dev_confidences = [] + for model_confs in dev_confidences: + cur_dev_confidences.append([]) + for model_conf in model_confs: + cur_dev_confidences[-1].append(model_conf[conf_spec]) + # transposing with zip(*list) + training_features = np.array(list(zip(*cur_train_confidences))) + training_labels = np.array(train_labels) + dev_features = np.array(list(zip(*cur_dev_confidences))) + dev_labels = np.array(dev_labels) + pipe, score = train_model_selection( + training_features, training_labels, dev_features, dev_labels, tune_lr, tune_lr_config, + ) + if max_score < score: + max_score = score + best_pipe = pipe + best_conf_spec = conf_spec + LOG.info("Found better parameters: %s. 
New score: %.4f", str(conf_spec), max_score) + + return best_conf_spec.to_confidence_config(), best_pipe + + @hydra_runner(schema=BuildEnsembleConfig) def main(cfg: BuildEnsembleConfig): # silencing all messages from nemo/ptl to avoid dumping tons of configs to the stdout @@ -163,25 +465,23 @@ def main(cfg: BuildEnsembleConfig): logging.getLogger('nemo_logger').setLevel(logging.CRITICAL) LOG.info(f'Build ensemble config:\n{OmegaConf.to_yaml(cfg)}') - if is_dataclass(cfg): - cfg = OmegaConf.structured(cfg) - - # no matter what's in the config, frame confidence is required - cfg.confidence.preserve_frame_confidence = True + # to ensure post init is called + cfg = BuildEnsembleConfig(**cfg) pl.seed_everything(cfg.random_seed) cfg.transcription.random_seed = None # seed is already applied cfg.transcription.return_transcriptions = True - cfg.transcription.ctc_decoding.confidence_cfg = cfg.confidence - cfg.transcription.rnnt_decoding.confidence_cfg = cfg.confidence + # that sets preserve_alignment to True + cfg.transcription.compute_timestamps = True cfg.transcription.ctc_decoding.temperature = cfg.temperature cfg.transcription.rnnt_decoding.temperature = cfg.temperature + # this ensures that generated output is after log-softmax for consistency with CTC + cfg.transcription.rnnt_decoding.confidence_cfg.preserve_frame_confidence = True - aggregations = get_confidence_aggregation_bank() - aggr_func = aggregations[cfg.confidence.aggregation] - - confidences = [] - labels = [] + train_confidences = [] + dev_confidences = [] + train_labels = [] + dev_labels = [] # registering clean-up function that will hold on to this list and # should clean up even if there is partial error in some of the transcribe @@ -191,8 +491,19 @@ def main(cfg: BuildEnsembleConfig): # note that we loop over the same config. # This is intentional, as we need to run all models on all datasets + # this loop will do the following things: + # 1. Goes through each model X each training dataset + # 2. Computes predictions by directly calling transcribe_speech.main + # 3. Converts transcription to the confidence score(s) as specified in the config + # 4. If dev sets are provided, computes the same for them + # 5. Creates a list of ground-truth model indices by mapping each model + # to its own training dataset as specified in the config. + # 6. After the loop, we either run tuning over all confidence scores or + # directly use a single score to fit logistic regression and save the + # final ensemble model. 
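+    # For example, with 2 models and 100 (subsampled) training utterances per
+    # dataset, the loop below produces (a sketch of the shapes, not extra logic):
+    #     train_confidences: [2][200] floats, or dicts of
+    #                        ConfidenceSpec -> float when tune_confidence is set
+    #     train_labels:      [200] ints in {0, 1}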
for model_idx, model_cfg in enumerate(cfg.ensemble): - model_confidences = [] + train_model_confidences = [] + dev_model_confidences = [] for data_idx, data_cfg in enumerate(cfg.ensemble): if model_idx == 0: # generating subsampled manifests only one time subsampled_manifests.append( @@ -207,27 +518,75 @@ def main(cfg: BuildEnsembleConfig): cfg.transcription.dataset_manifest = subsampled_manifest + # training with tempfile.NamedTemporaryFile() as output_file: cfg.transcription.output_filename = output_file.name - LOG.info("Transcribing dataset %d with model %d", data_idx, model_idx) - transcriptions = transcribe_speech.main(cfg.transcription.copy()) - - for transcription in transcriptions: - if isinstance(transcription.frame_confidence[0], list): - # NeMo Transducer API returns list of lists for confidences - conf_values = [conf_value for confs in transcription.frame_confidence for conf_value in confs] + LOG.info("Transcribing training dataset %d with model %d", data_idx, model_idx) + transcriptions = transcribe_speech.main(deepcopy(cfg.transcription)) + LOG.info("Generating confidence scores") + # TODO: parallelize this loop? + for transcription in tqdm(transcriptions): + if cfg.tune_confidence: + train_model_confidences.append( + compute_all_confidences(transcription, cfg.tune_confidence_config) + ) else: - conf_values = transcription.frame_confidence - model_confidences.append(aggr_func(conf_values)) + train_model_confidences.append(compute_confidence(transcription, cfg.confidence)) if model_idx == 0: # labels are the same for all models - labels.append(data_idx) - - confidences.append(model_confidences) + train_labels.append(data_idx) + + # optional dev + if data_cfg.dev_manifest is not None: + cfg.transcription.dataset_manifest = data_cfg.dev_manifest + with tempfile.NamedTemporaryFile() as output_file: + cfg.transcription.output_filename = output_file.name + LOG.info("Transcribing dev dataset %d with model %d", data_idx, model_idx) + transcriptions = transcribe_speech.main(deepcopy(cfg.transcription)) + LOG.info("Generating confidence scores") + for transcription in tqdm(transcriptions): + if cfg.tune_confidence: + dev_model_confidences.append( + compute_all_confidences(transcription, cfg.tune_confidence_config) + ) + else: + dev_model_confidences.append(compute_confidence(transcription, cfg.confidence)) + if model_idx == 0: # labels are the same for all models + dev_labels.append(data_idx) + + train_confidences.append(train_model_confidences) + if dev_model_confidences: + dev_confidences.append(dev_model_confidences) + + if cfg.tune_confidence: + best_confidence, model_selection_block = find_best_confidence( + train_confidences, + train_labels, + dev_confidences, + dev_labels, + cfg.tune_logistic_regression, + cfg.tune_logistic_regression_config, + ) + else: + best_confidence = cfg.confidence + # transposing with zip(*list) + training_features = np.array(list(zip(*train_confidences))) + training_labels = np.array(train_labels) + if dev_confidences: + dev_features = np.array(list(zip(*dev_confidences))) + dev_labels = np.array(dev_labels) + else: + dev_features = None + dev_labels = None + model_selection_block, _ = train_model_selection( + training_features, + training_labels, + dev_features, + dev_labels, + cfg.tune_logistic_regression, + cfg.tune_logistic_regression_config, + verbose=True, + ) - # transposing with zip(*list) - training_features = np.array(list(zip(*confidences))) - training_labels = np.array(labels) - model_selection_block = 
train_model_selection(training_features, training_labels) with tempfile.TemporaryDirectory() as tmpdir: model_selection_block_path = os.path.join(tmpdir, 'model_selection_block.pkl') joblib.dump(model_selection_block, model_selection_block_path) @@ -237,7 +596,7 @@ def main(cfg: BuildEnsembleConfig): cfg=DictConfig( { 'model_selection_block': model_selection_block_path, - 'confidence': cfg.confidence, + 'confidence': best_confidence, 'temperature': cfg.temperature, 'load_models': [model_cfg.model for model_cfg in cfg.ensemble], } diff --git a/scripts/confidence_ensembles/test_confidence_ensembles.py b/scripts/confidence_ensembles/test_confidence_ensembles.py index 3e225384de92..b665375c0c33 100644 --- a/scripts/confidence_ensembles/test_confidence_ensembles.py +++ b/scripts/confidence_ensembles/test_confidence_ensembles.py @@ -37,9 +37,23 @@ ( "ensemble.0.model=stt_es_fastconformer_hybrid_large_pc " "ensemble.1.model=stt_it_fastconformer_hybrid_large_pc " - "transcription.decoder_type=ctc" + "transcription.decoder_type=ctc " ), "ensemble.0.model=stt_es_conformer_ctc_large ensemble.1.model=stt_it_conformer_transducer_large", + ( + "ensemble.0.model=stt_es_conformer_ctc_large " + "ensemble.1.model=stt_it_conformer_ctc_large " + f"ensemble.0.dev_manifest={Path(os.getenv('TEST_DATA_PATH', '')) / 'es' / 'dev_manifest.json'} " + f"ensemble.1.dev_manifest={Path(os.getenv('TEST_DATA_PATH', '')) / 'it' / 'dev_manifest.json'} " + "tune_confidence=True " + ), + ( + "ensemble.0.model=stt_es_conformer_transducer_large " + "ensemble.1.model=stt_it_conformer_transducer_large " + f"ensemble.0.dev_manifest={Path(os.getenv('TEST_DATA_PATH', '')) / 'es' / 'dev_manifest.json'} " + f"ensemble.1.dev_manifest={Path(os.getenv('TEST_DATA_PATH', '')) / 'it' / 'dev_manifest.json'} " + "tune_confidence=True " + ), ], ids=( [ @@ -48,6 +62,8 @@ "Hybrid models (Transducer mode)", "Hybrid models (CTC mode)", "CTC + Transducer", + "CTC models + confidence tuning", + "Transducer models + confidence tuning", ] ), ) From 29c8ac39c9047b965b248569328d6217bf5f5e4d Mon Sep 17 00:00:00 2001 From: Ryan Langman Date: Tue, 16 May 2023 09:50:13 -0700 Subject: [PATCH 56/62] [TTS] Implement new TextToSpeech dataset (#6575) * [TTS] Implement new TextToSpeech dataset Signed-off-by: Ryan * [TTS] Add unit tests Signed-off-by: Ryan * [TTS] Fix defaulting of use_log_energy Signed-off-by: Ryan * [TTS] Fix TTS export test Signed-off-by: Ryan --------- Signed-off-by: Ryan --- .../tts/conf/fastpitch/fastpitch_22050.yaml | 220 +++++++++++++ .../{features => feature}/feature_22050.yaml | 18 +- .../{features => feature}/feature_44100.yaml | 18 +- .../tts/data/text_to_speech_dataset.py | 297 ++++++++++++++++++ nemo/collections/tts/models/fastpitch.py | 77 +++-- nemo/collections/tts/modules/fastpitch.py | 6 +- .../tts/parts/preprocessing/features.py | 86 ++++- .../tts/parts/utils/tts_dataset_utils.py | 81 ++++- .../tts/parts/utils/test_tts_dataset_utils.py | 59 +++- 9 files changed, 802 insertions(+), 60 deletions(-) create mode 100644 examples/tts/conf/fastpitch/fastpitch_22050.yaml rename examples/tts/conf/{features => feature}/feature_22050.yaml (61%) rename examples/tts/conf/{features => feature}/feature_44100.yaml (61%) create mode 100644 nemo/collections/tts/data/text_to_speech_dataset.py diff --git a/examples/tts/conf/fastpitch/fastpitch_22050.yaml b/examples/tts/conf/fastpitch/fastpitch_22050.yaml new file mode 100644 index 000000000000..016e157ce39f --- /dev/null +++ b/examples/tts/conf/fastpitch/fastpitch_22050.yaml @@ -0,0 +1,220 @@ +# 
This config contains the default values for training a FastPitch model with aligner. +# If you want to train a model on other dataset, you can change config values according to your dataset. +# Most dataset-specific arguments are in the head of the config file, see below. + +name: FastPitch + +max_epochs: ??? +batch_size: 32 +weighted_sample_steps: null + +n_speakers: ??? +speaker_path: null +feature_stats_path: null + +train_ds_meta: ??? +val_ds_meta: ??? + +phoneme_dict_path: ??? +heteronyms_path: ??? + +defaults: + - feature: feature_22050 + +model: + learn_alignment: true + bin_loss_warmup_epochs: 100 + + n_speakers: ${n_speakers} + n_mel_channels: ${feature.mel_feature.mel_dim} + max_token_duration: 75 + symbols_embedding_dim: 384 + pitch_embedding_kernel_size: 3 + energy_embedding_kernel_size: 3 + speaker_emb_condition_prosody: true + speaker_emb_condition_aligner: true + use_log_energy: false + dur_loss_scale: 0.1 + pitch_loss_scale: 0.1 + energy_loss_scale: 0.1 + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + features: ${feature.mel_feature.mel_dim} + lowfreq: ${feature.mel_feature.lowfreq} + highfreq: ${feature.mel_feature.highfreq} + n_fft: ${feature.win_length} + n_window_size: ${feature.win_length} + window_size: false + n_window_stride: ${feature.hop_length} + window_stride: false + pad_to: 1 + pad_value: 0 + sample_rate: ${feature.sample_rate} + window: hann + normalize: null + preemph: null + dither: 0.0 + frame_splicing: 1 + log: true + log_zero_guard_type: add + log_zero_guard_value: 1.0 + mag_power: 1.0 + mel_norm: null + + text_tokenizer: + _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer + punct: true + apostrophe: true + pad_with_space: true + g2p: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: ${phoneme_dict_path} + heteronyms: ${heteronyms_path} + phoneme_probability: 0.8 + # Relies on the heteronyms list for anything that needs to be disambiguated + ignore_ambiguous_words: false + use_chars: true + use_stresses: true + + pitch_processor: + _target_: nemo.collections.tts.parts.preprocessing.feature_processors.MeanVarianceSpeakerNormalization + field: pitch + stats_path: ${feature_stats_path} + + energy_processor: + _target_: nemo.collections.tts.parts.preprocessing.feature_processors.MeanVarianceSpeakerNormalization + field: energy + stats_path: ${feature_stats_path} + + align_prior_config: + _target_: nemo.collections.tts.data.text_to_speech_dataset.AlignPriorConfig + hop_length: ${feature.hop_length} + use_beta_binomial_interpolator: false + + train_ds: + dataset: + _target_: nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset + dataset_meta: ${train_ds_meta} + weighted_sample_steps: ${weighted_sample_steps} + sample_rate: ${feature.sample_rate} + speaker_path: ${speaker_path} + featurizers: ${feature.featurizers} + feature_processors: + pitch: ${model.pitch_processor} + energy: ${model.energy_processor} + align_prior_config: ${model.align_prior_config} + min_duration: 0.1 + max_duration: 10.0 + + dataloader_params: + batch_size: ${batch_size} + drop_last: true + num_workers: 8 + + validation_ds: + dataset: + _target_: nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset + dataset_meta: ${val_ds_meta} + sample_rate: ${feature.sample_rate} + speaker_path: ${speaker_path} + featurizers: ${feature.featurizers} + feature_processors: + pitch: ${model.pitch_processor} + energy: ${model.energy_processor} + align_prior_config: 
${model.align_prior_config} + + dataloader_params: + batch_size: ${batch_size} + drop_last: false + num_workers: 2 + + input_fft: + _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder + n_layer: 6 + n_head: 2 + d_model: ${model.symbols_embedding_dim} + d_head: 64 + d_inner: 1536 + kernel_size: 3 + dropout: 0.1 + dropatt: 0.1 + dropemb: 0.0 + d_embed: ${model.symbols_embedding_dim} + + output_fft: + _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder + n_layer: 6 + n_head: 1 + d_model: ${model.symbols_embedding_dim} + d_head: 64 + d_inner: 1536 + kernel_size: 3 + dropout: 0.1 + dropatt: 0.1 + dropemb: 0.0 + + alignment_module: + _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder + n_text_channels: ${model.symbols_embedding_dim} + + duration_predictor: + _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor + input_size: ${model.symbols_embedding_dim} + kernel_size: 3 + filter_size: 256 + dropout: 0.1 + n_layers: 2 + + pitch_predictor: + _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor + input_size: ${model.symbols_embedding_dim} + kernel_size: 3 + filter_size: 256 + dropout: 0.1 + n_layers: 2 + + energy_predictor: + _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor + input_size: ${model.symbols_embedding_dim} + kernel_size: 3 + filter_size: 256 + dropout: 0.1 + n_layers: 2 + + optim: + name: adamw + lr: 1e-3 + betas: [0.9, 0.999] + weight_decay: 1e-6 + + sched: + name: NoamAnnealing + warmup_steps: 1000 + last_epoch: -1 + d_model: 1 # Disable scaling based on model dim + +trainer: + num_nodes: 1 + devices: 1 + accelerator: gpu + strategy: ddp + precision: 16 + max_epochs: ${max_epochs} + accumulate_grad_batches: 1 + gradient_clip_val: 10.0 + enable_checkpointing: false # Provided by exp_manager + logger: false # Provided by exp_manager + log_every_n_steps: 100 + check_val_every_n_epoch: 10 + benchmark: false + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + resume_if_exists: false + resume_ignore_no_checkpoint: false diff --git a/examples/tts/conf/features/feature_22050.yaml b/examples/tts/conf/feature/feature_22050.yaml similarity index 61% rename from examples/tts/conf/features/feature_22050.yaml rename to examples/tts/conf/feature/feature_22050.yaml index c5779500bc3c..1b159bc66ddf 100644 --- a/examples/tts/conf/features/feature_22050.yaml +++ b/examples/tts/conf/feature/feature_22050.yaml @@ -4,25 +4,25 @@ hop_length: 256 mel_feature: _target_: nemo.collections.tts.parts.preprocessing.features.MelSpectrogramFeaturizer - sample_rate: ${sample_rate} - win_length: ${win_length} - hop_length: ${hop_length} + sample_rate: ${..sample_rate} + win_length: ${..win_length} + hop_length: ${..hop_length} mel_dim: 80 lowfreq: 0 highfreq: 8000 pitch_feature: _target_: nemo.collections.tts.parts.preprocessing.features.PitchFeaturizer - sample_rate: ${sample_rate} - win_length: ${win_length} - hop_length: ${hop_length} + sample_rate: ${..sample_rate} + win_length: ${..win_length} + hop_length: ${..hop_length} pitch_fmin: 60 pitch_fmax: 640 energy_feature: _target_: nemo.collections.tts.parts.preprocessing.features.EnergyFeaturizer - spec_featurizer: ${mel_feature} + spec_featurizer: ${..mel_feature} featurizers: - pitch: ${pitch_feature} - energy: ${energy_feature} + pitch: ${..pitch_feature} + energy: ${..energy_feature} diff --git a/examples/tts/conf/features/feature_44100.yaml 
b/examples/tts/conf/feature/feature_44100.yaml similarity index 61% rename from examples/tts/conf/features/feature_44100.yaml rename to examples/tts/conf/feature/feature_44100.yaml index 0cfc27f4dab3..e852a93a2d6c 100644 --- a/examples/tts/conf/features/feature_44100.yaml +++ b/examples/tts/conf/feature/feature_44100.yaml @@ -4,25 +4,25 @@ hop_length: 512 mel_feature: _target_: nemo.collections.tts.parts.preprocessing.features.MelSpectrogramFeaturizer - sample_rate: ${sample_rate} - win_length: ${win_length} - hop_length: ${hop_length} + sample_rate: ${..sample_rate} + win_length: ${..win_length} + hop_length: ${..hop_length} mel_dim: 80 lowfreq: 0 highfreq: null pitch_feature: _target_: nemo.collections.tts.parts.preprocessing.features.PitchFeaturizer - sample_rate: ${sample_rate} - win_length: ${win_length} - hop_length: ${hop_length} + sample_rate: ${..sample_rate} + win_length: ${..win_length} + hop_length: ${..hop_length} pitch_fmin: 60 pitch_fmax: 640 energy_feature: _target_: nemo.collections.tts.parts.preprocessing.features.EnergyFeaturizer - spec_featurizer: ${mel_feature} + spec_featurizer: ${..mel_feature} featurizers: - pitch: ${pitch_feature} - energy: ${energy_feature} + pitch: ${..pitch_feature} + energy: ${..energy_feature} diff --git a/nemo/collections/tts/data/text_to_speech_dataset.py b/nemo/collections/tts/data/text_to_speech_dataset.py new file mode 100644 index 000000000000..f6230fa3493a --- /dev/null +++ b/nemo/collections/tts/data/text_to_speech_dataset.py @@ -0,0 +1,297 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional + +import librosa +import torch.utils.data + +from nemo.collections.asr.parts.utils.manifest_utils import read_manifest +from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import BaseTokenizer +from nemo.collections.tts.parts.preprocessing.feature_processors import FeatureProcessor +from nemo.collections.tts.parts.preprocessing.features import Featurizer +from nemo.collections.tts.parts.utils.tts_dataset_utils import ( + BetaBinomialInterpolator, + beta_binomial_prior_distribution, + filter_dataset_by_duration, + get_abs_rel_paths, + get_weighted_sampler, + stack_tensors, +) +from nemo.core.classes import Dataset +from nemo.utils import logging +from nemo.utils.decorators import experimental + + +@dataclass +class DatasetMeta: + manifest_path: Path + audio_dir: Path + feature_dir: Path + sample_weight: float = 1.0 + + +@dataclass +class DatasetSample: + manifest_entry: Dict[str, Any] + audio_dir: Path + feature_dir: Path + text: str + speaker: str + speaker_index: int = None + + +@dataclass +class AlignPriorConfig: + hop_length: int + use_beta_binomial_interpolator: bool = False + + +@experimental +class TextToSpeechDataset(Dataset): + """ + Class for processing and loading text to speech training examples. 
+
+    Args:
+        dataset_meta: Dict of dataset names (string) to dataset metadata.
+        sample_rate: Sample rate to load audio as. If the audio is stored at a different sample rate, then it will
+            be resampled.
+        text_tokenizer: Tokenizer to apply to the text field.
+        weighted_sample_steps: Optional int. If provided, then data will be sampled (with replacement) based on
+            the sample weights provided in the dataset metadata. If None, then sample weights will be ignored.
+        speaker_path: Optional, path to JSON file with speaker indices, for multi-speaker training. Can be created with
+            scripts.dataset_processing.tts.create_speaker_map.py
+        featurizers: Optional, list of featurizers to load feature data from. Should be the same config provided
+            when running scripts.dataset_processing.tts.compute_features.py before training.
+        feature_processors: Optional, list of feature processors to run on training examples.
+        align_prior_config: Optional, if provided alignment prior will be calculated and included in
+            batch output.
+        min_duration: Optional float, if provided audio files in the training manifest shorter than 'min_duration'
+            will be ignored.
+        max_duration: Optional float, if provided audio files in the training manifest longer than 'max_duration'
+            will be ignored.
+    """
+
+    def __init__(
+        self,
+        dataset_meta: Dict[str, DatasetMeta],
+        sample_rate: int,
+        text_tokenizer: BaseTokenizer,
+        weighted_sample_steps: Optional[int] = None,
+        speaker_path: Optional[Path] = None,
+        featurizers: Optional[Dict[str, Featurizer]] = None,
+        feature_processors: Optional[Dict[str, FeatureProcessor]] = None,
+        align_prior_config: Optional[AlignPriorConfig] = None,
+        min_duration: Optional[float] = None,
+        max_duration: Optional[float] = None,
+    ):
+        super().__init__()
+
+        self.sample_rate = sample_rate
+        self.text_tokenizer = text_tokenizer
+        self.weighted_sample_steps = weighted_sample_steps
+
+        if speaker_path:
+            self.include_speaker = True
+            with open(speaker_path, 'r', encoding="utf-8") as speaker_f:
+                speaker_index_map = json.load(speaker_f)
+        else:
+            self.include_speaker = False
+            speaker_index_map = None
+
+        if featurizers:
+            logging.info(f"Found featurizers {featurizers.keys()}")
+            self.featurizers = featurizers.values()
+        else:
+            self.featurizers = []
+
+        if feature_processors:
+            logging.info(f"Found feature processors {feature_processors.keys()}")
+            self.feature_processors = feature_processors.values()
+        else:
+            self.feature_processors = []
+
+        self.align_prior_config = align_prior_config
+        # guard against align_prior_config=None, which is a valid default
+        if self.align_prior_config and self.align_prior_config.use_beta_binomial_interpolator:
+            self.beta_binomial_interpolator = BetaBinomialInterpolator()
+        else:
+            self.beta_binomial_interpolator = None
+
+        self.data_samples = []
+        self.sample_weights = []
+        for dataset_name, dataset in dataset_meta.items():
+            samples, weights = self._process_dataset(
+                dataset_name=dataset_name,
+                dataset=dataset,
+                min_duration=min_duration,
+                max_duration=max_duration,
+                speaker_index_map=speaker_index_map,
+            )
+            self.data_samples += samples
+            self.sample_weights += weights
+
+    def get_sampler(self, batch_size: int) -> Optional[torch.utils.data.Sampler]:
+        if not self.weighted_sample_steps:
+            return None
+
+        sampler = get_weighted_sampler(
+            sample_weights=self.sample_weights, batch_size=batch_size, num_steps=self.weighted_sample_steps
+        )
+        return sampler
+
+    def _process_dataset(
+        self,
+        dataset_name: str,
+        dataset: DatasetMeta,
+        min_duration: float,
+        max_duration: float,
+        speaker_index_map: Dict[str, int],
+    ):
+        entries =
read_manifest(dataset.manifest_path) + filtered_entries, total_hours, filtered_hours = filter_dataset_by_duration( + entries=entries, min_duration=min_duration, max_duration=max_duration + ) + + logging.info(dataset_name) + logging.info(f"Original # of files: {len(entries)}") + logging.info(f"Filtered # of files: {len(filtered_entries)}") + logging.info(f"Original duration: {total_hours} hours") + logging.info(f"Filtered duration: {filtered_hours} hours") + + samples = [] + sample_weights = [] + for entry in filtered_entries: + + if "normalized_text" in entry: + text = entry["normalized_text"] + else: + text = entry["text"] + + if self.include_speaker: + speaker = entry["speaker"] + speaker_index = speaker_index_map[speaker] + else: + speaker = None + speaker_index = 0 + + sample = DatasetSample( + manifest_entry=entry, + audio_dir=dataset.audio_dir, + feature_dir=dataset.feature_dir, + text=text, + speaker=speaker, + speaker_index=speaker_index, + ) + samples.append(sample) + sample_weights.append(dataset.sample_weight) + + return samples, sample_weights + + def __len__(self): + return len(self.data_samples) + + def __getitem__(self, index): + data = self.data_samples[index] + + audio_filepath = Path(data.manifest_entry["audio_filepath"]) + audio_path, _ = get_abs_rel_paths(input_path=audio_filepath, base_path=data.audio_dir) + + audio, _ = librosa.load(audio_path, sr=self.sample_rate) + tokens = self.text_tokenizer(data.text) + + example = {"audio": audio, "tokens": tokens} + + if data.speaker is not None: + example["speaker"] = data.speaker + example["speaker_index"] = data.speaker_index + + if self.align_prior_config: + text_len = len(tokens) + spec_len = 1 + librosa.core.samples_to_frames( + audio.shape[0], hop_length=self.align_prior_config.hop_length + ) + if self.beta_binomial_interpolator: + align_prior = self.beta_binomial_interpolator(w=spec_len, h=text_len) + else: + align_prior = beta_binomial_prior_distribution(phoneme_count=text_len, mel_count=spec_len) + align_prior = torch.tensor(align_prior, dtype=torch.float32) + example["align_prior"] = align_prior + + for featurizer in self.featurizers: + feature_dict = featurizer.load( + manifest_entry=data.manifest_entry, audio_dir=data.audio_dir, feature_dir=data.feature_dir + ) + example.update(feature_dict) + + for processor in self.feature_processors: + processor.process(example) + + return example + + def collate_fn(self, batch: List[dict]): + + audio_list = [] + audio_len_list = [] + token_list = [] + token_len_list = [] + speaker_list = [] + prior_list = [] + + for example in batch: + audio_tensor = torch.tensor(example["audio"], dtype=torch.float32) + audio_list.append(audio_tensor) + audio_len_list.append(audio_tensor.shape[0]) + + token_tensor = torch.tensor(example["tokens"], dtype=torch.int32) + token_list.append(token_tensor) + token_len_list.append(token_tensor.shape[0]) + + if self.include_speaker: + speaker_list.append(example["speaker_index"]) + + if self.align_prior_config: + prior_list.append(example["align_prior"]) + + batch_audio_len = torch.IntTensor(audio_len_list) + audio_max_len = int(batch_audio_len.max().item()) + + batch_token_len = torch.IntTensor(token_len_list) + token_max_len = int(batch_token_len.max().item()) + + batch_audio = stack_tensors(audio_list, max_lens=[audio_max_len]) + batch_tokens = stack_tensors(token_list, max_lens=[token_max_len], pad_value=self.text_tokenizer.pad) + + batch_dict = { + "audio": batch_audio, + "audio_lens": batch_audio_len, + "text": batch_tokens, + "text_lens": 
batch_token_len, + } + + if self.include_speaker: + batch_dict["speaker_id"] = torch.IntTensor(speaker_list) + + if self.align_prior_config: + spec_max_len = max([prior.shape[0] for prior in prior_list]) + text_max_len = max([prior.shape[1] for prior in prior_list]) + batch_dict["align_prior_matrix"] = stack_tensors(prior_list, max_lens=[text_max_len, spec_max_len],) + + for featurizer in self.featurizers: + feature_dict = featurizer.collate_fn(batch) + batch_dict.update(feature_dict) + + return batch_dict diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py index d44de8ce0075..281a7c2891b3 100644 --- a/nemo/collections/tts/models/fastpitch.py +++ b/nemo/collections/tts/models/fastpitch.py @@ -95,15 +95,20 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): input_fft_kwargs = {} if self.learn_alignment: self.vocab = None - self.ds_class_name = cfg.train_ds.dataset._target_.split(".")[-1] - if self.ds_class_name == "TTSDataset": - self._setup_tokenizer(cfg) - assert self.vocab is not None - input_fft_kwargs["n_embed"] = len(self.vocab.tokens) - input_fft_kwargs["padding_idx"] = self.vocab.pad - else: - raise ValueError(f"Unknown dataset class: {self.ds_class_name}.") + self.ds_class = cfg.train_ds.dataset._target_ + self.ds_class_name = self.ds_class.split(".")[-1] + if not self.ds_class in [ + "nemo.collections.tts.data.dataset.TTSDataset", + "nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset", + "nemo.collections.tts.torch.data.TTSDataset", + ]: + raise ValueError(f"Unknown dataset class: {self.ds_class}.") + + self._setup_tokenizer(cfg) + assert self.vocab is not None + input_fft_kwargs["n_embed"] = len(self.vocab.tokens) + input_fft_kwargs["padding_idx"] = self.vocab.pad self._parser = None self._tb_logger = None @@ -149,6 +154,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): speaker_emb_condition_prosody = cfg.get("speaker_emb_condition_prosody", False) speaker_emb_condition_decoder = cfg.get("speaker_emb_condition_decoder", False) speaker_emb_condition_aligner = cfg.get("speaker_emb_condition_aligner", False) + use_log_energy = cfg.get("use_log_energy", True) if n_speakers > 1 and "add" not in input_fft.cond_input.condition_types: input_fft.cond_input.condition_types.append("add") if speaker_emb_condition_prosody: @@ -173,6 +179,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): energy_embedding_kernel_size, cfg.n_mel_channels, cfg.max_token_duration, + use_log_energy, ) self._input_types = self._output_types = None self.export_config = { @@ -261,12 +268,7 @@ def parser(self): return self._parser if self.learn_alignment: - ds_class_name = self._cfg.train_ds.dataset._target_.split(".")[-1] - - if ds_class_name == "TTSDataset": - self._parser = self.vocab.encode - else: - raise ValueError(f"Unknown dataset class: {ds_class_name}") + self._parser = self.vocab.encode else: self._parser = parsers.make_parser( labels=self._cfg.labels, @@ -382,8 +384,10 @@ def training_step(self, batch, batch_idx): None, ) if self.learn_alignment: - assert self.ds_class_name == "TTSDataset", f"Unknown dataset class: {self.ds_class_name}" - batch_dict = process_batch(batch, self._train_dl.dataset.sup_data_types_set) + if self.ds_class == "nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset": + batch_dict = batch + else: + batch_dict = process_batch(batch, self._train_dl.dataset.sup_data_types_set) audio = batch_dict.get("audio") audio_lens = batch_dict.get("audio_lens") text = 
batch_dict.get("text") @@ -493,8 +497,10 @@ def validation_step(self, batch, batch_idx): None, ) if self.learn_alignment: - assert self.ds_class_name == "TTSDataset", f"Unknown dataset class: {self.ds_class_name}" - batch_dict = process_batch(batch, self._train_dl.dataset.sup_data_types_set) + if self.ds_class == "nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset": + batch_dict = batch + else: + batch_dict = process_batch(batch, self._train_dl.dataset.sup_data_types_set) audio = batch_dict.get("audio") audio_lens = batch_dict.get("audio_lens") text = batch_dict.get("text") @@ -578,6 +584,29 @@ def validation_epoch_end(self, outputs): ) self.log_train_images = True + def _setup_train_dataloader(self, cfg): + phon_mode = contextlib.nullcontext() + if hasattr(self.vocab, "set_phone_prob"): + phon_mode = self.vocab.set_phone_prob(self.vocab.phoneme_probability) + + with phon_mode: + dataset = instantiate(cfg.dataset, text_tokenizer=self.vocab,) + + sampler = dataset.get_sampler(cfg.dataloader_params.batch_size) + return torch.utils.data.DataLoader( + dataset, collate_fn=dataset.collate_fn, sampler=sampler, **cfg.dataloader_params + ) + + def _setup_test_dataloader(self, cfg): + phon_mode = contextlib.nullcontext() + if hasattr(self.vocab, "set_phone_prob"): + phon_mode = self.vocab.set_phone_prob(0.0) + + with phon_mode: + dataset = instantiate(cfg.dataset, text_tokenizer=self.vocab,) + + return torch.utils.data.DataLoader(dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params) + def __setup_dataloader_from_config(self, cfg, shuffle_should_be: bool = True, name: str = "train"): if "dataset" not in cfg or not isinstance(cfg.dataset, DictConfig): raise ValueError(f"No dataset for {name}") @@ -596,7 +625,7 @@ def __setup_dataloader_from_config(self, cfg, shuffle_should_be: bool = True, na elif cfg.dataloader_params.shuffle: logging.error(f"The {name} dataloader for {self} has shuffle set to True!!!") - if cfg.dataset._target_ == "nemo.collections.tts.data.dataset.TTSDataset": + if self.ds_class == "nemo.collections.tts.data.dataset.TTSDataset": phon_mode = contextlib.nullcontext() if hasattr(self.vocab, "set_phone_prob"): phon_mode = self.vocab.set_phone_prob(prob=None if name == "val" else self.vocab.phoneme_probability) @@ -614,10 +643,16 @@ def __setup_dataloader_from_config(self, cfg, shuffle_should_be: bool = True, na return torch.utils.data.DataLoader(dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params) def setup_training_data(self, cfg): - self._train_dl = self.__setup_dataloader_from_config(cfg) + if self.ds_class == "nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset": + self._train_dl = self._setup_train_dataloader(cfg) + else: + self._train_dl = self.__setup_dataloader_from_config(cfg) def setup_validation_data(self, cfg): - self._validation_dl = self.__setup_dataloader_from_config(cfg, shuffle_should_be=False, name="val") + if self.ds_class == "nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset": + self._validation_dl = self._setup_test_dataloader(cfg) + else: + self._validation_dl = self.__setup_dataloader_from_config(cfg, shuffle_should_be=False, name="val") def setup_test_data(self, cfg): """Omitted.""" diff --git a/nemo/collections/tts/modules/fastpitch.py b/nemo/collections/tts/modules/fastpitch.py index 77dff7bc85ed..b26aafa72e32 100644 --- a/nemo/collections/tts/modules/fastpitch.py +++ b/nemo/collections/tts/modules/fastpitch.py @@ -164,6 +164,7 @@ def __init__( energy_embedding_kernel_size: int, 
n_mel_channels: int = 80, max_token_duration: int = 75, + use_log_energy: bool = True, ): super().__init__() @@ -177,6 +178,8 @@ def __init__( self.learn_alignment = aligner is not None self.use_duration_predictor = True self.binarize = False + self.use_log_energy = use_log_energy + # TODO: combine self.speaker_emb with self.speaker_encoder # cfg: remove `n_speakers`, create `speaker_encoder.lookup_module` # state_dict: move `speaker_emb.weight` to `speaker_encoder.lookup_module.table.weight` @@ -327,7 +330,8 @@ def forward( energy_tgt = average_features(energy.unsqueeze(1), attn_hard_dur) else: energy_tgt = average_features(energy.unsqueeze(1), durs_predicted) - energy_tgt = torch.log(1.0 + energy_tgt) + if self.use_log_energy: + energy_tgt = torch.log(1.0 + energy_tgt) energy_emb = self.energy_emb(energy_tgt) energy_tgt = energy_tgt.squeeze(1) else: diff --git a/nemo/collections/tts/parts/preprocessing/features.py b/nemo/collections/tts/parts/preprocessing/features.py index 7d7150a7050f..2972279339b5 100644 --- a/nemo/collections/tts/parts/preprocessing/features.py +++ b/nemo/collections/tts/parts/preprocessing/features.py @@ -15,7 +15,7 @@ from abc import ABC, abstractmethod from pathlib import Path -from typing import Dict, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import librosa import numpy as np @@ -23,14 +23,17 @@ from torch import Tensor from nemo.collections.asr.modules import AudioToMelSpectrogramPreprocessor -from nemo.collections.tts.parts.utils.tts_dataset_utils import get_audio_filepaths +from nemo.collections.tts.parts.utils.tts_dataset_utils import get_audio_filepaths, stack_tensors from nemo.utils.decorators import experimental @experimental class Featurizer(ABC): + def __init__(self, feature_names: List[str]) -> None: + self.feature_names = feature_names + @abstractmethod - def save(self, manifest_entry: dict, audio_dir: Path, feature_dir: Path) -> None: + def save(self, manifest_entry: Dict[str, Any], audio_dir: Path, feature_dir: Path) -> None: """ Save feature value to disk for given manifest entry. @@ -41,7 +44,7 @@ def save(self, manifest_entry: dict, audio_dir: Path, feature_dir: Path) -> None """ @abstractmethod - def load(self, manifest_entry: dict, audio_dir: Path, feature_dir: Path) -> Dict[str, Tensor]: + def load(self, manifest_entry: Dict[str, Any], audio_dir: Path, feature_dir: Path) -> Dict[str, Tensor]: """ Read saved feature value for given manifest entry. @@ -54,8 +57,17 @@ def load(self, manifest_entry: dict, audio_dir: Path, feature_dir: Path) -> Dict Dictionary of feature names to Tensors """ + @abstractmethod + def collate_fn(self, train_batch: List[Dict[str, Tensor]]) -> Dict[str, Tensor]: + """ + Combine list/batch of features into a feature dictionary. 
+ """ + raise NotImplementedError + -def _get_feature_filepath(manifest_entry: dict, audio_dir: Path, feature_dir: Path, feature_name: str) -> Path: +def _get_feature_filepath( + manifest_entry: Dict[str, Any], audio_dir: Path, feature_dir: Path, feature_name: str +) -> Path: """ Get the absolute path for the feature file corresponding to the input manifest entry @@ -68,7 +80,11 @@ def _get_feature_filepath(manifest_entry: dict, audio_dir: Path, feature_dir: Pa def _save_pt_feature( - feature_name: Optional[str], feature_tensor: Tensor, manifest_entry: Dict, audio_dir: Path, feature_dir: Path, + feature_name: Optional[str], + feature_tensor: Tensor, + manifest_entry: Dict[str, Any], + audio_dir: Path, + feature_dir: Path, ) -> None: """ If feature_name is provided, save feature as .pt file. @@ -84,12 +100,15 @@ def _save_pt_feature( def _load_pt_feature( - feature_dict: Dict, feature_name: Optional[str], manifest_entry: Dict, audio_dir: Path, feature_dir: Path, + feature_dict: Dict[str, Tensor], + feature_name: Optional[str], + manifest_entry: Dict[str, Any], + audio_dir: Path, + feature_dir: Path, ) -> None: """ If feature_name is provided, load feature into feature_dict from .pt file. """ - if feature_name is None: return @@ -100,6 +119,22 @@ def _load_pt_feature( feature_dict[feature_name] = feature_tensor +def _collate_feature( + feature_dict: Dict[str, Tensor], feature_name: Optional[str], train_batch: List[Dict[str, Tensor]] +) -> None: + if feature_name is None: + return + + feature_tensors = [] + for example in train_batch: + feature_tensor = example[feature_name] + feature_tensors.append(feature_tensor) + + max_len = max([f.shape[0] for f in feature_tensors]) + stacked_features = stack_tensors(feature_tensors, max_lens=[max_len]) + feature_dict[feature_name] = stacked_features + + class MelSpectrogramFeaturizer: def __init__( self, @@ -141,7 +176,7 @@ def __init__( dither=0.0, ) - def compute_mel_spec(self, manifest_entry: dict, audio_dir: Path) -> Tensor: + def compute_mel_spec(self, manifest_entry: Dict[str, Any], audio_dir: Path) -> Tensor: """ Computes mel spectrogram for the input manifest entry. 
@@ -168,7 +203,7 @@ def compute_mel_spec(self, manifest_entry: dict, audio_dir: Path) -> Tensor: return spec_tensor - def save(self, manifest_entry: dict, audio_dir: Path, feature_dir: Path) -> None: + def save(self, manifest_entry: Dict[str, Any], audio_dir: Path, feature_dir: Path) -> None: spec_tensor = self.compute_mel_spec(manifest_entry=manifest_entry, audio_dir=audio_dir) _save_pt_feature( feature_name=self.feature_name, @@ -178,7 +213,7 @@ def save(self, manifest_entry: dict, audio_dir: Path, feature_dir: Path) -> None feature_dir=feature_dir, ) - def load(self, manifest_entry: dict, audio_dir: Path, feature_dir: Path) -> Dict[str, Tensor]: + def load(self, manifest_entry: Dict[str, Any], audio_dir: Path, feature_dir: Path) -> Dict[str, Tensor]: feature_dict = {} _load_pt_feature( feature_dict=feature_dict, @@ -189,13 +224,18 @@ def load(self, manifest_entry: dict, audio_dir: Path, feature_dir: Path) -> Dict ) return feature_dict + def collate_fn(self, train_batch: List[Dict[str, Tensor]]) -> Dict[str, Tensor]: + feature_dict = {} + _collate_feature(feature_dict=feature_dict, feature_name=self.feature_name, train_batch=train_batch) + return feature_dict + class EnergyFeaturizer: def __init__(self, spec_featurizer: MelSpectrogramFeaturizer, feature_name: str = "energy") -> None: self.feature_name = feature_name self.spec_featurizer = spec_featurizer - def compute_energy(self, manifest_entry: dict, audio_dir: Path) -> Tensor: + def compute_energy(self, manifest_entry: Dict[str, Any], audio_dir: Path) -> Tensor: """ Computes energy for the input manifest entry. @@ -213,7 +253,7 @@ def compute_energy(self, manifest_entry: dict, audio_dir: Path) -> Tensor: return energy - def save(self, manifest_entry: dict, audio_dir: Path, feature_dir: Path) -> None: + def save(self, manifest_entry: Dict[str, Any], audio_dir: Path, feature_dir: Path) -> None: energy_tensor = self.compute_energy(manifest_entry=manifest_entry, audio_dir=audio_dir) _save_pt_feature( feature_name=self.feature_name, @@ -223,7 +263,7 @@ def save(self, manifest_entry: dict, audio_dir: Path, feature_dir: Path) -> None feature_dir=feature_dir, ) - def load(self, manifest_entry: dict, audio_dir: Path, feature_dir: Path) -> Dict[str, Tensor]: + def load(self, manifest_entry: Dict[str, Any], audio_dir: Path, feature_dir: Path) -> Dict[str, Tensor]: feature_dict = {} _load_pt_feature( feature_dict=feature_dict, @@ -234,6 +274,11 @@ def load(self, manifest_entry: dict, audio_dir: Path, feature_dir: Path) -> Dict ) return feature_dict + def collate_fn(self, train_batch: List[Dict[str, Tensor]]) -> Dict[str, Tensor]: + feature_dict = {} + _collate_feature(feature_dict=feature_dict, feature_name=self.feature_name, train_batch=train_batch) + return feature_dict + class PitchFeaturizer: def __init__( @@ -256,7 +301,7 @@ def __init__( self.pitch_fmin = pitch_fmin self.pitch_fmax = pitch_fmax - def compute_pitch(self, manifest_entry: dict, audio_dir: Path) -> Tuple[Tensor, Tensor, Tensor]: + def compute_pitch(self, manifest_entry: Dict[str, Any], audio_dir: Path) -> Tuple[Tensor, Tensor, Tensor]: """ Computes pitch and optional voiced mask for the input manifest entry. 
@@ -287,7 +332,7 @@ def compute_pitch(self, manifest_entry: dict, audio_dir: Path) -> Tuple[Tensor, return pitch_tensor, voiced_mask_tensor, voiced_prob_tensor - def save(self, manifest_entry: dict, audio_dir: Path, feature_dir: Path) -> None: + def save(self, manifest_entry: Dict[str, Any], audio_dir: Path, feature_dir: Path) -> None: pitch_tensor, voiced_mask_tensor, voiced_prob_tensor = self.compute_pitch( manifest_entry=manifest_entry, audio_dir=audio_dir ) @@ -313,7 +358,7 @@ def save(self, manifest_entry: dict, audio_dir: Path, feature_dir: Path) -> None feature_dir=feature_dir, ) - def load(self, manifest_entry: dict, audio_dir: Path, feature_dir: Path) -> Dict[str, Tensor]: + def load(self, manifest_entry: Dict[str, Any], audio_dir: Path, feature_dir: Path) -> Dict[str, Tensor]: feature_dict = {} _load_pt_feature( feature_dict=feature_dict, @@ -337,3 +382,10 @@ def load(self, manifest_entry: dict, audio_dir: Path, feature_dir: Path) -> Dict feature_dir=feature_dir, ) return feature_dict + + def collate_fn(self, train_batch: List[Dict[str, Tensor]]) -> Dict[str, Tensor]: + feature_dict = {} + _collate_feature(feature_dict=feature_dict, feature_name=self.pitch_name, train_batch=train_batch) + _collate_feature(feature_dict=feature_dict, feature_name=self.voiced_mask_name, train_batch=train_batch) + _collate_feature(feature_dict=feature_dict, feature_name=self.voiced_prob_name, train_batch=train_batch) + return feature_dict diff --git a/nemo/collections/tts/parts/utils/tts_dataset_utils.py b/nemo/collections/tts/parts/utils/tts_dataset_utils.py index 06befcb6ec02..47c7b8cd78da 100644 --- a/nemo/collections/tts/parts/utils/tts_dataset_utils.py +++ b/nemo/collections/tts/parts/utils/tts_dataset_utils.py @@ -15,7 +15,7 @@ import functools import os from pathlib import Path -from typing import Tuple +from typing import Any, Dict, List, Tuple import numpy as np import torch @@ -45,7 +45,7 @@ def get_abs_rel_paths(input_path: Path, base_path: Path) -> Tuple[Path, Path]: return abs_path, rel_path -def get_audio_filepaths(manifest_entry: dict, audio_dir: Path) -> Tuple[Path, Path]: +def get_audio_filepaths(manifest_entry: Dict[str, Any], audio_dir: Path) -> Tuple[Path, Path]: """ Get the absolute and relative paths of audio from a manifest entry. @@ -107,6 +107,31 @@ def general_padding(item, item_len, max_len, pad_value=0): return item +def stack_tensors(tensors: List[torch.Tensor], max_lens: List[int], pad_value: float = 0.0) -> torch.Tensor: + """ + Create batch by stacking input tensor list along the time axes. + + Args: + tensors: List of tensors to pad and stack + max_lens: List of lengths to pad each axis to, starting with the last axis + pad_value: Value for padding + + Returns: + Padded and stacked tensor. + """ + padded_tensors = [] + for tensor in tensors: + padding = [] + for i, max_len in enumerate(max_lens, 1): + padding += [0, max_len - tensor.shape[-i]] + + padded_tensor = torch.nn.functional.pad(tensor, pad=padding, value=pad_value) + padded_tensors.append(padded_tensor) + + stacked_tensor = torch.stack(padded_tensors) + return stacked_tensor + + def logbeta(x, y): return gammaln(x) + gammaln(y) - gammaln(x + y) @@ -153,3 +178,55 @@ def common_path(path1, path2): base_dir = common_path(base_dir, audio_dir) return base_dir + + +def filter_dataset_by_duration(entries: List[Dict[str, Any]], min_duration: float, max_duration: float): + """ + Filter out manifest entries based on duration. + + Args: + entries: List of manifest entry dictionaries. 
+ min_duration: Minimum duration below which entries are removed. + max_duration: Maximum duration above which entries are removed. + + Returns: + filtered_entries: List of manifest entries after filtering. + total_hours: Total duration of original dataset, in hours + filtered_hours: Total duration of dataset after filtering, in hours + """ + filtered_entries = [] + total_duration = 0.0 + filtered_duration = 0.0 + for entry in entries: + duration = entry["duration"] + total_duration += duration + if (min_duration and duration < min_duration) or (max_duration and duration > max_duration): + continue + + filtered_duration += duration + filtered_entries.append(entry) + + total_hours = total_duration / 3600.0 + filtered_hours = filtered_duration / 3600.0 + + return filtered_entries, total_hours, filtered_hours + + +def get_weighted_sampler( + sample_weights: List[float], batch_size: int, num_steps: int +) -> torch.utils.data.WeightedRandomSampler: + """ + Create pytorch sampler for doing weighted random sampling. + + Args: + sample_weights: List of sampling weights for all elements in the dataset. + batch_size: Batch size to sample. + num_steps: Number of steps to be considered an epoch. + + Returns: + Pytorch sampler + """ + weights = torch.tensor(sample_weights, dtype=torch.float64) + num_samples = batch_size * num_steps + sampler = torch.utils.data.WeightedRandomSampler(weights=weights, num_samples=num_samples) + return sampler diff --git a/tests/collections/tts/parts/utils/test_tts_dataset_utils.py b/tests/collections/tts/parts/utils/test_tts_dataset_utils.py index dadb1844eca6..0730934d46dc 100644 --- a/tests/collections/tts/parts/utils/test_tts_dataset_utils.py +++ b/tests/collections/tts/parts/utils/test_tts_dataset_utils.py @@ -16,8 +16,15 @@ import numpy as np import pytest +import torch -from nemo.collections.tts.parts.utils.tts_dataset_utils import get_abs_rel_paths, get_audio_filepaths, normalize_volume +from nemo.collections.tts.parts.utils.tts_dataset_utils import ( + filter_dataset_by_duration, + get_abs_rel_paths, + get_audio_filepaths, + normalize_volume, + stack_tensors, +) class TestTTSDatasetUtils: @@ -119,3 +126,53 @@ def test_normalize_volume_out_of_range(self): input_audio = np.array([0.0, 0.1, 0.3, 0.5]) with pytest.raises(ValueError, match="Volume must be in range"): normalize_volume(audio=input_audio, volume_level=2.0) + + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_stack_tensors(self): + tensors = [torch.ones([2]), torch.ones([4]), torch.ones([3])] + max_lens = [6] + expected_output = torch.tensor( + [[1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0]], dtype=torch.float32 + ) + + stacked_tensor = stack_tensors(tensors=tensors, max_lens=max_lens) + + torch.testing.assert_close(stacked_tensor, expected_output) + + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_stack_tensors_3d(self): + tensors = [torch.ones([2, 2]), torch.ones([1, 3])] + max_lens = [4, 2] + expected_output = torch.tensor( + [[[1, 1, 0, 0], [1, 1, 0, 0]], [[1, 1, 1, 0], [0, 0, 0, 0]]], dtype=torch.float32 + ) + + stacked_tensor = stack_tensors(tensors=tensors, max_lens=max_lens) + + torch.testing.assert_close(stacked_tensor, expected_output) + + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_filter_dataset_by_duration(self): + min_duration = 1.0 + max_duration = 10.0 + entries = [ + {"duration": 0.5}, + {"duration": 10.0}, + {"duration": 20.0}, + {"duration": 0.1}, + {"duration": 100.0}, + {"duration": 5.0}, + ] + + filtered_entries, 
total_hours, filtered_hours = filter_dataset_by_duration( + entries=entries, min_duration=min_duration, max_duration=max_duration + ) + + assert len(filtered_entries) == 2 + assert filtered_entries[0]["duration"] == 10.0 + assert filtered_entries[1]["duration"] == 5.0 + assert total_hours == (135.6 / 3600.0) + assert filtered_hours == (15.0 / 3600.0) From 78e5a2e9abef63151cb03f1d37f843a9a8cac2ed Mon Sep 17 00:00:00 2001 From: Yi Dong <43824965+yidong72@users.noreply.github.com> Date: Tue, 16 May 2023 15:00:46 -0400 Subject: [PATCH 57/62] Dialogue dataset (#6654) * chatbot interface Signed-off-by: Yi Dong * latest gradio Signed-off-by: Yi Dong * default greedy Signed-off-by: Yi Dong * better chatbot Signed-off-by: Yi Dong * handle preamble Signed-off-by: Yi Dong * added chatbot training capablity Signed-off-by: Yi Dong * added chatbot ui Signed-off-by: Yi Dong * remove debug code Signed-off-by: Yi Dong * default human Signed-off-by: Yi Dong * use special token for roles Signed-off-by: Yi Dong * special tokens Signed-off-by: Yi Dong * fix name Signed-off-by: Yi Dong * new chat dataset Signed-off-by: Yi Dong * fix the system token Signed-off-by: Yi Dong * upgrade gradio Signed-off-by: Yi Dong * save the chat history Signed-off-by: Yi Dong * update ui Signed-off-by: root * update chat interface Signed-off-by: Yi Dong * handles canonical form Signed-off-by: Yi Dong * new sft chatbot Signed-off-by: Yi Dong * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * change format Signed-off-by: Yi Dong * check extra_id in the tokenizer Signed-off-by: Yi Dong * added vocab property check Signed-off-by: Yi Dong * added missing file Signed-off-by: Yi Dong --------- Signed-off-by: Yi Dong Signed-off-by: root Co-authored-by: root Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Sandeep Subramanian --- .../conf/megatron_gpt_inference.yaml | 3 +- .../language_modeling/megatron_gpt_eval.py | 8 +- .../tuning/conf/megatron_gpt_sft.yaml | 1 + .../megatron/gpt_sft_chat_dataset.py | 207 ++++++++++++++++++ .../megatron_gpt_sft_model.py | 7 +- .../nlp/modules/common/chat_css.py | 84 +++++++ .../nlp/modules/common/chatbot_component.py | 173 +++++++++++++++ .../nlp/modules/common/megatron_web_server.py | 184 ++++++++++++++-- requirements/requirements_nlp.txt | 3 +- 9 files changed, 646 insertions(+), 24 deletions(-) create mode 100644 nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py create mode 100644 nemo/collections/nlp/modules/common/chat_css.py create mode 100644 nemo/collections/nlp/modules/common/chatbot_component.py diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml index f61f5342041e..6bd1be905a97 100644 --- a/examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml @@ -34,4 +34,5 @@ web_server: False # whether launch the web inference server share: False # whether create a public URL username: test # user name for web client password: test2 # password for web client -web_port: 9889 # the port number of the web server \ No newline at end of file +web_port: 9889 # the port number of the web server +chat: False # use the chat interface \ No newline at end of file diff --git a/examples/nlp/language_modeling/megatron_gpt_eval.py b/examples/nlp/language_modeling/megatron_gpt_eval.py index 00b53a9f6f8f..0ac155374512 
100644
--- a/examples/nlp/language_modeling/megatron_gpt_eval.py
+++ b/examples/nlp/language_modeling/megatron_gpt_eval.py
@@ -23,7 +23,7 @@
 
 from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
 from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel
-from nemo.collections.nlp.modules.common.megatron_web_server import get_demo
+from nemo.collections.nlp.modules.common.megatron_web_server import get_chatbot_demo, get_demo
 from nemo.collections.nlp.modules.common.text_generation_server import MegatronServer
 from nemo.collections.nlp.modules.common.text_generation_utils import generate
 from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam
@@ -277,9 +277,13 @@ def main(cfg) -> None:
     if cfg.server:
         if parallel_state.is_pipeline_first_stage() and parallel_state.get_tensor_model_parallel_rank() == 0:
             if cfg.web_server:
+                if cfg.chat:
+                    web_ui = get_chatbot_demo
+                else:
+                    web_ui = get_demo
                 loop = asyncio.new_event_loop()
                 thread = threading.Thread(
-                    target=get_demo,
+                    target=web_ui,
                     daemon=True,
                     args=(cfg.share, cfg.username, cfg.password, cfg.port, cfg.web_port, loop),
                 )
diff --git a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml
index 12db9133104a..678851db3b01 100644
--- a/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml
+++ b/examples/nlp/language_modeling/tuning/conf/megatron_gpt_sft.yaml
@@ -66,6 +66,7 @@ model:
     ffn_dropout: 0.0
 
   data:
+    chat: False # whether to use chatbot data or not
    train_ds:
      # Example of how to specify paths to multiple datasets
      # file_names:
diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py
new file mode 100644
index 000000000000..deb6e77cdb92
--- /dev/null
+++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py
@@ -0,0 +1,207 @@
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+
+import torch
+
+from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
+from nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_dataset import GPTSFTDataset
+from nemo.utils import logging
+
+__all__ = ['GPTSFTChatDataset']
+
+IGNORE_INDEX = -100
+END_SIGNAL = "\n"
+END_NAME_SIGNAL = "\n"
+
+SYSTEM_TOKEN = "<extra_id_0>System\n"
+TURN_TOKEN = "<extra_id_1>"
+
+GUARD_RAIL_INSTRUCTION = {
+    "TEXT_TO_CANONICAL_FORM": "Given a dialogue, for each turn you need to generate a short summary called a canonical form. Generate the canonical form for the last turn in the dialogue.",
+    "CANONICAL_FORM_TO_TEXT": "Given a dialogue, for each turn we also have a short summary called a canonical form. Generate the canonical form given the last turn message and canonical form. Then generate the message.",
+}
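+
+# For reference, a minimal sketch of one chat training example consumed by
+# preprocess() below (values are hypothetical; 'type' and 'canonical_form' are
+# only needed for the guard-rail instructions above):
+#   {
+#       "system": "A chat between a curious human and an AI assistant.",
+#       "mask": "User",
+#       "conversations": [
+#           {"from": "User", "value": "Hello!"},
+#           {"from": "Assistant", "value": "Hi! How can I help?"},
+#       ],
+#   }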
+
+
+def _mask_targets(target, tokenized_lens, speakers, header_len, s_ids, tokenizer, mask_role):
+    cur_idx = header_len
+    tgt_len = target.shape[0]
+    for i, (tokenized_len, speaker, s_id) in enumerate(zip(tokenized_lens, speakers, s_ids)):
+        # note, sentence piece will add an extra empty token in front. s_id has that extra token too
+        skip_name_len = len(tokenizer.text_to_ids(TURN_TOKEN + speaker + END_NAME_SIGNAL))
+        if cur_idx >= tgt_len:
+            break
+        elif cur_idx + tokenized_len < tgt_len:
+            # Check whether the mask is applied to the correct position; the first token is the turn token <extra_id_1>.
+            # s_id[2:] skips the artifact empty token and the turn token.
+            # target[cur_idx + 1 : cur_idx + tokenized_len] skips the turn token.
+            if not torch.equal(target[cur_idx + 1 : cur_idx + tokenized_len], s_id[2:]):
+                logging.warning("a sentence mismatches the corresponding piece in the conversation")
+        if i == 0:
+            # mask the first turn completely to provide at least one turn as context
+            target[cur_idx : cur_idx + tokenized_len] = IGNORE_INDEX
+        elif speaker == mask_role:
+            # leave the first human tag unmasked
+            target[cur_idx + 1 : cur_idx + tokenized_len] = IGNORE_INDEX
+        else:
+            # mask up to the name end; subtract one because skip_name_len includes the extra artifact empty token
+            target[cur_idx : cur_idx + skip_name_len - 1] = IGNORE_INDEX
+        cur_idx += tokenized_len
+
+
+def canonical_form_formatter(canonical_form):
+    return f'<extra_id_2>{canonical_form}\n'
+
+
+def _add_speaker_and_signal(header, source, mask_role, gtype):
+    """Add speaker and start/end signal on each round."""
+    BEGIN_SIGNAL = ""
+    conversation = header
+    for i, sentence in enumerate(source):
+        sentence_from = sentence["from"]
+        role_token = TURN_TOKEN
+        if gtype is None:
+            sentence["value"] = (
+                BEGIN_SIGNAL + role_token + sentence_from + END_NAME_SIGNAL + sentence["value"] + END_SIGNAL
+            )
+        elif gtype == "TEXT_TO_CANONICAL_FORM":
+            sentence["value"] = (
+                BEGIN_SIGNAL
+                + role_token
+                + sentence_from
+                + END_NAME_SIGNAL
+                + sentence["value"]
+                + END_SIGNAL
+                + canonical_form_formatter(sentence['canonical_form'])
+            )
+        elif gtype == "CANONICAL_FORM_TO_TEXT":
+            sentence["value"] = (
+                BEGIN_SIGNAL
+                + role_token
+                + sentence_from
+                + END_NAME_SIGNAL
+                + canonical_form_formatter(sentence['canonical_form'])
+                + sentence["value"]
+                + END_SIGNAL
+            )
+        else:
+            raise ValueError(f"source type {gtype} not supported")
+        conversation += sentence["value"]
+        # if the last turn is not masked, add the next turn token to the end so it is included in the loss calculation
+        if sentence_from != mask_role and i == len(source) - 1:
+            conversation += TURN_TOKEN
+    return conversation
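+
+# Serialization sketch: with the tokens above and no guard-rail type, a two-turn
+# example whose mask role is "User" is rendered roughly as
+#   <extra_id_0>System
+#   {system prompt}
+#
+#   <extra_id_1>User
+#   Hello!
+#   <extra_id_1>Assistant
+#   Hi! How can I help?
+#   <extra_id_1>
+# where the trailing turn token is appended because the last (assistant) turn is unmasked.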
+ """ + canonical_type = None + if 'type' in source: + canonical_type = source['type'] + assert canonical_type in GUARD_RAIL_INSTRUCTION, f"source type {canonical_type} not supported" + # add end signal and concatenate together + conversation = source['system'] + if canonical_type is not None: + conversation = conversation + '\n' + GUARD_RAIL_INSTRUCTION[canonical_type] + mask_role = source.get('mask', 'User') + header = f"{SYSTEM_TOKEN}{conversation}\n\n" + conversation = _add_speaker_and_signal(header, source['conversations'], mask_role, canonical_type) + # tokenize conversations + input_ids = tokenizer.text_to_ids(conversation) + target = copy.deepcopy(input_ids) + header_len = len(tokenizer.text_to_ids(header)) + + ids = [] + tokenized_lens = [] + for s in source['conversations']: + tokenized_sentence = tokenizer.text_to_ids(s["value"]) + ids.append(torch.tensor(tokenized_sentence)) + # remove one token as it adds an empty token in front + tokenized_lens.append(len(tokenized_sentence) - 1) + speakers = [sentence["from"] for sentence in source['conversations']] + assert mask_role in speakers, "mask role not in the conversation" + target = torch.LongTensor(target) + # not going to train on the header + target[:header_len] = IGNORE_INDEX + input_ids = torch.LongTensor(input_ids) + + _mask_targets(target, tokenized_lens, speakers, header_len, ids, tokenizer, mask_role) + mask = (target != IGNORE_INDEX).bool() + assert mask.sum().item() != 0, "mask is empty" + return dict(input_ids=input_ids, mask=mask) + + +class GPTSFTChatDataset(GPTSFTDataset): + def _build_samples_mapping(self): + super()._build_samples_mapping() + assert hasattr(self.tokenizer, "vocab"), "tokenizer should have vocab property, not supported" + assert '' in self.tokenizer.vocab, " not in the tokenizer vocab. not supported" + assert '' in self.tokenizer.vocab, " not in the tokenizer vocab. not supported" + + def _process_example(self, example): + """ + Create an example by concatenating text and answer. + Truncation is carried out when needed, but it is performed only on the prompt side. + BOS, EOS, and SEP, are added if specified. 
+ """ + result = preprocess(example, self.tokenizer) + + return result + + def collate_fn(self, batch): + input_ids = [item['input_ids'][:-1].tolist() for item in batch] + labels = [item['input_ids'][1:].tolist() for item in batch] + loss_mask = [item['mask'][1:].tolist() for item in batch] + + max_length = max([len(x) for x in input_ids]) + if max_length > self.max_seq_length: + # truncate the sequences if it is longer than max_seq_length + input_ids = [x[: self.max_seq_length] for x in input_ids] + labels = [x[: self.max_seq_length] for x in labels] + loss_mask = [x[: self.max_seq_length] for x in loss_mask] + # increase max length to nearest multiple of 4 or 8 + if self.pad_to_max_length: + max_length = self.max_seq_length + else: + max_length = min(self.max_seq_length, self._round_to_nearest(max_length, 8)) + assert max_length <= self.max_seq_length + + attention_mask = [self._create_attention_mask(max_length) for _ in batch] + attention_mask = torch.stack(attention_mask) + position_ids = [list(range(max_length)) for _ in batch] + position_ids = torch.LongTensor(position_ids) + input_ids = torch.LongTensor( + self._collate_item(input_ids, max_length=max_length, pad_id=self.tokenizer.eos_id) + ) + labels = torch.LongTensor(self._collate_item(labels, max_length=max_length, pad_id=self.tokenizer.eos_id)) + loss_mask = torch.LongTensor(self._collate_item(loss_mask, max_length=max_length, pad_id=0)) + + processed_batch = { + 'tokens': input_ids, + 'labels': labels, + 'attention_mask': attention_mask, + 'loss_mask': loss_mask, + 'position_ids': position_ids, + } + + return processed_batch diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index a52a7d22e219..61b491d4af1d 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -24,6 +24,7 @@ get_datasets_weights_and_num_samples, ) from nemo.collections.nlp.data.language_modeling.megatron.blendable_dataset import BlendableDataset +from nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_chat_dataset import GPTSFTChatDataset from nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_dataset import GPTSFTDataset from nemo.collections.nlp.data.language_modeling.megatron.megatron_batch_samplers import ( MegatronPretrainingBatchSampler, @@ -234,7 +235,11 @@ def _build_dataset(self, data_cfg, is_train=True): num_train_samples_per_dataset = [[None]] * len(data_cfg.file_names) for file_path, num_samples in zip(data_cfg.file_names, num_train_samples_per_dataset): - dataset = GPTSFTDataset( + if self.cfg.data.chat: + dataset_cls = GPTSFTChatDataset + else: + dataset_cls = GPTSFTDataset + dataset = dataset_cls( file_path=file_path, tokenizer=self.tokenizer, max_seq_length=data_cfg.max_seq_length, diff --git a/nemo/collections/nlp/modules/common/chat_css.py b/nemo/collections/nlp/modules/common/chat_css.py new file mode 100644 index 000000000000..e6b9a79c4bfe --- /dev/null +++ b/nemo/collections/nlp/modules/common/chat_css.py @@ -0,0 +1,84 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +CSS = """ +#chatbot .hll { background-color: #ffffcc } +#chatbot .c { color: #408080; font-style: italic } +#chatbot .err { border: 1px solid #FF0000 } +#chatbot .k { color: #008000; font-weight: bold } +#chatbot .o { color: #666666 } +#chatbot .ch { color: #408080; font-style: italic } +#chatbot .cm { color: #408080; font-style: italic } +#chatbot .cp { color: #BC7A00 } +#chatbot .cpf { color: #408080; font-style: italic } +#chatbot .c1 { color: #408080; font-style: italic } +#chatbot .cs { color: #408080; font-style: italic } +#chatbot .gd { color: #A00000 } +#chatbot .ge { font-style: italic } +#chatbot .gr { color: #FF0000 } +#chatbot .gh { color: #000080; font-weight: bold } +#chatbot .gi { color: #00A000 } +#chatbot .go { color: #888888 } +#chatbot .gp { color: #000080; font-weight: bold } +#chatbot .gs { font-weight: bold } +#chatbot .gu { color: #800080; font-weight: bold } +#chatbot .gt { color: #0044DD } +#chatbot .kc { color: #008000; font-weight: bold } +#chatbot .kd { color: #008000; font-weight: bold } +#chatbot .kn { color: #008000; font-weight: bold } +#chatbot .kp { color: #008000 } +#chatbot .kr { color: #008000; font-weight: bold } +#chatbot .kt { color: #B00040 } +#chatbot .m { color: #666666 } +#chatbot .s { color: #BA2121 } +#chatbot .na { color: #7D9029 } +#chatbot .nb { color: #008000 } +#chatbot .nc { color: #0000FF; font-weight: bold } +#chatbot .no { color: #880000 } +#chatbot .nd { color: #AA22FF } +#chatbot .ni { color: #999999; font-weight: bold } +#chatbot .ne { color: #D2413A; font-weight: bold } +#chatbot .nf { color: #0000FF } +#chatbot .nl { color: #A0A000 } +#chatbot .nn { color: #0000FF; font-weight: bold } +#chatbot .nt { color: #008000; font-weight: bold } +#chatbot .nv { color: #19177C } +#chatbot .ow { color: #AA22FF; font-weight: bold } +#chatbot .w { color: #bbbbbb } +#chatbot .mb { color: #666666 } +#chatbot .mf { color: #666666 } +#chatbot .mh { color: #666666 } +#chatbot .mi { color: #666666 } +#chatbot .mo { color: #666666 } +#chatbot .sa { color: #BA2121 } +#chatbot .sb { color: #BA2121 } +#chatbot .sc { color: #BA2121 } +#chatbot .dl { color: #BA2121 } +#chatbot .sd { color: #BA2121; font-style: italic } +#chatbot .s2 { color: #BA2121 } +#chatbot .se { color: #BB6622; font-weight: bold } +#chatbot .sh { color: #BA2121 } +#chatbot .si { color: #BB6688; font-weight: bold } +#chatbot .sx { color: #008000 } +#chatbot .sr { color: #BB6688 } +#chatbot .s1 { color: #BA2121 } +#chatbot .ss { color: #19177C } +#chatbot .bp { color: #008000 } +#chatbot .fm { color: #0000FF } +#chatbot .vc { color: #19177C } +#chatbot .vg { color: #19177C } +#chatbot .vi { color: #19177C } +#chatbot .vm { color: #19177C } +#chatbot .il { color: #666666 } +""" diff --git a/nemo/collections/nlp/modules/common/chatbot_component.py b/nemo/collections/nlp/modules/common/chatbot_component.py new file mode 100644 index 000000000000..548458df7e29 --- /dev/null +++ b/nemo/collections/nlp/modules/common/chatbot_component.py @@ -0,0 +1,173 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+
+Adapted from https://github.com/gradio-app/gradio/blob/main/gradio/components.py
+Fixes a markdown render problem.
+"""
+from __future__ import annotations
+
+from gradio.components import *
+from markdown2 import Markdown
+
+
+class _Keywords(Enum):
+    NO_VALUE = "NO_VALUE"  # Used as a sentinel to determine if nothing is provided as an argument for `value` in `Component.update()`
+    FINISHED_ITERATING = (
+        "FINISHED_ITERATING"  # Used to skip processing of a component's value (needed for generators + state)
+    )
+
+
+@document("style")
+class Chatbot(Changeable, Selectable, IOComponent, JSONSerializable):
+    """
+    Displays a chatbot output showing both user submitted messages and responses. Supports a subset of Markdown including bold, italics, code, and images.
+    Preprocessing: this component does *not* accept input.
+    Postprocessing: expects function to return a {List[Tuple[str | None | Tuple, str | None | Tuple]]}, a list of tuples with user message and response messages. Messages should be strings, tuples, or Nones. If the message is a string, it can include Markdown. If it is a tuple, it should consist of (string filepath to image/video/audio, [optional string alt text]). Messages that are `None` are not displayed.
+
+    Demos: chatbot_simple, chatbot_multimodal
+    """
+
+    def __init__(
+        self,
+        value: List[Tuple[str | None, str | None]] | Callable | None = None,
+        color_map: Dict[str, str] | None = None,  # Parameter moved to Chatbot.style()
+        *,
+        label: str | None = None,
+        every: float | None = None,
+        show_label: bool = True,
+        visible: bool = True,
+        elem_id: str | None = None,
+        elem_classes: List[str] | str | None = None,
+        **kwargs,
+    ):
+        """
+        Parameters:
+            value: Default value to show in chatbot. If callable, the function will be called whenever the app loads to set the initial value of the component.
+            label: component name in interface.
+            every: If `value` is a callable, run the function 'every' number of seconds while the client connection is open. Has no effect otherwise. Queue must be enabled. The event can be accessed (e.g. to cancel it) via this component's .load_event attribute.
+            show_label: if True, will display label.
+            visible: If False, component will be hidden.
+            elem_id: An optional string that is assigned as the id of this component in the HTML DOM. Can be used for targeting CSS styles.
+            elem_classes: An optional list of strings that are assigned as the classes of this component in the HTML DOM. Can be used for targeting CSS styles.
+        """
+        if color_map is not None:
+            warnings.warn("The 'color_map' parameter has been deprecated.",)
+        # self.md = utils.get_markdown_parser()
+        self.md = Markdown(extras=["fenced-code-blocks", "tables", "break-on-newline"])
+        self.select: EventListenerMethod
+        """
+        Event listener for when the user selects a message from Chatbot.
+        Uses event data gradio.SelectData to carry `value` referring to the text of the selected message, and `index` tuple to refer to [message, participant] index.
+ See EventData documentation on how to use this event data. + """ + + IOComponent.__init__( + self, + label=label, + every=every, + show_label=show_label, + visible=visible, + elem_id=elem_id, + elem_classes=elem_classes, + value=value, + **kwargs, + ) + + def get_config(self): + return { + "value": self.value, + "selectable": self.selectable, + **IOComponent.get_config(self), + } + + @staticmethod + def update( + value: Any | Literal[_Keywords.NO_VALUE] | None = _Keywords.NO_VALUE, + label: str | None = None, + show_label: bool | None = None, + visible: bool | None = None, + ): + updated_config = { + "label": label, + "show_label": show_label, + "visible": visible, + "value": value, + "__type__": "update", + } + return updated_config + + def _process_chat_messages(self, chat_message: str | Tuple | List | Dict | None) -> str | Dict | None: + if chat_message is None: + return None + elif isinstance(chat_message, (tuple, list)): + mime_type = processing_utils.get_mimetype(chat_message[0]) + return { + "name": chat_message[0], + "mime_type": mime_type, + "alt_text": chat_message[1] if len(chat_message) > 1 else None, + "data": None, # These last two fields are filled in by the frontend + "is_file": True, + } + elif isinstance(chat_message, dict): # This happens for previously processed messages + return chat_message + elif isinstance(chat_message, str): + # return self.md.render(chat_message) + return str(self.md.convert(chat_message)) + else: + raise ValueError(f"Invalid message for Chatbot component: {chat_message}") + + def postprocess( + self, y: List[Tuple[str | Tuple | List | Dict | None, str | Tuple | List | Dict | None]], + ) -> List[Tuple[str | Dict | None, str | Dict | None]]: + """ + Parameters: + y: List of tuples representing the message and response pairs. Each message and response should be a string, which may be in Markdown format. It can also be a tuple whose first element is a string filepath or URL to an image/video/audio, and second (optional) element is the alt text, in which case the media file is displayed. It can also be None, in which case that message is not displayed. + Returns: + List of tuples representing the message and response. Each message and response will be a string of HTML, or a dictionary with media information. + """ + if y is None: + return [] + processed_messages = [] + for message_pair in y: + assert isinstance( + message_pair, (tuple, list) + ), f"Expected a list of lists or list of tuples. Received: {message_pair}" + assert ( + len(message_pair) == 2 + ), f"Expected a list of lists of length 2 or list of tuples of length 2. Received: {message_pair}" + processed_messages.append( + ( + # '
<pre>' +
+                    #                    message_pair[0] + "</pre>",
", + message_pair[0], + self._process_chat_messages(message_pair[1]), + ) + ) + return processed_messages + + def style(self, height: int | None = None, **kwargs): + """ + This method can be used to change the appearance of the Chatbot component. + """ + if height is not None: + self._style["height"] = height + if kwargs.get("color_map") is not None: + warnings.warn("The 'color_map' parameter has been deprecated.") + + Component.style( + self, **kwargs, + ) + return self diff --git a/nemo/collections/nlp/modules/common/megatron_web_server.py b/nemo/collections/nlp/modules/common/megatron_web_server.py index 84ceeb286ea3..884f7abe5f01 100644 --- a/nemo/collections/nlp/modules/common/megatron_web_server.py +++ b/nemo/collections/nlp/modules/common/megatron_web_server.py @@ -16,6 +16,8 @@ import gradio as gr +from nemo.collections.nlp.modules.common.chat_css import CSS +from nemo.collections.nlp.modules.common.chatbot_component import Chatbot from nemo.collections.nlp.modules.common.megatron.retrieval_services.util import ( convert_retrieved_to_md, request_data, @@ -24,25 +26,64 @@ __all__ = ['RetroDemoWebApp', 'get_demo'] +TURN_TOKEN = '' -def create_gen_function(port=5555): - def get_generation(prompt, greedy, add_BOS, token_to_gen, min_tokens, temp, top_p, top_k, repetition, end_strings): - data = { - "sentences": [prompt], - "tokens_to_generate": int(token_to_gen), - "temperature": temp, - "add_BOS": add_BOS, - "top_k": top_k, - "top_p": top_p, - "greedy": greedy, - "all_probs": False, - "repetition_penalty": repetition, - "min_tokens_to_generate": int(min_tokens), - "end_strings": [i.strip() for i in end_strings.split(',') if len(i) != 0], - } - response = text_generation(data, port=port) - sentences = response['sentences'] - return sentences[0] +DEFAULT_SYSTEM = "A chat between a curious human and an artificial intelligence assistant. 
+DEFAULT_SYSTEM = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n"
+SYSTEM_TOKEN = '<extra_id_0>System\n'
+# HUMAN_TOKEN = 'Human:'
+# ASSISTANT_TOKEN = 'Assistant:'
+
+
+def create_gen_function(port=5555, chat=False):
+    if chat:
+
+        def get_generation(
+            prompt, preamble, greedy, add_BOS, token_to_gen, min_tokens, temp, top_p, top_k, repetition, end_strings
+        ):
+            if preamble is not None and preamble != '':
+                prompt = SYSTEM_TOKEN + preamble + prompt
+            data = {
+                "sentences": [prompt],
+                "tokens_to_generate": int(token_to_gen),
+                "temperature": temp,
+                "add_BOS": add_BOS,
+                "top_k": top_k,
+                "top_p": top_p,
+                "greedy": greedy,
+                "all_probs": False,
+                "repetition_penalty": repetition,
+                "min_tokens_to_generate": int(min_tokens),
+                "end_strings": [i.strip() for i in end_strings.split(',') if len(i) != 0],
+            }
+            response = text_generation(data, port=port)
+            sentences = response['sentences']
+            bot_message = sentences[0]
+            bot_message = bot_message[len(prompt) :]
+            return bot_message
+
+    else:
+
+        def get_generation(
+            prompt, greedy, add_BOS, token_to_gen, min_tokens, temp, top_p, top_k, repetition, end_strings
+        ):
+            data = {
+                "sentences": [prompt],
+                "tokens_to_generate": int(token_to_gen),
+                "temperature": temp,
+                "add_BOS": add_BOS,
+                "top_k": top_k,
+                "top_p": top_p,
+                "greedy": greedy,
+                "all_probs": False,
+                "repetition_penalty": repetition,
+                "min_tokens_to_generate": int(min_tokens),
+                "end_strings": [i.strip() for i in end_strings.split(',') if len(i) != 0],
+            }
+            response = text_generation(data, port=port)
+            sentences = response['sentences']
+            bot_message = sentences[0]
+            bot_message = bot_message[len(prompt) :]
+            return bot_message
 
     return get_generation
 
@@ -72,7 +113,7 @@ def get_demo(share, username, password, server_port=5555, web_port=9889, loop=No
             output_box = gr.Textbox(value="", label="Output")
             btn = gr.Button(value="Submit")
             btn.click(
-                create_gen_function(server_port),
+                create_gen_function(server_port, chat=False),
                 inputs=[
                     input_prompt,
                     greedy_flag,
@@ -90,6 +131,111 @@ def get_demo(share, username, password, server_port=5555, web_port=9889, loop=No
     demo.launch(share=share, server_port=web_port, server_name='0.0.0.0', auth=(username, password))
 
 
+def get_chatbot_demo(share, username, password, server_port=5555, web_port=9889, loop=None):
+    asyncio.set_event_loop(loop)
+    with gr.Blocks(css=CSS) as demo:
+        # store the multiple turn conversation
+        with gr.Row():
+            with gr.Column(scale=2, width=200):
+                # store the multiple turn conversation
+                session_state = gr.State(value=[])
+                greedy_flag = gr.Checkbox(label="Greedy", value=True)
+                add_BOS = gr.Checkbox(label="Add BOS token", value=False)
+                token_to_gen = gr.Number(label='Number of Tokens to generate', value=300, type=int)
+                min_token_to_gen = gr.Number(label='Min number of Tokens to generate', value=1, type=int)
+                temperature = gr.Slider(minimum=0.0, maximum=10.0, value=1.0, label='Temperature', step=0.1)
+                top_p = gr.Slider(minimum=0.0, maximum=1.0, step=0.02, value=0.9, label='Top P')
+                top_k = gr.Slider(minimum=0, maximum=10000, step=2, value=0, label='Top K')
+                repetition_penality = gr.Slider(
+                    minimum=1.0, maximum=5.0, step=0.02, value=1.2, label='Repetition penalty'
+                )
+                end_strings = gr.Textbox(
+                    label="End strings (comma separated)", value=f"<|endoftext|>,<extra_id_1>,", lines=1,
+                )
+                gr.HTML("<hr>")
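+                # Note: the System box below is prepended to every request as
+                # SYSTEM_TOKEN + preamble inside create_gen_function(chat=True),
+                # mirroring the training-time header used by GPTSFTChatDataset.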
+                human_name = gr.Textbox(label="Human Name", value="User", lines=1,)
+                assistant_name = gr.Textbox(label="Assistant Name", value="Assistant", lines=1,)
+                preamble = gr.Textbox(label="System", value=DEFAULT_SYSTEM, lines=2,)
+            with gr.Column(scale=1, min_width=800):
+                chatbot = Chatbot(elem_id="chatbot").style(height=800)
+                msg = gr.Textbox(label="User", value="", lines=1,)
+                clear = gr.Button("Clear")
+
+                def user(user_message, history, session_state):
+                    session_state.append(user_message)
+                    user_message = user_message.replace('\n', '<br>
') + return "", history + [[user_message, None]] + + def bot( + history, + preamble, + greedy_flag, + add_BOS, + token_to_gen, + min_token_to_gen, + temperature, + top_p, + top_k, + repetition_penality, + end_strings, + human_name, + assistant_name, + session_state, + ): + prompt_text = '' + names = [human_name, assistant_name] + for i, meg in enumerate(session_state): + name = names[i % 2] + prompt_text += TURN_TOKEN + name + '\n' + meg + '\n' + prompt_text += TURN_TOKEN + assistant_name + '\n' + bot_message = create_gen_function(server_port, chat=True)( + prompt_text, + preamble, + greedy_flag, + add_BOS, + token_to_gen, + min_token_to_gen, + temperature, + top_p, + top_k, + repetition_penality, + end_strings, + ) + if bot_message.endswith(TURN_TOKEN): + bot_message = bot_message[: -len(TURN_TOKEN)] + history[-1][1] = bot_message + session_state.append(bot_message.strip()) + return history + + msg.submit(user, [msg, chatbot, session_state], [msg, chatbot], queue=False).then( + bot, + [ + chatbot, + preamble, + greedy_flag, + add_BOS, + token_to_gen, + min_token_to_gen, + temperature, + top_p, + top_k, + repetition_penality, + end_strings, + human_name, + assistant_name, + session_state, + ], + chatbot, + ) + + def clear_fun(session_state): + session_state.clear() + return None + + clear.click(clear_fun, [session_state], chatbot, queue=False) + demo.launch(share=share, server_port=web_port, server_name='0.0.0.0', auth=(username, password)) + + class RetroDemoWebApp: def __init__(self, text_service_ip, text_service_port, combo_service_ip, combo_service_port): self.text_service_ip = text_service_ip diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 0c3c42ba583f..d88280b363c2 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -5,11 +5,12 @@ fasttext flask_restful ftfy gdown -gradio +gradio==3.28.3 h5py ijson inflect jieba +markdown2 matplotlib>=3.3.2 megatron_core==0.1.0 nltk>=3.6.5 From 1a34379461d1982e463189c73d28be5783281185 Mon Sep 17 00:00:00 2001 From: "He Huang (Steve)" <105218074+stevehuang52@users.noreply.github.com> Date: Tue, 16 May 2023 18:42:08 -0400 Subject: [PATCH 58/62] Add support for RNNT/hybrid models to partial transcribe (#6609) * Add support for RNNT/hybrid models to partial transcribe Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * Update transcribe_utils.py Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * Update transcribe_speech.py Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * Update transcribe_utils.py Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- examples/asr/transcribe_speech.py | 32 ++++++------------- .../asr/parts/utils/transcribe_utils.py | 28 +++++++++++----- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index 4a93e630876c..4ed3d92a6305 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -314,28 +314,16 @@ def autocast(): with autocast(): with torch.no_grad(): if partial_audio: - if isinstance(asr_model, EncDecCTCModel): - 
transcriptions = transcribe_partial_audio( - asr_model=asr_model, - path2manifest=cfg.dataset_manifest, - batch_size=cfg.batch_size, - num_workers=cfg.num_workers, - return_hypotheses=return_hypotheses, - channel_selector=cfg.channel_selector, - augmentor=augmentor, - ) - else: - logging.warning( - "RNNT models do not support transcribe partial audio for now. Transcribing full audio." - ) - transcriptions = asr_model.transcribe( - paths2audio_files=filepaths, - batch_size=cfg.batch_size, - num_workers=cfg.num_workers, - return_hypotheses=return_hypotheses, - channel_selector=cfg.channel_selector, - augmentor=augmentor, - ) + transcriptions = transcribe_partial_audio( + asr_model=asr_model, + path2manifest=cfg.dataset_manifest, + batch_size=cfg.batch_size, + num_workers=cfg.num_workers, + return_hypotheses=return_hypotheses, + channel_selector=cfg.channel_selector, + augmentor=augmentor, + decoder_type=cfg.decoder_type, + ) else: transcriptions = asr_model.transcribe( paths2audio_files=filepaths, diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py b/nemo/collections/asr/parts/utils/transcribe_utils.py index 990e3b96b0fc..60f936306d05 100644 --- a/nemo/collections/asr/parts/utils/transcribe_utils.py +++ b/nemo/collections/asr/parts/utils/transcribe_utils.py @@ -362,11 +362,11 @@ def transcribe_partial_audio( num_workers: int = 0, channel_selector: Optional[int] = None, augmentor: DictConfig = None, + decoder_type: Optional[str] = None, ) -> List[str]: """ - See description of this function in trancribe() in nemo/collections/asr/models/ctc_models.py """ - - assert isinstance(asr_model, EncDecCTCModel), "Currently support CTC model only." + See description of this function in transcribe() in nemo/collections/asr/models/ctc_models.py and nemo/collections/asr/models/rnnt_models.py + """ if return_hypotheses and logprobs: raise ValueError( @@ -384,6 +384,17 @@ def transcribe_partial_audio( dither_value = asr_model.preprocessor.featurizer.dither pad_to_value = asr_model.preprocessor.featurizer.pad_to + if decoder_type is not None: # Hybrid model + decode_function = ( + asr_model.decoding.rnnt_decoder_predictions_tensor + if decoder_type == 'rnnt' + else asr_model.decoding.ctc_decoder_predictions_tensor + ) + elif hasattr(asr_model, 'joint'): # RNNT model + decode_function = asr_model.decoding.rnnt_decoder_predictions_tensor + else: # CTC model + decode_function = asr_model.decoding.ctc_decoder_predictions_tensor + try: asr_model.preprocessor.featurizer.dither = 0.0 asr_model.preprocessor.featurizer.pad_to = 0 @@ -406,18 +417,20 @@ def transcribe_partial_audio( temporary_datalayer = asr_model._setup_transcribe_dataloader(config) for test_batch in tqdm(temporary_datalayer, desc="Transcribing"): - logits, logits_len, greedy_predictions = asr_model.forward( + outputs = asr_model.forward( input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) ) + logits, logits_len = outputs[0], outputs[1] if logprobs: # dump log probs per file for idx in range(logits.shape[0]): lg = logits[idx][: logits_len[idx]] hypotheses.append(lg.cpu().numpy()) else: - current_hypotheses, all_hyp = asr_model.decoding.ctc_decoder_predictions_tensor( - logits, decoder_lengths=logits_len, return_hypotheses=return_hypotheses, - ) + current_hypotheses, all_hyp = decode_function(logits, logits_len, return_hypotheses=return_hypotheses,) + + if isinstance(current_hypotheses, tuple) and len(current_hypotheses) == 2: + current_hypotheses = current_hypotheses[0] if return_hypotheses: # dump log probs
per file @@ -428,7 +441,6 @@ def transcribe_partial_audio( hypotheses += current_hypotheses - del greedy_predictions del logits del test_batch From e9fcc418ed97e689ee8a784da6e8593997f28ce9 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Wed, 17 May 2023 12:25:27 +0400 Subject: [PATCH 59/62] eval_beamsearch_ngram.py with hybrid ctc (#6656) * separate_punctuation = false * ctc decoding strategy = model.decoding * transcribe(files, logprobs=True) returns logprobs --------- Signed-off-by: Nikolay Karpov Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- examples/asr/speech_to_text_eval.py | 3 +- .../asr/models/hybrid_rnnt_ctc_models.py | 11 +++++- .../ngram_lm/eval_beamsearch_ngram.py | 39 ++++++++++++++----- 3 files changed, 41 insertions(+), 12 deletions(-) diff --git a/examples/asr/speech_to_text_eval.py b/examples/asr/speech_to_text_eval.py index f8dcbcf81bbd..f4d2a66ffec0 100644 --- a/examples/asr/speech_to_text_eval.py +++ b/examples/asr/speech_to_text_eval.py @@ -82,7 +82,7 @@ class EvaluationConfig(transcribe_speech.TranscriptionConfig): only_score_manifest: bool = False text_processing: Optional[TextProcessingConfig] = TextProcessingConfig( - punctuation_marks=".,?", separate_punctuation=True, do_lowercase=False, rm_punctuation=False, + punctuation_marks=".,?", separate_punctuation=False, do_lowercase=False, rm_punctuation=False, ) @@ -134,6 +134,7 @@ def main(cfg: EvaluationConfig): pc = PunctuationCapitalization(cfg.text_processing.punctuation_marks) if cfg.text_processing.separate_punctuation: ground_truth_text = pc.separate_punctuation(ground_truth_text) + predicted_text = pc.separate_punctuation(predicted_text) if cfg.text_processing.do_lowercase: ground_truth_text = pc.do_lowercase(ground_truth_text) predicted_text = pc.do_lowercase(predicted_text) diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py index 447caa3f5de6..5ca6124ecfd7 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_models.py @@ -102,6 +102,7 @@ def transcribe( channel_selector: Optional[ChannelSelectorType] = None, augmentor: DictConfig = None, verbose: bool = True, + logprobs: bool = False, ) -> (List[str], Optional[List['Hypothesis']]): """ Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping. @@ -119,6 +120,7 @@ def transcribe( channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. augmentor: (DictConfig): Augment audio samples during transcription if augmentor is applied. 
verbose: (bool) whether to display tqdm progress bar + logprobs: (bool) whether to return CTC logits instead of hypotheses Returns: Returns a tuple of 2 items - @@ -189,6 +191,7 @@ def transcribe( config['augmentor'] = augmentor temporary_datalayer = self._setup_transcribe_dataloader(config) + logits_list = [] for test_batch in tqdm(temporary_datalayer, desc="Transcribing", disable=not verbose): encoded, encoded_len = self.forward( input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) @@ -206,6 +209,9 @@ def transcribe( best_hyp[idx].y_sequence = logits[idx][: encoded_len[idx]] if best_hyp[idx].alignments is None: best_hyp[idx].alignments = best_hyp[idx].y_sequence + if logprobs: + for logit, elen in zip(logits, encoded_len): + logits_list.append(logit[:elen]) del logits hypotheses += best_hyp @@ -229,7 +235,10 @@ def transcribe( self.joint.unfreeze() if hasattr(self, 'ctc_decoder'): self.ctc_decoder.unfreeze() - return hypotheses, all_hypotheses + if logprobs: + return logits_list + else: + return hypotheses, all_hypotheses def change_vocabulary( self, diff --git a/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram.py b/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram.py index 1f62da6bb168..1846a986bf6e 100644 --- a/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram.py +++ b/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram.py @@ -15,10 +15,12 @@ """ # This script would evaluate an N-gram language model trained with KenLM library (https://github.com/kpu/kenlm) in -# fusion with beam search decoders on top of a trained ASR model. NeMo's beam search decoders are capable of using the -# KenLM's N-gram models to find the best candidates. This script supports both character level and BPE level +# fusion with beam search decoders on top of a trained ASR model with CTC decoder. To evaluate a model with +# Transducer (RNN-T) decoder, use another script 'scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py'. +# NeMo's beam search decoders are capable of using the KenLM's N-gram models +# to find the best candidates. This script supports both character level and BPE level # encodings and models which is detected automatically from the type of the model. -# You may train the LM model with 'scripts/ngram_lm/train_kenlm.py'. +# You may train the LM model with 'scripts/asr_language_modeling/ngram_lm/train_kenlm.py'.
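#
# A rough sketch of what the hybrid-model support in this change amounts to when driven
# from Python directly; the checkpoint and audio filenames below are illustrative
# placeholders, not files referenced by this patch:
#
#     import nemo.collections.asr as nemo_asr
#     from nemo.collections.asr.models import EncDecHybridRNNTCTCModel
#
#     asr_model = nemo_asr.models.ASRModel.restore_from("model.nemo")  # placeholder checkpoint
#     if isinstance(asr_model, EncDecHybridRNNTCTCModel):
#         # Point the hybrid model at its CTC head so the N-gram fusion below
#         # operates on CTC log-probabilities rather than transducer hypotheses.
#         asr_model.cur_decoder = 'ctc'
#     # With logprobs=True, transcribe() now returns per-utterance log-probability
#     # matrices instead of decoded text, which this script then rescores with the LM.
#     all_logits = asr_model.transcribe(["sample.wav"], batch_size=16, logprobs=True)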
# Config Help @@ -29,7 +31,7 @@ # USAGE python eval_beamsearch_ngram.py nemo_model_file=<path to the .nemo file of the model> \ - input_manifest=<path to the evaluation JSON manifest file> \ kenlm_model_file=<path to the binary KenLM model> \ beam_width=[<list of the beam widths, separated with commas>] \ beam_alpha=[<list of the beam alphas, separated with commas>] \ @@ -70,6 +72,7 @@ from tqdm.auto import tqdm import nemo.collections.asr as nemo_asr +from nemo.collections.asr.models import EncDecHybridRNNTCTCModel from nemo.collections.asr.parts.submodules import ctc_beam_decoding from nemo.collections.asr.parts.utils.transcribe_utils import PunctuationCapitalization, TextProcessingConfig from nemo.core.config import hydra_runner @@ -113,7 +116,7 @@ class EvalBeamSearchNGramConfig: text_processing: Optional[TextProcessingConfig] = TextProcessingConfig( punctuation_marks = ".,?", - separate_punctuation = True, + separate_punctuation = False, do_lowercase = False, rm_punctuation = False, ) @@ -151,7 +154,12 @@ def beam_search_eval( model.cfg.decoding.beam = cfg.decoding # Update model's decoding strategy - model.change_decoding_strategy(model.cfg.decoding) + if isinstance(model, EncDecHybridRNNTCTCModel): + model.change_decoding_strategy(model.cfg.decoding, decoder_type='ctc') + decoding = model.ctc_decoding + else: + model.change_decoding_strategy(model.cfg.decoding) + decoding = model.decoding logging.setLevel(level) wer_dist_first = cer_dist_first = 0 @@ -182,7 +190,7 @@ def beam_search_eval( probs_batch[prob_index], device=packed_batch.device, dtype=packed_batch.dtype ) - _, beams_batch = model.decoding.ctc_decoder_predictions_tensor( + _, beams_batch = decoding.ctc_decoder_predictions_tensor( packed_batch, decoder_lengths=probs_lens, return_hypotheses=True, ) @@ -199,6 +207,8 @@ def beam_search_eval( pred_text = punctuation_capitalization.do_lowercase([pred_text])[0] if cfg.text_processing.rm_punctuation: pred_text = punctuation_capitalization.rm_punctuation([pred_text])[0] + if cfg.text_processing.separate_punctuation: + pred_text = punctuation_capitalization.separate_punctuation([pred_text])[0] pred_split_w = pred_text.split() wer_dist = editdistance.eval(target_split_w, pred_split_w) pred_split_c = list(pred_text) @@ -247,6 +257,7 @@ @hydra_runner(config_path=None, config_name='EvalBeamSearchNGramConfig', schema=EvalBeamSearchNGramConfig) def main(cfg: EvalBeamSearchNGramConfig): + logging.warning("This file will be renamed to eval_beamsearch_ngram_ctc.py in the future NeMo (1.21) release.") if is_dataclass(cfg): cfg = OmegaConf.structured(cfg) # type: EvalBeamSearchNGramConfig @@ -279,12 +290,12 @@ def main(cfg: EvalBeamSearchNGramConfig): audio_file_paths.append(str(audio_file.absolute())) punctuation_capitalization = PunctuationCapitalization(cfg.text_processing.punctuation_marks) - if cfg.text_processing.separate_punctuation: - target_transcripts = punctuation_capitalization.separate_punctuation(target_transcripts) if cfg.text_processing.do_lowercase: target_transcripts = punctuation_capitalization.do_lowercase(target_transcripts) if cfg.text_processing.rm_punctuation: target_transcripts = punctuation_capitalization.rm_punctuation(target_transcripts) + if cfg.text_processing.separate_punctuation: + target_transcripts = punctuation_capitalization.separate_punctuation(target_transcripts) if cfg.probs_cache_file and os.path.exists(cfg.probs_cache_file): logging.info(f"Found a pickle file of probabilities at '{cfg.probs_cache_file}'.") @@ -316,6 +327,8 @@ def default_autocast(): with autocast(): with torch.no_grad(): + if isinstance(asr_model, EncDecHybridRNNTCTCModel): + asr_model.cur_decoder = 'ctc' all_logits = asr_model.transcribe(audio_file_paths,
batch_size=cfg.acoustic_batch_size, logprobs=True) all_probs = all_logits @@ -331,11 +344,17 @@ def default_autocast(): for batch_idx, probs in enumerate(all_probs): preds = np.argmax(probs, axis=1) preds_tensor = torch.tensor(preds, device='cpu').unsqueeze(0) - pred_text = asr_model._wer.decoding.ctc_decoder_predictions_tensor(preds_tensor)[0][0] + if isinstance(asr_model, EncDecHybridRNNTCTCModel): + pred_text = asr_model.ctc_decoding.ctc_decoder_predictions_tensor(preds_tensor)[0][0] + else: + pred_text = asr_model._wer.decoding.ctc_decoder_predictions_tensor(preds_tensor)[0][0] + if cfg.text_processing.do_lowercase: pred_text = punctuation_capitalization.do_lowercase([pred_text])[0] if cfg.text_processing.rm_punctuation: pred_text = punctuation_capitalization.rm_punctuation([pred_text])[0] + if cfg.text_processing.separate_punctuation: + pred_text = punctuation_capitalization.separate_punctuation([pred_text])[0] pred_split_w = pred_text.split() target_split_w = target_transcripts[batch_idx].split() From 78fe8937ab8e4527f01b11e5f952b5f33dcdb39c Mon Sep 17 00:00:00 2001 From: Nithin Rao Date: Wed, 17 May 2023 10:35:11 -0700 Subject: [PATCH 60/62] fix bucketing bug issue for picking new bucket (#6663) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri --- nemo/collections/asr/data/audio_to_text.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nemo/collections/asr/data/audio_to_text.py b/nemo/collections/asr/data/audio_to_text.py index 3b2e2a767a97..58cd3630e322 100644 --- a/nemo/collections/asr/data/audio_to_text.py +++ b/nemo/collections/asr/data/audio_to_text.py @@ -1341,8 +1341,7 @@ def __next__(self): try: sample = next(self.wrapped_iter) except StopIteration: - self.wrapped_iter = iter(self.wrapped_ds) - sample = next(self.wrapped_iter) + break batches.append(sample) if len(batches) == 0: raise StopIteration From 8aa80ee661bbddd511a652aa207e80ebb82e2930 Mon Sep 17 00:00:00 2001 From: Adi Renduchintala Date: Wed, 17 May 2023 19:48:58 -0700 Subject: [PATCH 61/62] minor fix for missing chat attr (#6671) Signed-off-by: arendu --- .../nlp/models/language_modeling/megatron_gpt_sft_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index 61b491d4af1d..7819d28e8150 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -235,7 +235,7 @@ def _build_dataset(self, data_cfg, is_train=True): num_train_samples_per_dataset = [[None]] * len(data_cfg.file_names) for file_path, num_samples in zip(data_cfg.file_names, num_train_samples_per_dataset): - if self.cfg.data.chat: + if self.cfg.data.get("chat", False): dataset_cls = GPTSFTChatDataset else: dataset_cls = GPTSFTDataset From 4c6e2bc0139823a9262736abeccac8c5f0bc5e96 Mon Sep 17 00:00:00 2001 From: Ryan Langman Date: Thu, 18 May 2023 09:39:56 -0700 Subject: [PATCH 62/62] [TTS] Add callback for saving audio during FastPitch training (#6665) * [TTS] Add callback for saving audio during FastPitch training Signed-off-by: Ryan * [TTS] Allow NGC model name for vocoder Signed-off-by: Ryan --------- Signed-off-by: Ryan --- .../tts/conf/fastpitch/fastpitch_22050.yaml | 44 +- .../tts/data/text_to_speech_dataset.py | 15 +- nemo/collections/tts/models/fastpitch.py | 36 +- nemo/collections/tts/modules/fastpitch.py | 27 +- 
nemo/collections/tts/parts/utils/callbacks.py | 393 ++++++++++++++++++ nemo/collections/tts/parts/utils/helpers.py | 17 + 6 files changed, 517 insertions(+), 15 deletions(-) create mode 100644 nemo/collections/tts/parts/utils/callbacks.py diff --git a/examples/tts/conf/fastpitch/fastpitch_22050.yaml b/examples/tts/conf/fastpitch/fastpitch_22050.yaml index 016e157ce39f..4022e8e91c97 100644 --- a/examples/tts/conf/fastpitch/fastpitch_22050.yaml +++ b/examples/tts/conf/fastpitch/fastpitch_22050.yaml @@ -14,10 +14,16 @@ feature_stats_path: null train_ds_meta: ??? val_ds_meta: ??? +log_ds_meta: ??? phoneme_dict_path: ??? heteronyms_path: ??? +log_dir: ??? +vocoder_type: ??? +vocoder_name: null +vocoder_checkpoint_path: null + defaults: - feature: feature_22050 @@ -27,6 +33,7 @@ model: n_speakers: ${n_speakers} n_mel_channels: ${feature.mel_feature.mel_dim} + min_token_duration: 1 max_token_duration: 75 symbols_embedding_dim: 384 pitch_embedding_kernel_size: 3 @@ -126,7 +133,42 @@ model: dataloader_params: batch_size: ${batch_size} - drop_last: false + num_workers: 2 + + log_config: + log_dir: ${log_dir} + log_epochs: [10, 50] + epoch_frequency: 100 + log_tensorboard: false + log_wandb: false + + generators: + - _target_: nemo.collections.tts.parts.utils.callbacks.FastPitchArtifactGenerator + log_spectrogram: true + log_alignment: true + audio_params: + _target_: nemo.collections.tts.parts.utils.callbacks.LogAudioParams + log_audio_gta: true + vocoder_type: ${vocoder_type} + vocoder_name: ${vocoder_name} + vocoder_checkpoint_path: ${vocoder_checkpoint_path} + + dataset: + _target_: nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset + text_tokenizer: ${model.text_tokenizer} + sample_rate: ${feature.sample_rate} + speaker_path: ${speaker_path} + align_prior_config: ${model.align_prior_config} + featurizers: ${feature.featurizers} + + feature_processors: + pitch: ${model.pitch_processor} + energy: ${model.energy_processor} + + dataset_meta: ${log_ds_meta} + + dataloader_params: + batch_size: 8 num_workers: 2 input_fft: diff --git a/nemo/collections/tts/data/text_to_speech_dataset.py b/nemo/collections/tts/data/text_to_speech_dataset.py index f6230fa3493a..47868d41d1ec 100644 --- a/nemo/collections/tts/data/text_to_speech_dataset.py +++ b/nemo/collections/tts/data/text_to_speech_dataset.py @@ -190,8 +190,8 @@ def _process_dataset( sample = DatasetSample( manifest_entry=entry, - audio_dir=dataset.audio_dir, - feature_dir=dataset.feature_dir, + audio_dir=Path(dataset.audio_dir), + feature_dir=Path(dataset.feature_dir), text=text, speaker=speaker, speaker_index=speaker_index, @@ -208,12 +208,12 @@ def __getitem__(self, index): data = self.data_samples[index] audio_filepath = Path(data.manifest_entry["audio_filepath"]) - audio_path, _ = get_abs_rel_paths(input_path=audio_filepath, base_path=data.audio_dir) + audio_filepath_abs, audio_filepath_rel = get_abs_rel_paths(input_path=audio_filepath, base_path=data.audio_dir) - audio, _ = librosa.load(audio_path, sr=self.sample_rate) + audio, _ = librosa.load(audio_filepath_abs, sr=self.sample_rate) tokens = self.text_tokenizer(data.text) - example = {"audio": audio, "tokens": tokens} + example = {"audio_filepath": audio_filepath_rel, "audio": audio, "tokens": tokens} if data.speaker is not None: example["speaker"] = data.speaker @@ -243,7 +243,7 @@ def __getitem__(self, index): return example def collate_fn(self, batch: List[dict]): - + audio_filepath_list = [] audio_list = [] audio_len_list = [] token_list = [] @@ -252,6 +252,8 @@ def 
collate_fn(self, batch: List[dict]): prior_list = [] for example in batch: + audio_filepath_list.append(example["audio_filepath"]) + audio_tensor = torch.tensor(example["audio"], dtype=torch.float32) audio_list.append(audio_tensor) audio_len_list.append(audio_tensor.shape[0]) @@ -276,6 +278,7 @@ def collate_fn(self, batch: List[dict]): batch_tokens = stack_tensors(token_list, max_lens=[token_max_len], pad_value=self.text_tokenizer.pad) batch_dict = { + "audio_filepaths": audio_filepath_list, "audio": batch_audio, "audio_lens": batch_audio_len, "text": batch_tokens, diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py index 281a7c2891b3..3939c9453911 100644 --- a/nemo/collections/tts/models/fastpitch.py +++ b/nemo/collections/tts/models/fastpitch.py @@ -13,6 +13,7 @@ # limitations under the License. import contextlib from dataclasses import dataclass +from pathlib import Path from typing import List, Optional import torch @@ -27,6 +28,7 @@ from nemo.collections.tts.models.base import SpectrogramGenerator from nemo.collections.tts.modules.fastpitch import FastPitchModule from nemo.collections.tts.parts.mixins import FastPitchAdapterModelMixin +from nemo.collections.tts.parts.utils.callbacks import LoggingCallback from nemo.collections.tts.parts.utils.helpers import ( batch_from_ragged, plot_alignment_to_numpy, @@ -115,6 +117,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): super().__init__(cfg=cfg, trainer=trainer) self.bin_loss_warmup_epochs = cfg.get("bin_loss_warmup_epochs", 100) + self.log_images = cfg.get("log_images", False) self.log_train_images = False loss_scale = 0.1 if self.learn_alignment else 1.0 @@ -154,6 +157,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): speaker_emb_condition_prosody = cfg.get("speaker_emb_condition_prosody", False) speaker_emb_condition_decoder = cfg.get("speaker_emb_condition_decoder", False) speaker_emb_condition_aligner = cfg.get("speaker_emb_condition_aligner", False) + min_token_duration = cfg.get("min_token_duration", 0) use_log_energy = cfg.get("use_log_energy", True) if n_speakers > 1 and "add" not in input_fft.cond_input.condition_types: input_fft.cond_input.condition_types.append("add") @@ -178,6 +182,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): cfg.pitch_embedding_kernel_size, energy_embedding_kernel_size, cfg.n_mel_channels, + min_token_duration, cfg.max_token_duration, use_log_energy, ) @@ -190,6 +195,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): if self.fastpitch.speaker_emb is not None: self.export_config["num_speakers"] = cfg.n_speakers + self.log_config = cfg.get("log_config", None) + # Adapter modules setup (from FastPitchAdapterModelMixin) self.setup_adapters() @@ -462,7 +469,7 @@ def training_step(self, batch, batch_idx): self.log("t_bin_loss", bin_loss) # Log images to tensorboard - if self.log_train_images and isinstance(self.logger, TensorBoardLogger): + if self.log_images and self.log_train_images and isinstance(self.logger, TensorBoardLogger): self.log_train_images = False self.tb_logger.add_image( @@ -571,7 +578,7 @@ def validation_epoch_end(self, outputs): _, _, _, _, _, spec_target, spec_predict = outputs[0].values() - if isinstance(self.logger, TensorBoardLogger): + if self.log_images and isinstance(self.logger, TensorBoardLogger): self.tb_logger.add_image( "val_mel_target", plot_spectrogram_to_numpy(spec_target[0].data.cpu().float().numpy()), @@ -658,6 +665,31 @@ def setup_test_data(self, cfg): 
"""Omitted.""" pass + def configure_callbacks(self): + if not self.log_config: + return [] + + sample_ds_class = self.log_config.dataset._target_ + if sample_ds_class != "nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset": + raise ValueError(f"Logging callback only supported for TextToSpeechDataset, got {sample_ds_class}") + + data_loader = self._setup_test_dataloader(self.log_config) + + generators = instantiate(self.log_config.generators) + log_dir = Path(self.log_config.log_dir) if self.log_config.log_dir else None + log_callback = LoggingCallback( + generators=generators, + data_loader=data_loader, + log_epochs=self.log_config.log_epochs, + epoch_frequency=self.log_config.epoch_frequency, + output_dir=log_dir, + loggers=self.trainer.loggers, + log_tensorboard=self.log_config.log_tensorboard, + log_wandb=self.log_config.log_wandb, + ) + + return [log_callback] + @classmethod def list_available_models(cls) -> 'List[PretrainedModelInfo]': """ diff --git a/nemo/collections/tts/modules/fastpitch.py b/nemo/collections/tts/modules/fastpitch.py index b26aafa72e32..f7601302d81e 100644 --- a/nemo/collections/tts/modules/fastpitch.py +++ b/nemo/collections/tts/modules/fastpitch.py @@ -80,6 +80,12 @@ def average_features(pitch, durs): return pitch_avg +def log_to_duration(log_dur, min_dur, max_dur, mask): + dur = torch.clamp(torch.exp(log_dur) - 1.0, min_dur, max_dur) + dur *= mask.squeeze(2) + return dur + + class ConvReLUNorm(torch.nn.Module, adapter_mixins.AdapterModuleMixin): def __init__(self, in_channels, out_channels, kernel_size=1, dropout=0.0, condition_dim=384, condition_types=[]): super(ConvReLUNorm, self).__init__() @@ -163,6 +169,7 @@ def __init__( pitch_embedding_kernel_size: int, energy_embedding_kernel_size: int, n_mel_channels: int = 80, + min_token_duration: int = 0, max_token_duration: int = 75, use_log_energy: bool = True, ): @@ -188,8 +195,8 @@ def __init__( else: self.speaker_emb = None + self.min_token_duration = min_token_duration self.max_token_duration = max_token_duration - self.min_token_duration = 0 self.pitch_emb = torch.nn.Conv1d( 1, @@ -294,7 +301,9 @@ def forward( # Predict duration log_durs_predicted = self.duration_predictor(enc_out, enc_mask, conditioning=spk_emb) - durs_predicted = torch.clamp(torch.exp(log_durs_predicted) - 1, 0, self.max_token_duration) + durs_predicted = log_to_duration( + log_dur=log_durs_predicted, min_dur=self.min_token_duration, max_dur=self.max_token_duration, mask=enc_mask + ) attn_soft, attn_hard, attn_hard_dur, attn_logprob = None, None, None, None if self.learn_alignment and spec is not None: @@ -398,8 +407,8 @@ def infer( # Predict duration and pitch log_durs_predicted = self.duration_predictor(enc_out, enc_mask, conditioning=spk_emb) - durs_predicted = torch.clamp( - torch.exp(log_durs_predicted) - 1.0, self.min_token_duration, self.max_token_duration + durs_predicted = log_to_duration( + log_dur=log_durs_predicted, min_dur=self.min_token_duration, max_dur=self.max_token_duration, mask=enc_mask ) pitch_predicted = self.pitch_predictor(enc_out, enc_mask, conditioning=spk_emb) + pitch pitch_emb = self.pitch_emb(pitch_predicted.unsqueeze(1)) @@ -444,6 +453,7 @@ def __init__( symbols_embedding_dim: int, pitch_embedding_kernel_size: int, n_mel_channels: int = 80, + min_token_duration: int = 0, max_token_duration: int = 75, ): super().__init__() @@ -453,8 +463,8 @@ def __init__( self.duration_predictor = duration_predictor self.pitch_predictor = pitch_predictor + self.min_token_duration = min_token_duration 
self.max_token_duration = max_token_duration - self.min_token_duration = 0 if self.pitch_predictor is not None: self.pitch_emb = torch.nn.Conv1d( @@ -497,7 +507,12 @@ def forward(self, *, enc_out=None, enc_mask=None, durs=None, pitch=None, pace=1. log_durs_predicted, durs_predicted = None, None if self.duration_predictor is not None: log_durs_predicted = self.duration_predictor(enc_out, enc_mask) - durs_predicted = torch.clamp(torch.exp(log_durs_predicted) - 1, 0, self.max_token_duration) + durs_predicted = log_to_duration( + log_dur=log_durs_predicted, + min_dur=self.min_token_duration, + max_dur=self.max_token_duration, + mask=enc_mask, + ) # Predict pitch pitch_predicted = None diff --git a/nemo/collections/tts/parts/utils/callbacks.py b/nemo/collections/tts/parts/utils/callbacks.py new file mode 100644 index 000000000000..0f8bd0fa4177 --- /dev/null +++ b/nemo/collections/tts/parts/utils/callbacks.py @@ -0,0 +1,393 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from abc import ABC, abstractmethod +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List, Optional, Tuple, Type + +import librosa +import numpy as np +import soundfile as sf +import torch +from pytorch_lightning import Callback, LightningModule, Trainer +from pytorch_lightning.loggers import TensorBoardLogger +from pytorch_lightning.loggers.logger import Logger +from pytorch_lightning.loggers.wandb import WandbLogger + +from nemo.collections.tts.parts.utils.helpers import create_plot +from nemo.utils.decorators import experimental + +HAVE_WANDB = True +try: + import wandb +except ModuleNotFoundError: + HAVE_WANDB = False + + +def _get_logger(loggers: List[Logger], logger_type: Type[Logger]): + for logger in loggers: + if isinstance(logger, logger_type): + if hasattr(logger, "experiment"): + return logger.experiment + else: + return logger + raise ValueError(f"Could not find {logger_type} logger in {loggers}.") + + +def _load_vocoder(model_name: Optional[str], checkpoint_path: Optional[str], type: str): + assert (model_name is None) != ( + checkpoint_path is None + ), f"Must provide exactly one of vocoder model_name or checkpoint: ({model_name}, {checkpoint_path})" + + checkpoint_path = str(checkpoint_path) + if type == "hifigan": + from nemo.collections.tts.models import HifiGanModel + + model_type = HifiGanModel + elif type == "univnet": + from nemo.collections.tts.models import UnivNetModel + + model_type = UnivNetModel + else: + raise ValueError(f"Unknown vocoder type '{type}'") + + if model_name is not None: + vocoder = model_type.from_pretrained(model_name).eval() + else: + vocoder = model_type.load_from_checkpoint(checkpoint_path).eval() + + return vocoder + + +@dataclass +class AudioArtifact: + id: str + data: np.ndarray + sample_rate: int + filename: str + + +@dataclass +class ImageArtifact: + id: str + data: np.ndarray + filename: str + x_axis: str + y_axis: str + + +@dataclass +class 
LogAudioParams: + vocoder_type: str + vocoder_name: str + vocoder_checkpoint_path: str + log_audio_gta: bool = False + + +def create_id(filepath: Path) -> str: + path_prefix = str(filepath.with_suffix("")) + file_id = path_prefix.replace(os.sep, "_") + return file_id + + +class ArtifactGenerator(ABC): + @abstractmethod + def generate_artifacts( + self, model: LightningModule, batch_dict: Dict + ) -> Tuple[List[AudioArtifact], List[ImageArtifact]]: + """ + Create artifacts for the input model and test batch. + + Args: + model: Model instance being trained to use for inference. + batch_dict: Test batch to generate artifacts for. + + Returns: + List of audio and image artifacts to log. + """ + + +@experimental +class LoggingCallback(Callback): + """ + Callback which can log artifacts (eg. model predictions, graphs) to local disk, Tensorboard, and/or WandB. + + Args: + generators: List of generators to create and log artifacts from. + data_loader: Data to log artifacts for. + log_epochs: Optional list of specific training epoch numbers to log artifacts for. + epoch_frequency: Frequency with which to log + output_dir: Optional local directory. If provided, artifacts will be saved in output_dir. + loggers: Optional list of loggers to use if logging to tensorboard or wandb. + log_tensorboard: Whether to log artifacts to tensorboard. + log_wandb: Whether to log artifacts to WandB. + """ + + def __init__( + self, + generators: List[ArtifactGenerator], + data_loader: torch.utils.data.DataLoader, + log_epochs: Optional[List[int]] = None, + epoch_frequency: int = 1, + output_dir: Optional[Path] = None, + loggers: Optional[List[Logger]] = None, + log_tensorboard: bool = False, + log_wandb: bool = False, + ): + self.generators = generators + self.data_loader = data_loader + self.log_epochs = log_epochs if log_epochs else [] + self.epoch_frequency = epoch_frequency + self.output_dir = Path(output_dir) if output_dir else None + self.loggers = loggers if loggers else [] + self.log_tensorboard = log_tensorboard + self.log_wandb = log_wandb + + if log_tensorboard: + self.tensorboard_logger = _get_logger(self.loggers, TensorBoardLogger) + else: + self.tensorboard_logger = None + + if log_wandb: + if not HAVE_WANDB: + raise ValueError("Wandb not installed.") + self.wandb_logger = _get_logger(self.loggers, WandbLogger) + else: + self.wandb_logger = None + + def _log_audio(self, audio: AudioArtifact, log_dir: Path, step: int): + if log_dir: + filepath = log_dir / audio.filename + sf.write(file=filepath, data=audio.data, samplerate=audio.sample_rate) + + if self.tensorboard_logger: + self.tensorboard_logger.add_audio( + tag=audio.id, snd_tensor=audio.data, global_step=step, sample_rate=audio.sample_rate, + ) + + if self.wandb_logger: + wandb_audio = (wandb.Audio(audio.data, sample_rate=audio.sample_rate, caption=audio.id),) + self.wandb_logger.log({audio.id: wandb_audio}) + + def _log_image(self, image: ImageArtifact, log_dir: Path, step: int): + if log_dir: + filepath = log_dir / image.filename + else: + filepath = None + + image_plot = create_plot(output_filepath=filepath, data=image.data, x_axis=image.x_axis, y_axis=image.y_axis) + + if self.tensorboard_logger: + self.tensorboard_logger.add_image( + tag=image.id, img_tensor=image_plot, global_step=step, dataformats="HWC", + ) + + if self.wandb_logger: + wandb_image = (wandb.Image(image_plot, caption=image.id),) + self.wandb_logger.log({image.id: wandb_image}) + + def on_train_epoch_end(self, trainer: Trainer, model: LightningModule): + epoch = 1 + 
model.current_epoch + if (epoch not in self.log_epochs) and (epoch % self.epoch_frequency != 0): + return + + if self.output_dir: + log_dir = self.output_dir / f"epoch_{epoch}" + log_dir.mkdir(parents=True, exist_ok=True) + else: + log_dir = None + + audio_list = [] + image_list = [] + for batch_dict in self.data_loader: + for key, value in batch_dict.items(): + if isinstance(value, torch.Tensor): + batch_dict[key] = value.to(model.device) + + for generator in self.generators: + audio, images = generator.generate_artifacts(model=model, batch_dict=batch_dict) + audio_list += audio + image_list += images + + for audio in audio_list: + self._log_audio(audio=audio, log_dir=log_dir, step=model.global_step) + + for image in image_list: + self._log_image(image=image, log_dir=log_dir, step=model.global_step) + + +class FastPitchArtifactGenerator(ArtifactGenerator): + """ + Generator for logging FastPitch model outputs. + + Args: + log_spectrogram: Whether to log predicted spectrograms. + log_alignment: Whether to log alignment graphs. + audio_params: Optional parameters for saving predicted audio. + Requires a vocoder model checkpoint for generating audio from predicted spectrograms. + """ + + def __init__( + self, + log_spectrogram: bool = False, + log_alignment: bool = False, + audio_params: Optional[LogAudioParams] = None, + ): + self.log_spectrogram = log_spectrogram + self.log_alignment = log_alignment + + if not audio_params: + self.log_audio = False + self.log_audio_gta = False + self.vocoder = None + else: + self.log_audio = True + self.log_audio_gta = audio_params.log_audio_gta + self.vocoder = _load_vocoder( + model_name=audio_params.vocoder_name, + checkpoint_path=audio_params.vocoder_checkpoint_path, + type=audio_params.vocoder_type, + ) + + def _generate_audio(self, mels, mels_len, hop_length): + voc_input = mels.to(self.vocoder.device) + with torch.no_grad(): + audio_pred = self.vocoder.convert_spectrogram_to_audio(spec=voc_input) + + mels_len_array = mels_len.cpu().numpy() + audio_pred_lens = librosa.core.frames_to_samples(mels_len_array, hop_length=hop_length) + return audio_pred, audio_pred_lens + + def _generate_predictions(self, model: LightningModule, audio_ids: List[str], batch_dict: Dict): + audio_artifacts = [] + image_artifacts = [] + + text = batch_dict.get("text") + text_lens = batch_dict.get("text_lens") + speaker = batch_dict.get("speaker_id", None) + + with torch.no_grad(): + # [B, C, T_spec] + mels_pred, mels_pred_len, *_ = model.forward(text=text, input_lens=text_lens, speaker=speaker,) + + if self.log_spectrogram: + for i, audio_id in enumerate(audio_ids): + spec_i = mels_pred[i][:, : mels_pred_len[i]].cpu().numpy() + spec_artifact = ImageArtifact( + id=f"spec_{audio_id}", + data=spec_i, + filename=f"{audio_id}_spec.png", + x_axis="Audio Frames", + y_axis="Channels", + ) + image_artifacts.append(spec_artifact) + + if self.log_audio: + # [B, T_audio] + audio_pred, audio_pred_lens = self._generate_audio( + mels=mels_pred, mels_len=mels_pred_len, hop_length=model.preprocessor.hop_length + ) + for i, audio_id in enumerate(audio_ids): + audio_pred_i = audio_pred[i][: audio_pred_lens[i]].cpu().numpy() + audio_artifact = AudioArtifact( + id=f"audio_{audio_id}", + data=audio_pred_i, + filename=f"{audio_id}.wav", + sample_rate=self.vocoder.sample_rate, + ) + audio_artifacts.append(audio_artifact) + + return audio_artifacts, image_artifacts + + def _generate_gta_predictions(self, model: LightningModule, audio_ids: List[str], batch_dict: Dict): + audio_artifacts = [] + 
image_artifacts = [] + + audio = batch_dict.get("audio") + audio_lens = batch_dict.get("audio_lens") + text = batch_dict.get("text") + text_lens = batch_dict.get("text_lens") + attn_prior = batch_dict.get("align_prior_matrix", None) + pitch = batch_dict.get("pitch", None) + energy = batch_dict.get("energy", None) + speaker = batch_dict.get("speaker_id", None) + + mels, spec_len = model.preprocessor(input_signal=audio, length=audio_lens) + with torch.no_grad(): + mels_pred, mels_pred_len, _, _, _, attn, _, _, _, _, _, _ = model.forward( + text=text, + input_lens=text_lens, + pitch=pitch, + energy=energy, + speaker=speaker, + spec=mels, + mel_lens=spec_len, + attn_prior=attn_prior, + ) + + if self.log_alignment: + # [B, T_spec, T_text] + attn = attn.squeeze(1) + for i, audio_id in enumerate(audio_ids): + attn_i = attn[i][: mels_pred_len[i], : text_lens[i]].cpu().numpy() + alignment_artifact = ImageArtifact( + id=f"align_{audio_id}", + data=attn_i, + filename=f"{audio_id}_align.png", + x_axis="Audio Frames", + y_axis="Text Tokens", + ) + image_artifacts.append(alignment_artifact) + + if self.log_audio_gta: + # [B, T_audio] + audio_pred, audio_pred_lens = self._generate_audio( + mels=mels_pred, mels_len=mels_pred_len, hop_length=model.preprocessor.hop_length + ) + for i, audio_id in enumerate(audio_ids): + audio_pred_i = audio_pred[i][: audio_pred_lens[i]].cpu().numpy() + audio_artifact = AudioArtifact( + id=f"audio_gta_{audio_id}", + data=audio_pred_i, + filename=f"{audio_id}_gta.wav", + sample_rate=self.vocoder.sample_rate, + ) + audio_artifacts.append(audio_artifact) + + return audio_artifacts, image_artifacts + + def generate_artifacts( + self, model: LightningModule, batch_dict: Dict + ) -> Tuple[List[AudioArtifact], List[ImageArtifact]]: + + audio_artifacts = [] + image_artifacts = [] + audio_filepaths = batch_dict.get("audio_filepaths") + audio_ids = [create_id(p) for p in audio_filepaths] + + if self.log_audio or self.log_spectrogram: + audio_pred, spec_pred = self._generate_predictions(model=model, batch_dict=batch_dict, audio_ids=audio_ids) + audio_artifacts += audio_pred + image_artifacts += spec_pred + + if self.log_audio_gta or self.log_alignment: + audio_gta_pred, alignments = self._generate_gta_predictions( + model=model, batch_dict=batch_dict, audio_ids=audio_ids + ) + audio_artifacts += audio_gta_pred + image_artifacts += alignments + + return audio_artifacts, image_artifacts diff --git a/nemo/collections/tts/parts/utils/helpers.py b/nemo/collections/tts/parts/utils/helpers.py index 3109a9658ba3..3af727a848cf 100644 --- a/nemo/collections/tts/parts/utils/helpers.py +++ b/nemo/collections/tts/parts/utils/helpers.py @@ -484,6 +484,23 @@ def plot_spectrogram_to_numpy(spectrogram): return data +def create_plot(data, x_axis, y_axis, output_filepath=None): + fig, ax = plt.subplots(figsize=(12, 3)) + im = ax.imshow(data, aspect="auto", origin="lower", interpolation="none") + plt.colorbar(im, ax=ax) + plt.xlabel(x_axis) + plt.ylabel(y_axis) + plt.tight_layout() + + if output_filepath: + plt.savefig(output_filepath, format="png") + + fig.canvas.draw() + data = save_figure_to_numpy(fig) + plt.close() + return data + + def plot_gate_outputs_to_numpy(gate_targets, gate_outputs): fig, ax = plt.subplots(figsize=(12, 3)) ax.scatter(