Skip to content
This repository has been archived by the owner on Nov 22, 2022. It is now read-only.

Distributed training does not work with a user-defined task #1217

Open
jiangxiluning opened this issue Dec 29, 2019 · 1 comment
Open

Distributed training does not work with a user-defined task #1217

jiangxiluning opened this issue Dec 29, 2019 · 1 comment

Comments

@jiangxiluning
Copy link

jiangxiluning commented Dec 29, 2019

When I set distributed_world_size to 1, training runs properly. However, it fails when distributed_world_size is set to 2.

Below is my config.

{
  "distributed_world_size": 2,
  "include_dirs": [
    "transformer-crf"
  ],
  "task": {
    "MyTaggingTask": {
      "data": {
        "Data": {
          "source": {
            "UDPOSDataSource": {
              "column_mapping": {"text": "doc", "slots": "tags"},
              "udpos_root": "/home/luning/dev/projects/transformer-crf/.data/udpos/en-ud-v2",
              "fields": ["text", "slots"]
            }
          },
          "batcher": {
            "PoolingBatcher": {
              "train_batch_size": 512,
              "eval_batch_size": 512,
              "test_batch_size": 512,
              "pool_num_batches": 100,
              "num_shuffled_pools": 4
            }
          }
        }
      },
      "model": {
        "decoder": {
          "activation": "relu",
          "dropout": 0.0,
          "freeze": false,
          "hidden_dims": [],
          "layer_norm": false,
          "load_path": null,
          "out_dim": null,
          "save_path": null,
          "shared_module_key": null
        },
        "embedding": {
          "cpu_only": false,
          "delimiter": " ",
          "embed_dim": 512,
          "embedding_init_range": null,
          "embedding_init_strategy": "random",
          "export_input_names": [
            "tokens_vals"
          ],
          "freeze": false,
          "load_path": null,
          "lowercase_tokens": true,
          "min_freq": 1,
          "mlp_layer_dims": [],
          "padding_idx": null,
          "pretrained_embeddings_path": "",
          "save_path": null,
          "shared_module_key": null,
          "skip_header": true,
          "vocab_file": "",
          "vocab_from_all_data": false,
          "vocab_from_pretrained_embeddings": false,
          "vocab_from_train_data": true,
          "vocab_size": 0
        },
        "inputs": {
          "slots": {
            "add_bos_token": false,
            "add_eos_token": false,
            "column": "tags",
            "max_seq_len": null,
            "use_eos_token_for_bos": false,
            "tokenizer": {
              "DoNothingTokenizer": {
                "do_nothing": ""
              }
            },
            "vocab": {
              "build_from_data": true,
              "size_from_data": 0,
              "vocab_files": [

              ]
            },
            "vocab_file_delimiter": " "
          },
          "tokens": {
            "add_bos_token": false,
            "add_eos_token": false,
            "column": "doc",
            "max_seq_len": null,
            "use_eos_token_for_bos": false,
            "tokenizer": {
              "DoNothingTokenizer": {
                "do_nothing": ""
              }
            },
            "vocab": {
              "build_from_data": true,
              "size_from_data": 0,
              "vocab_files": [
              ]
            },
            "vocab_file_delimiter": " "
          }
        },
        "output_layer": {
          "CRFOutputLayer": {
            "freeze": false,
            "load_path": null,
            "save_path": null,
            "shared_module_key": null
          }
        },
        "representation": {
          "bidirectional": true,
          "dropout": 0.9,
          "freeze": false,
          "load_path": null,
          "lstm_dim": 512,
          "num_layers": 1,
          "pack_sequence": true,
          "save_path": null,
          "shared_module_key": null
        }
      },
      "trainer": {
        "TaskTrainer": {
          "do_eval": true,
          "early_stop_after": 0,
          "epochs": 10000,
          "fp16_args": {
            "FP16OptimizerFairseq": {
              "init_loss_scale": 128,
              "min_loss_scale": 0.0001,
              "scale_tolerance": 0.0,
              "scale_window": null,
              "threshold_loss_scale": null
            }
          },
          "max_clip_norm": null,
          "num_accumulated_batches": 1,
          "num_batches_per_epoch": null,
          "num_samples_to_log_progress": 1000,
          "optimizer": {
            "Adam": {
              "eps": 1e-08,
              "lr": 0.001,
              "weight_decay": 1e-05
            }
          },
          "report_train_metrics": true,
          "scheduler": null,
          "sparsifier": null,
          "target_time_limit_seconds": null
        }
      }
    }
  },
  "use_cuda_if_available": true,
  "use_tensorboard": true,
  "version": 18
}

Errors

Including: transformer-crf
... importing module: transformer-crf.task
... task: MyTaggingTask
... importing module: transformer-crf.metric
... importing: <class 'transformer-crf.metric.MyTaggingMetricReporter'>
... importing module: transformer-crf.source
... importing: <class 'transformer-crf.source.TaggingDataSource'>
... importing: <class 'transformer-crf.source.UDPOSDataSource'>
... importing module: transformer-crf.model
... importing: <class 'transformer-crf.model.MyTagger'>

===Starting training...

=== Starting training, World size is 2
/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:541: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:542: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:543: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:544: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:545: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:550: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  np_resource = np.dtype([("resource", np.ubyte, 1)])
/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:541: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:542: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:543: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:544: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:545: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:550: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  np_resource = np.dtype([("resource", np.ubyte, 1)])
WARNING - Can not find class type in json: trying with first class taskbase in the union.
WARNING - Can not find class type in json: trying with first class taskbase in the union.
Traceback (most recent call last):
  File "/home/luning/.pyenv/versions/pytext-nlp/bin/pytext", line 8, in <module>
    sys.exit(main())
  File "/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/click/core.py", line 764, in __call__
    return self.main(*args, **kwargs)
  File "/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/click/core.py", line 717, in main
    rv = self.invoke(ctx)
  File "/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/click/core.py", line 1137, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/click/core.py", line 956, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/click/core.py", line 555, in invoke
    return callback(*args, **kwargs)
  File "/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/click/decorators.py", line 17, in new_func
    return f(get_current_context(), *args, **kwargs)
  File "/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/pytext/main.py", line 365, in train
    train_model_distributed(config, metric_channels)
  File "/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/pytext/main.py", line 90, in train_model_distributed
    config.distributed_world_size,
  File "/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 171, in spawn
    while not spawn_context.join():
  File "/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 118, in join
    raise Exception(msg)
Exception: 

-- Process 0 terminated with the following error:
Traceback (most recent call last):
  File "/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/pytext/config/serialize.py", line 81, in _union_from_json
    return _value_from_json(subclasses_dict[type_name], json_obj)
  File "/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/pytext/config/serialize.py", line 111, in _value_from_json
    return config_from_json(cls, value)
  File "/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/pytext/config/serialize.py", line 204, in config_from_json
    f"missing value for {field} in class {cls_name} with json {json_obj}"
pytext.config.serialize.MissingValueError: missing value for data_handler in class TaskBase.Config with json {'MyTaggingTask': {'data': {'Data': {'source': {'UDPOSDataSource': {'column_mapping': {'text': 'doc', 'slots': 'tags'}, 'fields': ['text', 'slots'], 'udpos_root': '/home/luning/dev/projects/transformer-crf/.data/udpos/en-ud-v2'}}, 'batcher': {'PoolingBatcher': {'train_batch_size': 512, 'eval_batch_size': 512, 'test_batch_size': 512, 'pool_num_batches': 100, 'num_shuffled_pools': 4}}, 'sort_key': None, 'in_memory': True}}, 'trainer': {'TaskTrainer': {'epochs': 10000, 'early_stop_after': 0, 'max_clip_norm': None, 'report_train_metrics': True, 'target_time_limit_seconds': None, 'do_eval': True, 'num_samples_to_log_progress': 1000, 'num_accumulated_batches': 1, 'num_batches_per_epoch': None, 'optimizer': {'Adam': {'lr': 0.001, 'weight_decay': 1e-05, 'eps': 1e-08}}, 'scheduler': None, 'sparsifier': None, 'fp16_args': {'FP16OptimizerFairseq': {'init_loss_scale': 128, 'scale_window': None, 'scale_tolerance': 0.0, 'threshold_loss_scale': None, 'min_loss_scale': 0.0001}}}}, 'model': {'inputs': {'tokens': {'column': 'doc', 'tokenizer': {'DoNothingTokenizer': {'do_nothing': ''}}, 'add_bos_token': False, 'add_eos_token': False, 'use_eos_token_for_bos': False, 'max_seq_len': None, 'vocab': {'build_from_data': True, 'size_from_data': 0, 'vocab_files': []}, 'vocab_file_delimiter': ' '}, 'slots': {'column': 'tags', 'tokenizer': {'DoNothingTokenizer': {'do_nothing': ''}}, 'add_bos_token': False, 'add_eos_token': False, 'use_eos_token_for_bos': False, 'max_seq_len': None, 'vocab': {'build_from_data': True, 'size_from_data': 0, 'vocab_files': []}, 'vocab_file_delimiter': ' '}}, 'embedding': {'load_path': None, 'save_path': None, 'freeze': False, 'shared_module_key': None, 'embed_dim': 512, 'embedding_init_strategy': 'random', 'embedding_init_range': None, 'export_input_names': ['tokens_vals'], 'pretrained_embeddings_path': '', 'vocab_file': '', 'vocab_size': 0, 
'vocab_from_train_data': True, 'vocab_from_all_data': False, 'vocab_from_pretrained_embeddings': False, 'lowercase_tokens': True, 'min_freq': 1, 'mlp_layer_dims': [], 'padding_idx': None, 'cpu_only': False, 'skip_header': True, 'delimiter': ' '}, 'representation': {'load_path': None, 'save_path': None, 'freeze': False, 'shared_module_key': None, 'dropout': 0.9, 'lstm_dim': 512, 'num_layers': 1, 'bidirectional': True, 'pack_sequence': True}, 'decoder': {'load_path': None, 'save_path': None, 'freeze': False, 'shared_module_key': None, 'hidden_dims': [], 'out_dim': None, 'layer_norm': False, 'dropout': 0.0, 'activation': 'relu'}, 'output_layer': {'CRFOutputLayer': {'load_path': None, 'save_path': None, 'freeze': False, 'shared_module_key': None}}}, 'metric_reporter': {'output_path': '/tmp/test_out.txt', 'pep_format': False}}}

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
    fn(i, *args)
  File "/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/pytext/main.py", line 102, in run_single
    config = pytext_config_from_json(config_json)
  File "/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/pytext/config/serialize.py", line 153, in pytext_config_from_json
    return config_from_json(PyTextConfig, json_obj, ignore_fields)
  File "/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/pytext/config/serialize.py", line 193, in config_from_json
    value = _value_from_json(f_cls, json_obj[field])
  File "/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/pytext/config/serialize.py", line 113, in _value_from_json
    return _union_from_json(cls.__args__, value)
  File "/home/luning/.pyenv/versions/3.7.5/envs/pytext-nlp/lib/python3.7/site-packages/pytext/config/serialize.py", line 85, in _union_from_json
    ) from e
pytext.config.serialize.UnionTypeError: failed to parse union (<class 'pytext.task.task.TaskBase.Config'>, <class 'pytext.config.component.ComponentMeta.__new__.<locals>.Config'>, <class 'pytext.task.new_task._NewTask.Config'>, <class 'pytext.task.new_task.NewTask.Config'>, <class 'pytext.task.disjoint_multitask.DisjointMultitask.Config'>, <class 'pytext.task.disjoint_multitask.NewDisjointMultitask.Config'>, <class 'pytext.task.tasks.QueryDocumentPairwiseRankingTask.Config'>, <class 'pytext.task.tasks.EnsembleTask.Config'>, <class 'pytext.task.tasks.DocumentClassificationTask.Config'>, <class 'pytext.task.tasks.DocumentRegressionTask.Config'>, <class 'pytext.task.tasks.NewBertClassificationTask.Config'>, <class 'pytext.task.tasks.NewBertPairClassificationTask.Config'>, <class 'pytext.task.tasks.BertPairRegressionTask.Config'>, <class 'pytext.task.tasks.WordTaggingTask.Config'>, <class 'pytext.task.tasks.IntentSlotTask.Config'>, <class 'pytext.task.tasks.LMTask.Config'>, <class 'pytext.task.tasks.MaskedLMTask.Config'>, <class 'pytext.task.tasks.PairwiseClassificationTask.Config'>, <class 'pytext.task.tasks.SeqNNTask.Config'>, <class 'pytext.task.tasks.SquadQATask.Config'>, <class 'pytext.task.tasks.SemanticParsingTask.Config'>) from json payload {'MyTaggingTask': {'data': {'Data': {'source': {'UDPOSDataSource': {'column_mapping': {'text': 'doc', 'slots': 'tags'}, 'fields': ['text', 'slots'], 'udpos_root': '/home/luning/dev/projects/transformer-crf/.data/udpos/en-ud-v2'}}, 'batcher': {'PoolingBatcher': {'train_batch_size': 512, 'eval_batch_size': 512, 'test_batch_size': 512, 'pool_num_batches': 100, 'num_shuffled_pools': 4}}, 'sort_key': None, 'in_memory': True}}, 'trainer': {'TaskTrainer': {'epochs': 10000, 'early_stop_after': 0, 'max_clip_norm': None, 'report_train_metrics': True, 'target_time_limit_seconds': None, 'do_eval': True, 'num_samples_to_log_progress': 1000, 'num_accumulated_batches': 1, 'num_batches_per_epoch': None, 'optimizer': {'Adam': {'lr': 0.001, 
'weight_decay': 1e-05, 'eps': 1e-08}}, 'scheduler': None, 'sparsifier': None, 'fp16_args': {'FP16OptimizerFairseq': {'init_loss_scale': 128, 'scale_window': None, 'scale_tolerance': 0.0, 'threshold_loss_scale': None, 'min_loss_scale': 0.0001}}}}, 'model': {'inputs': {'tokens': {'column': 'doc', 'tokenizer': {'DoNothingTokenizer': {'do_nothing': ''}}, 'add_bos_token': False, 'add_eos_token': False, 'use_eos_token_for_bos': False, 'max_seq_len': None, 'vocab': {'build_from_data': True, 'size_from_data': 0, 'vocab_files': []}, 'vocab_file_delimiter': ' '}, 'slots': {'column': 'tags', 'tokenizer': {'DoNothingTokenizer': {'do_nothing': ''}}, 'add_bos_token': False, 'add_eos_token': False, 'use_eos_token_for_bos': False, 'max_seq_len': None, 'vocab': {'build_from_data': True, 'size_from_data': 0, 'vocab_files': []}, 'vocab_file_delimiter': ' '}}, 'embedding': {'load_path': None, 'save_path': None, 'freeze': False, 'shared_module_key': None, 'embed_dim': 512, 'embedding_init_strategy': 'random', 'embedding_init_range': None, 'export_input_names': ['tokens_vals'], 'pretrained_embeddings_path': '', 'vocab_file': '', 'vocab_size': 0, 'vocab_from_train_data': True, 'vocab_from_all_data': False, 'vocab_from_pretrained_embeddings': False, 'lowercase_tokens': True, 'min_freq': 1, 'mlp_layer_dims': [], 'padding_idx': None, 'cpu_only': False, 'skip_header': True, 'delimiter': ' '}, 'representation': {'load_path': None, 'save_path': None, 'freeze': False, 'shared_module_key': None, 'dropout': 0.9, 'lstm_dim': 512, 'num_layers': 1, 'bidirectional': True, 'pack_sequence': True}, 'decoder': {'load_path': None, 'save_path': None, 'freeze': False, 'shared_module_key': None, 'hidden_dims': [], 'out_dim': None, 'layer_norm': False, 'dropout': 0.0, 'activation': 'relu'}, 'output_layer': {'CRFOutputLayer': {'load_path': None, 'save_path': None, 'freeze': False, 'shared_module_key': None}}}, 'metric_reporter': {'output_path': '/tmp/test_out.txt', 'pep_format': False}}}

How to run

https://github.com/jiangxiluning/pytext_demo

This is my code repo. You can reproduce the issue with the commands below:

git clone https://github.com/jiangxiluning/pytext_demo
PYTHONPATH=. pytext --config-file pytext_demo/pos.json train
@jiangxiluning
Copy link
Author

@hudeven @chenyangyu1988 da lao (experts), could you please help me with this?

Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant