commands.examples.txt
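
# Train a Levenshtein Transformer (task translation_lev, criterion nat_loss) on the joint-BPE data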
FAIRSEQ_PATH=~/fairseq
CHECKPOINTS=checkpoints/levt
DATASET=data-bin/joint-bpe-37k
python3 train.py \
--save-dir $CHECKPOINTS \
--ddp-backend=no_c10d \
--task translation_lev \
--criterion nat_loss \
--arch levenshtein_transformer \
--noise random_delete \
--share-all-embeddings \
--optimizer adam --adam-betas '(0.9,0.98)' \
--lr 0.0005 --lr-scheduler inverse_sqrt \
--min-lr '1e-09' --warmup-updates 10000 \
--warmup-init-lr '1e-07' --label-smoothing 0.1 \
--dropout 0.3 --weight-decay 0.01 \
--decoder-learned-pos \
--encoder-learned-pos \
--apply-bert-init \
--log-format 'simple' --log-interval 100 \
--fixed-validation-seed 7 \
--max-tokens 500 \
--save-interval-updates 5000 \
--max-update 300000 \
$DATASET
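
# A possible decoding command for the LevT checkpoint above, following the
# iterative-refinement flags from fairseq's nonautoregressive examples
# (checkpoint name and --batch-size are assumptions):
python3 generate.py $DATASET \
    --gen-subset test \
    --task translation_lev \
    --path $CHECKPOINTS/checkpoint_best.pt \
    --iter-decode-max-iter 9 \
    --iter-decode-eos-penalty 0 \
    --beam 1 --remove-bpe \
    --print-step \
    --batch-size 400

# Train a baseline autoregressive Transformer on the same data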
python3 train.py \
$DATASET \
--arch transformer --share-decoder-input-output-embed \
--optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
--lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
--dropout 0.3 --weight-decay 0.0001 \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--max-tokens 8182 --fp16
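
# train.py prints the full argument namespace at startup; the dump below is one
# such printout (defaults appear for every flag not set on the command line, so
# it does not exactly mirror the command above):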
Namespace(activation_dropout=0.0, activation_fn='relu', adam_betas='(0.9, 0.98)', adam_eps=1e-08, adaptive_input=False,
adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, arch='transformer', attention_dropout=0.0, best_checkpoint_metric='loss',
bpe=None, bucket_cap_mb=25, clip_norm=0.0, cpu=False, criterion='label_smoothed_cross_entropy', cross_self_attention=False,
curriculum=0, data='data-bin/joint-bpe-37k', dataset_impl=None, ddp_backend='c10d', decoder_attention_heads=8,
decoder_embed_dim=512, decoder_embed_path=None, decoder_ffn_embed_dim=2048, decoder_input_dim=512, decoder_layerdrop=0,
decoder_layers=6, decoder_layers_to_keep=None, decoder_learned_pos=False, decoder_normalize_before=False, decoder_output_dim=512,
device_id=0, disable_validation=False, distributed_backend='nccl', distributed_init_method='tcp://localhost:16620',
distributed_no_spawn=False, distributed_port=-1, distributed_rank=0, distributed_world_size=4, dropout=0.3, empty_cache_freq=0,
encoder_attention_heads=8, encoder_embed_dim=512, encoder_embed_path=None, encoder_ffn_embed_dim=2048, encoder_layerdrop=0, encoder_layers=6,
encoder_layers_to_keep=None, encoder_learned_pos=False, encoder_normalize_before=False, fast_stat_sync=False, find_unused_parameters=False,
fix_batches_to_gpus=False, fixed_validation_seed=None, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None,
keep_interval_updates=-1, keep_last_epochs=-1, label_smoothing=0.1, layer_wise_attention=False, lazy_load=False, left_pad_source='True',
left_pad_target='False', load_alignments=False, log_format=None, log_interval=1000, lr=[0.0005], lr_scheduler='inverse_sqrt', max_epoch=0,
max_sentences=None, max_sentences_valid=None, max_source_positions=1024, max_target_positions=1024, max_tokens=4096, max_tokens_valid=4096,
max_update=0, maximize_best_checkpoint_metric=False, memory_efficient_fp16=False, min_loss_scale=0.0001, min_lr=-1, no_cross_attention=False,
no_epoch_checkpoints=False, no_last_checkpoints=False, no_progress_bar=False, no_save=False, no_save_optimizer_state=False,
no_token_positional_embeddings=False, num_workers=1, optimizer='adam', optimizer_overrides='{}', raw_text=False, required_batch_size_multiple=8,
reset_dataloader=False, reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, restore_file='checkpoint_last.pt',
save_dir='checkpoints', save_interval=1, save_interval_updates=0, seed=1, sentence_avg=False, share_all_embeddings=False,
share_decoder_input_output_embed=True, skip_invalid_size_inputs_valid_test=False, source_lang=None, target_lang=None, task='translation',
tbmf_wrapper=False, tensorboard_logdir='', threshold_loss_scale=None, tokenizer=None, train_subset='train', update_freq=[1], upsample_primary=1,
use_bmuf=False, user_dir=None, valid_subset='valid', validate_interval=1, warmup_init_lr=-1, warmup_updates=4000, weight_decay=0.0001)
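
# Once training finishes, the last epoch checkpoints can be averaged before
# decoding, a common fairseq recipe (checkpoint directory and count are
# assumptions):
python3 scripts/average_checkpoints.py \
    --inputs checkpoints \
    --num-epoch-checkpoints 5 \
    --output checkpoints/averaged.pt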

Diese Frage hat Gutachs Bürgermeister gestern klar beantwortet .
# (German example source sentence: "Gutach's mayor clearly answered this question yesterday.")
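
# A sentence like the one above can be decoded interactively. Minimal sketch;
# in practice the input must first be BPE-encoded with the same codes as the
# training data, and the checkpoint path is an assumption:
echo "Diese Frage hat Gutachs Bürgermeister gestern klar beantwortet ." \
    | python3 interactive.py data-bin/joint-bpe-37k \
        --path checkpoints/checkpoint_best.pt \
        --beam 5 --remove-bpe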

# Learn a joint BPE model (8k merges) on the GEC source/target text and apply it to both sides
cat gec.src gec.trg | python subword-nmt/learn_bpe.py -s 8000 -o bpecodes
python subword-nmt/apply_bpe.py -c bpecodes < gec.src > gec.src.bpe
python subword-nmt/apply_bpe.py -c bpecodes < gec.trg > gec.trg.bpe
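
# Before training, the BPE-segmented parallel text has to be binarized. Sketch,
# assuming the files are laid out in the pref.lang form preprocess.py expects
# (gec.bpe.src / gec.bpe.trg, plus a held-out validation split):
python3 preprocess.py \
    --source-lang src --target-lang trg \
    --trainpref gec.bpe \
    --validpref gec.valid.bpe \
    --destdir data-bin/gec \
    --joined-dictionary \
    --workers 8

# Train the error-generation model (big transformer) on the binarized GEC data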
python3 train.py \
data-bin/gec \
--arch transformer_vaswani_wmt_en_de_big --share-all-embeddings \
--save-dir checkpoints/err_generator \
--optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
--lr 0.0005 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \
--dropout 0.3 --weight-decay 0.0 \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--max-update 100000 \
--max-tokens 8000 \
--fp16 \
--keep-last-epochs 2
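
# To synthesize noisy text with the trained error generator, decode with
# sampling, as in fairseq's back-translation examples (subset, top-k, and
# checkpoint name are assumptions):
python3 generate.py data-bin/gec \
    --gen-subset train \
    --path checkpoints/err_generator/checkpoint_best.pt \
    --sampling --sampling-topk 10 \
    --beam 1 --nbest 1 \
    --max-tokens 8000 \
    --remove-bpe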