36 | 36 | #
37 | 37 | # ::
38 | 38 | #
39 |    | -# python -m spacy download en
40 |    | -# python -m spacy download de
   | 39 | +# python -m spacy download en_core_web_sm
   | 40 | +# python -m spacy download de_core_news_sm
41 | 41 |
42 | 42 | import torchtext
43 | 43 | import torch
44 | 44 | from torchtext.data.utils import get_tokenizer
45 | 45 | from collections import Counter
46 |    | -from torchtext.vocab import Vocab
   | 46 | +from torchtext.vocab import vocab
47 | 47 | from torchtext.utils import download_from_url, extract_archive
48 | 48 | import io
49 | 49 |

56 | 56 | val_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in val_urls]
57 | 57 | test_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in test_urls]
58 | 58 |
59 |    | -de_tokenizer = get_tokenizer('spacy', language='de')
60 |    | -en_tokenizer = get_tokenizer('spacy', language='en')
   | 59 | +de_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')
   | 60 | +en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
61 | 61 |
62 | 62 | def build_vocab(filepath, tokenizer):
63 | 63 |     counter = Counter()
64 | 64 |     with io.open(filepath, encoding="utf8") as f:
65 | 65 |         for string_ in f:
66 | 66 |             counter.update(tokenizer(string_))
67 |    | -    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
   | 67 | +    return vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
68 | 68 |
69 | 69 | de_vocab = build_vocab(train_filepaths[0], de_tokenizer)
70 | 70 | en_vocab = build_vocab(train_filepaths[1], en_tokenizer)
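One behavioral difference in the `Vocab` → `vocab()` migration is worth flagging: the object returned by the `vocab()` factory raises a `RuntimeError` on out-of-vocabulary lookups instead of silently mapping them to `<unk>` the way the old class did. A sketch of the follow-up calls this likely needs (assuming torchtext ≥ 0.12, where `vocab()` accepts the `specials` keyword):

```python
# Without a default index, de_vocab['never-seen-token'] raises RuntimeError.
# Route OOV lookups to the <unk> entry registered via `specials` above.
de_vocab.set_default_index(de_vocab['<unk>'])
en_vocab.set_default_index(en_vocab['<unk>'])

print(en_vocab['qwertyuiop'])  # index of '<unk>': 0, since specials come first
```

Also note that `vocab()` consumes the `Counter` in insertion (first-occurrence) order rather than sorting by frequency as the old constructor did; lookups are unaffected, but the Counter can be converted to a frequency-sorted `OrderedDict` before the `vocab()` call if frequency-ranked indices matter.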