Skip to content

Commit 4e81cc8

Browse files
authored
beginner/torchtext_translation 변경사항 반영 등 (#644) (#652)
1 parent b1572d3 commit 4e81cc8

File tree

1 file changed

+6
-6
lines changed

1 file changed

+6
-6
lines changed

beginner_source/torchtext_translation.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -36,14 +36,14 @@
3636
#
3737
# ::
3838
#
39-
# python -m spacy download en
40-
# python -m spacy download de
39+
# python -m spacy download en_core_web_sm
40+
# python -m spacy download de_core_news_sm
4141

4242
import torchtext
4343
import torch
4444
from torchtext.data.utils import get_tokenizer
4545
from collections import Counter
46-
from torchtext.vocab import Vocab
46+
from torchtext.vocab import vocab
4747
from torchtext.utils import download_from_url, extract_archive
4848
import io
4949

@@ -56,15 +56,15 @@
5656
val_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in val_urls]
5757
test_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in test_urls]
5858

59-
de_tokenizer = get_tokenizer('spacy', language='de')
60-
en_tokenizer = get_tokenizer('spacy', language='en')
59+
de_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')
60+
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
6161

6262
def build_vocab(filepath, tokenizer):
6363
counter = Counter()
6464
with io.open(filepath, encoding="utf8") as f:
6565
for string_ in f:
6666
counter.update(tokenizer(string_))
67-
return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
67+
return vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
6868

6969
de_vocab = build_vocab(train_filepaths[0], de_tokenizer)
7070
en_vocab = build_vocab(train_filepaths[1], en_tokenizer)

0 commit comments

Comments
 (0)