36 | 36 | #
37 | 37 | # ::
38 | 38 | #
39 |    | -# python -m spacy download en
40 |    | -# python -m spacy download de
   | 39 | +# python -m spacy download en_core_web_sm
   | 40 | +# python -m spacy download de_core_news_sm
41 | 41 |
42 | 42 | import torchtext
43 | 43 | import torch
44 | 44 | from torchtext.data.utils import get_tokenizer
45 | 45 | from collections import Counter
46 |    | -from torchtext.vocab import Vocab
   | 46 | +from torchtext.vocab import vocab
47 | 47 | from torchtext.utils import download_from_url, extract_archive
48 | 48 | import io
49 | 49 |

56 | 56 | val_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in val_urls]
57 | 57 | test_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in test_urls]
58 | 58 |
59 |    | -de_tokenizer = get_tokenizer('spacy', language='de')
60 |    | -en_tokenizer = get_tokenizer('spacy', language='en')
   | 59 | +de_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')
   | 60 | +en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
61 | 61 |
62 | 62 | def build_vocab(filepath, tokenizer):
63 | 63 |     counter = Counter()
64 | 64 |     with io.open(filepath, encoding="utf8") as f:
65 | 65 |         for string_ in f:
66 | 66 |             counter.update(tokenizer(string_))
67 |    | -    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
   | 67 | +    return vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
68 | 68 |
69 | 69 | de_vocab = build_vocab(train_filepaths[0], de_tokenizer)
70 | 70 | en_vocab = build_vocab(train_filepaths[1], en_tokenizer)
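One behavioral difference in the `Vocab` → `vocab()` migration is worth flagging: the object returned by the `vocab()` factory raises a `RuntimeError` on out-of-vocabulary lookups instead of silently mapping them to `<unk>` the way the old class did. A sketch of the follow-up calls this likely needs (assuming torchtext ≥ 0.12, where `vocab()` accepts the `specials` keyword):

```python
# Without a default index, de_vocab['never-seen-token'] raises RuntimeError.
# Route OOV lookups to the <unk> entry registered via `specials` above.
de_vocab.set_default_index(de_vocab['<unk>'])
en_vocab.set_default_index(en_vocab['<unk>'])

print(en_vocab['qwertyuiop'])  # index of '<unk>': 0, since specials come first
```

Also note that `vocab()` consumes the `Counter` in insertion (first-occurrence) order rather than sorting by frequency as the old constructor did; lookups are unaffected, but the Counter can be converted to a frequency-sorted `OrderedDict` before the `vocab()` call if frequency-ranked indices matter.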