From 13cb8fe27c320053192b731e93f84edae15cba20 Mon Sep 17 00:00:00 2001
From: chanmuzi <101971295+chanmuzi@users.noreply.github.com>
Date: Sat, 5 Aug 2023 00:53:18 +0900
Subject: [PATCH 1/5] =?UTF-8?q?beginner=5Fsource/nn=5Ftutorial.py=20?=
 =?UTF-8?q?=EC=98=A4=ED=83=80=EC=88=98=EC=A0=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 beginner_source/nn_tutorial.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/beginner_source/nn_tutorial.py b/beginner_source/nn_tutorial.py
index e9bf54214..8243f895f 100644
--- a/beginner_source/nn_tutorial.py
+++ b/beginner_source/nn_tutorial.py
@@ -20,7 +20,7 @@
 # 와 같은 잘 디자인된 모듈과 클래스들을 제공합니다.
 # 이들의 성능을 최대한 활용하고 여러분의 문제에 맞게 커스터마이즈하기 위해서,
 # 정확히 이들이 어떤 작업을 수행하는지 이해할 필요가 있습니다.
-# 이해를 증진하기 위해서, 우리는 먼저 이들 모델들로 부터 아무 피쳐도 사용하지 않고
+# 이해를 증진하기 위해서, 우리는 먼저 이들 모델들로부터 아무 피쳐도 사용하지 않고
 # MNIST 데이터셋에 대해 기초적인 신경망을 학습시킬 것입니다;
 # 우리는 처음에는 가장 기초적인 PyTorch 텐서(tensor) 기능만을 사용할 것입니다.
 # 그리고나서 우리는 점차적으로 ``torch.nn``, ``torch.optim``, ``Dataset``, 또는

From 28ea711df0278b86c201100b8eeab62b6f54cda7 Mon Sep 17 00:00:00 2001
From: chanmuzi <101971295+chanmuzi@users.noreply.github.com>
Date: Wed, 6 Sep 2023 22:59:37 +0900
Subject: [PATCH 2/5] =?UTF-8?q?beginner=5Fsource/flava=5Ffinetuning=5Ftuto?=
 =?UTF-8?q?rial.py=20=EB=B2=88=EC=97=AD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 beginner_source/flava_finetuning_tutorial.py | 104 ++++++++-----------
 1 file changed, 44 insertions(+), 60 deletions(-)

diff --git a/beginner_source/flava_finetuning_tutorial.py b/beginner_source/flava_finetuning_tutorial.py
index 12e20f475..de07d87e0 100644
--- a/beginner_source/flava_finetuning_tutorial.py
+++ b/beginner_source/flava_finetuning_tutorial.py
@@ -1,36 +1,33 @@
 # -*- coding: utf-8 -*-
 """
-TorchMultimodal Tutorial: Finetuning FLAVA
+TorchMultimodal 튜토리얼: FLAVA 미세조정
 ============================================
 """
 
 ######################################################################
-# Multimodal AI has recently become very popular owing to its ubiquitous
-# nature, from use cases like image captioning and visual search to more
-# recent applications like image generation from text. **TorchMultimodal
-# is a library powered by Pytorch consisting of building blocks and end to
-# end examples, aiming to enable and accelerate research in
-# multimodality**.
-# 
-# In this tutorial, we will demonstrate how to use a **pretrained SoTA
-# model called** `FLAVA <https://arxiv.org/pdf/2112.04482.pdf>`__ **from
-# TorchMultimodal library to finetune on a multimodal task i.e. visual
-# question answering** (VQA). The model consists of two unimodal transformer
-# based encoders for text and image and a multimodal encoder to combine
-# the two embeddings. It is pretrained using contrastive, image text matching and 
-# text, image and multimodal masking losses.
+# 멀티 모달 AI는 최근에 이미지 캡셔닝, 시각적 검색부터 텍스트로부터 이미지를 생성같은
+# 최근의 응용까지 그 사용이 빠르게 확산되고 있습니다. **TorchMultimodal은 PyTorch를
+# 기반으로 하는 라이브러리로, 다중 모달 연구를 가능하게 하고 가속화하기 위한 빌딩 블록과
+# end-to-end 예제들을 제공합니다**.
+#
+# 본 튜토리얼에서는 **사전 훈련된 SoTA 모델인** `FLAVA <https://arxiv.org/pdf/2112.04482.pdf>`__ **를 
+# TorchMultimodal 라이브러리에서 사용하여 멀티 모달 작업인 시각적 질의 응답**(VQA)에 미세조정하는 방법을 보여 드리겠습니다.
+# 이 모델은 텍스트와 이미지를 위한 두 개의 단일 모달 트랜스포머 기반 인코더와
+# 두 임베딩을 결합하는 다중 모달 인코더로 구성되어 있습니다.
+# 이 모델은 대조적, 이미지-텍스트 매칭, 그리고 텍스트, 이미지 및 다중 모달 마스킹 손실을 사용하여 사전 훈련되었습니다.
+
 
 
 ######################################################################
-# Installation
+# 설치
 # -----------------
-# We will use TextVQA dataset and ``bert tokenizer`` from Hugging Face for this
-# tutorial. So you need to install datasets and transformers in addition to TorchMultimodal.
+# 이 튜토리얼을 위해서는 TextVQA 데이터셋과 Hugging Face의 ``bert 토크나이저``를 사용할 것입니다.
+# 따라서 TorchMultimodal 외에도 datasets과 transformers를 설치해야 합니다.
 #
 # .. note::
-#
-#    When running this tutorial in Google Colab, install the required packages by
-#    creating a new cell and running the following commands:
+#    
+#    이 튜토리얼을 Google Colab에서 실행할 경우, 새로운 셀을 만들고 다음의 명령어를 실행하여
+#    필요한 패키지를 설치하세요:
 #
 #    .. code-block::
 #
@@ -40,10 +37,10 @@
 #
 
 ######################################################################
-# Steps
+# 단계
 # -----
 #
-# 1. Download the Hugging Face dataset to a directory on your computer by running the following command:
+# 1. 다음 명령어를 실행하여 Hugging Face 데이터셋을 컴퓨터의 디렉토리에 다운로드하세요:
 #
 #    .. code-block::
 #
@@ -51,21 +48,16 @@
 #       tar xf vocab.tar.gz
 #
 #    .. note:: 
-#       If you are running this tutorial in Google Colab, run these commands
-#       in a new cell and prepend these commands with an exclamation mark (!)
+#       이 튜토리얼을 Google Colab에서 실행하는 경우, 새 셀에서 이 명령어를 실행하고 명령어 앞에 느낌표 (!)를 붙이세요.
 #
 #
-# 2. For this tutorial, we treat VQA as a classification task where
-#    the inputs are images and question (text) and the output is an answer class. 
-#    So we need to download the vocab file with answer classes and create the answer to
-#    label mapping.
+# 2. 본 튜토리얼에서는 VQA를 이미지와 질문(텍스트)이 입력되고 출력이 답변 클래스인 분류 작업으로 취급합니다.
+#    따라서 답변 클래스와 레이블 매핑을 생성할 단어장 파일을 다운로드해야 합니다.
 #
-#    We also load the `textvqa
-#    dataset <https://arxiv.org/pdf/1904.08920.pdf>`__ containing 34602 training samples
-#    (images,questions and answers) from Hugging Face
+#    또한 Hugging Face에서 `textvqa 데이터셋 <https://arxiv.org/pdf/1904.08920.pdf>`__을 불러오는데, 
+#    이 데이터셋은 34602개의 훈련 샘플(이미지, 질문, 답변)을 포함하고 있습니다.
 #
-# We see there are 3997 answer classes including a class representing
-# unknown answers.
+# 3997개의 답변 클래스가 있음을 확인할 수 있으며, 이에는 알 수 없는 답변을 나타내는 클래스도 포함되어 있습니다.
 #
 
 with open("data/vocabs/answers_textvqa_more_than_1.txt") as f:
@@ -81,7 +73,7 @@
 dataset = load_dataset("textvqa")
 
 ######################################################################
-# Lets display a sample entry from the dataset:
+# 데이터셋에서 샘플 엔트리를 표시해 봅시다:
 #
 
 import matplotlib.pyplot as plt
@@ -95,12 +87,10 @@
 
 
 ######################################################################
-# 3. Next, we write the transform function to convert the image and text into
-# Tensors consumable by our model - For images, we use the transforms from
-# torchvision to convert to Tensor and resize to uniform sizes - For text,
-# we tokenize (and pad) them using the ``BertTokenizer`` from Hugging Face -
-# For answers (i.e. labels), we take the most frequently occurring answer
-# as the label to train with:
+# 3. 다음으로, 이미지와 텍스트를 모델에서 사용할 수 있는 텐서로 변환하기 위한 변환 함수를 작성합니다.
+# - 이미지의 경우, torchvision의 변환을 사용하여 텐서로 변환하고 일정한 크기로 조정합니다.
+# - 텍스트의 경우, Hugging Face의 ``BertTokenizer``를 사용하여 토큰화(및 패딩)합니다.
+# - 답변(즉, 레이블)의 경우, 가장 빈번하게 나타나는 답변을 훈련 레이블로 사용합니다:
 #
 
 import torch
@@ -133,16 +123,12 @@ def transform(tokenizer, input):
 
 
 ######################################################################
-# 4. Finally, we import the ``flava_model_for_classification`` from
-# ``torchmultimodal``. It loads the pretrained FLAVA checkpoint by default and
-# includes a classification head.
+# 4. 마지막으로, ``torchmultimodal``에서 ``flava_model_for_classification``을 가져옵니다.
+# 이것은 기본적으로 사전 훈련된 FLAVA 체크포인트를 로드하고 분류 헤드를 포함합니다.
 #
-# The model forward function passes the image through the visual encoder
-# and the question through the text encoder. The image and question
-# embeddings are then passed through the multimodal encoder. The final
-# embedding corresponding to the CLS token is passed through a MLP head
-# which finally gives the probability distribution over each possible
-# answers.
+# 모델의 순방향 함수는 이미지를 시각 인코더에 통과시키고 질문을 텍스트 인코더에 통과시킵니다.
+# 이미지와 질문의 임베딩은 그 후 멀티 모달 인코더를 통과합니다.
+# CLS 토큰에 해당하는 최종 임베딩은 MLP 헤드를 통과하며, 이 헤드가 가능한 각 답변에 대한 확률 분포를 출력합니다.
 #
 
 from torchmultimodal.models.flava.model import flava_model_for_classification
@@ -150,8 +136,8 @@ def transform(tokenizer, input):
 
 
 ######################################################################
-# 5. We put together the dataset and model in a toy training loop to
-# demonstrate how to train the model for 3 iterations:
+# 5. 데이터셋과 모델을 함께 모아 3회 반복을 위한 간단한 훈련 루프를 작성하여 
+# 모델 훈련 방법을 보여줍니다:
 #
 
 from torch import nn
@@ -177,14 +163,12 @@ def transform(tokenizer, input):
 
 
 ######################################################################
-# Conclusion
+# 결론
 # -------------------
 #
-# This tutorial introduced the basics around how to finetune on a
-# multimodal task using FLAVA from TorchMultimodal. Please also check out
-# other examples from the library like
-# `MDETR <https://github.com/facebookresearch/multimodal/tree/main/torchmultimodal/models/mdetr>`__
-# which is a multimodal model for object detection and
-# `Omnivore <https://github.com/facebookresearch/multimodal/blob/main/torchmultimodal/models/omnivore.py>`__
-# which is multitask model spanning image, video and 3d classification.
+# 이 튜토리얼에서는 TorchMultimodal의 FLAVA를 사용하여 멀티 모달 작업에 미세 조정하는
+# 기본적인 방식을 소개했습니다. 객체 탐지를 위한 멀티 모달 모델인 `MDETR <https://github.com/facebookresearch/multimodal/tree/main/torchmultimodal/models/mdetr>`__과
+# 이미지, 비디오, 3D 분류를 포괄하는 다작업 모델 `Omnivore <https://github.com/facebookresearch/multimodal/blob/main/torchmultimodal/models/omnivore.py>`__
+# 같은 라이브러리의 다른 예제들도 확인해 보세요.
+# 
 #

From 3fcd366de15f41b55acfe4f1f86f376da43bff60 Mon Sep 17 00:00:00 2001
From: chanmuzi <chanmuzi@naver.com>
Date: Thu, 7 Sep 2023 23:20:15 +0900
Subject: [PATCH 3/5] =?UTF-8?q?beginner=5Fsource/flava=5Ffinetuning=5Ftuto?=
 =?UTF-8?q?rial.py=20=EB=B2=88=EC=97=AD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 beginner_source/flava_finetuning_tutorial.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/beginner_source/flava_finetuning_tutorial.py b/beginner_source/flava_finetuning_tutorial.py
index de07d87e0..70e0f5d50 100644
--- a/beginner_source/flava_finetuning_tutorial.py
+++ b/beginner_source/flava_finetuning_tutorial.py
@@ -3,7 +3,7 @@
 TorchMultimodal 튜토리얼: FLAVA 미세조정
 ============================================
 """
-
+# **번역:** `김찬 <https://github.com/chanmzui>`_
 ######################################################################
 # 멀티 모달 AI는 최근에 이미지 캡셔닝, 시각적 검색부터 텍스트로부터 이미지를 생성같은
 # 최근의 응용까지 그 사용이 빠르게 확산되고 있습니다. **TorchMultimodal은 PyTorch를

From c7a015b1e4fed713b53e4c7cba5b2b83b6ecc3de Mon Sep 17 00:00:00 2001
From: chanmuzi <chanmuzi@naver.com>
Date: Sun, 10 Sep 2023 21:46:47 +0900
Subject: [PATCH 4/5] =?UTF-8?q?beginner=5Fsource/flava=5Ffinetuning=5Ftuto?=
 =?UTF-8?q?rial.py=20=EB=AC=B8=EB=B2=95=20=EB=B0=8F=20=EB=A0=8C=EB=8D=94?=
 =?UTF-8?q?=EB=A7=81=20=EC=98=A4=EB=A5=98=20=EC=88=98=EC=A0=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 beginner_source/flava_finetuning_tutorial.py | 26 +++++++++++---------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/beginner_source/flava_finetuning_tutorial.py b/beginner_source/flava_finetuning_tutorial.py
index 70e0f5d50..0e94cad88 100644
--- a/beginner_source/flava_finetuning_tutorial.py
+++ b/beginner_source/flava_finetuning_tutorial.py
@@ -2,16 +2,20 @@
 """
 TorchMultimodal 튜토리얼: FLAVA 미세조정
 ============================================
+
+**번역:** `김찬 <https://github.com/chanmuzi>`__
+
 """
-# **번역:** `김찬 <https://github.com/chanmzui>`_
+
+
 ######################################################################
 # 멀티 모달 AI는 최근에 이미지 캡셔닝, 시각적 검색부터 텍스트로부터 이미지를 생성같은
 # 최근의 응용까지 그 사용이 빠르게 확산되고 있습니다. **TorchMultimodal은 PyTorch를
-# 기반으로 하는 라이브러리로, 다중 모달 연구를 가능하게 하고 가속화하기 위한 빌딩 블록과
+# 기반으로 하는 라이브러리로, 멀티 모달 연구를 가능하게 하고 가속화하기 위한 빌딩 블록과
 # end-to-end 예제들을 제공합니다**.
 #
-# 본 튜토리얼에서는 **사전 훈련된 SoTA 모델인** `FLAVA <https://arxiv.org/pdf/2112.04482.pdf>`__ **를 
-# TorchMultimodal 라이브러리에서 사용하여 멀티 모달 작업인 시각적 질의 응답**(VQA)에 미세조정하는 방법을 보여 드리겠습니다.
+# 본 튜토리얼에서는 **사전 훈련된 SoTA 모델인** `FLAVA <https://arxiv.org/pdf/2112.04482.pdf>`__ **를**
+# **TorchMultimodal 라이브러리에서 사용하여 멀티 모달 작업인 시각적 질의 응답(VQA)에 미세조정하는 방법을 보여 드리겠습니다.**
 # 이 모델은 텍스트와 이미지를 위한 두 개의 단일 모달 트랜스포머 기반 인코더와
 # 두 임베딩을 결합하는 다중 모달 인코더로 구성되어 있습니다.
 # 이 모델은 대조적, 이미지-텍스트 매칭, 그리고 텍스트, 이미지 및 다중 모달 마스킹 손실을 사용하여 사전 훈련되었습니다.
@@ -21,7 +25,7 @@
 ######################################################################
 # 설치
 # -----------------
-# 이 튜토리얼을 위해서는 TextVQA 데이터셋과 Hugging Face의 ``bert 토크나이저``를 사용할 것입니다.
+# 이 튜토리얼을 위해서는 TextVQA 데이터셋과 Hugging Face의 ``bert 토크나이저`` 를 사용할 것입니다.
 # 따라서 TorchMultimodal 외에도 datasets과 transformers를 설치해야 합니다.
 #
 # .. note::
@@ -54,7 +58,7 @@
 # 2. 본 튜토리얼에서는 VQA를 이미지와 질문(텍스트)이 입력되고 출력이 답변 클래스인 분류 작업으로 취급합니다.
 #    따라서 답변 클래스와 레이블 매핑을 생성할 단어장 파일을 다운로드해야 합니다.
 #
-#    또한 Hugging Face에서 `textvqa 데이터셋 <https://arxiv.org/pdf/1904.08920.pdf>`__을 불러오는데, 
+#    또한 Hugging Face에서 `textvqa 데이터셋 <https://arxiv.org/pdf/1904.08920.pdf>`__ 을 불러오는데, 
 #    이 데이터셋은 34602개의 훈련 샘플(이미지, 질문, 답변)을 포함하고 있습니다.
 #
 # 3997개의 답변 클래스가 있음을 확인할 수 있으며, 이에는 알 수 없는 답변을 나타내는 클래스도 포함되어 있습니다.
@@ -87,9 +91,9 @@
 
 
 ######################################################################
-# 3. 다음으로, 이미지와 텍스트를 모델에서 사용할 수 있는 텐서로 변환하기 위한 변환 함수를 작성합니다.
-# - 이미지의 경우, torchvision의 변환을 사용하여 텐서로 변환하고 일정한 크기로 조정합니다.
-# - 텍스트의 경우, Hugging Face의 ``BertTokenizer``를 사용하여 토큰화(및 패딩)합니다.
+# 3. 다음으로, 이미지와 텍스트를 모델에서 사용할 수 있는 텐서로 변환하기 위한 변환 함수를 작성합니다. 
+# - 이미지의 경우, torchvision의 변환을 사용하여 텐서로 변환하고 일정한 크기로 조정합니다. 
+# - 텍스트의 경우, Hugging Face의 ``BertTokenizer`` 를 사용하여 토큰화(및 패딩)합니다. 
 # - 답변(즉, 레이블)의 경우, 가장 빈번하게 나타나는 답변을 훈련 레이블로 사용합니다:
 #
 
@@ -123,7 +127,7 @@ def transform(tokenizer, input):
 
 
 ######################################################################
-# 4. 마지막으로, ``torchmultimodal``에서 ``flava_model_for_classification``을 가져옵니다.
+# 4. 마지막으로, ``torchmultimodal`` 에서 ``flava_model_for_classification`` 을 가져옵니다.
 # 이것은 기본적으로 사전 훈련된 FLAVA 체크포인트를 로드하고 분류 헤드를 포함합니다.
 #
 # 모델의 순방향 함수는 이미지를 시각 인코더에 통과시키고 질문을 텍스트 인코더에 통과시킵니다.
@@ -167,7 +171,7 @@ def transform(tokenizer, input):
 # -------------------
 #
 # 이 튜토리얼에서는 TorchMultimodal의 FLAVA를 사용하여 멀티 모달 작업에 미세 조정하는
-# 기본적인 방식을 소개했습니다. 객체 탐지를 위한 멀티 모달 모델인 `MDETR <https://github.com/facebookresearch/multimodal/tree/main/torchmultimodal/models/mdetr>`__과
+# 기본적인 방식을 소개했습니다. 객체 탐지를 위한 멀티 모달 모델인 `MDETR <https://github.com/facebookresearch/multimodal/tree/main/torchmultimodal/models/mdetr>`__ 과
 # 이미지, 비디오, 3D 분류를 포괄하는 다작업 모델 `Omnivore <https://github.com/facebookresearch/multimodal/blob/main/torchmultimodal/models/omnivore.py>`__
 # 같은 라이브러리의 다른 예제들도 확인해 보세요.
 # 

From 11f7d9460213909f84044870ca493dc48413a291 Mon Sep 17 00:00:00 2001
From: chanmuzi <chanmuzi@naver.com>
Date: Mon, 18 Sep 2023 23:06:22 +0900
Subject: [PATCH 5/5] =?UTF-8?q?beginner=5Fsource/flava=5Ffinetuning=5Ftuto?=
 =?UTF-8?q?rial.py=20=EC=96=B4=ED=9C=98=20=EC=88=98=EC=A0=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 beginner_source/flava_finetuning_tutorial.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/beginner_source/flava_finetuning_tutorial.py b/beginner_source/flava_finetuning_tutorial.py
index 0e94cad88..15c5f2612 100644
--- a/beginner_source/flava_finetuning_tutorial.py
+++ b/beginner_source/flava_finetuning_tutorial.py
@@ -9,7 +9,7 @@
 
 
 ######################################################################
-# 멀티 모달 AI는 최근에 이미지 캡셔닝, 시각적 검색부터 텍스트로부터 이미지를 생성같은
+# 멀티 모달 AI는 최근에 이미지 자막추가, 시각적 검색부터 텍스트로부터 이미지를 생성하는 것과 같은
 # 최근의 응용까지 그 사용이 빠르게 확산되고 있습니다. **TorchMultimodal은 PyTorch를
 # 기반으로 하는 라이브러리로, 멀티 모달 연구를 가능하게 하고 가속화하기 위한 빌딩 블록과
 # end-to-end 예제들을 제공합니다**.