From 1875eb6c6adf938a4bcf55af38e429d7096b0211 Mon Sep 17 00:00:00 2001 From: Luke Cheng <2258420+chenglu@users.noreply.github.com> Date: Sun, 27 Nov 2022 14:40:54 +0000 Subject: [PATCH] docs: fix: Accurate for the origin (English) subtitles --- .../00_welcome-to-the-hugging-face-course.srt | 539 +++++-- subtitles/en/01_the-pipeline-function.srt | 668 ++++++--- ...2_the-carbon-footprint-of-transformers.srt | 582 +++++++- subtitles/en/03_what-is-transfer-learning.srt | 599 +++++--- .../en/04_the-transformer-architecture.srt | 418 ++++-- .../en/05_transformer-models-encoders.srt | 678 ++++++--- .../en/06_transformer-models-decoders.srt | 658 +++++---- ...07_transformer-models-encoder-decoders.srt | 944 ++++++++---- ...inside-the-pipeline-function-(pytorch).srt | 715 ++++++--- ...ide-the-pipeline-function-(tensorflow).srt | 711 ++++++--- ...antiate-a-transformers-model-(pytorch).srt | 465 ++++-- ...iate-a-transformers-model-(tensorflow).srt | 512 ++++--- subtitles/en/12_tokenizers-overview.srt | 137 +- subtitles/en/13_word-based-tokenizers.srt | 392 +++-- .../en/14_character-based-tokenizers.srt | 412 ++++-- subtitles/en/15_subword-based-tokenizers.srt | 482 ++++-- subtitles/en/16_the-tokenization-pipeline.srt | 507 ++++--- .../17_batching-inputs-together-(pytorch).srt | 435 ++++-- ..._batching-inputs-together-(tensorflow).srt | 419 ++++-- ...gging-face-datasets-overview-(pytorch).srt | 505 ++++--- ...ng-face-datasets-overview-(tensorflow).srt | 484 +++--- ...preprocessing-sentence-pairs-(pytorch).srt | 443 ++++-- ...processing-sentence-pairs-(tensorflow).srt | 462 ++++-- subtitles/en/23_what-is-dynamic-padding.srt | 488 +++--- subtitles/en/24_the-trainer-api.srt | 556 ++++--- subtitles/en/25_keras-introduction.srt | 419 ++++-- .../en/26_fine-tuning-with-tensorflow.srt | 920 +++++++----- ...arning-rate-scheduling-with-tensorflow.srt | 647 +++++--- .../28_tensorflow-predictions-and-metrics.srt | 655 +++++--- ...29_write-your-training-loop-in-pytorch.srt | 868 ++++++----- ...-pytorch-training-loop-with-accelerate.srt | 495 ++++--- subtitles/en/31_navigating-the-model-hub.srt | 526 ++++--- .../32_managing-a-repo-on-the-model-hub.srt | 1096 +++++++++----- .../en/33_the-push-to-hub-api-(pytorch).srt | 723 ++++++--- .../34_the-push-to-hub-api-(tensorflow).srt | 1311 +++++++++++------ subtitles/en/35_loading-a-custom-dataset.srt | 472 ++++-- ...e-and-dice-a-dataset-\360\237\224\252.srt" | 573 ++++--- ...dataframes-=-\342\235\244\357\270\217.srt" | 427 ++++-- .../en/38_saving-and-reloading-a-dataset.srt | 538 ++++--- .../en/39_memory-mapping-&-streaming.srt | 553 ++++--- .../en/40_uploading-a-dataset-to-the-hub.srt | 337 +++-- .../41_text-embeddings-&-semantic-search.srt | 552 ++++--- subtitles/en/42_training-a-new-tokenizer.srt | 834 +++++++---- ...43_why-are-fast-tokenizers-called-fast.srt | 257 ++-- .../en/44_fast-tokenizer-superpowers.srt | 535 ++++--- ...oken-classification-pipeline-(pytorch).srt | 487 ++++-- ...n-classification-pipeline-(tensorflow).srt | 509 ++++--- ...-question-answering-pipeline-(pytorch).srt | 567 ++++--- ...estion-answering-pipeline-(tensorflow).srt | 535 ++++--- subtitles/en/49_what-is-normalization.srt | 664 +++++---- subtitles/en/50_what-is-pre-tokenization.srt | 308 ++-- .../en/51_byte-pair-encoding-tokenization.srt | 581 +++++--- subtitles/en/52_wordpiece-tokenization.srt | 444 ++++-- subtitles/en/53_unigram-tokenization.srt | 1151 +++++++++------ subtitles/en/54_building-a-new-tokenizer.srt | 641 +++++--- ...ta-processing-for-token-classification.srt | 500 ++++--- 
...rocessing-for-masked-language-modeling.srt | 373 +++-- subtitles/en/57_what-is-perplexity.srt | 330 +++-- subtitles/en/58_what-is-domain-adaptation.srt | 274 ++-- .../en/59_data-processing-for-translation.srt | 405 +++-- subtitles/en/60_what-is-the-bleu-metric.srt | 814 ++++++---- .../61_data-processing-for-summarization.srt | 325 ++-- subtitles/en/62_what-is-the-rouge-metric.srt | 689 ++++++--- ...rocessing-for-causal-language-modeling.srt | 629 +++++--- .../en/64_using-a-custom-loss-function.srt | 494 ++++--- ...data-processing-for-question-answering.srt | 462 +++--- ...g-step-in-question-answering-(pytorch).srt | 511 ++++--- ...tep-in-question-answering-(tensorflow).srt | 498 ++++--- subtitles/en/68_data-collators-a-tour.srt | 996 ++++++++----- .../69_what-to-do-when-you-get-an-error.srt | 405 +++-- .../en/70_using-a-debugger-in-a-notebook.srt | 451 ++++-- .../en/71_using-a-debugger-in-a-terminal.srt | 503 +++++-- .../en/72_asking-for-help-on-the-forums.srt | 520 ++++--- ...ugging-the-training-pipeline-(pytorch).srt | 662 ++++++--- ...ing-the-training-pipeline-(tensorflow).srt | 1108 ++++++++++---- subtitles/en/75_writing-a-good-issue.srt | 494 ++++--- 76 files changed, 28836 insertions(+), 14443 deletions(-) diff --git a/subtitles/en/00_welcome-to-the-hugging-face-course.srt b/subtitles/en/00_welcome-to-the-hugging-face-course.srt index f84bf031a..ae8eb7042 100644 --- a/subtitles/en/00_welcome-to-the-hugging-face-course.srt +++ b/subtitles/en/00_welcome-to-the-hugging-face-course.srt @@ -1,84 +1,455 @@ -1 -00:00:05,760 --> 00:00:10,480 -Welcome to the Hugging Face Course! This  -course has been designed to teach you all   - -2 -00:00:10,480 --> 00:00:15,200 -about the Hugging Face ecosystem: how to  -use the dataset and model hub as well as   - -3 -00:00:15,200 --> 00:00:20,880 -all our open source libraries. Here is  -the Table of Contents. As you can see,   - -4 -00:00:20,880 --> 00:00:26,000 -it's divided in three sections which become  -progressively more advanced. At this stage,   - -5 -00:00:26,000 --> 00:00:30,560 -the first two sections have been released. The  -first will teach you the basics of how to use   - -6 -00:00:30,560 --> 00:00:35,680 -a Transformer model, fine-tune it on your own  -dataset and share the result with the community.   - -7 -00:00:36,800 --> 00:00:42,400 -The second will dive deeper into our libraries  -and teach you how to tackle any NLP task. We are   - -8 -00:00:42,400 --> 00:00:46,960 -actively working on the last one and hope to  -have it ready for you for the spring of 2022.   - -9 -00:00:48,320 --> 00:00:52,320 -The first chapter requires no technical knowledge  -and is a good introduction to learn what   - -10 -00:00:52,320 --> 00:00:59,040 -Transformers models can do and how they could be  -of use to you or your company. The next chapters   - -11 -00:00:59,040 --> 00:01:03,280 -require a good knowledge of Python and some basic  -knowledge of Machine Learning and Deep Learning.   - -12 -00:01:04,160 --> 00:01:09,120 -If you don't know what a training and validation  -set is or what gradient descent means,   - -13 -00:01:09,120 --> 00:01:13,600 -you should look at an introductory course such as  -the ones published by deeplearning.ai or fast.ai.   - -14 -00:01:16,000 --> 00:01:20,400 -It's also best if you have some basics in one  -Deep Learning Framework (PyTorch or TensorFlow).   
- -15 -00:01:20,960 --> 00:01:25,280 -Each part of the material introduced in this  -course has a version in both those frameworks,   - -16 -00:01:25,280 --> 00:01:30,160 -so you will be able to pick the one you are  -most comfortable with. This is the team that   - -17 -00:01:30,160 --> 00:01:34,240 -developed this course. I'll now let each of  -the speakers introduce themselves briefly. +1 +00:00:05,850 --> 00:00:07,713 +- Welcome to the Hugging Face Course. + +2 +00:00:08,550 --> 00:00:10,320 +This course has been designed to teach you + +3 +00:00:10,320 --> 00:00:12,750 +all about the Hugging Face ecosystem, + +4 +00:00:12,750 --> 00:00:14,700 +how to use the dataset and model hub + +5 +00:00:14,700 --> 00:00:16,803 +as well as all our open-source libraries. + +6 +00:00:18,300 --> 00:00:19,950 +Here is the Table of Contents. + +7 +00:00:19,950 --> 00:00:22,770 +As you can see, it's +divided in three sections + +8 +00:00:22,770 --> 00:00:25,110 +which become progressively more advanced. + +9 +00:00:25,110 --> 00:00:28,500 +At this stage, the first two +sections have been released. + +10 +00:00:28,500 --> 00:00:30,120 +So first, we'll teach you the basics + +11 +00:00:30,120 --> 00:00:32,250 +of how to use a Transformer model, + +12 +00:00:32,250 --> 00:00:34,230 +fine-tune it on your own data set + +13 +00:00:34,230 --> 00:00:36,960 +and share the result with the community. + +14 +00:00:36,960 --> 00:00:39,420 +So second, we'll dive +deeper into our libraries + +15 +00:00:39,420 --> 00:00:42,360 +and teach you how to tackle any NLP task. + +16 +00:00:42,360 --> 00:00:44,430 +We're actively working on the last one + +17 +00:00:44,430 --> 00:00:47,280 +and hope to have it ready for +you for the spring of 2022. + +18 +00:00:48,510 --> 00:00:50,880 +The first chapter requires +no technical knowledge + +19 +00:00:50,880 --> 00:00:52,320 +and is a good introduction to learn + +20 +00:00:52,320 --> 00:00:54,180 +what Transformers models can do + +21 +00:00:54,180 --> 00:00:56,883 +and how it could be of use +to you or your company. + +22 +00:00:58,050 --> 00:01:01,110 +The next chapters require +a good knowledge of Python + +23 +00:01:01,110 --> 00:01:02,130 +and some basic knowledge of + +24 +00:01:02,130 --> 00:01:04,350 +Machine Learning and Deep Learning. + +25 +00:01:04,350 --> 00:01:07,110 +If you don't know what a +training and validation set are + +26 +00:01:07,110 --> 00:01:09,360 +or what gradient decent means, + +27 +00:01:09,360 --> 00:01:11,340 +you should look at an introductory course + +28 +00:01:11,340 --> 00:01:14,863 +such as the ones published by +deeplearning.ai or fast.ai. + +29 +00:01:16,200 --> 00:01:17,910 +It's also best if you have some basics + +30 +00:01:17,910 --> 00:01:21,150 +in one Deep Learning Framework, +PyTorch or TensorFlow. + +31 +00:01:21,150 --> 00:01:23,520 +Each part of the material +introduced in this course + +32 +00:01:23,520 --> 00:01:25,590 +has a version in both those frameworks, + +33 +00:01:25,590 --> 00:01:26,730 +so you will be able to pick the one + +34 +00:01:26,730 --> 00:01:28,230 +you are most comfortable with. + +35 +00:01:29,550 --> 00:01:31,740 +This is the team that +developed this course. + +36 +00:01:31,740 --> 00:01:33,120 +I'll now let each of the speakers + +37 +00:01:33,120 --> 00:01:34,570 +introduce themselves briefly. + +38 +00:01:37,230 --> 00:01:38,880 +- Hi, my name is Matthew, + +39 +00:01:38,880 --> 00:01:41,610 +and I'm a Machine Learning +Engineer at Hugging Face. 
+ +40 +00:01:41,610 --> 00:01:43,200 +I work on the open-source team + +41 +00:01:43,200 --> 00:01:45,180 +and I'm responsible for +maintaining particularly + +42 +00:01:45,180 --> 00:01:47,280 +the TensorFlow code there. + +43 +00:01:47,280 --> 00:01:50,130 +Previously, I was a Machine +Learning Engineer at Parsley, + +44 +00:01:50,130 --> 00:01:52,620 +who've recently been +acquired by Automatic, + +45 +00:01:52,620 --> 00:01:54,210 +and I was a postdoctoral researcher + +46 +00:01:54,210 --> 00:01:57,000 +before that at Trinity +College, Dublin in Ireland + +47 +00:01:57,000 --> 00:02:00,093 +working on computational +genetics and retinal disease. + +48 +00:02:02,400 --> 00:02:03,870 +- Hi, I'm Lysandre. + +49 +00:02:03,870 --> 00:02:05,640 +I'm a Machine Learning +Engineer at Hugging Face + +50 +00:02:05,640 --> 00:02:08,700 +and I'm specifically part +of the open-source team. + +51 +00:02:08,700 --> 00:02:10,890 +I've been at Hugging +Face for a few years now + +52 +00:02:10,890 --> 00:02:12,300 +and alongside my team members, + +53 +00:02:12,300 --> 00:02:13,890 +I've been working on most of the tools + +54 +00:02:13,890 --> 00:02:15,790 +that you'll get to see in this course. + +55 +00:02:18,270 --> 00:02:20,130 +- Hi, I'm Sylvain. + +56 +00:02:20,130 --> 00:02:22,140 +I'm a Research Engineer at Hugging Face + +57 +00:02:22,140 --> 00:02:25,830 +and one of the main maintainers +of the Transformers Library. + +58 +00:02:25,830 --> 00:02:28,110 +Previously, I worked at fast.ai + +59 +00:02:28,110 --> 00:02:30,420 +where I helped develop the fast.ai Library + +60 +00:02:30,420 --> 00:02:32,220 +as well as the online book. + +61 +00:02:32,220 --> 00:02:35,340 +Before that, I was a math +and computer science teacher + +62 +00:02:35,340 --> 00:02:36,173 +in France. + +63 +00:02:38,550 --> 00:02:41,340 +- Hi, my name is Sasha and I'm +a Researcher at Hugging Face, + +64 +00:02:41,340 --> 00:02:42,420 +working on the ethical, + +65 +00:02:42,420 --> 00:02:46,230 +environmental and social impacts +of machine learning models. + +66 +00:02:46,230 --> 00:02:49,020 +Previously, I was a +postdoctoral researcher at Mila, + +67 +00:02:49,020 --> 00:02:50,400 +University in Montreal + +68 +00:02:50,400 --> 00:02:53,040 +and I also worked as an +Applied AI Researcher + +69 +00:02:53,040 --> 00:02:55,140 +for the United Nations Global Pulse. + +70 +00:02:55,140 --> 00:02:57,300 +I've been involved in +projects such as CodeCarbon + +71 +00:02:57,300 --> 00:02:59,790 +and the Machine Learning +Impacts Calculator + +72 +00:02:59,790 --> 00:03:02,390 +to measure the carbon +footprint of machine learning. + +73 +00:03:05,160 --> 00:03:07,650 +- Hi, I'm Merve and I'm +a Developer Advocate + +74 +00:03:07,650 --> 00:03:09,390 +at Hugging Face. + +75 +00:03:09,390 --> 00:03:12,480 +Previously, I was working as +a Machine Learning Engineer + +76 +00:03:12,480 --> 00:03:15,360 +building NLP tools and chat bots. + +77 +00:03:15,360 --> 00:03:17,670 +Currently, I'm working to improve the hub + +78 +00:03:17,670 --> 00:03:19,563 +and democratize machine learning. + +79 +00:03:22,140 --> 00:03:23,670 +- Hello everyone. + +80 +00:03:23,670 --> 00:03:27,210 +My name is Lucile and I'm +a Machine Learning Engineer + +81 +00:03:27,210 --> 00:03:28,353 +at Hugging Face. 
+ +82 +00:03:29,580 --> 00:03:32,550 +To tell you in two sentences who I am, + +83 +00:03:32,550 --> 00:03:35,590 +I work on the development and +support of open-source tools + +84 +00:03:36,600 --> 00:03:39,595 +and I also participate in +several research project + +85 +00:03:39,595 --> 00:03:41,795 +in the field of Natural +Language Processing. + +86 +00:03:44,610 --> 00:03:45,540 +- Good day there. + +87 +00:03:45,540 --> 00:03:47,550 +I'm Lewis and I'm a +Machine Learning Engineer + +88 +00:03:47,550 --> 00:03:50,130 +in the open-source team at Hugging Face. + +89 +00:03:50,130 --> 00:03:53,490 +I'm passionate about developing +tools for the NLP community + +90 +00:03:53,490 --> 00:03:55,050 +and you'll see me at +many of Hugging Face's + +91 +00:03:55,050 --> 00:03:56,910 +outreach activities. + +92 +00:03:56,910 --> 00:03:58,470 +Before joining Hugging Face, + +93 +00:03:58,470 --> 00:03:59,790 +I spent several years developing + +94 +00:03:59,790 --> 00:04:01,860 +machine learning applications for startups + +95 +00:04:01,860 --> 00:04:04,230 +and enterprises in the domains of NLP, + +96 +00:04:04,230 --> 00:04:07,260 +topological data analysis and time series. + +97 +00:04:07,260 --> 00:04:10,110 +In a former life, I was +a theoretical physicist, + +98 +00:04:10,110 --> 00:04:11,760 +where I researched particle collisions + +99 +00:04:11,760 --> 00:04:13,560 +at the Large Hadron Collider and so. + +100 +00:04:15,900 --> 00:04:18,450 +- Hey, I'm Leandro and I'm +a Machine Learning Engineer + +101 +00:04:18,450 --> 00:04:21,030 +in the open-source team at Hugging Face. + +102 +00:04:21,030 --> 00:04:23,460 +Before joining Hugging Face, +I worked as a Data Scientist + +103 +00:04:23,460 --> 00:04:26,733 +in Switzerland and have taught +Data Science at University. + diff --git a/subtitles/en/01_the-pipeline-function.srt b/subtitles/en/01_the-pipeline-function.srt index c5fff35db..44c3bf8bb 100644 --- a/subtitles/en/01_the-pipeline-function.srt +++ b/subtitles/en/01_the-pipeline-function.srt @@ -1,223 +1,445 @@ -1 -00:00:05,680 --> 00:00:06,720 -The pipeline function.   - -2 -00:00:09,360 --> 00:00:13,280 -The pipeline function is the most  -high-level API of the Transformers library.   - -3 -00:00:13,840 --> 00:00:21,200 -It regroups together all the steps to go from raw  -texts to usable predictions. The model used is at   - -4 -00:00:21,200 --> 00:00:26,720 -the core of a pipeline, but the pipeline also  -include all the necessary pre-processing (since   - -5 -00:00:26,720 --> 00:00:32,800 -the model does not expect texts, but numbers) as  -well as some post-processing to make the output of   - -6 -00:00:32,800 --> 00:00:39,440 -the model human-readable. Let's look at a first  -example with the sentiment analysis pipeline.   - -7 -00:00:40,480 --> 00:00:46,080 -This pipeline performs text classification on a  -given input, and determines if it's positive or   - -8 -00:00:46,080 --> 00:00:53,120 -negative. Here, it attributed the positive label  -on the given text, with a confidence of 95%.   - -9 -00:00:55,440 --> 00:00:59,520 -You can pass multiple texts to the  -same pipeline, which will be processed   - -10 -00:00:59,520 --> 00:01:05,840 -and passed through the model together, as a  -batch. The output is a list of individual results,   - -11 -00:01:05,840 --> 00:01:12,080 -in the same order as the input texts. Here we  -find the same label and score for the first text,   - -12 -00:01:12,080 --> 00:01:16,480 -and the second text is judged  -positive with a confidence of 99.99%.   
- -13 -00:01:18,480 --> 00:01:22,720 -The zero-shot classification pipeline is a  -more general text-classification pipeline:   - -14 -00:01:23,360 --> 00:01:28,320 -it allows you to provide the labels you  -want. Here we want to classify our input   - -15 -00:01:28,320 --> 00:01:35,360 -text along the labels "education", "politics" and  -"business". The pipeline successfully recognizes   - -16 -00:01:35,360 --> 00:01:39,360 -it's more about education than the  -other labels, with a confidence of 84%.   - -17 -00:01:41,440 --> 00:01:47,360 -Moving on to other tasks, the text generation  -pipeline will auto-complete a given prompt. The   - -18 -00:01:47,360 --> 00:01:52,560 -output is generated with a bit of randomness, so  -it changes each time you call the generator object   - -19 -00:01:52,560 --> 00:01:58,960 -on a given prompt. Up until now, we have used the  -pipeline API with the default model associated to   - -20 -00:01:58,960 --> 00:02:03,920 -each task, but you can use it with any model that  -has been pretrained or fine-tuned on this task.   - -21 -00:02:06,320 --> 00:02:12,320 -Going on the model hub (huggingface.co/models),  -you can filter the available models by task.   - -22 -00:02:13,120 --> 00:02:16,960 -The default model used in our  -previous example was gpt2,   - -23 -00:02:16,960 --> 00:02:20,080 -but there are many more models  -available, and not just in English!   - -24 -00:02:21,280 --> 00:02:27,120 -Let's go back to the text generation pipeline and  -load it with another model, distilgpt2. This is   - -25 -00:02:27,120 --> 00:02:33,120 -a lighter version of gpt2 created by the Hugging  -Face team. When applying the pipeline to a given   - -26 -00:02:33,120 --> 00:02:39,280 -prompt, we can specify several arguments, such as  -the maximum length of the generated texts, or the   - -27 -00:02:39,280 --> 00:02:43,520 -number of sentences we want to return (since  -there is some randomness in the generation).   - -28 -00:02:45,920 --> 00:02:50,480 -Generating text by guessing the next word in a  -sentence was the pretraining objective of GPT-2,   - -29 -00:02:51,200 --> 00:02:56,240 -the fill mask pipeline is the pretraining  -objective of BERT, which is to guess the value   - -30 -00:02:56,240 --> 00:03:02,480 -of masked word. In this case, we ask the two most  -likely values for the missing words (according to   - -31 -00:03:02,480 --> 00:03:09,120 -the model) and get mathematical or computational  -as possible answers. Another task Transformers   - -32 -00:03:09,120 --> 00:03:13,920 -model can perform is to classify each word in  -the sentence instead of the sentence as a whole.   - -33 -00:03:14,720 --> 00:03:21,040 -One example of this is Named Entity Recognition,  -which is the task of identifying entities, such as   - -34 -00:03:21,040 --> 00:03:29,360 -persons, organizations or locations in a sentence.  -Here, the model correctly finds the person   - -35 -00:03:29,360 --> 00:03:36,000 -(Sylvain), the organization (Hugging Face) as well  -as the location (Brooklyn) inside the input text.   - -36 -00:03:37,440 --> 00:03:42,080 -The grouped_entities=True argument used  -is to make the pipeline group together   - -37 -00:03:42,080 --> 00:03:46,080 -the different words linked to the same  -entity (such as Hugging and Face here).   - -38 -00:03:48,000 --> 00:03:52,160 -Another task available with the pipeline  -API is extractive question answering.   
- -39 -00:03:52,720 --> 00:03:58,080 -Providing a context and a question, the model  -will identify the span of text in the context   - -40 -00:03:58,080 --> 00:04:03,920 -containing the answer to the question. Getting  -short summaries of very long articles is   - -41 -00:04:03,920 --> 00:04:07,840 -also something the Transformers library can  -help with, with the summarization pipeline.   - -42 -00:04:09,360 --> 00:04:15,040 -Finally, the last task supported by the  -pipeline API is translation. Here we use   - -43 -00:04:15,040 --> 00:04:19,440 -a French/English model found on the model hub  -to get the English version of our input text.   - -44 -00:04:21,360 --> 00:04:24,720 -Here is a brief summary of all the  -tasks we looked into in this video.   - -45 -00:04:25,280 --> 00:04:27,840 -Try then out through the inference  -widgets in the model hub! +1 +00:00:00,069 --> 00:00:01,341 +(screen whooshes) + +2 +00:00:01,341 --> 00:00:02,449 +(face logo whooshes) + +3 +00:00:02,449 --> 00:00:05,880 +(screen whooshes) + +4 +00:00:05,880 --> 00:00:07,080 +- The pipeline function. + +5 +00:00:09,540 --> 00:00:12,020 +The pipeline function is +the most high level API + +6 +00:00:12,020 --> 00:00:14,010 +of the Transformers library. + +7 +00:00:14,010 --> 00:00:16,050 +It regroups together all the steps + +8 +00:00:16,050 --> 00:00:18,873 +to go from raw texts +to usable predictions. + +9 +00:00:20,228 --> 00:00:22,980 +The model used is at +the core of a pipeline, + +10 +00:00:22,980 --> 00:00:24,390 +but the pipeline also include + +11 +00:00:24,390 --> 00:00:26,610 +all the necessary pre-processing, + +12 +00:00:26,610 --> 00:00:30,240 +since the model does not +expect texts, but number, + +13 +00:00:30,240 --> 00:00:32,040 +as well as some post-processing, + +14 +00:00:32,040 --> 00:00:34,533 +to make the output of +the model human-readable. + +15 +00:00:35,910 --> 00:00:37,593 +Let's look at a first example + +16 +00:00:37,593 --> 00:00:39,693 +with the sentiment analysis pipeline. + +17 +00:00:40,740 --> 00:00:44,670 +This pipeline performs text +classification on a given input + +18 +00:00:44,670 --> 00:00:46,953 +and determines if it's +positive or negative. + +19 +00:00:47,910 --> 00:00:51,750 +Here, it attributed the positive +label on the given text, + +20 +00:00:51,750 --> 00:00:54,413 +with a confidence of 95%. + +21 +00:00:55,650 --> 00:00:58,470 +You can pass multiple +texts to the same pipeline, + +22 +00:00:58,470 --> 00:01:00,270 +which will be processed and passed + +23 +00:01:00,270 --> 00:01:02,673 +through the model together as a batch. + +24 +00:01:03,570 --> 00:01:05,970 +The output is a list of individual results + +25 +00:01:05,970 --> 00:01:07,923 +in the same order as the input texts. + +26 +00:01:08,790 --> 00:01:12,270 +Here we find the same label +and score for the first text, + +27 +00:01:12,270 --> 00:01:14,443 +and the second text is judged negative + +28 +00:01:14,443 --> 00:01:17,243 +with a confidence of 99.9%. + +29 +00:01:18,720 --> 00:01:20,700 +The zero-shot classification pipeline + +30 +00:01:20,700 --> 00:01:23,610 +is a more general +text-classification pipeline, + +31 +00:01:23,610 --> 00:01:26,370 +it allows you to provide +the labels you want. + +32 +00:01:26,370 --> 00:01:29,850 +Here we want to classify our +input text along the labels, + +33 +00:01:29,850 --> 00:01:32,643 +education, politics, and business. 
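Not part of the original subtitle patch: a minimal sketch of the two pipeline calls described in the cues above, assuming the transformers library is installed. The input strings are illustrative placeholders, and the exact labels and scores depend on the default checkpoints, which may change between library versions.

    from transformers import pipeline

    # Sentiment analysis: each input gets a POSITIVE/NEGATIVE label and a confidence
    # score, returned in the same order as the inputs when passed as a batch.
    classifier = pipeline("sentiment-analysis")
    print(classifier(["I've been waiting for a course like this my whole life.",
                      "I hate this so much!"]))

    # Zero-shot classification: you supply the candidate labels yourself.
    zero_shot = pipeline("zero-shot-classification")
    print(zero_shot("This is a course about the Transformers library",
                    candidate_labels=["education", "politics", "business"]))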
+ +34 +00:01:33,540 --> 00:01:35,580 +The pipeline successfully recognizes + +35 +00:01:35,580 --> 00:01:38,280 +it's more about education +than the other labels, + +36 +00:01:38,280 --> 00:01:40,643 +with a confidence of 84%. + +37 +00:01:41,670 --> 00:01:43,110 +Moving on to other tasks, + +38 +00:01:43,110 --> 00:01:45,030 +the text generation pipeline will + +39 +00:01:45,030 --> 00:01:46,533 +auto-complete a given prompt. + +40 +00:01:47,460 --> 00:01:49,980 +The output is generated +with a bit of randomness, + +41 +00:01:49,980 --> 00:01:52,800 +so it changes each time you +call the generator object + +42 +00:01:52,800 --> 00:01:53,763 +on a given prompt. + +43 +00:01:54,990 --> 00:01:57,123 +Up until now, we've used +the the pipeline API + +44 +00:01:57,123 --> 00:02:00,360 +with the default model +associated to each task, + +45 +00:02:00,360 --> 00:02:02,880 +but you can use it with any +model that has been pretrained + +46 +00:02:02,880 --> 00:02:04,263 +or fine-tuned on this task. + +47 +00:02:06,540 --> 00:02:10,350 +Going on the model hub, +huggingface.co/models + +48 +00:02:10,350 --> 00:02:13,350 +you can filter the +available models by task. + +49 +00:02:13,350 --> 00:02:17,190 +The default model used in our +previous example was gpt2, + +50 +00:02:17,190 --> 00:02:19,290 +but there are many more models available, + +51 +00:02:19,290 --> 00:02:20,523 +and not just in English. + +52 +00:02:21,450 --> 00:02:23,670 +Let's go back to the +text generation pipeline + +53 +00:02:23,670 --> 00:02:26,193 +and load it with another +model, distilgpt2. + +54 +00:02:27,060 --> 00:02:28,950 +This is a lighter version of gpt2 + +55 +00:02:28,950 --> 00:02:30,603 +created by the Hugging Face team. + +56 +00:02:31,740 --> 00:02:34,110 +When applying the pipeline +to a given prompt, + +57 +00:02:34,110 --> 00:02:36,360 +we can specify several arguments + +58 +00:02:36,360 --> 00:02:39,240 +such as the maximum length +of the generated texts, + +59 +00:02:39,240 --> 00:02:41,700 +or the number of sentences +we want to return, + +60 +00:02:41,700 --> 00:02:44,150 +since there is some +randomness in the generation. + +61 +00:02:46,080 --> 00:02:48,750 +Generating texts by guessing +the next word in a sentence + +62 +00:02:48,750 --> 00:02:51,450 +was the pretraining objective of GPT-2. + +63 +00:02:51,450 --> 00:02:55,140 +The fill mask pipeline is the +pretraining objective of BERT, + +64 +00:02:55,140 --> 00:02:57,363 +which is to guess the +value of masked word. + +65 +00:02:58,260 --> 00:03:01,020 +In this case, we ask the +two most likely values + +66 +00:03:01,020 --> 00:03:03,660 +for the missing words, +according to the model, + +67 +00:03:03,660 --> 00:03:07,053 +and get mathematical or +computational as possible answers. + +68 +00:03:08,280 --> 00:03:10,170 +Another task Transformers +model can perform + +69 +00:03:10,170 --> 00:03:12,660 +is to classify each word in the sentence + +70 +00:03:12,660 --> 00:03:14,970 +instead of the sentence as a whole. + +71 +00:03:14,970 --> 00:03:18,390 +One example of this is +Named Entity Recognition, + +72 +00:03:18,390 --> 00:03:20,820 +which is the task of identifying entities, + +73 +00:03:20,820 --> 00:03:25,323 +such as persons, organizations +or locations in a sentence. 
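As a companion to the text-generation and fill-mask cues above, a hedged sketch of what those calls look like in code. The prompt and masked sentence are placeholders; generation involves randomness, so outputs differ between runs, and the mask token (<mask> here) depends on the default checkpoint.

    from transformers import pipeline

    # Text generation with the lighter distilgpt2 checkpoint mentioned in the video.
    generator = pipeline("text-generation", model="distilgpt2")
    print(generator("In this course, we will teach you how to",
                    max_length=30, num_return_sequences=2))

    # Fill-mask: ask for the two most likely values of the masked word.
    unmasker = pipeline("fill-mask")
    print(unmasker("This course will teach you all about <mask> models.", top_k=2))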
+ +74 +00:03:26,400 --> 00:03:30,570 +Here, the model correctly +finds the person, Sylvain, + +75 +00:03:30,570 --> 00:03:32,453 +the organization, Hugging Face, + +76 +00:03:32,453 --> 00:03:35,010 +as well as the location, Brooklyn, + +77 +00:03:35,010 --> 00:03:36,303 +inside the input text. + +78 +00:03:37,661 --> 00:03:40,230 +The grouped_entities=True argument used + +79 +00:03:40,230 --> 00:03:42,330 +is to make the pipeline group together + +80 +00:03:42,330 --> 00:03:44,790 +the different words +linked to the same entity, + +81 +00:03:44,790 --> 00:03:46,353 +such as Hugging and Face here. + +82 +00:03:48,270 --> 00:03:50,670 +Another task available +with the pipeline API + +83 +00:03:50,670 --> 00:03:52,920 +is extractive question answering. + +84 +00:03:52,920 --> 00:03:55,380 +Providing a context and a question, + +85 +00:03:55,380 --> 00:03:58,290 +the model will identify the +span of text in the context + +86 +00:03:58,290 --> 00:04:00,190 +containing the answer to the question. + +87 +00:04:01,650 --> 00:04:03,960 +Getting short summaries +of very long articles + +88 +00:04:03,960 --> 00:04:06,540 +is also something the Transformers +library can help with, + +89 +00:04:06,540 --> 00:04:08,140 +with the summarization pipeline. + +90 +00:04:09,480 --> 00:04:12,570 +Finally, the last task +supported by the pipeline API + +91 +00:04:12,570 --> 00:04:14,130 +is translation. + +92 +00:04:14,130 --> 00:04:16,170 +Here we use a French/English model + +93 +00:04:16,170 --> 00:04:17,460 +found on the model hub + +94 +00:04:17,460 --> 00:04:19,893 +to get the English +version of our input text. + +95 +00:04:21,600 --> 00:04:23,490 +Here is a brief summary of all the tasks + +96 +00:04:23,490 --> 00:04:25,500 +we've looked into in this video. + +97 +00:04:25,500 --> 00:04:27,390 +Try then out through the inference widgets + +98 +00:04:27,390 --> 00:04:28,327 +in the model hub. + +99 +00:04:30,459 --> 00:04:33,475 +(screen whooshes) + +100 +00:04:33,475 --> 00:04:35,175 +(logo whooshes) + diff --git a/subtitles/en/02_the-carbon-footprint-of-transformers.srt b/subtitles/en/02_the-carbon-footprint-of-transformers.srt index 5147f8e12..101d676a1 100644 --- a/subtitles/en/02_the-carbon-footprint-of-transformers.srt +++ b/subtitles/en/02_the-carbon-footprint-of-transformers.srt @@ -1 +1,581 @@ -No transcript found for this video! \ No newline at end of file +1 +00:00:05,580 --> 00:00:08,820 +- So let's talk about the carbon +footprint of transformers. + +2 +00:00:08,820 --> 00:00:10,530 +Maybe you've seen +headlines such as this one + +3 +00:00:10,530 --> 00:00:13,530 +that training a single AI +model can emit as much carbon + +4 +00:00:13,530 --> 00:00:16,020 +as five cars in their lifetimes. + +5 +00:00:16,020 --> 00:00:19,440 +So when is this true +and is it always true? + +6 +00:00:19,440 --> 00:00:21,803 +Well, it actually depends +on several things. + +7 +00:00:21,803 --> 00:00:23,430 +Most importantly, it depends + +8 +00:00:23,430 --> 00:00:24,960 +on the type of energy you're using. + +9 +00:00:24,960 --> 00:00:26,267 +If you're using renewable energy such as + +10 +00:00:26,267 --> 00:00:30,670 +solar, wind, hydroelectricity, +you're really + +11 +00:00:30,670 --> 00:00:33,810 +not emitting any carbon +at all, very, very little. 
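Rounding out the pipeline examples from the subtitles above (named entity recognition and extractive question answering), a minimal sketch assuming transformers is installed; the sentences are placeholders modelled on the ones described in the video.

    from transformers import pipeline

    # NER; grouped_entities=True merges the sub-words belonging to the same entity.
    ner = pipeline("ner", grouped_entities=True)
    print(ner("My name is Sylvain and I work at Hugging Face in Brooklyn."))

    # Extractive question answering: the answer is a span copied out of the context.
    question_answerer = pipeline("question-answering")
    print(question_answerer(
        question="Where do I work?",
        context="My name is Sylvain and I work at Hugging Face in Brooklyn.",
    ))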
+ +12 +00:00:33,810 --> 00:00:36,769 +If you're using non-renewable +energy sources such as coal + +13 +00:00:36,769 --> 00:00:39,570 +then their carbon +footprint is a lot higher + +14 +00:00:39,570 --> 00:00:43,260 +'cuz essentially you are emitting +a lot of greenhouse gases. + +15 +00:00:43,260 --> 00:00:44,670 +Another aspect is training time. + +16 +00:00:44,670 --> 00:00:47,232 +So the longer you train, +the more energy you use + +17 +00:00:47,232 --> 00:00:50,250 +the more energy you use, the +more carbon you emit, right? + +18 +00:00:50,250 --> 00:00:51,270 +So this really adds up + +19 +00:00:51,270 --> 00:00:53,520 +especially if you're +training large models for + +20 +00:00:53,520 --> 00:00:56,460 +for hours and days and weeks. + +21 +00:00:56,460 --> 00:00:58,380 +The hardware you use also matters + +22 +00:00:58,380 --> 00:01:00,930 +because some GPUs, for +example, are more efficient + +23 +00:01:00,930 --> 00:01:05,460 +than others and utilizing +efficiency use properly. + +24 +00:01:05,460 --> 00:01:07,500 +So using them a hundred +percent all the time + +25 +00:01:07,500 --> 00:01:10,650 +can really reduce the energy +consumption that you have. + +26 +00:01:10,650 --> 00:01:13,290 +And then once again, reduce +your carbon footprint. + +27 +00:01:13,290 --> 00:01:15,870 +There's also other aspects such as IO + +28 +00:01:15,870 --> 00:01:17,730 +such as data, et cetera, et cetera. + +29 +00:01:17,730 --> 00:01:20,940 +But these are the main three +that you should focus on. + +30 +00:01:20,940 --> 00:01:23,340 +So when I talk about energy +sources and carbon intensity + +31 +00:01:23,340 --> 00:01:24,420 +what does that really mean? + +32 +00:01:24,420 --> 00:01:27,480 +So if you look at the top of the screen + +33 +00:01:27,480 --> 00:01:30,480 +you have a carbon footprint + +34 +00:01:30,480 --> 00:01:33,860 +of a cloud computing +instance in Mumbai, India + +35 +00:01:33,860 --> 00:01:38,700 +which emits 920 grams of +CO2 per kilowatt hour. + +36 +00:01:38,700 --> 00:01:40,110 +This is almost one kilogram + +37 +00:01:40,110 --> 00:01:43,680 +of CO2 per kilowatt hour +of electricity used. + +38 +00:01:43,680 --> 00:01:45,150 +If you compare that with Canada, Montreal + +39 +00:01:45,150 --> 00:01:48,720 +where I am right now, 20 +grams of CO2 per kilo hour. + +40 +00:01:48,720 --> 00:01:50,040 +So that's a really, really big difference. + +41 +00:01:50,040 --> 00:01:54,240 +Almost more than 40 +times more carbon emitted + +42 +00:01:54,240 --> 00:01:55,950 +in Mumbai versus Montreal. + +43 +00:01:55,950 --> 00:01:57,720 +And so this can really, really add up. + +44 +00:01:57,720 --> 00:01:59,820 +If you're training a model +for weeks, for example + +45 +00:01:59,820 --> 00:02:01,920 +you're multiplying times 40 + +46 +00:02:01,920 --> 00:02:03,450 +the carbon that you're emitting. + +47 +00:02:03,450 --> 00:02:05,070 +So choosing the right instance + +48 +00:02:05,070 --> 00:02:07,080 +choosing a low carbon compute instance + +49 +00:02:07,080 --> 00:02:09,690 +is really the most impactful +thing that you can do. + +50 +00:02:09,690 --> 00:02:13,020 +And this is where it can really add up + +51 +00:02:13,020 --> 00:02:15,930 +if you're training in a very intensive + +52 +00:02:15,930 --> 00:02:17,580 +in a very carbon intensive region + +53 +00:02:19,170 --> 00:02:21,750 +other elements to consider, for example + +54 +00:02:21,750 --> 00:02:22,770 +using pre-trained models + +55 +00:02:22,770 --> 00:02:25,590 +that's the machine learning +equivalent of recycling. 
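The arithmetic behind the Mumbai versus Montreal comparison in the cues above, written out as a short sketch; only the two carbon-intensity numbers come from the video, the training energy figure is a made-up placeholder.

    # Grams of CO2 emitted per kWh of electricity, as quoted in the video.
    mumbai_g_per_kwh = 920
    montreal_g_per_kwh = 20
    print(mumbai_g_per_kwh / montreal_g_per_kwh)  # 46.0 -> "more than 40 times" more carbon

    # Estimated emissions scale linearly with the energy a training run consumes.
    training_energy_kwh = 1_000  # placeholder value, not from the video
    print(training_energy_kwh * mumbai_g_per_kwh / 1000, "kg of CO2 in Mumbai")
    print(training_energy_kwh * montreal_g_per_kwh / 1000, "kg of CO2 in Montreal")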
+
+56
+00:02:25,590 --> 00:02:28,292
+When you have pre-trained
+models available using them
+
+57
+00:02:28,292 --> 00:02:30,120
+you're not emitting any
+carbon at all, right?
+
+58
+00:02:30,120 --> 00:02:31,230
+You're not retraining anything.
+
+59
+00:02:31,230 --> 00:02:33,450
+So that's also doing your homework
+
+60
+00:02:33,450 --> 00:02:35,574
+and kind of looking around
+what already exists.
+
+61
+00:02:35,574 --> 00:02:37,890
+Fine tuning instead of
+training from scratch.
+
+62
+00:02:37,890 --> 00:02:38,723
+So once again
+
+63
+00:02:38,723 --> 00:02:40,590
+if you find a model that
+is almost what you need
+
+64
+00:02:40,590 --> 00:02:43,530
+but not quite fine tuning
+the last couple of layers
+
+65
+00:02:43,530 --> 00:02:45,210
+making it really fit your purpose instead
+
+66
+00:02:45,210 --> 00:02:46,500
+of training a large transformer
+
+67
+00:02:46,500 --> 00:02:48,810
+from scratch can really help,
+
+68
+00:02:48,810 --> 00:02:51,270
+starting with smaller experiments
+
+69
+00:02:51,270 --> 00:02:52,800
+and debugging as you go.
+
+70
+00:02:52,800 --> 00:02:54,630
+So that means, for example, training
+
+71
+00:02:54,630 --> 00:02:58,770
+figuring out data encoding,
+figuring out, you know
+
+72
+00:02:58,770 --> 00:03:01,170
+making sure that there's
+no small bugs, that you'll
+
+73
+00:03:01,170 --> 00:03:03,840
+you'll realize, you know,
+16 hours into training
+
+74
+00:03:03,840 --> 00:03:05,820
+starting small and really making sure
+
+75
+00:03:05,820 --> 00:03:08,760
+that what you're doing, what
+your code is, is stable.
+
+76
+00:03:08,760 --> 00:03:11,430
+And then finally doing
+a literature review to
+
+77
+00:03:11,430 --> 00:03:13,740
+choose hyper parameter
+ranges and then following
+
+78
+00:03:13,740 --> 00:03:15,900
+up with a random search
+instead of a grid search.
+
+79
+00:03:15,900 --> 00:03:18,420
+So random searches for hyper parameters
+
+80
+00:03:18,420 --> 00:03:21,300
+combinations have actually
+shown to be as efficient
+
+81
+00:03:21,300 --> 00:03:24,000
+in finding the optimal
+configuration as grid search.
+
+82
+00:03:24,000 --> 00:03:27,510
+But obviously you're not trying
+all possible combinations
+
+83
+00:03:27,510 --> 00:03:29,520
+you're only trying a subset of them.
+
+84
+00:03:29,520 --> 00:03:31,800
+So this can really help as well.
+
+85
+00:03:31,800 --> 00:03:32,760
+So now if we go back
+
+86
+00:03:32,760 --> 00:03:36,300
+to the original paper by
+Strubell et al. in 2019
+
+87
+00:03:36,300 --> 00:03:39,180
+the infamous five cars
+in their lifetimes paper.
+
+88
+00:03:39,180 --> 00:03:40,013
+If you just look
+
+89
+00:03:40,013 --> 00:03:43,606
+at a transformer of 200
+million parameters,
+
+90
+00:03:43,606 --> 00:03:46,950
+its carbon footprint is
+around 200 pounds of CO2
+
+91
+00:03:46,950 --> 00:03:47,940
+which is significant
+
+92
+00:03:47,940 --> 00:03:49,980
+but it's nowhere near five cars, right?
+
+93
+00:03:49,980 --> 00:03:52,893
+It's not even a transatlantic flight.
+
+94
+00:03:52,893 --> 00:03:55,020
+How it really adds up is when you're doing
+
+95
+00:03:55,020 --> 00:03:56,190
+neural architecture search
+
+96
+00:03:56,190 --> 00:03:58,560
+when you're doing hyper
+parameter tuning, and
+
+97
+00:03:58,560 --> 00:04:00,930
+this is trying all possible combinations
+
+98
+00:04:00,930 --> 00:04:01,763
+et cetera, et cetera.
+
+99
+00:04:01,763 --> 00:04:02,596
+And this is where
+
+100
+00:04:02,596 --> 00:04:05,400
+like the 600,000 pounds of CO2 came from.
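A hedged illustration of the random-search idea mentioned in the cues above: instead of exhausting every grid combination, sample a handful of configurations. The ranges and trial count are illustrative only, not taken from the video.

    import random

    # A full grid over these ranges would cost 4 * 3 * 3 = 36 training runs.
    learning_rates = [1e-5, 3e-5, 5e-5, 1e-4]
    batch_sizes = [8, 16, 32]
    weight_decays = [0.0, 0.01, 0.1]

    n_trials = 5  # random search only samples a small subset of the grid
    for _ in range(n_trials):
        config = {
            "learning_rate": random.choice(learning_rates),
            "batch_size": random.choice(batch_sizes),
            "weight_decay": random.choice(weight_decays),
        }
        print("would launch a training run with", config)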
+ +101 +00:04:05,400 --> 00:04:08,490 +So this is really where things add up. + +102 +00:04:08,490 --> 00:04:11,880 +So, but if you're doing things +mindfully and conscientiously + +103 +00:04:11,880 --> 00:04:16,410 +then your carbon footprint +wont be as big as, + +104 +00:04:16,410 --> 00:04:20,040 +as the paper implied, some tools to figure + +105 +00:04:20,040 --> 00:04:22,111 +out how much CO2 exactly you're emitting. + +106 +00:04:22,111 --> 00:04:24,270 +There's a web-based tool called machine + +107 +00:04:24,270 --> 00:04:26,430 +learning submissions +calculator, which allows you + +108 +00:04:26,430 --> 00:04:29,010 +to manually input, for example, +which hardware you used + +109 +00:04:29,010 --> 00:04:30,488 +how many hours you used it for + +110 +00:04:30,488 --> 00:04:34,260 +where it was located +locally or in the cloud. + +111 +00:04:34,260 --> 00:04:35,640 +And then it's gonna give you an estimate + +112 +00:04:35,640 --> 00:04:37,560 +of how much CO2 you emitted. + +113 +00:04:37,560 --> 00:04:40,200 +Another tool that does +this programmatically, + +114 +00:04:40,200 --> 00:04:41,190 +is called code carbon. + +115 +00:04:41,190 --> 00:04:45,112 +So you can PIP install it, you +can, you can go to the GitHub + +116 +00:04:45,112 --> 00:04:48,120 +and essentially it runs +in parallel to your code. + +117 +00:04:48,120 --> 00:04:49,085 +So essentially you call it + +118 +00:04:49,085 --> 00:04:51,060 +and then you do all your training. + +119 +00:04:51,060 --> 00:04:53,760 +And then at the end it's +gonna give you an estimate + +120 +00:04:53,760 --> 00:04:57,210 +a CSV file with an +estimate of your emissions. + +121 +00:04:57,210 --> 00:04:59,250 +And it's gonna give you some comparisons. + +122 +00:04:59,250 --> 00:05:01,230 +It's got a visual UI +where you can really look + +123 +00:05:01,230 --> 00:05:04,680 +at how this compares to +driving a car or watching TV. + +124 +00:05:04,680 --> 00:05:06,060 +So it can give you an idea + +125 +00:05:06,060 --> 00:05:07,740 +of the scope of your emissions as well. + +126 +00:05:07,740 --> 00:05:09,930 +And actually, code carbon is +already integrated into auto + +127 +00:05:09,930 --> 00:05:12,270 +and LP and hopefully +people will be using it + +128 +00:05:12,270 --> 00:05:15,240 +out of the box and easily +tracking their emissions all + +129 +00:05:15,240 --> 00:05:17,523 +through training and +deploying transformers. + diff --git a/subtitles/en/03_what-is-transfer-learning.srt b/subtitles/en/03_what-is-transfer-learning.srt index 29212cdc2..80c9ddeac 100644 --- a/subtitles/en/03_what-is-transfer-learning.srt +++ b/subtitles/en/03_what-is-transfer-learning.srt @@ -1,203 +1,396 @@ -1 -00:00:05,440 --> 00:00:07,120 -What is transfer learning?   - -2 -00:00:09,360 --> 00:00:13,760 -The idea of Transfer Learning is to leverage the  -knowledge acquired by a model trained with lots of   - -3 -00:00:13,760 --> 00:00:20,720 -data on another task. The model A will be trained  -specifically for task A. Now, let's say you want   - -4 -00:00:20,720 --> 00:00:26,320 -to train a model B for a different task. One  -option would be to train the model from scratch.   - -5 -00:00:27,120 --> 00:00:34,240 -This could take lots of computation, time and  -data. Instead, we could initialize model B with   - -6 -00:00:34,240 --> 00:00:38,880 -the same weights as model A, transferring  -the knowledge of model A on task B.   - -7 -00:00:40,800 --> 00:00:47,040 -When training from scratch, all the model’s  -weight are initialized randomly. 
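Tying back to the carbon-footprint video above: the code carbon package it mentions is used roughly as sketched below. This is based on its documented EmissionsTracker interface; details such as the output file name can differ between versions.

    # pip install codecarbon
    from codecarbon import EmissionsTracker

    tracker = EmissionsTracker()  # runs alongside your own code
    tracker.start()
    # ... your training loop goes here ...
    emissions = tracker.stop()  # estimate in kg of CO2-eq, also written to emissions.csv
    print(f"Estimated {emissions} kg of CO2-eq for this run")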
In this example,   - -8 -00:00:47,040 --> 00:00:52,480 -we are training a BERT model on the task of  -recognizing if two sentences are similar or not.   - -9 -00:00:53,680 --> 00:00:58,560 -On the left, it’s trained from scratch, and  -on the right, it’s fine-tuning a pretrained   - -10 -00:00:58,560 --> 00:01:04,080 -model. As we can see, using transfer learning  -and the pretrained model yields better results.   - -11 -00:01:04,960 --> 00:01:09,360 -And it doesn’t matter if we train longer, the  -training from scratch is capped around 70%   - -12 -00:01:09,360 --> 00:01:13,040 -accuracy while the pretrained  -model beats the 86% easily.   - -13 -00:01:14,240 --> 00:01:18,720 -This is because pretrained models are usually  -trained on large amounts of data that provide   - -14 -00:01:18,720 --> 00:01:22,720 -the model with a statistical understanding  -of the language used during pretraining.   - -15 -00:01:24,240 --> 00:01:28,960 -In computer vision, transfer learning has been  -applied successfully for almost ten years.   - -16 -00:01:29,840 --> 00:01:35,840 -Models are frequently pretrained on ImageNet, a  -dataset containing 1.2 millions of photo images.   - -17 -00:01:36,880 --> 00:01:41,680 -Each image is classified by one of  -1000 labels. Training like this,   - -18 -00:01:42,240 --> 00:01:48,960 -on labeled data is called supervised  -learning. In Natural Language Processing,   - -19 -00:01:48,960 --> 00:01:54,320 -transfer learning is a bit more recent. A key  -difference with ImageNet is that the pretraining   - -20 -00:01:54,320 --> 00:01:59,280 -is usually self-supervised, which means it  -doesn’t require humans annotations for the labels.   - -21 -00:02:00,480 --> 00:02:05,040 -A very common pretraining objective is  -to guess the next word in a sentence,   - -22 -00:02:05,040 --> 00:02:08,720 -which only requires lots and  -lots of text. GPT-2 for instance,   - -23 -00:02:09,360 --> 00:02:16,720 -was pretrained this way using the content of 45  -millions links posted by users on Reddit. Another   - -24 -00:02:16,720 --> 00:02:21,520 -example of self-supervised pretraining objective  -is to predict the value of randomly masked words,   - -25 -00:02:22,160 --> 00:02:25,360 -which is similar to fill-in-the-blank  -tests you may have done in school.   - -26 -00:02:26,560 --> 00:02:31,520 -BERT was pretrained this way using the English  -Wikipedia and 11,000 unpublished books.   - -27 -00:02:32,960 --> 00:02:38,880 -In practice, transfer learning is applied on a  -given model by throwing away its head, that is,   - -28 -00:02:38,880 --> 00:02:43,680 -its last layers focused on the pretraining  -objective, and replacing it with a new,   - -29 -00:02:43,680 --> 00:02:50,000 -randomly initialized, head suitable for the task  -at hand. For instance, when we fine-tuned a BERT   - -30 -00:02:50,000 --> 00:02:55,440 -model earlier, we removed the head that classified  -mask words and replaced it with a classifier with   - -31 -00:02:55,440 --> 00:03:01,680 -2 outputs, since our task had two labels. To  -be as efficient as possible, the pretrained   - -32 -00:03:01,680 --> 00:03:07,200 -model used should be as similar as possible  -to the task it’s fine-tuned on. For instance,   - -33 -00:03:07,200 --> 00:03:12,720 -if the problem it’s to classify German sentences,  -it’s best to use a German pretrained model.   - -34 -00:03:14,160 --> 00:03:19,200 -But with the good comes the bad. 
The pretrained  -model does not only transfer its knowledge,   - -35 -00:03:19,200 --> 00:03:25,440 -but also any bias it may contain. ImageNet mostly  -contains images coming from the United States and   - -36 -00:03:25,440 --> 00:03:29,680 -Western Europe, so models fine-tuned with it  -usually will perform better on images from   - -37 -00:03:29,680 --> 00:03:35,280 -these countries. OpenAI also studied the  -bias in the predictions of its GPT-3 model   - -38 -00:03:35,840 --> 00:03:40,960 -(which was pretrained using the guess the next  -work objective). Changing the gender of the prompt   - -39 -00:03:40,960 --> 00:03:46,720 -from "He was very" to "She was very" changed  -the predictions from mostly neutral adjectives   - -40 -00:03:47,360 --> 00:03:52,240 -to almost only physical ones. In  -their model card of the GPT-2 model,   - -41 -00:03:52,240 --> 00:03:59,840 -OpenAI also acknowledges its bias and discourages  -its use in systems that interact with humans. +1 +00:00:00,189 --> 00:00:02,856 +(air whooshing) + +2 +00:00:05,550 --> 00:00:07,293 +- What is transfer learning? + +3 +00:00:09,480 --> 00:00:10,920 +The idea of transfer learning + +4 +00:00:10,920 --> 00:00:12,570 +is to leverage the knowledge acquired + +5 +00:00:12,570 --> 00:00:15,543 +by a model trained with lots +of data on another task. + +6 +00:00:16,410 --> 00:00:20,130 +The model A will be trained +specifically for task A. + +7 +00:00:20,130 --> 00:00:22,200 +Now let's say you want to train a model B + +8 +00:00:22,200 --> 00:00:23,970 +for a different task. + +9 +00:00:23,970 --> 00:00:27,330 +One option would be to train +the model from scratch. + +10 +00:00:27,330 --> 00:00:30,633 +This could take lots of +computation, time and data. + +11 +00:00:31,470 --> 00:00:34,260 +Instead, we could initialize model B + +12 +00:00:34,260 --> 00:00:36,570 +with the same weights as model A, + +13 +00:00:36,570 --> 00:00:39,213 +transferring the knowledge +of model A on task B. + +14 +00:00:41,040 --> 00:00:42,690 +When training from scratch, + +15 +00:00:42,690 --> 00:00:45,870 +all the model's weight +are initialized randomly. + +16 +00:00:45,870 --> 00:00:48,870 +In this example, we are +training a BERT model + +17 +00:00:48,870 --> 00:00:50,220 +on the task of recognizing + +18 +00:00:50,220 --> 00:00:52,203 +if two sentences are similar or not. + +19 +00:00:54,116 --> 00:00:56,730 +On the left, it's trained from scratch, + +20 +00:00:56,730 --> 00:01:00,000 +and on the right it's +fine-tuning a pretrained model. + +21 +00:01:00,000 --> 00:01:02,220 +As we can see, using transfer learning + +22 +00:01:02,220 --> 00:01:05,160 +and the pretrained model +yields better results. + +23 +00:01:05,160 --> 00:01:07,140 +And it doesn't matter if we train longer. + +24 +00:01:07,140 --> 00:01:10,620 +The training from scratch is +capped around 70% accuracy + +25 +00:01:10,620 --> 00:01:13,293 +while the pretrained model +beats the 86% easily. + +26 +00:01:14,460 --> 00:01:16,140 +This is because pretrained models + +27 +00:01:16,140 --> 00:01:18,420 +are usually trained on +large amounts of data + +28 +00:01:18,420 --> 00:01:21,000 +that provide the model with +a statistical understanding + +29 +00:01:21,000 --> 00:01:23,413 +of the language used during pretraining. + +30 +00:01:24,450 --> 00:01:25,950 +In computer vision, + +31 +00:01:25,950 --> 00:01:28,080 +transfer learning has +been applied successfully + +32 +00:01:28,080 --> 00:01:30,060 +for almost ten years. 
+ +33 +00:01:30,060 --> 00:01:32,850 +Models are frequently +pretrained on ImageNet, + +34 +00:01:32,850 --> 00:01:36,153 +a dataset containing 1.2 +millions of photo images. + +35 +00:01:37,170 --> 00:01:41,130 +Each image is classified +by one of 1000 labels. + +36 +00:01:41,130 --> 00:01:44,010 +Training like this, on labeled data + +37 +00:01:44,010 --> 00:01:45,663 +is called supervised learning. + +38 +00:01:47,340 --> 00:01:49,140 +In Natural Language Processing, + +39 +00:01:49,140 --> 00:01:51,870 +transfer learning is a bit more recent. + +40 +00:01:51,870 --> 00:01:54,480 +A key difference with ImageNet +is that the pretraining + +41 +00:01:54,480 --> 00:01:56,460 +is usually self-supervised, + +42 +00:01:56,460 --> 00:01:58,770 +which means it doesn't +require humans annotations + +43 +00:01:58,770 --> 00:01:59,673 +for the labels. + +44 +00:02:00,780 --> 00:02:02,700 +A very common pretraining objective + +45 +00:02:02,700 --> 00:02:05,310 +is to guess the next word in a sentence. + +46 +00:02:05,310 --> 00:02:07,710 +Which only requires lots and lots of text. + +47 +00:02:07,710 --> 00:02:10,710 +GPT-2 for instance, +was pretrained this way + +48 +00:02:10,710 --> 00:02:12,900 +using the content of 45 millions links + +49 +00:02:12,900 --> 00:02:14,673 +posted by users on Reddit. + +50 +00:02:16,560 --> 00:02:19,590 +Another example of self-supervised +pretraining objective + +51 +00:02:19,590 --> 00:02:22,470 +is to predict the value +of randomly masked words. + +52 +00:02:22,470 --> 00:02:24,540 +Which is similar to +fill-in-the-blank tests + +53 +00:02:24,540 --> 00:02:26,760 +you may have done in school. + +54 +00:02:26,760 --> 00:02:29,880 +BERT was pretrained this way +using the English Wikipedia + +55 +00:02:29,880 --> 00:02:31,893 +and 11,000 unpublished books. + +56 +00:02:33,120 --> 00:02:36,450 +In practice, transfer learning +is applied on a given model + +57 +00:02:36,450 --> 00:02:39,090 +by throwing away its head, that is, + +58 +00:02:39,090 --> 00:02:42,150 +its last layers focused on +the pretraining objective, + +59 +00:02:42,150 --> 00:02:45,360 +and replacing it with a new, +randomly initialized head + +60 +00:02:45,360 --> 00:02:46,860 +suitable for the task at hand. + +61 +00:02:47,970 --> 00:02:51,570 +For instance, when we +fine-tuned a BERT model earlier, + +62 +00:02:51,570 --> 00:02:54,060 +we removed the head that +classified mask words + +63 +00:02:54,060 --> 00:02:56,790 +and replaced it with a +classifier with 2 outputs. + +64 +00:02:56,790 --> 00:02:58,563 +Since our task had two labels. + +65 +00:02:59,700 --> 00:03:02,490 +To be as efficient as possible, +the pretrained model used + +66 +00:03:02,490 --> 00:03:03,770 +should be as similar as possible + +67 +00:03:03,770 --> 00:03:06,270 +to the task it's fine-tuned on. + +68 +00:03:06,270 --> 00:03:08,190 +For instance, if the problem + +69 +00:03:08,190 --> 00:03:10,860 +is to classify German sentences, + +70 +00:03:10,860 --> 00:03:13,053 +it's best to use a +German pretrained model. + +71 +00:03:14,370 --> 00:03:16,649 +But with the good comes the bad. + +72 +00:03:16,649 --> 00:03:19,380 +The pretrained model does not +only transfer its knowledge, + +73 +00:03:19,380 --> 00:03:21,693 +but also any bias it may contain. + +74 +00:03:22,530 --> 00:03:24,300 +ImageNet mostly contains images + +75 +00:03:24,300 --> 00:03:26,850 +coming from the United +States and Western Europe. 
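A minimal sketch of the head replacement described in the cues above, assuming the transformers library with a PyTorch backend is installed; the checkpoint name and the two-label setup mirror the BERT sentence-pair example from the video.

    from transformers import AutoModelForSequenceClassification

    # Loads the pretrained BERT body and adds a new, randomly initialized
    # classification head with 2 outputs; the pretraining head is discarded.
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=2
    )
    # The warning about newly initialized weights is expected: that new head
    # is exactly the part that gets trained when fine-tuning on the downstream task.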
+ +76 +00:03:26,850 --> 00:03:28,020 +So models fine-tuned with it + +77 +00:03:28,020 --> 00:03:31,710 +usually will perform better on +images from these countries. + +78 +00:03:31,710 --> 00:03:33,690 +OpenAI also studied the bias + +79 +00:03:33,690 --> 00:03:36,120 +in the predictions of its GPT-3 model + +80 +00:03:36,120 --> 00:03:36,953 +which was pretrained + +81 +00:03:36,953 --> 00:03:38,750 +using the guess the next word objective. + +82 +00:03:39,720 --> 00:03:41,040 +Changing the gender of the prompt + +83 +00:03:41,040 --> 00:03:44,250 +from he was very to she was very + +84 +00:03:44,250 --> 00:03:47,550 +changed the predictions from +mostly neutral adjectives + +85 +00:03:47,550 --> 00:03:49,233 +to almost only physical ones. + +86 +00:03:50,400 --> 00:03:52,367 +In their model card of the GPT-2 model, + +87 +00:03:52,367 --> 00:03:54,990 +OpenAI also acknowledges its bias + +88 +00:03:54,990 --> 00:03:56,730 +and discourages its use + +89 +00:03:56,730 --> 00:03:58,803 +in systems that interact with humans. + +90 +00:04:01,040 --> 00:04:03,707 +(air whooshing) + diff --git a/subtitles/en/04_the-transformer-architecture.srt b/subtitles/en/04_the-transformer-architecture.srt index 19fd99fb0..aecc03a08 100644 --- a/subtitles/en/04_the-transformer-architecture.srt +++ b/subtitles/en/04_the-transformer-architecture.srt @@ -1,138 +1,280 @@ -1 -00:00:04,960 --> 00:00:07,120 -Let's study the transformer architecture.   - -2 -00:00:08,960 --> 00:00:13,840 -This video is the introductory video to  -the encoders, decoders, and encoder-decoder   - -3 -00:00:13,840 --> 00:00:18,640 -series of videos. In this series, we'll try to  -understand what makes a Transformer network,   - -4 -00:00:18,640 --> 00:00:24,720 -and we'll try to explain it in simple, high-level  -terms. No understanding of neural networks is   - -5 -00:00:24,720 --> 00:00:29,840 -necessary, only an understanding of  -basic vectors and tensors may help.   - -6 -00:00:32,320 --> 00:00:36,480 -To get started, we'll take up this diagram  -from the original transformer paper,   - -7 -00:00:36,480 --> 00:00:42,640 -entitled "Attention is all you need". As we'll  -see here we can leverage only some parts of it,   - -8 -00:00:42,640 --> 00:00:48,080 -according to what we're trying to do. We won't  -dive into the specific layers building up that   - -9 -00:00:48,080 --> 00:00:52,560 -architecture, but we'll try to understand the  -different ways this architecture can be used.   - -10 -00:00:54,960 --> 00:00:59,760 -Let's first start by splitting that architecture  -into two parts. On the left we have the encoder,   - -11 -00:00:59,760 --> 00:01:04,320 -and on the right, the decoder. These two can  -be used together, but they can also be used   - -12 -00:01:04,320 --> 00:01:11,280 -independently! Let's understand how these work:  -The encoder accepts inputs that represent text.   - -13 -00:01:11,280 --> 00:01:17,200 -It converts this text, these words, into numerical  -representations. These numerical representations   - -14 -00:01:17,200 --> 00:01:23,120 -can also be called embeddings, or features. We'll  -see that it uses the self-attention mechanism as   - -15 -00:01:23,120 --> 00:01:29,840 -its main component. We recommend you check out the  -video on encoders especially to understand what is   - -16 -00:01:29,840 --> 00:01:36,640 -this numerical representation, as well as how it  -works. We'll study the self-attention mechanism as   - -17 -00:01:36,640 --> 00:01:44,000 -well as its bi-directional properties. 
The decoder  -is similar to the encoder: it can also accept   - -18 -00:01:44,000 --> 00:01:47,200 -the same inputs as the encoder: inputs that  -represent text. It uses a similar mechanism as   - -19 -00:01:47,200 --> 00:01:53,200 -the encoder, which is the masked self-attention  -as well. It differs from the encoder due to its   - -20 -00:01:53,200 --> 00:01:59,200 -uni-directional property, and is traditionally  -used in an auto-regressive manner. Here too,   - -21 -00:01:59,200 --> 00:02:03,600 -we recommend you check out the video on decoders  -especially to understand how all of this works.   - -22 -00:02:06,560 --> 00:02:11,120 -Combining the two parts results in what is known  -as an encoder-decoder, or a sequence-to-sequence   - -23 -00:02:11,120 --> 00:02:16,640 -transformer. The encoder accepts inputs and  -computes a high-level representation of those   - -24 -00:02:16,640 --> 00:02:22,640 -inputs. These outputs are then passed to the  -decoder. The decoder uses the encoder's output   - -25 -00:02:22,640 --> 00:02:27,680 -alongside other inputs, in order to generate  -a prediction. It then predicts an output,   - -26 -00:02:27,680 --> 00:02:32,000 -which it will re-use in future iterations,  -hence the term "auto-regressive".   - -27 -00:02:33,040 --> 00:02:36,480 -Finally, to get an understanding  -of the encoder-decoders as a whole,   - -28 -00:02:36,480 --> 00:02:44,080 -we recommend you check out  -the video on encoder-decoders. +1 +00:00:00,000 --> 00:00:02,750 +(logo whooshing) + +2 +00:00:05,010 --> 00:00:07,323 +- Let's study the +transformer architecture. + +3 +00:00:09,150 --> 00:00:12,030 +This video is the introductory +video to the encoders, + +4 +00:00:12,030 --> 00:00:15,510 +decoders, and encoder-decoder +series of videos. + +5 +00:00:15,510 --> 00:00:16,343 +In this series, + +6 +00:00:16,343 --> 00:00:18,900 +we'll try to understand what +makes a transformer network, + +7 +00:00:18,900 --> 00:00:22,770 +and we'll try to explain it +in simple, high-level terms. + +8 +00:00:22,770 --> 00:00:25,800 +No advanced understanding of +neural networks is necessary, + +9 +00:00:25,800 --> 00:00:29,343 +but an understanding of basic +vectors and tensors may help. + +10 +00:00:32,250 --> 00:00:33,270 +To get started, + +11 +00:00:33,270 --> 00:00:34,530 +we'll take up this diagram + +12 +00:00:34,530 --> 00:00:36,630 +from the original transformer paper, + +13 +00:00:36,630 --> 00:00:40,140 +entitled "Attention Is All +You Need", by Vaswani et al. + +14 +00:00:40,140 --> 00:00:41,010 +As we'll see here, + +15 +00:00:41,010 --> 00:00:42,780 +we can leverage only some parts of it, + +16 +00:00:42,780 --> 00:00:44,630 +according to what we're trying to do. + +17 +00:00:45,480 --> 00:00:47,610 +We want to dive into the specific layers, + +18 +00:00:47,610 --> 00:00:48,990 +building up that architecture, + +19 +00:00:48,990 --> 00:00:51,390 +but we'll try to understand +the different ways + +20 +00:00:51,390 --> 00:00:52,893 +this architecture can be used. + +21 +00:00:55,170 --> 00:00:56,003 +Let's first start + +22 +00:00:56,003 --> 00:00:58,260 +by splitting that +architecture into two parts. + +23 +00:00:58,260 --> 00:00:59,910 +On the left we have the encoder, + +24 +00:00:59,910 --> 00:01:01,980 +and on the right, the decoder. + +25 +00:01:01,980 --> 00:01:03,330 +These two can be used together, + +26 +00:01:03,330 --> 00:01:05,330 +but they can also be used independently. + +27 +00:01:06,180 --> 00:01:08,610 +Let's understand how these work. 
+ +28 +00:01:08,610 --> 00:01:11,460 +The encoder accepts inputs +that represent text. + +29 +00:01:11,460 --> 00:01:13,620 +It converts this text, these words, + +30 +00:01:13,620 --> 00:01:15,675 +into numerical representations. + +31 +00:01:15,675 --> 00:01:17,400 +These numerical representations + +32 +00:01:17,400 --> 00:01:20,460 +can also be called +embeddings, or features. + +33 +00:01:20,460 --> 00:01:23,100 +We'll see that it uses the +self-attention mechanism + +34 +00:01:23,100 --> 00:01:24,483 +as its main component. + +35 +00:01:25,500 --> 00:01:27,120 +We recommend you check out the video + +36 +00:01:27,120 --> 00:01:29,700 +on encoders specifically to understand + +37 +00:01:29,700 --> 00:01:31,680 +what is this numerical representation, + +38 +00:01:31,680 --> 00:01:33,690 +as well as how it works. + +39 +00:01:33,690 --> 00:01:36,660 +We'll study the self-attention +mechanism in more detail, + +40 +00:01:36,660 --> 00:01:38,913 +as well as its bi-directional properties. + +41 +00:01:40,650 --> 00:01:42,780 +The decoder is similar to the encoder. + +42 +00:01:42,780 --> 00:01:45,630 +It can also accept text inputs. + +43 +00:01:45,630 --> 00:01:48,210 +It uses a similar +mechanism as the encoder, + +44 +00:01:48,210 --> 00:01:51,150 +which is the masked +self-attention as well. + +45 +00:01:51,150 --> 00:01:52,590 +It differs from the encoder + +46 +00:01:52,590 --> 00:01:54,990 +due to its uni-directional feature + +47 +00:01:54,990 --> 00:01:58,590 +and is traditionally used in +an auto-regressive manner. + +48 +00:01:58,590 --> 00:02:01,650 +Here too, we recommend you +check out the video on decoders + +49 +00:02:01,650 --> 00:02:04,000 +especially to understand +how all of this works. + +50 +00:02:06,810 --> 00:02:07,890 +Combining the two parts + +51 +00:02:07,890 --> 00:02:10,200 +results in what is known +as an encoder-decoder, + +52 +00:02:10,200 --> 00:02:12,720 +or a sequence-to-sequence transformer. + +53 +00:02:12,720 --> 00:02:14,280 +The encoder accepts inputs + +54 +00:02:14,280 --> 00:02:17,850 +and computes a high-level +representation of those inputs. + +55 +00:02:17,850 --> 00:02:20,252 +These outputs are then +passed to the decoder. + +56 +00:02:20,252 --> 00:02:22,860 +The decoder uses the encoder's output, + +57 +00:02:22,860 --> 00:02:26,370 +alongside other inputs +to generate a prediction. + +58 +00:02:26,370 --> 00:02:27,900 +It then predicts an output, + +59 +00:02:27,900 --> 00:02:30,248 +which it will re-use in future iterations, + +60 +00:02:30,248 --> 00:02:32,662 +hence the term, auto-regressive. + +61 +00:02:32,662 --> 00:02:34,740 +Finally, to get an understanding + +62 +00:02:34,740 --> 00:02:36,690 +of the encoder-decoders as a whole, + +63 +00:02:36,690 --> 00:02:39,670 +we recommend you check out +the video on encoder-decoders. + +64 +00:02:39,670 --> 00:02:42,420 +(logo whooshing) + diff --git a/subtitles/en/05_transformer-models-encoders.srt b/subtitles/en/05_transformer-models-encoders.srt index b438afdbe..1171958b2 100644 --- a/subtitles/en/05_transformer-models-encoders.srt +++ b/subtitles/en/05_transformer-models-encoders.srt @@ -1,224 +1,454 @@ -1 -00:00:04,320 --> 00:00:09,120 -In this video, we'll study the encoder  -architecture. An example of a popular   - -2 -00:00:09,120 --> 00:00:13,120 -encoder-only architecture is BERT, which  -is the most popular model of its kind.   - -3 -00:00:14,400 --> 00:00:20,880 -Let's first start by understanding how it works.  -We'll use a small example, using three words. 
We   - -4 -00:00:20,880 --> 00:00:27,040 -use these as inputs, and pass them through the  -encoder. We retrieve a numerical representation   - -5 -00:00:27,040 --> 00:00:34,160 -of each word. Here, for example, the encoder  -converts the three words “Welcome to NYC”   - -6 -00:00:34,160 --> 00:00:40,880 -in these three sequences of numbers. The encoder  -outputs exactly one sequence of numbers per input   - -7 -00:00:40,880 --> 00:00:46,880 -word. This numerical representation can also be  -called a "Feature vector", or "Feature tensor".  - -8 -00:00:48,880 --> 00:00:53,680 -Let's dive in this representation. It contains  -one vector per word that was passed through the   - -9 -00:00:53,680 --> 00:00:59,680 -encoder. Each of these vector is a numerical  -representation of the word in question.   - -10 -00:01:00,880 --> 00:01:06,400 -The dimension of that vector is defined by the  -architecture of the model, for the base BERT   - -11 -00:01:06,400 --> 00:01:15,280 -model, it is 768. These representations contain  -the value of a word; but contextualized. For   - -12 -00:01:15,280 --> 00:01:21,280 -example, the vector attributed to the word "to",  -isn't the representation of only the "to" word.   - -13 -00:01:22,160 --> 00:01:29,680 -It also takes into account the words around it,  -which we call the “context”.As in, it looks to the   - -14 -00:01:29,680 --> 00:01:34,960 -left context, the word on the left of the one  -we're studying (here the word "Welcome") and   - -15 -00:01:34,960 --> 00:01:41,120 -the context on the right (here the word "NYC") and  -outputs a value for the word, within its context.   - -16 -00:01:41,840 --> 00:01:49,280 -It is therefore a contextualized value. One  -could say that the vector of 768 values holds the   - -17 -00:01:49,280 --> 00:01:55,840 -"meaning" of that word in the text. How it does  -this is thanks to the self-attention mechanism.   - -18 -00:01:57,120 --> 00:02:02,240 -The self-attention mechanism relates to different  -positions (or different words) in a single   - -19 -00:02:02,240 --> 00:02:08,320 -sequence, in order to compute a representation  -of that sequence. As we've seen before, this   - -20 -00:02:08,320 --> 00:02:13,600 -means that the resulting representation of a word  -has been affected by other words in the sequence.   - -21 -00:02:15,600 --> 00:02:20,160 -We won't dive into the specifics here, but we'll  -offer some further readings if you want to get   - -22 -00:02:20,160 --> 00:02:26,480 -a better understanding at what happens under  -the hood. So when should one use an encoder?   - -23 -00:02:27,040 --> 00:02:33,680 -Encoders can be used as standalone models in a  -wide variety of tasks. For example BERT, arguably   - -24 -00:02:33,680 --> 00:02:38,800 -the most famous transformer model, is a standalone  -encoder model and at the time of release,   - -25 -00:02:38,800 --> 00:02:44,000 -beat the state of the art in many sequence  -classification tasks, question answering tasks,   - -26 -00:02:44,000 --> 00:02:50,240 -and masked language modeling, to only cite a  -few. The idea is that encoders are very powerful   - -27 -00:02:50,240 --> 00:02:55,920 -at extracting vectors that carry meaningful  -information about a sequence. This vector can   - -28 -00:02:55,920 --> 00:02:59,680 -then be handled down the road by additional  -layers of neurons to make sense of them.   - -29 -00:03:01,200 --> 00:03:04,240 -Let's take a look at some examples  -where encoders really shine.   
-

-30
-00:03:06,080 --> 00:03:11,760
-First of all, Masked Language Modeling, or
-MLM. It's the task of predicting a hidden word

-31
-00:03:11,760 --> 00:03:18,560
-in a sequence of words. Here, for example, we have
-hidden the word between "My" and "is". This is one

-32
-00:03:18,560 --> 00:03:24,000
-of the objectives with which BERT was trained: it
-was trained to predict hidden words in a sequence.

-33
-00:03:25,040 --> 00:03:30,160
-Encoders shine in this scenario in particular,
-as bidirectional information is crucial here.

-34
-00:03:30,960 --> 00:03:35,520
-If we didn't have the words on the right (is,
-Sylvain, and the dot), then there is very little

-35
-00:03:35,520 --> 00:03:41,200
-chance that BERT would have been able to identify
-"name" as the correct word. The encoder needs to

-36
-00:03:41,200 --> 00:03:46,720
-have a good understanding of the sequence in order
-to predict a masked word, as even if the text is

-37
-00:03:46,720 --> 00:03:52,080
-grammatically correct, It does not necessarily
-make sense in the context of the sequence.

-38
-00:03:54,960 --> 00:03:58,720
-As mentioned earlier, encoders are
-good at doing sequence classification.

-39
-00:03:59,360 --> 00:04:03,560
-Sentiment analysis is an example
-of a sequence classification task.

-40
-00:04:04,240 --> 00:04:11,040
-The model's aim is to identify the sentiment of
-a sequence – it can range from giving a sequence

-41
-00:04:11,040 --> 00:04:16,720
-a rating from one to five stars if doing review
-analysis, to giving a positive or negative rating

-42
-00:04:16,720 --> 00:04:22,800
-to a sequence, which is what is shown here.
-For example here, given the two sequences,

-43
-00:04:22,800 --> 00:04:28,800
-we use the model to compute a prediction and to
-classify the sequences among these two classes:

-44
-00:04:28,800 --> 00:04:35,040
-positive and negative. While the two sequences
-are very similar, containing the same words,

-45
-00:04:35,040 --> 00:04:41,840
-the meaning is different – and the encoder
-model is able to grasp that difference.
+1
+00:00:00,253 --> 00:00:03,003
+(intro striking)
+
+2
+00:00:04,440 --> 00:00:07,830
+- In this video, we'll study
+the encoder architecture.
+
+3
+00:00:07,830 --> 00:00:11,070
+An example of a popular encoder
+only architecture is BERT
+
+4
+00:00:11,070 --> 00:00:13,323
+which is the most popular
+model of its kind.
+
+5
+00:00:14,550 --> 00:00:16,950
+Let's first start by
+understanding how it works.
+
+6
+00:00:18,360 --> 00:00:20,910
+We'll use a small example
+using three words.
+
+7
+00:00:20,910 --> 00:00:23,823
+We use these as inputs and
+pass them through the encoder.
+
+8
+00:00:25,290 --> 00:00:28,173
+We retrieve a numerical
+representation of each word.
+
+9
+00:00:29,970 --> 00:00:32,700
+Here, for example, the encoder
+converts those three words,
+
+10
+00:00:32,700 --> 00:00:37,350
+Welcome to NYC, in these
+three sequences of numbers.
+
+11
+00:00:37,350 --> 00:00:40,350
+The encoder outputs exactly
+one sequence of numbers
+
+12
+00:00:40,350 --> 00:00:41,493
+per input word.
+
+13
+00:00:42,330 --> 00:00:44,880
+This numerical representation
+can also be called
+
+14
+00:00:44,880 --> 00:00:47,163
+a feature vector, or a feature tensor.
+
+15
+00:00:49,080 --> 00:00:51,030
+Let's dive into this representation.
+
+16
+00:00:51,030 --> 00:00:52,740
+It contains one vector per word
+
+17
+00:00:52,740 --> 00:00:54,540
+that was passed through the encoder.
+ +18 +00:00:56,130 --> 00:00:58,620 +Each of these vector is a +numerical representation + +19 +00:00:58,620 --> 00:01:00,033 +of the word in question. + +20 +00:01:01,080 --> 00:01:03,300 +The dimension of that vector is defined + +21 +00:01:03,300 --> 00:01:05,520 +by the architecture of the model. + +22 +00:01:05,520 --> 00:01:08,703 +For the base BERT model, it is 768. + +23 +00:01:10,650 --> 00:01:13,230 +These representations +contain the value of a word, + +24 +00:01:13,230 --> 00:01:15,240 +but contextualized. + +25 +00:01:15,240 --> 00:01:18,570 +For example, the vector +attributed to the word "to" + +26 +00:01:18,570 --> 00:01:22,290 +isn't the representation +of only the "to" word. + +27 +00:01:22,290 --> 00:01:25,650 +It also takes into account +the words around it + +28 +00:01:25,650 --> 00:01:27,363 +which we call the context. + +29 +00:01:28,650 --> 00:01:30,780 +As in it looks to the left context, + +30 +00:01:30,780 --> 00:01:32,970 +the words on the left of +the one we're studying, + +31 +00:01:32,970 --> 00:01:34,980 +here the word "Welcome", + +32 +00:01:34,980 --> 00:01:37,497 +and the context on the +right, here the word "NYC", + +33 +00:01:38,348 --> 00:01:42,000 +and it outputs a value for +the word given its context. + +34 +00:01:42,000 --> 00:01:45,420 +It is therefore a contextualized value. + +35 +00:01:45,420 --> 00:01:48,810 +One could say that the +vector of 768 values + +36 +00:01:48,810 --> 00:01:51,993 +holds the meaning of the +word within the text. + +37 +00:01:53,310 --> 00:01:56,073 +It does this thanks to the +self-attention mechanism. + +38 +00:01:57,240 --> 00:02:00,630 +The self-attention mechanism +relates to different positions, + +39 +00:02:00,630 --> 00:02:02,850 +or different words in a single sequence + +40 +00:02:02,850 --> 00:02:06,003 +in order to compute a +representation of that sequence. + +41 +00:02:07,200 --> 00:02:09,000 +As we've seen before, this means that + +42 +00:02:09,000 --> 00:02:11,130 +the resulting representation of a word + +43 +00:02:11,130 --> 00:02:13,983 +has been affected by other +words in the sequence. + +44 +00:02:15,840 --> 00:02:18,030 +We won't dive into the specifics here + +45 +00:02:18,030 --> 00:02:19,680 +which will offer some further readings + +46 +00:02:19,680 --> 00:02:21,330 +if you want to get a better understanding + +47 +00:02:21,330 --> 00:02:22,953 +at what happens under the hood. + +48 +00:02:25,050 --> 00:02:27,480 +So why should one use and encoder? + +49 +00:02:27,480 --> 00:02:29,370 +Encoders can be used as stand-alone models + +50 +00:02:29,370 --> 00:02:31,263 +in a wide variety of tasks. + +51 +00:02:32,100 --> 00:02:33,360 +For example, BERT, + +52 +00:02:33,360 --> 00:02:35,670 +arguably the most famous +transformer model, + +53 +00:02:35,670 --> 00:02:37,590 +is a standalone encoder model, + +54 +00:02:37,590 --> 00:02:38,820 +and at the time of release, + +55 +00:02:38,820 --> 00:02:40,440 +it'd be the state of the art + +56 +00:02:40,440 --> 00:02:42,780 +in many sequence classification tasks, + +57 +00:02:42,780 --> 00:02:44,190 +question answering tasks, + +58 +00:02:44,190 --> 00:02:46,743 +and mask language modeling +to only cite of few. + +59 +00:02:48,150 --> 00:02:50,460 +The idea is that encoders +are very powerful + +60 +00:02:50,460 --> 00:02:52,470 +at extracting vectors that carry + +61 +00:02:52,470 --> 00:02:55,350 +meaningful information about a sequence. 
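
A minimal sketch of the feature extraction described above, assuming the bert-base-uncased checkpoint and PyTorch (the exact sequence length depends on how the sentence is tokenized):

import torch
from transformers import AutoTokenizer, AutoModel

# Assumption: bert-base-uncased as the encoder-only checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

inputs = tokenizer("Welcome to NYC", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# One contextualized vector per token; the hidden size is 768 for base BERT
print(outputs.last_hidden_state.shape)  # e.g. torch.Size([1, 5, 768])
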
+ +62 +00:02:55,350 --> 00:02:57,870 +This vector can then be +handled down the road + +63 +00:02:57,870 --> 00:03:00,070 +by additional neurons +to make sense of them. + +64 +00:03:01,380 --> 00:03:02,850 +Let's take a look at some examples + +65 +00:03:02,850 --> 00:03:04,563 +where encoder really shine. + +66 +00:03:06,210 --> 00:03:09,900 +First of all, Masked +Language Modeling, or MLM. + +67 +00:03:09,900 --> 00:03:11,970 +It's the task of predicting a hidden word + +68 +00:03:11,970 --> 00:03:13,590 +in a sequence of word. + +69 +00:03:13,590 --> 00:03:15,630 +Here, for example, we have hidden the word + +70 +00:03:15,630 --> 00:03:17,247 +between "My" and "is". + +71 +00:03:18,270 --> 00:03:21,120 +This is one of the objectives +with which BERT was trained. + +72 +00:03:21,120 --> 00:03:24,393 +It was trained to predict +hidden words in a sequence. + +73 +00:03:25,230 --> 00:03:27,930 +Encoders shine in this +scenario in particular + +74 +00:03:27,930 --> 00:03:31,140 +as bi-directional +information is crucial here. + +75 +00:03:31,140 --> 00:03:32,947 +If we didn't have the words on the right, + +76 +00:03:32,947 --> 00:03:34,650 +"is", "Sylvain" and the ".", + +77 +00:03:34,650 --> 00:03:35,940 +then there is very little chance + +78 +00:03:35,940 --> 00:03:38,580 +that BERT would have been +able to identify name + +79 +00:03:38,580 --> 00:03:40,500 +as the correct word. + +80 +00:03:40,500 --> 00:03:42,270 +The encoder needs to +have a good understanding + +81 +00:03:42,270 --> 00:03:45,360 +of the sequence in order +to predict a masked word + +82 +00:03:45,360 --> 00:03:48,840 +as even if the text is +grammatically correct, + +83 +00:03:48,840 --> 00:03:50,610 +it does not necessarily make sense + +84 +00:03:50,610 --> 00:03:52,413 +in the context of the sequence. + +85 +00:03:55,230 --> 00:03:56,580 +As mentioned earlier, + +86 +00:03:56,580 --> 00:03:59,520 +encoders are good at doing +sequence classification. + +87 +00:03:59,520 --> 00:04:02,883 +Sentiment analysis is an example +of sequence classification. + +88 +00:04:04,410 --> 00:04:09,410 +The model's aim is to identify +the sentiment of a sequence. + +89 +00:04:09,540 --> 00:04:11,280 +It can range from giving a sequence, + +90 +00:04:11,280 --> 00:04:12,960 +a rating from one to five stars + +91 +00:04:12,960 --> 00:04:15,900 +if doing review analysis +to giving a positive, + +92 +00:04:15,900 --> 00:04:17,820 +or negative rating to a sequence + +93 +00:04:17,820 --> 00:04:19,220 +which is what is shown here. + +94 +00:04:20,280 --> 00:04:22,950 +For example, here, +given the two sequences, + +95 +00:04:22,950 --> 00:04:25,860 +we use the model to compute a prediction, + +96 +00:04:25,860 --> 00:04:27,420 +and to classify the sequences + +97 +00:04:27,420 --> 00:04:30,393 +among these two classes, +positive and negative. + +98 +00:04:31,230 --> 00:04:33,450 +While the two sequences are very similar + +99 +00:04:33,450 --> 00:04:35,220 +containing the same words, + +100 +00:04:35,220 --> 00:04:37,170 +the meaning is entirely different, + +101 +00:04:37,170 --> 00:04:40,143 +and the encoder model is able +to grasp that difference. 
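
Both tasks can be tried end to end with the pipeline function. A short sketch, assuming bert-base-uncased for the masked word example and the default sentiment-analysis checkpoint, with illustrative sentences:

from transformers import pipeline

# Masked language modeling: predict the hidden word (checkpoint is an assumption)
fill_mask = pipeline("fill-mask", model="bert-base-uncased")
print(fill_mask("My [MASK] is Sylvain."))

# Sequence classification: two similar sentences with opposite sentiment
classifier = pipeline("sentiment-analysis")
print(classifier(["I love this movie.", "I do not love this movie."]))
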
+ +102 +00:04:41,404 --> 00:04:44,154 +(outro striking) + diff --git a/subtitles/en/06_transformer-models-decoders.srt b/subtitles/en/06_transformer-models-decoders.srt index af7fa0170..0b0c84cde 100644 --- a/subtitles/en/06_transformer-models-decoders.srt +++ b/subtitles/en/06_transformer-models-decoders.srt @@ -1,263 +1,395 @@ -1 -00:00:03,860 --> 00:00:09,750 -In this video, we'll study the decoder architecture. -An example of a popular decoder-only architecture - -2 -00:00:09,750 --> 00:00:15,809 -is GPT-2. In order to understand how decoders -work, we recommend taking a look at the video - -3 -00:00:15,809 --> 00:00:21,640 -regarding encoders: they're extremely similar -to decoders. One can use a decoder for most - -4 -00:00:21,640 --> 00:00:26,429 -of the same tasks as an encoder, albeit with, -generally, a little loss of performance. Let's - -5 -00:00:26,429 --> 00:00:31,769 -take the same approach we have taken with -the encoder to try and understand the architectural - -6 -00:00:31,769 --> 00:00:38,969 -differences between an encoder and a decoder. -We'll use a small example, using three words. - -7 -00:00:38,969 --> 00:00:46,550 -We pass them through the decoder. We retrieve -a numerical representation of each word. Here, - -8 -00:00:46,550 --> 00:00:51,739 -for example, the decoder converts the three -words “Welcome to NYC” in these three - -9 -00:00:51,739 --> 00:00:57,750 -sequences of numbers. The decoder outputs -exactly one sequence of numbers per input - -10 -00:00:57,750 --> 00:01:03,290 -word. This numerical representation can also -be called a "Feature vector", or "Feature - -11 -00:01:03,290 --> 00:01:09,590 -tensor". Let's dive in this representation. -It contains one vector per word that was passed - -12 -00:01:09,590 --> 00:01:14,830 -through the decoder. Each of these vector -is a numerical representation of the word - -13 -00:01:14,830 --> 00:01:21,810 -in question. The dimension of that vector -is defined by the architecture of the model. - -14 -00:01:21,810 --> 00:01:28,400 -Where the decoder differs from the encoder -is principally with its self-attention mechanism. - -15 -00:01:28,400 --> 00:01:34,090 -It's using what is called "masked self-attention". -Here for example, if we focus on the word - -16 -00:01:34,090 --> 00:01:40,170 -"to", we'll see that its vector is absolutely -unmodified by the "NYC" word. That's because - -17 -00:01:40,170 --> 00:01:45,560 -all the words on the right (also known as -the right context) of the word is masked. - -18 -00:01:45,560 --> 00:01:50,729 -Rather than benefitting from all the words -on the left and right, I.e., the bidirectional - -19 -00:01:50,729 --> 00:02:01,229 -context, decoders only have access to the -words on their left. The masked self-attention - -20 -00:02:01,229 --> 00:02:06,310 -mechanism differs from the self-attention -mechanism by using an additional mask to hide - -21 -00:02:06,310 --> 00:02:12,110 -the context on either side of the word: the -word's numerical representation will not be - -22 -00:02:12,110 --> 00:02:18,730 -affected by the words in the hidden context. -So when should one use a decoder? Decoders, - -23 -00:02:18,730 --> 00:02:24,610 -like encoders, can be used as standalone models. -As they generate a numerical representation, - -24 -00:02:24,610 --> 00:02:30,410 -they can also be used in a wide variety of -tasks. However, the strength of a decoder - -25 -00:02:30,410 --> 00:02:35,420 -lies in the way a word has access to its left -context. 
The decoders, having only access - -26 -00:02:35,420 --> 00:02:40,280 -to their left context, are inherently good -at text generation: the ability to generate - -27 -00:02:40,280 --> 00:02:46,120 -a word, or a sequence of words, given a known -sequence of words. In NLP, this is known as - -28 -00:02:46,120 --> 00:02:52,150 -Causal Language Modeling. Let's look at an -example. Here's an example of how causal language - -29 -00:02:52,150 --> 00:02:59,240 -modeling works: we start with an initial word, -which is "My". We use this as input for the - -30 -00:02:59,240 --> 00:03:06,330 -decoder. The model outputs a vectors of dimension -768. This vector contains information about - -31 -00:03:06,330 --> 00:03:11,650 -the sequence, which is here a single word, -or word. We apply a small transformation to - -32 -00:03:11,650 --> 00:03:17,019 -that vector so that it maps to all the words -known by the model (mapping which we'll see - -33 -00:03:17,019 --> 00:03:22,650 -later, called a language modeling head). We -identify that the model believes the most - -34 -00:03:22,650 --> 00:03:29,720 -probable following word is "name". We then -take that new word, and add it to the initial - -35 -00:03:29,720 --> 00:03:35,560 -sequence. From "My", we are now at "My name". -This is where the "autoregressive" aspect - -36 -00:03:35,560 --> 00:03:42,689 -comes in. Auto-regressive models re-use their -past outputs as inputs in the following steps. - -37 -00:03:42,689 --> 00:03:49,280 -Once again, we do that the exact same operation: -we cast that sequence through the decoder, - -38 -00:03:49,280 --> 00:03:57,459 -and retrieve the most probable following word. -In this case, it is the word "is". We repeat - -39 -00:03:57,459 --> 00:04:03,049 -the operation until we're satisfied. Starting -from a single word, we've now generated a - -40 -00:04:03,049 --> 00:04:08,870 -full sentence. We decide to stop there, but -we could continue for a while; GPT-2, for - -41 -00:04:08,870 --> 00:04:16,919 -example, has a maximum context size of 1024. -We could eventually generate up to 1024 words, - -42 -00:04:16,919 --> 00:04:20,125 -and the decoder would still have some memory -of the first words of the sequence! If we - -43 -00:04:20,125 --> 00:04:21,125 -go back several levels higher, back to the -full transformer model, we can see what we - -44 -00:04:21,125 --> 00:04:22,125 -learned about the decoder part of the full -transformer model. It is what we call, auto-regressive: - -45 -00:04:22,125 --> 00:04:23,125 -it outputs values that are then used as its -input values. We repeat this operations as - -46 -00:04:23,125 --> 00:04:24,125 -we like. It is based off of the masked self-attention -layer, which allows to have word embeddings - -47 -00:04:24,125 --> 00:04:25,125 -which have access to the context on the left -side of the word. If you look at the diagram - -48 -00:04:25,125 --> 00:04:26,125 -however, you'll see that we haven't seen one -of the aspects of the decoder. That is: cross-attention. - -49 -00:04:26,125 --> 00:04:27,125 -There is a second aspect we haven't seen, -which is it's ability to convert features - -50 -00:04:27,125 --> 00:04:28,125 -to words; heavily linked to the cross attention -mechanism. However, these only apply in the - -51 -00:04:28,125 --> 00:04:29,125 -"encoder-decoder" transformer, or the "sequence-to-sequence" -transformer (which can generally be used interchangeably). 
-

-52
-00:04:29,125 --> 00:04:30,125
-We recommend you check out the video on encoder-decoders
-to get an idea of how the decoder can be used

-53
-00:04:30,125 --> 00:04:30,132
-as a component of a larger architecture!
+1
+00:00:03,750 --> 00:00:07,140
+- In this video, we'll study
+the decoder architecture.
+
+2
+00:00:07,140 --> 00:00:07,973
+An example
+
+3
+00:00:07,973 --> 00:00:11,338
+of a popular decoder only
+architecture is GPT-2.
+
+4
+00:00:11,338 --> 00:00:14,160
+In order to understand how decoders work
+
+5
+00:00:14,160 --> 00:00:17,430
+we recommend taking a look at
+the video regarding encoders.
+
+6
+00:00:17,430 --> 00:00:19,980
+They're extremely similar to decoders.
+
+7
+00:00:19,980 --> 00:00:21,210
+One can use a decoder
+
+8
+00:00:21,210 --> 00:00:23,760
+for most of the same tasks as an encoder
+
+9
+00:00:23,760 --> 00:00:27,330
+albeit with generally a
+little loss of performance.
+
+10
+00:00:27,330 --> 00:00:28,890
+Let's take the same approach we have taken
+
+11
+00:00:28,890 --> 00:00:30,300
+with the encoder to try
+
+12
+00:00:30,300 --> 00:00:32,670
+and understand the
+architectural differences
+
+13
+00:00:32,670 --> 00:00:34,803
+between an encoder and decoder.
+
+14
+00:00:35,777 --> 00:00:38,910
+We'll use a small example
+using three words.
+
+15
+00:00:38,910 --> 00:00:41,050
+We pass them through the decoder.
+
+16
+00:00:41,050 --> 00:00:44,793
+We retrieve a numerical
+representation for each word.
+
+17
+00:00:46,410 --> 00:00:49,350
+Here for example, the decoder
+converts the three words,
+
+18
+00:00:49,350 --> 00:00:53,545
+Welcome to NYC, in these
+three sequences of numbers.
+
+19
+00:00:53,545 --> 00:00:56,040
+The decoder outputs exactly one sequence
+
+20
+00:00:56,040 --> 00:00:58,740
+of numbers per input word.
+
+21
+00:00:58,740 --> 00:01:00,630
+This numerical representation can also
+
+22
+00:01:00,630 --> 00:01:03,783
+be called a feature vector
+or a feature tensor.
+
+23
+00:01:04,920 --> 00:01:07,200
+Let's dive in this representation.
+
+24
+00:01:07,200 --> 00:01:08,490
+It contains one vector
+
+25
+00:01:08,490 --> 00:01:11,340
+per word that was passed
+through the decoder.
+
+26
+00:01:11,340 --> 00:01:14,250
+Each of these vectors is
+a numerical representation
+
+27
+00:01:14,250 --> 00:01:15,573
+of the word in question.
+
+28
+00:01:16,920 --> 00:01:18,562
+The dimension of that vector is defined
+
+29
+00:01:18,562 --> 00:01:20,703
+by the architecture of the model.
+
+30
+00:01:22,860 --> 00:01:26,040
+Where the decoder differs from
+the encoder is principally
+
+31
+00:01:26,040 --> 00:01:28,200
+with its self-attention mechanism.
+
+32
+00:01:28,200 --> 00:01:30,843
+It's using what is called
+masked self-attention.
+
+33
+00:01:31,860 --> 00:01:34,650
+Here, for example, if we
+focus on the word "to"
+
+34
+00:01:34,650 --> 00:01:37,620
+we'll see that its vector
+is absolutely unmodified
+
+35
+00:01:37,620 --> 00:01:39,690
+by the NYC word.
+
+36
+00:01:39,690 --> 00:01:41,731
+That's because all the words
+on the right, also known
+
+37
+00:01:41,731 --> 00:01:45,276
+as the right context of
+the word is masked rather
+
+38
+00:01:45,276 --> 00:01:49,230
+than benefiting from all the
+words on the left and right.
+
+39
+00:01:49,230 --> 00:01:51,600
+So the bidirectional context.
+
+40
+00:01:51,600 --> 00:01:55,020
+Decoders only have access
+to a single context
+
+41
+00:01:55,020 --> 00:01:58,203
+which can be the left
+context or the right context.
+ +42 +00:01:59,539 --> 00:02:03,356 +The masked self attention +mechanism differs + +43 +00:02:03,356 --> 00:02:04,320 +from the self attention mechanism + +44 +00:02:04,320 --> 00:02:07,110 +by using an additional +mask to hide the context + +45 +00:02:07,110 --> 00:02:09,390 +on either side of the word + +46 +00:02:09,390 --> 00:02:12,810 +the words numerical representation +will not be affected + +47 +00:02:12,810 --> 00:02:14,853 +by the words in the hidden context. + +48 +00:02:16,260 --> 00:02:18,330 +So when should one use a decoder? + +49 +00:02:18,330 --> 00:02:22,380 +Decoders like encoders can +be used as standalone models + +50 +00:02:22,380 --> 00:02:25,020 +as they generate a +numerical representation. + +51 +00:02:25,020 --> 00:02:28,320 +They can also be used in +a wide variety of tasks. + +52 +00:02:28,320 --> 00:02:31,260 +However, the strength of +a decoder lies in the way. + +53 +00:02:31,260 --> 00:02:34,530 +A word can only have +access to its left context + +54 +00:02:34,530 --> 00:02:36,690 +having only access to their left context. + +55 +00:02:36,690 --> 00:02:39,120 +They're inherently good at text generation + +56 +00:02:39,120 --> 00:02:41,010 +the ability to generate a word + +57 +00:02:41,010 --> 00:02:45,000 +or a sequence of words given +a known sequence of words. + +58 +00:02:45,000 --> 00:02:45,833 +This is known + +59 +00:02:45,833 --> 00:02:49,083 +as causal language modeling or +natural language generation. + +60 +00:02:50,430 --> 00:02:53,520 +Here's an example of how +causal language modeling works. + +61 +00:02:53,520 --> 00:02:56,410 +We start with an initial word, which is my + +62 +00:02:57,339 --> 00:02:59,973 +we use this as input for the decoder. + +63 +00:03:00,810 --> 00:03:04,260 +The model outputs a vector of numbers + +64 +00:03:04,260 --> 00:03:07,230 +and this vector contains +information about the sequence + +65 +00:03:07,230 --> 00:03:08,733 +which is here a single word. + +66 +00:03:09,780 --> 00:03:11,430 +We apply a small transformation + +67 +00:03:11,430 --> 00:03:13,110 +to that vector so that it maps + +68 +00:03:13,110 --> 00:03:16,500 +to all the words known by +the model, which is a mapping + +69 +00:03:16,500 --> 00:03:19,890 +that we'll see later called +a language modeling head. + +70 +00:03:19,890 --> 00:03:21,930 +We identify that the model believes + +71 +00:03:21,930 --> 00:03:25,053 +that the most probable +following word is name. + +72 +00:03:26,250 --> 00:03:28,710 +We then take that new word and add it + +73 +00:03:28,710 --> 00:03:33,480 +to the initial sequence from +my, we are now at my name. + +74 +00:03:33,480 --> 00:03:36,870 +This is where the auto +regressive aspect comes in. + +75 +00:03:36,870 --> 00:03:38,490 +Auto regressive models. + +76 +00:03:38,490 --> 00:03:42,513 +We use their past outputs as +inputs and the following steps. + +77 +00:03:43,452 --> 00:03:46,980 +Once again, we do the +exact same operation. + +78 +00:03:46,980 --> 00:03:49,500 +We cast that sequence through the decoder + +79 +00:03:49,500 --> 00:03:51,993 +and retrieve the most +probable following word. + +80 +00:03:52,978 --> 00:03:57,978 +In this case, it is the word +"is", we repeat the operation + +81 +00:03:58,230 --> 00:04:02,040 +until we're satisfied, +starting from a single word. + +82 +00:04:02,040 --> 00:04:04,590 +We've now generated a full sentence. + +83 +00:04:04,590 --> 00:04:07,890 +We decide to stop there, but +we could continue for a while. 
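
The auto-regressive loop described above can be sketched in a few lines, assuming the gpt2 checkpoint and greedy decoding (always keeping the most probable next token):

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Assumption: gpt2 as the decoder-only model
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

input_ids = tokenizer("My", return_tensors="pt").input_ids

for _ in range(5):
    with torch.no_grad():
        logits = model(input_ids).logits       # language modeling head: hidden states -> vocabulary scores
    next_id = logits[:, -1].argmax(dim=-1)     # most probable next token
    input_ids = torch.cat([input_ids, next_id[:, None]], dim=-1)  # feed the output back in as input

print(tokenizer.decode(input_ids[0]))
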
+ +84 +00:04:07,890 --> 00:04:12,890 +GPT two, for example, has a +maximum context size of 1,024. + +85 +00:04:13,170 --> 00:04:16,830 +We could eventually +generate up to a 1,024 words + +86 +00:04:16,830 --> 00:04:19,050 +and the decoder would +still have some memory + +87 +00:04:19,050 --> 00:04:21,003 +of the first words in this sequence. + diff --git a/subtitles/en/07_transformer-models-encoder-decoders.srt b/subtitles/en/07_transformer-models-encoder-decoders.srt index 6abdf581e..e1b47aa21 100644 --- a/subtitles/en/07_transformer-models-encoder-decoders.srt +++ b/subtitles/en/07_transformer-models-encoder-decoders.srt @@ -1,323 +1,621 @@ -1 -00:00:04,160 --> 00:00:07,200 -In this video, we'll study the  -encoder-decoder architecture.   - -2 -00:00:08,160 --> 00:00:16,160 -An example of a popular encoder-decoder model is  -T5. In order to understand how the encoder-decoder   - -3 -00:00:16,160 --> 00:00:21,680 -works, we recommend you check out the videos  -on encoders and decoders as standalone models.   - -4 -00:00:22,400 --> 00:00:30,320 -Understanding how they behave individually will  -help understanding how an encoder-decoder behaves.   - -5 -00:00:30,320 --> 00:00:35,360 -Let's start from what we've seen about the  -encoder. The encoder takes words as inputs,   - -6 -00:00:36,000 --> 00:00:40,640 -casts them through the encoder, and  -retrieves a numerical representation   - -7 -00:00:40,640 --> 00:00:47,360 -for each word cast through it. We now know that  -the numerical representation holds information   - -8 -00:00:47,360 --> 00:00:54,000 -about the meaning of the sequence. Let's put  -this aside and add the decoder to the diagram.   - -9 -00:00:56,480 --> 00:01:00,160 -In this scenario, we're using the decoder  -in a manner that we haven't seen before.   - -10 -00:01:00,720 --> 00:01:07,600 -We're passing the outputs of the encoder directly  -to it! Additionally to the encoder outputs,   - -11 -00:01:07,600 --> 00:01:13,040 -we also give the decoder a sequence. When  -prompting the decoder for an output with no   - -12 -00:01:13,040 --> 00:01:17,360 -initial sequence, we can give it the value  -that indicates the start of a sequence.   - -13 -00:01:18,000 --> 00:01:23,520 -And that's where the encoder-decoder magic  -happens. The encoder accepts a sequence as input.   - -14 -00:01:24,560 --> 00:01:30,480 -It computes a prediction, and outputs a  -numerical representation. Then, it sends   - -15 -00:01:30,480 --> 00:01:38,000 -that over to the decoder. It has, in a sense,  -encoded the sequence. And the decoder, in turn,   - -16 -00:01:38,000 --> 00:01:42,960 -using this input alongside its usual sequence  -input, will take a stab at decoding the sequence.   - -17 -00:01:44,720 --> 00:01:50,400 -The decoder decodes the sequence, and outputs a  -word. As of now, we don't need to make sense of   - -18 -00:01:50,400 --> 00:01:55,440 -that word, but we can understand that the decoder  -is essentially decoding what the encoder has   - -19 -00:01:55,440 --> 00:02:02,160 -output. The "start of sequence word" indicates  -that it should start decoding the sequence.   - -20 -00:02:03,600 --> 00:02:10,240 -Now that we have both the feature vector and  -an initial generated word, we don't need the   - -21 -00:02:10,240 --> 00:02:17,760 -encoder anymore. As we have seen before with the  -decoder, it can act in an auto-regressive manner;   - -22 -00:02:18,640 --> 00:02:24,960 -the word it has just output can now be used  -as an input. 
This, in combination with the   - -23 -00:02:24,960 --> 00:02:30,800 -numerical representation output by the encoder,  -can now be used to generate a second word.   - -24 -00:02:33,200 --> 00:02:38,880 -Please note that the first word is still here; as  -the model still outputs it. However, it is greyed   - -25 -00:02:38,880 --> 00:02:45,120 -out as we have no need for it anymore. We can  -continue on and on; for example until the decoder   - -26 -00:02:45,120 --> 00:02:50,720 -outputs a value that we consider a "stopping  -value", like a dot, meaning the end of a sequence.   - -27 -00:02:53,440 --> 00:02:58,080 -Here, we've seen the full mechanism of the  -encoder-decoder transformer: let's go over it one   - -28 -00:02:58,080 --> 00:03:05,120 -more time. We have an initial sequence, that is  -sent to the encoder. That encoder output is then   - -29 -00:03:05,120 --> 00:03:12,240 -sent to the decoder, for it to be decoded. While  -we can now discard the encoder after a single use,   - -30 -00:03:12,240 --> 00:03:17,840 -the decoder will be used several times: until  -we have generated every word that we need.   - -31 -00:03:20,000 --> 00:03:25,120 -Let's see a concrete example; with Translation  -Language Modeling; also called transduction;   - -32 -00:03:25,120 --> 00:03:30,800 -the act of translating a sequence. Here, we would  -like to translate this English sequence "Welcome   - -33 -00:03:30,800 --> 00:03:38,400 -to NYC" in French. We're using a transformer model  -that is trained for that task explicitly. We use   - -34 -00:03:38,400 --> 00:03:43,520 -the encoder to create a representation  -of the English sentence. We cast this   - -35 -00:03:43,520 --> 00:03:48,880 -to the decoder and, with the use of the start of  -sequence word, we ask it to output the first word.   - -36 -00:03:50,720 --> 00:03:52,960 -It outputs Bienvenue, which means "Welcome".   - -37 -00:03:55,280 --> 00:04:02,480 -We then use "Bienvenue" as the input sequence for  -the decoder. This, alongside the feature vector,   - -38 -00:04:04,320 --> 00:04:08,480 -allows the decoder to predict the second  -word, "à", which is "to" in English.   - -39 -00:04:10,160 --> 00:04:14,400 -Finally, we ask the decoder to predict  -a third word; it predicts "NYC",   - -40 -00:04:14,400 --> 00:04:20,240 -which is, once again, correct. We've translated  -the sentence! Where the encoder-decoder really   - -41 -00:04:20,240 --> 00:04:24,880 -shines, is that we have an encoder and a  -decoder; which often do not share weights.   - -42 -00:04:27,280 --> 00:04:31,440 -We, therefore, have an entire block (the encoder)  -that can be trained to understand the sequence,   - -43 -00:04:31,440 --> 00:04:36,480 -and extract the relevant information. For the  -translation scenario we've seen earlier, for   - -44 -00:04:36,480 --> 00:04:44,160 -example, this would mean parsing and understanding  -what was said in the English language; extracting   - -45 -00:04:44,160 --> 00:04:49,040 -information from that language, and putting  -all of that in a vector dense in information.   - -46 -00:04:50,880 --> 00:04:57,280 -On the other hand, we have the decoder, whose  -sole purpose is to decode the feature output by   - -47 -00:04:57,280 --> 00:05:03,760 -the encoder. This decoder can be specialized in  -a completely different language, or even modality   - -48 -00:05:03,760 --> 00:05:11,760 -like images or speech. Encoders-decoders  -are special for several reasons. 
Firstly,   - -49 -00:05:11,760 --> 00:05:17,040 -they're able to manage sequence to sequence  -tasks, like translation that we have just seen.   - -50 -00:05:18,640 --> 00:05:23,880 -Secondly, the weights between the encoder and the  -decoder parts are not necessarily shared. Let's   - -51 -00:05:24,480 --> 00:05:31,200 -take another example of translation. Here we're  -translating "Transformers are powerful" in French.   - -52 -00:05:32,240 --> 00:05:36,560 -Firstly, this means that from a sequence  -of three words, we're able to generate   - -53 -00:05:36,560 --> 00:05:42,240 -a sequence of four words. One could argue  -that this could be handled with a decoder;   - -54 -00:05:42,240 --> 00:05:46,960 -that would generate the translation in an  -auto-regressive manner; and they would be right!   - -55 -00:05:49,840 --> 00:05:53,840 -Another example of where sequence to sequence  -transformers shine is in summarization.   - -56 -00:05:54,640 --> 00:05:58,560 -Here we have a very long  -sequence, generally a full text,   - -57 -00:05:58,560 --> 00:06:03,840 -and we want to summarize it. Since the  -encoder and decoders are separated,   - -58 -00:06:03,840 --> 00:06:08,880 -we can have different context lengths (for  -example a very long context for the encoder which   - -59 -00:06:08,880 --> 00:06:13,840 -handles the text, and a smaller context for the  -decoder which handles the summarized sequence).   - -60 -00:06:16,240 --> 00:06:20,480 -There are a lot of sequence to sequence  -models. This contains a few examples of   - -61 -00:06:20,480 --> 00:06:24,160 -popular encoder-decoder models  -available in the transformers library.   - -62 -00:06:26,320 --> 00:06:31,200 -Additionally, you can load an encoder  -and a decoder inside an encoder-decoder   - -63 -00:06:31,200 --> 00:06:35,040 -model! Therefore, according to the  -specific task you are targeting,   - -64 -00:06:35,040 --> 00:06:40,240 -you may choose to use specific encoders  -and decoders, which have proven their worth   - -65 -00:06:40,240 --> 00:06:49,850 -on these specific tasks. This wraps things up  -for the encoder-decoders. Thanks for watching! +1 +00:00:00,520 --> 00:00:02,603 +(swoosh) + +2 +00:00:04,230 --> 00:00:05,063 +- In this video, + +3 +00:00:05,063 --> 00:00:07,638 +we'll study the +encoder-decoder architecture. + +4 +00:00:07,638 --> 00:00:12,243 +An example of a popular +encoder-decoder model is T5. + +5 +00:00:13,770 --> 00:00:16,980 +In order to understand how +the encoder-decoder works, + +6 +00:00:16,980 --> 00:00:18,630 +we recommend you check out the videos + +7 +00:00:18,630 --> 00:00:22,590 +on encoders and decoders +as standalone models. + +8 +00:00:22,590 --> 00:00:24,990 +Understanding how they work individually + +9 +00:00:24,990 --> 00:00:28,323 +will help understanding how +an encoder-decoder works. + +10 +00:00:30,510 --> 00:00:33,390 +Let's start from what we've +seen about the encoder. + +11 +00:00:33,390 --> 00:00:36,240 +The encoder takes words as inputs, + +12 +00:00:36,240 --> 00:00:38,520 +casts them through the encoder, + +13 +00:00:38,520 --> 00:00:40,800 +and retrieves a numerical representation + +14 +00:00:40,800 --> 00:00:42,663 +for each word cast through it. + +15 +00:00:43,560 --> 00:00:46,470 +We now know that this +numerical representation + +16 +00:00:46,470 --> 00:00:49,473 +holds information about the +meaning of the sequence. + +17 +00:00:51,090 --> 00:00:54,243 +Let's put this aside and add +the decoder to the diagram. 
+ +18 +00:00:56,610 --> 00:00:57,510 +In this scenario, + +19 +00:00:57,510 --> 00:00:59,190 +we're using the decoder in a manner + +20 +00:00:59,190 --> 00:01:00,960 +that we haven't seen before. + +21 +00:01:00,960 --> 00:01:04,173 +We're passing the outputs of +the encoder directly to it. + +22 +00:01:05,356 --> 00:01:07,770 +Additionally to the encoder outputs, + +23 +00:01:07,770 --> 00:01:10,800 +we also give the decoder a sequence. + +24 +00:01:10,800 --> 00:01:12,840 +When prompting the decoder for an output + +25 +00:01:12,840 --> 00:01:14,190 +with no initial sequence, + +26 +00:01:14,190 --> 00:01:16,140 +we can give it the value that indicates + +27 +00:01:16,140 --> 00:01:18,060 +the start of a sequence. + +28 +00:01:18,060 --> 00:01:20,919 +And that's where the +encoder-decoder magic happens. + +29 +00:01:20,919 --> 00:01:24,082 +The encoder accepts a sequence as input. + +30 +00:01:24,082 --> 00:01:25,980 +It computes a prediction, + +31 +00:01:25,980 --> 00:01:28,858 +and outputs a numerical representation. + +32 +00:01:28,858 --> 00:01:33,120 +Then, it sends that over to the decoder. + +33 +00:01:33,120 --> 00:01:36,300 +It has, in a sense, encoded that sequence. + +34 +00:01:36,300 --> 00:01:38,130 +And the decoder, in turn, + +35 +00:01:38,130 --> 00:01:40,847 +using this input alongside +its usual sequence input, + +36 +00:01:40,847 --> 00:01:43,906 +will take a stab at decoding the sequence. + +37 +00:01:43,906 --> 00:01:46,530 +The decoder decodes the sequence, + +38 +00:01:46,530 --> 00:01:48,360 +and outputs a word. + +39 +00:01:48,360 --> 00:01:51,300 +As of now, we don't need +to make sense of that word, + +40 +00:01:51,300 --> 00:01:53,100 +but we can understand that the decoder + +41 +00:01:53,100 --> 00:01:56,103 +is essentially decoding +what the encoder has output. + +42 +00:01:57,008 --> 00:02:00,000 +The start of sequence word here + +43 +00:02:00,000 --> 00:02:02,871 +indicates that it should +start decoding the sequence. + +44 +00:02:02,871 --> 00:02:06,870 +Now that we have both the +encoder numerical representation + +45 +00:02:06,870 --> 00:02:09,570 +and an initial generated word, + +46 +00:02:09,570 --> 00:02:11,343 +we don't need the encoder anymore. + +47 +00:02:12,269 --> 00:02:15,540 +As we have seen before with the decoder, + +48 +00:02:15,540 --> 00:02:18,720 +it can act in an auto-regressive manner. + +49 +00:02:18,720 --> 00:02:22,933 +The word it has just output +can now be used as an input. + +50 +00:02:22,933 --> 00:02:26,188 +This, in combination with +the numerical representation + +51 +00:02:26,188 --> 00:02:28,560 +output by the encoder, + +52 +00:02:28,560 --> 00:02:31,203 +can now be used to generate a second word. + +53 +00:02:33,040 --> 00:02:35,910 +Please note that the +first word is still here, + +54 +00:02:35,910 --> 00:02:37,770 +as the model still outputs it. + +55 +00:02:37,770 --> 00:02:39,240 +However, we have grayed it out + +56 +00:02:39,240 --> 00:02:40,940 +as we have no need for it anymore. + +57 +00:02:41,880 --> 00:02:44,070 +We can continue on and on, for example, + +58 +00:02:44,070 --> 00:02:46,320 +until the decoder outputs a value + +59 +00:02:46,320 --> 00:02:48,540 +that we consider a stopping value, + +60 +00:02:48,540 --> 00:02:51,093 +like a dot meaning the end of a sequence. + +61 +00:02:53,580 --> 00:02:55,926 +Here, we've seen the full mechanism + +62 +00:02:55,926 --> 00:02:57,540 +of the encoder-decoder transformer. + +63 +00:02:57,540 --> 00:02:59,280 +Let's go over it one more time. 
+ +64 +00:02:59,280 --> 00:03:02,773 +We have an initial sequence +that is sent to the encoder. + +65 +00:03:02,773 --> 00:03:06,450 +That encoder output is +then sent to the decoder + +66 +00:03:06,450 --> 00:03:07,563 +for it to be decoded. + +67 +00:03:08,760 --> 00:03:12,450 +While it can now discard the +encoder after a single use, + +68 +00:03:12,450 --> 00:03:14,427 +the decoder will be used several times + +69 +00:03:14,427 --> 00:03:17,763 +until we have generated +every word that we need. + +70 +00:03:19,288 --> 00:03:21,510 +So let's see a concrete example + +71 +00:03:21,510 --> 00:03:23,460 +with Translation Language Modeling. + +72 +00:03:23,460 --> 00:03:24,930 +Also called transduction, + +73 +00:03:24,930 --> 00:03:28,200 +which is the act of +translating a sequence. + +74 +00:03:28,200 --> 00:03:30,577 +Here, we would like to +translate this English sequence + +75 +00:03:30,577 --> 00:03:33,067 +"Welcome to NYC" in French. + +76 +00:03:33,067 --> 00:03:35,460 +We're using a transformer model + +77 +00:03:35,460 --> 00:03:38,070 +that is trained for that task explicitly. + +78 +00:03:38,070 --> 00:03:40,560 +We use the encoder to +create a representation + +79 +00:03:40,560 --> 00:03:42,240 +of the English sentence. + +80 +00:03:42,240 --> 00:03:44,730 +We cast this to the decoder, + +81 +00:03:44,730 --> 00:03:46,620 +with the use of the +start of sequence word, + +82 +00:03:46,620 --> 00:03:49,173 +we ask it to output the first word. + +83 +00:03:50,029 --> 00:03:53,607 +It outputs bienvenue, which means welcome. + +84 +00:03:53,607 --> 00:03:56,640 +And we then use bienvenue + +85 +00:03:56,640 --> 00:03:59,283 +as the input sequence for the decoder. + +86 +00:04:00,188 --> 00:04:04,470 +This, alongside the encoder +numerical representation, + +87 +00:04:04,470 --> 00:04:07,440 +allows the decoder to +predict the second word, Ã, + +88 +00:04:07,440 --> 00:04:09,240 +which is to in English. + +89 +00:04:09,240 --> 00:04:13,590 +Finally, we ask the decoder +to predict a third word + +90 +00:04:13,590 --> 00:04:15,330 +It predicts NYC, which is correct. + +91 +00:04:15,330 --> 00:04:18,288 +We've translated the sentence. + +92 +00:04:18,288 --> 00:04:20,760 +Where the encoder-decoder really shines, + +93 +00:04:20,760 --> 00:04:23,550 +is that we have an encoder and a decoder, + +94 +00:04:23,550 --> 00:04:25,323 +which often do not share weights. + +95 +00:04:26,256 --> 00:04:29,460 +Therefore, we have an +entire block, the encoder, + +96 +00:04:29,460 --> 00:04:31,650 +that can be trained to +understand the sequence + +97 +00:04:31,650 --> 00:04:34,290 +and extract the relevant information. + +98 +00:04:34,290 --> 00:04:36,450 +For the translation +scenario we've seen earlier, + +99 +00:04:36,450 --> 00:04:38,760 +for example, this would mean parsing + +100 +00:04:38,760 --> 00:04:42,003 +and understanding what was +said in the English language. + +101 +00:04:42,900 --> 00:04:45,960 +It would mean extracting +information from that language, + +102 +00:04:45,960 --> 00:04:49,413 +and putting all of that in a +vector dense in information. + +103 +00:04:50,361 --> 00:04:53,370 +On the other hand, we have the decoder, + +104 +00:04:53,370 --> 00:04:56,850 +whose sole purpose is to decode +the numerical representation + +105 +00:04:56,850 --> 00:04:58,203 +output by the encoder. 
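
The translation walkthrough above maps to a few lines of code. A sketch, assuming Helsinki-NLP/opus-mt-en-fr as the English-to-French encoder-decoder checkpoint:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

checkpoint = "Helsinki-NLP/opus-mt-en-fr"  # assumption: any English-to-French seq2seq checkpoint works here
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

inputs = tokenizer("Welcome to NYC", return_tensors="pt")
# generate() runs the encoder once, then calls the decoder auto-regressively,
# starting from the start-of-sequence token and stopping at the end-of-sequence token
output_ids = model.generate(**inputs)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
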
+ +106 +00:04:59,460 --> 00:05:01,170 +This decoder can be specialized + +107 +00:05:01,170 --> 00:05:02,970 +in a completely different language, + +108 +00:05:02,970 --> 00:05:05,403 +or even modality like images or speech. + +109 +00:05:07,170 --> 00:05:10,473 +Encoders-decoders are +special for several reasons. + +110 +00:05:11,310 --> 00:05:15,570 +Firstly, they're able to manage +sequence to sequence tasks, + +111 +00:05:15,570 --> 00:05:18,358 +like translation that we have just seen. + +112 +00:05:18,358 --> 00:05:20,940 +Secondly, the weights between the encoder + +113 +00:05:20,940 --> 00:05:24,540 +and the decoder parts are +not necessarily shared. + +114 +00:05:24,540 --> 00:05:27,172 +Let's take another example of translation. + +115 +00:05:27,172 --> 00:05:30,810 +Here we're translating +"Transformers are powerful" + +116 +00:05:30,810 --> 00:05:32,048 +in French. + +117 +00:05:32,048 --> 00:05:35,258 +Firstly, this means that from +a sequence of three words, + +118 +00:05:35,258 --> 00:05:39,030 +we're able to generate a +sequence of four words. + +119 +00:05:39,030 --> 00:05:42,480 +One could argue that this +could be handled with a decoder + +120 +00:05:42,480 --> 00:05:44,160 +that would generate the translation + +121 +00:05:44,160 --> 00:05:46,260 +in an auto-regressive manner, + +122 +00:05:46,260 --> 00:05:47,460 +and they would be right. + +123 +00:05:49,980 --> 00:05:51,930 +Another example of where +sequence to sequence + +124 +00:05:51,930 --> 00:05:54,810 +transformers shine is in summarization. + +125 +00:05:54,810 --> 00:05:58,379 +Here we have a very long +sequence, generally a full text, + +126 +00:05:58,379 --> 00:06:01,020 +and we want to summarize it. + +127 +00:06:01,020 --> 00:06:04,020 +Since the encoder and +decoders are separated, + +128 +00:06:04,020 --> 00:06:06,300 +we can have different context lengths. + +129 +00:06:06,300 --> 00:06:08,910 +For example, a very long +context for the encoder, + +130 +00:06:08,910 --> 00:06:10,230 +which handles the text, + +131 +00:06:10,230 --> 00:06:12,210 +and a smaller context for the decoder + +132 +00:06:12,210 --> 00:06:14,223 +which handles the summarized sequence. + +133 +00:06:16,470 --> 00:06:18,840 +There are a lot of sequence +to sequence models. + +134 +00:06:18,840 --> 00:06:20,310 +This contains a few examples + +135 +00:06:20,310 --> 00:06:22,500 +of popular encoder-decoder models + +136 +00:06:22,500 --> 00:06:24,400 +available in the transformers library. + +137 +00:06:25,829 --> 00:06:29,940 +Additionally, you can load +an encoder and a decoder + +138 +00:06:29,940 --> 00:06:32,130 +inside an encoder-decoder model. + +139 +00:06:32,130 --> 00:06:35,190 +Therefore, according to the +specific task you are targeting, + +140 +00:06:35,190 --> 00:06:38,700 +you may choose to use specific +encoders and decoders, + +141 +00:06:38,700 --> 00:06:42,613 +which have proven their worth +on these specific tasks. + +142 +00:06:42,613 --> 00:06:44,696 +(swoosh) + diff --git a/subtitles/en/08_what-happens-inside-the-pipeline-function-(pytorch).srt b/subtitles/en/08_what-happens-inside-the-pipeline-function-(pytorch).srt index a4e7b1e00..dc405bae7 100644 --- a/subtitles/en/08_what-happens-inside-the-pipeline-function-(pytorch).srt +++ b/subtitles/en/08_what-happens-inside-the-pipeline-function-(pytorch).srt @@ -1,244 +1,471 @@ -1 -00:00:05,200 --> 00:00:09,680 -What happens inside the pipeline  -function? 
In this video,   - -2 -00:00:09,680 --> 00:00:14,240 -we will look at what actually happens when we use  -the pipeline function of the Transformers library.   - -3 -00:00:14,880 --> 00:00:19,440 -More specifically, we will look at the  -sentiment analysis pipeline, and how it   - -4 -00:00:19,440 --> 00:00:24,960 -went from the two following sentences to the  -positive labels with their respective scores.   - -5 -00:00:26,560 --> 00:00:30,720 -As we have seen in the pipeline presentation,  -there are three stages in the pipeline.   - -6 -00:00:31,520 --> 00:00:35,920 -First, we convert the raw texts to  -numbers the model can make sense of,   - -7 -00:00:35,920 --> 00:00:41,520 -using a tokenizer. Then, those numbers go  -through the model, which outputs logits.   - -8 -00:00:42,640 --> 00:00:47,040 -Finally, the post-processing steps transforms  -those logits into labels and scores.   - -9 -00:00:47,920 --> 00:00:53,440 -Let's look in detail at those three steps, and how  -to replicate them using the Transformers library,   - -10 -00:00:53,440 --> 00:01:01,040 -beginning with the first stage, tokenization. The  -tokenization process has several steps. First,   - -11 -00:01:01,040 --> 00:01:07,360 -the text is split into small chunks called tokens.  -They can be words, parts of words or punctuation   - -12 -00:01:07,360 --> 00:01:14,160 -symbols. Then the tokenizer will had some special  -tokens (if the model expect them). Here the model   - -13 -00:01:14,160 --> 00:01:19,440 -uses expects a CLS token at the beginning and a  -SEP token at the end of the sentence to classify.   - -14 -00:01:20,400 --> 00:01:25,440 -Lastly, the tokenizer matches each token to its  -unique ID in the vocabulary of the pretrained   - -15 -00:01:25,440 --> 00:01:31,360 -model. To load such a tokenizer, the Transformers  -library provides the AutoTokenizer API.   - -16 -00:01:32,400 --> 00:01:36,320 -The most important method of this  -class is from_pretrained, which will   - -17 -00:01:36,320 --> 00:01:41,680 -download and cache the configuration and the  -vocabulary associated to a given checkpoint.   - -18 -00:01:43,040 --> 00:01:48,880 -Here, the checkpoint used by default for the  -sentiment analysis pipeline is distilbert base   - -19 -00:01:48,880 --> 00:01:56,080 -uncased finetuned sst2 english. We instantiate  -a tokenizer associated with that checkpoint,   - -20 -00:01:56,640 --> 00:02:01,920 -then feed it the two sentences. Since those  -two sentences are not of the same size,   - -21 -00:02:01,920 --> 00:02:05,040 -we will need to pad the shortest  -one to be able to build an array.   - -22 -00:02:05,760 --> 00:02:08,240 -This is done by the tokenizer  -with the option padding=True.   - -23 -00:02:09,600 --> 00:02:14,800 -With truncation=True, we ensure that any sentence  -longer than the maximum the model can handle   - -24 -00:02:14,800 --> 00:02:21,840 -is truncated. Lastly, the return_tensors option  -tells the tokenizer to return a PyTorch tensor.   - -25 -00:02:23,040 --> 00:02:29,040 -Looking at the result, we see we have a dictionary  -with two keys. Input IDs contains the IDs of both   - -26 -00:02:29,040 --> 00:02:34,080 -sentences, with 0s where the padding is  -applied. The second key, attention mask,   - -27 -00:02:34,080 --> 00:02:37,840 -indicates where padding has been applied,  -so the model does not pay attention to it.   - -28 -00:02:38,640 --> 00:02:43,040 -This is all what is inside the tokenization  -step. 
Now let's have a look at the second step,   - -29 -00:02:43,760 --> 00:02:50,560 -the model. As for the tokenizer, there is an  -AutoModel API, with a from_pretrained method.   - -30 -00:02:50,560 --> 00:02:54,720 -It will download and cache the configuration  -of the model as well as the pretrained weights.   - -31 -00:02:55,840 --> 00:03:00,480 -However, the AutoModel API will only  -instantiate the body of the model,   - -32 -00:03:00,480 --> 00:03:05,120 -that is, the part of the model that is  -left once the pretraining head is removed.   - -33 -00:03:05,840 --> 00:03:11,360 -It will output a high-dimensional tensor that is a  -representation of the sentences passed, but which   - -34 -00:03:11,360 --> 00:03:17,200 -is not directly useful for our classification  -problem. Here the tensor has two sentences,   - -35 -00:03:17,200 --> 00:03:25,440 -each of sixteen tokens and the last dimension is  -the hidden size of our model 768. To get an output   - -36 -00:03:25,440 --> 00:03:30,240 -linked to our classification problem, we need to  -use the AutoModelForSequenceClassification class.   - -37 -00:03:30,960 --> 00:03:35,200 -It works exactly as the AutoModel class,  -except that it will build a model with a   - -38 -00:03:35,200 --> 00:03:40,720 -classification head. There is one auto class for  -each common NLP task in the Transformers library.   - -39 -00:03:42,000 --> 00:03:47,600 -Here, after giving our model the two  -sentences, we get a tensor of size two by two:   - -40 -00:03:47,600 --> 00:03:53,680 -one result for each sentence and for each possible  -label. Those outputs are not probabilities yet   - -41 -00:03:53,680 --> 00:03:59,120 -(we can see they don't sum to 1). This is because  -each model of the Transformers library returns   - -42 -00:03:59,120 --> 00:04:05,120 -logits. To make sense of those logits, we need to  -dig into the third and last step of the pipeline:   - -43 -00:04:05,680 --> 00:04:11,840 -post-processing. To convert logits into  -probabilities, we need to apply a SoftMax   - -44 -00:04:11,840 --> 00:04:17,760 -layer to them. As we can see, this transforms  -them into positive numbers that sum up to 1.   - -45 -00:04:18,960 --> 00:04:22,800 -The last step is to know which of those  -corresponds to the positive or the negative label.   - -46 -00:04:23,360 --> 00:04:30,160 -This is given by the id2label field of the  -model config. The first probabilities (index 0)   - -47 -00:04:30,160 --> 00:04:35,360 -correspond to the negative label, and the seconds  -(index 1) correspond to the positive label.   - -48 -00:04:36,240 --> 00:04:40,560 -This is how our classifier built with the  -pipeline function picked those labels and computed   - -49 -00:04:40,560 --> 00:04:52,080 -those scores. Now that you know how each steps  -works, you can easily tweak them to your needs. +1 +00:00:00,554 --> 00:00:03,304 +(logo whooshing) + +2 +00:00:05,340 --> 00:00:07,563 +- What happens inside +the pipeline function? + +3 +00:00:08,760 --> 00:00:11,580 +In this video, we will look +at what actually happens + +4 +00:00:11,580 --> 00:00:13,080 +when we use the pipeline function + +5 +00:00:13,080 --> 00:00:15,090 +of the Transformers library. 
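
The black box being unpacked in this video fits in two lines. A sketch with placeholder sentences (any pair of sentences works):

from transformers import pipeline

classifier = pipeline("sentiment-analysis")
# One label and score per sentence; the inputs below are placeholders
print(classifier(["I really enjoyed this course.", "I hated every minute of it."]))
# e.g. [{'label': 'POSITIVE', 'score': ...}, {'label': 'NEGATIVE', 'score': ...}]
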
+ +6 +00:00:15,090 --> 00:00:16,860 +More specifically, we will look + +7 +00:00:16,860 --> 00:00:19,200 +at the sentiment analysis pipeline, + +8 +00:00:19,200 --> 00:00:22,020 +and how it went from the +two following sentences, + +9 +00:00:22,020 --> 00:00:23,970 +to the positive and negative labels + +10 +00:00:23,970 --> 00:00:25,420 +with their respective scores. + +11 +00:00:26,760 --> 00:00:29,190 +As we have seen in the +pipeline presentation, + +12 +00:00:29,190 --> 00:00:31,860 +there are three stages in the pipeline. + +13 +00:00:31,860 --> 00:00:34,620 +First, we convert the raw texts to numbers + +14 +00:00:34,620 --> 00:00:37,173 +the model can make sense +of using a tokenizer. + +15 +00:00:38,010 --> 00:00:40,530 +Then those numbers go through the model, + +16 +00:00:40,530 --> 00:00:41,943 +which outputs logits. + +17 +00:00:42,780 --> 00:00:45,600 +Finally, the post-processing +steps transforms + +18 +00:00:45,600 --> 00:00:48,150 +those logits into labels and scores. + +19 +00:00:48,150 --> 00:00:50,700 +Let's look in detail at those three steps + +20 +00:00:50,700 --> 00:00:53,640 +and how to replicate them +using the Transformers library, + +21 +00:00:53,640 --> 00:00:56,043 +beginning with the first +stage, tokenization. + +22 +00:00:57,915 --> 00:01:00,360 +The tokenization process +has several steps. + +23 +00:01:00,360 --> 00:01:04,950 +First, the text is split into +small chunks called tokens. + +24 +00:01:04,950 --> 00:01:08,550 +They can be words, parts of +words or punctuation symbols. + +25 +00:01:08,550 --> 00:01:11,580 +Then the tokenizer will +had some special tokens, + +26 +00:01:11,580 --> 00:01:13,500 +if the model expect them. + +27 +00:01:13,500 --> 00:01:16,860 +Here the model uses expects +a CLS token at the beginning + +28 +00:01:16,860 --> 00:01:19,743 +and a SEP token at the end +of the sentence to classify. + +29 +00:01:20,580 --> 00:01:24,180 +Lastly, the tokenizer matches +each token to its unique ID + +30 +00:01:24,180 --> 00:01:27,000 +in the vocabulary of the pretrained model. + +31 +00:01:27,000 --> 00:01:28,680 +To load such a tokenizer, + +32 +00:01:28,680 --> 00:01:31,743 +the Transformers library +provides the AutoTokenizer API. + +33 +00:01:32,730 --> 00:01:36,120 +The most important method of +this class is from_pretrained, + +34 +00:01:36,120 --> 00:01:38,910 +which will download and +cache the configuration + +35 +00:01:38,910 --> 00:01:41,853 +and the vocabulary associated +to a given checkpoint. + +36 +00:01:43,200 --> 00:01:45,360 +Here the checkpoint used by default + +37 +00:01:45,360 --> 00:01:47,280 +for the sentiment analysis pipeline + +38 +00:01:47,280 --> 00:01:51,986 +is +distilbert-base-uncased-finetuned-sst-2-English. + +39 +00:01:51,986 --> 00:01:53,700 +(indistinct) + +40 +00:01:53,700 --> 00:01:56,490 +We instantiate a tokenizer +associated with that checkpoint, + +41 +00:01:56,490 --> 00:01:59,490 +then feed it the two sentences. + +42 +00:01:59,490 --> 00:02:02,100 +Since those two sentences +are not of the same size, + +43 +00:02:02,100 --> 00:02:03,930 +we will need to pad the shortest one + +44 +00:02:03,930 --> 00:02:06,030 +to be able to build an array. + +45 +00:02:06,030 --> 00:02:09,840 +This is done by the tokenizer +with the option, padding=True. + +46 +00:02:09,840 --> 00:02:12,810 +With truncation=True, we +ensure that any sentence + +47 +00:02:12,810 --> 00:02:15,873 +longer than the maximum the +model can handle is truncated. 
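A minimal sketch of this tokenization step, using the checkpoint named in the narration; the input sentences are placeholders, and the return_tensors option is covered just below.

    from transformers import AutoTokenizer

    checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    raw_inputs = [
        "I've been waiting for a HuggingFace course my whole life.",
        "I hate this so much!",
    ]
    # padding=True pads the shorter sentence, truncation=True cuts sentences that are
    # too long, and return_tensors="pt" asks for PyTorch tensors.
    inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
    print(inputs["input_ids"])       # token IDs, with the pad ID where padding was added
    print(inputs["attention_mask"])  # 0 where padding was applied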
+ +48 +00:02:17,010 --> 00:02:19,620 +Lastly, the return_tensors option + +49 +00:02:19,620 --> 00:02:22,323 +tells the tokenizer to +return a PyTorch tensor. + +50 +00:02:23,190 --> 00:02:25,590 +Looking at the result, we +see we have a dictionary + +51 +00:02:25,590 --> 00:02:26,670 +with two keys. + +52 +00:02:26,670 --> 00:02:29,970 +Input IDs contains the +IDs of both sentences, + +53 +00:02:29,970 --> 00:02:32,550 +with zero where the padding is applied. + +54 +00:02:32,550 --> 00:02:34,260 +The second key, attention mask, + +55 +00:02:34,260 --> 00:02:36,150 +indicates where padding has been applied, + +56 +00:02:36,150 --> 00:02:38,940 +so the model does not pay attention to it. + +57 +00:02:38,940 --> 00:02:42,090 +This is all what is inside +the tokenization step. + +58 +00:02:42,090 --> 00:02:46,289 +Now, let's have a look at +the second step, the model. + +59 +00:02:46,289 --> 00:02:47,952 +As for the tokenizer, + +60 +00:02:47,952 --> 00:02:51,133 +there is an AutoModel API +with a from_pretrained method. + +61 +00:02:51,133 --> 00:02:53,954 +It will download and cache +the configuration of the model + +62 +00:02:53,954 --> 00:02:56,280 +as well as the pretrained weights. + +63 +00:02:56,280 --> 00:02:58,200 +However, the AutoModel API + +64 +00:02:58,200 --> 00:03:00,630 +will only instantiate +the body of the model, + +65 +00:03:00,630 --> 00:03:03,420 +that is the part of the model that is left + +66 +00:03:03,420 --> 00:03:06,090 +once the pretraining head is removed. + +67 +00:03:06,090 --> 00:03:08,610 +It will output a high-dimensional tensor + +68 +00:03:08,610 --> 00:03:11,220 +that is a representation +of the sentences passed, + +69 +00:03:11,220 --> 00:03:12,690 +but which is not directly useful + +70 +00:03:12,690 --> 00:03:15,030 +for our classification problem. + +71 +00:03:15,030 --> 00:03:19,230 +Here the tensor has two +sentences, each of 16 tokens, + +72 +00:03:19,230 --> 00:03:23,433 +and the last dimension is the +hidden size of our model, 768. + +73 +00:03:24,900 --> 00:03:27,510 +To get an output linked to +our classification problem, + +74 +00:03:27,510 --> 00:03:31,170 +we need to use the +AutoModelForSequenceClassification class. + +75 +00:03:31,170 --> 00:03:33,330 +It works exactly as the AutoModel class, + +76 +00:03:33,330 --> 00:03:35,130 +except that it will build a model + +77 +00:03:35,130 --> 00:03:36,543 +with a classification head. + +78 +00:03:37,483 --> 00:03:39,560 +There is one auto class +for each common NLP task + +79 +00:03:39,560 --> 00:03:40,960 +in the Transformers library. + +80 +00:03:42,150 --> 00:03:45,570 +Here after giving our +model the two sentences, + +81 +00:03:45,570 --> 00:03:47,820 +we get a tensor of size two by two, + +82 +00:03:47,820 --> 00:03:50,943 +one result for each sentence +and for each possible label. + +83 +00:03:51,840 --> 00:03:53,970 +Those outputs are not probabilities yet, + +84 +00:03:53,970 --> 00:03:56,100 +we can see they don't sum to 1. + +85 +00:03:56,100 --> 00:03:57,270 +This is because each model + +86 +00:03:57,270 --> 00:04:00,810 +of the Transformers +library returns logits. + +87 +00:04:00,810 --> 00:04:02,250 +To make sense of those logits, + +88 +00:04:02,250 --> 00:04:05,910 +we need to dig into the third +and last step of the pipeline. + +89 +00:04:05,910 --> 00:04:10,620 +Post-processing, to convert +logits into probabilities, + +90 +00:04:10,620 --> 00:04:13,470 +we need to apply a SoftMax layers to them. 
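Continuing the sketch, the model and post-processing steps described here could look roughly like this, assuming the inputs dictionary produced by the tokenizer sketch above.

    import torch
    from transformers import AutoModelForSequenceClassification

    checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

    # The model returns raw logits: one row per sentence, one column per label.
    outputs = model(**inputs)
    print(outputs.logits.shape)  # (2, 2)

    # Post-processing: a softmax turns the logits into probabilities that sum to 1.
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    print(probabilities)
    print(model.config.id2label)  # maps index 0 / 1 to the negative / positive label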
+ +91 +00:04:13,470 --> 00:04:14,610 +As we can see, + +92 +00:04:14,610 --> 00:04:17,267 +this transforms them into positive number + +93 +00:04:17,267 --> 00:04:18,663 +that sum up to one. + +94 +00:04:18,663 --> 00:04:21,360 +The last step is to know +which of those corresponds + +95 +00:04:21,360 --> 00:04:23,580 +to the positive or the negative label. + +96 +00:04:23,580 --> 00:04:28,020 +This is given by the id2label +field of the model config. + +97 +00:04:28,020 --> 00:04:30,390 +The first probabilities, index zero, + +98 +00:04:30,390 --> 00:04:32,250 +correspond to the negative label, + +99 +00:04:32,250 --> 00:04:34,140 +and the seconds, index one, + +100 +00:04:34,140 --> 00:04:36,480 +correspond to the positive label. + +101 +00:04:36,480 --> 00:04:37,950 +This is how our classifier built + +102 +00:04:37,950 --> 00:04:40,230 +with the pipeline function +picked those labels + +103 +00:04:40,230 --> 00:04:42,240 +and computed those scores. + +104 +00:04:42,240 --> 00:04:44,220 +Now that you know how each steps works, + +105 +00:04:44,220 --> 00:04:46,220 +you can easily tweak them to your needs. + +106 +00:04:47,524 --> 00:04:50,274 +(logo whooshing) + diff --git a/subtitles/en/09_what-happens-inside-the-pipeline-function-(tensorflow).srt b/subtitles/en/09_what-happens-inside-the-pipeline-function-(tensorflow).srt index cbff989dc..21c8e3de5 100644 --- a/subtitles/en/09_what-happens-inside-the-pipeline-function-(tensorflow).srt +++ b/subtitles/en/09_what-happens-inside-the-pipeline-function-(tensorflow).srt @@ -1,238 +1,473 @@ -1 -00:00:05,360 --> 00:00:07,680 -What happens inside the pipeline function?   - -2 -00:00:09,840 --> 00:00:14,800 -In this video, we will look at what actually  -happens when we use the pipeline function of   - -3 -00:00:14,800 --> 00:00:20,880 -the Transformers library. More specifically, we  -will look at the sentiment analysis pipeline, and   - -4 -00:00:20,880 --> 00:00:26,720 -how it went from the two following sentences to  -the positive labels with their respective scores.   - -5 -00:00:28,560 --> 00:00:34,160 -As we have seen in the pipeline presentation,  -there are three stages in the pipeline. First,   - -6 -00:00:34,800 --> 00:00:38,880 -we convert the raw texts to numbers the  -model can make sense of, using a tokenizer.   - -7 -00:00:40,000 --> 00:00:43,520 -Then, those numbers go through  -the model, which outputs logits.   - -8 -00:00:44,400 --> 00:00:49,120 -Finally, the post-processing steps transforms  -those logits into labels and scores.   - -9 -00:00:50,720 --> 00:00:54,960 -Let's look in detail at those three steps, and how  -to replicate them using the Transformers library,   - -10 -00:00:54,960 --> 00:01:03,280 -beginning with the first stage, tokenization. The  -tokenization process has several steps. First,   - -11 -00:01:03,280 --> 00:01:09,120 -the text is split into small chunks called tokens.  -They can be words, parts of words or punctuation   - -12 -00:01:09,120 --> 00:01:17,440 -symbols. Then the tokenizer will had some special  -tokens (if the model expect them). Here the model   - -13 -00:01:17,440 --> 00:01:22,800 -uses expects a CLS token at the beginning and a  -SEP token at the end of the sentence to classify.   - -14 -00:01:23,760 --> 00:01:28,880 -Lastly, the tokenizer matches each token to its  -unique ID in the vocabulary of the pretrained   - -15 -00:01:28,880 --> 00:01:34,640 -model. To load such a tokenizer, the Transformers  -library provides the AutoTokenizer API.   
- -16 -00:01:35,680 --> 00:01:40,640 -The most important method of this class is  -from_pretrained, which will download and cache   - -17 -00:01:40,640 --> 00:01:47,200 -the configuration and the vocabulary associated  -to a given checkpoint. Here, the checkpoint used   - -18 -00:01:47,200 --> 00:01:53,840 -by default for the sentiment analysis pipeline is  -distilbert base uncased finetuned sst2 english.   - -19 -00:01:56,560 --> 00:02:01,440 -We instantiate a tokenizer associated with that  -checkpoint, then feed it the two sentences.   - -20 -00:02:02,640 --> 00:02:07,360 -Since those two sentences are not of the same  -size, we will need to pad the shortest one to   - -21 -00:02:07,360 --> 00:02:11,680 -be able to build an array. This is done by  -the tokenizer with the option padding=True.   - -22 -00:02:13,840 --> 00:02:18,960 -With truncation=True, we ensure that any sentence  -longer than the maximum the model can handle   - -23 -00:02:18,960 --> 00:02:25,600 -is truncated. Lastly, the return_tensors option  -tells the tokenizer to return a TensorFlow tensor.   - -24 -00:02:26,720 --> 00:02:29,680 -Looking at the result, we see we  -have a dictionary with two keys.   - -25 -00:02:30,240 --> 00:02:37,280 -Input IDs contains the IDs of both sentences, with  -0s where the padding is applied. The second key,   - -26 -00:02:37,280 --> 00:02:42,080 -attention mask, indicates where padding has been  -applied, so the model does not pay attention to   - -27 -00:02:42,080 --> 00:02:48,000 -it. This is all what is inside the tokenization  -step. Now let's have a look at the second step,   - -28 -00:02:48,640 --> 00:02:54,960 -the model. As for the tokenizer, there is an  -TFAutoModel API, with a from_pretrained method.   - -29 -00:02:55,600 --> 00:02:59,840 -It will download and cache the configuration  -of the model as well as the pretrained   - -30 -00:02:59,840 --> 00:03:05,600 -weights. However, the TFAutoModel API will  -only instantiate the body of the model,   - -31 -00:03:06,320 --> 00:03:10,640 -that is, the part of the model that is  -left once the pretraining head is removed.   - -32 -00:03:12,000 --> 00:03:16,960 -It will output a high-dimensional tensor that  -is a representation of the sentences passed,   - -33 -00:03:16,960 --> 00:03:20,080 -but which is not directly useful  -for our classification problem.   - -34 -00:03:21,760 --> 00:03:28,080 -Here the tensor has two sentences, each of sixteen  -tokens and the last dimension is the hidden size   - -35 -00:03:28,080 --> 00:03:34,320 -of our model 768. To get an output linked  -to our classification problem, we need to   - -36 -00:03:34,320 --> 00:03:40,000 -use the TFAutoModelForSequenceClassification  -class. It works exactly as the AutoModel class,   - -37 -00:03:40,000 --> 00:03:45,440 -except that it will build a model with a  -classification head. There is one auto class for   - -38 -00:03:45,440 --> 00:03:52,160 -each common NLP task in the Transformers library.  -Here, after giving our model the two sentences,   - -39 -00:03:52,160 --> 00:03:59,120 -we get a tensor of size two by two: one result for  -each sentence and for each possible label. Those   - -40 -00:03:59,120 --> 00:04:04,800 -outputs are not probabilities yet (we can see they  -don't sum to 1). This is because each model of the   - -41 -00:04:04,800 --> 00:04:10,960 -Transformers library returns logits. To make sense  -of those logits, we need to dig into the third and   - -42 -00:04:10,960 --> 00:04:17,520 -last step of the pipeline: post-processing. 
To  -convert logits into probabilities, we need to   - -43 -00:04:17,520 --> 00:04:22,800 -apply a SoftMax layer to them. As we can see,  -this transforms them into positive numbers that   - -44 -00:04:22,800 --> 00:04:28,160 -sum up to 1. The last step is to know which of  -those corresponds to the positive or the negative   - -45 -00:04:28,160 --> 00:04:34,720 -label. This is given by the id2label field  -of the model config. The first probabilities   - -46 -00:04:34,720 --> 00:04:40,800 -(index 0) correspond to the negative label, and  -the seconds (index 1) correspond to the positive   - -47 -00:04:40,800 --> 00:04:46,640 -label. This is how our classifier built with the  -pipeline function picked those labels and computed   - -48 -00:04:46,640 --> 00:04:55,840 -those scores. Now that you know how each steps  -works, you can easily tweak them to your needs. +1 +00:00:00,397 --> 00:00:02,980 +(subtle blast) + +2 +00:00:05,490 --> 00:00:07,953 +- What happens inside +the pipeline function? + +3 +00:00:09,930 --> 00:00:13,050 +In this video, we will look +at what actually happens + +4 +00:00:13,050 --> 00:00:14,820 +when we use the pipeline function + +5 +00:00:14,820 --> 00:00:16,920 +of the Transformers library. + +6 +00:00:16,920 --> 00:00:18,930 +More specifically, we will look at + +7 +00:00:18,930 --> 00:00:21,030 +the sentiment analysis pipeline, + +8 +00:00:21,030 --> 00:00:23,760 +and how it went from the +two following sentences + +9 +00:00:23,760 --> 00:00:25,800 +to the positive and negative labels + +10 +00:00:25,800 --> 00:00:27,250 +with their respective scores. + +11 +00:00:28,740 --> 00:00:31,110 +As we have seen in the pipeline video, + +12 +00:00:31,110 --> 00:00:33,900 +there are three stages in the pipeline. + +13 +00:00:33,900 --> 00:00:36,810 +First, we convert the raw texts to numbers + +14 +00:00:36,810 --> 00:00:39,160 +the model can make sense +of, using a tokenizer. + +15 +00:00:40,140 --> 00:00:42,600 +Then, those numbers go through the model, + +16 +00:00:42,600 --> 00:00:44,550 +which outputs logits. + +17 +00:00:44,550 --> 00:00:47,190 +Finally, the post-processing steps + +18 +00:00:47,190 --> 00:00:49,490 +transforms those logits +into labels and score. + +19 +00:00:51,000 --> 00:00:52,590 +Let's look in detail at those three steps, + +20 +00:00:52,590 --> 00:00:55,200 +and how to replicate them +using the Transformers library, + +21 +00:00:55,200 --> 00:00:57,903 +beginning with the first +stage, tokenization. + +22 +00:00:59,905 --> 00:01:02,520 +The tokenization process +has several steps. + +23 +00:01:02,520 --> 00:01:06,900 +First, the text is split into +small chunks called token. + +24 +00:01:06,900 --> 00:01:09,933 +They can be words, parts of +words or punctuation symbols. + +25 +00:01:10,800 --> 00:01:14,310 +Then the tokenizer will +had some special tokens + +26 +00:01:14,310 --> 00:01:15,573 +if the model expect them. + +27 +00:01:16,440 --> 00:01:20,430 +Here, the model used expects +a CLS token at the beginning + +28 +00:01:20,430 --> 00:01:23,910 +and a SEP token at the end +of the sentence to classify. + +29 +00:01:23,910 --> 00:01:27,630 +Lastly, the tokenizer matches +each token to its unique ID + +30 +00:01:27,630 --> 00:01:29,730 +in the vocabulary of the pretrained model. + +31 +00:01:30,660 --> 00:01:32,040 +To load such a tokenizer, + +32 +00:01:32,040 --> 00:01:34,983 +the Transformers library +provides the AutoTokenizer API. 
+ +33 +00:01:35,880 --> 00:01:39,510 +The most important method of +this class is from_pretrained, + +34 +00:01:39,510 --> 00:01:41,940 +which will download and +cache the configuration + +35 +00:01:41,940 --> 00:01:44,913 +and the vocabulary associated +to a given checkpoint. + +36 +00:01:46,410 --> 00:01:48,180 +Here, the checkpoint used by default + +37 +00:01:48,180 --> 00:01:50,310 +for the sentiment analysis pipeline + +38 +00:01:50,310 --> 00:01:54,510 +is distilbert base uncased +finetuned sst2 English, + +39 +00:01:54,510 --> 00:01:55,960 +which is a bit of a mouthful. + +40 +00:01:56,820 --> 00:01:59,760 +We instantiate a tokenizer +associated with that checkpoint, + +41 +00:01:59,760 --> 00:02:01,833 +then feed it the two sentences. + +42 +00:02:02,790 --> 00:02:05,490 +Since those two sentences +are not of the same size, + +43 +00:02:05,490 --> 00:02:07,440 +we will need to pad the shortest one + +44 +00:02:07,440 --> 00:02:09,570 +to be able to build an array. + +45 +00:02:09,570 --> 00:02:10,403 +This is done by the tokenizer + +46 +00:02:10,403 --> 00:02:12,603 +with the option padding=True. + +47 +00:02:14,130 --> 00:02:17,340 +With truncation=True, we +ensure that any sentence longer + +48 +00:02:17,340 --> 00:02:19,953 +than the maximum the model +can handle is truncated. + +49 +00:02:20,820 --> 00:02:24,200 +Lastly, the return_tensors +option tells the tokenizer + +50 +00:02:24,200 --> 00:02:25,773 +to return a PyTorch tensor. + +51 +00:02:26,910 --> 00:02:28,050 +Looking at the result, + +52 +00:02:28,050 --> 00:02:30,450 +we see we have a dictionary with two keys. + +53 +00:02:30,450 --> 00:02:33,840 +Input IDs contains the +IDs of both sentences, + +54 +00:02:33,840 --> 00:02:35,840 +with zeros where the padding is applied. + +55 +00:02:36,750 --> 00:02:38,550 +The second key, attention mask, + +56 +00:02:38,550 --> 00:02:40,650 +indicates where padding has been applied, + +57 +00:02:40,650 --> 00:02:42,750 +so the model does not pay attention to it. + +58 +00:02:43,590 --> 00:02:46,380 +This is all what is inside +the tokenization step. + +59 +00:02:46,380 --> 00:02:49,653 +Now let's have a look at +the second step, the model. + +60 +00:02:51,090 --> 00:02:53,850 +As for the tokenizer, +there is an AutoModel API, + +61 +00:02:53,850 --> 00:02:55,890 +with a from_pretrained method. + +62 +00:02:55,890 --> 00:02:59,100 +It will download and cache +the configuration of the model + +63 +00:02:59,100 --> 00:03:01,560 +as well as the pretrained weights. + +64 +00:03:01,560 --> 00:03:04,830 +However, the AutoModel +API will only instantiate + +65 +00:03:04,830 --> 00:03:06,540 +the body of the model, + +66 +00:03:06,540 --> 00:03:09,120 +that is, the part of +the model that is left + +67 +00:03:09,120 --> 00:03:11,103 +once the pretraining head is removed. + +68 +00:03:12,210 --> 00:03:14,460 +It will output a high-dimensional tensor + +69 +00:03:14,460 --> 00:03:17,190 +that is a representation +of the sentences passed, + +70 +00:03:17,190 --> 00:03:18,930 +but which is not directly useful + +71 +00:03:18,930 --> 00:03:20,480 +for our classification problem. + +72 +00:03:21,930 --> 00:03:24,210 +Here the tensor has two sentences, + +73 +00:03:24,210 --> 00:03:26,070 +each of sixteen token, + +74 +00:03:26,070 --> 00:03:30,393 +and the last dimension is the +hidden size of our model, 768. 
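For this TensorFlow version of the walkthrough, the same steps can be sketched with the TF-prefixed auto classes and return_tensors="tf"; the sentences are placeholders and the checkpoint is the one named in the narration.

    import tensorflow as tf
    from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

    checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)

    inputs = tokenizer(
        ["placeholder sentence one", "placeholder sentence two"],
        padding=True, truncation=True, return_tensors="tf",
    )
    outputs = model(inputs)
    # Convert logits to probabilities and read the label mapping from the config.
    probabilities = tf.math.softmax(outputs.logits, axis=-1)
    print(probabilities)
    print(model.config.id2label)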
+ +75 +00:03:31,620 --> 00:03:34,020 +To get an output linked to +our classification problem, + +76 +00:03:34,020 --> 00:03:37,800 +we need to use the +AutoModelForSequenceClassification class. + +77 +00:03:37,800 --> 00:03:40,170 +It works exactly as the AutoModel class, + +78 +00:03:40,170 --> 00:03:41,970 +except that it will build a model + +79 +00:03:41,970 --> 00:03:43,353 +with a classification head. + +80 +00:03:44,520 --> 00:03:46,770 +There is one auto class +for each common NLP task + +81 +00:03:46,770 --> 00:03:48,170 +in the Transformers library. + +82 +00:03:49,050 --> 00:03:52,380 +Here, after giving our +model the two sentences, + +83 +00:03:52,380 --> 00:03:54,600 +we get a tensor of size two by two; + +84 +00:03:54,600 --> 00:03:57,783 +one result for each sentence +and for each possible label. + +85 +00:03:59,100 --> 00:04:01,470 +Those outputs are not probabilities yet. + +86 +00:04:01,470 --> 00:04:03,660 +We can see they don't sum to 1. + +87 +00:04:03,660 --> 00:04:06,090 +This is because each model +of the Transformers library + +88 +00:04:06,090 --> 00:04:07,830 +returns logits. + +89 +00:04:07,830 --> 00:04:09,480 +To make sense of those logits, + +90 +00:04:09,480 --> 00:04:10,980 +we need to dig into the third + +91 +00:04:10,980 --> 00:04:13,653 +and last step of the +pipeline, post-processing. + +92 +00:04:15,300 --> 00:04:17,310 +To convert logits into probabilities, + +93 +00:04:17,310 --> 00:04:19,950 +we need to apply a SoftMax layer to them. + +94 +00:04:19,950 --> 00:04:22,800 +As we can see, this transforms +them into positive numbers + +95 +00:04:22,800 --> 00:04:23,793 +that sum up to 1. + +96 +00:04:24,990 --> 00:04:27,030 +The last step is to know +which of those corresponds + +97 +00:04:27,030 --> 00:04:29,400 +to the positive or the negative label. + +98 +00:04:29,400 --> 00:04:33,480 +This is given by the id2label +field of the model config. + +99 +00:04:33,480 --> 00:04:36,000 +The first probabilities, index 0, + +100 +00:04:36,000 --> 00:04:37,740 +correspond to the negative label, + +101 +00:04:37,740 --> 00:04:42,060 +and the seconds, index 1, +correspond to the positive label. + +102 +00:04:42,060 --> 00:04:43,830 +This is how our classifier built + +103 +00:04:43,830 --> 00:04:46,260 +with the pipeline function +picked those labels + +104 +00:04:46,260 --> 00:04:47,560 +and computed those scores. + +105 +00:04:48,420 --> 00:04:50,400 +Now that you know how each steps works, + +106 +00:04:50,400 --> 00:04:52,533 +you can easily tweak them to your needs. + +107 +00:04:55,314 --> 00:04:57,897 +(subtle blast) + diff --git a/subtitles/en/10_instantiate-a-transformers-model-(pytorch).srt b/subtitles/en/10_instantiate-a-transformers-model-(pytorch).srt index f7b29b5ad..d29c6933f 100644 --- a/subtitles/en/10_instantiate-a-transformers-model-(pytorch).srt +++ b/subtitles/en/10_instantiate-a-transformers-model-(pytorch).srt @@ -1,157 +1,308 @@ -1 -00:00:05,120 --> 00:00:07,440 -How to instantiate a Transformers model?   - -2 -00:00:08,640 --> 00:00:12,960 -In this video we will look at how we can create  -and use a model from the Transformers library.   - -3 -00:00:14,160 --> 00:00:19,440 -As we've seen before, the AutoModel class allows  -you to instantiate a pretrained model from any   - -4 -00:00:19,440 --> 00:00:24,960 -checkpoint on the Hugging Face Hub. It will  -pick the right model class from the library to   - -5 -00:00:24,960 --> 00:00:30,800 -instantiate the proper architecture and load the  -weights of the pretrained model inside it. 
As we   - -6 -00:00:30,800 --> 00:00:37,760 -can see, when given a BERT checkpoint, we end up  -with a BertModel, and similarly for GPT-2 or BART.   - -7 -00:00:39,680 --> 00:00:43,440 -Behind the scenes, this API can take  -the name of a checkpoint on the Hub,   - -8 -00:00:44,080 --> 00:00:48,400 -in which case it will download and cache the  -configuration file as well as the model weights   - -9 -00:00:48,400 --> 00:00:54,800 -file. You can also specify the path to a local  -folder that contains a valid configuration file   - -10 -00:00:54,800 --> 00:01:00,720 -and a model weights file. To instantiate the  -pretrained model, the AutoModel API will first   - -11 -00:01:00,720 --> 00:01:04,960 -open the configuration file to look at the  -configuration class that should be used.   - -12 -00:01:06,080 --> 00:01:12,240 -The configuration class depends on the type of  -the model (BERT, GPT-2 or BART for instance).   - -13 -00:01:13,440 --> 00:01:18,160 -Once it has the proper configuration class,  -it can instantiate that configuration,   - -14 -00:01:18,160 --> 00:01:23,920 -which is a blueprint to know how to create the  -model. It also uses this configuration class   - -15 -00:01:23,920 --> 00:01:29,360 -to find the proper model class, which is combined  -with the loaded configuration, to load the model.   - -16 -00:01:30,800 --> 00:01:35,520 -This model is not yet our pretrained model as it  -has just been initialized with random weights.   - -17 -00:01:36,560 --> 00:01:42,960 -The last step is to load the weights from the  -model file inside this model. To easily load   - -18 -00:01:42,960 --> 00:01:47,360 -the configuration of a model from any checkpoint  -or a folder containing the configuration folder,   - -19 -00:01:48,000 --> 00:01:49,920 -we can use the AutoConfig class.   - -20 -00:01:51,040 --> 00:01:55,360 -Like the AutoModel class, it will pick the  -right configuration class from the library.   - -21 -00:01:56,800 --> 00:02:01,360 -We can also use the specific class corresponding  -to a checkpoint, but we will need to change the   - -22 -00:02:01,360 --> 00:02:08,320 -code each time we want to try a different model.  -As we said before, the configuration of a model is   - -23 -00:02:08,320 --> 00:02:12,720 -a blueprint that contains all the information  -necessary to create the model architecture.   - -24 -00:02:13,600 --> 00:02:19,680 -For instance the BERT model associated with  -the bert-base-cased checkpoint has 12 layers,   - -25 -00:02:19,680 --> 00:02:29,120 -a hidden size of 768, and a vocabulary size  -of 28,996. Once we have the configuration,   - -26 -00:02:29,680 --> 00:02:33,120 -we can create a model that has the same  -architecture as our checkpoint but is   - -27 -00:02:33,120 --> 00:02:37,840 -randomly initialized. We can then train it from  -scratch like any PyTorch module/TensorFlow model.   - -28 -00:02:38,800 --> 00:02:42,960 -We can also change any part of the  -configuration by using keyword arguments.   - -29 -00:02:43,920 --> 00:02:49,280 -The second snippet of code instantiates a  -randomly initialized BERT model with ten layers   - -30 -00:02:49,280 --> 00:02:56,160 -instead of 12. Saving a model once it's trained  -or fine-tuned is very easy: we just have to use   - -31 -00:02:56,160 --> 00:03:02,880 -the save_pretrained method. Here the model will  -be saved in a folder named my-bert-model inside   - -32 -00:03:02,880 --> 00:03:08,240 -the current working directory. Such a model can  -then be reloaded using the from_pretrained method. 
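A minimal sketch of the workflow just described: loading a pretrained model, building a randomly initialized one from a configuration (including the 10-layer variant mentioned in the narration), then saving and reloading.

    from transformers import AutoConfig, AutoModel, BertConfig, BertModel

    # Load the pretrained model; the right architecture is picked from the checkpoint config.
    model = AutoModel.from_pretrained("bert-base-cased")

    # The configuration alone acts as a blueprint of the architecture.
    config = AutoConfig.from_pretrained("bert-base-cased")
    print(config.num_hidden_layers, config.hidden_size, config.vocab_size)  # 12, 768, 28996

    # A randomly initialized BERT with 10 layers instead of 12.
    custom_config = BertConfig.from_pretrained("bert-base-cased", num_hidden_layers=10)
    custom_model = BertModel(custom_config)

    # Save, then reload from the local folder.
    model.save_pretrained("my-bert-model")
    reloaded = AutoModel.from_pretrained("my-bert-model")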
+1 +00:00:00,519 --> 00:00:03,186 +(logo swooshes) + +2 +00:00:05,310 --> 00:00:08,483 +- How to instantiate a Transformers model. + +3 +00:00:08,483 --> 00:00:11,790 +In this video, we'll look at +how we can create a user model + +4 +00:00:11,790 --> 00:00:13,290 +from the Transformers library. + +5 +00:00:14,310 --> 00:00:17,100 +As we have seen before +the AutoModel class allows + +6 +00:00:17,100 --> 00:00:19,140 +you to instantiate a pretrained model + +7 +00:00:19,140 --> 00:00:21,513 +from any checkpoint on +the Hugging Face Hub. + +8 +00:00:22,350 --> 00:00:23,910 +It'll pick the right model class + +9 +00:00:23,910 --> 00:00:26,654 +from the library to instantiate +the proper architecture + +10 +00:00:26,654 --> 00:00:29,793 +and loads of weights as the +pretrained model inside. + +11 +00:00:30,690 --> 00:00:33,810 +As we can see, when +given a BERT checkpoint + +12 +00:00:33,810 --> 00:00:38,043 +we end up with a BertModel and +similarly, for GPT-2 or BART. + +13 +00:00:40,020 --> 00:00:42,360 +Behind the scenes,this +API can take the name + +14 +00:00:42,360 --> 00:00:44,250 +of a checkpoint on the Hub + +15 +00:00:44,250 --> 00:00:46,980 +in which case it will download +and cache the configuration + +16 +00:00:46,980 --> 00:00:48,843 +file as well as a model weights file. + +17 +00:00:49,698 --> 00:00:52,710 +You can also specify the +path to a local folder + +18 +00:00:52,710 --> 00:00:55,290 +that contains a valid +configuration file and a + +19 +00:00:55,290 --> 00:00:56,390 +model of weights file. + +20 +00:00:57,600 --> 00:00:59,479 +To instantiate the pretrained model, + +21 +00:00:59,479 --> 00:01:01,950 +the AutoModel API will +first open the configuration + +22 +00:01:01,950 --> 00:01:05,403 +file to look at a configuration +class that should be used. + +23 +00:01:06,420 --> 00:01:08,580 +The configuration class +depends on the type + +24 +00:01:08,580 --> 00:01:12,663 +of the model BERT, GPT-2 +or BART for instance. + +25 +00:01:13,680 --> 00:01:15,930 +Once it has a proper configuration class, + +26 +00:01:15,930 --> 00:01:18,390 +it can instantiate that configuration + +27 +00:01:18,390 --> 00:01:21,900 +which is a blueprint to know +how to create the model. + +28 +00:01:21,900 --> 00:01:24,240 +It also uses this configuration class to + +29 +00:01:24,240 --> 00:01:27,150 +find the proper model class, +which is then combined + +30 +00:01:27,150 --> 00:01:29,823 +with the loaded configuration +to load the model. + +31 +00:01:30,904 --> 00:01:33,210 +This model is not yet a pretrained model + +32 +00:01:33,210 --> 00:01:35,883 +as it has just been initialized +with random weights. + +33 +00:01:36,840 --> 00:01:39,810 +The last step is to load the +weight from the model file + +34 +00:01:39,810 --> 00:01:40,923 +inside this model. + +35 +00:01:42,330 --> 00:01:44,250 +To easily load the +configuration of a model + +36 +00:01:44,250 --> 00:01:46,410 +from any checkpoint or folder containing + +37 +00:01:46,410 --> 00:01:48,210 +the configuration file. + +38 +00:01:48,210 --> 00:01:50,373 +We can use the AutoConfig class. + +39 +00:01:51,240 --> 00:01:52,693 +Like the AutoModel class, + +40 +00:01:52,693 --> 00:01:55,693 +it will pick the right configuration +class from the library. 
+ +41 +00:01:57,060 --> 00:01:59,220 +We can also use a specific +class corresponding + +42 +00:01:59,220 --> 00:02:01,470 +to a checkpoint, but +we will need to change + +43 +00:02:01,470 --> 00:02:03,000 +the code each time we want to try + +44 +00:02:03,000 --> 00:02:04,550 +a different model architecture. + +45 +00:02:06,030 --> 00:02:07,860 +As we said before, the configuration + +46 +00:02:07,860 --> 00:02:10,350 +of a model is a blueprint +that contains all the + +47 +00:02:10,350 --> 00:02:13,830 +information necessary to +create the model architecture. + +48 +00:02:13,830 --> 00:02:15,990 +For instance, the BERT model associated + +49 +00:02:15,990 --> 00:02:19,980 +with the bert-base-cased +checkpoint has 12 layers, + +50 +00:02:19,980 --> 00:02:24,980 +a hidden side of 768 and a +vocabulary side of 28,996. + +51 +00:02:28,020 --> 00:02:29,910 +Once we have the configuration, + +52 +00:02:29,910 --> 00:02:31,950 +we can create a model that +does the same architecture + +53 +00:02:31,950 --> 00:02:35,280 +as our checkpoint, but +is randomly initialized. + +54 +00:02:35,280 --> 00:02:36,660 +We can then train it from scratch. + +55 +00:02:36,660 --> 00:02:38,010 +Like any bio PyTorch module + +56 +00:02:39,497 --> 00:02:40,380 +We can also change any part + +57 +00:02:40,380 --> 00:02:43,200 +of the configuration by +using keyword arguments. + +58 +00:02:43,200 --> 00:02:46,138 +The second snippet of code instantiates + +59 +00:02:46,138 --> 00:02:48,360 +a randomly initialized BERT model + +60 +00:02:48,360 --> 00:02:50,403 +with 10 layers instead of 12. + +61 +00:02:51,409 --> 00:02:55,051 +Saving a model once it's trained +or fine-tuned is very easy. + +62 +00:02:55,051 --> 00:02:57,603 +We just have to use a +safe pretrained method. + +63 +00:02:58,500 --> 00:03:01,417 +Here the model will be +saved in a folder named + +64 +00:03:01,417 --> 00:03:04,473 +"my-bert-model" inside the +current working directory. + +65 +00:03:05,400 --> 00:03:08,255 +Such a model can then be +reloaded using the form + +66 +00:03:08,255 --> 00:03:09,596 +pretrained method. + +67 +00:03:09,596 --> 00:03:11,250 +To learn how to easily approach this model + +68 +00:03:11,250 --> 00:03:13,473 +to that, check out the push to a video. + diff --git a/subtitles/en/11_instantiate-a-transformers-model-(tensorflow).srt b/subtitles/en/11_instantiate-a-transformers-model-(tensorflow).srt index dda26cb36..17a04807a 100644 --- a/subtitles/en/11_instantiate-a-transformers-model-(tensorflow).srt +++ b/subtitles/en/11_instantiate-a-transformers-model-(tensorflow).srt @@ -1,195 +1,317 @@ -1 -00:00:05,540 --> 00:00:07,870 -How to instantiate a Transformers model? - -2 -00:00:07,870 --> 00:00:14,800 -In this video we will look at how we can create -and use a model from the Transformers library. - -3 -00:00:14,800 --> 00:00:20,130 -As we've seen before, the TFAutoModel class -allows you to instantiate a pretrained model - -4 -00:00:20,130 --> 00:00:23,490 -from any checkpoint on the Hugging Face Hub. - -5 -00:00:23,490 --> 00:00:27,740 -It will pick the right model class from the -library to instantiate the proper architecture - -6 -00:00:27,740 --> 00:00:31,310 -and load the weights of the pretrained model -inside it. - -7 -00:00:31,310 --> 00:00:36,630 -As we can see, when given a BERT checkpoint, -we end up with a TFBertModel, and similarly - -8 -00:00:36,630 --> 00:00:39,890 -for GPT-2 or BART. 
- -9 -00:00:39,890 --> 00:00:44,489 -Behind the scenes, this API can take the name -of a checkpoint on the Hub, in which case - -10 -00:00:44,489 --> 00:00:49,649 -it will download and cache the configuration -file as well as the model weights file. - -11 -00:00:49,649 --> 00:00:54,059 -You can also specify the path to a local folder -that contains a valid configuration file and - -12 -00:00:54,059 --> 00:00:56,739 -a model weights file. - -13 -00:00:56,739 --> 00:01:02,480 -To instantiate the pretrained model, the AutoModel -API will first open the configuration file - -14 -00:01:02,480 --> 00:01:06,409 -to look at the configuration class that should -be used. - -15 -00:01:06,409 --> 00:01:13,509 -The configuration class depends on the type -of the model (BERT, GPT-2 or BART for instance). - -16 -00:01:13,509 --> 00:01:18,130 -Once it has the proper configuration class, -it can instantiate that configuration, which - -17 -00:01:18,130 --> 00:01:20,420 -is a blueprint to know how to create the model. - -18 -00:01:20,420 --> 00:01:25,420 -It also uses this configuration class to find -the proper model class, which is combined - -19 -00:01:25,420 --> 00:01:28,470 -with the loaded configuration, to load the -model. - -20 -00:01:28,470 --> 00:01:33,759 -This model is not yet our pretrained model -as it has just been initialized with random - -21 -00:01:33,759 --> 00:01:34,759 -weights. - -22 -00:01:34,759 --> 00:01:40,299 -The last step is to load the weights from -the model file inside this model. - -23 -00:01:40,299 --> 00:01:44,659 -To easily load the configuration of a model -from any checkpoint or a folder containing - -24 -00:01:44,659 --> 00:01:48,100 -the configuration folder, we can use the AutoConfig -class. - -25 -00:01:48,100 --> 00:01:54,270 -Like the TFAutoModel class, it will pick the -right configuration class from the library. - -26 -00:01:54,270 --> 00:01:58,869 -We can also use the specific class corresponding -to a checkpoint, but we will need to change - -27 -00:01:58,869 --> 00:02:03,280 -the code each time we want to try a different -model. - -28 -00:02:03,280 --> 00:02:07,490 -As we said before, the configuration of a -model is a blueprint that contains all the - -29 -00:02:07,490 --> 00:02:11,190 -information necessary to create the model -architecture. - -30 -00:02:11,190 --> 00:02:14,629 -For instance the BERT model associated with -the bert-base-cased checkpoint has 12 layers, - -31 -00:02:14,629 --> 00:02:21,790 -a hidden size of 768, and a vocabulary size -of 28,996. - -32 -00:02:21,790 --> 00:02:28,959 -Once we have the configuration, we can create -a model that has the same architecture as - -33 -00:02:28,959 --> 00:02:31,420 -our checkpoint but is randomly initialized. - -34 -00:02:31,420 --> 00:02:36,080 -We can then train it from scratch like any -PyTorch module/TensorFlow model. - -35 -00:02:36,080 --> 00:02:40,870 -We can also change any part of the configuration -by using keyword arguments. - -36 -00:02:40,870 --> 00:02:45,860 -The second snippet of code instantiates a -randomly initialized BERT model with ten layers - -37 -00:02:45,860 --> 00:02:48,379 -instead of 12. - -38 -00:02:48,379 --> 00:02:53,019 -Saving a model once it's trained or fine-tuned -is very easy: we just have to use the save_pretrained - -39 -00:02:53,019 --> 00:02:54,019 -method. - -40 -00:02:54,019 --> 00:03:00,510 -Here the model will be saved in a folder named -my-bert-model inside the current working directory. 
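The TensorFlow equivalent of saving and reloading can be sketched as follows, reusing the folder name from the narration.

    from transformers import TFAutoModel

    tf_model = TFAutoModel.from_pretrained("bert-base-cased")
    tf_model.save_pretrained("my-bert-model")             # writes the config and weights files
    reloaded = TFAutoModel.from_pretrained("my-bert-model")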
- -41 -00:03:00,510 --> 00:03:13,120 -Such a model can then be reloaded using the -from_pretrained method. +1 +00:00:00,125 --> 00:00:02,958 +(whooshing sound) + +2 +00:00:05,463 --> 00:00:08,820 +- How to instantiate +the Transformers model? + +3 +00:00:08,820 --> 00:00:11,250 +In this video, we will +look at how we can create + +4 +00:00:11,250 --> 00:00:13,550 +and use a model from the +Transformers library. + +5 +00:00:15,000 --> 00:00:17,850 +As we've seen before, +the TFAutoModel class + +6 +00:00:17,850 --> 00:00:20,100 +allows you to instantiate +a pre-trained model + +7 +00:00:20,100 --> 00:00:22,503 +from any checkpoint on +the Hugging Face Hub. + +8 +00:00:23,430 --> 00:00:25,620 +It will pick the right +model class from the library + +9 +00:00:25,620 --> 00:00:27,750 +to instantiate the proper architecture + +10 +00:00:27,750 --> 00:00:31,200 +and load the weights of the +pre-trained model inside. + +11 +00:00:31,200 --> 00:00:34,020 +As we can see, when +given a BERT checkpoint, + +12 +00:00:34,020 --> 00:00:36,090 +we end up with a TFBertModel, + +13 +00:00:36,090 --> 00:00:38,553 +and similarly for GPT2 or BART. + +14 +00:00:40,170 --> 00:00:42,510 +Behind the scenes, this +API can take the name + +15 +00:00:42,510 --> 00:00:44,040 +of a checkpoint on the Hub, + +16 +00:00:44,040 --> 00:00:45,810 +in which case it will download and cache + +17 +00:00:45,810 --> 00:00:48,660 +the configuration file as well +as the model weights file. + +18 +00:00:49,590 --> 00:00:52,020 +You can also specify the +path to a local folder + +19 +00:00:52,020 --> 00:00:54,090 +that contains a valid configuration file + +20 +00:00:54,090 --> 00:00:55,340 +and a model weights file. + +21 +00:00:56,670 --> 00:00:58,167 +To instantiate the pre-trained model, + +22 +00:00:58,167 --> 00:01:02,400 +the TFAutoModel API will first +open the configuration file + +23 +00:01:02,400 --> 00:01:05,253 +to look at the configuration +class that should be used. + +24 +00:01:06,390 --> 00:01:09,660 +The configuration class depends +on the type of the model, + +25 +00:01:09,660 --> 00:01:12,333 +BERT, GPT2 or BART for instance. + +26 +00:01:13,320 --> 00:01:15,720 +Once it has the proper +configuration class, + +27 +00:01:15,720 --> 00:01:18,000 +it can instantiate that configuration, + +28 +00:01:18,000 --> 00:01:21,090 +which is a blueprint to know +how to create the model. + +29 +00:01:21,090 --> 00:01:22,770 +It also uses this configuration class + +30 +00:01:22,770 --> 00:01:24,750 +to find the proper model class, + +31 +00:01:24,750 --> 00:01:27,120 +which is combined with +the loaded configuration + +32 +00:01:27,120 --> 00:01:28,143 +to load the model. + +33 +00:01:29,250 --> 00:01:31,800 +This model is not yet +our pre-trained model + +34 +00:01:31,800 --> 00:01:34,560 +as it has just been initialized +with random weights. + +35 +00:01:34,560 --> 00:01:36,690 +The last step is to load the weights + +36 +00:01:36,690 --> 00:01:38,973 +from the model file inside this model. + +37 +00:01:40,230 --> 00:01:42,270 +To easily load the +configuration of a model + +38 +00:01:42,270 --> 00:01:44,220 +from any checkpoint or a folder + +39 +00:01:44,220 --> 00:01:46,170 +containing the configuration file, + +40 +00:01:46,170 --> 00:01:47,790 +we can use the AutoConfig class. + +41 +00:01:47,790 --> 00:01:50,460 +Like the TFAutoModel class, + +42 +00:01:50,460 --> 00:01:54,210 +it will pick the right configuration +class from the library. 
+ +43 +00:01:54,210 --> 00:01:56,040 +We can also use the specific class + +44 +00:01:56,040 --> 00:01:57,840 +corresponding to a checkpoint, + +45 +00:01:57,840 --> 00:01:59,430 +but we will need to change the code + +46 +00:01:59,430 --> 00:02:02,230 +each time we want to try a +different model architecture. + +47 +00:02:03,180 --> 00:02:05,353 +As we said before, the +configuration of a model + +48 +00:02:05,353 --> 00:02:08,610 +is a blueprint that contains +all the information necessary + +49 +00:02:08,610 --> 00:02:11,070 +to create the model architecture. + +50 +00:02:11,070 --> 00:02:12,750 +For instance, the BERT model + +51 +00:02:12,750 --> 00:02:15,510 +associated with the +bert-base-cased checkpoint + +52 +00:02:15,510 --> 00:02:19,710 +has 12 layers, a hidden size of 768, + +53 +00:02:19,710 --> 00:02:23,403 +and a vocabulary size of 28,996. + +54 +00:02:24,810 --> 00:02:26,670 +Once we have the configuration, + +55 +00:02:26,670 --> 00:02:28,890 +we can create a model that +has the same architecture + +56 +00:02:28,890 --> 00:02:32,160 +as our checkpoint but +is randomly initialized. + +57 +00:02:32,160 --> 00:02:36,030 +We can then train it from scratch +like any TensorFlow model. + +58 +00:02:36,030 --> 00:02:38,063 +We can also change any +part of the configuration + +59 +00:02:38,063 --> 00:02:40,770 +by using keyword arguments. + +60 +00:02:40,770 --> 00:02:43,110 +The second snippet of code instantiates + +61 +00:02:43,110 --> 00:02:44,970 +a randomly initialized BERT model + +62 +00:02:44,970 --> 00:02:46,983 +with 10 layers instead of 12. + +63 +00:02:48,240 --> 00:02:51,360 +Saving a model once it's trained +or fine-tuned is very easy. + +64 +00:02:51,360 --> 00:02:53,880 +We just have to use the +save_pretrained method. + +65 +00:02:53,880 --> 00:02:55,980 +Here, the model will be saved in a folder + +66 +00:02:55,980 --> 00:02:59,463 +named my-bert-model inside +the current working directory. + +67 +00:03:00,480 --> 00:03:02,250 +Such a model can then be reloaded + +68 +00:03:02,250 --> 00:03:04,500 +using the from_pretrained method. + +69 +00:03:04,500 --> 00:03:06,600 +To run it to a projects model to the Hub, + +70 +00:03:06,600 --> 00:03:08,350 +check out the push (mumbles) video. + +71 +00:03:09,355 --> 00:03:12,188 +(whooshing sound) + diff --git a/subtitles/en/12_tokenizers-overview.srt b/subtitles/en/12_tokenizers-overview.srt index e60183723..cc5880413 100644 --- a/subtitles/en/12_tokenizers-overview.srt +++ b/subtitles/en/12_tokenizers-overview.srt @@ -1,38 +1,99 @@ -1 -00:00:03,840 --> 00:00:09,200 -In these few videos, we'll take a look at the  -tokenizers. In Natural Language Processing,   - -2 -00:00:09,200 --> 00:00:14,880 -most of the data that we handle consists of raw  -text. However, machine learning models cannot read   - -3 -00:00:14,880 --> 00:00:23,200 -and understand text in its raw form they can only  -work with numbers. The tokenizer's objective will   - -4 -00:00:23,200 --> 00:00:30,080 -be to translate the text into numbers. There are  -several possible approaches to this conversion,   - -5 -00:00:30,080 --> 00:00:33,120 -and the objective is to find the  -most meaningful representation.   - -6 -00:00:36,000 --> 00:00:40,400 -We'll take a look at three distinct tokenization  -algorithms. We compare them one to one,   - -7 -00:00:40,400 --> 00:00:44,880 -so we recommend you look at the videos  -in the following order: Word-based,   - -8 -00:00:45,680 --> 00:00:55,680 -Character-based, and Subword-based. 
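As a tiny illustration of "translating text into numbers" before the three algorithms are compared, here is a sketch using an existing pretrained tokenizer; the checkpoint and input sentence are just examples.

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    tokens = tokenizer.tokenize("Machine learning models only work with numbers.")
    ids = tokenizer.convert_tokens_to_ids(tokens)
    print(tokens)  # the text split into tokens
    print(ids)     # each token mapped to an integer ID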
+1 +00:00:00,450 --> 00:00:01,509 +(intro whooshing) + +2 +00:00:01,509 --> 00:00:02,720 +(smiley snapping) + +3 +00:00:02,720 --> 00:00:03,930 +(words whooshing) + +4 +00:00:03,930 --> 00:00:04,920 +- In the next few videos, + +5 +00:00:04,920 --> 00:00:06,720 +we'll take a look at the tokenizers. + +6 +00:00:07,860 --> 00:00:09,240 +In natural language processing, + +7 +00:00:09,240 --> 00:00:12,930 +most of the data that we +handle consists of raw text. + +8 +00:00:12,930 --> 00:00:14,280 +However, machine learning models + +9 +00:00:14,280 --> 00:00:17,103 +cannot read or understand +text in its raw form, + +10 +00:00:18,540 --> 00:00:20,253 +they can only work with numbers. + +11 +00:00:21,360 --> 00:00:23,220 +So the tokenizer's objective + +12 +00:00:23,220 --> 00:00:25,923 +will be to translate +the text into numbers. + +13 +00:00:27,600 --> 00:00:30,240 +There are several possible +approaches to this conversion, + +14 +00:00:30,240 --> 00:00:31,110 +and the objective + +15 +00:00:31,110 --> 00:00:33,453 +is to find the most +meaningful representation. + +16 +00:00:36,240 --> 00:00:39,390 +We'll take a look at three +distinct tokenization algorithms. + +17 +00:00:39,390 --> 00:00:40,530 +We compare them one to one, + +18 +00:00:40,530 --> 00:00:42,600 +so we recommend you take +a look at the videos + +19 +00:00:42,600 --> 00:00:44,040 +in the following order. + +20 +00:00:44,040 --> 00:00:45,390 +First, "Word-based," + +21 +00:00:45,390 --> 00:00:46,800 +followed by "Character-based," + +22 +00:00:46,800 --> 00:00:48,877 +and finally, "Subword-based." + +23 +00:00:48,877 --> 00:00:51,794 +(outro whooshing) + diff --git a/subtitles/en/13_word-based-tokenizers.srt b/subtitles/en/13_word-based-tokenizers.srt index ffd34249d..a2908fd3b 100644 --- a/subtitles/en/13_word-based-tokenizers.srt +++ b/subtitles/en/13_word-based-tokenizers.srt @@ -1,128 +1,264 @@ -1 -00:00:03,120 --> 00:00:10,240 -Let's take a look at word-based tokenization.  -Word-based tokenization is the idea of splitting   - -2 -00:00:10,240 --> 00:00:19,040 -the raw text into words, by splitting on spaces  -or other specific rules like punctuation. In this   - -3 -00:00:19,040 --> 00:00:25,040 -algorithm, each word has a specific number, an  -"ID", attributed to it. In this example, "Let's"   - -4 -00:00:25,040 --> 00:00:33,120 -has the ID 250, do has ID 861, and tokenization  -followed by an exclamation point has the ID 345.   - -5 -00:00:34,160 --> 00:00:39,840 -This approach is interesting, as the model has  -representations that are based on entire words.   - -6 -00:00:42,560 --> 00:00:45,680 -The information held in a single number is high   - -7 -00:00:45,680 --> 00:00:52,880 -as a word contains a lot of contextual  -and semantic information in a sentence.   - -8 -00:00:52,880 --> 00:00:58,720 -However, this approach does have its limits.  -For example, the word dog and the word   - -9 -00:00:58,720 --> 00:01:04,320 -dogs are very similar, and their meaning is  -close. However, the word-based tokenization   - -10 -00:01:05,280 --> 00:01:10,320 -will attribute entirely different IDs to these  -two words, and the model will therefore learn   - -11 -00:01:10,320 --> 00:01:14,880 -different meanings for these two words. This  -is unfortunate, as we would like the model   - -12 -00:01:14,880 --> 00:01:21,120 -to understand that these words are indeed related  -and that dogs is the plural form of the word dog.   
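A toy sketch of the word-based approach described here; the vocabulary and the IDs for "dog" and "dogs" are made up for illustration, while the other IDs come from the example in the narration.

    # Split on whitespace, then map each word to an ID from a toy vocabulary.
    vocabulary = {"Let's": 250, "do": 861, "tokenization!": 345, "dog": 7, "dogs": 8}

    def word_tokenize(text):
        return [vocabulary[word] for word in text.split()]

    print(word_tokenize("Let's do tokenization!"))  # [250, 861, 345]
    # "dog" and "dogs" get unrelated IDs even though their meanings are close.
    print(word_tokenize("dog"), word_tokenize("dogs"))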
- -13 -00:01:22,800 --> 00:01:26,400 -Another issue with this approach is that there  -are a lot of different words in a language.   - -14 -00:01:27,840 --> 00:01:31,920 -If we want our model to understand all  -possible sentences in that language,   - -15 -00:01:31,920 --> 00:01:37,200 -then we will need to have an ID for each  -different word, and the total number of words,   - -16 -00:01:37,200 --> 00:01:41,440 -which is also known as the vocabulary  -size, can quickly become very large.   - -17 -00:01:44,160 --> 00:01:48,800 -This is an issue because each ID is mapped to a  -large vector that represents the word's meaning,   - -18 -00:01:50,000 --> 00:01:55,840 -and keeping track of these mappings requires an  -enormous number of weights when the vocabulary   - -19 -00:01:55,840 --> 00:02:03,360 -size is large. If we want our models to stay  -lean, we can opt for our tokenizer to ignore   - -20 -00:02:03,360 --> 00:02:11,760 -certain words that we don't necessarily need. For  -example, when training our tokenizer on a text,   - -21 -00:02:11,760 --> 00:02:15,680 -we might want to take the 10,000  -most frequent words in that text   - -22 -00:02:20,640 --> 00:02:23,520 -to create our basic vocabulary, instead  -of taking all of that language's words.   - -23 -00:02:23,520 --> 00:02:27,200 -The tokenizer will know how to convert  -those 10,000 words into numbers,   - -24 -00:02:27,200 --> 00:02:33,520 -but any other word will be converted to the  -out-of-vocabulary word, or the "unknown" word.   - -25 -00:02:36,000 --> 00:02:39,760 -This can rapidly become an issue: the model  -will have the exact same representation   - -26 -00:02:39,760 --> 00:02:44,720 -for all words that it doesn't know, which  -will result in a lot of lost information. +1 +00:00:00,165 --> 00:00:01,416 +(screen whooshing) + +2 +00:00:01,416 --> 00:00:02,716 +(sticker popping) + +3 +00:00:02,716 --> 00:00:03,549 +(screen whooshing) + +4 +00:00:03,549 --> 00:00:05,603 +- Let's take a look at +word-based tokenization. + +5 +00:00:07,650 --> 00:00:09,780 +Word-based tokenization is the idea + +6 +00:00:09,780 --> 00:00:11,940 +of splitting the raw text into words + +7 +00:00:11,940 --> 00:00:14,673 +by splitting on spaces +or other specific rules, + +8 +00:00:16,020 --> 00:00:17,163 +like punctuation. + +9 +00:00:18,900 --> 00:00:21,810 +In this algorithm, each +word has a specific number + +10 +00:00:21,810 --> 00:00:23,463 +or ID attributed to it. + +11 +00:00:24,360 --> 00:00:27,270 +Here, let's has the ID 250, + +12 +00:00:27,270 --> 00:00:30,150 +do has 861, and tokenization + +13 +00:00:30,150 --> 00:00:33,393 +followed by an exclamation mark has 345. + +14 +00:00:34,380 --> 00:00:36,000 +This approach is interesting + +15 +00:00:36,000 --> 00:00:38,100 +as the model has representations + +16 +00:00:38,100 --> 00:00:40,233 +that are based on entire words. + +17 +00:00:42,720 --> 00:00:45,960 +The information held in +a single number is high, + +18 +00:00:45,960 --> 00:00:48,240 +as a word contains a lot of contextual + +19 +00:00:48,240 --> 00:00:49,803 +and semantic information. + +20 +00:00:53,070 --> 00:00:55,473 +However, this approach +does have its limits. + +21 +00:00:56,610 --> 00:01:00,570 +For example, the word dog and +the word dogs are very similar + +22 +00:01:00,570 --> 00:01:01,923 +and their meaning is close. 
+ +23 +00:01:03,210 --> 00:01:05,550 +The word-based tokenization, however, + +24 +00:01:05,550 --> 00:01:08,520 +will attribute entirely +different IDs to these two words + +25 +00:01:08,520 --> 00:01:10,110 +and the model will therefore learn + +26 +00:01:10,110 --> 00:01:12,930 +two different embeddings +for these two words. + +27 +00:01:12,930 --> 00:01:15,090 +This is unfortunate as +we would like the model + +28 +00:01:15,090 --> 00:01:18,240 +to understand that these +words are indeed related, + +29 +00:01:18,240 --> 00:01:21,483 +and that dogs is simply the +plural form of the word dog. + +30 +00:01:22,980 --> 00:01:24,480 +Another issue with this approach, + +31 +00:01:24,480 --> 00:01:28,050 +is that there are a lot of +different words in the language. + +32 +00:01:28,050 --> 00:01:29,490 +If we want our model to understand + +33 +00:01:29,490 --> 00:01:32,160 +all possible sentences in that language, + +34 +00:01:32,160 --> 00:01:35,850 +then we will need to have an +ID for each different word. + +35 +00:01:35,850 --> 00:01:37,380 +And the total number of words, + +36 +00:01:37,380 --> 00:01:40,080 +which is also known as +the vocabulary size, + +37 +00:01:40,080 --> 00:01:41,913 +can quickly become very large. + +38 +00:01:44,400 --> 00:01:47,640 +This is an issue because each +ID is mapped to a large vector + +39 +00:01:47,640 --> 00:01:50,190 +that represents the word's meaning, + +40 +00:01:50,190 --> 00:01:52,170 +and keeping track of these mappings + +41 +00:01:52,170 --> 00:01:54,990 +requires an enormous number of weights + +42 +00:01:54,990 --> 00:01:57,123 +when the vocabulary size is very large. + +43 +00:01:59,160 --> 00:02:00,960 +If we want our models to stay lean, + +44 +00:02:00,960 --> 00:02:04,440 +we can opt for our tokenizer +to ignore certain words + +45 +00:02:04,440 --> 00:02:06,093 +that we don't necessarily need. + +46 +00:02:08,400 --> 00:02:11,970 +For example, here, when training +our tokenizer on a text, + +47 +00:02:11,970 --> 00:02:15,020 +we might want to take only +the 10,000 most frequent words + +48 +00:02:15,020 --> 00:02:16,320 +in that text. + +49 +00:02:16,320 --> 00:02:18,600 +Rather than taking all +words from in that text + +50 +00:02:18,600 --> 00:02:22,503 +or all languages words to +create our basic vocabulary. + +51 +00:02:23,790 --> 00:02:26,520 +The tokenizer will know how +to convert those 10,000 words + +52 +00:02:26,520 --> 00:02:29,370 +into numbers, but any other +word will be converted + +53 +00:02:29,370 --> 00:02:31,530 +to the out-of-vocabulary word, + +54 +00:02:31,530 --> 00:02:33,783 +or like shown here, the unknown word. + +55 +00:02:35,280 --> 00:02:37,440 +Unfortunately, this is a compromise. + +56 +00:02:37,440 --> 00:02:39,900 +The model will have the +exact same representation + +57 +00:02:39,900 --> 00:02:42,390 +for all words that it doesn't know, + +58 +00:02:42,390 --> 00:02:45,210 +which can result in a +lot of lost information + +59 +00:02:45,210 --> 00:02:47,664 +if many unknown words are present. 
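A sketch of the vocabulary-limiting idea described here: keep only the most frequent words and send everything else to an unknown token. The corpus and the vocabulary size are placeholders standing in for a real corpus and the 10,000-word limit.

    from collections import Counter

    corpus_words = "the cat sat on the mat the cat slept".split()  # placeholder corpus
    vocab_size = 4                                                 # stand-in for 10,000

    # Keep the most frequent words; anything else is mapped to [UNK].
    most_common = [word for word, _ in Counter(corpus_words).most_common(vocab_size)]
    word_to_id = {word: i for i, word in enumerate(most_common)}
    unk_id = len(word_to_id)

    def encode(text):
        return [word_to_id.get(word, unk_id) for word in text.split()]

    print(encode("the cat sat on the sofa"))  # "sofa" falls back to the unknown ID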
+ +60 +00:02:47,664 --> 00:02:50,581 +(screen whooshing) + diff --git a/subtitles/en/14_character-based-tokenizers.srt b/subtitles/en/14_character-based-tokenizers.srt index 1b3fcd616..c86407bd6 100644 --- a/subtitles/en/14_character-based-tokenizers.srt +++ b/subtitles/en/14_character-based-tokenizers.srt @@ -1,134 +1,278 @@ -1 -00:00:04,160 --> 00:00:09,440 -Before diving in character-based tokenization,  -understanding why this kind of tokenization   - -2 -00:00:09,440 --> 00:00:13,680 -is interesting requires understanding  -the flaws of word-based tokenization.   - -3 -00:00:14,560 --> 00:00:18,400 -If you haven't seen the first video on  -word-based tokenization we recommend you   - -4 -00:00:18,400 --> 00:00:23,920 -check it out before looking at this video. Let's  -take a look at character-based tokenization.   - -5 -00:00:25,440 --> 00:00:29,840 -We now split our text into individual  -characters, rather than words.   - -6 -00:00:32,720 --> 00:00:37,200 -There are generally a lot of different words in  -languages, while the number of characters stays   - -7 -00:00:37,200 --> 00:00:45,520 -low. Here for example, for the English language  -that has an estimated 170,000 different words,   - -8 -00:00:45,520 --> 00:00:48,960 -we would need a very large  -vocabulary to encompass all words.   - -9 -00:00:50,080 --> 00:00:55,040 -With a character-based vocabulary, we  -can get by with only 256 characters!   - -10 -00:00:59,600 --> 00:01:04,880 -Even languages with a lot of different characters  -like the Chinese languages have dictionaries with   - -11 -00:01:06,160 --> 00:01:14,160 -~20,000 different characters but more than 375,000  -different words. Character-based vocabularies   - -12 -00:01:14,160 --> 00:01:20,240 -let us fewer different tokens than the word-based  -tokenization dictionaries we would otherwise use.   - -13 -00:01:23,040 --> 00:01:28,000 -These vocabularies are also more complete than  -their word-based vocabularies counterparts.   - -14 -00:01:28,720 --> 00:01:34,160 -As our vocabulary contains all characters used  -in a language, even words unseen during the   - -15 -00:01:34,160 --> 00:01:39,840 -tokenizer training can still be tokenized, so  -out-of-vocabulary tokens will be less frequent.   - -16 -00:01:40,480 --> 00:01:45,200 -This includes the ability to correctly tokenize  -misspelled words, rather than discarding them as   - -17 -00:01:45,200 --> 00:01:53,600 -unknown straight away. However, this algorithm  -isn't perfect either! Intuitively, characters   - -18 -00:01:53,600 --> 00:01:59,760 -do not hold as much information individually as  -a word would hold. For example, "Let's" holds   - -19 -00:01:59,760 --> 00:02:07,040 -more information than "l". Of course, this is not  -true for all languages, as some languages like   - -20 -00:02:07,040 --> 00:02:11,280 -ideogram-based languages have a lot of  -information held in single characters,   - -21 -00:02:12,480 --> 00:02:17,200 -but for others like roman-based languages,  -the model will have to make sense of multiple   - -22 -00:02:17,200 --> 00:02:25,120 -tokens at a time to get the information held in  -a single word. This leads to another issue with   - -23 -00:02:25,120 --> 00:02:30,320 -character-based tokenizers: their sequences are  -translated into very large amount of tokens to be   - -24 -00:02:30,320 --> 00:02:37,680 -processed by the model. 
This can have an impact  -on the size of the context the model will carry   - -25 -00:02:37,680 --> 00:02:45,120 -around, and will reduce the size of the text we  -can use as input for our model. This tokenization,   - -26 -00:02:45,120 --> 00:02:49,920 -while it has some issues, has seen some very good  -results in the past and should be considered when   - -27 -00:02:49,920 --> 00:03:00,720 -approaching a new problem as it solves some  -issues encountered in the word-based algorithm. +1 +00:00:00,234 --> 00:00:02,901 +(page whirring) + +2 +00:00:04,260 --> 00:00:07,200 +- Before diving in +character-based tokenization, + +3 +00:00:07,200 --> 00:00:10,350 +understanding why this kind +of tokenization is interesting + +4 +00:00:10,350 --> 00:00:13,533 +requires understanding the flaws +of word-based tokenization. + +5 +00:00:14,640 --> 00:00:16,320 +If you haven't seen the first video + +6 +00:00:16,320 --> 00:00:17,880 +on word-based tokenization + +7 +00:00:17,880 --> 00:00:21,450 +we recommend you check it out +before looking at this video. + +8 +00:00:21,450 --> 00:00:24,250 +Okay, let's take a look at +character-based tokenization. + +9 +00:00:25,650 --> 00:00:28,560 +We now split our text into +individual characters, + +10 +00:00:28,560 --> 00:00:29,673 +rather than words. + +11 +00:00:32,850 --> 00:00:35,550 +There are generally a lot of +different words in languages, + +12 +00:00:35,550 --> 00:00:37,743 +while the number of characters stays low. + +13 +00:00:38,610 --> 00:00:41,313 +To begin let's take a look +at the English language, + +14 +00:00:42,210 --> 00:00:45,540 +it has an estimated +170,000 different words, + +15 +00:00:45,540 --> 00:00:47,730 +so we would need a very large vocabulary + +16 +00:00:47,730 --> 00:00:49,413 +to encompass all words. + +17 +00:00:50,280 --> 00:00:52,200 +With a character-based vocabulary, + +18 +00:00:52,200 --> 00:00:55,440 +we can get by with only 256 characters, + +19 +00:00:55,440 --> 00:00:58,683 +which includes letters, +numbers and special characters. + +20 +00:00:59,760 --> 00:01:02,190 +Even languages with a lot +of different characters + +21 +00:01:02,190 --> 00:01:04,800 +like the Chinese languages +can have dictionaries + +22 +00:01:04,800 --> 00:01:08,130 +with up to 20,000 different characters + +23 +00:01:08,130 --> 00:01:11,523 +but more than 375,000 different words. + +24 +00:01:12,480 --> 00:01:14,310 +So character-based vocabularies + +25 +00:01:14,310 --> 00:01:16,293 +let us use fewer different tokens + +26 +00:01:16,293 --> 00:01:19,050 +than the word-based +tokenization dictionaries + +27 +00:01:19,050 --> 00:01:20,523 +we would otherwise use. + +28 +00:01:23,250 --> 00:01:25,830 +These vocabularies are also more complete + +29 +00:01:25,830 --> 00:01:28,950 +than their word-based +vocabularies counterparts. + +30 +00:01:28,950 --> 00:01:31,410 +As our vocabulary contains all characters + +31 +00:01:31,410 --> 00:01:33,960 +used in a language, even words unseen + +32 +00:01:33,960 --> 00:01:36,990 +during the tokenizer training +can still be tokenized, + +33 +00:01:36,990 --> 00:01:39,633 +so out-of-vocabulary tokens +will be less frequent. + +34 +00:01:40,680 --> 00:01:42,840 +This includes the ability +to correctly tokenize + +35 +00:01:42,840 --> 00:01:45,210 +misspelled words, rather +than discarding them + +36 +00:01:45,210 --> 00:01:46,623 +as unknown straight away. + +37 +00:01:48,240 --> 00:01:52,380 +However, this algorithm +isn't perfect either. 
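As a rough sketch of the splitting just described, the whole character-level vocabulary can be derived from the characters of a text; the choice of printable ASCII characters here is only an example.

from string import printable

# A character-level vocabulary stays tiny: here, the printable ASCII characters.
char_to_id = {ch: i for i, ch in enumerate(printable)}

text = "Let's tokenize!"
ids = [char_to_id[ch] for ch in text]

print(len(char_to_id), "characters in the vocabulary")
print(ids)  # one token per character, so sequences grow quickly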
+ +38 +00:01:52,380 --> 00:01:54,360 +Intuitively, characters do not hold + +39 +00:01:54,360 --> 00:01:57,990 +as much information individually +as a word would hold. + +40 +00:01:57,990 --> 00:02:00,930 +For example, "Let's" +holds more information + +41 +00:02:00,930 --> 00:02:03,570 +than it's first letter "l". + +42 +00:02:03,570 --> 00:02:05,880 +Of course, this is not +true for all languages, + +43 +00:02:05,880 --> 00:02:08,880 +as some languages like +ideogram-based languages + +44 +00:02:08,880 --> 00:02:11,523 +have a lot of information +held in single characters, + +45 +00:02:12,750 --> 00:02:15,360 +but for others like roman-based languages, + +46 +00:02:15,360 --> 00:02:17,760 +the model will have to make +sense of multiple tokens + +47 +00:02:17,760 --> 00:02:20,670 +at a time to get the +information otherwise held + +48 +00:02:20,670 --> 00:02:21,753 +in a single word. + +49 +00:02:23,760 --> 00:02:27,000 +This leads to another issue +with character-based tokenizers, + +50 +00:02:27,000 --> 00:02:29,520 +their sequences are translated +into very large amount + +51 +00:02:29,520 --> 00:02:31,593 +of tokens to be processed by the model. + +52 +00:02:33,090 --> 00:02:36,810 +And this can have an impact +on the size of the context + +53 +00:02:36,810 --> 00:02:40,020 +the model will carry around, +and will reduce the size + +54 +00:02:40,020 --> 00:02:42,030 +of the text we can use +as input for our model, + +55 +00:02:42,030 --> 00:02:43,233 +which is often limited. + +56 +00:02:44,100 --> 00:02:46,650 +This tokenization, while +it has some issues, + +57 +00:02:46,650 --> 00:02:48,720 +has seen some very good +results in the past + +58 +00:02:48,720 --> 00:02:50,490 +and so it should be +considered when approaching + +59 +00:02:50,490 --> 00:02:52,680 +a new problem as it solves issues + +60 +00:02:52,680 --> 00:02:54,843 +encountered in the word-based algorithm. + +61 +00:02:56,107 --> 00:02:58,774 +(page whirring) + diff --git a/subtitles/en/15_subword-based-tokenizers.srt b/subtitles/en/15_subword-based-tokenizers.srt index 49a3d6d2d..6c4ef7ff3 100644 --- a/subtitles/en/15_subword-based-tokenizers.srt +++ b/subtitles/en/15_subword-based-tokenizers.srt @@ -1,159 +1,323 @@ -1 -00:00:06,320 --> 00:00:11,440 -Let's take a look at subword-based tokenization.  -Understanding why subword-based tokenization   - -2 -00:00:11,440 --> 00:00:16,320 -is interesting requires understanding the flaws  -of word-based and character-based tokenization.   - -3 -00:00:17,200 --> 00:00:21,760 -If you haven't seen the first videos on  -word-based and character-based tokenization,   - -4 -00:00:21,760 --> 00:00:24,400 -we recommend you check them out  -before looking at this video.   - -5 -00:00:27,680 --> 00:00:33,440 -Subword-tokenization lies in between  -character-based and word-based tokenization   - -6 -00:00:33,440 --> 00:00:40,960 -algorithms. The idea is to find a middle ground  -between very large vocabularies, large quantity of   - -7 -00:00:40,960 --> 00:00:47,040 -out-of-vocabulary tokens, loss of meaning across  -very similar words, for word-based tokenizers,   - -8 -00:00:47,040 --> 00:00:52,800 -and very long sequences, less meaningful  -individual tokens for character-based tokenizers.   - -9 -00:00:54,720 --> 00:00:59,360 -These algorithms rely on the following  -principle: frequently used words should not   - -10 -00:00:59,360 --> 00:01:04,800 -be split into smaller subwords, but rare words  -should be decomposed into meaningful subwords.   
- -11 -00:01:06,320 --> 00:01:11,520 -An example is the word dog: we would like to have  -our tokenizer to have a single ID for the word   - -12 -00:01:11,520 --> 00:01:18,480 -dog, rather than splitting it into characters:  -d, o, and g. However, when encountering the word   - -13 -00:01:18,480 --> 00:01:23,920 -dogs, we would like our tokenizer to understand  -that at the root, this is still the word dog,   - -14 -00:01:23,920 --> 00:01:31,280 -with an added s while slightly changes the meaning  -while keeping the original idea. Another example   - -15 -00:01:31,280 --> 00:01:37,520 -is a complex word like tokenization, which can  -be split into meaningful subwords. The root of   - -16 -00:01:37,520 --> 00:01:42,000 -the word is token, and ization completes the  -root to give it a slightly different meaning.   - -17 -00:01:42,720 --> 00:01:48,960 -It makes sense to split the word into two: token,  -as the root of the word (labeled as the "start" of   - -18 -00:01:48,960 --> 00:01:53,840 -the word). ization as additional information  -(labeled as a "completion" of the word).   - -19 -00:01:56,240 --> 00:02:00,320 -In turn, the model will now be able to make  -sense of token in different situations.   - -20 -00:02:00,880 --> 00:02:06,400 -It will understand that the words token, tokens,  -tokenizing, and tokenization are linked and have   - -21 -00:02:06,400 --> 00:02:14,000 -a similar meaning. It will also understand that  -tokenization, modernization, and immunization,   - -22 -00:02:14,000 --> 00:02:18,960 -which all have the same suffixes, are probably  -used in the same syntactic situations.   - -23 -00:02:20,320 --> 00:02:25,920 -Subword-based tokenizers generally have a way  -to identify which tokens are start of words, and   - -24 -00:02:25,920 --> 00:02:34,320 -which tokens complete start of words: token as the  -start of a word. ##ization as completing a word.   - -25 -00:02:34,960 --> 00:02:40,800 -Here the ## prefix indicates that ization is  -part of a word rather than the beginning of it.   - -26 -00:02:41,760 --> 00:02:49,440 -The ## comes from the BERT tokenizer, based on the  -WordPiece algorithm. Other tokenizers use other   - -27 -00:02:49,440 --> 00:02:54,720 -prefixes, which can be placed to indicate part of  -words like seen here, or start of words instead!   - -28 -00:02:56,000 --> 00:03:01,040 -There are a lot of different algorithms that can  -be used for subword tokenization, and most models   - -29 -00:03:01,040 --> 00:03:05,760 -obtaining state-of-the-art results in English  -today use some kind of subword-tokenization   - -30 -00:03:05,760 --> 00:03:12,320 -algorithm. These approaches help in reducing  -the vocabulary sizes by sharing information   - -31 -00:03:12,320 --> 00:03:17,840 -across different words, having the ability to  -have prefixes and suffixes understood as such.   - -32 -00:03:18,480 --> 00:03:27,760 -They keep meaning across very similar words,  -by recognizing similar tokens making them up. +1 +00:00:06,450 --> 00:00:09,540 +- Let's take a look at +subword based tokenization. + +2 +00:00:09,540 --> 00:00:11,610 +Understanding why subword +based tokenization is + +3 +00:00:11,610 --> 00:00:13,980 +interesting requires +understanding the flaws + +4 +00:00:13,980 --> 00:00:17,340 +of word based and corrector +based tokenization. 
+

5
00:00:17,340 --> 00:00:18,780
If you haven't seen the first videos

6
00:00:18,780 --> 00:00:22,020
on word based and character
based tokenization

7
00:00:22,020 --> 00:00:23,130
we recommend you check them

8
00:00:23,130 --> 00:00:24,780
out before looking at this video.

9
00:00:27,840 --> 00:00:31,493
Subword based tokenization
lies in between character based

10
00:00:31,493 --> 00:00:35,280
and word based tokenization algorithms.

11
00:00:35,280 --> 00:00:37,410
The idea is to find a middle ground

12
00:00:37,410 --> 00:00:39,486
between very large vocabularies

13
00:00:39,486 --> 00:00:42,600
a large quantity of out-of-vocabulary tokens

14
00:00:42,600 --> 00:00:45,360
and a loss of meaning
across very similar words

15
00:00:45,360 --> 00:00:48,630
for word based tokenizers
and very long sequences

16
00:00:48,630 --> 00:00:51,330
as well as less meaningful
individual tokens

17
00:00:51,330 --> 00:00:53,133
for character based tokenizers.

18
00:00:54,840 --> 00:00:57,960
These algorithms rely on
the following principle.

19
00:00:57,960 --> 00:01:00,000
Frequently used words should not be split

20
00:01:00,000 --> 00:01:01,500
into smaller subwords

21
00:01:01,500 --> 00:01:03,433
while rare words should be decomposed

22
00:01:03,433 --> 00:01:05,103
into meaningful subwords.

23
00:01:06,510 --> 00:01:08,460
An example is the word dog.

24
00:01:08,460 --> 00:01:11,190
We would like our
tokenizer to have a single ID

25
00:01:11,190 --> 00:01:12,600
for the word dog rather

26
00:01:12,600 --> 00:01:15,363
than splitting it into
characters D, O and G.

27
00:01:16,650 --> 00:01:19,260
However, when encountering the word dogs

28
00:01:19,260 --> 00:01:22,710
we would like our tokenizer to
understand that at the root

29
00:01:22,710 --> 00:01:24,120
this is still the word dog.

30
00:01:24,120 --> 00:01:27,030
With an added S, that
slightly changes the meaning

31
00:01:27,030 --> 00:01:28,923
while keeping the original idea.

32
00:01:30,600 --> 00:01:34,080
Another example is a complex
word like tokenization

33
00:01:34,080 --> 00:01:37,140
which can be split into
meaningful subwords.

34
00:01:37,140 --> 00:01:37,973
The root

35
00:01:37,973 --> 00:01:40,590
of the word is token and
-ization completes the root

36
00:01:40,590 --> 00:01:42,870
to give it a slightly different meaning.

37
00:01:42,870 --> 00:01:44,430
It makes sense to split the word

38
00:01:44,430 --> 00:01:47,640
into two, token as the root of the word,

39
00:01:47,640 --> 00:01:49,950
labeled as the start of the word

40
00:01:49,950 --> 00:01:52,530
and ization as additional
information labeled

41
00:01:52,530 --> 00:01:54,393
as a completion of the word.

42
00:01:55,826 --> 00:01:58,740
In turn, the model will
now be able to make sense

43
00:01:58,740 --> 00:02:01,080
of token in different situations.

44
00:02:01,080 --> 00:02:04,602
It will understand that the
words token, tokens, tokenizing

45
00:02:04,602 --> 00:02:08,760
and tokenization have a
similar meaning and are linked.

46
00:02:08,760 --> 00:02:12,450
It will also understand that
tokenization, modernization

47
00:02:12,450 --> 00:02:16,200
and immunization, which
all have the same suffixes

48
00:02:16,200 --> 00:02:19,383
are probably used in the
same syntactic situations.
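An easy way to see these subword splits in practice is to load a pretrained tokenizer and inspect its output; this sketch assumes the transformers library is installed and the bert-base-uncased checkpoint can be downloaded.

from transformers import AutoTokenizer

# BERT's WordPiece tokenizer keeps frequent words whole and splits rarer ones.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

print(tokenizer.tokenize("dog"))           # a frequent word stays a single token
print(tokenizer.tokenize("tokenization"))  # split into subwords, e.g. ['token', '##ization']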
+

49
00:02:20,610 --> 00:02:23,130
Subword based tokenizers
generally have a way to

50
00:02:23,130 --> 00:02:25,890
identify which tokens are a start of word

51
00:02:25,890 --> 00:02:28,443
and which tokens complete start of words.

52
00:02:29,520 --> 00:02:31,140
So here token as the start

53
00:02:31,140 --> 00:02:35,100
of a word and ##ization
as the completion of a word.

54
00:02:35,100 --> 00:02:38,103
Here, the ## prefix
indicates that ization is part

55
00:02:38,103 --> 00:02:41,013
of a word rather than the beginning of it.

56
00:02:41,910 --> 00:02:43,110
The ## prefix comes

57
00:02:43,110 --> 00:02:47,013
from the BERT tokenizer, based
on the WordPiece algorithm.

58
00:02:47,850 --> 00:02:50,700
Other tokenizers use other
prefixes which can be

59
00:02:50,700 --> 00:02:52,200
placed to indicate part of words

60
00:02:52,200 --> 00:02:55,083
like shown here, or start of words instead.

61
00:02:56,250 --> 00:02:57,083
There are a lot

62
00:02:57,083 --> 00:02:58,740
of different algorithms that can be used

63
00:02:58,740 --> 00:03:00,090
for subword tokenization

64
00:03:00,090 --> 00:03:02,670
and most models obtaining
state-of-the-art results

65
00:03:02,670 --> 00:03:03,780
in English today

66
00:03:03,780 --> 00:03:06,663
use some kind of subword
tokenization algorithm.

67
00:03:07,620 --> 00:03:10,953
These approaches help in
reducing the vocabulary sizes

68
00:03:10,953 --> 00:03:13,636
by sharing information
across different words

69
00:03:13,636 --> 00:03:15,960
having the ability to have prefixes

70
00:03:15,960 --> 00:03:18,630
and suffixes understood as such.

71
00:03:18,630 --> 00:03:20,700
They keep meaning across
very similar words

72
00:03:20,700 --> 00:03:23,103
by recognizing similar
tokens making them up.

diff --git a/subtitles/en/16_the-tokenization-pipeline.srt b/subtitles/en/16_the-tokenization-pipeline.srt
index 537af57bb..f0da01106 100644
--- a/subtitles/en/16_the-tokenization-pipeline.srt
+++ b/subtitles/en/16_the-tokenization-pipeline.srt
@@ -1,168 +1,339 @@
-1
-00:00:05,440 --> 00:00:12,320
-The tokenizer pipeline. In this video, we'll look  
-at how a tokenizer converts raw text to numbers   

-2
-00:00:12,320 --> 00:00:18,080
-that a Transformer model can make sense of,  
-like when we execute this code. Here is a quick   

-3
-00:00:18,080 --> 00:00:24,400
-overview of what happens inside the tokenizer  
-object: first the text is split into tokens, which   

-4
-00:00:24,400 --> 00:00:31,280
-are words, parts of words, or punctuation symbols.  
-Then the tokenizer adds potential special tokens   

-5
-00:00:31,280 --> 00:00:36,560
-and converts each token to their unique respective  
-ID as defined by the tokenizer's vocabulary.   

-6
-00:00:37,520 --> 00:00:41,440
-As we'll see it doesn't actually happen  
-in this order, but viewing it like this   

-7
-00:00:41,440 --> 00:00:46,320
-is better for understanding what happens.  
-The first step is to split our input text   

-8
-00:00:46,320 --> 00:00:53,840
-into tokens with the tokenize method. To do this,  
-the tokenizer may first perform some operations   

-9
-00:00:53,840 --> 00:00:58,000
-like lowercasing all words, then follow a  
-set of rules to split the result in small   

-10
-00:00:58,000 --> 00:01:03,520
-chunks of text. 
Most of the Transformers  -models use a subword tokenization algorithm,   - -11 -00:01:04,160 --> 00:01:08,720 -which means that one given word can be  -split in several tokens, like tokenize   - -12 -00:01:08,720 --> 00:01:13,360 -here. Look at the "Tokenization algorithms"  -videos linked below for more information!   - -13 -00:01:14,480 --> 00:01:19,600 -The ## prefix we see in front of ize is  -the convention used by BERT to indicate   - -14 -00:01:19,600 --> 00:01:26,080 -this token is not the beginning of a word. Other  -tokenizers may use different conventions however:   - -15 -00:01:26,080 --> 00:01:31,040 -for instance ALBERT tokenizers will add  -a long underscore in front of all the   - -16 -00:01:31,040 --> 00:01:36,640 -tokens that had a space before them, which is  -a convention used by sentencepiece tokenizers.   - -17 -00:01:38,320 --> 00:01:43,280 -The second step of the tokenization pipeline  -is to map those tokens to their respective IDs   - -18 -00:01:43,280 --> 00:01:48,960 -as defined by the vocabulary of the tokenizer.  -This is why we need to download a file when we   - -19 -00:01:48,960 --> 00:01:53,600 -instantiate a tokenizer with the from_pretrained  -method: we have to make sure we use the same   - -20 -00:01:53,600 --> 00:01:59,520 -mapping as when the model was pretrained. To do  -this, we use the convert_tokens_to_ids method.   - -21 -00:02:00,720 --> 00:02:05,360 -You may have noticed that we don't have the  -exact same result as in our first slide — or not,   - -22 -00:02:05,360 --> 00:02:09,840 -as this looks like a list of random numbers,  -in which case allow me to refresh your memory.   - -23 -00:02:10,480 --> 00:02:13,680 -We had a number at the beginning  -and at the end that are missing,   - -24 -00:02:14,400 --> 00:02:20,160 -those are the special tokens. The special tokens  -are added by the prepare_for_model method,   - -25 -00:02:20,160 --> 00:02:25,280 -which knows the indices of those tokens in the  -vocabulary and just adds the proper numbers.   - -26 -00:02:28,320 --> 00:02:32,480 -You can look at the special tokens (and more  -generally at how the tokenizer has changed   - -27 -00:02:32,480 --> 00:02:37,120 -your text) by using the decode method  -on the outputs of the tokenizer object.   - -28 -00:02:38,240 --> 00:02:44,080 -As for the prefix for beginning of words/part  -of words, those special tokens vary depending on   - -29 -00:02:44,080 --> 00:02:50,080 -which tokenizer you are using. The BERT tokenizer  -uses [CLS] and [SEP] but the roberta tokenizer   - -30 -00:02:50,080 --> 00:02:57,520 -uses html-like anchors and . Now that  -you know how the tokenizer works, you can forget   - -31 -00:02:57,520 --> 00:03:02,560 -all those intermediaries methods and only remember  -that you just have to call it on your input texts.   - -32 -00:03:03,600 --> 00:03:06,880 -The inputs don't contain the inputs IDs however,   - -33 -00:03:07,520 --> 00:03:11,600 -to learn what the attention mask is, check  -out the "Batch inputs together" video.   - -34 -00:03:12,160 --> 00:03:17,840 -To learn about token type IDs, look at  -the "Process pairs of sentences" video. +1 +00:00:00,479 --> 00:00:03,396 +(object whooshing) + +2 +00:00:05,610 --> 00:00:06,873 +- The tokenizer pipeline. 
+

3
00:00:07,920 --> 00:00:10,570
In this video, we'll look
at how a tokenizer converts

4
00:00:11,433 --> 00:00:12,480
raw texts to numbers,

5
00:00:12,480 --> 00:00:14,970
that a Transformer
model can make sense of,

6
00:00:14,970 --> 00:00:16,520
like when we execute this code.

7
00:00:17,760 --> 00:00:18,690
Here is a quick overview

8
00:00:18,690 --> 00:00:21,630
of what happens inside
the tokenizer object:

9
00:00:21,630 --> 00:00:24,360
first, the text is split into tokens,

10
00:00:24,360 --> 00:00:27,453
which are words, parts of
words, or punctuation symbols.

11
00:00:28,440 --> 00:00:31,500
Then the tokenizer adds
potential special tokens

12
00:00:31,500 --> 00:00:34,680
and converts each token to
their unique respective ID

13
00:00:34,680 --> 00:00:36,843
as defined by the tokenizer's vocabulary.

14
00:00:37,710 --> 00:00:40,380
As we'll see, it doesn't
quite happen in this order,

15
00:00:40,380 --> 00:00:43,233
but doing it like this is
better for understanding.

16
00:00:44,280 --> 00:00:47,670
The first step is to split
our input text into tokens.

17
00:00:47,670 --> 00:00:49,653
We use the tokenize method for this.

18
00:00:50,550 --> 00:00:54,030
To do that, the tokenizer may
first perform some operations,

19
00:00:54,030 --> 00:00:56,880
like lowercasing all words,
then follow a set of rules

20
00:00:56,880 --> 00:00:59,283
to split the result in
small chunks of text.

21
00:01:00,480 --> 00:01:02,286
Most of the Transformer models use

22
00:01:02,286 --> 00:01:04,890
a subword tokenization algorithm, which means

23
00:01:04,890 --> 00:01:06,750
that one given word can be split

24
00:01:06,750 --> 00:01:10,050
in several tokens like tokenize here.

25
00:01:10,050 --> 00:01:12,570
Look at the "Tokenization
algorithms" video link below

26
00:01:12,570 --> 00:01:13,743
for more information.

27
00:01:14,760 --> 00:01:17,820
The ## prefix we see in front of ize is

28
00:01:17,820 --> 00:01:19,830
a convention used by BERT to indicate

29
00:01:19,830 --> 00:01:22,762
this token is not the
beginning of the word.

30
00:01:22,762 --> 00:01:26,310
Other tokenizers may use
different conventions however:

31
00:01:26,310 --> 00:01:29,984
for instance, ALBERT tokenizers
will add a long underscore

32
00:01:29,984 --> 00:01:31,620
in front of all the tokens

33
00:01:31,620 --> 00:01:34,920
that had a space before them,
which is a convention shared

34
00:01:34,920 --> 00:01:37,700
by all sentencepiece tokenizers.

35
00:01:38,580 --> 00:01:41,040
The second step of the
tokenization pipeline is

36
00:01:41,040 --> 00:01:43,470
to map those tokens to
their respective IDs

37
00:01:43,470 --> 00:01:45,770
as defined by the
vocabulary of the tokenizer.

38
00:01:46,770 --> 00:01:48,690
This is why we need to download the file

39
00:01:48,690 --> 00:01:50,580
when we instantiate a tokenizer

40
00:01:50,580 --> 00:01:52,400
with the from_pretrained method.

41
00:01:52,400 --> 00:01:54,390
We have to make sure
we use the same mapping

42
00:01:54,390 --> 00:01:56,520
as when the model was pretrained.

43
00:01:56,520 --> 00:01:59,703
To do this, we use the
convert_tokens_to_ids method.
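Here is a sketch of those steps run by hand, including the special tokens and decoding discussed next; the checkpoint is only an example, and in everyday use calling the tokenizer directly on the text does all of this for you.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
text = "Let's try to tokenize!"

tokens = tokenizer.tokenize(text)                      # step 1: split into tokens
input_ids = tokenizer.convert_tokens_to_ids(tokens)    # step 2: map tokens to vocabulary IDs
final_inputs = tokenizer.prepare_for_model(input_ids)  # add the special tokens

print(tokens)
print(input_ids)
print(tokenizer.decode(final_inputs["input_ids"]))     # shows the special tokens added around the text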
+

44
00:02:01,050 --> 00:02:01,883
We may have noticed

45
00:02:01,883 --> 00:02:03,540
that we don't have the exact same results

46
00:02:03,540 --> 00:02:05,580
as in our first slide, or not

47
00:02:05,580 --> 00:02:07,920
as this looks like a list
of random numbers anyway,

48
00:02:07,920 --> 00:02:10,680
in which case, allow me
to refresh your memory.

49
00:02:10,680 --> 00:02:12,350
We had a number at
the beginning and a number

50
00:02:12,350 --> 00:02:17,130
at the end that are missing,
those are the special tokens.

51
00:02:17,130 --> 00:02:20,340
The special tokens are added
by the prepare_for_model method

52
00:02:20,340 --> 00:02:22,350
which knows the indices of those tokens

53
00:02:22,350 --> 00:02:25,680
in the vocabulary and just
adds the proper numbers

54
00:02:25,680 --> 00:02:27,243
in the input IDs list.

55
00:02:28,590 --> 00:02:29,541
You can look at the special tokens

56
00:02:29,541 --> 00:02:30,990
and, more generally,

57
00:02:30,990 --> 00:02:33,870
at how the tokenizer
has changed your text,

58
00:02:33,870 --> 00:02:35,280
by using the decode method

59
00:02:35,280 --> 00:02:37,503
on the outputs of the tokenizer object.

60
00:02:38,490 --> 00:02:39,423
As for the prefix for beginning

61
00:02:39,423 --> 00:02:44,160
of words/ part of words, the
special tokens vary depending

62
00:02:44,160 --> 00:02:46,500
on which tokenizer you're using.

63
00:02:46,500 --> 00:02:48,810
So the BERT tokenizer uses CLS and SEP,

64
00:02:48,810 --> 00:02:52,417
but the roberta tokenizer
uses HTML-like anchors

65
00:02:52,417 --> 00:02:55,230
<s> and </s>.

66
00:02:55,230 --> 00:02:57,090
Now that you know how the tokenizer works,

67
00:02:57,090 --> 00:02:59,390
you can forget all those
intermediate methods,

68
00:03:00,283 --> 00:03:01,650
and only remember that
you just have to call it

69
00:03:01,650 --> 00:03:02,913
on your input texts.

70
00:03:03,870 --> 00:03:05,310
The output of a tokenizer doesn't

71
00:03:05,310 --> 00:03:07,853
just contain the input IDs, however.

72
00:03:07,853 --> 00:03:09,750
To learn what the attention mask is,

73
00:03:09,750 --> 00:03:12,360
check out the "Batch
inputs together" video.

74
00:03:12,360 --> 00:03:14,220
To learn about token type IDs,

75
00:03:14,220 --> 00:03:16,570
look at the "Process
pairs of sentences" video.

76
00:03:18,003 --> 00:03:20,920
(object whooshing)

diff --git a/subtitles/en/17_batching-inputs-together-(pytorch).srt b/subtitles/en/17_batching-inputs-together-(pytorch).srt
index dd1115253..ca7f696c8 100644
--- a/subtitles/en/17_batching-inputs-together-(pytorch).srt
+++ b/subtitles/en/17_batching-inputs-together-(pytorch).srt
@@ -1,144 +1,291 @@
-1
-00:00:05,200 --> 00:00:10,880
-How to batch inputs together? In this video, we  
-will see how to batch input sequences together.   

-2
-00:00:12,320 --> 00:00:16,560
-In general, the sentences we want to pass through  
-our model won't all have the same lengths.   

-3
-00:00:17,520 --> 00:00:21,280
-Here we are using the model we saw  
-in the sentiment analysis pipeline   

-4
-00:00:21,840 --> 00:00:26,800
-and want to classify two sentences.  
-When tokenizing them and mapping each   

-5
-00:00:26,800 --> 00:00:31,280
-token to its corresponding input IDs,  
-we get two lists of different lengths.   
- -6 -00:00:33,040 --> 00:00:38,400 -Trying to create a tensor or a NumPy array from  -those two lists will result in an error, because   - -7 -00:00:38,400 --> 00:00:44,560 -all arrays and tensors should be rectangular.  -One way to overcome this limit is to make the   - -8 -00:00:44,560 --> 00:00:50,160 -second sentence the same length as the first by  -adding a special token as many times as necessary.   - -9 -00:00:51,360 --> 00:00:55,760 -Another way would be to truncate the first  -sequence to the length of the second, but we   - -10 -00:00:55,760 --> 00:01:00,720 -would them lose a lot of information that might  -be necessary to properly classify the sentence.   - -11 -00:01:02,000 --> 00:01:06,720 -In general, we only truncate sentences when  -they are longer than the maximum length the   - -12 -00:01:06,720 --> 00:01:14,000 -model can handle. The value used to pad the second  -sentence should not be picked randomly: the model   - -13 -00:01:14,000 --> 00:01:19,200 -has been pretrained with a certain padding ID,  -which you can find in tokenizer.pad_token_id.   - -14 -00:01:20,800 --> 00:01:25,200 -Now that we have padded our sentences,  -we can make a batch with them. If   - -15 -00:01:25,200 --> 00:01:29,840 -we pass the two sentences to the model  -separately and batched together however,   - -16 -00:01:29,840 --> 00:01:35,120 -we notice that we don't get the same results for  -the sentence that is padded (here the second one).   - -17 -00:01:39,120 --> 00:01:42,880 -If you remember that Transformer models make  -heavy use of attention layers, this should   - -18 -00:01:42,880 --> 00:01:47,760 -not come as a total surprise: when computing  -the contextual representation of each token,   - -19 -00:01:48,560 --> 00:01:54,320 -the attention layers look at all the other words  -in the sentence. If we have just the sentence or   - -20 -00:01:54,320 --> 00:01:58,720 -the sentence with several padding tokens added,  -it's logical we don't get the same values.   - -21 -00:02:00,000 --> 00:02:05,120 -To get the same results with or without padding,  -we need to indicate to the attention layers   - -22 -00:02:05,120 --> 00:02:10,320 -that they should ignore those padding tokens.  -This is done by creating an attention mask,   - -23 -00:02:10,320 --> 00:02:16,560 -a tensor with the same shape as the input  -IDs, with zeros and ones. Ones indicate the   - -24 -00:02:16,560 --> 00:02:21,840 -tokens the attention layers should consider in the  -context and zeros the tokens they should ignore.   - -25 -00:02:23,360 --> 00:02:26,560 -Now passing this attention  -mask along with the input ids   - -26 -00:02:26,560 --> 00:02:30,720 -will give us the same results as when we sent  -the two sentences individually to the model!   - -27 -00:02:32,160 --> 00:02:36,640 -This is all done behind the scenes by the  -tokenizer when you apply it to several sentences   - -28 -00:02:36,640 --> 00:02:41,280 -with the flag padding=True. It will  -apply the padding with the proper value   - -29 -00:02:41,280 --> 00:02:49,840 -to the smaller sentences and create  -the appropriate attention mask. +1 +00:00:00,373 --> 00:00:02,956 +(subtle blast) + +2 +00:00:05,400 --> 00:00:07,590 +- How to batch inputs together. + +3 +00:00:07,590 --> 00:00:09,240 +In this video, we will see how + +4 +00:00:09,240 --> 00:00:11,073 +to batch input sequences together. + +5 +00:00:12,137 --> 00:00:15,420 +In general, the sentences we +want to pass through our model + +6 +00:00:15,420 --> 00:00:17,670 +won't all have the same lengths. 
+

7
00:00:17,670 --> 00:00:19,740
Here, we are using the model we saw

8
00:00:19,740 --> 00:00:22,080
in the sentiment analysis pipeline

9
00:00:22,080 --> 00:00:24,063
and want to classify two sentences.

10
00:00:24,900 --> 00:00:27,360
When tokenizing them
and mapping each token

11
00:00:27,360 --> 00:00:29,610
to its corresponding input IDs,

12
00:00:29,610 --> 00:00:31,593
we get two lists of different lengths.

13
00:00:33,240 --> 00:00:35,340
Trying to create a tensor or a NumPy array

14
00:00:35,340 --> 00:00:38,220
from those two lists
will result in an error,

15
00:00:38,220 --> 00:00:41,043
because all arrays and
tensors should be rectangular.

16
00:00:42,240 --> 00:00:44,160
One way to overcome this limit

17
00:00:44,160 --> 00:00:45,690
is to make the second sentence

18
00:00:45,690 --> 00:00:47,640
the same length as the first

19
00:00:47,640 --> 00:00:50,463
by adding a special token
as many times as necessary.

20
00:00:51,600 --> 00:00:53,970
Another way would be to
truncate the first sequence

21
00:00:53,970 --> 00:00:55,710
to the length of the second,

22
00:00:55,710 --> 00:00:58,140
but we would then lose
a lot of information

23
00:00:58,140 --> 00:01:01,083
that might be necessary to
properly classify the sentence.

24
00:01:02,190 --> 00:01:04,830
In general, we only truncate sentences

25
00:01:04,830 --> 00:01:06,840
when they are longer
than the maximum length

26
00:01:06,840 --> 00:01:08,073
the model can handle.

27
00:01:09,720 --> 00:01:11,850
The value used to pad the second sentence

28
00:01:11,850 --> 00:01:13,740
should not be picked randomly;

29
00:01:13,740 --> 00:01:16,680
the model has been pretrained
with a certain padding ID,

30
00:01:16,680 --> 00:01:19,533
which you can find in
tokenizer.pad_token_id.

31
00:01:21,090 --> 00:01:22,800
Now that we have padded our sentences,

32
00:01:22,800 --> 00:01:24,303
we can make a batch with them.

33
00:01:25,380 --> 00:01:28,320
If we pass the two sentences
to the model separately

34
00:01:28,320 --> 00:01:30,120
and batched together however,

35
00:01:30,120 --> 00:01:32,100
we notice that we don't
get the same results

36
00:01:32,100 --> 00:01:34,060
for the sentence that is padded,

37
00:01:34,060 --> 00:01:35,403
here, the second one.

38
00:01:36,390 --> 00:01:39,420
Is this a bug in the
Transformers library? No.

39
00:01:39,420 --> 00:01:40,770
If you remember that Transformer models

40
00:01:40,770 --> 00:01:42,810
make heavy use of attention layers,

41
00:01:42,810 --> 00:01:45,210
this should not come as a total surprise;

42
00:01:45,210 --> 00:01:48,277
when computing the contextual
representation of each token,

43
00:01:48,277 --> 00:01:50,910
the attention layers look
at all the other words

44
00:01:50,910 --> 00:01:52,410
in the sentence.

45
00:01:52,410 --> 00:01:53,850
If we have just the sentence

46
00:01:53,850 --> 00:01:56,970
or the sentence with several
padding tokens added,

47
00:01:56,970 --> 00:01:59,073
it's logical we don't get the same values.

48
00:02:00,270 --> 00:02:03,030
To get the same results
with or without padding,

49
00:02:03,030 --> 00:02:05,340
we need to indicate to
the attention layers

50
00:02:05,340 --> 00:02:08,070
that they should ignore
those padding tokens.
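A sketch of the padding and attention-mask logic described here, in PyTorch; the checkpoint is the one behind the default sentiment-analysis pipeline, the two sentences are only examples, and the first one is assumed to be the longer of the two.

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

ids1 = tokenizer("I've been waiting for a HuggingFace course my whole life.")["input_ids"]
ids2 = tokenizer("I hate this so much!")["input_ids"]

# Pad the shorter sequence with the tokenizer's padding ID...
padding_length = len(ids1) - len(ids2)
ids2_padded = ids2 + [tokenizer.pad_token_id] * padding_length

# ...and mark the padding positions with 0 in the attention mask.
attention_mask = [
    [1] * len(ids1),
    [1] * len(ids2) + [0] * padding_length,
]

outputs = model(
    input_ids=torch.tensor([ids1, ids2_padded]),
    attention_mask=torch.tensor(attention_mask),
)
print(outputs.logits)  # matches the logits of each sentence passed individually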
+ +51 +00:02:08,070 --> 00:02:10,620 +This is done by creating +an attention mask, + +52 +00:02:10,620 --> 00:02:13,320 +a tensor with the same +shape as the input IDs, + +53 +00:02:13,320 --> 00:02:14,733 +with zeros and ones. + +54 +00:02:15,780 --> 00:02:18,120 +Ones indicate the tokens +the attention layers + +55 +00:02:18,120 --> 00:02:20,100 +should consider in the context + +56 +00:02:20,100 --> 00:02:22,100 +and zeros the tokens they should ignore. + +57 +00:02:23,520 --> 00:02:26,760 +Now, passing this attention +mask along with the input ID + +58 +00:02:26,760 --> 00:02:28,170 +will give us the same results + +59 +00:02:28,170 --> 00:02:31,170 +as when we sent the two sentences +individually to the model. + +60 +00:02:32,400 --> 00:02:34,950 +This is all done behind +the scenes by the tokenizer + +61 +00:02:34,950 --> 00:02:36,900 +when you apply it to several sentences + +62 +00:02:36,900 --> 00:02:38,613 +with the flag padding=True. + +63 +00:02:39,599 --> 00:02:41,490 +It will apply the padding +with the proper value + +64 +00:02:41,490 --> 00:02:43,140 +to the smaller sentences + +65 +00:02:43,140 --> 00:02:45,423 +and create the appropriate attention mask. + +66 +00:02:46,993 --> 00:02:49,576 +(subtle blast) + diff --git a/subtitles/en/18_batching-inputs-together-(tensorflow).srt b/subtitles/en/18_batching-inputs-together-(tensorflow).srt index 8c78a8f0c..c31449704 100644 --- a/subtitles/en/18_batching-inputs-together-(tensorflow).srt +++ b/subtitles/en/18_batching-inputs-together-(tensorflow).srt @@ -1,138 +1,281 @@ -1 -00:00:05,120 --> 00:00:10,880 -How to batch inputs together? In this video, we  -will see how to batch input sequences together.   - -2 -00:00:12,480 --> 00:00:16,560 -In general, the sentences we want to pass  -through our model won't all have the same   - -3 -00:00:16,560 --> 00:00:23,520 -lengths. Here we are using the model we saw in the  -sentiment analysis pipeline and want to classify   - -4 -00:00:23,520 --> 00:00:29,760 -two sentences. When tokenizing them and mapping  -each token to its corresponding input IDs,   - -5 -00:00:29,760 --> 00:00:31,680 -we get two lists of different lengths.   - -6 -00:00:33,120 --> 00:00:38,240 -Trying to create a tensor or a NumPy array from  -those two lists will result in an error, because   - -7 -00:00:38,240 --> 00:00:44,320 -all arrays and tensors should be rectangular.  -One way to overcome this limit is to make the   - -8 -00:00:44,320 --> 00:00:50,080 -second sentence the same length as the first by  -adding a special token as many times as necessary.   - -9 -00:00:51,040 --> 00:00:55,360 -Another way would be to truncate the first  -sequence to the length of the second, but we   - -10 -00:00:55,360 --> 00:01:00,080 -would them lose a lot of information that might  -be necessary to properly classify the sentence.   - -11 -00:01:01,040 --> 00:01:05,760 -In general, we only truncate sentences when  -they are longer than the maximum length the   - -12 -00:01:05,760 --> 00:01:12,560 -model can handle. The value used to pad the second  -sentence should not be picked randomly: the model   - -13 -00:01:12,560 --> 00:01:18,000 -has been pretrained with a certain padding ID,  -which you can find in tokenizer.pad_token_id.   - -14 -00:01:19,760 --> 00:01:22,640 -Now that we have padded our sentences,  -we can make a batch with them.   
- -15 -00:01:23,920 --> 00:01:28,400 -If we pass the two sentences to the model  -separately and batched together however,   - -16 -00:01:28,400 --> 00:01:33,600 -we notice that we don't get the same results for  -the sentence that is padded (here the second one).   - -17 -00:01:37,360 --> 00:01:41,440 -If you remember that Transformer models make  -heavy use of attention layers, this should   - -18 -00:01:41,440 --> 00:01:46,800 -not come as a total surprise: when computing  -the contextual representation of each token,   - -19 -00:01:46,800 --> 00:01:52,800 -the attention layers look at all the other words  -in the sentence. If we have just the sentence or   - -20 -00:01:52,800 --> 00:01:57,200 -the sentence with several padding tokens added,  -it's logical we don't get the same values.   - -21 -00:01:58,560 --> 00:02:03,520 -To get the same results with or without padding,  -we need to indicate to the attention layers   - -22 -00:02:03,520 --> 00:02:08,640 -that they should ignore those padding tokens.  -This is done by creating an attention mask,   - -23 -00:02:08,640 --> 00:02:15,920 -a tensor with the same shape as the input IDs,  -with zeros and ones. Ones indicate the tokens the   - -24 -00:02:15,920 --> 00:02:22,160 -attention layers should consider in the context  -and zeros the tokens they should ignore. Now   - -25 -00:02:22,160 --> 00:02:27,040 -passing this attention mask along with the input  -ids will give us the same results as when we sent   - -26 -00:02:27,040 --> 00:02:33,600 -the two sentences individually to the model! This  -is all done behind the scenes by the tokenizer   - -27 -00:02:33,600 --> 00:02:39,680 -when you apply it to several sentences with the  -flag padding=True. It will apply the padding with   - -28 -00:02:39,680 --> 00:02:49,840 -the proper value to the smaller sentences  -and create the appropriate attention mask. +1 +00:00:00,458 --> 00:00:02,791 +(logo whooshes) + +2 +00:00:05,310 --> 00:00:07,590 +- How to batch inputs together. + +3 +00:00:07,590 --> 00:00:09,150 +In this video, we'll see + +4 +00:00:09,150 --> 00:00:11,050 +how to batch input sequences together. + +5 +00:00:12,630 --> 00:00:14,910 +In general, the sentences we want to pass + +6 +00:00:14,910 --> 00:00:18,000 +through our model won't +all have the same lengths. + +7 +00:00:18,000 --> 00:00:20,310 +Here, we are using the model we saw + +8 +00:00:20,310 --> 00:00:22,650 +in the sentiment analysis pipeline + +9 +00:00:22,650 --> 00:00:24,753 +and want to classify two sentences. + +10 +00:00:25,860 --> 00:00:27,870 +When tokenizing them +and mapping each token + +11 +00:00:27,870 --> 00:00:30,000 +to its corresponding input IDs, + +12 +00:00:30,000 --> 00:00:31,900 +we get two lists of different lengths. + +13 +00:00:33,360 --> 00:00:35,070 +Trying to create a tensor and NumPy array + +14 +00:00:35,070 --> 00:00:38,100 +from those two lists +will result in an error + +15 +00:00:38,100 --> 00:00:40,953 +because all arrays and +tensors should be rectangular. + +16 +00:00:42,510 --> 00:00:43,920 +One way to overcome this limit + +17 +00:00:43,920 --> 00:00:47,340 +is to make the second sentence +the same length as the first + +18 +00:00:47,340 --> 00:00:50,373 +by adding a special token +as many times as necessary. 
+

19
00:00:51,300 --> 00:00:53,340
Another way would be to
truncate the first sequence

20
00:00:53,340 --> 00:00:56,550
to the length of the second,
but we would then lose a lot

21
00:00:56,550 --> 00:00:58,590
of information that may be necessary

22
00:00:58,590 --> 00:01:01,230
to properly classify the sentence.

23
00:01:01,230 --> 00:01:04,710
In general, we only truncate
sentences when they are longer

24
00:01:04,710 --> 00:01:07,083
than the maximum length
the model can handle.

25
00:01:08,310 --> 00:01:10,320
The value used to pad the second sentence

26
00:01:10,320 --> 00:01:12,390
should not be picked randomly.

27
00:01:12,390 --> 00:01:15,330
The model has been pretrained
with a certain padding ID,

28
00:01:15,330 --> 00:01:18,093
which you can find in
tokenizer.pad_token_id.

29
00:01:19,950 --> 00:01:21,630
Now that we have padded our sentences,

30
00:01:21,630 --> 00:01:23,130
we can make a batch with them.

31
00:01:24,210 --> 00:01:26,730
If we pass the two sentences
to the model separately

32
00:01:26,730 --> 00:01:29,130
or batched together, however, we notice

33
00:01:29,130 --> 00:01:30,630
that we don't get the same results

34
00:01:30,630 --> 00:01:32,070
for the sentence that is padded.

35
00:01:32,070 --> 00:01:34,440
Here, the second one.

36
00:01:34,440 --> 00:01:36,690
Is this a bug in the
Transformers library?

37
00:01:36,690 --> 00:01:37,620
No.

38
00:01:37,620 --> 00:01:39,720
If you remember that Transformer
models make heavy use

39
00:01:39,720 --> 00:01:43,800
of attention layers, it should
not come as a total surprise.

40
00:01:43,800 --> 00:01:47,100
When computing the contextual
representation of each token,

41
00:01:47,100 --> 00:01:49,440
the attention layers look
at all the other words

42
00:01:49,440 --> 00:01:51,240
in the sentence.

43
00:01:51,240 --> 00:01:52,252
If we have just a sentence

44
00:01:52,252 --> 00:01:55,650
or the sentence with several
padding tokens added,

45
00:01:55,650 --> 00:01:57,750
it's logical we don't get the same values.

46
00:01:58,830 --> 00:02:01,410
To get the same results
with or without padding,

47
00:02:01,410 --> 00:02:03,750
we need to indicate to
the attention layers

48
00:02:03,750 --> 00:02:06,660
that they should ignore
those padding tokens.

49
00:02:06,660 --> 00:02:08,970
This is done by creating
an attention mask,

50
00:02:08,970 --> 00:02:11,700
a tensor with the same
shape as the input IDs

51
00:02:11,700 --> 00:02:13,173
with zeros and ones.

52
00:02:14,640 --> 00:02:16,830
Ones indicate the tokens
the attention layers

53
00:02:16,830 --> 00:02:18,660
should consider in the context,

54
00:02:18,660 --> 00:02:20,823
and zeros, the tokens they should ignore.

55
00:02:21,810 --> 00:02:23,290
Now, passing this attention mask

56
00:02:23,290 --> 00:02:26,460
along with the input IDs
will give us the same results

57
00:02:26,460 --> 00:02:29,460
as when we sent the two sentences
individually to the model.

58
00:02:30,870 --> 00:02:33,870
This is all done behind
the scenes by the tokenizer

59
00:02:33,870 --> 00:02:35,583
when you apply it to several sentences

60
00:02:35,583 --> 00:02:37,713
with the flag padding equals true.
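In practice you rarely build the mask yourself: as said here, padding=True does it for you. A sketch with TensorFlow tensors, where the checkpoint and sentences are illustrative and TensorFlow is assumed to be installed.

from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sentences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]
batch = tokenizer(sentences, padding=True, return_tensors="tf")

print(batch["input_ids"].shape)   # a rectangular batch, padded to the longest sentence
print(batch["attention_mask"])    # zeros over the padding tokens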
+ +61 +00:02:38,640 --> 00:02:39,690 +It will apply the padding + +62 +00:02:39,690 --> 00:02:42,180 +with the proper value +to the smaller sentences + +63 +00:02:42,180 --> 00:02:44,373 +and create the appropriate attention mask. + diff --git a/subtitles/en/19_hugging-face-datasets-overview-(pytorch).srt b/subtitles/en/19_hugging-face-datasets-overview-(pytorch).srt index 21a4ee3c3..f05f6ceab 100644 --- a/subtitles/en/19_hugging-face-datasets-overview-(pytorch).srt +++ b/subtitles/en/19_hugging-face-datasets-overview-(pytorch).srt @@ -1,164 +1,341 @@ -1 -00:00:05,120 --> 00:00:11,520 -The Hugging Face Datasets library: A Quick  -overview. The Hugging Face Datasets library   - -2 -00:00:11,520 --> 00:00:16,560 -is a library that provides an API to quickly  -download many public datasets and preprocess them.   - -3 -00:00:17,360 --> 00:00:22,560 -In this video we will explore how to do that. The  -downloading part is easy: with the load_dataset   - -4 -00:00:22,560 --> 00:00:28,400 -function, you can directly download and cache a  -dataset from its identifier on the Dataset hub.   - -5 -00:00:29,520 --> 00:00:32,720 -Here we fetch the MRPC dataset  -from the GLUE benchmark,   - -6 -00:00:33,360 --> 00:00:38,320 -which is a dataset containing pairs of sentences  -where the task is to determine the paraphrases.   - -7 -00:00:39,520 --> 00:00:45,440 -The object returned by the load_dataset function  -is a DatasetDict, which is a sort of dictionary   - -8 -00:00:45,440 --> 00:00:51,120 -containing each split of our dataset. We can  -access each split by indexing with its name.   - -9 -00:00:52,000 --> 00:00:57,440 -This split is then an instance of the  -Dataset class, with columns (here sentence1,   - -10 -00:00:57,440 --> 00:01:04,240 -sentence2. label and idx) and rows. We  -can access a given element by its index.   - -11 -00:01:05,200 --> 00:01:10,000 -The amazing thing about the Hugging Face Datasets  -library is that everything is saved to disk   - -12 -00:01:10,000 --> 00:01:15,520 -using Apache Arrow, which means that even if  -your dataset is huge you won't get out of RAM:   - -13 -00:01:16,080 --> 00:01:21,920 -only the elements you request are loaded in  -memory. Accessing a slice of your dataset is   - -14 -00:01:21,920 --> 00:01:26,720 -as easy as one element. The result is then a  -dictionary with list of values for each keys   - -15 -00:01:27,280 --> 00:01:31,600 -(here the list of labels, the list of first  -sentences and the list of second sentences).   - -16 -00:01:33,440 --> 00:01:38,880 -The features attribute of a Dataset gives us more  -information about its columns. In particular,   - -17 -00:01:38,880 --> 00:01:45,280 -we can see here it gives us the correspondence  -between the integers and names for the labels. 0   - -18 -00:01:45,280 --> 00:01:51,760 -stands for not equivalent and 1 for equivalent.  -To preprocess all the elements of our dataset,   - -19 -00:01:51,760 --> 00:01:56,800 -we need to tokenize them. Have a look at the  -video "Preprocess sentence pairs" for a refresher,   - -20 -00:01:57,360 --> 00:02:02,320 -but you just have to send the two sentences to the  -tokenizer with some additional keyword arguments.   - -21 -00:02:03,520 --> 00:02:08,560 -Here we indicate a maximum length of 128  -and pad inputs shorter than this length,   - -22 -00:02:08,560 --> 00:02:14,320 -truncate inputs that are longer. 
We put all of  -this in a tokenize_function that we can directly   - -23 -00:02:14,320 --> 00:02:20,240 -apply to all the splits in our dataset with the  -map method. As long as the function returns a   - -24 -00:02:20,240 --> 00:02:25,680 -dictionary-like object, the map method will add  -new columns as needed or update existing ones.   - -25 -00:02:27,360 --> 00:02:31,840 -To speed up preprocessing and take advantage  -of the fact our tokenizer is backed by Rust   - -26 -00:02:31,840 --> 00:02:36,880 -thanks to the Hugging Face Tokenizers library, we  -can process several elements at the same time to   - -27 -00:02:36,880 --> 00:02:42,160 -our tokenize function, using the batched=True  -argument. Since the tokenizer can handle list   - -28 -00:02:42,160 --> 00:02:48,880 -of first/second sentences, the tokenize_function  -does not need to change for this. You can also use   - -29 -00:02:49,440 --> 00:02:56,400 -multiprocessing with the map method, check out its  -documentation! Once this is done, we are almost   - -30 -00:02:56,400 --> 00:03:01,920 -ready for training: we just remove the columns we  -don't need anymore with the remove_columns method,   - -31 -00:03:01,920 --> 00:03:06,640 -rename label to labels (since the models  -from Hugging Face Transformers expect that)   - -32 -00:03:07,440 --> 00:03:14,000 -and set the output format to our desired  -backend: torch, tensorflow or numpy. If needed,   - -33 -00:03:14,000 --> 00:03:17,840 -we can also generate a short sample  -of a dataset using the select method. +1 +00:00:00,213 --> 00:00:02,963 +(slide whooshes) + +2 +00:00:05,340 --> 00:00:08,373 +- The Hugging Face Datasets +library, a quick overview. + +3 +00:00:09,990 --> 00:00:11,670 +The Hugging Face Datasets library + +4 +00:00:11,670 --> 00:00:14,310 +is a library that provides +an API to quickly download + +5 +00:00:14,310 --> 00:00:17,610 +many public datasets and preprocess them. + +6 +00:00:17,610 --> 00:00:20,614 +In this video we will +explore how to do that. + +7 +00:00:20,614 --> 00:00:21,780 +The downloading part is easy, + +8 +00:00:21,780 --> 00:00:23,760 +with the load_dataset function. + +9 +00:00:23,760 --> 00:00:26,460 +You can directly download +and cache a dataset + +10 +00:00:26,460 --> 00:00:28,473 +from its identifier on the Dataset hub. + +11 +00:00:29,640 --> 00:00:33,570 +Here, we fetch the MRPC dataset +from the GLUE benchmark, + +12 +00:00:33,570 --> 00:00:36,390 +which is a dataset +containing pairs of sentences + +13 +00:00:36,390 --> 00:00:38,740 +where the task is to +determine the paraphrases. + +14 +00:00:39,810 --> 00:00:42,420 +The object returned by +the load_dataset function + +15 +00:00:42,420 --> 00:00:45,600 +is a DatasetDict, which +is a sort of dictionary + +16 +00:00:45,600 --> 00:00:47,463 +containing each split of our dataset. + +17 +00:00:48,946 --> 00:00:52,170 +We can access each split +by indexing with its name. + +18 +00:00:52,170 --> 00:00:55,047 +This split is then an +instance of the Dataset class, + +19 +00:00:55,047 --> 00:00:58,590 +with columns, here sentence1, sentence2, + +20 +00:00:58,590 --> 00:01:01,233 +label and idx, and rows. + +21 +00:01:02,400 --> 00:01:04,563 +We can access a given +element by its index. 
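A sketch of the loading and indexing steps just described; it assumes the datasets library is installed and can download the GLUE MRPC data.

from datasets import load_dataset

raw_datasets = load_dataset("glue", "mrpc")
print(raw_datasets)                  # a DatasetDict with train, validation and test splits

raw_train = raw_datasets["train"]
print(raw_train[0])                  # one element: sentence1, sentence2, label, idx
print(raw_train[:3]["sentence1"])    # a slice gives a dictionary of lists
print(raw_train.features["label"])   # the label names: not_equivalent / equivalent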
+ +22 +00:01:05,460 --> 00:01:08,220 +The amazing thing about the +Hugging Face Datasets library + +23 +00:01:08,220 --> 00:01:11,880 +is that everything is saved +to disk using Apache Arrow, + +24 +00:01:11,880 --> 00:01:14,550 +which means that even +if your dataset is huge, + +25 +00:01:14,550 --> 00:01:16,350 +you won't get out of RAM. + +26 +00:01:16,350 --> 00:01:19,113 +Only the elements you +request are loaded in memory. + +27 +00:01:20,340 --> 00:01:23,940 +Accessing a slice of your dataset +is as easy as one element. + +28 +00:01:23,940 --> 00:01:26,220 +The result is then a +dictionary with list of values + +29 +00:01:26,220 --> 00:01:27,480 +for each keys. + +30 +00:01:27,480 --> 00:01:29,070 +Here the list of labels, + +31 +00:01:29,070 --> 00:01:30,147 +the list of first sentences + +32 +00:01:30,147 --> 00:01:31,923 +and the list of second sentences. + +33 +00:01:33,690 --> 00:01:35,580 +The features attribute of a Dataset + +34 +00:01:35,580 --> 00:01:37,470 +gives us more information +about its columns. + +35 +00:01:37,470 --> 00:01:40,020 +In particular, we can see here + +36 +00:01:40,020 --> 00:01:41,400 +it gives us the correspondence + +37 +00:01:41,400 --> 00:01:44,810 +between the integers and +names for the labels. + +38 +00:01:44,810 --> 00:01:48,543 +Zero stands for not equivalent +and one for equivalent. + +39 +00:01:49,830 --> 00:01:52,020 +To preprocess all the +elements of our dataset, + +40 +00:01:52,020 --> 00:01:53,850 +we need to tokenize them. + +41 +00:01:53,850 --> 00:01:56,160 +Have a look at the video +"Preprocess sentence pairs" + +42 +00:01:56,160 --> 00:01:57,570 +for a refresher, + +43 +00:01:57,570 --> 00:01:59,430 +but you just have to +send the two sentences + +44 +00:01:59,430 --> 00:02:02,733 +to the tokenizer with some +additional keyword arguments. + +45 +00:02:03,780 --> 00:02:06,600 +Here we indicate a maximum length of 128 + +46 +00:02:06,600 --> 00:02:08,820 +and pad inputs shorter than this length, + +47 +00:02:08,820 --> 00:02:10,420 +truncate inputs that are longer. + +48 +00:02:11,460 --> 00:02:13,470 +We put all of this in a tokenize_function + +49 +00:02:13,470 --> 00:02:16,710 +that we can directly apply to +all the splits in our dataset + +50 +00:02:16,710 --> 00:02:17,710 +with the map method. + +51 +00:02:18,840 --> 00:02:22,110 +As long as the function returns +a dictionary-like object, + +52 +00:02:22,110 --> 00:02:24,300 +the map method will add +new columns as needed + +53 +00:02:24,300 --> 00:02:26,043 +or update existing ones. + +54 +00:02:27,315 --> 00:02:28,830 +To speed up preprocessing + +55 +00:02:28,830 --> 00:02:30,870 +and take advantage of +the fact our tokenizer + +56 +00:02:30,870 --> 00:02:32,040 +is backed by Rust, + +57 +00:02:32,040 --> 00:02:34,770 +thanks to the Hugging +Face Tokenizers library, + +58 +00:02:34,770 --> 00:02:37,110 +we can process several +elements at the same time + +59 +00:02:37,110 --> 00:02:40,710 +to our tokenize function, using +the batched=True argument. + +60 +00:02:40,710 --> 00:02:42,120 +Since the tokenizer can handle + +61 +00:02:42,120 --> 00:02:44,610 +list of first sentences, +list of second sentences, + +62 +00:02:44,610 --> 00:02:47,493 +the tokenize_function does +not need to change for this. + +63 +00:02:48,360 --> 00:02:51,180 +You can also use multiprocessing +with the map method. + +64 +00:02:51,180 --> 00:02:53,583 +Check out its documentation +in the linked video. + +65 +00:02:54,840 --> 00:02:57,990 +Once this is done, we are +almost ready for training. 
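A sketch putting the preprocessing described here, and the clean-up described just after, into one script; the checkpoint and column names follow the MRPC example, and tokenize_function is our own name for the helper.

from datasets import load_dataset
from transformers import AutoTokenizer

raw_datasets = load_dataset("glue", "mrpc")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    # Tokenize the sentence pairs, padding/truncating them to 128 tokens.
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

# batched=True lets the fast tokenizer process many elements at once.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Keep only what the model needs, rename the label column and pick a backend.
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

small_train = tokenized_datasets["train"].select(range(100))  # a short sample if needed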
+ +66 +00:02:57,990 --> 00:02:59,970 +We just remove the columns +we don't need anymore + +67 +00:02:59,970 --> 00:03:02,190 +with the remove_columns method, + +68 +00:03:02,190 --> 00:03:03,750 +rename label to labels, + +69 +00:03:03,750 --> 00:03:05,790 +since the models from the +Hugging Face Transformers + +70 +00:03:05,790 --> 00:03:07,710 +library expect that, + +71 +00:03:07,710 --> 00:03:10,470 +and set the output format +to our desired backend, + +72 +00:03:10,470 --> 00:03:12,053 +Torch, TensorFlow or NumPy. + +73 +00:03:13,440 --> 00:03:16,800 +If needed, we can also generate +a short sample of a dataset + +74 +00:03:16,800 --> 00:03:18,000 +using the select method. + +75 +00:03:20,211 --> 00:03:22,961 +(slide whooshes) + diff --git a/subtitles/en/20_hugging-face-datasets-overview-(tensorflow).srt b/subtitles/en/20_hugging-face-datasets-overview-(tensorflow).srt index 56d938c9d..5daa26c88 100644 --- a/subtitles/en/20_hugging-face-datasets-overview-(tensorflow).srt +++ b/subtitles/en/20_hugging-face-datasets-overview-(tensorflow).srt @@ -1,164 +1,320 @@ -1 -00:00:05,200 --> 00:00:11,200 -The Hugging Face Datasets library: A Quick  -overview. The Hugging Face Datasets library   - -2 -00:00:11,200 --> 00:00:15,920 -is a library that provides an API to quickly  -download many public datasets and preprocess them.   - -3 -00:00:16,880 --> 00:00:22,480 -In this video we will explore how to do that. The  -downloading part is easy: with the load_dataset   - -4 -00:00:22,480 --> 00:00:27,760 -function, you can directly download and cache a  -dataset from its identifier on the Dataset hub.   - -5 -00:00:29,040 --> 00:00:34,160 -Here we fetch the MRPC dataset from  -the GLUE benchmark, which is a dataset   - -6 -00:00:34,160 --> 00:00:38,000 -containing pairs of sentences where the  -task is to determine the paraphrases.   - -7 -00:00:39,360 --> 00:00:44,880 -The object returned by the load_dataset function  -is a DatasetDict, which is a sort of dictionary   - -8 -00:00:44,880 --> 00:00:50,800 -containing each split of our dataset. We can  -access each split by indexing with its name.   - -9 -00:00:51,520 --> 00:00:55,120 -This split is then an instance of  -the Dataset class, with columns   - -10 -00:00:55,680 --> 00:01:04,000 -(here sentence1, sentence2. label and idx) and  -rows. We can access a given element by its index.   - -11 -00:01:04,880 --> 00:01:10,240 -The amazing thing about the Hugging Face Datasets  -library is that everything is saved to disk using   - -12 -00:01:10,240 --> 00:01:15,280 -Apache Arrow, which means that even if your  -dataset is huge you won't get out of RAM:   - -13 -00:01:15,920 --> 00:01:21,760 -only the elements you request are loaded in  -memory. Accessing a slice of your dataset is   - -14 -00:01:21,760 --> 00:01:27,760 -as easy as one element. The result is then a  -dictionary with list of values for each keys   - -15 -00:01:28,480 --> 00:01:35,040 -(here the list of labels, the list of first  -sentences and the list of second sentences). The   - -16 -00:01:35,040 --> 00:01:40,720 -features attribute of a Dataset gives us more  -information about its columns. In particular,   - -17 -00:01:40,720 --> 00:01:45,040 -we can see here it gives us the correspondence  -between the integers and names for the labels.   - -18 -00:01:45,920 --> 00:01:53,840 -0 stands for not equivalent and 1 for equivalent.  -To preprocess all the elements of our dataset,   - -19 -00:01:53,840 --> 00:01:59,120 -we need to tokenize them. 
Have a look at the  -video "Preprocess sentence pairs" for a refresher,   - -20 -00:01:59,840 --> 00:02:04,480 -but you just have to send the two sentences to the  -tokenizer with some additional keyword arguments.   - -21 -00:02:05,760 --> 00:02:11,200 -Here we indicate a maximum length of 128  -and pad inputs shorter than this length,   - -22 -00:02:11,200 --> 00:02:17,040 -truncate inputs that are longer. We put all of  -this in a tokenize_function that we can directly   - -23 -00:02:17,040 --> 00:02:22,320 -apply to all the splits in our dataset with the  -map method. As long as the function returns a   - -24 -00:02:22,320 --> 00:02:27,760 -dictionary-like object, the map method will add  -new columns as needed or update existing ones.   - -25 -00:02:29,840 --> 00:02:34,960 -To speed up preprocessing and take advantage  -of the fact our tokenizer is backed by Rust   - -26 -00:02:34,960 --> 00:02:40,320 -thanks to the Hugging Face Tokenizers library,  -we can process several elements at the same time   - -27 -00:02:40,320 --> 00:02:46,800 -to our tokenize function, using the batched=True  -argument. Since the tokenizer can handle list   - -28 -00:02:46,800 --> 00:02:53,360 -of first/second sentences, the tokenize_function  -does not need to change for this. You can also use   - -29 -00:02:53,360 --> 00:03:00,320 -multiprocessing with the map method, check out its  -documentation! Once this is done, we are almost   - -30 -00:03:00,320 --> 00:03:05,360 -ready for training: we just remove the columns we  -don't need anymore with the remove_columns method,   - -31 -00:03:05,920 --> 00:03:10,320 -rename label to labels (since the models  -from Hugging Face Transformers expect that)   - -32 -00:03:11,200 --> 00:03:17,280 -and set the output format to our desired  -backend: torch, tensorflow or numpy. If needed,   - -33 -00:03:17,280 --> 00:03:27,440 -we can also generate a short sample  -of a dataset using the select method. +1 +00:00:00,170 --> 00:00:03,087 +(screen whooshing) + +2 +00:00:05,371 --> 00:00:09,690 +- The Hugging Face Datasets +library: A Quick overview. + +3 +00:00:09,690 --> 00:00:10,917 +The Hugging Face Datasets library + +4 +00:00:10,917 --> 00:00:12,870 +is a library that provides an API + +5 +00:00:12,870 --> 00:00:15,150 +to quickly download many public datasets + +6 +00:00:15,150 --> 00:00:16,200 +and pre-process them. + +7 +00:00:17,070 --> 00:00:19,473 +In this video we will explore to do that. + +8 +00:00:20,520 --> 00:00:23,730 +The downloading part is easy +with the load_dataset function, + +9 +00:00:23,730 --> 00:00:26,010 +you can directly download +and cache a dataset + +10 +00:00:26,010 --> 00:00:28,023 +from its identifier on the Dataset hub. + +11 +00:00:29,160 --> 00:00:33,690 +Here we fetch the MRPC dataset +from the GLUE benchmark, + +12 +00:00:33,690 --> 00:00:36,030 +is a dataset containing pairs of sentences + +13 +00:00:36,030 --> 00:00:38,380 +where the task is to +determine the paraphrases. + +14 +00:00:39,720 --> 00:00:42,120 +The object returned by +the load_dataset function + +15 +00:00:42,120 --> 00:00:45,090 +is a DatasetDict, which +is a sort of dictionary + +16 +00:00:45,090 --> 00:00:46,940 +containing each split of our dataset. + +17 +00:00:48,600 --> 00:00:51,780 +We can access each split +by indexing with its name. 
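As a minimal sketch of the loading step just described, assuming the MRPC subset of GLUE mentioned in the transcript:

from datasets import load_dataset

raw_datasets = load_dataset("glue", "mrpc")  # downloads and caches the dataset
print(raw_datasets)                          # a DatasetDict with train/validation/test splits

train_split = raw_datasets["train"]          # index a split by its name
print(train_split[0])                        # one element, loaded from disk on demand
print(train_split.features)                  # column information, including the label names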
+ +18 +00:00:51,780 --> 00:00:54,540 +This split is then an +instance of the Dataset class, + +19 +00:00:54,540 --> 00:00:57,423 +with columns, here sentence1, sentence2, + +20 +00:00:58,350 --> 00:01:00,813 +label and idx, and rows. + +21 +00:01:02,160 --> 00:01:05,220 +We can access a given +element by its index. + +22 +00:01:05,220 --> 00:01:08,220 +The amazing thing about the +Hugging Face Datasets library + +23 +00:01:08,220 --> 00:01:11,700 +is that everything is saved +to disk using Apache Arrow, + +24 +00:01:11,700 --> 00:01:14,460 +which means that even +if your dataset is huge + +25 +00:01:14,460 --> 00:01:16,219 +you won't get out of RAM, + +26 +00:01:16,219 --> 00:01:18,769 +only the elements you +request are loaded in memory. + +27 +00:01:19,920 --> 00:01:24,510 +Accessing a slice of your dataset +is as easy as one element. + +28 +00:01:24,510 --> 00:01:27,150 +The result is then a +dictionary with list of values + +29 +00:01:27,150 --> 00:01:30,630 +for each keys, here the list of labels, + +30 +00:01:30,630 --> 00:01:32,190 +the list of first sentences, + +31 +00:01:32,190 --> 00:01:33,840 +and the list of second sentences. + +32 +00:01:35,100 --> 00:01:37,080 +The features attribute of a Dataset + +33 +00:01:37,080 --> 00:01:39,840 +gives us more information +about its columns. + +34 +00:01:39,840 --> 00:01:42,150 +In particular, we can see here it gives us + +35 +00:01:42,150 --> 00:01:43,980 +a correspondence between the integers + +36 +00:01:43,980 --> 00:01:46,110 +and names for the labels. + +37 +00:01:46,110 --> 00:01:49,623 +0 stands for not equivalent +and 1 for equivalent. + +38 +00:01:51,630 --> 00:01:54,090 +To pre-process all the +elements of our dataset, + +39 +00:01:54,090 --> 00:01:55,980 +we need to tokenize them. + +40 +00:01:55,980 --> 00:01:58,470 +Have a look at the video +"Pre-process sentence pairs" + +41 +00:01:58,470 --> 00:02:01,800 +for a refresher, but you just +have to send the two sentences + +42 +00:02:01,800 --> 00:02:04,833 +to the tokenizer with some +additional keyword arguments. + +43 +00:02:05,880 --> 00:02:09,300 +Here we indicate a maximum length of 128 + +44 +00:02:09,300 --> 00:02:11,460 +and pad inputs shorter than this length, + +45 +00:02:11,460 --> 00:02:13,060 +truncate inputs that are longer. + +46 +00:02:14,040 --> 00:02:16,170 +We put all of this in a tokenize_function + +47 +00:02:16,170 --> 00:02:18,510 +that we can directly +apply to all the splits + +48 +00:02:18,510 --> 00:02:20,260 +in our dataset with the map method. + +49 +00:02:21,210 --> 00:02:24,120 +As long as the function returns +a dictionary-like object, + +50 +00:02:24,120 --> 00:02:26,580 +the map method will add +new columns as needed + +51 +00:02:26,580 --> 00:02:28,113 +or update existing ones. + +52 +00:02:30,060 --> 00:02:32,520 +To speed up pre-processing +and take advantage + +53 +00:02:32,520 --> 00:02:35,130 +of the fact our tokenizer +is backed by Rust + +54 +00:02:35,130 --> 00:02:38,160 +thanks to the Hugging +Face Tokenizers library, + +55 +00:02:38,160 --> 00:02:40,590 +we can process several +elements at the same time + +56 +00:02:40,590 --> 00:02:43,923 +in our tokenize function, using +the batched=True argument. + +57 +00:02:45,300 --> 00:02:46,980 +Since the tokenizer can handle a list + +58 +00:02:46,980 --> 00:02:50,280 +of first or second sentences, +the tokenize_function + +59 +00:02:50,280 --> 00:02:52,740 +does not need to change for this. 
+ +60 +00:02:52,740 --> 00:02:55,410 +You can also use multiprocessing +with the map method, + +61 +00:02:55,410 --> 00:02:57,460 +check out its documentation linked below. + +62 +00:02:58,740 --> 00:03:02,130 +Once this is done, we are +almost ready for training, + +63 +00:03:02,130 --> 00:03:04,020 +we just remove the columns +we don't need anymore + +64 +00:03:04,020 --> 00:03:06,120 +with the remove_columns method, + +65 +00:03:06,120 --> 00:03:08,580 +rename label to labels, since the models + +66 +00:03:08,580 --> 00:03:11,430 +from the transformers library expect that, + +67 +00:03:11,430 --> 00:03:14,040 +and set the output format +to our desired backend, + +68 +00:03:14,040 --> 00:03:15,893 +torch, tensorflow or numpy. + +69 +00:03:16,800 --> 00:03:19,050 +If needed, we can also +generate a short sample + +70 +00:03:19,050 --> 00:03:21,377 +of a dataset using the select method. + +71 +00:03:22,817 --> 00:03:25,734 +(screen whooshing) + diff --git a/subtitles/en/21_preprocessing-sentence-pairs-(pytorch).srt b/subtitles/en/21_preprocessing-sentence-pairs-(pytorch).srt index 605a85a95..3199e76a7 100644 --- a/subtitles/en/21_preprocessing-sentence-pairs-(pytorch).srt +++ b/subtitles/en/21_preprocessing-sentence-pairs-(pytorch).srt @@ -1,149 +1,294 @@ -1 -00:00:05,200 --> 00:00:11,680 -How to preprocess pairs of sentences? We have  -seen how to tokenize single sentences and batch   - -2 -00:00:11,680 --> 00:00:18,080 -them together in the "Batching inputs together"  -video. If this code look unfamiliar to you,   - -3 -00:00:18,080 --> 00:00:24,160 -be sure to check that video again! Here we will  -focus on tasks that classify pairs of sentences.   - -4 -00:00:25,440 --> 00:00:30,960 -For instance, we may want to classify whether two  -texts are paraphrases or not. Here is an example   - -5 -00:00:30,960 --> 00:00:36,320 -taken from the Quora Question Pairs dataset,  -which focuses on identifying duplicate questions.   - -6 -00:00:37,360 --> 00:00:42,200 -In the first pair, the two questions  -are duplicates; in the second, they   - -7 -00:00:43,360 --> 00:00:47,120 -are not. Another pair classification problem  -is when we want to know if two sentences   - -8 -00:00:47,120 --> 00:00:54,000 -are logically related or not (a problem called  -Natural Language Inference or NLI). In this   - -9 -00:00:54,000 --> 00:00:59,680 -example taken from the MultiNLI dataset, we have  -a pair of sentences for each possible label:   - -10 -00:00:59,680 --> 00:01:04,560 -contradiction, neutral or entailment (which  -is a fancy way of saying the first sentence   - -11 -00:01:04,560 --> 00:01:09,280 -implies the second). So classifying pairs  -of sentences is a problem worth studying.   - -12 -00:01:10,080 --> 00:01:14,880 -In fact, in the GLUE benchmark (which is an  -academic benchmark for text classification),   - -13 -00:01:15,600 --> 00:01:19,600 -8 of the 10 datasets are focused  -on tasks using pairs of sentences.   - -14 -00:01:20,720 --> 00:01:24,240 -That's why models like BERT are often  -pretrained with a dual objective:   - -15 -00:01:25,120 --> 00:01:29,920 -on top of the language modeling objective, they  -often have an objective related to sentence pairs.   - -16 -00:01:31,040 --> 00:01:36,720 -For instance, during pretraining, BERT is shown  -pairs of sentences and must predict both the   - -17 -00:01:36,720 --> 00:01:41,040 -value of randomly masked tokens and whether  -the second sentence follows from the first.   
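Looking back at the datasets overview above, the final clean-up steps it mentions could look roughly like this; `tokenized_datasets` is assumed to come from an earlier map call, and the exact columns to remove depend on your data:

# Assumed to follow a map call like the one sketched earlier.
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")  # or "tensorflow" / "numpy"

# Optionally work on a short sample while experimenting.
small_train = tokenized_datasets["train"].select(range(100))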
- -18 -00:01:42,800 --> 00:01:46,640 -Fortunately, the tokenizer from the  -Transformers library has a nice API   - -19 -00:01:46,640 --> 00:01:52,000 -to deal with pairs of sentences: you just have  -to pass them as two arguments to the tokenizer.   - -20 -00:01:53,200 --> 00:01:57,600 -On top of the input IDs and the attention  -mask we studied already, it returns a new   - -21 -00:01:57,600 --> 00:02:02,800 -field called token type IDs, which tells the  -model which tokens belong to the first sentence   - -22 -00:02:03,440 --> 00:02:09,680 -and which ones belong to the second sentence.  -Zooming in a little bit, here are the input IDs,   - -23 -00:02:09,680 --> 00:02:14,480 -aligned with the tokens they correspond to,  -their respective token type ID and attention   - -24 -00:02:14,480 --> 00:02:21,360 -mask. We can see the tokenizer also added special  -tokens so we have a CLS token, the tokens from the   - -25 -00:02:21,360 --> 00:02:28,720 -first sentence, a SEP token, the tokens from the  -second sentence, and a final SEP token. If we have   - -26 -00:02:28,720 --> 00:02:33,760 -several pairs of sentences, we can tokenize them  -together by passing the list of first sentences,   - -27 -00:02:34,480 --> 00:02:39,360 -then the list of second sentences and all the  -keyword arguments we studied already, like   - -28 -00:02:39,360 --> 00:02:45,600 -padding=True. Zooming in at the result, we can see  -how the tokenizer added padding to the second pair   - -29 -00:02:45,600 --> 00:02:51,200 -of sentences, to make the two outputs the same  -length, and properly dealt with token type IDS   - -30 -00:02:51,200 --> 00:03:03,520 -and attention masks for the two sentences. This  -is then all ready to pass through our model! +1 +00:00:00,000 --> 00:00:03,083 +(graphics whooshing) + +2 +00:00:05,370 --> 00:00:07,413 +- How to pre-process pairs of sentences. + +3 +00:00:09,150 --> 00:00:11,340 +We have seen how to +tokenize single sentences + +4 +00:00:11,340 --> 00:00:12,877 +and batch them together in the, + +5 +00:00:12,877 --> 00:00:15,810 +"Batching inputs together video." + +6 +00:00:15,810 --> 00:00:18,330 +If this code look unfamiliar to you, + +7 +00:00:18,330 --> 00:00:20,030 +be sure to check that video again. + +8 +00:00:21,330 --> 00:00:24,543 +Here will focus on tasks that +classify pair of sentences. + +9 +00:00:25,620 --> 00:00:28,470 +For instance, we may want to +classify whether two texts + +10 +00:00:28,470 --> 00:00:30,360 +are paraphrased or not. + +11 +00:00:30,360 --> 00:00:32,880 +Here is an example taken +from the Quora Question Pairs + +12 +00:00:32,880 --> 00:00:37,530 +dataset, which focuses on +identifying duplicate questions. + +13 +00:00:37,530 --> 00:00:40,650 +In the first pair, the two +questions are duplicates, + +14 +00:00:40,650 --> 00:00:42,000 +in the second they are not. + +15 +00:00:43,283 --> 00:00:45,540 +Another pair classification problem is + +16 +00:00:45,540 --> 00:00:47,400 +when we want to know if two sentences are + +17 +00:00:47,400 --> 00:00:49,590 +logically related or not, + +18 +00:00:49,590 --> 00:00:53,970 +a problem called natural +language inference or NLI. + +19 +00:00:53,970 --> 00:00:57,000 +In this example, taken +from the MultiNLI data set, + +20 +00:00:57,000 --> 00:00:59,880 +we have a pair of sentences +for each possible label. 
+ +21 +00:00:59,880 --> 00:01:02,490 +Contradiction, natural or entailment, + +22 +00:01:02,490 --> 00:01:04,680 +which is a fancy way of +saying the first sentence + +23 +00:01:04,680 --> 00:01:05,793 +implies the second. + +24 +00:01:06,930 --> 00:01:08,820 +So classifying pairs of +sentences is a problem + +25 +00:01:08,820 --> 00:01:10,260 +worth studying. + +26 +00:01:10,260 --> 00:01:12,630 +In fact, in the GLUE benchmark, + +27 +00:01:12,630 --> 00:01:15,750 +which is an academic benchmark +for text classification + +28 +00:01:15,750 --> 00:01:17,910 +eight of the 10 data sets are focused + +29 +00:01:17,910 --> 00:01:19,953 +on tasks using pairs of sentences. + +30 +00:01:20,910 --> 00:01:22,560 +That's why models like BERT + +31 +00:01:22,560 --> 00:01:25,320 +are often pre-trained +with a dual objective. + +32 +00:01:25,320 --> 00:01:27,660 +On top of the language modeling objective, + +33 +00:01:27,660 --> 00:01:31,230 +they often have an objective +related to sentence pairs. + +34 +00:01:31,230 --> 00:01:34,320 +For instance, during +pretraining BERT is shown + +35 +00:01:34,320 --> 00:01:36,810 +pairs of sentences and must predict both + +36 +00:01:36,810 --> 00:01:39,930 +the value of randomly masked +tokens, and whether the second + +37 +00:01:39,930 --> 00:01:41,830 +sentence follow from the first or not. + +38 +00:01:43,084 --> 00:01:45,930 +Fortunately, the tokenizer +from the Transformers library + +39 +00:01:45,930 --> 00:01:49,170 +has a nice API to deal +with pairs of sentences. + +40 +00:01:49,170 --> 00:01:51,270 +You just have to pass +them as two arguments + +41 +00:01:51,270 --> 00:01:52,120 +to the tokenizer. + +42 +00:01:53,430 --> 00:01:55,470 +On top of the input IDs +and the attention mask + +43 +00:01:55,470 --> 00:01:56,970 +we studied already, + +44 +00:01:56,970 --> 00:01:59,910 +it returns a new field +called token type IDs, + +45 +00:01:59,910 --> 00:02:01,790 +which tells the model which tokens belong + +46 +00:02:01,790 --> 00:02:03,630 +to the first sentence, + +47 +00:02:03,630 --> 00:02:05,943 +and which ones belong +to the second sentence. + +48 +00:02:07,290 --> 00:02:09,840 +Zooming in a little bit, +here has an input IDs + +49 +00:02:09,840 --> 00:02:12,180 +aligned with the tokens +they correspond to, + +50 +00:02:12,180 --> 00:02:15,213 +their respective token +type ID and attention mask. + +51 +00:02:16,080 --> 00:02:19,260 +We can see the tokenizer +also added special tokens. + +52 +00:02:19,260 --> 00:02:22,620 +So we have a CLS token, the +tokens from the first sentence, + +53 +00:02:22,620 --> 00:02:25,770 +a SEP token, the tokens +from the second sentence, + +54 +00:02:25,770 --> 00:02:27,003 +and a final SEP token. + +55 +00:02:28,500 --> 00:02:30,570 +If we have several pairs of sentences, + +56 +00:02:30,570 --> 00:02:32,840 +we can tokenize them +together by passing the list + +57 +00:02:32,840 --> 00:02:36,630 +of first sentences, then +the list of second sentences + +58 +00:02:36,630 --> 00:02:39,300 +and all the keyword +arguments we studied already + +59 +00:02:39,300 --> 00:02:40,353 +like padding=True. + +60 +00:02:41,940 --> 00:02:43,140 +Zooming in at the result, + +61 +00:02:43,140 --> 00:02:45,030 +we can see also tokenize added padding + +62 +00:02:45,030 --> 00:02:48,090 +to the second pair sentences +to make the two outputs + +63 +00:02:48,090 --> 00:02:51,360 +the same length, and properly +dealt with token type IDs + +64 +00:02:51,360 --> 00:02:53,643 +and attention masks for the two sentences. 
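A rough sketch of the pair-tokenization API described above; the checkpoint name and example sentences are only illustrative:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # illustrative checkpoint

# A single pair: pass the two sentences as two arguments.
encoded = tokenizer("My name is Sylvain.", "I work at Hugging Face.")
print(encoded["input_ids"])        # CLS, first sentence, SEP, second sentence, SEP
print(encoded["token_type_ids"])   # 0 for the first sentence, 1 for the second
print(encoded["attention_mask"])

# Several pairs: a list of first sentences and a list of second sentences.
batch = tokenizer(
    ["How old are you?", "What is your name?"],
    ["What is your age?", "Where do you live?"],
    padding=True,
)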
+ +65 +00:02:54,900 --> 00:02:57,573 +This is then all ready to +pass through our model. + diff --git a/subtitles/en/22_preprocessing-sentence-pairs-(tensorflow).srt b/subtitles/en/22_preprocessing-sentence-pairs-(tensorflow).srt index 30bb9ba2e..980986853 100644 --- a/subtitles/en/22_preprocessing-sentence-pairs-(tensorflow).srt +++ b/subtitles/en/22_preprocessing-sentence-pairs-(tensorflow).srt @@ -1,153 +1,309 @@ -1 -00:00:05,440 --> 00:00:11,760 -How to preprocess pairs of sentences? We have  -seen how to tokenize single sentences and batch   - -2 -00:00:11,760 --> 00:00:17,280 -them together in the "Batching inputs together"  -video. If this code look unfamiliar to you,   - -3 -00:00:17,840 --> 00:00:23,680 -be sure to check that video again! Here we will  -focus on tasks that classify pairs of sentences.   - -4 -00:00:24,720 --> 00:00:30,400 -For instance, we may want to classify whether two  -texts are paraphrases or not. Here is an example   - -5 -00:00:30,400 --> 00:00:35,520 -taken from the Quora Question Pairs dataset,  -which focuses on identifying duplicate questions.   - -6 -00:00:36,960 --> 00:00:39,760 -In the first pair, the two  -questions are duplicates;   - -7 -00:00:40,400 --> 00:00:45,520 -in the second, they are not. Another pair  -classification problem is when we want to know   - -8 -00:00:45,520 --> 00:00:51,920 -if two sentences are logically related or not (a  -problem called Natural Language Inference or NLI).   - -9 -00:00:52,880 --> 00:00:58,480 -In this example taken from the MultiNLI dataset,  -we have a pair of sentences for each possible   - -10 -00:00:58,480 --> 00:01:04,560 -label: contradiction, neutral or entailment  -(which is a fancy way of saying the first sentence   - -11 -00:01:04,560 --> 00:01:12,240 -implies the second). So classifying pairs of  -sentences is a problem worth studying. In fact,   - -12 -00:01:12,240 --> 00:01:15,840 -in the GLUE benchmark (which is an academic  -benchmark for text classification),   - -13 -00:01:16,640 --> 00:01:20,800 -8 of the 10 datasets are focused  -on tasks using pairs of sentences.   - -14 -00:01:21,920 --> 00:01:26,320 -That's why models like BERT are often  -pretrained with a dual objective:   - -15 -00:01:26,320 --> 00:01:31,040 -on top of the language modeling objective, they  -often have an objective related to sentence pairs.   - -16 -00:01:31,840 --> 00:01:37,520 -For instance, during pretraining, BERT is shown  -pairs of sentences and must predict both the   - -17 -00:01:37,520 --> 00:01:42,080 -value of randomly masked tokens and whether  -the second sentence follows from the first.   - -18 -00:01:43,840 --> 00:01:47,920 -Fortunately, the tokenizer from the  -Transformers library has a nice API   - -19 -00:01:47,920 --> 00:01:53,840 -to deal with pairs of sentences: you just have  -to pass them as two arguments to the tokenizer.   - -20 -00:01:54,640 --> 00:01:59,040 -On top of the input IDs and the attention  -mask we studied already, it returns a new   - -21 -00:01:59,040 --> 00:02:04,320 -field called token type IDs, which tells the  -model which tokens belong to the first sentence   - -22 -00:02:04,880 --> 00:02:11,280 -and which ones belong to the second sentence.  -Zooming in a little bit, here are the input IDs,   - -23 -00:02:11,280 --> 00:02:16,800 -aligned with the tokens they correspond to, their  -respective token type ID and attention mask.   
- -24 -00:02:18,240 --> 00:02:23,440 -We can see the tokenizer also added special  -tokens so we have a CLS token, the tokens   - -25 -00:02:23,440 --> 00:02:29,920 -from the first sentence, a SEP token, the tokens  -from the second sentence, and a final SEP token.   - -26 -00:02:31,440 --> 00:02:36,640 -If we have several pairs of sentences, we can  -tokenize them together by passing the list of   - -27 -00:02:36,640 --> 00:02:42,880 -first sentences, then the list of second sentences  -and all the keyword arguments we studied already,   - -28 -00:02:42,880 --> 00:02:48,800 -like padding=True. Zooming in at the result,  -we can see how the tokenizer added padding   - -29 -00:02:48,800 --> 00:02:52,480 -to the second pair of sentences, to  -make the two outputs the same length,   - -30 -00:02:53,440 --> 00:02:57,280 -and properly dealt with token type IDS  -and attention masks for the two sentences.   - -31 -00:02:58,720 --> 00:03:03,840 -This is then all ready to pass through our model! +1 +00:00:00,225 --> 00:00:02,892 +(air whooshing) + +2 +00:00:05,578 --> 00:00:09,180 +- How to preprocess pairs of sentences? + +3 +00:00:09,180 --> 00:00:11,490 +We have seen how to +tokenize single sentences + +4 +00:00:11,490 --> 00:00:13,020 +and batch them together + +5 +00:00:13,020 --> 00:00:15,660 +in the "Batching inputs together" video. + +6 +00:00:15,660 --> 00:00:18,060 +If this code looks unfamiliar to you, + +7 +00:00:18,060 --> 00:00:19,760 +be sure to check that video again! + +8 +00:00:21,101 --> 00:00:22,110 +Here, we will focus on tasks + +9 +00:00:22,110 --> 00:00:24,033 +that classify pairs of sentences. + +10 +00:00:24,900 --> 00:00:27,030 +For instance, we may want to classify + +11 +00:00:27,030 --> 00:00:29,820 +whether two texts are paraphrases or not. + +12 +00:00:29,820 --> 00:00:30,900 +Here is an example taken + +13 +00:00:30,900 --> 00:00:33,180 +from the Quora Question Pairs dataset, + +14 +00:00:33,180 --> 00:00:36,033 +which focuses on identifying +duplicate questions. + +15 +00:00:37,110 --> 00:00:40,650 +In the first pair, the two +questions are duplicates; + +16 +00:00:40,650 --> 00:00:43,620 +in the second, they are not. + +17 +00:00:43,620 --> 00:00:44,730 +Another classification problem + +18 +00:00:44,730 --> 00:00:46,980 +is when we want to know if two sentences + +19 +00:00:46,980 --> 00:00:49,290 +are logically related or not, + +20 +00:00:49,290 --> 00:00:52,173 +a problem called Natural +Language Inference or NLI. + +21 +00:00:53,100 --> 00:00:55,830 +In this example taken +from the MultiNLI dataset, + +22 +00:00:55,830 --> 00:00:59,460 +we have a pair of sentences +for each possible label: + +23 +00:00:59,460 --> 00:01:02,400 +contradiction, neutral or entailment, + +24 +00:01:02,400 --> 00:01:04,680 +which is a fancy way of +saying the first sentence + +25 +00:01:04,680 --> 00:01:05,853 +implies the second. + +26 +00:01:07,140 --> 00:01:09,000 +So classifying pairs of sentences + +27 +00:01:09,000 --> 00:01:10,533 +is a problem worth studying. + +28 +00:01:11,370 --> 00:01:13,770 +In fact, in the GLUE benchmark, + +29 +00:01:13,770 --> 00:01:16,830 +which is an academic benchmark +for text classification, + +30 +00:01:16,830 --> 00:01:19,680 +eight of the 10 datasets +are focused on tasks + +31 +00:01:19,680 --> 00:01:20,973 +using pairs of sentences. 
+ +32 +00:01:22,110 --> 00:01:24,720 +That's why models like +BERT are often pretrained + +33 +00:01:24,720 --> 00:01:26,520 +with a dual objective: + +34 +00:01:26,520 --> 00:01:28,890 +on top of the language modeling objective, + +35 +00:01:28,890 --> 00:01:32,010 +they often have an objective +related to sentence pairs. + +36 +00:01:32,010 --> 00:01:34,560 +For instance, during pretraining, + +37 +00:01:34,560 --> 00:01:36,690 +BERT is shown pairs of sentences + +38 +00:01:36,690 --> 00:01:39,900 +and must predict both the +value of randomly masked tokens + +39 +00:01:39,900 --> 00:01:41,250 +and whether the second sentence + +40 +00:01:41,250 --> 00:01:42,903 +follows from the first or not. + +41 +00:01:44,070 --> 00:01:47,100 +Fortunately, the tokenizer +from the Transformers library + +42 +00:01:47,100 --> 00:01:50,550 +has a nice API to deal +with pairs of sentences: + +43 +00:01:50,550 --> 00:01:52,650 +you just have to pass +them as two arguments + +44 +00:01:52,650 --> 00:01:53,613 +to the tokenizer. + +45 +00:01:54,900 --> 00:01:56,040 +On top of the input IDs + +46 +00:01:56,040 --> 00:01:58,440 +and the attention mask we studied already, + +47 +00:01:58,440 --> 00:02:01,530 +it returns a new field +called token type IDs, + +48 +00:02:01,530 --> 00:02:03,210 +which tells the model which tokens + +49 +00:02:03,210 --> 00:02:05,100 +belong to the first sentence + +50 +00:02:05,100 --> 00:02:07,350 +and which ones belong +to the second sentence. + +51 +00:02:08,670 --> 00:02:11,430 +Zooming in a little bit, +here are the input IDs, + +52 +00:02:11,430 --> 00:02:13,710 +aligned with the tokens +they correspond to, + +53 +00:02:13,710 --> 00:02:17,193 +their respective token +type ID and attention mask. + +54 +00:02:18,540 --> 00:02:21,300 +We can see the tokenizer +also added special tokens + +55 +00:02:21,300 --> 00:02:25,230 +so we have a CLS token, the +tokens from the first sentence, + +56 +00:02:25,230 --> 00:02:28,590 +a SEP token, the tokens +from the second sentence, + +57 +00:02:28,590 --> 00:02:30,153 +and a final SEP token. + +58 +00:02:31,680 --> 00:02:33,720 +If we have several pairs of sentences, + +59 +00:02:33,720 --> 00:02:35,640 +we can tokenize them together + +60 +00:02:35,640 --> 00:02:38,280 +by passing the list of first sentences, + +61 +00:02:38,280 --> 00:02:40,710 +then the list of second sentences + +62 +00:02:40,710 --> 00:02:43,050 +and all the keyword +arguments we studied already, + +63 +00:02:43,050 --> 00:02:44,133 +like padding=True. + +64 +00:02:45,510 --> 00:02:46,770 +Zooming in at the result, + +65 +00:02:46,770 --> 00:02:49,050 +we can see how the tokenizer added padding + +66 +00:02:49,050 --> 00:02:50,940 +to the second pair of sentences, + +67 +00:02:50,940 --> 00:02:53,490 +to make the two outputs the same length. + +68 +00:02:53,490 --> 00:02:55,620 +It also properly dealt with token type IDS + +69 +00:02:55,620 --> 00:02:57,720 +and attention masks for the two sentences. + +70 +00:02:59,010 --> 00:03:01,460 +This is then all ready to +pass through our model! + +71 +00:03:03,799 --> 00:03:06,466 +(air whooshing) + diff --git a/subtitles/en/23_what-is-dynamic-padding.srt b/subtitles/en/23_what-is-dynamic-padding.srt index 48fbfb69d..64514035a 100644 --- a/subtitles/en/23_what-is-dynamic-padding.srt +++ b/subtitles/en/23_what-is-dynamic-padding.srt @@ -1,188 +1,300 @@ -1 -00:00:05,270 --> 00:00:07,640 -What is dynamic padding? 
- -2 -00:00:07,640 --> 00:00:12,620 -In the "Batching Inputs together" video, we -have seen that to be able to group inputs - -3 -00:00:12,620 --> 00:00:17,320 -of different lengths in the same batch, we -need to add padding tokens to all the short - -4 -00:00:17,320 --> 00:00:20,520 -inputs until they are all of the same length. - -5 -00:00:20,520 --> 00:00:26,300 -Here for instance, the longest sentence is -the third one, and we need to add 5, 2 and - -6 -00:00:26,300 --> 00:00:32,509 -7 pad tokens to the other to have four sentences -of the same lengths. - -7 -00:00:32,509 --> 00:00:37,530 -When dealing with a whole dataset, there are -various padding strategies we can apply. - -8 -00:00:37,530 --> 00:00:41,870 -The most obvious one is to pad all the elements -of the dataset to the same length: the length - -9 -00:00:41,870 --> 00:00:44,129 -of the longest sample. - -10 -00:00:44,129 --> 00:00:48,450 -This will then give us batches that all have -the same shape determined by the maximum sequence - -11 -00:00:48,450 --> 00:00:49,450 -length. - -12 -00:00:49,450 --> 00:00:54,039 -The downside is that batches composed from -short sentences will have a lot of padding - -13 -00:00:54,039 --> 00:01:00,080 -tokens which introduce more computations in -the model we ultimately don't need. - -14 -00:01:00,080 --> 00:01:05,320 -To avoid this, another strategy is to pad -the elements when we batch them together, - -15 -00:01:05,320 --> 00:01:08,240 -to the longest sentence inside the batch. - -16 -00:01:08,240 --> 00:01:12,880 -This way batches composed of short inputs -will be smaller than the batch containing - -17 -00:01:12,880 --> 00:01:15,600 -the longest sentence in the dataset. - -18 -00:01:15,600 --> 00:01:19,090 -This will yield some nice speedup on CPU and -GPU. - -19 -00:01:19,090 --> 00:01:23,130 -The downside is that all batches will then -have different shapes, which slows down training - -20 -00:01:23,130 --> 00:01:24,790 -on other accelerators like TPUs. - -21 -00:01:24,790 --> 00:01:28,850 -Let's see how to apply both strategies in -practice. - -22 -00:01:28,850 --> 00:01:34,750 -We have actually seen how to apply fixed padding -in the Datasets Overview video, when we preprocessed - -23 -00:01:34,750 --> 00:01:39,320 -the MRPC dataset: after loading the dataset -and tokenizer, we applied the tokenization - -24 -00:01:39,320 --> 00:01:45,260 -to all the dataset with padding and truncation -to make all samples of length 128. - -25 -00:01:45,260 --> 00:01:51,630 -As a result, if we pass this dataset to a -PyTorch DataLoader, we get batches of shape - -26 -00:01:51,630 --> 00:01:57,079 -batch size (here 16) by 128. - -27 -00:01:57,079 --> 00:02:01,950 -To apply dynamic padding, we must defer the -padding to the batch preparation, so we remove - -28 -00:02:01,950 --> 00:02:04,789 -that part from our tokenize function. - -29 -00:02:04,789 --> 00:02:08,569 -We still leave the truncation part so that -inputs that are bigger than the maximum length - -30 -00:02:08,569 --> 00:02:14,069 -accepted by the model (usually 512) get truncated -to that length. - -31 -00:02:14,069 --> 00:02:17,629 -Then we pad our samples dynamically by using -a data collator. - -32 -00:02:17,629 --> 00:02:22,110 -Those classes in the Transformers library -are responsible for applying all the final - -33 -00:02:22,110 --> 00:02:27,970 -processing needed before forming a batch, -here DataCollatorWithPadding will pad the - -34 -00:02:27,970 --> 00:02:32,200 -samples to the maximum length inside the batch -of sentences. 
- -35 -00:02:32,200 --> 00:02:36,790 -We pass it to the PyTorch DataLoader as a -collate function, then observe that the batches - -36 -00:02:36,790 --> 00:02:42,950 -generated have various lenghs, all way below -the 128 from before. - -37 -00:02:42,950 --> 00:02:48,200 -Dynamic batching will almost always be faster -on CPUs and GPUs, so you should apply it if - -38 -00:02:48,200 --> 00:02:49,200 -you can. - -39 -00:02:49,200 --> 00:02:53,879 -Remember to switch back to fixed padding however -if you run your training script on TPU or - -40 -00:02:53,879 --> 00:03:00,599 -need batches of fixed shapes. +1 +00:00:00,242 --> 00:00:02,909 +(air whooshing) + +2 +00:00:05,460 --> 00:00:06,963 +- What is dynamic padding? + +3 +00:00:08,630 --> 00:00:10,890 +In the "Batching Inputs together" video, + +4 +00:00:10,890 --> 00:00:12,720 +we have seen that to +be able to group inputs + +5 +00:00:12,720 --> 00:00:15,300 +of different lengths in the same batch, + +6 +00:00:15,300 --> 00:00:18,270 +we need to add padding tokens +to all the short inputs + +7 +00:00:18,270 --> 00:00:20,970 +until they are all of the same length. + +8 +00:00:20,970 --> 00:00:24,600 +Here, for instance, the longest +sentence is the third one, + +9 +00:00:24,600 --> 00:00:27,270 +and we need to add five, +two, or seven pad tokens + +10 +00:00:27,270 --> 00:00:30,090 +to the other sentences +to have four sentences + +11 +00:00:30,090 --> 00:00:31,090 +of the same lengths. + +12 +00:00:32,430 --> 00:00:33,900 +When dealing with a whole dataset, + +13 +00:00:33,900 --> 00:00:36,633 +there are various padding +strategies we can apply. + +14 +00:00:37,560 --> 00:00:39,540 +The most obvious one is +to pad all the elements + +15 +00:00:39,540 --> 00:00:40,923 +of the dataset to the same length: + +16 +00:00:40,923 --> 00:00:43,053 +the length of the longest sample. + +17 +00:00:44,070 --> 00:00:45,330 +This will then give us batches + +18 +00:00:45,330 --> 00:00:46,890 +that all have the same shape + +19 +00:00:46,890 --> 00:00:49,800 +determined by the maximum sequence length. + +20 +00:00:49,800 --> 00:00:52,893 +The downside is that batches +composed from short sentences + +21 +00:00:52,893 --> 00:00:54,960 +will have a lot of padding tokens + +22 +00:00:54,960 --> 00:00:57,660 +which will introduce more +computations in the model + +23 +00:00:57,660 --> 00:00:58,910 +we ultimately don't need. + +24 +00:01:00,060 --> 00:01:03,300 +To avoid this, another +strategy is to pad the elements + +25 +00:01:03,300 --> 00:01:05,280 +when we batch them together, + +26 +00:01:05,280 --> 00:01:08,190 +to the longest sentence inside the batch. + +27 +00:01:08,190 --> 00:01:12,000 +This way, batches composed of +short inputs will be smaller + +28 +00:01:12,000 --> 00:01:13,920 +than the batch containing +the longest sentence + +29 +00:01:13,920 --> 00:01:15,510 +in the dataset. + +30 +00:01:15,510 --> 00:01:18,063 +This will yield some nice +speedup on CPU and GPU. + +31 +00:01:19,110 --> 00:01:20,490 +The downside is that all batches + +32 +00:01:20,490 --> 00:01:22,140 +will then have different shapes, + +33 +00:01:22,140 --> 00:01:24,740 +which slows down training +on accelerators like TPUs. + +34 +00:01:26,160 --> 00:01:29,370 +Let's see how to apply both +strategies in practice. 
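As a rough sketch of the data-collator approach discussed here, assuming a `tokenizer` and a `tokenized_datasets` prepared without padding as in the earlier overview; the batch size is illustrative:

from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,  # each batch is padded to its own longest sample
)

batch = next(iter(train_dataloader))
print(batch["input_ids"].shape)  # the sequence length varies from batch to batch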
+ +35 +00:01:29,370 --> 00:01:31,280 +We have actually seen how +to apply fixed padding + +36 +00:01:31,280 --> 00:01:33,390 +in the Datasets Overview video, + +37 +00:01:33,390 --> 00:01:36,030 +when we preprocessed the MRPC dataset: + +38 +00:01:36,030 --> 00:01:38,250 +after loading the dataset and tokenizer, + +39 +00:01:38,250 --> 00:01:40,680 +we applied the tokenization +to all the dataset + +40 +00:01:40,680 --> 00:01:42,480 +with padding and truncation + +41 +00:01:42,480 --> 00:01:45,273 +to make all samples of length 128. + +42 +00:01:46,530 --> 00:01:48,360 +As a result, if we pass this dataset + +43 +00:01:48,360 --> 00:01:50,520 +to a PyTorch DataLoader, + +44 +00:01:50,520 --> 00:01:55,503 +we get batches of shape +batch size, here 16, by 128. + +45 +00:01:57,060 --> 00:01:58,380 +To apply dynamic padding, + +46 +00:01:58,380 --> 00:02:01,440 +we must defer the padding +to the batch preparation, + +47 +00:02:01,440 --> 00:02:04,740 +so we remove that part +from our tokenize function. + +48 +00:02:04,740 --> 00:02:06,150 +We still leave the truncation part + +49 +00:02:06,150 --> 00:02:08,580 +so that inputs that are +bigger than the maximum length + +50 +00:02:08,580 --> 00:02:12,060 +accepted by the model, usually 512, + +51 +00:02:12,060 --> 00:02:13,510 +get truncated to that length. + +52 +00:02:14,940 --> 00:02:16,380 +Then we pad our samples dynamically + +53 +00:02:16,380 --> 00:02:18,330 +by using a data collator. + +54 +00:02:18,330 --> 00:02:20,280 +Those classes in the Transformers library + +55 +00:02:20,280 --> 00:02:22,740 +are responsible for applying +all the final processing + +56 +00:02:22,740 --> 00:02:25,290 +needed before forming a batch, + +57 +00:02:25,290 --> 00:02:28,470 +here DataCollatorWithPadding +will pad the samples + +58 +00:02:28,470 --> 00:02:31,083 +to the maximum length inside +the batch of sentences. + +59 +00:02:32,160 --> 00:02:35,310 +We pass it to the PyTorch +DataLoader as a collate function, + +60 +00:02:35,310 --> 00:02:37,620 +then observe that the batches generated + +61 +00:02:37,620 --> 00:02:38,850 +have various lengths, + +62 +00:02:38,850 --> 00:02:41,253 +all way below the 128 from before. + +63 +00:02:42,660 --> 00:02:44,820 +Dynamic batching will +almost always be faster + +64 +00:02:44,820 --> 00:02:47,913 +on CPUs and GPUs, so you +should apply it if you can. + +65 +00:02:48,930 --> 00:02:51,330 +Remember to switch back +to fixed padding, however, + +66 +00:02:51,330 --> 00:02:53,490 +if you run your training script on TPU + +67 +00:02:53,490 --> 00:02:55,293 +or need batches of fixed shapes. + +68 +00:02:56,917 --> 00:02:59,584 +(air whooshing) + diff --git a/subtitles/en/24_the-trainer-api.srt b/subtitles/en/24_the-trainer-api.srt index bee53acec..55405374a 100644 --- a/subtitles/en/24_the-trainer-api.srt +++ b/subtitles/en/24_the-trainer-api.srt @@ -1,174 +1,382 @@ -1 -00:00:05,280 --> 00:00:11,200 -The Trainer API. The Transformers library  -provides a Trainer API that allows you to   - -2 -00:00:11,200 --> 00:00:17,040 -easily fine-tune transformer models on your own  -dataset. The Trainer class take your datasets,   - -3 -00:00:17,040 --> 00:00:22,240 -your model as well as the training hyperparameters  -and can perform the training on any kind of   - -4 -00:00:22,240 --> 00:00:30,160 -setup (CPU, GPU, multi GPUs, TPUs). It can also  -compute the predictions on any dataset, and if   - -5 -00:00:30,160 --> 00:00:36,720 -you provided metrics, evaluate your model on any  -dataset. 
It can also handle final data-processing   - -6 -00:00:36,720 --> 00:00:41,760 -such as dynamic padding as long as you provide  -the tokenizer or a given data collator.   - -7 -00:00:43,040 --> 00:00:48,160 -We will try this API on the MRPC dataset, since  -it's relatively small and easy to preprocess.   - -8 -00:00:49,520 --> 00:00:54,800 -As we saw in the Datasets overview video, here  -is how we can preprocess it. We do not apply   - -9 -00:00:54,800 --> 00:00:59,840 -padding during the preprocessing as we will use  -dynamic padding with our DataCollatorWithPadding.   - -10 -00:01:00,960 --> 00:01:05,440 -Note that we don't do the final steps of  -renaming/removing columns or set the format   - -11 -00:01:05,440 --> 00:01:11,280 -to torch tensors: the Trainer will do all of  -this automatically for us by analyzing the   - -12 -00:01:11,280 --> 00:01:18,080 -model signature. The last steps before creating  -the Trainer are to define our model and some   - -13 -00:01:18,080 --> 00:01:24,400 -training hyperparameters. We saw how to do the  -first in the model API video. For the second,   - -14 -00:01:24,400 --> 00:01:29,600 -we use the TrainingArguments class. It only needs  -a path to a folder where results and checkpoints   - -15 -00:01:29,600 --> 00:01:34,240 -will be saved, but you can also customize  -all the hyperparameters the Trainer will use:   - -16 -00:01:34,240 --> 00:01:39,600 -learning rate, number of training epochs etc.  -It's then very easy to create a Trainer and   - -17 -00:01:39,600 --> 00:01:44,720 -launch a training. This should display a progress  -bar and after a few minutes (if you are running   - -18 -00:01:44,720 --> 00:01:50,480 -on a GPU) you should have the training finished.  -The result will be rather anticlimatic however,   - -19 -00:01:50,480 --> 00:01:54,880 -as you will only get a training loss which  -doesn't really tell you anything about how you   - -20 -00:01:54,880 --> 00:01:59,920 -model is performing. This is because we didn't  -specify anything metric for the evaluation.   - -21 -00:02:00,960 --> 00:02:05,520 -To get those metrics, we will first gather the  -predictions on the whole evaluation set using the   - -22 -00:02:05,520 --> 00:02:11,760 -predict method. It returns a namedtuple with three  -fields: predictions (which contains the model   - -23 -00:02:11,760 --> 00:02:17,760 -predictions), label_ids (which contains the labels  -if your dataset had them) and metrics (which is   - -24 -00:02:17,760 --> 00:02:24,480 -empty here). The predictions are the logits of  -the models for all the sentences in the dataset,   - -25 -00:02:24,480 --> 00:02:31,440 -so a NumPy array of shape 408 by 2. To match them  -with our labels, we need to take the maximum logit   - -26 -00:02:31,440 --> 00:02:36,560 -for each prediction (to know which of the two  -classes was predicted), which we do with the   - -27 -00:02:36,560 --> 00:02:42,480 -argmax function. Then we can use a Metric from  -the Datasets library: it can be loaded as easily   - -28 -00:02:42,480 --> 00:02:47,200 -as our dataset with the load_metric function,  -and it returns the evaluation metric used for   - -29 -00:02:47,200 --> 00:02:54,080 -the dataser we are using. We can see our model  -did learn something as it is 85.7% accurate.   
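A condensed sketch of the Trainer setup walked through above; the checkpoint name and output folder are illustrative, and `tokenized_datasets`, `data_collator` and `tokenizer` are assumed from the earlier preprocessing:

from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
training_args = TrainingArguments("test-trainer")  # folder for results and checkpoints

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()

predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape)  # logits for each validation sample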
- -30 -00:02:55,200 --> 00:02:59,920 -To monitor the evaluation metrics during training  -we need to define a compute_metrics function   - -31 -00:02:59,920 --> 00:03:05,200 -that does the same step as before: it takes  -a namedtuple with predictions and labels   - -32 -00:03:05,200 --> 00:03:08,000 -and must return a dictionary with  -the metric we want to keep track of.   - -33 -00:03:09,120 --> 00:03:14,400 -By passing the epoch evaluation strategy to our  -TrainingArguments, we tell the Trainer to evaluate   - -34 -00:03:14,400 --> 00:03:20,400 -at the end of every epoch. Launching a training  -inside a notebook will then display a progress bar   - -35 -00:03:20,400 --> 00:03:29,920 -and complete the table you see  -here as you pass every epoch. +1 +00:00:00,304 --> 00:00:01,285 +(air whooshing) + +2 +00:00:01,285 --> 00:00:02,345 +(air popping) + +3 +00:00:02,345 --> 00:00:05,698 +(air whooshing) + +4 +00:00:05,698 --> 00:00:06,548 +- So Trainer API. + +5 +00:00:08,070 --> 00:00:10,040 +So Transformers Library +provides a Trainer API + +6 +00:00:10,040 --> 00:00:13,320 +that allows you to easily +find tune transformers models + +7 +00:00:13,320 --> 00:00:14,193 +on your dataset. + +8 +00:00:15,150 --> 00:00:17,250 +So Trainer class takes your datasets, + +9 +00:00:17,250 --> 00:00:19,900 +your model as well as the +training hyperparameters + +10 +00:00:20,820 --> 00:00:23,310 +and can perform the training +on any kind of setup, + +11 +00:00:23,310 --> 00:00:26,654 +CPU, GPU, multiple GPUs, TPUs + +12 +00:00:26,654 --> 00:00:28,680 +can also compute the predictions + +13 +00:00:28,680 --> 00:00:31,710 +on any dataset and if you provided metrics + +14 +00:00:31,710 --> 00:00:33,813 +evaluate your model on any dataset. + +15 +00:00:34,950 --> 00:00:36,930 +You can also involve final data processing + +16 +00:00:36,930 --> 00:00:38,670 +such as dynamic padding, + +17 +00:00:38,670 --> 00:00:40,377 +as long as you provide the tokenizer + +18 +00:00:40,377 --> 00:00:42,693 +or given data collator. + +19 +00:00:43,572 --> 00:00:45,900 +We will try this API on the MRPC dataset, + +20 +00:00:45,900 --> 00:00:48,492 +since it's relatively small +and easy to preprocess. + +21 +00:00:48,492 --> 00:00:49,325 +As we saw in the Datasets overview video, + +22 +00:00:49,325 --> 00:00:54,325 +however we can preprocess it. + +23 +00:00:54,511 --> 00:00:57,030 +We do not apply padding +during the preprocessing, + +24 +00:00:57,030 --> 00:00:58,590 +as we will use dynamic padding + +25 +00:00:58,590 --> 00:01:00,083 +before DataCollatorWithPadding. + +26 +00:01:01,170 --> 00:01:02,790 +Note that we don't do the final steps + +27 +00:01:02,790 --> 00:01:04,830 +of renaming removing columns + +28 +00:01:04,830 --> 00:01:06,873 +or set the format to torch tensors. + +29 +00:01:07,710 --> 00:01:10,560 +So Trainer will do all of +this automatically for us + +30 +00:01:10,560 --> 00:01:12,633 +by analyzing the model signature. + +31 +00:01:14,054 --> 00:01:16,650 +The last step before +creating the Trainer are + +32 +00:01:16,650 --> 00:01:17,940 +to define a model + +33 +00:01:17,940 --> 00:01:20,250 +and some training hyperparameters. + +34 +00:01:20,250 --> 00:01:22,653 +We saw to do the first +in the model API video. + +35 +00:01:23,734 --> 00:01:26,790 +For the second we use the +TrainingArguments class. 
+ +36 +00:01:26,790 --> 00:01:28,710 +It only takes a path to a folder + +37 +00:01:28,710 --> 00:01:30,900 +where results and +checkpoint will be saved, + +38 +00:01:30,900 --> 00:01:33,060 +but you can also customize +all the hyperparameters + +39 +00:01:33,060 --> 00:01:34,470 +your Trainer will use, + +40 +00:01:34,470 --> 00:01:37,270 +learning weight, number of +training impacts, et. cetera. + +41 +00:01:38,190 --> 00:01:39,660 +It's been very easy to create a Trainer + +42 +00:01:39,660 --> 00:01:41,400 +and launch a training. + +43 +00:01:41,400 --> 00:01:43,170 +You should display a progress bar + +44 +00:01:43,170 --> 00:01:45,900 +and after a few minutes +if you're running on a GPU + +45 +00:01:45,900 --> 00:01:48,000 +you should have the training finished. + +46 +00:01:48,000 --> 00:01:50,790 +The result will be rather +anticlimactic however, + +47 +00:01:50,790 --> 00:01:52,710 +as you will only get a training loss + +48 +00:01:52,710 --> 00:01:54,300 +which doesn't really tell you anything + +49 +00:01:54,300 --> 00:01:56,820 +about how well your model is performing. + +50 +00:01:56,820 --> 00:01:58,977 +This is because we +didn't specify any metric + +51 +00:01:58,977 --> 00:02:00,273 +for the evaluation. + +52 +00:02:01,200 --> 00:02:02,160 +To get those metrics, + +53 +00:02:02,160 --> 00:02:03,810 +we will first gather the predictions + +54 +00:02:03,810 --> 00:02:06,513 +on the whole evaluation set +using the predict method. + +55 +00:02:07,440 --> 00:02:10,020 +It returns a namedtuple with three fields, + +56 +00:02:10,020 --> 00:02:12,990 +Prediction, which contains +the model of predictions. + +57 +00:02:12,990 --> 00:02:15,030 +Label_IDs, which contains the labels + +58 +00:02:15,030 --> 00:02:16,800 +if your dataset had them + +59 +00:02:16,800 --> 00:02:18,570 +and metrics which is empty here. + +60 +00:02:18,570 --> 00:02:20,520 +We're trying to do that. + +61 +00:02:20,520 --> 00:02:22,470 +The predictions are the +logits of the models + +62 +00:02:22,470 --> 00:02:24,143 +for all the sentences in the dataset. + +63 +00:02:24,143 --> 00:02:27,513 +So a NumPy array of shape 408 by 2. + +64 +00:02:28,500 --> 00:02:30,270 +To match them with our labels, + +65 +00:02:30,270 --> 00:02:31,590 +we need to take the maximum logit + +66 +00:02:31,590 --> 00:02:32,850 +for each prediction + +67 +00:02:32,850 --> 00:02:35,820 +to know which of the two +classes was predicted. + +68 +00:02:35,820 --> 00:02:37,683 +We do this with the argmax function. + +69 +00:02:38,640 --> 00:02:41,550 +Then we can use a metric +from the Datasets library. + +70 +00:02:41,550 --> 00:02:43,500 +It can be loaded as easily as a dataset + +71 +00:02:43,500 --> 00:02:45,360 +with the load metric function + +72 +00:02:45,360 --> 00:02:49,500 +and each returns the evaluation +metric used for the dataset. + +73 +00:02:49,500 --> 00:02:51,600 +We can see our model did learn something + +74 +00:02:51,600 --> 00:02:54,363 +as it is 85.7% accurate. + +75 +00:02:55,440 --> 00:02:57,870 +To monitor the evaluation +matrix during training, + +76 +00:02:57,870 --> 00:02:59,829 +we need to define a +compute_metrics function + +77 +00:02:59,829 --> 00:03:02,670 +that does the same step as before. + +78 +00:03:02,670 --> 00:03:04,728 +It takes a namedtuple with +predictions and labels + +79 +00:03:04,728 --> 00:03:06,327 +and must return a dictionary + +80 +00:03:06,327 --> 00:03:08,427 +with the metrics we want to keep track of. 
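A sketch of such a compute_metrics function, using the GLUE MRPC metric as an illustrative choice and reusing the objects from the earlier Trainer sketch:

import numpy as np
from datasets import load_metric
from transformers import Trainer, TrainingArguments

metric = load_metric("glue", "mrpc")  # illustrative; pick the metric matching your dataset

def compute_metrics(eval_preds):
    logits, labels = eval_preds               # predictions and labels
    predictions = np.argmax(logits, axis=-1)  # highest logit = predicted class
    return metric.compute(predictions=predictions, references=labels)

# Passed to the Trainer together with an epoch evaluation strategy.
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)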
+ +81 +00:03:09,360 --> 00:03:11,490 +By passing the epoch evaluation strategy + +82 +00:03:11,490 --> 00:03:13,080 +to our training arguments, + +83 +00:03:13,080 --> 00:03:14,490 +we tell the Trainer to evaluate + +84 +00:03:14,490 --> 00:03:15,903 +at the end of every epoch. + +85 +00:03:17,280 --> 00:03:18,587 +Launching a training inside a notebook + +86 +00:03:18,587 --> 00:03:20,640 +will then display a progress bar + +87 +00:03:20,640 --> 00:03:23,643 +and complete the table you see +here as you pass every epoch. + +88 +00:03:25,400 --> 00:03:28,249 +(air whooshing) + +89 +00:03:28,249 --> 00:03:29,974 +(air decrescendos) + diff --git a/subtitles/en/25_keras-introduction.srt b/subtitles/en/25_keras-introduction.srt index cace81f9f..d2960bba1 100644 --- a/subtitles/en/25_keras-introduction.srt +++ b/subtitles/en/25_keras-introduction.srt @@ -1,129 +1,290 @@ -1 -00:00:05,120 --> 00:00:10,640 -In this video, I'm going to give you a very quick  -introduction to how our transformers models work   - -2 -00:00:10,640 --> 00:00:17,120 -together with Tensorflow and Keras! The very short  -explanation is that all of our Tensorflow models   - -3 -00:00:17,120 --> 00:00:23,760 -are also Keras model objects, and so they have the  -standard Keras model API. If you're an experienced   - -4 -00:00:23,760 --> 00:00:28,640 -ML engineer who's used Keras a lot, that's  -probably all you need to know to start working   - -5 -00:00:28,640 --> 00:00:34,160 -with them. But for everyone else, including  -the prodigal PyTorch engineers out there who   - -6 -00:00:34,160 --> 00:00:39,360 -are returning to the fold, I'm going to quickly  -introduce Keras models, and how we work with them.   - -7 -00:00:40,320 --> 00:00:46,240 -In other videos, which I'll link below, I'll run  -through training with Keras models in more detail.   - -8 -00:00:46,240 --> 00:00:54,640 -But first, what is a Keras model? Your model  -basically contains your entire network:   - -9 -00:00:54,640 --> 00:00:59,600 -It contains the layers, and the weights for  -those layers, and also tells the model what   - -10 -00:00:59,600 --> 00:01:04,560 -to do with them; it defines the whole path  -all the way from your inputs to your outputs.   - -11 -00:01:05,280 --> 00:01:10,880 -If you've used Keras before, you probably  -started by building your model out by   - -12 -00:01:10,880 --> 00:01:17,600 -hand - you added one layer after another, maybe  -using model.add() or the functional approach.   - -13 -00:01:18,480 --> 00:01:26,240 -And there's nothing wrong with that! But you can  -also pre-load an entire model, weights and all.   - -14 -00:01:26,960 --> 00:01:33,920 -This is really helpful, because if you try  -reading the paper or looking at the code,   - -15 -00:01:33,920 --> 00:01:38,400 -you'll see the inside of a Transformer is  -pretty complex, and writing it all out from   - -16 -00:01:38,400 --> 00:01:43,280 -scratch and getting it right would be hard even  -for an experienced machine learning engineer.   - -17 -00:01:43,280 --> 00:01:48,080 -But because it's all packed inside a Model, you  -don't need to worry about that complexity if   - -18 -00:01:48,080 --> 00:01:53,840 -you don't want to! You have the flexibility to  -write any model you like, but you can also just   - -19 -00:01:54,400 --> 00:01:58,640 -load a pre-trained, pre-configured  -transformer model in one line of code.   
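That one line could look roughly like this; the checkpoint name is an illustrative assumption:

import tensorflow as tf
from transformers import TFAutoModel

model = TFAutoModel.from_pretrained("bert-base-cased")  # one line: architecture, config and weights
print(isinstance(model, tf.keras.Model))                # True, so the usual Keras methods apply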
- -20 -00:02:00,000 --> 00:02:09,040 -And whether you write your own model from scratch  -or load a pre-trained one, you interact with the   - -21 -00:02:09,040 --> 00:02:14,560 -model in the same way - through the same few  -methods you're going to see again and again,   - -22 -00:02:15,200 --> 00:02:22,000 -like *fit*, *compile* and *predict,* and we'll  -cover concrete examples of how to use those   - -23 -00:02:22,000 --> 00:02:26,960 -methods in other videos that I'll link below. For  -now the key thing to take away from this video, if   - -24 -00:02:26,960 --> 00:02:31,920 -you've never seen Keras before, is that this neat  -encapsulation means that all of the complexity of   - -25 -00:02:31,920 --> 00:02:36,560 -a huge neural net becomes manageable, because  -you interact with it in exactly the same way,   - -26 -00:02:36,560 --> 00:02:49,760 -using exactly the same methods, as you would  -with a simple model that you wrote out by hand. +1 +00:00:00,430 --> 00:00:03,013 +(upbeat music) + +2 +00:00:05,160 --> 00:00:07,080 +- In this video, I'm going to give you + +3 +00:00:07,080 --> 00:00:10,350 +a very quick introduction to +how our transformer models + +4 +00:00:10,350 --> 00:00:14,040 +work together with Tensorflow and Keras. + +5 +00:00:14,040 --> 00:00:15,510 +The very short explanation + +6 +00:00:15,510 --> 00:00:17,310 +is that all of our Tensorflow models + +7 +00:00:17,310 --> 00:00:19,470 +are also Keras model objects, + +8 +00:00:19,470 --> 00:00:22,950 +and so they have the +standard Keras model API. + +9 +00:00:22,950 --> 00:00:24,960 +If you're an experienced +machine learning engineer + +10 +00:00:24,960 --> 00:00:28,230 +who's used Keras a lot, that's +probably all you need to know + +11 +00:00:28,230 --> 00:00:29,610 +to start working with them. + +12 +00:00:29,610 --> 00:00:30,900 +But for everyone else, + +13 +00:00:30,900 --> 00:00:34,170 +including the prodigal +PyTorch engineers out there + +14 +00:00:34,170 --> 00:00:35,910 +who are returning to the fold, + +15 +00:00:35,910 --> 00:00:38,430 +I'm going to quickly +introduce Keras models, + +16 +00:00:38,430 --> 00:00:40,440 +and how we work with them. + +17 +00:00:40,440 --> 00:00:43,080 +In other videos, which I'll link below, + +18 +00:00:43,080 --> 00:00:46,440 +I'll run through training with +Keras models in more detail. + +19 +00:00:46,440 --> 00:00:50,820 +But first, at a high level, +what is a Keras model? + +20 +00:00:50,820 --> 00:00:54,810 +So your model basically +contains your entire network. + +21 +00:00:54,810 --> 00:00:58,230 +It contains the layers, and +the weights for those layers, + +22 +00:00:58,230 --> 00:01:00,690 +and also tells the model +what to do with them + +23 +00:01:00,690 --> 00:01:02,880 +so it defines the whole path all the way + +24 +00:01:02,880 --> 00:01:05,460 +from your inputs to your outputs. + +25 +00:01:05,460 --> 00:01:07,380 +If you've used Keras before, + +26 +00:01:07,380 --> 00:01:09,480 +you probably started using model objects + +27 +00:01:09,480 --> 00:01:11,850 +by building them out by hand, + +28 +00:01:11,850 --> 00:01:14,250 +you added one layer after another + +29 +00:01:14,250 --> 00:01:18,690 +and maybe using the model.add() +or the functional approach. + +30 +00:01:18,690 --> 00:01:20,490 +And there's nothing wrong with that. + +31 +00:01:21,390 --> 00:01:23,430 +Lots of great models are built that way + +32 +00:01:23,430 --> 00:01:26,970 +but you can also pre-load an +entire model, weights and all. 
+ +33 +00:01:26,970 --> 00:01:29,994 +And this is really +helpful, because if you, + +34 +00:01:29,994 --> 00:01:32,490 +as you can see here, if +you try reading the paper + +35 +00:01:32,490 --> 00:01:34,110 +or if you try looking at the code, + +36 +00:01:34,110 --> 00:01:37,350 +you'll see the inside of a +Transformer is pretty complex, + +37 +00:01:37,350 --> 00:01:40,110 +and writing it all out from +scratch and getting it right + +38 +00:01:40,110 --> 00:01:41,850 +would be hard even for an experienced + +39 +00:01:41,850 --> 00:01:43,500 +machine learning engineer. + +40 +00:01:43,500 --> 00:01:45,870 +But because it's all +packed inside a model, + +41 +00:01:45,870 --> 00:01:48,150 +you don't need to worry +about that complexity on that + +42 +00:01:48,150 --> 00:01:49,140 +if you don't want to. + +43 +00:01:49,140 --> 00:01:51,570 +If you're a researcher, if you +want to really dig in there + +44 +00:01:51,570 --> 00:01:55,650 +you can, but you can also +just load a pre-trained, + +45 +00:01:55,650 --> 00:01:59,013 +pre-configured transformer +model in just one line of code. + +46 +00:02:00,150 --> 00:02:03,480 +And when I mentioned +earlier about the Keras API, + +47 +00:02:03,480 --> 00:02:04,560 +the advantage of it is that + +48 +00:02:04,560 --> 00:02:06,690 +whether you write your +own model from scratch + +49 +00:02:06,690 --> 00:02:09,510 +or load a pre-trained one, +you interact with the model + +50 +00:02:09,510 --> 00:02:11,850 +through that same API, so you use exactly + +51 +00:02:11,850 --> 00:02:13,950 +the same few methods and +you're gonna see them + +52 +00:02:13,950 --> 00:02:16,380 +again and again, these methods like fit, + +53 +00:02:16,380 --> 00:02:19,650 +compile and predict, +and like I've mentioned + +54 +00:02:19,650 --> 00:02:22,530 +we'll cover concrete examples +of how to use those methods + +55 +00:02:22,530 --> 00:02:24,330 +in the videos I'll link below. + +56 +00:02:24,330 --> 00:02:27,000 +For now the key thing to +take away from this video, + +57 +00:02:27,000 --> 00:02:28,950 +if you've never seen Keras before, + +58 +00:02:28,950 --> 00:02:30,870 +is that this neat encapsulation means + +59 +00:02:30,870 --> 00:02:33,090 +that all the complexity +of a huge neural net + +60 +00:02:33,090 --> 00:02:35,430 +becomes manageable, because +you interact with it + +61 +00:02:35,430 --> 00:02:39,000 +in exactly the same way, using +exactly the same methods, + +62 +00:02:39,000 --> 00:02:41,700 +whether it's a huge +pre-trained language model + +63 +00:02:41,700 --> 00:02:43,950 +or a simple model that +you wrote out by hand. + +64 +00:02:45,466 --> 00:02:48,049 +(upbeat music) + diff --git a/subtitles/en/26_fine-tuning-with-tensorflow.srt b/subtitles/en/26_fine-tuning-with-tensorflow.srt index 259fc2f47..fb2536667 100644 --- a/subtitles/en/26_fine-tuning-with-tensorflow.srt +++ b/subtitles/en/26_fine-tuning-with-tensorflow.srt @@ -1,353 +1,567 @@ -1 -00:00:06,069 --> 00:00:11,580 -In this video, we're going to see how to load -and fine-tune a pre-trained model. - -2 -00:00:11,580 --> 00:00:16,010 -It's very quick, and if you've watched our -pipeline videos, which I'll link below, the - -3 -00:00:16,010 --> 00:00:18,330 -process is very similar. - -4 -00:00:18,330 --> 00:00:21,990 -This time, though, we're going to be using -transfer learning and doing some training - -5 -00:00:21,990 --> 00:00:26,660 -ourselves, rather than just loading a model -and using it as-is. 
- -6 -00:00:26,660 --> 00:00:30,610 -To learn more about transfer learning, head -to the 'What is transfer learning?' - -7 -00:00:30,610 --> 00:00:33,000 -video, which we'll link below too! - -8 -00:00:33,000 --> 00:00:35,660 -So now let's look at this code. - -9 -00:00:35,660 --> 00:00:40,340 -To start, we pick which model we want to start -with - in this case we're going to use the - -10 -00:00:40,340 --> 00:00:42,540 -famous, the original BERT. - -11 -00:00:42,540 --> 00:00:50,500 -But what does this monstrosity, 'TFAutoModelForSequenceClassification' -mean? - -12 -00:00:50,500 --> 00:00:56,460 -Well, the TF stands for TensorFlow, and the -rest means "take a language model, and stick - -13 -00:00:56,460 --> 00:01:00,879 -a sequence classification head onto it if -it doesn't have one already". - -14 -00:01:00,879 --> 00:01:05,420 -So what we're going to do here is load BERT, -a general language model, and then do some - -15 -00:01:05,420 --> 00:01:09,490 -transfer learning to use it on our task of -interest. - -16 -00:01:09,490 --> 00:01:13,530 -We load the language model with this one line -of code here, using the "from_pretrained" - -17 -00:01:13,530 --> 00:01:14,530 -method. - -18 -00:01:14,530 --> 00:01:21,230 -That method needs to know two things: Firstly -the name of the model you want it to load, - -19 -00:01:21,230 --> 00:01:29,840 -and secondly how many classes your problem -has. - -20 -00:01:29,840 --> 00:01:33,500 -If you want to follow along with the data -from our datasets videos, which I'll link - -21 -00:01:33,500 --> 00:01:41,200 -below, then you'll have two classes, positive -and negative, and thus num_labels equals two. - -22 -00:01:41,200 --> 00:01:43,590 -What about this "compile" thing? - -23 -00:01:43,590 --> 00:01:47,909 -If you're familiar with Keras, you've probably -seen this already, but if not, this is one - -24 -00:01:47,909 --> 00:01:55,520 -of its core methods - you always need to "compile" -your model before you train it. - -25 -00:01:55,520 --> 00:02:01,240 -Compile needs to know two things: Firstly, -the loss function - what are we trying to - -26 -00:02:01,240 --> 00:02:02,240 -optimize? - -27 -00:02:02,240 --> 00:02:08,509 -Here, we import the sparse categorical crossentropy -loss function - that's a mouthful, but it's - -28 -00:02:08,509 --> 00:02:13,390 -the standard loss function for any neural -network that's doing a classification task. - -29 -00:02:13,390 --> 00:02:18,170 -It basically encourages the network to output -large values for the right class, and low - -30 -00:02:18,170 --> 00:02:21,080 -values for the wrong classes. - -31 -00:02:21,080 --> 00:02:26,140 -Note that you can specify the loss function -as a string, like we did with the optimizer, - -32 -00:02:26,140 --> 00:02:34,319 -but there's a very common pitfall there - by -default, this loss assumes the output is probabilities - -33 -00:02:34,319 --> 00:02:39,650 -after a softmax layer, but what our model -has actually output is the values before the - -34 -00:02:39,650 --> 00:02:50,140 -softmax, often called the "logits" - you saw -these before in the videos about pipelines. - -35 -00:02:50,140 --> 00:02:54,580 -If you get this wrong, your model won't train -and it'll be very annoying to figure out why. 
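The pitfall above is easiest to see in code. This is only a sketch of the two loss configurations; it uses the standard Keras loss class and assumes, as the video does, that the model outputs raw logits.

import tensorflow as tf

# The string shortcut loss="sparse_categorical_crossentropy" builds the loss with
# from_logits=False, i.e. it assumes the model already applied a softmax.
loss_expecting_probabilities = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)

# Transformers models output pre-softmax values (logits), so this is the setting to use.
loss_expecting_logits = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)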
- -36 -00:02:54,580 --> 00:02:58,500 -In fact, if you remember absolutely nothing -else from this video, remember to always check - -37 -00:02:58,500 --> 00:03:02,990 -whether your model is outputting logits or -probabilities, and to make sure your loss - -38 -00:03:02,990 --> 00:03:05,270 -is set up to match that. - -39 -00:03:05,270 --> 00:03:09,460 -It'll save you a lot of debugging headaches -in your career! - -40 -00:03:09,460 --> 00:03:13,340 -The second thing compile needs to know is -the optimizer you want. - -41 -00:03:13,340 --> 00:03:17,570 -In our case, we use Adam, which is sort of -the standard optimizer for deep learning these - -42 -00:03:17,570 --> 00:03:18,730 -days. - -43 -00:03:18,730 --> 00:03:22,770 -The one thing you might want to change is -the learning rate, and to do that we'll need - -44 -00:03:22,770 --> 00:03:27,330 -to import the actual optimizer rather than -just calling it by string, but we'll talk - -45 -00:03:27,330 --> 00:03:30,050 -about that in another video, which I'll link -below. - -46 -00:03:30,050 --> 00:03:33,610 -For now, let's just try training the model! - -47 -00:03:33,610 --> 00:03:35,830 -So how do you train a model? - -48 -00:03:35,830 --> 00:03:40,670 -Well, if you’ve used Keras before, this -will all be very familiar to you - but if - -49 -00:03:40,670 --> 00:03:43,370 -not, let's look at what we're doing here. - -50 -00:03:43,370 --> 00:03:48,371 -Fit() is pretty much the central method for -Keras models - it tells the model to break - -51 -00:03:48,371 --> 00:03:49,371 -the data into batches and train on it. - -52 -00:03:49,371 --> 00:03:50,371 -So the first input is tokenized text - you -will almost always be getting this from a - -53 -00:03:50,371 --> 00:03:52,120 -tokenizer, and if you want to learn more about -that process, and what exactly the outputs - -54 -00:03:52,120 --> 00:03:53,120 -look like, please check out our videos on -tokenizers - there'll be links below for those - -55 -00:03:53,120 --> 00:03:54,120 -too! - -56 -00:03:54,120 --> 00:03:55,120 -So that's our inputs, and then the second -input is our labels - this is just a one-dimensional - -57 -00:03:55,120 --> 00:03:56,840 -Numpy or Tensorflow array of integers, corresponding -to the classes for our examples, and that’s - -58 -00:03:56,840 --> 00:03:57,840 -it. - -59 -00:03:57,840 --> 00:03:58,840 -If you're following along with the data from -our datasets video, there'll only be two classes, - -60 -00:03:58,840 --> 00:04:00,300 -so this will just be zeroes and ones. - -61 -00:04:00,300 --> 00:04:04,870 -Once we have our inputs and our labels, we -do the same thing with the validation data, - -62 -00:04:04,870 --> 00:04:07,120 -we pass the validation inputs and the validation -labels in a tuple, then we can, if we want, - -63 -00:04:07,120 --> 00:04:15,390 -specify details like the batch_size for training, -and then you just pass it all to model.fit() - -64 -00:04:15,390 --> 00:04:16,540 -and let it rip. - -65 -00:04:16,540 --> 00:04:20,449 -If everything works out, you should see a -little training progress bar as your loss - -66 -00:04:20,449 --> 00:04:21,670 -goes down. - -67 -00:04:21,670 --> 00:04:26,870 -And while that's running you call your boss -and tell him you’re a senior NLP machine - -68 -00:04:26,870 --> 00:04:30,509 -learning engineer now and you’re going to -want a salary review next quarter. 
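Here is a rough sketch of the fit() call described above. It assumes `model` is the compiled model from the earlier lines; the checkpoint name, the text and label variables, and the batch size are placeholder assumptions rather than values from the video.

import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # assumed example checkpoint

# First input: tokenized text, here as a dict of NumPy arrays straight from the tokenizer.
train_inputs = dict(tokenizer(train_texts, padding=True, truncation=True, return_tensors="np"))
val_inputs = dict(tokenizer(val_texts, padding=True, truncation=True, return_tensors="np"))

# Second input: a one-dimensional array of integer labels, zeroes and ones for two classes.
train_labels = np.array(train_label_list)
val_labels = np.array(val_label_list)

model.fit(
    train_inputs,
    train_labels,
    validation_data=(val_inputs, val_labels),  # validation inputs and labels as a tuple
    batch_size=8,                              # example value
)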
- -69 -00:04:30,509 --> 00:04:38,470 -This is really all it takes to apply the power -of a massive pretrained language model to - -70 -00:04:38,470 --> 00:04:40,770 -your NLP problem. - -71 -00:04:40,770 --> 00:04:42,440 -Could we do better, though? - -72 -00:04:42,440 --> 00:04:47,180 -We certainly could, with a few more advanced -Keras features like a tuned, scheduled learning - -73 -00:04:47,180 --> 00:04:50,889 -rate we can get an even lower loss, and an -even more accurate model. - -74 -00:04:50,889 --> 00:04:54,039 -And what do we do with our model once it's -trained? - -75 -00:04:54,039 --> 00:05:02,919 -I'll cover this and more in the videos linked -below! +1 +00:00:00,253 --> 00:00:02,920 +(air whooshing) + +2 +00:00:06,060 --> 00:00:08,070 +- In this video, we're going to see + +3 +00:00:08,070 --> 00:00:11,430 +how to load and fine +tune a pre-trained model. + +4 +00:00:11,430 --> 00:00:12,510 +It's very quick. + +5 +00:00:12,510 --> 00:00:14,490 +And if you've watched our pipeline videos, + +6 +00:00:14,490 --> 00:00:18,150 +which I'll link below, the +process is very similar. + +7 +00:00:18,150 --> 00:00:20,940 +This time, though, we're going +to be using transfer learning + +8 +00:00:20,940 --> 00:00:23,040 +and doing some training ourselves, + +9 +00:00:23,040 --> 00:00:26,400 +rather than just loading a +model and using it as is. + +10 +00:00:26,400 --> 00:00:28,710 +So to learn more about transfer learning, + +11 +00:00:28,710 --> 00:00:31,320 +head to the 'What is +transfer learning?' video, + +12 +00:00:31,320 --> 00:00:33,420 +and we'll link that below as well. + +13 +00:00:33,420 --> 00:00:35,610 +But for now, let's look at this code. + +14 +00:00:35,610 --> 00:00:38,730 +To start, we pick which +model we want to start with. + +15 +00:00:38,730 --> 00:00:40,920 +In this case, we're +going to use the famous, + +16 +00:00:40,920 --> 00:00:42,060 +the original BERT, + +17 +00:00:42,060 --> 00:00:44,850 +as the foundation for our training today. + +18 +00:00:44,850 --> 00:00:46,770 +But what is this monstrosity line, + +19 +00:00:46,770 --> 00:00:48,797 +this +'TFAutoModelForSequenceClassification'? + +20 +00:00:49,860 --> 00:00:51,180 +What does that mean? + +21 +00:00:51,180 --> 00:00:53,130 +Well, the TF stands for TensorFlow. + +22 +00:00:53,130 --> 00:00:54,660 +And the rest means, + +23 +00:00:54,660 --> 00:00:55,950 +take a language model, + +24 +00:00:55,950 --> 00:00:58,380 +and stick a sequence +classification head onto it + +25 +00:00:58,380 --> 00:01:00,750 +if it doesn't have one already. + +26 +00:01:00,750 --> 00:01:02,880 +So this line of code loads BERT, + +27 +00:01:02,880 --> 00:01:05,040 +which is a general purpose language model, + +28 +00:01:05,040 --> 00:01:07,650 +it loads at weights, architecture, and all + +29 +00:01:07,650 --> 00:01:10,920 +and then adds a new sequence +classification head onto it + +30 +00:01:10,920 --> 00:01:13,440 +with randomly initialized weights. + +31 +00:01:13,440 --> 00:01:15,870 +So this method needs to know two things. + +32 +00:01:15,870 --> 00:01:18,270 +Firstly, it needs to know +the name of the model + +33 +00:01:18,270 --> 00:01:21,060 +you wanted to load, the +architecture and weights for. + +34 +00:01:21,060 --> 00:01:23,940 +And secondly, it needs +to know how many classes + +35 +00:01:23,940 --> 00:01:26,693 +your problem has, because +that will determine the size, + +36 +00:01:26,693 --> 00:01:29,610 +the number of neurons in the output head. 
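Putting those two pieces of information together, the loading step can be sketched like this; "bert-base-cased" is an assumed stand-in for "the original BERT".

from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",  # which architecture and weights to load
    num_labels=2,       # how many classes, i.e. how many neurons in the new output head
)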
+ +37 +00:01:29,610 --> 00:01:31,530 +So if you want to follow +along with the data + +38 +00:01:31,530 --> 00:01:34,500 +from our datasets videos, +which I'll link below, + +39 +00:01:34,500 --> 00:01:37,440 +then you'll have two classes, +positive and negative, + +40 +00:01:37,440 --> 00:01:39,723 +and thus num_labels equals two. + +41 +00:01:40,830 --> 00:01:43,230 +But what about this compile line? + +42 +00:01:43,230 --> 00:01:44,970 +Well, if you're familiar with Keras, + +43 +00:01:44,970 --> 00:01:46,920 +you've probably seen this already. + +44 +00:01:46,920 --> 00:01:49,800 +But if not, this is one of +the core methods in Keras + +45 +00:01:49,800 --> 00:01:51,450 +that you're gonna see again, and again. + +46 +00:01:51,450 --> 00:01:54,900 +You always need to compile +your model before you train it. + +47 +00:01:54,900 --> 00:01:57,870 +And compile needs to know two things. + +48 +00:01:57,870 --> 00:02:00,090 +Firstly, it needs to +know the loss function, + +49 +00:02:00,090 --> 00:02:02,340 +which is what you're trying to optimize. + +50 +00:02:02,340 --> 00:02:05,910 +So here, we import the +SparseCategoricalCrossentropy + +51 +00:02:05,910 --> 00:02:07,260 +loss function. + +52 +00:02:07,260 --> 00:02:09,930 +So that's a mouthful, but it's +the standard loss function + +53 +00:02:09,930 --> 00:02:13,260 +for any neural network that's +doing a classification task. + +54 +00:02:13,260 --> 00:02:14,970 +It basically encourages the network + +55 +00:02:14,970 --> 00:02:17,730 +to output large values +for the right class, + +56 +00:02:17,730 --> 00:02:20,910 +and low values for the wrong classes. + +57 +00:02:20,910 --> 00:02:24,150 +Note that you can specify the +loss function as a string, + +58 +00:02:24,150 --> 00:02:26,010 +like we did with the optimizer. + +59 +00:02:26,010 --> 00:02:27,600 +But there's a risk there, + +60 +00:02:27,600 --> 00:02:30,090 +there's a very common +trap people fall into, + +61 +00:02:30,090 --> 00:02:32,580 +which is that by default, +this loss assumes + +62 +00:02:32,580 --> 00:02:36,510 +the output is probabilities +after a softmax layer. + +63 +00:02:36,510 --> 00:02:38,310 +But what our model has actually output + +64 +00:02:38,310 --> 00:02:40,770 +is the values before the softmax, + +65 +00:02:40,770 --> 00:02:43,800 +often called the logits, sometimes logits. + +66 +00:02:43,800 --> 00:02:46,110 +No one's quite sure how +to pronounce that one. + +67 +00:02:46,110 --> 00:02:47,790 +But you probably seen these before + +68 +00:02:47,790 --> 00:02:49,950 +in the video about pipelines. + +69 +00:02:49,950 --> 00:02:52,320 +So if you get this wrong, +your model won't train + +70 +00:02:52,320 --> 00:02:54,723 +and it'll be very annoying +to figure out why. + +71 +00:02:55,590 --> 00:02:57,540 +In future videos, we're gonna see + +72 +00:02:57,540 --> 00:03:00,540 +how to use the model's +internal loss computations, + +73 +00:03:00,540 --> 00:03:02,910 +so that you don't have to +specify the loss yourself + +74 +00:03:02,910 --> 00:03:05,340 +and you don't have to +worry about these details. + +75 +00:03:05,340 --> 00:03:09,480 +But for now, remember to +set from_logits to true. + +76 +00:03:09,480 --> 00:03:11,430 +The second thing compile needs to know + +77 +00:03:11,430 --> 00:03:13,230 +is the optimizer you want. + +78 +00:03:13,230 --> 00:03:15,120 +In our case, we use adam, + +79 +00:03:15,120 --> 00:03:16,830 +which is sort of the standard optimizer + +80 +00:03:16,830 --> 00:03:18,720 +for deep learning these days. 
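A sketch of the compile step just described, assuming `model` was loaded as above. The from_logits=True argument is the detail the earlier warning is about.

import tensorflow as tf

model.compile(
    optimizer="adam",  # the string shortcut is fine while Adam's defaults are acceptable
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)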
+ +81 +00:03:18,720 --> 00:03:20,520 +The one thing you might want to change + +82 +00:03:20,520 --> 00:03:21,780 +is the learning rate. + +83 +00:03:21,780 --> 00:03:24,630 +And to do that, we'll need to +import the actual optimizer + +84 +00:03:24,630 --> 00:03:26,910 +rather than just calling it by string. + +85 +00:03:26,910 --> 00:03:28,680 +But we'll talk about +that in another video, + +86 +00:03:28,680 --> 00:03:30,090 +which I'll link below. + +87 +00:03:30,090 --> 00:03:33,360 +For now, let's just +try training the model. + +88 +00:03:33,360 --> 00:03:35,580 +Well, so how do you train the model? + +89 +00:03:35,580 --> 00:03:37,950 +Again, if you've used Keras before, + +90 +00:03:37,950 --> 00:03:40,350 +this is all going to be +very familiar to you. + +91 +00:03:40,350 --> 00:03:42,210 +But if not, let's very quickly look + +92 +00:03:42,210 --> 00:03:43,710 +at what we're doing here. + +93 +00:03:43,710 --> 00:03:47,010 +fit is pretty much the central +method for Keras models. + +94 +00:03:47,010 --> 00:03:49,983 +It tells the model to train +on the data we're passing in. + +95 +00:03:50,820 --> 00:03:52,920 +So here we pass the datasets we made + +96 +00:03:52,920 --> 00:03:54,510 +in the previous section, + +97 +00:03:54,510 --> 00:03:57,990 +the dataset contains both +our inputs and our labels. + +98 +00:03:57,990 --> 00:04:00,420 +So we don't need to +specify separate labels, + +99 +00:04:00,420 --> 00:04:01,570 +when we're calling fit. + +100 +00:04:02,490 --> 00:04:05,340 +Then we do the same thing +with the validation_data. + +101 +00:04:05,340 --> 00:04:08,190 +And then we can if we want, +we can specify details, + +102 +00:04:08,190 --> 00:04:09,900 +like the number of epochs for training + +103 +00:04:09,900 --> 00:04:12,420 +where there's some other +arguments you can pass to fit. + +104 +00:04:12,420 --> 00:04:15,240 +But in the end, you just +pass all of this to model.fit + +105 +00:04:15,240 --> 00:04:16,440 +and you let it run. + +106 +00:04:16,440 --> 00:04:17,520 +If everything works out, + +107 +00:04:17,520 --> 00:04:19,320 +you should see a little training bar + +108 +00:04:19,320 --> 00:04:21,300 +progressing along as your loss goes down. + +109 +00:04:21,300 --> 00:04:22,290 +And that's it. + +110 +00:04:22,290 --> 00:04:23,123 +While that's running, + +111 +00:04:23,123 --> 00:04:25,380 +you know, you can call +your boss and tell them + +112 +00:04:25,380 --> 00:04:27,810 +you're a senior NLP machine +learning engineer now + +113 +00:04:27,810 --> 00:04:30,900 +and you're gonna want a +salary review next quarter. + +114 +00:04:30,900 --> 00:04:32,880 +These few lines of code +are really all it takes + +115 +00:04:32,880 --> 00:04:34,500 +to apply the power of a massive + +116 +00:04:34,500 --> 00:04:36,510 +pre-trained language problem, + +117 +00:04:36,510 --> 00:04:38,250 +massive pre-trained +language model, excuse me, + +118 +00:04:38,250 --> 00:04:40,080 +to your NLP problem. + +119 +00:04:40,080 --> 00:04:42,150 +But could we do better than this? + +120 +00:04:42,150 --> 00:04:43,920 +I mean, we certainly could. + +121 +00:04:43,920 --> 00:04:45,720 +With a few more advanced Keras features + +122 +00:04:45,720 --> 00:04:47,730 +like a tuned, scheduled learning rate, + +123 +00:04:47,730 --> 00:04:49,290 +we can get an even lower loss + +124 +00:04:49,290 --> 00:04:51,990 +and an even more accurate, +more useful model. + +125 +00:04:51,990 --> 00:04:54,120 +And what do we do with our +model after we train it? 
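Before moving on, here is a compact sketch of the fit() call described above. The dataset names are assumed placeholders for the tf.data datasets built in the datasets videos, with labels already included in each element, and the number of epochs is just an example value.

model.fit(
    tf_train_dataset,                       # inputs and labels together, so no separate labels argument
    validation_data=tf_validation_dataset,
    epochs=3,
)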
+ +126 +00:04:54,120 --> 00:04:55,950 +So all of this is going to +be covered in the videos + +127 +00:04:55,950 --> 00:04:57,963 +that are coming up, so stay tuned. + +128 +00:04:59,220 --> 00:05:01,887 +(air whooshing) + diff --git a/subtitles/en/27_learning-rate-scheduling-with-tensorflow.srt b/subtitles/en/27_learning-rate-scheduling-with-tensorflow.srt index e805ae6dd..4a5688bea 100644 --- a/subtitles/en/27_learning-rate-scheduling-with-tensorflow.srt +++ b/subtitles/en/27_learning-rate-scheduling-with-tensorflow.srt @@ -1,179 +1,468 @@ -1 -00:00:05,120 --> 00:00:11,440 -In our other videos we talked about the basics  -of fine-tuning a language model with Tensorflow   - -2 -00:00:11,440 --> 00:00:18,000 -(and as always, when I refer to videos I'll link  -them below). Still, can we do better? So here's   - -3 -00:00:18,000 --> 00:00:23,040 -the code from our model fine-tuning video, and  -while it works, we could definitely tweak a couple   - -4 -00:00:23,040 --> 00:00:29,040 -of things. By far the most important thing is the  -learning rate. In this video we'll talk about how   - -5 -00:00:29,040 --> 00:00:34,800 -to change it, which will make your training  -much more consistently successful. In fact,   - -6 -00:00:36,080 --> 00:00:42,880 -there are two things we want to change about the  -default learning rate for Adam. The first is that   - -7 -00:00:42,880 --> 00:00:51,520 -it's way too high for our models - by default  -Adam uses a learning rate of 10^-3 1 e minus 3,   - -8 -00:00:51,520 --> 00:00:59,600 -which is very high for training Transformers.  -We're going to start at 5 by 10^-5 5 e minus 5,   - -9 -00:00:59,600 --> 00:01:05,520 -which is 20 times lower than the default. And  -secondly, we don't just want a constant learning   - -10 -00:01:05,520 --> 00:01:10,960 -rate - we can get even better performance if we  -'decay' the learning rate down to a tiny value,   - -11 -00:01:10,960 --> 00:01:17,760 -or even 0, over the course of training. That's  -what this PolynomialDecay schedule thing is doing.   - -12 -00:01:19,200 --> 00:01:20,880 -That name might be intimidating, especially  -if you only vaguely remember what a polynomial   - -13 -00:01:21,600 --> 00:01:25,120 -is from maths class. However, all we need to  -do is tell it how long training is going to be,   - -14 -00:01:25,120 --> 00:01:29,040 -so it decays at the right speed -  -that's what this code here is doing.   - -15 -00:01:30,080 --> 00:01:35,280 -We're computing how many minibatches the model  -is going to see over its entire training run,   - -16 -00:01:35,280 --> 00:01:37,640 -which is the size of the training set, divided  -by the batch_size to get the number of batches   - -17 -00:01:37,640 --> 00:01:42,080 -per epoch, and then multiplied by the  -number of epochs to get the total number   - -18 -00:01:42,080 --> 00:01:47,680 -of batches across the whole training run. Once  -we know how many training steps we're taking,   - -19 -00:01:47,680 --> 00:01:51,360 -we just pass all that information to  -the scheduler and we're ready to go.   - -20 -00:01:54,000 --> 00:01:57,360 -What does the polynomial decay schedule look  -like? With default options, it's actually just a   - -21 -00:01:57,360 --> 00:02:04,720 -linear schedule, so it looks like this - it starts  -at 5e-5, which means 5 times ten to the minus 5,   - -22 -00:02:05,280 --> 00:02:11,120 -and then decays down at a constant rate until  -it hits zero right at the very end of training.   
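A sketch of the schedule setup described above. The batch size, epoch count and the `train_dataset` name are assumptions for illustration; the 5e-5 starting point and the decay to zero are the values discussed here.

from tensorflow.keras.optimizers.schedules import PolynomialDecay

batch_size = 8
num_epochs = 3
# Batches per epoch (training set size divided by batch size) times the number of epochs
# gives the total number of training steps the schedule has to cover.
num_train_steps = (len(train_dataset) // batch_size) * num_epochs

lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5,  # start 20 times lower than Adam's default
    end_learning_rate=0.0,       # decay all the way down to zero
    decay_steps=num_train_steps,
)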
- -23 -00:02:11,120 --> 00:02:33,920 -So why do they call it polynomial and not  -linear? Because if you tweak the options,   - -24 -00:02:36,000 --> 00:02:49,840 -you can get a higher-order decay schedule, but  -there's no need to do that right now. Now, how   - -25 -00:02:49,840 --> 00:02:56,400 -do we use our learning rate schedule? Easy,  -we just pass it to Adam! You'll notice the   - -26 -00:02:56,400 --> 00:03:00,480 -first time when we compiled the model,  -we just passed it the string "adam".   - -27 -00:03:02,320 --> 00:03:07,760 -Keras recognizes the names of common optimizers  -and loss functions if you pass them as strings,   - -28 -00:03:07,760 --> 00:03:12,320 -so it saves time to do that if you only want  -the default settings. But we're professional   - -29 -00:03:12,320 --> 00:03:19,600 -machine learners now, with our very own learning  -rate schedule, so we have to do things properly.   - -30 -00:03:19,600 --> 00:03:26,080 -So first we import the optimizer, then we  -initialize it with our scheduler, and then   - -31 -00:03:29,200 --> 00:03:34,720 -we compile the model using the new optimizer,  -and whatever loss function you want - this will   - -32 -00:03:34,720 --> 00:03:39,040 -be sparse categorical crossentropy if you're  -following along from the fine-tuning video.   - -33 -00:03:39,680 --> 00:03:47,120 -And now we have a high-performance model, ready to  -go. All that remains is to fit the model just like   - -34 -00:03:47,120 --> 00:03:53,280 -we did before! Remember, because we compiled the  -model with the new optimizer with the new learning   - -35 -00:03:53,280 --> 00:03:58,800 -rate schedule, we don't need to change anything  -here. We just call fit again, with exactly the   - -36 -00:03:58,800 --> 00:04:04,320 -same command as before, but now we get beautiful  -training with a nice, smooth learning rate decay. +1 +00:00:00,288 --> 00:00:02,639 +(screen swishing) + +2 +00:00:02,639 --> 00:00:05,190 +(text swishing) + +3 +00:00:05,190 --> 00:00:06,780 +In our other videos, + +4 +00:00:06,780 --> 00:00:08,280 +we talked about the basics + +5 +00:00:08,280 --> 00:00:11,610 +of fine-tuning a language +model with Tensorflow, + +6 +00:00:11,610 --> 00:00:15,030 +and as always, when I refer to +videos I'll link them below. + +7 +00:00:15,030 --> 00:00:17,610 +Still, can we do better? + +8 +00:00:17,610 --> 00:00:20,700 +So here's the code from our +model fine-tuning video, + +9 +00:00:20,700 --> 00:00:21,600 +and while it works, + +10 +00:00:21,600 --> 00:00:24,390 +we could definitely +tweak a couple of things. + +11 +00:00:24,390 --> 00:00:27,540 +By far the most important +thing is the learning rate. + +12 +00:00:27,540 --> 00:00:29,940 +In this video we'll talk +about how to change it, + +13 +00:00:29,940 --> 00:00:31,080 +which will make your training + +14 +00:00:31,080 --> 00:00:33,303 +much more consistently successful. + +15 +00:00:34,440 --> 00:00:37,320 +In fact, really there are two things + +16 +00:00:37,320 --> 00:00:40,530 +we want to change about the +default learning rate for Adam. + +17 +00:00:40,530 --> 00:00:42,720 +So the first we want to change + +18 +00:00:42,720 --> 00:00:45,630 +is that it's way too high for our models, + +19 +00:00:45,630 --> 00:00:48,030 +by default, Adam uses a learning rate + +20 +00:00:48,030 --> 00:00:51,540 +of 10 to the minus 3, 1 E minus 3, + +21 +00:00:51,540 --> 00:00:54,660 +and that's very high for +training transformer models. 
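For reference, the two learning rates being contrasted here look like this in code (a sketch using the standard Keras Adam class):

from tensorflow.keras.optimizers import Adam

default_optimizer = Adam()                     # learning_rate defaults to 1e-3, too high here
lower_lr_optimizer = Adam(learning_rate=5e-5)  # the 20 times lower value used in these videos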
+ +22 +00:00:54,660 --> 00:00:58,200 +We're going to start at +5 by 10 to the minus 5, + +23 +00:00:58,200 --> 00:01:02,700 +5 E minus 5, which is 20 +times lower than the default. + +24 +00:01:02,700 --> 00:01:06,330 +And secondly, we don't just +want a constant learning rate, + +25 +00:01:06,330 --> 00:01:07,950 +we can get even better performance + +26 +00:01:07,950 --> 00:01:11,160 +if we decay the learning +rate down to a tiny value, + +27 +00:01:11,160 --> 00:01:13,920 +or even to zero , over +the course of training. + +28 +00:01:13,920 --> 00:01:15,510 +So that's what this thing here, + +29 +00:01:15,510 --> 00:01:18,540 +this Polynomial Decay +schedule thing is doing. + +30 +00:01:18,540 --> 00:01:21,570 +So I'll show you what that +decay looks like in a second, + +31 +00:01:21,570 --> 00:01:23,160 +but first we need to tell the scheduler + +32 +00:01:23,160 --> 00:01:25,290 +how long training is going to be, + +33 +00:01:25,290 --> 00:01:27,450 +so that it decays at the right speed, + +34 +00:01:27,450 --> 00:01:29,450 +and that's what this code here is doing. + +35 +00:01:30,300 --> 00:01:32,280 +We're computing how many minibatches + +36 +00:01:32,280 --> 00:01:35,520 +the model is going to see +over its entire training run, + +37 +00:01:35,520 --> 00:01:37,950 +which is the size of the training set, + +38 +00:01:37,950 --> 00:01:39,570 +and then we multiply that + +39 +00:01:39,570 --> 00:01:41,220 +by the number of epochs + +40 +00:01:41,220 --> 00:01:42,930 +to get the total number of batches + +41 +00:01:42,930 --> 00:01:45,060 +across the whole training run. + +42 +00:01:45,060 --> 00:01:47,880 +Once we know how many +training steps we're taking, + +43 +00:01:47,880 --> 00:01:50,580 +we just pass all that +information to the scheduler + +44 +00:01:50,580 --> 00:01:51,783 +and we're ready to go. + +45 +00:01:53,110 --> 00:01:57,510 +What does the polynomial +decay schedule look like? + +46 +00:01:57,510 --> 00:01:59,610 +Well, it looks like this, + +47 +00:01:59,610 --> 00:02:02,160 +it starts at 5 E minus 5, + +48 +00:02:02,160 --> 00:02:05,490 +which means 5 times 10 to the minus 5, + +49 +00:02:05,490 --> 00:02:08,190 +and then decays down at a constant rate + +50 +00:02:08,190 --> 00:02:11,310 +until it hits zero right at +the very end of training. 
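Since a Keras learning rate schedule is callable on a step number, the two endpoints described here can be checked directly; this assumes the `lr_scheduler` and `num_train_steps` built earlier.

print(float(lr_scheduler(0)))                # 5e-05 at the very first step
print(float(lr_scheduler(num_train_steps)))  # 0.0 right at the end of training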
+ +51 +00:02:11,310 --> 00:02:13,200 +So hold on, I can already hear you + +52 +00:02:13,200 --> 00:02:14,640 +yelling at your monitor, though, + +53 +00:02:14,640 --> 00:02:16,020 +and yes, I know, + +54 +00:02:16,020 --> 00:02:18,690 +this is actually constant +or a linear decay, + +55 +00:02:18,690 --> 00:02:20,310 +and I know the name is polynomial, + +56 +00:02:20,310 --> 00:02:21,870 +and you're feeling cheated that, you know, + +57 +00:02:21,870 --> 00:02:24,390 +you were promised a polynomial +and haven't gotten it, + +58 +00:02:24,390 --> 00:02:26,550 +so calm down though, it's okay, + +59 +00:02:26,550 --> 00:02:28,830 +because, of course, +linear functions are just + +60 +00:02:28,830 --> 00:02:30,480 +first-order special cases + +61 +00:02:30,480 --> 00:02:32,850 +of the general polynomial functions, + +62 +00:02:32,850 --> 00:02:36,180 +and if you tweak the +options to this class, + +63 +00:02:36,180 --> 00:02:38,130 +you can get a truly polynomial, + +64 +00:02:38,130 --> 00:02:40,170 +a higher-order decay schedule, + +65 +00:02:40,170 --> 00:02:43,140 +but this linear schedule will +work fine for us for now, + +66 +00:02:43,140 --> 00:02:45,210 +we don't actually need all those + +67 +00:02:45,210 --> 00:02:47,610 +fancy tweaks and fancy gadgets. + +68 +00:02:47,610 --> 00:02:49,770 +So coming back, + +69 +00:02:49,770 --> 00:02:51,990 +how do we actually use +this learning rate schedule + +70 +00:02:51,990 --> 00:02:53,460 +once we've created it? + +71 +00:02:53,460 --> 00:02:55,650 +So it's simple, we just pass it to Adam. + +72 +00:02:55,650 --> 00:02:58,560 +So the first time we compiled the model, + +73 +00:02:58,560 --> 00:03:00,840 +we just passed the string Adam, + +74 +00:03:00,840 --> 00:03:02,250 +to get our optimizer. + +75 +00:03:02,250 --> 00:03:05,340 +So Keras recognizes the +names of common optimizers + +76 +00:03:05,340 --> 00:03:07,920 +and loss functions if +you pass them as strings, + +77 +00:03:07,920 --> 00:03:09,480 +so it saves time to do that + +78 +00:03:09,480 --> 00:03:11,460 +if you only want the default settings. + +79 +00:03:11,460 --> 00:03:13,320 +But now we're professional +machine learners, + +80 +00:03:13,320 --> 00:03:15,720 +and, you know, that +salary review is upcoming, + +81 +00:03:15,720 --> 00:03:17,790 +so we've got our very own +learning rate schedule, + +82 +00:03:17,790 --> 00:03:19,770 +and we're gonna do things properly. + +83 +00:03:19,770 --> 00:03:22,830 +So the first we do is +we import the optimizer, + +84 +00:03:22,830 --> 00:03:24,960 +and then we initialize +it with a scheduler, + +85 +00:03:24,960 --> 00:03:27,540 +which is getting passed to +to the learning rate argument + +86 +00:03:27,540 --> 00:03:29,100 +of that optimizer. + +87 +00:03:29,100 --> 00:03:32,190 +And now we compile the model +with this new optimizer, + +88 +00:03:32,190 --> 00:03:34,140 +and again, whatever +loss function you want, + +89 +00:03:34,140 --> 00:03:37,050 +so this is going to be sparse +categorical crossentropy + +90 +00:03:37,050 --> 00:03:39,840 +if you're following along +from the fine-tuning video. + +91 +00:03:39,840 --> 00:03:41,370 +And then, we're we're ready to go, + +92 +00:03:41,370 --> 00:03:43,710 +now we have a high-performance model, + +93 +00:03:43,710 --> 00:03:44,970 +and ready for training. + +94 +00:03:44,970 --> 00:03:46,830 +All that remains is to fit the model + +95 +00:03:46,830 --> 00:03:48,363 +just like we did before. 
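A sketch of the steps just listed, assuming `model` and `lr_scheduler` come from the earlier lines:

import tensorflow as tf
from tensorflow.keras.optimizers import Adam

optimizer = Adam(learning_rate=lr_scheduler)  # the schedule goes into the learning rate argument
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)
# model.fit(...) can then be called exactly as before.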
+ +96 +00:03:49,350 --> 00:03:51,600 +Remember, because we compiled the model + +97 +00:03:51,600 --> 00:03:54,300 +with the new optimizer and the +new learning rate schedule, + +98 +00:03:54,300 --> 00:03:56,190 +we actually don't need +to change anything at all + +99 +00:03:56,190 --> 00:03:57,360 +when we call fit, + +100 +00:03:57,360 --> 00:03:58,290 +we just call it again, + +101 +00:03:58,290 --> 00:04:00,540 +with exactly the same command as before, + +102 +00:04:00,540 --> 00:04:02,400 +but now we get a beautiful training, + +103 +00:04:02,400 --> 00:04:04,740 +with a nice, smooth learning rate decay, + +104 +00:04:04,740 --> 00:04:06,330 +starting from a good value, + +105 +00:04:06,330 --> 00:04:07,713 +and decaying down to zero. + +106 +00:04:08,867 --> 00:04:12,059 +(screen swishing) + +107 +00:04:12,059 --> 00:04:13,395 +(screen swishing) + diff --git a/subtitles/en/28_tensorflow-predictions-and-metrics.srt b/subtitles/en/28_tensorflow-predictions-and-metrics.srt index ac82f1437..24f75a7d7 100644 --- a/subtitles/en/28_tensorflow-predictions-and-metrics.srt +++ b/subtitles/en/28_tensorflow-predictions-and-metrics.srt @@ -1,194 +1,461 @@ -1 -00:00:05,600 --> 00:00:10,080 -In our other videos, and as always, there'll  -be links below if you want to check those out,   - -2 -00:00:10,640 --> 00:00:15,600 -we showed you how to initialize and  -fine-tune a transformer model in TensorFlow,   - -3 -00:00:15,600 --> 00:00:20,800 -so the question now is: What can we do with a  -model after we train it? The obvious thing to   - -4 -00:00:20,800 --> 00:00:26,080 -try is to use it to get predictions for new  -data, so let's see how to do that. Again,   - -5 -00:00:26,080 --> 00:00:31,120 -if you're familiar with Keras, the good news is  -that because there are just standard Keras models,   - -6 -00:00:31,680 --> 00:00:35,440 -we can use the standard Keras  -predict() method, as shown here.   - -7 -00:00:36,800 --> 00:00:42,800 -You simply pass in tokenized text to this method,  -like you'd get from a tokenizer, and you get your   - -8 -00:00:42,800 --> 00:00:48,320 -results. Our models can output several different  -things, depending on the options you set,   - -9 -00:00:48,320 --> 00:00:53,280 -but most of the time the thing you want is the  -output logits. If you haven’t come across them   - -10 -00:00:53,280 --> 00:01:02,960 -before, logits are the outputs of the last layer  -of the network, before a softmax has been applied.   - -11 -00:01:02,960 --> 00:01:08,400 -So if you want to turn the logits into the model’s  -probability outputs, you just apply a softmax,   - -12 -00:01:08,400 --> 00:01:13,840 -like so. What if we want to turn those  -probabilities into class predictions?   - -13 -00:01:14,853 --> 00:01:20,960 -Simple, we just pick the biggest probability for  -each output! The easiest way to do that is with   - -14 -00:01:20,960 --> 00:01:26,960 -the argmax function. Argmax will return the  -index of the largest probability in each row,   - -15 -00:01:26,960 --> 00:01:36,400 -which means in this case that we’ll  -get a vector of 0 and 1 values.   - -16 -00:01:37,360 --> 00:01:45,440 -Those are our class predictions! In fact, if  -class predictions are all you want, you can skip   - -17 -00:01:45,440 --> 00:01:50,240 -the softmax step entirely, because the largest  -logit will always be the largest probability   - -18 -00:01:52,400 --> 00:01:56,800 -too. 
If probabilities and class predictions are  -all you want, then you’ve seen everything you   - -19 -00:01:56,800 --> 00:02:02,000 -need at this point! But if you’re interested in  -benchmarking your model or using it for research,   - -20 -00:02:02,000 --> 00:02:06,320 -you might want to delve deeper into the results  -you get. And one way to do that is to compute   - -21 -00:02:06,320 --> 00:02:10,880 -some metrics for the model’s predictions. If  -you're following along with our datasets and   - -22 -00:02:10,880 --> 00:02:16,400 -fine-tuning videos, we got our data from the MRPC  -dataset, which is part of the GLUE benchmark.   - -23 -00:02:16,960 --> 00:02:24,480 -Each of the GLUE datasets, as well as many of  -our other datasets, has some predefined metrics,   - -24 -00:02:24,480 --> 00:02:31,520 -and we can load them easily with the datasets  -load_metric() function. For the MRPC dataset,   - -25 -00:02:31,520 --> 00:02:36,080 -the built-in metrics are accuracy, which just  -measures the percentage of the time the model’s   - -26 -00:02:36,080 --> 00:02:42,160 -prediction was correct, and the F1 score, which is  -a slightly more complex measure that measures how   - -27 -00:02:42,160 --> 00:02:48,960 -well the model trades off precision and recall.  -To compute those metrics to benchmark our model,   - -28 -00:02:48,960 --> 00:02:54,000 -we just pass them the model’s predictions, and  -the ground truth labels, and we get our results.   - -29 -00:02:56,720 --> 00:03:01,120 -If you’re familiar with Keras, though, you’ll  -notice that this is a weird way to compute   - -30 -00:03:01,120 --> 00:03:06,880 -metrics - we’re only computing metrics at the end  -of training, but Keras has the built-in ability to   - -31 -00:03:06,880 --> 00:03:14,960 -compute a wide range of metrics on the fly while  -you're training. If you want to use built-in   - -32 -00:03:14,960 --> 00:03:21,920 -metric computations, it's very straightforward -  -you just pass a 'metric' argument to compile().   - -33 -00:03:22,960 --> 00:03:28,240 -As with things like loss and optimizer, you  -can specify the metrics you want by string,   - -34 -00:03:28,240 --> 00:03:33,520 -or you can import the actual metric objects if you  -want to pass specific arguments to them, but note   - -35 -00:03:33,520 --> 00:03:40,880 -that unlike loss and accuracy, you have to supply  -a list of metrics, even if you only have one. Once   - -36 -00:03:40,880 --> 00:03:46,320 -a model has been compiled with a metric, it will  -report that metric for training, validation and   - -37 -00:03:49,840 --> 00:03:54,880 -predictions. You can even write your own Metric  -classes. Though this is a bit beyond the scope   - -38 -00:03:54,880 --> 00:03:59,440 -of this course, I'll link to the relevant  -TF docs below because it can be very handy   - -39 -00:03:59,440 --> 00:04:10,800 -if you want a metric that isn't supported  -by default in Keras, such as the F1 score. +1 +00:00:00,269 --> 00:00:02,936 +(air whooshing) + +2 +00:00:05,700 --> 00:00:07,110 +- In our other videos, + +3 +00:00:07,110 --> 00:00:09,000 +and as always, there'll be links below + +4 +00:00:09,000 --> 00:00:10,740 +if you want to check those out, + +5 +00:00:10,740 --> 00:00:13,230 +we showed you how to +initialize and fine-tune + +6 +00:00:13,230 --> 00:00:15,690 +a transformer model in TensorFlow. + +7 +00:00:15,690 --> 00:00:18,600 +So the question now is, +what can we do with a model + +8 +00:00:18,600 --> 00:00:20,070 +after we train it? 
+ +9 +00:00:20,070 --> 00:00:21,390 +The obvious thing to try + +10 +00:00:21,390 --> 00:00:23,790 +is to use it to get +predictions for new data, + +11 +00:00:23,790 --> 00:00:25,560 +so let's see how to do that. + +12 +00:00:25,560 --> 00:00:28,320 +Again, if you're familiar +with Keras, the good news + +13 +00:00:28,320 --> 00:00:31,860 +is that because there are +just standard Keras models, + +14 +00:00:31,860 --> 00:00:34,770 +we can use the standard +Keras predict method, + +15 +00:00:34,770 --> 00:00:35,883 +as shown here. + +16 +00:00:36,990 --> 00:00:40,560 +You simply pass in tokenized +text to this method, + +17 +00:00:40,560 --> 00:00:42,330 +like you'd get from a tokenizer, + +18 +00:00:42,330 --> 00:00:44,280 +and you get your results. + +19 +00:00:44,280 --> 00:00:46,740 +Our models can output +several different things, + +20 +00:00:46,740 --> 00:00:48,510 +depending on the options you set, + +21 +00:00:48,510 --> 00:00:50,310 +but most of the time the thing you want + +22 +00:00:50,310 --> 00:00:52,290 +is the output logits. + +23 +00:00:52,290 --> 00:00:54,900 +If you haven't come +across them before logits, + +24 +00:00:54,900 --> 00:00:57,630 +sometimes pronounced to +logits because no one's sure, + +25 +00:00:57,630 --> 00:01:00,390 +are the outputs of the +last layer of the network + +26 +00:01:00,390 --> 00:01:03,150 +because before a softmax has been applied. + +27 +00:01:03,150 --> 00:01:04,710 +So if you want to turn the logits + +28 +00:01:04,710 --> 00:01:06,900 +into the model's probability outputs, + +29 +00:01:06,900 --> 00:01:09,423 +you just apply a softmax, like so. + +30 +00:01:10,981 --> 00:01:12,630 +What if we want to turn +those probabilities + +31 +00:01:12,630 --> 00:01:14,370 +into class predictions? + +32 +00:01:14,370 --> 00:01:16,410 +Again, it's very straightforward. + +33 +00:01:16,410 --> 00:01:19,470 +We just pick the biggest +probability for each output + +34 +00:01:19,470 --> 00:01:23,070 +and you can get that immediately +with the argmax function. + +35 +00:01:23,070 --> 00:01:24,870 +argmax will return the index + +36 +00:01:24,870 --> 00:01:27,120 +of the largest probability in each row + +37 +00:01:27,120 --> 00:01:30,360 +which means that we'll +get a vector of integers. + +38 +00:01:30,360 --> 00:01:34,950 +So zero if the largest probability +was in the zero position, + +39 +00:01:34,950 --> 00:01:37,350 +one in the first position, and so on. + +40 +00:01:37,350 --> 00:01:40,380 +So these are our class +predictions indicating class zero, + +41 +00:01:40,380 --> 00:01:42,300 +class one, and so on. + +42 +00:01:42,300 --> 00:01:45,090 +In fact, if class +predictions are all you want, + +43 +00:01:45,090 --> 00:01:47,310 +you can skip the softmax step entirely + +44 +00:01:47,310 --> 00:01:49,740 +because the largest logit +will always be the largest + +45 +00:01:49,740 --> 00:01:51,303 +probability as well. + +46 +00:01:52,500 --> 00:01:55,800 +So if probabilities and class +predictions are all you want, + +47 +00:01:55,800 --> 00:01:58,350 +then you've seen everything +you need at this point. + +48 +00:01:58,350 --> 00:02:00,630 +But if you're interested +in benchmarking your model + +49 +00:02:00,630 --> 00:02:02,190 +or using it for research, + +50 +00:02:02,190 --> 00:02:05,010 +you might want to delve deeper +into the results you get. + +51 +00:02:05,010 --> 00:02:07,230 +And one way to do that is +to compute some metrics + +52 +00:02:07,230 --> 00:02:09,060 +for the model's predictions. 
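A sketch of the predict-then-postprocess flow described above. `tokenized_inputs` is an assumed placeholder for tokenizer output (for example dict(tokenizer(texts, padding=True, return_tensors="np"))), and `model` is the fine-tuned model.

import tensorflow as tf

outputs = model.predict(tokenized_inputs)
logits = outputs["logits"]                       # raw pre-softmax values
probabilities = tf.nn.softmax(logits, axis=-1)   # per-class probabilities
class_predictions = tf.argmax(logits, axis=-1)   # integer class ids; works on logits directly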
+ +53 +00:02:09,060 --> 00:02:10,920 +If you're following +along with our datasets + +54 +00:02:10,920 --> 00:02:12,390 +and fine tuning videos, + +55 +00:02:12,390 --> 00:02:14,850 +we got our data from the MRPC dataset, + +56 +00:02:14,850 --> 00:02:17,130 +which is part of the GLUE benchmark. + +57 +00:02:17,130 --> 00:02:19,050 +Each of the GLUE datasets + +58 +00:02:19,050 --> 00:02:22,560 +as well as many other datasets +in our dataset, Light Hub + +59 +00:02:22,560 --> 00:02:24,510 +has some predefined metrics, + +60 +00:02:24,510 --> 00:02:26,940 +and we can load them easily + +61 +00:02:26,940 --> 00:02:29,880 +with the datasets load metric function. + +62 +00:02:29,880 --> 00:02:33,720 +For the MRPC dataset, the +built-in metrics are accuracy + +63 +00:02:33,720 --> 00:02:35,790 +which just measures the +percentage of the time + +64 +00:02:35,790 --> 00:02:37,830 +the model's prediction was correct, + +65 +00:02:37,830 --> 00:02:39,780 +and the F1 score, + +66 +00:02:39,780 --> 00:02:41,610 +which is a slightly more complex measure + +67 +00:02:41,610 --> 00:02:43,920 +that measures how well +the model trades off + +68 +00:02:43,920 --> 00:02:45,543 +precision and recall. + +69 +00:02:46,470 --> 00:02:49,110 +To compute those metrics +to benchmark our model, + +70 +00:02:49,110 --> 00:02:51,480 +we just pass them the model's predictions, + +71 +00:02:51,480 --> 00:02:53,220 +and to the ground truth labels, + +72 +00:02:53,220 --> 00:02:56,880 +and we get our results in a +straightforward Python dict. + +73 +00:02:56,880 --> 00:02:58,740 +If you're familiar with Keras though, + +74 +00:02:58,740 --> 00:03:00,870 +you might notice that this +is a slightly weird way + +75 +00:03:00,870 --> 00:03:01,800 +to compute metrics, + +76 +00:03:01,800 --> 00:03:02,970 +because we're only computing metrics + +77 +00:03:02,970 --> 00:03:04,440 +at the very end of training. + +78 +00:03:04,440 --> 00:03:06,480 +But in Keras, you have +this built-in ability + +79 +00:03:06,480 --> 00:03:08,790 +to compute a wide range of metrics + +80 +00:03:08,790 --> 00:03:10,470 +on the fly while you're training, + +81 +00:03:10,470 --> 00:03:11,910 +which gives you a very useful insight + +82 +00:03:11,910 --> 00:03:13,740 +into how training is going. + +83 +00:03:13,740 --> 00:03:15,900 +So if you want to use built-in metrics, + +84 +00:03:15,900 --> 00:03:17,280 +it's very straightforward + +85 +00:03:17,280 --> 00:03:19,350 +and you use the standard +Keras approach again. + +86 +00:03:19,350 --> 00:03:23,160 +You just pass a metric +argument to the compile method. + +87 +00:03:23,160 --> 00:03:25,740 +As with things like loss and optimizer, + +88 +00:03:25,740 --> 00:03:28,470 +you can specify the +metrics you want by string + +89 +00:03:28,470 --> 00:03:30,810 +or you can import the +actual metric objects + +90 +00:03:30,810 --> 00:03:33,240 +and pass specific arguments to them. + +91 +00:03:33,240 --> 00:03:35,610 +But note that unlike loss and accuracy, + +92 +00:03:35,610 --> 00:03:37,710 +you have to supply metrics as a list + +93 +00:03:37,710 --> 00:03:39,760 +even if there's only one metric you want. + +94 +00:03:40,770 --> 00:03:43,140 +Once a model has been +compiled with a metric, + +95 +00:03:43,140 --> 00:03:45,360 +it will report that metric for training, + +96 +00:03:45,360 --> 00:03:47,643 +validation, and predictions. + +97 +00:03:48,480 --> 00:03:50,820 +Assuming there are labels +passed to the predictions. 
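Both ways of computing metrics mentioned above can be sketched as follows. `class_predictions` and `labels` stand for integer predictions and ground-truth labels from the previous steps, and note that newer datasets releases move load_metric into the separate evaluate library.

import tensorflow as tf
from datasets import load_metric

# End-of-training benchmarking: MRPC's predefined metrics are accuracy and F1.
metric = load_metric("glue", "mrpc")
print(metric.compute(predictions=class_predictions, references=labels))

# On-the-fly metrics during training: pass a list to compile, even for a single metric.
model.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)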
+ +98 +00:03:50,820 --> 00:03:53,400 +You can even write your +own metric classes. + +99 +00:03:53,400 --> 00:03:55,920 +Although this is a bit beyond +the scope of this course, + +100 +00:03:55,920 --> 00:03:58,200 +I'll link to the relevant TF docs below + +101 +00:03:58,200 --> 00:03:59,580 +because it can be very handy + +102 +00:03:59,580 --> 00:04:01,320 +if you want a metric that isn't supported + +103 +00:04:01,320 --> 00:04:02,850 +by default in Keras, + +104 +00:04:02,850 --> 00:04:04,473 +such as the F1 score. + +105 +00:04:06,076 --> 00:04:08,743 +(air whooshing) + diff --git a/subtitles/en/29_write-your-training-loop-in-pytorch.srt b/subtitles/en/29_write-your-training-loop-in-pytorch.srt index dddf45af7..a517fd436 100644 --- a/subtitles/en/29_write-your-training-loop-in-pytorch.srt +++ b/subtitles/en/29_write-your-training-loop-in-pytorch.srt @@ -1,332 +1,536 @@ -1 -00:00:05,430 --> 00:00:07,240 -Write your own training loop in PyTorch. - -2 -00:00:07,240 --> 00:00:11,759 -In this video, we will look at how we can -do the same fine-tuning as in the Trainer - -3 -00:00:11,759 --> 00:00:14,120 -video, but without relying on that class. - -4 -00:00:14,120 --> 00:00:20,369 -This way you will be able to easily customize -each step of the training loop to your needs. - -5 -00:00:20,369 --> 00:00:23,859 -This is also very useful to manually debug -something that went wrong with the Trainer - -6 -00:00:23,859 --> 00:00:26,189 -API. - -7 -00:00:26,189 --> 00:00:31,200 -Before we dive into the code, here is a sketch -of a training loop: we take a batch of training - -8 -00:00:31,200 --> 00:00:33,469 -data and feed it to the model. - -9 -00:00:33,469 --> 00:00:36,600 -With the labels, we can then compute a loss. - -10 -00:00:36,600 --> 00:00:41,130 -That number is not useful on its own, but -is used to compute the gradients of our model - -11 -00:00:41,130 --> 00:00:46,750 -weights, that is the derivative of the loss -with respect to each model weight. - -12 -00:00:46,750 --> 00:00:51,920 -Those gradients are then used by the optimizer -to update the model weights and make them - -13 -00:00:51,920 --> 00:00:53,360 -a little bit better. - -14 -00:00:53,360 --> 00:00:56,170 -We then repeat the process with a new batch -of training data. - -15 -00:00:56,170 --> 00:01:00,969 -If any of this is unclear, don't hesitate -to take a refresher on your favorite deep - -16 -00:01:00,969 --> 00:01:02,240 -learning course. - -17 -00:01:02,240 --> 00:01:07,560 -We will use the GLUE MRPC dataset here again, -and we have seen how to preprocess the data - -18 -00:01:07,560 --> 00:01:10,439 -using the Datasets library with dynamic padding. - -19 -00:01:10,439 --> 00:01:15,549 -Checkout the videos linked below if you haven't -seen them already. - -20 -00:01:15,549 --> 00:01:20,060 -With this done, we only have to define PyTorch -DataLoaders, which will be responsible to - -21 -00:01:20,060 --> 00:01:24,480 -convert the elements of our dataset into batches. - -22 -00:01:24,480 --> 00:01:33,890 -We use our DataCollatorForPadding as the collate -function, and shuffle the training set. - -23 -00:01:33,890 --> 00:01:39,460 -To check that everything works as intended, -we try to grab a batch of data and inspect - -24 -00:01:39,460 --> 00:01:40,460 -it. - -25 -00:01:40,460 --> 00:01:44,790 -Like our dataset elements, it's a dictionary, -but this time the values are not a single - -26 -00:01:44,790 --> 00:01:50,460 -list of integers, but a tensor of shape batch -size by sequence length. 
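A sketch of the DataLoader setup and the sanity check described above. It assumes `tokenizer` and `tokenized_datasets` come from the preprocessing referenced here; the collator class is called DataCollatorWithPadding in Transformers, and the batch size is an example value.

from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)  # pads each batch dynamically

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)

# Grab one batch and inspect it: each value is a tensor of shape (batch_size, sequence_length).
batch = next(iter(train_dataloader))
print({k: v.shape for k, v in batch.items()})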
- -27 -00:01:50,460 --> 00:01:52,869 -The next step is to send the training data -in our model. - -28 -00:01:52,869 --> 00:01:56,790 -For that, we will need to create our model. - -29 -00:01:56,790 --> 00:02:01,240 -As seen in the model API video, we use the -from_pretrained method and adjust the number - -30 -00:02:01,240 --> 00:02:06,159 -of labels to the number of classes we have -on this dataset, here two. - -31 -00:02:06,159 --> 00:02:11,020 -Again, to be sure everything is going well, -we pass the batch we grabbed to our model - -32 -00:02:11,020 --> 00:02:12,640 -and check there is no error. - -33 -00:02:12,640 --> 00:02:17,780 -If the labels are provided, the models of -the Transformers library always return the - -34 -00:02:17,780 --> 00:02:18,840 -loss directly. - -35 -00:02:18,840 --> 00:02:24,129 -We will be able to do loss.backward() to compute -all the gradients, and will then need an optimizer - -36 -00:02:24,129 --> 00:02:26,480 -to do the training step. - -37 -00:02:26,480 --> 00:02:30,800 -We use the AdamW optimizer here, which is -a variant of Adam with proper weight decay, - -38 -00:02:30,800 --> 00:02:35,040 -but you can pick any PyTorch optimizer you -like. - -39 -00:02:35,040 --> 00:02:39,519 -Using the previous loss and computing the -gradients with loss.backward(), we check that - -40 -00:02:39,519 --> 00:02:43,510 -we can do the optimizer step without any error. - -41 -00:02:43,510 --> 00:02:47,580 -Don't forget to zero your gradient afterward, -or at the next step they will get added to - -42 -00:02:47,580 --> 00:02:49,659 -the gradients you compute! - -43 -00:02:49,659 --> 00:02:53,620 -We could already write our training loop, -but we will add two more things to make it - -44 -00:02:53,620 --> 00:02:55,590 -as good as it can be. - -45 -00:02:55,590 --> 00:03:01,150 -The first one is a learning rate scheduler, -to progressively decay our learning rate to - -46 -00:03:01,150 --> 00:03:02,150 -zero. - -47 -00:03:02,150 --> 00:03:06,180 -The get_scheduler function from the Transformers -library is just a convenience function to - -48 -00:03:06,180 --> 00:03:12,760 -easily build such a scheduler, you can again -use any PyTorch learning rate scheduler instead. - -49 -00:03:12,760 --> 00:03:17,299 -Finally, if we want our training to take a -couple of minutes instead of a few hours, - -50 -00:03:17,299 --> 00:03:19,580 -we will need to use a GPU. - -51 -00:03:19,580 --> 00:03:24,340 -The first step is to get one, for instance -by using a colab notebook. - -52 -00:03:24,340 --> 00:03:29,090 -Then you need to actually send your model -and training data on it by using a torch device. - -53 -00:03:29,090 --> 00:03:35,659 -Double-check the following lines print a CUDA -device for you! - -54 -00:03:35,659 --> 00:03:38,450 -We can now put everything together! - -55 -00:03:38,450 --> 00:03:42,470 -First we put our model in training mode (which -will activate the training behavior for some - -56 -00:03:42,470 --> 00:03:47,900 -layers like Dropout) then go through the number -of epochs we picked and all the data in our - -57 -00:03:47,900 --> 00:03:50,130 -training dataloader. - -58 -00:03:50,130 --> 00:03:54,560 -Then we go through all the steps we have seen -already: send the data to the GPU, compute - -59 -00:03:54,560 --> 00:03:57,870 -the model outputs, and in particular the loss. - -60 -00:03:57,870 --> 00:04:02,040 -Use the loss to compute gradients, then make -a training step with the optimizer. 
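Here is a sketch of the loop body described in these lines, with the scheduler update and gradient zeroing mentioned in the next lines included so the step is complete. `model`, `optimizer`, `lr_scheduler`, `train_dataloader`, `device` and `num_epochs` are assumed to come from the setup described above.

model.train()  # activate training behaviour for layers like Dropout
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}  # send the batch to the GPU
        outputs = model(**batch)   # labels are in the batch, so the loss is returned directly
        loss = outputs.loss
        loss.backward()            # compute gradients
        optimizer.step()           # training step: update the weights
        lr_scheduler.step()        # advance the learning rate schedule
        optimizer.zero_grad()      # avoid accumulating gradients into the next step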
- -61 -00:04:02,040 --> 00:04:06,760 -Update the learning rate in our scheduler -for the next iteration and zero the gradients - -62 -00:04:06,760 --> 00:04:09,340 -of the optimizer. - -63 -00:04:09,340 --> 00:04:13,590 -Once this is finished, we can evaluate our -model very easily with a metric from the Datasets - -64 -00:04:13,590 --> 00:04:14,730 -library. - -65 -00:04:14,730 --> 00:04:22,470 -First we put our model in evaluation mode, -then go through all the data in the evaluation - -66 -00:04:22,470 --> 00:04:23,900 -data loader. - -67 -00:04:23,900 --> 00:04:27,480 -As we have seen in the Trainer video, the -model outputs logits and we need to apply - -68 -00:04:27,480 --> 00:04:31,350 -the argmax function to convert them into predictions. - -69 -00:04:31,350 --> 00:04:36,910 -The metric object then has an add_batch method -we can use to send it those intermediate predictions. - -70 -00:04:36,910 --> 00:04:40,590 -Once the evaluation loop is finished, we just -have to call the compute method to get our - -71 -00:04:40,590 --> 00:04:41,620 -final results! - -72 -00:04:41,620 --> 00:04:50,760 -Congratulations, you have now fine-tuned a -model all by yourself! +1 +00:00:00,298 --> 00:00:01,511 +(air whooshing) + +2 +00:00:01,511 --> 00:00:02,769 +(smiley face popping) + +3 +00:00:02,769 --> 00:00:05,460 +(air whooshing) + +4 +00:00:05,460 --> 00:00:08,486 +- Write your own training +loop with PyTorch. + +5 +00:00:08,486 --> 00:00:09,960 +In this video, we'll look at + +6 +00:00:09,960 --> 00:00:12,750 +how we can do the same fine-tuning +as in the Trainer video, + +7 +00:00:12,750 --> 00:00:14,760 +but without relying on that class. + +8 +00:00:14,760 --> 00:00:17,790 +This way, you'll be able to +easily customize each step + +9 +00:00:17,790 --> 00:00:20,310 +to the training loop to your needs. + +10 +00:00:20,310 --> 00:00:21,660 +This is also very useful + +11 +00:00:21,660 --> 00:00:22,740 +to manually debug something + +12 +00:00:22,740 --> 00:00:24,590 +that went wrong with the Trainer API. + +13 +00:00:26,220 --> 00:00:28,020 +Before we dive into the code, + +14 +00:00:28,020 --> 00:00:30,481 +here is a sketch of a training loop. + +15 +00:00:30,481 --> 00:00:33,381 +We take a batch of training +data and feed it to the model. + +16 +00:00:34,223 --> 00:00:36,960 +With the labels, we can +then compute a loss. + +17 +00:00:36,960 --> 00:00:39,316 +That number is not useful in its own, + +18 +00:00:39,316 --> 00:00:40,260 +that is used to compute + +19 +00:00:40,260 --> 00:00:42,150 +the ingredients of our model weights, + +20 +00:00:42,150 --> 00:00:43,440 +that is the derivative of the loss + +21 +00:00:44,610 --> 00:00:47,160 +with respect to each model weight. + +22 +00:00:47,160 --> 00:00:49,800 +Those gradients are then +used by the optimizer + +23 +00:00:49,800 --> 00:00:51,210 +to update the model weights, + +24 +00:00:51,210 --> 00:00:53,550 +and make them a little bit better. + +25 +00:00:53,550 --> 00:00:54,510 +We then repeat the process + +26 +00:00:54,510 --> 00:00:56,880 +with a new batch of training data. + +27 +00:00:56,880 --> 00:00:58,620 +If any of this isn't clear, + +28 +00:00:58,620 --> 00:01:00,270 +don't hesitate to take a refresher + +29 +00:01:00,270 --> 00:01:02,170 +on your favorite deep learning course. + +30 +00:01:03,210 --> 00:01:06,000 +We'll use the GLUE MRPC +data set here again, + +31 +00:01:06,000 --> 00:01:07,680 +and we've seen how to prepropose the data + +32 +00:01:07,680 --> 00:01:11,130 +using the Datasets library +with dynamic padding. 
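For reference, the preprocessing referenced here might look roughly like this. The checkpoint name and the exact column handling are assumptions based on the referenced videos, not something this transcript spells out.

from datasets import load_dataset
from transformers import AutoTokenizer

raw_datasets = load_dataset("glue", "mrpc")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # assumed example checkpoint

def tokenize_function(examples):
    # Truncate but do not pad here; padding happens per batch (dynamic padding).
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")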
+ +33 +00:01:11,130 --> 00:01:12,630 +Check out the videos link below + +34 +00:01:12,630 --> 00:01:14,280 +if you haven't seen them already. + +35 +00:01:15,480 --> 00:01:18,930 +With this done, we only have +to define PyTorch DataLoaders + +36 +00:01:18,930 --> 00:01:20,610 +which will be responsible to convert + +37 +00:01:20,610 --> 00:01:23,253 +the elements of our dataset into patches. + +38 +00:01:24,450 --> 00:01:27,960 +We use our DataColletorForPadding +as a collate function, + +39 +00:01:27,960 --> 00:01:29,460 +and shuffle the training set + +40 +00:01:29,460 --> 00:01:31,080 +to make sure we don't go over the samples + +41 +00:01:31,080 --> 00:01:33,870 +in the same order at a epoch*. + +42 +00:01:33,870 --> 00:01:36,390 +To check that everything +works as intended, + +43 +00:01:36,390 --> 00:01:38,883 +we try to grab a batch +of data, and inspect it. + +44 +00:01:40,080 --> 00:01:43,050 +Like our data set elements, +it's a dictionary, + +45 +00:01:43,050 --> 00:01:46,260 +but these times the values are +not a single list of integers + +46 +00:01:46,260 --> 00:01:49,053 +but a tensor of shape batch +size by sequence length. + +47 +00:01:50,460 --> 00:01:53,580 +The next step is to send the +training data in our model. + +48 +00:01:53,580 --> 00:01:56,730 +For that, we'll need to +actually create a model. + +49 +00:01:56,730 --> 00:01:58,740 +As seen in the Model API video, + +50 +00:01:58,740 --> 00:02:00,540 +we use the from_pretrained method, + +51 +00:02:00,540 --> 00:02:03,270 +and adjust the number of +labels to the number of classes + +52 +00:02:03,270 --> 00:02:06,810 +we have on this data set, here two. + +53 +00:02:06,810 --> 00:02:08,940 +Again to be sure everything is going well, + +54 +00:02:08,940 --> 00:02:11,100 +we pass the batch we grabbed to our model, + +55 +00:02:11,100 --> 00:02:13,320 +and check there is no error. + +56 +00:02:13,320 --> 00:02:14,940 +If the labels are provided, + +57 +00:02:14,940 --> 00:02:16,590 +the models of the Transformers library + +58 +00:02:16,590 --> 00:02:18,273 +always returns a loss directly. + +59 +00:02:19,525 --> 00:02:21,090 +We will be able to do loss.backward() + +60 +00:02:21,090 --> 00:02:22,860 +to compute all the gradients, + +61 +00:02:22,860 --> 00:02:26,460 +and will then need an optimizer +to do the training step. + +62 +00:02:26,460 --> 00:02:28,860 +We use the AdamW optimizer here, + +63 +00:02:28,860 --> 00:02:31,440 +which is a variant of Adam +with proper weight decay, + +64 +00:02:31,440 --> 00:02:33,840 +but you can pick any +PyTorch optimizer you like. + +65 +00:02:34,830 --> 00:02:36,150 +Using the previous loss, + +66 +00:02:36,150 --> 00:02:39,060 +and computing the gradients +with loss.backward(), + +67 +00:02:39,060 --> 00:02:41,130 +we check that we can do the optimizer step + +68 +00:02:41,130 --> 00:02:42,030 +without any error. + +69 +00:02:43,380 --> 00:02:45,870 +Don't forget to zero +your gradient afterwards, + +70 +00:02:45,870 --> 00:02:46,890 +or at the next step, + +71 +00:02:46,890 --> 00:02:49,343 +they will get added to the +gradients you computed. + +72 +00:02:50,490 --> 00:02:52,080 +We could already write our training loop, + +73 +00:02:52,080 --> 00:02:53,220 +but we will add two more things + +74 +00:02:53,220 --> 00:02:55,620 +to make it as good as it can be. + +75 +00:02:55,620 --> 00:02:57,690 +The first one is a +learning rate scheduler, + +76 +00:02:57,690 --> 00:03:00,140 +to progressively decay +our learning rate to zero. 
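Before wiring in the scheduler, here is a minimal sketch of the model and optimizer setup described above. The checkpoint name and the learning rate are example assumptions, and `batch` is the batch grabbed from the training dataloader earlier.

from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
optimizer = AdamW(model.parameters(), lr=5e-5)  # Adam with proper weight decay

# Sanity check: with labels in the batch, the model returns the loss directly.
outputs = model(**batch)
print(outputs.loss)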
+

+77
+00:03:01,195 --> 00:03:04,590
+The get_scheduler function
+from the Transformers library
+
+78
+00:03:04,590 --> 00:03:06,150
+is just a convenience function
+
+79
+00:03:06,150 --> 00:03:07,800
+to easily build such a scheduler.
+
+80
+00:03:08,850 --> 00:03:09,683
+You can again use
+
+81
+00:03:09,683 --> 00:03:11,860
+any PyTorch learning
+rate scheduler instead.
+
+82
+00:03:13,110 --> 00:03:14,850
+Finally, if we want our training
+
+83
+00:03:14,850 --> 00:03:17,610
+to take a couple of minutes
+instead of a few hours,
+
+84
+00:03:17,610 --> 00:03:19,530
+we will need to use a GPU.
+
+85
+00:03:19,530 --> 00:03:21,270
+The first step is to get one,
+
+86
+00:03:21,270 --> 00:03:23,283
+for instance by using a Colab notebook.
+
+87
+00:03:24,180 --> 00:03:26,040
+Then you need to actually send your model,
+
+88
+00:03:26,040 --> 00:03:28,923
+and training data on it
+by using a torch device.
+
+89
+00:03:29,790 --> 00:03:30,840
+Double-check the following lines
+
+90
+00:03:30,840 --> 00:03:32,340
+print a CUDA device for you,
+
+91
+00:03:32,340 --> 00:03:35,640
+or be prepared for your training
+to last more than an hour.
+
+92
+00:03:35,640 --> 00:03:37,390
+We can now put everything together.
+
+93
+00:03:38,550 --> 00:03:40,860
+First, we put our model in training mode
+
+94
+00:03:40,860 --> 00:03:42,240
+which will activate the training behavior
+
+95
+00:03:42,240 --> 00:03:44,790
+for some layers, like Dropout.
+
+96
+00:03:44,790 --> 00:03:46,860
+Then go through the number
+of epochs we picked,
+
+97
+00:03:46,860 --> 00:03:50,070
+and all the data in our
+training dataloader.
+
+98
+00:03:50,070 --> 00:03:52,410
+Then we go through all the
+steps we have seen already;
+
+99
+00:03:52,410 --> 00:03:54,240
+send the data to the GPU,
+
+100
+00:03:54,240 --> 00:03:55,560
+compute the model outputs,
+
+101
+00:03:55,560 --> 00:03:57,720
+and in particular the loss.
+
+102
+00:03:57,720 --> 00:03:59,850
+Use the loss to compute gradients,
+
+103
+00:03:59,850 --> 00:04:02,880
+then make a training
+step with the optimizer.
+
+104
+00:04:02,880 --> 00:04:04,500
+Update the learning rate in our scheduler
+
+105
+00:04:04,500 --> 00:04:05,970
+for the next iteration,
+
+106
+00:04:05,970 --> 00:04:07,763
+and zero the gradients of the optimizer.
+
+107
+00:04:09,240 --> 00:04:10,500
+Once this is finished,
+
+108
+00:04:10,500 --> 00:04:12,150
+we can evaluate our model very easily
+
+109
+00:04:12,150 --> 00:04:14,283
+with a metric from the Datasets library.
+
+110
+00:04:15,180 --> 00:04:17,880
+First, we put our model
+in evaluation mode,
+
+111
+00:04:17,880 --> 00:04:20,550
+to deactivate layers like Dropout,
+
+112
+00:04:20,550 --> 00:04:23,850
+then go through all the data
+in the evaluation data loader.
+
+113
+00:04:23,850 --> 00:04:25,530
+As we have seen in the Trainer video,
+
+114
+00:04:25,530 --> 00:04:26,850
+the model outputs logits,
+
+115
+00:04:26,850 --> 00:04:28,530
+and we need to apply the argmax function
+
+116
+00:04:28,530 --> 00:04:30,213
+to convert them into predictions.
+
+117
+00:04:31,350 --> 00:04:33,420
+The metric object then
+has an add_batch method,
+
+118
+00:04:33,420 --> 00:04:36,810
+we can use to send it those
+intermediate predictions.
+
+119
+00:04:36,810 --> 00:04:38,700
+Once the evaluation loop is finished,
+
+120
+00:04:38,700 --> 00:04:40,320
+we just have to call the compute method
+
+121
+00:04:40,320 --> 00:04:42,180
+to get our final results.
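Put together, the loop described in this video looks roughly like the following sketch. It assumes the train_dataloader, model and optimizer from above, plus an eval_dataloader built the same way on the validation split; load_metric was the Datasets-library metric loader at the time of recording (it has since moved to the evaluate library).

    import torch
    from datasets import load_metric
    from transformers import get_scheduler

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)

    num_epochs = 3
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
    )

    # Training loop
    model.train()
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            outputs.loss.backward()       # gradients from the loss
            optimizer.step()              # training step
            lr_scheduler.step()           # update the learning rate
            optimizer.zero_grad()         # reset gradients for the next iteration

    # Evaluation loop
    metric = load_metric("glue", "mrpc")
    model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)   # logits -> predictions
        metric.add_batch(predictions=predictions, references=batch["labels"])
    metric.compute()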
+ +122 +00:04:42,180 --> 00:04:44,490 +Congratulations, you have +now fine-tuned a model + +123 +00:04:44,490 --> 00:04:45,633 +all by yourself. + +124 +00:04:47,253 --> 00:04:49,920 +(air whooshing) + diff --git a/subtitles/en/30_supercharge-your-pytorch-training-loop-with-accelerate.srt b/subtitles/en/30_supercharge-your-pytorch-training-loop-with-accelerate.srt index 9f8b76dc3..35913784c 100644 --- a/subtitles/en/30_supercharge-your-pytorch-training-loop-with-accelerate.srt +++ b/subtitles/en/30_supercharge-your-pytorch-training-loop-with-accelerate.srt @@ -1,173 +1,322 @@ -1 -00:00:05,360 --> 00:00:08,800 -Supercharge your Pytorch training  -loop with Hugging Face Accelerate.   - -2 -00:00:11,120 --> 00:00:17,040 -There are multiple setups on which you can run  -your training: it could be on CPU, GPUs, TPUs.   - -3 -00:00:17,680 --> 00:00:22,640 -Distributed on one machine with several  -devices, or several machines (often called   - -4 -00:00:22,640 --> 00:00:29,360 -nodes) each with multiple devices. On top of that  -there are new tweaks to make your training faster   - -5 -00:00:29,360 --> 00:00:35,680 -or more memory efficient, like mixed precision  -and DeepSpeed. Each of those setups or training   - -6 -00:00:35,680 --> 00:00:40,080 -tweaks, requires you to change the code of  -your training loop in one way or another   - -7 -00:00:40,080 --> 00:00:46,480 -and to learn a new API. All those setups are  -handled by the Trainer API, and there are several   - -8 -00:00:46,480 --> 00:00:51,440 -third-party libraries that can also help you with  -that. The problem with those is that they can feel   - -9 -00:00:51,440 --> 00:00:56,320 -like a black box and that it might not be easy to  -implement the tweak to the training loop you need.   - -10 -00:00:57,680 --> 00:01:02,000 -Accelerate has been designed specifically to let  -you retain full control over your training loop   - -11 -00:01:02,560 --> 00:01:08,000 -and be as non-intrusive as possible. With  -just four lines to add to your training loop   - -12 -00:01:08,640 --> 00:01:11,840 -(here shown on the code of the training  -loop from the "Raw training loop" video),   - -13 -00:01:12,480 --> 00:01:16,800 -Accelerate will handle all the setups and  -training tweaks mentioned on the first slide.   - -14 -00:01:18,320 --> 00:01:21,600 -It's only one API to learn and  -master instead of ten different ones.   - -15 -00:01:23,120 --> 00:01:27,120 -More specifically, you have to import  -and instantiate an accelerator object,   - -16 -00:01:27,120 --> 00:01:30,000 -that will handle all the necessary  -code for your specific setup.   - -17 -00:01:31,200 --> 00:01:36,880 -Then you have to send it the model, optimizer and  -dataloaders you are using in the prepare method,   - -18 -00:01:37,760 --> 00:01:43,600 -which is the main method to remember. Accelerate  -handles device placement, so you don't need to put   - -19 -00:01:43,600 --> 00:01:49,840 -your batch on the specific device you are using.  -Finally, you have to replace the loss.backward   - -20 -00:01:49,840 --> 00:01:54,880 -line by accelerate.backward(loss),  -and that's all you need!   - -21 -00:01:58,240 --> 00:02:00,480 -Accelerate also handles distributed evaluation.   - -22 -00:02:01,440 --> 00:02:05,280 -You can still use a classic evaluation loop  -such as the one we saw in the "Raw training   - -23 -00:02:05,280 --> 00:02:09,760 -loop" video, in which case all processes  -will each perform the full evaluation.   
- -24 -00:02:11,040 --> 00:02:15,360 -To use a distributed evaluation, you just  -have to adapt your evaluation loop like this:   - -25 -00:02:16,080 --> 00:02:19,920 -pass along the evaluation dataloader  -to the accelerator.prepare method,   - -26 -00:02:19,920 --> 00:02:25,200 -like for training. Then you can dismiss the  -line that places the batch on the proper device,   - -27 -00:02:25,920 --> 00:02:28,800 -and just before passing your  -predictions and labels to your metric,   - -28 -00:02:29,440 --> 00:02:36,160 -use accelerator.gather to gather together  -the predictions and labels from each process.   - -29 -00:02:36,160 --> 00:02:41,440 -A distributed training script has to be launched  -several times on different processes (for instance   - -30 -00:02:41,440 --> 00:02:47,360 -one per GPU you are using). You can use the  -PyTorch tools if you are familiar with them,   - -31 -00:02:48,000 --> 00:02:51,760 -but Accelerate also provides an  -easy API to configure your setup   - -32 -00:02:51,760 --> 00:02:58,000 -and launch your training script. In a terminal,  -run accelerate config and answer the small   - -33 -00:02:58,000 --> 00:03:01,680 -questionnaire to generate a configuration  -file with all the relevant information,   - -34 -00:03:03,120 --> 00:03:07,360 -then you can just run accelerate launch,  -followed by the path to your training script.   - -35 -00:03:08,400 --> 00:03:19,840 -In a notebook, you can use the notebook_launcher  -function to launch your training function. +1 +00:00:00,225 --> 00:00:02,892 +(air whooshing) + +2 +00:00:05,460 --> 00:00:07,470 +- Supercharge your PyTorch training loop + +3 +00:00:07,470 --> 00:00:08,943 +with Hugging Face Accelerate. + +4 +00:00:11,340 --> 00:00:12,600 +There are multiple setups + +5 +00:00:12,600 --> 00:00:14,580 +on which you can run your training: + +6 +00:00:14,580 --> 00:00:17,910 +it could be on CPU, GPUs, TPUs, + +7 +00:00:17,910 --> 00:00:20,610 +distributed on one machine +with several devices, + +8 +00:00:20,610 --> 00:00:23,220 +or even several machines, +often called nodes, + +9 +00:00:23,220 --> 00:00:25,173 +each with multiple devices. + +10 +00:00:26,340 --> 00:00:28,200 +On top of that, there are new tweaks + +11 +00:00:28,200 --> 00:00:30,810 +to make your training +faster or more efficient, + +12 +00:00:30,810 --> 00:00:32,763 +like mixed precision and DeepSpeed. + +13 +00:00:33,840 --> 00:00:36,600 +Each of those setups or training tweaks + +14 +00:00:36,600 --> 00:00:38,760 +requires you to change the +code of your training loop + +15 +00:00:38,760 --> 00:00:41,733 +in one way or another +and to learn a new API. + +16 +00:00:43,260 --> 00:00:45,940 +All those setups are +handled by the Trainer API, + +17 +00:00:45,940 --> 00:00:49,590 +and there are several third-party +libraries that can help. + +18 +00:00:49,590 --> 00:00:50,760 +The problem with those + +19 +00:00:50,760 --> 00:00:53,100 +is that they can feel like a black box + +20 +00:00:53,100 --> 00:00:55,320 +and that it might not be +easy to implement the tweak + +21 +00:00:55,320 --> 00:00:56,820 +to the training loop you need. + +22 +00:00:57,840 --> 00:00:59,760 +Accelerate has been designed specifically + +23 +00:00:59,760 --> 00:01:02,790 +to let you retain full control +over your training loop + +24 +00:01:02,790 --> 00:01:04,833 +and be as non-intrusive as possible. 
+

+25
+00:01:05,760 --> 00:01:08,760
+With just four lines of code
+to add to your training loop,
+
+26
+00:01:08,760 --> 00:01:11,733
+here shown on the example
+of the training loop video,
+
+27
+00:01:12,630 --> 00:01:14,730
+Accelerate will handle all the setups
+
+28
+00:01:14,730 --> 00:01:17,180
+and training tweaks
+mentioned on the first slide.
+
+29
+00:01:18,630 --> 00:01:20,400
+It's only one API to learn and master
+
+30
+00:01:20,400 --> 00:01:21,933
+instead of 10 different ones.
+
+31
+00:01:23,340 --> 00:01:25,980
+More specifically, you have
+to import and instantiate
+
+32
+00:01:25,980 --> 00:01:27,360
+an accelerator object,
+
+33
+00:01:27,360 --> 00:01:29,100
+that will handle all the necessary code
+
+34
+00:01:29,100 --> 00:01:30,300
+for your specific setup.
+
+35
+00:01:31,380 --> 00:01:33,780
+Then you have to send it the model,
+
+36
+00:01:33,780 --> 00:01:36,000
+optimizer and dataloaders you are using
+
+37
+00:01:36,000 --> 00:01:39,633
+in the prepare method, which
+is the main method to remember.
+
+38
+00:01:40,860 --> 00:01:42,870
+Accelerate handles device placement,
+
+39
+00:01:42,870 --> 00:01:44,370
+so you don't need to put your batch
+
+40
+00:01:44,370 --> 00:01:46,980
+on the specific device you are using.
+
+41
+00:01:46,980 --> 00:01:50,640
+Finally, you have to replace
+the loss.backward line
+
+42
+00:01:50,640 --> 00:01:54,300
+by accelerator.backward(loss),
+
+43
+00:01:54,300 --> 00:01:55,500
+and that's all you need!
+
+44
+00:01:58,410 --> 00:02:01,710
+Accelerate also handles
+distributed evaluation.
+
+45
+00:02:01,710 --> 00:02:04,020
+You can still use a
+classic evaluation loop
+
+46
+00:02:04,020 --> 00:02:06,750
+such as the one we saw in
+the training loop video,
+
+47
+00:02:06,750 --> 00:02:08,280
+in which case all processes
+
+48
+00:02:08,280 --> 00:02:10,083
+will each perform the full evaluation.
+
+49
+00:02:11,340 --> 00:02:13,530
+To use a distributed evaluation,
+
+50
+00:02:13,530 --> 00:02:16,380
+you just have to adapt your
+evaluation loop like this:
+
+51
+00:02:16,380 --> 00:02:17,657
+pass along the evaluation dataloader
+
+52
+00:02:17,657 --> 00:02:21,093
+to the accelerator.prepare
+method, like for training.
+
+53
+00:02:22,170 --> 00:02:23,430
+Then you can dismiss the line
+
+54
+00:02:23,430 --> 00:02:26,160
+that places the batch
+on the proper device,
+
+55
+00:02:26,160 --> 00:02:27,870
+and just before passing your predictions
+
+56
+00:02:27,870 --> 00:02:31,110
+and labels to your metric,
+use accelerator.gather
+
+57
+00:02:31,110 --> 00:02:33,300
+to gather together the predictions
+
+58
+00:02:33,300 --> 00:02:34,803
+and labels from each process.
+
+59
+00:02:36,420 --> 00:02:37,890
+A distributed training script
+
+60
+00:02:37,890 --> 00:02:41,040
+has to be launched several
+times on different processes,
+
+61
+00:02:41,040 --> 00:02:43,203
+for instance, one per GPU you are using.
+
+62
+00:02:44,070 --> 00:02:46,350
+You can use the PyTorch tools to do that
+
+63
+00:02:46,350 --> 00:02:48,210
+if you are familiar with them,
+
+64
+00:02:48,210 --> 00:02:50,520
+but Accelerate also provides an easy API
+
+65
+00:02:50,520 --> 00:02:53,523
+to configure your setup and
+launch your training script.
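In code, the additions described above amount roughly to this sketch, adapted from the training loop of the previous video; the model, optimizer, dataloaders and metric objects are assumed to exist already:

    import torch
    from accelerate import Accelerator

    accelerator = Accelerator()
    train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
        train_dataloader, eval_dataloader, model, optimizer
    )

    model.train()
    for batch in train_dataloader:
        # No manual batch.to(device): Accelerate handles device placement.
        outputs = model(**batch)
        accelerator.backward(outputs.loss)   # replaces loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Distributed evaluation: gather predictions and labels from every process
    # before sending them to the metric.
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        metric.add_batch(
            predictions=accelerator.gather(predictions),
            references=accelerator.gather(batch["labels"]),
        )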
+ +66 +00:02:54,540 --> 00:02:57,270 +In a terminal, run accelerate config + +67 +00:02:57,270 --> 00:02:58,650 +and answer the small questionnaire + +68 +00:02:58,650 --> 00:03:00,330 +to generate a configuration file + +69 +00:03:00,330 --> 00:03:02,073 +with all the relevant information, + +70 +00:03:03,240 --> 00:03:05,790 +then you can just run accelerate launch, + +71 +00:03:05,790 --> 00:03:08,580 +followed by the path to +your training script. + +72 +00:03:08,580 --> 00:03:12,000 +In a notebook, you can use +the notebook launcher function + +73 +00:03:12,000 --> 00:03:13,233 +to launch your training. + +74 +00:03:15,186 --> 00:03:17,853 +(air whooshing) + diff --git a/subtitles/en/31_navigating-the-model-hub.srt b/subtitles/en/31_navigating-the-model-hub.srt index c8e27a855..8facde57a 100644 --- a/subtitles/en/31_navigating-the-model-hub.srt +++ b/subtitles/en/31_navigating-the-model-hub.srt @@ -1,183 +1,343 @@ -1 -00:00:04,000 --> 00:00:07,760 -In this video, we're going to go over  -the HuggingFace Model Hub navigation.   - -2 -00:00:10,080 --> 00:00:16,160 -This is the huggingface.co landing page. To access  -the model hub, click on the "Models" tab in the   - -3 -00:00:16,160 --> 00:00:22,720 -upper right corner. You should be facing this web  -interface, which can be split into several parts.   - -4 -00:00:24,240 --> 00:00:28,560 -On the left, you'll find categories, which  -you can use to tailor your model search.   - -5 -00:00:29,760 --> 00:00:35,840 -The first category is the "Tasks". Models on  -the hub may be used for a wide variety of tasks.   - -6 -00:00:36,480 --> 00:00:41,440 -These include natural language processing tasks,  -such as question answering or text classification,   - -7 -00:00:41,440 --> 00:00:47,600 -but it isn't only limited to NLP. Other  -tasks from other fields are also available,   - -8 -00:00:47,600 --> 00:00:52,240 -such as image classification for computer vision,  -or automatic speech recognition for speech.   - -9 -00:00:54,720 --> 00:01:00,400 -The second category is the "libraries". Models  -on the hub usually share one of three backbones:   - -10 -00:01:01,040 --> 00:01:07,040 -PyTorch, TensorFlow, or JAX. However, other  -backbones, such as rust or ONNX also exist.   - -11 -00:01:09,440 --> 00:01:14,720 -Finally, this tab can also be used to specify  -from which high-level framework the model comes.   - -12 -00:01:15,920 --> 00:01:20,880 -This includes Transformers, but it isn't  -limited to it. The model Hub is used to host   - -13 -00:01:20,880 --> 00:01:25,840 -a lot of different frameworks' models, and we are  -actively looking to host other frameworks' models.   - -14 -00:01:28,400 --> 00:01:33,440 -The third category is the "Datasets"  -tab. Selecting a dataset from this tab   - -15 -00:01:33,440 --> 00:01:37,360 -means filtering the models so that they  -were trained on that specific dataset.   - -16 -00:01:39,040 --> 00:01:43,600 -The fourth category is the "Languages"  -tab. Selecting a language from this tab   - -17 -00:01:43,600 --> 00:01:46,800 -means filtering the models so that  -they handle the language selected.   - -18 -00:01:48,480 --> 00:01:53,840 -Finally, the last category allows to choose  -the license with which the model is shared.   - -19 -00:01:56,480 --> 00:01:59,440 -On the right, you'll find the  -models available on the model Hub!   - -20 -00:02:00,320 --> 00:02:06,400 -The models are ordered by downloads. When clicking  -on a model, you should be facing its model card.   
- -21 -00:02:07,040 --> 00:02:11,520 -The model card contains information about  -the model: its description, intended use,   - -22 -00:02:11,520 --> 00:02:18,240 -limitations and biases. It can also show code  -snippets on how to use the model, as well as   - -23 -00:02:18,240 --> 00:02:23,840 -any relevant information: training procedure,  -data processing, evaluation results, copyrights.   - -24 -00:02:25,440 --> 00:02:30,160 -This information is crucial for the model to  -be used. The better crafted a model card is,   - -25 -00:02:30,160 --> 00:02:34,000 -the easier it will be for other users to  -leverage your model in their applications.   - -26 -00:02:35,600 --> 00:02:41,440 -On the right of the model card is the inference  -API. This inference API can be used to play with   - -27 -00:02:41,440 --> 00:02:46,640 -the model directly. Feel free to modify the text  -and click on compute to see how would the model   - -28 -00:02:46,640 --> 00:02:55,200 -behave to your inputs. At the top of the screen  -lie the model tags. These include the model task,   - -29 -00:02:55,200 --> 00:02:58,640 -as well as any other tag that is relevant  -to the categories we have just seen.   - -30 -00:03:01,200 --> 00:03:05,920 -The "Files & Versions tab" displays the  -architecture of the repository of that model.   - -31 -00:03:07,120 --> 00:03:12,080 -Here, we can see all the files that define  -this model. You'll see all usual features   - -32 -00:03:12,080 --> 00:03:22,320 -of a git repository: the branches available,  -the commit history as well as the commit diff.   - -33 -00:03:25,600 --> 00:03:28,800 -Three different buttons are available  -at the top of the model card.   - -34 -00:03:29,600 --> 00:03:32,800 -The first one shows how to use the  -inference API programmatically.   - -35 -00:03:35,760 --> 00:03:38,640 -The second one shows how to  -train this model in SageMaker,   - -36 -00:03:42,720 --> 00:03:45,840 -and the last one shows how to load that  -model within the appropriate library.   - -37 -00:03:46,720 --> 00:03:54,480 -For BERT, this is transformers. +1 +00:00:00,468 --> 00:00:03,051 +(upbeat music) + +2 +00:00:04,050 --> 00:00:05,910 +- [Instructor] In this +video, we're going to go over + +3 +00:00:05,910 --> 00:00:08,013 +the HuggingFace Model Hub navigation. + +4 +00:00:10,140 --> 00:00:13,260 +This is the huggingface.co landing page. + +5 +00:00:13,260 --> 00:00:16,020 +To access the model hub, +click on the models tab + +6 +00:00:16,020 --> 00:00:17,463 +in the upper right corner. + +7 +00:00:18,960 --> 00:00:21,030 +You should be facing this web interface, + +8 +00:00:21,030 --> 00:00:23,193 +which can be split into several parts. + +9 +00:00:24,480 --> 00:00:26,790 +On the left, you'll find categories, + +10 +00:00:26,790 --> 00:00:29,090 +which you can use to +tailor your model search. + +11 +00:00:29,970 --> 00:00:32,970 +The first category is the tasks. + +12 +00:00:32,970 --> 00:00:36,660 +Models on the hub may be used +for a wide variety of tasks. + +13 +00:00:36,660 --> 00:00:39,030 +These include natural +language processing tasks, + +14 +00:00:39,030 --> 00:00:41,670 +such as question answering +or text classification, + +15 +00:00:41,670 --> 00:00:43,773 +but it isn't only limited to NLP. + +16 +00:00:44,850 --> 00:00:47,790 +Other tasks from other +fields are also available, + +17 +00:00:47,790 --> 00:00:50,340 +such as image classification +for computer vision, + +18 +00:00:50,340 --> 00:00:52,683 +or automatic speech +recognition for speech. 
+ +19 +00:00:54,840 --> 00:00:57,870 +The second category is the libraries. + +20 +00:00:57,870 --> 00:01:00,990 +Models on the hub usually +share one of three backbones, + +21 +00:01:00,990 --> 00:01:03,900 +PyTorch, TensorFlow, or JAX. + +22 +00:01:03,900 --> 00:01:07,503 +However, other backbones, such +as rust or ONNX also exist. + +23 +00:01:09,540 --> 00:01:11,850 +Finally, this tab can also be used + +24 +00:01:11,850 --> 00:01:15,123 +to specify from which high-level +framework the models comes. + +25 +00:01:16,140 --> 00:01:19,260 +This includes Transformers, +but it isn't limited to it. + +26 +00:01:19,260 --> 00:01:21,060 +The model hub is used to host + +27 +00:01:21,060 --> 00:01:22,920 +a lot of different frameworks models, + +28 +00:01:22,920 --> 00:01:24,600 +and we're actively looking to host + +29 +00:01:24,600 --> 00:01:25,893 +other frameworks models. + +30 +00:01:28,530 --> 00:01:31,890 +The third category is the datasets tab. + +31 +00:01:31,890 --> 00:01:35,070 +Selecting a dataset from this +tab means filtering the models + +32 +00:01:35,070 --> 00:01:37,683 +so that they were trained +on that specific dataset. + +33 +00:01:39,180 --> 00:01:42,300 +The fourth category is the languages tab. + +34 +00:01:42,300 --> 00:01:43,800 +Selecting a language from this tab + +35 +00:01:43,800 --> 00:01:45,990 +means filtering the +models so that they handle + +36 +00:01:45,990 --> 00:01:47,090 +the language selected. + +37 +00:01:48,600 --> 00:01:51,750 +Finally, the last category +allows to choose the license + +38 +00:01:51,750 --> 00:01:53,313 +with which the model is shared. + +39 +00:01:56,700 --> 00:01:58,770 +On the right, you'll +find the models available + +40 +00:01:58,770 --> 00:02:00,480 +on the model hub. + +41 +00:02:00,480 --> 00:02:03,750 +The models are ordered +by downloads by default. + +42 +00:02:03,750 --> 00:02:04,890 +When clicking on a model, + +43 +00:02:04,890 --> 00:02:07,230 +you should be facing its model card. + +44 +00:02:07,230 --> 00:02:09,990 +The model card contains +information about the model, + +45 +00:02:09,990 --> 00:02:13,263 +its description, intended +use, limitations and biases. + +46 +00:02:14,310 --> 00:02:17,580 +It can also show code snippets +on how to use the model, + +47 +00:02:17,580 --> 00:02:20,070 +as well as any relevant information; + +48 +00:02:20,070 --> 00:02:22,080 +training procedure, data processing, + +49 +00:02:22,080 --> 00:02:24,213 +evaluation results or copyrights. + +50 +00:02:25,710 --> 00:02:28,350 +This information is crucial +for the model to be used. + +51 +00:02:28,350 --> 00:02:30,360 +The better crafted a model card is, + +52 +00:02:30,360 --> 00:02:33,270 +the easier it will be for other +users to leverage your model + +53 +00:02:33,270 --> 00:02:34,443 +in their applications. + +54 +00:02:35,820 --> 00:02:38,553 +On the right of the model +card is the inference API. + +55 +00:02:39,540 --> 00:02:41,040 +This inference API can be used + +56 +00:02:41,040 --> 00:02:43,290 +to play with the model directly. + +57 +00:02:43,290 --> 00:02:45,690 +Feel free to modify the +text and click on compute + +58 +00:02:45,690 --> 00:02:48,140 +to see how would the model +behave to your inputs. + +59 +00:02:50,370 --> 00:02:53,013 +At the top of your screen +lies the model tags. + +60 +00:02:53,850 --> 00:02:56,550 +These include the model task, +as well as any other tag + +61 +00:02:56,550 --> 00:02:59,200 +that is relevant to the +categories we have just seen. 
+ +62 +00:03:01,320 --> 00:03:04,410 +The files & versions tab +displays the architecture + +63 +00:03:04,410 --> 00:03:06,213 +of the repository of that model. + +64 +00:03:07,230 --> 00:03:10,920 +Here, we can see all the +files that define this model. + +65 +00:03:10,920 --> 00:03:13,650 +You'll see all usual +features of a Git repository: + +66 +00:03:13,650 --> 00:03:15,093 +the branches available, + +67 +00:03:17,160 --> 00:03:18,520 +the commit history + +68 +00:03:20,760 --> 00:03:22,683 +as well as the commit diff. + +69 +00:03:25,740 --> 00:03:27,510 +Three different buttons are available + +70 +00:03:27,510 --> 00:03:29,760 +at the top of the model card. + +71 +00:03:29,760 --> 00:03:31,170 +The first one shows how to use + +72 +00:03:31,170 --> 00:03:33,093 +the inference API programmatically. + +73 +00:03:35,910 --> 00:03:38,913 +The second one shows how to +train this model in SageMaker. + +74 +00:03:42,870 --> 00:03:44,820 +And the last one shows +how to load that model + +75 +00:03:44,820 --> 00:03:46,860 +within the appropriate library. + +76 +00:03:46,860 --> 00:03:48,783 +For BERT, this is transformers. + +77 +00:03:50,208 --> 00:03:52,791 +(upbeat music) + diff --git a/subtitles/en/32_managing-a-repo-on-the-model-hub.srt b/subtitles/en/32_managing-a-repo-on-the-model-hub.srt index 481275f93..d75814af0 100644 --- a/subtitles/en/32_managing-a-repo-on-the-model-hub.srt +++ b/subtitles/en/32_managing-a-repo-on-the-model-hub.srt @@ -1,346 +1,750 @@ -1 -00:00:02,560 --> 00:00:09,130 -In this video, we're going to understand how -to manage a model repository on the HuggingFace - -2 -00:00:09,130 --> 00:00:10,920 -model hub. - -3 -00:00:10,920 --> 00:00:15,370 -In order to handle a repository, you should -first have a Hugging Face account. - -4 -00:00:15,370 --> 00:00:20,310 -A link to create a new account is available -in the description. - -5 -00:00:20,310 --> 00:00:25,279 -Once you are logged in, you can create a new -repository by clicking on the "New model" - -6 -00:00:25,279 --> 00:00:26,279 -option. - -7 -00:00:26,279 --> 00:00:28,910 -You should be facing a similar modal to the -following. - -8 -00:00:28,910 --> 00:00:34,900 -In the "Owner" input, you can put either your -own namespace or any of your organisations - -9 -00:00:34,900 --> 00:00:36,719 -namespaces. - -10 -00:00:36,719 --> 00:00:41,739 -The model name is the model identifier that -will then be used to identify your model on - -11 -00:00:41,739 --> 00:00:44,250 -your chosen namespace. - -12 -00:00:44,250 --> 00:00:47,450 -The final choice is between public and private. - -13 -00:00:47,450 --> 00:00:50,100 -Public models are accessible by anyone. - -14 -00:00:50,100 --> 00:00:55,030 -This is the recommended, free option, as this -makes your model easily accessible and shareable. - -15 -00:00:55,030 --> 00:01:00,440 -The owners of your namespace are the only -ones who can update and change your model. - -16 -00:01:00,440 --> 00:01:03,210 -A more advanced option is the private option. - -17 -00:01:03,210 --> 00:01:08,460 -In this case, only the owners of your namespace -will have visibility over your model. - -18 -00:01:08,460 --> 00:01:15,010 -Other users won't know it exists and will -not be able to use it. - -19 -00:01:15,010 --> 00:01:18,320 -Let's create a dummy model to play with. - -20 -00:01:18,320 --> 00:01:22,360 -Once your model is created, comes the management -of that model! - -21 -00:01:22,360 --> 00:01:24,170 -Three tabs are available to you. 
- -22 -00:01:24,170 --> 00:01:29,070 -You're facing the first one, which is the -model card page; this is the page used to - -23 -00:01:29,070 --> 00:01:31,170 -showcase your model to the world. - -24 -00:01:31,170 --> 00:01:34,600 -We'll see how it can be completed in a bit. - -25 -00:01:34,600 --> 00:01:38,479 -The second one is the "Files & Versions". - -26 -00:01:38,479 --> 00:01:43,310 -Your model itself is a git repository - if -you're unaware of what is a git repository, - -27 -00:01:43,310 --> 00:01:46,439 -you can think of it as a folder containing -files. - -28 -00:01:46,439 --> 00:01:51,000 -If you have never used git before, we recommend -looking at an introduction like the one provided - -29 -00:01:51,000 --> 00:01:53,960 -in this video's description. - -30 -00:01:53,960 --> 00:01:59,020 -The git repository allows you to see the changes -happening over time in this folder, hence - -31 -00:01:59,020 --> 00:02:00,960 -the term "Versions". - -32 -00:02:00,960 --> 00:02:07,130 -We'll see how to add files and versions in -a bit. - -33 -00:02:07,130 --> 00:02:12,069 -The final tab is the "Settings" tab, which -allow you to manage your model's visibility - -34 -00:02:12,069 --> 00:02:14,780 -and availability. - -35 -00:02:14,780 --> 00:02:18,860 -Let's first start by adding files to the repository. - -36 -00:02:18,860 --> 00:02:23,459 -Files can be added through the web interface -thanks to the "Add File" button. - -37 -00:02:23,459 --> 00:02:28,849 -The added files can be of any type: python, -json, text, you name it! - -38 -00:02:28,849 --> 00:02:35,110 -Alongside your added file and its content, -you should name your change, or commit. - -39 -00:02:35,110 --> 00:02:42,670 -Generally, adding files is simpler when using -the command line. - -40 -00:02:42,670 --> 00:02:47,310 -We'll showcase how to do this using git. - -41 -00:02:47,310 --> 00:02:52,290 -In addition to git, we're using git-lfs, which -stands for large file storage in order to - -42 -00:02:52,290 --> 00:02:53,560 -manage large model files. - -43 -00:02:53,560 --> 00:03:00,980 -First, I make sure that both git and git-lfs -are correctly installed on my system. - -44 -00:03:00,980 --> 00:03:08,280 -Links to install git & git-lfs are provided -in the video description. - -45 -00:03:08,280 --> 00:03:12,470 -Then, we can get to work by cloning the repository -locally. - -46 -00:03:12,470 --> 00:03:14,990 -We have a repository with a single file! - -47 -00:03:14,990 --> 00:03:24,180 -The file that we have just added to the repository -using the web interface. - -48 -00:03:24,180 --> 00:03:45,549 -We can edit it to see the contents of this -file and update these. - -49 -00:03:45,549 --> 00:04:04,439 -It just turns out I have a model handy, that -can be used for sentiment analysis. - -50 -00:04:04,439 --> 00:04:10,790 -I'll simply copy over the contents to this -folder. - -51 -00:04:10,790 --> 00:04:20,030 -This includes the model weights, configuration -file and tokenizer to the repository. - -52 -00:04:20,030 --> 00:04:35,850 -I can then track these two files with the -git add command. - -53 -00:04:35,850 --> 00:04:40,639 -Then, I commit the changes. - -54 -00:04:40,639 --> 00:04:54,640 -I'm giving this commit the title of "Add model -weights and configuration". - -55 -00:04:54,640 --> 00:05:08,910 -Finally, I can push the new commit to the -huggingface.co remote. 
- -56 -00:05:08,910 --> 00:05:39,389 -When going back to the files & versions tab, -we can now see the newly added commit with - -57 -00:05:39,389 --> 00:05:41,090 -the updated files. - -58 -00:05:41,090 --> 00:05:59,250 -We have seen two ways of adding files to a -repository, a third way is explored in the - -59 -00:05:59,250 --> 00:06:07,310 -video about the push to hub API. - -60 -00:06:07,310 --> 00:06:25,099 -A link to this video is in - -61 -00:06:25,099 --> 00:06:45,470 -the description. - -62 -00:06:45,470 --> 00:06:50,229 -Go back to readme. - -63 -00:06:50,229 --> 00:06:57,510 -Unfortunately, the front page of our model -is still very empty. - -64 -00:06:57,510 --> 00:07:01,860 -Let's add a README markdown file to complete -it a little bit. - -65 -00:07:01,860 --> 00:07:06,770 -This README is known as the modelcard, and -it's arguably as important as the model and - -66 -00:07:06,770 --> 00:07:08,770 -tokenizer files in a model repository. - -67 -00:07:08,770 --> 00:07:15,990 -It is the central definition of the model, -ensuring reusability by fellow community members - -68 -00:07:15,990 --> 00:07:21,210 -and reproducibility of results, and providing -a platform on which other members may build - -69 -00:07:21,210 --> 00:07:22,510 -their artifacts. - -70 -00:07:22,510 --> 00:07:27,669 -We'll only add a title and a small description -here for simplicity's sake, but we encourage - -71 -00:07:27,669 --> 00:07:33,000 -you to add information relevant to how was -the model trained, its intended uses and limitations, - -72 -00:07:33,000 --> 00:07:39,190 -as well as its identified and potential biases, -evaluation results and code samples on how - -73 -00:07:39,190 --> 00:07:41,479 -your model should be used. - -74 -00:07:41,479 --> 00:07:44,220 -Great work contributing a model to the model -hub! - -75 -00:07:44,220 --> 00:07:53,110 -This model can now be used in downstream libraries -simply by specifying your model identifier. +1 +00:00:04,200 --> 00:00:06,210 +- [Instructor] In this video, +we're going to understand how + +2 +00:00:06,210 --> 00:00:08,280 +to manage a model repository + +3 +00:00:08,280 --> 00:00:10,053 +on the Hugging Face Hub Model Hub. + +4 +00:00:10,920 --> 00:00:13,020 +In order to handle a repository + +5 +00:00:13,020 --> 00:00:15,450 +you should first have +a Hugging Face account. + +6 +00:00:15,450 --> 00:00:17,610 +A link to create a new +account is available + +7 +00:00:17,610 --> 00:00:18,573 +in the description. + +8 +00:00:20,130 --> 00:00:22,980 +Once you are logged in, you +can create a new repository + +9 +00:00:22,980 --> 00:00:25,890 +by clicking on the new model option. + +10 +00:00:25,890 --> 00:00:29,400 +You should be facing a similar +modal to the following. + +11 +00:00:29,400 --> 00:00:33,240 +In the owner input, you can +put either your own namespace + +12 +00:00:33,240 --> 00:00:35,703 +or any of your organization's namespaces. + +13 +00:00:36,660 --> 00:00:39,330 +The model name is the model identifier + +14 +00:00:39,330 --> 00:00:40,320 +that will then be used + +15 +00:00:40,320 --> 00:00:43,143 +to identify your model +on the chosen namespace. + +16 +00:00:44,130 --> 00:00:47,700 +The final choice is +between public and private. + +17 +00:00:47,700 --> 00:00:49,950 +Public models are accessible by anyone. + +18 +00:00:49,950 --> 00:00:51,840 +This is the recommended free option, + +19 +00:00:51,840 --> 00:00:54,960 +as this makes your model easily +accessible and shareable. 
+ +20 +00:00:54,960 --> 00:00:57,630 +The owners of your +namespace are the only ones + +21 +00:00:57,630 --> 00:00:59,523 +who can update and change your model. + +22 +00:01:00,450 --> 00:01:03,660 +A more advanced option +is the private option. + +23 +00:01:03,660 --> 00:01:04,560 +In this case, + +24 +00:01:04,560 --> 00:01:06,000 +only the owners of your namespace + +25 +00:01:06,000 --> 00:01:08,280 +will have visibility over your model. + +26 +00:01:08,280 --> 00:01:10,260 +Other users won't know it exists + +27 +00:01:10,260 --> 00:01:11,810 +and will not be able to use it. + +28 +00:01:15,030 --> 00:01:17,030 +Let's create a dummy model to play with. + +29 +00:01:18,180 --> 00:01:19,710 +Once your model is created, + +30 +00:01:19,710 --> 00:01:22,230 +comes the management of that model. + +31 +00:01:22,230 --> 00:01:24,360 +Three tabs are available to you. + +32 +00:01:24,360 --> 00:01:27,960 +You're facing the first one, +which is the model card page. + +33 +00:01:27,960 --> 00:01:29,970 +This is the page you use +to showcase your model + +34 +00:01:29,970 --> 00:01:31,110 +to the world. + +35 +00:01:31,110 --> 00:01:33,260 +We'll see how it can +be completed in a bit. + +36 +00:01:34,500 --> 00:01:37,503 +The second one is the +files and versions tab. + +37 +00:01:38,340 --> 00:01:40,920 +Your model itself is a Git repository. + +38 +00:01:40,920 --> 00:01:43,230 +If you're unaware of +what is a Git repository, + +39 +00:01:43,230 --> 00:01:46,320 +you can think of it as a +folder containing files. + +40 +00:01:46,320 --> 00:01:48,120 +If you have never used Git before, + +41 +00:01:48,120 --> 00:01:50,100 +we recommend looking at an introduction + +42 +00:01:50,100 --> 00:01:52,600 +like the one provided in +this video's description. + +43 +00:01:53,850 --> 00:01:56,910 +The Git repository allows you +to see the changes happening + +44 +00:01:56,910 --> 00:02:00,900 +over time in this folder, +hence the term versions. + +45 +00:02:00,900 --> 00:02:03,453 +We'll see how to add files +and versions in a bit. + +46 +00:02:07,020 --> 00:02:09,570 +The final tab is the settings tab, + +47 +00:02:09,570 --> 00:02:12,120 +which allows you to manage +your model's visibility + +48 +00:02:12,120 --> 00:02:13,203 +and availability. + +49 +00:02:14,790 --> 00:02:17,673 +Let's first start by adding +files to the repository. + +50 +00:02:18,540 --> 00:02:19,560 +Files can be added + +51 +00:02:19,560 --> 00:02:23,340 +through the web interface +thanks to the add file button. + +52 +00:02:23,340 --> 00:02:27,060 +The added files can be of +any type, python, JSON, text, + +53 +00:02:27,060 --> 00:02:27,893 +you name it. + +54 +00:02:28,740 --> 00:02:31,170 +Alongside your added file and its content, + +55 +00:02:31,170 --> 00:02:33,363 +you should name your change or commit. + +56 +00:02:36,330 --> 00:02:38,400 +Generally, adding files is simpler + +57 +00:02:38,400 --> 00:02:40,770 +by using the Hugging +Face Hub Python library + +58 +00:02:40,770 --> 00:02:43,050 +or by using the command-line. + +59 +00:02:43,050 --> 00:02:44,310 +We'll showcase how to do this + +60 +00:02:44,310 --> 00:02:46,290 +using the Hugging Face Hub Python library, + +61 +00:02:46,290 --> 00:02:48,060 +and there is a link in the description + +62 +00:02:48,060 --> 00:02:49,800 +to the previous version of this video, + +63 +00:02:49,800 --> 00:02:52,743 +showcasing how to do this +using Git and the command-line. 
+ +64 +00:02:53,610 --> 00:02:54,840 +First, make sure you're logged + +65 +00:02:54,840 --> 00:02:56,460 +into your Hugging Face account, + +66 +00:02:56,460 --> 00:02:59,523 +either through the command-line +or in a Python runtime. + +67 +00:03:04,634 --> 00:03:06,390 +The first approach we'll take a look at + +68 +00:03:06,390 --> 00:03:08,880 +is using the upload file method. + +69 +00:03:08,880 --> 00:03:10,770 +This offers an extremely simple API + +70 +00:03:10,770 --> 00:03:12,630 +to upload files through the hub. + +71 +00:03:12,630 --> 00:03:14,190 +The three required parameters + +72 +00:03:14,190 --> 00:03:16,083 +are the current location of the file, + +73 +00:03:18,570 --> 00:03:21,300 +the path of that file in the repository, + +74 +00:03:21,300 --> 00:03:24,050 +and the idea of the repository +to which you're pushing. + +75 +00:03:25,650 --> 00:03:27,930 +There are a few additional parameters. + +76 +00:03:27,930 --> 00:03:29,100 +The token parameter, + +77 +00:03:29,100 --> 00:03:31,200 +if you would like to +specify a different token + +78 +00:03:31,200 --> 00:03:33,650 +than the one saved in your +cache with your login, + +79 +00:03:34,830 --> 00:03:36,750 +the repo type parameter, + +80 +00:03:36,750 --> 00:03:40,503 +if you would like to push +to a data set or a space. + +81 +00:03:42,300 --> 00:03:45,690 +We'll upload a file called +readme.md to the repository + +82 +00:03:45,690 --> 00:03:47,190 +using this method. + +83 +00:03:47,190 --> 00:03:49,710 +We first start by saving +a file with that name, + +84 +00:03:49,710 --> 00:03:51,210 +which contains some information + +85 +00:03:51,210 --> 00:03:52,920 +about the repository itself. + +86 +00:03:52,920 --> 00:03:54,243 +Here, a title. + +87 +00:03:55,950 --> 00:03:57,420 +Now that the file is saved, + +88 +00:03:57,420 --> 00:04:00,513 +let's use the upload file +method to upload it to the hub. + +89 +00:04:01,560 --> 00:04:03,540 +If we switch to the web +interface for a second + +90 +00:04:03,540 --> 00:04:07,080 +and refresh the page, we'll +see that the README is shown. + +91 +00:04:07,080 --> 00:04:08,883 +The file upload was a success. + +92 +00:04:10,170 --> 00:04:13,500 +Alongside this method +exists a delete file method + +93 +00:04:13,500 --> 00:04:16,170 +so that you may manage +your repository fully. + +94 +00:04:16,170 --> 00:04:18,820 +We'll use it to delete the +file we have just created. + +95 +00:04:22,860 --> 00:04:25,320 +If we refresh the page once again, good, + +96 +00:04:25,320 --> 00:04:26,973 +the file was indeed deleted. + +97 +00:04:29,070 --> 00:04:32,730 +This approach using only these +two methods is super simple. + +98 +00:04:32,730 --> 00:04:35,400 +It doesn't need Git or Git LFS installed, + +99 +00:04:35,400 --> 00:04:37,650 +but it does come with a limitation. + +100 +00:04:37,650 --> 00:04:39,600 +The maximum file size one can upload + +101 +00:04:39,600 --> 00:04:41,313 +is limited to five gigabytes. + +102 +00:04:42,360 --> 00:04:43,890 +To overcome this limit, + +103 +00:04:43,890 --> 00:04:45,540 +let's take a look at the second method + +104 +00:04:45,540 --> 00:04:47,643 +which is the repository utility. + +105 +00:04:48,600 --> 00:04:51,840 +This class is a wrapper over +Git and Git LFS methods, + +106 +00:04:51,840 --> 00:04:53,850 +which abstracts most of the complexity + +107 +00:04:53,850 --> 00:04:55,500 +and offers a flexible API + +108 +00:04:55,500 --> 00:04:57,990 +to manage your online repositories. 
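As a rough sketch, the two methods described above can be used like this; notebook_login, upload_file and delete_file come from the huggingface_hub library, and the repository ID below is a placeholder:

    from huggingface_hub import notebook_login, upload_file, delete_file

    notebook_login()  # or run `huggingface-cli login` in a terminal

    upload_file(
        path_or_fileobj="README.md",         # current location of the file
        path_in_repo="README.md",            # path of the file in the repository
        repo_id="my-username/dummy-model",   # placeholder repository ID
    )

    delete_file(
        path_in_repo="README.md",
        repo_id="my-username/dummy-model",
    )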
+ +109 +00:04:57,990 --> 00:04:59,690 +Let's take a look at how it works. + +110 +00:05:03,870 --> 00:05:08,369 +We first start by instantiating +the repository utility. + +111 +00:05:08,369 --> 00:05:10,380 +We provide the clone from parameter, + +112 +00:05:10,380 --> 00:05:13,383 +in order to clone the +repository we just created. + +113 +00:05:14,400 --> 00:05:18,750 +The repository is now +cloned in the local folder. + +114 +00:05:18,750 --> 00:05:22,200 +The repo object that we +have just initialized + +115 +00:05:22,200 --> 00:05:24,873 +offers quite a few methods +which are useful for us. + +116 +00:05:25,920 --> 00:05:28,800 +We're interested in +pushing a model to the hub. + +117 +00:05:28,800 --> 00:05:31,170 +I'll start by loading +a model and tokenizer + +118 +00:05:31,170 --> 00:05:32,643 +I trained a few hours ago. + +119 +00:05:34,380 --> 00:05:36,810 +We'll now follow the +traditional Git approach + +120 +00:05:36,810 --> 00:05:38,670 +by first pulling latest changes + +121 +00:05:38,670 --> 00:05:40,053 +using the Git pull method. + +122 +00:05:40,980 --> 00:05:43,170 +We just cloned the repository, + +123 +00:05:43,170 --> 00:05:45,780 +so unless this is a +super active repository, + +124 +00:05:45,780 --> 00:05:48,660 +it's unlikely that new +changes are available. + +125 +00:05:48,660 --> 00:05:51,000 +But it's always a good idea +to pull the latest changes + +126 +00:05:51,000 --> 00:05:52,300 +before doing anything new. + +127 +00:05:53,220 --> 00:05:55,200 +Now that we have pulled the repository, + +128 +00:05:55,200 --> 00:05:58,500 +I'll save the model and +tokenizer inside that folder. + +129 +00:05:58,500 --> 00:06:01,200 +This includes the model +weights, configuration file, + +130 +00:06:01,200 --> 00:06:02,673 +and tokenizer files. + +131 +00:06:04,440 --> 00:06:05,820 +Now that the model is saved, + +132 +00:06:05,820 --> 00:06:07,890 +we'll continue with the +traditional Git approach + +133 +00:06:07,890 --> 00:06:10,620 +and push it to the remote repository. + +134 +00:06:10,620 --> 00:06:12,150 +If we were using the command-line, + +135 +00:06:12,150 --> 00:06:14,250 +there are a few Git LFS specific commands + +136 +00:06:14,250 --> 00:06:15,600 +we would have to invoke. + +137 +00:06:15,600 --> 00:06:17,940 +But here, the Hugging Face hub package + +138 +00:06:17,940 --> 00:06:20,070 +takes care of all of that. + +139 +00:06:20,070 --> 00:06:24,420 +We'll start by staging the +files using the Git add method. + +140 +00:06:24,420 --> 00:06:27,600 +We'll then commit these changes +using Git commit method, + +141 +00:06:27,600 --> 00:06:30,690 +and providing a helpful commit message. + +142 +00:06:30,690 --> 00:06:33,210 +Finally, we'll push the +changes to the remote, + +143 +00:06:33,210 --> 00:06:34,953 +using the Git push method. + +144 +00:06:45,090 --> 00:06:47,430 +If we go back to the +files and versions tab, + +145 +00:06:47,430 --> 00:06:49,950 +we can now see the newly committed files. + +146 +00:06:49,950 --> 00:06:52,600 +We can even play with the +model in the inference API. + +147 +00:06:53,790 --> 00:06:55,770 +Unfortunately, the front page of our model + +148 +00:06:55,770 --> 00:06:57,540 +is still very empty. + +149 +00:06:57,540 --> 00:06:59,280 +Let's add a README markdown file + +150 +00:06:59,280 --> 00:07:00,753 +to complete it a little bit. 
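A minimal sketch of the Repository workflow just described, assuming the model and tokenizer are already loaded or trained; the local folder name and repository ID are placeholders:

    from huggingface_hub import Repository

    repo = Repository("dummy-model", clone_from="my-username/dummy-model")
    repo.git_pull()                           # always pull the latest changes first

    model.save_pretrained("dummy-model")      # writes the weights and configuration
    tokenizer.save_pretrained("dummy-model")  # writes the tokenizer files

    repo.git_add()                            # stage everything, including LFS-tracked weights
    repo.git_commit("Add model and tokenizer files")
    repo.git_push()                           # upload the commit to the Hub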
+ +151 +00:07:01,710 --> 00:07:04,200 +This README is known as the model card + +152 +00:07:04,200 --> 00:07:06,030 +and it's arguably as important + +153 +00:07:06,030 --> 00:07:09,330 +as the model and tokenizer +files in the model repository. + +154 +00:07:09,330 --> 00:07:11,280 +It is the central definition + +155 +00:07:11,280 --> 00:07:13,200 +and documentation of your model, + +156 +00:07:13,200 --> 00:07:16,440 +ensuring reusability by +fellow community members + +157 +00:07:16,440 --> 00:07:18,480 +and reproducibility of results. + +158 +00:07:18,480 --> 00:07:20,760 +Providing a platform +on which other members + +159 +00:07:20,760 --> 00:07:22,293 +may build their artifacts. + +160 +00:07:23,220 --> 00:07:25,590 +We'll only add a title and +a small description here + +161 +00:07:25,590 --> 00:07:27,060 +for simplicity's sake, + +162 +00:07:27,060 --> 00:07:29,370 +but we encourage you to +add information relevant + +163 +00:07:29,370 --> 00:07:30,990 +to how was the model trained, + +164 +00:07:30,990 --> 00:07:33,120 +it's intended use and limitations, + +165 +00:07:33,120 --> 00:07:36,180 +as well as it's identified +potential biases, + +166 +00:07:36,180 --> 00:07:37,440 +evaluation results, + +167 +00:07:37,440 --> 00:07:39,843 +and code samples on how to use your model. + +168 +00:07:41,460 --> 00:07:44,130 +Great work contributing +a model to the Model Hub. + +169 +00:07:44,130 --> 00:07:46,440 +This model can now be used +in downstream libraries + +170 +00:07:46,440 --> 00:07:48,783 +simply by specifying +your model identifier. + diff --git a/subtitles/en/33_the-push-to-hub-api-(pytorch).srt b/subtitles/en/33_the-push-to-hub-api-(pytorch).srt index f93d35d17..a2fcf8caf 100644 --- a/subtitles/en/33_the-push-to-hub-api-(pytorch).srt +++ b/subtitles/en/33_the-push-to-hub-api-(pytorch).srt @@ -1,244 +1,479 @@ -1 -00:00:05,130 --> 00:00:06,130 -The Push to Hub API. - -2 -00:00:06,130 --> 00:00:10,310 -Let's have a look at the push_to_hub API. - -3 -00:00:10,310 --> 00:00:16,209 -You will need to be logged in with your Hugging -Face account, which you can do by executing - -4 -00:00:16,209 --> 00:00:22,220 -this first cell or typing huggingface-cli -login in a terminal. - -5 -00:00:22,220 --> 00:00:27,480 -Just enter your username and password and -click login, which will store an authentication - -6 -00:00:27,480 --> 00:00:31,230 -token in the cache of the machine you're using. - -7 -00:00:31,230 --> 00:00:37,990 -Now, let's launch the fine-tuning of a BERT -model on the GLUE COLA dataset. - -8 -00:00:37,990 --> 00:00:41,900 -We won't go over the fine-tuning code because -you can find it in any Transformers tutorial, - -9 -00:00:41,900 --> 00:00:44,350 -or by looking at the videos linked below. - -10 -00:00:44,350 --> 00:00:49,920 -What interests us here, is how we can leverage -the Model Hub during training. - -11 -00:00:49,920 --> 00:00:56,500 -This is done with the push_to_hub=True passed -in your TrainingArguments . This will automatically - -12 -00:00:56,500 --> 00:01:02,149 -upload your model to the Hub each time it -is saved (so every epoch in our case), which - -13 -00:01:02,149 --> 00:01:08,260 -allows you to resume training from a different -machine if the current one gets interrupted. - -14 -00:01:08,260 --> 00:01:13,610 -The model will be uploaded in your namespace, -with the name of the output directory as a - -15 -00:01:13,610 --> 00:01:14,690 -repository name. 
- -16 -00:01:14,690 --> 00:01:20,580 -You can pick another name by passing it to -the hub_model_id argument, and you can also - -17 -00:01:20,580 --> 00:01:32,420 -push inside an organization you are a member -of by passing a full repository name. - -18 -00:01:32,420 --> 00:01:43,290 -With that done, we can just launch training -and wait a little bit. - -19 -00:01:43,290 --> 00:01:47,820 -Note that the model is pushed asynchronously, -meaning that the training continues while - -20 -00:01:47,820 --> 00:01:50,119 -your model is uploaded to the Hub. - -21 -00:01:50,119 --> 00:02:02,399 -When your first commit is finished, you can -go inspect your model on the Hub and even - -22 -00:02:02,399 --> 00:02:11,000 -start playing with its inference widget while -it's training! - -23 -00:02:11,000 --> 00:02:27,370 -There is something wrong with the labels, -but we will fix this later on in this video. - -24 -00:02:27,370 --> 00:02:33,590 -When the training is finished, we should do -one last push with trainer.push_to_hub for - -25 -00:02:33,590 --> 00:02:35,330 -two reasons. - -26 -00:02:35,330 --> 00:02:39,980 -One this will make sure we are uploading the -final version of our models if we didn't already - -27 -00:02:39,980 --> 00:02:45,860 -(for instance if we saved every n steps instead -of every epoch). - -28 -00:02:45,860 --> 00:02:51,310 -Two, this will draft a model card that will -be the landing page of your model repo. - -29 -00:02:51,310 --> 00:03:04,690 -Going back to the model page, you can see -the Trainer included some metadata that is - -30 -00:03:04,690 --> 00:03:15,350 -interpreted by the Hugging Face website in -the model card. - -31 -00:03:15,350 --> 00:03:20,120 -On top of informations about the training, -the intermediate results or the hyperparameter - -32 -00:03:20,120 --> 00:03:26,770 -used, we get the values of the metrics automatically -displayed in a small widget, and a link to - -33 -00:03:26,770 --> 00:03:28,860 -a leaderboard in Paper with Code. - -34 -00:03:28,860 --> 00:03:35,000 -The Tensorboard runs have also been pushed -to this repo, and we can look at them directly - -35 -00:03:35,000 --> 00:03:36,000 -from the Model Hub. - -36 -00:03:36,000 --> 00:03:43,709 -If you were not using the Trainer API to fine-tune -your model, you can use the push_to_hub method - -37 -00:03:43,709 --> 00:03:45,319 -on the model and tokenizer directly. - -38 -00:03:45,319 --> 00:03:49,340 -Let's test this to fix our labels in the inference -widget! - -39 -00:03:49,340 --> 00:03:54,140 -The inference widget was using default names -for labels because we did not indicate the - -40 -00:03:54,140 --> 00:03:57,100 -correspondence between integers and label -names. - -41 -00:03:57,100 --> 00:04:02,909 -We can fix in the configuration by setting -the label2id and id2label fields to their - -42 -00:04:02,909 --> 00:04:07,370 -proper value then we can push the fixed config -to our repo using the push_to_hub method. - -43 -00:04:07,370 --> 00:04:12,220 -Once this is done and we can check on the -website the model is now showing the proper - -44 -00:04:12,220 --> 00:04:13,440 -labels! - -45 -00:04:13,440 --> 00:04:21,280 -Now that the model is on the hub, we can use -it from anywhere with the from_pretrained - -46 -00:04:21,280 --> 00:04:22,370 -method. - -47 -00:04:22,370 --> 00:04:38,880 -We just have to use the identifier from the -hub and we can see that the model configuration - -48 -00:04:38,880 --> 00:04:39,880 -and weights are automatically downloaded. 
-

-49
00:04:39,880 --> 00:04:49,860
We can use this model as we would any other
Transformers model, for instance by loading

-50
00:04:49,860 --> 00:04:53,949
it in a pipeline.

-51
00:04:53,949 --> 00:04:57,550
Try the push_to_hub API on your next training
to easily share your model with the rest of

-52
00:04:57,550 --> 00:05:04,800
the world!
+1
+00:00:00,321 --> 00:00:01,497
+(air whooshing)
+
+2
+00:00:01,497 --> 00:00:02,330
+(smiley face popping)
+
+3
+00:00:02,330 --> 00:00:05,130
+(air whooshing)
+
+4
+00:00:05,130 --> 00:00:06,830
+- [Instructor] So push to hub API.
+
+5
+00:00:08,310 --> 00:00:10,533
+Let's have a look at the push to hub API.
+
+6
+00:00:11,730 --> 00:00:14,640
+You will need to be logged in
+with your Hugging Face account
+
+7
+00:00:14,640 --> 00:00:17,400
+which you can do by
+executing this first cell,
+
+8
+00:00:17,400 --> 00:00:21,123
+or by typing huggingface-cli
+login in a terminal.
+
+9
+00:00:21,990 --> 00:00:26,640
+Just enter your username
+and password, then click login,
+
+10
+00:00:26,640 --> 00:00:28,620
+this will store an authentication token
+
+11
+00:00:28,620 --> 00:00:30,670
+in the cache of the machine you're using.
+
+12
+00:00:31,890 --> 00:00:35,790
+Now, let's launch a fine
+tuning of a BERT model
+
+13
+00:00:35,790 --> 00:00:37,920
+on the GLUE COLA dataset.
+
+14
+00:00:37,920 --> 00:00:39,600
+We won't go over the fine tuning code
+
+15
+00:00:39,600 --> 00:00:42,270
+because you can find it in
+any Transformers tutorial,
+
+16
+00:00:42,270 --> 00:00:44,670
+or by looking at the videos linked below.
+
+17
+00:00:44,670 --> 00:00:46,470
+What interests us here is
+
+18
+00:00:46,470 --> 00:00:48,970
+how we can leverage the
+model hub during training.
+
+19
+00:00:49,860 --> 00:00:52,980
+This is done with the
+"push_to_hub=True" argument
+
+20
+00:00:52,980 --> 00:00:55,530
+passed in your TrainingArguments.
+
+21
+00:00:55,530 --> 00:00:57,240
+This will automatically upload your model
+
+22
+00:00:57,240 --> 00:00:59,400
+to the Hub each time it is saved,
+
+23
+00:00:59,400 --> 00:01:01,323
+so every epoch in our case.
+
+24
+00:01:02,280 --> 00:01:04,860
+This allows you to resume
+training from a different machine
+
+25
+00:01:04,860 --> 00:01:06,873
+if the current one gets interrupted.
+
+26
+00:01:08,220 --> 00:01:10,440
+The model will be uploaded
+in your namespace
+
+27
+00:01:10,440 --> 00:01:14,640
+with the name of the output
+directory you picked by default.
+
+28
+00:01:14,640 --> 00:01:16,020
+You can choose another name
+
+29
+00:01:16,020 --> 00:01:19,113
+by passing it to the
+hub_model_id argument.
+
+30
+00:01:20,070 --> 00:01:23,370
+You can also push inside an
+organization you are a member of
+
+31
+00:01:23,370 --> 00:01:25,740
+by passing a full repository name,
+
+32
+00:01:25,740 --> 00:01:28,933
+with the name of the organization/,
+
+33
+00:01:28,933 --> 00:01:30,433
+the model ID you want to pick.
+
+34
+00:01:32,250 --> 00:01:34,650
+With that done, we can
+just launch training,
+
+35
+00:01:34,650 --> 00:01:36,093
+and wait a little bit.
+
+36
+00:01:36,960 --> 00:01:39,033
+I'll cut the waiting time from the video.
+
+37
+00:01:43,260 --> 00:01:46,350
+Note that the model is
+pushed asynchronously,
+
+38
+00:01:46,350 --> 00:01:47,730
+meaning that the training continues
+
+39
+00:01:47,730 --> 00:01:49,730
+while your model is uploaded to the hub.
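In code, enabling this during training looks roughly like the sketch below; the output directory name, the hub_model_id value and the tokenized dataset are assumptions standing in for the video's notebook:

    from transformers import TrainingArguments, Trainer

    training_args = TrainingArguments(
        "bert-fine-tuned-cola",                 # output directory, also the default repo name
        save_strategy="epoch",
        push_to_hub=True,                       # upload to the Hub every time the model is saved
        hub_model_id="my-org/bert-fine-tuned-cola",  # optional: another name or an organization repo
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],       # assumed from the data-processing videos
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
    )
    trainer.train()   # checkpoints are pushed asynchronously while training continues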
+

+40
+00:01:51,060 --> 00:01:52,950
+When your first commit is finished,
+
+41
+00:01:52,950 --> 00:01:55,650
+you can go inspect your model on the Hub
+
+42
+00:01:55,650 --> 00:01:57,960
+by looking inside your namespace,
+
+43
+00:01:57,960 --> 00:01:59,943
+and you'll find it at the very top.
+
+44
+00:02:01,980 --> 00:02:04,200
+You can even start playing
+with its inference widget
+
+45
+00:02:04,200 --> 00:02:06,630
+while it's continuing the training.
+
+46
+00:02:06,630 --> 00:02:09,270
+The Cola data set tasks
+the model with determining
+
+47
+00:02:09,270 --> 00:02:11,970
+if the sentence is
+grammatically correct or not.
+
+48
+00:02:11,970 --> 00:02:15,510
+So we pick an example of an
+incorrect sentence to test it.
+
+49
+00:02:15,510 --> 00:02:16,950
+Note that it'll take a bit of time
+
+50
+00:02:16,950 --> 00:02:18,750
+to load your model inside
+the inference API,
+
+51
+00:02:18,750 --> 00:02:20,880
+the first time you try to use it.
+
+52
+00:02:20,880 --> 00:02:23,280
+We'll cut that time from the video.
+
+53
+00:02:23,280 --> 00:02:24,870
+There is something wrong with the labels,
+
+54
+00:02:24,870 --> 00:02:27,360
+but we'll fix it later in this video.
+
+55
+00:02:27,360 --> 00:02:29,520
+Once your training is finished,
+
+56
+00:02:29,520 --> 00:02:31,770
+you should do one last
+push with the trainer
+
+57
+00:02:31,770 --> 00:02:33,840
+using the push_to_hub method.
+
+58
+00:02:33,840 --> 00:02:35,430
+This is for two reasons.
+
+59
+00:02:35,430 --> 00:02:36,750
+First, this will make sure
+
+60
+00:02:36,750 --> 00:02:39,180
+you are uploading the
+final version of your model
+
+61
+00:02:39,180 --> 00:02:40,680
+if you didn't already.
+
+62
+00:02:40,680 --> 00:02:42,480
+For instance, if you used the save
+
+63
+00:02:42,480 --> 00:02:46,980
+every n steps strategy
+instead of every epoch.
+
+64
+00:02:46,980 --> 00:02:48,180
+Second, this will draft a model card
+
+65
+00:02:48,180 --> 00:02:51,120
+that will be the landing
+page of your model repo.
+
+66
+00:02:51,120 --> 00:02:52,260
+Once the commit is done,
+
+67
+00:02:52,260 --> 00:02:54,810
+let's go back on our
+model page and refresh.
+
+68
+00:02:54,810 --> 00:02:56,820
+We can see the drafted model card
+
+69
+00:02:56,820 --> 00:02:58,080
+which includes information
+
+70
+00:02:58,080 --> 00:03:00,381
+on which model we fine-tuned,
+
+71
+00:03:00,381 --> 00:03:03,570
+the final evaluation loss and metric,
+
+72
+00:03:03,570 --> 00:03:06,300
+the training hyperparameters used,
+
+73
+00:03:06,300 --> 00:03:08,670
+the intermediate training results,
+
+74
+00:03:08,670 --> 00:03:10,320
+and the framework versions we used
+
+75
+00:03:10,320 --> 00:03:13,173
+so that other people can
+easily reproduce our results.
+
+76
+00:03:15,270 --> 00:03:16,860
+On top of all that information,
+
+77
+00:03:16,860 --> 00:03:19,740
+the trainer also included some
+metadata that is interpreted
+
+78
+00:03:19,740 --> 00:03:22,650
+by the Hugging Face
+website in the model card.
+
+79
+00:03:22,650 --> 00:03:26,010
+You get the value of the metrics
+reported in a nice widget
+
+80
+00:03:26,010 --> 00:03:29,640
+as well as a link to a
+leaderboard in Papers with Code.
+
+81
+00:03:29,640 --> 00:03:32,550
+So the TensorBoard runs
+have also been pushed
+
+82
+00:03:32,550 --> 00:03:34,560
+to this repo, and we can look at them
+
+83
+00:03:34,560 --> 00:03:36,000
+directly from the model hub
+
+84
+00:03:36,000 --> 00:03:38,850
+by clicking on the
+training metrics sub menu.
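The final push described above is a single call; the commit message below is just an example:

    # Uploads the latest version of the model and drafts the model card
    # that becomes the landing page of the repository.
    trainer.push_to_hub(commit_message="End of training")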
+

+85
+00:03:38,850 --> 00:03:39,795
+If you are not using the Trainer API

+86
+00:03:39,795 --> 00:03:42,510
+to fine-tune your model,

+87
+00:03:42,510 --> 00:03:43,770
+you can use the push_to_hub method

+88
+00:03:43,770 --> 00:03:46,427
+on the model and tokenizer directly.

+89
+00:03:46,427 --> 00:03:50,160
+Let's test this to fix the
+labels in the inference widget.

+90
+00:03:50,160 --> 00:03:52,740
+The inference widget was using
+different names for labels

+91
+00:03:52,740 --> 00:03:54,810
+because we did not
+indicate the correspondence

+92
+00:03:54,810 --> 00:03:57,030
+between integer and label names.

+93
+00:03:57,030 --> 00:03:58,740
+We can fix this in the configuration

+94
+00:03:58,740 --> 00:04:01,350
+by setting the label2id

+95
+00:04:01,350 --> 00:04:04,170
+and id2label fields
+to the proper values

+96
+00:04:04,170 --> 00:04:06,933
+when pushing the model config to the hub.

+97
+00:04:07,950 --> 00:04:10,620
+Once this is done, we
+can check on the website,

+98
+00:04:10,620 --> 00:04:13,380
+and the model is now
+showing the proper label.

+99
+00:04:13,380 --> 00:04:15,240
+Now that the model is on the hub,

+100
+00:04:15,240 --> 00:04:17,370
+we can use it from anywhere

+101
+00:04:17,370 --> 00:04:19,920
+as we would any other Transformer model

+102
+00:04:19,920 --> 00:04:21,113
+with the from_pretrained method

+103
+00:04:21,113 --> 00:04:22,923
+or with the pipeline function.

+104
+00:04:34,350 --> 00:04:36,780
+We just have to use the
+identifier from the hub,

+105
+00:04:36,780 --> 00:04:39,450
+and we can see that the model
+configuration and weights

+106
+00:04:39,450 --> 00:04:42,483
+as well as the tokenizer files
+are automatically downloaded.

+107
+00:04:53,880 --> 00:04:55,950
+Try the push_to_hub API
+on your next training

+108
+00:04:55,950 --> 00:04:58,650
+to easily share your model
+with the rest of the world.

+109
+00:05:01,151 --> 00:05:03,818
+(air whooshing)
+

diff --git a/subtitles/en/34_the-push-to-hub-api-(tensorflow).srt b/subtitles/en/34_the-push-to-hub-api-(tensorflow).srt
index 2f4a27605..d0f558ccc 100644
--- a/subtitles/en/34_the-push-to-hub-api-(tensorflow).srt
+++ b/subtitles/en/34_the-push-to-hub-api-(tensorflow).srt
@@ -1,434 +1,877 @@
-1
-00:00:05,040 --> 00:00:12,000
-Hi, this is going to be a video about the
-push_to_hub API for Tensorflow and Keras. So,

-2
-00:00:12,000 --> 00:00:16,240
-to get started, we'll open up our notebook,
-and the first thing you'll need to do is

-3
-00:00:16,240 --> 00:00:22,480
-log in to your HuggingFace account, for example
-with the notebook login function. So to do that,

-4
-00:00:23,040 --> 00:00:28,560
-you simply call the function, the popup will
-emerge, you enter your username and password,

-5
-00:00:28,560 --> 00:00:34,160
-which I'm going to pull out of my password
-manager here, and you're logged in. The next

-6
-00:00:34,160 --> 00:00:38,720
-two cells are just getting everything ready for
-training. So we're just going to load a dataset,

-7
-00:00:38,720 --> 00:00:42,960
-we're going to tokenize that dataset, and then
-we're going to load our model and compile it

-8
-00:00:42,960 --> 00:00:47,040
-with the standard Adam optimizer. So
-I'm just going to run all of those,

-9
-00:00:49,600 --> 00:00:53,760
-we'll wait a few seconds, and
-everything should be ready for training.
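
The label-mapping fix and the pipeline loading described above for the PyTorch flow could look roughly like this, reusing the model from the earlier sketch; the label names and repository identifiers are assumptions:

from transformers import pipeline

# Human-readable label names for CoLA (assumed mapping: 0 = unacceptable, 1 = acceptable).
model.config.id2label = {0: "unacceptable", 1: "acceptable"}
model.config.label2id = {"unacceptable": 0, "acceptable": 1}
model.config.push_to_hub("bert-finetuned-cola")  # only the config is re-uploaded

# Anywhere else, the model can now be loaded by its Hub identifier
# ("your-username/bert-finetuned-cola" is a placeholder).
classifier = pipeline("text-classification", model="your-username/bert-finetuned-cola")
print(classifier("This sentence are wrong."))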
- -10 -00:00:57,600 --> 00:01:03,200 -Okay, so now we're ready to train I'm  -going to show you the two ways you can   - -11 -00:01:03,200 --> 00:01:07,520 -push your model to the Hub. So the  -first is with the PushToHubCallback.   - -12 -00:01:08,080 --> 00:01:14,640 -So a callback in Keras is a function that's called  -regularly during training. You can set it to be   - -13 -00:01:14,640 --> 00:01:20,640 -called after a certain number of steps, or every  -epoch, or even just once at the end of training.   - -14 -00:01:22,480 --> 00:01:27,600 -So a lot of callbacks in Keras, for example,  -control learning rate decaying on plateau   - -15 -00:01:28,320 --> 00:01:34,400 -and things like that. And so this callback, by  -default, will save your model to the Hub once   - -16 -00:01:34,400 --> 00:01:39,200 -every epoch. And that's really helpful especially  -if your training is very long, because that means   - -17 -00:01:39,200 --> 00:01:43,680 -you can resume from that save, so you get this  -automatic cloud-saving of your model, and you can   - -18 -00:01:43,680 --> 00:01:49,760 -even run inference with the checkpoints of your  -model that have been uploaded by this callback,   - -19 -00:01:50,720 --> 00:01:55,120 -and that means you can, y'know, actually  -run some test inputs and actually see how   - -20 -00:01:55,120 --> 00:02:00,560 -your model works at various stages during  -training, which is a really nice feature. So   - -21 -00:02:01,280 --> 00:02:06,560 -we're going to add the PushToHubCallback, and it  -takes just a few arguments. So the first argument   - -22 -00:02:06,560 --> 00:02:11,920 -is the temporary directory that files are going  -to be saved to before they're uploaded to the Hub.   - -23 -00:02:11,920 --> 00:02:16,880 -The second argument is the tokenizer, and the  -third argument here is the keyword argument   - -24 -00:02:17,600 --> 00:02:22,160 -hub_model_id. So that's the name it's going  -to be saved under on the HuggingFace Hub.   - -25 -00:02:23,200 --> 00:02:29,760 -You can also upload to an organization account  -just by adding the organization name before   - -26 -00:02:29,760 --> 00:02:34,320 -the repository name with a slash like this. So  -you probably don't have permissions to upload to   - -27 -00:02:34,320 --> 00:02:38,640 -the Hugging Face organization, if you do please  -file a bug and let us know extremely urgently.   - -28 -00:02:40,640 --> 00:02:44,000 -But if you do have access to your own  -organization then you can use that   - -29 -00:02:44,000 --> 00:02:47,600 -same approach to upload models to their  -account instead of to your own personal   - -30 -00:02:49,280 --> 00:02:56,080 -set of models. So, once you've made your  -callback you simply add it to the callbacks list   - -31 -00:02:56,080 --> 00:03:01,280 -when you're called model.fit() and everything is  -uploaded for you from there, and there's nothing   - -32 -00:03:01,280 --> 00:03:06,320 -else to worry about. The second way to upload a  -model, though, is to call model.push_to_hub().   - -33 -00:03:06,880 --> 00:03:11,920 -So this is more of a once-off method - it's not  -called regularly during training. You can just   - -34 -00:03:11,920 --> 00:03:17,680 -call this manually whenever you want to upload  -a model to the hub. 
So we recommend running this   - -35 -00:03:17,680 --> 00:03:22,720 -after the end of training, just to make sure that  -you have a commit message just to guarantee that   - -36 -00:03:22,720 --> 00:03:27,280 -this was the final version of the model at the  -end of training. And it just makes sure that   - -37 -00:03:28,160 --> 00:03:32,000 -you're working with the definitive end-of-training  -model and not accidentally using a model that's   - -38 -00:03:32,000 --> 00:03:36,720 -from a checkpoint somewhere along the way.  -So I'm going to run both of these cells   - -39 -00:03:38,800 --> 00:03:42,320 -and then I'm going to cut the video here, just  -because training is going to take a couple of   - -40 -00:03:42,320 --> 00:03:46,160 -minutes, and so I'll skip forward to the end of  -that, when the models have all been uploaded,   - -41 -00:03:46,160 --> 00:03:50,880 -and I'm gonna show you how you can access  -the models in the Hub and the other things   - -42 -00:03:50,880 --> 00:03:58,400 -you can do with them from there. Okay,  -we're back and our model was uploaded,   - -43 -00:03:58,960 --> 00:04:03,760 -both by the PushToHubCallback and also by our  -call to model.push_to_hub() after training.   - -44 -00:04:04,720 --> 00:04:10,320 -So everything's looking good! So now if we drop  -over to my profile on HuggingFace, and you can get   - -45 -00:04:10,320 --> 00:04:15,760 -there just by clicking the profile button in the  -dropdown, we can see that the bert-fine-tuned-cola   - -46 -00:04:15,760 --> 00:04:20,560 -model is here, and was updated 3 minutes ago. So  -it'll always be at the top of your list, because   - -47 -00:04:20,560 --> 00:04:25,280 -they're sorted by how recently they were updated.  -And we can start querying our model immediately!   - -48 -00:04:26,640 --> 00:04:36,720 -So the dataset we were training on is the Glue  -CoLA dataset, and CoLA is an acronym for Corpus   - -49 -00:04:36,720 --> 00:04:42,560 -of Linguistic Acceptability. So what that means  -is that the model is being trained to decide if a   - -50 -00:04:42,560 --> 00:04:49,040 -sentence is grammatically or linguistically okay,  -or if there's a problem with it. For example,   - -51 -00:04:49,680 --> 00:04:54,400 -we could say "This is a legitimate sentence"  -and hopefully it realizes that this is in   - -52 -00:04:54,400 --> 00:05:00,880 -fact a legitimate sentence. So it might take a  -couple of seconds for the model to load when you   - -53 -00:05:00,880 --> 00:05:05,200 -call it for the first time, so I might cut  -a couple of seconds out of this video here.   - -54 -00:05:07,680 --> 00:05:14,160 -Okay, we're back! The model loaded and we got  -an output, but there's an obvious problem here.   - -55 -00:05:14,160 --> 00:05:19,680 -So these labels aren't really telling us what  -categories the model has actually assigned to   - -56 -00:05:19,680 --> 00:05:26,720 -this input sentence. So if we want to fix that, we  -want to make sure the model config has the correct   - -57 -00:05:26,720 --> 00:05:31,920 -names for each of the label classes, and then we  -want to upload that config. So we can do that down   - -58 -00:05:31,920 --> 00:05:38,480 -here. To get the label_names, we can get that  -from the dataset we loaded, from the 'features'   - -59 -00:05:38,480 --> 00:05:44,160 -attribute it has. 
And then we can create  -dictionaries "id2label" and "label2id"   - -60 -00:05:45,200 --> 00:05:51,040 -and just assign them to the model config, and then  -we can just push our updated config and that'll   - -61 -00:05:51,040 --> 00:05:58,080 -override the existing config in the Hub repo. So  -that's just been done, so now if we go back here,   - -62 -00:05:58,080 --> 00:06:02,720 -I'm going to use a slightly different sentence  -because the outputs for sentences are sometimes   - -63 -00:06:02,720 --> 00:06:07,600 -cached, and so if we want to generate new results  -I'm going to use something slightly different. So   - -64 -00:06:07,600 --> 00:06:13,840 -let's try an incorrect sentence, so this is not  -valid English grammar and hopefully the model will   - -65 -00:06:13,840 --> 00:06:17,360 -see that. It's going to reload here, so  -I'm going to cut a couple of seconds here,   - -66 -00:06:18,480 --> 00:06:26,400 -and then we'll see what the model is going to say.  -Okay! So the model's confidence isn't very good,   - -67 -00:06:26,400 --> 00:06:31,440 -because of course we didn't really optimize our  -hyperparameters at all, but it has decided that   - -68 -00:06:31,440 --> 00:06:37,200 -this sentence is more likely to be unacceptable  -than acceptable. Presumably if we tried a bit   - -69 -00:06:37,200 --> 00:06:41,280 -harder with training we could get a much lower  -validation loss and therefore the model's   - -70 -00:06:41,280 --> 00:06:47,040 -predictions would be more precise. But let's  -try our original sentence again - of course,   - -71 -00:06:47,040 --> 00:06:52,560 -because of the caching issue we're seeing  -that the original answers are unchanged.   - -72 -00:06:52,560 --> 00:06:58,160 -So let's try a different, valid sentence. So  -let's try "This is a valid English sentence".   - -73 -00:06:59,920 --> 00:07:03,680 -And we see that now the model correctly decides  -that it has a very high probability of being   - -74 -00:07:03,680 --> 00:07:09,840 -acceptable and a very low probability of being  -unacceptable. So you can use this inference API   - -75 -00:07:09,840 --> 00:07:14,320 -even with the checkpoints that are uploaded during  -training, so it can be very interesting to see how   - -76 -00:07:15,200 --> 00:07:19,680 -the model's predictions for sample inputs  -change with each epoch of training.   - -77 -00:07:21,920 --> 00:07:27,040 -Also, the model we've uploaded is going to be  -accessible to you and, if it's shared publicly,   - -78 -00:07:27,040 --> 00:07:32,240 -to anyone else. So if you want to load that  -model all you, or anyone else, needs to do   - -79 -00:07:34,160 --> 00:07:40,640 -is just to load it in either a pipeline  -or you can just load it with, for example,   - -80 -00:07:40,640 --> 00:07:50,960 -TFAutoModelForSequenceClassification and then  -for the name you would just simply pass the path   - -81 -00:07:50,960 --> 00:07:58,560 -to the repo you want to upload - or to download,  -excuse me. So if I want to use this model again,   - -82 -00:07:58,560 --> 00:08:02,880 -if I want to load it from the hub, I just run this  -one line of code, the model will be downloaded   - -83 -00:08:05,280 --> 00:08:11,200 -and with any luck it'll be ready to  -fine-tune on a different dataset,   - -84 -00:08:11,200 --> 00:08:17,760 -make predictions with, or do anything else you  -wanna do. 
So that was a quick overview of how,   - -85 -00:08:17,760 --> 00:08:21,280 -after your training or during your  -training, you can upload models to the Hub,   - -86 -00:08:21,280 --> 00:08:26,800 -you can checkpoint there, you can resume training  -from there, and you can get inference results from   - -87 -00:08:26,800 --> 00:08:37,040 -the models you've uploaded. So thank you,  -and I hope to see you in a future video! +1 +00:00:00,587 --> 00:00:02,670 +(swoosh) + +2 +00:00:05,100 --> 00:00:07,080 +- [Narrator] Hi, this +is going to be a video + +3 +00:00:07,080 --> 00:00:09,420 +about the push_to_hub API + +4 +00:00:09,420 --> 00:00:10,670 +for Tensorflow and Keras. + +5 +00:00:11,820 --> 00:00:14,850 +So, to get started, we'll +open up our notebook. + +6 +00:00:14,850 --> 00:00:16,920 +And the first thing you'll +need to do is log in to + +7 +00:00:16,920 --> 00:00:18,170 +your HuggingFace account, + +8 +00:00:19,043 --> 00:00:20,663 +for example with the +notebook login function. + +9 +00:00:21,570 --> 00:00:24,630 +So to use that, you +simply call the function, + +10 +00:00:24,630 --> 00:00:26,010 +the popup will emerge. + +11 +00:00:26,010 --> 00:00:28,800 +You will enter your username and password, + +12 +00:00:28,800 --> 00:00:31,425 +which I'm going to pull out +of my password manager here, + +13 +00:00:31,425 --> 00:00:33,108 +and you log in. + +14 +00:00:33,108 --> 00:00:35,670 +The next two cells are just + +15 +00:00:35,670 --> 00:00:37,080 +getting everything ready for training. + +16 +00:00:37,080 --> 00:00:38,940 +So we're just going to load a dataset, + +17 +00:00:38,940 --> 00:00:41,100 +we're going to tokenize that dataset, + +18 +00:00:41,100 --> 00:00:42,990 +and then we're going to +load our model and compile + +19 +00:00:42,990 --> 00:00:45,660 +it with the standard Adam optimizer. + +20 +00:00:45,660 --> 00:00:47,560 +So I'm just going to run all of those. + +21 +00:00:49,830 --> 00:00:52,080 +We'll wait a few seconds, + +22 +00:00:52,080 --> 00:00:54,280 +and everything should +be ready for training. + +23 +00:00:57,983 --> 00:00:58,816 +Okay. + +24 +00:00:58,816 --> 00:01:01,440 +So now we're ready to train. + +25 +00:01:01,440 --> 00:01:03,030 +I'm going to show you the two ways + +26 +00:01:03,030 --> 00:01:05,130 +you can push your model to the Hub. + +27 +00:01:05,130 --> 00:01:08,190 +So the first is with +the PushToHubCallback. + +28 +00:01:08,190 --> 00:01:10,107 +So a callback in Keras + +29 +00:01:10,107 --> 00:01:13,710 +is a function that's called +regularly during training. + +30 +00:01:13,710 --> 00:01:17,400 +You can set it to be called +after a certain number of steps, + +31 +00:01:17,400 --> 00:01:21,427 +or every epoch, or even just +once at the end of training. + +32 +00:01:21,427 --> 00:01:25,080 +So a lot of callbacks +in Keras, for example, + +33 +00:01:25,080 --> 00:01:28,050 +control learning rate decaying on plateau, + +34 +00:01:28,050 --> 00:01:30,047 +and things like that. + +35 +00:01:30,047 --> 00:01:32,520 +So this callback, by default, + +36 +00:01:32,520 --> 00:01:35,760 +will save your model to +the Hub once every epoch. + +37 +00:01:35,760 --> 00:01:37,080 +And that's really helpful, + +38 +00:01:37,080 --> 00:01:38,790 +especially if your training is very long, + +39 +00:01:38,790 --> 00:01:40,800 +because that means you +can resume from that save, + +40 +00:01:40,800 --> 00:01:43,290 +so you get this automatic +cloud-saving of your model. 
+ +41 +00:01:43,290 --> 00:01:45,027 +And you can even run inference + +42 +00:01:45,027 --> 00:01:47,730 +with the checkpoints of your model + +43 +00:01:47,730 --> 00:01:50,208 +that have been uploaded by this callback. + +44 +00:01:50,208 --> 00:01:52,260 +And that means you can, + +45 +00:01:52,260 --> 00:01:54,150 +y'know, run some test inputs + +46 +00:01:54,150 --> 00:01:56,100 +and actually see how your model works + +47 +00:01:56,100 --> 00:01:57,990 +at various stages during training, + +48 +00:01:57,990 --> 00:01:59,540 +which is a really nice feature. + +49 +00:02:00,390 --> 00:02:03,960 +So we're going to add +the PushToHubCallback, + +50 +00:02:03,960 --> 00:02:05,670 +and it takes just a few arguments. + +51 +00:02:05,670 --> 00:02:08,250 +So the first argument is +the temporary directory + +52 +00:02:08,250 --> 00:02:10,260 +that files are going to be saved to + +53 +00:02:10,260 --> 00:02:12,150 +before they're uploaded to the Hub. + +54 +00:02:12,150 --> 00:02:14,127 +The second argument is the tokenizer, + +55 +00:02:14,127 --> 00:02:15,808 +and the third argument here + +56 +00:02:15,808 --> 00:02:19,080 +is the keyword argument hub_model_id. + +57 +00:02:19,080 --> 00:02:21,330 +So that's the name it's +going to be saved under + +58 +00:02:21,330 --> 00:02:23,006 +on the HuggingFace Hub. + +59 +00:02:23,006 --> 00:02:26,267 +You can also upload to +an organization account + +60 +00:02:26,267 --> 00:02:29,370 +just by adding the organization name + +61 +00:02:29,370 --> 00:02:32,460 +before the repository name +with a slash, like this. + +62 +00:02:32,460 --> 00:02:34,020 +So you probably don't have permissions + +63 +00:02:34,020 --> 00:02:36,000 +to upload to the HuggingFace organization, + +64 +00:02:36,000 --> 00:02:37,170 +if you do please file a bug + +65 +00:02:37,170 --> 00:02:38,973 +and let us know extremely urgently. + +66 +00:02:40,830 --> 00:02:42,960 +But if you do have access +to your own organization, + +67 +00:02:42,960 --> 00:02:44,730 +then you can use that same approach + +68 +00:02:44,730 --> 00:02:46,650 +to upload models to their account + +69 +00:02:46,650 --> 00:02:50,760 +instead of to your own +personal set of models. + +70 +00:02:50,760 --> 00:02:53,520 +So, once you've made your callback, + +71 +00:02:53,520 --> 00:02:56,310 +you simply add it to the callbacks list + +72 +00:02:56,310 --> 00:02:58,080 +when you're calling model.fit. + +73 +00:02:58,080 --> 00:03:01,110 +And everything is uploaded +for you from there, + +74 +00:03:01,110 --> 00:03:02,610 +there's nothing else to worry about. + +75 +00:03:02,610 --> 00:03:04,530 +The second way to upload a model, though, + +76 +00:03:04,530 --> 00:03:07,020 +is to call model.push_to_hub. + +77 +00:03:07,020 --> 00:03:09,086 +So this is more of a once-off method. + +78 +00:03:09,086 --> 00:03:11,550 +It's not called regularly during training. + +79 +00:03:11,550 --> 00:03:13,680 +You can just call this +manually whenever you want to + +80 +00:03:13,680 --> 00:03:15,240 +upload a model to the hub. + +81 +00:03:15,240 --> 00:03:18,949 +So we recommend running this +after the end of training, + +82 +00:03:18,949 --> 00:03:21,870 +just to make sure that +you have a commit message + +83 +00:03:21,870 --> 00:03:24,060 +to guarantee that this +was the final version + +84 +00:03:24,060 --> 00:03:26,143 +of the model at the end of training. 
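
A rough sketch of the two Keras upload paths just described; the directory and repository names are illustrative, and tokenizer, model, tokenized_data and val_data are assumed to come from the earlier preparation cells:

from transformers.keras_callbacks import PushToHubCallback

# Push a checkpoint to the Hub at the end of every epoch.
callback = PushToHubCallback(
    output_dir="./model_checkpoints",       # temporary local save location
    tokenizer=tokenizer,
    hub_model_id="bert-fine-tuned-cola",    # or "my-org/bert-fine-tuned-cola"
)

model.fit(tokenized_data, validation_data=val_data, epochs=2, callbacks=[callback])

# The once-off alternative: a manual push after training, with a commit message.
model.push_to_hub("bert-fine-tuned-cola", commit_message="End of training")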
+ +85 +00:03:26,143 --> 00:03:27,930 +And it just makes sure that, you know, + +86 +00:03:27,930 --> 00:03:30,480 +you're working with the +definitive end-of-training model + +87 +00:03:30,480 --> 00:03:32,190 +and not accidentally using a checkpoint + +88 +00:03:32,190 --> 00:03:34,224 +from somewhere along the way. + +89 +00:03:34,224 --> 00:03:37,173 +So I'm going to run both of these cells. + +90 +00:03:39,299 --> 00:03:41,716 +And then I'm going to cut the video here, + +91 +00:03:41,716 --> 00:03:43,080 +just because training is going +to take a couple of minutes. + +92 +00:03:43,080 --> 00:03:44,580 +So I'll skip forward to the end of that, + +93 +00:03:44,580 --> 00:03:46,320 +when the models have all been uploaded, + +94 +00:03:46,320 --> 00:03:48,390 +and I'm gonna show you how you can + +95 +00:03:48,390 --> 00:03:50,010 +access the models in the Hub, + +96 +00:03:50,010 --> 00:03:52,713 +and the other things you +can do with them from there. + +97 +00:03:55,440 --> 00:03:56,700 +Okay, we're back, + +98 +00:03:56,700 --> 00:03:59,160 +and our model was uploaded. + +99 +00:03:59,160 --> 00:04:00,750 +Both by the PushToHubCallback + +100 +00:04:00,750 --> 00:04:04,251 +and also by our call to +model.push_to_hub after training. + +101 +00:04:04,251 --> 00:04:05,910 +So everything's looking good. + +102 +00:04:05,910 --> 00:04:09,960 +So now if we drop over to +my profile on HuggingFace, + +103 +00:04:09,960 --> 00:04:12,630 +and you can get there just by +clicking the profile button + +104 +00:04:12,630 --> 00:04:13,680 +in the dropdown. + +105 +00:04:13,680 --> 00:04:16,860 +We can see that the +bert-fine-tuned-cola model is here, + +106 +00:04:16,860 --> 00:04:18,369 +and was updated 3 minutes ago. + +107 +00:04:18,369 --> 00:04:20,520 +So it'll always be at +the top of your list, + +108 +00:04:20,520 --> 00:04:23,340 +because they're sorted by how +recently they were updated. + +109 +00:04:23,340 --> 00:04:25,740 +And we can start querying +our model immediately. + +110 +00:04:30,564 --> 00:04:32,939 +So the dataset we were training on + +111 +00:04:32,939 --> 00:04:34,320 +is the Glue CoLA dataset, + +112 +00:04:34,320 --> 00:04:36,210 +and CoLA is an acronym standing for + +113 +00:04:36,210 --> 00:04:39,420 +the Corpus of Linguistic Acceptability. + +114 +00:04:39,420 --> 00:04:42,480 +So what that means is the model +is being trained to decide + +115 +00:04:42,480 --> 00:04:46,350 +if a sentence is grammatically +or linguistically okay, + +116 +00:04:46,350 --> 00:04:48,171 +or if there's a problem with it. + +117 +00:04:48,171 --> 00:04:52,890 +For example, we could say, +"This is a legitimate sentence." + +118 +00:04:52,890 --> 00:04:54,180 +And hopefully it realizes that + +119 +00:04:54,180 --> 00:04:56,080 +this is in fact a legitimate sentence. + +120 +00:04:57,630 --> 00:05:00,240 +So it might take a couple of +seconds for the model to load + +121 +00:05:00,240 --> 00:05:03,060 +when you call it for the first time. + +122 +00:05:03,060 --> 00:05:05,960 +So I might cut a couple of +seconds out of this video here. + +123 +00:05:07,860 --> 00:05:09,060 +Okay, we're back. + +124 +00:05:09,060 --> 00:05:12,407 +So the model loaded and we got an output, + +125 +00:05:12,407 --> 00:05:14,340 +but there's an obvious problem here. + +126 +00:05:14,340 --> 00:05:16,888 +So these labels aren't really telling us + +127 +00:05:16,888 --> 00:05:19,740 +what categories the model +has actually assigned + +128 +00:05:19,740 --> 00:05:21,655 +to this input sentence. 
+ +129 +00:05:21,655 --> 00:05:23,520 +So if we want to fix that, + +130 +00:05:23,520 --> 00:05:26,010 +we want to make sure the model config + +131 +00:05:26,010 --> 00:05:28,980 +has the correct names for +each of the label classes, + +132 +00:05:28,980 --> 00:05:30,707 +and then we want to upload that config. + +133 +00:05:30,707 --> 00:05:32,220 +So we can do that down here. + +134 +00:05:32,220 --> 00:05:34,050 +To get the label names, + +135 +00:05:34,050 --> 00:05:36,547 +we can get that from +the dataset we loaded, + +136 +00:05:36,547 --> 00:05:39,627 +from the features attribute it has. + +137 +00:05:39,627 --> 00:05:42,217 +And then we can create dictionaries + +138 +00:05:42,217 --> 00:05:44,865 +"id2label" and "label2id", + +139 +00:05:44,865 --> 00:05:47,452 +and just assign them to the model config. + +140 +00:05:47,452 --> 00:05:50,790 +And then we can just +push our updated config, + +141 +00:05:50,790 --> 00:05:54,690 +and that'll override the +existing config in the Hub repo. + +142 +00:05:54,690 --> 00:05:56,368 +So that's just been done. + +143 +00:05:56,368 --> 00:05:58,320 +So now, if we go back here, + +144 +00:05:58,320 --> 00:06:00,000 +I'm going to use a +slightly different sentence + +145 +00:06:00,000 --> 00:06:03,540 +because the outputs for +sentences are sometimes cached. + +146 +00:06:03,540 --> 00:06:06,030 +And so, if we want to generate new results + +147 +00:06:06,030 --> 00:06:07,590 +I'm going to use something +slightly different. + +148 +00:06:07,590 --> 00:06:09,783 +So let's try an incorrect sentence. + +149 +00:06:10,830 --> 00:06:12,640 +So this is not valid English grammar + +150 +00:06:13,538 --> 00:06:15,030 +and hopefully the model will see that. + +151 +00:06:15,030 --> 00:06:16,958 +It's going to reload here, + +152 +00:06:16,958 --> 00:06:18,630 +so I'm going to cut a +couple of seconds here, + +153 +00:06:18,630 --> 00:06:20,933 +and then we'll see what +the model is going to say. + +154 +00:06:22,860 --> 00:06:23,820 +Okay. + +155 +00:06:23,820 --> 00:06:26,580 +So the model, it's +confidence isn't very good, + +156 +00:06:26,580 --> 00:06:28,830 +because of course we +didn't really optimize + +157 +00:06:28,830 --> 00:06:30,630 +our hyperparameters at all. + +158 +00:06:30,630 --> 00:06:32,190 +But it has decided that this sentence + +159 +00:06:32,190 --> 00:06:35,094 +is more likely to be +unacceptable than acceptable. + +160 +00:06:35,094 --> 00:06:38,160 +Presumably if we tried a +bit harder with training + +161 +00:06:38,160 --> 00:06:40,080 +we could get a much lower validation loss, + +162 +00:06:40,080 --> 00:06:43,830 +and therefore the model's +predictions would be more precise. + +163 +00:06:43,830 --> 00:06:46,260 +But let's try our original sentence again. + +164 +00:06:46,260 --> 00:06:49,140 +Of course, because of the caching issue, + +165 +00:06:49,140 --> 00:06:52,740 +we're seeing that the original +answers are unchanged. + +166 +00:06:52,740 --> 00:06:55,196 +So let's try a different, valid sentence. + +167 +00:06:55,196 --> 00:06:58,767 +So let's try, "This is a +valid English sentence". + +168 +00:07:00,150 --> 00:07:02,100 +And we see that now the +model correctly decides + +169 +00:07:02,100 --> 00:07:04,290 +that it has a very high +probability of being acceptable, + +170 +00:07:04,290 --> 00:07:06,900 +and a very low probability +of being unacceptable. 
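
A sketch of the configuration fix just described for the TensorFlow flow; the dataset object and repository name are assumptions:

# Label names come from the dataset's features, as described above;
# "dataset" is assumed to be the loaded GLUE CoLA DatasetDict.
label_names = dataset["train"].features["label"].names
model.config.id2label = {i: name for i, name in enumerate(label_names)}
model.config.label2id = {name: i for i, name in enumerate(label_names)}
model.config.push_to_hub("bert-fine-tuned-cola")  # overrides the config in the Hub repo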
+ +171 +00:07:06,900 --> 00:07:09,930 +So you can use this inference API + +172 +00:07:09,930 --> 00:07:12,810 +even with the checkpoints that +are uploaded during training, + +173 +00:07:12,810 --> 00:07:14,546 +so it can be very interesting to see how + +174 +00:07:14,546 --> 00:07:17,690 +the model's predictions +for sample inputs change + +175 +00:07:17,690 --> 00:07:20,579 +with each epoch of training. + +176 +00:07:20,579 --> 00:07:23,370 +Also, the model we've uploaded + +177 +00:07:23,370 --> 00:07:25,740 +is going to be accessible to you and, + +178 +00:07:25,740 --> 00:07:28,046 +if it's shared publicly, to anyone else. + +179 +00:07:28,046 --> 00:07:29,788 +So if you want to load that model, + +180 +00:07:29,788 --> 00:07:32,500 +all you or anyone else needs to do + +181 +00:07:34,290 --> 00:07:37,440 +is just to load it in either a pipeline, + +182 +00:07:37,440 --> 00:07:40,925 +or you can just load it with, for example, + +183 +00:07:40,925 --> 00:07:43,203 +TFAutoModelForSequenceClassification. + +184 +00:07:46,920 --> 00:07:49,989 +And then for the name you +would just simply pass + +185 +00:07:49,989 --> 00:07:53,325 +the path to the repo you want to upload. + +186 +00:07:53,325 --> 00:07:55,890 +Or to download, excuse me. + +187 +00:07:55,890 --> 00:07:58,710 +So if I want to use this model again, + +188 +00:07:58,710 --> 00:08:00,667 +if I want to load it from the hub, + +189 +00:08:00,667 --> 00:08:01,763 +I just run this one line of code. + +190 +00:08:02,813 --> 00:08:03,773 +The model will be downloaded. + +191 +00:08:07,757 --> 00:08:10,080 +And, with any luck, it'll be ready to + +192 +00:08:10,080 --> 00:08:12,450 +fine-tune on a different +dataset, make predictions with, + +193 +00:08:12,450 --> 00:08:14,340 +or do anything else you wanna do. + +194 +00:08:14,340 --> 00:08:17,700 +So that was a quick overview of how, + +195 +00:08:17,700 --> 00:08:19,470 +after your training or +during your training, + +196 +00:08:19,470 --> 00:08:21,420 +you can upload models to the Hub, + +197 +00:08:21,420 --> 00:08:22,440 +you can checkpoint there, + +198 +00:08:22,440 --> 00:08:24,240 +you can resume training from there, + +199 +00:08:24,240 --> 00:08:26,790 +and you can get inference results + +200 +00:08:26,790 --> 00:08:28,384 +from the models you've uploaded. + +201 +00:08:28,384 --> 00:08:31,084 +So thank you, and I hope to +see you in a future video. + +202 +00:08:32,852 --> 00:08:34,935 +(swoosh) + diff --git a/subtitles/en/35_loading-a-custom-dataset.srt b/subtitles/en/35_loading-a-custom-dataset.srt index 0c31a2132..290b96431 100644 --- a/subtitles/en/35_loading-a-custom-dataset.srt +++ b/subtitles/en/35_loading-a-custom-dataset.srt @@ -1,129 +1,343 @@ -1 -00:00:06,080 --> 00:00:11,600 -Loading a custom dataset. Although the Hugging  -Face Hub hosts over a thousand public datasets,   - -2 -00:00:11,600 --> 00:00:15,040 -you'll often need to work with data that is  -stored on your laptop or some remote server.   - -3 -00:00:15,760 --> 00:00:19,520 -In this video we'll explore how the Datasets  -library can be used to load datasets that   - -4 -00:00:19,520 --> 00:00:24,800 -aren’t available on the Hugging Face Hub.  -As you can see in this table, the Datasets   - -5 -00:00:24,800 --> 00:00:30,080 -library provides several in-built scripts to load  -datasets in several formats. 
To load a dataset in   - -6 -00:00:30,080 --> 00:00:34,160 -one of these formats, you just need to provide the  -name of the format to the load_dataset function,   - -7 -00:00:34,160 --> 00:00:38,000 -along with a data_files argument that  -points to one or more filepaths or URLs.   - -8 -00:00:40,080 --> 00:00:44,400 -To see this in action, let's start by  -loading a local CSV file. In this example,   - -9 -00:00:44,400 --> 00:00:48,720 -we first download a dataset about wine quality  -from the UCI machine learning repository.   - -10 -00:00:50,080 --> 00:00:56,000 -Since this is a CSV file, we then specify the  -csv loading script. This script needs to know   - -11 -00:00:56,000 --> 00:01:00,160 -where our data is located, so we provide the  -filename as part of the data_files argument.   - -12 -00:01:01,920 --> 00:01:05,760 -The CSV loading script also allows you to pass  -several keyword arguments, so here we've also   - -13 -00:01:05,760 --> 00:01:10,640 -specified the separator as a semi-colon. And  -with that we can see the dataset is loaded   - -14 -00:01:10,640 --> 00:01:15,360 -automatically as a DatasetDict object, with each  -column in the CSV file represented as a feature.   - -15 -00:01:17,360 --> 00:01:21,760 -If your dataset is located on some remote  -server like GitHub or some other repository,   - -16 -00:01:21,760 --> 00:01:26,320 -the process is very similar. The only difference  -is that now the data_files argument points to a   - -17 -00:01:26,320 --> 00:01:33,600 -URL instead of a local filepath. Let's now take  -a look at loading raw text files. This format   - -18 -00:01:33,600 --> 00:01:37,840 -is quite common in NLP and you'll typically  -find books and plays are just a single file   - -19 -00:01:37,840 --> 00:01:43,040 -with raw text inside. In this example, we  -have a text file of Shakespeare plays that's   - -20 -00:01:43,040 --> 00:01:48,880 -stored on a GitHub repository. As we did for CSV  -files, we simply choose the text loading script   - -21 -00:01:48,880 --> 00:01:54,080 -and point the data_files argument to the URL.  -As you can see, these files are processed   - -22 -00:01:54,080 --> 00:01:58,640 -line-by-line, so empty lines in the raw text  -are also represented as a row in the dataset.   - -23 -00:02:00,560 --> 00:02:05,840 -For JSON files, there are two main formats to  -know about. The first one is called JSON Lines,   - -24 -00:02:05,840 --> 00:02:10,880 -where every row in the file is a separate JSON  -object. For these files, you can load the dataset   - -25 -00:02:10,880 --> 00:02:15,760 -by selecting the json loading script and pointing  -the data_files argument to the file or URL.   - -26 -00:02:16,960 --> 00:02:21,840 -In this example, we've loaded a JSON lines files  -based on Stack Exchange questions and answers. +1 +00:00:00,195 --> 00:00:01,426 +(screen whooshing) + +2 +00:00:01,426 --> 00:00:02,614 +(sticker popping) + +3 +00:00:02,614 --> 00:00:06,150 +(screen whooshing) + +4 +00:00:06,150 --> 00:00:08,430 +- Loading a custom dataset. + +5 +00:00:08,430 --> 00:00:09,750 +Although the Hugging Face Hub hosts + +6 +00:00:09,750 --> 00:00:11,730 +over a thousand public datasets, + +7 +00:00:11,730 --> 00:00:12,930 +you'll often need to work with data + +8 +00:00:12,930 --> 00:00:15,900 +that is stored on your +laptop or some remote server. 
+ +9 +00:00:15,900 --> 00:00:18,060 +In this video, we'll explore +how the Datasets library + +10 +00:00:18,060 --> 00:00:20,310 +can be used to load datasets +that aren't available + +11 +00:00:20,310 --> 00:00:21,510 +on the Hugging Face Hub. + +12 +00:00:22,980 --> 00:00:25,290 +As you can see in this +table, the Datasets library + +13 +00:00:25,290 --> 00:00:26,700 +provides several in-built scripts + +14 +00:00:26,700 --> 00:00:29,370 +to load datasets in several formats. + +15 +00:00:29,370 --> 00:00:31,200 +To load a dataset in one of these formats, + +16 +00:00:31,200 --> 00:00:32,730 +you just need to provide +the name of the format + +17 +00:00:32,730 --> 00:00:34,350 +to the load_dataset function, + +18 +00:00:34,350 --> 00:00:35,790 +along with a data_files argument + +19 +00:00:35,790 --> 00:00:37,610 +that points to one or +more filepaths or URLs. + +20 +00:00:40,350 --> 00:00:43,590 +To see this in action, let's +start by loading a CSV file. + +21 +00:00:43,590 --> 00:00:45,960 +In this example, we +first download a dataset + +22 +00:00:45,960 --> 00:00:48,963 +about wine quality from the UCI +machine learning repository. + +23 +00:00:50,220 --> 00:00:52,590 +Since this is a CSV file, we then specify + +24 +00:00:52,590 --> 00:00:53,943 +the CSV loading script. + +25 +00:00:55,320 --> 00:00:57,570 +Now, this script needs to know +where our data is located, + +26 +00:00:57,570 --> 00:00:58,650 +so we provide the filename + +27 +00:00:58,650 --> 00:01:00,483 +as part of the data_files argument. + +28 +00:01:01,860 --> 00:01:03,360 +And the loading script also allows you + +29 +00:01:03,360 --> 00:01:05,040 +to pass several keyword arguments, + +30 +00:01:05,040 --> 00:01:06,750 +so here we've also specified + +31 +00:01:06,750 --> 00:01:09,030 +that the separator is a semi-colon. + +32 +00:01:09,030 --> 00:01:10,380 +And with that, we can see the dataset + +33 +00:01:10,380 --> 00:01:13,020 +is loaded automatically +as a DatasetDict object, + +34 +00:01:13,020 --> 00:01:15,920 +with each column in the CSV +file represented as a feature. + +35 +00:01:17,610 --> 00:01:20,280 +If your dataset is located on +some remote server like GitHub + +36 +00:01:20,280 --> 00:01:22,050 +or some other repository, + +37 +00:01:22,050 --> 00:01:23,700 +the process is actually very similar. + +38 +00:01:23,700 --> 00:01:25,980 +The only difference is that +now the data_files argument + +39 +00:01:25,980 --> 00:01:28,623 +points to a URL instead +of a local filepath. + +40 +00:01:30,330 --> 00:01:33,270 +Let's now take a look at +loading raw text files. + +41 +00:01:33,270 --> 00:01:35,100 +This format is quite common in NLP, + +42 +00:01:35,100 --> 00:01:36,750 +and you'll typically find books and plays + +43 +00:01:36,750 --> 00:01:39,393 +are just a single file +with raw text inside. + +44 +00:01:40,410 --> 00:01:43,020 +In this example, we have a +text file of Shakespeare plays + +45 +00:01:43,020 --> 00:01:45,330 +that's stored on a GitHub repository. + +46 +00:01:45,330 --> 00:01:47,040 +And as we did for CSV files, + +47 +00:01:47,040 --> 00:01:49,020 +we simply choose the text loading script + +48 +00:01:49,020 --> 00:01:51,423 +and point the data_files +argument to the URL. + +49 +00:01:52,260 --> 00:01:55,110 +As you can see, these files +are processed line-by-line, + +50 +00:01:55,110 --> 00:01:57,690 +so empty lines in the raw +text are also represented + +51 +00:01:57,690 --> 00:01:58,953 +as a row in the dataset. 
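
As a sketch of the CSV and text loading just described (the local filename and the URL are placeholders):

from datasets import load_dataset

# Local CSV file with a semicolon separator, as in the wine-quality example.
wine_dataset = load_dataset("csv", data_files="winequality-white.csv", sep=";")

# Raw text file processed line by line; the URL stands in for the
# Shakespeare file hosted on GitHub mentioned in the video.
text_dataset = load_dataset("text", data_files="https://example.com/shakespeare.txt")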
+

+52
+00:02:00,810 --> 00:02:04,230
+For JSON files, there are two
+main formats to know about.

+53
+00:02:04,230 --> 00:02:06,060
+The first one is called JSON Lines,

+54
+00:02:06,060 --> 00:02:09,510
+where every row in the file
+is a separate JSON object.

+55
+00:02:09,510 --> 00:02:11,100
+For these files, you can load the dataset

+56
+00:02:11,100 --> 00:02:13,020
+by selecting the JSON loading script

+57
+00:02:13,020 --> 00:02:16,143
+and pointing the data_files
+argument to the file or URL.

+58
+00:02:17,160 --> 00:02:19,410
+In this example, we've
+loaded a JSON Lines file

+59
+00:02:19,410 --> 00:02:21,710
+based on Stack Exchange
+questions and answers.

+60
+00:02:23,490 --> 00:02:26,610
+The other format is nested JSON files.

+61
+00:02:26,610 --> 00:02:29,100
+These files basically look
+like one huge dictionary,

+62
+00:02:29,100 --> 00:02:31,200
+so the load_dataset function
+allows you to specify

+63
+00:02:31,200 --> 00:02:32,733
+which specific field to load.

+64
+00:02:33,630 --> 00:02:35,910
+For example, the SQuAD dataset
+for question answering

+65
+00:02:35,910 --> 00:02:38,340
+has this format, and we
+can load it by specifying

+66
+00:02:38,340 --> 00:02:40,340
+that we're interested in the data field.

+67
+00:02:41,400 --> 00:02:42,780
+There is just one last thing to mention

+68
+00:02:42,780 --> 00:02:44,910
+about all of these loading scripts.

+69
+00:02:44,910 --> 00:02:46,410
+If you have more than one split,

+70
+00:02:46,410 --> 00:02:49,080
+you can load them by treating
+data_files as a dictionary,

+71
+00:02:49,080 --> 00:02:52,140
+and map each split name
+to its corresponding file.

+72
+00:02:52,140 --> 00:02:53,970
+Everything else stays completely unchanged

+73
+00:02:53,970 --> 00:02:55,350
+and you can see an example of loading

+74
+00:02:55,350 --> 00:02:58,283
+both the training and validation
+splits for SQuAD here.

+75
+00:02:59,550 --> 00:03:02,310
+And with that, you can now
+load datasets from your laptop,

+76
+00:03:02,310 --> 00:03:04,653
+the Hugging Face Hub,
+or anywhere else you want.

+77
+00:03:06,277 --> 00:03:09,194
+(screen whooshing)
+

diff --git "a/subtitles/en/36_slice-and-dice-a-dataset-\360\237\224\252.srt" "b/subtitles/en/36_slice-and-dice-a-dataset-\360\237\224\252.srt"
index f899b80e1..dd49935e2 100644
--- "a/subtitles/en/36_slice-and-dice-a-dataset-\360\237\224\252.srt"
+++ "b/subtitles/en/36_slice-and-dice-a-dataset-\360\237\224\252.srt"
@@ -1,203 +1,370 @@
-1
-00:00:05,680 --> 00:00:07,440
-How to slice and dice a dataset.

-2
-00:00:08,640 --> 00:00:12,320
-Most of the time, the data you work with won’t
-be perfectly prepared for training models.

-3
-00:00:13,120 --> 00:00:17,920
-In this video we’ll explore various features
-that Datasets provides to clean up your datasets.

-4
-00:00:19,760 --> 00:00:23,520
-The Datasets library provides several built-in
-methods that allow you to wrangle your data.

-5
-00:00:25,200 --> 00:00:29,360
-In this video we'll see how you can shuffle
-and split your data, select the rows you're

-6
-00:00:29,360 --> 00:00:33,840
-interested in, tweak the columns, and apply
-processing functions with the map() method.

-7
-00:00:35,440 --> 00:00:39,920
-Let's start with shuffling. It is generally a
-good idea to apply shuffling to the training set

-8
-00:00:39,920 --> 00:00:42,640
-so that your model doesn't learn
-any artificial ordering in the data.
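
Following up on the JSON loading and multi-split loading described above for the Datasets library, a minimal sketch (all filenames are placeholders):

from datasets import load_dataset

# JSON Lines: one JSON object per row.
jsonl_dataset = load_dataset("json", data_files="stack_exchange.jsonl")

# Nested JSON such as SQuAD: keep only the "data" field.
squad = load_dataset("json", data_files="squad_train.json", field="data")

# Several splits at once: map each split name to its file.
files = {"train": "squad_train.json", "validation": "squad_validation.json"}
squad_splits = load_dataset("json", data_files=files, field="data")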
- -9 -00:00:43,360 --> 00:00:46,880 -If you want to shuffle the whole dataset, you  -can apply the appropriately named shuffle()   - -10 -00:00:46,880 --> 00:00:51,280 -method to your dataset. You can see an example of  -this method in action here, where we've downloaded   - -11 -00:00:51,280 --> 00:00:56,960 -the training split of the SQUAD dataset  -and shuffled all the rows randomly.Another   - -12 -00:00:56,960 --> 00:01:00,000 -way to shuffle the data is to  -create random train and test splits.   - -13 -00:01:00,720 --> 00:01:05,600 -This can be useful if you have to create your own  -test splits from raw data. To do this, you just   - -14 -00:01:05,600 --> 00:01:11,760 -apply the train_test_split method and specify how  -large the test split should be. In this example,   - -15 -00:01:11,760 --> 00:01:17,280 -we've specified that the test set should be  -10% of the total dataset size. You can see that   - -16 -00:01:17,280 --> 00:01:22,400 -the output of train_test_split is a DatasetDict  -object, whose keys correspond to the new splits.   - -17 -00:01:24,960 --> 00:01:28,400 -Now that we know how to shuffle a dataset,  -let's take a look at returning the rows   - -18 -00:01:28,400 --> 00:01:32,080 -we're interested in. The most common way  -to do this is with the select method.   - -19 -00:01:32,960 --> 00:01:36,560 -This method expects a list or  -generator of the dataset's indices,   - -20 -00:01:36,560 --> 00:01:39,840 -and will then return a new Dataset  -object containing just those rows.   - -21 -00:01:41,280 --> 00:01:45,600 -If you want to create a random sample of rows,  -you can do this by chaining the shuffle and select   - -22 -00:01:45,600 --> 00:01:51,120 -methods together. In this example, we've created  -a sample of 5 elements from the SQuAD dataset.   - -23 -00:01:53,280 --> 00:01:57,360 -The last way to pick out specific rows in  -a dataset is by applying the filter method.   - -24 -00:01:58,080 --> 00:02:01,360 -This method checks whether each  -rows fulfills some condition or not.   - -25 -00:02:02,080 --> 00:02:05,840 -For example, here we've created a small  -lambda function that checks whether the   - -26 -00:02:05,840 --> 00:02:10,800 -title starts with the letter "L". Once we  -apply this function with the filter method,   - -27 -00:02:10,800 --> 00:02:13,840 -we get a subset of the data  -consisting of just these titles.   - -28 -00:02:16,080 --> 00:02:19,360 -So far we've been talking about the rows  -of a dataset, but what about the columns?   - -29 -00:02:20,240 --> 00:02:23,280 -The Datasets library has two main  -methods for transforming columns:   - -30 -00:02:23,840 --> 00:02:26,480 -a rename_column method to  -change the name of a column,   - -31 -00:02:26,480 --> 00:02:31,360 -and a remove_columns method to delete them.  -You can see examples of both these method here.   - -32 -00:02:34,000 --> 00:02:38,400 -Some datasets have nested columns and you can  -expand these by applying the flatten method.   - -33 -00:02:39,120 --> 00:02:44,240 -For example in the SQUAD dataset, the answers  -column contains a text and answer_start field.   - -34 -00:02:44,960 --> 00:02:49,840 -If we want to promote them to their own separate  -columns, we can apply flatten as shown here.   - -35 -00:02:51,280 --> 00:02:55,040 -Of course, no discussion of the Datasets  -library would be complete without mentioning the   - -36 -00:02:55,040 --> 00:03:00,240 -famous map method. This method applies a custom  -processing function to each row in the dataset.   
- -37 -00:03:00,960 --> 00:03:06,480 -For example,here we first define a lowercase_title  -function that simply lowercases the text in the   - -38 -00:03:06,480 --> 00:03:13,760 -title column and then we feed that to the map  -method and voila! we now have lowercase titles.   - -39 -00:03:15,760 --> 00:03:19,280 -The map method can also be used to feed  -batches of rows to the processing function.   - -40 -00:03:19,840 --> 00:03:24,240 -This is especially useful for tokenization,  -where the tokenizers are backed by the Tokenizers   - -41 -00:03:24,240 --> 00:03:31,840 -library can use fast multithreading  -to process batches in parallel. +1 +00:00:00,215 --> 00:00:02,882 +(air whooshing) + +2 +00:00:05,760 --> 00:00:07,623 +- How to slice and dice the dataset? + +3 +00:00:08,760 --> 00:00:10,410 +Most of the time, the data you work with + +4 +00:00:10,410 --> 00:00:13,230 +won't be perfectly prepared +for training models. + +5 +00:00:13,230 --> 00:00:15,810 +In this video, we'll +explore various features + +6 +00:00:15,810 --> 00:00:18,660 +that the datasets library +provides to clean up your data. + +7 +00:00:19,915 --> 00:00:22,500 +The datasets library provides +several built-in methods + +8 +00:00:22,500 --> 00:00:25,350 +that allow you to wrangle +your data in various ways. + +9 +00:00:25,350 --> 00:00:27,360 +In this video, we'll +see how you can shuffle + +10 +00:00:27,360 --> 00:00:30,750 +and split your data, select +the rows you're interested in, + +11 +00:00:30,750 --> 00:00:32,070 +tweak the columns, + +12 +00:00:32,070 --> 00:00:34,620 +and apply processing +functions with the map method. + +13 +00:00:35,640 --> 00:00:37,620 +Let's start with shuffling. + +14 +00:00:37,620 --> 00:00:38,520 +It is generally a good idea + +15 +00:00:38,520 --> 00:00:40,140 +to apply shuffling to your training set + +16 +00:00:40,140 --> 00:00:41,250 +so that your model doesn't learn + +17 +00:00:41,250 --> 00:00:43,590 +any artificial ordering the data. + +18 +00:00:43,590 --> 00:00:45,360 +If you wanna shuffle the whole dataset, + +19 +00:00:45,360 --> 00:00:48,390 +you can apply the appropriately +named shuffle method. + +20 +00:00:48,390 --> 00:00:50,730 +You can see an example of +this method in action here, + +21 +00:00:50,730 --> 00:00:52,200 +where we've downloaded the training split + +22 +00:00:52,200 --> 00:00:55,000 +of the squad dataset and +shuffled all the rows randomly. + +23 +00:00:56,880 --> 00:00:58,230 +Another way to shuffle the data + +24 +00:00:58,230 --> 00:01:00,930 +is to create random train and test splits. + +25 +00:01:00,930 --> 00:01:02,280 +This can be useful if you have to create + +26 +00:01:02,280 --> 00:01:04,620 +your own test splits from raw data. + +27 +00:01:04,620 --> 00:01:07,620 +To do this, you just apply +the train_test_split method + +28 +00:01:07,620 --> 00:01:10,740 +and specify how large +the test split should be. + +29 +00:01:10,740 --> 00:01:14,310 +In this example, we specify +that the test set should be 10% + +30 +00:01:14,310 --> 00:01:15,963 +of the total dataset size. + +31 +00:01:16,890 --> 00:01:19,140 +You can see that the output +of the train_test_split method + +32 +00:01:19,140 --> 00:01:20,610 +is a DatasetDict object + +33 +00:01:20,610 --> 00:01:22,743 +whose keys correspond to the new splits. + +34 +00:01:25,170 --> 00:01:27,210 +Now that we know how +to shuffle the dataset, + +35 +00:01:27,210 --> 00:01:30,060 +let's take a look at returning +the rows we're interested in. 
+ +36 +00:01:30,060 --> 00:01:33,180 +The most common way to do this +is with the select method. + +37 +00:01:33,180 --> 00:01:34,590 +This method expects a list + +38 +00:01:34,590 --> 00:01:36,750 +or a generator of the datasets indices, + +39 +00:01:36,750 --> 00:01:38,670 +and will then return a new dataset object + +40 +00:01:38,670 --> 00:01:40,143 +containing just those rows. + +41 +00:01:41,490 --> 00:01:43,740 +If you wanna create a +random sample of rows, + +42 +00:01:43,740 --> 00:01:45,360 +you can do this by chaining the shuffle + +43 +00:01:45,360 --> 00:01:47,310 +and select methods together. + +44 +00:01:47,310 --> 00:01:48,450 +In this example, + +45 +00:01:48,450 --> 00:01:50,250 +we've created a sample of five elements + +46 +00:01:50,250 --> 00:01:51,423 +from the squad dataset. + +47 +00:01:53,550 --> 00:01:56,010 +The last way to pick out +specific rows in a dataset + +48 +00:01:56,010 --> 00:01:58,290 +is by applying the filter method. + +49 +00:01:58,290 --> 00:02:00,120 +This method checks whether each row + +50 +00:02:00,120 --> 00:02:02,310 +fulfills some condition or not. + +51 +00:02:02,310 --> 00:02:05,130 +For example, here we've +created a small lambda function + +52 +00:02:05,130 --> 00:02:08,460 +that checks whether the title +starts with the letter L. + +53 +00:02:08,460 --> 00:02:11,040 +Once we apply this function +with the filter method, + +54 +00:02:11,040 --> 00:02:14,283 +we get a subset of the data +just containing these rows. + +55 +00:02:16,200 --> 00:02:18,600 +So far, we've been talking +about the rows of a dataset, + +56 +00:02:18,600 --> 00:02:20,490 +but what about the columns? + +57 +00:02:20,490 --> 00:02:22,320 +The datasets library has two main methods + +58 +00:02:22,320 --> 00:02:24,060 +for transforming columns, + +59 +00:02:24,060 --> 00:02:26,760 +a rename_column method to +change the name of the column + +60 +00:02:26,760 --> 00:02:29,460 +and a remove_columns +method to delete them. + +61 +00:02:29,460 --> 00:02:31,860 +You can see examples of +both these methods here. + +62 +00:02:34,140 --> 00:02:36,060 +Some datasets have nested columns, + +63 +00:02:36,060 --> 00:02:39,360 +and you can expand these by +applying the flatten method. + +64 +00:02:39,360 --> 00:02:41,430 +For example, in the squad dataset, + +65 +00:02:41,430 --> 00:02:45,150 +the answers column contains a +text and answer_start field. + +66 +00:02:45,150 --> 00:02:47,430 +If we wanna promote them to +their own separate columns, + +67 +00:02:47,430 --> 00:02:49,383 +we can apply flatten as shown here. + +68 +00:02:51,300 --> 00:02:53,760 +Now of course, no discussion +of the datasets library + +69 +00:02:53,760 --> 00:02:56,880 +would be complete without +mentioning the famous map method. + +70 +00:02:56,880 --> 00:02:59,160 +This method applies a +custom processing function + +71 +00:02:59,160 --> 00:03:01,140 +to each row in the dataset. + +72 +00:03:01,140 --> 00:03:03,360 +For example, here we first define + +73 +00:03:03,360 --> 00:03:04,890 +a lowercase title function, + +74 +00:03:04,890 --> 00:03:07,503 +that simply lowercases the +text in the title column. + +75 +00:03:08,640 --> 00:03:11,700 +And then we feed that +function to the map method, + +76 +00:03:11,700 --> 00:03:14,223 +and voila, we now have lowercase titles. + +77 +00:03:16,020 --> 00:03:18,360 +The map method can also be +used to feed batches of rows + +78 +00:03:18,360 --> 00:03:20,100 +to the processing function. 
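
A compact sketch covering the shuffling, splitting, row selection, column tweaks and map calls described above (the seed and new column names are arbitrary choices):

from datasets import load_dataset

squad = load_dataset("squad", split="train")

# Shuffle, and carve out a random 10% test split.
squad_shuffled = squad.shuffle(seed=42)
splits = squad.train_test_split(test_size=0.1)

# Pick rows by index, by random sample, or by a condition.
sample = squad.shuffle(seed=42).select(range(5))
l_titles = squad.filter(lambda row: row["title"].startswith("L"))

# Tweak the columns.
renamed = squad.rename_column("context", "passage")
trimmed = squad.remove_columns(["id"])
flat = squad.flatten()  # promotes answers.text and answers.answer_start

# Apply a processing function to every row (batched for speed).
def lowercase_title(batch):
    return {"title": [t.lower() for t in batch["title"]]}

lowercased = squad.map(lowercase_title, batched=True)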
+ +79 +00:03:20,100 --> 00:03:22,410 +This is especially useful for tokenization + +80 +00:03:22,410 --> 00:03:25,290 +where the tokenizer is backed +by the Tokenizers library, + +81 +00:03:25,290 --> 00:03:26,910 +and they can use fast multithreading + +82 +00:03:26,910 --> 00:03:28,563 +to process batches in parallel. + +83 +00:03:30,056 --> 00:03:32,723 +(air whooshing) + diff --git "a/subtitles/en/37_datasets-+-dataframes-=-\342\235\244\357\270\217.srt" "b/subtitles/en/37_datasets-+-dataframes-=-\342\235\244\357\270\217.srt" index eb7ab2655..5204eac28 100644 --- "a/subtitles/en/37_datasets-+-dataframes-=-\342\235\244\357\270\217.srt" +++ "b/subtitles/en/37_datasets-+-dataframes-=-\342\235\244\357\270\217.srt" @@ -1,144 +1,283 @@ -1 -00:00:05,200 --> 00:00:11,680 -Datasets and DataFrames equals love. Although the  -processing functions of Datasets will cover most   - -2 -00:00:11,680 --> 00:00:15,600 -the cases needed to train a model, there are  -times when you’ll need to switch to a library   - -3 -00:00:15,600 --> 00:00:21,840 -like Pandas to access more powerful features or  -high-level APIs for visualisation. Fortunately,   - -4 -00:00:21,840 --> 00:00:25,520 -Datasets is designed to be interoperable  -with libraries like Pandas,   - -5 -00:00:25,520 --> 00:00:30,560 -as well as NumPy, PyTorch, TensorFlow,  -and JAX. In this video, we'll take a   - -6 -00:00:30,560 --> 00:00:33,920 -look at how we can quickly switch our  -data to Pandas DataFrames and back.   - -7 -00:00:35,920 --> 00:00:41,280 -As an example, let's suppose we're analysing  -Supreme Court cases from Switzerland. As usual   - -8 -00:00:41,280 --> 00:00:45,440 -we download our dataset from the Hub using the  -load_dataset() function, and you can see that the   - -9 -00:00:45,440 --> 00:00:49,600 -first element of the training set is an ordinary  -Python dictionary with various fields of interest.   - -10 -00:00:51,440 --> 00:00:54,800 -Now suppose that before we train any  -models, we'd like to explore the data a bit.   - -11 -00:00:55,360 --> 00:00:58,720 -For example we might be interested in  -knowing which legal area is most common   - -12 -00:00:59,600 --> 00:01:02,480 -or we might want to know how the  -languages are distributed across regions.   - -13 -00:01:04,320 --> 00:01:07,920 -Answering these questions with the native  -Arrow format isn't easy, but we can easily   - -14 -00:01:07,920 --> 00:01:13,280 -switch to Pandas to get our answers! The way  -this works is by using the set_format() method,   - -15 -00:01:13,280 --> 00:01:17,600 -which will change the output format of the dataset  -from Python dictionaries to Pandas DataFrames.   - -16 -00:01:18,720 --> 00:01:22,720 -As you can see in this example, each row in  -the dataset is represented as a DataFrame,   - -17 -00:01:22,720 --> 00:01:26,160 -so we can slice the whole dataset to  -get a single DataFrame of the dataset.   - -18 -00:01:27,840 --> 00:01:31,040 -The way this works under the hood is  -that the Datasets library changes the   - -19 -00:01:31,040 --> 00:01:35,440 -magic __getitem__() method of the dataset.  -The __getitem__() method is a special method   - -20 -00:01:35,440 --> 00:01:40,320 -for Python containers that allows you to  -specify how indexing works. In this case,   - -21 -00:01:40,320 --> 00:01:44,320 -the __getitem__() method of the raw dataset  -starts off by returning Python dictionaries   - -22 -00:01:45,120 --> 00:01:49,920 -and then after applying set_format() we change  -__getitem__() to return DataFrames instead.   
- -23 -00:01:51,840 --> 00:01:56,240 -The Datasets library also provides a to_pandas()  -method if you want to do the format conversion and   - -24 -00:01:56,240 --> 00:02:02,640 -slicing of the dataset in one go. And once you  -have a DataFrame, you can find answers to all   - -25 -00:02:02,640 --> 00:02:07,840 -sorts of complex questions or make plots with your  -favourite visualisation library and so on. The   - -26 -00:02:07,840 --> 00:02:10,800 -only thing to remember is that once  -you are done with your Pandas analysis,   - -27 -00:02:10,800 --> 00:02:16,240 -you should reset the output format back to Arrow  -tables. If you don't, you can run into problems if   - -28 -00:02:16,240 --> 00:02:20,240 -you try to tokenize your text because it is no  -longer represented as strings in a dictionary.   - -29 -00:02:21,520 --> 00:02:32,160 -By resetting the output format, we get back  -Arrow tables and can tokenize without problem! +1 +00:00:00,227 --> 00:00:01,432 +(whooshing sound) + +2 +00:00:01,432 --> 00:00:02,420 +(sticker popping) + +3 +00:00:02,420 --> 00:00:05,340 +(whooshing sound) + +4 +00:00:05,340 --> 00:00:07,833 +- Datasets and DataFrames equals love. + +5 +00:00:08,790 --> 00:00:11,010 +Although the processing +functions of the Datasets library + +6 +00:00:11,010 --> 00:00:14,040 +will cover most of the cases +needed to train a model, + +7 +00:00:14,040 --> 00:00:15,660 +there are times when you'll +need to switch to a library + +8 +00:00:15,660 --> 00:00:18,240 +like Pandas to access +more powerful features + +9 +00:00:18,240 --> 00:00:20,970 +or high level APIs for visualization. + +10 +00:00:20,970 --> 00:00:23,220 +Fortunately, the Datasets +library is designed + +11 +00:00:23,220 --> 00:00:25,710 +to be interoperable with +libraries like Pandas, + +12 +00:00:25,710 --> 00:00:29,790 +as well as NumPy, PyTorch, +TensorFlow and JAX. + +13 +00:00:29,790 --> 00:00:30,930 +In this video, we'll take a look + +14 +00:00:30,930 --> 00:00:32,550 +at how we can quickly switch our data + +15 +00:00:32,550 --> 00:00:34,263 +to Pandas DataFrames and back. + +16 +00:00:36,120 --> 00:00:38,310 +As an example, let's +suppose we're analyzing + +17 +00:00:38,310 --> 00:00:40,830 +Supreme Court cases from Switzerland. + +18 +00:00:40,830 --> 00:00:43,020 +As usual, we download +our dataset from the hub + +19 +00:00:43,020 --> 00:00:44,940 +using the load_dataset function. + +20 +00:00:44,940 --> 00:00:46,980 +And you can see that the first +element of the training set + +21 +00:00:46,980 --> 00:00:48,510 +is an ordinary Python dictionary + +22 +00:00:48,510 --> 00:00:50,110 +with various fields of interest. + +23 +00:00:51,690 --> 00:00:53,670 +Now, suppose that before +we train any models, + +24 +00:00:53,670 --> 00:00:55,590 +we'd like to explore the data a bit. + +25 +00:00:55,590 --> 00:00:57,390 +For example, we might +be interested in knowing + +26 +00:00:57,390 --> 00:00:59,820 +which legal areas are the most common + +27 +00:00:59,820 --> 00:01:01,380 +or we might wanna know how the languages + +28 +00:01:01,380 --> 00:01:02,930 +are distributed across regions. + +29 +00:01:04,500 --> 00:01:05,333 +Answering these questions + +30 +00:01:05,333 --> 00:01:07,530 +with the native Arrow format isn't easy, + +31 +00:01:07,530 --> 00:01:10,500 +but we can quickly switch to +Pandas to get our answers. 
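
A minimal sketch of the switch to pandas that the next subtitles walk through; the dataset identifier and column name are assumptions standing in for the Swiss court corpus:

from datasets import load_dataset

dataset = load_dataset("swiss_judgment_prediction", "all_languages", split="train")

dataset.set_format("pandas")            # indexing now returns pandas DataFrames
df = dataset[:]                         # one DataFrame for the whole split
print(df["legal area"].value_counts())  # column name is an assumption

# dataset.to_pandas() does the conversion and slicing in one go.

# Once the pandas analysis is done, switch back to Arrow so that
# tokenization keeps working on plain strings.
dataset.reset_format()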
+ +32 +00:01:10,500 --> 00:01:13,500 +The way this works is that by +using the set_format method, + +33 +00:01:13,500 --> 00:01:15,480 +we will change the output +format of the dataset + +34 +00:01:15,480 --> 00:01:18,930 +from Python dictionaries +to Pandas DataFrames. + +35 +00:01:18,930 --> 00:01:20,130 +As you can see in this example, + +36 +00:01:20,130 --> 00:01:22,890 +each row in the dataset is +represented as a DataFrame, + +37 +00:01:22,890 --> 00:01:24,540 +so we can slice the whole dataset + +38 +00:01:24,540 --> 00:01:26,583 +to get a single DataFrame of the corpus. + +39 +00:01:28,080 --> 00:01:29,520 +The way this works under the hood, + +40 +00:01:29,520 --> 00:01:31,080 +is that the datasets library changes + +41 +00:01:31,080 --> 00:01:33,900 +the magic __getitem__ +method of the dataset. + +42 +00:01:33,900 --> 00:01:35,640 +The __getitem__ method is a special method + +43 +00:01:35,640 --> 00:01:37,320 +for Python containers that allows you + +44 +00:01:37,320 --> 00:01:39,870 +to specify how indexing works. + +45 +00:01:39,870 --> 00:01:42,540 +In this case, the __getitem__ +method of the raw dataset + +46 +00:01:42,540 --> 00:01:45,150 +starts off by returning +a Python dictionary + +47 +00:01:45,150 --> 00:01:47,520 +and then after applying set_format, + +48 +00:01:47,520 --> 00:01:50,283 +we change __getitem__ to +return DataFrames instead. + +49 +00:01:52,080 --> 00:01:54,690 +The Datasets library also +provides a to_pandas method + +50 +00:01:54,690 --> 00:01:56,250 +if you wanna do the format conversion + +51 +00:01:56,250 --> 00:01:58,113 +and slicing of the dataset in one go. + +52 +00:02:00,090 --> 00:02:01,590 +And once you have a DataFrame, + +53 +00:02:01,590 --> 00:02:03,990 +you can find the answers to +all sorts of complex questions + +54 +00:02:03,990 --> 00:02:06,740 +or make plots with your +favorite visualization library. + +55 +00:02:07,890 --> 00:02:08,850 +The only thing to remember + +56 +00:02:08,850 --> 00:02:10,830 +is that once you're done +with your Pandas analysis, + +57 +00:02:10,830 --> 00:02:14,460 +you should reset the output +format back to Arrow tables. + +58 +00:02:14,460 --> 00:02:16,350 +If you don't, you can run into problems + +59 +00:02:16,350 --> 00:02:17,910 +if you try to tokenize your text + +60 +00:02:17,910 --> 00:02:19,260 +because it is no longer represented + +61 +00:02:19,260 --> 00:02:20,610 +as strings in a dictionary. + +62 +00:02:21,750 --> 00:02:24,780 +By resetting the output format +we get back Arrow tables + +63 +00:02:24,780 --> 00:02:26,580 +and we can tokenize without problem. + +64 +00:02:27,513 --> 00:02:30,346 +(whooshing sound) + diff --git a/subtitles/en/38_saving-and-reloading-a-dataset.srt b/subtitles/en/38_saving-and-reloading-a-dataset.srt index 79818d595..046a4b4f1 100644 --- a/subtitles/en/38_saving-and-reloading-a-dataset.srt +++ b/subtitles/en/38_saving-and-reloading-a-dataset.srt @@ -1,179 +1,359 @@ -1 -00:00:06,560 --> 00:00:11,600 -Saving and reloading a dataset. In this video  -we'll take a look saving a dataset in various   - -2 -00:00:11,600 --> 00:00:19,200 -formats, and explore the ways to reload the saved  -data. When you download a dataset, the processing   - -3 -00:00:19,200 --> 00:00:23,920 -scripts and data are stored locally on your  -computer. The cache allows the Datasets library   - -4 -00:00:23,920 --> 00:00:29,600 -to avoid re-downloading or processing the entire  -dataset every time you use it. 
The data is stored   - -5 -00:00:29,600 --> 00:00:34,080 -in the form of Arrow tables whose location can  -be found by accessing the dataset's cache_files   - -6 -00:00:34,080 --> 00:00:39,360 -attribute. In this example, we've downloaded  -the allocine dataset from the Hugging Face Hub   - -7 -00:00:39,360 --> 00:00:43,840 -and you can see there are three Arrow files  -stored in the cache, one for each split.   - -8 -00:00:45,120 --> 00:00:48,720 -But in many cases, you'll want to save your  -dataset in a different location or format.   - -9 -00:00:49,600 --> 00:00:53,760 -As shown in the table, the Datasets library  -provides four main functions to achieve this.   - -10 -00:00:54,880 --> 00:00:59,040 -You're probably familiar with the CSV and JSON  -formats, both of which are great if you want   - -11 -00:00:59,040 --> 00:01:04,800 -to save small to medium-sized datasets. But  -if your dataset is huge, you'll want to save   - -12 -00:01:04,800 --> 00:01:09,520 -it in either the Arrow or Parquet formats.  -Arrow files are great if you plan to reload   - -13 -00:01:09,520 --> 00:01:14,080 -or process the data in the near future. Parquet  -files are designed for long-term disk storage   - -14 -00:01:14,080 --> 00:01:17,440 -and are very space efficient. Let's  -take a closer look at each format.   - -15 -00:01:19,520 --> 00:01:25,520 -To save a Dataset or a DatasetDict object in the  -Arrow format we use the save_to_disk function. As   - -16 -00:01:25,520 --> 00:01:30,240 -you can see in this example, we simply provide the  -path we wish to save the data to, and the Datasets   - -17 -00:01:30,240 --> 00:01:34,720 -library will automatically create a directory for  -each split to store the Arrow table and metadata.   - -18 -00:01:35,600 --> 00:01:38,880 -Since we're dealing with a DatasetDict  -object that has multiple splits,   - -19 -00:01:38,880 --> 00:01:41,920 -this information is also stored  -in the dataset_dict.json file.   - -20 -00:01:44,160 --> 00:01:48,000 -Now when we want to reload the Arrow  -datasets, we use the load_from_disk function.   - -21 -00:01:48,640 --> 00:01:53,840 -We simply pass the path of our dataset directory  -and voila the original dataset is recovered!   - -22 -00:01:55,760 --> 00:01:59,920 -If we want to save our datasets in the  -CSV format we use the to_csv function.   - -23 -00:02:00,800 --> 00:02:05,280 -In this case you'll need to loop over the splits  -of the DatasetDict object and save each dataset as   - -24 -00:02:05,280 --> 00:02:11,280 -an individual CSV file. Since the to_csv file  -is based on the one from Pandas, you can pass   - -25 -00:02:11,280 --> 00:02:16,240 -keyword arguments to configure the output. In  -this example, we've set the index argument to   - -26 -00:02:16,240 --> 00:02:23,440 -None to prevent the dataset's index column from  -being included in the CSV files. To reload our CSV   - -27 -00:02:23,440 --> 00:02:29,760 -files, we use the load_dataset function together  -with the csv loading script and data_files   - -28 -00:02:29,760 --> 00:02:35,120 -argument which specifies the filenames associated  -with each split. As you can see in this example,   - -29 -00:02:35,120 --> 00:02:39,280 -by providing all the splits and their filenames,  -we've recovered the original DatasetDict object.   - -30 -00:02:41,840 --> 00:02:45,920 -To save a dataset in the JSON or Parquet  -formats is very similar to the CSV case.   
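As a minimal sketch of the Arrow workflow described above (the output directory name is arbitrary):

from datasets import load_dataset, load_from_disk

raw_datasets = load_dataset("allocine")   # a DatasetDict with one Arrow table per split

# Save every split to disk; one sub-directory per split is created,
# plus a dataset_dict.json file describing the splits
raw_datasets.save_to_disk("allocine-arrow")

# Later on, reload the exact same DatasetDict from that directory
reloaded = load_from_disk("allocine-arrow")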
- -31 -00:02:46,480 --> 00:02:52,720 -We use either the to_json function for JSON files  -or the to_parquet function for Parquet ones. And   - -32 -00:02:52,720 --> 00:02:57,440 -just like the CSV case, we need to loop over the  -splits and save each one as an individual file.   - -33 -00:02:59,680 --> 00:03:03,760 -Once our datasets are saved as JSON or  -Parquet files, we can reload them again   - -34 -00:03:03,760 --> 00:03:09,680 -with the appropriate script in the load_dataset  -function, and a data_files argument as before.   - -35 -00:03:10,640 --> 00:03:14,160 -This example shows how we can reload  -our saved datasets in either format.   - -36 -00:03:16,400 --> 00:03:26,000 -And with that you now know how to  -save your datasets in various formats! +1 +00:00:00,000 --> 00:00:02,917 +(transition music) + +2 +00:00:06,600 --> 00:00:08,283 +- Saving and reloading a dataset. + +3 +00:00:09,210 --> 00:00:10,320 +In this video, we'll take a look + +4 +00:00:10,320 --> 00:00:12,360 +at saving a dataset in various formats + +5 +00:00:12,360 --> 00:00:14,660 +and explore the ways to +reload the saved data. + +6 +00:00:17,310 --> 00:00:20,100 +When you download a dataset, +the processing scripts and data + +7 +00:00:20,100 --> 00:00:22,470 +are stored locally on your computer. + +8 +00:00:22,470 --> 00:00:24,000 +The cache allows the Datasets library + +9 +00:00:24,000 --> 00:00:25,230 +to avoid re-downloading + +10 +00:00:25,230 --> 00:00:28,620 +or processing the entire +dataset every time you use it. + +11 +00:00:28,620 --> 00:00:31,170 +Now, the data is stored in +the form of Arrow tables + +12 +00:00:31,170 --> 00:00:32,490 +whose location can be found + +13 +00:00:32,490 --> 00:00:35,730 +by accessing the dataset's +cache_files attribute. + +14 +00:00:35,730 --> 00:00:38,430 +In this example, we've +downloaded the allocine dataset + +15 +00:00:38,430 --> 00:00:40,080 +from the Hugging Face Hub, and you can see + +16 +00:00:40,080 --> 00:00:41,430 +that there are three Arrow files + +17 +00:00:41,430 --> 00:00:43,473 +stored in the cache, one for each split. + +18 +00:00:45,360 --> 00:00:47,460 +But in many cases, you'll +wanna save your dataset + +19 +00:00:47,460 --> 00:00:49,890 +in a different location or format. + +20 +00:00:49,890 --> 00:00:51,900 +As shown in the table, +the Datasets library + +21 +00:00:51,900 --> 00:00:54,870 +provides four main +functions to achieve this. + +22 +00:00:54,870 --> 00:00:56,130 +Now, you're probably already familiar + +23 +00:00:56,130 --> 00:00:58,770 +with the CSV and JSON formats, +both of which are great + +24 +00:00:58,770 --> 00:01:00,810 +if you just wanna quickly save a small + +25 +00:01:00,810 --> 00:01:02,790 +or medium-sized dataset. + +26 +00:01:02,790 --> 00:01:03,976 +But if your dataset is huge, + +27 +00:01:03,976 --> 00:01:07,860 +you'll wanna save it in either +the Arrow or Parquet formats. + +28 +00:01:07,860 --> 00:01:09,660 +Arrow files are great +if you plan to reload + +29 +00:01:09,660 --> 00:01:11,850 +or process the data in the near future. + +30 +00:01:11,850 --> 00:01:13,290 +While Parquet files are designed + +31 +00:01:13,290 --> 00:01:16,140 +for long-term storage and +are very space-efficient. + +32 +00:01:16,140 --> 00:01:18,140 +Let's take a closer look at each format. + +33 +00:01:19,800 --> 00:01:21,750 +To save a dataset or a dataset_dict object + +34 +00:01:21,750 --> 00:01:25,560 +in the Arrow format, we use +the save_to_disk function. 
+ +35 +00:01:25,560 --> 00:01:26,910 +As you can see in this example, + +36 +00:01:26,910 --> 00:01:29,790 +we simply provide the path +we wish to save the data to + +37 +00:01:29,790 --> 00:01:30,720 +and the Datasets library + +38 +00:01:30,720 --> 00:01:32,340 +will automatically create a directory + +39 +00:01:32,340 --> 00:01:35,790 +for each split to store the +Arrow table and the metadata. + +40 +00:01:35,790 --> 00:01:37,680 +Since we're dealing with +a dataset_dict object + +41 +00:01:37,680 --> 00:01:39,090 +that has multiple splits, + +42 +00:01:39,090 --> 00:01:40,590 +this information is also stored + +43 +00:01:40,590 --> 00:01:42,243 +in the dataset_dict.json file. + +44 +00:01:44,250 --> 00:01:46,710 +Now, when we wanna reload +the Arrow datasets, + +45 +00:01:46,710 --> 00:01:48,870 +we use the load_from_disk function. + +46 +00:01:48,870 --> 00:01:51,210 +We simply pass the path +of our dataset directory, + +47 +00:01:51,210 --> 00:01:53,583 +and voila, the original +dataset is recovered. + +48 +00:01:55,594 --> 00:01:57,180 +If we wanna save our dataset + +49 +00:01:57,180 --> 00:02:00,990 +in the CSV format, we +use the to_csv function. + +50 +00:02:00,990 --> 00:02:02,280 +In this case, you'll need to loop + +51 +00:02:02,280 --> 00:02:04,170 +over the splits of the dataset_dict object + +52 +00:02:04,170 --> 00:02:07,710 +and save each dataset as +an individual CSV file. + +53 +00:02:07,710 --> 00:02:10,950 +Since the to_csv function is +based on the one from Pandas, + +54 +00:02:10,950 --> 00:02:13,980 +you can pass keyword arguments +to configure the output. + +55 +00:02:13,980 --> 00:02:16,230 +In this example, we've +set the index argument + +56 +00:02:16,230 --> 00:02:18,480 +to None to prevent the +dataset's index column + +57 +00:02:18,480 --> 00:02:20,553 +from being included in the CSV files. + +58 +00:02:22,470 --> 00:02:24,240 +To reload our CSV files, + +59 +00:02:24,240 --> 00:02:27,180 +we just then use the familiar +load_dataset function + +60 +00:02:27,180 --> 00:02:29,160 +together with the CSV loading script + +61 +00:02:29,160 --> 00:02:30,360 +and the data_files argument, + +62 +00:02:30,360 --> 00:02:34,020 +which specifies the file names +associated with each split. + +63 +00:02:34,020 --> 00:02:35,400 +As you can see in this example, + +64 +00:02:35,400 --> 00:02:37,320 +by providing all the splits +and their file names, + +65 +00:02:37,320 --> 00:02:39,770 +we've recovered the original +dataset_dict object. + +66 +00:02:41,880 --> 00:02:43,560 +Now, to save a dataset in the JSON + +67 +00:02:43,560 --> 00:02:46,710 +or Parquet formats is very +similar to the CSV case. + +68 +00:02:46,710 --> 00:02:49,890 +We use either the to_json +function for JSON files + +69 +00:02:49,890 --> 00:02:52,740 +or the to_parquet +function for Parquet ones. + +70 +00:02:52,740 --> 00:02:55,740 +And just like the CSV case, we +need to loop over the splits + +71 +00:02:55,740 --> 00:02:57,753 +to save each one as an individual file. + +72 +00:02:59,580 --> 00:03:02,940 +And once our datasets are +saved as JSON or Parquet files, + +73 +00:03:02,940 --> 00:03:03,990 +we can reload them again + +74 +00:03:03,990 --> 00:03:06,960 +with the appropriate script +in the load_dataset function. + +75 +00:03:06,960 --> 00:03:09,993 +And we just need to provide a +data_files argument as before. + +76 +00:03:10,860 --> 00:03:11,910 +This example shows + +77 +00:03:11,910 --> 00:03:14,560 +how we can reload our save +datasets in either format. 
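Here is a sketch of the save-and-reload loop for the flat file formats, assuming the usual train, validation and test split names of the allocine dataset. Swap to_csv for to_json or to_parquet, and the "csv" loading script for "json" or "parquet", to cover the other formats:

from datasets import load_dataset

raw_datasets = load_dataset("allocine")

# Save each split as an individual file
for split, dataset in raw_datasets.items():
    dataset.to_csv(f"allocine-{split}.csv", index=None)

# Reload by pointing the loading script at the right files
data_files = {
    "train": "allocine-train.csv",
    "validation": "allocine-validation.csv",
    "test": "allocine-test.csv",
}
reloaded = load_dataset("csv", data_files=data_files)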
+ +78 +00:03:16,620 --> 00:03:17,970 +And with that, you now know + +79 +00:03:17,970 --> 00:03:20,220 +how to save your datasets +in various formats. + +80 +00:03:21,441 --> 00:03:24,358 +(transition music) + diff --git a/subtitles/en/39_memory-mapping-&-streaming.srt b/subtitles/en/39_memory-mapping-&-streaming.srt index 162665241..2efb51f8a 100644 --- a/subtitles/en/39_memory-mapping-&-streaming.srt +++ b/subtitles/en/39_memory-mapping-&-streaming.srt @@ -1,183 +1,370 @@ -1 -00:00:05,520 --> 00:00:10,720 -Memory mapping and streaming. In this video we'll  -take a look at two core features of the Datasets   - -2 -00:00:10,720 --> 00:00:15,920 -library that allow you to load and process huge  -datasets without blowing up your laptop's CPU. - -3 -00:00:18,160 --> 00:00:22,720 -Nowadays it is not uncommon to find yourself  -working with multi-GB sized datasets,   - -4 -00:00:22,720 --> 00:00:26,880 -especially if you’re planning to pretrain a  -transformer like BERT or GPT-2 from scratch.   - -5 -00:00:27,920 --> 00:00:30,480 -In these cases, even *loading*  -the data can be a challenge.   - -6 -00:00:31,040 --> 00:00:36,560 -For example, the C4 corpus used to -pretrain T5 consists of over 2 terabytes of data! - -7 -00:00:38,160 --> 00:00:42,720 -To handle these large datasets, the Datasets  -library is built on two core features:   - -8 -00:00:42,720 --> 00:00:45,120 -the Apache Arrow format and a streaming API.   - -9 -00:00:46,160 --> 00:00:51,120 -Arrow is designed for high-performance data  -processing and represents each table-like dataset   - -10 -00:00:51,120 --> 00:00:56,240 -with an in-memory columnar format. As you can  -see in this example, columnar formats group   - -11 -00:00:56,240 --> 00:01:01,280 -the elements of a table in consecutive blocks of  -RAM and this unlocks fast access and processing.   - -12 -00:01:02,560 --> 00:01:07,600 -Arrow is great at processing data at any scale,  -but some datasets are so large that you can't even   - -13 -00:01:07,600 --> 00:01:12,480 -fit them on your hard disk. For these cases,  -the Datasets library provides a streaming API   - -14 -00:01:13,040 --> 00:01:18,080 -that allows you to progressively download the  -raw data one element at a time. The result is   - -15 -00:01:18,080 --> 00:01:21,600 -a special object called an IterableDataset  -that we'll see in more detail soon.   - -16 -00:01:23,520 --> 00:01:28,160 -Let's start by looking at why Arrow is so  -powerful. The first feature is that it treat every   - -17 -00:01:28,160 --> 00:01:34,000 -dataset as a memory-mapped file. Memory mapping  -is a mechanism that maps a portion of a file or   - -18 -00:01:34,000 --> 00:01:38,967 -an entire file on disk to a chunk of virtual  -memory. This allows applications to access can   - -19 -00:01:38,967 --> 00:01:43,360 -access segments in an extremely large file without  -having to read the entire file into memory first.   - -20 -00:01:44,960 --> 00:01:49,040 -Another cool feature of Arrow's memory  -mapping capability is that it allows multiple   - -21 -00:01:49,040 --> 00:01:53,840 -processes to work with the same large dataset  -without moving it or copying it in any way.   - -22 -00:01:55,520 --> 00:01:59,920 -This "zero-copy" feature of Arrow makes it  -extremely fast for iterating over a dataset.   - -23 -00:02:00,480 --> 00:02:05,920 -In this example you can see that we iterate over  -15 million rows in about a minute using a standard   - -24 -00:02:05,920 --> 00:02:12,480 -laptop - that's not too bad at all! 
Let's now  -take a look at how we can stream a large dataset.   - -25 -00:02:12,480 --> 00:02:16,720 -The only change you need to make is to set the  -streaming=True argument in the load_dataset()   - -26 -00:02:16,720 --> 00:02:21,120 -function. This will return a special  -IterableDataset object, which is a bit different   - -27 -00:02:21,120 --> 00:02:26,160 -to the Dataset objects we've seen in other  -videos. This object is an iterable, which means   - -28 -00:02:26,160 --> 00:02:31,680 -we can't index it to access elements, but instead  -iterate on it using the iter and next methods.   - -29 -00:02:32,640 --> 00:02:36,080 -This will download and access a single  -example from the dataset, which means   - -30 -00:02:36,080 --> 00:02:39,760 -you can progressively iterate through a huge  -dataset without having to download it first.   - -31 -00:02:41,840 --> 00:02:47,040 -Tokenizing text with the map() method also works  -in a similar way. We first stream the dataset and   - -32 -00:02:47,040 --> 00:02:52,480 -then apply the map() method with the tokenizer. To  -get the first tokenized example we apply iter and   - -33 -00:02:52,480 --> 00:02:58,560 -next. The main difference with an IterableDataset  -is that instead of using the select() method to   - -34 -00:02:58,560 --> 00:03:04,240 -return example, we use the take() and skip()  -methods because we can't index into the dataset.   - -35 -00:03:04,240 --> 00:03:10,320 -The take() method returns the first N examples  -in the dataset, while skip() skips the first N   - -36 -00:03:10,320 --> 00:03:15,680 -and returns the rest. You can see examples  -of both in action here, where we create   - -37 -00:03:15,680 --> 00:03:27,040 -a validation set from the first 1000 examples  -and then skip those to create the training set. +1 +00:00:00,511 --> 00:00:01,784 +(air whooshing) + +2 +00:00:01,784 --> 00:00:02,964 +(logo popping) + +3 +00:00:02,964 --> 00:00:05,640 +(metal sliding) + +4 +00:00:05,640 --> 00:00:07,203 +- Memory mapping and streaming. + +5 +00:00:08,040 --> 00:00:09,180 +In this video, we'll take a look + +6 +00:00:09,180 --> 00:00:11,520 +at two core features +of the Datasets library + +7 +00:00:11,520 --> 00:00:14,220 +that allow you to load +and process huge datasets + +8 +00:00:14,220 --> 00:00:16,263 +without blowing up your laptop's CPU. + +9 +00:00:18,300 --> 00:00:20,280 +Nowadays, it's not +uncommon to find yourself + +10 +00:00:20,280 --> 00:00:22,950 +working with multi-GB sized datasets, + +11 +00:00:22,950 --> 00:00:24,420 +especially if you're planning to pretrain + +12 +00:00:24,420 --> 00:00:28,110 +a transformer like BERT +or GPT-2 from scratch. + +13 +00:00:28,110 --> 00:00:31,260 +In these cases, even loading +the data can be a challenge. + +14 +00:00:31,260 --> 00:00:34,680 +For example, the c4 +corpus used to pretrain T5 + +15 +00:00:34,680 --> 00:00:36,903 +consists of over two terabytes of data. + +16 +00:00:38,400 --> 00:00:40,050 +To handle these large datasets, + +17 +00:00:40,050 --> 00:00:42,990 +the Datasets library is +built on two core features: + +18 +00:00:42,990 --> 00:00:46,350 +the Apache Arrow format +and a streaming API. + +19 +00:00:46,350 --> 00:00:49,110 +Arrow is designed for +high-performance data processing + +20 +00:00:49,110 --> 00:00:51,360 +and represents each table-like dataset + +21 +00:00:51,360 --> 00:00:52,773 +with a column-based format. 
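If you want to reproduce that kind of measurement yourself, a simple sketch looks like this; the dataset name is only an example and the exact timing will of course depend on your machine:

import time
from datasets import load_dataset

dataset = load_dataset("allocine", split="train")   # any dataset works here

start = time.time()
batch_size = 1000
for idx in range(0, len(dataset), batch_size):
    _ = dataset[idx : idx + batch_size]  # reads straight from the memory-mapped Arrow file
print(f"Iterated over {len(dataset)} rows in {time.time() - start:.1f}s")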
+ +22 +00:00:53,730 --> 00:00:56,130 +As you can see in this +example, column-based formats + +23 +00:00:56,130 --> 00:00:59,280 +group the elements of a table +in consecutive blocks of RAM + +24 +00:00:59,280 --> 00:01:01,563 +and this unlocks fast +access and processing. + +25 +00:01:02,760 --> 00:01:05,550 +Arrow is great at +processing data at any scale + +26 +00:01:05,550 --> 00:01:07,110 +but some datasets are so large + +27 +00:01:07,110 --> 00:01:09,600 +that you can't even fit +them on your hard disk. + +28 +00:01:09,600 --> 00:01:11,730 +So for these cases, the +Datasets library provides + +29 +00:01:11,730 --> 00:01:14,820 +a streaming API that allows +you to progressively download + +30 +00:01:14,820 --> 00:01:17,700 +the raw data one element at a time. + +31 +00:01:17,700 --> 00:01:20,430 +The result is a special object +called an IterableDataset + +32 +00:01:20,430 --> 00:01:22,180 +that we'll see in more detail soon. + +33 +00:01:23,700 --> 00:01:26,670 +Let's start by looking at +why Arrow is so powerful. + +34 +00:01:26,670 --> 00:01:28,860 +The first feature is that +it treats every dataset + +35 +00:01:28,860 --> 00:01:30,153 +as a memory-mapped file. + +36 +00:01:31,020 --> 00:01:32,430 +Now, memory mapping is a mechanism + +37 +00:01:32,430 --> 00:01:35,400 +that maps a portion of a file +or an entire file and disc + +38 +00:01:35,400 --> 00:01:37,410 +to a chunk of virtual memory. + +39 +00:01:37,410 --> 00:01:38,520 +This allows applications + +40 +00:01:38,520 --> 00:01:41,280 +to access segments of +an extremely large file + +41 +00:01:41,280 --> 00:01:44,080 +without having to read the +whole file into memory first. + +42 +00:01:45,150 --> 00:01:48,120 +Another cool feature of Arrow's +memory mapping capabilities + +43 +00:01:48,120 --> 00:01:49,860 +is that it allows multiple processes + +44 +00:01:49,860 --> 00:01:51,840 +to work with the same large dataset + +45 +00:01:51,840 --> 00:01:54,333 +without moving it or +copying it in any way. + +46 +00:01:55,680 --> 00:01:57,570 +This zero-copy feature of Arrow + +47 +00:01:57,570 --> 00:02:00,600 +makes it extremely fast for +iterating over a dataset. + +48 +00:02:00,600 --> 00:02:02,640 +And this example, you +can see that we iterate + +49 +00:02:02,640 --> 00:02:05,160 +over 15 million rows in about a minute + +50 +00:02:05,160 --> 00:02:06,780 +just using a standard laptop. + +51 +00:02:06,780 --> 00:02:08,080 +That's not too bad at all. + +52 +00:02:09,750 --> 00:02:12,660 +Let's now take a look at how +we can stream a large dataset. + +53 +00:02:12,660 --> 00:02:14,520 +The only change you need to make is to set + +54 +00:02:14,520 --> 00:02:17,910 +the streaming=True argument in +the load_dataset() function. + +55 +00:02:17,910 --> 00:02:20,580 +This will return a special +IterableDataset object + +56 +00:02:20,580 --> 00:02:22,260 +which is a bit different +to the Dataset objects + +57 +00:02:22,260 --> 00:02:24,330 +we've seen in other videos. + +58 +00:02:24,330 --> 00:02:25,980 +This object is an iterable, + +59 +00:02:25,980 --> 00:02:28,530 +which means we can't index +it to access elements, + +60 +00:02:28,530 --> 00:02:30,180 +but instead we iterate on it + +61 +00:02:30,180 --> 00:02:32,850 +using the iter and next methods. 
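Putting the streaming pieces together, a minimal sketch might look like the following. The corpus id and config are assumptions (any large Hub dataset works), as is the "text" column name:

from datasets import load_dataset
from transformers import AutoTokenizer

streamed_dataset = load_dataset(
    "allenai/c4", "en", split="train", streaming=True
)

# No indexing on an IterableDataset: pull examples one at a time instead
print(next(iter(streamed_dataset)))

# map() is lazy here, so tokenization happens as we iterate
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenized = streamed_dataset.map(lambda example: tokenizer(example["text"]))
print(next(iter(tokenized)))

# take() and skip() replace select() for carving out subsets of the stream
validation_stream = streamed_dataset.take(1000)
train_stream = streamed_dataset.skip(1000)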
+ +62 +00:02:32,850 --> 00:02:34,050 +This will download and access + +63 +00:02:34,050 --> 00:02:35,850 +a single example from the dataset, + +64 +00:02:35,850 --> 00:02:37,410 +which means you can progressively iterate + +65 +00:02:37,410 --> 00:02:40,360 +through a huge dataset without +having to download it first. + +66 +00:02:42,150 --> 00:02:43,590 +Tokenizing text with a map() method + +67 +00:02:43,590 --> 00:02:45,660 +also works in a similar way. + +68 +00:02:45,660 --> 00:02:47,160 +We first stream the dataset + +69 +00:02:47,160 --> 00:02:49,830 +and then apply the map() +method with the tokenizer. + +70 +00:02:49,830 --> 00:02:53,283 +To get the first tokenized +example, we apply iter and next. + +71 +00:02:54,750 --> 00:02:57,210 +The main difference with +an IterableDataset is that + +72 +00:02:57,210 --> 00:02:59,970 +instead of using a select() +method to return examples, + +73 +00:02:59,970 --> 00:03:01,530 +we use the take() and skip() methods + +74 +00:03:01,530 --> 00:03:03,573 +because we can't index into the dataset. + +75 +00:03:04,470 --> 00:03:05,460 +The take() method returns + +76 +00:03:05,460 --> 00:03:07,500 +the first N examples in the dataset, + +77 +00:03:07,500 --> 00:03:09,270 +while skip(), as you can imagine, + +78 +00:03:09,270 --> 00:03:12,480 +skips the first N and returns the rest. + +79 +00:03:12,480 --> 00:03:15,300 +You can see examples of both +of these methods in action + +80 +00:03:15,300 --> 00:03:16,710 +where we create a validation set + +81 +00:03:16,710 --> 00:03:18,660 +from the first 1000 examples + +82 +00:03:18,660 --> 00:03:21,010 +and then skip those to +create the training set. + +83 +00:03:23,012 --> 00:03:25,762 +(air whooshing) + diff --git a/subtitles/en/40_uploading-a-dataset-to-the-hub.srt b/subtitles/en/40_uploading-a-dataset-to-the-hub.srt index 3f3caec00..ee571bef0 100644 --- a/subtitles/en/40_uploading-a-dataset-to-the-hub.srt +++ b/subtitles/en/40_uploading-a-dataset-to-the-hub.srt @@ -1,109 +1,228 @@ -1 -00:00:07,760 --> 00:00:11,760 -In this video we'll take a look at how you  -upload your very own dataset to the Hub.   - -2 -00:00:13,520 --> 00:00:16,560 -The first you'll need to do is create  -a new dataset repository on the Hub.   - -3 -00:00:17,360 --> 00:00:20,480 -Just click on your profile icon and  -select the "New Dataset" button.   - -4 -00:00:21,600 --> 00:00:26,720 -Next we need to assign an owner of the dataset.  -By default, this will be your Hub account,   - -5 -00:00:26,720 --> 00:00:29,840 -but you can also create datasets under  -any organisation that you belong to.   - -6 -00:00:30,720 --> 00:00:36,160 -Then we just need to give the dataset a name and  -specify whether it is a public or private dataset.   - -7 -00:00:37,200 --> 00:00:41,520 -Public datasets can be accessed by anyone,  -while private datasets can only be accessed   - -8 -00:00:41,520 --> 00:00:46,800 -by you or members of your organisation. And with  -that we can go ahead and create the dataset!   - -9 -00:00:48,480 --> 00:00:52,800 -Now that you have an empty dataset repository on  -the Hub, the next thing to do is add some data   - -10 -00:00:52,800 --> 00:00:59,360 -to it! You can do this with Git, but the easiest  -way is by selecting "Upload file" and uploading   - -11 -00:00:59,360 --> 00:01:04,880 -the files directly from your machine. After you've  -uploaded the files, you'll see them appear in the   - -12 -00:01:04,880 --> 00:01:11,360 -repository under the "Files and versions" tab.  
-The last step is to create a dataset card. Well   - -13 -00:01:11,360 --> 00:01:14,160 -documented datasets are more likely to be useful  -to others (including your future self!) as they   - -14 -00:01:14,160 --> 00:01:18,400 -provide the context to decide whether the dataset  -is relevant or whether there are any biases or   - -15 -00:01:18,400 --> 00:01:23,680 -risks associated with using the dataset. On the  -Hugging Face Hub, this information is stored in   - -16 -00:01:23,680 --> 00:01:29,440 -each repository’s README.md file and there are  -two main steps you should take. First you need   - -17 -00:01:29,440 --> 00:01:33,360 -to create some metadata that will allow your  -dataset to be easily found by others on the Hub.   - -18 -00:01:34,400 --> 00:01:38,560 -You can create this metadata using the Datasets  -Tagging Application which we'll link to in the   - -19 -00:01:38,560 --> 00:01:43,040 -video description. Once you have created the  -metadata, you can fill out the rest of the   - -20 -00:01:43,040 --> 00:01:49,200 -dataset card and we provide a template that is  -also linked in the video. And once your dataset   - -21 -00:01:49,200 --> 00:01:53,680 -is up on the Hub, you can load it using the  -trusty load_dataset() function! Just provide   - -22 -00:01:53,680 --> 00:02:04,000 -the name of your repository and a data_files  -argument for the files and you're good to go! +1 +00:00:00,000 --> 00:00:02,917 +(transition music) + +2 +00:00:05,490 --> 00:00:07,950 +- Uploading a dataset to the hub. + +3 +00:00:07,950 --> 00:00:09,060 +In this video, we'll take a look + +4 +00:00:09,060 --> 00:00:10,860 +at how you can upload +your very own dataset + +5 +00:00:10,860 --> 00:00:12,060 +to the Hugging Face Hub. + +6 +00:00:13,680 --> 00:00:14,670 +The first thing you need to do + +7 +00:00:14,670 --> 00:00:17,400 +is create a new dataset +repository on the hub. + +8 +00:00:17,400 --> 00:00:19,260 +So, just click on your profile icon + +9 +00:00:19,260 --> 00:00:21,750 +and select the New Dataset button. + +10 +00:00:21,750 --> 00:00:24,750 +Next, we need to assign +an owner of the dataset. + +11 +00:00:24,750 --> 00:00:26,970 +By default, this will be your hub account, + +12 +00:00:26,970 --> 00:00:28,170 +but you can also create datasets + +13 +00:00:28,170 --> 00:00:30,585 +under any organization that you belong to. + +14 +00:00:30,585 --> 00:00:33,780 +Then, we just need to give +the dataset a good name + +15 +00:00:33,780 --> 00:00:36,513 +and specify whether it is a +public or private dataset. + +16 +00:00:37,410 --> 00:00:39,810 +Public datasets can be accessed by anyone + +17 +00:00:39,810 --> 00:00:41,670 +while private datasets +can only be accessed + +18 +00:00:41,670 --> 00:00:43,653 +by you or members of your organization. + +19 +00:00:44,580 --> 00:00:47,280 +And with that, we can go +ahead and create the dataset. + +20 +00:00:48,690 --> 00:00:51,060 +Now that you have an empty +dataset repository on the hub, + +21 +00:00:51,060 --> 00:00:53,880 +the next thing to do is +add some actual data to it. + +22 +00:00:53,880 --> 00:00:55,050 +You can do this with git, + +23 +00:00:55,050 --> 00:00:57,960 +but the easiest way is by +selecting the Upload file button. + +24 +00:00:57,960 --> 00:00:59,160 +And then, you can just go ahead + +25 +00:00:59,160 --> 00:01:02,243 +and upload the files +directly from your machine. 
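Once your files are up, loading them back is a one-liner; the repository id and the file names below are placeholders for whatever you actually uploaded:

from datasets import load_dataset

dataset = load_dataset(
    "my-username/my-dataset",
    data_files={"train": "train.csv", "test": "test.csv"},
)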
+ +26 +00:01:02,243 --> 00:01:03,846 +After you've uploaded your files, + +27 +00:01:03,846 --> 00:01:05,670 +you'll see them appear in the repository + +28 +00:01:05,670 --> 00:01:07,320 +under the Files and versions tab. + +29 +00:01:08,550 --> 00:01:11,370 +The last step is to create a dataset card. + +30 +00:01:11,370 --> 00:01:13,590 +Well-documented datasets +are more likely to be useful + +31 +00:01:13,590 --> 00:01:15,600 +to others as they provide +the context to decide + +32 +00:01:15,600 --> 00:01:17,370 +whether the dataset is relevant + +33 +00:01:17,370 --> 00:01:18,450 +or whether there are any biases + +34 +00:01:18,450 --> 00:01:20,673 +or risks associated +with using the dataset. + +35 +00:01:21,540 --> 00:01:22,710 +On the Hugging Face Hub, + +36 +00:01:22,710 --> 00:01:25,650 +this information is stored in +each repositories README file. + +37 +00:01:25,650 --> 00:01:27,988 +There are two main steps +that you should take. + +38 +00:01:27,988 --> 00:01:30,651 +First, you need to create some metadata + +39 +00:01:30,651 --> 00:01:32,010 +that will allow your dataset + +40 +00:01:32,010 --> 00:01:34,590 +to be easily found by others on the hub. + +41 +00:01:34,590 --> 00:01:35,670 +You can create this metadata + +42 +00:01:35,670 --> 00:01:37,860 +using the datasets tagging application, + +43 +00:01:37,860 --> 00:01:40,620 +which we'll link to in +the video description. + +44 +00:01:40,620 --> 00:01:42,240 +Once you've created the metadata, + +45 +00:01:42,240 --> 00:01:44,190 +you can fill out the +rest of the dataset card, + +46 +00:01:44,190 --> 00:01:45,240 +and we provide a template + +47 +00:01:45,240 --> 00:01:47,090 +that we'll also link to in the video. + +48 +00:01:48,480 --> 00:01:50,280 +And once your dataset is on the hub, + +49 +00:01:50,280 --> 00:01:53,400 +you can load it using the +trusty load_dataset function. + +50 +00:01:53,400 --> 00:01:55,015 +Just provide the name of your repository + +51 +00:01:55,015 --> 00:01:57,843 +and a data_files argument, +and you're good to go. + +52 +00:01:59,619 --> 00:02:02,536 +(transition music) + diff --git a/subtitles/en/41_text-embeddings-&-semantic-search.srt b/subtitles/en/41_text-embeddings-&-semantic-search.srt index 128d54a4f..51c9d9b29 100644 --- a/subtitles/en/41_text-embeddings-&-semantic-search.srt +++ b/subtitles/en/41_text-embeddings-&-semantic-search.srt @@ -1,184 +1,368 @@ -1 -00:00:05,520 --> 00:00:11,200 -Text embeddings and semantic search. In this video  -we’ll explore how Transformer models represent   - -2 -00:00:11,200 --> 00:00:15,920 -text as embedding vectors and how these vectors  -can be used to find similar documents in a corpus.   - -3 -00:00:17,520 --> 00:00:22,000 -Text embeddings are just a fancy way of saying  -that we can represent text as an array of numbers   - -4 -00:00:22,000 --> 00:00:27,120 -called a vector. To create these embeddings we  -usually use an encoder-based model like BERT.   - -5 -00:00:28,320 --> 00:00:32,320 -In this example, you can see how we feed  -three sentences to the encoder and get   - -6 -00:00:32,320 --> 00:00:36,400 -three vectors as the output. Reading  -the text, we can see that walking the   - -7 -00:00:36,400 --> 00:00:40,880 -dog seems to be most similar to walking the  -cat, but let's see if we can quantify this!   - -8 -00:00:42,560 --> 00:00:46,080 -The trick to do the comparison is to  -compute a similarity metric between each   - -9 -00:00:46,080 --> 00:00:50,880 -pair of embedding vectors. 
These vectors  -usually live in a high-dimensional space,   - -10 -00:00:50,880 --> 00:00:54,640 -so a similarity metric can be anything that  -measures some sort of distance between vectors.   - -11 -00:00:55,520 --> 00:01:00,560 -One popular metric is cosine similarity, which  -uses the angle between two vectors to measure   - -12 -00:01:00,560 --> 00:01:06,160 -how close they are. In this example, our embedding  -vectors live in 3D and we can see that the orange   - -13 -00:01:06,160 --> 00:01:12,080 -and grey vectors are close to each other and have  -a smaller angle. Now one problem we have to deal   - -14 -00:01:12,080 --> 00:01:16,640 -with is that Transformer models like BERT will  -actually return one embedding vector per token.   - -15 -00:01:17,680 --> 00:01:22,560 -For example in the sentence "I took my dog for a  -walk", we can expect several embedding vectors,   - -16 -00:01:22,560 --> 00:01:28,880 -one for each word. For example, here we can see  -the output of our model has produced 9 embedding   - -17 -00:01:28,880 --> 00:01:35,200 -vectors per sentence, and each vector has 384  -dimensions. But what we really want is a single   - -18 -00:01:35,200 --> 00:01:41,040 -embedding vector for the whole sentence. To deal  -with this, we can use a technique called pooling.   - -19 -00:01:41,760 --> 00:01:45,840 -The simplest pooling method is to just  -take the token embedding of the CLS token.   - -20 -00:01:46,880 --> 00:01:50,160 -Alternatively, we can average the  -token embeddings which is called   - -21 -00:01:50,160 --> 00:01:56,400 -mean pooling. With mean pooling only thing  -we need to make sure is that we don't include   - -22 -00:01:56,400 --> 00:02:00,640 -the padding tokens in the average, which is why  -you can see the attention mask being used here.   - -23 -00:02:01,680 --> 00:02:07,160 -This now gives us one 384 dimensional vector  -per sentence which is exactly what we want! And   - -24 -00:02:07,840 --> 00:02:12,240 -once we have our sentence embeddings, we can  -compute the cosine similarity for each pair of   - -25 -00:02:12,240 --> 00:02:17,520 -vectors. In this example we use the function from  -scikit-learn and you can see that the sentence "I   - -26 -00:02:17,520 --> 00:02:22,400 -took my dog for a walk" has an overlap of 0.83  -with "I took my cat for a walk". Hooray! We   - -27 -00:02:25,040 --> 00:02:29,600 -can take this idea one step further by comparing  -the similarity between a question and a corpus   - -28 -00:02:29,600 --> 00:02:36,000 -of documents. For example, suppose we embed every  -post in the Hugging Face forums. We can then ask a   - -29 -00:02:36,000 --> 00:02:41,600 -question, embed it, and check which forum posts  -are most similar. This process is often called   - -30 -00:02:41,600 --> 00:02:48,000 -semantic search, because it allows us to compare  -queries with context. To create a semantic search   - -31 -00:02:48,000 --> 00:02:54,400 -engine is quite simple in Datasets. First we  -need to embed all the documents. In this example,   - -32 -00:02:54,400 --> 00:02:59,120 -we take a small sample from the SQUAD dataset  -and apply the same embedding logic as before.   - -33 -00:03:00,000 --> 00:03:03,840 -This gives us a new column called "embeddings"  -that stores the embedding of every passage.   - -34 -00:03:05,680 --> 00:03:09,280 -Once we have our embeddings, we need a  -way to find nearest neighbours to a query.   
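A compact sketch of the pipeline described so far is shown below. The checkpoint is an assumption (any encoder that produces 384-dimensional token embeddings matches the numbers quoted in the video), and the sentences simply mirror the walking-the-dog example, with an arbitrary filler sentence in the middle:

import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer

ckpt = "sentence-transformers/all-MiniLM-L6-v2"   # assumed checkpoint
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModel.from_pretrained(ckpt)

sentences = [
    "I took my dog for a walk",
    "Today is going to rain",
    "I took my cat for a walk",
]
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    token_embeddings = model(**inputs).last_hidden_state   # [batch, seq_len, hidden]

# Mean pooling: average the token embeddings, masking out the padding positions
mask = inputs["attention_mask"].unsqueeze(-1).float()
sentence_embeddings = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1)

# Pairwise cosine similarities between the three sentences
print(cosine_similarity(sentence_embeddings.numpy()))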
- -35 -00:03:10,080 --> 00:03:14,320 -Datasets provides a special object called a  -FAISS index that allows you to quickly compare   - -36 -00:03:14,320 --> 00:03:18,880 -embedding vectors. So we add the  -FAISS index, embed a question and   - -37 -00:03:18,880 --> 00:03:29,360 -voila! we've now found the 3 most similar  -articles which might store the answer. +1 +00:00:00,621 --> 00:00:03,204 +(upbeat music) + +2 +00:00:05,670 --> 00:00:08,520 +- Text embeddings and semantic search. + +3 +00:00:08,520 --> 00:00:10,770 +In this video we'll explore +how Transformer models + +4 +00:00:10,770 --> 00:00:12,810 +represent text as embedding vectors + +5 +00:00:12,810 --> 00:00:15,420 +and how these vectors can be +used to find similar documents + +6 +00:00:15,420 --> 00:00:16,293 +in a corpus. + +7 +00:00:17,730 --> 00:00:19,890 +Text embeddings are just +a fancy way of saying + +8 +00:00:19,890 --> 00:00:22,170 +that we can represent text +as an array of numbers + +9 +00:00:22,170 --> 00:00:23,640 +called a vector. + +10 +00:00:23,640 --> 00:00:25,710 +To create these embeddings we usually use + +11 +00:00:25,710 --> 00:00:27,393 +an encoder-based model like BERT. + +12 +00:00:28,530 --> 00:00:31,290 +In this example, you can see +how we feed three sentences + +13 +00:00:31,290 --> 00:00:34,830 +to the encoder and get +three vectors as the output. + +14 +00:00:34,830 --> 00:00:37,050 +Reading the text, we can +see that walking the dog + +15 +00:00:37,050 --> 00:00:39,450 +seems to be most similar +to walking the cat, + +16 +00:00:39,450 --> 00:00:41,350 +but let's see if we can quantify this. + +17 +00:00:42,810 --> 00:00:44,040 +The trick to do the comparison + +18 +00:00:44,040 --> 00:00:45,630 +is to compute a similarity metric + +19 +00:00:45,630 --> 00:00:48,210 +between each pair of embedding vectors. + +20 +00:00:48,210 --> 00:00:51,120 +These vectors usually live in +a very high-dimensional space, + +21 +00:00:51,120 --> 00:00:53,190 +so a similarity metric can +be anything that measures + +22 +00:00:53,190 --> 00:00:55,740 +some sort of distance between vectors. + +23 +00:00:55,740 --> 00:00:58,560 +One very popular metric +is cosine similarity, + +24 +00:00:58,560 --> 00:01:00,390 +which uses the angle between two vectors + +25 +00:01:00,390 --> 00:01:02,610 +to measure how close they are. + +26 +00:01:02,610 --> 00:01:05,250 +In this example, our +embedding vectors live in 3D + +27 +00:01:05,250 --> 00:01:07,110 +and we can see that the +orange and Grey vectors + +28 +00:01:07,110 --> 00:01:09,560 +are close to each other +and have a smaller angle. + +29 +00:01:11,130 --> 00:01:12,510 +Now one problem we have to deal with + +30 +00:01:12,510 --> 00:01:15,180 +is that Transformer models +like BERT will actually return + +31 +00:01:15,180 --> 00:01:16,983 +one embedding vector per token. + +32 +00:01:17,880 --> 00:01:20,700 +For example in the sentence, +"I took my dog for a walk," + +33 +00:01:20,700 --> 00:01:23,853 +we can expect several embedding +vectors, one for each word. + +34 +00:01:25,110 --> 00:01:27,870 +For example, here we can +see the output of our model + +35 +00:01:27,870 --> 00:01:30,540 +has produced 9 embedding +vectors per sentence, + +36 +00:01:30,540 --> 00:01:33,750 +and each vector has 384 dimensions. + +37 +00:01:33,750 --> 00:01:36,210 +But what we really want is +a single embedding vector + +38 +00:01:36,210 --> 00:01:37,353 +for each sentence. + +39 +00:01:38,940 --> 00:01:42,060 +To deal with this, we can use +a technique called pooling. 
+ +40 +00:01:42,060 --> 00:01:43,050 +The simplest pooling method + +41 +00:01:43,050 --> 00:01:44,520 +is to just take the token embedding + +42 +00:01:44,520 --> 00:01:46,203 +of the special CLS token. + +43 +00:01:47,100 --> 00:01:49,650 +Alternatively, we can +average the token embeddings + +44 +00:01:49,650 --> 00:01:52,500 +which is called mean pooling +and this is what we do here. + +45 +00:01:53,370 --> 00:01:55,800 +With mean pooling the only +thing we need to make sure + +46 +00:01:55,800 --> 00:01:58,410 +is that we don't include the +padding tokens in the average, + +47 +00:01:58,410 --> 00:02:01,860 +which is why you can see the +attention mask being used here. + +48 +00:02:01,860 --> 00:02:05,100 +This gives us a 384 dimensional +vector for each sentence + +49 +00:02:05,100 --> 00:02:06,600 +which is exactly what we want. + +50 +00:02:07,920 --> 00:02:09,810 +And once we have our sentence embeddings, + +51 +00:02:09,810 --> 00:02:11,730 +we can compute the cosine similarity + +52 +00:02:11,730 --> 00:02:13,113 +for each pair of vectors. + +53 +00:02:13,993 --> 00:02:16,350 +In this example we use the +function from scikit-learn + +54 +00:02:16,350 --> 00:02:19,140 +and you can see that the sentence +"I took my dog for a walk" + +55 +00:02:19,140 --> 00:02:22,140 +has indeed a strong overlap +with "I took my cat for a walk". + +56 +00:02:22,140 --> 00:02:23,240 +Hooray! We've done it. + +57 +00:02:25,110 --> 00:02:27,180 +We can actually take this +idea one step further + +58 +00:02:27,180 --> 00:02:29,220 +by comparing the similarity +between a question + +59 +00:02:29,220 --> 00:02:31,170 +and a corpus of documents. + +60 +00:02:31,170 --> 00:02:33,810 +For example, suppose we embed every post + +61 +00:02:33,810 --> 00:02:35,430 +in the Hugging Face forums. + +62 +00:02:35,430 --> 00:02:37,800 +We can then ask a question, embed it, + +63 +00:02:37,800 --> 00:02:40,590 +and check which forum +posts are most similar. + +64 +00:02:40,590 --> 00:02:42,750 +This process is often +called semantic search, + +65 +00:02:42,750 --> 00:02:45,423 +because it allows us to +compare queries with context. + +66 +00:02:47,040 --> 00:02:48,450 +To create a semantic search engine + +67 +00:02:48,450 --> 00:02:51,030 +is actually quite simple +in the datasets library. + +68 +00:02:51,030 --> 00:02:53,340 +First we need to embed all the documents. + +69 +00:02:53,340 --> 00:02:56,070 +And in this example, +we take a small sample + +70 +00:02:56,070 --> 00:02:57,780 +from the SQUAD dataset and apply + +71 +00:02:57,780 --> 00:03:00,180 +the same embedding logic as before. + +72 +00:03:00,180 --> 00:03:02,280 +This gives us a new +column called embeddings, + +73 +00:03:02,280 --> 00:03:04,530 +which stores the embeddings +of every passage. + +74 +00:03:05,880 --> 00:03:07,260 +Once we have our embeddings, + +75 +00:03:07,260 --> 00:03:10,200 +we need a way to find nearest +neighbors for a query. + +76 +00:03:10,200 --> 00:03:13,170 +The datasets library provides +a special object called FAISS + +77 +00:03:13,170 --> 00:03:16,080 +which allows you to quickly +compare embedding vectors. + +78 +00:03:16,080 --> 00:03:19,950 +So we add the FAISS index, +embed a question and voila, + +79 +00:03:19,950 --> 00:03:21,870 +we've now found the 3 +most similar articles + +80 +00:03:21,870 --> 00:03:23,320 +which might store the answer. 
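And here is a self-contained sketch of the FAISS part, reusing the same mean-pooling idea. The checkpoint, the toy passages and the question are all made up for illustration, and you will need faiss installed (for example, pip install faiss-cpu):

import torch
from datasets import Dataset
from transformers import AutoModel, AutoTokenizer

ckpt = "sentence-transformers/all-MiniLM-L6-v2"   # same assumed checkpoint as above
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModel.from_pretrained(ckpt)

def embed(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        token_embeddings = model(**inputs).last_hidden_state
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    return ((token_embeddings * mask).sum(dim=1) / mask.sum(dim=1)).numpy()

# A tiny stand-in corpus; in the video this would be SQuAD passages or forum posts
corpus = Dataset.from_dict({"text": [
    "You can load any dataset from the Hub with load_dataset.",
    "Mean pooling averages token embeddings into one sentence vector.",
    "FAISS lets you search embedding vectors very quickly.",
]})
embeddings_dataset = corpus.map(lambda x: {"embeddings": embed([x["text"]])[0]})

# Add a FAISS index on the embeddings column, embed a question and query it
embeddings_dataset.add_faiss_index(column="embeddings")
question_embedding = embed(["How do I search embeddings quickly?"])
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=3
)
print(samples["text"])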
+ +81 +00:03:25,182 --> 00:03:27,849 +(upbeat music) + diff --git a/subtitles/en/42_training-a-new-tokenizer.srt b/subtitles/en/42_training-a-new-tokenizer.srt index 00255f7b1..d2d2ded51 100644 --- a/subtitles/en/42_training-a-new-tokenizer.srt +++ b/subtitles/en/42_training-a-new-tokenizer.srt @@ -1,322 +1,512 @@ -1 -00:00:05,310 --> 00:00:12,220 -In this video we will see together what is -the purpose of training a tokenizer, what - -2 -00:00:12,220 --> 00:00:18,770 -are the key steps to follow and what is the -easiest way to do it. - -3 -00:00:18,770 --> 00:00:23,039 -You will ask yourself the question "Should -I train a new tokenizer?" - -4 -00:00:23,039 --> 00:00:27,369 -when you plan to train a new model from scratch. - -5 -00:00:27,369 --> 00:00:36,600 -A trained tokenizer would not be suitable -for your corpus if your corpus is in a different - -6 -00:00:36,600 --> 00:00:43,640 -language, uses new characters such as accents -or upper cased letters, has a specific vocabulary, - -7 -00:00:43,640 --> 00:00:50,980 -for example medical or legal, or uses a different -style, a language from another century for - -8 -00:00:50,980 --> 00:00:51,989 -instance. - -9 -00:00:51,989 --> 00:01:00,719 -For example, if I take the tokenizer trained -for the bert-base-uncased model and ignore - -10 -00:01:00,719 --> 00:01:08,580 -its normalization step then we can see that -the tokenization operation on the English - -11 -00:01:08,580 --> 00:01:14,310 -sentence "here is a sentence adapted to our -tokenizer" produces a rather satisfactory - -12 -00:01:14,310 --> 00:01:20,820 -list of tokens in the sense that this sentence -of 8 words is tokenized into 9 tokens. - -13 -00:01:20,820 --> 00:01:29,909 -On the other hand if we use this same tokenizer -on a sentence in Bengali, we see that either - -14 -00:01:29,909 --> 00:01:36,320 -a word is divided into many sub tokens or -that the tokenizer does not know one of the - -15 -00:01:36,320 --> 00:01:41,359 -unicode characters and returns only an unknown -token. - -16 -00:01:41,359 --> 00:01:47,350 -The fact that a "common" word is split into -many tokens can be problematic because language - -17 -00:01:47,350 --> 00:01:52,750 -models can only handle a sequence of tokens -of limited length. - -18 -00:01:52,750 --> 00:01:59,290 -A tokenizer that excessively splits your initial -text may even impact the performance of your - -19 -00:01:59,290 --> 00:02:00,290 -model. - -20 -00:02:00,290 --> 00:02:05,060 -Unknown tokens are also problematic because -the model will not be able to extract any - -21 -00:02:05,060 --> 00:02:11,440 -information from the "unknown" part of the -text. - -22 -00:02:11,440 --> 00:02:16,910 -In this other example, we can see that the -tokenizer replaces words containing characters - -23 -00:02:16,910 --> 00:02:21,230 -with accents and capital letters with unknown -tokens. - -24 -00:02:21,230 --> 00:02:28,140 -Finally, if we use again this tokenizer to -tokenize medical vocabulary we see again that - -25 -00:02:28,140 --> 00:02:37,349 -a single word is divided into many sub tokens: -4 for "paracetamol" and "pharyngitis". - -26 -00:02:37,349 --> 00:02:42,050 -Most of the tokenizers used by the current -state of the art language models need to be - -27 -00:02:42,050 --> 00:02:48,160 -trained on a corpus that is similar to the -one used to pre-train the language model. 
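If you want to check this on your own corpus, the quickest test is simply to tokenize a few representative strings with the pretrained tokenizer and look at how fragmented the output is; the expectations in the comments follow the video's examples:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Domain-specific vocabulary tends to be split into several word pieces
print(tokenizer.tokenize("paracetamol"))   # expect a handful of sub-tokens, not one
print(tokenizer.tokenize("pharyngitis"))

# Text in a script the vocabulary has never seen is heavily split
# or mapped to the unknown token
print(tokenizer.tokenize("এটি একটি বাক্য"))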
- -28 -00:02:48,160 --> 00:02:54,390 -This training consists in learning rules to -divide the text into tokens and the way to - -29 -00:02:54,390 --> 00:03:00,510 -learn these rules and use them depends on -the chosen tokenizer model. - -30 -00:03:00,510 --> 00:03:06,710 -Thus, to train a new tokenizer it is first -necessary to build a training corpus composed - -31 -00:03:06,710 --> 00:03:09,239 -of raw texts. - -32 -00:03:09,239 --> 00:03:13,440 -Then, you have to choose an architecture for -your tokenizer. - -33 -00:03:13,440 --> 00:03:19,640 -Here there are two options: the simplest is -to reuse the same architecture as the one - -34 -00:03:19,640 --> 00:03:26,760 -of a tokenizer used by another model already -trained,otherwise it is also possible to completely - -35 -00:03:26,760 --> 00:03:33,950 -design your tokenizer but it requires more -experience and attention. - -36 -00:03:33,950 --> 00:03:39,620 -Once the architecture is chosen, one can thus -train this tokenizer on your constituted corpus. - -37 -00:03:39,620 --> 00:03:44,870 -Finally, the last thing that you need to do -is to save the learned rules to be able to - -38 -00:03:44,870 --> 00:03:49,780 -use this tokenizer which is now ready to be -used. - -39 -00:03:49,780 --> 00:03:55,120 -Let's take an example: let's say you want -to train a GPT-2 model on Python code. - -40 -00:03:55,120 --> 00:04:03,000 -Even if the python code is in English this -type of text is very specific and deserves - -41 -00:04:03,000 --> 00:04:09,800 -a tokenizer trained on it - to convince you -of this we will see at the end the difference - -42 -00:04:09,800 --> 00:04:11,319 -produced on an example. - -43 -00:04:11,319 --> 00:04:18,889 -For that we are going to use the method "train_new_from_iterator" -that all the fast tokenizers of the library - -44 -00:04:18,889 --> 00:04:22,530 -have and thus in particular GPT2TokenizerFast. - -45 -00:04:22,530 --> 00:04:28,389 -This is the simplest method in our case to -have a tokenizer adapted to python code. - -46 -00:04:28,389 --> 00:04:34,229 -Remember, the first step is to gather a training -corpus. - -47 -00:04:34,229 --> 00:04:39,639 -We will use a subpart of the CodeSearchNet -dataset containing only python functions from - -48 -00:04:39,639 --> 00:04:42,039 -open source libraries on Github. - -49 -00:04:42,039 --> 00:04:48,890 -It's good timing, this dataset is known by -the datasets library and we can load it in - -50 -00:04:48,890 --> 00:04:51,190 -two lines of code. - -51 -00:04:51,190 --> 00:04:57,940 -Then, as the "train_new_from_iterator" method -expects a iterator of lists of texts we create - -52 -00:04:57,940 --> 00:05:04,030 -the "get_training_corpus" function which will -return an iterator. - -53 -00:05:04,030 --> 00:05:10,861 -Now that we have our iterator on our python -functions corpus, we can load the gpt-2 tokenizer - -54 -00:05:10,861 --> 00:05:12,490 -architecture. - -55 -00:05:12,490 --> 00:05:19,450 -Here "old_tokenizer" is not adapted to our -corpus but we only need one more line to train - -56 -00:05:19,450 --> 00:05:21,850 -it on our new corpus. - -57 -00:05:21,850 --> 00:05:29,310 -An argument that is common to most of the -tokenization algorithms used at the moment - -58 -00:05:29,310 --> 00:05:33,370 -is the size of the vocabulary, we choose here -the value 52 thousand. 
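In code, the whole procedure fits in a few lines. The dataset id, config and column name below are assumptions about how to grab the Python subset of CodeSearchNet, so adapt them to the corpus you actually use:

from datasets import load_dataset
from transformers import AutoTokenizer

raw_datasets = load_dataset("code_search_net", "python")

def get_training_corpus():
    # Yield the raw Python functions in batches of 1,000 texts
    train = raw_datasets["train"]
    for start_idx in range(0, len(train), 1000):
        yield train[start_idx : start_idx + 1000]["whole_func_string"]

old_tokenizer = AutoTokenizer.from_pretrained("gpt2")   # loads the fast GPT-2 tokenizer
new_tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(), 52000)

# Save locally, or push to the Hub to reuse it later
new_tokenizer.save_pretrained("code-search-net-tokenizer")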
- -59 -00:05:33,370 --> 00:05:38,780 -Finally, once the training is finished, we -just have to save our new tokenizer locally - -60 -00:05:38,780 --> 00:05:45,430 -or send it to the hub to be able to reuse -it very easily afterwards. - -61 -00:05:45,430 --> 00:05:49,962 -Finally, let's see together on an example -if it was useful to re-train a tokenizer similar - -62 -00:05:49,962 --> 00:05:55,259 -to gpt2 one. - -63 -00:05:55,259 --> 00:06:01,610 -With the original tokenizer of GPT-2 we see -that all spaces are isolated and the method - -64 -00:06:01,610 --> 00:06:05,860 -name "randn" relatively common in python code -is split in 2. - -65 -00:06:05,860 --> 00:06:10,919 -With our new tokenizer, single and double -indentations have been learned and the method - -66 -00:06:10,919 --> 00:06:13,410 -"randn" is tokenized into 1 token. - -67 -00:06:13,410 --> 00:06:23,190 -And with that, you now know how to train your -very own tokenizers! +1 +00:00:00,000 --> 00:00:02,667 +(air whooshing) + +2 +00:00:05,310 --> 00:00:08,700 +- In this video we will see together + +3 +00:00:08,700 --> 00:00:11,820 +what is the purpose of +training a tokenizer, + +4 +00:00:11,820 --> 00:00:14,400 +what are the key steps to follow, + +5 +00:00:14,400 --> 00:00:16,953 +and what is the easiest way to do it. + +6 +00:00:18,690 --> 00:00:20,677 +You will ask yourself the question, + +7 +00:00:20,677 --> 00:00:23,040 +"Should I train a new tokenizer?", + +8 +00:00:23,040 --> 00:00:25,773 +when you plan to train a +new model from scratch. + +9 +00:00:29,520 --> 00:00:34,020 +A trained tokenizer would not +be suitable for your corpus + +10 +00:00:34,020 --> 00:00:37,080 +if your corpus is in a different language, + +11 +00:00:37,080 --> 00:00:42,060 +uses new characters, such as +accents or upper cased letters, + +12 +00:00:42,060 --> 00:00:47,060 +has a specific vocabulary, +for example medical or legal, + +13 +00:00:47,100 --> 00:00:49,050 +or uses a different style, + +14 +00:00:49,050 --> 00:00:51,873 +a language from another +century for example. + +15 +00:00:56,490 --> 00:00:58,320 +If I take the tokenizer trained on + +16 +00:00:58,320 --> 00:01:00,780 +the bert-base-uncased model, + +17 +00:01:00,780 --> 00:01:03,213 +and ignore its normalization step, + +18 +00:01:04,260 --> 00:01:07,650 +then we can see that the +tokenization operation + +19 +00:01:07,650 --> 00:01:09,277 +on the English sentence, + +20 +00:01:09,277 --> 00:01:12,480 +"Here is a sentence +adapted to our tokenizer", + +21 +00:01:12,480 --> 00:01:15,600 +produces a rather +satisfactory list of tokens, + +22 +00:01:15,600 --> 00:01:18,510 +in the sense that this +sentence of eight words + +23 +00:01:18,510 --> 00:01:20,643 +is tokenized into nine tokens. + +24 +00:01:22,920 --> 00:01:26,340 +On the other hand, if I +use this same tokenizer + +25 +00:01:26,340 --> 00:01:29,370 +on a sentence in Bengali, we see that + +26 +00:01:29,370 --> 00:01:33,690 +either a word is divided +into many sub tokens, + +27 +00:01:33,690 --> 00:01:36,270 +or that the tokenizer does not know one of + +28 +00:01:36,270 --> 00:01:39,873 +the unicode characters and +returns only an unknown token. + +29 +00:01:41,220 --> 00:01:44,970 +The fact that a common word +is split into many tokens + +30 +00:01:44,970 --> 00:01:47,910 +can be problematic, +because language models + +31 +00:01:47,910 --> 00:01:51,903 +can only handle a sequence +of tokens of limited length. 
+ +32 +00:01:52,830 --> 00:01:55,830 +A tokenizer that excessively +splits your initial text + +33 +00:01:55,830 --> 00:01:58,503 +may even impact the +performance of your model. + +34 +00:01:59,760 --> 00:02:02,280 +Unknown tokens are also problematic, + +35 +00:02:02,280 --> 00:02:04,530 +because the model will +not be able to extract + +36 +00:02:04,530 --> 00:02:07,563 +any information from the +unknown part of the text. + +37 +00:02:11,430 --> 00:02:13,440 +In this other example, we can see that + +38 +00:02:13,440 --> 00:02:17,100 +the tokenizer replaces +words containing characters + +39 +00:02:17,100 --> 00:02:20,973 +with accents and capital +letters with unknown tokens. + +40 +00:02:22,050 --> 00:02:24,770 +Finally, if we use again this tokenizer + +41 +00:02:24,770 --> 00:02:28,170 +to tokenize medical +vocabulary, we see again that + +42 +00:02:28,170 --> 00:02:31,800 +a single word is divided +into many sub tokens, + +43 +00:02:31,800 --> 00:02:34,803 +four for paracetamol, +and four for pharyngitis. + +44 +00:02:37,110 --> 00:02:39,360 +Most of the tokenizers used by the current + +45 +00:02:39,360 --> 00:02:42,540 +state of the art language +models need to be trained + +46 +00:02:42,540 --> 00:02:45,360 +on a corpus that is +similar to the one used + +47 +00:02:45,360 --> 00:02:47,463 +to pre-train the language model. + +48 +00:02:49,140 --> 00:02:51,150 +This training consists in learning rules + +49 +00:02:51,150 --> 00:02:53,250 +to divide the text into tokens. + +50 +00:02:53,250 --> 00:02:56,160 +And the way to learn +these rules and use them + +51 +00:02:56,160 --> 00:02:58,233 +depends on the chosen tokenizer model. + +52 +00:03:00,630 --> 00:03:04,590 +Thus, to train a new tokenizer, +it is first necessary + +53 +00:03:04,590 --> 00:03:07,653 +to build a training corpus +composed of raw texts. + +54 +00:03:08,910 --> 00:03:12,423 +Then, you have to choose an +architecture for your tokenizer. + +55 +00:03:13,410 --> 00:03:14,763 +Here there are two options. + +56 +00:03:15,900 --> 00:03:19,710 +The simplest is to reuse the +same architecture as the one + +57 +00:03:19,710 --> 00:03:22,863 +of a tokenizer used by +another model already trained. + +58 +00:03:24,210 --> 00:03:25,980 +Otherwise it is also possible + +59 +00:03:25,980 --> 00:03:28,560 +to completely design your tokenizer. + +60 +00:03:28,560 --> 00:03:31,683 +But it requires more +experience and attention. + +61 +00:03:33,750 --> 00:03:36,660 +Once the architecture +is chosen, you can thus + +62 +00:03:36,660 --> 00:03:39,513 +train this tokenizer on +your constituted corpus. + +63 +00:03:40,650 --> 00:03:43,440 +Finally, the last thing that +you need to do is to save + +64 +00:03:43,440 --> 00:03:46,443 +the learned rules to be +able to use this tokenizer. + +65 +00:03:49,530 --> 00:03:51,330 +Let's take an example. + +66 +00:03:51,330 --> 00:03:54,873 +Let's say you want to train +a GPT-2 model on Python code. + +67 +00:03:56,160 --> 00:03:59,640 +Even if the Python code +is usually in English + +68 +00:03:59,640 --> 00:04:02,386 +this type of text is very specific, + +69 +00:04:02,386 --> 00:04:04,473 +and deserves a tokenizer trained on it. + +70 +00:04:05,340 --> 00:04:07,980 +To convince you of this, +we will see at the end + +71 +00:04:07,980 --> 00:04:10,023 +the difference produced on an example. 
+ +72 +00:04:11,400 --> 00:04:13,747 +For that we are going to use the method + +73 +00:04:13,747 --> 00:04:18,240 +"train_new_from_iterator" +that all the fast tokenizers + +74 +00:04:18,240 --> 00:04:20,040 +of the library have and thus, + +75 +00:04:20,040 --> 00:04:22,503 +in particular GPT2TokenizerFast. + +76 +00:04:23,880 --> 00:04:26,100 +This is the simplest method in our case + +77 +00:04:26,100 --> 00:04:28,983 +to have a tokenizer +adapted to Python code. + +78 +00:04:30,180 --> 00:04:34,140 +Remember, the first thing is +to gather a training corpus. + +79 +00:04:34,140 --> 00:04:37,320 +We will use a subpart of +the CodeSearchNet dataset + +80 +00:04:37,320 --> 00:04:39,360 +containing only Python functions + +81 +00:04:39,360 --> 00:04:42,360 +from open source libraries on Github. + +82 +00:04:42,360 --> 00:04:43,650 +It's good timing. + +83 +00:04:43,650 --> 00:04:46,980 +This dataset is known +by the datasets library + +84 +00:04:46,980 --> 00:04:49,203 +and we can load it in two lines of code. + +85 +00:04:50,760 --> 00:04:55,230 +Then, as the "train_new_from_iterator" +method expects + +86 +00:04:55,230 --> 00:04:57,150 +a iterator of lists of texts, + +87 +00:04:57,150 --> 00:04:59,970 +we create the +"get_training_corpus" function, + +88 +00:04:59,970 --> 00:05:01,743 +which will return an iterator. + +89 +00:05:03,870 --> 00:05:05,430 +Now that we have our iterator + +90 +00:05:05,430 --> 00:05:09,630 +on our Python functions +corpus, we can load + +91 +00:05:09,630 --> 00:05:12,351 +the GPT-2 tokenizer architecture. + +92 +00:05:12,351 --> 00:05:16,560 +Here old_tokenizer is not +adapted to our corpus. + +93 +00:05:16,560 --> 00:05:17,700 +But we only need + +94 +00:05:17,700 --> 00:05:20,733 +one more line to train +it on our new corpus. + +95 +00:05:21,780 --> 00:05:24,720 +An argument that is common +to most of the tokenization + +96 +00:05:24,720 --> 00:05:28,980 +algorithms used at the moment +is the size of the vocabulary. + +97 +00:05:28,980 --> 00:05:31,773 +We choose here the value 52,000. + +98 +00:05:32,820 --> 00:05:35,760 +Finally, once the training is finished, + +99 +00:05:35,760 --> 00:05:38,850 +we just have to save our +new tokenizer locally, + +100 +00:05:38,850 --> 00:05:41,730 +or send it to the hub +to be able to reuse it + +101 +00:05:41,730 --> 00:05:43,593 +very easily afterwards. + +102 +00:05:45,270 --> 00:05:48,990 +Finally, let's see together +on an example if it was useful + +103 +00:05:48,990 --> 00:05:53,073 +to re-train a tokenizer +similar to GPT-2 one. + +104 +00:05:55,110 --> 00:05:57,660 +With the original tokenizer of GPT-2 + +105 +00:05:57,660 --> 00:06:00,330 +we see that all spaces are isolated, + +106 +00:06:00,330 --> 00:06:01,920 +and the method name randn, + +107 +00:06:01,920 --> 00:06:04,833 +relatively common in Python +code, is split in two. + +108 +00:06:05,730 --> 00:06:09,060 +With our new tokenizer, +single and double indentations + +109 +00:06:09,060 --> 00:06:10,890 +have been learned and the method randn + +110 +00:06:10,890 --> 00:06:13,770 +is tokenized into one token. + +111 +00:06:13,770 --> 00:06:15,000 +And with that, + +112 +00:06:15,000 --> 00:06:18,123 +you now know how to train +your very own tokenizers now. 
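For reference, the whole workflow narrated in this video fits in a few lines of Python. This is a minimal sketch: the dataset identifier, the "whole_func_string" column name and the batching scheme are assumptions drawn from the narration, not an exact copy of the course notebook.

```python
from datasets import load_dataset
from transformers import AutoTokenizer

# Assumed identifiers: the CodeSearchNet Python subset and its
# "whole_func_string" column holding raw Python functions.
raw_datasets = load_dataset("code_search_net", "python")

def get_training_corpus():
    # Yield batches of texts so we never hold the whole corpus in memory.
    dataset = raw_datasets["train"]
    for start_idx in range(0, len(dataset), 1000):
        yield dataset[start_idx : start_idx + 1000]["whole_func_string"]

# Reuse the GPT-2 tokenizer architecture (GPT2TokenizerFast under the hood)
# and train it on our corpus with a 52,000-token vocabulary.
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")
new_tokenizer = old_tokenizer.train_new_from_iterator(
    get_training_corpus(), vocab_size=52000
)

# Save the learned rules locally, or push them to the Hub to reuse later.
new_tokenizer.save_pretrained("code-search-net-tokenizer")
# new_tokenizer.push_to_hub("code-search-net-tokenizer")
```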
+
+113
+00:06:19,498 --> 00:06:22,165
+(air whooshing)
+
diff --git a/subtitles/en/43_why-are-fast-tokenizers-called-fast.srt b/subtitles/en/43_why-are-fast-tokenizers-called-fast.srt
index 18577f493..0253c1ace 100644
--- a/subtitles/en/43_why-are-fast-tokenizers-called-fast.srt
+++ b/subtitles/en/43_why-are-fast-tokenizers-called-fast.srt
@@ -1,89 +1,168 @@
-1
-00:00:05,200 --> 00:00:11,600
-Why are fast tokenizers called fast? In this video  
-we will see exactly how much faster the so-called   

-2
-00:00:11,600 --> 00:00:16,960
-fast tokenizers are compared to their  
-slow counterparts. For this benchmark,   

-3
-00:00:16,960 --> 00:00:22,160
-we will use the GLUE MNLI dataset, which  
-contains 432 thousands pairs of texts.   

-4
-00:00:22,880 --> 00:00:27,040
-We will see how long it takes for the  
-fast and slow versions of a BERT tokenizer   

-5
-00:00:27,040 --> 00:00:34,080
-to process them all. We define our fast and slow  
-tokenizer using the AutoTokenizer API. The fast   

-6
-00:00:34,080 --> 00:00:40,160
-tokenizer is the default (when available), so we  
-pass along use_fast=False to define the slow one.   

-7
-00:00:41,200 --> 00:00:45,760
-In a notebook, we can time the execution of a  
-cell with the time magic command, like this.   

-8
-00:00:46,560 --> 00:00:50,720
-Processing the whole dataset is four  
-times faster with a fast tokenizer.   

-9
-00:00:50,720 --> 00:00:54,960
-That's quicker indeed, but not very impressive  
-however. That's because we passed along the   

-10
-00:00:54,960 --> 00:00:59,520
-texts to the tokenizer one at a time. This is  
-a common mistake to do with fast tokenizers,   

-11
-00:00:59,520 --> 00:01:04,320
-which are backed by Rust and thus able to  
-parallelize the tokenization of multiple texts.   

-12
-00:01:05,120 --> 00:01:09,520
-Passing them only one text at a time is like  
-sending a cargo ship between two continents   

-13
-00:01:09,520 --> 00:01:15,600
-with just one container, it's very inefficient.  
-To unleash the full speed of our fast tokenizers,   

-14
-00:01:15,600 --> 00:01:20,320
-we need to send them batches of texts, which  
-we can do with the batched=True argument   

-15
-00:01:20,320 --> 00:01:26,720
-of the map method. Now those results are  
-impressive! The fast tokenizer takes 12 seconds to   

-16
-00:01:26,720 --> 00:01:33,280
-process a dataset that takes 4 minutes to the slow  
-tokenizer. Summarizing the results in this table,   

-17
-00:01:33,280 --> 00:01:37,200
-you can see why we have called those  
-tokenizers fast. And this is only for   

-18
-00:01:37,200 --> 00:01:48,160
-tokenizing texts. If you ever need to train a  
-new tokenizer, they do this very quickly too!
+1
+00:00:00,418 --> 00:00:03,251
+(dramatic whoosh)
+
+2
+00:00:05,340 --> 00:00:08,460
+- Why are fast tokenizers called fast?
+
+3
+00:00:08,460 --> 00:00:10,950
+In this video, we'll see
+exactly how much faster
+
+4
+00:00:10,950 --> 00:00:13,800
+the so-called fast
+tokenizers are compared
+
+5
+00:00:13,800 --> 00:00:15,153
+to their slow counterparts.
+
+6
+00:00:16,200 --> 00:00:19,260
+For this benchmark, we'll
+use the GLUE MNLI dataset
+
+7
+00:00:19,260 --> 00:00:23,160
+which contains 432,000 pairs of text.
+
+8
+00:00:23,160 --> 00:00:25,890
+We'll see how long it takes
+for the fast and slow versions
+
+9
+00:00:25,890 --> 00:00:28,143
+of a BERT tokenizer to process them all.
+
+10
+00:00:29,670 --> 00:00:31,380
+We define our fast and
+slow tokenizers
+
+11
+00:00:31,380 --> 00:00:33,717
+using the AutoTokenizer API.
+
+12
+00:00:33,717 --> 00:00:37,110
+The fast tokenizer is the
+default when available.
+
+13
+00:00:37,110 --> 00:00:40,443
+So we pass along use_fast=False
+to define the slow one.
+
+14
+00:00:41,430 --> 00:00:43,530
+In a notebook, we can time the execution
+
+15
+00:00:43,530 --> 00:00:46,800
+of a cell with the time
+magic command, like this.
+
+16
+00:00:46,800 --> 00:00:49,350
+Processing the whole dataset
+is four times faster
+
+17
+00:00:49,350 --> 00:00:50,970
+with a fast tokenizer.
+
+18
+00:00:50,970 --> 00:00:54,000
+That's quicker indeed,
+but not very impressive.
+
+19
+00:00:54,000 --> 00:00:55,380
+This is because we passed along the texts
+
+20
+00:00:55,380 --> 00:00:57,240
+to the tokenizer one at a time.
+
+21
+00:00:57,240 --> 00:00:59,730
+This is a common mistake
+to do with fast tokenizers
+
+22
+00:00:59,730 --> 00:01:02,550
+which are backed by Rust,
+and thus able to parallelize
+
+23
+00:01:02,550 --> 00:01:05,370
+the tokenization of multiple texts.
+
+24
+00:01:05,370 --> 00:01:07,290
+Passing them only one text at a time,
+
+25
+00:01:07,290 --> 00:01:09,720
+is like sending a cargo
+ship between two continents
+
+26
+00:01:09,720 --> 00:01:13,140
+with just one container,
+it's very inefficient.
+
+27
+00:01:13,140 --> 00:01:15,810
+To unleash the full speed
+of our fast tokenizers,
+
+28
+00:01:15,810 --> 00:01:18,840
+we need to send them batches
+of texts, which we can do
+
+29
+00:01:18,840 --> 00:01:21,423
+with the batched=True
+argument of the map method.
+
+30
+00:01:22,620 --> 00:01:25,950
+Now those are impressive
+results: the fast tokenizer
+
+31
+00:01:25,950 --> 00:01:28,410
+takes 12 seconds to process
+the dataset that takes four
+
+32
+00:01:28,410 --> 00:01:30,093
+minutes to the slow tokenizer.
+
+33
+00:01:31,440 --> 00:01:33,510
+Summarizing the results in this table,
+
+34
+00:01:33,510 --> 00:01:36,630
+you can see why we have
+called those tokenizers fast.
+
+35
+00:01:36,630 --> 00:01:38,760
+And this is only for tokenizing texts.
+
+36
+00:01:38,760 --> 00:01:40,710
+If you ever need to train a new tokenizer,
+
+37
+00:01:40,710 --> 00:01:42,523
+they do this very quickly too.
+
diff --git a/subtitles/en/44_fast-tokenizer-superpowers.srt b/subtitles/en/44_fast-tokenizer-superpowers.srt
index de81e3a3e..a5453f675 100644
--- a/subtitles/en/44_fast-tokenizer-superpowers.srt
+++ b/subtitles/en/44_fast-tokenizer-superpowers.srt
@@ -1,200 --> +1,335 @@
-1
-00:00:05,109 --> 00:00:10,089
-The fast tokenizers of the Transformers library
-are fast, but they also implement features
-
-2
-00:00:10,089 --> 00:00:14,610
-that will be super useful for data pre-processing
-and post-processing.
-
-3
-00:00:14,610 --> 00:00:16,750
-Let's have a look at them!
-
-4
-00:00:16,750 --> 00:00:21,759
-First, let's have a look at the usual output
-of a tokenizer.
-
-5
-00:00:21,759 --> 00:00:28,039
-We get input IDs that correspond to tokens,
-but we lose a lot of information in the process.
-
-6
-00:00:28,039 --> 00:00:33,270
-For instance, here the tokenization is the
-same for the two sentences, even if one has
-
-7
-00:00:33,270 --> 00:00:36,510
-several more spaces than the other.
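The fast-versus-slow benchmark described in the video above boils down to building both tokenizers and mapping them over the dataset, first one example at a time and then in batches. This is only a sketch: the bert-base-cased checkpoint and the premise/hypothesis column names are assumptions, and in a notebook each map call would be prefixed with the %time magic to measure it.

```python
from datasets import load_dataset
from transformers import AutoTokenizer

# GLUE MNLI: roughly 432,000 premise/hypothesis pairs.
raw_datasets = load_dataset("glue", "mnli")

# The fast, Rust-backed tokenizer is the default when available;
# use_fast=False forces the slow, pure-Python implementation.
fast_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
slow_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)

def tokenize(batch, tokenizer):
    return tokenizer(batch["premise"], batch["hypothesis"], truncation=True)

# One text pair at a time: the fast tokenizer is only ~4x quicker.
raw_datasets["train"].map(lambda example: tokenize(example, fast_tokenizer))
raw_datasets["train"].map(lambda example: tokenize(example, slow_tokenizer))

# Batched: the Rust backend parallelizes over many texts at once,
# which is where the dramatic speed-up comes from.
raw_datasets["train"].map(lambda batch: tokenize(batch, fast_tokenizer), batched=True)
raw_datasets["train"].map(lambda batch: tokenize(batch, slow_tokenizer), batched=True)
```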
- -8 -00:00:36,510 --> 00:00:41,090 -Just having the input IDs is thus not enough -if we want to match some tokens with a span - -9 -00:00:41,090 --> 00:00:46,610 -of text (something we will need to do when -tackling question answering for instance). - -10 -00:00:46,610 --> 00:00:51,699 -It's also difficult to know when two tokens -belong to the same word or not: it looks easy - -11 -00:00:51,699 --> 00:00:57,250 -when you just look at the output of a BERT -tokenizer, we just need to look for the ##. But - -12 -00:00:57,250 --> 00:01:01,490 -other tokenizers have different ways to tokenize -parts of words. - -13 -00:01:01,490 --> 00:01:06,910 -For instance RoBERTa adds this special G symbol -to mark the tokens at the beginning of a word, - -14 -00:01:06,910 --> 00:01:12,160 -and T5 uses this special underscore symbol -for the same purpose. - -15 -00:01:12,160 --> 00:01:16,759 -Thankfully, the fast tokenizers keep track -of the word each token comes from, with a - -16 -00:01:16,759 --> 00:01:20,090 -word_ids method you can use on their outputs. - -17 -00:01:20,090 --> 00:01:24,799 -The output is not necessarily clear, but assembled -together in a nice table like this, we can - -18 -00:01:24,799 --> 00:01:28,119 -look at the word position for each token. - -19 -00:01:28,119 --> 00:01:32,659 -Even better, the fast tokenizers keep track -of the span of characters each token comes - -20 -00:01:32,659 --> 00:01:38,780 -from, and we can get them when calling it -on one (or several) text by adding the return_offsets_mapping=True - -21 -00:01:38,780 --> 00:01:39,780 -argument. - -22 -00:01:39,780 --> 00:01:46,469 -In this instance, we can see how we jump positions -between the ##s token and the super token, - -23 -00:01:46,469 --> 00:01:50,579 -because of the multiple spaces in the initial -sentence. - -24 -00:01:50,579 --> 00:01:54,470 -To enable this, the fast tokenizers store -additional information at each step of their - -25 -00:01:54,470 --> 00:01:55,470 -internal pipeline. - -26 -00:01:55,470 --> 00:02:00,899 -That internal pipeline consists of normalization, -where we apply some cleaning to the text, - -27 -00:02:00,899 --> 00:02:05,600 -like lowercasing or removing the accents;() -pre-tokenization, which is where we split - -28 -00:02:05,600 --> 00:02:09,940 -the texts into words;() then we apply the -model of the tokenizer, which is where the - -29 -00:02:09,940 --> 00:02:15,300 -words are splits into tokens,() before finally -doing the post-processing, where special tokens - -30 -00:02:15,300 --> 00:02:17,110 -are added. - -31 -00:02:17,110 --> 00:02:20,730 -From the beginning to the end of the pipeline, -the tokenizer keeps track of each span of - -32 -00:02:20,730 --> 00:02:23,680 -text that corresponds to each word, then each -token. - -33 -00:02:23,680 --> 00:02:29,099 -We will see how useful it is when we tackle -the following tasks: when doing masked language - -34 -00:02:29,099 --> 00:02:34,360 -modeling, one variation that gets state-of-the-art -results is to mask all the tokens of a given - -35 -00:02:34,360 --> 00:02:37,600 -word instead of randomly chosen tokens. - -36 -00:02:37,600 --> 00:02:40,909 -This will require us to use the word IDs we -saw. - -37 -00:02:40,909 --> 00:02:45,209 -When doing token classification, we'll need -to convert the labels we have on words, to - -38 -00:02:45,209 --> 00:02:47,230 -labels on each tokens. 
- -39 -00:02:47,230 --> 00:02:51,360 -As for the offset mappings, it will be super -useful when we need to convert token positions - -40 -00:02:51,360 --> 00:02:56,330 -in a sentence into a span of text, which we -will need to know when looking at question - -41 -00:02:56,330 --> 00:03:01,200 -answering or when grouping the tokens corresponding -to the same entity in token classification. - -42 -00:03:01,200 --> 00:03:09,730 -To have a look at these tasks, check the videos -linked below! +1 +00:00:05,010 --> 00:00:06,270 +- The fast tokenizers + +2 +00:00:06,270 --> 00:00:08,580 +of the Transformers library are fast, + +3 +00:00:08,580 --> 00:00:11,490 +but they also implement features +that will be super useful + +4 +00:00:11,490 --> 00:00:14,536 +for data pre-processing +and post-processing. + +5 +00:00:14,536 --> 00:00:17,239 +Let's have a look at them! + +6 +00:00:17,239 --> 00:00:18,650 +First, let's have a look + +7 +00:00:18,650 --> 00:00:21,690 +at the usual output of a tokenizer. + +8 +00:00:21,690 --> 00:00:24,278 +We get input IDs that correspond to token, + +9 +00:00:24,278 --> 00:00:27,960 +but we lose a lot of +information in the process. + +10 +00:00:27,960 --> 00:00:29,010 +For instance, + +11 +00:00:29,010 --> 00:00:31,856 +here the tokenization is the +same for the two sentences + +12 +00:00:31,856 --> 00:00:35,373 +even if one has several +more spaces than the other. + +13 +00:00:36,300 --> 00:00:39,150 +Just having the input +IDs is thus not enough + +14 +00:00:39,150 --> 00:00:42,330 +if we want to match some +tokens with a span of text, + +15 +00:00:42,330 --> 00:00:43,320 +something we'll need to do + +16 +00:00:43,320 --> 00:00:46,111 +when tackling question +answering, for instance. + +17 +00:00:46,111 --> 00:00:47,592 +It's also difficult to know + +18 +00:00:47,592 --> 00:00:50,850 +when two tokens belong +to the same word or not. + +19 +00:00:50,850 --> 00:00:52,860 +It looks easy when you +just look at the output + +20 +00:00:52,860 --> 00:00:55,650 +of a BERT tokenizer where +we just need to look + +21 +00:00:55,650 --> 00:00:56,779 +for the hash hash. + +22 +00:00:56,779 --> 00:00:59,040 +But other tokenizers have different ways + +23 +00:00:59,040 --> 00:01:00,987 +to tokenize parts of words. + +24 +00:01:00,987 --> 00:01:04,470 +For instance, RoBERTa +adds this special G symbol + +25 +00:01:04,470 --> 00:01:06,491 +to mark the tokens at +the beginning of the word + +26 +00:01:06,491 --> 00:01:09,570 +and T5 uses this special underscore symbol + +27 +00:01:09,570 --> 00:01:11,150 +for the same purpose. + +28 +00:01:11,150 --> 00:01:14,760 +Thankfully, the fast tokenizers +keep track of the word + +29 +00:01:14,760 --> 00:01:16,230 +each token comes from, + +30 +00:01:16,230 --> 00:01:19,571 +with a word_ids method you +can use on their outputs. + +31 +00:01:19,571 --> 00:01:21,870 +The output is not necessarily clear, + +32 +00:01:21,870 --> 00:01:24,076 +but assembled together in +a nice table like this, + +33 +00:01:24,076 --> 00:01:26,853 +we can look at the word +position for each token. + +34 +00:01:27,930 --> 00:01:30,220 +Even better, the fast +tokenizers keep track + +35 +00:01:30,220 --> 00:01:33,198 +of the span of characters +each token comes from, + +36 +00:01:33,198 --> 00:01:35,760 +and we can get them when calling it on one + +37 +00:01:35,760 --> 00:01:37,221 +or several text by adding + +38 +00:01:37,221 --> 00:01:40,470 +the return_offsets_mapping=True argument. 
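Both pieces of information mentioned here are exposed directly on a fast tokenizer's output; the checkpoint below is just an illustrative choice.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
encoding = tokenizer(
    "Let's talk about tokenizers   superpowers.",
    return_offsets_mapping=True,
)

print(encoding.tokens())           # the produced tokens, special tokens included
print(encoding.word_ids())         # index of the word each token comes from (None for [CLS]/[SEP])
print(encoding["offset_mapping"])  # (start, end) character span of each token in the original text
```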
+ +39 +00:01:40,470 --> 00:01:42,312 +In this instance, we can +see how we jump positions + +40 +00:01:42,312 --> 00:01:45,650 +between the hash hash +token and the super token, + +41 +00:01:45,650 --> 00:01:49,992 +because of the multiple spaces +in the initial sentence. + +42 +00:01:49,992 --> 00:01:52,110 +To enable this, the fast tokenizers + +43 +00:01:52,110 --> 00:01:54,270 +store additional information at each step + +44 +00:01:54,270 --> 00:01:55,440 +of their internal pipeline. + +45 +00:01:55,440 --> 00:01:57,951 +That internal pipeline +consists of normalization, + +46 +00:01:57,951 --> 00:02:00,360 +where we apply some cleaning to the text, + +47 +00:02:00,360 --> 00:02:02,621 +like lower casing or removing the accents; + +48 +00:02:02,621 --> 00:02:04,088 +pre-tokenization, + +49 +00:02:04,088 --> 00:02:06,530 +which is where we split +the texts into words; + +50 +00:02:06,530 --> 00:02:09,360 +then we apply the model of the tokenizer, + +51 +00:02:09,360 --> 00:02:11,725 +which is where the words +are split into tokens, + +52 +00:02:11,725 --> 00:02:13,748 +before finally doing the post processing, + +53 +00:02:13,748 --> 00:02:16,023 +where special tokens are added. + +54 +00:02:17,100 --> 00:02:19,050 +From the beginning to +the end of the pipeline, + +55 +00:02:19,050 --> 00:02:21,390 +the tokenizer keeps track +of each span of text + +56 +00:02:21,390 --> 00:02:23,853 +that corresponds to each +word, then each token. + +57 +00:02:24,990 --> 00:02:26,100 +We'll see how useful it is + +58 +00:02:26,100 --> 00:02:27,990 +when we tackle the following tasks: + +59 +00:02:27,990 --> 00:02:29,549 +when doing masked language modeling + +60 +00:02:29,549 --> 00:02:32,407 +one variation that gets +state-of-the-art results + +61 +00:02:32,407 --> 00:02:35,040 +is to mask all the tokens of a given word + +62 +00:02:35,040 --> 00:02:37,440 +instead of randomly chosen words. + +63 +00:02:37,440 --> 00:02:40,800 +This will require us to +use the word IDs we saw. + +64 +00:02:40,800 --> 00:02:42,329 +When doing token classification, + +65 +00:02:42,329 --> 00:02:45,090 +we'll need to convert the +labels we have on words, + +66 +00:02:45,090 --> 00:02:47,250 +to labels on each tokens. + +67 +00:02:47,250 --> 00:02:48,480 +As for the offset mappings, + +68 +00:02:48,480 --> 00:02:50,610 +it will be super useful +when we need to convert + +69 +00:02:50,610 --> 00:02:53,436 +token positions in a +sentence into a span of text, + +70 +00:02:53,436 --> 00:02:55,800 +which we'll need to +know when we're looking + +71 +00:02:55,800 --> 00:02:56,813 +at question answering + +72 +00:02:56,813 --> 00:02:58,680 +or when grouping the tokens corresponding + +73 +00:02:58,680 --> 00:03:01,023 +to the same entity in +token classification. + +74 +00:03:02,160 --> 00:03:03,450 +To have a look at these tasks, + +75 +00:03:03,450 --> 00:03:04,950 +check the videos linked below! + diff --git a/subtitles/en/45_inside-the-token-classification-pipeline-(pytorch).srt b/subtitles/en/45_inside-the-token-classification-pipeline-(pytorch).srt index f303bdfe9..b248a28c8 100644 --- a/subtitles/en/45_inside-the-token-classification-pipeline-(pytorch).srt +++ b/subtitles/en/45_inside-the-token-classification-pipeline-(pytorch).srt @@ -1,154 +1,333 @@ -1 -00:00:05,200 --> 00:00:08,080 -Let's have a look inside the  -token classification pipeline.   
- -2 -00:00:10,000 --> 00:00:13,920 -In the pipeline video, we looked at the  -different applications the Transformers   - -3 -00:00:13,920 --> 00:00:19,840 -library supports out of the box, one of them being  -token classification, for instance predicting for   - -4 -00:00:19,840 --> 00:00:24,960 -each word in a sentence whether they correspond  -to a person, an organization or a location.   - -5 -00:00:26,400 --> 00:00:30,240 -We can even group together the tokens  -corresponding to the same entity,   - -6 -00:00:30,240 --> 00:00:34,960 -for instance all the tokens that formed  -the word Sylvain here, or Hugging and Face.   - -7 -00:00:36,960 --> 00:00:42,480 -The token classification pipeline works the same  -way as the text classification pipeline we studied   - -8 -00:00:42,480 --> 00:00:49,360 -in a previous video. There are three steps: the  -tokenization, the model, and the post processing.   - -9 -00:00:50,720 --> 00:00:55,680 -The first two steps are identical to the text  -classification pipeline, except we use an auto   - -10 -00:00:55,680 --> 00:01:01,760 -token classification model instead of a sequence  -classification one. We tokenize our text then feed   - -11 -00:01:01,760 --> 00:01:07,360 -it to the model. Instead of getting one number  -for each possible label for the whole sentence,   - -12 -00:01:07,360 --> 00:01:13,760 -we get one number for each of the possible 9  -labels for every token in the sentence, here 19.   - -13 -00:01:15,120 --> 00:01:19,600 -Like all the other models of the Transformers  -library, our model outputs logits,   - -14 -00:01:19,600 --> 00:01:26,160 -which we turn into predictions by using a SoftMax.  -We also get the predicted label for each token by   - -15 -00:01:26,160 --> 00:01:30,000 -taking the maximum prediction (since the softmax  -function preserves the order, we could have   - -16 -00:01:30,000 --> 00:01:35,200 -done it on the logits if we had no need of the  -predictions). The model config contains the label   - -17 -00:01:35,200 --> 00:01:41,200 -mapping in its id2label field. Using it, we can  -map every token to its corresponding label. The   - -18 -00:01:41,200 --> 00:01:46,400 -label O correspond to "no entity", which is why we  -didn't see it in our results in the first slide.   - -19 -00:01:47,040 --> 00:01:51,360 -On top of the label and the probability,  -those results included the start and end   - -20 -00:01:51,360 --> 00:01:56,960 -character in the sentence. We will need to use the  -offset mapping of the tokenizer to get those (look   - -21 -00:01:56,960 --> 00:02:02,080 -at the video linked below if you don't know about  -them already). Then, looping through each token   - -22 -00:02:02,080 --> 00:02:08,240 -that has a label distinct from O, we can build the  -list of results we got with our first pipeline.   - -23 -00:02:08,240 --> 00:02:13,360 -The last step is to group together tokens  -that correspond to the same entity.This   - -24 -00:02:13,360 --> 00:02:17,680 -is why we had two labels for each type  -of entity: I-PER and B-PER for instance.   
- -25 -00:02:18,240 --> 00:02:21,840 -It allows us to know if a token is in  -the same entity as the previous one.()   - -26 -00:02:23,120 --> 00:02:26,720 -Note that there are two ways of  -labelling used for token classification,   - -27 -00:02:26,720 --> 00:02:31,680 -one (in pink here) uses the B-PER label at the  -beginning of each new entity, but the other   - -28 -00:02:31,680 --> 00:02:38,320 -(in blue) only uses it to separate two adjacent  -entities of the same type. In both cases, we can   - -29 -00:02:38,320 --> 00:02:44,720 -flag a new entity each time we see a new label  -appearing (either with the I or B prefix) then   - -30 -00:02:44,720 --> 00:02:50,160 -take all the following tokens labelled the same,  -with an I-flag. This, coupled with the offset   - -31 -00:02:50,160 --> 00:03:01,040 -mapping to get the start and end characters allows  -us to get the span of texts for each entity. +1 +00:00:00,076 --> 00:00:01,462 +(title whooshes) + +2 +00:00:01,462 --> 00:00:02,382 +(logo pops) + +3 +00:00:02,382 --> 00:00:05,340 +(title whooshes) + +4 +00:00:05,340 --> 00:00:06,210 +- Let's have a look + +5 +00:00:06,210 --> 00:00:08,283 +inside the token classification pipeline. + +6 +00:00:10,080 --> 00:00:11,580 +In the pipeline video, + +7 +00:00:11,580 --> 00:00:13,320 +we looked at the different applications + +8 +00:00:13,320 --> 00:00:15,960 +the Transformers library +supports out of the box, + +9 +00:00:15,960 --> 00:00:18,780 +one of them being token classification, + +10 +00:00:18,780 --> 00:00:21,810 +for instance predicting +for each word in a sentence + +11 +00:00:21,810 --> 00:00:24,510 +whether they correspond to +a person, an organization + +12 +00:00:24,510 --> 00:00:25,353 +or a location. + +13 +00:00:26,670 --> 00:00:28,920 +We can even group together +the tokens corresponding + +14 +00:00:28,920 --> 00:00:32,040 +to the same entity, for +instance all the tokens + +15 +00:00:32,040 --> 00:00:35,373 +that formed the word Sylvain +here, or Hugging and Face. + +16 +00:00:37,290 --> 00:00:40,230 +The token classification +pipeline works the same way + +17 +00:00:40,230 --> 00:00:42,630 +as the text classification +pipeline we studied + +18 +00:00:42,630 --> 00:00:44,430 +in the previous video. + +19 +00:00:44,430 --> 00:00:45,930 +There are three steps. + +20 +00:00:45,930 --> 00:00:49,623 +The tokenization, the model, +and the postprocessing. + +21 +00:00:50,940 --> 00:00:52,530 +The first two steps are identical + +22 +00:00:52,530 --> 00:00:54,630 +to the text classification pipeline, + +23 +00:00:54,630 --> 00:00:57,300 +except we use an auto +token classification model + +24 +00:00:57,300 --> 00:01:00,150 +instead of a sequence classification one. + +25 +00:01:00,150 --> 00:01:03,720 +We tokenize our text then +feed it to the model. + +26 +00:01:03,720 --> 00:01:05,877 +Instead of getting one number +for each possible label + +27 +00:01:05,877 --> 00:01:08,700 +for the whole sentence, we get one number + +28 +00:01:08,700 --> 00:01:10,770 +for each of the possible nine labels + +29 +00:01:10,770 --> 00:01:13,983 +for every token in the sentence, here 19. + +30 +00:01:15,300 --> 00:01:18,090 +Like all the other models +of the Transformers library, + +31 +00:01:18,090 --> 00:01:19,830 +our model outputs logits, + +32 +00:01:19,830 --> 00:01:23,073 +which we turn into predictions +by using a SoftMax. 
+ +33 +00:01:23,940 --> 00:01:26,190 +We also get the predicted +label for each token + +34 +00:01:26,190 --> 00:01:27,990 +by taking the maximum prediction, + +35 +00:01:27,990 --> 00:01:29,880 +since the SoftMax function +preserves the orders, + +36 +00:01:29,880 --> 00:01:31,200 +we could have done it on the logits + +37 +00:01:31,200 --> 00:01:33,050 +if we had no need of the predictions. + +38 +00:01:33,930 --> 00:01:35,880 +The model config contains +the label mapping + +39 +00:01:35,880 --> 00:01:37,740 +in its id2label field. + +40 +00:01:37,740 --> 00:01:41,430 +Using it, we can map every token +to its corresponding label. + +41 +00:01:41,430 --> 00:01:43,950 +The label, O, correspond to no entity, + +42 +00:01:43,950 --> 00:01:45,985 +which is why we didn't +see it in our results + +43 +00:01:45,985 --> 00:01:47,547 +in the first slide. + +44 +00:01:47,547 --> 00:01:49,440 +On top of the label and the probability, + +45 +00:01:49,440 --> 00:01:51,000 +those results included the start + +46 +00:01:51,000 --> 00:01:53,103 +and end character in the sentence. + +47 +00:01:54,120 --> 00:01:55,380 +We'll need to use the offset mapping + +48 +00:01:55,380 --> 00:01:56,640 +of the tokenizer to get those. + +49 +00:01:56,640 --> 00:01:58,050 +Look at the video linked below + +50 +00:01:58,050 --> 00:02:00,300 +if you don't know about them already. + +51 +00:02:00,300 --> 00:02:02,280 +Then, looping through each token + +52 +00:02:02,280 --> 00:02:04,080 +that has a label distinct from O, + +53 +00:02:04,080 --> 00:02:06,120 +we can build the list of results we got + +54 +00:02:06,120 --> 00:02:07,320 +with our first pipeline. + +55 +00:02:08,460 --> 00:02:10,560 +The last step is to group together tokens + +56 +00:02:10,560 --> 00:02:12,310 +that correspond to the same entity. + +57 +00:02:13,264 --> 00:02:16,140 +This is why we had two labels +for each type of entity, + +58 +00:02:16,140 --> 00:02:18,450 +I-PER and B-PER, for instance. + +59 +00:02:18,450 --> 00:02:20,100 +It allows us to know if a token is + +60 +00:02:20,100 --> 00:02:22,323 +in the same entity as the previous one. + +61 +00:02:23,310 --> 00:02:25,350 +Note, that there are two +ways of labeling used + +62 +00:02:25,350 --> 00:02:26,850 +for token classification. + +63 +00:02:26,850 --> 00:02:29,420 +One, in pink here, uses the B-PER label + +64 +00:02:29,420 --> 00:02:30,810 +at the beginning of each new entity, + +65 +00:02:30,810 --> 00:02:32,760 +but the other, in blue, + +66 +00:02:32,760 --> 00:02:35,340 +only uses it to separate +two adjacent entities + +67 +00:02:35,340 --> 00:02:37,140 +of the same type. + +68 +00:02:37,140 --> 00:02:39,690 +In both cases, we can flag a new entity + +69 +00:02:39,690 --> 00:02:41,940 +each time we see a new label appearing, + +70 +00:02:41,940 --> 00:02:44,730 +either with the I or B prefix, + +71 +00:02:44,730 --> 00:02:47,130 +then take all the following +tokens labeled the same, + +72 +00:02:47,130 --> 00:02:48,870 +with an I-flag. + +73 +00:02:48,870 --> 00:02:51,330 +This, coupled with the offset +mapping to get the start + +74 +00:02:51,330 --> 00:02:54,210 +and end characters allows +us to get the span of texts + +75 +00:02:54,210 --> 00:02:55,233 +for each entity. 
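Chained together, the steps of this video look roughly like the sketch below. The checkpoint name is just one example of a token classification model, and the grouping of adjacent entities is left out to keep the sketch short.

```python
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# An example NER checkpoint; any token classification model works the same way.
checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(checkpoint)

sentence = "My name is Sylvain and I work at Hugging Face in Brooklyn."
inputs = tokenizer(sentence, return_tensors="pt", return_offsets_mapping=True)
offsets = inputs.pop("offset_mapping")[0]

with torch.no_grad():
    logits = model(**inputs).logits[0]        # one row of logits per token

probabilities = torch.softmax(logits, dim=-1)
predictions = probabilities.argmax(dim=-1)

results = []
for idx, pred in enumerate(predictions):
    label = model.config.id2label[pred.item()]
    if label != "O":                          # "O" means "no entity"
        start, end = offsets[idx].tolist()
        results.append(
            {"entity": label, "score": probabilities[idx, pred].item(),
             "word": sentence[start:end], "start": start, "end": end}
        )
print(results)
```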
+ +76 +00:02:56,569 --> 00:02:59,532 +(title whooshes) + +77 +00:02:59,532 --> 00:03:01,134 +(title fizzles) + diff --git a/subtitles/en/46_inside-the-token-classification-pipeline-(tensorflow).srt b/subtitles/en/46_inside-the-token-classification-pipeline-(tensorflow).srt index 1f7367a85..f628d7744 100644 --- a/subtitles/en/46_inside-the-token-classification-pipeline-(tensorflow).srt +++ b/subtitles/en/46_inside-the-token-classification-pipeline-(tensorflow).srt @@ -1,189 +1,320 @@ -1 -00:00:05,250 --> 00:00:09,840 -Let's have a look inside the token classification -pipeline. - -2 -00:00:09,840 --> 00:00:14,690 -In the pipeline video, we looked at the different -applications the Transformers library supports - -3 -00:00:14,690 --> 00:00:20,820 -out of the box, one of them being token classification, -for instance predicting for each word in a - -4 -00:00:20,820 --> 00:00:27,760 -sentence whether they correspond to a person, -an organization or a location. - -5 -00:00:27,760 --> 00:00:32,660 -We can even group together the tokens corresponding -to the same entity, for instance all the tokens - -6 -00:00:32,660 --> 00:00:37,950 -that formed the word Sylvain here, or Hugging -and Face. - -7 -00:00:37,950 --> 00:00:42,480 -The token classification pipeline works the -same way as the text classification pipeline - -8 -00:00:42,480 --> 00:00:44,379 -we studied in a previous video. - -9 -00:00:44,379 --> 00:00:49,600 -There are three steps: the tokenization, the -model, and the post processing. - -10 -00:00:49,600 --> 00:00:56,340 -The first two steps are identical to the text -classification pipeline, except we use an - -11 -00:00:56,340 --> 00:01:01,640 -auto token classification model instead of -a sequence classification one. - -12 -00:01:01,640 --> 00:01:05,840 -We tokenize our text then feed it to the model. - -13 -00:01:05,840 --> 00:01:10,400 -Instead of getting one number for each possible -label for the whole sentence, we get one number - -14 -00:01:10,400 --> 00:01:16,690 -for each of the possible 9 labels for every -token in the sentence, here 19. - -15 -00:01:16,690 --> 00:01:22,299 -Like all the other models of the Transformers -library, our model outputs logits, which we - -16 -00:01:22,299 --> 00:01:25,900 -turn into predictions by using a SoftMax. - -17 -00:01:25,900 --> 00:01:31,430 -We also get the predicted label for each token -by taking the maximum prediction (since the - -18 -00:01:31,430 --> 00:01:35,470 -softmax function preserves the order, we could -have done it on the logits if we had no need - -19 -00:01:35,470 --> 00:01:37,759 -of the predictions). - -20 -00:01:37,759 --> 00:01:42,340 -The model config contains the label mapping -in its id2label field. - -21 -00:01:42,340 --> 00:01:45,331 -Using it, we can map every token to its corresponding -label. - -22 -00:01:45,331 --> 00:01:50,490 -The label O correspond to "no entity", which -is why we didn't see it in our results in - -23 -00:01:50,490 --> 00:01:51,579 -the first slide. - -24 -00:01:51,579 --> 00:01:57,430 -On top of the label and the probability, those -results included the start and end character - -25 -00:01:57,430 --> 00:01:58,570 -in the sentence. - -26 -00:01:58,570 --> 00:02:02,610 -We will need to use the offset mapping of -the tokenizer to get those (look at the video - -27 -00:02:02,610 --> 00:02:04,820 -linked below if you don't know about them -already). 
- -28 -00:02:04,820 --> 00:02:09,750 -Then, looping through each token that has -a label distinct from O, we can build the - -29 -00:02:09,750 --> 00:02:12,440 -list of results we got with our first pipeline. - -30 -00:02:12,440 --> 00:02:18,920 -The last step is to group together tokens -that correspond to the same entity. - -31 -00:02:18,920 --> 00:02:23,290 -This is why we had two labels for each type -of entity: I-PER and B-PER for instance. - -32 -00:02:23,290 --> 00:02:29,190 -It allows us to know if a token is in the -same entity as the previous one.() Note that - -33 -00:02:29,190 --> 00:02:34,750 -there are two ways of labelling used for token -classification, one (in pink here) uses the - -34 -00:02:34,750 --> 00:02:40,380 -B-PER label at the beginning of each new entity, -but the other (in blue) only uses it to separate - -35 -00:02:40,380 --> 00:02:43,380 -two adjacent entities of the same type. - -36 -00:02:43,380 --> 00:02:48,880 -In both cases, we can flag a new entity each -time we see a new label appearing (either - -37 -00:02:48,880 --> 00:02:54,051 -with the I or B prefix) then take all the -following tokens labelled the same, with an - -38 -00:02:54,051 --> 00:02:55,051 -I-flag. - -39 -00:02:55,051 --> 00:02:59,360 -This, coupled with the offset mapping to get -the start and end characters allows us to - -40 -00:02:59,360 --> 00:03:06,500 -get the span of texts for each entity. +1 +00:00:00,180 --> 00:00:03,013 +(whooshing sound) + +2 +00:00:05,310 --> 00:00:06,143 +- Let's have a look + +3 +00:00:06,143 --> 00:00:08,133 +inside the token classification pipeline. + +4 +00:00:09,780 --> 00:00:11,430 +In the pipeline video, + +5 +00:00:11,430 --> 00:00:13,230 +we looked at the different applications + +6 +00:00:13,230 --> 00:00:16,050 +the Transformers library +supports out of the box. + +7 +00:00:16,050 --> 00:00:18,660 +One of them being token classification. + +8 +00:00:18,660 --> 00:00:22,050 +For instance, predicting +for each word in a sentence, + +9 +00:00:22,050 --> 00:00:23,790 +whether they correspond to a person, + +10 +00:00:23,790 --> 00:00:26,043 +an organization, or location. + +11 +00:00:27,690 --> 00:00:29,250 +We can even group together the tokens + +12 +00:00:29,250 --> 00:00:31,320 +corresponding to the same entity. + +13 +00:00:31,320 --> 00:00:34,890 +For instance, all the tokens +that form the word Sylvain here + +14 +00:00:34,890 --> 00:00:36,423 +or Hugging and Face. + +15 +00:00:37,320 --> 00:00:39,720 +So, token classification pipeline + +16 +00:00:39,720 --> 00:00:42,480 +works the same way as a +text classification pipeline + +17 +00:00:42,480 --> 00:00:44,910 +we studied in a previous video. + +18 +00:00:44,910 --> 00:00:46,500 +There are three steps. + +19 +00:00:46,500 --> 00:00:50,043 +Tokenization, the model, +and the post processing. + +20 +00:00:51,690 --> 00:00:53,190 +The first two steps are identical + +21 +00:00:53,190 --> 00:00:55,230 +to the text classification pipeline, + +22 +00:00:55,230 --> 00:00:58,230 +except we use an auto +token classification model + +23 +00:00:58,230 --> 00:01:00,303 +instead of a sequence classification one. + +24 +00:01:01,560 --> 00:01:04,593 +We tokenize our text, +then feed it to the model. 
+ +25 +00:01:05,580 --> 00:01:08,160 +Instead of getting one number +for each possible level + +26 +00:01:08,160 --> 00:01:09,600 +for the whole sentence, + +27 +00:01:09,600 --> 00:01:12,270 +we get one number for each +of the possible nine levels + +28 +00:01:12,270 --> 00:01:14,250 +for every token in the sentence. + +29 +00:01:14,250 --> 00:01:15,573 +Here, 19. + +30 +00:01:17,070 --> 00:01:19,710 +Like all the other models +of the Transformers library, + +31 +00:01:19,710 --> 00:01:22,560 +our model outputs logits +which we need to turn + +32 +00:01:22,560 --> 00:01:24,663 +into predictions by using a SoftMax. + +33 +00:01:25,830 --> 00:01:28,170 +We also get the predicted +label for each token + +34 +00:01:28,170 --> 00:01:30,063 +by taking the maximum prediction. + +35 +00:01:31,080 --> 00:01:33,540 +Since the softmax function +preserves the order, + +36 +00:01:33,540 --> 00:01:34,980 +we could have done it on the logits + +37 +00:01:34,980 --> 00:01:36,830 +if we had no need of the predictions. + +38 +00:01:37,680 --> 00:01:40,050 +The model config contains +the label mapping + +39 +00:01:40,050 --> 00:01:42,090 +in its id2label field. + +40 +00:01:42,090 --> 00:01:45,600 +Using it, we can map every token +to its corresponding label. + +41 +00:01:45,600 --> 00:01:48,630 +The label O corresponds to "no entity" + +42 +00:01:48,630 --> 00:01:50,460 +which is why we didn't +see it in our results + +43 +00:01:50,460 --> 00:01:52,110 +in the first slide. + +44 +00:01:52,110 --> 00:01:54,150 +On top of the label and the probability, + +45 +00:01:54,150 --> 00:01:55,620 +those results included the start + +46 +00:01:55,620 --> 00:01:57,423 +and end character in the sentence. + +47 +00:01:58,294 --> 00:01:59,880 +We'll need to use the offset mapping + +48 +00:01:59,880 --> 00:02:01,110 +of the tokenizer to get those. + +49 +00:02:01,110 --> 00:02:03,090 +Look at the video link below + +50 +00:02:03,090 --> 00:02:05,340 +if you don't know about them already. + +51 +00:02:05,340 --> 00:02:06,990 +Then, looping through each token + +52 +00:02:06,990 --> 00:02:09,090 +that has a label distinct from O, + +53 +00:02:09,090 --> 00:02:10,590 +we can build the list of results + +54 +00:02:10,590 --> 00:02:12,140 +we got with our first pipeline. + +55 +00:02:13,650 --> 00:02:15,840 +The last step is to group together tokens + +56 +00:02:15,840 --> 00:02:17,640 +that corresponds to the same entity. + +57 +00:02:18,930 --> 00:02:21,540 +This is why we had two labels +for each type of entity, + +58 +00:02:21,540 --> 00:02:23,940 +I-PER and B-PER for instance. + +59 +00:02:23,940 --> 00:02:25,530 +It allows us to know if a token + +60 +00:02:25,530 --> 00:02:27,603 +is in the same entity as a previous one. + +61 +00:02:28,620 --> 00:02:29,850 +Note that there are two ways + +62 +00:02:29,850 --> 00:02:32,490 +of labeling used for token classification. + +63 +00:02:32,490 --> 00:02:35,360 +One, in pink here, uses the B-PER label + +64 +00:02:35,360 --> 00:02:37,530 +at the beginning of each new entity. + +65 +00:02:37,530 --> 00:02:39,990 +But the other in blue only uses it + +66 +00:02:39,990 --> 00:02:42,933 +to separate two adjacent +entities of the same types. + +67 +00:02:44,340 --> 00:02:46,560 +In both cases we can flag a new entity + +68 +00:02:46,560 --> 00:02:49,110 +each time we see a new label appearing, + +69 +00:02:49,110 --> 00:02:51,330 +either with the I or B prefix. 
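The same logic carries over to TensorFlow through the TF auto class. The checkpoint is again only an example, and it may need from_pt=True if the repository only ships PyTorch weights.

```python
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForTokenClassification

checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"  # example NER checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForTokenClassification.from_pretrained(checkpoint)

inputs = tokenizer("My name is Sylvain and I work at Hugging Face.", return_tensors="tf")
logits = model(**inputs).logits[0]            # one row of logits per token

probabilities = tf.nn.softmax(logits, axis=-1)
predictions = tf.argmax(probabilities, axis=-1)

# Map each token to its label; "O" marks tokens outside any entity.
print([model.config.id2label[int(p)] for p in predictions])
```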
+ +70 +00:02:51,330 --> 00:02:53,850 +Then, take all the following +tokens labeled the same + +71 +00:02:53,850 --> 00:02:55,470 +with an I-flag. + +72 +00:02:55,470 --> 00:02:57,000 +This, coupled with the offset mapping + +73 +00:02:57,000 --> 00:02:59,010 +to get the start and end characters + +74 +00:02:59,010 --> 00:03:01,560 +allows us to get the span +of texts for each entity. + +75 +00:03:02,869 --> 00:03:05,702 +(whooshing sound) + diff --git a/subtitles/en/47_inside-the-question-answering-pipeline-(pytorch).srt b/subtitles/en/47_inside-the-question-answering-pipeline-(pytorch).srt index 02844ec4c..135185eff 100644 --- a/subtitles/en/47_inside-the-question-answering-pipeline-(pytorch).srt +++ b/subtitles/en/47_inside-the-question-answering-pipeline-(pytorch).srt @@ -1,225 +1,342 @@ -1 -00:00:04,130 --> 00:00:08,390 -Let's have a look inside the question answering -pipeline. - -2 -00:00:08,390 --> 00:00:12,630 -The question answering pipeline can extracts -answers to questions from a given context - -3 -00:00:12,630 --> 00:00:18,150 -or passage of text, like this part of the -Transformers repo README. - -4 -00:00:18,150 --> 00:00:22,440 -It also works for very long contexts, even -if the answer is at the very end, like in - -5 -00:00:22,440 --> 00:00:23,440 -this example. - -6 -00:00:23,440 --> 00:00:24,680 -In this video, we will see why! - -7 -00:00:24,680 --> 00:00:31,540 -The question answering pipeline follows the -same steps as the other pipelines: the question - -8 -00:00:31,540 --> 00:00:36,380 -and context are tokenized as a sentence pair, -fed to the model then some post-processing - -9 -00:00:36,380 --> 00:00:37,649 -is applied. - -10 -00:00:37,649 --> 00:00:41,790 -The tokenization and model steps should be -familiar. - -11 -00:00:41,790 --> 00:00:47,020 -We use the auto class suitable for Question -Answering instead of sequence classification, - -12 -00:00:47,020 --> 00:00:52,039 -but one key difference with text classification -is that our model outputs two tensors named - -13 -00:00:52,039 --> 00:00:54,559 -start logits and end logits. - -14 -00:00:54,559 --> 00:00:55,559 -Why is that? - -15 -00:00:55,559 --> 00:00:59,850 -Well this is the way the model finds the answer -to the question. - -16 -00:00:59,850 --> 00:01:02,270 -First let's have a look at the model inputs. - -17 -00:01:02,270 --> 00:01:07,160 -It's numbers associated with the tokenization -of the question followed by the context (with - -18 -00:01:07,160 --> 00:01:10,710 -the usual CLS and SEP special tokens). - -19 -00:01:10,710 --> 00:01:13,310 -The answer is a part of those tokens. - -20 -00:01:13,310 --> 00:01:17,759 -So we ask the model to predict which token -starts the answer and which ends the answer. - -21 -00:01:17,759 --> 00:01:24,380 -For our two logit outputs, the theoretical -labels are the pink and purple vectors. - -22 -00:01:24,380 --> 00:01:28,360 -To convert those logits into probabilities, -we will need to apply a SoftMax, like in the - -23 -00:01:28,360 --> 00:01:30,439 -text classification pipeline. - -24 -00:01:30,439 --> 00:01:35,070 -We just mask the tokens that are not part -of the context before doing that, leaving - -25 -00:01:35,070 --> 00:01:41,009 -the initial CLS token unmasked as we use it -to predict an impossible answer. - -26 -00:01:41,009 --> 00:01:43,579 -This is what it looks in terms of code. - -27 -00:01:43,579 --> 00:01:47,729 -We use a large negative number for the masking, -since its exponential will then be 0. 
- -28 -00:01:47,729 --> 00:01:53,610 -Now the probability for each start and end -position corresponding to a possible answer, - -29 -00:01:53,610 --> 00:01:57,600 -we give a score that is the product of the -start probabilities and end probabilities - -30 -00:01:57,600 --> 00:02:00,180 -at those positions. - -31 -00:02:00,180 --> 00:02:05,430 -Of course, a start index greater than an end -index corresponds to an impossible answer. - -32 -00:02:05,430 --> 00:02:08,940 -Here is the code to find the best score for -a possible answer. - -33 -00:02:08,940 --> 00:02:13,070 -Once we have the start and end positions of -the tokens, we use the offset mappings provided - -34 -00:02:13,070 --> 00:02:18,270 -by our tokenizer to find the span of characters -in the initial context, and get our answer! - -35 -00:02:18,270 --> 00:02:23,820 -Now, when the context is long, it might get -truncated by the tokenizer. - -36 -00:02:23,820 --> 00:02:29,099 -This might result in part of the answer, or -worse, the whole answer, being truncated. - -37 -00:02:29,099 --> 00:02:33,319 -So we don't discard the truncated tokens but -build new features with them. - -38 -00:02:33,319 --> 00:02:39,320 -Each of those features contains the question, -then a chunk of text in the context. - -39 -00:02:39,320 --> 00:02:43,760 -If we take disjoint chunks of texts, we might -end up with the answer being split between - -40 -00:02:43,760 --> 00:02:45,330 -two features. - -41 -00:02:45,330 --> 00:02:49,709 -So instead, we take overlapping chunks of -texts, to make sure at least one of the chunks - -42 -00:02:49,709 --> 00:02:51,650 -will fully contain the answer to the question. - -43 -00:02:51,650 --> 00:02:56,920 -The tokenizers do all of this for us automatically -with the return overflowing tokens option. - -44 -00:02:56,920 --> 00:03:02,069 -The stride argument controls the number of -overlapping tokens. - -45 -00:03:02,069 --> 00:03:05,930 -Here is how our very long context gets truncated -in two features with some overlap. - -46 -00:03:05,930 --> 00:03:10,051 -By applying the same post-processing we saw -before for each feature, we get the answer - -47 -00:03:10,051 --> 00:03:18,349 -with a score for each of them, and we take -the answer with the best score as a final - -48 -00:03:18,349 --> 00:03:21,239 -solution. +1 +00:00:04,230 --> 00:00:07,699 +- Let's have a look inside the +question answering pipeline. + +2 +00:00:07,699 --> 00:00:10,680 +The question answering +pipeline can extracts answers + +3 +00:00:10,680 --> 00:00:14,190 +to questions from a given +context or passage of text, + +4 +00:00:14,190 --> 00:00:16,540 +like this part of the +transformers repo README. + +5 +00:00:18,060 --> 00:00:20,310 +It also works for very long contexts, + +6 +00:00:20,310 --> 00:00:23,850 +even if the answer is at the +very end, like in this example. + +7 +00:00:23,850 --> 00:00:25,400 +In this video, we will see why. + +8 +00:00:26,820 --> 00:00:29,460 +The question answering +pipeline follows the same steps + +9 +00:00:29,460 --> 00:00:31,050 +as the other pipelines: + +10 +00:00:31,050 --> 00:00:34,200 +the question and context are +tokenized as a sentence pair, + +11 +00:00:34,200 --> 00:00:37,955 +fed to the model then some +post-processing is applied. + +12 +00:00:37,955 --> 00:00:41,730 +The tokenization and model +steps should be familiar. 
+ +13 +00:00:41,730 --> 00:00:44,610 +We use the auto class suitable +for question answering + +14 +00:00:44,610 --> 00:00:47,070 +instead of sequence classification, + +15 +00:00:47,070 --> 00:00:49,392 +but one key difference +with text classification + +16 +00:00:49,392 --> 00:00:52,980 +is that our model outputs two +tensors named start logits + +17 +00:00:52,980 --> 00:00:54,570 +and end logits. + +18 +00:00:54,570 --> 00:00:55,830 +Why is that? + +19 +00:00:55,830 --> 00:00:57,930 +Well, this is the way the +model finds the answer + +20 +00:00:57,930 --> 00:00:58,803 +to the question. + +21 +00:00:59,790 --> 00:01:02,130 +First, let's have a look +at the model inputs. + +22 +00:01:02,130 --> 00:01:04,350 +Its numbers associated +with the tokenization + +23 +00:01:04,350 --> 00:01:06,843 +of the question followed by the context + +24 +00:01:06,843 --> 00:01:09,723 +with the usual CLS and SEP special tokens. + +25 +00:01:10,620 --> 00:01:13,320 +The answer is a part of those tokens. + +26 +00:01:13,320 --> 00:01:15,510 +So we ask the model to +predict which token starts + +27 +00:01:15,510 --> 00:01:17,373 +the answer and which ends the answer. + +28 +00:01:18,548 --> 00:01:19,650 +For our two logit outputs, + +29 +00:01:19,650 --> 00:01:22,803 +the theoretical labels are +the pink and purple vectors. + +30 +00:01:24,300 --> 00:01:26,430 +To convert those logits +into probabilities, + +31 +00:01:26,430 --> 00:01:28,436 +we will need to apply a SoftMax, + +32 +00:01:28,436 --> 00:01:30,360 +like in the text classification pipeline. + +33 +00:01:30,360 --> 00:01:33,390 +We just mask the tokens that +are not part of the context + +34 +00:01:33,390 --> 00:01:36,855 +before doing that, leaving +the initial CLS token unmasked + +35 +00:01:36,855 --> 00:01:39,303 +as we use it to predict +an impossible answer. + +36 +00:01:40,267 --> 00:01:43,500 +This is what it looks in terms of code. + +37 +00:01:43,500 --> 00:01:45,870 +We use a large negative +number for the masking, + +38 +00:01:45,870 --> 00:01:48,957 +since its exponential will then be zero. + +39 +00:01:48,957 --> 00:01:50,580 +Now, the probability for each start + +40 +00:01:50,580 --> 00:01:53,550 +and end position corresponding +to a possible answer, + +41 +00:01:53,550 --> 00:01:55,050 +we give a score that is the product + +42 +00:01:55,050 --> 00:01:57,630 +of the start probabilities +and end probabilities + +43 +00:01:57,630 --> 00:01:58,803 +at those positions. + +44 +00:02:00,120 --> 00:02:02,670 +Of course, a start index +greater than an end index + +45 +00:02:02,670 --> 00:02:04,503 +corresponds to an impossible answer. + +46 +00:02:05,430 --> 00:02:07,080 +Here is the code to find the best score + +47 +00:02:07,080 --> 00:02:08,820 +for a possible answer. + +48 +00:02:08,820 --> 00:02:11,430 +Once we have the start and +end positions of the tokens, + +49 +00:02:11,430 --> 00:02:14,130 +we use the offset mappings +provided by our tokenizer + +50 +00:02:14,130 --> 00:02:16,950 +to find the span of characters +in the initial context, + +51 +00:02:16,950 --> 00:02:17,900 +and get our answer. + +52 +00:02:19,470 --> 00:02:21,900 +Now, when the context is +long, it might get truncated + +53 +00:02:21,900 --> 00:02:22,750 +by the tokenizer. + +54 +00:02:23,760 --> 00:02:26,220 +This might result in part +of the answer, or worse, + +55 +00:02:26,220 --> 00:02:28,113 +the whole answer, being truncated. 
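A condensed sketch of the steps described so far, start and end logits, masking everything outside the context, scoring the valid spans and mapping back to characters, might look like this; the checkpoint name is an assumed example.

```python
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

checkpoint = "distilbert-base-cased-distilled-squad"  # example QA checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

question = "Where do I work?"
context = "My name is Sylvain and I work at Hugging Face in Brooklyn."

inputs = tokenizer(question, context, return_tensors="pt", return_offsets_mapping=True)
offsets = inputs.pop("offset_mapping")[0]

with torch.no_grad():
    outputs = model(**inputs)

# Mask tokens that are not part of the context with a large negative number
# (so their softmax probability is ~0), keeping [CLS] unmasked to allow
# the "impossible answer" prediction.
mask = torch.tensor([s != 1 for s in inputs.sequence_ids()])
mask[0] = False
start_probs = torch.softmax(outputs.start_logits[0].masked_fill(mask, -10000.0), dim=-1)
end_probs = torch.softmax(outputs.end_logits[0].masked_fill(mask, -10000.0), dim=-1)

# Score every start <= end pair and keep the best one.
scores = torch.triu(start_probs[:, None] * end_probs[None, :])
best = scores.argmax().item()
start_index, end_index = best // scores.shape[1], best % scores.shape[1]

start_char, end_char = offsets[start_index][0].item(), offsets[end_index][1].item()
print(context[start_char:end_char], scores[start_index, end_index].item())
# For long contexts, the tokenizer can instead produce overlapping chunks with
# return_overflowing_tokens=True and a stride, and the same scoring runs per chunk.
```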
+ +56 +00:02:29,100 --> 00:02:31,050 +So we don't discard the truncated tokens + +57 +00:02:31,050 --> 00:02:33,330 +but build new features with them. + +58 +00:02:33,330 --> 00:02:35,994 +Each of those features +contains the question, + +59 +00:02:35,994 --> 00:02:39,240 +then a chunk of text in the context. + +60 +00:02:39,240 --> 00:02:41,430 +If we take disjoint chunks of texts, + +61 +00:02:41,430 --> 00:02:43,530 +we might end up with +the answer being split + +62 +00:02:43,530 --> 00:02:45,330 +between two features. + +63 +00:02:45,330 --> 00:02:48,060 +So instead, we take +overlapping chunks of texts, + +64 +00:02:48,060 --> 00:02:50,640 +to make sure at least one of +the chunks will fully contain + +65 +00:02:50,640 --> 00:02:51,990 +the answer to the question. + +66 +00:02:52,830 --> 00:02:55,260 +The tokenizers do all of +this for us automatically + +67 +00:02:55,260 --> 00:02:58,170 +with the return overflowing tokens option. + +68 +00:02:58,170 --> 00:02:59,700 +The stride argument controls + +69 +00:02:59,700 --> 00:03:02,070 +the number of overlapping tokens. + +70 +00:03:02,070 --> 00:03:04,020 +Here is how our very long +context gets truncated + +71 +00:03:04,020 --> 00:03:05,850 +in two features with some overlap. + +72 +00:03:05,850 --> 00:03:07,950 +By applying the same +post-processing we saw before + +73 +00:03:07,950 --> 00:03:10,636 +for each feature, we get +the answer with a score + +74 +00:03:10,636 --> 00:03:12,453 +for each of them, + +75 +00:03:12,453 --> 00:03:14,910 +and we take the answer with the best score + +76 +00:03:14,910 --> 00:03:16,203 +as a final solution. + diff --git a/subtitles/en/48_inside-the-question-answering-pipeline-(tensorflow).srt b/subtitles/en/48_inside-the-question-answering-pipeline-(tensorflow).srt index 07858744e..90725ddf5 100644 --- a/subtitles/en/48_inside-the-question-answering-pipeline-(tensorflow).srt +++ b/subtitles/en/48_inside-the-question-answering-pipeline-(tensorflow).srt @@ -1,177 +1,358 @@ -1 -00:00:05,360 --> 00:00:07,920 -Let's have a look inside the  -question answering pipeline.   - -2 -00:00:09,600 --> 00:00:13,520 -The question answering pipeline  -can extracts answers to questions   - -3 -00:00:13,520 --> 00:00:17,840 -from a given context or passage of text, like  -this part of the Transformers repo README.   - -4 -00:00:19,040 --> 00:00:23,680 -It also works for very long contexts, even if the  -answer is at the very end, like in this example.   - -5 -00:00:24,480 --> 00:00:25,840 -In this video, we will see why!   - -6 -00:00:27,600 --> 00:00:32,720 -The question answering pipeline follows the same  -steps as the other pipelines: the question and   - -7 -00:00:32,720 --> 00:00:38,160 -context are tokenized as a sentence pair, fed to  -the model then some post-processing is applied.   - -8 -00:00:39,280 --> 00:00:44,160 -The tokenization and model steps should be  -familiar. We use the auto class suitable for   - -9 -00:00:44,160 --> 00:00:48,240 -Question Answering instead of sequence  -classification, but one key difference   - -10 -00:00:48,240 --> 00:00:53,680 -with text classification is that our model  -outputs two tensors named start logits and   - -11 -00:00:53,680 --> 00:00:58,640 -end logits. Why is that? Well this is the way  -the model finds the answer to the question.   - -12 -00:00:59,920 --> 00:01:04,880 -First let's have a look at the model inputs. 
It's  -numbers associated with the tokenization of the   - -13 -00:01:04,880 --> 00:01:12,160 -question followed by the context (with the usual  -CLS and SEP special tokens). The answer is a part   - -14 -00:01:12,160 --> 00:01:17,920 -of those tokens. So we ask the model to predict  -which token starts the answer and which ends the   - -15 -00:01:17,920 --> 00:01:25,040 -answer. For our two logit outputs, the theoretical  -labels are the pink and purple vectors. To convert   - -16 -00:01:25,040 --> 00:01:29,520 -those logits into probabilities, we will need to  -apply a SoftMax, like in the text classification   - -17 -00:01:29,520 --> 00:01:36,240 -pipeline. We just mask the tokens that are not  -part of the context before doing that, leaving the   - -18 -00:01:36,240 --> 00:01:43,200 -initial CLS token unmasked as we use it to predict  -an impossible answer. This is what it looks in   - -19 -00:01:43,200 --> 00:01:49,200 -terms of code. We use a large negative number for  -the masking, since its exponential will then be 0.   - -20 -00:01:50,480 --> 00:01:54,640 -Now the probability for each start and end  -position corresponding to a possible answer,   - -21 -00:01:55,520 --> 00:02:00,000 -we give a score that is the product of the start  -probabilities and end probabilities at those   - -22 -00:02:00,000 --> 00:02:06,000 -positions. Of course, a start index greater than  -an end index corresponds to an impossible answer.   - -23 -00:02:07,360 --> 00:02:12,080 -Here is the code to find the best score for  -a possible answer. Once we have the start and   - -24 -00:02:12,080 --> 00:02:17,040 -end positions of the tokens, we use the offset  -mappings provided by our tokenizer to find the   - -25 -00:02:17,040 --> 00:02:23,520 -span of characters in the initial context, and  -get our answer! Now, when the context is long,   - -26 -00:02:23,520 --> 00:02:29,440 -it might get truncated by the tokenizer. This  -might result in part of the answer, or worse, the   - -27 -00:02:29,440 --> 00:02:34,800 -whole answer, being truncated. So we don't discard  -the truncated tokens but build new features   - -28 -00:02:34,800 --> 00:02:42,080 -with them. Each of those features contains the  -question, then a chunk of text in the context. If   - -29 -00:02:42,080 --> 00:02:47,280 -we take disjoint chunks of texts, we might end up  -with the answer being split between two features.   - -30 -00:02:48,560 --> 00:02:51,840 -So instead, we take overlapping chunks of texts,   - -31 -00:02:51,840 --> 00:02:55,520 -to make sure at least one of the chunks will  -fully contain the answer to the question.   - -32 -00:02:56,720 --> 00:03:00,880 -The tokenizers do all of this for us automatically  -with the return overflowing tokens option.   - -33 -00:03:01,680 --> 00:03:04,320 -The stride argument controls the  -number of overlapping tokens.   - -34 -00:03:05,680 --> 00:03:10,160 -Here is how our very long context gets  -truncated in two features with some overlap.   - -35 -00:03:10,960 --> 00:03:15,520 -By applying the same post-processing we saw  -before for each feature, we get the answer   - -36 -00:03:15,520 --> 00:03:27,600 -with a score for each of them, and we take the  -answer with the best score as a final solution. +1 +00:00:00,000 --> 00:00:03,417 +(light transition music) + +2 +00:00:05,490 --> 00:00:08,440 +- Let's have a look inside the +question answering pipeline. 
+ +3 +00:00:09,780 --> 00:00:11,370 +The question answering pipeline + +4 +00:00:11,370 --> 00:00:13,710 +can extract answers to questions + +5 +00:00:13,710 --> 00:00:16,020 +from a given context or passage of text + +6 +00:00:16,020 --> 00:00:18,370 +like this part of the +Transformers repo README. + +7 +00:00:19,290 --> 00:00:21,180 +It also works for very long context, + +8 +00:00:21,180 --> 00:00:24,720 +even if the answer is at the +very end, like in this example. + +9 +00:00:24,720 --> 00:00:26,223 +In this video, we'll see why. + +10 +00:00:27,840 --> 00:00:29,310 +The question answering pipeline + +11 +00:00:29,310 --> 00:00:32,130 +follows the same steps +as the other pipelines. + +12 +00:00:32,130 --> 00:00:35,550 +The question and context are +tokenized as a sentence pair, + +13 +00:00:35,550 --> 00:00:38,463 +fed to the model then some +post-processing is applied. + +14 +00:00:39,540 --> 00:00:42,840 +So tokenization and model +steps should be familiar. + +15 +00:00:42,840 --> 00:00:45,000 +We use the auto class suitable +for question answering + +16 +00:00:45,000 --> 00:00:47,460 +instead of sequence classification, + +17 +00:00:47,460 --> 00:00:50,190 +but one key difference +with text classification + +18 +00:00:50,190 --> 00:00:52,380 +is that our model outputs two tensors + +19 +00:00:52,380 --> 00:00:55,230 +named start logits and end logits. + +20 +00:00:55,230 --> 00:00:56,160 +Why is that? + +21 +00:00:56,160 --> 00:00:58,170 +Well, this is the way the +model finds the answer + +22 +00:00:58,170 --> 00:00:59,043 +to the question. + +23 +00:01:00,090 --> 00:01:02,610 +First, let's have a look +at the model inputs. + +24 +00:01:02,610 --> 00:01:04,800 +It's numbers associated +with the tokenization + +25 +00:01:04,800 --> 00:01:05,850 +of the question, + +26 +00:01:05,850 --> 00:01:07,753 +followed by the context + +27 +00:01:07,753 --> 00:01:10,233 +with the usual CLS and SEP special tokens. + +28 +00:01:11,130 --> 00:01:13,203 +The answer is a part of those tokens. + +29 +00:01:14,040 --> 00:01:15,330 +So we ask the model to predict + +30 +00:01:15,330 --> 00:01:17,040 +which token starts the answer + +31 +00:01:17,040 --> 00:01:19,320 +and which ends the answer. + +32 +00:01:19,320 --> 00:01:20,910 +For our two logit outputs, + +33 +00:01:20,910 --> 00:01:23,823 +the theoretical labels are +the pink and purple vectors. + +34 +00:01:24,870 --> 00:01:26,700 +To convert those logits +into probabilities, + +35 +00:01:26,700 --> 00:01:28,596 +we will need to apply a SoftMax, + +36 +00:01:28,596 --> 00:01:31,020 +like in the text classification pipeline. + +37 +00:01:31,020 --> 00:01:32,310 +We just mask the tokens + +38 +00:01:32,310 --> 00:01:35,940 +that are not part of the +context before doing that, + +39 +00:01:35,940 --> 00:01:38,310 +leaving the initial CLS token unmasked + +40 +00:01:38,310 --> 00:01:40,773 +as we use it to predict +an impossible answer. + +41 +00:01:41,940 --> 00:01:44,730 +This is what it looks +like in terms of code. + +42 +00:01:44,730 --> 00:01:47,340 +We use a large negative +number for the masking + +43 +00:01:47,340 --> 00:01:49,533 +since its exponential will then be zero. 
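A hedged numerical illustration of the masking step just described (not the pipeline's exact code); the sequence length and the masked question span are assumptions.

import numpy as np

# One row of start/end logits, as returned by a question answering model for
# a single example (random values stand in for real model outputs here).
seq_len = 20
start_logits = np.random.randn(seq_len)
end_logits = np.random.randn(seq_len)

# Mask the tokens that are not part of the context (assumed to be positions
# 1 to 7, the question), but keep the CLS token at index 0 so the model can
# still predict an "impossible answer".
mask = np.zeros(seq_len, dtype=bool)
mask[1:8] = True

def masked_softmax(logits, mask):
    # A very large negative logit has an exponential of about zero, so the
    # masked positions end up with a probability of about zero.
    logits = np.where(mask, -10000.0, logits)
    exp = np.exp(logits - logits.max())
    return exp / exp.sum()

start_probs = masked_softmax(start_logits, mask)
end_probs = masked_softmax(end_logits, mask)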
+ +44 +00:01:50,850 --> 00:01:53,160 +Now the probability for +each start and end position + +45 +00:01:53,160 --> 00:01:55,740 +corresponding to a possible answer + +46 +00:01:55,740 --> 00:01:57,540 +will give a score that is a product + +47 +00:01:57,540 --> 00:01:58,680 +of the start probabilities + +48 +00:01:58,680 --> 00:02:00,873 +and end probabilities at those position. + +49 +00:02:01,920 --> 00:02:04,530 +Of course, a start index +greater than an end index + +50 +00:02:04,530 --> 00:02:06,330 +corresponds to an impossible answer. + +51 +00:02:07,744 --> 00:02:09,510 +Here is the code to find the best score + +52 +00:02:09,510 --> 00:02:11,280 +for a possible answer. + +53 +00:02:11,280 --> 00:02:13,830 +Once we have the start and +end position for the tokens, + +54 +00:02:13,830 --> 00:02:16,650 +we use the offset mappings +provided by our tokenizer + +55 +00:02:16,650 --> 00:02:19,710 +to find the span of characters +in the initial context, + +56 +00:02:19,710 --> 00:02:20,810 +and we get our answer. + +57 +00:02:22,080 --> 00:02:23,700 +Now, when the context is long, + +58 +00:02:23,700 --> 00:02:25,977 +it might get truncated by the tokenizer. + +59 +00:02:26,834 --> 00:02:29,790 +This might result in part +of the answer, or worse, + +60 +00:02:29,790 --> 00:02:32,190 +the whole answer, being truncated. + +61 +00:02:32,190 --> 00:02:34,020 +So we don't discard the truncated tokens + +62 +00:02:34,020 --> 00:02:36,420 +but build new features with them. + +63 +00:02:36,420 --> 00:02:39,330 +Each of those features +contains the question, + +64 +00:02:39,330 --> 00:02:42,150 +then a chunk of text in the context. + +65 +00:02:42,150 --> 00:02:44,520 +If we take disjoint chunks of texts, + +66 +00:02:44,520 --> 00:02:45,840 +we might end up with the answer + +67 +00:02:45,840 --> 00:02:47,733 +being split between two features. + +68 +00:02:48,720 --> 00:02:52,050 +So instead, we take +overlapping chunks of text + +69 +00:02:52,050 --> 00:02:53,910 +to make sure at least one of the chunks + +70 +00:02:53,910 --> 00:02:56,940 +will fully contain the +answer to the question. + +71 +00:02:56,940 --> 00:02:59,220 +So, tokenizers does all of +this for us automatically + +72 +00:02:59,220 --> 00:03:01,920 +with the return overflowing tokens option. + +73 +00:03:01,920 --> 00:03:02,753 +The stride argument + +74 +00:03:02,753 --> 00:03:04,830 +controls the number of overlapping tokens. + +75 +00:03:05,940 --> 00:03:07,740 +Here is how our very long context + +76 +00:03:07,740 --> 00:03:10,323 +gets truncated in two +features with some overlap. + +77 +00:03:11,160 --> 00:03:12,720 +By applying the same post-processing + +78 +00:03:12,720 --> 00:03:14,850 +we saw before for each feature, + +79 +00:03:14,850 --> 00:03:17,970 +we get the answer with a +score for each of them, + +80 +00:03:17,970 --> 00:03:19,920 +and we take the answer with the best score + +81 +00:03:19,920 --> 00:03:21,303 +as a final solution. + +82 +00:03:23,089 --> 00:03:26,506 +(light transition music) + diff --git a/subtitles/en/49_what-is-normalization.srt b/subtitles/en/49_what-is-normalization.srt index 090be7355..8087de3d1 100644 --- a/subtitles/en/49_what-is-normalization.srt +++ b/subtitles/en/49_what-is-normalization.srt @@ -1,250 +1,414 @@ -1 -00:00:05,130 --> 00:00:11,060 -In this video we will see together what is -the normalizer component that we find at the - -2 -00:00:11,060 --> 00:00:12,240 -beginning of each tokenizer. 
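A hedged sketch of the long-context handling described above for the question answering pipeline; the checkpoint, lengths and context are assumptions, not the pipeline's exact internals.

from transformers import AutoTokenizer

# Assumed checkpoint; any fast tokenizer used for extractive QA behaves similarly.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")

question = "Which libraries back the Transformers library?"
long_context = " ".join(
    ["Transformers is backed by the three most popular deep learning libraries."] * 200
)

inputs = tokenizer(
    question,
    long_context,
    max_length=384,               # each feature holds at most 384 tokens
    truncation="only_second",     # only the context gets truncated
    stride=128,                   # overlap of 128 tokens between consecutive chunks
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)

# One entry per feature: the question followed by an overlapping chunk of context.
print(len(inputs["input_ids"]))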
- -3 -00:00:12,240 --> 00:00:20,610 -The normalization operation consists in applying -a succession of normalization rules to the - -4 -00:00:20,610 --> 00:00:21,960 -raw text. - -5 -00:00:21,960 --> 00:00:27,510 -We choose normalization rules to remove noise -in the text which seems useless for the learning - -6 -00:00:27,510 --> 00:00:31,420 -and use of our language model. - -7 -00:00:31,420 --> 00:00:40,790 -Let's take a very diverse sentence with different -fonts, upper and lower case characters, accents, - -8 -00:00:40,790 --> 00:00:48,490 -punctuation and multiple spaces, to see how -several tokenizers normalize it. - -9 -00:00:48,490 --> 00:00:55,039 -The tokenizer from the FNet model has transformed -the letters with font variants or circled - -10 -00:00:55,039 --> 00:01:00,230 -into their basic version and has removed the -multiple spaces. - -11 -00:01:00,230 --> 00:01:07,090 -And now if we look at the normalization with -Retribert's tokenizer, we can see that it - -12 -00:01:07,090 --> 00:01:12,990 -keeps characters with several font variants -and keeps the multiple spaces but it removes - -13 -00:01:12,990 --> 00:01:15,659 -all the accents. - -14 -00:01:15,659 --> 00:01:23,050 -And if we continue to test the normalization -of many other tokenizers associated to models - -15 -00:01:23,050 --> 00:01:34,079 -that you can find on the Hub we can see that -they also propose other normalizations. - -16 -00:01:34,079 --> 00:01:39,310 -With the fast tokenizers, it is very easy -to observe the normalization chosen for the - -17 -00:01:39,310 --> 00:01:42,500 -currently loaded tokenizer. - -18 -00:01:42,500 --> 00:01:49,250 -Indeed, each instance of a fast tokenizer -has an underlying tokenizer from the Tokenizers - -19 -00:01:49,250 --> 00:01:54,820 -library stored in the backend_tokenizer attribute. - -20 -00:01:54,820 --> 00:02:01,070 -This object has itself a normalizer attribute -that we can use thanks to the "normalize_str" - -21 -00:02:01,070 --> 00:02:04,670 -method to normalize a string. - -22 -00:02:04,670 --> 00:02:11,000 -It is thus very practical that this normalization -which was used at the time of the training - -23 -00:02:11,000 --> 00:02:17,870 -of the tokenizer was saved and that it applies -automatically when you asks a trained tokenizer - -24 -00:02:17,870 --> 00:02:21,120 -to tokenize a text. - -25 -00:02:21,120 --> 00:02:28,130 -For example, if we hadn't included the albert -normalizer we would have had a lot of unknown - -26 -00:02:28,130 --> 00:02:35,870 -tokens by tokenizing this sentence with accents -and capital letters. - -27 -00:02:35,870 --> 00:02:40,319 -These transformations can also be undetectable -with a simple "print". - -28 -00:02:40,319 --> 00:02:46,069 -Indeed, keep in mind that for a computer, -text is only a succession of 0 and 1 and it - -29 -00:02:46,069 --> 00:02:51,230 -happens that different successions of 0 and -1 render the same printed character. - -30 -00:02:51,230 --> 00:02:57,459 -The 0s and 1s go in groups of 8 to form a -byte. - -31 -00:02:57,459 --> 00:03:04,490 -The computer must then decode this sequence -of bytes into a sequence of "code points". - -32 -00:03:04,490 --> 00:03:10,959 -In our example the 2 bytes are transformed -into a single "code point" by UTF-8. - -33 -00:03:10,959 --> 00:03:18,860 -The unicode standard then allows us to find -the character corresponding to this code point: - -34 -00:03:18,860 --> 00:03:22,140 -the c cedilla. 
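A hedged sketch of the normalize_str method and of the two byte sequences discussed here; the albert-base-v2 checkpoint is only an assumed example of a fast tokenizer.

import unicodedata
from transformers import AutoTokenizer

# Any fast tokenizer exposes the normalizer saved with it at training time.
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
normalizer = tokenizer.backend_tokenizer.normalizer
print(normalizer.normalize_str("Héllò hôw are ü?"))

# The same printed character can hide different code point sequences:
composed = "\u00e7"      # one code point: LATIN SMALL LETTER C WITH CEDILLA
decomposed = "c\u0327"   # LATIN SMALL LETTER C followed by COMBINING CEDILLA
print(composed == decomposed)                                 # False
print(unicodedata.normalize("NFC", decomposed) == composed)   # True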
- -35 -00:03:22,140 --> 00:03:28,060 -Let's repeat the same operation with this -new sequence composed of 3 bytes, this time - -36 -00:03:28,060 --> 00:03:34,450 -it is transformed into 2 "code points" .... which -also correspond to the c cedilla character! - -37 -00:03:34,450 --> 00:03:41,510 -It is in fact the composition of the unicode -Latin Small Letter Cand the combining cedilla. - -38 -00:03:41,510 --> 00:03:47,819 -But it's annoying because what appears to -us to be a single character is not at all - -39 -00:03:47,819 --> 00:03:52,379 -the same thing for the computer. - -40 -00:03:52,379 --> 00:04:02,269 -Fortunately, there are unicode standardization -standards known as NFC, NFD, NFKC and NFKD - -41 -00:04:02,269 --> 00:04:05,430 -that allow erasing some of these differences. - -42 -00:04:05,430 --> 00:04:10,019 -These standards are often used by tokenizers! - -43 -00:04:10,019 --> 00:04:15,239 -On all these previous examples, even if the -normalizations changed the look of the text, - -44 -00:04:15,239 --> 00:04:21,229 -they did not change the content: you could -still read "Hello world, let's normalize this - -45 -00:04:21,229 --> 00:04:22,540 -sentence". - -46 -00:04:22,540 --> 00:04:30,120 -However, you must be aware that some normalizations -can be very harmful if they are not adapted - -47 -00:04:30,120 --> 00:04:31,720 -to their corpus. - -48 -00:04:31,720 --> 00:04:37,360 -For example, if you take the French sentence -"un père indigné", which means "An indignant - -49 -00:04:37,360 --> 00:04:45,660 -father", and normalize it with the bert-base-uncase -tokenizer which removes the accents then the - -50 -00:04:45,660 --> 00:04:53,550 -sentence becomes "un père indigne" which -means "An unworthy father". - -51 -00:04:53,550 --> 00:04:58,699 -If you watch this video to build your own -tokenizer, there are no absolute rules to - -52 -00:04:58,699 --> 00:05:04,580 -choose or not a normalization for your brand -new tokenizer but I advise you to take the - -53 -00:05:04,580 --> 00:05:15,960 -time to select them so that they do not make -you lose important information. +1 +00:00:00,286 --> 00:00:02,869 +(subtle blast) + +2 +00:00:04,694 --> 00:00:07,380 +- In this video, we will see together + +3 +00:00:07,380 --> 00:00:09,930 +what is the normalizer component + +4 +00:00:09,930 --> 00:00:13,023 +that we'd find at the +beginning of each tokenizer. + +5 +00:00:14,550 --> 00:00:16,830 +The normalization operation consists + +6 +00:00:16,830 --> 00:00:19,890 +in applying a succession +of normalization rules + +7 +00:00:19,890 --> 00:00:20,853 +to the raw text. + +8 +00:00:21,870 --> 00:00:25,710 +We choose normalization rules +to remove noise in the text + +9 +00:00:25,710 --> 00:00:27,900 +which seem useless for the learning + +10 +00:00:27,900 --> 00:00:30,363 +and use of our language model. + +11 +00:00:33,090 --> 00:00:37,470 +Let's take a very diverse +sentence with different fonts, + +12 +00:00:37,470 --> 00:00:39,780 +upper and lower case characters, + +13 +00:00:39,780 --> 00:00:43,083 +accents, punctuation and multiple spaces, + +14 +00:00:43,920 --> 00:00:46,683 +to see how several tokenizer normalize it. + +15 +00:00:48,488 --> 00:00:50,730 +The tokenizer from the FNet model + +16 +00:00:50,730 --> 00:00:53,700 +has transformed the +letter with font variants + +17 +00:00:53,700 --> 00:00:57,480 +or circled into their basic version + +18 +00:00:57,480 --> 00:00:59,733 +and has removed the multiple spaces. 
+ +19 +00:01:00,960 --> 00:01:03,960 +And now if we look at the normalization + +20 +00:01:03,960 --> 00:01:05,880 +with Retribert's tokenizer, + +21 +00:01:05,880 --> 00:01:08,010 +we can see that it keeps characters + +22 +00:01:08,010 --> 00:01:12,090 +with several font variants +and keeps the multiple spaces, + +23 +00:01:12,090 --> 00:01:14,223 +but it removes all the accents. + +24 +00:01:16,170 --> 00:01:18,870 +And if we continue to +test this normalization + +25 +00:01:18,870 --> 00:01:23,040 +of many other tokenizers +associated to models + +26 +00:01:23,040 --> 00:01:25,110 +that we can find on the Hub, + +27 +00:01:25,110 --> 00:01:28,833 +we see that they also propose +other kind of normalization. + +28 +00:01:33,900 --> 00:01:35,850 +With the fast tokenizers, + +29 +00:01:35,850 --> 00:01:39,060 +it's very easy to observe +the normalization chosen + +30 +00:01:39,060 --> 00:01:41,193 +for the currently loaded tokenizer. + +31 +00:01:42,330 --> 00:01:46,140 +Indeed, each instance of a fast tokenizer + +32 +00:01:46,140 --> 00:01:48,030 +has an underlying tokenizer + +33 +00:01:48,030 --> 00:01:51,390 +from the HuggingFace +Tokenizers library stored + +34 +00:01:51,390 --> 00:01:53,643 +in the backend_tokenizer attribute. + +35 +00:01:54,690 --> 00:01:58,470 +This object has itself +a normalizer attribute + +36 +00:01:58,470 --> 00:02:01,830 +that we can use thanks to +the normalize_str method + +37 +00:02:01,830 --> 00:02:03,153 +to normalize a string. + +38 +00:02:04,560 --> 00:02:08,700 +It is thus very practical +that this normalization, + +39 +00:02:08,700 --> 00:02:11,070 +which was used at the time of the training + +40 +00:02:11,070 --> 00:02:12,903 +of the tokenizer was saved, + +41 +00:02:13,857 --> 00:02:16,200 +and that it applies automatically + +42 +00:02:16,200 --> 00:02:19,233 +when you ask a trained +tokenizer to tokenize a text. + +43 +00:02:21,000 --> 00:02:25,500 +For example, if we hadn't +included the albert normalizer, + +44 +00:02:25,500 --> 00:02:28,770 +we would have had a lot of unknown tokens + +45 +00:02:28,770 --> 00:02:30,930 +by tokenizing this sentence + +46 +00:02:30,930 --> 00:02:33,213 +with accents and capital letters. + +47 +00:02:35,730 --> 00:02:38,370 +This transformation can +also be undetectable + +48 +00:02:38,370 --> 00:02:40,050 +with a simple print. + +49 +00:02:40,050 --> 00:02:42,810 +Indeed, keep in mind that for a computer, + +50 +00:02:42,810 --> 00:02:45,840 +text is only a succession of 0 and 1, + +51 +00:02:45,840 --> 00:02:47,820 +and it happens that different successions + +52 +00:02:47,820 --> 00:02:51,363 +of 0 and 1 render the +same printed character. + +53 +00:02:52,380 --> 00:02:56,403 +The 0 and 1 go in group +of 8 to form a byte. + +54 +00:02:57,480 --> 00:03:00,690 +The computer must then +decode this sequence of bytes + +55 +00:03:00,690 --> 00:03:02,493 +into a sequence of code points. + +56 +00:03:04,530 --> 00:03:09,530 +In our example, the 2 bytes +is decoded using UTF-8 + +57 +00:03:09,900 --> 00:03:11,403 +into a single code point. + +58 +00:03:12,450 --> 00:03:15,090 +The unicode standard then allows us + +59 +00:03:15,090 --> 00:03:18,191 +to find the character +corresponding to this code point, + +60 +00:03:18,191 --> 00:03:20,283 +the c cedilla. + +61 +00:03:21,499 --> 00:03:23,790 +Let's repeat the same operation + +62 +00:03:23,790 --> 00:03:26,577 +with this new sequence +composed of 3 bytes,. 
+ +63 +00:03:27,420 --> 00:03:30,543 +This time it is transformed +into two code points, + +64 +00:03:31,410 --> 00:03:35,280 +which also correspond to +the c cedilla character. + +65 +00:03:35,280 --> 00:03:36,780 +It is in fact the composition + +66 +00:03:36,780 --> 00:03:39,810 +of the unicode Latin Small Letter C + +67 +00:03:39,810 --> 00:03:42,240 +and the combining cedilla. + +68 +00:03:42,240 --> 00:03:45,000 +But it's annoying because +what appears to us + +69 +00:03:45,000 --> 00:03:46,680 +to be a single character + +70 +00:03:46,680 --> 00:03:49,653 +is not at all the same +thing for the computer. + +71 +00:03:52,470 --> 00:03:57,240 +Fortunately, there are unicode +standardization standards + +72 +00:03:57,240 --> 00:04:02,130 +known as NFC, NFD, NFKC or NFKD + +73 +00:04:02,130 --> 00:04:04,893 +that allow erasing some +of these differences. + +74 +00:04:05,730 --> 00:04:08,223 +These standards are +often used by tokenizers. + +75 +00:04:09,900 --> 00:04:12,090 +On all these previous examples, + +76 +00:04:12,090 --> 00:04:15,510 +even if the normalizations +changed the look of the text, + +77 +00:04:15,510 --> 00:04:17,970 +they did not change the content; + +78 +00:04:17,970 --> 00:04:19,177 +you could still read, + +79 +00:04:19,177 --> 00:04:21,987 +"Hello world, let's +normalize this sentence." + +80 +00:04:22,980 --> 00:04:25,980 +However, you must be aware +that some normalizations + +81 +00:04:25,980 --> 00:04:30,363 +can be very harmful if they are +not adapted to their corpus. + +82 +00:04:31,620 --> 00:04:34,387 +For example, if you take +the French sentence, + +83 +00:04:34,387 --> 00:04:38,790 +"Un pere indigne," which +means "An indignant father," + +84 +00:04:38,790 --> 00:04:42,510 +and normalize it with the +bert-base-uncase tokenizer + +85 +00:04:42,510 --> 00:04:44,313 +which removes the accent, + +86 +00:04:45,150 --> 00:04:48,000 +then the sentence +becomes "Un pere indigne" + +87 +00:04:48,000 --> 00:04:49,707 +which means "An unworthy father". + +88 +00:04:53,460 --> 00:04:56,760 +If you watched this video +to build your own tokenizer, + +89 +00:04:56,760 --> 00:04:59,610 +there are no absolute +rules to choose or not + +90 +00:04:59,610 --> 00:05:02,970 +a normalization for a new tokenizer, + +91 +00:05:02,970 --> 00:05:06,210 +but I advise you to take +the time to select them + +92 +00:05:06,210 --> 00:05:10,743 +so that they do not make you +lose important information. + +93 +00:05:12,296 --> 00:05:14,879 +(subtle blast) + diff --git a/subtitles/en/50_what-is-pre-tokenization.srt b/subtitles/en/50_what-is-pre-tokenization.srt index 425b0c7a0..840595abc 100644 --- a/subtitles/en/50_what-is-pre-tokenization.srt +++ b/subtitles/en/50_what-is-pre-tokenization.srt @@ -1,115 +1,193 @@ -1 -00:00:05,549 --> 00:00:12,309 -The tokenization pipeline involves several -steps that convert raw text into numbers. - -2 -00:00:12,309 --> 00:00:15,990 -In this video, we will see what happens during -the pre-tokenization step. - -3 -00:00:15,990 --> 00:00:23,840 -The pre-tokenization operation is the operation -performed after the normalization of the text - -4 -00:00:23,840 --> 00:00:28,830 -and before the application of the tokenization -algorithm. - -5 -00:00:28,830 --> 00:00:33,489 -This step consists in applying rules that -do not need to be learned to perform a first - -6 -00:00:33,489 --> 00:00:38,270 -division of the text. - -7 -00:00:38,270 --> 00:00:46,270 -Let's look at how several tokenizers pre_tokenize -this example. 
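A hedged sketch of how these pre-tokenizations can be inspected with the pre_tokenize_str method presented later in this video; the three checkpoints are assumed examples.

from transformers import AutoTokenizer

text = "Let's pre-tokenize    this example!"

for checkpoint in ["gpt2", "albert-base-v2", "bert-base-uncased"]:
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    pre_tokenizer = tokenizer.backend_tokenizer.pre_tokenizer
    # Returns a list of (word, (start, end)) pairs; for gpt2 the spaces show
    # up as the 'Ġ' symbol mentioned in the video.
    print(checkpoint, pre_tokenizer.pre_tokenize_str(text))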
- -8 -00:00:46,270 --> 00:00:53,430 -The gpt 2 pretokenization divides the text -on spaces and some punctuation - but the apostrophe - -9 -00:00:53,430 --> 00:00:57,840 -is not a division criterion for example. - -10 -00:00:57,840 --> 00:01:06,580 -We also notice that spaces have been replaced -by a capital G with a dot above. - -11 -00:01:06,580 --> 00:01:12,900 -Albert's pre-tokenization divides the text -at the level of spaces, adds a space at the - -12 -00:01:12,900 --> 00:01:19,610 -beginning of the sentence and replaces spaces -with a special underscore. - -13 -00:01:19,610 --> 00:01:29,320 -Finally, Bert's pre-tokenization divides the -text at the level of punctuation and spaces. - -14 -00:01:29,320 --> 00:01:35,460 -Unlike the previous tokenizers, spaces are -not transformed and integrated to the tokens - -15 -00:01:35,460 --> 00:01:40,079 -produced with this pre-tokenizer. - -16 -00:01:40,079 --> 00:01:45,860 -Through these 3 examples, we could observe -the two main types of operations brought by - -17 -00:01:45,860 --> 00:01:54,210 -the pre-tokenization: some changes on the -text and the division of the string into tokens - -18 -00:01:54,210 --> 00:01:57,259 -that can be associated to words. - -19 -00:01:57,259 --> 00:02:06,729 -Finally, the "backend_tokenizer" of the fast -tokenizers also allows to test the pre-tokenization - -20 -00:02:06,729 --> 00:02:12,739 -operation very easily thanks to its "pre_tokenize_str" -method. - -21 -00:02:12,739 --> 00:02:18,740 -We notice that the output of this operation -is composed of both tokens and offsets which - -22 -00:02:18,740 --> 00:02:24,830 -allow to link the token to its position in -the text given in input of the method. - -23 -00:02:24,830 --> 00:02:32,269 -This operation defines the largest tokens -that can be produced by the tokenization or - -24 -00:02:32,269 --> 00:02:48,389 -in other words the barriers of the sub-tokens -which will be produced then. +1 +00:00:05,550 --> 00:00:08,910 +- The tokenization pipeline +involves several steps + +2 +00:00:08,910 --> 00:00:11,073 +that converts raw text into numbers. + +3 +00:00:12,180 --> 00:00:14,280 +In this video, we will see what happens + +4 +00:00:14,280 --> 00:00:16,293 +during the pre-tokenization step. + +5 +00:00:18,390 --> 00:00:22,110 +The pre-tokenization operation +is the operation performed + +6 +00:00:22,110 --> 00:00:24,630 +after the normalization of the text + +7 +00:00:24,630 --> 00:00:27,633 +and before the application of +the tokenization algorithm. + +8 +00:00:29,112 --> 00:00:31,110 +This step consists in applying rules + +9 +00:00:31,110 --> 00:00:32,550 +that do not need to be learned + +10 +00:00:32,550 --> 00:00:34,563 +to perform a first division of the text. + +11 +00:00:38,160 --> 00:00:41,310 +Let's look at how several tokenizers + +12 +00:00:41,310 --> 00:00:43,143 +pre-tokenize in this example. + +13 +00:00:46,200 --> 00:00:50,820 +The gpt2 pre-tokenization +divides the text on spaces + +14 +00:00:50,820 --> 00:00:55,820 +and some punctuation, but +not on the apostrophe. + +15 +00:00:57,750 --> 00:01:01,170 +We also notice that +spaces have been replaced + +16 +00:01:01,170 --> 00:01:03,813 +by capital G with a dot above. + +17 +00:01:07,170 --> 00:01:09,540 +Albert's pre-tokenization divides the text + +18 +00:01:09,540 --> 00:01:11,043 +at the level of spaces, + +19 +00:01:11,970 --> 00:01:15,300 +adds a space at the +beginning of the sentence, + +20 +00:01:15,300 --> 00:01:18,873 +and replaces spaces with +a special underscore. 
+
+21
+00:01:20,580 --> 00:01:24,780
+Finally, Bert's pre-tokenization
+divides the text
+
+22
+00:01:24,780 --> 00:01:28,083
+at the level of punctuation and spaces.
+
+23
+00:01:28,920 --> 00:01:31,260
+But unlike the previous tokenizers,
+
+24
+00:01:31,260 --> 00:01:33,780
+spaces are not transformed
+
+25
+00:01:33,780 --> 00:01:37,293
+and integrated into tokens
+produced with this pre-tokenizer.
+
+26
+00:01:40,080 --> 00:01:42,120
+Through these three examples,
+
+27
+00:01:42,120 --> 00:01:45,330
+we could observe the two
+main types of operation
+
+28
+00:01:45,330 --> 00:01:47,073
+brought by the pre-tokenization;
+
+29
+00:01:48,420 --> 00:01:49,900
+some changes on the text
+
+30
+00:01:50,820 --> 00:01:54,180
+and the division of the string into tokens
+
+31
+00:01:54,180 --> 00:01:56,043
+that can be associated to words.
+
+32
+00:01:59,430 --> 00:02:04,230
+Finally, the backend tokenizer
+of the fast tokenizers
+
+33
+00:02:04,230 --> 00:02:07,680
+also allows us to test the
+pre-tokenization operation
+
+34
+00:02:07,680 --> 00:02:11,253
+very easily, thanks to its
+pre_tokenize_str method.
+
+35
+00:02:12,630 --> 00:02:14,970
+We notice that the
+output of this operation
+
+36
+00:02:14,970 --> 00:02:18,450
+is composed of both tokens and offsets,
+
+37
+00:02:18,450 --> 00:02:21,960
+which allow us to link each token
+to its position in the text
+
+38
+00:02:21,960 --> 00:02:23,943
+given as input to the method.
+
+39
+00:02:25,650 --> 00:02:28,860
+This operation defines the largest tokens
+
+40
+00:02:28,860 --> 00:02:31,740
+that can be produced by the tokenization,
+
+41
+00:02:31,740 --> 00:02:36,090
+or in other words, the
+boundaries of the sub-tokens
+
+42
+00:02:36,090 --> 00:02:37,653
+which will be produced then.
+
+43
+00:02:40,050 --> 00:02:41,850
+And that's all for the characteristics
+
+44
+00:02:41,850 --> 00:02:43,203
+of the pre-tokenizers.
+
diff --git a/subtitles/en/51_byte-pair-encoding-tokenization.srt b/subtitles/en/51_byte-pair-encoding-tokenization.srt
index 6dfbe13d7..79c87c176 100644
--- a/subtitles/en/51_byte-pair-encoding-tokenization.srt
+++ b/subtitles/en/51_byte-pair-encoding-tokenization.srt
@@ -1,204 +1,377 @@
-1
-00:00:05,120 --> 00:00:07,440
-You are at the right place if you want to
-
-2
-00:00:07,440 --> 00:00:15,360
-understand what the Byte pair Encoding subword
-tokenization algorithm is, how to train it
-
-3
-00:00:15,360 --> 00:00:18,640
-and how the tokenization of a
-text is done with this algorithm.
-
-4
-00:00:21,600 --> 00:00:25,920
-The BPE algorithm was initially
-proposed as a text compression algorithm
-
-5
-00:00:26,640 --> 00:00:30,800
-but it is also very well suited as a
-tokenizer for your language models.
-
-6
-00:00:32,560 --> 00:00:38,720
-The idea of BPE is to divide words into a
-sequence of "subword units" which are units
-
-7
-00:00:38,720 --> 00:00:44,400
-that appear frequently in a reference corpus
-- that is, the corpus we used to train it.
-
-8
-00:00:46,560 --> 00:00:53,680
-How is a BPE tokenizer trained? First of all,
-we have to get a corpus of texts. We will not
-
-9
-00:00:54,480 --> 00:01:02,080
-train our tokenizer on this raw text but we will
-first normalize it then pre-tokenize it. As the
-
-10
-00:01:02,080 --> 00:01:07,520
-pre-tokenization divides the text into a list
-of words, we can represent our corpus in another
-
-11
-00:01:07,520 --> 00:01:14,000
-way by gathering together the same words and by
-maintaining a counter, here represented in blue.
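A minimal sketch of the word counter mentioned above, on an assumed tiny corpus rather than the exact counts shown on screen:

from collections import Counter

# Assumed corpus, already normalized and pre-tokenized into words.
corpus = [
    ["hug", "hug", "hugging", "huggingface"],
    ["hugger", "hugging", "hug"],
]

word_freqs = Counter(word for sentence in corpus for word in sentence)
print(word_freqs)   # e.g. Counter({'hug': 3, 'hugging': 2, 'huggingface': 1, 'hugger': 1})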
- -12 -00:01:17,120 --> 00:01:22,960 -To understand how the training works, we consider  -this toy corpus composed of the following words:   - -13 -00:01:23,520 --> 00:01:32,480 -huggingface, hugging, hug, hugger, etc. BPE is an  -algorithm that starts with an initial vocabulary   - -14 -00:01:32,480 --> 00:01:35,200 -and then increases it to the desired size.   - -15 -00:01:36,240 --> 00:01:41,360 -To build the initial vocabulary, we start  -by separating each word of the corpus   - -16 -00:01:41,360 --> 00:01:46,640 -into a list of elementary units that  -compose them -here the characters.   - -17 -00:01:50,800 --> 00:01:51,360 -We could also have chosen bytes as elementary  -units but it would have been less visual. We list   - -18 -00:01:51,360 --> 00:01:57,760 -in our vocabulary all the characters that appear  -and that will constitute our initial vocabulary!   - -19 -00:02:00,240 --> 00:02:09,840 -Let's now see how to increase it. We return to  -our split corpus, we will go through the words   - -20 -00:02:09,840 --> 00:02:18,480 -one by one and count all the occurrences of token  -pairs. The first pair is composed of the token "h"   - -21 -00:02:18,480 --> 00:02:26,080 -and "u", the second 'u' and "g", and we continue  -like that until we have the complete list.   - -22 -00:02:35,440 --> 00:02:41,200 -Once we know all the pairs and their frequency  -of appearance, we will choose the one that   - -23 -00:02:41,200 --> 00:02:49,840 -appears the most frequently: here it is the  -pair composed of the letters 'l' and 'e'.   - -24 -00:02:51,680 --> 00:02:57,040 -We note our first merging rule and we  -add the new token to our vocabulary.   - -25 -00:03:00,080 --> 00:03:04,080 -We can then apply this merging rule to our splits:   - -26 -00:03:04,080 --> 00:03:09,280 -you can see that we have merged all the pairs  -of tokens composed of the tokens "l" and "e".   - -27 -00:03:13,840 --> 00:03:19,040 -And now we just have to reproduce  -the same steps with our new splits:   - -28 -00:03:21,520 --> 00:03:24,640 -we calculate the frequency of  -occurrence of each pair of tokens,   - -29 -00:03:27,760 --> 00:03:33,680 -we select the pair with the highest  -frequency, we note it in our merge rules,   - -30 -00:03:35,760 --> 00:03:38,720 -we add the new one to the vocabulary   - -31 -00:03:39,600 --> 00:03:46,160 -and then we merge all the pairs of tokens composed  -of the token "le" and "a" into our splits.   - -32 -00:03:50,160 --> 00:03:59,840 -And we can repeat this operation until  -we reach the desired vocabulary size.   - -33 -00:04:05,600 --> 00:04:13,200 -Here we stopped when our vocabulary reached 21  -tokens. We can see now that the words of our   - -34 -00:04:13,200 --> 00:04:20,560 -corpus are now divided into far fewer tokens than  -at the beginning of the training. We can see that   - -35 -00:04:20,560 --> 00:04:27,840 -our algorithm has learned the radicals "hug"  -and "learn" and also the verbal ending "ing".   - -36 -00:04:29,760 --> 00:04:35,600 -Now that we have learned our vocabulary and  -our merging rules, we can tokenize new texts.   - -37 -00:04:37,840 --> 00:04:41,120 -For example, if we want to tokenize the word   - -38 -00:04:41,120 --> 00:04:48,480 -hugs: first we'll divide it into elementary  -units so it became a sequence of characters.   - -39 -00:04:49,840 --> 00:04:53,680 -Then we'll go through our merge rules  -until we have one that we can apply.   - -40 -00:04:54,480 --> 00:05:01,040 -Here we can merge the letters h and u. 
And here  -we can merge 2 tokens to get the new token hug.   - -41 -00:05:02,240 --> 00:05:09,840 -When we get to the end of our merge  -rule the tokenization is finished.   - -42 -00:05:10,640 --> 00:05:22,400 -ßAnd that's it, I hope that now the BPE  -algorithm has no more secret for you! +1 +00:00:00,125 --> 00:00:05,125 +(air whooshing) + +2 +00:00:05,190 --> 00:00:06,720 +- You are at the right place + +3 +00:00:06,720 --> 00:00:10,464 +if you want to understand +what the Byte Pair Encoding + +4 +00:00:10,464 --> 00:00:13,263 +subword tokenization algorithm is, + +5 +00:00:14,160 --> 00:00:15,505 +how to train it + +6 +00:00:15,505 --> 00:00:17,790 +and how the tokenization of a text is done + +7 +00:00:17,790 --> 00:00:19,107 +with this algorithm. + +8 +00:00:21,417 --> 00:00:22,920 +The BPE algorithm + +9 +00:00:22,920 --> 00:00:26,820 +was initially proposed as a +text compression algorithm + +10 +00:00:26,820 --> 00:00:28,770 +but it is also very well suited + +11 +00:00:28,770 --> 00:00:31,143 +as a tokenizer for your language models. + +12 +00:00:32,910 --> 00:00:34,890 +The idea of BPE is to divide words + +13 +00:00:34,890 --> 00:00:36,933 +into a sequence of 'subword units' + +14 +00:00:38,100 --> 00:00:41,970 +which are units that appear +frequently in a reference corpus + +15 +00:00:41,970 --> 00:00:44,613 +which is, the corpus we used to train it. + +16 +00:00:46,701 --> 00:00:49,083 +How is a BPE tokenizer trained? + +17 +00:00:50,100 --> 00:00:53,340 +First of all, we have to +get a corpus of texts. + +18 +00:00:53,340 --> 00:00:56,940 +We will not train our +tokenizer on this raw text + +19 +00:00:56,940 --> 00:00:59,490 +but we will first normalize it + +20 +00:00:59,490 --> 00:01:00,873 +then pre-tokenize it. + +21 +00:01:01,890 --> 00:01:03,240 +As the pre-tokenization + +22 +00:01:03,240 --> 00:01:05,790 +divides the text into a list of words, + +23 +00:01:05,790 --> 00:01:08,400 +we can represent our corpus in another way + +24 +00:01:08,400 --> 00:01:10,350 +by gathering together the same words + +25 +00:01:10,350 --> 00:01:12,450 +and by maintaining a counter, + +26 +00:01:12,450 --> 00:01:14,223 +here represented in blue. + +27 +00:01:17,340 --> 00:01:19,860 +To understand how the training works, + +28 +00:01:19,860 --> 00:01:23,730 +we consider this toy corpus +composed of the following words: + +29 +00:01:23,730 --> 00:01:28,203 +huggingface, hugging, hug, hugger, etc. + +30 +00:01:29,100 --> 00:01:32,640 +BPE is an algorithm that starts +with an initial vocabulary + +31 +00:01:32,640 --> 00:01:35,583 +and then increases it to the desired size. + +32 +00:01:36,450 --> 00:01:38,460 +To build the initial vocabulary, + +33 +00:01:38,460 --> 00:01:41,550 +we start by separating +each word of the corpus + +34 +00:01:41,550 --> 00:01:44,253 +into a list of elementary +units that compose them, + +35 +00:01:45,210 --> 00:01:47,013 +here, the characters. + +36 +00:01:50,850 --> 00:01:54,310 +We list in our vocabulary all +the characters that appear + +37 +00:01:55,218 --> 00:01:58,053 +and that will constitute +our initial vocabulary. + +38 +00:02:00,420 --> 00:02:02,523 +Let's now see how to increase it. + +39 +00:02:05,520 --> 00:02:08,250 +We return to our split corpus, + +40 +00:02:08,250 --> 00:02:11,340 +we will go through the words one by one + +41 +00:02:11,340 --> 00:02:14,313 +and count all the +occurrences of token pairs. 
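A hedged sketch of the pair-counting step just described, with assumed word frequencies and the character-level splits used at the start of training:

from collections import defaultdict

# Assumed state: each word of the corpus with its frequency and its current split.
word_freqs = {"hug": 10, "pug": 5, "hugs": 5}
splits = {word: list(word) for word in word_freqs}

pair_freqs = defaultdict(int)
for word, freq in word_freqs.items():
    split = splits[word]
    for pair in zip(split, split[1:]):
        pair_freqs[pair] += freq

best_pair = max(pair_freqs, key=pair_freqs.get)
print(best_pair, pair_freqs[best_pair])   # the most frequent pair, ('u', 'g') here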
+ +42 +00:02:15,450 --> 00:02:18,397 +The first pair is composed +of the token 'h' and 'u', + +43 +00:02:20,130 --> 00:02:23,067 +the second 'u' and 'g', + +44 +00:02:23,067 --> 00:02:26,253 +and we continue like that until +we have the complete list. + +45 +00:02:35,580 --> 00:02:37,724 +Once we know all the pairs + +46 +00:02:37,724 --> 00:02:40,140 +and their frequency of appearance, + +47 +00:02:40,140 --> 00:02:42,940 +we will choose the one that +appears the most frequently. + +48 +00:02:44,220 --> 00:02:47,697 +Here it is the pair composed +of the letters 'l' and 'e'. + +49 +00:02:51,930 --> 00:02:53,590 +We note our first merging rule + +50 +00:02:54,593 --> 00:02:57,243 +and we add the new +token to our vocabulary. + +51 +00:03:00,330 --> 00:03:04,260 +We can then apply this +merging rule to our splits. + +52 +00:03:04,260 --> 00:03:07,350 +You can see that we have +merged all the pairs of tokens + +53 +00:03:07,350 --> 00:03:09,793 +composed of the tokens 'l' and 'e'. + +54 +00:03:14,008 --> 00:03:18,150 +And now, we just have to +reproduce the same steps + +55 +00:03:18,150 --> 00:03:19,353 +with our new splits. + +56 +00:03:21,750 --> 00:03:23,460 +We calculate the frequency of occurrence + +57 +00:03:23,460 --> 00:03:25,023 +of each pair of tokens, + +58 +00:03:27,990 --> 00:03:30,603 +we select the pair with +the highest frequency, + +59 +00:03:32,190 --> 00:03:34,083 +we note it in our merge rules, + +60 +00:03:36,000 --> 00:03:39,360 +we add the new one token the vocabulary + +61 +00:03:39,360 --> 00:03:41,880 +and then we merge all the pairs of tokens + +62 +00:03:41,880 --> 00:03:46,503 +composed of the token 'le' +and 'a' into our splits. + +63 +00:03:50,323 --> 00:03:51,960 +And we can repeat this operation + +64 +00:03:51,960 --> 00:03:54,843 +until we reach the +desired vocabulary size. + +65 +00:04:05,671 --> 00:04:10,671 +Here, we stopped when our +vocabulary reached 21 tokens. + +66 +00:04:11,040 --> 00:04:13,920 +We can see now that +the words of our corpus + +67 +00:04:13,920 --> 00:04:17,040 +are now divided into far fewer tokens + +68 +00:04:17,040 --> 00:04:20,280 +than at the beginning of the training. + +69 +00:04:20,280 --> 00:04:21,720 +And that our algorithm + +70 +00:04:21,720 --> 00:04:24,990 +has learned the radicals 'hug' and 'learn' + +71 +00:04:24,990 --> 00:04:27,537 +and also the verbal ending 'ing'. + +72 +00:04:29,880 --> 00:04:32,160 +Now that we have learned our vocabulary + +73 +00:04:32,160 --> 00:04:35,943 +and merging rules, we +can tokenize new texts. + +74 +00:04:37,980 --> 00:04:39,210 +For example, + +75 +00:04:39,210 --> 00:04:41,160 +if we want to tokenize the word 'hugs', + +76 +00:04:42,960 --> 00:04:46,680 +first we'll divide it +into elementary units + +77 +00:04:46,680 --> 00:04:48,843 +so it became a sequence of characters. + +78 +00:04:50,040 --> 00:04:52,020 +Then, we'll go through our merge rules + +79 +00:04:52,020 --> 00:04:54,690 +until we have one we can apply. + +80 +00:04:54,690 --> 00:04:57,930 +Here, we can merge the +letters 'h' and 'u'. + +81 +00:04:57,930 --> 00:05:01,467 +And here, we can merge 2 tokens +to get the new token 'hug'. + +82 +00:05:02,400 --> 00:05:05,760 +When we get to the end of our merge rules, + +83 +00:05:05,760 --> 00:05:07,563 +the tokenization is finished. + +84 +00:05:10,650 --> 00:05:11,727 +And that's it. + +85 +00:05:12,846 --> 00:05:14,850 +I hope that now the BPE algorithm + +86 +00:05:14,850 --> 00:05:16,413 +has no more secret for you! 
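A compact, hedged sketch of this whole toy training loop and of tokenizing a new word with the learned merge rules; the corpus counts and the target vocabulary size are assumptions.

from collections import defaultdict

word_freqs = {"hug": 10, "pug": 5, "hugs": 5, "huggingface": 2}   # assumed counts
splits = {word: list(word) for word in word_freqs}
vocab = sorted({char for word in word_freqs for char in word})    # initial vocabulary
merges = []                                                       # learned merge rules
target_vocab_size = 15                                            # assumed target size

def count_pairs():
    pairs = defaultdict(int)
    for word, freq in word_freqs.items():
        split = splits[word]
        for pair in zip(split, split[1:]):
            pairs[pair] += freq
    return pairs

def apply_merge(first, second, split):
    # Replace every occurrence of the pair (first, second) by the merged token.
    i = 0
    while i < len(split) - 1:
        if split[i] == first and split[i + 1] == second:
            split[i : i + 2] = [first + second]
        else:
            i += 1

while len(vocab) < target_vocab_size:
    pairs = count_pairs()
    if not pairs:
        break
    first, second = max(pairs, key=pairs.get)   # most frequent pair
    merges.append((first, second))
    vocab.append(first + second)
    for split in splits.values():
        apply_merge(first, second, split)

def tokenize(word):
    split = list(word)
    for first, second in merges:                # apply the merges in learned order
        apply_merge(first, second, split)
    return split

print(merges)
print(tokenize("hugs"))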
+ +87 +00:05:17,739 --> 00:05:20,406 +(air whooshing) + diff --git a/subtitles/en/52_wordpiece-tokenization.srt b/subtitles/en/52_wordpiece-tokenization.srt index acfd9947d..fb0b3a571 100644 --- a/subtitles/en/52_wordpiece-tokenization.srt +++ b/subtitles/en/52_wordpiece-tokenization.srt @@ -1,154 +1,290 @@ -1 -00:00:05,520 --> 00:00:10,000 -Let's see together what is the training  -strategy of the WordPiece algorithm   - -2 -00:00:10,560 --> 00:00:15,920 -and how it performs the  -tokenization of a text once trained   - -3 -00:00:19,200 --> 00:00:25,280 -WordPiece is a tokenization algorithm introduced  -by Google. It is used for example by Bert.   - -4 -00:00:26,480 --> 00:00:30,640 -To our knowledge, the code of Word  -Pieces has not been open sourced,   - -5 -00:00:31,360 --> 00:00:36,640 -so we base our explanations on our own  -interpretation of the published literature.   - -6 -00:00:42,480 --> 00:00:48,480 -What is the training strategy of  -WordPiece? Similarly to the BPE algorithm,   - -7 -00:00:48,480 --> 00:00:54,480 -WordPiece starts by establishing an initial  -vocabulary composed of elementary units   - -8 -00:00:54,480 --> 00:01:01,760 -and then increases this vocabulary to the  -desired size. To build the initial vocabulary,   - -9 -00:01:01,760 --> 00:01:07,120 -we divide each word in the training corpus  -into the sequence of letters that make it up.   - -10 -00:01:08,240 --> 00:01:14,000 -As you can see, there is a small subtlety:  -we add a 2 hashtags in front of the letters   - -11 -00:01:14,000 --> 00:01:20,240 -that do not start a word. By keeping  -only one occurrence per elementary unit   - -12 -00:01:20,240 --> 00:01:29,440 -we now have our initial vocabulary. We will  -list all the existing pairs in our corpus.   - -13 -00:01:30,800 --> 00:01:34,960 -Once we have this list, we will calculate  -a score for each of these pairs.   - -14 -00:01:36,400 --> 00:01:40,400 -As for the BPE algorithm, we will  -select the pair with the highest score.   - -15 -00:01:43,040 --> 00:01:50,000 -Taking for example the first pair composed  -of H and U. The score of a pair is simply   - -16 -00:01:50,000 --> 00:01:54,720 -equal to the frequency of appearance of  -the pair divided by the product of the   - -17 -00:01:54,720 --> 00:01:59,840 -frequency of appearance of the first token by  -the frequency of appearance of the second token.   - -18 -00:02:01,120 --> 00:02:04,560 -Thus at a fixed frequency  -of appearance of the pair,   - -19 -00:02:05,360 --> 00:02:11,440 -if the subparts of the pair are very frequent  -in the corpus then this score will be decreased.   - -20 -00:02:12,960 --> 00:02:24,000 -In our example, the pair "hu" appears 4 times, the  -letter "h" 4 times and the letter u 4 times. This   - -21 -00:02:24,000 --> 00:02:32,320 -gives us a score of 0.25. Now that we know how to  -calculate this score, we can do it for all pairs.   - -22 -00:02:33,200 --> 00:02:36,480 -We can now add to the vocabulary  -the pair with the highest score,   - -23 -00:02:37,120 --> 00:02:43,520 -after merging it of course! And now we can  -apply this same fusion to our split corpus.   - -24 -00:02:45,600 --> 00:02:51,520 -As you can imagine, we just have to repeat the  -same operations until we have the vocabulary at   - -25 -00:02:51,520 --> 00:03:00,320 -the desired size! Let's look at a few more steps  -to see the evolution of our vocabulary and the   - -26 -00:03:00,320 --> 00:03:09,840 -length of the splits getting shorter. 
Now that we  -are happy with our vocabulary, you are probably   - -27 -00:03:09,840 --> 00:03:16,400 -wondering how to use it to tokenize a text. Let's  -say we want to tokenize the word "huggingface".   - -28 -00:03:17,760 --> 00:03:23,280 -WordPiece follows these rules: We will look for  -the longest possible token at the beginning of   - -29 -00:03:23,280 --> 00:03:30,560 -our word. Then we start again on the remaining  -part of our word. And so on until we reach the   - -30 -00:03:30,560 --> 00:03:38,240 -end! And that's it, huggingface is divided  -into 4 sub-tokens. ßThis video is about to   - -31 -00:03:38,240 --> 00:03:43,040 -end, I hope it helped you to understand  -better what is behind the word WordPiece! +1 +00:00:00,151 --> 00:00:02,818 +(air whooshing) + +2 +00:00:05,520 --> 00:00:08,370 +- Let's see together what +is the training strategy + +3 +00:00:08,370 --> 00:00:11,851 +of the WordPiece algorithm, +and how it performs + +4 +00:00:11,851 --> 00:00:15,150 +the tokenization of a text, once trained. + +5 +00:00:19,351 --> 00:00:23,580 +WordPiece is a tokenization +algorithm introduced by Google. + +6 +00:00:23,580 --> 00:00:25,653 +It is used, for example, by BERT. + +7 +00:00:26,640 --> 00:00:28,020 +To our knowledge, + +8 +00:00:28,020 --> 00:00:31,590 +the code of WordPiece +has not been open source. + +9 +00:00:31,590 --> 00:00:33,510 +So we base our explanations + +10 +00:00:33,510 --> 00:00:36,903 +on our own interpretation +of the published literature. + +11 +00:00:42,090 --> 00:00:44,883 +So, what is the training +strategy of WordPiece? + +12 +00:00:46,200 --> 00:00:48,663 +Similarly to the BPE algorithm, + +13 +00:00:48,663 --> 00:00:52,380 +WordPiece starts by establishing +an initial vocabulary + +14 +00:00:52,380 --> 00:00:54,660 +composed of elementary units, + +15 +00:00:54,660 --> 00:00:58,773 +and then increases this +vocabulary to the desired size. + +16 +00:00:59,970 --> 00:01:01,950 +To build the initial vocabulary, + +17 +00:01:01,950 --> 00:01:04,920 +we divide each word in the training corpus + +18 +00:01:04,920 --> 00:01:07,443 +into the sequence of +letters that make it up. + +19 +00:01:08,430 --> 00:01:11,820 +As you can see, there is a small subtlety. + +20 +00:01:11,820 --> 00:01:14,190 +We add two hashtags in +front of the letters + +21 +00:01:14,190 --> 00:01:16,083 +that do not start a word. + +22 +00:01:17,190 --> 00:01:20,430 +By keeping only one occurrence +per elementary unit, + +23 +00:01:20,430 --> 00:01:23,313 +we now have our initial vocabulary. + +24 +00:01:26,580 --> 00:01:29,823 +We will list all the +existing pairs in our corpus. + +25 +00:01:30,990 --> 00:01:32,640 +Once we have this list, + +26 +00:01:32,640 --> 00:01:35,253 +we will calculate a score +for each of these pairs. + +27 +00:01:36,630 --> 00:01:38,400 +As for the BPE algorithm, + +28 +00:01:38,400 --> 00:01:40,750 +we will select the pair +with the highest score. + +29 +00:01:43,260 --> 00:01:44,340 +Taking for example, + +30 +00:01:44,340 --> 00:01:47,343 +the first pair composed +of the letters H and U. + +31 +00:01:48,510 --> 00:01:51,390 +The score of a pair is +simply equal to the frequency + +32 +00:01:51,390 --> 00:01:54,510 +of appearance of the pair, +divided by the product + +33 +00:01:54,510 --> 00:01:57,330 +of the frequency of +appearance of the first token, + +34 +00:01:57,330 --> 00:02:00,063 +by the frequency of appearance +of the second token. 
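A hedged sketch of the pair score just described, reproducing the 0.25 example with the assumed counts:

def pair_score(pair_freq, first_freq, second_freq):
    # Frequent subparts lower the score, so WordPiece favors merging pieces
    # that mostly appear together.
    return pair_freq / (first_freq * second_freq)

# The pair 'hu' appears 4 times, 'h' 4 times and 'u' 4 times in the toy corpus.
print(pair_score(4, 4, 4))   # 0.25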
+ +35 +00:02:01,260 --> 00:02:05,550 +Thus, at a fixed frequency +of appearance of the pair, + +36 +00:02:05,550 --> 00:02:09,913 +if the subparts of the pair are +very frequent in the corpus, + +37 +00:02:09,913 --> 00:02:11,823 +then this score will be decreased. + +38 +00:02:13,140 --> 00:02:17,460 +In our example, the pair +HU appears four times, + +39 +00:02:17,460 --> 00:02:22,460 +the letter H four times, +and the letter U four times. + +40 +00:02:24,030 --> 00:02:26,733 +This gives us a score of 0.25. + +41 +00:02:28,410 --> 00:02:30,960 +Now that we know how to +calculate this score, + +42 +00:02:30,960 --> 00:02:33,360 +we can do it for all pairs. + +43 +00:02:33,360 --> 00:02:35,217 +We can now add to the vocabulary + +44 +00:02:35,217 --> 00:02:38,973 +the pair with the highest score, +after merging it of course. + +45 +00:02:40,140 --> 00:02:43,863 +And now we can apply this same +fusion to our split corpus. + +46 +00:02:45,780 --> 00:02:47,490 +As you can imagine, + +47 +00:02:47,490 --> 00:02:50,130 +we just have to repeat the same operations + +48 +00:02:50,130 --> 00:02:53,013 +until we have the vocabulary +at the desired size. + +49 +00:02:54,000 --> 00:02:55,800 +Let's look at a few more steps + +50 +00:02:55,800 --> 00:02:58,113 +to see the evolution of our vocabulary, + +51 +00:02:58,957 --> 00:03:01,773 +and also the evolution of +the length of the splits. + +52 +00:03:06,390 --> 00:03:09,180 +And now that we are happy +with our vocabulary, + +53 +00:03:09,180 --> 00:03:12,663 +you are probably wondering how +to use it to tokenize a text. + +54 +00:03:13,830 --> 00:03:17,640 +Let's say we want to tokenize +the word "huggingface". + +55 +00:03:17,640 --> 00:03:20,310 +WordPiece follows these rules. + +56 +00:03:20,310 --> 00:03:22,530 +We will look for the +longest possible token + +57 +00:03:22,530 --> 00:03:24,960 +at the beginning of the word. + +58 +00:03:24,960 --> 00:03:28,920 +Then we start again on the +remaining part of our word, + +59 +00:03:28,920 --> 00:03:31,143 +and so on until we reach the end. + +60 +00:03:32,100 --> 00:03:35,973 +And that's it. Huggingface is +divided into four sub-tokens. + +61 +00:03:37,200 --> 00:03:39,180 +This video is about to end. + +62 +00:03:39,180 --> 00:03:41,370 +I hope it helped you to understand better + +63 +00:03:41,370 --> 00:03:43,653 +what is behind the work, WordPiece. + +64 +00:03:45,114 --> 00:03:47,864 +(air whooshing) + diff --git a/subtitles/en/53_unigram-tokenization.srt b/subtitles/en/53_unigram-tokenization.srt index 265b9eee1..cc6bae91e 100644 --- a/subtitles/en/53_unigram-tokenization.srt +++ b/subtitles/en/53_unigram-tokenization.srt @@ -1,444 +1,707 @@ -1 -00:00:05,330 --> 00:00:11,090 -In this video, we will study together "the -Unigram Language Model subword tokenization - -2 -00:00:11,090 --> 00:00:12,090 -algorithm". - -3 -00:00:12,090 --> 00:00:20,080 -The overall training strategy of a Unigram -LM tokenizer is to start with a very large - -4 -00:00:20,080 --> 00:00:27,439 -vocabulary and then to remove tokens at each -iteration until we reach the desired size. - -5 -00:00:27,439 --> 00:00:32,250 -At each iteration, we will calculate a loss -on our training corpus thanks to the Unigram - -6 -00:00:32,250 --> 00:00:33,250 -model. - -7 -00:00:33,250 --> 00:00:39,160 -As the loss calculation depends on the available -vocabulary, we can use it to choose how to - -8 -00:00:39,160 --> 00:00:41,590 -reduce the vocabulary. 
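Before continuing with Unigram, a hedged sketch of the WordPiece longest-match-first rule described above, with an assumed toy vocabulary rather than the one trained in the video:

# Assumed vocabulary; '##' marks pieces that do not start a word.
vocab = {"hugg", "##ing", "##fac", "##e", "h", "##u", "##g", "[UNK]"}

def wordpiece_tokenize(word):
    tokens = []
    while word:
        # Look for the longest piece of the remaining word that is in the vocabulary.
        end = len(word)
        while end > 0:
            piece = word[:end] if not tokens else "##" + word[:end]
            if piece in vocab:
                break
            end -= 1
        if end == 0:
            return ["[UNK]"]   # no known piece: the whole word is unknown
        tokens.append(piece)
        word = word[end:]
    return tokens

print(wordpiece_tokenize("huggingface"))   # ['hugg', '##ing', '##fac', '##e']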
- -9 -00:00:41,590 --> 00:00:48,090 -So we look at the evolution of the loss by -removing in turn each token from the vocabulary. - -10 -00:00:48,090 --> 00:00:56,730 -We will choose to remove the p percents which -increase the loss the less. - -11 -00:00:56,730 --> 00:01:01,030 -Before going further in the explanation of -the training algorithm, I need to explain - -12 -00:01:01,030 --> 00:01:04,199 -what is an Unigram model. - -13 -00:01:04,199 --> 00:01:08,119 -The Unigram LM model is a type of statistical -Language Modem. - -14 -00:01:08,119 --> 00:01:15,550 -A statistical LM will assign a probability -to a text considering that the text is in - -15 -00:01:15,550 --> 00:01:18,189 -fact a sequence of tokens. - -16 -00:01:18,189 --> 00:01:23,900 -The simplest sequences of tokens to imagine -are the words that compose the sentence or - -17 -00:01:23,900 --> 00:01:25,410 -the characters. - -18 -00:01:25,410 --> 00:01:32,080 -The particularity of Unigram LM is that it -assumes that the occurrence of each word is - -19 -00:01:32,080 --> 00:01:34,670 -independent of its previous word. - -20 -00:01:34,670 --> 00:01:40,271 -This "assumption" allows us to write that -the probability of a text is equal to the - -21 -00:01:40,271 --> 00:01:44,430 -product of the probabilities of the tokens -that compose it. - -22 -00:01:44,430 --> 00:01:51,880 -It should be noted here that this is a very -simple model which would not be adapted to - -23 -00:01:51,880 --> 00:01:58,630 -the generation of text since this model would -always generate the same token, the one which - -24 -00:01:58,630 --> 00:02:00,140 -has the greatest probability. - -25 -00:02:00,140 --> 00:02:07,409 -Nevertheless, to do tokenization, this model -is very useful to us because it can be used - -26 -00:02:07,409 --> 00:02:14,209 -to estimate the relative likelihood of different -phrases. - -27 -00:02:14,209 --> 00:02:20,000 -We are now ready to return to our explanation -of the training algorithm. - -28 -00:02:20,000 --> 00:02:25,349 -Let's say that we have as a training corpus -10 times the word hug, 12 times the word pug, - -29 -00:02:25,349 --> 00:02:33,270 -5 times the word lug, 4 times bug and 5 times -dug. - -30 -00:02:33,270 --> 00:02:38,910 -As said at the beginning of the video, the -training starts with a big vocabulary. - -31 -00:02:38,910 --> 00:02:45,280 -Obviously, as we are using a toy corpus, this -vocabulary will not be that big but it should - -32 -00:02:45,280 --> 00:02:46,840 -show you the principle. - -33 -00:02:46,840 --> 00:02:54,870 -A first method is to list all the possible -strict substrings that's what we'll do here. - -34 -00:02:54,870 --> 00:03:00,379 -We could also have used the BPE algorithm -with a very large vocabulary size. - -35 -00:03:00,379 --> 00:03:07,200 -So we have our initial vocabulary. - -36 -00:03:07,200 --> 00:03:13,629 -The training of the Unigram tokenizer is based -on the Expectation-Maximization method: at - -37 -00:03:13,629 --> 00:03:15,210 -each iteration. - -38 -00:03:15,210 --> 00:03:19,190 -We estimate the probabilities of the tokens -of the vocabulary. - -39 -00:03:19,190 --> 00:03:26,430 -Then we remove the p percent of tokens that -minimize the loss on the corpus and which - -40 -00:03:26,430 --> 00:03:33,500 -do not belong to the basic characters as we -want to keep in our final vocabulary the basic - -41 -00:03:33,500 --> 00:03:37,980 -characters to be able to tokenize any word. - -42 -00:03:37,980 --> 00:03:39,230 -Let's go for it! 
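A hedged sketch of the independence assumption described above: the probability of a tokenized text is the product of the probabilities of its tokens (the values are assumed):

# Assumed unigram probabilities for a few tokens of the vocabulary.
probs = {"h": 0.05, "u": 0.05, "g": 0.06, "hu": 0.03, "ug": 0.04, "hug": 0.02}

def tokenization_probability(tokens):
    # Unigram assumption: every token is independent of the tokens before it.
    probability = 1.0
    for token in tokens:
        probability *= probs[token]
    return probability

print(tokenization_probability(["hu", "g"]))   # 0.03 * 0.06
print(tokenization_probability(["hug"]))       # 0.02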
- -43 -00:03:39,230 --> 00:03:44,660 -The probability of a token is simply estimated -by the number of appearance of this token - -44 -00:03:44,660 --> 00:03:51,590 -in our training corpus divided by the total -number of appearance of all the tokens. - -45 -00:03:51,590 --> 00:03:57,239 -We could use this vocabulary to tokenize our -words according to the unigram model. - -46 -00:03:57,239 --> 00:04:04,080 -We will do it together to understand two things: -how we tokenize a word with a Unigram model - -47 -00:04:04,080 --> 00:04:09,160 -and how the loss is calculated on our corpus. - -48 -00:04:09,160 --> 00:04:14,610 -The Unigram LM tokenization of our text "Hug" -will be the one with the highest probability - -49 -00:04:14,610 --> 00:04:19,140 -of occurrence according to our Unigram model. - -50 -00:04:19,140 --> 00:04:24,090 -To find it, the simplest way to proceed would -be to list all the possible segmentations - -51 -00:04:24,090 --> 00:04:29,949 -of our text "Hug", calculate the probability -of each of these segmentations and then choose - -52 -00:04:29,949 --> 00:04:32,490 -the one with the highest probability. - -53 -00:04:32,490 --> 00:04:38,630 -With the current vocabulary, 2 tokenizations -get exactly the same probability. - -54 -00:04:38,630 --> 00:04:43,789 -So we choose one of them and keep in memory -the associated probability. - -55 -00:04:43,789 --> 00:04:48,850 -To compute the loss on our training corpus, -we need to tokenize as we just did all the - -56 -00:04:48,850 --> 00:04:52,810 -remaining words in the corpus. - -57 -00:04:52,810 --> 00:04:57,930 -The loss is then the sum over all the words -in the corpus of the frequency of occurrence - -58 -00:04:57,930 --> 00:05:04,220 -of the word multiplied by the opposite of -the log of the probability associated with - -59 -00:05:04,220 --> 00:05:07,720 -the tokenization of the word. - -60 -00:05:07,720 --> 00:05:12,700 -We obtain here a loss of one hundred and seventy. - -61 -00:05:12,700 --> 00:05:18,750 -Remember, our initial goal was to reduce the -vocabulary. - -62 -00:05:18,750 --> 00:05:27,810 -To do this, we will remove a token from the -vocabulary and calculate the associated loss. - -63 -00:05:27,810 --> 00:05:32,020 -Let's remove for example the token 'ug'. - -64 -00:05:32,020 --> 00:05:38,569 -We notice that the tokenization for "hug" -with the letter h and the tuple ug is now - -65 -00:05:38,569 --> 00:05:39,970 -impossible. - -66 -00:05:39,970 --> 00:05:45,810 -Nevertheless, as we saw earlier that two tokenizations -had the same probability and we can still - -67 -00:05:45,810 --> 00:05:50,870 -choose the remaining tokenization with a probability -of one point ten minus two. - -68 -00:05:50,870 --> 00:05:58,210 -The tokenizations of the other words of the -vocabulary also remain unchanged and finally - -69 -00:05:58,210 --> 00:06:06,710 -even if we remove the token "ug" from our -vocabulary the loss remains equal to 170. - -70 -00:06:06,710 --> 00:06:11,550 -For this first iteration, if we continue the -calculation, we would notice that we could - -71 -00:06:11,550 --> 00:06:16,190 -remove any token without it impacting the -loss. - -72 -00:06:16,190 --> 00:06:24,620 -We will therefore choose at random to remove -the token "ug" before starting a second iteration. - -73 -00:06:24,620 --> 00:06:29,600 -We estimate again the probability of each -token before calculating the impact of each - -74 -00:06:29,600 --> 00:06:32,280 -token on the loss. 
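A hedged sketch of the loss computation just described, with assumed word frequencies and assumed probabilities for each word's best tokenization:

import math

# Assumed: frequency of each corpus word and the probability of its best
# tokenization under the current Unigram model.
word_freqs = {"hug": 10, "pug": 12, "lug": 5, "bug": 4, "dug": 5}
best_probs = {"hug": 1e-2, "pug": 5e-3, "lug": 2e-3, "bug": 2e-3, "dug": 2e-3}

loss = sum(freq * -math.log(best_probs[word]) for word, freq in word_freqs.items())
print(loss)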
- -75 -00:06:32,280 --> 00:06:37,840 -For example, if we remove now the token composed -of the letters "h" and "u", there is only - -76 -00:06:37,840 --> 00:06:42,020 -one possible tokenization left for hug. - -77 -00:06:42,020 --> 00:06:46,580 -The tokenization of the other words of the -vocabulary is not changed. - -78 -00:06:46,580 --> 00:06:51,880 -In the end, we obtain by removing the token -composed of the letters "h" and "u" from the - -79 -00:06:51,880 --> 00:06:54,650 -vocabulary a loss of one hundred and sixty-eight. - -80 -00:06:54,650 --> 00:07:02,550 -Finally, to choose which token to remove, -we will for each remaining token of the vocabulary - -81 -00:07:02,550 --> 00:07:10,090 -which is not an elementary token calculate -the associated loss then compare these losses - -82 -00:07:10,090 --> 00:07:11,850 -between them. - -83 -00:07:11,850 --> 00:07:18,100 -The token which we will remove is the token -which impacts the least the loss: here the - -84 -00:07:18,100 --> 00:07:20,129 -token "bu". - -85 -00:07:20,129 --> 00:07:25,710 -We had mentioned at the beginning of the video -that at each iteration we could remove p % of - -86 -00:07:25,710 --> 00:07:29,540 -the tokens by iteration. - -87 -00:07:29,540 --> 00:07:35,850 -The second token that could be removed at -this iteration is the "du" token. - -88 -00:07:35,850 --> 00:07:42,690 -And that's it, we just have to repeat these -steps until we get the vocabulary of the desired - -89 -00:07:42,690 --> 00:07:45,240 -size. - -90 -00:07:45,240 --> 00:07:51,129 -One last thing, in practice, when we tokenize -a word with a Unigram model we don't compute - -91 -00:07:51,129 --> 00:07:57,210 -the set of probabilities of the possible splits -of a word before comparing them to keep the - -92 -00:07:57,210 --> 00:08:05,560 -best one but we use the Viterbi algorithm -which is much more efficient. - -93 -00:08:05,560 --> 00:08:07,300 -And that's it! - -94 -00:08:07,300 --> 00:08:15,000 -I hope that this example has allowed you to -better understand the Unigram tokenization - -95 -00:08:15,000 --> 00:08:18,190 -algorithm. +1 +00:00:00,000 --> 00:00:02,667 +(air whooshing) + +2 +00:00:05,310 --> 00:00:06,420 +- In this video, + +3 +00:00:06,420 --> 00:00:09,881 +we will study together +'the Unigram Language Model + +4 +00:00:09,881 --> 00:00:13,288 +subword tokenization algorithm'. + +5 +00:00:13,288 --> 00:00:15,567 +The overall training strategy + +6 +00:00:15,567 --> 00:00:18,450 +of a Unigram Language Model tokenizer + +7 +00:00:18,450 --> 00:00:21,480 +is to start with a very large vocabulary + +8 +00:00:21,480 --> 00:00:24,240 +and then to remove +tokens at each iteration + +9 +00:00:24,240 --> 00:00:27,300 +until we reach the desired size. + +10 +00:00:27,300 --> 00:00:28,530 +At each iteration, + +11 +00:00:28,530 --> 00:00:30,930 +we will calculate a loss +on our training corpus + +12 +00:00:30,930 --> 00:00:33,480 +thanks to the Unigram model. + +13 +00:00:33,480 --> 00:00:37,470 +As the loss calculation depends +on the available vocabulary, + +14 +00:00:37,470 --> 00:00:40,563 +we can use it to choose how +to reduce the vocabulary. + +15 +00:00:41,550 --> 00:00:43,620 +So we look at the evolution of the loss + +16 +00:00:43,620 --> 00:00:47,103 +by removing in turn each +token from the vocabulary. + +17 +00:00:48,000 --> 00:00:50,430 +We will choose to remove the p-percents + +18 +00:00:50,430 --> 00:00:52,200 +which increase the loss the less. 
+
+19
+00:00:56,310 --> 00:00:57,540
+Before going further
+
+20
+00:00:57,540 --> 00:01:00,240
+in the explanation of
+the training algorithm,
+
+21
+00:01:00,240 --> 00:01:02,973
+I need to explain what
+a Unigram model is.
+
+22
+00:01:04,183 --> 00:01:06,030
+The Unigram Language Model
+
+23
+00:01:06,030 --> 00:01:08,493
+is a type of Statistical Language Model.
+
+24
+00:01:09,450 --> 00:01:10,980
+A Statistical Language Model
+
+25
+00:01:10,980 --> 00:01:13,530
+will assign a probability to a text
+
+26
+00:01:13,530 --> 00:01:18,090
+considering that the text is
+in fact a sequence of tokens.
+
+27
+00:01:18,090 --> 00:01:21,090
+The simplest sequences
+of tokens to imagine
+
+28
+00:01:21,090 --> 00:01:24,753
+are the words that compose the
+sentence or the characters.
+
+29
+00:01:26,130 --> 00:01:28,890
+The particularity of
+the Unigram Language Model
+
+30
+00:01:28,890 --> 00:01:32,010
+is that it assumes that
+the occurrence of each word
+
+31
+00:01:32,010 --> 00:01:34,533
+is independent of its previous word.
+
+32
+00:01:35,400 --> 00:01:37,620
+This assumption allows us to write
+
+33
+00:01:37,620 --> 00:01:39,570
+that the probability of a text
+
+34
+00:01:39,570 --> 00:01:42,210
+is equal to the product
+of the probabilities
+
+35
+00:01:42,210 --> 00:01:43,953
+of the tokens that compose it.
+
+36
+00:01:45,840 --> 00:01:50,220
+It should be noted here that
+it is a very simple model
+
+37
+00:01:50,220 --> 00:01:53,850
+which would not be adapted
+to the generation of text
+
+38
+00:01:53,850 --> 00:01:57,840
+since this model would always
+generate the same token,
+
+39
+00:01:57,840 --> 00:02:00,453
+the one which has the
+greatest probability.
+
+40
+00:02:01,320 --> 00:02:03,360
+Nevertheless, to do tokenization,
+
+41
+00:02:03,360 --> 00:02:05,790
+this model is very useful to us
+
+42
+00:02:05,790 --> 00:02:07,440
+because it can be used
+
+43
+00:02:07,440 --> 00:02:10,893
+to estimate the relative
+likelihood of different phrases.
+
+44
+00:02:14,100 --> 00:02:15,000
+We are now ready
+
+45
+00:02:15,000 --> 00:02:19,830
+to return to our explanation
+of the training algorithm.
+
+46
+00:02:19,830 --> 00:02:21,690
+Let's say that we have
+a training corpus
+
+47
+00:02:21,690 --> 00:02:23,880
+with 10 times the word hug,
+
+48
+00:02:23,880 --> 00:02:25,410
+12 times the word pug,
+
+49
+00:02:25,410 --> 00:02:27,330
+5 times the word lug,
+
+50
+00:02:27,330 --> 00:02:28,560
+4 times bug
+
+51
+00:02:28,560 --> 00:02:29,943
+and 5 times dug.
+
+52
+00:02:33,120 --> 00:02:34,560
+As said earlier,
+
+53
+00:02:34,560 --> 00:02:37,473
+the training starts with a big vocabulary.
+
+54
+00:02:38,460 --> 00:02:41,400
+Obviously, as we are using a toy corpus,
+
+55
+00:02:41,400 --> 00:02:44,430
+this vocabulary will not be that big
+
+56
+00:02:44,430 --> 00:02:46,773
+but it should show you the principle.
+
+57
+00:02:47,610 --> 00:02:51,870
+A first method is to list all
+the possible strict substrings
+
+58
+00:02:51,870 --> 00:02:53,823
+and that's what we'll do here.
+
+59
+00:02:54,780 --> 00:02:58,170
+We could also have used the BPE algorithm
+
+60
+00:02:58,170 --> 00:03:00,010
+with a very large vocabulary size
+
+61
+00:03:01,410 --> 00:03:05,103
+but for now, the strict
+substrings are enough.
+
+62
+00:03:06,990 --> 00:03:09,120
+The training of the Unigram tokenizer
+
+63
+00:03:09,120 --> 00:03:12,093
+is based on the
+Expectation-Maximization method.
+
+64
+00:03:13,320 --> 00:03:15,120
+At each iteration,
+
+65
+00:03:15,120 --> 00:03:17,430
+we estimate the
+probabilities of the tokens
+
+66
+00:03:17,430 --> 00:03:18,430
+of the vocabulary
+
+67
+00:03:20,130 --> 00:03:23,100
+and then we remove the p-percent of tokens
+
+68
+00:03:23,100 --> 00:03:26,070
+that impact the loss on the corpus the least
+
+69
+00:03:26,070 --> 00:03:28,900
+and which do not belong
+to the basic characters
+
+70
+00:03:29,880 --> 00:03:33,150
+as we want to keep in our final vocabulary
+
+71
+00:03:33,150 --> 00:03:36,693
+the basic characters to be
+able to tokenize any word.
+
+72
+00:03:37,770 --> 00:03:39,641
+Let's go for it!
+
+73
+00:03:39,641 --> 00:03:42,360
+The probability of a
+token is simply estimated
+
+74
+00:03:42,360 --> 00:03:44,760
+by the number of appearances of this token
+
+75
+00:03:44,760 --> 00:03:46,440
+in our training corpus
+
+76
+00:03:46,440 --> 00:03:50,133
+divided by the total number of
+appearances of all the tokens.
+
+77
+00:03:51,510 --> 00:03:54,390
+We could use this vocabulary
+to tokenize our words
+
+78
+00:03:54,390 --> 00:03:56,283
+according to the Unigram model.
+
+79
+00:03:57,150 --> 00:04:00,892
+We will do it together
+to understand two things:
+
+80
+00:04:00,892 --> 00:04:04,110
+how we tokenize a word
+with a Unigram model
+
+81
+00:04:04,110 --> 00:04:07,803
+and how the loss is
+calculated on our corpus.
+
+82
+00:04:09,088 --> 00:04:12,263
+The Unigram LM tokenization
+of our text 'Hug'
+
+83
+00:04:12,263 --> 00:04:15,270
+will be the one with the highest
+probability of occurrence
+
+84
+00:04:15,270 --> 00:04:17,403
+according to our Unigram model.
+
+85
+00:04:19,080 --> 00:04:21,750
+To find it, the simplest way to proceed
+
+86
+00:04:21,750 --> 00:04:24,120
+would be to list all the
+possible segmentations
+
+87
+00:04:24,120 --> 00:04:25,800
+of our text 'Hug',
+
+88
+00:04:25,800 --> 00:04:29,340
+calculate the probability of
+each of these segmentations
+
+89
+00:04:29,340 --> 00:04:32,043
+and then choose the one with
+the highest probability.
+
+90
+00:04:33,210 --> 00:04:34,920
+With the current vocabulary,
+
+91
+00:04:34,920 --> 00:04:38,640
+two tokenizations get
+exactly the same probability.
+
+92
+00:04:38,640 --> 00:04:40,080
+So we choose one of them
+
+93
+00:04:40,080 --> 00:04:42,603
+and keep in memory the
+associated probability.
+
+94
+00:04:43,710 --> 00:04:46,380
+To compute the loss on
+our training corpus,
+
+95
+00:04:46,380 --> 00:04:48,570
+we need to tokenize, as we just did,
+
+96
+00:04:48,570 --> 00:04:50,673
+all the remaining words in the corpus.
+
+97
+00:04:52,290 --> 00:04:56,430
+The loss is then the sum over
+all the words in the corpus
+
+98
+00:04:56,430 --> 00:04:58,920
+of the frequency of occurrence of the word
+
+99
+00:04:58,920 --> 00:05:02,670
+multiplied by the opposite
+of the log of the probability
+
+100
+00:05:02,670 --> 00:05:05,463
+associated with the
+tokenization of the word.
+
+101
+00:05:07,620 --> 00:05:10,803
+We obtain here a loss of 170.
+
+102
+00:05:13,830 --> 00:05:18,630
+Remember, our initial goal
+was to reduce the vocabulary.
+
+103
+00:05:18,630 --> 00:05:21,870
+To do this, we will remove
+a token from the vocabulary
+
+104
+00:05:21,870 --> 00:05:24,213
+and calculate the associated loss.
+
+105
+00:05:27,630 --> 00:05:30,627
+Let's remove, for example, the token 'ug'.
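To make the word-tokenization and loss computation described above concrete, here is a small Python sketch that enumerates the segmentations of a word, scores each one under a unigram model, and sums the corpus loss as frequency times the negative log probability of the best segmentation. The token counts below are illustrative stand-ins rather than the exact numbers shown on the slides.

import math
from itertools import combinations

# Toy corpus: word -> frequency (same words as the video, illustrative setup)
corpus = {"hug": 10, "pug": 12, "lug": 5, "bug": 4, "dug": 5}

# Illustrative token counts used to estimate the unigram probabilities
token_counts = {"h": 10, "u": 36, "g": 36, "p": 12, "l": 5, "b": 4, "d": 5,
                "hu": 10, "ug": 36, "pu": 12, "lu": 5, "bu": 4, "du": 5}
total = sum(token_counts.values())
probs = {token: count / total for token, count in token_counts.items()}

def segmentations(word):
    """Yield every way of splitting `word` into contiguous substrings."""
    for n_cuts in range(len(word)):
        for cuts in combinations(range(1, len(word)), n_cuts):
            pieces, previous = [], 0
            for cut in (*cuts, len(word)):
                pieces.append(word[previous:cut])
                previous = cut
            yield pieces

def best_tokenization(word):
    """Return the segmentation with the highest probability under the unigram model."""
    best, best_prob = None, 0.0
    for pieces in segmentations(word):
        if all(piece in probs for piece in pieces):
            prob = math.prod(probs[piece] for piece in pieces)
            if prob > best_prob:
                best, best_prob = pieces, prob
    return best, best_prob

# Corpus loss: sum over words of frequency * -log(probability of the kept tokenization)
loss = sum(freq * -math.log(best_tokenization(word)[1]) for word, freq in corpus.items())
print(loss)

Removing a candidate token from `probs`, re-running this loss, and comparing it to the original value is exactly the comparison the rest of the video walks through.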
+
+106
+00:05:31,920 --> 00:05:35,370
+We notice that the tokenization for 'hug'
+
+107
+00:05:35,370 --> 00:05:39,990
+with the letter 'h' and the
+tuple 'ug' is now impossible.
+
+108
+00:05:39,990 --> 00:05:42,240
+Nevertheless, as we saw earlier
+
+109
+00:05:42,240 --> 00:05:45,180
+two tokenizations
+had the same probability,
+
+110
+00:05:45,180 --> 00:05:47,730
+so we can still choose the
+remaining tokenization
+
+111
+00:05:47,730 --> 00:05:51,093
+with a probability of 1.10e-2.
+
+112
+00:05:52,410 --> 00:05:55,350
+The tokenizations of the
+other words of the vocabulary
+
+113
+00:05:55,350 --> 00:05:57,060
+also remain unchanged.
+
+114
+00:05:57,060 --> 00:06:00,600
+And finally, even if we
+remove the token 'ug'
+
+115
+00:06:00,600 --> 00:06:05,403
+from our vocabulary, the
+loss remains equal to 170.
+
+116
+00:06:06,630 --> 00:06:08,100
+For this first iteration,
+
+117
+00:06:08,100 --> 00:06:10,080
+if we continue the calculation,
+
+118
+00:06:10,080 --> 00:06:13,050
+we would notice that we
+could remove any token
+
+119
+00:06:13,050 --> 00:06:16,110
+without it impacting the loss.
+
+120
+00:06:16,110 --> 00:06:19,200
+We will therefore choose at
+random to remove the token 'ug'
+
+121
+00:06:19,200 --> 00:06:21,843
+before starting a second iteration.
+
+122
+00:06:24,240 --> 00:06:27,300
+So we estimate again the
+probability of each token
+
+123
+00:06:27,300 --> 00:06:30,630
+before calculating the impact
+of each token on the loss.
+
+124
+00:06:32,160 --> 00:06:33,990
+For example, if we now remove
+
+125
+00:06:33,990 --> 00:06:36,290
+the token composed of
+the letters 'h' and 'u',
+
+126
+00:06:37,350 --> 00:06:41,013
+there is only one possible
+tokenization left for hug.
+
+127
+00:06:41,940 --> 00:06:44,700
+The tokenization of the
+other words of the vocabulary
+
+128
+00:06:44,700 --> 00:06:45,633
+is not changed.
+
+129
+00:06:46,560 --> 00:06:47,393
+In the end,
+
+130
+00:06:47,393 --> 00:06:49,200
+we obtain by removing the token
+
+131
+00:06:49,200 --> 00:06:52,749
+composed of the letters 'h'
+and 'u' from the vocabulary,
+
+132
+00:06:52,749 --> 00:06:56,430
+a loss of 168.
+
+133
+00:06:56,430 --> 00:06:59,490
+Finally, to choose which token to remove,
+
+134
+00:06:59,490 --> 00:07:02,490
+we will, for each remaining
+token of the vocabulary,
+
+135
+00:07:02,490 --> 00:07:04,800
+which is not an elementary token,
+
+136
+00:07:04,800 --> 00:07:07,380
+calculate the associated loss.
+
+137
+00:07:07,380 --> 00:07:09,843
+Then, compare these losses between them.
+
+138
+00:07:11,730 --> 00:07:13,800
+The token which we will remove
+
+139
+00:07:13,800 --> 00:07:17,340
+is the token which impacts
+the loss the least,
+
+140
+00:07:17,340 --> 00:07:18,870
+here the token 'bu'.
+
+141
+00:07:20,040 --> 00:07:22,380
+We had mentioned at the
+beginning of the video
+
+142
+00:07:22,380 --> 00:07:24,930
+that at each iteration we could remove
+
+143
+00:07:24,930 --> 00:07:27,093
+p-percent of the tokens.
+
+144
+00:07:29,356 --> 00:07:33,000
+The second token that could
+be removed at this iteration
+
+145
+00:07:33,000 --> 00:07:34,317
+is the token 'du'.
+
+146
+00:07:36,510 --> 00:07:37,920
+And that's it.
+
+147
+00:07:37,920 --> 00:07:39,720
+We just have to repeat these steps
+
+148
+00:07:39,720 --> 00:07:43,203
+until we get the vocabulary
+of the desired size.
+
+149
+00:07:45,030 --> 00:07:46,500
+One last thing.
+ +150 +00:07:46,500 --> 00:07:50,310 +In practice, when we tokenize +a word with a Unigram model, + +151 +00:07:50,310 --> 00:07:53,130 +we don't compute the +set of probabilities of + +152 +00:07:53,130 --> 00:07:55,500 +all the possible splits of a word + +153 +00:07:55,500 --> 00:07:58,770 +before comparing them to keep the best one + +154 +00:07:58,770 --> 00:08:01,440 +but we use the Viterbi algorithm + +155 +00:08:01,440 --> 00:08:04,563 +which is much more efficient way to do it. + +156 +00:08:06,540 --> 00:08:07,680 +And that's it! + +157 +00:08:07,680 --> 00:08:09,270 +I hope that this example + +158 +00:08:09,270 --> 00:08:10,987 +has allowed you to better understand + +159 +00:08:10,987 --> 00:08:12,933 +the Unigram tokenization algorithm. + +160 +00:08:14,355 --> 00:08:17,022 +(air whooshing) + diff --git a/subtitles/en/54_building-a-new-tokenizer.srt b/subtitles/en/54_building-a-new-tokenizer.srt index e38c6b749..73d6ff9c9 100644 --- a/subtitles/en/54_building-a-new-tokenizer.srt +++ b/subtitles/en/54_building-a-new-tokenizer.srt @@ -1,245 +1,396 @@ -1 -00:00:05,350 --> 00:00:11,360 -In this video we will see how you can create -your own tokenizer from scratch! - -2 -00:00:11,360 --> 00:00:18,370 -To create your own tokenizer you will have -to think about each of the operations involved - -3 -00:00:18,370 --> 00:00:25,220 -in tokenization, namely: normalization, pre-tokenization, -model, post-processing and decoding. - -4 -00:00:25,220 --> 00:00:32,310 -If you don't know what normalization, pre-tokenization -and the model are, I advise you to go and - -5 -00:00:32,310 --> 00:00:34,800 -see the videos linked below. - -6 -00:00:34,800 --> 00:00:40,329 -The post processing gathers all the modifications -that we will carry out on the tokenized text. - -7 -00:00:40,329 --> 00:00:46,690 -It can include the addition of special tokens, -the creation of an attention mask but also - -8 -00:00:46,690 --> 00:00:50,200 -the generation of a list of token ids. - -9 -00:00:50,200 --> 00:00:55,350 -The decoding operation occurs at the very -end and will allow passing from the sequence - -10 -00:00:55,350 --> 00:00:59,000 -of ids in a sentence. - -11 -00:00:59,000 --> 00:01:04,220 -For example, in our example, we can see that -the hashtags have been removed and the tokens - -12 -00:01:04,220 --> 00:01:10,820 -composing the word "today" have been grouped -together. - -13 -00:01:10,820 --> 00:01:17,472 -In a fast tokenizer, all these components -are gathered in the backend_tokenizer attribute. - -14 -00:01:17,472 --> 00:01:22,720 -As you can see with this small code snippet, -it is an instance of a tokenizer from the - -15 -00:01:22,720 --> 00:01:24,860 -tokenizers library. - -16 -00:01:24,860 --> 00:01:33,799 -So, to create your own transformers tokenizer -you will have to follow these steps: first - -17 -00:01:33,799 --> 00:01:40,510 -create a training dataset; second create and -train a tokenizer with the tokenizers library - -18 -00:01:40,510 --> 00:01:49,430 -and third load this tokenizer into transformers -tokenizer. - -19 -00:01:49,430 --> 00:01:56,510 -To understand these steps, I propose that -we recreate a BERT tokenizer. - -20 -00:01:56,510 --> 00:01:59,500 -The first thing to do is to create a dataset. - -21 -00:01:59,500 --> 00:02:05,650 -With this code snippet you can create an iterator -on the dataset wikitext-2-raw-v1 which is - -22 -00:02:05,650 --> 00:02:08,610 -a rather small dataset in English. 
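Since the video only names the Viterbi algorithm without showing it, here is a rough dynamic-programming sketch of the idea; `probs` is an illustrative unigram probability table and this is not the actual implementation inside the tokenizers library.

import math

def viterbi_tokenize(word, probs):
    """Best unigram segmentation of `word` by dynamic programming.

    best[i] is the lowest negative log probability of word[:i];
    back[i] remembers where the last token of that segmentation starts.
    """
    n = len(word)
    best = [0.0] + [math.inf] * n
    back = [0] * (n + 1)
    for end in range(1, n + 1):
        for start in range(end):
            piece = word[start:end]
            if piece in probs and best[start] != math.inf:
                score = best[start] - math.log(probs[piece])
                if score < best[end]:
                    best[end], back[end] = score, start
    # Follow the backpointers to recover the tokens
    tokens, position = [], n
    while position > 0:
        tokens.append(word[back[position]:position])
        position = back[position]
    return tokens[::-1]

# Illustrative probability table; in practice it comes from the trained unigram model
probs = {"h": 0.05, "u": 0.17, "g": 0.17, "hu": 0.05, "ug": 0.17}
print(viterbi_tokenize("hug", probs))

Instead of scoring every possible split, this single left-to-right pass only keeps the best way to reach each position of the word, which is what makes it much cheaper than full enumeration.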
- -23 -00:02:08,610 --> 00:02:18,830 -We attack here the big part: the design of -our tokenizer with the tokenizers library. - -24 -00:02:18,830 --> 00:02:25,349 -We start by initializing a tokenizer instance -with a WordPiece model because it is the model - -25 -00:02:25,349 --> 00:02:29,240 -used by BERT. - -26 -00:02:29,240 --> 00:02:32,110 -Then we can define our normalizer. - -27 -00:02:32,110 --> 00:02:39,930 -We will define it as a succession of 2 normalizations -used to clean up characters not visible in - -28 -00:02:39,930 --> 00:02:46,659 -the text, 1 lowercasing normalization and -2 normalizations used to remove accents. - -29 -00:02:46,659 --> 00:02:54,459 -For the pre-tokenization, we will chain two -pre_tokenizer. - -30 -00:02:54,459 --> 00:02:59,959 -The first one separating the text at the level -of spaces and the second one isolating the - -31 -00:02:59,959 --> 00:03:02,450 -punctuation marks. - -32 -00:03:02,450 --> 00:03:08,430 -Now, we can define the trainer that will allow -us to train the WordPiece model chosen at - -33 -00:03:08,430 --> 00:03:11,209 -the beginning. - -34 -00:03:11,209 --> 00:03:17,280 -To carry out the training, we will have to -choose a vocabulary size, here we choose twenty-five - -35 -00:03:17,280 --> 00:03:29,099 -thousand and also announce the special tokens -that we absolutely want to add to our vocabulary. - -36 -00:03:29,099 --> 00:03:39,209 -In one line of code, we can train our WordPiece -model using the iterator we defined earlier. - -37 -00:03:39,209 --> 00:03:45,800 -Once the model has been trained, we can retrieve -the ids of the special class and separation - -38 -00:03:45,800 --> 00:03:49,750 -tokens because we will need them to post-process -our sequence. - -39 -00:03:49,750 --> 00:03:55,790 -Thanks to the TemplateProcessing class, we -can add the CLS token at the beginning of - -40 -00:03:55,790 --> 00:04:01,780 -each sequence and the SEP token at the end -of the sequence and between two sentences - -41 -00:04:01,780 --> 00:04:07,060 -if we tokenize a text pair. - -42 -00:04:07,060 --> 00:04:12,099 -Finally, we just have to define our decoder -which will allow us to remove the hashtags - -43 -00:04:12,099 --> 00:04:17,810 -at the beginning of the tokens that must be -reattached to the previous token. - -44 -00:04:17,810 --> 00:04:30,930 -And there it ist, you have all the necessary -lines of code to define your own tokenizer. - -45 -00:04:30,930 --> 00:04:35,120 -Now that we have a brand new tokenizer with -the tokenizers library we just have to load - -46 -00:04:35,120 --> 00:04:40,070 -it into a fast tokenizer from the transformers -library. - -47 -00:04:40,070 --> 00:04:42,660 -Here again we have several possibilities. - -48 -00:04:42,660 --> 00:04:48,830 -We can load it in the generic class "PreTrainedTokenizerFast" -or in the BertTokenizerFast class since we - -49 -00:04:48,830 --> 00:04:56,380 -have built a bert type tokenizer here. - -50 -00:04:56,380 --> 00:05:01,600 -I hope this video has helped you understand -how you can create your own tokenizer and - -51 -00:05:01,600 --> 00:05:10,669 -that you are ready to navigate the tokenizers -library documentation to choose the components - -52 -00:05:10,669 --> 00:05:16,490 -for your brand-new tokenizer! +1 +00:00:00,188 --> 00:00:02,855 +(air whooshing) + +2 +00:00:05,400 --> 00:00:07,500 +In this video, we will see how + +3 +00:00:07,500 --> 00:00:11,310 +you can create your own +tokenizer from scratch. 
+
+4
+00:00:11,310 --> 00:00:15,000
+To create your own tokenizer,
+you will have to think about
+
+5
+00:00:15,000 --> 00:00:18,180
+each of the operations
+involved in tokenization.
+
+6
+00:00:18,180 --> 00:00:22,440
+Namely, the normalization,
+the pre-tokenization,
+
+7
+00:00:22,440 --> 00:00:25,233
+the model, the post
+processing, and the decoding.
+
+8
+00:00:26,100 --> 00:00:28,350
+If you don't know what normalization,
+
+9
+00:00:28,350 --> 00:00:30,900
+pre-tokenization, and the model are,
+
+10
+00:00:30,900 --> 00:00:34,531
+I advise you to go and see
+the videos linked below.
+
+11
+00:00:34,531 --> 00:00:37,110
+The post processing gathers
+all the modifications
+
+12
+00:00:37,110 --> 00:00:40,860
+that we will carry out
+on the tokenized text.
+
+13
+00:00:40,860 --> 00:00:43,890
+It can include the
+addition of special tokens,
+
+14
+00:00:43,890 --> 00:00:46,290
+the creation of an attention mask,
+
+15
+00:00:46,290 --> 00:00:48,903
+but also the generation
+of a list of token IDs.
+
+16
+00:00:50,220 --> 00:00:53,487
+The decoding operation
+occurs at the very end,
+
+17
+00:00:53,487 --> 00:00:54,660
+and will allow passing
+
+18
+00:00:54,660 --> 00:00:57,753
+from the sequence of IDs to a sentence.
+
+19
+00:00:58,890 --> 00:01:01,800
+For example, you can see that the hashtags
+
+20
+00:01:01,800 --> 00:01:04,260
+have been removed, and the tokens
+
+21
+00:01:04,260 --> 00:01:07,323
+composing the word today
+have been grouped together.
+
+22
+00:01:10,860 --> 00:01:13,440
+In a fast tokenizer, all these components
+
+23
+00:01:13,440 --> 00:01:16,413
+are gathered in the
+backend_tokenizer attribute.
+
+24
+00:01:17,370 --> 00:01:20,070
+As you can see with
+this small code snippet,
+
+25
+00:01:20,070 --> 00:01:22,020
+it is an instance of a tokenizer
+
+26
+00:01:22,020 --> 00:01:23,763
+from the tokenizers library.
+
+27
+00:01:25,740 --> 00:01:28,263
+So, to create your own tokenizer,
+
+28
+00:01:29,970 --> 00:01:31,770
+you will have to follow these steps.
+
+29
+00:01:33,270 --> 00:01:35,433
+First, create a training dataset.
+
+30
+00:01:36,690 --> 00:01:39,000
+Second, create and train a tokenizer
+
+31
+00:01:39,000 --> 00:01:41,700
+with the tokenizers library.
+
+32
+00:01:41,700 --> 00:01:46,700
+And third, load this tokenizer
+into a transformers tokenizer.
+
+33
+00:01:49,350 --> 00:01:50,850
+To understand these steps,
+
+34
+00:01:50,850 --> 00:01:54,573
+I propose that we recreate
+a BERT tokenizer together.
+
+35
+00:01:56,460 --> 00:01:58,893
+The first thing to do
+is to create a dataset.
+
+36
+00:01:59,970 --> 00:02:02,460
+With this code snippet
+you can create an iterator
+
+37
+00:02:02,460 --> 00:02:05,430
+on the dataset wikitext-2-raw-v1,
+
+38
+00:02:05,430 --> 00:02:08,160
+which is a rather small
+dataset in English,
+
+39
+00:02:08,160 --> 00:02:09,730
+perfect for the example.
+
+40
+00:02:12,210 --> 00:02:13,920
+We attack here the big part,
+
+41
+00:02:13,920 --> 00:02:17,373
+the design of our tokenizer
+with the tokenizers library.
+
+42
+00:02:18,750 --> 00:02:22,020
+We start by initializing
+a tokenizer instance
+
+43
+00:02:22,020 --> 00:02:26,133
+with a WordPiece model because
+it is the model used by BERT.
+
+44
+00:02:29,100 --> 00:02:32,190
+Then we can define our normalizer.
+
+45
+00:02:32,190 --> 00:02:35,891
+We will define it as a
+succession of two normalizations
+
+46
+00:02:35,891 --> 00:02:39,453
+used to clean up characters
+not visible in the text.
+ +47 +00:02:40,590 --> 00:02:43,440 +One lowercasing normalization, + +48 +00:02:43,440 --> 00:02:47,253 +and two last normalizations +used to remove accents. + +49 +00:02:49,500 --> 00:02:53,553 +For the pre-tokenization, we +will chain two pre_tokenizers. + +50 +00:02:54,390 --> 00:02:58,200 +The first one separating the +text at the level of spaces, + +51 +00:02:58,200 --> 00:03:01,533 +and the second one isolating +the punctuation marks. + +52 +00:03:03,360 --> 00:03:06,360 +Now, we can define the +trainer that will allow us + +53 +00:03:06,360 --> 00:03:09,753 +to train the WordPiece model +chosen at the beginning. + +54 +00:03:11,160 --> 00:03:12,600 +To carry out the training, + +55 +00:03:12,600 --> 00:03:14,853 +we will have to choose a vocabulary size. + +56 +00:03:16,050 --> 00:03:17,910 +Here we choose 25,000. + +57 +00:03:17,910 --> 00:03:21,270 +And we also need to +announce the special tokens + +58 +00:03:21,270 --> 00:03:24,663 +that we absolutely want +to add to our vocabulary. + +59 +00:03:29,160 --> 00:03:33,000 +In one line of code, we can +train our WordPiece model + +60 +00:03:33,000 --> 00:03:35,553 +using the iterator we defined earlier. + +61 +00:03:39,060 --> 00:03:42,570 +Once the model has been +trained, we can retrieve + +62 +00:03:42,570 --> 00:03:46,560 +the IDs of the special +class and separation tokens, + +63 +00:03:46,560 --> 00:03:49,413 +because we will need them to +post-process our sequence. + +64 +00:03:50,820 --> 00:03:52,860 +Thanks to the TemplateProcessing class, + +65 +00:03:52,860 --> 00:03:57,210 +we can add the CLS token at +the beginning of each sequence, + +66 +00:03:57,210 --> 00:04:00,120 +and the SEP token at +the end of the sequence, + +67 +00:04:00,120 --> 00:04:03,873 +and between two sentences if +we tokenize a pair of text. + +68 +00:04:07,260 --> 00:04:10,500 +Finally, we just have +to define our decoder, + +69 +00:04:10,500 --> 00:04:12,690 +which will allow us to remove the hashtags + +70 +00:04:12,690 --> 00:04:14,610 +at the beginning of the tokens + +71 +00:04:14,610 --> 00:04:17,193 +that must be reattached +to the previous token. + +72 +00:04:21,300 --> 00:04:22,260 +And there it is. + +73 +00:04:22,260 --> 00:04:25,110 +You have all the necessary lines of code + +74 +00:04:25,110 --> 00:04:29,403 +to define your own tokenizer +with the tokenizer library. + +75 +00:04:30,960 --> 00:04:32,280 +Now that we have a brand new tokenizer + +76 +00:04:32,280 --> 00:04:35,400 +with the tokenizer library, +we just have to load it + +77 +00:04:35,400 --> 00:04:38,463 +into a fast tokenizer from +the transformers library. + +78 +00:04:39,960 --> 00:04:42,630 +Here again, we have several possibilities. + +79 +00:04:42,630 --> 00:04:44,430 +We can load it in the generic class, + +80 +00:04:44,430 --> 00:04:48,330 +PreTrainedTokenizerFast, or +in the BertTokenizerFast class + +81 +00:04:48,330 --> 00:04:52,353 +since we have built a +BERT like tokenizer here. + +82 +00:04:57,000 --> 00:04:59,670 +I really hope this video +has helped you understand + +83 +00:04:59,670 --> 00:05:02,133 +how you can create your own tokenizer, + +84 +00:05:03,178 --> 00:05:06,240 +and that you are ready now to navigate + +85 +00:05:06,240 --> 00:05:08,070 +the tokenizer library documentation + +86 +00:05:08,070 --> 00:05:11,367 +to choose the components for +your brand new tokenizer. 
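As a companion to the walkthrough above, here is a condensed sketch of those steps with the tokenizers library. The normalizers, pre-tokenizers, vocabulary size and special tokens follow what the video describes, but treat the exact choices as one possible configuration; `get_training_corpus` stands for the wikitext-2-raw-v1 iterator created earlier and is not defined here.

from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, processors, trainers
from transformers import PreTrainedTokenizerFast

# WordPiece model, since that is what BERT uses
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

# Normalization: clean up and lowercase the text, then strip accents (a BERT-like cleanup)
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()]
)

# Pre-tokenization: split on whitespace, then isolate the punctuation marks
tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
    [pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Punctuation()]
)

# Trainer for the WordPiece model: vocabulary size and special tokens to keep
trainer = trainers.WordPieceTrainer(
    vocab_size=25000, special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
)
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)  # your dataset iterator

# Post-processing: [CLS] at the start, [SEP] at the end and between the two sentences of a pair
cls_id = tokenizer.token_to_id("[CLS]")
sep_id = tokenizer.token_to_id("[SEP]")
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)],
)

# Decoder: reattach the ## continuation pieces to the previous token
tokenizer.decoder = decoders.WordPiece(prefix="##")

# Wrap it in a fast tokenizer usable everywhere in the transformers library
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer, unk_token="[UNK]", pad_token="[PAD]",
    cls_token="[CLS]", sep_token="[SEP]", mask_token="[MASK]",
)

Loading the trained tokenizer into BertTokenizerFast instead of PreTrainedTokenizerFast, as mentioned in the video, works the same way since we built a BERT-like tokenizer here.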
+ +87 +00:05:12,674 --> 00:05:15,341 +(air whooshing) + diff --git a/subtitles/en/55_data-processing-for-token-classification.srt b/subtitles/en/55_data-processing-for-token-classification.srt index 797c6f3bc..4ddd9102d 100644 --- a/subtitles/en/55_data-processing-for-token-classification.srt +++ b/subtitles/en/55_data-processing-for-token-classification.srt @@ -1,174 +1,326 @@ -1 -00:00:05,600 --> 00:00:08,720 -Let's study how to preprocess a  -dataset for token classification!   - -2 -00:00:10,400 --> 00:00:15,840 -Token classification regroups any task that can  -be framed as labelling each word (or token) in   - -3 -00:00:15,840 --> 00:00:20,640 -a sentence, like identifying the persons,  -organizations and locations for instance.   - -4 -00:00:21,920 --> 00:00:26,720 -For our example, we will use the Conll  -dataset, in which we remove columns we   - -5 -00:00:26,720 --> 00:00:30,720 -won't use and rename the other ones to  -get to a dataset with just two columns:   - -6 -00:00:31,360 --> 00:00:37,280 -words and labels. If you have your own dataset  -for token classification, just make sure you   - -7 -00:00:37,280 --> 00:00:43,040 -clean your data to get to the same point, with  -one column containing words (as list of strings)   - -8 -00:00:43,040 --> 00:00:48,240 -and another containing labels (as integers  -spanning from to to your number of labels -1).()   - -9 -00:00:49,520 --> 00:00:53,520 -Make sure you have your label names stored  -somewhere - here we get them from the dataset   - -10 -00:00:53,520 --> 00:00:58,640 -features - so you are able to map the integers  -to some real labels when inspecting your data!   - -11 -00:01:00,480 --> 00:01:06,000 -Here we are doing named entity recognitions,  -so ours labels are either O for words that do   - -12 -00:01:06,000 --> 00:01:11,040 -not belong to any entity, LOC,  -for location, PER, for person,   - -13 -00:01:11,680 --> 00:01:19,200 -ORG for organization and MISC for miscellaneous.  -Each label has two versions: the B- labels   - -14 -00:01:19,200 --> 00:01:25,840 -indicate a word that begins an entity while the I-  -labels indicate a word that is inside an entity.   - -15 -00:01:26,880 --> 00:01:29,840 -The first step to preprocess our  -data is to tokenize the words.   - -16 -00:01:30,400 --> 00:01:35,200 -This is very easily done with a tokenizer, we just  -have to tell it we have pre-tokenized the data   - -17 -00:01:35,200 --> 00:01:42,160 -with the flag is_split_into_words. Then comes  -the hard part. Since we have added special tokens   - -18 -00:01:42,160 --> 00:01:47,200 -and each word may have been split into several  -tokens, our labels won't match the tokens anymore.   - -19 -00:01:47,840 --> 00:01:51,520 -This is where the word IDs our fast  -tokenizer provide come to the rescue.   - -20 -00:01:52,800 --> 00:01:57,440 -They match each token to the word it belongs to  -which allows us to map each token to its label.   - -21 -00:01:58,160 --> 00:02:02,080 -We just have to make sure we change the B-  -labels to their I- counterparts for tokens   - -22 -00:02:02,080 --> 00:02:08,880 -that are inside (but not at the beginning) of  -a word. The special tokens get a label of -100,   - -23 -00:02:08,880 --> 00:02:12,960 -which is how we tell the Transformer loss  -functions to ignore them when computing the loss.   
- -24 -00:02:14,560 --> 00:02:19,120 -The code is then pretty straightforward, we write  -a function that shifts the labels for tokens that   - -25 -00:02:19,120 --> 00:02:23,920 -are inside a word (that you can customize) and  -use it when generating the labels for each token.   - -26 -00:02:25,600 --> 00:02:29,840 -Once that function to create our labels is  -written, we can preprocess the whole dataset using   - -27 -00:02:29,840 --> 00:02:35,840 -the map function. With the option batched=True,  -we unleash the speed of out fast tokenizers.   - -28 -00:02:36,720 --> 00:02:39,360 -The last problem comes when  -we need to create a batch.   - -29 -00:02:40,160 --> 00:02:43,680 -Unless you changed the preprocessing  -function to apply some fixed padding,   - -30 -00:02:43,680 --> 00:02:49,280 -we will get sentences of various lengths, which we  -need to pad to the same length. The padding needs   - -31 -00:02:49,280 --> 00:02:55,280 -to be applied to the inputs as well as the labels,  -since we should have one label per token. Again,   - -32 -00:02:55,280 --> 00:03:01,200 --100 indicates the labels that should be ignored  -for the loss computation. This is all done for   - -33 -00:03:01,200 --> 00:03:05,760 -us by the DataCollatorForTokenClassification,  -which you can use in PyTorch or TensorFlow.   - -34 -00:03:06,400 --> 00:03:10,960 -With all of this, you are either ready to send  -your data and this data collator to the Trainer,   - -35 -00:03:10,960 --> 00:03:17,840 -or to use the to_tf_dataset method  -and use the fit method of your model. +1 +00:00:05,730 --> 00:00:07,590 +- Let's study how to preprocess a dataset + +2 +00:00:07,590 --> 00:00:09,063 +for token classification! + +3 +00:00:10,560 --> 00:00:12,660 +Token classification regroups any task + +4 +00:00:12,660 --> 00:00:14,940 +that can be framed as labeling each word + +5 +00:00:14,940 --> 00:00:17,190 +or token in a sentence, + +6 +00:00:17,190 --> 00:00:19,530 +like identifying the +persons, organizations + +7 +00:00:19,530 --> 00:00:21,093 +and locations for instance. + +8 +00:00:22,170 --> 00:00:25,470 +For our example, we will +use the Conll dataset, + +9 +00:00:25,470 --> 00:00:27,900 +in which we remove columns we won't use + +10 +00:00:27,900 --> 00:00:29,940 +and rename the other +ones to get to a dataset + +11 +00:00:29,940 --> 00:00:32,943 +with just two columns, words and labels. + +12 +00:00:34,200 --> 00:00:36,750 +If you have your own dataset +for token classification, + +13 +00:00:36,750 --> 00:00:39,870 +just make sure you clean your +data to get to the same point, + +14 +00:00:39,870 --> 00:00:43,290 +with one column containing +words as list of strings + +15 +00:00:43,290 --> 00:00:45,540 +and another containing labels as integers + +16 +00:00:45,540 --> 00:00:48,513 +spanning from zero to your +number of labels minus one. + +17 +00:00:49,740 --> 00:00:52,290 +Make sure you have your +label names stored somewhere. + +18 +00:00:52,290 --> 00:00:54,810 +Here we get them from +the dataset features. + +19 +00:00:54,810 --> 00:00:57,660 +So you are able to map the +integers to some real labels + +20 +00:00:57,660 --> 00:00:58,960 +when inspecting your data. + +21 +00:01:00,690 --> 00:01:03,510 +Here we are doing named +entity recognitions, + +22 +00:01:03,510 --> 00:01:05,430 +so ours labels are either O + +23 +00:01:05,430 --> 00:01:08,310 +for words that do not +belong to any entity. 
+ +24 +00:01:08,310 --> 00:01:13,310 +LOC for location, PER for +person, ORG for organization + +25 +00:01:13,860 --> 00:01:15,603 +and MISC for miscellaneous. + +26 +00:01:16,650 --> 00:01:18,540 +Each label has two versions. + +27 +00:01:18,540 --> 00:01:21,960 +The B labels indicate a +word that begins an entity + +28 +00:01:21,960 --> 00:01:25,503 +while the I labels indicate a +word that is inside an entity. + +29 +00:01:27,180 --> 00:01:28,830 +The first step to preprocess our data + +30 +00:01:28,830 --> 00:01:30,660 +is to tokenize the words. + +31 +00:01:30,660 --> 00:01:33,120 +This is very easily +done with the tokenizer. + +32 +00:01:33,120 --> 00:01:35,370 +We just have to tell it we +have pre-tokenized the data + +33 +00:01:35,370 --> 00:01:37,503 +with the flag is_split_into_words=True. + +34 +00:01:38,520 --> 00:01:40,380 +Then comes the hard part. + +35 +00:01:40,380 --> 00:01:42,360 +Since we have added special tokens + +36 +00:01:42,360 --> 00:01:45,270 +and each word may have been +split into several tokens, + +37 +00:01:45,270 --> 00:01:48,090 +our labels won't match the tokens anymore. + +38 +00:01:48,090 --> 00:01:50,670 +This is where the word IDs +our fast tokenizer provides + +39 +00:01:50,670 --> 00:01:51,723 +come to the rescue. + +40 +00:01:53,040 --> 00:01:55,500 +They match each token to +the word it belongs to + +41 +00:01:55,500 --> 00:01:58,470 +which allows us to map +each token to its label. + +42 +00:01:58,470 --> 00:01:59,303 +We just have to make sure + +43 +00:01:59,303 --> 00:02:01,710 +we change the B labels +to their I counterparts + +44 +00:02:01,710 --> 00:02:03,450 +for tokens that are inside + +45 +00:02:03,450 --> 00:02:05,433 +but not at the beginning of a word. + +46 +00:02:06,330 --> 00:02:09,120 +The special tokens get a label of -100, + +47 +00:02:09,120 --> 00:02:11,070 +which is how we tell the +Transformer loss functions + +48 +00:02:11,070 --> 00:02:14,607 +to ignore them when computing the loss. + +49 +00:02:14,607 --> 00:02:16,890 +The code is then pretty straightforward. + +50 +00:02:16,890 --> 00:02:18,660 +We write a function that shifts the labels + +51 +00:02:18,660 --> 00:02:21,840 +for tokens that are inside a +word that you can customize + +52 +00:02:21,840 --> 00:02:24,490 +and use it when generating +the labels for each token. + +53 +00:02:25,830 --> 00:02:28,260 +Once that function to create +our labels is written, + +54 +00:02:28,260 --> 00:02:31,920 +we can preprocess the whole +dataset using the map function. + +55 +00:02:31,920 --> 00:02:33,360 +With the option batched=True, + +56 +00:02:33,360 --> 00:02:35,793 +we unleash the speed +of out fast tokenizers. + +57 +00:02:37,110 --> 00:02:40,350 +The last problem comes when +we need to create a batch. + +58 +00:02:40,350 --> 00:02:42,150 +Unless you changed the +preprocessing function + +59 +00:02:42,150 --> 00:02:43,890 +to apply some fixed padding, + +60 +00:02:43,890 --> 00:02:45,900 +we will get sentences of various lengths, + +61 +00:02:45,900 --> 00:02:47,900 +which we need to pad to the same length. + +62 +00:02:48,930 --> 00:02:50,730 +The padding needs to be +applied to the inputs + +63 +00:02:50,730 --> 00:02:51,900 +as well as the labels, + +64 +00:02:51,900 --> 00:02:53,950 +since we should have one label per token. + +65 +00:02:54,870 --> 00:02:58,260 +Again, -100 indicates the +labels that should be ignored + +66 +00:02:58,260 --> 00:02:59,510 +for the loss computation. 
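Here is a minimal sketch of the label-alignment function described above, assuming a fast tokenizer and a dataset with `words` and `labels` columns as set up earlier; the B-to-I trick relies on B and I labels being adjacent integers (as in CoNLL-style label lists), so adapt it if your labels are ordered differently.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # any fast tokenizer works

def tokenize_and_align_labels(examples):
    tokenized = tokenizer(examples["words"], truncation=True, is_split_into_words=True)
    all_labels = []
    for i, word_labels in enumerate(examples["labels"]):
        word_ids = tokenized.word_ids(batch_index=i)
        previous_word = None
        labels = []
        for word_id in word_ids:
            if word_id is None:             # special tokens ([CLS], [SEP], ...)
                labels.append(-100)
            elif word_id != previous_word:  # first token of a word keeps its label
                labels.append(word_labels[word_id])
            else:                           # token inside a word: turn B-XXX into I-XXX
                label = word_labels[word_id]
                labels.append(label + 1 if label % 2 == 1 else label)
            previous_word = word_id
        all_labels.append(labels)
    tokenized["labels"] = all_labels
    return tokenized

# tokenized_datasets = raw_datasets.map(tokenize_and_align_labels, batched=True)

The -100 values are the ones the data collator and the loss function will ignore, as explained above.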
+ +67 +00:03:00,420 --> 00:03:01,560 +This is all done for us + +68 +00:03:01,560 --> 00:03:04,050 +by the DataCollatorForTokenClassification, + +69 +00:03:04,050 --> 00:03:06,740 +which you can use in +PyTorch or TensorFlow. + +70 +00:03:06,740 --> 00:03:08,880 +With all of this, you are +either ready to send your data + +71 +00:03:08,880 --> 00:03:11,190 +and this data collator to the Trainer, + +72 +00:03:11,190 --> 00:03:13,320 +or use the to_tf_dataset method + +73 +00:03:13,320 --> 00:03:15,333 +and the fit method of your model. + diff --git a/subtitles/en/56_data-processing-for-masked-language-modeling.srt b/subtitles/en/56_data-processing-for-masked-language-modeling.srt index 30247046c..a411f55af 100644 --- a/subtitles/en/56_data-processing-for-masked-language-modeling.srt +++ b/subtitles/en/56_data-processing-for-masked-language-modeling.srt @@ -1,124 +1,249 @@ -1 -00:00:05,120 --> 00:00:11,360 -Let's see how we can preprocess our data  -for masked language modeling. As a reminder,   - -2 -00:00:11,360 --> 00:00:16,320 -masked language modeling is when a model  -needs to fill the blanks in a sentence.   - -3 -00:00:16,320 --> 00:00:22,400 -To do this, you just need texts, no labels,  -as this is a self-supervised problem. To apply   - -4 -00:00:22,400 --> 00:00:27,280 -this on your own data, just make sure you have all  -your texts gathered in one column of your dataset.   - -5 -00:00:28,160 --> 00:00:32,320 -Before we start randomly masking things, we  -will need to somehow make all those texts the   - -6 -00:00:32,320 --> 00:00:38,400 -same length to batch them together. The first  -way to make all the texts the same length is   - -7 -00:00:38,400 --> 00:00:43,840 -the one we used in text classification. let's  -pad the short texts and truncate the long ones.   - -8 -00:00:44,800 --> 00:00:48,400 -As we have seen when we processed  -data for text classification,   - -9 -00:00:48,400 --> 00:00:51,840 -this is all done by our tokenizer with the  -right options for padding and truncation.   - -10 -00:00:52,880 --> 00:00:57,840 -This will however make us lose a lot of texts  -if the examples in our dataset are very long,   - -11 -00:00:58,400 --> 00:01:03,040 -compared to the context length we picked.  -Here, all the portion in gray is lost.   - -12 -00:01:04,160 --> 00:01:08,320 -This is why a second way to generate samples  -of text with the same length is to chunk our   - -13 -00:01:08,320 --> 00:01:12,720 -text in pieces of context lengths, instead of  -discarding everything after the first chunk.   - -14 -00:01:13,760 --> 00:01:17,920 -There will probably be a remainder of length  -smaller than the context size, which we can   - -15 -00:01:17,920 --> 00:01:24,480 -choose to keep and pad or ignore. Here is how we  -can apply this in practice, by just adding the   - -16 -00:01:24,480 --> 00:01:30,080 -return overflowing tokens option in our tokenizer  -call. Note how this gives us a bigger dataset!   - -17 -00:01:31,280 --> 00:01:36,720 -This second way of chunking is ideal if all your  -texts are very long, but it won't work as nicely   - -18 -00:01:36,720 --> 00:01:42,640 -if you have a variety of lengths in the texts.  -In this case, the best option is to concatenate   - -19 -00:01:42,640 --> 00:01:47,600 -all your tokenized texts in one big stream, with  -a special tokens to indicate when you pass from   - -20 -00:01:47,600 --> 00:01:54,560 -one document to the other, and only then split the  -big stream into chunks. 
Here is how it can be done   - -21 -00:01:54,560 --> 00:02:01,200 -with code, with one loop to concatenate all the  -texts and another one to chunk it. Notice how it   - -22 -00:02:01,200 --> 00:02:05,920 -reduces the number of samples in our dataset here,  -there must have been quite a few short entries!   - -23 -00:02:07,520 --> 00:02:12,960 -Once this is done, the masking is the easy part.  -There is a data collator designed specifically for   - -24 -00:02:12,960 --> 00:02:18,240 -this in the Transformers library. You can use  -it directly in the Trainer, or when converting   - -25 -00:02:18,240 --> 00:02:29,600 -your datasets to tensorflow datasets before  -doing Keras.fit, with the to_tf_dataset method. +1 +00:00:00,000 --> 00:00:02,333 +(whooshing) + +2 +00:00:05,250 --> 00:00:07,230 +- Let's see how we can preprocess our data + +3 +00:00:07,230 --> 00:00:08,703 +for masked language modeling. + +4 +00:00:10,230 --> 00:00:12,570 +As a reminder, masked language modeling + +5 +00:00:12,570 --> 00:00:15,333 +is when a model needs to fill +the blanks in a sentence. + +6 +00:00:16,530 --> 00:00:19,650 +To do this, you just +need texts, no labels, + +7 +00:00:19,650 --> 00:00:22,200 +as this is a self-supervised problem. + +8 +00:00:22,200 --> 00:00:23,670 +To apply this on your own data, + +9 +00:00:23,670 --> 00:00:25,740 +just make sure you have +all your texts gathered + +10 +00:00:25,740 --> 00:00:27,603 +in one column of your dataset. + +11 +00:00:28,440 --> 00:00:30,480 +Before we start randomly masking things, + +12 +00:00:30,480 --> 00:00:33,090 +we will need to somehow make +all those texts the same length + +13 +00:00:33,090 --> 00:00:34,263 +to batch them together. + +14 +00:00:35,640 --> 00:00:38,490 +The first way to make all +the texts the same length + +15 +00:00:38,490 --> 00:00:40,590 +is the one we used in text classification. + +16 +00:00:41,430 --> 00:00:44,163 +Let's pad the short texts +and truncate the long ones. + +17 +00:00:45,030 --> 00:00:45,900 +As we have seen + +18 +00:00:45,900 --> 00:00:48,690 +when we processed data +for text classification, + +19 +00:00:48,690 --> 00:00:49,923 +this is all done by our tokenizer + +20 +00:00:49,923 --> 00:00:53,130 +with the right options for +padding and truncation. + +21 +00:00:53,130 --> 00:00:56,100 +This will however make +us lose a lot of texts + +22 +00:00:56,100 --> 00:00:58,620 +if the examples in our +dataset are very long, + +23 +00:00:58,620 --> 00:01:00,960 +compared to the context length we picked. + +24 +00:01:00,960 --> 00:01:03,393 +Here, all the portion in gray is lost. + +25 +00:01:04,410 --> 00:01:06,660 +This is why a second way +to generate samples of text + +26 +00:01:06,660 --> 00:01:08,820 +with the same length is to chunk our text + +27 +00:01:08,820 --> 00:01:10,560 +in pieces of context lengths, + +28 +00:01:10,560 --> 00:01:14,010 +instead of discarding everything +after the first chunk. + +29 +00:01:14,010 --> 00:01:15,420 +There will probably be a remainder + +30 +00:01:15,420 --> 00:01:17,700 +of length smaller than the context size, + +31 +00:01:17,700 --> 00:01:20,493 +which we can choose to +keep and pad or ignore. + +32 +00:01:21,570 --> 00:01:23,790 +Here is how we can apply this in practice, + +33 +00:01:23,790 --> 00:01:26,460 +by just adding the return +overflowing tokens option + +34 +00:01:26,460 --> 00:01:28,200 +in our tokenizer call. + +35 +00:01:28,200 --> 00:01:30,243 +Note how this gives us a bigger dataset! 
+ +36 +00:01:31,560 --> 00:01:34,260 +This second way of chunking +is ideal if all your texts + +37 +00:01:34,260 --> 00:01:36,270 +are very long, but it won't work + +38 +00:01:36,270 --> 00:01:39,900 +as nicely if you have a variety +of lengths in the texts. + +39 +00:01:39,900 --> 00:01:41,040 +In this case, + +40 +00:01:41,040 --> 00:01:44,280 +the best option is to concatenate +all your tokenized texts + +41 +00:01:44,280 --> 00:01:46,560 +in one big stream, with a special tokens + +42 +00:01:46,560 --> 00:01:49,800 +to indicate when you pass from +one document to the other, + +43 +00:01:49,800 --> 00:01:52,503 +and only then split the +big stream into chunks. + +44 +00:01:53,760 --> 00:01:55,620 +Here is how it can be done with code, + +45 +00:01:55,620 --> 00:01:58,230 +with one loop to concatenate all the texts + +46 +00:01:58,230 --> 00:01:59,673 +and another one to chunk it. + +47 +00:02:00,780 --> 00:02:02,850 +Notice how it reduces +the number of samples + +48 +00:02:02,850 --> 00:02:04,230 +in our dataset here, + +49 +00:02:04,230 --> 00:02:06,580 +there must have been +quite a few short entries! + +50 +00:02:07,710 --> 00:02:11,130 +Once this is done, the +masking is the easy part. + +51 +00:02:11,130 --> 00:02:13,400 +There is a data collator +designed specifically for this + +52 +00:02:13,400 --> 00:02:15,540 +in the Transformers library. + +53 +00:02:15,540 --> 00:02:17,700 +You can use it directly in the Trainer, + +54 +00:02:17,700 --> 00:02:20,400 +or when converting your +datasets to tensorflow datasets + +55 +00:02:20,400 --> 00:02:23,703 +before doing Keras.fit, with +the to_tf_dataset method. + +56 +00:02:24,992 --> 00:02:27,325 +(whooshing) + diff --git a/subtitles/en/57_what-is-perplexity.srt b/subtitles/en/57_what-is-perplexity.srt index b7fb95c7c..cdc14eb17 100644 --- a/subtitles/en/57_what-is-perplexity.srt +++ b/subtitles/en/57_what-is-perplexity.srt @@ -1,99 +1,231 @@ -1 -00:00:05,280 --> 00:00:09,200 -In this video we take a look at the  -mysterious sounding metric called Perplexity.   - -2 -00:00:10,880 --> 00:00:14,880 -You might have encountered perplexity  -when reading about generative models.   - -3 -00:00:14,880 --> 00:00:19,760 -You can see two examples here from the original  -transformer paper “Attention is all you need”   - -4 -00:00:19,760 --> 00:00:25,600 -as well as the more recent GPT-2 paper. Perplexity  -is a common metric to measure the performance   - -5 -00:00:25,600 --> 00:00:30,880 -of language models. The smaller the value the  -better the performance. But what does it actually   - -6 -00:00:30,880 --> 00:00:36,880 -mean and how can we calculate it? A very common  -quantity in machine learning is the likelihood.   - -7 -00:00:37,440 --> 00:00:41,280 -We can calculate the likelihood as the  -product of each token’s probability   - -8 -00:00:42,160 --> 00:00:47,200 -What this means is that for each token we use  -the language model to predict its probability   - -9 -00:00:47,200 --> 00:00:52,960 -based on the previous tokens. In the end we  -multiply all probabilities to get the Likelihood.   - -10 -00:00:55,680 --> 00:00:59,120 -With the likelihood we can calculate  -another important quantity:   - -11 -00:00:59,120 --> 00:01:04,560 -the cross entropy. You might already have heard  -about cross-entropy when looking at loss function.   - -12 -00:01:05,440 --> 00:01:08,480 -Cross-entropy is often used as a  -loss function in classification.   
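For reference, the concatenate-and-chunk approach plus the masking collator described in this video could look roughly like this; the checkpoint, column name, chunk size and masking probability are illustrative choices.

from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
chunk_size = 128  # the "context length" picked for the chunks

def group_texts(examples):
    # Concatenate all tokenized texts of the batch into one big stream...
    concatenated = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = (len(concatenated["input_ids"]) // chunk_size) * chunk_size
    # ...then split that stream into chunks of chunk_size (dropping the small remainder)
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# tokenized = dataset.map(lambda x: tokenizer(x["text"]), batched=True, remove_columns=["text"])
# lm_dataset = tokenized.map(group_texts, batched=True)

# The collator takes care of the random masking at batch time
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

The same collator can be passed to the Trainer or used when converting the dataset with to_tf_dataset, as mentioned at the end of the video.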
- -13 -00:01:09,040 --> 00:01:14,720 -In language modeling we predict the next  -token which also is a classification task.   - -14 -00:01:15,600 --> 00:01:20,400 -Therefore, if we want to calculate the cross  -entropy of an example we can simply pass it to the   - -15 -00:01:20,400 --> 00:01:25,840 -model with the inputs as labels. The loss return  -by the model then corresponds the cross entropy.   - -16 -00:01:28,880 --> 00:01:32,640 -We are now only a single operation  -away from calculating the perplexity.   - -17 -00:01:33,280 --> 00:01:39,360 -By exponentiating the cross-entropy we get the  -perplexity. So you see that the perplexity is   - -18 -00:01:39,360 --> 00:01:55,040 -closely related to the loss. Keep in mind that  -the loss is only a weak proxy for a model’s   - -19 -00:01:55,040 --> 00:02:01,600 -ability to generate quality text and the same is  -true for perplexity. For this reason one usually   - -20 -00:02:01,600 --> 00:02:07,840 -also calculates more sophisticated metrics  -such as BLEU or ROUGE on generative tasks. +1 +00:00:00,095 --> 00:00:01,582 +(screen whooshing) + +2 +00:00:01,582 --> 00:00:02,659 +(sticker popping) + +3 +00:00:02,659 --> 00:00:05,379 +(screen whooshing) + +4 +00:00:05,379 --> 00:00:06,720 +- In this video, we take a look + +5 +00:00:06,720 --> 00:00:09,483 +at the mysterious sounding +metric called perplexity. + +6 +00:00:11,070 --> 00:00:12,630 +You might have encountered perplexity + +7 +00:00:12,630 --> 00:00:14,970 +when reading about generative models. + +8 +00:00:14,970 --> 00:00:16,680 +You can see two examples here, + +9 +00:00:16,680 --> 00:00:18,577 +one from the original transformer paper, + +10 +00:00:18,577 --> 00:00:19,950 +"Attention is all you need," + +11 +00:00:19,950 --> 00:00:23,340 +and the other one from the +more recent GPT-2 paper. + +12 +00:00:23,340 --> 00:00:25,740 +Perplexity is a common metric +to measure the performance + +13 +00:00:25,740 --> 00:00:27,150 +of language models. + +14 +00:00:27,150 --> 00:00:30,000 +The smaller its value, the +better the performance. + +15 +00:00:30,000 --> 00:00:32,950 +But what does it actually mean +and how can we calculate it? + +16 +00:00:34,440 --> 00:00:36,180 +A very common quantity in machine learning + +17 +00:00:36,180 --> 00:00:37,650 +is the likelihood. + +18 +00:00:37,650 --> 00:00:39,240 +We can calculate the likelihood + +19 +00:00:39,240 --> 00:00:42,390 +as the product of each +token's probability. + +20 +00:00:42,390 --> 00:00:44,730 +What this means is that for each token, + +21 +00:00:44,730 --> 00:00:47,340 +we use the language model +to predict its probability + +22 +00:00:47,340 --> 00:00:49,560 +based on the previous tokens. + +23 +00:00:49,560 --> 00:00:52,050 +In the end, we multiply all probabilities + +24 +00:00:52,050 --> 00:00:53,253 +to get the likelihood. + +25 +00:00:55,892 --> 00:00:57,000 +With the likelihood, + +26 +00:00:57,000 --> 00:00:59,340 +we can calculate another +important quantity, + +27 +00:00:59,340 --> 00:01:01,200 +the cross-entropy. + +28 +00:01:01,200 --> 00:01:03,450 +You might have already +heard about cross-entropy + +29 +00:01:03,450 --> 00:01:05,670 +when looking at loss function. + +30 +00:01:05,670 --> 00:01:09,210 +It is often used as a loss +function in classification. + +31 +00:01:09,210 --> 00:01:11,610 +In language modeling, we +predict the next token + +32 +00:01:11,610 --> 00:01:12,930 +based on the previous token, + +33 +00:01:12,930 --> 00:01:15,810 +which is also a classification task. 
+
+34
+00:01:15,810 --> 00:01:17,340
+Therefore, if we want to calculate
+
+35
+00:01:17,340 --> 00:01:19,290
+the cross-entropy of an example,
+
+36
+00:01:19,290 --> 00:01:21,090
+we can simply pass it to the model
+
+37
+00:01:21,090 --> 00:01:23,580
+with its inputs as labels.
+
+38
+00:01:23,580 --> 00:01:26,433
+The loss then corresponds
+to the cross-entropy.
+
+39
+00:01:29,130 --> 00:01:31,110
+We are now only a single operation away
+
+40
+00:01:31,110 --> 00:01:33,510
+from calculating the perplexity.
+
+41
+00:01:33,510 --> 00:01:37,710
+By exponentiating the cross-entropy,
+we get the perplexity.
+
+42
+00:01:37,710 --> 00:01:40,260
+So you see that the
+perplexity is closely related
+
+43
+00:01:40,260 --> 00:01:41,163
+to the loss.
+
+44
+00:01:42,060 --> 00:01:43,380
+Plugging in previous results
+
+45
+00:01:43,380 --> 00:01:47,010
+shows that this is
+equivalent to exponentiating
+
+46
+00:01:47,010 --> 00:01:51,033
+the negative average log
+probability of each token.
+
+47
+00:01:52,050 --> 00:01:54,630
+Keep in mind that the
+loss is only a weak proxy
+
+48
+00:01:54,630 --> 00:01:57,360
+for a model's ability
+to generate quality text
+
+49
+00:01:57,360 --> 00:02:00,510
+and the same is true for perplexity.
+
+50
+00:02:00,510 --> 00:02:02,550
+For this reason, one
+usually also calculates
+
+51
+00:02:02,550 --> 00:02:03,840
+more sophisticated metrics
+
+52
+00:02:03,840 --> 00:02:07,413
+such as BLEU or ROUGE on generative tasks.
+
+53
+00:02:08,551 --> 00:02:11,468
+(screen whooshing)
+
diff --git a/subtitles/en/58_what-is-domain-adaptation.srt b/subtitles/en/58_what-is-domain-adaptation.srt
index a83e76cb9..5d4f8c9d4 100644
--- a/subtitles/en/58_what-is-domain-adaptation.srt
+++ b/subtitles/en/58_what-is-domain-adaptation.srt
@@ -1,89 +1,185 @@
-1
-00:00:05,840 --> 00:00:12,400
-What is domain adaptation? When fine-tuning  
-a pretrained model on a new dataset,   
-
-2
-00:00:12,400 --> 00:00:17,200
-the fine-tuned model we obtain will make  
-predictions that are attuned to this new dataset.   
-
-3
-00:00:18,640 --> 00:00:23,440
-When the two models are trained with the same  
-task, we can then compare their predictions   
-
-4
-00:00:23,440 --> 00:00:27,600
-on the same input. The predictions  
-of the two models will be different,   
-
-5
-00:00:27,600 --> 00:00:32,640
-in a way that reflects the differences  
-between the two datasets, a phenomenon we call   
-
-6
-00:00:32,640 --> 00:00:39,840
-domain adaptation. Let's look at an example with  
-mask language modeling, by comparing the outputs   
-
-7
-00:00:39,840 --> 00:00:44,400
-of the pretrained distilBERT model with the  
-version fine-tuned in chapter 7 of the course   
-
-8
-00:00:44,400 --> 00:00:50,800
-(linked below). The pretrained model makes generic  
-predictions, whereas the fine-tuned model has its   
-
-9
-00:00:50,800 --> 00:00:57,040
-first two predictions linked to cinema. Since  
-it was fine-tuned on a movie reviews dataset,   
-
-10
-00:00:57,040 --> 00:01:00,320
-it's perfectly normal to see it  
-adapted its suggestions like this.   
-
-11
-00:01:01,200 --> 00:01:05,520
-Notice how it keeps the same predictions as  
-the pretrained model afterward. Even if the   
-
-12
-00:01:05,520 --> 00:01:09,920
-fine-tuned model adapts to the new dataset,  
-it's not forgetting what it was pretrained on.   
-
-13
-00:01:11,200 --> 00:01:17,120
-This is another example on a translation task.  
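In code, going from the loss to the perplexity is a single exponential, as described above; the checkpoint and sentence below are only placeholders.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # any causal language model would do
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

inputs = tokenizer("Hugging Face is based in New York City.", return_tensors="pt")
with torch.no_grad():
    # Passing the input ids as labels makes the model return the cross-entropy loss
    outputs = model(**inputs, labels=inputs["input_ids"])

cross_entropy = outputs.loss           # average negative log probability per token
perplexity = torch.exp(cross_entropy)  # exponentiate it to get the perplexity
print(cross_entropy.item(), perplexity.item())

A lower perplexity means the model assigned a higher average probability to the tokens of the text, which is why smaller values indicate better language-modeling performance.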
-On top we use a pretrained French/English model   - -14 -00:01:17,120 --> 00:01:22,720 -and at the bottom, the version we fine-tuned in  -chapter 7. The top model is pretrained on lots of   - -15 -00:01:22,720 --> 00:01:27,440 -texts, and leaves technical English terms like  -plugin and email unchanged in the translation   - -16 -00:01:28,160 --> 00:01:33,360 -(both are perfectly understood by French people).  -The dataset picked for the fine-tuning is a   - -17 -00:01:33,360 --> 00:01:38,240 -dataset of technical texts where special attention  -was picked to translate everything in French.   - -18 -00:01:38,960 --> 00:01:50,560 -As a result, the fine-tuned model picked that  -habit and translated both plugin and email. +1 +00:00:00,000 --> 00:00:01,402 +(air whooshing) + +2 +00:00:01,402 --> 00:00:02,720 +(smiley snapping) + +3 +00:00:02,720 --> 00:00:05,910 +(air whooshing) + +4 +00:00:05,910 --> 00:00:07,923 +- What is domain adaptation? + +5 +00:00:09,540 --> 00:00:12,540 +When fine-tuning a pre-trained +model on a new dataset, + +6 +00:00:12,540 --> 00:00:15,480 +the fine-tuned model we +obtain will make predictions + +7 +00:00:15,480 --> 00:00:17,433 +that are attuned to this new dataset. + +8 +00:00:18,840 --> 00:00:21,840 +When the two models are +trained with the same task, + +9 +00:00:21,840 --> 00:00:25,320 +we can then compare their +predictions on the same input. + +10 +00:00:25,320 --> 00:00:27,870 +The predictions of the two +models will be different + +11 +00:00:27,870 --> 00:00:29,790 +in a way that reflects the differences + +12 +00:00:29,790 --> 00:00:31,680 +between the two datasets, + +13 +00:00:31,680 --> 00:00:34,053 +a phenomenon we call domain adaptation. + +14 +00:00:35,310 --> 00:00:38,640 +Let's look at an example +with masked language modeling + +15 +00:00:38,640 --> 00:00:41,910 +by comparing the outputs of the +pre-trained DistilBERT model + +16 +00:00:41,910 --> 00:00:43,080 +with the version fine-tuned + +17 +00:00:43,080 --> 00:00:45,273 +in chapter 7 of the course, linked below. + +18 +00:00:46,500 --> 00:00:49,140 +The pre-trained model +makes generic predictions, + +19 +00:00:49,140 --> 00:00:50,580 +whereas the fine-tuned model + +20 +00:00:50,580 --> 00:00:53,253 +has its first two +predictions linked to cinema. + +21 +00:00:54,390 --> 00:00:57,210 +Since it was fine-tuned on +a movie reviews dataset, + +22 +00:00:57,210 --> 00:00:58,680 +it's perfectly normal to see + +23 +00:00:58,680 --> 00:01:01,440 +it adapted its suggestions like this. + +24 +00:01:01,440 --> 00:01:03,090 +Notice how it keeps the same prediction + +25 +00:01:03,090 --> 00:01:05,220 +as the pre-trained model afterward. + +26 +00:01:05,220 --> 00:01:08,100 +Even if the fine-tuned model +adapts to the new dataset, + +27 +00:01:08,100 --> 00:01:10,450 +it's not forgetting what +it was pre-trained on. + +28 +00:01:11,490 --> 00:01:14,220 +This is another example +on a translation task. + +29 +00:01:14,220 --> 00:01:17,310 +On top, we use a pre-trained +French/English model, + +30 +00:01:17,310 --> 00:01:21,330 +and at the bottom, the version +we fine-tuned in chapter 7. + +31 +00:01:21,330 --> 00:01:23,610 +The top model is pre-trained +on lots of texts, + +32 +00:01:23,610 --> 00:01:25,170 +and leaves technical English terms, + +33 +00:01:25,170 --> 00:01:28,350 +like plugin and email, +unchanged in the translation. + +34 +00:01:28,350 --> 00:01:31,350 +Both are perfectly +understood by French people. 
+ +35 +00:01:31,350 --> 00:01:33,780 +The dataset picked for the +fine-tuning is a dataset + +36 +00:01:33,780 --> 00:01:36,660 +of technical texts where +special attention was picked + +37 +00:01:36,660 --> 00:01:39,150 +on translating everything in French. + +38 +00:01:39,150 --> 00:01:42,090 +As a result, the fine-tuned +model picked that habit + +39 +00:01:42,090 --> 00:01:44,193 +and translated both plugin and email. + +40 +00:01:45,942 --> 00:01:49,181 +(air whooshing) + +41 +00:01:49,181 --> 00:01:50,592 +(air whooshing) + diff --git a/subtitles/en/59_data-processing-for-translation.srt b/subtitles/en/59_data-processing-for-translation.srt index c5d8a99bd..aaddd1f56 100644 --- a/subtitles/en/59_data-processing-for-translation.srt +++ b/subtitles/en/59_data-processing-for-translation.srt @@ -1,158 +1,247 @@ -1 -00:00:05,670 --> 00:00:09,630 -Let's see how to preprocess a dataset for -translation. - -2 -00:00:09,630 --> 00:00:13,269 -This is the task of well translating a sentence -in another language. - -3 -00:00:13,269 --> 00:00:18,110 -This video will focus on how to preprocess -your dataset once you have managed to put - -4 -00:00:18,110 --> 00:00:23,090 -it in the following format: one column for -the input texts, and one for the target texts. - -5 -00:00:23,090 --> 00:00:28,439 -Here is how we can achieve this with the Datasets -library on the KDE4 dataset for English to - -6 -00:00:28,439 --> 00:00:30,960 -French translation. - -7 -00:00:30,960 --> 00:00:35,360 -As long as you manage to have your data look -like this, you should be able to follow the - -8 -00:00:35,360 --> 00:00:36,769 -same steps. - -9 -00:00:36,769 --> 00:00:41,550 -For once, our labels are not integers corresponding -to some classes, but plain text. - -10 -00:00:41,550 --> 00:00:44,760 -We will thus need to tokenize them, like our -inputs. - -11 -00:00:44,760 --> 00:00:50,820 -There is a trap there though, as if you tokenize -your targets like your inputs, you will hit - -12 -00:00:50,820 --> 00:00:51,820 -a problem. - -13 -00:00:51,820 --> 00:00:55,829 -Even if you don't speak French, you might -notice some weird things in the tokenization - -14 -00:00:55,829 --> 00:01:01,800 -of the targets: most of the words are tokenized -in several subtokens, while "fish", one of - -15 -00:01:01,800 --> 00:01:05,799 -the only English word, is tokenized as a single -word. - -16 -00:01:05,799 --> 00:01:09,760 -That's because our inputs have been tokenized -as English. - -17 -00:01:09,760 --> 00:01:13,939 -Since our model knows two languages, you have -to warn it when tokenizing the targets, so - -18 -00:01:13,939 --> 00:01:16,360 -it switches in French mode. - -19 -00:01:16,360 --> 00:01:20,090 -This is done with the as_target_tokenizer -context manager. - -20 -00:01:20,090 --> 00:01:24,900 -You can see how it results in a more compact -tokenization. - -21 -00:01:24,900 --> 00:01:28,509 -Processing the whole dataset is then super -easy with the map function. - -22 -00:01:28,509 --> 00:01:32,900 -You can pick different maximum lengths for -the input and targets, and choose to pad at - -23 -00:01:32,900 --> 00:01:37,210 -this stage to that maximum length by setting -padding=max_length. - -24 -00:01:37,210 --> 00:01:42,540 -Here we will show you how to pad dynamically -as it requires one more step. - -25 -00:01:42,540 --> 00:01:45,560 -Your inputs and targets are all sentence of -various lengths. 
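One quick way to observe this domain shift yourself is to run the same masked sentence through both checkpoints with the fill-mask pipeline; the second checkpoint name is assumed to be the one fine-tuned on movie reviews in chapter 7 of the course, so substitute your own if it differs.

from transformers import pipeline

text = "This is a great [MASK]."

pretrained = pipeline("fill-mask", model="distilbert-base-uncased")
# Replace with the checkpoint you fine-tuned on the movie-review dataset in chapter 7
fine_tuned = pipeline("fill-mask", model="huggingface-course/distilbert-base-uncased-finetuned-imdb")

for name, fill_mask in [("pretrained", pretrained), ("fine-tuned", fine_tuned)]:
    print(name, [prediction["token_str"] for prediction in fill_mask(text)])

The fine-tuned checkpoint should surface movie-related words near the top while still agreeing with the pretrained model on the more generic suggestions, which is the behavior discussed above.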
- -26 -00:01:45,560 --> 00:01:50,470 -We will pad the inputs and targets separately -as the maximum length of the inputs and targets - -27 -00:01:50,470 --> 00:01:52,740 -might be different. - -28 -00:01:52,740 --> 00:01:57,259 -Then we pad the inputs with the pad token -and the targets with the -100 index, to make - -29 -00:01:57,259 --> 00:02:01,470 -sure they are not taken into account in the -loss computation. - -30 -00:02:01,470 --> 00:02:04,869 -Once this is done, batching inputs and targets -become super easy! - -31 -00:02:04,869 --> 00:02:10,220 -The Transformers library provides us with -a data collator to do this all automatically. - -32 -00:02:10,220 --> 00:02:15,920 -You can then pass it to the Trainer with your -datasets, or use it in the to_tf_dataset method - -33 -00:02:15,920 --> 00:02:17,410 -before using model.fit(). +1 +00:00:00,449 --> 00:00:01,559 +(air whooshing) + +2 +00:00:01,559 --> 00:00:02,767 +(logo popping) + +3 +00:00:02,767 --> 00:00:05,670 +(metal sliding) + +4 +00:00:05,670 --> 00:00:08,470 +- Let's see how to preprocess +a dataset for translation. + +5 +00:00:09,540 --> 00:00:12,420 +This is a task of well +translating a sentence + +6 +00:00:12,420 --> 00:00:14,310 +in another language. + +7 +00:00:14,310 --> 00:00:17,100 +This video will focus on how +to preprocess your dataset + +8 +00:00:17,100 --> 00:00:19,950 +once you've managed to put +it in the following format. + +9 +00:00:19,950 --> 00:00:23,730 +One column for input texts +and one for the target texts. + +10 +00:00:23,730 --> 00:00:25,980 +Here is how we can achieve +this with the Datasets library + +11 +00:00:25,980 --> 00:00:29,643 +and the KDE4 dataset for +English to French translation. + +12 +00:00:30,870 --> 00:00:33,240 +As long as you manage to have +your data look like this, + +13 +00:00:33,240 --> 00:00:35,440 +you should be able to +follow the same steps. + +14 +00:00:36,630 --> 00:00:39,210 +For once, our labels are not integers + +15 +00:00:39,210 --> 00:00:42,210 +corresponding to some +classes, but plain texts. + +16 +00:00:42,210 --> 00:00:45,810 +We will thus need to tokenize +them, like our inputs. + +17 +00:00:45,810 --> 00:00:47,370 +There is a trap there though, + +18 +00:00:47,370 --> 00:00:49,890 +as if you tokenize your +targets like your inputs, + +19 +00:00:49,890 --> 00:00:51,690 +you will hit a problem. + +20 +00:00:51,690 --> 00:00:54,090 +Even if you don't speak +French, you might notice + +21 +00:00:54,090 --> 00:00:57,270 +some weird things in the +tokenization of the targets. + +22 +00:00:57,270 --> 00:01:00,510 +Most of the words are +tokenized in several subtokens, + +23 +00:01:00,510 --> 00:01:03,180 +while fish, one of the only English word, + +24 +00:01:03,180 --> 00:01:05,670 +is tokenized as a single word. + +25 +00:01:05,670 --> 00:01:08,703 +That's because our inputs have +been tokenized as English. + +26 +00:01:09,660 --> 00:01:11,430 +Since our model knows two languages, + +27 +00:01:11,430 --> 00:01:13,800 +you have to warn it when +tokenizing the targets + +28 +00:01:13,800 --> 00:01:16,230 +so it switches in French mode. + +29 +00:01:16,230 --> 00:01:20,010 +This is done with the +as_target_tokenizer context manager. + +30 +00:01:20,010 --> 00:01:23,343 +You can see how it results in +a more compact tokenization. + +31 +00:01:24,810 --> 00:01:25,890 +Processing the whole dataset + +32 +00:01:25,890 --> 00:01:28,440 +is then super easy with the map function. 
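
A minimal sketch of the preprocessing described in these subtitles, assuming the KDE4 dataset mentioned here and an illustrative Marian English-to-French checkpoint; the checkpoint name and maximum lengths are assumptions:

from datasets import load_dataset
from transformers import AutoTokenizer

raw_datasets = load_dataset("kde4", lang1="en", lang2="fr")
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")

max_input_length, max_target_length = 128, 128

def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["fr"] for ex in examples["translation"]]
    # Tokenize the inputs in "English mode"
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # Switch the tokenizer to "French mode" for the targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)
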
+ +33 +00:01:28,440 --> 00:01:30,207 +You can pick different maximum lengths + +34 +00:01:30,207 --> 00:01:32,100 +for the inputs and targets, + +35 +00:01:32,100 --> 00:01:34,530 +and choose to pad at this +stage to that maximum length + +36 +00:01:34,530 --> 00:01:36,273 +by setting padding=max_length. + +37 +00:01:37,230 --> 00:01:39,300 +Here we'll show you to pad dynamically + +38 +00:01:39,300 --> 00:01:41,013 +as it requires one more step. + +39 +00:01:42,450 --> 00:01:43,470 +Your inputs and targets + +40 +00:01:43,470 --> 00:01:46,080 +are all sentences of various lengths. + +41 +00:01:46,080 --> 00:01:48,510 +We will pad the inputs +and targets separately, + +42 +00:01:48,510 --> 00:01:50,460 +as the maximum lengths +of the inputs and targets + +43 +00:01:50,460 --> 00:01:51,483 +might be different. + +44 +00:01:52,620 --> 00:01:54,540 +Then we pad the inputs with the pad token + +45 +00:01:54,540 --> 00:01:57,060 +and the targets with the -100 index + +46 +00:01:57,060 --> 00:01:58,890 +to make sure they're +not taken into account + +47 +00:01:58,890 --> 00:02:00,123 +in the loss computation. + +48 +00:02:01,320 --> 00:02:02,153 +Once this is done, + +49 +00:02:02,153 --> 00:02:04,340 +batching inputs and +targets become super easy. + +50 +00:02:05,670 --> 00:02:08,220 +The Transformers library +provides us with data collator + +51 +00:02:08,220 --> 00:02:10,500 +to do this all automatically. + +52 +00:02:10,500 --> 00:02:13,800 +You can then pass it to the +Trainer with your datasets + +53 +00:02:13,800 --> 00:02:15,960 +or use it in the to_tf_dataset method + +54 +00:02:15,960 --> 00:02:18,560 +before using model.fit() +on your (indistinct) model. + +55 +00:02:21,057 --> 00:02:23,724 +(air whooshing) + diff --git a/subtitles/en/60_what-is-the-bleu-metric.srt b/subtitles/en/60_what-is-the-bleu-metric.srt index ac7da0bd6..6231d1ca5 100644 --- a/subtitles/en/60_what-is-the-bleu-metric.srt +++ b/subtitles/en/60_what-is-the-bleu-metric.srt @@ -1,274 +1,540 @@ -1 -00:00:05,520 --> 00:00:12,080 -What is the BLEU metric? For many NLP tasks we  -can use common metrics like accuracy or F1 score,   - -2 -00:00:12,080 --> 00:00:15,280 -but what do you do when you want to measure  -the quality of text that's generated from a   - -3 -00:00:15,280 --> 00:00:19,680 -model like GPT-2? In this video, we'll take a look  -at a widely used metric for machine translation   - -4 -00:00:19,680 --> 00:00:22,960 -called BLEU, which is short for BiLingual  -Evaluation Understudy. The basic idea behind   - -5 -00:00:22,960 --> 00:00:27,280 -BLEU is to assign a single numerical score to  -a translation that tells us how "good" it is   - -6 -00:00:27,280 --> 00:00:32,080 -compared to one or more reference translations.  -In this example we have a sentence in Spanish that   - -7 -00:00:32,080 --> 00:00:37,280 -has been translated into English by some model.  -If we compare the generated translation to some   - -8 -00:00:37,280 --> 00:00:42,160 -reference human translations, we can see that the  -model is pretty good, but has made a common error:   - -9 -00:00:42,960 --> 00:00:48,000 -the Spanish word "tengo" means "have" in English  -and this 1-1 translation is not quite natural.   - -10 -00:00:49,680 --> 00:00:53,280 -So how can we measure the quality of a  -generated translation in an automatic way?   - -11 -00:00:54,080 --> 00:00:58,000 -The approach that BLEU takes is to compare the  -n-grams of the generated translation to the   - -12 -00:00:58,000 --> 00:01:03,760 -n-grams of the references. 
An n-gram is just  -a fancy way of saying "a chunk of n words",   - -13 -00:01:03,760 --> 00:01:07,280 -so let's start with unigrams, which correspond  -to the individual words in a sentence.   - -14 -00:01:08,720 --> 00:01:12,160 -In this example you can see that four of  -the words in the generated translation   - -15 -00:01:12,160 --> 00:01:18,000 -are also found in one of the reference  -translations. Now that we've found our matches,   - -16 -00:01:18,000 --> 00:01:21,920 -one way to assign a score to the translation  -is to compute the precision of the unigrams.   - -17 -00:01:22,880 --> 00:01:27,200 -This means we just count the number of matching  -words in the generated and reference translations   - -18 -00:01:27,200 --> 00:01:30,400 -and normalize the count by dividing by  -the number of word in the generation.   - -19 -00:01:31,600 --> 00:01:35,600 -In this example, we found 4 matching  -words and our generation has 5 words,   - -20 -00:01:36,960 --> 00:01:40,320 -so our unigram precision is 4/5 or 0.8. In general  -precision ranges from 0 to 1, and higher precision   - -21 -00:01:40,320 --> 00:01:48,160 -scores mean a better translation. One problem  -with unigram precision is that translation models   - -22 -00:01:48,160 --> 00:01:51,840 -sometimes get stuck in repetitive patterns  -and repeat the same word several times.   - -23 -00:01:52,960 --> 00:01:56,240 -If we just count the number of word matches,  -we can get really high precision scores   - -24 -00:01:56,240 --> 00:01:58,720 -even though the translation is  -terrible from a human perspective!   - -25 -00:01:59,840 --> 00:02:04,640 -For example, if our model just generates the word  -"six", we get a perfect unigram precision score.   - -26 -00:02:07,040 --> 00:02:12,000 -To handle this, BLEU uses a modified precision  -that clips the number of times to count a word,   - -27 -00:02:12,000 --> 00:02:14,960 -based on the maximum number of times it  -appears in the reference translation.   - -28 -00:02:16,160 --> 00:02:19,360 -In this example, the word "six"  -only appears once in the reference,   - -29 -00:02:19,360 --> 00:02:23,840 -so we clip the numerator to one and the modified  -unigram precision now gives a much lower score.   - -30 -00:02:27,440 --> 00:02:31,600 -Another problem with unigram precision is that  -it doesn't take into account the order of the   - -31 -00:02:31,600 --> 00:02:37,200 -words in the translations. For example, suppose  -we had Yoda translate our Spanish sentence,   - -32 -00:02:37,200 --> 00:02:43,120 -then we might get something backwards like  -"years six thirty have I". In this case,   - -33 -00:02:43,120 --> 00:02:46,560 -the modified unigram precision gives a  -high precision which is not what we want.   - -34 -00:02:48,240 --> 00:02:52,400 -So to deal with word ordering problems, BLEU  -actually computes the precision for several   - -35 -00:02:52,400 --> 00:02:57,360 -different n-grams and then averages the result.  -For example, if we compare 4-grams, then we can   - -36 -00:02:57,360 --> 00:03:03,840 -see there are no matching chunks of 4 words in  -translations and so the 4-gram precision is 0.   - -37 -00:03:05,440 --> 00:03:10,880 -To compute BLEU scores in Hugging Face Datasets is  -very simple: just use the load_metric() function,   - -38 -00:03:10,880 --> 00:03:13,840 -provide your model's predictions along  -with the references and you're good to go!   - -39 -00:03:16,240 --> 00:03:19,920 -The output contains several fields  -of interest. 
The precisions field   - -40 -00:03:19,920 --> 00:03:22,800 -contains all the individual  -precision scores for each n-gram.   - -41 -00:03:24,800 --> 00:03:30,320 -The BLEU score itself is then calculated by taking  -the geometric mean of the precision scores. By   - -42 -00:03:30,320 --> 00:03:34,880 -default, the mean of all four n-gram precisions is  -reported, a metric that is sometimes also called   - -43 -00:03:34,880 --> 00:03:40,480 -BLEU-4. In this example we can see the BLEU score  -is zero because the 4-gram precision was zero.   - -44 -00:03:43,440 --> 00:03:46,640 -The BLEU metric has some nice properties,  -but it is far from a perfect metric.   - -45 -00:03:47,280 --> 00:03:51,520 -The good properties are that it's easy to compute  -and widely used in research so you can compare   - -46 -00:03:51,520 --> 00:03:56,560 -your model against others on a benchmark. On the  -other hand, there are several problems with BLEU,   - -47 -00:03:56,560 --> 00:04:00,560 -including the fact it doesn't incorporate  -semantics and struggles on non-English languages.   - -48 -00:04:01,680 --> 00:04:04,560 -Another problem with BLEU is that it  -assumes the human translations have   - -49 -00:04:04,560 --> 00:04:08,400 -already been tokenized and this makes it hard  -to compare models with different tokenizers.   - -50 -00:04:11,200 --> 00:04:15,280 -Measuring the quality of texts is still a  -difficult, open problem in NLP research.   - -51 -00:04:15,280 --> 00:04:17,680 -For machine translation, the  -current recommendation is to   - -52 -00:04:17,680 --> 00:04:21,600 -use the SacreBLEU metric which addresses  -the tokenization limitations of BLEU.   - -53 -00:04:22,640 --> 00:04:26,560 -As you can see in this example, computing  -the SacreBLEU score is almost identical to   - -54 -00:04:26,560 --> 00:04:30,800 -the BLEU one. The main difference is that we  -now pass a list of texts instead of a list   - -55 -00:04:30,800 --> 00:04:41,200 -of words for the translations, and SacreBLEU  -takes care of the tokenization under the hood. +1 +00:00:00,147 --> 00:00:01,412 +(screen whooshing) + +2 +00:00:01,412 --> 00:00:02,698 +(sticker popping) + +3 +00:00:02,698 --> 00:00:05,670 +(screen whooshing) + +4 +00:00:05,670 --> 00:00:07,650 +- What is the BLEU metric? + +5 +00:00:07,650 --> 00:00:10,170 +For many NLP tasks we +can use common metrics + +6 +00:00:10,170 --> 00:00:12,810 +like accuracy or F1 +score, but what do you do + +7 +00:00:12,810 --> 00:00:14,340 +when you wanna measure the quality of text + +8 +00:00:14,340 --> 00:00:16,560 +that's been translated from a model? + +9 +00:00:16,560 --> 00:00:18,750 +In this video, we'll take a +look at a widely used metric + +10 +00:00:18,750 --> 00:00:20,613 +for machine translation called BLEU. + +11 +00:00:22,290 --> 00:00:23,940 +The basic idea behind BLEU is to assign + +12 +00:00:23,940 --> 00:00:26,250 +a single numerical score to a translation + +13 +00:00:26,250 --> 00:00:27,450 +that tells us how good it is + +14 +00:00:27,450 --> 00:00:30,199 +compared to one or more +reference translations. + +15 +00:00:30,199 --> 00:00:32,130 +In this example, we have +a sentence in Spanish + +16 +00:00:32,130 --> 00:00:35,340 +that has been translated +into English by some model. 
+ +17 +00:00:35,340 --> 00:00:37,170 +If we compare the generated translation + +18 +00:00:37,170 --> 00:00:39,150 +to some reference human translations, + +19 +00:00:39,150 --> 00:00:41,190 +we can see that the model +is actually pretty good, + +20 +00:00:41,190 --> 00:00:43,260 +but has made a common error. + +21 +00:00:43,260 --> 00:00:46,050 +The Spanish word tengo +means have in English, + +22 +00:00:46,050 --> 00:00:48,700 +and this one-to-one translation +is not quite natural. + +23 +00:00:49,890 --> 00:00:51,270 +So how can we measure the quality + +24 +00:00:51,270 --> 00:00:54,270 +of a generated translation +in some automatic way? + +25 +00:00:54,270 --> 00:00:56,730 +The approach that BLEU takes +is to compare the n-grams + +26 +00:00:56,730 --> 00:00:58,550 +of the generated +translation to the n-grams + +27 +00:00:58,550 --> 00:01:00,390 +in the references. + +28 +00:01:00,390 --> 00:01:02,400 +Now, an n-gram is just +a fancy way of saying + +29 +00:01:02,400 --> 00:01:03,960 +a chunk of n words. + +30 +00:01:03,960 --> 00:01:05,220 +So let's start with unigrams, + +31 +00:01:05,220 --> 00:01:08,020 +which corresponds to the +individual words in a sentence. + +32 +00:01:08,880 --> 00:01:11,250 +In this example, you can +see that four of the words + +33 +00:01:11,250 --> 00:01:13,140 +in the generated +translation are also found + +34 +00:01:13,140 --> 00:01:14,990 +in one of the reference translations. + +35 +00:01:16,350 --> 00:01:18,240 +And once we've found our matches, + +36 +00:01:18,240 --> 00:01:20,130 +one way to assign a +score to the translation + +37 +00:01:20,130 --> 00:01:23,070 +is to compute the +precision of the unigrams. + +38 +00:01:23,070 --> 00:01:25,200 +This means we just count +the number of matching words + +39 +00:01:25,200 --> 00:01:27,360 +in the generated and +reference translations + +40 +00:01:27,360 --> 00:01:29,660 +and normalize the count by +dividing by the number of words + +41 +00:01:29,660 --> 00:01:30,753 +in the generation. + +42 +00:01:31,800 --> 00:01:34,080 +In this example, we +found four matching words + +43 +00:01:34,080 --> 00:01:36,033 +and our generation has five words. + +44 +00:01:37,140 --> 00:01:39,690 +Now, in general, precision +ranges from zero to one, + +45 +00:01:39,690 --> 00:01:42,390 +and higher precision scores +mean a better translation. + +46 +00:01:44,160 --> 00:01:45,570 +But this isn't really the whole story + +47 +00:01:45,570 --> 00:01:47,310 +because one problem with unigram precision + +48 +00:01:47,310 --> 00:01:49,140 +is that translation +models sometimes get stuck + +49 +00:01:49,140 --> 00:01:51,330 +in repetitive patterns and +just repeat the same word + +50 +00:01:51,330 --> 00:01:52,293 +several times. + +51 +00:01:53,160 --> 00:01:54,690 +If we just count the +number of word matches, + +52 +00:01:54,690 --> 00:01:56,370 +we can get really high precision scores + +53 +00:01:56,370 --> 00:01:57,840 +even though the translation is terrible + +54 +00:01:57,840 --> 00:01:59,090 +from a human perspective! + +55 +00:02:00,000 --> 00:02:02,970 +For example, if our model +just generates the word six, + +56 +00:02:02,970 --> 00:02:05,020 +we get a perfect unigram precision score. + +57 +00:02:06,960 --> 00:02:09,930 +So to handle this, BLEU +uses a modified precision + +58 +00:02:09,930 --> 00:02:12,210 +that clips the number of +times to count a word, + +59 +00:02:12,210 --> 00:02:13,680 +based on the maximum number of times + +60 +00:02:13,680 --> 00:02:16,399 +it appears in the reference translation. 
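
A tiny worked sketch of the modified (clipped) unigram precision just described; the example strings are stand-ins for the translation shown in the video:

from collections import Counter

def modified_unigram_precision(generation, reference):
    gen_counts = Counter(generation.split())
    ref_counts = Counter(reference.split())
    # Clip each word's count at the number of times it appears in the reference
    clipped_matches = sum(
        min(count, ref_counts[word]) for word, count in gen_counts.items()
    )
    return clipped_matches / sum(gen_counts.values())

# A degenerate generation that just repeats one word from the reference:
# naive precision would be 5/5 = 1.0, but the clipped precision is 1/5 = 0.2
print(modified_unigram_precision("six six six six six", "I am thirty six years old"))
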
+ +61 +00:02:16,399 --> 00:02:18,630 +In this example, the word +six only appears once + +62 +00:02:18,630 --> 00:02:21,360 +in the reference, so we +clip the numerator to one + +63 +00:02:21,360 --> 00:02:22,710 +and the modified unigram precision + +64 +00:02:22,710 --> 00:02:25,233 +now gives a much lower score as expected. + +65 +00:02:27,660 --> 00:02:29,400 +Another problem with unigram precision + +66 +00:02:29,400 --> 00:02:30,780 +is that it doesn't take into account + +67 +00:02:30,780 --> 00:02:33,900 +the order in which the words +appear in the translations. + +68 +00:02:33,900 --> 00:02:35,700 +For example, suppose we had Yoda + +69 +00:02:35,700 --> 00:02:37,410 +translate our Spanish sentence, + +70 +00:02:37,410 --> 00:02:39,457 +then we might get +something backwards like, + +71 +00:02:39,457 --> 00:02:42,450 +"Years sixty thirty have I." + +72 +00:02:42,450 --> 00:02:44,670 +In this case, the +modified unigram precision + +73 +00:02:44,670 --> 00:02:47,393 +gives a high precision which +is not really what we want. + +74 +00:02:48,480 --> 00:02:50,460 +So to deal with word ordering problems, + +75 +00:02:50,460 --> 00:02:52,020 +BLEU actually computes the precision + +76 +00:02:52,020 --> 00:02:55,410 +for several different n-grams +and then averages the result. + +77 +00:02:55,410 --> 00:02:57,300 +For example, if we compare 4-grams, + +78 +00:02:57,300 --> 00:02:58,830 +we can see that there +are no matching chunks + +79 +00:02:58,830 --> 00:03:01,020 +of four words in the translations, + +80 +00:03:01,020 --> 00:03:02,913 +and so the 4-gram precision is 0. + +81 +00:03:05,460 --> 00:03:07,560 +Now, to compute BLEU +scores in Datasets library + +82 +00:03:07,560 --> 00:03:09,120 +is really very simple. + +83 +00:03:09,120 --> 00:03:11,100 +You just use the load_metric function, + +84 +00:03:11,100 --> 00:03:13,290 +provide your model's predictions +with their references + +85 +00:03:13,290 --> 00:03:14,390 +and you're good to go! + +86 +00:03:16,470 --> 00:03:19,200 +The output will contain +several fields of interest. + +87 +00:03:19,200 --> 00:03:20,490 +The precisions field contains + +88 +00:03:20,490 --> 00:03:23,133 +all the individual precision +scores for each n-gram. + +89 +00:03:25,050 --> 00:03:26,940 +The BLEU score itself is then calculated + +90 +00:03:26,940 --> 00:03:30,090 +by taking the geometric mean +of the precision scores. + +91 +00:03:30,090 --> 00:03:32,790 +And by default, the mean of +all four n-gram precisions + +92 +00:03:32,790 --> 00:03:35,793 +is reported, a metric that is +sometimes also called BLEU-4. + +93 +00:03:36,660 --> 00:03:38,880 +In this example, we can +see the BLEU score is zero + +94 +00:03:38,880 --> 00:03:40,780 +because the 4-gram precision was zero. + +95 +00:03:43,290 --> 00:03:45,390 +Now, the BLEU metric has +some nice properties, + +96 +00:03:45,390 --> 00:03:47,520 +but it is far from a perfect metric. + +97 +00:03:47,520 --> 00:03:49,440 +The good properties are +that it's easy to compute + +98 +00:03:49,440 --> 00:03:50,970 +and it's widely used in research + +99 +00:03:50,970 --> 00:03:52,620 +so you can compare your +model against others + +100 +00:03:52,620 --> 00:03:54,630 +on common benchmarks. + +101 +00:03:54,630 --> 00:03:56,670 +On the other hand, there are +several big problems with BLEU, + +102 +00:03:56,670 --> 00:03:58,830 +including the fact it +doesn't incorporate semantics + +103 +00:03:58,830 --> 00:04:01,920 +and it struggles a lot +on non-English languages. 
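
A minimal sketch of computing BLEU and SacreBLEU with datasets.load_metric as described in this video; the example prediction and references are placeholders:

from datasets import load_metric

generation = "I have thirty six years"
references = ["I am thirty-six years old", "I'm 36 years old"]

# BLEU expects pre-tokenized words: one list of tokens per prediction,
# and a list of tokenized reference translations for each prediction
bleu = load_metric("bleu")
results = bleu.compute(
    predictions=[generation.split()],
    references=[[ref.split() for ref in references]],
)
print(results["bleu"], results["precisions"])

# SacreBLEU works on raw strings and handles tokenization under the hood
sacrebleu = load_metric("sacrebleu")
results = sacrebleu.compute(predictions=[generation], references=[references])
print(results["score"])
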
+ +104 +00:04:01,920 --> 00:04:02,790 +Another problem with BLEU + +105 +00:04:02,790 --> 00:04:04,620 +is that it assumes the human translations + +106 +00:04:04,620 --> 00:04:05,820 +have already been tokenized + +107 +00:04:05,820 --> 00:04:07,320 +and this makes it hard to compare models + +108 +00:04:07,320 --> 00:04:08,820 +that use different tokenizers. + +109 +00:04:10,590 --> 00:04:12,570 +So as we've seen, measuring +the quality of texts + +110 +00:04:12,570 --> 00:04:15,570 +is still a difficult and +open problem in NLP research. + +111 +00:04:15,570 --> 00:04:17,580 +For machine translation, +the current recommendation + +112 +00:04:17,580 --> 00:04:19,440 +is to use the SacreBLEU metric, + +113 +00:04:19,440 --> 00:04:22,830 +which addresses the tokenization +limitations of BLEU. + +114 +00:04:22,830 --> 00:04:24,360 +As you can see in this example, + +115 +00:04:24,360 --> 00:04:26,580 +computing the SacreBLEU +score is almost identical + +116 +00:04:26,580 --> 00:04:28,020 +to the BLEU one. + +117 +00:04:28,020 --> 00:04:30,360 +The main difference is that +we now pass a list of texts + +118 +00:04:30,360 --> 00:04:32,640 +instead of a list of +words to the translations, + +119 +00:04:32,640 --> 00:04:35,640 +and SacreBLEU takes care of the +tokenization under the hood. + +120 +00:04:36,582 --> 00:04:39,499 +(screen whooshing) + diff --git a/subtitles/en/61_data-processing-for-summarization.srt b/subtitles/en/61_data-processing-for-summarization.srt index 49492b2c1..4ac57652f 100644 --- a/subtitles/en/61_data-processing-for-summarization.srt +++ b/subtitles/en/61_data-processing-for-summarization.srt @@ -1,104 +1,221 @@ -1 -00:00:05,360 --> 00:00:10,720 -Let's see how to preprocess a dataset  -for summarization. This is the task of   - -2 -00:00:10,720 --> 00:00:16,976 -well summarizing a long document. This video will  -focus on how to preprocess your dataset once you   - -3 -00:00:16,976 --> 00:00:21,840 -have managed to put it in the following format:  -one column for the long documents, and one for   - -4 -00:00:21,840 --> 00:00:27,360 -the summaries. Here is how we can achieve this  -with the Datasets library on the XSUM dataset.   - -5 -00:00:28,400 --> 00:00:32,400 -As long as you manage to have your data look like  -this, you should be able to follow the same steps.   - -6 -00:00:33,520 --> 00:00:37,280 -For once, our labels are not integers  -corresponding to some classes,   - -7 -00:00:37,280 --> 00:00:43,120 -but plain text. We will thus need to tokenize  -them, like our inputs. There is a small trap   - -8 -00:00:43,120 --> 00:00:47,760 -there though, as we need to tokenize our targets  -inside the as_target_tokenzier context manager.   - -9 -00:00:48,480 --> 00:00:53,200 -This is because the special tokens we add might be  -slightly different for the inputs and the targets,   - -10 -00:00:53,760 --> 00:00:58,320 -so the tokenizer has to know which one it  -is processing. Processing the whole dataset   - -11 -00:00:58,320 --> 00:01:03,520 -is then super easy with the map function. Since  -the summaries are usually much shorter than the   - -12 -00:01:03,520 --> 00:01:07,840 -documents, you should definitely pick different  -maximum lengths for the inputs and targets.   - -13 -00:01:08,640 --> 00:01:12,640 -You can choose to pad at this stage to that  -maximum length by setting padding=max_length.   - -14 -00:01:13,840 --> 00:01:17,360 -Here we will show you how to pad  -dynamically as it requires one more step.   
- -15 -00:01:18,640 --> 00:01:23,360 -Your inputs and targets are all sentence of  -various lengths. We will pad the inputs and   - -16 -00:01:23,360 --> 00:01:27,920 -targets separately as the maximum length of the  -inputs and targets are completely different.   - -17 -00:01:28,880 --> 00:01:32,320 -Then we pad the inputs to the  -maximum lengths among the inputs,   - -18 -00:01:32,320 --> 00:01:38,800 -and same for the targets. We pad the inputs with  -the pad token and the targets with the -100 index,   - -19 -00:01:38,800 --> 00:01:44,400 -to make sure they are not taken into account in  -the loss computation. The Transformers library   - -20 -00:01:44,400 --> 00:01:49,200 -provides us with a data collator to do this  -all automatically. You can then pass it to   - -21 -00:01:49,200 --> 00:01:55,440 -the Trainer with your datasets, or use it in the  -to_tf_dataset method before using model.fit(). +1 +00:00:00,227 --> 00:00:01,359 +(air whooshing) + +2 +00:00:01,359 --> 00:00:02,610 +(smiley clicking) + +3 +00:00:02,610 --> 00:00:05,550 +(air whooshing) + +4 +00:00:05,550 --> 00:00:08,450 +- Let's see how to preprocess +a dataset for summarization. + +5 +00:00:09,750 --> 00:00:13,083 +This is the task of, well, +summarizing a long document. + +6 +00:00:14,040 --> 00:00:16,830 +This video will focus on how +to preprocess your dataset + +7 +00:00:16,830 --> 00:00:19,680 +once you have managed to put +it in the following format: + +8 +00:00:19,680 --> 00:00:21,510 +one column for the long documents, + +9 +00:00:21,510 --> 00:00:23,610 +and one for the summaries. + +10 +00:00:23,610 --> 00:00:24,930 +Here is how we can achieve this + +11 +00:00:24,930 --> 00:00:27,573 +with the Datasets library +on the XSUM dataset. + +12 +00:00:28,650 --> 00:00:30,810 +As long as you manage to have +your data look like this, + +13 +00:00:30,810 --> 00:00:33,690 +you should be able to +follow the same steps. + +14 +00:00:33,690 --> 00:00:35,880 +For once, our labels are not integers + +15 +00:00:35,880 --> 00:00:39,150 +corresponding to some +classes, but plain text. + +16 +00:00:39,150 --> 00:00:42,480 +We will thus need to tokenize +them, like our inputs. + +17 +00:00:42,480 --> 00:00:43,920 +There is a small trap there though, + +18 +00:00:43,920 --> 00:00:45,360 +as we need to tokenize our targets + +19 +00:00:45,360 --> 00:00:48,690 +inside the as_target_tokenizer +context manager. + +20 +00:00:48,690 --> 00:00:51,030 +This is because the special tokens we add + +21 +00:00:51,030 --> 00:00:54,000 +might be slightly different +for the inputs and the target, + +22 +00:00:54,000 --> 00:00:57,300 +so the tokenizer has to know +which one it is processing. + +23 +00:00:57,300 --> 00:00:59,550 +Processing the whole +dataset is then super easy + +24 +00:00:59,550 --> 00:01:01,290 +with the map function. + +25 +00:01:01,290 --> 00:01:03,450 +Since the summaries are +usually much shorter + +26 +00:01:03,450 --> 00:01:05,400 +than the documents, you +should definitely pick + +27 +00:01:05,400 --> 00:01:08,880 +different maximum lengths +for the inputs and targets. + +28 +00:01:08,880 --> 00:01:11,730 +You can choose to pad at this +stage to that maximum length + +29 +00:01:11,730 --> 00:01:14,070 +by setting padding=max_length. + +30 +00:01:14,070 --> 00:01:16,170 +Here we'll show you +how to pad dynamically, + +31 +00:01:16,170 --> 00:01:17,620 +as it requires one more step. + +32 +00:01:18,840 --> 00:01:20,910 +Your inputs and targets are all sentences + +33 +00:01:20,910 --> 00:01:22,620 +of various lengths. 
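
A minimal sketch of the preprocessing and dynamic padding described here, assuming the XSUM dataset named in these subtitles and an illustrative T5 checkpoint; the checkpoint name and maximum lengths are assumptions:

from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq

raw_datasets = load_dataset("xsum")
checkpoint = "t5-small"  # illustrative checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Summaries are usually much shorter than the documents
max_input_length, max_target_length = 512, 64

def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["document"], max_length=max_input_length, truncation=True
    )
    # Tell the tokenizer it is now processing the targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["summary"], max_length=max_target_length, truncation=True
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

# Pads the inputs with the pad token and the labels with -100, batch by batch
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
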
+ +34 +00:01:22,620 --> 00:01:24,960 +We'll pad the inputs +and targets separately + +35 +00:01:24,960 --> 00:01:27,030 +as the maximum lengths +of the inputs and targets + +36 +00:01:27,030 --> 00:01:28,280 +are completely different. + +37 +00:01:29,130 --> 00:01:31,170 +Then, we pad the inputs +to the maximum lengths + +38 +00:01:31,170 --> 00:01:33,813 +among the inputs, and same for the target. + +39 +00:01:34,860 --> 00:01:36,630 +We pad the input with the pad token, + +40 +00:01:36,630 --> 00:01:39,000 +and the targets with the -100 index + +41 +00:01:39,000 --> 00:01:40,980 +to make sure they are +not taken into account + +42 +00:01:40,980 --> 00:01:42,180 +in the loss computation. + +43 +00:01:43,440 --> 00:01:45,180 +The Transformers library provide us + +44 +00:01:45,180 --> 00:01:48,510 +with a data collator to +do this all automatically. + +45 +00:01:48,510 --> 00:01:51,690 +You can then pass it to the +Trainer with your datasets, + +46 +00:01:51,690 --> 00:01:55,710 +or use it in the to_tf_dataset +method before using model.fit + +47 +00:01:55,710 --> 00:01:56,823 +on your current model. + +48 +00:01:58,339 --> 00:02:01,520 +(air whooshing) + +49 +00:02:01,520 --> 00:02:02,876 +(air whooshing) + diff --git a/subtitles/en/62_what-is-the-rouge-metric.srt b/subtitles/en/62_what-is-the-rouge-metric.srt index 4caff1f32..5450615b0 100644 --- a/subtitles/en/62_what-is-the-rouge-metric.srt +++ b/subtitles/en/62_what-is-the-rouge-metric.srt @@ -1,234 +1,455 @@ -1 -00:00:05,520 --> 00:00:12,080 -What is the ROUGE metric? For many NLP tasks we  -can use common metrics like accuracy or F1 score,   - -2 -00:00:12,080 --> 00:00:15,920 -but what do you do when you want to measure  -the quality of a summary from a model like T5?   - -3 -00:00:16,720 --> 00:00:20,265 -In this video, we'll take a look at a widely  -used metric for text summarization called ROUGE,   - -4 -00:00:20,265 --> 00:00:23,360 -which is short for Recall-Oriented Understudy for  -Gisting Evaluation. There are actually several   - -5 -00:00:23,360 --> 00:00:27,280 -variants of ROUGE, but the basic idea behind  -all of them is to assign a single numerical   - -6 -00:00:27,280 --> 00:00:31,360 -score to a summary that tells us how "good" it  -is compared to one or more reference summaries.   - -7 -00:00:32,320 --> 00:00:35,360 -In this example we have a book review  -that has been summarized by some model.   - -8 -00:00:36,400 --> 00:00:39,600 -If we compare the generated summary  -to some reference human summaries,   - -9 -00:00:39,600 --> 00:00:43,840 -we can see that the model is pretty  -good, and only differs by a word or two.   - -10 -00:00:44,800 --> 00:00:48,000 -So how can we measure the quality of a  -generated summary in an automatic way?   - -11 -00:00:48,800 --> 00:00:52,880 -The approach that ROUGE takes is to compare  -the n-grams of the generated summary to the   - -12 -00:00:52,880 --> 00:00:58,400 -n-grams of the references. An n-gram is just  -a fancy way of saying "a chunk of n words",   - -13 -00:00:58,400 --> 00:01:02,080 -so let's start with unigrams, which correspond  -to the individual words in a sentence.   - -14 -00:01:03,600 --> 00:01:07,760 -In this example you can see that six of the words  -in the generated summary are also found in one of   - -15 -00:01:07,760 --> 00:01:11,840 -the reference summaries. The ROUGE metric  -that compares unigrams is called ROUGE-1.   
- -16 -00:01:14,000 --> 00:01:18,000 -Now that we've found our matches, one way to  -assign a score to the summary is to compute the   - -17 -00:01:18,000 --> 00:01:22,880 -recall of the unigrams. This means we just count  -the number of matching words in the generated and   - -18 -00:01:22,880 --> 00:01:27,040 -reference summaries and normalize the count by  -dividing by the number of word in the reference.   - -19 -00:01:28,000 --> 00:01:31,920 -In this example, we found 6 matching  -words and our reference has 6 words,   - -20 -00:01:31,920 --> 00:01:36,240 -so our unigram recall is perfect! This  -means that all of words in the reference   - -21 -00:01:36,240 --> 00:01:42,320 -summary have produced in the generated one.  -Perfect recall sounds great, but imagine if   - -22 -00:01:42,320 --> 00:01:47,120 -our generated summary had been “I really really  -really really loved reading the Hunger Games”.   - -23 -00:01:47,920 --> 00:01:52,240 -This would also have perfect recall, but is  -arguably a worse summary since it is verbose.   - -24 -00:01:53,280 --> 00:01:57,840 -To deal with these scenarios we can also compute  -precision, which in the ROUGE context measures   - -25 -00:01:57,840 --> 00:02:01,200 -how much of the generated summary was relevant. In  -this example, the precision is 6/7. In practice,   - -26 -00:02:01,200 --> 00:02:05,200 -both precision and recall are usually  -computed and then the F1-score is reported.   - -27 -00:02:07,360 --> 00:02:12,000 -We can change the granularity of the comparison  -by comparing bigrams instead of unigrams.   - -28 -00:02:12,800 --> 00:02:17,760 -With bigrams we chunk the sentence into pairs of  -consecutive words and then count how many pairs in   - -29 -00:02:17,760 --> 00:02:23,600 -the generated summary are present in the reference  -one. This gives us ROUGE-2 precision and recall,   - -30 -00:02:23,600 --> 00:02:28,800 -which we can see is lower than the ROUGE-1 scores  -we saw earlier. Note that if the summaries are   - -31 -00:02:28,800 --> 00:02:34,560 -long, the ROUGE-2 score will be small as there  -are typically fewer bigrams to match. This is   - -32 -00:02:34,560 --> 00:02:39,680 -also true for abstractive summarization, so both  -ROUGE-1 and ROUGE-2 scores are usually reported.   - -33 -00:02:41,760 --> 00:02:46,880 -The last ROUGE variant we'll discuss is  -ROUGE-L. ROUGE-L doesn't compare n-grams,   - -34 -00:02:46,880 --> 00:02:51,360 -but instead treats each summary as a sequence  -of words and then looks for the longest common   - -35 -00:02:51,360 --> 00:02:57,280 -subsequence or LCS. A subsequence is a sequence  -that appears in the same relative order,   - -36 -00:02:57,280 --> 00:03:03,280 -but not necessarily contiguous. So in this  -example, "I loved reading the Hunger Games" is the   - -37 -00:03:03,280 --> 00:03:11,120 -longest common subsequence. The main advantage of  -ROUGE-L over ROUGE-1 or ROUGE-2 is that is doesn't   - -38 -00:03:11,120 --> 00:03:18,400 -depend on consecutive n-gram matches, so it tends  -to capture sentence structure more accurately. To   - -39 -00:03:18,400 --> 00:03:23,200 -compute ROUGE scores in Hugging Face Datasets is  -very simple: just use the load_metric() function,   - -40 -00:03:23,760 --> 00:03:26,960 -provide your model's summaries along with  -the references and you're good to go!   - -41 -00:03:28,560 --> 00:03:32,480 -The output from the calculation contains  -a lot of information! 
The first thing we   - -42 -00:03:32,480 --> 00:03:36,880 -can see here is that the confidence intervals  -of each ROUGE score are provided in the low,   - -43 -00:03:36,880 --> 00:03:41,680 -mid, and high fields. This is really useful if you  -want to know the spread of your ROUGE scores when   - -44 -00:03:41,680 --> 00:03:48,080 -comparing two or more models. The second thing to  -notice is that we have four types of ROUGE score.   - -45 -00:03:48,080 --> 00:03:53,840 -We've already seen ROUGE-1, ROUGE-2 and  -ROUGE-L, so what is ROUGE-LSUM? Well,   - -46 -00:03:53,840 --> 00:03:58,800 -the “sum” in ROUGE-LSUM refers to the fact that  -this metric is computed over a whole summary,   - -47 -00:03:58,800 --> 00:04:08,480 -while ROUGE-L is computed as the  -average over individual sentences. +1 +00:00:00,624 --> 00:00:03,374 +(logo whooshing) + +2 +00:00:05,700 --> 00:00:07,740 +- What is the ROUGE metric? + +3 +00:00:07,740 --> 00:00:08,880 +For many NLP tasks + +4 +00:00:08,880 --> 00:00:12,270 +we can use common metrics +like accuracy or F1 score. + +5 +00:00:12,270 --> 00:00:13,650 +But what do you do when +you wanna measure something + +6 +00:00:13,650 --> 00:00:16,920 +like the quality of a +summary from a model like T5? + +7 +00:00:16,920 --> 00:00:18,180 +In this video, we'll take a look + +8 +00:00:18,180 --> 00:00:21,180 +at a widely used metric for +tech summarization called ROUGE. + +9 +00:00:22,740 --> 00:00:24,660 +There are actually +several variants of ROUGE + +10 +00:00:24,660 --> 00:00:26,190 +but the basic idea behind all of them + +11 +00:00:26,190 --> 00:00:27,840 +is to assign a single numerical score + +12 +00:00:27,840 --> 00:00:30,000 +to a summary that tells us how good it is + +13 +00:00:30,000 --> 00:00:32,774 +compared to one or more +reference summaries. + +14 +00:00:32,774 --> 00:00:34,020 +In this example, we have a book review + +15 +00:00:34,020 --> 00:00:36,570 +that has been summarized by some model. + +16 +00:00:36,570 --> 00:00:38,320 +If we compare the generated summary + +17 +00:00:39,168 --> 00:00:40,260 +to some reference human +summaries, we can see + +18 +00:00:40,260 --> 00:00:42,841 +that the model is actually pretty good + +19 +00:00:42,841 --> 00:00:44,063 +and only differs by a word or two. + +20 +00:00:45,060 --> 00:00:46,260 +So how can we measure the quality + +21 +00:00:46,260 --> 00:00:49,050 +of a generated summary +in an automatic way? + +22 +00:00:49,050 --> 00:00:51,510 +The approach that ROUGE takes +is to compare the n-grams + +23 +00:00:51,510 --> 00:00:55,200 +of the generated summary to +the n-grams of the references. + +24 +00:00:55,200 --> 00:00:58,590 +And n-gram is just a fancy way +of saying a chunk of N words. + +25 +00:00:58,590 --> 00:01:00,030 +So let's start with unigrams + +26 +00:01:00,030 --> 00:01:02,780 +which correspond to the +individual words in a sentence. + +27 +00:01:03,780 --> 00:01:05,250 +In this example, you can see that six + +28 +00:01:05,250 --> 00:01:07,650 +of the words in the generated +summary are also found + +29 +00:01:07,650 --> 00:01:09,420 +in one of the reference summaries. + +30 +00:01:09,420 --> 00:01:11,310 +And the rouge metric +that compares unigrams + +31 +00:01:11,310 --> 00:01:12,260 +is called ROUGE-1. + +32 +00:01:14,533 --> 00:01:16,770 +Now that we found our matches, +one way to assign a score + +33 +00:01:16,770 --> 00:01:20,280 +to the summary is to compute +the recall of the unigrams. 
+ +34 +00:01:20,280 --> 00:01:21,540 +This means we just count the number + +35 +00:01:21,540 --> 00:01:22,950 +of matching words in the generated + +36 +00:01:22,950 --> 00:01:25,290 +and reference summaries +and normalize the count + +37 +00:01:25,290 --> 00:01:28,200 +by dividing by the number +of words in the reference. + +38 +00:01:28,200 --> 00:01:30,450 +In this example, we +found six matching words + +39 +00:01:30,450 --> 00:01:32,160 +and our reference has six words. + +40 +00:01:32,160 --> 00:01:33,933 +So our unigram recall is perfect. + +41 +00:01:34,800 --> 00:01:35,810 +This means that all of the words + +42 +00:01:35,810 --> 00:01:37,500 +in the reference summary +have been produced + +43 +00:01:37,500 --> 00:01:38,550 +in the generated one. + +44 +00:01:40,050 --> 00:01:42,360 +Now, perfect recall +sounds great, but imagine + +45 +00:01:42,360 --> 00:01:44,520 +if our generated summary +have been something like + +46 +00:01:44,520 --> 00:01:45,720 +I really, really, really, + +47 +00:01:45,720 --> 00:01:48,150 +really loved reading the Hunger Games. + +48 +00:01:48,150 --> 00:01:49,378 +This would also have perfect recall + +49 +00:01:49,378 --> 00:01:51,330 +but is arguably a worse summary, + +50 +00:01:51,330 --> 00:01:52,653 +since it is verbose. + +51 +00:01:53,550 --> 00:01:54,600 +To deal with these scenarios, + +52 +00:01:54,600 --> 00:01:56,190 +we can also compute precision, + +53 +00:01:56,190 --> 00:01:58,380 +which in the ROUGE +context measures how much + +54 +00:01:58,380 --> 00:02:00,810 +of the generator summary was relevant. + +55 +00:02:00,810 --> 00:02:03,630 +In practice, both precision +and recall are usually computed + +56 +00:02:03,630 --> 00:02:05,493 +and then the F1 score is reported. + +57 +00:02:07,170 --> 00:02:08,542 +Now we can change the granularity + +58 +00:02:08,542 --> 00:02:13,020 +of the comparison by comparing +bigrams instead of unigrams. + +59 +00:02:13,020 --> 00:02:15,090 +With bigrams, we chunk +the sentence into pairs + +60 +00:02:15,090 --> 00:02:17,910 +of consecutive words and +then count how many pairs + +61 +00:02:17,910 --> 00:02:21,360 +in the generated summary are +present in the reference one. + +62 +00:02:21,360 --> 00:02:23,880 +This gives us ROUGE-2 precision and recall + +63 +00:02:23,880 --> 00:02:24,780 +which as we can see, + +64 +00:02:24,780 --> 00:02:27,780 +is lower than the ROUGE-1 +scores from earlier. + +65 +00:02:27,780 --> 00:02:29,400 +Now, if the summaries are long, + +66 +00:02:29,400 --> 00:02:31,740 +the ROUGE-2 scores will generally be small + +67 +00:02:31,740 --> 00:02:34,290 +because there are fewer bios to match. + +68 +00:02:34,290 --> 00:02:36,870 +And this is also true for +abstracter summarization. + +69 +00:02:36,870 --> 00:02:39,993 +So both ROUGE-1 and ROUGE-2 +scores are usually reported. + +70 +00:02:42,000 --> 00:02:45,330 +The last ROUGE variant we +will discuss is ROUGE L. + +71 +00:02:45,330 --> 00:02:47,160 +ROUGE L doesn't compare ngrams + +72 +00:02:47,160 --> 00:02:49,572 +but instead treats each +summary as a sequence of words + +73 +00:02:49,572 --> 00:02:53,403 +and then looks for the longest +common subsequence or LCS. + +74 +00:02:54,775 --> 00:02:56,130 +A subsequence is a sequence that appears + +75 +00:02:56,130 --> 00:02:59,760 +in the same relative order, +but not necessarily contiguous. 
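
A small worked sketch of ROUGE-1 recall and precision as just described; the summaries below are illustrative stand-ins for the book-review example in the video:

from collections import Counter

def rouge1_recall_precision(generated, reference):
    gen_counts = Counter(generated.split())
    ref_counts = Counter(reference.split())
    # Overlapping unigrams, clipped by how often each word appears in both
    overlap = sum(min(count, gen_counts[word]) for word, count in ref_counts.items())
    recall = overlap / sum(ref_counts.values())
    precision = overlap / sum(gen_counts.values())
    return recall, precision

reference = "I loved reading the Hunger Games"

# Identical summary: perfect recall and perfect precision, (1.0, 1.0)
print(rouge1_recall_precision("I loved reading the Hunger Games", reference))

# Verbose summary: still perfect recall, but precision drops to 6/10
print(rouge1_recall_precision(
    "I really really really really loved reading the Hunger Games", reference
))
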
+ +76 +00:02:59,760 --> 00:03:03,210 +So in this example, I loved +reading the Hunger Games, + +77 +00:03:03,210 --> 00:03:06,930 +is the longest common subsequence +between the two summaries. + +78 +00:03:06,930 --> 00:03:08,610 +And the main advantage of ROUGE L + +79 +00:03:08,610 --> 00:03:11,670 +over ROUGE-1 or ROUGE-2 +is that it doesn't depend + +80 +00:03:11,670 --> 00:03:14,100 +on consecutive n-gram +matches, and so it tends + +81 +00:03:14,100 --> 00:03:16,650 +to capture sentence structure +much more accurately. + +82 +00:03:18,150 --> 00:03:19,440 +Now to compute ROUGE scores + +83 +00:03:19,440 --> 00:03:21,660 +in the data sets library is very simple. + +84 +00:03:21,660 --> 00:03:23,910 +You just use the load metric function, + +85 +00:03:23,910 --> 00:03:26,400 +provide your model summaries +along with the references + +86 +00:03:26,400 --> 00:03:27,500 +and you're good to go. + +87 +00:03:28,770 --> 00:03:30,120 +The output from the calculation + +88 +00:03:30,120 --> 00:03:31,507 +contains a lot of information. + +89 +00:03:31,507 --> 00:03:34,560 +The first thing we can see is +that the confidence intervals + +90 +00:03:34,560 --> 00:03:36,090 +of each ROUGE score are provided + +91 +00:03:36,090 --> 00:03:39,030 +in the low, mid and high fields. + +92 +00:03:39,030 --> 00:03:40,980 +This is really useful if +you wanna know the spread + +93 +00:03:40,980 --> 00:03:43,730 +of your ROUGE scores when +comparing two or more models. + +94 +00:03:45,090 --> 00:03:46,050 +The second thing to notice + +95 +00:03:46,050 --> 00:03:48,330 +is that we have four types of ROUGE score. + +96 +00:03:48,330 --> 00:03:51,480 +We've already seen ROUGE-1, +ROUGE-2 and ROUGE-L + +97 +00:03:51,480 --> 00:03:53,760 +So what is ROUGE-L sum? + +98 +00:03:53,760 --> 00:03:55,410 +Well, the sum in ROUGEL's sum + +99 +00:03:55,410 --> 00:03:57,630 +refers to the fact that +this metric is computed + +100 +00:03:57,630 --> 00:04:00,240 +over a whole summary +while ROUGE-L is computed + +101 +00:04:00,240 --> 00:04:02,493 +as the average of individual sentences. + +102 +00:04:04,166 --> 00:04:06,916 +(logo whooshing) + diff --git a/subtitles/en/63_data-processing-for-causal-language-modeling.srt b/subtitles/en/63_data-processing-for-causal-language-modeling.srt index 967ddc207..d5d544dee 100644 --- a/subtitles/en/63_data-processing-for-causal-language-modeling.srt +++ b/subtitles/en/63_data-processing-for-causal-language-modeling.srt @@ -1,214 +1,415 @@ -1 -00:00:05,520 --> 00:00:09,360 -In this video we take a look at the  -data processing necessary to train   - -2 -00:00:09,360 --> 00:00:15,920 -causal language models. Causal Language Modeling  -is the task of predicting the next token based   - -3 -00:00:15,920 --> 00:00:20,880 -on the previous token. Another term for Causal  -Language Modeling is Autoregressive Modeling.   - -4 -00:00:21,760 --> 00:00:26,560 -In the example that you see here the  -next token could for example be NLP   - -5 -00:00:26,560 --> 00:00:33,280 -or machine learning. A popular example of a  -Causal Language Model is the GPT family of models.   - -6 -00:00:35,680 --> 00:00:40,400 -To train such models such as GPT-2 we usually  -start with a large corpus of text files.   - -7 -00:00:41,280 --> 00:00:45,760 -These files can webpages scraped from the  -internet such as the Common Crawl dataset   - -8 -00:00:45,760 --> 00:00:51,920 -or they can be Python files from GitHub like you  -can see here. 
As a first step we need to tokenize   - -9 -00:00:51,920 --> 00:00:57,520 -these files such that we can feed them through a  -model. Here we show the tokenized texts as bars of   - -10 -00:00:57,520 --> 00:01:06,000 -various length illustrating the different sequence  -lengths. Normally, the text files come in various   - -11 -00:01:06,000 --> 00:01:07,440 -sizes and which results in various sequence length  -of the tokenized texts. Transformer models have a   - -12 -00:01:07,440 --> 00:01:12,960 -limited context length and depending on the data  -source it is possible that the tokenized texts   - -13 -00:01:12,960 --> 00:01:18,640 -are much longer than this context length. In  -this case we could just truncate the sequence   - -14 -00:01:18,640 --> 00:01:24,160 -to the context length but this would mean that  -we loose everything after the context length.   - -15 -00:01:25,360 --> 00:01:30,960 -Using the return overflowing tokens flag in the  -we can use the tokenizer to create chunks with   - -16 -00:01:30,960 --> 00:01:36,960 -each one being the size of the context length.  -Sometimes it can happen that the last chunk is   - -17 -00:01:36,960 --> 00:01:41,440 -too short if there aren’t enough tokens to fill  -it. In this case we would like to remove it.   - -18 -00:01:43,440 --> 00:01:48,800 -With the return_length keyword we also get  -the length of each chunk from the tokenizer.   - -19 -00:01:51,760 --> 00:01:57,280 -This function shows all the steps necessary  -to prepare the dataset. First we tokenize the   - -20 -00:01:57,280 --> 00:02:03,520 -dataset with the flags I just mentioned. Then we  -go through each chunk and if its length matches   - -21 -00:02:03,520 --> 00:02:08,960 -the context length we add it to the inputs we  -return. We can apply this function to the whole   - -22 -00:02:08,960 --> 00:02:17,520 -dataset and we make sure to use batches and remove  -the existing columns. We need to remove columns   - -23 -00:02:17,520 --> 00:02:23,280 -because we can create multiple samples per text  -and the shapes in the dataset would not match.   - -24 -00:02:26,960 --> 00:02:32,400 -If the context length is of similar length as  -the files this approach doesn't so well anymore.   - -25 -00:02:33,520 --> 00:02:39,440 -In this example both sample 1 and 2 are shorter  -than the context size and would be discarded with   - -26 -00:02:39,440 --> 00:02:46,400 -the previous approach. In this case it is better  -to first tokenize each sample without truncation   - -27 -00:02:46,400 --> 00:02:52,000 -and then concatenate the tokenized samples with an  -end of string, or EOS for short, token in between.   - -28 -00:02:53,840 --> 00:02:57,440 -Finally we can chunk this long  -sequence with the context length   - -29 -00:02:57,440 --> 00:03:05,840 -and we don’t loose any sequences because they  -are too short. So far we have only talked about   - -30 -00:03:05,840 --> 00:03:10,720 -the inputs for causal language modeling but  -not the labels needed for supervised training.   - -31 -00:03:11,600 --> 00:03:16,480 -When we do causal language modeling we don’t  -require any extra labels for the input sequences   - -32 -00:03:16,480 --> 00:03:22,080 -as the input sequences themselves are  -the labels. In this example when we feed   - -33 -00:03:22,080 --> 00:03:26,560 -the token “Trans” to the next token we  -want the model to predict is “formers”.   - -34 -00:03:27,280 --> 00:03:33,360 -In the next step we feed “Trans” and “formers”  -to the model and the label is the token “are”.   
- -35 -00:03:35,280 --> 00:03:42,400 -This pattern continues and as you can see the  -input sequence is the label just shifted by one.   - -36 -00:03:43,440 --> 00:03:48,000 -Since the model only makes a prediction  -after the first token, the first element   - -37 -00:03:48,000 --> 00:03:54,480 -of the input sequence, in this case “Trans”,  -is not used as a label. Similarly, we do not   - -38 -00:03:54,480 --> 00:04:00,400 -have a label for the last token in the sequence  -since there is no token after the sequence ends.   - -39 -00:04:03,920 --> 00:04:09,200 -Let’s have a look at what we need to do to create  -the labels for causal language modeling in code.If   - -40 -00:04:10,160 --> 00:04:15,600 -we want to calculate the loss on a batch we can  -just pass the input_ids as labels and all the   - -41 -00:04:15,600 --> 00:04:19,432 -shifting is handled in the model internally. And  -the dataset is also ready to be used directly in   - -42 -00:04:19,432 --> 00:04:21,600 -the Trainer or keras.fit if you are using  -TensorFlow. So you see there is no magic   - -43 -00:04:21,600 --> 00:04:27,840 -involved in processing data for causal language  -modeling and only requires a few simple steps! +1 +00:00:00,000 --> 00:00:02,917 +(transition music) + +2 +00:00:05,364 --> 00:00:08,310 +- In this video, we take a +look at the data processing + +3 +00:00:08,310 --> 00:00:10,803 +necessary to train causal language models. + +4 +00:00:12,690 --> 00:00:14,400 +Causal language modeling is the task + +5 +00:00:14,400 --> 00:00:17,820 +of predicting the next token +based on the previous ones. + +6 +00:00:17,820 --> 00:00:19,680 +Another term for causal language modeling + +7 +00:00:19,680 --> 00:00:21,000 +is autoregressive modeling. + +8 +00:00:21,000 --> 00:00:23,940 +In the example that you can see here, + +9 +00:00:23,940 --> 00:00:25,560 +the next token could, for example, + +10 +00:00:25,560 --> 00:00:28,263 +be NLP or it could be machine learning. + +11 +00:00:29,460 --> 00:00:31,457 +A popular example of +causal language models + +12 +00:00:31,457 --> 00:00:33,693 +is the GPT family of models. + +13 +00:00:35,561 --> 00:00:38,010 +To train models such as GPT, + +14 +00:00:38,010 --> 00:00:41,460 +we usually start with a +large corpus of text files. + +15 +00:00:41,460 --> 00:00:43,890 +These files can be webpages +scraped from the internet + +16 +00:00:43,890 --> 00:00:46,020 +such as the Common Crawl dataset + +17 +00:00:46,020 --> 00:00:47,940 +or they can be Python files from GitHub, + +18 +00:00:47,940 --> 00:00:49,490 +like the ones you can see here. + +19 +00:00:50,400 --> 00:00:52,680 +As a first step, we need +to tokenize these files + +20 +00:00:52,680 --> 00:00:55,380 +such that we can feed +them through the model. + +21 +00:00:55,380 --> 00:00:58,500 +Here, we show the tokenized +texts as bars of various length, + +22 +00:00:58,500 --> 00:01:02,188 +illustrating that they're +shorter and longer ones. + +23 +00:01:02,188 --> 00:01:05,910 +This is very common +when working with text. + +24 +00:01:05,910 --> 00:01:09,270 +However, transform models +have a limited context window + +25 +00:01:09,270 --> 00:01:10,770 +and depending on the data source, + +26 +00:01:10,770 --> 00:01:13,140 +it is possible that the tokenized texts + +27 +00:01:13,140 --> 00:01:15,183 +are much longer than this window. 
+ +28 +00:01:16,080 --> 00:01:18,870 +In this case, we could +just truncate the sequences + +29 +00:01:18,870 --> 00:01:20,182 +to the context length, + +30 +00:01:20,182 --> 00:01:22,650 +but this would mean +that we lose everything + +31 +00:01:22,650 --> 00:01:24,513 +after the first context window. + +32 +00:01:25,500 --> 00:01:28,410 +Using the return overflowing token flag, + +33 +00:01:28,410 --> 00:01:30,960 +we can use the tokenizer to create chunks + +34 +00:01:30,960 --> 00:01:33,510 +with each one being the +size of the context length. + +35 +00:01:34,860 --> 00:01:36,180 +Sometimes, it can still happen + +36 +00:01:36,180 --> 00:01:37,590 +that the last chunk is too short + +37 +00:01:37,590 --> 00:01:39,900 +if there aren't enough tokens to fill it. + +38 +00:01:39,900 --> 00:01:41,793 +In this case, we can just remove it. + +39 +00:01:42,990 --> 00:01:45,960 +With the return_length keyword, + +40 +00:01:45,960 --> 00:01:49,173 +we also get the length of +each chunk from the tokenizer. + +41 +00:01:51,960 --> 00:01:53,640 +This function shows all the steps + +42 +00:01:53,640 --> 00:01:56,280 +necessary to prepare the dataset. + +43 +00:01:56,280 --> 00:01:57,960 +First, we tokenize the dataset + +44 +00:01:57,960 --> 00:02:00,330 +with the flags I just mentioned. + +45 +00:02:00,330 --> 00:02:02,190 +Then, we go through each chunk + +46 +00:02:02,190 --> 00:02:04,680 +and if it's length matches +the context length, + +47 +00:02:04,680 --> 00:02:06,663 +we add it to the inputs we return. + +48 +00:02:07,590 --> 00:02:10,260 +We can apply this function +to the whole dataset. + +49 +00:02:10,260 --> 00:02:11,700 +In addition, we make sure + +50 +00:02:11,700 --> 00:02:15,450 +that to use batches and +remove the existing columns. + +51 +00:02:15,450 --> 00:02:17,670 +We need to remove the existing columns, + +52 +00:02:17,670 --> 00:02:21,330 +because we can create +multiple samples per text, + +53 +00:02:21,330 --> 00:02:22,890 +and the shapes in the dataset + +54 +00:02:22,890 --> 00:02:24,753 +would not match anymore in that case. + +55 +00:02:26,832 --> 00:02:30,330 +If the context length is of +similar lengths as the files, + +56 +00:02:30,330 --> 00:02:32,733 +this approach doesn't +work so well anymore. + +57 +00:02:33,660 --> 00:02:36,420 +In this example, both sample 1 and 2 + +58 +00:02:36,420 --> 00:02:38,400 +are shorter than the context size + +59 +00:02:38,400 --> 00:02:41,610 +and will be discarded with +the previous approach. + +60 +00:02:41,610 --> 00:02:45,150 +In this case, it is better +to first tokenize each sample + +61 +00:02:45,150 --> 00:02:46,590 +without truncation + +62 +00:02:46,590 --> 00:02:49,290 +and then concatenate the tokenized samples + +63 +00:02:49,290 --> 00:02:52,353 +with an end of string +or EOS token in between. + +64 +00:02:53,546 --> 00:02:56,220 +Finally, we can chunk this long sequence + +65 +00:02:56,220 --> 00:02:59,490 +with the context length and we +don't lose too many sequences + +66 +00:02:59,490 --> 00:03:01,263 +because they're too short anymore. + +67 +00:03:04,170 --> 00:03:05,760 +So far, we have only talked + +68 +00:03:05,760 --> 00:03:08,370 +about the inputs for +causal language modeling, + +69 +00:03:08,370 --> 00:03:11,850 +but not the labels needed +for supervised training. 
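
A minimal sketch of the tokenize-and-chunk step described above; the toy in-memory dataset, checkpoint, and context length are placeholders for a real corpus of text or code files:

from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
context_length = 128

# Toy stand-in for a large corpus with a text column named "content"
raw_datasets = Dataset.from_dict(
    {"content": ["import numpy as np\n" * 200, "import pandas as pd\n" * 50]}
)

def tokenize(examples):
    outputs = tokenizer(
        examples["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,  # keep every chunk, not just the first one
        return_length=True,              # also report the length of each chunk
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:     # drop the last chunk if it is too short
            input_batch.append(input_ids)
    return {"input_ids": input_batch}

tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets.column_names
)
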
+ +70 +00:03:11,850 --> 00:03:13,380 +When we do causal language modeling, + +71 +00:03:13,380 --> 00:03:16,710 +we don't require any extra +labels for the input sequences + +72 +00:03:16,710 --> 00:03:20,610 +as the input sequences +themselves are the labels. + +73 +00:03:20,610 --> 00:03:24,240 +In this example, when we feed +the token trans to the model, + +74 +00:03:24,240 --> 00:03:27,510 +the next token we wanted +to predict is formers. + +75 +00:03:27,510 --> 00:03:30,780 +In the next step, we feed +trans and formers to the model + +76 +00:03:30,780 --> 00:03:33,903 +and the label we wanted to predict is are. + +77 +00:03:35,460 --> 00:03:38,130 +This pattern continues, +and as you can see, + +78 +00:03:38,130 --> 00:03:41,220 +the input sequence is the label sequence + +79 +00:03:41,220 --> 00:03:42,663 +just shifted by one. + +80 +00:03:43,590 --> 00:03:47,310 +Since the model only makes +prediction after the first token, + +81 +00:03:47,310 --> 00:03:49,350 +the first element of the input sequence, + +82 +00:03:49,350 --> 00:03:52,980 +in this case, trans, +is not used as a label. + +83 +00:03:52,980 --> 00:03:55,530 +Similarly, we don't have a label + +84 +00:03:55,530 --> 00:03:57,600 +for the last token in the sequence + +85 +00:03:57,600 --> 00:04:00,843 +since there is no token +after the sequence ends. + +86 +00:04:04,110 --> 00:04:06,300 +Let's have a look at what we need to do + +87 +00:04:06,300 --> 00:04:10,200 +to create the labels for causal +language modeling in code. + +88 +00:04:10,200 --> 00:04:12,360 +If we want to calculate a loss on a batch, + +89 +00:04:12,360 --> 00:04:15,120 +we can just pass the input_ids as labels + +90 +00:04:15,120 --> 00:04:18,933 +and all the shifting is handled +in the model internally. + +91 +00:04:20,032 --> 00:04:22,170 +So, you see, there's no matching involved + +92 +00:04:22,170 --> 00:04:24,870 +in processing data for +causal language modeling, + +93 +00:04:24,870 --> 00:04:27,723 +and it only requires a few simple steps. + +94 +00:04:28,854 --> 00:04:31,771 +(transition music) + diff --git a/subtitles/en/64_using-a-custom-loss-function.srt b/subtitles/en/64_using-a-custom-loss-function.srt index 71fa3c485..bd75982b9 100644 --- a/subtitles/en/64_using-a-custom-loss-function.srt +++ b/subtitles/en/64_using-a-custom-loss-function.srt @@ -1,169 +1,325 @@ -1 -00:00:05,440 --> 00:00:09,040 -In this video we take a look at setting  -up a custom loss function for training.   - -2 -00:00:10,800 --> 00:00:14,800 -In the default loss functions all  -samples such as these code snippets   - -3 -00:00:14,800 --> 00:00:19,040 -are treated the same irrespective of their  -content, but there are scenarios where you it   - -4 -00:00:19,040 --> 00:00:22,880 -could make sense to weight the samples  -differently. If for example one sample   - -5 -00:00:22,880 --> 00:00:28,800 -contains a lot of tokens that or of interest to  -us or it has a favourable diversity of tokens.   - -6 -00:00:29,680 --> 00:00:33,520 -We can also think of other heuristics we can  -implement with pattern matching or other rules.   - -7 -00:00:36,080 --> 00:00:40,400 -For each sample we get a loss value during  -training and we can combine that loss with   - -8 -00:00:40,400 --> 00:00:47,200 -a weight. Then we can for example create a  -weighted sum to get the final loss for a batch.   
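
A minimal sketch of passing the input_ids as labels so the shifting is handled inside the model; the checkpoint and example sentence are illustrative:

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

batch = tokenizer(["Transformers are awesome!"], return_tensors="pt")

# The labels are simply the input_ids; the one-position shift is done internally
outputs = model(input_ids=batch["input_ids"], labels=batch["input_ids"])
print(outputs.loss)
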
- -9 -00:00:48,480 --> 00:00:53,280 -Let’s have a look at a specific example: we  -want to setup a language model that helps us   - -10 -00:00:53,280 --> 00:01:00,800 -autocomplete complete common data science code.  -For that task we would like to weight samples   - -11 -00:01:00,800 --> 00:01:06,960 -stronger where tokens related to the data science  -stack, such as pd or np, occur more frequently.   - -12 -00:01:10,000 --> 00:01:14,788 -Here you see a loss function that does exactly  -that for causal language modeling. It takes the   - -13 -00:01:14,788 --> 00:01:22,800 -models it takes the model’s inputs and predicted  -logits as well as the key tokens as input. First   - -14 -00:01:22,800 --> 00:01:30,320 -the inputs and logits are aligned, then the loss  -per sample is calculate followed by the weights.   - -15 -00:01:32,320 --> 00:01:35,280 -Finally the loss and weights  -are combined and returned.   - -16 -00:01:36,320 --> 00:01:40,480 -This is a pretty big function so let’s take  -a closer look at the loss and weight blocks.   - -17 -00:01:43,200 --> 00:01:47,920 -During the calculation of the standard loss the  -logits and labels are flattened over the batch.   - -18 -00:01:48,720 --> 00:01:53,280 -With the view we unflatten the tensor  -to get a matrix with a row for each   - -19 -00:01:53,280 --> 00:01:57,280 -sample in the batch and a column for each  -position in the sequence of the samples.   - -20 -00:01:58,720 --> 00:02:03,600 -We don’t need the loss per position so we average  -the loss over all positions for each sample.   - -21 -00:02:06,000 --> 00:02:10,960 -For the weights we use boolean logic to get  -a tensor with 1s where a keyword occurred   - -22 -00:02:10,960 --> 00:02:17,840 -and 0s where not. This tensor has an additional  -dimension as the loss tensor we just saw because   - -23 -00:02:17,840 --> 00:02:24,480 -we get the information for each keyword in a  -separate matrix. Only want to know how many   - -24 -00:02:24,480 --> 00:02:30,320 -times keywords occurred per sample so we can sum  -over all keywords and all positions per sample.   - -25 -00:02:33,280 --> 00:02:39,760 -Now we are almost there, we only need to combine  -the loss with the weight per sample. We do this   - -26 -00:02:39,760 --> 00:02:43,920 -with element wise multiplication and then  -average over all samples in the batch.   - -27 -00:02:44,720 --> 00:02:48,000 -In the end we have exactly one  -loss value for the whole batch.   - -28 -00:02:48,880 --> 00:02:52,800 -And this is the whole necessary logic  -to create a custom weighted loss.   - -29 -00:02:56,080 --> 00:03:02,640 -Let’s see how we can make use of that custom loss  -with Accelerate and the Trainer In Accelerate we   - -30 -00:03:02,640 --> 00:03:07,680 -just pass the input_ids to the models to get the  -logits and can then call the custom loss function.   - -31 -00:03:08,800 --> 00:03:12,800 -After that we continue with the normal  -training loop by for example calling backward.   - -32 -00:03:13,840 --> 00:03:19,200 -For the Trainer we can overwrite the compute  -loss function of the standard trainer. We   - -33 -00:03:19,200 --> 00:03:23,360 -just need to make sure that that we return the  -loss and the model outputs in the same format.   - -34 -00:03:24,240 --> 00:03:31,840 -With that you can integrate your own awesome loss  -function with both the trainer and accelerates. 
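As a hedged sketch of the weighted loss described here: the function and class names, the alpha factor, and the keytoken_ids list are illustrative assumptions, not code taken verbatim from the video.

import torch
from torch.nn import CrossEntropyLoss
from transformers import Trainer

def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):
    # Align inputs and logits: tokens up to position n predict token n + 1.
    shift_labels = inputs[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()
    # Per-token loss, reshaped to one row per sample, then averaged over positions.
    loss_fct = CrossEntropyLoss(reduction="none")
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(dim=1)
    # Count keyword occurrences per sample: sum over all keywords and all positions.
    weights = torch.stack([(inputs == kt).float() for kt in keytoken_ids]).sum(dim=(0, 2))
    weights = alpha * (1.0 + weights)
    # Element-wise combination, averaged over the batch: one loss value per batch.
    return (loss_per_sample * weights / weights.sum()).sum()

# keytoken_ids is assumed to be a list of token ids for keywords such as pd or np.
class KeytokenTrainer(Trainer):  # hypothetical subclass name
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(inputs["input_ids"])
        loss = keytoken_weighted_loss(inputs["input_ids"], outputs.logits, keytoken_ids)
        return (loss, outputs) if return_outputs else loss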
+1 +00:00:00,573 --> 00:00:01,636 +(air whooshing) + +2 +00:00:01,636 --> 00:00:02,594 +(logo popping) + +3 +00:00:02,594 --> 00:00:05,550 +(metal sliding) + +4 +00:00:05,550 --> 00:00:07,500 +- In this video, we take +a look at setting up + +5 +00:00:07,500 --> 00:00:09,303 +a custom loss function for training. + +6 +00:00:10,980 --> 00:00:13,260 +In the default loss function, all samples, + +7 +00:00:13,260 --> 00:00:15,840 +such as these code snippets, +are treated the same + +8 +00:00:15,840 --> 00:00:18,960 +irrespective of their content +but there are scenarios + +9 +00:00:18,960 --> 00:00:21,660 +where it could make sense to +weight the samples differently. + +10 +00:00:21,660 --> 00:00:24,570 +If, for example, one sample +contains a lot of tokens + +11 +00:00:24,570 --> 00:00:26,160 +that are of interest to us + +12 +00:00:26,160 --> 00:00:29,910 +or if a sample has a +favorable diversity of tokens. + +13 +00:00:29,910 --> 00:00:31,950 +We can also implement other heuristics + +14 +00:00:31,950 --> 00:00:33,963 +with pattern matching or other rules. + +15 +00:00:35,993 --> 00:00:39,150 +For each sample, we get a +loss value during training + +16 +00:00:39,150 --> 00:00:41,850 +and we can combine that +loss with a weight. + +17 +00:00:41,850 --> 00:00:43,860 +Then we can create a weighted sum + +18 +00:00:43,860 --> 00:00:45,660 +or average over all samples + +19 +00:00:45,660 --> 00:00:47,613 +to get the final loss for the batch. + +20 +00:00:48,690 --> 00:00:51,240 +Let's have a look at a specific example. + +21 +00:00:51,240 --> 00:00:52,830 +We want to set up a language model + +22 +00:00:52,830 --> 00:00:56,073 +that helps us autocomplete +common data science code. + +23 +00:00:57,030 --> 00:01:01,830 +For that task, we would like +to weight samples stronger + +24 +00:01:01,830 --> 00:01:04,110 +where tokens related to +the data science stack, + +25 +00:01:04,110 --> 00:01:07,353 +such as pd or np, occur more frequently. + +26 +00:01:10,140 --> 00:01:13,080 +Here you see a loss function +that does exactly that + +27 +00:01:13,080 --> 00:01:15,180 +for causal language modeling. + +28 +00:01:15,180 --> 00:01:18,030 +It takes the model's input +and predicted logits, + +29 +00:01:18,030 --> 00:01:20,343 +as well as the key tokens, as input. + +30 +00:01:21,869 --> 00:01:25,113 +First, the inputs and logits are aligned. + +31 +00:01:26,490 --> 00:01:29,310 +Then the loss per sample is calculated, + +32 +00:01:29,310 --> 00:01:30,843 +followed by the weights. + +33 +00:01:32,430 --> 00:01:35,583 +Finally, the loss and the weights +are combined and returned. + +34 +00:01:36,540 --> 00:01:39,150 +This is a pretty big function, +so let's take a closer look + +35 +00:01:39,150 --> 00:01:40,953 +at the loss and the weight blocks. + +36 +00:01:43,380 --> 00:01:45,600 +During the calculation +of the standard loss, + +37 +00:01:45,600 --> 00:01:48,930 +the logits and labels are +flattened over the batch. + +38 +00:01:48,930 --> 00:01:52,590 +With the view, we unflatten +the tensor to get the matrix + +39 +00:01:52,590 --> 00:01:55,320 +with a row for each sample +in the batch and a column + +40 +00:01:55,320 --> 00:01:57,723 +for each position in the +sequence of the sample. + +41 +00:01:58,920 --> 00:02:00,600 +We don't need the loss per position, + +42 +00:02:00,600 --> 00:02:04,083 +so we average the loss over +all positions for each sample. 
+
+43
+00:02:06,150 --> 00:02:08,970
+For the weights, we use
+Boolean logic to get a tensor

+44
+00:02:08,970 --> 00:02:12,483
+with 1s where a keyword
+occurred and 0s where not.

+45
+00:02:13,440 --> 00:02:15,690
+This tensor has an additional dimension

+46
+00:02:15,690 --> 00:02:18,540
+as the loss tensor we
+just saw because we get

+47
+00:02:18,540 --> 00:02:21,693
+the information for each
+keyword in a separate matrix.

+48
+00:02:22,770 --> 00:02:24,120
+We only want to know

+49
+00:02:24,120 --> 00:02:26,880
+how many times keywords
+occurred per sample,

+50
+00:02:26,880 --> 00:02:30,693
+so we can sum over all keywords
+and all positions per sample.

+51
+00:02:33,450 --> 00:02:35,010
+Now we're almost there.

+52
+00:02:35,010 --> 00:02:38,850
+We only need to combine the
+loss with the weight per sample.

+53
+00:02:38,850 --> 00:02:41,790
+We do this with element
+wise multiplication

+54
+00:02:41,790 --> 00:02:45,233
+and then average over all
+samples in the batch.

+55
+00:02:45,233 --> 00:02:46,066
+In the end,

+56
+00:02:46,066 --> 00:02:49,110
+we have exactly one loss
+value for the whole batch

+57
+00:02:49,110 --> 00:02:51,330
+and this is the whole necessary logic

+58
+00:02:51,330 --> 00:02:53,223
+to create a custom weighted loss.

+59
+00:02:56,250 --> 00:02:59,010
+Let's see how we can make
+use of that custom loss

+60
+00:02:59,010 --> 00:03:00,753
+with Accelerate and the Trainer.

+61
+00:03:01,710 --> 00:03:04,656
+In Accelerate, we just pass the input_ids

+62
+00:03:04,656 --> 00:03:05,730
+to the model to get the logits

+63
+00:03:05,730 --> 00:03:08,103
+and then we can call the
+custom loss function.

+64
+00:03:09,000 --> 00:03:11,310
+After that, we continue with
+the normal training loop

+65
+00:03:11,310 --> 00:03:13,083
+by, for example, calling backward.

+66
+00:03:14,010 --> 00:03:15,570
+For the Trainer, we can overwrite

+67
+00:03:15,570 --> 00:03:19,260
+the compute_loss function
+of the standard Trainer.

+68
+00:03:19,260 --> 00:03:20,970
+We just need to make sure that we return

+69
+00:03:20,970 --> 00:03:24,450
+the loss and the model
+outputs in the same format.

+70
+00:03:24,450 --> 00:03:27,570
+With that, you can integrate
+your own awesome loss function

+71
+00:03:27,570 --> 00:03:29,763
+with both the Trainer and Accelerate.

+72
+00:03:31,389 --> 00:03:34,056
+(air whooshing)
+
diff --git a/subtitles/en/65_data-processing-for-question-answering.srt b/subtitles/en/65_data-processing-for-question-answering.srt
index 81bc4280f..c0ea48326 100644
--- a/subtitles/en/65_data-processing-for-question-answering.srt
+++ b/subtitles/en/65_data-processing-for-question-answering.srt
@@ -1,185 +1,277 @@
-1
-00:00:05,569 --> 00:00:10,490
-Let's study how to preprocess a dataset for
-question answering!
-
-2
-00:00:10,490 --> 00:00:14,260
-Question answering is the task of finding
-answers to a question in some context.
-
-3
-00:00:14,260 --> 00:00:19,970
-For our example, we will use the squad dataset,
-in which we remove columns we won't use and
-
-4
-00:00:19,970 --> 00:00:24,390
-just extract the information we will need
-for the labels: the start and the end of the
-
-5
-00:00:24,390 --> 00:00:25,390
-answer in the context.
- -6 -00:00:25,390 --> 00:00:30,279 -If you have your own dataset for question -answering, just make sure you clean your data - -7 -00:00:30,279 --> 00:00:34,800 -to get to the same point, with one column -containing the questions, one column containing - -8 -00:00:34,800 --> 00:00:39,350 -the contexts, one column for the index of -the start and end character of the answer - -9 -00:00:39,350 --> 00:00:41,700 -in the context. - -10 -00:00:41,700 --> 00:00:44,610 -Note that the answer must be part of the context. - -11 -00:00:44,610 --> 00:00:48,360 -If you want to perform generative question -answering, look at one of the sequence to - -12 -00:00:48,360 --> 00:00:50,890 -sequence videos linked below. - -13 -00:00:50,890 --> 00:00:55,860 -Now if we have a look at the tokens we will -feed our model we will see the answer lies - -14 -00:00:55,860 --> 00:00:58,450 -somewhere inside the context. - -15 -00:00:58,450 --> 00:01:02,239 -For very long context that answer may get -truncated by the tokenizer. - -16 -00:01:02,239 --> 00:01:06,050 -In this case, we wont have any proper labels -for our model. - -17 -00:01:06,050 --> 00:01:11,159 -So we should keep the truncated part as a -separate feature instead of discarding it. - -18 -00:01:11,159 --> 00:01:14,720 -The only thing we need to be careful with, -is to allow some overlap between separate - -19 -00:01:14,720 --> 00:01:19,900 -chunks so that the answer is not truncated, -and that the feature containing the answer - -20 -00:01:19,900 --> 00:01:22,670 -gets sufficient context to be able to predict -it. - -21 -00:01:22,670 --> 00:01:28,790 -Here is how it can be done by the tokenizer: -we pass it the question, context, set the - -22 -00:01:28,790 --> 00:01:32,750 -truncation for the context only and the padding -to the maximum length. - -23 -00:01:32,750 --> 00:01:39,590 -The stride argument is where we set the number -of overlapping tokens, and the return_overflowing_tokens - -24 -00:01:39,590 --> 00:01:42,869 -means we don't want to discard the truncated -part. - -25 -00:01:42,869 --> 00:01:47,140 -Lastly, we also return the offset mappings -to be able to find the tokens corresponding - -26 -00:01:47,140 --> 00:01:48,649 -to the answer start and end. - -27 -00:01:48,649 --> 00:01:53,990 -We want those two tokens, because there will -be the labels we pass to our model. - -28 -00:01:53,990 --> 00:01:57,200 -In a one-hot encoded version, here is what -they look like. - -29 -00:01:57,200 --> 00:02:02,119 -If the context we have does not contain the -answer, we set the two labels to the index - -30 -00:02:02,119 --> 00:02:04,329 -of the CLS token. - -31 -00:02:04,329 --> 00:02:08,629 -We also do this if the context only partially -contains the answer. - -32 -00:02:08,629 --> 00:02:13,950 -In terms of code, here is how we can do it: -using the sequence IDs of an input, we can - -33 -00:02:13,950 --> 00:02:17,390 -determine the beginning and the end of the -context. - -34 -00:02:17,390 --> 00:02:22,290 -Then we know if have to return the CLS position -for the two labels or we determine the positions - -35 -00:02:22,290 --> 00:02:25,120 -of the first and last tokens of the answer. - -36 -00:02:25,120 --> 00:02:28,670 -We can check it works properly on our previous -example. - -37 -00:02:28,670 --> 00:02:35,319 -Putting it all together looks like this big -function, which we can apply to our datasets. 
- -38 -00:02:35,319 --> 00:02:40,010 -Since we applied padding during the tokenization, -we can then use this directly in the Trainer - -39 -00:02:40,010 --> 00:02:43,920 -or apply the to_tf_dataset method to use Keras.fit. +1 +00:00:05,580 --> 00:00:07,177 +- Let's study how to preprocess a dataset + +2 +00:00:07,177 --> 00:00:08,643 +for question answering. + +3 +00:00:10,200 --> 00:00:11,640 +Question answering is a task + +4 +00:00:11,640 --> 00:00:14,343 +of finding answers to a +question in some context. + +5 +00:00:15,270 --> 00:00:17,550 +For example, we'll use the SQuAD dataset + +6 +00:00:17,550 --> 00:00:19,860 +in which we remove columns we won't use + +7 +00:00:19,860 --> 00:00:21,660 +and just extract the +information we will need + +8 +00:00:21,660 --> 00:00:22,950 +for the labels, + +9 +00:00:22,950 --> 00:00:26,370 +the start and the end of +the answer in the context. + +10 +00:00:26,370 --> 00:00:28,690 +If you have your own dataset +for question answering, + +11 +00:00:28,690 --> 00:00:31,680 +just make sure you clean your +data to get to the same point, + +12 +00:00:31,680 --> 00:00:33,900 +with one column containing the questions, + +13 +00:00:33,900 --> 00:00:35,940 +one column containing the context, + +14 +00:00:35,940 --> 00:00:38,610 +one column for the index of +the start and end character + +15 +00:00:38,610 --> 00:00:40,473 +of the answer in the context. + +16 +00:00:41,610 --> 00:00:44,520 +Note that the answer must +be part of the context. + +17 +00:00:44,520 --> 00:00:47,160 +If you want to perform +generative question answering, + +18 +00:00:47,160 --> 00:00:50,160 +look at one of the sequence to +sequence videos linked below. + +19 +00:00:51,600 --> 00:00:53,430 +Now, if we have a look at the tokens + +20 +00:00:53,430 --> 00:00:54,750 +we will feed our model, + +21 +00:00:54,750 --> 00:00:58,320 +we'll see the answer lies +somewhere inside the context. + +22 +00:00:58,320 --> 00:01:01,080 +For very long context, that +answer may get truncated + +23 +00:01:01,080 --> 00:01:02,580 +by the tokenizer. + +24 +00:01:02,580 --> 00:01:05,970 +In this case, we won't have any +proper labels for our model, + +25 +00:01:05,970 --> 00:01:07,680 +so we should keep the truncated part + +26 +00:01:07,680 --> 00:01:10,203 +as a separate feature +instead of discarding it. + +27 +00:01:11,100 --> 00:01:12,990 +The only thing we need to be careful with + +28 +00:01:12,990 --> 00:01:15,660 +is to allow some overlap +between separate chunks + +29 +00:01:15,660 --> 00:01:17,670 +so that the answer is not truncated + +30 +00:01:17,670 --> 00:01:19,920 +and that the feature containing the answer + +31 +00:01:19,920 --> 00:01:22,623 +gets sufficient context +to be able to predict it. + +32 +00:01:23,490 --> 00:01:26,040 +Here is how it can be +done by the tokenizer. + +33 +00:01:26,040 --> 00:01:29,370 +We pass it the question, +context, set a truncation + +34 +00:01:29,370 --> 00:01:33,240 +for the context only, and the +padding to the maximum length. + +35 +00:01:33,240 --> 00:01:35,340 +The stride argument is +where we set the number + +36 +00:01:35,340 --> 00:01:36,900 +of overlapping tokens, + +37 +00:01:36,900 --> 00:01:39,600 +and the return overflowing +tokens equals true + +38 +00:01:39,600 --> 00:01:42,630 +means we don't want to +discard the truncated part. 
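A minimal sketch of the tokenizer call being described, assuming a fast tokenizer and one dataset row (example) with "question" and "context" fields; the checkpoint, max_length, and stride values are illustrative choices, not taken from the video.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # illustrative checkpoint

inputs = tokenizer(
    example["question"],
    example["context"],
    truncation="only_second",        # truncate the context only, never the question
    padding="max_length",
    max_length=384,                  # illustrative value
    stride=128,                      # number of overlapping tokens between chunks
    return_overflowing_tokens=True,  # keep the truncated parts as extra features
    return_offsets_mapping=True,     # offsets to locate the answer's start/end tokens
)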
+
+39
+00:01:42,630 --> 00:01:45,210
+Lastly, we also return the offset mappings

+40
+00:01:45,210 --> 00:01:47,220
+to be able to find the
+tokens corresponding

+41
+00:01:47,220 --> 00:01:48,693
+to the answer start and end.

+42
+00:01:49,860 --> 00:01:52,290
+We want those tokens because
+they will be the labels

+43
+00:01:52,290 --> 00:01:53,970
+we pass to our model.

+44
+00:01:53,970 --> 00:01:56,870
+In a one-hot encoded version,
+here is what they look like.

+45
+00:01:57,930 --> 00:02:00,480
+If the context we have does
+not contain the answer,

+46
+00:02:00,480 --> 00:02:03,799
+we set the two labels to
+the index of the CLS token.

+47
+00:02:03,799 --> 00:02:05,700
+We also do this if the context

+48
+00:02:05,700 --> 00:02:07,713
+only partially contains the answer.

+49
+00:02:08,580 --> 00:02:11,400
+In terms of code, here
+is how we can do it.

+50
+00:02:11,400 --> 00:02:13,710
+Using the sequence IDs of an input,

+51
+00:02:13,710 --> 00:02:17,220
+we can determine the beginning
+and the end of the context.

+52
+00:02:17,220 --> 00:02:19,800
+Then, we know if we have to
+return the CLS position

+53
+00:02:19,800 --> 00:02:22,290
+for the two labels or we
+determine the position

+54
+00:02:22,290 --> 00:02:25,050
+of the first and last
+tokens of the answer.

+55
+00:02:25,050 --> 00:02:27,800
+We can check it works properly
+on our previous example.

+56
+00:02:28,680 --> 00:02:31,380
+Putting it all together
+looks like this big function,

+57
+00:02:31,380 --> 00:02:34,233
+which we can apply to our
+datasets with the map method.

+58
+00:02:35,310 --> 00:02:37,920
+Since we applied padding
+during the tokenization,

+59
+00:02:37,920 --> 00:02:40,680
+we can then use this
+directly in the Trainer

+60
+00:02:40,680 --> 00:02:44,133
+or apply the to_tf_dataset
+method to use Keras.fit.

diff --git a/subtitles/en/66_the-post-processing-step-in-question-answering-(pytorch).srt b/subtitles/en/66_the-post-processing-step-in-question-answering-(pytorch).srt
index be70d0f01..d4a6fd6db 100644
--- a/subtitles/en/66_the-post-processing-step-in-question-answering-(pytorch).srt
+++ b/subtitles/en/66_the-post-processing-step-in-question-answering-(pytorch).srt
@@ -1,169 +1,342 @@
-1
-00:00:05,680 --> 00:00:12,000
-The post-processing step in a question  
-answering task. When doing question answering,   

-2
-00:00:12,000 --> 00:00:17,440
-the processing of the initial dataset implies  
-splitting examples in several features, which   

-3
-00:00:17,440 --> 00:00:23,760
-may or may not contain the answer. Passing those  
-features through the model will give us logits for   

-4
-00:00:23,760 --> 00:00:29,280
-the start and end positions, since our labels are  
-the indices of the tokens that correspond to the   

-5
-00:00:29,280 --> 00:00:35,600
-start and end the answer. We must then somehow  
-convert those logits into an answer, and then   

-6
-00:00:35,600 --> 00:00:40,480
-pick one of the various answers each feature  
-gives to be THE answer for a given example.   

-7
-00:00:42,080 --> 00:00:46,080
-For the processing step, you should refer  
-to the video linked below. 
It's not very   - -8 -00:00:46,080 --> 00:00:50,240 -different for validation, we just need to  -add a few lines to keep track of two things:   - -9 -00:00:51,440 --> 00:00:56,000 -instead of discarding the offset mapping,  -we keep them, and also include in them the   - -10 -00:00:56,000 --> 00:01:01,440 -information of where the context is by setting  -the offsets of the special tokens and the question   - -11 -00:01:01,440 --> 00:01:06,400 -to None. Then we also keep track  -of the example ID for each feature,   - -12 -00:01:06,400 --> 00:01:10,160 -to be able to map back feature to the  -examples that they originated from.   - -13 -00:01:11,680 --> 00:01:15,840 -If you don't want to compute the validation loss,  -you won't need to include all the special code   - -14 -00:01:15,840 --> 00:01:21,360 -that we used to create the labels. With this done,  -we can apply that preprocessing function using the   - -15 -00:01:21,360 --> 00:01:26,160 -map method. We take the SQUAD dataset like in  -the preprocessing for question-answering video.   - -16 -00:01:27,520 --> 00:01:31,920 -Once this is done, the next step is to create  -our model. We use the default model behind the   - -17 -00:01:31,920 --> 00:01:36,000 -question-answering pipeline here, but you  -should use any model you want to evaluate.   - -18 -00:01:36,720 --> 00:01:41,200 -We will run a manual evaluation loop, so we  -create a PyTorch DataLoader with our features.   - -19 -00:01:42,240 --> 00:01:46,400 -With it, we can compute and gather all  -the start and end logits like this,   - -20 -00:01:46,400 --> 00:01:52,240 -with a standard PyTorch evaluation loop. With this  -done, we can really dive into the post-processing.   - -21 -00:01:53,680 --> 00:01:57,440 -We will need a map from examples to  -features, which we can create like this.   - -22 -00:01:58,560 --> 00:02:02,720 -Now, for the main part of the post-processing,  -let's see how to extract an answer from the   - -23 -00:02:02,720 --> 00:02:08,480 -logits. We could just take the best index for the  -start and end logits and be done, but if our model   - -24 -00:02:08,480 --> 00:02:13,200 -predicts something impossible, like tokens in  -the question, we will look at more of the logits.   - -25 -00:02:15,040 --> 00:02:19,120 -Note that in the question-answering pipeline,  -we attributed score to each answer based on the   - -26 -00:02:19,120 --> 00:02:24,560 -probabilities, which we did not compute here.  -In terms of logits, the multiplication we had   - -27 -00:02:24,560 --> 00:02:31,520 -in the scores becomes an addition. To go fast, we  -don't look at all possible start and end logits,   - -28 -00:02:31,520 --> 00:02:37,120 -but the twenty best ones. We ignore the logits  -that spawn impossible answers or answer that are   - -29 -00:02:37,120 --> 00:02:43,040 -too long. As we saw in the preprocessing,  -the labels (0, 0) correspond to no answer,   - -30 -00:02:43,040 --> 00:02:48,400 -otherwise we use the offsets to get the answer  -inside the context. Let's have a look at the   - -31 -00:02:48,400 --> 00:02:52,640 -predicted answer for the first feature, which  -is the answer with the best score (or the best   - -32 -00:02:52,640 --> 00:02:58,720 -logit score since the SoftMax is an increasing  -function). The model got it right! Next we just   - -33 -00:02:58,720 --> 00:03:03,920 -have to loop this for every example, picking for  -each the answer with the best logit score in all   - -34 -00:03:03,920 --> 00:03:15,440 -the features the example generated. 
Now you know  -how to get answers from your model predictions! +1 +00:00:00,315 --> 00:00:02,982 +(air whooshing) + +2 +00:00:05,940 --> 00:00:08,913 +- The post-processing step +in a question answering task. + +3 +00:00:10,440 --> 00:00:12,180 +When doing question answering, + +4 +00:00:12,180 --> 00:00:14,550 +the processing of the initial dataset + +5 +00:00:14,550 --> 00:00:17,370 +implies splitting examples +in several features, + +6 +00:00:17,370 --> 00:00:19,773 +which may or may not contain the answer. + +7 +00:00:21,000 --> 00:00:22,740 +Passing those features through the model + +8 +00:00:22,740 --> 00:00:25,830 +will give us logits for the +start and end positions, + +9 +00:00:25,830 --> 00:00:28,650 +since our labels are +the indices of the token + +10 +00:00:28,650 --> 00:00:31,050 +that correspond to the +start and end the answer. + +11 +00:00:32,664 --> 00:00:35,490 +We must then somehow convert +those logits into an answer, + +12 +00:00:35,490 --> 00:00:38,610 +and then pick one of the various +answers each feature gives + +13 +00:00:38,610 --> 00:00:40,893 +to be the answer for a given example. + +14 +00:00:42,300 --> 00:00:43,500 +For the processing step, + +15 +00:00:43,500 --> 00:00:45,750 +you should refer to +the video linked below. + +16 +00:00:45,750 --> 00:00:47,820 +It's not very different for validation, + +17 +00:00:47,820 --> 00:00:50,820 +we just need to add a few lines +to keep track of two things. + +18 +00:00:51,660 --> 00:00:54,960 +Instead of discarding the +offset mappings, we keep them, + +19 +00:00:54,960 --> 00:00:55,793 +and also include in them + +20 +00:00:55,793 --> 00:00:58,350 +the information of where the context is + +21 +00:00:58,350 --> 00:01:00,690 +by setting the offsets +of the special tokens + +22 +00:01:00,690 --> 00:01:02,253 +and the question to None. + +23 +00:01:03,480 --> 00:01:06,630 +Then we also keep track of the +example ID for each feature, + +24 +00:01:06,630 --> 00:01:08,280 +to be able to map back feature + +25 +00:01:08,280 --> 00:01:10,503 +to the examples that they originated from. + +26 +00:01:11,940 --> 00:01:14,100 +If you don't want to +compute the validation loss, + +27 +00:01:14,100 --> 00:01:15,990 +you won't need to include +all the special code + +28 +00:01:15,990 --> 00:01:18,420 +that we used to create the labels. + +29 +00:01:18,420 --> 00:01:21,090 +With this done, we can apply +that preprocessing function + +30 +00:01:21,090 --> 00:01:22,890 +using the map method. + +31 +00:01:22,890 --> 00:01:24,090 +We take the SQUAD dataset + +32 +00:01:24,090 --> 00:01:26,840 +like in the preprocessing +for question-answering video. + +33 +00:01:27,810 --> 00:01:30,540 +Once this is done, the next +step is to create our model. + +34 +00:01:30,540 --> 00:01:31,710 +We use the default model + +35 +00:01:31,710 --> 00:01:33,930 +behind the question-answering +pipeline here, + +36 +00:01:33,930 --> 00:01:36,960 +but you should use any +model you want to evaluate. + +37 +00:01:36,960 --> 00:01:38,850 +We'll run a manual evaluation loop, + +38 +00:01:38,850 --> 00:01:41,583 +so we create a PyTorch +DataLoader with our features. + +39 +00:01:42,657 --> 00:01:44,520 +With it, we can compute and gather + +40 +00:01:44,520 --> 00:01:46,650 +all the start and end logits like this, + +41 +00:01:46,650 --> 00:01:49,653 +with a standard PyTorch evaluation loop. + +42 +00:01:49,653 --> 00:01:53,220 +With this done, we can really +dive into the post-processing. 
+ +43 +00:01:53,220 --> 00:01:56,340 +First, we'll need a map +from example to features, + +44 +00:01:56,340 --> 00:01:57,873 +which we can create like this. + +45 +00:01:58,800 --> 00:02:00,810 +Now, for the main part +of the post-processing, + +46 +00:02:00,810 --> 00:02:04,230 +let's see how to extract +an answer from the logits. + +47 +00:02:04,230 --> 00:02:05,760 +We could just take the best index + +48 +00:02:05,760 --> 00:02:07,980 +for the start and end logits and be done, + +49 +00:02:07,980 --> 00:02:10,380 +but if our model predicts +something impossible, + +50 +00:02:10,380 --> 00:02:12,150 +like tokens in the question, + +51 +00:02:12,150 --> 00:02:13,940 +we'll look at more of the logits. + +52 +00:02:15,270 --> 00:02:17,070 +Note that in the +question-answering pipeline, + +53 +00:02:17,070 --> 00:02:18,870 +we attributed score to each answer + +54 +00:02:18,870 --> 00:02:20,430 +based on the probabilities, + +55 +00:02:20,430 --> 00:02:22,350 +which we did not compute here. + +56 +00:02:22,350 --> 00:02:25,560 +In terms of logits, the +multiplication we had in the scores + +57 +00:02:25,560 --> 00:02:26,853 +becomes an addition. + +58 +00:02:28,110 --> 00:02:29,010 +To go fast, + +59 +00:02:29,010 --> 00:02:31,800 +we don't look at all possible +start and end logits, + +60 +00:02:31,800 --> 00:02:34,050 +but the 20 best one is enough. + +61 +00:02:34,050 --> 00:02:36,570 +We ignore the logits that +spawn impossible answers + +62 +00:02:36,570 --> 00:02:38,550 +or answer that are too long. + +63 +00:02:38,550 --> 00:02:41,430 +As we saw in the +preprocessing, the labels 0,0 + +64 +00:02:41,430 --> 00:02:43,230 +correspond to a no answer. + +65 +00:02:43,230 --> 00:02:45,090 +Otherwise we use the offsets + +66 +00:02:45,090 --> 00:02:46,940 +to get the answer inside the context. + +67 +00:02:47,910 --> 00:02:49,107 +Let's have a look at the predicted answer + +68 +00:02:49,107 --> 00:02:50,370 +for the first feature, + +69 +00:02:50,370 --> 00:02:51,930 +which is the answer with the best score + +70 +00:02:51,930 --> 00:02:53,640 +or the best logit score + +71 +00:02:53,640 --> 00:02:56,280 +since the SoftMax is +an increasing function. + +72 +00:02:56,280 --> 00:02:58,230 +The model got it right. + +73 +00:02:58,230 --> 00:03:00,690 +Next we just have to loop +this for every example, + +74 +00:03:00,690 --> 00:03:03,720 +picking for each the answer +with the best logit score + +75 +00:03:03,720 --> 00:03:06,750 +in all the features the example generated. + +76 +00:03:06,750 --> 00:03:09,700 +Now you know how to get answers +from your model prediction. + +77 +00:03:11,007 --> 00:03:13,674 +(air whooshing) + diff --git a/subtitles/en/67_the-post-processing-step-in-question-answering-(tensorflow).srt b/subtitles/en/67_the-post-processing-step-in-question-answering-(tensorflow).srt index d29515a59..59c0957ce 100644 --- a/subtitles/en/67_the-post-processing-step-in-question-answering-(tensorflow).srt +++ b/subtitles/en/67_the-post-processing-step-in-question-answering-(tensorflow).srt @@ -1,169 +1,329 @@ -1 -00:00:05,760 --> 00:00:08,560 -The post-processing step in  -a question answering task.   - -2 -00:00:10,640 --> 00:00:14,640 -When doing question answering, the  -processing of the initial dataset   - -3 -00:00:14,640 --> 00:00:20,960 -implies splitting examples in several features,  -which may or may not contain the answer. 
Passing   - -4 -00:00:20,960 --> 00:00:25,680 -those features through the model will give  -us logits for the start and end positions,   - -5 -00:00:25,680 --> 00:00:30,640 -since our labels are the indices of the tokens  -that correspond to the start and end the answer.   - -6 -00:00:31,600 --> 00:00:36,560 -We must then somehow convert those logits into an  -answer, and then pick one of the various answers   - -7 -00:00:36,560 --> 00:00:43,280 -each feature gives to be THE answer for a given  -example. For the processing step, you should   - -8 -00:00:43,280 --> 00:00:47,840 -refer to the video linked below. It's not very  -different for validation, we just need to add a   - -9 -00:00:47,840 --> 00:00:53,520 -few lines to keep track of two things: instead  -of discarding the offset mapping, we keep them,   - -10 -00:00:53,520 --> 00:00:58,240 -and also include in them the information of  -where the context is by setting the offsets   - -11 -00:00:58,240 --> 00:01:04,240 -of the special tokens and the question to None.  -Then we also keep track of the example ID for   - -12 -00:01:04,240 --> 00:01:08,880 -each feature, to be able to map back feature  -to the examples that they originated from.   - -13 -00:01:10,240 --> 00:01:14,400 -If you don't want to compute the validation loss,  -you won't need to include all the special code   - -14 -00:01:14,400 --> 00:01:19,840 -that we used to create the labels. With this done,  -we can apply that preprocessing function using the   - -15 -00:01:19,840 --> 00:01:26,160 -map method. We take the SQUAD dataset like in  -the preprocessing for question-answering video.   - -16 -00:01:26,160 --> 00:01:30,560 -Once this is done, the next step is to create  -our model. We use the default model behind the   - -17 -00:01:30,560 --> 00:01:34,560 -question-answering pipeline here, but you  -should use any model you want to evaluate.   - -18 -00:01:35,600 --> 00:01:40,560 -With the to_tf_dataset method, we can just  -sent our processed dataset to model.predict,   - -19 -00:01:41,120 --> 00:01:44,880 -and we directly get our start and end logits  -for the whole dataset as NumPy arrays.   - -20 -00:01:45,600 --> 00:01:51,040 -With this done, we can really dive into the  -post-processing. We will need a map from examples   - -21 -00:01:51,040 --> 00:01:57,040 -to features, which we can create like this. Now,  -for the main part of the post-processing, let's   - -22 -00:01:57,040 --> 00:02:02,080 -see how to extract an answer from the logits. We  -could just take the best index for the start and   - -23 -00:02:02,080 --> 00:02:07,680 -end logits and be done, but if our model predicts  -something impossible, like tokens in the question,   - -24 -00:02:07,680 --> 00:02:13,040 -we will look at more of the logits. Note that in  -the question-answering pipeline, we attributed   - -25 -00:02:13,040 --> 00:02:18,560 -score to each answer based on the probabilities,  -which we did not compute here. In terms of logits,   - -26 -00:02:18,560 --> 00:02:24,080 -the multiplication we had in the scores becomes  -an addition. To go fast, we don't look at all   - -27 -00:02:24,080 --> 00:02:29,040 -possible start and end logits, but the twenty  -best ones. We ignore the logits that spawn   - -28 -00:02:29,040 --> 00:02:34,240 -impossible answers or answer that are too long.  -As we saw in the preprocessing, the labels (0,   - -29 -00:02:34,240 --> 00:02:38,880 -0) correspond to no answer, otherwise we use the  -offsets to get the answer inside the context.   
- -30 -00:02:39,920 --> 00:02:43,760 -Let's have a look at the predicted answer  -for the first feature, which is the answer   - -31 -00:02:43,760 --> 00:02:47,680 -with the best score (or the best logit score  -since the SoftMax is an increasing function).   - -32 -00:02:48,480 --> 00:02:54,000 -The model got it right! Next we just  -have to loop this for every example,   - -33 -00:02:54,000 --> 00:02:58,880 -picking for each the answer with the best logit  -score in all the features the example generated.   - -34 -00:02:59,840 --> 00:03:03,840 -Now you know how to get answers  -from your model predictions! +1 +00:00:00,367 --> 00:00:02,950 +(subtle blast) + +2 +00:00:05,850 --> 00:00:08,913 +- The post-processing step +in a question-answering task. + +3 +00:00:10,830 --> 00:00:11,790 +When doing question answering, + +4 +00:00:11,790 --> 00:00:14,670 +the processing of the initial dataset + +5 +00:00:14,670 --> 00:00:18,090 +implies splitting examples +in several features, + +6 +00:00:18,090 --> 00:00:20,850 +which may or may not contain the answer. + +7 +00:00:20,850 --> 00:00:22,530 +Passing those features through the model + +8 +00:00:22,530 --> 00:00:25,860 +will give us logits for the +start and end positions, + +9 +00:00:25,860 --> 00:00:28,620 +since our labels are the +indices of the tokens + +10 +00:00:28,620 --> 00:00:31,020 +that correspond to the +start and end the answer. + +11 +00:00:31,860 --> 00:00:34,740 +We must then somehow convert +those logits into an answer, + +12 +00:00:34,740 --> 00:00:38,070 +and then pick one of the various +answers each feature gives + +13 +00:00:38,070 --> 00:00:40,473 +to be the answer for a given example. + +14 +00:00:41,683 --> 00:00:43,200 +For the processing step, + +15 +00:00:43,200 --> 00:00:45,450 +you should refer to +the video linked below. + +16 +00:00:45,450 --> 00:00:47,310 +It's not very different for validation, + +17 +00:00:47,310 --> 00:00:50,053 +we just need to add a few lines +to keep track of two things: + +18 +00:00:50,053 --> 00:00:52,620 +instead of discarding the offset mappings, + +19 +00:00:52,620 --> 00:00:55,380 +we keep them, and also include +in them the information + +20 +00:00:55,380 --> 00:00:58,410 +of where the context is +by setting the offsets + +21 +00:00:58,410 --> 00:01:01,821 +of the special tokens +and the question to None. + +22 +00:01:01,821 --> 00:01:05,370 +Then we also keep track of the +example ID for each feature, + +23 +00:01:05,370 --> 00:01:07,020 +to be able to map back feature + +24 +00:01:07,020 --> 00:01:09,243 +to the examples that they originated from. + +25 +00:01:10,470 --> 00:01:12,660 +If you don't want to +compute the validation loss, + +26 +00:01:12,660 --> 00:01:14,610 +you won't need to include +all the special code + +27 +00:01:14,610 --> 00:01:17,010 +that we used to create the labels. + +28 +00:01:17,010 --> 00:01:19,650 +With this done, we can apply +that preprocessing function + +29 +00:01:19,650 --> 00:01:21,480 +using the map method. + +30 +00:01:21,480 --> 00:01:23,610 +We take the SQUAD dataset +like in the preprocessing + +31 +00:01:23,610 --> 00:01:25,060 +for question-answering video. + +32 +00:01:26,400 --> 00:01:29,310 +Once this is done, the next +step is to create our model. + +33 +00:01:29,310 --> 00:01:30,570 +We use the default model behind + +34 +00:01:30,570 --> 00:01:32,640 +the question-answering pipeline here, + +35 +00:01:32,640 --> 00:01:35,880 +but you should use any +model you want to evaluate. 
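As a rough sketch of the answer-extraction logic described above, assuming start_logits and end_logits are NumPy arrays for a single feature and offsets is the offset mapping kept at preprocessing time (the function name and the max_answer_length value are illustrative assumptions):

import numpy as np

n_best = 20
max_answer_length = 30  # illustrative limit

def best_answer_for_feature(start_logits, end_logits, offsets, context):
    # Take the 20 best start and end indices instead of only the single best one.
    start_indexes = np.argsort(start_logits)[-1 : -n_best - 1 : -1].tolist()
    end_indexes = np.argsort(end_logits)[-1 : -n_best - 1 : -1].tolist()
    best = {"text": "", "score": None}
    for start_index in start_indexes:
        for end_index in end_indexes:
            # Skip spans outside the context, reversed spans, and spans that are too long.
            if offsets[start_index] is None or offsets[end_index] is None:
                continue
            if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                continue
            # In terms of logits, the score is an addition instead of a multiplication.
            score = start_logits[start_index] + end_logits[end_index]
            if best["score"] is None or score > best["score"]:
                start_char, end_char = offsets[start_index][0], offsets[end_index][1]
                best = {"text": context[start_char:end_char], "score": score}
    return best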
+ +36 +00:01:35,880 --> 00:01:37,680 +With the to_tf_dataset method, + +37 +00:01:37,680 --> 00:01:41,370 +we can just sent our processed +dataset to model.predict, + +38 +00:01:41,370 --> 00:01:43,350 +and we directly get our +start and end logits + +39 +00:01:43,350 --> 00:01:45,930 +for the whole dataset as NumPy arrays. + +40 +00:01:45,930 --> 00:01:49,230 +With this done, we can really +dive into the post-processing. + +41 +00:01:49,230 --> 00:01:52,380 +First, we'll need a map +from example to features, + +42 +00:01:52,380 --> 00:01:53,883 +which we can create like this. + +43 +00:01:54,780 --> 00:01:56,700 +Now, for the main part +of the post-processing, + +44 +00:01:56,700 --> 00:02:00,270 +let's see how to extract +an answer from the logits. + +45 +00:02:00,270 --> 00:02:01,650 +We could just take the best index + +46 +00:02:01,650 --> 00:02:03,690 +for the start and end logits and be done, + +47 +00:02:03,690 --> 00:02:06,180 +but if our model predicts +something impossible, + +48 +00:02:06,180 --> 00:02:07,920 +like tokens in the questions, + +49 +00:02:07,920 --> 00:02:09,670 +we will look at more of the logits. + +50 +00:02:10,800 --> 00:02:12,570 +Note that in the +question-answering pipeline, + +51 +00:02:12,570 --> 00:02:14,160 +we attributed the score to each answer + +52 +00:02:14,160 --> 00:02:17,880 +based on the probabilities, +which we did not compute here. + +53 +00:02:17,880 --> 00:02:19,860 +In terms of logits, the +multiplication we had + +54 +00:02:19,860 --> 00:02:21,663 +in the scores becomes an addition. + +55 +00:02:22,650 --> 00:02:23,910 +To go fast, we don't look + +56 +00:02:23,910 --> 00:02:25,343 +at all possible start and end logits, + +57 +00:02:25,343 --> 00:02:26,973 +but the 20 best ones. + +58 +00:02:27,810 --> 00:02:30,386 +We ignore the logits that +spawn impossible answers + +59 +00:02:30,386 --> 00:02:32,370 +or answer that are too long. + +60 +00:02:32,370 --> 00:02:33,720 +As we saw in the preprocessing, + +61 +00:02:33,720 --> 00:02:36,240 +the label "0, 0" correspond to no answer, + +62 +00:02:36,240 --> 00:02:37,440 +otherwise we use the offset + +63 +00:02:37,440 --> 00:02:39,290 +to get the answer inside the context. + +64 +00:02:40,260 --> 00:02:41,580 +Let's have a look at the predicted answer + +65 +00:02:41,580 --> 00:02:43,200 +for the first feature, + +66 +00:02:43,200 --> 00:02:44,790 +which is the answer with the best score, + +67 +00:02:44,790 --> 00:02:46,860 +or the best logit score since the SoftMax + +68 +00:02:46,860 --> 00:02:48,810 +is an increasing function. + +69 +00:02:48,810 --> 00:02:49,960 +The model got it right. + +70 +00:02:51,210 --> 00:02:54,180 +Next, we just have to loop +this for every example, + +71 +00:02:54,180 --> 00:02:56,700 +picking for each the answer +with the best logit score + +72 +00:02:56,700 --> 00:02:59,133 +in all the features the example generated. + +73 +00:03:00,030 --> 00:03:03,030 +Now you know how to get answers +from your model predictions. + +74 +00:03:04,214 --> 00:03:06,797 +(subtle blast) + diff --git a/subtitles/en/68_data-collators-a-tour.srt b/subtitles/en/68_data-collators-a-tour.srt index e388c1513..56895ea7e 100644 --- a/subtitles/en/68_data-collators-a-tour.srt +++ b/subtitles/en/68_data-collators-a-tour.srt @@ -1,341 +1,655 @@ -1 -00:00:06,220 --> 00:00:12,290 -In a lot of our examples, you're going to -see DataCollators popping up over and over. 
- -2 -00:00:12,290 --> 00:00:18,010 -They're used in both PyTorch and TensorFlow -workflows, and maybe even in JAX, but no-one - -3 -00:00:18,010 --> 00:00:20,260 -really knows what's happening in JAX. - -4 -00:00:20,260 --> 00:00:24,590 -We have a research team working on that, so -maybe they'll tell us soon. - -5 -00:00:24,590 --> 00:00:27,869 -But what are data collators? - -6 -00:00:27,869 --> 00:00:32,230 -Data collators collate data. - -7 -00:00:32,230 --> 00:00:37,930 -More specifically, they put together a list -of samples into a single training minibatch. - -8 -00:00:37,930 --> 00:00:41,820 -For some tasks, the data collator can be very -straightforward. - -9 -00:00:41,820 --> 00:00:47,010 -For example, when you're doing sequence classification, -all you really need from your data collator - -10 -00:00:47,010 --> 00:00:53,480 -is that it pads your samples to the same length -and concatenates them into a single Tensor. - -11 -00:00:53,480 --> 00:00:58,989 -But for other workflows, data collators can -be more complex, as they handle some of the - -12 -00:00:58,989 --> 00:01:04,879 -preprocessing needed for that particular task. - -13 -00:01:04,879 --> 00:01:09,600 -For PyTorch users, you usually pass the DataCollator -to your Trainer object. - -14 -00:01:09,600 --> 00:01:15,549 -In TensorFlow, the easiest way to use a DataCollator -is to pass it to the to_tf_dataset method - -15 -00:01:15,549 --> 00:01:23,700 -of your dataset. - -16 -00:01:23,700 --> 00:01:27,420 -You'll see these approaches used in the examples -and notebooks throughout this course. - -17 -00:01:27,420 --> 00:01:28,820 -In both cases, you end up with an iterable -that's going to output collated batches, ready - -18 -00:01:28,820 --> 00:01:29,820 -for training. - -19 -00:01:29,820 --> 00:01:34,360 -Note that all of our collators take a return_tensors -argument - you can set this to "pt" to get - -20 -00:01:34,360 --> 00:01:40,820 -PyTorch Tensors, "tf" to get TensorFlow Tensors, -or "np" to get Numpy arrays. - -21 -00:01:40,820 --> 00:01:46,060 -For backward compatibility reasons, the default -value is "pt", so PyTorch users don't even - -22 -00:01:46,060 --> 00:01:51,110 -have to set this argument most of the time, -and so are often totally unaware that this - -23 -00:01:51,110 --> 00:01:52,110 -option exists. - -24 -00:01:52,110 --> 00:01:59,160 -This is a valuable lesson about how the beneficiaries -of privilege are often the most blind to its - -25 -00:01:59,160 --> 00:02:00,160 -existence. - -26 -00:02:00,160 --> 00:02:08,130 -So now let's see some specific DataCollators -in action, though remember that if none of - -27 -00:02:08,130 --> 00:02:12,069 -them do what you need, you can always write -your own! - -28 -00:02:12,069 --> 00:02:17,120 -First, we'll see the "basic" data collators. - -29 -00:02:17,120 --> 00:02:21,550 -These are DefaultDataCollator and DataCollatorWithPadding. - -30 -00:02:21,550 --> 00:02:25,550 -These are the ones you should use if your -labels are straightforward and your data doesn't - -31 -00:02:25,550 --> 00:02:28,780 -need any special processing before being ready -for training. - -32 -00:02:28,780 --> 00:02:30,100 -Most sequence classification tasks, for example, -would use one of these data collators. - -33 -00:02:30,100 --> 00:02:35,470 -Remember that because different models have -different padding tokens, DataCollatorWithPadding - -34 -00:02:35,470 --> 00:02:39,239 -will need your model's Tokenizer so it knows -how to pad sequences properly! 
- -35 -00:02:39,239 --> 00:02:41,069 -So how do you choose one of these? - -36 -00:02:41,069 --> 00:02:44,970 -Simple: As you can see here, if you have variable -sequence lengths then you should use DataCollatorWithPadding, - -37 -00:02:44,970 --> 00:02:46,690 -which will pad all your sequences to the same -length. - -38 -00:02:46,690 --> 00:02:49,819 -If you're sure all your sequences are the -same length then you can use the even simpler - -39 -00:02:49,819 --> 00:02:51,510 -DefaultDataCollator, but it'll give you an -error if that assumption is wrong! - -40 -00:02:51,510 --> 00:02:57,720 -Moving on, though, many of the other data -collators are often designed to handle one - -41 -00:02:57,720 --> 00:03:07,411 -specific task, and that's the case with DataCollatorForTokenClassification -and DataCollatorForSeqToSeq. - -42 -00:03:07,411 --> 00:03:12,830 -These tasks need special collators because -the labels are variable in length. - -43 -00:03:12,830 --> 00:03:16,580 -In token classification there's one label -for each token, and that means the length - -44 -00:03:16,580 --> 00:03:23,599 -of the labels can be variable, while in SeqToSeq -the labels are also a sequence of tokens that - -45 -00:03:23,599 --> 00:03:28,470 -can have variable length. - -46 -00:03:28,470 --> 00:03:38,580 -In both of these cases, we handle that by -padding the labels too, as you can see here. - -47 -00:03:38,580 --> 00:03:43,810 -Inputs and the labels will need to be padded -if we want to join samples of variable length - -48 -00:03:43,810 --> 00:03:50,440 -into the same minibatch, and that's exactly -what the data collators will do. - -49 -00:03:50,440 --> 00:04:01,680 -The final data collator I want to show you -is the DataCollatorForLanguageModeling. - -50 -00:04:01,680 --> 00:04:07,470 -It's very important, firstly because language -models are so foundational to everything we - -51 -00:04:07,470 --> 00:04:15,030 -do in NLP, and secondly because it has two -modes that do two very different things. - -52 -00:04:15,030 --> 00:04:21,889 -You choose which mode you want with the mlm -argument - set it to True for masked language - -53 -00:04:21,889 --> 00:04:26,729 -modeling, and False for causal language modeling. - -54 -00:04:26,729 --> 00:04:31,110 -Collating data for causal language modeling -is actually quite straightforward - the model - -55 -00:04:31,110 --> 00:04:35,962 -is just making predictions for what token -comes next, so your labels are more or less - -56 -00:04:35,962 --> 00:04:40,530 -just a copy of your inputs, and the collator -handles that and ensures your inputs and labels - -57 -00:04:40,530 --> 00:04:42,380 -are padded correctly. - -58 -00:04:42,380 --> 00:04:49,500 -When you set mlm to True, though, you get -quite different behaviour! - -59 -00:04:49,500 --> 00:04:58,250 -That's because masked language modeling requires -the labels to be, well... masked. - -60 -00:04:58,250 --> 00:05:01,539 -So what does that look like? - -61 -00:05:01,539 --> 00:05:06,860 -Recall that in masked language modeling, the -model is not predicting "the next word"; instead - -62 -00:05:06,860 --> 00:05:11,590 -we randomly mask out multiple tokens and the -model makes predictions for all of them at - -63 -00:05:11,590 --> 00:05:12,590 -once. 
- -64 -00:05:12,590 --> 00:05:18,729 -The process of random masking is surprisingly -complex, though - that's because if we follow - -65 -00:05:18,729 --> 00:05:23,770 -the protocol from the original BERT paper, -we need to replace some tokens with a masking - -66 -00:05:23,770 --> 00:05:30,080 -token, other tokens with a random token and -then keep a third set of tokens unchanged. - -67 -00:05:30,080 --> 00:05:35,919 -This isn't the lecture to go into *why* we -do that - you should check out the original - -68 -00:05:35,919 --> 00:05:40,720 -BERT paper if you're curious. - -69 -00:05:40,720 --> 00:05:46,949 -The main thing to know here is that it can -be a real pain to implement yourself, but - -70 -00:05:46,949 --> 00:05:53,300 -DataCollatorForLanguageModeling will do it -for you. - -71 -00:05:53,300 --> 00:05:57,800 -And that's it! - -72 -00:05:57,800 --> 00:06:15,410 -That covers the most commonly used data collators -and the tasks they're used for. +1 +00:00:00,670 --> 00:00:01,503 +(whooshing sound) + +2 +00:00:01,503 --> 00:00:02,469 +(sticker popping) + +3 +00:00:02,469 --> 00:00:05,302 +(whooshing sound) + +4 +00:00:06,240 --> 00:00:08,220 +In a lot of our examples, + +5 +00:00:08,220 --> 00:00:12,150 +you're going to see DataCollators +popping up over and over. + +6 +00:00:12,150 --> 00:00:16,020 +They're used in both PyTorch +and TensorFlow workflows, + +7 +00:00:16,020 --> 00:00:17,460 +and maybe even in JAX, + +8 +00:00:17,460 --> 00:00:20,130 +but no-one really knows +what's happening in JAX. + +9 +00:00:20,130 --> 00:00:21,840 +We do have a research +team working on it though, + +10 +00:00:21,840 --> 00:00:23,970 +so maybe they'll tell us soon. + +11 +00:00:23,970 --> 00:00:25,620 +But coming back on topic. + +12 +00:00:25,620 --> 00:00:27,600 +What are data collators? + +13 +00:00:27,600 --> 00:00:30,480 +Data collators collate data. + +14 +00:00:30,480 --> 00:00:31,800 +That's not that helpful. + +15 +00:00:31,800 --> 00:00:35,023 +But to be more specific, they +put together a list of samples + +16 +00:00:35,023 --> 00:00:37,830 +into a single training minibatch. + +17 +00:00:37,830 --> 00:00:38,910 +For some tasks, + +18 +00:00:38,910 --> 00:00:41,670 +the data collator can +be very straightforward. + +19 +00:00:41,670 --> 00:00:44,820 +For example, when you're +doing sequence classification, + +20 +00:00:44,820 --> 00:00:47,010 +all you really need +from your data collator + +21 +00:00:47,010 --> 00:00:49,860 +is that it pads your +samples to the same length + +22 +00:00:49,860 --> 00:00:52,413 +and concatenates them +into a single Tensor. + +23 +00:00:53,340 --> 00:00:57,750 +But for other workflows, data +collators can be quite complex + +24 +00:00:57,750 --> 00:00:59,910 +as they handle some of the preprocessing + +25 +00:00:59,910 --> 00:01:02,340 +needed for that particular task. + +26 +00:01:02,340 --> 00:01:04,800 +So, if you want to use a data collator, + +27 +00:01:04,800 --> 00:01:07,860 +for PyTorch users, you +usually pass the data collator + +28 +00:01:07,860 --> 00:01:09,780 +to your Trainer object. + +29 +00:01:09,780 --> 00:01:11,310 +In TensorFlow, it's a bit different. + +30 +00:01:11,310 --> 00:01:12,960 +The easiest way to use a data collator + +31 +00:01:12,960 --> 00:01:16,860 +is to pass it to the to_tf_dataset +method of your dataset. + +32 +00:01:16,860 --> 00:01:20,198 +And this will give you a +tensorflow_tf_data.dataset + +33 +00:01:20,198 --> 00:01:22,743 +that you can then pass to model.fit. 
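A short sketch of the two usage patterns just described, assuming a tokenized dataset named tokenized_dataset; the checkpoint, column names, and batch size are illustrative assumptions.

from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # illustrative checkpoint

# TensorFlow: pass the collator to to_tf_dataset to get a tf.data.Dataset for model.fit.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
tf_dataset = tokenized_dataset.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    label_cols=["labels"],
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,
)

# PyTorch: hand the collator (return_tensors="pt" is the default) to the Trainer instead.
# trainer = Trainer(model=model, args=training_args,
#                   train_dataset=tokenized_dataset,
#                   data_collator=DataCollatorWithPadding(tokenizer=tokenizer))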
+ +34 +00:01:23,580 --> 00:01:25,890 +You'll see these approaches +used in the examples + +35 +00:01:25,890 --> 00:01:28,068 +and notebooks throughout this course. + +36 +00:01:28,068 --> 00:01:30,180 +Also note that all of our collators + +37 +00:01:30,180 --> 00:01:32,610 +take a return_tensors argument. + +38 +00:01:32,610 --> 00:01:35,737 +You can set this to "pt" +to get PyTorch Tensors, + +39 +00:01:35,737 --> 00:01:37,920 +"tf" to get TensorFlow Tensors, + +40 +00:01:37,920 --> 00:01:40,404 +or "np" to get Numpy arrays. + +41 +00:01:40,404 --> 00:01:42,450 +For backward compatibility reasons, + +42 +00:01:42,450 --> 00:01:44,460 +the default value is "pt", + +43 +00:01:44,460 --> 00:01:47,160 +so PyTorch users don't even +have to set this argument + +44 +00:01:47,160 --> 00:01:48,270 +most of the time. + +45 +00:01:48,270 --> 00:01:50,820 +And so as a result, they're +often totally unaware + +46 +00:01:50,820 --> 00:01:52,713 +that this argument even exists. + +47 +00:01:53,730 --> 00:01:55,050 +We can learn something from this + +48 +00:01:55,050 --> 00:01:57,120 +which is that the +beneficiaries of privilege + +49 +00:01:57,120 --> 00:01:59,793 +are often the most blind to its existence. + +50 +00:02:00,690 --> 00:02:01,920 +But okay, coming back. + +51 +00:02:01,920 --> 00:02:06,540 +Let's see how some specific +data collators work in action. + +52 +00:02:06,540 --> 00:02:08,070 +Although again, remember if none + +53 +00:02:08,070 --> 00:02:09,900 +of the built-in data +collators do what you need, + +54 +00:02:09,900 --> 00:02:13,650 +you can always write your own +and they're often quite short. + +55 +00:02:13,650 --> 00:02:16,950 +So first, we'll see the +"basic" data collators. + +56 +00:02:16,950 --> 00:02:20,433 +These are DefaultDataCollator +and DataCollatorWithPadding. + +57 +00:02:21,420 --> 00:02:22,830 +These are the ones you should use + +58 +00:02:22,830 --> 00:02:24,720 +if your labels are straightforward + +59 +00:02:24,720 --> 00:02:27,300 +and your data doesn't need +any special processing + +60 +00:02:27,300 --> 00:02:29,673 +before being ready for training. + +61 +00:02:29,673 --> 00:02:31,272 +Notice that because different models + +62 +00:02:31,272 --> 00:02:33,690 +have different padding tokens, + +63 +00:02:33,690 --> 00:02:37,170 +DataCollatorWithPadding will +need your model's Tokenizer + +64 +00:02:37,170 --> 00:02:40,150 +so it knows how to pad sequences properly. + +65 +00:02:40,150 --> 00:02:44,790 +The default data collator +doesn't need a Tokenizer to work, + +66 +00:02:44,790 --> 00:02:46,710 +but it will as a result throw an error + +67 +00:02:46,710 --> 00:02:48,900 +unless all of your sequences +are the same length. + +68 +00:02:48,900 --> 00:02:50,500 +So, you should be aware of that. + +69 +00:02:51,480 --> 00:02:52,860 +Moving on though. + +70 +00:02:52,860 --> 00:02:54,300 +A lot of the other data collators + +71 +00:02:54,300 --> 00:02:56,130 +aside from the basic two are, + +72 +00:02:56,130 --> 00:02:59,490 +they're usually designed to +handle one specific task. + +73 +00:02:59,490 --> 00:03:01,050 +And so, I'm going to show a couple here. + +74 +00:03:01,050 --> 00:03:04,320 +These are +DataCollatorForTokenClassification + +75 +00:03:04,320 --> 00:03:06,447 +and DataCollatorForSeqToSeq. + +76 +00:03:06,447 --> 00:03:09,540 +And the reason these tasks +need special collators + +77 +00:03:09,540 --> 00:03:12,600 +is because their labels +are variable in length. 
+ +78 +00:03:12,600 --> 00:03:15,960 +In token classification there's +one label for each token, + +79 +00:03:15,960 --> 00:03:17,400 +and so the length of the labels + +80 +00:03:17,400 --> 00:03:18,993 +is the length of the sequence. + +81 +00:03:20,280 --> 00:03:23,520 +While in SeqToSeq the labels +are a sequence of tokens + +82 +00:03:23,520 --> 00:03:24,780 +that can be variable length, + +83 +00:03:24,780 --> 00:03:25,800 +that can be very different + +84 +00:03:25,800 --> 00:03:28,200 +from the length of the input sequence. + +85 +00:03:28,200 --> 00:03:32,880 +So in both of these cases, we +handle collating that batch + +86 +00:03:32,880 --> 00:03:35,280 +by padding the labels as well, + +87 +00:03:35,280 --> 00:03:37,410 +as you can see here in this example. + +88 +00:03:37,410 --> 00:03:40,770 +So, inputs and the labels +will need to be padded + +89 +00:03:40,770 --> 00:03:43,860 +if we want to join +samples of variable length + +90 +00:03:43,860 --> 00:03:45,120 +into the same minibatch. + +91 +00:03:45,120 --> 00:03:47,520 +That's exactly what the data collators + +92 +00:03:47,520 --> 00:03:50,460 +and that's exactly what these +data collators will do for us + +93 +00:03:50,460 --> 00:03:52,383 +you know, for this particular task. + +94 +00:03:53,820 --> 00:03:56,070 +So, there's one final data collator + +95 +00:03:56,070 --> 00:03:58,560 +I want to show you as +well just in this lecture. + +96 +00:03:58,560 --> 00:04:00,473 +And that's the +DataCollatorForLanguageModeling. + +97 +00:04:01,410 --> 00:04:03,390 +So, it's very important, and it's firstly, + +98 +00:04:03,390 --> 00:04:05,820 +because language models +are just so foundational + +99 +00:04:05,820 --> 00:04:09,720 +to do for everything we +do with NLP these days. + +100 +00:04:09,720 --> 00:04:12,060 +But secondly, because it has two modes + +101 +00:04:12,060 --> 00:04:14,760 +that do two very different things. + +102 +00:04:14,760 --> 00:04:19,230 +So you choose which mode you +want with the mlm argument. + +103 +00:04:19,230 --> 00:04:22,470 +Set it to True for +masked language modeling, + +104 +00:04:22,470 --> 00:04:26,190 +and set it to False for +causal language modeling. + +105 +00:04:26,190 --> 00:04:28,620 +So, collating data for +causal language modeling + +106 +00:04:28,620 --> 00:04:30,750 +is actually quite straightforward. + +107 +00:04:30,750 --> 00:04:32,640 +The model is just making predictions + +108 +00:04:32,640 --> 00:04:35,460 +for what token comes +next, and so your labels + +109 +00:04:35,460 --> 00:04:37,800 +are more or less just +a copy of your inputs, + +110 +00:04:37,800 --> 00:04:39,090 +and the collator will handle that + +111 +00:04:39,090 --> 00:04:42,240 +and ensure that the inputs and +labels are padded correctly. + +112 +00:04:42,240 --> 00:04:44,910 +When you set mlm to True though, + +113 +00:04:44,910 --> 00:04:46,786 +you get quite different behavior, + +114 +00:04:46,786 --> 00:04:49,200 +that's different from +any other data collator, + +115 +00:04:49,200 --> 00:04:51,660 +and that's because setting mlm to True + +116 +00:04:51,660 --> 00:04:53,550 +means masked language modeling + +117 +00:04:53,550 --> 00:04:55,680 +and that means the labels need to be, + +118 +00:04:55,680 --> 00:04:58,080 +you know, the inputs need to be masked. + +119 +00:04:58,080 --> 00:05:00,093 +So, what does that look like? 
+ +120 +00:05:01,050 --> 00:05:03,900 +So, recall that in +masked language modeling, + +121 +00:05:03,900 --> 00:05:06,570 +the model is not predicting the next word, + +122 +00:05:06,570 --> 00:05:09,240 +instead we randomly mask out some tokens + +123 +00:05:09,240 --> 00:05:11,130 +and the model predicts +all of them at once. + +124 +00:05:11,130 --> 00:05:12,780 +So, it tries to kinda fill in the blanks + +125 +00:05:12,780 --> 00:05:14,790 +for those masked tokens. + +126 +00:05:14,790 --> 00:05:18,210 +But the process of random +masking is surprisingly complex. + +127 +00:05:18,210 --> 00:05:21,330 +If we follow the protocol +from the original BERT paper, + +128 +00:05:21,330 --> 00:05:23,970 +we need to replace some +tokens with a masked token, + +129 +00:05:23,970 --> 00:05:26,190 +some other tokens with a random token, + +130 +00:05:26,190 --> 00:05:29,820 +and then keep a third +set of tokens unchanged. + +131 +00:05:29,820 --> 00:05:30,840 +Yeah, this is not the lecture + +132 +00:05:30,840 --> 00:05:33,903 +to go into the specifics +of that or why we do it. + +133 +00:05:33,903 --> 00:05:36,660 +You can always check out +the original BERT paper + +134 +00:05:36,660 --> 00:05:37,493 +if you're curious. + +135 +00:05:37,493 --> 00:05:39,620 +It's well written. It's +easy to understand. + +136 +00:05:40,650 --> 00:05:44,190 +The main thing to know here +is that it can be a real pain + +137 +00:05:44,190 --> 00:05:46,770 +and quite complex to +implement that yourself. + +138 +00:05:46,770 --> 00:05:49,740 +But DataCollatorForLanguageModeling +will do it for you + +139 +00:05:49,740 --> 00:05:51,750 +when you set mlm to True. + +140 +00:05:51,750 --> 00:05:54,690 +And that's an example +of the more intricate + +141 +00:05:54,690 --> 00:05:57,870 +preprocessing that some +of our data collators do. + +142 +00:05:57,870 --> 00:05:59,430 +And that's it! + +143 +00:05:59,430 --> 00:06:01,920 +So, this covers the most +commonly used data collators + +144 +00:06:01,920 --> 00:06:03,480 +and the tasks they're used for. + +145 +00:06:03,480 --> 00:06:06,990 +And hopefully, now you'll know +when to use data collators + +146 +00:06:06,990 --> 00:06:10,833 +and which one to choose +for your specific task. + +147 +00:06:11,765 --> 00:06:14,598 +(whooshing sound) + diff --git a/subtitles/en/69_what-to-do-when-you-get-an-error.srt b/subtitles/en/69_what-to-do-when-you-get-an-error.srt index 6a3df254f..f3907616f 100644 --- a/subtitles/en/69_what-to-do-when-you-get-an-error.srt +++ b/subtitles/en/69_what-to-do-when-you-get-an-error.srt @@ -1,134 +1,271 @@ -1 -00:00:05,440 --> 00:00:13,760 -In this video, we will learn the first things  -to do when you get an error. Let's say we want   - -2 -00:00:13,760 --> 00:00:18,320 -to use the question answering pipeline on  -a particular model and we get the following   - -3 -00:00:18,320 --> 00:00:24,160 -error. Errors in Python can appear overwhelming  -because you get so much information printed out,   - -4 -00:00:24,160 --> 00:00:28,160 -but that's because Python is trying to help  -you the best it can to solve your problem.   - -5 -00:00:28,880 --> 00:00:32,000 -In this video we will see how to  -interpret the error report we get.   - -6 -00:00:33,280 --> 00:00:37,920 -The first thing to notice at the very top is  -that Python shows you with a clear arrow the   - -7 -00:00:37,920 --> 00:00:42,400 -line of code that triggered the error. 
So you  -don't have to fiddle with your code and remove   - -8 -00:00:42,400 --> 00:00:47,520 -random lines to figure out where the error comes  -from, you have the answer in front right here.   - -9 -00:00:48,880 --> 00:00:53,280 -The arrows you see below are the parts of the  -code Python tried to execute while running the   - -10 -00:00:53,280 --> 00:00:59,600 -instruction: here we are inside the pipeline  -function and the error came on this line while   - -11 -00:00:59,600 --> 00:01:04,800 -trying to execute the function check_tasks,  -which then raised the KeyError we see displayed.   - -12 -00:01:06,480 --> 00:01:11,600 -Note that Python tells you exactly where the  -functions it's executing live, so if you feel   - -13 -00:01:11,600 --> 00:01:17,680 -adventurous, you can even go inspect the source  -code. This whole thing is called the traceback.   - -14 -00:01:19,840 --> 00:01:23,600 -If you are running your code on Colab,  -the Traceback is automatically minimized,   - -15 -00:01:23,600 --> 00:01:29,920 -so you have to click to expand it. At the very  -end of the traceback, you finally get the actual   - -16 -00:01:29,920 --> 00:01:34,960 -error message. The first thing you should do  -when encountering an error is to read that   - -17 -00:01:34,960 --> 00:01:40,640 -error message. Here it's telling us it doesn't  -know the question answering task, and helpfully   - -18 -00:01:40,640 --> 00:01:46,560 -gives us the list of supported tasks... in  -which we can see that question answering is.   - -19 -00:01:47,280 --> 00:01:51,680 -Looking more closely though, we used  -an underscore to separate the two words   - -20 -00:01:51,680 --> 00:01:55,040 -when the task is written with  -a minus, so we should fix that!   - -21 -00:01:57,280 --> 00:02:02,160 -Now let's retry our code with the task properly  -written and what is happening today? Another   - -22 -00:02:02,160 --> 00:02:08,000 -error! As we saw before, we go look at the bottom  -to read the actual error message. It's telling us   - -23 -00:02:08,000 --> 00:02:13,600 -that we should check our model is a correct model  -identifier, so let's hop on to hf.co/models.   - -24 -00:02:14,480 --> 00:02:18,320 -We can see our model listed there in the  -ones available for question answering.   - -25 -00:02:19,120 --> 00:02:22,480 -The difference is that it's  -spelled distilbert with one l,   - -26 -00:02:22,480 --> 00:02:28,960 -and we used two. So let's fix that. We finally  -get our results! If your error is more complex,   - -27 -00:02:28,960 --> 00:02:35,840 -you might need to use the Python debugger,  -check out the videos linked below to learn how! +1 +00:00:00,380 --> 00:00:02,463 +(whoosh) + +2 +00:00:05,550 --> 00:00:07,590 +- In this video we'll +learn the first things to + +3 +00:00:07,590 --> 00:00:09,330 +do when you get an error. + +4 +00:00:09,330 --> 00:00:11,930 +This is not throwing your +laptop through the window. + +5 +00:00:13,320 --> 00:00:15,450 +Let's say we want to use the +question answering pipeline + +6 +00:00:15,450 --> 00:00:19,470 +on a particular model and +we get the following error. + +7 +00:00:19,470 --> 00:00:21,750 +Errors in Python can appear overwhelming + +8 +00:00:21,750 --> 00:00:24,390 +because you get so much +information printed out + +9 +00:00:24,390 --> 00:00:26,610 +but that's because Python +is trying to help you + +10 +00:00:26,610 --> 00:00:29,070 +the best it can to solve your problem. 
+
+11
+00:00:29,070 --> 00:00:31,260
+In this video, we'll see how to interpret

+12
+00:00:31,260 --> 00:00:32,460
+the error report we get.

+13
+00:00:33,510 --> 00:00:35,700
+The first thing to notice at the very top

+14
+00:00:35,700 --> 00:00:38,070
+is that Python shows
+you with a clear arrow

+15
+00:00:38,070 --> 00:00:40,320
+the line of code that triggers the error

+16
+00:00:40,320 --> 00:00:42,210
+so you don't have to fiddle with your code

+17
+00:00:42,210 --> 00:00:43,800
+and remove random lines to figure out

+18
+00:00:43,800 --> 00:00:45,540
+where the error comes from.

+19
+00:00:45,540 --> 00:00:47,890
+You have the answer in
+front of you right here.

+20
+00:00:49,140 --> 00:00:51,360
+The arrows you see below
+are the parts of the code

+21
+00:00:51,360 --> 00:00:54,930
+Python tried to execute while
+running the instruction.

+22
+00:00:54,930 --> 00:00:57,750
+Here we are inside the pipeline function

+23
+00:00:57,750 --> 00:00:59,490
+and the error came on this line

+24
+00:00:59,490 --> 00:01:02,520
+while trying to execute
+the function "check_tasks,"

+25
+00:01:02,520 --> 00:01:05,103
+which then raised the
+KeyError we see displayed.

+26
+00:01:06,630 --> 00:01:08,580
+Note that Python tells you exactly

+27
+00:01:08,580 --> 00:01:11,190
+where the function it's executing lives,

+28
+00:01:11,190 --> 00:01:12,810
+so if you feel adventurous

+29
+00:01:12,810 --> 00:01:14,810
+you can even go inspect the source code.

+30
+00:01:15,900 --> 00:01:18,447
+This whole thing is
+called the "Traceback."

+31
+00:01:20,010 --> 00:01:21,870
+If you're running your code on Colab

+32
+00:01:21,870 --> 00:01:23,820
+the Traceback is automatically minimized,

+33
+00:01:23,820 --> 00:01:25,833
+so you have to click to expand it.

+34
+00:01:26,820 --> 00:01:28,530
+At the very end of the Traceback

+35
+00:01:28,530 --> 00:01:31,890
+you finally get the actual error message.

+36
+00:01:31,890 --> 00:01:33,660
+The first thing you should
+do when encountering

+37
+00:01:33,660 --> 00:01:36,480
+an error is to read that error message.

+38
+00:01:36,480 --> 00:01:38,640
+Here it's telling us it doesn't know

+39
+00:01:38,640 --> 00:01:40,230
+the question answering task

+40
+00:01:40,230 --> 00:01:41,760
+and helpfully gives us the list

+41
+00:01:41,760 --> 00:01:44,850
+of supported tasks in which we can see

+42
+00:01:44,850 --> 00:01:47,520
+that "question-answering" actually is.

+43
+00:01:47,520 --> 00:01:49,200
+Looking more closely though,

+44
+00:01:49,200 --> 00:01:52,020
+we used an underscore to
+separate the two words

+45
+00:01:52,020 --> 00:01:54,300
+when the task is written with a minus,

+46
+00:01:54,300 --> 00:01:55,413
+so we should fix that.

+47
+00:01:57,510 --> 00:02:00,360
+Now let's retry our code with
+the task properly written

+48
+00:02:00,360 --> 00:02:01,920
+and what is happening today?

+49
+00:02:01,920 --> 00:02:03,210
+Another error.

+50
+00:02:03,210 --> 00:02:05,670
+As we said before, we
+go look at the bottom

+51
+00:02:05,670 --> 00:02:07,560
+to read the actual error message.

+52
+00:02:07,560 --> 00:02:09,000
+It's telling us that we should check

+53
+00:02:09,000 --> 00:02:11,340
+our model is a correct model identifier,

+54
+00:02:11,340 --> 00:02:14,760
+so let's hop onto hf.co/models.

+55
+00:02:14,760 --> 00:02:16,440
+We can see our model listed there

+56
+00:02:16,440 --> 00:02:19,440
+in the ones available
+for question answering.
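Putting both fixes from this video together, the corrected call looks roughly like the sketch below: the task name is written with a hyphen and the model identifier is copied exactly from hf.co/models. The checkpoint shown is only an example, not necessarily the one used on screen.

```python
from transformers import pipeline

question_answerer = pipeline(
    "question-answering",  # hyphen, not an underscore
    model="distilbert-base-cased-distilled-squad",  # example checkpoint, spelled as on the Hub
)

result = question_answerer(
    question="Where can I check model identifiers?",
    context="Model identifiers can be verified on hf.co/models before being used in a pipeline.",
)
print(result)
```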
+ +57 +00:02:19,440 --> 00:02:21,720 +The difference is that +it's spelled "distilbert" + +58 +00:02:21,720 --> 00:02:24,240 +with one L, and we use two, + +59 +00:02:24,240 --> 00:02:25,650 +so let's fix that. + +60 +00:02:25,650 --> 00:02:27,570 +We finally get our results. + +61 +00:02:27,570 --> 00:02:29,160 +If our error is more complex, + +62 +00:02:29,160 --> 00:02:31,290 +you might need to use the Python debugger. + +63 +00:02:31,290 --> 00:02:33,483 +Check out the videos below to learn how. + diff --git a/subtitles/en/70_using-a-debugger-in-a-notebook.srt b/subtitles/en/70_using-a-debugger-in-a-notebook.srt index 99ffa2f4b..f543f4b30 100644 --- a/subtitles/en/70_using-a-debugger-in-a-notebook.srt +++ b/subtitles/en/70_using-a-debugger-in-a-notebook.srt @@ -1,132 +1,319 @@ -1 -00:00:05,280 --> 00:00:11,760 -Using the Python debugger in a notebook. In  -this video, we'll learn how to use the Python   - -2 -00:00:11,760 --> 00:00:17,040 -debugger in a Jupyter Notebook or a Colab.  -For this example, we are running code from   - -3 -00:00:17,040 --> 00:00:24,640 -the token classification section, downloading  -the Conll dataset , having a look at it   - -4 -00:00:27,600 --> 00:00:30,080 -before loading a tokenizer to preprocess it.   - -5 -00:00:32,640 --> 00:00:35,440 -Checkout the section of the course  -linked below for more information.   - -6 -00:00:36,800 --> 00:00:42,240 -Once this is done, we try to batch together some  -features of the training dataset by padding them   - -7 -00:00:44,960 --> 00:00:51,280 -and returning a tensor, then we get the  -following error. We use PyTorch here but   - -8 -00:00:51,280 --> 00:00:56,240 -you will get the same error with TensorFlow. As we  -have seen in the "How to debug an error?" video,   - -9 -00:00:56,240 --> 00:01:02,480 -the error message is at the end and it indicates  -we should use padding, which we are actually   - -10 -00:01:02,480 --> 00:01:07,680 -trying to do. So this is not useful and we will  -need to go a little deeper to debug the problem.   - -11 -00:01:08,400 --> 00:01:13,040 -Fortunately, you can use the Python debugger at  -any time you get an error in a Jupyter Notebook   - -12 -00:01:13,040 --> 00:01:23,680 -by typing %debug in any cell. When executing that  -cell, you go to the very bottom of the traceback   - -13 -00:01:23,680 --> 00:01:28,560 -where you can type commands and you can type  -commands. The first two commands you should   - -14 -00:01:28,560 --> 00:01:41,760 -learn are u and d (for up and down), which  -allow you to go up in the Traceback or down.   - -15 -00:01:43,920 --> 00:01:46,720 -Going up twice, we get to the  -point the error was reached.   - -16 -00:01:47,600 --> 00:01:53,840 -The third command to learn is p, for print.  -It allows you to print any value you want.   - -17 -00:01:54,560 --> 00:02:00,720 -For instance here, we can see the value  -of return_tensors or batch_outputs   - -18 -00:02:00,720 --> 00:02:11,520 -to try to understand what triggered the error.  -The batch outputs dictionary is a bit hard to see,   - -19 -00:02:12,720 --> 00:02:18,160 -so let's dive into smaller pieces of it. Inside  -the debugger you can not only print any variable   - -20 -00:02:18,160 --> 00:02:28,240 -but also evaluate any expression, so we can  -look independently at the inputs or labels.   - -21 -00:02:35,440 --> 00:02:41,360 -Those labels are definitely weird: they are of  -various size, which we can actually confirm by   - -22 -00:02:41,360 --> 00:02:49,840 -printing the sizes. 
No wonder the tokenizer  -wasn't able to create a tensor with them!   - -23 -00:02:52,160 --> 00:02:56,880 -This is because the pad method only  -takes care of the tokenizer outptus:   - -24 -00:02:56,880 --> 00:02:59,680 -input IDs, attention mask and token type IDs,   - -25 -00:03:00,240 --> 00:03:03,840 -so we have to pad the labels ourselves  -before trying to create a tensor with them.   - -26 -00:03:05,040 --> 00:03:11,440 -Once you are ready to exit the Python debugger,  -you can press q for quit. One way to fix the error   - -27 -00:03:11,440 --> 00:03:21,600 -is to manually pad all labels to the longest, or  -we can use the data collator designed for this. +1 +00:00:05,400 --> 00:00:08,150 +- [Instructor] Using the +Python debugger in a notebook. + +2 +00:00:09,540 --> 00:00:12,330 +In this video, we'll learn +how to use the Python debugger + +3 +00:00:12,330 --> 00:00:15,027 +in a Jupyter Notebook or a Colab. + +4 +00:00:15,027 --> 00:00:17,070 +For this example, we are running code + +5 +00:00:17,070 --> 00:00:19,775 +from the token classification section, + +6 +00:00:19,775 --> 00:00:21,513 +downloading the Conll dataset, + +7 +00:00:23,670 --> 00:00:25,503 +looking a little bit at data, + +8 +00:00:27,840 --> 00:00:29,250 +before loading a tokenizer + +9 +00:00:29,250 --> 00:00:31,173 +to preprocess the whole dataset. + +10 +00:00:32,880 --> 00:00:34,740 +Check out the section of +the course linked below + +11 +00:00:34,740 --> 00:00:35,823 +for more information. + +12 +00:00:37,080 --> 00:00:38,520 +Once this is done, + +13 +00:00:38,520 --> 00:00:41,580 +we try to load eight features +of the training dataset, + +14 +00:00:41,580 --> 00:00:43,080 +and then batch them together, + +15 +00:00:43,080 --> 00:00:45,210 +using tokenizer.pad, + +16 +00:00:45,210 --> 00:00:46,760 +and we get the following error. + +17 +00:00:48,090 --> 00:00:49,230 +We use PyTorch here, + +18 +00:00:49,230 --> 00:00:51,330 +with return_tensors="pt" + +19 +00:00:51,330 --> 00:00:53,273 +but you will get the same +error with TensorFlow. + +20 +00:00:54,120 --> 00:00:55,897 +As we have seen in the "How +to debug an error?" video, + +21 +00:00:55,897 --> 00:00:59,160 +the error message is at +the end of the traceback. + +22 +00:00:59,160 --> 00:01:01,710 +Here, it indicates us +we should use padding, + +23 +00:01:01,710 --> 00:01:04,290 +which we are actually trying to do. + +24 +00:01:04,290 --> 00:01:05,610 +So this is not useful at all, + +25 +00:01:05,610 --> 00:01:06,990 +and we will need to go a little deeper + +26 +00:01:06,990 --> 00:01:08,610 +to debug the problem. + +27 +00:01:08,610 --> 00:01:10,650 +Fortunately, you can +use the Python debugger + +28 +00:01:10,650 --> 00:01:13,170 +at any time you get an +error in a Jupyter Notebook + +29 +00:01:13,170 --> 00:01:16,350 +by typing the magic +command, debug, in a cell. + +30 +00:01:16,350 --> 00:01:18,450 +Don't forget the percent at the beginning. + +31 +00:01:20,400 --> 00:01:21,870 +When executing that cell, + +32 +00:01:21,870 --> 00:01:23,910 +you go to the very bottom of the traceback + +33 +00:01:23,910 --> 00:01:25,320 +where you can type commands + +34 +00:01:25,320 --> 00:01:27,690 +that will help you debug your script. + +35 +00:01:27,690 --> 00:01:29,250 +The first two commands you should learn, + +36 +00:01:29,250 --> 00:01:32,040 +are u and d, for up and down. + +37 +00:01:32,040 --> 00:01:36,090 +Typing u and enter will +take you up one step + +38 +00:01:36,090 --> 00:01:38,910 +in the traceback to the +previous instruction. 
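For reference, here is a minimal, self-contained stand-in for the failing cell described above. The real notebook builds its features from the Conll dataset, so the sentences and label lists below are made up; only the shape of the problem is the same.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # placeholder checkpoint

# Stand-ins for features of a token classification dataset:
# the "labels" lists have different lengths, just like in the real data.
features = [
    {**tokenizer("A short sentence."), "labels": [0, 1, 0]},
    {**tokenizer("A noticeably longer example sentence."), "labels": [0, 1, 2, 0, 1]},
]

# Raises the error discussed above: tokenizer.pad only pads input_ids,
# attention_mask and token_type_ids, so the ragged labels cannot be
# converted to a tensor.
batch = tokenizer.pad(features, return_tensors="pt")
```

Running %debug in the next cell then drops you at the bottom of that traceback, where the commands described here let you move around and print values.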
+
+39
+00:01:38,910 --> 00:01:41,190
+Typing d and then enter will take you

+40
+00:01:41,190 --> 00:01:43,023
+one step down in the traceback.

+41
+00:01:44,130 --> 00:01:47,910
+Going up twice, we get to the
+point the error was reached.

+42
+00:01:47,910 --> 00:01:51,510
+The third command to learn for
+the debugger is p, for print.

+43
+00:01:51,510 --> 00:01:54,780
+It allows you to print any value you want.

+44
+00:01:54,780 --> 00:01:58,740
+For instance, typing p
+return_tensors and enter,

+45
+00:01:58,740 --> 00:02:02,893
+we see the value pt that we
+passed to the pad function.

+46
+00:02:02,893 --> 00:02:05,370
+We can also have a look
+at the batch outputs

+47
+00:02:05,370 --> 00:02:07,353
+this BatchEncoding object gets.

+48
+00:02:09,480 --> 00:02:12,600
+The batch outputs dictionary
+is a bit hard to dig in to,

+49
+00:02:12,600 --> 00:02:15,360
+so let's dive into smaller pieces of it.

+50
+00:02:15,360 --> 00:02:18,390
+Inside the debugger you can
+not only print any variable

+51
+00:02:18,390 --> 00:02:20,970
+but also evaluate any expression,

+52
+00:02:20,970 --> 00:02:23,610
+for instance, we can have a
+look at the input_ids key of

+53
+00:02:23,610 --> 00:02:25,203
+this batch_outputs object.

+54
+00:02:27,600 --> 00:02:30,693
+Or at the labels key of
+this batch_outputs object.

+55
+00:02:35,730 --> 00:02:37,320
+Those labels are definitely weird:

+56
+00:02:37,320 --> 00:02:38,970
+they are of various sizes,

+57
+00:02:38,970 --> 00:02:41,340
+which we can actually confirm, if we want,

+58
+00:02:41,340 --> 00:02:43,983
+by printing the sizes with
+a list comprehension.

+59
+00:02:52,290 --> 00:02:54,913
+This is because the pad
+method of the tokenizer

+60
+00:02:54,913 --> 00:02:57,090
+only takes care of the tokenizer outputs:

+61
+00:02:57,090 --> 00:03:00,450
+input IDs, attention
+mask, and token type IDs,

+62
+00:03:00,450 --> 00:03:02,340
+so we have to pad the labels ourselves

+63
+00:03:02,340 --> 00:03:05,310
+before trying to create
+a tensor with them.

+64
+00:03:05,310 --> 00:03:07,260
+Once you are ready to
+exit the Python debugger,

+65
+00:03:07,260 --> 00:03:09,453
+you can press q and enter for quit.

+66
+00:03:10,320 --> 00:03:11,670
+One way to fix the error

+67
+00:03:11,670 --> 00:03:14,313
+is to manually pad the
+labels to the longest.

+68
+00:03:15,300 --> 00:03:17,400
+Another way is to use a data collator

+69
+00:03:17,400 --> 00:03:19,863
+specifically designed
+for token classification.

+70
+00:03:20,970 --> 00:03:22,950
+You can also use a
+Python debugger directly

+71
+00:03:22,950 --> 00:03:23,850
+in the terminal.

+72
+00:03:23,850 --> 00:03:25,943
+Check out the video
+linked below to learn how.

+
diff --git a/subtitles/en/71_using-a-debugger-in-a-terminal.srt b/subtitles/en/71_using-a-debugger-in-a-terminal.srt
index 0971982b8..acc0af0be 100644
--- a/subtitles/en/71_using-a-debugger-in-a-terminal.srt
+++ b/subtitles/en/71_using-a-debugger-in-a-terminal.srt
@@ -1,153 +1,350 @@
-1
-00:00:05,840 --> 00:00:11,520
-Using the Python debugger in a terminal. In this  
-video, we'll learn how to use the Python debugger   

-2
-00:00:11,520 --> 00:00:16,800
-in a terminal. For this example, we are running  
-code from the token classification section,   

-3
-00:00:17,600 --> 00:00:22,320
-downloading the Conll dataset before  
-loading a tokenizer to preprocess it.   

-4
-00:00:23,200 --> 00:00:28,720
-Checkout the section of the course linked below  
-for more information. 
Once this is done, we try   - -5 -00:00:28,720 --> 00:00:34,240 -to batch together some features of the training  -dataset by padding them and returning a tensor,   - -6 -00:00:37,200 --> 00:00:40,160 -then we get the following error.   - -7 -00:00:42,800 --> 00:00:47,280 -We use PyTorch here but you will  -get the same error with TensorFlow.   - -8 -00:00:49,280 --> 00:00:53,680 -As we have seen in the "How to debug an  -error?" video, the error message is at the end   - -9 -00:00:53,680 --> 00:00:58,640 -and it indicates we should use padding... which  -we are actually trying to do. So this is not   - -10 -00:00:58,640 --> 00:01:03,360 -useful and we will need to go a little deeper  -to debug the problem. Fortunately, you can use   - -11 -00:01:03,360 --> 00:01:10,400 -the Python debugger quite easily in a terminal by  -launching your script with python -m pdb instead   - -12 -00:01:10,400 --> 00:01:17,200 -of just python. When executing that command, you  -are sent to the first instruction of your script.   - -13 -00:01:17,200 --> 00:01:25,840 -You can run just the next instruction by typing  -n, or continue to the error by directly typing c.   - -14 -00:01:29,680 --> 00:01:33,120 -Once there, you go to the very bottom of  -the traceback, and you can type commands.   - -15 -00:01:34,000 --> 00:01:40,160 -The first two commands you should learn are u and  -d (for up and down), which allow you to go up in   - -16 -00:01:40,160 --> 00:01:48,320 -the Traceback or down. Going up twice, we get to  -the point the error was reached. The third command   - -17 -00:01:48,320 --> 00:01:54,000 -to learn is p, for print. It allows you to print  -any value you want. For instance here, we can see   - -18 -00:01:54,000 --> 00:01:59,120 -the value of return_tensors or batch_outputs  -to try to understand what triggered the error.   - -19 -00:02:00,000 --> 00:02:04,720 -The batch outputs dictionary is a bit hard to  -see, so let's dive into smaller pieces of it.   - -20 -00:02:05,360 --> 00:02:10,560 -Inside the debugger you can not only print  -any variable but also evaluate any expression,   - -21 -00:02:10,560 --> 00:02:23,600 -so we can look independently at the inputs  -or labels. Those labels are definitely weird:   - -22 -00:02:24,160 --> 00:02:27,920 -they are of various size, which we can  -actually confirm by printing the sizes.   - -23 -00:02:35,760 --> 00:02:40,160 -No wonder the tokenizer wasn't able to create  -a tensor with them! This is because the pad   - -24 -00:02:40,160 --> 00:02:45,840 -method only takes care of the tokenizer outputs:  -input IDs, attention mask and token type IDs,   - -25 -00:02:46,400 --> 00:02:50,080 -so we have to pad the labels ourselves  -before trying to create a tensor with them.   - -26 -00:02:51,120 --> 00:02:56,880 -Once you are ready to exit the Python  -debugger, you can press q for quit. Another   - -27 -00:02:56,880 --> 00:03:03,600 -way we can access the Python debugger is to set  -a "set_trace" instruction where we want in the   - -28 -00:03:10,480 --> 00:03:23,280 -script. It will interrupt the execution and  -launch the Python debugger at this place, and we   - -29 -00:03:23,280 --> 00:03:32,080 -can inspect all the variables before the next  -instruction is executed. Typing n executes the   - -30 -00:03:32,080 --> 00:03:37,280 -next instruction, which takes us back inside  -the traceback. 
One way to fix the error is   

-31
-00:03:37,280 --> 00:03:49,760
-to manually pad all labels to the longest, or  
-we can use the data collator designed for this.
+1
+00:00:00,459 --> 00:00:03,542
+(wind swiping sound)

+2
+00:00:05,880 --> 00:00:08,910
+- [Instructor] Using the
+Python debugger in a terminal.

+3
+00:00:08,910 --> 00:00:11,580
+In this video, we'll learn
+how to use a Python debugger

+4
+00:00:11,580 --> 00:00:13,140
+in a terminal.

+5
+00:00:13,140 --> 00:00:15,390
+For this example, we're running code

+6
+00:00:15,390 --> 00:00:17,760
+from the token classification section,

+7
+00:00:17,760 --> 00:00:19,950
+downloading the Conll dataset

+8
+00:00:19,950 --> 00:00:23,340
+before loading a tokenizer
+to pre-process it.

+9
+00:00:23,340 --> 00:00:25,140
+Check out the section
+of the course linked below

+10
+00:00:25,140 --> 00:00:26,223
+for more information.

+11
+00:00:27,600 --> 00:00:28,500
+Once this is done,

+12
+00:00:28,500 --> 00:00:30,630
+we try to batch together some features

+13
+00:00:30,630 --> 00:00:33,180
+of the training dataset by padding them

+14
+00:00:33,180 --> 00:00:34,330
+and returning a tensor.

+15
+00:00:36,810 --> 00:00:39,510
+If we try to execute our
+scripts in a terminal

+16
+00:00:39,510 --> 00:00:40,413
+we get an error.

+17
+00:00:42,630 --> 00:00:44,260
+Note that we use PyTorch here

+18
+00:00:44,260 --> 00:00:45,600
+with return_tensors equal to "pt".

+19
+00:00:45,600 --> 00:00:47,753
+But you would get the same
+error with TensorFlow.

+20
+00:00:49,500 --> 00:00:51,990
+As we have seen in the 'How
+to debug an error?' video,

+21
+00:00:51,990 --> 00:00:54,780
+the error message is at the
+end and it indicates we

+22
+00:00:54,780 --> 00:00:58,260
+should use padding, which
+we're actually trying to do.

+23
+00:00:58,260 --> 00:01:00,630
+So this is not useful and
+we need to go a little deeper

+24
+00:01:00,630 --> 00:01:02,310
+to debug the problem.

+25
+00:01:02,310 --> 00:01:04,830
+Fortunately, you can use the
+Python debugger quite easily

+26
+00:01:04,830 --> 00:01:09,830
+in a terminal by launching
+your script with python -m pdb

+27
+00:01:09,930 --> 00:01:11,980
+and then the name of the training script.

+28
+00:01:13,410 --> 00:01:15,030
+When executing that command, you are sent

+29
+00:01:15,030 --> 00:01:17,340
+to the first instruction of your script.

+30
+00:01:17,340 --> 00:01:20,733
+You can run just the next
+instruction by typing n and enter.

+31
+00:01:22,530 --> 00:01:27,423
+Or you can continue directly
+to the error by typing c and enter.

+32
+00:01:29,850 --> 00:01:31,560
+Once there, you go to the very bottom

+33
+00:01:31,560 --> 00:01:34,050
+of the traceback and
+you can type commands.

+34
+00:01:34,050 --> 00:01:36,360
+The first two commands you
+should learn are u and d,

+35
+00:01:36,360 --> 00:01:38,160
+for up and down.

+36
+00:01:38,160 --> 00:01:41,223
+This allows you to go up
+and down in the traceback.

+37
+00:01:42,990 --> 00:01:46,623
+Going up twice, we get to the
+point the error was reached.

+38
+00:01:47,910 --> 00:01:50,190
+The third command to learn is p, for print.

+39
+00:01:50,190 --> 00:01:52,830
+It allows you to print any value you want.

+40
+00:01:52,830 --> 00:01:56,280
+For instance, here we can see
+the value of return_tensors

+41
+00:01:56,280 --> 00:02:00,210
+or batch_outputs to try to
+understand what triggered the error.
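The same kind of stand-in works for the terminal workflow just described; saved as a script (the file name is made up), it can be launched under the debugger as shown in the comments.

```python
# debug_pad.py: a self-contained stand-in for the failing training script.
# Launch it with:   python -m pdb debug_pad.py
# then at the (Pdb) prompt: c continues to the error, u and d move up and down
# the traceback, p features or p batch_outputs prints values, q quits.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # placeholder checkpoint

features = [
    {**tokenizer("A short sentence."), "labels": [0, 1, 0]},
    {**tokenizer("A noticeably longer example sentence."), "labels": [0, 1, 2, 0, 1]},
]

# Fails here: the ragged "labels" cannot be converted to a tensor,
# because tokenizer.pad does not pad them.
batch = tokenizer.pad(features, return_tensors="pt")
```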
+
+42
+00:02:00,210 --> 00:02:03,000
+The batch outputs dictionary
+is a bit hard to see

+43
+00:02:03,000 --> 00:02:05,520
+so let's dive into smaller pieces of it.

+44
+00:02:05,520 --> 00:02:08,460
+Inside the debugger, you can
+not only print any variable

+45
+00:02:08,460 --> 00:02:10,740
+but also evaluate any expression,

+46
+00:02:10,740 --> 00:02:13,713
+so we can look
+independently at the inputs.

+47
+00:02:15,060 --> 00:02:15,993
+Also the labels.

+48
+00:02:22,350 --> 00:02:24,300
+Those labels are definitely weird.

+49
+00:02:24,300 --> 00:02:26,880
+They are of various sizes,
+which we can confirm

+50
+00:02:26,880 --> 00:02:29,553
+by printing the sizes using
+a list comprehension.

+51
+00:02:35,880 --> 00:02:37,800
+No wonder the tokenizer
+wasn't able to create

+52
+00:02:37,800 --> 00:02:39,270
+a tensor with them.

+53
+00:02:39,270 --> 00:02:41,460
+This is because the pad
+method only takes care

+54
+00:02:41,460 --> 00:02:44,850
+of the tokenizer outputs, the
+input IDs, the attention mask

+55
+00:02:44,850 --> 00:02:46,560
+and the token type IDs.

+56
+00:02:46,560 --> 00:02:48,390
+So we have to pad the labels ourselves

+57
+00:02:48,390 --> 00:02:51,300
+before trying to create
+a new tensor with them.

+58
+00:02:51,300 --> 00:02:54,030
+Once you're ready to
+exit the Python debugger,

+59
+00:02:54,030 --> 00:02:56,640
+you can press q for quit and enter.

+60
+00:02:56,640 --> 00:02:59,790
+Another way we can access
+the Python debugger,

+61
+00:02:59,790 --> 00:03:02,310
+is to put a breakpoint in our script.

+62
+00:03:02,310 --> 00:03:05,913
+We can do this using the
+pdb.set_trace method.

+63
+00:03:07,920 --> 00:03:09,870
+As long as we import the pdb module

+64
+00:03:09,870 --> 00:03:11,420
+at the beginning of our script.

+65
+00:03:12,510 --> 00:03:17,283
+Saving and then relaunching
+our script, with just Python,

+66
+00:03:19,710 --> 00:03:23,310
+will stop the execution at
+the breakpoint we set.

+67
+00:03:23,310 --> 00:03:24,660
+We can inspect all the variables

+68
+00:03:24,660 --> 00:03:27,030
+before the next instruction
+is executed again.

+69
+00:03:27,030 --> 00:03:29,253
+For instance, here, the features.

+70
+00:03:30,270 --> 00:03:33,090
+Typing n and enter executes
+the next instruction

+71
+00:03:33,090 --> 00:03:35,700
+which takes us back inside the traceback.

+72
+00:03:35,700 --> 00:03:37,530
+One way to fix the error manually is to

+73
+00:03:37,530 --> 00:03:39,873
+pad all the labels to the longest.

+74
+00:03:42,000 --> 00:03:45,120
+Another way is to use
+the data collator suitable

+75
+00:03:45,120 --> 00:03:46,443
+for token classification.

+76
+00:03:48,330 --> 00:03:50,340
+If you want to learn how to use the Python

+77
+00:03:50,340 --> 00:03:53,273
+debugger in a notebook, check
+out the video linked below.

+78
+00:03:54,698 --> 00:03:57,781
+(wind swiping sound)

+
diff --git a/subtitles/en/72_asking-for-help-on-the-forums.srt b/subtitles/en/72_asking-for-help-on-the-forums.srt
index 40ba6dbc9..e34751109 100644
--- a/subtitles/en/72_asking-for-help-on-the-forums.srt
+++ b/subtitles/en/72_asking-for-help-on-the-forums.srt
@@ -1,174 +1,346 @@
-1
-00:00:05,520 --> 00:00:08,080
-How to ask a question on the Hugging Face forums?

-2
-00:00:09,840 --> 00:00:15,360
-If you have a general question or are looking to  
-debug your code, the forums are the place to ask. 
- -3 -00:00:15,360 --> 00:00:17,760 -In this video we will teach you  -how to write a good question,   - -4 -00:00:17,760 --> 00:00:20,080 -to maximize the chances you will get an answer. - -5 -00:00:21,360 --> 00:00:25,120 -First things first, to login on the  -forums, you need a Hugging Face account.   - -6 -00:00:25,680 --> 00:00:32,560 -If you haven't created one yet, go to hf.co and  -click Sign Up. There is also a direct link below. - -7 -00:00:33,520 --> 00:00:34,880 -Fill your email and password,   - -8 -00:00:34,880 --> 00:00:38,480 -then continue the steps ot pick a  -username and update a profile picture. - -9 -00:00:39,440 --> 00:00:43,040 -Once this is done, go to discuss.huggingface.co   - -10 -00:00:43,040 --> 00:00:48,480 -(linked below) and click Log In. Use the same  -login information as for the Hugging Face website. - -11 -00:00:49,600 --> 00:00:53,840 -You can search the forums by clicking on the  -magnifying glass. Someone may have already asked   - -12 -00:00:53,840 --> 00:00:59,040 -your question in a topic! If you find you can't  -post a new topic as a new user, it may be because   - -13 -00:00:59,040 --> 00:01:04,480 -of the antispam filters. Make sure you spend some  -time reading existing topics to deactivate it. - -14 -00:01:05,120 --> 00:01:09,440 -When you are sure your question hasn't been  -asked yet, click on the New Topic button. - -15 -00:01:09,440 --> 00:01:11,840 -For this example, we will use the following code, - -16 -00:01:12,400 --> 00:01:16,320 -that produces an error, as we saw in the  -"What to do when I get an error?" video. - -17 -00:01:18,080 --> 00:01:22,400 -The first step is to pick a category for our  -new topic. Since our error has to do with the   - -18 -00:01:22,400 --> 00:01:29,040 -Transformers library, we pick this category. New,  -choose a title that summarizes your error well.   - -19 -00:01:29,680 --> 00:01:33,040 -Don't be too vague or users that get  -the same error you did in the future   - -20 -00:01:33,040 --> 00:01:37,600 -won't be able to find your topic. Once  -you have finished typing your topic,   - -21 -00:01:38,160 --> 00:01:41,680 -make sure the question hasn't been answered  -in the topics Discourse suggests you.   - -22 -00:01:42,480 --> 00:01:44,960 -Click on the cross to remove that  -window when you have double-checked.   - -23 -00:01:46,000 --> 00:01:50,880 -This is an example of what not to do when  -posting an error: the message is very vague   - -24 -00:01:50,880 --> 00:01:55,120 -so no one else will be able to guess what went  -wrong for you, and it tags too many people.   - -25 -00:01:56,320 --> 00:02:00,560 -Tagging people (especially moderators) might  -have the opposite effect of what you want.   - -26 -00:02:01,200 --> 00:02:05,520 -As you send them a notification (and they get  -plenty), they will probably not bother replying   - -27 -00:02:05,520 --> 00:02:10,480 -to you, and users you didn't tag will probably  -ignore the question since they see tagged users.   - -28 -00:02:11,200 --> 00:02:15,520 -Only tag a user when you are completely certain  -they are the best placed to answer your question.   - -29 -00:02:17,520 --> 00:02:21,760 -Be precise in your text, and if you have an  -error coming from a specific piece of code,   - -30 -00:02:21,760 --> 00:02:27,760 -include that code in your post. To make sure your  -post looks good, place your question between three   - -31 -00:02:27,760 --> 00:02:32,720 -backticks like this. You can check on the  -right how your post will appear once posted.   
- -32 -00:02:34,080 --> 00:02:39,040 -If your question is about an error, it's even  -better to include the full traceback. As explained   - -33 -00:02:39,040 --> 00:02:44,560 -in the "what to do when I get an error?' video,  -expand the traceback if you are on Colab. like   - -34 -00:02:44,560 --> 00:02:50,320 -for the code, put it between two lines containing  -three backticks for proper formatting. Our last   - -35 -00:02:50,320 --> 00:02:55,120 -advice is to remember to be nice, a please and a  -thank you will go a long way into getting others   - -36 -00:02:55,120 --> 00:03:03,840 -to help you. With all that done properly, your  -question should get an answer pretty quickly! +1 +00:00:00,125 --> 00:00:01,455 +(title whooshes) + +2 +00:00:01,455 --> 00:00:02,789 +(logo pops) + +3 +00:00:02,789 --> 00:00:05,700 +(title whooshes) + +4 +00:00:05,700 --> 00:00:08,433 +- How to ask a question on +the Hugging Face forums? + +5 +00:00:10,020 --> 00:00:11,640 +If you have a general question + +6 +00:00:11,640 --> 00:00:13,110 +or are looking to debug your code, + +7 +00:00:13,110 --> 00:00:15,540 +the forums are the place to ask. + +8 +00:00:15,540 --> 00:00:16,710 +In this video we will teach you + +9 +00:00:16,710 --> 00:00:18,030 +how to write a good question, + +10 +00:00:18,030 --> 00:00:20,380 +to maximize the chances +you will get an answer. + +11 +00:00:21,570 --> 00:00:23,970 +First things first, to +login on the forums, + +12 +00:00:23,970 --> 00:00:25,920 +you need a Hugging Face account. + +13 +00:00:25,920 --> 00:00:27,750 +If you haven't created one already, + +14 +00:00:27,750 --> 00:00:31,080 +go to hf.co and click sign up. + +15 +00:00:31,080 --> 00:00:32,780 +There is also a direct link below. + +16 +00:00:33,750 --> 00:00:35,160 +Fill your email and password, + +17 +00:00:35,160 --> 00:00:37,410 +then continue the steps +to pick your username + +18 +00:00:37,410 --> 00:00:38,860 +and update a profile picture. + +19 +00:00:39,720 --> 00:00:43,200 +Once this is done, go to +discuss.huggingface.co, + +20 +00:00:43,200 --> 00:00:45,630 +link below, and click log in. + +21 +00:00:45,630 --> 00:00:47,033 +Use the same login information as + +22 +00:00:47,033 --> 00:00:48,693 +for the Hugging Face website. + +23 +00:00:49,890 --> 00:00:51,300 +You can search the forums by clicking + +24 +00:00:51,300 --> 00:00:52,800 +on the magnifying glass. + +25 +00:00:52,800 --> 00:00:55,710 +Someone may have already asked +your question in a topic. + +26 +00:00:55,710 --> 00:00:58,260 +If you find you can't post +a new topic as a new user, + +27 +00:00:58,260 --> 00:01:01,290 +it may be because of the antispam filters. + +28 +00:01:01,290 --> 00:01:03,750 +Make sure you spend some +time reading existing topics + +29 +00:01:03,750 --> 00:01:05,370 +to deactivate it. + +30 +00:01:05,370 --> 00:01:07,590 +When you're sure your question +hasn't been asked yet, + +31 +00:01:07,590 --> 00:01:09,660 +click on the new topic button. + +32 +00:01:09,660 --> 00:01:12,600 +For this example, we'll +use the following code, + +33 +00:01:12,600 --> 00:01:13,860 +that produces an error, + +34 +00:01:13,860 --> 00:01:16,660 +as we saw in the "What to do +when I get an error" video. + +35 +00:01:18,330 --> 00:01:21,330 +The first step is to pick a +category for our new topic. + +36 +00:01:21,330 --> 00:01:23,790 +Since our error has to do +with the Transformers library, + +37 +00:01:23,790 --> 00:01:24,903 +we pick this category. 
+ +38 +00:01:26,070 --> 00:01:29,880 +Next, choose a title that +summarizes your error well. + +39 +00:01:29,880 --> 00:01:32,300 +Don't be too vague or users +that get the same error you did + +40 +00:01:32,300 --> 00:01:34,773 +in the future won't be +able to find your topic. + +41 +00:01:36,150 --> 00:01:38,370 +Once you have finished +typing your topic title, + +42 +00:01:38,370 --> 00:01:40,170 +make sure the question +hasn't been answered + +43 +00:01:40,170 --> 00:01:42,690 +in the topics Discourse suggests you. + +44 +00:01:42,690 --> 00:01:44,190 +Click on the cross to remove that window + +45 +00:01:44,190 --> 00:01:46,230 +when you have double-checked. + +46 +00:01:46,230 --> 00:01:49,710 +This is an example of what not +to do when posting an error. + +47 +00:01:49,710 --> 00:01:51,120 +The message is very vague, + +48 +00:01:51,120 --> 00:01:53,370 +so no one else will be able +to guess what went wrong + +49 +00:01:53,370 --> 00:01:55,623 +for you, and it tags too many people. + +50 +00:01:56,490 --> 00:01:58,740 +Tagging people, especially moderators, + +51 +00:01:58,740 --> 00:02:01,470 +might have the opposite +effect of what you want. + +52 +00:02:01,470 --> 00:02:04,380 +As you send them a notification, +and they get plenty, + +53 +00:02:04,380 --> 00:02:06,300 +they will probably not +bother replying to you, + +54 +00:02:06,300 --> 00:02:09,300 +and users you didn't tag will +probably ignore the questions, + +55 +00:02:09,300 --> 00:02:11,430 +since they see tagged users. + +56 +00:02:11,430 --> 00:02:13,697 +Only tag a user when you +are completely certain + +57 +00:02:13,697 --> 00:02:16,097 +they are the best place +to answer your question. + +58 +00:02:17,730 --> 00:02:20,370 +Be precise in your text, and +if you have an error coming + +59 +00:02:20,370 --> 00:02:22,710 +from a specific piece of +code, include that code + +60 +00:02:22,710 --> 00:02:24,030 +in your post. + +61 +00:02:24,030 --> 00:02:27,210 +To make sure your post looks +good, place your question + +62 +00:02:27,210 --> 00:02:30,060 +between three backticks like this. + +63 +00:02:30,060 --> 00:02:30,990 +You can check on the right + +64 +00:02:30,990 --> 00:02:32,943 +how your post will appear once posted. + +65 +00:02:34,320 --> 00:02:35,850 +If your question is about an error, + +66 +00:02:35,850 --> 00:02:38,640 +it's even better to +include the full traceback. + +67 +00:02:38,640 --> 00:02:41,610 +As explained in the "What to +do when I get an error" video, + +68 +00:02:41,610 --> 00:02:43,763 +expand the traceback if you're on Colab. + +69 +00:02:44,769 --> 00:02:45,990 +Like for the code, put it + +70 +00:02:45,990 --> 00:02:48,300 +between two lines +containing three backticks + +71 +00:02:48,300 --> 00:02:50,160 +for proper formatting. + +72 +00:02:50,160 --> 00:02:52,740 +Our last advice is to remember to be nice. + +73 +00:02:52,740 --> 00:02:54,540 +A "Please," and a "Thank +you" will go a long way + +74 +00:02:54,540 --> 00:02:56,490 +into getting others to help you. + +75 +00:02:56,490 --> 00:02:57,780 +With all that done properly, + +76 +00:02:57,780 --> 00:03:00,143 +your question should get +an answer pretty quickly. 
+ +77 +00:03:01,293 --> 00:03:04,344 +(title whooshes) + +78 +00:03:04,344 --> 00:03:06,034 +(title fizzles) + diff --git a/subtitles/en/73_debugging-the-training-pipeline-(pytorch).srt b/subtitles/en/73_debugging-the-training-pipeline-(pytorch).srt index f73f5ef0c..6463b1fc6 100644 --- a/subtitles/en/73_debugging-the-training-pipeline-(pytorch).srt +++ b/subtitles/en/73_debugging-the-training-pipeline-(pytorch).srt @@ -1,214 +1,448 @@ -1 -00:00:06,080 --> 00:00:10,960 -In this video, we will see how to debug an error  -you encounter when running trainer.train().   - -2 -00:00:12,240 --> 00:00:17,280 -As an example, we will use this script that  -finetunes a bert model on the GLUE MNLI dataset.   - -3 -00:00:17,840 --> 00:00:20,880 -Checkout the videos linked below to  -see how we came to such a script,   - -4 -00:00:21,680 --> 00:00:26,960 -here we want to learn how to debug the problems  -in it. Running the script gives us an error pretty   - -5 -00:00:26,960 --> 00:00:31,920 -fast. It happens at the line where we feed the  -inputs to the model, according to the traceback.   - -6 -00:00:32,640 --> 00:00:36,720 -That tells us there is a problem there, but the  -problem could come from many different causes.   - -7 -00:00:37,520 --> 00:00:41,600 -To debug an error in a training, you need to  -make sure each step of the training pipeline   - -8 -00:00:41,600 --> 00:00:46,160 -works as intended. This means checking that  -the inputs of your dataset are correct,   - -9 -00:00:46,800 --> 00:00:50,560 -you can batch them together, feed  -them through the model to get a loss,   - -10 -00:00:50,560 --> 00:00:54,080 -then compute the gradients of that loss  -before performing an optimizer step.   - -11 -00:00:55,280 --> 00:01:00,480 -So let's start by looking at the training dataset  -this Trainer is using. There is definitely a   - -12 -00:01:00,480 --> 00:01:07,040 -problem there as we see texts and not numbers. The  -error message was telling us the model did not get   - -13 -00:01:07,040 --> 00:01:13,120 -input IDs and we do not have those in the dataset  -indeed. Looking back at our code, we can see we   - -14 -00:01:13,120 --> 00:01:18,800 -made a mistake and passed the wrong datasets to  -the Trainer. So let's fix that and run again.   - -15 -00:01:20,240 --> 00:01:25,680 -Now we have a new error. Inspecting the traceback  -tells us it happens when we try to create a batch,   - -16 -00:01:25,680 --> 00:01:31,920 -specifically to group the features in a tensor.  -We can confirm this by asking the Trainer to get   - -17 -00:01:31,920 --> 00:01:37,600 -us a batch of the training data loader, which  -reproduces the same error. Either by inspecting   - -18 -00:01:37,600 --> 00:01:43,600 -the inputs or debugging, we can then see they are  -not all of the same size. This is because we have   - -19 -00:01:43,600 --> 00:01:48,240 -not passed a data collator to do the padding in  -the Trainer and didn't pad when preprocessing   - -20 -00:01:48,240 --> 00:01:53,440 -the data either. Padding inside the Trainer is  -normally the default, but only if you provide   - -21 -00:01:53,440 --> 00:01:58,800 -your tokenizer to the Trainer, and we forgot to  -do that. So let's fix the issue and run again.   - -22 -00:02:00,320 --> 00:02:06,400 -This time we get a nasty CUDA error. 
They  -are very difficult to debug because for one,   - -23 -00:02:07,120 --> 00:02:11,280 -they put your kernel in a state that is not  -recoverable (so you have to restart your   - -24 -00:02:11,280 --> 00:02:15,840 -notebook from the beginning) and two, the  -traceback is completely useless for those.   - -25 -00:02:16,800 --> 00:02:22,240 -Here the traceback tells us the error happens when  -we do the gradient computation with loss.backward,   - -26 -00:02:22,240 --> 00:02:27,840 -but as we will see later on that is not the  -case. This is because everything that happens   - -27 -00:02:27,840 --> 00:02:33,520 -on the GPU is done asynchronously: when you  -execute the model call, what the program does   - -28 -00:02:33,520 --> 00:02:39,280 -is just stacking that in the queue of GPU, then  -(if the GPU didn't have any current job to do),   - -29 -00:02:39,280 --> 00:02:43,920 -the work will start on the GPU at the same time  -as the CPU will move to the next instruction.   - -30 -00:02:44,800 --> 00:02:50,000 -Continuing with the extraction of the loss, this  -is stacked into the GPU queue while the CPU moves   - -31 -00:02:50,000 --> 00:02:54,960 -to the instruction loss.backward. But the GPU  -still hasn't finished the forward pass of the   - -32 -00:02:54,960 --> 00:03:01,760 -model since all that took no time at all. The CPU  -stops moving forward, because loss.backward as an   - -33 -00:03:01,760 --> 00:03:09,360 -instruction telling it to wait for the GPUs to be  -finished, and when the GPU encounters an error,   - -34 -00:03:09,360 --> 00:03:15,040 -it gives with a cryptic message back to the  -CPU, who raises the error at the wrong place.   - -35 -00:03:16,080 --> 00:03:20,320 -So to debug this, we will need to execute the  -next steps of the training pipeline on the CPU.   - -36 -00:03:20,960 --> 00:03:26,320 -It is very easy to do, and we get a traceback  -we can trust this time. As we said before,   - -37 -00:03:26,320 --> 00:03:32,720 -the error happens during the forward pass  -of the model, and it's an index error.   - -38 -00:03:33,360 --> 00:03:38,800 -With a bit of debugging, we see we have labels  -ranging from 0 to 2, so three different values,   - -39 -00:03:38,800 --> 00:03:44,240 -but our outputs have a shape of batch size per 2.  -It looks like our model has the wrong number of   - -40 -00:03:44,240 --> 00:03:50,320 -labels! We can indeed confirm that, and now that  -we know it's easy to fix it in the code by adding   - -41 -00:03:50,320 --> 00:03:58,720 -num_labels=3 when we create the model. Now the  -training script will run to completion! We did not   - -42 -00:03:58,720 --> 00:04:02,640 -need it yet, but here is how we would debug the  -next step of the pipeline, gradient computation,   - -43 -00:04:03,360 --> 00:04:13,840 -as well as the optimizer step. With all of  -this, good luck debugging your own trainings! +1 +00:00:06,210 --> 00:00:08,760 +- In this video, we will +see how to debug an error + +2 +00:00:08,760 --> 00:00:11,896 +you encounter when running Trainer.train + +3 +00:00:11,896 --> 00:00:15,066 +As an example, we will use +this script that finetunes + +4 +00:00:15,066 --> 00:00:17,760 +a bert model on the GLUE MNLI dataset. + +5 +00:00:17,760 --> 00:00:19,470 +Checkout the videos linked below + +6 +00:00:19,470 --> 00:00:21,840 +to see how we came to such a script. + +7 +00:00:21,840 --> 00:00:24,540 +Here we want to learn how +to debug the problems in it. 
+ +8 +00:00:25,470 --> 00:00:28,110 +Running the script gives +us an error pretty quickly. + +9 +00:00:28,110 --> 00:00:29,040 +It happens at the line + +10 +00:00:29,040 --> 00:00:30,990 +where we feed the inputs to the model, + +11 +00:00:30,990 --> 00:00:32,850 +according to the traceback. + +12 +00:00:32,850 --> 00:00:34,702 +That tells us there is a problem there, + +13 +00:00:34,702 --> 00:00:37,881 +but the problem could come +from many different causes. + +14 +00:00:37,881 --> 00:00:39,330 +To debug an error in a training, + +15 +00:00:39,330 --> 00:00:41,760 +you need to make sure each +step of the training pipeline + +16 +00:00:41,760 --> 00:00:43,440 +works as intended. + +17 +00:00:43,440 --> 00:00:45,780 +This means checking that +the inputs of your dataset + +18 +00:00:45,780 --> 00:00:47,040 +are correct, + +19 +00:00:47,040 --> 00:00:48,720 +you can batch them together, + +20 +00:00:48,720 --> 00:00:50,790 +feed them through the model to get a loss, + +21 +00:00:50,790 --> 00:00:52,500 +then compute the gradients of that loss + +22 +00:00:52,500 --> 00:00:54,303 +before performing an optimizer step. + +23 +00:00:55,470 --> 00:00:57,810 +So let's start by looking +at the training dataset + +24 +00:00:57,810 --> 00:00:59,043 +this Trainer is using. + +25 +00:00:59,910 --> 00:01:02,190 +There is definitely a problem here. + +26 +00:01:02,190 --> 00:01:04,293 +We see texts and not number. + +27 +00:01:05,130 --> 00:01:06,660 +The error message was telling us the model + +28 +00:01:06,660 --> 00:01:08,220 +did not get input IDs + +29 +00:01:08,220 --> 00:01:11,100 +and we do not have those +in the dataset indeed. + +30 +00:01:11,100 --> 00:01:12,660 +Looking back at our code, + +31 +00:01:12,660 --> 00:01:14,400 +we can see we made a mistake + +32 +00:01:14,400 --> 00:01:17,400 +and passed the wrong +datasets to the Trainer. + +33 +00:01:17,400 --> 00:01:19,173 +So let's fix that and run again. + +34 +00:01:20,490 --> 00:01:21,840 +Now we have a new error. + +35 +00:01:21,840 --> 00:01:23,130 +Inspecting the traceback + +36 +00:01:23,130 --> 00:01:25,860 +tells us it happens when +we try to create a batch, + +37 +00:01:25,860 --> 00:01:28,743 +specifically to group +the features in a tensor. + +38 +00:01:29,700 --> 00:01:32,610 +We can confirm this by asking +the Trainer to get us a batch + +39 +00:01:32,610 --> 00:01:34,230 +of the training data loader, + +40 +00:01:34,230 --> 00:01:35,913 +which reproduces the same error. + +41 +00:01:36,780 --> 00:01:39,064 +Either by inspecting +the inputs or debugging, + +42 +00:01:39,064 --> 00:01:42,870 +we can then see they are +not all of the same size. + +43 +00:01:42,870 --> 00:01:45,120 +This is because we have +not passed a data collator + +44 +00:01:45,120 --> 00:01:46,890 +to do the padding to the Trainer + +45 +00:01:46,890 --> 00:01:49,443 +and didn't pad when +preprocessing the data either. + +46 +00:01:50,430 --> 00:01:52,710 +Padding inside the Trainer +is normally the default, + +47 +00:01:52,710 --> 00:01:55,380 +but only if you provide your +tokenizer to the Trainer, + +48 +00:01:55,380 --> 00:01:57,270 +and we forgot to do that. + +49 +00:01:57,270 --> 00:01:59,120 +So let's fix the issue and run again. + +50 +00:02:00,510 --> 00:02:02,883 +This time we get a nasty CUDA error. 
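Here is a hedged sketch of the checks made so far; model, args, tokenizer and the tokenized datasets are assumed to come from earlier in this video's training script, and the split names are only a guess for MNLI.

```python
from transformers import Trainer

# 1. Look at what the Trainer is actually training on: this should contain
#    input_ids and labels, not raw text columns.
print(trainer.train_dataset[0])

# 2. Reproduce the collation problem outside of training by grabbing
#    a single batch from the training dataloader.
for batch in trainer.get_train_dataloader():
    break

# 3. Recreate the Trainer with the tokenized datasets and the tokenizer;
#    when a tokenizer is provided, the Trainer defaults to DataCollatorWithPadding.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation_matched"],  # assumed MNLI split name
    tokenizer=tokenizer,
)
```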
+ +51 +00:02:03,765 --> 00:02:06,285 +They are very difficult +to debug because for one, + +52 +00:02:06,285 --> 00:02:10,530 +they put your kernel in a +state that is not recoverable + +53 +00:02:10,530 --> 00:02:13,260 +so you have to restart your +notebook from the beginning + +54 +00:02:13,260 --> 00:02:16,950 +and two, the traceback is +completely useless for those. + +55 +00:02:16,950 --> 00:02:19,230 +Here the traceback tells +us the error happens + +56 +00:02:19,230 --> 00:02:22,500 +when we do the gradient +computation with loss.backward, + +57 +00:02:22,500 --> 00:02:25,113 +but as we will see later +on that is not the case. + +58 +00:02:26,520 --> 00:02:28,920 +This is because everything +that happens on the GPU + +59 +00:02:28,920 --> 00:02:30,720 +is done asynchronously. + +60 +00:02:30,720 --> 00:02:32,880 +When you execute the model call, + +61 +00:02:32,880 --> 00:02:34,457 +what the program does +is just stacking that + +62 +00:02:34,457 --> 00:02:36,600 +in the queue of GPU, + +63 +00:02:36,600 --> 00:02:39,856 +then if the GPU didn't +have any current job to do, + +64 +00:02:39,856 --> 00:02:41,850 +the work will start on +the GPU at the same time + +65 +00:02:41,850 --> 00:02:45,000 +as the CPU moves to the next instruction. + +66 +00:02:45,000 --> 00:02:47,040 +Continuing with the +extraction of the loss, + +67 +00:02:47,040 --> 00:02:49,170 +this is stacked into the GPU queue + +68 +00:02:49,170 --> 00:02:51,953 +while the CPU moves to the +instruction loss.backward. + +69 +00:02:51,953 --> 00:02:54,180 +But the GPU still hasn't finished + +70 +00:02:54,180 --> 00:02:55,710 +the forward pass of the model + +71 +00:02:55,710 --> 00:02:57,603 +since all that took no time at all. + +72 +00:02:58,440 --> 00:03:00,210 +The CPU stops moving forward, + +73 +00:03:00,210 --> 00:03:03,240 +because loss.backward as an +instruction telling it to wait + +74 +00:03:03,240 --> 00:03:04,830 +for the GPUs to be finished, + +75 +00:03:04,830 --> 00:03:06,780 +to make sure the gradients are correct. + +76 +00:03:07,650 --> 00:03:09,570 +When the GPU encounters an error, + +77 +00:03:09,570 --> 00:03:13,140 +it gives it back to the +CPU with a cryptic message + +78 +00:03:13,140 --> 00:03:15,423 +who raises the error at the wrong place. + +79 +00:03:16,350 --> 00:03:18,720 +So to debug this, we will +need to execute the next steps + +80 +00:03:18,720 --> 00:03:21,211 +of the training pipeline on the CPU. + +81 +00:03:21,211 --> 00:03:22,380 +It is very easy to do, + +82 +00:03:22,380 --> 00:03:25,350 +and we get a traceback +we can trust this time. + +83 +00:03:25,350 --> 00:03:26,520 +As we said before, + +84 +00:03:26,520 --> 00:03:28,620 +the error actually happens +during the forward pass + +85 +00:03:28,620 --> 00:03:29,453 +of the model, + +86 +00:03:29,453 --> 00:03:30,993 +and not loss.backward. + +87 +00:03:31,920 --> 00:03:33,680 +It's an index error. + +88 +00:03:33,680 --> 00:03:34,950 +With a bit of debugging, + +89 +00:03:34,950 --> 00:03:37,410 +we see we have labels ranging from 0 to 2, + +90 +00:03:37,410 --> 00:03:39,000 +so three different values, + +91 +00:03:39,000 --> 00:03:42,191 +but our outputs have a +shape of batch size per 2. + +92 +00:03:42,191 --> 00:03:45,600 +It looks like our model has +the wrong number of labels. 
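A sketch of the CPU round-trip described above, assuming the trainer and batch from the previous step: moving everything to the CPU makes the failure synchronous, so the traceback points at the real culprit.

```python
import torch

# Run one forward pass on CPU to get a readable traceback
# instead of an asynchronous CUDA error.
model_cpu = trainer.model.to("cpu")
batch_cpu = {k: v.to("cpu") for k, v in batch.items()}

print(torch.unique(batch_cpu["labels"]))  # tensor([0, 1, 2]): three classes

# With the model still configured for two labels, this raises an IndexError
# inside the loss computation, which is the error we were chasing.
outputs = model_cpu(**batch_cpu)
```

The fix, shown next in the video, is to recreate the model with num_labels=3.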
+ +93 +00:03:45,600 --> 00:03:47,190 +We can indeed confirm that, + +94 +00:03:47,190 --> 00:03:49,860 +and now that we know it's +easy to fix it in the code + +95 +00:03:49,860 --> 00:03:53,969 +by adding num_labels=3 +when we create the model. + +96 +00:03:53,969 --> 00:03:56,883 +Now the training script +will run to completion. + +97 +00:03:58,440 --> 00:03:59,430 +We did not need it yet, + +98 +00:03:59,430 --> 00:04:00,960 +but here is how we would +debug the next step + +99 +00:04:00,960 --> 00:04:02,944 +of the pipeline, gradient computation, + +100 +00:04:02,944 --> 00:04:05,850 +as well as the optimizer step. + +101 +00:04:05,850 --> 00:04:08,823 +With all of this, good luck +debugging your own trainings! + diff --git a/subtitles/en/74_debugging-the-training-pipeline-(tensorflow).srt b/subtitles/en/74_debugging-the-training-pipeline-(tensorflow).srt index c4cbfe579..aaac0136f 100644 --- a/subtitles/en/74_debugging-the-training-pipeline-(tensorflow).srt +++ b/subtitles/en/74_debugging-the-training-pipeline-(tensorflow).srt @@ -1,309 +1,799 @@ -1 -00:00:04,720 --> 00:00:09,280 -Some bugs in your code are very  -straightforward. You try running it,   - -2 -00:00:09,280 --> 00:00:14,720 -you get a syntax error somewhere, Python tells  -you exactly where, and you fix it. This is   - -3 -00:00:14,720 --> 00:00:22,240 -great - it's simple and satisfying. Sometimes,  -though, things crash and the error is impossible   - -4 -00:00:22,240 --> 00:00:27,360 -to understand. This happens a lot in machine  -learning for a few reasons - you're working with   - -5 -00:00:27,360 --> 00:00:33,920 -big data structures, using big, complex libraries  -with a lot of moving parts, and also you're doing   - -6 -00:00:33,920 --> 00:00:41,600 -a lot of GPU computing. In Keras there's the added  -bonus problem that your models are often compiled   - -7 -00:00:41,600 --> 00:00:46,080 -before execution, which is great for performance  -but makes debugging them very difficult.   - -8 -00:00:47,920 --> 00:00:52,160 -This is going to be a video about what to do  -when you run into one of those nightmare bugs.   - -9 -00:00:56,400 --> 00:01:07,600 -To give you some intuitions for what can go wrong,  -and where to look for the source of bugs that you   - -10 -00:01:07,600 --> 00:01:13,360 -encounter, let's use this example script, and  -I'll show it to you here in two parts. First,   - -11 -00:01:13,360 --> 00:01:19,280 -we do all our imports, we load a dataset, we  -create our tokenizer and we tokenize the dataset.   - -12 -00:01:20,160 --> 00:01:28,320 -Next, we convert our datasets to TensorFlow  -datasets, so that we can run fit() on them,   - -13 -00:01:28,320 --> 00:01:34,640 -and then we load our model from a pretrained  -checkpoint, compile it and fit it. It seems   - -14 -00:01:34,640 --> 00:01:42,880 -straightforward enough, but beware! This spooky  -code hides many dark and mysterious secrets.   - -15 -00:01:43,760 --> 00:01:52,880 -What happens when we run it? Well, this isn't  -great. What does that mean? We tried to train   - -16 -00:01:52,880 --> 00:01:59,600 -on our data, but we got no gradient? This is  -pretty perplexing - how do we even begin to debug   - -17 -00:02:00,400 --> 00:02:04,880 -something like that? When the error you get  -doesn't immediately suggest where the problem is,   - -18 -00:02:05,440 --> 00:02:11,040 -the best solution is often to walk through  -things in sequence, making sure at each stage   - -19 -00:02:11,040 --> 00:02:19,120 -that things look right. 
And of course, the  -place to start is always to check your data.   - -20 -00:02:20,720 --> 00:02:28,960 -The best way to do that to grab a batch from the  -tf.data.Dataset that your model is training on,   - -21 -00:02:30,560 --> 00:02:41,840 -right at the end of the training pipeline. And  -we can do that like so, by looping over the   - -22 -00:02:41,840 --> 00:02:50,320 -dataset for one iteration and then breaking.  -So what do we get when we inspect that batch?   - -23 -00:02:50,320 --> 00:02:54,800 -We see that we're not getting any gradient  -because we're not passing labels to Keras!   - -24 -00:02:55,520 --> 00:03:00,800 -Our labels are in the batch, but they're a key  -in the input dictionary, not a separate label.   - -25 -00:03:02,400 --> 00:03:06,160 -This is one of the most common issues you'll  -encounter when training Transformers models   - -26 -00:03:06,160 --> 00:03:12,960 -with TensorFlow. Our models can all compute loss  -internally, but to use that loss for training   - -27 -00:03:12,960 --> 00:03:16,880 -the labels need to be passed in the input  -dictionary, where the model can see them.   - -28 -00:03:17,760 --> 00:03:23,200 -This internal loss is the loss that we use when  -we don't specify a loss value to compile().   - -29 -00:03:26,640 --> 00:03:30,960 -Keras, on the other hand, usually expects  -labels to be passed separately from the input   - -30 -00:03:30,960 --> 00:03:36,720 -dictionary and not to be visible to the model,  -and loss computations will usually fail if you   - -31 -00:03:36,720 --> 00:03:43,040 -don't do that. We need to choose one or the other:  -Either we use the model's internal loss and keep   - -32 -00:03:43,040 --> 00:03:49,120 -the labels where they are, or we keep using Keras  -losses, but we move the labels to the place Keras   - -33 -00:03:49,120 --> 00:03:57,680 -expects them. For simplicity, let's use the model  -internal losses, by removing the loss argument   - -34 -00:03:57,680 --> 00:04:06,000 -from the call to compile(). So what happens if  -we try training with model.fit() after fixing   - -35 -00:04:06,560 --> 00:04:13,840 -the loss function! Well, it runs this time...  -but now we get a loss of nan. This isn't good.   - -36 -00:04:16,240 --> 00:04:22,160 -NaN is not a good loss. In fact, if we inspect  -our model now, we'll see that not only are all   - -37 -00:04:22,160 --> 00:04:30,640 -the outputs nan , all the weights are nan too.  -Once a single nan creeps into your computations,   - -38 -00:04:30,640 --> 00:04:37,280 -it tends to spread, because it propagates  -from the loss back through your gradient   - -39 -00:04:37,280 --> 00:04:48,320 -and then into the weight updates. So nan destroyed  -our model. But where did it creep in first?   - -40 -00:04:49,600 --> 00:04:57,760 -To find out, we need to re-initialize the model  -and look at the outputs for just the first batch.   - -41 -00:04:58,400 --> 00:05:04,160 -And when we do that, we see that nan first  -appears in the loss, but only in some samples!   - -42 -00:05:04,960 --> 00:05:08,480 -You can see this in more detail in the  -accompanying section of the course notes,   - -43 -00:05:11,040 --> 00:05:17,120 -but we find that if we look at the labels, the  -samples with a loss of nan all have a label of 2!   
- -44 -00:05:17,760 --> 00:05:24,400 -This gives us a very strong clue - if we check  -the model, with model.config.num_labels, we see   - -45 -00:05:24,400 --> 00:05:30,080 -the model thinks there's only 2 labels, but if  -we see a value of 2, that means there's at least   - -46 -00:05:30,080 --> 00:05:36,400 -3 labels, because 0 is a label too! So we got  -a loss of nan because we got an "impossible"   - -47 -00:05:36,400 --> 00:05:43,040 -label. To fix that, we need to go back and set  -the model to have the right number of labels.   - -48 -00:05:43,680 --> 00:05:52,240 -We can set num_labels=3 when we initialize the  -model with from_pretrained. So now we think our   - -49 -00:05:52,240 --> 00:05:58,000 -data is good and our model is good, so training  -should work. And if we try running model.fit(),   - -50 -00:05:58,000 --> 00:06:07,600 -we get... hmm. The loss goes down, but it's  -not very quick. And if we keep running it out,   - -51 -00:06:07,600 --> 00:06:13,600 -we'll find that it stalls at a fairly high value.  -What's going on? Well, when things are mostly   - -52 -00:06:13,600 --> 00:06:19,280 -working, but training is just slow, that can  -often be a good time to look at your optimizer   - -53 -00:06:19,280 --> 00:06:24,560 -and training hyperparameters. And this is where  -I want to mention one of the most common sources   - -54 -00:06:24,560 --> 00:06:30,480 -of issues when you're working with Keras - you  -can name things like optimizers with strings,   - -55 -00:06:32,960 --> 00:06:37,680 -but if you do that, all of the options  -get silently set to their default values.   - -56 -00:06:38,240 --> 00:06:43,920 -So we specified our optimizer as Adam, but in  -the process we invisibly got the default learning   - -57 -00:06:43,920 --> 00:06:51,440 -rate, which is 1e-3, or ten to the power of minus  -3. This is way too high for training transformer   - -58 -00:06:51,440 --> 00:06:59,120 -models! We should go back and specify the learning  -rate directly - good values are between 1e-5   - -59 -00:06:59,760 --> 00:07:06,800 -and 1e-4. Let's split the difference and  -pick 5e-5. And if you recompile with that,   - -60 -00:07:06,800 --> 00:07:16,880 -you'll find that training actually works, at last.  -Again, I went through this quite quickly, and I   - -61 -00:07:16,880 --> 00:07:20,720 -recommend checking out the course notes for this  -to see this in more detail and to experiment with   - -62 -00:07:20,720 --> 00:07:43,840 -the code yourself. Good luck, and remember to take  -breaks if your code is giving you a hard time! +1 +00:00:00,212 --> 00:00:02,879 +(air whooshing) + +2 +00:00:04,680 --> 00:00:08,130 +- Some bugs in your code +are very straightforward. + +3 +00:00:08,130 --> 00:00:11,580 +You try running it, you get +a syntax error somewhere, + +4 +00:00:11,580 --> 00:00:14,490 +Python tells you exactly +where, and you fix it. + +5 +00:00:14,490 --> 00:00:17,760 +This is great, it's simple +and it's satisfying. + +6 +00:00:17,760 --> 00:00:20,310 +Sometimes, though, things crash + +7 +00:00:20,310 --> 00:00:23,670 +and the error is impossible to understand. 
+ +8 +00:00:23,670 --> 00:00:26,700 +This happens a lot in machine +learning for a few reasons, + +9 +00:00:26,700 --> 00:00:29,310 +you're working with big data structures, + +10 +00:00:29,310 --> 00:00:31,440 +you're using these big, complex libraries + +11 +00:00:31,440 --> 00:00:33,420 +with a lot of moving parts, + +12 +00:00:33,420 --> 00:00:35,310 +and also you're doing +a lot of GPU computing, + +13 +00:00:35,310 --> 00:00:38,490 +and that in general is much +more difficult to debug. + +14 +00:00:38,490 --> 00:00:40,260 +In Keras there's the additional problem + +15 +00:00:40,260 --> 00:00:43,140 +that your models are often +compiled before execution, + +16 +00:00:43,140 --> 00:00:44,400 +which is great for performance + +17 +00:00:44,400 --> 00:00:47,430 +but it makes debugging them +very difficult as well. + +18 +00:00:47,430 --> 00:00:50,370 +So, this is going to be +a video about what to do + +19 +00:00:50,370 --> 00:00:52,410 +when you run into one +of those nightmare bugs + +20 +00:00:52,410 --> 00:00:55,210 +and you just have no idea +where to begin with fixing it. + +21 +00:00:56,370 --> 00:00:58,920 +So, to give you some intuitions for + +22 +00:00:58,920 --> 00:01:01,530 +the most common things that go wrong + +23 +00:01:01,530 --> 00:01:03,573 +and cause these weird issues, + +24 +00:01:04,800 --> 00:01:07,530 +and show you where to look +for the sources of bugs + +25 +00:01:07,530 --> 00:01:10,560 +that you encounter, let's +use this example script. + +26 +00:01:10,560 --> 00:01:12,900 +So, I'll show it to you here in two parts. + +27 +00:01:12,900 --> 00:01:16,410 +First, we do all our +imports, we load a dataset, + +28 +00:01:16,410 --> 00:01:20,280 +we create our tokenizer and +we tokenize the dataset. + +29 +00:01:20,280 --> 00:01:23,640 +Next, we convert our datasets +to TensorFlow datasets, + +30 +00:01:23,640 --> 00:01:26,100 +so that's tf.data.Dataset, + +31 +00:01:26,100 --> 00:01:28,500 +and that's so that we can run fit on them, + +32 +00:01:28,500 --> 00:01:31,170 +and then we load our model +from a pretrained checkpoint, + +33 +00:01:31,170 --> 00:01:33,870 +we compile it and we fit +it with those datasets. + +34 +00:01:33,870 --> 00:01:35,970 +So, this seems straightforward enough, + +35 +00:01:35,970 --> 00:01:38,220 +it's similar to what we've +done in the course before. + +36 +00:01:38,220 --> 00:01:40,650 +But beware, this is spooky code + +37 +00:01:40,650 --> 00:01:43,590 +and hides many dark +and mysterious secrets. + +38 +00:01:43,590 --> 00:01:46,050 +So, what happens when we run it? + +39 +00:01:46,050 --> 00:01:48,840 +Well, it's not great. + +40 +00:01:48,840 --> 00:01:52,320 +So, we get this error message, +but what does it mean? + +41 +00:01:52,320 --> 00:01:55,470 +We tried to train on our +data, but we got no gradient? + +42 +00:01:55,470 --> 00:01:59,130 +It's pretty perplexing, I +mean, how do we even begin + +43 +00:01:59,130 --> 00:02:01,500 +to debug not getting a gradient? + +44 +00:02:01,500 --> 00:02:03,930 +So, when the error you get +doesn't immediately suggest + +45 +00:02:03,930 --> 00:02:06,630 +where the problem is, the best solution + +46 +00:02:06,630 --> 00:02:09,180 +is often to walk through +things in sequence, + +47 +00:02:09,180 --> 00:02:12,900 +making sure at each stage +that the outputs look right, + +48 +00:02:12,900 --> 00:02:15,300 +that everything looks okay at that point. 
+
+49
+00:02:15,300 --> 00:02:17,730
+And, of course, that
+means the place to start

+50
+00:02:17,730 --> 00:02:19,473
+is always to check your data.

+51
+00:02:20,670 --> 00:02:22,050
+So, the best way to make sure

+52
+00:02:22,050 --> 00:02:24,480
+that the data you're
+giving the model is good,

+53
+00:02:24,480 --> 00:02:27,690
+is to grab a batch from
+the tf.data.Dataset

+54
+00:02:27,690 --> 00:02:29,520
+that your model is training on,

+55
+00:02:29,520 --> 00:02:31,560
+and that's because it's right at the end

+56
+00:02:31,560 --> 00:02:33,990
+of the data pipeline.

+57
+00:02:33,990 --> 00:02:36,990
+And so that means that if
+those outputs are good,

+58
+00:02:36,990 --> 00:02:39,990
+you're guaranteed that your
+data pipeline is working well.

+59
+00:02:39,990 --> 00:02:42,600
+So, we can do that by
+looping over the dataset

+60
+00:02:42,600 --> 00:02:44,790
+for one iteration and then breaking,

+61
+00:02:44,790 --> 00:02:46,980
+and that gives us a single batch.

+62
+00:02:46,980 --> 00:02:49,443
+So, what do we get when
+we inspect that batch?

+63
+00:02:50,460 --> 00:02:52,500
+We'll see that we're
+not getting any gradient

+64
+00:02:52,500 --> 00:02:55,530
+because we're not passing labels to Keras.

+65
+00:02:55,530 --> 00:02:57,510
+So, our labels are in the batch,

+66
+00:02:57,510 --> 00:02:59,670
+but they're a key in the input dictionary

+67
+00:02:59,670 --> 00:03:02,340
+and they're not a separate
+label as Keras expects,

+68
+00:03:02,340 --> 00:03:04,830
+so this is one of the most
+common issues you'll encounter

+69
+00:03:04,830 --> 00:03:07,590
+when training Transformers
+models with TensorFlow.

+70
+00:03:07,590 --> 00:03:10,980
+Our models can all
+compute loss internally,

+71
+00:03:10,980 --> 00:03:13,140
+but to use that loss for training

+72
+00:03:13,140 --> 00:03:15,960
+the labels need to be passed
+in the input dictionary,

+73
+00:03:15,960 --> 00:03:17,940
+where the model can see them.

+74
+00:03:17,940 --> 00:03:20,280
+This internal loss is the loss that we use

+75
+00:03:20,280 --> 00:03:23,760
+when we don't specify a
+loss when we call compile,

+76
+00:03:23,760 --> 00:03:25,660
+when we don't specify a loss argument.

+77
+00:03:26,520 --> 00:03:27,870
+So, Keras, on the other hand,

+78
+00:03:27,870 --> 00:03:30,570
+usually expects labels
+to be passed separately

+79
+00:03:30,570 --> 00:03:32,130
+from the input dictionary,

+80
+00:03:32,130 --> 00:03:34,110
+and not to be visible to the model,

+81
+00:03:34,110 --> 00:03:36,600
+and loss computations will usually fail

+82
+00:03:36,600 --> 00:03:38,220
+if you don't do that.

+83
+00:03:38,220 --> 00:03:40,380
+So we need to choose one or the other,

+84
+00:03:40,380 --> 00:03:42,780
+either we use the model's internal loss

+85
+00:03:42,780 --> 00:03:44,940
+and keep the labels where they are,

+86
+00:03:44,940 --> 00:03:46,980
+or we keep using Keras losses

+87
+00:03:46,980 --> 00:03:50,520
+but we move the labels to
+the place Keras expects them.

+88
+00:03:50,520 --> 00:03:53,310
+So, for simplicity here,
+let's fix this issue

+89
+00:03:53,310 --> 00:03:55,860
+by using the model's internal losses,

+90
+00:03:55,860 --> 00:03:57,900
+and we do that by
+removing the loss argument

+91
+00:03:57,900 --> 00:03:59,343
+from the call to compile.

+92
+00:04:00,540 --> 00:04:03,000
+So, what happens if we try training now? 
+
+93
+00:04:03,000 --> 00:04:08,000
+So we recompile with that, we
+call model.fit, what happens?

+94
+00:04:08,220 --> 00:04:13,050
+Well, it runs this time but
+now we get a loss of NaN.

+95
+00:04:13,050 --> 00:04:16,440
+So, that's not good,
+NaN means not a number

+96
+00:04:16,440 --> 00:04:19,140
+and it's not a good
+loss to have in general.

+97
+00:04:19,140 --> 00:04:21,000
+In fact, if we inspect our model now,

+98
+00:04:21,000 --> 00:04:23,970
+we'll see that not only
+are all the outputs NaN,

+99
+00:04:23,970 --> 00:04:27,600
+all the weights are NaN as
+well, as well as the loss.

+100
+00:04:27,600 --> 00:04:30,810
+So once a single NaN creeps
+into your computations,

+101
+00:04:30,810 --> 00:04:34,530
+it tends to spread, because
+it propagates from the loss

+102
+00:04:34,530 --> 00:04:36,420
+and once it's at the loss
+it's at the gradient,

+103
+00:04:36,420 --> 00:04:37,530
+it gets to the gradient,

+104
+00:04:37,530 --> 00:04:38,910
+and then once it's in the gradient

+105
+00:04:38,910 --> 00:04:41,280
+it enters the weight updates,

+106
+00:04:41,280 --> 00:04:43,980
+and then all your weight
+updates end up as NaN as well.

+107
+00:04:43,980 --> 00:04:46,950
+So NaN just completely
+destroyed our model here,

+108
+00:04:46,950 --> 00:04:49,560
+but where did it creep in first?

+109
+00:04:49,560 --> 00:04:52,140
+So to find out, we need to go back to a point

+110
+00:04:52,140 --> 00:04:53,490
+before the model was destroyed,

+111
+00:04:53,490 --> 00:04:55,440
+we need to re-initialize the model

+112
+00:04:55,440 --> 00:04:58,590
+and look at the outputs
+for just the first batch.

+113
+00:04:58,590 --> 00:04:59,850
+And when we do that,

+114
+00:04:59,850 --> 00:05:02,790
+we see that NaN first appears in the loss,

+115
+00:05:02,790 --> 00:05:04,980
+but only in some samples.

+116
+00:05:04,980 --> 00:05:06,540
+So you can see this in more detail

+117
+00:05:06,540 --> 00:05:09,090
+in the accompanying section
+of the course notes,

+118
+00:05:09,090 --> 00:05:11,220
+I am moving fairly quickly here,

+119
+00:05:11,220 --> 00:05:13,500
+but we find that if we look at the labels,

+120
+00:05:13,500 --> 00:05:17,790
+the samples with a loss of
+NaN all have a label of two.

+121
+00:05:17,790 --> 00:05:19,950
+So this gives us a very strong clue,

+122
+00:05:19,950 --> 00:05:24,060
+if we check the model with
+model.config.num_labels,

+123
+00:05:24,060 --> 00:05:26,760
+we see that the model thinks
+there's only two labels,

+124
+00:05:26,760 --> 00:05:28,950
+but if we see a value of two,

+125
+00:05:28,950 --> 00:05:31,200
+that means there's at least three labels

+126
+00:05:31,200 --> 00:05:33,630
+because 0 is a label as well.

+127
+00:05:33,630 --> 00:05:35,070
+So we got a loss of NaN

+128
+00:05:35,070 --> 00:05:37,887
+because we got an "impossible"
+label in our label set,

+129
+00:05:37,887 --> 00:05:41,010
+and to fix that we need to
+go back and set the model

+130
+00:05:41,010 --> 00:05:43,650
+to expect the right number of labels,

+131
+00:05:43,650 --> 00:05:45,870
+so we can set num_labels=3

+132
+00:05:45,870 --> 00:05:48,540
+when we initialize the
+model with from_pretrained,

+133
+00:05:48,540 --> 00:05:51,450
+and now hopefully we can avoid this issue. 
+ +134 +00:05:51,450 --> 00:05:54,660 +So, now we think our data is +good and our model is good + +135 +00:05:54,660 --> 00:05:56,220 +and so training should work + +136 +00:05:56,220 --> 00:06:00,510 +but if we try running +model.fit, we, well... + +137 +00:06:00,510 --> 00:06:02,040 +I mean, we do get a loss, + +138 +00:06:02,040 --> 00:06:03,930 +it is a number and it is going down + +139 +00:06:03,930 --> 00:06:06,090 +but it's not going down very quickly + +140 +00:06:06,090 --> 00:06:07,770 +and if we keep running this out, + +141 +00:06:07,770 --> 00:06:10,980 +we'll find that it stalls +at a fairly high loss value. + +142 +00:06:10,980 --> 00:06:12,450 +So, what's going on? + +143 +00:06:12,450 --> 00:06:14,130 +Well, when things are mostly working, + +144 +00:06:14,130 --> 00:06:16,620 +but training is just slow or a bit odd, + +145 +00:06:16,620 --> 00:06:19,470 +that can often be a good time +to look at your optimizer + +146 +00:06:19,470 --> 00:06:22,020 +and your training hyperparameters. + +147 +00:06:22,020 --> 00:06:23,460 +And this is where I want to mention + +148 +00:06:23,460 --> 00:06:25,320 +one of the most common sources of issues + +149 +00:06:25,320 --> 00:06:27,000 +when you're working with Keras, + +150 +00:06:27,000 --> 00:06:30,870 +you can name things like +optimizers with strings, + +151 +00:06:30,870 --> 00:06:33,180 +so Keras supports that +and it's very convenient, + +152 +00:06:33,180 --> 00:06:35,460 +but if you do that all of the options + +153 +00:06:35,460 --> 00:06:38,400 +get silently set to their default values. + +154 +00:06:38,400 --> 00:06:41,190 +So we specified our optimizer as Adam, + +155 +00:06:41,190 --> 00:06:43,110 +but in the process we invisibly got + +156 +00:06:43,110 --> 00:06:46,260 +the default learning rate, which is 1e-3, + +157 +00:06:46,260 --> 00:06:48,630 +or 10 to the power of -3. + +158 +00:06:48,630 --> 00:06:50,550 +So this learning rate is way too high + +159 +00:06:50,550 --> 00:06:52,530 +for training transformer models, + +160 +00:06:52,530 --> 00:06:55,620 +we should go back and specify +the learning rate directly, + +161 +00:06:55,620 --> 00:06:57,060 +not using a string. + +162 +00:06:57,060 --> 00:07:01,290 +So, good values here are +between 1e-5 and 1e-4 + +163 +00:07:01,290 --> 00:07:04,233 +so let's split the +difference and pick 5e-5. + +164 +00:07:05,310 --> 00:07:06,990 +So if you recompile with that, + +165 +00:07:06,990 --> 00:07:09,840 +you'll find that training +actually works, at last. + +166 +00:07:09,840 --> 00:07:11,700 +The loss goes down efficiently + +167 +00:07:11,700 --> 00:07:14,070 +and it converges to a lower value. + +168 +00:07:14,070 --> 00:07:16,410 +So, again, I did go +through this quite quickly + +169 +00:07:16,410 --> 00:07:18,720 +and I strongly recommend +checking out the course notes + +170 +00:07:18,720 --> 00:07:20,040 +to see this in more detail, + +171 +00:07:20,040 --> 00:07:21,600 +and to experiment with the code yourself + +172 +00:07:21,600 --> 00:07:23,490 +and see what the errors look like + +173 +00:07:23,490 --> 00:07:25,380 +and how you can approach them, + +174 +00:07:25,380 --> 00:07:27,930 +but I hope I've given +you here a quick summary + +175 +00:07:27,930 --> 00:07:30,510 +of the most common bugs + +176 +00:07:30,510 --> 00:07:32,880 +and maybe the most common +debugging approaches + +177 +00:07:32,880 --> 00:07:33,960 +to dealing with them. 
+ +178 +00:07:33,960 --> 00:07:37,020 +So, good luck, and remember +to take plenty of breaks + +179 +00:07:37,020 --> 00:07:38,970 +if your code is giving you a hard time. + +180 +00:07:39,805 --> 00:07:42,472 +(air whooshing) + diff --git a/subtitles/en/75_writing-a-good-issue.srt b/subtitles/en/75_writing-a-good-issue.srt index 9c21d200b..5a03d5820 100644 --- a/subtitles/en/75_writing-a-good-issue.srt +++ b/subtitles/en/75_writing-a-good-issue.srt @@ -1,164 +1,330 @@ -1 -00:00:05,440 --> 00:00:11,040 -How to write a good issue on GitHub? GitHub  -is the main place for the Hugging Face open   - -2 -00:00:11,040 --> 00:00:15,920 -source libraries, and should always go there  -to report a bug or ask for a new feature. For   - -3 -00:00:15,920 --> 00:00:21,680 -more general questions or to debug your own code,  -use the forums (see the video linked below). It's   - -4 -00:00:21,680 --> 00:00:25,920 -very important to write good issues as it will  -help the bug you uncovered be fixed in no time.   - -5 -00:00:26,960 --> 00:00:31,760 -For this video, we have created a version of  -Transformers with a bug. You can install it by   - -6 -00:00:31,760 --> 00:00:36,080 -executing this command in a notebook (remove the  -exclamation mark to execute it in a terminal).   - -7 -00:00:37,040 --> 00:00:43,440 -In this version, the following example fails. The  -error is rather cryptic and does not seem to come   - -8 -00:00:43,440 --> 00:00:49,600 -from anything in our code, so it seems we have a  -bug to report! The first thing to do in this case   - -9 -00:00:49,600 --> 00:00:53,200 -is to try to find the smallest amount of  -code possible that reproduces the bug.   - -10 -00:00:54,000 --> 00:00:59,680 -In our case, inspecting the traceback, we see the  -failure happens inside the pipeline function when   - -11 -00:00:59,680 --> 00:01:06,400 -it calls AutoTokenizer.from_pretrained. Using the  -debugger, we find the values passed to that method   - -12 -00:01:06,400 --> 00:01:11,840 -and can thus create a small sample of code  -that hopefully generates the same error.   - -13 -00:01:12,560 --> 00:01:16,800 -It's very important to go though this step as you  -may realize the error was on your side and not   - -14 -00:01:16,800 --> 00:01:21,280 -a bug in the library, but it also will make it  -easier for the maintainers to fix your problem.   - -15 -00:01:22,080 --> 00:01:24,880 -Here we can play around a bit more with  -this code and notice the error happens   - -16 -00:01:24,880 --> 00:01:31,040 -for different checkpoints and not just this one,  -and that it disappears when we use use_fast=False   - -17 -00:01:31,040 --> 00:01:36,400 -inside our tokenizer call. The important part  -is to have something that does not depend on any   - -18 -00:01:36,400 --> 00:01:42,800 -external files or data. Try to replace your data  -by fake values if you can't share it. With all   - -19 -00:01:42,800 --> 00:01:48,480 -of this done, we are ready to start writing our  -issue. Click on the button next to Bug Report and   - -20 -00:01:48,480 --> 00:01:54,160 -you will discover there is a template to fill. It  -will only take you a couple of minutes. The first   - -21 -00:01:54,160 --> 00:02:00,000 -thing is to properly name your issue. Don't pick  -a title that is too vague! Then you have to fill   - -22 -00:02:00,000 --> 00:02:04,800 -your environment information. There is a command  -provided by the Transformers library to do this.   
- -23 -00:02:05,520 --> 00:02:09,840 -Just execute it in your notebook or in  -a terminal, and copy paste the results.   - -24 -00:02:10,800 --> 00:02:15,600 -There are two last questions to fill manually  -(to which the answers are no and no in our case).   - -25 -00:02:17,440 --> 00:02:23,680 -Next, we need to determine who to tag. There is  -a full list of usernames. Since our issue has   - -26 -00:02:23,680 --> 00:02:28,880 -to do with tokenizers, we pick the maintainer  -associated with them. There is no point tagging   - -27 -00:02:28,880 --> 00:02:32,640 -more than 3 people, they will redirect you  -to the right person if you made a mistake.   - -28 -00:02:34,320 --> 00:02:37,280 -Next, we have to give the information  -necessary to reproduce the bug.   - -29 -00:02:38,000 --> 00:02:43,280 -We paste our sample, and put it between two  -lines with three backticks so it's formatted   - -30 -00:02:43,280 --> 00:02:49,840 -properly. We also paste the full traceback, still  -between two lines of three backticks. Lastly,   - -31 -00:02:50,400 --> 00:02:54,400 -we can add any additional information about  -what we tried to debug the issue at hand.   - -32 -00:02:54,960 --> 00:02:58,800 -With all of this, you should expect an answer to  -your issue pretty fast, and hopefully, a quick   - -33 -00:02:58,800 --> 00:03:03,840 -fix! Note that all the advise in this video  -applies for almost every open-source project. +1 +00:00:05,610 --> 00:00:08,557 +- How to write a good issue on GitHub? + +2 +00:00:08,557 --> 00:00:10,080 +GitHub is the main place + +3 +00:00:10,080 --> 00:00:12,000 +for the Hugging Face +open source libraries, + +4 +00:00:12,000 --> 00:00:14,010 +and you should always +go there to report a bug + +5 +00:00:14,010 --> 00:00:16,020 +or ask for a new feature. + +6 +00:00:16,020 --> 00:00:18,660 +For more general questions +or to debug your own code + +7 +00:00:18,660 --> 00:00:21,707 +use the forums, see +the video linked below. + +8 +00:00:21,707 --> 00:00:23,677 +It's very important to write good issues + +9 +00:00:23,677 --> 00:00:27,232 +as it will help the bug you +uncovered be fixed in no time. + +10 +00:00:27,232 --> 00:00:29,750 +For this video, we have created +a version of Transformers + +11 +00:00:29,750 --> 00:00:31,066 +with a bug. + +12 +00:00:31,066 --> 00:00:33,783 +You can install it by executing +this command in a notebook, + +13 +00:00:33,783 --> 00:00:37,239 +remove the exclamation mark +to execute it in a terminal. + +14 +00:00:37,239 --> 00:00:41,016 +In this version, the +following example fails. + +15 +00:00:41,016 --> 00:00:42,472 +The error is rather cryptic + +16 +00:00:42,472 --> 00:00:45,184 +and does not seem to come +from anything in our code, + +17 +00:00:45,184 --> 00:00:48,157 +so it seems we have a bug to report. + +18 +00:00:48,157 --> 00:00:49,858 +The first thing to do in this case + +19 +00:00:49,858 --> 00:00:52,053 +is to try to find the smallest +amount of code possible + +20 +00:00:52,053 --> 00:00:54,059 +that reproduces the bug. + +21 +00:00:54,059 --> 00:00:56,802 +In our case, inspecting the traceback, + +22 +00:00:56,802 --> 00:00:59,645 +we see the failure happens +inside the pipeline function + +23 +00:00:59,645 --> 00:01:03,158 +when it calls +AutoTokenizer.from_pretrained. 
+
+24
+00:01:03,158 --> 00:01:06,609
+Using the debugger, we find the
+values passed to that method

+25
+00:01:06,609 --> 00:01:08,849
+and can thus create a small sample of code

+26
+00:01:08,849 --> 00:01:12,802
+that hopefully generates the same error.

+27
+00:01:12,802 --> 00:01:14,726
+It's very important to go through this step

+28
+00:01:14,726 --> 00:01:16,770
+as you may realize the
+error was on your side

+29
+00:01:16,770 --> 00:01:18,360
+and not a bug in the library,

+30
+00:01:18,360 --> 00:01:20,610
+but it also will make it
+easier for the maintainers

+31
+00:01:20,610 --> 00:01:22,320
+to fix your problem.

+32
+00:01:22,320 --> 00:01:24,030
+Here we can play around
+a bit more with this code

+33
+00:01:24,030 --> 00:01:26,460
+and notice the error happens
+for different checkpoints

+34
+00:01:26,460 --> 00:01:28,050
+and not just this one,

+35
+00:01:28,050 --> 00:01:31,262
+and that it disappears
+when we use use_fast=False

+36
+00:01:31,262 --> 00:01:33,240
+inside our tokenizer call.

+37
+00:01:33,240 --> 00:01:35,190
+The important part is to have something

+38
+00:01:35,190 --> 00:01:38,640
+that does not depend on
+any external files or data.

+39
+00:01:38,640 --> 00:01:40,350
+Try to replace your data by fake values

+40
+00:01:40,350 --> 00:01:41,450
+if you can't share it.

+41
+00:01:42,750 --> 00:01:43,620
+With all of this done,

+42
+00:01:43,620 --> 00:01:46,260
+we are ready to start writing our issue.

+43
+00:01:46,260 --> 00:01:48,600
+Click on the button next to Bug Report

+44
+00:01:48,600 --> 00:01:51,300
+and you will discover that
+there is a template to fill.

+45
+00:01:51,300 --> 00:01:53,940
+It will only take you a couple of minutes.

+46
+00:01:53,940 --> 00:01:56,460
+The first thing is to
+properly name your issue.

+47
+00:01:56,460 --> 00:01:59,100
+Don't pick a title that is too vague.

+48
+00:01:59,100 --> 00:02:02,160
+Then you have to fill your
+environment information.

+49
+00:02:02,160 --> 00:02:03,330
+There is a command provided

+50
+00:02:03,330 --> 00:02:05,700
+by the Transformers library to do this.

+51
+00:02:05,700 --> 00:02:08,550
+Just execute it in your
+notebook or in a terminal

+52
+00:02:08,550 --> 00:02:10,203
+and copy paste the results.

+53
+00:02:11,070 --> 00:02:13,530
+There are two last
+questions to fill manually,

+54
+00:02:13,530 --> 00:02:16,023
+to which the answers are
+no and no in our case.

+55
+00:02:17,550 --> 00:02:20,460
+Next, we need to determine who to tag.

+56
+00:02:20,460 --> 00:02:23,010
+There is a full list of
+usernames in the template.

+57
+00:02:23,010 --> 00:02:25,043
+Since our issue has to do with tokenizers,

+58
+00:02:25,043 --> 00:02:28,170
+we pick the maintainer
+associated with them.

+59
+00:02:28,170 --> 00:02:30,210
+There is no point tagging
+more than 3 people,

+60
+00:02:30,210 --> 00:02:32,010
+they will redirect you to the right person

+61
+00:02:32,010 --> 00:02:33,110
+if you made a mistake.

+62
+00:02:34,410 --> 00:02:36,600
+Next, we have to give
+the information necessary

+63
+00:02:36,600 --> 00:02:38,220
+to reproduce the bug.

+64
+00:02:38,220 --> 00:02:41,010
+We paste our sample, and
+put it between two lines

+65
+00:02:41,010 --> 00:02:44,073
+with three backticks so that
+it's formatted properly.

+66
+00:02:45,210 --> 00:02:47,010
+We also paste the full traceback,

+67
+00:02:47,010 --> 00:02:49,740
+still between two lines
+of three backticks. 
+
+68
+00:02:49,740 --> 00:02:52,650
+Lastly, we can add any
+additional information

+69
+00:02:52,650 --> 00:02:55,200
+about what we tried to
+debug the issue at hand.

+70
+00:02:55,200 --> 00:02:57,030
+With all of this, you
+should expect an answer

+71
+00:02:57,030 --> 00:02:59,880
+to your issue pretty fast
+and hopefully a quick fix.

+72
+00:02:59,880 --> 00:03:01,770
+Note that all the advice in this video

+73
+00:03:01,770 --> 00:03:04,203
+applies for almost every
+open-source project.
+